From 07eae07b8820156f84d6c0caa9c2f80677bd3f6f Mon Sep 17 00:00:00 2001 From: Sherry Li Date: Mon, 18 Mar 2019 21:40:41 -0700 Subject: [PATCH 001/147] Incorported all the header files from the 3D factorization code. --- SRC/TreeInterface.cpp | 8 + SRC/cublas_utils.h | 1 + SRC/sp_ienv.c | 3 - SRC/superlu_ddefs.h | 662 +++++++++++++++++++++++++++++++++++++++++- SRC/superlu_defs.h | 326 ++++++++++++++++++++- SRC/util_dist.h | 136 ++++++++- 6 files changed, 1125 insertions(+), 11 deletions(-) diff --git a/SRC/TreeInterface.cpp b/SRC/TreeInterface.cpp index d73bb971..4874b8b4 100644 --- a/SRC/TreeInterface.cpp +++ b/SRC/TreeInterface.cpp @@ -19,6 +19,7 @@ namespace SuperLU_ASYNCOMM{ TreeBcast_slu* BcastTree = TreeBcast_slu::Create(comm,ranks,rank_cnt,msgSize,rseed); return (BcTree) BcastTree; } + return 0; } void BcTree_Destroy(BcTree Tree, char precision){ @@ -54,6 +55,7 @@ namespace SuperLU_ASYNCOMM{ TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; return BcastTree->IsRoot()?YES:NO; } + return NO; } @@ -101,6 +103,7 @@ namespace SuperLU_ASYNCOMM{ TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; return BcastTree->GetDestCount(); } + return 0; } int BcTree_GetMsgSize(BcTree Tree, char precision){ @@ -112,6 +115,7 @@ namespace SuperLU_ASYNCOMM{ TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; return BcastTree->GetMsgSize(); } + return 0; } @@ -170,6 +174,7 @@ namespace SuperLU_ASYNCOMM{ TreeReduce_slu* ReduceTree = TreeReduce_slu::Create(comm,ranks,rank_cnt,msgSize,rseed); return (RdTree) ReduceTree; } + return 0; } void RdTree_Destroy(RdTree Tree, char precision){ @@ -204,6 +209,7 @@ namespace SuperLU_ASYNCOMM{ TreeReduce_slu* ReduceTree = (TreeReduce_slu*) Tree; return ReduceTree->GetDestCount(); } + return 0; } int RdTree_GetMsgSize(RdTree Tree, char precision){ @@ -215,6 +221,7 @@ namespace SuperLU_ASYNCOMM{ TreeReduce_slu* ReduceTree = (TreeReduce_slu*) Tree; return ReduceTree->GetMsgSize(); } + return 0; } @@ -228,6 +235,7 @@ namespace SuperLU_ASYNCOMM{ TreeReduce_slu* ReduceTree = (TreeReduce_slu*) Tree; return ReduceTree->IsRoot()?YES:NO; } + return NO; } diff --git a/SRC/cublas_utils.h b/SRC/cublas_utils.h index 9c457abb..cde3d282 100644 --- a/SRC/cublas_utils.h +++ b/SRC/cublas_utils.h @@ -23,6 +23,7 @@ at the top-level directory. #include "cuda.h" #include "cuda_runtime_api.h" #include "cuda_runtime.h" +typedef struct LUstruct_gpu_ LUstruct_gpu; // Sherry - not in this distribution extern void DisplayHeader(); extern const char* cublasGetErrorString(cublasStatus_t status); diff --git a/SRC/sp_ienv.c b/SRC/sp_ienv.c index 08d1e8f1..3faababf 100644 --- a/SRC/sp_ienv.c +++ b/SRC/sp_ienv.c @@ -62,11 +62,9 @@ at the top-level directory. 
*/ - #include #include - int_t sp_ienv_dist(int_t ispec) { @@ -119,6 +117,5 @@ sp_ienv_dist(int_t ispec) xerr_dist("sp_ienv", &i); return 0; - } /* sp_ienv_dist */ diff --git a/SRC/superlu_ddefs.h b/SRC/superlu_ddefs.h index 05df065d..a0dfe185 100644 --- a/SRC/superlu_ddefs.h +++ b/SRC/superlu_ddefs.h @@ -138,7 +138,7 @@ typedef struct { int_t *etree; Glu_persist_t *Glu_persist; LocalLU_t *Llu; - char dt; + char dt; } LUstruct_t; @@ -179,6 +179,107 @@ typedef struct { } SOLVEstruct_t; +/*==== For 3D code ====*/ + +// new structures for pdgstrf_4_8 + +typedef struct +{ + int_t nub; + int_t klst; + int_t ldu; + int_t* usub; + double* uval; +} uPanelInfo_t; + +typedef struct +{ + + int_t *lsub; + double *lusup; + int_t luptr0; + int_t nlb; //number of l blocks + int_t nsupr; +} lPanelInfo_t; + +typedef struct +{ + Remain_info_t *lookAhead_info, *Remain_info; + Ublock_info_t *Ublock_info, *Ublock_info_Phi; + + int_t first_l_block_acc , first_u_block_acc; + int_t last_offload ; + int_t *Lblock_dirty_bit, * Ublock_dirty_bit; + double *lookAhead_L_buff, *Remain_L_buff; + int_t lookAheadBlk , RemainBlk ; + int_t num_look_aheads, nsupers; + int_t ldu, ldu_Phi; + int_t num_u_blks, num_u_blks_Phi; + + int_t jj_cpu; + double *bigU_Phi; + double *bigU_host; + int_t Lnbrow; + int_t Rnbrow; + + int_t buffer_size; + int_t bigu_size; + int_t offloadCondition; + int_t superlu_acc_offload; + int_t nCudaStreams; + +} HyP_t; + +typedef struct +{ + int_t * Lsub_buf ; + double * Lval_buf ; + int_t * Usub_buf ; + double * Uval_buf ; +} LUValSubBuf_t; + +int_t scuStatUpdate( + int_t knsupc, + HyP_t* HyP, + SCT_t* SCT, + SuperLUStat_t *stat + ); + +typedef struct trf3Dpartition_t +{ + gEtreeInfo_t gEtreeInfo; + int_t* iperm_c_supno; + int_t* myNodeCount; + int_t* myTreeIdxs; + int_t* myZeroTrIdxs; + int_t** treePerm; + sForest_t** sForests; + int_t* supernode2treeMap; + LUValSubBuf_t *LUvsb; +} trf3Dpartition_t; + +typedef struct +{ + double *bigU; + double *bigV; +} scuBufs_t; + +typedef struct +{ + double* BlockLFactor; + double* BlockUFactor; +} diagFactBufs_t; + +typedef struct +{ + Ublock_info_t* Ublock_info; + Remain_info_t* Remain_info; + uPanelInfo_t* uPanelInfo; + lPanelInfo_t* lPanelInfo; +} packLUInfo_t; + +/*=====================*/ + /*********************************************************************** * Function prototypes ***********************************************************************/ @@ -424,6 +525,565 @@ extern void dger_(int*, int*, double*, double*, int*, #endif +/*==== For 3D code ====*/ + +extern int_t pdgstrf3d(superlu_dist_options_t *, int m, int n, double anorm, + trf3Dpartition_t*, SCT_t *, LUstruct_t *, gridinfo3d_t *, + SuperLUStat_t *, int *); +extern int_t zSendLPanel(int_t, int_t, LUstruct_t*, gridinfo3d_t*, SCT_t*); +extern int_t zRecvLPanel(int_t, int_t, double, double, double*, + LUstruct_t*, gridinfo3d_t*, SCT_t* SCT); +extern int_t zSendUPanel(int_t, int_t, LUstruct_t*, gridinfo3d_t*, SCT_t*); +extern int_t zRecvUPanel(int_t, int_t, double, double, double*, + LUstruct_t*, gridinfo3d_t*, SCT_t*); +extern void Init_HyP(HyP_t* HyP, LocalLU_t *Llu, int_t mcb, int_t mrb ); +extern void Free_HyP(HyP_t* HyP); +extern void DistPrint(char* function_name, double value, char* Units, gridinfo_t* grid); +extern void DistPrint3D(char* function_name, double value, char* Units, gridinfo3d_t* grid3d); +extern void treeImbalance3D(gridinfo3d_t *grid3d, SCT_t* SCT); +extern void SCT_printComm3D(gridinfo3d_t *grid3d, SCT_t* SCT); +extern int updateDirtyBit(int_t k0, HyP_t* HyP, gridinfo_t* 
grid); + + /* from scatter.h */ +extern void +block_gemm_scatter( int_t lb, int_t j, + Ublock_info_t *Ublock_info, + Remain_info_t *Remain_info, + double *L_mat, int_t ldl, + double *U_mat, int_t ldu, + double *bigV, + // int_t jj0, + int_t knsupc, int_t klst, + int_t *lsub, int_t *usub, int_t ldt, + int_t thread_id, + int_t *indirect, + int_t *indirect2, + int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr, + int_t **Ufstnz_br_ptr, double **Unzval_br_ptr, + int_t *xsup, gridinfo_t *grid, + SuperLUStat_t *stat +#ifdef SCATTER_PROFILE + , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer +#endif + ); +/*this version uses a lock to prevent multiple thread updating the same block*/ +void +block_gemm_scatter_lock( int_t lb, int_t j, + omp_lock_t* lock, + Ublock_info_t *Ublock_info, + Remain_info_t *Remain_info, + double *L_mat, int_t ldl, + double *U_mat, int_t ldu, + double *bigV, + // int_t jj0, + int_t knsupc, int_t klst, + int_t *lsub, int_t *usub, int_t ldt, + int_t thread_id, + int_t *indirect, + int_t *indirect2, + int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr, + int_t **Ufstnz_br_ptr, double **Unzval_br_ptr, + int_t *xsup, gridinfo_t *grid +#ifdef SCATTER_PROFILE + , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer +#endif + ); + +int_t block_gemm_scatterTopLeft( int_t lb, int_t j, + double* bigV, int_t knsupc, int_t klst, int_t* lsub, + int_t * usub, int_t ldt, int_t* indirect, int_t* indirect2, + HyP_t* HyP, + LUstruct_t *LUstruct, + gridinfo_t* grid, + SCT_t*SCT, SuperLUStat_t *stat + ); +int_t block_gemm_scatterTopRight( int_t lb, int_t j, + double* bigV, int_t knsupc, int_t klst, int_t* lsub, + int_t * usub, int_t ldt, int_t* indirect, int_t* indirect2, + HyP_t* HyP, + LUstruct_t *LUstruct, + gridinfo_t* grid, + SCT_t*SCT, SuperLUStat_t *stat + ); +int_t block_gemm_scatterBottomLeft( int_t lb, int_t j, + double* bigV, int_t knsupc, int_t klst, int_t* lsub, + int_t * usub, int_t ldt, int_t* indirect, int_t* indirect2, + HyP_t* HyP, + LUstruct_t *LUstruct, + gridinfo_t* grid, + SCT_t*SCT, SuperLUStat_t *stat + ); +int_t block_gemm_scatterBottomRight( int_t lb, int_t j, + double* bigV, int_t knsupc, int_t klst, int_t* lsub, + int_t * usub, int_t ldt, int_t* indirect, int_t* indirect2, + HyP_t* HyP, + LUstruct_t *LUstruct, + gridinfo_t* grid, + SCT_t*SCT, SuperLUStat_t *stat + ); + +extern void gather_u(int_t num_u_blks, + Ublock_info_t *Ublock_info, int_t * usub, + double *uval, double *bigU, int_t ldu, + int_t *xsup, int_t klst /* for SuperSize */ + ); + +extern void gather_l( int_t num_LBlk, int_t knsupc, + Remain_info_t *L_info, + double * lval, int_t LD_lval, + double * L_buff ); + + /* from gather.h */ +extern void Rgather_L(int_t k, int_t *lsub, double *lusup, gEtreeInfo_t*, + Glu_persist_t *, gridinfo_t *, HyP_t *, + int_t *myIperm, int_t *iperm_c_supno ); +extern void Rgather_U(int_t k, int_t jj0, int_t *usub, double *uval, + double *bigU, gEtreeInfo_t*, Glu_persist_t *, + gridinfo_t *, HyP_t *, int_t *myIperm, + int_t *iperm_c_supno, int_t *perm_u); + + /* from xtrf3Dpartition.h */ +extern trf3Dpartition_t* initTrf3Dpartition(int_t nsupers, + superlu_dist_options_t *options, + LUstruct_t *LUstruct, gridinfo3d_t * grid3d); +extern void printMemUse(trf3Dpartition_t* trf3Dpartition, + LUstruct_t *LUstruct, gridinfo3d_t * grid3d); + +extern int* getLastDep(gridinfo_t *grid, SuperLUStat_t *stat, + superlu_dist_options_t *options, LocalLU_t *Llu, + int_t* xsup, int_t num_look_aheads, int_t nsupers, + int_t * iperm_c_supno); + +extern void init3DLUstructForest( 
int_t* myTreeIdxs, int_t* myZeroTrIdxs, + sForest_t** sForests, LUstruct_t* LUstruct, + gridinfo3d_t* grid3d); + +extern int_t gatherAllFactoredLUFr(int_t* myZeroTrIdxs, sForest_t* sForests, + LUstruct_t* LUstruct, gridinfo3d_t* grid3d, + SCT_t* SCT ); + + /* The following are from pdgstrf2.h */ +#if 0 // Sherry: same routine names, but different code !!!!!!! +extern void pdgstrf2_trsm(superlu_dist_options_t *options, int_t, int_t, + int_t k, double thresh, Glu_persist_t *, + gridinfo_t *, LocalLU_t *, MPI_Request *U_diag_blk_send_req, + SuperLUStat_t *, int *info, SCT_t *); +#ifdef _CRAY +void pdgstrs2_omp (int_t, int_t, int_t, Glu_persist_t *, gridinfo_t *, + LocalLU_t *, SuperLUStat_t *, _fcd, _fcd, _fcd); +#else +void pdgstrs2_omp (int_t, int_t, int_t, int_t *, double*, Glu_persist_t *, gridinfo_t *, + LocalLU_t *, SuperLUStat_t *, Ublock_info_t *, double *bigV, int_t ldt, SCT_t *SCT ); +#endif + +#endif // same routine names !!!!!!!! + +extern int_t LpanelUpdate(int_t off0, int_t nsupc, double* ublk_ptr, + int_t ld_ujrow, double* lusup, int_t nsupr, SCT_t*); +extern void Local_Dgstrf2(superlu_dist_options_t *options, int_t k, + double thresh, double *BlockUFactor, Glu_persist_t *, + gridinfo_t *, LocalLU_t *, + SuperLUStat_t *, int *info, SCT_t*); +extern int_t Trs2_GatherU(int_t iukp, int_t rukp, int_t klst, + int_t nsupc, int_t ldu, int_t *usub, + double* uval, double *tempv); +extern int_t Trs2_ScatterU(int_t iukp, int_t rukp, int_t klst, + int_t nsupc, int_t ldu, int_t *usub, + double* uval, double *tempv); +extern int_t Trs2_GatherTrsmScatter(int_t klst, int_t iukp, int_t rukp, + int_t *usub, + double* uval, double *tempv, + int_t knsupc, int_t nsupr, double*lusup, + Glu_persist_t *Glu_persist) ; +extern int_t Trs2_InitUblock_info(int_t klst, int_t nb, Ublock_info_t *, + int_t *usub, Glu_persist_t *, SuperLUStat_t*); + +extern void pdgstrs2_mpf(int_t m, int_t k0, int_t k, double *Lval_buf, + int_t nsupr, Glu_persist_t *, + gridinfo_t *, LocalLU_t *, SuperLUStat_t *, + Ublock_info_t *, double *bigV, int_t ldt, SCT_t *); +extern void pdgstrs2 +#ifdef _CRAY +( + int_t m, int_t k0, int_t k, Glu_persist_t *Glu_persist, gridinfo_t *grid, + LocalLU_t *Llu, SuperLUStat_t *stat, _fcd ftcs1, _fcd ftcs2, _fcd ftcs3 +); +#else +( + int_t m, int_t k0, int_t k, Glu_persist_t *Glu_persist, gridinfo_t *grid, + LocalLU_t *Llu, SuperLUStat_t *stat +); +#endif + +extern void pdgstrf2(superlu_dist_options_t *, int_t nsupers, int_t k0, + int_t k, double thresh, Glu_persist_t *, gridinfo_t *, + LocalLU_t *, MPI_Request *, SuperLUStat_t *, int *); + + /* from p3dcomm.h */ +int_t AllocLlu(int_t nsupers, LUstruct_t * LUstruct, gridinfo3d_t* grid3d); +int_t AllocGlu(int_t n, int_t nsupers, LUstruct_t * LUstruct, gridinfo3d_t* grid3d); + +int_t p3dScatter(int_t n, LUstruct_t * LUstruct, gridinfo3d_t* grid3d); + + +int_t scatter3dLPanels(int_t nsupers, + LUstruct_t * LUstruct, gridinfo3d_t* grid3d); + +int_t scatter3dUPanels(int_t nsupers, + LUstruct_t * LUstruct, gridinfo3d_t* grid3d); + +int_t collect3dLpanels(int_t layer, int_t nsupers, LUstruct_t * LUstruct, gridinfo3d_t* grid3d); + +int_t collect3dUpanels(int_t layer, int_t nsupers, LUstruct_t * LUstruct, gridinfo3d_t* grid3d); + +int_t p3dCollect(int_t layer, int_t n, LUstruct_t * LUstruct, gridinfo3d_t* grid3d); + +/*zero out LU non zero entries*/ +int_t zeroSetLU(int_t nnodes, int_t* nodeList , LUstruct_t *LUstruct, gridinfo3d_t* grid3d); + + +/* Reduces L and U panels of nodes in the List nodeList (size=nnnodes) +receiver[L(nodelist)] 
=sender[L(nodelist)] +receiver[L(nodelist)] +receiver[U(nodelist)] =sender[U(nodelist)] +receiver[U(nodelist)] +*/ + +int_t reduceAncestors3d(int_t sender, int_t receiver, + int_t nnodes, int_t* nodeList, + double* Lval_buf, double* Uval_buf, + LUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT); + + +/*reduces all nodelists required in a level*/ +int_t reduceAllAncestors3d(int_t ilvl, int_t* myNodeCount, + int_t** treePerm, + LUValSubBuf_t*LUvsb, + LUstruct_t* LUstruct, + gridinfo3d_t* grid3d, + SCT_t* SCT ); + +/* + Copies factored L and U panels from sender grid to receiver grid + receiver[L(nodelist)] <-- sender[L(nodelist)]; + receiver[U(nodelist)] <-- sender[U(nodelist)]; +*/ +int_t gatherFactoredLU(int_t sender, int_t receiver, + int_t nnodes, int_t *nodeList, LUValSubBuf_t*LUvsb, + LUstruct_t* LUstruct, gridinfo3d_t* grid3d,SCT_t* SCT ); + +/*Gathers all the L and U factors to grid 0 for solve stage + By repeatidly calling above function + +*/ +int_t gatherAllFactoredLU( + trf3Dpartition_t* trf3Dpartition, + LUstruct_t* LUstruct, + gridinfo3d_t* grid3d, + SCT_t* SCT ); + + +/*Distributes data in each layer and initilizes ancestors + as zero in required nodes*/ +int_t init3DLUstruct( int_t* myTreeIdxs, int_t* myZeroTrIdxs, + int_t* nodeCount, int_t** nodeList, + LUstruct_t* LUstruct, gridinfo3d_t* grid3d); + +/* +Returns list of permutation for each +tree that I update +*/ +int_t** getTreePerm( int_t* myTreeIdxs, int_t* myZeroTrIdxs, + int_t* nodeCount, int_t** nodeList, + int_t* perm_c_supno, int_t* iperm_c_supno, + gridinfo3d_t* grid3d); + +/*number of nodes in each level of the trees which I update*/ +int_t* getMyNodeCounts(int_t maxLvl, int_t* myTreeIdxs, int_t* gNodeCount); + + +int_t checkIntVector3d(int_t* vec, int_t len, gridinfo3d_t* grid3d); + +int_t reduceStat(PhaseType PHASE, + SuperLUStat_t *stat, gridinfo3d_t * grid3d); + +int_t zSendLPanel(int_t k, int_t receiver, + LUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT); +int_t zRecvLPanel(int_t k, int_t sender, double alpha, double beta, + double* Lval_buf, + LUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT); + +int_t zSendUPanel(int_t k, int_t receiver, + LUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT); +int_t zRecvUPanel(int_t k, int_t sender, double alpha, double beta, + double* Uval_buf, + LUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT); + + /* from trfCommWrapper.h */ +extern int_t DiagFactIBCast(int_t k, int_t k0, + double *BlockUFactor, double *BlockLFactor, + int_t* IrecvPlcd_D, MPI_Request *, MPI_Request *, + MPI_Request *, MPI_Request *, gridinfo_t *, + superlu_dist_options_t *, double thresh, + LUstruct_t *LUstruct, SuperLUStat_t *, int *info, + SCT_t *); +extern int_t UPanelTrSolve( int_t k, double* BlockLFactor, double* bigV, + int_t ldt, Ublock_info_t*, gridinfo_t *, + LUstruct_t *, SuperLUStat_t *, SCT_t *); +extern int_t Wait_LUDiagSend(int_t k, MPI_Request *, MPI_Request *, + gridinfo_t *, SCT_t *); +extern int_t LPanelUpdate(int_t k, int_t* IrecvPlcd_D, int_t* factored_L, + MPI_Request *, double* BlockUFactor, gridinfo_t *, + LUstruct_t *, SCT_t *); +extern int_t UPanelUpdate(int_t k, int_t* factored_U, MPI_Request *, + double* BlockLFactor, double* bigV, + int_t ldt, Ublock_info_t*, gridinfo_t *, + LUstruct_t *, SuperLUStat_t *, SCT_t *); +extern int_t IBcastRecvLPanel(int_t k, int_t k0, int* msgcnt, + MPI_Request *, MPI_Request *, + int_t* Lsub_buf, double* Lval_buf, + int_t * factored, gridinfo_t *, LUstruct_t *, + SCT_t *); +extern int_t IBcastRecvUPanel(int_t k, int_t 
k0, int* msgcnt, MPI_Request *, + MPI_Request *, int_t* Usub_buf, double* Uval_buf, + gridinfo_t *, LUstruct_t *, SCT_t *); +extern int_t WaitL(int_t k, int* msgcnt, int* msgcntU, MPI_Request *, + MPI_Request *, gridinfo_t *, LUstruct_t *, SCT_t *); +extern int_t WaitU(int_t k, int* msgcnt, MPI_Request *, MPI_Request *, + gridinfo_t *, LUstruct_t *, SCT_t *); +extern int_t LPanelTrSolve(int_t k, int_t* factored_L, double* BlockUFactor, + gridinfo_t *, LUstruct_t *); + + /* from trfAux.h */ +extern int_t getNsupers(int, LUstruct_t *); +extern int_t SchurComplementSetup(int_t k, int *msgcnt, Ublock_info_t*, + Remain_info_t*, uPanelInfo_t *, + lPanelInfo_t *, int_t*, int_t *, int_t *, + double *bigU, int_t* Lsub_buf, + double* Lval_buf, int_t* Usub_buf, + double* Uval_buf, gridinfo_t *, LUstruct_t *); +extern int_t SchurComplementSetupGPU(int_t k, msgs_t* msgs, packLUInfo_t*, + int_t*, int_t*, int_t*, gEtreeInfo_t*, + factNodelists_t*, scuBufs_t*, + LUValSubBuf_t* LUvsb, gridinfo_t *, + LUstruct_t *, HyP_t*); +extern double* getBigV(int_t, int_t); +extern double* getBigU(int_t, gridinfo_t *, LUstruct_t *); +extern int_t getBigUSize(int_t, gridinfo_t *, LUstruct_t *); +// permutation from superLU default +extern int_t* getPerm_c_supno(int_t nsupers, superlu_dist_options_t *, + LUstruct_t *, gridinfo_t *); + + /* from treeFactorization.h */ +extern int_t LluBufInit(LUValSubBuf_t*, LUstruct_t *); +extern int_t initScuBufs(int_t ldt, int_t num_threads, int_t nsupers, + scuBufs_t* scuBufs, + LUstruct_t* LUstruct, + gridinfo_t * grid); +extern int_t initPackLUInfo(int_t nsupers, packLUInfo_t* packLUInfo); + +extern int_t ancestorFactor( + int_t ilvl, // level of factorization + sForest_t* sforest, + commRequests_t **comReqss, // lists of communication requests, size maxEtree level + scuBufs_t *scuBufs, // contains buffers for schur complement update + packLUInfo_t*packLUInfo, + msgs_t**msgss, // size=num Look ahead + LUValSubBuf_t**LUvsbs, // size=num Look ahead + diagFactBufs_t **dFBufs, // size maxEtree level + factStat_t *factStat, + factNodelists_t *fNlists, + gEtreeInfo_t* gEtreeInfo, // global etree info + superlu_dist_options_t *options, + int_t * gIperm_c_supno, + int_t ldt, + HyP_t* HyP, + LUstruct_t *LUstruct, gridinfo3d_t * grid3d, SuperLUStat_t *stat, + double thresh, SCT_t *SCT, + int *info +); + +// the generic tree factoring code +extern int_t treeFactor( + int_t nnnodes, // number of nodes in the tree + int_t *perm_c_supno, // list of nodes in the order of factorization + commRequests_t *comReqs, // lists of communication requests + scuBufs_t *scuBufs, // contains buffers for schur complement update + packLUInfo_t*packLUInfo, + msgs_t*msgs, + LUValSubBuf_t*LUvsb, + diagFactBufs_t *dFBuf, + factStat_t *factStat, + factNodelists_t *fNlists, + superlu_dist_options_t *options, + int_t * gIperm_c_supno, + int_t ldt, + LUstruct_t *LUstruct, gridinfo3d_t * grid3d, SuperLUStat_t *stat, + double thresh, SCT_t *SCT, + int *info +); + +extern int_t sparseTreeFactor( + int_t nnodes, // number of nodes in the tree + int_t *perm_c_supno, // list of nodes in the order of factorization + treeTopoInfo_t* treeTopoInfo, + commRequests_t *comReqs, // lists of communication requests + scuBufs_t *scuBufs, // contains buffers for schur complement update + packLUInfo_t*packLUInfo, + msgs_t*msgs, + LUValSubBuf_t*LUvsb, + diagFactBufs_t *dFBuf, + factStat_t *factStat, + factNodelists_t *fNlists, + superlu_dist_options_t *options, + int_t * gIperm_c_supno, + int_t ldt, + LUstruct_t *LUstruct, gridinfo3d_t * 
grid3d, SuperLUStat_t *stat, + double thresh, SCT_t *SCT, + int *info +); + +extern int_t denseTreeFactor( + int_t nnnodes, // number of nodes in the tree + int_t *perm_c_supno, // list of nodes in the order of factorization + commRequests_t *comReqs, // lists of communication requests + scuBufs_t *scuBufs, // contains buffers for schur complement update + packLUInfo_t*packLUInfo, + msgs_t*msgs, + LUValSubBuf_t*LUvsb, + diagFactBufs_t *dFBuf, + factStat_t *factStat, + factNodelists_t *fNlists, + superlu_dist_options_t *options, + int_t * gIperm_c_supno, + int_t ldt, + LUstruct_t *LUstruct, gridinfo3d_t * grid3d, SuperLUStat_t *stat, + double thresh, SCT_t *SCT, + int *info +); + +extern int_t sparseTreeFactor_ASYNC( + sForest_t* sforest, + commRequests_t **comReqss, // lists of communication requests // size maxEtree level + scuBufs_t *scuBufs, // contains buffers for schur complement update + packLUInfo_t*packLUInfo, + msgs_t**msgss, // size=num Look ahead + LUValSubBuf_t**LUvsbs, // size=num Look ahead + diagFactBufs_t **dFBufs, // size maxEtree level + factStat_t *factStat, + factNodelists_t *fNlists, + gEtreeInfo_t* gEtreeInfo, // global etree info + superlu_dist_options_t *options, + int_t * gIperm_c_supno, + int_t ldt, + HyP_t* HyP, + LUstruct_t *LUstruct, gridinfo3d_t * grid3d, SuperLUStat_t *stat, + double thresh, SCT_t *SCT, + int *info +); +extern LUValSubBuf_t** LluBufInitArr(int_t numLA, LUstruct_t *LUstruct); +extern diagFactBufs_t** initDiagFactBufsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* grid); +extern int_t initDiagFactBufs(int_t ldt, diagFactBufs_t* dFBuf); +extern int_t sDiagFactIBCast(int_t k, diagFactBufs_t *dFBuf, + factStat_t *factStat, + commRequests_t *comReqs, + gridinfo_t *grid, + superlu_dist_options_t *options, + double thresh, + LUstruct_t *LUstruct, + SuperLUStat_t *stat, int *info, + SCT_t *SCT + ); +extern int_t sLPanelUpdate( int_t k, diagFactBufs_t *dFBuf, + factStat_t *factStat, + commRequests_t *comReqs, + gridinfo_t *grid, + LUstruct_t *LUstruct, SCT_t *SCT); +extern int_t sUPanelUpdate( int_t k, + int_t ldt, + diagFactBufs_t *dFBuf, + factStat_t *factStat, + commRequests_t *comReqs, + scuBufs_t* scuBufs, + packLUInfo_t* packLUInfo, + gridinfo_t *grid, + LUstruct_t *LUstruct, + SuperLUStat_t *stat, SCT_t *SCT); +extern int_t sIBcastRecvLPanel( + int_t k, + commRequests_t *comReqs, + LUValSubBuf_t* LUvsb, + msgs_t* msgs, + factStat_t *factStat, + gridinfo_t *grid, + LUstruct_t *LUstruct, SCT_t *SCT); + +extern int_t sIBcastRecvUPanel( + int_t k, + commRequests_t *comReqs, + LUValSubBuf_t* LUvsb, + msgs_t* msgs, + factStat_t *factStat, + gridinfo_t *grid, + LUstruct_t *LUstruct, SCT_t *SCT); +extern int_t sWaitL(int_t k, + commRequests_t *comReqs, + msgs_t* msgs, + gridinfo_t *grid, + LUstruct_t *LUstruct, SCT_t *SCT); +extern int_t sWaitU(int_t k, + commRequests_t *comReqs, + msgs_t* msgs, + gridinfo_t *grid, + LUstruct_t *LUstruct, SCT_t *SCT); +extern int_t sWait_LUDiagSend(int_t k, commRequests_t *comReqs, + gridinfo_t *grid, SCT_t *SCT); +extern int_t sSchurComplementSetup(int_t k, msgs_t* msgs, + packLUInfo_t* packLUInfo, + int_t* gIperm_c_supno, int_t*perm_c_supno, + factNodelists_t* fNlists, + scuBufs_t* scuBufs, LUValSubBuf_t* LUvsb, + gridinfo_t *grid, LUstruct_t *LUstruct); +extern int_t checkRecvUDiag(int_t k, commRequests_t *comReqs, + gridinfo_t *grid, SCT_t *SCT); +extern int_t sLPanelTrSolve( int_t k, diagFactBufs_t *dFBuf, + factStat_t *factStat, + commRequests_t *comReqs, + gridinfo_t *grid, + LUstruct_t *LUstruct, SCT_t *SCT); 
+extern int_t checkRecvLDiag(int_t k, + commRequests_t *comReqs, + gridinfo_t *grid, + SCT_t *SCT); +extern int_t sUPanelTrSolve( int_t k, + int_t ldt, + diagFactBufs_t *dFBuf, + scuBufs_t* scuBufs, + packLUInfo_t* packLUInfo, + gridinfo_t *grid, + LUstruct_t *LUstruct, + SuperLUStat_t *stat, SCT_t *SCT); + /* from ancFactorization.h */ +int_t ancestorFactor( + int_t ilvl, // level of factorization + sForest_t* sforest, + commRequests_t **comReqss, // lists of communication requests // size maxEtree level + scuBufs_t *scuBufs, // contains buffers for schur complement update + packLUInfo_t*packLUInfo, + msgs_t**msgss, // size=num Look ahead + LUValSubBuf_t**LUvsbs, // size=num Look ahead + diagFactBufs_t **dFBufs, // size maxEtree level + factStat_t *factStat, + factNodelists_t *fNlists, + gEtreeInfo_t* gEtreeInfo, // global etree info + superlu_dist_options_t *options, + int_t * gIperm_c_supno, + int_t ldt, + HyP_t* HyP, + LUstruct_t *LUstruct, gridinfo3d_t * grid3d, SuperLUStat_t *stat, + double thresh, SCT_t *SCT, + int *info +); + +/*=====================*/ #ifdef __cplusplus } diff --git a/SRC/superlu_defs.h b/SRC/superlu_defs.h index 3bbabd5c..9205f60b 100644 --- a/SRC/superlu_defs.h +++ b/SRC/superlu_defs.h @@ -90,6 +90,22 @@ at the top-level directory. #define IFMT "%8d" #endif +#ifdef __INTEL_COMPILER +#include "mkl.h" +#else +//#include "cblas.h" +#if 0 // Sherry: the following does not work with gcc on Linux. +#define _mm_malloc(a,b) malloc(a) +#define _mm_free(a) free(a) +#endif +static __inline__ unsigned long long _rdtsc(void) +{ + unsigned long long int x; + __asm__ volatile (".byte 0x0f, 0x31" : "=A" (x)); + return x; +} +#endif + #include "superlu_enum_consts.h" #include "Cnames.h" #include "supermatrix.h" @@ -160,7 +176,7 @@ at the top-level directory. * 0,1: for sending L to "right" * * 2,3: for sending off-diagonal blocks of U "down" * * 4 : for sending the diagonal blcok down (in pxgstrf2) */ -#define SLU_MPI_TAG(id,num) ( (5*(num)+id) % tag_ub ) +//#define SLU_MPI_TAG(id,num) ( (5*(num)+id) % tag_ub ) /* For numeric factorization. */ #if 0 @@ -307,7 +323,7 @@ typedef struct { int Iam; /* my process number */ } superlu_scope_t; -/*-- Process grid definition */ +/*-- 2D process grid definition */ typedef struct { MPI_Comm comm; /* MPI communicator */ superlu_scope_t rscp; /* process scope in rowwise, horizontal directon */ @@ -317,6 +333,19 @@ typedef struct { int_t npcol; /* number of process columns */ } gridinfo_t; +/*-- 3D process grid definition */ +typedef struct { + MPI_Comm comm; /* MPI communicator */ + superlu_scope_t rscp; /* row scope */ + superlu_scope_t cscp; /* column scope */ + superlu_scope_t zscp; /* scope in third dim */ + int iam; /* my process number in this grid */ + int_t nprow; /* number of process rows */ + int_t npcol; /* number of process columns */ + int_t npdep; /* number of process columns */ + gridinfo_t grid2d; /* for using 2D functions */ +} gridinfo3d_t; + /* *-- The structures are determined by SYMBFACT and used thereafter. @@ -653,13 +682,18 @@ typedef struct { int_t iukp; int_t jb; int_t full_u_cols; - + int_t eo; /* order of elimination. 
For 3D algorithm */ + int_t ncols; + int_t StCol; } Ublock_info_t; typedef struct { int_t lptr; int_t ib; + int_t eo; /* order of elimination, for 3D code */ + int_t nrows; int_t FullRow; + int_t StRow; } Remain_info_t; typedef struct @@ -676,6 +710,162 @@ struct superlu_pair /**--------**/ +/*==== For 3D code ====*/ + +/* return the mpi_tag assuming 5 pairs of communications and MPI_TAG_UB >= 5 * + * for each supernodal column, the five communications are: * + * 0,1: for sending L to "right" * + * 2,3: for sending off-diagonal blocks of U "down" * + * 4 : for sending the diagonal blcok down (in pxgstrf2) */ +// int tag_ub; +// #define SLU_MPI_TAG(id,num) ( (5*(num)+id) % tag_ub ) + +// #undef SLU_MPI_TAG +/*defining my own MPI tags */ +/* return the mpi_tag assuming 5 pairs of communications and MPI_TAG_UB >= 5 * + * for each supernodal column, the five communications are: * + * 0,1: for sending L to "right" * + * 2,3: for sending off-diagonal blocks of U "down" * + * 4 : for sending the diagonal blcok down (in pxgstrf2) * + * 5 : for sending the diagonal L block right () : added by piyush */ +#define SLU_MPI_TAG(id,num) ( (6*(num)+id) % tag_ub ) + +/*structs for quick look up */ +typedef struct +{ + int_t luptrj; + int_t lptrj; + int_t lib; +} local_l_blk_info_t; + +typedef struct +{ + int_t iuip; + int_t ruip; + int_t ljb; +} local_u_blk_info_t; + + +//global variable +extern double CPU_CLOCK_RATE; + +typedef struct +{ + int_t *perm_c_supno; + int_t *iperm_c_supno; +} perm_array_t; + +typedef struct +{ + int_t* factored; + int_t* factored_D; + int_t* factored_L; + int_t* factored_U; + int_t* IrecvPlcd_D; + int_t* IbcastPanel_L; /*I bcast and recv placed for the k-th L panel*/ + int_t* IbcastPanel_U; /*I bcast and recv placed for the k-th U panel*/ + int_t* numChildLeft; /*number of children left to be factored*/ + int_t* gpuLUreduced; /*New for GPU acceleration*/ +}factStat_t; + +typedef struct +{ + int_t next_col; + int_t next_k; + int_t kljb; + int_t kijb; + int_t copyL_kljb; + int_t copyU_kljb; + int_t l_copy_len; + int_t u_copy_len; + int_t *kindexL; + int_t *kindexU; + int_t mkrow; + int_t mkcol; + int_t ksup_size; +} d2Hreduce_t; + +typedef struct{ + int_t numChild; + int_t numDescendents; + int_t left; + int_t right; + int_t extra; + int_t* childrenList; + int_t depth; // distance from the top + double weight; // weight of the supernode + double iWeight; // weight of the whole subtree below + double scuWeight; // weight of schur complement update = max|n_k||L_k||U_k| +} treeList_t; + +typedef struct +{ + int_t numLvl; // number of level in tree; + int_t* eTreeTopLims; // boundaries of each level of size + int_t* myIperm; // Iperm for my tree size nsupers; + +} treeTopoInfo_t; + +typedef struct +{ + int_t* setree; // global supernodal elimination tree + int_t* numChildLeft; +} gEtreeInfo_t; + +typedef enum treePartStrat{ + ND, // nested dissection ordering or natural ordering + GD // greedy load balance stregy +}treePartStrat; + +typedef struct +{ + /* data */ + int_t nNodes; // total number of nodes + int_t* nodeList; // list of nodes, should be in order of factorization + int_t* treeHeads; + /*topological information about the tree*/ + int_t numLvl; // number of Topological levels in the forest + int_t numTrees; // number of tree in the forest + treeTopoInfo_t topoInfo; // + int_t* eTreeTopLims; // boundaries of each level of size + int_t* myIperm; // Iperm for my tree size nsupers; + + /*information about load balance*/ + double weight; // estimated cost + double cost; // 
measured cost + +} sForest_t; + +typedef struct +{ + /* data */ + MPI_Request* L_diag_blk_recv_req; + MPI_Request* L_diag_blk_send_req; + MPI_Request* U_diag_blk_recv_req; + MPI_Request* U_diag_blk_send_req; + MPI_Request* recv_req; + MPI_Request* recv_requ; + MPI_Request* send_req; + MPI_Request* send_requ; +} commRequests_t; + +typedef struct +{ + int_t *iperm_c_supno; + int_t *iperm_u; + int_t *perm_u; + int_t *indirect; + int_t *indirect2; + +} factNodelists_t; + +typedef struct +{ + int* msgcnt; + int* msgcntU; +} msgs_t; + +/*====================*/ /*********************************************************************** * Function prototypes @@ -685,11 +875,15 @@ struct superlu_pair extern "C" { #endif -extern void set_default_options_dist(superlu_dist_options_t *); extern void superlu_gridinit(MPI_Comm, int_t, int_t, gridinfo_t *); extern void superlu_gridmap(MPI_Comm, int_t, int_t, int_t [], int_t, gridinfo_t *); extern void superlu_gridexit(gridinfo_t *); +extern void superlu_gridinit3d(MPI_Comm Bcomm, int_t nprow, + int_t npcol, int_t npdep, gridinfo3d_t *grid) ; +extern void superlu_gridexit3d(gridinfo3d_t *grid); + +extern void set_default_options_dist(superlu_dist_options_t *); extern void print_options_dist(superlu_dist_options_t *); extern void print_sp_ienv_dist(superlu_dist_options_t *); extern void Destroy_CompCol_Matrix_dist(SuperMatrix *); @@ -789,6 +983,18 @@ extern int_t psymbfact_LUXpand_RL extern int_t psymbfact_prLUXpand (int_t, int_t, int, Llu_symbfact_t *, psymbfact_stat_t *); +#ifdef ISORT +extern void isort (int_t N, int_t *ARRAY1, int_t *ARRAY2); +extern void isort1 (int_t N, int_t *ARRAY); +#else +int superlu_sort_perm (const void *arg1, const void *arg2) +{ + const int_t *val1 = (const int_t *) arg1; + const int_t *val2 = (const int_t *) arg2; + return (*val2 < *val1); +} +#endif + #ifdef GPU_ACC /* GPU related */ extern void gemm_division_cpu_gpu (int *, int *, int *, int, int, int, int *, int); @@ -814,7 +1020,6 @@ extern int file_PrintInt10(FILE *, char *, int_t, int_t *); extern int file_PrintInt32(FILE *, char *, int, int *); extern int file_PrintLong10(FILE *, char *, int_t, int_t *); - /* Routines for Async_tree communication*/ #ifndef __SUPERLU_ASYNC_TREE /* allow multiple inclusions */ @@ -853,7 +1058,116 @@ extern yes_no_t StdList_Find(StdList lst, int_t dat); extern int_t StdList_Size(StdList lst); yes_no_t StdList_Empty(StdList lst); - +/*==== For 3D code ====*/ + +/* Manipulate counters */ +extern void SCT_init(SCT_t*); +extern void SCT_print(gridinfo_t *grid, SCT_t* SCT); +extern void SCT_print3D(gridinfo3d_t *grid3d, SCT_t* SCT); +extern void SCT_free(SCT_t*); + +extern treeList_t* setree2list(int_t nsuper, int_t* setree ); +// int_t calcTreeWeight(int_t nsupers, treeList_t* treeList, int_t* xsup); +extern int_t calcTreeWeight(int_t nsupers, int_t*setree, treeList_t* treeList, int_t* xsup); +extern int_t getDescendList(int_t k, int_t*dlist, treeList_t* treeList); +extern int_t getCommonAncestorList(int_t k, int_t* alist, int_t* seTree, treeList_t* treeList); +extern int_t getCommonAncsCount(int_t k, treeList_t* treeList); +extern int_t* getPermNodeList(int_t nnode, // number of nodes + int_t* nlist, int_t* perm_c_sup,int_t* iperm_c_sup); +extern int_t* getEtreeLB(int_t nnodes, int_t* perm_l, int_t* gTopOrder); +extern int_t* getSubTreeRoots(int_t k, treeList_t* treeList); +// int_t* treeList2perm(treeList_t* , ..); +extern int_t* merg_perms(int_t nperms, int_t* nnodes, int_t** perms); +// returns a concatenated permutation for three 
permutation arrays + +extern int_t* getGlobal_iperm(int_t nsupers, int_t nperms, int_t** perms, + int_t* nnodes); +extern int_t log2i(int_t index); +extern int_t *supernodal_etree(int_t nsuper, int_t * etree, int_t* supno, int_t *xsup); +extern int_t testSubtreeNodelist(int_t nsupers, int_t numList, int_t** nodeList, int_t* nodeCount); +extern int_t testListPerm(int_t nodeCount, int_t* nodeList, int_t* permList, int_t* gTopLevel); + +/*takes supernodal elimination tree and for each + supernode calculates "level" in elimination tree*/ +extern int_t* topological_ordering(int_t nsuper, int_t* setree); +extern int_t* Etree_LevelBoundry(int_t* perm,int_t* tsort_etree, int_t nsuper); + +/*calculated boundries of the topological levels*/ +extern int_t* calculate_num_children(int_t nsuper, int_t* setree); +extern void Print_EtreeLevelBoundry(int_t *Etree_LvlBdry, int_t max_level, int_t nsuper); +extern void print_etree_leveled(int_t *setree, int_t* tsort_etree, int_t nsuper); +extern void print_etree(int_t *setree, int_t* iperm, int_t nsuper); +extern int_t printFileList(char* sname, int_t nnodes, int_t*dlist, int_t*setree); +int* getLastDepBtree( int_t nsupers, treeList_t* treeList); + +/*returns array R with of size maxLevel with either 0 or 1 + R[i] = 1; then Tree[level-i] is set to zero= to only + accumulate the results */ +extern int_t* getReplicatedTrees( gridinfo3d_t* grid3d); + +/*returns indices in gNodeList of trees that belongs to my layer*/ +extern int_t* getGridTrees( gridinfo3d_t* grid3d); + + +/*returns global nodelist*/ +extern int_t** getNodeList(int_t maxLvl, int_t* setree, int_t* nnodes, + int_t* treeHeads, treeList_t* treeList); + +/* calculate number of nodes in subtrees starting from treeHead[i]*/ +extern int_t* calcNumNodes(int_t maxLvl, int_t* treeHeads, treeList_t* treeList); + +/*Returns list of (last) node of the trees */ +extern int_t* getTreeHeads(int_t maxLvl, int_t nsupers, treeList_t* treeList); + +extern int_t* getMyIperm(int_t nnodes, int_t nsupers, int_t* myPerm); + +extern int_t* getMyTopOrder(int_t nnodes, int_t* myPerm, int_t* myIperm, int_t* setree ); + +extern int_t* getMyEtLims(int_t nnodes, int_t* myTopOrder); + + +extern treeTopoInfo_t getMyTreeTopoInfo(int_t nnodes, int_t nsupers, + int_t* myPerm,int_t* setree); + +extern sForest_t** getNestDissForests( int_t maxLvl, int_t nsupers, int_t*setree, treeList_t* treeList); + +extern int_t** getTreePermForest( int_t* myTreeIdxs, int_t* myZeroTrIdxs, + sForest_t* sForests, + int_t* perm_c_supno, int_t* iperm_c_supno, + gridinfo3d_t* grid3d); +extern int_t** getTreePermFr( int_t* myTreeIdxs, + sForest_t** sForests, gridinfo3d_t* grid3d); +extern int_t* getMyNodeCountsFr(int_t maxLvl, int_t* myTreeIdxs, + sForest_t** sForests); +extern int_t** getNodeListFr(int_t maxLvl, sForest_t** sForests); +extern int_t* getNodeCountsFr(int_t maxLvl, sForest_t** sForests); +// int_t* getNodeToForstMap(int_t nsupers, sForest_t** sForests, gridinfo3d_t* grid3d); +extern int_t* getIsNodeInMyGrid(int_t nsupers, int_t maxLvl, int_t* myNodeCount, int_t** treePerm); +extern void printForestWeightCost(sForest_t** sForests, SCT_t* SCT, gridinfo3d_t* grid3d); +extern sForest_t** getGreedyLoadBalForests( int_t maxLvl, int_t nsupers, int_t* setree, treeList_t* treeList); +extern sForest_t** getForests( int_t maxLvl, int_t nsupers, int_t*setree, treeList_t* treeList); + + /* from trfAux.h */ +extern void set_tag_ub(); +extern int getNumThreads(int); +#if 0 // Sherry: conflicting with existing routine +extern int_t num_full_cols_U(int_t 
kk, int_t **Ufstnz_br_ptr, int_t *xsup, + gridinfo_t *, int_t *); +extern int_t estimate_bigu_size(int_t nsupers, int_t ldt, int_t**Ufstnz_br_ptr, + Glu_persist_t *, gridinfo_t*, int_t* perm_u); +#endif +extern int_t* getFactPerm(int_t); +extern int_t* getFactIperm(int_t*, int_t); + +extern int_t initCommRequests(commRequests_t* comReqs, gridinfo_t * grid); +extern int_t initFactStat(int_t nsupers, factStat_t* factStat); +extern int_t initFactNodelists(int_t, int_t, int_t, factNodelists_t*); +extern int_t initMsgs(msgs_t* msgs); +extern int_t getNumLookAhead(); +extern commRequests_t** initCommRequestsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* grid); +extern msgs_t** initMsgsArr(int_t numLA); + +/*=====================*/ #ifdef __cplusplus } diff --git a/SRC/util_dist.h b/SRC/util_dist.h index 96fcb99f..3456f96a 100644 --- a/SRC/util_dist.h +++ b/SRC/util_dist.h @@ -56,7 +56,11 @@ at the top-level directory. #define SUPERLU_MAX(x, y) ( (x) > (y) ? (x) : (y) ) #define SUPERLU_MIN(x, y) ( (x) < (y) ? (x) : (y) ) - +// allocating macros +#define MPI_REQ_ALLOC(x) ((MPI_Request *) SUPERLU_MALLOC ( (x) * sizeof (MPI_Request))) +#define INT_T_ALLOC(x) ((int_t *) SUPERLU_MALLOC ( (x) * sizeof (int_t))) +#define DOUBLE_ALLOC(x) ((double *) SUPERLU_MALLOC ( (x) * sizeof (double))) + /* * Constants */ @@ -68,6 +72,13 @@ at the top-level directory. #define TRUE (1) #endif +/*==== For 3D code ====*/ +#define MAX_3D_LEVEL 32 /*allows for z dimensions of 2^32*/ +#define CBLOCK 192 +#define CACHE_LINE_SIZE 8 +#define CSTEPPING 8 +/*=====================*/ + /* * Type definitions */ @@ -150,4 +161,127 @@ typedef struct { #define SuperLU_U_NZ_START(col) ( Ustore->colptr[col] ) #define SuperLU_U_SUB(ptr) ( Ustore->rowind[ptr] ) +/*********************************************************************** + * For 3D code */ +typedef struct +{ + int_t datatransfer_count; + int_t schurPhiCallCount; + int_t PhiMemCpyCounter; + double acc_load_imbal; + double LookAheadGEMMFlOp; + double PhiWaitTimer_2; + double LookAheadGEMMTimer; + double LookAheadRowSepTimer; + double LookAheadScatterTimer; + double GatherTimer ; + double GatherMOP ; + double scatter_mem_op_counter; + double LookAheadRowSepMOP ; + double scatter_mem_op_timer; + double schur_flop_counter; + double schur_flop_timer; + double CPUOffloadTimer; + double PhiWaitTimer; + double NetSchurUpTimer; + double AssemblyTimer; + double PhiMemCpyTimer; + double datatransfer_timer; + double LookAheadScatterMOP; + double schurPhiCallTimer; + double autotunetime; + double *Predicted_acc_sch_time; + double *Predicted_acc_gemm_time; + double *Predicted_acc_scatter_time; + + double trf2_flops; + double trf2_time; + double offloadable_flops; /*flops that can be done on ACC*/ + double offloadable_mops; /*mops that can be done on ACC*/ + + double *SchurCompUdtThreadTime; + double *Predicted_host_sch_time; + double *Measured_host_sch_time; + +#ifdef SCATTER_PROFILE + double *Host_TheadScatterMOP ; + double *Host_TheadScatterTimer; +#endif + +#ifdef OFFLOAD_PROFILE + double *Predicted_acc_scatter_time_strat1; + double *Predicted_host_sch_time_strat1; + size_t pci_transfer_count[18]; /*number of transfers*/ + double pci_transfer_time[18]; /*time for each transfer */ + double pci_transfer_prediction_error[18]; /*error in prediction*/ + double host_sch_time[24][CBLOCK / CSTEPPING][CBLOCK / CSTEPPING][CBLOCK / CSTEPPING]; /**/ + double host_sch_flop[24][CBLOCK / CSTEPPING][CBLOCK / CSTEPPING][CBLOCK / CSTEPPING]; /**/ +#endif + + double pdgstrs2_timer; + double pdgstrf2_timer; 
+
+    double lookaheadupdatetimer;
+    double pdgstrfTimer;
+
+// new timers for different wait times
+    // convention: the tl suffix refers to times measured from rdtsc,
+    // the td suffix refers to times measured in SuperLU_timer
+
+    /* diagonal block factorization; part of pdgstrf2; called from thread*/
+    // double Local_Dgstrf2_tl;
+    double *Local_Dgstrf2_Thread_tl;
+    /*wait for receiving U diagonal block: part of mpf*/
+    double Wait_UDiagBlock_Recv_tl;
+    /*wait for receiving L diagonal block: part of mpf*/
+    double Wait_LDiagBlock_Recv_tl;
+
+    /*Wait for U diagonal block to be received; part of pdgstrf2 */
+    double Recv_UDiagBlock_tl;
+    /*wait for previous U block send to finish; part of pdgstrf2 */
+    double Wait_UDiagBlockSend_tl;
+    /*after obtaining U block, time spent in calculating L panel*/
+    double L_PanelUpdate_tl;
+    /*Synchronous broadcast of the L and U panels*/
+    double Bcast_UPanel_tl;
+    double Bcast_LPanel_tl;
+    /*Wait for L send to finish */
+    double Wait_LSend_tl;
+
+    /*Wait for U send to finish */
+    double Wait_USend_tl;
+    /*Wait for U receive */
+    double Wait_URecv_tl;
+    /*Wait for L receive */
+    double Wait_LRecv_tl;
+
+    /*time to get lock*/
+    double *GetAijLock_Thread_tl;
+
+    /*U panel update*/
+    double PDGSTRS2_tl;
+
+    /*profiling by phases */
+    double Phase_Factor_tl;
+    double Phase_LU_Update_tl;
+    double Phase_SC_Update_tl;
+
+    /*3D timers*/
+    double ancsReduce;    /*timer for reducing ancestors before factorization*/
+    double gatherLUtimer; /*timer for gathering LU factors into the bottom layer*/
+    double tFactor3D[MAX_3D_LEVEL];
+    double tSchCompUdt3d[MAX_3D_LEVEL];
+
+    /*Async profiler timing*/
+    double tAsyncPipeTail;
+
+    /*tStartup: time before factorization starts*/
+    double tStartup;
+
+    /*keeping track of data sent*/
+    double commVolFactor;
+    double commVolRed;
+
+} SCT_t;
+
 #endif /* __SUPERLU_UTIL */

From 00a7d8f967419f6ce3052ac55ec472a948a5f10b Mon Sep 17 00:00:00 2001
From: Sherry Li
Date: Thu, 18 Apr 2019 12:49:48 -0700
Subject: [PATCH 002/147] Initial porting to Linux and Mac-OS.
--- CMakeLists.txt | 2 + EXAMPLE/Makefile | 6 +- SRC/Makefile | 12 +- SRC/pdgstrf.c | 11 - SRC/pdgstrf2.c | 478 ++++++++++++- SRC/pdgstrf_X1.c | 1347 ------------------------------------- SRC/superlu_ddefs.h | 132 ++-- SRC/superlu_defs.h | 55 +- SRC/superlu_dist_config.h | 17 +- SRC/superlu_grid.c | 2 +- SRC/util.c | 4 +- 11 files changed, 635 insertions(+), 1431 deletions(-) delete mode 100644 SRC/pdgstrf_X1.c diff --git a/CMakeLists.txt b/CMakeLists.txt index 83bc8c89..d02d457a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,8 +23,10 @@ option(enable_complex16 "Enable complex16 precision library" ON) option(enable_tests "Build tests" ON) option(enable_examples "Build examples" ON) option(TPL_ENABLE_BLASLIB "Build the CBLAS library" ${enable_blaslib_DEFAULT}) +option(TPL_BLAS_LIBRARIES "List of absolute paths to blas libraries [].") option(TPL_ENABLE_PARMETISLIB "Build the ParMETIS library" ON) option(TPL_ENABLE_LAPACKLIB "Enable LAPACK library" ON) +option(TPL_LAPACK_LIBRARIES "List of absolute paths to lapack libraries [].") option(TPL_PARMETIS_LIBRARIES "List of absolute paths to ParMETIS link libraries [].") option(TPL_PARMETIS_INCLUDE_DIRS "List of absolute paths to ParMETIS include directories [].") option(TPL_ENABLE_COMBBLASLIB "Build the CombBLAS library" OFF) diff --git a/EXAMPLE/Makefile b/EXAMPLE/Makefile index 999be731..ea6035b8 100644 --- a/EXAMPLE/Makefile +++ b/EXAMPLE/Makefile @@ -30,6 +30,7 @@ ####################################################################### include ../make.inc +DEXM3D = pddrive3d.o dcreate_matrix.o DEXM = pddrive.o dcreate_matrix.o sp_ienv.o #pdgstrf2.o #pdgssvx.o # pdgstrs_lsum_X1.o pdgstrf_X1.o @@ -55,7 +56,7 @@ ZEXMG3 = pzdrive3_ABglobal.o ZEXMG4 = pzdrive4_ABglobal.o -all: double complex16 +all: pddrive3d double complex16 double: pddrive pddrive1 pddrive2 pddrive3 pddrive4 \ pddrive_ABglobal pddrive1_ABglobal pddrive2_ABglobal \ @@ -65,6 +66,9 @@ complex16: pzdrive pzdrive1 pzdrive2 pzdrive3 pzdrive4 \ pzdrive_ABglobal pzdrive1_ABglobal pzdrive2_ABglobal \ pzdrive3_ABglobal pzdrive4_ABglobal +pddrive3d: $(DEXM3D) $(DSUPERLULIB) + $(LOADER) $(LOADOPTS) $(DEXM3D) $(LIBS) -lm -o $@ + pddrive: $(DEXM) $(DSUPERLULIB) $(LOADER) $(LOADOPTS) $(DEXM) $(LIBS) -lm -o $@ diff --git a/SRC/Makefile b/SRC/Makefile index 8e67da69..5519b74f 100644 --- a/SRC/Makefile +++ b/SRC/Makefile @@ -26,6 +26,16 @@ # ####################################################################### include ../make.inc + +FACT3D = pdgssvx3d.o pdgstrf3d.o ancFactorization.o treeFactorization.o \ + p3dcomm.o gather.o sec_structs.o trfCommWrapper.o trfAux.o \ + communication_aux.o superlu_grid3d.o \ + supernodal_etree.o supernodalForest.o xtrf3Dpartition.o scatter.o +# pdgstrs_vecpar.o + +# pddrive_params.o +# scatter.o + # # Precision independent routines # @@ -62,7 +72,7 @@ DPLUSRC = pdgssvx.o pdgssvx_ABglobal.o \ pdgstrf.o pdgstrf2.o pdGetDiagU.o \ pdgstrs.o pdgstrs1.o pdgstrs_lsum.o pdgstrs_Bglobal.o \ pdgsrfs.o pdgsmv.o pdgsrfs_ABXglobal.o pdgsmv_AXglobal.o \ - dreadtriple_noheader.o + dreadtriple_noheader.o $(FACT3D) # # Routines for double complex parallel SuperLU ZPLUSRC = pzgssvx.o pzgssvx_ABglobal.o \ diff --git a/SRC/pdgstrf.c b/SRC/pdgstrf.c index a4e9446a..8f5017bf 100644 --- a/SRC/pdgstrf.c +++ b/SRC/pdgstrf.c @@ -153,17 +153,6 @@ at the top-level directory. 
#define PDGSTRF2 pdgstrf2_trsm #define PDGSTRS2 pdgstrs2_omp -extern void PDGSTRF2 (superlu_dist_options_t *, int_t, int_t, double, - Glu_persist_t *, gridinfo_t *, LocalLU_t *, - MPI_Request *, int, SuperLUStat_t *, int *); -#ifdef _CRAY -extern void PDGSTRS2 (int_t, int_t, Glu_persist_t *, gridinfo_t *, - LocalLU_t *, SuperLUStat_t *, _fcd, _fcd, _fcd); -#else -extern void PDGSTRS2 (int_t, int_t, Glu_persist_t *, gridinfo_t *, - LocalLU_t *, SuperLUStat_t *); -#endif - #ifdef ISORT extern void isort (int_t N, int_t * ARRAY1, int_t * ARRAY2); extern void isort1 (int_t N, int_t * ARRAY); diff --git a/SRC/pdgstrf2.c b/SRC/pdgstrf2.c index 92759ad3..547de08f 100644 --- a/SRC/pdgstrf2.c +++ b/SRC/pdgstrf2.c @@ -7,8 +7,8 @@ All rights reserved. The source code is distributed under BSD license, see the file License.txt at the top-level directory. -*/ +*/ /*! @file * \brief Performs panel LU factorization. @@ -20,6 +20,18 @@ at the top-level directory. * * Modified: * September 30, 2017 + * March 31, 2019 version 7.0.0 + * + */ + +#include +#include "superlu_ddefs.h" +#include "cblas.h" + +/***************************************************************************** + * The following pdgstrf2_trsm is in version 6 and earlier. + *****************************************************************************/ +/*! \brief * *
  * Purpose
@@ -73,10 +85,6 @@ at the top-level directory.
  *             system of equations.
  * 
*/ - -#include -#include "superlu_ddefs.h" - /* This pdgstrf2 is based on TRSM function */ void pdgstrf2_trsm @@ -89,7 +97,7 @@ pdgstrf2_trsm int cols_left, iam, l, pkk, pr; int incx = 1, incy = 1; - int nsupr; /* number of rows in the block (LDA) */ + int nsupr; /* number of rows in the block (LDA) */ int nsupc; /* number of columns in the block */ int luptr; int_t i, myrow, krow, j, jfst, jlst, u_diag_cnt; @@ -303,9 +311,419 @@ pdgstrf2_trsm } /* PDGSTRF2_trsm */ +/***************************************************************************** + * The following functions are for the new pdgstrf2_dtrsm in the 3D code. + *****************************************************************************/ + +int_t LpanelUpdate(int_t off0, int_t nsupc, double* ublk_ptr, int_t ld_ujrow, + double* lusup, int_t nsupr, SCT_t* SCT) +{ + int_t l = nsupr - off0; + + unsigned long long t1 = _rdtsc(); + +#define GT 32 + #pragma omp parallel for + for (int i = 0; i < CEILING(l, GT); ++i) + { + int_t off = i * GT; + int_t len = SUPERLU_MIN(GT, l - i * GT); + cblas_dtrsm (CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, + len, nsupc, 1.0, ublk_ptr, ld_ujrow, &lusup[off0 + off], nsupr); + } + + t1 = _rdtsc() - t1; + + + SCT->trf2_flops += (double) l * (double)nsupc * (double)nsupc; + SCT->trf2_time += t1; + SCT->L_PanelUpdate_tl += t1; + return 0; + +} + +#pragma GCC push_options +#pragma GCC optimize ("O0") +/*factorizes the diagonal block; called from process that owns the (k,k) block*/ +void Local_Dgstrf2(superlu_dist_options_t *options, int_t k, double thresh, + double *BlockUFactor, /*factored U is over writen here*/ + Glu_persist_t *Glu_persist, gridinfo_t *grid, LocalLU_t *Llu, + SuperLUStat_t *stat, int *info, SCT_t* SCT) +{ + //unsigned long long t1 = _rdtsc(); + int_t *xsup = Glu_persist->xsup; + double alpha = -1, zero = 0.0; + + // printf("Entering dgetrf2 %d \n", k); + /* Initialization. */ + int_t lk = LBj (k, grid); /* Local block number */ + int_t jfst = FstBlockC (k); + int_t jlst = FstBlockC (k + 1); + double *lusup = Llu->Lnzval_bc_ptr[lk]; + int_t nsupc = SuperSize (k); + int_t nsupr; + if (Llu->Lrowind_bc_ptr[lk]) + nsupr = Llu->Lrowind_bc_ptr[lk][1]; + else + nsupr = 0; + double *ublk_ptr = BlockUFactor; + double *ujrow = BlockUFactor; + int_t luptr = 0; /* Point_t to the diagonal entries. */ + int_t cols_left = nsupc; /* supernode size */ + int_t u_diag_cnt = 0; + int_t ld_ujrow = nsupc; /* leading dimension of ujrow */ + int_t incx = 1; + int_t incy = ld_ujrow; + + for (int_t j = 0; j < jlst - jfst; ++j) /* for each column in panel */ + { + + /* Diagonal pivot */ + int_t i = luptr; + if (options->ReplaceTinyPivot == YES || lusup[i] == 0.0) + { + if (fabs (lusup[i]) < thresh) /* Diagonal */ + { + + /* Keep the new diagonal entry with the same sign. */ + if (lusup[i] < 0) + lusup[i] = -thresh; + else + lusup[i] = thresh; + ++(stat->TinyPivots); + } + } + + + + for (int_t l = 0; l < cols_left; ++l, i += nsupr, ++u_diag_cnt) + { + int_t st = j * ld_ujrow + j; + ublk_ptr[st + l * ld_ujrow] = lusup[i]; /* copy one row of U */ + + } + + if (ujrow[0] == zero) /* Test for singularity. */ + { + *info = j + jfst + 1; + } + else /* Scale the j-th column. */ + { + double temp = 1.0 / ujrow[0]; + for (int_t i = luptr + 1; i < luptr - j + nsupc; ++i) + lusup[i] *= temp; + stat->ops[FACT] += nsupc - j - 1; + } + + /* Rank-1 update of the trailing submatrix. 
*/ + if (--cols_left) + { + /*following must be int*/ + int_t l = nsupc - j - 1; + + + cblas_dger (CblasColMajor, l, cols_left, alpha, &lusup[luptr + 1], incx, + &ujrow[ld_ujrow], incy, &lusup[luptr + nsupr + 1], + nsupr); + stat->ops[FACT] += 2 * l * cols_left; + } + + ujrow = ujrow + ld_ujrow + 1; + luptr += nsupr + 1; /* move to next column */ + + } /* for column j ... first loop */ + + + //int_t thread_id = omp_get_thread_num(); + // SCT->Local_Dgstrf2_Thread_tl[thread_id * CACHE_LINE_SIZE] += (double) ( _rdtsc() - t1); +} + +#pragma GCC pop_options /************************************************************************/ +/*! \brief + * + *
+ * Purpose
+ * =======
+ *   Panel factorization -- block column k
+ *
+ *   Factor diagonal and subdiagonal blocks and test for exact singularity.
+ *   Only the column processes that own block column *k* participate
+ *   in the work.
+ *
+ * Arguments
+ * =========
+ * options (input) superlu_dist_options_t* (global)
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *
+ * nsupers (input) int_t (global)
+ *         Number of supernodes.
+ *
+ * k0     (input) int (global)
+ *        Counter of the next supernode to be factorized.
+ *
+ * k      (input) int (global)
+ *        The column number of the block column to be factorized.
+ *
+ * thresh (input) double (global)
+ *        The threshold value = s_eps * anorm.
+ *
+ * Glu_persist (input) Glu_persist_t*
+ *        Global data structures (xsup, supno) replicated on all processes.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * Llu    (input/output) LocalLU_t*
+ *        Local data structures to store distributed L and U matrices.
+ *
+ * U_diag_blk_send_req (input/output) MPI_Request*
+ *        List of send requests to send down the diagonal block of U.
+ *
+ * tag_ub (input) int
+ *        Upper bound of MPI tag values.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics about the factorization.
+ *        See SuperLUStat_t structure defined in util.h.
+ *
+ * info   (output) int*
+ *        = 0: successful exit
+ *        < 0: if info = -i, the i-th argument had an illegal value
+ *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
+ *             been completed, but the factor U is exactly singular,
+ *             and division by zero will occur if it is used to solve a
+ *             system of equations.
+ *
+ * SCT    (output) SCT_t*
+ *        Additional statistics used in the 3D algorithm.
+ *
+ * </pre>
+ */ +void pdgstrf2_dtrsm +(superlu_dist_options_t *options, int_t nsupers, + int_t k0, int_t k, double thresh, Glu_persist_t *Glu_persist, + gridinfo_t *grid, LocalLU_t *Llu, MPI_Request *U_diag_blk_send_req, + int tag_ub, SuperLUStat_t *stat, int *info, SCT_t *SCT) +{ + int cols_left, iam, pkk; + int incy = 1; + + int nsupr; /* number of rows in the block (LDA) */ + int luptr; + int_t myrow, krow, j, jfst, jlst, u_diag_cnt; + int_t nsupc; /* number of columns in the block */ + int_t *xsup = Glu_persist->xsup; + double *lusup; + double *ujrow, *ublk_ptr; /* pointer to the U block */ + int_t Pr; + + /* Quick return. */ + *info = 0; + + /* Initialization. */ + iam = grid->iam; + Pr = grid->nprow; + myrow = MYROW (iam, grid); + krow = PROW (k, grid); + pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid); + j = LBj (k, grid); /* Local block number */ + jfst = FstBlockC (k); + jlst = FstBlockC (k + 1); + lusup = Llu->Lnzval_bc_ptr[j]; + nsupc = SuperSize (k); + if (Llu->Lrowind_bc_ptr[j]) + nsupr = Llu->Lrowind_bc_ptr[j][1]; + else + nsupr = 0; + ublk_ptr = ujrow = Llu->ujrow; + + luptr = 0; /* Point to the diagonal entries. */ + cols_left = nsupc; /* supernode size */ + int ld_ujrow = nsupc; /* leading dimension of ujrow */ + u_diag_cnt = 0; + incy = ld_ujrow; + + if (U_diag_blk_send_req && U_diag_blk_send_req[myrow]) + { + /* There are pending sends - wait for all Isend to complete */ + Wait_UDiagBlockSend(U_diag_blk_send_req, grid, SCT); + + } + + if (iam == pkk) /* diagonal process */ + { + + /*factorize the diagonal block*/ + Local_Dgstrf2(options, k, thresh, Llu->ujrow, Glu_persist, + grid, Llu, stat, info, SCT); + ublk_ptr = ujrow = Llu->ujrow; + + + if (U_diag_blk_send_req && iam == pkk) /* Send the U block */ + { + ISend_UDiagBlock(k0, ublk_ptr, nsupc * nsupc, U_diag_blk_send_req, + grid, tag_ub); + U_diag_blk_send_req[krow] = (MPI_Request) TRUE; /* flag outstanding Isend */ + } + + LpanelUpdate(nsupc, nsupc, ublk_ptr, ld_ujrow, lusup, nsupr, SCT); + + } + else /* non-diagonal process */ + { + /* ================================================ * + * Receive the diagonal block of U * + * for panel factorization of L(:,k) * + * note: we block for panel factorization of L(:,k) * + * but panel factorization of U(:,k) don't * + * ================================================ */ + + Recv_UDiagBlock( k0, ublk_ptr, (nsupc * nsupc), krow, grid, SCT, tag_ub); + + if (nsupr > 0) + { + LpanelUpdate(0, nsupc, ublk_ptr, ld_ujrow, lusup, nsupr, SCT); + } + } /* end if pkk ... */ + +} /* pdgstrf2_dtrsm */ + +/***************************************************************************** + * The following functions are for the new pdgstrs2_omp in the 3D code. 
+ *****************************************************************************/ + +/* PDGSTRS2 helping kernels*/ + +int_t Trs2_GatherU(int_t iukp, int_t rukp, int_t klst, + int_t nsupc, int_t ldu, + int_t *usub, + double* uval, double *tempv) +{ + int_t ncols = 0; + for (int_t jj = iukp; jj < iukp + nsupc; ++jj) + { + int_t segsize = klst - usub[jj]; + if ( segsize ) + { + int_t lead_zero = ldu - segsize; + for (int_t i = 0; i < lead_zero; ++i) tempv[i] = 0.0; + tempv += lead_zero; + for (int_t i = 0; i < segsize; ++i) + tempv[i] = uval[rukp + i]; + rukp += segsize; + tempv += segsize; + ncols++; + } + } + + return ncols; +} + +int_t Trs2_ScatterU(int_t iukp, int_t rukp, int_t klst, + int_t nsupc, int_t ldu, + int_t *usub, + double* uval, double *tempv) +{ + for (int_t jj = 0; jj < nsupc; ++jj) + { + int_t segsize = klst - usub[iukp + jj]; + if (segsize) + { + int_t lead_zero = ldu - segsize; + tempv += lead_zero; + for (int i = 0; i < segsize; ++i) + { + + uval[rukp + i] = tempv[i]; + + } + tempv += segsize; + rukp += segsize; + + } + + + } /*for jj=0:nsupc */ + return 0; +} + +int_t Trs2_GatherTrsmScatter(int_t klst, int_t iukp, int_t rukp, + int_t *usub, + double* uval, double *tempv, + int_t knsupc, int_t nsupr, double*lusup, + Glu_persist_t *Glu_persist) /*glupersist for xsup for supersize*/ +{ + int_t *xsup = Glu_persist->xsup; + // int_t iukp = Ublock_info.iukp; + // int_t rukp = Ublock_info.rukp; + int_t gb = usub[iukp]; + int_t nsupc = SuperSize (gb); + iukp += UB_DESCRIPTOR; + + // printf("klst inside task%d\n", ); + /*find ldu */ + int_t ldu = 0; + for (int_t jj = iukp; jj < iukp + nsupc; ++jj) + { + ldu = SUPERLU_MAX( klst - usub[jj], ldu) ; + + } + + /*pack U block into a dense Block*/ + int_t ncols = Trs2_GatherU(iukp, rukp, klst, nsupc, ldu, usub, uval, tempv); + + + + /*now call dtrsm on packed dense block*/ + int_t luptr = (knsupc - ldu) * (nsupr + 1); + // if(ldu>nsupr) printf("nsupr %d ldu %d\n",nsupr,ldu ); + cblas_dtrsm (CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasUnit, + ldu, ncols, 1.0, &lusup[luptr], nsupr, tempv, ldu); + + /*now scatter the output into sparse U block*/ + Trs2_ScatterU(iukp, rukp, klst, nsupc, ldu, usub, uval, tempv); + + return 0; + +} + +int_t Trs2_InitUblock_info(int_t klst, int_t nb, + Ublock_info_t *Ublock_info, + int_t *usub, + Glu_persist_t *Glu_persist, SuperLUStat_t *stat ) +{ + int_t *xsup = Glu_persist->xsup; + int_t iukp, rukp; + iukp = BR_HEADER; + rukp = 0; + + for (int_t b = 0; b < nb; ++b) + { + int_t gb = usub[iukp]; + int_t nsupc = SuperSize (gb); + + Ublock_info[b].iukp = iukp; + Ublock_info[b].rukp = rukp; + // Ublock_info[b].nsupc = nsupc; + + iukp += UB_DESCRIPTOR; + for (int_t j = 0; j < nsupc; ++j) + { + int_t segsize = klst - usub[iukp++]; + rukp += segsize; + stat->ops[FACT] += segsize * (segsize + 1); + } + } + return 0; +} + +#if 1 +/***************************************************************************** + * The following pdgstrf2_omp is in version 6 and earlier. 
+ *****************************************************************************/ void pdgstrs2_omp -/************************************************************************/ (int_t k0, int_t k, Glu_persist_t * Glu_persist, gridinfo_t * grid, LocalLU_t * Llu, SuperLUStat_t * stat) { @@ -414,5 +832,49 @@ void pdgstrs2_omp __SSC_MARK(0x222); // stop SDE tracing #endif -} /* PDGSTRS2_omp */ +} /* pdgstrs2_omp */ + +#else /*==== Use the new version from Piyush ====*/ + +void pdgstrs2_omp(int_t m, int_t k0, int_t k, int_t* Lsub_buf, + double *Lval_buf, Glu_persist_t *Glu_persist, + gridinfo_t *grid, LocalLU_t *Llu, SuperLUStat_t *stat, + Ublock_info_t *Ublock_info, double *bigV, int_t ldt, SCT_t *SCT) +{ + unsigned long long t1 = _rdtsc(); + int_t *xsup = Glu_persist->xsup; + /* Quick return. */ + int_t lk = LBi (k, grid); /* Local block number */ + + if (!Llu->Unzval_br_ptr[lk]) return; + + /* Initialization. */ + int_t klst = FstBlockC (k + 1); + int_t knsupc = SuperSize (k); + int_t *usub = Llu->Ufstnz_br_ptr[lk]; /* index[] of block row U(k,:) */ + double *uval = Llu->Unzval_br_ptr[lk]; + int_t nb = usub[0]; + + int_t nsupr = Lsub_buf[1]; /* LDA of lusup[] */ + double *lusup = Lval_buf; + + /* Loop through all the row blocks. to get the iukp and rukp*/ + Trs2_InitUblock_info(klst, nb, Ublock_info, usub, Glu_persist, stat ); + + /* Loop through all the row blocks. */ + #pragma omp parallel for schedule(dynamic,2) + for (int_t b = 0; b < nb; ++b) + { + int_t thread_id = omp_get_thread_num(); + double *tempv = bigV + thread_id * ldt * ldt; + Trs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp, + usub, uval, tempv, + knsupc, nsupr, lusup, + Glu_persist); + } /* for b ... */ + + SCT->PDGSTRS2_tl += (double) ( _rdtsc() - t1); +} /* pdgstrs2_omp */ + +#endif diff --git a/SRC/pdgstrf_X1.c b/SRC/pdgstrf_X1.c deleted file mode 100644 index e02b74a0..00000000 --- a/SRC/pdgstrf_X1.c +++ /dev/null @@ -1,1347 +0,0 @@ -/*! \file -Copyright (c) 2003, The Regents of the University of California, through -Lawrence Berkeley National Laboratory (subject to receipt of any required -approvals from U.S. Dept. of Energy) - -All rights reserved. - -The source code is distributed under BSD license, see the file License.txt -at the top-level directory. -*/ -/*! @file - * \brief Performs the LU factorization in parallel - * - *
- * -- Distributed SuperLU routine (version 1.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * September 1, 1999
- *
- * Modified:
- *     February 7, 2001    use MPI_Isend/MPI_Irecv
- *
- * Sketch of the algorithm
- * =======================
- *
- * The following relations hold:
- *     * A_kk = L_kk * U_kk
- *     * L_ik = A_ik * U_kk^(-1)
- *     * U_kj = L_kk^(-1) * A_kj
- *
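- *   For example, eliminating the first block of a 2 x 2 block matrix
- *   gives
- *     A_11 = L_11 * U_11
- *     L_21 = A_21 * U_11^(-1)
- *     U_12 = L_11^(-1) * A_12
- *   and the trailing block is updated as A_22 := A_22 - L_21 * U_12.
- *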
- *              ----------------------------------
- *              |   |                            |
- *              ----|-----------------------------
- *              |   | \ U_kk|                    |
- *              |   |   \   |        U_kj        |
- *              |   |L_kk \ |         ||         |
- *              ----|-------|---------||----------
- *              |   |       |         \/         |
- *              |   |       |                    |
- *              |   |       |                    |
- *              |   |       |                    |
- *              |   | L_ik ==>       A_ij        |
- *              |   |       |                    |
- *              |   |       |                    |
- *              |   |       |                    |
- *              ----------------------------------
- *
- * Handle the first block of columns separately.
- *     * Factor diagonal and subdiagonal blocks and test for exact
- *       singularity. ( pdgstrf2(0), one column at a time )
- *     * Compute block row of U
- *     * Update trailing matrix
- * 
- * Loop over the remaining blocks of columns.
- *   mycol = MYCOL( iam, grid );
- *   myrow = MYROW( iam, grid );
- *   N = nsupers;
- *   For (k = 1; k < N; ++k) {
- *       krow = PROW( k, grid );
- *       kcol = PCOL( k, grid );
- *       Pkk = PNUM( krow, kcol, grid );
- *
- *     * Factor diagonal and subdiagonal blocks and test for exact
- *       singularity.
- *       if ( mycol == kcol ) {
- *           pdgstrf2(k), one column at a time 
- *       }
- *
- *     * Parallel triangular solve
- *       if ( iam == Pkk ) multicast L_k,k to this process row;
- *       if ( myrow == krow && mycol != kcol ) {
- *          Recv L_k,k from process Pkk;
- *          for (j = k+1; j < N; ++j) 
- *              if ( PCOL( j, grid ) == mycol && A_k,j != 0 )
- *                 U_k,j = L_k,k \ A_k,j;
- *       }
- *
- *     * Parallel rank-k update
- *       if ( myrow == krow ) multicast U_k,k+1:N to this process column;
- *       if ( mycol == kcol ) multicast L_k+1:N,k to this process row;
- *       if ( myrow != krow ) {
- *          Pkj = PNUM( krow, mycol, grid );
- *          Recv U_k,k+1:N from process Pkj;
- *       }
- *       if ( mycol != kcol ) {
- *          Pik = PNUM( myrow, kcol, grid );
- *          Recv L_k+1:N,k from process Pik;
- *       }
- *       for (j = k+1; j < N; ++j) {
- *          for (i = k+1; i < N; ++i) 
- *              if ( myrow == PROW( i, grid ) && mycol == PCOL( j, grid )
- *                   && L_i,k != 0 && U_k,j != 0 )
- *                 A_i,j = A_i,j - L_i,k * U_k,j;
- *       }
- *  }
- *
- *
- * Remaining issues
- *   (1) Use local indices for L subscripts and SPA.  [DONE]
- * 
- */ - -#include -#include "superlu_ddefs.h" -#define CRAY_X1 -#if ( VAMPIR>=1 ) -#include -#endif - -/* - * Internal prototypes - */ -static void pdgstrf2(superlu_options_t *, int_t, double, Glu_persist_t *, - gridinfo_t *, LocalLU_t *, SuperLUStat_t *, int *); -#ifdef _CRAY -static void pdgstrs2(int_t, int_t, Glu_persist_t *, gridinfo_t *, - LocalLU_t *, SuperLUStat_t *, _fcd, _fcd, _fcd); -#else -static void pdgstrs2(int_t, int_t, Glu_persist_t *, gridinfo_t *, - LocalLU_t *, SuperLUStat_t *); -#endif - -/* - - * - */ -/************************************************************************/ -/*! \brief - * - *
- * Purpose
- * =======
- *
- *  PDGSTRF performs the LU factorization in parallel.
- *
- * Arguments
- * =========
- * 
- * options (input) superlu_options_t*
- *         The structure defines the input parameters to control
- *         how the LU decomposition will be performed.
- *         The following field should be defined:
- *         o ReplaceTinyPivot (yes_no_t)
- *           Specifies whether to replace the tiny diagonals by
- *           sqrt(epsilon)*norm(A) during LU factorization.
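- *           (When a tiny diagonal u_ii is detected, it is replaced by
- *           +/- thresh with the sign of u_ii preserved; see pdgstrf2
- *           below.)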
- *
- * m      (input) int
- *        Number of rows in the matrix.
- *
- * n      (input) int
- *        Number of columns in the matrix.
- *
- * anorm  (input) double
- *        The norm of the original matrix A, or the scaled A if
- *        equilibration was done.
- *
- * LUstruct (input/output) LUstruct_t*
- *         The data structures to store the distributed L and U factors.
- *         The following fields should be defined:
- *
- *         o Glu_persist (input) Glu_persist_t*
- *           Global data structure (xsup, supno) replicated on all processes,
- *           describing the supernode partition in the factored matrices
- *           L and U:
- *	       xsup[s] is the leading column of the s-th supernode,
- *             supno[i] is the supernode number to which column i belongs.
- *
- *         o Llu (input/output) LocalLU_t*
- *           The distributed data structures to store L and U factors.
- *           See superlu_ddefs.h for the definition of 'LocalLU_t'.
- *
- * grid   (input) gridinfo_t*
- *        The 2D process mesh. It contains the MPI communicator, the number
- *        of process rows (NPROW), the number of process columns (NPCOL),
- *        and my process rank. It is an input argument to all the
- *        parallel routines.
- *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
- *        See superlu_ddefs.h for the definition of 'gridinfo_t'.
- *
- * stat   (output) SuperLUStat_t*
- *        Record the statistics on runtime and floating-point operation count.
- *        See util.h for the definition of 'SuperLUStat_t'.
- *
- * info   (output) int*
- *        = 0: successful exit
- *        < 0: if info = -i, the i-th argument had an illegal value
- *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
- *             been completed, but the factor U is exactly singular,
- *             and division by zero will occur if it is used to solve a
- *             system of equations.
- * 
- */ -void pdgstrf -/************************************************************************/ -( - superlu_options_t *options, int m, int n, double anorm, - LUstruct_t *LUstruct, gridinfo_t *grid, SuperLUStat_t *stat, int *info - ) - -{ -#ifdef _CRAY - _fcd ftcs = _cptofcd("N", strlen("N")); - _fcd ftcs1 = _cptofcd("L", strlen("L")); - _fcd ftcs2 = _cptofcd("N", strlen("N")); - _fcd ftcs3 = _cptofcd("U", strlen("U")); -#endif - double alpha = 1.0, beta = 0.0; - int_t *xsup; - int_t *lsub, *lsub1, *usub, *Usub_buf, - *Lsub_buf_2[2]; /* Need 2 buffers to implement Irecv. */ - double *lusup, *lusup1, *uval, *Uval_buf, - *Lval_buf_2[2]; /* Need 2 buffers to implement Irecv. */ - int_t fnz, i, ib, ijb, ilst, it, iukp, jb, jj, klst, knsupc, - lb, lib, ldv, ljb, lptr, lptr0, lptrj, luptr, luptr0, luptrj, - nlb, nub, nsupc, rel, rukp; - int_t Pc, Pr; - int iam, kcol, krow, mycol, myrow, pi, pj; - int j, k, lk, nsupers; - int nsupr, nbrow, segsize; - int msgcnt[4]; /* Count the size of the message xfer'd in each buffer: - * 0 : transferred in Lsub_buf[] - * 1 : transferred in Lval_buf[] - * 2 : transferred in Usub_buf[] - * 3 : transferred in Uval_buf[] - */ - int_t msg0, msg2; - int_t **Ufstnz_br_ptr, **Lrowind_bc_ptr; - double **Unzval_br_ptr, **Lnzval_bc_ptr; - int_t *index; - double *nzval; - int_t *iuip, *ruip;/* Pointers to U index/nzval; size ceil(NSUPERS/Pr). */ - double *ucol; - int_t *indirect; - double *tempv, *tempv2d; - int_t iinfo; - int_t *ToRecv, *ToSendD, **ToSendR; - Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - LocalLU_t *Llu = LUstruct->Llu; - superlu_scope_t *scp; - double s_eps, thresh; - double *tempU2d, *tempu; - int full, ldt, ldu, lead_zero, ncols; - MPI_Request recv_req[4], *send_req; - MPI_Status status; -#ifdef CRAY_X1 - int nonzero_segs; -#endif -#if ( DEBUGlevel>=2 ) - int_t num_copy=0, num_update=0; -#endif -#if ( PRNTlevel==3 ) - int_t zero_msg = 0, total_msg = 0; -#endif -#if ( PROFlevel>=1 ) - double t1, t2; - float msg_vol = 0, msg_cnt = 0; - int_t iword = sizeof(int_t), dword = sizeof(double); -#endif - - /* Test the input parameters. */ - *info = 0; - if ( m < 0 ) *info = -2; - else if ( n < 0 ) *info = -3; - if ( *info ) { - pxerbla("pdgstrf", grid, -*info); - return; - } - - /* Quick return if possible. */ - if ( m == 0 || n == 0 ) return; - - /* - * Initialization. 
- */ - iam = grid->iam; - Pc = grid->npcol; - Pr = grid->nprow; - myrow = MYROW( iam, grid ); - mycol = MYCOL( iam, grid ); - nsupers = Glu_persist->supno[n-1] + 1; - xsup = Glu_persist->xsup; - s_eps = slamch_("Epsilon"); - thresh = s_eps * anorm; - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter pdgstrf()"); -#endif - - stat->ops[FACT] = 0.0; - - if ( Pr*Pc > 1 ) { - i = Llu->bufmax[0]; - if ( !(Llu->Lsub_buf_2[0] = intMalloc_dist(2 * ((size_t)i))) ) - ABORT("Malloc fails for Lsub_buf."); - Llu->Lsub_buf_2[1] = Llu->Lsub_buf_2[0] + i; - i = Llu->bufmax[1]; - if ( !(Llu->Lval_buf_2[0] = doubleMalloc_dist(2 * ((size_t)i))) ) - ABORT("Malloc fails for Lval_buf[]."); - Llu->Lval_buf_2[1] = Llu->Lval_buf_2[0] + i; - if ( Llu->bufmax[2] != 0 ) - if ( !(Llu->Usub_buf = intMalloc_dist(Llu->bufmax[2])) ) - ABORT("Malloc fails for Usub_buf[]."); - if ( Llu->bufmax[3] != 0 ) - if ( !(Llu->Uval_buf = doubleMalloc_dist(Llu->bufmax[3])) ) - ABORT("Malloc fails for Uval_buf[]."); - if ( !(send_req = - (MPI_Request *) SUPERLU_MALLOC(2*Pc*sizeof(MPI_Request)))) - ABORT("Malloc fails for send_req[]."); - } - if ( !(Llu->ujrow = doubleMalloc_dist(sp_ienv_dist(3))) ) - ABORT("Malloc fails for ujrow[]."); - -#if ( PRNTlevel>=1 ) - if ( !iam ) { - printf(".. thresh = s_eps %e * anorm %e = %e\n", s_eps, anorm, thresh); - printf(".. Buffer size: Lsub %d\tLval %d\tUsub %d\tUval %d\tLDA %d\n", - Llu->bufmax[0], Llu->bufmax[1], - Llu->bufmax[2], Llu->bufmax[3], Llu->bufmax[4]); - } -#endif - - Lsub_buf_2[0] = Llu->Lsub_buf_2[0]; - Lsub_buf_2[1] = Llu->Lsub_buf_2[1]; - Lval_buf_2[0] = Llu->Lval_buf_2[0]; - Lval_buf_2[1] = Llu->Lval_buf_2[1]; - Usub_buf = Llu->Usub_buf; - Uval_buf = Llu->Uval_buf; - Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; - Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; - Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; - Unzval_br_ptr = Llu->Unzval_br_ptr; - ToRecv = Llu->ToRecv; - ToSendD = Llu->ToSendD; - ToSendR = Llu->ToSendR; - - ldt = sp_ienv_dist(3); /* Size of maximum supernode */ - if ( !(tempv2d = doubleCalloc_dist(2*((size_t)ldt)*ldt)) ) - ABORT("Calloc fails for tempv2d[]."); - tempU2d = tempv2d + ldt*ldt; -#ifdef CRAY_X1 - if ( !(indirect = intMalloc_dist(2*ldt)) ) - ABORT("Malloc fails for indirect[]."); -#else - if ( !(indirect = intMalloc_dist(ldt)) ) - ABORT("Malloc fails for indirect[]."); -#endif - k = CEILING( nsupers, Pr ); /* Number of local block rows */ - if ( !(iuip = intMalloc_dist(k)) ) - ABORT("Malloc fails for iuip[]."); - if ( !(ruip = intMalloc_dist(k)) ) - ABORT("Malloc fails for ruip[]."); - -#if ( VAMPIR>=1 ) - VT_symdef(1, "Send-L", "Comm"); - VT_symdef(2, "Recv-L", "Comm"); - VT_symdef(3, "Send-U", "Comm"); - VT_symdef(4, "Recv-U", "Comm"); - VT_symdef(5, "TRF2", "Factor"); - VT_symdef(100, "Factor", "Factor"); - VT_begin(100); - VT_traceon(); -#endif - - /* --------------------------------------------------------------- - Handle the first block column separately to start the pipeline. - --------------------------------------------------------------- */ - if ( mycol == 0 ) { -#if ( VAMPIR>=1 ) - VT_begin(5); -#endif - pdgstrf2(options, 0, thresh, Glu_persist, grid, Llu, stat, info); -#if ( VAMPIR>=1 ) - VT_end(5); -#endif - - scp = &grid->rscp; /* The scope of process row. */ - - /* Process column *kcol* multicasts numeric values of L(:,k) - to process rows. 
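-       (Two messages go to each destination column: msgcnt[0] counts the
-        integers in the index array lsub -- header, block descriptors,
-        and row indices -- while msgcnt[1] = lsub[1] * SuperSize(0)
-        counts the double values in lusup.)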
*/ - lsub = Lrowind_bc_ptr[0]; - lusup = Lnzval_bc_ptr[0]; - if ( lsub ) { - msgcnt[0] = lsub[1] + BC_HEADER + lsub[0]*LB_DESCRIPTOR; - msgcnt[1] = lsub[1] * SuperSize( 0 ); - } else { - msgcnt[0] = msgcnt[1] = 0; - } - - for (pj = 0; pj < Pc; ++pj) { - if ( ToSendR[0][pj] != EMPTY ) { -#if ( PROFlevel>=1 ) - TIC(t1); -#endif -#if ( VAMPIR>=1 ) - VT_begin(1); -#endif - MPI_Isend( lsub, msgcnt[0], mpi_int_t, pj, 0, scp->comm, - &send_req[pj] ); - MPI_Isend( lusup, msgcnt[1], MPI_DOUBLE, pj, 1, scp->comm, - &send_req[pj+Pc] ); -#if ( DEBUGlevel>=2 ) - printf("(%d) Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n", - iam, 0, msgcnt[0], msgcnt[1], pj); -#endif -#if ( VAMPIR>=1 ) - VT_end(1); -#endif -#if ( PROFlevel>=1 ) - TOC(t2, t1); - stat->utime[COMM] += t2; - msg_cnt += 2; - msg_vol += msgcnt[0]*iword + msgcnt[1]*dword; -#endif - } - } /* for pj ... */ - } else { /* Post immediate receives. */ - if ( ToRecv[0] >= 1 ) { /* Recv block column L(:,0). */ - scp = &grid->rscp; /* The scope of process row. */ - MPI_Irecv( Lsub_buf_2[0], Llu->bufmax[0], mpi_int_t, 0, - 0, scp->comm, &recv_req[0] ); - MPI_Irecv( Lval_buf_2[0], Llu->bufmax[1], MPI_DOUBLE, 0, - 1, scp->comm, &recv_req[1] ); -#if ( DEBUGlevel>=2 ) - printf("(%d) Post Irecv L(:,%4d)\n", iam, 0); -#endif - } - } /* if mycol == 0 */ - - /* ------------------------------------------ - MAIN LOOP: Loop through all block columns. - ------------------------------------------ */ - for (k = 0; k < nsupers; ++k) { - - knsupc = SuperSize( k ); - krow = PROW( k, grid ); - kcol = PCOL( k, grid ); - - if ( mycol == kcol ) { - lk = LBj( k, grid ); /* Local block number. */ - - for (pj = 0; pj < Pc; ++pj) { - /* Wait for Isend to complete before using lsub/lusup. */ - if ( ToSendR[lk][pj] != EMPTY ) { - MPI_Wait( &send_req[pj], &status ); - MPI_Wait( &send_req[pj+Pc], &status ); - } - } - lsub = Lrowind_bc_ptr[lk]; - lusup = Lnzval_bc_ptr[lk]; - } else { - if ( ToRecv[k] >= 1 ) { /* Recv block column L(:,k). */ - scp = &grid->rscp; /* The scope of process row. */ -#if ( PROFlevel>=1 ) - TIC(t1); -#endif -#if ( VAMPIR>=1 ) - VT_begin(2); -#endif - /*probe_recv(iam, kcol, (4*k)%NTAGS, mpi_int_t, scp->comm, - Llu->bufmax[0]);*/ - /*MPI_Recv( Lsub_buf, Llu->bufmax[0], mpi_int_t, kcol, - (4*k)%NTAGS, scp->comm, &status );*/ - MPI_Wait( &recv_req[0], &status ); - MPI_Get_count( &status, mpi_int_t, &msgcnt[0] ); - /*probe_recv(iam, kcol, (4*k+1)%NTAGS, MPI_DOUBLE, scp->comm, - Llu->bufmax[1]);*/ - /*MPI_Recv( Lval_buf, Llu->bufmax[1], MPI_DOUBLE, kcol, - (4*k+1)%NTAGS, scp->comm, &status );*/ - MPI_Wait( &recv_req[1], &status ); - MPI_Get_count( &status, MPI_DOUBLE, &msgcnt[1] ); -#if ( VAMPIR>=1 ) - VT_end(2); -#endif -#if ( PROFlevel>=1 ) - TOC(t2, t1); - stat->utime[COMM] += t2; -#endif -#if ( DEBUGlevel>=2 ) - printf("(%d) Recv L(:,%4d): lsub %4d, lusup %4d from Pc %2d\n", - iam, k, msgcnt[0], msgcnt[1], kcol); - fflush(stdout); -#endif - lsub = Lsub_buf_2[k%2]; - lusup = Lval_buf_2[k%2]; -#if ( PRNTlevel==3 ) - ++total_msg; - if ( !msgcnt[0] ) ++zero_msg; -#endif - } else msgcnt[0] = 0; - } /* if mycol = Pc(k) */ - - scp = &grid->cscp; /* The scope of process column. */ - - if ( myrow == krow ) { - /* Parallel triangular solve across process row *krow* -- - U(k,j) = L(k,k) \ A(k,j). */ -#ifdef _CRAY - pdgstrs2(n, k, Glu_persist, grid, Llu, stat, ftcs1, ftcs2, ftcs3); -#else - pdgstrs2(n, k, Glu_persist, grid, Llu, stat); -#endif - - /* Multicasts U(k,:) to process columns. 
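-          (Both message sizes come from the header of the U(k,:) index
-           array: usub[2] is the length of usub itself, sent as
-           msgcnt[2], and usub[1] is the number of nonzero values in
-           uval, sent as msgcnt[3].)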
*/ - lk = LBi( k, grid ); - usub = Ufstnz_br_ptr[lk]; - uval = Unzval_br_ptr[lk]; - if ( usub ) { - msgcnt[2] = usub[2]; - msgcnt[3] = usub[1]; - } else { - msgcnt[2] = msgcnt[3] = 0; - } - - if ( ToSendD[lk] == YES ) { - for (pi = 0; pi < Pr; ++pi) { - if ( pi != myrow ) { -#if ( PROFlevel>=1 ) - TIC(t1); -#endif -#if ( VAMPIR>=1 ) - VT_begin(3); -#endif - MPI_Send( usub, msgcnt[2], mpi_int_t, pi, - (4*k+2)%NTAGS, scp->comm); - MPI_Send( uval, msgcnt[3], MPI_DOUBLE, pi, - (4*k+3)%NTAGS, scp->comm); -#if ( VAMPIR>=1 ) - VT_end(3); -#endif -#if ( PROFlevel>=1 ) - TOC(t2, t1); - stat->utime[COMM] += t2; - msg_cnt += 2; - msg_vol += msgcnt[2]*iword + msgcnt[3]*dword; -#endif -#if ( DEBUGlevel>=2 ) - printf("(%d) Send U(%4d,:) to Pr %2d\n", iam, k, pi); -#endif - } /* if pi ... */ - } /* for pi ... */ - } /* if ToSendD ... */ - } else { /* myrow != krow */ - if ( ToRecv[k] == 2 ) { /* Recv block row U(k,:). */ -#if ( PROFlevel>=1 ) - TIC(t1); -#endif -#if ( VAMPIR>=1 ) - VT_begin(4); -#endif - /*probe_recv(iam, krow, (4*k+2)%NTAGS, mpi_int_t, scp->comm, - Llu->bufmax[2]);*/ - MPI_Recv( Usub_buf, Llu->bufmax[2], mpi_int_t, krow, - (4*k+2)%NTAGS, scp->comm, &status ); - MPI_Get_count( &status, mpi_int_t, &msgcnt[2] ); - /*probe_recv(iam, krow, (4*k+3)%NTAGS, MPI_DOUBLE, scp->comm, - Llu->bufmax[3]);*/ - MPI_Recv( Uval_buf, Llu->bufmax[3], MPI_DOUBLE, krow, - (4*k+3)%NTAGS, scp->comm, &status ); - MPI_Get_count( &status, MPI_DOUBLE, &msgcnt[3] ); -#if ( VAMPIR>=1 ) - VT_end(4); -#endif -#if ( PROFlevel>=1 ) - TOC(t2, t1); - stat->utime[COMM] += t2; -#endif - usub = Usub_buf; - uval = Uval_buf; -#if ( DEBUGlevel>=2 ) - printf("(%d) Recv U(%4d,:) from Pr %2d\n", iam, k, krow); -#endif -#if ( PRNTlevel==3 ) - ++total_msg; - if ( !msgcnt[2] ) ++zero_msg; -#endif - } else msgcnt[2] = 0; - } /* if myrow == Pr(k) */ - - /* - * Parallel rank-k update; pair up blocks L(i,k) and U(k,j). - * for (j = k+1; k < N; ++k) { - * for (i = k+1; i < N; ++i) - * if ( myrow == PROW( i, grid ) && mycol == PCOL( j, grid ) - * && L(i,k) != 0 && U(k,j) != 0 ) - * A(i,j) = A(i,j) - L(i,k) * U(k,j); - */ - msg0 = msgcnt[0]; - msg2 = msgcnt[2]; - if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */ - nsupr = lsub[1]; /* LDA of lusup. */ - if ( myrow == krow ) { /* Skip diagonal block L(k,k). */ - lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER+1]; - luptr0 = knsupc; - nlb = lsub[0] - 1; - } else { - lptr0 = BC_HEADER; - luptr0 = 0; - nlb = lsub[0]; - } - lptr = lptr0; - for (lb = 0; lb < nlb; ++lb) { /* Initialize block row pointers. */ - ib = lsub[lptr]; - lib = LBi( ib, grid ); - iuip[lib] = BR_HEADER; - ruip[lib] = 0; - lptr += LB_DESCRIPTOR + lsub[lptr+1]; - } - nub = usub[0]; /* Number of blocks in the block row U(k,:) */ - iukp = BR_HEADER; /* Skip header; Pointer to index[] of U(k,:) */ - rukp = 0; /* Pointer to nzval[] of U(k,:) */ - klst = FstBlockC( k+1 ); - - /* --------------------------------------------------- - Update the first block column A(:,k+1). - --------------------------------------------------- */ - jb = usub[iukp]; /* Global block number of block U(k,j). */ - if ( jb == k+1 ) { /* First update (k+1)-th block. */ - --nub; - lptr = lptr0; - luptr = luptr0; - ljb = LBj( jb, grid ); /* Local block number of U(k,j). */ - nsupc = SuperSize( jb ); - iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */ - - /* Prepare to call DGEMM. 
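-               (The scan below finds ldu, the height of the tallest
-                nonzero segment in block U(k,j), and ncols, the number
-                of nonzero columns.  full == 1 means every segment has
-                height ldu, so uval can be passed to DGEMM in place;
-                otherwise the segments are padded into tempU2d first.)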
*/ - jj = iukp; - while ( usub[jj] == klst ) ++jj; - ldu = klst - usub[jj++]; - ncols = 1; - full = 1; - for (; jj < iukp+nsupc; ++jj) { - segsize = klst - usub[jj]; - if ( segsize ) { - ++ncols; - if ( segsize != ldu ) full = 0; - if ( segsize > ldu ) ldu = segsize; - } - } -#if ( DEBUGlevel>=3 ) - ++num_update; -#endif - if ( full ) { - tempu = &uval[rukp]; - } else { /* Copy block U(k,j) into tempU2d. */ -#if ( DEBUGlevel>=3 ) - printf("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n", - iam, full, k, jb, ldu, ncols, nsupc); - ++num_copy; -#endif - tempu = tempU2d; - for (jj = iukp; jj < iukp+nsupc; ++jj) { - segsize = klst - usub[jj]; - if ( segsize ) { - lead_zero = ldu - segsize; - for (i = 0; i < lead_zero; ++i) tempu[i] = 0.0; - tempu += lead_zero; - for (i = 0; i < segsize; ++i) - tempu[i] = uval[rukp+i]; - rukp += segsize; - tempu += segsize; - } - } - tempu = tempU2d; - rukp -= usub[iukp - 1]; /* Return to start of U(k,j). */ - } /* if full ... */ - - for (lb = 0; lb < nlb; ++lb) { - ib = lsub[lptr]; /* Row block L(i,k). */ - nbrow = lsub[lptr+1]; /* Number of full rows. */ - lptr += LB_DESCRIPTOR; /* Skip descriptor. */ - tempv = tempv2d; -#ifdef _CRAY - SGEMM(ftcs, ftcs, &nbrow, &ncols, &ldu, &alpha, - &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, - tempu, &ldu, &beta, tempv, &ldt); -#else - dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, - &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, - tempu, &ldu, &beta, tempv, &ldt); -#endif - stat->ops[FACT] += 2 * nbrow * ldu * ncols; - - /* Now gather the result into the destination block. */ - if ( ib < jb ) { /* A(i,j) is in U. */ - ilst = FstBlockC( ib+1 ); - lib = LBi( ib, grid ); - index = Ufstnz_br_ptr[lib]; - ijb = index[iuip[lib]]; - while ( ijb < jb ) { /* Search for dest block. */ - ruip[lib] += index[iuip[lib]+1]; - iuip[lib] += UB_DESCRIPTOR + SuperSize( ijb ); - ijb = index[iuip[lib]]; - } - iuip[lib] += UB_DESCRIPTOR; /* Skip descriptor. */ - - tempv = tempv2d; - for (jj = 0; jj < nsupc; ++jj) { - segsize = klst - usub[iukp + jj]; - fnz = index[iuip[lib]++]; - if ( segsize ) { /* Nonzero segment in U(k.j). */ - ucol = &Unzval_br_ptr[lib][ruip[lib]]; - for (i = 0, it = 0; i < nbrow; ++i) { - rel = lsub[lptr + i] - fnz; - ucol[rel] -= tempv[it++]; - } - tempv += ldt; - } - ruip[lib] += ilst - fnz; - } - } else { /* A(i,j) is in L. */ - index = Lrowind_bc_ptr[ljb]; - ldv = index[1]; /* LDA of the dest lusup. */ - lptrj = BC_HEADER; - luptrj = 0; - ijb = index[lptrj]; - while ( ijb != ib ) { /* Search for dest block -- - blocks are not ordered! */ - luptrj += index[lptrj+1]; - lptrj += LB_DESCRIPTOR + index[lptrj+1]; - ijb = index[lptrj]; - } - /* - * Build indirect table. This is needed because the - * indices are not sorted. - */ - fnz = FstBlockC( ib ); - lptrj += LB_DESCRIPTOR; - for (i = 0; i < index[lptrj-1]; ++i) { - rel = index[lptrj + i] - fnz; - indirect[rel] = i; - } - nzval = Lnzval_bc_ptr[ljb] + luptrj; - tempv = tempv2d; - for (jj = 0; jj < nsupc; ++jj) { - segsize = klst - usub[iukp + jj]; - if ( segsize ) { -/*#pragma _CRI cache_bypass nzval,tempv*/ - for (it = 0, i = 0; i < nbrow; ++i) { - rel = lsub[lptr + i] - fnz; - nzval[indirect[rel]] -= tempv[it++]; - } - tempv += ldt; - } - nzval += ldv; - } - } /* if ib < jb ... */ - lptr += nbrow; - luptr += nbrow; - } /* for lb ... 
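-                 (Each GEMM result in tempv is scattered to its
-                  destination: into U(i,j) when ib < jb, or into L(i,j)
-                  through the indirect[] table, which is needed because
-                  the row indices of an L block are not sorted.)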
*/ - rukp += usub[iukp - 1]; /* Move to block U(k,j+1) */ - iukp += nsupc; - } /* if jb == k+1 */ - } /* if L(:,k) and U(k,:) not empty */ - - - if ( k+1 < nsupers ) { - kcol = PCOL( k+1, grid ); - if ( mycol == kcol ) { -#if ( VAMPIR>=1 ) - VT_begin(5); -#endif - /* Factor diagonal and subdiagonal blocks and test for exact - singularity. */ - pdgstrf2(options, k+1, thresh, Glu_persist, grid, Llu, stat, info); -#if ( VAMPIR>=1 ) - VT_end(5); -#endif - - /* Process column *kcol+1* multicasts numeric values of L(:,k+1) - to process rows. */ - lk = LBj( k+1, grid ); /* Local block number. */ - lsub1 = Lrowind_bc_ptr[lk]; - if ( lsub1 ) { - msgcnt[0] = lsub1[1] + BC_HEADER + lsub1[0]*LB_DESCRIPTOR; - msgcnt[1] = lsub1[1] * SuperSize( k+1 ); - } else { - msgcnt[0] = 0; - msgcnt[1] = 0; - } - scp = &grid->rscp; /* The scope of process row. */ - for (pj = 0; pj < Pc; ++pj) { - if ( ToSendR[lk][pj] != EMPTY ) { - lusup1 = Lnzval_bc_ptr[lk]; -#if ( PROFlevel>=1 ) - TIC(t1); -#endif -#if ( VAMPIR>=1 ) - VT_begin(1); -#endif - MPI_Isend( lsub1, msgcnt[0], mpi_int_t, pj, - (4*(k+1))%NTAGS, scp->comm, &send_req[pj] ); - MPI_Isend( lusup1, msgcnt[1], MPI_DOUBLE, pj, - (4*(k+1)+1)%NTAGS, scp->comm, &send_req[pj+Pc] ); -#if ( VAMPIR>=1 ) - VT_end(1); -#endif -#if ( PROFlevel>=1 ) - TOC(t2, t1); - stat->utime[COMM] += t2; - msg_cnt += 2; - msg_vol += msgcnt[0]*iword + msgcnt[1]*dword; -#endif -#if ( DEBUGlevel>=2 ) - printf("(%d) Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n", - iam, k+1, msgcnt[0], msgcnt[1], pj); -#endif - } - } /* for pj ... */ - } else { /* Post Recv of block column L(:,k+1). */ - if ( ToRecv[k+1] >= 1 ) { - scp = &grid->rscp; /* The scope of process row. */ - MPI_Irecv(Lsub_buf_2[(k+1)%2], Llu->bufmax[0], mpi_int_t, kcol, - (4*(k+1))%NTAGS, scp->comm, &recv_req[0]); - MPI_Irecv(Lval_buf_2[(k+1)%2], Llu->bufmax[1], MPI_DOUBLE, kcol, - (4*(k+1)+1)%NTAGS, scp->comm, &recv_req[1]); -#if ( DEBUGlevel>=2 ) - printf("(%d) Post Irecv L(:,%4d)\n", iam, k+1); -#endif - } - } /* if mycol == Pc(k+1) */ - } /* if k+1 < nsupers */ - - if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */ - /* --------------------------------------------------- - Update all other blocks using block row U(k,:) - --------------------------------------------------- */ - for (j = 0; j < nub; ++j) { - lptr = lptr0; - luptr = luptr0; - jb = usub[iukp]; /* Global block number of block U(k,j). */ - ljb = LBj( jb, grid ); /* Local block number of U(k,j). */ - nsupc = SuperSize( jb ); - iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */ - - /* Prepare to call DGEMM. */ - jj = iukp; - while ( usub[jj] == klst ) ++jj; - ldu = klst - usub[jj++]; - ncols = 1; - full = 1; - for (; jj < iukp+nsupc; ++jj) { - segsize = klst - usub[jj]; - if ( segsize ) { - ++ncols; - if ( segsize != ldu ) full = 0; - if ( segsize > ldu ) ldu = segsize; - } - } -#if ( DEBUGlevel>=3 ) - printf("(%d) full=%d,k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n", - iam, full, k, jb, ldu, ncols, nsupc); - ++num_update; -#endif - if ( full ) { - tempu = &uval[rukp]; - } else { /* Copy block U(k,j) into tempU2d. */ -#if ( DEBUGlevel>=3 ) - ++num_copy; -#endif - tempu = tempU2d; - for (jj = iukp; jj < iukp+nsupc; ++jj) { - segsize = klst - usub[jj]; - if ( segsize ) { - lead_zero = ldu - segsize; - for (i = 0; i < lead_zero; ++i) tempu[i] = 0.0; - tempu += lead_zero; - for (i = 0; i < segsize; ++i) - tempu[i] = uval[rukp+i]; - rukp += segsize; - tempu += segsize; - } - } - tempu = tempU2d; - rukp -= usub[iukp - 1]; /* Return to start of U(k,j). 
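-                       (usub[iukp-1], the entry just before the first
-                        fstnz subscript, holds the nonzero count of
-                        block U(k,j); subtracting it rewinds rukp to
-                        the first value of the block.)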
*/ - } /* if full ... */ - - for (lb = 0; lb < nlb; ++lb) { - ib = lsub[lptr]; /* Row block L(i,k). */ - nbrow = lsub[lptr+1]; /* Number of full rows. */ - lptr += LB_DESCRIPTOR; /* Skip descriptor. */ - tempv = tempv2d; -#ifdef _CRAY - SGEMM(ftcs, ftcs, &nbrow, &ncols, &ldu, &alpha, - &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, - tempu, &ldu, &beta, tempv, &ldt); -#else - dgemm_("N", "N", &nbrow, &ncols, &ldu, &alpha, - &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr, - tempu, &ldu, &beta, tempv, &ldt); -#endif - stat->ops[FACT] += 2 * nbrow * ldu * ncols; - - /* Now gather the result into the destination block. */ - if ( ib < jb ) { /* A(i,j) is in U. */ - ilst = FstBlockC( ib+1 ); - lib = LBi( ib, grid ); - index = Ufstnz_br_ptr[lib]; - ijb = index[iuip[lib]]; - while ( ijb < jb ) { /* Search for dest block. */ - ruip[lib] += index[iuip[lib]+1]; - iuip[lib] += UB_DESCRIPTOR + SuperSize( ijb ); - ijb = index[iuip[lib]]; - } - /* Skip descriptor. Now point to fstnz index of - block U(i,j). */ - iuip[lib] += UB_DESCRIPTOR; - - tempv = tempv2d; - for (jj = 0; jj < nsupc; ++jj) { - segsize = klst - usub[iukp + jj]; - fnz = index[iuip[lib]++]; - if ( segsize ) { /* Nonzero segment in U(k.j). */ - ucol = &Unzval_br_ptr[lib][ruip[lib]]; - for (i = 0 ; i < nbrow; ++i) { - rel = lsub[lptr + i] - fnz; - ucol[rel] -= tempv[i]; - } - tempv += ldt; - } - ruip[lib] += ilst - fnz; - } - } else { /* A(i,j) is in L. */ - index = Lrowind_bc_ptr[ljb]; - ldv = index[1]; /* LDA of the dest lusup. */ - lptrj = BC_HEADER; - luptrj = 0; - ijb = index[lptrj]; - while ( ijb != ib ) { /* Search for dest block -- - blocks are not ordered! */ - luptrj += index[lptrj+1]; - lptrj += LB_DESCRIPTOR + index[lptrj+1]; - ijb = index[lptrj]; - } - /* - * Build indirect table. This is needed because the - * indices are not sorted for the L blocks. - */ - fnz = FstBlockC( ib ); - lptrj += LB_DESCRIPTOR; - for (i = 0; i < index[lptrj-1]; ++i) { - rel = index[lptrj + i] - fnz; - indirect[rel] = i; - } - nzval = Lnzval_bc_ptr[ljb] + luptrj; - tempv = tempv2d; - for (jj = 0; jj < nsupc; ++jj) { - segsize = klst - usub[iukp + jj]; - if ( segsize ) { -/*#pragma _CRI cache_bypass nzval,tempv*/ - for (i = 0; i < nbrow; ++i) { - rel = lsub[lptr + i] - fnz; - nzval[indirect[rel]] -= tempv[i]; - } - tempv += ldt; - } - nzval += ldv; - } - } /* if ib < jb ... */ - lptr += nbrow; - luptr += nbrow; - } /* for lb ... */ - rukp += usub[iukp - 1]; /* Move to block U(k,j+1) */ - iukp += nsupc; - } /* for j ... */ - } /* if k L(:,k) and U(k,:) are not empty */ - - } - /* ------------------------------------------ - END MAIN LOOP: for k = ... - ------------------------------------------ */ - -#if ( VAMPIR>=1 ) - VT_end(100); - VT_traceoff(); -#endif - - if ( Pr*Pc > 1 ) { - SUPERLU_FREE(Lsub_buf_2[0]); /* also free Lsub_buf_2[1] */ - SUPERLU_FREE(Lval_buf_2[0]); /* also free Lval_buf_2[1] */ - if ( Llu->bufmax[2] != 0 ) SUPERLU_FREE(Usub_buf); - if ( Llu->bufmax[3] != 0 ) SUPERLU_FREE(Uval_buf); - SUPERLU_FREE(send_req); - } - - SUPERLU_FREE(Llu->ujrow); - SUPERLU_FREE(tempv2d); - SUPERLU_FREE(indirect); - SUPERLU_FREE(iuip); - SUPERLU_FREE(ruip); - - /* Prepare error message. 
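-       (The reduction below folds info across all processes: each
-        process first maps "no error" to n+1, MPI_MIN then selects the
-        smallest failing column index in the grid, and n+1 is mapped
-        back to 0 afterwards.)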
*/ - if ( *info == 0 ) *info = n + 1; -#if ( PROFlevel>=1 ) - TIC(t1); -#endif - MPI_Allreduce( info, &iinfo, 1, mpi_int_t, MPI_MIN, grid->comm ); -#if ( PROFlevel>=1 ) - TOC(t2, t1); - stat->utime[COMM] += t2; - { - float msg_vol_max, msg_vol_sum, msg_cnt_max, msg_cnt_sum; - - MPI_Reduce( &msg_cnt, &msg_cnt_sum, - 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); - MPI_Reduce( &msg_cnt, &msg_cnt_max, - 1, MPI_FLOAT, MPI_MAX, 0, grid->comm ); - MPI_Reduce( &msg_vol, &msg_vol_sum, - 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); - MPI_Reduce( &msg_vol, &msg_vol_max, - 1, MPI_FLOAT, MPI_MAX, 0, grid->comm ); - if ( !iam ) { - printf("\tPDGSTRF comm stat:" - "\tAvg\tMax\t\tAvg\tMax\n" - "\t\t\tCount:\t%.0f\t%.0f\tVol(MB)\t%.2f\t%.2f\n", - msg_cnt_sum/Pr/Pc, msg_cnt_max, - msg_vol_sum/Pr/Pc*1e-6, msg_vol_max*1e-6); - } - } -#endif - if ( iinfo == n + 1 ) *info = 0; - else *info = iinfo; - - -#if ( PRNTlevel==3 ) - MPI_Allreduce( &zero_msg, &iinfo, 1, mpi_int_t, MPI_SUM, grid->comm ); - if ( !iam ) printf(".. # msg of zero size\t%d\n", iinfo); - MPI_Allreduce( &total_msg, &iinfo, 1, mpi_int_t, MPI_SUM, grid->comm ); - if ( !iam ) printf(".. # total msg\t%d\n", iinfo); -#endif - -#if ( PRNTlevel==2 ) - for (i = 0; i < Pr * Pc; ++i) { - if ( iam == i ) { - dPrintLblocks(iam, nsupers, grid, Glu_persist, Llu); - dPrintUblocks(iam, nsupers, grid, Glu_persist, Llu); - printf("(%d)\n", iam); - PrintInt10("Recv", nsupers, Llu->ToRecv); - } - MPI_Barrier( grid->comm ); - } -#endif - -#if ( DEBUGlevel>=3 ) - printf("(%d) num_copy=%d, num_update=%d\n", iam, num_copy, num_update); -#endif -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Exit pdgstrf()"); -#endif -} /* PDGSTRF */ - - -/************************************************************************/ -/*! \brief - * - *
- * Purpose
- * =======
- *   Factor diagonal and subdiagonal blocks and test for exact singularity.
- *   Only the process column that owns block column *k* participates
- *   in the work.
- * 
- * Arguments
- * =========
- *
- * k      (input) int (global)
- *        The column number of the block column to be factorized.
- *
- * thresh (input) double (global)
- *        The threshold value = s_eps * anorm.
- *
- * Glu_persist (input) Glu_persist_t*
- *        Global data structures (xsup, supno) replicated on all processes.
- *
- * grid   (input) gridinfo_t*
- *        The 2D process mesh.
- *
- * Llu    (input/output) LocalLU_t*
- *        Local data structures to store distributed L and U matrices.
- *
- * stat   (output) SuperLUStat_t*
- *        Record the statistics about the factorization.
- *        See SuperLUStat_t structure defined in util.h.
- *
- * info   (output) int*
- *        = 0: successful exit
- *        < 0: if info = -i, the i-th argument had an illegal value
- *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
- *             been completed, but the factor U is exactly singular,
- *             and division by zero will occur if it is used to solve a
- *             system of equations.
- * 
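- * Method
- * ======
- *   The block column is factored one column at a time: the current row
- *   of U is broadcast down the process column, the subdiagonal part of
- *   the column is scaled by 1/u_jj, and the trailing columns of the
- *   panel are updated with a rank-1 update (dger).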
- */ -static void pdgstrf2 -/************************************************************************/ -( - superlu_options_t *options, - int_t k, double thresh, Glu_persist_t *Glu_persist, gridinfo_t *grid, - LocalLU_t *Llu, SuperLUStat_t *stat, int* info - ) - -{ - int c, iam, l, pkk; - int incx = 1, incy = 1; - int nsupr; /* number of rows in the block (LDA) */ - int luptr; - int_t i, krow, j, jfst, jlst; - int_t nsupc; /* number of columns in the block */ - int_t *xsup = Glu_persist->xsup; - double *lusup, temp; - double *ujrow; - double alpha = -1; - *info = 0; - - /* Quick return. */ - - /* Initialization. */ - iam = grid->iam; - krow = PROW( k, grid ); - pkk = PNUM( PROW(k, grid), PCOL(k, grid), grid ); - j = LBj( k, grid ); /* Local block number */ - jfst = FstBlockC( k ); - jlst = FstBlockC( k+1 ); - lusup = Llu->Lnzval_bc_ptr[j]; - nsupc = SuperSize( k ); - if ( Llu->Lrowind_bc_ptr[j] ) nsupr = Llu->Lrowind_bc_ptr[j][1]; - ujrow = Llu->ujrow; - - luptr = 0; /* Point to the diagonal entries. */ - c = nsupc; - for (j = 0; j < jlst - jfst; ++j) { - /* Broadcast the j-th row (nsupc - j) elements to - the process column. */ - if ( iam == pkk ) { /* Diagonal process. */ - i = luptr; - if ( options->ReplaceTinyPivot == YES || lusup[i] == 0.0 ) { - if ( fabs(lusup[i]) < thresh ) { /* Diagonal */ -#if ( PRNTlevel>=2 ) - printf("(%d) .. col %d, tiny pivot %e ", - iam, jfst+j, lusup[i]); -#endif - /* Keep the replaced diagonal with the same sign. */ - if ( lusup[i] < 0 ) lusup[i] = -thresh; - else lusup[i] = thresh; -#if ( PRNTlevel>=2 ) - printf("replaced by %e\n", lusup[i]); -#endif - ++(stat->TinyPivots); - } - } - for (l = 0; l < c; ++l, i += nsupr) ujrow[l] = lusup[i]; - } -#if 0 - dbcast_col(ujrow, c, pkk, UjROW, grid, &c); -#else - MPI_Bcast(ujrow, c, MPI_DOUBLE, krow, (grid->cscp).comm); - /*bcast_tree(ujrow, c, MPI_DOUBLE, krow, (24*k+j)%NTAGS, - grid, COMM_COLUMN, &c);*/ -#endif - -#if ( DEBUGlevel>=2 ) -if ( k == 3329 && j == 2 ) { - if ( iam == pkk ) { - printf("..(%d) k %d, j %d: Send ujrow[0] %e\n",iam,k,j,ujrow[0]); - } else { - printf("..(%d) k %d, j %d: Recv ujrow[0] %e\n",iam,k,j,ujrow[0]); - } -} -#endif - - if ( !lusup ) { /* Empty block column. */ - --c; - if ( ujrow[0] == 0.0 ) *info = j+jfst+1; - continue; - } - - /* Test for singularity. */ - if ( ujrow[0] == 0.0 ) { - *info = j+jfst+1; - } else { - /* Scale the j-th column of the matrix. */ - temp = 1.0 / ujrow[0]; - if ( iam == pkk ) { - for (i = luptr+1; i < luptr-j+nsupr; ++i) lusup[i] *= temp; - stat->ops[FACT] += nsupr-j-1; - } else { - for (i = luptr; i < luptr+nsupr; ++i) lusup[i] *= temp; - stat->ops[FACT] += nsupr; - } - } - - /* Rank-1 update of the trailing submatrix. */ - if ( --c ) { - if ( iam == pkk ) { - l = nsupr - j - 1; -#ifdef _CRAY - SGER(&l, &c, &alpha, &lusup[luptr+1], &incx, - &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr); -#else - dger_(&l, &c, &alpha, &lusup[luptr+1], &incx, - &ujrow[1], &incy, &lusup[luptr+nsupr+1], &nsupr); -#endif - stat->ops[FACT] += 2 * l * c; - } else { -#ifdef _CRAY - SGER(&nsupr, &c, &alpha, &lusup[luptr], &incx, - &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr); -#else - dger_(&nsupr, &c, &alpha, &lusup[luptr], &incx, - &ujrow[1], &incy, &lusup[luptr+nsupr], &nsupr); -#endif - stat->ops[FACT] += 2 * nsupr * c; - } - } - - /* Move to the next column. */ - if ( iam == pkk ) luptr += nsupr + 1; - else luptr += nsupr; - - } /* for j ... */ - -} /* PDGSTRF2 */ - - -/************************************************************************/ -/*! \brief - * - *
 
- * Purpose
- * =======
- *   Perform parallel triangular solves
- *           U(k,:) := L(k,k) \ A(k,:).
- *   Only the process column that owns block column *k* participates
- *   in the work.
- * 
- * Arguments
- * =========
- *
- * m      (input) int (global)
- *        Number of rows in the matrix.
- *
- * k      (input) int (global)
- *        The row number of the block row U(k,:) to be solved.
- *
- * Glu_persist (input) Glu_persist_t*
- *        Global data structures (xsup, supno) replicated on all processes.
- *
- * grid   (input) gridinfo_t*
- *        The 2D process mesh.
- *
- * Llu    (input/output) LocalLU_t*
- *        Local data structures to store distributed L and U matrices.
- *
- * stat   (output) SuperLUStat_t*
- *        Record the statistics about the factorization; 
- *        See SuperLUStat_t structure defined in util.h.
- * 
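- * Method
- * ======
- *   For each nonzero segment of the block row U(k,:), a unit
- *   lower-triangular solve (dtrsv) is performed against the
- *   corresponding trailing part of the diagonal block L(k,k).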
- */ -static void pdgstrs2 -/************************************************************************/ -#ifdef _CRAY -( - int_t m, int_t k, Glu_persist_t *Glu_persist, gridinfo_t *grid, - LocalLU_t *Llu, SuperLUStat_t *stat, _fcd ftcs1, _fcd ftcs2, _fcd ftcs3 - ) -#else -( - int_t m, int_t k, Glu_persist_t *Glu_persist, gridinfo_t *grid, - LocalLU_t *Llu, SuperLUStat_t *stat - ) -#endif - -{ - int iam, pkk; - int incx = 1; - int nsupr; /* number of rows in the block L(:,k) (LDA) */ - int segsize; - int_t nsupc; /* number of columns in the block */ - int_t luptr, iukp, rukp; - int_t b, gb, j, klst, knsupc, lk, nb; - int_t *xsup = Glu_persist->xsup; - int_t *usub; - double *lusup, *uval; - - /* Quick return. */ - lk = LBi( k, grid ); /* Local block number */ - if ( !Llu->Unzval_br_ptr[lk] ) return; - - /* Initialization. */ - iam = grid->iam; - pkk = PNUM( PROW(k, grid), PCOL(k, grid), grid ); - klst = FstBlockC( k+1 ); - knsupc = SuperSize( k ); - usub = Llu->Ufstnz_br_ptr[lk]; /* index[] of block row U(k,:) */ - uval = Llu->Unzval_br_ptr[lk]; - nb = usub[0]; - iukp = BR_HEADER; - rukp = 0; - if ( iam == pkk ) { - lk = LBj( k, grid ); - nsupr = Llu->Lrowind_bc_ptr[lk][1]; /* LDA of lusup[] */ - lusup = Llu->Lnzval_bc_ptr[lk]; - } else { - nsupr = Llu->Lsub_buf_2[k%2][1]; /* LDA of lusup[] */ - lusup = Llu->Lval_buf_2[k%2]; - } - - /* Loop through all the row blocks. */ - for (b = 0; b < nb; ++b) { - gb = usub[iukp]; - nsupc = SuperSize( gb ); - iukp += UB_DESCRIPTOR; - - /* Loop through all the segments in the block. */ - for (j = 0; j < nsupc; ++j) { - segsize = klst - usub[iukp++]; - if ( segsize ) { /* Nonzero segment. */ - luptr = (knsupc - segsize) * (nsupr + 1); -#ifdef _CRAY - STRSV(ftcs1, ftcs2, ftcs3, &segsize, &lusup[luptr], &nsupr, - &uval[rukp], &incx); -#else - dtrsv_("L", "N", "U", &segsize, &lusup[luptr], &nsupr, - &uval[rukp], &incx); -#endif - stat->ops[FACT] += segsize * (segsize + 1); - rukp += segsize; - } - } - } /* for b ... 
*/ - -} /* PDGSTRS2 */ - -static int -probe_recv(int iam, int source, int tag, MPI_Datatype datatype, MPI_Comm comm, - int buf_size) -{ - MPI_Status status; - int count; - - MPI_Probe( source, tag, comm, &status ); - MPI_Get_count( &status, datatype, &count ); - if ( count > buf_size ) { - printf("(%d) Recv'ed count %d > buffer size $d\n", - iam, count, buf_size); - exit(-1); - } - return 0; -} diff --git a/SRC/superlu_ddefs.h b/SRC/superlu_ddefs.h index a0dfe185..d91fa367 100644 --- a/SRC/superlu_ddefs.h +++ b/SRC/superlu_ddefs.h @@ -393,6 +393,14 @@ extern void pdgstrs_Bglobal(int_t, LUstruct_t *, gridinfo_t *, extern void pdgstrs(int_t, LUstruct_t *, ScalePermstruct_t *, gridinfo_t *, double *, int_t, int_t, int_t, int, SOLVEstruct_t *, SuperLUStat_t *, int *); +extern int_t pdReDistribute_B_to_X(double *B, int_t m_loc, int nrhs, int_t ldb, + int_t fst_row, int_t *ilsum, double *x, + ScalePermstruct_t *, Glu_persist_t *, + gridinfo_t *, SOLVEstruct_t *); + // Sherry: to be removed +extern void pdgstrs_vecpar(int_t, LUstruct_t *, ScalePermstruct_t *, gridinfo_t *, + double *, int_t, int_t, int_t, int, SOLVEstruct_t *, + SuperLUStat_t *, int *); extern void dlsum_fmod(double *, double *, double *, double *, int, int, int_t , int_t *, int_t, int_t, int_t, int_t *, gridinfo_t *, LocalLU_t *, @@ -407,12 +415,12 @@ extern void dlsum_fmod_inv(double *, double *, double *, double *, int_t *, gridinfo_t *, LocalLU_t *, SuperLUStat_t **, int_t *, int_t *, int_t, int_t, int_t, int_t, int, int); extern void dlsum_fmod_inv_master(double *, double *, double *, double *, - int, int, int_t , int_t *, int_t, - int_t *, gridinfo_t *, LocalLU_t *, - SuperLUStat_t **, int_t, int_t, int_t, int_t, int, int); + int, int, int_t , int_t *, int_t, int_t *, + gridinfo_t *, LocalLU_t *, SuperLUStat_t **, + int_t, int_t, int_t, int_t, int, int); extern void dlsum_bmod_inv(double *, double *, double *, double *, - int, int_t, int_t *, int_t *, Ucb_indptr_t **, - int_t **, int_t *, gridinfo_t *, LocalLU_t *, + int, int_t, int_t *, int_t *, Ucb_indptr_t **, + int_t **, int_t *, gridinfo_t *, LocalLU_t *, SuperLUStat_t **, int_t *, int_t *, int_t, int_t, int, int); extern void dlsum_bmod_inv_master(double *, double *, double *, double *, int, int_t, int_t *, int_t *, Ucb_indptr_t **, @@ -527,9 +535,13 @@ extern void dger_(int*, int*, double*, double*, int*, /*==== For 3D code ====*/ +extern void pdgssvx3d (superlu_dist_options_t *, SuperMatrix *, + ScalePermstruct_t *, double B[], int ldb, int nrhs, + gridinfo3d_t *, LUstruct_t *, SOLVEstruct_t *, + double *berr, SuperLUStat_t *, int *info); extern int_t pdgstrf3d(superlu_dist_options_t *, int m, int n, double anorm, - trf3Dpartition_t*, SCT_t *, LUstruct_t *, gridinfo3d_t *, - SuperLUStat_t *, int *); + trf3Dpartition_t*, SCT_t *, LUstruct_t *, + gridinfo3d_t *, SuperLUStat_t *, int *); extern int_t zSendLPanel(int_t, int_t, LUstruct_t*, gridinfo3d_t*, SCT_t*); extern int_t zRecvLPanel(int_t, int_t, double, double, double*, LUstruct_t*, gridinfo3d_t*, SCT_t* SCT); @@ -667,7 +679,7 @@ extern int_t gatherAllFactoredLUFr(int_t* myZeroTrIdxs, sForest_t* sForests, extern void pdgstrf2_trsm(superlu_dist_options_t *options, int_t, int_t, int_t k, double thresh, Glu_persist_t *, gridinfo_t *, LocalLU_t *, MPI_Request *U_diag_blk_send_req, - SuperLUStat_t *, int *info, SCT_t *); + int tag_ub, SuperLUStat_t *, int *info, SCT_t *); #ifdef _CRAY void pdgstrs2_omp (int_t, int_t, int_t, Glu_persist_t *, gridinfo_t *, LocalLU_t *, SuperLUStat_t *, _fcd, _fcd, _fcd); @@ -676,6 
+688,13 @@ void pdgstrs2_omp (int_t, int_t, int_t, int_t *, double*, Glu_persist_t *, gridi LocalLU_t *, SuperLUStat_t *, Ublock_info_t *, double *bigV, int_t ldt, SCT_t *SCT ); #endif +#else +extern void pdgstrf2_trsm(superlu_dist_options_t * options, int_t k0, int_t k, + double thresh, Glu_persist_t *, gridinfo_t *, + LocalLU_t *, MPI_Request *, int tag_ub, + SuperLUStat_t *, int *info); +extern void pdgstrs2_omp(int_t k0, int_t k, Glu_persist_t *, gridinfo_t *, + LocalLU_t *, SuperLUStat_t *); #endif // same routine names !!!!!!!! extern int_t LpanelUpdate(int_t off0, int_t nsupc, double* ublk_ptr, @@ -717,7 +736,7 @@ extern void pdgstrs2 extern void pdgstrf2(superlu_dist_options_t *, int_t nsupers, int_t k0, int_t k, double thresh, Glu_persist_t *, gridinfo_t *, - LocalLU_t *, MPI_Request *, SuperLUStat_t *, int *); + LocalLU_t *, MPI_Request *, int, SuperLUStat_t *, int *); /* from p3dcomm.h */ int_t AllocLlu(int_t nsupers, LUstruct_t * LUstruct, gridinfo3d_t* grid3d); @@ -817,6 +836,57 @@ int_t zRecvUPanel(int_t k, int_t sender, double alpha, double beta, double* Uval_buf, LUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT); + /* from communication_aux.h */ +extern int_t IBcast_LPanel (int_t k, int_t k0, int_t* lsub, double* lusup, + gridinfo_t *, int* msgcnt, MPI_Request *, + int_t **ToSendR, int_t *xsup, int ); +extern int_t Bcast_LPanel(int_t k, int_t k0, int_t* lsub, double* lusup, + gridinfo_t *, int* msgcnt, int_t **ToSendR, + int_t *xsup , SCT_t*, int); +extern int_t IBcast_UPanel(int_t k, int_t k0, int_t* usub, double* uval, + gridinfo_t *, int* msgcnt, MPI_Request *, + int_t *ToSendD, int ); +extern int_t Bcast_UPanel(int_t k, int_t k0, int_t* usub, double* uval, + gridinfo_t *, int* msgcnt, int_t *ToSendD, SCT_t*, int); +extern int_t Irecv_LPanel (int_t k, int_t k0, int_t* Lsub_buf, + double* Lval_buf, gridinfo_t *, + MPI_Request *, LocalLU_t *, int); +extern int_t Irecv_UPanel(int_t k, int_t k0, int_t* Usub_buf, double*, + LocalLU_t *, gridinfo_t*, MPI_Request *, int); +extern int_t Wait_LSend(int_t k, gridinfo_t *grid, int_t **ToSendR, + MPI_Request *s, SCT_t*); +extern int_t Wait_USend(MPI_Request *, gridinfo_t *, SCT_t *); +extern int_t Wait_URecv(MPI_Request *, int* msgcnt, SCT_t *); +extern int_t Check_LRecv(MPI_Request*, int* msgcnt); +extern int_t Wait_LRecv(MPI_Request*, int* msgcnt, int* msgcntsU, + gridinfo_t *, SCT_t*); +extern int_t ISend_UDiagBlock(int_t k0, double *ublk_ptr, int_t size, + MPI_Request *, gridinfo_t *, int); +extern int_t Recv_UDiagBlock(int_t k0, double *ublk_ptr, int_t size, + int_t src, gridinfo_t *, SCT_t*, int); +extern int_t Wait_UDiagBlockSend(MPI_Request *, gridinfo_t *, SCT_t *); +extern int_t Wait_LDiagBlockSend(MPI_Request *, gridinfo_t *, SCT_t *); +extern int_t PackLBlock(int_t k, double* Dest, Glu_persist_t *, + gridinfo_t *, LocalLU_t *); +extern int_t ISend_LDiagBlock(int_t k0, double *lblk_ptr, int_t size, + MPI_Request *, gridinfo_t *, int); +extern int_t IRecv_UDiagBlock(int_t k0, double *ublk_ptr, int_t size, + int_t src, MPI_Request *, gridinfo_t *, + SCT_t*, int); +extern int_t Wait_UDiagBlock_Recv(MPI_Request *, SCT_t *); +extern int_t Test_UDiagBlock_Recv(MPI_Request *, SCT_t *); +extern int_t IRecv_LDiagBlock(int_t k0, double *L_blk_ptr, int_t size, + int_t src, MPI_Request *, gridinfo_t*, SCT_t*, + int); +extern int_t Wait_LDiagBlock_Recv(MPI_Request *, SCT_t *); +extern int_t Test_LDiagBlock_Recv(MPI_Request *, SCT_t *); +#if (MPI_VERSION>2) +extern int_t IBcast_UDiagBlock(int_t k, double *ublk_ptr, int_t size, + 
MPI_Request *, gridinfo_t *); +extern int_t IBcast_LDiagBlock(int_t k, double *lblk_ptr, int_t size, + MPI_Request *, gridinfo_t *); +#endif + /* from trfCommWrapper.h */ extern int_t DiagFactIBCast(int_t k, int_t k0, double *BlockUFactor, double *BlockLFactor, @@ -824,7 +894,7 @@ extern int_t DiagFactIBCast(int_t k, int_t k0, MPI_Request *, MPI_Request *, gridinfo_t *, superlu_dist_options_t *, double thresh, LUstruct_t *LUstruct, SuperLUStat_t *, int *info, - SCT_t *); + SCT_t *, int tag_ub); extern int_t UPanelTrSolve( int_t k, double* BlockLFactor, double* bigV, int_t ldt, Ublock_info_t*, gridinfo_t *, LUstruct_t *, SuperLUStat_t *, SCT_t *); @@ -841,10 +911,10 @@ extern int_t IBcastRecvLPanel(int_t k, int_t k0, int* msgcnt, MPI_Request *, MPI_Request *, int_t* Lsub_buf, double* Lval_buf, int_t * factored, gridinfo_t *, LUstruct_t *, - SCT_t *); + SCT_t *, int tag_ub); extern int_t IBcastRecvUPanel(int_t k, int_t k0, int* msgcnt, MPI_Request *, MPI_Request *, int_t* Usub_buf, double* Uval_buf, - gridinfo_t *, LUstruct_t *, SCT_t *); + gridinfo_t *, LUstruct_t *, SCT_t *, int tag_ub); extern int_t WaitL(int_t k, int* msgcnt, int* msgcntU, MPI_Request *, MPI_Request *, gridinfo_t *, LUstruct_t *, SCT_t *); extern int_t WaitU(int_t k, int* msgcnt, MPI_Request *, MPI_Request *, @@ -852,6 +922,7 @@ extern int_t WaitU(int_t k, int* msgcnt, MPI_Request *, MPI_Request *, extern int_t LPanelTrSolve(int_t k, int_t* factored_L, double* BlockUFactor, gridinfo_t *, LUstruct_t *); + /* from trfAux.h */ extern int_t getNsupers(int, LUstruct_t *); extern int_t SchurComplementSetup(int_t k, int *msgcnt, Ublock_info_t*, @@ -880,27 +951,6 @@ extern int_t initScuBufs(int_t ldt, int_t num_threads, int_t nsupers, gridinfo_t * grid); extern int_t initPackLUInfo(int_t nsupers, packLUInfo_t* packLUInfo); -extern int_t ancestorFactor( - int_t ilvl, // level of factorization - sForest_t* sforest, - commRequests_t **comReqss, // lists of communication requests, size maxEtree level - scuBufs_t *scuBufs, // contains buffers for schur complement update - packLUInfo_t*packLUInfo, - msgs_t**msgss, // size=num Look ahead - LUValSubBuf_t**LUvsbs, // size=num Look ahead - diagFactBufs_t **dFBufs, // size maxEtree level - factStat_t *factStat, - factNodelists_t *fNlists, - gEtreeInfo_t* gEtreeInfo, // global etree info - superlu_dist_options_t *options, - int_t * gIperm_c_supno, - int_t ldt, - HyP_t* HyP, - LUstruct_t *LUstruct, gridinfo3d_t * grid3d, SuperLUStat_t *stat, - double thresh, SCT_t *SCT, - int *info -); - // the generic tree factoring code extern int_t treeFactor( int_t nnnodes, // number of nodes in the tree @@ -956,7 +1006,7 @@ extern int_t denseTreeFactor( int_t * gIperm_c_supno, int_t ldt, LUstruct_t *LUstruct, gridinfo3d_t * grid3d, SuperLUStat_t *stat, - double thresh, SCT_t *SCT, + double thresh, SCT_t *SCT, int tag_ub, int *info ); @@ -976,7 +1026,7 @@ extern int_t sparseTreeFactor_ASYNC( int_t ldt, HyP_t* HyP, LUstruct_t *LUstruct, gridinfo3d_t * grid3d, SuperLUStat_t *stat, - double thresh, SCT_t *SCT, + double thresh, SCT_t *SCT, int tag_ub, int *info ); extern LUValSubBuf_t** LluBufInitArr(int_t numLA, LUstruct_t *LUstruct); @@ -990,8 +1040,7 @@ extern int_t sDiagFactIBCast(int_t k, diagFactBufs_t *dFBuf, double thresh, LUstruct_t *LUstruct, SuperLUStat_t *stat, int *info, - SCT_t *SCT - ); + SCT_t *SCT, int tag_ub); extern int_t sLPanelUpdate( int_t k, diagFactBufs_t *dFBuf, factStat_t *factStat, commRequests_t *comReqs, @@ -1014,7 +1063,7 @@ extern int_t sIBcastRecvLPanel( msgs_t* msgs, 
factStat_t *factStat, gridinfo_t *grid, - LUstruct_t *LUstruct, SCT_t *SCT); + LUstruct_t *LUstruct, SCT_t *SCT, int tag_ub); extern int_t sIBcastRecvUPanel( int_t k, @@ -1023,7 +1072,7 @@ extern int_t sIBcastRecvUPanel( msgs_t* msgs, factStat_t *factStat, gridinfo_t *grid, - LUstruct_t *LUstruct, SCT_t *SCT); + LUstruct_t *LUstruct, SCT_t *SCT, int tag_ub); extern int_t sWaitL(int_t k, commRequests_t *comReqs, msgs_t* msgs, @@ -1062,7 +1111,7 @@ extern int_t sUPanelTrSolve( int_t k, LUstruct_t *LUstruct, SuperLUStat_t *stat, SCT_t *SCT); /* from ancFactorization.h */ -int_t ancestorFactor( +extern int_t ancestorFactor( int_t ilvl, // level of factorization sForest_t* sforest, commRequests_t **comReqss, // lists of communication requests // size maxEtree level @@ -1079,8 +1128,7 @@ int_t ancestorFactor( int_t ldt, HyP_t* HyP, LUstruct_t *LUstruct, gridinfo3d_t * grid3d, SuperLUStat_t *stat, - double thresh, SCT_t *SCT, - int *info + double thresh, SCT_t *SCT, int tag_ub, int *info ); /*=====================*/ diff --git a/SRC/superlu_defs.h b/SRC/superlu_defs.h index 9205f60b..fe267352 100644 --- a/SRC/superlu_defs.h +++ b/SRC/superlu_defs.h @@ -35,9 +35,9 @@ at the top-level directory. #include #endif -#ifdef _OPENMP +//#ifdef _OPENMP #include -#endif +//#endif #include #include @@ -47,6 +47,7 @@ at the top-level directory. //#include #include #include +//#include Sherry: not available on Mac OS // /* Following is for vtune */ // #if 0 // #include @@ -90,9 +91,14 @@ at the top-level directory. #define IFMT "%8d" #endif +/* This is defined in superlu_grid.c */ +extern MPI_Datatype SuperLU_MPI_DOUBLE_COMPLEX; + #ifdef __INTEL_COMPILER #include "mkl.h" + #else + //#include "cblas.h" #if 0 // Sherry: the following does not work with gcc on Linux. #define _mm_malloc(a,b) malloc(a) @@ -865,6 +871,33 @@ typedef struct int* msgcntU; } msgs_t; +typedef struct xtrsTimer_t +{ + double trsDataSendXY; + double trsDataSendZ; + double trsDataRecvXY; + double trsDataRecvZ; + double t_pdReDistribute_X_to_B; + double t_pdReDistribute_B_to_X; + double t_forwardSolve; + double tfs_compute; + double tfs_comm; + double t_backwardSolve; + double tbs_compute; + double tbs_comm; + double tbs_tree[2*MAX_3D_LEVEL]; + double tfs_tree[2*MAX_3D_LEVEL]; + + // counters for communication and computation volume + + int_t trsMsgSentXY; + int_t trsMsgSentZ; + int_t trsMsgRecvXY; + int_t trsMsgRecvZ; + + double ppXmem; // perprocess X-memory +} xtrsTimer_t; + /*====================*/ /*********************************************************************** @@ -1148,11 +1181,11 @@ extern sForest_t** getGreedyLoadBalForests( int_t maxLvl, int_t nsupers, int_t* extern sForest_t** getForests( int_t maxLvl, int_t nsupers, int_t*setree, treeList_t* treeList); /* from trfAux.h */ -extern void set_tag_ub(); +extern int set_tag_ub(); extern int getNumThreads(int); -#if 0 // Sherry: conflicting with existing routine extern int_t num_full_cols_U(int_t kk, int_t **Ufstnz_br_ptr, int_t *xsup, - gridinfo_t *, int_t *); + gridinfo_t *, int_t *, int_t *); +#if 0 // Sherry: conflicting with existing routine extern int_t estimate_bigu_size(int_t nsupers, int_t ldt, int_t**Ufstnz_br_ptr, Glu_persist_t *, gridinfo_t*, int_t* perm_u); #endif @@ -1167,6 +1200,18 @@ extern int_t getNumLookAhead(); extern commRequests_t** initCommRequestsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* grid); extern msgs_t** initMsgsArr(int_t numLA); + /* from sec_structs.h */ +extern int Cmpfunc_R_info (const void * a, const void * b); +extern int Cmpfunc_U_info 
(const void * a, const void * b); +extern int sort_R_info( Remain_info_t* Remain_info, int n ); +extern int sort_U_info( Ublock_info_t* Ublock_info, int n ); +extern int sort_R_info_elm( Remain_info_t* Remain_info, int n ); +extern int sort_U_info_elm( Ublock_info_t* Ublock_info, int n ); + + /* from pdgstrs.h */ +extern void printTRStimer(xtrsTimer_t *xtrsTimer, gridinfo3d_t *grid3d); +extern void initTRStimer(xtrsTimer_t *xtrsTimer, gridinfo_t *grid); + /*=====================*/ #ifdef __cplusplus diff --git a/SRC/superlu_dist_config.h b/SRC/superlu_dist_config.h index 425958a2..ec3d9f9a 100644 --- a/SRC/superlu_dist_config.h +++ b/SRC/superlu_dist_config.h @@ -1,18 +1,7 @@ -/* superlu_dist_config.h.in */ - -/* Enable parmetis */ +/* #define XSDK_INDEX_SIZE 64 */ +/* #define SLU_HAVE_LAPACK TRUE */ #define HAVE_PARMETIS TRUE - -/* Enable LAPACK */ -/* #undef SLU_HAVE_LAPACK */ - -/* Enable CombBLAS */ -/* #undef HAVE_COMBBLAS */ - -/* enable 64bit index mode */ -/* #undef XSDK_INDEX_SIZE */ - +/* #define HAVE_COMBBLAS TRUE */ #if (XSDK_INDEX_SIZE == 64) #define _LONGINT 1 - #endif diff --git a/SRC/superlu_grid.c b/SRC/superlu_grid.c index ee605999..d17b50b1 100644 --- a/SRC/superlu_grid.c +++ b/SRC/superlu_grid.c @@ -21,7 +21,7 @@ at the top-level directory. #include "superlu_ddefs.h" -/* Define global variables */ +/* Define global variable for MPI double complex derived data type */ MPI_Datatype SuperLU_MPI_DOUBLE_COMPLEX = MPI_DATATYPE_NULL; /*! \brief All processes in the MPI communicator must call this routine. diff --git a/SRC/util.c b/SRC/util.c index 225547d4..5fa742bd 100644 --- a/SRC/util.c +++ b/SRC/util.c @@ -1262,7 +1262,7 @@ arrive_at_ublock (int_t j, /* j-th block in a U panel */ * September 28, 2016. * Modified December 4, 2018. */ -static int_t num_full_cols_U +int_t num_full_cols_U ( int_t kk, int_t **Ufstnz_br_ptr, int_t *xsup, gridinfo_t *grid, int_t *perm_u, @@ -1288,6 +1288,8 @@ static int_t num_full_cols_U int_t temp_ncols = 0; int_t segsize; + *ldu = 0; + for (int_t j = 0; j < nub; ++j) { /* Sherry -- no need to search from beginning ?? */ From d7df9537ab4e652d57961e7f01118e97a6b14bff Mon Sep 17 00:00:00 2001 From: Sherry Li Date: Sat, 27 Apr 2019 23:14:11 -0700 Subject: [PATCH 003/147] First attepmt with 3D code, real working, complex not tested yet. 
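A minimal calling sketch for the new 3D driver (illustrative only, not
part of this patch): pdgssvx3d is declared in superlu_ddefs.h above,
while the grid routines superlu_gridinit3d/superlu_gridexit3d and the
2 x 2 x 2 grid shape are assumptions based on the new superlu_grid3d.c;
the init/free helpers are the existing 2D ones.

    #include "superlu_ddefs.h"

    /* Sketch: factor A and solve A*X = B with the 3D algorithm on an
     * assumed 2 x 2 x 2 process grid.  Error checking is omitted. */
    void solve_with_pdgssvx3d(MPI_Comm comm, SuperMatrix *A,
                              double *B, int ldb, int nrhs)
    {
        superlu_dist_options_t options;
        ScalePermstruct_t ScalePermstruct;
        LUstruct_t LUstruct;
        SOLVEstruct_t SOLVEstruct;
        SuperLUStat_t stat;
        gridinfo3d_t grid3d;
        double *berr;
        int info;

        /* Assumed 3D analogue of superlu_gridinit(): nprow, npcol, npdep. */
        superlu_gridinit3d(comm, 2, 2, 2, &grid3d);

        set_default_options_dist(&options);
        ScalePermstructInit(A->nrow, A->ncol, &ScalePermstruct);
        LUstructInit(A->ncol, &LUstruct);
        PStatInit(&stat);
        if ( !(berr = doubleMalloc_dist(nrhs)) )
            ABORT("Malloc fails for berr[].");

        /* Declared in superlu_ddefs.h by this patch. */
        pdgssvx3d(&options, A, &ScalePermstruct, B, ldb, nrhs,
                  &grid3d, &LUstruct, &SOLVEstruct, berr, &stat, &info);

        PStatFree(&stat);
        SUPERLU_FREE(berr);
        ScalePermstructFree(&ScalePermstruct);
        LUstructFree(&LUstruct);
        superlu_gridexit3d(&grid3d);
    }

With the default options (Fact = DOFACT) the single call above performs
both the factorization and the solve.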
---
 DoxyConfig | 2 +-
 EXAMPLE/dcreate_matrix.c | 1 -
 SRC/Makefile | 17 +-
 SRC/communication_aux.c | 229 ++++
 SRC/dcommunication_aux.c | 481 ++++++++++
 SRC/dgather.c | 381 ++++++++++
 SRC/dmemory_dist.c | 119 +++
 SRC/dtreeFactorization.c | 680 +++++++++++++++++
 SRC/dtrfAux.c | 651 ++++++++++++++++
 SRC/dtrfCommWrapper.c | 520 +++++++++++++
 SRC/pd3dcomm.c | 856 +++++++++++++++++++++
 SRC/pdgssvx3d.c | 1537 +++++++++++++++++++++++++++++++++++++
 SRC/pdgstrf.c | 34 +-
 SRC/pdgstrf2.c | 55 +-
 SRC/pdgstrf3d.c | 315 ++++++++
 SRC/pz3dcomm.c | 855 +++++++++++++++++++++
 SRC/pzgssvx3d.c | 1538 ++++++++++++++++++++++++++++++++++++++
 SRC/pzgstrf3d.c | 314 ++++++++
 SRC/scatter.h | 147 ++++
 SRC/sec_structs.c | 654 ++++++++++++++++
 SRC/superlu_ddefs.h | 269 +++----
 SRC/superlu_defs.h | 13 +-
 SRC/superlu_grid3d.c | 288 +++++++
 SRC/supernodalForest.c | 948 +++++++++++++++++++++++
 SRC/supernodal_etree.c | 1008 +++++++++++++++++++++++++
 SRC/treeFactorization.c | 366 +++++++++
 SRC/trfAux.c | 1221 ++++++++++++++++++++++++++++++
 SRC/util.c | 94 +++
 SRC/zcommunication_aux.c | 480 ++++++++++++
 SRC/zgather.c | 380 ++++++++++
 SRC/ztreeFactorization.c | 679 +++++++++++++++++
 SRC/ztrfAux.c | 650 ++++++++++++++++
 SRC/ztrfCommWrapper.c | 519 +++++++++++++
 33 files changed, 16102 insertions(+), 199 deletions(-)
 create mode 100644 SRC/communication_aux.c
 create mode 100644 SRC/dcommunication_aux.c
 create mode 100644 SRC/dgather.c
 create mode 100644 SRC/dtreeFactorization.c
 create mode 100644 SRC/dtrfAux.c
 create mode 100644 SRC/dtrfCommWrapper.c
 create mode 100644 SRC/pd3dcomm.c
 create mode 100644 SRC/pdgssvx3d.c
 create mode 100644 SRC/pdgstrf3d.c
 create mode 100644 SRC/pz3dcomm.c
 create mode 100644 SRC/pzgssvx3d.c
 create mode 100644 SRC/pzgstrf3d.c
 create mode 100644 SRC/scatter.h
 create mode 100644 SRC/sec_structs.c
 create mode 100644 SRC/superlu_grid3d.c
 create mode 100644 SRC/supernodalForest.c
 create mode 100644 SRC/supernodal_etree.c
 create mode 100644 SRC/treeFactorization.c
 create mode 100644 SRC/trfAux.c
 create mode 100644 SRC/zcommunication_aux.c
 create mode 100644 SRC/zgather.c
 create mode 100644 SRC/ztreeFactorization.c
 create mode 100644 SRC/ztrfAux.c
 create mode 100644 SRC/ztrfCommWrapper.c

diff --git a/DoxyConfig b/DoxyConfig
index 9496aa29..432628e3 100644
--- a/DoxyConfig
+++ b/DoxyConfig
@@ -31,7 +31,7 @@ PROJECT_NAME = SuperLU Distributed
 # This could be handy for archiving the generated documentation or
 # if some version control system is used.
 
-PROJECT_NUMBER = 5.4.0
+PROJECT_NUMBER = 7.0.0
 
 # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
 # base path where the generated documentation will be put.
diff --git a/EXAMPLE/dcreate_matrix.c b/EXAMPLE/dcreate_matrix.c
index 1784949b..8486f419 100644
--- a/EXAMPLE/dcreate_matrix.c
+++ b/EXAMPLE/dcreate_matrix.c
@@ -235,7 +235,6 @@ int dcreate_matrix(SuperMatrix *A, int nrhs, double **rhs,
 }
 
 
-
 int dcreate_matrix_postfix(SuperMatrix *A, int nrhs, double **rhs,
 int *ldb, double **x, int *ldx,
 FILE *fp, char * postfix, gridinfo_t *grid)
diff --git a/SRC/Makefile b/SRC/Makefile
index 5519b74f..00b76220 100644
--- a/SRC/Makefile
+++ b/SRC/Makefile
@@ -27,11 +27,8 @@
 #######################################################################
 include ../make.inc
 
-FACT3D = pdgssvx3d.o pdgstrf3d.o ancFactorization.o treeFactorization.o \
-	p3dcomm.o gather.o sec_structs.o trfCommWrapper.o trfAux.o \
-	communication_aux.o superlu_grid3d.o \
-	supernodal_etree.o supernodalForest.o xtrf3Dpartition.o scatter.o
-# pdgstrs_vecpar.o
+FACT3D = scatter.o
+# pdgstrs_vecpar.o ancFactorization.o
 # pddrive_params.o
 # scatter.o
 
@@ -45,6 +42,9 @@ ALLAUX = sp_ienv.o etree.o sp_colorder.o get_perm_c.o \
 	psymbfact.o psymbfact_util.o get_perm_c_parmetis.o mc64ad_dist.o \
 	static_schedule.o xerr_dist.o smach_dist.o dmach_dist.o \
 	superlu_dist_version.o TreeInterface.o
+# Following are from 3D code
+ALLAUX += superlu_grid3d.o supernodal_etree.o supernodalForest.o \
+	trfAux.o communication_aux.o treeFactorization.o sec_structs.o
 
 ifeq "${ACC}" "GPU"
 ALLAUX += cublas_utils.o
@@ -73,6 +73,10 @@ DPLUSRC = pdgssvx.o pdgssvx_ABglobal.o \
 	pdgstrs.o pdgstrs1.o pdgstrs_lsum.o pdgstrs_Bglobal.o \
 	pdgsrfs.o pdgsmv.o pdgsrfs_ABXglobal.o pdgsmv_AXglobal.o \
 	dreadtriple_noheader.o $(FACT3D)
+# from 3D code
+DPLUSRC += pdgssvx3d.o pdgstrf3d.o dtreeFactorization.o \
+	dgather.o pd3dcomm.o dtrfAux.o dcommunication_aux.o dtrfCommWrapper.o
+
 #
 # Routines for double complex parallel SuperLU
 ZPLUSRC = pzgssvx.o pzgssvx_ABglobal.o \
@@ -83,6 +87,9 @@ ZPLUSRC = pzgssvx.o pzgssvx_ABglobal.o \
 	pzgstrs.o pzgstrs1.o pzgstrs_lsum.o pzgstrs_Bglobal.o \
 	pzgsrfs.o pzgsmv.o pzgsrfs_ABXglobal.o pzgsmv_AXglobal.o \
 	zreadtriple_noheader.o
+# from 3D code
+ZPLUSRC += pzgssvx3d.o pzgstrf3d.o ztreeFactorization.o \
+	zgather.o pz3dcomm.o ztrfAux.o zcommunication_aux.o ztrfCommWrapper.o
 
 all: double complex16
 
diff --git a/SRC/communication_aux.c b/SRC/communication_aux.c
new file mode 100644
index 00000000..552b23fb
--- /dev/null
+++ b/SRC/communication_aux.c
@@ -0,0 +1,229 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+#include "superlu_defs.h"
+#if 0
+#include "sec_structs.h"
+#include "communication_aux.h"
+#include "compiler.h"
+#endif
+
+
+int_t Wait_LSend
+/*wait till broadcast of L finished*/
+(int_t k, gridinfo_t *grid, int_t **ToSendR, MPI_Request *send_req, SCT_t* SCT)
+{
+    unsigned long long t1 = _rdtsc();
+    int_t Pc = grid->npcol;
+    int_t iam = grid->iam;
+    int_t lk = LBj (k, grid);
+    int_t mycol = MYCOL (iam, grid);
+    MPI_Status status;
+    for (int_t pj = 0; pj < Pc; ++pj)
+    {
+        /* Wait for Isend to complete before using lsub/lusup.
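+           The send_req array holds 2*Pc outstanding requests: slot pj
+           tracks the Isend of the index array lsub to process column pj,
+           and slot pj + Pc tracks the matching Isend of the numerical
+           values lusup (posted in dIBcast_LPanel, dcommunication_aux.c).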
+        */
+        if (ToSendR[lk][pj] != EMPTY && pj != mycol)
+        {
+            MPI_Wait (&send_req[pj], &status);
+            MPI_Wait (&send_req[pj + Pc], &status);
+        }
+    }
+    SCT->Wait_LSend_tl += (double) ( _rdtsc() - t1);
+    return 0;
+}
+
+
+int_t Wait_USend
+/*wait till broadcast of U panels finished*/
+( MPI_Request *send_req, gridinfo_t *grid, SCT_t* SCT)
+{
+    unsigned long long t1 = _rdtsc();
+    int_t iam = grid->iam;
+    int_t Pr = grid->nprow;
+    int_t myrow = MYROW (iam, grid);
+    MPI_Status status;
+    for (int_t pi = 0; pi < Pr; ++pi)
+    {
+        if (pi != myrow)
+        {
+            MPI_Wait (&send_req[pi], &status);
+            MPI_Wait (&send_req[pi + Pr], &status);
+        }
+    }
+    SCT->Wait_USend_tl += (double) ( _rdtsc() - t1);
+    return 0;
+}
+
+
+int_t Check_LRecv
+/*checks if diagonal blocks have been received*/
+
+( MPI_Request* recv_req, int* msgcnt )
+{
+    int flag0, flag1;
+    MPI_Status status;
+
+    flag0 = flag1 = 0;
+    if (recv_req[0] != MPI_REQUEST_NULL)
+    {
+        MPI_Test (&recv_req[0], &flag0, &status);
+        if (flag0)
+        {
+            MPI_Get_count (&status, mpi_int_t, &msgcnt[0]);
+            recv_req[0] = MPI_REQUEST_NULL;
+        }
+    }
+    else
+        flag0 = 1;
+    if (recv_req[1] != MPI_REQUEST_NULL)
+    {
+        MPI_Test (&recv_req[1], &flag1, &status);
+        if (flag1)
+        {
+            MPI_Get_count (&status, mpi_int_t, &msgcnt[1]);
+            recv_req[1] = MPI_REQUEST_NULL;
+        }
+    }
+    else
+        flag1 = 1;
+
+    return flag1 && flag0;
+}
+
+
+int_t Wait_UDiagBlockSend(MPI_Request *U_diag_blk_send_req,
+                          gridinfo_t * grid, SCT_t* SCT)
+{
+
+    unsigned long long t1 = _rdtsc();
+    int_t iam = grid->iam;
+    int_t Pr = grid->nprow;
+    int_t myrow = MYROW (iam, grid);
+    MPI_Status status;
+    for (int_t pr = 0; pr < Pr; ++pr)
+    {
+        if (pr != myrow)
+        {
+            MPI_Wait (U_diag_blk_send_req + pr, &status);
+        }
+    }
+    SCT->Wait_UDiagBlockSend_tl += (double) ( _rdtsc() - t1);
+    return 0;
+}
+
+int_t Wait_LDiagBlockSend(MPI_Request *L_diag_blk_send_req,
+                          gridinfo_t * grid, SCT_t* SCT)
+{
+
+    unsigned long long t1 = _rdtsc();
+    int_t iam = grid->iam;
+    int_t Pc = grid->npcol;
+    int_t mycol = MYCOL (iam, grid);
+    MPI_Status status;
+    for (int_t pc = 0; pc < Pc; ++pc)
+    {
+        if (pc != mycol)
+        {
+            MPI_Wait (L_diag_blk_send_req + pc, &status);
+        }
+    }
+    SCT->Wait_UDiagBlockSend_tl += (double) ( _rdtsc() - t1);
+    return 0;
+}
+
+
+int_t Wait_UDiagBlock_Recv( MPI_Request *request, SCT_t* SCT)
+{
+    unsigned long long t1 = _rdtsc();
+    MPI_Status status;
+    MPI_Wait(request, &status);
+    SCT->Wait_UDiagBlock_Recv_tl += (double) ( _rdtsc() - t1);
+    return 0;
+
+}
+
+int_t Test_UDiagBlock_Recv( MPI_Request *request, SCT_t* SCT)
+{
+    unsigned long long t1 = _rdtsc();
+    MPI_Status status;
+    int flag;
+    MPI_Test(request,&flag, &status);
+    SCT->Wait_UDiagBlock_Recv_tl += (double) ( _rdtsc() - t1);
+    return flag;
+
+}
+
+int_t Wait_LDiagBlock_Recv( MPI_Request *request, SCT_t* SCT)
+{
+    unsigned long long t1 = _rdtsc();
+    MPI_Status status;
+    MPI_Wait(request, &status);
+    SCT->Wait_LDiagBlock_Recv_tl += (double) ( _rdtsc() - t1);
+    return 0;
+
+}
+
+int_t Test_LDiagBlock_Recv( MPI_Request *request, SCT_t* SCT)
+{
+    unsigned long long t1 = _rdtsc();
+    MPI_Status status;
+    int flag;
+    MPI_Test(request, &flag, &status);
+    SCT->Wait_LDiagBlock_Recv_tl += (double) ( _rdtsc() - t1);
+    return flag;
+
+}
+
+/*
+ * The following are from trfCommWrapper.c.
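+ * Wait_LUDiagSend below is a no-op on every process except the owner of
+ * the k-th diagonal block (iam == pkk), which must drain its diagonal-
+ * block Isends along both the row and column scopes before the factor
+ * buffers can be reused.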
+ */ +int_t Wait_LUDiagSend(int_t k, MPI_Request *U_diag_blk_send_req, + MPI_Request *L_diag_blk_send_req, + gridinfo_t *grid, SCT_t *SCT) +{ + // Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + // LocalLU_t *Llu = LUstruct->Llu; + // int_t* xsup = Glu_persist->xsup; + + int_t iam = grid->iam; + + int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid); + + if (iam == pkk) + { + Wait_UDiagBlockSend(U_diag_blk_send_req, grid, SCT); + Wait_LDiagBlockSend(L_diag_blk_send_req, grid, SCT); + } + + return 0; +} + + + +int_t LDiagBlockRecvWait( int_t k, int_t* factored_U, + MPI_Request * L_diag_blk_recv_req, + gridinfo_t *grid) +{ + int_t iam = grid->iam; + int_t myrow = MYROW (iam, grid); + int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid); + int_t krow = PROW (k, grid); + + /*factor the U panel*/ + if (myrow == krow && factored_U[k] == 0 && iam != pkk) + { + factored_U[k] = 1; + MPI_Status status; + MPI_Wait(L_diag_blk_recv_req, &status); + } + return 0; +} + diff --git a/SRC/dcommunication_aux.c b/SRC/dcommunication_aux.c new file mode 100644 index 00000000..8bc113b5 --- /dev/null +++ b/SRC/dcommunication_aux.c @@ -0,0 +1,481 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +#include "superlu_ddefs.h" +#if 0 +#include "sec_structs.h" +#include "communication_aux.h" +#include "compiler.h" +#endif + +int_t dIBcast_LPanel +/*broadcasts index array lsub and non-zero value + array lusup of a newly factored L column to my process row*/ +(int_t k, int_t k0, int_t* lsub, double* lusup, gridinfo_t *grid, + int* msgcnt, MPI_Request *send_req, int_t **ToSendR, int_t *xsup, + int tag_ub) +{ + int_t Pc = grid->npcol; + int_t lk = LBj (k, grid); + superlu_scope_t *scp = &grid->rscp; /* The scope of process row. */ + if (lsub) + { + msgcnt[0] = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR; + msgcnt[1] = lsub[1] * SuperSize (k); + } + else + { + msgcnt[0] = msgcnt[1] = 0; + } + + for (int_t pj = 0; pj < Pc; ++pj) + { + if (ToSendR[lk][pj] != EMPTY) + { + + + MPI_Isend (lsub, msgcnt[0], mpi_int_t, pj, + SLU_MPI_TAG (0, k0) /* 0 */ , + scp->comm, &send_req[pj]); + MPI_Isend (lusup, msgcnt[1], MPI_DOUBLE, pj, + SLU_MPI_TAG (1, k0) /* 1 */ , + scp->comm, &send_req[pj + Pc]); + + } + } + + return 0; +} + + +int_t dBcast_LPanel +/*broadcasts index array lsub and non-zero value + array lusup of a newly factored L column to my process row*/ +(int_t k, int_t k0, int_t* lsub, double* lusup, gridinfo_t *grid, + int* msgcnt, int_t **ToSendR, int_t *xsup , SCT_t* SCT, + int tag_ub) +{ + unsigned long long t1 = _rdtsc(); + int_t Pc = grid->npcol; + int_t lk = LBj (k, grid); + superlu_scope_t *scp = &grid->rscp; /* The scope of process row. 
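+       (This is the blocking variant of dIBcast_LPanel above: MPI_Send
+       replaces MPI_Isend, so no request array and no later Wait_LSend
+       call are needed.)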
+       */
+    if (lsub)
+    {
+        msgcnt[0] = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR;
+        msgcnt[1] = lsub[1] * SuperSize (k);
+    }
+    else
+    {
+        msgcnt[0] = msgcnt[1] = 0;
+    }
+
+    for (int_t pj = 0; pj < Pc; ++pj)
+    {
+        if (ToSendR[lk][pj] != EMPTY)
+        {
+
+
+            MPI_Send (lsub, msgcnt[0], mpi_int_t, pj,
+                      SLU_MPI_TAG (0, k0) /* 0 */ ,
+                      scp->comm);
+            MPI_Send (lusup, msgcnt[1], MPI_DOUBLE, pj,
+                      SLU_MPI_TAG (1, k0) /* 1 */ ,
+                      scp->comm);
+
+        }
+    }
+    SCT->Bcast_UPanel_tl += (double) ( _rdtsc() - t1);
+    return 0;
+}
+
+
+
+int_t dIBcast_UPanel
+/*asynchronously broadcasts U panel to my process row */
+(int_t k, int_t k0, int_t* usub, double* uval, gridinfo_t *grid,
+ int* msgcnt, MPI_Request *send_req_u, int_t *ToSendD, int tag_ub )
+{
+
+    int_t iam = grid->iam;
+    int_t lk = LBi (k, grid);
+    int_t Pr = grid->nprow;
+    int_t myrow = MYROW (iam, grid);
+    superlu_scope_t *scp = &grid->cscp;  /* The scope of process col. */
+    if (usub)
+    {
+        msgcnt[2] = usub[2];
+        msgcnt[3] = usub[1];
+    }
+    else
+    {
+        msgcnt[2] = msgcnt[3] = 0;
+    }
+
+    if (ToSendD[lk] == YES)
+    {
+        for (int_t pi = 0; pi < Pr; ++pi)
+        {
+            if (pi != myrow)
+            {
+
+                MPI_Isend (usub, msgcnt[2], mpi_int_t, pi,
+                           SLU_MPI_TAG (2, k0) /* (4*k0+2)%tag_ub */ ,
+                           scp->comm,
+                           &send_req_u[pi]);
+                MPI_Isend (uval, msgcnt[3], MPI_DOUBLE,
+                           pi, SLU_MPI_TAG (3, k0) /* (4*kk0+3)%tag_ub */ ,
+                           scp->comm,
+                           &send_req_u[pi + Pr]);
+
+            } /* if pi ... */
+        } /* for pi ... */
+    } /* if ToSendD ... */
+    return 0;
+}
+
+/*Synchronously broadcasts U panel to my process row */
+int_t dBcast_UPanel(int_t k, int_t k0, int_t* usub,
+                    double* uval, gridinfo_t *grid,
+                    int* msgcnt, int_t *ToSendD, SCT_t* SCT, int tag_ub)
+
+{
+    unsigned long long t1 = _rdtsc();
+    int_t iam = grid->iam;
+    int_t lk = LBi (k, grid);
+    int_t Pr = grid->nprow;
+    int_t myrow = MYROW (iam, grid);
+    superlu_scope_t *scp = &grid->cscp;  /* The scope of process col. */
+    if (usub)
+    {
+        msgcnt[2] = usub[2];
+        msgcnt[3] = usub[1];
+    }
+    else
+    {
+        msgcnt[2] = msgcnt[3] = 0;
+    }
+
+    if (ToSendD[lk] == YES)
+    {
+        for (int_t pi = 0; pi < Pr; ++pi)
+        {
+            if (pi != myrow)
+            {
+                MPI_Send (usub, msgcnt[2], mpi_int_t, pi,
+                          SLU_MPI_TAG (2, k0) /* (4*k0+2)%tag_ub */ ,
+                          scp->comm);
+                MPI_Send (uval, msgcnt[3], MPI_DOUBLE, pi,
+                          SLU_MPI_TAG (3, k0) /* (4*k0+3)%tag_ub */ ,
+                          scp->comm);
+
+            } /* if pi ... */
+        } /* for pi ... */
+    }
+    SCT->Bcast_UPanel_tl += (double) ( _rdtsc() - t1);
+    return 0;
+}
+
+int_t dIrecv_LPanel
+/*it places Irecv call for L panel*/
+(int_t k, int_t k0,  int_t* Lsub_buf, double* Lval_buf,
+ gridinfo_t *grid, MPI_Request *recv_req, LocalLU_t *Llu, int tag_ub )
+{
+    int_t kcol = PCOL (k, grid);
+
+    superlu_scope_t *scp = &grid->rscp;  /* The scope of process row. */
+    MPI_Irecv (Lsub_buf, Llu->bufmax[0], mpi_int_t, kcol,
+               SLU_MPI_TAG (0, k0) /* 0 */ ,
+               scp->comm, &recv_req[0]);
+    MPI_Irecv (Lval_buf, Llu->bufmax[1], MPI_DOUBLE, kcol,
+               SLU_MPI_TAG (1, k0) /* 1 */ ,
+               scp->comm, &recv_req[1]);
+    return 0;
+}
+
+
+int_t dIrecv_UPanel
+/*it places Irecv calls to receive U panels*/
+(int_t k, int_t k0, int_t* Usub_buf, double* Uval_buf, LocalLU_t *Llu,
+ gridinfo_t* grid, MPI_Request *recv_req_u, int tag_ub )
+{
+    int_t krow = PROW (k, grid);
+    superlu_scope_t *scp = &grid->cscp;  /* The scope of process column.
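+       The receives are posted with counts Llu->bufmax[2] and
+       Llu->bufmax[3], the precomputed upper bounds on U-panel index and
+       value message sizes, so the buffers fit any incoming panel.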
*/ + MPI_Irecv (Usub_buf, Llu->bufmax[2], mpi_int_t, krow, + SLU_MPI_TAG (2, k0) /* (4*kk0+2)%tag_ub */ , + scp->comm, &recv_req_u[0]); + MPI_Irecv (Uval_buf, Llu->bufmax[3], MPI_DOUBLE, krow, + SLU_MPI_TAG (3, k0) /* (4*kk0+3)%tag_ub */ , + scp->comm, &recv_req_u[1]); + + return 0; +} + +int_t dWait_URecv +( MPI_Request *recv_req, int* msgcnt, SCT_t* SCT) +{ + unsigned long long t1 = _rdtsc(); + MPI_Status status; + MPI_Wait (&recv_req[0], &status); + MPI_Get_count (&status, mpi_int_t, &msgcnt[2]); + MPI_Wait (&recv_req[1], &status); + MPI_Get_count (&status, MPI_DOUBLE, &msgcnt[3]); + SCT->Wait_URecv_tl += (double) ( _rdtsc() - t1); + return 0; +} + +int_t dWait_LRecv +/*waits till L blocks have been received*/ +( MPI_Request* recv_req, int* msgcnt, int* msgcntsU, gridinfo_t * grid, SCT_t* SCT) +{ + unsigned long long t1 = _rdtsc(); + MPI_Status status; + + if (recv_req[0] != MPI_REQUEST_NULL) + { + MPI_Wait (&recv_req[0], &status); + MPI_Get_count (&status, mpi_int_t, &msgcnt[0]); + recv_req[0] = MPI_REQUEST_NULL; + } + else + { + msgcnt[0] = msgcntsU[0]; + } + + if (recv_req[1] != MPI_REQUEST_NULL) + { + MPI_Wait (&recv_req[1], &status); + MPI_Get_count (&status, MPI_DOUBLE, &msgcnt[1]); + recv_req[1] = MPI_REQUEST_NULL; + } + else + { + msgcnt[1] = msgcntsU[1]; + } + SCT->Wait_LRecv_tl += (double) ( _rdtsc() - t1); + return 0; +} + + +int_t dISend_UDiagBlock(int_t k0, double *ublk_ptr, /*pointer for the diagonal block*/ + int_t size, /*number of elements to be broadcasted*/ + MPI_Request *U_diag_blk_send_req, + gridinfo_t * grid, int tag_ub) +{ + int_t iam = grid->iam; + int_t Pr = grid->nprow; + int_t myrow = MYROW (iam, grid); + MPI_Comm comm = (grid->cscp).comm; + /** ALWAYS SEND TO ALL OTHERS - TO FIX **/ + for (int_t pr = 0; pr < Pr; ++pr) + { + if (pr != myrow) + { + /* tag = ((k0<<2)+2) % tag_ub; */ + /* tag = (4*(nsupers+k0)+2) % tag_ub; */ + MPI_Isend (ublk_ptr, size, MPI_DOUBLE, pr, + SLU_MPI_TAG (4, k0) /* tag */ , + comm, U_diag_blk_send_req + pr); + } + } + + return 0; +} + + +int_t dRecv_UDiagBlock(int_t k0, double *ublk_ptr, /*pointer for the diagonal block*/ + int_t size, /*number of elements to be broadcasted*/ + int_t src, + gridinfo_t * grid, SCT_t* SCT, int tag_ub) +{ + unsigned long long t1 = _rdtsc(); + MPI_Status status; + MPI_Comm comm = (grid->cscp).comm; + /* tag = ((k0<<2)+2) % tag_ub; */ + /* tag = (4*(nsupers+k0)+2) % tag_ub; */ + + MPI_Recv (ublk_ptr, size, MPI_DOUBLE, src, + SLU_MPI_TAG (4, k0), comm, &status); + SCT->Recv_UDiagBlock_tl += (double) ( _rdtsc() - t1); + return 0; +} + + +int_t dPackLBlock(int_t k, double* Dest, Glu_persist_t *Glu_persist, + gridinfo_t *grid, LocalLU_t *Llu) +/*Copies src matrix into dest matrix*/ +{ + /* Initialization. 
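+       Dest must provide nsupc * nsupc doubles; the copy below extracts
+       the square diagonal block column by column from the local L panel,
+       whose leading dimension is nsupr.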
*/ + int_t *xsup = Glu_persist->xsup; + int_t lk = LBj (k, grid); /* Local block number */ + double *lusup = Llu->Lnzval_bc_ptr[lk]; + int_t nsupc = SuperSize (k); + int_t nsupr; + if (Llu->Lrowind_bc_ptr[lk]) + nsupr = Llu->Lrowind_bc_ptr[lk][1]; + else + nsupr = 0; +#if 0 + LAPACKE_dlacpy (LAPACK_COL_MAJOR, 'A', nsupc, nsupc, lusup, nsupr, Dest, nsupc); +#else /* Sherry */ + for (int j = 0; j < nsupc; ++j) { + memcpy( &Dest[j * nsupc], &lusup[j * nsupr], nsupc * sizeof(double) ); + } +#endif + + return 0; +} + +int_t dISend_LDiagBlock(int_t k0, double *lblk_ptr, /*pointer for the diagonal block*/ + int_t size, /*number of elements to be broadcasted*/ + MPI_Request *L_diag_blk_send_req, + gridinfo_t * grid, int tag_ub) +{ + int_t iam = grid->iam; + int_t Pc = grid->npcol; + int_t mycol = MYCOL (iam, grid); + MPI_Comm comm = (grid->rscp).comm; /*Row communicator*/ + /** ALWAYS SEND TO ALL OTHERS - TO FIX **/ + for (int_t pc = 0; pc < Pc; ++pc) + { + if (pc != mycol) + { + /* tag = ((k0<<2)+2) % tag_ub; */ + /* tag = (4*(nsupers+k0)+2) % tag_ub; */ + MPI_Isend (lblk_ptr, size, MPI_DOUBLE, pc, + SLU_MPI_TAG (5, k0) /* tag */ , + comm, L_diag_blk_send_req + pc); + + } + } + + return 0; +} + + +int_t dIRecv_UDiagBlock(int_t k0, double *ublk_ptr, /*pointer for the diagonal block*/ + int_t size, /*number of elements to be broadcasted*/ + int_t src, + MPI_Request *U_diag_blk_recv_req, + gridinfo_t * grid, SCT_t* SCT, int tag_ub) +{ + unsigned long long t1 = _rdtsc(); + MPI_Comm comm = (grid->cscp).comm; + /* tag = ((k0<<2)+2) % tag_ub; */ + /* tag = (4*(nsupers+k0)+2) % tag_ub; */ + + int_t err = MPI_Irecv (ublk_ptr, size, MPI_DOUBLE, src, + SLU_MPI_TAG (4, k0), comm, U_diag_blk_recv_req); + if (err==MPI_ERR_COUNT) + { + printf("Error in IRecv_UDiagBlock count\n"); + } + SCT->Recv_UDiagBlock_tl += (double) ( _rdtsc() - t1); + return 0; +} + +int_t dIRecv_LDiagBlock(int_t k0, double *L_blk_ptr, /*pointer for the diagonal block*/ + int_t size, /*number of elements to be broadcasted*/ + int_t src, + MPI_Request *L_diag_blk_recv_req, + gridinfo_t * grid, SCT_t* SCT, int tag_ub) +{ + unsigned long long t1 = _rdtsc(); + MPI_Comm comm = (grid->rscp).comm; + /* tag = ((k0<<2)+2) % tag_ub; */ + /* tag = (4*(nsupers+k0)+2) % tag_ub; */ + + int_t err = MPI_Irecv (L_blk_ptr, size, MPI_DOUBLE, src, + SLU_MPI_TAG (5, k0), + comm, L_diag_blk_recv_req); + if (err==MPI_ERR_COUNT) + { + printf("Error in IRecv_lDiagBlock count\n"); + } + SCT->Recv_UDiagBlock_tl += (double) ( _rdtsc() - t1); + return 0; +} + +#if (MPI_VERSION>2) + +/****Ibcast based on mpi ibcast****/ +int_t dIBcast_UDiagBlock(int_t k, double *ublk_ptr, /*pointer for the diagonal block*/ + int_t size, /*number of elements to be broadcasted*/ + MPI_Request *L_diag_blk_ibcast_req, + gridinfo_t * grid) +{ + int_t krow = PROW (k, grid); + MPI_Comm comm = (grid->cscp).comm; + + MPI_Ibcast(ublk_ptr, size, MPI_DOUBLE, krow,comm, L_diag_blk_ibcast_req); + + // MPI_Status status; + // MPI_Wait(L_diag_blk_ibcast_req, &status); + return 0; +} + +int_t dIBcast_LDiagBlock(int_t k, double *lblk_ptr, /*pointer for the diagonal block*/ + int_t size, /*number of elements to be broadcasted*/ + MPI_Request *U_diag_blk_ibcast_req, + gridinfo_t * grid) +{ + int_t kcol = PCOL (k, grid); + MPI_Comm comm = (grid->rscp).comm; + + MPI_Ibcast(lblk_ptr, size, MPI_DOUBLE, kcol,comm, U_diag_blk_ibcast_req); + // MPI_Status status; + // MPI_Wait(U_diag_blk_ibcast_req, &status); + return 0; +} + +#endif + +int_t dUDiagBlockRecvWait( int_t k, int_t* IrecvPlcd_D, int_t* factored_L, 
+ MPI_Request * U_diag_blk_recv_req, + gridinfo_t *grid, + LUstruct_t *LUstruct, SCT_t *SCT) +{ + LocalLU_t *Llu = LUstruct->Llu; + + int_t iam = grid->iam; + + int_t mycol = MYCOL (iam, grid); + int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid); + + int_t kcol = PCOL (k, grid); + + if (IrecvPlcd_D[k] == 1) + { + /* code */ + /*factor the L panel*/ + if (mycol == kcol && factored_L[k] == 0 && iam != pkk) + { + factored_L[k] = 1; + int_t lk = LBj (k, grid); + + int_t nsupr; + if (Llu->Lrowind_bc_ptr[lk]) + nsupr = Llu->Lrowind_bc_ptr[lk][1]; + else + nsupr = 0; + /*wait for communication to finish*/ + + // Wait_UDiagBlock_Recv( U_diag_blk_recv_req, SCT); + int_t flag = 0; + while (flag == 0) + { + flag = Test_UDiagBlock_Recv( U_diag_blk_recv_req, SCT); + } + } + } + return 0; +} + diff --git a/SRC/dgather.c b/SRC/dgather.c new file mode 100644 index 00000000..9ad1fe14 --- /dev/null +++ b/SRC/dgather.c @@ -0,0 +1,381 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +#include +#include "superlu_ddefs.h" +#if 0 +#include "scatter.h" +#include "sec_structs.h" +#include "superlu_defs.h" +#include "gather.h" +#endif + +int_t dprintMatrix(char*s, int n, int m, double* A, int LDA) +{ + printf("%s\n", s ); + for(int i=0; ixsup; + int_t knsupc = SuperSize (k); + int_t krow = PROW (k, grid); + int_t nlb, lptr0, luptr0; + int_t iam = grid->iam; + int_t myrow = MYROW (iam, grid); + + HyP->lookAheadBlk = 0, HyP->RemainBlk = 0; + + int_t nsupr = lsub[1]; /* LDA of lusup. */ + if (myrow == krow) /* Skip diagonal block L(k,k). */ + { + lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER + 1]; + luptr0 = knsupc; + nlb = lsub[0] - 1; + } + else + { + lptr0 = BC_HEADER; + luptr0 = 0; + nlb = lsub[0]; + } + // printf("nLb =%d ", nlb ); + + int_t lptr = lptr0; + int_t luptr = luptr0; + for (int_t i = 0; i < nlb; ++i) + { + ib = lsub[lptr]; /* Row block L(i,k). */ + temp_nbrow = lsub[lptr + 1]; /* Number of full rows. 
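+           Each row block L(ib,k) is classified below: it joins the
+           look-ahead set (lookAhead_info) when its elimination order
+           precedes first_u_block_acc, when it falls inside k's
+           look-ahead window, or when it is k's etree parent with one
+           child left; all other blocks go to Remain_info.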
*/ + + int_t look_up_flag = 1; + + // if elimination order is greater than first block stored on GPU + if (iperm_c_supno[ib] < HyP->first_u_block_acc) look_up_flag = 0; + + // if it myIperm[ib] is within look ahead window + if (myIperm[ib]< myIperm[k] + HyP->nCudaStreams && myIperm[ib]>0) look_up_flag = 0; + + if (k <= HyP->nsupers - 2 && gEtreeInfo->setree[k] > 0 ) + { + int_t k_parent = gEtreeInfo->setree[k]; + if (ib == k_parent && gEtreeInfo->numChildLeft[k_parent]==1 ) + { + look_up_flag = 0; + } + } + // look_up_flag = 0; + if (!look_up_flag) + { + /* ib is within look up window */ + HyP->lookAhead_info[HyP->lookAheadBlk].nrows = temp_nbrow; + if (HyP->lookAheadBlk == 0) + { + HyP->lookAhead_info[HyP->lookAheadBlk].FullRow = temp_nbrow; + } + else + { + HyP->lookAhead_info[HyP->lookAheadBlk].FullRow + = temp_nbrow + HyP->lookAhead_info[HyP->lookAheadBlk - 1].FullRow; + } + HyP->lookAhead_info[HyP->lookAheadBlk].StRow = cum_nrow; + HyP->lookAhead_info[HyP->lookAheadBlk].lptr = lptr; + HyP->lookAhead_info[HyP->lookAheadBlk].ib = ib; + HyP->lookAheadBlk++; + } + else + { + /* ib is not in look up window */ + HyP->Remain_info[HyP->RemainBlk].nrows = temp_nbrow; + if (HyP->RemainBlk == 0) + { + HyP->Remain_info[HyP->RemainBlk].FullRow = temp_nbrow; + } + else + { + HyP->Remain_info[HyP->RemainBlk].FullRow + = temp_nbrow + HyP->Remain_info[HyP->RemainBlk - 1].FullRow; + } + HyP->Remain_info[HyP->RemainBlk].StRow = cum_nrow; + HyP->Remain_info[HyP->RemainBlk].lptr = lptr; + HyP->Remain_info[HyP->RemainBlk].ib = ib; + HyP->RemainBlk++; + } + + cum_nrow += temp_nbrow; + + lptr += LB_DESCRIPTOR; /* Skip descriptor. */ + lptr += temp_nbrow; + luptr += temp_nbrow; + } + lptr = lptr0; + luptr = luptr0; + + dgather_l( HyP->lookAheadBlk, knsupc, HyP->lookAhead_info, + &lusup[luptr], nsupr, HyP->lookAhead_L_buff); + + dgather_l( HyP->RemainBlk, knsupc, HyP->Remain_info, + &lusup[luptr], nsupr, HyP->Remain_L_buff); + + assert(HyP->lookAheadBlk + HyP->RemainBlk ==nlb ); + HyP->Lnbrow = HyP->lookAheadBlk == 0 ? 0 : HyP->lookAhead_info[HyP->lookAheadBlk - 1].FullRow; + HyP->Rnbrow = HyP->RemainBlk == 0 ? 
0 : HyP->Remain_info[HyP->RemainBlk - 1].FullRow; + + // dprintMatrix("LookAhead Block", HyP->Lnbrow, knsupc, HyP->lookAhead_L_buff, HyP->Lnbrow); + // dprintMatrix("Remaining Block", HyP->Rnbrow, knsupc, HyP->Remain_L_buff, HyP->Rnbrow); +} + +// void Rgather_U(int_t k, +// HyP_t *HyP, +// int_t st, int_t end, +// int_t *usub, double *uval, double *bigU, +// Glu_persist_t *Glu_persist, gridinfo_t *grid, +// int_t *perm_u) + +void dRgather_U( int_t k, int_t jj0, int_t *usub, double *uval, + double *bigU, gEtreeInfo_t* gEtreeInfo, + Glu_persist_t *Glu_persist, gridinfo_t *grid, HyP_t *HyP, + int_t* myIperm, int_t *iperm_c_supno, int_t *perm_u) +{ + HyP->ldu = 0; + HyP->num_u_blks = 0; + HyP->ldu_Phi = 0; + HyP->num_u_blks_Phi = 0; + + int_t iukp = BR_HEADER; /* Skip header; Pointer to index[] of U(k,:) */ + int_t rukp = 0; /* Pointer to nzval[] of U(k,:) */ + int_t nub = usub[0]; /* Number of blocks in the block row U(k,:) */ + int_t *xsup = Glu_persist->xsup; + // int_t k = perm_c_supno[k0]; + int_t klst = FstBlockC (k + 1); + int_t iukp0 = iukp; + int_t rukp0 = rukp; + int_t jb, ljb; + int_t nsupc; + int_t full = 1; + int_t full_Phi = 1; + int_t temp_ncols = 0; + int_t segsize; + HyP->num_u_blks = 0; + HyP->ldu = 0; + + for (int_t j = jj0; j < nub; ++j) + { + temp_ncols = 0; + arrive_at_ublock( + j, &iukp, &rukp, &jb, &ljb, &nsupc, + iukp0, rukp0, usub, perm_u, xsup, grid + ); + + for (int_t jj = iukp; jj < iukp + nsupc; ++jj) + { + segsize = klst - usub[jj]; + if ( segsize ) ++temp_ncols; + } + /*here goes the condition wether jb block exists on Phi or not*/ + int_t u_blk_acc_cond = 0; + // if (j == jj0) u_blk_acc_cond = 1; /* must schedule first colum on cpu */ + if (iperm_c_supno[jb] < HyP->first_l_block_acc) + { + // printf("k=%d jb=%d got at condition-1:%d, %d \n",k,jb, iperm_c_supno[jb] , HyP->first_l_block_acc); + u_blk_acc_cond = 1; + } + // if jb is within lookahead window + if (myIperm[jb]< myIperm[k] + HyP->nCudaStreams && myIperm[jb]>0) + { + // printf("k=%d jb=%d got at condition-2:%d, %d\n ",k,jb, myIperm[jb] , myIperm[k]); + u_blk_acc_cond = 1; + } + + if (k <= HyP->nsupers - 2 && gEtreeInfo->setree[k] > 0 ) + { + int_t k_parent = gEtreeInfo->setree[k]; + if (jb == k_parent && gEtreeInfo->numChildLeft[k_parent]==1 ) + { + u_blk_acc_cond = 1; + // printf("k=%d jb=%d got at condition-3\n",k,jb); + u_blk_acc_cond = 1; + } + } + + + if (u_blk_acc_cond) + { + HyP->Ublock_info[HyP->num_u_blks].iukp = iukp; + HyP->Ublock_info[HyP->num_u_blks].rukp = rukp; + HyP->Ublock_info[HyP->num_u_blks].jb = jb; + + for (int_t jj = iukp; jj < iukp + nsupc; ++jj) + { + segsize = klst - usub[jj]; + if ( segsize ) + { + + if ( segsize != HyP->ldu ) full = 0; + if ( segsize > HyP->ldu ) HyP->ldu = segsize; + } + } + + HyP->Ublock_info[HyP->num_u_blks].ncols = temp_ncols; + // ncols += temp_ncols; + HyP->num_u_blks++; + } + else + { + HyP->Ublock_info_Phi[HyP->num_u_blks_Phi].iukp = iukp; + HyP->Ublock_info_Phi[HyP->num_u_blks_Phi].rukp = rukp; + HyP->Ublock_info_Phi[HyP->num_u_blks_Phi].jb = jb; + HyP->Ublock_info_Phi[HyP->num_u_blks_Phi].eo = HyP->nsupers - iperm_c_supno[jb]; /*since we want it to be in descending order*/ + + /* Prepare to call DGEMM. 
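+               ldu_Phi below accumulates the tallest nonzero segment over
+               the offloaded U blocks and becomes the leading dimension
+               of the packed bigU_Phi panel handed to dgather_u.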
*/ + + + for (int_t jj = iukp; jj < iukp + nsupc; ++jj) + { + segsize = klst - usub[jj]; + if ( segsize ) + { + + if ( segsize != HyP->ldu_Phi ) full_Phi = 0; + if ( segsize > HyP->ldu_Phi ) HyP->ldu_Phi = segsize; + } + } + + HyP->Ublock_info_Phi[HyP->num_u_blks_Phi].ncols = temp_ncols; + // ncols_Phi += temp_ncols; + HyP->num_u_blks_Phi++; + } + } + + /* Now doing prefix sum on on ncols*/ + HyP->Ublock_info[0].full_u_cols = HyP->Ublock_info[0 ].ncols; + for (int_t j = 1; j < HyP->num_u_blks; ++j) + { + HyP->Ublock_info[j].full_u_cols = HyP->Ublock_info[j ].ncols + HyP->Ublock_info[j - 1].full_u_cols; + } + + /*sorting u blocks based on elimination order */ + // sort_U_info_elm(HyP->Ublock_info_Phi,HyP->num_u_blks_Phi ); + HyP->Ublock_info_Phi[0].full_u_cols = HyP->Ublock_info_Phi[0 ].ncols; + for ( int_t j = 1; j < HyP->num_u_blks_Phi; ++j) + { + HyP->Ublock_info_Phi[j].full_u_cols = HyP->Ublock_info_Phi[j ].ncols + HyP->Ublock_info_Phi[j - 1].full_u_cols; + } + + HyP->bigU_Phi = bigU; + HyP->bigU_host = bigU + HyP->ldu_Phi * HyP->Ublock_info_Phi[HyP->num_u_blks_Phi - 1].full_u_cols; + + dgather_u(HyP->num_u_blks, HyP->Ublock_info, usub, uval, HyP->bigU_host, + HyP->ldu, xsup, klst ); + + dgather_u(HyP->num_u_blks_Phi, HyP->Ublock_info_Phi, usub, uval, + HyP->bigU_Phi, HyP->ldu_Phi, xsup, klst ); + +} /* dRgather_U */ diff --git a/SRC/dmemory_dist.c b/SRC/dmemory_dist.c index c81a63ca..e9319bdb 100644 --- a/SRC/dmemory_dist.c +++ b/SRC/dmemory_dist.c @@ -170,3 +170,122 @@ double *doubleCalloc_dist(int_t n) return (buf); } + +/*************************************** + * The following are from 3D code. + ***************************************/ + +double dgetLUMem(int_t nodeId, LUstruct_t *LUstruct, gridinfo3d_t *grid3d) +{ + + double memlu = 0.0; + gridinfo_t* grid = &(grid3d->grid2d); + LocalLU_t *Llu = LUstruct->Llu; + int_t* xsup = LUstruct->Glu_persist->xsup; + int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + // double** Unzval_br_ptr = Llu->Unzval_br_ptr; + int_t iam = grid->iam; + + int_t myrow = MYROW (iam, grid); + int_t mycol = MYCOL (iam, grid); + + + int_t pc = PCOL( nodeId, grid ); + if (mycol == pc) + { + int_t ljb = LBj( nodeId, grid ); /* Local block number */ + int_t *lsub; + double* lnzval; + lsub = Lrowind_bc_ptr[ljb]; + lnzval = Lnzval_bc_ptr[ljb]; + + if (lsub != NULL) + { + int_t nrbl = lsub[0]; /*number of L blocks */ + int_t len = lsub[1]; /* LDA of the nzval[] */ + int_t len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; + int_t len2 = SuperSize(nodeId) * len; + memlu += 1.0 * (len1 * sizeof(int_t) + len2 * sizeof(double)); + } + } + + int_t pr = PROW( nodeId, grid ); + if (myrow == pr) + { + int_t lib = LBi( nodeId, grid ); /* Local block number */ + int_t *usub; + // double* unzval; + usub = Ufstnz_br_ptr[lib]; + + + if (usub != NULL) + { + int_t lenv = usub[1]; + int_t lens = usub[2]; + memlu += 1.0 * (lenv * sizeof(int_t) + lens * sizeof(double)); + } + } + return memlu; +} + + +double dmemForest(sForest_t*sforest, LUstruct_t *LUstruct, gridinfo3d_t *grid3d) +{ + double memlu = 0; + + int_t *perm_c_supno = sforest->nodeList; + int_t nnodes = sforest->nNodes; + for (int i = 0; i < nnodes; ++i) + { + memlu += dgetLUMem(perm_c_supno[i], LUstruct, grid3d); + } + + return memlu; +} + +void d3D_printMemUse( trf3Dpartition_t* trf3Dpartition, LUstruct_t *LUstruct, + gridinfo3d_t * grid3d ) +{ + int_t* myTreeIdxs = trf3Dpartition->myTreeIdxs; + int_t* myZeroTrIdxs = 
trf3Dpartition->myZeroTrIdxs; + sForest_t** sForests = trf3Dpartition->sForests; + + double memNzLU = 0.0; + double memzLU = 0.0; + int_t maxLvl = log2i(grid3d->zscp.Np) + 1; + + for (int_t ilvl = 0; ilvl < maxLvl; ++ilvl) + { + sForest_t* sforest = sForests[myTreeIdxs[ilvl]]; + + if (sforest) + { + if (!myZeroTrIdxs[ilvl]) + { + memNzLU += dmemForest(sforest, LUstruct, grid3d); + } + else + { + memzLU += dmemForest(sforest, LUstruct, grid3d); + } + } + } + double sumMem = memNzLU + memzLU; + double maxMem, minMem, avgNzLU, avgzLU; + /*Now reduce it among all the procs*/ + MPI_Reduce(&sumMem, &maxMem, 1, MPI_DOUBLE, MPI_MAX, 0, grid3d->comm); + MPI_Reduce(&sumMem, &minMem, 1, MPI_DOUBLE, MPI_MIN, 0, grid3d->comm); + MPI_Reduce(&memNzLU, &avgNzLU, 1, MPI_DOUBLE, MPI_SUM, 0, grid3d->comm); + MPI_Reduce(&memzLU, &avgzLU, 1, MPI_DOUBLE, MPI_SUM, 0, grid3d->comm); + + int_t nProcs = grid3d->nprow * grid3d->npcol * grid3d->npdep; + if (!(grid3d->iam)) + { + /* code */ + printf("| Total Memory \t| %.2g \t| %.2g \t|%.2g \t|\n", (avgNzLU + avgzLU) / nProcs, maxMem, minMem ); + printf("| LU-LU(repli) \t| %.2g \t| %.2g \t|\n", (avgNzLU) / nProcs, avgzLU / nProcs ); + } +} + diff --git a/SRC/dtreeFactorization.c b/SRC/dtreeFactorization.c new file mode 100644 index 00000000..5aba253f --- /dev/null +++ b/SRC/dtreeFactorization.c @@ -0,0 +1,680 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +#include "superlu_ddefs.h" +#if 0 +#include "treeFactorization.h" +#include "trfCommWrapper.h" +#endif + +int_t dLluBufInit(LUValSubBuf_t* LUvsb, LUstruct_t *LUstruct) +{ + LocalLU_t *Llu = LUstruct->Llu; + LUvsb->Lsub_buf = intMalloc_dist(Llu->bufmax[0]); //INT_T_ALLOC(Llu->bufmax[0]); + LUvsb->Lval_buf = doubleMalloc_dist(Llu->bufmax[1]); //DOUBLE_ALLOC(Llu->bufmax[1]); + LUvsb->Usub_buf = intMalloc_dist(Llu->bufmax[2]); //INT_T_ALLOC(Llu->bufmax[2]); + LUvsb->Uval_buf = doubleMalloc_dist(Llu->bufmax[3]); //DOUBLE_ALLOC(Llu->bufmax[3]); + return 0; +} + +diagFactBufs_t** dinitDiagFactBufsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* grid) +{ + diagFactBufs_t** dFBufs; + dFBufs = (diagFactBufs_t** ) SUPERLU_MALLOC(mxLeafNode * sizeof(diagFactBufs_t*)); + for (int i = 0; i < mxLeafNode; ++i) + { + /* code */ + dFBufs[i] = (diagFactBufs_t* ) SUPERLU_MALLOC(sizeof(diagFactBufs_t)); + assert(dFBufs[i]); + dinitDiagFactBufs(ldt, dFBufs[i]); + + }/*Minor for loop -2 for (int i = 0; i < mxLeafNode; ++i)*/ + + return dFBufs; +} + +LUValSubBuf_t** dLluBufInitArr(int_t numLA, LUstruct_t *LUstruct) +{ + LUValSubBuf_t** LUvsbs = (LUValSubBuf_t**) SUPERLU_MALLOC(numLA * sizeof(LUValSubBuf_t*)); + for (int_t i = 0; i < numLA; ++i) + { + /* code */ + LUvsbs[i] = (LUValSubBuf_t*) SUPERLU_MALLOC(sizeof(LUValSubBuf_t)); + dLluBufInit(LUvsbs[i], LUstruct); + } /*minor for loop-3 for (int_t i = 0; i < numLA; ++i)*/ + + return LUvsbs; +} + + +int_t dinitScuBufs(int_t ldt, int_t num_threads, int_t nsupers, + scuBufs_t* scuBufs, + LUstruct_t* LUstruct, + gridinfo_t * grid) +{ + scuBufs->bigV = dgetBigV(ldt, num_threads); + scuBufs->bigU = dgetBigU(nsupers, grid, LUstruct); + return 0; +} +int_t dinitDiagFactBufs(int_t ldt, diagFactBufs_t* dFBuf) +{ + dFBuf->BlockUFactor = doubleMalloc_dist(ldt * ldt); //DOUBLE_ALLOC( ldt * ldt); + dFBuf->BlockLFactor 
= doubleMalloc_dist(ldt * ldt); //DOUBLE_ALLOC( ldt * ldt); + return 0; +} + +int_t ddenseTreeFactor( + int_t nnodes, // number of nodes in the tree + int_t *perm_c_supno, // list of nodes in the order of factorization + commRequests_t *comReqs, // lists of communication requests + scuBufs_t *scuBufs, // contains buffers for schur complement update + packLUInfo_t*packLUInfo, + msgs_t*msgs, + LUValSubBuf_t*LUvsb, + diagFactBufs_t *dFBuf, + factStat_t *factStat, + factNodelists_t *fNlists, + superlu_dist_options_t *options, + int_t * gIperm_c_supno, + int_t ldt, + LUstruct_t *LUstruct, gridinfo3d_t * grid3d, SuperLUStat_t *stat, + double thresh, SCT_t *SCT, int tag_ub, + int *info +) +{ + gridinfo_t* grid = &(grid3d->grid2d); + LocalLU_t *Llu = LUstruct->Llu; + + /*main loop over all the super nodes*/ + for (int_t k0 = 0; k0 < nnodes ; ++k0) + { + int_t k = perm_c_supno[k0]; // direct computation no perm_c_supno + + /* diagonal factorization */ +#if 0 + sDiagFactIBCast(k, dFBuf, factStat, comReqs, grid, + options, thresh, LUstruct, stat, info, SCT, tag_ub); +#else + dDiagFactIBCast(k, k, dFBuf->BlockUFactor, dFBuf->BlockLFactor, + factStat->IrecvPlcd_D, + comReqs->U_diag_blk_recv_req, + comReqs->L_diag_blk_recv_req, + comReqs->U_diag_blk_send_req, + comReqs->L_diag_blk_send_req, + grid, options, thresh, LUstruct, stat, info, SCT, tag_ub); +#endif + +#if 0 + /*L update */ + sLPanelUpdate(k, dFBuf, factStat, comReqs, grid, LUstruct, SCT); + /*L Ibcast*/ + sIBcastRecvLPanel( k, comReqs, LUvsb, msgs, factStat, grid, LUstruct, SCT, tag_ub ); + /*U update*/ + sUPanelUpdate(k, ldt, dFBuf, factStat, comReqs, scuBufs, + packLUInfo, grid, LUstruct, stat, SCT); + /*U bcast*/ + sIBcastRecvUPanel( k, comReqs, LUvsb, msgs, factStat, grid, LUstruct, SCT, tag_ub ); + /*Wait for L panel*/ + sWaitL(k, comReqs, msgs, grid, LUstruct, SCT); + /*Wait for U panel*/ + sWaitU(k, comReqs, msgs, grid, LUstruct, SCT); +#else + /*L update */ + dLPanelUpdate(k, factStat->IrecvPlcd_D, factStat->factored_L, + comReqs->U_diag_blk_recv_req, dFBuf->BlockUFactor, grid, LUstruct, SCT); + /*L Ibcast*/ + dIBcastRecvLPanel(k, k, msgs->msgcnt, comReqs->send_req, comReqs->recv_req, + LUvsb->Lsub_buf, LUvsb->Lval_buf, factStat->factored, + grid, LUstruct, SCT, tag_ub); + /*U update*/ + dUPanelUpdate(k, factStat->factored_U, comReqs->L_diag_blk_recv_req, + dFBuf->BlockLFactor, scuBufs->bigV, ldt, + packLUInfo->Ublock_info, grid, LUstruct, stat, SCT); + /*U bcast*/ + dIBcastRecvUPanel(k, k, msgs->msgcnt, comReqs->send_requ, comReqs->recv_requ, + LUvsb->Usub_buf, LUvsb->Uval_buf, + grid, LUstruct, SCT, tag_ub); + dWaitL(k, msgs->msgcnt, msgs->msgcntU, comReqs->send_req, comReqs->recv_req, + grid, LUstruct, SCT); + dWaitU(k, msgs->msgcnt, comReqs->send_requ, comReqs->recv_requ, grid, LUstruct, SCT); +#endif + double tsch = SuperLU_timer_(); +#if 0 + int_t LU_nonempty = sSchurComplementSetup(k, + msgs, packLUInfo, gIperm_c_supno, perm_c_supno, + fNlists, scuBufs, LUvsb, grid, LUstruct); +#else + int_t LU_nonempty= dSchurComplementSetup(k, msgs->msgcnt, + packLUInfo->Ublock_info, packLUInfo->Remain_info, + packLUInfo->uPanelInfo, packLUInfo->lPanelInfo, + gIperm_c_supno, fNlists->iperm_u, fNlists->perm_u, + scuBufs->bigU, LUvsb->Lsub_buf, LUvsb->Lval_buf, + LUvsb->Usub_buf, LUvsb->Uval_buf, + grid, LUstruct); +#endif + if (LU_nonempty) + { + Ublock_info_t* Ublock_info = packLUInfo->Ublock_info; + Remain_info_t* Remain_info = packLUInfo->Remain_info; + uPanelInfo_t* uPanelInfo = packLUInfo->uPanelInfo; + lPanelInfo_t* lPanelInfo = 
packLUInfo->lPanelInfo; + int_t* indirect = fNlists->indirect; + int_t* indirect2 = fNlists->indirect2; + /*Schurcomplement Update*/ + int_t nub = uPanelInfo->nub; + int_t nlb = lPanelInfo->nlb; + double* bigV = scuBufs->bigV; + double* bigU = scuBufs->bigU; + +#pragma omp parallel for schedule(dynamic) + for (int_t ij = 0; ij < nub * nlb; ++ij) + { + /* code */ + int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + int_t** Ufstnz_br_ptr = LUstruct->Llu->Ufstnz_br_ptr; + double** Unzval_br_ptr = LUstruct->Llu->Unzval_br_ptr; + int_t* xsup = LUstruct->Glu_persist->xsup; + int_t ub = ij / nlb; + int_t lb + = ij % nlb; + double *L_mat = lPanelInfo->lusup; + int_t ldl = lPanelInfo->nsupr; + int_t luptr0 = lPanelInfo->luptr0; + double *U_mat = bigU; + int_t ldu = uPanelInfo->ldu; + int_t knsupc = SuperSize(k); + int_t klst = FstBlockC (k + 1); + int_t *lsub = lPanelInfo->lsub; + int_t *usub = uPanelInfo->usub; + int_t thread_id = omp_get_thread_num(); + block_gemm_scatter( lb, ub, + Ublock_info, + Remain_info, + &L_mat[luptr0], ldl, + U_mat, ldu, + bigV, + knsupc, klst, + lsub, usub, ldt, + thread_id, indirect, indirect2, + Lrowind_bc_ptr, Lnzval_bc_ptr, + Ufstnz_br_ptr, Unzval_br_ptr, + xsup, grid, stat +#ifdef SCATTER_PROFILE + , Host_TheadScatterMOP, Host_TheadScatterTimer +#endif + ); + } /*for (int_t ij = 0; ij < nub * nlb;*/ + } /*if (LU_nonempty)*/ + SCT->NetSchurUpTimer += SuperLU_timer_() - tsch; +#if 0 + sWait_LUDiagSend(k, comReqs, grid, SCT); +#else + Wait_LUDiagSend(k, comReqs->U_diag_blk_send_req, comReqs->L_diag_blk_send_req, + grid, SCT); +#endif + }/*for main loop (int_t k0 = 0; k0 < gNodeCount[tree]; ++k0)*/ + + return 0; +} /* ddenseTreeFactor */ + +/* + * 2D factorization at individual subtree. 
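+ * Supernodes are processed level by level in the subtree's topological
+ * order (eTreeTopLims); diagonal factorizations, panel solves, panel
+ * Ibcasts, and Schur-complement updates of different supernodes are
+ * overlapped through a look-ahead window of numLA panels.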
+ */ +int_t dsparseTreeFactor_ASYNC( + sForest_t* sforest, + commRequests_t **comReqss, // lists of communication requests // size maxEtree level + scuBufs_t *scuBufs, // contains buffers for schur complement update + packLUInfo_t*packLUInfo, + msgs_t**msgss, // size=num Look ahead + LUValSubBuf_t**LUvsbs, // size=num Look ahead + diagFactBufs_t **dFBufs, // size maxEtree level + factStat_t *factStat, + factNodelists_t *fNlists, + gEtreeInfo_t* gEtreeInfo, // global etree info + superlu_dist_options_t *options, + int_t * gIperm_c_supno, + int_t ldt, + HyP_t* HyP, + LUstruct_t *LUstruct, gridinfo3d_t * grid3d, SuperLUStat_t *stat, + double thresh, SCT_t *SCT, int tag_ub, + int *info +) +{ + int_t nnodes = sforest->nNodes ; // number of nodes in the tree + if (nnodes < 1) + { + return 1; + } + + int_t *perm_c_supno = sforest->nodeList ; // list of nodes in the order of factorization + treeTopoInfo_t* treeTopoInfo = &sforest->topoInfo; + int_t* myIperm = treeTopoInfo->myIperm; + + gridinfo_t* grid = &(grid3d->grid2d); + /*main loop over all the levels*/ + + int_t maxTopoLevel = treeTopoInfo->numLvl; + int_t* eTreeTopLims = treeTopoInfo->eTreeTopLims; + int_t * IrecvPlcd_D = factStat->IrecvPlcd_D; + int_t* factored_D = factStat->factored_D; + int_t * factored_L = factStat->factored_L; + int_t * factored_U = factStat->factored_U; + int_t* IbcastPanel_L = factStat->IbcastPanel_L; + int_t* IbcastPanel_U = factStat->IbcastPanel_U; + int_t* xsup = LUstruct->Glu_persist->xsup; + + int_t numLAMax = getNumLookAhead(options); + int_t numLA = numLAMax; + + for (int_t k0 = 0; k0 < eTreeTopLims[1]; ++k0) + { + int_t k = perm_c_supno[k0]; // direct computation no perm_c_supno + int_t offset = k0; + /* k-th diagonal factorization */ + /*Now factor and broadcast diagonal block*/ +#if 0 + sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid, + options, thresh, LUstruct, stat, info, SCT, tag_ub); +#else + dDiagFactIBCast(k, k, dFBufs[offset]->BlockUFactor, dFBufs[offset]->BlockLFactor, + factStat->IrecvPlcd_D, + comReqss[offset]->U_diag_blk_recv_req, + comReqss[offset]->L_diag_blk_recv_req, + comReqss[offset]->U_diag_blk_send_req, + comReqss[offset]->L_diag_blk_send_req, + grid, options, thresh, LUstruct, stat, info, SCT, tag_ub); +#endif + factored_D[k] = 1; + } + + for (int_t topoLvl = 0; topoLvl < maxTopoLevel; ++topoLvl) + { + /* code */ + int_t k_st = eTreeTopLims[topoLvl]; + int_t k_end = eTreeTopLims[topoLvl + 1]; + for (int_t k0 = k_st; k0 < k_end; ++k0) + { + int_t k = perm_c_supno[k0]; // direct computation no perm_c_supno + int_t offset = k0 - k_st; + /* diagonal factorization */ + if (!factored_D[k] ) + { + /*If LU panels from GPU are not reduced then reduce + them before diagonal factorization*/ +#if 0 + sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid, + options, thresh, LUstruct, stat, info, SCT, tag_ub); +#else + dDiagFactIBCast(k, k, dFBufs[offset]->BlockUFactor, + dFBufs[offset]->BlockLFactor, factStat->IrecvPlcd_D, + comReqss[offset]->U_diag_blk_recv_req, + comReqss[offset]->L_diag_blk_recv_req, + comReqss[offset]->U_diag_blk_send_req, + comReqss[offset]->L_diag_blk_send_req, + grid, options, thresh, LUstruct, stat, info, SCT, tag_ub); +#endif + } + } + double t_apt = SuperLU_timer_(); + + for (int_t k0 = k_st; k0 < k_end; ++k0) + { + int_t k = perm_c_supno[k0]; // direct computation no perm_c_supno + int_t offset = k0 - k_st; + + /*L update */ + if (factored_L[k] == 0) + { +#if 0 + sLPanelUpdate(k, dFBufs[offset], factStat, comReqss[offset], + grid, 
LUstruct, SCT); +#else + dLPanelUpdate(k, factStat->IrecvPlcd_D, factStat->factored_L, + comReqss[offset]->U_diag_blk_recv_req, + dFBufs[offset]->BlockUFactor, grid, LUstruct, SCT); +#endif + factored_L[k] = 1; + } + /*U update*/ + if (factored_U[k] == 0) + { +#if 0 + sUPanelUpdate(k, ldt, dFBufs[offset], factStat, comReqss[offset], + scuBufs, packLUInfo, grid, LUstruct, stat, SCT); +#else + dUPanelUpdate(k, factStat->factored_U, comReqss[offset]->L_diag_blk_recv_req, + dFBufs[offset]->BlockLFactor, scuBufs->bigV, ldt, + packLUInfo->Ublock_info, grid, LUstruct, stat, SCT); +#endif + factored_U[k] = 1; + } + } + + for (int_t k0 = k_st; k0 < SUPERLU_MIN(k_end, k_st + numLA); ++k0) + { + int_t k = perm_c_supno[k0]; // direct computation no perm_c_supno + int_t offset = k0 % numLA; + /* diagonal factorization */ + + /*L Ibcast*/ + if (IbcastPanel_L[k] == 0) + { +#if 0 + sIBcastRecvLPanel( k, comReqss[offset], LUvsbs[offset], + msgss[offset], factStat, grid, LUstruct, SCT, tag_ub ); +#else + dIBcastRecvLPanel(k, k, msgss[offset]->msgcnt, comReqss[offset]->send_req, + comReqss[offset]->recv_req, LUvsbs[offset]->Lsub_buf, + LUvsbs[offset]->Lval_buf, factStat->factored, + grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_L[k] = 1; /*for consistancy; unused later*/ + } + + /*U Ibcast*/ + if (IbcastPanel_U[k] == 0) + { +#if 0 + sIBcastRecvUPanel( k, comReqss[offset], LUvsbs[offset], + msgss[offset], factStat, grid, LUstruct, SCT, tag_ub ); +#else + dIBcastRecvUPanel(k, k, msgss[offset]->msgcnt, comReqss[offset]->send_requ, + comReqss[offset]->recv_requ, LUvsbs[offset]->Usub_buf, + LUvsbs[offset]->Uval_buf, grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_U[k] = 1; + } + } + + // if (topoLvl) SCT->tAsyncPipeTail += SuperLU_timer_() - t_apt; + SCT->tAsyncPipeTail += SuperLU_timer_() - t_apt; + + for (int_t k0 = k_st; k0 < k_end; ++k0) + { + int_t k = perm_c_supno[k0]; // direct computation no perm_c_supno + int_t offset = k0 % numLA; + +#if 0 + sWaitL(k, comReqss[offset], msgss[offset], grid, LUstruct, SCT); + /*Wait for U panel*/ + sWaitU(k, comReqss[offset], msgss[offset], grid, LUstruct, SCT); +#else + dWaitL(k, msgss[offset]->msgcnt, msgss[offset]->msgcntU, + comReqss[offset]->send_req, comReqss[offset]->recv_req, + grid, LUstruct, SCT); + dWaitU(k, msgss[offset]->msgcnt, comReqss[offset]->send_requ, + comReqss[offset]->recv_requ, grid, LUstruct, SCT); +#endif + double tsch = SuperLU_timer_(); + int_t LU_nonempty = dSchurComplementSetupGPU(k, + msgss[offset], packLUInfo, + myIperm, gIperm_c_supno, + perm_c_supno, gEtreeInfo, + fNlists, scuBufs, + LUvsbs[offset], + grid, LUstruct, HyP); + // initializing D2H data transfer + int_t jj_cpu = 0; + + scuStatUpdate( SuperSize(k), HyP, SCT, stat); + uPanelInfo_t* uPanelInfo = packLUInfo->uPanelInfo; + lPanelInfo_t* lPanelInfo = packLUInfo->lPanelInfo; + int_t *lsub = lPanelInfo->lsub; + int_t *usub = uPanelInfo->usub; + int_t* indirect = fNlists->indirect; + int_t* indirect2 = fNlists->indirect2; + + /*Schurcomplement Update*/ + + int_t knsupc = SuperSize(k); + int_t klst = FstBlockC (k + 1); + + double* bigV = scuBufs->bigV; + +#pragma omp parallel + { +#pragma omp for schedule(dynamic,2) nowait + /* Each thread is assigned one loop index ij, responsible for + block update L(lb,k) * U(k,j) -> tempv[]. */ + for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks; ++ij) + { + /* Get the entire area of L (look-ahead) X U (all-blocks). */ + /* for each j-block in U, go through all L-blocks in the + look-ahead window. 
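+                       The four block_gemm_scatter* loops tile the Schur
+                       complement into (look-ahead | remaining) L rows
+                       crossed with (host | offloaded) U blocks, so the
+                       four GEMM-scatter streams can proceed independently.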
*/ + int_t j = ij / HyP->lookAheadBlk; + + int_t lb = ij % HyP->lookAheadBlk; + block_gemm_scatterTopLeft( lb, j, bigV, knsupc, klst, lsub, + usub, ldt, indirect, indirect2, HyP, + LUstruct, grid, SCT, stat ); + } + +#pragma omp for schedule(dynamic,2) nowait + for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks_Phi; ++ij) + { + int_t j = ij / HyP->lookAheadBlk ; + int_t lb = ij % HyP->lookAheadBlk; + block_gemm_scatterTopRight( lb, j, bigV, knsupc, klst, lsub, + usub, ldt, indirect, indirect2, HyP, + LUstruct, grid, SCT, stat); + } + +#pragma omp for schedule(dynamic,2) nowait + for (int_t ij = 0; ij < HyP->RemainBlk * HyP->num_u_blks; ++ij) // + { + int_t j = ij / HyP->RemainBlk; + int_t lb = ij % HyP->RemainBlk; + block_gemm_scatterBottomLeft( lb, j, bigV, knsupc, klst, lsub, + usub, ldt, indirect, indirect2, + HyP, LUstruct, grid, SCT, stat); + } /*for (int_t ij =*/ + } + + if (topoLvl < maxTopoLevel - 1) + { + int_t k_parent = gEtreeInfo->setree[k]; + gEtreeInfo->numChildLeft[k_parent]--; + if (gEtreeInfo->numChildLeft[k_parent] == 0) + { + int_t k0_parent = myIperm[k_parent]; + if (k0_parent > 0) + { + /* code */ + assert(k0_parent < nnodes); + int_t offset = k0_parent - k_end; +#if 0 + sDiagFactIBCast(k_parent, dFBufs[offset], factStat, + comReqss[offset], grid, options, thresh, + LUstruct, stat, info, SCT, tag_ub); +#else + dDiagFactIBCast(k_parent, k_parent, dFBufs[offset]->BlockUFactor, + dFBufs[offset]->BlockLFactor, factStat->IrecvPlcd_D, + comReqss[offset]->U_diag_blk_recv_req, + comReqss[offset]->L_diag_blk_recv_req, + comReqss[offset]->U_diag_blk_send_req, + comReqss[offset]->L_diag_blk_send_req, + grid, options, thresh, LUstruct, stat, info, SCT, tag_ub); +#endif + factored_D[k_parent] = 1; + } + + } + } + +#pragma omp parallel + { +#pragma omp for schedule(dynamic,2) nowait + for (int_t ij = 0; ij < HyP->RemainBlk * (HyP->num_u_blks_Phi - jj_cpu) ; ++ij) + { + int_t j = ij / HyP->RemainBlk + jj_cpu; + int_t lb = ij % HyP->RemainBlk; + block_gemm_scatterBottomRight( lb, j, bigV, knsupc, klst, lsub, + usub, ldt, indirect, indirect2, + HyP, LUstruct, grid, SCT, stat); + } /*for (int_t ij =*/ + + } + + SCT->NetSchurUpTimer += SuperLU_timer_() - tsch; + // finish waiting for diag block send + int_t abs_offset = k0 - k_st; +#if 0 + sWait_LUDiagSend(k, comReqss[abs_offset], grid, SCT); +#else + Wait_LUDiagSend(k, comReqss[abs_offset]->U_diag_blk_send_req, + comReqss[abs_offset]->L_diag_blk_send_req, + grid, SCT); +#endif + /*Schedule next I bcasts*/ + for (int_t next_k0 = k0 + 1; next_k0 < SUPERLU_MIN( k0 + 1 + numLA, nnodes); ++next_k0) + { + /* code */ + int_t next_k = perm_c_supno[next_k0]; + int_t offset = next_k0 % numLA; + + /*L Ibcast*/ + if (IbcastPanel_L[next_k] == 0 && factored_L[next_k]) + { +#if 0 + sIBcastRecvLPanel( next_k, comReqss[offset], + LUvsbs[offset], msgss[offset], factStat, + grid, LUstruct, SCT, tag_ub ); +#else + dIBcastRecvLPanel(next_k, next_k, msgss[offset]->msgcnt, + comReqss[offset]->send_req, comReqss[offset]->recv_req, + LUvsbs[offset]->Lsub_buf, LUvsbs[offset]->Lval_buf, + factStat->factored, grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_L[next_k] = 1; /*will be used later*/ + } + /*U Ibcast*/ + if (IbcastPanel_U[next_k] == 0 && factored_U[next_k]) + { +#if 0 + sIBcastRecvUPanel( next_k, comReqss[offset], + LUvsbs[offset], msgss[offset], factStat, + grid, LUstruct, SCT, tag_ub ); +#else + dIBcastRecvUPanel(next_k, next_k, msgss[offset]->msgcnt, + comReqss[offset]->send_requ, comReqss[offset]->recv_requ, + LUvsbs[offset]->Usub_buf, 
LUvsbs[offset]->Uval_buf, + grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_U[next_k] = 1; + } + } + + if (topoLvl < maxTopoLevel - 1) + { + + /*look ahead LU factorization*/ + int_t kx_st = eTreeTopLims[topoLvl + 1]; + int_t kx_end = eTreeTopLims[topoLvl + 2]; + for (int_t k0x = kx_st; k0x < kx_end; k0x++) + { + /* code */ + int_t kx = perm_c_supno[k0x]; + int_t offset = k0x - kx_st; + if (IrecvPlcd_D[kx] && !factored_L[kx]) + { + /*check if received*/ + int_t recvUDiag = checkRecvUDiag(kx, comReqss[offset], + grid, SCT); + if (recvUDiag) + { +#if 0 + sLPanelTrSolve( kx, dFBufs[offset], + factStat, comReqss[offset], + grid, LUstruct, SCT); +#else + dLPanelTrSolve( kx, factStat->factored_L, + dFBufs[offset]->BlockUFactor, grid, LUstruct); +#endif + + factored_L[kx] = 1; + + /*check if an L_Ibcast is possible*/ + + if (IbcastPanel_L[kx] == 0 && + k0x - k0 < numLA + 1 && // is within lookahead window + factored_L[kx]) + { + int_t offset1 = k0x % numLA; +#if 0 + sIBcastRecvLPanel( kx, comReqss[offset1], LUvsbs[offset1], + msgss[offset1], factStat, + grid, LUstruct, SCT, tag_ub); +#else + dIBcastRecvLPanel(kx, kx, msgss[offset1]->msgcnt, + comReqss[offset1]->send_req, + comReqss[offset1]->recv_req, + LUvsbs[offset1]->Lsub_buf, + LUvsbs[offset1]->Lval_buf, + factStat->factored, + grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_L[kx] = 1; /*will be used later*/ + } + + } + } + + if (IrecvPlcd_D[kx] && !factored_U[kx]) + { + /*check if received*/ + int_t recvLDiag = checkRecvLDiag( kx, comReqss[offset], + grid, SCT); + if (recvLDiag) + { +#if 0 + sUPanelTrSolve( kx, ldt, dFBufs[offset], scuBufs, packLUInfo, + grid, LUstruct, stat, SCT); +#else + dUPanelTrSolve( kx, dFBufs[offset]->BlockLFactor, + scuBufs->bigV, + ldt, packLUInfo->Ublock_info, + grid, LUstruct, stat, SCT); +#endif + factored_U[kx] = 1; + /*check if an L_Ibcast is possible*/ + + if (IbcastPanel_U[kx] == 0 && + k0x - k0 < numLA + 1 && // is within lookahead window + factored_U[kx]) + { + int_t offset = k0x % numLA; +#if 0 + sIBcastRecvUPanel( kx, comReqss[offset], + LUvsbs[offset], + msgss[offset], factStat, + grid, LUstruct, SCT, tag_ub); +#else + dIBcastRecvUPanel(kx, kx, msgss[offset]->msgcnt, + comReqss[offset]->send_requ, + comReqss[offset]->recv_requ, + LUvsbs[offset]->Usub_buf, + LUvsbs[offset]->Uval_buf, + grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_U[kx] = 1; /*will be used later*/ + } + } + } + } + + } + }/*for main loop (int_t k0 = 0; k0 < gNodeCount[tree]; ++k0)*/ + + } + return 0; +} /* dsparseTreeFactor_ASYNC */ diff --git a/SRC/dtrfAux.c b/SRC/dtrfAux.c new file mode 100644 index 00000000..614642e3 --- /dev/null +++ b/SRC/dtrfAux.c @@ -0,0 +1,651 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. 
+*/ + + +#include "superlu_ddefs.h" + +#if 0 +#include "pdgstrf3d.h" +#include "trfAux.h" +#endif + +/*init3DLUstruct with forest interface */ +void dinit3DLUstructForest( int_t* myTreeIdxs, int_t* myZeroTrIdxs, + sForest_t** sForests, LUstruct_t* LUstruct, + gridinfo3d_t* grid3d) +{ + int_t maxLvl = log2i(grid3d->zscp.Np) + 1; + int_t numForests = (1 << maxLvl) - 1; + int_t* gNodeCount = INT_T_ALLOC (numForests); + int_t** gNodeLists = (int_t**) SUPERLU_MALLOC(numForests * sizeof(int_t*)); + + for (int i = 0; i < numForests; ++i) + { + gNodeCount[i] = 0; + gNodeLists[i] = NULL; + /* code */ + if (sForests[i]) + { + gNodeCount[i] = sForests[i]->nNodes; + gNodeLists[i] = sForests[i]->nodeList; + } + } + + /*call the old forest*/ + dinit3DLUstruct( myTreeIdxs, myZeroTrIdxs, + gNodeCount, gNodeLists, LUstruct, grid3d); + + SUPERLU_FREE(gNodeCount); + SUPERLU_FREE(gNodeLists); +} + +int_t dSchurComplementSetup( + int_t k, + int *msgcnt, + Ublock_info_t* Ublock_info, + Remain_info_t* Remain_info, + uPanelInfo_t *uPanelInfo, + lPanelInfo_t *lPanelInfo, + int_t* iperm_c_supno, + int_t * iperm_u, + int_t * perm_u, + double *bigU, + int_t* Lsub_buf, + double *Lval_buf, + int_t* Usub_buf, + double *Uval_buf, + gridinfo_t *grid, + LUstruct_t *LUstruct +) +{ + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + LocalLU_t *Llu = LUstruct->Llu; + int_t* xsup = Glu_persist->xsup; + + int_t* ToRecv = Llu->ToRecv; + int_t iam = grid->iam; + + int_t myrow = MYROW (iam, grid); + int_t mycol = MYCOL (iam, grid); + + int_t krow = PROW (k, grid); + int_t kcol = PCOL (k, grid); + int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + + int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + double** Unzval_br_ptr = Llu->Unzval_br_ptr; + + int_t *usub; + double* uval; + int_t* lsub; + double* lusup; + + if (mycol == kcol) + { + /*send the L panel to myrow*/ + int_t lk = LBj (k, grid); /* Local block number. */ + lsub = Lrowind_bc_ptr[lk]; + lPanelInfo->lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + lPanelInfo->lusup = Lnzval_bc_ptr[lk]; + } + else + { + lsub = Lsub_buf; + lPanelInfo->lsub = Lsub_buf; + lusup = Lval_buf; + lPanelInfo->lusup = Lval_buf; + } + + if (myrow == krow) + { + int_t lk = LBi (k, grid); + usub = Ufstnz_br_ptr[lk]; + uval = Unzval_br_ptr[lk]; + uPanelInfo->usub = usub; + } + else + { + if (ToRecv[k] == 2) + { + usub = Usub_buf; + uval = Uval_buf; + uPanelInfo->usub = usub; + } + } + + /*now each procs does the schurcomplement update*/ + int_t msg0 = msgcnt[0]; + int_t msg2 = msgcnt[2]; + int_t knsupc = SuperSize (k); + + int_t lptr0, luptr0; + int_t LU_nonempty = msg0 && msg2; + if (LU_nonempty == 0) return 0; + if (msg0 && msg2) /* L(:,k) and U(k,:) are not empty. */ + { + lPanelInfo->nsupr = lsub[1]; + int_t nlb; + if (myrow == krow) /* Skip diagonal block L(k,k). */ + { + lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER + 1]; + luptr0 = knsupc; + nlb = lsub[0] - 1; + lPanelInfo->nlb = nlb; + } + else + { + lptr0 = BC_HEADER; + luptr0 = 0; + nlb = lsub[0]; + lPanelInfo->nlb = nlb; + } + int_t iukp = BR_HEADER; /* Skip header; Pointer to index[] of U(k,:) */ + int_t rukp = 0; /* Pointer to nzval[] of U(k,:) */ + int_t nub = usub[0]; /* Number of blocks in the block row U(k,:) */ + int_t klst = FstBlockC (k + 1); + uPanelInfo->klst = klst; + + /* -------------------------------------------------------------- + Update the look-ahead block columns A(:,k+1:k+num_look_ahead). 
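+           U(k,:) is first permuted bottom-up by elimination order
+           (iperm_u/perm_u), then its nonzero column segments are packed
+           into bigU with leading dimension ldu before the GEMM calls.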
+ -------------------------------------------------------------- */ + int_t iukp0 = iukp; + int_t rukp0 = rukp; + + /* reorder the remaining columns in bottom-up */ + for (int_t jj = 0; jj < nub; jj++) + { +#ifdef ISORT + iperm_u[jj] = iperm_c_supno[usub[iukp]]; /* Global block number of block U(k,j). */ + perm_u[jj] = jj; +#else + perm_u[2 * jj] = iperm_c_supno[usub[iukp]]; /* Global block number of block U(k,j). */ + perm_u[2 * jj + 1] = jj; +#endif + int_t jb = usub[iukp]; /* Global block number of block U(k,j). */ + int_t nsupc = SuperSize (jb); + iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */ + iukp += nsupc; + } + iukp = iukp0; +#ifdef ISORT + isort (nub, iperm_u, perm_u); +#else + qsort (perm_u, (size_t) nub, 2 * sizeof (int_t), + &superlu_sort_perm); +#endif + // j = jj0 = 0; + + int_t ldu = 0; + int_t full = 1; + int_t num_u_blks = 0; + + for (int_t j = 0; j < nub ; ++j) + { + int_t iukp, temp_ncols; + + temp_ncols = 0; + int_t rukp, jb, ljb, nsupc, segsize; + arrive_at_ublock( + j, &iukp, &rukp, &jb, &ljb, &nsupc, + iukp0, rukp0, usub, perm_u, xsup, grid + ); + + int_t jj = iukp; + for (; jj < iukp + nsupc; ++jj) + { + segsize = klst - usub[jj]; + if ( segsize ) ++temp_ncols; + } + Ublock_info[num_u_blks].iukp = iukp; + Ublock_info[num_u_blks].rukp = rukp; + Ublock_info[num_u_blks].jb = jb; + Ublock_info[num_u_blks].eo = iperm_c_supno[jb]; + /* Prepare to call DGEMM. */ + jj = iukp; + + for (; jj < iukp + nsupc; ++jj) + { + segsize = klst - usub[jj]; + if ( segsize ) + { + if ( segsize != ldu ) full = 0; + if ( segsize > ldu ) ldu = segsize; + } + } + + Ublock_info[num_u_blks].ncols = temp_ncols; + // ncols += temp_ncols; + num_u_blks++; + + } + + uPanelInfo->ldu = ldu; + uPanelInfo->nub = num_u_blks; + + Ublock_info[0].full_u_cols = Ublock_info[0 ].ncols; + Ublock_info[0].StCol = 0; + for ( int_t j = 1; j < num_u_blks; ++j) + { + Ublock_info[j].full_u_cols = Ublock_info[j ].ncols + Ublock_info[j - 1].full_u_cols; + Ublock_info[j].StCol = Ublock_info[j - 1].StCol + Ublock_info[j - 1].ncols; + } + + dgather_u(num_u_blks, Ublock_info, usub, uval, bigU, ldu, xsup, klst ); + + sort_U_info_elm(Ublock_info, num_u_blks ); + + int_t cum_nrow = 0; + int_t RemainBlk = 0; + + int_t lptr = lptr0; + int_t luptr = luptr0; + for (int_t i = 0; i < nlb; ++i) + { + int_t ib = lsub[lptr]; /* Row block L(i,k). */ + int_t temp_nbrow = lsub[lptr + 1]; /* Number of full rows. */ + + Remain_info[RemainBlk].nrows = temp_nbrow; + Remain_info[RemainBlk].StRow = cum_nrow; + Remain_info[RemainBlk].FullRow = cum_nrow; + Remain_info[RemainBlk].lptr = lptr; + Remain_info[RemainBlk].ib = ib; + Remain_info[RemainBlk].eo = iperm_c_supno[ib]; + RemainBlk++; + + cum_nrow += temp_nbrow; + lptr += LB_DESCRIPTOR; /* Skip descriptor. */ + lptr += temp_nbrow; + luptr += temp_nbrow; + } + + lptr = lptr0; + luptr = luptr0; + sort_R_info_elm( Remain_info, lPanelInfo->nlb ); + lPanelInfo->luptr0 = luptr0; + } + return LU_nonempty; +} /* dSchurComplementSetup */ + +/* + * Gather L and U panels into respective buffers, to prepare for GEMM call. + * Divide Schur complement update into two parts: CPU vs. GPU. 
+ */ +int_t dSchurComplementSetupGPU( + int_t k, msgs_t* msgs, + packLUInfo_t* packLUInfo, + int_t* myIperm, + int_t* iperm_c_supno, int_t*perm_c_supno, + gEtreeInfo_t* gEtreeInfo, factNodelists_t* fNlists, + scuBufs_t* scuBufs, LUValSubBuf_t* LUvsb, + gridinfo_t *grid, LUstruct_t *LUstruct, + HyP_t* HyP) +{ + int_t * Lsub_buf = LUvsb->Lsub_buf; + double * Lval_buf = LUvsb->Lval_buf; + int_t * Usub_buf = LUvsb->Usub_buf; + double * Uval_buf = LUvsb->Uval_buf; + uPanelInfo_t* uPanelInfo = packLUInfo->uPanelInfo; + lPanelInfo_t* lPanelInfo = packLUInfo->lPanelInfo; + int* msgcnt = msgs->msgcnt; + int_t* iperm_u = fNlists->iperm_u; + int_t* perm_u = fNlists->perm_u; + double* bigU = scuBufs->bigU; + + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + LocalLU_t *Llu = LUstruct->Llu; + int_t* xsup = Glu_persist->xsup; + + int_t* ToRecv = Llu->ToRecv; + int_t iam = grid->iam; + + int_t myrow = MYROW (iam, grid); + int_t mycol = MYCOL (iam, grid); + + int_t krow = PROW (k, grid); + int_t kcol = PCOL (k, grid); + int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + + int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + double** Unzval_br_ptr = Llu->Unzval_br_ptr; + + int_t *usub; + double* uval; + int_t* lsub; + double* lusup; + + HyP->lookAheadBlk = 0, HyP->RemainBlk = 0; + HyP->Lnbrow =0, HyP->Rnbrow=0; + HyP->num_u_blks_Phi=0; + HyP->num_u_blks=0; + + if (mycol == kcol) + { + /*send the L panel to myrow*/ + int_t lk = LBj (k, grid); /* Local block number. */ + lsub = Lrowind_bc_ptr[lk]; + lPanelInfo->lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + lPanelInfo->lusup = Lnzval_bc_ptr[lk]; + } + else + { + lsub = Lsub_buf; + lPanelInfo->lsub = Lsub_buf; + lusup = Lval_buf; + lPanelInfo->lusup = Lval_buf; + } + if (myrow == krow) + { + int_t lk = LBi (k, grid); + usub = Ufstnz_br_ptr[lk]; + uval = Unzval_br_ptr[lk]; + uPanelInfo->usub = usub; + } + else + { + if (ToRecv[k] == 2) + { + usub = Usub_buf; + uval = Uval_buf; + uPanelInfo->usub = usub; + } + } + + /*now each procs does the schurcomplement update*/ + int_t msg0 = msgcnt[0]; + int_t msg2 = msgcnt[2]; + int_t knsupc = SuperSize (k); + + int_t lptr0, luptr0; + int_t LU_nonempty = msg0 && msg2; + if (LU_nonempty == 0) return 0; + if (msg0 && msg2) /* L(:,k) and U(k,:) are not empty. */ + { + lPanelInfo->nsupr = lsub[1]; + int_t nlb; + if (myrow == krow) /* Skip diagonal block L(k,k). */ + { + lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER + 1]; + luptr0 = knsupc; + nlb = lsub[0] - 1; + lPanelInfo->nlb = nlb; + } + else + { + lptr0 = BC_HEADER; + luptr0 = 0; + nlb = lsub[0]; + lPanelInfo->nlb = nlb; + } + int_t iukp = BR_HEADER; /* Skip header; Pointer to index[] of U(k,:) */ + + int_t nub = usub[0]; /* Number of blocks in the block row U(k,:) */ + int_t klst = FstBlockC (k + 1); + uPanelInfo->klst = klst; + + /* -------------------------------------------------------------- + Update the look-ahead block columns A(:,k+1:k+num_look_ahead). + -------------------------------------------------------------- */ + int_t iukp0 = iukp; + + /* reorder the remaining columns in bottom-up */ + for (int_t jj = 0; jj < nub; jj++) + { +#ifdef ISORT + iperm_u[jj] = iperm_c_supno[usub[iukp]]; /* Global block number of block U(k,j). */ + perm_u[jj] = jj; +#else + perm_u[2 * jj] = iperm_c_supno[usub[iukp]]; /* Global block number of block U(k,j). */ + perm_u[2 * jj + 1] = jj; +#endif + int_t jb = usub[iukp]; /* Global block number of block U(k,j). 
*/ + int_t nsupc = SuperSize (jb); + iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */ + iukp += nsupc; + } + iukp = iukp0; +#ifdef ISORT + isort (nub, iperm_u, perm_u); +#else + qsort (perm_u, (size_t) nub, 2 * sizeof (int_t), + &superlu_sort_perm); +#endif + HyP->Lnbrow = 0; + HyP->Rnbrow = 0; + HyP->num_u_blks_Phi=0; + HyP->num_u_blks=0; + + dRgather_L(k, lsub, lusup, gEtreeInfo, Glu_persist, grid, HyP, myIperm, iperm_c_supno); + if (HyP->Lnbrow + HyP->Rnbrow > 0) + { + dRgather_U( k, 0, usub, uval, bigU, gEtreeInfo, Glu_persist, grid, HyP, myIperm, iperm_c_supno, perm_u); + }/*if(nbrow>0) */ + + } + + return LU_nonempty; +} /* dSchurComplementSetupGPU */ + + +double* dgetBigV(int_t ldt, int_t num_threads) +{ + double *bigV; + if (!(bigV = doubleMalloc_dist (8 * ldt * ldt * num_threads))) + ABORT ("Malloc failed for dgemm buffV"); + return bigV; +} + +double* dgetBigU(int_t nsupers, gridinfo_t *grid, + LUstruct_t *LUstruct) +{ + int_t Pr = grid->nprow; + int_t Pc = grid->npcol; + int_t iam = grid->iam; + int_t mycol = MYCOL (iam, grid); + + /* Following circuit is for finding maximum block size */ + int local_max_row_size = 0; + int max_row_size; + + for (int_t i = 0; i < nsupers; ++i) + { + int_t tpc = PCOL (i, grid); + if (mycol == tpc) + { + int_t lk = LBj (i, grid); + int_t* lsub = LUstruct->Llu->Lrowind_bc_ptr[lk]; + if (lsub != NULL) + { + local_max_row_size = SUPERLU_MAX (local_max_row_size, lsub[1]); + } + } + + } + + /* Max row size is global reduction of within A row */ + MPI_Allreduce (&local_max_row_size, &max_row_size, 1, MPI_INT, MPI_MAX, + (grid->rscp.comm)); + + // int_t Threads_per_process = get_thread_per_process (); + + /*Buffer size is max of of look ahead window*/ + + int_t bigu_size = + 8 * sp_ienv_dist (3) * (max_row_size) * SUPERLU_MAX(Pr / Pc, 1); + //Sherry: 8 * sp_ienv_dist (3) * (max_row_size) * MY_MAX(Pr / Pc, 1); + + // printf("Size of big U is %d\n",bigu_size ); + double* bigU = doubleMalloc_dist(bigu_size); + + return bigU; +} /* dgetBigU */ + +trf3Dpartition_t* dinitTrf3Dpartition(int_t nsupers, + superlu_dist_options_t *options, + LUstruct_t *LUstruct, gridinfo3d_t * grid3d + ) +{ + gridinfo_t* grid = &(grid3d->grid2d); + +#if ( DEBUGlevel>=1 ) + int iam = grid3d->iam; + CHECK_MALLOC (iam, "Enter initTrf3Dpartition()"); +#endif + int_t* perm_c_supno = getPerm_c_supno(nsupers, options, LUstruct, grid); + int_t* iperm_c_supno = getFactIperm(perm_c_supno, nsupers); + + // calculating tree factorization + int_t *setree = supernodal_etree(nsupers, LUstruct->etree, LUstruct->Glu_persist->supno, LUstruct->Glu_persist->xsup); + treeList_t* treeList = setree2list(nsupers, setree ); + + /*update treelist with weight and depth*/ + getSCUweight(nsupers, treeList, LUstruct, grid3d); + + calcTreeWeight(nsupers, setree, treeList, LUstruct->Glu_persist->xsup); + + gEtreeInfo_t gEtreeInfo; + gEtreeInfo.setree = setree; + gEtreeInfo.numChildLeft = (int_t* ) SUPERLU_MALLOC(sizeof(int_t) * nsupers); + for (int_t i = 0; i < nsupers; ++i) + { + /* code */ + gEtreeInfo.numChildLeft[i] = treeList[i].numChild; + } + + int_t maxLvl = log2i(grid3d->zscp.Np) + 1; + sForest_t** sForests = getForests( maxLvl, nsupers, setree, treeList); + /*indexes of trees for my process grid in gNodeList size(maxLvl)*/ + int_t* myTreeIdxs = getGridTrees(grid3d); + int_t* myZeroTrIdxs = getReplicatedTrees(grid3d); + int_t* gNodeCount = getNodeCountsFr(maxLvl, sForests); + int_t** gNodeLists = getNodeListFr(maxLvl, sForests); + + dinit3DLUstructForest(myTreeIdxs, myZeroTrIdxs, + sForests, 
LUstruct, grid3d); + int_t* myNodeCount = getMyNodeCountsFr(maxLvl, myTreeIdxs, sForests); + int_t** treePerm = getTreePermFr( myTreeIdxs, sForests, grid3d); + + LUValSubBuf_t *LUvsb = SUPERLU_MALLOC(sizeof(LUValSubBuf_t)); + dLluBufInit(LUvsb, LUstruct); + + int_t* supernode2treeMap = SUPERLU_MALLOC(nsupers*sizeof(int_t)); + int_t numForests = (1 << maxLvl) - 1; + for (int_t Fr = 0; Fr < numForests; ++Fr) + { + /* code */ + for (int_t nd = 0; nd < gNodeCount[Fr]; ++nd) + { + /* code */ + supernode2treeMap[gNodeLists[Fr][nd]]=Fr; + } + } + + trf3Dpartition_t* trf3Dpartition = SUPERLU_MALLOC(sizeof(trf3Dpartition_t)); + + trf3Dpartition->gEtreeInfo = gEtreeInfo; + trf3Dpartition->iperm_c_supno = iperm_c_supno; + trf3Dpartition->myNodeCount = myNodeCount; + trf3Dpartition->myTreeIdxs = myTreeIdxs; + trf3Dpartition->myZeroTrIdxs = myZeroTrIdxs; + trf3Dpartition->sForests = sForests; + trf3Dpartition->treePerm = treePerm; + trf3Dpartition->LUvsb = LUvsb; + trf3Dpartition->supernode2treeMap = supernode2treeMap; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit initTrf3Dpartition()"); +#endif + return trf3Dpartition; +} /* dinitTrf3Dpartition */ + + +#if 0 //**** Sherry: following two routines are old, the new ones are in util.c +int_t num_full_cols_U(int_t kk, int_t **Ufstnz_br_ptr, int_t *xsup, + gridinfo_t *grid, int_t *perm_u) +{ + int_t lk = LBi (kk, grid); + int_t *usub = Ufstnz_br_ptr[lk]; + + if (usub == NULL) + { + /* code */ + return 0; + } + int_t iukp = BR_HEADER; /* Skip header; Pointer to index[] of U(k,:) */ + int_t rukp = 0; /* Pointer to nzval[] of U(k,:) */ + int_t nub = usub[0]; /* Number of blocks in the block row U(k,:) */ + + int_t klst = FstBlockC (kk + 1); + int_t iukp0 = iukp; + int_t rukp0 = rukp; + int_t jb, ljb; + int_t nsupc; + int_t temp_ncols = 0; + int_t segsize; + + temp_ncols = 0; + + for (int_t j = 0; j < nub; ++j) + { + arrive_at_ublock( + j, &iukp, &rukp, &jb, &ljb, &nsupc, + iukp0, rukp0, usub, perm_u, xsup, grid + ); + + for (int_t jj = iukp; jj < iukp + nsupc; ++jj) + { + segsize = klst - usub[jj]; + if ( segsize ) ++temp_ncols; + } + } + return temp_ncols; +} + +// Sherry: this is old; new version is in util.c +int_t estimate_bigu_size( int_t nsupers, int_t ldt, int_t**Ufstnz_br_ptr, + Glu_persist_t *Glu_persist, gridinfo_t* grid, int_t* perm_u) +{ + + int_t iam = grid->iam; + + int_t Pr = grid->nprow; + int_t myrow = MYROW (iam, grid); + + int_t* xsup = Glu_persist->xsup; + + int ncols = 0; + int_t ldu = 0; + + /*initilize perm_u*/ + for (int i = 0; i < nsupers; ++i) + { + perm_u[i] = i; + } + + for (int lk = myrow; lk < nsupers; lk += Pr ) + { + ncols = SUPERLU_MAX(ncols, num_full_cols_U(lk, Ufstnz_br_ptr, + xsup, grid, perm_u, &ldu)); + } + + int_t max_ncols = 0; + + MPI_Allreduce(&ncols, &max_ncols, 1, mpi_int_t, MPI_MAX, grid->cscp.comm); + + printf("max_ncols =%d, bigu_size=%ld\n", (int) max_ncols, (long long) ldt * max_ncols); + return ldt * max_ncols; +} /* old estimate_bigu_size. New one is in util.c */ +#endif /**** end old ones ****/ + + diff --git a/SRC/dtrfCommWrapper.c b/SRC/dtrfCommWrapper.c new file mode 100644 index 00000000..387a1977 --- /dev/null +++ b/SRC/dtrfCommWrapper.c @@ -0,0 +1,520 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. 
+*/ + + +#include "superlu_ddefs.h" + +#if 0 +#include "pdgstrf3d.h" +#include "trfCommWrapper.h" +#endif + +#ifdef __INTEL_COMPILER +#include "mkl.h" +#else +#include "cblas.h" +#endif + +int_t dDiagFactIBCast(int_t k, int_t k0, // supernode to be factored + double *BlockUFactor, + double *BlockLFactor, + int_t* IrecvPlcd_D, + MPI_Request *U_diag_blk_recv_req, + MPI_Request *L_diag_blk_recv_req, + MPI_Request *U_diag_blk_send_req, + MPI_Request *L_diag_blk_send_req, + gridinfo_t *grid, + superlu_dist_options_t *options, + double thresh, + LUstruct_t *LUstruct, + SuperLUStat_t *stat, int *info, + SCT_t *SCT, + int tag_ub + ) +{ + // unpacking variables + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + LocalLU_t *Llu = LUstruct->Llu; + int_t* xsup = Glu_persist->xsup; + + int_t iam = grid->iam; + int_t Pc = grid->npcol; + int_t Pr = grid->nprow; + int_t myrow = MYROW (iam, grid); + int_t mycol = MYCOL (iam, grid); + int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid); + int_t krow = PROW (k, grid); + int_t kcol = PCOL (k, grid); + + //xsup for supersize + + /*Place Irecvs first*/ + // if (IrecvPlcd_D[k] == 0 ) + // { + int_t nsupc = SuperSize (k); + if (mycol == kcol && iam != pkk) + { + dIRecv_UDiagBlock(k0, BlockUFactor, /*pointer for the diagonal block*/ + nsupc * nsupc, krow, + U_diag_blk_recv_req, grid, SCT, tag_ub); + } + + if (myrow == krow && iam != pkk) + { + dIRecv_LDiagBlock(k0, BlockLFactor, /*pointer for the diagonal block*/ + nsupc * nsupc, kcol, + L_diag_blk_recv_req, grid, SCT, tag_ub); + } + IrecvPlcd_D[k] = 1; + // } + + /*DiagFact and send */ + // if ( factored_D[k] == 0 ) + // { + + // int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid); + // int_t krow = PROW (k, grid); + // int_t kcol = PCOL (k, grid); + /*factorize the leaf node and broadcast them + process row and process column*/ + if (iam == pkk) + { + // printf("Entering factorization %d\n", k); + // int_t offset = (k0 - k_st); // offset is input + /*factorize A[kk]*/ + Local_Dgstrf2(options, k, thresh, + BlockUFactor, /*factored U is over writen here*/ + Glu_persist, grid, Llu, stat, info, SCT); + + /*Pack L[kk] into blockLfactor*/ + dPackLBlock(k, BlockLFactor, Glu_persist, grid, Llu); + + /*Isend U blocks to the process row*/ + int_t nsupc = SuperSize(k); + dISend_UDiagBlock(k0, BlockLFactor, + nsupc * nsupc, U_diag_blk_send_req , grid, tag_ub); + + /*Isend L blocks to the process col*/ + dISend_LDiagBlock(k0, BlockLFactor, + nsupc * nsupc, L_diag_blk_send_req, grid, tag_ub); + SCT->commVolFactor += 1.0 * nsupc * nsupc * (Pr + Pc); + } + // } + return 0; +} + +int_t dLPanelTrSolve( int_t k, int_t* factored_L, + double* BlockUFactor, + gridinfo_t *grid, + LUstruct_t *LUstruct) +{ + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + LocalLU_t *Llu = LUstruct->Llu; + int_t* xsup = Glu_persist->xsup; + + int_t iam = grid->iam; + + int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid); + int_t kcol = PCOL (k, grid); + int_t mycol = MYCOL (iam, grid); + int_t nsupc = SuperSize(k); + + /*factor the L panel*/ + if (mycol == kcol && iam != pkk) + { + // factored_L[k] = 1; + int_t lk = LBj (k, grid); + double *lusup = Llu->Lnzval_bc_ptr[lk]; + int_t nsupr; + if (Llu->Lrowind_bc_ptr[lk]) + nsupr = Llu->Lrowind_bc_ptr[lk][1]; + else + nsupr = 0; + /*wait for communication to finish*/ + + // Wait_UDiagBlock_Recv( U_diag_blk_recv_req, SCT); + // int_t flag = 0; + // while (flag == 0) + // { + // flag = Test_UDiagBlock_Recv( U_diag_blk_recv_req, SCT); + // } + + int_t l = nsupr; + double* ublk_ptr = 
BlockUFactor; + int_t ld_ujrow = nsupc; + + // unsigned long long t1 = _rdtsc(); + + // #pragma omp for schedule(dynamic) nowait +#define BL 32 + for (int i = 0; i < CEILING(l, BL); ++i) + { + #pragma omp task + { + int_t off = i * BL; + // Sherry: int_t len = MY_MIN(BL, l - i * BL); + int_t len = SUPERLU_MIN(BL, l - i * BL); + cblas_dtrsm (CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, + len, nsupc, 1.0, ublk_ptr, ld_ujrow, &lusup[off], nsupr); + } + } + } + + if (iam == pkk) + { + /* if (factored_L[k] == 0) + { */ + /* code */ + factored_L[k] = 1; + int_t lk = LBj (k, grid); + double *lusup = Llu->Lnzval_bc_ptr[lk]; + int_t nsupr; + if (Llu->Lrowind_bc_ptr[lk]) + nsupr = Llu->Lrowind_bc_ptr[lk][1]; + else + nsupr = 0; + + /*factorize A[kk]*/ + + int_t l = nsupr - nsupc; + + double* ublk_ptr = BlockUFactor; + int_t ld_ujrow = nsupc; + // printf("%d: L update \n",k ); + +#define BL 32 + // #pragma omp parallel for + for (int i = 0; i < CEILING(l, BL); ++i) + { + int_t off = i * BL; + // Sherry: int_t len = MY_MIN(BL, l - i * BL); + int_t len = SUPERLU_MIN(BL, (l - i * BL)); + #pragma omp task + { + cblas_dtrsm (CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, + len, nsupc, 1.0, ublk_ptr, ld_ujrow, &lusup[nsupc + off], nsupr); + } + } + } + + return 0; +} /* dLPanelTrSolve */ + +int_t dLPanelUpdate( int_t k, int_t* IrecvPlcd_D, int_t* factored_L, + MPI_Request * U_diag_blk_recv_req, + double* BlockUFactor, + gridinfo_t *grid, + LUstruct_t *LUstruct, SCT_t *SCT) +{ + + dUDiagBlockRecvWait( k, IrecvPlcd_D, factored_L, + U_diag_blk_recv_req, grid, LUstruct, SCT); + + dLPanelTrSolve( k, factored_L, BlockUFactor, grid, LUstruct ); + + return 0; +} /* dLPanelUpdate */ + +#define BL 32 + +int_t dUPanelTrSolve( int_t k, + double* BlockLFactor, + double* bigV, + int_t ldt, + Ublock_info_t* Ublock_info, + gridinfo_t *grid, + LUstruct_t *LUstruct, + SuperLUStat_t *stat, SCT_t *SCT) +{ + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + LocalLU_t *Llu = LUstruct->Llu; + int_t* xsup = Glu_persist->xsup; + int_t iam = grid->iam; + int_t myrow = MYROW (iam, grid); + int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid); + int_t krow = PROW (k, grid); + int_t nsupc = SuperSize(k); + + /*factor the U panel*/ + if (myrow == krow && iam != pkk) + { + int_t lk = LBi (k, grid); /* Local block number */ + if (!Llu->Unzval_br_ptr[lk]) + return 0; + /* Initialization. */ + int_t klst = FstBlockC (k + 1); + + int_t *usub = Llu->Ufstnz_br_ptr[lk]; /* index[] of block row U(k,:) */ + double *uval = Llu->Unzval_br_ptr[lk]; + int_t nb = usub[0]; + + // int_t nsupr = Lsub_buf[1]; /* LDA of lusup[] */ + double *lusup = BlockLFactor; + + /* Loop through all the row blocks. to get the iukp and rukp*/ + Trs2_InitUblock_info(klst, nb, Ublock_info, usub, Glu_persist, stat ); + + /* Loop through all the row blocks. 
*/ + // #pragma omp for schedule(dynamic,2) nowait + for (int_t b = 0; b < nb; ++b) + { + #pragma omp task + { + int_t thread_id = omp_get_thread_num(); + double *tempv = bigV + thread_id * ldt * ldt; + Trs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp, + usub, uval, tempv, nsupc, nsupc, lusup, Glu_persist); + } + } + } + + /*factor the U panel*/ + if (iam == pkk) + { + /* code */ + // factored_U[k] = 1; + int_t *Lsub_buf; + double *Lval_buf; + int_t lk = LBj (k, grid); + Lsub_buf = Llu->Lrowind_bc_ptr[lk]; + Lval_buf = Llu->Lnzval_bc_ptr[lk]; + + + /* calculate U panel */ + // PDGSTRS2 (n, k0, k, Lsub_buf, Lval_buf, Glu_persist, grid, Llu, + // stat, HyP->Ublock_info, bigV, ldt, SCT); + + lk = LBi (k, grid); /* Local block number */ + if (Llu->Unzval_br_ptr[lk]) + { + /* Initialization. */ + int_t klst = FstBlockC (k + 1); + + int_t *usub = Llu->Ufstnz_br_ptr[lk]; /* index[] of block row U(k,:) */ + double *uval = Llu->Unzval_br_ptr[lk]; + int_t nb = usub[0]; + + // int_t nsupr = Lsub_buf[1]; /* LDA of lusup[] */ + int_t nsupr = Lsub_buf[1]; /* LDA of lusup[] */ + double *lusup = Lval_buf; + + /* Loop through all the row blocks. to get the iukp and rukp*/ + Trs2_InitUblock_info(klst, nb, Ublock_info, usub, Glu_persist, stat ); + + /* Loop through all the row blocks. */ + // printf("%d :U update \n", k); + for (int_t b = 0; b < nb; ++b) + { + #pragma omp task + { + int_t thread_id = omp_get_thread_num(); + double *tempv = bigV + thread_id * ldt * ldt; + Trs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp, + usub, uval, tempv, nsupc, nsupr, lusup, Glu_persist); + } + + } + } + } + + return 0; +} /* dUPanelTrSolve */ + +int_t dUPanelUpdate( int_t k, int_t* factored_U, + MPI_Request * L_diag_blk_recv_req, + double* BlockLFactor, + double* bigV, + int_t ldt, + Ublock_info_t* Ublock_info, + gridinfo_t *grid, + LUstruct_t *LUstruct, + SuperLUStat_t *stat, SCT_t *SCT) +{ + + LDiagBlockRecvWait( k, factored_U, L_diag_blk_recv_req, grid); + + dUPanelTrSolve( k, BlockLFactor, bigV, ldt, Ublock_info, grid, + LUstruct, stat, SCT); + return 0; +} + +int_t dIBcastRecvLPanel( + int_t k, + int_t k0, + int* msgcnt, + MPI_Request *send_req, + MPI_Request *recv_req , + int_t* Lsub_buf, + double* Lval_buf, + int_t * factored, + gridinfo_t *grid, + LUstruct_t *LUstruct, + SCT_t *SCT, + int tag_ub +) +{ + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + LocalLU_t *Llu = LUstruct->Llu; + int_t* xsup = Glu_persist->xsup; + int_t** ToSendR = Llu->ToSendR; + int_t* ToRecv = Llu->ToRecv; + int_t iam = grid->iam; + int_t Pc = grid->npcol; + int_t mycol = MYCOL (iam, grid); + int_t kcol = PCOL (k, grid); + int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + /* code */ + if (mycol == kcol) + { + /*send the L panel to myrow*/ + + int_t lk = LBj (k, grid); /* Local block number. */ + int_t* lsub = Lrowind_bc_ptr[lk]; + double* lusup = Lnzval_bc_ptr[lk]; + + dIBcast_LPanel (k, k0, lsub, lusup, grid, msgcnt, send_req, + ToSendR, xsup, tag_ub); + + if (lsub) + { + int_t nrbl = lsub[0]; /*number of L blocks */ + int_t len = lsub[1]; /* LDA of the nzval[] */ + int_t len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; + int_t len2 = SuperSize(lk) * len; + SCT->commVolFactor += 1.0 * (Pc - 1) * (len1 * sizeof(int_t) + len2 * sizeof(double)); + } + } + else + { + /*receive factored L panels*/ + if (ToRecv[k] >= 1) /* Recv block column L(:,0). 
*/ + { + /*place Irecv*/ + dIrecv_LPanel (k, k0, Lsub_buf, Lval_buf, grid, recv_req, Llu, tag_ub); + } + else + { + msgcnt[0] = 0; + } + + } + factored[k] = 0; + + return 0; +} + +int_t dIBcastRecvUPanel(int_t k, int_t k0, int* msgcnt, + MPI_Request *send_requ, + MPI_Request *recv_requ, + int_t* Usub_buf, double* Uval_buf, + gridinfo_t *grid, LUstruct_t *LUstruct, + SCT_t *SCT, int tag_ub) +{ + LocalLU_t *Llu = LUstruct->Llu; + + int_t* ToSendD = Llu->ToSendD; + int_t* ToRecv = Llu->ToRecv; + int_t iam = grid->iam; + int_t Pr = grid->nprow; + int_t myrow = MYROW (iam, grid); + int_t krow = PROW (k, grid); + + int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + double** Unzval_br_ptr = Llu->Unzval_br_ptr; + if (myrow == krow) + { + /*send U panel to myrow*/ + int_t lk = LBi (k, grid); + int_t* usub = Ufstnz_br_ptr[lk]; + double* uval = Unzval_br_ptr[lk]; + dIBcast_UPanel(k, k0, usub, uval, grid, msgcnt, + send_requ, ToSendD, tag_ub); + if (usub) + { + /* code */ + int_t lenv = usub[1]; + int_t lens = usub[2]; + SCT->commVolFactor += 1.0 * (Pr - 1) * (lens * sizeof(int_t) + lenv * sizeof(double)); + } + } + else + { + /*receive U panels */ + if (ToRecv[k] == 2) /* Recv block row U(k,:). */ + { + dIrecv_UPanel (k, k0, Usub_buf, Uval_buf, Llu, grid, recv_requ, tag_ub); + } + else + { + msgcnt[2] = 0; + } + } + + return 0; +} + +int_t dWaitL( int_t k, int* msgcnt, int* msgcntU, + MPI_Request *send_req, MPI_Request *recv_req, + gridinfo_t *grid, LUstruct_t *LUstruct, SCT_t *SCT) +{ + LocalLU_t *Llu = LUstruct->Llu; + int_t** ToSendR = Llu->ToSendR; + int_t* ToRecv = Llu->ToRecv; + int_t iam = grid->iam; + int_t mycol = MYCOL (iam, grid); + int_t kcol = PCOL (k, grid); + if (mycol == kcol) + { + /*send the L panel to myrow*/ + Wait_LSend (k, grid, ToSendR, send_req, SCT); + } + else + { + /*receive factored L panels*/ + if (ToRecv[k] >= 1) /* Recv block column L(:,0). */ + { + /*force wait for I recv to complete*/ + dWait_LRecv( recv_req, msgcnt, msgcntU, grid, SCT); + } + } + + return 0; +} + +int_t dWaitU( int_t k, int* msgcnt, + MPI_Request *send_requ, MPI_Request *recv_requ, + gridinfo_t *grid, LUstruct_t *LUstruct, SCT_t *SCT) +{ + LocalLU_t *Llu = LUstruct->Llu; + + int_t* ToRecv = Llu->ToRecv; + int_t* ToSendD = Llu->ToSendD; + int_t iam = grid->iam; + int_t myrow = MYROW (iam, grid); + int_t krow = PROW (k, grid); + if (myrow == krow) + { + int_t lk = LBi (k, grid); + if (ToSendD[lk] == YES) + Wait_USend(send_requ, grid, SCT); + } + else + { + /*receive U panels */ + if (ToRecv[k] == 2) /* Recv block row U(k,:). */ + { + /*force wait*/ + dWait_URecv( recv_requ, msgcnt, SCT); + } + } + return 0; +} diff --git a/SRC/pd3dcomm.c b/SRC/pd3dcomm.c new file mode 100644 index 00000000..7ddf2394 --- /dev/null +++ b/SRC/pd3dcomm.c @@ -0,0 +1,856 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. 
+*/ + + +#include "superlu_ddefs.h" +#include "cblas.h" +#if 0 +#include "p3dcomm.h" +#include "sec_structs.h" +//#include "load-balance/supernodal_etree.h" +//#include "load-balance/supernodalForest.h" +#include "supernodal_etree.h" +#include "supernodalForest.h" +#include "trfAux.h" +#include "treeFactorization.h" +#include "xtrf3Dpartition.h" +#endif + +#define INT_T_ALLOC(x) ((int_t *) SUPERLU_MALLOC ( (x) * sizeof (int_t))) +#define DOUBLE_ALLOC(x) ((double *) SUPERLU_MALLOC ( (x) * sizeof (double))) + +// #define MPI_MALLOC +#define MPI_INT_ALLOC(a, b) (MPI_Alloc_mem( (b)*sizeof(int_t), MPI_INFO_NULL, &(a) )) +#define MPI_DATATYPE_ALLOC(a, b) (MPI_Alloc_mem((b)*sizeof(double), MPI_INFO_NULL, &(a))) + +int_t dAllocLlu(int_t nsupers, LUstruct_t * LUstruct, gridinfo3d_t* grid3d) +{ + int_t Pc = grid3d->npcol; + int_t Pr = grid3d->nprow; + + int_t nbc = CEILING(nsupers, Pc); + int_t nbr = CEILING(nsupers, Pr); + + LocalLU_t *Llu = LUstruct->Llu; + int_t **Lrowind_bc_ptr = + (int_t**) SUPERLU_MALLOC(sizeof(int_t*)*nbc); /* size ceil(NSUPERS/Pc) */ + double **Lnzval_bc_ptr = + (double **) SUPERLU_MALLOC(sizeof(double*)*nbc); /* size ceil(NSUPERS/Pc) */ + + for (int_t i = 0; i < nbc ; ++i) + { + /* code */ + Lrowind_bc_ptr[i] = NULL; + Lnzval_bc_ptr[i] = NULL; + } + + int_t **Ufstnz_br_ptr = + (int_t**) SUPERLU_MALLOC(sizeof(int_t*)*nbr); /* size ceil(NSUPERS/Pr) */ + double **Unzval_br_ptr = + (double **) SUPERLU_MALLOC(sizeof(double*)*nbr); /* size ceil(NSUPERS/Pr) */ + + for (int_t i = 0; i < nbr ; ++i) + { + /* code */ + Ufstnz_br_ptr[i] = NULL; + Unzval_br_ptr[i] = NULL; + } + + int_t *ToRecv = intCalloc_dist(nsupers); /* Recv from no one (0), left (1), and up (2).*/ + int_t *ToSendD = intCalloc_dist(nbr); /* Whether need to send down block row. */ + int_t **ToSendR = (int_t **) SUPERLU_MALLOC(nbc * sizeof(int_t*)); /* List of processes to send right block col. */ + + for (int_t i = 0; i < nbc; ++i) + { + /* code */ + ToSendR[i] = INT_T_ALLOC(Pc); + } + + /*now setup the pointers*/ + Llu->Lrowind_bc_ptr = Lrowind_bc_ptr ; + Llu->Lnzval_bc_ptr = Lnzval_bc_ptr ; + Llu->Ufstnz_br_ptr = Ufstnz_br_ptr ; + Llu->Unzval_br_ptr = Unzval_br_ptr ; + Llu->ToRecv = ToRecv ; + Llu->ToSendD = ToSendD ; + Llu->ToSendR = ToSendR ; + + return 0; +} /* dAllocLlu */ + +int_t dmpiMallocLUStruct(int_t nsupers, LUstruct_t * LUstruct, gridinfo3d_t* grid3d) +{ + LocalLU_t *Llu = LUstruct->Llu; + int_t* xsup = LUstruct->Glu_persist->xsup; + int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + double** Unzval_br_ptr = Llu->Unzval_br_ptr; + int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + gridinfo_t* grid = &(grid3d->grid2d); + + int_t k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ + for ( int_t lb = 0; lb < k; ++lb) + { + int_t *usub, *usub_new; + usub = Ufstnz_br_ptr[lb]; + + double * uval = Unzval_br_ptr[lb]; + double * uval_new; + + /*if non empty set the flag*/ + if (usub != NULL) + { + int_t lenv, lens; + lenv = usub[1]; + lens = usub[2]; + + MPI_INT_ALLOC(usub_new, lens); + memcpy( usub_new, usub, lens * sizeof(int_t)); + MPI_DATATYPE_ALLOC(uval_new, lenv); + memcpy( uval_new, uval, lenv * sizeof(double)); + Ufstnz_br_ptr[lb] = usub_new; + Unzval_br_ptr[lb] = uval_new; + SUPERLU_FREE(usub); + SUPERLU_FREE(uval); + } + } /*for ( int_t lb = 0; lb < k; ++lb)*/ + + int_t iam = grid->iam; + int_t mycol = MYCOL (iam, grid); + + /*start broadcasting blocks*/ + for (int_t jb = 0; jb < nsupers; ++jb) /* for each block column ... 
*/ + { + int_t pc = PCOL( jb, grid ); + if (mycol == pc) + { + int_t ljb = LBj( jb, grid ); /* Local block number */ + int_t *lsub , *lsub_new; + double *lnzval, *lnzval_new; + lsub = Lrowind_bc_ptr[ljb]; + lnzval = Lnzval_bc_ptr[ljb]; + + if (lsub) + { + int_t nrbl, len, len1, len2; + + nrbl = lsub[0]; /*number of L blocks */ + len = lsub[1]; /* LDA of the nzval[] */ + len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; + len2 = SuperSize(jb) * len; + + MPI_INT_ALLOC(lsub_new, len1); + memcpy( lsub_new, lsub, len1 * sizeof(int_t)); + MPI_DATATYPE_ALLOC(lnzval_new, len2); + memcpy( lnzval_new, lnzval, len2 * sizeof(double)); + Lrowind_bc_ptr[ljb] = lsub_new; + SUPERLU_FREE(lsub ); + Lnzval_bc_ptr[ljb] = lnzval_new; + SUPERLU_FREE(lnzval ); + } + } /* if mycol == pc ... */ + } /* for jb ... */ + + return 0; +} + + +int_t dzSendLPanel(int_t k, int_t receiver, + LUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT) +{ + LocalLU_t *Llu = LUstruct->Llu; + int_t* xsup = LUstruct->Glu_persist->xsup; + int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + gridinfo_t* grid = &(grid3d->grid2d); + int_t iam = grid->iam; + int_t mycol = MYCOL (iam, grid); + + int_t pc = PCOL( k, grid ); + if (mycol == pc) + { + int_t lk = LBj( k, grid ); /* Local block number */ + int_t *lsub; + double* lnzval; + lsub = Lrowind_bc_ptr[lk]; + lnzval = Lnzval_bc_ptr[lk]; + + if (lsub != NULL) + { + int_t len = lsub[1]; /* LDA of the nzval[] */ + int_t len2 = SuperSize(k) * len; /* size of nzval of L panel */ + + MPI_Send(lnzval, len2, MPI_DOUBLE, receiver, k, grid3d->zscp.comm); + SCT->commVolRed += len2 * sizeof(double); + } + } + return 0; +} + + +int_t dzRecvLPanel(int_t k, int_t sender, double alpha, double beta, + double* Lval_buf, + LUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT) +{ + + // A(k) = alpha*A(k) + beta* A^{sender}(k) + LocalLU_t *Llu = LUstruct->Llu; + int_t* xsup = LUstruct->Glu_persist->xsup; + int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + gridinfo_t* grid = &(grid3d->grid2d); + + int_t iam = grid->iam; + int_t mycol = MYCOL (iam, grid); + + int_t pc = PCOL( k, grid ); + if (mycol == pc) + { + int_t lk = LBj( k, grid ); /* Local block number */ + int_t *lsub; + double* lnzval; + lsub = Lrowind_bc_ptr[lk]; + lnzval = Lnzval_bc_ptr[lk]; + + if (lsub != NULL) + { + + int_t len = lsub[1]; /* LDA of the nzval[] */ + int_t len2 = SuperSize(k) * len; /*size of nzval of L panels*/ + + MPI_Status status; + MPI_Recv(Lval_buf , len2, MPI_DOUBLE, sender, k, + grid3d->zscp.comm, &status); + + /*reduce the updates*/ + cblas_dscal (len2, alpha, lnzval, 1); + cblas_daxpy (len2, beta, Lval_buf, 1, lnzval, 1); + } + } + + return 0; +} + +int_t dzSendUPanel(int_t k, int_t receiver, + LUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT) +{ + LocalLU_t *Llu = LUstruct->Llu; + int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + double** Unzval_br_ptr = Llu->Unzval_br_ptr; + gridinfo_t* grid = &(grid3d->grid2d); + int_t iam = grid->iam; + + int_t myrow = MYROW (iam, grid); + int_t pr = PROW( k, grid ); + if (myrow == pr) + { + int_t lk = LBi( k, grid ); /* Local block number */ + int_t *usub; + double* unzval; + usub = Ufstnz_br_ptr[lk]; + unzval = Unzval_br_ptr[lk]; + + if (usub != NULL) + { + int lenv = usub[1]; + + /* code */ + MPI_Send(unzval, lenv, MPI_DOUBLE, receiver, k, grid3d->zscp.comm); + SCT->commVolRed += lenv * sizeof(double); + } + } + + return 0; +} + + +int_t dzRecvUPanel(int_t k, int_t sender, double alpha, double 
beta, + double* Uval_buf, LUstruct_t* LUstruct, + gridinfo3d_t* grid3d, SCT_t* SCT) +{ + LocalLU_t *Llu = LUstruct->Llu; + int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + double** Unzval_br_ptr = Llu->Unzval_br_ptr; + gridinfo_t* grid = &(grid3d->grid2d); + int_t iam = grid->iam; + + int_t myrow = MYROW (iam, grid); + int_t pr = PROW( k, grid ); + if (myrow == pr) + { + int_t lk = LBi( k, grid ); /* Local block number */ + int_t *usub; + double* unzval; + usub = Ufstnz_br_ptr[lk]; + unzval = Unzval_br_ptr[lk]; + + if (usub != NULL) + { + int lenv = usub[1]; + MPI_Status status; + MPI_Recv(Uval_buf , lenv, MPI_DOUBLE, sender, k, + grid3d->zscp.comm, &status); + + /*reduce the updates*/ + cblas_dscal (lenv, alpha, unzval, 1); + cblas_daxpy (lenv, beta, Uval_buf, 1, unzval, 1); + } + } + return 0; +} + + +int_t dp3dScatter(int_t n, LUstruct_t * LUstruct, gridinfo3d_t* grid3d) +/* Copies LU structure from layer 0 to all the layers */ +{ + gridinfo_t* grid = &(grid3d->grid2d); + int_t Pc = grid->npcol; + int_t Pr = grid->nprow; + + /* broadcast etree */ + int_t *etree = LUstruct->etree; + MPI_Bcast( etree, n, mpi_int_t, 0, grid3d->zscp.comm); + + int_t nsupers; + + if (!grid3d->zscp.Iam) + nsupers = getNsupers(n, LUstruct); + + /* broadcast nsupers */ + MPI_Bcast( &nsupers, 1, mpi_int_t, 0, grid3d->zscp.comm); + + /* Scatter and alloc Glu_persist */ + if (grid3d->zscp.Iam) + AllocGlu(n, nsupers, LUstruct, grid3d); + + /* broadcast Glu_persist */ + int_t *xsup = LUstruct->Glu_persist->xsup; + MPI_Bcast( xsup, nsupers + 1, mpi_int_t, 0, grid3d->zscp.comm); + + int_t *supno = LUstruct->Glu_persist->supno; + MPI_Bcast( supno, n, mpi_int_t, 0, grid3d->zscp.comm); + + /* now broadcast localLu_t */ + /* first allocating space for it */ + if (grid3d->zscp.Iam) + dAllocLlu(nsupers, LUstruct, grid3d); + + LocalLU_t *Llu = LUstruct->Llu; + + /*scatter all the L blocks and indexes*/ + dscatter3dLPanels( nsupers, LUstruct, grid3d); + + /*scatter all the U blocks and indexes*/ + dscatter3dUPanels( nsupers, LUstruct, grid3d); + + int_t* bufmax = Llu->bufmax; + MPI_Bcast( bufmax, NBUFFERS, mpi_int_t, 0, grid3d->zscp.comm); + + /* now sending tosendR etc */ + int_t** ToSendR = Llu->ToSendR; + int_t* ToRecv = Llu->ToRecv; + int_t* ToSendD = Llu->ToSendD; + + int_t nbr = CEILING(nsupers, Pr); + int_t nbc = CEILING(nsupers, Pc); + MPI_Bcast( ToRecv, nsupers, mpi_int_t, 0, grid3d->zscp.comm); + + MPI_Bcast( ToSendD, nbr, mpi_int_t, 0, grid3d->zscp.comm); + for (int_t i = 0; i < nbc; ++i) + { + /* code */ + MPI_Bcast( ToSendR[i], Pc, mpi_int_t, 0, grid3d->zscp.comm); + } + + // +#ifdef MPI_MALLOC + // change MY LU struct into MPI malloc based + if (!grid3d->zscp.Iam) + mpiMallocLUStruct(nsupers, LUstruct, grid3d); +#endif + return 0; +} + + +int_t dscatter3dUPanels(int_t nsupers, + LUstruct_t * LUstruct, gridinfo3d_t* grid3d) +{ + + LocalLU_t *Llu = LUstruct->Llu; + int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + double** Unzval_br_ptr = Llu->Unzval_br_ptr; + gridinfo_t* grid = &(grid3d->grid2d); + + int_t k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ + for ( int_t lb = 0; lb < k; ++lb) { + int_t *usub; + usub = Ufstnz_br_ptr[lb]; + + double * uval = Unzval_br_ptr[lb]; + + int_t flag = 0; + /*if non empty set the flag*/ + if (!grid3d->zscp.Iam && usub != NULL) + flag = 1; + /*bcast the flag*/ + MPI_Bcast( &flag, 1, mpi_int_t, 0, grid3d->zscp.comm); + + if (flag) { + int_t lenv, lens; + lenv = 0; + lens = 0; + + if (!grid3d->zscp.Iam) + { + lenv = usub[1]; + lens = usub[2]; + } + + /*broadcast the 
size of sub array*/ + MPI_Bcast( &lens, 1, mpi_int_t, 0, grid3d->zscp.comm); + MPI_Bcast( &lenv, 1, mpi_int_t, 0, grid3d->zscp.comm); + + /*allocate lsub*/ + if (grid3d->zscp.Iam) +#ifdef MPI_MALLOC + MPI_INT_ALLOC(usub, lens); +#else + usub = INT_T_ALLOC(lens); +#endif + + /*bcast usub*/ + MPI_Bcast( usub, lens, mpi_int_t, 0, grid3d->zscp.comm); + + /*allocate uval*/ + if (grid3d->zscp.Iam) +#ifdef MPI_MALLOC + MPI_DATATYPE_ALLOC(uval, lenv); +#else + uval = DOUBLE_ALLOC(lenv); +#endif + /*broadcast uval*/ + MPI_Bcast( uval, lenv, MPI_DOUBLE, 0, grid3d->zscp.comm); + + /*setup the pointer*/ + Unzval_br_ptr[lb] = uval; + Ufstnz_br_ptr[lb] = usub; + } /* end if flag */ + + } /* end for lb ... */ + return 0; +} /* end dScatter3dUPanels */ + + +int_t dscatter3dLPanels(int_t nsupers, + LUstruct_t * LUstruct, gridinfo3d_t* grid3d) +{ + LocalLU_t *Llu = LUstruct->Llu; + int_t* xsup = LUstruct->Glu_persist->xsup; + gridinfo_t* grid = &(grid3d->grid2d); + int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + int_t iam = grid->iam; + + int_t mycol = MYCOL (iam, grid); + + /*start broadcasting blocks*/ + for (int_t jb = 0; jb < nsupers; ++jb) /* for each block column ... */ + { + int_t pc = PCOL( jb, grid ); + if (mycol == pc) + { + int_t ljb = LBj( jb, grid ); /* Local block number */ + int_t *lsub; + double* lnzval; + lsub = Lrowind_bc_ptr[ljb]; + lnzval = Lnzval_bc_ptr[ljb]; + + int_t flag = 0; + /*if non empty set the flag*/ + if (!grid3d->zscp.Iam && lsub != NULL) + flag = 1; + /*bcast the flag*/ + MPI_Bcast( &flag, 1, mpi_int_t, 0, grid3d->zscp.comm); + + if (flag) { + int_t nrbl, len, len1, len2; + if (!grid3d->zscp.Iam) + { + nrbl = lsub[0]; /*number of L blocks */ + len = lsub[1]; /* LDA of the nzval[] */ + len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; + len2 = SuperSize(jb) * len; + } + + /*bcast lsub len*/ + MPI_Bcast( &len1, 1, mpi_int_t, 0, grid3d->zscp.comm); + + /*allocate lsub*/ + if (grid3d->zscp.Iam) +#ifdef MPI_MALLOC + MPI_INT_ALLOC(lsub, len1); +#else + + lsub = INT_T_ALLOC(len1); +#endif + /*now broadcast lsub*/ + MPI_Bcast( lsub, len1, mpi_int_t, 0, grid3d->zscp.comm); + + /*set up pointer*/ + Lrowind_bc_ptr[ljb] = lsub; + + /*bcast lnzval len*/ + MPI_Bcast( &len2, 1, mpi_int_t, 0, grid3d->zscp.comm); + + /*allocate space for nzval*/ + if (grid3d->zscp.Iam) +#ifdef MPI_MALLOC + MPI_DATATYPE_ALLOC(lnzval, len2); +#else + lnzval = doubleCalloc_dist(len2); +#endif + + /*bcast nonzero values*/ + MPI_Bcast( lnzval, len2, MPI_DOUBLE, 0, grid3d->zscp.comm); + + /*setup the pointers*/ + Lnzval_bc_ptr[ljb] = lnzval; + + } /* end if flag */ + + } /* end if mycol == pc */ + } /* end for jb ... */ + + return 0; +} /* dscatter3dLPanels */ + +int_t dcollect3dLpanels(int_t layer, int_t nsupers, LUstruct_t * LUstruct, + gridinfo3d_t* grid3d) +{ + + LocalLU_t *Llu = LUstruct->Llu; + int_t* xsup = LUstruct->Glu_persist->xsup; + int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + gridinfo_t* grid = &(grid3d->grid2d); + + int_t iam = grid->iam; + int_t mycol = MYCOL (iam, grid); + + /*start broadcasting blocks*/ + for (int_t jb = 0; jb < nsupers; ++jb) /* for each block column ... 
*/ + { + int_t pc = PCOL( jb, grid ); + if (mycol == pc) + { + int_t ljb = LBj( jb, grid ); /* Local block number */ + int_t *lsub; + double* lnzval; + lsub = Lrowind_bc_ptr[ljb]; + lnzval = Lnzval_bc_ptr[ljb]; + + if (lsub != NULL) + { + int_t len = lsub[1]; /* LDA of the nzval[] */ + int_t len2 = SuperSize(jb) * len; /*size of nzval of L panel */ + + if (grid3d->zscp.Iam == layer) + { + MPI_Send(lnzval, len2, MPI_DOUBLE, 0, jb, grid3d->zscp.comm); + } + if (!grid3d->zscp.Iam) + { + MPI_Status status; + MPI_Recv(lnzval, len2, MPI_DOUBLE, layer, jb, grid3d->zscp.comm, &status); + } + } + } + } /* for jb ... */ + return 0; +} + +int_t dcollect3dUpanels(int_t layer, int_t nsupers, LUstruct_t * LUstruct, + gridinfo3d_t* grid3d) +{ + LocalLU_t *Llu = LUstruct->Llu; + int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + double** Unzval_br_ptr = Llu->Unzval_br_ptr; + gridinfo_t* grid = &(grid3d->grid2d); + + int_t k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ + for ( int_t lb = 0; lb < k; ++lb) + { + int_t *usub; + usub = Ufstnz_br_ptr[lb]; + double * uval = Unzval_br_ptr[lb]; + + if (usub) + { + /* code */ + int lenv = usub[1]; + if (grid3d->zscp.Iam == layer) + { + MPI_Send(uval, lenv, MPI_DOUBLE, 0, lb, grid3d->zscp.comm); + } + + if (!grid3d->zscp.Iam) + { + MPI_Status status; + MPI_Recv(uval, lenv, MPI_DOUBLE, layer, lb, grid3d->zscp.comm, &status); + } + } + } /* for lb ... */ + return 0; +} + +/* Gather the LU factors on layer-0 */ +int_t dp3dCollect(int_t layer, int_t n, LUstruct_t * LUstruct, gridinfo3d_t* grid3d) +{ + int_t nsupers = getNsupers(n, LUstruct); + dcollect3dLpanels(layer, nsupers, LUstruct, grid3d); + dcollect3dUpanels(layer, nsupers, LUstruct, grid3d); + return 0; +} + + +/* Zero out LU non zero entries */ +int_t dzeroSetLU(int_t nnodes, int_t* nodeList, LUstruct_t *LUstruct, + gridinfo3d_t* grid3d) +{ + LocalLU_t *Llu = LUstruct->Llu; + int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + double** Unzval_br_ptr = Llu->Unzval_br_ptr; + + int_t* xsup = LUstruct->Glu_persist->xsup; + int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + gridinfo_t* grid = &(grid3d->grid2d); + + int_t iam = grid->iam; + + int_t myrow = MYROW (iam, grid); + int_t mycol = MYCOL (iam, grid); + + /*first setting the L blocks to zero*/ + for (int_t node = 0; node < nnodes; ++node) /* for each block column ... */ + { + + int_t jb = nodeList[node]; + int_t pc = PCOL( jb, grid ); + if (mycol == pc) + { + int_t ljb = LBj( jb, grid ); /* Local block number */ + int_t *lsub; + double* lnzval; + lsub = Lrowind_bc_ptr[ljb]; + lnzval = Lnzval_bc_ptr[ljb]; + + if (lsub != NULL) + { + int_t len = lsub[1]; /* LDA of the nzval[] */ + int_t len2 = SuperSize(jb) * len; /*size of nzval of L panel */ + memset( lnzval, 0, len2 * sizeof(double) ); + } + } + } + + for (int_t node = 0; node < nnodes; ++node) /* for each block column ... 
*/ + { + + int_t ib = nodeList[node]; + int_t pr = PROW( ib, grid ); + if (myrow == pr) + { + int_t lib = LBi( ib, grid ); /* Local block number */ + int_t *usub; + double* unzval; + usub = Ufstnz_br_ptr[lib]; + unzval = Unzval_br_ptr[lib]; + + if (usub != NULL) + { + int lenv = usub[1]; + memset( unzval, 0, lenv * sizeof(double) ); + } + } + } + + return 0; +} + + +int_t dreduceAncestors3d(int_t sender, int_t receiver, + int_t nnodes, int_t* nodeList, + double* Lval_buf, double* Uval_buf, + LUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT) +{ + int_t myGrid = grid3d->zscp.Iam; + + /*first setting the L blocks to zero*/ + for (int_t node = 0; node < nnodes; ++node) /* for each block column ... */ + { + int_t jb = nodeList[node]; + + if (myGrid == sender) + { + dzSendLPanel(jb, receiver, LUstruct, grid3d, SCT); + dzSendUPanel(jb, receiver, LUstruct, grid3d, SCT); + } + else { + dzRecvLPanel(jb, sender, 1.0, 1.0, Lval_buf, LUstruct, grid3d, SCT); + dzRecvUPanel(jb, sender, 1.0, 1.0, + Uval_buf, LUstruct, grid3d, SCT); + } + + } + return 0; + +} + + +int_t dgatherFactoredLU(int_t sender, int_t receiver, + int_t nnodes, int_t *nodeList, + LUValSubBuf_t*LUvsb, + LUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT) +{ + double * Lval_buf = LUvsb->Lval_buf; + double * Uval_buf = LUvsb->Uval_buf; + int_t myGrid = grid3d->zscp.Iam; + for (int_t node = 0; node < nnodes; ++node) /* for each block column ... */ + { + int_t jb = nodeList[node]; + if (myGrid == sender) + { + dzSendLPanel(jb, receiver, LUstruct, grid3d, SCT); + dzSendUPanel(jb, receiver, LUstruct, grid3d, SCT); + + } + else + { + dzRecvLPanel(jb, sender, 0.0, 1.0, Lval_buf, LUstruct, + grid3d, SCT); + dzRecvUPanel(jb, sender, 0.0, 1.0, Uval_buf, LUstruct, + grid3d, SCT); + } + } + return 0; + +} + + +int_t dinit3DLUstruct( int_t* myTreeIdxs, int_t* myZeroTrIdxs, + int_t* nodeCount, int_t** nodeList, LUstruct_t* LUstruct, + gridinfo3d_t* grid3d) +{ + int_t maxLvl = log2i(grid3d->zscp.Np) + 1; + + for (int_t lvl = 0; lvl < maxLvl; lvl++) + { + if (myZeroTrIdxs[lvl]) + { + /* code */ + int_t treeId = myTreeIdxs[lvl]; + dzeroSetLU(nodeCount[treeId], nodeList[treeId], LUstruct, grid3d); + } + } + + return 0; +} + + +int_t dreduceAllAncestors3d(int_t ilvl, int_t* myNodeCount, int_t** treePerm, + LUValSubBuf_t* LUvsb, LUstruct_t* LUstruct, + gridinfo3d_t* grid3d, SCT_t* SCT ) +{ + double * Lval_buf = LUvsb->Lval_buf; + double * Uval_buf = LUvsb->Uval_buf; + int_t maxLvl = log2i(grid3d->zscp.Np) + 1; + int_t myGrid = grid3d->zscp.Iam; + + int_t sender, receiver; + if ((myGrid % (1 << (ilvl + 1))) == 0) + { + sender = myGrid + (1 << ilvl); + receiver = myGrid; + } + else + { + sender = myGrid; + receiver = myGrid - (1 << ilvl); + } + + /*Reduce all the ancestors*/ + for (int_t alvl = ilvl + 1; alvl < maxLvl; ++alvl) + { + /* code */ + // int_t atree = myTreeIdxs[alvl]; + int_t nsAncestor = myNodeCount[alvl]; + int_t* cAncestorList = treePerm[alvl]; + double treduce = SuperLU_timer_(); + dreduceAncestors3d(sender, receiver, nsAncestor, cAncestorList, + Lval_buf, Uval_buf, LUstruct, grid3d, SCT); + SCT->ancsReduce += SuperLU_timer_() - treduce; + + } + return 0; +} + +int_t dgatherAllFactoredLU( trf3Dpartition_t* trf3Dpartition, + LUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT ) +{ + int_t maxLvl = log2i(grid3d->zscp.Np) + 1; + int_t myGrid = grid3d->zscp.Iam; + int_t* myZeroTrIdxs = trf3Dpartition->myZeroTrIdxs; + sForest_t** sForests = trf3Dpartition->sForests; + LUValSubBuf_t* LUvsb = trf3Dpartition->LUvsb; + int_t* gNodeCount = 
getNodeCountsFr(maxLvl, sForests); + int_t** gNodeLists = getNodeListFr(maxLvl, sForests); + + for (int_t ilvl = 0; ilvl < maxLvl - 1; ++ilvl) + { + /* code */ + int_t sender, receiver; + if (!myZeroTrIdxs[ilvl]) + { + if ((myGrid % (1 << (ilvl + 1))) == 0) + { + sender = myGrid + (1 << ilvl); + receiver = myGrid; + } + else + { + sender = myGrid; + receiver = myGrid - (1 << ilvl); + } + + for (int_t alvl = 0; alvl <= ilvl; alvl++) + { + int_t diffLvl = ilvl - alvl; + int_t numTrees = 1 << diffLvl; + int_t blvl = maxLvl - alvl - 1; + int_t st = (1 << blvl) - 1 + (sender >> alvl); + + for (int_t tr = st; tr < st + numTrees; ++tr) + { + /* code */ + dgatherFactoredLU(sender, receiver, + gNodeCount[tr], gNodeLists[tr], + LUvsb, + LUstruct, grid3d, SCT ); + } + } + + } + } /* for ilvl ... */ + + return 0; +} + diff --git a/SRC/pdgssvx3d.c b/SRC/pdgssvx3d.c new file mode 100644 index 00000000..64e13dd9 --- /dev/null +++ b/SRC/pdgssvx3d.c @@ -0,0 +1,1537 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Solves a system of linear equations A*X=B + * + *
+ * -- Distributed SuperLU routine (version 7.0.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * November 1, 2007
+ * October 22, 2012
+ * March 31, 2019
+ * 
+ */
+
+#include <math.h>
+#include "superlu_ddefs.h"
+#if 0
+#include "p3dcomm.h"
+#include "pdgstrf3d.h"
+#include "triangularSolve/pdgstrs.h"
+#include "triangularSolve/pdgstrs3d.h"
+#include "xtrf3Dpartition.h"
+#endif
+/*! \brief
+ *
+ *
+ * Purpose
+ * =======
+ *
+ * PDGSSVX3D solves a system of linear equations A*X=B,
+ * by using Gaussian elimination with "static pivoting" to
+ * compute the LU factorization of A.
+ *
+ * Static pivoting is a technique that combines the numerical stability
+ * of partial pivoting with the scalability of Cholesky (no pivoting),
+ * to run accurately and efficiently on large numbers of processors.
+ * See our paper at http://www.nersc.gov/~xiaoye/SuperLU/ for a detailed
+ * description of the parallel algorithms.
+ *
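+ * (Concretely, when options->ReplaceTinyPivot is set, a diagonal pivot
+ *  whose magnitude falls below sqrt(eps)*||A|| is replaced by
+ *  sqrt(eps)*||A||, so that elimination can proceed without dynamic
+ *  row interchanges; the accuracy lost this way can usually be
+ *  recovered afterwards by iterative refinement.)
+ *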
+ * The input matrices A and B are distributed by block rows.
+ * Here is a graphical illustration (0-based indexing):
+ *
+ *                        A                B
+ *               0 ---------------       ------
+ *                   |           |        |  |
+ *                   |           |   P0   |  |
+ *                   |           |        |  |
+ *                 ---------------       ------
+ *        - fst_row->|           |        |  |
+ *        |          |           |        |  |
+ *       m_loc       |           |   P1   |  |
+ *        |          |           |        |  |
+ *        -          |           |        |  |
+ *                 ---------------       ------
+ *                   |    .      |        |. |
+ *                   |    .      |        |. |
+ *                   |    .      |        |. |
+ *                 ---------------       ------
+ *
+ * where fst_row is the global row number of this processor's first row,
+ *       and m_loc is the number of rows local to this processor.
+ * These are defined in the 'SuperMatrix' structure, see supermatrix.h.
+ *
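+ * For example, one common way to set up this block-row distribution is
+ * the following sketch (any partition of the rows into contiguous
+ * blocks is allowed; this is only one choice):
+ *
+ *     int iam = grid->iam;                   // my rank in the process grid
+ *     int_t m_loc = m / (grid->nprow * grid->npcol); // rows per process
+ *     int_t fst_row = iam * m_loc;                   // my first global row
+ *     if (iam == grid->nprow * grid->npcol - 1)  // last process takes
+ *         m_loc = m - m_loc * (grid->nprow * grid->npcol - 1); // the rest
+ *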
+ *
+ * Here are the options for using this code:
+ *
+ *   1. Independent of all the other options specified below, the
+ *      user must supply
+ *
+ *      -  B, the matrix of right-hand sides, distributed by block rows,
+ *            and its dimensions ldb (local) and nrhs (global)
+ *      -  grid, a structure describing the 2D processor mesh
+ *      -  options->IterRefine, which determines whether or not to
+ *            improve the accuracy of the computed solution using
+ *            iterative refinement
+ *
+ *      On output, B is overwritten with the solution X.
+ *
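+ *      For instance, the process grid is created once before any call
+ *      to this driver and released afterwards (a sketch; nprow and
+ *      npcol are chosen by the user, and the 3D factorization routines
+ *      take the analogous gridinfo3d_t structure):
+ *
+ *          gridinfo_t grid;
+ *          superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid);
+ *          // ... distribute A and B by block rows, call the driver ...
+ *          superlu_gridexit(&grid);
+ *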
+ *   2. Depending on options->Fact, the user has four options
+ *      for solving A*X=B. The standard option is for factoring
+ *      A "from scratch". (The other options, described below,
+ *      are used when A is sufficiently similar to a previously
+ *      solved problem to save time by reusing part or all of
+ *      the previous factorization.)
+ *
+ *      -  options->Fact = DOFACT: A is factored "from scratch"
+ *
+ *      In this case the user must also supply
+ *
+ *        o  A, the input matrix
+ *
+ *        as well as the following options, which determine the scaled
+ *        and permuted matrix that is actually factorized:
+ *
+ *        o  options->Equil,   to specify how to scale the rows and columns
+ *                             of A to "equilibrate" it (to try to reduce its
+ *                             condition number and so improve the
+ *                             accuracy of the computed solution)
+ *
+ *        o  options->RowPerm, to specify how to permute the rows of A
+ *                             (typically to control numerical stability)
+ *
+ *        o  options->ColPerm, to specify how to permute the columns of A
+ *                             (typically to control fill-in and enhance
+ *                             parallelism during factorization)
+ *
+ *        o  options->ReplaceTinyPivot, to specify how to deal with tiny
+ *                             pivots encountered during factorization
+ *                             (to control numerical stability)
+ *
+ *      The outputs returned include
+ *
+ *        o  ScalePermstruct,  modified to describe how the input matrix A
+ *                             was equilibrated and permuted:
+ *          .  ScalePermstruct->DiagScale, indicates whether the rows and/or
+ *                                         columns of A were scaled
+ *          .  ScalePermstruct->R, array of row scale factors
+ *          .  ScalePermstruct->C, array of column scale factors
+ *          .  ScalePermstruct->perm_r, row permutation vector
+ *          .  ScalePermstruct->perm_c, column permutation vector
+ *
+ *          (part of ScalePermstruct may also need to be supplied on input,
+ *           depending on options->RowPerm and options->ColPerm as described
+ *           later).
+ *
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix diag(R)*A*diag(C)*Pc^T, where
+ *              Pc is the column permutation matrix determined by
+ *                  ScalePermstruct->perm_c
+ *              diag(R) and diag(C) are diagonal scaling matrices determined
+ *                  by ScalePermstruct->DiagScale, ScalePermstruct->R and
+ *                  ScalePermstruct->C
+ *
+ *        o  LUstruct, which contains the L and U factorization of A1 where
+ *
+ *                A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U
+ *
+ *               (Note that A1 = Pc*Pr*Aout, where Aout is the matrix stored
+ *                in A on output.)
+ *
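+ *      (In exact arithmetic this factorization solves the original
+ *       system as follows: substituting X = diag(C)*Pc^T*Y into A*X = B
+ *       and multiplying through by Pc*Pr*diag(R) gives
+ *       A1*Y = Pc*Pr*diag(R)*B, so one solves L*U*Y = Pc*Pr*diag(R)*B
+ *       and then recovers X = diag(C)*Pc^T*Y.)
+ *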
+ *   3. The second value of options->Fact assumes that a matrix with the same
+ *      sparsity pattern as A has already been factored:
+ *
+ *      -  options->Fact = SamePattern: A is factored, assuming that it has
+ *            the same nonzero pattern as a previously factored matrix. In
+ *            this case the algorithm saves time by reusing the previously
+ *            computed column permutation vector stored in
+ *            ScalePermstruct->perm_c and the "elimination tree" of A
+ *            stored in LUstruct->etree
+ *
+ *      In this case the user must still specify the following options
+ *      as before:
+ *
+ *        o  options->Equil
+ *        o  options->RowPerm
+ *        o  options->ReplaceTinyPivot
+ *
+ *      but not options->ColPerm, whose value is ignored. This is because the
+ *      previous column permutation from ScalePermstruct->perm_c is used as
+ *      input. The user must also supply
+ *
+ *        o  A, the input matrix
+ *        o  ScalePermstruct->perm_c, the column permutation
+ *        o  LUstruct->etree, the elimination tree
+ *
+ *      The outputs returned include
+ *
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix as described above
+ *        o  ScalePermstruct, modified to describe how the input matrix A was
+ *                            equilibrated and row permuted
+ *        o  LUstruct, modified to contain the new L and U factors
+ *
+ *   4. The third value of options->Fact assumes that a matrix B with the same
+ *      sparsity pattern as A has already been factored, and that the
+ *      row permutation of B can be reused for A. This is useful when A and B
+ *      have similar numerical values, so that the same row permutation
+ *      will make both factorizations numerically stable. This lets us reuse
+ *      all of the previously computed structure of L and U.
+ *
+ *      -  options->Fact = SamePattern_SameRowPerm: A is factored,
+ *            assuming not only the same nonzero pattern as the previously
+ *            factored matrix B, but reusing B's row permutation.
+ *
+ *      In this case the user must still specify the following options
+ *      as before:
+ *
+ *        o  options->Equil
+ *        o  options->ReplaceTinyPivot
+ *
+ *      but not options->RowPerm or options->ColPerm, whose values are
+ *      ignored. This is because the permutations from ScalePermstruct->perm_r
+ *      and ScalePermstruct->perm_c are used as input.
+ *
+ *      The user must also supply
+ *
+ *        o  A, the input matrix
+ *        o  ScalePermstruct->DiagScale, how the previous matrix was row
+ *                                       and/or column scaled
+ *        o  ScalePermstruct->R, the row scalings of the previous matrix,
+ *                               if any
+ *        o  ScalePermstruct->C, the column scalings of the previous matrix,
+ *                               if any
+ *        o  ScalePermstruct->perm_r, the row permutation of the previous
+ *                                    matrix
+ *        o  ScalePermstruct->perm_c, the column permutation of the previous
+ *                                    matrix
+ *        o  all of LUstruct, the previously computed information about
+ *                            L and U (the actual numerical values of L and U
+ *                            stored in LUstruct->Llu are ignored)
+ *
+ *      The outputs returned include
+ *
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix as described above
+ *        o  ScalePermstruct,  modified to describe how the input matrix A was
+ *                             equilibrated (thus ScalePermstruct->DiagScale,
+ *                             R and C may be modified)
+ *        o  LUstruct, modified to contain the new L and U factors
+ *
+ *   5. The fourth and last value of options->Fact assumes that A is
+ *      identical to a matrix that has already been factored on a previous
+ *      call, and reuses its entire LU factorization:
+ *
+ *      -  options->Fact = FACTORED: A is identical to a previously
+ *            factorized matrix, so the entire previous factorization
+ *            can be reused.
+ *
+ *      In this case all the other options mentioned above are ignored
+ *      (options->Equil, options->RowPerm, options->ColPerm,
+ *       options->ReplaceTinyPivot).
+ *
+ *      The user must also supply
+ *
+ *        o  A, the unfactored matrix, only in the case that iterative
+ *              refinement is to be done (specifically A must be the output
+ *              A from the previous call, so that it has been scaled and permuted)
+ *        o  all of ScalePermstruct
+ *        o  all of LUstruct, including the actual numerical values of
+ *           L and U
+ *
+ *      all of which are unmodified on output.
+ *
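+ *   For reference, a first call with options->Fact = DOFACT might be set
+ *   up as in the following sketch, where dA stands for a matrix the user
+ *   has already created in distributed compressed row format and the
+ *   helper routines are the usual SuperLU_DIST setup calls:
+ *
+ *      set_default_options_dist(&options);
+ *      ScalePermstructInit(m, n, &ScalePermstruct);
+ *      LUstructInit(n, &LUstruct);
+ *      PStatInit(&stat);
+ *      pdgssvx3d(&options, &dA, &ScalePermstruct, b, ldb, nrhs, &grid3d,
+ *                &LUstruct, &SOLVEstruct, berr, &stat, &info);
+ *      PStatFree(&stat);
+ *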
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_dist_options_t* (global)
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *         The following fields should be defined for this structure:
+ *
+ *         o Fact (fact_t)
+ *           Specifies whether or not the factored form of the matrix
+ *           A is supplied on entry, and if not, how the matrix A should
+ *           be factorized based on the previous history.
+ *
+ *           = DOFACT: the matrix A will be factorized from scratch.
+ *                 Inputs:  A
+ *                          options->Equil, RowPerm, ColPerm, ReplaceTinyPivot
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          all of ScalePermstruct
+ *                          all of LUstruct
+ *
+ *           = SamePattern: the matrix A will be factorized assuming
+ *             that a factorization of a matrix with the same sparsity
+ *             pattern was performed prior to this one. Therefore, this
+ *             factorization will reuse the column permutation vector
+ *             ScalePermstruct->perm_c and the elimination tree
+ *             LUstruct->etree
+ *                 Inputs:  A
+ *                          options->Equil, RowPerm, ReplaceTinyPivot
+ *                          ScalePermstruct->perm_c
+ *                          LUstruct->etree
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          rest of ScalePermstruct (DiagScale, R, C, perm_r)
+ *                          rest of LUstruct (Glu_persist, Llu)
+ *
+ *           = SamePattern_SameRowPerm: the matrix A will be factorized
+ *             assuming that a factorization of a matrix with the same
+ *             sparsity pattern and similar numerical values was performed
+ *             prior to this one. Therefore, this factorization will reuse
+ *             the row and column scaling factors R and C, the row and
+ *             column permutation vectors perm_r and perm_c, and the
+ *             distributed data structures set up by the previous symbolic
+ *             factorization.
+ *                 Inputs:  A
+ *                          options->Equil, ReplaceTinyPivot
+ *                          all of ScalePermstruct
+ *                          all of LUstruct
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          modified LUstruct->Llu
+ *
+ *           = FACTORED: the matrix A is already factored.
+ *                 Inputs:  all of ScalePermstruct
+ *                          all of LUstruct
+ *
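+ *           To make the reuse pattern above concrete, a sequence of
+ *           matrices sharing one sparsity pattern might be handled as in
+ *           this sketch (argument lists abbreviated):
+ *
+ *              options.Fact = DOFACT;       /* first matrix: full setup */
+ *              pdgssvx3d(&options, &A1, ...);
+ *              options.Fact = SamePattern;  /* reuse perm_c and etree   */
+ *              pdgssvx3d(&options, &A2, ...);
+ *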
+ *         o Equil (yes_no_t)
+ *           Specifies whether to equilibrate the system.
+ *           = NO:  no equilibration.
+ *           = YES: scaling factors are computed to equilibrate the system:
+ *                      diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B.
+ *                  Whether or not the system will be equilibrated depends
+ *                  on the scaling of the matrix A, but if equilibration is
+ *                  used, A is overwritten by diag(R)*A*diag(C) and B by
+ *                  diag(R)*B.
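+ *                  For example, when DiagScale comes out as BOTH, each
+ *                  local entry is updated as a(i,j) <- R[i]*a(i,j)*C[j],
+ *                  which is exactly the scaling loop applied in this
+ *                  routine.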
+ *
+ *         o RowPerm (rowperm_t)
+ *           Specifies how to permute rows of the matrix A.
+ *           = NOROWPERM:      use the natural ordering.
+ *           = LargeDiag_MC64: use the Duff/Koster (MC64) algorithm to
+ *                             permute rows of the original matrix to make
+ *                             the diagonal large relative to the
+ *                             off-diagonal.
+ *           = LargeDiag_AWPM: use the approximate-weight perfect matching
+ *                             algorithm (requires CombBLAS).
+ *           = MY_PERMR:       use the ordering given in
+ *                             ScalePermstruct->perm_r input by the user.
+ *
+ *         o ColPerm (colperm_t)
+ *           Specifies what type of column permutation to use to reduce fill.
+ *           = NATURAL:         natural ordering.
+ *           = MMD_AT_PLUS_A:   minimum degree ordering on structure of A'+A.
+ *           = MMD_ATA:         minimum degree ordering on structure of A'*A.
+ *           = METIS_AT_PLUS_A: METIS ordering on structure of A'+A.
+ *           = PARMETIS:        parallel METIS ordering on structure of A'+A.
+ *           = MY_PERMC:        the ordering given in ScalePermstruct->perm_c.
+ *
+ *         o ReplaceTinyPivot (yes_no_t)
+ *           = NO:  do not modify pivots
+ *           = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during
+ *                  LU factorization.
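+ *                  Concretely, the factorization precomputes a threshold
+ *                  from the machine epsilon and norm(A) (see the
+ *                  thresh = s_eps * anorm computation in pdgstrf3d), and
+ *                  a pivot whose magnitude falls below that threshold is
+ *                  replaced.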
+ *
+ *         o IterRefine (IterRefine_t)
+ *           Specifies how to perform iterative refinement.
+ *           = NO:     no iterative refinement.
+ *           = SLU_DOUBLE: accumulate residual in double precision.
+ *           = SLU_EXTRA:  accumulate residual in extra precision.
+ *
+ *         NOTE: all options must be identical on all processes when
+ *               calling this routine.
+ *
+ * A (input/output) SuperMatrix* (local)
+ *         On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol).
+ *           The number of linear equations is A->nrow. The type of A must be:
+ *           Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE.
+ *           That is, A is stored in distributed compressed row format.
+ *           See supermatrix.h for the definition of 'SuperMatrix'.
+ *           This routine handles only square A; the LU factorization
+ *           routine PDGSTRF, however, can factorize rectangular matrices.
+ *         On exit, A may be overwritten by diag(R)*A*diag(C)*Pc^T,
+ *           depending on ScalePermstruct->DiagScale and options->ColPerm:
+ *             if ScalePermstruct->DiagScale != NOEQUIL, A is overwritten by
+ *                diag(R)*A*diag(C).
+ *             if options->ColPerm != NATURAL, A is further overwritten by
+ *                diag(R)*A*diag(C)*Pc^T.
+ *           If all the above conditions are true, the LU decomposition is
+ *           performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T.
+ *
+ * ScalePermstruct (input/output) ScalePermstruct_t* (global)
+ *         The data structure to store the scaling and permutation vectors
+ *         describing the transformations performed to the matrix A.
+ *         It contains the following fields:
+ *
+ *         o DiagScale (DiagScale_t)
+ *           Specifies the form of equilibration that was done.
+ *           = NOEQUIL: no equilibration.
+ *           = ROW:     row equilibration, i.e., A was premultiplied by
+ *                      diag(R).
+ *           = COL:     column equilibration, i.e., A was postmultiplied
+ *                      by diag(C).
+ *           = BOTH:    both row and column equilibration, i.e., A was
+ *                      replaced by diag(R)*A*diag(C).
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm,
+ *           DiagScale is an input argument; otherwise it is an output
+ *           argument.
+ *
+ *         o perm_r (int*)
+ *           Row permutation vector, which defines the permutation matrix Pr;
+ *           perm_r[i] = j means row i of A is in position j in Pr*A.
+ *           If options->RowPerm = MY_PERMR, or
+ *           options->Fact = SamePattern_SameRowPerm, perm_r is an
+ *           input argument; otherwise it is an output argument.
+ *
+ *         o perm_c (int*)
+ *           Column permutation vector, which defines the
+ *           permutation matrix Pc; perm_c[i] = j means column i of A is
+ *           in position j in A*Pc.
+ *           If options->ColPerm = MY_PERMC or options->Fact = SamePattern
+ *           or options->Fact = SamePattern_SameRowPerm, perm_c is an
+ *           input argument; otherwise, it is an output argument.
+ *           On exit, perm_c may be overwritten by the product of the input
+ *           perm_c and a permutation that postorders the elimination tree
+ *           of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree
+ *           is already in postorder.
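+ *           As a small example, with n = 3 and perm_c = {2, 0, 1},
+ *           column 0 of A lands in position 2 of A*Pc, column 1 in
+ *           position 0, and column 2 in position 1.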
+ *
+ *         o R (double*) dimension (A->nrow)
+ *           The row scale factors for A.
+ *           If DiagScale = ROW or BOTH, A is multiplied on the left by
+ *                          diag(R).
+ *           If DiagScale = NOEQUIL or COL, R is not defined.
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm, R is
+ *           an input argument; otherwise, R is an output argument.
+ *
+ *         o C (double*) dimension (A->ncol)
+ *           The column scale factors for A.
+ *           If DiagScale = COL or BOTH, A is multiplied on the right by
+ *                          diag(C).
+ *           If DiagScale = NOEQUIL or ROW, C is not defined.
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm, C is
+ *           an input argument; otherwise, C is an output argument.
+ *
+ * B       (input/output) double* (local)
+ *         On entry, the right-hand side matrix of dimension (m_loc, nrhs),
+ *           where m_loc is the number of rows stored locally on my
+ *           process and is defined in the data structure of matrix A.
+ *         On exit, the solution matrix if info = 0.
+ *
+ * ldb     (input) int (local)
+ *         The leading dimension of matrix B.
+ *
+ * nrhs    (input) int (global)
+ *         The number of right-hand sides.
+ *         If nrhs = 0, only the LU decomposition is performed; the forward
+ *         and back substitutions are skipped.
+ *
+ * grid    (input) gridinfo_t* (global)
+ *         The 2D process mesh. It contains the MPI communicator, the number
+ *         of process rows (NPROW), the number of process columns (NPCOL),
+ *         and my process rank. It is an input argument to all the
+ *         parallel routines.
+ *         Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *         See superlu_ddefs.h for the definition of 'gridinfo_t'.
+ *
+ * LUstruct (input/output) LUstruct_t*
+ *         The data structures to store the distributed L and U factors.
+ *         It contains the following fields:
+ *
+ *         o etree (int*) dimension (A->ncol) (global)
+ *           Elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc'.
+ *           It is computed in sp_colorder() during the first factorization,
+ *           and is reused in the subsequent factorizations of the matrices
+ *           with the same nonzero pattern.
+ *           On exit of sp_colorder(), the columns of A are permuted so that
+ *           the etree is in a certain postorder. This postorder is reflected
+ *           in ScalePermstruct->perm_c.
+ *           NOTE:
+ *           Etree is a vector of parent pointers for a forest whose vertices
+ *           are the integers 0 to A->ncol-1; etree[root]==A->ncol.
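+ *           For instance, with A->ncol = 3, etree = {1, 2, 3} encodes a
+ *           chain: vertex 0 has parent 1, vertex 1 has parent 2, and
+ *           vertex 2 is a root (its entry equals A->ncol).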
+ *
+ *         o Glu_persist (Glu_persist_t*) (global)
+ *           Global data structure (xsup, supno) replicated on all processes,
+ *           describing the supernode partition in the factored matrices
+ *           L and U:
+ *             xsup[s] is the leading column of the s-th supernode,
+ *             supno[i] is the supernode number to which column i belongs.
+ *
+ *         o Llu (LocalLU_t*) (local)
+ *           The distributed data structures to store L and U factors.
+ *           See superlu_ddefs.h for the definition of 'LocalLU_t'.
+ *
+ * SOLVEstruct (input/output) SOLVEstruct_t*
+ *         The data structure to hold the communication pattern used
+ *         in the phases of triangular solution and iterative refinement.
+ *         This pattern should be initialized only once for repeated solutions.
+ *         If options->SolveInitialized = YES, it is an input argument.
+ *         If options->SolveInitialized = NO and nrhs != 0, it is an output
+ *         argument. See superlu_ddefs.h for the definition of 'SOLVEstruct_t'.
+ *
+ * berr    (output) double*, dimension (nrhs) (global)
+ *         The componentwise relative backward error of each solution
+ *         vector X(j) (i.e., the smallest relative change in
+ *         any element of A or B that makes X(j) an exact solution).
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics on runtime and floating-point operation count.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info    (output) int*
+ *         = 0: successful exit
+ *         > 0: if info = i, and i is
+ *             <= A->ncol: U(i,i) is exactly zero. The factorization has
+ *                been completed, but the factor U is exactly singular,
+ *                so the solution could not be computed.
+ *             > A->ncol: number of bytes allocated when memory allocation
+ *                failure occurred, plus A->ncol.
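+ *         For reference, a caller might separate the two failure modes
+ *         along these lines (a sketch; n stands for A->ncol):
+ *
+ *            if (*info > 0 && *info <= n) {
+ *                /* U(*info,*info) is exactly zero; U is singular. */
+ *            } else if (*info > n) {
+ *                /* memory allocation failed; (*info - n) bytes
+ *                   had been allocated at the point of failure. */
+ *            }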
+ *
+ * See superlu_ddefs.h for the definitions of various data types.
+ * 
+ */
+
+void
+pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
+           ScalePermstruct_t * ScalePermstruct,
+           double B[], int ldb, int nrhs, gridinfo3d_t * grid3d,
+           LUstruct_t * LUstruct, SOLVEstruct_t * SOLVEstruct, double *berr,
+           SuperLUStat_t * stat, int *info)
+{
+    NRformat_loc *Astore;
+    SuperMatrix GA;    /* Global A in NC format */
+    NCformat *GAstore;
+    double *a_GA;
+    SuperMatrix GAC;   /* Global A in NCP format (add n end pointers) */
+    NCPformat *GACstore;
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    Glu_freeable_t *Glu_freeable;
+    /* The nonzero structures of L and U factors, which are
+       replicated on all processes.
+       (lsub, xlsub) contains the compressed subscript of
+       supernodes in L.
+       (usub, xusub) contains the compressed subscript of
+       nonzero segments in U.
+       If options->Fact != SamePattern_SameRowPerm, they are
+       computed by SYMBFACT routine, and then used by PDDISTRIBUTE
+       routine. They will be freed after PDDISTRIBUTE routine.
+       If options->Fact == SamePattern_SameRowPerm, these
+       structures are not used. */
+    fact_t Fact;
+    double *a;
+    int_t *colptr, *rowind;
+    int_t *perm_r;           /* row permutations from partial pivoting */
+    int_t *perm_c;           /* column permutation vector */
+    int_t *etree;            /* elimination tree */
+    int_t *rowptr, *colind;  /* Local A in NR */
+    int_t colequ, Equil, factored, job, notran, rowequ, need_value;
+    int_t i, iinfo, j, irow, m, n, nnz, permc_spec;
+    int_t nnz_loc, m_loc, fst_row, icol;
+    int iam;
+    int ldx;                 /* LDA for matrix X (local). */
+    char equed[1], norm[1];
+    double *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd;
+    double *X, *b_col, *b_work, *x_col;
+    double t;
+    float GA_mem_use;        /* memory usage by global A */
+    float dist_mem_use;      /* memory usage during distribution */
+    superlu_dist_mem_usage_t num_mem_usage, symb_mem_usage;
+#if ( PRNTlevel>= 2 )
+    double dmin, dsum, dprod;
+#endif
+
+    /* Structures needed for parallel symbolic factorization */
+    int_t *sizes, *fstVtxSep, parSymbFact;
+    int noDomains, nprocs_num;
+    MPI_Comm symb_comm;      /* communicator for symbolic factorization */
+    int col, key;            /* parameters for creating a new communicator */
+    Pslu_freeable_t Pslu_freeable;
+    float flinfo;
+
+    /* Initialization. */
+
+    /* definition of 'factored' as seen by each process layer */
+    Fact = options->Fact;
+    factored = (Fact == FACTORED);
+
+    // get the 2d grid
+    gridinfo_t *grid = &(grid3d->grid2d);
+    iam = grid->iam;
+
+    /* Perform preprocessing steps on process layer zero, including:
+       ordering, symbolic factorization, distribution of L & U */
+    if (grid3d->zscp.Iam == 0)
+    {
+        m = A->nrow;
+        n = A->ncol;
+        Astore = (NRformat_loc *) A->Store;
+        nnz_loc = Astore->nnz_loc;
+        m_loc = Astore->m_loc;
+        fst_row = Astore->fst_row;
+        a = (double *) Astore->nzval;
+        rowptr = Astore->rowptr;
+        colind = Astore->colind;
+        sizes = NULL;
+        fstVtxSep = NULL;
+        symb_comm = MPI_COMM_NULL;
+
+        /* Test the input parameters. */
+        *info = 0;
+        Fact = options->Fact;
+        if (Fact < 0 || Fact > FACTORED)
+            *info = -1;
+        else if (options->RowPerm < 0 || options->RowPerm > MY_PERMR)
+            *info = -1;
+        else if (options->ColPerm < 0 || options->ColPerm > MY_PERMC)
+            *info = -1;
+        else if (options->IterRefine < 0 || options->IterRefine > SLU_EXTRA)
+            *info = -1;
+        else if (options->IterRefine == SLU_EXTRA) {
+            *info = -1;
+            fprintf (stderr,
+                     "Extra precise iterative refinement is not yet supported.");
+        }
+        else if (A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc
+                 || A->Dtype != SLU_D || A->Mtype != SLU_GE)
+            *info = -2;
+        else if (ldb < m_loc)
+            *info = -5;
+        else if (nrhs < 0)
+            *info = -6;
+        if (*info) {
+            i = -(*info);
+            pxerr_dist ("pdgssvx3d", grid, -*info);
+            return;
+        }
+
+        factored = (Fact == FACTORED);
+        Equil = (!factored && options->Equil == YES);
+        notran = (options->Trans == NOTRANS);
+        parSymbFact = options->ParSymbFact;
+
+        iam = grid->iam;
+        job = 5;
+        if (factored || (Fact == SamePattern_SameRowPerm && Equil))
+        {
+            rowequ = (ScalePermstruct->DiagScale == ROW) ||
+                     (ScalePermstruct->DiagScale == BOTH);
+            colequ = (ScalePermstruct->DiagScale == COL) ||
+                     (ScalePermstruct->DiagScale == BOTH);
+        }
+        else
+            rowequ = colequ = FALSE;
+
+        /* The following arrays are replicated on all processes. */
+        perm_r = ScalePermstruct->perm_r;
+        perm_c = ScalePermstruct->perm_c;
+        etree = LUstruct->etree;
+        R = ScalePermstruct->R;
+        C = ScalePermstruct->C;
+        /********/
+
+#if ( DEBUGlevel>=1 )
+        CHECK_MALLOC (iam, "Enter pdgssvx3d()");
+#endif
+
+        /* Not factored & ask for equilibration */
+        if (Equil && Fact != SamePattern_SameRowPerm) {
+            /* Allocate storage if not done so before. */
+            switch (ScalePermstruct->DiagScale)
+            {
+            case NOEQUIL:
+                if (!(R = (double *) doubleMalloc_dist (m)))
+                    ABORT ("Malloc fails for R[].");
+                if (!(C = (double *) doubleMalloc_dist (n)))
+                    ABORT ("Malloc fails for C[].");
+                ScalePermstruct->R = R;
+                ScalePermstruct->C = C;
+                break;
+            case ROW:
+                if (!(C = (double *) doubleMalloc_dist (n)))
+                    ABORT ("Malloc fails for C[].");
+                ScalePermstruct->C = C;
+                break;
+            case COL:
+                if (!(R = (double *) doubleMalloc_dist (m)))
+                    ABORT ("Malloc fails for R[].");
+                ScalePermstruct->R = R;
+                break;
+            }
+        }
+
+        /* ------------------------------------------------------------
+           Diagonal scaling to equilibrate the matrix.
+           ------------------------------------------------------------ */
+        if (Equil) {
+#if ( DEBUGlevel>=1 )
+            CHECK_MALLOC (iam, "Enter equil");
+#endif
+            t = SuperLU_timer_ ();
+
+            if (Fact == SamePattern_SameRowPerm) {
+                /* Reuse R and C. */
+                switch (ScalePermstruct->DiagScale) {
+                case NOEQUIL:
+                    break;
+                case ROW:
+                    irow = fst_row;
+                    for (j = 0; j < m_loc; ++j) {
+                        for (i = rowptr[j]; i < rowptr[j + 1]; ++i) {
+                            a[i] *= R[irow];    /* Scale rows. */
+                        }
+                        ++irow;
+                    }
+                    break;
+                case COL:
+                    for (j = 0; j < m_loc; ++j)
+                        for (i = rowptr[j]; i < rowptr[j + 1]; ++i) {
+                            icol = colind[i];
+                            a[i] *= C[icol];    /* Scale columns. */
+                        }
+                    break;
+                case BOTH:
+                    irow = fst_row;
+                    for (j = 0; j < m_loc; ++j)
+                    {
+                        for (i = rowptr[j]; i < rowptr[j + 1]; ++i)
+                        {
+                            icol = colind[i];
+                            a[i] *= R[irow] * C[icol];  /* Scale rows and cols. */
+                        }
+                        ++irow;
+                    }
+                    break;
+                }
+            } else { /* Compute R & C from scratch */
+                /* Compute the row and column scalings.
*/ + pdgsequ (A, R, C, &rowcnd, &colcnd, &amax, &iinfo, grid); + + if ( iinfo > 0 ) { + if ( iinfo <= m ) { +#if ( PRNTlevel>=1 ) + fprintf(stderr, "The " IFMT "-th row of A is exactly zero\n", iinfo); +#endif + } else { +#if ( PRNTlevel>=1 ) + fprintf(stderr, "The " IFMT "-th column of A is exactly zero\n", iinfo-n); +#endif + } + } else if ( iinfo < 0 ) return; + + /* Now iinfo == 0 */ + + /* Equilibrate matrix A if it is badly-scaled. + A <-- diag(R)*A*diag(C) */ + pdlaqgs (A, R, C, rowcnd, colcnd, amax, equed); + + if ( strncmp(equed, "R", 1)==0 ) { + ScalePermstruct->DiagScale = ROW; + rowequ = ROW; + } else if ( strncmp(equed, "C", 1)==0 ) { + ScalePermstruct->DiagScale = COL; + colequ = COL; + } else if ( strncmp(equed, "B", 1)==0 ) { + ScalePermstruct->DiagScale = BOTH; + rowequ = ROW; + colequ = COL; + } else ScalePermstruct->DiagScale = NOEQUIL; + +#if ( PRNTlevel>=1 ) + if (iam==0) { + printf (".. equilibrated? *equed = %c\n", *equed); + fflush(stdout); + } +#endif + } /* end if-else Fact ... */ + + stat->utime[EQUIL] = SuperLU_timer_ () - t; +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit equil"); +#endif + } /* end if Equil ... LAPACK style, not involving MC64 */ + + if (!factored) { /* Skip this if already factored. */ + /* + * Gather A from the distributed compressed row format to + * global A in compressed column format. + * Numerical values are gathered only when a row permutation + * for large diagonal is sought after. + */ + if (Fact != SamePattern_SameRowPerm && + (parSymbFact == NO || options->RowPerm != NO)) { + + need_value = (options->RowPerm == LargeDiag_MC64); + + pdCompRow_loc_to_CompCol_global (need_value, A, grid, &GA); + + GAstore = (NCformat *) GA.Store; + colptr = GAstore->colptr; + rowind = GAstore->rowind; + nnz = GAstore->nnz; + GA_mem_use = (nnz + n + 1) * sizeof (int_t); + + if (need_value) { + a_GA = (double *) GAstore->nzval; + GA_mem_use += nnz * sizeof (double); + } + else + assert (GAstore->nzval == NULL); + } + + /* ------------------------------------------------------------ + Find the row permutation for A. + ------------------------------------------------------------ */ + if (options->RowPerm != NO) { + t = SuperLU_timer_ (); + if (Fact != SamePattern_SameRowPerm) { + if (options->RowPerm == MY_PERMR) { + /* Use user's perm_r. */ + /* Permute the global matrix GA for symbfact() */ + for (i = 0; i < colptr[n]; ++i) { + irow = rowind[i]; + rowind[i] = perm_r[irow]; + } + } else if ( options->RowPerm == LargeDiag_MC64 ) { + /* Get a new perm_r[] */ + if (job == 5) { + /* Allocate storage for scaling factors. 
*/ + if (!(R1 = doubleMalloc_dist (m))) + ABORT ("SUPERLU_MALLOC fails for R1[]"); + if (!(C1 = doubleMalloc_dist (n))) + ABORT ("SUPERLU_MALLOC fails for C1[]"); + } + + if ( iam==0 ) { + /* Process 0 finds a row permutation */ + iinfo = dldperm_dist (job, m, nnz, colptr, rowind, a_GA, + perm_r, R1, C1); + MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); + if ( iinfo == 0 ) { + MPI_Bcast (perm_r, m, mpi_int_t, 0, grid->comm); + if (job == 5 && Equil) { + MPI_Bcast (R1, m, MPI_DOUBLE, 0, grid->comm); + MPI_Bcast (C1, n, MPI_DOUBLE, 0, grid->comm); + } + } + } else { + MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); + if ( iinfo == 0 ) { + MPI_Bcast (perm_r, m, mpi_int_t, 0, grid->comm); + if (job == 5 && Equil) { + MPI_Bcast (R1, m, MPI_DOUBLE, 0, grid->comm); + MPI_Bcast (C1, n, MPI_DOUBLE, 0, grid->comm); + } + } + } + + if ( iinfo && job == 5) { /* Error return */ + SUPERLU_FREE(R1); + SUPERLU_FREE(C1); + } +#if ( PRNTlevel>=2 ) + dmin = damch_dist ("Overflow"); + dsum = 0.0; + dprod = 1.0; +#endif + if ( iinfo == 0 ) { + if (job == 5) { + if ( Equil ) { + for (i = 0; i < n; ++i) { + R1[i] = exp (R1[i]); + C1[i] = exp (C1[i]); + } + + /* Scale the distributed matrix further. + A <-- diag(R1)*A*diag(C1) */ + irow = fst_row; + for (j = 0; j < m_loc; ++j) { + for (i = rowptr[j]; i < rowptr[j + 1]; ++i) { + icol = colind[i]; + a[i] *= R1[irow] * C1[icol]; +#if ( PRNTlevel>=2 ) + if (perm_r[irow] == icol) { + /* New diagonal */ + if (job == 2 || job == 3) + dmin = SUPERLU_MIN(dmin, fabs(a[i])); + else if (job == 4) + dsum += fabs(a[i]); + else if (job == 5) + dprod *= fabs(a[i]); + } +#endif + } + ++irow; + } + + /* Multiply together the scaling factors -- + R/C from simple scheme, R1/C1 from MC64. */ + if (rowequ) + for (i = 0; i < m; ++i) R[i] *= R1[i]; + else + for (i = 0; i < m; ++i) R[i] = R1[i]; + if (colequ) + for (i = 0; i < n; ++i) C[i] *= C1[i]; + else + for (i = 0; i < n; ++i) C[i] = C1[i]; + + ScalePermstruct->DiagScale = BOTH; + rowequ = colequ = 1; + + } /* end if Equil */ + + /* Now permute global A to prepare for symbfact() */ + for (j = 0; j < n; ++j) { + for (i = colptr[j]; i < colptr[j + 1]; ++i) { + irow = rowind[i]; + rowind[i] = perm_r[irow]; + } + } + SUPERLU_FREE (R1); + SUPERLU_FREE (C1); + } else { /* job = 2,3,4 */ + for (j = 0; j < n; ++j) { + for (i = colptr[j]; i < colptr[j + 1]; ++i) + { + irow = rowind[i]; + rowind[i] = perm_r[irow]; + } /* end for i ... */ + } /* end for j ... */ + } /* end else job ... */ + } else { /* if iinfo != 0 */ + for (i = 0; i < m; ++i) perm_r[i] = i; + } +#if ( PRNTlevel>=2 ) + if (job == 2 || job == 3) { + if (!iam) + printf ("\tsmallest diagonal %e\n", dmin); + } else if (job == 4) { + if (!iam) + printf ("\tsum of diagonal %e\n", dsum); + } else if (job == 5) { + if (!iam) + printf ("\t product of diagonal %e\n", dprod); + } +#endif + } else { /* use largeDiag_AWPM */ +#ifdef HAVE_COMBBLAS + c2cpp_GetAWPM(A, grid, ScalePermstruct); +#else + if ( iam == 0 ) { + printf("CombBLAS is not available\n"); fflush(stdout); + } +#endif + } /* end if-else options->RowPerm ... */ + + t = SuperLU_timer_ () - t; + stat->utime[ROWPERM] = t; +#if ( PRNTlevel>=1 ) + if ( !iam ) { + printf(".. LDPERM job " IFMT "\t time: %.2f\n", job, t); + fflush(stdout); + } +#endif + } /* end if Fact not SamePattern_SameRowPerm ... 
*/ + } else { /* options->RowPerm == NOROWPERM / NATURAL */ + for (i = 0; i < m; ++i) perm_r[i] = i; + } + +#if ( DEBUGlevel>=2 ) + if (!iam) + PrintInt10 ("perm_r", m, perm_r); +#endif + } /* end if (!factored) */ + + if (!factored || options->IterRefine) { + /* Compute norm(A), which will be used to adjust small diagonal. */ + if (notran) + *(unsigned char *) norm = '1'; + else + *(unsigned char *) norm = 'I'; + anorm = pdlangs (norm, A, grid); +#if ( PRNTlevel>=1 ) + if (!iam) { + printf (".. anorm %e\n", anorm); fflush(stdout); + } +#endif + } + + + /* ------------------------------------------------------------ + Perform the LU factorization. + ------------------------------------------------------------ */ + if (!factored) { + t = SuperLU_timer_ (); + /* + * Get column permutation vector perm_c[], according to permc_spec: + * permc_spec = NATURAL: natural ordering + * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A + * permc_spec = MMD_ATA: minimum degree on structure of A'*A + * permc_spec = METIS_AT_PLUS_A: METIS on structure of A'+A + * permc_spec = PARMETIS: parallel METIS on structure of A'+A + * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] + */ + permc_spec = options->ColPerm; + + if (parSymbFact == YES || permc_spec == PARMETIS) { + nprocs_num = grid->nprow * grid->npcol; + noDomains = (int) (pow (2, ((int) LOG2 (nprocs_num)))); + + /* create a new communicator for the first noDomains + processes in grid->comm */ + key = iam; + if (iam < noDomains) + col = 0; + else + col = MPI_UNDEFINED; + MPI_Comm_split (grid->comm, col, key, &symb_comm); + + if (permc_spec == NATURAL || permc_spec == MY_PERMC) { + if (permc_spec == NATURAL) + { + for (j = 0; j < n; ++j) + perm_c[j] = j; + } + if (!(sizes = intMalloc_dist (2 * noDomains))) + ABORT ("SUPERLU_MALLOC fails for sizes."); + if (!(fstVtxSep = intMalloc_dist (2 * noDomains))) + ABORT ("SUPERLU_MALLOC fails for fstVtxSep."); + for (i = 0; i < 2 * noDomains - 2; ++i) { + sizes[i] = 0; + fstVtxSep[i] = 0; + } + sizes[2 * noDomains - 2] = m; + fstVtxSep[2 * noDomains - 2] = 0; + } else if (permc_spec != PARMETIS) { + /* same as before */ + printf + ("{%4d,%4d}: pdgssvx: invalid ColPerm option when ParSymbfact is used\n", + (int) MYROW (grid->iam, grid), (int) MYCOL (grid->iam, grid)); + } + } /* end ... use parmetis */ + + if (permc_spec != MY_PERMC && Fact == DOFACT) { + if (permc_spec == PARMETIS) { + /* Get column permutation vector in perm_c. * + * This routine takes as input the distributed input matrix A * + * and does not modify it. It also allocates memory for * + * sizes[] and fstVtxSep[] arrays, that contain information * + * on the separator tree computed by ParMETIS. */ + flinfo = get_perm_c_parmetis (A, perm_r, perm_c, nprocs_num, + noDomains, &sizes, &fstVtxSep, + grid, &symb_comm); + if (flinfo > 0) + ABORT ("ERROR in get perm_c parmetis."); + } else { + get_perm_c_dist (iam, permc_spec, &GA, perm_c); + } + } + + stat->utime[COLPERM] = SuperLU_timer_ () - t; + + /* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc' + (a.k.a. column etree), depending on the choice of ColPerm. + Adjust perm_c[] to be consistent with a postorder of etree. + Permute columns of A to form A*Pc'. */ + if (Fact != SamePattern_SameRowPerm) { + if (parSymbFact == NO) { + + int_t *GACcolbeg, *GACcolend, *GACrowind; + + sp_colorder (options, &GA, perm_c, etree, &GAC); + + /* Form Pc*A*Pc' to preserve the diagonal of the matrix GAC. 
*/ + GACstore = (NCPformat *) GAC.Store; + GACcolbeg = GACstore->colbeg; + GACcolend = GACstore->colend; + GACrowind = GACstore->rowind; + for (j = 0; j < n; ++j) { + for (i = GACcolbeg[j]; i < GACcolend[j]; ++i) { + irow = GACrowind[i]; + GACrowind[i] = perm_c[irow]; + } + } + + + /* Perform a symbolic factorization on Pc*Pr*A*Pc' and set up + the nonzero data structures for L & U. */ +#if ( PRNTlevel>=1 ) + if (!iam) + printf + (".. symbfact(): relax %4d, maxsuper %4d, fill %4d\n", + sp_ienv_dist (2), sp_ienv_dist (3), sp_ienv_dist (6)); +#endif + t = SuperLU_timer_ (); + if (!(Glu_freeable = (Glu_freeable_t *) + SUPERLU_MALLOC (sizeof (Glu_freeable_t)))) + ABORT ("Malloc fails for Glu_freeable."); + + /* Every process does this. */ + iinfo = symbfact (options, iam, &GAC, perm_c, etree, + Glu_persist, Glu_freeable); + + stat->utime[SYMBFAC] = SuperLU_timer_ () - t; + if (iinfo < 0) { + /* Successful return */ + QuerySpace_dist (n, -iinfo, Glu_freeable, &symb_mem_usage); + +#if ( PRNTlevel>=1 ) + if (!iam) { + printf ("\tNo of supers %ld\n", + Glu_persist->supno[n - 1] + 1); + printf ("\tSize of G(L) %ld\n", + Glu_freeable->xlsub[n]); + printf ("\tSize of G(U) %ld\n", + Glu_freeable->xusub[n]); + printf ("\tint %d, short %d, float %d, double %d\n", + sizeof (int_t), sizeof (short), + sizeof (float), sizeof (double)); + printf + ("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", + symb_mem_usage.for_lu * 1e-6, + symb_mem_usage.total * 1e-6, + symb_mem_usage.expansions); + } +#endif + } else { + if (!iam) { + fprintf (stderr, "symbfact() error returns %d\n", + (int) iinfo); + exit (-1); + } + } + + } /* end serial symbolic factorization */ + else { /* parallel symbolic factorization */ + t = SuperLU_timer_ (); + flinfo = + symbfact_dist (nprocs_num, noDomains, A, perm_c, perm_r, + sizes, fstVtxSep, &Pslu_freeable, + &(grid->comm), &symb_comm, + &symb_mem_usage); + stat->utime[SYMBFAC] = SuperLU_timer_ () - t; + if (flinfo > 0) + ABORT + ("Insufficient memory for parallel symbolic factorization."); + } + + /* Destroy GA */ + if (parSymbFact == NO || options->RowPerm != NO) + Destroy_CompCol_Matrix_dist (&GA); + if (parSymbFact == NO) + Destroy_CompCol_Permuted_dist (&GAC); + + } /* end if Fact not SamePattern_SameRowPerm */ + + if (sizes) + SUPERLU_FREE (sizes); + if (fstVtxSep) + SUPERLU_FREE (fstVtxSep); + if (symb_comm != MPI_COMM_NULL) + MPI_Comm_free (&symb_comm); + + if (parSymbFact == NO || Fact == SamePattern_SameRowPerm) { + /* Apply column permutation to the original distributed A */ + for (j = 0; j < nnz_loc; ++j) + colind[j] = perm_c[colind[j]]; + + /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. + NOTE: the row permutation Pc*Pr is applied internally in the + distribution routine. */ + t = SuperLU_timer_ (); + dist_mem_use = pddistribute (Fact, n, A, ScalePermstruct, + Glu_freeable, LUstruct, grid); + stat->utime[DIST] = SuperLU_timer_ () - t; + + /* Deallocate storage used in symbolic factorization. */ + if (Fact != SamePattern_SameRowPerm) + { + iinfo = symbfact_SubFree (Glu_freeable); + SUPERLU_FREE (Glu_freeable); + } + } else { + /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. + NOTE: the row permutation Pc*Pr is applied internally in the + distribution routine. 
*/ + /* Apply column permutation to the original distributed A */ + for (j = 0; j < nnz_loc; ++j) + colind[j] = perm_c[colind[j]]; + + t = SuperLU_timer_ (); + dist_mem_use = ddist_psymbtonum (Fact, n, A, ScalePermstruct, + &Pslu_freeable, LUstruct, grid); + if (dist_mem_use > 0) + ABORT ("Not enough memory available for dist_psymbtonum\n"); + + stat->utime[DIST] = SuperLU_timer_ () - t; + } + + /*if (!iam) printf ("\tDISTRIBUTE time %8.2f\n", stat->utime[DIST]); */ + } /* end if not Factored */ + } /* end if process layer 0 */ + + trf3Dpartition_t* trf3Dpartition; + + /* Perform numerical factorization in parallel on all process layers. */ + if (!factored ) { + + /* send the data across all the layers */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid3d->zscp.comm); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid3d->zscp.comm); + MPI_Bcast( &anorm, 1, MPI_DOUBLE, 0, grid3d->zscp.comm); + + /* send the LU structure to all the grids */ + dp3dScatter(n, LUstruct, grid3d); + int_t nsupers = getNsupers(n, LUstruct); + trf3Dpartition = dinitTrf3Dpartition(nsupers, options, LUstruct, grid3d); + SCT_t *SCT = (SCT_t *) SUPERLU_MALLOC(sizeof(SCT_t)); + + SCT_init(SCT); + +#if ( PRNTlevel>=1 ) + if (iam==0) { + printf("after 3D initialization.\n"); fflush(stdout); + } +#endif + + t = SuperLU_timer_ (); + + /*factorize in grid 1*/ + // if(grid3d->zscp.Iam) + + pdgstrf3d (options, m, n, anorm, trf3Dpartition, SCT, LUstruct, + grid3d, stat, info); + stat->utime[FACT] = SuperLU_timer_ () - t; + + double tgather = SuperLU_timer_(); + + dgatherAllFactoredLU(trf3Dpartition, LUstruct, grid3d, SCT); + + SCT->gatherLUtimer += SuperLU_timer_() - tgather; + /*print stats for bottom grid*/ + +#if ( PRNTlevel>=1 ) + if (!grid3d->zscp.Iam) + { + SCT_print(grid, SCT); + SCT_print3D(grid3d, SCT); + } + SCT_printComm3D(grid3d, SCT); + /*print memory usage*/ + printMemUse( trf3Dpartition, LUstruct, grid3d ); + /*print forest weight and costs*/ + printForestWeightCost(trf3Dpartition->sForests, SCT, grid3d); + /*reduces stat from all the layers*/ +#endif + + } /* end if not Factored */ + + if ( grid3d->zscp.Iam == 0 ) { + if (!factored) { + if (options->PrintStat) { + int_t TinyPivots; + float for_lu, total, max, avg, temp; + + dQuerySpace_dist (n, LUstruct, grid, stat, &num_mem_usage); + + if (parSymbFact == TRUE) { + /* The memory used in the redistribution routine + includes the memory used for storing the symbolic + structure and the memory allocated for numerical factorization */ + temp = SUPERLU_MAX (symb_mem_usage.total, -dist_mem_use); + if (options->RowPerm != NO) + temp = SUPERLU_MAX (temp, GA_mem_use); + } + else { + temp = SUPERLU_MAX (symb_mem_usage.total + GA_mem_use, /* symbfact step */ + symb_mem_usage.for_lu + dist_mem_use + num_mem_usage.for_lu /* distribution step */ + ); + } + + temp = SUPERLU_MAX (temp, num_mem_usage.total); + + MPI_Reduce (&temp, &max, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm); + MPI_Reduce (&temp, &avg, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); + MPI_Allreduce (&stat->TinyPivots, &TinyPivots, 1, mpi_int_t, + MPI_SUM, grid->comm); + stat->TinyPivots = TinyPivots; + + MPI_Reduce (&num_mem_usage.for_lu, &for_lu, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); + MPI_Reduce (&num_mem_usage.total, &total, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); + + if (!iam) { + printf("\tNUMfact space (MB) sum(procs): L\\U\t%.2f\tall\t%.2f\n", + for_lu * 1e-6, total * 1e-6); + printf ("\tTotal highmark (MB): " + "All\t%.2f\tAvg\t%.2f\tMax\t%.2f\n", avg * 1e-6, + avg / grid->nprow / grid->npcol * 1e-6, max * 1e-6); + 
printf("**************************************************\n"); + fflush(stdout); + } + } + + } /* end if (!factored) */ + + /* ------------------------------------------------------------ + Compute the solution matrix X. + ------------------------------------------------------------ */ + if (nrhs) + { + if (!(b_work = doubleMalloc_dist (n))) + ABORT ("Malloc fails for b_work[]"); + + /* ------------------------------------------------------ + Scale the right-hand side if equilibration was performed. + ------------------------------------------------------ */ + if (notran) + { + if (rowequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + b_col[i] *= R[irow]; + ++irow; + } + b_col += ldb; + } + } + } + else if (colequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + b_col[i] *= C[irow]; + ++irow; + } + b_col += ldb; + } + } + + /* Save a copy of the right-hand side. */ + ldx = ldb; + if (!(X = doubleMalloc_dist (((size_t) ldx) * nrhs))) + ABORT ("Malloc fails for X[]"); + x_col = X; + b_col = B; + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < m_loc; ++i) x_col[i] = b_col[i]; + x_col += ldx; + b_col += ldb; + } + + /* ------------------------------------------------------------ + Solve the linear system. + ------------------------------------------------------------ */ + if (options->SolveInitialized == NO) /* First time */ + /* Inside this routine, SolveInitialized is set to YES. + For repeated call to pdgssvx(), no need to re-initialilze + the Solve data & communication structures, unless a new + factorization with Fact == DOFACT or SamePattern is asked for. */ + { + dSolveInit (options, A, perm_r, perm_c, nrhs, LUstruct, + grid, SOLVEstruct); + } + stat->utime[SOLVE] = 0.0; +#if 0 // Sherry: the following interface is needed by 3D trisolve. + pdgstrs_vecpar (n, LUstruct, ScalePermstruct, grid, X, m_loc, + fst_row, ldb, nrhs, SOLVEstruct, stat, info); +#else + pdgstrs(n, LUstruct, ScalePermstruct, grid, X, m_loc, + fst_row, ldb, nrhs, SOLVEstruct, stat, info); +#endif + + /* ------------------------------------------------------------ + Use iterative refinement to improve the computed solution and + compute error bounds and backward error estimates for it. + ------------------------------------------------------------ */ + if (options->IterRefine) + { + /* Improve the solution by iterative refinement. */ + int_t *it, *colind_gsmv = SOLVEstruct->A_colind_gsmv; + SOLVEstruct_t *SOLVEstruct1; /* Used by refinement. */ + + t = SuperLU_timer_ (); + if (options->RefineInitialized == NO || Fact == DOFACT) { + /* All these cases need to re-initialize gsmv structure */ + if (options->RefineInitialized) + pdgsmv_finalize (SOLVEstruct->gsmv_comm); + pdgsmv_init (A, SOLVEstruct->row_to_proc, grid, + SOLVEstruct->gsmv_comm); + + /* Save a copy of the transformed local col indices + in colind_gsmv[]. 
*/ + if (colind_gsmv) SUPERLU_FREE (colind_gsmv); + if (!(it = intMalloc_dist (nnz_loc))) + ABORT ("Malloc fails for colind_gsmv[]"); + colind_gsmv = SOLVEstruct->A_colind_gsmv = it; + for (i = 0; i < nnz_loc; ++i) colind_gsmv[i] = colind[i]; + options->RefineInitialized = YES; + } + else if (Fact == SamePattern || Fact == SamePattern_SameRowPerm) { + double at; + int_t k, jcol, p; + /* Swap to beginning the part of A corresponding to the + local part of X, as was done in pdgsmv_init() */ + for (i = 0; i < m_loc; ++i) { /* Loop through each row */ + k = rowptr[i]; + for (j = rowptr[i]; j < rowptr[i + 1]; ++j) + { + jcol = colind[j]; + p = SOLVEstruct->row_to_proc[jcol]; + if (p == iam) + { /* Local */ + at = a[k]; + a[k] = a[j]; + a[j] = at; + ++k; + } + } + } + + /* Re-use the local col indices of A obtained from the + previous call to pdgsmv_init() */ + for (i = 0; i < nnz_loc; ++i) + colind[i] = colind_gsmv[i]; + } + + if (nrhs == 1) + { /* Use the existing solve structure */ + SOLVEstruct1 = SOLVEstruct; + } + else + { /* For nrhs > 1, since refinement is performed for RHS + one at a time, the communication structure for pdgstrs + is different than the solve with nrhs RHS. + So we use SOLVEstruct1 for the refinement step. + */ + if (!(SOLVEstruct1 = (SOLVEstruct_t *) + SUPERLU_MALLOC (sizeof (SOLVEstruct_t)))) + ABORT ("Malloc fails for SOLVEstruct1"); + /* Copy the same stuff */ + SOLVEstruct1->row_to_proc = SOLVEstruct->row_to_proc; + SOLVEstruct1->inv_perm_c = SOLVEstruct->inv_perm_c; + SOLVEstruct1->num_diag_procs = SOLVEstruct->num_diag_procs; + SOLVEstruct1->diag_procs = SOLVEstruct->diag_procs; + SOLVEstruct1->diag_len = SOLVEstruct->diag_len; + SOLVEstruct1->gsmv_comm = SOLVEstruct->gsmv_comm; + SOLVEstruct1->A_colind_gsmv = SOLVEstruct->A_colind_gsmv; + + /* Initialize the *gstrs_comm for 1 RHS. */ + if (!(SOLVEstruct1->gstrs_comm = (pxgstrs_comm_t *) + SUPERLU_MALLOC (sizeof (pxgstrs_comm_t)))) + ABORT ("Malloc fails for gstrs_comm[]"); + pxgstrs_init (n, m_loc, 1, fst_row, perm_r, perm_c, grid, + Glu_persist, SOLVEstruct1); + } + + pdgsrfs (n, A, anorm, LUstruct, ScalePermstruct, grid, + B, ldb, X, ldx, nrhs, SOLVEstruct1, berr, stat, info); + + /* Deallocate the storage associated with SOLVEstruct1 */ + if (nrhs > 1) + { + pxgstrs_finalize (SOLVEstruct1->gstrs_comm); + SUPERLU_FREE (SOLVEstruct1); + } + + stat->utime[REFINE] = SuperLU_timer_ () - t; + } + + /* Permute the solution matrix B <= Pc'*X. */ + pdPermute_Dense_Matrix (fst_row, m_loc, SOLVEstruct->row_to_proc, + SOLVEstruct->inv_perm_c, + X, ldx, B, ldb, nrhs, grid); +#if ( DEBUGlevel>=2 ) + printf ("\n (%d) .. After pdPermute_Dense_Matrix(): b =\n", iam); + for (i = 0; i < m_loc; ++i) + printf ("\t(%d)\t%4d\t%.10f\n", iam, i + fst_row, B[i]); +#endif + + /* Transform the solution matrix X to a solution of the original + system before the equilibration. */ + if (notran) + { + if (colequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + b_col[i] *= C[irow]; + ++irow; + } + b_col += ldb; + } + } + } + else if (rowequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + b_col[i] *= R[irow]; + ++irow; + } + b_col += ldb; + } + } + + SUPERLU_FREE (b_work); + SUPERLU_FREE (X); + + } /* end if nrhs != 0 */ + +#if ( PRNTlevel>=1 ) + if (!iam) + printf (".. DiagScale = %d\n", ScalePermstruct->DiagScale); +#endif + + /* Deallocate R and/or C if it was not used. 
*/ + if (Equil && Fact != SamePattern_SameRowPerm) + { + switch (ScalePermstruct->DiagScale) + { + case NOEQUIL: + SUPERLU_FREE (R); + SUPERLU_FREE (C); + break; + case ROW: + SUPERLU_FREE (C); + break; + case COL: + SUPERLU_FREE (R); + break; + } + } + +#if 0 + if (!factored && Fact != SamePattern_SameRowPerm && !parSymbFact) + Destroy_CompCol_Permuted_dist (&GAC); +#endif +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit pdgssvx3d()"); +#endif + + } /* process layer 0 */ +} diff --git a/SRC/pdgstrf.c b/SRC/pdgstrf.c index 8f5017bf..be001c2b 100644 --- a/SRC/pdgstrf.c +++ b/SRC/pdgstrf.c @@ -150,8 +150,8 @@ at the top-level directory. //#define GEMM_PADLEN 1 #define GEMM_PADLEN 8 +/* #define PDGSTRF2 pdgstrf2_dtrsm */ #define PDGSTRF2 pdgstrf2_trsm -#define PDGSTRS2 pdgstrs2_omp #ifdef ISORT extern void isort (int_t N, int_t * ARRAY1, int_t * ARRAY2); @@ -967,16 +967,13 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm, #if 0 Remain_L_buff = (double *) _mm_malloc( sizeof(double)*(Llu->bufmax[1]),64); Ublock_info = (Ublock_info_t *) _mm_malloc(mcb*sizeof(Ublock_info_t),64); - int * Ublock_info_iukp = (int *) _mm_malloc(mcb*sizeof(int),64); - int * Ublock_info_rukp = (int *) _mm_malloc(mcb*sizeof(int),64); - int * Ublock_info_jb = (int *) _mm_malloc(mcb*sizeof(int),64); #else j = gemm_m_pad * (ldt + max_row_size + gemm_k_pad); Remain_L_buff = doubleMalloc_dist(Llu->bufmax[1] + j); /* This is loose */ Ublock_info = (Ublock_info_t *) SUPERLU_MALLOC(mcb*sizeof(Ublock_info_t)); - int *Ublock_info_iukp = (int *) SUPERLU_MALLOC(mcb*sizeof(int)); - int *Ublock_info_rukp = (int *) SUPERLU_MALLOC(mcb*sizeof(int)); - int *Ublock_info_jb = (int *) SUPERLU_MALLOC(mcb*sizeof(int)); + /*int *Ublock_info_iukp = (int *) SUPERLU_MALLOC(mcb*sizeof(int)); + int *Ublock_info_rukp = (int *) SUPERLU_MALLOC(mcb*sizeof(int)); + int *Ublock_info_jb = (int *) SUPERLU_MALLOC(mcb*sizeof(int)); */ #endif long long alloc_mem = 3 * mrb * iword + mrb * sizeof(Remain_info_t) @@ -1296,8 +1293,8 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm, /* #pragma omp parallel */ /* Sherry -- parallel done inside pdgstrs2 */ #endif { - PDGSTRS2 (kk0, kk, Glu_persist, grid, Llu, - stat); + pdgstrs2_omp (kk0, kk, Glu_persist, grid, Llu, + Ublock_info, stat); } pdgstrs2_timer += SuperLU_timer_()-ttt2; @@ -1458,13 +1455,10 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm, /* Parallel triangular solve across process row *krow* -- U(k,j) = L(k,k) \ A(k,j). */ double ttt2 = SuperLU_timer_(); -#ifdef _OPENMP -/* #pragma omp parallel */ /* Sherry -- parallel done inside pdgstrs2 */ -#endif - { - PDGSTRS2 (k0, k, Glu_persist, grid, Llu, stat); - } - pdgstrs2_timer += SuperLU_timer_() - ttt2; + + pdgstrs2_omp (k0, k, Glu_persist, grid, Llu, Ublock_info, stat); + + pdgstrs2_timer += SuperLU_timer_() - ttt2; /* Sherry -- need to set factoredU[k0] = 1; ?? 
*/ @@ -1732,7 +1726,7 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm, #else /*#include "SchCompUdt--Phi-2Ddynamic-alt.c"*/ -//#include "dSchCompUdt-2Ddynamic_v6.c" +/*#include "dSchCompUdt-2Ddynamic_v6.c"*/ #include "dSchCompUdt-2Ddynamic.c" @@ -1909,9 +1903,9 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm, Llu->bufmax[1] * dword), stat ); SUPERLU_FREE(Ublock_info); - SUPERLU_FREE(Ublock_info_iukp); - SUPERLU_FREE(Ublock_info_rukp); - SUPERLU_FREE(Ublock_info_jb); + /* SUPERLU_FREE(Ublock_info_iukp); + SUPERLU_FREE(Ublock_info_rukp); + SUPERLU_FREE(Ublock_info_jb); */ #if ( PROFlevel>=1 ) diff --git a/SRC/pdgstrf2.c b/SRC/pdgstrf2.c index 547de08f..e07ed770 100644 --- a/SRC/pdgstrf2.c +++ b/SRC/pdgstrf2.c @@ -323,7 +323,7 @@ int_t LpanelUpdate(int_t off0, int_t nsupc, double* ublk_ptr, int_t ld_ujrow, unsigned long long t1 = _rdtsc(); #define GT 32 - #pragma omp parallel for +#pragma omp parallel for for (int i = 0; i < CEILING(l, GT); ++i) { int_t off = i * GT; @@ -563,7 +563,7 @@ void pdgstrf2_dtrsm if (U_diag_blk_send_req && iam == pkk) /* Send the U block */ { - ISend_UDiagBlock(k0, ublk_ptr, nsupc * nsupc, U_diag_blk_send_req, + dISend_UDiagBlock(k0, ublk_ptr, nsupc * nsupc, U_diag_blk_send_req, grid, tag_ub); U_diag_blk_send_req[krow] = (MPI_Request) TRUE; /* flag outstanding Isend */ } @@ -580,7 +580,7 @@ void pdgstrf2_dtrsm * but panel factorization of U(:,k) don't * * ================================================ */ - Recv_UDiagBlock( k0, ublk_ptr, (nsupc * nsupc), krow, grid, SCT, tag_ub); + dRecv_UDiagBlock( k0, ublk_ptr, (nsupc * nsupc), krow, grid, SCT, tag_ub); if (nsupr > 0) { @@ -668,14 +668,11 @@ int_t Trs2_GatherTrsmScatter(int_t klst, int_t iukp, int_t rukp, for (int_t jj = iukp; jj < iukp + nsupc; ++jj) { ldu = SUPERLU_MAX( klst - usub[jj], ldu) ; - } /*pack U block into a dense Block*/ int_t ncols = Trs2_GatherU(iukp, rukp, klst, nsupc, ldu, usub, uval, tempv); - - /*now call dtrsm on packed dense block*/ int_t luptr = (knsupc - ldu) * (nsupr + 1); // if(ldu>nsupr) printf("nsupr %d ldu %d\n",nsupr,ldu ); @@ -709,7 +706,10 @@ int_t Trs2_InitUblock_info(int_t klst, int_t nb, // Ublock_info[b].nsupc = nsupc; iukp += UB_DESCRIPTOR; - for (int_t j = 0; j < nsupc; ++j) + /* Sherry: can remove this loop for rukp + rukp += usub[iukp-1]; + */ + for (int_t j = 0; j < nsupc; ++j) { int_t segsize = klst - usub[iukp++]; rukp += segsize; @@ -721,15 +721,12 @@ int_t Trs2_InitUblock_info(int_t klst, int_t nb, #if 1 /***************************************************************************** - * The following pdgstrf2_omp is in version 6 and earlier. + * The following pdgstrf2_omp is improved for KNL, since Version 5.2.0. *****************************************************************************/ void pdgstrs2_omp -(int_t k0, int_t k, Glu_persist_t * Glu_persist, - gridinfo_t * grid, LocalLU_t * Llu, SuperLUStat_t * stat) +(int_t k0, int_t k, Glu_persist_t * Glu_persist, gridinfo_t * grid, + LocalLU_t * Llu, Ublock_info_t *Ublock_info, SuperLUStat_t * stat) { -#ifdef PI_DEBUG - printf("====Entering pdgstrs2==== \n"); -#endif int iam, pkk; int incx = 1; int nsupr; /* number of rows in the block L(:,k) (LDA) */ @@ -777,6 +774,13 @@ void pdgstrs2_omp iukp = BR_HEADER; rukp = 0; + /* Sherry: can use the existing Ublock_info[] array, call + Trs2_InitUblock_info(); */ +#undef USE_Ublock_info +#ifdef USE_Ublock_info /** 4/19/2019 **/ + /* Loop through all the row blocks. 
   to get the iukp and rukp */
     Trs2_InitUblock_info(klst, nb, Ublock_info, usub, Glu_persist, stat );

     /* Loop through all the row blocks. */
-    #pragma omp parallel for schedule(dynamic,2)
+#pragma omp parallel for schedule(dynamic,2)
     for (int_t b = 0; b < nb; ++b)
     {
         int_t thread_id = omp_get_thread_num();
         double *tempv = bigV + thread_id * ldt * ldt;
         dTrs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp,
                                 usub, uval, tempv, knsupc, nsupr, lusup,
                                 Glu_persist);
     } /* for b ... */
     SCT->PDGSTRS2_tl += (double) ( _rdtsc() - t1);
-} /* pdgstrs2_omp */
+} /* pdgstrs2_omp new version from Piyush */
 #endif
diff --git a/SRC/pdgstrf3d.c b/SRC/pdgstrf3d.c
new file mode 100644
index 00000000..e11ea5d5
--- /dev/null
+++ b/SRC/pdgstrf3d.c
@@ -0,0 +1,315 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/ + + +#include "superlu_ddefs.h" +#if 0 +#include "pdgstrf3d.h" +#include "trfCommWrapper.h" +#include "trfAux.h" +//#include "load-balance/supernodal_etree.h" +//#include "load-balance/supernodalForest.h" +#include "supernodal_etree.h" +#include "supernodalForest.h" +#include "p3dcomm.h" +#include "treeFactorization.h" +#include "ancFactorization.h" +#include "xtrf3Dpartition.h" +#endif + +#ifdef MAP_PROFILE +#include "mapsampler_api.h" +#endif + +#ifdef GPU_ACC +#include "lustruct_gpu.h" +#include "acc_aux.c" +#endif + + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * PDGSTRF3D performs the LU factorization in parallel using a 3D process
+ * grid, with an algorithm that avoids communication relative to the 2D
+ * algorithm.
+ *
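+ * At a high level, the factorization sweeps over the levels of the
+ * elimination-tree forest; the sketch below mirrors the main loop in
+ * this file:
+ *
+ *    for (ilvl = 0; ilvl < maxLvl; ++ilvl) {
+ *        if (!myZeroTrIdxs[ilvl]) {    /* I participate in this level */
+ *            /* factor my subforest: dsparseTreeFactor_ASYNC() */
+ *            if (ilvl < maxLvl - 1)
+ *                /* reduce partial Schur updates onto the ancestor
+ *                   level: dreduceAllAncestors3d() */ ;
+ *        }
+ *    }
+ *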
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_dist_options_t*
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *         The following field should be defined:
+ *         o ReplaceTinyPivot (yes_no_t)
+ *           Specifies whether to replace the tiny diagonals by
+ *           sqrt(epsilon)*norm(A) during LU factorization.
+ *
+ * m      (input) int
+ *        Number of rows in the matrix.
+ *
+ * n      (input) int
+ *        Number of columns in the matrix.
+ *
+ * anorm  (input) double
+ *        The norm of the original matrix A, or the scaled A if
+ *        equilibration was done.
+ *
+ * trf3Dpartition (input) trf3Dpartition_t*
+ *        Matrix partitioning information in 3D process grid.
+ *
+ * SCT    (input/output) SCT_t*
+ *        Various statistics of 3D factorization.
+ *
+ * LUstruct (input/output) LUstruct_t*
+ *         The data structures to store the distributed L and U factors.
+ *         The following fields should be defined:
+ *
+ *         o Glu_persist (input) Glu_persist_t*
+ *           Global data structure (xsup, supno) replicated on all processes,
+ *           describing the supernode partition in the factored matrices
+ *           L and U:
+ *             xsup[s] is the leading column of the s-th supernode,
+ *             supno[i] is the supernode number to which column i belongs.
+ *
+ *         o Llu (input/output) LocalLU_t*
+ *           The distributed data structures to store L and U factors.
+ *           See superlu_ddefs.h for the definition of 'LocalLU_t'.
+ *
+ * grid3d (input) gridinfo3d_t*
+ *        The 3D process mesh. It contains the MPI communicator, the number
+ *        of process rows (NPROW), the number of process columns (NPCOL),
+ *        and the replication factor in the Z dimension. It is an input argument to all
+ *        the 3D parallel routines.
+ *        Grid3d can be initialized by subroutine SUPERLU_GRIDINIT3D.
+ *        See superlu_defs.h for the definition of 'gridinfo3d_t'.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics on runtime and floating-point operation count.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info   (output) int*
+ *        = 0: successful exit
+ *        < 0: if info = -i, the i-th argument had an illegal value
+ *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
+ *             been completed, but the factor U is exactly singular,
+ *             and division by zero will occur if it is used to solve a
+ *             system of equations.
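+ *
+ * As invoked from pdgssvx3d in this patch, the call is preceded by the
+ * 3D setup and followed by gathering the factors onto layer 0, roughly:
+ *
+ *    dp3dScatter(n, LUstruct, grid3d);
+ *    nsupers = getNsupers(n, LUstruct);
+ *    trf3Dpartition = dinitTrf3Dpartition(nsupers, options, LUstruct,
+ *                                         grid3d);
+ *    SCT_init(SCT);
+ *    pdgstrf3d(options, m, n, anorm, trf3Dpartition, SCT, LUstruct,
+ *              grid3d, stat, info);
+ *    dgatherAllFactoredLU(trf3Dpartition, LUstruct, grid3d, SCT);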
+ * 
+ */ +int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm, + trf3Dpartition_t* trf3Dpartition, SCT_t *SCT, + LUstruct_t *LUstruct, gridinfo3d_t * grid3d, + SuperLUStat_t *stat, int *info) +{ + gridinfo_t* grid = &(grid3d->grid2d); + LocalLU_t *Llu = LUstruct->Llu; + + // problem specific contants + int_t ldt = sp_ienv_dist (3); /* Size of maximum supernode */ + // double s_eps = slamch_ ("Epsilon"); -Sherry + double s_eps = smach_dist("Epsilon"); + double thresh = s_eps * anorm; + + // initilize stat + stat->ops[FACT] = 0; + //if (!grid3d->zscp.Iam && !grid3d->iam) printf("Using NSUP=%d\n", (int) ldt); + + //getting Nsupers + int_t nsupers = getNsupers(n, LUstruct); + + // Grid related Variables + int_t iam = grid->iam; // in 2D grid + int num_threads = getNumThreads(grid3d->iam); + + diagFactBufs_t dFBuf; + dinitDiagFactBufs(ldt, &dFBuf); + + factStat_t factStat; + initFactStat(nsupers, &factStat); + + commRequests_t comReqs; + initCommRequests(&comReqs, grid); + + SCT->tStartup = SuperLU_timer_(); + packLUInfo_t packLUInfo; + initPackLUInfo(nsupers, &packLUInfo); + + scuBufs_t scuBufs; + dinitScuBufs(ldt, num_threads, nsupers, &scuBufs, LUstruct, grid); + + msgs_t msgs; + initMsgs(&msgs); + + factNodelists_t fNlists; + initFactNodelists( ldt, num_threads, nsupers, &fNlists); + + // tag_ub initialization + int tag_ub = set_tag_ub(); + int_t maxLvl = log2i(grid3d->zscp.Np) + 1; + + // trf3Dpartition_t* trf3Dpartition = initTrf3Dpartition(nsupers, options, LUstruct, grid3d); + gEtreeInfo_t gEtreeInfo = trf3Dpartition->gEtreeInfo; + int_t* iperm_c_supno = trf3Dpartition->iperm_c_supno; + int_t* myNodeCount = trf3Dpartition->myNodeCount; + int_t* myTreeIdxs = trf3Dpartition->myTreeIdxs; + int_t* myZeroTrIdxs = trf3Dpartition->myZeroTrIdxs; + sForest_t** sForests = trf3Dpartition->sForests; + int_t** treePerm = trf3Dpartition->treePerm ; + LUValSubBuf_t *LUvsb = trf3Dpartition->LUvsb; + /*Initializing factorization specific buffers*/ + + int_t numLA = getNumLookAhead(options); + LUValSubBuf_t**LUvsbs = dLluBufInitArr( SUPERLU_MAX( numLA, grid3d->zscp.Np ), LUstruct); + msgs_t**msgss = initMsgsArr(numLA); + int_t mxLeafNode = 0; + for (int ilvl = 0; ilvl < maxLvl; ++ilvl) + { + /* code */ + if (sForests[myTreeIdxs[ilvl]] && sForests[myTreeIdxs[ilvl]]->topoInfo.eTreeTopLims[1] > mxLeafNode ) + mxLeafNode = sForests[myTreeIdxs[ilvl]]->topoInfo.eTreeTopLims[1]; + } + diagFactBufs_t** dFBufs = dinitDiagFactBufsArr(mxLeafNode, ldt, grid); + commRequests_t** comReqss = initCommRequestsArr(SUPERLU_MAX(mxLeafNode, numLA), + ldt, grid); + + /*setting up GPU related stuff*/ + + int_t first_l_block_acc = 0; + int_t first_u_block_acc = 0; + int_t Pc = grid->npcol; + int_t Pr = grid->nprow; + int_t mrb = (nsupers + Pr - 1) / Pr; + int_t mcb = (nsupers + Pc - 1) / Pc; + HyP_t *HyP = (HyP_t *) malloc(sizeof(HyP_t)); + Init_HyP(HyP, Llu, mcb, mrb); + HyP->first_l_block_acc = first_l_block_acc; + HyP->first_u_block_acc = first_u_block_acc; + int_t bigu_size = getBigUSize(nsupers, grid, LUstruct); + // int_t buffer_size = get_max_buffer_size (); + // HyP->buffer_size = buffer_size; + HyP->bigu_size = bigu_size; + HyP->nsupers = nsupers; + +#ifdef GPU_ACC + + /*Now initialize the GPU data structure*/ + LUstruct_gpu *A_gpu, *dA_gpu; + + d2Hreduce_t d2HredObj; + d2Hreduce_t* d2Hred = &d2HredObj; + sluGPU_t sluGPUobj; + sluGPU_t *sluGPU = &sluGPUobj; + sluGPU->isNodeInMyGrid = getIsNodeInMyGrid(nsupers, maxLvl, myNodeCount, treePerm); + if (superlu_acc_offload) + { + /*Initilize the lookup 
+        LookUpTableInit(iam);
+        acc_async_cost = get_acc_async_cost();
+#ifdef GPU_DEBUG
+        if (!iam) printf("Using MIC async cost of %lf \n", acc_async_cost);
+#endif
+
+        initSluGPU3D_t(sluGPU, LUstruct, grid3d, perm_c_supno,
+                       n, buffer_size, bigu_size, ldt);
+
+        HyP->first_u_block_acc = sluGPU->A_gpu->first_u_block_gpu;
+        HyP->first_l_block_acc = sluGPU->A_gpu->first_l_block_gpu;
+        HyP->nCudaStreams = sluGPU->nCudaStreams;
+    }
+
+#endif // GPU_ACC
+
+    /*==== starting main factorization loop =====*/
+    MPI_Barrier( grid3d->comm);
+    SCT->tStartup = SuperLU_timer_() - SCT->tStartup;
+    // int_t myGrid = grid3d->zscp.Iam;
+
+#ifdef ITAC_PROF
+    VT_traceon();
+#endif
+#ifdef MAP_PROFILE
+    allinea_start_sampling();
+#endif
+    SCT->pdgstrfTimer = SuperLU_timer_();
+
+    for (int_t ilvl = 0; ilvl < maxLvl; ++ilvl)
+    {
+        /* if I participate in this level */
+        if (!myZeroTrIdxs[ilvl])
+        {
+            //int_t tree = myTreeIdxs[ilvl];
+
+            sForest_t* sforest = sForests[myTreeIdxs[ilvl]];
+
+            /* main loop over all the super nodes */
+            if (sforest)
+            {
+                double tilvl = SuperLU_timer_();
+#ifdef GPU_ACC
+                dsparseTreeFactor_ASYNC_GPU(
+                    sforest,
+                    comReqss, &scuBufs, &packLUInfo,
+                    msgss, LUvsbs, dFBufs, &factStat, &fNlists,
+                    &gEtreeInfo, options, iperm_c_supno, ldt,
+                    sluGPU, d2Hred, HyP, LUstruct, grid3d, stat,
+                    thresh, SCT, tag_ub, info);
+#else
+                dsparseTreeFactor_ASYNC(sforest, comReqss, &scuBufs, &packLUInfo,
+                                        msgss, LUvsbs, dFBufs, &factStat, &fNlists,
+                                        &gEtreeInfo, options, iperm_c_supno, ldt,
+                                        HyP, LUstruct, grid3d, stat,
+                                        thresh, SCT, tag_ub, info );
+#endif
+
+                /* now reduce the updates */
+                SCT->tFactor3D[ilvl] = SuperLU_timer_() - tilvl;
+                sForests[myTreeIdxs[ilvl]]->cost = SCT->tFactor3D[ilvl];
+            }
+
+            if (ilvl < maxLvl - 1) /* then reduce before factorization */
+            {
+#ifdef GPU_ACC
+                dreduceAllAncestors3d_GPU(
+                    ilvl, myNodeCount, treePerm, LUvsb,
+                    LUstruct, grid3d, sluGPU, d2Hred, &factStat, HyP,
+                    SCT );
+#else
+
+                dreduceAllAncestors3d(ilvl, myNodeCount, treePerm,
+                                      LUvsb, LUstruct, grid3d, SCT );
+#endif
+
+            }
+        } /* if (!myZeroTrIdxs[ilvl]) ... If I participate in this level */
+
+        SCT->tSchCompUdt3d[ilvl] = ilvl == 0 ? SCT->NetSchurUpTimer
+            : SCT->NetSchurUpTimer - SCT->tSchCompUdt3d[ilvl - 1];
+    } /* for (int_t ilvl = 0; ilvl < maxLvl; ++ilvl) */
+
+    MPI_Barrier( grid3d->comm);
+    SCT->pdgstrfTimer = SuperLU_timer_() - SCT->pdgstrfTimer;
+
+#ifdef ITAC_PROF
+    VT_traceoff();
+#endif
+
+#ifdef MAP_PROFILE
+    allinea_stop_sampling();
+#endif
+
+    reduceStat(FACT, stat, grid3d);
+
+    return 0;
+
+} /* pdgstrf3d */
diff --git a/SRC/pz3dcomm.c b/SRC/pz3dcomm.c
new file mode 100644
index 00000000..8f3f6d67
--- /dev/null
+++ b/SRC/pz3dcomm.c
@@ -0,0 +1,855 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+#include "superlu_zdefs.h"
+#include "cblas.h"
+#if 0
+#include "p3dcomm.h"
+#include "sec_structs.h"
+//#include "load-balance/supernodal_etree.h"
+//#include "load-balance/supernodalForest.h"
+#include "supernodal_etree.h"
+#include "supernodalForest.h"
+#include "trfAux.h"
+#include "treeFactorization.h"
+#include "xtrf3Dpartition.h"
+#endif
+
+#define INT_T_ALLOC(x)  ((int_t *) SUPERLU_MALLOC ( (x) * sizeof (int_t)))
+#define DOUBLECOMPLEX_ALLOC(x)  ((doublecomplex *) SUPERLU_MALLOC ( (x) * sizeof (doublecomplex)))
+
+// #define MPI_MALLOC
+#define MPI_INT_ALLOC(a, b) (MPI_Alloc_mem( (b)*sizeof(int_t), MPI_INFO_NULL, &(a) ))
+#define MPI_DATATYPE_ALLOC(a, b) (MPI_Alloc_mem((b)*sizeof(doublecomplex), MPI_INFO_NULL, &(a)))
+
+int_t zAllocLlu(int_t nsupers, LUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+{
+    int_t Pc = grid3d->npcol;
+    int_t Pr = grid3d->nprow;
+
+    int_t nbc = CEILING(nsupers, Pc);
+    int_t nbr = CEILING(nsupers, Pr);
+
+    LocalLU_t *Llu = LUstruct->Llu;
+    int_t **Lrowind_bc_ptr =
+        (int_t**) SUPERLU_MALLOC(sizeof(int_t*)*nbc);  /* size ceil(NSUPERS/Pc) */
+    doublecomplex **Lnzval_bc_ptr =
+        (doublecomplex **) SUPERLU_MALLOC(sizeof(doublecomplex*)*nbc);  /* size ceil(NSUPERS/Pc) */
+
+    for (int_t i = 0; i < nbc ; ++i)
+    {
+        /* code */
+        Lrowind_bc_ptr[i] = NULL;
+        Lnzval_bc_ptr[i] = NULL;
+    }
+
+    int_t **Ufstnz_br_ptr =
+        (int_t**) SUPERLU_MALLOC(sizeof(int_t*)*nbr);  /* size ceil(NSUPERS/Pr) */
+    doublecomplex **Unzval_br_ptr =
+        (doublecomplex **) SUPERLU_MALLOC(sizeof(doublecomplex*)*nbr);  /* size ceil(NSUPERS/Pr) */
+
+    for (int_t i = 0; i < nbr ; ++i)
+    {
+        /* code */
+        Ufstnz_br_ptr[i] = NULL;
+        Unzval_br_ptr[i] = NULL;
+    }
+
+    int_t *ToRecv = intCalloc_dist(nsupers);  /* Recv from no one (0), left (1), and up (2).*/
+    int_t *ToSendD = intCalloc_dist(nbr);     /* Whether need to send down block row. */
+    int_t **ToSendR = (int_t **) SUPERLU_MALLOC(nbc * sizeof(int_t*)); /* List of processes to send right block col.
*/ + + for (int_t i = 0; i < nbc; ++i) + { + /* code */ + ToSendR[i] = INT_T_ALLOC(Pc); + } + + /*now setup the pointers*/ + Llu->Lrowind_bc_ptr = Lrowind_bc_ptr ; + Llu->Lnzval_bc_ptr = Lnzval_bc_ptr ; + Llu->Ufstnz_br_ptr = Ufstnz_br_ptr ; + Llu->Unzval_br_ptr = Unzval_br_ptr ; + Llu->ToRecv = ToRecv ; + Llu->ToSendD = ToSendD ; + Llu->ToSendR = ToSendR ; + + return 0; +} /* zAllocLlu */ + +int_t zmpiMallocLUStruct(int_t nsupers, LUstruct_t * LUstruct, gridinfo3d_t* grid3d) +{ + LocalLU_t *Llu = LUstruct->Llu; + int_t* xsup = LUstruct->Glu_persist->xsup; + int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + doublecomplex** Unzval_br_ptr = Llu->Unzval_br_ptr; + int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + gridinfo_t* grid = &(grid3d->grid2d); + + int_t k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ + for ( int_t lb = 0; lb < k; ++lb) + { + int_t *usub, *usub_new; + usub = Ufstnz_br_ptr[lb]; + + doublecomplex * uval = Unzval_br_ptr[lb]; + doublecomplex * uval_new; + + /*if non empty set the flag*/ + if (usub != NULL) + { + int_t lenv, lens; + lenv = usub[1]; + lens = usub[2]; + + MPI_INT_ALLOC(usub_new, lens); + memcpy( usub_new, usub, lens * sizeof(int_t)); + MPI_DATATYPE_ALLOC(uval_new, lenv); + memcpy( uval_new, uval, lenv * sizeof(doublecomplex)); + Ufstnz_br_ptr[lb] = usub_new; + Unzval_br_ptr[lb] = uval_new; + SUPERLU_FREE(usub); + SUPERLU_FREE(uval); + } + } /*for ( int_t lb = 0; lb < k; ++lb)*/ + + int_t iam = grid->iam; + int_t mycol = MYCOL (iam, grid); + + /*start broadcasting blocks*/ + for (int_t jb = 0; jb < nsupers; ++jb) /* for each block column ... */ + { + int_t pc = PCOL( jb, grid ); + if (mycol == pc) + { + int_t ljb = LBj( jb, grid ); /* Local block number */ + int_t *lsub , *lsub_new; + doublecomplex *lnzval, *lnzval_new; + lsub = Lrowind_bc_ptr[ljb]; + lnzval = Lnzval_bc_ptr[ljb]; + + if (lsub) + { + int_t nrbl, len, len1, len2; + + nrbl = lsub[0]; /*number of L blocks */ + len = lsub[1]; /* LDA of the nzval[] */ + len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; + len2 = SuperSize(jb) * len; + + MPI_INT_ALLOC(lsub_new, len1); + memcpy( lsub_new, lsub, len1 * sizeof(int_t)); + MPI_DATATYPE_ALLOC(lnzval_new, len2); + memcpy( lnzval_new, lnzval, len2 * sizeof(doublecomplex)); + Lrowind_bc_ptr[ljb] = lsub_new; + SUPERLU_FREE(lsub ); + Lnzval_bc_ptr[ljb] = lnzval_new; + SUPERLU_FREE(lnzval ); + } + } /* if mycol == pc ... */ + } /* for jb ... 
*/
+
+    return 0;
+}
+
+
+int_t zzSendLPanel(int_t k, int_t receiver,
+                   LUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT)
+{
+    LocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = LUstruct->Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    int_t iam = grid->iam;
+    int_t mycol = MYCOL (iam, grid);
+
+    int_t pc = PCOL( k, grid );
+    if (mycol == pc)
+    {
+        int_t lk = LBj( k, grid ); /* Local block number */
+        int_t  *lsub;
+        doublecomplex* lnzval;
+        lsub = Lrowind_bc_ptr[lk];
+        lnzval = Lnzval_bc_ptr[lk];
+
+        if (lsub != NULL)
+        {
+            int_t len  = lsub[1];             /* LDA of the nzval[] */
+            int_t len2 = SuperSize(k) * len;  /* size of nzval of L panel */
+
+            MPI_Send(lnzval, len2, SuperLU_MPI_DOUBLE_COMPLEX, receiver, k, grid3d->zscp.comm);
+            SCT->commVolRed += len2 * sizeof(doublecomplex);
+        }
+    }
+    return 0;
+}
+
+
+int_t zzRecvLPanel(int_t k, int_t sender, double alpha, double beta,
+                   doublecomplex* Lval_buf,
+                   LUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT)
+{
+
+    // A(k) = alpha*A(k) + beta* A^{sender}(k)
+    LocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = LUstruct->Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+
+    int_t iam = grid->iam;
+    int_t mycol = MYCOL (iam, grid);
+
+    int_t pc = PCOL( k, grid );
+    if (mycol == pc)
+    {
+        int_t lk = LBj( k, grid ); /* Local block number */
+        int_t  *lsub;
+        doublecomplex* lnzval;
+        lsub = Lrowind_bc_ptr[lk];
+        lnzval = Lnzval_bc_ptr[lk];
+
+        if (lsub != NULL)
+        {
+
+            int_t len  = lsub[1];             /* LDA of the nzval[] */
+            int_t len2 = SuperSize(k) * len;  /* size of nzval of L panels */
+
+            MPI_Status status;
+            MPI_Recv(Lval_buf, len2, SuperLU_MPI_DOUBLE_COMPLEX, sender, k,
+                     grid3d->zscp.comm, &status);
+
+            /* reduce the updates: lnzval <- alpha*lnzval + beta*Lval_buf.
+               The complex BLAS variants are required here; the real
+               scalars are promoted to doublecomplex. */
+            doublecomplex zalpha = {alpha, 0.0}, zbeta = {beta, 0.0};
+            cblas_zscal (len2, &zalpha, lnzval, 1);
+            cblas_zaxpy (len2, &zbeta, Lval_buf, 1, lnzval, 1);
+        }
+    }
+
+    return 0;
+}
+
+int_t zzSendUPanel(int_t k, int_t receiver,
+                   LUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT)
+{
+    LocalLU_t *Llu = LUstruct->Llu;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    doublecomplex** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    int_t iam = grid->iam;
+
+    int_t myrow = MYROW (iam, grid);
+    int_t pr = PROW( k, grid );
+    if (myrow == pr)
+    {
+        int_t lk = LBi( k, grid ); /* Local block number */
+        int_t  *usub;
+        doublecomplex* unzval;
+        usub = Ufstnz_br_ptr[lk];
+        unzval = Unzval_br_ptr[lk];
+
+        if (usub != NULL)
+        {
+            int lenv = usub[1];
+
+            /* code */
+            MPI_Send(unzval, lenv, SuperLU_MPI_DOUBLE_COMPLEX, receiver, k, grid3d->zscp.comm);
+            SCT->commVolRed += lenv * sizeof(doublecomplex);
+        }
+    }
+
+    return 0;
+}
+
+
+int_t zzRecvUPanel(int_t k, int_t sender, double alpha, double beta,
+                   doublecomplex* Uval_buf, LUstruct_t* LUstruct,
+                   gridinfo3d_t* grid3d, SCT_t* SCT)
+{
+    LocalLU_t *Llu = LUstruct->Llu;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    doublecomplex** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    int_t iam = grid->iam;
+
+    int_t myrow = MYROW (iam, grid);
+    int_t pr = PROW( k, grid );
+    if (myrow == pr)
+    {
+        int_t lk = LBi( k, grid ); /* Local block number */
+        int_t  *usub;
+        doublecomplex* unzval;
+        usub = Ufstnz_br_ptr[lk];
+        unzval = Unzval_br_ptr[lk];
+
+        if (usub != NULL)
+        {
+            int lenv = usub[1];
+            MPI_Status status;
+            MPI_Recv(Uval_buf, lenv, SuperLU_MPI_DOUBLE_COMPLEX, sender, k,
+                     grid3d->zscp.comm, &status);
+
+            /* reduce the updates: unzval <- alpha*unzval + beta*Uval_buf,
+               using the complex BLAS variants */
+            doublecomplex zalpha = {alpha, 0.0}, zbeta = {beta, 0.0};
+            cblas_zscal (lenv, &zalpha, unzval, 1);
+            cblas_zaxpy (lenv, &zbeta, Uval_buf, 1, unzval, 1);
+        }
+    }
+    return 0;
+}
+
+
+int_t zp3dScatter(int_t n, LUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+/* Copies LU structure from layer 0 to all the layers */
+{
+    gridinfo_t* grid = &(grid3d->grid2d);
+    int_t Pc = grid->npcol;
+    int_t Pr = grid->nprow;
+
+    /* broadcast etree */
+    int_t *etree = LUstruct->etree;
+    MPI_Bcast( etree, n, mpi_int_t, 0, grid3d->zscp.comm);
+
+    int_t nsupers;
+
+    if (!grid3d->zscp.Iam)
+        nsupers = getNsupers(n, LUstruct);
+
+    /* broadcast nsupers */
+    MPI_Bcast( &nsupers, 1, mpi_int_t, 0, grid3d->zscp.comm);
+
+    /* Scatter and alloc Glu_persist */
+    if (grid3d->zscp.Iam)
+        AllocGlu(n, nsupers, LUstruct, grid3d);
+
+    /* broadcast Glu_persist */
+    int_t *xsup = LUstruct->Glu_persist->xsup;
+    MPI_Bcast( xsup, nsupers + 1, mpi_int_t, 0, grid3d->zscp.comm);
+
+    int_t *supno = LUstruct->Glu_persist->supno;
+    MPI_Bcast( supno, n, mpi_int_t, 0, grid3d->zscp.comm);
+
+    /* now broadcast localLu_t */
+    /* first allocating space for it */
+    if (grid3d->zscp.Iam)
+        zAllocLlu(nsupers, LUstruct, grid3d);
+
+    LocalLU_t *Llu = LUstruct->Llu;
+
+    /* scatter all the L blocks and indexes */
+    zscatter3dLPanels( nsupers, LUstruct, grid3d);
+
+    /* scatter all the U blocks and indexes */
+    zscatter3dUPanels( nsupers, LUstruct, grid3d);
+
+    int_t* bufmax = Llu->bufmax;
+    MPI_Bcast( bufmax, NBUFFERS, mpi_int_t, 0, grid3d->zscp.comm);
+
+    /* now sending ToSendR etc */
+    int_t** ToSendR = Llu->ToSendR;
+    int_t* ToRecv = Llu->ToRecv;
+    int_t* ToSendD = Llu->ToSendD;
+
+    int_t nbr = CEILING(nsupers, Pr);
+    int_t nbc = CEILING(nsupers, Pc);
+    MPI_Bcast( ToRecv, nsupers, mpi_int_t, 0, grid3d->zscp.comm);
+
+    MPI_Bcast( ToSendD, nbr, mpi_int_t, 0, grid3d->zscp.comm);
+    for (int_t i = 0; i < nbc; ++i)
+    {
+        /* code */
+        MPI_Bcast( ToSendR[i], Pc, mpi_int_t, 0, grid3d->zscp.comm);
+    }
+
+    //
+#ifdef MPI_MALLOC
+    // change MY LU struct into MPI malloc based
+    if (!grid3d->zscp.Iam)
+        zmpiMallocLUStruct(nsupers, LUstruct, grid3d);
+#endif
+    return 0;
+}
+
+
+int_t zscatter3dUPanels(int_t nsupers,
+                        LUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+{
+
+    LocalLU_t *Llu = LUstruct->Llu;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    doublecomplex** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+
+    int_t k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
+    for ( int_t lb = 0; lb < k; ++lb) {
+        int_t *usub;
+        usub = Ufstnz_br_ptr[lb];
+
+        doublecomplex * uval = Unzval_br_ptr[lb];
+
+        int_t flag = 0;
+        /* if non empty set the flag */
+        if (!grid3d->zscp.Iam && usub != NULL)
+            flag = 1;
+        /* bcast the flag */
+        MPI_Bcast( &flag, 1, mpi_int_t, 0, grid3d->zscp.comm);
+
+        if (flag) {
+            int_t lenv, lens;
+            lenv = 0;
+            lens = 0;
+
+            if (!grid3d->zscp.Iam)
+            {
+                lenv = usub[1];
+                lens = usub[2];
+            }
+
+            /* broadcast the size of sub array */
+            MPI_Bcast( &lens, 1, mpi_int_t, 0, grid3d->zscp.comm);
+            MPI_Bcast( &lenv, 1, mpi_int_t, 0, grid3d->zscp.comm);
+
+            /* allocate usub */
+            if (grid3d->zscp.Iam)
+#ifdef MPI_MALLOC
+                MPI_INT_ALLOC(usub, lens);
+#else
+                usub = INT_T_ALLOC(lens);
+#endif
+
+            /* bcast usub */
+            MPI_Bcast( usub, lens, mpi_int_t, 0, grid3d->zscp.comm);
+
+            /* allocate uval */
+            if (grid3d->zscp.Iam)
+#ifdef MPI_MALLOC
+                MPI_DATATYPE_ALLOC(uval, lenv);
+#else
+                uval = DOUBLECOMPLEX_ALLOC(lenv);
+#endif
+            /* broadcast uval */
+            MPI_Bcast( uval, lenv, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid3d->zscp.comm);
+
+            /* setup the pointers */
+            Unzval_br_ptr[lb] = uval;
+            Ufstnz_br_ptr[lb] = usub;
+        } /* end if flag */
+
+    } /* end for lb ... */
+    return 0;
+} /* end zscatter3dUPanels */
+
+
+int_t zscatter3dLPanels(int_t nsupers,
+                        LUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+{
+    LocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = LUstruct->Glu_persist->xsup;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    int_t iam = grid->iam;
+
+    int_t mycol = MYCOL (iam, grid);
+
+    /* start broadcasting blocks */
+    for (int_t jb = 0; jb < nsupers; ++jb)   /* for each block column ... */
+    {
+        int_t pc = PCOL( jb, grid );
+        if (mycol == pc)
+        {
+            int_t ljb = LBj( jb, grid ); /* Local block number */
+            int_t  *lsub;
+            doublecomplex* lnzval;
+            lsub = Lrowind_bc_ptr[ljb];
+            lnzval = Lnzval_bc_ptr[ljb];
+
+            int_t flag = 0;
+            /* if non empty set the flag */
+            if (!grid3d->zscp.Iam && lsub != NULL)
+                flag = 1;
+            /* bcast the flag */
+            MPI_Bcast( &flag, 1, mpi_int_t, 0, grid3d->zscp.comm);
+
+            if (flag) {
+                int_t nrbl, len, len1, len2;
+                if (!grid3d->zscp.Iam)
+                {
+                    nrbl = lsub[0]; /* number of L blocks */
+                    len  = lsub[1]; /* LDA of the nzval[] */
+                    len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
+                    len2 = SuperSize(jb) * len;
+                }
+
+                /* bcast lsub len */
+                MPI_Bcast( &len1, 1, mpi_int_t, 0, grid3d->zscp.comm);
+
+                /* allocate lsub */
+                if (grid3d->zscp.Iam)
+#ifdef MPI_MALLOC
+                    MPI_INT_ALLOC(lsub, len1);
+#else
+
+                    lsub = INT_T_ALLOC(len1);
+#endif
+                /* now broadcast lsub */
+                MPI_Bcast( lsub, len1, mpi_int_t, 0, grid3d->zscp.comm);
+
+                /* set up pointer */
+                Lrowind_bc_ptr[ljb] = lsub;
+
+                /* bcast lnzval len */
+                MPI_Bcast( &len2, 1, mpi_int_t, 0, grid3d->zscp.comm);
+
+                /* allocate space for nzval */
+                if (grid3d->zscp.Iam)
+#ifdef MPI_MALLOC
+                    MPI_DATATYPE_ALLOC(lnzval, len2);
+#else
+                    lnzval = doublecomplexCalloc_dist(len2);
+#endif
+
+                /* bcast nonzero values */
+                MPI_Bcast( lnzval, len2, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid3d->zscp.comm);
+
+                /* setup the pointers */
+                Lnzval_bc_ptr[ljb] = lnzval;
+
+            } /* end if flag */
+
+        } /* end if mycol == pc */
+    } /* end for jb ... */
+
+    return 0;
+} /* zscatter3dLPanels */
+
+int_t zcollect3dLpanels(int_t layer, int_t nsupers, LUstruct_t * LUstruct,
+                        gridinfo3d_t* grid3d)
+{
+
+    LocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = LUstruct->Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+
+    int_t iam = grid->iam;
+    int_t mycol = MYCOL (iam, grid);
+
+    /* start collecting blocks */
+    for (int_t jb = 0; jb < nsupers; ++jb)   /* for each block column ... */
+    {
+        int_t pc = PCOL( jb, grid );
+        if (mycol == pc)
+        {
+            int_t ljb = LBj( jb, grid ); /* Local block number */
+            int_t  *lsub;
+            doublecomplex* lnzval;
+            lsub = Lrowind_bc_ptr[ljb];
+            lnzval = Lnzval_bc_ptr[ljb];
+
+            if (lsub != NULL)
+            {
+                int_t len  = lsub[1];             /* LDA of the nzval[] */
+                int_t len2 = SuperSize(jb) * len; /* size of nzval of L panel */
+
+                if (grid3d->zscp.Iam == layer)
+                {
+                    MPI_Send(lnzval, len2, SuperLU_MPI_DOUBLE_COMPLEX, 0, jb, grid3d->zscp.comm);
+                }
+                if (!grid3d->zscp.Iam)
+                {
+                    MPI_Status status;
+                    MPI_Recv(lnzval, len2, SuperLU_MPI_DOUBLE_COMPLEX, layer, jb, grid3d->zscp.comm, &status);
+                }
+            }
+        }
+    } /* for jb ...
*/ + return 0; +} + +int_t zcollect3dUpanels(int_t layer, int_t nsupers, LUstruct_t * LUstruct, + gridinfo3d_t* grid3d) +{ + LocalLU_t *Llu = LUstruct->Llu; + int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + doublecomplex** Unzval_br_ptr = Llu->Unzval_br_ptr; + gridinfo_t* grid = &(grid3d->grid2d); + + int_t k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ + for ( int_t lb = 0; lb < k; ++lb) + { + int_t *usub; + usub = Ufstnz_br_ptr[lb]; + doublecomplex * uval = Unzval_br_ptr[lb]; + + if (usub) + { + /* code */ + int lenv = usub[1]; + if (grid3d->zscp.Iam == layer) + { + MPI_Send(uval, lenv, SuperLU_MPI_DOUBLE_COMPLEX, 0, lb, grid3d->zscp.comm); + } + + if (!grid3d->zscp.Iam) + { + MPI_Status status; + MPI_Recv(uval, lenv, SuperLU_MPI_DOUBLE_COMPLEX, layer, lb, grid3d->zscp.comm, &status); + } + } + } /* for lb ... */ + return 0; +} + +/* Gather the LU factors on layer-0 */ +int_t zp3dCollect(int_t layer, int_t n, LUstruct_t * LUstruct, gridinfo3d_t* grid3d) +{ + int_t nsupers = getNsupers(n, LUstruct); + zcollect3dLpanels(layer, nsupers, LUstruct, grid3d); + zcollect3dUpanels(layer, nsupers, LUstruct, grid3d); + return 0; +} + + +/* Zero out LU non zero entries */ +int_t zzeroSetLU(int_t nnodes, int_t* nodeList, LUstruct_t *LUstruct, + gridinfo3d_t* grid3d) +{ + LocalLU_t *Llu = LUstruct->Llu; + int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + doublecomplex** Unzval_br_ptr = Llu->Unzval_br_ptr; + + int_t* xsup = LUstruct->Glu_persist->xsup; + int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + gridinfo_t* grid = &(grid3d->grid2d); + + int_t iam = grid->iam; + + int_t myrow = MYROW (iam, grid); + int_t mycol = MYCOL (iam, grid); + + /*first setting the L blocks to zero*/ + for (int_t node = 0; node < nnodes; ++node) /* for each block column ... */ + { + + int_t jb = nodeList[node]; + int_t pc = PCOL( jb, grid ); + if (mycol == pc) + { + int_t ljb = LBj( jb, grid ); /* Local block number */ + int_t *lsub; + doublecomplex* lnzval; + lsub = Lrowind_bc_ptr[ljb]; + lnzval = Lnzval_bc_ptr[ljb]; + + if (lsub != NULL) + { + int_t len = lsub[1]; /* LDA of the nzval[] */ + int_t len2 = SuperSize(jb) * len; /*size of nzval of L panel */ + memset( lnzval, 0, len2 * sizeof(doublecomplex) ); + } + } + } + + for (int_t node = 0; node < nnodes; ++node) /* for each block column ... */ + { + + int_t ib = nodeList[node]; + int_t pr = PROW( ib, grid ); + if (myrow == pr) + { + int_t lib = LBi( ib, grid ); /* Local block number */ + int_t *usub; + doublecomplex* unzval; + usub = Ufstnz_br_ptr[lib]; + unzval = Unzval_br_ptr[lib]; + + if (usub != NULL) + { + int lenv = usub[1]; + memset( unzval, 0, lenv * sizeof(doublecomplex) ); + } + } + } + + return 0; +} + + +int_t zreduceAncestors3d(int_t sender, int_t receiver, + int_t nnodes, int_t* nodeList, + doublecomplex* Lval_buf, doublecomplex* Uval_buf, + LUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT) +{ + int_t myGrid = grid3d->zscp.Iam; + + /*first setting the L blocks to zero*/ + for (int_t node = 0; node < nnodes; ++node) /* for each block column ... 
*/ + { + int_t jb = nodeList[node]; + + if (myGrid == sender) + { + zzSendLPanel(jb, receiver, LUstruct, grid3d, SCT); + zzSendUPanel(jb, receiver, LUstruct, grid3d, SCT); + } + else { + zzRecvLPanel(jb, sender, 1.0, 1.0, Lval_buf, LUstruct, grid3d, SCT); + zzRecvUPanel(jb, sender, 1.0, 1.0, + Uval_buf, LUstruct, grid3d, SCT); + } + + } + return 0; + +} + + +int_t zgatherFactoredLU(int_t sender, int_t receiver, + int_t nnodes, int_t *nodeList, + LUValSubBuf_t*LUvsb, + LUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT) +{ + doublecomplex * Lval_buf = LUvsb->Lval_buf; + doublecomplex * Uval_buf = LUvsb->Uval_buf; + int_t myGrid = grid3d->zscp.Iam; + for (int_t node = 0; node < nnodes; ++node) /* for each block column ... */ + { + int_t jb = nodeList[node]; + if (myGrid == sender) + { + zzSendLPanel(jb, receiver, LUstruct, grid3d, SCT); + zzSendUPanel(jb, receiver, LUstruct, grid3d, SCT); + + } + else + { + zzRecvLPanel(jb, sender, 0.0, 1.0, Lval_buf, LUstruct, + grid3d, SCT); + zzRecvUPanel(jb, sender, 0.0, 1.0, Uval_buf, LUstruct, + grid3d, SCT); + } + } + return 0; + +} + + +int_t zinit3DLUstruct( int_t* myTreeIdxs, int_t* myZeroTrIdxs, + int_t* nodeCount, int_t** nodeList, LUstruct_t* LUstruct, + gridinfo3d_t* grid3d) +{ + int_t maxLvl = log2i(grid3d->zscp.Np) + 1; + + for (int_t lvl = 0; lvl < maxLvl; lvl++) + { + if (myZeroTrIdxs[lvl]) + { + /* code */ + int_t treeId = myTreeIdxs[lvl]; + zzeroSetLU(nodeCount[treeId], nodeList[treeId], LUstruct, grid3d); + } + } + + return 0; +} + + +int_t zreduceAllAncestors3d(int_t ilvl, int_t* myNodeCount, int_t** treePerm, + LUValSubBuf_t* LUvsb, LUstruct_t* LUstruct, + gridinfo3d_t* grid3d, SCT_t* SCT ) +{ + doublecomplex * Lval_buf = LUvsb->Lval_buf; + doublecomplex * Uval_buf = LUvsb->Uval_buf; + int_t maxLvl = log2i(grid3d->zscp.Np) + 1; + int_t myGrid = grid3d->zscp.Iam; + + int_t sender, receiver; + if ((myGrid % (1 << (ilvl + 1))) == 0) + { + sender = myGrid + (1 << ilvl); + receiver = myGrid; + } + else + { + sender = myGrid; + receiver = myGrid - (1 << ilvl); + } + + /*Reduce all the ancestors*/ + for (int_t alvl = ilvl + 1; alvl < maxLvl; ++alvl) + { + /* code */ + // int_t atree = myTreeIdxs[alvl]; + int_t nsAncestor = myNodeCount[alvl]; + int_t* cAncestorList = treePerm[alvl]; + double treduce = SuperLU_timer_(); + zreduceAncestors3d(sender, receiver, nsAncestor, cAncestorList, + Lval_buf, Uval_buf, LUstruct, grid3d, SCT); + SCT->ancsReduce += SuperLU_timer_() - treduce; + + } + return 0; +} + +int_t zgatherAllFactoredLU( trf3Dpartition_t* trf3Dpartition, + LUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT ) +{ + int_t maxLvl = log2i(grid3d->zscp.Np) + 1; + int_t myGrid = grid3d->zscp.Iam; + int_t* myZeroTrIdxs = trf3Dpartition->myZeroTrIdxs; + sForest_t** sForests = trf3Dpartition->sForests; + LUValSubBuf_t* LUvsb = trf3Dpartition->LUvsb; + int_t* gNodeCount = getNodeCountsFr(maxLvl, sForests); + int_t** gNodeLists = getNodeListFr(maxLvl, sForests); + + for (int_t ilvl = 0; ilvl < maxLvl - 1; ++ilvl) + { + /* code */ + int_t sender, receiver; + if (!myZeroTrIdxs[ilvl]) + { + if ((myGrid % (1 << (ilvl + 1))) == 0) + { + sender = myGrid + (1 << ilvl); + receiver = myGrid; + } + else + { + sender = myGrid; + receiver = myGrid - (1 << ilvl); + } + + for (int_t alvl = 0; alvl <= ilvl; alvl++) + { + int_t diffLvl = ilvl - alvl; + int_t numTrees = 1 << diffLvl; + int_t blvl = maxLvl - alvl - 1; + int_t st = (1 << blvl) - 1 + (sender >> alvl); + + for (int_t tr = st; tr < st + numTrees; ++tr) + { + /* code */ + 
zgatherFactoredLU(sender, receiver, + gNodeCount[tr], gNodeLists[tr], + LUvsb, + LUstruct, grid3d, SCT ); + } + } + + } + } /* for ilvl ... */ + + return 0; +} + diff --git a/SRC/pzgssvx3d.c b/SRC/pzgssvx3d.c new file mode 100644 index 00000000..ac4fb077 --- /dev/null +++ b/SRC/pzgssvx3d.c @@ -0,0 +1,1538 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + +/*! @file + * \brief Solves a system of linear equations A*X=B + * + *
+ * -- Distributed SuperLU routine (version 7.0.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * November 1, 2007
+ * October 22, 2012
+ * March 31, 2019
+ *
+ */
+
+#include <math.h>
+#include "superlu_zdefs.h"
+#if 0
+#include "p3dcomm.h"
+#include "pdgstrf3d.h"
+#include "triangularSolve/pdgstrs.h"
+#include "triangularSolve/pdgstrs3d.h"
+#include "xtrf3Dpartition.h"
+#endif
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * PZGSSVX3D solves a system of linear equations A*X=B,
+ * by using Gaussian elimination with "static pivoting" to
+ * compute the LU factorization of A.
+ *
+ * Static pivoting is a technique that combines the numerical stability
+ * of partial pivoting with the scalability of Cholesky (no pivoting),
+ * to run accurately and efficiently on large numbers of processors.
+ * See our paper at http://www.nersc.gov/~xiaoye/SuperLU/ for a detailed
+ * description of the parallel algorithms.
+ *
+ * The input matrices A and B are distributed by block rows.
+ * Here is a graphical illustration (0-based indexing):
+ *
+ *                        A                B
+ *               0 ---------------       ------
+ *                   |           |        |  |
+ *                   |           |   P0   |  |
+ *                   |           |        |  |
+ *                 ---------------       ------
+ *        - fst_row->|           |        |  |
+ *        |          |           |        |  |
+ *       m_loc       |           |   P1   |  |
+ *        |          |           |        |  |
+ *        -          |           |        |  |
+ *                 ---------------       ------
+ *                   |    .      |        |. |
+ *                   |    .      |        |. |
+ *                   |    .      |        |. |
+ *                 ---------------       ------
+ *
+ * where fst_row is the row number of the first row owned by this process,
+ *       and m_loc is the number of rows local to this process.
+ * These are defined in the 'SuperMatrix' structure, see supermatrix.h.
+ *
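+ * As an illustration, this block-row layout can be set up as follows
+ * (a sketch only: it assumes an n x n matrix split evenly over nprocs
+ * processes, with the locally owned CSR arrays nzval_loc, colind and
+ * rowptr -- hypothetical names -- already filled in):
+ *
+ *    int_t m_loc = n / nprocs;
+ *    int_t fst_row = iam * m_loc;
+ *    if (iam == nprocs - 1) m_loc = n - fst_row; /* last one takes the rest */
+ *    zCreate_CompRowLoc_Matrix_dist(&A, n, n, nnz_loc, m_loc, fst_row,
+ *                                   nzval_loc, colind, rowptr,
+ *                                   SLU_NR_loc, SLU_Z, SLU_GE);
+ *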
+ *
+ * Here are the options for using this code:
+ *
+ *   1. Independent of all the other options specified below, the
+ *      user must supply
+ *
+ *      -  B, the matrix of right-hand sides, distributed by block rows,
+ *            and its dimensions ldb (local) and nrhs (global)
+ *      -  grid3d, a structure describing the 3D process grid
+ *      -  options->IterRefine, which determines whether or not to
+ *            improve the accuracy of the computed solution using
+ *            iterative refinement
+ *
+ *      On output, B is overwritten with the solution X.
+ *
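+ *      (B is held column-major with leading dimension ldb, so after a
+ *      successful call the local solution entry X(i,j) is B[i + j*ldb]
+ *      for 0 <= i < m_loc and 0 <= j < nrhs -- a sketch of the layout,
+ *      following the usual LAPACK-style convention.)
+ *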
+ *   2. Depending on options->Fact, the user has four options
+ *      for solving A*X=B. The standard option is for factoring
+ *      A "from scratch". (The other options, described below,
+ *      are used when A is sufficiently similar to a previously
+ *      solved problem to save time by reusing part or all of
+ *      the previous factorization.)
+ *
+ *      -  options->Fact = DOFACT: A is factored "from scratch"
+ *
+ *      In this case the user must also supply
+ *
+ *        o  A, the input matrix
+ *
+ *        as well as the following options to determine what matrix to
+ *        factorize.
+ *
+ *        o  options->Equil,   to specify how to scale the rows and columns
+ *                             of A to "equilibrate" it (to try to reduce its
+ *                             condition number and so improve the
+ *                             accuracy of the computed solution)
+ *
+ *        o  options->RowPerm, to specify how to permute the rows of A
+ *                             (typically to control numerical stability)
+ *
+ *        o  options->ColPerm, to specify how to permute the columns of A
+ *                             (typically to control fill-in and enhance
+ *                             parallelism during factorization)
+ *
+ *        o  options->ReplaceTinyPivot, to specify how to deal with tiny
+ *                             pivots encountered during factorization
+ *                             (to control numerical stability)
+ *
+ *      The outputs returned include
+ *
+ *        o  ScalePermstruct,  modified to describe how the input matrix A
+ *                             was equilibrated and permuted:
+ *          .  ScalePermstruct->DiagScale, indicates whether the rows and/or
+ *                                         columns of A were scaled
+ *          .  ScalePermstruct->R, array of row scale factors
+ *          .  ScalePermstruct->C, array of column scale factors
+ *          .  ScalePermstruct->perm_r, row permutation vector
+ *          .  ScalePermstruct->perm_c, column permutation vector
+ *
+ *          (part of ScalePermstruct may also need to be supplied on input,
+ *           depending on options->RowPerm and options->ColPerm as described
+ *           later).
+ *
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix diag(R)*A*diag(C)*Pc^T, where
+ *              Pc is the column permutation matrix determined by
+ *                  ScalePermstruct->perm_c
+ *              diag(R) and diag(C) are diagonal scaling matrices determined
+ *                  by ScalePermstruct->DiagScale, ScalePermstruct->R and
+ *                  ScalePermstruct->C
+ *
+ *        o  LUstruct, which contains the L and U factorization of A1 where
+ *
+ *                A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U
+ *
+ *               (Note that A1 = Pc*Pr*Aout, where Aout is the matrix stored
+ *                in A on output.)
+ *
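+ *      A first solve therefore typically looks like this (a sketch only;
+ *      error handling omitted):
+ *
+ *         set_default_options_dist(&options);  /* options.Fact == DOFACT */
+ *         pzgssvx3d(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid3d,
+ *                   &LUstruct, &SOLVEstruct, berr, &stat, &info);
+ *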
+ *   3. The second value of options->Fact assumes that a matrix with the same
+ *      sparsity pattern as A has already been factored:
+ *
+ *      -  options->Fact = SamePattern: A is factored, assuming that it has
+ *            the same nonzero pattern as a previously factored matrix. In
+ *            this case the algorithm saves time by reusing the previously
+ *            computed column permutation vector stored in
+ *            ScalePermstruct->perm_c and the "elimination tree" of A
+ *            stored in LUstruct->etree
+ *
+ *      In this case the user must still specify the following options
+ *      as before:
+ *
+ *        o  options->Equil
+ *        o  options->RowPerm
+ *        o  options->ReplaceTinyPivot
+ *
+ *      but not options->ColPerm, whose value is ignored. This is because the
+ *      previous column permutation from ScalePermstruct->perm_c is used as
+ *      input. The user must also supply
+ *
+ *        o  A, the input matrix
+ *        o  ScalePermstruct->perm_c, the column permutation
+ *        o  LUstruct->etree, the elimination tree
+ *
+ *      The outputs returned include
+ *
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix as described above
+ *        o  ScalePermstruct, modified to describe how the input matrix A was
+ *                            equilibrated and row permuted
+ *        o  LUstruct, modified to contain the new L and U factors
+ *
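+ *      For example (a sketch), after a first solve with A1, a second
+ *      matrix A2 with the same nonzero pattern can reuse the ordering:
+ *
+ *         options.Fact = SamePattern;
+ *         pzgssvx3d(&options, &A2, &ScalePermstruct, b2, ldb, nrhs, &grid3d,
+ *                   &LUstruct, &SOLVEstruct, berr, &stat, &info);
+ *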
+ *   4. The third value of options->Fact assumes that a matrix B with the same
+ *      sparsity pattern as A has already been factored, and where the
+ *      row permutation of B can be reused for A. This is useful when A and B
+ *      have similar numerical values, so that the same row permutation
+ *      will make both factorizations numerically stable. This lets us reuse
+ *      all of the previously computed structure of L and U.
+ *
+ *      -  options->Fact = SamePattern_SameRowPerm: A is factored,
+ *            assuming not only the same nonzero pattern as the previously
+ *            factored matrix B, but reusing B's row permutation.
+ *
+ *      In this case the user must still specify the following options
+ *      as before:
+ *
+ *        o  options->Equil
+ *        o  options->ReplaceTinyPivot
+ *
+ *      but not options->RowPerm or options->ColPerm, whose values are
+ *      ignored. This is because the permutations from ScalePermstruct->perm_r
+ *      and ScalePermstruct->perm_c are used as input.
+ *
+ *      The user must also supply
+ *
+ *        o  A, the input matrix
+ *        o  ScalePermstruct->DiagScale, how the previous matrix was row
+ *                                       and/or column scaled
+ *        o  ScalePermstruct->R, the row scalings of the previous matrix,
+ *                               if any
+ *        o  ScalePermstruct->C, the column scalings of the previous matrix,
+ *                               if any
+ *        o  ScalePermstruct->perm_r, the row permutation of the previous
+ *                                    matrix
+ *        o  ScalePermstruct->perm_c, the column permutation of the previous
+ *                                    matrix
+ *        o  all of LUstruct, the previously computed information about
+ *                            L and U (the actual numerical values of L and U
+ *                            stored in LUstruct->Llu are ignored)
+ *
+ *      The outputs returned include
+ *
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix as described above
+ *        o  ScalePermstruct,  modified to describe how the input matrix A was
+ *                             equilibrated (thus ScalePermstruct->DiagScale,
+ *                             R and C may be modified)
+ *        o  LUstruct, modified to contain the new L and U factors
+ *
+ *   5. The fourth and last value of options->Fact assumes that A is
+ *      identical to a matrix that has already been factored on a previous
+ *      call, and reuses its entire LU factorization
+ *
+ *      -  options->Fact = Factored: A is identical to a previously
+ *            factorized matrix, so the entire previous factorization
+ *            can be reused.
+ *
+ *      In this case all the other options mentioned above are ignored
+ *      (options->Equil, options->RowPerm, options->ColPerm,
+ *       options->ReplaceTinyPivot)
+ *
+ *      The user must also supply
+ *
+ *        o  A, the unfactored matrix, only in the case that iterative
+ *              refinement is to be done (specifically A must be the output
+ *              A from the previous call, so that it has been scaled and permuted)
+ *        o  all of ScalePermstruct
+ *        o  all of LUstruct, including the actual numerical values of
+ *           L and U
+ *
+ *      all of which are unmodified on output.
+ *
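+ * For instance (a sketch), solving the same system again for a new
+ * right-hand side reuses the entire factorization:
+ *
+ *    options.Fact = FACTORED;
+ *    /* put the new right-hand side in b, then: */
+ *    pzgssvx3d(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid3d,
+ *              &LUstruct, &SOLVEstruct, berr, &stat, &info);
+ *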
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_dist_options_t* (global)
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *         The following fields should be defined for this structure:
+ *
+ *         o Fact (fact_t)
+ *           Specifies whether or not the factored form of the matrix
+ *           A is supplied on entry, and if not, how the matrix A should
+ *           be factorized based on the previous history.
+ *
+ *           = DOFACT: The matrix A will be factorized from scratch.
+ *                 Inputs:  A
+ *                          options->Equil, RowPerm, ColPerm, ReplaceTinyPivot
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          all of ScalePermstruct
+ *                          all of LUstruct
+ *
+ *           = SamePattern: the matrix A will be factorized assuming
+ *             that a factorization of a matrix with the same sparsity
+ *             pattern was performed prior to this one. Therefore, this
+ *             factorization will reuse column permutation vector
+ *             ScalePermstruct->perm_c and the elimination tree
+ *             LUstruct->etree
+ *                 Inputs:  A
+ *                          options->Equil, RowPerm, ReplaceTinyPivot
+ *                          ScalePermstruct->perm_c
+ *                          LUstruct->etree
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          rest of ScalePermstruct (DiagScale, R, C, perm_r)
+ *                          rest of LUstruct (GLU_persist, Llu)
+ *
+ *           = SamePattern_SameRowPerm: the matrix A will be factorized
+ *             assuming that a factorization of a matrix with the same
+ *             sparsity pattern and similar numerical values was performed
+ *             prior to this one. Therefore, this factorization will reuse
+ *             both row and column scaling factors R and C, and the
+ *             both row and column permutation vectors perm_r and perm_c,
+ *             distributed data structure set up from the previous symbolic
+ *             factorization.
+ *                 Inputs:  A
+ *                          options->Equil, ReplaceTinyPivot
+ *                          all of ScalePermstruct
+ *                          all of LUstruct
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          modified LUstruct->Llu
+ *           = FACTORED: the matrix A is already factored.
+ *                 Inputs:  all of ScalePermstruct
+ *                          all of LUstruct
+ *
+ *         o Equil (yes_no_t)
+ *           Specifies whether to equilibrate the system.
+ *           = NO:  no equilibration.
+ *           = YES: scaling factors are computed to equilibrate the system:
+ *                      diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B.
+ *                  Whether or not the system will be equilibrated depends
+ *                  on the scaling of the matrix A, but if equilibration is
+ *                  used, A is overwritten by diag(R)*A*diag(C) and B by
+ *                  diag(R)*B.
+ *
+ *         o RowPerm (rowperm_t)
+ *           Specifies how to permute rows of the matrix A.
+ *           = NATURAL:   use the natural ordering.
+ *           = LargeDiag_MC64: use the Duff/Koster algorithm to permute rows
+ *                        of the original matrix to make the diagonal large
+ *                        relative to the off-diagonal.
+ *           = LargeDiag_AWPM: use the approximate-weight perfect matching
+ *                        algorithm (requires CombBLAS) for the same purpose.
+ *           = MY_PERMR:  use the ordering given in ScalePermstruct->perm_r
+ *                        input by the user.
+ *
+ *         o ColPerm (colperm_t)
+ *           Specifies what type of column permutation to use to reduce fill.
+ *           = NATURAL:       natural ordering.
+ *           = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A.
+ *           = MMD_ATA:       minimum degree ordering on structure of A'*A.
+ *           = MY_PERMC:      the ordering given in ScalePermstruct->perm_c.
+ *
+ *         o ReplaceTinyPivot (yes_no_t)
+ *           = NO:  do not modify pivots
+ *           = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during
+ *                  LU factorization.
+ *
+ *         o IterRefine (IterRefine_t)
+ *           Specifies how to perform iterative refinement.
+ *           = NO:     no iterative refinement.
+ *           = SLU_DOUBLE: accumulate residual in double precision.
+ *           = SLU_EXTRA:  accumulate residual in extra precision.
+ *
+ *         NOTE: all options must be identical on all processes when
+ *               calling this routine.
+ *
+ * A (input/output) SuperMatrix* (local)
+ *         On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol).
+ *           The number of linear equations is A->nrow. The type of A must be:
+ *           Stype = SLU_NR_loc; Dtype = SLU_Z; Mtype = SLU_GE.
+ *           That is, A is stored in distributed compressed row format.
+ *           See supermatrix.h for the definition of 'SuperMatrix'.
+ *           This routine only handles square A, however, the LU factorization
+ *           routine PZGSTRF can factorize rectangular matrices.
+ *         On exit, A may be overwritten by diag(R)*A*diag(C)*Pc^T,
+ *           depending on ScalePermstruct->DiagScale and options->ColPerm:
+ *             if ScalePermstruct->DiagScale != NOEQUIL, A is overwritten by
+ *                diag(R)*A*diag(C).
+ *             if options->ColPerm != NATURAL, A is further overwritten by
+ *                diag(R)*A*diag(C)*Pc^T.
+ *           If all the above conditions are true, the LU decomposition is
+ *           performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T.
+ *
+ * ScalePermstruct (input/output) ScalePermstruct_t* (global)
+ *         The data structure to store the scaling and permutation vectors
+ *         describing the transformations performed to the matrix A.
+ *         It contains the following fields:
+ *
+ *         o DiagScale (DiagScale_t)
+ *           Specifies the form of equilibration that was done.
+ *           = NOEQUIL: no equilibration.
+ *           = ROW:     row equilibration, i.e., A was premultiplied by
+ *                      diag(R).
+ *           = COL:     Column equilibration, i.e., A was postmultiplied
+ *                      by diag(C).
+ *           = BOTH:    both row and column equilibration, i.e., A was
+ *                      replaced by diag(R)*A*diag(C).
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm,
+ *           DiagScale is an input argument; otherwise it is an output
+ *           argument.
+ *
+ *         o perm_r (int*)
+ *           Row permutation vector, which defines the permutation matrix Pr;
+ *           perm_r[i] = j means row i of A is in position j in Pr*A.
+ *           If options->RowPerm = MY_PERMR, or
+ *           options->Fact = SamePattern_SameRowPerm, perm_r is an
+ *           input argument; otherwise it is an output argument.
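+ *           For example, with perm_r = {2, 0, 1}, row 0 of A ends up as
+ *           row 2 of Pr*A, row 1 as row 0, and row 2 as row 1.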
+ *
+ *         o perm_c (int*)
+ *           Column permutation vector, which defines the
+ *           permutation matrix Pc; perm_c[i] = j means column i of A is
+ *           in position j in A*Pc.
+ *           If options->ColPerm = MY_PERMC or options->Fact = SamePattern
+ *           or options->Fact = SamePattern_SameRowPerm, perm_c is an
+ *           input argument; otherwise, it is an output argument.
+ *           On exit, perm_c may be overwritten by the product of the input
+ *           perm_c and a permutation that postorders the elimination tree
+ *           of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree
+ *           is already in postorder.
+ *
+ *         o R (double*) dimension (A->nrow)
+ *           The row scale factors for A.
+ *           If DiagScale = ROW or BOTH, A is multiplied on the left by
+ *                          diag(R).
+ *           If DiagScale = NOEQUIL or COL, R is not defined.
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm, R is
+ *           an input argument; otherwise, R is an output argument.
+ *
+ *         o C (double*) dimension (A->ncol)
+ *           The column scale factors for A.
+ *           If DiagScale = COL or BOTH, A is multiplied on the right by
+ *                          diag(C).
+ *           If DiagScale = NOEQUIL or ROW, C is not defined.
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm, C is
+ *           an input argument; otherwise, C is an output argument.
+ *
+ * B       (input/output) doublecomplex* (local)
+ *         On entry, the right-hand side matrix of dimension (m_loc, nrhs),
+ *           where m_loc is the number of rows stored locally on my
+ *           process and is defined in the data structure of matrix A.
+ *         On exit, the solution matrix if info = 0;
+ *
+ * ldb     (input) int (local)
+ *         The leading dimension of matrix B.
+ *
+ * nrhs    (input) int (global)
+ *         The number of right-hand sides.
+ *         If nrhs = 0, only LU decomposition is performed, the forward
+ *         and back substitutions are skipped.
+ *
+ * grid3d  (input) gridinfo3d_t* (global)
+ *         The 3D process grid. It contains the MPI communicator, the number
+ *         of process rows (NPROW), the number of process columns (NPCOL),
+ *         the number of process layers (NPDEP), and my process rank. It is
+ *         an input argument to all the parallel routines.
+ *         The grid can be initialized by subroutine SUPERLU_GRIDINIT3D.
+ *         See superlu_defs.h for the definition of 'gridinfo3d_t'.
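+ *         For example (a sketch, assuming the 3D grid initializer
+ *         introduced with this code keeps its obvious signature):
+ *
+ *            gridinfo3d_t grid3d;
+ *            superlu_gridinit3d(MPI_COMM_WORLD, nprow, npcol, npdep, &grid3d);
+ *            ...   /* factor and solve */
+ *            superlu_gridexit3d(&grid3d);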
+ *
+ * LUstruct (input/output) LUstruct_t*
+ *         The data structures to store the distributed L and U factors.
+ *         It contains the following fields:
+ *
+ *         o etree (int*) dimension (A->ncol) (global)
+ *           Elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc'.
+ *           It is computed in sp_colorder() during the first factorization,
+ *           and is reused in the subsequent factorizations of the matrices
+ *           with the same nonzero pattern.
+ *           On exit of sp_colorder(), the columns of A are permuted so that
+ *           the etree is in a certain postorder. This postorder is reflected
+ *           in ScalePermstruct->perm_c.
+ *           NOTE:
+ *           Etree is a vector of parent pointers for a forest whose vertices
+ *           are the integers 0 to A->ncol-1; etree[root]==A->ncol.
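+ *           For example, if the columns of a 3-column matrix eliminate in
+ *           a simple chain 0 -> 1 -> 2, then etree = {1, 2, 3}, with
+ *           3 (== A->ncol) marking the root.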
+ *
+ *         o Glu_persist (Glu_persist_t*) (global)
+ *           Global data structure (xsup, supno) replicated on all processes,
+ *           describing the supernode partition in the factored matrices
+ *           L and U:
+ *	       xsup[s] is the leading column of the s-th supernode,
+ *             supno[i] is the supernode number to which column i belongs.
+ *
+ *         o Llu (LocalLU_t*) (local)
+ *           The distributed data structures to store L and U factors.
+ *           See superlu_zdefs.h for the definition of 'LocalLU_t'.
+ *
+ * SOLVEstruct (input/output) SOLVEstruct_t*
+ *         The data structure to hold the communication pattern used
+ *         in the phases of triangular solution and iterative refinement.
+ *         This pattern should be initialized only once for repeated solutions.
+ *         If options->SolveInitialized = YES, it is an input argument.
+ *         If options->SolveInitialized = NO and nrhs != 0, it is an output
+ *         argument. See superlu_zdefs.h for the definition of 'SOLVEstruct_t'.
+ *
+ * berr    (output) double*, dimension (nrhs) (global)
+ *         The componentwise relative backward error of each solution
+ *         vector X(j) (i.e., the smallest relative change in
+ *         any element of A or B that makes X(j) an exact solution).
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics on runtime and floating-point operation count.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info    (output) int*
+ *         = 0: successful exit
+ *         > 0: if info = i, and i is
+ *             <= A->ncol: U(i,i) is exactly zero. The factorization has
+ *                been completed, but the factor U is exactly singular,
+ *                so the solution could not be computed.
+ *             > A->ncol: number of bytes allocated when memory allocation
+ *                failure occurred, plus A->ncol.
+ *
+ * See superlu_zdefs.h for the definitions of various data types.
+ * </pre>
+ */
+
+void
+pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
+           ScalePermstruct_t * ScalePermstruct,
+           doublecomplex B[], int ldb, int nrhs, gridinfo3d_t * grid3d,
+           LUstruct_t * LUstruct, SOLVEstruct_t * SOLVEstruct, double *berr,
+           SuperLUStat_t * stat, int *info)
+{
+    NRformat_loc *Astore;
+    SuperMatrix GA;      /* Global A in NC format */
+    NCformat *GAstore;
+    doublecomplex *a_GA;
+    SuperMatrix GAC;     /* Global A in NCP format (add n end pointers) */
+    NCPformat *GACstore;
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    Glu_freeable_t *Glu_freeable;
+    /* The nonzero structures of L and U factors, which are
+       replicated on all processors.
+       (lsub, xlsub) contains the compressed subscript of
+       supernodes in L.
+       (usub, xusub) contains the compressed subscript of
+       nonzero segments in U.
+       If options->Fact != SamePattern_SameRowPerm, they are
+       computed by SYMBFACT routine, and then used by PDDISTRIBUTE
+       routine. They will be freed after PDDISTRIBUTE routine.
+       If options->Fact == SamePattern_SameRowPerm, these
+       structures are not used. */
+    fact_t Fact;
+    doublecomplex *a;
+    int_t *colptr, *rowind;
+    int_t *perm_r;  /* row permutations from partial pivoting */
+    int_t *perm_c;  /* column permutation vector */
+    int_t *etree;   /* elimination tree */
+    int_t *rowptr, *colind;  /* Local A in NR */
+    int_t colequ, Equil, factored, job, notran, rowequ, need_value;
+    int_t i, iinfo, j, irow, m, n, nnz, permc_spec;
+    int_t nnz_loc, m_loc, fst_row, icol;
+    int iam;
+    int ldx;  /* LDA for matrix X (local). */
+    char equed[1], norm[1];
+    double *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd;
+    doublecomplex *X, *b_col, *b_work, *x_col;
+    double t;
+    float GA_mem_use;    /* memory usage by global A */
+    float dist_mem_use;  /* memory usage during distribution */
+    superlu_dist_mem_usage_t num_mem_usage, symb_mem_usage;
+#if ( PRNTlevel>= 2 )
+    double dmin, dsum, dprod;
+#endif
+
+    /* Structures needed for parallel symbolic factorization */
+    int_t *sizes, *fstVtxSep, parSymbFact;
+    int noDomains, nprocs_num;
+    MPI_Comm symb_comm;  /* communicator for symbolic factorization */
+    int col, key;        /* parameters for creating a new communicator */
+    Pslu_freeable_t Pslu_freeable;
+    float flinfo;
+
+    /* Initialization. */
+
+    /* definition of 'factored' as seen by each process layer */
+    Fact = options->Fact;
+    factored = (Fact == FACTORED);
+
+    // get the 2d grid
+    gridinfo_t *grid = &(grid3d->grid2d);
+    iam = grid->iam;
+
+    /* Perform preprocessing steps on process layer zero, including:
+       ordering, symbolic factorization, distribution of L & U */
+    if (grid3d->zscp.Iam == 0)
+    {
+        m = A->nrow;
+        n = A->ncol;
+        Astore = (NRformat_loc *) A->Store;
+        nnz_loc = Astore->nnz_loc;
+        m_loc = Astore->m_loc;
+        fst_row = Astore->fst_row;
+        a = (doublecomplex *) Astore->nzval;
+        rowptr = Astore->rowptr;
+        colind = Astore->colind;
+        sizes = NULL;
+        fstVtxSep = NULL;
+        symb_comm = MPI_COMM_NULL;
+
+        /* Test the input parameters.
*/
+        *info = 0;
+        Fact = options->Fact;
+        if (Fact < 0 || Fact > FACTORED)
+            *info = -1;
+        else if (options->RowPerm < 0 || options->RowPerm > MY_PERMR)
+            *info = -1;
+        else if (options->ColPerm < 0 || options->ColPerm > MY_PERMC)
+            *info = -1;
+        else if (options->IterRefine < 0 || options->IterRefine > SLU_EXTRA)
+            *info = -1;
+        else if (options->IterRefine == SLU_EXTRA) {
+            *info = -1;
+            fprintf (stderr,
+                     "Extra precise iterative refinement yet to support.");
+        }
+        else if (A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc
+                 || A->Dtype != SLU_Z || A->Mtype != SLU_GE)
+            *info = -2;
+        else if (ldb < m_loc)
+            *info = -5;
+        else if (nrhs < 0)
+            *info = -6;
+        if (*info) {
+            i = -(*info);
+            pxerr_dist ("pzgssvx3d", grid, -*info);
+            return;
+        }
+
+        factored = (Fact == FACTORED);
+        Equil = (!factored && options->Equil == YES);
+        notran = (options->Trans == NOTRANS);
+        parSymbFact = options->ParSymbFact;
+
+        iam = grid->iam;
+        job = 5;
+        if (factored || (Fact == SamePattern_SameRowPerm && Equil))
+        {
+            rowequ = (ScalePermstruct->DiagScale == ROW) ||
+                     (ScalePermstruct->DiagScale == BOTH);
+            colequ = (ScalePermstruct->DiagScale == COL) ||
+                     (ScalePermstruct->DiagScale == BOTH);
+        }
+        else
+            rowequ = colequ = FALSE;
+
+        /* The following arrays are replicated on all processes. */
+        perm_r = ScalePermstruct->perm_r;
+        perm_c = ScalePermstruct->perm_c;
+        etree = LUstruct->etree;
+        R = ScalePermstruct->R;
+        C = ScalePermstruct->C;
+        /********/
+
+#if ( DEBUGlevel>=1 )
+        CHECK_MALLOC (iam, "Enter pzgssvx3d()");
+#endif
+
+        /* Not factored & ask for equilibration */
+        if (Equil && Fact != SamePattern_SameRowPerm) {
+            /* Allocate storage if not done so before. */
+            switch (ScalePermstruct->DiagScale)
+            {
+            case NOEQUIL:
+                if (!(R = (double *) doubleMalloc_dist (m)))
+                    ABORT ("Malloc fails for R[].");
+                if (!(C = (double *) doubleMalloc_dist (n)))
+                    ABORT ("Malloc fails for C[].");
+                ScalePermstruct->R = R;
+                ScalePermstruct->C = C;
+                break;
+            case ROW:
+                if (!(C = (double *) doubleMalloc_dist (n)))
+                    ABORT ("Malloc fails for C[].");
+                ScalePermstruct->C = C;
+                break;
+            case COL:
+                if (!(R = (double *) doubleMalloc_dist (m)))
+                    ABORT ("Malloc fails for R[].");
+                ScalePermstruct->R = R;
+                break;
+            }
+        }
+
+        /* ------------------------------------------------------------
+           Diagonal scaling to equilibrate the matrix.
+           ------------------------------------------------------------ */
+        if (Equil) {
+#if ( DEBUGlevel>=1 )
+            CHECK_MALLOC (iam, "Enter equil");
+#endif
+            t = SuperLU_timer_ ();
+
+            if (Fact == SamePattern_SameRowPerm) {
+                /* Reuse R and C. */
+                switch (ScalePermstruct->DiagScale) {
+                case NOEQUIL:
+                    break;
+                case ROW:
+                    irow = fst_row;
+                    for (j = 0; j < m_loc; ++j) {
+                        for (i = rowptr[j]; i < rowptr[j + 1]; ++i) {
+                            zd_mult(&a[i], &a[i], R[irow]); /* Scale rows */
+                        }
+                        ++irow;
+                    }
+                    break;
+                case COL:
+                    for (j = 0; j < m_loc; ++j)
+                        for (i = rowptr[j]; i < rowptr[j + 1]; ++i) {
+                            icol = colind[i];
+                            zd_mult(&a[i], &a[i], C[icol]); /* Scale columns */
+                        }
+                    break;
+                case BOTH:
+                    irow = fst_row;
+                    for (j = 0; j < m_loc; ++j)
+                    {
+                        for (i = rowptr[j]; i < rowptr[j + 1]; ++i)
+                        {
+                            icol = colind[i];
+                            zd_mult(&a[i], &a[i], R[irow]); /* Scale rows */
+                            zd_mult(&a[i], &a[i], C[icol]); /* Scale columns */
+                        }
+                        ++irow;
+                    }
+                    break;
+                }
+            } else { /* Compute R & C from scratch */
+                /* Compute the row and column scalings.
*/
+                pzgsequ (A, R, C, &rowcnd, &colcnd, &amax, &iinfo, grid);
+
+                if ( iinfo > 0 ) {
+                    if ( iinfo <= m ) {
+#if ( PRNTlevel>=1 )
+                        fprintf(stderr, "The " IFMT "-th row of A is exactly zero\n", iinfo);
+#endif
+                    } else {
+#if ( PRNTlevel>=1 )
+                        fprintf(stderr, "The " IFMT "-th column of A is exactly zero\n", iinfo-n);
+#endif
+                    }
+                } else if ( iinfo < 0 ) return;
+
+                /* Now iinfo == 0 */
+
+                /* Equilibrate matrix A if it is badly-scaled.
+                   A <-- diag(R)*A*diag(C) */
+                pzlaqgs (A, R, C, rowcnd, colcnd, amax, equed);
+
+                if ( strncmp(equed, "R", 1)==0 ) {
+                    ScalePermstruct->DiagScale = ROW;
+                    rowequ = ROW;
+                } else if ( strncmp(equed, "C", 1)==0 ) {
+                    ScalePermstruct->DiagScale = COL;
+                    colequ = COL;
+                } else if ( strncmp(equed, "B", 1)==0 ) {
+                    ScalePermstruct->DiagScale = BOTH;
+                    rowequ = ROW;
+                    colequ = COL;
+                } else ScalePermstruct->DiagScale = NOEQUIL;
+
+#if ( PRNTlevel>=1 )
+                if (iam==0) {
+                    printf (".. equilibrated? *equed = %c\n", *equed);
+                    fflush(stdout);
+                }
+#endif
+            } /* end if-else Fact ... */
+
+            stat->utime[EQUIL] = SuperLU_timer_ () - t;
+#if ( DEBUGlevel>=1 )
+            CHECK_MALLOC (iam, "Exit equil");
+#endif
+        } /* end if Equil ... LAPACK style, not involving MC64 */
+
+        if (!factored) { /* Skip this if already factored. */
+            /*
+             * Gather A from the distributed compressed row format to
+             * global A in compressed column format.
+             * Numerical values are gathered only when a row permutation
+             * for large diagonal is sought after.
+             */
+            if (Fact != SamePattern_SameRowPerm &&
+                (parSymbFact == NO || options->RowPerm != NO)) {
+
+                need_value = (options->RowPerm == LargeDiag_MC64);
+
+                pzCompRow_loc_to_CompCol_global (need_value, A, grid, &GA);
+
+                GAstore = (NCformat *) GA.Store;
+                colptr = GAstore->colptr;
+                rowind = GAstore->rowind;
+                nnz = GAstore->nnz;
+                GA_mem_use = (nnz + n + 1) * sizeof (int_t);
+
+                if (need_value) {
+                    a_GA = (doublecomplex *) GAstore->nzval;
+                    GA_mem_use += nnz * sizeof (doublecomplex);
+                }
+                else
+                    assert (GAstore->nzval == NULL);
+            }
+
+            /* ------------------------------------------------------------
+               Find the row permutation for A.
+               ------------------------------------------------------------ */
+            if (options->RowPerm != NO) {
+                t = SuperLU_timer_ ();
+                if (Fact != SamePattern_SameRowPerm) {
+                    if (options->RowPerm == MY_PERMR) {
+                        /* Use user's perm_r. */
+                        /* Permute the global matrix GA for symbfact() */
+                        for (i = 0; i < colptr[n]; ++i) {
+                            irow = rowind[i];
+                            rowind[i] = perm_r[irow];
+                        }
+                    } else if ( options->RowPerm == LargeDiag_MC64 ) {
+                        /* Get a new perm_r[] */
+                        if (job == 5) {
+                            /* Allocate storage for scaling factors.
*/ + if (!(R1 = doubleMalloc_dist (m))) + ABORT ("SUPERLU_MALLOC fails for R1[]"); + if (!(C1 = doubleMalloc_dist (n))) + ABORT ("SUPERLU_MALLOC fails for C1[]"); + } + + if ( iam==0 ) { + /* Process 0 finds a row permutation */ + iinfo = dldperm_dist (job, m, nnz, colptr, rowind, a_GA, + perm_r, R1, C1); + MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); + if ( iinfo == 0 ) { + MPI_Bcast (perm_r, m, mpi_int_t, 0, grid->comm); + if (job == 5 && Equil) { + MPI_Bcast (R1, m, MPI_DOUBLE, 0, grid->comm); + MPI_Bcast (C1, n, MPI_DOUBLE, 0, grid->comm); + } + } + } else { + MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); + if ( iinfo == 0 ) { + MPI_Bcast (perm_r, m, mpi_int_t, 0, grid->comm); + if (job == 5 && Equil) { + MPI_Bcast (R1, m, MPI_DOUBLE, 0, grid->comm); + MPI_Bcast (C1, n, MPI_DOUBLE, 0, grid->comm); + } + } + } + + if ( iinfo && job == 5) { /* Error return */ + SUPERLU_FREE(R1); + SUPERLU_FREE(C1); + } +#if ( PRNTlevel>=2 ) + dmin = damch_dist ("Overflow"); + dsum = 0.0; + dprod = 1.0; +#endif + if ( iinfo == 0 ) { + if (job == 5) { + if ( Equil ) { + for (i = 0; i < n; ++i) { + R1[i] = exp (R1[i]); + C1[i] = exp (C1[i]); + } + + /* Scale the distributed matrix further. + A <-- diag(R1)*A*diag(C1) */ + irow = fst_row; + for (j = 0; j < m_loc; ++j) { + for (i = rowptr[j]; i < rowptr[j + 1]; ++i) { + icol = colind[i]; + zd_mult(&a[i], &a[i], R1[irow]); + zd_mult(&a[i], &a[i], C1[icol]); +#if ( PRNTlevel>=2 ) + if (perm_r[irow] == icol) { + /* New diagonal */ + if (job == 2 || job == 3) + dmin = SUPERLU_MIN(dmin, slud_z_abs1(&a[i])); + else if (job == 4) + dsum += slud_z_abs1(&a[i]); + else if (job == 5) + dprod *= slud_z_abs1(&a[i]); + } +#endif + } + ++irow; + } + + /* Multiply together the scaling factors -- + R/C from simple scheme, R1/C1 from MC64. */ + if (rowequ) + for (i = 0; i < m; ++i) R[i] *= R1[i]; + else + for (i = 0; i < m; ++i) R[i] = R1[i]; + if (colequ) + for (i = 0; i < n; ++i) C[i] *= C1[i]; + else + for (i = 0; i < n; ++i) C[i] = C1[i]; + + ScalePermstruct->DiagScale = BOTH; + rowequ = colequ = 1; + + } /* end if Equil */ + + /* Now permute global A to prepare for symbfact() */ + for (j = 0; j < n; ++j) { + for (i = colptr[j]; i < colptr[j + 1]; ++i) { + irow = rowind[i]; + rowind[i] = perm_r[irow]; + } + } + SUPERLU_FREE (R1); + SUPERLU_FREE (C1); + } else { /* job = 2,3,4 */ + for (j = 0; j < n; ++j) { + for (i = colptr[j]; i < colptr[j + 1]; ++i) + { + irow = rowind[i]; + rowind[i] = perm_r[irow]; + } /* end for i ... */ + } /* end for j ... */ + } /* end else job ... */ + } else { /* if iinfo != 0 */ + for (i = 0; i < m; ++i) perm_r[i] = i; + } +#if ( PRNTlevel>=2 ) + if (job == 2 || job == 3) { + if (!iam) + printf ("\tsmallest diagonal %e\n", dmin); + } else if (job == 4) { + if (!iam) + printf ("\tsum of diagonal %e\n", dsum); + } else if (job == 5) { + if (!iam) + printf ("\t product of diagonal %e\n", dprod); + } +#endif + } else { /* use largeDiag_AWPM */ +#ifdef HAVE_COMBBLAS + c2cpp_GetAWPM(A, grid, ScalePermstruct); +#else + if ( iam == 0 ) { + printf("CombBLAS is not available\n"); fflush(stdout); + } +#endif + } /* end if-else options->RowPerm ... */ + + t = SuperLU_timer_ () - t; + stat->utime[ROWPERM] = t; +#if ( PRNTlevel>=1 ) + if ( !iam ) { + printf(".. LDPERM job " IFMT "\t time: %.2f\n", job, t); + fflush(stdout); + } +#endif + } /* end if Fact not SamePattern_SameRowPerm ... 
*/ + } else { /* options->RowPerm == NOROWPERM / NATURAL */ + for (i = 0; i < m; ++i) perm_r[i] = i; + } + +#if ( DEBUGlevel>=2 ) + if (!iam) + PrintInt10 ("perm_r", m, perm_r); +#endif + } /* end if (!factored) */ + + if (!factored || options->IterRefine) { + /* Compute norm(A), which will be used to adjust small diagonal. */ + if (notran) + *(unsigned char *) norm = '1'; + else + *(unsigned char *) norm = 'I'; + anorm = pdlangs (norm, A, grid); +#if ( PRNTlevel>=1 ) + if (!iam) { + printf (".. anorm %e\n", anorm); fflush(stdout); + } +#endif + } + + + /* ------------------------------------------------------------ + Perform the LU factorization. + ------------------------------------------------------------ */ + if (!factored) { + t = SuperLU_timer_ (); + /* + * Get column permutation vector perm_c[], according to permc_spec: + * permc_spec = NATURAL: natural ordering + * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A + * permc_spec = MMD_ATA: minimum degree on structure of A'*A + * permc_spec = METIS_AT_PLUS_A: METIS on structure of A'+A + * permc_spec = PARMETIS: parallel METIS on structure of A'+A + * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] + */ + permc_spec = options->ColPerm; + + if (parSymbFact == YES || permc_spec == PARMETIS) { + nprocs_num = grid->nprow * grid->npcol; + noDomains = (int) (pow (2, ((int) LOG2 (nprocs_num)))); + + /* create a new communicator for the first noDomains + processes in grid->comm */ + key = iam; + if (iam < noDomains) + col = 0; + else + col = MPI_UNDEFINED; + MPI_Comm_split (grid->comm, col, key, &symb_comm); + + if (permc_spec == NATURAL || permc_spec == MY_PERMC) { + if (permc_spec == NATURAL) + { + for (j = 0; j < n; ++j) + perm_c[j] = j; + } + if (!(sizes = intMalloc_dist (2 * noDomains))) + ABORT ("SUPERLU_MALLOC fails for sizes."); + if (!(fstVtxSep = intMalloc_dist (2 * noDomains))) + ABORT ("SUPERLU_MALLOC fails for fstVtxSep."); + for (i = 0; i < 2 * noDomains - 2; ++i) { + sizes[i] = 0; + fstVtxSep[i] = 0; + } + sizes[2 * noDomains - 2] = m; + fstVtxSep[2 * noDomains - 2] = 0; + } else if (permc_spec != PARMETIS) { + /* same as before */ + printf + ("{%4d,%4d}: pdgssvx: invalid ColPerm option when ParSymbfact is used\n", + (int) MYROW (grid->iam, grid), (int) MYCOL (grid->iam, grid)); + } + } /* end ... use parmetis */ + + if (permc_spec != MY_PERMC && Fact == DOFACT) { + if (permc_spec == PARMETIS) { + /* Get column permutation vector in perm_c. * + * This routine takes as input the distributed input matrix A * + * and does not modify it. It also allocates memory for * + * sizes[] and fstVtxSep[] arrays, that contain information * + * on the separator tree computed by ParMETIS. */ + flinfo = get_perm_c_parmetis (A, perm_r, perm_c, nprocs_num, + noDomains, &sizes, &fstVtxSep, + grid, &symb_comm); + if (flinfo > 0) + ABORT ("ERROR in get perm_c parmetis."); + } else { + get_perm_c_dist (iam, permc_spec, &GA, perm_c); + } + } + + stat->utime[COLPERM] = SuperLU_timer_ () - t; + + /* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc' + (a.k.a. column etree), depending on the choice of ColPerm. + Adjust perm_c[] to be consistent with a postorder of etree. + Permute columns of A to form A*Pc'. */ + if (Fact != SamePattern_SameRowPerm) { + if (parSymbFact == NO) { + + int_t *GACcolbeg, *GACcolend, *GACrowind; + + sp_colorder (options, &GA, perm_c, etree, &GAC); + + /* Form Pc*A*Pc' to preserve the diagonal of the matrix GAC. 
*/ + GACstore = (NCPformat *) GAC.Store; + GACcolbeg = GACstore->colbeg; + GACcolend = GACstore->colend; + GACrowind = GACstore->rowind; + for (j = 0; j < n; ++j) { + for (i = GACcolbeg[j]; i < GACcolend[j]; ++i) { + irow = GACrowind[i]; + GACrowind[i] = perm_c[irow]; + } + } + + + /* Perform a symbolic factorization on Pc*Pr*A*Pc' and set up + the nonzero data structures for L & U. */ +#if ( PRNTlevel>=1 ) + if (!iam) + printf + (".. symbfact(): relax %4d, maxsuper %4d, fill %4d\n", + sp_ienv_dist (2), sp_ienv_dist (3), sp_ienv_dist (6)); +#endif + t = SuperLU_timer_ (); + if (!(Glu_freeable = (Glu_freeable_t *) + SUPERLU_MALLOC (sizeof (Glu_freeable_t)))) + ABORT ("Malloc fails for Glu_freeable."); + + /* Every process does this. */ + iinfo = symbfact (options, iam, &GAC, perm_c, etree, + Glu_persist, Glu_freeable); + + stat->utime[SYMBFAC] = SuperLU_timer_ () - t; + if (iinfo < 0) { + /* Successful return */ + QuerySpace_dist (n, -iinfo, Glu_freeable, &symb_mem_usage); + +#if ( PRNTlevel>=1 ) + if (!iam) { + printf ("\tNo of supers %ld\n", + Glu_persist->supno[n - 1] + 1); + printf ("\tSize of G(L) %ld\n", + Glu_freeable->xlsub[n]); + printf ("\tSize of G(U) %ld\n", + Glu_freeable->xusub[n]); + printf ("\tint %d, short %d, float %d, double %d\n", + sizeof (int_t), sizeof (short), + sizeof (float), sizeof (double)); + printf + ("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", + symb_mem_usage.for_lu * 1e-6, + symb_mem_usage.total * 1e-6, + symb_mem_usage.expansions); + } +#endif + } else { + if (!iam) { + fprintf (stderr, "symbfact() error returns %d\n", + (int) iinfo); + exit (-1); + } + } + + } /* end serial symbolic factorization */ + else { /* parallel symbolic factorization */ + t = SuperLU_timer_ (); + flinfo = + symbfact_dist (nprocs_num, noDomains, A, perm_c, perm_r, + sizes, fstVtxSep, &Pslu_freeable, + &(grid->comm), &symb_comm, + &symb_mem_usage); + stat->utime[SYMBFAC] = SuperLU_timer_ () - t; + if (flinfo > 0) + ABORT + ("Insufficient memory for parallel symbolic factorization."); + } + + /* Destroy GA */ + if (parSymbFact == NO || options->RowPerm != NO) + Destroy_CompCol_Matrix_dist (&GA); + if (parSymbFact == NO) + Destroy_CompCol_Permuted_dist (&GAC); + + } /* end if Fact not SamePattern_SameRowPerm */ + + if (sizes) + SUPERLU_FREE (sizes); + if (fstVtxSep) + SUPERLU_FREE (fstVtxSep); + if (symb_comm != MPI_COMM_NULL) + MPI_Comm_free (&symb_comm); + + if (parSymbFact == NO || Fact == SamePattern_SameRowPerm) { + /* Apply column permutation to the original distributed A */ + for (j = 0; j < nnz_loc; ++j) + colind[j] = perm_c[colind[j]]; + + /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. + NOTE: the row permutation Pc*Pr is applied internally in the + distribution routine. */ + t = SuperLU_timer_ (); + dist_mem_use = pddistribute (Fact, n, A, ScalePermstruct, + Glu_freeable, LUstruct, grid); + stat->utime[DIST] = SuperLU_timer_ () - t; + + /* Deallocate storage used in symbolic factorization. */ + if (Fact != SamePattern_SameRowPerm) + { + iinfo = symbfact_SubFree (Glu_freeable); + SUPERLU_FREE (Glu_freeable); + } + } else { + /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. + NOTE: the row permutation Pc*Pr is applied internally in the + distribution routine. 
*/ + /* Apply column permutation to the original distributed A */ + for (j = 0; j < nnz_loc; ++j) + colind[j] = perm_c[colind[j]]; + + t = SuperLU_timer_ (); + dist_mem_use = ddist_psymbtonum (Fact, n, A, ScalePermstruct, + &Pslu_freeable, LUstruct, grid); + if (dist_mem_use > 0) + ABORT ("Not enough memory available for dist_psymbtonum\n"); + + stat->utime[DIST] = SuperLU_timer_ () - t; + } + + /*if (!iam) printf ("\tDISTRIBUTE time %8.2f\n", stat->utime[DIST]); */ + } /* end if not Factored */ + } /* end if process layer 0 */ + + trf3Dpartition_t* trf3Dpartition; + + /* Perform numerical factorization in parallel on all process layers. */ + if (!factored ) { + + /* send the data across all the layers */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid3d->zscp.comm); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid3d->zscp.comm); + MPI_Bcast( &anorm, 1, MPI_DOUBLE, 0, grid3d->zscp.comm); + + /* send the LU structure to all the grids */ + zp3dScatter(n, LUstruct, grid3d); + int_t nsupers = getNsupers(n, LUstruct); + trf3Dpartition = zinitTrf3Dpartition(nsupers, options, LUstruct, grid3d); + SCT_t *SCT = (SCT_t *) SUPERLU_MALLOC(sizeof(SCT_t)); + + SCT_init(SCT); + +#if ( PRNTlevel>=1 ) + if (iam==0) { + printf("after 3D initialization.\n"); fflush(stdout); + } +#endif + + t = SuperLU_timer_ (); + + /*factorize in grid 1*/ + // if(grid3d->zscp.Iam) + + pzgstrf3d (options, m, n, anorm, trf3Dpartition, SCT, LUstruct, + grid3d, stat, info); + stat->utime[FACT] = SuperLU_timer_ () - t; + + double tgather = SuperLU_timer_(); + + zgatherAllFactoredLU(trf3Dpartition, LUstruct, grid3d, SCT); + + SCT->gatherLUtimer += SuperLU_timer_() - tgather; + /*print stats for bottom grid*/ + +#if ( PRNTlevel>=1 ) + if (!grid3d->zscp.Iam) + { + SCT_print(grid, SCT); + SCT_print3D(grid3d, SCT); + } + SCT_printComm3D(grid3d, SCT); + /*print memory usage*/ + printMemUse( trf3Dpartition, LUstruct, grid3d ); + /*print forest weight and costs*/ + printForestWeightCost(trf3Dpartition->sForests, SCT, grid3d); + /*reduces stat from all the layers*/ +#endif + + } /* end if not Factored */ + + if ( grid3d->zscp.Iam == 0 ) { + if (!factored) { + if (options->PrintStat) { + int_t TinyPivots; + float for_lu, total, max, avg, temp; + + zQuerySpace_dist (n, LUstruct, grid, stat, &num_mem_usage); + + if (parSymbFact == TRUE) { + /* The memory used in the redistribution routine + includes the memory used for storing the symbolic + structure and the memory allocated for numerical factorization */ + temp = SUPERLU_MAX (symb_mem_usage.total, -dist_mem_use); + if (options->RowPerm != NO) + temp = SUPERLU_MAX (temp, GA_mem_use); + } + else { + temp = SUPERLU_MAX (symb_mem_usage.total + GA_mem_use, /* symbfact step */ + symb_mem_usage.for_lu + dist_mem_use + num_mem_usage.for_lu /* distribution step */ + ); + } + + temp = SUPERLU_MAX (temp, num_mem_usage.total); + + MPI_Reduce (&temp, &max, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm); + MPI_Reduce (&temp, &avg, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); + MPI_Allreduce (&stat->TinyPivots, &TinyPivots, 1, mpi_int_t, + MPI_SUM, grid->comm); + stat->TinyPivots = TinyPivots; + + MPI_Reduce (&num_mem_usage.for_lu, &for_lu, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); + MPI_Reduce (&num_mem_usage.total, &total, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); + + if (!iam) { + printf("\tNUMfact space (MB) sum(procs): L\\U\t%.2f\tall\t%.2f\n", + for_lu * 1e-6, total * 1e-6); + printf ("\tTotal highmark (MB): " + "All\t%.2f\tAvg\t%.2f\tMax\t%.2f\n", avg * 1e-6, + avg / grid->nprow / grid->npcol * 1e-6, max * 1e-6); + 
printf("**************************************************\n"); + fflush(stdout); + } + } + + } /* end if (!factored) */ + + /* ------------------------------------------------------------ + Compute the solution matrix X. + ------------------------------------------------------------ */ + if (nrhs) + { + if (!(b_work = doublecomplexMalloc_dist (n))) + ABORT ("Malloc fails for b_work[]"); + + /* ------------------------------------------------------ + Scale the right-hand side if equilibration was performed. + ------------------------------------------------------ */ + if (notran) + { + if (rowequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + zd_mult(&b_col[i], &b_col[i], R[irow]); + ++irow; + } + b_col += ldb; + } + } + } + else if (colequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + zd_mult(&b_col[i], &b_col[i], C[irow]); + ++irow; + } + b_col += ldb; + } + } + + /* Save a copy of the right-hand side. */ + ldx = ldb; + if (!(X = doublecomplexMalloc_dist (((size_t) ldx) * nrhs))) + ABORT ("Malloc fails for X[]"); + x_col = X; + b_col = B; + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < m_loc; ++i) x_col[i] = b_col[i]; + x_col += ldx; + b_col += ldb; + } + + /* ------------------------------------------------------------ + Solve the linear system. + ------------------------------------------------------------ */ + if (options->SolveInitialized == NO) /* First time */ + /* Inside this routine, SolveInitialized is set to YES. + For repeated call to pzgssvx(), no need to re-initialilze + the Solve data & communication structures, unless a new + factorization with Fact == DOFACT or SamePattern is asked for. */ + { + zSolveInit (options, A, perm_r, perm_c, nrhs, LUstruct, + grid, SOLVEstruct); + } + stat->utime[SOLVE] = 0.0; +#if 0 // Sherry: the following interface is needed by 3D trisolve. + pzgstrs_vecpar (n, LUstruct, ScalePermstruct, grid, X, m_loc, + fst_row, ldb, nrhs, SOLVEstruct, stat, info); +#else + pzgstrs(n, LUstruct, ScalePermstruct, grid, X, m_loc, + fst_row, ldb, nrhs, SOLVEstruct, stat, info); +#endif + + /* ------------------------------------------------------------ + Use iterative refinement to improve the computed solution and + compute error bounds and backward error estimates for it. + ------------------------------------------------------------ */ + if (options->IterRefine) + { + /* Improve the solution by iterative refinement. */ + int_t *it, *colind_gsmv = SOLVEstruct->A_colind_gsmv; + SOLVEstruct_t *SOLVEstruct1; /* Used by refinement. */ + + t = SuperLU_timer_ (); + if (options->RefineInitialized == NO || Fact == DOFACT) { + /* All these cases need to re-initialize gsmv structure */ + if (options->RefineInitialized) + pdgsmv_finalize (SOLVEstruct->gsmv_comm); + pzgsmv_init (A, SOLVEstruct->row_to_proc, grid, + SOLVEstruct->gsmv_comm); + + /* Save a copy of the transformed local col indices + in colind_gsmv[]. 
*/
+                if (colind_gsmv) SUPERLU_FREE (colind_gsmv);
+                if (!(it = intMalloc_dist (nnz_loc)))
+                    ABORT ("Malloc fails for colind_gsmv[]");
+                colind_gsmv = SOLVEstruct->A_colind_gsmv = it;
+                for (i = 0; i < nnz_loc; ++i) colind_gsmv[i] = colind[i];
+                options->RefineInitialized = YES;
+            }
+            else if (Fact == SamePattern || Fact == SamePattern_SameRowPerm) {
+                doublecomplex at;
+                int_t k, jcol, p;
+                /* Swap to beginning the part of A corresponding to the
+                   local part of X, as was done in pzgsmv_init() */
+                for (i = 0; i < m_loc; ++i) { /* Loop through each row */
+                    k = rowptr[i];
+                    for (j = rowptr[i]; j < rowptr[i + 1]; ++j)
+                    {
+                        jcol = colind[j];
+                        p = SOLVEstruct->row_to_proc[jcol];
+                        if (p == iam)
+                        {   /* Local */
+                            at = a[k];
+                            a[k] = a[j];
+                            a[j] = at;
+                            ++k;
+                        }
+                    }
+                }
+
+                /* Re-use the local col indices of A obtained from the
+                   previous call to pzgsmv_init() */
+                for (i = 0; i < nnz_loc; ++i)
+                    colind[i] = colind_gsmv[i];
+            }
+
+            if (nrhs == 1)
+            {   /* Use the existing solve structure */
+                SOLVEstruct1 = SOLVEstruct;
+            }
+            else
+            {   /* For nrhs > 1, since refinement is performed for one RHS
+                   at a time, the communication structure for pzgstrs is
+                   different from that of the solve with nrhs right-hand
+                   sides. So we use SOLVEstruct1 for the refinement step.
+                 */
+                if (!(SOLVEstruct1 = (SOLVEstruct_t *)
+                      SUPERLU_MALLOC (sizeof (SOLVEstruct_t))))
+                    ABORT ("Malloc fails for SOLVEstruct1");
+                /* Copy the same stuff */
+                SOLVEstruct1->row_to_proc = SOLVEstruct->row_to_proc;
+                SOLVEstruct1->inv_perm_c = SOLVEstruct->inv_perm_c;
+                SOLVEstruct1->num_diag_procs = SOLVEstruct->num_diag_procs;
+                SOLVEstruct1->diag_procs = SOLVEstruct->diag_procs;
+                SOLVEstruct1->diag_len = SOLVEstruct->diag_len;
+                SOLVEstruct1->gsmv_comm = SOLVEstruct->gsmv_comm;
+                SOLVEstruct1->A_colind_gsmv = SOLVEstruct->A_colind_gsmv;
+
+                /* Initialize the *gstrs_comm for 1 RHS. */
+                if (!(SOLVEstruct1->gstrs_comm = (pxgstrs_comm_t *)
+                      SUPERLU_MALLOC (sizeof (pxgstrs_comm_t))))
+                    ABORT ("Malloc fails for gstrs_comm[]");
+                pxgstrs_init (n, m_loc, 1, fst_row, perm_r, perm_c, grid,
+                              Glu_persist, SOLVEstruct1);
+            }
+
+            pzgsrfs (n, A, anorm, LUstruct, ScalePermstruct, grid,
+                     B, ldb, X, ldx, nrhs, SOLVEstruct1, berr, stat, info);
+
+            /* Deallocate the storage associated with SOLVEstruct1 */
+            if (nrhs > 1)
+            {
+                pxgstrs_finalize (SOLVEstruct1->gstrs_comm);
+                SUPERLU_FREE (SOLVEstruct1);
+            }
+
+            stat->utime[REFINE] = SuperLU_timer_ () - t;
+        }
+
+        /* Permute the solution matrix B <= Pc'*X. */
+        pzPermute_Dense_Matrix (fst_row, m_loc, SOLVEstruct->row_to_proc,
+                                SOLVEstruct->inv_perm_c,
+                                X, ldx, B, ldb, nrhs, grid);
+#if ( DEBUGlevel>=2 )
+        printf ("\n (%d) .. After pzPermute_Dense_Matrix(): b =\n", iam);
+        for (i = 0; i < m_loc; ++i)
+            printf ("\t(%d)\t%4d\t%.10f\t%.10f\n",
+                    iam, i + fst_row, B[i].r, B[i].i);
+#endif
+
+        /* Transform the solution matrix X to a solution of the original
+           system before the equilibration. */
+        if (notran)
+        {
+            if (colequ)
+            {
+                b_col = B;
+                for (j = 0; j < nrhs; ++j)
+                {
+                    irow = fst_row;
+                    for (i = 0; i < m_loc; ++i)
+                    {
+                        zd_mult(&b_col[i], &b_col[i], C[irow]);
+                        ++irow;
+                    }
+                    b_col += ldb;
+                }
+            }
+        }
+        else if (rowequ)
+        {
+            b_col = B;
+            for (j = 0; j < nrhs; ++j)
+            {
+                irow = fst_row;
+                for (i = 0; i < m_loc; ++i)
+                {
+                    zd_mult(&b_col[i], &b_col[i], R[irow]);
+                    ++irow;
+                }
+                b_col += ldb;
+            }
+        }
+
+        SUPERLU_FREE (b_work);
+        SUPERLU_FREE (X);
+
+    } /* end if nrhs != 0 */
+
+#if ( PRNTlevel>=1 )
+    if (!iam)
+        printf (".. DiagScale = %d\n", ScalePermstruct->DiagScale);
+#endif
+
+    /* Deallocate R and/or C if it was not used. */
+    if (Equil && Fact != SamePattern_SameRowPerm)
+    {
+        switch (ScalePermstruct->DiagScale)
+        {
+        case NOEQUIL:
+            SUPERLU_FREE (R);
+            SUPERLU_FREE (C);
+            break;
+        case ROW:
+            SUPERLU_FREE (C);
+            break;
+        case COL:
+            SUPERLU_FREE (R);
+            break;
+        }
+    }
+
+#if 0
+    if (!factored && Fact != SamePattern_SameRowPerm && !parSymbFact)
+        Destroy_CompCol_Permuted_dist (&GAC);
+#endif
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (iam, "Exit pzgssvx3d()");
+#endif
+
+    } /* process layer 0 */
+}
diff --git a/SRC/pzgstrf3d.c b/SRC/pzgstrf3d.c
new file mode 100644
index 00000000..6a0e842d
--- /dev/null
+++ b/SRC/pzgstrf3d.c
@@ -0,0 +1,314 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+#include "superlu_zdefs.h"
+#if 0
+#include "pdgstrf3d.h"
+#include "trfCommWrapper.h"
+#include "trfAux.h"
+//#include "load-balance/supernodal_etree.h"
+//#include "load-balance/supernodalForest.h"
+#include "supernodal_etree.h"
+#include "supernodalForest.h"
+#include "p3dcomm.h"
+#include "treeFactorization.h"
+#include "ancFactorization.h"
+#include "xtrf3Dpartition.h"
+#endif
+
+#ifdef MAP_PROFILE
+#include "mapsampler_api.h"
+#endif
+
+#ifdef GPU_ACC
+#include "lustruct_gpu.h"
+#include "acc_aux.c"
+#endif
+
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * PZGSTRF3D performs the LU factorization in parallel using a 3D process grid,
+ * employing a communication-avoiding algorithm relative to the 2D algorithm.
+ *
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_dist_options_t*
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *         The following field should be defined:
+ *         o ReplaceTinyPivot (yes_no_t)
+ *           Specifies whether to replace tiny diagonal entries by
+ *           sqrt(epsilon)*norm(A) during LU factorization.
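+ *           (In the code of pzgstrf3d below, the pivot threshold is
+ *           derived from anorm as thresh = smach_dist("Epsilon") * anorm.)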
+ *
+ * m      (input) int
+ *        Number of rows in the matrix.
+ *
+ * n      (input) int
+ *        Number of columns in the matrix.
+ *
+ * anorm  (input) double
+ *        The norm of the original matrix A, or the scaled A if
+ *        equilibration was done.
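+ *        (The driver computes this with the one-norm of A, or the
+ *        infinity-norm when a transposed system is being solved.)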
+ *
+ * trf3Dpartition (input) trf3Dpartition_t*
+ *        Matrix partitioning information for the 3D process grid.
+ *
+ * SCT    (input/output) SCT_t*
+ *        Various statistics of the 3D factorization.
+ *
+ * LUstruct (input/output) LUstruct_t*
+ *         The data structures to store the distributed L and U factors.
+ *         The following fields should be defined:
+ *
+ *         o Glu_persist (input) Glu_persist_t*
+ *           Global data structure (xsup, supno) replicated on all processes,
+ *           describing the supernode partition in the factored matrices
+ *           L and U:
+ *           xsup[s] is the leading column of the s-th supernode,
+ *           supno[i] is the supernode number to which column i belongs.
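+ *           (For example, three columns grouped into supernodes {0,1}
+ *           and {2} give xsup = [0, 2, 3] and supno = [0, 0, 1].)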
+ *
+ *         o Llu (input/output) LocalLU_t*
+ *           The distributed data structures to store L and U factors.
+ *           See superlu_zdefs.h for the definition of 'LocalLU_t'.
+ *
+ * grid3d (input) gridinfo3d_t*
+ *        The 3D process mesh. It contains the MPI communicator, the number
+ *        of process rows (NPROW), the number of process columns (NPCOL),
+ *        and the replication factor in the Z dimension. It is an input
+ *        argument to all the 3D parallel routines.
+ *        grid3d can be initialized by the routine SUPERLU_GRIDINIT3D.
+ *        See superlu_defs.h for the definition of 'gridinfo3d_t'.
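+ *        A minimal initialization sketch (assuming the C binding
+ *        superlu_gridinit3d of the routine named above):
+ *            gridinfo3d_t grid3d;
+ *            superlu_gridinit3d(MPI_COMM_WORLD, nprow, npcol, npdep,
+ *                               &grid3d);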
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics on runtime and floating-point operation count.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info   (output) int*
+ *        = 0: successful exit
+ *        < 0: if info = -i, the i-th argument had an illegal value
+ *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
+ *             been completed, but the factor U is exactly singular,
+ *             and division by zero will occur if it is used to solve a
+ *             system of equations.
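+ *
+ * A minimal calling sketch (error handling omitted; assumes options,
+ * grid3d, LUstruct, stat, anorm and trf3Dpartition have been set up as
+ * in the pzgssvx3d driver shown earlier in this patch):
+ *
+ *     SCT_t SCT;
+ *     SCT_init(&SCT);
+ *     int info = 0;
+ *     pzgstrf3d(&options, m, n, anorm, trf3Dpartition, &SCT,
+ *               &LUstruct, &grid3d, &stat, &info);
+ *     SCT_print(&(grid3d.grid2d), &SCT);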
+ * </pre>
+ */ +int_t pzgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm, + trf3Dpartition_t* trf3Dpartition, SCT_t *SCT, + LUstruct_t *LUstruct, gridinfo3d_t * grid3d, + SuperLUStat_t *stat, int *info) +{ + gridinfo_t* grid = &(grid3d->grid2d); + LocalLU_t *Llu = LUstruct->Llu; + + // problem specific contants + int_t ldt = sp_ienv_dist (3); /* Size of maximum supernode */ + // double s_eps = slamch_ ("Epsilon"); -Sherry + double s_eps = smach_dist("Epsilon"); + double thresh = s_eps * anorm; + + // initilize stat + stat->ops[FACT] = 0; + //if (!grid3d->zscp.Iam && !grid3d->iam) printf("Using NSUP=%d\n", (int) ldt); + + //getting Nsupers + int_t nsupers = getNsupers(n, LUstruct); + + // Grid related Variables + int_t iam = grid->iam; // in 2D grid + int num_threads = getNumThreads(grid3d->iam); + + diagFactBufs_t dFBuf; + zinitDiagFactBufs(ldt, &dFBuf); + + factStat_t factStat; + initFactStat(nsupers, &factStat); + + commRequests_t comReqs; + initCommRequests(&comReqs, grid); + + SCT->tStartup = SuperLU_timer_(); + packLUInfo_t packLUInfo; + initPackLUInfo(nsupers, &packLUInfo); + + scuBufs_t scuBufs; + zinitScuBufs(ldt, num_threads, nsupers, &scuBufs, LUstruct, grid); + + msgs_t msgs; + initMsgs(&msgs); + + factNodelists_t fNlists; + initFactNodelists( ldt, num_threads, nsupers, &fNlists); + + // tag_ub initialization + int tag_ub = set_tag_ub(); + int_t maxLvl = log2i(grid3d->zscp.Np) + 1; + + // trf3Dpartition_t* trf3Dpartition = initTrf3Dpartition(nsupers, options, LUstruct, grid3d); + gEtreeInfo_t gEtreeInfo = trf3Dpartition->gEtreeInfo; + int_t* iperm_c_supno = trf3Dpartition->iperm_c_supno; + int_t* myNodeCount = trf3Dpartition->myNodeCount; + int_t* myTreeIdxs = trf3Dpartition->myTreeIdxs; + int_t* myZeroTrIdxs = trf3Dpartition->myZeroTrIdxs; + sForest_t** sForests = trf3Dpartition->sForests; + int_t** treePerm = trf3Dpartition->treePerm ; + LUValSubBuf_t *LUvsb = trf3Dpartition->LUvsb; + /*Initializing factorization specific buffers*/ + + int_t numLA = getNumLookAhead(options); + LUValSubBuf_t**LUvsbs = zLluBufInitArr( SUPERLU_MAX( numLA, grid3d->zscp.Np ), LUstruct); + msgs_t**msgss = initMsgsArr(numLA); + int_t mxLeafNode = 0; + for (int ilvl = 0; ilvl < maxLvl; ++ilvl) + { + /* code */ + if (sForests[myTreeIdxs[ilvl]] && sForests[myTreeIdxs[ilvl]]->topoInfo.eTreeTopLims[1] > mxLeafNode ) + mxLeafNode = sForests[myTreeIdxs[ilvl]]->topoInfo.eTreeTopLims[1]; + } + diagFactBufs_t** dFBufs = zinitDiagFactBufsArr(mxLeafNode, ldt, grid); + commRequests_t** comReqss = initCommRequestsArr(SUPERLU_MAX(mxLeafNode, numLA), + ldt, grid); + + /*setting up GPU related stuff*/ + + int_t first_l_block_acc = 0; + int_t first_u_block_acc = 0; + int_t Pc = grid->npcol; + int_t Pr = grid->nprow; + int_t mrb = (nsupers + Pr - 1) / Pr; + int_t mcb = (nsupers + Pc - 1) / Pc; + HyP_t *HyP = (HyP_t *) malloc(sizeof(HyP_t)); + Init_HyP(HyP, Llu, mcb, mrb); + HyP->first_l_block_acc = first_l_block_acc; + HyP->first_u_block_acc = first_u_block_acc; + int_t bigu_size = getBigUSize(nsupers, grid, LUstruct); + // int_t buffer_size = get_max_buffer_size (); + // HyP->buffer_size = buffer_size; + HyP->bigu_size = bigu_size; + HyP->nsupers = nsupers; + +#ifdef GPU_ACC + + /*Now initialize the GPU data structure*/ + LUstruct_gpu *A_gpu, *dA_gpu; + + d2Hreduce_t d2HredObj; + d2Hreduce_t* d2Hred = &d2HredObj; + sluGPU_t sluGPUobj; + sluGPU_t *sluGPU = &sluGPUobj; + sluGPU->isNodeInMyGrid = getIsNodeInMyGrid(nsupers, maxLvl, myNodeCount, treePerm); + if (superlu_acc_offload) + { + /*Initilize the lookup 
tables */ + LookUpTableInit(iam); + acc_async_cost = get_acc_async_cost(); +#ifdef GPU_DEBUG + if (!iam) printf("Using MIC async cost of %lf \n", acc_async_cost); +#endif + + initSluGPU3D_t(sluGPU, LUstruct, grid3d, perm_c_supno, + n, buffer_size, bigu_size, ldt); + + HyP->first_u_block_acc = sluGPU->A_gpu->first_u_block_gpu; + HyP->first_l_block_acc = sluGPU->A_gpu->first_l_block_gpu; + HyP->nCudaStreams = sluGPU->nCudaStreams; + } + +#endif // GPU_ACC + + /*==== starting main factorization loop =====*/ + MPI_Barrier( grid3d->comm); + SCT->tStartup = SuperLU_timer_() - SCT->tStartup; + // int_t myGrid = grid3d->zscp.Iam; + +#ifdef ITAC_PROF + VT_traceon(); +#endif +#ifdef MAP_PROFILE + allinea_start_sampling(); +#endif + SCT->pdgstrfTimer = SuperLU_timer_(); + + for (int_t ilvl = 0; ilvl < maxLvl; ++ilvl) + { + /* if I participate in this level */ + if (!myZeroTrIdxs[ilvl]) + { + //int_t tree = myTreeIdxs[ilvl]; + + sForest_t* sforest = sForests[myTreeIdxs[ilvl]]; + + /*main loop over all the super nodes*/ + if (sforest) + { + double tilvl = SuperLU_timer_(); +#ifdef GPU_ACC + zsparseTreeFactor_ASYNC_GPU( + sforest, + comReqss, &scuBufs, &packLUInfo, + msgss, LUvsbs, dFBufs, &factStat, &fNlists, + &gEtreeInfo, options, iperm_c_supno, ldt, + sluGPU, d2Hred, HyP, LUstruct, grid3d, stat, + thresh, SCT, tag_ub, info); +#else + zsparseTreeFactor_ASYNC(sforest, comReqss, &scuBufs, &packLUInfo, + msgss, LUvsbs, dFBufs, &factStat, &fNlists, + &gEtreeInfo, options, iperm_c_supno, ldt, + HyP, LUstruct, grid3d, stat, + thresh, SCT, tag_ub, info ); +#endif + + /*now reduce the updates*/ + SCT->tFactor3D[ilvl] = SuperLU_timer_() - tilvl; + sForests[myTreeIdxs[ilvl]]->cost = SCT->tFactor3D[ilvl]; + } + + if (ilvl < maxLvl - 1) /*then reduce before factorization*/ + { +#ifdef GPU_ACC + zreduceAllAncestors3d_GPU( + ilvl, myNodeCount, treePerm, LUvsb, + LUstruct, grid3d, sluGPU, d2Hred, &factStat, HyP, + SCT ); +#else + + zreduceAllAncestors3d(ilvl, myNodeCount, treePerm, + LUvsb, LUstruct, grid3d, SCT ); +#endif + + } + } /*if (!myZeroTrIdxs[ilvl]) ... If I participate in this level*/ + + SCT->tSchCompUdt3d[ilvl] = ilvl == 0 ? 
SCT->NetSchurUpTimer + : SCT->NetSchurUpTimer - SCT->tSchCompUdt3d[ilvl - 1]; + } /*for (int_t ilvl = 0; ilvl < maxLvl; ++ilvl)*/ + + MPI_Barrier( grid3d->comm); + SCT->pdgstrfTimer = SuperLU_timer_() - SCT->pdgstrfTimer; + +#ifdef ITAC_PROF + VT_traceoff(); +#endif + +#ifdef MAP_PROFILE + allinea_stop_sampling(); +#endif + + reduceStat(FACT, stat, grid3d); + + return 0; + +} /* pzgstrf3d */ diff --git a/SRC/scatter.h b/SRC/scatter.h new file mode 100644 index 00000000..568a1687 --- /dev/null +++ b/SRC/scatter.h @@ -0,0 +1,147 @@ +#ifndef _SCATTER_H_ +#define _SCATTER_H_ + +#ifdef CLEAN_SCATTER +#define SCATTER_L_CPU scatter_l +#define SCATTER_U_CPU scatter_u +#else +#define SCATTER_L_CPU scatter_l +#define SCATTER_U_CPU scatter_u + +#endif + +void +scatter_l (int_t ib, + int_t ljb, + int_t nsupc, + int_t iukp, + int_t *xsup, + int_t klst, + int_t nbrow, + int_t lptr, + int_t temp_nbrow, + int_t *usub, + int_t *lsub, + double *tempv, + int_t *indirect_thread, int_t *indirect2, + int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr, gridinfo_t *grid); + +void +scatter_u (int_t ib, + int_t jb, + int_t nsupc, + int_t iukp, + int_t *xsup, + int_t klst, + int_t nbrow, + int_t lptr, + int_t temp_nbrow, + int_t *lsub, + int_t *usub, + double *tempv, + int_t *indirect, + int_t **Ufstnz_br_ptr, double **Unzval_br_ptr, gridinfo_t *grid); + +void +arrive_at_ublock (int_t j, //block number + int_t *iukp, // output + int_t *rukp, int_t *jb, /* Global block number of block U(k,j). */ + int_t *ljb, /* Local block number of U(k,j). */ + int_t *nsupc, /*supernode size of destination block */ + int_t iukp0, //input + int_t rukp0, int_t *usub, /*usub scripts */ + int_t *perm_u, /*permutation matrix */ + int_t *xsup, /*for SuperSize and LBj */ + gridinfo_t *grid); + + +void +block_gemm_scatter( int_t lb, int_t j, + Ublock_info_t *Ublock_info, + Remain_info_t *Remain_info, + double *L_mat, int_t ldl, + double *U_mat, int_t ldu, + double *bigV, + // int_t jj0, + int_t knsupc, int_t klst, + int_t *lsub, int_t *usub, int_t ldt, + int_t thread_id, + int_t *indirect, + int_t *indirect2, + int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr, + int_t **Ufstnz_br_ptr, double **Unzval_br_ptr, + int_t *xsup, gridinfo_t *grid, + SuperLUStat_t *stat +#ifdef SCATTER_PROFILE + , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer +#endif + ); + + +/*this version uses a lock to prevent multiple thread updating the same block*/ +void +block_gemm_scatter_lock( int_t lb, int_t j, + omp_lock_t* lock, + Ublock_info_t *Ublock_info, + Remain_info_t *Remain_info, + double *L_mat, int_t ldl, + double *U_mat, int_t ldu, + double *bigV, + // int_t jj0, + int_t knsupc, int_t klst, + int_t *lsub, int_t *usub, int_t ldt, + int_t thread_id, + int_t *indirect, + int_t *indirect2, + int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr, + int_t **Ufstnz_br_ptr, double **Unzval_br_ptr, + int_t *xsup, gridinfo_t *grid +#ifdef SCATTER_PROFILE + , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer +#endif + ); + +int_t block_gemm_scatterTopLeft( int_t lb, int_t j, + double* bigV, int_t knsupc, int_t klst, int_t* lsub, + int_t * usub, int_t ldt, int_t* indirect, int_t* indirect2, + HyP_t* HyP, + LUstruct_t *LUstruct, + gridinfo_t* grid, + SCT_t*SCT, SuperLUStat_t *stat + ); +int_t block_gemm_scatterTopRight( int_t lb, int_t j, + double* bigV, int_t knsupc, int_t klst, int_t* lsub, + int_t * usub, int_t ldt, int_t* indirect, int_t* indirect2, + HyP_t* HyP, + LUstruct_t *LUstruct, + gridinfo_t* grid, + SCT_t*SCT, SuperLUStat_t *stat + ); +int_t 
block_gemm_scatterBottomLeft( int_t lb, int_t j, + double* bigV, int_t knsupc, int_t klst, int_t* lsub, + int_t * usub, int_t ldt, int_t* indirect, int_t* indirect2, + HyP_t* HyP, + LUstruct_t *LUstruct, + gridinfo_t* grid, + SCT_t*SCT, SuperLUStat_t *stat + ); +int_t block_gemm_scatterBottomRight( int_t lb, int_t j, + double* bigV, int_t knsupc, int_t klst, int_t* lsub, + int_t * usub, int_t ldt, int_t* indirect, int_t* indirect2, + HyP_t* HyP, + LUstruct_t *LUstruct, + gridinfo_t* grid, + SCT_t*SCT, SuperLUStat_t *stat + ); + +void gather_u(int_t num_u_blks, + Ublock_info_t *Ublock_info, int_t * usub, + double *uval, double *bigU, int_t ldu, + int_t *xsup, int_t klst /* for SuperSize */ + ); + +void gather_l( int_t num_LBlk, int_t knsupc, + Remain_info_t *L_info, + double * lval, int_t LD_lval, + double * L_buff ); +#endif diff --git a/SRC/sec_structs.c b/SRC/sec_structs.c new file mode 100644 index 00000000..162e20b2 --- /dev/null +++ b/SRC/sec_structs.c @@ -0,0 +1,654 @@ +#include +#include "superlu_ddefs.h" +#if 0 +#include "sec_structs.h" +#include /*for printfs*/ +#include /*for Qsort */ +#include +#include +#include /*for sqrt*/ +#include +#include "compiler.h" +//#include "load-balance/supernodal_etree.h" +#include "supernodal_etree.h" +#endif + +double CPU_CLOCK_RATE; +/*for sorting structures */ +int Cmpfunc_R_info (const void * a, const void * b) +{ + return ( ((Remain_info_t*)a)->nrows - ((Remain_info_t*)b)->nrows ); +} + + + +int Cmpfunc_U_info (const void * a, const void * b) +{ + return ( ((Ublock_info_t*)a)->ncols - ((Ublock_info_t*)b)->ncols ); +} + + +int sort_R_info( Remain_info_t* Remain_info, int n ) +{ + qsort((void *) Remain_info , n , sizeof(Remain_info_t), Cmpfunc_R_info); + + return 0; +} + +int sort_U_info( Ublock_info_t* Ublock_info, int n ) +{ + qsort((void *) Ublock_info , n , sizeof(Ublock_info_t), Cmpfunc_U_info); + + return 0; +} + +int Cmpfunc_R_info_elm (const void * a, const void * b) +{ + return ( ((Remain_info_t*)a)->eo - ((Remain_info_t*)b)->eo ); +} + + + +int Cmpfunc_U_info_elm (const void * a, const void * b) +{ + return ( ((Ublock_info_t*)a)->eo - ((Ublock_info_t*)b)->eo ); +} + + + +int sort_R_info_elm( Remain_info_t* Remain_info, int n ) +{ + /*sorts on the basis of order of elimination*/ + qsort((void *) Remain_info , n , sizeof(Remain_info_t), Cmpfunc_R_info_elm); + + return 0; +} + +int sort_U_info_elm( Ublock_info_t* Ublock_info, int n ) +{ + qsort((void *) Ublock_info , n , sizeof(Ublock_info_t), Cmpfunc_U_info_elm); + + return 0; +} + +double *SCT_ThreadVarInit(int_t num_threads) +{ +#if 0 + double *var = (double *) _mm_malloc(num_threads * CACHE_LINE_SIZE * sizeof(double), 64); +#else + double *var = (double *) doubleMalloc_dist(num_threads * CACHE_LINE_SIZE); +#endif + for (int_t i = 0; i < num_threads * CACHE_LINE_SIZE; ++i) + { + var[i] = 0.0; + } + return var; +} + + +#define DEFAULT_CPU_FREQ 3000.0 // 3 GHz + +double getFreq(void) +{ + FILE *fp = fopen("/proc/cpuinfo", "rb"); + if(!fp) { + // the file /proc/cpuinfo doesn't exists, return 3000 Mhz as the frequency +#if ( PRNTlevel>=2 ) + printf("/proc/cpuinfo doesn't exists, using 3GHz as CPU frequency. 
Some timers will not be correct\n"); +#endif + return DEFAULT_CPU_FREQ; + } + + char *arg = 0; + char *line = NULL; + size_t len = 0; + size_t read; + while ((read = getline(&line, &len, fp)) != -1) + { + // printf("%s", line); + char * pch; + pch = strtok (line, " \t:"); + if (pch != NULL && strcmp(pch, "cpu") == 0) + { + + /* code */ + pch = strtok (NULL, " \t:"); + // printf("%s\n", pch ); + if (pch != NULL && strcmp(pch, "MHz") == 0) + { + pch = strtok (NULL, " \t:"); + double freq = atof(pch); + free(arg); + fclose(fp); + return freq; + + break; + } + } + + } + return 0; +} + +void SCT_init(SCT_t* SCT) +{ +#if 1 + CPU_CLOCK_RATE = getFreq() * 1e-3; +#else + CPU_CLOCK_RATE = 3000. * 1e-3; +#endif + int num_threads; + #pragma omp parallel default(shared) + { + #pragma omp master + { + num_threads = omp_get_num_threads (); + } + } + + SCT->acc_load_imbal = 0.0; + + /* Counter for couting memory operations */ + SCT->scatter_mem_op_counter = 0.0; + SCT->scatter_mem_op_timer = 0.0; +#ifdef SCATTER_PROFILE + SCT->Host_TheadScatterMOP = (double *)_mm_malloc(sizeof(double) * (num_threads * (192 / 8) * (192 / 8)), 64); + SCT->Host_TheadScatterTimer = (double *)_mm_malloc(sizeof(double) * (num_threads * (192 / 8) * (192 / 8)), 64); + memset(SCT->Host_TheadScatterMOP, 0, sizeof(double) * (num_threads * (192 / 8) * (192 / 8))); + memset(SCT->Host_TheadScatterTimer, 0, sizeof(double) * (num_threads * (192 / 8) * (192 / 8))); +#endif + + SCT->LookAheadRowSepTimer = 0.0; + SCT->LookAheadRowSepMOP = 0.0; + SCT->GatherTimer = 0.0; + SCT->GatherMOP = 0.0; + SCT->LookAheadGEMMTimer = 0.0; + SCT->LookAheadGEMMFlOp = 0.0; + SCT->LookAheadScatterTimer = 0.0; + SCT->LookAheadScatterMOP = 0.0; + SCT->AssemblyTimer = 0.0; + + SCT->offloadable_flops = 0.0; + SCT->offloadable_mops = 0.0; + + SCT->SchurCompUdtThreadTime; +#if 0 + SCT->SchurCompUdtThreadTime = (double *) _mm_malloc(num_threads * CACHE_LINE_SIZE * sizeof(double), 64); +#else + SCT->SchurCompUdtThreadTime = (double *) doubleMalloc_dist(num_threads * CACHE_LINE_SIZE); +#endif + + for (int_t i = 0; i < num_threads * CACHE_LINE_SIZE; ++i) + { + SCT->SchurCompUdtThreadTime[i] = 0.0; + } + + SCT->schur_flop_counter = 0.0; + SCT->schur_flop_timer = 0.0; + + SCT->datatransfer_timer = 0; + SCT->schurPhiCallTimer = 0; + SCT->schurPhiCallCount = 0; + SCT->datatransfer_count = 0; + SCT->PhiWaitTimer = 0; + SCT->PhiWaitTimer_2 = 0; + SCT->NetSchurUpTimer = 0; + SCT->PhiMemCpyTimer = 0; + SCT->PhiMemCpyCounter = 0; + + SCT->pdgstrs2_timer = 0.0; + SCT->trf2_flops = 0; + SCT->trf2_time = 0; + SCT->CPUOffloadTimer = 0; + SCT->pdgstrf2_timer = 0.0; + SCT->lookaheadupdatetimer = 0; + + /* diagonal block factorization; part of pdgstrf2*/ + // SCT->Local_Dgstrf2_tl = 0; + SCT->Local_Dgstrf2_Thread_tl = SCT_ThreadVarInit(num_threads); + /*Wait for U diagnal bloc kto receive; part of pdgstrf2 */ + SCT->Wait_UDiagBlock_Recv_tl = 0; + /*wait for receiving L diagonal block: part of mpf*/ + SCT->Wait_LDiagBlock_Recv_tl = 0; + SCT->Recv_UDiagBlock_tl = 0; + /*wait for previous U block send to finish; part of pdgstrf2 */ + SCT->Wait_UDiagBlockSend_tl = 0; + /*after obtaining U block, time spent in calculating L panel;part of pdgstrf2*/ + SCT->L_PanelUpdate_tl = 0; + /*Synchronous Broadcasting U panel*/ + SCT->Bcast_UPanel_tl = 0; + SCT->Bcast_LPanel_tl = 0; + /*Wait for L send to finish */ + SCT->Wait_LSend_tl = 0; + + /*Wait for U send to finish */ + SCT->Wait_USend_tl = 0; + /*Wait for U receive */ + SCT->Wait_URecv_tl = 0; + /*Wait for L receive */ + SCT->Wait_LRecv_tl = 
0; + + /*U_panelupdate*/ + SCT->PDGSTRS2_tl = 0; + + /*profiling by phases*/ + SCT->Phase_Factor_tl = 0; + SCT->Phase_LU_Update_tl = 0; + SCT->Phase_SC_Update_tl = 0; + + /*time to get the lock*/ + SCT->GetAijLock_Thread_tl = SCT_ThreadVarInit(num_threads); + + /*3d timers*/ + SCT->ancsReduce = 0.0; + SCT->gatherLUtimer = 0.0; + + for (int i = 0; i < MAX_3D_LEVEL; ++i) + { + /* code */ + SCT->tFactor3D[i] = 0; + SCT->tSchCompUdt3d[i] = 0; + } + + SCT->tAsyncPipeTail = 0.0; + SCT->tStartup =0.0; + + SCT->commVolFactor =0.0; + SCT->commVolRed =0.0; +} /* SCT_init */ + +void SCT_free(SCT_t* SCT) +{ +#ifdef SCATTER_PROFILE + free(SCT->Host_TheadScatterMOP); + free(SCT->Host_TheadScatterTimer); +#endif + _mm_free(SCT->SchurCompUdtThreadTime); + _mm_free(SCT->Local_Dgstrf2_Thread_tl); + _mm_free(SCT->GetAijLock_Thread_tl); +} + + +void DistPrint(char* function_name, double value, char* Units, gridinfo_t* grid) +/* +Prints average of the value across all the MPI ranks; +Displays as function_name \t value \t units; +*/ +{ + int iam = grid->iam; + int num_procs = grid->nprow * grid->npcol; + double sum; + double min = 0; + double max = 0; + double value_squared = value * value; + double sum_value_squared; + + MPI_Reduce( &value, &sum, 1, MPI_DOUBLE, MPI_SUM, 0, grid->comm ); + MPI_Reduce( &value, &min, 1, MPI_DOUBLE, MPI_MIN, 0, grid->comm ); + MPI_Reduce( &value, &max, 1, MPI_DOUBLE, MPI_MAX, 0, grid->comm ); + MPI_Reduce( &value_squared, &sum_value_squared, 1, MPI_DOUBLE, MPI_SUM, 0, grid->comm ); + double std_dev = sqrt((sum_value_squared - (sum * sum / num_procs) ) / num_procs); + if (!iam) + { + printf("|%s \t| %10.4f \t| %10.4f \t| %10.4f \t| %10.4f%%| %s|\n", function_name, + sum / num_procs, min, max, 100 * num_procs * std_dev / sum, Units ); + // printf("%s \t %lf %s\n", function_name, value, Units ); + } + +} + +void DistPrint3D(char* function_name, double value, char* Units, gridinfo3d_t* grid3d) +/* +Prints average of the value across all the MPI ranks; +Displays as function_name \t value \t units; +*/ +{ + int iam = grid3d->iam; + int num_procs = grid3d->nprow * grid3d->npcol * grid3d->npdep; + double sum; + double min = 0; + double max = 0; + double value_squared = value * value; + double sum_value_squared; + + MPI_Reduce( &value, &sum, 1, MPI_DOUBLE, MPI_SUM, 0, grid3d->comm ); + MPI_Reduce( &value, &min, 1, MPI_DOUBLE, MPI_MIN, 0, grid3d->comm ); + MPI_Reduce( &value, &max, 1, MPI_DOUBLE, MPI_MAX, 0, grid3d->comm ); + MPI_Reduce( &value_squared, &sum_value_squared, 1, MPI_DOUBLE, MPI_SUM, 0, grid3d->comm ); + double std_dev = sqrt((sum_value_squared - (sum * sum / num_procs) ) / num_procs); + if (!iam) + { + printf("|%s \t| %10.4f \t| %10.4f \t| %10.4f \t| %10.4f%%| %s|\n", function_name, + sum / num_procs, min, max, 100 * num_procs * std_dev / sum, Units ); + // printf("%s \t %lf %s\n", function_name, value, Units ); + } + +} + +void DistPrintMarkupHeader(char* headerTitle, double value, gridinfo_t* grid) +{ + + int iam = grid->iam; + int num_procs = grid->nprow * grid->npcol; + double sum; + double min = 0; + double max = 0; + double value_squared = value * value; + double sum_value_squared; + + MPI_Reduce( &value, &sum, 1, MPI_DOUBLE, MPI_SUM, 0, grid->comm ); + MPI_Reduce( &value, &min, 1, MPI_DOUBLE, MPI_MIN, 0, grid->comm ); + MPI_Reduce( &value, &max, 1, MPI_DOUBLE, MPI_MAX, 0, grid->comm ); + MPI_Reduce( &value_squared, &sum_value_squared, 1, MPI_DOUBLE, MPI_SUM, 0, grid->comm ); + + if (!iam) + { + printf("#### %s : %10.4f \n\n", headerTitle,sum / num_procs ); + 
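+        /* Markdown-style table header; the DistPrint calls that follow
+           supply one row per timer. */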
printf("|Function name \t| avg \t| min \t| max \t| std-dev| units|\n"); + printf("|---|---|---|---|---|---|\n"); + // printf("%s \t %lf %s\n", function_name, value, Units ); + } + +} +void DistPrintThreaded(char* function_name, double* value, double Norm, int_t num_threads, char* Units, gridinfo_t* grid) +/* +Prints average of the value across all the MPI ranks, for threaded variables; +First averages over all the threads; +Norm is normalizing constant +Displays as function_name \t value \t units; +*/ +{ + int iam = grid->iam; + int num_procs = grid->nprow * grid->npcol; + double local_sum = 0; + for (int i = 0; i < num_threads ; ++i) + { + local_sum += value[i * CACHE_LINE_SIZE]; + } + + local_sum = local_sum / (Norm * num_threads); + double sum; + double min = 0; + double max = 0; + double value_squared = local_sum * local_sum; + double sum_value_squared; + + MPI_Reduce( &local_sum, &sum, 1, MPI_DOUBLE, MPI_SUM, 0, grid->comm ); + MPI_Reduce( &local_sum, &min, 1, MPI_DOUBLE, MPI_MIN, 0, grid->comm ); + MPI_Reduce( &local_sum, &max, 1, MPI_DOUBLE, MPI_MAX, 0, grid->comm ); + MPI_Reduce( &value_squared, &sum_value_squared, 1, MPI_DOUBLE, MPI_SUM, 0, grid->comm ); + double std_dev = sqrt((sum_value_squared - (sum * sum / num_procs) ) / num_procs); + if (!iam) + { + printf("|%s \t| %10.4f \t| %10.4f \t| %10.4f \t| %10.4f%% %s|\n", function_name, + sum / num_procs, min, max, 100 * num_procs * std_dev / sum, Units ); + // printf("%s \t %lf %s\n", function_name, value, Units ); + } +} + + +/*for mkl_get_blocks_frequency*/ +// #include "mkl.h" +void SCT_print(gridinfo_t *grid, SCT_t* SCT) +{ + int num_threads; + #pragma omp parallel default(shared) + { + #pragma omp master + { + num_threads = omp_get_num_threads (); + } + } + CPU_CLOCK_RATE = 1e9 * CPU_CLOCK_RATE; + + int iam = grid->iam; + int_t num_procs = grid->npcol * grid->nprow; + double temp_holder; + MPI_Reduce( &SCT->NetSchurUpTimer, &temp_holder, 1, MPI_DOUBLE, MPI_SUM, 0, grid->comm ); + if (!iam) + { + printf("CPU_CLOCK_RATE %.1f\n", CPU_CLOCK_RATE ); + printf("Total time in factorization \t: %5.2lf\n", SCT->pdgstrfTimer); + printf("MPI-communication phase \t: %5.2lf\n", SCT->pdgstrfTimer - (temp_holder / num_procs)); + + } + + /* Printing Panel factorization profile*/ + // double CPU_CLOCK_RATE = 1e9 * mkl_get_clocks_frequency(); + + + // DistPrint("Local_Dgstrf2", SCT->Local_Dgstrf2_tl / CPU_CLOCK_RATE, "Seconds", grid); + // DistPrintThreaded( + // "Local_Dgstrf2 ", SCT->Local_Dgstrf2_Thread_tl, CPU_CLOCK_RATE, num_threads, + // "Seconds", grid); + + // DistPrint("Wait_UDiagBlock_Recv ", SCT->Wait_UDiagBlock_Recv_tl / CPU_CLOCK_RATE, "Seconds", grid); + // DistPrint("Wait_LDiagBlock_Recv ", SCT->Wait_LDiagBlock_Recv_tl / CPU_CLOCK_RATE, "Seconds", grid); + // DistPrint("Recv_UDiagBlock ", SCT->Recv_UDiagBlock_tl / CPU_CLOCK_RATE, "Seconds", grid); + // DistPrint("Wait_UDiagBlockSend ", SCT->Wait_UDiagBlockSend_tl / CPU_CLOCK_RATE, "Seconds", grid); + + // DistPrint("Bcast_UPanel ", SCT->Bcast_UPanel_tl / CPU_CLOCK_RATE, "Seconds", grid); + // DistPrint("Bcast_LPanel ", SCT->Bcast_LPanel_tl / CPU_CLOCK_RATE, "Seconds", grid); + DistPrint("Wait_LSend ", SCT->Wait_LSend_tl / CPU_CLOCK_RATE, "Seconds", grid); + DistPrint("Wait_USend ", SCT->Wait_USend_tl / CPU_CLOCK_RATE, "Seconds", grid); + DistPrint("Wait_URecv ", SCT->Wait_URecv_tl / CPU_CLOCK_RATE, "Seconds", grid); + DistPrint("Wait_LRecv ", SCT->Wait_LRecv_tl / CPU_CLOCK_RATE, "Seconds", grid); + DistPrint("L_PanelUpdate ", SCT->L_PanelUpdate_tl , "Seconds", grid); + 
DistPrint("PDGSTRS2 ", SCT->PDGSTRS2_tl , "Seconds", grid); + + DistPrint("wait-FunCallStream ", SCT->PhiWaitTimer , "Seconds", grid); + DistPrint("wait-copyStream ", SCT->PhiWaitTimer_2 , "Seconds", grid); + DistPrint("waitGPU2CPU ", SCT->PhiWaitTimer , "Seconds", grid); + DistPrint("SchurCompUpdate ", SCT->NetSchurUpTimer, "Seconds", grid); + DistPrint("PanelFactorization ", SCT->pdgstrfTimer - SCT->NetSchurUpTimer, "Seconds", grid); + + // DistPrint("Phase_Factor ", SCT->Phase_Factor_tl / CPU_CLOCK_RATE, "Seconds", grid); + // DistPrint("Phase_LU_Update ", SCT->Phase_LU_Update_tl / CPU_CLOCK_RATE, "Seconds", grid); + // DistPrint("Phase_SC_Update ", SCT->Phase_SC_Update_tl / CPU_CLOCK_RATE, "Seconds", grid); + // DistPrintThreaded( + // "GetAijLock ", SCT->GetAijLock_Thread_tl, CPU_CLOCK_RATE, num_threads, + // "Seconds", grid); + double t_total = SCT->tStartup + SCT->pdgstrfTimer + SCT->gatherLUtimer; + DistPrintMarkupHeader("High Level Time Breakdown", t_total, grid); + DistPrint("Startup ", SCT->tStartup, "Seconds", grid); + DistPrint("Main-Factor loop ", SCT->pdgstrfTimer, "Seconds", grid); + DistPrint("3D-GatherLU ", SCT->gatherLUtimer, "Seconds", grid); + DistPrint("tTotal ", t_total, "Seconds", grid); + + DistPrintMarkupHeader("Components of Factor Loop",SCT->pdgstrfTimer, grid); + DistPrint("3D-AncestorReduce ", SCT->ancsReduce, "Seconds", grid); + DistPrint("Pipeline Tail ", SCT->tAsyncPipeTail, "Seconds", grid); + +} + +void SCT_print3D(gridinfo3d_t *grid3d, SCT_t* SCT) +{ + + gridinfo_t* grid = &(grid3d->grid2d); + + char funName[100]; + + int_t maxLvl = log2i(grid3d->zscp.Np) + 1; + + for (int i = maxLvl-1; i >-1; --i) + { + /* code */ + sprintf( funName, "Grid-%d Factor:Level-%d ", grid3d->zscp.Iam, + (int) maxLvl-1-i); + DistPrint(funName, SCT->tFactor3D[i], "Seconds", grid); + // sprintf( funName, "SchurCU:Level-%d ", maxLvl-1-i); + // DistPrint(funName, SCT->tSchCompUdt3d[i], "Seconds", grid); + // sprintf( funName, "PanelFact:Level-%d ", maxLvl-1-i); + // DistPrint(funName, SCT->tFactor3D[i]-SCT->tSchCompUdt3d[i], "Seconds", grid); + } + +} + + +void treeImbalance3D(gridinfo3d_t *grid3d, SCT_t* SCT) +{ + + gridinfo_t* grid = &(grid3d->grid2d); + char funName[100]; + + int_t maxLvl = log2i(grid3d->zscp.Np) + 1; + + for (int i = maxLvl-1; i >-1; --i) + { + /* code */ + double tsum; + MPI_Reduce( &SCT->tFactor3D[i], &tsum, 1, MPI_DOUBLE, MPI_SUM, 0, grid3d->zscp.comm ); + + double tmax; + MPI_Reduce( &SCT->tFactor3D[i], &tmax, 1, MPI_DOUBLE, MPI_MAX, 0, grid3d->zscp.comm ); + + double tavg = tsum /(grid3d->zscp.Np>>i); + double lLmb = 100*(tmax-tavg)/tavg; + sprintf( funName, "Imbalance Factor:Level-%d ", (int) maxLvl-1-i); + if(!grid3d->zscp.Iam) + DistPrint(funName, lLmb, "Seconds", grid); + // sprintf( funName, "SchurCU:Level-%d ", maxLvl-1-i); + // DistPrint(funName, SCT->tSchCompUdt3d[i], "Seconds", grid); + // sprintf( funName, "PanelFact:Level-%d ", maxLvl-1-i); + // DistPrint(funName, SCT->tFactor3D[i]-SCT->tSchCompUdt3d[i], "Seconds", grid); + } + +} + + +void SCT_printComm3D(gridinfo3d_t *grid3d, SCT_t* SCT) +{ + // + double cvolFactor; + MPI_Reduce( &SCT->commVolFactor, &cvolFactor, 1, MPI_DOUBLE, MPI_SUM, 0, grid3d->comm ); + double cvolRed; + MPI_Reduce( &SCT->commVolRed, &cvolRed, 1, MPI_DOUBLE, MPI_SUM, 0, grid3d->comm ); + + int_t Np = (grid3d->npcol) * (grid3d->nprow) * (grid3d->npdep); + if (!grid3d->iam) + { + /* code */ + printf("| commVolRed | %g | %g |\n", cvolRed, cvolRed/Np ); + printf("| commVolFactor | %g | %g |\n", cvolFactor, cvolFactor/Np ); + 
} + +} + +int +get_acc_offload () +{ + char *ttemp; + ttemp = getenv ("SUPERLU_ACC_OFFLOAD"); + + if (ttemp) + return atoi (ttemp); + else + return 0; +} + + +void Init_HyP(HyP_t* HyP, LocalLU_t *Llu, int_t mcb, int_t mrb ) +{ + HyP->last_offload = -1; +#if 0 + HyP->lookAhead_info = (Remain_info_t *) _mm_malloc((mrb) * sizeof(Remain_info_t), 64); + + HyP->lookAhead_L_buff = (double *) _mm_malloc( sizeof(double) * (Llu->bufmax[1]), 64); + + HyP->Remain_L_buff = (double *) _mm_malloc( sizeof(double) * (Llu->bufmax[1]), 64); + HyP->Remain_info = (Remain_info_t *) _mm_malloc(mrb * sizeof(Remain_info_t), 64); + HyP->Ublock_info_Phi = (Ublock_info_t *) _mm_malloc(mcb * sizeof(Ublock_info_t), 64); + HyP->Ublock_info = (Ublock_info_t *) _mm_malloc(mcb * sizeof(Ublock_info_t), 64); + HyP->Lblock_dirty_bit = (int_t *) _mm_malloc(mcb * sizeof(int_t), 64); +#else + HyP->lookAhead_info = (Remain_info_t *) SUPERLU_MALLOC((mrb) * sizeof(Remain_info_t)); + HyP->lookAhead_L_buff = (double *) doubleMalloc_dist((Llu->bufmax[1])); + HyP->Remain_L_buff = (double *) doubleMalloc_dist((Llu->bufmax[1])); + HyP->Remain_info = (Remain_info_t *) SUPERLU_MALLOC(mrb * sizeof(Remain_info_t)); + HyP->Ublock_info_Phi = (Ublock_info_t *) SUPERLU_MALLOC(mcb * sizeof(Ublock_info_t)); + HyP->Ublock_info = (Ublock_info_t *) SUPERLU_MALLOC(mcb * sizeof(Ublock_info_t)); + HyP->Lblock_dirty_bit = (int_t *) intMalloc_dist(mcb); +#endif + + for (int_t i = 0; i < mcb; ++i) + { + HyP->Lblock_dirty_bit[i] = -1; + } + + HyP->Ublock_dirty_bit = (int_t *) _mm_malloc(mrb * sizeof(int_t), 64); + + for (int_t i = 0; i < mrb; ++i) + { + HyP->Ublock_dirty_bit[i] = -1; + } + + HyP->last_offload = -1; + HyP->superlu_acc_offload = get_acc_offload (); + + HyP->nCudaStreams =0; +} + +void Free_HyP(HyP_t* HyP) +{ + + _mm_free(HyP->lookAhead_info ); + _mm_free(HyP->Remain_info ); + _mm_free(HyP->lookAhead_L_buff ); + _mm_free(HyP->Remain_L_buff ); + _mm_free(HyP->Ublock_info ); + _mm_free(HyP->Ublock_info_Phi ); + _mm_free(HyP->Lblock_dirty_bit ); + _mm_free(HyP->Ublock_dirty_bit ); + + // #endif + +} + +int updateDirtyBit(int_t k0, HyP_t* HyP, gridinfo_t* grid) +{ + for (int_t i = 0; i < HyP->RemainBlk; ++i) + { + int_t lib = LBi( HyP->Remain_info[i].ib, grid) ; + HyP->Ublock_dirty_bit[lib] = k0; + } + + + for (int_t j = 0; j < HyP->jj_cpu; ++j) + { + int_t ljb = LBj( HyP->Ublock_info_Phi[j].jb, grid) ; + HyP->Lblock_dirty_bit[ljb] = k0; + } + return 0; +} + +int_t scuStatUpdate( + int_t knsupc, + HyP_t* HyP, + SCT_t* SCT, + SuperLUStat_t *stat + ) +{ + int_t Lnbrow = HyP->lookAheadBlk == 0 ? 0 : HyP->lookAhead_info[HyP->lookAheadBlk - 1].FullRow; + int_t Rnbrow = HyP->RemainBlk == 0 ? 0 : HyP->Remain_info[HyP->RemainBlk - 1].FullRow; + int_t nbrow = Lnbrow + Rnbrow; + int_t ncols_host = HyP->num_u_blks == 0 ? 0 : HyP->Ublock_info[HyP->num_u_blks - 1].full_u_cols; + int_t ncols_Phi = HyP->num_u_blks_Phi == 0 ? 
0 : HyP->Ublock_info_Phi[HyP->num_u_blks_Phi - 1].full_u_cols; + int_t ncols = ncols_Phi+ ncols_host; + // int_t ncols = HyP->Ublock_info[HyP->num_u_blks - 1].full_u_cols + // + HyP->Ublock_info_Phi[HyP->num_u_blks_Phi - 1].full_u_cols; // ### + SCT->LookAheadRowSepMOP += 2 * (double)knsupc * (double)(nbrow); + SCT->GatherMOP += 2 * (double)HyP->ldu * (double)ncols; + + + SCT->LookAheadGEMMFlOp += 2 * ((double)Lnbrow * (double)HyP->ldu * (double)ncols_host + + (double)Lnbrow * (double)HyP->ldu_Phi * (double)ncols_Phi) ; + SCT->LookAheadScatterMOP += 3 * Lnbrow * ncols; + SCT->schur_flop_counter += 2 * ((double)Rnbrow * (double)HyP->ldu * (double)ncols_host + + (double)Rnbrow * (double)HyP->ldu_Phi * (double)ncols_Phi) ; + SCT->scatter_mem_op_counter += 3 * Rnbrow * ncols; + stat->ops[FACT] += 2 * ((double)(Rnbrow + Lnbrow) * (double)HyP->ldu * (double)ncols_host + + (double)(Rnbrow + Lnbrow) * (double)HyP->ldu_Phi * (double)ncols_Phi) ; + + return 0; + +} diff --git a/SRC/superlu_ddefs.h b/SRC/superlu_ddefs.h index d91fa367..07753693 100644 --- a/SRC/superlu_ddefs.h +++ b/SRC/superlu_ddefs.h @@ -211,7 +211,8 @@ typedef struct int_t last_offload ; int_t *Lblock_dirty_bit, * Ublock_dirty_bit; double *lookAhead_L_buff, *Remain_L_buff; - int_t lookAheadBlk , RemainBlk ; + int_t lookAheadBlk; /* number of blocks in look-ahead window */ + int_t RemainBlk ; /* number of blocks outside look-ahead window */ int_t num_look_aheads, nsupers; int_t ldu, ldu_Phi; int_t num_u_blks, num_u_blks_Phi; @@ -227,8 +228,7 @@ typedef struct int_t offloadCondition; int_t superlu_acc_offload; int_t nCudaStreams; - -} HyP_t; +} HyP_t; /* Data structures for Schur complement update. */ typedef struct { @@ -634,39 +634,39 @@ int_t block_gemm_scatterBottomRight( int_t lb, int_t j, SCT_t*SCT, SuperLUStat_t *stat ); -extern void gather_u(int_t num_u_blks, +extern void dgather_u(int_t num_u_blks, Ublock_info_t *Ublock_info, int_t * usub, double *uval, double *bigU, int_t ldu, int_t *xsup, int_t klst /* for SuperSize */ ); -extern void gather_l( int_t num_LBlk, int_t knsupc, +extern void dgather_l( int_t num_LBlk, int_t knsupc, Remain_info_t *L_info, double * lval, int_t LD_lval, double * L_buff ); /* from gather.h */ -extern void Rgather_L(int_t k, int_t *lsub, double *lusup, gEtreeInfo_t*, +extern void dRgather_L(int_t k, int_t *lsub, double *lusup, gEtreeInfo_t*, Glu_persist_t *, gridinfo_t *, HyP_t *, int_t *myIperm, int_t *iperm_c_supno ); -extern void Rgather_U(int_t k, int_t jj0, int_t *usub, double *uval, +extern void dRgather_U(int_t k, int_t jj0, int_t *usub, double *uval, double *bigU, gEtreeInfo_t*, Glu_persist_t *, gridinfo_t *, HyP_t *, int_t *myIperm, int_t *iperm_c_supno, int_t *perm_u); /* from xtrf3Dpartition.h */ -extern trf3Dpartition_t* initTrf3Dpartition(int_t nsupers, - superlu_dist_options_t *options, - LUstruct_t *LUstruct, gridinfo3d_t * grid3d); -extern void printMemUse(trf3Dpartition_t* trf3Dpartition, - LUstruct_t *LUstruct, gridinfo3d_t * grid3d); +extern trf3Dpartition_t* dinitTrf3Dpartition(int_t nsupers, + superlu_dist_options_t *options, + LUstruct_t *LUstruct, gridinfo3d_t * grid3d); +extern void d3D_printMemUse(trf3Dpartition_t* trf3Dpartition, + LUstruct_t *LUstruct, gridinfo3d_t * grid3d); extern int* getLastDep(gridinfo_t *grid, SuperLUStat_t *stat, superlu_dist_options_t *options, LocalLU_t *Llu, int_t* xsup, int_t num_look_aheads, int_t nsupers, int_t * iperm_c_supno); -extern void init3DLUstructForest( int_t* myTreeIdxs, int_t* myZeroTrIdxs, +extern void dinit3DLUstructForest( 
int_t* myTreeIdxs, int_t* myZeroTrIdxs, sForest_t** sForests, LUstruct_t* LUstruct, gridinfo3d_t* grid3d); @@ -694,7 +694,7 @@ extern void pdgstrf2_trsm(superlu_dist_options_t * options, int_t k0, int_t k, LocalLU_t *, MPI_Request *, int tag_ub, SuperLUStat_t *, int *info); extern void pdgstrs2_omp(int_t k0, int_t k, Glu_persist_t *, gridinfo_t *, - LocalLU_t *, SuperLUStat_t *); + LocalLU_t *, Ublock_info_t *, SuperLUStat_t *); #endif // same routine names !!!!!!!! extern int_t LpanelUpdate(int_t off0, int_t nsupc, double* ublk_ptr, @@ -739,91 +739,54 @@ extern void pdgstrf2(superlu_dist_options_t *, int_t nsupers, int_t k0, LocalLU_t *, MPI_Request *, int, SuperLUStat_t *, int *); /* from p3dcomm.h */ -int_t AllocLlu(int_t nsupers, LUstruct_t * LUstruct, gridinfo3d_t* grid3d); -int_t AllocGlu(int_t n, int_t nsupers, LUstruct_t * LUstruct, gridinfo3d_t* grid3d); - -int_t p3dScatter(int_t n, LUstruct_t * LUstruct, gridinfo3d_t* grid3d); - - -int_t scatter3dLPanels(int_t nsupers, +extern int_t dAllocLlu(int_t nsupers, LUstruct_t * LUstruct, gridinfo3d_t* grid3d); +extern int_t dp3dScatter(int_t n, LUstruct_t * LUstruct, gridinfo3d_t* grid3d); +extern int_t dscatter3dLPanels(int_t nsupers, LUstruct_t * LUstruct, gridinfo3d_t* grid3d); - -int_t scatter3dUPanels(int_t nsupers, +extern int_t dscatter3dUPanels(int_t nsupers, LUstruct_t * LUstruct, gridinfo3d_t* grid3d); - -int_t collect3dLpanels(int_t layer, int_t nsupers, LUstruct_t * LUstruct, gridinfo3d_t* grid3d); - -int_t collect3dUpanels(int_t layer, int_t nsupers, LUstruct_t * LUstruct, gridinfo3d_t* grid3d); - -int_t p3dCollect(int_t layer, int_t n, LUstruct_t * LUstruct, gridinfo3d_t* grid3d); - +extern int_t dcollect3dLpanels(int_t layer, int_t nsupers, LUstruct_t * LUstruct, gridinfo3d_t* grid3d); +extern int_t dcollect3dUpanels(int_t layer, int_t nsupers, LUstruct_t * LUstruct, gridinfo3d_t* grid3d); +extern int_t dp3dCollect(int_t layer, int_t n, LUstruct_t * LUstruct, gridinfo3d_t* grid3d); /*zero out LU non zero entries*/ -int_t zeroSetLU(int_t nnodes, int_t* nodeList , LUstruct_t *LUstruct, gridinfo3d_t* grid3d); - +extern int_t dzeroSetLU(int_t nnodes, int_t* nodeList , LUstruct_t *LUstruct, gridinfo3d_t* grid3d); +extern int_t AllocGlu(int_t n, int_t nsupers, LUstruct_t * LUstruct, gridinfo3d_t* grid3d); /* Reduces L and U panels of nodes in the List nodeList (size=nnnodes) receiver[L(nodelist)] =sender[L(nodelist)] +receiver[L(nodelist)] receiver[U(nodelist)] =sender[U(nodelist)] +receiver[U(nodelist)] */ - -int_t reduceAncestors3d(int_t sender, int_t receiver, +int_t dreduceAncestors3d(int_t sender, int_t receiver, int_t nnodes, int_t* nodeList, double* Lval_buf, double* Uval_buf, LUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT); - - /*reduces all nodelists required in a level*/ -int_t reduceAllAncestors3d(int_t ilvl, int_t* myNodeCount, +int_t dreduceAllAncestors3d(int_t ilvl, int_t* myNodeCount, int_t** treePerm, LUValSubBuf_t*LUvsb, LUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT ); - /* Copies factored L and U panels from sender grid to receiver grid receiver[L(nodelist)] <-- sender[L(nodelist)]; receiver[U(nodelist)] <-- sender[U(nodelist)]; */ -int_t gatherFactoredLU(int_t sender, int_t receiver, +int_t dgatherFactoredLU(int_t sender, int_t receiver, int_t nnodes, int_t *nodeList, LUValSubBuf_t*LUvsb, LUstruct_t* LUstruct, gridinfo3d_t* grid3d,SCT_t* SCT ); /*Gathers all the L and U factors to grid 0 for solve stage - By repeatidly calling above function - -*/ -int_t gatherAllFactoredLU( - 
trf3Dpartition_t* trf3Dpartition, - LUstruct_t* LUstruct, - gridinfo3d_t* grid3d, - SCT_t* SCT ); - + By repeatidly calling above function*/ +int_t dgatherAllFactoredLU(trf3Dpartition_t* trf3Dpartition, LUstruct_t* LUstruct, + gridinfo3d_t* grid3d, SCT_t* SCT ); /*Distributes data in each layer and initilizes ancestors as zero in required nodes*/ -int_t init3DLUstruct( int_t* myTreeIdxs, int_t* myZeroTrIdxs, +int_t dinit3DLUstruct( int_t* myTreeIdxs, int_t* myZeroTrIdxs, int_t* nodeCount, int_t** nodeList, LUstruct_t* LUstruct, gridinfo3d_t* grid3d); -/* -Returns list of permutation for each -tree that I update -*/ -int_t** getTreePerm( int_t* myTreeIdxs, int_t* myZeroTrIdxs, - int_t* nodeCount, int_t** nodeList, - int_t* perm_c_supno, int_t* iperm_c_supno, - gridinfo3d_t* grid3d); - -/*number of nodes in each level of the trees which I update*/ -int_t* getMyNodeCounts(int_t maxLvl, int_t* myTreeIdxs, int_t* gNodeCount); - - -int_t checkIntVector3d(int_t* vec, int_t len, gridinfo3d_t* grid3d); - -int_t reduceStat(PhaseType PHASE, - SuperLUStat_t *stat, gridinfo3d_t * grid3d); - int_t zSendLPanel(int_t k, int_t receiver, LUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT); int_t zRecvLPanel(int_t k, int_t sender, double alpha, double beta, @@ -837,118 +800,120 @@ int_t zRecvUPanel(int_t k, int_t sender, double alpha, double beta, LUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT); /* from communication_aux.h */ -extern int_t IBcast_LPanel (int_t k, int_t k0, int_t* lsub, double* lusup, +extern int_t dIBcast_LPanel (int_t k, int_t k0, int_t* lsub, double* lusup, + gridinfo_t *, int* msgcnt, MPI_Request *, + int_t **ToSendR, int_t *xsup, int ); +extern int_t dBcast_LPanel(int_t k, int_t k0, int_t* lsub, double* lusup, + gridinfo_t *, int* msgcnt, int_t **ToSendR, + int_t *xsup , SCT_t*, int); +extern int_t dIBcast_UPanel(int_t k, int_t k0, int_t* usub, double* uval, gridinfo_t *, int* msgcnt, MPI_Request *, - int_t **ToSendR, int_t *xsup, int ); -extern int_t Bcast_LPanel(int_t k, int_t k0, int_t* lsub, double* lusup, - gridinfo_t *, int* msgcnt, int_t **ToSendR, - int_t *xsup , SCT_t*, int); -extern int_t IBcast_UPanel(int_t k, int_t k0, int_t* usub, double* uval, - gridinfo_t *, int* msgcnt, MPI_Request *, - int_t *ToSendD, int ); -extern int_t Bcast_UPanel(int_t k, int_t k0, int_t* usub, double* uval, - gridinfo_t *, int* msgcnt, int_t *ToSendD, SCT_t*, int); -extern int_t Irecv_LPanel (int_t k, int_t k0, int_t* Lsub_buf, - double* Lval_buf, gridinfo_t *, - MPI_Request *, LocalLU_t *, int); -extern int_t Irecv_UPanel(int_t k, int_t k0, int_t* Usub_buf, double*, - LocalLU_t *, gridinfo_t*, MPI_Request *, int); + int_t *ToSendD, int ); +extern int_t dBcast_UPanel(int_t k, int_t k0, int_t* usub, double* uval, + gridinfo_t *, int* msgcnt, int_t *ToSendD, SCT_t*, int); +extern int_t dIrecv_LPanel (int_t k, int_t k0, int_t* Lsub_buf, + double* Lval_buf, gridinfo_t *, + MPI_Request *, LocalLU_t *, int); +extern int_t dIrecv_UPanel(int_t k, int_t k0, int_t* Usub_buf, double*, + LocalLU_t *, gridinfo_t*, MPI_Request *, int); extern int_t Wait_LSend(int_t k, gridinfo_t *grid, int_t **ToSendR, MPI_Request *s, SCT_t*); extern int_t Wait_USend(MPI_Request *, gridinfo_t *, SCT_t *); -extern int_t Wait_URecv(MPI_Request *, int* msgcnt, SCT_t *); +extern int_t dWait_URecv(MPI_Request *, int* msgcnt, SCT_t *); extern int_t Check_LRecv(MPI_Request*, int* msgcnt); -extern int_t Wait_LRecv(MPI_Request*, int* msgcnt, int* msgcntsU, - gridinfo_t *, SCT_t*); -extern int_t ISend_UDiagBlock(int_t k0, 
double *ublk_ptr, int_t size, - MPI_Request *, gridinfo_t *, int); -extern int_t Recv_UDiagBlock(int_t k0, double *ublk_ptr, int_t size, - int_t src, gridinfo_t *, SCT_t*, int); +extern int_t dWait_LRecv(MPI_Request*, int* msgcnt, int* msgcntsU, + gridinfo_t *, SCT_t*); +extern int_t dISend_UDiagBlock(int_t k0, double *ublk_ptr, int_t size, + MPI_Request *, gridinfo_t *, int); +extern int_t dRecv_UDiagBlock(int_t k0, double *ublk_ptr, int_t size, + int_t src, gridinfo_t *, SCT_t*, int); extern int_t Wait_UDiagBlockSend(MPI_Request *, gridinfo_t *, SCT_t *); extern int_t Wait_LDiagBlockSend(MPI_Request *, gridinfo_t *, SCT_t *); -extern int_t PackLBlock(int_t k, double* Dest, Glu_persist_t *, - gridinfo_t *, LocalLU_t *); -extern int_t ISend_LDiagBlock(int_t k0, double *lblk_ptr, int_t size, - MPI_Request *, gridinfo_t *, int); -extern int_t IRecv_UDiagBlock(int_t k0, double *ublk_ptr, int_t size, - int_t src, MPI_Request *, gridinfo_t *, - SCT_t*, int); +extern int_t dPackLBlock(int_t k, double* Dest, Glu_persist_t *, + gridinfo_t *, LocalLU_t *); +extern int_t dISend_LDiagBlock(int_t k0, double *lblk_ptr, int_t size, + MPI_Request *, gridinfo_t *, int); +extern int_t dIRecv_UDiagBlock(int_t k0, double *ublk_ptr, int_t size, + int_t src, MPI_Request *, gridinfo_t *, + SCT_t*, int); extern int_t Wait_UDiagBlock_Recv(MPI_Request *, SCT_t *); extern int_t Test_UDiagBlock_Recv(MPI_Request *, SCT_t *); -extern int_t IRecv_LDiagBlock(int_t k0, double *L_blk_ptr, int_t size, - int_t src, MPI_Request *, gridinfo_t*, SCT_t*, - int); +extern int_t dIRecv_LDiagBlock(int_t k0, double *L_blk_ptr, int_t size, + int_t src, MPI_Request *, gridinfo_t*, SCT_t*, int); extern int_t Wait_LDiagBlock_Recv(MPI_Request *, SCT_t *); extern int_t Test_LDiagBlock_Recv(MPI_Request *, SCT_t *); + +extern int_t dUDiagBlockRecvWait( int_t k, int_t* IrecvPlcd_D, int_t* factored_L, + MPI_Request *, gridinfo_t *, LUstruct_t *, SCT_t *); +extern int_t LDiagBlockRecvWait( int_t k, int_t* factored_U, MPI_Request *, gridinfo_t *); #if (MPI_VERSION>2) -extern int_t IBcast_UDiagBlock(int_t k, double *ublk_ptr, int_t size, - MPI_Request *, gridinfo_t *); +extern int_t dIBcast_UDiagBlock(int_t k, double *ublk_ptr, int_t size, + MPI_Request *, gridinfo_t *); extern int_t IBcast_LDiagBlock(int_t k, double *lblk_ptr, int_t size, MPI_Request *, gridinfo_t *); #endif /* from trfCommWrapper.h */ -extern int_t DiagFactIBCast(int_t k, int_t k0, - double *BlockUFactor, double *BlockLFactor, - int_t* IrecvPlcd_D, MPI_Request *, MPI_Request *, - MPI_Request *, MPI_Request *, gridinfo_t *, - superlu_dist_options_t *, double thresh, - LUstruct_t *LUstruct, SuperLUStat_t *, int *info, - SCT_t *, int tag_ub); -extern int_t UPanelTrSolve( int_t k, double* BlockLFactor, double* bigV, - int_t ldt, Ublock_info_t*, gridinfo_t *, - LUstruct_t *, SuperLUStat_t *, SCT_t *); +extern int_t dDiagFactIBCast(int_t k, int_t k0, + double *BlockUFactor, double *BlockLFactor, + int_t* IrecvPlcd_D, MPI_Request *, MPI_Request *, + MPI_Request *, MPI_Request *, gridinfo_t *, + superlu_dist_options_t *, double thresh, + LUstruct_t *LUstruct, SuperLUStat_t *, int *info, + SCT_t *, int tag_ub); +extern int_t dUPanelTrSolve( int_t k, double* BlockLFactor, double* bigV, + int_t ldt, Ublock_info_t*, gridinfo_t *, + LUstruct_t *, SuperLUStat_t *, SCT_t *); extern int_t Wait_LUDiagSend(int_t k, MPI_Request *, MPI_Request *, gridinfo_t *, SCT_t *); -extern int_t LPanelUpdate(int_t k, int_t* IrecvPlcd_D, int_t* factored_L, - MPI_Request *, double* BlockUFactor, gridinfo_t *, 
- LUstruct_t *, SCT_t *); -extern int_t UPanelUpdate(int_t k, int_t* factored_U, MPI_Request *, - double* BlockLFactor, double* bigV, - int_t ldt, Ublock_info_t*, gridinfo_t *, - LUstruct_t *, SuperLUStat_t *, SCT_t *); -extern int_t IBcastRecvLPanel(int_t k, int_t k0, int* msgcnt, - MPI_Request *, MPI_Request *, - int_t* Lsub_buf, double* Lval_buf, +extern int_t dLPanelUpdate(int_t k, int_t* IrecvPlcd_D, int_t* factored_L, + MPI_Request *, double* BlockUFactor, gridinfo_t *, + LUstruct_t *, SCT_t *); +extern int_t dUPanelUpdate(int_t k, int_t* factored_U, MPI_Request *, + double* BlockLFactor, double* bigV, + int_t ldt, Ublock_info_t*, gridinfo_t *, + LUstruct_t *, SuperLUStat_t *, SCT_t *); +extern int_t dIBcastRecvLPanel(int_t k, int_t k0, int* msgcnt, + MPI_Request *, MPI_Request *, + int_t* Lsub_buf, double* Lval_buf, int_t * factored, gridinfo_t *, LUstruct_t *, SCT_t *, int tag_ub); -extern int_t IBcastRecvUPanel(int_t k, int_t k0, int* msgcnt, MPI_Request *, - MPI_Request *, int_t* Usub_buf, double* Uval_buf, - gridinfo_t *, LUstruct_t *, SCT_t *, int tag_ub); -extern int_t WaitL(int_t k, int* msgcnt, int* msgcntU, MPI_Request *, - MPI_Request *, gridinfo_t *, LUstruct_t *, SCT_t *); -extern int_t WaitU(int_t k, int* msgcnt, MPI_Request *, MPI_Request *, +extern int_t dIBcastRecvUPanel(int_t k, int_t k0, int* msgcnt, MPI_Request *, + MPI_Request *, int_t* Usub_buf, double* Uval_buf, + gridinfo_t *, LUstruct_t *, SCT_t *, int tag_ub); +extern int_t dWaitL(int_t k, int* msgcnt, int* msgcntU, MPI_Request *, + MPI_Request *, gridinfo_t *, LUstruct_t *, SCT_t *); +extern int_t dWaitU(int_t k, int* msgcnt, MPI_Request *, MPI_Request *, gridinfo_t *, LUstruct_t *, SCT_t *); -extern int_t LPanelTrSolve(int_t k, int_t* factored_L, double* BlockUFactor, - gridinfo_t *, LUstruct_t *); +extern int_t dLPanelTrSolve(int_t k, int_t* factored_L, double* BlockUFactor, + gridinfo_t *, LUstruct_t *); /* from trfAux.h */ extern int_t getNsupers(int, LUstruct_t *); -extern int_t SchurComplementSetup(int_t k, int *msgcnt, Ublock_info_t*, - Remain_info_t*, uPanelInfo_t *, - lPanelInfo_t *, int_t*, int_t *, int_t *, - double *bigU, int_t* Lsub_buf, - double* Lval_buf, int_t* Usub_buf, - double* Uval_buf, gridinfo_t *, LUstruct_t *); -extern int_t SchurComplementSetupGPU(int_t k, msgs_t* msgs, packLUInfo_t*, - int_t*, int_t*, int_t*, gEtreeInfo_t*, - factNodelists_t*, scuBufs_t*, - LUValSubBuf_t* LUvsb, gridinfo_t *, - LUstruct_t *, HyP_t*); -extern double* getBigV(int_t, int_t); -extern double* getBigU(int_t, gridinfo_t *, LUstruct_t *); +extern int_t dSchurComplementSetup(int_t k, int *msgcnt, Ublock_info_t*, + Remain_info_t*, uPanelInfo_t *, + lPanelInfo_t *, int_t*, int_t *, int_t *, + double *bigU, int_t* Lsub_buf, + double* Lval_buf, int_t* Usub_buf, + double* Uval_buf, gridinfo_t *, LUstruct_t *); +extern int_t dSchurComplementSetupGPU(int_t k, msgs_t* msgs, packLUInfo_t*, + int_t*, int_t*, int_t*, gEtreeInfo_t*, + factNodelists_t*, scuBufs_t*, + LUValSubBuf_t* LUvsb, gridinfo_t *, + LUstruct_t *, HyP_t*); +extern double* dgetBigV(int_t, int_t); +extern double* dgetBigU(int_t, gridinfo_t *, LUstruct_t *); extern int_t getBigUSize(int_t, gridinfo_t *, LUstruct_t *); // permutation from superLU default extern int_t* getPerm_c_supno(int_t nsupers, superlu_dist_options_t *, LUstruct_t *, gridinfo_t *); +extern void getSCUweight(int_t nsupers, treeList_t* treeList, LUstruct_t *, gridinfo3d_t *); /* from treeFactorization.h */ -extern int_t LluBufInit(LUValSubBuf_t*, LUstruct_t *); -extern int_t 
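+/* A rough sketch of how these wrappers compose in one elimination step k
+ * (the ordering is inferred from the names; the actual drivers are
+ * pdgstrf3d and dsparseTreeFactor_ASYNC):
+ *
+ *   dDiagFactIBCast(k, k0, ...);         // factor diagonal block, post Ibcasts
+ *   dLPanelTrSolve(k, factored_L, ...);  // triangular solve for the L panel
+ *   dUPanelTrSolve(k, ...);              // triangular solve for the U panel
+ *   dIBcastRecvLPanel(k, k0, ...);       // broadcast/receive the L panel
+ *   dIBcastRecvUPanel(k, k0, ...);       // broadcast/receive the U panel
+ *   dWaitL(k, ...);  dWaitU(k, ...);     // complete the communication
+ *   dSchurComplementSetup(k, ...);       // gather blocks for the update
+ */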
initScuBufs(int_t ldt, int_t num_threads, int_t nsupers, - scuBufs_t* scuBufs, - LUstruct_t* LUstruct, - gridinfo_t * grid); +extern int_t dLluBufInit(LUValSubBuf_t*, LUstruct_t *); +extern int_t dinitScuBufs(int_t ldt, int_t num_threads, int_t nsupers, + scuBufs_t*, LUstruct_t*, gridinfo_t *); extern int_t initPackLUInfo(int_t nsupers, packLUInfo_t* packLUInfo); // the generic tree factoring code @@ -971,7 +936,7 @@ extern int_t treeFactor( int *info ); -extern int_t sparseTreeFactor( +extern int_t dsparseTreeFactor( int_t nnodes, // number of nodes in the tree int_t *perm_c_supno, // list of nodes in the order of factorization treeTopoInfo_t* treeTopoInfo, @@ -1010,7 +975,7 @@ extern int_t denseTreeFactor( int *info ); -extern int_t sparseTreeFactor_ASYNC( +extern int_t dsparseTreeFactor_ASYNC( sForest_t* sforest, commRequests_t **comReqss, // lists of communication requests // size maxEtree level scuBufs_t *scuBufs, // contains buffers for schur complement update @@ -1029,9 +994,9 @@ extern int_t sparseTreeFactor_ASYNC( double thresh, SCT_t *SCT, int tag_ub, int *info ); -extern LUValSubBuf_t** LluBufInitArr(int_t numLA, LUstruct_t *LUstruct); -extern diagFactBufs_t** initDiagFactBufsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* grid); -extern int_t initDiagFactBufs(int_t ldt, diagFactBufs_t* dFBuf); +extern LUValSubBuf_t** dLluBufInitArr(int_t numLA, LUstruct_t *LUstruct); +extern diagFactBufs_t** dinitDiagFactBufsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* grid); +extern int_t dinitDiagFactBufs(int_t ldt, diagFactBufs_t* dFBuf); extern int_t sDiagFactIBCast(int_t k, diagFactBufs_t *dFBuf, factStat_t *factStat, commRequests_t *comReqs, diff --git a/SRC/superlu_defs.h b/SRC/superlu_defs.h index fe267352..16cee7a4 100644 --- a/SRC/superlu_defs.h +++ b/SRC/superlu_defs.h @@ -348,7 +348,7 @@ typedef struct { int iam; /* my process number in this grid */ int_t nprow; /* number of process rows */ int_t npcol; /* number of process columns */ - int_t npdep; /* number of process columns */ + int_t npdep; /* number of replication factor in Z-dimension */ gridinfo_t grid2d; /* for using 2D functions */ } gridinfo3d_t; @@ -1196,7 +1196,7 @@ extern int_t initCommRequests(commRequests_t* comReqs, gridinfo_t * grid); extern int_t initFactStat(int_t nsupers, factStat_t* factStat); extern int_t initFactNodelists(int_t, int_t, int_t, factNodelists_t*); extern int_t initMsgs(msgs_t* msgs); -extern int_t getNumLookAhead(); +extern int_t getNumLookAhead(superlu_dist_options_t*); extern commRequests_t** initCommRequestsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* grid); extern msgs_t** initMsgsArr(int_t numLA); @@ -1212,6 +1212,15 @@ extern int sort_U_info_elm( Ublock_info_t* Ublock_info, int n ); extern void printTRStimer(xtrsTimer_t *xtrsTimer, gridinfo3d_t *grid3d); extern void initTRStimer(xtrsTimer_t *xtrsTimer, gridinfo_t *grid); + /* from p3dcomm.c */ +extern int_t** getTreePerm( int_t* myTreeIdxs, int_t* myZeroTrIdxs, + int_t* nodeCount, int_t** nodeList, + int_t* perm_c_supno, int_t* iperm_c_supno, + gridinfo3d_t* grid3d); +extern int_t* getMyNodeCounts(int_t maxLvl, int_t* myTreeIdxs, int_t* gNodeCount); +extern int_t checkIntVector3d(int_t* vec, int_t len, gridinfo3d_t* grid3d); +extern int_t reduceStat(PhaseType PHASE, SuperLUStat_t *stat, gridinfo3d_t * grid3d); + /*=====================*/ #ifdef __cplusplus diff --git a/SRC/superlu_grid3d.c b/SRC/superlu_grid3d.c new file mode 100644 index 00000000..e9ee4cbf --- /dev/null +++ b/SRC/superlu_grid3d.c @@ -0,0 +1,288 @@ +/*! 
@file
+ * \brief SuperLU grid utilities
+ *
+ *
+ * -- Distributed SuperLU routine (version 7.0.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 30, 2019
+ * 
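+ *
+ * A minimal calling sketch (the communicator and the 2 x 2 x 2 shape are
+ * illustrative assumptions; MPI_Init must already have been called, and
+ * the base communicator must contain at least nprow*npcol*npdep ranks):
+ *
+ *   gridinfo3d_t grid;
+ *   superlu_gridinit3d(MPI_COMM_WORLD, 2, 2, 2, &grid);
+ *   ... factor/solve, using grid.grid2d where 2D routines are needed ...
+ *   superlu_gridexit3d(&grid);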
+ */ + +#include "superlu_ddefs.h" + +void superlu_gridmap3d( + MPI_Comm Bcomm, /* The base communicator upon which + the new grid is formed. */ + int_t nprow, + int_t npcol, + int_t npdep, + gridinfo3d_t *grid); + + +/*! \brief All processes in the MPI communicator must call this routine. + */ +void superlu_gridinit3d(MPI_Comm Bcomm, /* The base communicator upon which + the new grid is formed. */ + int_t nprow, int_t npcol, int_t npdep, + gridinfo3d_t *grid) +{ + int Np = nprow * npcol * npdep; + int_t *usermap; + int i, j, info; + + /* Make a list of the processes in the new communicator. */ + // usermap = (int_t *) SUPERLU_MALLOC(Np*sizeof(int_t)); + // for (j = 0; j < npcol; ++j) + // for (i = 0; i < nprow; ++i) usermap[j*nprow+i] = i*npcol+j; + + /* Check MPI environment initialization. */ + MPI_Initialized( &info ); + if ( !info ) + ABORT("C main program must explicitly call MPI_Init()"); + + MPI_Comm_size( Bcomm, &info ); + if ( info < Np ) + ABORT("Number of processes is smaller than NPROW * NPCOL"); + + superlu_gridmap3d(Bcomm, nprow, npcol, npdep, grid); + + // SUPERLU_FREE(usermap); +} + + +/*! \brief All processes in the MPI communicator must call this routine. + */ +void superlu_gridmap3d( + MPI_Comm Bcomm, /* The base communicator upon which + the new grid is formed. */ + int_t nprow, + int_t npcol, + int_t npdep, + gridinfo3d_t *grid) +{ + MPI_Group mpi_base_group, superlu_grp; + int Np = nprow * npcol * npdep, mycol, myrow; + int *pranks; + int i, j, info; + + /* Create datatype in C for MPI complex. */ + if ( SuperLU_MPI_DOUBLE_COMPLEX == MPI_DATATYPE_NULL ) { + MPI_Type_contiguous( 2, MPI_DOUBLE, &SuperLU_MPI_DOUBLE_COMPLEX ); + MPI_Type_commit( &SuperLU_MPI_DOUBLE_COMPLEX ); + } + + /* Check MPI environment initialization. */ + MPI_Initialized( &info ); + if ( !info ) + ABORT("C main program must explicitly call MPI_Init()"); + + /* Make a list of the processes in the new communicator. */ + pranks = (int *) SUPERLU_MALLOC(Np * sizeof(int)); + for (j = 0; j < Np; ++j) + pranks[j] = j; + + /* + * Form MPI communicator for all. + */ + /* Get the group underlying Bcomm. */ + MPI_Comm_group( Bcomm, &mpi_base_group ); + /* Create the new group. */ + MPI_Group_incl( mpi_base_group, Np, pranks, &superlu_grp ); + /* Create the new communicator. */ + /* NOTE: The call is to be executed by all processes in Bcomm, + even if they do not belong in the new group -- superlu_grp. */ + MPI_Comm_create( Bcomm, superlu_grp, &grid->comm ); + + /* Bail out if I am not in the group, superlu_group. 
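+       (On the Cartesian mapping built below: under the default rank order,
+       RANKORDER unset or "XY", the communicator is created with
+       dims = {npdep, nprow, npcol}, so, assuming MPI assigns row-major
+       ranks without reordering, rank r receives coords3d = (z, x, y).
+       For nprow = npcol = npdep = 2, rank 5 maps to coords3d = {1, 0, 1},
+       i.e. zscp.Iam = 1, cscp.Iam = 0, rscp.Iam = 1.  Setting RANKORDER
+       to any other value selects dims = {nprow, npcol, npdep} instead.)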
*/ + if ( grid->comm == MPI_COMM_NULL ) { + grid->comm = Bcomm; + MPI_Comm_rank( Bcomm, &i ); + grid->iam = i; + /*grid->iam = -1;*/ + SUPERLU_FREE(pranks); + return; + } + + grid->nprow = nprow; + grid->npcol = npcol; + grid->npdep = npdep; + + /* Create 3D grid */ + int ndim = 3; + int dims[3]; + int reorder = 1; + int periodic[] = {0, 0, 0}; + int coords3d[3]; + int iam; + MPI_Comm superlu3d_comm; + + if (getenv("RANKORDER") && strcmp(getenv("RANKORDER"), "XY" )) + { + dims[0] = nprow; + dims[1] = npcol; + dims[2] = npdep; + + // create the new communicator + int error = MPI_Cart_create(grid->comm, ndim, dims, periodic, reorder, &superlu3d_comm); + + // get the coordinate of the processor + + MPI_Comm_rank (superlu3d_comm, &iam); + grid->iam = iam; + MPI_Cart_coords(superlu3d_comm, iam, ndim, coords3d); + + // printf("My coordinats are (%d %d %d)\n", coords3d[0], coords3d[1], coords3d[2] ); + int rowc[3] = {1, 0, 0}; + int colc[3] = {0, 1, 0}; + int depc[3] = {0, 0, 1}; + + // Partition a communicator into subgroups which form + // lower-dimensional cartesian subgrids + MPI_Cart_sub(superlu3d_comm, colc, &(grid->rscp.comm)); /* XZ grids */ + MPI_Cart_sub(superlu3d_comm, rowc, &(grid->cscp.comm)); /* YZ grids */ + MPI_Cart_sub(superlu3d_comm, depc, &(grid->zscp.comm)); /* XY grids */ + + grid->cscp.Np = nprow; + grid->cscp.Iam = coords3d[0]; + grid->rscp.Np = npcol; + grid->rscp.Iam = coords3d[1]; + grid->zscp.Np = npdep; + grid->zscp.Iam = coords3d[2]; + + // + grid->nprow = nprow; + grid->npcol = npcol; + grid->npdep = npdep; + + // 2D communicator + int xyc[3] = {1, 1, 0}; + MPI_Cart_sub(superlu3d_comm, xyc, &(grid->grid2d.comm)); + + } else { + dims[1] = nprow; + dims[2] = npcol; + dims[0] = npdep; + + // get the communicator + int error = MPI_Cart_create(grid->comm, ndim, dims, periodic, reorder, &superlu3d_comm); + + //get the coordinate of the processor + + MPI_Comm_rank (superlu3d_comm, &iam); + grid->iam = iam; + MPI_Cart_coords(superlu3d_comm, iam, ndim, coords3d); + + // create row communicator; + + // printf("My coordinats are (%d %d %d)\n", coords3d[0], coords3d[1], coords3d[2] ); + int rowc[3] = {0, 1, 0}; + int colc[3] = {0, 0, 1}; + int depc[3] = {1, 0, 0}; + + + MPI_Cart_sub(superlu3d_comm, colc, &(grid->rscp.comm)); + MPI_Cart_sub(superlu3d_comm, rowc, &(grid->cscp.comm)); + MPI_Cart_sub(superlu3d_comm, depc, &(grid->zscp.comm)); + + grid->cscp.Np = nprow; + grid->cscp.Iam = coords3d[1]; + grid->rscp.Np = npcol; + grid->rscp.Iam = coords3d[2]; + grid->zscp.Np = npdep; + grid->zscp.Iam = coords3d[0]; + + //printf("(Iam %d) grid->zscp.Np = %d\n", grid->iam, grid->zscp.Np); + + grid->nprow = nprow; + grid->npcol = npcol; + grid->npdep = npdep; + + // 2D communicator + int xyc[3] = {0, 1, 1}; + MPI_Cart_sub(superlu3d_comm, xyc, &(grid->grid2d.comm)); + + } /* if RANKORDER */ + + + // Initialize grid2d; + + grid->grid2d.rscp = grid->rscp; + grid->grid2d.cscp = grid->cscp; + grid->grid2d.nprow = nprow; + grid->grid2d.npcol = npcol; + MPI_Comm_rank( grid->grid2d.comm, &(grid->grid2d.iam)); + + + // grid->grid2d.cscp = grid->cscp; + +#if 0 + /* Make a list of the processes in the new communicator. */ + pranks = (int *) SUPERLU_MALLOC(Np * sizeof(int)); + for (j = 0; j < npcol; ++j) + for (i = 0; i < nprow; ++i) + pranks[i * npcol + j] = usermap[j * ldumap + i]; + + /* + * Form MPI communicator for all. + */ + /* Get the group underlying Bcomm. */ + MPI_Comm_group( Bcomm, &mpi_base_group ); + /* Create the new group. 
*/ + MPI_Group_incl( mpi_base_group, Np, pranks, &superlu_grp ); + /* Create the new communicator. */ + /* NOTE: The call is to be executed by all processes in Bcomm, + even if they do not belong in the new group -- superlu_grp. */ + MPI_Comm_create( Bcomm, superlu_grp, &grid->comm ); + + /* Bail out if I am not in the group, superlu_group. */ + if ( grid->comm == MPI_COMM_NULL ) { + grid->comm = Bcomm; + MPI_Comm_rank( Bcomm, &i ); + grid->iam = i; + /*grid->iam = -1;*/ + SUPERLU_FREE(pranks); + return; + } + + MPI_Comm_rank( grid->comm, &(grid->iam) ); + myrow = grid->iam / npcol; + mycol = grid->iam % npcol; + + /* + * Form MPI communicator for myrow, scope = COMM_ROW. + */ + + MPI_Comm_split(grid->comm, myrow, mycol, &(grid->rscp.comm)); + + + /* + * Form MPI communicator for mycol, scope = COMM_COLUMN. + */ + MPI_Comm_split(grid->comm, mycol, myrow, &(grid->cscp.comm)); + + + grid->rscp.Np = npcol; + grid->rscp.Iam = mycol; + grid->cscp.Np = nprow; + grid->cscp.Iam = myrow; +#endif + + SUPERLU_FREE(pranks); + MPI_Group_free(&superlu_grp); + MPI_Group_free(&mpi_base_group); +} + +void superlu_gridexit3d(gridinfo3d_t *grid) +{ + if ( grid->comm != MPI_COMM_NULL && grid->comm != MPI_COMM_WORLD ) { + /* Marks the communicator objects for deallocation. */ + MPI_Comm_free( &grid->rscp.comm ); + MPI_Comm_free( &grid->cscp.comm ); + MPI_Comm_free( &grid->zscp.comm ); + MPI_Comm_free( &grid->grid2d.comm ); + MPI_Comm_free( &grid->comm ); + } + if ( SuperLU_MPI_DOUBLE_COMPLEX != MPI_DATATYPE_NULL ) { + MPI_Type_free( &SuperLU_MPI_DOUBLE_COMPLEX ); + } +} diff --git a/SRC/supernodalForest.c b/SRC/supernodalForest.c new file mode 100644 index 00000000..f3a3dfed --- /dev/null +++ b/SRC/supernodalForest.c @@ -0,0 +1,948 @@ +#include +#include +#include "superlu_ddefs.h" +#if 0 +#include "sec_structs.h" +#include "supernodal_etree.h" +#include "load-balance/supernodalForest.h" +#include "p3dcomm.h" +#endif +#include + +#define INT_T_ALLOC(x) ((int_t *) SUPERLU_MALLOC ( (x) * sizeof (int_t))) +#define DOUBLE_ALLOC(x) ((double *) SUPERLU_MALLOC ( (x) * sizeof (double))) + + +int_t calcTopInfoForest(sForest_t *forest, + int_t nsupers, int_t* setree); + + +sForest_t** getForests( int_t maxLvl, int_t nsupers, int_t*setree, treeList_t* treeList) +{ + // treePartStrat tps; + if (getenv("LBS")) + { + if (strcmp(getenv("LBS"), "ND" ) == 0) + { + return getNestDissForests( maxLvl, nsupers, setree, treeList); + } + if (strcmp(getenv("LBS"), "GD" ) == 0) + { + return getGreedyLoadBalForests( maxLvl, nsupers, setree, treeList); + } + } + else + { + return getGreedyLoadBalForests( maxLvl, nsupers, setree, treeList); + } + +} +double calcNodeListWeight(int_t nnodes, int_t* nodeList, treeList_t* treeList) +{ + double trWeight = 0; + + for (int i = 0; i < nnodes; ++i) + { + trWeight += treeList[nodeList[i]].weight; + } + + return trWeight; +} + +sForest_t** getNestDissForests( int_t maxLvl, int_t nsupers, int_t*setree, treeList_t* treeList) +{ + + int_t numForests = (1 << maxLvl) - 1; + + // allocate space for forests + sForest_t** sForests = SUPERLU_MALLOC (numForests * sizeof (sForest_t*)); + + + int_t* gTreeHeads = getTreeHeads(maxLvl, nsupers, treeList); + + int_t* gNodeCount = calcNumNodes(maxLvl, gTreeHeads, treeList); + int_t** gNodeLists = getNodeList(maxLvl, setree, gNodeCount, + gTreeHeads, treeList); + + for (int i = 0; i < numForests; ++i) + { + sForests[i] = NULL; + if (gNodeCount[i] > 0) + { + sForests[i] = SUPERLU_MALLOC (sizeof (sForest_t)); + sForests[i]->nNodes = gNodeCount[i]; + 
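+            /* Forests are stored as an implicit binary tree over process
+             * levels: numForests = 2^maxLvl - 1, and forest i has children
+             * 2*i+1 and 2*i+2 (maxLvl = 3 gives 7 forests in levels
+             * {0}, {1,2}, {3,4,5,6}).  Note also that getForests above
+             * returns nothing when LBS is set to a value other than "ND"
+             * or "GD"; a final
+             *
+             *   return getGreedyLoadBalForests( maxLvl, nsupers, setree, treeList);
+             *
+             * after the if/else would keep greedy balancing as the default. */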
sForests[i]->numTrees = 1; + sForests[i]->nodeList = gNodeLists[i]; + sForests[i]->weight = calcNodeListWeight(sForests[i]->nNodes, sForests[i]->nodeList, treeList); + + calcTopInfoForest(sForests[i], nsupers, setree); + } + } + + return sForests; +} + +static int_t* sortPtr; + +static int cmpfuncInd (const void * a, const void * b) +{ + return ( sortPtr[*(int_t*)a] - sortPtr[*(int_t*)b] ); +} +// doesn't sort A but gives the index of sorted array +int_t* getSortIndex(int_t n, int_t* A) +{ + int_t* idx = INT_T_ALLOC(n); + + for (int i = 0; i < n; ++i) + { + /* code */ + idx[i] = i; + } + sortPtr = A; + + qsort(idx, n, sizeof(int_t), cmpfuncInd); + + return idx; +} + + +static double* sortPtrDouble; + +static int cmpfuncIndDouble (const void * a, const void * b) +{ + return ( sortPtrDouble[*(int_t*)a] > sortPtrDouble[*(int_t*)b] ); +} +// doesn't sort A but gives the index of sorted array +int_t* getSortIndexDouble(int_t n, double* A) +{ + int_t* idx = INT_T_ALLOC(n); + + for (int i = 0; i < n; ++i) + { + /* code */ + idx[i] = i; + } + sortPtrDouble = A; + + qsort(idx, n, sizeof(int_t), cmpfuncIndDouble); + + return idx; +} + +static int cmpfunc(const void * a, const void * b) +{ + return ( *(int_t*)a - * (int_t*)b ); +} + + +int_t* permuteArr(int_t n, int_t* A, int_t* perm) +{ + int_t* permA = INT_T_ALLOC(n); + + for (int i = 0; i < n; ++i) + { + /* code */ + permA[i] = A[perm[i]]; + } + + return permA; +} + + + +int_t calcTopInfoForest(sForest_t *forest, + int_t nsupers, int_t* setree) +{ + + int_t nnodes = forest->nNodes; + int_t* nodeList = forest->nodeList; + + qsort(nodeList, nnodes, sizeof(int_t), cmpfunc); + int_t* myIperm = getMyIperm(nnodes, nsupers, nodeList); + int_t* myTopOrderOld = getMyTopOrder(nnodes, nodeList, myIperm, setree ); + int_t* myTopSortIdx = getSortIndex(nnodes, myTopOrderOld); + int_t* nodeListNew = permuteArr(nnodes, nodeList, myTopSortIdx); + int_t* myTopOrder = permuteArr(nnodes, myTopOrderOld, myTopSortIdx); + + SUPERLU_FREE(nodeList); + SUPERLU_FREE(myTopSortIdx); + SUPERLU_FREE(myIperm); + SUPERLU_FREE(myTopOrderOld); + myIperm = getMyIperm(nnodes, nsupers, nodeListNew); + + + + treeTopoInfo_t ttI; + ttI.myIperm = myIperm; + ttI.numLvl = myTopOrder[nnodes - 1] + 1; + ttI.eTreeTopLims = getMyEtLims(nnodes, myTopOrder); + + forest->nodeList = nodeListNew; + forest->topoInfo = ttI; + + return 0; + +} + +// #pragma optimize ("", off) + +double* getTreeWeights(int_t numTrees, int_t* gNodeCount, int_t** gNodeLists, treeList_t* treeList) +{ + double* gTreeWeights = DOUBLE_ALLOC(numTrees); + + // initialize with weight with whole subtree weights + for (int_t i = 0; i < numTrees; ++i) + { + gTreeWeights[i] = calcNodeListWeight(gNodeCount[i], gNodeLists[i], treeList); + } + + return gTreeWeights; + +} + +int_t* getNodeCountsFr(int_t maxLvl, sForest_t** sForests) +{ + int_t numForests = (1 << maxLvl) - 1; + int_t* gNodeCount = INT_T_ALLOC (numForests); + + for (int i = 0; i < numForests; ++i) + { + /* code */ + if (sForests[i]) + {gNodeCount[i] = sForests[i]->nNodes;} + else + { + gNodeCount[i] = 0; + } + } + return gNodeCount; +} + +int_t** getNodeListFr(int_t maxLvl, sForest_t** sForests) +{ + int_t numForests = (1 << maxLvl) - 1; + int_t** gNodeLists = (int_t**) SUPERLU_MALLOC(numForests * sizeof(int_t*)); + + for (int i = 0; i < numForests; ++i) + { + /* code */ + if (sForests[i]) + { + gNodeLists[i] = sForests[i]->nodeList; + } + else + { + gNodeLists[i] = NULL; + } + } + + return gNodeLists; +} + +int_t* getNodeToForstMap(int_t nsupers, sForest_t** sForests, 
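+/* Note: cmpfuncIndDouble above returns only 0 or 1, so qsort cannot
+ * distinguish "less than" from "equal".  A three-way comparator is the
+ * portable form; a minimal fix (sketch):
+ *
+ *   static int cmpfuncIndDouble (const void * a, const void * b)
+ *   {
+ *       double x = sortPtrDouble[*(int_t*)a];
+ *       double y = sortPtrDouble[*(int_t*)b];
+ *       return (x > y) - (x < y);
+ *   }
+ */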
gridinfo3d_t* grid3d) +{ + int_t maxLvl = log2i(grid3d->zscp.Np) + 1; + int_t numForests = (1 << maxLvl) - 1; + int_t* gNodeToForstMap = INT_T_ALLOC (nsupers); + + for (int i = 0; i < numForests; ++i) + { + /* code */ + if (sForests[i]) + { int_t nnodes = sForests[i]->nNodes; + int_t* nodeList = sForests[i]->nodeList; + for(int_t node = 0; nodenNodes; + } + + return myNodeCount; +} + + +int_t** getTreePermFr( int_t* myTreeIdxs, + sForest_t** sForests, gridinfo3d_t* grid3d) +{ + + int_t maxLvl = log2i(grid3d->zscp.Np) + 1; + + int_t** treePerm = (int_t** ) SUPERLU_MALLOC(sizeof(int_t*)*maxLvl); + for (int_t lvl = 0; lvl < maxLvl; lvl++) + { + treePerm[lvl] = NULL; + if (sForests[myTreeIdxs[lvl]]) + treePerm[lvl] = sForests[myTreeIdxs[lvl]]->nodeList; + } + return treePerm; +} + +int_t* getIsNodeInMyGrid(int_t nsupers, int_t maxLvl, int_t* myNodeCount, int_t** treePerm) +{ + int_t* isNodeInMyGrid = INT_T_ALLOC (nsupers); + + for(int_t i=0; igrid2d); + int_t maxLvl = log2i(grid3d->zscp.Np) + 1; + int_t numForests = (1 << maxLvl) - 1; + double* gFrstCost = DOUBLE_ALLOC(numForests); + double* gFrstCostAcc = DOUBLE_ALLOC(numForests); + double* gFrstWt = DOUBLE_ALLOC(numForests); + + for (int i = 0; i < numForests; ++i) + { + gFrstCost[i] = 0; + gFrstWt[i] = 0; + if (sForests[i]) + { + gFrstCost[i] = sForests[i]->cost; + gFrstWt[i] = sForests[i]->weight; + } + } + + // reduce forest costs from all the grid; + MPI_Reduce(gFrstCost, gFrstCostAcc, numForests, MPI_DOUBLE, MPI_SUM, 0, grid3d->zscp.comm); + + if (!grid3d->zscp.Iam && !grid->iam) + { + printf("|Forest | weight | cost | weight/Cost | \n"); + for (int i = 0; i < numForests; ++i) + { + /* code */ + double wt, ct; + wt = 0.0; + ct = 0.0; + if (sForests[i]) + { + wt = sForests[i]->weight; + } + printf("|%d | %.2e | %.2e | %.2e |\n", i, wt, gFrstCostAcc[i], 1e-9 * wt / gFrstCostAcc[i] ); + + } + + double* crPathCost = DOUBLE_ALLOC(numForests); + double* crPathWeight = DOUBLE_ALLOC(numForests); + // print the critcal path + for (int i = numForests - 1; i > -1 ; --i) + { + crPathCost[i] = gFrstCostAcc[i]; + crPathWeight[i] = gFrstWt[i]; + + if (2 * i + 1 < numForests) + { + + if (crPathCost[2 * i + 1] > crPathCost[2 * i + 2]) + { + /* code */ + crPathCost[i] += crPathCost[2 * i + 1]; + crPathWeight[i] += crPathWeight[2 * i + 1]; + } + else + { + crPathCost[i] += crPathCost[2 * i + 2]; + crPathWeight[i] += crPathWeight[2 * i + 2]; + } + } + } + + + printf("|CritcalPath | %.2e | %.2e | %.2e |\n", crPathWeight[0], crPathCost[0], 1e-9 * crPathWeight[0] / crPathCost[0] ); + + double prsnCoeff = pearsonCoeff(numForests, gFrstCost, gFrstWt); + printf("|Pearsoncoefficient | %.3f |\n", prsnCoeff); + + printf("\n~~~mermaid \n"); + printf("\ngantt \n \ + \t\t dateFormat mm-ss \n\ + \t\t title TreeCost and Time Gantt Chart\n\n\n" ); + printf("\t Section Time\n"); + printGantt(0, numForests, "Time", 1.0 , gFrstCostAcc, crPathCost); + printf("\t Section Weight\n"); + printGantt(0, numForests, "weight", crPathCost[0]/crPathWeight[0] , gFrstWt, crPathWeight); + + printf("~~~\n\n\n"); + SUPERLU_FREE(crPathCost); + SUPERLU_FREE(crPathWeight); + + } + + SUPERLU_FREE( gFrstCost); + SUPERLU_FREE( gFrstCostAcc); + SUPERLU_FREE( gFrstWt); +} + + +void printGantt(int_t root, int_t numForests, char* nodename, double scale, double* gFrstCostAcc, double* crPathCost) +{ + + + if (2*root+1>=numForests) + { + /* if there are no more childrens*/ + printf("\t tree-%d \t:%s-%d, 0d, %.0fd \n", root,nodename, root, 100*scale*gFrstCostAcc[root] ); + } + else + { + 
printGantt(2*root+1, numForests, nodename, scale, gFrstCostAcc, crPathCost); + int_t depTree = crPathCost[2*root+1]> crPathCost[2*root+2]? 2*root+1:2*root+2; + printf("\t tree-%d %.2g \t:%s-%d, after %s-%d, %.0fd \n", root,100*scale*crPathCost[root], nodename, root, nodename, depTree, 100*scale*gFrstCostAcc[root] ); + printGantt(2*root+2, numForests, nodename, scale, gFrstCostAcc, crPathCost); + } + +} + +#define ABS(a) ((a)<0?-(a):a) +double getLoadImbalance(int_t nTrees, + int_t * treeIndx, // index of tree in gtrees + double * gTreeWeights) +{ + + if (nTrees < 1) + { + /* code */ + return 0; + } + double w1 = 0; + double w2 = 0; + + int_t* wSortIdx = getSortIndexDouble(nTrees, gTreeWeights); + // can not change weight array + w1 = gTreeWeights[wSortIdx[nTrees - 1]]; + + + for (int i = nTrees - 2 ; i > -1; --i) + { + /* code */ + if (w1 > w2) + { + /* code */ + w2 += gTreeWeights[wSortIdx[i]]; + + } + else + { + w1 += gTreeWeights[wSortIdx[i]]; + + } + } + + SUPERLU_FREE(wSortIdx); + return ABS(w2 - w1) / (w2 + w1); + // return trPart; + +} + + +// maximum allowed imbalance +#define ACCEPTABLE_TREE_IMBALANCE 0.2 + + +// r forest contains a list of tree heads +// each treehead is an entire subtree (all level beloe) +#define MAX_TREE_ALLOWED 1024 + +typedef struct +{ + int_t ntrees; + int_t* treeHeads; +} rForest_t; + +typedef struct +{ + sForest_t* Ans; + rForest_t* S[2]; +} forestPartition_t; + +void freeRforest(rForest_t* rforest) +{ + SUPERLU_FREE(rforest->treeHeads); +} + + +sForest_t* createForestNew(int_t numTrees, int_t nsupers, int_t * nodeCounts, int_t** NodeLists, int_t * setree, treeList_t* treeList) +{ + if (numTrees == 0) return NULL; + + sForest_t* forest = SUPERLU_MALLOC(sizeof(sForest_t)); + forest->numTrees = numTrees; + + double frWeight = 0; + int_t nodecount = 0; + for (int_t i = 0; i < numTrees; ++i) + { + nodecount += nodeCounts[i]; + frWeight += calcNodeListWeight(nodeCounts[i], NodeLists[i], treeList); + } + + forest->nNodes = nodecount; + forest->weight = frWeight; + + int_t* nodeList = INT_T_ALLOC(forest->nNodes); + + int_t ptr = 0; + for (int_t i = 0; i < numTrees; ++i) + { + for (int_t j = 0; j < nodeCounts[i]; ++j) + { + /* copy the loop */ + nodeList[ptr] = NodeLists[i][j]; + ptr++; + } + } + + forest->nodeList = nodeList; + forest->cost = 0.0; + + + // using the nodelist create factorization ordering + calcTopInfoForest(forest, nsupers, setree); + + return forest; +} + +void oneLeveltreeFrPartition( int_t nTrees, int_t * trCount, int_t** trList, + int_t * treeSet, + double * sWeightArr) +{ + if (nTrees < 1) + { + /* code */ + trCount[0] = 0; + trCount[1] = 0; + return; + } + double w1 = 0; + double w2 = 0; + + int_t* wSortIdx = getSortIndexDouble(nTrees, sWeightArr); + // treeIndx= permuteArr(nTrees, treeIndx, wSortIdx); + + int_t S1ptr = 0; + int_t S2ptr = 0; + + // can not change weight array + w1 = sWeightArr[wSortIdx[nTrees - 1]]; + trList[0][S1ptr++] = treeSet[wSortIdx[nTrees - 1]]; + + for (int i = nTrees - 2 ; i > -1; --i) + { + /* code */ + if (w1 > w2) + { + /* code */ + w2 += sWeightArr[wSortIdx[i]]; + trList[1][S2ptr++] = treeSet[wSortIdx[i]]; + } + else + { + w1 += sWeightArr[wSortIdx[i]]; + trList[0][S1ptr++] = treeSet[wSortIdx[i]]; + } + } + + trCount[0] = S1ptr; + trCount[1] = S2ptr; + + SUPERLU_FREE(wSortIdx); + +} + +forestPartition_t iterativeFrPartitioning(rForest_t* rforest, int_t nsupers, int_t * setree, treeList_t* treeList) +{ + + int_t nTreeSet = rforest->ntrees; + int_t* treeHeads = rforest->treeHeads; + + + int_t nAnc = 0; + int_t* 
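+    /* getLoadImbalance and oneLeveltreeFrPartition above use the classic
+     * greedy heuristic: visit trees in decreasing weight and give the next
+     * tree to the lighter side.  E.g. weights {9, 5, 4, 3}: side1 = 9,
+     * side2 = 5, then side2 = 5 + 4 = 9, then side1 = 9 + 3 = 12, for an
+     * imbalance of |12 - 9| / (12 + 9) ~= 0.14, below
+     * ACCEPTABLE_TREE_IMBALANCE = 0.2. */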
ancTreeCount = INT_T_ALLOC(MAX_TREE_ALLOWED); + int_t** ancNodeLists = SUPERLU_MALLOC(MAX_TREE_ALLOWED * sizeof(int_t*)); + + double * weightArr = DOUBLE_ALLOC (MAX_TREE_ALLOWED); + // int_t* treeSet = INT_T_ALLOC(nTreeSet); + int_t* treeSet = INT_T_ALLOC(MAX_TREE_ALLOWED); + + for (int i = 0; i < nTreeSet; ++i) + { + treeSet[i] = treeHeads[i]; + weightArr[i] = treeList[treeHeads[i]].iWeight; + } + + while (getLoadImbalance(nTreeSet, treeSet, weightArr) > ACCEPTABLE_TREE_IMBALANCE ) + { + // get index of maximum weight subtree + int_t idx = 0; + for (int i = 0; i < nTreeSet; ++i) + { + /* code */ + if (treeList[treeSet[i]].iWeight > treeList[treeSet[idx]].iWeight) + { + /* code */ + idx = i; + } + } + + + int_t MaxTree = treeSet[idx]; + int_t* sroots = getSubTreeRoots(MaxTree, treeList); + if (sroots[0] == -1) + { + /* code */ + SUPERLU_FREE(sroots); + break; + } + + ancTreeCount[nAnc] = getCommonAncsCount(MaxTree, treeList); + int_t * alist = INT_T_ALLOC (ancTreeCount[nAnc]); + getCommonAncestorList(MaxTree, alist, setree, treeList); + ancNodeLists[nAnc] = alist; + nAnc++; + + + treeSet[idx] = treeSet[nTreeSet - 1]; + weightArr[idx] = treeList[treeSet[idx]].iWeight; + treeSet[nTreeSet - 1] = sroots[0]; + weightArr[nTreeSet - 1] = treeList[treeSet[nTreeSet - 1]].iWeight; + treeSet[nTreeSet] = sroots[1]; + weightArr[nTreeSet] = treeList[treeSet[nTreeSet]].iWeight; + nTreeSet += 1; + + SUPERLU_FREE(sroots); + + if (nTreeSet == MAX_TREE_ALLOWED) + { + break; + } + } + + // Create the Ancestor forest + sForest_t* aforest = createForestNew(nAnc, nsupers, ancTreeCount, ancNodeLists, setree, treeList); + + + + // create the weight array; + double* sWeightArr = DOUBLE_ALLOC(nTreeSet); + for (int i = 0; i < nTreeSet ; ++i) + sWeightArr[i] = treeList[treeSet[i]].iWeight; + + int_t trCount[2] = {0, 0}; + int_t* trList[2]; + trList[0] = INT_T_ALLOC(nTreeSet); + trList[1] = INT_T_ALLOC(nTreeSet); + + oneLeveltreeFrPartition( nTreeSet, trCount, trList, + treeSet, + sWeightArr); + + rForest_t *rforestS1, *rforestS2; + rforestS1 = SUPERLU_MALLOC(sizeof(rforest)); + rforestS2 = SUPERLU_MALLOC(sizeof(rforest)); + + rforestS1->ntrees = trCount[0]; + rforestS1->treeHeads = trList[0]; + + rforestS2->ntrees = trCount[1]; + rforestS2->treeHeads = trList[1]; + + forestPartition_t frPr_t; + frPr_t.Ans = aforest; + frPr_t.S[0] = rforestS1; + frPr_t.S[1] = rforestS2; + + // free stuff + // int_t* ancTreeCount = INT_T_ALLOC(MAX_TREE_ALLOWED); + // int_t** ancNodeLists = SUPERLU_MALLOC(MAX_TREE_ALLOWED * sizeof(int_t*)); + + SUPERLU_FREE(weightArr); + SUPERLU_FREE (treeSet); + SUPERLU_FREE (sWeightArr); + + for (int i = 0; i < nAnc ; ++i) + { + /* code */ + SUPERLU_FREE(ancNodeLists[i]); + } + + SUPERLU_FREE(ancTreeCount); + SUPERLU_FREE(ancNodeLists); + + return frPr_t; +} + + + +sForest_t* r2sForest(rForest_t* rforest, int_t nsupers, int_t * setree, treeList_t* treeList) +{ + int_t nTree = rforest->ntrees; + + // quick return + if (nTree < 1) return NULL; + + int_t* treeHeads = rforest->treeHeads; + int_t* nodeCounts = INT_T_ALLOC(nTree); + int_t** NodeLists = SUPERLU_MALLOC(nTree * sizeof(int_t*)); + + for (int i = 0; i < nTree; ++i) + { + /* code */ + nodeCounts[i] = treeList[treeHeads[i]].numDescendents; + NodeLists[i] = INT_T_ALLOC(nodeCounts[i]); + getDescendList(treeHeads[i], NodeLists[i], treeList); + } + + + sForest_t* sforest = createForestNew(nTree, nsupers, nodeCounts, NodeLists, setree, treeList); + + for (int i = 0; i < nTree; ++i) + { + /* code */ + SUPERLU_FREE(NodeLists[i]); + } + + 
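+    /* Caution: in iterativeFrPartitioning above, the allocations
+     *   rforestS1 = SUPERLU_MALLOC(sizeof(rforest));
+     * take the size of a pointer, not of the struct; they presumably
+     * should read
+     *   rforestS1 = SUPERLU_MALLOC(sizeof(rForest_t));
+     *   rforestS2 = SUPERLU_MALLOC(sizeof(rForest_t));
+     * so that a full rForest_t is reserved for each sub-forest. */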
SUPERLU_FREE(NodeLists); + SUPERLU_FREE(nodeCounts); + + return sforest; +} + + +sForest_t** getGreedyLoadBalForests( int_t maxLvl, int_t nsupers, int_t * setree, treeList_t* treeList) +{ + + // assert(maxLvl == 2); + int_t numForests = (1 << maxLvl) - 1; + sForest_t** sForests = (sForest_t** ) SUPERLU_MALLOC (numForests * sizeof (sForest_t*)); + + int_t numRForests = SUPERLU_MAX( (1 << (maxLvl - 1)) - 1, 1) ; + rForest_t* rForests = SUPERLU_MALLOC (numRForests * sizeof (rForest_t)); + + // intialize rfortes[0] + int_t nRootTrees = 0; + + for (int i = 0; i < nsupers; ++i) + { + /* code */ + if (setree[i] == nsupers) nRootTrees++; + + } + + rForests[0].ntrees = nRootTrees; + rForests[0].treeHeads = INT_T_ALLOC(nRootTrees); + + nRootTrees = 0; + for (int i = 0; i < nsupers; ++i) + { + /* code */ + if (setree[i] == nsupers) + { + rForests[0].treeHeads[nRootTrees] = i; + nRootTrees++; + } + + } + + + if (maxLvl == 1) + { + /* code */ + sForests[0] = r2sForest(&rForests[0], nsupers, setree, treeList); + return sForests; + } + + // now loop over level + for (int_t lvl = 0; lvl < maxLvl - 1; ++lvl) + { + /* loop over all r forest in this level */ + int_t lvlSt = (1 << lvl) - 1; + int_t lvlEnd = (1 << (lvl + 1)) - 1; + + for (int_t tr = lvlSt; tr < lvlEnd; ++tr) + { + /* code */ + forestPartition_t frPr_t = iterativeFrPartitioning(&rForests[tr], nsupers, setree, treeList); + sForests[tr] = frPr_t.Ans; + + if (lvl == maxLvl - 2) + { + /* code */ + sForests[2 * tr + 1] = r2sForest(frPr_t.S[0], nsupers, setree, treeList); + sForests[2 * tr + 2] = r2sForest(frPr_t.S[1], nsupers, setree, treeList); + } + else + { + rForests[2 * tr + 1] = *(frPr_t.S[0]); + rForests[2 * tr + 2] = *(frPr_t.S[1]); + } + + } + + } + + for (int i = 0; i < numRForests; ++i) + { + /* code */ + freeRforest(&rForests[i]); + } + + SUPERLU_FREE(rForests); + + + + return sForests; + +} + +// balanced forests at one level +sForest_t** getOneLevelBalForests( int_t maxLvl, int_t nsupers, int_t * setree, treeList_t* treeList) +{ + + // assert(maxLvl == 2); + int_t numForests = (1 << maxLvl) - 1; + sForest_t** sForests = (sForest_t** ) SUPERLU_MALLOC (numForests * sizeof (sForest_t*)); + + int_t numRForests = SUPERLU_MAX( (1 << (maxLvl - 1)) - 1, 1) ; + rForest_t* rForests = SUPERLU_MALLOC (numRForests * sizeof (rForest_t)); + + // intialize rfortes[0] + int_t nRootTrees = 0; + + for (int i = 0; i < nsupers; ++i) + { + /* code */ + if (setree[i] == nsupers) + { + nRootTrees += 2; + } + + } + + rForests[0].ntrees = nRootTrees; + rForests[0].treeHeads = INT_T_ALLOC(nRootTrees); + + nRootTrees = 0; + for (int i = 0; i < nsupers; ++i) + { + /* code */ + if (setree[i] == nsupers) + { + rForests[0].treeHeads[nRootTrees] = i; + nRootTrees++; + } + + } + + + if (maxLvl == 1) + { + /* code */ + sForests[0] = r2sForest(&rForests[0], nsupers, setree, treeList); + return sForests; + } + + // now loop over level + for (int_t lvl = 0; lvl < maxLvl - 1; ++lvl) + { + /* loop over all r forest in this level */ + int_t lvlSt = (1 << lvl) - 1; + int_t lvlEnd = (1 << (lvl + 1)) - 1; + + for (int_t tr = lvlSt; tr < lvlEnd; ++tr) + { + /* code */ + forestPartition_t frPr_t = iterativeFrPartitioning(&rForests[tr], nsupers, setree, treeList); + sForests[tr] = frPr_t.Ans; + + if (lvl == maxLvl - 2) + { + /* code */ + sForests[2 * tr + 1] = r2sForest(frPr_t.S[0], nsupers, setree, treeList); + sForests[2 * tr + 2] = r2sForest(frPr_t.S[1], nsupers, setree, treeList); + } + else + { + rForests[2 * tr + 1] = *(frPr_t.S[0]); + rForests[2 * tr + 2] = *(frPr_t.S[1]); + 
} + + } + + } + + for (int i = 0; i < numRForests; ++i) + { + /* code */ + freeRforest(&rForests[i]); + } + + SUPERLU_FREE(rForests); + + + + return sForests; + +} diff --git a/SRC/supernodal_etree.c b/SRC/supernodal_etree.c new file mode 100644 index 00000000..78757a37 --- /dev/null +++ b/SRC/supernodal_etree.c @@ -0,0 +1,1008 @@ +/*function to generate supernodal etree*/ +#include +#include +#include "superlu_ddefs.h" +//#include "supernodal_etree.h" + +#define INT_T_ALLOC(x) ((int_t *) SUPERLU_MALLOC ( (x) * sizeof (int_t))) +int_t log2i(int_t index) +{ + int_t targetlevel = 0; + while (index >>= 1) ++targetlevel; + return targetlevel; +} + +/** + * Returns Supernodal Elimination Tree + * @param nsuper Number of Supernodes + * @param etree Scalar elimination tree + * @param supno Vertex to supernode mapping + * @param xsup Supernodal boundaries + * @return Supernodal elimination tree + */ +int_t *supernodal_etree(int_t nsuper, int_t * etree, int_t* supno, int_t *xsup) +{ + int_t *setree = malloc(sizeof(int_t) * nsuper); + /*initialzing the loop*/ + for (int i = 0; i < nsuper; ++i) + { + setree[i] = nsuper; + } + /*calculating the setree*/ + for (int i = 0; i < nsuper - 1; ++i) + { + int_t ftree = etree[xsup[i + 1] - 1]; + if (ftree < xsup[nsuper]) + { + setree[i] = supno[etree[xsup[i + 1] - 1]]; + } + } + return setree; +} +/*takes supernodal elimination tree and for each +supernode calculates "level" in elimination tree*/ +int_t* topological_ordering(int_t nsuper, int_t* setree) +{ + int_t *tsort_setree = malloc(sizeof(int_t) * nsuper); + for (int i = 0; i < nsuper; ++i) + { + tsort_setree[i] = 0; /*initializing all levels to zero*/ + } + for (int i = 0; i < nsuper - 1; ++i) + { + /*level of parent = MAX(level_of_children()+1)*/ + tsort_setree[setree[i]] = SUPERLU_MAX(tsort_setree[setree[i]], tsort_setree[i] + 1); + } + return tsort_setree; +} + + +treeList_t* setree2list(int_t nsuper, int_t* setree ) +{ + treeList_t* treeList = (treeList_t* ) malloc (sizeof(treeList_t) * (nsuper + 1)); + // initialize the struct + for (int i = 0; i < nsuper + 1; ++i) + { + treeList[i].numChild = 0; + treeList[i].numDescendents = 1; /*numdescen includes myself*/ + treeList[i].left = -1; + treeList[i].right = -1; + treeList[i].right = -1; + treeList[i].depth = 0; + } + for (int i = 0; i < nsuper; ++i) + { + // updating i-th supernodes parents + int_t parenti = setree[i]; + treeList[parenti].numDescendents += treeList[i].numDescendents; + treeList[parenti].numChild++; + } + /*allocate memory for children lists*/ + for (int i = 0; i < nsuper + 1; ++i) + { + treeList[i].childrenList = INT_T_ALLOC (treeList[i].numChild); + treeList[i].numChild = 0; + } + + for (int i = 0; i < nsuper; ++i) + { + // updating i-th supernodes parents + int_t parenti = setree[i]; + treeList[parenti].childrenList[treeList[parenti].numChild] = i; + treeList[parenti].numChild++; + } + return treeList; +} /* setree2list */ + +int_t estimateWeight(int_t nsupers, int_t*setree, treeList_t* treeList, int_t* xsup) +{ + if (getenv("WF")) + { + if (strcmp(getenv("WF"), "One" ) == 0) + { + for (int i = 0; i < nsupers; ++i) + { + treeList[i].weight = 1.0; + } + } + else if (strcmp(getenv("WF"), "Ns" ) == 0) + { + for (int i = 0; i < nsupers; ++i) + { + double sz = 1.0 * SuperSize(i); + treeList[i].weight = sz; + } + } + else if (strcmp(getenv("WF"), "NsDep" ) == 0) + { + for (int i = 0; i < nsupers; ++i) + { + double dep = 1.0 * treeList[i].depth ; + double sz = 1.0 * SuperSize(i); + treeList[i].weight = sz * dep; + } + } + else if 
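+        /* A small worked example of the two routines above: with nsuper = 3
+         * and setree = {2, 2, 3} (supernodes 0 and 1 feed 2, which is a
+         * root), topological_ordering yields levels {0, 0, 1}.  Note also
+         * that setree2list initializes treeList[i].right twice; the second
+         * assignment was presumably meant for treeList[i].extra, which
+         * getLastDepBtree later reads. */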
(strcmp(getenv("WF"), "NsDep2" ) == 0) + { + for (int i = 0; i < nsupers; ++i) + { + double dep = 1.0 * treeList[i].depth ; + double sz = 1.0 * SuperSize(i); + treeList[i].weight = 3 * sz * dep * (sz + dep) + sz * sz * sz ; + + } + + } + else + { + for (int i = 0; i < nsupers; ++i) + { + treeList[i].weight = treeList[i].scuWeight; + } + } + } + else + { + for (int i = 0; i < nsupers; ++i) + { + treeList[i].weight = treeList[i].scuWeight; + + } + } +} /* estimateWeight */ + + +int_t calcTreeWeight(int_t nsupers, int_t*setree, treeList_t* treeList, int_t* xsup) +{ + + // initializing naive weight + for (int i = 0; i < nsupers; ++i) + { + treeList[i].depth = 0; + } + + for (int i = nsupers - 1; i > -1; --i) + { + /* code */ + int_t myDep = treeList[i].depth; + for (int cIdx = 0; cIdx < treeList[i].numChild; ++cIdx) + { + /* code */ + int_t child = treeList[i].childrenList[cIdx]; + treeList[child].depth = myDep + SuperSize(i) ; + + } + } + + + // for (int i = 0; i < nsupers; ++i) + // { + + // // treeList[i].weight = 1.0 * treeList[i].numDescendents; + // double dep = 1.0 * treeList[i].depth ; + // double sz = 1.0 * SuperSize(i); + // treeList[i].weight = 1.0; + // treeList[i].weight = sz; + // treeList[i].weight = sz * sz * sz; + // treeList[i].weight = 3 * sz * dep * (sz + dep) + sz * sz * sz ; + // treeList[i].weight = treeList[i].scuWeight; + // // treeList[i].treeWeight = treeList[i].weight; + // // treeList[i].depth = 0; + // } + + estimateWeight(nsupers, setree, treeList, xsup); + + for (int i = 0; i < nsupers; ++i) + { + treeList[i].iWeight = treeList[i].weight; + } + + + for (int i = 0; i < nsupers; ++i) + { + int_t parenti = setree[i]; + treeList[parenti].iWeight += treeList[i].iWeight; + } + + + return 0; + +} /* calcTreeWeight */ + + +int_t printFileList(char* sname, int_t nnodes, int_t*dlist, int_t*setree) +{ + FILE* fp = fopen(sname, "w"); + /*beginning of the file */ + fprintf(fp, "//dot file generated by pdgstrf\n"); + fprintf(fp, "digraph elimination_tree {\n"); + for (int i = 0; i < nnodes; ++i) + { + /* code */ + fprintf(fp, "%lld -> %lld;\n", dlist[i], setree[dlist[i]]); + } + /*end of the file */ + fprintf(fp, "}\n"); + fprintf(fp, "//EOF\n"); + fclose(fp); +} + +int_t getDescendList(int_t k, int_t*dlist, treeList_t* treeList) +// post order traversal +{ + if (k < 0) return 0; + + int_t cDesc = 0; + + for (int_t child = 0; child < treeList[k].numChild; ++child) + { + /* code */ + int_t nChild = treeList[k].childrenList[child]; + cDesc += getDescendList(nChild, dlist + cDesc, treeList); + } + + dlist[cDesc] = k; + return cDesc + 1; +} + + +int_t getCommonAncsCount(int_t k, treeList_t* treeList) +{ + // given a supernode k, give me the list of ancestors nodes + int_t cur = k; + int_t count = 1; + while (treeList[cur].numChild == 1) + { + cur = treeList[cur].childrenList[0]; + count++; + } + return count; +} +int_t getCommonAncestorList(int_t k, int_t* alist, int_t* seTree, treeList_t* treeList) +{ + // given a supernode k, give me the list of ancestors nodes + int_t cur = k; + int_t count = 1; + while (treeList[cur].numChild == 1) + { + cur = treeList[cur].childrenList[0]; + count++; + } + + + alist[0] = cur; + for (int i = 1; i < count; ++i) + { + /* code */ + alist[i] = seTree[cur]; + cur = seTree[cur]; + } + return count; +} + +int cmpfunc (const void * a, const void * b) +{ + return ( *(int_t*)a - * (int_t*)b ); +} + +int_t* getPermNodeList(int_t nnode, // number of nodes + int_t* nlist, int_t* perm_c_sup, int_t* iperm_c_sup) +//from list of nodes, get permutation of 
factorization +{ + int_t* perm_l = (int_t* ) SUPERLU_MALLOC(sizeof(int_t) * nnode); + int_t* iperm_l = (int_t* ) SUPERLU_MALLOC(sizeof(int_t) * nnode); + for (int_t i = 0; i < nnode; ++i) + { + /* code */ + // printf("%d %d %d\n",i, nlist[i],iperm_c_sup[nlist[i]] ); + iperm_l[i] = iperm_c_sup[nlist[i]]; //order of factorization + } + qsort(iperm_l, nnode, sizeof(int_t), cmpfunc); + + for (int_t i = 0; i < nnode; ++i) + { + /* code */ + perm_l[i] = perm_c_sup[iperm_l[i]]; //order of factorization + } + SUPERLU_FREE(iperm_l); + return perm_l; +} +int_t* getEtreeLB(int_t nnodes, int_t* perm_l, int_t* gTopOrder) +// calculates EtreeLB boundaries for given list of nodes, via perm_l +{ + //calculate minimum and maximum topOrder + int_t minTop, maxTop; + minTop = gTopOrder[perm_l[0]]; + maxTop = gTopOrder[perm_l[nnodes - 1]]; + int_t numLB = maxTop - minTop + 2; + int_t* lEtreeLB = (int_t *) malloc( sizeof(int_t) * numLB); + for (int i = 0; i < numLB; ++i) + { + /* initalize */ + lEtreeLB[i] = 0; + } + lEtreeLB[0] = 0; + int_t curLevel = minTop; + int_t curPtr = 1; + for (int i = 0; i < nnodes ; ++i) + { + /* code */ + if (curLevel != gTopOrder[perm_l[i]]) + { + /* creset */ + curLevel = gTopOrder[perm_l[i]]; + lEtreeLB[curPtr] = i; + curPtr++; + } + } + lEtreeLB[curPtr] = lEtreeLB[curPtr - 1] + 1; + printf("numLB=%d curPtr=%d \n", numLB, curPtr); + for (int i = 0; i < numLB; ++i) + { + printf("%d ", lEtreeLB[i]); + } + + return lEtreeLB; +} + +int_t* getSubTreeRoots(int_t k, treeList_t* treeList) +{ + int_t* srootList = (int_t* ) SUPERLU_MALLOC(sizeof(int_t) * 2); + int_t cur = k; + while (treeList[cur].numChild == 1 && cur > 0) + { + cur = treeList[cur].childrenList[0]; + } + + if (treeList[cur].numChild == 2) + { + /* code */ + srootList[0] = treeList[cur].childrenList[0]; + srootList[1] = treeList[cur].childrenList[1]; + // printf("Last node =%d, numchilds=%d, desc[%d] = %d, desc[%d] = %d \n ", + // cur, treeList[cur].numChild, + // srootList[0], treeList[srootList[0]].numDescendents, + // srootList[1], treeList[srootList[1]].numDescendents ); + } + else + { + /* code */ + srootList[0] = -1; + srootList[1] = -1; + } + + return srootList; +} + +int_t testSubtreeNodelist(int_t nsupers, int_t numList, int_t** nodeList, int_t* nodeCount) +// tests disjoint and union +{ + int_t* slist = (int_t* ) malloc(sizeof(int_t) * nsupers); + /*intialize each entry with zero */ + for (int_t i = 0; i < nsupers; ++i) + { + /* code */ + slist[i] = 0; + } + for (int_t list = 0; list < numList; ++list) + { + /* code */ + for (int_t nd = 0; nd < nodeCount[list]; ++nd) + { + slist[nodeList[list][nd]]++; + } + } + + for (int_t i = 0; i < nsupers; ++i) + { + /* code */ + assert(slist[i] == 1); + } + printf("testSubtreeNodelist Passed\n"); + free(slist); + return 0; +} +int_t testListPerm(int_t nodeCount, int_t* nodeList, int_t* permList, int_t* gTopLevel) +{ + // checking monotonicity + for (int_t i = 0; i < nodeCount - 1; ++i) + { + if (!( gTopLevel[permList[i]] <= gTopLevel[permList[i + 1]])) + { + /* code */ + printf("%d : %d (%d) %d (%d)\n", i, + permList[i], gTopLevel[permList[i]], + permList[i + 1], gTopLevel[permList[i + 1]] ); + } + assert( gTopLevel[permList[i]] <= gTopLevel[permList[i + 1]]); + } + + int_t* slist = (int_t* ) malloc(sizeof(int_t) * nodeCount); + int_t* plist = (int_t* ) malloc(sizeof(int_t) * nodeCount); + // copy lists + for (int_t i = 0; i < nodeCount; ++i) + { + slist[i] = nodeList[i]; + plist[i] = permList[i]; + } + // sort them + qsort(slist, nodeCount, sizeof(int_t), cmpfunc); + 
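+    /* testListPerm checks two properties of permList: (1) topological
+     * levels are monotone non-decreasing, verified above, and (2) permList
+     * is a permutation of nodeList, verified below by sorting both and
+     * comparing element-wise.  (Separately: the debug printf's in this file
+     * pass int_t to "%d", which is only portable while int_t is int; with a
+     * 64-bit int_t a wide format such as SuperLU's IFMT macro is needed.) */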
qsort(plist, nodeCount, sizeof(int_t), cmpfunc); + for (int_t i = 0; i < nodeCount; ++i) + { + assert( slist[i] == plist[i]); + } + printf("permList Test Passed\n"); + free(slist); + free(plist); + return 0; +} + + +int_t mergPermTest(int_t nperms, int_t* gperms, int_t* nnodes); +int_t* merg_perms(int_t nperms, int_t* nnodes, int_t** perms) +{ + // merges three permutations + int_t nn = 0; + //add permutations + for (int i = 0; i < nperms; ++i) + { + nn += nnodes[i]; + } + // alloc address + int_t* gperm = (int_t*) malloc(nn * sizeof(int_t)); + //now concatenat arrays + int_t ptr = 0; + for (int_t tr = 0; tr < nperms; ++tr) + { + /* code */ + for (int_t nd = 0; nd < nnodes[tr]; ++nd) + { + /* code */ + gperm[ptr] = perms[tr][nd]; + printf("%d %d %d %d\n", tr, ptr, nd, perms[tr][nd] ); + ptr++; + } + } + mergPermTest( nperms, gperm, nnodes); + return gperm; +} +int_t mergPermTest(int_t nperms, int_t* gperms, int_t* nnodes) +{ + // merges three permutations + int_t nn = 0; + //add permutations + for (int i = 0; i < nperms; ++i) + { + nn += nnodes[i]; + } + // alloc address + int_t* tperm = (int_t*) malloc(nn * sizeof(int_t)); + for (int i = 0; i < nn; ++i) + { + tperm[i] = 0; + } + for (int i = 0; i < nn; ++i) + { + /* code */ + printf("%d %d \n", i, gperms[i] ); + tperm[gperms[i]]++; + } + for (int i = 0; i < nn; ++i) + { + /* code */ + assert(tperm[i] == 1); + } + free(tperm); + return nn; +} + +int* getLastDep(gridinfo_t *grid, SuperLUStat_t *stat, + superlu_dist_options_t *options, + LocalLU_t *Llu, int_t* xsup, + int_t num_look_aheads, int_t nsupers, int_t * iperm_c_supno) +{ + /* constructing look-ahead table to indicate the last dependency */ + int_t iam = grid->iam; + int_t Pc = grid->npcol; + int_t Pr = grid->nprow; + int_t myrow = MYROW (iam, grid); + int_t mycol = MYCOL (iam, grid); + int_t ncb = nsupers / Pc; + int_t nrb = nsupers / Pr; + stat->num_look_aheads = num_look_aheads; + int* look_ahead_l = SUPERLU_MALLOC (nsupers * sizeof (int)); + int* look_ahead = SUPERLU_MALLOC (nsupers * sizeof (int)); + for (int_t lb = 0; lb < nsupers; lb++) + look_ahead_l[lb] = -1; + /* go through U-factor */ + for (int_t lb = 0; lb < nrb; ++lb) + { + int_t ib = lb * Pr + myrow; + int_t* index = Llu->Ufstnz_br_ptr[lb]; + if (index) /* Not an empty row */ + { + int_t k = BR_HEADER; + for (int_t j = 0; j < index[0]; ++j) + { + int_t jb = index[k]; + if (jb != ib) + look_ahead_l[jb] = + SUPERLU_MAX (iperm_c_supno[ib], look_ahead_l[jb]); + k += UB_DESCRIPTOR + SuperSize (index[k]); + } + } + } + if (myrow < nsupers % grid->nprow) + { + int_t ib = nrb * Pr + myrow; + int_t* index = Llu->Ufstnz_br_ptr[nrb]; + if (index) /* Not an empty row */ + { + int_t k = BR_HEADER; + for (int_t j = 0; j < index[0]; ++j) + { + int_t jb = index[k]; + if (jb != ib) + look_ahead_l[jb] = + SUPERLU_MAX (iperm_c_supno[ib], look_ahead_l[jb]); + k += UB_DESCRIPTOR + SuperSize (index[k]); + } + } + } + if (options->SymPattern == NO) + { + /* go through L-factor */ + for (int_t lb = 0; lb < ncb; lb++) + { + int_t ib = lb * Pc + mycol; + int_t* index = Llu->Lrowind_bc_ptr[lb]; + if (index) + { + int_t k = BC_HEADER; + for (int_t j = 0; j < index[0]; j++) + { + int_t jb = index[k]; + if (jb != ib) + look_ahead_l[jb] = + SUPERLU_MAX (iperm_c_supno[ib], look_ahead_l[jb]); + k += LB_DESCRIPTOR + index[k + 1]; + } + } + } + if (mycol < nsupers % grid->npcol) + { + int_t ib = ncb * Pc + mycol; + int_t* index = Llu->Lrowind_bc_ptr[ncb]; + if (index) + { + int_t k = BC_HEADER; + for (int_t j = 0; j < index[0]; j++) + { + int_t jb = 
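+                    /* look_ahead[jb] records the latest supernode, in
+                     * factorization order iperm_c_supno, whose panel updates
+                     * block column jb: each rank scans its local U blocks
+                     * (and, for unsymmetric patterns, L blocks), and the
+                     * MPI_Allreduce with MPI_MAX below merges the per-rank
+                     * tables into the global look-ahead table. */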
index[k]; + if (jb != ib) + look_ahead_l[jb] = + SUPERLU_MAX (iperm_c_supno[ib], look_ahead_l[jb]); + k += LB_DESCRIPTOR + index[k + 1]; + } + } + } + } + MPI_Allreduce (look_ahead_l, look_ahead, nsupers, MPI_INT, MPI_MAX, + grid->comm); + SUPERLU_FREE (look_ahead_l); + return look_ahead; +} + +int* getLastDepBtree( int_t nsupers, treeList_t* treeList) +{ + int* look_ahead = SUPERLU_MALLOC (nsupers * sizeof (int)); + for (int i = 0; i < nsupers; ++i) + { + look_ahead[i] = -1; + } + for (int k = 0; k < nsupers; ++k) + { + /* code */ + for (int_t child = 0; child < treeList[k].numChild; ++child) + { + /* code */ + switch ( child) + { + case 0: + look_ahead[k] = SUPERLU_MAX(look_ahead[k], treeList[k].left); + break; + case 1: + look_ahead[k] = SUPERLU_MAX(look_ahead[k], treeList[k].right); + break; + case 2: + look_ahead[k] = SUPERLU_MAX(look_ahead[k], treeList[k].extra); + break; + default: + break; + } + } + } + return look_ahead; +} + +int_t* getGlobal_iperm(int_t nsupers, int_t nperms, // number of permutations + int_t** perms, // array of permutations + int_t* nnodes // number of nodes in each permutation + ) +{ + int_t* gperm = SUPERLU_MALLOC (nsupers * sizeof (int_t)); + int_t* giperm = SUPERLU_MALLOC (nsupers * sizeof (int_t)); + int_t ptr = 0; + for (int_t perm = 0; perm < nperms; ++perm) + { + /* code */ + for (int_t node = 0; node < nnodes[perm]; ++node) + { + /* code */ + gperm[ptr] = perms[perm][node]; + ptr++; + } + } + assert(ptr == nsupers); + for (int_t i = 0; i < nsupers; ++i) + { + giperm[gperm[i]] = i; + } + SUPERLU_FREE(gperm); + return giperm; +} +int_t* getTreeHeads(int_t maxLvl, int_t nsupers, treeList_t* treeList) +{ + int_t numTrees = (1 << maxLvl) - 1; + int_t* treeHeads = SUPERLU_MALLOC (numTrees * sizeof (int_t)); + // for (int i = 0; i < numTrees; ++i) + // { + // /* code */ + // treeHeads[i]=0; + // } + treeHeads[0] = nsupers - 1; + for (int_t lvl = 0; lvl < maxLvl - 1; ++lvl) + { + /* code */ + int_t st = (1 << lvl) - 1; + int_t end = 2 * st + 1; + for (int_t i = st; i < end; ++i) + { + /* code */ + int_t * sroots; + sroots = getSubTreeRoots(treeHeads[i], treeList); + treeHeads[2 * i + 1] = sroots[0]; + treeHeads[2 * i + 2] = sroots[1]; + SUPERLU_FREE(sroots); + } + } + return treeHeads; +} + +int_t* calcNumNodes(int_t maxLvl, int_t* treeHeads, treeList_t* treeList) +{ + int_t numTrees = (1 << maxLvl) - 1; + int_t* nnodes = SUPERLU_MALLOC (numTrees * sizeof (int_t)); + for (int_t i = 0; i < numTrees; ++i) + { + /* code */ + if (treeHeads[i] > -1) + { + /* code */ + nnodes[i] = treeList[treeHeads[i]].numDescendents; + } + else + { + nnodes[i] = 0; + } + + } + for (int_t i = 0; i < numTrees / 2 ; ++i) + { + /* code */ + nnodes[i] -= (nnodes[2 * i + 1] + nnodes[2 * i + 2]); + } + return nnodes; +} + +int_t** getNodeList(int_t maxLvl, int_t* setree, int_t* nnodes, + int_t* treeHeads, treeList_t* treeList) +{ + int_t numTrees = (1 << maxLvl) - 1; + int_t** nodeList = SUPERLU_MALLOC (numTrees * sizeof (int_t*)); + for (int_t i = 0; i < numTrees; ++i) + { + /* code */ + if (nnodes[i] > 0) + { + nodeList[i] = SUPERLU_MALLOC (nnodes[i] * sizeof (int_t)); + assert(nodeList[i]); + } + else + { + nodeList[i] = NULL; + } + + } + + for (int_t lvl = 0; lvl < maxLvl - 1; ++lvl) + { + /* code */ + int_t st = (1 << lvl) - 1; + int_t end = 2 * st + 1; + for (int_t i = st; i < end; ++i) + { + /* code */ + if (nodeList[i]) + getCommonAncestorList(treeHeads[i], nodeList[i], setree, treeList); + } + } + + int_t st = (1 << (maxLvl - 1)) - 1; + int_t end = 2 * st + 1; + for (int_t i = 
st; i < end; ++i) + { + /* code */ + getDescendList(treeHeads[i], nodeList[i], treeList); + } + return nodeList; +} + +int_t* getGridTrees( gridinfo3d_t* grid3d) +{ + int_t maxLvl = log2i(grid3d->zscp.Np) + 1; + int_t* myTreeIdx = (int_t*) SUPERLU_MALLOC (maxLvl * sizeof (int_t)); + myTreeIdx[0] = grid3d->zscp.Np - 1 + grid3d->zscp.Iam ; + for (int i = 1; i < maxLvl; ++i) + { + /* code */ + myTreeIdx[i] = (myTreeIdx[i - 1] - 1) / 2; + } + return myTreeIdx; +} + +int_t* getReplicatedTrees( gridinfo3d_t* grid3d) +{ + int_t maxLvl = log2i(grid3d->zscp.Np) + 1; + int_t* myZeroTrIdxs = (int_t*) SUPERLU_MALLOC (maxLvl * sizeof (int_t)); + for (int i = 0; i < maxLvl; ++i) + { + /* code */ + if (grid3d->zscp.Iam % (1 << i) ) + { + myZeroTrIdxs[i] = 1; + } + else + { + myZeroTrIdxs[i] = 0; + } + } + return myZeroTrIdxs; +} + + +int_t* getMyIperm(int_t nnodes, int_t nsupers, int_t* myPerm) +{ + if (nnodes < 0) return NULL; + int_t* myIperm = INT_T_ALLOC(nsupers); + for (int_t i = 0; i < nsupers; ++i) + { + /* code */ + myIperm[i] = -1; + } + for (int_t i = 0; i < nnodes; ++i) + { + /* code */ + assert(myPerm[i] < nsupers); + myIperm[myPerm[i]] = i; + } + return myIperm; +} +int_t* getMyTopOrder(int_t nnodes, int_t* myPerm, int_t* myIperm, int_t* setree ) +{ + if (nnodes < 0) return NULL; + int_t* myTopOrder = INT_T_ALLOC(nnodes); + for (int_t i = 0; i < nnodes; ++i) + { + myTopOrder[i] = 0; /*initializing all levels to zero*/ + } + for (int_t i = 0; i < nnodes - 1; ++i) + { + /*level of parent = MAX(level_of_children()+1)*/ + int_t inode = myPerm[i]; + int_t iparent = setree[inode]; + int_t iparentIdx = myIperm[iparent]; + // if(iparentIdx >= nnodes) printf("%d %d %d %d \n", inode, iparent, nnodes, iparentIdx); + // assert(iparentIdx < nnodes); + // if (iparentIdx != -1) + if (0<= iparentIdx && iparentIdx 0) + { + /* code */ + st = Etree_LvlBdry[i]; + } + for (int_t j = st; j < nsuper; ++j) + { + /* code */ + if (tsort_etree[perm[j]] == i + 1) + { + /* code */ + Etree_LvlBdry[i + 1] = j; + break; + } + } + } + Etree_LvlBdry[max_level] = nsuper; + return Etree_LvlBdry; +} + +int_t* calculate_num_children(int_t nsuper, int_t* setree) +{ + int_t* etree_num_children = malloc(sizeof(int_t) * (nsuper)); + for (int_t i = 0; i < nsuper; ++i) + { + /*initialize num children to zero*/ + etree_num_children[i] = 0; + } + for (int_t i = 0; i < nsuper; i++) + { + if (setree[i] < nsuper) + etree_num_children[setree[i]]++; + } + return etree_num_children; +} +void Print_EtreeLevelBoundry(int_t *Etree_LvlBdry, int_t max_level, int_t nsuper) +{ + for (int i = 0; i < max_level; ++i) + { + int_t st = 0; + int_t ed = nsuper; + st = Etree_LvlBdry[i]; + ed = Etree_LvlBdry[i + 1]; + printf("Level %d, NumSuperNodes=%d,\t Start=%d end=%d\n", i, ed - st, st, ed); + } +} + +void print_etree_leveled(int_t *setree, int_t* tsort_etree, int_t nsuper) +{ + FILE* fp = fopen("output_sorted.dot", "w"); + int_t max_level = tsort_etree[nsuper - 1]; + /*beginning of the file */ + fprintf(fp, "//dot file generated by pdgstrf\n"); + fprintf(fp, "digraph elimination_tree {\n"); + fprintf(fp, "labelloc=\"t\";\n"); + fprintf(fp, "label=\"Depth of the tree is %d\";\n", max_level); + + for (int i = 0; i < nsuper - 1; ++i) + { + /* code */ + // fprintf(fp, "%lld -> %lld;\n",iperm[i],iperm[setree[i]]); + fprintf(fp, "%lld -> %lld;\n", i, setree[i]); + } + /*adding rank information*/ + for (int i = 0; i < max_level; ++i) + { + fprintf(fp, "{ rank=same; "); + for (int j = 0; j < nsuper; ++j) + { + if (tsort_etree[j] == i) + fprintf(fp, "%lld ", j); 
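+            /* all supernodes on elimination level i go into one rank=same
+               block, so dot draws them on the same horizontal level */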
+ } + fprintf(fp, "}\n"); + } + /*end of the file */ + fprintf(fp, "}\n"); + fprintf(fp, "//EOF\n"); + fclose(fp); +} + + +void printEtree(int_t nsuper, int_t *setree, treeList_t* treeList) +{ + FILE* fp = fopen("output_sorted.dot", "w"); + // int_t max_level = tsort_etree[nsuper - 1]; + /*beginning of the file */ + fprintf(fp, "//dot file generated by pdgstrf\n"); + fprintf(fp, "digraph elimination_tree {\n"); + // fprintf(fp, "labelloc=\"t\";\n"); + // fprintf(fp, "label=\"Depth of the tree is %d\";\n", max_level); + + for (int i = 0; i < nsuper - 1; ++i) + { + /* code */ + // fprintf(fp, "%lld -> %lld;\n",iperm[i],iperm[setree[i]]); + fprintf(fp, " \"%lld|%lld\" -> \"%lld|%lld\";\n", i, treeList[i].depth, + setree[i], treeList[setree[i]].depth); + } + + /*end of the file */ + fprintf(fp, "}\n"); + fprintf(fp, "//EOF\n"); + fclose(fp); +} + + +void print_etree(int_t *setree, int_t* iperm, int_t nsuper) +{ + FILE* fp = fopen("output.dot", "w"); + /*beginning of the file */ + fprintf(fp, "//dot file generated by pdgstrf\n"); + fprintf(fp, "digraph elimination_tree {\n"); + for (int i = 0; i < nsuper; ++i) + { + /* code */ + fprintf(fp, "%lld -> %lld;\n", iperm[i], iperm[setree[i]]); + } + /*end of the file */ + fprintf(fp, "}\n"); + fprintf(fp, "//EOF\n"); + fclose(fp); +} diff --git a/SRC/treeFactorization.c b/SRC/treeFactorization.c new file mode 100644 index 00000000..9c242582 --- /dev/null +++ b/SRC/treeFactorization.c @@ -0,0 +1,366 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. 
+*/ + +#include "superlu_ddefs.h" +#if 0 +#include "treeFactorization.h" +#include "trfCommWrapper.h" +#endif + +int_t sDiagFactIBCast(int_t k, diagFactBufs_t *dFBuf, + factStat_t *factStat, + commRequests_t *comReqs, + gridinfo_t *grid, + superlu_dist_options_t *options, + double thresh, + LUstruct_t *LUstruct, + SuperLUStat_t *stat, int *info, + SCT_t *SCT, + int tag_ub + ) +{ + MPI_Request * U_diag_blk_recv_req = comReqs->U_diag_blk_recv_req; + MPI_Request * L_diag_blk_recv_req = comReqs->L_diag_blk_recv_req; + MPI_Request * U_diag_blk_send_req = comReqs->U_diag_blk_send_req; + MPI_Request * L_diag_blk_send_req = comReqs->L_diag_blk_send_req; + int_t * IrecvPlcd_D = factStat->IrecvPlcd_D; + + double * BlockUFactor = dFBuf->BlockUFactor; + double * BlockLFactor = dFBuf->BlockLFactor; + dDiagFactIBCast(k, k, BlockUFactor, BlockLFactor, + IrecvPlcd_D, + U_diag_blk_recv_req, L_diag_blk_recv_req, + U_diag_blk_send_req, L_diag_blk_send_req, + grid, options, thresh, LUstruct, stat, info, SCT, tag_ub); + return 0; +} +int_t sLPanelUpdate( int_t k, diagFactBufs_t *dFBuf, + factStat_t *factStat, + commRequests_t *comReqs, + gridinfo_t *grid, + LUstruct_t *LUstruct, SCT_t *SCT) +{ + MPI_Request * U_diag_blk_recv_req = comReqs->U_diag_blk_recv_req; + int_t * IrecvPlcd_D = factStat->IrecvPlcd_D; + int_t * factored_L = factStat->factored_L; + double * BlockUFactor = dFBuf->BlockUFactor; + + dLPanelUpdate( k, IrecvPlcd_D, factored_L, + U_diag_blk_recv_req, BlockUFactor, grid, LUstruct, SCT); + return 0; +} + +int_t sUPanelUpdate( int_t k, + int_t ldt, + diagFactBufs_t *dFBuf, + factStat_t *factStat, + commRequests_t *comReqs, + scuBufs_t* scuBufs, + packLUInfo_t* packLUInfo, + gridinfo_t *grid, + LUstruct_t *LUstruct, + SuperLUStat_t *stat, SCT_t *SCT) +{ + double* bigV = scuBufs->bigV; + Ublock_info_t* Ublock_info = packLUInfo->Ublock_info; + + MPI_Request * L_diag_blk_recv_req = comReqs->L_diag_blk_recv_req; + + int_t * factored_U = factStat->factored_U; + + double * BlockLFactor = dFBuf->BlockLFactor; + dUPanelUpdate(k, factored_U, L_diag_blk_recv_req, BlockLFactor, bigV, ldt, + Ublock_info, grid, LUstruct, stat, SCT); + return 0; +} +int_t sIBcastRecvLPanel( + int_t k, + commRequests_t *comReqs, + LUValSubBuf_t* LUvsb, + msgs_t* msgs, + factStat_t *factStat, + gridinfo_t *grid, + LUstruct_t *LUstruct, SCT_t *SCT, int tag_ub) + +{ + int* msgcnt = msgs->msgcnt; + MPI_Request *send_req = comReqs->send_req; + MPI_Request *recv_req = comReqs->recv_req; + int_t * Lsub_buf = LUvsb->Lsub_buf; + double * Lval_buf = LUvsb->Lval_buf; + int_t* factored = factStat->factored; + dIBcastRecvLPanel(k, k, + msgcnt, send_req, recv_req, + Lsub_buf, Lval_buf, factored, grid, LUstruct, SCT, tag_ub); + return 0; +} + +int_t sIBcastRecvUPanel( + int_t k, + commRequests_t *comReqs, + LUValSubBuf_t* LUvsb, + msgs_t* msgs, + factStat_t *factStat, + gridinfo_t *grid, + LUstruct_t *LUstruct, SCT_t *SCT, int tag_ub) +{ + int* msgcnt = msgs->msgcnt; + MPI_Request *send_requ = comReqs->send_requ; + MPI_Request *recv_requ = comReqs->recv_requ; + int_t * Usub_buf = LUvsb->Usub_buf; + double * Uval_buf = LUvsb->Uval_buf; + dIBcastRecvUPanel(k, k, msgcnt, send_requ, recv_requ, Usub_buf, + Uval_buf, grid, LUstruct, SCT, tag_ub); + return 0; +} +int_t sWaitL(int_t k, + commRequests_t *comReqs, + msgs_t* msgs, + gridinfo_t *grid, + LUstruct_t *LUstruct, SCT_t *SCT) +{ + int* msgcnt = msgs->msgcnt; + int* msgcntU = msgs->msgcntU; + MPI_Request *send_req = comReqs->send_req; + MPI_Request *recv_req = comReqs->recv_req; + dWaitL(k, 
msgcnt, msgcntU, send_req, recv_req, grid, LUstruct, SCT); + return 0; +} +int_t sWaitU(int_t k, + commRequests_t *comReqs, + msgs_t* msgs, + gridinfo_t *grid, + LUstruct_t *LUstruct, SCT_t *SCT) +{ + int* msgcnt = msgs->msgcnt; + MPI_Request *send_requ = comReqs->send_requ; + MPI_Request *recv_requ = comReqs->recv_requ; + dWaitU(k, msgcnt, send_requ, recv_requ, grid, LUstruct, SCT); + return 0; +} +int_t sWait_LUDiagSend(int_t k, commRequests_t *comReqs, + gridinfo_t *grid, SCT_t *SCT) +{ + MPI_Request *U_diag_blk_send_req = comReqs->U_diag_blk_send_req; + MPI_Request *L_diag_blk_send_req = comReqs->L_diag_blk_send_req; + Wait_LUDiagSend(k, U_diag_blk_send_req, L_diag_blk_send_req, grid, SCT); + return 0; +} +int_t sSchurComplementSetup(int_t k, msgs_t* msgs, + packLUInfo_t* packLUInfo, + int_t* gIperm_c_supno, int_t*perm_c_supno, + factNodelists_t* fNlists, + scuBufs_t* scuBufs, LUValSubBuf_t* LUvsb, + gridinfo_t *grid, LUstruct_t *LUstruct) +{ + int_t * Lsub_buf = LUvsb->Lsub_buf; + double * Lval_buf = LUvsb->Lval_buf; + int_t * Usub_buf = LUvsb->Usub_buf; + double * Uval_buf = LUvsb->Uval_buf; + Ublock_info_t* Ublock_info = packLUInfo->Ublock_info; + Remain_info_t* Remain_info = packLUInfo->Remain_info; + uPanelInfo_t* uPanelInfo = packLUInfo->uPanelInfo; + lPanelInfo_t* lPanelInfo = packLUInfo->lPanelInfo; + int* msgcnt = msgs->msgcnt; + int_t* iperm_u = fNlists->iperm_u; + int_t* perm_u = fNlists->perm_u; + + double* bigU = scuBufs->bigU; + return dSchurComplementSetup(k, msgcnt, + Ublock_info, Remain_info, uPanelInfo, lPanelInfo, + gIperm_c_supno, iperm_u, perm_u, + bigU, Lsub_buf, Lval_buf, Usub_buf, Uval_buf, + grid, LUstruct); +} + +int_t initCommRequests(commRequests_t* comReqs, gridinfo_t * grid) +{ + int_t Pc = grid->npcol; + int_t Pr = grid->nprow; + // allocating MPI requests (for one) + comReqs->U_diag_blk_recv_req = MPI_REQ_ALLOC( 1 ); + comReqs->L_diag_blk_recv_req = MPI_REQ_ALLOC( 1 ); + comReqs->U_diag_blk_send_req = MPI_REQ_ALLOC( Pr ); + comReqs->L_diag_blk_send_req = MPI_REQ_ALLOC( Pc ); + comReqs->send_req = MPI_REQ_ALLOC(2 * Pc); + comReqs->recv_req = MPI_REQ_ALLOC(4); + comReqs->send_requ = MPI_REQ_ALLOC(2 * Pr); + comReqs->recv_requ = MPI_REQ_ALLOC(2); + return 0; +} + +commRequests_t** initCommRequestsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* grid) +{ + commRequests_t** comReqss; + comReqss = (commRequests_t** ) SUPERLU_MALLOC(mxLeafNode * sizeof(commRequests_t*)); + for (int i = 0; i < mxLeafNode; ++i) + { + /* code */ + comReqss[i] = (commRequests_t* ) SUPERLU_MALLOC(sizeof(commRequests_t)); + initCommRequests(comReqss[i], grid); + }/*Minor for loop -2 for (int i = 0; i < mxLeafNode; ++i)*/ + return comReqss; +} + +int_t initFactStat(int_t nsupers, factStat_t* factStat) +{ + factStat->IrecvPlcd_D = intMalloc_dist( nsupers); + factStat->factored_D = intMalloc_dist( nsupers); //INT_T_ALLOC( nsupers); + factStat->factored_L = intMalloc_dist( nsupers); //INT_T_ALLOC( nsupers); + factStat->factored_U = intMalloc_dist( nsupers); //INT_T_ALLOC( nsupers); + factStat->factored = intMalloc_dist( nsupers); //INT_T_ALLOC( nsupers); + factStat->IbcastPanel_L = intMalloc_dist(nsupers); //INT_T_ALLOC(nsupers); + factStat->IbcastPanel_U = intMalloc_dist(nsupers); //INT_T_ALLOC(nsupers); + factStat->gpuLUreduced = intMalloc_dist(nsupers); //INT_T_ALLOC(nsupers); + + for (int_t i = 0; i < nsupers; ++i) + { + /* code */ + factStat->IrecvPlcd_D[i] = 0; + factStat->factored_D[i] = 0; + factStat->factored_L[i] = 0; + factStat->factored_U[i] = 0; + factStat->IbcastPanel_L[i] = 0; 
+ factStat->IbcastPanel_U[i] = 0; + factStat->gpuLUreduced[i] = 0; + } + return 0; +} + +int_t initFactNodelists(int_t ldt, int_t num_threads, int_t nsupers, + factNodelists_t* fNlists) +{ + fNlists->iperm_u = INT_T_ALLOC(nsupers); + fNlists->perm_u = INT_T_ALLOC(nsupers); + fNlists->indirect = INT_T_ALLOC(num_threads * ldt); + fNlists->indirect2 = INT_T_ALLOC(num_threads * ldt); + return 0; +} + +int_t initMsgs(msgs_t* msgs) +{ + msgs->msgcnt = (int *) SUPERLU_MALLOC(4 * sizeof(int)); + msgs->msgcntU = (int *) SUPERLU_MALLOC(4 * sizeof(int)); + return 0; +} + +msgs_t** initMsgsArr(int_t numLA) +{ + msgs_t**msgss = (msgs_t**) SUPERLU_MALLOC(numLA * sizeof(msgs_t*)); + for (int_t i = 0; i < numLA; ++i) + { + /* code */ + msgss[i] = (msgs_t*) SUPERLU_MALLOC(sizeof(msgs_t)); + initMsgs(msgss[i]); + } /*minor for loop-3 for (int i = 0; i < numLA; ++i)*/ + return msgss; +} + +int_t initPackLUInfo(int_t nsupers, packLUInfo_t* packLUInfo) +{ + packLUInfo->Ublock_info = (Ublock_info_t*) SUPERLU_MALLOC (sizeof(Ublock_info_t) * nsupers); + packLUInfo->Remain_info = (Remain_info_t* ) SUPERLU_MALLOC(sizeof(Remain_info_t) * nsupers); + packLUInfo->uPanelInfo = (uPanelInfo_t* ) SUPERLU_MALLOC(sizeof(uPanelInfo_t)); + packLUInfo->lPanelInfo = (lPanelInfo_t*) SUPERLU_MALLOC(sizeof(lPanelInfo_t)); + return 0; +} + +int_t getNumLookAhead(superlu_dist_options_t *options) +{ + int_t numLA; + if (getenv("NLULA")) + { + numLA = atoi(getenv("NLULA")); + } + else + { + // printf("NLULA not set using default 2\n"); + // numLA = 2; + numLA = options->num_lookaheads; + } + return numLA; +} + +int_t checkRecvUDiag(int_t k, commRequests_t *comReqs, + gridinfo_t *grid, SCT_t *SCT) +{ + + MPI_Request * U_diag_blk_recv_req = comReqs->U_diag_blk_recv_req; + int_t iam = grid->iam; + + int_t mycol = MYCOL (iam, grid); + int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid); + + int_t kcol = PCOL (k, grid); + + if (mycol == kcol && iam != pkk) + { + int_t flag = Test_UDiagBlock_Recv( U_diag_blk_recv_req, SCT); + return flag; + } + + return 1; +} + +int_t sLPanelTrSolve( int_t k, diagFactBufs_t *dFBuf, + factStat_t *factStat, + commRequests_t *comReqs, + gridinfo_t *grid, + LUstruct_t *LUstruct, SCT_t *SCT) +{ + int_t * factored_L = factStat->factored_L; + double * BlockUFactor = dFBuf->BlockUFactor; + dLPanelTrSolve( k, factored_L, BlockUFactor, grid, LUstruct); + + return 0; +} + +int_t checkRecvLDiag(int_t k, + commRequests_t *comReqs, + gridinfo_t *grid, + SCT_t *SCT) +{ + MPI_Request * L_diag_blk_recv_req = comReqs->L_diag_blk_recv_req; + int_t iam = grid->iam; + int_t myrow = MYROW (iam, grid); + + int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid); + int_t krow = PROW (k, grid); + + /*factor the U panel*/ + if (myrow == krow && iam != pkk) + { + int_t flag = 0; + + flag = Test_LDiagBlock_Recv( L_diag_blk_recv_req , SCT); + + return flag; + } + return 1; +} + +int_t sUPanelTrSolve( int_t k, + int_t ldt, + diagFactBufs_t *dFBuf, + scuBufs_t* scuBufs, + packLUInfo_t* packLUInfo, + gridinfo_t *grid, + LUstruct_t *LUstruct, + SuperLUStat_t *stat, SCT_t *SCT) +{ + double* bigV = scuBufs->bigV; + Ublock_info_t* Ublock_info = packLUInfo->Ublock_info; + double * BlockLFactor = dFBuf->BlockLFactor; + + dUPanelTrSolve( k, BlockLFactor, bigV, ldt, Ublock_info, grid, LUstruct, stat, SCT); + return 0; +} + diff --git a/SRC/trfAux.c b/SRC/trfAux.c new file mode 100644 index 00000000..f9737f1c --- /dev/null +++ b/SRC/trfAux.c @@ -0,0 +1,1221 @@ +/*! 
\file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + +#include "superlu_ddefs.h" + +#if 0 +#include "pdgstrf3d.h" +#include "trfAux.h" +#endif + +int_t getslu25D_enabled() +{ + if ( getenv("SLU25D") != NULL) + { + return atoi(getenv("SLU25D")); + } + else + { + return 0; + } +} + +int_t getNsupers(int n, LUstruct_t *LUstruct) +{ + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + int_t nsupers = Glu_persist->supno[n - 1] + 1; + return nsupers; +} + +int set_tag_ub() +{ + void *attr_val; + int flag; + MPI_Comm_get_attr (MPI_COMM_WORLD, MPI_TAG_UB, &attr_val, &flag); + if (!flag) + { + fprintf (stderr, "Could not get TAG_UB\n"); + exit(-1); + } + return ( *(int_t *) attr_val ); +} + +int getNumThreads(int iam) +{ + int num_threads; + #pragma omp parallel default(shared) + { + #pragma omp master + { + num_threads = omp_get_num_threads (); + + } + } + + if (!iam) + { + printf(".. Starting with %d openMP threads \n", num_threads ); + + } + return num_threads; +} + + +#if 0 //**** Sherry: following two routines are old, the new ones are in util.c +int_t num_full_cols_U(int_t kk, int_t **Ufstnz_br_ptr, int_t *xsup, + gridinfo_t *grid, int_t *perm_u) +{ + int_t lk = LBi (kk, grid); + int_t *usub = Ufstnz_br_ptr[lk]; + + if (usub == NULL) + { + /* code */ + return 0; + } + int_t iukp = BR_HEADER; /* Skip header; Pointer to index[] of U(k,:) */ + int_t rukp = 0; /* Pointer to nzval[] of U(k,:) */ + int_t nub = usub[0]; /* Number of blocks in the block row U(k,:) */ + + int_t klst = FstBlockC (kk + 1); + int_t iukp0 = iukp; + int_t rukp0 = rukp; + int_t jb, ljb; + int_t nsupc; + int_t temp_ncols = 0; + int_t segsize; + + temp_ncols = 0; + + for (int_t j = 0; j < nub; ++j) + { + arrive_at_ublock( + j, &iukp, &rukp, &jb, &ljb, &nsupc, + iukp0, rukp0, usub, perm_u, xsup, grid + ); + + for (int_t jj = iukp; jj < iukp + nsupc; ++jj) + { + segsize = klst - usub[jj]; + if ( segsize ) ++temp_ncols; + } + } + return temp_ncols; +} + +// Sherry: this is old; new version is in util.c +int_t estimate_bigu_size( int_t nsupers, int_t ldt, int_t**Ufstnz_br_ptr, + Glu_persist_t *Glu_persist, gridinfo_t* grid, int_t* perm_u) +{ + + int_t iam = grid->iam; + + int_t Pr = grid->nprow; + int_t myrow = MYROW (iam, grid); + + int_t* xsup = Glu_persist->xsup; + + int ncols = 0; + int_t ldu = 0; + + /*initilize perm_u*/ + for (int i = 0; i < nsupers; ++i) + { + perm_u[i] = i; + } + + for (int lk = myrow; lk < nsupers; lk += Pr ) + { + ncols = SUPERLU_MAX(ncols, num_full_cols_U(lk, Ufstnz_br_ptr, + xsup, grid, perm_u, &ldu)); + } + + int_t max_ncols = 0; + + MPI_Allreduce(&ncols, &max_ncols, 1, mpi_int_t, MPI_MAX, grid->cscp.comm); + + printf("max_ncols =%d, bigu_size=%ld\n", (int) max_ncols, (long long) ldt * max_ncols); + return ldt * max_ncols; +} /* old estimate_bigu_size. 
New one is in util.c */ +#endif /**** end old ones ****/ + +int_t getBigUSize(int_t nsupers, gridinfo_t *grid, + LUstruct_t *LUstruct) +{ + + int_t Pr = grid->nprow; + int_t Pc = grid->npcol; + int_t iam = grid->iam; + int_t mycol = MYCOL (iam, grid); + + + /* Following circuit is for finding maximum block size */ + int local_max_row_size = 0; + int max_row_size; + + for (int_t i = 0; i < nsupers; ++i) + { + int_t tpc = PCOL (i, grid); + if (mycol == tpc) + { + int_t lk = LBj (i, grid); + int_t* lsub = LUstruct->Llu->Lrowind_bc_ptr[lk]; + if (lsub != NULL) + { + local_max_row_size = SUPERLU_MAX (local_max_row_size, lsub[1]); + } + } + + } + + /* Max row size is global reduction of within A row */ + MPI_Allreduce (&local_max_row_size, &max_row_size, 1, MPI_INT, MPI_MAX, + (grid->rscp.comm)); + + + // int_t Threads_per_process = get_thread_per_process (); + + /*Buffer size is max of of look ahead window*/ + + + int_t bigu_size = + 8 * sp_ienv_dist (3) * (max_row_size) * SUPERLU_MAX(Pr / Pc, 1); + + return bigu_size; +} + +int_t* getFactPerm(int_t nsupers) +{ + int_t* perm = INT_T_ALLOC(nsupers); + + for (int_t i = 0; i < nsupers; ++i) + { + /* code */ + perm[i] = i; + } + + return perm; +} + +int_t* getFactIperm(int_t* perm, int_t nsupers) +{ + int_t* iperm = INT_T_ALLOC(nsupers); + + for (int_t i = 0; i < nsupers; ++i) + { + /* code */ + iperm[perm[i]] = i; + } + + return iperm; +} + +int_t* getPerm_c_supno(int_t nsupers, + superlu_dist_options_t *options, + LUstruct_t *LUstruct, gridinfo_t *grid) + +{ + /*I do not understand the following code in detail, + I have just written a wrapper around it*/ + + int_t* perm_c_supno; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + LocalLU_t *Llu = LUstruct->Llu; + int_t* xsup = Glu_persist->xsup; + + int_t iam = grid->iam; + int_t Pc = grid->npcol; + int_t Pr = grid->nprow; + int_t myrow = MYROW (iam, grid); + int_t mycol = MYCOL (iam, grid); + + int_t *etree_supno_l, *etree_supno, *blocks, *blockr, *Ublock, *Urows, *Lblock, *Lrows, + *sf_block, *sf_block_l, *nnodes_l, *nnodes_u, *edag_supno_l, *recvbuf, + **edag_supno; + int_t i, ib, jb, + lb, + nlb, il, iu; + int ncb, nrb, p, pr, pc, nblocks; + int_t *index; + int nnodes, *sendcnts, *sdispls, *recvcnts, *rdispls, *srows, *rrows; + int_t j, k, krow, yourcol; + etree_node *head, *tail, *ptr; + int *num_child; + nblocks = 0; + ncb = nsupers / Pc; + nrb = nsupers / Pr; + /* ================================================== * + * static scheduling of j-th step of LU-factorization * + * ================================================== */ + if ( options->lookahead_etree == YES && /* use e-tree of symmetrized matrix, and */ + (options->ParSymbFact == NO || /* 1) symmetric fact with serial symbolic, or */ + (options->SymPattern == YES && /* 2) symmetric pattern, and */ + options->RowPerm == NOROWPERM)) ) /* no rowperm to destroy the symmetry */ + { + /* if symmetric pattern or using e-tree of |A^T|+|A|, + then we can use a simple tree structure for static schduling */ + + if ( options->ParSymbFact == NO ) + { + /* Use the etree computed from serial symb. fact., and turn it + into supernodal tree. 
*/ + int_t *etree = LUstruct->etree; +#if ( PRNTlevel>=1 ) + if ( grid->iam == 0 ) printf( " === using column e-tree ===\n" ); +#endif + + /* look for the first off-diagonal blocks */ + etree_supno = SUPERLU_MALLOC( nsupers * sizeof(int_t) ); + for ( i = 0; i < nsupers; i++ ) etree_supno[i] = nsupers; + for ( j = 0, lb = 0; lb < nsupers; lb++ ) + { + for ( k = 0; k < SuperSize(lb); k++ ) + { + jb = Glu_persist->supno[etree[j + k]]; + if ( jb != lb ) etree_supno[lb] = SUPERLU_MIN( etree_supno[lb], jb ); + } + j += SuperSize(lb); + } + } + else /* ParSymbFACT==YES and SymPattern==YES and RowPerm == NOROWPERM */ + { + /* Compute an "etree" based on struct(L), + assuming struct(U) = struct(L'). */ +#if ( PRNTlevel>=1 ) + if ( grid->iam == 0 ) printf( " === using supernodal e-tree ===\n" ); +#endif + + /* find the first block in each supernodal-column of local L-factor */ + etree_supno_l = SUPERLU_MALLOC( nsupers * sizeof(int_t) ); + for ( i = 0; i < nsupers; i++ ) etree_supno_l[i] = nsupers; + for ( lb = 0; lb < ncb; lb++ ) + { + jb = lb * grid->npcol + mycol; + index = Llu->Lrowind_bc_ptr[lb]; + if ( index ) /* Not an empty column */ + { + i = index[0]; + k = BC_HEADER; + krow = PROW( jb, grid ); + if ( krow == myrow ) /* skip the diagonal block */ + { + k += LB_DESCRIPTOR + index[k + 1]; + i--; + } + if ( i > 0 ) + { + etree_supno_l[jb] = index[k]; + k += LB_DESCRIPTOR + index[k + 1]; + i --; + } + + for ( j = 0; j < i; j++ ) + { + etree_supno_l[jb] = SUPERLU_MIN( etree_supno_l[jb], index[k] ); + k += LB_DESCRIPTOR + index[k + 1]; + } + } + } + if ( mycol < nsupers % grid->npcol ) + { + jb = ncb * grid->npcol + mycol; + index = Llu->Lrowind_bc_ptr[ncb]; + if ( index ) /* Not an empty column */ + { + i = index[0]; + k = BC_HEADER; + krow = PROW( jb, grid ); + if ( krow == myrow ) /* skip the diagonal block */ + { + k += LB_DESCRIPTOR + index[k + 1]; + i--; + } + if ( i > 0 ) + { + etree_supno_l[jb] = index[k]; + k += LB_DESCRIPTOR + index[k + 1]; + i --; + } + for ( j = 0; j < i; j++ ) + { + etree_supno_l[jb] = SUPERLU_MIN( etree_supno_l[jb], index[k] ); + k += LB_DESCRIPTOR + index[k + 1]; + } + } + } + + /* form global e-tree */ + etree_supno = SUPERLU_MALLOC( nsupers * sizeof(int_t) ); + MPI_Allreduce( etree_supno_l, etree_supno, nsupers, mpi_int_t, MPI_MIN, grid->comm ); + SUPERLU_FREE(etree_supno_l); + } + + /* initialize the num of child for each node */ + num_child = SUPERLU_MALLOC( nsupers * sizeof(int_t) ); + for ( i = 0; i < nsupers; i++ ) num_child[i] = 0; + for ( i = 0; i < nsupers; i++ ) if ( etree_supno[i] != nsupers ) num_child[etree_supno[i]] ++; + + /* push initial leaves to the fifo queue */ + nnodes = 0; + for ( i = 0; i < nsupers; i++ ) + { + if ( num_child[i] == 0 ) + { + ptr = SUPERLU_MALLOC( sizeof(etree_node) ); + ptr->id = i; + ptr->next = NULL; + /*printf( " == push leaf %d (%d) ==\n",i,nnodes );*/ + nnodes ++; + + if ( nnodes == 1 ) + { + head = ptr; + tail = ptr; + } + else + { + tail->next = ptr; + tail = ptr; + } + } + } + + /* process fifo queue, and compute the ordering */ + i = 0; + perm_c_supno = SUPERLU_MALLOC( nsupers * sizeof(int_t) ); + while ( nnodes > 0 ) + { + ptr = head; j = ptr->id; + head = ptr->next; + perm_c_supno[i] = j; + SUPERLU_FREE(ptr); + i++; nnodes --; + + if ( etree_supno[j] != nsupers ) + { + num_child[etree_supno[j]] --; + if ( num_child[etree_supno[j]] == 0 ) + { + nnodes ++; + + ptr = SUPERLU_MALLOC( sizeof(etree_node) ); + ptr->id = etree_supno[j]; + ptr->next = NULL; + + /*printf( "=== push %d ===\n",ptr->id );*/ + if ( nnodes == 1 
) + { + head = ptr; + tail = ptr; + } + else + { + tail->next = ptr; + tail = ptr; + } + } + } + /*printf( "\n" );*/ + } + SUPERLU_FREE(num_child); + SUPERLU_FREE(etree_supno); + + } + else /* Unsymmetric pattern */ + { + /* Need to process both L- and U-factors, use the symmetrically + pruned graph of L & U instead of tree (very naive implementation) */ + int nrbp1 = nrb + 1; + + /* allocate some workspace */ + if ( !(sendcnts = SUPERLU_MALLOC( (4 + 2 * nrbp1) * Pr * Pc * sizeof(int))) ) + ABORT("Malloc fails for sendcnts[]."); + sdispls = &sendcnts[Pr * Pc]; + recvcnts = &sdispls [Pr * Pc]; + rdispls = &recvcnts[Pr * Pc]; + srows = &rdispls [Pr * Pc]; + rrows = &srows [Pr * Pc * nrbp1]; + + myrow = MYROW( iam, grid ); +#if ( PRNTlevel>=1 ) + if ( grid->iam == 0 ) printf( " === using DAG ===\n" ); +#endif + + /* send supno block of local U-factor to a processor * + * who owns the corresponding block of L-factor */ + + /* srows : # of block to send to a processor from each supno row */ + /* sendcnts: total # of blocks to send to a processor */ + for (p = 0; p < Pr * Pc * nrbp1; p++) srows[p] = 0; + for (p = 0; p < Pr * Pc; p++ ) sendcnts[p] = 0; + + /* sending blocks of U-factors corresponding to L-factors */ + /* count the number of blocks to send */ + for (lb = 0; lb < nrb; ++lb) + { + jb = lb * Pr + myrow; + pc = jb % Pc; + index = Llu->Ufstnz_br_ptr[lb]; + + if ( index ) /* Not an empty row */ + { + k = BR_HEADER; + nblocks += index[0]; + for (j = 0; j < index[0]; ++j) + { + ib = index[k]; + pr = ib % Pr; + p = pr * Pc + pc; + sendcnts[p] ++; + srows[p * nrbp1 + lb] ++; + + k += UB_DESCRIPTOR + SuperSize( index[k] ); + } + } + } + if ( myrow < nsupers % grid->nprow ) + { + jb = nrb * Pr + myrow; + pc = jb % Pc; + index = Llu->Ufstnz_br_ptr[nrb]; + + if ( index ) /* Not an empty row */ + { + k = BR_HEADER; + nblocks += index[0]; + for (j = 0; j < index[0]; ++j) + { + ib = index[k]; + pr = ib % Pr; + p = pr * Pc + pc; + sendcnts[p] ++; + srows[p * nrbp1 + nrb] ++; + k += UB_DESCRIPTOR + SuperSize( index[k] ); + } + } + } + + /* insert blocks to send */ + sdispls[0] = 0; + for ( p = 1; p < Pr * Pc; p++ ) sdispls[p] = sdispls[p - 1] + sendcnts[p - 1]; + if ( !(blocks = intMalloc_dist( nblocks )) ) ABORT("Malloc fails for blocks[]."); + for (lb = 0; lb < nrb; ++lb) + { + jb = lb * Pr + myrow; + pc = jb % Pc; + index = Llu->Ufstnz_br_ptr[lb]; + + if ( index ) /* Not an empty row */ + { + k = BR_HEADER; + for (j = 0; j < index[0]; ++j) + { + ib = index[k]; + pr = ib % Pr; + p = pr * Pc + pc; + blocks[sdispls[p]] = ib; + sdispls[p] ++; + + k += UB_DESCRIPTOR + SuperSize( index[k] ); + } + } + } + if ( myrow < nsupers % grid->nprow ) + { + jb = nrb * Pr + myrow; + pc = jb % Pc; + index = Llu->Ufstnz_br_ptr[nrb]; + + if ( index ) /* Not an empty row */ + { + k = BR_HEADER; + for (j = 0; j < index[0]; ++j) + { + ib = index[k]; + pr = ib % Pr; + p = pr * Pc + pc; + blocks[sdispls[p]] = ib; + sdispls[p] ++; + + k += UB_DESCRIPTOR + SuperSize( index[k] ); + } + } + } + + /* communication */ + MPI_Alltoall( sendcnts, 1, MPI_INT, recvcnts, 1, MPI_INT, grid->comm ); + MPI_Alltoall( srows, nrbp1, MPI_INT, rrows, nrbp1, MPI_INT, grid->comm ); + + nblocks = recvcnts[0]; + rdispls[0] = sdispls[0] = 0; + for ( p = 1; p < Pr * Pc; p++ ) + { + rdispls[p] = rdispls[p - 1] + recvcnts[p - 1]; + sdispls[p] = sdispls[p - 1] + sendcnts[p - 1]; + nblocks += recvcnts[p]; + } + + if ( !(blockr = intMalloc_dist( nblocks )) ) ABORT("Malloc fails for blockr[]."); + MPI_Alltoallv( blocks, sendcnts, sdispls, mpi_int_t, 
blockr, recvcnts, rdispls, mpi_int_t, grid->comm ); + SUPERLU_FREE( blocks ); + + /* store the received U-blocks by rows */ + nlb = nsupers / Pc; + if ( !(Ublock = intMalloc_dist( nblocks )) ) ABORT("Malloc fails for Ublock[]."); + if ( !(Urows = intMalloc_dist( 1 + nlb )) ) ABORT("Malloc fails for Urows[]."); + k = 0; + for (jb = 0; jb < nlb; jb++ ) + { + j = jb * Pc + mycol; + pr = j % Pr; + lb = j / Pr; + Urows[jb] = 0; + + for ( pc = 0; pc < Pc; pc++ ) + { + p = pr * Pc + pc; /* the processor owning this block of U-factor */ + + for ( i = rdispls[p]; i < rdispls[p] + rrows[p * nrbp1 + lb]; i++) + { + Ublock[k] = blockr[i]; + k++; Urows[jb] ++; + } + rdispls[p] += rrows[p * nrbp1 + lb]; + } + /* sort by the column indices to make things easier for later on */ + +#ifdef ISORT + isort1( Urows[jb], &(Ublock[k - Urows[jb]]) ); +#else + qsort( &(Ublock[k - Urows[jb]]), (size_t)(Urows[jb]), sizeof(int_t), &superlu_sort_perm ); +#endif + } + if ( mycol < nsupers % grid->npcol ) + { + j = nlb * Pc + mycol; + pr = j % Pr; + lb = j / Pr; + Urows[nlb] = 0; + + for ( pc = 0; pc < Pc; pc++ ) + { + p = pr * Pc + pc; + for ( i = rdispls[p]; i < rdispls[p] + rrows[p * nrbp1 + lb]; i++) + { + Ublock[k] = blockr[i]; + k++; Urows[nlb] ++; + } + rdispls[p] += rrows[p * nrb + lb]; + } +#ifdef ISORT + isort1( Urows[nlb], &(Ublock[k - Urows[nlb]]) ); +#else + qsort( &(Ublock[k - Urows[nlb]]), (size_t)(Urows[nlb]), sizeof(int_t), &superlu_sort_perm ); +#endif + } + SUPERLU_FREE( blockr ); + + /* sort the block in L-factor */ + nblocks = 0; + for ( lb = 0; lb < ncb; lb++ ) + { + jb = lb * Pc + mycol; + index = Llu->Lrowind_bc_ptr[lb]; + if ( index ) /* Not an empty column */ + { + nblocks += index[0]; + } + } + if ( mycol < nsupers % grid->npcol ) + { + jb = ncb * Pc + mycol; + index = Llu->Lrowind_bc_ptr[ncb]; + if ( index ) /* Not an empty column */ + { + nblocks += index[0]; + } + } + + if ( !(Lblock = intMalloc_dist( nblocks )) ) ABORT("Malloc fails for Lblock[]."); + if ( !(Lrows = intMalloc_dist( 1 + ncb )) ) ABORT("Malloc fails for Lrows[]."); + for ( lb = 0; lb <= ncb; lb++ ) Lrows[lb] = 0; + nblocks = 0; + for ( lb = 0; lb < ncb; lb++ ) + { + Lrows[lb] = 0; + + jb = lb * Pc + mycol; + index = Llu->Lrowind_bc_ptr[lb]; + if ( index ) /* Not an empty column */ + { + i = index[0]; + k = BC_HEADER; + krow = PROW( jb, grid ); + if ( krow == myrow ) /* skip the diagonal block */ + { + k += LB_DESCRIPTOR + index[k + 1]; + i--; + } + + for ( j = 0; j < i; j++ ) + { + Lblock[nblocks] = index[k]; + Lrows[lb] ++; + nblocks++; + + k += LB_DESCRIPTOR + index[k + 1]; + } + } +#ifdef ISORT + isort1( Lrows[lb], &(Lblock[nblocks - Lrows[lb]]) ); +#else + qsort( &(Lblock[nblocks - Lrows[lb]]), (size_t)(Lrows[lb]), sizeof(int_t), &superlu_sort_perm ); +#endif + } + if ( mycol < nsupers % grid->npcol ) + { + Lrows[ncb] = 0; + jb = ncb * Pc + mycol; + index = Llu->Lrowind_bc_ptr[ncb]; + if ( index ) /* Not an empty column */ + { + i = index[0]; + k = BC_HEADER; + krow = PROW( jb, grid ); + if ( krow == myrow ) /* skip the diagonal block */ + { + k += LB_DESCRIPTOR + index[k + 1]; + i--; + } + for ( j = 0; j < i; j++ ) + { + Lblock[nblocks] = index[k]; + Lrows[ncb] ++; + nblocks++; + k += LB_DESCRIPTOR + index[k + 1]; + } +#ifdef ISORT + isort1( Lrows[ncb], &(Lblock[nblocks - Lrows[ncb]]) ); +#else + qsort( &(Lblock[nblocks - Lrows[ncb]]), (size_t)(Lrows[ncb]), sizeof(int_t), &superlu_sort_perm ); +#endif + } + } + + /* look for the first local symmetric nonzero block match */ + if ( !(sf_block = intMalloc_dist( nsupers )) ) 
+ ABORT("Malloc fails for sf_block[]."); + if ( !(sf_block_l = intMalloc_dist( nsupers )) ) + ABORT("Malloc fails for sf_block_l[]."); + for ( lb = 0; lb < nsupers; lb++ ) sf_block_l[lb] = nsupers; + i = 0; j = 0; + for ( jb = 0; jb < nlb; jb++ ) + { + if ( Urows[jb] > 0 ) + { + ib = i + Urows[jb]; + lb = jb * Pc + mycol; + for ( k = 0; k < Lrows[jb]; k++ ) + { + while ( Ublock[i] < Lblock[j] && i + 1 < ib ) i++; + + if ( Ublock[i] == Lblock[j] ) + { + sf_block_l[lb] = Lblock[j]; + j += (Lrows[jb] - k); + k = Lrows[jb]; + } + else + { + j++; + } + } + i = ib; + } + else + { + j += Lrows[jb]; + } + } + if ( mycol < nsupers % grid->npcol ) + { + if ( Urows[nlb] > 0 ) + { + ib = i + Urows[nlb]; + lb = nlb * Pc + mycol; + for ( k = 0; k < Lrows[nlb]; k++ ) + { + while ( Ublock[i] < Lblock[j] && i + 1 < ib ) i++; + + if ( Ublock[i] == Lblock[j] ) + { + sf_block_l[lb] = Lblock[j]; + j += (Lrows[nlb] - k); + k = Lrows[nlb]; + } + else + { + j++; + } + } + i = ib; + } + else + { + j += Lrows[nlb]; + } + } + /* compute the first global symmetric matchs */ + MPI_Allreduce( sf_block_l, sf_block, nsupers, mpi_int_t, MPI_MIN, grid->comm ); + SUPERLU_FREE( sf_block_l ); + + /* count number of nodes in DAG (i.e., the number of blocks on and above the first match) */ + if ( !(nnodes_l = intMalloc_dist( nsupers )) ) + ABORT("Malloc fails for nnodes_l[]."); + if ( !(nnodes_u = intMalloc_dist( nsupers )) ) + ABORT("Malloc fails for nnodes_u[]."); + for ( lb = 0; lb < nsupers; lb++ ) nnodes_l[lb] = 0; + for ( lb = 0; lb < nsupers; lb++ ) nnodes_u[lb] = 0; + + nblocks = 0; + /* from U-factor */ + for (i = 0, jb = 0; jb < nlb; jb++ ) + { + lb = jb * Pc + mycol; + ib = i + Urows[jb]; + while ( i < ib ) + { + if ( Ublock[i] <= sf_block[lb] ) + { + nnodes_u[lb] ++; + i++; nblocks++; + } + else /* get out*/ + { + i = ib; + } + } + i = ib; + } + if ( mycol < nsupers % grid->npcol ) + { + lb = nlb * Pc + mycol; + ib = i + Urows[nlb]; + while ( i < ib ) + { + if ( Ublock[i] <= sf_block[lb] ) + { + nnodes_u[lb] ++; + i++; nblocks++; + } + else /* get out*/ + { + i = ib; + } + } + i = ib; + } + + /* from L-factor */ + for (i = 0, jb = 0; jb < nlb; jb++ ) + { + lb = jb * Pc + mycol; + ib = i + Lrows[jb]; + while ( i < ib ) + { + if ( Lblock[i] < sf_block[lb] ) + { + nnodes_l[lb] ++; + i++; nblocks++; + } + else + { + i = ib; + } + } + i = ib; + } + if ( mycol < nsupers % grid->npcol ) + { + lb = nlb * Pc + mycol; + ib = i + Lrows[nlb]; + while ( i < ib ) + { + if ( Lblock[i] < sf_block[lb] ) + { + nnodes_l[lb] ++; + i++; nblocks++; + } + else + { + i = ib; + } + } + i = ib; + } + +#ifdef USE_ALLGATHER + /* insert local nodes in DAG */ + if ( !(edag_supno_l = intMalloc_dist( nsupers + nblocks )) ) + ABORT("Malloc fails for edag_supno_l[]."); + iu = il = nblocks = 0; + for ( lb = 0; lb < nsupers; lb++ ) + { + j = lb / Pc; + pc = lb % Pc; + + edag_supno_l[nblocks] = nnodes_l[lb] + nnodes_u[lb]; nblocks ++; + if ( mycol == pc ) + { + /* from U-factor */ + ib = iu + Urows[j]; + for ( jb = 0; jb < nnodes_u[lb]; jb++ ) + { + edag_supno_l[nblocks] = Ublock[iu]; + iu++; nblocks++; + } + iu = ib; + + /* from L-factor */ + ib = il + Lrows[j]; + for ( jb = 0; jb < nnodes_l[lb]; jb++ ) + { + edag_supno_l[nblocks] = Lblock[il]; + il++; nblocks++; + } + il = ib; + } + } + SUPERLU_FREE( nnodes_u ); + + /* form global DAG on each processor */ + MPI_Allgather( &nblocks, 1, MPI_INT, recvcnts, 1, MPI_INT, grid->comm ); + nblocks = recvcnts[0]; + rdispls[0] = 0; + for ( lb = 1; lb < Pc * Pr; lb++ ) + { + rdispls[lb] = nblocks; + nblocks += 
recvcnts[lb]; + } + if ( !(recvbuf = intMalloc_dist( nblocks )) ) + ABORT("Malloc fails for recvbuf[]."); + MPI_Allgatherv( edag_supno_l, recvcnts[iam], mpi_int_t, + recvbuf, recvcnts, rdispls, mpi_int_t, grid->comm ); + SUPERLU_FREE(edag_supno_l); + + if ( !(edag_supno = SUPERLU_MALLOC( nsupers * sizeof(int_t*) )) ) + ABORT("Malloc fails for edag_supno[]."); + k = 0; + for ( lb = 0; lb < nsupers; lb++ ) nnodes_l[lb] = 0; + for ( p = 0; p < Pc * Pr; p++ ) + { + for ( lb = 0; lb < nsupers; lb++ ) + { + nnodes_l[lb] += recvbuf[k]; + k += (1 + recvbuf[k]); + } + } + for ( lb = 0; lb < nsupers; lb++ ) + { + if ( nnodes_l[lb] > 0 ) + if ( !(edag_supno[lb] = intMalloc_dist( nnodes_l[lb] )) ) + ABORT("Malloc fails for edag_supno[lb]."); + nnodes_l[lb] = 0; + } + k = 0; + for ( p = 0; p < Pc * Pr; p++ ) + { + for ( lb = 0; lb < nsupers; lb++ ) + { + jb = k + recvbuf[k] + 1; + k ++; + for ( ; k < jb; k++ ) + { + edag_supno[lb][nnodes_l[lb]] = recvbuf[k]; + nnodes_l[lb] ++; + } + } + } + SUPERLU_FREE(recvbuf); +#else + int nlsupers = nsupers / Pc; + if ( mycol < nsupers % Pc ) nlsupers ++; + + /* insert local nodes in DAG */ + if ( !(edag_supno_l = intMalloc_dist( nlsupers + nblocks )) ) + ABORT("Malloc fails for edag_supno_l[]."); + iu = il = nblocks = 0; + for ( lb = 0; lb < nsupers; lb++ ) + { + j = lb / Pc; + pc = lb % Pc; + if ( mycol == pc ) + { + edag_supno_l[nblocks] = nnodes_l[lb] + nnodes_u[lb]; nblocks ++; + /* from U-factor */ + ib = iu + Urows[j]; + for ( jb = 0; jb < nnodes_u[lb]; jb++ ) + { + edag_supno_l[nblocks] = Ublock[iu]; + iu++; nblocks++; + } + iu = ib; + + /* from L-factor */ + ib = il + Lrows[j]; + for ( jb = 0; jb < nnodes_l[lb]; jb++ ) + { + edag_supno_l[nblocks] = Lblock[il]; + il++; nblocks++; + } + il = ib; + } + else if ( nnodes_l[lb] + nnodes_u[lb] != 0 ) + printf( " # %d: nnodes[%d]=%d+%d\n", grid->iam, + (int) lb, (int) nnodes_l[lb], (int) nnodes_u[lb] ); + } + SUPERLU_FREE( nnodes_u ); + /* form global DAG on each processor */ + MPI_Allgather( &nblocks, 1, MPI_INT, recvcnts, 1, MPI_INT, grid->comm ); + nblocks = recvcnts[0]; + rdispls[0] = 0; + for ( lb = 1; lb < Pc * Pr; lb++ ) + { + rdispls[lb] = nblocks; + nblocks += recvcnts[lb]; + } + if ( !(recvbuf = intMalloc_dist( nblocks )) ) + ABORT("Malloc fails for recvbuf[]."); + + MPI_Allgatherv( edag_supno_l, recvcnts[iam], mpi_int_t, + recvbuf, recvcnts, rdispls, mpi_int_t, grid->comm ); + SUPERLU_FREE(edag_supno_l); + + if ( !(edag_supno = SUPERLU_MALLOC( nsupers * sizeof(int_t*) )) ) + ABORT("Malloc fails for edag_supno[]."); + k = 0; + for ( lb = 0; lb < nsupers; lb++ ) nnodes_l[lb] = 0; + for ( p = 0; p < Pc * Pr; p++ ) + { + yourcol = MYCOL( p, grid ); + + for ( lb = 0; lb < nsupers; lb++ ) + { + j = lb / Pc; + pc = lb % Pc; + if ( yourcol == pc ) + { + nnodes_l[lb] += recvbuf[k]; + k += (1 + recvbuf[k]); + } + } + } + for ( lb = 0; lb < nsupers; lb++ ) + { + if ( nnodes_l[lb] > 0 ) + if ( !(edag_supno[lb] = intMalloc_dist( nnodes_l[lb] )) ) + ABORT("Malloc fails for edag_supno[lb]."); + nnodes_l[lb] = 0; + } + k = 0; + for ( p = 0; p < Pc * Pr; p++ ) + { + yourcol = MYCOL( p, grid ); + + for ( lb = 0; lb < nsupers; lb++ ) + { + j = lb / Pc; + pc = lb % Pc; + if ( yourcol == pc ) + { + jb = k + recvbuf[k] + 1; + k ++; + for ( ; k < jb; k++ ) + { + edag_supno[lb][nnodes_l[lb]] = recvbuf[k]; + nnodes_l[lb] ++; + } + } + } + } + SUPERLU_FREE(recvbuf); +#endif + + /* initialize the num of child for each node */ + num_child = SUPERLU_MALLOC( nsupers * sizeof(int_t) ); + for ( i = 0; i < nsupers; i++ ) num_child[i] = 
0; + for ( i = 0; i < nsupers; i++ ) + { + for ( jb = 0; jb < nnodes_l[i]; jb++ ) + { + num_child[edag_supno[i][jb]]++; + } + } + + /* push initial leaves to the fifo queue */ + nnodes = 0; + for ( i = 0; i < nsupers; i++ ) + { + if ( num_child[i] == 0 ) + { + ptr = SUPERLU_MALLOC( sizeof(etree_node) ); + ptr->id = i; + ptr->next = NULL; + /*printf( " == push leaf %d (%d) ==\n",i,nnodes );*/ + nnodes ++; + + if ( nnodes == 1 ) + { + head = ptr; + tail = ptr; + } + else + { + tail->next = ptr; + tail = ptr; + } + } + } + + /* process fifo queue, and compute the ordering */ + i = 0; + perm_c_supno = SUPERLU_MALLOC( nsupers * sizeof(int_t) ); + while ( nnodes > 0 ) + { + + /*printf( "=== pop %d (%d) ===\n",head->id,i );*/ + ptr = head; j = ptr->id; + head = ptr->next; + + perm_c_supno[i] = j; + SUPERLU_FREE(ptr); + i++; nnodes --; + + for ( jb = 0; jb < nnodes_l[j]; jb++ ) + { + num_child[edag_supno[j][jb]]--; + if ( num_child[edag_supno[j][jb]] == 0 ) + { + nnodes ++; + + ptr = SUPERLU_MALLOC( sizeof(etree_node) ); + ptr->id = edag_supno[j][jb]; + ptr->next = NULL; + + /*printf( "=== push %d ===\n",ptr->id );*/ + if ( nnodes == 1 ) + { + head = ptr; + tail = ptr; + } + else + { + tail->next = ptr; + tail = ptr; + } + } + } + /*printf( "\n" );*/ + } + SUPERLU_FREE(num_child); + + for ( lb = 0; lb < nsupers; lb++ ) if ( nnodes_l[lb] > 0 ) SUPERLU_FREE(edag_supno[lb] ); + SUPERLU_FREE(edag_supno); + SUPERLU_FREE(nnodes_l); + SUPERLU_FREE(sendcnts); + SUPERLU_FREE(sf_block); + SUPERLU_FREE(Ublock); + SUPERLU_FREE(Urows); + SUPERLU_FREE(Lblock); + SUPERLU_FREE(Lrows); + } + /* ======================== * + * end of static scheduling * + * ======================== */ + + return perm_c_supno; +} /* getPerm_c_supno */ + + +void getSCUweight(int_t nsupers, treeList_t* treeList, + LUstruct_t *LUstruct, gridinfo3d_t * grid3d + ) +{ + gridinfo_t* grid = &(grid3d->grid2d); + int_t** Lrowind_bc_ptr = LUstruct->Llu->Lrowind_bc_ptr; + int_t** Ufstnz_br_ptr = LUstruct->Llu->Ufstnz_br_ptr; + int_t* xsup = LUstruct->Glu_persist->xsup; + + int_t * perm_u = INT_T_ALLOC(nsupers); + int_t * mylsize = INT_T_ALLOC(nsupers); + int_t * myusize = INT_T_ALLOC(nsupers); + // int_t * maxlsize = INT_T_ALLOC(nsupers); + // int_t * maxusize = INT_T_ALLOC(nsupers); + int ldu; + + for (int i = 0; i < nsupers; ++i) + { + perm_u[i] = i; + mylsize[i] = 0; + myusize[i] = 0; + } + + for (int_t k = 0; k < nsupers ; ++k) + { + treeList[k].scuWeight = 0.0; + int_t iam = grid->iam; + int_t myrow = MYROW (iam, grid); + int_t mycol = MYCOL (iam, grid); + // int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid); + int_t krow = PROW (k, grid); + int_t kcol = PCOL (k, grid); + int_t ldu; + + if (myrow == krow) + { + /* code */ + myusize[k] = num_full_cols_U(k, Ufstnz_br_ptr, xsup, grid, + perm_u, &ldu); + } + + if (mycol == kcol) + { + /* code */ + int_t lk = LBj( k, grid ); /* Local block number */ + int_t *lsub; + // double* lnzval; + lsub = Lrowind_bc_ptr[lk]; + if (lsub) + { + /* code */ + mylsize[k] = lsub[1]; + } + } + } + + // int_t maxlsize = 0; + MPI_Allreduce( MPI_IN_PLACE, mylsize, nsupers, mpi_int_t, MPI_MAX, grid->comm ); + // int_t maxusize = 0; + MPI_Allreduce( MPI_IN_PLACE, myusize, nsupers, mpi_int_t, MPI_MAX, grid->comm ); + + for (int_t k = 0; k < nsupers ; ++k) + { + + treeList[k].scuWeight = 0.0; + int_t ksupc = SuperSize(k); + treeList[k].scuWeight = 1.0 * ksupc * mylsize[k] * myusize[k]; + } + + SUPERLU_FREE(mylsize); + SUPERLU_FREE(myusize); + SUPERLU_FREE(perm_u); + +} /* getSCUweight */ diff --git a/SRC/util.c 
b/SRC/util.c
index 5fa742bd..3040891a 100644
--- a/SRC/util.c
+++ b/SRC/util.c
@@ -1451,3 +1451,97 @@ int_t partitionM( int_t* a, int_t l, int_t r, int_t lda, int_t dir, int_t dims)
 }
 }
+
+/*
+ * The following are from 3D code p3dcomm.c
+ */
+
+int_t AllocGlu(int_t n, int_t nsupers, LUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+{
+    /*allocating the Glu_persist arrays (xsup, supno)*/
+    LUstruct->Glu_persist->xsup = intMalloc_dist(nsupers+1); //INT_T_ALLOC(nsupers+1);
+    LUstruct->Glu_persist->supno = intMalloc_dist(n); //INT_T_ALLOC(n);
+    return 0;
+}
+
+int_t** getTreePerm( int_t* myTreeIdxs, int_t* myZeroTrIdxs,
+                     int_t* nodeCount, int_t** nodeList,
+                     int_t* perm_c_supno, int_t* iperm_c_supno,
+                     gridinfo3d_t* grid3d)
+{
+    int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
+
+    int_t** treePerm = SUPERLU_MALLOC(sizeof(int_t*)*maxLvl);
+    for (int_t lvl = 0; lvl < maxLvl; lvl++)
+    {
+        // treePerm[lvl] = NULL;
+        int_t treeId = myTreeIdxs[lvl];
+        treePerm[lvl] = getPermNodeList(nodeCount[treeId], nodeList[treeId],
+                                        perm_c_supno, iperm_c_supno);
+
+    }
+    return treePerm;
+}
+
+int_t* getMyNodeCounts(int_t maxLvl, int_t* myTreeIdxs, int_t* gNodeCount)
+{
+    int_t* myNodeCount = INT_T_ALLOC(maxLvl);
+    for (int i = 0; i < maxLvl; ++i)
+    {
+        myNodeCount[i] = gNodeCount[myTreeIdxs[i]];
+    }
+    return myNodeCount;
+}
+
+/*check that a vector vec of length len agrees across the process grids*/
+int_t checkIntVector3d(int_t* vec, int_t len, gridinfo3d_t* grid3d)
+{
+    int_t nP = grid3d->zscp.Np;
+    int_t myGrid = grid3d->zscp.Iam;
+    int_t * buf = intMalloc_dist(len);
+
+    if (!myGrid) {
+        for (int_t p = 1; p < nP; ++p)
+        {
+            MPI_Status status;
+            MPI_Recv(buf, len, mpi_int_t, p, p, grid3d->zscp.comm, &status);
+
+            for (int_t i = 0; i < len ; ++i) {
+                /* compare the received copy against the local one */
+                if (buf[i] != vec[i]) {
+                    /* code */
+                    printf("Error occurred at (%d) Loc %d \n", (int) p, (int) i);
+                    exit(-1);
+                }
+            }
+        }
+    }
+    else
+    {
+        MPI_Send(vec, len, mpi_int_t, 0, myGrid, grid3d->zscp.comm);
+    }
+
+    return 0;
+}
+
+/**
+ * reduce the stats from the two process grids before printing them out
+ * See the definition of enum PhaseType in superlu_enum_const.h
+ */
+int_t reduceStat(PhaseType PHASE,
+                 SuperLUStat_t *stat, gridinfo3d_t * grid3d)
+{
+    flops_t *ops = stat->ops;
+
+    flops_t flopcnt;
+    MPI_Reduce(&ops[PHASE], &flopcnt, 1, MPI_FLOAT, MPI_SUM, 0, grid3d->zscp.comm);
+
+    if (!grid3d->zscp.Iam)
+    {
+        ops[PHASE] = flopcnt;
+    }
+
+    return 0;
+}
+
+/*---- end from 3D code p3dcomm.c ----*/
diff --git a/SRC/zcommunication_aux.c b/SRC/zcommunication_aux.c
new file mode 100644
index 00000000..0e3961c7
--- /dev/null
+++ b/SRC/zcommunication_aux.c
@@ -0,0 +1,480 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+#include "superlu_zdefs.h"
+#if 0
+#include "sec_structs.h"
+#include "communication_aux.h"
+#include "compiler.h"
+#endif
+
+int_t zIBcast_LPanel
+/*broadcasts index array lsub and non-zero value
+   array lusup of a newly factored L column to my process row*/
+(int_t k, int_t k0, int_t* lsub, doublecomplex* lusup, gridinfo_t *grid,
+ int* msgcnt, MPI_Request *send_req, int_t **ToSendR, int_t *xsup,
+ int tag_ub)
+{
+    int_t Pc = grid->npcol;
+    int_t lk = LBj (k, grid);
+    superlu_scope_t *scp = &grid->rscp; /* The scope of process row.
*/ + if (lsub) + { + msgcnt[0] = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR; + msgcnt[1] = lsub[1] * SuperSize (k); + } + else + { + msgcnt[0] = msgcnt[1] = 0; + } + + for (int_t pj = 0; pj < Pc; ++pj) + { + if (ToSendR[lk][pj] != EMPTY) + { + + + MPI_Isend (lsub, msgcnt[0], mpi_int_t, pj, + SLU_MPI_TAG (0, k0) /* 0 */ , + scp->comm, &send_req[pj]); + MPI_Isend (lusup, msgcnt[1], SuperLU_MPI_DOUBLE_COMPLEX, pj, + SLU_MPI_TAG (1, k0) /* 1 */ , + scp->comm, &send_req[pj + Pc]); + + } + } + + return 0; +} + + +int_t zBcast_LPanel +/*broadcasts index array lsub and non-zero value + array lusup of a newly factored L column to my process row*/ +(int_t k, int_t k0, int_t* lsub, doublecomplex* lusup, gridinfo_t *grid, + int* msgcnt, int_t **ToSendR, int_t *xsup , SCT_t* SCT, + int tag_ub) +{ + unsigned long long t1 = _rdtsc(); + int_t Pc = grid->npcol; + int_t lk = LBj (k, grid); + superlu_scope_t *scp = &grid->rscp; /* The scope of process row. */ + if (lsub) + { + msgcnt[0] = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR; + msgcnt[1] = lsub[1] * SuperSize (k); + } + else + { + msgcnt[0] = msgcnt[1] = 0; + } + + for (int_t pj = 0; pj < Pc; ++pj) + { + if (ToSendR[lk][pj] != EMPTY) + { + + + MPI_Send (lsub, msgcnt[0], mpi_int_t, pj, + SLU_MPI_TAG (0, k0) /* 0 */ , + scp->comm); + MPI_Send (lusup, msgcnt[1], SuperLU_MPI_DOUBLE_COMPLEX, pj, + SLU_MPI_TAG (1, k0) /* 1 */ , + scp->comm); + + } + } + SCT->Bcast_UPanel_tl += (double) ( _rdtsc() - t1); + return 0; +} + + + +int_t zIBcast_UPanel +/*asynchronously braodcasts U panel to my process row */ +(int_t k, int_t k0, int_t* usub, doublecomplex* uval, gridinfo_t *grid, + int* msgcnt, MPI_Request *send_req_u, int_t *ToSendD, int tag_ub ) +{ + + int_t iam = grid->iam; + int_t lk = LBi (k, grid); + int_t Pr = grid->nprow; + int_t myrow = MYROW (iam, grid); + superlu_scope_t *scp = &grid->cscp; /* The scope of process col. */ + if (usub) + { + msgcnt[2] = usub[2]; + msgcnt[3] = usub[1]; + } + else + { + msgcnt[2] = msgcnt[3] = 0; + } + + if (ToSendD[lk] == YES) + { + for (int_t pi = 0; pi < Pr; ++pi) + { + if (pi != myrow) + { + + MPI_Isend (usub, msgcnt[2], mpi_int_t, pi, + SLU_MPI_TAG (2, k0) /* (4*k0+2)%tag_ub */ , + scp->comm, + &send_req_u[pi]); + MPI_Isend (uval, msgcnt[3], SuperLU_MPI_DOUBLE_COMPLEX, + pi, SLU_MPI_TAG (3, k0) /* (4*kk0+3)%tag_ub */ , + scp->comm, + &send_req_u[pi + Pr]); + + } /* if pi ... */ + } /* for pi ... */ + } /* if ToSendD ... */ + return 0; +} + +/*Synchronously braodcasts U panel to my process row */ +int_t zBcast_UPanel(int_t k, int_t k0, int_t* usub, + doublecomplex* uval, gridinfo_t *grid, + int* msgcnt, int_t *ToSendD, SCT_t* SCT, int tag_ub) + +{ + unsigned long long t1 = _rdtsc(); + int_t iam = grid->iam; + int_t lk = LBi (k, grid); + int_t Pr = grid->nprow; + int_t myrow = MYROW (iam, grid); + superlu_scope_t *scp = &grid->cscp; /* The scope of process col. */ + if (usub) + { + msgcnt[2] = usub[2]; + msgcnt[3] = usub[1]; + } + else + { + msgcnt[2] = msgcnt[3] = 0; + } + + if (ToSendD[lk] == YES) + { + for (int_t pi = 0; pi < Pr; ++pi) + { + if (pi != myrow) + { + MPI_Send (usub, msgcnt[2], mpi_int_t, pi, + SLU_MPI_TAG (2, k0) /* (4*k0+2)%tag_ub */ , + scp->comm); + MPI_Send (uval, msgcnt[3], SuperLU_MPI_DOUBLE_COMPLEX, pi, + SLU_MPI_TAG (3, k0) /* (4*k0+3)%tag_ub */ , + scp->comm); + + } /* if pi ... */ + } /* for pi ... 
*/ + } + SCT->Bcast_UPanel_tl += (double) ( _rdtsc() - t1); + return 0; +} + +int_t zIrecv_LPanel +/*it places Irecv call for L panel*/ +(int_t k, int_t k0, int_t* Lsub_buf, doublecomplex* Lval_buf, + gridinfo_t *grid, MPI_Request *recv_req, LocalLU_t *Llu, int tag_ub ) +{ + int_t kcol = PCOL (k, grid); + + superlu_scope_t *scp = &grid->rscp; /* The scope of process row. */ + MPI_Irecv (Lsub_buf, Llu->bufmax[0], mpi_int_t, kcol, + SLU_MPI_TAG (0, k0) /* 0 */ , + scp->comm, &recv_req[0]); + MPI_Irecv (Lval_buf, Llu->bufmax[1], SuperLU_MPI_DOUBLE_COMPLEX, kcol, + SLU_MPI_TAG (1, k0) /* 1 */ , + scp->comm, &recv_req[1]); + return 0; +} + + +int_t zIrecv_UPanel +/*it places Irecv calls to receive U panels*/ +(int_t k, int_t k0, int_t* Usub_buf, doublecomplex* Uval_buf, LocalLU_t *Llu, + gridinfo_t* grid, MPI_Request *recv_req_u, int tag_ub ) +{ + int_t krow = PROW (k, grid); + superlu_scope_t *scp = &grid->cscp; /* The scope of process column. */ + MPI_Irecv (Usub_buf, Llu->bufmax[2], mpi_int_t, krow, + SLU_MPI_TAG (2, k0) /* (4*kk0+2)%tag_ub */ , + scp->comm, &recv_req_u[0]); + MPI_Irecv (Uval_buf, Llu->bufmax[3], SuperLU_MPI_DOUBLE_COMPLEX, krow, + SLU_MPI_TAG (3, k0) /* (4*kk0+3)%tag_ub */ , + scp->comm, &recv_req_u[1]); + + return 0; +} + +int_t zWait_URecv +( MPI_Request *recv_req, int* msgcnt, SCT_t* SCT) +{ + unsigned long long t1 = _rdtsc(); + MPI_Status status; + MPI_Wait (&recv_req[0], &status); + MPI_Get_count (&status, mpi_int_t, &msgcnt[2]); + MPI_Wait (&recv_req[1], &status); + MPI_Get_count (&status, SuperLU_MPI_DOUBLE_COMPLEX, &msgcnt[3]); + SCT->Wait_URecv_tl += (double) ( _rdtsc() - t1); + return 0; +} + +int_t zWait_LRecv +/*waits till L blocks have been received*/ +( MPI_Request* recv_req, int* msgcnt, int* msgcntsU, gridinfo_t * grid, SCT_t* SCT) +{ + unsigned long long t1 = _rdtsc(); + MPI_Status status; + + if (recv_req[0] != MPI_REQUEST_NULL) + { + MPI_Wait (&recv_req[0], &status); + MPI_Get_count (&status, mpi_int_t, &msgcnt[0]); + recv_req[0] = MPI_REQUEST_NULL; + } + else + { + msgcnt[0] = msgcntsU[0]; + } + + if (recv_req[1] != MPI_REQUEST_NULL) + { + MPI_Wait (&recv_req[1], &status); + MPI_Get_count (&status, SuperLU_MPI_DOUBLE_COMPLEX, &msgcnt[1]); + recv_req[1] = MPI_REQUEST_NULL; + } + else + { + msgcnt[1] = msgcntsU[1]; + } + SCT->Wait_LRecv_tl += (double) ( _rdtsc() - t1); + return 0; +} + + +int_t zISend_UDiagBlock(int_t k0, doublecomplex *ublk_ptr, /*pointer for the diagonal block*/ + int_t size, /*number of elements to be broadcasted*/ + MPI_Request *U_diag_blk_send_req, + gridinfo_t * grid, int tag_ub) +{ + int_t iam = grid->iam; + int_t Pr = grid->nprow; + int_t myrow = MYROW (iam, grid); + MPI_Comm comm = (grid->cscp).comm; + /** ALWAYS SEND TO ALL OTHERS - TO FIX **/ + for (int_t pr = 0; pr < Pr; ++pr) + { + if (pr != myrow) + { + /* tag = ((k0<<2)+2) % tag_ub; */ + /* tag = (4*(nsupers+k0)+2) % tag_ub; */ + MPI_Isend (ublk_ptr, size, SuperLU_MPI_DOUBLE_COMPLEX, pr, + SLU_MPI_TAG (4, k0) /* tag */ , + comm, U_diag_blk_send_req + pr); + } + } + + return 0; +} + + +int_t zRecv_UDiagBlock(int_t k0, doublecomplex *ublk_ptr, /*pointer for the diagonal block*/ + int_t size, /*number of elements to be broadcasted*/ + int_t src, + gridinfo_t * grid, SCT_t* SCT, int tag_ub) +{ + unsigned long long t1 = _rdtsc(); + MPI_Status status; + MPI_Comm comm = (grid->cscp).comm; + /* tag = ((k0<<2)+2) % tag_ub; */ + /* tag = (4*(nsupers+k0)+2) % tag_ub; */ + + MPI_Recv (ublk_ptr, size, SuperLU_MPI_DOUBLE_COMPLEX, src, + SLU_MPI_TAG (4, k0), comm, &status); + 
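+    /* charge the cycles spent blocking in this receive to the
+       diagonal-block receive timer (t1 was sampled with _rdtsc() on entry) */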
+int_t zPackLBlock(int_t k, doublecomplex* Dest, Glu_persist_t *Glu_persist,
+                  gridinfo_t *grid, LocalLU_t *Llu)
+/* Copies the diagonal block of the source panel into the Dest matrix. */
+{
+    /* Initialization. */
+    int_t *xsup = Glu_persist->xsup;
+    int_t lk = LBj (k, grid);  /* Local block number */
+    doublecomplex *lusup = Llu->Lnzval_bc_ptr[lk];
+    int_t nsupc = SuperSize (k);
+    int_t nsupr;
+    if (Llu->Lrowind_bc_ptr[lk])
+        nsupr = Llu->Lrowind_bc_ptr[lk][1];
+    else
+        nsupr = 0;
+#if 0
+    LAPACKE_dlacpy (LAPACK_COL_MAJOR, 'A', nsupc, nsupc, lusup, nsupr, Dest, nsupc);
+#else /* Sherry */
+    for (int j = 0; j < nsupc; ++j) {
+        memcpy( &Dest[j * nsupc], &lusup[j * nsupr], nsupc * sizeof(doublecomplex) );
+    }
+#endif
+
+    return 0;
+}
+
+int_t zISend_LDiagBlock(int_t k0, doublecomplex *lblk_ptr, /*pointer for the diagonal block*/
+                        int_t size, /*number of elements to be broadcasted*/
+                        MPI_Request *L_diag_blk_send_req,
+                        gridinfo_t * grid, int tag_ub)
+{
+    int_t iam = grid->iam;
+    int_t Pc = grid->npcol;
+    int_t mycol = MYCOL (iam, grid);
+    MPI_Comm comm = (grid->rscp).comm;  /*Row communicator*/
+    /** ALWAYS SEND TO ALL OTHERS - TO FIX **/
+    for (int_t pc = 0; pc < Pc; ++pc)
+    {
+        if (pc != mycol)
+        {
+            /* tag = ((k0<<2)+2) % tag_ub;        */
+            /* tag = (4*(nsupers+k0)+2) % tag_ub; */
+            MPI_Isend (lblk_ptr, size, SuperLU_MPI_DOUBLE_COMPLEX, pc,
+                       SLU_MPI_TAG (5, k0) /* tag */ ,
+                       comm, L_diag_blk_send_req + pc);
+
+        }
+    }
+
+    return 0;
+}
+
+
+int_t zIRecv_UDiagBlock(int_t k0, doublecomplex *ublk_ptr, /*pointer for the diagonal block*/
+                        int_t size, /*number of elements to be broadcasted*/
+                        int_t src,
+                        MPI_Request *U_diag_blk_recv_req,
+                        gridinfo_t * grid, SCT_t* SCT, int tag_ub)
+{
+    unsigned long long t1 = _rdtsc();
+    MPI_Comm comm = (grid->cscp).comm;
+    /* tag = ((k0<<2)+2) % tag_ub;        */
+    /* tag = (4*(nsupers+k0)+2) % tag_ub; */
+
+    int_t err = MPI_Irecv (ublk_ptr, size, SuperLU_MPI_DOUBLE_COMPLEX, src,
+                           SLU_MPI_TAG (4, k0), comm, U_diag_blk_recv_req);
+    if (err==MPI_ERR_COUNT)
+    {
+        printf("Error in zIRecv_UDiagBlock count\n");
+    }
+    SCT->Recv_UDiagBlock_tl += (double) ( _rdtsc() - t1);
+    return 0;
+}
+
+int_t zIRecv_LDiagBlock(int_t k0, doublecomplex *L_blk_ptr, /*pointer for the diagonal block*/
+                        int_t size, /*number of elements to be broadcasted*/
+                        int_t src,
+                        MPI_Request *L_diag_blk_recv_req,
+                        gridinfo_t * grid, SCT_t* SCT, int tag_ub)
+{
+    unsigned long long t1 = _rdtsc();
+    MPI_Comm comm = (grid->rscp).comm;
+    /* tag = ((k0<<2)+2) % tag_ub;        */
+    /* tag = (4*(nsupers+k0)+2) % tag_ub; */
+
+    int_t err = MPI_Irecv (L_blk_ptr, size, SuperLU_MPI_DOUBLE_COMPLEX, src,
+                           SLU_MPI_TAG (5, k0),
+                           comm, L_diag_blk_recv_req);
+    if (err==MPI_ERR_COUNT)
+    {
+        printf("Error in zIRecv_LDiagBlock count\n");
+    }
+    SCT->Recv_UDiagBlock_tl += (double) ( _rdtsc() - t1);
+    return 0;
+}
+
+#if (MPI_VERSION>2)
+
+/**** Ibcast based on MPI_Ibcast ****/
+int_t zIBcast_UDiagBlock(int_t k, doublecomplex *ublk_ptr, /*pointer for the diagonal block*/
+                         int_t size, /*number of elements to be broadcasted*/
+                         MPI_Request *U_diag_blk_ibcast_req,
+                         gridinfo_t * grid)
+{
+    int_t krow = PROW (k, grid);
+    MPI_Comm comm = (grid->cscp).comm;
+
+    MPI_Ibcast(ublk_ptr, size, SuperLU_MPI_DOUBLE_COMPLEX, krow, comm,
+               U_diag_blk_ibcast_req);
+
+    // MPI_Status status;
+    // MPI_Wait(U_diag_blk_ibcast_req, &status);
+    return 0;
+}
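+/* Note: unlike the Isend/Irecv pairs above, MPI_Ibcast posts one collective
+   per scope; the returned request must still be completed (MPI_Wait or
+   MPI_Test) before the diagonal-block buffer is read or reused, as the
+   commented-out MPI_Wait above indicates. */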
+int_t zIBcast_LDiagBlock(int_t k, doublecomplex *lblk_ptr, /*pointer for the diagonal block*/
+                         int_t size, /*number of elements to be broadcasted*/
+                         MPI_Request *L_diag_blk_ibcast_req,
+                         gridinfo_t * grid)
+{
+    int_t kcol = PCOL (k, grid);
+    MPI_Comm comm = (grid->rscp).comm;
+
+    MPI_Ibcast(lblk_ptr, size, SuperLU_MPI_DOUBLE_COMPLEX, kcol, comm,
+               L_diag_blk_ibcast_req);
+    // MPI_Status status;
+    // MPI_Wait(L_diag_blk_ibcast_req, &status);
+    return 0;
+}
+
+#endif
+
+int_t zUDiagBlockRecvWait( int_t k, int_t* IrecvPlcd_D, int_t* factored_L,
+                           MPI_Request * U_diag_blk_recv_req,
+                           gridinfo_t *grid,
+                           LUstruct_t *LUstruct, SCT_t *SCT)
+{
+    LocalLU_t *Llu = LUstruct->Llu;
+
+    int_t iam = grid->iam;
+
+    int_t mycol = MYCOL (iam, grid);
+    int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
+
+    int_t kcol = PCOL (k, grid);
+
+    if (IrecvPlcd_D[k] == 1)
+    {
+        /* code */
+        /* factor the L panel */
+        if (mycol == kcol && factored_L[k] == 0 && iam != pkk)
+        {
+            factored_L[k] = 1;
+            int_t lk = LBj (k, grid);
+
+            int_t nsupr;
+            if (Llu->Lrowind_bc_ptr[lk])
+                nsupr = Llu->Lrowind_bc_ptr[lk][1];
+            else
+                nsupr = 0;
+            /* wait for communication to finish */
+
+            // Wait_UDiagBlock_Recv( U_diag_blk_recv_req, SCT);
+            int_t flag = 0;
+            while (flag == 0)
+            {
+                flag = Test_UDiagBlock_Recv( U_diag_blk_recv_req, SCT);
+            }
+        }
+    }
+    return 0;
+}
+
diff --git a/SRC/zgather.c b/SRC/zgather.c
new file mode 100644
index 00000000..f2745899
--- /dev/null
+++ b/SRC/zgather.c
@@ -0,0 +1,380 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+#include <stdio.h>
+#include "superlu_zdefs.h"
+#if 0
+#include "scatter.h"
+#include "sec_structs.h"
+#include "superlu_defs.h"
+#include "gather.h"
+#endif
+
+int_t zprintMatrix(char*s, int n, int m, doublecomplex* A, int LDA)
+{
+    /* Loop body reconstructed from context: print the n x m column-major
+       matrix A, real and imaginary part of each entry. */
+    printf("%s\n", s );
+    for(int i = 0; i < n; i++)
+    {
+        for(int j = 0; j < m; j++)
+        {
+            printf("(%g,%g) ", A[i + j * LDA].r, A[i + j * LDA].i);
+        }
+        printf("\n");
+    }
+    return 0;
+}
+
+/* Signature reconstructed to match the call site in zSchurComplementSetupGPU. */
+void zRgather_L( int_t k, int_t *lsub, doublecomplex *lusup,
+                 gEtreeInfo_t* gEtreeInfo, Glu_persist_t *Glu_persist,
+                 gridinfo_t *grid, HyP_t *HyP,
+                 int_t* myIperm, int_t *iperm_c_supno )
+{
+    int_t *xsup = Glu_persist->xsup;
+    int_t knsupc = SuperSize (k);
+    int_t krow = PROW (k, grid);
+    int_t nlb, lptr0, luptr0;
+    int_t iam = grid->iam;
+    int_t myrow = MYROW (iam, grid);
+    int_t ib, temp_nbrow;   /* declarations restored */
+    int_t cum_nrow = 0;
+
+    HyP->lookAheadBlk = 0, HyP->RemainBlk = 0;
+
+    int_t nsupr = lsub[1];  /* LDA of lusup. */
+    if (myrow == krow)  /* Skip diagonal block L(k,k). */
+    {
+        lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER + 1];
+        luptr0 = knsupc;
+        nlb = lsub[0] - 1;
+    }
+    else
+    {
+        lptr0 = BC_HEADER;
+        luptr0 = 0;
+        nlb = lsub[0];
+    }
+    // printf("nLb =%d ", nlb );
+
+    int_t lptr = lptr0;
+    int_t luptr = luptr0;
+    for (int_t i = 0; i < nlb; ++i)
+    {
+        ib = lsub[lptr];             /* Row block L(i,k). */
+        temp_nbrow = lsub[lptr + 1]; /* Number of full rows. */
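+        /* Decide whether L(ib,k) joins the look-ahead (host) set or the
+           remaining (offload) set.  It stays on the host when (1) ib is
+           eliminated before the first block scheduled on the accelerator,
+           (2) ib lies inside the look-ahead window of k, or (3) ib is k's
+           parent in the etree and k is its last unfactored child. */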
+        int_t look_up_flag = 1;
+
+        // ib stays on the host if it is eliminated before the first block
+        // scheduled on the GPU
+        if (iperm_c_supno[ib] < HyP->first_u_block_acc) look_up_flag = 0;
+
+        // or if myIperm[ib] is within the look-ahead window
+        if (myIperm[ib]< myIperm[k] + HyP->nCudaStreams && myIperm[ib]>0) look_up_flag = 0;
+
+        if (k <= HyP->nsupers - 2 && gEtreeInfo->setree[k] > 0 )
+        {
+            int_t k_parent = gEtreeInfo->setree[k];
+            if (ib == k_parent && gEtreeInfo->numChildLeft[k_parent]==1 )
+            {
+                look_up_flag = 0;
+            }
+        }
+        // look_up_flag = 0;
+        if (!look_up_flag)
+        {
+            /* ib is within the look-ahead window */
+            HyP->lookAhead_info[HyP->lookAheadBlk].nrows = temp_nbrow;
+            if (HyP->lookAheadBlk == 0)
+            {
+                HyP->lookAhead_info[HyP->lookAheadBlk].FullRow = temp_nbrow;
+            }
+            else
+            {
+                HyP->lookAhead_info[HyP->lookAheadBlk].FullRow
+                    = temp_nbrow + HyP->lookAhead_info[HyP->lookAheadBlk - 1].FullRow;
+            }
+            HyP->lookAhead_info[HyP->lookAheadBlk].StRow = cum_nrow;
+            HyP->lookAhead_info[HyP->lookAheadBlk].lptr = lptr;
+            HyP->lookAhead_info[HyP->lookAheadBlk].ib = ib;
+            HyP->lookAheadBlk++;
+        }
+        else
+        {
+            /* ib is not in the look-ahead window */
+            HyP->Remain_info[HyP->RemainBlk].nrows = temp_nbrow;
+            if (HyP->RemainBlk == 0)
+            {
+                HyP->Remain_info[HyP->RemainBlk].FullRow = temp_nbrow;
+            }
+            else
+            {
+                HyP->Remain_info[HyP->RemainBlk].FullRow
+                    = temp_nbrow + HyP->Remain_info[HyP->RemainBlk - 1].FullRow;
+            }
+            HyP->Remain_info[HyP->RemainBlk].StRow = cum_nrow;
+            HyP->Remain_info[HyP->RemainBlk].lptr = lptr;
+            HyP->Remain_info[HyP->RemainBlk].ib = ib;
+            HyP->RemainBlk++;
+        }
+
+        cum_nrow += temp_nbrow;
+
+        lptr += LB_DESCRIPTOR;  /* Skip descriptor. */
+        lptr += temp_nbrow;
+        luptr += temp_nbrow;
+    }
+    lptr = lptr0;
+    luptr = luptr0;
+
+    zgather_l( HyP->lookAheadBlk, knsupc, HyP->lookAhead_info,
+               &lusup[luptr], nsupr, HyP->lookAhead_L_buff);
+
+    zgather_l( HyP->RemainBlk, knsupc, HyP->Remain_info,
+               &lusup[luptr], nsupr, HyP->Remain_L_buff);
+
+    assert(HyP->lookAheadBlk + HyP->RemainBlk == nlb );
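+    /* FullRow is a running prefix sum of block heights, so the last entry
+       of each list is the total row count of the corresponding buffer. */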
+    HyP->Lnbrow = HyP->lookAheadBlk == 0 ? 0 : HyP->lookAhead_info[HyP->lookAheadBlk - 1].FullRow;
+    HyP->Rnbrow = HyP->RemainBlk == 0 ? 0 : HyP->Remain_info[HyP->RemainBlk - 1].FullRow;
+
+    // zprintMatrix("LookAhead Block", HyP->Lnbrow, knsupc, HyP->lookAhead_L_buff, HyP->Lnbrow);
+    // zprintMatrix("Remaining Block", HyP->Rnbrow, knsupc, HyP->Remain_L_buff, HyP->Rnbrow);
+}
+
+// void Rgather_U(int_t k,
+//                HyP_t *HyP,
+//                int_t st, int_t end,
+//                int_t *usub, double *uval, double *bigU,
+//                Glu_persist_t *Glu_persist, gridinfo_t *grid,
+//                int_t *perm_u)
+
+void zRgather_U( int_t k, int_t jj0, int_t *usub, doublecomplex *uval,
+                 doublecomplex *bigU, gEtreeInfo_t* gEtreeInfo,
+                 Glu_persist_t *Glu_persist, gridinfo_t *grid, HyP_t *HyP,
+                 int_t* myIperm, int_t *iperm_c_supno, int_t *perm_u)
+{
+    HyP->ldu = 0;
+    HyP->num_u_blks = 0;
+    HyP->ldu_Phi = 0;
+    HyP->num_u_blks_Phi = 0;
+
+    int_t iukp = BR_HEADER;  /* Skip header; Pointer to index[] of U(k,:) */
+    int_t rukp = 0;          /* Pointer to nzval[] of U(k,:) */
+    int_t nub = usub[0];     /* Number of blocks in the block row U(k,:) */
+    int_t *xsup = Glu_persist->xsup;
+    // int_t k = perm_c_supno[k0];
+    int_t klst = FstBlockC (k + 1);
+    int_t iukp0 = iukp;
+    int_t rukp0 = rukp;
+    int_t jb, ljb;
+    int_t nsupc;
+    int_t full = 1;
+    int_t full_Phi = 1;
+    int_t temp_ncols = 0;
+    int_t segsize;
+
+    for (int_t j = jj0; j < nub; ++j)
+    {
+        temp_ncols = 0;
+        arrive_at_ublock(
+            j, &iukp, &rukp, &jb, &ljb, &nsupc,
+            iukp0, rukp0, usub, perm_u, xsup, grid
+        );
+
+        for (int_t jj = iukp; jj < iukp + nsupc; ++jj)
+        {
+            segsize = klst - usub[jj];
+            if ( segsize ) ++temp_ncols;
+        }
+        /* here goes the condition whether the jb block lives on the
+           accelerator (Phi) or not */
+        int_t u_blk_acc_cond = 0;
+        // if (j == jj0) u_blk_acc_cond = 1; /* must schedule first column on cpu */
+        if (iperm_c_supno[jb] < HyP->first_l_block_acc)
+        {
+            // printf("k=%d jb=%d got at condition-1:%d, %d \n",k,jb, iperm_c_supno[jb] , HyP->first_l_block_acc);
+            u_blk_acc_cond = 1;
+        }
+        // if jb is within the look-ahead window
+        if (myIperm[jb]< myIperm[k] + HyP->nCudaStreams && myIperm[jb]>0)
+        {
+            // printf("k=%d jb=%d got at condition-2:%d, %d\n ",k,jb, myIperm[jb] , myIperm[k]);
+            u_blk_acc_cond = 1;
+        }
+
+        if (k <= HyP->nsupers - 2 && gEtreeInfo->setree[k] > 0 )
+        {
+            int_t k_parent = gEtreeInfo->setree[k];
+            if (jb == k_parent && gEtreeInfo->numChildLeft[k_parent]==1 )
+            {
+                // printf("k=%d jb=%d got at condition-3\n",k,jb);
+                u_blk_acc_cond = 1;
+            }
+        }
+
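+        /* Blocks kept on the host land in Ublock_info; blocks bound for the
+           accelerator land in Ublock_info_Phi ("Phi" presumably after the
+           Xeon Phi offload target this code was first written for). */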
+        if (u_blk_acc_cond)
+        {
+            HyP->Ublock_info[HyP->num_u_blks].iukp = iukp;
+            HyP->Ublock_info[HyP->num_u_blks].rukp = rukp;
+            HyP->Ublock_info[HyP->num_u_blks].jb = jb;
+
+            for (int_t jj = iukp; jj < iukp + nsupc; ++jj)
+            {
+                segsize = klst - usub[jj];
+                if ( segsize )
+                {
+                    if ( segsize != HyP->ldu ) full = 0;
+                    if ( segsize > HyP->ldu ) HyP->ldu = segsize;
+                }
+            }
+
+            HyP->Ublock_info[HyP->num_u_blks].ncols = temp_ncols;
+            // ncols += temp_ncols;
+            HyP->num_u_blks++;
+        }
+        else
+        {
+            HyP->Ublock_info_Phi[HyP->num_u_blks_Phi].iukp = iukp;
+            HyP->Ublock_info_Phi[HyP->num_u_blks_Phi].rukp = rukp;
+            HyP->Ublock_info_Phi[HyP->num_u_blks_Phi].jb = jb;
+            HyP->Ublock_info_Phi[HyP->num_u_blks_Phi].eo = HyP->nsupers - iperm_c_supno[jb]; /* since we want it in descending order */
+
+            /* Prepare to call ZGEMM. */
+
+            for (int_t jj = iukp; jj < iukp + nsupc; ++jj)
+            {
+                segsize = klst - usub[jj];
+                if ( segsize )
+                {
+                    if ( segsize != HyP->ldu_Phi ) full_Phi = 0;
+                    if ( segsize > HyP->ldu_Phi ) HyP->ldu_Phi = segsize;
+                }
+            }
+
+            HyP->Ublock_info_Phi[HyP->num_u_blks_Phi].ncols = temp_ncols;
+            // ncols_Phi += temp_ncols;
+            HyP->num_u_blks_Phi++;
+        }
+    }
+
+    /* Now do a prefix sum on ncols. */
+    HyP->Ublock_info[0].full_u_cols = HyP->Ublock_info[0 ].ncols;
+    for (int_t j = 1; j < HyP->num_u_blks; ++j)
+    {
+        HyP->Ublock_info[j].full_u_cols = HyP->Ublock_info[j ].ncols + HyP->Ublock_info[j - 1].full_u_cols;
+    }
+
+    /* sort the U blocks in elimination order */
+    // sort_U_info_elm(HyP->Ublock_info_Phi,HyP->num_u_blks_Phi );
+    HyP->Ublock_info_Phi[0].full_u_cols = HyP->Ublock_info_Phi[0 ].ncols;
+    for ( int_t j = 1; j < HyP->num_u_blks_Phi; ++j)
+    {
+        HyP->Ublock_info_Phi[j].full_u_cols = HyP->Ublock_info_Phi[j ].ncols + HyP->Ublock_info_Phi[j - 1].full_u_cols;
+    }
+
+    HyP->bigU_Phi = bigU;
+    HyP->bigU_host = bigU + HyP->ldu_Phi * HyP->Ublock_info_Phi[HyP->num_u_blks_Phi - 1].full_u_cols;
+
+    zgather_u(HyP->num_u_blks, HyP->Ublock_info, usub, uval, HyP->bigU_host,
+              HyP->ldu, xsup, klst );
+
+    zgather_u(HyP->num_u_blks_Phi, HyP->Ublock_info_Phi, usub, uval,
+              HyP->bigU_Phi, HyP->ldu_Phi, xsup, klst );
+
+} /* zRgather_U */
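+/* Note on the gather step: zgather_l and zgather_u pack the selected blocks
+   into contiguous column-major buffers so the Schur-complement update can be
+   issued as one GEMM per buffer.  In the layout built above, block b of an
+   info[] list occupies rows [info[b].FullRow - info[b].nrows, info[b].FullRow)
+   of the packed buffer, taken from source rows starting at info[b].StRow of
+   the panel. */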
diff --git a/SRC/ztreeFactorization.c b/SRC/ztreeFactorization.c
new file mode 100644
index 00000000..6f07559f
--- /dev/null
+++ b/SRC/ztreeFactorization.c
@@ -0,0 +1,679 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+#include "superlu_zdefs.h"
+#if 0
+#include "treeFactorization.h"
+#include "trfCommWrapper.h"
+#endif
+
+int_t zLluBufInit(LUValSubBuf_t* LUvsb, LUstruct_t *LUstruct)
+{
+    LocalLU_t *Llu = LUstruct->Llu;
+    LUvsb->Lsub_buf = intMalloc_dist(Llu->bufmax[0]); //INT_T_ALLOC(Llu->bufmax[0]);
+    LUvsb->Lval_buf = doublecomplexMalloc_dist(Llu->bufmax[1]);
+    LUvsb->Usub_buf = intMalloc_dist(Llu->bufmax[2]); //INT_T_ALLOC(Llu->bufmax[2]);
+    LUvsb->Uval_buf = doublecomplexMalloc_dist(Llu->bufmax[3]);
+    return 0;
+}
+
+diagFactBufs_t** zinitDiagFactBufsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* grid)
+{
+    diagFactBufs_t** dFBufs;
+    dFBufs = (diagFactBufs_t** ) SUPERLU_MALLOC(mxLeafNode * sizeof(diagFactBufs_t*));
+    for (int i = 0; i < mxLeafNode; ++i)
+    {
+        /* code */
+        dFBufs[i] = (diagFactBufs_t* ) SUPERLU_MALLOC(sizeof(diagFactBufs_t));
+        assert(dFBufs[i]);
+        zinitDiagFactBufs(ldt, dFBufs[i]);
+
+    } /*Minor for loop -2 for (int i = 0; i < mxLeafNode; ++i)*/
+
+    return dFBufs;
+}
+
+LUValSubBuf_t** zLluBufInitArr(int_t numLA, LUstruct_t *LUstruct)
+{
+    LUValSubBuf_t** LUvsbs = (LUValSubBuf_t**) SUPERLU_MALLOC(numLA * sizeof(LUValSubBuf_t*));
+    for (int_t i = 0; i < numLA; ++i)
+    {
+        /* code */
+        LUvsbs[i] = (LUValSubBuf_t*) SUPERLU_MALLOC(sizeof(LUValSubBuf_t));
+        zLluBufInit(LUvsbs[i], LUstruct);
+    } /*minor for loop-3 for (int_t i = 0; i < numLA; ++i)*/
+
+    return LUvsbs;
+}
+
+
+int_t zinitScuBufs(int_t ldt, int_t num_threads, int_t nsupers,
+                   scuBufs_t* scuBufs,
+                   LUstruct_t* LUstruct,
+                   gridinfo_t * grid)
+{
+    scuBufs->bigV = zgetBigV(ldt, num_threads);
+    scuBufs->bigU = zgetBigU(nsupers, grid, LUstruct);
+    return 0;
+}
+
+int_t zinitDiagFactBufs(int_t ldt, diagFactBufs_t* dFBuf)
+{
+    dFBuf->BlockUFactor = doublecomplexMalloc_dist(ldt * ldt);
+    dFBuf->BlockLFactor = doublecomplexMalloc_dist(ldt * ldt);
+    return 0;
+}
+
+int_t zdenseTreeFactor(
+    int_t nnodes,          // number of nodes in the tree
+    int_t *perm_c_supno,   // list of nodes in the order of factorization
+    commRequests_t *comReqs,    // lists of communication requests
+    scuBufs_t *scuBufs,         // contains buffers for schur complement update
+    packLUInfo_t*packLUInfo,
+    msgs_t*msgs,
+    LUValSubBuf_t*LUvsb,
+    diagFactBufs_t *dFBuf,
+    factStat_t *factStat,
+    factNodelists_t  *fNlists,
+    superlu_dist_options_t *options,
+    int_t * gIperm_c_supno,
+    int_t ldt,
+    LUstruct_t *LUstruct, gridinfo3d_t * grid3d, SuperLUStat_t *stat,
+    double thresh,  SCT_t *SCT, int tag_ub,
+    int *info
+)
+{
+    gridinfo_t* grid = &(grid3d->grid2d);
+    LocalLU_t *Llu = LUstruct->Llu;
+
+    /* main loop over all the supernodes */
+    for (int_t k0 = 0; k0 < nnodes ; ++k0)
+    {
+        int_t k = perm_c_supno[k0];  // direct computation no perm_c_supno
+
+        /* diagonal factorization */
+#if 0
+        sDiagFactIBCast(k,  dFBuf, factStat, comReqs, grid,
+                        options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#else
+        zDiagFactIBCast(k,  k, dFBuf->BlockUFactor, dFBuf->BlockLFactor,
+                        factStat->IrecvPlcd_D,
+                        comReqs->U_diag_blk_recv_req,
+                        comReqs->L_diag_blk_recv_req,
+                        comReqs->U_diag_blk_send_req,
+                        comReqs->L_diag_blk_send_req,
+                        grid, options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#endif
+
+#if 0
+        /*L update */
+        sLPanelUpdate(k, dFBuf, factStat, comReqs, grid, LUstruct, SCT);
+        /*L Ibcast*/
+        sIBcastRecvLPanel( k, comReqs, LUvsb, msgs, factStat, grid, LUstruct, SCT, tag_ub );
+        /*U update*/
+        sUPanelUpdate(k, ldt, dFBuf, factStat, comReqs, scuBufs,
+                      packLUInfo, grid, LUstruct, stat, SCT);
+        /*U
bcast*/ + sIBcastRecvUPanel( k, comReqs, LUvsb, msgs, factStat, grid, LUstruct, SCT, tag_ub ); + /*Wait for L panel*/ + sWaitL(k, comReqs, msgs, grid, LUstruct, SCT); + /*Wait for U panel*/ + sWaitU(k, comReqs, msgs, grid, LUstruct, SCT); +#else + /*L update */ + zLPanelUpdate(k, factStat->IrecvPlcd_D, factStat->factored_L, + comReqs->U_diag_blk_recv_req, dFBuf->BlockUFactor, grid, LUstruct, SCT); + /*L Ibcast*/ + zIBcastRecvLPanel(k, k, msgs->msgcnt, comReqs->send_req, comReqs->recv_req, + LUvsb->Lsub_buf, LUvsb->Lval_buf, factStat->factored, + grid, LUstruct, SCT, tag_ub); + /*U update*/ + zUPanelUpdate(k, factStat->factored_U, comReqs->L_diag_blk_recv_req, + dFBuf->BlockLFactor, scuBufs->bigV, ldt, + packLUInfo->Ublock_info, grid, LUstruct, stat, SCT); + /*U bcast*/ + zIBcastRecvUPanel(k, k, msgs->msgcnt, comReqs->send_requ, comReqs->recv_requ, + LUvsb->Usub_buf, LUvsb->Uval_buf, + grid, LUstruct, SCT, tag_ub); + zWaitL(k, msgs->msgcnt, msgs->msgcntU, comReqs->send_req, comReqs->recv_req, + grid, LUstruct, SCT); + zWaitU(k, msgs->msgcnt, comReqs->send_requ, comReqs->recv_requ, grid, LUstruct, SCT); +#endif + double tsch = SuperLU_timer_(); +#if 0 + int_t LU_nonempty = sSchurComplementSetup(k, + msgs, packLUInfo, gIperm_c_supno, perm_c_supno, + fNlists, scuBufs, LUvsb, grid, LUstruct); +#else + int_t LU_nonempty= zSchurComplementSetup(k, msgs->msgcnt, + packLUInfo->Ublock_info, packLUInfo->Remain_info, + packLUInfo->uPanelInfo, packLUInfo->lPanelInfo, + gIperm_c_supno, fNlists->iperm_u, fNlists->perm_u, + scuBufs->bigU, LUvsb->Lsub_buf, LUvsb->Lval_buf, + LUvsb->Usub_buf, LUvsb->Uval_buf, + grid, LUstruct); +#endif + if (LU_nonempty) + { + Ublock_info_t* Ublock_info = packLUInfo->Ublock_info; + Remain_info_t* Remain_info = packLUInfo->Remain_info; + uPanelInfo_t* uPanelInfo = packLUInfo->uPanelInfo; + lPanelInfo_t* lPanelInfo = packLUInfo->lPanelInfo; + int_t* indirect = fNlists->indirect; + int_t* indirect2 = fNlists->indirect2; + /*Schurcomplement Update*/ + int_t nub = uPanelInfo->nub; + int_t nlb = lPanelInfo->nlb; + doublecomplex* bigV = scuBufs->bigV; + doublecomplex* bigU = scuBufs->bigU; + +#pragma omp parallel for schedule(dynamic) + for (int_t ij = 0; ij < nub * nlb; ++ij) + { + /* code */ + int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + int_t** Ufstnz_br_ptr = LUstruct->Llu->Ufstnz_br_ptr; + doublecomplex** Unzval_br_ptr = LUstruct->Llu->Unzval_br_ptr; + int_t* xsup = LUstruct->Glu_persist->xsup; + int_t ub = ij / nlb; + int_t lb + = ij % nlb; + doublecomplex *L_mat = lPanelInfo->lusup; + int_t ldl = lPanelInfo->nsupr; + int_t luptr0 = lPanelInfo->luptr0; + doublecomplex *U_mat = bigU; + int_t ldu = uPanelInfo->ldu; + int_t knsupc = SuperSize(k); + int_t klst = FstBlockC (k + 1); + int_t *lsub = lPanelInfo->lsub; + int_t *usub = uPanelInfo->usub; + int_t thread_id = omp_get_thread_num(); + block_gemm_scatter( lb, ub, + Ublock_info, + Remain_info, + &L_mat[luptr0], ldl, + U_mat, ldu, + bigV, + knsupc, klst, + lsub, usub, ldt, + thread_id, indirect, indirect2, + Lrowind_bc_ptr, Lnzval_bc_ptr, + Ufstnz_br_ptr, Unzval_br_ptr, + xsup, grid, stat +#ifdef SCATTER_PROFILE + , Host_TheadScatterMOP, Host_TheadScatterTimer +#endif + ); + } /*for (int_t ij = 0; ij < nub * nlb;*/ + } /*if (LU_nonempty)*/ + SCT->NetSchurUpTimer += SuperLU_timer_() - tsch; +#if 0 + sWait_LUDiagSend(k, comReqs, grid, SCT); +#else + Wait_LUDiagSend(k, comReqs->U_diag_blk_send_req, comReqs->L_diag_blk_send_req, + grid, SCT); +#endif + }/*for main loop 
(int_t k0 = 0; k0 < gNodeCount[tree]; ++k0)*/ + + return 0; +} /* zdenseTreeFactor */ + +/* + * 2D factorization at individual subtree. + */ +int_t zsparseTreeFactor_ASYNC( + sForest_t* sforest, + commRequests_t **comReqss, // lists of communication requests // size maxEtree level + scuBufs_t *scuBufs, // contains buffers for schur complement update + packLUInfo_t*packLUInfo, + msgs_t**msgss, // size=num Look ahead + LUValSubBuf_t**LUvsbs, // size=num Look ahead + diagFactBufs_t **dFBufs, // size maxEtree level + factStat_t *factStat, + factNodelists_t *fNlists, + gEtreeInfo_t* gEtreeInfo, // global etree info + superlu_dist_options_t *options, + int_t * gIperm_c_supno, + int_t ldt, + HyP_t* HyP, + LUstruct_t *LUstruct, gridinfo3d_t * grid3d, SuperLUStat_t *stat, + double thresh, SCT_t *SCT, int tag_ub, + int *info +) +{ + int_t nnodes = sforest->nNodes ; // number of nodes in the tree + if (nnodes < 1) + { + return 1; + } + + int_t *perm_c_supno = sforest->nodeList ; // list of nodes in the order of factorization + treeTopoInfo_t* treeTopoInfo = &sforest->topoInfo; + int_t* myIperm = treeTopoInfo->myIperm; + + gridinfo_t* grid = &(grid3d->grid2d); + /*main loop over all the levels*/ + + int_t maxTopoLevel = treeTopoInfo->numLvl; + int_t* eTreeTopLims = treeTopoInfo->eTreeTopLims; + int_t * IrecvPlcd_D = factStat->IrecvPlcd_D; + int_t* factored_D = factStat->factored_D; + int_t * factored_L = factStat->factored_L; + int_t * factored_U = factStat->factored_U; + int_t* IbcastPanel_L = factStat->IbcastPanel_L; + int_t* IbcastPanel_U = factStat->IbcastPanel_U; + int_t* xsup = LUstruct->Glu_persist->xsup; + + int_t numLAMax = getNumLookAhead(options); + int_t numLA = numLAMax; + + for (int_t k0 = 0; k0 < eTreeTopLims[1]; ++k0) + { + int_t k = perm_c_supno[k0]; // direct computation no perm_c_supno + int_t offset = k0; + /* k-th diagonal factorization */ + /*Now factor and broadcast diagonal block*/ +#if 0 + sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid, + options, thresh, LUstruct, stat, info, SCT, tag_ub); +#else + zDiagFactIBCast(k, k, dFBufs[offset]->BlockUFactor, dFBufs[offset]->BlockLFactor, + factStat->IrecvPlcd_D, + comReqss[offset]->U_diag_blk_recv_req, + comReqss[offset]->L_diag_blk_recv_req, + comReqss[offset]->U_diag_blk_send_req, + comReqss[offset]->L_diag_blk_send_req, + grid, options, thresh, LUstruct, stat, info, SCT, tag_ub); +#endif + factored_D[k] = 1; + } + + for (int_t topoLvl = 0; topoLvl < maxTopoLevel; ++topoLvl) + { + /* code */ + int_t k_st = eTreeTopLims[topoLvl]; + int_t k_end = eTreeTopLims[topoLvl + 1]; + for (int_t k0 = k_st; k0 < k_end; ++k0) + { + int_t k = perm_c_supno[k0]; // direct computation no perm_c_supno + int_t offset = k0 - k_st; + /* diagonal factorization */ + if (!factored_D[k] ) + { + /*If LU panels from GPU are not reduced then reduce + them before diagonal factorization*/ +#if 0 + sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid, + options, thresh, LUstruct, stat, info, SCT, tag_ub); +#else + zDiagFactIBCast(k, k, dFBufs[offset]->BlockUFactor, + dFBufs[offset]->BlockLFactor, factStat->IrecvPlcd_D, + comReqss[offset]->U_diag_blk_recv_req, + comReqss[offset]->L_diag_blk_recv_req, + comReqss[offset]->U_diag_blk_send_req, + comReqss[offset]->L_diag_blk_send_req, + grid, options, thresh, LUstruct, stat, info, SCT, tag_ub); +#endif + } + } + double t_apt = SuperLU_timer_(); + + for (int_t k0 = k_st; k0 < k_end; ++k0) + { + int_t k = perm_c_supno[k0]; // direct computation no perm_c_supno + int_t offset = k0 - 
k_st;
+
+            /*L update */
+            if (factored_L[k] == 0)
+            {
+#if 0
+                sLPanelUpdate(k, dFBufs[offset], factStat, comReqss[offset],
+                              grid, LUstruct, SCT);
+#else
+                zLPanelUpdate(k, factStat->IrecvPlcd_D, factStat->factored_L,
+                              comReqss[offset]->U_diag_blk_recv_req,
+                              dFBufs[offset]->BlockUFactor, grid, LUstruct, SCT);
+#endif
+                factored_L[k] = 1;
+            }
+            /*U update*/
+            if (factored_U[k] == 0)
+            {
+#if 0
+                sUPanelUpdate(k, ldt, dFBufs[offset], factStat, comReqss[offset],
+                              scuBufs, packLUInfo, grid, LUstruct, stat, SCT);
+#else
+                zUPanelUpdate(k, factStat->factored_U, comReqss[offset]->L_diag_blk_recv_req,
+                              dFBufs[offset]->BlockLFactor, scuBufs->bigV, ldt,
+                              packLUInfo->Ublock_info, grid, LUstruct, stat, SCT);
+#endif
+                factored_U[k] = 1;
+            }
+        }
+
+        for (int_t k0 = k_st; k0 < SUPERLU_MIN(k_end, k_st + numLA); ++k0)
+        {
+            int_t k = perm_c_supno[k0];  // direct computation no perm_c_supno
+            int_t offset = k0 % numLA;
+            /* diagonal factorization */
+
+            /*L Ibcast*/
+            if (IbcastPanel_L[k] == 0)
+            {
+#if 0
+                sIBcastRecvLPanel( k, comReqss[offset], LUvsbs[offset],
+                                   msgss[offset], factStat, grid, LUstruct, SCT, tag_ub );
+#else
+                zIBcastRecvLPanel(k, k, msgss[offset]->msgcnt, comReqss[offset]->send_req,
+                                  comReqss[offset]->recv_req, LUvsbs[offset]->Lsub_buf,
+                                  LUvsbs[offset]->Lval_buf, factStat->factored,
+                                  grid, LUstruct, SCT, tag_ub);
+#endif
+                IbcastPanel_L[k] = 1; /*for consistency; unused later*/
+            }
+
+            /*U Ibcast*/
+            if (IbcastPanel_U[k] == 0)
+            {
+#if 0
+                sIBcastRecvUPanel( k, comReqss[offset], LUvsbs[offset],
+                                   msgss[offset], factStat, grid, LUstruct, SCT, tag_ub );
+#else
+                zIBcastRecvUPanel(k, k, msgss[offset]->msgcnt, comReqss[offset]->send_requ,
+                                  comReqss[offset]->recv_requ, LUvsbs[offset]->Usub_buf,
+                                  LUvsbs[offset]->Uval_buf, grid, LUstruct, SCT, tag_ub);
+#endif
+                IbcastPanel_U[k] = 1;
+            }
+        }
+
+        // if (topoLvl) SCT->tAsyncPipeTail += SuperLU_timer_() - t_apt;
+        SCT->tAsyncPipeTail += SuperLU_timer_() - t_apt;
+
+        for (int_t k0 = k_st; k0 < k_end; ++k0)
+        {
+            int_t k = perm_c_supno[k0];  // direct computation no perm_c_supno
+            int_t offset = k0 % numLA;
+
+#if 0
+            sWaitL(k, comReqss[offset], msgss[offset], grid, LUstruct, SCT);
+            /*Wait for U panel*/
+            sWaitU(k, comReqss[offset], msgss[offset], grid, LUstruct, SCT);
+#else
+            zWaitL(k, msgss[offset]->msgcnt, msgss[offset]->msgcntU,
+                   comReqss[offset]->send_req, comReqss[offset]->recv_req,
+                   grid, LUstruct, SCT);
+            zWaitU(k, msgss[offset]->msgcnt, comReqss[offset]->send_requ,
+                   comReqss[offset]->recv_requ, grid, LUstruct, SCT);
+#endif
+            double tsch = SuperLU_timer_();
+            int_t LU_nonempty = zSchurComplementSetupGPU(k,
+                                    msgss[offset], packLUInfo,
+                                    myIperm, gIperm_c_supno,
+                                    perm_c_supno, gEtreeInfo,
+                                    fNlists, scuBufs,
+                                    LUvsbs[offset],
+                                    grid, LUstruct, HyP);
+            // initializing D2H data transfer
+            int_t jj_cpu = 0;
+
+            scuStatUpdate( SuperSize(k), HyP,  SCT, stat);
+            uPanelInfo_t* uPanelInfo = packLUInfo->uPanelInfo;
+            lPanelInfo_t* lPanelInfo = packLUInfo->lPanelInfo;
+            int_t *lsub = lPanelInfo->lsub;
+            int_t *usub = uPanelInfo->usub;
+            int_t* indirect = fNlists->indirect;
+            int_t* indirect2 = fNlists->indirect2;
+
+            /* Schur complement update */
+
+            int_t knsupc = SuperSize(k);
+            int_t klst = FstBlockC (k + 1);
+
+            doublecomplex* bigV = scuBufs->bigV;
+
+#pragma omp parallel
+            {
+#pragma omp for schedule(dynamic,2) nowait
+                /* Each thread is assigned one loop index ij, responsible for
+                   block update L(lb,k) * U(k,j) -> tempv[].
*/ + for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks; ++ij) + { + /* Get the entire area of L (look-ahead) X U (all-blocks). */ + /* for each j-block in U, go through all L-blocks in the + look-ahead window. */ + int_t j = ij / HyP->lookAheadBlk; + + int_t lb = ij % HyP->lookAheadBlk; + block_gemm_scatterTopLeft( lb, j, bigV, knsupc, klst, lsub, + usub, ldt, indirect, indirect2, HyP, + LUstruct, grid, SCT, stat ); + } + +#pragma omp for schedule(dynamic,2) nowait + for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks_Phi; ++ij) + { + int_t j = ij / HyP->lookAheadBlk ; + int_t lb = ij % HyP->lookAheadBlk; + block_gemm_scatterTopRight( lb, j, bigV, knsupc, klst, lsub, + usub, ldt, indirect, indirect2, HyP, + LUstruct, grid, SCT, stat); + } + +#pragma omp for schedule(dynamic,2) nowait + for (int_t ij = 0; ij < HyP->RemainBlk * HyP->num_u_blks; ++ij) // + { + int_t j = ij / HyP->RemainBlk; + int_t lb = ij % HyP->RemainBlk; + block_gemm_scatterBottomLeft( lb, j, bigV, knsupc, klst, lsub, + usub, ldt, indirect, indirect2, + HyP, LUstruct, grid, SCT, stat); + } /*for (int_t ij =*/ + } + + if (topoLvl < maxTopoLevel - 1) + { + int_t k_parent = gEtreeInfo->setree[k]; + gEtreeInfo->numChildLeft[k_parent]--; + if (gEtreeInfo->numChildLeft[k_parent] == 0) + { + int_t k0_parent = myIperm[k_parent]; + if (k0_parent > 0) + { + /* code */ + assert(k0_parent < nnodes); + int_t offset = k0_parent - k_end; +#if 0 + sDiagFactIBCast(k_parent, dFBufs[offset], factStat, + comReqss[offset], grid, options, thresh, + LUstruct, stat, info, SCT, tag_ub); +#else + zDiagFactIBCast(k_parent, k_parent, dFBufs[offset]->BlockUFactor, + dFBufs[offset]->BlockLFactor, factStat->IrecvPlcd_D, + comReqss[offset]->U_diag_blk_recv_req, + comReqss[offset]->L_diag_blk_recv_req, + comReqss[offset]->U_diag_blk_send_req, + comReqss[offset]->L_diag_blk_send_req, + grid, options, thresh, LUstruct, stat, info, SCT, tag_ub); +#endif + factored_D[k_parent] = 1; + } + + } + } + +#pragma omp parallel + { +#pragma omp for schedule(dynamic,2) nowait + for (int_t ij = 0; ij < HyP->RemainBlk * (HyP->num_u_blks_Phi - jj_cpu) ; ++ij) + { + int_t j = ij / HyP->RemainBlk + jj_cpu; + int_t lb = ij % HyP->RemainBlk; + block_gemm_scatterBottomRight( lb, j, bigV, knsupc, klst, lsub, + usub, ldt, indirect, indirect2, + HyP, LUstruct, grid, SCT, stat); + } /*for (int_t ij =*/ + + } + + SCT->NetSchurUpTimer += SuperLU_timer_() - tsch; + // finish waiting for diag block send + int_t abs_offset = k0 - k_st; +#if 0 + sWait_LUDiagSend(k, comReqss[abs_offset], grid, SCT); +#else + Wait_LUDiagSend(k, comReqss[abs_offset]->U_diag_blk_send_req, + comReqss[abs_offset]->L_diag_blk_send_req, + grid, SCT); +#endif + /*Schedule next I bcasts*/ + for (int_t next_k0 = k0 + 1; next_k0 < SUPERLU_MIN( k0 + 1 + numLA, nnodes); ++next_k0) + { + /* code */ + int_t next_k = perm_c_supno[next_k0]; + int_t offset = next_k0 % numLA; + + /*L Ibcast*/ + if (IbcastPanel_L[next_k] == 0 && factored_L[next_k]) + { +#if 0 + sIBcastRecvLPanel( next_k, comReqss[offset], + LUvsbs[offset], msgss[offset], factStat, + grid, LUstruct, SCT, tag_ub ); +#else + zIBcastRecvLPanel(next_k, next_k, msgss[offset]->msgcnt, + comReqss[offset]->send_req, comReqss[offset]->recv_req, + LUvsbs[offset]->Lsub_buf, LUvsbs[offset]->Lval_buf, + factStat->factored, grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_L[next_k] = 1; /*will be used later*/ + } + /*U Ibcast*/ + if (IbcastPanel_U[next_k] == 0 && factored_U[next_k]) + { +#if 0 + sIBcastRecvUPanel( next_k, comReqss[offset], + 
LUvsbs[offset], msgss[offset], factStat,
+                                               grid, LUstruct, SCT, tag_ub );
+#else
+                    zIBcastRecvUPanel(next_k, next_k, msgss[offset]->msgcnt,
+                                      comReqss[offset]->send_requ, comReqss[offset]->recv_requ,
+                                      LUvsbs[offset]->Usub_buf, LUvsbs[offset]->Uval_buf,
+                                      grid, LUstruct, SCT, tag_ub);
+#endif
+                    IbcastPanel_U[next_k] = 1;
+                }
+            }
+
+            if (topoLvl < maxTopoLevel - 1)
+            {
+
+                /* look-ahead LU factorization */
+                int_t kx_st = eTreeTopLims[topoLvl + 1];
+                int_t kx_end = eTreeTopLims[topoLvl + 2];
+                for (int_t k0x = kx_st; k0x < kx_end; k0x++)
+                {
+                    /* code */
+                    int_t kx = perm_c_supno[k0x];
+                    int_t offset = k0x - kx_st;
+                    if (IrecvPlcd_D[kx] && !factored_L[kx])
+                    {
+                        /* check if received */
+                        int_t recvUDiag = checkRecvUDiag(kx, comReqss[offset],
+                                                         grid, SCT);
+                        if (recvUDiag)
+                        {
+#if 0
+                            sLPanelTrSolve( kx,  dFBufs[offset],
+                                            factStat, comReqss[offset],
+                                            grid, LUstruct, SCT);
+#else
+                            zLPanelTrSolve( kx, factStat->factored_L,
+                                            dFBufs[offset]->BlockUFactor, grid, LUstruct);
+#endif
+
+                            factored_L[kx] = 1;
+
+                            /* check if an L_Ibcast is possible */
+
+                            if (IbcastPanel_L[kx] == 0 &&
+                                    k0x - k0 < numLA + 1  &&  // is within lookahead window
+                                    factored_L[kx])
+                            {
+                                int_t offset1 = k0x % numLA;
+#if 0
+                                sIBcastRecvLPanel( kx, comReqss[offset1], LUvsbs[offset1],
+                                                   msgss[offset1], factStat,
+                                                   grid, LUstruct, SCT, tag_ub);
+#else
+                                zIBcastRecvLPanel(kx, kx, msgss[offset1]->msgcnt,
+                                                  comReqss[offset1]->send_req,
+                                                  comReqss[offset1]->recv_req,
+                                                  LUvsbs[offset1]->Lsub_buf,
+                                                  LUvsbs[offset1]->Lval_buf,
+                                                  factStat->factored,
+                                                  grid, LUstruct, SCT, tag_ub);
+#endif
+                                IbcastPanel_L[kx] = 1; /*will be used later*/
+                            }
+
+                        }
+                    }
+
+                    if (IrecvPlcd_D[kx] && !factored_U[kx])
+                    {
+                        /* check if received */
+                        int_t recvLDiag = checkRecvLDiag( kx, comReqss[offset],
+                                                          grid, SCT);
+                        if (recvLDiag)
+                        {
+#if 0
+                            sUPanelTrSolve( kx, ldt, dFBufs[offset], scuBufs, packLUInfo,
+                                            grid, LUstruct, stat, SCT);
+#else
+                            zUPanelTrSolve( kx, dFBufs[offset]->BlockLFactor,
+                                            scuBufs->bigV,
+                                            ldt, packLUInfo->Ublock_info,
+                                            grid, LUstruct, stat, SCT);
+#endif
+                            factored_U[kx] = 1;
+                            /* check if a U_Ibcast is possible */
+
+                            if (IbcastPanel_U[kx] == 0 &&
+                                    k0x - k0 < numLA + 1  &&  // is within lookahead window
+                                    factored_U[kx])
+                            {
+                                int_t offset = k0x % numLA;
+#if 0
+                                sIBcastRecvUPanel( kx, comReqss[offset],
+                                                   LUvsbs[offset],
+                                                   msgss[offset], factStat,
+                                                   grid, LUstruct, SCT, tag_ub);
+#else
+                                zIBcastRecvUPanel(kx, kx, msgss[offset]->msgcnt,
+                                                  comReqss[offset]->send_requ,
+                                                  comReqss[offset]->recv_requ,
+                                                  LUvsbs[offset]->Usub_buf,
+                                                  LUvsbs[offset]->Uval_buf,
+                                                  grid, LUstruct, SCT, tag_ub);
+#endif
+                                IbcastPanel_U[kx] = 1; /*will be used later*/
+                            }
+                        }
+                    }
+                }
+
+            }
+        } /* for main loop (int_t k0 = k_st; k0 < k_end; ++k0) */
+
+    }
+    return 0;
+} /* zsparseTreeFactor_ASYNC */
diff --git a/SRC/ztrfAux.c b/SRC/ztrfAux.c
new file mode 100644
index 00000000..d7211f30
--- /dev/null
+++ b/SRC/ztrfAux.c
@@ -0,0 +1,650 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/ + +#include "superlu_zdefs.h" + +#if 0 +#include "pdgstrf3d.h" +#include "trfAux.h" +#endif + +/*init3DLUstruct with forest interface */ +void zinit3DLUstructForest( int_t* myTreeIdxs, int_t* myZeroTrIdxs, + sForest_t** sForests, LUstruct_t* LUstruct, + gridinfo3d_t* grid3d) +{ + int_t maxLvl = log2i(grid3d->zscp.Np) + 1; + int_t numForests = (1 << maxLvl) - 1; + int_t* gNodeCount = INT_T_ALLOC (numForests); + int_t** gNodeLists = (int_t**) SUPERLU_MALLOC(numForests * sizeof(int_t*)); + + for (int i = 0; i < numForests; ++i) + { + gNodeCount[i] = 0; + gNodeLists[i] = NULL; + /* code */ + if (sForests[i]) + { + gNodeCount[i] = sForests[i]->nNodes; + gNodeLists[i] = sForests[i]->nodeList; + } + } + + /*call the old forest*/ + zinit3DLUstruct( myTreeIdxs, myZeroTrIdxs, + gNodeCount, gNodeLists, LUstruct, grid3d); + + SUPERLU_FREE(gNodeCount); + SUPERLU_FREE(gNodeLists); +} + +int_t zSchurComplementSetup( + int_t k, + int *msgcnt, + Ublock_info_t* Ublock_info, + Remain_info_t* Remain_info, + uPanelInfo_t *uPanelInfo, + lPanelInfo_t *lPanelInfo, + int_t* iperm_c_supno, + int_t * iperm_u, + int_t * perm_u, + doublecomplex *bigU, + int_t* Lsub_buf, + doublecomplex *Lval_buf, + int_t* Usub_buf, + doublecomplex *Uval_buf, + gridinfo_t *grid, + LUstruct_t *LUstruct +) +{ + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + LocalLU_t *Llu = LUstruct->Llu; + int_t* xsup = Glu_persist->xsup; + + int_t* ToRecv = Llu->ToRecv; + int_t iam = grid->iam; + + int_t myrow = MYROW (iam, grid); + int_t mycol = MYCOL (iam, grid); + + int_t krow = PROW (k, grid); + int_t kcol = PCOL (k, grid); + int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + + int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + doublecomplex** Unzval_br_ptr = Llu->Unzval_br_ptr; + + int_t *usub; + doublecomplex* uval; + int_t* lsub; + doublecomplex* lusup; + + if (mycol == kcol) + { + /*send the L panel to myrow*/ + int_t lk = LBj (k, grid); /* Local block number. */ + lsub = Lrowind_bc_ptr[lk]; + lPanelInfo->lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + lPanelInfo->lusup = Lnzval_bc_ptr[lk]; + } + else + { + lsub = Lsub_buf; + lPanelInfo->lsub = Lsub_buf; + lusup = Lval_buf; + lPanelInfo->lusup = Lval_buf; + } + + if (myrow == krow) + { + int_t lk = LBi (k, grid); + usub = Ufstnz_br_ptr[lk]; + uval = Unzval_br_ptr[lk]; + uPanelInfo->usub = usub; + } + else + { + if (ToRecv[k] == 2) + { + usub = Usub_buf; + uval = Uval_buf; + uPanelInfo->usub = usub; + } + } + + /*now each procs does the schurcomplement update*/ + int_t msg0 = msgcnt[0]; + int_t msg2 = msgcnt[2]; + int_t knsupc = SuperSize (k); + + int_t lptr0, luptr0; + int_t LU_nonempty = msg0 && msg2; + if (LU_nonempty == 0) return 0; + if (msg0 && msg2) /* L(:,k) and U(k,:) are not empty. */ + { + lPanelInfo->nsupr = lsub[1]; + int_t nlb; + if (myrow == krow) /* Skip diagonal block L(k,k). */ + { + lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER + 1]; + luptr0 = knsupc; + nlb = lsub[0] - 1; + lPanelInfo->nlb = nlb; + } + else + { + lptr0 = BC_HEADER; + luptr0 = 0; + nlb = lsub[0]; + lPanelInfo->nlb = nlb; + } + int_t iukp = BR_HEADER; /* Skip header; Pointer to index[] of U(k,:) */ + int_t rukp = 0; /* Pointer to nzval[] of U(k,:) */ + int_t nub = usub[0]; /* Number of blocks in the block row U(k,:) */ + int_t klst = FstBlockC (k + 1); + uPanelInfo->klst = klst; + + /* -------------------------------------------------------------- + Update the look-ahead block columns A(:,k+1:k+num_look_ahead). 
+ -------------------------------------------------------------- */ + int_t iukp0 = iukp; + int_t rukp0 = rukp; + + /* reorder the remaining columns in bottom-up */ + for (int_t jj = 0; jj < nub; jj++) + { +#ifdef ISORT + iperm_u[jj] = iperm_c_supno[usub[iukp]]; /* Global block number of block U(k,j). */ + perm_u[jj] = jj; +#else + perm_u[2 * jj] = iperm_c_supno[usub[iukp]]; /* Global block number of block U(k,j). */ + perm_u[2 * jj + 1] = jj; +#endif + int_t jb = usub[iukp]; /* Global block number of block U(k,j). */ + int_t nsupc = SuperSize (jb); + iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */ + iukp += nsupc; + } + iukp = iukp0; +#ifdef ISORT + isort (nub, iperm_u, perm_u); +#else + qsort (perm_u, (size_t) nub, 2 * sizeof (int_t), + &superlu_sort_perm); +#endif + // j = jj0 = 0; + + int_t ldu = 0; + int_t full = 1; + int_t num_u_blks = 0; + + for (int_t j = 0; j < nub ; ++j) + { + int_t iukp, temp_ncols; + + temp_ncols = 0; + int_t rukp, jb, ljb, nsupc, segsize; + arrive_at_ublock( + j, &iukp, &rukp, &jb, &ljb, &nsupc, + iukp0, rukp0, usub, perm_u, xsup, grid + ); + + int_t jj = iukp; + for (; jj < iukp + nsupc; ++jj) + { + segsize = klst - usub[jj]; + if ( segsize ) ++temp_ncols; + } + Ublock_info[num_u_blks].iukp = iukp; + Ublock_info[num_u_blks].rukp = rukp; + Ublock_info[num_u_blks].jb = jb; + Ublock_info[num_u_blks].eo = iperm_c_supno[jb]; + /* Prepare to call DGEMM. */ + jj = iukp; + + for (; jj < iukp + nsupc; ++jj) + { + segsize = klst - usub[jj]; + if ( segsize ) + { + if ( segsize != ldu ) full = 0; + if ( segsize > ldu ) ldu = segsize; + } + } + + Ublock_info[num_u_blks].ncols = temp_ncols; + // ncols += temp_ncols; + num_u_blks++; + + } + + uPanelInfo->ldu = ldu; + uPanelInfo->nub = num_u_blks; + + Ublock_info[0].full_u_cols = Ublock_info[0 ].ncols; + Ublock_info[0].StCol = 0; + for ( int_t j = 1; j < num_u_blks; ++j) + { + Ublock_info[j].full_u_cols = Ublock_info[j ].ncols + Ublock_info[j - 1].full_u_cols; + Ublock_info[j].StCol = Ublock_info[j - 1].StCol + Ublock_info[j - 1].ncols; + } + + zgather_u(num_u_blks, Ublock_info, usub, uval, bigU, ldu, xsup, klst ); + + sort_U_info_elm(Ublock_info, num_u_blks ); + + int_t cum_nrow = 0; + int_t RemainBlk = 0; + + int_t lptr = lptr0; + int_t luptr = luptr0; + for (int_t i = 0; i < nlb; ++i) + { + int_t ib = lsub[lptr]; /* Row block L(i,k). */ + int_t temp_nbrow = lsub[lptr + 1]; /* Number of full rows. */ + + Remain_info[RemainBlk].nrows = temp_nbrow; + Remain_info[RemainBlk].StRow = cum_nrow; + Remain_info[RemainBlk].FullRow = cum_nrow; + Remain_info[RemainBlk].lptr = lptr; + Remain_info[RemainBlk].ib = ib; + Remain_info[RemainBlk].eo = iperm_c_supno[ib]; + RemainBlk++; + + cum_nrow += temp_nbrow; + lptr += LB_DESCRIPTOR; /* Skip descriptor. */ + lptr += temp_nbrow; + luptr += temp_nbrow; + } + + lptr = lptr0; + luptr = luptr0; + sort_R_info_elm( Remain_info, lPanelInfo->nlb ); + lPanelInfo->luptr0 = luptr0; + } + return LU_nonempty; +} /* zSchurComplementSetup */ + +/* + * Gather L and U panels into respective buffers, to prepare for GEMM call. + * Divide Schur complement update into two parts: CPU vs. GPU. 
+ */ +int_t zSchurComplementSetupGPU( + int_t k, msgs_t* msgs, + packLUInfo_t* packLUInfo, + int_t* myIperm, + int_t* iperm_c_supno, int_t*perm_c_supno, + gEtreeInfo_t* gEtreeInfo, factNodelists_t* fNlists, + scuBufs_t* scuBufs, LUValSubBuf_t* LUvsb, + gridinfo_t *grid, LUstruct_t *LUstruct, + HyP_t* HyP) +{ + int_t * Lsub_buf = LUvsb->Lsub_buf; + doublecomplex * Lval_buf = LUvsb->Lval_buf; + int_t * Usub_buf = LUvsb->Usub_buf; + doublecomplex * Uval_buf = LUvsb->Uval_buf; + uPanelInfo_t* uPanelInfo = packLUInfo->uPanelInfo; + lPanelInfo_t* lPanelInfo = packLUInfo->lPanelInfo; + int* msgcnt = msgs->msgcnt; + int_t* iperm_u = fNlists->iperm_u; + int_t* perm_u = fNlists->perm_u; + doublecomplex* bigU = scuBufs->bigU; + + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + LocalLU_t *Llu = LUstruct->Llu; + int_t* xsup = Glu_persist->xsup; + + int_t* ToRecv = Llu->ToRecv; + int_t iam = grid->iam; + + int_t myrow = MYROW (iam, grid); + int_t mycol = MYCOL (iam, grid); + + int_t krow = PROW (k, grid); + int_t kcol = PCOL (k, grid); + int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + + int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + doublecomplex** Unzval_br_ptr = Llu->Unzval_br_ptr; + + int_t *usub; + doublecomplex* uval; + int_t* lsub; + doublecomplex* lusup; + + HyP->lookAheadBlk = 0, HyP->RemainBlk = 0; + HyP->Lnbrow =0, HyP->Rnbrow=0; + HyP->num_u_blks_Phi=0; + HyP->num_u_blks=0; + + if (mycol == kcol) + { + /*send the L panel to myrow*/ + int_t lk = LBj (k, grid); /* Local block number. */ + lsub = Lrowind_bc_ptr[lk]; + lPanelInfo->lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + lPanelInfo->lusup = Lnzval_bc_ptr[lk]; + } + else + { + lsub = Lsub_buf; + lPanelInfo->lsub = Lsub_buf; + lusup = Lval_buf; + lPanelInfo->lusup = Lval_buf; + } + if (myrow == krow) + { + int_t lk = LBi (k, grid); + usub = Ufstnz_br_ptr[lk]; + uval = Unzval_br_ptr[lk]; + uPanelInfo->usub = usub; + } + else + { + if (ToRecv[k] == 2) + { + usub = Usub_buf; + uval = Uval_buf; + uPanelInfo->usub = usub; + } + } + + /*now each procs does the schurcomplement update*/ + int_t msg0 = msgcnt[0]; + int_t msg2 = msgcnt[2]; + int_t knsupc = SuperSize (k); + + int_t lptr0, luptr0; + int_t LU_nonempty = msg0 && msg2; + if (LU_nonempty == 0) return 0; + if (msg0 && msg2) /* L(:,k) and U(k,:) are not empty. */ + { + lPanelInfo->nsupr = lsub[1]; + int_t nlb; + if (myrow == krow) /* Skip diagonal block L(k,k). */ + { + lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER + 1]; + luptr0 = knsupc; + nlb = lsub[0] - 1; + lPanelInfo->nlb = nlb; + } + else + { + lptr0 = BC_HEADER; + luptr0 = 0; + nlb = lsub[0]; + lPanelInfo->nlb = nlb; + } + int_t iukp = BR_HEADER; /* Skip header; Pointer to index[] of U(k,:) */ + + int_t nub = usub[0]; /* Number of blocks in the block row U(k,:) */ + int_t klst = FstBlockC (k + 1); + uPanelInfo->klst = klst; + + /* -------------------------------------------------------------- + Update the look-ahead block columns A(:,k+1:k+num_look_ahead). + -------------------------------------------------------------- */ + int_t iukp0 = iukp; + + /* reorder the remaining columns in bottom-up */ + for (int_t jj = 0; jj < nub; jj++) + { +#ifdef ISORT + iperm_u[jj] = iperm_c_supno[usub[iukp]]; /* Global block number of block U(k,j). */ + perm_u[jj] = jj; +#else + perm_u[2 * jj] = iperm_c_supno[usub[iukp]]; /* Global block number of block U(k,j). */ + perm_u[2 * jj + 1] = jj; +#endif + int_t jb = usub[iukp]; /* Global block number of block U(k,j). 
*/
+        int_t nsupc = SuperSize (jb);
+        iukp += UB_DESCRIPTOR;  /* Start fstnz of block U(k,j). */
+        iukp += nsupc;
+    }
+    iukp = iukp0;
+#ifdef ISORT
+    isort (nub, iperm_u, perm_u);
+#else
+    qsort (perm_u, (size_t) nub, 2 * sizeof (int_t),
+           &superlu_sort_perm);
+#endif
+    HyP->Lnbrow = 0;
+    HyP->Rnbrow = 0;
+    HyP->num_u_blks_Phi = 0;
+    HyP->num_u_blks = 0;
+
+    zRgather_L(k, lsub, lusup, gEtreeInfo, Glu_persist, grid, HyP, myIperm, iperm_c_supno);
+    if (HyP->Lnbrow + HyP->Rnbrow > 0)
+    {
+        zRgather_U( k, 0, usub, uval, bigU, gEtreeInfo, Glu_persist, grid, HyP, myIperm, iperm_c_supno, perm_u);
+    } /* if (nbrow > 0) */
+
+    }
+
+    return LU_nonempty;
+} /* zSchurComplementSetupGPU */
+
+
+doublecomplex* zgetBigV(int_t ldt, int_t num_threads)
+{
+    doublecomplex *bigV;
+    if (!(bigV = doublecomplexMalloc_dist (8 * ldt * ldt * num_threads)))
+        ABORT ("Malloc failed for zgemm buffV");
+    return bigV;
+}
+
+doublecomplex* zgetBigU(int_t nsupers, gridinfo_t *grid,
+                        LUstruct_t *LUstruct)
+{
+    int_t Pr = grid->nprow;
+    int_t Pc = grid->npcol;
+    int_t iam = grid->iam;
+    int_t mycol = MYCOL (iam, grid);
+
+    /* The following loop finds the maximum block row size. */
+    int local_max_row_size = 0;
+    int max_row_size;
+
+    for (int_t i = 0; i < nsupers; ++i)
+    {
+        int_t tpc = PCOL (i, grid);
+        if (mycol == tpc)
+        {
+            int_t lk = LBj (i, grid);
+            int_t* lsub = LUstruct->Llu->Lrowind_bc_ptr[lk];
+            if (lsub != NULL)
+            {
+                local_max_row_size = SUPERLU_MAX (local_max_row_size, lsub[1]);
+            }
+        }
+
+    }
+
+    /* Max row size is a global reduction within a process row. */
+    MPI_Allreduce (&local_max_row_size, &max_row_size, 1, MPI_INT, MPI_MAX,
+                   (grid->rscp.comm));
+
+    // int_t Threads_per_process = get_thread_per_process ();
+
+    /* Buffer size is the max over the look-ahead window. */
+
+    int_t bigu_size =
+        8 * sp_ienv_dist (3) * (max_row_size) * SUPERLU_MAX(Pr / Pc, 1);
+    //Sherry: 8 * sp_ienv_dist (3) * (max_row_size) * MY_MAX(Pr / Pc, 1);
+
+    // printf("Size of big U is %d\n",bigu_size );
+    doublecomplex* bigU = doublecomplexMalloc_dist(bigu_size);
+
+    return bigU;
+} /* zgetBigU */
+
+trf3Dpartition_t* zinitTrf3Dpartition(int_t nsupers,
+                                      superlu_dist_options_t *options,
+                                      LUstruct_t *LUstruct, gridinfo3d_t * grid3d
+                                     )
+{
+    gridinfo_t* grid = &(grid3d->grid2d);
+
+#if ( DEBUGlevel>=1 )
+    int iam = grid3d->iam;
+    CHECK_MALLOC (iam, "Enter initTrf3Dpartition()");
+#endif
+    int_t* perm_c_supno = getPerm_c_supno(nsupers, options, LUstruct, grid);
+    int_t* iperm_c_supno = getFactIperm(perm_c_supno, nsupers);
+
+    // calculating tree factorization
+    int_t *setree = supernodal_etree(nsupers, LUstruct->etree, LUstruct->Glu_persist->supno, LUstruct->Glu_persist->xsup);
+    treeList_t* treeList = setree2list(nsupers, setree );
+
+    /* update treeList with weight and depth */
+    getSCUweight(nsupers, treeList, LUstruct, grid3d);
+
+    calcTreeWeight(nsupers, setree, treeList, LUstruct->Glu_persist->xsup);
+
+    gEtreeInfo_t gEtreeInfo;
+    gEtreeInfo.setree = setree;
+    gEtreeInfo.numChildLeft = (int_t* ) SUPERLU_MALLOC(sizeof(int_t) * nsupers);
+    for (int_t i = 0; i < nsupers; ++i)
+    {
+        /* code */
+        gEtreeInfo.numChildLeft[i] = treeList[i].numChild;
+    }
+
+    int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
+    sForest_t**  sForests = getForests( maxLvl, nsupers, setree, treeList);
+    /* indexes of the trees for my process grid in gNodeList, size(maxLvl) */
+    int_t* myTreeIdxs = getGridTrees(grid3d);
+    int_t* myZeroTrIdxs = getReplicatedTrees(grid3d);
+    int_t* gNodeCount = getNodeCountsFr(maxLvl, sForests);
+    int_t** gNodeLists = getNodeListFr(maxLvl, sForests);
+
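+    /* Distribute the supernodes of the LU structure over the process grids
+       according to this forest partition (gNodeCount/gNodeLists are the
+       per-forest node lists computed above). */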
zinit3DLUstructForest(myTreeIdxs, myZeroTrIdxs, + sForests, LUstruct, grid3d); + int_t* myNodeCount = getMyNodeCountsFr(maxLvl, myTreeIdxs, sForests); + int_t** treePerm = getTreePermFr( myTreeIdxs, sForests, grid3d); + + LUValSubBuf_t *LUvsb = SUPERLU_MALLOC(sizeof(LUValSubBuf_t)); + zLluBufInit(LUvsb, LUstruct); + + int_t* supernode2treeMap = SUPERLU_MALLOC(nsupers*sizeof(int_t)); + int_t numForests = (1 << maxLvl) - 1; + for (int_t Fr = 0; Fr < numForests; ++Fr) + { + /* code */ + for (int_t nd = 0; nd < gNodeCount[Fr]; ++nd) + { + /* code */ + supernode2treeMap[gNodeLists[Fr][nd]]=Fr; + } + } + + trf3Dpartition_t* trf3Dpartition = SUPERLU_MALLOC(sizeof(trf3Dpartition_t)); + + trf3Dpartition->gEtreeInfo = gEtreeInfo; + trf3Dpartition->iperm_c_supno = iperm_c_supno; + trf3Dpartition->myNodeCount = myNodeCount; + trf3Dpartition->myTreeIdxs = myTreeIdxs; + trf3Dpartition->myZeroTrIdxs = myZeroTrIdxs; + trf3Dpartition->sForests = sForests; + trf3Dpartition->treePerm = treePerm; + trf3Dpartition->LUvsb = LUvsb; + trf3Dpartition->supernode2treeMap = supernode2treeMap; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit initTrf3Dpartition()"); +#endif + return trf3Dpartition; +} /* zinitTrf3Dpartition */ + + +#if 0 //**** Sherry: following two routines are old, the new ones are in util.c +int_t num_full_cols_U(int_t kk, int_t **Ufstnz_br_ptr, int_t *xsup, + gridinfo_t *grid, int_t *perm_u) +{ + int_t lk = LBi (kk, grid); + int_t *usub = Ufstnz_br_ptr[lk]; + + if (usub == NULL) + { + /* code */ + return 0; + } + int_t iukp = BR_HEADER; /* Skip header; Pointer to index[] of U(k,:) */ + int_t rukp = 0; /* Pointer to nzval[] of U(k,:) */ + int_t nub = usub[0]; /* Number of blocks in the block row U(k,:) */ + + int_t klst = FstBlockC (kk + 1); + int_t iukp0 = iukp; + int_t rukp0 = rukp; + int_t jb, ljb; + int_t nsupc; + int_t temp_ncols = 0; + int_t segsize; + + temp_ncols = 0; + + for (int_t j = 0; j < nub; ++j) + { + arrive_at_ublock( + j, &iukp, &rukp, &jb, &ljb, &nsupc, + iukp0, rukp0, usub, perm_u, xsup, grid + ); + + for (int_t jj = iukp; jj < iukp + nsupc; ++jj) + { + segsize = klst - usub[jj]; + if ( segsize ) ++temp_ncols; + } + } + return temp_ncols; +} + +// Sherry: this is old; new version is in util.c +int_t estimate_bigu_size( int_t nsupers, int_t ldt, int_t**Ufstnz_br_ptr, + Glu_persist_t *Glu_persist, gridinfo_t* grid, int_t* perm_u) +{ + + int_t iam = grid->iam; + + int_t Pr = grid->nprow; + int_t myrow = MYROW (iam, grid); + + int_t* xsup = Glu_persist->xsup; + + int ncols = 0; + int_t ldu = 0; + + /*initilize perm_u*/ + for (int i = 0; i < nsupers; ++i) + { + perm_u[i] = i; + } + + for (int lk = myrow; lk < nsupers; lk += Pr ) + { + ncols = SUPERLU_MAX(ncols, num_full_cols_U(lk, Ufstnz_br_ptr, + xsup, grid, perm_u, &ldu)); + } + + int_t max_ncols = 0; + + MPI_Allreduce(&ncols, &max_ncols, 1, mpi_int_t, MPI_MAX, grid->cscp.comm); + + printf("max_ncols =%d, bigu_size=%ld\n", (int) max_ncols, (long long) ldt * max_ncols); + return ldt * max_ncols; +} /* old estimate_bigu_size. New one is in util.c */ +#endif /**** end old ones ****/ + + diff --git a/SRC/ztrfCommWrapper.c b/SRC/ztrfCommWrapper.c new file mode 100644 index 00000000..9baeba42 --- /dev/null +++ b/SRC/ztrfCommWrapper.c @@ -0,0 +1,519 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+#include "superlu_zdefs.h"
+
+#if 0
+#include "pdgstrf3d.h"
+#include "trfCommWrapper.h"
+#endif
+
+#ifdef __INTEL_COMPILER
+#include "mkl.h"
+#else
+#include "cblas.h"
+#endif
+
+int_t zDiagFactIBCast(int_t k,  int_t k0,      // supernode to be factored
+                      doublecomplex *BlockUFactor,
+                      doublecomplex *BlockLFactor,
+                      int_t* IrecvPlcd_D,
+                      MPI_Request *U_diag_blk_recv_req,
+                      MPI_Request *L_diag_blk_recv_req,
+                      MPI_Request *U_diag_blk_send_req,
+                      MPI_Request *L_diag_blk_send_req,
+                      gridinfo_t *grid,
+                      superlu_dist_options_t *options,
+                      double thresh,
+                      LUstruct_t *LUstruct,
+                      SuperLUStat_t *stat, int *info,
+                      SCT_t *SCT,
+                      int tag_ub
+                     )
+{
+    // unpacking variables
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+
+    int_t iam = grid->iam;
+    int_t Pc = grid->npcol;
+    int_t Pr = grid->nprow;
+    int_t myrow = MYROW (iam, grid);
+    int_t mycol = MYCOL (iam, grid);
+    int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
+    int_t krow = PROW (k, grid);
+    int_t kcol = PCOL (k, grid);
+
+    //xsup for supersize
+
+    /* Place Irecvs first */
+    // if (IrecvPlcd_D[k] == 0 )
+    // {
+    int_t nsupc = SuperSize (k);
+    if (mycol == kcol && iam != pkk)
+    {
+        zIRecv_UDiagBlock(k0, BlockUFactor,  /*pointer for the diagonal block*/
+                          nsupc * nsupc, krow,
+                          U_diag_blk_recv_req, grid, SCT, tag_ub);
+    }
+
+    if (myrow == krow && iam != pkk)
+    {
+        zIRecv_LDiagBlock(k0, BlockLFactor,  /*pointer for the diagonal block*/
+                          nsupc * nsupc, kcol,
+                          L_diag_blk_recv_req, grid, SCT, tag_ub);
+    }
+    IrecvPlcd_D[k] = 1;
+    // }
+
+    /* DiagFact and send */
+    // if ( factored_D[k] == 0 )
+    // {
+
+    // int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
+    // int_t krow = PROW (k, grid);
+    // int_t kcol = PCOL (k, grid);
+    /* factorize the leaf node and broadcast it to the
+       process row and the process column */
+    if (iam == pkk)
+    {
+        // printf("Entering factorization %d\n", k);
+        // int_t offset = (k0 - k_st); // offset is input
+        /*factorize A[kk]*/
+        Local_Zgstrf2(options, k, thresh,
+                      BlockUFactor, /*factored U is overwritten here*/
+                      Glu_persist, grid, Llu, stat, info, SCT);
+
+        /*Pack L[kk] into blockLfactor*/
+        zPackLBlock(k, BlockLFactor, Glu_persist, grid, Llu);
+
+        /*Isend U blocks to the process column*/
+        int_t nsupc = SuperSize(k);
+        zISend_UDiagBlock(k0, BlockLFactor,
+                          nsupc * nsupc, U_diag_blk_send_req , grid, tag_ub);
+
+        /*Isend L blocks to the process row*/
+        zISend_LDiagBlock(k0, BlockLFactor,
+                          nsupc * nsupc, L_diag_blk_send_req, grid, tag_ub);
+        SCT->commVolFactor += 1.0 * nsupc * nsupc * (Pr + Pc);
+    }
+    // }
+    return 0;
+}
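+/* The panel solves below consume these diagonal blocks: the column of
+   processes owning L(:,k) needs U(k,k) for the L-panel solve, and the row
+   owning U(k,:) needs L(k,k) for the U-panel solve; both blocks were
+   shipped by zDiagFactIBCast above. */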
+int_t zLPanelTrSolve( int_t k,   int_t* factored_L,
+                      doublecomplex* BlockUFactor,
+                      gridinfo_t *grid,
+                      LUstruct_t *LUstruct)
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+
+    int_t iam = grid->iam;
+
+    int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
+    int_t kcol = PCOL (k, grid);
+    int_t mycol = MYCOL (iam, grid);
+    int_t nsupc = SuperSize(k);
+
+    /* factor the L panel */
+    if (mycol == kcol  && iam != pkk)
+    {
+        // factored_L[k] = 1;
+        int_t lk = LBj (k, grid);
+        doublecomplex *lusup = Llu->Lnzval_bc_ptr[lk];
+        int_t nsupr;
+        if (Llu->Lrowind_bc_ptr[lk])
+            nsupr = Llu->Lrowind_bc_ptr[lk][1];
+        else
+            nsupr = 0;
+        /*wait for communication to finish*/
+
+        // Wait_UDiagBlock_Recv( U_diag_blk_recv_req, SCT);
+        // int_t flag = 0;
+        // while (flag == 0)
+        // {
+        //     flag = Test_UDiagBlock_Recv( U_diag_blk_recv_req, SCT);
+        // }
+
+        int_t l = nsupr;
+        doublecomplex* ublk_ptr = BlockUFactor;
+        int_t ld_ujrow = nsupc;
+
+        // unsigned long long t1 = _rdtsc();
+
+        doublecomplex alpha = {1.0, 0.0};  /* complex one for ztrsm */
+        // #pragma omp for schedule(dynamic) nowait
+#define BL  32
+        for (int i = 0; i < CEILING(l, BL); ++i)
+        {
+            #pragma omp task
+            {
+                int_t off = i * BL;
+                // Sherry: int_t len = MY_MIN(BL, l - i * BL);
+                int_t len = SUPERLU_MIN(BL, l - i * BL);
+                cblas_ztrsm (CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit,
+                             len, nsupc, &alpha, ublk_ptr, ld_ujrow, &lusup[off], nsupr);
+            }
+        }
+    }
+
+    if (iam == pkk)
+    {
+        /* if (factored_L[k] == 0)
+        { */
+        /* code */
+        factored_L[k] = 1;
+        int_t lk = LBj (k, grid);
+        doublecomplex *lusup = Llu->Lnzval_bc_ptr[lk];
+        int_t nsupr;
+        if (Llu->Lrowind_bc_ptr[lk])
+            nsupr = Llu->Lrowind_bc_ptr[lk][1];
+        else
+            nsupr = 0;
+
+        /*factorize A[kk]*/
+
+        int_t l = nsupr - nsupc;
+
+        doublecomplex* ublk_ptr = BlockUFactor;
+        int_t ld_ujrow = nsupc;
+        // printf("%d: L update \n",k );
+
+        doublecomplex alpha = {1.0, 0.0};  /* complex one for ztrsm */
+#define BL  32
+        // #pragma omp parallel for
+        for (int i = 0; i < CEILING(l, BL); ++i)
+        {
+            int_t off = i * BL;
+            // Sherry: int_t len = MY_MIN(BL, l - i * BL);
+            int_t len = SUPERLU_MIN(BL, (l - i * BL));
+            #pragma omp task
+            {
+                cblas_ztrsm (CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit,
+                             len, nsupc, &alpha, ublk_ptr, ld_ujrow, &lusup[nsupc + off], nsupr);
+            }
+        }
+    }
+
+    return 0;
+} /* zLPanelTrSolve */
+
+int_t zLPanelUpdate( int_t k,  int_t* IrecvPlcd_D, int_t* factored_L,
+                     MPI_Request * U_diag_blk_recv_req,
+                     doublecomplex* BlockUFactor,
+                     gridinfo_t *grid,
+                     LUstruct_t *LUstruct, SCT_t *SCT)
+{
+
+    zUDiagBlockRecvWait( k, IrecvPlcd_D, factored_L,
+                         U_diag_blk_recv_req, grid, LUstruct, SCT);
+
+    zLPanelTrSolve( k, factored_L, BlockUFactor, grid, LUstruct );
+
+    return 0;
+} /* zLPanelUpdate */
+
+#define BL  32
+
+int_t zUPanelTrSolve( int_t k,
+                      doublecomplex* BlockLFactor,
+                      doublecomplex* bigV,
+                      int_t ldt,
+                      Ublock_info_t* Ublock_info,
+                      gridinfo_t *grid,
+                      LUstruct_t *LUstruct,
+                      SuperLUStat_t *stat, SCT_t *SCT)
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int_t iam = grid->iam;
+    int_t myrow = MYROW (iam, grid);
+    int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
+    int_t krow = PROW (k, grid);
+    int_t nsupc = SuperSize(k);
+
+    /* factor the U panel */
+    if (myrow == krow  && iam != pkk)
+    {
+        int_t lk = LBi (k, grid);  /* Local block number */
+        if (!Llu->Unzval_br_ptr[lk])
+            return 0;
+        /* Initialization. */
+        int_t klst = FstBlockC (k + 1);
+
+        int_t *usub = Llu->Ufstnz_br_ptr[lk];  /* index[] of block row U(k,:) */
+        doublecomplex *uval = Llu->Unzval_br_ptr[lk];
+        int_t nb = usub[0];
+
+        // int_t nsupr = Lsub_buf[1]; /* LDA of lusup[] */
+        doublecomplex *lusup = BlockLFactor;
+
+        /* Loop through all the row blocks, to get iukp and rukp. */
+        Trs2_InitUblock_info(klst, nb, Ublock_info, usub, Glu_persist, stat );
+
+        /* Loop through all the row blocks. */
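+        /* Trs2_GatherTrsmScatter gathers the nonzero segments of U(k,j)
+           into the dense scratch tempv, solves with the L(k,k) factor, and
+           scatters the result back into uval; the row blocks are mutually
+           independent, hence one OpenMP task per block below. */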
*/ + // #pragma omp for schedule(dynamic,2) nowait + for (int_t b = 0; b < nb; ++b) + { + #pragma omp task + { + int_t thread_id = omp_get_thread_num(); + doublecomplex *tempv = bigV + thread_id * ldt * ldt; + Trs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp, + usub, uval, tempv, nsupc, nsupc, lusup, Glu_persist); + } + } + } + + /*factor the U panel*/ + if (iam == pkk) + { + /* code */ + // factored_U[k] = 1; + int_t *Lsub_buf; + doublecomplex *Lval_buf; + int_t lk = LBj (k, grid); + Lsub_buf = Llu->Lrowind_bc_ptr[lk]; + Lval_buf = Llu->Lnzval_bc_ptr[lk]; + + + /* calculate U panel */ + // PDGSTRS2 (n, k0, k, Lsub_buf, Lval_buf, Glu_persist, grid, Llu, + // stat, HyP->Ublock_info, bigV, ldt, SCT); + + lk = LBi (k, grid); /* Local block number */ + if (Llu->Unzval_br_ptr[lk]) + { + /* Initialization. */ + int_t klst = FstBlockC (k + 1); + + int_t *usub = Llu->Ufstnz_br_ptr[lk]; /* index[] of block row U(k,:) */ + doublecomplex *uval = Llu->Unzval_br_ptr[lk]; + int_t nb = usub[0]; + + // int_t nsupr = Lsub_buf[1]; /* LDA of lusup[] */ + int_t nsupr = Lsub_buf[1]; /* LDA of lusup[] */ + doublecomplex *lusup = Lval_buf; + + /* Loop through all the row blocks. to get the iukp and rukp*/ + Trs2_InitUblock_info(klst, nb, Ublock_info, usub, Glu_persist, stat ); + + /* Loop through all the row blocks. */ + // printf("%d :U update \n", k); + for (int_t b = 0; b < nb; ++b) + { + #pragma omp task + { + int_t thread_id = omp_get_thread_num(); + doublecomplex *tempv = bigV + thread_id * ldt * ldt; + Trs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp, + usub, uval, tempv, nsupc, nsupr, lusup, Glu_persist); + } + + } + } + } + + return 0; +} /* zUPanelTrSolve */ + +int_t zUPanelUpdate( int_t k, int_t* factored_U, + MPI_Request * L_diag_blk_recv_req, + doublecomplex* BlockLFactor, + doublecomplex* bigV, + int_t ldt, + Ublock_info_t* Ublock_info, + gridinfo_t *grid, + LUstruct_t *LUstruct, + SuperLUStat_t *stat, SCT_t *SCT) +{ + + LDiagBlockRecvWait( k, factored_U, L_diag_blk_recv_req, grid); + + zUPanelTrSolve( k, BlockLFactor, bigV, ldt, Ublock_info, grid, + LUstruct, stat, SCT); + return 0; +} + +int_t zIBcastRecvLPanel( + int_t k, + int_t k0, + int* msgcnt, + MPI_Request *send_req, + MPI_Request *recv_req , + int_t* Lsub_buf, + doublecomplex* Lval_buf, + int_t * factored, + gridinfo_t *grid, + LUstruct_t *LUstruct, + SCT_t *SCT, + int tag_ub +) +{ + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + LocalLU_t *Llu = LUstruct->Llu; + int_t* xsup = Glu_persist->xsup; + int_t** ToSendR = Llu->ToSendR; + int_t* ToRecv = Llu->ToRecv; + int_t iam = grid->iam; + int_t Pc = grid->npcol; + int_t mycol = MYCOL (iam, grid); + int_t kcol = PCOL (k, grid); + int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + /* code */ + if (mycol == kcol) + { + /*send the L panel to myrow*/ + + int_t lk = LBj (k, grid); /* Local block number. */ + int_t* lsub = Lrowind_bc_ptr[lk]; + doublecomplex* lusup = Lnzval_bc_ptr[lk]; + + zIBcast_LPanel (k, k0, lsub, lusup, grid, msgcnt, send_req, + ToSendR, xsup, tag_ub); + + if (lsub) + { + int_t nrbl = lsub[0]; /*number of L blocks */ + int_t len = lsub[1]; /* LDA of the nzval[] */ + int_t len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; + int_t len2 = SuperSize(lk) * len; + SCT->commVolFactor += 1.0 * (Pc - 1) * (len1 * sizeof(int_t) + len2 * sizeof(doublecomplex)); + } + } + else + { + /*receive factored L panels*/ + if (ToRecv[k] >= 1) /* Recv block column L(:,0). 
*/ + { + /*place Irecv*/ + zIrecv_LPanel (k, k0, Lsub_buf, Lval_buf, grid, recv_req, Llu, tag_ub); + } + else + { + msgcnt[0] = 0; + } + + } + factored[k] = 0; + + return 0; +} + +int_t zIBcastRecvUPanel(int_t k, int_t k0, int* msgcnt, + MPI_Request *send_requ, + MPI_Request *recv_requ, + int_t* Usub_buf, doublecomplex* Uval_buf, + gridinfo_t *grid, LUstruct_t *LUstruct, + SCT_t *SCT, int tag_ub) +{ + LocalLU_t *Llu = LUstruct->Llu; + + int_t* ToSendD = Llu->ToSendD; + int_t* ToRecv = Llu->ToRecv; + int_t iam = grid->iam; + int_t Pr = grid->nprow; + int_t myrow = MYROW (iam, grid); + int_t krow = PROW (k, grid); + + int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + doublecomplex** Unzval_br_ptr = Llu->Unzval_br_ptr; + if (myrow == krow) + { + /*send U panel to myrow*/ + int_t lk = LBi (k, grid); + int_t* usub = Ufstnz_br_ptr[lk]; + doublecomplex* uval = Unzval_br_ptr[lk]; + zIBcast_UPanel(k, k0, usub, uval, grid, msgcnt, + send_requ, ToSendD, tag_ub); + if (usub) + { + /* code */ + int_t lenv = usub[1]; + int_t lens = usub[2]; + SCT->commVolFactor += 1.0 * (Pr - 1) * (lens * sizeof(int_t) + lenv * sizeof(doublecomplex)); + } + } + else + { + /*receive U panels */ + if (ToRecv[k] == 2) /* Recv block row U(k,:). */ + { + zIrecv_UPanel (k, k0, Usub_buf, Uval_buf, Llu, grid, recv_requ, tag_ub); + } + else + { + msgcnt[2] = 0; + } + } + + return 0; +} + +int_t zWaitL( int_t k, int* msgcnt, int* msgcntU, + MPI_Request *send_req, MPI_Request *recv_req, + gridinfo_t *grid, LUstruct_t *LUstruct, SCT_t *SCT) +{ + LocalLU_t *Llu = LUstruct->Llu; + int_t** ToSendR = Llu->ToSendR; + int_t* ToRecv = Llu->ToRecv; + int_t iam = grid->iam; + int_t mycol = MYCOL (iam, grid); + int_t kcol = PCOL (k, grid); + if (mycol == kcol) + { + /*send the L panel to myrow*/ + Wait_LSend (k, grid, ToSendR, send_req, SCT); + } + else + { + /*receive factored L panels*/ + if (ToRecv[k] >= 1) /* Recv block column L(:,0). */ + { + /*force wait for I recv to complete*/ + zWait_LRecv( recv_req, msgcnt, msgcntU, grid, SCT); + } + } + + return 0; +} + +int_t zWaitU( int_t k, int* msgcnt, + MPI_Request *send_requ, MPI_Request *recv_requ, + gridinfo_t *grid, LUstruct_t *LUstruct, SCT_t *SCT) +{ + LocalLU_t *Llu = LUstruct->Llu; + + int_t* ToRecv = Llu->ToRecv; + int_t* ToSendD = Llu->ToSendD; + int_t iam = grid->iam; + int_t myrow = MYROW (iam, grid); + int_t krow = PROW (k, grid); + if (myrow == krow) + { + int_t lk = LBi (k, grid); + if (ToSendD[lk] == YES) + Wait_USend(send_requ, grid, SCT); + } + else + { + /*receive U panels */ + if (ToRecv[k] == 2) /* Recv block row U(k,:). */ + { + /*force wait*/ + zWait_URecv( recv_requ, msgcnt, SCT); + } + } + return 0; +} From 1dc2a23bf0cb372d602d224fdf5c3415e9adb34d Mon Sep 17 00:00:00 2001 From: Sherry Li Date: Sat, 4 May 2019 10:41:12 -0700 Subject: [PATCH 004/147] Implemented most of the complex code, except scatter.c, which is not compatible with the existing 2D code. 
--- EXAMPLE/Makefile | 16 +- EXAMPLE/pddrive.c | 2 +- EXAMPLE/pddrive3d.c | 304 +++++++++++++++++++++ EXAMPLE/pzdrive.c | 2 +- SRC/Makefile | 2 +- SRC/dgather.c | 5 +- SRC/dtrfCommWrapper.c | 10 +- SRC/pd3dcomm.c | 19 +- SRC/pdgssvx3d.c | 9 +- SRC/pdgstrf.c | 28 +- SRC/pdgstrf2.c | 202 +++++++------- SRC/pz3dcomm.c | 33 ++- SRC/pzgssvx3d.c | 33 ++- SRC/pzgstrf.c | 37 +-- SRC/pzgstrf2.c | 508 +++++++++++++++++++++++++++++++++- SRC/superlu_ddefs.h | 163 +++-------- SRC/superlu_defs.h | 10 + SRC/superlu_zdefs.h | 577 ++++++++++++++++++++++++++++++++++++++- SRC/treeFactorization.c | 62 +++-- SRC/trfAux.c | 36 ++- SRC/zcommunication_aux.c | 4 +- SRC/zgather.c | 5 +- SRC/ztreeFactorization.c | 16 +- SRC/ztrfAux.c | 4 +- SRC/ztrfCommWrapper.c | 18 +- 25 files changed, 1722 insertions(+), 383 deletions(-) create mode 100644 EXAMPLE/pddrive3d.c diff --git a/EXAMPLE/Makefile b/EXAMPLE/Makefile index ea6035b8..74a3e3c8 100644 --- a/EXAMPLE/Makefile +++ b/EXAMPLE/Makefile @@ -30,7 +30,6 @@ ####################################################################### include ../make.inc -DEXM3D = pddrive3d.o dcreate_matrix.o DEXM = pddrive.o dcreate_matrix.o sp_ienv.o #pdgstrf2.o #pdgssvx.o # pdgstrs_lsum_X1.o pdgstrf_X1.o @@ -38,6 +37,7 @@ DEXM1 = pddrive1.o dcreate_matrix.o DEXM2 = pddrive2.o dcreate_matrix.o dcreate_matrix_perturbed.o DEXM3 = pddrive3.o dcreate_matrix.o DEXM4 = pddrive4.o dcreate_matrix.o +DEXM3D = pddrive3d.o dcreate_matrix.o DEXMG = pddrive_ABglobal.o DEXMG1 = pddrive1_ABglobal.o DEXMG2 = pddrive2_ABglobal.o @@ -49,6 +49,7 @@ ZEXM1 = pzdrive1.o zcreate_matrix.o ZEXM2 = pzdrive2.o zcreate_matrix.o zcreate_matrix_perturbed.o ZEXM3 = pzdrive3.o zcreate_matrix.o ZEXM4 = pzdrive4.o zcreate_matrix.o +ZEXM3D = pzdrive3d.o zcreate_matrix.o ZEXMG = pzdrive_ABglobal.o ZEXMG1 = pzdrive1_ABglobal.o ZEXMG2 = pzdrive2_ABglobal.o @@ -56,19 +57,18 @@ ZEXMG3 = pzdrive3_ABglobal.o ZEXMG4 = pzdrive4_ABglobal.o -all: pddrive3d double complex16 +all: double complex16 double: pddrive pddrive1 pddrive2 pddrive3 pddrive4 \ + pddrive3d \ pddrive_ABglobal pddrive1_ABglobal pddrive2_ABglobal \ pddrive3_ABglobal pddrive4_ABglobal complex16: pzdrive pzdrive1 pzdrive2 pzdrive3 pzdrive4 \ + pzdrive3d \ pzdrive_ABglobal pzdrive1_ABglobal pzdrive2_ABglobal \ pzdrive3_ABglobal pzdrive4_ABglobal -pddrive3d: $(DEXM3D) $(DSUPERLULIB) - $(LOADER) $(LOADOPTS) $(DEXM3D) $(LIBS) -lm -o $@ - pddrive: $(DEXM) $(DSUPERLULIB) $(LOADER) $(LOADOPTS) $(DEXM) $(LIBS) -lm -o $@ @@ -84,6 +84,9 @@ pddrive3: $(DEXM3) $(DSUPERLULIB) pddrive4: $(DEXM4) $(DSUPERLULIB) $(LOADER) $(LOADOPTS) $(DEXM4) $(LIBS) -lm -o $@ +pddrive3d: $(DEXM3D) $(DSUPERLULIB) + $(LOADER) $(LOADOPTS) $(DEXM3D) $(LIBS) -lm -o $@ + pddrive_ABglobal: $(DEXMG) $(DSUPERLULIB) $(LOADER) $(LOADOPTS) $(DEXMG) $(LIBS) -lm -o $@ @@ -117,6 +120,9 @@ pzdrive3: $(ZEXM3) $(DSUPERLULIB) pzdrive4: $(ZEXM4) $(DSUPERLULIB) $(LOADER) $(LOADOPTS) $(ZEXM4) $(LIBS) -lm -o $@ +pzdrive3d: $(ZEXM3D) $(DSUPERLULIB) + $(LOADER) $(LOADOPTS) $(ZEXM3D) $(LIBS) -lm -o $@ + pzdrive_ABglobal: $(ZEXMG) $(DSUPERLULIB) $(LOADER) $(LOADOPTS) $(ZEXMG) $(LIBS) -lm -o $@ diff --git a/EXAMPLE/pddrive.c b/EXAMPLE/pddrive.c index 126c0244..8f88cb36 100644 --- a/EXAMPLE/pddrive.c +++ b/EXAMPLE/pddrive.c @@ -194,7 +194,7 @@ int main(int argc, char *argv[]) options.SolveInitialized = NO; options.RefineInitialized = NO; options.PrintStat = YES; - options.DiagInv = NO; + options.DiagInv = NO; */ set_default_options_dist(&options); #if 0 diff --git a/EXAMPLE/pddrive3d.c b/EXAMPLE/pddrive3d.c new file mode 100644 
index 00000000..0abcb6c4 --- /dev/null +++ b/EXAMPLE/pddrive3d.c @@ -0,0 +1,304 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Driver program for PDGSSVX3D example + * + *
+ * -- Distributed SuperLU routine (version 7.0.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * May 10, 2019
+ *
+ */
+#include "superlu_ddefs.h"  
+
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *
+ * The driver program PDDRIVE3D.
+ *
+ * This example illustrates how to use PDGSSVX3D with the full
+ * (default) options to solve a linear system.
+ *
+ * Five basic steps are required:
+ *   1. Initialize the MPI environment and the SuperLU process grid
+ *   2. Set up the input matrix and the right-hand side
+ *   3. Set the options argument
+ *   4. Call pdgssvx3d
+ *   5. Release the process grid and terminate the MPI environment
+ *
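+ * A minimal sketch of these five steps (a hypothetical call sequence;
+ * the argument lists are assumed to mirror the 2D drivers, and error
+ * handling and deallocation are omitted -- see main() below):
+ *
+ *    superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid);
+ *    dcreate_matrix (&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, &(grid.grid2d));
+ *    m = A.nrow;  n = A.ncol;
+ *    set_default_options_dist (&options);
+ *    ScalePermstructInit (m, n, &ScalePermstruct);
+ *    LUstructInit (n, &LUstruct);
+ *    PStatInit (&stat);
+ *    pdgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
+ *               &LUstruct, &SOLVEstruct, berr, &stat, &info);
+ *    superlu_gridexit3d (&grid);
+ *    MPI_Finalize ();
+ *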
+ * The program may be run by typing
+ *    mpiexec -np <p> pddrive3d -r <proc rows> -c <proc columns> \
+ *                              -d <proc Z-dimension> <input file>
+ *
+ */ + +int +main (int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; + ScalePermstruct_t ScalePermstruct; + LUstruct_t LUstruct; + SOLVEstruct_t SOLVEstruct; + gridinfo3d_t grid; + double *berr; + double *b, *xtrue; + int_t m, n; + int nprow, npcol, npdep; + int iam, info, ldb, ldx, nrhs; + char **cpp, c, *suffix; + FILE *fp, *fopen (); + extern int cpp_defs (); + int ii, omp_mpi_level; + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + npdep = 1; /* replication factor must be power of two */ + nrhs = 1; /* Number of right-hand side. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. + ------------------------------------------------------------ */ + // MPI_Init (&argc, &argv); + int required = MPI_THREAD_MULTIPLE; + int provided; + MPI_Init_thread(&argc, &argv, required, &provided); + if (provided < required) + { + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (!rank) printf("The MPI library doesn't provide MPI_THREAD_MULTIPLE \n"); + } + + /* Parse command line argv[]. */ + for (cpp = argv + 1; *cpp; ++cpp) + { + if (**cpp == '-') + { + c = *(*cpp + 1); + ++cpp; + switch (c) + { + case 'h': + printf ("Options:\n"); + printf ("\t-r : process rows (default %d)\n", nprow); + printf ("\t-c : process columns (default %d)\n", npcol); + printf ("\t-d : process Z-dimension (default %d)\n", npdep); + exit (0); + break; + case 'r': + nprow = atoi (*cpp); + break; + case 'c': + npcol = atoi (*cpp); + break; + case 'd': + npdep = atoi (*cpp); + break; + } + } + else + { /* Last arg is considered a filename */ + if (!(fp = fopen (*cpp, "r"))) + { + ABORT ("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ + superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid); + + if(grid.iam==0) { + MPI_Query_thread(&omp_mpi_level); + switch (omp_mpi_level) { + case MPI_THREAD_SINGLE: + printf("MPI_Query_thread with MPI_THREAD_SINGLE\n"); + fflush(stdout); + break; + case MPI_THREAD_FUNNELED: + printf("MPI_Query_thread with MPI_THREAD_FUNNELED\n"); + fflush(stdout); + break; + case MPI_THREAD_SERIALIZED: + printf("MPI_Query_thread with MPI_THREAD_SERIALIZED\n"); + fflush(stdout); + break; + case MPI_THREAD_MULTIPLE: + printf("MPI_Query_thread with MPI_THREAD_MULTIPLE\n"); + fflush(stdout); + break; + } + } + + /* Bail out if I do not belong in the grid. */ + iam = grid.iam; + if (iam >= nprow * npcol *npdep) + goto out; + if (!iam) { + int v_major, v_minor, v_bugfix; +#ifdef __INTEL_COMPILER + printf("__INTEL_COMPILER is defined\n"); +#endif + printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__); + + superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix); + printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix); + + printf("Input matrix file:\t%s\n", *cpp); + printf("3D process grid: %d X %d X %d\n", nprow, npcol, npdep); + printf("2D Process grid: %d X %d\n", (int)grid.nprow, (int)grid.npcol); + fflush(stdout); + } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter main()"); +#endif + + /* ------------------------------------------------------------ + GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. 
+ ------------------------------------------------------------ */ + for (ii = 0; iim_loc, + nrhs, b, ldb, xtrue, ldx, &(grid.grid2d)); + fflush(stdout); + if (!grid.zscp.Iam) + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. + ------------------------------------------------------------ */ + + PStatFree (&stat); + if (grid.zscp.Iam == 0) { + Destroy_CompRowLoc_Matrix_dist (&A); + Destroy_LU (n, &(grid.grid2d), &LUstruct); + SUPERLU_FREE (b); + SUPERLU_FREE (xtrue); + SUPERLU_FREE (berr); + } + ScalePermstructFree (&ScalePermstruct); + LUstructFree (&LUstruct); + if (options.SolveInitialized) + { + dSolveFinalize (&options, &SOLVEstruct); + } + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ +out: + superlu_gridexit3d (&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------ */ + MPI_Finalize (); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit main()"); +#endif + +} + + +int +cpp_defs () +{ + printf (".. CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf ("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf ("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf ("\tPROFlevel = %d\n", PROFlevel); +#endif + printf ("....\n"); + return 0; +} diff --git a/EXAMPLE/pzdrive.c b/EXAMPLE/pzdrive.c index f7580097..a7d03097 100644 --- a/EXAMPLE/pzdrive.c +++ b/EXAMPLE/pzdrive.c @@ -193,7 +193,7 @@ int main(int argc, char *argv[]) options.SolveInitialized = NO; options.RefineInitialized = NO; options.PrintStat = YES; - options.DiagInv = NO; + options.DiagInv = NO; */ set_default_options_dist(&options); #if 0 diff --git a/SRC/Makefile b/SRC/Makefile index 00b76220..5e0efb89 100644 --- a/SRC/Makefile +++ b/SRC/Makefile @@ -88,7 +88,7 @@ ZPLUSRC = pzgssvx.o pzgssvx_ABglobal.o \ pzgsrfs.o pzgsmv.o pzgsrfs_ABXglobal.o pzgsmv_AXglobal.o \ zreadtriple_noheader.o # from 3D code -DPLUSRC += pzgssvx3d.o pzgstrf3d.o ztreeFactorization.o \ +ZPLUSRC += pzgssvx3d.o pzgstrf3d.o ztreeFactorization.o \ zgather.o pz3dcomm.o ztrfAux.o zcommunication_aux.o ztrfCommWrapper.o all: double complex16 diff --git a/SRC/dgather.c b/SRC/dgather.c index 9ad1fe14..556e49bb 100644 --- a/SRC/dgather.c +++ b/SRC/dgather.c @@ -41,7 +41,8 @@ void dgather_u(int_t num_u_blks, { // return; // private(j,iukp,rukp,tempu, jb, nsupc,ljb,segsize,lead_zero, \ - // jj, i) \ + // jj, i) + double zero = 0.0; #pragma omp parallel for default (shared) schedule(dynamic) for (int_t j = 0; j < num_u_blks; ++j) @@ -61,7 +62,7 @@ void dgather_u(int_t num_u_blks, if ( segsize ) { int_t lead_zero = ldu - segsize; - for (int_t i = 0; i < lead_zero; ++i) tempu[i] = 0.0; + for (int_t i = 0; i < lead_zero; ++i) tempu[i] = zero; tempu += lead_zero; for (int_t i = 0; i < segsize; ++i) { diff --git a/SRC/dtrfCommWrapper.c b/SRC/dtrfCommWrapper.c index 387a1977..6dcaa285 100644 --- a/SRC/dtrfCommWrapper.c +++ b/SRC/dtrfCommWrapper.c @@ -116,6 +116,7 @@ int_t dLPanelTrSolve( int_t k, int_t* factored_L, gridinfo_t *grid, LUstruct_t *LUstruct) { + double alpha = 1.0; Glu_persist_t *Glu_persist = LUstruct->Glu_persist; LocalLU_t *Llu = LUstruct->Llu; int_t* xsup = Glu_persist->xsup; @@ -163,7 +164,7 @@ int_t dLPanelTrSolve( int_t k, int_t* factored_L, // Sherry: int_t 
len = MY_MIN(BL, l - i * BL); int_t len = SUPERLU_MIN(BL, l - i * BL); cblas_dtrsm (CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, - len, nsupc, 1.0, ublk_ptr, ld_ujrow, &lusup[off], nsupr); + len, nsupc, alpha, ublk_ptr, ld_ujrow, &lusup[off], nsupr); } } } @@ -200,7 +201,8 @@ int_t dLPanelTrSolve( int_t k, int_t* factored_L, #pragma omp task { cblas_dtrsm (CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, - len, nsupc, 1.0, ublk_ptr, ld_ujrow, &lusup[nsupc + off], nsupr); + len, nsupc, alpha, ublk_ptr, ld_ujrow, &lusup[nsupc + off], nsupr); + } } } @@ -270,7 +272,7 @@ int_t dUPanelTrSolve( int_t k, { int_t thread_id = omp_get_thread_num(); double *tempv = bigV + thread_id * ldt * ldt; - Trs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp, + dTrs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp, usub, uval, tempv, nsupc, nsupc, lusup, Glu_persist); } } @@ -317,7 +319,7 @@ int_t dUPanelTrSolve( int_t k, { int_t thread_id = omp_get_thread_num(); double *tempv = bigV + thread_id * ldt * ldt; - Trs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp, + dTrs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp, usub, uval, tempv, nsupc, nsupr, lusup, Glu_persist); } diff --git a/SRC/pd3dcomm.c b/SRC/pd3dcomm.c index 7ddf2394..303fb0de 100644 --- a/SRC/pd3dcomm.c +++ b/SRC/pd3dcomm.c @@ -434,7 +434,7 @@ int_t dscatter3dUPanels(int_t nsupers, #ifdef MPI_MALLOC MPI_DATATYPE_ALLOC(uval, lenv); #else - uval = DOUBLE_ALLOC(lenv); + uval = doubleMalloc_dist(lenv); //DOUBLE_ALLOC(lenv); #endif /*broadcast uval*/ MPI_Bcast( uval, lenv, MPI_DOUBLE, 0, grid3d->zscp.comm); @@ -692,6 +692,7 @@ int_t dreduceAncestors3d(int_t sender, int_t receiver, double* Lval_buf, double* Uval_buf, LUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT) { + double alpha = 1.0, beta = 1.0; int_t myGrid = grid3d->zscp.Iam; /*first setting the L blocks to zero*/ @@ -705,9 +706,10 @@ int_t dreduceAncestors3d(int_t sender, int_t receiver, dzSendUPanel(jb, receiver, LUstruct, grid3d, SCT); } else { - dzRecvLPanel(jb, sender, 1.0, 1.0, Lval_buf, LUstruct, grid3d, SCT); - dzRecvUPanel(jb, sender, 1.0, 1.0, - Uval_buf, LUstruct, grid3d, SCT); + dzRecvLPanel(jb, sender, alpha, beta, Lval_buf, + LUstruct, grid3d, SCT); + dzRecvUPanel(jb, sender, alpha, beta, Uval_buf, + LUstruct, grid3d, SCT); } } @@ -721,6 +723,7 @@ int_t dgatherFactoredLU(int_t sender, int_t receiver, LUValSubBuf_t*LUvsb, LUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT) { + double alpha = 0.0, beta = 1.0; double * Lval_buf = LUvsb->Lval_buf; double * Uval_buf = LUvsb->Uval_buf; int_t myGrid = grid3d->zscp.Iam; @@ -735,10 +738,10 @@ int_t dgatherFactoredLU(int_t sender, int_t receiver, } else { - dzRecvLPanel(jb, sender, 0.0, 1.0, Lval_buf, LUstruct, - grid3d, SCT); - dzRecvUPanel(jb, sender, 0.0, 1.0, Uval_buf, LUstruct, - grid3d, SCT); + dzRecvLPanel(jb, sender, alpha, beta, Lval_buf, + LUstruct, grid3d, SCT); + dzRecvUPanel(jb, sender, alpha, beta, Uval_buf, + LUstruct, grid3d, SCT); } } return 0; diff --git a/SRC/pdgssvx3d.c b/SRC/pdgssvx3d.c index 64e13dd9..fdd19272 100644 --- a/SRC/pdgssvx3d.c +++ b/SRC/pdgssvx3d.c @@ -607,7 +607,7 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, *info = -6; if (*info) { i = -(*info); - pxerr_dist ("pdgssvx", grid, -*info); + pxerr_dist ("pdgssvx3d", grid, -*info); return; } @@ -637,7 +637,7 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, /********/ #if ( DEBUGlevel>=1 ) - CHECK_MALLOC 
(iam, "Enter pdgssvx()"); + CHECK_MALLOC (iam, "Enter pdgssvx3d()"); #endif /* Not factored & ask for equilibration */ @@ -1010,9 +1010,8 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, fstVtxSep[2 * noDomains - 2] = 0; } else if (permc_spec != PARMETIS) { /* same as before */ - printf - ("{%4d,%4d}: pdgssvx: invalid ColPerm option when ParSymbfact is used\n", - (int) MYROW (grid->iam, grid), (int) MYCOL (grid->iam, grid)); + printf("{%4d,%4d}: pdgssvx3d: invalid ColPerm option when ParSymbfact is used\n", + (int) MYROW(grid->iam, grid), (int) MYCOL(grid->iam, grid)); } } /* end ... use parmetis */ diff --git a/SRC/pdgstrf.c b/SRC/pdgstrf.c index be001c2b..b7bd07b2 100644 --- a/SRC/pdgstrf.c +++ b/SRC/pdgstrf.c @@ -150,7 +150,6 @@ at the top-level directory. //#define GEMM_PADLEN 1 #define GEMM_PADLEN 8 -/* #define PDGSTRF2 pdgstrf2_dtrsm */ #define PDGSTRF2 pdgstrf2_trsm #ifdef ISORT @@ -967,13 +966,16 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm, #if 0 Remain_L_buff = (double *) _mm_malloc( sizeof(double)*(Llu->bufmax[1]),64); Ublock_info = (Ublock_info_t *) _mm_malloc(mcb*sizeof(Ublock_info_t),64); + /*int * Ublock_info_iukp = (int *) _mm_malloc(mcb*sizeof(int),64); + int * Ublock_info_rukp = (int *) _mm_malloc(mcb*sizeof(int),64); + int * Ublock_info_jb = (int *) _mm_malloc(mcb*sizeof(int),64); */ #else j = gemm_m_pad * (ldt + max_row_size + gemm_k_pad); Remain_L_buff = doubleMalloc_dist(Llu->bufmax[1] + j); /* This is loose */ Ublock_info = (Ublock_info_t *) SUPERLU_MALLOC(mcb*sizeof(Ublock_info_t)); /*int *Ublock_info_iukp = (int *) SUPERLU_MALLOC(mcb*sizeof(int)); int *Ublock_info_rukp = (int *) SUPERLU_MALLOC(mcb*sizeof(int)); - int *Ublock_info_jb = (int *) SUPERLU_MALLOC(mcb*sizeof(int)); */ + int *Ublock_info_jb = (int *) SUPERLU_MALLOC(mcb*sizeof(int)); */ #endif long long alloc_mem = 3 * mrb * iword + mrb * sizeof(Remain_info_t) @@ -1294,7 +1296,7 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm, #endif { pdgstrs2_omp (kk0, kk, Glu_persist, grid, Llu, - Ublock_info, stat); + Ublock_info, stat); } pdgstrs2_timer += SuperLU_timer_()-ttt2; @@ -1455,10 +1457,14 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm, /* Parallel triangular solve across process row *krow* -- U(k,j) = L(k,k) \ A(k,j). */ double ttt2 = SuperLU_timer_(); - - pdgstrs2_omp (k0, k, Glu_persist, grid, Llu, Ublock_info, stat); - - pdgstrs2_timer += SuperLU_timer_() - ttt2; +#ifdef _OPENMP +/* #pragma omp parallel */ /* Sherry -- parallel done inside pdgstrs2 */ +#endif + { + pdgstrs2_omp (k0, k, Glu_persist, grid, Llu, + Ublock_info, stat); + } + pdgstrs2_timer += SuperLU_timer_() - ttt2; /* Sherry -- need to set factoredU[k0] = 1; ?? 
*/ @@ -1726,7 +1732,7 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm, #else /*#include "SchCompUdt--Phi-2Ddynamic-alt.c"*/ -/*#include "dSchCompUdt-2Ddynamic_v6.c"*/ +//#include "dSchCompUdt-2Ddynamic_v6.c" #include "dSchCompUdt-2Ddynamic.c" @@ -1903,9 +1909,9 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm, Llu->bufmax[1] * dword), stat ); SUPERLU_FREE(Ublock_info); - /* SUPERLU_FREE(Ublock_info_iukp); - SUPERLU_FREE(Ublock_info_rukp); - SUPERLU_FREE(Ublock_info_jb); */ + /*SUPERLU_FREE(Ublock_info_iukp); + SUPERLU_FREE(Ublock_info_rukp); + SUPERLU_FREE(Ublock_info_jb); */ #if ( PROFlevel>=1 ) diff --git a/SRC/pdgstrf2.c b/SRC/pdgstrf2.c index e07ed770..0402a2f7 100644 --- a/SRC/pdgstrf2.c +++ b/SRC/pdgstrf2.c @@ -7,20 +7,71 @@ All rights reserved. The source code is distributed under BSD license, see the file License.txt at the top-level directory. - */ + /*! @file * \brief Performs panel LU factorization. * *
- * -- Distributed SuperLU routine (version 5.2) --
+ * -- Distributed SuperLU routine (version 7.0) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * August 15, 2014
  *
  * Modified:
  *   September 30, 2017
- *   March 31, 2019  version 7.0.0
+ *   May 5, 2019 version 7.0.0
+ *
+ * 
+ * Purpose
+ * =======
+ *   Panel factorization -- block column k
+ *
+ *   Factor diagonal and subdiagonal blocks and test for exact singularity.
+ *   Only the column processes that own block column *k* participate
+ *   in the work.
+ *
+ * Arguments
+ * =========
+ * options (input) superlu_dist_options_t* (global)
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *
+ * k0     (input) int (global)
+ *        Counter of the next supernode to be factorized.
+ *
+ * k      (input) int (global)
+ *        The column number of the block column to be factorized.
+ *
+ * thresh (input) double (global)
+ *        The threshold value = s_eps * anorm.
+ *
+ * Glu_persist (input) Glu_persist_t*
+ *        Global data structures (xsup, supno) replicated on all processes.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * Llu    (input/output) LocalLU_t*
+ *        Local data structures to store distributed L and U matrices.
+ *
+ * U_diag_blk_send_req (input/output) MPI_Request*
+ *        List of send requests to send down the diagonal block of U.
+ *
+ * tag_ub (input) int
+ *        Upper bound of MPI tag values.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics about the factorization.
+ *        See SuperLUStat_t structure defined in util.h.
+ *
+ * info   (output) int*
+ *        = 0: successful exit
+ *        < 0: if info = -i, the i-th argument had an illegal value
+ *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
+ *             been completed, but the factor U is exactly singular,
+ *             and division by zero will occur if it is used to solve a
+ *             system of equations.
  * 
*/ @@ -97,8 +148,8 @@ pdgstrf2_trsm int cols_left, iam, l, pkk, pr; int incx = 1, incy = 1; - int nsupr; /* number of rows in the block (LDA) */ - int nsupc; /* number of columns in the block */ + int nsupr; /* number of rows in the block (LDA) */ + int nsupc; /* number of columns in the block */ int luptr; int_t i, myrow, krow, j, jfst, jlst, u_diag_cnt; int_t *xsup = Glu_persist->xsup; @@ -310,16 +361,15 @@ pdgstrf2_trsm } /* PDGSTRF2_trsm */ - /***************************************************************************** * The following functions are for the new pdgstrf2_dtrsm in the 3D code. *****************************************************************************/ - +static int_t LpanelUpdate(int_t off0, int_t nsupc, double* ublk_ptr, int_t ld_ujrow, double* lusup, int_t nsupr, SCT_t* SCT) { int_t l = nsupr - off0; - + double alpha = 1.0; unsigned long long t1 = _rdtsc(); #define GT 32 @@ -329,24 +379,22 @@ int_t LpanelUpdate(int_t off0, int_t nsupc, double* ublk_ptr, int_t ld_ujrow, int_t off = i * GT; int_t len = SUPERLU_MIN(GT, l - i * GT); cblas_dtrsm (CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, - len, nsupc, 1.0, ublk_ptr, ld_ujrow, &lusup[off0 + off], nsupr); + len, nsupc, alpha, ublk_ptr, ld_ujrow, &lusup[off0 + off], nsupr); } t1 = _rdtsc() - t1; - SCT->trf2_flops += (double) l * (double)nsupc * (double)nsupc; SCT->trf2_time += t1; SCT->L_PanelUpdate_tl += t1; return 0; - } #pragma GCC push_options #pragma GCC optimize ("O0") /*factorizes the diagonal block; called from process that owns the (k,k) block*/ void Local_Dgstrf2(superlu_dist_options_t *options, int_t k, double thresh, - double *BlockUFactor, /*factored U is over writen here*/ + double *BlockUFactor, /*factored U is overwritten here*/ Glu_persist_t *Glu_persist, gridinfo_t *grid, LocalLU_t *Llu, SuperLUStat_t *stat, int *info, SCT_t* SCT) { @@ -377,30 +425,31 @@ void Local_Dgstrf2(superlu_dist_options_t *options, int_t k, double thresh, for (int_t j = 0; j < jlst - jfst; ++j) /* for each column in panel */ { - /* Diagonal pivot */ int_t i = luptr; - if (options->ReplaceTinyPivot == YES || lusup[i] == 0.0) + /* Not to replace zero pivot. */ + if (options->ReplaceTinyPivot == YES && lusup[i] != 0.0) { - if (fabs (lusup[i]) < thresh) /* Diagonal */ - { + if (fabs (lusup[i]) < thresh) { /* Diagonal */ +#if ( PRNTlevel>=2 ) + printf ("(%d) .. col %d, tiny pivot %e ", + iam, jfst + j, lusup[i]); +#endif /* Keep the new diagonal entry with the same sign. */ - if (lusup[i] < 0) - lusup[i] = -thresh; - else - lusup[i] = thresh; + if (lusup[i] < 0) lusup[i] = -thresh; + else lusup[i] = thresh; +#if ( PRNTlevel>=2 ) + printf ("replaced by %e\n", lusup[i]); +#endif ++(stat->TinyPivots); } } - - for (int_t l = 0; l < cols_left; ++l, i += nsupr, ++u_diag_cnt) { int_t st = j * ld_ujrow + j; ublk_ptr[st + l * ld_ujrow] = lusup[i]; /* copy one row of U */ - } if (ujrow[0] == zero) /* Test for singularity. */ @@ -409,7 +458,8 @@ void Local_Dgstrf2(superlu_dist_options_t *options, int_t k, double thresh, } else /* Scale the j-th column. 
*/ { - double temp = 1.0 / ujrow[0]; + double temp; + temp = 1.0 / ujrow[0]; for (int_t i = luptr + 1; i < luptr - j + nsupc; ++i) lusup[i] *= temp; stat->ops[FACT] += nsupc - j - 1; @@ -421,15 +471,15 @@ void Local_Dgstrf2(superlu_dist_options_t *options, int_t k, double thresh, /*following must be int*/ int_t l = nsupc - j - 1; - + /* Rank-1 update */ cblas_dger (CblasColMajor, l, cols_left, alpha, &lusup[luptr + 1], incx, &ujrow[ld_ujrow], incy, &lusup[luptr + nsupr + 1], nsupr); stat->ops[FACT] += 2 * l * cols_left; } - ujrow = ujrow + ld_ujrow + 1; - luptr += nsupr + 1; /* move to next column */ + ujrow = ujrow + ld_ujrow + 1; /* move to next row of U */ + luptr += nsupr + 1; /* move to next column */ } /* for column j ... first loop */ @@ -501,7 +551,7 @@ void Local_Dgstrf2(superlu_dist_options_t *options, int_t k, double thresh, * *
*/ -void pdgstrf2_dtrsm +void pdgstrf2_xtrsm (superlu_dist_options_t *options, int_t nsupers, int_t k0, int_t k, double thresh, Glu_persist_t *Glu_persist, gridinfo_t *grid, LocalLU_t *Llu, MPI_Request *U_diag_blk_send_req, @@ -549,18 +599,15 @@ void pdgstrf2_dtrsm { /* There are pending sends - wait for all Isend to complete */ Wait_UDiagBlockSend(U_diag_blk_send_req, grid, SCT); - } if (iam == pkk) /* diagonal process */ { - /*factorize the diagonal block*/ Local_Dgstrf2(options, k, thresh, Llu->ujrow, Glu_persist, grid, Llu, stat, info, SCT); ublk_ptr = ujrow = Llu->ujrow; - if (U_diag_blk_send_req && iam == pkk) /* Send the U block */ { dISend_UDiagBlock(k0, ublk_ptr, nsupc * nsupc, U_diag_blk_send_req, @@ -569,7 +616,6 @@ void pdgstrf2_dtrsm } LpanelUpdate(nsupc, nsupc, ublk_ptr, ld_ujrow, lusup, nsupr, SCT); - } else /* non-diagonal process */ { @@ -588,7 +634,7 @@ void pdgstrf2_dtrsm } } /* end if pkk ... */ -} /* pdgstrf2_dtrsm */ +} /* pdgstrf2_xtrsm */ /***************************************************************************** * The following functions are for the new pdgstrs2_omp in the 3D code. @@ -596,11 +642,12 @@ void pdgstrf2_dtrsm /* PDGSTRS2 helping kernels*/ -int_t Trs2_GatherU(int_t iukp, int_t rukp, int_t klst, - int_t nsupc, int_t ldu, - int_t *usub, - double* uval, double *tempv) +int_t dTrs2_GatherU(int_t iukp, int_t rukp, int_t klst, + int_t nsupc, int_t ldu, + int_t *usub, + double* uval, double *tempv) { + double zero = 0.0; int_t ncols = 0; for (int_t jj = iukp; jj < iukp + nsupc; ++jj) { @@ -608,7 +655,7 @@ int_t Trs2_GatherU(int_t iukp, int_t rukp, int_t klst, if ( segsize ) { int_t lead_zero = ldu - segsize; - for (int_t i = 0; i < lead_zero; ++i) tempv[i] = 0.0; + for (int_t i = 0; i < lead_zero; ++i) tempv[i] = zero; tempv += lead_zero; for (int_t i = 0; i < segsize; ++i) tempv[i] = uval[rukp + i]; @@ -617,14 +664,12 @@ int_t Trs2_GatherU(int_t iukp, int_t rukp, int_t klst, ncols++; } } - return ncols; } -int_t Trs2_ScatterU(int_t iukp, int_t rukp, int_t klst, - int_t nsupc, int_t ldu, - int_t *usub, - double* uval, double *tempv) +int_t dTrs2_ScatterU(int_t iukp, int_t rukp, int_t klst, + int_t nsupc, int_t ldu, + int_t *usub, double* uval, double *tempv) { for (int_t jj = 0; jj < nsupc; ++jj) { @@ -635,26 +680,21 @@ int_t Trs2_ScatterU(int_t iukp, int_t rukp, int_t klst, tempv += lead_zero; for (int i = 0; i < segsize; ++i) { - uval[rukp + i] = tempv[i]; - } tempv += segsize; rukp += segsize; - } - - } /*for jj=0:nsupc */ return 0; } -int_t Trs2_GatherTrsmScatter(int_t klst, int_t iukp, int_t rukp, - int_t *usub, - double* uval, double *tempv, - int_t knsupc, int_t nsupr, double*lusup, - Glu_persist_t *Glu_persist) /*glupersist for xsup for supersize*/ +int_t dTrs2_GatherTrsmScatter(int_t klst, int_t iukp, int_t rukp, + int_t *usub, double *uval, double *tempv, + int_t knsupc, int_t nsupr, double *lusup, + Glu_persist_t *Glu_persist) /*glupersist for xsup for supersize*/ { + double alpha = 1.0; int_t *xsup = Glu_persist->xsup; // int_t iukp = Ublock_info.iukp; // int_t rukp = Ublock_info.rukp; @@ -671,55 +711,23 @@ int_t Trs2_GatherTrsmScatter(int_t klst, int_t iukp, int_t rukp, } /*pack U block into a dense Block*/ - int_t ncols = Trs2_GatherU(iukp, rukp, klst, nsupc, ldu, usub, uval, tempv); + int_t ncols = dTrs2_GatherU(iukp, rukp, klst, nsupc, ldu, usub, + uval, tempv); /*now call dtrsm on packed dense block*/ int_t luptr = (knsupc - ldu) * (nsupr + 1); // if(ldu>nsupr) printf("nsupr %d ldu %d\n",nsupr,ldu ); cblas_dtrsm (CblasColMajor, CblasLeft, 
CblasLower, CblasNoTrans, CblasUnit, - ldu, ncols, 1.0, &lusup[luptr], nsupr, tempv, ldu); + ldu, ncols, alpha, &lusup[luptr], nsupr, tempv, ldu); /*now scatter the output into sparse U block*/ - Trs2_ScatterU(iukp, rukp, klst, nsupc, ldu, usub, uval, tempv); + dTrs2_ScatterU(iukp, rukp, klst, nsupc, ldu, usub, uval, tempv); return 0; - -} - -int_t Trs2_InitUblock_info(int_t klst, int_t nb, - Ublock_info_t *Ublock_info, - int_t *usub, - Glu_persist_t *Glu_persist, SuperLUStat_t *stat ) -{ - int_t *xsup = Glu_persist->xsup; - int_t iukp, rukp; - iukp = BR_HEADER; - rukp = 0; - - for (int_t b = 0; b < nb; ++b) - { - int_t gb = usub[iukp]; - int_t nsupc = SuperSize (gb); - - Ublock_info[b].iukp = iukp; - Ublock_info[b].rukp = rukp; - // Ublock_info[b].nsupc = nsupc; - - iukp += UB_DESCRIPTOR; - /* Sherry: can remove this loop for rukp - rukp += usub[iukp-1]; - */ - for (int_t j = 0; j < nsupc; ++j) - { - int_t segsize = klst - usub[iukp++]; - rukp += segsize; - stat->ops[FACT] += segsize * (segsize + 1); - } - } - return 0; } #if 1 + /***************************************************************************** * The following pdgstrf2_omp is improved for KNL, since Version 5.2.0. *****************************************************************************/ @@ -727,6 +735,9 @@ void pdgstrs2_omp (int_t k0, int_t k, Glu_persist_t * Glu_persist, gridinfo_t * grid, LocalLU_t * Llu, Ublock_info_t *Ublock_info, SuperLUStat_t * stat) { +#ifdef PI_DEBUG + printf("====Entering pdgstrs2==== \n"); +#endif int iam, pkk; int incx = 1; int nsupr; /* number of rows in the block L(:,k) (LDA) */ @@ -834,7 +845,9 @@ void pdgstrs2_omp #endif } /* end task */ rukp += segsize; +#ifndef USE_Ublock_info stat->ops[FACT] += segsize * (segsize + 1); +#endif } /* end if segsize > 0 */ } /* end for j in parallel ... */ /* #pragma omp taskwait */ @@ -878,7 +891,7 @@ void pdgstrs2_omp(int_t k0, int_t k, int_t* Lsub_buf, double *lusup = Lval_buf; /* Loop through all the row blocks. to get the iukp and rukp*/ - Trs2_InitUbloc_kinfo(klst, nb, Ublock_info, usub, Glu_persist, stat ); + Trs2_InitUbloc_info(klst, nb, Ublock_info, usub, Glu_persist, stat ); /* Loop through all the row blocks. */ #pragma omp parallel for schedule(dynamic,2) @@ -886,14 +899,11 @@ void pdgstrs2_omp(int_t k0, int_t k, int_t* Lsub_buf, { int_t thread_id = omp_get_thread_num(); double *tempv = bigV + thread_id * ldt * ldt; - Trs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp, - usub, uval, tempv, - knsupc, nsupr, lusup, - Glu_persist); + dTrs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp, + usub, uval, tempv, knsupc, nsupr, lusup, Glu_persist); } /* for b ... */ SCT->PDGSTRS2_tl += (double) ( _rdtsc() - t1); } /* pdgstrs2_omp new version from Piyush */ #endif - diff --git a/SRC/pz3dcomm.c b/SRC/pz3dcomm.c index 8f3f6d67..03ba89af 100644 --- a/SRC/pz3dcomm.c +++ b/SRC/pz3dcomm.c @@ -9,7 +9,7 @@ The source code is distributed under BSD license, see the file License.txt at the top-level directory. 
*/ -#include "superlu_ddefs.h" +#include "superlu_zdefs.h" #include "cblas.h" #if 0 #include "p3dcomm.h" @@ -195,7 +195,7 @@ int_t zzSendLPanel(int_t k, int_t receiver, } -int_t zzRecvLPanel(int_t k, int_t sender, double alpha, double beta, +int_t zzRecvLPanel(int_t k, int_t sender, doublecomplex alpha, doublecomplex beta, doublecomplex* Lval_buf, LUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT) { @@ -230,8 +230,8 @@ int_t zzRecvLPanel(int_t k, int_t sender, double alpha, double beta, grid3d->zscp.comm, &status); /*reduce the updates*/ - cblas_dscal (len2, alpha, lnzval, 1); - cblas_daxpy (len2, beta, Lval_buf, 1, lnzval, 1); + cblas_zscal (len2, (void*) &alpha, lnzval, 1); + cblas_zaxpy (len2, (void*) &beta, Lval_buf, 1, lnzval, 1); } } @@ -271,7 +271,7 @@ int_t zzSendUPanel(int_t k, int_t receiver, } -int_t zzRecvUPanel(int_t k, int_t sender, double alpha, double beta, +int_t zzRecvUPanel(int_t k, int_t sender, doublecomplex alpha, doublecomplex beta, doublecomplex* Uval_buf, LUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT) { @@ -299,8 +299,8 @@ int_t zzRecvUPanel(int_t k, int_t sender, double alpha, double beta, grid3d->zscp.comm, &status); /*reduce the updates*/ - cblas_dscal (lenv, alpha, unzval, 1); - cblas_daxpy (lenv, beta, Uval_buf, 1, unzval, 1); + cblas_zscal (lenv, (void*) &alpha, unzval, 1); + cblas_zaxpy (lenv, (void*) &beta, Uval_buf, 1, unzval, 1); } } return 0; @@ -433,7 +433,7 @@ int_t zscatter3dUPanels(int_t nsupers, #ifdef MPI_MALLOC MPI_DATATYPE_ALLOC(uval, lenv); #else - uval = DOUBLE_ALLOC(lenv); + uval = doublecomplexMalloc_dist(lenv); //DOUBLE_ALLOC(lenv); #endif /*broadcast uval*/ MPI_Bcast( uval, lenv, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid3d->zscp.comm); @@ -691,6 +691,7 @@ int_t zreduceAncestors3d(int_t sender, int_t receiver, doublecomplex* Lval_buf, doublecomplex* Uval_buf, LUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT) { + doublecomplex alpha = {1.0, 0.0}, beta = {1.0, 0.0}; int_t myGrid = grid3d->zscp.Iam; /*first setting the L blocks to zero*/ @@ -704,9 +705,10 @@ int_t zreduceAncestors3d(int_t sender, int_t receiver, zzSendUPanel(jb, receiver, LUstruct, grid3d, SCT); } else { - zzRecvLPanel(jb, sender, 1.0, 1.0, Lval_buf, LUstruct, grid3d, SCT); - zzRecvUPanel(jb, sender, 1.0, 1.0, - Uval_buf, LUstruct, grid3d, SCT); + zzRecvLPanel(jb, sender, alpha, beta, Lval_buf, + LUstruct, grid3d, SCT); + zzRecvUPanel(jb, sender, alpha, beta, Uval_buf, + LUstruct, grid3d, SCT); } } @@ -720,6 +722,7 @@ int_t zgatherFactoredLU(int_t sender, int_t receiver, LUValSubBuf_t*LUvsb, LUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT) { + doublecomplex alpha = {0.0, 0.0}, beta = {1.0, 0.0}; doublecomplex * Lval_buf = LUvsb->Lval_buf; doublecomplex * Uval_buf = LUvsb->Uval_buf; int_t myGrid = grid3d->zscp.Iam; @@ -734,10 +737,10 @@ int_t zgatherFactoredLU(int_t sender, int_t receiver, } else { - zzRecvLPanel(jb, sender, 0.0, 1.0, Lval_buf, LUstruct, - grid3d, SCT); - zzRecvUPanel(jb, sender, 0.0, 1.0, Uval_buf, LUstruct, - grid3d, SCT); + zzRecvLPanel(jb, sender, alpha, beta, Lval_buf, + LUstruct, grid3d, SCT); + zzRecvUPanel(jb, sender, alpha, beta, Uval_buf, + LUstruct, grid3d, SCT); } } return 0; diff --git a/SRC/pzgssvx3d.c b/SRC/pzgssvx3d.c index ac4fb077..e1cad175 100644 --- a/SRC/pzgssvx3d.c +++ b/SRC/pzgssvx3d.c @@ -598,7 +598,7 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, "Extra precise iterative refinement yet to support."); } else if (A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc - || A->Dtype != SLU_D || 
A->Mtype != SLU_GE) + || A->Dtype != SLU_Z || A->Mtype != SLU_GE) *info = -2; else if (ldb < m_loc) *info = -5; @@ -606,7 +606,7 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, *info = -6; if (*info) { i = -(*info); - pxerr_dist ("pdgssvx", grid, -*info); + pxerr_dist ("pzgssvx3d", grid, -*info); return; } @@ -636,7 +636,7 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, /********/ #if ( DEBUGlevel>=1 ) - CHECK_MALLOC (iam, "Enter pdgssvx()"); + CHECK_MALLOC (iam, "Enter pzgssvx3d()"); #endif /* Not factored & ask for equilibration */ @@ -645,20 +645,20 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, switch (ScalePermstruct->DiagScale) { case NOEQUIL: - if (!(R = (doublecomplex *) doublecomplexMalloc_dist (m))) + if (!(R = (double *) doubleMalloc_dist (m))) ABORT ("Malloc fails for R[]."); - if (!(C = (doublecomplex *) doublecomplexMalloc_dist (n))) + if (!(C = (double *) doubleMalloc_dist (n))) ABORT ("Malloc fails for C[]."); ScalePermstruct->R = R; ScalePermstruct->C = C; break; case ROW: - if (!(C = (doublecomplex *) doublecomplexMalloc_dist (n))) + if (!(C = (double *) doubleMalloc_dist (n))) ABORT ("Malloc fails for C[]."); ScalePermstruct->C = C; break; case COL: - if (!(R = (doublecomplex *) doublecomplexMalloc_dist (m))) + if (!(R = (double *) doubleMalloc_dist (m))) ABORT ("Malloc fails for R[]."); ScalePermstruct->R = R; break; @@ -778,7 +778,7 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, GA_mem_use = (nnz + n + 1) * sizeof (int_t); if (need_value) { - a_GA = (double *) GAstore->nzval; + a_GA = (doublecomplex *) GAstore->nzval; GA_mem_use += nnz * sizeof (double); } else @@ -810,7 +810,7 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, if ( iam==0 ) { /* Process 0 finds a row permutation */ - iinfo = dldperm_dist (job, m, nnz, colptr, rowind, a_GA, + iinfo = zldperm_dist (job, m, nnz, colptr, rowind, a_GA, perm_r, R1, C1); MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); if ( iinfo == 0 ) { @@ -955,7 +955,7 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, *(unsigned char *) norm = '1'; else *(unsigned char *) norm = 'I'; - anorm = pdlangs (norm, A, grid); + anorm = pzlangs (norm, A, grid); #if ( PRNTlevel>=1 ) if (!iam) { printf (".. anorm %e\n", anorm); fflush(stdout); @@ -1011,9 +1011,8 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, fstVtxSep[2 * noDomains - 2] = 0; } else if (permc_spec != PARMETIS) { /* same as before */ - printf - ("{%4d,%4d}: pdgssvx: invalid ColPerm option when ParSymbfact is used\n", - (int) MYROW (grid->iam, grid), (int) MYCOL (grid->iam, grid)); + printf("{%4d,%4d}: pzgssvx3d: invalid ColPerm option when ParSymbfact is used\n", + (int) MYROW(grid->iam, grid), (int) MYCOL(grid->iam, grid)); } } /* end ... use parmetis */ @@ -1146,7 +1145,7 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, NOTE: the row permutation Pc*Pr is applied internally in the distribution routine. 
*/ t = SuperLU_timer_ (); - dist_mem_use = pddistribute (Fact, n, A, ScalePermstruct, + dist_mem_use = pzdistribute (Fact, n, A, ScalePermstruct, Glu_freeable, LUstruct, grid); stat->utime[DIST] = SuperLU_timer_ () - t; @@ -1165,7 +1164,7 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, colind[j] = perm_c[colind[j]]; t = SuperLU_timer_ (); - dist_mem_use = ddist_psymbtonum (Fact, n, A, ScalePermstruct, + dist_mem_use = zdist_psymbtonum (Fact, n, A, ScalePermstruct, &Pslu_freeable, LUstruct, grid); if (dist_mem_use > 0) ABORT ("Not enough memory available for dist_psymbtonum\n"); @@ -1371,7 +1370,7 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, if (options->RefineInitialized == NO || Fact == DOFACT) { /* All these cases need to re-initialize gsmv structure */ if (options->RefineInitialized) - pdgsmv_finalize (SOLVEstruct->gsmv_comm); + pzgsmv_finalize (SOLVEstruct->gsmv_comm); pzgsmv_init (A, SOLVEstruct->row_to_proc, grid, SOLVEstruct->gsmv_comm); @@ -1385,7 +1384,7 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, options->RefineInitialized = YES; } else if (Fact == SamePattern || Fact == SamePattern_SameRowPerm) { - double at; + doublecomplex at; int_t k, jcol, p; /* Swap to beginning the part of A corresponding to the local part of X, as was done in pdgsmv_init() */ diff --git a/SRC/pzgstrf.c b/SRC/pzgstrf.c index 4001068e..f90759f8 100644 --- a/SRC/pzgstrf.c +++ b/SRC/pzgstrf.c @@ -150,18 +150,6 @@ at the top-level directory. #define GEMM_PADLEN 8 #define PZGSTRF2 pzgstrf2_trsm -#define PZGSTRS2 pzgstrs2_omp - -extern void PZGSTRF2 (superlu_dist_options_t *, int_t, int_t, double, - Glu_persist_t *, gridinfo_t *, LocalLU_t *, - MPI_Request *, int, SuperLUStat_t *, int *); -#ifdef _CRAY -extern void PZGSTRS2 (int_t, int_t, Glu_persist_t *, gridinfo_t *, - LocalLU_t *, SuperLUStat_t *, _fcd, _fcd, _fcd); -#else -extern void PZGSTRS2 (int_t, int_t, Glu_persist_t *, gridinfo_t *, - LocalLU_t *, SuperLUStat_t *); -#endif #ifdef ISORT extern void isort (int_t N, int_t * ARRAY1, int_t * ARRAY2); @@ -978,16 +966,16 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm, #if 0 Remain_L_buff = (doublecomplex *) _mm_malloc( sizeof(doublecomplex)*(Llu->bufmax[1]),64); Ublock_info = (Ublock_info_t *) _mm_malloc(mcb*sizeof(Ublock_info_t),64); - int * Ublock_info_iukp = (int *) _mm_malloc(mcb*sizeof(int),64); - int * Ublock_info_rukp = (int *) _mm_malloc(mcb*sizeof(int),64); - int * Ublock_info_jb = (int *) _mm_malloc(mcb*sizeof(int),64); + /*int * Ublock_info_iukp = (int *) _mm_malloc(mcb*sizeof(int),64); + int * Ublock_info_rukp = (int *) _mm_malloc(mcb*sizeof(int),64); + int * Ublock_info_jb = (int *) _mm_malloc(mcb*sizeof(int),64); */ #else j = gemm_m_pad * (ldt + max_row_size + gemm_k_pad); Remain_L_buff = doublecomplexMalloc_dist(Llu->bufmax[1] + j); /* This is loose */ Ublock_info = (Ublock_info_t *) SUPERLU_MALLOC(mcb*sizeof(Ublock_info_t)); - int *Ublock_info_iukp = (int *) SUPERLU_MALLOC(mcb*sizeof(int)); - int *Ublock_info_rukp = (int *) SUPERLU_MALLOC(mcb*sizeof(int)); - int *Ublock_info_jb = (int *) SUPERLU_MALLOC(mcb*sizeof(int)); + /*int *Ublock_info_iukp = (int *) SUPERLU_MALLOC(mcb*sizeof(int)); + int *Ublock_info_rukp = (int *) SUPERLU_MALLOC(mcb*sizeof(int)); + int *Ublock_info_jb = (int *) SUPERLU_MALLOC(mcb*sizeof(int)); */ #endif long long alloc_mem = 3 * mrb * iword + mrb * sizeof(Remain_info_t) @@ -1307,8 +1295,8 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm, /* #pragma omp parallel */ 
/* Sherry -- parallel done inside pzgstrs2 */ #endif { - PZGSTRS2 (kk0, kk, Glu_persist, grid, Llu, - stat); + pzgstrs2_omp (kk0, kk, Glu_persist, grid, Llu, + Ublock_info, stat); } pdgstrs2_timer += SuperLU_timer_()-ttt2; @@ -1473,7 +1461,8 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm, /* #pragma omp parallel */ /* Sherry -- parallel done inside pzgstrs2 */ #endif { - PZGSTRS2 (k0, k, Glu_persist, grid, Llu, stat); + pzgstrs2_omp (k0, k, Glu_persist, grid, Llu, + Ublock_info, stat); } pdgstrs2_timer += SuperLU_timer_() - ttt2; @@ -1920,9 +1909,9 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm, Llu->bufmax[1] * dword), stat ); SUPERLU_FREE(Ublock_info); - SUPERLU_FREE(Ublock_info_iukp); - SUPERLU_FREE(Ublock_info_rukp); - SUPERLU_FREE(Ublock_info_jb); + /*SUPERLU_FREE(Ublock_info_iukp); + SUPERLU_FREE(Ublock_info_rukp); + SUPERLU_FREE(Ublock_info_jb); */ #if ( PROFlevel>=1 ) diff --git a/SRC/pzgstrf2.c b/SRC/pzgstrf2.c index 948fd470..6af84d93 100644 --- a/SRC/pzgstrf2.c +++ b/SRC/pzgstrf2.c @@ -13,12 +13,13 @@ at the top-level directory. * \brief Performs panel LU factorization. * *
- * -- Distributed SuperLU routine (version 5.2) --
+ * -- Distributed SuperLU routine (version 7.0) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * August 15, 2014
  *
  * Modified:
  *   September 30, 2017
+ *   May 5, 2019 version 7.0.0
  *
  * 
  * Purpose
@@ -75,7 +76,65 @@ at the top-level directory.
 
 #include <math.h>
 #include "superlu_zdefs.h"
+#include "cblas.h"
 
+/*****************************************************************************
+ * The following pzgstrf2_trsm is from version 6 and earlier.
+ *****************************************************************************/
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *   Panel factorization -- block column k
+ *
+ *   Factor diagonal and subdiagonal blocks and test for exact singularity.
+ *   Only the column processes that own block column *k* participate
+ *   in the work.
+ *
+ * Arguments
+ * =========
+ * options (input) superlu_dist_options_t* (global)
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *
+ * k0     (input) int (global)
+ *        Counter of the next supernode to be factorized.
+ *
+ * k      (input) int (global)
+ *        The column number of the block column to be factorized.
+ *
+ * thresh (input) double (global)
+ *        The threshold value = s_eps * anorm.
+ *
+ * Glu_persist (input) Glu_persist_t*
+ *        Global data structures (xsup, supno) replicated on all processes.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * Llu    (input/output) LocalLU_t*
+ *        Local data structures to store distributed L and U matrices.
+ *
+ * U_diag_blk_send_req (input/output) MPI_Request*
+ *        List of send requests to send down the diagonal block of U.
+ *
+ * tag_ub (input) int
+ *        Upper bound of MPI tag values.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics about the factorization.
+ *        See SuperLUStat_t structure defined in util.h.
+ *
+ * info   (output) int*
+ *        = 0: successful exit
+ *        < 0: if info = -i, the i-th argument had an illegal value
+ *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
+ *             been completed, but the factor U is exactly singular,
+ *             and division by zero will occur if it is used to solve a
+ *             system of equations.
+ * 
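+ *
+ * Complex arithmetic note: a sketch of the column-scaling step as done
+ * in Local_Zgstrf2 below (names are those of that routine; the real
+ * code also counts the flops in stat->ops[FACT]):
+ *
+ *    doublecomplex temp;
+ *    slud_z_div (&temp, &one, &ujrow[0]);           temp = 1 / U(j,j)
+ *    for (i = luptr + 1; i < luptr - j + nsupc; ++i)
+ *        zz_mult (&lusup[i], &lusup[i], &temp);     L(i,j) = L(i,j) * temp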
+ */ /* This pdgstrf2 is based on TRSM function */ void pzgstrf2_trsm @@ -88,14 +147,14 @@ pzgstrf2_trsm int cols_left, iam, l, pkk, pr; int incx = 1, incy = 1; - int nsupr; /* number of rows in the block (LDA) */ - int nsupc; /* number of columns in the block */ + int nsupr; /* number of rows in the block (LDA) */ + int nsupc; /* number of columns in the block */ int luptr; int_t i, myrow, krow, j, jfst, jlst, u_diag_cnt; int_t *xsup = Glu_persist->xsup; doublecomplex *lusup, temp; doublecomplex *ujrow, *ublk_ptr; /* pointer to the U block */ - doublecomplex one = {1.0, 0.0}, alpha = {-1.0, 0.0}; + doublecomplex alpha = {-1.0, 0.0}, zero = {0.0, 0.0}, one = {1.0, 0.0}; int_t Pr; MPI_Status status; MPI_Comm comm = (grid->cscp).comm; @@ -303,12 +362,380 @@ pzgstrf2_trsm } /* PZGSTRF2_trsm */ +/***************************************************************************** + * The following functions are for the new pdgstrf2_ztrsm in the 3D code. + *****************************************************************************/ +static +int_t LpanelUpdate(int_t off0, int_t nsupc, doublecomplex* ublk_ptr, int_t ld_ujrow, + doublecomplex* lusup, int_t nsupr, SCT_t* SCT) +{ + int_t l = nsupr - off0; + doublecomplex alpha = {1.0, 0.0}; + unsigned long long t1 = _rdtsc(); + +#define GT 32 +#pragma omp parallel for + for (int i = 0; i < CEILING(l, GT); ++i) + { + int_t off = i * GT; + int_t len = SUPERLU_MIN(GT, l - i * GT); + cblas_ztrsm (CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, + len, nsupc, (void*) &alpha, ublk_ptr, ld_ujrow, &lusup[off0 + off], nsupr); + } + t1 = _rdtsc() - t1; + + SCT->trf2_flops += (double) l * (double)nsupc * (double)nsupc; + SCT->trf2_time += t1; + SCT->L_PanelUpdate_tl += t1; + return 0; +} + +#pragma GCC push_options +#pragma GCC optimize ("O0") +/*factorizes the diagonal block; called from process that owns the (k,k) block*/ +void Local_Zgstrf2(superlu_dist_options_t *options, int_t k, double thresh, + doublecomplex *BlockUFactor, /*factored U is overwritten here*/ + Glu_persist_t *Glu_persist, gridinfo_t *grid, LocalLU_t *Llu, + SuperLUStat_t *stat, int *info, SCT_t* SCT) +{ + //unsigned long long t1 = _rdtsc(); + int_t *xsup = Glu_persist->xsup; + doublecomplex alpha = {-1.0, 0.0}, zero = {0.0, 0.0}, one = {1.0, 0.0}; + + // printf("Entering dgetrf2 %d \n", k); + /* Initialization. */ + int_t lk = LBj (k, grid); /* Local block number */ + int_t jfst = FstBlockC (k); + int_t jlst = FstBlockC (k + 1); + doublecomplex *lusup = Llu->Lnzval_bc_ptr[lk]; + int_t nsupc = SuperSize (k); + int_t nsupr; + if (Llu->Lrowind_bc_ptr[lk]) + nsupr = Llu->Lrowind_bc_ptr[lk][1]; + else + nsupr = 0; + doublecomplex *ublk_ptr = BlockUFactor; + doublecomplex *ujrow = BlockUFactor; + int_t luptr = 0; /* Point_t to the diagonal entries. */ + int_t cols_left = nsupc; /* supernode size */ + int_t u_diag_cnt = 0; + int_t ld_ujrow = nsupc; /* leading dimension of ujrow */ + int_t incx = 1; + int_t incy = ld_ujrow; + + for (int_t j = 0; j < jlst - jfst; ++j) /* for each column in panel */ + { + /* Diagonal pivot */ + int_t i = luptr; + if ( options->ReplaceTinyPivot == YES ) { + if ( slud_z_abs1(&lusup[i]) < thresh && + lusup[i].r != 0.0 && lusup[i].i != 0.0 ) { /* Diagonal */ + +#if ( PRNTlevel>=2 ) + printf ("(%d) .. col %d, tiny pivot %e ", + iam, jfst + j, lusup[i]); +#endif + /* Keep the new diagonal entry with the same sign. 
*/ + if ( lusup[i].r < 0 ) lusup[i].r = -thresh; + else lusup[i].r = thresh; + lusup[i].i = 0.0; +#if ( PRNTlevel>=2 ) + printf ("replaced by %e\n", lusup[i]); +#endif + ++(stat->TinyPivots); + } + } + + for (int_t l = 0; l < cols_left; ++l, i += nsupr, ++u_diag_cnt) + { + int_t st = j * ld_ujrow + j; + ublk_ptr[st + l * ld_ujrow] = lusup[i]; /* copy one row of U */ + } + + /* Test for singularity. */ + if ( ujrow[0].r == 0.0 && ujrow[0].i == 0.0 ) + { + *info = j + jfst + 1; + } + else /* Scale the j-th column. */ + { + doublecomplex temp; + slud_z_div(&temp, &one, &ujrow[0]); + for (i = luptr + 1; i < luptr - j + nsupc; ++i) + zz_mult(&lusup[i], &lusup[i], &temp); + stat->ops[FACT] += 6*(nsupc-j-1) + 10; + } + + /* Rank-1 update of the trailing submatrix. */ + if (--cols_left) + { + /*following must be int*/ + int_t l = nsupc - j - 1; + + /* Rank-1 update */ + cblas_zgeru (CblasColMajor, l, cols_left, &alpha, &lusup[luptr + 1], incx, + &ujrow[ld_ujrow], incy, &lusup[luptr + nsupr + 1], + nsupr); + stat->ops[FACT] += 8 * l * cols_left; + } + + ujrow = ujrow + ld_ujrow + 1; /* move to next row of U */ + luptr += nsupr + 1; /* move to next column */ + + } /* for column j ... first loop */ + + + //int_t thread_id = omp_get_thread_num(); + // SCT->Local_Dgstrf2_Thread_tl[thread_id * CACHE_LINE_SIZE] += (double) ( _rdtsc() - t1); +} + +#pragma GCC pop_options /************************************************************************/ +/*! \brief + * + *
+ * Purpose
+ * =======
+ *   Panel factorization -- block column k
+ *
+ *   Factor diagonal and subdiagonal blocks and test for exact singularity.
+ *   Only the column processes that own block column *k* participate
+ *   in the work.
+ *
+ * Arguments
+ * =========
+ * options (input) superlu_dist_options_t* (global)
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *
+ * nsupers (input) int_t (global)
+ *         Number of supernodes.
+ *
+ * k0     (input) int_t (global)
+ *        Counter of the next supernode to be factorized.
+ *
+ * k      (input) int_t (global)
+ *        The column number of the block column to be factorized.
+ *
+ * thresh (input) double (global)
+ *        The threshold value = s_eps * anorm.
+ *
+ * Glu_persist (input) Glu_persist_t*
+ *        Global data structures (xsup, supno) replicated on all processes.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * Llu    (input/output) LocalLU_t*
+ *        Local data structures to store distributed L and U matrices.
+ *
+ * U_diag_blk_send_req (input/output) MPI_Request*
+ *        List of send requests to send down the diagonal block of U.
+ *
+ * tag_ub (input) int
+ *        Upper bound of MPI tag values.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics about the factorization.
+ *        See SuperLUStat_t structure defined in util.h.
+ *
+ * info   (output) int*
+ *        = 0: successful exit
+ *        < 0: if info = -i, the i-th argument had an illegal value
+ *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
+ *             been completed, but the factor U is exactly singular,
+ *             and division by zero will occur if it is used to solve a
+ *             system of equations.
+ *
+ * SCT    (output) SCT_t*
+ *        Additional statistics used in the 3D algorithm.
+ *
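+ * Usage sketch (a note for orientation, not part of the library): callers
+ * are expected to invoke this routine only from the process column that
+ * owns block column k, as the 2D factorization driver does. MYCOL/PCOL
+ * are the grid macros used elsewhere in this file; the error handling
+ * below is illustrative.
+ *
+ *   if (MYCOL(grid->iam, grid) == PCOL(k, grid)) {
+ *       pzgstrf2_xtrsm(options, nsupers, k0, k, thresh, Glu_persist,
+ *                      grid, Llu, U_diag_blk_send_req, tag_ub,
+ *                      stat, &info, SCT);
+ *       if (info > 0)           // exact zero pivot at U(info,info)
+ *           ABORT("singular U"); // ABORT is the SuperLU error macro
+ *   }
+ *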
+ * </pre>
+ */ +void pzgstrf2_xtrsm +(superlu_dist_options_t *options, int_t nsupers, + int_t k0, int_t k, double thresh, Glu_persist_t *Glu_persist, + gridinfo_t *grid, LocalLU_t *Llu, MPI_Request *U_diag_blk_send_req, + int tag_ub, SuperLUStat_t *stat, int *info, SCT_t *SCT) +{ + int cols_left, iam, pkk; + int incy = 1; + + int nsupr; /* number of rows in the block (LDA) */ + int luptr; + int_t myrow, krow, j, jfst, jlst, u_diag_cnt; + int_t nsupc; /* number of columns in the block */ + int_t *xsup = Glu_persist->xsup; + doublecomplex *lusup; + doublecomplex *ujrow, *ublk_ptr; /* pointer to the U block */ + int_t Pr; + + /* Quick return. */ + *info = 0; + + /* Initialization. */ + iam = grid->iam; + Pr = grid->nprow; + myrow = MYROW (iam, grid); + krow = PROW (k, grid); + pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid); + j = LBj (k, grid); /* Local block number */ + jfst = FstBlockC (k); + jlst = FstBlockC (k + 1); + lusup = Llu->Lnzval_bc_ptr[j]; + nsupc = SuperSize (k); + if (Llu->Lrowind_bc_ptr[j]) + nsupr = Llu->Lrowind_bc_ptr[j][1]; + else + nsupr = 0; + ublk_ptr = ujrow = Llu->ujrow; + + luptr = 0; /* Point to the diagonal entries. */ + cols_left = nsupc; /* supernode size */ + int ld_ujrow = nsupc; /* leading dimension of ujrow */ + u_diag_cnt = 0; + incy = ld_ujrow; + + if (U_diag_blk_send_req && U_diag_blk_send_req[myrow]) + { + /* There are pending sends - wait for all Isend to complete */ + Wait_UDiagBlockSend(U_diag_blk_send_req, grid, SCT); + } + + if (iam == pkk) /* diagonal process */ + { + /*factorize the diagonal block*/ + Local_Zgstrf2(options, k, thresh, Llu->ujrow, Glu_persist, + grid, Llu, stat, info, SCT); + ublk_ptr = ujrow = Llu->ujrow; + + if (U_diag_blk_send_req && iam == pkk) /* Send the U block */ + { + zISend_UDiagBlock(k0, ublk_ptr, nsupc * nsupc, U_diag_blk_send_req, + grid, tag_ub); + U_diag_blk_send_req[krow] = (MPI_Request) TRUE; /* flag outstanding Isend */ + } + + LpanelUpdate(nsupc, nsupc, ublk_ptr, ld_ujrow, lusup, nsupr, SCT); + } + else /* non-diagonal process */ + { + /* ================================================ * + * Receive the diagonal block of U * + * for panel factorization of L(:,k) * + * note: we block for panel factorization of L(:,k) * + * but panel factorization of U(:,k) don't * + * ================================================ */ + + zRecv_UDiagBlock( k0, ublk_ptr, (nsupc * nsupc), krow, grid, SCT, tag_ub); + + if (nsupr > 0) + { + LpanelUpdate(0, nsupc, ublk_ptr, ld_ujrow, lusup, nsupr, SCT); + } + } /* end if pkk ... */ + +} /* pzgstrf2_xtrsm */ + +/***************************************************************************** + * The following functions are for the new pzgstrs2_omp in the 3D code. 
+ *****************************************************************************/ + +/* PZGSTRS2 helping kernels*/ + +int_t zTrs2_GatherU(int_t iukp, int_t rukp, int_t klst, + int_t nsupc, int_t ldu, + int_t *usub, + doublecomplex* uval, doublecomplex *tempv) +{ + doublecomplex zero = {0.0, 0.0}; + int_t ncols = 0; + for (int_t jj = iukp; jj < iukp + nsupc; ++jj) + { + int_t segsize = klst - usub[jj]; + if ( segsize ) + { + int_t lead_zero = ldu - segsize; + for (int_t i = 0; i < lead_zero; ++i) tempv[i] = zero; + tempv += lead_zero; + for (int_t i = 0; i < segsize; ++i) + tempv[i] = uval[rukp + i]; + rukp += segsize; + tempv += segsize; + ncols++; + } + } + return ncols; +} + +int_t zTrs2_ScatterU(int_t iukp, int_t rukp, int_t klst, + int_t nsupc, int_t ldu, + int_t *usub, doublecomplex* uval, doublecomplex *tempv) +{ + for (int_t jj = 0; jj < nsupc; ++jj) + { + int_t segsize = klst - usub[iukp + jj]; + if (segsize) + { + int_t lead_zero = ldu - segsize; + tempv += lead_zero; + for (int i = 0; i < segsize; ++i) + { + uval[rukp + i] = tempv[i]; + } + tempv += segsize; + rukp += segsize; + } + } /*for jj=0:nsupc */ + return 0; +} + +int_t zTrs2_GatherTrsmScatter(int_t klst, int_t iukp, int_t rukp, + int_t *usub, doublecomplex *uval, doublecomplex *tempv, + int_t knsupc, int_t nsupr, doublecomplex *lusup, + Glu_persist_t *Glu_persist) /*glupersist for xsup for supersize*/ +{ + doublecomplex alpha = {1.0, 0.0}; + int_t *xsup = Glu_persist->xsup; + // int_t iukp = Ublock_info.iukp; + // int_t rukp = Ublock_info.rukp; + int_t gb = usub[iukp]; + int_t nsupc = SuperSize (gb); + iukp += UB_DESCRIPTOR; + + // printf("klst inside task%d\n", ); + /*find ldu */ + int_t ldu = 0; + for (int_t jj = iukp; jj < iukp + nsupc; ++jj) + { + ldu = SUPERLU_MAX( klst - usub[jj], ldu) ; + } + + /*pack U block into a dense Block*/ + int_t ncols = zTrs2_GatherU(iukp, rukp, klst, nsupc, ldu, usub, + uval, tempv); + + /*now call ztrsm on packed dense block*/ + int_t luptr = (knsupc - ldu) * (nsupr + 1); + // if(ldu>nsupr) printf("nsupr %d ldu %d\n",nsupr,ldu ); + cblas_ztrsm (CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasUnit, + ldu, ncols, (void*) &alpha, &lusup[luptr], nsupr, tempv, ldu); + + /*now scatter the output into sparse U block*/ + zTrs2_ScatterU(iukp, rukp, klst, nsupc, ldu, usub, uval, tempv); + + return 0; +} + +#if 1 + +/***************************************************************************** + * The following pdgstrf2_omp is improved for KNL, since Version 5.2.0. + *****************************************************************************/ void pzgstrs2_omp -/************************************************************************/ -(int_t k0, int_t k, Glu_persist_t * Glu_persist, - gridinfo_t * grid, LocalLU_t * Llu, SuperLUStat_t * stat) +(int_t k0, int_t k, Glu_persist_t * Glu_persist, gridinfo_t * grid, + LocalLU_t * Llu, Ublock_info_t *Ublock_info, SuperLUStat_t * stat) { #ifdef PI_DEBUG printf("====Entering pzgstrs2==== \n"); @@ -360,6 +787,13 @@ void pzgstrs2_omp iukp = BR_HEADER; rukp = 0; + /* Sherry: can use the existing Ublock_info[] array, call + Trs2_InitUblock_info(); */ +#undef USE_Ublock_info +#ifdef USE_Ublock_info /** 4/19/2019 **/ + /* Loop through all the row blocks. 
to get the iukp and rukp*/ + Trs2_InitUblock_info(klst, nb, Ublock_info, usub, Glu_persist, stat ); +#else int* blocks_index_pointers = SUPERLU_MALLOC (3 * nb * sizeof(int)); int* blocks_value_pointers = blocks_index_pointers + nb; int* nsupc_temp = blocks_value_pointers + nb; @@ -372,6 +806,7 @@ void pzgstrs2_omp nsupc_temp[b] = nsupc; iukp += (UB_DESCRIPTOR + nsupc); /* move to the next block */ } +#endif // Sherry: this version is more NUMA friendly compared to pdgstrf2_v2.c // https://stackoverflow.com/questions/13065943/task-based-programming-pragma-omp-task-versus-pragma-omp-parallel-for @@ -379,11 +814,23 @@ void pzgstrs2_omp private(b,j,iukp,rukp,segsize) /* Loop through all the blocks in the row. */ for (b = 0; b < nb; ++b) { +#ifdef USE_Ublock_info + iukp = Ublock_info[b].iukp; + rukp = Ublock_info[b].rukp; +#else iukp = blocks_index_pointers[b]; rukp = blocks_value_pointers[b]; +#endif /* Loop through all the segments in the block. */ +#ifdef USE_Ublock_info + gb = usub[iukp]; + nsupc = SuperSize( gb ); + iukp += UB_DESCRIPTOR; + for (j = 0; j < nsupc; j++) { +#else for (j = 0; j < nsupc_temp[b]; j++) { +#endif segsize = klst - usub[iukp++]; if (segsize) { #pragma omp task default(shared) firstprivate(segsize,rukp) if (segsize > 30) @@ -400,14 +847,18 @@ void pzgstrs2_omp #endif } /* end task */ rukp += segsize; +#ifndef USE_Ublock_info stat->ops[FACT] += segsize * (segsize + 1); +#endif } /* end if segsize > 0 */ } /* end for j in parallel ... */ /* #pragma omp taskwait */ } /* end for b ... */ +#ifndef USE_Ublock_info /* Deallocate memory */ SUPERLU_FREE(blocks_index_pointers); +#endif #if 0 //#ifdef USE_VTUNE @@ -415,5 +866,46 @@ void pzgstrs2_omp __SSC_MARK(0x222); // stop SDE tracing #endif -} /* PZGSTRS2_omp */ +} /* pzgstrs2_omp */ + +#else /*==== new version from Piyush ====*/ + +void pzgstrs2_omp(int_t k0, int_t k, int_t* Lsub_buf, + doublecomplex *Lval_buf, Glu_persist_t *Glu_persist, + gridinfo_t *grid, LocalLU_t *Llu, SuperLUStat_t *stat, + Ublock_info_t *Ublock_info, doublecomplex *bigV, int_t ldt, SCT_t *SCT) +{ + unsigned long long t1 = _rdtsc(); + int_t *xsup = Glu_persist->xsup; + /* Quick return. */ + int_t lk = LBi (k, grid); /* Local block number */ + + if (!Llu->Unzval_br_ptr[lk]) return; + /* Initialization. */ + int_t klst = FstBlockC (k + 1); + int_t knsupc = SuperSize (k); + int_t *usub = Llu->Ufstnz_br_ptr[lk]; /* index[] of block row U(k,:) */ + doublecomplex *uval = Llu->Unzval_br_ptr[lk]; + int_t nb = usub[0]; + + int_t nsupr = Lsub_buf[1]; /* LDA of lusup[] */ + doublecomplex *lusup = Lval_buf; + + /* Loop through all the row blocks. to get the iukp and rukp*/ + Trs2_InitUbloc_info(klst, nb, Ublock_info, usub, Glu_persist, stat ); + + /* Loop through all the row blocks. */ +#pragma omp parallel for schedule(dynamic,2) + for (int_t b = 0; b < nb; ++b) + { + int_t thread_id = omp_get_thread_num(); + doublecomplex *tempv = bigV + thread_id * ldt * ldt; + zTrs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp, + usub, uval, tempv, knsupc, nsupr, lusup, Glu_persist); + } /* for b ... 
*/ + + SCT->PDGSTRS2_tl += (double) ( _rdtsc() - t1); +} /* pdgstrs2_omp new version from Piyush */ + +#endif diff --git a/SRC/superlu_ddefs.h b/SRC/superlu_ddefs.h index 07753693..12410d37 100644 --- a/SRC/superlu_ddefs.h +++ b/SRC/superlu_ddefs.h @@ -138,7 +138,7 @@ typedef struct { int_t *etree; Glu_persist_t *Glu_persist; LocalLU_t *Llu; - char dt; + char dt; } LUstruct_t; @@ -194,7 +194,6 @@ typedef struct typedef struct { - int_t *lsub; double *lusup; int_t luptr0; @@ -397,10 +396,6 @@ extern int_t pdReDistribute_B_to_X(double *B, int_t m_loc, int nrhs, int_t ldb, int_t fst_row, int_t *ilsum, double *x, ScalePermstruct_t *, Glu_persist_t *, gridinfo_t *, SOLVEstruct_t *); - // Sherry: to be removed -extern void pdgstrs_vecpar(int_t, LUstruct_t *, ScalePermstruct_t *, gridinfo_t *, - double *, int_t, int_t, int_t, int, SOLVEstruct_t *, - SuperLUStat_t *, int *); extern void dlsum_fmod(double *, double *, double *, double *, int, int, int_t , int_t *, int_t, int_t, int_t, int_t *, gridinfo_t *, LocalLU_t *, @@ -415,12 +410,12 @@ extern void dlsum_fmod_inv(double *, double *, double *, double *, int_t *, gridinfo_t *, LocalLU_t *, SuperLUStat_t **, int_t *, int_t *, int_t, int_t, int_t, int_t, int, int); extern void dlsum_fmod_inv_master(double *, double *, double *, double *, - int, int, int_t , int_t *, int_t, int_t *, - gridinfo_t *, LocalLU_t *, SuperLUStat_t **, - int_t, int_t, int_t, int_t, int, int); + int, int, int_t , int_t *, int_t, + int_t *, gridinfo_t *, LocalLU_t *, + SuperLUStat_t **, int_t, int_t, int_t, int_t, int, int); extern void dlsum_bmod_inv(double *, double *, double *, double *, - int, int_t, int_t *, int_t *, Ucb_indptr_t **, - int_t **, int_t *, gridinfo_t *, LocalLU_t *, + int, int_t, int_t *, int_t *, Ucb_indptr_t **, + int_t **, int_t *, gridinfo_t *, LocalLU_t *, SuperLUStat_t **, int_t *, int_t *, int_t, int_t, int, int); extern void dlsum_bmod_inv_master(double *, double *, double *, double *, int, int_t, int_t *, int_t *, Ucb_indptr_t **, @@ -542,18 +537,8 @@ extern void pdgssvx3d (superlu_dist_options_t *, SuperMatrix *, extern int_t pdgstrf3d(superlu_dist_options_t *, int m, int n, double anorm, trf3Dpartition_t*, SCT_t *, LUstruct_t *, gridinfo3d_t *, SuperLUStat_t *, int *); -extern int_t zSendLPanel(int_t, int_t, LUstruct_t*, gridinfo3d_t*, SCT_t*); -extern int_t zRecvLPanel(int_t, int_t, double, double, double*, - LUstruct_t*, gridinfo3d_t*, SCT_t* SCT); -extern int_t zSendUPanel(int_t, int_t, LUstruct_t*, gridinfo3d_t*, SCT_t*); -extern int_t zRecvUPanel(int_t, int_t, double, double, double*, - LUstruct_t*, gridinfo3d_t*, SCT_t*); extern void Init_HyP(HyP_t* HyP, LocalLU_t *Llu, int_t mcb, int_t mrb ); extern void Free_HyP(HyP_t* HyP); -extern void DistPrint(char* function_name, double value, char* Units, gridinfo_t* grid); -extern void DistPrint3D(char* function_name, double value, char* Units, gridinfo3d_t* grid3d); -extern void treeImbalance3D(gridinfo3d_t *grid3d, SCT_t* SCT); -extern void SCT_printComm3D(gridinfo3d_t *grid3d, SCT_t* SCT); extern int updateDirtyBit(int_t k0, HyP_t* HyP, gridinfo_t* grid); /* from scatter.h */ @@ -634,6 +619,7 @@ int_t block_gemm_scatterBottomRight( int_t lb, int_t j, SCT_t*SCT, SuperLUStat_t *stat ); + /* from gather.h */ extern void dgather_u(int_t num_u_blks, Ublock_info_t *Ublock_info, int_t * usub, double *uval, double *bigU, int_t ldu, @@ -645,7 +631,6 @@ extern void dgather_l( int_t num_LBlk, int_t knsupc, double * lval, int_t LD_lval, double * L_buff ); - /* from gather.h */ extern void 
dRgather_L(int_t k, int_t *lsub, double *lusup, gEtreeInfo_t*, Glu_persist_t *, gridinfo_t *, HyP_t *, int_t *myIperm, int_t *iperm_c_supno ); @@ -670,7 +655,7 @@ extern void dinit3DLUstructForest( int_t* myTreeIdxs, int_t* myZeroTrIdxs, sForest_t** sForests, LUstruct_t* LUstruct, gridinfo3d_t* grid3d); -extern int_t gatherAllFactoredLUFr(int_t* myZeroTrIdxs, sForest_t* sForests, +extern int_t dgatherAllFactoredLUFr(int_t* myZeroTrIdxs, sForest_t* sForests, LUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT ); @@ -697,30 +682,22 @@ extern void pdgstrs2_omp(int_t k0, int_t k, Glu_persist_t *, gridinfo_t *, LocalLU_t *, Ublock_info_t *, SuperLUStat_t *); #endif // same routine names !!!!!!!! -extern int_t LpanelUpdate(int_t off0, int_t nsupc, double* ublk_ptr, +extern int_t dLpanelUpdate(int_t off0, int_t nsupc, double* ublk_ptr, int_t ld_ujrow, double* lusup, int_t nsupr, SCT_t*); extern void Local_Dgstrf2(superlu_dist_options_t *options, int_t k, double thresh, double *BlockUFactor, Glu_persist_t *, gridinfo_t *, LocalLU_t *, SuperLUStat_t *, int *info, SCT_t*); -extern int_t Trs2_GatherU(int_t iukp, int_t rukp, int_t klst, +extern int_t dTrs2_GatherU(int_t iukp, int_t rukp, int_t klst, int_t nsupc, int_t ldu, int_t *usub, double* uval, double *tempv); -extern int_t Trs2_ScatterU(int_t iukp, int_t rukp, int_t klst, +extern int_t dTrs2_ScatterU(int_t iukp, int_t rukp, int_t klst, int_t nsupc, int_t ldu, int_t *usub, double* uval, double *tempv); -extern int_t Trs2_GatherTrsmScatter(int_t klst, int_t iukp, int_t rukp, - int_t *usub, - double* uval, double *tempv, - int_t knsupc, int_t nsupr, double*lusup, +extern int_t dTrs2_GatherTrsmScatter(int_t klst, int_t iukp, int_t rukp, + int_t *usub, double* uval, double *tempv, + int_t knsupc, int_t nsupr, double* lusup, Glu_persist_t *Glu_persist) ; -extern int_t Trs2_InitUblock_info(int_t klst, int_t nb, Ublock_info_t *, - int_t *usub, Glu_persist_t *, SuperLUStat_t*); - -extern void pdgstrs2_mpf(int_t m, int_t k0, int_t k, double *Lval_buf, - int_t nsupr, Glu_persist_t *, - gridinfo_t *, LocalLU_t *, SuperLUStat_t *, - Ublock_info_t *, double *bigV, int_t ldt, SCT_t *); extern void pdgstrs2 #ifdef _CRAY ( @@ -749,8 +726,8 @@ extern int_t dcollect3dLpanels(int_t layer, int_t nsupers, LUstruct_t * LUstruct extern int_t dcollect3dUpanels(int_t layer, int_t nsupers, LUstruct_t * LUstruct, gridinfo3d_t* grid3d); extern int_t dp3dCollect(int_t layer, int_t n, LUstruct_t * LUstruct, gridinfo3d_t* grid3d); /*zero out LU non zero entries*/ -extern int_t dzeroSetLU(int_t nnodes, int_t* nodeList , LUstruct_t *LUstruct, gridinfo3d_t* grid3d); -extern int_t AllocGlu(int_t n, int_t nsupers, LUstruct_t * LUstruct, gridinfo3d_t* grid3d); +extern int_t dzeroSetLU(int_t nnodes, int_t* nodeList , LUstruct_t *, gridinfo3d_t*); +extern int_t AllocGlu(int_t n, int_t nsupers, LUstruct_t *, gridinfo3d_t*); /* Reduces L and U panels of nodes in the List nodeList (size=nnnodes) receiver[L(nodelist)] =sender[L(nodelist)] +receiver[L(nodelist)] @@ -787,17 +764,16 @@ int_t dinit3DLUstruct( int_t* myTreeIdxs, int_t* myZeroTrIdxs, int_t* nodeCount, int_t** nodeList, LUstruct_t* LUstruct, gridinfo3d_t* grid3d); -int_t zSendLPanel(int_t k, int_t receiver, - LUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT); -int_t zRecvLPanel(int_t k, int_t sender, double alpha, double beta, - double* Lval_buf, - LUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT); - -int_t zSendUPanel(int_t k, int_t receiver, - LUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT); -int_t 
zRecvUPanel(int_t k, int_t sender, double alpha, double beta, - double* Uval_buf, - LUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT); +int_t dzSendLPanel(int_t k, int_t receiver, + LUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT); +int_t dzRecvLPanel(int_t k, int_t sender, double alpha, + double beta, double* Lval_buf, + LUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT); +int_t dzSendUPanel(int_t k, int_t receiver, + LUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT); +int_t dzRecvUPanel(int_t k, int_t sender, double alpha, + double beta, double* Uval_buf, + LUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT); /* from communication_aux.h */ extern int_t dIBcast_LPanel (int_t k, int_t k0, int_t* lsub, double* lusup, @@ -849,7 +825,7 @@ extern int_t LDiagBlockRecvWait( int_t k, int_t* factored_U, MPI_Request *, grid #if (MPI_VERSION>2) extern int_t dIBcast_UDiagBlock(int_t k, double *ublk_ptr, int_t size, MPI_Request *, gridinfo_t *); -extern int_t IBcast_LDiagBlock(int_t k, double *lblk_ptr, int_t size, +extern int_t dIBcast_LDiagBlock(int_t k, double *lblk_ptr, int_t size, MPI_Request *, gridinfo_t *); #endif @@ -888,9 +864,9 @@ extern int_t dWaitU(int_t k, int* msgcnt, MPI_Request *, MPI_Request *, extern int_t dLPanelTrSolve(int_t k, int_t* factored_L, double* BlockUFactor, gridinfo_t *, LUstruct_t *); - /* from trfAux.h */ extern int_t getNsupers(int, LUstruct_t *); +extern int_t initPackLUInfo(int_t nsupers, packLUInfo_t* packLUInfo); extern int_t dSchurComplementSetup(int_t k, int *msgcnt, Ublock_info_t*, Remain_info_t*, uPanelInfo_t *, lPanelInfo_t *, int_t*, int_t *, int_t *, @@ -914,7 +890,6 @@ extern void getSCUweight(int_t nsupers, treeList_t* treeList, LUstruct_t *, grid extern int_t dLluBufInit(LUValSubBuf_t*, LUstruct_t *); extern int_t dinitScuBufs(int_t ldt, int_t num_threads, int_t nsupers, scuBufs_t*, LUstruct_t*, gridinfo_t *); -extern int_t initPackLUInfo(int_t nsupers, packLUInfo_t* packLUInfo); // the generic tree factoring code extern int_t treeFactor( @@ -956,7 +931,7 @@ extern int_t dsparseTreeFactor( int *info ); -extern int_t denseTreeFactor( +extern int_t ddenseTreeFactor( int_t nnnodes, // number of nodes in the tree int_t *perm_c_supno, // list of nodes in the order of factorization commRequests_t *comReqs, // lists of communication requests @@ -997,84 +972,10 @@ extern int_t dsparseTreeFactor_ASYNC( extern LUValSubBuf_t** dLluBufInitArr(int_t numLA, LUstruct_t *LUstruct); extern diagFactBufs_t** dinitDiagFactBufsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* grid); extern int_t dinitDiagFactBufs(int_t ldt, diagFactBufs_t* dFBuf); -extern int_t sDiagFactIBCast(int_t k, diagFactBufs_t *dFBuf, - factStat_t *factStat, - commRequests_t *comReqs, - gridinfo_t *grid, - superlu_dist_options_t *options, - double thresh, - LUstruct_t *LUstruct, - SuperLUStat_t *stat, int *info, - SCT_t *SCT, int tag_ub); -extern int_t sLPanelUpdate( int_t k, diagFactBufs_t *dFBuf, - factStat_t *factStat, - commRequests_t *comReqs, - gridinfo_t *grid, - LUstruct_t *LUstruct, SCT_t *SCT); -extern int_t sUPanelUpdate( int_t k, - int_t ldt, - diagFactBufs_t *dFBuf, - factStat_t *factStat, - commRequests_t *comReqs, - scuBufs_t* scuBufs, - packLUInfo_t* packLUInfo, - gridinfo_t *grid, - LUstruct_t *LUstruct, - SuperLUStat_t *stat, SCT_t *SCT); -extern int_t sIBcastRecvLPanel( - int_t k, - commRequests_t *comReqs, - LUValSubBuf_t* LUvsb, - msgs_t* msgs, - factStat_t *factStat, - gridinfo_t *grid, - LUstruct_t *LUstruct, SCT_t *SCT, int tag_ub); - -extern int_t 
sIBcastRecvUPanel( - int_t k, - commRequests_t *comReqs, - LUValSubBuf_t* LUvsb, - msgs_t* msgs, - factStat_t *factStat, - gridinfo_t *grid, - LUstruct_t *LUstruct, SCT_t *SCT, int tag_ub); -extern int_t sWaitL(int_t k, - commRequests_t *comReqs, - msgs_t* msgs, - gridinfo_t *grid, - LUstruct_t *LUstruct, SCT_t *SCT); -extern int_t sWaitU(int_t k, - commRequests_t *comReqs, - msgs_t* msgs, - gridinfo_t *grid, - LUstruct_t *LUstruct, SCT_t *SCT); -extern int_t sWait_LUDiagSend(int_t k, commRequests_t *comReqs, - gridinfo_t *grid, SCT_t *SCT); -extern int_t sSchurComplementSetup(int_t k, msgs_t* msgs, - packLUInfo_t* packLUInfo, - int_t* gIperm_c_supno, int_t*perm_c_supno, - factNodelists_t* fNlists, - scuBufs_t* scuBufs, LUValSubBuf_t* LUvsb, - gridinfo_t *grid, LUstruct_t *LUstruct); extern int_t checkRecvUDiag(int_t k, commRequests_t *comReqs, - gridinfo_t *grid, SCT_t *SCT); -extern int_t sLPanelTrSolve( int_t k, diagFactBufs_t *dFBuf, - factStat_t *factStat, - commRequests_t *comReqs, - gridinfo_t *grid, - LUstruct_t *LUstruct, SCT_t *SCT); -extern int_t checkRecvLDiag(int_t k, - commRequests_t *comReqs, - gridinfo_t *grid, - SCT_t *SCT); -extern int_t sUPanelTrSolve( int_t k, - int_t ldt, - diagFactBufs_t *dFBuf, - scuBufs_t* scuBufs, - packLUInfo_t* packLUInfo, - gridinfo_t *grid, - LUstruct_t *LUstruct, - SuperLUStat_t *stat, SCT_t *SCT); + gridinfo_t *grid, SCT_t *SCT); +extern int_t checkRecvLDiag(int_t k, commRequests_t *comReqs, gridinfo_t *, SCT_t *); + /* from ancFactorization.h */ extern int_t ancestorFactor( int_t ilvl, // level of factorization diff --git a/SRC/superlu_defs.h b/SRC/superlu_defs.h index 16cee7a4..a0bfbe8b 100644 --- a/SRC/superlu_defs.h +++ b/SRC/superlu_defs.h @@ -898,6 +898,8 @@ typedef struct xtrsTimer_t double ppXmem; // perprocess X-memory } xtrsTimer_t; +/*==== For 3D code ====*/ + /*====================*/ /*********************************************************************** @@ -1093,6 +1095,11 @@ yes_no_t StdList_Empty(StdList lst); /*==== For 3D code ====*/ +extern void DistPrint(char* function_name, double value, char* Units, gridinfo_t* grid); +extern void DistPrint3D(char* function_name, double value, char* Units, gridinfo3d_t* grid3d); +extern void treeImbalance3D(gridinfo3d_t *grid3d, SCT_t* SCT); +extern void SCT_printComm3D(gridinfo3d_t *grid3d, SCT_t* SCT); + /* Manipulate counters */ extern void SCT_init(SCT_t*); extern void SCT_print(gridinfo_t *grid, SCT_t* SCT); @@ -1200,6 +1207,9 @@ extern int_t getNumLookAhead(superlu_dist_options_t*); extern commRequests_t** initCommRequestsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* grid); extern msgs_t** initMsgsArr(int_t numLA); +extern int_t Trs2_InitUblock_info(int_t klst, int_t nb, Ublock_info_t *, + int_t *usub, Glu_persist_t *, SuperLUStat_t*); + /* from sec_structs.h */ extern int Cmpfunc_R_info (const void * a, const void * b); extern int Cmpfunc_U_info (const void * a, const void * b); diff --git a/SRC/superlu_zdefs.h b/SRC/superlu_zdefs.h index cb1c231c..39494d55 100644 --- a/SRC/superlu_zdefs.h +++ b/SRC/superlu_zdefs.h @@ -62,7 +62,7 @@ typedef struct { RdTree *URtree_ptr; /* size ceil(NSUPERS/Pr) */ #if 0 int_t *Lsub_buf; /* Buffer for the remote subscripts of L */ - double *Lval_buf; /* Buffer for the remote nonzeros of L */ + doublecomplex *Lval_buf; /* Buffer for the remote nonzeros of L */ int_t *Usub_buf; /* Buffer for the remote subscripts of U */ doublecomplex *Uval_buf; /* Buffer for the remote nonzeros of U */ #endif @@ -179,6 +179,106 @@ typedef struct { } SOLVEstruct_t; 
+/*==== For 3D code ====*/ + +// new structures for pdgstrf_4_8 + +typedef struct +{ + int_t nub; + int_t klst; + int_t ldu; + int_t* usub; + doublecomplex* uval; +} uPanelInfo_t; + +typedef struct +{ + int_t *lsub; + doublecomplex *lusup; + int_t luptr0; + int_t nlb; //number of l blocks + int_t nsupr; +} lPanelInfo_t; + +typedef struct +{ + Remain_info_t *lookAhead_info, *Remain_info; + Ublock_info_t *Ublock_info, *Ublock_info_Phi; + + int_t first_l_block_acc , first_u_block_acc; + int_t last_offload ; + int_t *Lblock_dirty_bit, * Ublock_dirty_bit; + doublecomplex *lookAhead_L_buff, *Remain_L_buff; + int_t lookAheadBlk; /* number of blocks in look-ahead window */ + int_t RemainBlk ; /* number of blocks outside look-ahead window */ + int_t num_look_aheads, nsupers; + int_t ldu, ldu_Phi; + int_t num_u_blks, num_u_blks_Phi; + + int_t jj_cpu; + doublecomplex *bigU_Phi; + doublecomplex *bigU_host; + int_t Lnbrow; + int_t Rnbrow; + + int_t buffer_size; + int_t bigu_size; + int_t offloadCondition; + int_t superlu_acc_offload; + int_t nCudaStreams; +} HyP_t; /* Data structures for Schur complement update. */ + +typedef struct +{ + int_t * Lsub_buf ; + doublecomplex * Lval_buf ; + int_t * Usub_buf ; + doublecomplex * Uval_buf ; +} LUValSubBuf_t; + +int_t scuStatUpdate( + int_t knsupc, + HyP_t* HyP, + SCT_t* SCT, + SuperLUStat_t *stat + ); + +typedef struct trf3Dpartition_t +{ + gEtreeInfo_t gEtreeInfo; + int_t* iperm_c_supno; + int_t* myNodeCount; + int_t* myTreeIdxs; + int_t* myZeroTrIdxs; + int_t** treePerm; + sForest_t** sForests; + int_t* supernode2treeMap; + LUValSubBuf_t *LUvsb; +} trf3Dpartition_t; + +typedef struct +{ + doublecomplex *bigU; + doublecomplex *bigV; +} scuBufs_t; + +typedef struct +{ + doublecomplex* BlockLFactor; + doublecomplex* BlockUFactor; +} diagFactBufs_t; + +typedef struct +{ + Ublock_info_t* Ublock_info; + Remain_info_t* Remain_info; + uPanelInfo_t* uPanelInfo; + lPanelInfo_t* lPanelInfo; +} packLUInfo_t; + +/*=====================*/ + /*********************************************************************** * Function prototypes ***********************************************************************/ @@ -187,7 +287,6 @@ typedef struct { extern "C" { #endif - /* Supernodal LU factor related */ extern void zCreate_CompCol_Matrix_dist(SuperMatrix *, int_t, int_t, int_t, doublecomplex *, @@ -292,6 +391,10 @@ extern void pzgstrs_Bglobal(int_t, LUstruct_t *, gridinfo_t *, extern void pzgstrs(int_t, LUstruct_t *, ScalePermstruct_t *, gridinfo_t *, doublecomplex *, int_t, int_t, int_t, int, SOLVEstruct_t *, SuperLUStat_t *, int *); +extern int_t pzReDistribute_B_to_X(doublecomplex *B, int_t m_loc, int nrhs, int_t ldb, + int_t fst_row, int_t *ilsum, doublecomplex *x, + ScalePermstruct_t *, Glu_persist_t *, + gridinfo_t *, SOLVEstruct_t *); extern void zlsum_fmod(doublecomplex *, doublecomplex *, doublecomplex *, doublecomplex *, int, int, int_t , int_t *, int_t, int_t, int_t, int_t *, gridinfo_t *, LocalLU_t *, @@ -427,6 +530,476 @@ extern int zgeru_(int*, int*, doublecomplex*, doublecomplex*, int*, #endif +/*==== For 3D code ====*/ + +extern void pzgssvx3d (superlu_dist_options_t *, SuperMatrix *, + ScalePermstruct_t *, doublecomplex B[], int ldb, int nrhs, + gridinfo3d_t *, LUstruct_t *, SOLVEstruct_t *, + double *berr, SuperLUStat_t *, int *info); +extern int_t pzgstrf3d(superlu_dist_options_t *, int m, int n, double anorm, + trf3Dpartition_t*, SCT_t *, LUstruct_t *, + gridinfo3d_t *, SuperLUStat_t *, int *); +extern void Init_HyP(HyP_t* HyP, LocalLU_t *Llu, int_t mcb, 
int_t mrb ); +extern void Free_HyP(HyP_t* HyP); +extern int updateDirtyBit(int_t k0, HyP_t* HyP, gridinfo_t* grid); + + /* from scatter.h */ +extern void +block_gemm_scatter( int_t lb, int_t j, + Ublock_info_t *Ublock_info, + Remain_info_t *Remain_info, + doublecomplex *L_mat, int_t ldl, + doublecomplex *U_mat, int_t ldu, + doublecomplex *bigV, + // int_t jj0, + int_t knsupc, int_t klst, + int_t *lsub, int_t *usub, int_t ldt, + int_t thread_id, + int_t *indirect, + int_t *indirect2, + int_t **Lrowind_bc_ptr, doublecomplex **Lnzval_bc_ptr, + int_t **Ufstnz_br_ptr, doublecomplex **Unzval_br_ptr, + int_t *xsup, gridinfo_t *grid, + SuperLUStat_t *stat +#ifdef SCATTER_PROFILE + , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer +#endif + ); +/*this version uses a lock to prevent multiple thread updating the same block*/ +void +block_gemm_scatter_lock( int_t lb, int_t j, + omp_lock_t* lock, + Ublock_info_t *Ublock_info, + Remain_info_t *Remain_info, + doublecomplex *L_mat, int_t ldl, + doublecomplex *U_mat, int_t ldu, + doublecomplex *bigV, + // int_t jj0, + int_t knsupc, int_t klst, + int_t *lsub, int_t *usub, int_t ldt, + int_t thread_id, + int_t *indirect, + int_t *indirect2, + int_t **Lrowind_bc_ptr, doublecomplex **Lnzval_bc_ptr, + int_t **Ufstnz_br_ptr, doublecomplex **Unzval_br_ptr, + int_t *xsup, gridinfo_t *grid +#ifdef SCATTER_PROFILE + , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer +#endif + ); + +int_t block_gemm_scatterTopLeft( int_t lb, int_t j, + doublecomplex* bigV, int_t knsupc, int_t klst, int_t* lsub, + int_t * usub, int_t ldt, int_t* indirect, int_t* indirect2, + HyP_t* HyP, + LUstruct_t *LUstruct, + gridinfo_t* grid, + SCT_t*SCT, SuperLUStat_t *stat + ); +int_t block_gemm_scatterTopRight( int_t lb, int_t j, + doublecomplex* bigV, int_t knsupc, int_t klst, int_t* lsub, + int_t * usub, int_t ldt, int_t* indirect, int_t* indirect2, + HyP_t* HyP, + LUstruct_t *LUstruct, + gridinfo_t* grid, + SCT_t*SCT, SuperLUStat_t *stat + ); +int_t block_gemm_scatterBottomLeft( int_t lb, int_t j, + doublecomplex* bigV, int_t knsupc, int_t klst, int_t* lsub, + int_t * usub, int_t ldt, int_t* indirect, int_t* indirect2, + HyP_t* HyP, + LUstruct_t *LUstruct, + gridinfo_t* grid, + SCT_t*SCT, SuperLUStat_t *stat + ); +int_t block_gemm_scatterBottomRight( int_t lb, int_t j, + doublecomplex* bigV, int_t knsupc, int_t klst, int_t* lsub, + int_t * usub, int_t ldt, int_t* indirect, int_t* indirect2, + HyP_t* HyP, + LUstruct_t *LUstruct, + gridinfo_t* grid, + SCT_t*SCT, SuperLUStat_t *stat + ); + + /* from gather.h */ +extern void zgather_u(int_t num_u_blks, + Ublock_info_t *Ublock_info, int_t * usub, + doublecomplex *uval, doublecomplex *bigU, int_t ldu, + int_t *xsup, int_t klst /* for SuperSize */ + ); + +extern void zgather_l( int_t num_LBlk, int_t knsupc, + Remain_info_t *L_info, + doublecomplex * lval, int_t LD_lval, + doublecomplex * L_buff ); + +extern void zRgather_L(int_t k, int_t *lsub, doublecomplex *lusup, gEtreeInfo_t*, + Glu_persist_t *, gridinfo_t *, HyP_t *, + int_t *myIperm, int_t *iperm_c_supno ); +extern void zRgather_U(int_t k, int_t jj0, int_t *usub, doublecomplex *uval, + doublecomplex *bigU, gEtreeInfo_t*, Glu_persist_t *, + gridinfo_t *, HyP_t *, int_t *myIperm, + int_t *iperm_c_supno, int_t *perm_u); + + /* from xtrf3Dpartition.h */ +extern trf3Dpartition_t* zinitTrf3Dpartition(int_t nsupers, + superlu_dist_options_t *options, + LUstruct_t *LUstruct, gridinfo3d_t * grid3d); +extern void z3D_printMemUse(trf3Dpartition_t* trf3Dpartition, + LUstruct_t 
*LUstruct, gridinfo3d_t * grid3d); + +extern int* getLastDep(gridinfo_t *grid, SuperLUStat_t *stat, + superlu_dist_options_t *options, LocalLU_t *Llu, + int_t* xsup, int_t num_look_aheads, int_t nsupers, + int_t * iperm_c_supno); + +extern void zinit3DLUstructForest( int_t* myTreeIdxs, int_t* myZeroTrIdxs, + sForest_t** sForests, LUstruct_t* LUstruct, + gridinfo3d_t* grid3d); + +extern int_t zgatherAllFactoredLUFr(int_t* myZeroTrIdxs, sForest_t* sForests, + LUstruct_t* LUstruct, gridinfo3d_t* grid3d, + SCT_t* SCT ); + + /* The following are from pdgstrf2.h */ +#if 0 // Sherry: same routine names, but different code !!!!!!! +extern void pzgstrf2_trsm(superlu_dist_options_t *options, int_t, int_t, + int_t k, double thresh, Glu_persist_t *, + gridinfo_t *, LocalLU_t *, MPI_Request *U_diag_blk_send_req, + int tag_ub, SuperLUStat_t *, int *info, SCT_t *); +#ifdef _CRAY +void pzgstrs2_omp (int_t, int_t, int_t, Glu_persist_t *, gridinfo_t *, + LocalLU_t *, SuperLUStat_t *, _fcd, _fcd, _fcd); +#else +void pzgstrs2_omp (int_t, int_t, int_t, int_t *, doublecomplex*, Glu_persist_t *, gridinfo_t *, + LocalLU_t *, SuperLUStat_t *, Ublock_info_t *, doublecomplex *bigV, int_t ldt, SCT_t *SCT ); +#endif + +#else +extern void pzgstrf2_trsm(superlu_dist_options_t * options, int_t k0, int_t k, + double thresh, Glu_persist_t *, gridinfo_t *, + LocalLU_t *, MPI_Request *, int tag_ub, + SuperLUStat_t *, int *info); +extern void pzgstrs2_omp(int_t k0, int_t k, Glu_persist_t *, gridinfo_t *, + LocalLU_t *, Ublock_info_t *, SuperLUStat_t *); +#endif // same routine names !!!!!!!! + +extern int_t zLpanelUpdate(int_t off0, int_t nsupc, doublecomplex* ublk_ptr, + int_t ld_ujrow, doublecomplex* lusup, int_t nsupr, SCT_t*); +extern void Local_Zgstrf2(superlu_dist_options_t *options, int_t k, + double thresh, doublecomplex *BlockUFactor, Glu_persist_t *, + gridinfo_t *, LocalLU_t *, + SuperLUStat_t *, int *info, SCT_t*); +extern int_t zTrs2_GatherU(int_t iukp, int_t rukp, int_t klst, + int_t nsupc, int_t ldu, int_t *usub, + doublecomplex* uval, doublecomplex *tempv); +extern int_t zTrs2_ScatterU(int_t iukp, int_t rukp, int_t klst, + int_t nsupc, int_t ldu, int_t *usub, + doublecomplex* uval, doublecomplex *tempv); +extern int_t zTrs2_GatherTrsmScatter(int_t klst, int_t iukp, int_t rukp, + int_t *usub, doublecomplex* uval, doublecomplex *tempv, + int_t knsupc, int_t nsupr, doublecomplex* lusup, + Glu_persist_t *Glu_persist) ; +extern void pzgstrs2 +#ifdef _CRAY +( + int_t m, int_t k0, int_t k, Glu_persist_t *Glu_persist, gridinfo_t *grid, + LocalLU_t *Llu, SuperLUStat_t *stat, _fcd ftcs1, _fcd ftcs2, _fcd ftcs3 +); +#else +( + int_t m, int_t k0, int_t k, Glu_persist_t *Glu_persist, gridinfo_t *grid, + LocalLU_t *Llu, SuperLUStat_t *stat +); +#endif + +extern void pzgstrf2(superlu_dist_options_t *, int_t nsupers, int_t k0, + int_t k, double thresh, Glu_persist_t *, gridinfo_t *, + LocalLU_t *, MPI_Request *, int, SuperLUStat_t *, int *); + + /* from p3dcomm.h */ +extern int_t zAllocLlu(int_t nsupers, LUstruct_t * LUstruct, gridinfo3d_t* grid3d); +extern int_t zp3dScatter(int_t n, LUstruct_t * LUstruct, gridinfo3d_t* grid3d); +extern int_t zscatter3dLPanels(int_t nsupers, + LUstruct_t * LUstruct, gridinfo3d_t* grid3d); +extern int_t zscatter3dUPanels(int_t nsupers, + LUstruct_t * LUstruct, gridinfo3d_t* grid3d); +extern int_t zcollect3dLpanels(int_t layer, int_t nsupers, LUstruct_t * LUstruct, gridinfo3d_t* grid3d); +extern int_t zcollect3dUpanels(int_t layer, int_t nsupers, LUstruct_t * LUstruct, gridinfo3d_t* grid3d); 
+extern int_t zp3dCollect(int_t layer, int_t n, LUstruct_t * LUstruct, gridinfo3d_t* grid3d); +/*zero out LU non zero entries*/ +extern int_t zzeroSetLU(int_t nnodes, int_t* nodeList , LUstruct_t *, gridinfo3d_t*); +extern int_t AllocGlu(int_t n, int_t nsupers, LUstruct_t *, gridinfo3d_t*); + +/* Reduces L and U panels of nodes in the List nodeList (size=nnnodes) +receiver[L(nodelist)] =sender[L(nodelist)] +receiver[L(nodelist)] +receiver[U(nodelist)] =sender[U(nodelist)] +receiver[U(nodelist)] +*/ +int_t zreduceAncestors3d(int_t sender, int_t receiver, + int_t nnodes, int_t* nodeList, + doublecomplex* Lval_buf, doublecomplex* Uval_buf, + LUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT); +/*reduces all nodelists required in a level*/ +int_t zreduceAllAncestors3d(int_t ilvl, int_t* myNodeCount, + int_t** treePerm, + LUValSubBuf_t*LUvsb, + LUstruct_t* LUstruct, + gridinfo3d_t* grid3d, + SCT_t* SCT ); +/* + Copies factored L and U panels from sender grid to receiver grid + receiver[L(nodelist)] <-- sender[L(nodelist)]; + receiver[U(nodelist)] <-- sender[U(nodelist)]; +*/ +int_t zgatherFactoredLU(int_t sender, int_t receiver, + int_t nnodes, int_t *nodeList, LUValSubBuf_t*LUvsb, + LUstruct_t* LUstruct, gridinfo3d_t* grid3d,SCT_t* SCT ); + +/*Gathers all the L and U factors to grid 0 for solve stage + By repeatidly calling above function*/ +int_t zgatherAllFactoredLU(trf3Dpartition_t* trf3Dpartition, LUstruct_t* LUstruct, + gridinfo3d_t* grid3d, SCT_t* SCT ); + +/*Distributes data in each layer and initilizes ancestors + as zero in required nodes*/ +int_t zinit3DLUstruct( int_t* myTreeIdxs, int_t* myZeroTrIdxs, + int_t* nodeCount, int_t** nodeList, + LUstruct_t* LUstruct, gridinfo3d_t* grid3d); + +int_t zzSendLPanel(int_t k, int_t receiver, + LUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT); +int_t zzRecvLPanel(int_t k, int_t sender, doublecomplex alpha, + doublecomplex beta, doublecomplex* Lval_buf, + LUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT); +int_t zzSendUPanel(int_t k, int_t receiver, + LUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT); +int_t zzRecvUPanel(int_t k, int_t sender, doublecomplex alpha, + doublecomplex beta, doublecomplex* Uval_buf, + LUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT); + + /* from communication_aux.h */ +extern int_t zIBcast_LPanel (int_t k, int_t k0, int_t* lsub, doublecomplex* lusup, + gridinfo_t *, int* msgcnt, MPI_Request *, + int_t **ToSendR, int_t *xsup, int ); +extern int_t zBcast_LPanel(int_t k, int_t k0, int_t* lsub, doublecomplex* lusup, + gridinfo_t *, int* msgcnt, int_t **ToSendR, + int_t *xsup , SCT_t*, int); +extern int_t zIBcast_UPanel(int_t k, int_t k0, int_t* usub, doublecomplex* uval, + gridinfo_t *, int* msgcnt, MPI_Request *, + int_t *ToSendD, int ); +extern int_t zBcast_UPanel(int_t k, int_t k0, int_t* usub, doublecomplex* uval, + gridinfo_t *, int* msgcnt, int_t *ToSendD, SCT_t*, int); +extern int_t zIrecv_LPanel (int_t k, int_t k0, int_t* Lsub_buf, + doublecomplex* Lval_buf, gridinfo_t *, + MPI_Request *, LocalLU_t *, int); +extern int_t zIrecv_UPanel(int_t k, int_t k0, int_t* Usub_buf, doublecomplex*, + LocalLU_t *, gridinfo_t*, MPI_Request *, int); +extern int_t Wait_LSend(int_t k, gridinfo_t *grid, int_t **ToSendR, + MPI_Request *s, SCT_t*); +extern int_t Wait_USend(MPI_Request *, gridinfo_t *, SCT_t *); +extern int_t zWait_URecv(MPI_Request *, int* msgcnt, SCT_t *); +extern int_t Check_LRecv(MPI_Request*, int* msgcnt); +extern int_t zWait_LRecv(MPI_Request*, int* msgcnt, int* msgcntsU, + 
gridinfo_t *, SCT_t*); +extern int_t zISend_UDiagBlock(int_t k0, doublecomplex *ublk_ptr, int_t size, + MPI_Request *, gridinfo_t *, int); +extern int_t zRecv_UDiagBlock(int_t k0, doublecomplex *ublk_ptr, int_t size, + int_t src, gridinfo_t *, SCT_t*, int); +extern int_t Wait_UDiagBlockSend(MPI_Request *, gridinfo_t *, SCT_t *); +extern int_t Wait_LDiagBlockSend(MPI_Request *, gridinfo_t *, SCT_t *); +extern int_t zPackLBlock(int_t k, doublecomplex* Dest, Glu_persist_t *, + gridinfo_t *, LocalLU_t *); +extern int_t zISend_LDiagBlock(int_t k0, doublecomplex *lblk_ptr, int_t size, + MPI_Request *, gridinfo_t *, int); +extern int_t zIRecv_UDiagBlock(int_t k0, doublecomplex *ublk_ptr, int_t size, + int_t src, MPI_Request *, gridinfo_t *, + SCT_t*, int); +extern int_t Wait_UDiagBlock_Recv(MPI_Request *, SCT_t *); +extern int_t Test_UDiagBlock_Recv(MPI_Request *, SCT_t *); +extern int_t zIRecv_LDiagBlock(int_t k0, doublecomplex *L_blk_ptr, int_t size, + int_t src, MPI_Request *, gridinfo_t*, SCT_t*, int); +extern int_t Wait_LDiagBlock_Recv(MPI_Request *, SCT_t *); +extern int_t Test_LDiagBlock_Recv(MPI_Request *, SCT_t *); + +extern int_t zUDiagBlockRecvWait( int_t k, int_t* IrecvPlcd_D, int_t* factored_L, + MPI_Request *, gridinfo_t *, LUstruct_t *, SCT_t *); +extern int_t LDiagBlockRecvWait( int_t k, int_t* factored_U, MPI_Request *, gridinfo_t *); +#if (MPI_VERSION>2) +extern int_t zIBcast_UDiagBlock(int_t k, doublecomplex *ublk_ptr, int_t size, + MPI_Request *, gridinfo_t *); +extern int_t zIBcast_LDiagBlock(int_t k, doublecomplex *lblk_ptr, int_t size, + MPI_Request *, gridinfo_t *); +#endif + + /* from trfCommWrapper.h */ +extern int_t zDiagFactIBCast(int_t k, int_t k0, + doublecomplex *BlockUFactor, doublecomplex *BlockLFactor, + int_t* IrecvPlcd_D, MPI_Request *, MPI_Request *, + MPI_Request *, MPI_Request *, gridinfo_t *, + superlu_dist_options_t *, double thresh, + LUstruct_t *LUstruct, SuperLUStat_t *, int *info, + SCT_t *, int tag_ub); +extern int_t zUPanelTrSolve( int_t k, doublecomplex* BlockLFactor, doublecomplex* bigV, + int_t ldt, Ublock_info_t*, gridinfo_t *, + LUstruct_t *, SuperLUStat_t *, SCT_t *); +extern int_t Wait_LUDiagSend(int_t k, MPI_Request *, MPI_Request *, + gridinfo_t *, SCT_t *); +extern int_t zLPanelUpdate(int_t k, int_t* IrecvPlcd_D, int_t* factored_L, + MPI_Request *, doublecomplex* BlockUFactor, gridinfo_t *, + LUstruct_t *, SCT_t *); +extern int_t zUPanelUpdate(int_t k, int_t* factored_U, MPI_Request *, + doublecomplex* BlockLFactor, doublecomplex* bigV, + int_t ldt, Ublock_info_t*, gridinfo_t *, + LUstruct_t *, SuperLUStat_t *, SCT_t *); +extern int_t zIBcastRecvLPanel(int_t k, int_t k0, int* msgcnt, + MPI_Request *, MPI_Request *, + int_t* Lsub_buf, doublecomplex* Lval_buf, + int_t * factored, gridinfo_t *, LUstruct_t *, + SCT_t *, int tag_ub); +extern int_t zIBcastRecvUPanel(int_t k, int_t k0, int* msgcnt, MPI_Request *, + MPI_Request *, int_t* Usub_buf, doublecomplex* Uval_buf, + gridinfo_t *, LUstruct_t *, SCT_t *, int tag_ub); +extern int_t zWaitL(int_t k, int* msgcnt, int* msgcntU, MPI_Request *, + MPI_Request *, gridinfo_t *, LUstruct_t *, SCT_t *); +extern int_t zWaitU(int_t k, int* msgcnt, MPI_Request *, MPI_Request *, + gridinfo_t *, LUstruct_t *, SCT_t *); +extern int_t zLPanelTrSolve(int_t k, int_t* factored_L, doublecomplex* BlockUFactor, + gridinfo_t *, LUstruct_t *); + + /* from trfAux.h */ +extern int_t getNsupers(int, LUstruct_t *); +extern int_t initPackLUInfo(int_t nsupers, packLUInfo_t* packLUInfo); +extern int_t 
zSchurComplementSetup(int_t k, int *msgcnt, Ublock_info_t*, + Remain_info_t*, uPanelInfo_t *, + lPanelInfo_t *, int_t*, int_t *, int_t *, + doublecomplex *bigU, int_t* Lsub_buf, + doublecomplex* Lval_buf, int_t* Usub_buf, + doublecomplex* Uval_buf, gridinfo_t *, LUstruct_t *); +extern int_t zSchurComplementSetupGPU(int_t k, msgs_t* msgs, packLUInfo_t*, + int_t*, int_t*, int_t*, gEtreeInfo_t*, + factNodelists_t*, scuBufs_t*, + LUValSubBuf_t* LUvsb, gridinfo_t *, + LUstruct_t *, HyP_t*); +extern doublecomplex* zgetBigV(int_t, int_t); +extern doublecomplex* zgetBigU(int_t, gridinfo_t *, LUstruct_t *); +extern int_t getBigUSize(int_t, gridinfo_t *, LUstruct_t *); +// permutation from superLU default +extern int_t* getPerm_c_supno(int_t nsupers, superlu_dist_options_t *, + LUstruct_t *, gridinfo_t *); +extern void getSCUweight(int_t nsupers, treeList_t* treeList, LUstruct_t *, gridinfo3d_t *); + + /* from treeFactorization.h */ +extern int_t zLluBufInit(LUValSubBuf_t*, LUstruct_t *); +extern int_t zinitScuBufs(int_t ldt, int_t num_threads, int_t nsupers, + scuBufs_t*, LUstruct_t*, gridinfo_t *); + +// the generic tree factoring code +extern int_t treeFactor( + int_t nnnodes, // number of nodes in the tree + int_t *perm_c_supno, // list of nodes in the order of factorization + commRequests_t *comReqs, // lists of communication requests + scuBufs_t *scuBufs, // contains buffers for schur complement update + packLUInfo_t*packLUInfo, + msgs_t*msgs, + LUValSubBuf_t*LUvsb, + diagFactBufs_t *dFBuf, + factStat_t *factStat, + factNodelists_t *fNlists, + superlu_dist_options_t *options, + int_t * gIperm_c_supno, + int_t ldt, + LUstruct_t *LUstruct, gridinfo3d_t * grid3d, SuperLUStat_t *stat, + double thresh, SCT_t *SCT, + int *info +); + +extern int_t zsparseTreeFactor( + int_t nnodes, // number of nodes in the tree + int_t *perm_c_supno, // list of nodes in the order of factorization + treeTopoInfo_t* treeTopoInfo, + commRequests_t *comReqs, // lists of communication requests + scuBufs_t *scuBufs, // contains buffers for schur complement update + packLUInfo_t*packLUInfo, + msgs_t*msgs, + LUValSubBuf_t*LUvsb, + diagFactBufs_t *dFBuf, + factStat_t *factStat, + factNodelists_t *fNlists, + superlu_dist_options_t *options, + int_t * gIperm_c_supno, + int_t ldt, + LUstruct_t *LUstruct, gridinfo3d_t * grid3d, SuperLUStat_t *stat, + double thresh, SCT_t *SCT, + int *info +); + +extern int_t zdenseTreeFactor( + int_t nnnodes, // number of nodes in the tree + int_t *perm_c_supno, // list of nodes in the order of factorization + commRequests_t *comReqs, // lists of communication requests + scuBufs_t *scuBufs, // contains buffers for schur complement update + packLUInfo_t*packLUInfo, + msgs_t*msgs, + LUValSubBuf_t*LUvsb, + diagFactBufs_t *dFBuf, + factStat_t *factStat, + factNodelists_t *fNlists, + superlu_dist_options_t *options, + int_t * gIperm_c_supno, + int_t ldt, + LUstruct_t *LUstruct, gridinfo3d_t * grid3d, SuperLUStat_t *stat, + double thresh, SCT_t *SCT, int tag_ub, + int *info +); + +extern int_t zsparseTreeFactor_ASYNC( + sForest_t* sforest, + commRequests_t **comReqss, // lists of communication requests // size maxEtree level + scuBufs_t *scuBufs, // contains buffers for schur complement update + packLUInfo_t*packLUInfo, + msgs_t**msgss, // size=num Look ahead + LUValSubBuf_t**LUvsbs, // size=num Look ahead + diagFactBufs_t **dFBufs, // size maxEtree level + factStat_t *factStat, + factNodelists_t *fNlists, + gEtreeInfo_t* gEtreeInfo, // global etree info + superlu_dist_options_t *options, + int_t * 
gIperm_c_supno, + int_t ldt, + HyP_t* HyP, + LUstruct_t *LUstruct, gridinfo3d_t * grid3d, SuperLUStat_t *stat, + double thresh, SCT_t *SCT, int tag_ub, + int *info +); +extern LUValSubBuf_t** zLluBufInitArr(int_t numLA, LUstruct_t *LUstruct); +extern diagFactBufs_t** zinitDiagFactBufsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* grid); +extern int_t zinitDiagFactBufs(int_t ldt, diagFactBufs_t* dFBuf); +extern int_t checkRecvUDiag(int_t k, commRequests_t *comReqs, + gridinfo_t *grid, SCT_t *SCT); +extern int_t checkRecvLDiag(int_t k, commRequests_t *comReqs, gridinfo_t *, SCT_t *); + + /* from ancFactorization.h */ +extern int_t ancestorFactor( + int_t ilvl, // level of factorization + sForest_t* sforest, + commRequests_t **comReqss, // lists of communication requests // size maxEtree level + scuBufs_t *scuBufs, // contains buffers for schur complement update + packLUInfo_t*packLUInfo, + msgs_t**msgss, // size=num Look ahead + LUValSubBuf_t**LUvsbs, // size=num Look ahead + diagFactBufs_t **dFBufs, // size maxEtree level + factStat_t *factStat, + factNodelists_t *fNlists, + gEtreeInfo_t* gEtreeInfo, // global etree info + superlu_dist_options_t *options, + int_t * gIperm_c_supno, + int_t ldt, + HyP_t* HyP, + LUstruct_t *LUstruct, gridinfo3d_t * grid3d, SuperLUStat_t *stat, + double thresh, SCT_t *SCT, int tag_ub, int *info +); + +/*=====================*/ #ifdef __cplusplus } diff --git a/SRC/treeFactorization.c b/SRC/treeFactorization.c index 9c242582..260f275e 100644 --- a/SRC/treeFactorization.c +++ b/SRC/treeFactorization.c @@ -15,6 +15,7 @@ at the top-level directory. #include "trfCommWrapper.h" #endif +#if 0 /******** Sherry: Remove extra layer of function calls. *******/ int_t sDiagFactIBCast(int_t k, diagFactBufs_t *dFBuf, factStat_t *factStat, commRequests_t *comReqs, @@ -180,6 +181,38 @@ int_t sSchurComplementSetup(int_t k, msgs_t* msgs, bigU, Lsub_buf, Lval_buf, Usub_buf, Uval_buf, grid, LUstruct); } +int_t sLPanelTrSolve( int_t k, diagFactBufs_t *dFBuf, + factStat_t *factStat, + commRequests_t *comReqs, + gridinfo_t *grid, + LUstruct_t *LUstruct, SCT_t *SCT) +{ + int_t * factored_L = factStat->factored_L; + double * BlockUFactor = dFBuf->BlockUFactor; + dLPanelTrSolve( k, factored_L, BlockUFactor, grid, LUstruct); + + return 0; +} + +int_t sUPanelTrSolve( int_t k, + int_t ldt, + diagFactBufs_t *dFBuf, + scuBufs_t* scuBufs, + packLUInfo_t* packLUInfo, + gridinfo_t *grid, + LUstruct_t *LUstruct, + SuperLUStat_t *stat, SCT_t *SCT) +{ + double* bigV = scuBufs->bigV; + Ublock_info_t* Ublock_info = packLUInfo->Ublock_info; + double * BlockLFactor = dFBuf->BlockLFactor; + + dUPanelTrSolve( k, BlockLFactor, bigV, ldt, Ublock_info, grid, LUstruct, stat, SCT); + return 0; +} + +#endif /******** End removing extra layer of function calls. 
*******/ + int_t initCommRequests(commRequests_t* comReqs, gridinfo_t * grid) { @@ -310,19 +343,6 @@ int_t checkRecvUDiag(int_t k, commRequests_t *comReqs, return 1; } -int_t sLPanelTrSolve( int_t k, diagFactBufs_t *dFBuf, - factStat_t *factStat, - commRequests_t *comReqs, - gridinfo_t *grid, - LUstruct_t *LUstruct, SCT_t *SCT) -{ - int_t * factored_L = factStat->factored_L; - double * BlockUFactor = dFBuf->BlockUFactor; - dLPanelTrSolve( k, factored_L, BlockUFactor, grid, LUstruct); - - return 0; -} - int_t checkRecvLDiag(int_t k, commRequests_t *comReqs, gridinfo_t *grid, @@ -347,20 +367,4 @@ int_t checkRecvLDiag(int_t k, return 1; } -int_t sUPanelTrSolve( int_t k, - int_t ldt, - diagFactBufs_t *dFBuf, - scuBufs_t* scuBufs, - packLUInfo_t* packLUInfo, - gridinfo_t *grid, - LUstruct_t *LUstruct, - SuperLUStat_t *stat, SCT_t *SCT) -{ - double* bigV = scuBufs->bigV; - Ublock_info_t* Ublock_info = packLUInfo->Ublock_info; - double * BlockLFactor = dFBuf->BlockLFactor; - - dUPanelTrSolve( k, BlockLFactor, bigV, ldt, Ublock_info, grid, LUstruct, stat, SCT); - return 0; -} diff --git a/SRC/trfAux.c b/SRC/trfAux.c index f9737f1c..f9310eee 100644 --- a/SRC/trfAux.c +++ b/SRC/trfAux.c @@ -16,6 +16,7 @@ at the top-level directory. #include "trfAux.h" #endif + int_t getslu25D_enabled() { if ( getenv("SLU25D") != NULL) @@ -334,7 +335,7 @@ int_t* getPerm_c_supno(int_t nsupers, k = BC_HEADER; krow = PROW( jb, grid ); if ( krow == myrow ) /* skip the diagonal block */ - { + { k += LB_DESCRIPTOR + index[k + 1]; i--; } @@ -1219,3 +1220,36 @@ void getSCUweight(int_t nsupers, treeList_t* treeList, SUPERLU_FREE(perm_u); } /* getSCUweight */ + +int_t Trs2_InitUblock_info(int_t klst, int_t nb, + Ublock_info_t *Ublock_info, + int_t *usub, + Glu_persist_t *Glu_persist, SuperLUStat_t *stat ) +{ + int_t *xsup = Glu_persist->xsup; + int_t iukp, rukp; + iukp = BR_HEADER; + rukp = 0; + + for (int_t b = 0; b < nb; ++b) + { + int_t gb = usub[iukp]; + int_t nsupc = SuperSize (gb); + + Ublock_info[b].iukp = iukp; + Ublock_info[b].rukp = rukp; + // Ublock_info[b].nsupc = nsupc; + + iukp += UB_DESCRIPTOR; + /* Sherry: can remove this loop for rukp + rukp += usub[iukp-1]; + */ + for (int_t j = 0; j < nsupc; ++j) + { + int_t segsize = klst - usub[iukp++]; + rukp += segsize; + stat->ops[FACT] += segsize * (segsize + 1); + } + } + return 0; +} diff --git a/SRC/zcommunication_aux.c b/SRC/zcommunication_aux.c index 0e3961c7..3f10c72e 100644 --- a/SRC/zcommunication_aux.c +++ b/SRC/zcommunication_aux.c @@ -308,7 +308,7 @@ int_t zRecv_UDiagBlock(int_t k0, doublecomplex *ublk_ptr, /*pointer for the diag } -int_t zPackLBlock(int_t k, double* Dest, Glu_persist_t *Glu_persist, +int_t zPackLBlock(int_t k, doublecomplex* Dest, Glu_persist_t *Glu_persist, gridinfo_t *grid, LocalLU_t *Llu) /*Copies src matrix into dest matrix*/ { @@ -406,7 +406,7 @@ int_t zIRecv_LDiagBlock(int_t k0, doublecomplex *L_blk_ptr, /*pointer for the di #if (MPI_VERSION>2) /****Ibcast based on mpi ibcast****/ -int_t zIBcast_UDiagBlock(int_t k, double *ublk_ptr, /*pointer for the diagonal block*/ +int_t zIBcast_UDiagBlock(int_t k, doublecomplex *ublk_ptr, /*pointer for the diagonal block*/ int_t size, /*number of elements to be broadcasted*/ MPI_Request *L_diag_blk_ibcast_req, gridinfo_t * grid) diff --git a/SRC/zgather.c b/SRC/zgather.c index f2745899..6cb9fab8 100644 --- a/SRC/zgather.c +++ b/SRC/zgather.c @@ -40,7 +40,8 @@ void zgather_u(int_t num_u_blks, { // return; // private(j,iukp,rukp,tempu, jb, nsupc,ljb,segsize,lead_zero, \ - // jj, i) \ + // jj, i) + 
doublecomplex zero = {0.0, 0.0}; #pragma omp parallel for default (shared) schedule(dynamic) for (int_t j = 0; j < num_u_blks; ++j) @@ -60,7 +61,7 @@ void zgather_u(int_t num_u_blks, if ( segsize ) { int_t lead_zero = ldu - segsize; - for (int_t i = 0; i < lead_zero; ++i) tempu[i] = 0.0; + for (int_t i = 0; i < lead_zero; ++i) tempu[i] = zero; tempu += lead_zero; for (int_t i = 0; i < segsize; ++i) { diff --git a/SRC/ztreeFactorization.c b/SRC/ztreeFactorization.c index 6f07559f..b80c8b0b 100644 --- a/SRC/ztreeFactorization.c +++ b/SRC/ztreeFactorization.c @@ -19,9 +19,9 @@ int_t zLluBufInit(LUValSubBuf_t* LUvsb, LUstruct_t *LUstruct) { LocalLU_t *Llu = LUstruct->Llu; LUvsb->Lsub_buf = intMalloc_dist(Llu->bufmax[0]); //INT_T_ALLOC(Llu->bufmax[0]); - LUvsb->Lval_buf = doubleMalloc_dist(Llu->bufmax[1]); //DOUBLE_ALLOC(Llu->bufmax[1]); + LUvsb->Lval_buf = doublecomplexMalloc_dist(Llu->bufmax[1]); //DOUBLE_ALLOC(Llu->bufmax[1]); LUvsb->Usub_buf = intMalloc_dist(Llu->bufmax[2]); //INT_T_ALLOC(Llu->bufmax[2]); - LUvsb->Uval_buf = doubleMalloc_dist(Llu->bufmax[3]); //DOUBLE_ALLOC(Llu->bufmax[3]); + LUvsb->Uval_buf = doublecomplexMalloc_dist(Llu->bufmax[3]); //DOUBLE_ALLOC(Llu->bufmax[3]); return 0; } @@ -60,14 +60,14 @@ int_t zinitScuBufs(int_t ldt, int_t num_threads, int_t nsupers, LUstruct_t* LUstruct, gridinfo_t * grid) { - scuBufs->bigV = dgetBigV(ldt, num_threads); - scuBufs->bigU = dgetBigU(nsupers, grid, LUstruct); + scuBufs->bigV = zgetBigV(ldt, num_threads); + scuBufs->bigU = zgetBigU(nsupers, grid, LUstruct); return 0; } -int_t dinitDiagFactBufs(int_t ldt, diagFactBufs_t* dFBuf) +int_t zinitDiagFactBufs(int_t ldt, diagFactBufs_t* dFBuf) { - dFBuf->BlockUFactor = doubleMalloc_dist(ldt * ldt); //DOUBLE_ALLOC( ldt * ldt); - dFBuf->BlockLFactor = doubleMalloc_dist(ldt * ldt); //DOUBLE_ALLOC( ldt * ldt); + dFBuf->BlockUFactor = doublecomplexMalloc_dist(ldt * ldt); //DOUBLE_ALLOC( ldt * ldt); + dFBuf->BlockLFactor = doublecomplexMalloc_dist(ldt * ldt); //DOUBLE_ALLOC( ldt * ldt); return 0; } @@ -415,7 +415,7 @@ int_t zsparseTreeFactor_ASYNC( comReqss[offset]->recv_requ, grid, LUstruct, SCT); #endif double tsch = SuperLU_timer_(); - int_t LU_nonempty = dSchurComplementSetupGPU(k, + int_t LU_nonempty = zSchurComplementSetupGPU(k, msgss[offset], packLUInfo, myIperm, gIperm_c_supno, perm_c_supno, gEtreeInfo, diff --git a/SRC/ztrfAux.c b/SRC/ztrfAux.c index d7211f30..1e8c2610 100644 --- a/SRC/ztrfAux.c +++ b/SRC/ztrfAux.c @@ -425,10 +425,10 @@ int_t zSchurComplementSetupGPU( HyP->num_u_blks_Phi=0; HyP->num_u_blks=0; - dRgather_L(k, lsub, lusup, gEtreeInfo, Glu_persist, grid, HyP, myIperm, iperm_c_supno); + zRgather_L(k, lsub, lusup, gEtreeInfo, Glu_persist, grid, HyP, myIperm, iperm_c_supno); if (HyP->Lnbrow + HyP->Rnbrow > 0) { - dRgather_U( k, 0, usub, uval, bigU, gEtreeInfo, Glu_persist, grid, HyP, myIperm, iperm_c_supno, perm_u); + zRgather_U( k, 0, usub, uval, bigU, gEtreeInfo, Glu_persist, grid, HyP, myIperm, iperm_c_supno, perm_u); }/*if(nbrow>0) */ } diff --git a/SRC/ztrfCommWrapper.c b/SRC/ztrfCommWrapper.c index 9baeba42..7f0bea68 100644 --- a/SRC/ztrfCommWrapper.c +++ b/SRC/ztrfCommWrapper.c @@ -89,12 +89,12 @@ int_t zDiagFactIBCast(int_t k, int_t k0, // supernode to be factored // printf("Entering factorization %d\n", k); // int_t offset = (k0 - k_st); // offset is input /*factorize A[kk]*/ - Local_Dgstrf2(options, k, thresh, + Local_Zgstrf2(options, k, thresh, BlockUFactor, /*factored U is over writen here*/ Glu_persist, grid, Llu, stat, info, SCT); /*Pack L[kk] into blockLfactor*/ - 
dPackLBlock(k, BlockLFactor, Glu_persist, grid, Llu); + zPackLBlock(k, BlockLFactor, Glu_persist, grid, Llu); /*Isend U blocks to the process row*/ int_t nsupc = SuperSize(k); @@ -115,6 +115,7 @@ int_t zLPanelTrSolve( int_t k, int_t* factored_L, gridinfo_t *grid, LUstruct_t *LUstruct) { + doublecomplex alpha = {1.0, 0.0}; Glu_persist_t *Glu_persist = LUstruct->Glu_persist; LocalLU_t *Llu = LUstruct->Llu; int_t* xsup = Glu_persist->xsup; @@ -161,8 +162,8 @@ int_t zLPanelTrSolve( int_t k, int_t* factored_L, int_t off = i * BL; // Sherry: int_t len = MY_MIN(BL, l - i * BL); int_t len = SUPERLU_MIN(BL, l - i * BL); - cblas_dtrsm (CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, - len, nsupc, 1.0, ublk_ptr, ld_ujrow, &lusup[off], nsupr); + cblas_ztrsm (CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, + len, nsupc, (void*) &alpha, ublk_ptr, ld_ujrow, &lusup[off], nsupr); } } } @@ -198,8 +199,9 @@ int_t zLPanelTrSolve( int_t k, int_t* factored_L, int_t len = SUPERLU_MIN(BL, (l - i * BL)); #pragma omp task { - cblas_dtrsm (CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, - len, nsupc, 1.0, ublk_ptr, ld_ujrow, &lusup[nsupc + off], nsupr); + cblas_ztrsm (CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, + len, nsupc, (void*) &alpha, ublk_ptr, ld_ujrow, &lusup[nsupc + off], nsupr); + } } } @@ -269,7 +271,7 @@ int_t zUPanelTrSolve( int_t k, { int_t thread_id = omp_get_thread_num(); doublecomplex *tempv = bigV + thread_id * ldt * ldt; - Trs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp, + zTrs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp, usub, uval, tempv, nsupc, nsupc, lusup, Glu_persist); } } @@ -316,7 +318,7 @@ int_t zUPanelTrSolve( int_t k, { int_t thread_id = omp_get_thread_num(); doublecomplex *tempv = bigV + thread_id * ldt * ldt; - Trs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp, + zTrs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp, usub, uval, tempv, nsupc, nsupr, lusup, Glu_persist); } From edb5287751d99b0f99c53484bebd3d517adb98c2 Mon Sep 17 00:00:00 2001 From: Sherry Li Date: Thu, 6 Jun 2019 14:54:54 -0700 Subject: [PATCH 005/147] Added scatter.c --- EXAMPLE/pddrive3d.c | 2 +- SRC/Makefile | 1 - SRC/dcommunication_aux.c | 8 + SRC/dgather.c | 8 + SRC/dtreeFactorization.c | 8 + SRC/dtrfAux.c | 9 + SRC/dtrfCommWrapper.c | 9 + SRC/pd3dcomm.c | 8 + SRC/pdgssvx3d.c | 13 +- SRC/pdgstrf.c | 2 +- SRC/pdgstrf2.c | 2 +- SRC/pdgstrf3d.c | 9 + SRC/pz3dcomm.c | 8 + SRC/pzgssvx3d.c | 13 +- SRC/pzgstrf.c | 2 +- SRC/pzgstrf2.c | 2 +- SRC/pzgstrf3d.c | 9 + SRC/scatter.c | 598 +++++++++++++++++++++++++++++++++++++++ SRC/superlu_ddefs.h | 6 +- SRC/superlu_zdefs.h | 7 +- SRC/zcommunication_aux.c | 8 + SRC/zgather.c | 8 + SRC/ztreeFactorization.c | 8 + SRC/ztrfAux.c | 9 + SRC/ztrfCommWrapper.c | 9 + 25 files changed, 738 insertions(+), 28 deletions(-) create mode 100644 SRC/scatter.c diff --git a/EXAMPLE/pddrive3d.c b/EXAMPLE/pddrive3d.c index 0abcb6c4..7c51aee2 100644 --- a/EXAMPLE/pddrive3d.c +++ b/EXAMPLE/pddrive3d.c @@ -15,7 +15,7 @@ at the top-level directory. * *
  * -- Distributed SuperLU routine (version 7.0.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology.
  * May 10, 2019
  *
  */
diff --git a/SRC/Makefile b/SRC/Makefile
index 5e0efb89..af4394f2 100644
--- a/SRC/Makefile
+++ b/SRC/Makefile
@@ -31,7 +31,6 @@ FACT3D = 	scatter.o
 #	pdgstrs_vecpar.o ancFactorization.o
 
 #  pddrive_params.o
-# scatter.o 
 
 #
 # Precision independent routines
diff --git a/SRC/dcommunication_aux.c b/SRC/dcommunication_aux.c
index 8bc113b5..9c5965f0 100644
--- a/SRC/dcommunication_aux.c
+++ b/SRC/dcommunication_aux.c
@@ -10,6 +10,14 @@ at the top-level directory.
 */
 
 
+/*! @file
+ * \brief Communication routines.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology.
+ * May 10, 2019
+ */
 #include "superlu_ddefs.h"
 #if 0
 #include "sec_structs.h"
diff --git a/SRC/dgather.c b/SRC/dgather.c
index 556e49bb..0c666200 100644
--- a/SRC/dgather.c
+++ b/SRC/dgather.c
@@ -10,6 +10,14 @@ at the top-level directory.
 */
 
 
+/*! @file
+ * \brief Various gather routines.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology.
+ * May 10, 2019
+ */
 #include 
 #include "superlu_ddefs.h"
 #if 0
diff --git a/SRC/dtreeFactorization.c b/SRC/dtreeFactorization.c
index 5aba253f..dc459d15 100644
--- a/SRC/dtreeFactorization.c
+++ b/SRC/dtreeFactorization.c
@@ -10,6 +10,14 @@ at the top-level directory.
 */
 
 
+/*! @file
+ * \brief Factorization routines for the subtree, using a 2D process grid.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology.
+ * May 10, 2019
+ */
 #include "superlu_ddefs.h"
 #if 0
 #include "treeFactorization.h"
diff --git a/SRC/dtrfAux.c b/SRC/dtrfAux.c
index 614642e3..a89898ef 100644
--- a/SRC/dtrfAux.c
+++ b/SRC/dtrfAux.c
@@ -10,6 +10,15 @@ at the top-level directory.
 */
 
 
+/*! @file
+ * \brief Auxiliary routine for 3D factorization.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology.
+ * May 10, 2019
+ */
+
 #include "superlu_ddefs.h"
 
 #if 0
diff --git a/SRC/dtrfCommWrapper.c b/SRC/dtrfCommWrapper.c
index 6dcaa285..f5c6ce42 100644
--- a/SRC/dtrfCommWrapper.c
+++ b/SRC/dtrfCommWrapper.c
@@ -10,6 +10,15 @@ at the top-level directory.
 */
 
 
+/*! @file
+ * \brief Communication wrapper routines for 2D factorization.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology.
+ * May 10, 2019
+ */
+
 #include "superlu_ddefs.h"
 
 #if 0
diff --git a/SRC/pd3dcomm.c b/SRC/pd3dcomm.c
index 303fb0de..4377642c 100644
--- a/SRC/pd3dcomm.c
+++ b/SRC/pd3dcomm.c
@@ -10,6 +10,14 @@ at the top-level directory.
 */
 
 
+/*! @file
+ * \brief Communication routines for the 3D algorithm.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology.
+ * May 10, 2019
+ */
 #include "superlu_ddefs.h"
 #include "cblas.h"
 #if 0
diff --git a/SRC/pdgssvx3d.c b/SRC/pdgssvx3d.c
index fdd19272..45015803 100644
--- a/SRC/pdgssvx3d.c
+++ b/SRC/pdgssvx3d.c
@@ -11,18 +11,13 @@ at the top-level directory.
 
 
 /*! @file
- * \brief Solves a system of linear equations A*X=B
+ * \brief Solves a system of linear equations A*X=B using a 3D process grid.
  *
  * 
- * -- Distributed SuperLU routine (version 7.0.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * November 1, 2007
- * October 22, 2012
- * March 31, 2019
- * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology.
+ * May 10, 2019
  */
-
-#include 
 #include "superlu_ddefs.h"
 #if 0
 #include "p3dcomm.h"
diff --git a/SRC/pdgstrf.c b/SRC/pdgstrf.c
index b7bd07b2..2d055dcd 100644
--- a/SRC/pdgstrf.c
+++ b/SRC/pdgstrf.c
@@ -11,7 +11,7 @@ at the top-level directory.
 
 /*! @file
- * \brief Performs LU factorization in parallel
+ * \brief Performs LU factorization in parallel.
  *
  * 
  * -- Distributed SuperLU routine (version 6.1) --
diff --git a/SRC/pdgstrf2.c b/SRC/pdgstrf2.c
index 0402a2f7..756301c1 100644
--- a/SRC/pdgstrf2.c
+++ b/SRC/pdgstrf2.c
@@ -20,7 +20,7 @@ at the top-level directory.
  *
  * Modified:
  *   September 30, 2017
- *   May 5, 2019 version 7.0.0
+ *   May 10, 2019 version 7.0.0
  *
  * 
  * Purpose
diff --git a/SRC/pdgstrf3d.c b/SRC/pdgstrf3d.c
index e11ea5d5..8161771b 100644
--- a/SRC/pdgstrf3d.c
+++ b/SRC/pdgstrf3d.c
@@ -10,6 +10,15 @@ at the top-level directory.
 */
 
 
+/*! @file
+ * \brief Performs LU factorization in a 3D process grid.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology.
+ * May 10, 2019
+ */
+
 #include "superlu_ddefs.h"
 #if 0
 #include "pdgstrf3d.h"
diff --git a/SRC/pz3dcomm.c b/SRC/pz3dcomm.c
index 03ba89af..969ddbd0 100644
--- a/SRC/pz3dcomm.c
+++ b/SRC/pz3dcomm.c
@@ -9,6 +9,14 @@ The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
+/*! @file
+ * \brief Communication routines for the 3D algorithm.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology.
+ * May 10, 2019
+ */
 #include "superlu_zdefs.h"
 #include "cblas.h"
 #if 0
diff --git a/SRC/pzgssvx3d.c b/SRC/pzgssvx3d.c
index e1cad175..eb140cbc 100644
--- a/SRC/pzgssvx3d.c
+++ b/SRC/pzgssvx3d.c
@@ -10,18 +10,13 @@ at the top-level directory.
 */
 
 /*! @file
- * \brief Solves a system of linear equations A*X=B
+ * \brief Solves a system of linear equations A*X=B using a 3D process grid.
  *
  * 
- * -- Distributed SuperLU routine (version 7.0.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * November 1, 2007
- * October 22, 2012
- * March 31, 2019
- * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology.
+ * May 10, 2019
  */
-
-#include 
 #include "superlu_zdefs.h"
 #if 0
 #include "p3dcomm.h"
diff --git a/SRC/pzgstrf.c b/SRC/pzgstrf.c
index f90759f8..997cc051 100644
--- a/SRC/pzgstrf.c
+++ b/SRC/pzgstrf.c
@@ -10,7 +10,7 @@ at the top-level directory.
 
 /*! @file
- * \brief Performs LU factorization in parallel
+ * \brief Performs LU factorization in parallel.
  *
  * 
  * -- Distributed SuperLU routine (version 6.1) --
diff --git a/SRC/pzgstrf2.c b/SRC/pzgstrf2.c
index 6af84d93..b3b093d8 100644
--- a/SRC/pzgstrf2.c
+++ b/SRC/pzgstrf2.c
@@ -19,7 +19,7 @@ at the top-level directory.
  *
  * Modified:
  *   September 30, 2017
- *   May 5, 2019 version 7.0.0
+ *   May 10, 2019 version 7.0.0
  *
  * 
  * Purpose
diff --git a/SRC/pzgstrf3d.c b/SRC/pzgstrf3d.c
index 6a0e842d..380dac54 100644
--- a/SRC/pzgstrf3d.c
+++ b/SRC/pzgstrf3d.c
@@ -9,6 +9,15 @@ The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
+/*! @file
+ * \brief Performs LU factorization in a 3D process grid.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology.
+ * May 10, 2019
+ */
+
 #include "superlu_zdefs.h"
 #if 0
 #include "pdgstrf3d.h"
diff --git a/SRC/scatter.c b/SRC/scatter.c
new file mode 100644
index 00000000..b26d49dd
--- /dev/null
+++ b/SRC/scatter.c
@@ -0,0 +1,598 @@
+#include "superlu_ddefs.h"
+#include "scatter.h"
+//#include "compiler.h"
+
+#ifdef __INTEL_COMPILER
+#include "mkl.h"
+#else
+#include "cblas.h"
+#endif
+#include "omp.h"
+
+#define ISORT
+
+#if 0 /**** Sherry: this routine is moved to util.c ****/
+void
+arrive_at_ublock (int_t j,      //block number
+                  int_t *iukp,  // output
+                  int_t *rukp, int_t *jb,   /* Global block number of block U(k,j). */
+                  int_t *ljb,   /* Local block number of U(k,j). */
+                  int_t *nsupc,     /*supernode size of destination block */
+                  int_t iukp0,  //input
+                  int_t rukp0, int_t *usub,     /*usub subscripts */
+                  int_t *perm_u,    /*permutation matrix */
+                  int_t *xsup,  /*for SuperSize and LBj */
+                  gridinfo_t *grid)
+{
+    int_t jj;
+    *iukp = iukp0;
+    *rukp = rukp0;
+
+#ifdef ISORT
+    for (jj = 0; jj < perm_u[j]; jj++)
+#else
+    for (jj = 0; jj < perm_u[2 * j + 1]; jj++)
+#endif
+    {
+
+        *jb = usub[*iukp];      /* Global block number of block U(k,j). */
+        *nsupc = SuperSize (*jb);
+        *iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */
+        *rukp += usub[*iukp - 1];   /* Move to block U(k,j+1) */
+        *iukp += *nsupc;
+    }
+
+    /* reinitialize the pointers to the beginning of the */
+    /* kth column/row of L/U factors                      */
+    *jb = usub[*iukp];          /* Global block number of block U(k,j). */
+    *ljb = LBj (*jb, grid);     /* Local block number of U(k,j). */
+    *nsupc = SuperSize (*jb);
+    *iukp += UB_DESCRIPTOR;     /* Start fstnz of block U(k,j). */
+}
+#endif
+/*--------------------------------------------------------------*/
+
+void
+block_gemm_scatter( int_t lb, int_t j,
+                    Ublock_info_t *Ublock_info,
+                    Remain_info_t *Remain_info,
+                    double *L_mat, int_t ldl,
+                    double *U_mat, int_t ldu,
+                    double *bigV,
+                    // int_t jj0,
+                    int_t knsupc,  int_t klst,
+                    int_t *lsub, int_t *usub, int_t ldt,
+                    int_t thread_id,
+                    int_t *indirect,
+                    int_t *indirect2,
+                    int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr,
+                    int_t **Ufstnz_br_ptr, double **Unzval_br_ptr,
+                    int_t *xsup, gridinfo_t *grid,
+                    SuperLUStat_t *stat
+#ifdef SCATTER_PROFILE
+                    , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer
+#endif
+                  )
+{
+    // return ;
+    thread_id = omp_get_thread_num();
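+    /* note: the thread_id argument is overridden above; each calling
+       thread uses its own OpenMP id for the scratch-buffer offsets below */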
+    int_t *indirect_thread = indirect + ldt * thread_id;
+    int_t *indirect2_thread = indirect2 + ldt * thread_id;
+    double *tempv1 = bigV + thread_id * ldt * ldt;
+
+    /* Getting U block information */
+
+    int_t iukp =  Ublock_info[j].iukp;
+    int_t jb   =  Ublock_info[j].jb;
+    int_t nsupc = SuperSize(jb);
+    int_t ljb = LBj (jb, grid);
+    int_t st_col;
+    int_t ncols;
+    // if (j > jj0)
+    if (j > 0)
+    {
+        ncols  = Ublock_info[j].full_u_cols - Ublock_info[j - 1].full_u_cols;
+        st_col = Ublock_info[j - 1].full_u_cols;
+    }
+    else
+    {
+        ncols  = Ublock_info[j].full_u_cols;
+        st_col = 0;
+    }
+
+    /* Getting L block information */
+    int_t lptr = Remain_info[lb].lptr;
+    int_t ib   = Remain_info[lb].ib;
+    int_t temp_nbrow = lsub[lptr + 1];
+    lptr += LB_DESCRIPTOR;
+    int_t cum_nrow = (lb == 0 ? 0 : Remain_info[lb - 1].FullRow);
+    /* Getting L block information */
+    // int_t lptr = Remain_info[lb].lptr;
+    // int_t ib   = Remain_info[lb].ib;
+    // int_t temp_nbrow = lsub[lptr + 1];
+    // lptr += LB_DESCRIPTOR;
+    // int_t cum_nrow =  Remain_info[lb].StRow;
+
+    double alpha = 1.0;
+    double beta = 0.0;
+
+
+    /* calling DGEMM */
+    // printf(" m %d n %d k %d ldu %d ldl %d st_col %d \n",temp_nbrow,ncols,ldu,ldl,st_col );
+    // dgemm("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
+    //       &L_mat[(knsupc - ldu)*ldl + cum_nrow], &ldl,
+    //       &U_mat[st_col * ldu], &ldu, &beta, tempv1, &temp_nbrow);
+
+
+    // printf("%d %d %d %d  %d %d %d %d\n", temp_nbrow, ncols, ldu,  ldl,st_col,(knsupc - ldu)*ldl + cum_nrow,cum_nrow,st_col);
+
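+    /* tempv1 (temp_nbrow x ncols) = L-block (temp_nbrow x ldu) *
+       U-block (ldu x ncols); the product is scattered into L or U below */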
+    cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
+                temp_nbrow, ncols, ldu, alpha,
+                &L_mat[(knsupc - ldu)*ldl + cum_nrow], ldl,
+                &U_mat[st_col * ldu], ldu,
+                beta, tempv1, temp_nbrow);
+
+    // printf("SCU update: (%d, %d)\n",ib,jb );
+#ifdef SCATTER_PROFILE
+    unsigned long long ttx = __rdtsc();
+#endif
+    /*Now scattering the block*/
+    if (ib < jb)
+    {
+
+        SCATTER_U_CPU (
+            ib, jb,
+            nsupc, iukp, xsup,
+            klst, temp_nbrow,
+            lptr, temp_nbrow, lsub,
+            usub, tempv1,
+            indirect_thread,
+            Ufstnz_br_ptr,
+            Unzval_br_ptr,
+            grid
+        );
+    }
+    else
+    {
+        scatter_l (
+            ib, ljb, nsupc, iukp, xsup, klst, temp_nbrow, lptr,
+            temp_nbrow, usub, lsub, tempv1,
+            indirect_thread, indirect2_thread,
+            Lrowind_bc_ptr, Lnzval_bc_ptr, grid
+        );
+
+    }
+
+    // #pragma omp atomic
+    // stat->ops[FACT] += 2*temp_nbrow*ncols*ldu + temp_nbrow*ncols;
+
+#ifdef SCATTER_PROFILE
+    double t_s = (double) __rdtsc() - ttx;
+    Host_TheadScatterMOP[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
+    += 3.0 * (double ) temp_nbrow * (double ) ncols;
+    Host_TheadScatterTimer[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
+    += t_s;
+#endif
+} /* block_gemm_scatter */
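+
+/* A minimal usage sketch (illustration only; nlb and nub stand in for the
+ * caller's L- and U-block counts).  The routine expects to run inside an
+ * OpenMP parallel loop, with bigV, indirect and indirect2 each providing
+ * one slot of ldt*ldt (resp. ldt) entries per thread:
+ *
+ *   #pragma omp parallel for schedule(dynamic)
+ *   for (int_t ij = 0; ij < nlb * nub; ++ij)
+ *       block_gemm_scatter(ij / nub, ij % nub, Ublock_info, Remain_info,
+ *                          ...);  // remaining arguments as in the prototype
+ */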
+
+/* This version uses a lock to prevent multiple threads from updating the same block. */
+void
+block_gemm_scatter_lock( int_t lb, int_t j,
+                         omp_lock_t* lock,
+                         Ublock_info_t *Ublock_info,
+                         Remain_info_t *Remain_info,
+                         double *L_mat, int_t ldl,
+                         double *U_mat, int_t ldu,
+                         double *bigV,
+                         // int_t jj0,
+                         int_t knsupc,  int_t klst,
+                         int_t *lsub, int_t *usub, int_t ldt,
+                         int_t thread_id,
+                         int_t *indirect,
+                         int_t *indirect2,
+                         int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr,
+                         int_t **Ufstnz_br_ptr, double **Unzval_br_ptr,
+                         int_t *xsup, gridinfo_t *grid
+#ifdef SCATTER_PROFILE
+                         , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer
+#endif
+                       )
+{
+    int_t *indirect_thread = indirect + ldt * thread_id;
+    int_t *indirect2_thread = indirect2 + ldt * thread_id;
+    double *tempv1 = bigV + thread_id * ldt * ldt;
+
+    /* Getting U block information */
+
+    int_t iukp =  Ublock_info[j].iukp;
+    int_t jb   =  Ublock_info[j].jb;
+    int_t nsupc = SuperSize(jb);
+    int_t ljb = LBj (jb, grid);
+    int_t st_col = Ublock_info[j].StCol;
+    int_t ncols = Ublock_info[j].ncols;
+
+
+    /* Getting L block information */
+    int_t lptr = Remain_info[lb].lptr;
+    int_t ib   = Remain_info[lb].ib;
+    int_t temp_nbrow = lsub[lptr + 1];
+    lptr += LB_DESCRIPTOR;
+    int_t cum_nrow =  Remain_info[lb].StRow;
+
+    double alpha = 1.0;
+    double beta = 0.0;
+
+
+    /* calling DGEMM */
+    // printf(" m %d n %d k %d ldl %d st_col %d \n",temp_nbrow,ncols,ldu,ldl,st_col );
+    // dgemm("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
+    //       &L_mat[(knsupc - ldu)*ldl + cum_nrow], &ldl,
+    //       &U_mat[st_col * ldu], &ldu, &beta, tempv1, &temp_nbrow);
+
+    cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
+                temp_nbrow, ncols, ldu, alpha,
+                &L_mat[(knsupc - ldu)*ldl + cum_nrow], ldl,
+                &U_mat[st_col * ldu], ldu,
+                beta, tempv1, temp_nbrow);
+
+    /* spin until the lock guarding the destination block is acquired;
+       a NULL lock means no locking is required */
+    if (lock)       /* lock is not null */
+        while (!omp_test_lock(lock))
+        {
+
+        }
+
+#ifdef SCATTER_PROFILE
+    unsigned long long ttx = __rdtsc();
+#endif
+    /*Now scattering the block*/
+    if (ib < jb)
+    {
+
+        SCATTER_U_CPU (
+            ib, jb,
+            nsupc, iukp, xsup,
+            klst, temp_nbrow,
+            lptr, temp_nbrow, lsub,
+            usub, tempv1,
+            indirect_thread,
+            Ufstnz_br_ptr,
+            Unzval_br_ptr,
+            grid
+        );
+    }
+    else
+    {
+        scatter_l (
+            ib, ljb, nsupc, iukp, xsup, klst, temp_nbrow, lptr,
+            temp_nbrow, usub, lsub, tempv1,
+            indirect_thread, indirect2_thread,
+            Lrowind_bc_ptr, Lnzval_bc_ptr, grid
+        );
+
+    }
+
+    if (lock)
+        omp_unset_lock(lock);
+
+#ifdef SCATTER_PROFILE
+    double t_s = (double) __rdtsc() - ttx;
+    Host_TheadScatterMOP[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
+    += 3.0 * (double ) temp_nbrow * (double ) ncols;
+    Host_TheadScatterTimer[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
+    += t_s;
+#endif
+} /* block_gemm_scatter_lock */
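+
+/* Usage sketch (illustration only; nblocks and dest_block are stand-ins).
+ * The caller owns one omp_lock_t per destination block, initialized up
+ * front, and passes the lock guarding block (ib,jb); NULL disables locking:
+ *
+ *   omp_lock_t *locks = (omp_lock_t*) SUPERLU_MALLOC(nblocks * sizeof(omp_lock_t));
+ *   for (int_t b = 0; b < nblocks; ++b) omp_init_lock(&locks[b]);
+ *   ...
+ *   block_gemm_scatter_lock(lb, j, &locks[dest_block], Ublock_info, ...);
+ */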
+
+// The following four variations of the block_gemm_scatter call correspond to the regions below:
+/*
++---------------------------------------+
+|          ||                           |
+|  CPU     ||          CPU+TopRight     |
+|  Top     ||                           |
+|  Left    ||                           |
+|          ||                           |
++---------------------------------------+
++---------------------------------------+
+|          ||        |                  |
+|          ||        |                  |
+|          ||        |                  |
+|  CPU     ||  CPU   |Accelerator       |
+|  Bottom  ||  Bottom|                  |
+|  Left    ||  Right |                  |
+|          ||        |                  |
+|          ||        |                  |
++--------------------+------------------+
+                  jj_cpu
+*/
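+/* Dispatch sketch (illustration only; block indices into the Phi and Remain
+ * panels are taken relative to each panel, and trailing arguments are elided):
+ *
+ *   int_t top  = (lb < HyP->lookAheadBlk);   // look-ahead rows of L
+ *   int_t left = (j  < HyP->num_u_blks);     // host (CPU) columns of U
+ *   if      ( top &&  left) block_gemm_scatterTopLeft    (lb, j, ...);
+ *   else if ( top && !left) block_gemm_scatterTopRight   (lb, j - HyP->num_u_blks, ...);
+ *   else if (!top &&  left) block_gemm_scatterBottomLeft (lb - HyP->lookAheadBlk, j, ...);
+ *   else                    block_gemm_scatterBottomRight(lb - HyP->lookAheadBlk,
+ *                                                         j - HyP->num_u_blks, ...);
+ */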
+
+int_t block_gemm_scatterTopLeft( int_t lb, /* block number in L */
+				 int_t j,  /* block number in U */
+                                 double* bigV, int_t knsupc,  int_t klst,
+				 int_t* lsub, int_t * usub, int_t ldt,
+				 int_t* indirect, int_t* indirect2, HyP_t* HyP,
+                                 LUstruct_t *LUstruct,
+                                 gridinfo_t* grid,
+                                 SCT_t*SCT, SuperLUStat_t *stat
+                               )
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    double** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    volatile int_t thread_id = omp_get_thread_num();
+    
+//    printf("Thread's ID %lld \n", thread_id);
+    unsigned long long t1 = _rdtsc();
+    block_gemm_scatter( lb, j, HyP->Ublock_info, HyP->lookAhead_info,
+			HyP->lookAhead_L_buff, HyP->Lnbrow,
+                        HyP->bigU_host, HyP->ldu,
+                        bigV, knsupc,  klst, lsub,  usub, ldt, thread_id,
+			indirect, indirect2,
+                        Lrowind_bc_ptr, Lnzval_bc_ptr, Ufstnz_br_ptr, Unzval_br_ptr,
+			xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                        , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
+#endif
+                      );
+    unsigned long long t2 = _rdtsc();
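+    /* per-thread timers are strided by CACHE_LINE_SIZE to avoid false sharing */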
+    SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1);
+    return 0;
+} /* block_gemm_scatterTopLeft */
+
+int_t block_gemm_scatterTopRight( int_t lb,  int_t j,
+                                  double* bigV, int_t knsupc,  int_t klst, int_t* lsub,
+                                  int_t * usub, int_t ldt,  int_t* indirect, int_t* indirect2,
+                                  HyP_t* HyP,
+                                  LUstruct_t *LUstruct,
+                                  gridinfo_t* grid,
+                                  SCT_t*SCT, SuperLUStat_t *stat
+                                )
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    double** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    volatile int_t thread_id = omp_get_thread_num();
+    unsigned long long t1 = _rdtsc();
+    block_gemm_scatter( lb, j, HyP->Ublock_info_Phi, HyP->lookAhead_info, HyP->lookAhead_L_buff, HyP->Lnbrow,
+                        HyP->bigU_Phi, HyP->ldu_Phi,
+                        bigV, knsupc,  klst, lsub,  usub, ldt, thread_id, indirect, indirect2,
+                        Lrowind_bc_ptr, Lnzval_bc_ptr, Ufstnz_br_ptr, Unzval_br_ptr, xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                        , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
+#endif
+                      );
+    unsigned long long t2 = _rdtsc();
+    SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1);
+    return 0;
+} /* block_gemm_scatterTopRight */
+
+
+int_t block_gemm_scatterBottomLeft( int_t lb,  int_t j,
+                                    double* bigV, int_t knsupc,  int_t klst, int_t* lsub,
+                                    int_t * usub, int_t ldt,  int_t* indirect, int_t* indirect2,
+                                    HyP_t* HyP,
+                                    LUstruct_t *LUstruct,
+                                    gridinfo_t* grid,
+                                    SCT_t*SCT, SuperLUStat_t *stat
+                                  )
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    double** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    volatile int_t thread_id = omp_get_thread_num();
+    //printf("Thread's ID %lld \n", thread_id);
+    unsigned long long t1 = _rdtsc();
+    block_gemm_scatter( lb, j, HyP->Ublock_info, HyP->Remain_info, HyP->Remain_L_buff, HyP->Rnbrow,
+                        HyP->bigU_host, HyP->ldu,
+                        bigV, knsupc,  klst, lsub,  usub, ldt, thread_id, indirect, indirect2,
+                        Lrowind_bc_ptr, Lnzval_bc_ptr, Ufstnz_br_ptr, Unzval_br_ptr, xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                        , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
+#endif
+                      );
+    unsigned long long t2 = _rdtsc();
+    SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1);
+    return 0;
+
+} /* block_gemm_scatterBottomLeft */
+
+int_t block_gemm_scatterBottomRight( int_t lb,  int_t j,
+                                     double* bigV, int_t knsupc,  int_t klst, int_t* lsub,
+                                     int_t * usub, int_t ldt,  int_t* indirect, int_t* indirect2,
+                                     HyP_t* HyP,
+                                     LUstruct_t *LUstruct,
+                                     gridinfo_t* grid,
+                                     SCT_t*SCT, SuperLUStat_t *stat
+                                   )
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    double** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    volatile int_t thread_id = omp_get_thread_num();
+   // printf("Thread's ID %lld \n", thread_id);
+    unsigned long long t1 = _rdtsc();
+    block_gemm_scatter( lb, j, HyP->Ublock_info_Phi, HyP->Remain_info, HyP->Remain_L_buff, HyP->Rnbrow,
+                        HyP->bigU_Phi, HyP->ldu_Phi,
+                        bigV, knsupc,  klst, lsub,  usub, ldt, thread_id, indirect, indirect2,
+                        Lrowind_bc_ptr, Lnzval_bc_ptr, Ufstnz_br_ptr, Unzval_br_ptr, xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                        , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
+#endif
+                      );
+
+    unsigned long long t2 = _rdtsc();
+    SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1);
+    return 0;
+
+} /* block_gemm_scatterBottomRight */
+
+/******************************************************************
+ * SHERRY: The following routines may conflict with dscatter.c
+ ******************************************************************/
+#if 1
+void
+scatter_l (int_t ib,
+           int_t ljb,
+           int_t nsupc,
+           int_t iukp,
+           int_t *xsup,
+           int_t klst,
+           int_t nbrow,
+           int_t lptr,
+           int_t temp_nbrow,
+           int_t *usub,
+           int_t *lsub,
+           double *tempv,
+           int_t *indirect_thread, int_t *indirect2,
+           int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr, gridinfo_t *grid)
+{
+
+    int_t rel, i, segsize, jj;
+    double *nzval;
+    int_t *index = Lrowind_bc_ptr[ljb];
+    int_t ldv = index[1];       /* LDA of the dest lusup. */
+    int_t lptrj = BC_HEADER;
+    int_t luptrj = 0;
+    int_t ijb = index[lptrj];
+
+    while (ijb != ib)
+    {
+        luptrj += index[lptrj + 1];
+        lptrj += LB_DESCRIPTOR + index[lptrj + 1];
+        ijb = index[lptrj];
+    }
+
+
+    /*
+     * Build indirect table. This is needed because the
+     * indices are not sorted for the L blocks.
+     */
+    int_t fnz = FstBlockC (ib);
+    int_t dest_nbrow;
+    lptrj += LB_DESCRIPTOR;
+    dest_nbrow = index[lptrj - 1];
+
+    for (i = 0; i < dest_nbrow; ++i)
+    {
+        rel = index[lptrj + i] - fnz;
+        indirect_thread[rel] = i;
+
+    }
+
+    /* map each source row to its destination row; could be precalculated */
+    for (i = 0; i < temp_nbrow; ++i)
+    {
+        rel = lsub[lptr + i] - fnz;
+        indirect2[i] = indirect_thread[rel];
+    }
+
+
+    nzval = Lnzval_bc_ptr[ljb] + luptrj;
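+    /* nzval now points at the first destination column of block L(i,j);
+       each pass of the jj loop below subtracts one column of the update */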
+    for (jj = 0; jj < nsupc; ++jj)
+    {
+
+        segsize = klst - usub[iukp + jj];
+        if (segsize)
+        {
+            for (i = 0; i < temp_nbrow; ++i)
+            {
+
+                nzval[indirect2[i]] -= tempv[i];
+
+            }
+            tempv += nbrow;
+        }
+        nzval += ldv;
+    }
+
+} /* scatter_l */
+
+void   // SHERRY: NOT CALLED!!
+scatter_u (int_t ib,
+           int_t jb,
+           int_t nsupc,
+           int_t iukp,
+           int_t *xsup,
+           int_t klst,
+           int_t nbrow,
+           int_t lptr,
+           int_t temp_nbrow,
+           int_t *lsub,
+           int_t *usub,
+           double *tempv,
+           int_t *indirect,
+           int_t **Ufstnz_br_ptr, double **Unzval_br_ptr, gridinfo_t *grid)
+{
+#ifdef PI_DEBUG
+    printf ("A(%d,%d) goes to U block \n", ib, jb);
+#endif
+    int_t jj, i, fnz;
+    int_t segsize;
+    double *ucol;
+    int_t ilst = FstBlockC (ib + 1);
+    int_t lib = LBi (ib, grid);
+    int_t *index = Ufstnz_br_ptr[lib];
+
+    /* reinitialize the pointer to each row of U */
+    int_t iuip_lib, ruip_lib;
+    iuip_lib = BR_HEADER;
+    ruip_lib = 0;
+
+    int_t ijb = index[iuip_lib];
+    while (ijb < jb)            /* Search for dest block. */
+    {
+        ruip_lib += index[iuip_lib + 1];
+
+        iuip_lib += UB_DESCRIPTOR + SuperSize (ijb);
+        ijb = index[iuip_lib];
+    }
+    /* Skip descriptor.  Now point to the fstnz index of
+       block U(i,j). */
+
+    for (i = 0; i < temp_nbrow; ++i)
+    {
+        indirect[i] = lsub[lptr + i] ;
+    }
+
+
+    iuip_lib += UB_DESCRIPTOR;
+
+    ucol = &Unzval_br_ptr[lib][ruip_lib];
+    for (jj = 0; jj < nsupc; ++jj)
+    {
+        segsize = klst - usub[iukp + jj];
+        fnz = index[iuip_lib++];
+        ucol -= fnz;
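+        /* shifting by fnz lets ucol[r] address the entry for global row r
+           (fnz <= r < ilst); "ucol += ilst" below advances to the next column */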
+        if (segsize)            /* Nonzero segment in U(k,j). */
+        {
+            for (i = 0; i < temp_nbrow; ++i)
+            {
+                ucol[indirect[i]] -= tempv[i];
+            }                   /* for i=0..temp_nbrow */
+            tempv += nbrow;
+
+        } /*if segsize */
+        ucol += ilst ;
+
+    } /*for jj=0:nsupc */
+
+} /* scatter_u */
+
+#endif // comment out
+
diff --git a/SRC/superlu_ddefs.h b/SRC/superlu_ddefs.h
index 12410d37..6b86006e 100644
--- a/SRC/superlu_ddefs.h
+++ b/SRC/superlu_ddefs.h
@@ -14,12 +14,14 @@ at the top-level directory.
  * \brief  Distributed SuperLU data types and function prototypes
  *
  * 
- * -- Distributed SuperLU routine (version 6.1) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley,
+ * Georgia Institute of Technology
  * November 1, 2007
  * April 5, 2015
  * September 18, 2018  version 6.0
  * February 8, 2019  version 6.1.1
+ * May 10, 2019 version 7.0.0
  * 
 */
diff --git a/SRC/superlu_zdefs.h b/SRC/superlu_zdefs.h
index 39494d55..42a4f128 100644
--- a/SRC/superlu_zdefs.h
+++ b/SRC/superlu_zdefs.h
@@ -13,12 +13,14 @@ at the top-level directory.
  * \brief Distributed SuperLU data types and function prototypes
  *
  * 
- * -- Distributed SuperLU routine (version 6.1) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley,
+ * Georgia Institute of Technology
  * November 1, 2007
  * April 5, 2015
  * September 18, 2018  version 6.0
  * February 8, 2019  version 6.1.1
+ * May 10, 2019 version 7.0.0
  * 
 */
@@ -287,6 +289,7 @@ typedef struct
 extern "C" {
 #endif
 
+
 /* Supernodal LU factor related */
 extern void zCreate_CompCol_Matrix_dist(SuperMatrix *, int_t, int_t, int_t,
                                         doublecomplex *,
diff --git a/SRC/zcommunication_aux.c b/SRC/zcommunication_aux.c
index 3f10c72e..4f78ed9a 100644
--- a/SRC/zcommunication_aux.c
+++ b/SRC/zcommunication_aux.c
@@ -9,6 +9,14 @@ The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
+/*! @file
+ * \brief Communication routines.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology.
+ * May 10, 2019
+ */
 #include "superlu_zdefs.h"
 #if 0
 #include "sec_structs.h"
diff --git a/SRC/zgather.c b/SRC/zgather.c
index 6cb9fab8..ce00a643 100644
--- a/SRC/zgather.c
+++ b/SRC/zgather.c
@@ -9,6 +9,14 @@ The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
+/*! @file
+ * \brief Various gather routines.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology.
+ * May 10, 2019
+ */
 #include 
 #include "superlu_zdefs.h"
 #if 0
diff --git a/SRC/ztreeFactorization.c b/SRC/ztreeFactorization.c
index b80c8b0b..bb6b16cc 100644
--- a/SRC/ztreeFactorization.c
+++ b/SRC/ztreeFactorization.c
@@ -9,6 +9,14 @@ The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
+/*! @file
+ * \brief Factorization routines for the subtree, using a 2D process grid.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology.
+ * May 10, 2019
+ */
 #include "superlu_zdefs.h"
 #if 0
 #include "treeFactorization.h"
diff --git a/SRC/ztrfAux.c b/SRC/ztrfAux.c
index 1e8c2610..04d4b8a0 100644
--- a/SRC/ztrfAux.c
+++ b/SRC/ztrfAux.c
@@ -9,6 +9,15 @@ The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
+/*! @file
+ * \brief Auxiliary routine for 3D factorization.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology.
+ * May 10, 2019
+ */
+
 #include "superlu_zdefs.h"
 
 #if 0
diff --git a/SRC/ztrfCommWrapper.c b/SRC/ztrfCommWrapper.c
index 7f0bea68..07cc4a4e 100644
--- a/SRC/ztrfCommWrapper.c
+++ b/SRC/ztrfCommWrapper.c
@@ -9,6 +9,15 @@ The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
+/*! @file
+ * \brief Communication wrapper routines for 2D factorization.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology.
+ * May 10, 2019
+ */
+
 #include "superlu_zdefs.h"
 
 #if 0

From b339df3c4214f6240baba23eb372206a68174833 Mon Sep 17 00:00:00 2001
From: Sherry Li 
Date: Fri, 21 Jun 2019 15:25:34 -0700
Subject: [PATCH 006/147] small updates of CUDA code.

---
 DoxyConfig             | 4 ++--
 SRC/dSchCompUdt-cuda.c | 2 +-
 SRC/zSchCompUdt-cuda.c | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/DoxyConfig b/DoxyConfig
index 432628e3..11ad8db9 100644
--- a/DoxyConfig
+++ b/DoxyConfig
@@ -847,7 +847,7 @@ TREEVIEW_WIDTH         = 250
 # If the GENERATE_LATEX tag is set to YES (the default) Doxygen will 
 # generate Latex output.
 
-GENERATE_LATEX         = NO
+GENERATE_LATEX         = YES
 
 # The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. 
 # If a relative path is entered the value of OUTPUT_DIRECTORY will be 
@@ -858,7 +858,7 @@ LATEX_OUTPUT           = latex
 # The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be 
 # invoked. If left blank `latex' will be used as the default command name.
 
-LATEX_CMD_NAME         = latex
+LATEX_CMD_NAME         = pdflatex
 
 # The MAKEINDEX_CMD_NAME tag can be used to specify the command name to 
 # generate index for LaTeX. If left blank `makeindex' will be used as the 
diff --git a/SRC/dSchCompUdt-cuda.c b/SRC/dSchCompUdt-cuda.c
index dbbf3bea..a1b2ec81 100644
--- a/SRC/dSchCompUdt-cuda.c
+++ b/SRC/dSchCompUdt-cuda.c
@@ -124,7 +124,7 @@ if ( msg0 && msg2 ) {  /* L(:,k) and U(k,:) are not empty. */
             jjj = jjj_global;
             // printf("thread_id %d, jjj %d \n",thread_id,jjj );
             if (jjj == jjj_st+1 && full_u_cols[jjj_st] > ncol_max) {
-                printf("allocate more memory for buffer !!!!\n");
+                // printf("allocate more memory for buffer !!!!\n"); -- Sherry
                 if(nbrow * full_u_cols[jjj_st] > buffer_size)
                     printf("%d buffer_size %d\n",nbrow*full_u_cols[jjj_st],buffer_size );
             }
diff --git a/SRC/zSchCompUdt-cuda.c b/SRC/zSchCompUdt-cuda.c
index 7f1f20ff..749b3664 100644
--- a/SRC/zSchCompUdt-cuda.c
+++ b/SRC/zSchCompUdt-cuda.c
@@ -123,7 +123,7 @@ if ( msg0 && msg2 ) {  /* L(:,k) and U(k,:) are not empty. */
             jjj = jjj_global;
             // printf("thread_id %d, jjj %d \n",thread_id,jjj );
             if (jjj == jjj_st+1 && full_u_cols[jjj_st] > ncol_max) {
-                printf("allocate more memory for buffer !!!!\n");
+                // printf("allocate more memory for buffer !!!!\n"); -- Sherry
                 if(nbrow * full_u_cols[jjj_st] > buffer_size)
                     printf("%d buffer_size %d\n",nbrow*full_u_cols[jjj_st],buffer_size );
             }

From aba87d414b0e6bf4a1a894e3269283949dc6d9cb Mon Sep 17 00:00:00 2001
From: Sherry Li 
Date: Wed, 26 Jun 2019 10:54:18 -0700
Subject: [PATCH 007/147] Split scatter.c into {d,z}scatter3d.c. (some of the
 routines here are older than the same ones in {d,z}scatter.c, will need
 update later.) 	- affecting {d,z}treeFactorization.c

---
 EXAMPLE/README           |  13 +-
 SRC/Makefile             |   4 +-
 SRC/dscatter.c           |   4 +-
 SRC/dscatter3d.c         | 613 +++++++++++++++++++++++++++++++++++++++
 SRC/dtreeFactorization.c |  10 +-
 SRC/scatter.c            |   2 -
 SRC/superlu_ddefs.h      | 109 ++++---
 SRC/superlu_zdefs.h      | 109 ++++---
 SRC/zscatter.c           |   4 +-
 SRC/zscatter3d.c         | 611 ++++++++++++++++++++++++++++++++++++++
 SRC/ztreeFactorization.c |  10 +-
 11 files changed, 1360 insertions(+), 129 deletions(-)
 create mode 100644 SRC/dscatter3d.c
 create mode 100644 SRC/zscatter3d.c

diff --git a/EXAMPLE/README b/EXAMPLE/README
index b8fa138c..2819f1cc 100644
--- a/EXAMPLE/README
+++ b/EXAMPLE/README
@@ -6,6 +6,9 @@ various functions provided in SuperLU_DIST. You can modify these
 examples to suit your applications.
 
 The examples illustrate the following functionalities:
+  0. pddrive3d.c
+     Use the PDGSSVX3D communication-avoiding 3D algorithm with the default
+     options to solve a linear system.
   1. pddrive.c, pddrive_ABglobal.c
      Use PDGSSVX with the full (default) options to solve a linear system.
   2. pddrive1.c, pddrive1_ABglobal.c
@@ -22,11 +25,15 @@ The examples illustrate the following functionalities:
 
 
 The command line options "-r " and "-c "
-defines the 2-D process grid. The total number of processes  is:
+define the 2D process grid. The total number of processes  is:
 	 =  * 
 If the options are not provided at the command line, the programs
 will use 1 processor by default in each case.
 
+In the 3D code pddrive3d, the command line options
+   "-r ", "-c " and "-d "
+define the 3D process grid. The Z-dimension must be a power of two.
+
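+For example, 16 MPI processes can be arranged as "-r 2 -c 4 -d 2"
+(2 x 4 x 2 = 16, and the Z-dimension 2 is a power of two).
+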
 Three input matrices (Harwell-Boeing format) are provided in this directory:
 	g20.rua  -- a real matrix of dimension 400x400
 	big.rua  -- a real matrix of dimension 4960x4960
@@ -36,6 +43,10 @@ The command lines given below show how to run the parallel programs
 using "mpiexec". You may need to replace mpiexec by platform specific
 command.
 
+0. To run the 3D example (pddrive3d), type
+   % mpiexec -n  pddrive3d -r  -c  -d  g20.rua 
+     (e.g., mpiexec -n 8 pddrive3d -r 2 -c 2 -d 2 g20.rua)
+
 1. To run the real examples (pddrive, pddrive1, etc.)
    you may type:
    % mpiexec -n  pddrive -r  -c  g20.rua 
diff --git a/SRC/Makefile b/SRC/Makefile
index af4394f2..a0c7e98a 100644
--- a/SRC/Makefile
+++ b/SRC/Makefile
@@ -71,7 +71,7 @@ DPLUSRC = pdgssvx.o pdgssvx_ABglobal.o \
 	  pdgstrf.o pdgstrf2.o pdGetDiagU.o \
 	  pdgstrs.o pdgstrs1.o pdgstrs_lsum.o pdgstrs_Bglobal.o \
 	  pdgsrfs.o pdgsmv.o pdgsrfs_ABXglobal.o pdgsmv_AXglobal.o \
-	  dreadtriple_noheader.o $(FACT3D)
+	  dreadtriple_noheader.o dscatter3d.o  #$(FACT3D)
 # from 3D code
 DPLUSRC += pdgssvx3d.o pdgstrf3d.o dtreeFactorization.o \
 	dgather.o pd3dcomm.o dtrfAux.o dcommunication_aux.o dtrfCommWrapper.o
@@ -85,7 +85,7 @@ ZPLUSRC = pzgssvx.o pzgssvx_ABglobal.o \
 	  pzgstrf.o pzgstrf2.o pzGetDiagU.o \
 	  pzgstrs.o pzgstrs1.o pzgstrs_lsum.o pzgstrs_Bglobal.o \
 	  pzgsrfs.o pzgsmv.o pzgsrfs_ABXglobal.o pzgsmv_AXglobal.o \
-	  zreadtriple_noheader.o
+	  zreadtriple_noheader.o zscatter3d.o
 # from 3D code
 ZPLUSRC += pzgssvx3d.o pzgstrf3d.o ztreeFactorization.o \
 	zgather.o pz3dcomm.o ztrfAux.o zcommunication_aux.o ztrfCommWrapper.o
diff --git a/SRC/dscatter.c b/SRC/dscatter.c
index 13738362..7b88570e 100644
--- a/SRC/dscatter.c
+++ b/SRC/dscatter.c
@@ -107,7 +107,7 @@ dscatter_l_1 (int ib,
     // TAU_STATIC_TIMER_STOP("SCATTER_LB");
 } /* dscatter_l_1 */
 
-static void
+void
 dscatter_l (
            int ib,    /* row block number of source block L(i,k) */
            int ljb,   /* local column block number of dest. block L(i,j) */
@@ -189,7 +189,7 @@ dscatter_l (
 } /* dscatter_l */
 
 
-static void
+void
 dscatter_u (int ib,
            int jb,
            int nsupc,
diff --git a/SRC/dscatter3d.c b/SRC/dscatter3d.c
new file mode 100644
index 00000000..e03f3822
--- /dev/null
+++ b/SRC/dscatter3d.c
@@ -0,0 +1,613 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+#include "superlu_ddefs.h"
+//#include "scatter.h"
+//#include "compiler.h"
+
+#ifdef __INTEL_COMPILER
+#include "mkl.h"
+#else
+#include "cblas.h"
+#endif
+#include "omp.h"
+
+#define ISORT
+#define SCATTER_U_CPU  scatter_u
+
+static void scatter_u (int_t ib, int_t jb, int_t nsupc, int_t iukp, int_t *xsup,
+                 int_t klst, int_t nbrow, int_t lptr, int_t temp_nbrow,
+ 		 int_t *lsub, int_t *usub, double *tempv,
+		 int_t *indirect,
+           	 int_t **Ufstnz_br_ptr, double **Unzval_br_ptr, gridinfo_t *grid);
+
+
+#if 0 /**** Sherry: this routine is moved to util.c ****/
+void
+arrive_at_ublock (int_t j,      //block number
+                  int_t *iukp,  // output
+                  int_t *rukp, int_t *jb,   /* Global block number of block U(k,j). */
+                  int_t *ljb,   /* Local block number of U(k,j). */
+                  int_t *nsupc,     /*supernode size of destination block */
+                  int_t iukp0,  //input
+                  int_t rukp0, int_t *usub,     /*usub subscripts */
+                  int_t *perm_u,    /*permutation matrix */
+                  int_t *xsup,  /*for SuperSize and LBj */
+                  gridinfo_t *grid)
+{
+    int_t jj;
+    *iukp = iukp0;
+    *rukp = rukp0;
+
+#ifdef ISORT
+    for (jj = 0; jj < perm_u[j]; jj++)
+#else
+    for (jj = 0; jj < perm_u[2 * j + 1]; jj++)
+#endif
+    {
+
+        *jb = usub[*iukp];      /* Global block number of block U(k,j). */
+        *nsupc = SuperSize (*jb);
+        *iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */
+        *rukp += usub[*iukp - 1];   /* Move to block U(k,j+1) */
+        *iukp += *nsupc;
+    }
+
+    /* reinitialize the pointers to the beginning of the */
+    /* kth column/row of L/U factors                      */
+    *jb = usub[*iukp];          /* Global block number of block U(k,j). */
+    *ljb = LBj (*jb, grid);     /* Local block number of U(k,j). */
+    *nsupc = SuperSize (*jb);
+    *iukp += UB_DESCRIPTOR;     /* Start fstnz of block U(k,j). */
+}
+#endif
+/*--------------------------------------------------------------*/
+
+void
+dblock_gemm_scatter( int_t lb, int_t j,
+                    Ublock_info_t *Ublock_info,
+                    Remain_info_t *Remain_info,
+                    double *L_mat, int_t ldl,
+                    double *U_mat, int_t ldu,
+                    double *bigV,
+                    // int_t jj0,
+                    int_t knsupc,  int_t klst,
+                    int_t *lsub, int_t *usub, int_t ldt,
+                    int_t thread_id,
+                    int_t *indirect,
+                    int_t *indirect2,
+                    int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr,
+                    int_t **Ufstnz_br_ptr, double **Unzval_br_ptr,
+                    int_t *xsup, gridinfo_t *grid,
+                    SuperLUStat_t *stat
+#ifdef SCATTER_PROFILE
+                    , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer
+#endif
+                  )
+{
+    // return ;
+    thread_id = omp_get_thread_num();
+    int_t *indirect_thread = indirect + ldt * thread_id;
+    int_t *indirect2_thread = indirect2 + ldt * thread_id;
+    double *tempv1 = bigV + thread_id * ldt * ldt;
+
+    /* Getting U block information */
+
+    int_t iukp =  Ublock_info[j].iukp;
+    int_t jb   =  Ublock_info[j].jb;
+    int_t nsupc = SuperSize(jb);
+    int_t ljb = LBj (jb, grid);
+    int_t st_col;
+    int_t ncols;
+    // if (j > jj0)
+    if (j > 0)
+    {
+        ncols  = Ublock_info[j].full_u_cols - Ublock_info[j - 1].full_u_cols;
+        st_col = Ublock_info[j - 1].full_u_cols;
+    }
+    else
+    {
+        ncols  = Ublock_info[j].full_u_cols;
+        st_col = 0;
+    }
+
+    /* Getting L block information */
+    int_t lptr = Remain_info[lb].lptr;
+    int_t ib   = Remain_info[lb].ib;
+    int_t temp_nbrow = lsub[lptr + 1];
+    lptr += LB_DESCRIPTOR;
+    int_t cum_nrow = (lb == 0 ? 0 : Remain_info[lb - 1].FullRow);
+    /* Getting L block information */
+    // int_t lptr = Remain_info[lb].lptr;
+    // int_t ib   = Remain_info[lb].ib;
+    // int_t temp_nbrow = lsub[lptr + 1];
+    // lptr += LB_DESCRIPTOR;
+    // int_t cum_nrow =  Remain_info[lb].StRow;
+    double alpha = 1.0;
+    double beta = 0.0;
+
+    /* calling DGEMM */
+    // printf(" m %d n %d k %d ldu %d ldl %d st_col %d \n",temp_nbrow,ncols,ldu,ldl,st_col );
+#if 1
+    dgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
+          &L_mat[(knsupc - ldu)*ldl + cum_nrow], &ldl,
+          &U_mat[st_col * ldu], &ldu, &beta, tempv1, &temp_nbrow);
+#else
+    // printf("%d %d %d %d  %d %d %d %d\n", temp_nbrow, ncols, ldu,  ldl,st_col,(knsupc - ldu)*ldl + cum_nrow,cum_nrow,st_col);
+
+    cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
+                temp_nbrow, ncols, ldu, alpha,
+                &L_mat[(knsupc - ldu)*ldl + cum_nrow], ldl,
+                &U_mat[st_col * ldu], ldu,
+                beta, tempv1, temp_nbrow);
+#endif
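+    /* the Fortran BLAS interface (dgemm_) is selected above; the CBLAS call
+       in the #else branch is kept for reference only */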
+
+    // printf("SCU update: (%d, %d)\n",ib,jb );
+#ifdef SCATTER_PROFILE
+    unsigned long long ttx = __rdtsc();
+#endif
+    /*Now scattering the block*/
+    if (ib < jb)
+    {
+        SCATTER_U_CPU (
+            ib, jb,
+            nsupc, iukp, xsup,
+            klst, temp_nbrow,
+            lptr, temp_nbrow, lsub,
+            usub, tempv1,
+            indirect_thread,
+            Ufstnz_br_ptr,
+            Unzval_br_ptr,
+            grid
+        );
+    }
+    else
+    {
+        //scatter_l (    Sherry
+        dscatter_l (
+            ib, ljb, nsupc, iukp, xsup, klst, temp_nbrow, lptr,
+            temp_nbrow, usub, lsub, tempv1,
+            indirect_thread, indirect2_thread,
+            Lrowind_bc_ptr, Lnzval_bc_ptr, grid
+        );
+
+    }
+
+    // #pragma omp atomic
+    // stat->ops[FACT] += 2*temp_nbrow*ncols*ldu + temp_nbrow*ncols;
+
+#ifdef SCATTER_PROFILE
+    double t_s = (double) __rdtsc() - ttx;
+    Host_TheadScatterMOP[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
+    += 3.0 * (double ) temp_nbrow * (double ) ncols;
+    Host_TheadScatterTimer[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
+    += t_s;
+#endif
+} /* dblock_gemm_scatter */
+
+/* This version uses a lock to prevent multiple threads from updating the same block. */
+void
+dblock_gemm_scatter_lock( int_t lb, int_t j,
+                         omp_lock_t* lock,
+                         Ublock_info_t *Ublock_info,
+                         Remain_info_t *Remain_info,
+                         double *L_mat, int_t ldl,
+                         double *U_mat, int_t ldu,
+                         double *bigV,
+                         // int_t jj0,
+                         int_t knsupc,  int_t klst,
+                         int_t *lsub, int_t *usub, int_t ldt,
+                         int_t thread_id,
+                         int_t *indirect,
+                         int_t *indirect2,
+                         int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr,
+                         int_t **Ufstnz_br_ptr, double **Unzval_br_ptr,
+                         int_t *xsup, gridinfo_t *grid
+#ifdef SCATTER_PROFILE
+                         , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer
+#endif
+                       )
+{
+    int_t *indirect_thread = indirect + ldt * thread_id;
+    int_t *indirect2_thread = indirect2 + ldt * thread_id;
+    double *tempv1 = bigV + thread_id * ldt * ldt;
+
+    /* Getting U block information */
+
+    int_t iukp =  Ublock_info[j].iukp;
+    int_t jb   =  Ublock_info[j].jb;
+    int_t nsupc = SuperSize(jb);
+    int_t ljb = LBj (jb, grid);
+    int_t st_col = Ublock_info[j].StCol;
+    int_t ncols = Ublock_info[j].ncols;
+
+
+    /* Getting L block information */
+    int_t lptr = Remain_info[lb].lptr;
+    int_t ib   = Remain_info[lb].ib;
+    int_t temp_nbrow = lsub[lptr + 1];
+    lptr += LB_DESCRIPTOR;
+    int_t cum_nrow =  Remain_info[lb].StRow;
+
+    double alpha = 1.0;  double beta = 0.0;
+
+    /* calling DGEMM */
+#if 1
+    // printf(" m %d n %d k %d ldl %d st_col %d \n",temp_nbrow,ncols,ldu,ldl,st_col );
+    dgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
+           &L_mat[(knsupc - ldu)*ldl + cum_nrow], &ldl,
+           &U_mat[st_col * ldu], &ldu, &beta, tempv1, &temp_nbrow);
+#else
+    cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
+                temp_nbrow, ncols, ldu, alpha,
+                &L_mat[(knsupc - ldu)*ldl + cum_nrow], ldl,
+                &U_mat[st_col * ldu], ldu,
+                beta, tempv1, temp_nbrow);
+#endif
+
+    /* spin until the lock guarding the destination block is acquired;
+       a NULL lock means no locking is required */
+    if (lock)       /* lock is not null */
+        while (!omp_test_lock(lock))
+        {
+        }
+
+#ifdef SCATTER_PROFILE
+    unsigned long long ttx = __rdtsc();
+#endif
+    /*Now scattering the block*/
+    if (ib < jb)
+    {
+        SCATTER_U_CPU (
+            ib, jb,
+            nsupc, iukp, xsup,
+            klst, temp_nbrow,
+            lptr, temp_nbrow, lsub,
+            usub, tempv1,
+            indirect_thread,
+            Ufstnz_br_ptr,
+            Unzval_br_ptr,
+            grid
+        );
+    }
+    else
+    {
+        //scatter_l (  Sherry
+        dscatter_l ( 
+            ib, ljb, nsupc, iukp, xsup, klst, temp_nbrow, lptr,
+            temp_nbrow, usub, lsub, tempv1,
+            indirect_thread, indirect2_thread,
+            Lrowind_bc_ptr, Lnzval_bc_ptr, grid
+        );
+
+    }
+
+    if (lock)
+        omp_unset_lock(lock);
+
+#ifdef SCATTER_PROFILE
+    double t_s = (double) __rdtsc() - ttx;
+    Host_TheadScatterMOP[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
+    += 3.0 * (double ) temp_nbrow * (double ) ncols;
+    Host_TheadScatterTimer[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
+    += t_s;
+#endif
+} /* dblock_gemm_scatter_lock */
+
+// The following four variations of the block_gemm_scatter call correspond to the regions below:
+/*
++---------------------------------------+
+|          ||                           |
+|  CPU     ||          CPU+TopRight     |
+|  Top     ||                           |
+|  Left    ||                           |
+|          ||                           |
++---------------------------------------+
++---------------------------------------+
+|          ||        |                  |
+|          ||        |                  |
+|          ||        |                  |
+|  CPU     ||  CPU   |Accelerator       |
+|  Bottom  ||  Bottom|                  |
+|  Left    ||  Right |                  |
+|          ||        |                  |
+|          ||        |                  |
++--------------------+------------------+
+                  jj_cpu
+*/
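+/* Region-to-argument mapping (taken from the calls below): the Top* variants
+ * read the look-ahead L rows (lookAhead_info, lookAhead_L_buff, Lnbrow) and
+ * the Bottom* variants the remaining rows (Remain_info, Remain_L_buff,
+ * Rnbrow); the *Left variants read the host U panel (Ublock_info, bigU_host,
+ * ldu) and the *Right variants the Phi panel (Ublock_info_Phi, bigU_Phi,
+ * ldu_Phi). */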
+
+int_t dblock_gemm_scatterTopLeft( int_t lb, /* block number in L */
+				 int_t j,  /* block number in U */
+                                 double* bigV, int_t knsupc,  int_t klst,
+				 int_t* lsub, int_t * usub, int_t ldt,
+				 int_t* indirect, int_t* indirect2, HyP_t* HyP,
+                                 LUstruct_t *LUstruct,
+                                 gridinfo_t* grid,
+                                 SCT_t*SCT, SuperLUStat_t *stat
+                               )
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    double** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    volatile int_t thread_id = omp_get_thread_num();
+    
+//    printf("Thread's ID %lld \n", thread_id);
+    unsigned long long t1 = _rdtsc();
+    dblock_gemm_scatter( lb, j, HyP->Ublock_info, HyP->lookAhead_info,
+			HyP->lookAhead_L_buff, HyP->Lnbrow,
+                        HyP->bigU_host, HyP->ldu,
+                        bigV, knsupc,  klst, lsub,  usub, ldt, thread_id,
+			indirect, indirect2,
+                        Lrowind_bc_ptr, Lnzval_bc_ptr, Ufstnz_br_ptr, Unzval_br_ptr,
+			xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                        , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
+#endif
+                      );
+    unsigned long long t2 = _rdtsc();
+    SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1);
+    return 0;
+} /* dblock_gemm_scatterTopLeft */
+
+int_t dblock_gemm_scatterTopRight( int_t lb,  int_t j,
+                                  double* bigV, int_t knsupc,  int_t klst, int_t* lsub,
+                                  int_t * usub, int_t ldt,  int_t* indirect, int_t* indirect2,
+                                  HyP_t* HyP,
+                                  LUstruct_t *LUstruct,
+                                  gridinfo_t* grid,
+                                  SCT_t*SCT, SuperLUStat_t *stat
+                                )
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    double** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    volatile int_t thread_id = omp_get_thread_num();
+    unsigned long long t1 = _rdtsc();
+    dblock_gemm_scatter( lb, j, HyP->Ublock_info_Phi, HyP->lookAhead_info, HyP->lookAhead_L_buff, HyP->Lnbrow,
+                        HyP->bigU_Phi, HyP->ldu_Phi,
+                        bigV, knsupc,  klst, lsub,  usub, ldt, thread_id, indirect, indirect2,
+                        Lrowind_bc_ptr, Lnzval_bc_ptr, Ufstnz_br_ptr, Unzval_br_ptr, xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                        , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
+#endif
+                      );
+    unsigned long long t2 = _rdtsc();
+    SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1);
+    return 0;
+} /* dblock_gemm_scatterTopRight */
+
+int_t dblock_gemm_scatterBottomLeft( int_t lb,  int_t j,
+                                    double* bigV, int_t knsupc,  int_t klst, int_t* lsub,
+                                    int_t * usub, int_t ldt,  int_t* indirect, int_t* indirect2,
+                                    HyP_t* HyP,
+                                    LUstruct_t *LUstruct,
+                                    gridinfo_t* grid,
+                                    SCT_t*SCT, SuperLUStat_t *stat
+                                  )
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    double** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    volatile int_t thread_id = omp_get_thread_num();
+    //printf("Thread's ID %lld \n", thread_id);
+    unsigned long long t1 = _rdtsc();
+    dblock_gemm_scatter( lb, j, HyP->Ublock_info, HyP->Remain_info, HyP->Remain_L_buff, HyP->Rnbrow,
+                        HyP->bigU_host, HyP->ldu,
+                        bigV, knsupc,  klst, lsub,  usub, ldt, thread_id, indirect, indirect2,
+                        Lrowind_bc_ptr, Lnzval_bc_ptr, Ufstnz_br_ptr, Unzval_br_ptr, xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                        , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
+#endif
+                      );
+    unsigned long long t2 = _rdtsc();
+    SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1);
+    return 0;
+
+} /* dblock_gemm_scatterBottomLeft */
+
+int_t dblock_gemm_scatterBottomRight( int_t lb,  int_t j,
+                                     double* bigV, int_t knsupc,  int_t klst, int_t* lsub,
+                                     int_t * usub, int_t ldt,  int_t* indirect, int_t* indirect2,
+                                     HyP_t* HyP,
+                                     LUstruct_t *LUstruct,
+                                     gridinfo_t* grid,
+                                     SCT_t*SCT, SuperLUStat_t *stat
+                                   )
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    double** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    volatile int_t thread_id = omp_get_thread_num();
+   // printf("Thread's ID %lld \n", thread_id);
+    unsigned long long t1 = _rdtsc();
+    dblock_gemm_scatter( lb, j, HyP->Ublock_info_Phi, HyP->Remain_info, HyP->Remain_L_buff, HyP->Rnbrow,
+                        HyP->bigU_Phi, HyP->ldu_Phi,
+                        bigV, knsupc,  klst, lsub,  usub, ldt, thread_id, indirect, indirect2,
+                        Lrowind_bc_ptr, Lnzval_bc_ptr, Ufstnz_br_ptr, Unzval_br_ptr, xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                        , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
+#endif
+                      );
+
+    unsigned long long t2 = _rdtsc();
+    SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1);
+    return 0;
+
+} /* dblock_gemm_scatterBottomRight */
+
+/******************************************************************
+ * SHERRY: scatter_l is the same as dscatter_l in dscatter.c
+ *         scatter_u is ALMOST the same as dscatter_u in dscatter.c
+ ******************************************************************/
+#if 0
+void
+scatter_l (int_t ib,
+           int_t ljb,
+           int_t nsupc,
+           int_t iukp,
+           int_t *xsup,
+           int_t klst,
+           int_t nbrow,
+           int_t lptr,
+           int_t temp_nbrow,
+           int_t *usub,
+           int_t *lsub,
+           double *tempv,
+           int_t *indirect_thread, int_t *indirect2,
+           int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr, gridinfo_t *grid)
+{
+    int_t rel, i, segsize, jj;
+    double *nzval;
+    int_t *index = Lrowind_bc_ptr[ljb];
+    int_t ldv = index[1];       /* LDA of the dest lusup. */
+    int_t lptrj = BC_HEADER;
+    int_t luptrj = 0;
+    int_t ijb = index[lptrj];
+
+    while (ijb != ib)
+    {
+        luptrj += index[lptrj + 1];
+        lptrj += LB_DESCRIPTOR + index[lptrj + 1];
+        ijb = index[lptrj];
+    }
+
+
+    /*
+     * Build indirect table. This is needed because the
+     * indices are not sorted for the L blocks.
+     */
+    int_t fnz = FstBlockC (ib);
+    int_t dest_nbrow;
+    lptrj += LB_DESCRIPTOR;
+    dest_nbrow = index[lptrj - 1];
+
+    for (i = 0; i < dest_nbrow; ++i)
+    {
+        rel = index[lptrj + i] - fnz;
+        indirect_thread[rel] = i;
+
+    }
+
+    /* can be precalculated */
+    for (i = 0; i < temp_nbrow; ++i)
+    {
+        rel = lsub[lptr + i] - fnz;
+        indirect2[i] = indirect_thread[rel];
+    }
+
+
+    nzval = Lnzval_bc_ptr[ljb] + luptrj;
+    for (jj = 0; jj < nsupc; ++jj)
+    {
+
+        segsize = klst - usub[iukp + jj];
+        if (segsize)
+        {
+            for (i = 0; i < temp_nbrow; ++i)
+            {
+                nzval[indirect2[i]] -= tempv[i];
+            }
+            tempv += nbrow;
+        }
+        nzval += ldv;
+    }
+
+} /* scatter_l */
+#endif // comment out
+
+static void   // SHERRY: ALMOST the same as dscatter_u in dscatter.c
+scatter_u (int_t ib,
+           int_t jb,
+           int_t nsupc,
+           int_t iukp,
+           int_t *xsup,
+           int_t klst,
+           int_t nbrow,
+           int_t lptr,
+           int_t temp_nbrow,
+           int_t *lsub,
+           int_t *usub,
+           double *tempv,
+           int_t *indirect,
+           int_t **Ufstnz_br_ptr, double **Unzval_br_ptr, gridinfo_t *grid)
+{
+#ifdef PI_DEBUG
+    printf ("A(%d,%d) goes to U block \n", ib, jb);
+#endif
+    int_t jj, i, fnz;
+    int_t segsize;
+    double *ucol;
+    int_t ilst = FstBlockC (ib + 1);
+    int_t lib = LBi (ib, grid);
+    int_t *index = Ufstnz_br_ptr[lib];
+
+    /* reinitialize the pointer to each row of U */
+    int_t iuip_lib, ruip_lib;
+    iuip_lib = BR_HEADER;
+    ruip_lib = 0;
+
+    int_t ijb = index[iuip_lib];
+    while (ijb < jb)            /* Search for dest block. */
+    {
+        ruip_lib += index[iuip_lib + 1];
+
+        iuip_lib += UB_DESCRIPTOR + SuperSize (ijb);
+        ijb = index[iuip_lib];
+    }
+    /* Skip descriptor.  Now point to the fstnz index of
+       block U(i,j). */
+
+    for (i = 0; i < temp_nbrow; ++i)
+    {
+        indirect[i] = lsub[lptr + i] ;
+    }
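+    /* Unlike the L case, no two-level indirect table is needed here: each
+       U column stores a dense row segment, so rows are addressed directly
+       by index (after the fnz shift applied below). */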
+
+
+    iuip_lib += UB_DESCRIPTOR;
+
+    ucol = &Unzval_br_ptr[lib][ruip_lib];
+    for (jj = 0; jj < nsupc; ++jj)
+    {
+        segsize = klst - usub[iukp + jj];
+        fnz = index[iuip_lib++];
+        ucol -= fnz;
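+        /* After this shift, ucol[r] addresses row r of the current column;
+           the column stores rows fnz..ilst-1, so the net advance per column
+           (ucol += ilst below) equals its stored length, ilst - fnz. */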
+        if (segsize)            /* Nonzero segment in U(k,j). */
+        {
+            for (i = 0; i < temp_nbrow; ++i)
+            {
+                ucol[indirect[i]] -= tempv[i];
+            }                   /* for i=0..temp_nbrow */
+            tempv += nbrow;
+
+        } /*if segsize */
+        ucol += ilst ;
+
+    } /*for jj=0:nsupc */
+
+} /* scatter_u */
+
+
diff --git a/SRC/dtreeFactorization.c b/SRC/dtreeFactorization.c
index dc459d15..d1c9aed4 100644
--- a/SRC/dtreeFactorization.c
+++ b/SRC/dtreeFactorization.c
@@ -205,7 +205,7 @@ int_t ddenseTreeFactor(
                 int_t *lsub = lPanelInfo->lsub;
                 int_t *usub = uPanelInfo->usub;
                 int_t thread_id = omp_get_thread_num();
-                block_gemm_scatter( lb, ub,
+                dblock_gemm_scatter( lb, ub,
                                     Ublock_info,
                                     Remain_info,
                                     &L_mat[luptr0], ldl,
@@ -462,7 +462,7 @@ int_t dsparseTreeFactor_ASYNC(
                     int_t j   = ij / HyP->lookAheadBlk; 
 							   
                     int_t lb  = ij % HyP->lookAheadBlk;
-                    block_gemm_scatterTopLeft( lb,  j, bigV, knsupc, klst, lsub,
+                    dblock_gemm_scatterTopLeft( lb,  j, bigV, knsupc, klst, lsub,
 					       usub, ldt,  indirect, indirect2, HyP,
 					       LUstruct, grid, SCT, stat );
                 }
@@ -472,7 +472,7 @@ int_t dsparseTreeFactor_ASYNC(
                 {
                     int_t j   = ij / HyP->lookAheadBlk ;
                     int_t lb  = ij % HyP->lookAheadBlk;
-                    block_gemm_scatterTopRight( lb,  j, bigV, knsupc, klst, lsub,
+                    dblock_gemm_scatterTopRight( lb,  j, bigV, knsupc, klst, lsub,
                                                 usub, ldt,  indirect, indirect2, HyP,
 						LUstruct, grid, SCT, stat);
                 }
@@ -482,7 +482,7 @@ int_t dsparseTreeFactor_ASYNC(
                 {
                     int_t j   = ij / HyP->RemainBlk;
                     int_t lb  = ij % HyP->RemainBlk;
-                    block_gemm_scatterBottomLeft( lb,  j, bigV, knsupc, klst, lsub,
+                    dblock_gemm_scatterBottomLeft( lb,  j, bigV, knsupc, klst, lsub,
                                                   usub, ldt,  indirect, indirect2,
 						  HyP, LUstruct, grid, SCT, stat);
                 } /*for (int_t ij =*/
@@ -526,7 +526,7 @@ int_t dsparseTreeFactor_ASYNC(
                 {
                     int_t j   = ij / HyP->RemainBlk + jj_cpu;
                     int_t lb  = ij % HyP->RemainBlk;
-                    block_gemm_scatterBottomRight( lb,  j, bigV, knsupc, klst, lsub,
+                    dblock_gemm_scatterBottomRight( lb,  j, bigV, knsupc, klst, lsub,
                                                    usub, ldt,  indirect, indirect2,
 						   HyP, LUstruct, grid, SCT, stat);
                 } /*for (int_t ij =*/
diff --git a/SRC/scatter.c b/SRC/scatter.c
index b26d49dd..6612bba5 100644
--- a/SRC/scatter.c
+++ b/SRC/scatter.c
@@ -512,9 +512,7 @@ scatter_l (int_t ib,
         {
             for (i = 0; i < temp_nbrow; ++i)
             {
-
                 nzval[indirect2[i]] -= tempv[i];
-
             }
             tempv += nbrow;
         }
diff --git a/SRC/superlu_ddefs.h b/SRC/superlu_ddefs.h
index 6b86006e..493f6f8e 100644
--- a/SRC/superlu_ddefs.h
+++ b/SRC/superlu_ddefs.h
@@ -383,12 +383,24 @@ extern void LUstructInit(const int_t, LUstruct_t *);
 extern void LUstructFree(LUstruct_t *);
 extern void Destroy_LU(int_t, gridinfo_t *, LUstruct_t *);
 extern void Destroy_Tree(int_t, gridinfo_t *, LUstruct_t *);
+extern void dscatter_l (int ib, int ljb, int nsupc, int_t iukp, int_t* xsup,
+			int klst, int nbrow, int_t lptr, int temp_nbrow,
+			int_t* usub, int_t* lsub, double *tempv,
+			int* indirect_thread, int* indirect2,
+			int_t ** Lrowind_bc_ptr, double **Lnzval_bc_ptr,
+			gridinfo_t * grid);
+extern void dscatter_u (int ib, int jb, int nsupc, int_t iukp, int_t * xsup,
+                        int klst, int nbrow, int_t lptr, int temp_nbrow,
+                        int_t* lsub, int_t* usub, double* tempv,
+                        int_t ** Ufstnz_br_ptr, double **Unzval_br_ptr,
+                        gridinfo_t * grid);
+extern int_t pdgstrf(superlu_dist_options_t *, int, int, double,
+		    LUstruct_t*, gridinfo_t*, SuperLUStat_t*, int*);
 
 /* #define GPU_PROF
 #define IPM_PROF */
 
-extern int_t pdgstrf(superlu_dist_options_t *, int, int, double,
-		    LUstruct_t*, gridinfo_t*, SuperLUStat_t*, int*);
+/* Solve related */
 extern void pdgstrs_Bglobal(int_t, LUstruct_t *, gridinfo_t *,
 			     double *, int_t, int, SuperLUStat_t *, int *);
 extern void pdgstrs(int_t, LUstruct_t *, ScalePermstruct_t *, gridinfo_t *,
@@ -545,81 +557,68 @@ extern int updateDirtyBit(int_t k0, HyP_t* HyP, gridinfo_t* grid);
 
     /* from scatter.h */
 extern void
-block_gemm_scatter( int_t lb, int_t j,
-                    Ublock_info_t *Ublock_info,
-                    Remain_info_t *Remain_info,
-                    double *L_mat, int_t ldl,
-                    double *U_mat, int_t ldu,
-                    double *bigV,
+dblock_gemm_scatter( int_t lb, int_t j, Ublock_info_t *Ublock_info,
+                    Remain_info_t *Remain_info, double *L_mat, int_t ldl,
+                    double *U_mat, int_t ldu,  double *bigV,
                     // int_t jj0,
                     int_t knsupc,  int_t klst,
                     int_t *lsub, int_t *usub, int_t ldt,
                     int_t thread_id,
-                    int_t *indirect,
-                    int_t *indirect2,
+                    int_t *indirect, int_t *indirect2,
                     int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr,
                     int_t **Ufstnz_br_ptr, double **Unzval_br_ptr,
-                    int_t *xsup, gridinfo_t *grid,
-                    SuperLUStat_t *stat
+                    int_t *xsup, gridinfo_t *, SuperLUStat_t *
 #ifdef SCATTER_PROFILE
                     , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer
 #endif
                   );
 /*this version uses a lock to prevent multiple thread updating the same block*/
-void
-block_gemm_scatter_lock( int_t lb, int_t j,
-                         omp_lock_t* lock,
-                         Ublock_info_t *Ublock_info,
-                         Remain_info_t *Remain_info,
-                         double *L_mat, int_t ldl,
-                         double *U_mat, int_t ldu,
+extern void
+dblock_gemm_scatter_lock( int_t lb, int_t j, omp_lock_t* lock,
+                         Ublock_info_t *Ublock_info,  Remain_info_t *Remain_info,
+                         double *L_mat, int_t ldl, double *U_mat, int_t ldu,
                          double *bigV,
                          // int_t jj0,
                          int_t knsupc,  int_t klst,
                          int_t *lsub, int_t *usub, int_t ldt,
                          int_t thread_id,
-                         int_t *indirect,
-                         int_t *indirect2,
+                         int_t *indirect, int_t *indirect2,
                          int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr,
                          int_t **Ufstnz_br_ptr, double **Unzval_br_ptr,
-                         int_t *xsup, gridinfo_t *grid
+                         int_t *xsup, gridinfo_t *
 #ifdef SCATTER_PROFILE
                          , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer
 #endif
                        );
-
-int_t block_gemm_scatterTopLeft( int_t lb,  int_t j,
-                                 double* bigV, int_t knsupc,  int_t klst, int_t* lsub,
-                                 int_t * usub, int_t ldt,  int_t* indirect, int_t* indirect2,
-                                 HyP_t* HyP,
-                                 LUstruct_t *LUstruct,
-                                 gridinfo_t* grid,
-                                 SCT_t*SCT, SuperLUStat_t *stat
+extern int_t
+dblock_gemm_scatterTopLeft( int_t lb,  int_t j, double* bigV,
+				 int_t knsupc,  int_t klst, int_t* lsub,
+                                 int_t * usub, int_t ldt,
+				 int_t* indirect, int_t* indirect2,
+                                 HyP_t* HyP, LUstruct_t *, gridinfo_t*,
+                                 SCT_t*SCT, SuperLUStat_t *
                                );
-int_t block_gemm_scatterTopRight( int_t lb,  int_t j,
-                                  double* bigV, int_t knsupc,  int_t klst, int_t* lsub,
-                                  int_t * usub, int_t ldt,  int_t* indirect, int_t* indirect2,
-                                  HyP_t* HyP,
-                                  LUstruct_t *LUstruct,
-                                  gridinfo_t* grid,
-                                  SCT_t*SCT, SuperLUStat_t *stat
-                                );
-int_t block_gemm_scatterBottomLeft( int_t lb,  int_t j,
-                                    double* bigV, int_t knsupc,  int_t klst, int_t* lsub,
-                                    int_t * usub, int_t ldt,  int_t* indirect, int_t* indirect2,
-                                    HyP_t* HyP,
-                                    LUstruct_t *LUstruct,
-                                    gridinfo_t* grid,
-                                    SCT_t*SCT, SuperLUStat_t *stat
-                                  );
-int_t block_gemm_scatterBottomRight( int_t lb,  int_t j,
-                                     double* bigV, int_t knsupc,  int_t klst, int_t* lsub,
-                                     int_t * usub, int_t ldt,  int_t* indirect, int_t* indirect2,
-                                     HyP_t* HyP,
-                                     LUstruct_t *LUstruct,
-                                     gridinfo_t* grid,
-                                     SCT_t*SCT, SuperLUStat_t *stat
-                                   );
+extern int_t 
+dblock_gemm_scatterTopRight( int_t lb,  int_t j, double* bigV,
+				  int_t knsupc,  int_t klst, int_t* lsub,
+                                  int_t * usub, int_t ldt,
+				  int_t* indirect, int_t* indirect2,
+                                  HyP_t* HyP, LUstruct_t *, gridinfo_t*,
+                                  SCT_t*SCT, SuperLUStat_t * );
+extern int_t
+dblock_gemm_scatterBottomLeft( int_t lb,  int_t j, double* bigV,
+				    int_t knsupc,  int_t klst, int_t* lsub,
+                                    int_t * usub, int_t ldt, 
+				    int_t* indirect, int_t* indirect2,
+                                    HyP_t* HyP, LUstruct_t *, gridinfo_t*,
+                                    SCT_t*SCT, SuperLUStat_t * );
+extern int_t 
+dblock_gemm_scatterBottomRight( int_t lb,  int_t j, double* bigV,
+				     int_t knsupc,  int_t klst, int_t* lsub,
+                                     int_t * usub, int_t ldt,
+				     int_t* indirect, int_t* indirect2,
+                                     HyP_t* HyP, LUstruct_t *, gridinfo_t*,
+                                     SCT_t*SCT, SuperLUStat_t * );
 
     /* from gather.h */
 extern void dgather_u(int_t num_u_blks,
@@ -978,7 +977,7 @@ extern int_t checkRecvUDiag(int_t k, commRequests_t *comReqs,
 			    gridinfo_t *grid, SCT_t *SCT);
 extern int_t checkRecvLDiag(int_t k, commRequests_t *comReqs, gridinfo_t *, SCT_t *);
 
-    /* from ancFactorization.h */
+    /* from ancFactorization.h (not called) */
 extern int_t ancestorFactor(
     int_t ilvl,             // level of factorization 
     sForest_t* sforest,
diff --git a/SRC/superlu_zdefs.h b/SRC/superlu_zdefs.h
index 42a4f128..2341ac7a 100644
--- a/SRC/superlu_zdefs.h
+++ b/SRC/superlu_zdefs.h
@@ -383,12 +383,24 @@ extern void LUstructInit(const int_t, LUstruct_t *);
 extern void LUstructFree(LUstruct_t *);
 extern void Destroy_LU(int_t, gridinfo_t *, LUstruct_t *);
 extern void Destroy_Tree(int_t, gridinfo_t *, LUstruct_t *);
+extern void zscatter_l (int ib, int ljb, int nsupc, int_t iukp, int_t* xsup,
+			int klst, int nbrow, int_t lptr, int temp_nbrow,
+			int_t* usub, int_t* lsub, doublecomplex *tempv,
+			int* indirect_thread, int* indirect2,
+			int_t ** Lrowind_bc_ptr, doublecomplex **Lnzval_bc_ptr,
+			gridinfo_t * grid);
+extern void zscatter_u (int ib, int jb, int nsupc, int_t iukp, int_t * xsup,
+                        int klst, int nbrow, int_t lptr, int temp_nbrow,
+                        int_t* lsub, int_t* usub, doublecomplex* tempv,
+                        int_t ** Ufstnz_br_ptr, doublecomplex **Unzval_br_ptr,
+                        gridinfo_t * grid);
+extern int_t pzgstrf(superlu_dist_options_t *, int, int, double,
+		    LUstruct_t*, gridinfo_t*, SuperLUStat_t*, int*);
 
 /* #define GPU_PROF
 #define IPM_PROF */
 
-extern int_t pzgstrf(superlu_dist_options_t *, int, int, double,
-		    LUstruct_t*, gridinfo_t*, SuperLUStat_t*, int*);
+/* Solve related */
 extern void pzgstrs_Bglobal(int_t, LUstruct_t *, gridinfo_t *,
 			     doublecomplex *, int_t, int, SuperLUStat_t *, int *);
 extern void pzgstrs(int_t, LUstruct_t *, ScalePermstruct_t *, gridinfo_t *,
@@ -548,81 +560,68 @@ extern int updateDirtyBit(int_t k0, HyP_t* HyP, gridinfo_t* grid);
 
     /* from scatter.h */
 extern void
-block_gemm_scatter( int_t lb, int_t j,
-                    Ublock_info_t *Ublock_info,
-                    Remain_info_t *Remain_info,
-                    doublecomplex *L_mat, int_t ldl,
-                    doublecomplex *U_mat, int_t ldu,
-                    doublecomplex *bigV,
+zblock_gemm_scatter( int_t lb, int_t j, Ublock_info_t *Ublock_info,
+                    Remain_info_t *Remain_info, doublecomplex *L_mat, int_t ldl,
+                    doublecomplex *U_mat, int_t ldu,  doublecomplex *bigV,
                     // int_t jj0,
                     int_t knsupc,  int_t klst,
                     int_t *lsub, int_t *usub, int_t ldt,
                     int_t thread_id,
-                    int_t *indirect,
-                    int_t *indirect2,
+                    int_t *indirect, int_t *indirect2,
                     int_t **Lrowind_bc_ptr, doublecomplex **Lnzval_bc_ptr,
                     int_t **Ufstnz_br_ptr, doublecomplex **Unzval_br_ptr,
-                    int_t *xsup, gridinfo_t *grid,
-                    SuperLUStat_t *stat
+                    int_t *xsup, gridinfo_t *, SuperLUStat_t *
 #ifdef SCATTER_PROFILE
                     , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer
 #endif
                   );
 /*this version uses a lock to prevent multiple thread updating the same block*/
-void
-block_gemm_scatter_lock( int_t lb, int_t j,
-                         omp_lock_t* lock,
-                         Ublock_info_t *Ublock_info,
-                         Remain_info_t *Remain_info,
-                         doublecomplex *L_mat, int_t ldl,
-                         doublecomplex *U_mat, int_t ldu,
+extern void
+zblock_gemm_scatter_lock( int_t lb, int_t j, omp_lock_t* lock,
+                         Ublock_info_t *Ublock_info,  Remain_info_t *Remain_info,
+                         doublecomplex *L_mat, int_t ldl, doublecomplex *U_mat, int_t ldu,
                          doublecomplex *bigV,
                          // int_t jj0,
                          int_t knsupc,  int_t klst,
                          int_t *lsub, int_t *usub, int_t ldt,
                          int_t thread_id,
-                         int_t *indirect,
-                         int_t *indirect2,
+                         int_t *indirect, int_t *indirect2,
                          int_t **Lrowind_bc_ptr, doublecomplex **Lnzval_bc_ptr,
                          int_t **Ufstnz_br_ptr, doublecomplex **Unzval_br_ptr,
-                         int_t *xsup, gridinfo_t *grid
+                         int_t *xsup, gridinfo_t *
 #ifdef SCATTER_PROFILE
                          , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer
 #endif
                        );
-
-int_t block_gemm_scatterTopLeft( int_t lb,  int_t j,
-                                 doublecomplex* bigV, int_t knsupc,  int_t klst, int_t* lsub,
-                                 int_t * usub, int_t ldt,  int_t* indirect, int_t* indirect2,
-                                 HyP_t* HyP,
-                                 LUstruct_t *LUstruct,
-                                 gridinfo_t* grid,
-                                 SCT_t*SCT, SuperLUStat_t *stat
+extern int_t
+zblock_gemm_scatterTopLeft( int_t lb,  int_t j, doublecomplex* bigV,
+				 int_t knsupc,  int_t klst, int_t* lsub,
+                                 int_t * usub, int_t ldt,
+				 int_t* indirect, int_t* indirect2,
+                                 HyP_t* HyP, LUstruct_t *, gridinfo_t*,
+                                 SCT_t*SCT, SuperLUStat_t *
                                );
-int_t block_gemm_scatterTopRight( int_t lb,  int_t j,
-                                  doublecomplex* bigV, int_t knsupc,  int_t klst, int_t* lsub,
-                                  int_t * usub, int_t ldt,  int_t* indirect, int_t* indirect2,
-                                  HyP_t* HyP,
-                                  LUstruct_t *LUstruct,
-                                  gridinfo_t* grid,
-                                  SCT_t*SCT, SuperLUStat_t *stat
-                                );
-int_t block_gemm_scatterBottomLeft( int_t lb,  int_t j,
-                                    doublecomplex* bigV, int_t knsupc,  int_t klst, int_t* lsub,
-                                    int_t * usub, int_t ldt,  int_t* indirect, int_t* indirect2,
-                                    HyP_t* HyP,
-                                    LUstruct_t *LUstruct,
-                                    gridinfo_t* grid,
-                                    SCT_t*SCT, SuperLUStat_t *stat
-                                  );
-int_t block_gemm_scatterBottomRight( int_t lb,  int_t j,
-                                     doublecomplex* bigV, int_t knsupc,  int_t klst, int_t* lsub,
-                                     int_t * usub, int_t ldt,  int_t* indirect, int_t* indirect2,
-                                     HyP_t* HyP,
-                                     LUstruct_t *LUstruct,
-                                     gridinfo_t* grid,
-                                     SCT_t*SCT, SuperLUStat_t *stat
-                                   );
+extern int_t 
+zblock_gemm_scatterTopRight( int_t lb,  int_t j, doublecomplex* bigV,
+				  int_t knsupc,  int_t klst, int_t* lsub,
+                                  int_t * usub, int_t ldt,
+				  int_t* indirect, int_t* indirect2,
+                                  HyP_t* HyP, LUstruct_t *, gridinfo_t*,
+                                  SCT_t*SCT, SuperLUStat_t * );
+extern int_t
+zblock_gemm_scatterBottomLeft( int_t lb,  int_t j, doublecomplex* bigV,
+				    int_t knsupc,  int_t klst, int_t* lsub,
+                                    int_t * usub, int_t ldt, 
+				    int_t* indirect, int_t* indirect2,
+                                    HyP_t* HyP, LUstruct_t *, gridinfo_t*,
+                                    SCT_t*SCT, SuperLUStat_t * );
+extern int_t 
+zblock_gemm_scatterBottomRight( int_t lb,  int_t j, doublecomplex* bigV,
+				     int_t knsupc,  int_t klst, int_t* lsub,
+                                     int_t * usub, int_t ldt,
+				     int_t* indirect, int_t* indirect2,
+                                     HyP_t* HyP, LUstruct_t *, gridinfo_t*,
+                                     SCT_t*SCT, SuperLUStat_t * );
 
     /* from gather.h */
 extern void zgather_u(int_t num_u_blks,
@@ -981,7 +980,7 @@ extern int_t checkRecvUDiag(int_t k, commRequests_t *comReqs,
 			    gridinfo_t *grid, SCT_t *SCT);
 extern int_t checkRecvLDiag(int_t k, commRequests_t *comReqs, gridinfo_t *, SCT_t *);
 
-    /* from ancFactorization.h */
+    /* from ancFactorization.h (not called) */
 extern int_t ancestorFactor(
     int_t ilvl,             // level of factorization 
     sForest_t* sforest,
diff --git a/SRC/zscatter.c b/SRC/zscatter.c
index 82cae7f2..908d5eeb 100644
--- a/SRC/zscatter.c
+++ b/SRC/zscatter.c
@@ -107,7 +107,7 @@ zscatter_l_1 (int ib,
     // TAU_STATIC_TIMER_STOP("SCATTER_LB");
 } /* zscatter_l_1 */
 
-static void
+void
 zscatter_l (
            int ib,    /* row block number of source block L(i,k) */
            int ljb,   /* local column block number of dest. block L(i,j) */
@@ -189,7 +189,7 @@ zscatter_l (
 } /* zscatter_l */
 
 
-static void
+void
 zscatter_u (int ib,
            int jb,
            int nsupc,
diff --git a/SRC/zscatter3d.c b/SRC/zscatter3d.c
new file mode 100644
index 00000000..da54583d
--- /dev/null
+++ b/SRC/zscatter3d.c
@@ -0,0 +1,611 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+#include "superlu_zdefs.h"
+//#include "scatter.h"
+//#include "compiler.h"
+
+#ifdef __INTEL_COMPILER
+#include "mkl.h"
+#else
+#include "cblas.h"
+#endif
+#include "omp.h"
+
+#define ISORT
+#define SCATTER_U_CPU  scatter_u
+
+static void scatter_u (int_t ib, int_t jb, int_t nsupc, int_t iukp, int_t *xsup,
+                 int_t klst, int_t nbrow, int_t lptr, int_t temp_nbrow,
+ 		 int_t *lsub, int_t *usub, doublecomplex *tempv,
+		 int_t *indirect,
+           	 int_t **Ufstnz_br_ptr, doublecomplex **Unzval_br_ptr, gridinfo_t *grid);
+
+
+#if 0 /**** Sherry: this routine is moved to util.c ****/
+void
+arrive_at_ublock (int_t j,      //block number
+                  int_t *iukp,  // output
+                  int_t *rukp, int_t *jb,   /* Global block number of block U(k,j). */
+                  int_t *ljb,   /* Local block number of U(k,j). */
+                  int_t *nsupc,     /*supernode size of destination block */
+                  int_t iukp0,  //input
+                  int_t rukp0, int_t *usub,     /*usub scripts */
+                  int_t *perm_u,    /*permutation matrix */
+                  int_t *xsup,  /*for SuperSize and LBj */
+                  gridinfo_t *grid)
+{
+    int_t jj;
+    *iukp = iukp0;
+    *rukp = rukp0;
+
+#ifdef ISORT
+    for (jj = 0; jj < perm_u[j]; jj++)
+#else
+    for (jj = 0; jj < perm_u[2 * j + 1]; jj++)
+#endif
+    {
+
+        *jb = usub[*iukp];      /* Global block number of block U(k,j). */
+        *nsupc = SuperSize (*jb);
+        *iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */
+        *rukp += usub[*iukp - 1];   /* Move to block U(k,j+1) */
+        *iukp += *nsupc;
+    }
+
+    /* reinitialize the pointers to the beginning of the */
+    /* kth column/row of L/U factors                     */
+    *jb = usub[*iukp];          /* Global block number of block U(k,j). */
+    *ljb = LBj (*jb, grid);     /* Local block number of U(k,j). */
+    *nsupc = SuperSize (*jb);
+    *iukp += UB_DESCRIPTOR;     /* Start fstnz of block U(k,j). */
+}
+#endif
+/*--------------------------------------------------------------*/
+
+void
+zblock_gemm_scatter( int_t lb, int_t j,
+                    Ublock_info_t *Ublock_info,
+                    Remain_info_t *Remain_info,
+                    doublecomplex *L_mat, int_t ldl,
+                    doublecomplex *U_mat, int_t ldu,
+                    doublecomplex *bigV,
+                    // int_t jj0,
+                    int_t knsupc,  int_t klst,
+                    int_t *lsub, int_t *usub, int_t ldt,
+                    int_t thread_id,
+                    int_t *indirect,
+                    int_t *indirect2,
+                    int_t **Lrowind_bc_ptr, doublecomplex **Lnzval_bc_ptr,
+                    int_t **Ufstnz_br_ptr, doublecomplex **Unzval_br_ptr,
+                    int_t *xsup, gridinfo_t *grid,
+                    SuperLUStat_t *stat
+#ifdef SCATTER_PROFILE
+                    , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer
+#endif
+                  )
+{
+    // return ;
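+    /* The thread_id argument is overwritten below with the OpenMP runtime
+       value; each thread then works on its own ldt-long slices of indirect
+       and indirect2 and its own ldt x ldt tile of bigV, so this scratch
+       space needs no locking. */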
+    thread_id = omp_get_thread_num();
+    int_t *indirect_thread = indirect + ldt * thread_id;
+    int_t *indirect2_thread = indirect2 + ldt * thread_id;
+    doublecomplex *tempv1 = bigV + thread_id * ldt * ldt;
+
+    /* Getting U block information */
+
+    int_t iukp =  Ublock_info[j].iukp;
+    int_t jb   =  Ublock_info[j].jb;
+    int_t nsupc = SuperSize(jb);
+    int_t ljb = LBj (jb, grid);
+    int_t st_col;
+    int_t ncols;
+    // if (j > jj0)
+    if (j > 0)
+    {
+        ncols  = Ublock_info[j].full_u_cols - Ublock_info[j - 1].full_u_cols;
+        st_col = Ublock_info[j - 1].full_u_cols;
+    }
+    else
+    {
+        ncols  = Ublock_info[j].full_u_cols;
+        st_col = 0;
+    }
+
+    /* Getting L block information */
+    int_t lptr = Remain_info[lb].lptr;
+    int_t ib   = Remain_info[lb].ib;
+    int_t temp_nbrow = lsub[lptr + 1];
+    lptr += LB_DESCRIPTOR;
+    int_t cum_nrow = (lb == 0 ? 0 : Remain_info[lb - 1].FullRow);
+    /* Getting L block information */
+    // int_t lptr = Remain_info[lb].lptr;
+    // int_t ib   = Remain_info[lb].ib;
+    // int_t temp_nbrow = lsub[lptr + 1];
+    // lptr += LB_DESCRIPTOR;
+    // int_t cum_nrow =  Remain_info[lb].StRow;
+    doublecomplex alpha = {1.0, 0.0}, beta = {0.0, 0.0};
+
+    /* calling ZGEMM */
+    // printf(" m %d n %d k %d ldu %d ldl %d st_col %d \n",temp_nbrow,ncols,ldu,ldl,st_col );
+#if 1
+    zgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
+          &L_mat[(knsupc - ldu)*ldl + cum_nrow], &ldl,
+          &U_mat[st_col * ldu], &ldu, &beta, tempv1, &temp_nbrow);
+#else
+    // printf("%d %d %d %d  %d %d %d %d\n", temp_nbrow, ncols, ldu,  ldl,st_col,(knsupc - ldu)*ldl + cum_nrow,cum_nrow,st_col);
+
+    cblas_zgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
+                temp_nbrow, ncols, ldu, &alpha,
+                &L_mat[(knsupc - ldu)*ldl + cum_nrow], ldl,
+                &U_mat[st_col * ldu], ldu,
+                &beta, tempv1, temp_nbrow);
+#endif
+
+    // printf("SCU update: (%d, %d)\n",ib,jb );
+#ifdef SCATTER_PROFILE
+    unsigned long long ttx = __rdtsc();
+#endif
+    /*Now scattering the block*/
+    if (ib < jb)
+    {
+        SCATTER_U_CPU (
+            ib, jb,
+            nsupc, iukp, xsup,
+            klst, temp_nbrow,
+            lptr, temp_nbrow, lsub,
+            usub, tempv1,
+            indirect_thread,
+            Ufstnz_br_ptr,
+            Unzval_br_ptr,
+            grid
+        );
+    }
+    else
+    {
+        //scatter_l (    Sherry
+        zscatter_l (
+            ib, ljb, nsupc, iukp, xsup, klst, temp_nbrow, lptr,
+            temp_nbrow, usub, lsub, tempv1,
+            indirect_thread, indirect2_thread,
+            Lrowind_bc_ptr, Lnzval_bc_ptr, grid
+        );
+
+    }
+
+    // #pragma omp atomic
+    // stat->ops[FACT] += 2*temp_nbrow*ncols*ldu + temp_nbrow*ncols;
+
+#ifdef SCATTER_PROFILE
+    double t_s = (double) __rdtsc() - ttx;
+    Host_TheadScatterMOP[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
+    += 3.0 * (double ) temp_nbrow * (double ) ncols;
+    Host_TheadScatterTimer[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
+    += t_s;
+#endif
+} /* zblock_gemm_scatter */
+
+/*this version uses a lock to prevent multiple threads from updating the same block*/
+void
+zblock_gemm_scatter_lock( int_t lb, int_t j,
+                         omp_lock_t* lock,
+                         Ublock_info_t *Ublock_info,
+                         Remain_info_t *Remain_info,
+                         doublecomplex *L_mat, int_t ldl,
+                         doublecomplex *U_mat, int_t ldu,
+                         doublecomplex *bigV,
+                         // int_t jj0,
+                         int_t knsupc,  int_t klst,
+                         int_t *lsub, int_t *usub, int_t ldt,
+                         int_t thread_id,
+                         int_t *indirect,
+                         int_t *indirect2,
+                         int_t **Lrowind_bc_ptr, doublecomplex **Lnzval_bc_ptr,
+                         int_t **Ufstnz_br_ptr, doublecomplex **Unzval_br_ptr,
+                         int_t *xsup, gridinfo_t *grid
+#ifdef SCATTER_PROFILE
+                         , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer
+#endif
+                       )
+{
+    int_t *indirect_thread = indirect + ldt * thread_id;
+    int_t *indirect2_thread = indirect2 + ldt * thread_id;
+    doublecomplex *tempv1 = bigV + thread_id * ldt * ldt;
+
+    /* Getting U block information */
+
+    int_t iukp =  Ublock_info[j].iukp;
+    int_t jb   =  Ublock_info[j].jb;
+    int_t nsupc = SuperSize(jb);
+    int_t ljb = LBj (jb, grid);
+    int_t st_col = Ublock_info[j].StCol;
+    int_t ncols = Ublock_info[j].ncols;
+
+
+    /* Getting L block information */
+    int_t lptr = Remain_info[lb].lptr;
+    int_t ib   = Remain_info[lb].ib;
+    int_t temp_nbrow = lsub[lptr + 1];
+    lptr += LB_DESCRIPTOR;
+    int_t cum_nrow =  Remain_info[lb].StRow;
+
+    doublecomplex alpha = {1.0, 0.0}, beta = {0.0, 0.0};
+
+    /* calling ZGEMM */
+#if 1
+    // printf(" m %d n %d k %d ldl %d st_col %d \n",temp_nbrow,ncols,ldu,ldl,st_col );
+    zgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
+           &L_mat[(knsupc - ldu)*ldl + cum_nrow], &ldl,
+           &U_mat[st_col * ldu], &ldu, &beta, tempv1, &temp_nbrow);
+#else
+    cblas_zgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
+                temp_nbrow, ncols, ldu, &alpha,
+                &L_mat[(knsupc - ldu)*ldl + cum_nrow], ldl,
+                &U_mat[st_col * ldu], ldu,
+                &beta, tempv1, temp_nbrow);
+#endif
+
+    /*try to get the lock for the block*/
+    if (lock)       /*lock is not null*/
+        while (!omp_test_lock(lock))
+        {
+        }
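+    /* Note: this is a busy-wait on omp_test_lock; a blocking omp_set_lock
+       would be an alternative if contention were long-lived. */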
+
+#ifdef SCATTER_PROFILE
+    unsigned long long ttx = __rdtsc();
+#endif
+    /*Now scattering the block*/
+    if (ib < jb)
+    {
+        SCATTER_U_CPU (
+            ib, jb,
+            nsupc, iukp, xsup,
+            klst, temp_nbrow,
+            lptr, temp_nbrow, lsub,
+            usub, tempv1,
+            indirect_thread,
+            Ufstnz_br_ptr,
+            Unzval_br_ptr,
+            grid
+        );
+    }
+    else
+    {
+        //scatter_l (  Sherry
+        zscatter_l ( 
+            ib, ljb, nsupc, iukp, xsup, klst, temp_nbrow, lptr,
+            temp_nbrow, usub, lsub, tempv1,
+            indirect_thread, indirect2_thread,
+            Lrowind_bc_ptr, Lnzval_bc_ptr, grid
+        );
+
+    }
+
+    if (lock)
+        omp_unset_lock(lock);
+
+#ifdef SCATTER_PROFILE
+    double t_s = (double) __rdtsc() - ttx;
+    Host_TheadScatterMOP[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
+    += 3.0 * (double ) temp_nbrow * (double ) ncols;
+    Host_TheadScatterTimer[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
+    += t_s;
+#endif
+} /* zblock_gemm_scatter_lock */
+
+// There are four variants of the block_gemm_scatter call, one per region shown below:
+/*
++---------------------------------------+
+|          ||                           |
+|  CPU     ||          CPU+TopRight     |
+|  Top     ||                           |
+|  Left    ||                           |
+|          ||                           |
++---------------------------------------+
++---------------------------------------+
+|          ||        |                  |
+|          ||        |                  |
+|          ||        |                  |
+|  CPU     ||  CPU   |Accelerator       |
+|  Bottom  ||  Bottom|                  |
+|  Left    ||  Right |                  |
+|          ||        |                  |
+|          ||        |                  |
++--------------------+------------------+
+                  jj_cpu
+*/
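+
+/* For reference, the {d,z}sparseTreeFactor_ASYNC callers split a flat loop
+   index ij into the (lb, j) block pair for each region, e.g.
+       j  = ij / HyP->lookAheadBlk;        lb = ij % HyP->lookAheadBlk;
+   for the top-left region, and
+       j  = ij / HyP->RemainBlk + jj_cpu;  lb = ij % HyP->RemainBlk;
+   for the bottom-right region (U blocks past the jj_cpu boundary). */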
+
+int_t zblock_gemm_scatterTopLeft( int_t lb, /* block number in L */
+				 int_t j,  /* block number in U */
+                                 doublecomplex* bigV, int_t knsupc,  int_t klst,
+				 int_t* lsub, int_t * usub, int_t ldt,
+				 int_t* indirect, int_t* indirect2, HyP_t* HyP,
+                                 LUstruct_t *LUstruct,
+                                 gridinfo_t* grid,
+                                 SCT_t*SCT, SuperLUStat_t *stat
+                               )
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    doublecomplex** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    volatile int_t thread_id = omp_get_thread_num();
+    
+//    printf("Thread's ID %lld \n", thread_id);
+    unsigned long long t1 = _rdtsc();
+    zblock_gemm_scatter( lb, j, HyP->Ublock_info, HyP->lookAhead_info,
+			HyP->lookAhead_L_buff, HyP->Lnbrow,
+                        HyP->bigU_host, HyP->ldu,
+                        bigV, knsupc,  klst, lsub,  usub, ldt, thread_id,
+			indirect, indirect2,
+                        Lrowind_bc_ptr, Lnzval_bc_ptr, Ufstnz_br_ptr, Unzval_br_ptr,
+			xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                        , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
+#endif
+                      );
+    unsigned long long t2 = _rdtsc();
+    SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1);
+    return 0;
+} /* zblock_gemm_scatterTopLeft */
+
+int_t zblock_gemm_scatterTopRight( int_t lb,  int_t j,
+                                  doublecomplex* bigV, int_t knsupc,  int_t klst, int_t* lsub,
+                                  int_t * usub, int_t ldt,  int_t* indirect, int_t* indirect2,
+                                  HyP_t* HyP,
+                                  LUstruct_t *LUstruct,
+                                  gridinfo_t* grid,
+                                  SCT_t*SCT, SuperLUStat_t *stat
+                                )
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    doublecomplex** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    volatile  int_t thread_id = omp_get_thread_num();
+    unsigned long long t1 = _rdtsc();
+    zblock_gemm_scatter( lb, j, HyP->Ublock_info_Phi, HyP->lookAhead_info, HyP->lookAhead_L_buff, HyP->Lnbrow,
+                        HyP->bigU_Phi, HyP->ldu_Phi,
+                        bigV, knsupc,  klst, lsub,  usub, ldt, thread_id, indirect, indirect2,
+                        Lrowind_bc_ptr, Lnzval_bc_ptr, Ufstnz_br_ptr, Unzval_br_ptr, xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                        , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
+#endif
+                      );
+    unsigned long long t2 = _rdtsc();
+    SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1);
+    return 0;
+} /* zblock_gemm_scatterTopRight */
+
+int_t zblock_gemm_scatterBottomLeft( int_t lb,  int_t j,
+                                    doublecomplex* bigV, int_t knsupc,  int_t klst, int_t* lsub,
+                                    int_t * usub, int_t ldt,  int_t* indirect, int_t* indirect2,
+                                    HyP_t* HyP,
+                                    LUstruct_t *LUstruct,
+                                    gridinfo_t* grid,
+                                    SCT_t*SCT, SuperLUStat_t *stat
+                                  )
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    doublecomplex** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    volatile int_t thread_id = omp_get_thread_num();
+    //printf("Thread's ID %lld \n", thread_id);
+    unsigned long long t1 = _rdtsc();
+    zblock_gemm_scatter( lb, j, HyP->Ublock_info, HyP->Remain_info, HyP->Remain_L_buff, HyP->Rnbrow,
+                        HyP->bigU_host, HyP->ldu,
+                        bigV, knsupc,  klst, lsub,  usub, ldt, thread_id, indirect, indirect2,
+                        Lrowind_bc_ptr, Lnzval_bc_ptr, Ufstnz_br_ptr, Unzval_br_ptr, xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                        , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
+#endif
+                      );
+    unsigned long long t2 = _rdtsc();
+    SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1);
+    return 0;
+
+} /* zblock_gemm_scatterBottomLeft */
+
+int_t zblock_gemm_scatterBottomRight( int_t lb,  int_t j,
+                                     doublecomplex* bigV, int_t knsupc,  int_t klst, int_t* lsub,
+                                     int_t * usub, int_t ldt,  int_t* indirect, int_t* indirect2,
+                                     HyP_t* HyP,
+                                     LUstruct_t *LUstruct,
+                                     gridinfo_t* grid,
+                                     SCT_t*SCT, SuperLUStat_t *stat
+                                   )
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    LocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    doublecomplex** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    volatile int_t thread_id = omp_get_thread_num();
+    // printf("Thread's ID %lld \n", thread_id);
+    unsigned long long t1 = _rdtsc();
+    zblock_gemm_scatter( lb, j, HyP->Ublock_info_Phi, HyP->Remain_info, HyP->Remain_L_buff, HyP->Rnbrow,
+                        HyP->bigU_Phi, HyP->ldu_Phi,
+                        bigV, knsupc,  klst, lsub,  usub, ldt, thread_id, indirect, indirect2,
+                        Lrowind_bc_ptr, Lnzval_bc_ptr, Ufstnz_br_ptr, Unzval_br_ptr, xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                        , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
+#endif
+                      );
+
+    unsigned long long t2 = _rdtsc();
+    SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1);
+    return 0;
+
+} /* zblock_gemm_scatterBottomRight */
+
+/******************************************************************
+ * SHERRY: scatter_l is the same as dscatter_l in dscatter.c
+ *         scatter_u is ALMOST the same as zscatter_u in zscatter.c
+ ******************************************************************/
+#if 0
+void
+scatter_l (int_t ib,
+           int_t ljb,
+           int_t nsupc,
+           int_t iukp,
+           int_t *xsup,
+           int_t klst,
+           int_t nbrow,
+           int_t lptr,
+           int_t temp_nbrow,
+           int_t *usub,
+           int_t *lsub,
+           double *tempv,
+           int_t *indirect_thread, int_t *indirect2,
+           int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr, gridinfo_t *grid)
+{
+    int_t rel, i, segsize, jj;
+    double *nzval;
+    int_t *index = Lrowind_bc_ptr[ljb];
+    int_t ldv = index[1];       /* LDA of the dest lusup. */
+    int_t lptrj = BC_HEADER;
+    int_t luptrj = 0;
+    int_t ijb = index[lptrj];
+
+    while (ijb != ib)
+    {
+        luptrj += index[lptrj + 1];
+        lptrj += LB_DESCRIPTOR + index[lptrj + 1];
+        ijb = index[lptrj];
+    }
+
+
+    /*
+     * Build indirect table. This is needed because the
+     * indices are not sorted for the L blocks.
+     */
+    int_t fnz = FstBlockC (ib);
+    int_t dest_nbrow;
+    lptrj += LB_DESCRIPTOR;
+    dest_nbrow = index[lptrj - 1];
+
+    for (i = 0; i < dest_nbrow; ++i)
+    {
+        rel = index[lptrj + i] - fnz;
+        indirect_thread[rel] = i;
+
+    }
+
+    /* can be precalculated */
+    for (i = 0; i < temp_nbrow; ++i)
+    {
+        rel = lsub[lptr + i] - fnz;
+        indirect2[i] = indirect_thread[rel];
+    }
+
+
+    nzval = Lnzval_bc_ptr[ljb] + luptrj;
+    for (jj = 0; jj < nsupc; ++jj)
+    {
+
+        segsize = klst - usub[iukp + jj];
+        if (segsize)
+        {
+            for (i = 0; i < temp_nbrow; ++i)
+            {
+                nzval[indirect2[i]] -= tempv[i];
+            }
+            tempv += nbrow;
+        }
+        nzval += ldv;
+    }
+
+} /* scatter_l */
+#endif // comment out
+
+static void   // SHERRY: ALMOST the same as dscatter_u in dscatter.c
+scatter_u (int_t ib,
+           int_t jb,
+           int_t nsupc,
+           int_t iukp,
+           int_t *xsup,
+           int_t klst,
+           int_t nbrow,
+           int_t lptr,
+           int_t temp_nbrow,
+           int_t *lsub,
+           int_t *usub,
+           doublecomplex *tempv,
+           int_t *indirect,
+           int_t **Ufstnz_br_ptr, doublecomplex **Unzval_br_ptr, gridinfo_t *grid)
+{
+#ifdef PI_DEBUG
+    printf ("A(%d,%d) goes to U block \n", ib, jb);
+#endif
+    int_t jj, i, fnz;
+    int_t segsize;
+    doublecomplex *ucol;
+    int_t ilst = FstBlockC (ib + 1);
+    int_t lib = LBi (ib, grid);
+    int_t *index = Ufstnz_br_ptr[lib];
+
+    /* reinitialize the pointer to each row of U */
+    int_t iuip_lib, ruip_lib;
+    iuip_lib = BR_HEADER;
+    ruip_lib = 0;
+
+    int_t ijb = index[iuip_lib];
+    while (ijb < jb)            /* Search for dest block. */
+    {
+        ruip_lib += index[iuip_lib + 1];
+
+        iuip_lib += UB_DESCRIPTOR + SuperSize (ijb);
+        ijb = index[iuip_lib];
+    }
+    /* Skip descriptor.  Now point to the fstnz index of
+       block U(i,j). */
+
+    for (i = 0; i < temp_nbrow; ++i)
+    {
+        indirect[i] = lsub[lptr + i] ;
+    }
+
+
+    iuip_lib += UB_DESCRIPTOR;
+
+    ucol = &Unzval_br_ptr[lib][ruip_lib];
+    for (jj = 0; jj < nsupc; ++jj)
+    {
+        segsize = klst - usub[iukp + jj];
+        fnz = index[iuip_lib++];
+        ucol -= fnz;
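+        /* Same pointer shift as in the real-valued scatter_u: ucol[r] now
+           addresses row r; the net per-column advance is ilst - fnz. */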
+        if (segsize)            /* Nonzero segment in U(k,j). */
+        {
+            for (i = 0; i < temp_nbrow; ++i)
+            {
+                z_sub(&ucol[indirect[i]], &ucol[indirect[i]], &tempv[i]);
+            }                   /* for i=0..temp_nbrow */
+            tempv += nbrow;
+
+        } /*if segsize */
+        ucol += ilst ;
+
+    } /*for jj=0:nsupc */
+
+} /* scatter_u */
+
+
diff --git a/SRC/ztreeFactorization.c b/SRC/ztreeFactorization.c
index bb6b16cc..bd03bb2e 100644
--- a/SRC/ztreeFactorization.c
+++ b/SRC/ztreeFactorization.c
@@ -204,7 +204,7 @@ int_t zdenseTreeFactor(
                 int_t *lsub = lPanelInfo->lsub;
                 int_t *usub = uPanelInfo->usub;
                 int_t thread_id = omp_get_thread_num();
-                block_gemm_scatter( lb, ub,
+                zblock_gemm_scatter( lb, ub,
                                     Ublock_info,
                                     Remain_info,
                                     &L_mat[luptr0], ldl,
@@ -461,7 +461,7 @@ int_t zsparseTreeFactor_ASYNC(
                     int_t j   = ij / HyP->lookAheadBlk; 
 							   
                     int_t lb  = ij % HyP->lookAheadBlk;
-                    block_gemm_scatterTopLeft( lb,  j, bigV, knsupc, klst, lsub,
+                    zblock_gemm_scatterTopLeft( lb,  j, bigV, knsupc, klst, lsub,
 					       usub, ldt,  indirect, indirect2, HyP,
 					       LUstruct, grid, SCT, stat );
                 }
@@ -471,7 +471,7 @@ int_t zsparseTreeFactor_ASYNC(
                 {
                     int_t j   = ij / HyP->lookAheadBlk ;
                     int_t lb  = ij % HyP->lookAheadBlk;
-                    block_gemm_scatterTopRight( lb,  j, bigV, knsupc, klst, lsub,
+                    zblock_gemm_scatterTopRight( lb,  j, bigV, knsupc, klst, lsub,
                                                 usub, ldt,  indirect, indirect2, HyP,
 						LUstruct, grid, SCT, stat);
                 }
@@ -481,7 +481,7 @@ int_t zsparseTreeFactor_ASYNC(
                 {
                     int_t j   = ij / HyP->RemainBlk;
                     int_t lb  = ij % HyP->RemainBlk;
-                    block_gemm_scatterBottomLeft( lb,  j, bigV, knsupc, klst, lsub,
+                    zblock_gemm_scatterBottomLeft( lb,  j, bigV, knsupc, klst, lsub,
                                                   usub, ldt,  indirect, indirect2,
 						  HyP, LUstruct, grid, SCT, stat);
                 } /*for (int_t ij =*/
@@ -525,7 +525,7 @@ int_t zsparseTreeFactor_ASYNC(
                 {
                     int_t j   = ij / HyP->RemainBlk + jj_cpu;
                     int_t lb  = ij % HyP->RemainBlk;
-                    block_gemm_scatterBottomRight( lb,  j, bigV, knsupc, klst, lsub,
+                    zblock_gemm_scatterBottomRight( lb,  j, bigV, knsupc, klst, lsub,
                                                    usub, ldt,  indirect, indirect2,
 						   HyP, LUstruct, grid, SCT, stat);
                 } /*for (int_t ij =*/

From ccc93a222b85145cc68cbe20eb9a2ec3854be0aa Mon Sep 17 00:00:00 2001
From: "X. Sherry Li" 
Date: Tue, 6 Aug 2019 07:22:11 -0700
Subject: [PATCH 008/147] Update superlu_dist.pc.in

---
 superlu_dist.pc.in | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/superlu_dist.pc.in b/superlu_dist.pc.in
index 0a98d316..b18bf19e 100644
--- a/superlu_dist.pc.in
+++ b/superlu_dist.pc.in
@@ -5,8 +5,8 @@ includedir=@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_INCLUDEDIR@
 Name: @CMAKE_PROJECT_NAME@
 Description: Distributed-memory direct solution of sparse systems of linear equations
 Version: @PROJECT_VERSION@
-URL: http://crd-legacy.lbl.gov/~xiaoye/SuperLU/
+URL: https://portal.nersc.gov/project/sparse/superlu/
 
-Libs: -L${libdir} -lsuperlu
+Libs: -L${libdir} -lsuperlu_dist
 Libs.private: @BLAS_LIB@ -lm
 Cflags: -I${includedir}
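
(After this change, "pkg-config --libs superlu_dist" emits -lsuperlu_dist
instead of -lsuperlu, matching the library name that SuperLU_DIST installs.)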

From 357d25b8a0dbf5f8f6dc60d5d57ed9e5ca85cee7 Mon Sep 17 00:00:00 2001
From: Sherry Li 
Date: Tue, 6 Aug 2019 14:00:04 -0700
Subject: [PATCH 009/147] Add LAPACK name {d,z}trtri in Cnames.h

---
 SRC/Cnames.h     | 15 +++++++++++++--
 SRC/dutil_dist.c |  2 +-
 SRC/zutil_dist.c |  2 +-
 3 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/SRC/Cnames.h b/SRC/Cnames.h
index 792f5145..a4d1672d 100644
--- a/SRC/Cnames.h
+++ b/SRC/Cnames.h
@@ -148,7 +148,6 @@ at the top-level directory.
 #define dtrsv_    DTRSV
 #define dgemm_    DGEMM
 #define dtrsm_    DTRSM
-#define dtrtri_   DTRTRI
 
 #define scasum_   SCASUM
 #define icamax_   ICAMAX
@@ -174,11 +173,18 @@ at the top-level directory.
 #define ztrsv_    ZTRSV
 #define zgemm_    ZGEMM
 #define ztrsm_    ZTRSM
+
 #define zgerc_    ZGERC
 #define zhemv_    ZHEMV
 #define zher2_    ZHER2
 #define zgeru_    ZGERU
 
+/* LAPACK */
+#define strtri_   STRTRI
+#define dtrtri_   DTRTRI
+#define ctrtri_   CTRTRI
+#define ztrtri_   ZTRTRI
+
 /*
 #define mc64id_dist     MC64ID_DIST
 #define mc64ad_dist     MC64AD_DIST
@@ -276,7 +282,6 @@ at the top-level directory.
 #define dtrsv_    dtrsv
 #define dgemm_    dgemm
 #define dtrsm_    dtrsm
-#define dtrtri_   dtrtri
 
 #define scasum_   scasum
 #define icamax_   icamax
@@ -307,6 +312,12 @@ at the top-level directory.
 #define zher2_    zher2
 #define zgeru_    zgeru
 
+/* LAPACK */
+#define strtri_   strtri
+#define dtrtri_   dtrtri
+#define ctrtri_   ctrtri
+#define ztrtri_   ztrtri
+
 /*
 #define mc64id_dist         mc64id_dist
 #define mc64ad_dist         mc64ad_dist
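
(Note: depending on the name-mangling convention selected at build time, a
call written as ztrtri_(...) in the C sources resolves either to the
upper-case Fortran symbol ZTRTRI or to the lower-case no-underscore symbol
ztrtri, per the two macro sections above.)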
diff --git a/SRC/dutil_dist.c b/SRC/dutil_dist.c
index d716dc58..411691d4 100644
--- a/SRC/dutil_dist.c
+++ b/SRC/dutil_dist.c
@@ -562,7 +562,7 @@ void dZeroLblocks(int iam, int_t n, gridinfo_t *grid, LUstruct_t *LUstruct)
 } /* dZeroLblocks */
 
 
-/*! \Dump the factored matrix L using matlab triple-let format
+/*! \brief Dump the factored matrix L using matlab triple-let format
  */
 void dDumpLblocks(int iam, int_t nsupers, gridinfo_t *grid,
 		  Glu_persist_t *Glu_persist, LocalLU_t *Llu)
diff --git a/SRC/zutil_dist.c b/SRC/zutil_dist.c
index 0c1693ee..80703f11 100644
--- a/SRC/zutil_dist.c
+++ b/SRC/zutil_dist.c
@@ -560,7 +560,7 @@ void zZeroLblocks(int iam, int_t n, gridinfo_t *grid, LUstruct_t *LUstruct)
 } /* zZeroLblocks */
 
 
-/*! \Dump the factored matrix L using matlab triple-let format
+/*! \brief Dump the factored matrix L using matlab triple-let format
  */
 void zDumpLblocks(int iam, int_t nsupers, gridinfo_t *grid,
 		  Glu_persist_t *Glu_persist, LocalLU_t *Llu)

From fb29451850c1dffc7e35e5071e65301fe4769057 Mon Sep 17 00:00:00 2001
From: Sherry Li 
Date: Sat, 31 Aug 2019 13:56:32 -0700
Subject: [PATCH 010/147] Debug version of SUPERLU_MALLOC: allows malloc size
 of 0.

supernodal_etree.c::
    setree2list(): replace malloc by SUPERLU_MALLOC
    supernodal_etree(): replace malloc by SUPERLU_MALLOC
    ( free treeList in xinitTrf3Dpartition() )
supernodalForest.c::
    getNestDissForests(): add SUPERLU_FREE(gTreeHeads);

Add new function in xtrfAux.c:: Destroy_trf3Dpartition() to free memory after 3D factorization.
---
 CMakeLists.txt           |   6 +-
 EXAMPLE/CMakeLists.txt   |   8 ++
 EXAMPLE/pddrive3d.c      |  35 ++++----
 SRC/CMakeLists.txt       |  26 +++++-
 SRC/Makefile             |  10 +--
 SRC/dmemory_dist.c       |   5 --
 SRC/dtreeFactorization.c |  12 +--
 SRC/dtrfAux.c            |  48 ++++++++++-
 SRC/dtrfCommWrapper.c    |  23 +++++
 SRC/memory.c             |   2 +-
 SRC/pd3dcomm.c           |  21 +++--
 SRC/pdgssvx3d.c          | 173 ++++++++++++++++++++------------------
 SRC/pdgstrf2.c           |  32 ++++++-
 SRC/pdgstrf3d.c          |  28 +++++--
 SRC/pz3dcomm.c           |  21 +++--
 SRC/pzgssvx3d.c          | 175 +++++++++++++++++++++------------------
 SRC/pzgstrf.c            |   2 +-
 SRC/pzgstrf2.c           |  33 +++++++-
 SRC/pzgstrf3d.c          |  28 +++++--
 SRC/pzgstrs.c            |   4 +-
 SRC/superlu_ddefs.h      |  37 +++++----
 SRC/superlu_defs.h       |   4 +-
 SRC/superlu_zdefs.h      |  37 +++++----
 SRC/supernodalForest.c   |   7 +-
 SRC/supernodal_etree.c   |   9 +-
 SRC/trfAux.c             |   1 +
 SRC/zmemory_dist.c       | 114 +++++++++++++++++++++++++
 SRC/zscatter3d.c         |   6 +-
 SRC/ztreeFactorization.c |  12 +--
 SRC/ztrfAux.c            |  48 ++++++++++-
 SRC/ztrfCommWrapper.c    |  23 +++++
 31 files changed, 709 insertions(+), 281 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d02d457a..5f4786c1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -9,9 +9,9 @@ cmake_minimum_required(VERSION 2.8.12 FATAL_ERROR)
 
 # Project version numbers
 project(SuperLU_DIST C CXX)
-set(VERSION_MAJOR "6")
-set(VERSION_MINOR "1")
-set(VERSION_BugFix "1")
+set(VERSION_MAJOR "7")
+set(VERSION_MINOR "0")
+set(VERSION_BugFix "0")
 set(PROJECT_VERSION ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_BugFix})
 
 list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
diff --git a/EXAMPLE/CMakeLists.txt b/EXAMPLE/CMakeLists.txt
index ac6df12f..785ebdcb 100644
--- a/EXAMPLE/CMakeLists.txt
+++ b/EXAMPLE/CMakeLists.txt
@@ -57,6 +57,10 @@ if(enable_double)
   add_executable(pddrive4 ${DEXM4})
   target_link_libraries(pddrive4 ${all_link_libs})
 
+  set(DEXM3D pddrive3d.c dcreate_matrix.c)
+  add_executable(pddrive3d ${DEXM3D})
+  target_link_libraries(pddrive3d ${all_link_libs})
+
   set(DEXMG pddrive_ABglobal.c)
   add_executable(pddrive_ABglobal ${DEXMG})
   target_link_libraries(pddrive_ABglobal ${all_link_libs})
@@ -104,6 +108,10 @@ if(enable_complex16)
   add_executable(pzdrive4 ${ZEXM4})
   target_link_libraries(pzdrive4 ${all_link_libs})
 
+  set(ZEXM3D pzdrive3d.c zcreate_matrix.c)
+  add_executable(pzdrive3d ${ZEXM3D})
+  target_link_libraries(pzdrive3d ${all_link_libs})
+
   set(ZEXMG pzdrive_ABglobal.c)
   add_executable(pzdrive_ABglobal ${ZEXMG})
   target_link_libraries(pzdrive_ABglobal ${all_link_libs})
diff --git a/EXAMPLE/pddrive3d.c b/EXAMPLE/pddrive3d.c
index 7c51aee2..619fe98b 100644
--- a/EXAMPLE/pddrive3d.c
+++ b/EXAMPLE/pddrive3d.c
@@ -27,7 +27,7 @@ at the top-level directory.
  * Purpose
  * =======
  *
- * The driver program PDDRIVEx3D.
+ * The driver program PDDRIVE3D.
  *
  * This example illustrates how to use PDGSSVX3D with the full
  * (default) options to solve a linear system.
@@ -50,7 +50,7 @@ main (int argc, char *argv[])
 {
     superlu_dist_options_t options;
     SuperLUStat_t stat;
-    SuperMatrix A;
+    SuperMatrix A;  // only on process layer 0
     ScalePermstruct_t ScalePermstruct;
     LUstruct_t LUstruct;
     SOLVEstruct_t SOLVEstruct;
@@ -181,9 +181,8 @@ main (int argc, char *argv[])
 	    // printf("%s\n", suffix);
 	}
     }
-    if (!grid.zscp.Iam)  // only in process layer 0
+    if ( grid.zscp.Iam == 0 )  // only in process layer 0
 	dcreate_matrix_postfix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, suffix, &(grid.grid2d));
-	//dcreate_matrix (&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, &(grid.grid2d));
 
     if (!(berr = doubleMalloc_dist (nrhs)))
         ABORT ("Malloc fails for berr[].");
@@ -197,13 +196,16 @@ main (int argc, char *argv[])
        options.Equil             = YES;
        options.ParSymbFact       = NO;
        options.ColPerm           = METIS_AT_PLUS_A;
-       options.RowPerm           = LargeDiag;
+       options.RowPerm           = LargeDiag_MC64;
        options.ReplaceTinyPivot  = YES;
        options.IterRefine        = DOUBLE;
        options.Trans             = NOTRANS;
        options.SolveInitialized  = NO;
        options.RefineInitialized = NO;
        options.PrintStat         = YES;
+       options.num_lookaheads    = 10;
+       options.lookahead_etree   = NO;
+       options.SymPattern        = NO;
        options.DiagInv           = NO;
      */
     set_default_options_dist (&options);
@@ -221,7 +223,7 @@ main (int argc, char *argv[])
 	fflush(stdout);
     }
 
-    if (!grid.zscp.Iam)
+    if ( grid.zscp.Iam == 0 )  // Process layer 0
     {
 	m = A.nrow;
         n = A.ncol;
@@ -242,31 +244,34 @@ main (int argc, char *argv[])
                &LUstruct, &SOLVEstruct, berr, &stat, &info);
 
     /* Check the accuracy of the solution. */
-    if (!grid.zscp.Iam)
+    if ( grid.zscp.Iam == 0 )  // Process layer 0
         pdinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc,
                           nrhs, b, ldb, xtrue, ldx, &(grid.grid2d));
     fflush(stdout);
-    if (!grid.zscp.Iam)
-	PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/
 
     /* ------------------------------------------------------------
        DEALLOCATE STORAGE.
        ------------------------------------------------------------ */
 
-    PStatFree (&stat);
-    if (grid.zscp.Iam == 0) {
+    if ( grid.zscp.Iam == 0 ) { // process layer 0
+
+	PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/
+
         Destroy_CompRowLoc_Matrix_dist (&A);
         Destroy_LU (n, &(grid.grid2d), &LUstruct);
         SUPERLU_FREE (b);
         SUPERLU_FREE (xtrue);
         SUPERLU_FREE (berr);
+        if (options.SolveInitialized) {
+            dSolveFinalize (&options, &SOLVEstruct);
+        }
     }
+
     ScalePermstructFree (&ScalePermstruct);
     LUstructFree (&LUstruct);
-    if (options.SolveInitialized)
-    {
-        dSolveFinalize (&options, &SOLVEstruct);
-    }
+
+    PStatFree (&stat);
+    printf("(%d) after StatFree\n", iam);
 
     /* ------------------------------------------------------------
        RELEASE THE SUPERLU PROCESS GRID.
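
The deallocation in pddrive3d.c now follows ownership: objects that live
only on process layer 0 (A, b, xtrue, berr, the solve structure) are freed
inside the zscp.Iam == 0 guard, while structures replicated on every layer
are freed unconditionally. Condensed from the driver above:

    if ( grid.zscp.Iam == 0 ) {   /* process layer 0 owns A and the RHS */
        Destroy_CompRowLoc_Matrix_dist (&A);
        Destroy_LU (n, &(grid.grid2d), &LUstruct);
        SUPERLU_FREE (b); SUPERLU_FREE (xtrue); SUPERLU_FREE (berr);
        if (options.SolveInitialized)
            dSolveFinalize (&options, &SOLVEstruct);
    }
    ScalePermstructFree (&ScalePermstruct); /* replicated: free on all layers */
    LUstructFree (&LUstruct);
    PStatFree (&stat);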
diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt
index 021f0c98..4b2c3f1f 100644
--- a/SRC/CMakeLists.txt
+++ b/SRC/CMakeLists.txt
@@ -22,7 +22,6 @@ endif ()
 # first: precision-independent files
 #  global.cpp
 set(sources
-  TreeInterface.cpp
   sp_ienv.c
   etree.c 
   sp_colorder.c
@@ -45,6 +44,14 @@ set(sources
   dmach_dist.c
   colamd.c
   superlu_dist_version.c
+  TreeInterface.cpp
+  superlu_grid3d.c    ## 3D code
+  supernodal_etree.c
+  supernodalForest.c
+  trfAux.c 
+  communication_aux.c
+  treeFactorization.c 
+  sec_structs.c
 )
 if (MSVC)
   list(APPEND sources wingetopt.c)
@@ -95,6 +102,15 @@ if(enable_double)
     pdgsrfs_ABXglobal.c
     pdgsmv_AXglobal.c
     pdGetDiagU.c
+    pdgssvx3d.c     ## 3D code
+    pdgstrf3d.c 
+    dtreeFactorization.c
+    dgather.c
+    dscatter3d.c
+    pd3dcomm.c
+    dtrfAux.c	
+    dcommunication_aux.c 
+    dtrfCommWrapper.c
   )
 endif()
 
@@ -138,6 +154,14 @@ if(enable_complex16)
     pzgsrfs_ABXglobal.c
     pzgsmv_AXglobal.c
     pzGetDiagU.c
+    pzgssvx3d.c   ## 3D code
+    pzgstrf3d.c
+    ztreeFactorization.c 
+    zscatter3d.c
+    zgather.c 
+    pz3dcomm.c ztrfAux.c
+    zcommunication_aux.c
+    ztrfCommWrapper.c
   )
 endif()
 
diff --git a/SRC/Makefile b/SRC/Makefile
index a0c7e98a..9470d48a 100644
--- a/SRC/Makefile
+++ b/SRC/Makefile
@@ -27,7 +27,7 @@
 #######################################################################
 include ../make.inc
 
-FACT3D = 	scatter.o
+# FACT3D = 	scatter.o
 #	pdgstrs_vecpar.o ancFactorization.o
 
 #  pddrive_params.o
@@ -71,9 +71,9 @@ DPLUSRC = pdgssvx.o pdgssvx_ABglobal.o \
 	  pdgstrf.o pdgstrf2.o pdGetDiagU.o \
 	  pdgstrs.o pdgstrs1.o pdgstrs_lsum.o pdgstrs_Bglobal.o \
 	  pdgsrfs.o pdgsmv.o pdgsrfs_ABXglobal.o pdgsmv_AXglobal.o \
-	  dreadtriple_noheader.o dscatter3d.o  #$(FACT3D)
+	  dreadtriple_noheader.o  ##$(FACT3D)
 # from 3D code
-DPLUSRC += pdgssvx3d.o pdgstrf3d.o dtreeFactorization.o \
+DPLUSRC += pdgssvx3d.o pdgstrf3d.o dtreeFactorization.o dscatter3d.o \
 	dgather.o pd3dcomm.o dtrfAux.o dcommunication_aux.o dtrfCommWrapper.o
 
 #
@@ -85,9 +85,9 @@ ZPLUSRC = pzgssvx.o pzgssvx_ABglobal.o \
 	  pzgstrf.o pzgstrf2.o pzGetDiagU.o \
 	  pzgstrs.o pzgstrs1.o pzgstrs_lsum.o pzgstrs_Bglobal.o \
 	  pzgsrfs.o pzgsmv.o pzgsrfs_ABXglobal.o pzgsmv_AXglobal.o \
-	  zreadtriple_noheader.o zscatter3d.o
+	  zreadtriple_noheader.o
 # from 3D code
-ZPLUSRC += pzgssvx3d.o pzgstrf3d.o ztreeFactorization.o \
+ZPLUSRC += pzgssvx3d.o pzgstrf3d.o ztreeFactorization.o zscatter3d.o \
 	zgather.o pz3dcomm.o ztrfAux.o zcommunication_aux.o ztrfCommWrapper.o
 
 all:  double complex16
diff --git a/SRC/dmemory_dist.c b/SRC/dmemory_dist.c
index e9319bdb..449ccd65 100644
--- a/SRC/dmemory_dist.c
+++ b/SRC/dmemory_dist.c
@@ -170,14 +170,12 @@ double *doubleCalloc_dist(int_t n)
     return (buf);
 }
 
-
 /***************************************
  * The following are from 3D code.
  ***************************************/
 
 double dgetLUMem(int_t nodeId, LUstruct_t *LUstruct, gridinfo3d_t *grid3d)
 {
-
     double memlu = 0.0;
     gridinfo_t* grid = &(grid3d->grid2d);
     LocalLU_t *Llu = LUstruct->Llu;
@@ -191,7 +189,6 @@ double dgetLUMem(int_t nodeId, LUstruct_t *LUstruct, gridinfo3d_t *grid3d)
     int_t myrow = MYROW (iam, grid);
     int_t mycol = MYCOL (iam, grid);
 
-
     int_t pc = PCOL( nodeId, grid );
     if (mycol == pc)
     {
@@ -219,7 +216,6 @@ double dgetLUMem(int_t nodeId, LUstruct_t *LUstruct, gridinfo3d_t *grid3d)
         // double* unzval;
         usub = Ufstnz_br_ptr[lib];
 
-
         if (usub != NULL)
         {
             int_t lenv = usub[1];
@@ -230,7 +226,6 @@ double dgetLUMem(int_t nodeId, LUstruct_t *LUstruct, gridinfo3d_t *grid3d)
     return memlu;
 }
 
-
 double  dmemForest(sForest_t*sforest, LUstruct_t *LUstruct, gridinfo3d_t *grid3d)
 {
     double memlu = 0;
diff --git a/SRC/dtreeFactorization.c b/SRC/dtreeFactorization.c
index d1c9aed4..4e4d11a5 100644
--- a/SRC/dtreeFactorization.c
+++ b/SRC/dtreeFactorization.c
@@ -24,7 +24,7 @@ at the top-level directory.
 #include "trfCommWrapper.h"
 #endif
 
-int_t dLluBufInit(LUValSubBuf_t* LUvsb, LUstruct_t *LUstruct)
+int_t dLluBufInit(dLUValSubBuf_t* LUvsb, LUstruct_t *LUstruct)
 {
     LocalLU_t *Llu = LUstruct->Llu;
     LUvsb->Lsub_buf = intMalloc_dist(Llu->bufmax[0]); //INT_T_ALLOC(Llu->bufmax[0]);
@@ -50,13 +50,13 @@ diagFactBufs_t** dinitDiagFactBufsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* g
     return dFBufs;
 }
 
-LUValSubBuf_t** dLluBufInitArr(int_t numLA, LUstruct_t *LUstruct)
+dLUValSubBuf_t** dLluBufInitArr(int_t numLA, LUstruct_t *LUstruct)
 {
-    LUValSubBuf_t** LUvsbs = (LUValSubBuf_t**) SUPERLU_MALLOC(numLA * sizeof(LUValSubBuf_t*));
+    dLUValSubBuf_t** LUvsbs = (dLUValSubBuf_t**) SUPERLU_MALLOC(numLA * sizeof(dLUValSubBuf_t*));
     for (int_t i = 0; i < numLA; ++i)
     {
         /* code */
-        LUvsbs[i] = (LUValSubBuf_t*) SUPERLU_MALLOC(sizeof(LUValSubBuf_t));
+        LUvsbs[i] = (dLUValSubBuf_t*) SUPERLU_MALLOC(sizeof(dLUValSubBuf_t));
         dLluBufInit(LUvsbs[i], LUstruct);
     } /*minor for loop-3 for (int_t i = 0; i < numLA; ++i)*/
 
@@ -87,7 +87,7 @@ int_t ddenseTreeFactor(
     scuBufs_t *scuBufs,          // contains buffers for schur complement update
     packLUInfo_t*packLUInfo,
     msgs_t*msgs,
-    LUValSubBuf_t*LUvsb,
+    dLUValSubBuf_t* LUvsb,
     diagFactBufs_t *dFBuf,
     factStat_t *factStat,
     factNodelists_t  *fNlists,
@@ -244,7 +244,7 @@ int_t dsparseTreeFactor_ASYNC(
     scuBufs_t *scuBufs,          // contains buffers for schur complement update
     packLUInfo_t*packLUInfo,
     msgs_t**msgss,                  // size=num Look ahead
-    LUValSubBuf_t**LUvsbs,          // size=num Look ahead
+    dLUValSubBuf_t** LUvsbs,          // size=num Look ahead
     diagFactBufs_t **dFBufs,         // size maxEtree level
     factStat_t *factStat,
     factNodelists_t  *fNlists,
diff --git a/SRC/dtrfAux.c b/SRC/dtrfAux.c
index a89898ef..2a94955e 100644
--- a/SRC/dtrfAux.c
+++ b/SRC/dtrfAux.c
@@ -295,7 +295,7 @@ int_t dSchurComplementSetupGPU(
     int_t* myIperm, 
     int_t* iperm_c_supno, int_t*perm_c_supno,
     gEtreeInfo_t*   gEtreeInfo, factNodelists_t* fNlists,
-    scuBufs_t* scuBufs, LUValSubBuf_t* LUvsb,
+    scuBufs_t* scuBufs, dLUValSubBuf_t* LUvsb,
     gridinfo_t *grid, LUstruct_t *LUstruct,
     HyP_t* HyP)
 {
@@ -509,7 +509,7 @@ trf3Dpartition_t* dinitTrf3Dpartition(int_t nsupers,
 
 #if ( DEBUGlevel>=1 )
     int iam = grid3d->iam;
-    CHECK_MALLOC (iam, "Enter initTrf3Dpartition()");
+    CHECK_MALLOC (iam, "Enter dinitTrf3Dpartition()");
 #endif
     int_t* perm_c_supno = getPerm_c_supno(nsupers, options, LUstruct, grid);
     int_t* iperm_c_supno = getFactIperm(perm_c_supno, nsupers);
@@ -545,7 +545,7 @@ trf3Dpartition_t* dinitTrf3Dpartition(int_t nsupers,
     int_t* myNodeCount = getMyNodeCountsFr(maxLvl, myTreeIdxs, sForests);
     int_t** treePerm = getTreePermFr( myTreeIdxs, sForests, grid3d);
 
-    LUValSubBuf_t *LUvsb = SUPERLU_MALLOC(sizeof(LUValSubBuf_t));
+    dLUValSubBuf_t *LUvsb = SUPERLU_MALLOC(sizeof(dLUValSubBuf_t));
     dLluBufInit(LUvsb, LUstruct);
 
     int_t* supernode2treeMap = SUPERLU_MALLOC(nsupers*sizeof(int_t));
@@ -572,12 +572,52 @@ trf3Dpartition_t* dinitTrf3Dpartition(int_t nsupers,
     trf3Dpartition->LUvsb = LUvsb;
     trf3Dpartition->supernode2treeMap = supernode2treeMap;
 
+    SUPERLU_FREE(treeList);  // Sherry added
+
 #if ( DEBUGlevel>=1 )
-    CHECK_MALLOC (iam, "Exit initTrf3Dpartition()");
+    CHECK_MALLOC (iam, "Exit dinitTrf3Dpartition()");
 #endif
     return trf3Dpartition;
 } /* dinitTrf3Dpartition */
 
+/* Free memory allocated for trf3Dpartition structure. Sherry added this routine */
+void dDestroy_trf3Dpartition(trf3Dpartition_t *trf3Dpartition, gridinfo3d_t *grid3d)
+{
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (grid3d->iam, "Enter dDestroy_trf3Dpartition()");
+#endif
+    SUPERLU_FREE(trf3Dpartition->gEtreeInfo.setree);
+    SUPERLU_FREE(trf3Dpartition->gEtreeInfo.numChildLeft);
+    SUPERLU_FREE(trf3Dpartition->iperm_c_supno);
+    SUPERLU_FREE(trf3Dpartition->myNodeCount);
+    SUPERLU_FREE(trf3Dpartition->myTreeIdxs);
+    SUPERLU_FREE(trf3Dpartition->myZeroTrIdxs);
+
+    int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
+    int_t numForests = (1 << maxLvl) - 1;
+    sForest_t** sForests = trf3Dpartition->sForests;
+    for (int i = 0; i < numForests; ++i) {
+	if ( sForests[i] ) {
+	    SUPERLU_FREE(sForests[i]->nodeList);
+	    //SUPERLU_FREE(sForests[i]->treeHeads); // already freed
+	    SUPERLU_FREE((sForests[i]->topoInfo).eTreeTopLims);
+	    SUPERLU_FREE((sForests[i]->topoInfo).myIperm);
+	}
+    }
+    SUPERLU_FREE(trf3Dpartition->sForests); // double pointer 
+    SUPERLU_FREE(trf3Dpartition->treePerm); // double pointer pointing to sForests->nodeList
+    SUPERLU_FREE(trf3Dpartition->supernode2treeMap);
+
+    SUPERLU_FREE((trf3Dpartition->LUvsb)->Lsub_buf);
+    SUPERLU_FREE((trf3Dpartition->LUvsb)->Lval_buf);
+    SUPERLU_FREE((trf3Dpartition->LUvsb)->Usub_buf);
+    SUPERLU_FREE((trf3Dpartition->LUvsb)->Uval_buf);
+    SUPERLU_FREE(trf3Dpartition->LUvsb); // Sherry: check this ...
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (grid3d->iam, "Exit dDestroy_trf3Dpartition()");
+#endif
+}
+
 
 #if 0  //**** Sherry: following two routines are old, the new ones are in util.c
 int_t num_full_cols_U(int_t kk,  int_t **Ufstnz_br_ptr, int_t *xsup,
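
A small worked example for the forest bookkeeping freed above: sForests is
a complete binary tree of forests over the z-dimension, so with
grid3d->zscp.Np == 4 process layers,

    int_t maxLvl     = log2i(4) + 1;        /* = 3 levels            */
    int_t numForests = (1 << maxLvl) - 1;   /* = 7: 4 leaves + 2 + 1 */

and the cleanup loop frees nodeList and the topoInfo arrays of each
non-NULL entry (a slot is NULL for trees this process never touches,
hence the guard).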
diff --git a/SRC/dtrfCommWrapper.c b/SRC/dtrfCommWrapper.c
index f5c6ce42..b1c73469 100644
--- a/SRC/dtrfCommWrapper.c
+++ b/SRC/dtrfCommWrapper.c
@@ -172,8 +172,20 @@ int_t dLPanelTrSolve( int_t k,   int_t* factored_L,
                 int_t off = i * BL;
                 // Sherry: int_t len = MY_MIN(BL, l - i * BL);
                 int_t len = SUPERLU_MIN(BL, l - i * BL);
+
+#if 1
+  #if defined (USE_VENDOR_BLAS)
+		dtrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha,
+			ublk_ptr, &ld_ujrow, &lusup[off], &nsupr,
+			1, 1, 1, 1);
+  #else
+		dtrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha,
+			ublk_ptr, &ld_ujrow, &lusup[off], &nsupr);
+  #endif
+#else
                 cblas_dtrsm (CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit,
                 len, nsupc, alpha, ublk_ptr, ld_ujrow, &lusup[off], nsupr);
+#endif
             }
         }
     }
@@ -209,8 +221,19 @@ int_t dLPanelTrSolve( int_t k,   int_t* factored_L,
             int_t len = SUPERLU_MIN(BL, (l - i * BL));
             #pragma omp task
             {
+#if 1
+  #if defined (USE_VENDOR_BLAS)
+		dtrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha,
+			ublk_ptr, &ld_ujrow, &lusup[nsupc + off], &nsupr,
+			1, 1, 1, 1);
+  #else
+		dtrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha,
+			ublk_ptr, &ld_ujrow, &lusup[nsupc + off], &nsupr);
+  #endif
+#else
                 cblas_dtrsm (CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit,
                              len, nsupc, alpha, ublk_ptr, ld_ujrow, &lusup[nsupc + off], nsupr);
+#endif
 
             }
         }
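
These wrappers switch from CBLAS to the Fortran-interface BLAS calls used
elsewhere in SuperLU_DIST (the CBLAS path is kept under #else). The
mapping, as a sketch with hypothetical array arguments: the enums
CblasRight/CblasUpper/CblasNoTrans/CblasNonUnit become the strings
"R", "U", "N", "N", every scalar is passed by address, and under
USE_VENDOR_BLAS four trailing 1s supply the hidden Fortran
character-length arguments:

    /* cblas_dtrsm(CblasColMajor, CblasRight, CblasUpper, CblasNoTrans,
       CblasNonUnit, m, n, alpha, a, lda, b, ldb)   becomes:            */
    #if defined (USE_VENDOR_BLAS)
    dtrsm_("R", "U", "N", "N", &m, &n, &alpha, a, &lda, b, &ldb,
           1, 1, 1, 1);   /* lengths of the four character arguments */
    #else
    dtrsm_("R", "U", "N", "N", &m, &n, &alpha, a, &lda, b, &ldb);
    #endif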
diff --git a/SRC/memory.c b/SRC/memory.c
index 6fdaae66..4695ccb6 100644
--- a/SRC/memory.c
+++ b/SRC/memory.c
@@ -66,7 +66,7 @@ void *superlu_malloc_dist(size_t size)
     int iam;
 
     MPI_Comm_rank(MPI_COMM_WORLD, &iam);
-    if ( size <= 0 ) {
+    if ( size < 0 ) {
 	printf("(%d) superlu_malloc size %lld\n", iam, size);
 	ABORT("superlu_malloc: nonpositive size");
     }
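
One caveat with the relaxed check above: size is a size_t, which is
unsigned, so (size < 0) can never be true; the effect is that zero-byte
requests are now allowed and the guard becomes a no-op. If catching a
wrapped-around negative request is still desired, a signed cast would be
needed -- a sketch of one option, not what this patch does:

    if ( (long long) size < 0 )   /* hypothetical: top bit set => wrap-around */
        ABORT("superlu_malloc: negative size");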
diff --git a/SRC/pd3dcomm.c b/SRC/pd3dcomm.c
index 4377642c..ca54fb50 100644
--- a/SRC/pd3dcomm.c
+++ b/SRC/pd3dcomm.c
@@ -215,7 +215,7 @@ int_t dzRecvLPanel(int_t k, int_t sender, double alpha, double beta,
     int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
     double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
     gridinfo_t* grid = &(grid3d->grid2d);
-    
+    int inc = 1;    
     int_t iam = grid->iam;
     int_t mycol = MYCOL (iam, grid);
     
@@ -239,8 +239,13 @@ int_t dzRecvLPanel(int_t k, int_t sender, double alpha, double beta,
 			     grid3d->zscp.comm, &status);
 		    
 		    /*reduce the updates*/
+#if 1
+		    dscal_(&len2, &alpha, lnzval, &inc);
+		    daxpy_(&len2, &beta, Lval_buf, &inc, lnzval, &inc);
+#else
 		    cblas_dscal (len2, alpha, lnzval, 1);
 		    cblas_daxpy (len2, beta, Lval_buf, 1, lnzval, 1);
+#endif
 		}
 	}
 
@@ -288,10 +293,11 @@ int_t dzRecvUPanel(int_t k, int_t sender, double alpha, double beta,
     int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
     double** Unzval_br_ptr = Llu->Unzval_br_ptr;
     gridinfo_t* grid = &(grid3d->grid2d);
+    int inc = 1;
     int_t iam = grid->iam;
-
     int_t myrow = MYROW (iam, grid);
     int_t pr = PROW( k, grid );
+
     if (myrow == pr)
 	{
 	    int_t lk = LBi( k, grid ); /* Local block number */
@@ -308,8 +314,13 @@ int_t dzRecvUPanel(int_t k, int_t sender, double alpha, double beta,
 			     grid3d->zscp.comm, &status);
 		    
 		    /*reduce the updates*/
+#if 1
+		    dscal_(&lenv, &alpha, unzval, &inc);
+		    daxpy_(&lenv, &beta, Uval_buf, &inc, unzval, &inc);
+#else
 		    cblas_dscal (lenv, alpha, unzval, 1);
 		    cblas_daxpy (lenv, beta, Uval_buf, 1, unzval, 1);
+#endif
 		}
 	}
     return 0;
@@ -728,7 +739,7 @@ int_t dreduceAncestors3d(int_t sender, int_t receiver,
 
 int_t dgatherFactoredLU(int_t sender, int_t receiver,
                         int_t nnodes, int_t *nodeList,
-                        LUValSubBuf_t*LUvsb,
+                        dLUValSubBuf_t* LUvsb,
                         LUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT)
 {
     double alpha = 0.0, beta = 1.0;	
@@ -778,7 +789,7 @@ int_t dinit3DLUstruct( int_t* myTreeIdxs, int_t* myZeroTrIdxs,
 
 
 int_t dreduceAllAncestors3d(int_t ilvl, int_t* myNodeCount, int_t** treePerm,
-                             LUValSubBuf_t* LUvsb, LUstruct_t* LUstruct,
+                             dLUValSubBuf_t* LUvsb, LUstruct_t* LUstruct,
                              gridinfo3d_t* grid3d, SCT_t* SCT )
 {
     double * Lval_buf  = LUvsb->Lval_buf;
@@ -821,7 +832,7 @@ int_t dgatherAllFactoredLU( trf3Dpartition_t*  trf3Dpartition,
     int_t myGrid = grid3d->zscp.Iam;
     int_t* myZeroTrIdxs = trf3Dpartition->myZeroTrIdxs;
     sForest_t** sForests = trf3Dpartition->sForests;
-    LUValSubBuf_t* LUvsb =  trf3Dpartition->LUvsb;
+    dLUValSubBuf_t*  LUvsb =  trf3Dpartition->LUvsb;
     int_t*  gNodeCount = getNodeCountsFr(maxLvl, sForests);
     int_t** gNodeLists = getNodeListFr(maxLvl, sForests);
     
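
The reductions above compute lnzval := alpha*lnzval + beta*Lval_buf (and
the analogous update for U) with two BLAS-1 calls. A self-contained sketch
using the Fortran-interface declarations this commit adds to
superlu_ddefs.h:

    extern int dscal_(int *n, double *da, double *dx, int *incx);
    extern int daxpy_(int *n, double *da, double *dx,
                      int *incx, double *dy, int *incy);

    /* dst := alpha*dst + beta*src over len entries */
    static void reduce_panel(int len, double alpha, double beta,
                             double *dst, double *src)
    {
        int inc = 1;
        dscal_(&len, &alpha, dst, &inc);            /* dst *= alpha     */
        daxpy_(&len, &beta, src, &inc, dst, &inc);  /* dst += beta*src  */
    }

For instance, dgatherFactoredLU passes alpha = 0.0, beta = 1.0, which turns
the "reduction" into a plain overwrite of the local panel by the received one.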
diff --git a/SRC/pdgssvx3d.c b/SRC/pdgssvx3d.c
index 45015803..91b05016 100644
--- a/SRC/pdgssvx3d.c
+++ b/SRC/pdgssvx3d.c
@@ -317,7 +317,7 @@ at the top-level directory.
  *         o RowPerm (rowperm_t)
  *           Specifies how to permute rows of the matrix A.
  *           = NATURAL:   use the natural ordering.
- *           = LargeDiag: use the Duff/Koster algorithm to permute rows of
+ *           = LargeDiag_MC64: use the Duff/Koster algorithm to permute rows of
  *                        the original matrix to make the diagonal large
  *                        relative to the off-diagonal.
  *           = MY_PERMR:  use the ordering given in ScalePermstruct->perm_r
@@ -344,7 +344,7 @@ at the top-level directory.
 *         NOTE: all options must be identical on all processes when
  *               calling this routine.
  *
- * A (input/output) SuperMatrix* (local)
+ * A (input/output) SuperMatrix* (local); A resides only on process layer 0.
  *         On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol).
  *           The number of linear equations is A->nrow. The type of A must be:
  *           Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE.
@@ -497,8 +497,8 @@ void
 pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
            ScalePermstruct_t * ScalePermstruct,
            double B[], int ldb, int nrhs, gridinfo3d_t * grid3d,
-           LUstruct_t * LUstruct, SOLVEstruct_t * SOLVEstruct, double *berr,
-           SuperLUStat_t * stat, int *info)
+           LUstruct_t * LUstruct, SOLVEstruct_t * SOLVEstruct,
+           double *berr, SuperLUStat_t * stat, int *info)
 {
     NRformat_loc *Astore;
     SuperMatrix GA;        /* Global A in NC format */
@@ -519,6 +519,7 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
        routine. They will be freed after PDDISTRIBUTE routine.
        If options->Fact == SamePattern_SameRowPerm, these
        structures are not used.                                  */
+    yes_no_t parSymbFact = options->ParSymbFact;
     fact_t Fact;
     double *a;
     int_t *colptr, *rowind;
@@ -541,75 +542,85 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 #if ( PRNTlevel>= 2 )
     double dmin, dsum, dprod;
 #endif
-    
-    /* Structures needed for parallel symbolic factorization */
-    int_t *sizes, *fstVtxSep, parSymbFact;
-    int noDomains, nprocs_num;
-    MPI_Comm symb_comm;  /* communicator for symbolic factorization */
-    int col, key;        /* parameters for creating a new communicator */
-    Pslu_freeable_t Pslu_freeable;
-    float flinfo;
-    
-    /* Initialization. */
 
-    /* definifition of factored seen by each process layer */
-    Fact = options->Fact;
-    factored = (Fact == FACTORED);
-    
     // get the 2d grid
     gridinfo_t *grid  = &(grid3d->grid2d);
     iam = grid->iam;
     
+    /* Initialization. */
+
+    /* definition of factored seen by each process layer */
+    Fact = options->Fact;
+    factored = (Fact == FACTORED);
+
+    /* Test the options choices. */
+    *info = 0;
+    Fact = options->Fact;
+    if (Fact < 0 || Fact > FACTORED)
+	*info = -1;
+    else if (options->RowPerm < 0 || options->RowPerm > MY_PERMR)
+	*info = -1;
+    else if (options->ColPerm < 0 || options->ColPerm > MY_PERMC)
+	*info = -1;
+    else if (options->IterRefine < 0 || options->IterRefine > SLU_EXTRA)
+	*info = -1;
+    else if (options->IterRefine == SLU_EXTRA) {
+	*info = -1;
+        fprintf (stderr,
+	         "Extra precise iterative refinement yet to support.");
+    }
+    if (*info) {
+	i = -(*info);
+	pxerr_dist ("pdgssvx3d", grid, -*info);
+	return;
+    }
+
+#if ( DEBUGlevel>=1 )
+	CHECK_MALLOC (iam, "Enter pdgssvx3d()");
+#endif
+	
     /* Perform preprocessing steps on process layer zero, including:
        ordering, symbolic factorization, distribution of L & U */
     if (grid3d->zscp.Iam == 0)
     {
-	m = A->nrow;
-	n = A->ncol;
-	Astore = (NRformat_loc *) A->Store;
-	nnz_loc = Astore->nnz_loc;
-	m_loc = Astore->m_loc;
-	fst_row = Astore->fst_row;
-	a = (double *) Astore->nzval;
-	rowptr = Astore->rowptr;
-	colind = Astore->colind;
+        m = A->nrow;
+    	n = A->ncol;
+    	Astore = (NRformat_loc *) A->Store;
+    	nnz_loc = Astore->nnz_loc;
+    	m_loc = Astore->m_loc;
+    	fst_row = Astore->fst_row;
+    	a = (double *) Astore->nzval;
+    	rowptr = Astore->rowptr;
+    	colind = Astore->colind;
+
+	/* Test the other input parameters. */
+	if (A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc
+	     || A->Dtype != SLU_D || A->Mtype != SLU_GE)
+	     *info = -2;
+    	else if (ldb < m_loc)
+	     *info = -5;
+    	else if (nrhs < 0)
+	     *info = -6;
+	if (*info) {
+	   i = -(*info);
+	   pxerr_dist ("pdgssvx3d", grid, -*info);
+	   return;
+	}
+
+        /* Structures needed for parallel symbolic factorization */
+    	int_t *sizes, *fstVtxSep;
+	int noDomains, nprocs_num;
+    	MPI_Comm symb_comm;  /* communicator for symbolic factorization */
+    	int col, key; /* parameters for creating a new communicator */
+    	Pslu_freeable_t Pslu_freeable;
+    	float flinfo;
+    
 	sizes = NULL;
 	fstVtxSep = NULL;
 	symb_comm = MPI_COMM_NULL;
 	
-	/* Test the input parameters. */
-	*info = 0;
-	Fact = options->Fact;
-	if (Fact < 0 || Fact > FACTORED)
-	    *info = -1;
-	else if (options->RowPerm < 0 || options->RowPerm > MY_PERMR)
-	    *info = -1;
-	else if (options->ColPerm < 0 || options->ColPerm > MY_PERMC)
-	    *info = -1;
-	else if (options->IterRefine < 0 || options->IterRefine > SLU_EXTRA)
-	    *info = -1;
-	else if (options->IterRefine == SLU_EXTRA) {
-	    *info = -1;
-	    fprintf (stderr,
-		     "Extra precise iterative refinement yet to support.");
-	}
-	else if (A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc
-		 || A->Dtype != SLU_D || A->Mtype != SLU_GE)
-	    *info = -2;
-	else if (ldb < m_loc)
-	    *info = -5;
-	else if (nrhs < 0)
-	    *info = -6;
-	if (*info) {
-	    i = -(*info);
-	    pxerr_dist ("pdgssvx3d", grid, -*info);
-	    return;
-	}
-	
-	factored = (Fact == FACTORED);
 	Equil = (!factored && options->Equil == YES);
 	notran = (options->Trans == NOTRANS);
-	parSymbFact = options->ParSymbFact;
 	
 	iam = grid->iam;
 	job = 5;
@@ -631,10 +642,6 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 	C = ScalePermstruct->C;
 	/********/
 	
-#if ( DEBUGlevel>=1 )
-	CHECK_MALLOC (iam, "Enter pdgssvx3d()");
-#endif
-	
 	/* Not factored & ask for equilibration */
 	if (Equil && Fact != SamePattern_SameRowPerm) {
 	    /* Allocate storage if not done so before. */
@@ -776,6 +783,7 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 		    a_GA = (double *) GAstore->nzval;
 		    GA_mem_use += nnz * sizeof (double);
 		}
+
 		else
 		    assert (GAstore->nzval == NULL);
 	    }
@@ -1172,8 +1180,8 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 
     trf3Dpartition_t*  trf3Dpartition;
 
-    /* Perform numerical factorization in parallel on all process layers. */
-    if (!factored ) {
+    /* Perform numerical factorization in parallel on all process layers.*/
+    if ( !factored ) {
 
 	/* send the data across all the layers */
 	MPI_Bcast( &m, 1, mpi_int_t, 0,  grid3d->zscp.comm);
@@ -1217,16 +1225,20 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 		SCT_print3D(grid3d, SCT);
 	    }
 	SCT_printComm3D(grid3d, SCT);
+
 	/*print memory usage*/
-	printMemUse( trf3Dpartition, LUstruct, grid3d );
+	d3D_printMemUse( trf3Dpartition, LUstruct, grid3d );
+
 	/*print forest weight and costs*/
 	printForestWeightCost(trf3Dpartition->sForests, SCT, grid3d);
 	/*reduces stat from all the layers*/
 #endif
 
+	dDestroy_trf3Dpartition(trf3Dpartition, grid3d);
+
     } /* end if not Factored */
     
-    if ( grid3d->zscp.Iam == 0 ) {
+    if ( grid3d->zscp.Iam == 0 ) { // only process layer 0
 	if (!factored) {
 	    if (options->PrintStat) {
 		int_t TinyPivots;
@@ -1272,19 +1284,18 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 		}
 	    }
 	    
-	}   /* end if (!factored) */
+	}   /* end if not Factored */
 
 	/* ------------------------------------------------------------
 	   Compute the solution matrix X.
 	   ------------------------------------------------------------ */
-	if (nrhs)
-	    {
+	if (nrhs) {
 		if (!(b_work = doubleMalloc_dist (n)))
 		    ABORT ("Malloc fails for b_work[]");
 
 		/* ------------------------------------------------------
-		   Scale the right-hand side if equilibration was performed.
-		   ------------------------------------------------------ */
+		   Scale the right-hand side if equilibration was performed
+		   ------------------------------------------------------*/
 		if (notran)
 		    {
 			if (rowequ)
@@ -1329,9 +1340,9 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 		    b_col += ldb;
 		}
 
-		/* ------------------------------------------------------------
+		/* ------------------------------------------------------
 		   Solve the linear system.
-		   ------------------------------------------------------------ */
+		   ------------------------------------------------------*/
 		if (options->SolveInitialized == NO) /* First time */
                    /* Inside this routine, SolveInitialized is set to YES.
	              For repeated call to pdgssvx(), no need to re-initialize
@@ -1408,12 +1419,12 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 			    {	/* Use the existing solve structure */
 				SOLVEstruct1 = SOLVEstruct;
 			    }
-			else
-			    {	/* For nrhs > 1, since refinement is performed for RHS
-				   one at a time, the communication structure for pdgstrs
-				   is different than the solve with nrhs RHS.
-				   So we use SOLVEstruct1 for the refinement step.
-				*/
+			else {
+             /* For nrhs > 1, since refinement is performed for RHS
+		one at a time, the communication structure for pdgstrs
+		is different than the solve with nrhs RHS.
+		So we use SOLVEstruct1 for the refinement step.
+	      */
 				if (!(SOLVEstruct1 = (SOLVEstruct_t *)
 				      SUPERLU_MALLOC (sizeof (SOLVEstruct_t))))
 				    ABORT ("Malloc fails for SOLVEstruct1");
@@ -1523,9 +1534,11 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 	if (!factored && Fact != SamePattern_SameRowPerm && !parSymbFact)
 	    Destroy_CompCol_Permuted_dist (&GAC);
 #endif
+
+    } /* process layer 0 done solve */
+
 #if ( DEBUGlevel>=1 )
 	CHECK_MALLOC (iam, "Exit pdgssvx3d()");
 #endif
 
-    } /* process layer 0 */
 }
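
Net effect of the reshuffle in pdgssvx3d.c: checks that depend only on the
options run identically on every process layer, before the layer-0 section,
so all layers agree on *info; checks that dereference A (Stype/Dtype, ldb
versus m_loc, nrhs) stay inside the layer-0 branch, because A exists only
there. Condensed:

    *info = 0;
    if (options->Fact < 0 || options->Fact > FACTORED)
        *info = -1;                          /* options checks: every layer */
    if (*info) { pxerr_dist("pdgssvx3d", grid, -*info); return; }

    if (grid3d->zscp.Iam == 0) {             /* A-dependent checks: layer 0 */
        if (ldb < m_loc)        *info = -5;
        else if (nrhs < 0)      *info = -6;
        if (*info) { pxerr_dist("pdgssvx3d", grid, -*info); return; }
    }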
diff --git a/SRC/pdgstrf2.c b/SRC/pdgstrf2.c
index 756301c1..c122b92a 100644
--- a/SRC/pdgstrf2.c
+++ b/SRC/pdgstrf2.c
@@ -378,9 +378,21 @@ int_t LpanelUpdate(int_t off0,  int_t nsupc, double* ublk_ptr, int_t ld_ujrow,
     {
         int_t off = i * GT;
         int_t len = SUPERLU_MIN(GT, l - i * GT);
+#if 1
+  #if defined (USE_VENDOR_BLAS)
+        dtrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha,
+		ublk_ptr, &ld_ujrow, &lusup[off0 + off], &nsupr,
+		1, 1, 1, 1);
+  #else
+        dtrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha,
+		ublk_ptr, &ld_ujrow, &lusup[off0 + off], &nsupr);
+  #endif
+#else
         cblas_dtrsm (CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit,
                      len, nsupc, alpha, ublk_ptr, ld_ujrow, &lusup[off0 + off], nsupr);
-    }
+#endif
+
+    } /* for i = ... */
 
     t1 = _rdtsc() - t1;
 
@@ -472,9 +484,14 @@ void Local_Dgstrf2(superlu_dist_options_t *options, int_t k, double thresh,
             int_t l = nsupc - j - 1;
 
 	    /* Rank-1 update */
+#if 1
+	    dger_ (&l, &cols_left, &alpha, &lusup[luptr + 1], &incx,
+		   &ujrow[ld_ujrow], &incy, &lusup[luptr + nsupr + 1], &nsupr);
+#else
             cblas_dger (CblasColMajor, l, cols_left, alpha, &lusup[luptr + 1], incx,
                         &ujrow[ld_ujrow], incy, &lusup[luptr + nsupr + 1],
                         nsupr);
+#endif
             stat->ops[FACT] += 2 * l * cols_left;
         }
 
@@ -717,8 +734,21 @@ int_t dTrs2_GatherTrsmScatter(int_t klst, int_t iukp, int_t rukp,
     /*now call dtrsm on packed dense block*/
     int_t luptr = (knsupc - ldu) * (nsupr + 1);
     // if(ldu>nsupr) printf("nsupr %d ldu %d\n",nsupr,ldu );
+
+#if 1
+  #if defined (USE_VENDOR_BLAS)
+     dtrsm_ ("L", "L", "N", "U", &ldu, &ncols, &alpha,
+	     &lusup[luptr], &nsupr, tempv, &ldu,
+	     1, 1, 1, 1);
+  #else
+     dtrsm_ ("L", "L", "N", "U", &ldu, &ncols, &alpha,
+	     &lusup[luptr], &nsupr, tempv, &ldu);
+  #endif
+#else
+
     cblas_dtrsm (CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasUnit,
                  ldu, ncols, alpha, &lusup[luptr], nsupr, tempv, ldu);
+#endif
 
     /*now scatter the output into sparse U block*/
     dTrs2_ScatterU(iukp, rukp, klst, nsupc, ldu, usub, uval, tempv);
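
The rank-1 update gets the same treatment; dger_ has no character
arguments, so no hidden lengths are involved. The mapping, as a sketch:

    /* cblas_dger(CblasColMajor, m, n, alpha, x, incx, y, incy, a, lda)
       becomes (all arguments by address):                              */
    dger_(&m, &n, &alpha, x, &incx, y, &incy, a, &lda);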
diff --git a/SRC/pdgstrf3d.c b/SRC/pdgstrf3d.c
index 8161771b..953ceacc 100644
--- a/SRC/pdgstrf3d.c
+++ b/SRC/pdgstrf3d.c
@@ -130,6 +130,10 @@ int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
     double s_eps = smach_dist("Epsilon");
     double thresh = s_eps * anorm;
 
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (grid3d->iam, "Enter pdgstrf3d()");
+#endif
+
     // initialize stat
     stat->ops[FACT] = 0;
     //if (!grid3d->zscp.Iam && !grid3d->iam) printf("Using NSUP=%d\n", (int) ldt);
@@ -167,6 +171,12 @@ int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
     int tag_ub = set_tag_ub();
     int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
 
+#if ( PRNTlevel>=1 )
+    if (!iam) {
+        printf ("MPI tag upper bound = %d\n", tag_ub); fflush(stdout);
+    }
+#endif
+
     // trf3Dpartition_t*  trf3Dpartition = initTrf3Dpartition(nsupers, options, LUstruct, grid3d);
     gEtreeInfo_t gEtreeInfo = trf3Dpartition->gEtreeInfo;
     int_t* iperm_c_supno = trf3Dpartition->iperm_c_supno;
@@ -175,16 +185,15 @@ int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
     int_t* myZeroTrIdxs = trf3Dpartition->myZeroTrIdxs;
     sForest_t** sForests = trf3Dpartition->sForests;
     int_t** treePerm = trf3Dpartition->treePerm ;
-    LUValSubBuf_t *LUvsb = trf3Dpartition->LUvsb;
-    /*Initializing factorization specific buffers*/
+    dLUValSubBuf_t *LUvsb = trf3Dpartition->LUvsb;
+
+    /* Initializing factorization specific buffers */
 
     int_t numLA = getNumLookAhead(options);
-    LUValSubBuf_t**LUvsbs = dLluBufInitArr( SUPERLU_MAX( numLA, grid3d->zscp.Np ), LUstruct);
+    dLUValSubBuf_t** LUvsbs = dLluBufInitArr( SUPERLU_MAX( numLA, grid3d->zscp.Np ), LUstruct);
     msgs_t**msgss = initMsgsArr(numLA);
     int_t mxLeafNode    = 0;
-    for (int ilvl = 0; ilvl < maxLvl; ++ilvl)
-    {
-        /* code */
+    for (int ilvl = 0; ilvl < maxLvl; ++ilvl) {
         if (sForests[myTreeIdxs[ilvl]] && sForests[myTreeIdxs[ilvl]]->topoInfo.eTreeTopLims[1] > mxLeafNode )
             mxLeafNode    = sForests[myTreeIdxs[ilvl]]->topoInfo.eTreeTopLims[1];
     }
@@ -192,7 +201,7 @@ int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
     commRequests_t** comReqss = initCommRequestsArr(SUPERLU_MAX(mxLeafNode, numLA),
                                                     ldt, grid);
 
-    /*setting up GPU related stuff*/
+    /* Setting up GPU related data structures */
 
     int_t first_l_block_acc = 0;
     int_t first_u_block_acc = 0;
@@ -237,7 +246,7 @@ int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
         HyP->nCudaStreams = sluGPU->nCudaStreams;
     }
 
-#endif  // GPU_ACC
+#endif  // end GPU_ACC
 
     /*====  starting main factorization loop =====*/
     MPI_Barrier( grid3d->comm);
@@ -319,6 +328,9 @@ int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
 
     reduceStat(FACT, stat, grid3d);
 
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (grid3d->iam, "Exit pdgstrf3d()");
+#endif
     return 0;
 
 } /* pdgstrf3d */
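
A note on the buffer sizing in pdgstrf3d: the pool of dLUValSubBuf_t
buffers is sized SUPERLU_MAX(numLA, grid3d->zscp.Np), presumably so one
pool covers both the look-ahead window and the per-layer ancestor
reduction. With, say, numLA = 2 look-aheads on zscp.Np = 4 layers:

    int_t numLA = getNumLookAhead(options);              /* e.g. 2   */
    int_t nbufs = SUPERLU_MAX(numLA, grid3d->zscp.Np);   /* = 4 here */
    dLUValSubBuf_t **LUvsbs = dLluBufInitArr(nbufs, LUstruct);

mxLeafNode, computed in the loop above, is the largest leaf level
(eTreeTopLims[1]) over the trees this process factors, and sizes the
diagonal-factor and communication-request arrays accordingly.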
diff --git a/SRC/pz3dcomm.c b/SRC/pz3dcomm.c
index 969ddbd0..22835f83 100644
--- a/SRC/pz3dcomm.c
+++ b/SRC/pz3dcomm.c
@@ -214,7 +214,7 @@ int_t zzRecvLPanel(int_t k, int_t sender, doublecomplex alpha, doublecomplex bet
     int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
     doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
     gridinfo_t* grid = &(grid3d->grid2d);
-    
+    int inc = 1;    
     int_t iam = grid->iam;
     int_t mycol = MYCOL (iam, grid);
     
@@ -238,8 +238,13 @@ int_t zzRecvLPanel(int_t k, int_t sender, doublecomplex alpha, doublecomplex bet
 			     grid3d->zscp.comm, &status);
 		    
 		    /*reduce the updates*/
+#if 1
+		    zscal_(&len2, &alpha, lnzval, &inc);
+		    zaxpy_(&len2, &beta, Lval_buf, &inc, lnzval, &inc);
+#else
 		    cblas_zscal (len2, (void*) &alpha, lnzval, 1);
 		    cblas_zaxpy (len2, (void*) &beta, Lval_buf, 1, lnzval, 1);
+#endif
 		}
 	}
 
@@ -287,10 +292,11 @@ int_t zzRecvUPanel(int_t k, int_t sender, doublecomplex alpha, doublecomplex bet
     int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
     doublecomplex** Unzval_br_ptr = Llu->Unzval_br_ptr;
     gridinfo_t* grid = &(grid3d->grid2d);
+    int inc = 1;
     int_t iam = grid->iam;
-
     int_t myrow = MYROW (iam, grid);
     int_t pr = PROW( k, grid );
+
     if (myrow == pr)
 	{
 	    int_t lk = LBi( k, grid ); /* Local block number */
@@ -307,8 +313,13 @@ int_t zzRecvUPanel(int_t k, int_t sender, doublecomplex alpha, doublecomplex bet
 			     grid3d->zscp.comm, &status);
 		    
 		    /*reduce the updates*/
+#if 1
+		    zscal_(&lenv, &alpha, unzval, &inc);
+		    zaxpy_(&lenv, &beta, Uval_buf, &inc, unzval, &inc);
+#else
 		    cblas_zscal (lenv, (void*) &alpha, unzval, 1);
 		    cblas_zaxpy (lenv, (void*) &beta, Uval_buf, 1, unzval, 1);
+#endif
 		}
 	}
     return 0;
@@ -727,7 +738,7 @@ int_t zreduceAncestors3d(int_t sender, int_t receiver,
 
 int_t zgatherFactoredLU(int_t sender, int_t receiver,
                         int_t nnodes, int_t *nodeList,
-                        LUValSubBuf_t*LUvsb,
+                        zLUValSubBuf_t* LUvsb,
                         LUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT)
 {
     doublecomplex alpha = {0.0, 0.0}, beta = {1.0, 0.0};
@@ -777,7 +788,7 @@ int_t zinit3DLUstruct( int_t* myTreeIdxs, int_t* myZeroTrIdxs,
 
 
 int_t zreduceAllAncestors3d(int_t ilvl, int_t* myNodeCount, int_t** treePerm,
-                             LUValSubBuf_t* LUvsb, LUstruct_t* LUstruct,
+                             zLUValSubBuf_t* LUvsb, LUstruct_t* LUstruct,
                              gridinfo3d_t* grid3d, SCT_t* SCT )
 {
     doublecomplex * Lval_buf  = LUvsb->Lval_buf;
@@ -820,7 +831,7 @@ int_t zgatherAllFactoredLU( trf3Dpartition_t*  trf3Dpartition,
     int_t myGrid = grid3d->zscp.Iam;
     int_t* myZeroTrIdxs = trf3Dpartition->myZeroTrIdxs;
     sForest_t** sForests = trf3Dpartition->sForests;
-    LUValSubBuf_t* LUvsb =  trf3Dpartition->LUvsb;
+    zLUValSubBuf_t*  LUvsb =  trf3Dpartition->LUvsb;
     int_t*  gNodeCount = getNodeCountsFr(maxLvl, sForests);
     int_t** gNodeLists = getNodeListFr(maxLvl, sForests);
     
diff --git a/SRC/pzgssvx3d.c b/SRC/pzgssvx3d.c
index eb140cbc..a740e2dd 100644
--- a/SRC/pzgssvx3d.c
+++ b/SRC/pzgssvx3d.c
@@ -316,7 +316,7 @@ at the top-level directory.
  *         o RowPerm (rowperm_t)
  *           Specifies how to permute rows of the matrix A.
  *           = NATURAL:   use the natural ordering.
- *           = LargeDiag: use the Duff/Koster algorithm to permute rows of
+ *           = LargeDiag_MC64: use the Duff/Koster algorithm to permute rows of
  *                        the original matrix to make the diagonal large
  *                        relative to the off-diagonal.
  *           = MY_PERMR:  use the ordering given in ScalePermstruct->perm_r
@@ -343,7 +343,7 @@ at the top-level directory.
 *         NOTE: all options must be identical on all processes when
  *               calling this routine.
  *
- * A (input/output) SuperMatrix* (local)
+ * A (input/output) SuperMatrix* (local); A resides only on process layer 0.
  *         On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol).
  *           The number of linear equations is A->nrow. The type of A must be:
  *           Stype = SLU_NR_loc; Dtype = SLU_Z; Mtype = SLU_GE.
@@ -496,8 +496,8 @@ void
 pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
            ScalePermstruct_t * ScalePermstruct,
            doublecomplex B[], int ldb, int nrhs, gridinfo3d_t * grid3d,
-           LUstruct_t * LUstruct, SOLVEstruct_t * SOLVEstruct, double *berr,
-           SuperLUStat_t * stat, int *info)
+           LUstruct_t * LUstruct, SOLVEstruct_t * SOLVEstruct,
+           double *berr, SuperLUStat_t * stat, int *info)
 {
     NRformat_loc *Astore;
     SuperMatrix GA;        /* Global A in NC format */
@@ -518,6 +518,7 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
        routine. They will be freed after PDDISTRIBUTE routine.
        If options->Fact == SamePattern_SameRowPerm, these
        structures are not used.                                  */
+    yes_no_t parSymbFact = options->ParSymbFact;
     fact_t Fact;
     doublecomplex *a;
     int_t *colptr, *rowind;
@@ -540,75 +541,85 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 #if ( PRNTlevel>= 2 )
     double dmin, dsum, dprod;
 #endif
-    
-    /* Structures needed for parallel symbolic factorization */
-    int_t *sizes, *fstVtxSep, parSymbFact;
-    int noDomains, nprocs_num;
-    MPI_Comm symb_comm;  /* communicator for symbolic factorization */
-    int col, key;        /* parameters for creating a new communicator */
-    Pslu_freeable_t Pslu_freeable;
-    float flinfo;
-    
-    /* Initialization. */
 
-    /* definifition of factored seen by each process layer */
-    Fact = options->Fact;
-    factored = (Fact == FACTORED);
-    
     // get the 2d grid
     gridinfo_t *grid  = &(grid3d->grid2d);
     iam = grid->iam;
     
+    /* Initialization. */
+
+    /* definition of factored seen by each process layer */
+    Fact = options->Fact;
+    factored = (Fact == FACTORED);
+
+    /* Test the options choices. */
+    *info = 0;
+    Fact = options->Fact;
+    if (Fact < 0 || Fact > FACTORED)
+	*info = -1;
+    else if (options->RowPerm < 0 || options->RowPerm > MY_PERMR)
+	*info = -1;
+    else if (options->ColPerm < 0 || options->ColPerm > MY_PERMC)
+	*info = -1;
+    else if (options->IterRefine < 0 || options->IterRefine > SLU_EXTRA)
+	*info = -1;
+    else if (options->IterRefine == SLU_EXTRA) {
+	*info = -1;
+        fprintf (stderr,
+	         "Extra precise iterative refinement yet to support.");
+    }
+    if (*info) {
+	i = -(*info);
+	pxerr_dist ("pzgssvx3d", grid, -*info);
+	return;
+    }
+
+#if ( DEBUGlevel>=1 )
+	CHECK_MALLOC (iam, "Enter pzgssvx3d()");
+#endif
+	
     /* Perform preprocessing steps on process layer zero, including:
        ordering, symbolic factorization, distribution of L & U */
     if (grid3d->zscp.Iam == 0)
     {
-	m = A->nrow;
-	n = A->ncol;
-	Astore = (NRformat_loc *) A->Store;
-	nnz_loc = Astore->nnz_loc;
-	m_loc = Astore->m_loc;
-	fst_row = Astore->fst_row;
-	a = (doublecomplex *) Astore->nzval;
-	rowptr = Astore->rowptr;
-	colind = Astore->colind;
+        m = A->nrow;
+    	n = A->ncol;
+    	Astore = (NRformat_loc *) A->Store;
+    	nnz_loc = Astore->nnz_loc;
+    	m_loc = Astore->m_loc;
+    	fst_row = Astore->fst_row;
+    	a = (doublecomplex *) Astore->nzval;
+    	rowptr = Astore->rowptr;
+    	colind = Astore->colind;
+
+	/* Test the other input parameters. */
+	if (A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc
+	     || A->Dtype != SLU_Z || A->Mtype != SLU_GE)
+	     *info = -2;
+    	else if (ldb < m_loc)
+	     *info = -5;
+    	else if (nrhs < 0)
+	     *info = -6;
+	if (*info) {
+	   i = -(*info);
+	   pxerr_dist ("pzgssvx3d", grid, -*info);
+	   return;
+	}
+
+        /* Structures needed for parallel symbolic factorization */
+    	int_t *sizes, *fstVtxSep;
+	int noDomains, nprocs_num;
+    	MPI_Comm symb_comm;  /* communicator for symbolic factorization */
+    	int col, key; /* parameters for creating a new communicator */
+    	Pslu_freeable_t Pslu_freeable;
+    	float flinfo;
+    
 	sizes = NULL;
 	fstVtxSep = NULL;
 	symb_comm = MPI_COMM_NULL;
 	
-	/* Test the input parameters. */
-	*info = 0;
-	Fact = options->Fact;
-	if (Fact < 0 || Fact > FACTORED)
-	    *info = -1;
-	else if (options->RowPerm < 0 || options->RowPerm > MY_PERMR)
-	    *info = -1;
-	else if (options->ColPerm < 0 || options->ColPerm > MY_PERMC)
-	    *info = -1;
-	else if (options->IterRefine < 0 || options->IterRefine > SLU_EXTRA)
-	    *info = -1;
-	else if (options->IterRefine == SLU_EXTRA) {
-	    *info = -1;
-	    fprintf (stderr,
-		     "Extra precise iterative refinement yet to support.");
-	}
-	else if (A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc
-		 || A->Dtype != SLU_Z || A->Mtype != SLU_GE)
-	    *info = -2;
-	else if (ldb < m_loc)
-	    *info = -5;
-	else if (nrhs < 0)
-	    *info = -6;
-	if (*info) {
-	    i = -(*info);
-	    pxerr_dist ("pzgssvx3d", grid, -*info);
-	    return;
-	}
-	
-	factored = (Fact == FACTORED);
 	Equil = (!factored && options->Equil == YES);
 	notran = (options->Trans == NOTRANS);
-	parSymbFact = options->ParSymbFact;
 	
 	iam = grid->iam;
 	job = 5;
@@ -630,10 +641,6 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 	C = ScalePermstruct->C;
 	/********/
 	
-#if ( DEBUGlevel>=1 )
-	CHECK_MALLOC (iam, "Enter pzgssvx3d()");
-#endif
-	
 	/* Not factored & ask for equilibration */
 	if (Equil && Fact != SamePattern_SameRowPerm) {
 	    /* Allocate storage if not done so before. */
@@ -774,8 +781,9 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 		
 		if (need_value) {
 		    a_GA = (doublecomplex *) GAstore->nzval;
-		    GA_mem_use += nnz * sizeof (double);
+		    GA_mem_use += nnz * sizeof (doublecomplex);
 		}
+
 		else
 		    assert (GAstore->nzval == NULL);
 	    }
@@ -1173,8 +1181,8 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 
     trf3Dpartition_t*  trf3Dpartition;
 
-    /* Perform numerical factorization in parallel on all process layers. */
-    if (!factored ) {
+    /* Perform numerical factorization in parallel on all process layers.*/
+    if ( !factored ) {
 
 	/* send the data across all the layers */
 	MPI_Bcast( &m, 1, mpi_int_t, 0,  grid3d->zscp.comm);
@@ -1218,16 +1226,20 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 		SCT_print3D(grid3d, SCT);
 	    }
 	SCT_printComm3D(grid3d, SCT);
+
 	/*print memory usage*/
-	printMemUse( trf3Dpartition, LUstruct, grid3d );
+	z3D_printMemUse( trf3Dpartition, LUstruct, grid3d );
+
 	/*print forest weight and costs*/
 	printForestWeightCost(trf3Dpartition->sForests, SCT, grid3d);
 	/*reduces stat from all the layers*/
 #endif
 
+        zDestroy_trf3Dpartition(trf3Dpartition, grid3d);
+
     } /* end if not Factored */
     
-    if ( grid3d->zscp.Iam == 0 ) {
+    if ( grid3d->zscp.Iam == 0 ) { // only process layer 0
 	if (!factored) {
 	    if (options->PrintStat) {
 		int_t TinyPivots;
@@ -1273,19 +1285,18 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 		}
 	    }
 	    
-	}   /* end if (!factored) */
+	}   /* end if not Factored */
 
 	/* ------------------------------------------------------------
 	   Compute the solution matrix X.
 	   ------------------------------------------------------------ */
-	if (nrhs)
-	    {
+	if (nrhs) {
 		if (!(b_work = doublecomplexMalloc_dist (n)))
 		    ABORT ("Malloc fails for b_work[]");
 
 		/* ------------------------------------------------------
-		   Scale the right-hand side if equilibration was performed.
-		   ------------------------------------------------------ */
+		   Scale the right-hand side if equilibration was performed
+		   ------------------------------------------------------*/
 		if (notran)
 		    {
 			if (rowequ)
@@ -1330,9 +1341,9 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 		    b_col += ldb;
 		}
 
-		/* ------------------------------------------------------------
+		/* ------------------------------------------------------
 		   Solve the linear system.
-		   ------------------------------------------------------------ */
+		   ------------------------------------------------------*/
 		if (options->SolveInitialized == NO) /* First time */
                    /* Inside this routine, SolveInitialized is set to YES.
	              For repeated call to pzgssvx(), no need to re-initialize
@@ -1409,12 +1420,12 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 			    {	/* Use the existing solve structure */
 				SOLVEstruct1 = SOLVEstruct;
 			    }
-			else
-			    {	/* For nrhs > 1, since refinement is performed for RHS
-				   one at a time, the communication structure for pdgstrs
-				   is different than the solve with nrhs RHS.
-				   So we use SOLVEstruct1 for the refinement step.
-				*/
+			else {
+             /* For nrhs > 1, since refinement is performed for RHS
+		one at a time, the communication structure for pdgstrs
+		is different than the solve with nrhs RHS.
+		So we use SOLVEstruct1 for the refinement step.
+	      */
 				if (!(SOLVEstruct1 = (SOLVEstruct_t *)
 				      SUPERLU_MALLOC (sizeof (SOLVEstruct_t))))
 				    ABORT ("Malloc fails for SOLVEstruct1");
@@ -1524,9 +1535,11 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 	if (!factored && Fact != SamePattern_SameRowPerm && !parSymbFact)
 	    Destroy_CompCol_Permuted_dist (&GAC);
 #endif
+
+    } /* process layer 0 done solve */
+
 #if ( DEBUGlevel>=1 )
 	CHECK_MALLOC (iam, "Exit pzgssvx3d()");
 #endif
 
-    } /* process layer 0 */
 }
diff --git a/SRC/pzgstrf.c b/SRC/pzgstrf.c
index 997cc051..1b79dee4 100644
--- a/SRC/pzgstrf.c
+++ b/SRC/pzgstrf.c
@@ -431,7 +431,7 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
 #if ( DEBUGlevel>=1 )
     if (s_eps == 0.0)
         printf (" ***** warning s_eps = %e *****\n", s_eps);
-    CHECK_MALLOC (iam, "Enter pdgstrf()");
+    CHECK_MALLOC (iam, "Enter pzgstrf()");
 #endif
 #if (PROFlevel >= 1 )
     gemm_stats = (gemm_profile *) SUPERLU_MALLOC(nsupers * sizeof(gemm_profile));
diff --git a/SRC/pzgstrf2.c b/SRC/pzgstrf2.c
index b3b093d8..6329364d 100644
--- a/SRC/pzgstrf2.c
+++ b/SRC/pzgstrf2.c
@@ -379,9 +379,21 @@ int_t LpanelUpdate(int_t off0,  int_t nsupc, doublecomplex* ublk_ptr, int_t ld_u
     {
         int_t off = i * GT;
         int_t len = SUPERLU_MIN(GT, l - i * GT);
+#if 1
+  #if defined (USE_VENDOR_BLAS)
+        ztrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha,
+		ublk_ptr, &ld_ujrow, &lusup[off0 + off], &nsupr,
+		1, 1, 1, 1);
+  #else
+        ztrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha,
+		ublk_ptr, &ld_ujrow, &lusup[off0 + off], &nsupr);
+  #endif
+#else
         cblas_ztrsm (CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit,
                      len, nsupc, (void*) &alpha, ublk_ptr, ld_ujrow, &lusup[off0 + off], nsupr);
-    }
+#endif
+
+    } /* for i = ... */
 
     t1 = _rdtsc() - t1;
 
@@ -474,9 +486,15 @@ void Local_Zgstrf2(superlu_dist_options_t *options, int_t k, double thresh,
             int_t l = nsupc - j - 1;
 
 	    /* Rank-1 update */
+#if 1
+	    zgeru_ (&l, &cols_left, &alpha, &lusup[luptr + 1], &incx,
+		    &ujrow[ld_ujrow], &incy, &lusup[luptr + nsupr + 1],
+		    &nsupr);
+#else
             cblas_zgeru (CblasColMajor, l, cols_left, &alpha, &lusup[luptr + 1], incx,
                         &ujrow[ld_ujrow], incy, &lusup[luptr + nsupr + 1],
                         nsupr);
+#endif
             stat->ops[FACT] += 8 * l * cols_left;
         }
 
@@ -719,8 +737,21 @@ int_t zTrs2_GatherTrsmScatter(int_t klst, int_t iukp, int_t rukp,
     /*now call ztrsm on packed dense block*/
     int_t luptr = (knsupc - ldu) * (nsupr + 1);
     // if(ldu>nsupr) printf("nsupr %d ldu %d\n",nsupr,ldu );
+
+#if 1
+  #if defined (USE_VENDOR_BLAS)
+     ztrsm_ ("L", "L", "N", "U", &ldu, &ncols, &alpha,
+	     &lusup[luptr], &nsupr, tempv, &ldu,
+	     1, 1, 1, 1);
+  #else
+     ztrsm_ ("L", "L", "N", "U", &ldu, &ncols, &alpha,
+	     &lusup[luptr], &nsupr, tempv, &ldu);
+  #endif
+#else
+
     cblas_ztrsm (CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasUnit,
                  ldu, ncols, (void*) &alpha, &lusup[luptr], nsupr, tempv, ldu);
+#endif
 
     /*now scatter the output into sparse U block*/
     zTrs2_ScatterU(iukp, rukp, klst, nsupc, ldu, usub, uval, tempv);
diff --git a/SRC/pzgstrf3d.c b/SRC/pzgstrf3d.c
index 380dac54..be579f93 100644
--- a/SRC/pzgstrf3d.c
+++ b/SRC/pzgstrf3d.c
@@ -129,6 +129,10 @@ int_t pzgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
     double s_eps = smach_dist("Epsilon");
     double thresh = s_eps * anorm;
 
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (grid3d->iam, "Enter pzgstrf3d()");
+#endif
+
     // initialize stat
     stat->ops[FACT] = 0;
     //if (!grid3d->zscp.Iam && !grid3d->iam) printf("Using NSUP=%d\n", (int) ldt);
@@ -166,6 +170,12 @@ int_t pzgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
     int tag_ub = set_tag_ub();
     int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
 
+#if ( PRNTlevel>=1 )
+    if (!iam) {
+        printf ("MPI tag upper bound = %d\n", tag_ub); fflush(stdout);
+    }
+#endif
+
     // trf3Dpartition_t*  trf3Dpartition = initTrf3Dpartition(nsupers, options, LUstruct, grid3d);
     gEtreeInfo_t gEtreeInfo = trf3Dpartition->gEtreeInfo;
     int_t* iperm_c_supno = trf3Dpartition->iperm_c_supno;
@@ -174,16 +184,15 @@ int_t pzgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
     int_t* myZeroTrIdxs = trf3Dpartition->myZeroTrIdxs;
     sForest_t** sForests = trf3Dpartition->sForests;
     int_t** treePerm = trf3Dpartition->treePerm ;
-    LUValSubBuf_t *LUvsb = trf3Dpartition->LUvsb;
-    /*Initializing factorization specific buffers*/
+    zLUValSubBuf_t *LUvsb = trf3Dpartition->LUvsb;
+
+    /* Initializing factorization specific buffers */
 
     int_t numLA = getNumLookAhead(options);
-    LUValSubBuf_t**LUvsbs = zLluBufInitArr( SUPERLU_MAX( numLA, grid3d->zscp.Np ), LUstruct);
+    zLUValSubBuf_t** LUvsbs = zLluBufInitArr( SUPERLU_MAX( numLA, grid3d->zscp.Np ), LUstruct);
     msgs_t**msgss = initMsgsArr(numLA);
     int_t mxLeafNode    = 0;
-    for (int ilvl = 0; ilvl < maxLvl; ++ilvl)
-    {
-        /* code */
+    for (int ilvl = 0; ilvl < maxLvl; ++ilvl) {
         if (sForests[myTreeIdxs[ilvl]] && sForests[myTreeIdxs[ilvl]]->topoInfo.eTreeTopLims[1] > mxLeafNode )
             mxLeafNode    = sForests[myTreeIdxs[ilvl]]->topoInfo.eTreeTopLims[1];
     }
@@ -191,7 +200,7 @@ int_t pzgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
     commRequests_t** comReqss = initCommRequestsArr(SUPERLU_MAX(mxLeafNode, numLA),
                                                     ldt, grid);
 
-    /*setting up GPU related stuff*/
+    /* Setting up GPU related data structures */
 
     int_t first_l_block_acc = 0;
     int_t first_u_block_acc = 0;
@@ -236,7 +245,7 @@ int_t pzgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
         HyP->nCudaStreams = sluGPU->nCudaStreams;
     }
 
-#endif  // GPU_ACC
+#endif  // end GPU_ACC
 
     /*====  starting main factorization loop =====*/
     MPI_Barrier( grid3d->comm);
@@ -318,6 +327,9 @@ int_t pzgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
 
     reduceStat(FACT, stat, grid3d);
 
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (grid3d->iam, "Exit pzgstrf3d()");
+#endif
     return 0;
 
 } /* pzgstrf3d */
diff --git a/SRC/pzgstrs.c b/SRC/pzgstrs.c
index d0d38563..194f6db3 100644
--- a/SRC/pzgstrs.c
+++ b/SRC/pzgstrs.c
@@ -2403,6 +2403,8 @@
 	MPI_Barrier( grid->comm );
 
-
+		//		if (!iam) { printf("DBG: pzgstrs: after Barrier\n"); fflush(stdout);}
 #if ( PROFlevel>=2 )
 		{
 			float msg_vol_max, msg_vol_sum, msg_cnt_max, msg_cnt_sum;
diff --git a/SRC/superlu_ddefs.h b/SRC/superlu_ddefs.h
index 493f6f8e..de4ab045 100644
--- a/SRC/superlu_ddefs.h
+++ b/SRC/superlu_ddefs.h
@@ -237,7 +237,7 @@ typedef struct
     double * Lval_buf ;
     int_t * Usub_buf ;
     double * Uval_buf ;
-} LUValSubBuf_t;
+} dLUValSubBuf_t;
 
 int_t scuStatUpdate(
     int_t knsupc,
@@ -246,7 +246,7 @@ int_t scuStatUpdate(
     SuperLUStat_t *stat
     );
 
-typedef struct trf3Dpartition_t
+typedef struct
 {
     gEtreeInfo_t gEtreeInfo;
     int_t* iperm_c_supno;
@@ -256,7 +256,7 @@ typedef struct trf3Dpartition_t
     int_t** treePerm;
     sForest_t** sForests;
     int_t* supernode2treeMap;
-    LUValSubBuf_t *LUvsb;
+    dLUValSubBuf_t  *LUvsb;
 } trf3Dpartition_t;
 
 typedef struct
@@ -279,6 +279,7 @@ typedef struct
     lPanelInfo_t* lPanelInfo;
 } packLUInfo_t;
 
+
 /*=====================*/
 
 /***********************************************************************
@@ -522,7 +523,6 @@ extern void dtrsm_(char*, char*, char*, char*, int*, int*,
                   int*, int, int, int, int);
 extern void dgemv_(char *, int *, int *, double *, double *a, int *,
                   double *, int *, double *, double *, int *, int);
-extern void dtrtri_(char*, char*, int*, double*, int*,int*);
 
 extern void dger_(int*, int*, double*, double*, int*,
                  double*, int*, double*, int*);
@@ -542,6 +542,13 @@ extern void dger_(int*, int*, double*, double*, int*,
 
 #endif
 
+extern int dscal_(int *n, double *da, double *dx, int *incx);
+extern int daxpy_(int *n, double *da, double *dx,
+	               int *incx, double *dy, int *incy);
+// LAPACK routine
+extern void dtrtri_(char*, char*, int*, double*, int*, int*);
+
+
 /*==== For 3D code ====*/
 
 extern void pdgssvx3d (superlu_dist_options_t *, SuperMatrix *,
@@ -644,6 +651,8 @@ extern void dRgather_U(int_t k, int_t jj0, int_t *usub, double *uval,
 extern trf3Dpartition_t* dinitTrf3Dpartition(int_t nsupers,
 					     superlu_dist_options_t *options,
 					     LUstruct_t *LUstruct, gridinfo3d_t * grid3d);
+extern void dDestroy_trf3Dpartition(trf3Dpartition_t *trf3Dpartition, gridinfo3d_t *grid3d);
+
 extern void d3D_printMemUse(trf3Dpartition_t*  trf3Dpartition,
 			    LUstruct_t *LUstruct, gridinfo3d_t * grid3d);
 
@@ -741,7 +750,7 @@ int_t dreduceAncestors3d(int_t sender, int_t receiver,
 /*reduces all nodelists required in a level*/
 int_t dreduceAllAncestors3d(int_t ilvl, int_t* myNodeCount,
                            int_t** treePerm,
-                           LUValSubBuf_t*LUvsb,
+                           dLUValSubBuf_t* LUvsb,
                            LUstruct_t* LUstruct,
                            gridinfo3d_t* grid3d,
                            SCT_t* SCT );
@@ -751,7 +760,7 @@ int_t dreduceAllAncestors3d(int_t ilvl, int_t* myNodeCount,
 	receiver[U(nodelist)] <-- sender[U(nodelist)];
 */
 int_t dgatherFactoredLU(int_t sender, int_t receiver,
-                       int_t nnodes, int_t *nodeList, LUValSubBuf_t*LUvsb,
+                       int_t nnodes, int_t *nodeList, dLUValSubBuf_t*  LUvsb,
                        LUstruct_t* LUstruct, gridinfo3d_t* grid3d,SCT_t* SCT );
 
 /*Gathers all the L and U factors to grid 0 for solve stage 
@@ -877,7 +886,7 @@ extern int_t dSchurComplementSetup(int_t k, int *msgcnt, Ublock_info_t*,
 extern int_t dSchurComplementSetupGPU(int_t k, msgs_t* msgs, packLUInfo_t*,
 				      int_t*, int_t*, int_t*, gEtreeInfo_t*,
 				      factNodelists_t*, scuBufs_t*,
-				      LUValSubBuf_t* LUvsb, gridinfo_t *,
+				      dLUValSubBuf_t* LUvsb, gridinfo_t *,
 				      LUstruct_t *, HyP_t*);
 extern double* dgetBigV(int_t, int_t);
 extern double* dgetBigU(int_t, gridinfo_t *, LUstruct_t *);
@@ -888,7 +897,7 @@ extern int_t* getPerm_c_supno(int_t nsupers, superlu_dist_options_t *,
 extern void getSCUweight(int_t nsupers, treeList_t* treeList, LUstruct_t *, gridinfo3d_t *);
 
     /* from treeFactorization.h */
-extern int_t dLluBufInit(LUValSubBuf_t*, LUstruct_t *);
+extern int_t dLluBufInit(dLUValSubBuf_t*, LUstruct_t *);
 extern int_t dinitScuBufs(int_t ldt, int_t num_threads, int_t nsupers,
 			  scuBufs_t*, LUstruct_t*, gridinfo_t *);
 
@@ -900,7 +909,7 @@ extern int_t treeFactor(
     scuBufs_t *scuBufs,          // contains buffers for schur complement update
     packLUInfo_t*packLUInfo,
     msgs_t*msgs,
-    LUValSubBuf_t*LUvsb,
+    dLUValSubBuf_t* LUvsb,
     diagFactBufs_t *dFBuf,
     factStat_t *factStat,
     factNodelists_t  *fNlists,
@@ -920,7 +929,7 @@ extern int_t dsparseTreeFactor(
     scuBufs_t *scuBufs,          // contains buffers for schur complement update
     packLUInfo_t*packLUInfo,
     msgs_t*msgs,
-    LUValSubBuf_t*LUvsb,
+    dLUValSubBuf_t* LUvsb,
     diagFactBufs_t *dFBuf,
     factStat_t *factStat,
     factNodelists_t  *fNlists,
@@ -939,7 +948,7 @@ extern int_t ddenseTreeFactor(
     scuBufs_t *scuBufs,          // contains buffers for schur complement update
     packLUInfo_t*packLUInfo,
     msgs_t*msgs,
-    LUValSubBuf_t*LUvsb,
+    dLUValSubBuf_t* LUvsb,
     diagFactBufs_t *dFBuf,
     factStat_t *factStat,
     factNodelists_t  *fNlists,
@@ -957,7 +966,7 @@ extern int_t dsparseTreeFactor_ASYNC(
     scuBufs_t *scuBufs,          // contains buffers for schur complement update
     packLUInfo_t*packLUInfo,
     msgs_t**msgss,                  // size=num Look ahead
-    LUValSubBuf_t**LUvsbs,          // size=num Look ahead
+    dLUValSubBuf_t** LUvsbs,          // size=num Look ahead
     diagFactBufs_t **dFBufs,         // size maxEtree level
     factStat_t *factStat,
     factNodelists_t  *fNlists,
@@ -970,7 +979,7 @@ extern int_t dsparseTreeFactor_ASYNC(
     double thresh,  SCT_t *SCT, int tag_ub,
     int *info
 );
-extern LUValSubBuf_t** dLluBufInitArr(int_t numLA, LUstruct_t *LUstruct);
+extern dLUValSubBuf_t** dLluBufInitArr(int_t numLA, LUstruct_t *LUstruct);
 extern diagFactBufs_t** dinitDiagFactBufsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* grid);
 extern int_t dinitDiagFactBufs(int_t ldt, diagFactBufs_t* dFBuf);
 extern int_t checkRecvUDiag(int_t k, commRequests_t *comReqs,
@@ -985,7 +994,7 @@ extern int_t ancestorFactor(
     scuBufs_t *scuBufs,          // contains buffers for schur complement update
     packLUInfo_t*packLUInfo,
     msgs_t**msgss,                  // size=num Look ahead
-    LUValSubBuf_t**LUvsbs,          // size=num Look ahead
+    dLUValSubBuf_t** LUvsbs,          // size=num Look ahead
     diagFactBufs_t **dFBufs,         // size maxEtree level
     factStat_t *factStat,
     factNodelists_t  *fNlists,
diff --git a/SRC/superlu_defs.h b/SRC/superlu_defs.h
index a0bfbe8b..96d9d62a 100644
--- a/SRC/superlu_defs.h
+++ b/SRC/superlu_defs.h
@@ -344,7 +344,7 @@ typedef struct {
     MPI_Comm comm;        /* MPI communicator */
     superlu_scope_t rscp; /* row scope */
     superlu_scope_t cscp; /* column scope */
-    superlu_scope_t zscp; /* scope in third dim */
+    superlu_scope_t zscp; /* scope in third dimension */
     int iam;              /* my process number in this grid */
     int_t nprow;          /* number of process rows */
     int_t npcol;          /* number of process columns */
@@ -1186,7 +1186,7 @@ extern int_t* getIsNodeInMyGrid(int_t nsupers, int_t maxLvl, int_t* myNodeCount,
 extern void printForestWeightCost(sForest_t**  sForests, SCT_t* SCT, gridinfo3d_t* grid3d);
 extern sForest_t**  getGreedyLoadBalForests( int_t maxLvl, int_t nsupers, int_t* setree, treeList_t* treeList);
 extern sForest_t**  getForests( int_t maxLvl, int_t nsupers, int_t*setree, treeList_t* treeList);
-    
+
     /* from trfAux.h */
 extern int set_tag_ub();
 extern int getNumThreads(int);
diff --git a/SRC/superlu_zdefs.h b/SRC/superlu_zdefs.h
index 2341ac7a..edd24a1a 100644
--- a/SRC/superlu_zdefs.h
+++ b/SRC/superlu_zdefs.h
@@ -237,7 +237,7 @@ typedef struct
     doublecomplex * Lval_buf ;
     int_t * Usub_buf ;
     doublecomplex * Uval_buf ;
-} LUValSubBuf_t;
+} zLUValSubBuf_t;
 
 int_t scuStatUpdate(
     int_t knsupc,
@@ -246,7 +246,7 @@ int_t scuStatUpdate(
     SuperLUStat_t *stat
     );
 
-typedef struct trf3Dpartition_t
+typedef struct
 {
     gEtreeInfo_t gEtreeInfo;
     int_t* iperm_c_supno;
@@ -256,7 +256,7 @@ typedef struct trf3Dpartition_t
     int_t** treePerm;
     sForest_t** sForests;
     int_t* supernode2treeMap;
-    LUValSubBuf_t *LUvsb;
+    zLUValSubBuf_t  *LUvsb;
 } trf3Dpartition_t;
 
 typedef struct
@@ -279,6 +279,7 @@ typedef struct
     lPanelInfo_t* lPanelInfo;
 } packLUInfo_t;
 
+
 /*=====================*/
 
 /***********************************************************************
@@ -525,7 +526,6 @@ extern void ztrsm_(char*, char*, char*, char*, int*, int*,
                   int*, int, int, int, int);
 extern void zgemv_(char *, int *, int *, doublecomplex *, doublecomplex *a, int *,
                   doublecomplex *, int *, doublecomplex *, doublecomplex *, int *, int);
-extern void ztrtri_(char*, char*, int*, doublecomplex*, int*,int*);
 
 extern void zgeru_(int*, int*, doublecomplex*, doublecomplex*, int*,
                  doublecomplex*, int*, doublecomplex*, int*);
@@ -545,6 +545,13 @@ extern int zgeru_(int*, int*, doublecomplex*, doublecomplex*, int*,
 
 #endif
 
+extern int zscal_(int *n, doublecomplex *da, doublecomplex *dx, int *incx);
+extern int zaxpy_(int *n, doublecomplex *za, doublecomplex *zx, 
+	               int *incx, doublecomplex *zy, int *incy);
+// LAPACK routine
+extern void ztrtri_(char*, char*, int*, doublecomplex*, int*, int*);
+
+
 /*==== For 3D code ====*/
 
 extern void pzgssvx3d (superlu_dist_options_t *, SuperMatrix *,
@@ -647,6 +654,8 @@ extern void zRgather_U(int_t k, int_t jj0, int_t *usub, doublecomplex *uval,
 extern trf3Dpartition_t* zinitTrf3Dpartition(int_t nsupers,
 					     superlu_dist_options_t *options,
 					     LUstruct_t *LUstruct, gridinfo3d_t * grid3d);
+extern void zDestroy_trf3Dpartition(trf3Dpartition_t *trf3Dpartition, gridinfo3d_t *grid3d);
+
 extern void z3D_printMemUse(trf3Dpartition_t*  trf3Dpartition,
 			    LUstruct_t *LUstruct, gridinfo3d_t * grid3d);
 
@@ -744,7 +753,7 @@ int_t zreduceAncestors3d(int_t sender, int_t receiver,
 /*reduces all nodelists required in a level*/
 int_t zreduceAllAncestors3d(int_t ilvl, int_t* myNodeCount,
                            int_t** treePerm,
-                           LUValSubBuf_t*LUvsb,
+                           zLUValSubBuf_t* LUvsb,
                            LUstruct_t* LUstruct,
                            gridinfo3d_t* grid3d,
                            SCT_t* SCT );
@@ -754,7 +763,7 @@ int_t zreduceAllAncestors3d(int_t ilvl, int_t* myNodeCount,
 	receiver[U(nodelist)] <-- sender[U(nodelist)];
 */
 int_t zgatherFactoredLU(int_t sender, int_t receiver,
-                       int_t nnodes, int_t *nodeList, LUValSubBuf_t*LUvsb,
+                       int_t nnodes, int_t *nodeList, zLUValSubBuf_t*  LUvsb,
                        LUstruct_t* LUstruct, gridinfo3d_t* grid3d,SCT_t* SCT );
 
 /*Gathers all the L and U factors to grid 0 for solve stage 
@@ -880,7 +889,7 @@ extern int_t zSchurComplementSetup(int_t k, int *msgcnt, Ublock_info_t*,
 extern int_t zSchurComplementSetupGPU(int_t k, msgs_t* msgs, packLUInfo_t*,
 				      int_t*, int_t*, int_t*, gEtreeInfo_t*,
 				      factNodelists_t*, scuBufs_t*,
-				      LUValSubBuf_t* LUvsb, gridinfo_t *,
+				      zLUValSubBuf_t* LUvsb, gridinfo_t *,
 				      LUstruct_t *, HyP_t*);
 extern doublecomplex* zgetBigV(int_t, int_t);
 extern doublecomplex* zgetBigU(int_t, gridinfo_t *, LUstruct_t *);
@@ -891,7 +900,7 @@ extern int_t* getPerm_c_supno(int_t nsupers, superlu_dist_options_t *,
 extern void getSCUweight(int_t nsupers, treeList_t* treeList, LUstruct_t *, gridinfo3d_t *);
 
     /* from treeFactorization.h */
-extern int_t zLluBufInit(LUValSubBuf_t*, LUstruct_t *);
+extern int_t zLluBufInit(zLUValSubBuf_t*, LUstruct_t *);
 extern int_t zinitScuBufs(int_t ldt, int_t num_threads, int_t nsupers,
 			  scuBufs_t*, LUstruct_t*, gridinfo_t *);
 
@@ -903,7 +912,7 @@ extern int_t treeFactor(
     scuBufs_t *scuBufs,          // contains buffers for schur complement update
     packLUInfo_t*packLUInfo,
     msgs_t*msgs,
-    LUValSubBuf_t*LUvsb,
+    zLUValSubBuf_t* LUvsb,
     diagFactBufs_t *dFBuf,
     factStat_t *factStat,
     factNodelists_t  *fNlists,
@@ -923,7 +932,7 @@ extern int_t zsparseTreeFactor(
     scuBufs_t *scuBufs,          // contains buffers for schur complement update
     packLUInfo_t*packLUInfo,
     msgs_t*msgs,
-    LUValSubBuf_t*LUvsb,
+    zLUValSubBuf_t* LUvsb,
     diagFactBufs_t *dFBuf,
     factStat_t *factStat,
     factNodelists_t  *fNlists,
@@ -942,7 +951,7 @@ extern int_t zdenseTreeFactor(
     scuBufs_t *scuBufs,          // contains buffers for schur complement update
     packLUInfo_t*packLUInfo,
     msgs_t*msgs,
-    LUValSubBuf_t*LUvsb,
+    zLUValSubBuf_t* LUvsb,
     diagFactBufs_t *dFBuf,
     factStat_t *factStat,
     factNodelists_t  *fNlists,
@@ -960,7 +969,7 @@ extern int_t zsparseTreeFactor_ASYNC(
     scuBufs_t *scuBufs,          // contains buffers for schur complement update
     packLUInfo_t*packLUInfo,
     msgs_t**msgss,                  // size=num Look ahead
-    LUValSubBuf_t**LUvsbs,          // size=num Look ahead
+    zLUValSubBuf_t** LUvsbs,          // size=num Look ahead
     diagFactBufs_t **dFBufs,         // size maxEtree level
     factStat_t *factStat,
     factNodelists_t  *fNlists,
@@ -973,7 +982,7 @@ extern int_t zsparseTreeFactor_ASYNC(
     double thresh,  SCT_t *SCT, int tag_ub,
     int *info
 );
-extern LUValSubBuf_t** zLluBufInitArr(int_t numLA, LUstruct_t *LUstruct);
+extern zLUValSubBuf_t** zLluBufInitArr(int_t numLA, LUstruct_t *LUstruct);
 extern diagFactBufs_t** zinitDiagFactBufsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* grid);
 extern int_t zinitDiagFactBufs(int_t ldt, diagFactBufs_t* dFBuf);
 extern int_t checkRecvUDiag(int_t k, commRequests_t *comReqs,
@@ -988,7 +997,7 @@ extern int_t ancestorFactor(
     scuBufs_t *scuBufs,          // contains buffers for schur complement update
     packLUInfo_t*packLUInfo,
     msgs_t**msgss,                  // size=num Look ahead
-    LUValSubBuf_t**LUvsbs,          // size=num Look ahead
+    zLUValSubBuf_t** LUvsbs,          // size=num Look ahead
     diagFactBufs_t **dFBufs,         // size maxEtree level
     factStat_t *factStat,
     factNodelists_t  *fNlists,
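A note on the renaming in the two headers above (background commentary, not part of the patch): giving the double- and complex-precision LU communication buffers distinct typedef names (dLUValSubBuf_t, zLUValSubBuf_t) is what allows a single translation unit to include both superlu_ddefs.h and superlu_zdefs.h, because C forbids redefining one typedef name with two different types. A minimal, self-contained sketch of the collision the prefixes avoid; the struct members are pared down for illustration:

    /* If both headers spelled the type "LUValSubBuf_t", including them
     * together would redefine the typedef with a different type -- an error.
     * Precision-prefixed names coexist without conflict: */
    typedef struct { double r, i; } doublecomplex;      /* stand-in for SuperLU's type */
    typedef struct { double        *Lval_buf; } dLUValSubBuf_t;
    typedef struct { doublecomplex *Lval_buf; } zLUValSubBuf_t;

    int main(void)
    {
        dLUValSubBuf_t dbuf = { 0 };   /* real buffers */
        zLUValSubBuf_t zbuf = { 0 };   /* complex buffers */
        (void) dbuf; (void) zbuf;
        return 0;
    }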
diff --git a/SRC/supernodalForest.c b/SRC/supernodalForest.c
index f3a3dfed..547ac29c 100644
--- a/SRC/supernodalForest.c
+++ b/SRC/supernodalForest.c
@@ -64,6 +64,8 @@ sForest_t**  getNestDissForests( int_t maxLvl, int_t nsupers, int_t*setree, tree
 	int_t** gNodeLists = getNodeList(maxLvl, setree, gNodeCount,
 	                                 gTreeHeads,  treeList);
 
+	SUPERLU_FREE(gTreeHeads); // Sherry added
+
 	for (int i = 0; i < numForests; ++i)
 	{
 		sForests[i] = NULL;
@@ -170,8 +172,6 @@ int_t calcTopInfoForest(sForest_t *forest,
 	SUPERLU_FREE(myTopOrderOld);
 	myIperm = getMyIperm(nnodes, nsupers, nodeListNew);
 
-
-
 	treeTopoInfo_t ttI;
 	ttI.myIperm = myIperm;
 	ttI.numLvl = myTopOrder[nnodes - 1] + 1;
@@ -181,7 +181,6 @@ int_t calcTopInfoForest(sForest_t *forest,
 	forest->topoInfo = ttI;
 
 	return 0;
-
 }
 
 // #pragma optimize ("", off)
@@ -893,10 +892,8 @@ sForest_t**  getOneLevelBalForests( int_t maxLvl, int_t nsupers, int_t * setree,
 			rForests[0].treeHeads[nRootTrees] = i;
 			nRootTrees++;
 		}
-
 	}
 
-
 	if (maxLvl == 1)
 	{
 		/* code */
diff --git a/SRC/supernodal_etree.c b/SRC/supernodal_etree.c
index 78757a37..35981350 100644
--- a/SRC/supernodal_etree.c
+++ b/SRC/supernodal_etree.c
@@ -22,7 +22,8 @@ int_t log2i(int_t index)
  */
 int_t *supernodal_etree(int_t nsuper, int_t * etree, int_t* supno, int_t *xsup)
 {
-	int_t *setree = malloc(sizeof(int_t) * nsuper);
+    //	int_t *setree = malloc(sizeof(int_t) * nsuper);
+	int_t *setree = intMalloc_dist(nsuper);
 	/*initialzing the loop*/
 	for (int i = 0; i < nsuper; ++i)
 	{
@@ -59,7 +60,8 @@ int_t* topological_ordering(int_t nsuper, int_t* setree)
 
 treeList_t* setree2list(int_t nsuper, int_t* setree )
 {
-    treeList_t* treeList = 	(treeList_t* ) malloc (sizeof(treeList_t) * (nsuper + 1));
+    treeList_t* treeList = (treeList_t* ) SUPERLU_MALLOC (sizeof(treeList_t) * (nsuper + 1));
+
 	// initialize the struct
 	for (int i = 0; i < nsuper + 1; ++i)
 	{
@@ -77,6 +79,7 @@ treeList_t* setree2list(int_t nsuper, int_t* setree )
 	    treeList[parenti].numDescendents +=  treeList[i].numDescendents;
 	    treeList[parenti].numChild++;
 	}
+
 	/*allocate memory for children lists*/
 	for (int i = 0; i < nsuper + 1; ++i)
 	{
@@ -91,7 +94,9 @@ treeList_t* setree2list(int_t nsuper, int_t* setree )
 	    treeList[parenti].childrenList[treeList[parenti].numChild] = i;
 	    treeList[parenti].numChild++;
 	}
+
 	return treeList;
+
 } /* setree2list */
 
 int_t estimateWeight(int_t nsupers, int_t*setree, treeList_t* treeList, int_t* xsup)
diff --git a/SRC/trfAux.c b/SRC/trfAux.c
index f9310eee..ec8ea2c4 100644
--- a/SRC/trfAux.c
+++ b/SRC/trfAux.c
@@ -1253,3 +1253,4 @@ int_t Trs2_InitUblock_info(int_t klst, int_t nb,
     }
     return 0;
 }
+
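The supernodal_etree.c hunks above replace bare malloc with SuperLU's wrappers (SUPERLU_MALLOC, intMalloc_dist). One benefit, sketched below under the assumption that debug builds account for outstanding bytes the way superlu_malloc_dist does: every buffer is then released uniformly with SUPERLU_FREE and can be checked by CHECK_MALLOC-style bookkeeping. The helper names tracked_malloc/tracked_free here are hypothetical stand-ins, not library functions:

    #include <stdio.h>
    #include <stdlib.h>

    /* Hypothetical stand-ins for SUPERLU_MALLOC/SUPERLU_FREE: the same
     * pairing discipline, plus a byte counter similar to what a debug
     * build of the library tracks. */
    static size_t count_alloc = 0;

    static void *tracked_malloc(size_t bytes)
    {
        /* Over-allocate a header so the matching free knows the size. */
        size_t *p = malloc(bytes + sizeof(size_t));
        if (!p) return NULL;
        *p = bytes;
        count_alloc += bytes;
        return p + 1;
    }

    static void tracked_free(void *addr)
    {
        size_t *p = (size_t *) addr - 1;
        count_alloc -= *p;
        free(p);
    }

    int main(void)
    {
        int *setree = tracked_malloc(100 * sizeof(int)); /* like intMalloc_dist(100) */
        printf("outstanding bytes: %zu\n", count_alloc);
        tracked_free(setree);
        printf("outstanding bytes: %zu\n", count_alloc);
        return 0;
    }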
diff --git a/SRC/zmemory_dist.c b/SRC/zmemory_dist.c
index d5bd0978..cdf50f4d 100644
--- a/SRC/zmemory_dist.c
+++ b/SRC/zmemory_dist.c
@@ -169,3 +169,117 @@ doublecomplex *doublecomplexCalloc_dist(int_t n)
     return (buf);
 }
 
+/***************************************
+ * The following are from 3D code.
+ ***************************************/
+
+double zgetLUMem(int_t nodeId, LUstruct_t *LUstruct, gridinfo3d_t *grid3d)
+{
+    double memlu = 0.0;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    LocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = LUstruct->Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    // double** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    int_t iam = grid->iam;
+
+    int_t myrow = MYROW (iam, grid);
+    int_t mycol = MYCOL (iam, grid);
+
+    int_t pc = PCOL( nodeId, grid );
+    if (mycol == pc)
+    {
+        int_t ljb = LBj( nodeId, grid ); /* Local block number */
+        int_t  *lsub;
+        doublecomplex* lnzval;
+        lsub = Lrowind_bc_ptr[ljb];
+        lnzval = Lnzval_bc_ptr[ljb];
+
+        if (lsub != NULL)
+        {
+            int_t nrbl  =   lsub[0]; /*number of L blocks */
+            int_t  len   = lsub[1];       /* LDA of the nzval[] */
+            int_t len1  = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
+            int_t len2  = SuperSize(nodeId) * len;
+            memlu += 1.0 * (len1 * sizeof(int_t)  + len2 * sizeof(doublecomplex));
+        }
+    }
+
+    int_t pr = PROW( nodeId, grid );
+    if (myrow == pr)
+    {
+        int_t lib = LBi( nodeId, grid ); /* Local block number */
+        int_t  *usub;
+        // double* unzval;
+        usub = Ufstnz_br_ptr[lib];
+
+        if (usub != NULL)
+        {
+            int_t lenv = usub[1];
+            int_t lens = usub[2];
+            memlu += 1.0 * (lenv * sizeof(int_t)  + lens * sizeof(doublecomplex));
+        }
+    }
+    return memlu;
+}
+
+double  zmemForest(sForest_t*sforest, LUstruct_t *LUstruct, gridinfo3d_t *grid3d)
+{
+    double memlu = 0;
+
+    int_t *perm_c_supno = sforest->nodeList;
+    int_t nnodes =   sforest->nNodes;
+    for (int i = 0; i < nnodes; ++i)
+    {
+        memlu += zgetLUMem(perm_c_supno[i], LUstruct, grid3d);
+    }
+
+    return memlu;
+}
+
+void z3D_printMemUse( trf3Dpartition_t*  trf3Dpartition,  LUstruct_t *LUstruct,
+		      gridinfo3d_t * grid3d )
+{
+    int_t* myTreeIdxs = trf3Dpartition->myTreeIdxs;
+    int_t* myZeroTrIdxs = trf3Dpartition->myZeroTrIdxs;
+    sForest_t** sForests = trf3Dpartition->sForests;
+
+    double memNzLU = 0.0;
+    double memzLU = 0.0;
+    int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
+
+    for (int_t ilvl = 0; ilvl < maxLvl; ++ilvl)
+    {
+        sForest_t* sforest = sForests[myTreeIdxs[ilvl]];
+
+        if (sforest)
+        {
+            if (!myZeroTrIdxs[ilvl])
+            {
+                memNzLU += zmemForest(sforest, LUstruct, grid3d);
+            }
+            else
+            {
+                memzLU += zmemForest(sforest, LUstruct, grid3d);
+            }
+        }
+    }
+    double sumMem = memNzLU + memzLU;
+    double maxMem, minMem,  avgNzLU, avgzLU;
+    /*Now reduce it among all the procs*/
+    MPI_Reduce(&sumMem, &maxMem, 1, MPI_DOUBLE, MPI_MAX, 0, grid3d->comm);
+    MPI_Reduce(&sumMem, &minMem, 1, MPI_DOUBLE, MPI_MIN, 0, grid3d->comm);
+    MPI_Reduce(&memNzLU, &avgNzLU, 1, MPI_DOUBLE, MPI_SUM, 0, grid3d->comm);
+    MPI_Reduce(&memzLU, &avgzLU, 1, MPI_DOUBLE, MPI_SUM, 0, grid3d->comm);
+
+    int_t nProcs = grid3d->nprow * grid3d->npcol * grid3d->npdep;
+    if (!(grid3d->iam))
+    {
+        /* code */
+        printf("| Total Memory \t| %.2g  \t| %.2g  \t|%.2g  \t|\n", (avgNzLU + avgzLU) / nProcs, maxMem, minMem );
+        printf("| LU-LU(repli) \t| %.2g  \t| %.2g  \t|\n", (avgNzLU) / nProcs, avgzLU / nProcs );
+    }
+}
+
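For reference, the four MPI_Reduce calls in z3D_printMemUse above compute the maximum, the minimum, and the sums of the per-process memory; dividing the sums by nProcs on the root gives the averages printed in the table (the three unlabeled columns are average, max, min). A minimal standalone sketch of this reduce-and-normalize idiom, with made-up variable names:

    #include <mpi.h>
    #include <stdio.h>

    int main(int argc, char *argv[])
    {
        MPI_Init(&argc, &argv);
        int rank, nprocs;
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

        double mybytes = 1.0e6 * (rank + 1);  /* per-process quantity, e.g. LU memory */
        double maxb, minb, sumb;
        MPI_Reduce(&mybytes, &maxb, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
        MPI_Reduce(&mybytes, &minb, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
        MPI_Reduce(&mybytes, &sumb, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

        if (rank == 0)  /* only the root holds valid reduction results */
            printf("avg %.2g  max %.2g  min %.2g\n", sumb / nprocs, maxb, minb);

        MPI_Finalize();
        return 0;
    }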
diff --git a/SRC/zscatter3d.c b/SRC/zscatter3d.c
index da54583d..356d6552 100644
--- a/SRC/zscatter3d.c
+++ b/SRC/zscatter3d.c
@@ -133,7 +133,7 @@ zblock_gemm_scatter( int_t lb, int_t j,
     // int_t cum_nrow =  Remain_info[lb].StRow;
     doublecomplex alpha = {1.0, 0.0}, beta = {0.0, 0.0};
 
-    /* calling DGEMM */
+    /* calling ZGEMM */
     // printf(" m %d n %d k %d ldu %d ldl %d st_col %d \n",temp_nbrow,ncols,ldu,ldl,st_col );
 #if 1
     zgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
@@ -238,14 +238,14 @@ zblock_gemm_scatter_lock( int_t lb, int_t j,
 
     doublecomplex alpha = {1.0, 0.0}, beta = {0.0, 0.0};
 
-    /* calling DGEMM */
+    /* calling ZGEMM */
 #if 1
     // printf(" m %d n %d k %d ldl %d st_col %d \n",temp_nbrow,ncols,ldu,ldl,st_col );
     zgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
            &L_mat[(knsupc - ldu)*ldl + cum_nrow], &ldl,
            &U_mat[st_col * ldu], &ldu, &beta, tempv1, &temp_nbrow);
 #else
-    cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
+    cblas_zgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
                 temp_nbrow, ncols, ldu, alpha,
                 &L_mat[(knsupc - ldu)*ldl + cum_nrow], ldl,
                 &U_mat[st_col * ldu], ldu,
diff --git a/SRC/ztreeFactorization.c b/SRC/ztreeFactorization.c
index bd03bb2e..db34bb00 100644
--- a/SRC/ztreeFactorization.c
+++ b/SRC/ztreeFactorization.c
@@ -23,7 +23,7 @@ at the top-level directory.
 #include "trfCommWrapper.h"
 #endif
 
-int_t zLluBufInit(LUValSubBuf_t* LUvsb, LUstruct_t *LUstruct)
+int_t zLluBufInit(zLUValSubBuf_t* LUvsb, LUstruct_t *LUstruct)
 {
     LocalLU_t *Llu = LUstruct->Llu;
     LUvsb->Lsub_buf = intMalloc_dist(Llu->bufmax[0]); //INT_T_ALLOC(Llu->bufmax[0]);
@@ -49,13 +49,13 @@ diagFactBufs_t** zinitDiagFactBufsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* g
     return dFBufs;
 }
 
-LUValSubBuf_t** zLluBufInitArr(int_t numLA, LUstruct_t *LUstruct)
+zLUValSubBuf_t** zLluBufInitArr(int_t numLA, LUstruct_t *LUstruct)
 {
-    LUValSubBuf_t** LUvsbs = (LUValSubBuf_t**) SUPERLU_MALLOC(numLA * sizeof(LUValSubBuf_t*));
+    zLUValSubBuf_t** LUvsbs = (zLUValSubBuf_t**) SUPERLU_MALLOC(numLA * sizeof(zLUValSubBuf_t*));
     for (int_t i = 0; i < numLA; ++i)
     {
         /* code */
-        LUvsbs[i] = (LUValSubBuf_t*) SUPERLU_MALLOC(sizeof(LUValSubBuf_t));
+        LUvsbs[i] = (zLUValSubBuf_t*) SUPERLU_MALLOC(sizeof(zLUValSubBuf_t));
         zLluBufInit(LUvsbs[i], LUstruct);
     } /*minor for loop-3 for (int_t i = 0; i < numLA; ++i)*/
 
@@ -86,7 +86,7 @@ int_t zdenseTreeFactor(
     scuBufs_t *scuBufs,          // contains buffers for schur complement update
     packLUInfo_t*packLUInfo,
     msgs_t*msgs,
-    LUValSubBuf_t*LUvsb,
+    zLUValSubBuf_t* LUvsb,
     diagFactBufs_t *dFBuf,
     factStat_t *factStat,
     factNodelists_t  *fNlists,
@@ -243,7 +243,7 @@ int_t zsparseTreeFactor_ASYNC(
     scuBufs_t *scuBufs,          // contains buffers for schur complement update
     packLUInfo_t*packLUInfo,
     msgs_t**msgss,                  // size=num Look ahead
-    LUValSubBuf_t**LUvsbs,          // size=num Look ahead
+    zLUValSubBuf_t** LUvsbs,          // size=num Look ahead
     diagFactBufs_t **dFBufs,         // size maxEtree level
     factStat_t *factStat,
     factNodelists_t  *fNlists,
diff --git a/SRC/ztrfAux.c b/SRC/ztrfAux.c
index 04d4b8a0..d2408197 100644
--- a/SRC/ztrfAux.c
+++ b/SRC/ztrfAux.c
@@ -294,7 +294,7 @@ int_t zSchurComplementSetupGPU(
     int_t* myIperm, 
     int_t* iperm_c_supno, int_t*perm_c_supno,
     gEtreeInfo_t*   gEtreeInfo, factNodelists_t* fNlists,
-    scuBufs_t* scuBufs, LUValSubBuf_t* LUvsb,
+    scuBufs_t* scuBufs, zLUValSubBuf_t* LUvsb,
     gridinfo_t *grid, LUstruct_t *LUstruct,
     HyP_t* HyP)
 {
@@ -508,7 +508,7 @@ trf3Dpartition_t* zinitTrf3Dpartition(int_t nsupers,
 
 #if ( DEBUGlevel>=1 )
     int iam = grid3d->iam;
-    CHECK_MALLOC (iam, "Enter initTrf3Dpartition()");
+    CHECK_MALLOC (iam, "Enter zinitTrf3Dpartition()");
 #endif
     int_t* perm_c_supno = getPerm_c_supno(nsupers, options, LUstruct, grid);
     int_t* iperm_c_supno = getFactIperm(perm_c_supno, nsupers);
@@ -544,7 +544,7 @@ trf3Dpartition_t* zinitTrf3Dpartition(int_t nsupers,
     int_t* myNodeCount = getMyNodeCountsFr(maxLvl, myTreeIdxs, sForests);
     int_t** treePerm = getTreePermFr( myTreeIdxs, sForests, grid3d);
 
-    LUValSubBuf_t *LUvsb = SUPERLU_MALLOC(sizeof(LUValSubBuf_t));
+    zLUValSubBuf_t *LUvsb = SUPERLU_MALLOC(sizeof(zLUValSubBuf_t));
     zLluBufInit(LUvsb, LUstruct);
 
     int_t* supernode2treeMap = SUPERLU_MALLOC(nsupers*sizeof(int_t));
@@ -571,12 +571,52 @@ trf3Dpartition_t* zinitTrf3Dpartition(int_t nsupers,
     trf3Dpartition->LUvsb = LUvsb;
     trf3Dpartition->supernode2treeMap = supernode2treeMap;
 
+    SUPERLU_FREE(treeList);  // Sherry added
+
 #if ( DEBUGlevel>=1 )
-    CHECK_MALLOC (iam, "Exit initTrf3Dpartition()");
+    CHECK_MALLOC (iam, "Exit zinitTrf3Dpartition()");
 #endif
     return trf3Dpartition;
 } /* zinitTrf3Dpartition */
 
+/* Free memory allocated for trf3Dpartition structure. Sherry added this routine */
+void zDestroy_trf3Dpartition(trf3Dpartition_t *trf3Dpartition, gridinfo3d_t *grid3d)
+{
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (grid3d->iam, "Enter zDestroy_trf3Dpartition()");
+#endif
+    SUPERLU_FREE(trf3Dpartition->gEtreeInfo.setree);
+    SUPERLU_FREE(trf3Dpartition->gEtreeInfo.numChildLeft);
+    SUPERLU_FREE(trf3Dpartition->iperm_c_supno);
+    SUPERLU_FREE(trf3Dpartition->myNodeCount);
+    SUPERLU_FREE(trf3Dpartition->myTreeIdxs);
+    SUPERLU_FREE(trf3Dpartition->myZeroTrIdxs);
+
+    int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
+    int_t numForests = (1 << maxLvl) - 1;
+    sForest_t** sForests = trf3Dpartition->sForests;
+    for (int i = 0; i < numForests; ++i) {
+	if ( sForests[i] ) {
+	    SUPERLU_FREE(sForests[i]->nodeList);
+	    //SUPERLU_FREE(sForests[i]->treeHeads); // already freed
+	    SUPERLU_FREE((sForests[i]->topoInfo).eTreeTopLims);
+	    SUPERLU_FREE((sForests[i]->topoInfo).myIperm);
+	}
+    }
+    SUPERLU_FREE(trf3Dpartition->sForests); // double pointer 
+    SUPERLU_FREE(trf3Dpartition->treePerm); // double pointer pointing to sForests->nodeList
+    SUPERLU_FREE(trf3Dpartition->supernode2treeMap);
+
+    SUPERLU_FREE((trf3Dpartition->LUvsb)->Lsub_buf);
+    SUPERLU_FREE((trf3Dpartition->LUvsb)->Lval_buf);
+    SUPERLU_FREE((trf3Dpartition->LUvsb)->Usub_buf);
+    SUPERLU_FREE((trf3Dpartition->LUvsb)->Uval_buf);
+    SUPERLU_FREE(trf3Dpartition->LUvsb); // Sherry: check this ...
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (grid3d->iam, "Enter zDestroy_trf3Dpartition()");
+#endif
+}
+
 
 #if 0  //**** Sherry: following two routines are old, the new ones are in util.c
 int_t num_full_cols_U(int_t kk,  int_t **Ufstnz_br_ptr, int_t *xsup,
diff --git a/SRC/ztrfCommWrapper.c b/SRC/ztrfCommWrapper.c
index 07cc4a4e..f7027b7f 100644
--- a/SRC/ztrfCommWrapper.c
+++ b/SRC/ztrfCommWrapper.c
@@ -171,8 +171,20 @@ int_t zLPanelTrSolve( int_t k,   int_t* factored_L,
                 int_t off = i * BL;
                 // Sherry: int_t len = MY_MIN(BL, l - i * BL);
                 int_t len = SUPERLU_MIN(BL, l - i * BL);
+
+#if 1
+  #if defined (USE_VENDOR_BLAS)
+		ztrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha,
+			ublk_ptr, &ld_ujrow, &lusup[off], &nsupr,
+			1, 1, 1, 1);
+  #else
+		ztrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha,
+			ublk_ptr, &ld_ujrow, &lusup[off], &nsupr);
+  #endif
+#else
                 cblas_ztrsm (CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit,
                 len, nsupc, (void*) &alpha, ublk_ptr, ld_ujrow, &lusup[off], nsupr);
+#endif
             }
         }
     }
@@ -208,8 +220,19 @@ int_t zLPanelTrSolve( int_t k,   int_t* factored_L,
             int_t len = SUPERLU_MIN(BL, (l - i * BL));
             #pragma omp task
             {
+#if 1
+  #if defined (USE_VENDOR_BLAS)
+		ztrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha,
+			ublk_ptr, &ld_ujrow, &lusup[nsupc + off], &nsupr,
+			1, 1, 1, 1);
+  #else
+		ztrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha,
+			ublk_ptr, &ld_ujrow, &lusup[nsupc + off], &nsupr);
+  #endif
+#else
                 cblas_ztrsm (CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit,
                              len, nsupc, (void*) &alpha, ublk_ptr, ld_ujrow, &lusup[nsupc + off], nsupr);
+#endif
 
             }
         }

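A remark on the ztrsm_ calls introduced above (background, not part of the patch): when USE_VENDOR_BLAS is defined, SuperLU calls the Fortran BLAS directly, and most Fortran compilers pass a hidden length argument for every CHARACTER dummy after the declared argument list. That is what the trailing 1, 1, 1, 1 supplies: one length per single-character option argument. A schematic of the two prototype shapes, assuming the common int-by-value hidden-length convention (some ABIs use size_t instead):

    /* Schematic prototypes for calling Fortran dtrsm from C.
     * With a vendor (Fortran) BLAS, four hidden lengths follow the normal
     * arguments, one per CHARACTER dummy: side, uplo, transa, diag. */
    #ifdef USE_VENDOR_BLAS
    extern void dtrsm_(char *side, char *uplo, char *transa, char *diag,
                       int *m, int *n, double *alpha, double *a, int *lda,
                       double *b, int *ldb,
                       int side_len, int uplo_len, int transa_len, int diag_len);
    #define DTRSM(sd, up, ta, dg, m, n, al, a, lda, b, ldb) \
            dtrsm_(sd, up, ta, dg, m, n, al, a, lda, b, ldb, 1, 1, 1, 1)
    #else
    /* A C implementation (e.g. reference BLAS compiled without hidden
     * lengths) takes only the declared arguments. */
    extern void dtrsm_(char *side, char *uplo, char *transa, char *diag,
                       int *m, int *n, double *alpha, double *a, int *lda,
                       double *b, int *ldb);
    #define DTRSM dtrsm_
    #endif

Each trailing 1 simply states that the corresponding character argument is one character long; omitting the lengths with a vendor BLAS corrupts the call stack on some platforms, which is why the guarded variant exists.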
From 5e07490bd1844840264fc0e19cf538c79580c8e3 Mon Sep 17 00:00:00 2001
From: Sherry Li 
Date: Sat, 31 Aug 2019 14:55:38 -0700
Subject: [PATCH 011/147] Add EXAMPLE/pzdrive3d.c

---
 EXAMPLE/pzdrive3d.c | 306 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 306 insertions(+)
 create mode 100644 EXAMPLE/pzdrive3d.c

diff --git a/EXAMPLE/pzdrive3d.c b/EXAMPLE/pzdrive3d.c
new file mode 100644
index 00000000..9c1c5a02
--- /dev/null
+++ b/EXAMPLE/pzdrive3d.c
@@ -0,0 +1,306 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file
+ * \brief Driver program for PZGSSVX3D example
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 7.0.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology.
+ * May 10, 2019
+ * </pre>
+ */
+#include "superlu_zdefs.h"  
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * The driver program PZDRIVE3D.
+ *
+ * This example illustrates how to use PZGSSVX3D with the full
+ * (default) options to solve a linear system.
+ *
+ * Five basic steps are required:
+ *   1. Initialize the MPI environment and the SuperLU process grid
+ *   2. Set up the input matrix and the right-hand side
+ *   3. Set the options argument
+ *   4. Call pzgssvx
+ *   5. Release the process grid and terminate the MPI environment
+ *
+ * The program may be run by typing
+ *    mpiexec -np <p> pzdrive -r <proc rows> -c <proc columns> \
+ *                            -d <proc Z-dimension> <input file>
+ * </pre>
+ */
+
+int
+main (int argc, char *argv[])
+{
+    superlu_dist_options_t options;
+    SuperLUStat_t stat;
+    SuperMatrix A;  // only on process layer 0
+    ScalePermstruct_t ScalePermstruct;
+    LUstruct_t LUstruct;
+    SOLVEstruct_t SOLVEstruct;
+    gridinfo3d_t grid;
+    double *berr;
+    doublecomplex *b, *xtrue;
+    int_t m, n;
+    int nprow, npcol, npdep;
+    int iam, info, ldb, ldx, nrhs;
+    char **cpp, c, *suffix;
+    FILE *fp, *fopen ();
+    extern int cpp_defs ();
+    int ii, omp_mpi_level;
+
+    nprow = 1;  /* Default process rows.      */
+    npcol = 1;  /* Default process columns.   */
+    npdep = 1;  /* replication factor must be power of two */
+    nrhs = 1;   /* Number of right-hand side. */
+
+    /* ------------------------------------------------------------
+       INITIALIZE MPI ENVIRONMENT.
+       ------------------------------------------------------------ */
+    // MPI_Init (&argc, &argv);
+    int required = MPI_THREAD_MULTIPLE;
+    int provided;
+    MPI_Init_thread(&argc, &argv, required, &provided);
+    if (provided < required)
+    {
+        int rank;
+        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+        if (!rank) printf("The MPI library doesn't provide MPI_THREAD_MULTIPLE \n");
+    }
+
+    /* Parse command line argv[]. */
+    for (cpp = argv + 1; *cpp; ++cpp)
+    {
+        if (**cpp == '-')
+        {
+            c = *(*cpp + 1);
+            ++cpp;
+            switch (c)
+            {
+            case 'h':
+                printf ("Options:\n");
+                printf ("\t-r <int>: process rows (default %d)\n", nprow);
+                printf ("\t-c <int>: process columns (default %d)\n", npcol);
+                printf ("\t-d <int>: process Z-dimension (default %d)\n", npdep);
+                exit (0);
+                break;
+            case 'r':
+                nprow = atoi (*cpp);
+                break;
+            case 'c':
+                npcol = atoi (*cpp);
+                break;
+            case 'd':
+                npdep = atoi (*cpp);
+                break;
+            }
+        }
+        else
+        {   /* Last arg is considered a filename */
+            if (!(fp = fopen (*cpp, "r")))
+            {
+                ABORT ("File does not exist");
+            }
+            break;
+        }
+    }
+
+    /* ------------------------------------------------------------
+       INITIALIZE THE SUPERLU PROCESS GRID.
+       ------------------------------------------------------------ */
+    superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid);
+
+    if(grid.iam==0) {
+        MPI_Query_thread(&omp_mpi_level);
+        switch (omp_mpi_level) {
+        case MPI_THREAD_SINGLE:
+            printf("MPI_Query_thread with MPI_THREAD_SINGLE\n");
+            fflush(stdout);
+            break;
+        case MPI_THREAD_FUNNELED:
+            printf("MPI_Query_thread with MPI_THREAD_FUNNELED\n");
+            fflush(stdout);
+            break;
+        case MPI_THREAD_SERIALIZED:
+            printf("MPI_Query_thread with MPI_THREAD_SERIALIZED\n");
+            fflush(stdout);
+            break;
+        case MPI_THREAD_MULTIPLE:
+            printf("MPI_Query_thread with MPI_THREAD_MULTIPLE\n");
+            fflush(stdout);
+            break;
+        }
+    }
+
+    /* Bail out if I do not belong in the grid. */
+    iam = grid.iam;
+    if (iam >= nprow * npcol *npdep)
+        goto out;
+    if (!iam) {
+        int v_major, v_minor, v_bugfix;
+#ifdef __INTEL_COMPILER
+        printf("__INTEL_COMPILER is defined\n");
+#endif
+        printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__);
+
+        superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix);
+        printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix);
+
+        printf("Input matrix file:\t%s\n", *cpp);
+        printf("3D process grid: %d X %d X %d\n", nprow, npcol, npdep);
+        printf("2D Process grid: %d X %d\n", (int)grid.nprow, (int)grid.npcol);
+        fflush(stdout);
+    }
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (iam, "Enter main()");
+#endif
+
+    /* ------------------------------------------------------------
+       GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE.
+       ------------------------------------------------------------ */
+    for (ii = 0; ii<strlen(*cpp); ii++) {
+        if((*cpp)[ii]=='.'){
+            suffix = &((*cpp)[ii+1]);
+        }
+    }
+    zcreate_matrix_postfix3d(&A, nrhs, &b, &ldb,
+                             &xtrue, &ldx, fp, suffix, &(grid));
+
+    if (!(berr = doubleMalloc_dist (nrhs)))
+        ABORT ("Malloc fails for berr[].");
+
+    /* ------------------------------------------------------------
+       NOW WE SOLVE THE LINEAR SYSTEM.
+       ------------------------------------------------------------ */
+
+    /* Set the default input options:
+       ...
+       options->num_lookaheads = 10;
+       options->lookahead_etree = NO;
+       options->SymPattern = NO;
+       options.DiagInv = NO;
+     */
+    set_default_options_dist (&options);
+#if 0
+    options.RowPerm = NOROWPERM;
+    options.IterRefine = NOREFINE;
+    options.ColPerm = NATURAL;
+    options.Equil = NO;
+    options.ReplaceTinyPivot = NO;
+#endif
+
+    if (!iam) {
+        print_sp_ienv_dist(&options);
+        print_options_dist(&options);
+        fflush(stdout);
+    }
+
+    if ( grid.zscp.Iam == 0 )  // Process layer 0
+    {
+        m = A.nrow;
+        n = A.ncol;
+    }
+    // broadcast m, n to all the process layers;
+    MPI_Bcast( &m, 1, mpi_int_t, 0, grid.zscp.comm);
+    MPI_Bcast( &n, 1, mpi_int_t, 0, grid.zscp.comm);
+
+    /* Initialize ScalePermstruct and LUstruct. */
+    ScalePermstructInit (m, n, &ScalePermstruct);
+    LUstructInit (n, &LUstruct);
+
+    /* Initialize the statistics variables. */
+    PStatInit (&stat);
+
+    /* Call the linear equation solver. */
+    pzgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
+               &LUstruct, &SOLVEstruct, berr, &stat, &info);
+
+    /* Check the accuracy of the solution. */
+    if ( grid.zscp.Iam == 0 )  // Process layer 0
+        pzinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc,
+                          nrhs, b, ldb, xtrue, ldx, &(grid.grid2d));
+    fflush(stdout);
+
+    /* ------------------------------------------------------------
+       DEALLOCATE STORAGE.
+       ------------------------------------------------------------ */
+
+    if ( grid.zscp.Iam == 0 ) { // process layer 0
+
+        PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/
+
+        Destroy_CompRowLoc_Matrix_dist (&A);
+        Destroy_LU (n, &(grid.grid2d), &LUstruct);
+        SUPERLU_FREE (b);
+        SUPERLU_FREE (xtrue);
+        SUPERLU_FREE (berr);
+        if (options.SolveInitialized) {
+            zSolveFinalize (&options, &SOLVEstruct);
+        }
+    }
+
+    ScalePermstructFree (&ScalePermstruct);
+    LUstructFree (&LUstruct);
+    PStatFree (&stat);
+
+    /* ------------------------------------------------------------
+       RELEASE THE SUPERLU PROCESS GRID.
+       ------------------------------------------------------------ */
+out:
+    superlu_gridexit3d (&grid);
+
+    /* ------------------------------------------------------------
+       TERMINATES THE MPI EXECUTION ENVIRONMENT.
+       ------------------------------------------------------------ */
+    MPI_Finalize ();
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (iam, "Exit main()");
+#endif
+
+}
+
+
+int
+cpp_defs ()
+{
+    printf (".. CPP definitions:\n");
+#if ( PRNTlevel>=1 )
+    printf ("\tPRNTlevel = %d\n", PRNTlevel);
+#endif
+#if ( DEBUGlevel>=1 )
+    printf ("\tDEBUGlevel = %d\n", DEBUGlevel);
+#endif
+#if ( PROFlevel>=1 )
+    printf ("\tPROFlevel = %d\n", PROFlevel);
+#endif
+    printf ("....\n");
+    return 0;
+}
From c021ca5e402ceb2569119ccac9f1e7bdddd26221 Mon Sep 17 00:00:00 2001
From: Xiaoye Li
Date: Mon, 2 Sep 2019 10:44:56 -0700
Subject: [PATCH 012/147] in xscatter3d.c: for VENDOR BLAS, add string length
 "1" at the end of GEMM calls.

in pxgstrf3d.c:
  initialize stat->peak_buffer = 0;
  remove xinitDiagFactBufs(ldt, &dFBuf); dFBuf is not used.
  remove initCommRequests(&comReqs, grid); comReqs is not used.
  remove initMsgs(&msgs); msgs is not used.
  add calls freePackLUInfo(), etc. to deallocate memory.

in superlu_grid3d.c: add MPI_Comm_free(&superlu3d_comm);

in px3dcomm.c:: xGatherAllFactoredLU(): add SUPERLU_FREE(gNodeCount);
  SUPERLU_FREE(gNodeLists);

in xtrfAux.c:
  xinit3DLUstructForest(): add SUPERLU_FREE(gNodeCount); SUPERLU_FREE(gNodeLists);
  xinitTrf3Dpartition(): add SUPERLU_FREE(perm_c_supno);

in treeFactorization.c: add functions:
  freePackLUInfo(packLUInfo_t* packLUInfo);
  freeFactStat(factStat_t* factStat);
  freeFactNodelists(factNodelists_t *fNlists);
  freeMsgsArr(int_t numLA, msgs_t **msgss);
  freeCommRequestsArr();

in xtreeFactorization.c: add functions:
  xLluBufFreeArr();
  xfreeDiagFactBufsArr(mxLeafNode, dFBufs);
  xfreeScuBufs(scuBufs_t* scuBufs);

in supernodalForest.c :: calcTopInfoForest(): add SUPERLU_FREE(myTopOrder); // sherry added

in pxgssvx3d.c: add call to SCT_free();

in sec_structs.h: add free(line);
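Beyond the items listed above, this patch's sec_structs.c hunks (further below) convert _mm_malloc/_mm_free pairs in Init_HyP, Free_HyP, and SCT_free to SUPERLU_MALLOC/SUPERLU_FREE. The rule being respected is allocator pairing: a block obtained from the x86-specific aligned allocator _mm_malloc may only be released with _mm_free, so allocation and free sites must be switched together. A small illustrative sketch (not library code):

    #include <xmmintrin.h>  /* _mm_malloc/_mm_free on x86 compilers */
    #include <stdlib.h>

    int main(void)
    {
        /* 64-byte aligned block: must be released with _mm_free. */
        double *a = _mm_malloc(1024 * sizeof(double), 64);
        _mm_free(a);          /* correct pairing */

        double *b = malloc(1024 * sizeof(double));
        free(b);              /* correct pairing */

        /* Mixing the pairs, e.g. free(_mm_malloc(...)) or
         * _mm_free(malloc(...)), is undefined behavior -- which is why
         * the allocation and deallocation sites are converted together. */
        return 0;
    }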
---
 SRC/CMakeLists.txt         |  2 +-
 SRC/dgather.c              |  6 ++++
 SRC/dreadtriple_noheader.c |  2 +-
 SRC/dscatter3d.c           | 12 ++++++++
 SRC/dtreeFactorization.c   | 33 ++++++++++++++++++++++
 SRC/dtrfAux.c              | 16 ++++++++---
 SRC/pd3dcomm.c             |  7 ++++-
 SRC/pdgssvx3d.c            |  3 +-
 SRC/pdgstrf3d.c            | 31 ++++++++++++++++----
 SRC/pz3dcomm.c             |  5 +++-
 SRC/pzgssvx3d.c            |  1 +
 SRC/pzgstrf3d.c            |  5 +++-
 SRC/sec_structs.c          | 29 +++++++++++++++----
 SRC/superlu_ddefs.h        |  4 +++
 SRC/superlu_defs.h         |  7 +++++
 SRC/superlu_grid3d.c       |  5 ++--
 SRC/superlu_zdefs.h        |  4 +++
 SRC/supernodalForest.c     | 21 +++++++-------
 SRC/supernodal_etree.c     | 13 +++++++--
 SRC/treeFactorization.c    | 58 ++++++++++++++++++++++++++++++++++++++
 SRC/zreadtriple_noheader.c |  6 ++--
 SRC/zscatter3d.c           | 12 ++++++++
 SRC/ztreeFactorization.c   | 34 ++++++++++++++++++++++
 SRC/ztrfAux.c              | 16 ++++++++---
 24 files changed, 288 insertions(+), 44 deletions(-)

diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt
index 4b2c3f1f..b3dc1036 100644
--- a/SRC/CMakeLists.txt
+++ b/SRC/CMakeLists.txt
@@ -51,7 +51,7 @@ set(sources
   trfAux.c
   communication_aux.c
   treeFactorization.c
-  sec_structs.o
+  sec_structs.c
 )
 if (MSVC)
   list(APPEND sources wingetopt.c)
diff --git a/SRC/dgather.c b/SRC/dgather.c
index 0c666200..4bdc470f 100644
--- a/SRC/dgather.c
+++ b/SRC/dgather.c
@@ -378,6 +378,12 @@ void dRgather_U( int_t k, int_t jj0, int_t *usub, double *uval,
         HyP->Ublock_info_Phi[j].full_u_cols = HyP->Ublock_info_Phi[j ].ncols + HyP->Ublock_info_Phi[j - 1].full_u_cols;
     }
 
+    // if (!grid->iam) { // Sherry to remove
+    //    printf(".. k %d jj0 %d\t num_u_blks_phi %d\t mcb %d\n", k, jj0, HyP->num_u_blks_Phi,
+    //           (HyP->nsupers + grid->npcol - 1) / grid->npcol);
+    //fflush(stdout);
+    //}
+
     HyP->bigU_Phi = bigU;
     HyP->bigU_host = bigU + HyP->ldu_Phi * HyP->Ublock_info_Phi[HyP->num_u_blks_Phi - 1].full_u_cols;
diff --git a/SRC/dreadtriple_noheader.c b/SRC/dreadtriple_noheader.c
index c76a4fb7..fb12f1a7 100644
--- a/SRC/dreadtriple_noheader.c
+++ b/SRC/dreadtriple_noheader.c
@@ -87,7 +87,7 @@ dreadtriple_noheader(FILE *fp, int_t *m, int_t *n, int_t *nonz,
 #endif
 
     /* Second pass: read the actual matrix values */
-    printf("m %ld, n %ld, nonz %ld\n", *m, *n, *nonz);
+    printf("m %ld, n %ld, nonz %ld\n", (long int) *m, (long int) *n, (long int) *nonz);
     dallocateA_dist(*n, new_nonz, nzval, rowind, colptr); /* Allocate storage */
     a = *nzval;
     asub = *rowind;
diff --git a/SRC/dscatter3d.c b/SRC/dscatter3d.c
index e03f3822..bd887361 100644
--- a/SRC/dscatter3d.c
+++ b/SRC/dscatter3d.c
@@ -138,9 +138,15 @@ dblock_gemm_scatter( int_t lb, int_t j,
     /* calling DGEMM */
     // printf(" m %d n %d k %d ldu %d ldl %d st_col %d \n",temp_nbrow,ncols,ldu,ldl,st_col );
 #if 1
+  #if defined (USE_VENDOR_BLAS)
+    dgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
+           &L_mat[(knsupc - ldu)*ldl + cum_nrow], &ldl,
+           &U_mat[st_col * ldu], &ldu, &beta, tempv1, &temp_nbrow, 1, 1);
+  #else
     dgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
            &L_mat[(knsupc - ldu)*ldl + cum_nrow], &ldl,
           &U_mat[st_col * ldu], &ldu, &beta, tempv1, &temp_nbrow);
+  #endif
 #else
 
     // printf("%d %d %d %d %d %d %d %d\n", temp_nbrow, ncols, ldu, ldl,st_col,(knsupc - ldu)*ldl + cum_nrow,cum_nrow,st_col);
@@ -242,10 +248,16 @@ dblock_gemm_scatter_lock( int_t lb, int_t j,
 
     /* calling DGEMM */
 #if 1
+  #if defined (USE_VENDOR_BLAS)
     // printf(" m %d n %d k %d ldl %d st_col %d \n",temp_nbrow,ncols,ldu,ldl,st_col );
+    dgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
+           &L_mat[(knsupc - ldu)*ldl + cum_nrow], &ldl,
+           &U_mat[st_col * ldu], &ldu, &beta, tempv1, &temp_nbrow, 1, 1);
+  #else
     dgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
            &L_mat[(knsupc - ldu)*ldl + cum_nrow], &ldl,
           &U_mat[st_col * ldu], &ldu, &beta, tempv1, &temp_nbrow);
+  #endif
 #else
     cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
                 temp_nbrow, ncols, ldu, alpha,
diff --git a/SRC/dtreeFactorization.c b/SRC/dtreeFactorization.c
index 4e4d11a5..4ee0e931 100644
--- a/SRC/dtreeFactorization.c
+++ b/SRC/dtreeFactorization.c
@@ -50,6 +50,18 @@ diagFactBufs_t** dinitDiagFactBufsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* g
     return dFBufs;
 }
 
+// sherry added
+int dfreeDiagFactBufsArr(int_t mxLeafNode, diagFactBufs_t** dFBufs)
+{
+    for (int i = 0; i < mxLeafNode; ++i) {
+        SUPERLU_FREE(dFBufs[i]->BlockUFactor);
+        SUPERLU_FREE(dFBufs[i]->BlockLFactor);
+        SUPERLU_FREE(dFBufs[i]);
+    }
+    SUPERLU_FREE(dFBufs);
+    return 0;
+}
+
 dLUValSubBuf_t** dLluBufInitArr(int_t numLA, LUstruct_t *LUstruct)
 {
     dLUValSubBuf_t** LUvsbs = (dLUValSubBuf_t**) SUPERLU_MALLOC(numLA * sizeof(dLUValSubBuf_t*));
@@ -63,6 +75,18 @@ dLUValSubBuf_t** dLluBufInitArr(int_t numLA, LUstruct_t *LUstruct)
     return LUvsbs;
 }
 
+// sherry added
+int dLluBufFreeArr(int_t numLA, dLUValSubBuf_t **LUvsbs)
+{
+    for (int_t i = 0; i < numLA; ++i) {
+        SUPERLU_FREE(LUvsbs[i]->Lsub_buf);
+        SUPERLU_FREE(LUvsbs[i]->Lval_buf);
+        SUPERLU_FREE(LUvsbs[i]->Usub_buf);
+        SUPERLU_FREE(LUvsbs[i]->Uval_buf);
+        SUPERLU_FREE(LUvsbs[i]);
+    }
+    SUPERLU_FREE(LUvsbs);
+}
 
 int_t dinitScuBufs(int_t ldt, int_t num_threads, int_t nsupers,
                    scuBufs_t* scuBufs,
@@ -73,6 +97,15 @@ int_t dinitScuBufs(int_t ldt, int_t num_threads, int_t nsupers,
     scuBufs->bigU = dgetBigU(nsupers, grid, LUstruct);
     return 0;
 }
+
+// sherry added
+int dfreeScuBufs(scuBufs_t* scuBufs)
+{
+    SUPERLU_FREE(scuBufs->bigV);
+    SUPERLU_FREE(scuBufs->bigU);
+    return 0;
+}
+
 int_t dinitDiagFactBufs(int_t ldt, diagFactBufs_t* dFBuf)
 {
     dFBuf->BlockUFactor = doubleMalloc_dist(ldt * ldt); //DOUBLE_ALLOC( ldt * ldt);
diff --git a/SRC/dtrfAux.c b/SRC/dtrfAux.c
index 2a94955e..7fffdf13 100644
--- a/SRC/dtrfAux.c
+++ b/SRC/dtrfAux.c
@@ -52,7 +52,7 @@ void dinit3DLUstructForest( int_t* myTreeIdxs, int_t* myZeroTrIdxs,
     dinit3DLUstruct( myTreeIdxs, myZeroTrIdxs, gNodeCount, gNodeLists, LUstruct, grid3d);
 
-    SUPERLU_FREE(gNodeCount);
+    SUPERLU_FREE(gNodeCount); // sherry added
     SUPERLU_FREE(gNodeLists);
 }
 
@@ -538,7 +538,7 @@ trf3Dpartition_t* dinitTrf3Dpartition(int_t nsupers,
     int_t* myTreeIdxs = getGridTrees(grid3d);
     int_t* myZeroTrIdxs = getReplicatedTrees(grid3d);
     int_t* gNodeCount = getNodeCountsFr(maxLvl, sForests);
-    int_t** gNodeLists = getNodeListFr(maxLvl, sForests);
+    int_t** gNodeLists = getNodeListFr(maxLvl, sForests); // reuse NodeLists stored in sForests[]
 
     dinit3DLUstructForest(myTreeIdxs, myZeroTrIdxs, sForests, LUstruct, grid3d);
 
@@ -556,7 +556,7 @@ trf3Dpartition_t* dinitTrf3Dpartition(int_t nsupers,
         for (int_t nd = 0; nd < gNodeCount[Fr]; ++nd)
         {
             /* code */
-            supernode2treeMap[gNodeLists[Fr][nd]]=Fr;
+            supernode2treeMap[gNodeLists[Fr][nd]]=Fr;
         }
     }
 
@@ -572,7 +572,12 @@ trf3Dpartition_t* dinitTrf3Dpartition(int_t nsupers,
     trf3Dpartition->LUvsb = LUvsb;
     trf3Dpartition->supernode2treeMap = supernode2treeMap;
 
-    SUPERLU_FREE(treeList);  // Sherry added
+    // Sherry added
+    // Deallocate storage
+    SUPERLU_FREE(gNodeCount);
+    SUPERLU_FREE(gNodeLists);
+    SUPERLU_FREE(perm_c_supno);
+    free_treelist(nsupers, treeList);
 
 #if ( DEBUGlevel>=1 )
     CHECK_MALLOC (iam, "Exit dinitTrf3Dpartition()");
@@ -613,6 +618,9 @@ void dDestroy_trf3Dpartition(trf3Dpartition_t *trf3Dpartition, gridinfo3d_t *gri
     SUPERLU_FREE((trf3Dpartition->LUvsb)->Usub_buf);
     SUPERLU_FREE((trf3Dpartition->LUvsb)->Uval_buf);
     SUPERLU_FREE(trf3Dpartition->LUvsb); // Sherry: check this ...
+
+    SUPERLU_FREE(trf3Dpartition);
+
 #if ( DEBUGlevel>=1 )
     CHECK_MALLOC (grid3d->iam, "Enter dDestroy_trf3Dpartition()");
 #endif
diff --git a/SRC/pd3dcomm.c b/SRC/pd3dcomm.c
index ca54fb50..aefe3a26 100644
--- a/SRC/pd3dcomm.c
+++ b/SRC/pd3dcomm.c
@@ -873,6 +873,11 @@ int_t dgatherAllFactoredLU( trf3Dpartition_t* trf3Dpartition,
 	}
     } /* for ilvl ... */
 
+    SUPERLU_FREE(gNodeCount); // sherry added
+    //int numForests = (1 << maxLvl) - 1; // sherry added ????
+    //for (int i = 0; i < numForests; ++i) SUPERLU_FREE(gNodeLists[i]);
+    SUPERLU_FREE(gNodeLists);
+
     return 0;
-}
+} /* dgatherAllFactoredLU */
diff --git a/SRC/pdgssvx3d.c b/SRC/pdgssvx3d.c
index 91b05016..fc62058e 100644
--- a/SRC/pdgssvx3d.c
+++ b/SRC/pdgssvx3d.c
@@ -1234,7 +1234,8 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 	    /*reduces stat from all the layers*/
 #endif
 
-	    dDestroy_trf3Dpartition(trf3Dpartition, grid3d);
+	    dDestroy_trf3Dpartition(trf3Dpartition, grid3d);
+	    SCT_free(SCT);
 
 	} /* end if not Factored */
diff --git a/SRC/pdgstrf3d.c b/SRC/pdgstrf3d.c
index 953ceacc..35930233 100644
--- a/SRC/pdgstrf3d.c
+++ b/SRC/pdgstrf3d.c
@@ -134,8 +134,11 @@ int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
     CHECK_MALLOC (grid3d->iam, "Enter pdgstrf3d()");
 #endif
 
-    // initilize stat
+    // Initilize stat
     stat->ops[FACT] = 0;
+    stat->current_buffer = 0.0;
+    stat->peak_buffer = 0.0;
+    stat->gpu_buffer = 0.0;
 
     //if (!grid3d->zscp.Iam && !grid3d->iam) printf("Using NSUP=%d\n", (int) ldt);
     //getting Nsupers
@@ -145,14 +148,16 @@ int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
     int_t iam = grid->iam; // in 2D grid
     int num_threads = getNumThreads(grid3d->iam);
 
-    diagFactBufs_t dFBuf;
-    dinitDiagFactBufs(ldt, &dFBuf);
-
     factStat_t factStat;
     initFactStat(nsupers, &factStat);
 
+#if 0 // sherry: not used
+    diagFactBufs_t dFBuf;
+    dinitDiagFactBufs(ldt, &dFBuf);
+
     commRequests_t comReqs;
     initCommRequests(&comReqs, grid);
+#endif
 
     SCT->tStartup = SuperLU_timer_();
     packLUInfo_t packLUInfo;
@@ -161,8 +166,8 @@ int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
     scuBufs_t scuBufs;
     dinitScuBufs(ldt, num_threads, nsupers, &scuBufs, LUstruct, grid);
 
-    msgs_t msgs;
-    initMsgs(&msgs);
+    // msgs_t msgs;
+    // initMsgs(&msgs); // sherry: not used
 
     factNodelists_t  fNlists;
     initFactNodelists( ldt, num_threads, nsupers, &fNlists);
@@ -209,8 +214,10 @@ int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
     int_t Pr = grid->nprow;
     int_t mrb = (nsupers + Pr - 1) / Pr;
     int_t mcb = (nsupers + Pc - 1) / Pc;
+
     HyP_t *HyP = (HyP_t *) malloc(sizeof(HyP_t));
     Init_HyP(HyP, Llu, mcb, mrb);
+
     HyP->first_l_block_acc = first_l_block_acc;
     HyP->first_u_block_acc = first_u_block_acc;
     int_t bigu_size = getBigUSize(nsupers, grid, LUstruct);
@@ -328,6 +335,18 @@ int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
 
     reduceStat(FACT, stat, grid3d);
 
+    // sherry added
+    /* Deallocate factorization specific buffers */
+    freePackLUInfo(&packLUInfo);
+    dfreeScuBufs(&scuBufs);
+    freeFactStat(&factStat);
+    freeFactNodelists(&fNlists);
+    freeMsgsArr(numLA, msgss);
+    freeCommRequestsArr(SUPERLU_MAX(mxLeafNode, numLA), comReqss);
+    dLluBufFreeArr(numLA, LUvsbs);
+    dfreeDiagFactBufsArr(mxLeafNode, dFBufs);
+    Free_HyP(HyP);
+
 #if ( DEBUGlevel>=1 )
     CHECK_MALLOC (grid3d->iam, "Exit pdgstrf3d()");
 #endif
diff --git a/SRC/pz3dcomm.c b/SRC/pz3dcomm.c
index 22835f83..8e5c6dae 100644
--- a/SRC/pz3dcomm.c
+++ b/SRC/pz3dcomm.c
@@ -872,6 +872,9 @@ int_t zgatherAllFactoredLU( trf3Dpartition_t* trf3Dpartition,
 	}
     } /* for ilvl ... */
 
+    SUPERLU_FREE(gNodeCount); // sherry added
+    SUPERLU_FREE(gNodeLists);
+
     return 0;
-}
+} /* zgatherAllFactoredLU */
diff --git a/SRC/pzgssvx3d.c b/SRC/pzgssvx3d.c
index a740e2dd..6914ba0a 100644
--- a/SRC/pzgssvx3d.c
+++ b/SRC/pzgssvx3d.c
@@ -1236,6 +1236,7 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 #endif
 
 	    zDestroy_trf3Dpartition(trf3Dpartition, grid3d);
+	    SCT_free(SCT);
 
 	} /* end if not Factored */
diff --git a/SRC/pzgstrf3d.c b/SRC/pzgstrf3d.c
index be579f93..cd9bef61 100644
--- a/SRC/pzgstrf3d.c
+++ b/SRC/pzgstrf3d.c
@@ -133,8 +133,11 @@ int_t pzgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
     CHECK_MALLOC (grid3d->iam, "Enter pzgstrf3d()");
 #endif
 
-    // initilize stat
+    // Initilize stat
     stat->ops[FACT] = 0;
+    stat->current_buffer = 0.0;
+    stat->peak_buffer = 0.0;
+    stat->gpu_buffer = 0.0;
 
     //if (!grid3d->zscp.Iam && !grid3d->iam) printf("Using NSUP=%d\n", (int) ldt);
     //getting Nsupers
diff --git a/SRC/sec_structs.c b/SRC/sec_structs.c
index 162e20b2..55ed42e0 100644
--- a/SRC/sec_structs.c
+++ b/SRC/sec_structs.c
@@ -127,6 +127,8 @@ double getFreq(void)
         }
     }
 
+
+    free(line); // sherry added
     return 0;
 }
 
@@ -263,9 +265,16 @@ void SCT_free(SCT_t* SCT)
     free(SCT->Host_TheadScatterMOP);
     free(SCT->Host_TheadScatterTimer);
 #endif
+#if 0
     _mm_free(SCT->SchurCompUdtThreadTime);
     _mm_free(SCT->Local_Dgstrf2_Thread_tl);
     _mm_free(SCT->GetAijLock_Thread_tl);
+#else
+    SUPERLU_FREE(SCT->SchurCompUdtThreadTime);
+    SUPERLU_FREE(SCT->Local_Dgstrf2_Thread_tl);
+    SUPERLU_FREE(SCT->GetAijLock_Thread_tl);
+#endif
+
+    SUPERLU_FREE(SCT); // sherry added
 }
 
@@ -560,6 +569,7 @@ void Init_HyP(HyP_t* HyP, LocalLU_t *Llu, int_t mcb, int_t mrb )
     HyP->Ublock_info_Phi = (Ublock_info_t *) _mm_malloc(mcb * sizeof(Ublock_info_t), 64);
     HyP->Ublock_info = (Ublock_info_t *) _mm_malloc(mcb * sizeof(Ublock_info_t), 64);
     HyP->Lblock_dirty_bit = (int_t *) _mm_malloc(mcb * sizeof(int_t), 64);
+    HyP->Ublock_dirty_bit = (int_t *) _mm_malloc(mrb * sizeof(int_t), 64);
 #else
     HyP->lookAhead_info = (Remain_info_t *) SUPERLU_MALLOC((mrb) * sizeof(Remain_info_t));
     HyP->lookAhead_L_buff = (double *) doubleMalloc_dist((Llu->bufmax[1]));
@@ -568,6 +578,7 @@ void Init_HyP(HyP_t* HyP, LocalLU_t *Llu, int_t mcb, int_t mrb )
     HyP->Ublock_info_Phi = (Ublock_info_t *) SUPERLU_MALLOC(mcb * sizeof(Ublock_info_t));
     HyP->Ublock_info = (Ublock_info_t *) SUPERLU_MALLOC(mcb * sizeof(Ublock_info_t));
     HyP->Lblock_dirty_bit = (int_t *) intMalloc_dist(mcb);
+    HyP->Ublock_dirty_bit = (int_t *) intMalloc_dist(mrb);
 #endif
 
     for (int_t i = 0; i < mcb; ++i)
@@ -575,8 +586,6 @@ void Init_HyP(HyP_t* HyP, LocalLU_t *Llu, int_t mcb, int_t mrb )
         HyP->Lblock_dirty_bit[i] = -1;
     }
 
-    HyP->Ublock_dirty_bit = (int_t *) _mm_malloc(mrb * sizeof(int_t), 64);
-
     for (int_t i = 0; i < mrb; ++i)
     {
         HyP->Ublock_dirty_bit[i] = -1;
@@ -590,7 +599,7 @@ void Init_HyP(HyP_t* HyP, LocalLU_t *Llu, int_t mcb, int_t mrb )
 
 void Free_HyP(HyP_t* HyP)
 {
-
+#if 0
     _mm_free(HyP->lookAhead_info );
     _mm_free(HyP->Remain_info );
     _mm_free(HyP->lookAhead_L_buff );
@@ -599,9 +608,17 @@ void Free_HyP(HyP_t* HyP)
     _mm_free(HyP->Ublock_info_Phi );
     _mm_free(HyP->Lblock_dirty_bit );
     _mm_free(HyP->Ublock_dirty_bit );
-
-    // #endif
-
+#else
+    SUPERLU_FREE(HyP->lookAhead_info );
+    SUPERLU_FREE(HyP->Remain_info );
+    SUPERLU_FREE(HyP->lookAhead_L_buff );
+    SUPERLU_FREE(HyP->Remain_L_buff );
+    SUPERLU_FREE(HyP->Ublock_info );
+    SUPERLU_FREE(HyP->Ublock_info_Phi );
+    SUPERLU_FREE(HyP->Lblock_dirty_bit );
+    SUPERLU_FREE(HyP->Ublock_dirty_bit );
+#endif
+    SUPERLU_FREE(HyP);
 }
 
 int
 updateDirtyBit(int_t k0, HyP_t* HyP, gridinfo_t* grid)
diff --git a/SRC/superlu_ddefs.h b/SRC/superlu_ddefs.h
index de4ab045..b05e87ee 100644
--- a/SRC/superlu_ddefs.h
+++ b/SRC/superlu_ddefs.h
@@ -877,6 +877,7 @@ extern int_t dLPanelTrSolve(int_t k, int_t* factored_L, double* BlockUFactor,
     /* from trfAux.h */
 extern int_t getNsupers(int, LUstruct_t *);
 extern int_t initPackLUInfo(int_t nsupers, packLUInfo_t* packLUInfo);
+extern int freePackLUInfo(packLUInfo_t* packLUInfo);
 extern int_t dSchurComplementSetup(int_t k, int *msgcnt, Ublock_info_t*,
 				   Remain_info_t*, uPanelInfo_t *,
 				   lPanelInfo_t *, int_t*, int_t *, int_t *,
@@ -900,6 +901,7 @@ extern void getSCUweight(int_t nsupers, treeList_t* treeList, LUstruct_t *, grid
 extern int_t dLluBufInit(dLUValSubBuf_t*, LUstruct_t *);
 extern int_t dinitScuBufs(int_t ldt, int_t num_threads, int_t nsupers,
 			  scuBufs_t*, LUstruct_t*, gridinfo_t *);
+extern int dfreeScuBufs(scuBufs_t* scuBufs);
 
 // the generic tree factoring code
 extern int_t treeFactor(
@@ -980,7 +982,9 @@ extern int_t dsparseTreeFactor_ASYNC(
     int *info
 );
 extern dLUValSubBuf_t** dLluBufInitArr(int_t numLA, LUstruct_t *LUstruct);
+extern int dLluBufFreeArr(int_t numLA, dLUValSubBuf_t **LUvsbs);
 extern diagFactBufs_t** dinitDiagFactBufsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* grid);
+extern int dfreeDiagFactBufsArr(int_t mxLeafNode, diagFactBufs_t** dFBufs);
 extern int_t dinitDiagFactBufs(int_t ldt, diagFactBufs_t* dFBuf);
 extern int_t checkRecvUDiag(int_t k, commRequests_t *comReqs,
 			    gridinfo_t *grid, SCT_t *SCT);
diff --git a/SRC/superlu_defs.h b/SRC/superlu_defs.h
index 96d9d62a..fa84c143 100644
--- a/SRC/superlu_defs.h
+++ b/SRC/superlu_defs.h
@@ -1107,6 +1107,8 @@ extern void SCT_print3D(gridinfo3d_t *grid3d, SCT_t* SCT);
 extern void SCT_free(SCT_t*);
 
 extern treeList_t* setree2list(int_t nsuper, int_t* setree );
+extern int free_treelist(int_t nsuper, treeList_t* treeList);
+
 // int_t calcTreeWeight(int_t nsupers, treeList_t* treeList, int_t* xsup);
 extern int_t calcTreeWeight(int_t nsupers, int_t*setree, treeList_t* treeList, int_t* xsup);
 extern int_t getDescendList(int_t k, int_t*dlist, treeList_t* treeList);
@@ -1201,11 +1203,16 @@ extern int_t* getFactIperm(int_t*, int_t);
 
 extern int_t initCommRequests(commRequests_t* comReqs, gridinfo_t * grid);
 extern int_t initFactStat(int_t nsupers, factStat_t* factStat);
+extern int freeFactStat(factStat_t* factStat);
 extern int_t initFactNodelists(int_t, int_t, int_t, factNodelists_t*);
+extern int freeFactNodelists(factNodelists_t* fNlists);
 extern int_t initMsgs(msgs_t* msgs);
 extern int_t getNumLookAhead(superlu_dist_options_t*);
 
 extern commRequests_t** initCommRequestsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* grid);
+extern int freeCommRequestsArr(int_t mxLeafNode, commRequests_t** comReqss);
+
 extern msgs_t** initMsgsArr(int_t numLA);
+extern int freeMsgsArr(int_t numLA, msgs_t **msgss);
 
 extern int_t Trs2_InitUblock_info(int_t klst, int_t nb,  Ublock_info_t *,
                                   int_t *usub, Glu_persist_t *, SuperLUStat_t*);
diff --git a/SRC/superlu_grid3d.c b/SRC/superlu_grid3d.c
index e9ee4cbf..ce8a8259 100644
--- a/SRC/superlu_grid3d.c
+++ b/SRC/superlu_grid3d.c
@@ -268,8 +268,9 @@ void superlu_gridmap3d(
 #endif
 
     SUPERLU_FREE(pranks);
-    MPI_Group_free(&superlu_grp);
-    MPI_Group_free(&mpi_base_group);
+    MPI_Group_free( &superlu_grp );
+    MPI_Group_free( &mpi_base_group );
+    MPI_Comm_free( &superlu3d_comm ); // Sherry added
 }
 
 void superlu_gridexit3d(gridinfo3d_t *grid)
diff --git a/SRC/superlu_zdefs.h b/SRC/superlu_zdefs.h
index edd24a1a..5a49ca68 100644
--- a/SRC/superlu_zdefs.h
+++ b/SRC/superlu_zdefs.h
@@ -880,6 +880,7 @@ extern int_t zLPanelTrSolve(int_t k, int_t* factored_L, doublecomplex* BlockUFac
     /* from trfAux.h */
 extern int_t getNsupers(int, LUstruct_t *);
 extern int_t initPackLUInfo(int_t nsupers, packLUInfo_t* packLUInfo);
+extern int freePackLUInfo(packLUInfo_t* packLUInfo);
 extern int_t zSchurComplementSetup(int_t k, int *msgcnt, Ublock_info_t*,
 				   Remain_info_t*, uPanelInfo_t *,
 				   lPanelInfo_t *, int_t*, int_t *, int_t *,
@@ -903,6 +904,7 @@ extern void getSCUweight(int_t nsupers, treeList_t* treeList, LUstruct_t *, grid
 extern int_t zLluBufInit(zLUValSubBuf_t*, LUstruct_t *);
 extern int_t zinitScuBufs(int_t ldt, int_t num_threads, int_t nsupers,
 			  scuBufs_t*, LUstruct_t*, gridinfo_t *);
+extern int zfreeScuBufs(scuBufs_t* scuBufs);
 
 // the generic tree factoring code
 extern int_t treeFactor(
@@ -983,7 +985,9 @@ extern int_t zsparseTreeFactor_ASYNC(
     int *info
 );
 extern zLUValSubBuf_t** zLluBufInitArr(int_t numLA, LUstruct_t *LUstruct);
+extern int zLluBufFreeArr(int_t numLA, zLUValSubBuf_t **LUvsbs);
 extern diagFactBufs_t** zinitDiagFactBufsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* grid);
+extern int zfreeDiagFactBufsArr(int_t mxLeafNode, diagFactBufs_t** dFBufs);
 extern int_t zinitDiagFactBufs(int_t ldt, diagFactBufs_t* dFBuf);
 extern int_t checkRecvUDiag(int_t k, commRequests_t *comReqs,
 			    gridinfo_t *grid, SCT_t *SCT);
diff --git a/SRC/supernodalForest.c b/SRC/supernodalForest.c
index 547ac29c..ad1aff0f 100644
--- a/SRC/supernodalForest.c
+++ b/SRC/supernodalForest.c
@@ -180,6 +180,8 @@ int_t calcTopInfoForest(sForest_t *forest,
 	forest->nodeList = nodeListNew;
 	forest->topoInfo = ttI;
 
+	SUPERLU_FREE(myTopOrder); // sherry added
+
 	return 0;
 }
 
@@ -617,7 +619,6 @@ forestPartition_t iterativeFrPartitioning(rForest_t* rforest, int_t nsupers, int
 	int_t nTreeSet = rforest->ntrees;
 	int_t* treeHeads = rforest->treeHeads;
-
 	int_t nAnc = 0;
 	int_t* ancTreeCount = INT_T_ALLOC(MAX_TREE_ALLOWED);
 	int_t** ancNodeLists = SUPERLU_MALLOC(MAX_TREE_ALLOWED * sizeof(int_t*));
@@ -682,8 +683,6 @@ forestPartition_t iterativeFrPartitioning(rForest_t* rforest, int_t nsupers, int
 	// Create the Ancestor forest
 	sForest_t* aforest = createForestNew(nAnc, nsupers, ancTreeCount, ancNodeLists, setree, treeList);
-
-
 	// create the weight array;
 	double* sWeightArr = DOUBLE_ALLOC(nTreeSet);
 	for (int i = 0; i < nTreeSet ; ++i)
@@ -713,14 +712,14 @@ forestPartition_t iterativeFrPartitioning(rForest_t* rforest, int_t nsupers, int
 	frPr_t.S[0] = rforestS1;
 	frPr_t.S[1] = rforestS2;
 
+	SUPERLU_FREE(weightArr);
+	SUPERLU_FREE(treeSet);
+	SUPERLU_FREE(sWeightArr);
+
 	// free stuff
 	// int_t* ancTreeCount = INT_T_ALLOC(MAX_TREE_ALLOWED);
 	// int_t** ancNodeLists = SUPERLU_MALLOC(MAX_TREE_ALLOWED * sizeof(int_t*));
-	SUPERLU_FREE(weightArr);
-	SUPERLU_FREE (treeSet);
-	SUPERLU_FREE (sWeightArr);
-
 	for (int i = 0; i < nAnc ; ++i)
 	{
 		/* code */
@@ -731,7 +730,7 @@ forestPartition_t iterativeFrPartitioning(rForest_t* rforest, int_t nsupers, int
 	SUPERLU_FREE(ancNodeLists);
 
 	return frPr_t;
-}
+} /* iterativeFrPartitioning */
 
@@ -805,11 +804,13 @@ sForest_t** getGreedyLoadBalForests( int_t maxLvl, int_t nsupers, int_t * setre
 	}
 
-
 	if (maxLvl == 1)
 	{
 		/* code */
 		sForests[0] = r2sForest(&rForests[0], nsupers, setree, treeList);
+
+		freeRforest(&rForests[0]); // sherry added
+		SUPERLU_FREE(rForests);
 		return sForests;
 	}
@@ -850,8 +851,6 @@ sForest_t** getGreedyLoadBalForests( int_t maxLvl, int_t nsupers, int_t * setre
 
 	SUPERLU_FREE(rForests);
 
-
-
 	return sForests;
 }
diff --git a/SRC/supernodal_etree.c b/SRC/supernodal_etree.c
index 35981350..aceef5fa 100644
--- a/SRC/supernodal_etree.c
+++ b/SRC/supernodal_etree.c
@@ -99,6 +99,15 @@ treeList_t* setree2list(int_t nsuper, int_t* setree )
 
 } /* setree2list */
 
+// sherry added
+int free_treelist(int_t nsuper, treeList_t* treeList)
+{
+    for (int i = 0; i < nsuper + 1; ++i) {
+	SUPERLU_FREE(treeList[i].childrenList);
+    }
+    SUPERLU_FREE(treeList);
+}
+
 int_t estimateWeight(int_t nsupers, int_t*setree, treeList_t* treeList, int_t* xsup)
 {
     if (getenv("WF"))
@@ -984,8 +993,8 @@ void printEtree(int_t nsuper, int_t *setree, treeList_t* treeList)
 	{
 	    /* code */
 	    // fprintf(fp, "%lld -> %lld;\n",iperm[i],iperm[setree[i]]);
-	    fprintf(fp, " \"%lld|%lld\" -> \"%lld|%lld\";\n", i, treeList[i].depth,
-		    setree[i], treeList[setree[i]].depth);
+	    fprintf(fp, " \"%d|%ld\" -> \"%ld|%ld\";\n", i, treeList[i].depth,
+		    (long int) setree[i], (long int) treeList[setree[i]].depth);
 	}
 
 	/*end of the file */
diff --git a/SRC/treeFactorization.c b/SRC/treeFactorization.c
index 260f275e..e01d58ab 100644
--- a/SRC/treeFactorization.c
+++ b/SRC/treeFactorization.c
@@ -243,6 +243,23 @@ commRequests_t** initCommRequestsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* gr
     return comReqss;
 }
 
+// sherry added
+int freeCommRequestsArr(int_t mxLeafNode, commRequests_t** comReqss)
+{
+    for (int i = 0; i < mxLeafNode; ++i) {
+        SUPERLU_FREE(comReqss[i]->U_diag_blk_recv_req);
+        SUPERLU_FREE(comReqss[i]->L_diag_blk_recv_req);
+        SUPERLU_FREE(comReqss[i]->U_diag_blk_send_req);
+        SUPERLU_FREE(comReqss[i]->L_diag_blk_send_req);
+        SUPERLU_FREE(comReqss[i]->send_req);
+        SUPERLU_FREE(comReqss[i]->recv_req);
+        SUPERLU_FREE(comReqss[i]->send_requ);
+        SUPERLU_FREE(comReqss[i]->recv_requ);
+        SUPERLU_FREE(comReqss[i]);
+    }
+    SUPERLU_FREE(comReqss);
+}
+
 int_t initFactStat(int_t nsupers, factStat_t* factStat)
 {
     factStat->IrecvPlcd_D = intMalloc_dist( nsupers);
@@ -268,6 +285,18 @@ int_t initFactStat(int_t nsupers, factStat_t* factStat)
     return 0;
 }
 
+int freeFactStat(factStat_t* factStat)
+{
+    SUPERLU_FREE(factStat->IrecvPlcd_D);
+    SUPERLU_FREE(factStat->factored_D);
+    SUPERLU_FREE(factStat->factored_L);
+    SUPERLU_FREE(factStat->factored_U);
+    SUPERLU_FREE(factStat->factored);
+    SUPERLU_FREE(factStat->IbcastPanel_L);
+    SUPERLU_FREE(factStat->IbcastPanel_U);
+    SUPERLU_FREE(factStat->gpuLUreduced);
+}
+
 int_t initFactNodelists(int_t ldt, int_t num_threads, int_t nsupers,
                         factNodelists_t* fNlists)
 {
@@ -278,6 +307,15 @@ int_t initFactNodelists(int_t ldt, int_t num_threads, int_t nsupers,
     return 0;
 }
 
+int freeFactNodelists(factNodelists_t* fNlists)
+{
+    SUPERLU_FREE(fNlists->iperm_u);
+    SUPERLU_FREE(fNlists->perm_u);
+    SUPERLU_FREE(fNlists->indirect);
+    SUPERLU_FREE(fNlists->indirect2);
+    return 0;
+}
+
 int_t initMsgs(msgs_t* msgs)
 {
     msgs->msgcnt = (int *) SUPERLU_MALLOC(4 * sizeof(int));
@@ -297,6 +335,17 @@ msgs_t** initMsgsArr(int_t numLA)
     return msgss;
 }
 
+// sherry added
+int freeMsgsArr(int_t numLA, msgs_t **msgss)
+{
+    for (int i = 0; i < numLA; ++i) {
+        SUPERLU_FREE(msgss[i]->msgcnt);
+        SUPERLU_FREE(msgss[i]->msgcntU);
+        SUPERLU_FREE(msgss[i]);
+    }
+    SUPERLU_FREE(msgss);
+}
+
 int_t initPackLUInfo(int_t nsupers, packLUInfo_t* packLUInfo)
 {
     packLUInfo->Ublock_info = (Ublock_info_t*) SUPERLU_MALLOC (sizeof(Ublock_info_t) * nsupers);
@@ -306,6 +355,15 @@ int_t initPackLUInfo(int_t nsupers, packLUInfo_t* packLUInfo)
     return 0;
 }
 
+int freePackLUInfo(packLUInfo_t* packLUInfo)  // sherry added
+{
+    SUPERLU_FREE(packLUInfo->Ublock_info);
+    SUPERLU_FREE(packLUInfo->Remain_info);
SUPERLU_FREE(packLUInfo->uPanelInfo); + SUPERLU_FREE(packLUInfo->lPanelInfo); + return 0; +} + int_t getNumLookAhead(superlu_dist_options_t *options) { int_t numLA; diff --git a/SRC/zreadtriple_noheader.c b/SRC/zreadtriple_noheader.c index 81c6684a..160f3f6e 100644 --- a/SRC/zreadtriple_noheader.c +++ b/SRC/zreadtriple_noheader.c @@ -86,7 +86,7 @@ zreadtriple_noheader(FILE *fp, int_t *m, int_t *n, int_t *nonz, #endif /* Second pass: read the actual matrix values */ - printf("m %ld, n %ld, nonz %ld\n", *m, *n, *nonz); + printf("m %ld, n %ld, nonz %ld\n", (long int) *m, (long int) *n, (long int) *nonz); zallocateA_dist(*n, new_nonz, nzval, rowind, colptr); /* Allocate storage */ a = *nzval; asub = *rowind; @@ -117,8 +117,8 @@ zreadtriple_noheader(FILE *fp, int_t *m, int_t *n, int_t *nonz, if (row[nz] < 0 || row[nz] >= *m || col[nz] < 0 || col[nz] >= *n /*|| val[nz] == 0.*/) { - fprintf(stderr, "nz %d, (%d, %d) = %e out of bound, removed\n", - nz, row[nz], col[nz], val[nz]); + fprintf(stderr, "nz %d, (%d, %d) = (%e, %e) out of bound, removed\n", + nz, row[nz], col[nz], val[nz].r, val[nz].i); exit(-1); } else { ++xa[col[nz]]; diff --git a/SRC/zscatter3d.c b/SRC/zscatter3d.c index 356d6552..66bb5af8 100644 --- a/SRC/zscatter3d.c +++ b/SRC/zscatter3d.c @@ -136,9 +136,15 @@ zblock_gemm_scatter( int_t lb, int_t j, /* calling ZGEMM */ // printf(" m %d n %d k %d ldu %d ldl %d st_col %d \n",temp_nbrow,ncols,ldu,ldl,st_col ); #if 1 + #if defined (USE_VENDOR_BLAS) + zgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha, + &L_mat[(knsupc - ldu)*ldl + cum_nrow], &ldl, + &U_mat[st_col * ldu], &ldu, &beta, tempv1, &temp_nbrow, 1, 1); + #else zgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha, &L_mat[(knsupc - ldu)*ldl + cum_nrow], &ldl, &U_mat[st_col * ldu], &ldu, &beta, tempv1, &temp_nbrow); + #endif #else // printf("%d %d %d %d %d %d %d %d\n", temp_nbrow, ncols, ldu, ldl,st_col,(knsupc - ldu)*ldl + cum_nrow,cum_nrow,st_col); @@ -240,10 +246,16 @@ zblock_gemm_scatter_lock( int_t lb, int_t j, /* calling ZGEMM */ #if 1 + #if defined (USE_VENDOR_BLAS) // printf(" m %d n %d k %d ldl %d st_col %d \n",temp_nbrow,ncols,ldu,ldl,st_col ); + zgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha, + &L_mat[(knsupc - ldu)*ldl + cum_nrow], &ldl, + &U_mat[st_col * ldu], &ldu, &beta, tempv1, &temp_nbrow, 1, 1); + #else zgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha, &L_mat[(knsupc - ldu)*ldl + cum_nrow], &ldl, &U_mat[st_col * ldu], &ldu, &beta, tempv1, &temp_nbrow); + #endif #else cblas_zgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, temp_nbrow, ncols, ldu, alpha, diff --git a/SRC/ztreeFactorization.c b/SRC/ztreeFactorization.c index db34bb00..46d13de5 100644 --- a/SRC/ztreeFactorization.c +++ b/SRC/ztreeFactorization.c @@ -49,6 +49,18 @@ diagFactBufs_t** zinitDiagFactBufsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* g return dFBufs; } +// sherry added +int zfreeDiagFactBufsArr(int_t mxLeafNode, diagFactBufs_t** dFBufs) +{ + for (int i = 0; i < mxLeafNode; ++i) { + SUPERLU_FREE(dFBufs[i]->BlockUFactor); + SUPERLU_FREE(dFBufs[i]->BlockLFactor); + SUPERLU_FREE(dFBufs[i]); + } + SUPERLU_FREE(dFBufs); + return 0; +} + zLUValSubBuf_t** zLluBufInitArr(int_t numLA, LUstruct_t *LUstruct) { zLUValSubBuf_t** LUvsbs = (zLUValSubBuf_t**) SUPERLU_MALLOC(numLA * sizeof(zLUValSubBuf_t*)); @@ -62,6 +74,19 @@ zLUValSubBuf_t** zLluBufInitArr(int_t numLA, LUstruct_t *LUstruct) return LUvsbs; } +// sherry added +int zLluBufFreeArr(int_t numLA, zLUValSubBuf_t **LUvsbs) +{ + for (int_t i = 0; i < numLA; ++i) { + 
SUPERLU_FREE(LUvsbs[i]->Lsub_buf); + SUPERLU_FREE(LUvsbs[i]->Lval_buf); + SUPERLU_FREE(LUvsbs[i]->Usub_buf); + SUPERLU_FREE(LUvsbs[i]->Uval_buf); + SUPERLU_FREE(LUvsbs[i]); + } + SUPERLU_FREE(LUvsbs); +} + int_t zinitScuBufs(int_t ldt, int_t num_threads, int_t nsupers, scuBufs_t* scuBufs, @@ -72,6 +97,15 @@ int_t zinitScuBufs(int_t ldt, int_t num_threads, int_t nsupers, scuBufs->bigU = zgetBigU(nsupers, grid, LUstruct); return 0; } + +// sherry added +int zfreeScuBufs(scuBufs_t* scuBufs) +{ + SUPERLU_FREE(scuBufs->bigV); + SUPERLU_FREE(scuBufs->bigU); + return 0; +} + int_t zinitDiagFactBufs(int_t ldt, diagFactBufs_t* dFBuf) { dFBuf->BlockUFactor = doublecomplexMalloc_dist(ldt * ldt); //DOUBLE_ALLOC( ldt * ldt); diff --git a/SRC/ztrfAux.c b/SRC/ztrfAux.c index d2408197..c88f9fc6 100644 --- a/SRC/ztrfAux.c +++ b/SRC/ztrfAux.c @@ -51,7 +51,7 @@ void zinit3DLUstructForest( int_t* myTreeIdxs, int_t* myZeroTrIdxs, zinit3DLUstruct( myTreeIdxs, myZeroTrIdxs, gNodeCount, gNodeLists, LUstruct, grid3d); - SUPERLU_FREE(gNodeCount); + SUPERLU_FREE(gNodeCount); // sherry added SUPERLU_FREE(gNodeLists); } @@ -537,7 +537,7 @@ trf3Dpartition_t* zinitTrf3Dpartition(int_t nsupers, int_t* myTreeIdxs = getGridTrees(grid3d); int_t* myZeroTrIdxs = getReplicatedTrees(grid3d); int_t* gNodeCount = getNodeCountsFr(maxLvl, sForests); - int_t** gNodeLists = getNodeListFr(maxLvl, sForests); + int_t** gNodeLists = getNodeListFr(maxLvl, sForests); // reuse NodeLists stored in sForests[] zinit3DLUstructForest(myTreeIdxs, myZeroTrIdxs, sForests, LUstruct, grid3d); @@ -555,7 +555,7 @@ trf3Dpartition_t* zinitTrf3Dpartition(int_t nsupers, for (int_t nd = 0; nd < gNodeCount[Fr]; ++nd) { /* code */ - supernode2treeMap[gNodeLists[Fr][nd]]=Fr; + supernode2treeMap[gNodeLists[Fr][nd]]=Fr; } } @@ -571,7 +571,12 @@ trf3Dpartition_t* zinitTrf3Dpartition(int_t nsupers, trf3Dpartition->LUvsb = LUvsb; trf3Dpartition->supernode2treeMap = supernode2treeMap; - SUPERLU_FREE(treeList); // Sherry added + // Sherry added + // Deallocate storage + SUPERLU_FREE(gNodeCount); + SUPERLU_FREE(gNodeLists); + SUPERLU_FREE(perm_c_supno); + free_treelist(nsupers, treeList); #if ( DEBUGlevel>=1 ) CHECK_MALLOC (iam, "Exit zinitTrf3Dpartition()"); @@ -612,6 +617,9 @@ void zDestroy_trf3Dpartition(trf3Dpartition_t *trf3Dpartition, gridinfo3d_t *gri SUPERLU_FREE((trf3Dpartition->LUvsb)->Usub_buf); SUPERLU_FREE((trf3Dpartition->LUvsb)->Uval_buf); SUPERLU_FREE(trf3Dpartition->LUvsb); // Sherry: check this ... + + SUPERLU_FREE(trf3Dpartition); + #if ( DEBUGlevel>=1 ) CHECK_MALLOC (grid3d->iam, "Enter zDestroy_trf3Dpartition()"); #endif From 7c262383bb1cfc4e961a1c618706c200ff80e03a Mon Sep 17 00:00:00 2001 From: Xiaoye Li Date: Thu, 19 Sep 2019 16:25:43 -0700 Subject: [PATCH 013/147] Some updates to CMakeLists.txt, make.inc.in, and run_cmake_build.sh. Small changes to code. 
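A hedged aside, not part of this commit: the -DXSDK_INDEX_SIZE=64 switch
exercised in run_cmake_build.sh below controls the width of the int_t
index type. Simplified from superlu_dist_config.h and superlu_defs.h, the
mechanism is roughly:

    /* generated superlu_dist_config.h */
    #define XSDK_INDEX_SIZE 64
    #if (XSDK_INDEX_SIZE == 64)
    #define _LONGINT 1
    #endif

    /* superlu_defs.h */
    #ifdef _LONGINT
    typedef long long int int_t;   /* 64-bit indices */
    #else
    typedef int int_t;             /* default: 32-bit indices */
    #endif

This is also why hunks elsewhere in this series cast int_t values to
(long int) before printing them with %ld.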
--- CBLAS/CMakeLists.txt | 4 +-- CMakeLists.txt | 11 +++--- EXAMPLE/Makefile | 3 +- SRC/dtreeFactorization.c | 8 +++++ SRC/dtrfAux.c | 2 +- SRC/pdgssvx3d.c | 3 +- SRC/pdgstrf.c | 2 +- SRC/pdgstrf3d.c | 4 +++ SRC/superlu_dist_config.h | 16 +++++++-- SRC/superlu_dist_config.h.in | 1 - make.inc.in | 8 +++-- run_cmake_build.sh | 70 +++++++++++++++++++----------------- 12 files changed, 83 insertions(+), 49 deletions(-) diff --git a/CBLAS/CMakeLists.txt b/CBLAS/CMakeLists.txt index 3d259fef..b0be69b7 100644 --- a/CBLAS/CMakeLists.txt +++ b/CBLAS/CMakeLists.txt @@ -46,5 +46,5 @@ endif() add_library(blas ${sources} ${HEADERS}) -install(TARGETS blas DESTINATION ${CMAKE_INSTALL_PREFIX}/lib) -install(FILES ${headers} DESTINATION ${CMAKE_INSTALL_PREFIX}/include) +install(TARGETS blas DESTINATION "${INSTALL_LIB_DIR}") +install(FILES ${headers} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5f4786c1..b7b92edb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -81,6 +81,7 @@ if (BUILD_SHARED_LIBS) if (BUILD_STATIC_LIBS) message("-- SuperLU_DIST will also be built as a static library.") endif() + set(SHARED_C_FLAGS_EXPORT ${CMAKE_SHARED_LIBRARY_C_FLAGS}) else() message("-- SuperLU_DIST will be built as a static library.") set(PROJECT_NAME_LIB_EXPORT libsuperlu_dist.a) @@ -106,13 +107,12 @@ if (NOT CMAKE_INSTALL_PREFIX) set(CMAKE_INSTALL_PREFIX /usr/local) endif() - if(NOT MSVC) include(GNUInstallDirs) set(default_install_inc_dir ${CMAKE_INSTALL_INCLUDEDIR}) set(default_install_lib_dir ${CMAKE_INSTALL_LIBDIR}) set(default_install_bin_dir ${CMAKE_INSTALL_BINDIR}) -else() +else() # for Windows set(default_install_inc_dir "include") set(default_install_lib_dir "lib") set(default_install_bin_dir "bin") @@ -130,6 +130,8 @@ if(XSDK_INDEX_SIZE EQUAL 64) message("-- Using 64 bit integer for index size.") endif() set(CMAKE_C_FLAGS_RELEASE "-O3" CACHE STRING "") +message("cmake_c_flags_release '${CMAKE_C_FLAGS_RELEASE}'") +message("cmake_shared_library_c_flags '${CMAKE_SHARED_LIBRARY_C_FLAGS}'") set(CMAKE_CXX_FLAGS_RELEASE "-O3" CACHE STRING "") ###################################################################### @@ -189,9 +191,10 @@ else() add_subdirectory(CBLAS) set(BLAS_LIB blas) if (BUILD_SHARED_LIBS) # export to be referenced by downstream makefile - set(BLAS_LIB_EXPORT ${CMAKE_INSTALL_PREFIX}/CBLAS/libblas.so) +# set(BLAS_LIB_EXPORT ${CMAKE_INSTALL_PREFIX}/CBLAS/libblas.so) + set(BLAS_LIB_EXPORT ${CMAKE_INSTALL_PREFIX}/${INSTALL_LIB_DIR}/libblas.so) else() - set(BLAS_LIB_EXPORT ${CMAKE_INSTALL_PREFIX}/CBLAS/libblas.a) + set(BLAS_LIB_EXPORT ${CMAKE_INSTALL_PREFIX}/${INSTALL_LIB_DIR}/libblas.a) endif() endif() diff --git a/EXAMPLE/Makefile b/EXAMPLE/Makefile index 74a3e3c8..56a17b99 100644 --- a/EXAMPLE/Makefile +++ b/EXAMPLE/Makefile @@ -37,7 +37,8 @@ DEXM1 = pddrive1.o dcreate_matrix.o DEXM2 = pddrive2.o dcreate_matrix.o dcreate_matrix_perturbed.o DEXM3 = pddrive3.o dcreate_matrix.o DEXM4 = pddrive4.o dcreate_matrix.o -DEXM3D = pddrive3d.o dcreate_matrix.o +DEXM3D = pddrive3d.o dcreate_matrix.o +# dtrfAux.o dtreeFactorization.o treeFactorization.o pd3dcomm.o superlu_grid3d.o pdgstrf3d.o DEXMG = pddrive_ABglobal.o DEXMG1 = pddrive1_ABglobal.o DEXMG2 = pddrive2_ABglobal.o diff --git a/SRC/dtreeFactorization.c b/SRC/dtreeFactorization.c index 4ee0e931..6e8714b8 100644 --- a/SRC/dtreeFactorization.c +++ b/SRC/dtreeFactorization.c @@ -297,6 +297,10 @@ int_t dsparseTreeFactor_ASYNC( return 1; } +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (grid3d->iam, "Enter 
dsparseTreeFactor_ASYNC()"); +#endif + int_t *perm_c_supno = sforest->nodeList ; // list of nodes in the order of factorization treeTopoInfo_t* treeTopoInfo = &sforest->topoInfo; int_t* myIperm = treeTopoInfo->myIperm; @@ -327,6 +331,8 @@ int_t dsparseTreeFactor_ASYNC( sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid, options, thresh, LUstruct, stat, info, SCT, tag_ub); #else + + // printf("[1] (iam %d) k0 %d offset %d\n", grid3d->iam, k0, offset); fflush(stdout); dDiagFactIBCast(k, k, dFBufs[offset]->BlockUFactor, dFBufs[offset]->BlockLFactor, factStat->IrecvPlcd_D, comReqss[offset]->U_diag_blk_recv_req, @@ -356,6 +362,7 @@ int_t dsparseTreeFactor_ASYNC( sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid, options, thresh, LUstruct, stat, info, SCT, tag_ub); #else + // printf("[2] (iam %d) topoLvl %d\n", grid3d->iam, topoLvl); fflush(stdout); dDiagFactIBCast(k, k, dFBufs[offset]->BlockUFactor, dFBufs[offset]->BlockLFactor, factStat->IrecvPlcd_D, comReqss[offset]->U_diag_blk_recv_req, @@ -380,6 +387,7 @@ int_t dsparseTreeFactor_ASYNC( sLPanelUpdate(k, dFBufs[offset], factStat, comReqss[offset], grid, LUstruct, SCT); #else + printf("[3] (iam %d) k0 %d\n", grid3d->iam, k0); fflush(stdout); dLPanelUpdate(k, factStat->IrecvPlcd_D, factStat->factored_L, comReqss[offset]->U_diag_blk_recv_req, dFBufs[offset]->BlockUFactor, grid, LUstruct, SCT); diff --git a/SRC/dtrfAux.c b/SRC/dtrfAux.c index 7fffdf13..ed489438 100644 --- a/SRC/dtrfAux.c +++ b/SRC/dtrfAux.c @@ -577,7 +577,7 @@ trf3Dpartition_t* dinitTrf3Dpartition(int_t nsupers, SUPERLU_FREE(gNodeCount); SUPERLU_FREE(gNodeLists); SUPERLU_FREE(perm_c_supno); - free_treelist(nsupers, treeList); + // free_treelist(nsupers, treeList); #if ( DEBUGlevel>=1 ) CHECK_MALLOC (iam, "Exit dinitTrf3Dpartition()"); diff --git a/SRC/pdgssvx3d.c b/SRC/pdgssvx3d.c index fc62058e..1bbcc017 100644 --- a/SRC/pdgssvx3d.c +++ b/SRC/pdgssvx3d.c @@ -1190,10 +1190,11 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, /* send the LU structure to all the grids */ dp3dScatter(n, LUstruct, grid3d); + int_t nsupers = getNsupers(n, LUstruct); trf3Dpartition = dinitTrf3Dpartition(nsupers, options, LUstruct, grid3d); - SCT_t *SCT = (SCT_t *) SUPERLU_MALLOC(sizeof(SCT_t)); + SCT_t *SCT = (SCT_t *) SUPERLU_MALLOC(sizeof(SCT_t)); SCT_init(SCT); #if ( PRNTlevel>=1 ) diff --git a/SRC/pdgstrf.c b/SRC/pdgstrf.c index 2d055dcd..10a649fa 100644 --- a/SRC/pdgstrf.c +++ b/SRC/pdgstrf.c @@ -749,7 +749,7 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm, #pragma omp parallel for reduction(max :local_max_row_size) private(lk,lsub) #endif #endif - for (int i = mycol; i < nsupers; i += Pc) { /* grab my local columns */ + for (i = mycol; i < nsupers; i += Pc) { /* grab my local columns */ //int tpc = PCOL (i, grid); lk = LBj (i, grid); lsub = Lrowind_bc_ptr[lk]; diff --git a/SRC/pdgstrf3d.c b/SRC/pdgstrf3d.c index 35930233..9a514f5a 100644 --- a/SRC/pdgstrf3d.c +++ b/SRC/pdgstrf3d.c @@ -347,6 +347,10 @@ int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm, dfreeDiagFactBufsArr(mxLeafNode, dFBufs); Free_HyP(HyP); + if (!iam) { + printf ("exit pdgstrf3d()\n"); fflush(stdout); + } + #if ( DEBUGlevel>=1 ) CHECK_MALLOC (grid3d->iam, "Exit pdgstrf3d()"); #endif diff --git a/SRC/superlu_dist_config.h b/SRC/superlu_dist_config.h index ec3d9f9a..c3def71c 100644 --- a/SRC/superlu_dist_config.h +++ b/SRC/superlu_dist_config.h @@ -1,7 +1,17 @@ -/* #define XSDK_INDEX_SIZE 64 */ -/* #define SLU_HAVE_LAPACK TRUE */ 
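/* A hedged annotation, not part of this hunk: superlu_dist_config.h is
 * normally regenerated from superlu_dist_config.h.in by CMake, so hand
 * edits like the block below can be overwritten at configure time. The
 * enabled macros are consumed elsewhere roughly like this (illustration
 * only):
 *
 *     #ifdef HAVE_PARMETIS
 *         ... choose the ParMETIS ordering path ...
 *     #endif
 */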
+/* superlu_dist_config.h.in */ + +/* Enable parmetis */ #define HAVE_PARMETIS TRUE -/* #define HAVE_COMBBLAS TRUE */ + +/* Enable LAPACK */ +/* #undef SLU_HAVE_LAPACK */ + +/* Enable CombBLAS */ +/* #undef HAVE_COMBBLAS */ + +/* enable 64bit index mode */ +#define XSDK_INDEX_SIZE 64 + #if (XSDK_INDEX_SIZE == 64) #define _LONGINT 1 #endif diff --git a/SRC/superlu_dist_config.h.in b/SRC/superlu_dist_config.h.in index 625d52b0..b8529d6c 100644 --- a/SRC/superlu_dist_config.h.in +++ b/SRC/superlu_dist_config.h.in @@ -14,5 +14,4 @@ #if (XSDK_INDEX_SIZE == 64) #define _LONGINT 1 - #endif diff --git a/make.inc.in b/make.inc.in index 1c7f9147..490c5613 100644 --- a/make.inc.in +++ b/make.inc.in @@ -16,10 +16,12 @@ # The name of the libraries to be created/linked to # SuperLUroot = ${CMAKE_INSTALL_PREFIX} -DSUPERLULIB = $(SuperLUroot)/SRC/${PROJECT_NAME_LIB_EXPORT} +#DSUPERLULIB = $(SuperLUroot)/SRC/${PROJECT_NAME_LIB_EXPORT} +DSUPERLULIB = $(SuperLUroot)/@CMAKE_INSTALL_LIBDIR@/${PROJECT_NAME_LIB_EXPORT} INCLUDEDIR = $(SuperLUroot)/@CMAKE_INSTALL_INCLUDEDIR@ -LIBS = $(DSUPERLULIB) ${BLAS_LIB_EXPORT} +LIBS = $(DSUPERLULIB) ${BLAS_LIB_EXPORT} +BLASLIB = ${BLAS_LIB_EXPORT} LIBS += ${LAPACK_LIB_EXPORT} LIBS += ${PARMETIS_LIB_EXPORT} LIBS += ${COMBBLAS_LIB_EXPORT} @@ -33,7 +35,7 @@ ARCHFLAGS = cr RANLIB = @CMAKE_RANLIB@ CC = @CMAKE_C_COMPILER@ -CFLAGS = @CMAKE_C_FLAGS_RELEASE@ @CMAKE_C_FLAGS@ +CFLAGS = @CMAKE_C_FLAGS_RELEASE@ @CMAKE_C_FLAGS@ ##@CMAKE_SHARED_LIBRARY_C_FLAGS@ #CFLAGS += -D${DirDefs} # CFLAGS += @COMPILE_DEFINITIONS@ CXX = @CMAKE_CXX_COMPILER@ diff --git a/run_cmake_build.sh b/run_cmake_build.sh index f2cc638e..ffd3689a 100755 --- a/run_cmake_build.sh +++ b/run_cmake_build.sh @@ -1,10 +1,44 @@ #!/bin/bash -## if [ !$?NERSC_HOST ] -if [ -z $NERSC_HOST ] +THISHOST=`hostname -s` +echo "host: $THISHOST" + +if [ "$THISHOST" == "ssg1" ] then - echo "NERSC_HOST undefined" -elif [ "$NERSC_HOST" == "edison" ] +# rm -fr ssg1-build; mkdir ssg1-build; cd ssg1-build; +# export PARMETIS_ROOT=~/lib/static/parmetis-4.0.3 + rm -fr int64-build; mkdir int64-build; cd int64-build; + export PARMETIS_ROOT=~/lib/static/64-bit/parmetis-4.0.3 + export PARMETIS_BUILD_DIR=${PARMETIS_ROOT}/build/Linux-x86_64 + echo "ParMetis root: $PARMETIS_ROOT" + cmake .. \ + -DTPL_PARMETIS_INCLUDE_DIRS="${PARMETIS_ROOT}/include;${PARMETIS_ROOT}/metis/include" \ + -DTPL_PARMETIS_LIBRARIES="${PARMETIS_BUILD_DIR}/libparmetis/libparmetis.a;${PARMETIS_BUILD_DIR}/libmetis/libmetis.a" \ + -DTPL_COMBBLAS_INCLUDE_DIRS="${COMBBLAS_ROOT}/_install/include;${COMBBLAS_R\ +OOT}/Applications/BipartiteMatchings" \ + -DTPL_COMBBLAS_LIBRARIES="${COMBBLAS_BUILD_DIR}/libCombBLAS.a" \ + -DCMAKE_C_FLAGS="-std=c99 -O3 -g -DPRNTlevel=1 -DDEBUGlevel=0" \ + -DCMAKE_C_COMPILER=mpicc \ + -DCMAKE_CXX_COMPILER=mpicxx \ + -DTPL_ENABLE_BLASLIB=ON \ + -DTPL_ENABLE_COMBBLASLIB=OFF \ + -DTPL_ENABLE_LAPACKLIB=OFF \ + -DBUILD_SHARED_LIBS=OFF \ + -DXSDK_INDEX_SIZE=64 \ + -DCMAKE_INSTALL_PREFIX=. 
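# A hedged usage sketch, not part of the original script: once the cmake
# configure step above succeeds, a typical cycle from inside the fresh
# int64-build directory would be
#     make -j 8     # build the library and examples
#     ctest         # run the tests, cf. "make test" noted at the bottom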
+fi +# -DCMAKE_INSTALL_PREFIX=install +# -DTPL_ENABLE_PARMETISLIB=OFF +# -DCMAKE_CXX_FLAGS="-std=c++11" \ + +## if [ !$?NERSC_HOST ] + +# if [ -z $NERSC_HOST ] +# then +# echo "NERSC_HOST undefined" +# fi + +if [ "$NERSC_HOST" == "edison" ] then mkdir edison-build; cd edison-build; # export PARMETIS_ROOT=~/Edison/lib/parmetis-4.0.3_64 @@ -39,34 +73,6 @@ then # -DCMAKE_EXE_LINKER_FLAGS="-shared" \ fi -THISHOST=`hostname -s` -echo "host: $THISHOST" -if [ "$THISHOST" == "ssg1" ] -then - rm -fr ssg1-build; mkdir ssg1-build; cd ssg1-build; - export PARMETIS_ROOT=~/lib/static/parmetis-4.0.3 -# rm -fr int64-build; mkdir int64-build; cd int64-build; -# export PARMETIS_ROOT=~/lib/static/64-bit/parmetis-4.0.3 - export PARMETIS_BUILD_DIR=${PARMETIS_ROOT}/build/Linux-x86_64 - echo "ParMetis root: $PARMETIS_ROOT" - cmake .. \ - -DTPL_PARMETIS_INCLUDE_DIRS="${PARMETIS_ROOT}/include;${PARMETIS_ROOT}/metis/include" \ - -DTPL_PARMETIS_LIBRARIES="${PARMETIS_BUILD_DIR}/libparmetis/libparmetis.a;${PARMETIS_BUILD_DIR}/libmetis/libmetis.a" \ - -DTPL_COMBBLAS_INCLUDE_DIRS="${COMBBLAS_ROOT}/_install/include;${COMBBLAS_R\ -OOT}/Applications/BipartiteMatchings" \ - -DTPL_COMBBLAS_LIBRARIES="${COMBBLAS_BUILD_DIR}/libCombBLAS.a" \ - -DCMAKE_C_FLAGS="-std=c99 -g -DPRNTlevel=1 -DDEBUGlevel=0" \ - -DCMAKE_C_COMPILER=mpicc \ - -DCMAKE_CXX_COMPILER=mpicxx \ - -DTPL_ENABLE_BLASLIB=OFF \ - -DTPL_ENABLE_COMBBLASLIB=OFF \ - -DTPL_ENABLE_LAPACKLIB=OFF \ - -DBUILD_SHARED_LIBS=OFF \ - -DCMAKE_INSTALL_PREFIX=. -fi -# -DXSDK_INDEX_SIZE=64 \ -# -DTPL_ENABLE_PARMETISLIB=OFF -# -DCMAKE_CXX_FLAGS="-std=c++11" \ # make VERBOSE=1 # make test From f3b5e72fab9315c15485929d6f27f4dbb2af9157 Mon Sep 17 00:00:00 2001 From: Xiaoye Li Date: Mon, 7 Oct 2019 17:14:59 -0700 Subject: [PATCH 014/147] Fix memory leaks: - in Destroy-trf3Dpartition(): free sForests[i]; free treePerm - free L & U structures replicated on the non-zero process layers in EXAMPLE/pxdrive3d.c, call: DeAllocLlu_3d(n, &LUstruct, &grid); DeAllocGlu_3d(&LUstruct); - in supernodalForest.c: change INT_T_ALLOC to intMalloc_dist() change DOUBLE_ALLOC to doubleMalloc_dist() add several SUPERLU_FREE() --- EXAMPLE/pddrive3d.c | 14 +++-- EXAMPLE/pzdrive3d.c | 12 +++-- SRC/dgather.c | 17 +++--- SRC/dtreeFactorization.c | 27 +++++++--- SRC/dtrfAux.c | 11 ++-- SRC/pd3dcomm.c | 10 ++-- SRC/pdgstrf3d.c | 21 +++----- SRC/pz3dcomm.c | 8 +-- SRC/pzgstrf3d.c | 33 ++++++++---- SRC/sec_structs.c | 15 ++++-- SRC/superlu_ddefs.h | 6 ++- SRC/superlu_defs.h | 6 ++- SRC/superlu_dist_config.h | 2 +- SRC/superlu_enum_consts.h | 3 +- SRC/superlu_zdefs.h | 6 ++- SRC/supernodalForest.c | 108 ++++++++++++++++++++++---------------- SRC/supernodal_etree.c | 59 ++++++++++++++------- SRC/util.c | 94 ++++++++++++++++++++++++--------- SRC/ztreeFactorization.c | 17 +++++- SRC/ztrfAux.c | 9 ++-- make.inc.in | 3 +- run_cmake_build.sh | 14 ++--- 22 files changed, 319 insertions(+), 176 deletions(-) diff --git a/EXAMPLE/pddrive3d.c b/EXAMPLE/pddrive3d.c index 619fe98b..a3c55229 100644 --- a/EXAMPLE/pddrive3d.c +++ b/EXAMPLE/pddrive3d.c @@ -40,8 +40,11 @@ at the top-level directory. * 5. Release the process grid and terminate the MPI environment * * The program may be run by typing - * mpiexec -np
<p> pddrive -r <proc rows> -c <proc columns> \
- *                     -d <proc layers>
+ *     mpiexec -np <p> pddrive3d -r <proc rows> -c <proc columns> \
+ *                     -d <proc layers>
+ * NOTE: total number of processes p = r * c * d
+ *       d must be a power-of-two, e.g., 1, 2, 4, ...
+ *
 *
*/ @@ -261,17 +264,18 @@ main (int argc, char *argv[]) Destroy_LU (n, &(grid.grid2d), &LUstruct); SUPERLU_FREE (b); SUPERLU_FREE (xtrue); - SUPERLU_FREE (berr); if (options.SolveInitialized) { dSolveFinalize (&options, &SOLVEstruct); } + } else { // Process layers not equal 0 + DeAllocLlu_3d(n, &LUstruct, &grid); + DeAllocGlu_3d(&LUstruct); } + SUPERLU_FREE (berr); ScalePermstructFree (&ScalePermstruct); LUstructFree (&LUstruct); - PStatFree (&stat); - printf("(%d) after StatFree\n", iam); /* ------------------------------------------------------------ RELEASE THE SUPERLU PROCESS GRID. diff --git a/EXAMPLE/pzdrive3d.c b/EXAMPLE/pzdrive3d.c index 9c1c5a02..2ab20daa 100644 --- a/EXAMPLE/pzdrive3d.c +++ b/EXAMPLE/pzdrive3d.c @@ -39,8 +39,11 @@ at the top-level directory. * 5. Release the process grid and terminate the MPI environment * * The program may be run by typing - * mpiexec -np
<p> pzdrive -r <proc rows> -c <proc columns> \
- *                     -d <proc layers>
+ *     mpiexec -np <p> pzdrive3d -r <proc rows> -c <proc columns> \
+ *                     -d <proc layers>
+ * NOTE: total number of processes p = r * c * d
+ *       d must be a power-of-two, e.g., 1, 2, 4, ...
+ *
 *
*/ @@ -260,12 +263,15 @@ main (int argc, char *argv[]) Destroy_LU (n, &(grid.grid2d), &LUstruct); SUPERLU_FREE (b); SUPERLU_FREE (xtrue); - SUPERLU_FREE (berr); if (options.SolveInitialized) { zSolveFinalize (&options, &SOLVEstruct); } + } else { // Process layers not equal 0 + DeAllocLlu_3d(n, &LUstruct, &grid); + DeAllocGlu_3d(&LUstruct); } + SUPERLU_FREE (berr); ScalePermstructFree (&ScalePermstruct); LUstructFree (&LUstruct); PStatFree (&stat); diff --git a/SRC/dgather.c b/SRC/dgather.c index 4bdc470f..887963c0 100644 --- a/SRC/dgather.c +++ b/SRC/dgather.c @@ -378,14 +378,19 @@ void dRgather_U( int_t k, int_t jj0, int_t *usub, double *uval, HyP->Ublock_info_Phi[j].full_u_cols = HyP->Ublock_info_Phi[j ].ncols + HyP->Ublock_info_Phi[j - 1].full_u_cols; } - // if (!grid->iam) { // Sherry to remove - // printf(".. k %d jj0 %d\t num_u_blks_phi %d\t mcb %d\n", k, jj0, HyP->num_u_blks_Phi, - // (HyP->nsupers + grid->npcol - 1) / grid->npcol); - //fflush(stdout); - //} +#if 0 + if (!grid->iam) { // Sherry to remove + printf(".. k %d jj0 %d\t num_u_blks_phi %d\t mcb %d\n", k, jj0, HyP->num_u_blks_Phi, + (HyP->nsupers + grid->npcol - 1) / grid->npcol); + fflush(stdout); + } +#endif HyP->bigU_Phi = bigU; - HyP->bigU_host = bigU + HyP->ldu_Phi * HyP->Ublock_info_Phi[HyP->num_u_blks_Phi - 1].full_u_cols; + if ( HyP->num_u_blks_Phi == 0 ) + HyP->bigU_host = bigU; + else + HyP->bigU_host = bigU + HyP->ldu_Phi * HyP->Ublock_info_Phi[HyP->num_u_blks_Phi - 1].full_u_cols; dgather_u(HyP->num_u_blks, HyP->Ublock_info, usub, uval, HyP->bigU_host, HyP->ldu, xsup, klst ); diff --git a/SRC/dtreeFactorization.c b/SRC/dtreeFactorization.c index 6e8714b8..36d34ce8 100644 --- a/SRC/dtreeFactorization.c +++ b/SRC/dtreeFactorization.c @@ -37,7 +37,12 @@ int_t dLluBufInit(dLUValSubBuf_t* LUvsb, LUstruct_t *LUstruct) diagFactBufs_t** dinitDiagFactBufsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* grid) { diagFactBufs_t** dFBufs; - dFBufs = (diagFactBufs_t** ) SUPERLU_MALLOC(mxLeafNode * sizeof(diagFactBufs_t*)); + + /* Sherry fix: + * mxLeafNode can be 0 for the replicated layers of the processes ?? */ + if ( mxLeafNode ) dFBufs = (diagFactBufs_t** ) + SUPERLU_MALLOC(mxLeafNode * sizeof(diagFactBufs_t*)); + for (int i = 0; i < mxLeafNode; ++i) { /* code */ @@ -58,7 +63,11 @@ int dfreeDiagFactBufsArr(int_t mxLeafNode, diagFactBufs_t** dFBufs) SUPERLU_FREE(dFBufs[i]->BlockLFactor); SUPERLU_FREE(dFBufs[i]); } - SUPERLU_FREE(dFBufs); + + /* Sherry fix: + * mxLeafNode can be 0 for the replicated layers of the processes ?? */ + if ( mxLeafNode ) SUPERLU_FREE(dFBufs); + return 0; } @@ -88,6 +97,7 @@ int dLluBufFreeArr(int_t numLA, dLUValSubBuf_t **LUvsbs) SUPERLU_FREE(LUvsbs); } + int_t dinitScuBufs(int_t ldt, int_t num_threads, int_t nsupers, scuBufs_t* scuBufs, LUstruct_t* LUstruct, @@ -269,7 +279,7 @@ int_t ddenseTreeFactor( } /* ddenseTreeFactor */ /* - * 2D factorization at individual subtree. + * 2D factorization at individual subtree. 
-- CPU only */ int_t dsparseTreeFactor_ASYNC( sForest_t* sforest, @@ -300,7 +310,7 @@ int_t dsparseTreeFactor_ASYNC( #if ( DEBUGlevel>=1 ) CHECK_MALLOC (grid3d->iam, "Enter dsparseTreeFactor_ASYNC()"); #endif - + int_t *perm_c_supno = sforest->nodeList ; // list of nodes in the order of factorization treeTopoInfo_t* treeTopoInfo = &sforest->topoInfo; int_t* myIperm = treeTopoInfo->myIperm; @@ -331,8 +341,6 @@ int_t dsparseTreeFactor_ASYNC( sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid, options, thresh, LUstruct, stat, info, SCT, tag_ub); #else - - // printf("[1] (iam %d) k0 %d offset %d\n", grid3d->iam, k0, offset); fflush(stdout); dDiagFactIBCast(k, k, dFBufs[offset]->BlockUFactor, dFBufs[offset]->BlockLFactor, factStat->IrecvPlcd_D, comReqss[offset]->U_diag_blk_recv_req, @@ -362,7 +370,6 @@ int_t dsparseTreeFactor_ASYNC( sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid, options, thresh, LUstruct, stat, info, SCT, tag_ub); #else - // printf("[2] (iam %d) topoLvl %d\n", grid3d->iam, topoLvl); fflush(stdout); dDiagFactIBCast(k, k, dFBufs[offset]->BlockUFactor, dFBufs[offset]->BlockLFactor, factStat->IrecvPlcd_D, comReqss[offset]->U_diag_blk_recv_req, @@ -387,7 +394,6 @@ int_t dsparseTreeFactor_ASYNC( sLPanelUpdate(k, dFBufs[offset], factStat, comReqss[offset], grid, LUstruct, SCT); #else - printf("[3] (iam %d) k0 %d\n", grid3d->iam, k0); fflush(stdout); dLPanelUpdate(k, factStat->IrecvPlcd_D, factStat->factored_L, comReqss[offset]->U_diag_blk_recv_req, dFBufs[offset]->BlockUFactor, grid, LUstruct, SCT); @@ -725,5 +731,10 @@ int_t dsparseTreeFactor_ASYNC( }/*for main loop (int_t k0 = 0; k0 < gNodeCount[tree]; ++k0)*/ } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (grid3d->iam, "Exit dsparseTreeFactor_ASYNC()"); +#endif + return 0; } /* dsparseTreeFactor_ASYNC */ diff --git a/SRC/dtrfAux.c b/SRC/dtrfAux.c index ed489438..e6c68ef4 100644 --- a/SRC/dtrfAux.c +++ b/SRC/dtrfAux.c @@ -577,7 +577,7 @@ trf3Dpartition_t* dinitTrf3Dpartition(int_t nsupers, SUPERLU_FREE(gNodeCount); SUPERLU_FREE(gNodeLists); SUPERLU_FREE(perm_c_supno); - // free_treelist(nsupers, treeList); + free_treelist(nsupers, treeList); #if ( DEBUGlevel>=1 ) CHECK_MALLOC (iam, "Exit dinitTrf3Dpartition()"); @@ -588,6 +588,7 @@ trf3Dpartition_t* dinitTrf3Dpartition(int_t nsupers, /* Free memory allocated for trf3Dpartition structure. 
Sherry added this routine */ void dDestroy_trf3Dpartition(trf3Dpartition_t *trf3Dpartition, gridinfo3d_t *grid3d) { + int i; #if ( DEBUGlevel>=1 ) CHECK_MALLOC (grid3d->iam, "Enter dDestroy_trf3Dpartition()"); #endif @@ -597,20 +598,20 @@ void dDestroy_trf3Dpartition(trf3Dpartition_t *trf3Dpartition, gridinfo3d_t *gri SUPERLU_FREE(trf3Dpartition->myNodeCount); SUPERLU_FREE(trf3Dpartition->myTreeIdxs); SUPERLU_FREE(trf3Dpartition->myZeroTrIdxs); + SUPERLU_FREE(trf3Dpartition->treePerm); // double pointer pointing to sForests->nodeList int_t maxLvl = log2i(grid3d->zscp.Np) + 1; int_t numForests = (1 << maxLvl) - 1; sForest_t** sForests = trf3Dpartition->sForests; - for (int i = 0; i < numForests; ++i) { + for (i = 0; i < numForests; ++i) { if ( sForests[i] ) { SUPERLU_FREE(sForests[i]->nodeList); - //SUPERLU_FREE(sForests[i]->treeHeads); // already freed SUPERLU_FREE((sForests[i]->topoInfo).eTreeTopLims); SUPERLU_FREE((sForests[i]->topoInfo).myIperm); + SUPERLU_FREE(sForests[i]); // Sherry added } } SUPERLU_FREE(trf3Dpartition->sForests); // double pointer - SUPERLU_FREE(trf3Dpartition->treePerm); // double pointer pointing to sForests->nodeList SUPERLU_FREE(trf3Dpartition->supernode2treeMap); SUPERLU_FREE((trf3Dpartition->LUvsb)->Lsub_buf); @@ -622,7 +623,7 @@ void dDestroy_trf3Dpartition(trf3Dpartition_t *trf3Dpartition, gridinfo3d_t *gri SUPERLU_FREE(trf3Dpartition); #if ( DEBUGlevel>=1 ) - CHECK_MALLOC (grid3d->iam, "Enter dDestroy_trf3Dpartition()"); + CHECK_MALLOC (grid3d->iam, "Exit dDestroy_trf3Dpartition()"); #endif } diff --git a/SRC/pd3dcomm.c b/SRC/pd3dcomm.c index aefe3a26..e1e03d76 100644 --- a/SRC/pd3dcomm.c +++ b/SRC/pd3dcomm.c @@ -347,8 +347,8 @@ int_t dp3dScatter(int_t n, LUstruct_t * LUstruct, gridinfo3d_t* grid3d) MPI_Bcast( &nsupers, 1, mpi_int_t, 0, grid3d->zscp.comm); /* Scatter and alloc Glu_persist */ - if (grid3d->zscp.Iam) - AllocGlu(n, nsupers, LUstruct, grid3d); + if ( grid3d->zscp.Iam ) // all other process layers not equal 0 + AllocGlu_3d(n, nsupers, LUstruct); /* broadcast Glu_persist */ int_t *xsup = LUstruct->Glu_persist->xsup; @@ -359,7 +359,7 @@ int_t dp3dScatter(int_t n, LUstruct_t * LUstruct, gridinfo3d_t* grid3d) /* now broadcast localLu_t */ /* first allocating space for it */ - if (grid3d->zscp.Iam) + if ( grid3d->zscp.Iam ) // all other process layers not equal 0 dAllocLlu(nsupers, LUstruct, grid3d); LocalLU_t *Llu = LUstruct->Llu; @@ -396,7 +396,7 @@ int_t dp3dScatter(int_t n, LUstruct_t * LUstruct, gridinfo3d_t* grid3d) mpiMallocLUStruct(nsupers, LUstruct, grid3d); #endif return 0; -} +} /* dp3dScatter */ int_t dscatter3dUPanels(int_t nsupers, @@ -874,8 +874,6 @@ int_t dgatherAllFactoredLU( trf3Dpartition_t* trf3Dpartition, } /* for ilvl ... */ SUPERLU_FREE(gNodeCount); // sherry added - //int numForests = (1 << maxLvl) - 1; // sherry added ???? 
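/* A hedged reading of the two deletions here: gNodeLists[i] aliases
 * sForests[i]->nodeList (getNodeListFr() is annotated "reuse NodeLists
 * stored in sForests[]" in the trfAux hunks), so this routine frees only
 * the outer gNodeCount/gNodeLists arrays; the per-forest node lists are
 * released later by {d,z}Destroy_trf3Dpartition(). */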
- //for (int i = 0; i < numForests; ++i) SUPERLU_FREE(gNodeLists[i]); SUPERLU_FREE(gNodeLists); return 0; diff --git a/SRC/pdgstrf3d.c b/SRC/pdgstrf3d.c index 9a514f5a..f60c13e2 100644 --- a/SRC/pdgstrf3d.c +++ b/SRC/pdgstrf3d.c @@ -155,8 +155,11 @@ int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm, diagFactBufs_t dFBuf; dinitDiagFactBufs(ldt, &dFBuf); - commRequests_t comReqs; + commRequests_t comReqs; initCommRequests(&comReqs, grid); + + msgs_t msgs; + initMsgs(&msgs); #endif SCT->tStartup = SuperLU_timer_(); @@ -166,9 +169,6 @@ int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm, scuBufs_t scuBufs; dinitScuBufs(ldt, num_threads, nsupers, &scuBufs, LUstruct, grid); - // msgs_t msgs; - // initMsgs(&msgs); // sherry: not used - factNodelists_t fNlists; initFactNodelists( ldt, num_threads, nsupers, &fNlists); @@ -203,8 +203,7 @@ int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm, mxLeafNode = sForests[myTreeIdxs[ilvl]]->topoInfo.eTreeTopLims[1]; } diagFactBufs_t** dFBufs = dinitDiagFactBufsArr(mxLeafNode, ldt, grid); - commRequests_t** comReqss = initCommRequestsArr(SUPERLU_MAX(mxLeafNode, numLA), - ldt, grid); + commRequests_t** comReqss = initCommRequestsArr(SUPERLU_MAX(mxLeafNode, numLA), ldt, grid); /* Setting up GPU related data structures */ @@ -214,10 +213,8 @@ int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm, int_t Pr = grid->nprow; int_t mrb = (nsupers + Pr - 1) / Pr; int_t mcb = (nsupers + Pc - 1) / Pc; - - HyP_t *HyP = (HyP_t *) malloc(sizeof(HyP_t)); + HyP_t *HyP = (HyP_t *) SUPERLU_MALLOC(sizeof(HyP_t)); Init_HyP(HyP, Llu, mcb, mrb); - HyP->first_l_block_acc = first_l_block_acc; HyP->first_u_block_acc = first_u_block_acc; int_t bigu_size = getBigUSize(nsupers, grid, LUstruct); @@ -337,7 +334,7 @@ int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm, // sherry added /* Deallocate factorization specific buffers */ - freePackLUInfo(&packLUInfo); + freePackLUInfo(&packLUInfo); dfreeScuBufs(&scuBufs); freeFactStat(&factStat); freeFactNodelists(&fNlists); @@ -347,10 +344,6 @@ int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm, dfreeDiagFactBufsArr(mxLeafNode, dFBufs); Free_HyP(HyP); - if (!iam) { - printf ("exit pdgstrf3d()\n"); fflush(stdout); - } - #if ( DEBUGlevel>=1 ) CHECK_MALLOC (grid3d->iam, "Exit pdgstrf3d()"); #endif diff --git a/SRC/pz3dcomm.c b/SRC/pz3dcomm.c index 8e5c6dae..55b313e1 100644 --- a/SRC/pz3dcomm.c +++ b/SRC/pz3dcomm.c @@ -346,8 +346,8 @@ int_t zp3dScatter(int_t n, LUstruct_t * LUstruct, gridinfo3d_t* grid3d) MPI_Bcast( &nsupers, 1, mpi_int_t, 0, grid3d->zscp.comm); /* Scatter and alloc Glu_persist */ - if (grid3d->zscp.Iam) - AllocGlu(n, nsupers, LUstruct, grid3d); + if ( grid3d->zscp.Iam ) // all other process layers not equal 0 + AllocGlu_3d(n, nsupers, LUstruct); /* broadcast Glu_persist */ int_t *xsup = LUstruct->Glu_persist->xsup; @@ -358,7 +358,7 @@ int_t zp3dScatter(int_t n, LUstruct_t * LUstruct, gridinfo3d_t* grid3d) /* now broadcast localLu_t */ /* first allocating space for it */ - if (grid3d->zscp.Iam) + if ( grid3d->zscp.Iam ) // all other process layers not equal 0 zAllocLlu(nsupers, LUstruct, grid3d); LocalLU_t *Llu = LUstruct->Llu; @@ -395,7 +395,7 @@ int_t zp3dScatter(int_t n, LUstruct_t * LUstruct, gridinfo3d_t* grid3d) mpiMallocLUStruct(nsupers, LUstruct, grid3d); #endif return 0; -} +} /* zp3dScatter */ int_t zscatter3dUPanels(int_t nsupers, diff --git a/SRC/pzgstrf3d.c b/SRC/pzgstrf3d.c 
index cd9bef61..f204c713 100644 --- a/SRC/pzgstrf3d.c +++ b/SRC/pzgstrf3d.c @@ -147,15 +147,20 @@ int_t pzgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm, int_t iam = grid->iam; // in 2D grid int num_threads = getNumThreads(grid3d->iam); - diagFactBufs_t dFBuf; - zinitDiagFactBufs(ldt, &dFBuf); - factStat_t factStat; initFactStat(nsupers, &factStat); - commRequests_t comReqs; +#if 0 // sherry: not used + diagFactBufs_t dFBuf; + zinitDiagFactBufs(ldt, &dFBuf); + + commRequests_t comReqs; initCommRequests(&comReqs, grid); + msgs_t msgs; + initMsgs(&msgs); +#endif + SCT->tStartup = SuperLU_timer_(); packLUInfo_t packLUInfo; initPackLUInfo(nsupers, &packLUInfo); @@ -163,9 +168,6 @@ int_t pzgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm, scuBufs_t scuBufs; zinitScuBufs(ldt, num_threads, nsupers, &scuBufs, LUstruct, grid); - msgs_t msgs; - initMsgs(&msgs); - factNodelists_t fNlists; initFactNodelists( ldt, num_threads, nsupers, &fNlists); @@ -200,8 +202,7 @@ int_t pzgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm, mxLeafNode = sForests[myTreeIdxs[ilvl]]->topoInfo.eTreeTopLims[1]; } diagFactBufs_t** dFBufs = zinitDiagFactBufsArr(mxLeafNode, ldt, grid); - commRequests_t** comReqss = initCommRequestsArr(SUPERLU_MAX(mxLeafNode, numLA), - ldt, grid); + commRequests_t** comReqss = initCommRequestsArr(SUPERLU_MAX(mxLeafNode, numLA), ldt, grid); /* Setting up GPU related data structures */ @@ -211,7 +212,7 @@ int_t pzgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm, int_t Pr = grid->nprow; int_t mrb = (nsupers + Pr - 1) / Pr; int_t mcb = (nsupers + Pc - 1) / Pc; - HyP_t *HyP = (HyP_t *) malloc(sizeof(HyP_t)); + HyP_t *HyP = (HyP_t *) SUPERLU_MALLOC(sizeof(HyP_t)); Init_HyP(HyP, Llu, mcb, mrb); HyP->first_l_block_acc = first_l_block_acc; HyP->first_u_block_acc = first_u_block_acc; @@ -330,6 +331,18 @@ int_t pzgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm, reduceStat(FACT, stat, grid3d); + // sherry added + /* Deallocate factorization specific buffers */ + freePackLUInfo(&packLUInfo); + zfreeScuBufs(&scuBufs); + freeFactStat(&factStat); + freeFactNodelists(&fNlists); + freeMsgsArr(numLA, msgss); + freeCommRequestsArr(SUPERLU_MAX(mxLeafNode, numLA), comReqss); + zLluBufFreeArr(numLA, LUvsbs); + zfreeDiagFactBufsArr(mxLeafNode, dFBufs); + Free_HyP(HyP); + #if ( DEBUGlevel>=1 ) CHECK_MALLOC (grid3d->iam, "Exit pzgstrf3d()"); #endif diff --git a/SRC/sec_structs.c b/SRC/sec_structs.c index 55ed42e0..d8e13c5a 100644 --- a/SRC/sec_structs.c +++ b/SRC/sec_structs.c @@ -2,7 +2,6 @@ #include "superlu_ddefs.h" #if 0 #include "sec_structs.h" -#include /*for printfs*/ #include /*for Qsort */ #include #include @@ -13,6 +12,8 @@ #include "supernodal_etree.h" #endif +#include /*for printfs*/ + double CPU_CLOCK_RATE; /*for sorting structures */ int Cmpfunc_R_info (const void * a, const void * b) @@ -100,8 +101,13 @@ double getFreq(void) } char *arg = 0; - char *line = NULL; +#if 1 size_t len = 0; + char *line = NULL; +#else + size_t len = 100; // Sherry fix + char *line = SUPERLU_MALLOC(len * sizeof(char)); +#endif size_t read; while ((read = getline(&line, &len, fp)) != -1) { @@ -125,10 +131,11 @@ double getFreq(void) break; } } - + free(line); + line = NULL; } - free(line); // sherry added + //SUPERLU_FREE(line); // sherry added return 0; } diff --git a/SRC/superlu_ddefs.h b/SRC/superlu_ddefs.h index b05e87ee..3cca2576 100644 --- a/SRC/superlu_ddefs.h +++ b/SRC/superlu_ddefs.h @@ -726,7 +726,7 @@ extern void 
pdgstrf2(superlu_dist_options_t *, int_t nsupers, int_t k0, LocalLU_t *, MPI_Request *, int, SuperLUStat_t *, int *); /* from p3dcomm.h */ -extern int_t dAllocLlu(int_t nsupers, LUstruct_t * LUstruct, gridinfo3d_t* grid3d); +extern int_t dAllocLlu_3d(int_t nsupers, LUstruct_t * LUstruct, gridinfo3d_t* grid3d); extern int_t dp3dScatter(int_t n, LUstruct_t * LUstruct, gridinfo3d_t* grid3d); extern int_t dscatter3dLPanels(int_t nsupers, LUstruct_t * LUstruct, gridinfo3d_t* grid3d); @@ -737,7 +737,9 @@ extern int_t dcollect3dUpanels(int_t layer, int_t nsupers, LUstruct_t * LUstruct extern int_t dp3dCollect(int_t layer, int_t n, LUstruct_t * LUstruct, gridinfo3d_t* grid3d); /*zero out LU non zero entries*/ extern int_t dzeroSetLU(int_t nnodes, int_t* nodeList , LUstruct_t *, gridinfo3d_t*); -extern int_t AllocGlu(int_t n, int_t nsupers, LUstruct_t *, gridinfo3d_t*); +extern int AllocGlu_3d(int_t n, int_t nsupers, LUstruct_t *); +extern int DeAllocLlu_3d(int_t n, LUstruct_t *, gridinfo3d_t*); +extern int DeAllocGlu_3d(LUstruct_t *); /* Reduces L and U panels of nodes in the List nodeList (size=nnnodes) receiver[L(nodelist)] =sender[L(nodelist)] +receiver[L(nodelist)] diff --git a/SRC/superlu_defs.h b/SRC/superlu_defs.h index fa84c143..4c657b3a 100644 --- a/SRC/superlu_defs.h +++ b/SRC/superlu_defs.h @@ -828,13 +828,17 @@ typedef struct /* data */ int_t nNodes; // total number of nodes int_t* nodeList; // list of nodes, should be in order of factorization +#if 0 // Sherry: the following array is used on rForest_t. ??? int_t* treeHeads; - /*topological information about the tree*/ +#endif + /*topological information about the tree*/ int_t numLvl; // number of Topological levels in the forest int_t numTrees; // number of tree in the forest treeTopoInfo_t topoInfo; // +#if 0 // Sherry fix: the following two structures are in treeTopoInfo_t. ??? int_t* eTreeTopLims; // boundaries of each level of size int_t* myIperm; // Iperm for my tree size nsupers; +#endif /*information about load balance*/ double weight; // estimated cost diff --git a/SRC/superlu_dist_config.h b/SRC/superlu_dist_config.h index c3def71c..2416fef5 100644 --- a/SRC/superlu_dist_config.h +++ b/SRC/superlu_dist_config.h @@ -10,7 +10,7 @@ /* #undef HAVE_COMBBLAS */ /* enable 64bit index mode */ -#define XSDK_INDEX_SIZE 64 +/* #undef XSDK_INDEX_SIZE */ #if (XSDK_INDEX_SIZE == 64) #define _LONGINT 1 diff --git a/SRC/superlu_enum_consts.h b/SRC/superlu_enum_consts.h index 5c79d7d2..24a7e787 100644 --- a/SRC/superlu_enum_consts.h +++ b/SRC/superlu_enum_consts.h @@ -15,7 +15,6 @@ at the top-level directory. * Lawrence Berkeley National Lab, Univ. 
of California Berkeley, * October 1, 2010 * January 28, 2018 - * January 28, 2018 * */ @@ -76,7 +75,7 @@ typedef enum { SOL_COMM,/* communication for solve */ SOL_GEMM,/* gemm for solve */ SOL_TRSM,/* trsm for solve */ - SOL_TOT, /* LU-solve time*/ + SOL_TOT, /* LU-solve time*/ RCOND, /* estimate reciprocal condition number */ SOLVE, /* forward and back solves */ REFINE, /* perform iterative refinement */ diff --git a/SRC/superlu_zdefs.h b/SRC/superlu_zdefs.h index 5a49ca68..a6f395ac 100644 --- a/SRC/superlu_zdefs.h +++ b/SRC/superlu_zdefs.h @@ -729,7 +729,7 @@ extern void pzgstrf2(superlu_dist_options_t *, int_t nsupers, int_t k0, LocalLU_t *, MPI_Request *, int, SuperLUStat_t *, int *); /* from p3dcomm.h */ -extern int_t zAllocLlu(int_t nsupers, LUstruct_t * LUstruct, gridinfo3d_t* grid3d); +extern int_t zAllocLlu_3d(int_t nsupers, LUstruct_t * LUstruct, gridinfo3d_t* grid3d); extern int_t zp3dScatter(int_t n, LUstruct_t * LUstruct, gridinfo3d_t* grid3d); extern int_t zscatter3dLPanels(int_t nsupers, LUstruct_t * LUstruct, gridinfo3d_t* grid3d); @@ -740,7 +740,9 @@ extern int_t zcollect3dUpanels(int_t layer, int_t nsupers, LUstruct_t * LUstruct extern int_t zp3dCollect(int_t layer, int_t n, LUstruct_t * LUstruct, gridinfo3d_t* grid3d); /*zero out LU non zero entries*/ extern int_t zzeroSetLU(int_t nnodes, int_t* nodeList , LUstruct_t *, gridinfo3d_t*); -extern int_t AllocGlu(int_t n, int_t nsupers, LUstruct_t *, gridinfo3d_t*); +extern int AllocGlu_3d(int_t n, int_t nsupers, LUstruct_t *); +extern int DeAllocLlu_3d(int_t n, LUstruct_t *, gridinfo3d_t*); +extern int DeAllocGlu_3d(LUstruct_t *); /* Reduces L and U panels of nodes in the List nodeList (size=nnnodes) receiver[L(nodelist)] =sender[L(nodelist)] +receiver[L(nodelist)] diff --git a/SRC/supernodalForest.c b/SRC/supernodalForest.c index ad1aff0f..f5bc3dda 100644 --- a/SRC/supernodalForest.c +++ b/SRC/supernodalForest.c @@ -263,9 +263,6 @@ int_t* getNodeToForstMap(int_t nsupers, sForest_t** sForests, gridinfo3d_t* gri } - - - int_t* getMyNodeCountsFr(int_t maxLvl, int_t* myTreeIdxs, sForest_t** sForests) { int_t* myNodeCount = INT_T_ALLOC(maxLvl); @@ -283,7 +280,6 @@ int_t* getMyNodeCountsFr(int_t maxLvl, int_t* myTreeIdxs, sForest_t** sForests) int_t** getTreePermFr( int_t* myTreeIdxs, sForest_t** sForests, gridinfo3d_t* grid3d) { - int_t maxLvl = log2i(grid3d->zscp.Np) + 1; int_t** treePerm = (int_t** ) SUPERLU_MALLOC(sizeof(int_t*)*maxLvl); @@ -313,7 +309,6 @@ int_t* getIsNodeInMyGrid(int_t nsupers, int_t maxLvl, int_t* myNodeCount, int_t* } return isNodeInMyGrid; - } double pearsonCoeff(int_t numForests, double* frCost, double* frWeight) @@ -561,7 +556,7 @@ sForest_t* createForestNew(int_t numTrees, int_t nsupers, int_t * nodeCounts, // using the nodelist create factorization ordering - calcTopInfoForest(forest, nsupers, setree); + calcTopInfoForest(forest, nsupers, setree); return forest; } @@ -611,21 +606,29 @@ void oneLeveltreeFrPartition( int_t nTrees, int_t * trCount, int_t** trList, SUPERLU_FREE(wSortIdx); -} +} /* oneLeveltreeFrPartition */ forestPartition_t iterativeFrPartitioning(rForest_t* rforest, int_t nsupers, int_t * setree, treeList_t* treeList) { - int_t nTreeSet = rforest->ntrees; - int_t* treeHeads = rforest->treeHeads; - - int_t nAnc = 0; - int_t* ancTreeCount = INT_T_ALLOC(MAX_TREE_ALLOWED); - int_t** ancNodeLists = SUPERLU_MALLOC(MAX_TREE_ALLOWED * sizeof(int_t*)); + int_t nTreeSet = rforest->ntrees; + int_t* treeHeads = rforest->treeHeads; - double * weightArr = DOUBLE_ALLOC (MAX_TREE_ALLOWED); - // int_t* 
treeSet = INT_T_ALLOC(nTreeSet); - int_t* treeSet = INT_T_ALLOC(MAX_TREE_ALLOWED); + int_t nAnc = 0; +#if 0 + int_t* ancTreeCount = INT_T_ALLOC(MAX_TREE_ALLOWED); + int_t** ancNodeLists = SUPERLU_MALLOC(MAX_TREE_ALLOWED * sizeof(int_t*)); + + double * weightArr = DOUBLE_ALLOC (MAX_TREE_ALLOWED); + // int_t* treeSet = INT_T_ALLOC(nTreeSet); + int_t* treeSet = INT_T_ALLOC(MAX_TREE_ALLOWED); +#else // Sherry fix + int_t* ancTreeCount = intMalloc_dist(MAX_TREE_ALLOWED); + int_t** ancNodeLists = SUPERLU_MALLOC(MAX_TREE_ALLOWED * sizeof(int_t*)); + + double * weightArr = doubleMalloc_dist(MAX_TREE_ALLOWED); + int_t* treeSet = intMalloc_dist(MAX_TREE_ALLOWED); +#endif for (int i = 0; i < nTreeSet; ++i) { @@ -658,7 +661,8 @@ forestPartition_t iterativeFrPartitioning(rForest_t* rforest, int_t nsupers, int } ancTreeCount[nAnc] = getCommonAncsCount(MaxTree, treeList); - int_t * alist = INT_T_ALLOC (ancTreeCount[nAnc]); + //int_t * alist = INT_T_ALLOC (ancTreeCount[nAnc]); + int_t * alist = intMalloc_dist(ancTreeCount[nAnc]); getCommonAncestorList(MaxTree, alist, setree, treeList); ancNodeLists[nAnc] = alist; nAnc++; @@ -684,22 +688,33 @@ forestPartition_t iterativeFrPartitioning(rForest_t* rforest, int_t nsupers, int sForest_t* aforest = createForestNew(nAnc, nsupers, ancTreeCount, ancNodeLists, setree, treeList); // create the weight array; - double* sWeightArr = DOUBLE_ALLOC(nTreeSet); + //double* sWeightArr = DOUBLE_ALLOC(nTreeSet); + double* sWeightArr = doubleMalloc_dist(nTreeSet); // Sherry fix for (int i = 0; i < nTreeSet ; ++i) sWeightArr[i] = treeList[treeSet[i]].iWeight; int_t trCount[2] = {0, 0}; int_t* trList[2]; +#if 0 trList[0] = INT_T_ALLOC(nTreeSet); trList[1] = INT_T_ALLOC(nTreeSet); +#else // Sherry fix + trList[0] = intMalloc_dist(nTreeSet); + trList[1] = intMalloc_dist(nTreeSet); +#endif oneLeveltreeFrPartition( nTreeSet, trCount, trList, treeSet, sWeightArr); rForest_t *rforestS1, *rforestS2; +#if 0 rforestS1 = SUPERLU_MALLOC(sizeof(rforest)); rforestS2 = SUPERLU_MALLOC(sizeof(rforest)); +#else + rforestS1 = (rForest_t *) SUPERLU_MALLOC(sizeof(rForest_t)); // Sherry fix + rforestS2 = (rForest_t *) SUPERLU_MALLOC(sizeof(rForest_t)); +#endif rforestS1->ntrees = trCount[0]; rforestS1->treeHeads = trList[0]; @@ -709,7 +724,7 @@ forestPartition_t iterativeFrPartitioning(rForest_t* rforest, int_t nsupers, int forestPartition_t frPr_t; frPr_t.Ans = aforest; - frPr_t.S[0] = rforestS1; + frPr_t.S[0] = rforestS1; frPr_t.S[1] = rforestS2; SUPERLU_FREE(weightArr); @@ -733,7 +748,7 @@ forestPartition_t iterativeFrPartitioning(rForest_t* rforest, int_t nsupers, int } /* iterativeFrPartitioning */ - +/* Create a single sforest */ sForest_t* r2sForest(rForest_t* rforest, int_t nsupers, int_t * setree, treeList_t* treeList) { int_t nTree = rforest->ntrees; @@ -747,10 +762,10 @@ sForest_t* r2sForest(rForest_t* rforest, int_t nsupers, int_t * setree, treeList for (int i = 0; i < nTree; ++i) { - /* code */ - nodeCounts[i] = treeList[treeHeads[i]].numDescendents; - NodeLists[i] = INT_T_ALLOC(nodeCounts[i]); - getDescendList(treeHeads[i], NodeLists[i], treeList); + /* code */ + nodeCounts[i] = treeList[treeHeads[i]].numDescendents; + NodeLists[i] = INT_T_ALLOC(nodeCounts[i]); + getDescendList(treeHeads[i], NodeLists[i], treeList); } @@ -766,7 +781,7 @@ sForest_t* r2sForest(rForest_t* rforest, int_t nsupers, int_t * setree, treeList SUPERLU_FREE(nodeCounts); return sforest; -} +} /* r2sForest */ sForest_t** getGreedyLoadBalForests( int_t maxLvl, int_t nsupers, int_t * setree, treeList_t* treeList) @@ 
-823,37 +838,42 @@ sForest_t** getGreedyLoadBalForests( int_t maxLvl, int_t nsupers, int_t * setre for (int_t tr = lvlSt; tr < lvlEnd; ++tr) { - /* code */ - forestPartition_t frPr_t = iterativeFrPartitioning(&rForests[tr], nsupers, setree, treeList); - sForests[tr] = frPr_t.Ans; - - if (lvl == maxLvl - 2) - { - /* code */ - sForests[2 * tr + 1] = r2sForest(frPr_t.S[0], nsupers, setree, treeList); - sForests[2 * tr + 2] = r2sForest(frPr_t.S[1], nsupers, setree, treeList); - } - else - { - rForests[2 * tr + 1] = *(frPr_t.S[0]); - rForests[2 * tr + 2] = *(frPr_t.S[1]); - } + /* code */ + forestPartition_t frPr_t = iterativeFrPartitioning(&rForests[tr], nsupers, setree, treeList); + sForests[tr] = frPr_t.Ans; + if (lvl == maxLvl - 2) { + /* code */ + sForests[2 * tr + 1] = r2sForest(frPr_t.S[0], nsupers, setree, treeList); + sForests[2 * tr + 2] = r2sForest(frPr_t.S[1], nsupers, setree, treeList); + freeRforest(frPr_t.S[0]); // Sherry added + freeRforest(frPr_t.S[1]); +#if 0 + SUPERLU_FREE(frPr_t.S[0]); // Sherry added + SUPERLU_FREE(frPr_t.S[1]); +#endif + } else { + rForests[2 * tr + 1] = *(frPr_t.S[0]); + rForests[2 * tr + 2] = *(frPr_t.S[1]); + + } + SUPERLU_FREE(frPr_t.S[0]); // Sherry added + SUPERLU_FREE(frPr_t.S[1]); } } for (int i = 0; i < numRForests; ++i) { - /* code */ - freeRforest(&rForests[i]); + /* code */ + freeRforest(&rForests[i]); // Sherry added } - SUPERLU_FREE(rForests); + SUPERLU_FREE(rForests); // Sherry added return sForests; -} +} /* getGreedyLoadBalForests */ // balanced forests at one level sForest_t** getOneLevelBalForests( int_t maxLvl, int_t nsupers, int_t * setree, treeList_t* treeList) diff --git a/SRC/supernodal_etree.c b/SRC/supernodal_etree.c index aceef5fa..f32a5c6c 100644 --- a/SRC/supernodal_etree.c +++ b/SRC/supernodal_etree.c @@ -23,7 +23,7 @@ int_t log2i(int_t index) int_t *supernodal_etree(int_t nsuper, int_t * etree, int_t* supno, int_t *xsup) { // int_t *setree = malloc(sizeof(int_t) * nsuper); - int_t *setree = intMalloc_dist(nsuper); + int_t *setree = intMalloc_dist(nsuper); // Sherry fix /*initialzing the loop*/ for (int i = 0; i < nsuper; ++i) { @@ -44,7 +44,8 @@ int_t *supernodal_etree(int_t nsuper, int_t * etree, int_t* supno, int_t *xsup) supernode calculates "level" in elimination tree*/ int_t* topological_ordering(int_t nsuper, int_t* setree) { - int_t *tsort_setree = malloc(sizeof(int_t) * nsuper); + // int_t *tsort_setree = malloc(sizeof(int_t) * nsuper); + int_t *tsort_setree = intMalloc_dist(nsuper); // Sherry fix for (int i = 0; i < nsuper; ++i) { tsort_setree[i] = 0; /*initializing all levels to zero*/ @@ -83,7 +84,8 @@ treeList_t* setree2list(int_t nsuper, int_t* setree ) /*allocate memory for children lists*/ for (int i = 0; i < nsuper + 1; ++i) { - treeList[i].childrenList = INT_T_ALLOC (treeList[i].numChild); + //treeList[i].childrenList = INT_T_ALLOC (treeList[i].numChild); + treeList[i].childrenList = intMalloc_dist(treeList[i].numChild); treeList[i].numChild = 0; } @@ -99,7 +101,7 @@ treeList_t* setree2list(int_t nsuper, int_t* setree ) } /* setree2list */ -// sherry added +// Sherry added int free_treelist(int_t nsuper, treeList_t* treeList) { for (int i = 0; i < nsuper + 1; ++i) { @@ -329,7 +331,8 @@ int_t* getEtreeLB(int_t nnodes, int_t* perm_l, int_t* gTopOrder) minTop = gTopOrder[perm_l[0]]; maxTop = gTopOrder[perm_l[nnodes - 1]]; int_t numLB = maxTop - minTop + 2; - int_t* lEtreeLB = (int_t *) malloc( sizeof(int_t) * numLB); + //int_t* lEtreeLB = (int_t *) malloc( sizeof(int_t) * numLB); + int_t* lEtreeLB = (int_t *) 
intMalloc_dist(numLB); // Sherry fix for (int i = 0; i < numLB; ++i) { /* initalize */ @@ -391,7 +394,8 @@ int_t* getSubTreeRoots(int_t k, treeList_t* treeList) int_t testSubtreeNodelist(int_t nsupers, int_t numList, int_t** nodeList, int_t* nodeCount) // tests disjoint and union { - int_t* slist = (int_t* ) malloc(sizeof(int_t) * nsupers); + //int_t* slist = (int_t* ) malloc(sizeof(int_t) * nsupers); + int_t* slist = intMalloc_dist(nsupers); // Sherry fix /*intialize each entry with zero */ for (int_t i = 0; i < nsupers; ++i) { @@ -413,7 +417,7 @@ int_t testSubtreeNodelist(int_t nsupers, int_t numList, int_t** nodeList, int_t* assert(slist[i] == 1); } printf("testSubtreeNodelist Passed\n"); - free(slist); + SUPERLU_FREE(slist); return 0; } int_t testListPerm(int_t nodeCount, int_t* nodeList, int_t* permList, int_t* gTopLevel) @@ -430,9 +434,13 @@ int_t testListPerm(int_t nodeCount, int_t* nodeList, int_t* permList, int_t* gTo } assert( gTopLevel[permList[i]] <= gTopLevel[permList[i + 1]]); } - +#if 0 int_t* slist = (int_t* ) malloc(sizeof(int_t) * nodeCount); int_t* plist = (int_t* ) malloc(sizeof(int_t) * nodeCount); +#else + int_t* slist = intMalloc_dist(nodeCount); + int_t* plist = intMalloc_dist(nodeCount); +#endif // copy lists for (int_t i = 0; i < nodeCount; ++i) { @@ -447,13 +455,17 @@ int_t testListPerm(int_t nodeCount, int_t* nodeList, int_t* permList, int_t* gTo assert( slist[i] == plist[i]); } printf("permList Test Passed\n"); - free(slist); - free(plist); + + SUPERLU_FREE(slist); + SUPERLU_FREE(plist); + return 0; } int_t mergPermTest(int_t nperms, int_t* gperms, int_t* nnodes); + +// Sherry: the following routine is not called ?? int_t* merg_perms(int_t nperms, int_t* nnodes, int_t** perms) { // merges three permutations @@ -463,8 +475,11 @@ int_t* merg_perms(int_t nperms, int_t* nnodes, int_t** perms) { nn += nnodes[i]; } + // alloc address - int_t* gperm = (int_t*) malloc(nn * sizeof(int_t)); + //int_t* gperm = (int_t*) malloc(nn * sizeof(int_t)); + int_t* gperm = intMalloc_dist(nn); // Sherry fix + //now concatenat arrays int_t ptr = 0; for (int_t tr = 0; tr < nperms; ++tr) @@ -480,7 +495,8 @@ int_t* merg_perms(int_t nperms, int_t* nnodes, int_t** perms) } mergPermTest( nperms, gperm, nnodes); return gperm; -} +} /* merg_perms */ + int_t mergPermTest(int_t nperms, int_t* gperms, int_t* nnodes) { // merges three permutations @@ -490,8 +506,11 @@ int_t mergPermTest(int_t nperms, int_t* gperms, int_t* nnodes) { nn += nnodes[i]; } + // alloc address - int_t* tperm = (int_t*) malloc(nn * sizeof(int_t)); + // int_t* tperm = (int_t*) malloc(nn * sizeof(int_t)); + int_t* tperm = intMalloc_dist(nn); // Sherry fix + for (int i = 0; i < nn; ++i) { tperm[i] = 0; @@ -507,9 +526,9 @@ int_t mergPermTest(int_t nperms, int_t* gperms, int_t* nnodes) /* code */ assert(tperm[i] == 1); } - free(tperm); + SUPERLU_FREE(tperm); return nn; -} +} /* mergPermTest */ int* getLastDep(gridinfo_t *grid, SuperLUStat_t *stat, superlu_dist_options_t *options, @@ -886,12 +905,13 @@ treeTopoInfo_t getMyTreeTopoInfo(int_t nnodes, int_t nsupers, return ttI; } - -int_t* Etree_LevelBoundry(int_t* perm, int_t* tsort_etree, int_t nsuper) +// Sherry: the following function is not called ?? 
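/* A hedged sketch of the allocation rule this commit applies throughout
 * supernodal_etree.c: any buffer released with SUPERLU_FREE() should be
 * obtained from the SuperLU allocators, so that the DEBUGlevel>=1
 * CHECK_MALLOC accounting stays balanced. Hypothetical illustration:
 *
 *     int_t *buf = intMalloc_dist(n);  // not malloc(n * sizeof(int_t))
 *     if ( !buf ) ABORT("Malloc fails for buf[].");
 *     ...
 *     SUPERLU_FREE(buf);               // not free(buf)
 */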
/*calculated boundries of the topological levels*/ +int_t* Etree_LevelBoundry(int_t* perm, int_t* tsort_etree, int_t nsuper) { int_t max_level = tsort_etree[nsuper - 1] + 1; - int_t *Etree_LvlBdry = malloc(sizeof(int_t) * (max_level + 1)); + //int_t *Etree_LvlBdry = malloc(sizeof(int_t) * (max_level + 1)); + int_t *Etree_LvlBdry = intMalloc_dist(max_level + 1); // Sherry fix Etree_LvlBdry[0] = 0; /*calculate end of boundries for each level*/ for (int_t i = 0; i < max_level; ++i) @@ -920,7 +940,8 @@ int_t* Etree_LevelBoundry(int_t* perm, int_t* tsort_etree, int_t nsuper) int_t* calculate_num_children(int_t nsuper, int_t* setree) { - int_t* etree_num_children = malloc(sizeof(int_t) * (nsuper)); + //int_t* etree_num_children = malloc(sizeof(int_t) * (nsuper)); + int_t* etree_num_children = intMalloc_dist(nsuper); // Sherry fix for (int_t i = 0; i < nsuper; ++i) { /*initialize num children to zero*/ diff --git a/SRC/util.c b/SRC/util.c index 3040891a..eb962f92 100644 --- a/SRC/util.c +++ b/SRC/util.c @@ -93,7 +93,8 @@ Destroy_Dense_Matrix_dist(SuperMatrix *A) -/*! \brief Destroy distributed L & U matrices. */ +/*! \brief Destroy the binary trees associated with the panel. + These are used in triangular solve. */ void Destroy_Tree(int_t n, gridinfo_t *grid, LUstruct_t *LUstruct) { @@ -108,31 +109,29 @@ Destroy_Tree(int_t n, gridinfo_t *grid, LUstruct_t *LUstruct) nsupers = Glu_persist->supno[n-1] + 1; - nb = CEILING(nsupers, grid->npcol); - for (i=0;iLBtree_ptr[i]!=NULL){ - BcTree_Destroy(Llu->LBtree_ptr[i],LUstruct->dt); - } - if(Llu->UBtree_ptr[i]!=NULL){ - BcTree_Destroy(Llu->UBtree_ptr[i],LUstruct->dt); - } + nb = CEILING(nsupers, grid->npcol); + for (i=0;iLBtree_ptr[i]!=NULL){ + BcTree_Destroy(Llu->LBtree_ptr[i],LUstruct->dt); } - SUPERLU_FREE(Llu->LBtree_ptr); - SUPERLU_FREE(Llu->UBtree_ptr); + if(Llu->UBtree_ptr[i]!=NULL){ + BcTree_Destroy(Llu->UBtree_ptr[i],LUstruct->dt); + } + } + SUPERLU_FREE(Llu->LBtree_ptr); + SUPERLU_FREE(Llu->UBtree_ptr); - nb = CEILING(nsupers, grid->nprow); - for (i=0;iLRtree_ptr[i]!=NULL){ - RdTree_Destroy(Llu->LRtree_ptr[i],LUstruct->dt); - } - if(Llu->URtree_ptr[i]!=NULL){ - RdTree_Destroy(Llu->URtree_ptr[i],LUstruct->dt); - } + nb = CEILING(nsupers, grid->nprow); + for (i=0;iLRtree_ptr[i]!=NULL){ + RdTree_Destroy(Llu->LRtree_ptr[i],LUstruct->dt); } - SUPERLU_FREE(Llu->LRtree_ptr); - SUPERLU_FREE(Llu->URtree_ptr); - - + if(Llu->URtree_ptr[i]!=NULL){ + RdTree_Destroy(Llu->URtree_ptr[i],LUstruct->dt); + } + } + SUPERLU_FREE(Llu->LRtree_ptr); + SUPERLU_FREE(Llu->URtree_ptr); #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Exit Destroy_Tree()"); @@ -140,7 +139,6 @@ Destroy_Tree(int_t n, gridinfo_t *grid, LUstruct_t *LUstruct) } - /*! \brief Destroy distributed L & U matrices. 
*/ void Destroy_LU(int_t n, gridinfo_t *grid, LUstruct_t *LUstruct) @@ -155,7 +153,7 @@ Destroy_LU(int_t n, gridinfo_t *grid, LUstruct_t *LUstruct) CHECK_MALLOC(iam, "Enter Destroy_LU()"); #endif - Destroy_Tree(n, grid, LUstruct); + Destroy_Tree(n, grid, LUstruct); // from asynchronous triangular solve nsupers = Glu_persist->supno[n-1] + 1; @@ -235,6 +233,42 @@ Destroy_LU(int_t n, gridinfo_t *grid, LUstruct_t *LUstruct) #endif } +int DeAllocLlu_3d(int_t n, LUstruct_t * LUstruct, gridinfo3d_t* grid3d) +{ + int i, nbc, nbr, nsupers; + LocalLU_t *Llu = LUstruct->Llu; + + nsupers = (LUstruct->Glu_persist)->supno[n-1] + 1; + + nbc = CEILING(nsupers, grid3d->npcol); + for (i = 0; i < nbc; ++i) + if ( Llu->Lrowind_bc_ptr[i] ) { + SUPERLU_FREE (Llu->Lrowind_bc_ptr[i]); +#ifdef GPU_ACC + checkCuda(cudaFreeHost(Llu->Lnzval_bc_ptr[i])); +#else + SUPERLU_FREE (Llu->Lnzval_bc_ptr[i]); +#endif + } + SUPERLU_FREE (Llu->Lrowind_bc_ptr); + SUPERLU_FREE (Llu->Lnzval_bc_ptr); + + nbr = CEILING(nsupers, grid3d->nprow); + for (i = 0; i < nbr; ++i) + if ( Llu->Ufstnz_br_ptr[i] ) { + SUPERLU_FREE (Llu->Ufstnz_br_ptr[i]); + SUPERLU_FREE (Llu->Unzval_br_ptr[i]); + } + SUPERLU_FREE (Llu->Ufstnz_br_ptr); + SUPERLU_FREE (Llu->Unzval_br_ptr); + + /* The following can be freed after factorization. */ + SUPERLU_FREE(Llu->ToRecv); + SUPERLU_FREE(Llu->ToSendD); + for (i = 0; i < nbc; ++i) SUPERLU_FREE(Llu->ToSendR[i]); + SUPERLU_FREE(Llu->ToSendR); +} + /*! \brief Allocate storage in ScalePermstruct */ void ScalePermstructInit(const int_t m, const int_t n, ScalePermstruct_t *ScalePermstruct) @@ -1456,7 +1490,7 @@ int_t partitionM( int_t* a, int_t l, int_t r, int_t lda, int_t dir, int_t dims) * The following are from 3D code p3dcomm.c */ -int_t AllocGlu(int_t n, int_t nsupers, LUstruct_t * LUstruct, gridinfo3d_t* grid3d) +int AllocGlu_3d(int_t n, int_t nsupers, LUstruct_t * LUstruct) { /*broadcasting Glu_persist*/ LUstruct->Glu_persist->xsup = intMalloc_dist(nsupers+1); //INT_T_ALLOC(nsupers+1); @@ -1464,6 +1498,14 @@ int_t AllocGlu(int_t n, int_t nsupers, LUstruct_t * LUstruct, gridinfo3d_t* grid return 0; } +// Sherry added +int DeAllocGlu_3d(LUstruct_t * LUstruct) +{ + SUPERLU_FREE(LUstruct->Glu_persist->xsup); + SUPERLU_FREE(LUstruct->Glu_persist->supno); + return 0; +} + int_t** getTreePerm( int_t* myTreeIdxs, int_t* myZeroTrIdxs, int_t* nodeCount, int_t** nodeList, int_t* perm_c_supno, int_t* iperm_c_supno, diff --git a/SRC/ztreeFactorization.c b/SRC/ztreeFactorization.c index 46d13de5..4c677274 100644 --- a/SRC/ztreeFactorization.c +++ b/SRC/ztreeFactorization.c @@ -57,7 +57,11 @@ int zfreeDiagFactBufsArr(int_t mxLeafNode, diagFactBufs_t** dFBufs) SUPERLU_FREE(dFBufs[i]->BlockLFactor); SUPERLU_FREE(dFBufs[i]); } - SUPERLU_FREE(dFBufs); + + /* Sherry fix: + * mxLeafNode can be 0 for the replicated layers of the processes ?? */ + if ( mxLeafNode ) SUPERLU_FREE(dFBufs); + return 0; } @@ -269,7 +273,7 @@ int_t zdenseTreeFactor( } /* zdenseTreeFactor */ /* - * 2D factorization at individual subtree. + * 2D factorization at individual subtree. 
-- CPU only */ int_t zsparseTreeFactor_ASYNC( sForest_t* sforest, @@ -297,6 +301,10 @@ int_t zsparseTreeFactor_ASYNC( return 1; } +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (grid3d->iam, "Enter zsparseTreeFactor_ASYNC()"); +#endif + int_t *perm_c_supno = sforest->nodeList ; // list of nodes in the order of factorization treeTopoInfo_t* treeTopoInfo = &sforest->topoInfo; int_t* myIperm = treeTopoInfo->myIperm; @@ -717,5 +725,10 @@ int_t zsparseTreeFactor_ASYNC( }/*for main loop (int_t k0 = 0; k0 < gNodeCount[tree]; ++k0)*/ } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (grid3d->iam, "Exit zsparseTreeFactor_ASYNC()"); +#endif + return 0; } /* zsparseTreeFactor_ASYNC */ diff --git a/SRC/ztrfAux.c b/SRC/ztrfAux.c index c88f9fc6..9347db6d 100644 --- a/SRC/ztrfAux.c +++ b/SRC/ztrfAux.c @@ -587,6 +587,7 @@ trf3Dpartition_t* zinitTrf3Dpartition(int_t nsupers, /* Free memory allocated for trf3Dpartition structure. Sherry added this routine */ void zDestroy_trf3Dpartition(trf3Dpartition_t *trf3Dpartition, gridinfo3d_t *grid3d) { + int i; #if ( DEBUGlevel>=1 ) CHECK_MALLOC (grid3d->iam, "Enter zDestroy_trf3Dpartition()"); #endif @@ -596,20 +597,20 @@ void zDestroy_trf3Dpartition(trf3Dpartition_t *trf3Dpartition, gridinfo3d_t *gri SUPERLU_FREE(trf3Dpartition->myNodeCount); SUPERLU_FREE(trf3Dpartition->myTreeIdxs); SUPERLU_FREE(trf3Dpartition->myZeroTrIdxs); + SUPERLU_FREE(trf3Dpartition->treePerm); // double pointer pointing to sForests->nodeList int_t maxLvl = log2i(grid3d->zscp.Np) + 1; int_t numForests = (1 << maxLvl) - 1; sForest_t** sForests = trf3Dpartition->sForests; - for (int i = 0; i < numForests; ++i) { + for (i = 0; i < numForests; ++i) { if ( sForests[i] ) { SUPERLU_FREE(sForests[i]->nodeList); - //SUPERLU_FREE(sForests[i]->treeHeads); // already freed SUPERLU_FREE((sForests[i]->topoInfo).eTreeTopLims); SUPERLU_FREE((sForests[i]->topoInfo).myIperm); + SUPERLU_FREE(sForests[i]); // Sherry added } } SUPERLU_FREE(trf3Dpartition->sForests); // double pointer - SUPERLU_FREE(trf3Dpartition->treePerm); // double pointer pointing to sForests->nodeList SUPERLU_FREE(trf3Dpartition->supernode2treeMap); SUPERLU_FREE((trf3Dpartition->LUvsb)->Lsub_buf); @@ -621,7 +622,7 @@ void zDestroy_trf3Dpartition(trf3Dpartition_t *trf3Dpartition, gridinfo3d_t *gri SUPERLU_FREE(trf3Dpartition); #if ( DEBUGlevel>=1 ) - CHECK_MALLOC (grid3d->iam, "Enter zDestroy_trf3Dpartition()"); + CHECK_MALLOC (grid3d->iam, "Exit zDestroy_trf3Dpartition()"); #endif } diff --git a/make.inc.in b/make.inc.in index 490c5613..115b2396 100644 --- a/make.inc.in +++ b/make.inc.in @@ -35,7 +35,8 @@ ARCHFLAGS = cr RANLIB = @CMAKE_RANLIB@ CC = @CMAKE_C_COMPILER@ -CFLAGS = @CMAKE_C_FLAGS_RELEASE@ @CMAKE_C_FLAGS@ ##@CMAKE_SHARED_LIBRARY_C_FLAGS@ +CFLAGS = @CMAKE_C_FLAGS_RELEASE@ @CMAKE_C_FLAGS@ ${SHARED_C_FLAGS_EXPORT} + ##@CMAKE_SHARED_LIBRARY_C_FLAGS@ #CFLAGS += -D${DirDefs} # CFLAGS += @COMPILE_DEFINITIONS@ CXX = @CMAKE_CXX_COMPILER@ diff --git a/run_cmake_build.sh b/run_cmake_build.sh index ffd3689a..bd272408 100755 --- a/run_cmake_build.sh +++ b/run_cmake_build.sh @@ -5,10 +5,10 @@ echo "host: $THISHOST" if [ "$THISHOST" == "ssg1" ] then -# rm -fr ssg1-build; mkdir ssg1-build; cd ssg1-build; -# export PARMETIS_ROOT=~/lib/static/parmetis-4.0.3 - rm -fr int64-build; mkdir int64-build; cd int64-build; - export PARMETIS_ROOT=~/lib/static/64-bit/parmetis-4.0.3 + rm -fr ssg1-build; mkdir ssg1-build; cd ssg1-build; + export PARMETIS_ROOT=~/lib/static/parmetis-4.0.3 +# rm -fr int64-build; mkdir int64-build; cd int64-build; +# export 
PARMETIS_ROOT=~/lib/static/64-bit/parmetis-4.0.3
    export PARMETIS_BUILD_DIR=${PARMETIS_ROOT}/build/Linux-x86_64
    echo "ParMetis root: $PARMETIS_ROOT"
    cmake .. \
@@ -17,16 +17,16 @@ then
 	-DTPL_COMBBLAS_INCLUDE_DIRS="${COMBBLAS_ROOT}/_install/include;${COMBBLAS_R\
OOT}/Applications/BipartiteMatchings" \
 	-DTPL_COMBBLAS_LIBRARIES="${COMBBLAS_BUILD_DIR}/libCombBLAS.a" \
-	-DCMAKE_C_FLAGS="-std=c99 -O3 -g -DPRNTlevel=1 -DDEBUGlevel=0" \
+	-DCMAKE_C_FLAGS="-std=c99 -g -O3 -DPRNTlevel=0 -DDEBUGlevel=0" \
 	-DCMAKE_C_COMPILER=mpicc \
 	-DCMAKE_CXX_COMPILER=mpicxx \
-	-DTPL_ENABLE_BLASLIB=ON \
+	-DTPL_ENABLE_BLASLIB=OFF \
 	-DTPL_ENABLE_COMBBLASLIB=OFF \
 	-DTPL_ENABLE_LAPACKLIB=OFF \
 	-DBUILD_SHARED_LIBS=OFF \
-	-DXSDK_INDEX_SIZE=64 \
 	-DCMAKE_INSTALL_PREFIX=.
 fi
+#	-DXSDK_INDEX_SIZE=64 \
 # -DCMAKE_INSTALL_PREFIX=install
 # -DTPL_ENABLE_PARMETISLIB=OFF
 # -DCMAKE_CXX_FLAGS="-std=c++11" \

From 585cf95c752a1958592b231aa062ad71ca15df10 Mon Sep 17 00:00:00 2001
From: Xiaoye Li
Date: Wed, 27 Nov 2019 22:18:19 -0800
Subject: [PATCH 015/147] Fix a bug in complex code: Init_HyP() changed to
 {d,z}Init_HyP(), because the Schur complement update buffers are
 precision-dependent: doubleMalloc_dist() should be doublecomplexMalloc_dist().

---
 SRC/dtrfAux.c        | 42 ++++++++++++++++++++++++++++++++++++++++++
 SRC/pdgstrf3d.c      |  8 +++++---
 SRC/pzgstrf2.c       | 14 +++++++-------
 SRC/pzgstrf3d.c      |  8 +++++---
 SRC/sec_structs.c    | 42 +-----------------------------------------
 SRC/superlu_ddefs.h  |  7 +++++--
 SRC/superlu_grid3d.c |  2 +-
 SRC/superlu_zdefs.h  |  7 +++++--
 SRC/util_dist.h      |  4 ++++
 SRC/ztrfAux.c        | 42 ++++++++++++++++++++++++++++++++++++++++++
 10 files changed, 117 insertions(+), 59 deletions(-)

diff --git a/SRC/dtrfAux.c b/SRC/dtrfAux.c
index e6c68ef4..ddf8c25b 100644
--- a/SRC/dtrfAux.c
+++ b/SRC/dtrfAux.c
@@ -26,6 +26,48 @@ at the top-level directory.
 #include "trfAux.h"
 #endif

+/* Initialize the data structure to assist HALO offload of Schur-complement.
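
The heart of this fix is that every value buffer in HyP_t must be sized by the precision of its payload. A reduced sketch with simplified types (real_alloc and cplx_alloc are hypothetical stand-ins for doubleMalloc_dist and doublecomplexMalloc_dist):

#include <stdlib.h>

typedef struct { double r, i; } doublecomplex;

double *real_alloc(size_t n)         { return (double *) malloc(n * sizeof(double)); }
doublecomplex *cplx_alloc(size_t n)  { return (doublecomplex *) malloc(n * sizeof(doublecomplex)); }

// A shared Init_HyP that used real_alloc for the complex factorization would
// under-size lookAhead_L_buff and Remain_L_buff by a factor of two, since
// sizeof(doublecomplex) == 2 * sizeof(double).
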
*/ +void dInit_HyP(HyP_t* HyP, LocalLU_t *Llu, int_t mcb, int_t mrb ) +{ + HyP->last_offload = -1; +#if 0 + HyP->lookAhead_info = (Remain_info_t *) _mm_malloc((mrb) * sizeof(Remain_info_t), 64); + + HyP->lookAhead_L_buff = (double *) _mm_malloc( sizeof(double) * (Llu->bufmax[1]), 64); + + HyP->Remain_L_buff = (double *) _mm_malloc( sizeof(double) * (Llu->bufmax[1]), 64); + HyP->Remain_info = (Remain_info_t *) _mm_malloc(mrb * sizeof(Remain_info_t), 64); + HyP->Ublock_info_Phi = (Ublock_info_t *) _mm_malloc(mcb * sizeof(Ublock_info_t), 64); + HyP->Ublock_info = (Ublock_info_t *) _mm_malloc(mcb * sizeof(Ublock_info_t), 64); + HyP->Lblock_dirty_bit = (int_t *) _mm_malloc(mcb * sizeof(int_t), 64); + HyP->Ublock_dirty_bit = (int_t *) _mm_malloc(mrb * sizeof(int_t), 64); +#else + HyP->lookAhead_info = (Remain_info_t *) SUPERLU_MALLOC((mrb) * sizeof(Remain_info_t)); + HyP->lookAhead_L_buff = (double *) doubleMalloc_dist((Llu->bufmax[1])); + HyP->Remain_L_buff = (double *) doubleMalloc_dist((Llu->bufmax[1])); + HyP->Remain_info = (Remain_info_t *) SUPERLU_MALLOC(mrb * sizeof(Remain_info_t)); + HyP->Ublock_info_Phi = (Ublock_info_t *) SUPERLU_MALLOC(mcb * sizeof(Ublock_info_t)); + HyP->Ublock_info = (Ublock_info_t *) SUPERLU_MALLOC(mcb * sizeof(Ublock_info_t)); + HyP->Lblock_dirty_bit = (int_t *) intMalloc_dist(mcb); + HyP->Ublock_dirty_bit = (int_t *) intMalloc_dist(mrb); +#endif + + for (int_t i = 0; i < mcb; ++i) + { + HyP->Lblock_dirty_bit[i] = -1; + } + + for (int_t i = 0; i < mrb; ++i) + { + HyP->Ublock_dirty_bit[i] = -1; + } + + HyP->last_offload = -1; + HyP->superlu_acc_offload = get_acc_offload (); + + HyP->nCudaStreams =0; +} /* dInit_HyP */ + /*init3DLUstruct with forest interface */ void dinit3DLUstructForest( int_t* myTreeIdxs, int_t* myZeroTrIdxs, sForest_t** sForests, LUstruct_t* LUstruct, diff --git a/SRC/pdgstrf3d.c b/SRC/pdgstrf3d.c index f60c13e2..ed8cc562 100644 --- a/SRC/pdgstrf3d.c +++ b/SRC/pdgstrf3d.c @@ -214,7 +214,9 @@ int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm, int_t mrb = (nsupers + Pr - 1) / Pr; int_t mcb = (nsupers + Pc - 1) / Pc; HyP_t *HyP = (HyP_t *) SUPERLU_MALLOC(sizeof(HyP_t)); - Init_HyP(HyP, Llu, mcb, mrb); + + dInit_HyP(HyP, Llu, mcb, mrb); + HyP->first_l_block_acc = first_l_block_acc; HyP->first_u_block_acc = first_u_block_acc; int_t bigu_size = getBigUSize(nsupers, grid, LUstruct); @@ -274,8 +276,8 @@ int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm, sForest_t* sforest = sForests[myTreeIdxs[ilvl]]; - /*main loop over all the super nodes*/ - if (sforest) + /* main loop over all the supernodes */ + if (sforest) /* 2D factorization at individual subtree */ { double tilvl = SuperLU_timer_(); #ifdef GPU_ACC diff --git a/SRC/pzgstrf2.c b/SRC/pzgstrf2.c index 6329364d..6341fb64 100644 --- a/SRC/pzgstrf2.c +++ b/SRC/pzgstrf2.c @@ -430,7 +430,7 @@ void Local_Zgstrf2(superlu_dist_options_t *options, int_t k, double thresh, doublecomplex *ublk_ptr = BlockUFactor; doublecomplex *ujrow = BlockUFactor; int_t luptr = 0; /* Point_t to the diagonal entries. 
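
The int_t-to-int conversions in the hunk that follows are not cosmetic: the Fortran BLAS entry points take 32-bit integers unless an ILP64 BLAS is linked, so dimension and increment arguments must be plain int. A sketch of the convention, assuming a 64-bit int_t build and the standard dscal_ prototype (scale_column is an illustrative name):

#include <stdint.h>

typedef int64_t int_t;                  // as with -DXSDK_INDEX_SIZE=64

extern void dscal_(const int *n, const double *alpha, double *x, const int *incx);

void scale_column(int_t nsupc, double alpha, double *col)
{
    int n = (int) nsupc;                // narrow explicitly for the BLAS call
    int incx = 1;                       // must be int, not int_t
    dscal_(&n, &alpha, col, &incx);
}
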
*/ - int_t cols_left = nsupc; /* supernode size */ + int cols_left = nsupc; /* supernode size */ int_t u_diag_cnt = 0; int_t ld_ujrow = nsupc; /* leading dimension of ujrow */ int_t incx = 1; @@ -439,9 +439,9 @@ void Local_Zgstrf2(superlu_dist_options_t *options, int_t k, double thresh, for (int_t j = 0; j < jlst - jfst; ++j) /* for each column in panel */ { /* Diagonal pivot */ - int_t i = luptr; - if ( options->ReplaceTinyPivot == YES ) { - if ( slud_z_abs1(&lusup[i]) < thresh && + int i = luptr; + if ( options->ReplaceTinyPivot == YES ) { + if ( slud_z_abs1(&lusup[i]) < thresh && lusup[i].r != 0.0 && lusup[i].i != 0.0 ) { /* Diagonal */ #if ( PRNTlevel>=2 ) @@ -459,7 +459,7 @@ void Local_Zgstrf2(superlu_dist_options_t *options, int_t k, double thresh, } } - for (int_t l = 0; l < cols_left; ++l, i += nsupr, ++u_diag_cnt) + for (int l = 0; l < cols_left; ++l, i += nsupr, ++u_diag_cnt) { int_t st = j * ld_ujrow + j; ublk_ptr[st + l * ld_ujrow] = lusup[i]; /* copy one row of U */ @@ -470,7 +470,7 @@ void Local_Zgstrf2(superlu_dist_options_t *options, int_t k, double thresh, { *info = j + jfst + 1; } - else /* Scale the j-th column. */ + else /* Scale the j-th column within diagonal block. */ { doublecomplex temp; slud_z_div(&temp, &one, &ujrow[0]); @@ -483,7 +483,7 @@ void Local_Zgstrf2(superlu_dist_options_t *options, int_t k, double thresh, if (--cols_left) { /*following must be int*/ - int_t l = nsupc - j - 1; + int l = nsupc - j - 1; /* Rank-1 update */ #if 1 diff --git a/SRC/pzgstrf3d.c b/SRC/pzgstrf3d.c index f204c713..e33c4a23 100644 --- a/SRC/pzgstrf3d.c +++ b/SRC/pzgstrf3d.c @@ -213,7 +213,9 @@ int_t pzgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm, int_t mrb = (nsupers + Pr - 1) / Pr; int_t mcb = (nsupers + Pc - 1) / Pc; HyP_t *HyP = (HyP_t *) SUPERLU_MALLOC(sizeof(HyP_t)); - Init_HyP(HyP, Llu, mcb, mrb); + + zInit_HyP(HyP, Llu, mcb, mrb); + HyP->first_l_block_acc = first_l_block_acc; HyP->first_u_block_acc = first_u_block_acc; int_t bigu_size = getBigUSize(nsupers, grid, LUstruct); @@ -273,8 +275,8 @@ int_t pzgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm, sForest_t* sforest = sForests[myTreeIdxs[ilvl]]; - /*main loop over all the super nodes*/ - if (sforest) + /* main loop over all the supernodes */ + if (sforest) /* 2D factorization at individual subtree */ { double tilvl = SuperLU_timer_(); #ifdef GPU_ACC diff --git a/SRC/sec_structs.c b/SRC/sec_structs.c index d8e13c5a..d227ac4e 100644 --- a/SRC/sec_structs.c +++ b/SRC/sec_structs.c @@ -139,6 +139,7 @@ double getFreq(void) return 0; } +/* Initialize various counters. 
*/ void SCT_init(SCT_t* SCT) { #if 1 @@ -563,47 +564,6 @@ get_acc_offload () } -void Init_HyP(HyP_t* HyP, LocalLU_t *Llu, int_t mcb, int_t mrb ) -{ - HyP->last_offload = -1; -#if 0 - HyP->lookAhead_info = (Remain_info_t *) _mm_malloc((mrb) * sizeof(Remain_info_t), 64); - - HyP->lookAhead_L_buff = (double *) _mm_malloc( sizeof(double) * (Llu->bufmax[1]), 64); - - HyP->Remain_L_buff = (double *) _mm_malloc( sizeof(double) * (Llu->bufmax[1]), 64); - HyP->Remain_info = (Remain_info_t *) _mm_malloc(mrb * sizeof(Remain_info_t), 64); - HyP->Ublock_info_Phi = (Ublock_info_t *) _mm_malloc(mcb * sizeof(Ublock_info_t), 64); - HyP->Ublock_info = (Ublock_info_t *) _mm_malloc(mcb * sizeof(Ublock_info_t), 64); - HyP->Lblock_dirty_bit = (int_t *) _mm_malloc(mcb * sizeof(int_t), 64); - HyP->Ublock_dirty_bit = (int_t *) _mm_malloc(mrb * sizeof(int_t), 64); -#else - HyP->lookAhead_info = (Remain_info_t *) SUPERLU_MALLOC((mrb) * sizeof(Remain_info_t)); - HyP->lookAhead_L_buff = (double *) doubleMalloc_dist((Llu->bufmax[1])); - HyP->Remain_L_buff = (double *) doubleMalloc_dist((Llu->bufmax[1])); - HyP->Remain_info = (Remain_info_t *) SUPERLU_MALLOC(mrb * sizeof(Remain_info_t)); - HyP->Ublock_info_Phi = (Ublock_info_t *) SUPERLU_MALLOC(mcb * sizeof(Ublock_info_t)); - HyP->Ublock_info = (Ublock_info_t *) SUPERLU_MALLOC(mcb * sizeof(Ublock_info_t)); - HyP->Lblock_dirty_bit = (int_t *) intMalloc_dist(mcb); - HyP->Ublock_dirty_bit = (int_t *) intMalloc_dist(mrb); -#endif - - for (int_t i = 0; i < mcb; ++i) - { - HyP->Lblock_dirty_bit[i] = -1; - } - - for (int_t i = 0; i < mrb; ++i) - { - HyP->Ublock_dirty_bit[i] = -1; - } - - HyP->last_offload = -1; - HyP->superlu_acc_offload = get_acc_offload (); - - HyP->nCudaStreams =0; -} - void Free_HyP(HyP_t* HyP) { #if 0 diff --git a/SRC/superlu_ddefs.h b/SRC/superlu_ddefs.h index 3cca2576..011833cc 100644 --- a/SRC/superlu_ddefs.h +++ b/SRC/superlu_ddefs.h @@ -203,6 +203,9 @@ typedef struct int_t nsupr; } lPanelInfo_t; + + +/* HyP_t is the data structure to assist HALO offload of Schur-complement. */ typedef struct { Remain_info_t *lookAhead_info, *Remain_info; @@ -229,7 +232,7 @@ typedef struct int_t offloadCondition; int_t superlu_acc_offload; int_t nCudaStreams; -} HyP_t; /* Data structures for Schur complement update. */ +} HyP_t; typedef struct { @@ -558,7 +561,7 @@ extern void pdgssvx3d (superlu_dist_options_t *, SuperMatrix *, extern int_t pdgstrf3d(superlu_dist_options_t *, int m, int n, double anorm, trf3Dpartition_t*, SCT_t *, LUstruct_t *, gridinfo3d_t *, SuperLUStat_t *, int *); -extern void Init_HyP(HyP_t* HyP, LocalLU_t *Llu, int_t mcb, int_t mrb ); +extern void dInit_HyP(HyP_t* HyP, LocalLU_t *Llu, int_t mcb, int_t mrb ); extern void Free_HyP(HyP_t* HyP); extern int updateDirtyBit(int_t k0, HyP_t* HyP, gridinfo_t* grid); diff --git a/SRC/superlu_grid3d.c b/SRC/superlu_grid3d.c index ce8a8259..3a5ff2de 100644 --- a/SRC/superlu_grid3d.c +++ b/SRC/superlu_grid3d.c @@ -42,7 +42,7 @@ void superlu_gridinit3d(MPI_Comm Bcomm, /* The base communicator upon which MPI_Comm_size( Bcomm, &info ); if ( info < Np ) - ABORT("Number of processes is smaller than NPROW * NPCOL"); + ABORT("Number of processes is smaller than NPROW * NPCOL * NPDEP"); superlu_gridmap3d(Bcomm, nprow, npcol, npdep, grid); diff --git a/SRC/superlu_zdefs.h b/SRC/superlu_zdefs.h index a6f395ac..f01f1f3f 100644 --- a/SRC/superlu_zdefs.h +++ b/SRC/superlu_zdefs.h @@ -203,6 +203,9 @@ typedef struct int_t nsupr; } lPanelInfo_t; + + +/* HyP_t is the data structure to assist HALO offload of Schur-complement. 
*/
 typedef struct
 {
     Remain_info_t *lookAhead_info, *Remain_info;
     Ublock_info_t *Ublock_info, *Ublock_info_Phi;
@@ -229,7 +232,7 @@ typedef struct
     int_t offloadCondition;
     int_t superlu_acc_offload;
     int_t nCudaStreams;
-} HyP_t; /* Data structures for Schur complement update. */
+} HyP_t;

 typedef struct
 {
@@ -561,7 +564,7 @@ extern void pzgssvx3d (superlu_dist_options_t *, SuperMatrix *,
 extern int_t pzgstrf3d(superlu_dist_options_t *, int m, int n, double anorm,
 		       trf3Dpartition_t*, SCT_t *, LUstruct_t *, gridinfo3d_t *,
 		       SuperLUStat_t *, int *);
-extern void Init_HyP(HyP_t* HyP, LocalLU_t *Llu, int_t mcb, int_t mrb );
+extern void zInit_HyP(HyP_t* HyP, LocalLU_t *Llu, int_t mcb, int_t mrb );
 extern void Free_HyP(HyP_t* HyP);
 extern int updateDirtyBit(int_t k0, HyP_t* HyP, gridinfo_t* grid);
diff --git a/SRC/util_dist.h b/SRC/util_dist.h
index 3456f96a..06b65edd 100644
--- a/SRC/util_dist.h
+++ b/SRC/util_dist.h
@@ -163,6 +163,10 @@ typedef struct {
 /***********************************************************************
  * For 3D code */
+/* SCT_t was initially a Schur-complement counter to compute different
+   metrics of the Schur-complement update.
+   Later, it was extended with counters to keep track of many other metrics.
+*/
 typedef struct
 {
     int_t datatransfer_count;
diff --git a/SRC/ztrfAux.c b/SRC/ztrfAux.c
index 9347db6d..e96cc496 100644
--- a/SRC/ztrfAux.c
+++ b/SRC/ztrfAux.c
@@ -25,6 +25,48 @@ at the top-level directory.
 #include "trfAux.h"
 #endif

+/* Initialize the data structure to assist HALO offload of Schur-complement. */
+void zInit_HyP(HyP_t* HyP, LocalLU_t *Llu, int_t mcb, int_t mrb )
+{
+    HyP->last_offload = -1;
+#if 0
+    HyP->lookAhead_info = (Remain_info_t *) _mm_malloc((mrb) * sizeof(Remain_info_t), 64);
+
+    HyP->lookAhead_L_buff = (doublecomplex *) _mm_malloc( sizeof(doublecomplex) * (Llu->bufmax[1]), 64);
+
+    HyP->Remain_L_buff = (doublecomplex *) _mm_malloc( sizeof(doublecomplex) * (Llu->bufmax[1]), 64);
+    HyP->Remain_info = (Remain_info_t *) _mm_malloc(mrb * sizeof(Remain_info_t), 64);
+    HyP->Ublock_info_Phi = (Ublock_info_t *) _mm_malloc(mcb * sizeof(Ublock_info_t), 64);
+    HyP->Ublock_info = (Ublock_info_t *) _mm_malloc(mcb * sizeof(Ublock_info_t), 64);
+    HyP->Lblock_dirty_bit = (int_t *) _mm_malloc(mcb * sizeof(int_t), 64);
+    HyP->Ublock_dirty_bit = (int_t *) _mm_malloc(mrb * sizeof(int_t), 64);
+#else
+    HyP->lookAhead_info = (Remain_info_t *) SUPERLU_MALLOC((mrb) * sizeof(Remain_info_t));
+    HyP->lookAhead_L_buff = (doublecomplex *) doublecomplexMalloc_dist((Llu->bufmax[1]));
+    HyP->Remain_L_buff = (doublecomplex *) doublecomplexMalloc_dist((Llu->bufmax[1]));
+    HyP->Remain_info = (Remain_info_t *) SUPERLU_MALLOC(mrb * sizeof(Remain_info_t));
+    HyP->Ublock_info_Phi = (Ublock_info_t *) SUPERLU_MALLOC(mcb * sizeof(Ublock_info_t));
+    HyP->Ublock_info = (Ublock_info_t *) SUPERLU_MALLOC(mcb * sizeof(Ublock_info_t));
+    HyP->Lblock_dirty_bit = (int_t *) intMalloc_dist(mcb);
+    HyP->Ublock_dirty_bit = (int_t *) intMalloc_dist(mrb);
+#endif
+
+    for (int_t i = 0; i < mcb; ++i)
+    {
+        HyP->Lblock_dirty_bit[i] = -1;
+    }
+
+    for (int_t i = 0; i < mrb; ++i)
+    {
+        HyP->Ublock_dirty_bit[i] = -1;
+    }
+
+    HyP->last_offload = -1;
+    HyP->superlu_acc_offload = get_acc_offload ();
+
+    HyP->nCudaStreams =0;
+} /* zInit_HyP */
+
 /*init3DLUstruct with forest interface */
 void zinit3DLUstructForest( int_t* myTreeIdxs, int_t* myZeroTrIdxs,
                             sForest_t** sForests, LUstruct_t* LUstruct,

From 264c3aef90b4d0c43a41ee8cfcd1c946680d463d Mon Sep 17 00:00:00 2001
From: jacobrking
Date: Thu, 7 May 2020 18:16:39 -0600
Subject: [PATCH 016/147] Standardize binary output
file name. --- SRC/dbinary_io.c | 2 +- SRC/zbinary_io.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/SRC/dbinary_io.c b/SRC/dbinary_io.c index bb842852..65d29f06 100644 --- a/SRC/dbinary_io.c +++ b/SRC/dbinary_io.c @@ -27,7 +27,7 @@ dwrite_binary(int_t n, int_t nnz, FILE *fp1; int nnz_written; size_t isize = sizeof(int_t), dsize = sizeof(double); - fp1 = fopen("/scratch/scratchdirs/xiaoye/temp.bin", "wb"); + fp1 = fopen("matrix.bin", "wb"); fwrite(&n, isize, 1, fp1); fwrite(&nnz, isize, 1, fp1); fwrite(colptr, isize, n+1, fp1); diff --git a/SRC/zbinary_io.c b/SRC/zbinary_io.c index e2957379..d49b9a8a 100644 --- a/SRC/zbinary_io.c +++ b/SRC/zbinary_io.c @@ -27,7 +27,7 @@ zwrite_binary(int_t n, int_t nnz, FILE *fp1; int nnz_written; size_t isize = sizeof(int_t), dsize = sizeof(double); - fp1 = fopen("/scratch/scratchdirs/xiaoye/temp.bin", "wb"); + fp1 = fopen("cmatrix.bin", "wb"); fwrite(&n, isize, 1, fp1); fwrite(&nnz, isize, 1, fp1); fwrite(colptr, isize, n+1, fp1); From 0b24e73e24d34d389465a531fd23eff04a671c2a Mon Sep 17 00:00:00 2001 From: jacobrking Date: Thu, 7 May 2020 18:17:16 -0600 Subject: [PATCH 017/147] Merge code from dgather.c --- SRC/zgather.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/SRC/zgather.c b/SRC/zgather.c index ce00a643..113dd311 100644 --- a/SRC/zgather.c +++ b/SRC/zgather.c @@ -378,7 +378,10 @@ void zRgather_U( int_t k, int_t jj0, int_t *usub, doublecomplex *uval, } HyP->bigU_Phi = bigU; - HyP->bigU_host = bigU + HyP->ldu_Phi * HyP->Ublock_info_Phi[HyP->num_u_blks_Phi - 1].full_u_cols; + if ( HyP->num_u_blks_Phi == 0 ) + HyP->bigU_host = bigU; + else + HyP->bigU_host = bigU + HyP->ldu_Phi * HyP->Ublock_info_Phi[HyP->num_u_blks_Phi - 1].full_u_cols; zgather_u(HyP->num_u_blks, HyP->Ublock_info, usub, uval, HyP->bigU_host, HyP->ldu, xsup, klst ); From 0237ee3f97d5d26292982e06f3cfed6456d1f514 Mon Sep 17 00:00:00 2001 From: Sherry Li Date: Wed, 13 May 2020 17:15:10 -0700 Subject: [PATCH 018/147] in superlu_defs.h: remove #include ... non-standard C++ add #ifdef _OPENMP around some OpenMP functions. on MacOS, use mpicxx instead of mpiCC for C++ compiler. --- SRC/dgather.c | 10 +--------- SRC/dscatter3d.c | 29 +++++++++++++++++++++++++++-- SRC/dtreeFactorization.c | 4 ++++ SRC/dtrfCommWrapper.c | 14 +++++++++++--- SRC/pd3dcomm.c | 2 +- SRC/pdgssvx3d.c | 4 ++-- SRC/pdgstrf2.c | 2 +- SRC/pz3dcomm.c | 2 +- SRC/pzgssvx3d.c | 7 ++++--- SRC/pzgstrf2.c | 2 +- SRC/sec_structs.c | 14 ++++++++++---- SRC/superlu_ddefs.h | 4 ++++ SRC/superlu_defs.h | 6 +++--- SRC/superlu_zdefs.h | 4 ++++ SRC/trfAux.c | 4 +++- SRC/util_dist.h | 1 + SRC/zgather.c | 5 ++++- SRC/zscatter3d.c | 32 +++++++++++++++++++++++++++++--- SRC/ztreeFactorization.c | 11 ++++++++++- SRC/ztrfCommWrapper.c | 14 +++++++++++--- 20 files changed, 132 insertions(+), 39 deletions(-) diff --git a/SRC/dgather.c b/SRC/dgather.c index 887963c0..f573b6be 100644 --- a/SRC/dgather.c +++ b/SRC/dgather.c @@ -378,16 +378,8 @@ void dRgather_U( int_t k, int_t jj0, int_t *usub, double *uval, HyP->Ublock_info_Phi[j].full_u_cols = HyP->Ublock_info_Phi[j ].ncols + HyP->Ublock_info_Phi[j - 1].full_u_cols; } -#if 0 - if (!grid->iam) { // Sherry to remove - printf(".. 
k %d jj0 %d\t num_u_blks_phi %d\t mcb %d\n", k, jj0, HyP->num_u_blks_Phi, - (HyP->nsupers + grid->npcol - 1) / grid->npcol); - fflush(stdout); - } -#endif - HyP->bigU_Phi = bigU; - if ( HyP->num_u_blks_Phi == 0 ) + if ( HyP->num_u_blks_Phi == 0 ) // Sherry fix HyP->bigU_host = bigU; else HyP->bigU_host = bigU + HyP->ldu_Phi * HyP->Ublock_info_Phi[HyP->num_u_blks_Phi - 1].full_u_cols; diff --git a/SRC/dscatter3d.c b/SRC/dscatter3d.c index bd887361..2b8ce2ea 100644 --- a/SRC/dscatter3d.c +++ b/SRC/dscatter3d.c @@ -17,9 +17,12 @@ at the top-level directory. #ifdef __INTEL_COMPILER #include "mkl.h" #else -#include "cblas.h" +//#include "cblas.h" #endif + +#ifdef _OPENMP #include "omp.h" +#endif #define ISORT #define SCATTER_U_CPU scatter_u @@ -95,7 +98,11 @@ dblock_gemm_scatter( int_t lb, int_t j, ) { // return ; +#ifdef _OPENMP thread_id = omp_get_thread_num(); +#else + thread_id = 0; +#endif int_t *indirect_thread = indirect + ldt * thread_id; int_t *indirect2_thread = indirect2 + ldt * thread_id; double *tempv1 = bigV + thread_id * ldt * ldt; @@ -200,6 +207,7 @@ dblock_gemm_scatter( int_t lb, int_t j, #endif } /* dblock_gemm_scatter */ +#ifdef _OPENMP /*this version uses a lock to prevent multiple thread updating the same block*/ void dblock_gemm_scatter_lock( int_t lb, int_t j, @@ -313,6 +321,7 @@ dblock_gemm_scatter_lock( int_t lb, int_t j, += t_s; #endif } /* dblock_gemm_scatter_lock */ +#endif // only if _OPENMP is defined // there are following three variations of block_gemm_scatter call /* @@ -353,7 +362,11 @@ int_t dblock_gemm_scatterTopLeft( int_t lb, /* block number in L */ int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; double** Unzval_br_ptr = Llu->Unzval_br_ptr; +#ifdef _OPENMP volatile int_t thread_id = omp_get_thread_num(); +#else + volatile int_t thread_id = 0; +#endif // printf("Thread's ID %lld \n", thread_id); unsigned long long t1 = _rdtsc(); @@ -389,7 +402,11 @@ int_t dblock_gemm_scatterTopRight( int_t lb, int_t j, int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; double** Unzval_br_ptr = Llu->Unzval_br_ptr; +#ifdef _OPENMP volatile int_t thread_id = omp_get_thread_num(); +#else + volatile int_t thread_id = 0; +#endif unsigned long long t1 = _rdtsc(); dblock_gemm_scatter( lb, j, HyP->Ublock_info_Phi, HyP->lookAhead_info, HyP->lookAhead_L_buff, HyP->Lnbrow, HyP->bigU_Phi, HyP->ldu_Phi, @@ -420,7 +437,11 @@ int_t dblock_gemm_scatterBottomLeft( int_t lb, int_t j, int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; double** Unzval_br_ptr = Llu->Unzval_br_ptr; +#ifdef _OPENMP volatile int_t thread_id = omp_get_thread_num(); +#else + volatile int_t thread_id = 0; +#endif //printf("Thread's ID %lld \n", thread_id); unsigned long long t1 = _rdtsc(); dblock_gemm_scatter( lb, j, HyP->Ublock_info, HyP->Remain_info, HyP->Remain_L_buff, HyP->Rnbrow, @@ -453,7 +474,11 @@ int_t dblock_gemm_scatterBottomRight( int_t lb, int_t j, int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; double** Unzval_br_ptr = Llu->Unzval_br_ptr; - volatile int_t thread_id = omp_get_thread_num(); +#ifdef _OPENMP + volatile int_t thread_id = omp_get_thread_num(); +#else + volatile int_t thread_id = 0; +#endif // printf("Thread's ID %lld \n", thread_id); unsigned long long t1 = _rdtsc(); dblock_gemm_scatter( lb, j, HyP->Ublock_info_Phi, HyP->Remain_info, HyP->Remain_L_buff, HyP->Rnbrow, diff --git a/SRC/dtreeFactorization.c b/SRC/dtreeFactorization.c index 
36d34ce8..e22be396 100644 --- a/SRC/dtreeFactorization.c +++ b/SRC/dtreeFactorization.c @@ -247,7 +247,11 @@ int_t ddenseTreeFactor( int_t klst = FstBlockC (k + 1); int_t *lsub = lPanelInfo->lsub; int_t *usub = uPanelInfo->usub; +#ifdef _OPENMP int_t thread_id = omp_get_thread_num(); +#else + int_t thread_id = 0; +#endif dblock_gemm_scatter( lb, ub, Ublock_info, Remain_info, diff --git a/SRC/dtrfCommWrapper.c b/SRC/dtrfCommWrapper.c index b1c73469..77ea33aa 100644 --- a/SRC/dtrfCommWrapper.c +++ b/SRC/dtrfCommWrapper.c @@ -29,7 +29,7 @@ at the top-level directory. #ifdef __INTEL_COMPILER #include "mkl.h" #else -#include "cblas.h" +//#include "cblas.h" #endif int_t dDiagFactIBCast(int_t k, int_t k0, // supernode to be factored @@ -300,9 +300,13 @@ int_t dUPanelTrSolve( int_t k, // #pragma omp for schedule(dynamic,2) nowait for (int_t b = 0; b < nb; ++b) { - #pragma omp task + #pragma omp task { +#ifdef _OPENMP int_t thread_id = omp_get_thread_num(); +#else + int_t thread_id = 0; +#endif double *tempv = bigV + thread_id * ldt * ldt; dTrs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp, usub, uval, tempv, nsupc, nsupc, lusup, Glu_persist); @@ -347,9 +351,13 @@ int_t dUPanelTrSolve( int_t k, // printf("%d :U update \n", k); for (int_t b = 0; b < nb; ++b) { - #pragma omp task + #pragma omp task { +#ifdef _OPENMP int_t thread_id = omp_get_thread_num(); +#else + int_t thread_id = 0; +#endif double *tempv = bigV + thread_id * ldt * ldt; dTrs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp, usub, uval, tempv, nsupc, nsupr, lusup, Glu_persist); diff --git a/SRC/pd3dcomm.c b/SRC/pd3dcomm.c index e1e03d76..6e4e68b0 100644 --- a/SRC/pd3dcomm.c +++ b/SRC/pd3dcomm.c @@ -19,7 +19,7 @@ at the top-level directory. * May 10, 2019 */ #include "superlu_ddefs.h" -#include "cblas.h" +// #include "cblas.h" #if 0 #include "p3dcomm.h" #include "sec_structs.h" diff --git a/SRC/pdgssvx3d.c b/SRC/pdgssvx3d.c index 1bbcc017..d8ea203d 100644 --- a/SRC/pdgssvx3d.c +++ b/SRC/pdgssvx3d.c @@ -32,7 +32,7 @@ at the top-level directory. * Purpose * ======= * - * PDGSSVX solves a system of linear equations A*X=B, + * PDGSSVX3D solves a system of linear equations A*X=B, * by using Gaussian elimination with "static pivoting" to * compute the LU factorization of A. * @@ -1347,7 +1347,7 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, ------------------------------------------------------*/ if (options->SolveInitialized == NO) /* First time */ /* Inside this routine, SolveInitialized is set to YES. - For repeated call to pdgssvx(), no need to re-initialilze + For repeated call to pdgssvx3d(), no need to re-initialilze the Solve data & communication structures, unless a new factorization with Fact == DOFACT or SamePattern is asked for. */ { diff --git a/SRC/pdgstrf2.c b/SRC/pdgstrf2.c index c122b92a..c349f0b8 100644 --- a/SRC/pdgstrf2.c +++ b/SRC/pdgstrf2.c @@ -77,7 +77,7 @@ at the top-level directory. #include #include "superlu_ddefs.h" -#include "cblas.h" +//#include "cblas.h" /***************************************************************************** * The following pdgstrf2_trsm is in version 6 and earlier. diff --git a/SRC/pz3dcomm.c b/SRC/pz3dcomm.c index 55b313e1..dba7ba75 100644 --- a/SRC/pz3dcomm.c +++ b/SRC/pz3dcomm.c @@ -18,7 +18,7 @@ at the top-level directory. 
* May 10, 2019 */ #include "superlu_zdefs.h" -#include "cblas.h" +// #include "cblas.h" #if 0 #include "p3dcomm.h" #include "sec_structs.h" diff --git a/SRC/pzgssvx3d.c b/SRC/pzgssvx3d.c index 6914ba0a..9a73cdad 100644 --- a/SRC/pzgssvx3d.c +++ b/SRC/pzgssvx3d.c @@ -31,7 +31,7 @@ at the top-level directory. * Purpose * ======= * - * PZGSSVX solves a system of linear equations A*X=B, + * PZGSSVX3D solves a system of linear equations A*X=B, * by using Gaussian elimination with "static pivoting" to * compute the LU factorization of A. * @@ -1191,10 +1191,11 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, /* send the LU structure to all the grids */ zp3dScatter(n, LUstruct, grid3d); + int_t nsupers = getNsupers(n, LUstruct); trf3Dpartition = zinitTrf3Dpartition(nsupers, options, LUstruct, grid3d); - SCT_t *SCT = (SCT_t *) SUPERLU_MALLOC(sizeof(SCT_t)); + SCT_t *SCT = (SCT_t *) SUPERLU_MALLOC(sizeof(SCT_t)); SCT_init(SCT); #if ( PRNTlevel>=1 ) @@ -1347,7 +1348,7 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, ------------------------------------------------------*/ if (options->SolveInitialized == NO) /* First time */ /* Inside this routine, SolveInitialized is set to YES. - For repeated call to pzgssvx(), no need to re-initialilze + For repeated call to pzgssvx3d(), no need to re-initialilze the Solve data & communication structures, unless a new factorization with Fact == DOFACT or SamePattern is asked for. */ { diff --git a/SRC/pzgstrf2.c b/SRC/pzgstrf2.c index 6341fb64..3b9d6137 100644 --- a/SRC/pzgstrf2.c +++ b/SRC/pzgstrf2.c @@ -76,7 +76,7 @@ at the top-level directory. #include #include "superlu_zdefs.h" -#include "cblas.h" +//#include "cblas.h" /***************************************************************************** * The following pzgstrf2_trsm is in version 6 and earlier. diff --git a/SRC/sec_structs.c b/SRC/sec_structs.c index d227ac4e..7b5ea143 100644 --- a/SRC/sec_structs.c +++ b/SRC/sec_structs.c @@ -147,14 +147,17 @@ void SCT_init(SCT_t* SCT) #else CPU_CLOCK_RATE = 3000. 
* 1e-3; #endif - int num_threads; - #pragma omp parallel default(shared) + int num_threads = 1; + +#ifdef _OPENMP +#pragma omp parallel default(shared) { #pragma omp master { num_threads = omp_get_num_threads (); } } +#endif SCT->acc_load_imbal = 0.0; @@ -408,14 +411,17 @@ Displays as function_name \t value \t units; // #include "mkl.h" void SCT_print(gridinfo_t *grid, SCT_t* SCT) { - int num_threads; - #pragma omp parallel default(shared) + int num_threads = 1; + +#ifdef _OPENMP +#pragma omp parallel default(shared) { #pragma omp master { num_threads = omp_get_num_threads (); } } +#endif CPU_CLOCK_RATE = 1e9 * CPU_CLOCK_RATE; int iam = grid->iam; diff --git a/SRC/superlu_ddefs.h b/SRC/superlu_ddefs.h index 011833cc..05b71fdc 100644 --- a/SRC/superlu_ddefs.h +++ b/SRC/superlu_ddefs.h @@ -582,6 +582,8 @@ dblock_gemm_scatter( int_t lb, int_t j, Ublock_info_t *Ublock_info, , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer #endif ); + +#ifdef _OPENMP /*this version uses a lock to prevent multiple thread updating the same block*/ extern void dblock_gemm_scatter_lock( int_t lb, int_t j, omp_lock_t* lock, @@ -600,6 +602,8 @@ dblock_gemm_scatter_lock( int_t lb, int_t j, omp_lock_t* lock, , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer #endif ); +#endif + extern int_t dblock_gemm_scatterTopLeft( int_t lb, int_t j, double* bigV, int_t knsupc, int_t klst, int_t* lsub, diff --git a/SRC/superlu_defs.h b/SRC/superlu_defs.h index 4c657b3a..b45e2659 100644 --- a/SRC/superlu_defs.h +++ b/SRC/superlu_defs.h @@ -35,16 +35,16 @@ at the top-level directory. #include #endif -//#ifdef _OPENMP +#ifdef _OPENMP #include -//#endif +#endif #include #include #include #include #include -//#include +// #include #include #include //#include Sherry: not available on Mac OS diff --git a/SRC/superlu_zdefs.h b/SRC/superlu_zdefs.h index f01f1f3f..f4c86e0b 100644 --- a/SRC/superlu_zdefs.h +++ b/SRC/superlu_zdefs.h @@ -585,6 +585,8 @@ zblock_gemm_scatter( int_t lb, int_t j, Ublock_info_t *Ublock_info, , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer #endif ); + +#ifdef _OPENMP /*this version uses a lock to prevent multiple thread updating the same block*/ extern void zblock_gemm_scatter_lock( int_t lb, int_t j, omp_lock_t* lock, @@ -603,6 +605,8 @@ zblock_gemm_scatter_lock( int_t lb, int_t j, omp_lock_t* lock, , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer #endif ); +#endif + extern int_t zblock_gemm_scatterTopLeft( int_t lb, int_t j, doublecomplex* bigV, int_t knsupc, int_t klst, int_t* lsub, diff --git a/SRC/trfAux.c b/SRC/trfAux.c index ec8ea2c4..c6ef1535 100644 --- a/SRC/trfAux.c +++ b/SRC/trfAux.c @@ -51,7 +51,8 @@ int set_tag_ub() int getNumThreads(int iam) { - int num_threads; + int num_threads = 1; +#ifdef _OPENMP #pragma omp parallel default(shared) { #pragma omp master @@ -60,6 +61,7 @@ int getNumThreads(int iam) } } +#endif if (!iam) { diff --git a/SRC/util_dist.h b/SRC/util_dist.h index 06b65edd..efc68c64 100644 --- a/SRC/util_dist.h +++ b/SRC/util_dist.h @@ -19,6 +19,7 @@ at the top-level directory. 
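
The thread_id and num_threads edits in this patch all follow one pattern: query OpenMP when it is compiled in, otherwise fall back to a single thread. A self-contained sketch of that pattern (get_thread_id and get_num_threads are illustrative helpers, not library functions):

#ifdef _OPENMP
#include <omp.h>
#endif

static int get_thread_id(void)
{
#ifdef _OPENMP
    return omp_get_thread_num();
#else
    return 0;                      /* serial build: only thread 0 exists */
#endif
}

static int get_num_threads(void)
{
    int num_threads = 1;           /* safe default without OpenMP */
#ifdef _OPENMP
#pragma omp parallel default(shared)
    {
#pragma omp master
        num_threads = omp_get_num_threads();
    }
#endif
    return num_threads;
}
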
#include #include #include + #include "superlu_enum_consts.h" /* diff --git a/SRC/zgather.c b/SRC/zgather.c index ce00a643..96e01cfa 100644 --- a/SRC/zgather.c +++ b/SRC/zgather.c @@ -378,7 +378,10 @@ void zRgather_U( int_t k, int_t jj0, int_t *usub, doublecomplex *uval, } HyP->bigU_Phi = bigU; - HyP->bigU_host = bigU + HyP->ldu_Phi * HyP->Ublock_info_Phi[HyP->num_u_blks_Phi - 1].full_u_cols; + if ( HyP->num_u_blks_Phi == 0 ) // Sherry fix + HyP->bigU_host = bigU; + else + HyP->bigU_host = bigU + HyP->ldu_Phi * HyP->Ublock_info_Phi[HyP->num_u_blks_Phi - 1].full_u_cols; zgather_u(HyP->num_u_blks, HyP->Ublock_info, usub, uval, HyP->bigU_host, HyP->ldu, xsup, klst ); diff --git a/SRC/zscatter3d.c b/SRC/zscatter3d.c index 66bb5af8..7e4b18fd 100644 --- a/SRC/zscatter3d.c +++ b/SRC/zscatter3d.c @@ -16,9 +16,12 @@ at the top-level directory. #ifdef __INTEL_COMPILER #include "mkl.h" #else -#include "cblas.h" +//#include "cblas.h" #endif + +#ifdef _OPENMP #include "omp.h" +#endif #define ISORT #define SCATTER_U_CPU scatter_u @@ -94,7 +97,11 @@ zblock_gemm_scatter( int_t lb, int_t j, ) { // return ; +#ifdef _OPENMP thread_id = omp_get_thread_num(); +#else + thread_id = 0; +#endif int_t *indirect_thread = indirect + ldt * thread_id; int_t *indirect2_thread = indirect2 + ldt * thread_id; doublecomplex *tempv1 = bigV + thread_id * ldt * ldt; @@ -198,6 +205,7 @@ zblock_gemm_scatter( int_t lb, int_t j, #endif } /* zblock_gemm_scatter */ +#ifdef _OPENMP /*this version uses a lock to prevent multiple thread updating the same block*/ void zblock_gemm_scatter_lock( int_t lb, int_t j, @@ -311,6 +319,8 @@ zblock_gemm_scatter_lock( int_t lb, int_t j, += t_s; #endif } /* zblock_gemm_scatter_lock */ +#endif // only if _OPENMP is defined + // there are following three variations of block_gemm_scatter call /* @@ -351,8 +361,12 @@ int_t zblock_gemm_scatterTopLeft( int_t lb, /* block number in L */ int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; doublecomplex** Unzval_br_ptr = Llu->Unzval_br_ptr; +#ifdef _OPENMP volatile int_t thread_id = omp_get_thread_num(); - +#else + volatile int_t thread_id = 0; +#endif + // printf("Thread's ID %lld \n", thread_id); unsigned long long t1 = _rdtsc(); zblock_gemm_scatter( lb, j, HyP->Ublock_info, HyP->lookAhead_info, @@ -387,7 +401,11 @@ int_t zblock_gemm_scatterTopRight( int_t lb, int_t j, int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; doublecomplex** Unzval_br_ptr = Llu->Unzval_br_ptr; +#ifdef _OPENMP volatile int_t thread_id = omp_get_thread_num(); +#else + volatile int_t thread_id = 0; +#endif unsigned long long t1 = _rdtsc(); zblock_gemm_scatter( lb, j, HyP->Ublock_info_Phi, HyP->lookAhead_info, HyP->lookAhead_L_buff, HyP->Lnbrow, HyP->bigU_Phi, HyP->ldu_Phi, @@ -418,7 +436,11 @@ int_t zblock_gemm_scatterBottomLeft( int_t lb, int_t j, int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; doublecomplex** Unzval_br_ptr = Llu->Unzval_br_ptr; +#ifdef _OPENMP volatile int_t thread_id = omp_get_thread_num(); +#else + volatile int_t thread_id = 0; +#endif //printf("Thread's ID %lld \n", thread_id); unsigned long long t1 = _rdtsc(); zblock_gemm_scatter( lb, j, HyP->Ublock_info, HyP->Remain_info, HyP->Remain_L_buff, HyP->Rnbrow, @@ -451,7 +473,11 @@ int_t zblock_gemm_scatterBottomRight( int_t lb, int_t j, int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; doublecomplex** Unzval_br_ptr = 
Llu->Unzval_br_ptr;
-    volatile int_t thread_id = omp_get_thread_num();
+#ifdef _OPENMP
+    volatile int_t thread_id = omp_get_thread_num();
+#else
+    volatile int_t thread_id = 0;
+#endif
     // printf("Thread's ID %lld \n", thread_id);
     unsigned long long t1 = _rdtsc();
     zblock_gemm_scatter( lb, j, HyP->Ublock_info_Phi, HyP->Remain_info, HyP->Remain_L_buff, HyP->Rnbrow,
diff --git a/SRC/ztreeFactorization.c b/SRC/ztreeFactorization.c
index 4c677274..c33272e0 100644
--- a/SRC/ztreeFactorization.c
+++ b/SRC/ztreeFactorization.c
@@ -36,7 +36,12 @@ int_t zLluBufInit(zLUValSubBuf_t* LUvsb, LUstruct_t *LUstruct)
 diagFactBufs_t** zinitDiagFactBufsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* grid)
 {
     diagFactBufs_t** dFBufs;
-    dFBufs = (diagFactBufs_t** ) SUPERLU_MALLOC(mxLeafNode * sizeof(diagFactBufs_t*));
+
+    /* Sherry fix:
+     * mxLeafNode can be 0 for the replicated layers of the processes ?? */
+    if ( mxLeafNode ) dFBufs = (diagFactBufs_t** )
+        SUPERLU_MALLOC(mxLeafNode * sizeof(diagFactBufs_t*));
+
     for (int i = 0; i < mxLeafNode; ++i)
     {
         /* code */
@@ -241,7 +246,11 @@ int_t zdenseTreeFactor(
         int_t klst = FstBlockC (k + 1);
         int_t *lsub = lPanelInfo->lsub;
         int_t *usub = uPanelInfo->usub;
+#ifdef _OPENMP
         int_t thread_id = omp_get_thread_num();
+#else
+        int_t thread_id = 0;
+#endif
         zblock_gemm_scatter( lb, ub, Ublock_info, Remain_info,
diff --git a/SRC/ztrfCommWrapper.c b/SRC/ztrfCommWrapper.c
index f7027b7f..a9cf4f11 100644
--- a/SRC/ztrfCommWrapper.c
+++ b/SRC/ztrfCommWrapper.c
@@ -28,7 +28,7 @@ at the top-level directory.
 #ifdef __INTEL_COMPILER
 #include "mkl.h"
 #else
-#include "cblas.h"
+//#include "cblas.h"
 #endif
 int_t zDiagFactIBCast(int_t k, int_t k0, // supernode to be factored
@@ -299,9 +299,13 @@ int_t zUPanelTrSolve( int_t k,
     // #pragma omp for schedule(dynamic,2) nowait
     for (int_t b = 0; b < nb; ++b)
     {
-        #pragma omp task
+        #pragma omp task
         {
+#ifdef _OPENMP
             int_t thread_id = omp_get_thread_num();
+#else
+            int_t thread_id = 0;
+#endif
             doublecomplex *tempv = bigV + thread_id * ldt * ldt;
             zTrs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp, usub, uval, tempv, nsupc, nsupc, lusup, Glu_persist);
@@ -346,9 +350,13 @@ int_t zUPanelTrSolve( int_t k,
     // printf("%d :U update \n", k);
     for (int_t b = 0; b < nb; ++b)
     {
-        #pragma omp task
+        #pragma omp task
         {
+#ifdef _OPENMP
             int_t thread_id = omp_get_thread_num();
+#else
+            int_t thread_id = 0;
+#endif
             doublecomplex *tempv = bigV + thread_id * ldt * ldt;
             zTrs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp, usub, uval, tempv, nsupc, nsupr, lusup, Glu_persist);

From 02dc18fc84b9eb1df9fd0e3fd13b11ffa062a51d Mon Sep 17 00:00:00 2001
From: piyush sao
Date: Mon, 17 Aug 2020 23:09:32 -0400
Subject: [PATCH 019/147] Allowing debug build to use "O0" optimization level

---
 CMakeLists.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b7b92edb..62a5b283 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -140,6 +140,8 @@ set(CMAKE_CXX_FLAGS_RELEASE "-O3" CACHE STRING "")
 #
 ######################################################################
 #
+set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0")
+set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O0")
 #--------------------- MPI ---------------------
 find_package(MPI)
 if(MPI_C_FOUND)

From fcf9aa8dfaefeca87aecfbaad1f677bd8ce8ca67 Mon Sep 17 00:00:00 2001
From: xiaoye
Date: Tue, 8 Sep 2020 21:37:39 -0700
Subject: [PATCH 020/147] Fixed problems with 64bit index.
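
In sketch form, the rule this fix enforces is that the MPI datatype in each call must match the C declaration of the buffer; with 64-bit indices, MPI_INT and mpi_int_t are no longer interchangeable. (bcast_flags is an illustrative helper; the real definition of mpi_int_t lives in superlu_defs.h.)

#include <mpi.h>
#include <stdint.h>

typedef int64_t int_t;             /* as built with -DXSDK_INDEX_SIZE=64 */
#define mpi_int_t MPI_INT64_T      /* assumed MPI type matching int_t */

void bcast_flags(int *ToRecv, int nsupers, int_t *xsup, int xsup_len,
                 MPI_Comm comm)
{
    MPI_Bcast(ToRecv, nsupers, MPI_INT, 0, comm);     /* int buffer   */
    MPI_Bcast(xsup, xsup_len, mpi_int_t, 0, comm);    /* int_t buffer */
}

Broadcasting ToRecv with mpi_int_t, as the old code did, would read twice as many bytes as the int array holds on a 64-bit-index build.
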
The following arrays are of type 'int': *indirect, *ToRecv, *ToSendD, **ToSendR --- EXAMPLE/Makefile | 2 +- SRC/communication_aux.c | 2 +- SRC/dbinary_io.c | 2 + SRC/dcommunication_aux.c | 8 +- SRC/dscatter3d.c | 38 ++++---- SRC/dtreeFactorization.c | 8 +- SRC/dtrfAux.c | 4 +- SRC/dtrfCommWrapper.c | 30 +++---- SRC/pd3dcomm.c | 44 +++++---- SRC/pdgsmv.c | 8 +- SRC/pdgssvx.c | 1 + SRC/pdgssvx3d.c | 1 + SRC/pdgstrf2.c | 24 ++--- SRC/pdutil.c | 2 +- SRC/psymbfact.c | 188 +++++++++++++++++++-------------------- SRC/psymbfact_util.c | 16 ++-- SRC/pz3dcomm.c | 40 ++++++--- SRC/pzgsmv.c | 8 +- SRC/pzgssvx.c | 2 + SRC/pzgssvx3d.c | 2 + SRC/pzgssvx_ABglobal.c | 2 + SRC/pzgstrf2.c | 20 ++--- SRC/pzutil.c | 2 +- SRC/superlu_ddefs.h | 28 +++--- SRC/superlu_defs.h | 8 +- SRC/superlu_zdefs.h | 28 +++--- SRC/supernodalForest.c | 3 +- SRC/supernodal_etree.c | 3 + SRC/symbfact.c | 12 +-- SRC/treeFactorization.c | 8 ++ SRC/util.c | 5 ++ SRC/zbinary_io.c | 2 + SRC/zcommunication_aux.c | 8 +- SRC/zscatter3d.c | 40 ++++----- SRC/zsp_blas2_dist.c | 5 +- SRC/ztreeFactorization.c | 9 +- SRC/ztrfAux.c | 4 +- SRC/ztrfCommWrapper.c | 30 +++---- run_cmake_build.sh | 31 ++----- 39 files changed, 357 insertions(+), 321 deletions(-) diff --git a/EXAMPLE/Makefile b/EXAMPLE/Makefile index 56a17b99..a40bbf3b 100644 --- a/EXAMPLE/Makefile +++ b/EXAMPLE/Makefile @@ -37,7 +37,7 @@ DEXM1 = pddrive1.o dcreate_matrix.o DEXM2 = pddrive2.o dcreate_matrix.o dcreate_matrix_perturbed.o DEXM3 = pddrive3.o dcreate_matrix.o DEXM4 = pddrive4.o dcreate_matrix.o -DEXM3D = pddrive3d.o dcreate_matrix.o +DEXM3D = pddrive3d.o dcreate_matrix.o dscatter3d.o # dtrfAux.o dtreeFactorization.o treeFactorization.o pd3dcomm.o superlu_grid3d.o pdgstrf3d.o DEXMG = pddrive_ABglobal.o DEXMG1 = pddrive1_ABglobal.o diff --git a/SRC/communication_aux.c b/SRC/communication_aux.c index 552b23fb..9529bcdd 100644 --- a/SRC/communication_aux.c +++ b/SRC/communication_aux.c @@ -19,7 +19,7 @@ at the top-level directory. int_t Wait_LSend /*wait till broadcast of L finished*/ -(int_t k, gridinfo_t *grid, int_t **ToSendR, MPI_Request *send_req, SCT_t* SCT) +(int_t k, gridinfo_t *grid, int **ToSendR, MPI_Request *send_req, SCT_t* SCT) { unsigned long long t1 = _rdtsc(); int_t Pc = grid->npcol; diff --git a/SRC/dbinary_io.c b/SRC/dbinary_io.c index 65d29f06..cdf0dc21 100644 --- a/SRC/dbinary_io.c +++ b/SRC/dbinary_io.c @@ -18,6 +18,7 @@ dread_binary(FILE *fp, int_t *m, int_t *n, int_t *nnz, nnz_read = fread(*nzval, dsize, (size_t) (*nnz), fp); printf("# of doubles fread: %d\n", nnz_read); fclose(fp); + return 0; } int @@ -37,4 +38,5 @@ dwrite_binary(int_t n, int_t nnz, printf("dump binary file ... 
# of double fwrite: %d\n", nnz_written); assert(nnz_written==nnz); fclose(fp1); + return 0; } diff --git a/SRC/dcommunication_aux.c b/SRC/dcommunication_aux.c index 9c5965f0..608fba76 100644 --- a/SRC/dcommunication_aux.c +++ b/SRC/dcommunication_aux.c @@ -29,7 +29,7 @@ int_t dIBcast_LPanel /*broadcasts index array lsub and non-zero value array lusup of a newly factored L column to my process row*/ (int_t k, int_t k0, int_t* lsub, double* lusup, gridinfo_t *grid, - int* msgcnt, MPI_Request *send_req, int_t **ToSendR, int_t *xsup, + int* msgcnt, MPI_Request *send_req, int **ToSendR, int_t *xsup, int tag_ub) { int_t Pc = grid->npcol; @@ -69,7 +69,7 @@ int_t dBcast_LPanel /*broadcasts index array lsub and non-zero value array lusup of a newly factored L column to my process row*/ (int_t k, int_t k0, int_t* lsub, double* lusup, gridinfo_t *grid, - int* msgcnt, int_t **ToSendR, int_t *xsup , SCT_t* SCT, + int* msgcnt, int **ToSendR, int_t *xsup , SCT_t* SCT, int tag_ub) { unsigned long long t1 = _rdtsc(); @@ -110,7 +110,7 @@ int_t dBcast_LPanel int_t dIBcast_UPanel /*asynchronously braodcasts U panel to my process row */ (int_t k, int_t k0, int_t* usub, double* uval, gridinfo_t *grid, - int* msgcnt, MPI_Request *send_req_u, int_t *ToSendD, int tag_ub ) + int* msgcnt, MPI_Request *send_req_u, int *ToSendD, int tag_ub ) { int_t iam = grid->iam; @@ -153,7 +153,7 @@ int_t dIBcast_UPanel /*Synchronously braodcasts U panel to my process row */ int_t dBcast_UPanel(int_t k, int_t k0, int_t* usub, double* uval, gridinfo_t *grid, - int* msgcnt, int_t *ToSendD, SCT_t* SCT, int tag_ub) + int* msgcnt, int *ToSendD, SCT_t* SCT, int tag_ub) { unsigned long long t1 = _rdtsc(); diff --git a/SRC/dscatter3d.c b/SRC/dscatter3d.c index 2b8ce2ea..a71a1f8e 100644 --- a/SRC/dscatter3d.c +++ b/SRC/dscatter3d.c @@ -30,7 +30,7 @@ at the top-level directory. static void scatter_u (int_t ib, int_t jb, int_t nsupc, int_t iukp, int_t *xsup, int_t klst, int_t nbrow, int_t lptr, int_t temp_nbrow, int_t *lsub, int_t *usub, double *tempv, - int_t *indirect, + int *indirect, int_t **Ufstnz_br_ptr, double **Unzval_br_ptr, gridinfo_t *grid); @@ -79,15 +79,15 @@ void dblock_gemm_scatter( int_t lb, int_t j, Ublock_info_t *Ublock_info, Remain_info_t *Remain_info, - double *L_mat, int_t ldl, - double *U_mat, int_t ldu, + double *L_mat, int ldl, + double *U_mat, int ldu, double *bigV, // int_t jj0, int_t knsupc, int_t klst, int_t *lsub, int_t *usub, int_t ldt, int_t thread_id, - int_t *indirect, - int_t *indirect2, + int *indirect, + int *indirect2, int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr, int_t **Ufstnz_br_ptr, double **Unzval_br_ptr, int_t *xsup, gridinfo_t *grid, @@ -103,8 +103,8 @@ dblock_gemm_scatter( int_t lb, int_t j, #else thread_id = 0; #endif - int_t *indirect_thread = indirect + ldt * thread_id; - int_t *indirect2_thread = indirect2 + ldt * thread_id; + int *indirect_thread = indirect + ldt * thread_id; + int *indirect2_thread = indirect2 + ldt * thread_id; double *tempv1 = bigV + thread_id * ldt * ldt; /* Getting U block information */ @@ -114,7 +114,7 @@ dblock_gemm_scatter( int_t lb, int_t j, int_t nsupc = SuperSize(jb); int_t ljb = LBj (jb, grid); int_t st_col; - int_t ncols; + int ncols; // if (j > jj0) if (j > 0) { @@ -130,7 +130,7 @@ dblock_gemm_scatter( int_t lb, int_t j, /* Getting L block information */ int_t lptr = Remain_info[lb].lptr; int_t ib = Remain_info[lb].ib; - int_t temp_nbrow = lsub[lptr + 1]; + int temp_nbrow = lsub[lptr + 1]; lptr += LB_DESCRIPTOR; int_t cum_nrow = (lb == 0 ? 
0 : Remain_info[lb - 1].FullRow); /* Getting L block information */ @@ -221,8 +221,8 @@ dblock_gemm_scatter_lock( int_t lb, int_t j, int_t knsupc, int_t klst, int_t *lsub, int_t *usub, int_t ldt, int_t thread_id, - int_t *indirect, - int_t *indirect2, + int *indirect, + int *indirect2, int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr, int_t **Ufstnz_br_ptr, double **Unzval_br_ptr, int_t *xsup, gridinfo_t *grid @@ -231,8 +231,8 @@ dblock_gemm_scatter_lock( int_t lb, int_t j, #endif ) { - int_t *indirect_thread = indirect + ldt * thread_id; - int_t *indirect2_thread = indirect2 + ldt * thread_id; + int *indirect_thread = indirect + ldt * thread_id; + int *indirect2_thread = indirect2 + ldt * thread_id; double *tempv1 = bigV + thread_id * ldt * ldt; /* Getting U block information */ @@ -349,7 +349,7 @@ int_t dblock_gemm_scatterTopLeft( int_t lb, /* block number in L */ int_t j, /* block number in U */ double* bigV, int_t knsupc, int_t klst, int_t* lsub, int_t * usub, int_t ldt, - int_t* indirect, int_t* indirect2, HyP_t* HyP, + int* indirect, int* indirect2, HyP_t* HyP, LUstruct_t *LUstruct, gridinfo_t* grid, SCT_t*SCT, SuperLUStat_t *stat @@ -388,7 +388,7 @@ int_t dblock_gemm_scatterTopLeft( int_t lb, /* block number in L */ int_t dblock_gemm_scatterTopRight( int_t lb, int_t j, double* bigV, int_t knsupc, int_t klst, int_t* lsub, - int_t * usub, int_t ldt, int_t* indirect, int_t* indirect2, + int_t * usub, int_t ldt, int* indirect, int* indirect2, HyP_t* HyP, LUstruct_t *LUstruct, gridinfo_t* grid, @@ -423,7 +423,7 @@ int_t dblock_gemm_scatterTopRight( int_t lb, int_t j, int_t dblock_gemm_scatterBottomLeft( int_t lb, int_t j, double* bigV, int_t knsupc, int_t klst, int_t* lsub, - int_t * usub, int_t ldt, int_t* indirect, int_t* indirect2, + int_t * usub, int_t ldt, int* indirect, int* indirect2, HyP_t* HyP, LUstruct_t *LUstruct, gridinfo_t* grid, @@ -460,7 +460,7 @@ int_t dblock_gemm_scatterBottomLeft( int_t lb, int_t j, int_t dblock_gemm_scatterBottomRight( int_t lb, int_t j, double* bigV, int_t knsupc, int_t klst, int_t* lsub, - int_t * usub, int_t ldt, int_t* indirect, int_t* indirect2, + int_t * usub, int_t ldt, int* indirect, int* indirect2, HyP_t* HyP, LUstruct_t *LUstruct, gridinfo_t* grid, @@ -514,7 +514,7 @@ scatter_l (int_t ib, int_t *usub, int_t *lsub, double *tempv, - int_t *indirect_thread, int_t *indirect2, + int *indirect_thread, int *indirect2, int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr, gridinfo_t *grid) { int_t rel, i, segsize, jj; @@ -589,7 +589,7 @@ scatter_u (int_t ib, int_t *lsub, int_t *usub, double *tempv, - int_t *indirect, + int *indirect, int_t **Ufstnz_br_ptr, double **Unzval_br_ptr, gridinfo_t *grid) { #ifdef PI_DEBUG diff --git a/SRC/dtreeFactorization.c b/SRC/dtreeFactorization.c index e22be396..43ef7de2 100644 --- a/SRC/dtreeFactorization.c +++ b/SRC/dtreeFactorization.c @@ -218,8 +218,8 @@ int_t ddenseTreeFactor( Remain_info_t* Remain_info = packLUInfo->Remain_info; uPanelInfo_t* uPanelInfo = packLUInfo->uPanelInfo; lPanelInfo_t* lPanelInfo = packLUInfo->lPanelInfo; - int_t* indirect = fNlists->indirect; - int_t* indirect2 = fNlists->indirect2; + int* indirect = fNlists->indirect; + int* indirect2 = fNlists->indirect2; /*Schurcomplement Update*/ int_t nub = uPanelInfo->nub; int_t nlb = lPanelInfo->nlb; @@ -490,8 +490,8 @@ int_t dsparseTreeFactor_ASYNC( lPanelInfo_t* lPanelInfo = packLUInfo->lPanelInfo; int_t *lsub = lPanelInfo->lsub; int_t *usub = uPanelInfo->usub; - int_t* indirect = fNlists->indirect; - int_t* indirect2 = fNlists->indirect2; + int* 
indirect = fNlists->indirect; + int* indirect2 = fNlists->indirect2; /*Schurcomplement Update*/ diff --git a/SRC/dtrfAux.c b/SRC/dtrfAux.c index ddf8c25b..869d2fbb 100644 --- a/SRC/dtrfAux.c +++ b/SRC/dtrfAux.c @@ -121,7 +121,7 @@ int_t dSchurComplementSetup( LocalLU_t *Llu = LUstruct->Llu; int_t* xsup = Glu_persist->xsup; - int_t* ToRecv = Llu->ToRecv; + int* ToRecv = Llu->ToRecv; int_t iam = grid->iam; int_t myrow = MYROW (iam, grid); @@ -356,7 +356,7 @@ int_t dSchurComplementSetupGPU( LocalLU_t *Llu = LUstruct->Llu; int_t* xsup = Glu_persist->xsup; - int_t* ToRecv = Llu->ToRecv; + int* ToRecv = Llu->ToRecv; int_t iam = grid->iam; int_t myrow = MYROW (iam, grid); diff --git a/SRC/dtrfCommWrapper.c b/SRC/dtrfCommWrapper.c index 77ea33aa..13455b08 100644 --- a/SRC/dtrfCommWrapper.c +++ b/SRC/dtrfCommWrapper.c @@ -135,7 +135,7 @@ int_t dLPanelTrSolve( int_t k, int_t* factored_L, int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid); int_t kcol = PCOL (k, grid); int_t mycol = MYCOL (iam, grid); - int_t nsupc = SuperSize(k); + int nsupc = SuperSize(k); /*factor the L panel*/ if (mycol == kcol && iam != pkk) @@ -143,7 +143,7 @@ int_t dLPanelTrSolve( int_t k, int_t* factored_L, // factored_L[k] = 1; int_t lk = LBj (k, grid); double *lusup = Llu->Lnzval_bc_ptr[lk]; - int_t nsupr; + int nsupr; if (Llu->Lrowind_bc_ptr[lk]) nsupr = Llu->Lrowind_bc_ptr[lk][1]; else @@ -159,7 +159,7 @@ int_t dLPanelTrSolve( int_t k, int_t* factored_L, int_t l = nsupr; double* ublk_ptr = BlockUFactor; - int_t ld_ujrow = nsupc; + int ld_ujrow = nsupc; // unsigned long long t1 = _rdtsc(); @@ -171,7 +171,7 @@ int_t dLPanelTrSolve( int_t k, int_t* factored_L, { int_t off = i * BL; // Sherry: int_t len = MY_MIN(BL, l - i * BL); - int_t len = SUPERLU_MIN(BL, l - i * BL); + int len = SUPERLU_MIN(BL, l - i * BL); #if 1 #if defined (USE_VENDOR_BLAS) @@ -198,7 +198,7 @@ int_t dLPanelTrSolve( int_t k, int_t* factored_L, factored_L[k] = 1; int_t lk = LBj (k, grid); double *lusup = Llu->Lnzval_bc_ptr[lk]; - int_t nsupr; + int nsupr; if (Llu->Lrowind_bc_ptr[lk]) nsupr = Llu->Lrowind_bc_ptr[lk][1]; else @@ -209,7 +209,7 @@ int_t dLPanelTrSolve( int_t k, int_t* factored_L, int_t l = nsupr - nsupc; double* ublk_ptr = BlockUFactor; - int_t ld_ujrow = nsupc; + int ld_ujrow = nsupc; // printf("%d: L update \n",k ); #define BL 32 @@ -218,7 +218,7 @@ int_t dLPanelTrSolve( int_t k, int_t* factored_L, { int_t off = i * BL; // Sherry: int_t len = MY_MIN(BL, l - i * BL); - int_t len = SUPERLU_MIN(BL, (l - i * BL)); + int len = SUPERLU_MIN(BL, (l - i * BL)); #pragma omp task { #if 1 @@ -406,8 +406,8 @@ int_t dIBcastRecvLPanel( Glu_persist_t *Glu_persist = LUstruct->Glu_persist; LocalLU_t *Llu = LUstruct->Llu; int_t* xsup = Glu_persist->xsup; - int_t** ToSendR = Llu->ToSendR; - int_t* ToRecv = Llu->ToRecv; + int** ToSendR = Llu->ToSendR; + int* ToRecv = Llu->ToRecv; int_t iam = grid->iam; int_t Pc = grid->npcol; int_t mycol = MYCOL (iam, grid); @@ -463,8 +463,8 @@ int_t dIBcastRecvUPanel(int_t k, int_t k0, int* msgcnt, { LocalLU_t *Llu = LUstruct->Llu; - int_t* ToSendD = Llu->ToSendD; - int_t* ToRecv = Llu->ToRecv; + int* ToSendD = Llu->ToSendD; + int* ToRecv = Llu->ToRecv; int_t iam = grid->iam; int_t Pr = grid->nprow; int_t myrow = MYROW (iam, grid); @@ -509,8 +509,8 @@ int_t dWaitL( int_t k, int* msgcnt, int* msgcntU, gridinfo_t *grid, LUstruct_t *LUstruct, SCT_t *SCT) { LocalLU_t *Llu = LUstruct->Llu; - int_t** ToSendR = Llu->ToSendR; - int_t* ToRecv = Llu->ToRecv; + int** ToSendR = Llu->ToSendR; + int* ToRecv = Llu->ToRecv; int_t iam = 
grid->iam; int_t mycol = MYCOL (iam, grid); int_t kcol = PCOL (k, grid); @@ -538,8 +538,8 @@ int_t dWaitU( int_t k, int* msgcnt, { LocalLU_t *Llu = LUstruct->Llu; - int_t* ToRecv = Llu->ToRecv; - int_t* ToSendD = Llu->ToSendD; + int* ToRecv = Llu->ToRecv; + int* ToSendD = Llu->ToSendD; int_t iam = grid->iam; int_t myrow = MYROW (iam, grid); int_t krow = PROW (k, grid); diff --git a/SRC/pd3dcomm.c b/SRC/pd3dcomm.c index 6e4e68b0..98ecd37b 100644 --- a/SRC/pd3dcomm.c +++ b/SRC/pd3dcomm.c @@ -41,6 +41,7 @@ at the top-level directory. int_t dAllocLlu(int_t nsupers, LUstruct_t * LUstruct, gridinfo3d_t* grid3d) { + int i; int_t Pc = grid3d->npcol; int_t Pr = grid3d->nprow; @@ -53,7 +54,7 @@ int_t dAllocLlu(int_t nsupers, LUstruct_t * LUstruct, gridinfo3d_t* grid3d) double **Lnzval_bc_ptr = (double **) SUPERLU_MALLOC(sizeof(double*)*nbc); /* size ceil(NSUPERS/Pc) */ - for (int_t i = 0; i < nbc ; ++i) + for (i = 0; i < nbc ; ++i) { /* code */ Lrowind_bc_ptr[i] = NULL; @@ -65,21 +66,33 @@ int_t dAllocLlu(int_t nsupers, LUstruct_t * LUstruct, gridinfo3d_t* grid3d) double **Unzval_br_ptr = (double **) SUPERLU_MALLOC(sizeof(double*)*nbr); /* size ceil(NSUPERS/Pr) */ - for (int_t i = 0; i < nbr ; ++i) + for (i = 0; i < nbr ; ++i) { /* code */ Ufstnz_br_ptr[i] = NULL; Unzval_br_ptr[i] = NULL; } - + +#if 0 // Sherry: change to int type int_t *ToRecv = intCalloc_dist(nsupers); /* Recv from no one (0), left (1), and up (2).*/ int_t *ToSendD = intCalloc_dist(nbr); /* Whether need to send down block row. */ int_t **ToSendR = (int_t **) SUPERLU_MALLOC(nbc * sizeof(int_t*)); /* List of processes to send right block col. */ - - for (int_t i = 0; i < nbc; ++i) +#else + /* Recv from no one (0), left (1), and up (2).*/ + int *ToRecv = SUPERLU_MALLOC(nsupers * sizeof(int)); + for (i = 0; i < nsupers; ++i) ToRecv[i] = 0; + /* Whether need to send down block row. */ + int *ToSendD = SUPERLU_MALLOC(nbr * sizeof(int)); + for (i = 0; i < nbr; ++i) ToSendD[i] = 0; + /* List of processes to send right block col. 
*/ + int **ToSendR = (int **) SUPERLU_MALLOC(nbc * sizeof(int*)); +#endif + + for (i = 0; i < nbc; ++i) { /* code */ - ToSendR[i] = INT_T_ALLOC(Pc); + //ToSendR[i] = INT_T_ALLOC(Pc); + ToSendR[i] = SUPERLU_MALLOC(Pc * sizeof(int)); } /*now setup the pointers*/ @@ -231,8 +244,8 @@ int_t dzRecvLPanel(int_t k, int_t sender, double alpha, double beta, if (lsub != NULL) { - int_t len = lsub[1]; /* LDA of the nzval[] */ - int_t len2 = SuperSize(k) * len; /*size of nzval of L panels*/ + int len = lsub[1]; /* LDA of the nzval[] */ + int len2 = SuperSize(k) * len; /*size of nzval of L panels*/ MPI_Status status; MPI_Recv(Lval_buf , len2, MPI_DOUBLE, sender, k, @@ -374,19 +387,20 @@ int_t dp3dScatter(int_t n, LUstruct_t * LUstruct, gridinfo3d_t* grid3d) MPI_Bcast( bufmax, NBUFFERS, mpi_int_t, 0, grid3d->zscp.comm); /* now sending tosendR etc */ - int_t** ToSendR = Llu->ToSendR; - int_t* ToRecv = Llu->ToRecv; - int_t* ToSendD = Llu->ToSendD; + int** ToSendR = Llu->ToSendR; + int* ToRecv = Llu->ToRecv; + int* ToSendD = Llu->ToSendD; int_t nbr = CEILING(nsupers, Pr); int_t nbc = CEILING(nsupers, Pc); - MPI_Bcast( ToRecv, nsupers, mpi_int_t, 0, grid3d->zscp.comm); + //Sherry MPI_Bcast( ToRecv, nsupers, mpi_int_t, 0, grid3d->zscp.comm); + MPI_Bcast( ToRecv, nsupers, MPI_INT, 0, grid3d->zscp.comm); - MPI_Bcast( ToSendD, nbr, mpi_int_t, 0, grid3d->zscp.comm); - for (int_t i = 0; i < nbc; ++i) + MPI_Bcast( ToSendD, nbr, MPI_INT, 0, grid3d->zscp.comm); + for (int i = 0; i < nbc; ++i) { /* code */ - MPI_Bcast( ToSendR[i], Pc, mpi_int_t, 0, grid3d->zscp.comm); + MPI_Bcast( ToSendR[i], Pc, MPI_INT, 0, grid3d->zscp.comm); } // diff --git a/SRC/pdgsmv.c b/SRC/pdgsmv.c index db767647..1b2882f4 100644 --- a/SRC/pdgsmv.c +++ b/SRC/pdgsmv.c @@ -373,11 +373,11 @@ void pdgsmv_finalize(pdgsmv_comm_t *gsmv_comm) int_t *it; double *dt; SUPERLU_FREE(gsmv_comm->extern_start); - if ( it = gsmv_comm->ind_tosend ) SUPERLU_FREE(it); - if ( it = gsmv_comm->ind_torecv ) SUPERLU_FREE(it); + if ( (it = gsmv_comm->ind_tosend) ) SUPERLU_FREE(it); + if ( (it = gsmv_comm->ind_torecv) ) SUPERLU_FREE(it); SUPERLU_FREE(gsmv_comm->ptr_ind_tosend); SUPERLU_FREE(gsmv_comm->SendCounts); - if ( dt = gsmv_comm->val_tosend ) SUPERLU_FREE(dt); - if ( dt = gsmv_comm->val_torecv ) SUPERLU_FREE(dt); + if ( (dt = gsmv_comm->val_tosend) ) SUPERLU_FREE(dt); + if ( (dt = gsmv_comm->val_torecv) ) SUPERLU_FREE(dt); } diff --git a/SRC/pdgssvx.c b/SRC/pdgssvx.c index 2cb3098c..69ae0e99 100644 --- a/SRC/pdgssvx.c +++ b/SRC/pdgssvx.c @@ -664,6 +664,7 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A, ABORT("Malloc fails for R[]."); ScalePermstruct->R = R; break; + default: break; } } diff --git a/SRC/pdgssvx3d.c b/SRC/pdgssvx3d.c index d8ea203d..66f5155b 100644 --- a/SRC/pdgssvx3d.c +++ b/SRC/pdgssvx3d.c @@ -665,6 +665,7 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, ABORT ("Malloc fails for R[]."); ScalePermstruct->R = R; break; + // default: break; } } diff --git a/SRC/pdgstrf2.c b/SRC/pdgstrf2.c index c349f0b8..4b1b63de 100644 --- a/SRC/pdgstrf2.c +++ b/SRC/pdgstrf2.c @@ -365,8 +365,8 @@ pdgstrf2_trsm * The following functions are for the new pdgstrf2_dtrsm in the 3D code. 
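
LpanelUpdate in the hunk below applies the right triangular solve to the panel in strips of GT rows, so each strip can be dispatched independently. A schematic sketch of the blocking, assuming the usual Fortran dtrsm_ calling convention (lpanel_update is an illustrative name):

#define GT 32

extern void dtrsm_(const char*, const char*, const char*, const char*,
                   const int*, const int*, const double*,
                   const double*, const int*, double*, const int*);

void lpanel_update(int off0, int nsupc, double *ublk_ptr, int ld_ujrow,
                   double *lusup, int nsupr)
{
    double alpha = 1.0;
    int l = nsupr - off0;                      // rows below the diagonal block
    for (int i = 0; i < (l + GT - 1) / GT; ++i) {
        int off = i * GT;
        int len = (GT < l - off) ? GT : (l - off);
        // Solve X * U = L_strip in place: L_strip := L_strip * inv(U)
        dtrsm_("R", "U", "N", "N", &len, &nsupc, &alpha,
               ublk_ptr, &ld_ujrow, &lusup[off0 + off], &nsupr);
    }
}
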
*****************************************************************************/ static -int_t LpanelUpdate(int_t off0, int_t nsupc, double* ublk_ptr, int_t ld_ujrow, - double* lusup, int_t nsupr, SCT_t* SCT) +int_t LpanelUpdate(int off0, int nsupc, double* ublk_ptr, int ld_ujrow, + double* lusup, int nsupr, SCT_t* SCT) { int_t l = nsupr - off0; double alpha = 1.0; @@ -377,7 +377,7 @@ int_t LpanelUpdate(int_t off0, int_t nsupc, double* ublk_ptr, int_t ld_ujrow, for (int i = 0; i < CEILING(l, GT); ++i) { int_t off = i * GT; - int_t len = SUPERLU_MIN(GT, l - i * GT); + int len = SUPERLU_MIN(GT, l - i * GT); #if 1 #if defined (USE_VENDOR_BLAS) dtrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha, @@ -421,7 +421,7 @@ void Local_Dgstrf2(superlu_dist_options_t *options, int_t k, double thresh, int_t jlst = FstBlockC (k + 1); double *lusup = Llu->Lnzval_bc_ptr[lk]; int_t nsupc = SuperSize (k); - int_t nsupr; + int nsupr; if (Llu->Lrowind_bc_ptr[lk]) nsupr = Llu->Lrowind_bc_ptr[lk][1]; else @@ -429,11 +429,11 @@ void Local_Dgstrf2(superlu_dist_options_t *options, int_t k, double thresh, double *ublk_ptr = BlockUFactor; double *ujrow = BlockUFactor; int_t luptr = 0; /* Point_t to the diagonal entries. */ - int_t cols_left = nsupc; /* supernode size */ + int cols_left = nsupc; /* supernode size */ int_t u_diag_cnt = 0; int_t ld_ujrow = nsupc; /* leading dimension of ujrow */ - int_t incx = 1; - int_t incy = ld_ujrow; + int incx = 1; + int incy = ld_ujrow; for (int_t j = 0; j < jlst - jfst; ++j) /* for each column in panel */ { @@ -481,7 +481,7 @@ void Local_Dgstrf2(superlu_dist_options_t *options, int_t k, double thresh, if (--cols_left) { /*following must be int*/ - int_t l = nsupc - j - 1; + int l = nsupc - j - 1; /* Rank-1 update */ #if 1 @@ -580,7 +580,7 @@ void pdgstrf2_xtrsm int nsupr; /* number of rows in the block (LDA) */ int luptr; int_t myrow, krow, j, jfst, jlst, u_diag_cnt; - int_t nsupc; /* number of columns in the block */ + int nsupc; /* number of columns in the block */ int_t *xsup = Glu_persist->xsup; double *lusup; double *ujrow, *ublk_ptr; /* pointer to the U block */ @@ -708,7 +708,7 @@ int_t dTrs2_ScatterU(int_t iukp, int_t rukp, int_t klst, int_t dTrs2_GatherTrsmScatter(int_t klst, int_t iukp, int_t rukp, int_t *usub, double *uval, double *tempv, - int_t knsupc, int_t nsupr, double *lusup, + int_t knsupc, int nsupr, double *lusup, Glu_persist_t *Glu_persist) /*glupersist for xsup for supersize*/ { double alpha = 1.0; @@ -721,14 +721,14 @@ int_t dTrs2_GatherTrsmScatter(int_t klst, int_t iukp, int_t rukp, // printf("klst inside task%d\n", ); /*find ldu */ - int_t ldu = 0; + int ldu = 0; for (int_t jj = iukp; jj < iukp + nsupc; ++jj) { ldu = SUPERLU_MAX( klst - usub[jj], ldu) ; } /*pack U block into a dense Block*/ - int_t ncols = dTrs2_GatherU(iukp, rukp, klst, nsupc, ldu, usub, + int ncols = dTrs2_GatherU(iukp, rukp, klst, nsupc, ldu, usub, uval, tempv); /*now call dtrsm on packed dense block*/ diff --git a/SRC/pdutil.c b/SRC/pdutil.c index a3bf6766..a7069cdf 100644 --- a/SRC/pdutil.c +++ b/SRC/pdutil.c @@ -503,7 +503,7 @@ void dSolveFinalize(superlu_dist_options_t *options, SOLVEstruct_t *SOLVEstruct) SUPERLU_FREE(SOLVEstruct->inv_perm_c); SUPERLU_FREE(SOLVEstruct->diag_procs); SUPERLU_FREE(SOLVEstruct->diag_len); - if ( it = SOLVEstruct->A_colind_gsmv ) SUPERLU_FREE(it); + if ( (it = SOLVEstruct->A_colind_gsmv) ) SUPERLU_FREE(it); options->SolveInitialized = NO; } /* dSolveFinalize */ diff --git a/SRC/psymbfact.c b/SRC/psymbfact.c index 597c364a..b42af073 100644 --- 
a/SRC/psymbfact.c +++ b/SRC/psymbfact.c @@ -353,8 +353,8 @@ float symbfact_dist #endif /* Allocate storage common to the symbolic factor routines */ - if (iinfo = symbfact_alloc (n, nprocs_symb, Pslu_freeable, - &Llu_symbfact, &VInfo, &CS, &PS)) + if ((iinfo = symbfact_alloc (n, nprocs_symb, Pslu_freeable, + &Llu_symbfact, &VInfo, &CS, &PS))) return (PS.allocMem); /* Copy the redistributed input matrix AS at the end of the memory buffer allocated to store L and U. That is, copy (AS.x_ainf, AS.ind_ainf) in @@ -2067,15 +2067,15 @@ symbfact_vtx /* TEST available memory */ if (next >= x_aind_end) { if (domain_symb) { - if (mem_error = - psymbfact_LUXpandMem (iam, n, vtx, next, 0, - computeL, DOMAIN_SYMB, 1, - Pslu_freeable, Llu_symbfact, VInfo, PS)) - return (mem_error); - } else if (mem_error = - psymbfact_LUXpand (iam, n, EMPTY, vtx, &next, 0, - computeL, LL_SYMB, 1, - Pslu_freeable, Llu_symbfact, VInfo, PS)) + if ( (mem_error = + psymbfact_LUXpandMem (iam, n, vtx, next, 0, + computeL, DOMAIN_SYMB, 1, + Pslu_freeable, Llu_symbfact, VInfo, PS)) ) + return (mem_error); + } else if ( (mem_error = + psymbfact_LUXpand (iam, n, EMPTY, vtx, &next, 0, + computeL, LL_SYMB, 1, + Pslu_freeable, Llu_symbfact, VInfo, PS)) ) return (mem_error); x_aind_end = xsub[vtx_lid + 1]; @@ -2246,9 +2246,9 @@ updateRcvd_prGraph /* test if enough memory in usubPr array */ if (ind >= szsubPr) { - if (mem_error = - psymbfact_prLUXpand (iam, ind, computeL, Llu_symbfact, PS)) - return (mem_error); + if ( (mem_error = + psymbfact_prLUXpand (iam, ind, computeL, Llu_symbfact, PS)) ) + return (mem_error); if (computeL) subPr = Llu_symbfact->lsubPr; else @@ -2367,9 +2367,9 @@ update_prGraph if (sn_elt < lstVtx_blk) { sn_elt_prid = LOCAL_IND( globToLoc[sn_elt] ) - pr_offset; if ((*p_indsubPr) + 2 >= szsubPr) { - if (mem_error = - psymbfact_prLUXpand (iam, 0, computeL, Llu_symbfact, PS)) - return (mem_error); + if ( (mem_error = + psymbfact_prLUXpand (iam, 0, computeL, Llu_symbfact, PS)) ) + return (mem_error); if (computeL) { subPr = Llu_symbfact->lsubPr; szsubPr = Llu_symbfact->szLsubPr; } @@ -2523,16 +2523,16 @@ blk_symbfact prval_curvtx = n; /* Compute nonzero structure L(:,vtx) */ - if (mem_error = - symbfact_vtx (n, iam, vtx, vtx_lid, vtx_prid, 1, domain_symb, - fstVtx_blk, lstVtx, - snrep_lid, szsn, &nextl, - marker, - lsub_rcvd, lsub_rcvd_sz, - Pslu_freeable, Llu_symbfact, VInfo, PS, &neltsVtxInit_l, - &neltsVtx_L, &neltsVtx_CSep_L, &neltsZrVtx_L, - &neltsMatched_L, markl1_vtx, &prval_curvtx, - vtx_bel_snU, &vtx_bel_snL)) + if ( (mem_error = + symbfact_vtx (n, iam, vtx, vtx_lid, vtx_prid, 1, domain_symb, + fstVtx_blk, lstVtx, + snrep_lid, szsn, &nextl, + marker, + lsub_rcvd, lsub_rcvd_sz, + Pslu_freeable, Llu_symbfact, VInfo, PS, &neltsVtxInit_l, + &neltsVtx_L, &neltsVtx_CSep_L, &neltsZrVtx_L, + &neltsMatched_L, markl1_vtx, &prval_curvtx, + vtx_bel_snU, &vtx_bel_snL)) ) return (mem_error); lsub = Llu_symbfact->lsub; @@ -2541,17 +2541,17 @@ blk_symbfact #endif /* Compute nonzero structure of U(vtx,:) */ - if (mem_error = - symbfact_vtx (n, iam, vtx, vtx_lid, vtx_prid, 0, domain_symb, - fstVtx_blk, lstVtx, - snrep_lid, szsn, &nextu, - marker, - usub_rcvd, usub_rcvd_sz, - Pslu_freeable, Llu_symbfact, VInfo, PS, &neltsVtxInit_u, - &neltsVtx_U, &neltsVtx_CSep_U, &neltsZrVtx_U, - &neltsMatched_U, marku1_vtx, &prval_curvtx, - vtx_bel_snL, &vtx_bel_snU)) - return (mem_error); + if ( (mem_error = + symbfact_vtx (n, iam, vtx, vtx_lid, vtx_prid, 0, domain_symb, + fstVtx_blk, lstVtx, + snrep_lid, szsn, &nextu, + marker, + usub_rcvd, 
usub_rcvd_sz, + Pslu_freeable, Llu_symbfact, VInfo, PS, &neltsVtxInit_u, + &neltsVtx_U, &neltsVtx_CSep_U, &neltsZrVtx_U, + &neltsMatched_U, marku1_vtx, &prval_curvtx, + vtx_bel_snL, &vtx_bel_snU)) ) + return (mem_error); usub = Llu_symbfact->usub; #ifdef TEST_SYMB @@ -2618,11 +2618,11 @@ blk_symbfact *p_nextu = xusub[vtx_lid]; nsuper_loc += 1; *p_nsuper_loc = nsuper_loc; - if (mem_error = - dnsUpSeps_symbfact (n, iam, szSep, ind_sizes1, ind_sizes2, - sizes, fstVtxSep, vtx, - Llu_symbfact, Pslu_freeable, VInfo, CS, PS, - p_nextl, p_nextu, p_nsuper_loc)) + if ( (mem_error = + dnsUpSeps_symbfact (n, iam, szSep, ind_sizes1, ind_sizes2, + sizes, fstVtxSep, vtx, + Llu_symbfact, Pslu_freeable, VInfo, CS, PS, + p_nextl, p_nextu, p_nsuper_loc)) ) return (mem_error); /* set up neltsZr and neltsTotal */ vtx = lstVtx_blk; @@ -3201,10 +3201,10 @@ expand_RL } nextl = xlsub[vtxXp_lid+1]; - if (mem_error = - psymbfact_LUXpand_RL (iam, n, vtxXp, nextl, len_texp, - computeL, Pslu_freeable, Llu_symbfact, VInfo, PS)) - return (mem_error); + if ( (mem_error = + psymbfact_LUXpand_RL (iam, n, vtxXp, nextl, len_texp, + computeL, Pslu_freeable, Llu_symbfact, VInfo, PS)) ) + return (mem_error); return 0; } @@ -3332,9 +3332,9 @@ rl_update /* test if enough memory in usubPr array */ if (ind > Llu_symbfact->szLsubPr) { - if (mem_error = - psymbfact_prLUXpand (iam, ind, LSUB_PR, Llu_symbfact, PS)) - return (mem_error); + if ( (mem_error = + psymbfact_prLUXpand (iam, ind, LSUB_PR, Llu_symbfact, PS)) ) + return (mem_error); usubPr = Llu_symbfact->lsubPr; } @@ -3462,13 +3462,13 @@ rl_update if (marker[elt] != markl) { /* add elt to structure of vtx */ if (nextl >= xlsub[vtx_lid + 1]) { - if (mem_error = - expand_RL (computeRcvd, n, iam, lsub_rcvd, lsub_rcvd_sz, - usub_rcvd, usub_rcvd_sz, vtx, i, - lstVtx_upd, fstVtx_srcUpd, lstVtx_srcUpd, - fstVtx_toUpd, lstVtx_toUpd, nvtcs_toUpd, computeL, - &markl, marker, Pslu_freeable, Llu_symbfact, VInfo, PS)) - return (mem_error); + if ( (mem_error = + expand_RL (computeRcvd, n, iam, lsub_rcvd, lsub_rcvd_sz, + usub_rcvd, usub_rcvd_sz, vtx, i, + lstVtx_upd, fstVtx_srcUpd, lstVtx_srcUpd, + fstVtx_toUpd, lstVtx_toUpd, nvtcs_toUpd, computeL, + &markl, marker, Pslu_freeable, Llu_symbfact, VInfo, PS)) ) + return (mem_error); if (computeL) { lsub = Llu_symbfact->lsub; if (!computeRcvd) @@ -3565,20 +3565,20 @@ dnsUpSeps_symbfact else vtx_elt = fstVtx_lvl; if (nextl + lstVtx_lvl - vtx_elt >= Llu_symbfact->szLsub) { - if (mem_error = - psymbfact_LUXpandMem (iam, n, fstVtx_blk, nextl, - nextl + fstVtx_lvl - vtx_elt, - LSUB, DNS_UPSEPS, 1, - Pslu_freeable, Llu_symbfact, VInfo, PS)) + if ( (mem_error = + psymbfact_LUXpandMem (iam, n, fstVtx_blk, nextl, + nextl + fstVtx_lvl - vtx_elt, + LSUB, DNS_UPSEPS, 1, + Pslu_freeable, Llu_symbfact, VInfo, PS)) ) return (mem_error); lsub = Llu_symbfact->lsub; } if (nextu + lstVtx_lvl - vtx_elt >= Llu_symbfact->szUsub) { - if (mem_error = - psymbfact_LUXpandMem (iam, n, fstVtx_blk, nextu, - nextu + fstVtx_lvl - vtx_elt, - LSUB, DNS_UPSEPS, 1, - Pslu_freeable, Llu_symbfact, VInfo, PS)) + if ( (mem_error = + psymbfact_LUXpandMem (iam, n, fstVtx_blk, nextu, + nextu + fstVtx_lvl - vtx_elt, + LSUB, DNS_UPSEPS, 1, + Pslu_freeable, Llu_symbfact, VInfo, PS)) ) return (mem_error); usub = Llu_symbfact->usub; } @@ -3610,10 +3610,10 @@ dnsUpSeps_symbfact if (lsub[k] >= fstVtx_blk) { lsub[nextl] = lsub[k]; nextl ++; if (nextl >= MEM_LSUB( Llu_symbfact, VInfo )) - if (mem_error = - psymbfact_LUXpandMem (iam, n, fstVtx_blk, nextl, 0, - LSUB, DNS_UPSEPS, 1, - Pslu_freeable, 
Llu_symbfact, VInfo, PS)) + if ( (mem_error = + psymbfact_LUXpandMem (iam, n, fstVtx_blk, nextl, 0, + LSUB, DNS_UPSEPS, 1, + Pslu_freeable, Llu_symbfact, VInfo, PS)) ) return (mem_error); lsub = Llu_symbfact->lsub; } @@ -3621,10 +3621,10 @@ dnsUpSeps_symbfact if (usub[k] > fstVtx_blk) { usub[nextu] = usub[k]; nextu ++; if (nextu >= MEM_USUB( Llu_symbfact, VInfo )) - if (mem_error = - psymbfact_LUXpandMem (iam, n, fstVtx_blk, nextu, 0, - USUB, DNS_UPSEPS, 1, - Pslu_freeable, Llu_symbfact, VInfo, PS)) + if ( (mem_error = + psymbfact_LUXpandMem (iam, n, fstVtx_blk, nextu, 0, + USUB, DNS_UPSEPS, 1, + Pslu_freeable, Llu_symbfact, VInfo, PS)) ) return (mem_error); usub = Llu_symbfact->usub; } @@ -3892,10 +3892,10 @@ dnsCurSep_symbfact j = x_newelts[vtx_lid_x+1] + lstVtx - vtx; if ((computeL && next+j >= MEM_LSUB(Llu_symbfact, VInfo)) || (computeU && next+j >= MEM_USUB(Llu_symbfact, VInfo))) { - if (mem_error = - psymbfact_LUXpandMem (iam, n, vtx, next, next + j, - computeL, DNS_CURSEP, 1, - Pslu_freeable, Llu_symbfact, VInfo, PS)) + if ( (mem_error = + psymbfact_LUXpandMem (iam, n, vtx, next, next + j, + computeL, DNS_CURSEP, 1, + Pslu_freeable, Llu_symbfact, VInfo, PS)) ) return (mem_error); if (computeL) sub = Llu_symbfact->lsub; else sub = Llu_symbfact->usub; @@ -4131,19 +4131,19 @@ denseSep_symbfact } if (VInfo->filledSep == FILLED_SEP) { - if (mem_error = - dnsCurSep_symbfact (n, iam, ind_sizes1, ind_sizes2, sizes, fstVtxSep, - szSep, lstP - fstP, rcvd_dnsSep, p_nextl, - p_nextu, p_mark, p_nsuper_loc, marker, ndCom, - Llu_symbfact, Pslu_freeable, VInfo, CS, PS)) + if ( (mem_error = + dnsCurSep_symbfact (n, iam, ind_sizes1, ind_sizes2, sizes, fstVtxSep, + szSep, lstP - fstP, rcvd_dnsSep, p_nextl, + p_nextu, p_mark, p_nsuper_loc, marker, ndCom, + Llu_symbfact, Pslu_freeable, VInfo, CS, PS)) ) return (mem_error); } else if (rcvd_dnsSep) - if (mem_error = - dnsUpSeps_symbfact (n, iam, szSep, ind_sizes1, ind_sizes2, - sizes, fstVtxSep, EMPTY, - Llu_symbfact, Pslu_freeable, VInfo, CS, PS, - p_nextl, p_nextu, p_nsuper_loc)) + if ( (mem_error = + dnsUpSeps_symbfact (n, iam, szSep, ind_sizes1, ind_sizes2, + sizes, fstVtxSep, EMPTY, + Llu_symbfact, Pslu_freeable, VInfo, CS, PS, + p_nextl, p_nextu, p_nsuper_loc)) ) return (mem_error); return 0; } @@ -4251,11 +4251,11 @@ interLvl_symbfact /* quick return if all upper separators are dense */ if (VInfo->filledSep != FILLED_SEPS) { VInfo->filledSep = FILLED_SEPS; - if (mem_error = - dnsUpSeps_symbfact (n, iam, szSep, ind_sizes1, ind_sizes2, sizes, - fstVtxSep, - EMPTY, Llu_symbfact, Pslu_freeable, VInfo, CS, PS, - p_nextl, p_nextu, p_nsuper_loc)) + if ( (mem_error = + dnsUpSeps_symbfact (n, iam, szSep, ind_sizes1, ind_sizes2, sizes, + fstVtxSep, + EMPTY, Llu_symbfact, Pslu_freeable, VInfo, CS, PS, + p_nextl, p_nextu, p_nsuper_loc)) ) return (mem_error); } return 0; diff --git a/SRC/psymbfact_util.c b/SRC/psymbfact_util.c index 40b9c864..39c066ae 100644 --- a/SRC/psymbfact_util.c +++ b/SRC/psymbfact_util.c @@ -292,10 +292,10 @@ int_t psymbfact_LUXpand if (prev_len + len_texp >= prev_xsub_nextLvl) { /* not enough memory */ min_new_len = prev_len + len_texp + (sz_prev_mem - prev_xsub_nextLvl); - if (mem_error = - psymbfact_LUXpandMem (iam, n, vtxXp, next, min_new_len, - mem_type, rout_type, 0, Pslu_freeable, Llu_symbfact, - VInfo, PS)) + if ( (mem_error = + psymbfact_LUXpandMem (iam, n, vtxXp, next, min_new_len, + mem_type, rout_type, 0, Pslu_freeable, Llu_symbfact, + VInfo, PS)) ) return (mem_error); if ( mem_type == LSUB ) new_mem = Llu_symbfact->lsub; @@ 
-437,10 +437,10 @@ int_t psymbfact_LUXpand_RL if (prev_len + len_texp >= prev_xsub_nextLvl) { /* not enough memory */ min_new_len = prev_len + len_texp + (sz_prev_mem - prev_xsub_nextLvl); - if (mem_error = - psymbfact_LUXpandMem (iam, n, vtxXp, next, min_new_len, - mem_type, RL_SYMB, 0, Pslu_freeable, Llu_symbfact, - VInfo, PS)) + if ( (mem_error = + psymbfact_LUXpandMem (iam, n, vtxXp, next, min_new_len, + mem_type, RL_SYMB, 0, Pslu_freeable, Llu_symbfact, + VInfo, PS)) ) return (mem_error); if ( mem_type == LSUB ) new_mem = Llu_symbfact->lsub; diff --git a/SRC/pz3dcomm.c b/SRC/pz3dcomm.c index dba7ba75..fc79bc15 100644 --- a/SRC/pz3dcomm.c +++ b/SRC/pz3dcomm.c @@ -40,6 +40,7 @@ at the top-level directory. int_t zAllocLlu(int_t nsupers, LUstruct_t * LUstruct, gridinfo3d_t* grid3d) { + int i; int_t Pc = grid3d->npcol; int_t Pr = grid3d->nprow; @@ -52,7 +53,7 @@ int_t zAllocLlu(int_t nsupers, LUstruct_t * LUstruct, gridinfo3d_t* grid3d) doublecomplex **Lnzval_bc_ptr = (doublecomplex **) SUPERLU_MALLOC(sizeof(doublecomplex*)*nbc); /* size ceil(NSUPERS/Pc) */ - for (int_t i = 0; i < nbc ; ++i) + for (i = 0; i < nbc ; ++i) { /* code */ Lrowind_bc_ptr[i] = NULL; @@ -64,21 +65,33 @@ int_t zAllocLlu(int_t nsupers, LUstruct_t * LUstruct, gridinfo3d_t* grid3d) doublecomplex **Unzval_br_ptr = (doublecomplex **) SUPERLU_MALLOC(sizeof(doublecomplex*)*nbr); /* size ceil(NSUPERS/Pr) */ - for (int_t i = 0; i < nbr ; ++i) + for (i = 0; i < nbr ; ++i) { /* code */ Ufstnz_br_ptr[i] = NULL; Unzval_br_ptr[i] = NULL; } +#if 0 // Sherry: change to int type int_t *ToRecv = intCalloc_dist(nsupers); /* Recv from no one (0), left (1), and up (2).*/ int_t *ToSendD = intCalloc_dist(nbr); /* Whether need to send down block row. */ int_t **ToSendR = (int_t **) SUPERLU_MALLOC(nbc * sizeof(int_t*)); /* List of processes to send right block col. */ - - for (int_t i = 0; i < nbc; ++i) +#else + /* Recv from no one (0), left (1), and up (2).*/ + int *ToRecv = SUPERLU_MALLOC(nsupers * sizeof(int)); + for (i = 0; i < nsupers; ++i) ToRecv[i] = 0; + /* Whether need to send down block row. */ + int *ToSendD = SUPERLU_MALLOC(nbr * sizeof(int)); + for (i = 0; i < nbr; ++i) ToSendD[i] = 0; + /* List of processes to send right block col. 
*/ + int **ToSendR = (int **) SUPERLU_MALLOC(nbc * sizeof(int*)); +#endif + + for (i = 0; i < nbc; ++i) { /* code */ - ToSendR[i] = INT_T_ALLOC(Pc); + //ToSendR[i] = INT_T_ALLOC(Pc); + ToSendR[i] = SUPERLU_MALLOC(Pc * sizeof(int)); } /*now setup the pointers*/ @@ -229,9 +242,8 @@ int_t zzRecvLPanel(int_t k, int_t sender, doublecomplex alpha, doublecomplex bet if (lsub != NULL) { - - int_t len = lsub[1]; /* LDA of the nzval[] */ - int_t len2 = SuperSize(k) * len; /*size of nzval of L panels*/ + int len = lsub[1]; /* LDA of the nzval[] */ + int len2 = SuperSize(k) * len; /*size of nzval of L panels*/ MPI_Status status; MPI_Recv(Lval_buf , len2, SuperLU_MPI_DOUBLE_COMPLEX, sender, k, @@ -373,19 +385,19 @@ int_t zp3dScatter(int_t n, LUstruct_t * LUstruct, gridinfo3d_t* grid3d) MPI_Bcast( bufmax, NBUFFERS, mpi_int_t, 0, grid3d->zscp.comm); /* now sending tosendR etc */ - int_t** ToSendR = Llu->ToSendR; - int_t* ToRecv = Llu->ToRecv; - int_t* ToSendD = Llu->ToSendD; + int** ToSendR = Llu->ToSendR; + int* ToRecv = Llu->ToRecv; + int* ToSendD = Llu->ToSendD; int_t nbr = CEILING(nsupers, Pr); int_t nbc = CEILING(nsupers, Pc); - MPI_Bcast( ToRecv, nsupers, mpi_int_t, 0, grid3d->zscp.comm); + MPI_Bcast( ToRecv, nsupers, MPI_INT, 0, grid3d->zscp.comm); - MPI_Bcast( ToSendD, nbr, mpi_int_t, 0, grid3d->zscp.comm); + MPI_Bcast( ToSendD, nbr, MPI_INT, 0, grid3d->zscp.comm); for (int_t i = 0; i < nbc; ++i) { /* code */ - MPI_Bcast( ToSendR[i], Pc, mpi_int_t, 0, grid3d->zscp.comm); + MPI_Bcast( ToSendR[i], Pc, MPI_INT, 0, grid3d->zscp.comm); } // diff --git a/SRC/pzgsmv.c b/SRC/pzgsmv.c index 0c0838ba..f76c3293 100644 --- a/SRC/pzgsmv.c +++ b/SRC/pzgsmv.c @@ -375,11 +375,11 @@ void pzgsmv_finalize(pzgsmv_comm_t *gsmv_comm) int_t *it; doublecomplex *dt; SUPERLU_FREE(gsmv_comm->extern_start); - if ( it = gsmv_comm->ind_tosend ) SUPERLU_FREE(it); - if ( it = gsmv_comm->ind_torecv ) SUPERLU_FREE(it); + if ( (it = gsmv_comm->ind_tosend) ) SUPERLU_FREE(it); + if ( (it = gsmv_comm->ind_torecv) ) SUPERLU_FREE(it); SUPERLU_FREE(gsmv_comm->ptr_ind_tosend); SUPERLU_FREE(gsmv_comm->SendCounts); - if ( dt = gsmv_comm->val_tosend ) SUPERLU_FREE(dt); - if ( dt = gsmv_comm->val_torecv ) SUPERLU_FREE(dt); + if ( (dt = gsmv_comm->val_tosend) ) SUPERLU_FREE(dt); + if ( (dt = gsmv_comm->val_torecv) ) SUPERLU_FREE(dt); } diff --git a/SRC/pzgssvx.c b/SRC/pzgssvx.c index 2d9d6989..295f43d8 100644 --- a/SRC/pzgssvx.c +++ b/SRC/pzgssvx.c @@ -663,6 +663,7 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A, ABORT("Malloc fails for R[]."); ScalePermstruct->R = R; break; + default: break; } } @@ -1571,6 +1572,7 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A, case COL: SUPERLU_FREE(R); break; + default: break; } } diff --git a/SRC/pzgssvx3d.c b/SRC/pzgssvx3d.c index 9a73cdad..93a22bd8 100644 --- a/SRC/pzgssvx3d.c +++ b/SRC/pzgssvx3d.c @@ -664,6 +664,7 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, ABORT ("Malloc fails for R[]."); ScalePermstruct->R = R; break; + default: break; } } @@ -1530,6 +1531,7 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, case COL: SUPERLU_FREE (R); break; + default: break; } } diff --git a/SRC/pzgssvx_ABglobal.c b/SRC/pzgssvx_ABglobal.c index b4abcd28..343077f6 100644 --- a/SRC/pzgssvx_ABglobal.c +++ b/SRC/pzgssvx_ABglobal.c @@ -587,6 +587,7 @@ pzgssvx_ABglobal(superlu_dist_options_t *options, SuperMatrix *A, ABORT("Malloc fails for R[]."); ScalePermstruct->R = R; break; + default: break; } } @@ -1097,6 +1098,7 @@ pzgssvx_ABglobal(superlu_dist_options_t 
*options, SuperMatrix *A, case COL: SUPERLU_FREE(R); break; + default: break; } } if ( !factored || (factored && options->IterRefine) ) diff --git a/SRC/pzgstrf2.c b/SRC/pzgstrf2.c index 3b9d6137..2116d03a 100644 --- a/SRC/pzgstrf2.c +++ b/SRC/pzgstrf2.c @@ -366,8 +366,8 @@ pzgstrf2_trsm * The following functions are for the new pdgstrf2_ztrsm in the 3D code. *****************************************************************************/ static -int_t LpanelUpdate(int_t off0, int_t nsupc, doublecomplex* ublk_ptr, int_t ld_ujrow, - doublecomplex* lusup, int_t nsupr, SCT_t* SCT) +int_t LpanelUpdate(int_t off0, int nsupc, doublecomplex* ublk_ptr, int ld_ujrow, + doublecomplex* lusup, int nsupr, SCT_t* SCT) { int_t l = nsupr - off0; doublecomplex alpha = {1.0, 0.0}; @@ -378,7 +378,7 @@ int_t LpanelUpdate(int_t off0, int_t nsupc, doublecomplex* ublk_ptr, int_t ld_u for (int i = 0; i < CEILING(l, GT); ++i) { int_t off = i * GT; - int_t len = SUPERLU_MIN(GT, l - i * GT); + int len = SUPERLU_MIN(GT, l - i * GT); #if 1 #if defined (USE_VENDOR_BLAS) ztrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha, @@ -421,8 +421,8 @@ void Local_Zgstrf2(superlu_dist_options_t *options, int_t k, double thresh, int_t jfst = FstBlockC (k); int_t jlst = FstBlockC (k + 1); doublecomplex *lusup = Llu->Lnzval_bc_ptr[lk]; - int_t nsupc = SuperSize (k); - int_t nsupr; + int nsupc = SuperSize (k); + int nsupr; if (Llu->Lrowind_bc_ptr[lk]) nsupr = Llu->Lrowind_bc_ptr[lk][1]; else @@ -433,8 +433,8 @@ void Local_Zgstrf2(superlu_dist_options_t *options, int_t k, double thresh, int cols_left = nsupc; /* supernode size */ int_t u_diag_cnt = 0; int_t ld_ujrow = nsupc; /* leading dimension of ujrow */ - int_t incx = 1; - int_t incy = ld_ujrow; + int incx = 1; + int incy = ld_ujrow; for (int_t j = 0; j < jlst - jfst; ++j) /* for each column in panel */ { @@ -711,7 +711,7 @@ int_t zTrs2_ScatterU(int_t iukp, int_t rukp, int_t klst, int_t zTrs2_GatherTrsmScatter(int_t klst, int_t iukp, int_t rukp, int_t *usub, doublecomplex *uval, doublecomplex *tempv, - int_t knsupc, int_t nsupr, doublecomplex *lusup, + int_t knsupc, int nsupr, doublecomplex *lusup, Glu_persist_t *Glu_persist) /*glupersist for xsup for supersize*/ { doublecomplex alpha = {1.0, 0.0}; @@ -724,14 +724,14 @@ int_t zTrs2_GatherTrsmScatter(int_t klst, int_t iukp, int_t rukp, // printf("klst inside task%d\n", ); /*find ldu */ - int_t ldu = 0; + int ldu = 0; for (int_t jj = iukp; jj < iukp + nsupc; ++jj) { ldu = SUPERLU_MAX( klst - usub[jj], ldu) ; } /*pack U block into a dense Block*/ - int_t ncols = zTrs2_GatherU(iukp, rukp, klst, nsupc, ldu, usub, + int ncols = zTrs2_GatherU(iukp, rukp, klst, nsupc, ldu, usub, uval, tempv); /*now call ztrsm on packed dense block*/ diff --git a/SRC/pzutil.c b/SRC/pzutil.c index 2b56925a..a018b59b 100644 --- a/SRC/pzutil.c +++ b/SRC/pzutil.c @@ -502,7 +502,7 @@ void zSolveFinalize(superlu_dist_options_t *options, SOLVEstruct_t *SOLVEstruct) SUPERLU_FREE(SOLVEstruct->inv_perm_c); SUPERLU_FREE(SOLVEstruct->diag_procs); SUPERLU_FREE(SOLVEstruct->diag_len); - if ( it = SOLVEstruct->A_colind_gsmv ) SUPERLU_FREE(it); + if ( (it = SOLVEstruct->A_colind_gsmv) ) SUPERLU_FREE(it); options->SolveInitialized = NO; } /* zSolveFinalize */ diff --git a/SRC/superlu_ddefs.h b/SRC/superlu_ddefs.h index 05b71fdc..39184d60 100644 --- a/SRC/superlu_ddefs.h +++ b/SRC/superlu_ddefs.h @@ -568,13 +568,13 @@ extern int updateDirtyBit(int_t k0, HyP_t* HyP, gridinfo_t* grid); /* from scatter.h */ extern void dblock_gemm_scatter( int_t lb, int_t j, Ublock_info_t 
*Ublock_info, - Remain_info_t *Remain_info, double *L_mat, int_t ldl, - double *U_mat, int_t ldu, double *bigV, + Remain_info_t *Remain_info, double *L_mat, int ldl, + double *U_mat, int ldu, double *bigV, // int_t jj0, int_t knsupc, int_t klst, int_t *lsub, int_t *usub, int_t ldt, int_t thread_id, - int_t *indirect, int_t *indirect2, + int *indirect, int *indirect2, int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr, int_t **Ufstnz_br_ptr, double **Unzval_br_ptr, int_t *xsup, gridinfo_t *, SuperLUStat_t * @@ -594,7 +594,7 @@ dblock_gemm_scatter_lock( int_t lb, int_t j, omp_lock_t* lock, int_t knsupc, int_t klst, int_t *lsub, int_t *usub, int_t ldt, int_t thread_id, - int_t *indirect, int_t *indirect2, + int *indirect, int *indirect2, int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr, int_t **Ufstnz_br_ptr, double **Unzval_br_ptr, int_t *xsup, gridinfo_t * @@ -608,7 +608,7 @@ extern int_t dblock_gemm_scatterTopLeft( int_t lb, int_t j, double* bigV, int_t knsupc, int_t klst, int_t* lsub, int_t * usub, int_t ldt, - int_t* indirect, int_t* indirect2, + int* indirect, int* indirect2, HyP_t* HyP, LUstruct_t *, gridinfo_t*, SCT_t*SCT, SuperLUStat_t * ); @@ -616,21 +616,21 @@ extern int_t dblock_gemm_scatterTopRight( int_t lb, int_t j, double* bigV, int_t knsupc, int_t klst, int_t* lsub, int_t * usub, int_t ldt, - int_t* indirect, int_t* indirect2, + int* indirect, int* indirect2, HyP_t* HyP, LUstruct_t *, gridinfo_t*, SCT_t*SCT, SuperLUStat_t * ); extern int_t dblock_gemm_scatterBottomLeft( int_t lb, int_t j, double* bigV, int_t knsupc, int_t klst, int_t* lsub, int_t * usub, int_t ldt, - int_t* indirect, int_t* indirect2, + int* indirect, int* indirect2, HyP_t* HyP, LUstruct_t *, gridinfo_t*, SCT_t*SCT, SuperLUStat_t * ); extern int_t dblock_gemm_scatterBottomRight( int_t lb, int_t j, double* bigV, int_t knsupc, int_t klst, int_t* lsub, int_t * usub, int_t ldt, - int_t* indirect, int_t* indirect2, + int* indirect, int* indirect2, HyP_t* HyP, LUstruct_t *, gridinfo_t*, SCT_t*SCT, SuperLUStat_t * ); @@ -713,7 +713,7 @@ extern int_t dTrs2_ScatterU(int_t iukp, int_t rukp, int_t klst, double* uval, double *tempv); extern int_t dTrs2_GatherTrsmScatter(int_t klst, int_t iukp, int_t rukp, int_t *usub, double* uval, double *tempv, - int_t knsupc, int_t nsupr, double* lusup, + int_t knsupc, int nsupr, double* lusup, Glu_persist_t *Glu_persist) ; extern void pdgstrs2 #ifdef _CRAY @@ -797,21 +797,21 @@ int_t dzRecvUPanel(int_t k, int_t sender, double alpha, /* from communication_aux.h */ extern int_t dIBcast_LPanel (int_t k, int_t k0, int_t* lsub, double* lusup, gridinfo_t *, int* msgcnt, MPI_Request *, - int_t **ToSendR, int_t *xsup, int ); + int **ToSendR, int_t *xsup, int ); extern int_t dBcast_LPanel(int_t k, int_t k0, int_t* lsub, double* lusup, - gridinfo_t *, int* msgcnt, int_t **ToSendR, + gridinfo_t *, int* msgcnt, int **ToSendR, int_t *xsup , SCT_t*, int); extern int_t dIBcast_UPanel(int_t k, int_t k0, int_t* usub, double* uval, gridinfo_t *, int* msgcnt, MPI_Request *, - int_t *ToSendD, int ); + int *ToSendD, int ); extern int_t dBcast_UPanel(int_t k, int_t k0, int_t* usub, double* uval, - gridinfo_t *, int* msgcnt, int_t *ToSendD, SCT_t*, int); + gridinfo_t *, int* msgcnt, int *ToSendD, SCT_t*, int); extern int_t dIrecv_LPanel (int_t k, int_t k0, int_t* Lsub_buf, double* Lval_buf, gridinfo_t *, MPI_Request *, LocalLU_t *, int); extern int_t dIrecv_UPanel(int_t k, int_t k0, int_t* Usub_buf, double*, LocalLU_t *, gridinfo_t*, MPI_Request *, int); -extern int_t Wait_LSend(int_t k, gridinfo_t *grid, int_t 
**ToSendR, +extern int_t Wait_LSend(int_t k, gridinfo_t *grid, int **ToSendR, MPI_Request *s, SCT_t*); extern int_t Wait_USend(MPI_Request *, gridinfo_t *, SCT_t *); extern int_t dWait_URecv(MPI_Request *, int* msgcnt, SCT_t *); diff --git a/SRC/superlu_defs.h b/SRC/superlu_defs.h index b45e2659..23c91042 100644 --- a/SRC/superlu_defs.h +++ b/SRC/superlu_defs.h @@ -84,7 +84,7 @@ at the top-level directory. #elif defined (_LONGINT) typedef int64_t int_t; #define mpi_int_t MPI_LONG_LONG_INT - #define IFMT "%ld" + #define IFMT "%lld" #else /* Default */ typedef int int_t; #define mpi_int_t MPI_INT @@ -864,8 +864,8 @@ typedef struct int_t *iperm_c_supno; int_t *iperm_u; int_t *perm_u; - int_t *indirect; - int_t *indirect2; + int *indirect; + int *indirect2; } factNodelists_t; @@ -1047,6 +1047,8 @@ extern int_t get_min (int_t *, int_t); extern int compare_pair (const void *, const void *); extern int_t static_partition (struct superlu_pair *, int_t, int_t *, int_t, int_t *, int_t *, int); +extern int get_acc_offload(); + /* Routines for debugging */ extern void print_panel_seg_dist(int_t, int_t, int_t, int_t, int_t *, int_t *); diff --git a/SRC/superlu_zdefs.h b/SRC/superlu_zdefs.h index f4c86e0b..eeeb2314 100644 --- a/SRC/superlu_zdefs.h +++ b/SRC/superlu_zdefs.h @@ -571,13 +571,13 @@ extern int updateDirtyBit(int_t k0, HyP_t* HyP, gridinfo_t* grid); /* from scatter.h */ extern void zblock_gemm_scatter( int_t lb, int_t j, Ublock_info_t *Ublock_info, - Remain_info_t *Remain_info, doublecomplex *L_mat, int_t ldl, - doublecomplex *U_mat, int_t ldu, doublecomplex *bigV, + Remain_info_t *Remain_info, doublecomplex *L_mat, int ldl, + doublecomplex *U_mat, int ldu, doublecomplex *bigV, // int_t jj0, int_t knsupc, int_t klst, int_t *lsub, int_t *usub, int_t ldt, int_t thread_id, - int_t *indirect, int_t *indirect2, + int *indirect, int *indirect2, int_t **Lrowind_bc_ptr, doublecomplex **Lnzval_bc_ptr, int_t **Ufstnz_br_ptr, doublecomplex **Unzval_br_ptr, int_t *xsup, gridinfo_t *, SuperLUStat_t * @@ -597,7 +597,7 @@ zblock_gemm_scatter_lock( int_t lb, int_t j, omp_lock_t* lock, int_t knsupc, int_t klst, int_t *lsub, int_t *usub, int_t ldt, int_t thread_id, - int_t *indirect, int_t *indirect2, + int *indirect, int *indirect2, int_t **Lrowind_bc_ptr, doublecomplex **Lnzval_bc_ptr, int_t **Ufstnz_br_ptr, doublecomplex **Unzval_br_ptr, int_t *xsup, gridinfo_t * @@ -611,7 +611,7 @@ extern int_t zblock_gemm_scatterTopLeft( int_t lb, int_t j, doublecomplex* bigV, int_t knsupc, int_t klst, int_t* lsub, int_t * usub, int_t ldt, - int_t* indirect, int_t* indirect2, + int* indirect, int* indirect2, HyP_t* HyP, LUstruct_t *, gridinfo_t*, SCT_t*SCT, SuperLUStat_t * ); @@ -619,21 +619,21 @@ extern int_t zblock_gemm_scatterTopRight( int_t lb, int_t j, doublecomplex* bigV, int_t knsupc, int_t klst, int_t* lsub, int_t * usub, int_t ldt, - int_t* indirect, int_t* indirect2, + int* indirect, int* indirect2, HyP_t* HyP, LUstruct_t *, gridinfo_t*, SCT_t*SCT, SuperLUStat_t * ); extern int_t zblock_gemm_scatterBottomLeft( int_t lb, int_t j, doublecomplex* bigV, int_t knsupc, int_t klst, int_t* lsub, int_t * usub, int_t ldt, - int_t* indirect, int_t* indirect2, + int* indirect, int* indirect2, HyP_t* HyP, LUstruct_t *, gridinfo_t*, SCT_t*SCT, SuperLUStat_t * ); extern int_t zblock_gemm_scatterBottomRight( int_t lb, int_t j, doublecomplex* bigV, int_t knsupc, int_t klst, int_t* lsub, int_t * usub, int_t ldt, - int_t* indirect, int_t* indirect2, + int* indirect, int* indirect2, HyP_t* HyP, LUstruct_t *, gridinfo_t*, SCT_t*SCT, 
SuperLUStat_t * ); @@ -716,7 +716,7 @@ extern int_t zTrs2_ScatterU(int_t iukp, int_t rukp, int_t klst, doublecomplex* uval, doublecomplex *tempv); extern int_t zTrs2_GatherTrsmScatter(int_t klst, int_t iukp, int_t rukp, int_t *usub, doublecomplex* uval, doublecomplex *tempv, - int_t knsupc, int_t nsupr, doublecomplex* lusup, + int_t knsupc, int nsupr, doublecomplex* lusup, Glu_persist_t *Glu_persist) ; extern void pzgstrs2 #ifdef _CRAY @@ -800,21 +800,21 @@ int_t zzRecvUPanel(int_t k, int_t sender, doublecomplex alpha, /* from communication_aux.h */ extern int_t zIBcast_LPanel (int_t k, int_t k0, int_t* lsub, doublecomplex* lusup, gridinfo_t *, int* msgcnt, MPI_Request *, - int_t **ToSendR, int_t *xsup, int ); + int **ToSendR, int_t *xsup, int ); extern int_t zBcast_LPanel(int_t k, int_t k0, int_t* lsub, doublecomplex* lusup, - gridinfo_t *, int* msgcnt, int_t **ToSendR, + gridinfo_t *, int* msgcnt, int **ToSendR, int_t *xsup , SCT_t*, int); extern int_t zIBcast_UPanel(int_t k, int_t k0, int_t* usub, doublecomplex* uval, gridinfo_t *, int* msgcnt, MPI_Request *, - int_t *ToSendD, int ); + int *ToSendD, int ); extern int_t zBcast_UPanel(int_t k, int_t k0, int_t* usub, doublecomplex* uval, - gridinfo_t *, int* msgcnt, int_t *ToSendD, SCT_t*, int); + gridinfo_t *, int* msgcnt, int *ToSendD, SCT_t*, int); extern int_t zIrecv_LPanel (int_t k, int_t k0, int_t* Lsub_buf, doublecomplex* Lval_buf, gridinfo_t *, MPI_Request *, LocalLU_t *, int); extern int_t zIrecv_UPanel(int_t k, int_t k0, int_t* Usub_buf, doublecomplex*, LocalLU_t *, gridinfo_t*, MPI_Request *, int); -extern int_t Wait_LSend(int_t k, gridinfo_t *grid, int_t **ToSendR, +extern int_t Wait_LSend(int_t k, gridinfo_t *grid, int **ToSendR, MPI_Request *s, SCT_t*); extern int_t Wait_USend(MPI_Request *, gridinfo_t *, SCT_t *); extern int_t zWait_URecv(MPI_Request *, int* msgcnt, SCT_t *); diff --git a/SRC/supernodalForest.c b/SRC/supernodalForest.c index f5bc3dda..e033d9b5 100644 --- a/SRC/supernodalForest.c +++ b/SRC/supernodalForest.c @@ -35,8 +35,9 @@ sForest_t** getForests( int_t maxLvl, int_t nsupers, int_t*setree, treeList_t* { return getGreedyLoadBalForests( maxLvl, nsupers, setree, treeList); } - + return 0; } + double calcNodeListWeight(int_t nnodes, int_t* nodeList, treeList_t* treeList) { double trWeight = 0; diff --git a/SRC/supernodal_etree.c b/SRC/supernodal_etree.c index f32a5c6c..225d224d 100644 --- a/SRC/supernodal_etree.c +++ b/SRC/supernodal_etree.c @@ -108,6 +108,7 @@ int free_treelist(int_t nsuper, treeList_t* treeList) SUPERLU_FREE(treeList[i].childrenList); } SUPERLU_FREE(treeList); + return 0; } int_t estimateWeight(int_t nsupers, int_t*setree, treeList_t* treeList, int_t* xsup) @@ -165,6 +166,8 @@ int_t estimateWeight(int_t nsupers, int_t*setree, treeList_t* treeList, int_t* x } } + + return 0; } /* estimateWeight */ diff --git a/SRC/symbfact.c b/SRC/symbfact.c index 648853a1..7cfa0bd8 100644 --- a/SRC/symbfact.c +++ b/SRC/symbfact.c @@ -528,8 +528,8 @@ static int_t column_dfs */ lsub[nextl++] = krow; /* krow is indexed into A */ if ( nextl >= nzlmax ) { - if ( mem_error = symbfact_SubXpand(A->ncol, jcol, nextl, (MemType) LSUB, - &nzlmax, Glu_freeable) ) + if ( (mem_error = symbfact_SubXpand(A->ncol, jcol, nextl, (MemType) LSUB, + &nzlmax, Glu_freeable)) ) return (mem_error); lsub = Glu_freeable->lsub; } @@ -570,10 +570,10 @@ static int_t column_dfs if ( chperm == EMPTY ) { lsub[nextl++] = kchild; if ( nextl >= nzlmax ) { - if ( mem_error = + if ( (mem_error = symbfact_SubXpand(A->ncol, jcol, nextl, (MemType) 
LSUB, &nzlmax, - Glu_freeable) ) + Glu_freeable)) ) return (mem_error); lsub = Glu_freeable->lsub; } @@ -786,8 +786,8 @@ static int_t set_usub new_next = nextu + nseg; while ( new_next > nzumax ) { - if (mem_error = symbfact_SubXpand(n, jcol, nextu, (MemType) USUB, &nzumax, - Glu_freeable)) + if ( (mem_error = symbfact_SubXpand(n, jcol, nextu, (MemType) USUB, &nzumax, + Glu_freeable)) ) return (mem_error); usub = Glu_freeable->usub; } diff --git a/SRC/treeFactorization.c b/SRC/treeFactorization.c index e01d58ab..4806b78a 100644 --- a/SRC/treeFactorization.c +++ b/SRC/treeFactorization.c @@ -258,6 +258,7 @@ int freeCommRequestsArr(int_t mxLeafNode, commRequests_t** comReqss) SUPERLU_FREE(comReqss[i]); } SUPERLU_FREE(comReqss); + return 0; } int_t initFactStat(int_t nsupers, factStat_t* factStat) @@ -295,6 +296,7 @@ int freeFactStat(factStat_t* factStat) SUPERLU_FREE(factStat->IbcastPanel_L); SUPERLU_FREE(factStat->IbcastPanel_U); SUPERLU_FREE(factStat->gpuLUreduced); + return 0; } int_t initFactNodelists(int_t ldt, int_t num_threads, int_t nsupers, @@ -302,8 +304,13 @@ int_t initFactNodelists(int_t ldt, int_t num_threads, int_t nsupers, { fNlists->iperm_u = INT_T_ALLOC(nsupers); fNlists->perm_u = INT_T_ALLOC(nsupers); +#if 0 // Sherry: change to int type fNlists->indirect = INT_T_ALLOC(num_threads * ldt); fNlists->indirect2 = INT_T_ALLOC(num_threads * ldt); +#else + fNlists->indirect = (int*) SUPERLU_MALLOC(num_threads * ldt * sizeof(int)); + fNlists->indirect2 = (int*) SUPERLU_MALLOC(num_threads * ldt * sizeof(int)); +#endif return 0; } @@ -344,6 +351,7 @@ int freeMsgsArr(int_t numLA, msgs_t **msgss) SUPERLU_FREE(msgss[i]); } SUPERLU_FREE(msgss); + return 0; } int_t initPackLUInfo(int_t nsupers, packLUInfo_t* packLUInfo) diff --git a/SRC/util.c b/SRC/util.c index eb962f92..76abe96f 100644 --- a/SRC/util.c +++ b/SRC/util.c @@ -267,6 +267,7 @@ int DeAllocLlu_3d(int_t n, LUstruct_t * LUstruct, gridinfo3d_t* grid3d) SUPERLU_FREE(Llu->ToSendD); for (i = 0; i < nbc; ++i) SUPERLU_FREE(Llu->ToSendR[i]); SUPERLU_FREE(Llu->ToSendR); + return 0; } /*! \brief Allocate storage in ScalePermstruct */ @@ -296,6 +297,7 @@ void ScalePermstructFree(ScalePermstruct_t *ScalePermstruct) SUPERLU_FREE(ScalePermstruct->R); SUPERLU_FREE(ScalePermstruct->C); break; + default: break; } } @@ -1427,6 +1429,7 @@ int_t partition( int_t* a, int_t l, int_t r, int_t dir) { t = a[l]; a[l] = a[j]; a[j] = t; return j; } + return 0; } @@ -1483,6 +1486,8 @@ int_t partitionM( int_t* a, int_t l, int_t r, int_t lda, int_t dir, int_t dims) } return j; } + + return 0; } diff --git a/SRC/zbinary_io.c b/SRC/zbinary_io.c index d49b9a8a..4c670505 100644 --- a/SRC/zbinary_io.c +++ b/SRC/zbinary_io.c @@ -18,6 +18,7 @@ zread_binary(FILE *fp, int_t *m, int_t *n, int_t *nnz, nnz_read = fread(*nzval, dsize, (size_t) (2 * (*nnz)), fp); printf("# of doubles fread: %d\n", nnz_read); fclose(fp); + return 0; } int @@ -37,4 +38,5 @@ zwrite_binary(int_t n, int_t nnz, printf("dump binary file ... 
# of doubles fwrite: %d\n", nnz_written); assert(nnz_written == 2*nnz); fclose(fp1); + return 0; } diff --git a/SRC/zcommunication_aux.c b/SRC/zcommunication_aux.c index 4f78ed9a..2410057a 100644 --- a/SRC/zcommunication_aux.c +++ b/SRC/zcommunication_aux.c @@ -28,7 +28,7 @@ int_t zIBcast_LPanel /*broadcasts index array lsub and non-zero value array lusup of a newly factored L column to my process row*/ (int_t k, int_t k0, int_t* lsub, doublecomplex* lusup, gridinfo_t *grid, - int* msgcnt, MPI_Request *send_req, int_t **ToSendR, int_t *xsup, + int* msgcnt, MPI_Request *send_req, int **ToSendR, int_t *xsup, int tag_ub) { int_t Pc = grid->npcol; @@ -68,7 +68,7 @@ int_t zBcast_LPanel /*broadcasts index array lsub and non-zero value array lusup of a newly factored L column to my process row*/ (int_t k, int_t k0, int_t* lsub, doublecomplex* lusup, gridinfo_t *grid, - int* msgcnt, int_t **ToSendR, int_t *xsup , SCT_t* SCT, + int* msgcnt, int **ToSendR, int_t *xsup , SCT_t* SCT, int tag_ub) { unsigned long long t1 = _rdtsc(); @@ -109,7 +109,7 @@ int_t zIBcast_UPanel /*asynchronously broadcasts U panel to my process row */ (int_t k, int_t k0, int_t* usub, doublecomplex* uval, gridinfo_t *grid, - int* msgcnt, MPI_Request *send_req_u, int_t *ToSendD, int tag_ub ) + int* msgcnt, MPI_Request *send_req_u, int *ToSendD, int tag_ub ) { int_t iam = grid->iam; @@ -152,7 +152,7 @@ int_t zIBcast_UPanel /*Synchronously broadcasts U panel to my process row */ int_t zBcast_UPanel(int_t k, int_t k0, int_t* usub, doublecomplex* uval, gridinfo_t *grid, - int* msgcnt, int_t *ToSendD, SCT_t* SCT, int tag_ub) + int* msgcnt, int *ToSendD, SCT_t* SCT, int tag_ub) { unsigned long long t1 = _rdtsc(); diff --git a/SRC/zscatter3d.c b/SRC/zscatter3d.c index 7e4b18fd..00fef67b 100644 --- a/SRC/zscatter3d.c +++ b/SRC/zscatter3d.c @@ -29,7 +29,7 @@ at the top-level directory.
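/* [Editor's illustration, not part of this patch] The int_t -> int conversions
 * in zcommunication_aux.c above (and their double-precision counterparts
 * earlier) exist so the ToRecv/ToSendD/ToSendR flag arrays can be broadcast as
 * MPI_INT. MPI performs no type conversion: the datatype handle must describe
 * the buffer's actual C element type, and with 64-bit indices (_LONGINT makes
 * int_t an int64_t, per the superlu_defs.h hunk above) pairing an int_t buffer
 * with MPI_INT would transfer the wrong bytes. A minimal sketch, assuming an
 * initialized communicator `comm` and a count `nsupers`:
 *
 *     int *ToRecv = (int *) SUPERLU_MALLOC(nsupers * sizeof(int));
 *     for (int i = 0; i < nsupers; ++i) ToRecv[i] = 0;
 *     MPI_Bcast(ToRecv, nsupers, MPI_INT, 0, comm);  // element type matches
 *     SUPERLU_FREE(ToRecv);
 */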
static void scatter_u (int_t ib, int_t jb, int_t nsupc, int_t iukp, int_t *xsup, int_t klst, int_t nbrow, int_t lptr, int_t temp_nbrow, int_t *lsub, int_t *usub, doublecomplex *tempv, - int_t *indirect, + int *indirect, int_t **Ufstnz_br_ptr, doublecomplex **Unzval_br_ptr, gridinfo_t *grid); @@ -78,15 +78,15 @@ void zblock_gemm_scatter( int_t lb, int_t j, Ublock_info_t *Ublock_info, Remain_info_t *Remain_info, - doublecomplex *L_mat, int_t ldl, - doublecomplex *U_mat, int_t ldu, + doublecomplex *L_mat, int ldl, + doublecomplex *U_mat, int ldu, doublecomplex *bigV, // int_t jj0, int_t knsupc, int_t klst, int_t *lsub, int_t *usub, int_t ldt, int_t thread_id, - int_t *indirect, - int_t *indirect2, + int *indirect, + int *indirect2, int_t **Lrowind_bc_ptr, doublecomplex **Lnzval_bc_ptr, int_t **Ufstnz_br_ptr, doublecomplex **Unzval_br_ptr, int_t *xsup, gridinfo_t *grid, @@ -102,8 +102,8 @@ zblock_gemm_scatter( int_t lb, int_t j, #else thread_id = 0; #endif - int_t *indirect_thread = indirect + ldt * thread_id; - int_t *indirect2_thread = indirect2 + ldt * thread_id; + int *indirect_thread = indirect + ldt * thread_id; + int *indirect2_thread = indirect2 + ldt * thread_id; doublecomplex *tempv1 = bigV + thread_id * ldt * ldt; /* Getting U block information */ @@ -113,7 +113,7 @@ zblock_gemm_scatter( int_t lb, int_t j, int_t nsupc = SuperSize(jb); int_t ljb = LBj (jb, grid); int_t st_col; - int_t ncols; + int ncols; // if (j > jj0) if (j > 0) { @@ -129,7 +129,7 @@ zblock_gemm_scatter( int_t lb, int_t j, /* Getting L block information */ int_t lptr = Remain_info[lb].lptr; int_t ib = Remain_info[lb].ib; - int_t temp_nbrow = lsub[lptr + 1]; + int temp_nbrow = lsub[lptr + 1]; lptr += LB_DESCRIPTOR; int_t cum_nrow = (lb == 0 ? 0 : Remain_info[lb - 1].FullRow); /* Getting L block information */ @@ -219,8 +219,8 @@ zblock_gemm_scatter_lock( int_t lb, int_t j, int_t knsupc, int_t klst, int_t *lsub, int_t *usub, int_t ldt, int_t thread_id, - int_t *indirect, - int_t *indirect2, + int *indirect, + int *indirect2, int_t **Lrowind_bc_ptr, doublecomplex **Lnzval_bc_ptr, int_t **Ufstnz_br_ptr, doublecomplex **Unzval_br_ptr, int_t *xsup, gridinfo_t *grid @@ -229,8 +229,8 @@ zblock_gemm_scatter_lock( int_t lb, int_t j, #endif ) { - int_t *indirect_thread = indirect + ldt * thread_id; - int_t *indirect2_thread = indirect2 + ldt * thread_id; + int *indirect_thread = indirect + ldt * thread_id; + int *indirect2_thread = indirect2 + ldt * thread_id; doublecomplex *tempv1 = bigV + thread_id * ldt * ldt; /* Getting U block information */ @@ -348,7 +348,7 @@ int_t zblock_gemm_scatterTopLeft( int_t lb, /* block number in L */ int_t j, /* block number in U */ doublecomplex* bigV, int_t knsupc, int_t klst, int_t* lsub, int_t * usub, int_t ldt, - int_t* indirect, int_t* indirect2, HyP_t* HyP, + int* indirect, int* indirect2, HyP_t* HyP, LUstruct_t *LUstruct, gridinfo_t* grid, SCT_t*SCT, SuperLUStat_t *stat @@ -387,7 +387,7 @@ int_t zblock_gemm_scatterTopLeft( int_t lb, /* block number in L */ int_t zblock_gemm_scatterTopRight( int_t lb, int_t j, doublecomplex* bigV, int_t knsupc, int_t klst, int_t* lsub, - int_t * usub, int_t ldt, int_t* indirect, int_t* indirect2, + int_t * usub, int_t ldt, int* indirect, int* indirect2, HyP_t* HyP, LUstruct_t *LUstruct, gridinfo_t* grid, @@ -422,7 +422,7 @@ int_t zblock_gemm_scatterTopRight( int_t lb, int_t j, int_t zblock_gemm_scatterBottomLeft( int_t lb, int_t j, doublecomplex* bigV, int_t knsupc, int_t klst, int_t* lsub, - int_t * usub, int_t ldt, int_t* indirect, int_t* indirect2, + 
int_t * usub, int_t ldt, int* indirect, int* indirect2, HyP_t* HyP, LUstruct_t *LUstruct, gridinfo_t* grid, @@ -459,7 +459,7 @@ int_t zblock_gemm_scatterBottomLeft( int_t lb, int_t j, int_t zblock_gemm_scatterBottomRight( int_t lb, int_t j, doublecomplex* bigV, int_t knsupc, int_t klst, int_t* lsub, - int_t * usub, int_t ldt, int_t* indirect, int_t* indirect2, + int_t * usub, int_t ldt, int* indirect, int* indirect2, HyP_t* HyP, LUstruct_t *LUstruct, gridinfo_t* grid, @@ -513,8 +513,8 @@ scatter_l (int_t ib, int_t *usub, int_t *lsub, double *tempv, - int_t *indirect_thread, int_t *indirect2, - int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr, gridinfo_t *grid) + int *indirect_thread, int_t *indirect2, + int **Lrowind_bc_ptr, double **Lnzval_bc_ptr, gridinfo_t *grid) { int_t rel, i, segsize, jj; double *nzval; @@ -588,7 +588,7 @@ scatter_u (int_t ib, int_t *lsub, int_t *usub, doublecomplex *tempv, - int_t *indirect, + int *indirect, int_t **Ufstnz_br_ptr, doublecomplex **Unzval_br_ptr, gridinfo_t *grid) { #ifdef PI_DEBUG diff --git a/SRC/zsp_blas2_dist.c b/SRC/zsp_blas2_dist.c index 4f8990eb..54a55bd3 100644 --- a/SRC/zsp_blas2_dist.c +++ b/SRC/zsp_blas2_dist.c @@ -426,9 +426,8 @@ sp_zgemv_dist(char *trans, doublecomplex alpha, SuperMatrix *A, } /* Quick return if possible. */ - if (A->nrow == 0 || A->ncol == 0 || - z_eq(&alpha, &comp_zero) && - z_eq(&beta, &comp_one)) + if ( A->nrow == 0 || A->ncol == 0 || + (z_eq(&alpha, &comp_zero) && z_eq(&beta, &comp_one)) ) return 0; diff --git a/SRC/ztreeFactorization.c b/SRC/ztreeFactorization.c index c33272e0..0cda8e56 100644 --- a/SRC/ztreeFactorization.c +++ b/SRC/ztreeFactorization.c @@ -94,6 +94,7 @@ int zLluBufFreeArr(int_t numLA, zLUValSubBuf_t **LUvsbs) SUPERLU_FREE(LUvsbs[i]); } SUPERLU_FREE(LUvsbs); + return 0; } @@ -217,8 +218,8 @@ int_t zdenseTreeFactor( Remain_info_t* Remain_info = packLUInfo->Remain_info; uPanelInfo_t* uPanelInfo = packLUInfo->uPanelInfo; lPanelInfo_t* lPanelInfo = packLUInfo->lPanelInfo; - int_t* indirect = fNlists->indirect; - int_t* indirect2 = fNlists->indirect2; + int* indirect = fNlists->indirect; + int* indirect2 = fNlists->indirect2; /*Schurcomplement Update*/ int_t nub = uPanelInfo->nub; int_t nlb = lPanelInfo->nlb; @@ -489,8 +490,8 @@ int_t zsparseTreeFactor_ASYNC( lPanelInfo_t* lPanelInfo = packLUInfo->lPanelInfo; int_t *lsub = lPanelInfo->lsub; int_t *usub = uPanelInfo->usub; - int_t* indirect = fNlists->indirect; - int_t* indirect2 = fNlists->indirect2; + int* indirect = fNlists->indirect; + int* indirect2 = fNlists->indirect2; /*Schurcomplement Update*/ diff --git a/SRC/ztrfAux.c b/SRC/ztrfAux.c index e96cc496..921ccc89 100644 --- a/SRC/ztrfAux.c +++ b/SRC/ztrfAux.c @@ -120,7 +120,7 @@ int_t zSchurComplementSetup( LocalLU_t *Llu = LUstruct->Llu; int_t* xsup = Glu_persist->xsup; - int_t* ToRecv = Llu->ToRecv; + int* ToRecv = Llu->ToRecv; int_t iam = grid->iam; int_t myrow = MYROW (iam, grid); @@ -355,7 +355,7 @@ int_t zSchurComplementSetupGPU( LocalLU_t *Llu = LUstruct->Llu; int_t* xsup = Glu_persist->xsup; - int_t* ToRecv = Llu->ToRecv; + int* ToRecv = Llu->ToRecv; int_t iam = grid->iam; int_t myrow = MYROW (iam, grid); diff --git a/SRC/ztrfCommWrapper.c b/SRC/ztrfCommWrapper.c index a9cf4f11..11d65284 100644 --- a/SRC/ztrfCommWrapper.c +++ b/SRC/ztrfCommWrapper.c @@ -134,7 +134,7 @@ int_t zLPanelTrSolve( int_t k, int_t* factored_L, int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid); int_t kcol = PCOL (k, grid); int_t mycol = MYCOL (iam, grid); - int_t nsupc = SuperSize(k); + int nsupc = SuperSize(k); 
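/* [Editor's illustration, not part of this patch] In zLPanelTrSolve below, the
 * l panel rows are processed in blocks of at most BL = 32 rows, small enough
 * to be dispatched as individual OpenMP tasks, one ztrsm_ call per block; len
 * becomes a plain int because the BLAS takes its dimensions by address as int.
 * The blocking arithmetic as a stand-alone sketch: with l = 70 and BL = 32 the
 * block lengths come out 32, 32, 6:
 *
 *     #define BL 32
 *     for (int i = 0; i < CEILING(l, BL); ++i) {
 *         int off = i * BL;                    // first row of this block
 *         int len = SUPERLU_MIN(BL, l - off);  // full blocks, then remainder
 *         // triangular solve on rows [off, off + len) of the panel
 *     }
 */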
/*factor the L panel*/ if (mycol == kcol && iam != pkk) @@ -142,7 +142,7 @@ int_t zLPanelTrSolve( int_t k, int_t* factored_L, // factored_L[k] = 1; int_t lk = LBj (k, grid); doublecomplex *lusup = Llu->Lnzval_bc_ptr[lk]; - int_t nsupr; + int nsupr; if (Llu->Lrowind_bc_ptr[lk]) nsupr = Llu->Lrowind_bc_ptr[lk][1]; else @@ -158,7 +158,7 @@ int_t zLPanelTrSolve( int_t k, int_t* factored_L, int_t l = nsupr; doublecomplex* ublk_ptr = BlockUFactor; - int_t ld_ujrow = nsupc; + int ld_ujrow = nsupc; // unsigned long long t1 = _rdtsc(); @@ -170,7 +170,7 @@ int_t zLPanelTrSolve( int_t k, int_t* factored_L, { int_t off = i * BL; // Sherry: int_t len = MY_MIN(BL, l - i * BL); - int_t len = SUPERLU_MIN(BL, l - i * BL); + int len = SUPERLU_MIN(BL, l - i * BL); #if 1 #if defined (USE_VENDOR_BLAS) @@ -197,7 +197,7 @@ int_t zLPanelTrSolve( int_t k, int_t* factored_L, factored_L[k] = 1; int_t lk = LBj (k, grid); doublecomplex *lusup = Llu->Lnzval_bc_ptr[lk]; - int_t nsupr; + int nsupr; if (Llu->Lrowind_bc_ptr[lk]) nsupr = Llu->Lrowind_bc_ptr[lk][1]; else @@ -208,7 +208,7 @@ int_t zLPanelTrSolve( int_t k, int_t* factored_L, int_t l = nsupr - nsupc; doublecomplex* ublk_ptr = BlockUFactor; - int_t ld_ujrow = nsupc; + int ld_ujrow = nsupc; // printf("%d: L update \n",k ); #define BL 32 @@ -217,7 +217,7 @@ int_t zLPanelTrSolve( int_t k, int_t* factored_L, { int_t off = i * BL; // Sherry: int_t len = MY_MIN(BL, l - i * BL); - int_t len = SUPERLU_MIN(BL, (l - i * BL)); + int len = SUPERLU_MIN(BL, (l - i * BL)); #pragma omp task { #if 1 @@ -405,8 +405,8 @@ int_t zIBcastRecvLPanel( Glu_persist_t *Glu_persist = LUstruct->Glu_persist; LocalLU_t *Llu = LUstruct->Llu; int_t* xsup = Glu_persist->xsup; - int_t** ToSendR = Llu->ToSendR; - int_t* ToRecv = Llu->ToRecv; + int** ToSendR = Llu->ToSendR; + int* ToRecv = Llu->ToRecv; int_t iam = grid->iam; int_t Pc = grid->npcol; int_t mycol = MYCOL (iam, grid); @@ -462,8 +462,8 @@ int_t zIBcastRecvUPanel(int_t k, int_t k0, int* msgcnt, { LocalLU_t *Llu = LUstruct->Llu; - int_t* ToSendD = Llu->ToSendD; - int_t* ToRecv = Llu->ToRecv; + int* ToSendD = Llu->ToSendD; + int* ToRecv = Llu->ToRecv; int_t iam = grid->iam; int_t Pr = grid->nprow; int_t myrow = MYROW (iam, grid); @@ -508,8 +508,8 @@ int_t zWaitL( int_t k, int* msgcnt, int* msgcntU, gridinfo_t *grid, LUstruct_t *LUstruct, SCT_t *SCT) { LocalLU_t *Llu = LUstruct->Llu; - int_t** ToSendR = Llu->ToSendR; - int_t* ToRecv = Llu->ToRecv; + int** ToSendR = Llu->ToSendR; + int* ToRecv = Llu->ToRecv; int_t iam = grid->iam; int_t mycol = MYCOL (iam, grid); int_t kcol = PCOL (k, grid); @@ -537,8 +537,8 @@ int_t zWaitU( int_t k, int* msgcnt, { LocalLU_t *Llu = LUstruct->Llu; - int_t* ToRecv = Llu->ToRecv; - int_t* ToSendD = Llu->ToSendD; + int* ToRecv = Llu->ToRecv; + int* ToSendD = Llu->ToSendD; int_t iam = grid->iam; int_t myrow = MYROW (iam, grid); int_t krow = PROW (k, grid); diff --git a/run_cmake_build.sh b/run_cmake_build.sh index bd272408..a4b5ec24 100755 --- a/run_cmake_build.sh +++ b/run_cmake_build.sh @@ -31,32 +31,11 @@ fi # -DTPL_ENABLE_PARMETISLIB=OFF # -DCMAKE_CXX_FLAGS="-std=c++11" \ -## if [ !$?NERSC_HOST ] -# if [ -z $NERSC_HOST ] -# then -# echo "NERSC_HOST undefined" -# fi - -if [ "$NERSC_HOST" == "edison" ] -then - mkdir edison-build; cd edison-build; -# export PARMETIS_ROOT=~/Edison/lib/parmetis-4.0.3_64 - export PARMETIS_ROOT=~/Edison/lib/parmetis-4.0.3 - export PARMETIS_BUILD_DIR=${PARMETIS_ROOT}/build/Linux-x86_64 - cmake .. 
\ -DTPL_PARMETIS_INCLUDE_DIRS="${PARMETIS_ROOT}/include;${PARMETIS_ROOT}/metis/include" \ -DTPL_PARMETIS_LIBRARIES="${PARMETIS_BUILD_DIR}/libparmetis/libparmetis.a;${PARMETIS_BUILD_DIR}/libmetis/libmetis.a" \ -DCMAKE_C_FLAGS="-std=c99 -fPIC -DPRNTlevel=1" \ -DCMAKE_Fortran_COMPILER=ftn \ -DTPL_ENABLE_BLASLIB=OFF \ -DTPL_BLAS_LIBRARIES="-mkl" \ -DBUILD_SHARED_LIBS=OFF \ -DCMAKE_INSTALL_PREFIX=. # -DXSDK_INDEX_SIZE=64 \ # -DCMAKE_EXE_LINKER_FLAGS="-shared" -elif [ "$NERSC_HOST" == "cori" ] +if [ "$NERSC_HOST" == "cori" ] then +# rm -fr 64-build; mkdir 64-build; cd 64-build; +# export PARMETIS_ROOT=~/Cori/lib/parmetis-4.0.3-64 rm -fr cori-build; mkdir cori-build; cd cori-build; export PARMETIS_ROOT=~/Cori/lib/parmetis-4.0.3 # export PARMETIS_BUILD_DIR=${PARMETIS_ROOT}/shared-build @@ -67,8 +46,8 @@ then -DTPL_ENABLE_BLASLIB=OFF \ -DTPL_BLAS_LIBRARIES="-mkl" \ -DCMAKE_Fortran_COMPILER=ftn \ - -DCMAKE_C_FLAGS="-std=c99 -fPIC -DPRNTlevel=0" \ - -DCMAKE_INSTALL_PREFIX=. + -DCMAKE_C_FLAGS="-std=c99 -fPIC -DPRNTlevel=1 -g -O3" \ + -DCMAKE_INSTALL_PREFIX=. \ # -DXSDK_INDEX_SIZE=64 # -DCMAKE_EXE_LINKER_FLAGS="-shared" \ fi From d369a6e09d56941988ade632553af923af0891c7 Mon Sep 17 00:00:00 2001 From: piyush sao Date: Thu, 10 Sep 2020 12:28:42 -0400 Subject: [PATCH 021/147] dcreate_matrix3d is working with pz=1 --- EXAMPLE/CMakeLists.txt | 2 +- EXAMPLE/Makefile | 2 +- EXAMPLE/dcreate_matrix3d.c | 461 +++++++++++++++++++++++++++++++++++++ EXAMPLE/pddrive3d.c | 118 +++++----- SRC/CMakeLists.txt | 1 + SRC/Makefile | 2 +- SRC/nrformat_loc.c | 78 +++++++ SRC/superlu_ddefs.h | 8 +- SRC/superlu_defs.h | 4 + 9 files changed, 620 insertions(+), 56 deletions(-) create mode 100644 EXAMPLE/dcreate_matrix3d.c create mode 100644 SRC/nrformat_loc.c diff --git a/EXAMPLE/CMakeLists.txt b/EXAMPLE/CMakeLists.txt index 785ebdcb..0126dc41 100644 --- a/EXAMPLE/CMakeLists.txt +++ b/EXAMPLE/CMakeLists.txt @@ -57,7 +57,7 @@ if(enable_double) add_executable(pddrive4 ${DEXM4}) target_link_libraries(pddrive4 ${all_link_libs}) - set(DEXM3D pddrive3d.c dcreate_matrix.c) + set(DEXM3D pddrive3d.c dcreate_matrix.c dcreate_matrix3d.c) add_executable(pddrive3d ${DEXM3D}) target_link_libraries(pddrive3d ${all_link_libs}) diff --git a/EXAMPLE/Makefile b/EXAMPLE/Makefile index 56a17b99..b8d999a1 100644 --- a/EXAMPLE/Makefile +++ b/EXAMPLE/Makefile @@ -37,7 +37,7 @@ DEXM1 = pddrive1.o dcreate_matrix.o DEXM2 = pddrive2.o dcreate_matrix.o dcreate_matrix_perturbed.o DEXM3 = pddrive3.o dcreate_matrix.o DEXM4 = pddrive4.o dcreate_matrix.o -DEXM3D = pddrive3d.o dcreate_matrix.o +DEXM3D = pddrive3d.o dcreate_matrix.o dcreate_matrix3d.o # dtrfAux.o dtreeFactorization.o treeFactorization.o pd3dcomm.o superlu_grid3d.o pdgstrf3d.o DEXMG = pddrive_ABglobal.o DEXMG1 = pddrive1_ABglobal.o diff --git a/EXAMPLE/dcreate_matrix3d.c b/EXAMPLE/dcreate_matrix3d.c new file mode 100644 index 00000000..5725d543 --- /dev/null +++ b/EXAMPLE/dcreate_matrix3d.c @@ -0,0 +1,461 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Read the matrix from data file + * + *
+ * -- Distributed SuperLU routine (version 2.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ * 
+ */ +#include <math.h> +#include "superlu_ddefs.h" + +/* \brief + * +*
+ * Purpose
+ * =======
+ *
+ * DCREATE_MATRIX3D reads the matrix from a data file in Harwell-Boeing format,
+ * and distributes it to processors in a distributed compressed row format.
+ * It also generates the distributed true solution X and the right-hand
+ * side RHS.
+ *
+ *
+ * Arguments
+ * =========
+ *
+ * A     (output) SuperMatrix*
+ *       Local matrix A in NR_loc format.
+ *
+ * NRHS  (input) int
+ *       Number of right-hand sides.
+ *
+ * RHS   (output) double**
+ *       The right-hand side matrix.
+ *
+ * LDB   (output) int*
+ *       Leading dimension of the right-hand side matrix.
+ *
+ * X     (output) double**
+ *       The true solution matrix.
+ *
+ * LDX   (output) int*
+ *       The leading dimension of the true solution matrix.
+ *
+ * FP    (input) FILE*
+ *       The matrix file pointer.
+ *
+ * GRID  (input) gridinfo3d_t*
+ *       The 3D process grid.
+ * 
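+ *
+ * Usage sketch (editor's illustration, not part of the original file;
+ * it assumes a gridinfo3d_t grid3d already set up, e.g. by
+ * superlu_gridinit3d, and an open matrix file fp):
+ *
+ *    SuperMatrix A;
+ *    double *b, *xtrue;
+ *    int ldb, ldx, nrhs = 1;
+ *    dcreate_matrix3d(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, &grid3d);
+ *
+ * On return, b holds the local rows of the right-hand side and xtrue the
+ * matching rows of the true solution, with ldb == ldx == m_loc.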
+ */ + +int dcreate_matrix3d(SuperMatrix *A, int nrhs, double **rhs, + int *ldb, double **x, int *ldx, + FILE *fp, gridinfo3d_t *grid3d) +{ + SuperMatrix GA; /* global A */ + double *b_global, *xtrue_global; /* replicated on all processes */ + int_t *rowind, *colptr; /* global */ + double *nzval; /* global */ + double *nzval_loc; /* local */ + int_t *colind, *rowptr; /* local */ + int_t m, n, nnz; + int_t m_loc, fst_row, nnz_loc; + int_t m_loc_fst; /* Record m_loc of the first p-1 processors, + when mod(m, p) is not zero. */ + int_t row, col, i, j, relpos; + int iam; + char trans[1]; + int_t *marker; + + iam = grid3d->iam; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter dcreate_matrix3d()"); +#endif + + if ( !iam ) + { + double t = SuperLU_timer_(); + + /* Read the matrix stored on disk in Harwell-Boeing format. */ + dreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + + printf("Time to read and distribute matrix %.2f\n", + SuperLU_timer_() - t); fflush(stdout); + + /* Broadcast matrix A to the other PEs. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( nzval, nnz, MPI_DOUBLE, 0, grid3d->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( colptr, n + 1, mpi_int_t, 0, grid3d->comm ); + } + else + { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid3d->comm ); + + /* Allocate storage for compressed column representation. */ + dallocateA_dist(n, nnz, &nzval, &rowind, &colptr); + + MPI_Bcast( nzval, nnz, MPI_DOUBLE, 0, grid3d->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( colptr, n + 1, mpi_int_t, 0, grid3d->comm ); + } + +#if 0 + nzval[0] = 0.1; +#endif + + /* Compute the number of rows to be distributed to local process */ + m_loc = m / (grid3d->nprow * grid3d->npcol* grid3d->npdep); + m_loc_fst = m_loc; + /* When m / procs is not an integer */ + if ((m_loc * grid3d->nprow * grid3d->npcol* grid3d->npdep) != m) + { + /*m_loc = m_loc+1; + m_loc_fst = m_loc;*/ + if (iam == (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1)) /* last proc. gets all*/ + m_loc = m - m_loc * (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1); + } + + /* Create compressed column matrix for GA. */ + dCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr, + SLU_NC, SLU_D, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. 
*/ + if ( !(b_global = doubleMalloc_dist(m * nrhs)) ) + ABORT("Malloc fails for b[]"); + if ( !(xtrue_global = doubleMalloc_dist(n * nrhs)) ) + ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + + dGenXtrue_dist(n, nrhs, xtrue_global, n); + dFillRHS_dist(trans, nrhs, xtrue_global, n, &GA, b_global, m); + + /************************************************* + * Change GA to a local A with NR_loc format * + *************************************************/ + + rowptr = (int_t *) intMalloc_dist(m_loc + 1); + marker = (int_t *) intCalloc_dist(n); + + /* Get counts of each row of GA */ + for (i = 0; i < n; ++i) + for (j = colptr[i]; j < colptr[i + 1]; ++j) ++marker[rowind[j]]; + /* Set up row pointers */ + rowptr[0] = 0; + fst_row = iam * m_loc_fst; + nnz_loc = 0; + for (j = 0; j < m_loc; ++j) + { + row = fst_row + j; + rowptr[j + 1] = rowptr[j] + marker[row]; + marker[j] = rowptr[j]; + } + nnz_loc = rowptr[m_loc]; + + nzval_loc = (double *) doubleMalloc_dist(nnz_loc); + colind = (int_t *) intMalloc_dist(nnz_loc); + + /* Transfer the matrix into the compressed row storage */ + for (i = 0; i < n; ++i) + { + for (j = colptr[i]; j < colptr[i + 1]; ++j) + { + row = rowind[j]; + if ( (row >= fst_row) && (row < fst_row + m_loc) ) + { + row = row - fst_row; + relpos = marker[row]; + colind[relpos] = i; + nzval_loc[relpos] = nzval[j]; + ++marker[row]; + } + } + } + +#if ( DEBUGlevel>=2 ) + if ( !iam ) dPrint_CompCol_Matrix_dist(&GA); +#endif + + /* Destroy GA */ + Destroy_CompCol_Matrix_dist(&GA); + + /******************************************************/ + /* Change GA to a local A with NR_loc format */ + /******************************************************/ + + /* Set up the local A in NR_loc format */ + dCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row, + nzval_loc, colind, rowptr, + SLU_NR_loc, SLU_D, SLU_GE); + + /* Get the local B */ + if ( !((*rhs) = doubleMalloc_dist(m_loc * nrhs)) ) + ABORT("Malloc fails for rhs[]"); + for (j = 0; j < nrhs; ++j) + { + for (i = 0; i < m_loc; ++i) + { + row = fst_row + i; + (*rhs)[j * m_loc + i] = b_global[j * n + row]; + } + } + *ldb = m_loc; + + /* Set the true X */ + *ldx = m_loc; + if ( !((*x) = doubleMalloc_dist(*ldx * nrhs)) ) + ABORT("Malloc fails for x_loc[]"); + + /* Get the local part of xtrue_global */ + for (j = 0; j < nrhs; ++j) + { + for (i = 0; i < m_loc; ++i) + (*x)[i + j * (*ldx)] = xtrue_global[i + fst_row + j * n]; + } + + SUPERLU_FREE(b_global); + SUPERLU_FREE(xtrue_global); + SUPERLU_FREE(marker); + +#if ( DEBUGlevel>=1 ) + printf("sizeof(NRforamt_loc) %lu\n", sizeof(NRformat_loc)); + CHECK_MALLOC(iam, "Exit dcreate_matrix()"); +#endif + return 0; +} + + +int dcreate_matrix_postfix3d(SuperMatrix *A, int nrhs, double **rhs, + int *ldb, double **x, int *ldx, + FILE *fp, char * postfix, gridinfo3d_t *grid3d) +{ + SuperMatrix GA; /* global A */ + double *b_global, *xtrue_global; /* replicated on all processes */ + int_t *rowind, *colptr; /* global */ + double *nzval; /* global */ + double *nzval_loc; /* local */ + int_t *colind, *rowptr; /* local */ + int_t m, n, nnz; + int_t m_loc, fst_row, nnz_loc; + int_t m_loc_fst; /* Record m_loc of the first p-1 processors, + when mod(m, p) is not zero. */ + int_t row, col, i, j, relpos; + int iam; + char trans[1]; + int_t *marker; + + iam = grid3d->iam; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter dcreate_matrix()"); +#endif + + if ( !iam ) + { + double t = SuperLU_timer_(); + + if (!strcmp(postfix, "rua")) + { + /* Read the matrix stored on disk in Harwell-Boeing format. 
*/ + dreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + } + else if (!strcmp(postfix, "mtx")) + { + /* Read the matrix stored on disk in Matrix Market format. */ + dreadMM_dist(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + } + else if (!strcmp(postfix, "rb")) + { + /* Read the matrix stored on disk in Rutherford-Boeing format. */ + dreadrb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + } + else if (!strcmp(postfix, "dat")) + { + /* Read the matrix stored on disk in triplet format. */ + dreadtriple_dist(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + } + else if (!strcmp(postfix, "datnh")) + { + /* Read the matrix stored on disk in triplet format (without header). */ + dreadtriple_noheader(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + } + else if (!strcmp(postfix, "bin")) + { + /* Read the matrix stored on disk in binary format. */ + dread_binary(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + } + else + { + ABORT("File format not known"); + } + + printf("Time to read and distribute matrix %.2f\n", + SuperLU_timer_() - t); fflush(stdout); + + /* Broadcast matrix A to the other PEs. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( nzval, nnz, MPI_DOUBLE, 0, grid3d->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( colptr, n + 1, mpi_int_t, 0, grid3d->comm ); + } + else + { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid3d->comm ); + + /* Allocate storage for compressed column representation. */ + dallocateA_dist(n, nnz, &nzval, &rowind, &colptr); + + MPI_Bcast( nzval, nnz, MPI_DOUBLE, 0, grid3d->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( colptr, n + 1, mpi_int_t, 0, grid3d->comm ); + } + +#if 0 + nzval[0] = 0.1; +#endif + + /* Compute the number of rows to be distributed to local process */ + m_loc = m / (grid3d->nprow * grid3d->npcol* grid3d->npdep); + m_loc_fst = m_loc; + /* When m / procs is not an integer */ + if ((m_loc * grid3d->nprow * grid3d->npcol* grid3d->npdep) != m) + { + /*m_loc = m_loc+1; + m_loc_fst = m_loc;*/ + if (iam == (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1)) /* last proc. gets all*/ + m_loc = m - m_loc * (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1); + } + + /* Create compressed column matrix for GA. */ + dCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr, + SLU_NC, SLU_D, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. 
*/ + if ( !(b_global = doubleMalloc_dist(m * nrhs)) ) + ABORT("Malloc fails for b[]"); + if ( !(xtrue_global = doubleMalloc_dist(n * nrhs)) ) + ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + + dGenXtrue_dist(n, nrhs, xtrue_global, n); + dFillRHS_dist(trans, nrhs, xtrue_global, n, &GA, b_global, m); + + /************************************************* + * Change GA to a local A with NR_loc format * + *************************************************/ + + rowptr = (int_t *) intMalloc_dist(m_loc + 1); + marker = (int_t *) intCalloc_dist(n); + + /* Get counts of each row of GA */ + for (i = 0; i < n; ++i) + for (j = colptr[i]; j < colptr[i + 1]; ++j) ++marker[rowind[j]]; + /* Set up row pointers */ + rowptr[0] = 0; + fst_row = iam * m_loc_fst; + nnz_loc = 0; + for (j = 0; j < m_loc; ++j) + { + row = fst_row + j; + rowptr[j + 1] = rowptr[j] + marker[row]; + marker[j] = rowptr[j]; + } + nnz_loc = rowptr[m_loc]; + + nzval_loc = (double *) doubleMalloc_dist(nnz_loc); + colind = (int_t *) intMalloc_dist(nnz_loc); + + /* Transfer the matrix into the compressed row storage */ + for (i = 0; i < n; ++i) + { + for (j = colptr[i]; j < colptr[i + 1]; ++j) + { + row = rowind[j]; + if ( (row >= fst_row) && (row < fst_row + m_loc) ) + { + row = row - fst_row; + relpos = marker[row]; + colind[relpos] = i; + nzval_loc[relpos] = nzval[j]; + ++marker[row]; + } + } + } + +#if ( DEBUGlevel>=2 ) + if ( !iam ) dPrint_CompCol_Matrix_dist(&GA); +#endif + + /* Destroy GA */ + Destroy_CompCol_Matrix_dist(&GA); + + /******************************************************/ + /* Change GA to a local A with NR_loc format */ + /******************************************************/ + + /* Set up the local A in NR_loc format */ + dCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row, + nzval_loc, colind, rowptr, + SLU_NR_loc, SLU_D, SLU_GE); + + /* Get the local B */ + if ( !((*rhs) = doubleMalloc_dist(m_loc * nrhs)) ) + ABORT("Malloc fails for rhs[]"); + for (j = 0; j < nrhs; ++j) + { + for (i = 0; i < m_loc; ++i) + { + row = fst_row + i; + (*rhs)[j * m_loc + i] = b_global[j * n + row]; + } + } + *ldb = m_loc; + + /* Set the true X */ + *ldx = m_loc; + if ( !((*x) = doubleMalloc_dist(*ldx * nrhs)) ) + ABORT("Malloc fails for x_loc[]"); + + /* Get the local part of xtrue_global */ + for (j = 0; j < nrhs; ++j) + { + for (i = 0; i < m_loc; ++i) + (*x)[i + j * (*ldx)] = xtrue_global[i + fst_row + j * n]; + } + + SUPERLU_FREE(b_global); + SUPERLU_FREE(xtrue_global); + SUPERLU_FREE(marker); + +#if ( DEBUGlevel>=1 ) + printf("sizeof(NRforamt_loc) %lu\n", sizeof(NRformat_loc)); + CHECK_MALLOC(iam, "Exit dcreate_matrix()"); +#endif + return 0; +} diff --git a/EXAMPLE/pddrive3d.c b/EXAMPLE/pddrive3d.c index a3c55229..3823b7ea 100644 --- a/EXAMPLE/pddrive3d.c +++ b/EXAMPLE/pddrive3d.c @@ -1,9 +1,9 @@ /*! \file Copyright (c) 2003, The Regents of the University of California, through -Lawrence Berkeley National Laboratory (subject to receipt of any required -approvals from U.S. Dept. of Energy) +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) -All rights reserved. +All rights reserved. The source code is distributed under BSD license, see the file License.txt at the top-level directory. @@ -19,7 +19,7 @@ at the top-level directory. * May 10, 2019 * */ -#include "superlu_ddefs.h" +#include "superlu_ddefs.h" /*! 
\brief * @@ -115,7 +115,8 @@ main (int argc, char *argv[]) } } else - { /* Last arg is considered a filename */ + { + /* Last arg is considered a filename */ if (!(fp = fopen (*cpp, "r"))) { ABORT ("File does not exist"); @@ -129,46 +130,49 @@ main (int argc, char *argv[]) ------------------------------------------------------------ */ superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid); - if(grid.iam==0) { - MPI_Query_thread(&omp_mpi_level); - switch (omp_mpi_level) { - case MPI_THREAD_SINGLE: - printf("MPI_Query_thread with MPI_THREAD_SINGLE\n"); - fflush(stdout); - break; - case MPI_THREAD_FUNNELED: - printf("MPI_Query_thread with MPI_THREAD_FUNNELED\n"); - fflush(stdout); - break; - case MPI_THREAD_SERIALIZED: - printf("MPI_Query_thread with MPI_THREAD_SERIALIZED\n"); - fflush(stdout); - break; - case MPI_THREAD_MULTIPLE: - printf("MPI_Query_thread with MPI_THREAD_MULTIPLE\n"); - fflush(stdout); - break; - } + if (grid.iam == 0) + { + MPI_Query_thread(&omp_mpi_level); + switch (omp_mpi_level) + { + case MPI_THREAD_SINGLE: + printf("MPI_Query_thread with MPI_THREAD_SINGLE\n"); + fflush(stdout); + break; + case MPI_THREAD_FUNNELED: + printf("MPI_Query_thread with MPI_THREAD_FUNNELED\n"); + fflush(stdout); + break; + case MPI_THREAD_SERIALIZED: + printf("MPI_Query_thread with MPI_THREAD_SERIALIZED\n"); + fflush(stdout); + break; + case MPI_THREAD_MULTIPLE: + printf("MPI_Query_thread with MPI_THREAD_MULTIPLE\n"); + fflush(stdout); + break; + } } - + /* Bail out if I do not belong in the grid. */ iam = grid.iam; - if (iam >= nprow * npcol *npdep) + if (iam >= nprow * npcol * npdep) goto out; - if (!iam) { - int v_major, v_minor, v_bugfix; + if (!iam) + { + int v_major, v_minor, v_bugfix; #ifdef __INTEL_COMPILER - printf("__INTEL_COMPILER is defined\n"); + printf("__INTEL_COMPILER is defined\n"); #endif - printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__); + printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__); - superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix); - printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix); + superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix); + printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix); - printf("Input matrix file:\t%s\n", *cpp); - printf("3D process grid: %d X %d X %d\n", nprow, npcol, npdep); + printf("Input matrix file:\t%s\n", *cpp); + printf("3D process grid: %d X %d X %d\n", nprow, npcol, npdep); printf("2D Process grid: %d X %d\n", (int)grid.nprow, (int)grid.npcol); - fflush(stdout); + fflush(stdout); } #if ( DEBUGlevel>=1 ) @@ -178,15 +182,20 @@ main (int argc, char *argv[]) /* ------------------------------------------------------------ GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. 
       ------------------------------------------------------------ */
-    for (ii = 0; ii<strlen(*cpp); ii++) {
-        if((*cpp)[ii]=='.'){
-            suffix = &((*cpp)[ii+1]);
-            // printf("%s\n", suffix);
-        }
-    }
-    if ( grid.zscp.Iam == 0 )  // only in process layer 0
-        dcreate_matrix_postfix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, suffix, &(grid.grid2d));
+    for (ii = 0; ii < strlen(*cpp); ii++)
+    {
+        if ((*cpp)[ii] == '.')
+        {
+            suffix = &((*cpp)[ii + 1]);
+            // printf("%s\n", suffix);
+        }
+    }
+    #if 0
+    if ( grid.zscp.Iam == 0 )  // only in process layer 0
+        dcreate_matrix_postfix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, suffix, &(grid.grid2d));
+    #else
+    dcreate_matrix_postfix3d(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, suffix, &(grid));
+    #endif
diff --git a/SRC/nrformat_loc.c b/SRC/nrformat_loc.c
new file mode 100644
--- /dev/null
+++ b/SRC/nrformat_loc.c
+#include "superlu_ddefs.h"
+
+NRformat_loc dGatherNRformat_loc(NRformat_loc *A, gridinfo3d_t *grid3d)
+{
+    NRformat_loc A2d;
+    int_t *nnz_counts, *row_counts;
+    int_t *nnz_disp, *row_disp;
+    int *nnz_counts_int, *row_counts_int;
+
+    nnz_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t));
+    row_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t));
+    nnz_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
+    row_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
+    MPI_Gather(&A->nnz_loc, 1, mpi_int_t, nnz_counts,
+               1, mpi_int_t, 0, grid3d->zscp.comm);
+    MPI_Gather(&A->m_loc, 1, mpi_int_t, row_counts,
+               1, mpi_int_t, 0, grid3d->zscp.comm);
+    nnz_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int_t));
+    row_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int_t));
+
+    nnz_disp[0] = 0;
+    row_disp[0] = 0;
+    for (int i = 0; i < grid3d->npdep; i++)
+    {
+        nnz_disp[i + 1] = nnz_disp[i] + nnz_counts[i];
+        row_disp[i + 1] = row_disp[i] + row_counts[i];
+        nnz_counts_int[i] = nnz_counts[i];
+        row_counts_int[i] = row_counts[i];
+    }
+
+    if (grid3d->zscp.Iam == 0)
+    {
+        A2d.colind = SUPERLU_MALLOC(nnz_disp[grid3d->npdep] * sizeof(int_t));
+        A2d.nzval = SUPERLU_MALLOC(nnz_disp[grid3d->npdep] * sizeof(double));
+        A2d.rowptr = SUPERLU_MALLOC((row_disp[grid3d->npdep] + 1) * sizeof(int_t));
+        A2d.rowptr[0] = 0;
+    }
+
+    MPI_Gatherv(A->nzval, A->nnz_loc, MPI_DOUBLE, A2d.nzval,
+                nnz_counts_int, nnz_disp,
+                MPI_DOUBLE, 0, grid3d->zscp.comm);
+    MPI_Gatherv(A->colind, A->nnz_loc, mpi_int_t, A2d.colind,
+                nnz_counts_int, nnz_disp,
+                mpi_int_t, 0, grid3d->zscp.comm);
+
+    MPI_Gatherv(&A->rowptr[1], A->m_loc, mpi_int_t, &A2d.rowptr[1],
+                row_counts_int, row_disp,
+                mpi_int_t, 0, grid3d->zscp.comm);
+
+    if (grid3d->zscp.Iam == 0)
+    {
+        for (int i = 0; i < grid3d->npdep; i++)
+        {
+            for (int j = row_disp[i]; j < row_disp[i + 1]; j++)
+            {
+                A2d.rowptr[j] += row_disp[i];
+            }
+        }
+    }
+    A2d.fst_row = A->fst_row;
+    return A2d;
+}
\ No newline at end of file
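The new dGatherNRformat_loc above collapses the depth (Z) dimension of the 3D grid: each layer contributes its block of rows through MPI_Gatherv, and the prefix sums in nnz_disp[] / row_disp[] serve both as the receive displacements and as the per-layer offsets folded into the gathered rowptr. A minimal caller sketch (editorial, not part of the patch; the names A and grid3d are illustrative):

    /* Gather the 3D-distributed matrix onto the 2D grid of layer 0.
       Only layer 0 (grid3d->zscp.Iam == 0) owns the gathered arrays. */
    NRformat_loc A2d = dGatherNRformat_loc((NRformat_loc *) A->Store, grid3d);
    if (grid3d->zscp.Iam == 0) {
        /* Safe to read A2d.rowptr / A2d.colind / A2d.nzval here;
           the other layers must not touch A2d's array members. */
    }

Note that, as committed here, the function never fills in A2d.nnz_loc and A2d.m_loc; the follow-up patch below sets them from nnz_disp[npdep] and row_disp[npdep].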
diff --git a/SRC/superlu_ddefs.h b/SRC/superlu_ddefs.h
index 05b71fdc..2bf22496 100644
--- a/SRC/superlu_ddefs.h
+++ b/SRC/superlu_ddefs.h
@@ -333,7 +333,13 @@ extern int dcreate_matrix_dat(SuperMatrix *, int, double **, int *, double **,
                               int *, FILE *, gridinfo_t *);
 extern int dcreate_matrix_postfix(SuperMatrix *, int, double **, int *, double **,
                                   int *, FILE *, char *, gridinfo_t *);
-
+/* For 3D code */
+extern int dcreate_matrix3d(SuperMatrix *A, int nrhs, double **rhs,
+                            int *ldb, double **x, int *ldx,
+                            FILE *fp, gridinfo3d_t *grid3d);
+extern int dcreate_matrix_postfix3d(SuperMatrix *A, int nrhs, double **rhs,
+                                    int *ldb, double **x, int *ldx,
+                                    FILE *fp, char * postfix, gridinfo3d_t *grid3d);
 
 /* Driver related */
 extern void dgsequ_dist (SuperMatrix *, double *, double *, double *,
diff --git a/SRC/superlu_defs.h b/SRC/superlu_defs.h
index b45e2659..b17c4ff1 100644
--- a/SRC/superlu_defs.h
+++ b/SRC/superlu_defs.h
@@ -1098,6 +1098,10 @@ extern int_t StdList_Size(StdList lst);
 yes_no_t StdList_Empty(StdList lst);
 
 /*==== For 3D code ====*/
+/* Gather a matrix distributed in NRformat_loc format on the 3D process grid
+   into one NRformat_loc matrix distributed on the 2D grid of layer 0. */
+NRformat_loc dGatherNRformat_loc(NRformat_loc *A, gridinfo3d_t *grid3d);
+
 extern void DistPrint(char* function_name, double value, char* Units, gridinfo_t* grid);
 extern void DistPrint3D(char* function_name, double value, char* Units, gridinfo3d_t* grid3d);

From 850121f109ce78f60d080db5e9495fb02abc0d7c Mon Sep 17 00:00:00 2001
From: piyush sao
Date: Fri, 11 Sep 2020 01:52:26 -0400
Subject: [PATCH 022/147] nrformat_loc working for pz=1 but not pdgssvx3d

---
 EXAMPLE/pddrive3d.c |   71 +-
 SRC/nrformat_loc.c  |    2 +
 SRC/pdgssvx3d.c     | 1983 +++++++++++++++++++++++--------------
 3 files changed, 1119 insertions(+), 937 deletions(-)

diff --git a/EXAMPLE/pddrive3d.c b/EXAMPLE/pddrive3d.c
index 3823b7ea..4bc7f2f4 100644
--- a/EXAMPLE/pddrive3d.c
+++ b/EXAMPLE/pddrive3d.c
@@ -20,6 +20,7 @@ at the top-level directory.
 *
 */
 #include "superlu_ddefs.h"
+#include <assert.h>
 
 /*!
 \brief
 *
@@ -48,6 +49,38 @@ at the top-level directory.
 *
 */
+void checkNRFMT(NRformat_loc*A, NRformat_loc*B)
+{
+    /*
+    int_t nnz_loc;
+    int_t m_loc;
+    int_t fst_row;
+    void  *nzval;
+    int_t *rowptr;
+    int_t *colind;
+    */
+
+    assert(A->nnz_loc == B->nnz_loc);
+    assert(A->m_loc == B->m_loc);
+    assert(A->fst_row == B->fst_row);
+
+    for (int_t i = 0; i < A->nnz_loc; i++)
+    {
+        assert(((double *)A->nzval)[i] == ((double *)B->nzval)[i]);
+        assert((A->colind)[i] == (B->colind)[i]);
+    }
+
+    for (int_t i = 0; i < A->m_loc + 1; i++)
+    {
+        // assert(((double *)A->nzval)[i] ==((double *)B->nzval)[i]);
+        assert((A->rowptr)[i] == (B->rowptr)[i]);
+    }
+
+    printf("Matrix check passed\n");
+}
+
 int
 main (int argc, char *argv[])
 {
@@ -65,6 +98,7 @@ main (int argc, char *argv[])
     int iam, info, ldb, ldx, nrhs;
     char **cpp, c, *suffix;
     FILE *fp, *fopen ();
+    FILE *fp0;
     extern int cpp_defs ();
     int ii, omp_mpi_level;
@@ -117,7 +151,9 @@ main (int argc, char *argv[])
     else
     {
         /* Last arg is considered a filename */
-        if (!(fp = fopen (*cpp, "r")))
+        if (!(fp = fopen (*cpp, "r"))
+            || !(fp0 = fopen (*cpp, "r"))
+           )
         {
             ABORT ("File does not exist");
         }
@@ -190,12 +226,35 @@ main (int argc, char *argv[])
             // printf("%s\n", suffix);
         }
     }
-    #if 0
+#define NRFRMT
+#ifndef NRFRMT
     if ( grid.zscp.Iam == 0 )  // only in process layer 0
-        dcreate_matrix_postfix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, suffix, &(grid.grid2d));
-    #else
-    dcreate_matrix_postfix3d(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, suffix, &(grid));
-    #endif
+        dcreate_matrix_postfix(&A, nrhs, &b, &ldb, &xtrue,
+                               &ldx, fp, suffix, &(grid.grid2d));
+#else
+    NRformat_loc *Astore, *Astore0;
+
+    // *fp0 = *fp;
+    dcreate_matrix_postfix3d(&A, nrhs, &b, &ldb,
+                             &xtrue, &ldx, fp, suffix, &(grid));
+    // NRformat_loc Atmp = dGatherNRformat_loc(
+    //     (NRformat_loc *) A.Store,
+    //     &grid);
+    // Astore = &Atmp;
+    // SuperMatrix Aref;
+    // double *bref, *xtrueref;
+    // if ( grid.zscp.Iam == 0 )  // only in process layer 0
+    // {
+    //     dcreate_matrix_postfix(&Aref, nrhs, &bref, &ldb,
+    //                            &xtrueref, &ldx, fp0,
+    //                            suffix, &(grid.grid2d));
+    //     for (int i = 0; i < 5; i++)
+    //         printf("%g %g\n", bref[i], b[i] );
+    //     Astore0 = (NRformat_loc *) Aref.Store;
+    //     checkNRFMT(Astore, Astore0);
+    // }
+    // MPI_Finalize(); exit(0);
+#endif
 
     if (!(berr = doubleMalloc_dist (nrhs)))
         ABORT ("Malloc fails for berr[].");
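The driver changes above are debugging scaffolding for the gather path: the matrix file is opened twice (fp and fp0), the NRFRMT branch builds A with the 3D reader, and the commented-out block gathers that copy onto layer 0 and compares it field by field against a plain 2D read using checkNRFMT. Condensed, the intended check is the following (editorial sketch; it simply re-enables the commented code, using main()'s nrhs, ldb, ldx, fp0 and suffix):

    NRformat_loc Atmp = dGatherNRformat_loc((NRformat_loc *) A.Store, &grid);
    if (grid.zscp.Iam == 0) {
        SuperMatrix Aref;
        double *bref, *xtrueref;
        /* Reference copy read directly on layer 0's 2D grid. */
        dcreate_matrix_postfix(&Aref, nrhs, &bref, &ldb, &xtrueref, &ldx,
                               fp0, suffix, &(grid.grid2d));
        checkNRFMT(&Atmp, (NRformat_loc *) Aref.Store);  /* asserts equality */
    }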
diff --git a/SRC/nrformat_loc.c b/SRC/nrformat_loc.c
index f156412b..932bff25 100644
--- a/SRC/nrformat_loc.c
+++ b/SRC/nrformat_loc.c
@@ -71,6 +71,8 @@ NRformat_loc dGatherNRformat_loc(NRformat_loc *A, gridinfo3d_t *grid3d)
                 A2d.rowptr[j] += row_disp[i];
             }
         }
+        A2d.nnz_loc = nnz_disp[grid3d->npdep];
+        A2d.m_loc = row_disp[grid3d->npdep];
     }
     A2d.fst_row = A->fst_row;
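These two added assignments are the substance of this patch: MPI_Gatherv fills the gathered arrays, but the NRformat_loc metadata describing the result on layer 0 was never set, so any consumer reading A2d.nnz_loc or A2d.m_loc (as pdgssvx3d does below) saw uninitialized values. The totals are just the last entries of the displacement arrays. A toy instance of the arithmetic (editorial sketch; assumes int_t from superlu_defs.h, npdep == 2, layers owning 3 and 5 rows with 10 and 14 nonzeros):

    int_t row_counts[2] = {3, 5},   nnz_counts[2] = {10, 14};
    int_t row_disp[3]   = {0, 0, 0}, nnz_disp[3]  = {0, 0, 0};
    for (int i = 0; i < 2; i++) {
        row_disp[i + 1] = row_disp[i] + row_counts[i];
        nnz_disp[i + 1] = nnz_disp[i] + nnz_counts[i];
    }
    /* row_disp = {0, 3, 8}, nnz_disp = {0, 10, 24}: the gathered matrix
       has m_loc = row_disp[2] = 8 rows and nnz_loc = nnz_disp[2] = 24. */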
diff --git a/SRC/pdgssvx3d.c b/SRC/pdgssvx3d.c
index d8ea203d..915b66db 100644
--- a/SRC/pdgssvx3d.c
+++ b/SRC/pdgssvx3d.c
@@ -500,1044 +500,1165 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
            LUstruct_t * LUstruct, SOLVEstruct_t * SOLVEstruct, double *berr,
            SuperLUStat_t * stat, int *info)
 {
-    NRformat_loc *Astore;
-    SuperMatrix GA;        /* Global A in NC format */
-    NCformat *GAstore;
-    double *a_GA;
-    SuperMatrix GAC;       /* Global A in NCP format (add n end pointers) */
-    NCPformat *GACstore;
-    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    Glu_freeable_t *Glu_freeable;
-    /* The nonzero structures of L and U factors, which are
-       replicated on all processors.
-       (lsub, xlsub) contains the compressed subscript of
-       supernodes in L.
-       (usub, xusub) contains the compressed subscript of
-       nonzero segments in U.
-       If options->Fact != SamePattern_SameRowPerm, they are
-       computed by SYMBFACT routine, and then used by PDDISTRIBUTE
-       routine. They will be freed after PDDISTRIBUTE routine.
-       If options->Fact == SamePattern_SameRowPerm, these
-       structures are not used. */
-    yes_no_t parSymbFact = options->ParSymbFact;
-    fact_t Fact;
-    double *a;
-    int_t *colptr, *rowind;
-    int_t *perm_r;     /* row permutations from partial pivoting */
-    int_t *perm_c;     /* column permutation vector */
-    int_t *etree;      /* elimination tree */
-    int_t *rowptr, *colind;    /* Local A in NR */
-    int_t colequ, Equil, factored, job, notran, rowequ, need_value;
-    int_t i, iinfo, j, irow, m, n, nnz, permc_spec;
-    int_t nnz_loc, m_loc, fst_row, icol;
-    int iam;
-    int ldx;   /* LDA for matrix X (local). */
-    char equed[1], norm[1];
-    double *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd;
-    double *X, *b_col, *b_work, *x_col;
-    double t;
-    float GA_mem_use;      /* memory usage by global A */
-    float dist_mem_use;    /* memory usage during distribution */
-    superlu_dist_mem_usage_t num_mem_usage, symb_mem_usage;
+    NRformat_loc *Astore;
+    SuperMatrix GA;        /* Global A in NC format */
+    NCformat *GAstore;
+    double *a_GA;
+    SuperMatrix GAC;       /* Global A in NCP format (add n end pointers) */
+    NCPformat *GACstore;
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    Glu_freeable_t *Glu_freeable;
+    /* The nonzero structures of L and U factors, which are
+       replicated on all processors.
+       (lsub, xlsub) contains the compressed subscript of
+       supernodes in L.
+       (usub, xusub) contains the compressed subscript of
+       nonzero segments in U.
+       If options->Fact != SamePattern_SameRowPerm, they are
+       computed by SYMBFACT routine, and then used by PDDISTRIBUTE
+       routine. They will be freed after PDDISTRIBUTE routine.
+       If options->Fact == SamePattern_SameRowPerm, these
+       structures are not used. */
+    yes_no_t parSymbFact = options->ParSymbFact;
+    fact_t Fact;
+    double *a;
+    int_t *colptr, *rowind;
+    int_t *perm_r;     /* row permutations from partial pivoting */
+    int_t *perm_c;     /* column permutation vector */
+    int_t *etree;      /* elimination tree */
+    int_t *rowptr, *colind;    /* Local A in NR */
+    int_t colequ, Equil, factored, job, notran, rowequ, need_value;
+    int_t i, iinfo, j, irow, m, n, nnz, permc_spec;
+    int_t nnz_loc, m_loc, fst_row, icol;
+    int iam;
+    int ldx;   /* LDA for matrix X (local). */
+    char equed[1], norm[1];
+    double *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd;
+    double *X, *b_col, *b_work, *x_col;
+    double t;
+    float GA_mem_use;      /* memory usage by global A */
+    float dist_mem_use;    /* memory usage during distribution */
+    superlu_dist_mem_usage_t num_mem_usage, symb_mem_usage;
 #if ( PRNTlevel>= 2 )
-    double dmin, dsum, dprod;
+    double dmin, dsum, dprod;
 #endif
-    // get the 2d grid
-    gridinfo_t *grid = &(grid3d->grid2d);
-    iam = grid->iam;
-
-    /* Initialization. */
-
-    /* definition of factored seen by each process layer */
-    Fact = options->Fact;
-    factored = (Fact == FACTORED);
-
-    /* Test the options choices.
*/ - *info = 0; - Fact = options->Fact; - if (Fact < 0 || Fact > FACTORED) - *info = -1; - else if (options->RowPerm < 0 || options->RowPerm > MY_PERMR) - *info = -1; - else if (options->ColPerm < 0 || options->ColPerm > MY_PERMC) - *info = -1; - else if (options->IterRefine < 0 || options->IterRefine > SLU_EXTRA) - *info = -1; - else if (options->IterRefine == SLU_EXTRA) { - *info = -1; - fprintf (stderr, - "Extra precise iterative refinement yet to support."); - } - if (*info) { - i = -(*info); - pxerr_dist ("pdgssvx3d", grid, -*info); - return; - } + // get the 2d grid + gridinfo_t *grid = &(grid3d->grid2d); + iam = grid->iam; + + /* Initialization. */ + + /* definition of factored seen by each process layer */ + Fact = options->Fact; + factored = (Fact == FACTORED); + + /* Test the options choices. */ + *info = 0; + Fact = options->Fact; + if (Fact < 0 || Fact > FACTORED) + *info = -1; + else if (options->RowPerm < 0 || options->RowPerm > MY_PERMR) + *info = -1; + else if (options->ColPerm < 0 || options->ColPerm > MY_PERMC) + *info = -1; + else if (options->IterRefine < 0 || options->IterRefine > SLU_EXTRA) + *info = -1; + else if (options->IterRefine == SLU_EXTRA) + { + *info = -1; + fprintf (stderr, + "Extra precise iterative refinement yet to support."); + } + if (*info) + { + i = -(*info); + pxerr_dist ("pdgssvx3d", grid, -*info); + return; + } #if ( DEBUGlevel>=1 ) CHECK_MALLOC (iam, "Enter pdgssvx3d()"); #endif - - /* Perform preprocessing steps on process layer zero, including: - ordering, symbolic factorization, distribution of L & U */ - if (grid3d->zscp.Iam == 0) - { - m = A->nrow; - n = A->ncol; - Astore = (NRformat_loc *) A->Store; - nnz_loc = Astore->nnz_loc; - m_loc = Astore->m_loc; - fst_row = Astore->fst_row; - a = (double *) Astore->nzval; - rowptr = Astore->rowptr; - colind = Astore->colind; - - /* Test the other input parameters. */ - if (A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc - || A->Dtype != SLU_D || A->Mtype != SLU_GE) - *info = -2; - else if (ldb < m_loc) - *info = -5; - else if (nrhs < 0) - *info = -6; - if (*info) { - i = -(*info); - pxerr_dist ("pdgssvx3d", grid, -*info); - return; - } - /* Structures needed for parallel symbolic factorization */ - int_t *sizes, *fstVtxSep; - int noDomains, nprocs_num; - MPI_Comm symb_comm; /* communicator for symbolic factorization */ - int col, key; /* parameters for creating a new communicator */ - Pslu_freeable_t Pslu_freeable; - float flinfo; - - sizes = NULL; - fstVtxSep = NULL; - symb_comm = MPI_COMM_NULL; - - Equil = (!factored && options->Equil == YES); - notran = (options->Trans == NOTRANS); - - iam = grid->iam; - job = 5; - if (factored || (Fact == SamePattern_SameRowPerm && Equil)) - { - rowequ = (ScalePermstruct->DiagScale == ROW) || - (ScalePermstruct->DiagScale == BOTH); - colequ = (ScalePermstruct->DiagScale == COL) || - (ScalePermstruct->DiagScale == BOTH); - } - else - rowequ = colequ = FALSE; - - /* The following arrays are replicated on all processes. */ - perm_r = ScalePermstruct->perm_r; - perm_c = ScalePermstruct->perm_c; - etree = LUstruct->etree; - R = ScalePermstruct->R; - C = ScalePermstruct->C; - /********/ - - /* Not factored & ask for equilibration */ - if (Equil && Fact != SamePattern_SameRowPerm) { - /* Allocate storage if not done so before. 
*/ - switch (ScalePermstruct->DiagScale) + /* Perform preprocessing steps on process layer zero, including: + ordering, symbolic factorization, distribution of L & U */ +#define NRFRMT +#ifdef NRFRMT + NRformat_loc Atmp = dGatherNRformat_loc( + (NRformat_loc *) A->Store, + grid3d); +#endif + + if (grid3d->zscp.Iam == 0) + { + + + m = A->nrow; + n = A->ncol; +#ifdef NRFRMT + Astore = &Atmp; +#else + Astore = (NRformat_loc *) A->Store; +#endif + nnz_loc = Astore->nnz_loc; + m_loc = Astore->m_loc; + fst_row = Astore->fst_row; + a = (double *) Astore->nzval; + rowptr = Astore->rowptr; + colind = Astore->colind; + + /* Test the other input parameters. */ + if (A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc + || A->Dtype != SLU_D || A->Mtype != SLU_GE) + *info = -2; + else if (ldb < m_loc) + *info = -5; + else if (nrhs < 0) + *info = -6; + if (*info) { - case NOEQUIL: - if (!(R = (double *) doubleMalloc_dist (m))) - ABORT ("Malloc fails for R[]."); - if (!(C = (double *) doubleMalloc_dist (n))) - ABORT ("Malloc fails for C[]."); - ScalePermstruct->R = R; - ScalePermstruct->C = C; - break; - case ROW: - if (!(C = (double *) doubleMalloc_dist (n))) - ABORT ("Malloc fails for C[]."); - ScalePermstruct->C = C; - break; - case COL: - if (!(R = (double *) doubleMalloc_dist (m))) - ABORT ("Malloc fails for R[]."); - ScalePermstruct->R = R; - break; + i = -(*info); + pxerr_dist ("pdgssvx3d", grid, -*info); + return; } - } - - /* ------------------------------------------------------------ - Diagonal scaling to equilibrate the matrix. - ------------------------------------------------------------ */ - if (Equil) { + + /* Structures needed for parallel symbolic factorization */ + int_t *sizes, *fstVtxSep; + int noDomains, nprocs_num; + MPI_Comm symb_comm; /* communicator for symbolic factorization */ + int col, key; /* parameters for creating a new communicator */ + Pslu_freeable_t Pslu_freeable; + float flinfo; + + sizes = NULL; + fstVtxSep = NULL; + symb_comm = MPI_COMM_NULL; + + Equil = (!factored && options->Equil == YES); + notran = (options->Trans == NOTRANS); + + iam = grid->iam; + job = 5; + if (factored || (Fact == SamePattern_SameRowPerm && Equil)) + { + rowequ = (ScalePermstruct->DiagScale == ROW) || + (ScalePermstruct->DiagScale == BOTH); + colequ = (ScalePermstruct->DiagScale == COL) || + (ScalePermstruct->DiagScale == BOTH); + } + else + rowequ = colequ = FALSE; + + /* The following arrays are replicated on all processes. */ + perm_r = ScalePermstruct->perm_r; + perm_c = ScalePermstruct->perm_c; + etree = LUstruct->etree; + R = ScalePermstruct->R; + C = ScalePermstruct->C; + /********/ + + /* Not factored & ask for equilibration */ + if (Equil && Fact != SamePattern_SameRowPerm) + { + /* Allocate storage if not done so before. */ + switch (ScalePermstruct->DiagScale) + { + case NOEQUIL: + if (!(R = (double *) doubleMalloc_dist (m))) + ABORT ("Malloc fails for R[]."); + if (!(C = (double *) doubleMalloc_dist (n))) + ABORT ("Malloc fails for C[]."); + ScalePermstruct->R = R; + ScalePermstruct->C = C; + break; + case ROW: + if (!(C = (double *) doubleMalloc_dist (n))) + ABORT ("Malloc fails for C[]."); + ScalePermstruct->C = C; + break; + case COL: + if (!(R = (double *) doubleMalloc_dist (m))) + ABORT ("Malloc fails for R[]."); + ScalePermstruct->R = R; + break; + } + } + + /* ------------------------------------------------------------ + Diagonal scaling to equilibrate the matrix. 
+ ------------------------------------------------------------ */ + if (Equil) + { #if ( DEBUGlevel>=1 ) - CHECK_MALLOC (iam, "Enter equil"); + CHECK_MALLOC (iam, "Enter equil"); #endif - t = SuperLU_timer_ (); - - if (Fact == SamePattern_SameRowPerm) { - /* Reuse R and C. */ - switch (ScalePermstruct->DiagScale) { - case NOEQUIL: - break; - case ROW: - irow = fst_row; - for (j = 0; j < m_loc; ++j) { - for (i = rowptr[j]; i < rowptr[j + 1]; ++i) { - a[i] *= R[irow]; /* Scale rows. */ - } - ++irow; - } - break; - case COL: - for (j = 0; j < m_loc; ++j) - for (i = rowptr[j]; i < rowptr[j + 1]; ++i) { - icol = colind[i]; - a[i] *= C[icol]; /* Scale columns. */ + t = SuperLU_timer_ (); + + if (Fact == SamePattern_SameRowPerm) + { + /* Reuse R and C. */ + switch (ScalePermstruct->DiagScale) + { + case NOEQUIL: + break; + case ROW: + irow = fst_row; + for (j = 0; j < m_loc; ++j) + { + for (i = rowptr[j]; i < rowptr[j + 1]; ++i) + { + a[i] *= R[irow]; /* Scale rows. */ + } + ++irow; + } + break; + case COL: + for (j = 0; j < m_loc; ++j) + for (i = rowptr[j]; i < rowptr[j + 1]; ++i) + { + icol = colind[i]; + a[i] *= C[icol]; /* Scale columns. */ + } + break; + case BOTH: + irow = fst_row; + for (j = 0; j < m_loc; ++j) + { + for (i = rowptr[j]; i < rowptr[j + 1]; ++i) + { + icol = colind[i]; + a[i] *= R[irow] * C[icol]; /* Scale rows and cols. */ + } + ++irow; + } + break; + } } - break; - case BOTH: - irow = fst_row; - for (j = 0; j < m_loc; ++j) - { - for (i = rowptr[j]; i < rowptr[j + 1]; ++i) - { - icol = colind[i]; - a[i] *= R[irow] * C[icol]; /* Scale rows and cols. */ - } - ++irow; - } - break; - } - } else { /* Compute R & C from scratch */ - /* Compute the row and column scalings. */ - pdgsequ (A, R, C, &rowcnd, &colcnd, &amax, &iinfo, grid); - - if ( iinfo > 0 ) { - if ( iinfo <= m ) { + else /* Compute R & C from scratch */ + { + /* Compute the row and column scalings. */ + pdgsequ (A, R, C, &rowcnd, &colcnd, &amax, &iinfo, grid); + + if ( iinfo > 0 ) + { + if ( iinfo <= m ) + { #if ( PRNTlevel>=1 ) - fprintf(stderr, "The " IFMT "-th row of A is exactly zero\n", iinfo); + fprintf(stderr, "The " IFMT "-th row of A is exactly zero\n", iinfo); #endif - } else { + } + else + { #if ( PRNTlevel>=1 ) - fprintf(stderr, "The " IFMT "-th column of A is exactly zero\n", iinfo-n); + fprintf(stderr, "The " IFMT "-th column of A is exactly zero\n", iinfo - n); #endif - } - } else if ( iinfo < 0 ) return; - - /* Now iinfo == 0 */ - - /* Equilibrate matrix A if it is badly-scaled. - A <-- diag(R)*A*diag(C) */ - pdlaqgs (A, R, C, rowcnd, colcnd, amax, equed); - - if ( strncmp(equed, "R", 1)==0 ) { - ScalePermstruct->DiagScale = ROW; - rowequ = ROW; - } else if ( strncmp(equed, "C", 1)==0 ) { - ScalePermstruct->DiagScale = COL; - colequ = COL; - } else if ( strncmp(equed, "B", 1)==0 ) { - ScalePermstruct->DiagScale = BOTH; - rowequ = ROW; - colequ = COL; - } else ScalePermstruct->DiagScale = NOEQUIL; + } + } + else if ( iinfo < 0 ) return; + + /* Now iinfo == 0 */ + + /* Equilibrate matrix A if it is badly-scaled. + A <-- diag(R)*A*diag(C) */ + pdlaqgs (A, R, C, rowcnd, colcnd, amax, equed); + + if ( strncmp(equed, "R", 1) == 0 ) + { + ScalePermstruct->DiagScale = ROW; + rowequ = ROW; + } + else if ( strncmp(equed, "C", 1) == 0 ) + { + ScalePermstruct->DiagScale = COL; + colequ = COL; + } + else if ( strncmp(equed, "B", 1) == 0 ) + { + ScalePermstruct->DiagScale = BOTH; + rowequ = ROW; + colequ = COL; + } + else ScalePermstruct->DiagScale = NOEQUIL; #if ( PRNTlevel>=1 ) - if (iam==0) { - printf (".. 
equilibrated? *equed = %c\n", *equed); - fflush(stdout); - } + if (iam == 0) + { + printf (".. equilibrated? *equed = %c\n", *equed); + fflush(stdout); + } #endif - } /* end if-else Fact ... */ + } /* end if-else Fact ... */ - stat->utime[EQUIL] = SuperLU_timer_ () - t; + stat->utime[EQUIL] = SuperLU_timer_ () - t; #if ( DEBUGlevel>=1 ) - CHECK_MALLOC (iam, "Exit equil"); + CHECK_MALLOC (iam, "Exit equil"); #endif - } /* end if Equil ... LAPACK style, not involving MC64 */ - - if (!factored) { /* Skip this if already factored. */ - /* - * Gather A from the distributed compressed row format to - * global A in compressed column format. - * Numerical values are gathered only when a row permutation - * for large diagonal is sought after. - */ - if (Fact != SamePattern_SameRowPerm && - (parSymbFact == NO || options->RowPerm != NO)) { - - need_value = (options->RowPerm == LargeDiag_MC64); - - pdCompRow_loc_to_CompCol_global (need_value, A, grid, &GA); - - GAstore = (NCformat *) GA.Store; - colptr = GAstore->colptr; - rowind = GAstore->rowind; - nnz = GAstore->nnz; - GA_mem_use = (nnz + n + 1) * sizeof (int_t); - - if (need_value) { - a_GA = (double *) GAstore->nzval; - GA_mem_use += nnz * sizeof (double); - } + } /* end if Equil ... LAPACK style, not involving MC64 */ - else - assert (GAstore->nzval == NULL); - } - - /* ------------------------------------------------------------ - Find the row permutation for A. - ------------------------------------------------------------ */ - if (options->RowPerm != NO) { - t = SuperLU_timer_ (); - if (Fact != SamePattern_SameRowPerm) { - if (options->RowPerm == MY_PERMR) { - /* Use user's perm_r. */ - /* Permute the global matrix GA for symbfact() */ - for (i = 0; i < colptr[n]; ++i) { - irow = rowind[i]; - rowind[i] = perm_r[irow]; - } - } else if ( options->RowPerm == LargeDiag_MC64 ) { - /* Get a new perm_r[] */ - if (job == 5) { - /* Allocate storage for scaling factors. */ - if (!(R1 = doubleMalloc_dist (m))) - ABORT ("SUPERLU_MALLOC fails for R1[]"); - if (!(C1 = doubleMalloc_dist (n))) - ABORT ("SUPERLU_MALLOC fails for C1[]"); - } - - if ( iam==0 ) { - /* Process 0 finds a row permutation */ - iinfo = dldperm_dist (job, m, nnz, colptr, rowind, a_GA, - perm_r, R1, C1); - MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); - if ( iinfo == 0 ) { - MPI_Bcast (perm_r, m, mpi_int_t, 0, grid->comm); - if (job == 5 && Equil) { - MPI_Bcast (R1, m, MPI_DOUBLE, 0, grid->comm); - MPI_Bcast (C1, n, MPI_DOUBLE, 0, grid->comm); - } - } - } else { - MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); - if ( iinfo == 0 ) { - MPI_Bcast (perm_r, m, mpi_int_t, 0, grid->comm); - if (job == 5 && Equil) { - MPI_Bcast (R1, m, MPI_DOUBLE, 0, grid->comm); - MPI_Bcast (C1, n, MPI_DOUBLE, 0, grid->comm); + if (!factored) /* Skip this if already factored. */ + { + /* + * Gather A from the distributed compressed row format to + * global A in compressed column format. + * Numerical values are gathered only when a row permutation + * for large diagonal is sought after. 
+ */ + if (Fact != SamePattern_SameRowPerm && + (parSymbFact == NO || options->RowPerm != NO)) + { + + need_value = (options->RowPerm == LargeDiag_MC64); + + pdCompRow_loc_to_CompCol_global (need_value, A, grid, &GA); + + GAstore = (NCformat *) GA.Store; + colptr = GAstore->colptr; + rowind = GAstore->rowind; + nnz = GAstore->nnz; + GA_mem_use = (nnz + n + 1) * sizeof (int_t); + + if (need_value) + { + a_GA = (double *) GAstore->nzval; + GA_mem_use += nnz * sizeof (double); } - } - } - if ( iinfo && job == 5) { /* Error return */ - SUPERLU_FREE(R1); - SUPERLU_FREE(C1); + else + assert (GAstore->nzval == NULL); } + + /* ------------------------------------------------------------ + Find the row permutation for A. + ------------------------------------------------------------ */ + if (options->RowPerm != NO) + { + t = SuperLU_timer_ (); + if (Fact != SamePattern_SameRowPerm) + { + if (options->RowPerm == MY_PERMR) + { + /* Use user's perm_r. */ + /* Permute the global matrix GA for symbfact() */ + for (i = 0; i < colptr[n]; ++i) + { + irow = rowind[i]; + rowind[i] = perm_r[irow]; + } + } + else if ( options->RowPerm == LargeDiag_MC64 ) + { + /* Get a new perm_r[] */ + if (job == 5) + { + /* Allocate storage for scaling factors. */ + if (!(R1 = doubleMalloc_dist (m))) + ABORT ("SUPERLU_MALLOC fails for R1[]"); + if (!(C1 = doubleMalloc_dist (n))) + ABORT ("SUPERLU_MALLOC fails for C1[]"); + } + + if ( iam == 0 ) + { + /* Process 0 finds a row permutation */ + iinfo = dldperm_dist (job, m, nnz, colptr, rowind, a_GA, + perm_r, R1, C1); + MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); + if ( iinfo == 0 ) + { + MPI_Bcast (perm_r, m, mpi_int_t, 0, grid->comm); + if (job == 5 && Equil) + { + MPI_Bcast (R1, m, MPI_DOUBLE, 0, grid->comm); + MPI_Bcast (C1, n, MPI_DOUBLE, 0, grid->comm); + } + } + } + else + { + MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); + if ( iinfo == 0 ) + { + MPI_Bcast (perm_r, m, mpi_int_t, 0, grid->comm); + if (job == 5 && Equil) + { + MPI_Bcast (R1, m, MPI_DOUBLE, 0, grid->comm); + MPI_Bcast (C1, n, MPI_DOUBLE, 0, grid->comm); + } + } + } + + if ( iinfo && job == 5) /* Error return */ + { + SUPERLU_FREE(R1); + SUPERLU_FREE(C1); + } +#if ( PRNTlevel>=2 ) + dmin = damch_dist ("Overflow"); + dsum = 0.0; + dprod = 1.0; +#endif + if ( iinfo == 0 ) + { + if (job == 5) + { + if ( Equil ) + { + for (i = 0; i < n; ++i) + { + R1[i] = exp (R1[i]); + C1[i] = exp (C1[i]); + } + + /* Scale the distributed matrix further. + A <-- diag(R1)*A*diag(C1) */ + irow = fst_row; + for (j = 0; j < m_loc; ++j) + { + for (i = rowptr[j]; i < rowptr[j + 1]; ++i) + { + icol = colind[i]; + a[i] *= R1[irow] * C1[icol]; #if ( PRNTlevel>=2 ) - dmin = damch_dist ("Overflow"); - dsum = 0.0; - dprod = 1.0; + if (perm_r[irow] == icol) + { + /* New diagonal */ + if (job == 2 || job == 3) + dmin = SUPERLU_MIN(dmin, fabs(a[i])); + else if (job == 4) + dsum += fabs(a[i]); + else if (job == 5) + dprod *= fabs(a[i]); + } #endif - if ( iinfo == 0 ) { - if (job == 5) { - if ( Equil ) { - for (i = 0; i < n; ++i) { - R1[i] = exp (R1[i]); - C1[i] = exp (C1[i]); - } - - /* Scale the distributed matrix further. - A <-- diag(R1)*A*diag(C1) */ - irow = fst_row; - for (j = 0; j < m_loc; ++j) { - for (i = rowptr[j]; i < rowptr[j + 1]; ++i) { - icol = colind[i]; - a[i] *= R1[irow] * C1[icol]; + } + ++irow; + } + + /* Multiply together the scaling factors -- + R/C from simple scheme, R1/C1 from MC64. 
*/ + if (rowequ) + for (i = 0; i < m; ++i) R[i] *= R1[i]; + else + for (i = 0; i < m; ++i) R[i] = R1[i]; + if (colequ) + for (i = 0; i < n; ++i) C[i] *= C1[i]; + else + for (i = 0; i < n; ++i) C[i] = C1[i]; + + ScalePermstruct->DiagScale = BOTH; + rowequ = colequ = 1; + + } /* end if Equil */ + + /* Now permute global A to prepare for symbfact() */ + for (j = 0; j < n; ++j) + { + for (i = colptr[j]; i < colptr[j + 1]; ++i) + { + irow = rowind[i]; + rowind[i] = perm_r[irow]; + } + } + SUPERLU_FREE (R1); + SUPERLU_FREE (C1); + } + else /* job = 2,3,4 */ + { + for (j = 0; j < n; ++j) + { + for (i = colptr[j]; i < colptr[j + 1]; ++i) + { + irow = rowind[i]; + rowind[i] = perm_r[irow]; + } /* end for i ... */ + } /* end for j ... */ + } /* end else job ... */ + } + else /* if iinfo != 0 */ + { + for (i = 0; i < m; ++i) perm_r[i] = i; + } #if ( PRNTlevel>=2 ) - if (perm_r[irow] == icol) { - /* New diagonal */ if (job == 2 || job == 3) - dmin = SUPERLU_MIN(dmin, fabs(a[i])); + { + if (!iam) + printf ("\tsmallest diagonal %e\n", dmin); + } else if (job == 4) - dsum += fabs(a[i]); + { + if (!iam) + printf ("\tsum of diagonal %e\n", dsum); + } else if (job == 5) - dprod *= fabs(a[i]); - } + { + if (!iam) + printf ("\t product of diagonal %e\n", dprod); + } #endif } - ++irow; - } - - /* Multiply together the scaling factors -- - R/C from simple scheme, R1/C1 from MC64. */ - if (rowequ) - for (i = 0; i < m; ++i) R[i] *= R1[i]; - else - for (i = 0; i < m; ++i) R[i] = R1[i]; - if (colequ) - for (i = 0; i < n; ++i) C[i] *= C1[i]; - else - for (i = 0; i < n; ++i) C[i] = C1[i]; - - ScalePermstruct->DiagScale = BOTH; - rowequ = colequ = 1; - - } /* end if Equil */ - - /* Now permute global A to prepare for symbfact() */ - for (j = 0; j < n; ++j) { - for (i = colptr[j]; i < colptr[j + 1]; ++i) { - irow = rowind[i]; - rowind[i] = perm_r[irow]; - } - } - SUPERLU_FREE (R1); - SUPERLU_FREE (C1); - } else { /* job = 2,3,4 */ - for (j = 0; j < n; ++j) { - for (i = colptr[j]; i < colptr[j + 1]; ++i) - { - irow = rowind[i]; - rowind[i] = perm_r[irow]; - } /* end for i ... */ - } /* end for j ... */ - } /* end else job ... */ - } else { /* if iinfo != 0 */ - for (i = 0; i < m; ++i) perm_r[i] = i; - } -#if ( PRNTlevel>=2 ) - if (job == 2 || job == 3) { - if (!iam) - printf ("\tsmallest diagonal %e\n", dmin); - } else if (job == 4) { - if (!iam) - printf ("\tsum of diagonal %e\n", dsum); - } else if (job == 5) { - if (!iam) - printf ("\t product of diagonal %e\n", dprod); - } -#endif - } else { /* use largeDiag_AWPM */ + else /* use largeDiag_AWPM */ + { #ifdef HAVE_COMBBLAS - c2cpp_GetAWPM(A, grid, ScalePermstruct); + c2cpp_GetAWPM(A, grid, ScalePermstruct); #else - if ( iam == 0 ) { - printf("CombBLAS is not available\n"); fflush(stdout); - } + if ( iam == 0 ) + { + printf("CombBLAS is not available\n"); fflush(stdout); + } #endif - } /* end if-else options->RowPerm ... */ + } /* end if-else options->RowPerm ... */ - t = SuperLU_timer_ () - t; - stat->utime[ROWPERM] = t; + t = SuperLU_timer_ () - t; + stat->utime[ROWPERM] = t; #if ( PRNTlevel>=1 ) - if ( !iam ) { - printf(".. LDPERM job " IFMT "\t time: %.2f\n", job, t); - fflush(stdout); - } + if ( !iam ) + { + printf(".. LDPERM job " IFMT "\t time: %.2f\n", job, t); + fflush(stdout); + } #endif - } /* end if Fact not SamePattern_SameRowPerm ... */ - } else { /* options->RowPerm == NOROWPERM / NATURAL */ - for (i = 0; i < m; ++i) perm_r[i] = i; - } - + } /* end if Fact not SamePattern_SameRowPerm ... 
*/ + } + else /* options->RowPerm == NOROWPERM / NATURAL */ + { + for (i = 0; i < m; ++i) perm_r[i] = i; + } + #if ( DEBUGlevel>=2 ) - if (!iam) - PrintInt10 ("perm_r", m, perm_r); + if (!iam) + PrintInt10 ("perm_r", m, perm_r); #endif - } /* end if (!factored) */ - - if (!factored || options->IterRefine) { - /* Compute norm(A), which will be used to adjust small diagonal. */ - if (notran) - *(unsigned char *) norm = '1'; - else - *(unsigned char *) norm = 'I'; - anorm = pdlangs (norm, A, grid); + } /* end if (!factored) */ + + if (!factored || options->IterRefine) + { + /* Compute norm(A), which will be used to adjust small diagonal. */ + if (notran) + *(unsigned char *) norm = '1'; + else + *(unsigned char *) norm = 'I'; + anorm = pdlangs (norm, A, grid); #if ( PRNTlevel>=1 ) - if (!iam) { - printf (".. anorm %e\n", anorm); fflush(stdout); - } -#endif - } - - - /* ------------------------------------------------------------ - Perform the LU factorization. - ------------------------------------------------------------ */ - if (!factored) { - t = SuperLU_timer_ (); - /* - * Get column permutation vector perm_c[], according to permc_spec: - * permc_spec = NATURAL: natural ordering - * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A - * permc_spec = MMD_ATA: minimum degree on structure of A'*A - * permc_spec = METIS_AT_PLUS_A: METIS on structure of A'+A - * permc_spec = PARMETIS: parallel METIS on structure of A'+A - * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] - */ - permc_spec = options->ColPerm; - - if (parSymbFact == YES || permc_spec == PARMETIS) { - nprocs_num = grid->nprow * grid->npcol; - noDomains = (int) (pow (2, ((int) LOG2 (nprocs_num)))); - - /* create a new communicator for the first noDomains - processes in grid->comm */ - key = iam; - if (iam < noDomains) - col = 0; - else - col = MPI_UNDEFINED; - MPI_Comm_split (grid->comm, col, key, &symb_comm); - - if (permc_spec == NATURAL || permc_spec == MY_PERMC) { - if (permc_spec == NATURAL) + if (!iam) { - for (j = 0; j < n; ++j) - perm_c[j] = j; + printf (".. anorm %e\n", anorm); fflush(stdout); } - if (!(sizes = intMalloc_dist (2 * noDomains))) - ABORT ("SUPERLU_MALLOC fails for sizes."); - if (!(fstVtxSep = intMalloc_dist (2 * noDomains))) - ABORT ("SUPERLU_MALLOC fails for fstVtxSep."); - for (i = 0; i < 2 * noDomains - 2; ++i) { - sizes[i] = 0; - fstVtxSep[i] = 0; - } - sizes[2 * noDomains - 2] = m; - fstVtxSep[2 * noDomains - 2] = 0; - } else if (permc_spec != PARMETIS) { - /* same as before */ - printf("{%4d,%4d}: pdgssvx3d: invalid ColPerm option when ParSymbfact is used\n", - (int) MYROW(grid->iam, grid), (int) MYCOL(grid->iam, grid)); - } - } /* end ... use parmetis */ - - if (permc_spec != MY_PERMC && Fact == DOFACT) { - if (permc_spec == PARMETIS) { - /* Get column permutation vector in perm_c. * - * This routine takes as input the distributed input matrix A * - * and does not modify it. It also allocates memory for * - * sizes[] and fstVtxSep[] arrays, that contain information * - * on the separator tree computed by ParMETIS. */ - flinfo = get_perm_c_parmetis (A, perm_r, perm_c, nprocs_num, - noDomains, &sizes, &fstVtxSep, - grid, &symb_comm); - if (flinfo > 0) - ABORT ("ERROR in get perm_c parmetis."); - } else { - get_perm_c_dist (iam, permc_spec, &GA, perm_c); +#endif } - } - - stat->utime[COLPERM] = SuperLU_timer_ () - t; - - /* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc' - (a.k.a. column etree), depending on the choice of ColPerm. 
- Adjust perm_c[] to be consistent with a postorder of etree. - Permute columns of A to form A*Pc'. */ - if (Fact != SamePattern_SameRowPerm) { - if (parSymbFact == NO) { - - int_t *GACcolbeg, *GACcolend, *GACrowind; - - sp_colorder (options, &GA, perm_c, etree, &GAC); - - /* Form Pc*A*Pc' to preserve the diagonal of the matrix GAC. */ - GACstore = (NCPformat *) GAC.Store; - GACcolbeg = GACstore->colbeg; - GACcolend = GACstore->colend; - GACrowind = GACstore->rowind; - for (j = 0; j < n; ++j) { - for (i = GACcolbeg[j]; i < GACcolend[j]; ++i) { - irow = GACrowind[i]; - GACrowind[i] = perm_c[irow]; + + + /* ------------------------------------------------------------ + Perform the LU factorization. + ------------------------------------------------------------ */ + if (!factored) + { + t = SuperLU_timer_ (); + /* + * Get column permutation vector perm_c[], according to permc_spec: + * permc_spec = NATURAL: natural ordering + * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A + * permc_spec = MMD_ATA: minimum degree on structure of A'*A + * permc_spec = METIS_AT_PLUS_A: METIS on structure of A'+A + * permc_spec = PARMETIS: parallel METIS on structure of A'+A + * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] + */ + permc_spec = options->ColPerm; + + if (parSymbFact == YES || permc_spec == PARMETIS) + { + nprocs_num = grid->nprow * grid->npcol; + noDomains = (int) (pow (2, ((int) LOG2 (nprocs_num)))); + + /* create a new communicator for the first noDomains + processes in grid->comm */ + key = iam; + if (iam < noDomains) + col = 0; + else + col = MPI_UNDEFINED; + MPI_Comm_split (grid->comm, col, key, &symb_comm); + + if (permc_spec == NATURAL || permc_spec == MY_PERMC) + { + if (permc_spec == NATURAL) + { + for (j = 0; j < n; ++j) + perm_c[j] = j; + } + if (!(sizes = intMalloc_dist (2 * noDomains))) + ABORT ("SUPERLU_MALLOC fails for sizes."); + if (!(fstVtxSep = intMalloc_dist (2 * noDomains))) + ABORT ("SUPERLU_MALLOC fails for fstVtxSep."); + for (i = 0; i < 2 * noDomains - 2; ++i) + { + sizes[i] = 0; + fstVtxSep[i] = 0; + } + sizes[2 * noDomains - 2] = m; + fstVtxSep[2 * noDomains - 2] = 0; + } + else if (permc_spec != PARMETIS) + { + /* same as before */ + printf("{%4d,%4d}: pdgssvx3d: invalid ColPerm option when ParSymbfact is used\n", + (int) MYROW(grid->iam, grid), (int) MYCOL(grid->iam, grid)); + } + } /* end ... use parmetis */ + + if (permc_spec != MY_PERMC && Fact == DOFACT) + { + if (permc_spec == PARMETIS) + { + /* Get column permutation vector in perm_c. * + * This routine takes as input the distributed input matrix A * + * and does not modify it. It also allocates memory for * + * sizes[] and fstVtxSep[] arrays, that contain information * + * on the separator tree computed by ParMETIS. */ + flinfo = get_perm_c_parmetis (A, perm_r, perm_c, nprocs_num, + noDomains, &sizes, &fstVtxSep, + grid, &symb_comm); + if (flinfo > 0) + ABORT ("ERROR in get perm_c parmetis."); + } + else + { + get_perm_c_dist (iam, permc_spec, &GA, perm_c); + } } - } - - - /* Perform a symbolic factorization on Pc*Pr*A*Pc' and set up - the nonzero data structures for L & U. */ + + stat->utime[COLPERM] = SuperLU_timer_ () - t; + + /* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc' + (a.k.a. column etree), depending on the choice of ColPerm. + Adjust perm_c[] to be consistent with a postorder of etree. + Permute columns of A to form A*Pc'. 
*/ + if (Fact != SamePattern_SameRowPerm) + { + if (parSymbFact == NO) + { + + int_t *GACcolbeg, *GACcolend, *GACrowind; + + sp_colorder (options, &GA, perm_c, etree, &GAC); + + /* Form Pc*A*Pc' to preserve the diagonal of the matrix GAC. */ + GACstore = (NCPformat *) GAC.Store; + GACcolbeg = GACstore->colbeg; + GACcolend = GACstore->colend; + GACrowind = GACstore->rowind; + for (j = 0; j < n; ++j) + { + for (i = GACcolbeg[j]; i < GACcolend[j]; ++i) + { + irow = GACrowind[i]; + GACrowind[i] = perm_c[irow]; + } + } + + + /* Perform a symbolic factorization on Pc*Pr*A*Pc' and set up + the nonzero data structures for L & U. */ #if ( PRNTlevel>=1 ) - if (!iam) - printf - (".. symbfact(): relax %4d, maxsuper %4d, fill %4d\n", - sp_ienv_dist (2), sp_ienv_dist (3), sp_ienv_dist (6)); + if (!iam) + printf + (".. symbfact(): relax %4d, maxsuper %4d, fill %4d\n", + sp_ienv_dist (2), sp_ienv_dist (3), sp_ienv_dist (6)); #endif - t = SuperLU_timer_ (); - if (!(Glu_freeable = (Glu_freeable_t *) - SUPERLU_MALLOC (sizeof (Glu_freeable_t)))) - ABORT ("Malloc fails for Glu_freeable."); + t = SuperLU_timer_ (); + if (!(Glu_freeable = (Glu_freeable_t *) + SUPERLU_MALLOC (sizeof (Glu_freeable_t)))) + ABORT ("Malloc fails for Glu_freeable."); - /* Every process does this. */ - iinfo = symbfact (options, iam, &GAC, perm_c, etree, - Glu_persist, Glu_freeable); + /* Every process does this. */ + iinfo = symbfact (options, iam, &GAC, perm_c, etree, + Glu_persist, Glu_freeable); - stat->utime[SYMBFAC] = SuperLU_timer_ () - t; - if (iinfo < 0) { - /* Successful return */ - QuerySpace_dist (n, -iinfo, Glu_freeable, &symb_mem_usage); + stat->utime[SYMBFAC] = SuperLU_timer_ () - t; + if (iinfo < 0) + { + /* Successful return */ + QuerySpace_dist (n, -iinfo, Glu_freeable, &symb_mem_usage); #if ( PRNTlevel>=1 ) - if (!iam) { - printf ("\tNo of supers %ld\n", - Glu_persist->supno[n - 1] + 1); - printf ("\tSize of G(L) %ld\n", - Glu_freeable->xlsub[n]); - printf ("\tSize of G(U) %ld\n", - Glu_freeable->xusub[n]); - printf ("\tint %d, short %d, float %d, double %d\n", - sizeof (int_t), sizeof (short), - sizeof (float), sizeof (double)); - printf - ("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", - symb_mem_usage.for_lu * 1e-6, - symb_mem_usage.total * 1e-6, - symb_mem_usage.expansions); - } + if (!iam) + { + printf ("\tNo of supers %ld\n", + Glu_persist->supno[n - 1] + 1); + printf ("\tSize of G(L) %ld\n", + Glu_freeable->xlsub[n]); + printf ("\tSize of G(U) %ld\n", + Glu_freeable->xusub[n]); + printf ("\tint %d, short %d, float %d, double %d\n", + sizeof (int_t), sizeof (short), + sizeof (float), sizeof (double)); + printf + ("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", + symb_mem_usage.for_lu * 1e-6, + symb_mem_usage.total * 1e-6, + symb_mem_usage.expansions); + } #endif - } else { - if (!iam) { - fprintf (stderr, "symbfact() error returns %d\n", - (int) iinfo); - exit (-1); + } + else + { + if (!iam) + { + fprintf (stderr, "symbfact() error returns %d\n", + (int) iinfo); + exit (-1); + } + } + + } /* end serial symbolic factorization */ + else /* parallel symbolic factorization */ + { + t = SuperLU_timer_ (); + flinfo = + symbfact_dist (nprocs_num, noDomains, A, perm_c, perm_r, + sizes, fstVtxSep, &Pslu_freeable, + &(grid->comm), &symb_comm, + &symb_mem_usage); + stat->utime[SYMBFAC] = SuperLU_timer_ () - t; + if (flinfo > 0) + ABORT + ("Insufficient memory for parallel symbolic factorization."); + } + + /* Destroy GA */ + if (parSymbFact == NO || options->RowPerm != NO) + 
Destroy_CompCol_Matrix_dist (&GA); + if (parSymbFact == NO) + Destroy_CompCol_Permuted_dist (&GAC); + + } /* end if Fact not SamePattern_SameRowPerm */ + + if (sizes) + SUPERLU_FREE (sizes); + if (fstVtxSep) + SUPERLU_FREE (fstVtxSep); + if (symb_comm != MPI_COMM_NULL) + MPI_Comm_free (&symb_comm); + + if (parSymbFact == NO || Fact == SamePattern_SameRowPerm) + { + /* Apply column permutation to the original distributed A */ + for (j = 0; j < nnz_loc; ++j) + colind[j] = perm_c[colind[j]]; + + /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. + NOTE: the row permutation Pc*Pr is applied internally in the + distribution routine. */ + t = SuperLU_timer_ (); + dist_mem_use = pddistribute (Fact, n, A, ScalePermstruct, + Glu_freeable, LUstruct, grid); + stat->utime[DIST] = SuperLU_timer_ () - t; + + /* Deallocate storage used in symbolic factorization. */ + if (Fact != SamePattern_SameRowPerm) + { + iinfo = symbfact_SubFree (Glu_freeable); + SUPERLU_FREE (Glu_freeable); + } } - } - - } /* end serial symbolic factorization */ - else { /* parallel symbolic factorization */ - t = SuperLU_timer_ (); - flinfo = - symbfact_dist (nprocs_num, noDomains, A, perm_c, perm_r, - sizes, fstVtxSep, &Pslu_freeable, - &(grid->comm), &symb_comm, - &symb_mem_usage); - stat->utime[SYMBFAC] = SuperLU_timer_ () - t; - if (flinfo > 0) - ABORT - ("Insufficient memory for parallel symbolic factorization."); - } + else + { + /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. + NOTE: the row permutation Pc*Pr is applied internally in the + distribution routine. */ + /* Apply column permutation to the original distributed A */ + for (j = 0; j < nnz_loc; ++j) + colind[j] = perm_c[colind[j]]; - /* Destroy GA */ - if (parSymbFact == NO || options->RowPerm != NO) - Destroy_CompCol_Matrix_dist (&GA); - if (parSymbFact == NO) - Destroy_CompCol_Permuted_dist (&GAC); - - } /* end if Fact not SamePattern_SameRowPerm */ - - if (sizes) - SUPERLU_FREE (sizes); - if (fstVtxSep) - SUPERLU_FREE (fstVtxSep); - if (symb_comm != MPI_COMM_NULL) - MPI_Comm_free (&symb_comm); - - if (parSymbFact == NO || Fact == SamePattern_SameRowPerm) { - /* Apply column permutation to the original distributed A */ - for (j = 0; j < nnz_loc; ++j) - colind[j] = perm_c[colind[j]]; - - /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. - NOTE: the row permutation Pc*Pr is applied internally in the - distribution routine. */ - t = SuperLU_timer_ (); - dist_mem_use = pddistribute (Fact, n, A, ScalePermstruct, - Glu_freeable, LUstruct, grid); - stat->utime[DIST] = SuperLU_timer_ () - t; - - /* Deallocate storage used in symbolic factorization. */ - if (Fact != SamePattern_SameRowPerm) - { - iinfo = symbfact_SubFree (Glu_freeable); - SUPERLU_FREE (Glu_freeable); - } - } else { - /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. - NOTE: the row permutation Pc*Pr is applied internally in the - distribution routine. 
*/ - /* Apply column permutation to the original distributed A */ - for (j = 0; j < nnz_loc; ++j) - colind[j] = perm_c[colind[j]]; - - t = SuperLU_timer_ (); - dist_mem_use = ddist_psymbtonum (Fact, n, A, ScalePermstruct, - &Pslu_freeable, LUstruct, grid); - if (dist_mem_use > 0) - ABORT ("Not enough memory available for dist_psymbtonum\n"); + t = SuperLU_timer_ (); + dist_mem_use = ddist_psymbtonum (Fact, n, A, ScalePermstruct, + &Pslu_freeable, LUstruct, grid); + if (dist_mem_use > 0) + ABORT ("Not enough memory available for dist_psymbtonum\n"); - stat->utime[DIST] = SuperLU_timer_ () - t; - } + stat->utime[DIST] = SuperLU_timer_ () - t; + } - /*if (!iam) printf ("\tDISTRIBUTE time %8.2f\n", stat->utime[DIST]); */ - } /* end if not Factored */ - } /* end if process layer 0 */ + /*if (!iam) printf ("\tDISTRIBUTE time %8.2f\n", stat->utime[DIST]); */ + } /* end if not Factored */ + } /* end if process layer 0 */ + + trf3Dpartition_t* trf3Dpartition; + + /* Perform numerical factorization in parallel on all process layers.*/ + if ( !factored ) + { - trf3Dpartition_t* trf3Dpartition; + /* send the data across all the layers */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid3d->zscp.comm); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid3d->zscp.comm); + MPI_Bcast( &anorm, 1, MPI_DOUBLE, 0, grid3d->zscp.comm); - /* Perform numerical factorization in parallel on all process layers.*/ - if ( !factored ) { + /* send the LU structure to all the grids */ + dp3dScatter(n, LUstruct, grid3d); - /* send the data across all the layers */ - MPI_Bcast( &m, 1, mpi_int_t, 0, grid3d->zscp.comm); - MPI_Bcast( &n, 1, mpi_int_t, 0, grid3d->zscp.comm); - MPI_Bcast( &anorm, 1, MPI_DOUBLE, 0, grid3d->zscp.comm); - - /* send the LU structure to all the grids */ - dp3dScatter(n, LUstruct, grid3d); + int_t nsupers = getNsupers(n, LUstruct); + trf3Dpartition = dinitTrf3Dpartition(nsupers, options, LUstruct, grid3d); - int_t nsupers = getNsupers(n, LUstruct); - trf3Dpartition = dinitTrf3Dpartition(nsupers, options, LUstruct, grid3d); + SCT_t *SCT = (SCT_t *) SUPERLU_MALLOC(sizeof(SCT_t)); + SCT_init(SCT); - SCT_t *SCT = (SCT_t *) SUPERLU_MALLOC(sizeof(SCT_t)); - SCT_init(SCT); - #if ( PRNTlevel>=1 ) - if (iam==0) { - printf("after 3D initialization.\n"); fflush(stdout); - } + if (iam == 0) + { + printf("after 3D initialization.\n"); fflush(stdout); + } #endif - t = SuperLU_timer_ (); + t = SuperLU_timer_ (); + + /*factorize in grid 1*/ + // if(grid3d->zscp.Iam) + + pdgstrf3d (options, m, n, anorm, trf3Dpartition, SCT, LUstruct, + grid3d, stat, info); + stat->utime[FACT] = SuperLU_timer_ () - t; - /*factorize in grid 1*/ - // if(grid3d->zscp.Iam) + double tgather = SuperLU_timer_(); - pdgstrf3d (options, m, n, anorm, trf3Dpartition, SCT, LUstruct, - grid3d, stat, info); - stat->utime[FACT] = SuperLU_timer_ () - t; - - double tgather = SuperLU_timer_(); - - dgatherAllFactoredLU(trf3Dpartition, LUstruct, grid3d, SCT); + dgatherAllFactoredLU(trf3Dpartition, LUstruct, grid3d, SCT); + + SCT->gatherLUtimer += SuperLU_timer_() - tgather; + /*print stats for bottom grid*/ - SCT->gatherLUtimer += SuperLU_timer_() - tgather; - /*print stats for bottom grid*/ - #if ( PRNTlevel>=1 ) - if (!grid3d->zscp.Iam) - { - SCT_print(grid, SCT); - SCT_print3D(grid3d, SCT); - } - SCT_printComm3D(grid3d, SCT); - - /*print memory usage*/ - d3D_printMemUse( trf3Dpartition, LUstruct, grid3d ); - - /*print forest weight and costs*/ - printForestWeightCost(trf3Dpartition->sForests, SCT, grid3d); - /*reduces stat from all the layers*/ + if (!grid3d->zscp.Iam) + { + 
SCT_print(grid, SCT); + SCT_print3D(grid3d, SCT); + } + SCT_printComm3D(grid3d, SCT); + + /*print memory usage*/ + d3D_printMemUse( trf3Dpartition, LUstruct, grid3d ); + + /*print forest weight and costs*/ + printForestWeightCost(trf3Dpartition->sForests, SCT, grid3d); + /*reduces stat from all the layers*/ #endif - dDestroy_trf3Dpartition(trf3Dpartition, grid3d); - SCT_free(SCT); - - } /* end if not Factored */ - - if ( grid3d->zscp.Iam == 0 ) { // only process layer 0 - if (!factored) { - if (options->PrintStat) { - int_t TinyPivots; - float for_lu, total, max, avg, temp; - - dQuerySpace_dist (n, LUstruct, grid, stat, &num_mem_usage); - - if (parSymbFact == TRUE) { - /* The memory used in the redistribution routine - includes the memory used for storing the symbolic - structure and the memory allocated for numerical factorization */ - temp = SUPERLU_MAX (symb_mem_usage.total, -dist_mem_use); - if (options->RowPerm != NO) - temp = SUPERLU_MAX (temp, GA_mem_use); - } - else { - temp = SUPERLU_MAX (symb_mem_usage.total + GA_mem_use, /* symbfact step */ - symb_mem_usage.for_lu + dist_mem_use + num_mem_usage.for_lu /* distribution step */ - ); - } + dDestroy_trf3Dpartition(trf3Dpartition, grid3d); + SCT_free(SCT); - temp = SUPERLU_MAX (temp, num_mem_usage.total); + } /* end if not Factored */ - MPI_Reduce (&temp, &max, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm); - MPI_Reduce (&temp, &avg, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); - MPI_Allreduce (&stat->TinyPivots, &TinyPivots, 1, mpi_int_t, - MPI_SUM, grid->comm); - stat->TinyPivots = TinyPivots; + if ( grid3d->zscp.Iam == 0 ) // only process layer 0 + { + if (!factored) + { + if (options->PrintStat) + { + int_t TinyPivots; + float for_lu, total, max, avg, temp; + + dQuerySpace_dist (n, LUstruct, grid, stat, &num_mem_usage); + + if (parSymbFact == TRUE) + { + /* The memory used in the redistribution routine + includes the memory used for storing the symbolic + structure and the memory allocated for numerical factorization */ + temp = SUPERLU_MAX (symb_mem_usage.total, -dist_mem_use); + if (options->RowPerm != NO) + temp = SUPERLU_MAX (temp, GA_mem_use); + } + else + { + temp = SUPERLU_MAX (symb_mem_usage.total + GA_mem_use, /* symbfact step */ + symb_mem_usage.for_lu + dist_mem_use + num_mem_usage.for_lu /* distribution step */ + ); + } + + temp = SUPERLU_MAX (temp, num_mem_usage.total); + + MPI_Reduce (&temp, &max, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm); + MPI_Reduce (&temp, &avg, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); + MPI_Allreduce (&stat->TinyPivots, &TinyPivots, 1, mpi_int_t, + MPI_SUM, grid->comm); + stat->TinyPivots = TinyPivots; - MPI_Reduce (&num_mem_usage.for_lu, &for_lu, - 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); - MPI_Reduce (&num_mem_usage.total, &total, + MPI_Reduce (&num_mem_usage.for_lu, &for_lu, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); + MPI_Reduce (&num_mem_usage.total, &total, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); - if (!iam) { - printf("\tNUMfact space (MB) sum(procs): L\\U\t%.2f\tall\t%.2f\n", - for_lu * 1e-6, total * 1e-6); - printf ("\tTotal highmark (MB): " - "All\t%.2f\tAvg\t%.2f\tMax\t%.2f\n", avg * 1e-6, - avg / grid->nprow / grid->npcol * 1e-6, max * 1e-6); - printf("**************************************************\n"); - fflush(stdout); - } - } - - } /* end if not Factored */ - - /* ------------------------------------------------------------ - Compute the solution matrix X. 
- ------------------------------------------------------------ */ - if (nrhs) { - if (!(b_work = doubleMalloc_dist (n))) - ABORT ("Malloc fails for b_work[]"); - - /* ------------------------------------------------------ - Scale the right-hand side if equilibration was performed - ------------------------------------------------------*/ - if (notran) - { - if (rowequ) - { + if (!iam) + { + printf("\tNUMfact space (MB) sum(procs): L\\U\t%.2f\tall\t%.2f\n", + for_lu * 1e-6, total * 1e-6); + printf ("\tTotal highmark (MB): " + "All\t%.2f\tAvg\t%.2f\tMax\t%.2f\n", avg * 1e-6, + avg / grid->nprow / grid->npcol * 1e-6, max * 1e-6); + printf("**************************************************\n"); + fflush(stdout); + } + } + + } /* end if not Factored */ + + /* ------------------------------------------------------------ + Compute the solution matrix X. + ------------------------------------------------------------ */ + if (nrhs) + { + if (!(b_work = doubleMalloc_dist (n))) + ABORT ("Malloc fails for b_work[]"); + + /* ------------------------------------------------------ + Scale the right-hand side if equilibration was performed + ------------------------------------------------------*/ + if (notran) + { + if (rowequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + b_col[i] *= R[irow]; + ++irow; + } + b_col += ldb; + } + } + } + else if (colequ) + { b_col = B; for (j = 0; j < nrhs; ++j) - { + { irow = fst_row; for (i = 0; i < m_loc; ++i) - { - b_col[i] *= R[irow]; + { + b_col[i] *= C[irow]; ++irow; - } + } b_col += ldb; - } - } - } - else if (colequ) - { + } + } + + /* Save a copy of the right-hand side. */ + ldx = ldb; + if (!(X = doubleMalloc_dist (((size_t) ldx) * nrhs))) + ABORT ("Malloc fails for X[]"); + x_col = X; b_col = B; for (j = 0; j < nrhs; ++j) - { - irow = fst_row; - for (i = 0; i < m_loc; ++i) - { - b_col[i] *= C[irow]; - ++irow; - } + { + for (i = 0; i < m_loc; ++i) x_col[i] = b_col[i]; + x_col += ldx; b_col += ldb; - } - } - - /* Save a copy of the right-hand side. */ - ldx = ldb; - if (!(X = doubleMalloc_dist (((size_t) ldx) * nrhs))) - ABORT ("Malloc fails for X[]"); - x_col = X; - b_col = B; - for (j = 0; j < nrhs; ++j) { - for (i = 0; i < m_loc; ++i) x_col[i] = b_col[i]; - x_col += ldx; - b_col += ldb; - } + } - /* ------------------------------------------------------ - Solve the linear system. - ------------------------------------------------------*/ - if (options->SolveInitialized == NO) /* First time */ - /* Inside this routine, SolveInitialized is set to YES. - For repeated call to pdgssvx3d(), no need to re-initialilze - the Solve data & communication structures, unless a new - factorization with Fact == DOFACT or SamePattern is asked for. */ - { - dSolveInit (options, A, perm_r, perm_c, nrhs, LUstruct, - grid, SOLVEstruct); - } - stat->utime[SOLVE] = 0.0; + /* ------------------------------------------------------ + Solve the linear system. + ------------------------------------------------------*/ + if (options->SolveInitialized == NO) /* First time */ + /* Inside this routine, SolveInitialized is set to YES. + For repeated call to pdgssvx3d(), no need to re-initialilze + the Solve data & communication structures, unless a new + factorization with Fact == DOFACT or SamePattern is asked for. */ + { + dSolveInit (options, A, perm_r, perm_c, nrhs, LUstruct, + grid, SOLVEstruct); + } + stat->utime[SOLVE] = 0.0; #if 0 // Sherry: the following interface is needed by 3D trisolve. 
- pdgstrs_vecpar (n, LUstruct, ScalePermstruct, grid, X, m_loc, - fst_row, ldb, nrhs, SOLVEstruct, stat, info); + pdgstrs_vecpar (n, LUstruct, ScalePermstruct, grid, X, m_loc, + fst_row, ldb, nrhs, SOLVEstruct, stat, info); #else - pdgstrs(n, LUstruct, ScalePermstruct, grid, X, m_loc, - fst_row, ldb, nrhs, SOLVEstruct, stat, info); + pdgstrs(n, LUstruct, ScalePermstruct, grid, X, m_loc, + fst_row, ldb, nrhs, SOLVEstruct, stat, info); #endif - /* ------------------------------------------------------------ - Use iterative refinement to improve the computed solution and - compute error bounds and backward error estimates for it. - ------------------------------------------------------------ */ - if (options->IterRefine) - { - /* Improve the solution by iterative refinement. */ - int_t *it, *colind_gsmv = SOLVEstruct->A_colind_gsmv; - SOLVEstruct_t *SOLVEstruct1; /* Used by refinement. */ + /* ------------------------------------------------------------ + Use iterative refinement to improve the computed solution and + compute error bounds and backward error estimates for it. + ------------------------------------------------------------ */ + if (options->IterRefine) + { + /* Improve the solution by iterative refinement. */ + int_t *it, *colind_gsmv = SOLVEstruct->A_colind_gsmv; + SOLVEstruct_t *SOLVEstruct1; /* Used by refinement. */ - t = SuperLU_timer_ (); - if (options->RefineInitialized == NO || Fact == DOFACT) { - /* All these cases need to re-initialize gsmv structure */ - if (options->RefineInitialized) - pdgsmv_finalize (SOLVEstruct->gsmv_comm); - pdgsmv_init (A, SOLVEstruct->row_to_proc, grid, - SOLVEstruct->gsmv_comm); - - /* Save a copy of the transformed local col indices - in colind_gsmv[]. */ - if (colind_gsmv) SUPERLU_FREE (colind_gsmv); - if (!(it = intMalloc_dist (nnz_loc))) - ABORT ("Malloc fails for colind_gsmv[]"); - colind_gsmv = SOLVEstruct->A_colind_gsmv = it; - for (i = 0; i < nnz_loc; ++i) colind_gsmv[i] = colind[i]; - options->RefineInitialized = YES; - } - else if (Fact == SamePattern || Fact == SamePattern_SameRowPerm) { - double at; - int_t k, jcol, p; - /* Swap to beginning the part of A corresponding to the - local part of X, as was done in pdgsmv_init() */ - for (i = 0; i < m_loc; ++i) { /* Loop through each row */ - k = rowptr[i]; - for (j = rowptr[i]; j < rowptr[i + 1]; ++j) - { - jcol = colind[j]; - p = SOLVEstruct->row_to_proc[jcol]; - if (p == iam) - { /* Local */ - at = a[k]; - a[k] = a[j]; - a[j] = at; - ++k; - } - } - } - - /* Re-use the local col indices of A obtained from the - previous call to pdgsmv_init() */ - for (i = 0; i < nnz_loc; ++i) - colind[i] = colind_gsmv[i]; + t = SuperLU_timer_ (); + if (options->RefineInitialized == NO || Fact == DOFACT) + { + /* All these cases need to re-initialize gsmv structure */ + if (options->RefineInitialized) + pdgsmv_finalize (SOLVEstruct->gsmv_comm); + pdgsmv_init (A, SOLVEstruct->row_to_proc, grid, + SOLVEstruct->gsmv_comm); + + /* Save a copy of the transformed local col indices + in colind_gsmv[]. 
*/ + if (colind_gsmv) SUPERLU_FREE (colind_gsmv); + if (!(it = intMalloc_dist (nnz_loc))) + ABORT ("Malloc fails for colind_gsmv[]"); + colind_gsmv = SOLVEstruct->A_colind_gsmv = it; + for (i = 0; i < nnz_loc; ++i) colind_gsmv[i] = colind[i]; + options->RefineInitialized = YES; + } + else if (Fact == SamePattern || Fact == SamePattern_SameRowPerm) + { + double at; + int_t k, jcol, p; + /* Swap to beginning the part of A corresponding to the + local part of X, as was done in pdgsmv_init() */ + for (i = 0; i < m_loc; ++i) /* Loop through each row */ + { + k = rowptr[i]; + for (j = rowptr[i]; j < rowptr[i + 1]; ++j) + { + jcol = colind[j]; + p = SOLVEstruct->row_to_proc[jcol]; + if (p == iam) + { + /* Local */ + at = a[k]; + a[k] = a[j]; + a[j] = at; + ++k; + } + } + } + + /* Re-use the local col indices of A obtained from the + previous call to pdgsmv_init() */ + for (i = 0; i < nnz_loc; ++i) + colind[i] = colind_gsmv[i]; + } + + if (nrhs == 1) + { + /* Use the existing solve structure */ + SOLVEstruct1 = SOLVEstruct; + } + else + { + /* For nrhs > 1, since refinement is performed for RHS + one at a time, the communication structure for pdgstrs + is different than the solve with nrhs RHS. + So we use SOLVEstruct1 for the refinement step. + */ + if (!(SOLVEstruct1 = (SOLVEstruct_t *) + SUPERLU_MALLOC (sizeof (SOLVEstruct_t)))) + ABORT ("Malloc fails for SOLVEstruct1"); + /* Copy the same stuff */ + SOLVEstruct1->row_to_proc = SOLVEstruct->row_to_proc; + SOLVEstruct1->inv_perm_c = SOLVEstruct->inv_perm_c; + SOLVEstruct1->num_diag_procs = SOLVEstruct->num_diag_procs; + SOLVEstruct1->diag_procs = SOLVEstruct->diag_procs; + SOLVEstruct1->diag_len = SOLVEstruct->diag_len; + SOLVEstruct1->gsmv_comm = SOLVEstruct->gsmv_comm; + SOLVEstruct1->A_colind_gsmv = SOLVEstruct->A_colind_gsmv; + + /* Initialize the *gstrs_comm for 1 RHS. */ + if (!(SOLVEstruct1->gstrs_comm = (pxgstrs_comm_t *) + SUPERLU_MALLOC (sizeof (pxgstrs_comm_t)))) + ABORT ("Malloc fails for gstrs_comm[]"); + pxgstrs_init (n, m_loc, 1, fst_row, perm_r, perm_c, grid, + Glu_persist, SOLVEstruct1); + } + + pdgsrfs (n, A, anorm, LUstruct, ScalePermstruct, grid, + B, ldb, X, ldx, nrhs, SOLVEstruct1, berr, stat, info); + + /* Deallocate the storage associated with SOLVEstruct1 */ + if (nrhs > 1) + { + pxgstrs_finalize (SOLVEstruct1->gstrs_comm); + SUPERLU_FREE (SOLVEstruct1); + } + + stat->utime[REFINE] = SuperLU_timer_ () - t; } - - if (nrhs == 1) - { /* Use the existing solve structure */ - SOLVEstruct1 = SOLVEstruct; - } - else { - /* For nrhs > 1, since refinement is performed for RHS - one at a time, the communication structure for pdgstrs - is different than the solve with nrhs RHS. - So we use SOLVEstruct1 for the refinement step. - */ - if (!(SOLVEstruct1 = (SOLVEstruct_t *) - SUPERLU_MALLOC (sizeof (SOLVEstruct_t)))) - ABORT ("Malloc fails for SOLVEstruct1"); - /* Copy the same stuff */ - SOLVEstruct1->row_to_proc = SOLVEstruct->row_to_proc; - SOLVEstruct1->inv_perm_c = SOLVEstruct->inv_perm_c; - SOLVEstruct1->num_diag_procs = SOLVEstruct->num_diag_procs; - SOLVEstruct1->diag_procs = SOLVEstruct->diag_procs; - SOLVEstruct1->diag_len = SOLVEstruct->diag_len; - SOLVEstruct1->gsmv_comm = SOLVEstruct->gsmv_comm; - SOLVEstruct1->A_colind_gsmv = SOLVEstruct->A_colind_gsmv; - - /* Initialize the *gstrs_comm for 1 RHS. 
*/ - if (!(SOLVEstruct1->gstrs_comm = (pxgstrs_comm_t *) - SUPERLU_MALLOC (sizeof (pxgstrs_comm_t)))) - ABORT ("Malloc fails for gstrs_comm[]"); - pxgstrs_init (n, m_loc, 1, fst_row, perm_r, perm_c, grid, - Glu_persist, SOLVEstruct1); - } - - pdgsrfs (n, A, anorm, LUstruct, ScalePermstruct, grid, - B, ldb, X, ldx, nrhs, SOLVEstruct1, berr, stat, info); - - /* Deallocate the storage associated with SOLVEstruct1 */ - if (nrhs > 1) - { - pxgstrs_finalize (SOLVEstruct1->gstrs_comm); - SUPERLU_FREE (SOLVEstruct1); - } - - stat->utime[REFINE] = SuperLU_timer_ () - t; - } - - /* Permute the solution matrix B <= Pc'*X. */ - pdPermute_Dense_Matrix (fst_row, m_loc, SOLVEstruct->row_to_proc, - SOLVEstruct->inv_perm_c, - X, ldx, B, ldb, nrhs, grid); + + /* Permute the solution matrix B <= Pc'*X. */ + pdPermute_Dense_Matrix (fst_row, m_loc, SOLVEstruct->row_to_proc, + SOLVEstruct->inv_perm_c, + X, ldx, B, ldb, nrhs, grid); #if ( DEBUGlevel>=2 ) - printf ("\n (%d) .. After pdPermute_Dense_Matrix(): b =\n", iam); - for (i = 0; i < m_loc; ++i) - printf ("\t(%d)\t%4d\t%.10f\n", iam, i + fst_row, B[i]); + printf ("\n (%d) .. After pdPermute_Dense_Matrix(): b =\n", iam); + for (i = 0; i < m_loc; ++i) + printf ("\t(%d)\t%4d\t%.10f\n", iam, i + fst_row, B[i]); #endif - - /* Transform the solution matrix X to a solution of the original - system before the equilibration. */ - if (notran) - { - if (colequ) - { + + /* Transform the solution matrix X to a solution of the original + system before the equilibration. */ + if (notran) + { + if (colequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + b_col[i] *= C[irow]; + ++irow; + } + b_col += ldb; + } + } + } + else if (rowequ) + { b_col = B; for (j = 0; j < nrhs; ++j) - { + { irow = fst_row; for (i = 0; i < m_loc; ++i) - { - b_col[i] *= C[irow]; + { + b_col[i] *= R[irow]; ++irow; - } + } b_col += ldb; - } - } - } - else if (rowequ) - { - b_col = B; - for (j = 0; j < nrhs; ++j) - { - irow = fst_row; - for (i = 0; i < m_loc; ++i) - { - b_col[i] *= R[irow]; - ++irow; - } - b_col += ldb; - } - } - - SUPERLU_FREE (b_work); - SUPERLU_FREE (X); - - } /* end if nrhs != 0 */ - + } + } + + SUPERLU_FREE (b_work); + SUPERLU_FREE (X); + + } /* end if nrhs != 0 */ + #if ( PRNTlevel>=1 ) - if (!iam) - printf (".. DiagScale = %d\n", ScalePermstruct->DiagScale); + if (!iam) + printf (".. DiagScale = %d\n", ScalePermstruct->DiagScale); #endif - - /* Deallocate R and/or C if it was not used. */ - if (Equil && Fact != SamePattern_SameRowPerm) - { - switch (ScalePermstruct->DiagScale) - { - case NOEQUIL: - SUPERLU_FREE (R); - SUPERLU_FREE (C); - break; - case ROW: - SUPERLU_FREE (C); - break; - case COL: - SUPERLU_FREE (R); - break; - } - } - + + /* Deallocate R and/or C if it was not used. 
*/
+    if (Equil && Fact != SamePattern_SameRowPerm)
+    {
+        switch (ScalePermstruct->DiagScale)
+        {
+        case NOEQUIL:
+            SUPERLU_FREE (R);
+            SUPERLU_FREE (C);
+            break;
+        case ROW:
+            SUPERLU_FREE (C);
+            break;
+        case COL:
+            SUPERLU_FREE (R);
+            break;
+        }
+    }
+
 #if 0
-    if (!factored && Fact != SamePattern_SameRowPerm && !parSymbFact)
-        Destroy_CompCol_Permuted_dist (&GAC);
+    if (!factored && Fact != SamePattern_SameRowPerm && !parSymbFact)
+        Destroy_CompCol_Permuted_dist (&GAC);
 #endif
-    } /* process layer 0 done solve */
+    } /* process layer 0 done solve */

 #if ( DEBUGlevel>=1 )
     CHECK_MALLOC (iam, "Exit pdgssvx3d()");

From a5b826c95ac900d9bdec48e2754f85515bcc128e Mon Sep 17 00:00:00 2001
From: piyush sao
Date: Thu, 17 Sep 2020 01:14:52 -0400
Subject: [PATCH 023/147] working with pz=1

---
 SRC/pdgssvx3d.c | 61 +++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 54 insertions(+), 7 deletions(-)

diff --git a/SRC/pdgssvx3d.c b/SRC/pdgssvx3d.c
index 915b66db..cb37d50e 100644
--- a/SRC/pdgssvx3d.c
+++ b/SRC/pdgssvx3d.c
@@ -19,6 +19,40 @@ at the top-level directory.
 * May 10, 2019
 */
 #include "superlu_ddefs.h"
+#include <assert.h>
+
+static void checkNRFMT(NRformat_loc*A, NRformat_loc*B)
+{
+    /*
+    int_t nnz_loc;
+    int_t m_loc;
+    int_t fst_row;
+    void *nzval;
+    int_t *rowptr;
+    int_t *colind;
+    */
+
+    assert(A->nnz_loc == B->nnz_loc);
+    assert(A->m_loc == B->m_loc);
+    assert(A->fst_row == B->fst_row);
+
+    for (int_t i = 0; i < A->nnz_loc; i++)
+    {
+        assert(((double *)A->nzval)[i] == ((double *)B->nzval)[i]);
+        // printf("%lf \n", ((double *)A->nzval)[i]);
+        assert((A->colind)[i] == (B->colind)[i]);
+    }
+
+    for (int_t i = 0; i < A->m_loc + 1; i++)
+    {
+        // assert(((double *)A->nzval)[i] ==((double *)B->nzval)[i]);
+        assert((A->rowptr)[i] == (B->rowptr)[i]);
+    }
+
+    printf("Matrix check passed\n");
+
+}
 #if 0
 #include "p3dcomm.h"
 #include "pdgstrf3d.h"
@@ -584,23 +618,36 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
     /* Perform preprocessing steps on process layer zero, including:
        ordering, symbolic factorization, distribution of L & U */
 #define NRFRMT
-#ifdef NRFRMT
+// #ifdef NRFRMT
     NRformat_loc Atmp = dGatherNRformat_loc(
                             (NRformat_loc *) A->Store,
                             grid3d);
-#endif
-
+// #endif
+    NRformat_loc* Astore0 =&Atmp;
     if (grid3d->zscp.Iam == 0)
     {
         m = A->nrow;
         n = A->ncol;
+        // checkNRFMT(Astore0, (NRformat_loc *) A->Store);
-#ifdef NRFRMT
-        Astore = &Atmp;
-#else
+#ifdef NRFMT
+        A->Store = Astore0;
+#endif
         Astore = (NRformat_loc *) A->Store;
-#endif
+
+// #ifdef NRFRMT
+//        Astore = Astore0;
+// #else
+//        Astore = Astore0;
+// //     Astore = (NRformat_loc *) A->Store;
+// //     Astore->nzval = Astore0->nzval;
+// //     Astore->rowptr = Astore0->rowptr;
+// //     Astore->colind = Astore0->colind;
+// //     Astore->nnz_loc = Astore0->nnz_loc;
+// //     Astore->m_loc = Astore0->m_loc;
+// //     Astore->fst_row = Astore0->fst_row;
+// #endif
         nnz_loc = Astore->nnz_loc;
         m_loc = Astore->m_loc;
         fst_row = Astore->fst_row;

From 73a1c22a8439c2fee4cd54c8339162b208b76900 Mon Sep 17 00:00:00 2001
From: piyush sao
Date: Thu, 17 Sep 2020 01:55:04 -0400
Subject: [PATCH 024/147] bug fix: gathering A->colind was using MPI_DOUBLE as
 the type rather than mpi_int_t

---
 SRC/nrformat_loc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
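A note on the class of bug fixed below: MPI collectives do not type-check the datatype argument against the buffer's C type, so describing the int_t array A->colind as MPI_DOUBLE silently transfers the wrong number of bytes whenever sizeof(int_t) != sizeof(double), e.g. in a 32-bit-index build; with 64-bit indices the sizes happen to coincide and the bug hides. A minimal self-contained sketch of the matched-type pattern -- the helper name gather_indices and the MPI_LONG_LONG mapping for a 64-bit index type are illustrative assumptions, not SuperLU_DIST API:

#include <mpi.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical helper: gather ragged blocks of 64-bit indices on rank 0.
 * The send type and receive type must describe the same element width;
 * pairing an integer buffer with MPI_DOUBLE corrupts the gathered data
 * whenever the two widths differ. */
static void gather_indices(int64_t *loc, int nloc, int64_t *glob,
                           int *counts, int *displs, MPI_Comm comm)
{
    MPI_Gatherv(loc, nloc, MPI_LONG_LONG,   /* send: 8-byte integers */
                glob, counts, displs,
                MPI_LONG_LONG,              /* recv: the same type   */
                0, comm);
}

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    int rank, np;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &np);

    int nloc = rank % 8 + 1;               /* ragged block sizes, at most 8 */
    int64_t loc[8];
    for (int i = 0; i < nloc; ++i) loc[i] = 100 * rank + i;

    int *counts = NULL, *displs = NULL;
    int64_t *glob = NULL;
    if (rank == 0) {
        counts = malloc(np * sizeof(int));
        displs = malloc(np * sizeof(int));
    }
    MPI_Gather(&nloc, 1, MPI_INT, counts, 1, MPI_INT, 0, MPI_COMM_WORLD);
    if (rank == 0) {
        int total = 0;
        for (int p = 0; p < np; ++p) { displs[p] = total; total += counts[p]; }
        glob = malloc(total * sizeof(int64_t));
    }
    gather_indices(loc, nloc, glob, counts, displs, MPI_COMM_WORLD);
    if (rank == 0) {
        int total = displs[np - 1] + counts[np - 1];
        for (int i = 0; i < total; ++i) printf("%lld ", (long long)glob[i]);
        printf("\n");
        free(counts); free(displs); free(glob);
    }
    MPI_Finalize();
    return 0;
}

The send type, the receive type, and the element width of both buffers must all agree; the one-word fix in the hunk below makes the receive type mpi_int_t to match the mpi_int_t send type.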
diff --git a/SRC/nrformat_loc.c b/SRC/nrformat_loc.c
index 932bff25..975dfe18 100644
--- a/SRC/nrformat_loc.c
+++ b/SRC/nrformat_loc.c
@@ -56,7 +56,7 @@ NRformat_loc dGatherNRformat_loc(NRformat_loc *A, gridinfo3d_t *grid3d)
                 MPI_DOUBLE, 0, grid3d->zscp.comm);
     MPI_Gatherv(A->colind, A->nnz_loc,
                 mpi_int_t, A2d.colind, nnz_counts_int, nnz_disp,
-                MPI_DOUBLE, 0, grid3d->zscp.comm);
+                mpi_int_t, 0, grid3d->zscp.comm);
     MPI_Gatherv(&A->rowptr[1], A->m_loc,
                 mpi_int_t, &A2d.rowptr[1],
                 row_counts_int, row_disp,
                 mpi_int_t, 0, grid3d->zscp.comm);

From b744611e8197ca776bf991b385efc93e5af5a26c Mon Sep 17 00:00:00 2001
From: piyush sao
Date: Thu, 17 Sep 2020 12:10:59 -0400
Subject: [PATCH 025/147] passing matrixCheck for pz=2 Pxy=1

---
 SRC/nrformat_loc.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/SRC/nrformat_loc.c b/SRC/nrformat_loc.c
index 975dfe18..5c6b655a 100644
--- a/SRC/nrformat_loc.c
+++ b/SRC/nrformat_loc.c
@@ -66,9 +66,10 @@ NRformat_loc dGatherNRformat_loc(NRformat_loc *A, gridinfo3d_t *grid3d)
     {
         for (int i = 0; i < grid3d->npdep; i++)
         {
-            for(int j = row_disp[i]; j < row_disp[i+1]; j++)
+            for(int j = row_disp[i]+1; j < row_disp[i+1]+1; j++)
             {
                 // A2d.rowptr[j] += row_disp[i];
-                A2d.rowptr[j] += row_disp[i];
+                A2d.rowptr[j] +=nnz_disp[i];
+            }
     }
     A2d.nnz_loc = nnz_disp[grid3d->npdep];
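The arithmetic behind the two +1 adjustments above, spelled out: the gathered CSR blocks each carry row pointers that restart at zero, and the gather deposits entries 1..m_i of block i starting at global slot row_disp[i]+1, so slot row_disp[i] already holds the previous block's terminating pointer and must not be shifted again; every entry that does belong to block i is offset by the nonzeros of all earlier blocks, nnz_disp[i]. A self-contained toy version of the same stitching (the sizes and the helper name stitch_rowptr are made up for illustration):

#include <stdio.h>

/* Stitch the row-pointer arrays of stacked CSR blocks in place.
 * Block i's entries occupy global slots row_disp[i]+1 .. row_disp[i+1];
 * each is shifted by the nonzero count of all earlier blocks. */
static void stitch_rowptr(int *rowptr, const int *row_disp,
                          const int *nnz_disp, int nblocks)
{
    for (int i = 0; i < nblocks; i++)
        for (int j = row_disp[i] + 1; j < row_disp[i + 1] + 1; j++)
            rowptr[j] += nnz_disp[i];
}

int main(void)
{
    /* Two blocks of 2 rows each; block 0 has 3 nonzeros, block 1 has 2.
     * Raw gathered pointers: slots 1..2 hold block 0's [2,3], and
     * slots 3..4 hold block 1's local [1,2]. */
    int rowptr[5]   = {0, 2, 3, 1, 2};
    int row_disp[3] = {0, 2, 4};  /* row offsets of the two blocks      */
    int nnz_disp[3] = {0, 3, 5};  /* running nonzero totals per block   */

    stitch_rowptr(rowptr, row_disp, nnz_disp, 2);
    for (int j = 0; j <= 4; j++) printf("%d ", rowptr[j]); /* 0 2 3 4 5 */
    printf("\n");
    return 0;
}

After the shift the array is a valid row-pointer vector for the stacked matrix: it starts at 0, is nondecreasing, and ends at the total nonzero count.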
From 3bd6eb8e716efa8a8419fdd1ddc60f7887e5874c Mon Sep 17 00:00:00 2001
From: piyush sao
Date: Thu, 17 Sep 2020 17:26:51 -0400
Subject: [PATCH 026/147] debugging gathering B

---
 EXAMPLE/pddrive3d.c |  50 ++++--
 SRC/nrformat_loc.c  |  58 ++++++-
 SRC/pdgsequ.c       | 139 ++++++------
 SRC/pdgssvx3d.c     |  78 +++++----
 SRC/pdutil.c        | 399 ++++++++++++++++++++++++--------------------
 SRC/superlu_defs.h  |   4 +-
 6 files changed, 428 insertions(+), 300 deletions(-)

diff --git a/EXAMPLE/pddrive3d.c b/EXAMPLE/pddrive3d.c
index 4bc7f2f4..20d82bfc 100644
--- a/EXAMPLE/pddrive3d.c
+++ b/EXAMPLE/pddrive3d.c
@@ -49,6 +49,19 @@ at the top-level directory.
 *
 */
+static void matCheck(int n, int m, double* A, int LDA
+                     , double* B, int LDB)
+{
+    for (int j = 0; j < m; j++)
+        for (int i = 0; i < n; ++i)
+        {
+            assert(A[i + LDA*j] == B[i + LDB*j]);
+        }
+    printf("B check passed\n");
+    return;
+}
+
+
@@ -250,6 +263,27 @@ main (int argc, char *argv[])
     // *fp0 = *fp;
     dcreate_matrix_postfix3d(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, suffix, &(grid));
 
+#if 1
+    double* B2d;
+    NRformat_loc Atmp = dGatherNRformat_loc(
+                            (NRformat_loc *) A.Store,
+                            b, ldb, nrhs, &B2d,
+                            &(grid));
+    NRformat_loc* Astore = &Atmp;
+    NRformat_loc* Astore0;
+    if ( (grid.zscp.Iam) == 0 )
+    {
+        SuperMatrix Aref;
+        double *bref, *xtrueref;
+        dcreate_matrix_postfix(&Aref, nrhs, &bref, &ldb,
+                               &xtrueref, &ldx, fp0, suffix, &(grid.grid2d));
+        // for (int i = 0; i < 5; i++)
+        //     printf("%g %g\n", bref[i], b[i] );
+        Astore0 = (NRformat_loc *) Aref.Store;
+        // checkNRFMT(Astore, Astore0);
+        matCheck(Astore->m_loc, nrhs, B2d, Astore->m_loc, bref, ldb);
+    }
+    // MPI_Finalize(); exit(0);
+#endif
 #endif
     if (!(berr = doubleMalloc_dist (nrhs)))
         ABORT ("Malloc fails for berr[].");

diff --git a/SRC/nrformat_loc.c b/SRC/nrformat_loc.c
index 5c6b655a..ee6936a8 100644
--- a/SRC/nrformat_loc.c
+++ b/SRC/nrformat_loc.c
@@ -1,5 +1,16 @@
 #include "superlu_ddefs.h"
 
+static void matCopy(int n, int m, double* Dst, int lddst
+                    , double* Src, int ldsrc)
+{
+    for(int j=0; j<m; j++)
+        for (int i = 0; i < n; ++i)
+        {
+            Dst[i + lddst*j] = Src[i + ldsrc*j];
+        }
+    return;
+}
+
@@ -20,10 +31,13 @@ at the top-level directory.
-NRformat_loc dGatherNRformat_loc(NRformat_loc *A, gridinfo3d_t *grid3d)
+NRformat_loc dGatherNRformat_loc(NRformat_loc *A,
+                                 double* B, int ldb, int nrhs, double** B2d,
+                                 gridinfo3d_t *grid3d)
 {
     NRformat_loc A2d;
     int_t *nnz_counts, *row_counts;
     int *nnz_counts_int, *row_counts_int;
+    int *b_counts_int;
     int_t *nnz_disp, *row_disp;
+    int *b_disp;
 
     nnz_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t));
     row_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t));
     nnz_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
     row_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
+    b_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
     MPI_Gather(&A->nnz_loc, 1, mpi_int_t, nnz_counts,
                1, mpi_int_t, 0, grid3d->zscp.comm);
     MPI_Gather(&A->m_loc, 1, mpi_int_t, row_counts,
                1, mpi_int_t, 0, grid3d->zscp.comm);
-    nnz_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int_t));
-    row_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int_t));
+    nnz_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
+    row_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
+    b_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
 
     nnz_disp[0] = 0;
     row_disp[0] = 0;
+    b_disp[0] =0;
     for (int i = 0; i < grid3d->npdep; i++)
     {
         nnz_disp[i + 1] = nnz_disp[i] + nnz_counts[i];
         row_disp[i + 1] = row_disp[i] + row_counts[i];
+        b_disp[i+1] = nrhs*row_disp[i + 1];
         nnz_counts_int[i] = nnz_counts[i];
         row_counts_int[i] = row_counts[i];
+        b_counts_int[i] = nrhs*row_counts[i];
     }
 
     if (grid3d->zscp.Iam == 0)
@@ -71,11 +89,39 @@ NRformat_loc dGatherNRformat_loc(NRformat_loc *A, gridinfo3d_t *grid3d)
     {
         for (int i = 0; i < grid3d->npdep; i++)
         {
-            for(int j = row_disp[i]+1; j < row_disp[i+1]+1; j++)
+            for (int j = row_disp[i] + 1; j < row_disp[i + 1] + 1; j++)
             {
                 // A2d.rowptr[j] += row_disp[i];
                 A2d.rowptr[j] +=nnz_disp[i];
             }
-    }
+        }
         A2d.nnz_loc = nnz_disp[grid3d->npdep];
         A2d.m_loc = row_disp[grid3d->npdep];
+        A2d.fst_row = A->fst_row;
     }
-    A2d.fst_row = A->fst_row;
 
+    // compacting B
+    double *Btmp =SUPERLU_MALLOC(A->m_loc*nrhs * sizeof(int_t));
+    matCopy(A->m_loc, nrhs, Btmp, A->m_loc,B, ldb);
+
+    double *B1;
+    if (grid3d->zscp.Iam == 0)
+    {
+        B1 = SUPERLU_MALLOC( A2d.m_loc*nrhs * sizeof(double));
+        *B2d = SUPERLU_MALLOC( A2d.m_loc*nrhs * sizeof(double));
+    }
+
+    MPI_Gatherv(Btmp, nrhs*A->m_loc, MPI_DOUBLE, B1,
+                b_counts_int, b_disp,
+                MPI_DOUBLE, 0, grid3d->zscp.comm);
+    if (grid3d->zscp.Iam == 0)
+    {
+        for (int i = 0; i < grid3d->npdep; ++i)
+        {
+            /* code */
+            matCopy(row_counts_int[i], nrhs, *B2d + row_disp[i], A2d.m_loc,
+                    B1+nrhs*row_disp[i], row_counts_int[i]);
+        }
+
+        SUPERLU_FREE(B1);
+    }
+
+
     return A2d;
 }
\ No newline at end of file

diff --git a/SRC/pdgsequ.c b/SRC/pdgsequ.c
index 8702fbf5..685f294a 100644
--- a/SRC/pdgsequ.c
+++ b/SRC/pdgsequ.c
@@ -13,7 +13,7 @@ at the top-level directory.
 /*! @file
 * \brief Computes row and column scalings
 *
- * File name: pdgsequ.c
+ * File name: pdgsequ.c
 * History: Modified from LAPACK routine DGEEQU
 */
 #include <math.h>
@@ -84,7 +84,7 @@ at the top-level directory.
 
 void
 pdgsequ(SuperMatrix *A, double *r, double *c, double *rowcnd,
-        double *colcnd, double *amax, int_t *info, gridinfo_t *grid)
+        double *colcnd, double *amax, int_t *info, gridinfo_t *grid)
 {
 
     /* Local variables */
@@ -102,20 +102,22 @@ pdgsequ(SuperMatrix *A, double *r, double *c, double *rowcnd,
 
     /* Test the input parameters.
*/ *info = 0; if ( A->nrow < 0 || A->ncol < 0 || - A->Stype != SLU_NR_loc || A->Dtype != SLU_D || A->Mtype != SLU_GE ) - *info = -1; - if (*info != 0) { - i = -(*info); - pxerr_dist("pdgsequ", grid, i); - return; + A->Stype != SLU_NR_loc || A->Dtype != SLU_D || A->Mtype != SLU_GE ) + *info = -1; + if (*info != 0) + { + i = -(*info); + pxerr_dist("pdgsequ", grid, i); + return; } /* Quick return if possible */ - if ( A->nrow == 0 || A->ncol == 0 ) { - *rowcnd = 1.; - *colcnd = 1.; - *amax = 0.; - return; + if ( A->nrow == 0 || A->ncol == 0 ) + { + *rowcnd = 1.; + *colcnd = 1.; + *amax = 0.; + return; } Astore = A->Store; @@ -131,43 +133,49 @@ pdgsequ(SuperMatrix *A, double *r, double *c, double *rowcnd, /* Find the maximum element in each row. */ irow = Astore->fst_row; - for (i = 0; i < m_loc; ++i) { - for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) - r[irow] = SUPERLU_MAX( r[irow], fabs(Aval[j]) ); - ++irow; + for (i = 0; i < m_loc; ++i) + { + for (j = Astore->rowptr[i]; j < Astore->rowptr[i + 1]; ++j) + r[irow] = SUPERLU_MAX( r[irow], fabs(Aval[j]) ); + ++irow; } /* Find the maximum and minimum scale factors. */ rcmin = bignum; rcmax = 0.; - for (i = Astore->fst_row; i < Astore->fst_row + m_loc; ++i) { - rcmax = SUPERLU_MAX(rcmax, r[i]); - rcmin = SUPERLU_MIN(rcmin, r[i]); + for (i = Astore->fst_row; i < Astore->fst_row + m_loc; ++i) + { + rcmax = SUPERLU_MAX(rcmax, r[i]); + rcmin = SUPERLU_MIN(rcmin, r[i]); } /* Get the global MAX and MIN for R */ tempmax = rcmax; tempmin = rcmin; MPI_Allreduce( &tempmax, &rcmax, - 1, MPI_DOUBLE, MPI_MAX, grid->comm); + 1, MPI_DOUBLE, MPI_MAX, grid->comm); MPI_Allreduce( &tempmin, &rcmin, - 1, MPI_DOUBLE, MPI_MIN, grid->comm); + 1, MPI_DOUBLE, MPI_MIN, grid->comm); *amax = rcmax; - if (rcmin == 0.) { - /* Find the first zero scale factor and return an error code. */ - for (i = 0; i < A->nrow; ++i) - if (r[i] == 0.) { - *info = i + 1; - return; - } - } else { - /* Invert the scale factors. */ - for (i = 0; i < A->nrow; ++i) - r[i] = 1. / SUPERLU_MIN( SUPERLU_MAX( r[i], smlnum ), bignum ); - /* Compute ROWCND = min(R(I)) / max(R(I)) */ - *rowcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); + if (rcmin == 0.) + { + /* Find the first zero scale factor and return an error code. */ + for (i = 0; i < A->nrow; ++i) + if (r[i] == 0.) + { + *info = i + 1; + return; + } + } + else + { + /* Invert the scale factors. */ + for (i = 0; i < A->nrow; ++i) + r[i] = 1. / SUPERLU_MIN( SUPERLU_MAX( r[i], smlnum ), bignum ); + /* Compute ROWCND = min(R(I)) / max(R(I)) */ + *rowcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); } /* Compute column scale factors */ @@ -176,17 +184,19 @@ pdgsequ(SuperMatrix *A, double *r, double *c, double *rowcnd, /* Find the maximum element in each column, assuming the row scalings computed above. 
*/ irow = Astore->fst_row; - for (i = 0; i < m_loc; ++i) { - for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { - jcol = Astore->colind[j]; - c[jcol] = SUPERLU_MAX( c[jcol], fabs(Aval[j]) * r[irow] ); - } - ++irow; + for (i = 0; i < m_loc; ++i) + { + for (j = Astore->rowptr[i]; j < Astore->rowptr[i + 1]; ++j) + { + jcol = Astore->colind[j]; + c[jcol] = SUPERLU_MAX( c[jcol], fabs(Aval[j]) * r[irow] ); + } + ++irow; } /* Find the global maximum for c[j] */ if ( !(loc_max = doubleMalloc_dist(A->ncol))) - ABORT("Malloc fails for loc_max[]."); + ABORT("Malloc fails for loc_max[]."); for (j = 0; j < A->ncol; ++j) loc_max[j] = c[j]; MPI_Allreduce(loc_max, c, A->ncol, MPI_DOUBLE, MPI_MAX, grid->comm); SUPERLU_FREE(loc_max); @@ -194,34 +204,39 @@ pdgsequ(SuperMatrix *A, double *r, double *c, double *rowcnd, /* Find the maximum and minimum scale factors. */ rcmin = bignum; rcmax = 0.; - for (j = 0; j < A->ncol; ++j) { - rcmax = SUPERLU_MAX(rcmax, c[j]); - rcmin = SUPERLU_MIN(rcmin, c[j]); + for (j = 0; j < A->ncol; ++j) + { + rcmax = SUPERLU_MAX(rcmax, c[j]); + rcmin = SUPERLU_MIN(rcmin, c[j]); } - if (rcmin == 0.) { - /* Find the first zero scale factor and return an error code. */ - for (j = 0; j < A->ncol; ++j) - if ( c[j] == 0. ) { - *info = A->nrow + j + 1; - return; - } - } else { - /* Invert the scale factors. */ - for (j = 0; j < A->ncol; ++j) - c[j] = 1. / SUPERLU_MIN( SUPERLU_MAX( c[j], smlnum ), bignum); - /* Compute COLCND = min(C(J)) / max(C(J)) */ - *colcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); + if (rcmin == 0.) + { + /* Find the first zero scale factor and return an error code. */ + for (j = 0; j < A->ncol; ++j) + if ( c[j] == 0. ) + { + *info = A->nrow + j + 1; + return; + } + } + else + { + /* Invert the scale factors. */ + for (j = 0; j < A->ncol; ++j) + c[j] = 1. / SUPERLU_MIN( SUPERLU_MAX( c[j], smlnum ), bignum); + /* Compute COLCND = min(C(J)) / max(C(J)) */ + *colcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); } /* gather R from each process to get the global R. */ procs = grid->nprow * grid->npcol; if ( !(r_sizes = SUPERLU_MALLOC(2 * procs * sizeof(int)))) - ABORT("Malloc fails for r_sizes[]."); + ABORT("Malloc fails for r_sizes[]."); displs = r_sizes + procs; if ( !(loc_r = doubleMalloc_dist(m_loc))) - ABORT("Malloc fails for loc_r[]."); + ABORT("Malloc fails for loc_r[]."); j = Astore->fst_row; for (i = 0; i < m_loc; ++i) loc_r[i] = r[j++]; @@ -230,11 +245,11 @@ pdgsequ(SuperMatrix *A, double *r, double *c, double *rowcnd, /* Set up the displacements for allgatherv */ displs[0] = 0; - for (i = 1; i < procs; ++i) displs[i] = displs[i-1] + r_sizes[i-1]; + for (i = 1; i < procs; ++i) displs[i] = displs[i - 1] + r_sizes[i - 1]; /* Now gather the actual data */ MPI_Allgatherv(loc_r, m_loc, MPI_DOUBLE, r, r_sizes, displs, - MPI_DOUBLE, grid->comm); + MPI_DOUBLE, grid->comm); SUPERLU_FREE(r_sizes); SUPERLU_FREE(loc_r); diff --git a/SRC/pdgssvx3d.c b/SRC/pdgssvx3d.c index cb37d50e..e7fc8bfd 100644 --- a/SRC/pdgssvx3d.c +++ b/SRC/pdgssvx3d.c @@ -23,34 +23,34 @@ at the top-level directory. 
static void checkNRFMT(NRformat_loc*A, NRformat_loc*B) { - /* - int_t nnz_loc; - int_t m_loc; - int_t fst_row; - void *nzval; - int_t *rowptr; - int_t *colind; - */ - - assert(A->nnz_loc == B->nnz_loc); - assert(A->m_loc == B->m_loc); - assert(A->fst_row == B->fst_row); - - for (int_t i = 0; i < A->nnz_loc; i++) - { - assert(((double *)A->nzval)[i] == ((double *)B->nzval)[i]); - // printf("%lf \n", ((double *)A->nzval)[i]); - assert((A->colind)[i] == (B->colind)[i]); - } - - for (int_t i = 0; i < A->m_loc + 1; i++) - { - // assert(((double *)A->nzval)[i] ==((double *)B->nzval)[i]); - assert((A->rowptr)[i] == (B->rowptr)[i]); - } - - - printf("Matrix check passed\n"); + /* + int_t nnz_loc; + int_t m_loc; + int_t fst_row; + void *nzval; + int_t *rowptr; + int_t *colind; + */ + + assert(A->nnz_loc == B->nnz_loc); + assert(A->m_loc == B->m_loc); + assert(A->fst_row == B->fst_row); + + for (int_t i = 0; i < A->nnz_loc; i++) + { + assert(((double *)A->nzval)[i] == ((double *)B->nzval)[i]); + // printf("%lf \n", ((double *)A->nzval)[i]); + assert((A->colind)[i] == (B->colind)[i]); + } + + for (int_t i = 0; i < A->m_loc + 1; i++) + { + // assert(((double *)A->nzval)[i] ==((double *)B->nzval)[i]); + assert((A->rowptr)[i] == (B->rowptr)[i]); + } + + + printf("Matrix check passed\n"); } #if 0 @@ -618,12 +618,17 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, /* Perform preprocessing steps on process layer zero, including: ordering, symbolic factorization, distribution of L & U */ #define NRFRMT -// #ifdef NRFRMT + + double* B2d; + int ldb2d; + int ldb3d = ldb; + double* B3d = B; NRformat_loc Atmp = dGatherNRformat_loc( (NRformat_loc *) A->Store, + B, ldb, nrhs, &B2d, grid3d); -// #endif - NRformat_loc* Astore0 =&Atmp; + + NRformat_loc* Astore0 = &Atmp; if (grid3d->zscp.Iam == 0) { @@ -631,9 +636,11 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, m = A->nrow; n = A->ncol; // checkNRFMT(Astore0, (NRformat_loc *) A->Store); -#ifdef NRFMT +#ifdef NRFRMT A->Store = Astore0; -#endif + ldb = Astore0->m_loc; + B = B2d; +#endif Astore = (NRformat_loc *) A->Store; // #ifdef NRFRMT @@ -1463,7 +1470,9 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, { if (rowequ) { + b_col = B; + for (j = 0; j < nrhs; ++j) { irow = fst_row; @@ -1478,7 +1487,9 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, } else if (colequ) { + b_col = B; + for (j = 0; j < nrhs; ++j) { irow = fst_row; @@ -1706,6 +1717,7 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, #endif } /* process layer 0 done solve */ + // b2d->B #if ( DEBUGlevel>=1 ) CHECK_MALLOC (iam, "Exit pdgssvx3d()"); diff --git a/SRC/pdutil.c b/SRC/pdutil.c index a3bf6766..5c665d8b 100644 --- a/SRC/pdutil.c +++ b/SRC/pdutil.c @@ -27,10 +27,10 @@ at the top-level directory. */ int pdCompRow_loc_to_CompCol_global ( - int_t need_value, /* Input. Whether need to gather numerical values */ - SuperMatrix *A, /* Input. Distributed matrix in NRformat_loc format. */ - gridinfo_t *grid, /* Input */ - SuperMatrix *GA /* Output */ + int_t need_value, /* Input. Whether need to gather numerical values */ + SuperMatrix *A, /* Input. Distributed matrix in NRformat_loc format. */ + gridinfo_t *grid, /* Input */ + SuperMatrix *GA /* Output */ ) { NRformat_loc *Astore; @@ -44,11 +44,11 @@ int pdCompRow_loc_to_CompCol_global double *a_buf; /* Buffer to merge blocks into block columns. */ int_t *itemp; int_t *colptr_send; /* Buffer to redistribute the column pointers of the - local block rows. 
- Use n_loc+1 pointers for each block. */ + local block rows. + Use n_loc+1 pointers for each block. */ int_t *colptr_blk; /* The column pointers for each block, after - redistribution to the local block columns. - Use n_loc+1 pointers for each block. */ + redistribution to the local block columns. + Use n_loc+1 pointers for each block. */ int_t *rowind_recv; /* Buffer to receive the blocks of row indices. */ int_t *rowind_buf; /* Buffer to merge blocks into block columns. */ int_t *fst_rows, *n_locs; @@ -81,19 +81,19 @@ int pdCompRow_loc_to_CompCol_global #if ( DEBUGlevel>=2 ) printf("Proc %d\n", grid->iam); PrintInt10("rowind_loc", nnz_loc, rowind_loc); - PrintInt10("colptr_loc", n+1, colptr_loc); + PrintInt10("colptr_loc", n + 1, colptr_loc); #endif procs = grid->nprow * grid->npcol; - if ( !(fst_rows = (int_t *) intMalloc_dist(2*procs)) ) - ABORT("Malloc fails for fst_rows[]"); + if ( !(fst_rows = (int_t *) intMalloc_dist(2 * procs)) ) + ABORT("Malloc fails for fst_rows[]"); n_locs = fst_rows + procs; MPI_Allgather(&fst_row, 1, mpi_int_t, fst_rows, 1, mpi_int_t, - grid->comm); - for (i = 0; i < procs-1; ++i) n_locs[i] = fst_rows[i+1] - fst_rows[i]; - n_locs[procs-1] = n - fst_rows[procs-1]; - if ( !(recvcnts = SUPERLU_MALLOC(5*procs * sizeof(int))) ) - ABORT("Malloc fails for recvcnts[]"); + grid->comm); + for (i = 0; i < procs - 1; ++i) n_locs[i] = fst_rows[i + 1] - fst_rows[i]; + n_locs[procs - 1] = n - fst_rows[procs - 1]; + if ( !(recvcnts = SUPERLU_MALLOC(5 * procs * sizeof(int))) ) + ABORT("Malloc fails for recvcnts[]"); sendcnts = recvcnts + procs; rdispls = sendcnts + procs; sdispls = rdispls + procs; @@ -103,65 +103,71 @@ int pdCompRow_loc_to_CompCol_global Now the matrix view is P-by-P block-partition. */ /* n column starts for each column, and procs column ends for each block */ if ( !(colptr_send = intMalloc_dist(n + procs)) ) - ABORT("Malloc fails for colptr_send[]"); - if ( !(colptr_blk = intMalloc_dist( (((size_t) n_loc)+1)*procs)) ) - ABORT("Malloc fails for colptr_blk[]"); - for (i = 0, j = 0; i < procs; ++i) { - for (k = j; k < j + n_locs[i]; ++k) colptr_send[i+k] = colptr_loc[k]; - colptr_send[i+k] = colptr_loc[k]; /* Add an END marker */ - sendcnts[i] = n_locs[i] + 1; + ABORT("Malloc fails for colptr_send[]"); + if ( !(colptr_blk = intMalloc_dist( (((size_t) n_loc) + 1) * procs)) ) + ABORT("Malloc fails for colptr_blk[]"); + for (i = 0, j = 0; i < procs; ++i) + { + for (k = j; k < j + n_locs[i]; ++k) colptr_send[i + k] = colptr_loc[k]; + colptr_send[i + k] = colptr_loc[k]; /* Add an END marker */ + sendcnts[i] = n_locs[i] + 1; #if ( DEBUGlevel>=1 ) - assert(j == fst_rows[i]); + assert(j == fst_rows[i]); #endif - sdispls[i] = j + i; - recvcnts[i] = n_loc + 1; - rdispls[i] = i * (n_loc + 1); - j += n_locs[i]; /* First column of next block in colptr_loc[] */ + sdispls[i] = j + i; + recvcnts[i] = n_loc + 1; + rdispls[i] = i * (n_loc + 1); + j += n_locs[i]; /* First column of next block in colptr_loc[] */ } MPI_Alltoallv(colptr_send, sendcnts, sdispls, mpi_int_t, - colptr_blk, recvcnts, rdispls, mpi_int_t, grid->comm); + colptr_blk, recvcnts, rdispls, mpi_int_t, grid->comm); /* Adjust colptr_blk[] so that they contain the local indices of the column pointers in the receive buffer. 
*/ nnz = 0; /* The running sum of the nonzeros counted by far */ k = 0; - for (i = 0; i < procs; ++i) { - for (j = rdispls[i]; j < rdispls[i] + n_loc; ++j) { - colnnz = colptr_blk[j+1] - colptr_blk[j]; - /*assert(k<=j);*/ - colptr_blk[k] = nnz; - nnz += colnnz; /* Start of the next column */ - ++k; - } - colptr_blk[k++] = nnz; /* Add an END marker for each block */ + for (i = 0; i < procs; ++i) + { + for (j = rdispls[i]; j < rdispls[i] + n_loc; ++j) + { + colnnz = colptr_blk[j + 1] - colptr_blk[j]; + /*assert(k<=j);*/ + colptr_blk[k] = nnz; + nnz += colnnz; /* Start of the next column */ + ++k; + } + colptr_blk[k++] = nnz; /* Add an END marker for each block */ } /*assert(k == (n_loc+1)*procs);*/ /* Now prepare to transfer row indices and values. */ sdispls[0] = 0; - for (i = 0; i < procs-1; ++i) { - sendcnts[i] = colptr_loc[fst_rows[i+1]] - colptr_loc[fst_rows[i]]; - sdispls[i+1] = sdispls[i] + sendcnts[i]; + for (i = 0; i < procs - 1; ++i) + { + sendcnts[i] = colptr_loc[fst_rows[i + 1]] - colptr_loc[fst_rows[i]]; + sdispls[i + 1] = sdispls[i] + sendcnts[i]; } - sendcnts[procs-1] = colptr_loc[n] - colptr_loc[fst_rows[procs-1]]; - for (i = 0; i < procs; ++i) { + sendcnts[procs - 1] = colptr_loc[n] - colptr_loc[fst_rows[procs - 1]]; + for (i = 0; i < procs; ++i) + { j = rdispls[i]; /* Point to this block in colptr_blk[]. */ - recvcnts[i] = colptr_blk[j+n_loc] - colptr_blk[j]; + recvcnts[i] = colptr_blk[j + n_loc] - colptr_blk[j]; } rdispls[0] = 0; /* Recompute rdispls[] for row indices. */ - for (i = 0; i < procs-1; ++i) rdispls[i+1] = rdispls[i] + recvcnts[i]; + for (i = 0; i < procs - 1; ++i) rdispls[i + 1] = rdispls[i] + recvcnts[i]; - k = rdispls[procs-1] + recvcnts[procs-1]; /* Total received */ - if ( !(rowind_recv = (int_t *) intMalloc_dist(2*k)) ) + k = rdispls[procs - 1] + recvcnts[procs - 1]; /* Total received */ + if ( !(rowind_recv = (int_t *) intMalloc_dist(2 * k)) ) ABORT("Malloc fails for rowind_recv[]"); rowind_buf = rowind_recv + k; MPI_Alltoallv(rowind_loc, sendcnts, sdispls, mpi_int_t, - rowind_recv, recvcnts, rdispls, mpi_int_t, grid->comm); - if ( need_value ) { - if ( !(a_recv = (double *) doubleMalloc_dist(2*k)) ) - ABORT("Malloc fails for rowind_recv[]"); - a_buf = a_recv + k; - MPI_Alltoallv(a_loc, sendcnts, sdispls, MPI_DOUBLE, + rowind_recv, recvcnts, rdispls, mpi_int_t, grid->comm); + if ( need_value ) + { + if ( !(a_recv = (double *) doubleMalloc_dist(2 * k)) ) + ABORT("Malloc fails for rowind_recv[]"); + a_buf = a_recv + k; + MPI_Alltoallv(a_loc, sendcnts, sdispls, MPI_DOUBLE, a_recv, recvcnts, rdispls, MPI_DOUBLE, grid->comm); } @@ -169,39 +175,48 @@ int pdCompRow_loc_to_CompCol_global /* Reset colptr_loc[] to point to the n_loc global columns. */ colptr_loc[0] = 0; itemp = colptr_send; - for (j = 0; j < n_loc; ++j) { + for (j = 0; j < n_loc; ++j) + { colnnz = 0; - for (i = 0; i < procs; ++i) { - k = i * (n_loc + 1) + j; /* j-th column in i-th block */ - colnnz += colptr_blk[k+1] - colptr_blk[k]; - } - colptr_loc[j+1] = colptr_loc[j] + colnnz; - itemp[j] = colptr_loc[j]; /* Save a copy of the column starts */ + for (i = 0; i < procs; ++i) + { + k = i * (n_loc + 1) + j; /* j-th column in i-th block */ + colnnz += colptr_blk[k + 1] - colptr_blk[k]; + } + colptr_loc[j + 1] = colptr_loc[j] + colnnz; + itemp[j] = colptr_loc[j]; /* Save a copy of the column starts */ } itemp[n_loc] = colptr_loc[n_loc]; /* Merge blocks of row indices into columns of row indices. 
*/ - for (i = 0; i < procs; ++i) { + for (i = 0; i < procs; ++i) + { k = i * (n_loc + 1); - for (j = 0; j < n_loc; ++j) { /* i-th block */ - for (l = colptr_blk[k+j]; l < colptr_blk[k+j+1]; ++l) { - rowind_buf[itemp[j]] = rowind_recv[l]; - ++itemp[j]; - } - } + for (j = 0; j < n_loc; ++j) /* i-th block */ + { + for (l = colptr_blk[k + j]; l < colptr_blk[k + j + 1]; ++l) + { + rowind_buf[itemp[j]] = rowind_recv[l]; + ++itemp[j]; + } + } } - if ( need_value ) { - for (j = 0; j < n_loc+1; ++j) itemp[j] = colptr_loc[j]; - for (i = 0; i < procs; ++i) { - k = i * (n_loc + 1); - for (j = 0; j < n_loc; ++j) { /* i-th block */ - for (l = colptr_blk[k+j]; l < colptr_blk[k+j+1]; ++l) { - a_buf[itemp[j]] = a_recv[l]; - ++itemp[j]; - } - } - } + if ( need_value ) + { + for (j = 0; j < n_loc + 1; ++j) itemp[j] = colptr_loc[j]; + for (i = 0; i < procs; ++i) + { + k = i * (n_loc + 1); + for (j = 0; j < n_loc; ++j) /* i-th block */ + { + for (l = colptr_blk[k + j]; l < colptr_blk[k + j + 1]; ++l) + { + a_buf[itemp[j]] = a_recv[l]; + ++itemp[j]; + } + } + } } /* ------------------------------------------------------------ @@ -223,48 +238,54 @@ int pdCompRow_loc_to_CompCol_global if ( !(GAstore->rowind = (int_t *) intMalloc_dist (nnz)) ) ABORT ("SUPERLU_MALLOC fails for GAstore->rowind[]"); - if ( !(GAstore->colptr = (int_t *) intMalloc_dist (n+1)) ) + if ( !(GAstore->colptr = (int_t *) intMalloc_dist (n + 1)) ) ABORT ("SUPERLU_MALLOC fails for GAstore->colptr[]"); /* Allgatherv for row indices. */ rdispls[0] = 0; - for (i = 0; i < procs-1; ++i) { - rdispls[i+1] = rdispls[i] + itemp[i]; + for (i = 0; i < procs - 1; ++i) + { + rdispls[i + 1] = rdispls[i] + itemp[i]; itemp_32[i] = itemp[i]; } - itemp_32[procs-1] = itemp[procs-1]; + itemp_32[procs - 1] = itemp[procs - 1]; it = nnz_loc; MPI_Allgatherv(rowind_buf, it, mpi_int_t, GAstore->rowind, - itemp_32, rdispls, mpi_int_t, grid->comm); - if ( need_value ) { - if ( !(GAstore->nzval = (double *) doubleMalloc_dist (nnz)) ) - ABORT ("SUPERLU_MALLOC fails for GAstore->rnzval[]"); - MPI_Allgatherv(a_buf, it, MPI_DOUBLE, GAstore->nzval, - itemp_32, rdispls, MPI_DOUBLE, grid->comm); - } else GAstore->nzval = NULL; + itemp_32, rdispls, mpi_int_t, grid->comm); + if ( need_value ) + { + if ( !(GAstore->nzval = (double *) doubleMalloc_dist (nnz)) ) + ABORT ("SUPERLU_MALLOC fails for GAstore->rnzval[]"); + MPI_Allgatherv(a_buf, it, MPI_DOUBLE, GAstore->nzval, + itemp_32, rdispls, MPI_DOUBLE, grid->comm); + } + else GAstore->nzval = NULL; /* Now gather the column pointers. */ rdispls[0] = 0; - for (i = 0; i < procs-1; ++i) { - rdispls[i+1] = rdispls[i] + n_locs[i]; + for (i = 0; i < procs - 1; ++i) + { + rdispls[i + 1] = rdispls[i] + n_locs[i]; itemp_32[i] = n_locs[i]; } - itemp_32[procs-1] = n_locs[procs-1]; + itemp_32[procs - 1] = n_locs[procs - 1]; MPI_Allgatherv(colptr_loc, n_loc, mpi_int_t, GAstore->colptr, - itemp_32, rdispls, mpi_int_t, grid->comm); + itemp_32, rdispls, mpi_int_t, grid->comm); /* Recompute column pointers. 
*/ - for (i = 1; i < procs; ++i) { + for (i = 1; i < procs; ++i) + { k = rdispls[i]; - for (j = 0; j < n_locs[i]; ++j) GAstore->colptr[k++] += itemp[i-1]; - itemp[i] += itemp[i-1]; /* prefix sum */ + for (j = 0; j < n_locs[i]; ++j) GAstore->colptr[k++] += itemp[i - 1]; + itemp[i] += itemp[i - 1]; /* prefix sum */ } GAstore->colptr[n] = nnz; #if ( DEBUGlevel>=2 ) - if ( !grid->iam ) { + if ( !grid->iam ) + { printf("After pdCompRow_loc_to_CompCol_global()\n"); - dPrint_CompCol_Matrix_dist(GA); + dPrint_CompCol_Matrix_dist(GA); } #endif @@ -289,14 +310,14 @@ int pdCompRow_loc_to_CompCol_global */ int pdPermute_Dense_Matrix ( - int_t fst_row, - int_t m_loc, - int_t row_to_proc[], - int_t perm[], - double X[], int ldx, - double B[], int ldb, - int nrhs, - gridinfo_t *grid + int_t fst_row, + int_t m_loc, + int_t row_to_proc[], + int_t perm[], + double X[], int ldx, + double B[], int ldb, + int nrhs, + gridinfo_t *grid ) { int_t i, j, k, l; @@ -312,7 +333,7 @@ int pdPermute_Dense_Matrix #endif procs = grid->nprow * grid->npcol; - if ( !(sendcnts = SUPERLU_MALLOC(10*procs * sizeof(int))) ) + if ( !(sendcnts = SUPERLU_MALLOC(10 * procs * sizeof(int))) ) ABORT("Malloc fails for sendcnts[]."); sendcnts_nrhs = sendcnts + procs; recvcnts = sendcnts_nrhs + procs; @@ -327,65 +348,72 @@ int pdPermute_Dense_Matrix for (i = 0; i < procs; ++i) sendcnts[i] = 0; /* Count the number of X entries to be sent to each process.*/ - for (i = fst_row; i < fst_row + m_loc; ++i) { + for (i = fst_row; i < fst_row + m_loc; ++i) + { p = row_to_proc[perm[i]]; - ++sendcnts[p]; + ++sendcnts[p]; } MPI_Alltoall(sendcnts, 1, MPI_INT, recvcnts, 1, MPI_INT, grid->comm); sdispls[0] = rdispls[0] = 0; sdispls_nrhs[0] = rdispls_nrhs[0] = 0; sendcnts_nrhs[0] = sendcnts[0] * nrhs; recvcnts_nrhs[0] = recvcnts[0] * nrhs; - for (i = 1; i < procs; ++i) { - sdispls[i] = sdispls[i-1] + sendcnts[i-1]; - sdispls_nrhs[i] = sdispls[i] * nrhs; - rdispls[i] = rdispls[i-1] + recvcnts[i-1]; - rdispls_nrhs[i] = rdispls[i] * nrhs; - sendcnts_nrhs[i] = sendcnts[i] * nrhs; - recvcnts_nrhs[i] = recvcnts[i] * nrhs; + for (i = 1; i < procs; ++i) + { + sdispls[i] = sdispls[i - 1] + sendcnts[i - 1]; + sdispls_nrhs[i] = sdispls[i] * nrhs; + rdispls[i] = rdispls[i - 1] + recvcnts[i - 1]; + rdispls_nrhs[i] = rdispls[i] * nrhs; + sendcnts_nrhs[i] = sendcnts[i] * nrhs; + recvcnts_nrhs[i] = recvcnts[i] * nrhs; } - k = sdispls[procs-1] + sendcnts[procs-1];/* Total number of sends */ - l = rdispls[procs-1] + recvcnts[procs-1];/* Total number of recvs */ + k = sdispls[procs - 1] + sendcnts[procs - 1]; /* Total number of sends */ + l = rdispls[procs - 1] + recvcnts[procs - 1]; /* Total number of recvs */ /*assert(k == m_loc);*/ /*assert(l == m_loc);*/ if ( !(send_ibuf = intMalloc_dist(k + l)) ) ABORT("Malloc fails for send_ibuf[]."); recv_ibuf = send_ibuf + k; - if ( !(send_dbuf = doubleMalloc_dist((k + l)*nrhs)) ) + if ( !(send_dbuf = doubleMalloc_dist((k + l) * nrhs)) ) ABORT("Malloc fails for send_dbuf[]."); recv_dbuf = send_dbuf + k * nrhs; - for (i = 0; i < procs; ++i) { + for (i = 0; i < procs; ++i) + { ptr_to_ibuf[i] = sdispls[i]; - ptr_to_dbuf[i] = sdispls_nrhs[i]; + ptr_to_dbuf[i] = sdispls_nrhs[i]; } /* Fill in the send buffers: send_ibuf[] and send_dbuf[]. 
*/ - for (i = fst_row; i < fst_row + m_loc; ++i) { + for (i = fst_row; i < fst_row + m_loc; ++i) + { j = perm[i]; - p = row_to_proc[j]; - send_ibuf[ptr_to_ibuf[p]] = j; - j = ptr_to_dbuf[p]; - RHS_ITERATE(k) { /* RHS stored in row major in the buffer */ - send_dbuf[j++] = X[i-fst_row + k*ldx]; - } - ++ptr_to_ibuf[p]; - ptr_to_dbuf[p] += nrhs; + p = row_to_proc[j]; + send_ibuf[ptr_to_ibuf[p]] = j; + j = ptr_to_dbuf[p]; + RHS_ITERATE(k) /* RHS stored in row major in the buffer */ + { + send_dbuf[j++] = X[i - fst_row + k * ldx]; + } + ++ptr_to_ibuf[p]; + ptr_to_dbuf[p] += nrhs; } /* Transfer the (permuted) row indices and numerical values. */ MPI_Alltoallv(send_ibuf, sendcnts, sdispls, mpi_int_t, - recv_ibuf, recvcnts, rdispls, mpi_int_t, grid->comm); + recv_ibuf, recvcnts, rdispls, mpi_int_t, grid->comm); MPI_Alltoallv(send_dbuf, sendcnts_nrhs, sdispls_nrhs, MPI_DOUBLE, - recv_dbuf, recvcnts_nrhs, rdispls_nrhs, MPI_DOUBLE, - grid->comm); + recv_dbuf, recvcnts_nrhs, rdispls_nrhs, MPI_DOUBLE, + grid->comm); /* Copy the buffer into b. */ - for (i = 0, l = 0; i < m_loc; ++i) { + for (i = 0, l = 0; i < m_loc; ++i) + { j = recv_ibuf[i] - fst_row; /* Relative row number */ - RHS_ITERATE(k) { /* RHS stored in row major in the buffer */ - B[j + k*ldb] = recv_dbuf[l++]; - } + RHS_ITERATE(k) /* RHS stored in row major in the buffer */ + { + B[j + k * ldb] = recv_dbuf[l++]; + } } SUPERLU_FREE(sendcnts); @@ -401,9 +429,9 @@ int pdPermute_Dense_Matrix /*! \brief Initialize the data structure for the solution phase. */ int dSolveInit(superlu_dist_options_t *options, SuperMatrix *A, - int_t perm_r[], int_t perm_c[], int_t nrhs, - LUstruct_t *LUstruct, gridinfo_t *grid, - SOLVEstruct_t *SOLVEstruct) + int_t perm_r[], int_t perm_c[], int_t nrhs, + LUstruct_t *LUstruct, gridinfo_t *grid, + SOLVEstruct_t *SOLVEstruct) { int_t *row_to_proc, *inv_perm_c, *itemp; NRformat_loc *Astore; @@ -416,7 +444,7 @@ int dSolveInit(superlu_dist_options_t *options, SuperMatrix *A, procs = grid->nprow * grid->npcol; if ( !(row_to_proc = intMalloc_dist(A->nrow)) ) - ABORT("Malloc fails for row_to_proc[]"); + ABORT("Malloc fails for row_to_proc[]"); SOLVEstruct->row_to_proc = row_to_proc; if ( !(inv_perm_c = intMalloc_dist(A->ncol)) ) ABORT("Malloc fails for inv_perm_c[]."); @@ -430,19 +458,21 @@ int dSolveInit(superlu_dist_options_t *options, SuperMatrix *A, NOTE: For those processes that do not own any row, it must must be set so that fst_row == A->nrow. ------------------------------------------------------------*/ - if ( !(itemp = intMalloc_dist(procs+1)) ) + if ( !(itemp = intMalloc_dist(procs + 1)) ) ABORT("Malloc fails for itemp[]"); MPI_Allgather(&fst_row, 1, mpi_int_t, itemp, 1, mpi_int_t, - grid->comm); + grid->comm); itemp[procs] = A->nrow; - for (p = 0; p < procs; ++p) { - for (i = itemp[p] ; i < itemp[p+1]; ++i) row_to_proc[i] = p; + for (p = 0; p < procs; ++p) + { + for (i = itemp[p] ; i < itemp[p + 1]; ++i) row_to_proc[i] = p; } #if ( DEBUGlevel>=2 ) - if ( !grid->iam ) { - printf("fst_row = %d\n", fst_row); - PrintInt10("row_to_proc", A->nrow, row_to_proc); - PrintInt10("inv_perm_c", A->ncol, inv_perm_c); + if ( !grid->iam ) + { + printf("fst_row = %d\n", fst_row); + PrintInt10("row_to_proc", A->nrow, row_to_proc); + PrintInt10("inv_perm_c", A->ncol, inv_perm_c); } #endif SUPERLU_FREE(itemp); @@ -453,32 +483,34 @@ int dSolveInit(superlu_dist_options_t *options, SuperMatrix *A, than total Procs? For the processes without any row, let fst_row be EMPTY (-1). Make sure this case works! 
*/ MPI_Allgather(&fst_row, 1, mpi_int_t, itemp, 1, mpi_int_t, - grid->comm); + grid->comm); itemp[procs] = n; - for (p = 0; p < procs; ++p) { + for (p = 0; p < procs; ++p) + { j = itemp[p]; - if ( j != EMPTY ) { - k = itemp[p+1]; - if ( k == EMPTY ) k = n; - for (i = j ; i < k; ++i) row_to_proc[i] = p; - } + if ( j != EMPTY ) + { + k = itemp[p + 1]; + if ( k == EMPTY ) k = n; + for (i = j ; i < k; ++i) row_to_proc[i] = p; + } } #endif get_diag_procs(A->ncol, LUstruct->Glu_persist, grid, - &SOLVEstruct->num_diag_procs, - &SOLVEstruct->diag_procs, - &SOLVEstruct->diag_len); + &SOLVEstruct->num_diag_procs, + &SOLVEstruct->diag_procs, + &SOLVEstruct->diag_len); /* Setup communication pattern for redistribution of B and X. */ if ( !(SOLVEstruct->gstrs_comm = (pxgstrs_comm_t *) - SUPERLU_MALLOC(sizeof(pxgstrs_comm_t))) ) + SUPERLU_MALLOC(sizeof(pxgstrs_comm_t))) ) ABORT("Malloc fails for gstrs_comm[]"); pxgstrs_init(A->ncol, m_loc, nrhs, fst_row, perm_r, perm_c, grid, - LUstruct->Glu_persist, SOLVEstruct); + LUstruct->Glu_persist, SOLVEstruct); if ( !(SOLVEstruct->gsmv_comm = (pdgsmv_comm_t *) - SUPERLU_MALLOC(sizeof(pdgsmv_comm_t))) ) + SUPERLU_MALLOC(sizeof(pdgsmv_comm_t))) ) ABORT("Malloc fails for gsmv_comm[]"); SOLVEstruct->A_colind_gsmv = NULL; @@ -494,9 +526,10 @@ void dSolveFinalize(superlu_dist_options_t *options, SOLVEstruct_t *SOLVEstruct) pxgstrs_finalize(SOLVEstruct->gstrs_comm); - if ( options->RefineInitialized ) { + if ( options->RefineInitialized ) + { pdgsmv_finalize(SOLVEstruct->gsmv_comm); - options->RefineInitialized = NO; + options->RefineInitialized = NO; } SUPERLU_FREE(SOLVEstruct->gsmv_comm); SUPERLU_FREE(SOLVEstruct->row_to_proc); @@ -510,28 +543,30 @@ void dSolveFinalize(superlu_dist_options_t *options, SOLVEstruct_t *SOLVEstruct) /*! 
\brief Check the inf-norm of the error vector
 */
 void pdinf_norm_error(int iam, int_t n, int_t nrhs, double x[], int_t ldx,
-                      double xtrue[], int_t ldxtrue, gridinfo_t *grid)
+                      double xtrue[], int_t ldxtrue, gridinfo_t *grid)
 {
     double err, xnorm, temperr, tempxnorm;
     double *x_work, *xtrue_work;
     int i, j;
 
-    for (j = 0; j < nrhs; j++) {
-        x_work = &x[j*ldx];
-        xtrue_work = &xtrue[j*ldxtrue];
-        err = xnorm = 0.0;
-        for (i = 0; i < n; i++) {
-            err = SUPERLU_MAX(err, fabs(x_work[i] - xtrue_work[i]));
-            xnorm = SUPERLU_MAX(xnorm, fabs(x_work[i]));
-        }
-
-        /* get the golbal max err & xnrom */
-        temperr = err;
-        tempxnorm = xnorm;
-        MPI_Allreduce( &temperr, &err, 1, MPI_DOUBLE, MPI_MAX, grid->comm);
-        MPI_Allreduce( &tempxnorm, &xnorm, 1, MPI_DOUBLE, MPI_MAX, grid->comm);
-
-        err = err / xnorm;
-        if ( !iam ) printf("\tSol %2d: ||X-Xtrue||/||X|| = %e\n", j, err);
+    for (j = 0; j < nrhs; j++)
+    {
+        x_work = &x[j * ldx];
+        xtrue_work = &xtrue[j * ldxtrue];
+        err = xnorm = 0.0;
+        for (i = 0; i < n; i++)
+        {
+            err = SUPERLU_MAX(err, fabs(x_work[i] - xtrue_work[i]));
+            xnorm = SUPERLU_MAX(xnorm, fabs(x_work[i]));
+        }
+
+        /* get the global max err & xnorm */
+        temperr = err;
+        tempxnorm = xnorm;
+        MPI_Allreduce( &temperr, &err, 1, MPI_DOUBLE, MPI_MAX, grid->comm);
+        MPI_Allreduce( &tempxnorm, &xnorm, 1, MPI_DOUBLE, MPI_MAX, grid->comm);
+
+        err = err / xnorm;
+        if ( !iam ) printf("\tSol %2d: ||X-Xtrue||/||X|| = %e\n", j, err);
     }
 }

diff --git a/SRC/superlu_defs.h b/SRC/superlu_defs.h
index b17c4ff1..4c08ceb7 100644
--- a/SRC/superlu_defs.h
+++ b/SRC/superlu_defs.h
@@ -1100,7 +1100,9 @@ yes_no_t StdList_Empty(StdList lst);
 /*==== For 3D code ====*/
 /* Matrix distributed in NRformat_loc in 3D process grid, it converts it
    to a NRformat_loc distributed in two-D grid in grid-0 */
-NRformat_loc dGatherNRformat_loc(NRformat_loc *A, gridinfo3d_t *grid3d);
+NRformat_loc dGatherNRformat_loc(NRformat_loc *A,
+                                 double* B, int ldb, int nrhs, double** B2d,
+                                 gridinfo3d_t *grid3d);
 
 extern void DistPrint(char* function_name, double value, char* Units, gridinfo_t* grid);

From 7ba0fdf3c7167f11cd27c5f20e8ecbdc53a5e519 Mon Sep 17 00:00:00 2001
From: piyush sao
Date: Fri, 18 Sep 2020 15:59:45 -0400
Subject: [PATCH 027/147] bug in Btmp allocation

---
 SRC/nrformat_loc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/SRC/nrformat_loc.c b/SRC/nrformat_loc.c
index ee6936a8..7f9b03fa 100644
--- a/SRC/nrformat_loc.c
+++ b/SRC/nrformat_loc.c
@@ -96,7 +96,8 @@ NRformat_loc dGatherNRformat_loc(NRformat_loc *A,
     }
 
     // compacting B
-    double *Btmp =SUPERLU_MALLOC(A->m_loc*nrhs * sizeof(int_t));
+    double *Btmp;
+    Btmp =SUPERLU_MALLOC(A->m_loc*nrhs * sizeof(double));
     matCopy(A->m_loc, nrhs, Btmp, A->m_loc,B, ldb);

From d2f9d923108fe850bd00ef9f3e4107a473f49045 Mon Sep 17 00:00:00 2001
From: xiaoye
Date: Mon, 21 Sep 2020 14:20:36 -0700
Subject: [PATCH 028/147] Fix a bug in nrformat_loc3d: computing A2d.fst_row

Add some comments in the 3 new functions
TODO: Scatter the solution back to 3D before exit pdgssvx3d

---
 EXAMPLE/pddrive3d.c | 47 ++++++++++++++++++++++++++++--------
 SRC/nrformat_loc.c  | 41 +++++++++++++++++++++++++++----
 SRC/pdgssvx3d.c     | 59 ++++++++++++++++++++++++++++++++++++---------
 SRC/pdutil.c        |  4 ++-
 run_cmake_build.sh  |  3 ++-
 5 files changed, 125 insertions(+), 29 deletions(-)
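On the A2d.fst_row bug named in the subject: after the Z-gather, a layer-0 process owns the concatenation of its whole pillar of rows, so its first global row index is the sum of m_loc over all lower-ranked processes of the 2D grid, not the 3D A->fst_row it started with. A standalone sketch of that exclusive prefix sum over a communicator (the names are illustrative, not the library's; MPI_Exscan(&m_loc, &fst_row, 1, MPI_INT, MPI_SUM, comm) is the one-call alternative, with the caveat that it leaves the result on rank 0 undefined, hence the explicit zero below):

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

/* First global row owned by a rank = total rows owned by all lower ranks.
 * Mirrors the allgather-and-scan approach of the fix. */
static int first_row(int m_loc, MPI_Comm comm)
{
    int np, rank, fst_row = 0;
    MPI_Comm_size(comm, &np);
    MPI_Comm_rank(comm, &rank);

    int *counts = malloc(np * sizeof(int));
    MPI_Allgather(&m_loc, 1, MPI_INT, counts, 1, MPI_INT, comm);
    for (int p = 0; p < rank; ++p) fst_row += counts[p];
    free(counts);
    return fst_row;
}

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    int m_loc = 10 + rank;  /* uneven local row counts */
    printf("rank %d: m_loc = %d, fst_row = %d\n",
           rank, m_loc, first_row(m_loc, MPI_COMM_WORLD));

    MPI_Finalize();
    return 0;
}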
diff --git a/EXAMPLE/pddrive3d.c b/EXAMPLE/pddrive3d.c
index 20d82bfc..4a29b39b 100644
--- a/EXAMPLE/pddrive3d.c
+++ b/EXAMPLE/pddrive3d.c
@@ -77,10 +77,18 @@ void checkNRFMT(NRformat_loc*A, NRformat_loc*B)
     assert(A->m_loc == B->m_loc);
     assert(A->fst_row == B->fst_row);
 
+#if 0
+    double *Aval = (double *)A->nzval, *Bval = (double *)B->nzval;
+    PrintDouble5("A", A->nnz_loc, Aval);
+    PrintDouble5("B", B->nnz_loc, Bval);
+    fflush(stdout);
+#endif
+
     for (int_t i = 0; i < A->nnz_loc; i++)
     {
         assert(((double *)A->nzval)[i] == ((double *)B->nzval)[i]);
         assert((A->colind)[i] == (B->colind)[i]);
+        printf("colind[] correct\n");
     }
 
     for (int_t i = 0; i < A->m_loc + 1; i++)
@@ -89,7 +97,6 @@ void checkNRFMT(NRformat_loc*A, NRformat_loc*B)
         assert((A->rowptr)[i] == (B->rowptr)[i]);
     }
 
-
     printf("Matrix check passed\n");
 }
 
@@ -99,7 +106,7 @@ main (int argc, char *argv[])
 {
     superlu_dist_options_t options;
     SuperLUStat_t stat;
-    SuperMatrix A;  // only on process layer 0
+    SuperMatrix A;  // Now, A is on all 3D processes
     ScalePermstruct_t ScalePermstruct;
     LUstruct_t LUstruct;
     SOLVEstruct_t SOLVEstruct;
@@ -250,7 +257,9 @@ main (int argc, char *argv[])
     // *fp0 = *fp;
     dcreate_matrix_postfix3d(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, suffix, &(grid));
 
-#if 1
+    //printf("ldx %d, ldb %d\n", ldx, ldb);
+
+#if 0 // following code is only for checking *Gather* routine
     double* B2d;
     NRformat_loc Atmp = dGatherNRformat_loc(
                             (NRformat_loc *) A.Store,
@@ -264,15 +273,25 @@ main (int argc, char *argv[])
     dcreate_matrix_postfix(&Aref, nrhs, &bref, &ldb,
                            &xtrueref, &ldx, fp0, suffix, &(grid.grid2d));
-    // for (int i = 0; i < 5; i++)
-    // printf("%g %g\n", bref[i], b[i] );
     Astore0 = (NRformat_loc *) Aref.Store;
-    // checkNRFMT(Astore, Astore0);
+
+    /*
+    if ( (grid.grid2d).iam == 0 ) {
+        printf(" iam %d\n", 0);
+        checkNRFMT(Astore, Astore0);
+    } else if ((grid.grid2d).iam == 1 ) {
+        printf(" iam %d\n", 1);
+        checkNRFMT(Astore, Astore0);
+    }
+    */
+
+    // bref, xtrueref are created on 2D
     matCheck(Astore->m_loc, nrhs, B2d, Astore->m_loc, bref, ldb);
     }
     // MPI_Finalize(); exit(0);
 #endif
 #endif
+
     if (!(berr = doubleMalloc_dist (nrhs)))
         ABORT ("Malloc fails for berr[].");
@@ -313,14 +332,20 @@ main (int argc, char *argv[])
         fflush(stdout);
     }
 
+#ifdef NRFRMT  // Sherry: new 3D matrix
+    m = A.nrow;
+    n = A.ncol;
+#else
     if ( grid.zscp.Iam == 0 )  // Process layer 0
     {
         m = A.nrow;
         n = A.ncol;
     }
+
     // broadcast m, n to all the process layers;
     MPI_Bcast( &m, 1, mpi_int_t, 0, grid.zscp.comm);
     MPI_Bcast( &n, 1, mpi_int_t, 0, grid.zscp.comm);
+#endif
 
     /* Initialize ScalePermstruct and LUstruct. */
     ScalePermstructInit (m, n, &ScalePermstruct);
@@ -333,6 +358,9 @@ main (int argc, char *argv[])
     pdgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
                &LUstruct, &SOLVEstruct, berr, &stat, &info);
 
+    //PrintDouble5("After exit pdgssvx3d, solution b", ldb, b);
+    //PrintDouble5("After exit pdgssvx3d, xtrue", ldb, xtrue);
+
     /* Check the accuracy of the solution. */
     if ( grid.zscp.Iam == 0 )  // Process layer 0
         pdinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc,
                           nrhs, b, ldb, xtrue, ldx, &(grid.grid2d));
     fflush(stdout);

     /* ------------------------------------------------------------
        DEALLOCATE STORAGE.
------------------------------------------------------------ */
-    if ( grid.zscp.Iam == 0 ) // process layer 0
     {
         PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/
-        Destroy_CompRowLoc_Matrix_dist (&A);
         Destroy_LU (n, &(grid.grid2d), &LUstruct);
-        SUPERLU_FREE (b);
-        SUPERLU_FREE (xtrue);
         if (options.SolveInitialized) {
             dSolveFinalize (&options, &SOLVEstruct);
@@ -363,6 +387,9 @@ main (int argc, char *argv[])
         DeAllocGlu_3d(&LUstruct);
     }
+    Destroy_CompRowLoc_Matrix_dist (&A);
+    SUPERLU_FREE (b);
+    SUPERLU_FREE (xtrue);
     SUPERLU_FREE (berr);
     ScalePermstructFree (&ScalePermstruct);
     LUstructFree (&LUstruct);
diff --git a/SRC/nrformat_loc.c b/SRC/nrformat_loc.c
index 7f9b03fa..e1e7ed9a 100644
--- a/SRC/nrformat_loc.c
+++ b/SRC/nrformat_loc.c
@@ -25,6 +25,10 @@ static void matCopy(int n, int m, double* Dst, int lddst
 // beyond the last row, so that rowptr[n_loc] = nnz_loc.*/
 // } NRformat_loc;
 
+/*
+ * Input:  {A, B, ldb} are distributed on 3D process grid
+ * Output: {A2d, B2d} are distributed on layer 0 2D process grid
+ */
 NRformat_loc dGatherNRformat_loc(NRformat_loc *A,
                                  double* B, int ldb, int nrhs, double** B2d,
                                  gridinfo3d_t *grid3d)
@@ -75,11 +79,10 @@ NRformat_loc dGatherNRformat_loc(NRformat_loc *A,
     MPI_Gatherv(A->colind, A->nnz_loc, mpi_int_t, A2d.colind,
                 nnz_counts_int, nnz_disp,
                 mpi_int_t, 0, grid3d->zscp.comm);
-
     MPI_Gatherv(&A->rowptr[1], A->m_loc, mpi_int_t, &A2d.rowptr[1],
                 row_counts_int, row_disp,
                 mpi_int_t, 0, grid3d->zscp.comm);
-
+
     if (grid3d->zscp.Iam == 0)
     {
         for (int i = 0; i < grid3d->npdep; i++)
@@ -92,7 +95,25 @@ NRformat_loc dGatherNRformat_loc(NRformat_loc *A,
         }
         A2d.nnz_loc = nnz_disp[grid3d->npdep];
         A2d.m_loc = row_disp[grid3d->npdep];
-        A2d.fst_row = A->fst_row;
+#if 0
+        A2d.fst_row = A->fst_row; // This is a bug
+#else
+        gridinfo_t *grid2d = &(grid3d->grid2d);
+        int procs2d = grid2d->nprow * grid2d->npcol;
+        int m_loc_2d = A2d.m_loc;
+        int* m_loc_2d_counts = SUPERLU_MALLOC(procs2d * sizeof(int));
+
+        MPI_Allgather(&m_loc_2d, 1, MPI_INT, m_loc_2d_counts, 1, MPI_INT, grid2d->comm);
+
+        int fst_row = 0;
+        for (int p = 0; p < procs2d; ++p) {
+            if (grid2d->iam == p) A2d.fst_row = fst_row;
+            fst_row += m_loc_2d_counts[p];
+        }
+
+        SUPERLU_FREE(m_loc_2d_counts);
+#endif
+
     }
 
     // compacting B
@@ -123,6 +144,16 @@ NRformat_loc dGatherNRformat_loc(NRformat_loc *A,
     }
-
+#if 0
+    /* free storage */
+    SUPERLU_FREE(nnz_counts);
+    SUPERLU_FREE(row_counts);
+    SUPERLU_FREE(nnz_counts_int);
+    SUPERLU_FREE(row_counts_int);
+    SUPERLU_FREE(nnz_disp);
+    SUPERLU_FREE(row_disp);
+    SUPERLU_FREE(b_disp);
+#endif
+
     return A2d;
-}
\ No newline at end of file
+}
diff --git a/SRC/pdgssvx3d.c b/SRC/pdgssvx3d.c
index e7fc8bfd..3ab5f702 100644
--- a/SRC/pdgssvx3d.c
+++ b/SRC/pdgssvx3d.c
@@ -619,29 +619,33 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
        ordering, symbolic factorization, distribution of L & U */
 #define NRFRMT
-    double* B2d;
-    int ldb2d;
+    /* Save the inputs: ldb -> ldb3d, and B -> B3d, Astore -> Astore3d
+       B3d and Astore3d will be restored on return */
     int ldb3d = ldb;
-    double* B3d = B;
+    double* B3d = B;
+    NRformat_loc* Astore3d = (NRformat_loc *) A->Store;
+
+    double* B2d;
+    //int ldb2d; // not used
     NRformat_loc Atmp = dGatherNRformat_loc(
                             (NRformat_loc *) A->Store,
                             B, ldb, nrhs, &B2d,
                             grid3d);
-    NRformat_loc* Astore0 = &Atmp;
+    NRformat_loc* Astore0 = &Atmp; // Astore0 is on 2D
     if (grid3d->zscp.Iam == 0)
     {
-
-
         m = A->nrow;
         n = A->ncol;
         // checkNRFMT(Astore0, (NRformat_loc *) A->Store);
 #ifdef NRFRMT
+        // On input, A->Store is on 3D, now A->Store is re-assigned to 2D store
         A->Store = Astore0;
         ldb = Astore0->m_loc;
-        B = B2d;
+        B = B2d; // B is now re-assigned to B2d
 #endif
-        Astore = (NRformat_loc *) A->Store;
+        //PrintDouble5("after gather B=B2d", ldb, B);
+        Astore = (NRformat_loc *) A->Store; // on 2D
 
 // #ifdef NRFRMT
 //     Astore = Astore0;
@@ -1096,7 +1100,8 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 #if ( PRNTlevel>=1 )
         if (!iam)
         {
-            printf (".. anorm %e\n", anorm); fflush(stdout);
+            printf (".. anorm %e\n", anorm); fflush(stdout);
+            fflush(stdout);
         }
 #endif
     }
@@ -1214,10 +1219,12 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 	/* Perform a symbolic factorization on Pc*Pr*A*Pc' and set up
 	   the nonzero data structures for L & U. */
 #if ( PRNTlevel>=1 )
-            if (!iam)
-                printf
+            if (!iam) {
+                printf
                     (".. symbfact(): relax %4d, maxsuper %4d, fill %4d\n",
                      sp_ienv_dist (2), sp_ienv_dist (3), sp_ienv_dist (6));
+                fflush(stdout);
+            }
 #endif
             t = SuperLU_timer_ ();
             if (!(Glu_freeable = (Glu_freeable_t *)
@@ -1251,6 +1258,7 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
                      symb_mem_usage.for_lu * 1e-6,
                      symb_mem_usage.total * 1e-6,
                      symb_mem_usage.expansions);
+                fflush(stdout);
             }
 #endif
         }
@@ -1534,6 +1542,8 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 #else
             pdgstrs(n, LUstruct, ScalePermstruct, grid, X, m_loc,
                     fst_row, ldb, nrhs, SOLVEstruct, stat, info);
+
+            //PrintDouble5("after pdgstrs X", ldb, X);
 #endif
 
     /* ------------------------------------------------------------
@@ -1717,8 +1727,33 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 #endif
     } /* process layer 0 done solve */
 
-    // b2d->B
+    // Sherry comment:
+    // Now, B <=> B2d, and is filled with the solution X
+    // B3d is the saved pointer of the B on input
+    // Need the following code:
+    // - scatter the solution from 2D back to 3D: {B2d,ldb} -> {B3d,ldb3d}
+    //   (can we reuse b_count[] and b_disp[] already computed in 'Gather' routine?)
+
+#if 0 // for debugging
+    if ( grid3d->zscp.Iam == 0 ) { // only process layer 0
+        PrintDouble5("Before exit pdgssvx3d, solution B2d", ldb, B2d);
+        PrintDouble5("Before exit pdgssvx3d, solution B", ldb, B);
+    }
+    PrintDouble5("Before exit pdgssvx3d, solution B3d", ldb3d, B3d);
+    fflush(stdout);
+#endif
+
+    /* free storage, which are allocated only in layer 0 */
+    if ( grid3d->zscp.Iam == 0 ) { // free matrix A and B2d on 2D
+        SUPERLU_FREE( Atmp.rowptr );
+        SUPERLU_FREE( Atmp.colind );
+        SUPERLU_FREE( Atmp.nzval );
+        SUPERLU_FREE(B2d);
+    }
+
+    A->Store = Astore3d; // restore Astore to 3D
+
 #if ( DEBUGlevel>=1 )
     CHECK_MALLOC (iam, "Exit pdgssvx3d()");
 #endif
diff --git a/SRC/pdutil.c b/SRC/pdutil.c
index 5c665d8b..5620fc4a 100644
--- a/SRC/pdutil.c
+++ b/SRC/pdutil.c
@@ -82,6 +82,7 @@ int pdCompRow_loc_to_CompCol_global
     printf("Proc %d\n", grid->iam);
     PrintInt10("rowind_loc", nnz_loc, rowind_loc);
     PrintInt10("colptr_loc", n + 1, colptr_loc);
+    fflush(stdout);
 #endif
 
     procs = grid->nprow * grid->npcol;
@@ -119,9 +120,10 @@ int pdCompRow_loc_to_CompCol_global
 	rdispls[i] = i * (n_loc + 1);
 	j += n_locs[i]; /* First column of next block in colptr_loc[] */
     }
+
     MPI_Alltoallv(colptr_send, sendcnts, sdispls, mpi_int_t,
 		  colptr_blk, recvcnts, rdispls, mpi_int_t, grid->comm);
-
+
     /* Adjust colptr_blk[] so that they contain the local indices of
        the column pointers in the receive buffer. */
     nnz = 0; /* The running sum of the nonzeros counted by far */
diff --git a/run_cmake_build.sh b/run_cmake_build.sh
index bd272408..fd2318db 100755
--- a/run_cmake_build.sh
+++ b/run_cmake_build.sh
@@ -55,6 +55,7 @@ then
 	-DCMAKE_INSTALL_PREFIX=.
 #	-DXSDK_INDEX_SIZE=64 \
 #	-DCMAKE_EXE_LINKER_FLAGS="-shared"
+
 elif [ "$NERSC_HOST" == "cori" ]
 then
     rm -fr cori-build; mkdir cori-build; cd cori-build;
@@ -67,7 +68,7 @@ then
 	-DTPL_ENABLE_BLASLIB=OFF \
 	-DTPL_BLAS_LIBRARIES="-mkl" \
 	-DCMAKE_Fortran_COMPILER=ftn \
-	-DCMAKE_C_FLAGS="-std=c99 -fPIC -DPRNTlevel=0" \
+	-DCMAKE_C_FLAGS="-std=c99 -fPIC -DPRNTlevel=1 -g -O0" \
 	-DCMAKE_INSTALL_PREFIX=.
 #	-DXSDK_INDEX_SIZE=64
 #	-DCMAKE_EXE_LINKER_FLAGS="-shared" \

From 14f84ce6ea14d0177b04989e369b7b1fa71aa29e Mon Sep 17 00:00:00 2001
From: piyush sao
Date: Tue, 6 Oct 2020 00:48:35 -0400
Subject: [PATCH 029/147] scattering-B appears to be working

---
 EXAMPLE/pddrive3d.c |   2 +
 SRC/nrformat_loc.c  | 153 +++++++---
 SRC/pdgssvx3d.c     | 663 +++++++++++++++++++++++---------------------
 SRC/superlu_defs.h  |   4 +
 4 files changed, 458 insertions(+), 364 deletions(-)

diff --git a/EXAMPLE/pddrive3d.c b/EXAMPLE/pddrive3d.c
index 4a29b39b..ea70ce5b 100644
--- a/EXAMPLE/pddrive3d.c
+++ b/EXAMPLE/pddrive3d.c
@@ -362,7 +362,9 @@ main (int argc, char *argv[])
     //PrintDouble5("After exit pdgssvx3d, xtrue", ldb, xtrue);
 
     /* Check the accuracy of the solution. */
+    #ifndef NRFRMT
     if ( grid.zscp.Iam == 0 ) // Process layer 0
+    #endif
     pdinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc,
                       nrhs, b, ldb, xtrue, ldx, &(grid.grid2d));
     fflush(stdout);
diff --git a/SRC/nrformat_loc.c b/SRC/nrformat_loc.c
index e1e7ed9a..287893cb 100644
--- a/SRC/nrformat_loc.c
+++ b/SRC/nrformat_loc.c
@@ -1,15 +1,14 @@
 #include "superlu_ddefs.h"
 
-static void matCopy(int n, int m, double* Dst, int lddst
-                    , double* Src, int ldsrc)
+static void matCopy(int n, int m, double *Dst, int lddst, double *Src, int ldsrc)
 {
-    for(int j=0; jnpdep * sizeof(int_t));
     row_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t));
     nnz_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
@@ -54,15 +53,15 @@ NRformat_loc dGatherNRformat_loc(NRformat_loc *A,
 
     nnz_disp[0] = 0;
     row_disp[0] = 0;
-    b_disp[0] =0;
+    b_disp[0] = 0;
     for (int i = 0; i < grid3d->npdep; i++)
     {
         nnz_disp[i + 1] = nnz_disp[i] + nnz_counts[i];
         row_disp[i + 1] = row_disp[i] + row_counts[i];
-        b_disp[i+1] = nrhs*row_disp[i + 1];
+        b_disp[i + 1] = nrhs * row_disp[i + 1];
         nnz_counts_int[i] = nnz_counts[i];
         row_counts_int[i] = row_counts[i];
-        b_counts_int[i] = nrhs*row_counts[i];
+        b_counts_int[i] = nrhs * row_counts[i];
     }
 
     if (grid3d->zscp.Iam == 0)
@@ -82,68 +81,71 @@ NRformat_loc dGatherNRformat_loc(NRformat_loc *A,
     MPI_Gatherv(&A->rowptr[1], A->m_loc, mpi_int_t, &A2d.rowptr[1],
                 row_counts_int, row_disp,
                 mpi_int_t, 0, grid3d->zscp.comm);
-
+
     if (grid3d->zscp.Iam == 0)
     {
         for (int i = 0; i < grid3d->npdep; i++)
        {
-            for(int j = row_disp[i]+1; jnpdep];
         A2d.m_loc = row_disp[grid3d->npdep];
 #if 0
     A2d.fst_row = A->fst_row; // This is a bug
 #else
-    gridinfo_t *grid2d = &(grid3d->grid2d);
-    int procs2d = grid2d->nprow * grid2d->npcol;
-    int m_loc_2d = A2d.m_loc;
-    int* m_loc_2d_counts = SUPERLU_MALLOC(procs2d * sizeof(int));
-
-    MPI_Allgather(&m_loc_2d, 1, MPI_INT, m_loc_2d_counts, 1, MPI_INT, grid2d->comm);
-
-    int fst_row = 0;
-    for (int p = 0; p < procs2d; ++p) {
-        if (grid2d->iam == p) A2d.fst_row = fst_row;
-        fst_row += m_loc_2d_counts[p];
-    }
-
-    SUPERLU_FREE(m_loc_2d_counts);
-#endif
-
+    gridinfo_t *grid2d = &(grid3d->grid2d);
+    int procs2d = grid2d->nprow * grid2d->npcol;
+    int m_loc_2d = A2d.m_loc;
+    int *m_loc_2d_counts = SUPERLU_MALLOC(procs2d * sizeof(int));
+
+    MPI_Allgather(&m_loc_2d, 1, MPI_INT, m_loc_2d_counts, 1, MPI_INT, grid2d->comm);
+
+    int fst_row = 0;
+    for (int p = 0; p < procs2d; ++p)
+    {
+        if (grid2d->iam == p)
+            A2d.fst_row = fst_row;
+        fst_row += m_loc_2d_counts[p];
+    }
+
+    SUPERLU_FREE(m_loc_2d_counts);
+#endif
     }
-
+    // Btmp <- compact(B)
     // compacting B
     double *Btmp;
-    Btmp =SUPERLU_MALLOC(A->m_loc*nrhs * sizeof(double));
-    matCopy(A->m_loc, nrhs, Btmp, A->m_loc,B, ldb);
+    Btmp = SUPERLU_MALLOC(A->m_loc * nrhs * sizeof(double));
+    matCopy(A->m_loc, nrhs, Btmp, A->m_loc, B, ldb);
 
     double *B1;
     if (grid3d->zscp.Iam == 0)
     {
-        B1 = SUPERLU_MALLOC( A2d.m_loc*nrhs * sizeof(double));
-        *B2d = SUPERLU_MALLOC( A2d.m_loc*nrhs * sizeof(double));
+        B1 = SUPERLU_MALLOC(A2d.m_loc * nrhs * sizeof(double));
+        *B2d = SUPERLU_MALLOC(A2d.m_loc * nrhs * sizeof(double));
     }
 
-    MPI_Gatherv(Btmp, nrhs*A->m_loc, MPI_DOUBLE, B1,
+    // B1 <- gatherv(Btmp)
+    MPI_Gatherv(Btmp, nrhs * A->m_loc, MPI_DOUBLE, B1,
                 b_counts_int, b_disp,
                 MPI_DOUBLE, 0, grid3d->zscp.comm);
+
+    // B2d <- colMajor(B1)
     if (grid3d->zscp.Iam == 0)
     {
         for (int i = 0; i < grid3d->npdep; ++i)
        {
             /* code */
-            matCopy(row_counts_int[i], nrhs, *B2d + row_disp[i], A2d.m_loc,
-                    B1+nrhs*row_disp[i], row_counts_int[i]);
+            matCopy(row_counts_int[i], nrhs, *B2d + row_disp[i], A2d.m_loc,
+                    B1 + nrhs * row_disp[i], row_counts_int[i]);
         }
 
         SUPERLU_FREE(B1);
     }
-
 #if 0
     /* free storage */
     SUPERLU_FREE(nnz_counts);
@@ -154,6 +156,73 @@ NRformat_loc dGatherNRformat_loc(NRformat_loc *A,
     SUPERLU_FREE(row_disp);
     SUPERLU_FREE(b_disp);
 #endif
-
+
     return A2d;
 }
+
+// X2d <- A^{-1} B2D
+
+int dScatterB3d(NRformat_loc A2d, NRformat_loc *A,
+                double *B, int ldb, int nrhs, double *B2d,
+                gridinfo3d_t *grid3d)
+{
+    int_t *nnz_counts, *row_counts;
+    int *nnz_disp, *row_disp, *nnz_counts_int, *row_counts_int;
+    int *b_counts_int, *b_disp;
+    nnz_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t));
+    row_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t));
+    nnz_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
+    row_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
+    b_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
+    MPI_Gather(&A->nnz_loc, 1, mpi_int_t, nnz_counts,
+               1, mpi_int_t, 0, grid3d->zscp.comm);
+    MPI_Gather(&A->m_loc, 1, mpi_int_t, row_counts,
+               1, mpi_int_t, 0, grid3d->zscp.comm);
+    nnz_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
+    row_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
+    b_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
+
+    nnz_disp[0] = 0;
+    row_disp[0] = 0;
+    b_disp[0] = 0;
+    for (int i = 0; i < grid3d->npdep; i++)
+    {
+        nnz_disp[i + 1] = nnz_disp[i] + nnz_counts[i];
+        row_disp[i + 1] = row_disp[i] + row_counts[i];
+        b_disp[i + 1] = nrhs * row_disp[i + 1];
+        nnz_counts_int[i] = nnz_counts[i];
+        row_counts_int[i] = row_counts[i];
+        b_counts_int[i] = nrhs * row_counts[i];
+    }
+
+    double *B1;
+    if (grid3d->zscp.Iam == 0)
+    {
+        B1 = SUPERLU_MALLOC(A2d.m_loc * nrhs * sizeof(double));
+    }
+
+    // B1 <- blockByBock(b2d)
+    if (grid3d->zscp.Iam == 0)
+    {
+        for (int i = 0; i < grid3d->npdep; ++i)
+        {
+            /* code */
+            matCopy(row_counts_int[i], nrhs, B1 + nrhs * row_disp[i], row_counts_int[i],
+                    B2d + row_disp[i], A2d.m_loc);
+        }
+
+    }
+
+    //
+    double *Btmp;
+    Btmp = SUPERLU_MALLOC(A->m_loc * nrhs * sizeof(double));
+
+    // Bttmp <- scatterv(B1)
+    MPI_Scatterv(B1, b_counts_int, b_disp, MPI_DOUBLE,
+                 Btmp, nrhs * A->m_loc, MPI_DOUBLE, 0, grid3d->zscp.comm);
+
+    // B <- colMajor(Btmp)
+    matCopy(A->m_loc, nrhs, B, ldb,Btmp, A->m_loc);
+
+    return 0;
+}
\ No newline at end of file
diff --git a/SRC/pdgssvx3d.c b/SRC/pdgssvx3d.c
index 3ab5f702..3390b5ce 100644
--- a/SRC/pdgssvx3d.c
+++ b/SRC/pdgssvx3d.c
@@ -9,7 +9,6 @@ The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
-
 /*! @file
  * \brief Solves a system of linear equations A*X=B using 3D process grid.
  *
@@ -21,7 +20,7 @@ at the top-level directory.
 #include "superlu_ddefs.h"
 #include
 
-static void checkNRFMT(NRformat_loc*A, NRformat_loc*B)
+static void checkNRFMT(NRformat_loc *A, NRformat_loc *B)
 {
     /*
     int_t nnz_loc;
@@ -49,9 +48,7 @@ static void checkNRFMT(NRformat_loc *A, NRformat_loc *B)
         assert((A->rowptr)[i] == (B->rowptr)[i]);
     }
 
-    printf("Matrix check passed\n");
-
 }
 #if 0
 #include "p3dcomm.h"
@@ -527,18 +524,17 @@ static void checkNRFMT(NRformat_loc *A, NRformat_loc *B)
 *
*/ -void -pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, - ScalePermstruct_t * ScalePermstruct, - double B[], int ldb, int nrhs, gridinfo3d_t * grid3d, - LUstruct_t * LUstruct, SOLVEstruct_t * SOLVEstruct, - double *berr, SuperLUStat_t * stat, int *info) +void pdgssvx3d(superlu_dist_options_t *options, SuperMatrix *A, + ScalePermstruct_t *ScalePermstruct, + double B[], int ldb, int nrhs, gridinfo3d_t *grid3d, + LUstruct_t *LUstruct, SOLVEstruct_t *SOLVEstruct, + double *berr, SuperLUStat_t *stat, int *info) { NRformat_loc *Astore; - SuperMatrix GA; /* Global A in NC format */ + SuperMatrix GA; /* Global A in NC format */ NCformat *GAstore; double *a_GA; - SuperMatrix GAC; /* Global A in NCP format (add n end pointers) */ + SuperMatrix GAC; /* Global A in NCP format (add n end pointers) */ NCPformat *GACstore; Glu_persist_t *Glu_persist = LUstruct->Glu_persist; Glu_freeable_t *Glu_freeable; @@ -557,28 +553,28 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, fact_t Fact; double *a; int_t *colptr, *rowind; - int_t *perm_r; /* row permutations from partial pivoting */ - int_t *perm_c; /* column permutation vector */ - int_t *etree; /* elimination tree */ - int_t *rowptr, *colind; /* Local A in NR */ + int_t *perm_r; /* row permutations from partial pivoting */ + int_t *perm_c; /* column permutation vector */ + int_t *etree; /* elimination tree */ + int_t *rowptr, *colind; /* Local A in NR */ int_t colequ, Equil, factored, job, notran, rowequ, need_value; int_t i, iinfo, j, irow, m, n, nnz, permc_spec; int_t nnz_loc, m_loc, fst_row, icol; int iam; - int ldx; /* LDA for matrix X (local). */ + int ldx; /* LDA for matrix X (local). */ char equed[1], norm[1]; double *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd; double *X, *b_col, *b_work, *x_col; double t; - float GA_mem_use; /* memory usage by global A */ - float dist_mem_use; /* memory usage during distribution */ + float GA_mem_use; /* memory usage by global A */ + float dist_mem_use; /* memory usage during distribution */ superlu_dist_mem_usage_t num_mem_usage, symb_mem_usage; -#if ( PRNTlevel>= 2 ) +#if (PRNTlevel >= 2) double dmin, dsum, dprod; #endif // get the 2d grid - gridinfo_t *grid = &(grid3d->grid2d); + gridinfo_t *grid = &(grid3d->grid2d); iam = grid->iam; /* Initialization. 
*/ @@ -601,18 +597,18 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, else if (options->IterRefine == SLU_EXTRA) { *info = -1; - fprintf (stderr, - "Extra precise iterative refinement yet to support."); + fprintf(stderr, + "Extra precise iterative refinement yet to support."); } if (*info) { i = -(*info); - pxerr_dist ("pdgssvx3d", grid, -*info); + pxerr_dist("pdgssvx3d", grid, -*info); return; } -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC (iam, "Enter pdgssvx3d()"); +#if (DEBUGlevel >= 1) + CHECK_MALLOC(iam, "Enter pdgssvx3d()"); #endif /* Perform preprocessing steps on process layer zero, including: @@ -622,17 +618,21 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, /* Save the inputs: ldb -> ldb3d, and B -> B3d, Astore -> Astore3d B3d and Astore3d will be restored on return */ int ldb3d = ldb; - double* B3d = B; - NRformat_loc* Astore3d = (NRformat_loc *) A->Store; - - double* B2d; + double *B3d = B; + NRformat_loc *Astore3d = (NRformat_loc *)A->Store; + + double *B2d; //int ldb2d; // not used NRformat_loc Atmp = dGatherNRformat_loc( - (NRformat_loc *) A->Store, - B, ldb, nrhs, &B2d, - grid3d); + (NRformat_loc *)A->Store, + B, ldb, nrhs, &B2d, + grid3d); + + // dGatherNRformat_loc(NRformat_loc3d * A3d, grid3d); + // dScatterBNRformat_loc(NRformat_loc3d * A3d, grid3d); - NRformat_loc* Astore0 = &Atmp; // Astore0 is on 2D + NRformat_loc *Astore0 = &Atmp; // Astore0 is on 2D + NRformat_loc *A_orig = A->Store; if (grid3d->zscp.Iam == 0) { m = A->nrow; @@ -642,33 +642,32 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, // On input, A->Store is on 3D, now A->Store is re-assigned to 2D store A->Store = Astore0; ldb = Astore0->m_loc; - B = B2d; // B is now re-assigned to B2d + B = B2d; // B is now re-assigned to B2d #endif //PrintDouble5("after gather B=B2d", ldb, B); - Astore = (NRformat_loc *) A->Store; // on 2D - -// #ifdef NRFRMT -// Astore = Astore0; -// #else -// Astore = Astore0; -// // Astore = (NRformat_loc *) A->Store; -// // Astore->nzval = Astore0->nzval; -// // Astore->rowptr = Astore0->rowptr; -// // Astore->colind = Astore0->colind; -// // Astore->nnz_loc = Astore0->nnz_loc; -// // Astore->m_loc = Astore0->m_loc; -// // Astore->fst_row = Astore0->fst_row; -// #endif + Astore = (NRformat_loc *)A->Store; // on 2D + + // #ifdef NRFRMT + // Astore = Astore0; + // #else + // Astore = Astore0; + // // Astore = (NRformat_loc *) A->Store; + // // Astore->nzval = Astore0->nzval; + // // Astore->rowptr = Astore0->rowptr; + // // Astore->colind = Astore0->colind; + // // Astore->nnz_loc = Astore0->nnz_loc; + // // Astore->m_loc = Astore0->m_loc; + // // Astore->fst_row = Astore0->fst_row; + // #endif nnz_loc = Astore->nnz_loc; m_loc = Astore->m_loc; fst_row = Astore->fst_row; - a = (double *) Astore->nzval; + a = (double *)Astore->nzval; rowptr = Astore->rowptr; colind = Astore->colind; /* Test the other input parameters. 
*/ - if (A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc - || A->Dtype != SLU_D || A->Mtype != SLU_GE) + if (A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc || A->Dtype != SLU_D || A->Mtype != SLU_GE) *info = -2; else if (ldb < m_loc) *info = -5; @@ -677,15 +676,15 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, if (*info) { i = -(*info); - pxerr_dist ("pdgssvx3d", grid, -*info); + pxerr_dist("pdgssvx3d", grid, -*info); return; } /* Structures needed for parallel symbolic factorization */ int_t *sizes, *fstVtxSep; int noDomains, nprocs_num; - MPI_Comm symb_comm; /* communicator for symbolic factorization */ - int col, key; /* parameters for creating a new communicator */ + MPI_Comm symb_comm; /* communicator for symbolic factorization */ + int col, key; /* parameters for creating a new communicator */ Pslu_freeable_t Pslu_freeable; float flinfo; @@ -701,9 +700,9 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, if (factored || (Fact == SamePattern_SameRowPerm && Equil)) { rowequ = (ScalePermstruct->DiagScale == ROW) || - (ScalePermstruct->DiagScale == BOTH); + (ScalePermstruct->DiagScale == BOTH); colequ = (ScalePermstruct->DiagScale == COL) || - (ScalePermstruct->DiagScale == BOTH); + (ScalePermstruct->DiagScale == BOTH); } else rowequ = colequ = FALSE; @@ -723,21 +722,21 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, switch (ScalePermstruct->DiagScale) { case NOEQUIL: - if (!(R = (double *) doubleMalloc_dist (m))) - ABORT ("Malloc fails for R[]."); - if (!(C = (double *) doubleMalloc_dist (n))) - ABORT ("Malloc fails for C[]."); + if (!(R = (double *)doubleMalloc_dist(m))) + ABORT("Malloc fails for R[]."); + if (!(C = (double *)doubleMalloc_dist(n))) + ABORT("Malloc fails for C[]."); ScalePermstruct->R = R; ScalePermstruct->C = C; break; case ROW: - if (!(C = (double *) doubleMalloc_dist (n))) - ABORT ("Malloc fails for C[]."); + if (!(C = (double *)doubleMalloc_dist(n))) + ABORT("Malloc fails for C[]."); ScalePermstruct->C = C; break; case COL: - if (!(R = (double *) doubleMalloc_dist (m))) - ABORT ("Malloc fails for R[]."); + if (!(R = (double *)doubleMalloc_dist(m))) + ABORT("Malloc fails for R[]."); ScalePermstruct->R = R; break; } @@ -748,10 +747,10 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, ------------------------------------------------------------ */ if (Equil) { -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC (iam, "Enter equil"); +#if (DEBUGlevel >= 1) + CHECK_MALLOC(iam, "Enter equil"); #endif - t = SuperLU_timer_ (); + t = SuperLU_timer_(); if (Fact == SamePattern_SameRowPerm) { @@ -766,7 +765,7 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, { for (i = rowptr[j]; i < rowptr[j + 1]; ++i) { - a[i] *= R[irow]; /* Scale rows. */ + a[i] *= R[irow]; /* Scale rows. */ } ++irow; } @@ -776,7 +775,7 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, for (i = rowptr[j]; i < rowptr[j + 1]; ++i) { icol = colind[i]; - a[i] *= C[icol]; /* Scale columns. */ + a[i] *= C[icol]; /* Scale columns. */ } break; case BOTH: @@ -793,68 +792,70 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, break; } } - else /* Compute R & C from scratch */ + else /* Compute R & C from scratch */ { /* Compute the row and column scalings. 
*/ - pdgsequ (A, R, C, &rowcnd, &colcnd, &amax, &iinfo, grid); + pdgsequ(A, R, C, &rowcnd, &colcnd, &amax, &iinfo, grid); - if ( iinfo > 0 ) + if (iinfo > 0) { - if ( iinfo <= m ) + if (iinfo <= m) { -#if ( PRNTlevel>=1 ) +#if (PRNTlevel >= 1) fprintf(stderr, "The " IFMT "-th row of A is exactly zero\n", iinfo); #endif } else { -#if ( PRNTlevel>=1 ) +#if (PRNTlevel >= 1) fprintf(stderr, "The " IFMT "-th column of A is exactly zero\n", iinfo - n); #endif } } - else if ( iinfo < 0 ) return; + else if (iinfo < 0) + return; /* Now iinfo == 0 */ /* Equilibrate matrix A if it is badly-scaled. A <-- diag(R)*A*diag(C) */ - pdlaqgs (A, R, C, rowcnd, colcnd, amax, equed); + pdlaqgs(A, R, C, rowcnd, colcnd, amax, equed); - if ( strncmp(equed, "R", 1) == 0 ) + if (strncmp(equed, "R", 1) == 0) { ScalePermstruct->DiagScale = ROW; rowequ = ROW; } - else if ( strncmp(equed, "C", 1) == 0 ) + else if (strncmp(equed, "C", 1) == 0) { ScalePermstruct->DiagScale = COL; colequ = COL; } - else if ( strncmp(equed, "B", 1) == 0 ) + else if (strncmp(equed, "B", 1) == 0) { ScalePermstruct->DiagScale = BOTH; rowequ = ROW; colequ = COL; } - else ScalePermstruct->DiagScale = NOEQUIL; + else + ScalePermstruct->DiagScale = NOEQUIL; -#if ( PRNTlevel>=1 ) +#if (PRNTlevel >= 1) if (iam == 0) { - printf (".. equilibrated? *equed = %c\n", *equed); + printf(".. equilibrated? *equed = %c\n", *equed); fflush(stdout); } #endif } /* end if-else Fact ... */ - stat->utime[EQUIL] = SuperLU_timer_ () - t; -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC (iam, "Exit equil"); + stat->utime[EQUIL] = SuperLU_timer_() - t; +#if (DEBUGlevel >= 1) + CHECK_MALLOC(iam, "Exit equil"); #endif } /* end if Equil ... LAPACK style, not involving MC64 */ - if (!factored) /* Skip this if already factored. */ + if (!factored) /* Skip this if already factored. */ { /* * Gather A from the distributed compressed row format to @@ -863,27 +864,27 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, * for large diagonal is sought after. */ if (Fact != SamePattern_SameRowPerm && - (parSymbFact == NO || options->RowPerm != NO)) + (parSymbFact == NO || options->RowPerm != NO)) { need_value = (options->RowPerm == LargeDiag_MC64); - pdCompRow_loc_to_CompCol_global (need_value, A, grid, &GA); + pdCompRow_loc_to_CompCol_global(need_value, A, grid, &GA); - GAstore = (NCformat *) GA.Store; + GAstore = (NCformat *)GA.Store; colptr = GAstore->colptr; rowind = GAstore->rowind; nnz = GAstore->nnz; - GA_mem_use = (nnz + n + 1) * sizeof (int_t); + GA_mem_use = (nnz + n + 1) * sizeof(int_t); if (need_value) { - a_GA = (double *) GAstore->nzval; - GA_mem_use += nnz * sizeof (double); + a_GA = (double *)GAstore->nzval; + GA_mem_use += nnz * sizeof(double); } else - assert (GAstore->nzval == NULL); + assert(GAstore->nzval == NULL); } /* ------------------------------------------------------------ @@ -891,7 +892,7 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, ------------------------------------------------------------ */ if (options->RowPerm != NO) { - t = SuperLU_timer_ (); + t = SuperLU_timer_(); if (Fact != SamePattern_SameRowPerm) { if (options->RowPerm == MY_PERMR) @@ -904,68 +905,68 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, rowind[i] = perm_r[irow]; } } - else if ( options->RowPerm == LargeDiag_MC64 ) + else if (options->RowPerm == LargeDiag_MC64) { /* Get a new perm_r[] */ if (job == 5) { /* Allocate storage for scaling factors. 
*/ - if (!(R1 = doubleMalloc_dist (m))) - ABORT ("SUPERLU_MALLOC fails for R1[]"); - if (!(C1 = doubleMalloc_dist (n))) - ABORT ("SUPERLU_MALLOC fails for C1[]"); + if (!(R1 = doubleMalloc_dist(m))) + ABORT("SUPERLU_MALLOC fails for R1[]"); + if (!(C1 = doubleMalloc_dist(n))) + ABORT("SUPERLU_MALLOC fails for C1[]"); } - if ( iam == 0 ) + if (iam == 0) { /* Process 0 finds a row permutation */ - iinfo = dldperm_dist (job, m, nnz, colptr, rowind, a_GA, - perm_r, R1, C1); - MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); - if ( iinfo == 0 ) + iinfo = dldperm_dist(job, m, nnz, colptr, rowind, a_GA, + perm_r, R1, C1); + MPI_Bcast(&iinfo, 1, mpi_int_t, 0, grid->comm); + if (iinfo == 0) { - MPI_Bcast (perm_r, m, mpi_int_t, 0, grid->comm); + MPI_Bcast(perm_r, m, mpi_int_t, 0, grid->comm); if (job == 5 && Equil) { - MPI_Bcast (R1, m, MPI_DOUBLE, 0, grid->comm); - MPI_Bcast (C1, n, MPI_DOUBLE, 0, grid->comm); + MPI_Bcast(R1, m, MPI_DOUBLE, 0, grid->comm); + MPI_Bcast(C1, n, MPI_DOUBLE, 0, grid->comm); } } } else { - MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); - if ( iinfo == 0 ) + MPI_Bcast(&iinfo, 1, mpi_int_t, 0, grid->comm); + if (iinfo == 0) { - MPI_Bcast (perm_r, m, mpi_int_t, 0, grid->comm); + MPI_Bcast(perm_r, m, mpi_int_t, 0, grid->comm); if (job == 5 && Equil) { - MPI_Bcast (R1, m, MPI_DOUBLE, 0, grid->comm); - MPI_Bcast (C1, n, MPI_DOUBLE, 0, grid->comm); + MPI_Bcast(R1, m, MPI_DOUBLE, 0, grid->comm); + MPI_Bcast(C1, n, MPI_DOUBLE, 0, grid->comm); } } } - if ( iinfo && job == 5) /* Error return */ + if (iinfo && job == 5) /* Error return */ { SUPERLU_FREE(R1); SUPERLU_FREE(C1); } -#if ( PRNTlevel>=2 ) - dmin = damch_dist ("Overflow"); +#if (PRNTlevel >= 2) + dmin = damch_dist("Overflow"); dsum = 0.0; dprod = 1.0; #endif - if ( iinfo == 0 ) + if (iinfo == 0) { if (job == 5) { - if ( Equil ) + if (Equil) { for (i = 0; i < n; ++i) { - R1[i] = exp (R1[i]); - C1[i] = exp (C1[i]); + R1[i] = exp(R1[i]); + C1[i] = exp(C1[i]); } /* Scale the distributed matrix further. @@ -977,7 +978,7 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, { icol = colind[i]; a[i] *= R1[irow] * C1[icol]; -#if ( PRNTlevel>=2 ) +#if (PRNTlevel >= 2) if (perm_r[irow] == icol) { /* New diagonal */ @@ -996,18 +997,22 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, /* Multiply together the scaling factors -- R/C from simple scheme, R1/C1 from MC64. */ if (rowequ) - for (i = 0; i < m; ++i) R[i] *= R1[i]; + for (i = 0; i < m; ++i) + R[i] *= R1[i]; else - for (i = 0; i < m; ++i) R[i] = R1[i]; + for (i = 0; i < m; ++i) + R[i] = R1[i]; if (colequ) - for (i = 0; i < n; ++i) C[i] *= C1[i]; + for (i = 0; i < n; ++i) + C[i] *= C1[i]; else - for (i = 0; i < n; ++i) C[i] = C1[i]; + for (i = 0; i < n; ++i) + C[i] = C1[i]; ScalePermstruct->DiagScale = BOTH; rowequ = colequ = 1; - } /* end if Equil */ + } /* end if Equil */ /* Now permute global A to prepare for symbfact() */ for (j = 0; j < n; ++j) @@ -1018,10 +1023,10 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, rowind[i] = perm_r[irow]; } } - SUPERLU_FREE (R1); - SUPERLU_FREE (C1); + SUPERLU_FREE(R1); + SUPERLU_FREE(C1); } - else /* job = 2,3,4 */ + else /* job = 2,3,4 */ { for (j = 0; j < n; ++j) { @@ -1029,48 +1034,50 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, { irow = rowind[i]; rowind[i] = perm_r[irow]; - } /* end for i ... */ - } /* end for j ... */ - } /* end else job ... */ + } /* end for i ... */ + } /* end for j ... */ + } /* end else job ... 
*/ } - else /* if iinfo != 0 */ + else /* if iinfo != 0 */ { - for (i = 0; i < m; ++i) perm_r[i] = i; + for (i = 0; i < m; ++i) + perm_r[i] = i; } -#if ( PRNTlevel>=2 ) +#if (PRNTlevel >= 2) if (job == 2 || job == 3) { if (!iam) - printf ("\tsmallest diagonal %e\n", dmin); + printf("\tsmallest diagonal %e\n", dmin); } else if (job == 4) { if (!iam) - printf ("\tsum of diagonal %e\n", dsum); + printf("\tsum of diagonal %e\n", dsum); } else if (job == 5) { if (!iam) - printf ("\t product of diagonal %e\n", dprod); + printf("\t product of diagonal %e\n", dprod); } #endif } - else /* use largeDiag_AWPM */ + else /* use largeDiag_AWPM */ { #ifdef HAVE_COMBBLAS c2cpp_GetAWPM(A, grid, ScalePermstruct); #else - if ( iam == 0 ) + if (iam == 0) { - printf("CombBLAS is not available\n"); fflush(stdout); + printf("CombBLAS is not available\n"); + fflush(stdout); } #endif } /* end if-else options->RowPerm ... */ - t = SuperLU_timer_ () - t; + t = SuperLU_timer_() - t; stat->utime[ROWPERM] = t; -#if ( PRNTlevel>=1 ) - if ( !iam ) +#if (PRNTlevel >= 1) + if (!iam) { printf(".. LDPERM job " IFMT "\t time: %.2f\n", job, t); fflush(stdout); @@ -1078,14 +1085,15 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, #endif } /* end if Fact not SamePattern_SameRowPerm ... */ } - else /* options->RowPerm == NOROWPERM / NATURAL */ + else /* options->RowPerm == NOROWPERM / NATURAL */ { - for (i = 0; i < m; ++i) perm_r[i] = i; + for (i = 0; i < m; ++i) + perm_r[i] = i; } -#if ( DEBUGlevel>=2 ) +#if (DEBUGlevel >= 2) if (!iam) - PrintInt10 ("perm_r", m, perm_r); + PrintInt10("perm_r", m, perm_r); #endif } /* end if (!factored) */ @@ -1093,26 +1101,26 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, { /* Compute norm(A), which will be used to adjust small diagonal. */ if (notran) - *(unsigned char *) norm = '1'; + *(unsigned char *)norm = '1'; else - *(unsigned char *) norm = 'I'; - anorm = pdlangs (norm, A, grid); -#if ( PRNTlevel>=1 ) + *(unsigned char *)norm = 'I'; + anorm = pdlangs(norm, A, grid); +#if (PRNTlevel >= 1) if (!iam) { - printf (".. anorm %e\n", anorm); fflush(stdout); - fflush(stdout); + printf(".. anorm %e\n", anorm); + fflush(stdout); + fflush(stdout); } #endif } - /* ------------------------------------------------------------ Perform the LU factorization. 
------------------------------------------------------------ */ if (!factored) { - t = SuperLU_timer_ (); + t = SuperLU_timer_(); /* * Get column permutation vector perm_c[], according to permc_spec: * permc_spec = NATURAL: natural ordering @@ -1127,7 +1135,7 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, if (parSymbFact == YES || permc_spec == PARMETIS) { nprocs_num = grid->nprow * grid->npcol; - noDomains = (int) (pow (2, ((int) LOG2 (nprocs_num)))); + noDomains = (int)(pow(2, ((int)LOG2(nprocs_num)))); /* create a new communicator for the first noDomains processes in grid->comm */ @@ -1136,7 +1144,7 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, col = 0; else col = MPI_UNDEFINED; - MPI_Comm_split (grid->comm, col, key, &symb_comm); + MPI_Comm_split(grid->comm, col, key, &symb_comm); if (permc_spec == NATURAL || permc_spec == MY_PERMC) { @@ -1145,10 +1153,10 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, for (j = 0; j < n; ++j) perm_c[j] = j; } - if (!(sizes = intMalloc_dist (2 * noDomains))) - ABORT ("SUPERLU_MALLOC fails for sizes."); - if (!(fstVtxSep = intMalloc_dist (2 * noDomains))) - ABORT ("SUPERLU_MALLOC fails for fstVtxSep."); + if (!(sizes = intMalloc_dist(2 * noDomains))) + ABORT("SUPERLU_MALLOC fails for sizes."); + if (!(fstVtxSep = intMalloc_dist(2 * noDomains))) + ABORT("SUPERLU_MALLOC fails for fstVtxSep."); for (i = 0; i < 2 * noDomains - 2; ++i) { sizes[i] = 0; @@ -1161,7 +1169,7 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, { /* same as before */ printf("{%4d,%4d}: pdgssvx3d: invalid ColPerm option when ParSymbfact is used\n", - (int) MYROW(grid->iam, grid), (int) MYCOL(grid->iam, grid)); + (int)MYROW(grid->iam, grid), (int)MYCOL(grid->iam, grid)); } } /* end ... use parmetis */ @@ -1174,19 +1182,19 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, * and does not modify it. It also allocates memory for * * sizes[] and fstVtxSep[] arrays, that contain information * * on the separator tree computed by ParMETIS. */ - flinfo = get_perm_c_parmetis (A, perm_r, perm_c, nprocs_num, - noDomains, &sizes, &fstVtxSep, - grid, &symb_comm); + flinfo = get_perm_c_parmetis(A, perm_r, perm_c, nprocs_num, + noDomains, &sizes, &fstVtxSep, + grid, &symb_comm); if (flinfo > 0) - ABORT ("ERROR in get perm_c parmetis."); + ABORT("ERROR in get perm_c parmetis."); } else { - get_perm_c_dist (iam, permc_spec, &GA, perm_c); + get_perm_c_dist(iam, permc_spec, &GA, perm_c); } } - stat->utime[COLPERM] = SuperLU_timer_ () - t; + stat->utime[COLPERM] = SuperLU_timer_() - t; /* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc' (a.k.a. column etree), depending on the choice of ColPerm. @@ -1199,10 +1207,10 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, int_t *GACcolbeg, *GACcolend, *GACrowind; - sp_colorder (options, &GA, perm_c, etree, &GAC); + sp_colorder(options, &GA, perm_c, etree, &GAC); /* Form Pc*A*Pc' to preserve the diagonal of the matrix GAC. */ - GACstore = (NCPformat *) GAC.Store; + GACstore = (NCPformat *)GAC.Store; GACcolbeg = GACstore->colbeg; GACcolend = GACstore->colend; GACrowind = GACstore->rowind; @@ -1215,49 +1223,47 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, } } - /* Perform a symbolic factorization on Pc*Pr*A*Pc' and set up the nonzero data structures for L & U. */ -#if ( PRNTlevel>=1 ) - if (!iam) { - printf - (".. 
symbfact(): relax %4d, maxsuper %4d, fill %4d\n", - sp_ienv_dist (2), sp_ienv_dist (3), sp_ienv_dist (6)); - fflush(stdout); +#if (PRNTlevel >= 1) + if (!iam) + { + printf(".. symbfact(): relax %4d, maxsuper %4d, fill %4d\n", + sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6)); + fflush(stdout); } #endif - t = SuperLU_timer_ (); + t = SuperLU_timer_(); if (!(Glu_freeable = (Glu_freeable_t *) - SUPERLU_MALLOC (sizeof (Glu_freeable_t)))) - ABORT ("Malloc fails for Glu_freeable."); + SUPERLU_MALLOC(sizeof(Glu_freeable_t)))) + ABORT("Malloc fails for Glu_freeable."); /* Every process does this. */ - iinfo = symbfact (options, iam, &GAC, perm_c, etree, - Glu_persist, Glu_freeable); + iinfo = symbfact(options, iam, &GAC, perm_c, etree, + Glu_persist, Glu_freeable); - stat->utime[SYMBFAC] = SuperLU_timer_ () - t; + stat->utime[SYMBFAC] = SuperLU_timer_() - t; if (iinfo < 0) { /* Successful return */ - QuerySpace_dist (n, -iinfo, Glu_freeable, &symb_mem_usage); + QuerySpace_dist(n, -iinfo, Glu_freeable, &symb_mem_usage); -#if ( PRNTlevel>=1 ) +#if (PRNTlevel >= 1) if (!iam) { - printf ("\tNo of supers %ld\n", - Glu_persist->supno[n - 1] + 1); - printf ("\tSize of G(L) %ld\n", - Glu_freeable->xlsub[n]); - printf ("\tSize of G(U) %ld\n", - Glu_freeable->xusub[n]); - printf ("\tint %d, short %d, float %d, double %d\n", - sizeof (int_t), sizeof (short), - sizeof (float), sizeof (double)); - printf - ("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", - symb_mem_usage.for_lu * 1e-6, - symb_mem_usage.total * 1e-6, - symb_mem_usage.expansions); + printf("\tNo of supers %ld\n", + Glu_persist->supno[n - 1] + 1); + printf("\tSize of G(L) %ld\n", + Glu_freeable->xlsub[n]); + printf("\tSize of G(U) %ld\n", + Glu_freeable->xusub[n]); + printf("\tint %d, short %d, float %d, double %d\n", + sizeof(int_t), sizeof(short), + sizeof(float), sizeof(double)); + printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", + symb_mem_usage.for_lu * 1e-6, + symb_mem_usage.total * 1e-6, + symb_mem_usage.expansions); fflush(stdout); } #endif @@ -1266,41 +1272,40 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, { if (!iam) { - fprintf (stderr, "symbfact() error returns %d\n", - (int) iinfo); - exit (-1); + fprintf(stderr, "symbfact() error returns %d\n", + (int)iinfo); + exit(-1); } } - } /* end serial symbolic factorization */ - else /* parallel symbolic factorization */ + } /* end serial symbolic factorization */ + else /* parallel symbolic factorization */ { - t = SuperLU_timer_ (); + t = SuperLU_timer_(); flinfo = - symbfact_dist (nprocs_num, noDomains, A, perm_c, perm_r, - sizes, fstVtxSep, &Pslu_freeable, - &(grid->comm), &symb_comm, - &symb_mem_usage); - stat->utime[SYMBFAC] = SuperLU_timer_ () - t; + symbfact_dist(nprocs_num, noDomains, A, perm_c, perm_r, + sizes, fstVtxSep, &Pslu_freeable, + &(grid->comm), &symb_comm, + &symb_mem_usage); + stat->utime[SYMBFAC] = SuperLU_timer_() - t; if (flinfo > 0) - ABORT - ("Insufficient memory for parallel symbolic factorization."); + ABORT("Insufficient memory for parallel symbolic factorization."); } /* Destroy GA */ if (parSymbFact == NO || options->RowPerm != NO) - Destroy_CompCol_Matrix_dist (&GA); + Destroy_CompCol_Matrix_dist(&GA); if (parSymbFact == NO) - Destroy_CompCol_Permuted_dist (&GAC); + Destroy_CompCol_Permuted_dist(&GAC); } /* end if Fact not SamePattern_SameRowPerm */ if (sizes) - SUPERLU_FREE (sizes); + SUPERLU_FREE(sizes); if (fstVtxSep) - SUPERLU_FREE (fstVtxSep); + SUPERLU_FREE(fstVtxSep); if (symb_comm != MPI_COMM_NULL) - 
MPI_Comm_free (&symb_comm); + MPI_Comm_free(&symb_comm); if (parSymbFact == NO || Fact == SamePattern_SameRowPerm) { @@ -1311,16 +1316,16 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. NOTE: the row permutation Pc*Pr is applied internally in the distribution routine. */ - t = SuperLU_timer_ (); - dist_mem_use = pddistribute (Fact, n, A, ScalePermstruct, - Glu_freeable, LUstruct, grid); - stat->utime[DIST] = SuperLU_timer_ () - t; + t = SuperLU_timer_(); + dist_mem_use = pddistribute(Fact, n, A, ScalePermstruct, + Glu_freeable, LUstruct, grid); + stat->utime[DIST] = SuperLU_timer_() - t; /* Deallocate storage used in symbolic factorization. */ if (Fact != SamePattern_SameRowPerm) { - iinfo = symbfact_SubFree (Glu_freeable); - SUPERLU_FREE (Glu_freeable); + iinfo = symbfact_SubFree(Glu_freeable); + SUPERLU_FREE(Glu_freeable); } } else @@ -1332,29 +1337,29 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, for (j = 0; j < nnz_loc; ++j) colind[j] = perm_c[colind[j]]; - t = SuperLU_timer_ (); - dist_mem_use = ddist_psymbtonum (Fact, n, A, ScalePermstruct, - &Pslu_freeable, LUstruct, grid); + t = SuperLU_timer_(); + dist_mem_use = ddist_psymbtonum(Fact, n, A, ScalePermstruct, + &Pslu_freeable, LUstruct, grid); if (dist_mem_use > 0) - ABORT ("Not enough memory available for dist_psymbtonum\n"); + ABORT("Not enough memory available for dist_psymbtonum\n"); - stat->utime[DIST] = SuperLU_timer_ () - t; + stat->utime[DIST] = SuperLU_timer_() - t; } /*if (!iam) printf ("\tDISTRIBUTE time %8.2f\n", stat->utime[DIST]); */ } /* end if not Factored */ - } /* end if process layer 0 */ + } /* end if process layer 0 */ - trf3Dpartition_t* trf3Dpartition; + trf3Dpartition_t *trf3Dpartition; /* Perform numerical factorization in parallel on all process layers.*/ - if ( !factored ) + if (!factored) { /* send the data across all the layers */ - MPI_Bcast( &m, 1, mpi_int_t, 0, grid3d->zscp.comm); - MPI_Bcast( &n, 1, mpi_int_t, 0, grid3d->zscp.comm); - MPI_Bcast( &anorm, 1, MPI_DOUBLE, 0, grid3d->zscp.comm); + MPI_Bcast(&m, 1, mpi_int_t, 0, grid3d->zscp.comm); + MPI_Bcast(&n, 1, mpi_int_t, 0, grid3d->zscp.comm); + MPI_Bcast(&anorm, 1, MPI_DOUBLE, 0, grid3d->zscp.comm); /* send the LU structure to all the grids */ dp3dScatter(n, LUstruct, grid3d); @@ -1362,24 +1367,25 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, int_t nsupers = getNsupers(n, LUstruct); trf3Dpartition = dinitTrf3Dpartition(nsupers, options, LUstruct, grid3d); - SCT_t *SCT = (SCT_t *) SUPERLU_MALLOC(sizeof(SCT_t)); + SCT_t *SCT = (SCT_t *)SUPERLU_MALLOC(sizeof(SCT_t)); SCT_init(SCT); -#if ( PRNTlevel>=1 ) +#if (PRNTlevel >= 1) if (iam == 0) { - printf("after 3D initialization.\n"); fflush(stdout); + printf("after 3D initialization.\n"); + fflush(stdout); } #endif - t = SuperLU_timer_ (); + t = SuperLU_timer_(); /*factorize in grid 1*/ // if(grid3d->zscp.Iam) - pdgstrf3d (options, m, n, anorm, trf3Dpartition, SCT, LUstruct, - grid3d, stat, info); - stat->utime[FACT] = SuperLU_timer_ () - t; + pdgstrf3d(options, m, n, anorm, trf3Dpartition, SCT, LUstruct, + grid3d, stat, info); + stat->utime[FACT] = SuperLU_timer_() - t; double tgather = SuperLU_timer_(); @@ -1388,7 +1394,7 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, SCT->gatherLUtimer += SuperLU_timer_() - tgather; /*print stats for bottom grid*/ -#if ( PRNTlevel>=1 ) +#if (PRNTlevel >= 1) if (!grid3d->zscp.Iam) { SCT_print(grid, SCT); @@ -1397,7 +1403,7 @@ pdgssvx3d 
(superlu_dist_options_t * options, SuperMatrix * A, SCT_printComm3D(grid3d, SCT); /*print memory usage*/ - d3D_printMemUse( trf3Dpartition, LUstruct, grid3d ); + d3D_printMemUse(trf3Dpartition, LUstruct, grid3d); /*print forest weight and costs*/ printForestWeightCost(trf3Dpartition->sForests, SCT, grid3d); @@ -1409,7 +1415,7 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, } /* end if not Factored */ - if ( grid3d->zscp.Iam == 0 ) // only process layer 0 + if (grid3d->zscp.Iam == 0) // only process layer 0 { if (!factored) { @@ -1418,58 +1424,59 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, int_t TinyPivots; float for_lu, total, max, avg, temp; - dQuerySpace_dist (n, LUstruct, grid, stat, &num_mem_usage); + dQuerySpace_dist(n, LUstruct, grid, stat, &num_mem_usage); if (parSymbFact == TRUE) { /* The memory used in the redistribution routine includes the memory used for storing the symbolic structure and the memory allocated for numerical factorization */ - temp = SUPERLU_MAX (symb_mem_usage.total, -dist_mem_use); + temp = SUPERLU_MAX(symb_mem_usage.total, -dist_mem_use); if (options->RowPerm != NO) - temp = SUPERLU_MAX (temp, GA_mem_use); + temp = SUPERLU_MAX(temp, GA_mem_use); } else { - temp = SUPERLU_MAX (symb_mem_usage.total + GA_mem_use, /* symbfact step */ - symb_mem_usage.for_lu + dist_mem_use + num_mem_usage.for_lu /* distribution step */ - ); + temp = SUPERLU_MAX(symb_mem_usage.total + GA_mem_use, /* symbfact step */ + symb_mem_usage.for_lu + dist_mem_use + num_mem_usage.for_lu /* distribution step */ + ); } - temp = SUPERLU_MAX (temp, num_mem_usage.total); + temp = SUPERLU_MAX(temp, num_mem_usage.total); - MPI_Reduce (&temp, &max, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm); - MPI_Reduce (&temp, &avg, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); - MPI_Allreduce (&stat->TinyPivots, &TinyPivots, 1, mpi_int_t, - MPI_SUM, grid->comm); + MPI_Reduce(&temp, &max, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm); + MPI_Reduce(&temp, &avg, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); + MPI_Allreduce(&stat->TinyPivots, &TinyPivots, 1, mpi_int_t, + MPI_SUM, grid->comm); stat->TinyPivots = TinyPivots; - MPI_Reduce (&num_mem_usage.for_lu, &for_lu, - 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); - MPI_Reduce (&num_mem_usage.total, &total, - 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); + MPI_Reduce(&num_mem_usage.for_lu, &for_lu, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); + MPI_Reduce(&num_mem_usage.total, &total, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); if (!iam) { printf("\tNUMfact space (MB) sum(procs): L\\U\t%.2f\tall\t%.2f\n", - for_lu * 1e-6, total * 1e-6); - printf ("\tTotal highmark (MB): " - "All\t%.2f\tAvg\t%.2f\tMax\t%.2f\n", avg * 1e-6, - avg / grid->nprow / grid->npcol * 1e-6, max * 1e-6); + for_lu * 1e-6, total * 1e-6); + printf("\tTotal highmark (MB): " + "All\t%.2f\tAvg\t%.2f\tMax\t%.2f\n", + avg * 1e-6, + avg / grid->nprow / grid->npcol * 1e-6, max * 1e-6); printf("**************************************************\n"); fflush(stdout); } } - } /* end if not Factored */ + } /* end if not Factored */ /* ------------------------------------------------------------ Compute the solution matrix X. 
------------------------------------------------------------ */ if (nrhs) { - if (!(b_work = doubleMalloc_dist (n))) - ABORT ("Malloc fails for b_work[]"); + if (!(b_work = doubleMalloc_dist(n))) + ABORT("Malloc fails for b_work[]"); /* ------------------------------------------------------ Scale the right-hand side if equilibration was performed @@ -1512,13 +1519,14 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, /* Save a copy of the right-hand side. */ ldx = ldb; - if (!(X = doubleMalloc_dist (((size_t) ldx) * nrhs))) - ABORT ("Malloc fails for X[]"); + if (!(X = doubleMalloc_dist(((size_t)ldx) * nrhs))) + ABORT("Malloc fails for X[]"); x_col = X; b_col = B; for (j = 0; j < nrhs; ++j) { - for (i = 0; i < m_loc; ++i) x_col[i] = b_col[i]; + for (i = 0; i < m_loc; ++i) + x_col[i] = b_col[i]; x_col += ldx; b_col += ldb; } @@ -1527,13 +1535,13 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, Solve the linear system. ------------------------------------------------------*/ if (options->SolveInitialized == NO) /* First time */ - /* Inside this routine, SolveInitialized is set to YES. + /* Inside this routine, SolveInitialized is set to YES. For repeated call to pdgssvx3d(), no need to re-initialilze the Solve data & communication structures, unless a new factorization with Fact == DOFACT or SamePattern is asked for. */ { - dSolveInit (options, A, perm_r, perm_c, nrhs, LUstruct, - grid, SOLVEstruct); + dSolveInit(options, A, perm_r, perm_c, nrhs, LUstruct, + grid, SOLVEstruct); } stat->utime[SOLVE] = 0.0; #if 0 // Sherry: the following interface is needed by 3D trisolve. @@ -1541,8 +1549,8 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, fst_row, ldb, nrhs, SOLVEstruct, stat, info); #else pdgstrs(n, LUstruct, ScalePermstruct, grid, X, m_loc, - fst_row, ldb, nrhs, SOLVEstruct, stat, info); - + fst_row, ldb, nrhs, SOLVEstruct, stat, info); + //PrintDouble5("after pdgstrs X", ldb, X); #endif @@ -1554,24 +1562,26 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, { /* Improve the solution by iterative refinement. */ int_t *it, *colind_gsmv = SOLVEstruct->A_colind_gsmv; - SOLVEstruct_t *SOLVEstruct1; /* Used by refinement. */ + SOLVEstruct_t *SOLVEstruct1; /* Used by refinement. */ - t = SuperLU_timer_ (); + t = SuperLU_timer_(); if (options->RefineInitialized == NO || Fact == DOFACT) { /* All these cases need to re-initialize gsmv structure */ if (options->RefineInitialized) - pdgsmv_finalize (SOLVEstruct->gsmv_comm); - pdgsmv_init (A, SOLVEstruct->row_to_proc, grid, - SOLVEstruct->gsmv_comm); + pdgsmv_finalize(SOLVEstruct->gsmv_comm); + pdgsmv_init(A, SOLVEstruct->row_to_proc, grid, + SOLVEstruct->gsmv_comm); /* Save a copy of the transformed local col indices in colind_gsmv[]. 
*/ - if (colind_gsmv) SUPERLU_FREE (colind_gsmv); - if (!(it = intMalloc_dist (nnz_loc))) - ABORT ("Malloc fails for colind_gsmv[]"); + if (colind_gsmv) + SUPERLU_FREE(colind_gsmv); + if (!(it = intMalloc_dist(nnz_loc))) + ABORT("Malloc fails for colind_gsmv[]"); colind_gsmv = SOLVEstruct->A_colind_gsmv = it; - for (i = 0; i < nnz_loc; ++i) colind_gsmv[i] = colind[i]; + for (i = 0; i < nnz_loc; ++i) + colind_gsmv[i] = colind[i]; options->RefineInitialized = YES; } else if (Fact == SamePattern || Fact == SamePattern_SameRowPerm) @@ -1580,7 +1590,7 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, int_t k, jcol, p; /* Swap to beginning the part of A corresponding to the local part of X, as was done in pdgsmv_init() */ - for (i = 0; i < m_loc; ++i) /* Loop through each row */ + for (i = 0; i < m_loc; ++i) /* Loop through each row */ { k = rowptr[i]; for (j = rowptr[i]; j < rowptr[i + 1]; ++j) @@ -1617,8 +1627,8 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, So we use SOLVEstruct1 for the refinement step. */ if (!(SOLVEstruct1 = (SOLVEstruct_t *) - SUPERLU_MALLOC (sizeof (SOLVEstruct_t)))) - ABORT ("Malloc fails for SOLVEstruct1"); + SUPERLU_MALLOC(sizeof(SOLVEstruct_t)))) + ABORT("Malloc fails for SOLVEstruct1"); /* Copy the same stuff */ SOLVEstruct1->row_to_proc = SOLVEstruct->row_to_proc; SOLVEstruct1->inv_perm_c = SOLVEstruct->inv_perm_c; @@ -1630,33 +1640,33 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, /* Initialize the *gstrs_comm for 1 RHS. */ if (!(SOLVEstruct1->gstrs_comm = (pxgstrs_comm_t *) - SUPERLU_MALLOC (sizeof (pxgstrs_comm_t)))) - ABORT ("Malloc fails for gstrs_comm[]"); - pxgstrs_init (n, m_loc, 1, fst_row, perm_r, perm_c, grid, - Glu_persist, SOLVEstruct1); + SUPERLU_MALLOC(sizeof(pxgstrs_comm_t)))) + ABORT("Malloc fails for gstrs_comm[]"); + pxgstrs_init(n, m_loc, 1, fst_row, perm_r, perm_c, grid, + Glu_persist, SOLVEstruct1); } - pdgsrfs (n, A, anorm, LUstruct, ScalePermstruct, grid, - B, ldb, X, ldx, nrhs, SOLVEstruct1, berr, stat, info); + pdgsrfs(n, A, anorm, LUstruct, ScalePermstruct, grid, + B, ldb, X, ldx, nrhs, SOLVEstruct1, berr, stat, info); /* Deallocate the storage associated with SOLVEstruct1 */ if (nrhs > 1) { - pxgstrs_finalize (SOLVEstruct1->gstrs_comm); - SUPERLU_FREE (SOLVEstruct1); + pxgstrs_finalize(SOLVEstruct1->gstrs_comm); + SUPERLU_FREE(SOLVEstruct1); } - stat->utime[REFINE] = SuperLU_timer_ () - t; + stat->utime[REFINE] = SuperLU_timer_() - t; } /* Permute the solution matrix B <= Pc'*X. */ - pdPermute_Dense_Matrix (fst_row, m_loc, SOLVEstruct->row_to_proc, - SOLVEstruct->inv_perm_c, - X, ldx, B, ldb, nrhs, grid); -#if ( DEBUGlevel>=2 ) - printf ("\n (%d) .. After pdPermute_Dense_Matrix(): b =\n", iam); + pdPermute_Dense_Matrix(fst_row, m_loc, SOLVEstruct->row_to_proc, + SOLVEstruct->inv_perm_c, + X, ldx, B, ldb, nrhs, grid); +#if (DEBUGlevel >= 2) + printf("\n (%d) .. After pdPermute_Dense_Matrix(): b =\n", iam); for (i = 0; i < m_loc; ++i) - printf ("\t(%d)\t%4d\t%.10f\n", iam, i + fst_row, B[i]); + printf("\t(%d)\t%4d\t%.10f\n", iam, i + fst_row, B[i]); #endif /* Transform the solution matrix X to a solution of the original @@ -1693,14 +1703,14 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, } } - SUPERLU_FREE (b_work); - SUPERLU_FREE (X); + SUPERLU_FREE(b_work); + SUPERLU_FREE(X); - } /* end if nrhs != 0 */ + } /* end if nrhs != 0 */ -#if ( PRNTlevel>=1 ) +#if (PRNTlevel >= 1) if (!iam) - printf (".. DiagScale = %d\n", ScalePermstruct->DiagScale); + printf(".. 
DiagScale = %d\n", ScalePermstruct->DiagScale); #endif /* Deallocate R and/or C if it was not used. */ @@ -1709,14 +1719,14 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, switch (ScalePermstruct->DiagScale) { case NOEQUIL: - SUPERLU_FREE (R); - SUPERLU_FREE (C); + SUPERLU_FREE(R); + SUPERLU_FREE(C); break; case ROW: - SUPERLU_FREE (C); + SUPERLU_FREE(C); break; case COL: - SUPERLU_FREE (R); + SUPERLU_FREE(R); break; } } @@ -1728,6 +1738,15 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, } /* process layer 0 done solve */ +#ifdef NRFRMT + dScatterB3d(Atmp, + A_orig, + B3d, ldb, nrhs, B2d, + grid3d); +#endif + // double *B, int ldb, int nrhs, double *B2d, + // gridinfo3d_t *grid3d); + // Sherry comment: // Now, B <=> B2d, and is filled with the solution X // B3d is the saved pointer of the B on input @@ -1745,17 +1764,17 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, #endif /* free storage, which are allocated only in layer 0 */ - if ( grid3d->zscp.Iam == 0 ) { // free matrix A and B2d on 2D - SUPERLU_FREE( Atmp.rowptr ); - SUPERLU_FREE( Atmp.colind ); - SUPERLU_FREE( Atmp.nzval ); - SUPERLU_FREE(B2d); + if (grid3d->zscp.Iam == 0) + { // free matrix A and B2d on 2D + SUPERLU_FREE(Atmp.rowptr); + SUPERLU_FREE(Atmp.colind); + SUPERLU_FREE(Atmp.nzval); + // SUPERLU_FREE(B2d); } - + A->Store = Astore3d; // restore Astore to 3D - -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC (iam, "Exit pdgssvx3d()"); -#endif +#if (DEBUGlevel >= 1) + CHECK_MALLOC(iam, "Exit pdgssvx3d()"); +#endif } diff --git a/SRC/superlu_defs.h b/SRC/superlu_defs.h index 4c08ceb7..30436361 100644 --- a/SRC/superlu_defs.h +++ b/SRC/superlu_defs.h @@ -1104,6 +1104,10 @@ NRformat_loc dGatherNRformat_loc(NRformat_loc *A, double* B, int ldb, int nrhs, double** B2d, gridinfo3d_t *grid3d); +int dScatterB3d(NRformat_loc A2d, NRformat_loc *A, + double *B, int ldb, int nrhs, double *B2d, + gridinfo3d_t *grid3d); + extern void DistPrint(char* function_name, double value, char* Units, gridinfo_t* grid); extern void DistPrint3D(char* function_name, double value, char* Units, gridinfo3d_t* grid3d); From 75cfc06d79d6dcc6647fc4f5828616f84725c0ca Mon Sep 17 00:00:00 2001 From: piyush sao Date: Tue, 13 Oct 2020 00:34:55 -0400 Subject: [PATCH 030/147] working code with nrformat_loc3d --- SRC/nrformat_loc.c | 153 +++++++++++++++++++++++++++++++++++++++------ SRC/pdgssvx3d.c | 32 ++++++---- SRC/superlu_defs.h | 5 +- SRC/supermatrix.h | 16 +++++ 4 files changed, 174 insertions(+), 32 deletions(-) diff --git a/SRC/nrformat_loc.c b/SRC/nrformat_loc.c index 287893cb..040dd5f4 100644 --- a/SRC/nrformat_loc.c +++ b/SRC/nrformat_loc.c @@ -146,6 +146,8 @@ NRformat_loc dGatherNRformat_loc(NRformat_loc *A, SUPERLU_FREE(B1); } + + #if 0 /* free storage */ SUPERLU_FREE(nnz_counts); @@ -162,10 +164,63 @@ NRformat_loc dGatherNRformat_loc(NRformat_loc *A, // X2d <- A^{-1} B2D -int dScatterB3d(NRformat_loc A2d, NRformat_loc *A, - double *B, int ldb, int nrhs, double *B2d, - gridinfo3d_t *grid3d) +int dScatterB3d_(NRformat_loc3d *A3d, gridinfo3d_t *grid3d) +{ + + double *B = A3d->B; + int ldb = A3d->ldb; + int nrhs = A3d->nrhs; + double *B2d = A3d->B2d; + NRformat_loc A2d = *(A3d->A_nfmt); + int m_loc = A3d->m_loc; + int *b_counts_int = A3d->b_counts_int; + int *b_disp = A3d->b_disp; + int *row_counts_int = A3d->row_counts_int; + int *row_disp = A3d->row_disp; + + double *B1; + if (grid3d->zscp.Iam == 0) + { + B1 = SUPERLU_MALLOC(A2d.m_loc * nrhs * sizeof(double)); + } + + // B1 <- blockByBock(b2d) + if (grid3d->zscp.Iam 
== 0)
+    {
+        for (int i = 0; i < grid3d->npdep; ++i)
+        {
+            /* code */
+            matCopy(row_counts_int[i], nrhs, B1 + nrhs * row_disp[i], row_counts_int[i],
+                    B2d + row_disp[i], A2d.m_loc);
+        }
+    }
+
+    //
+    double *Btmp;
+    Btmp = SUPERLU_MALLOC(A3d->m_loc * nrhs * sizeof(double));
+
+    // Bttmp <- scatterv(B1)
+    MPI_Scatterv(B1, b_counts_int, b_disp, MPI_DOUBLE,
+                 Btmp, nrhs * A3d->m_loc, MPI_DOUBLE, 0, grid3d->zscp.comm);
+
+    // B <- colMajor(Btmp)
+    matCopy(A3d->m_loc, nrhs, B, ldb, Btmp, A3d->m_loc);
+
+    return 0;
+}
+
+NRformat_loc3d *dGatherNRformat_loc3d(NRformat_loc *A,
+                                      double *B, int ldb, int nrhs,
+                                      gridinfo3d_t *grid3d)
+{
+    NRformat_loc3d *A3d = SUPERLU_MALLOC(sizeof(NRformat_loc3d));
+    NRformat_loc *A2d = SUPERLU_MALLOC(sizeof(NRformat_loc));
+    A3d->m_loc = A->m_loc;
+    A3d->B = B;
+    A3d->ldb = ldb;
+    A3d->nrhs = nrhs;
+
+    // find number of nnzs
     int_t *nnz_counts, *row_counts;
     int *nnz_disp, *row_disp, *nnz_counts_int, *row_counts_int;
     int *b_counts_int, *b_disp;
@@ -195,34 +250,94 @@ int dScatterB3d(NRformat_loc A2d, NRformat_loc *A,
         b_counts_int[i] = nrhs * row_counts[i];
     }
 
-    double *B1;
     if (grid3d->zscp.Iam == 0)
     {
-        B1 = SUPERLU_MALLOC(A2d.m_loc * nrhs * sizeof(double));
+        A2d->colind = SUPERLU_MALLOC(nnz_disp[grid3d->npdep] * sizeof(int_t));
+        A2d->nzval = SUPERLU_MALLOC(nnz_disp[grid3d->npdep] * sizeof(double));
+        A2d->rowptr = SUPERLU_MALLOC((row_disp[grid3d->npdep] + 1) * sizeof(int_t));
+        A2d->rowptr[0] = 0;
     }
 
-    // B1 <- blockByBock(b2d)
+    MPI_Gatherv(A->nzval, A->nnz_loc, MPI_DOUBLE, A2d->nzval,
+                nnz_counts_int, nnz_disp,
+                MPI_DOUBLE, 0, grid3d->zscp.comm);
+    MPI_Gatherv(A->colind, A->nnz_loc, mpi_int_t, A2d->colind,
+                nnz_counts_int, nnz_disp,
+                mpi_int_t, 0, grid3d->zscp.comm);
+    MPI_Gatherv(&A->rowptr[1], A->m_loc, mpi_int_t, &A2d->rowptr[1],
+                row_counts_int, row_disp,
+                mpi_int_t, 0, grid3d->zscp.comm);
+
     if (grid3d->zscp.Iam == 0)
     {
-        for (int i = 0; i < grid3d->npdep; ++i)
+        for (int i = 0; i < grid3d->npdep; i++)
         {
-            /* code */
-            matCopy(row_counts_int[i], nrhs, B1 + nrhs * row_disp[i], row_counts_int[i],
-                    B2d + row_disp[i], A2d.m_loc);
+            for (int j = row_disp[i] + 1; j < row_disp[i + 1] + 1; j++)
+            {
+                // A2d->rowptr[j] += row_disp[i];
+                A2d->rowptr[j] += nnz_disp[i];
+            }
         }
-
-    }
+        A2d->nnz_loc = nnz_disp[grid3d->npdep];
+        A2d->m_loc = row_disp[grid3d->npdep];
 #if 0
+        A2d->fst_row = A->fst_row; // This is a bug
 #else
+        gridinfo_t *grid2d = &(grid3d->grid2d);
+        int procs2d = grid2d->nprow * grid2d->npcol;
+        int m_loc_2d = A2d->m_loc;
+        int *m_loc_2d_counts = SUPERLU_MALLOC(procs2d * sizeof(int));
+
+        MPI_Allgather(&m_loc_2d, 1, MPI_INT, m_loc_2d_counts, 1, MPI_INT, grid2d->comm);
+
+        int fst_row = 0;
+        for (int p = 0; p < procs2d; ++p)
+        {
+            if (grid2d->iam == p)
+                A2d->fst_row = fst_row;
+            fst_row += m_loc_2d_counts[p];
+        }
+
+        SUPERLU_FREE(m_loc_2d_counts);
+#endif
+    }
+
+    // Btmp <- compact(B)
+    // compacting B
+    double *Btmp;
+    Btmp = SUPERLU_MALLOC(A->m_loc * nrhs * sizeof(double));
+    matCopy(A->m_loc, nrhs, Btmp, A->m_loc, B, ldb);
+
+    double *B1;
+    if (grid3d->zscp.Iam == 0)
+    {
+        B1 = SUPERLU_MALLOC(A2d->m_loc * nrhs * sizeof(double));
+        A3d->B2d = SUPERLU_MALLOC(A2d->m_loc * nrhs * sizeof(double));
+    }
+
+    // B1 <- gatherv(Btmp)
+    MPI_Gatherv(Btmp, nrhs * A->m_loc, MPI_DOUBLE, B1,
+                b_counts_int, b_disp,
+                MPI_DOUBLE, 0, grid3d->zscp.comm);
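+
+    /* (Editor's note, added commentary:) layout at this point: Btmp holds
+     * this layer's B packed column-major with leading dimension m_loc, so
+     * the MPI_Gatherv above concatenates the per-layer packs into B1 on
+     * layer 0; block i spans b_counts_int[i] = nrhs * row_counts_int[i]
+     * doubles starting at offset b_disp[i].  Each block is column-major
+     * only within itself, hence the matCopy loop below re-strides block i
+     * from leading dimension row_counts_int[i] to the global leading
+     * dimension A2d->m_loc, yielding a single column-major B2d on the
+     * 2D grid. */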
+
+    // B2d <- colMajor(B1)
+    if (grid3d->zscp.Iam == 0)
+    {
+        for (int i = 0; i < grid3d->npdep; ++i)
+        {
+            /* code */
+            matCopy(row_counts_int[i], nrhs, A3d->B2d + row_disp[i], A2d->m_loc,
+                    B1 + nrhs * row_disp[i], row_counts_int[i]);
+        }
+
+        SUPERLU_FREE(B1);
+    }
+
+    A3d->A_nfmt = A2d;
+    A3d->b_counts_int = b_counts_int;
+    A3d->b_disp = b_disp;
+    A3d->row_counts_int = row_counts_int;
+    A3d->row_disp = row_disp;
+
+    return A3d;
+    // , double **B2d,
 }
\ No newline at end of file
diff --git a/SRC/pdgssvx3d.c b/SRC/pdgssvx3d.c
index 3390b5ce..c7833a3a 100644
--- a/SRC/pdgssvx3d.c
+++ b/SRC/pdgssvx3d.c
@@ -619,19 +619,26 @@ void pdgssvx3d(superlu_dist_options_t *options, SuperMatrix *A,
 	   B3d and Astore3d will be restored on return */
 	int ldb3d = ldb;
 	double *B3d = B;
+
 	NRformat_loc *Astore3d = (NRformat_loc *)A->Store;
 
 	double *B2d;
-//int ldb2d; // not used
+#if 0
 	NRformat_loc Atmp = dGatherNRformat_loc(
 	    (NRformat_loc *)A->Store,
 	    B, ldb, nrhs, &B2d,
 	    grid3d);
 
-	// dGatherNRformat_loc(NRformat_loc3d * A3d, grid3d);
-	// dScatterBNRformat_loc(NRformat_loc3d * A3d, grid3d);
 	NRformat_loc *Astore0 = &Atmp; // Astore0 is on 2D
+#else
+	NRformat_loc3d *A3d = dGatherNRformat_loc3d(
+	    (NRformat_loc *)A->Store,
+	    B, ldb, nrhs, grid3d);
+	B2d = A3d->B2d;
+	NRformat_loc *Astore0 = A3d->A_nfmt;
+#endif
+
 	NRformat_loc *A_orig = A->Store;
 	if (grid3d->zscp.Iam == 0)
 	{
@@ -1739,11 +1746,12 @@ void pdgssvx3d(superlu_dist_options_t *options, SuperMatrix *A,
 	} /* process layer 0 done solve */
 
 #ifdef NRFRMT
-	dScatterB3d(Atmp,
-	            A_orig,
-	            B3d, ldb, nrhs, B2d,
-	            grid3d);
-#endif
+dScatterB3d_(A3d, grid3d);
+	// dScatterB3d(*(A3d->A_nfmt),
+	//             A_orig,
+	//             B3d, ldb, nrhs, B2d,
+	//             grid3d);
+#endif
 	// double *B, int ldb, int nrhs, double *B2d,
 	//     gridinfo3d_t *grid3d);
 
 	/* free storage, which are allocated only in layer 0 */
-	if ( grid3d->zscp.Iam == 0 ) { // free matrix A and B2d on 2D
-		SUPERLU_FREE( Atmp.rowptr );
-		SUPERLU_FREE( Atmp.colind );
-		SUPERLU_FREE( Atmp.nzval );
-		SUPERLU_FREE(B2d);
+	if (grid3d->zscp.Iam == 0)
+	{ // free matrix A and B2d on 2D
+		// SUPERLU_FREE(Atmp.rowptr);
+		// SUPERLU_FREE(Atmp.colind);
+		// SUPERLU_FREE(Atmp.nzval);
+		// SUPERLU_FREE(B2d);
 	}
-
+
 	A->Store = Astore3d; // restore Astore to 3D
-
-#if ( DEBUGlevel>=1 )
-	CHECK_MALLOC (iam, "Exit pdgssvx3d()");
-#endif
+#if (DEBUGlevel >= 1)
+	CHECK_MALLOC(iam, "Exit pdgssvx3d()");
+#endif
 }
diff --git a/SRC/superlu_defs.h b/SRC/superlu_defs.h
index 30436361..713599be 100644
--- a/SRC/superlu_defs.h
+++ b/SRC/superlu_defs.h
@@ -1108,7 +1108,10 @@ int dScatterB3d(NRformat_loc A2d, NRformat_loc *A,
                 double *B, int ldb, int nrhs, double *B2d,
                 gridinfo3d_t *grid3d);
 
-
+NRformat_loc3d* dGatherNRformat_loc3d(NRformat_loc *A,
+                                      double *B, int ldb, int nrhs,
+                                      gridinfo3d_t *grid3d);
+extern int dScatterB3d_(NRformat_loc3d *A3d, gridinfo3d_t *grid3d);
 extern void DistPrint(char* function_name, double value, char* Units, gridinfo_t* grid);
 extern void DistPrint3D(char* function_name, double value, char* Units, gridinfo3d_t* grid3d);
 extern void treeImbalance3D(gridinfo3d_t *grid3d, SCT_t* SCT);
diff --git a/SRC/supermatrix.h b/SRC/supermatrix.h
index 1c296530..955453b4 100644
--- a/SRC/supermatrix.h
+++ b/SRC/supermatrix.h
@@ -188,4 +188,20 @@ typedef struct {
 } NRformat_loc;
 
+typedef struct NRformat_loc3d
+{
+    NRformat_loc* A_nfmt;
+    double* B;
+    int ldb;
+    int nrhs;
+    int m_loc;
+    double* B2d;
+
+    int* row_counts_int;
+    int* row_disp;
+    int* b_counts_int;
+    int* b_disp;
+} NRformat_loc3d;
+
 #endif  /* __SUPERLU_SUPERMATRIX */

From 3a2ae7819eeb8cf2142c23086a72660318e08d27 Mon Sep 17 00:00:00 2001
From: piyush sao
Date: Thu, 22 Oct 2020 15:53:27 -0400
Subject: [PATCH 031/147] temporarily adding -g to compile commands

---
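[Editorial aside, placed in the notes area of this patch mail: the two commits
above settle the gather/scatter pairing that pdgssvx3d() now drives through
NRformat_loc3d. The sketch below restates the round trip in driver form; it is
illustrative only, using the entry points exactly as declared in
superlu_defs.h, with the 2D factor/solve step elided and roundtrip_sketch()
itself a hypothetical name, not library code.]

    #include "superlu_ddefs.h"

    void roundtrip_sketch(SuperMatrix *A, double *B, int ldb, int nrhs,
                          gridinfo3d_t *grid3d)
    {
        /* Gather the 3D-distributed CSR matrix and RHS onto 2D layer 0;
           A3d keeps the counts/displacements needed for the way back. */
        NRformat_loc3d *A3d = dGatherNRformat_loc3d((NRformat_loc *) A->Store,
                                                    B, ldb, nrhs, grid3d);

        if (grid3d->zscp.Iam == 0) {
            /* ... 2D factorization and solve on A3d->A_nfmt, with the
               right-hand side (and later the solution) in A3d->B2d ... */
        }

        /* Scatter the layer-0 solution in A3d->B2d back into each
           layer's B, reusing A3d->b_counts_int[] / A3d->b_disp[]. */
        dScatterB3d_(A3d, grid3d);
    }

[Design note: caching b_counts_int[]/b_disp[] inside NRformat_loc3d is what
answers the "can we reuse b_count[] and b_disp[] already computed in 'Gather'
routine?" question raised earlier in pdgssvx3d.c.]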
CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 62a5b283..dc67bd12 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -124,15 +124,15 @@ set(INSTALL_BIN_DIR "${default_install_bin_dir}" CACHE STRING "The folder where # Set up required compiler defines and options. ## get_directory_property( DirDefs COMPILE_DEFINITIONS ) -# set(CMAKE_C_FLAGS "-DDEBUGlevel=0 -DPRNTlevel=0 ${CMAKE_C_FLAGS}") -set(CMAKE_CXX_FLAGS "-std=c++11 ${CMAKE_CXX_FLAGS}") +# set(CMAKE_C_FLAGS "-g -DDEBUGlevel=0 -DPRNTlevel=0 ${CMAKE_C_FLAGS}") +set(CMAKE_CXX_FLAGS "-g -std=c++11 ${CMAKE_CXX_FLAGS}") if(XSDK_INDEX_SIZE EQUAL 64) message("-- Using 64 bit integer for index size.") endif() -set(CMAKE_C_FLAGS_RELEASE "-O3" CACHE STRING "") +set(CMAKE_C_FLAGS_RELEASE "-O3 -g" CACHE STRING "") message("cmake_c_flags_release '${CMAKE_C_FLAGS_RELEASE}'") message("cmake_shared_library_c_flags '${CMAKE_SHARED_LIBRARY_C_FLAGS}'") -set(CMAKE_CXX_FLAGS_RELEASE "-O3" CACHE STRING "") +set(CMAKE_CXX_FLAGS_RELEASE "-O3 -g" CACHE STRING "") ###################################################################### # From a9068f6e8d82059023ce777d95c2ad833bbea032 Mon Sep 17 00:00:00 2001 From: Sherry Li Date: Thu, 22 Oct 2020 13:38:30 -0700 Subject: [PATCH 032/147] Generated complex version; remains to be tested. Renamed routines and file name to accommodate complex. --- SRC/CMakeLists.txt | 3 +- SRC/Makefile | 8 +- SRC/{nrformat_loc.c => dnrformat_loc3d.c} | 197 +++++++----- SRC/pdgssvx3d.c | 4 +- SRC/superlu_ddefs.h | 8 + SRC/superlu_defs.h | 3 +- SRC/supermatrix.h | 9 +- SRC/znrformat_loc3d.c | 373 ++++++++++++++++++++++ 8 files changed, 512 insertions(+), 93 deletions(-) rename SRC/{nrformat_loc.c => dnrformat_loc3d.c} (82%) create mode 100644 SRC/znrformat_loc3d.c diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt index 93d59739..576412cc 100644 --- a/SRC/CMakeLists.txt +++ b/SRC/CMakeLists.txt @@ -22,7 +22,6 @@ endif () # first: precision-independent files # global.cpp set(sources - nrformat_loc.c sp_ienv.c etree.c sp_colorder.c @@ -104,6 +103,7 @@ if(enable_double) pdgsmv_AXglobal.c pdGetDiagU.c pdgssvx3d.c ## 3D code + dnrformat_loc3d.c pdgstrf3d.c dtreeFactorization.c dgather.c @@ -156,6 +156,7 @@ if(enable_complex16) pzgsmv_AXglobal.c pzGetDiagU.c pzgssvx3d.c ## 3D code + znrformat_loc3d.c pzgstrf3d.c ztreeFactorization.c zscatter3d.c diff --git a/SRC/Makefile b/SRC/Makefile index 239a1056..860c638a 100644 --- a/SRC/Makefile +++ b/SRC/Makefile @@ -71,10 +71,11 @@ DPLUSRC = pdgssvx.o pdgssvx_ABglobal.o \ pdgstrf.o pdgstrf2.o pdGetDiagU.o \ pdgstrs.o pdgstrs1.o pdgstrs_lsum.o pdgstrs_Bglobal.o \ pdgsrfs.o pdgsmv.o pdgsrfs_ABXglobal.o pdgsmv_AXglobal.o \ - dreadtriple_noheader.o nrformat_loc.o ##$(FACT3D) + dreadtriple_noheader.o # from 3D code DPLUSRC += pdgssvx3d.o pdgstrf3d.o dtreeFactorization.o dscatter3d.o \ - dgather.o pd3dcomm.o dtrfAux.o dcommunication_aux.o dtrfCommWrapper.o + dgather.o pd3dcomm.o dtrfAux.o dcommunication_aux.o dtrfCommWrapper.o \ + dnrformat_loc3d.o ##$(FACT3D) # # Routines for double complex parallel SuperLU @@ -88,7 +89,8 @@ ZPLUSRC = pzgssvx.o pzgssvx_ABglobal.o \ zreadtriple_noheader.o # from 3D code ZPLUSRC += pzgssvx3d.o pzgstrf3d.o ztreeFactorization.o zscatter3d.o \ - zgather.o pz3dcomm.o ztrfAux.o zcommunication_aux.o ztrfCommWrapper.o + zgather.o pz3dcomm.o ztrfAux.o zcommunication_aux.o ztrfCommWrapper.o \ + znrformat_loc3d.o ##$(FACT3D) all: double complex16 diff --git a/SRC/nrformat_loc.c b/SRC/dnrformat_loc3d.c 
similarity index 82% rename from SRC/nrformat_loc.c rename to SRC/dnrformat_loc3d.c index 040dd5f4..71db5247 100644 --- a/SRC/nrformat_loc.c +++ b/SRC/dnrformat_loc3d.c @@ -1,3 +1,28 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + + +/*! @file + * \brief Preprocessing routines for the 3D factorization/solve codes: + * - Gather {A,B} from 3D grid to 2D process layer 0 + * - Scatter B (solution) from 2D process layer 0 to 3D grid + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * Oak Ridge National Lab.
+ * October 22, 2020
+ */
+
 #include "superlu_ddefs.h"
 
 static void matCopy(int n, int m, double *Dst, int lddst, double *Src, int ldsrc)
@@ -10,29 +35,24 @@ static void matCopy(int n, int m, double *Dst, int lddst, double *Src, int ldsrc
 
     return;
 }
-// typedef struct {
-//     int_t nnz_loc;   /* number of nonzeros in the local submatrix */
-//     int_t m_loc;     /* number of rows local to this processor */
-//     int_t fst_row;   /* global index of the first row */
-//     void  *nzval;    /* pointer to array of nonzero values, packed by row */
-//     int_t *rowptr;   /* pointer to array of beginning of rows in nzval[]
-// 			and colind[]  */
-//     int_t *colind;   /* pointer to array of column indices of the nonzeros */
-//                      /* Note:
-// 			Zero-based indexing is used;
-// 			rowptr[] has n_loc + 1 entries, the last one pointing
-// 			beyond the last row, so that rowptr[n_loc] = nnz_loc.*/
-// } NRformat_loc;
 
 /*
- * Input:  {A, B, ldb} are distributed on 3D process grid
- * Output: {A2d, B2d} are distributed on layer 0 2D process grid
+ * Gather {A,B} from 3D grid to 2D process layer 0
+ *     Input:  {A, B, ldb} are distributed on 3D process grid
+ *     Output: {A2d, B2d} are distributed on layer 0 2D process grid
+ *             output is in the returned A3d->{} structure.
  */
-NRformat_loc dGatherNRformat_loc(NRformat_loc *A,
-                                 double *B, int ldb, int nrhs, double **B2d,
-                                 gridinfo3d_t *grid3d)
+NRformat_loc3d *dGatherNRformat_loc3d(NRformat_loc *A, // input
+                                      double *B,       // input
+				      int ldb, int nrhs, // input
+                                      gridinfo3d_t *grid3d)
 {
-    NRformat_loc A2d;
+    NRformat_loc3d *A3d = SUPERLU_MALLOC(sizeof(NRformat_loc3d));
+    NRformat_loc *A2d = SUPERLU_MALLOC(sizeof(NRformat_loc));
+    A3d->m_loc = A->m_loc;
+    A3d->B = (double *) B; // on 3D process grid
+    A3d->ldb = ldb;
+    A3d->nrhs = nrhs;
 
     // find number of nnzs
     int_t *nnz_counts, *row_counts;
@@ -66,19 +86,19 @@ NRformat_loc dGatherNRformat_loc(NRformat_loc *A,
 
     if (grid3d->zscp.Iam == 0)
     {
-        A2d.colind = SUPERLU_MALLOC(nnz_disp[grid3d->npdep] * sizeof(int_t));
-        A2d.nzval = SUPERLU_MALLOC(nnz_disp[grid3d->npdep] * sizeof(double));
-        A2d.rowptr = SUPERLU_MALLOC((row_disp[grid3d->npdep] + 1) * sizeof(int_t));
-        A2d.rowptr[0] = 0;
+        A2d->colind = SUPERLU_MALLOC(nnz_disp[grid3d->npdep] * sizeof(int_t));
+        A2d->nzval = SUPERLU_MALLOC(nnz_disp[grid3d->npdep] * sizeof(double));
+        A2d->rowptr = SUPERLU_MALLOC((row_disp[grid3d->npdep] + 1) * sizeof(int_t));
+        A2d->rowptr[0] = 0;
     }
 
-    MPI_Gatherv(A->nzval, A->nnz_loc, MPI_DOUBLE, A2d.nzval,
+    MPI_Gatherv(A->nzval, A->nnz_loc, MPI_DOUBLE, A2d->nzval,
                 nnz_counts_int, nnz_disp,
                 MPI_DOUBLE, 0, grid3d->zscp.comm);
-    MPI_Gatherv(A->colind, A->nnz_loc, mpi_int_t, A2d.colind,
+    MPI_Gatherv(A->colind, A->nnz_loc, mpi_int_t, A2d->colind,
                 nnz_counts_int, nnz_disp,
                 mpi_int_t, 0, grid3d->zscp.comm);
-    MPI_Gatherv(&A->rowptr[1], A->m_loc, mpi_int_t, &A2d.rowptr[1],
+    MPI_Gatherv(&A->rowptr[1], A->m_loc, mpi_int_t, &A2d->rowptr[1],
                 row_counts_int, row_disp,
                 mpi_int_t, 0, grid3d->zscp.comm);
 
@@ -88,18 +108,18 @@ NRformat_loc dGatherNRformat_loc(NRformat_loc *A,
         {
             for (int j = row_disp[i] + 1; j < row_disp[i + 1] + 1; j++)
             {
-                // A2d.rowptr[j] += row_disp[i];
-                A2d.rowptr[j] += nnz_disp[i];
+                // A2d->rowptr[j] += row_disp[i];
+                A2d->rowptr[j] += nnz_disp[i];
             }
         }
-        A2d.nnz_loc = nnz_disp[grid3d->npdep];
-        A2d.m_loc = row_disp[grid3d->npdep];
+        A2d->nnz_loc = nnz_disp[grid3d->npdep];
+        A2d->m_loc = row_disp[grid3d->npdep];
 #if 0	
-        A2d.fst_row = A->fst_row; // This is a bug
+        A2d->fst_row = A->fst_row; // This is a bug
 #else
         gridinfo_t *grid2d = &(grid3d->grid2d);
         int procs2d = grid2d->nprow * grid2d->npcol;
-        int m_loc_2d = A2d.m_loc;
+        int m_loc_2d = A2d->m_loc;
         int *m_loc_2d_counts = SUPERLU_MALLOC(procs2d * sizeof(int));
 
         MPI_Allgather(&m_loc_2d, 1, MPI_INT, m_loc_2d_counts, 1, MPI_INT, grid2d->comm);
@@ -108,7 +128,7 @@ NRformat_loc dGatherNRformat_loc(NRformat_loc *A,
         for (int p = 0; p < procs2d; ++p)
         {
             if (grid2d->iam == p)
-                A2d.fst_row = fst_row;
+                A2d->fst_row = fst_row;
             fst_row += m_loc_2d_counts[p];
         }
 
@@ -124,8 +144,8 @@ NRformat_loc dGatherNRformat_loc(NRformat_loc *A,
     double *B1;
     if (grid3d->zscp.Iam == 0)
     {
-        B1 = SUPERLU_MALLOC(A2d.m_loc * nrhs * sizeof(double));
-        *B2d = SUPERLU_MALLOC(A2d.m_loc * nrhs * sizeof(double));
+        B1 = SUPERLU_MALLOC(A2d->m_loc * nrhs * sizeof(double));
+        A3d->B2d = (double *) SUPERLU_MALLOC(A2d->m_loc * nrhs * sizeof(double));
     }
 
     // B1 <- gatherv(Btmp)
@@ -139,38 +159,43 @@ NRformat_loc dGatherNRformat_loc(NRformat_loc *A,
         for (int i = 0; i < grid3d->npdep; ++i)
         {
             /* code */
-            matCopy(row_counts_int[i], nrhs, *B2d + row_disp[i], A2d.m_loc,
-                    B1 + nrhs * row_disp[i], row_counts_int[i]);
+            matCopy(row_counts_int[i], nrhs, ((double*)A3d->B2d) + row_disp[i],
+		    A2d->m_loc, B1 + nrhs * row_disp[i], row_counts_int[i]);
         }
 
         SUPERLU_FREE(B1);
     }
 
-
+    A3d->A_nfmt = A2d;
+    A3d->b_counts_int = b_counts_int;
+    A3d->b_disp = b_disp;
+    A3d->row_counts_int = row_counts_int;
+    A3d->row_disp = row_disp;
 
 #if 0
     /* free storage */
     SUPERLU_FREE(nnz_counts);
-    SUPERLU_FREE(row_counts);
     SUPERLU_FREE(nnz_counts_int);
-    SUPERLU_FREE(row_counts_int);
+    SUPERLU_FREE(row_counts);
     SUPERLU_FREE(nnz_disp);
-    SUPERLU_FREE(row_disp);
-    SUPERLU_FREE(b_disp);
 #endif
 
-    return A2d;
-}
+    return A3d;
 
-// X2d <- A^{-1} B2D
+} /* dGatherNRformat_loc3d */
 
-int dScatterB3d_(NRformat_loc3d *A3d, gridinfo3d_t *grid3d)
+/*
+ * Scatter B (solution) from 2D process layer 0 to 3D grid
+ *   Output: X2d <- A^{-1} B2d
+ */
+int dScatter_B3d(NRformat_loc3d *A3d,  // modified
+		 gridinfo3d_t *grid3d)
 {
 
-    double *B = A3d->B;
+    double *B = (double *) A3d->B;
     int ldb = A3d->ldb;
     int nrhs = A3d->nrhs;
-    double *B2d = A3d->B2d;
+    double *B2d = (double *) A3d->B2d;
     NRformat_loc A2d = *(A3d->A_nfmt);
     int m_loc = A3d->m_loc;
     int *b_counts_int = A3d->b_counts_int;
@@ -206,19 +231,30 @@ int dScatterB3d_(NRformat_loc3d *A3d, gridinfo3d_t *grid3d)
     // B <- colMajor(Btmp)
     matCopy(A3d->m_loc, nrhs, B, ldb, Btmp, A3d->m_loc);
 
+#if 0
+    /* free storage */
+    SUPERLU_FREE(A3d->b_counts_int);
+    SUPERLU_FREE(A3d->b_disp);
+    SUPERLU_FREE(A3d->row_counts_int);
+    SUPERLU_FREE(A3d->row_disp);
+#endif
+
     return 0;
-}
+} /* dScatter_B3d */
 
-NRformat_loc3d *dGatherNRformat_loc3d(NRformat_loc *A,
-                                      double *B, int ldb, int nrhs,
-                                      gridinfo3d_t *grid3d)
+
+/*
+ * THIS ROUTINE IS NOT USED ANYMORE.
+ */
+NRformat_loc dGatherNRformat_loc(
+				 NRformat_loc *A, // input 
+                                 double *B,       // input
+				 int ldb,
+				 int nrhs,
+				 double **B2d,    // output
+                                 gridinfo3d_t *grid3d)
 {
-    NRformat_loc3d *A3d = SUPERLU_MALLOC(sizeof(NRformat_loc3d));
-    NRformat_loc *A2d = SUPERLU_MALLOC(sizeof(NRformat_loc));
-    A3d->m_loc = A->m_loc;
-    A3d->B = B;
-    A3d->ldb = ldb;
-    A3d->nrhs = nrhs;
+    NRformat_loc A2d;
 
     // find number of nnzs
     int_t *nnz_counts, *row_counts;
@@ -252,19 +288,19 @@ NRformat_loc3d *dGatherNRformat_loc3d(NRformat_loc *A,
 
     if (grid3d->zscp.Iam == 0)
     {
-        A2d->colind = SUPERLU_MALLOC(nnz_disp[grid3d->npdep] * sizeof(int_t));
-        A2d->nzval = SUPERLU_MALLOC(nnz_disp[grid3d->npdep] * sizeof(double));
-        A2d->rowptr = SUPERLU_MALLOC((row_disp[grid3d->npdep] + 1) * sizeof(int_t));
-        A2d->rowptr[0] = 0;
+        A2d.colind = SUPERLU_MALLOC(nnz_disp[grid3d->npdep] * sizeof(int_t));
+        A2d.nzval = SUPERLU_MALLOC(nnz_disp[grid3d->npdep] * sizeof(double));
+        A2d.rowptr = SUPERLU_MALLOC((row_disp[grid3d->npdep] + 1) * sizeof(int_t));
+        A2d.rowptr[0] = 0;
     }
 
-    MPI_Gatherv(A->nzval, A->nnz_loc, MPI_DOUBLE, A2d->nzval,
+    MPI_Gatherv(A->nzval, A->nnz_loc, MPI_DOUBLE, A2d.nzval,
                 nnz_counts_int, nnz_disp,
                 MPI_DOUBLE, 0, grid3d->zscp.comm);
-    MPI_Gatherv(A->colind, A->nnz_loc, mpi_int_t, A2d->colind,
+    MPI_Gatherv(A->colind, A->nnz_loc, mpi_int_t, A2d.colind,
                 nnz_counts_int, nnz_disp,
                 mpi_int_t, 0, grid3d->zscp.comm);
-    MPI_Gatherv(&A->rowptr[1], A->m_loc, mpi_int_t, &A2d->rowptr[1],
+    MPI_Gatherv(&A->rowptr[1], A->m_loc, mpi_int_t, &A2d.rowptr[1],
                 row_counts_int, row_disp,
                 mpi_int_t, 0, grid3d->zscp.comm);
 
@@ -274,18 +310,18 @@ NRformat_loc3d *dGatherNRformat_loc3d(NRformat_loc *A,
         {
             for (int j = row_disp[i] + 1; j < row_disp[i + 1] + 1; j++)
             {
-                // A2d->rowptr[j] += row_disp[i];
-                A2d->rowptr[j] += nnz_disp[i];
+                // A2d.rowptr[j] += row_disp[i];
+                A2d.rowptr[j] += nnz_disp[i];
             }
         }
-        A2d->nnz_loc = nnz_disp[grid3d->npdep];
-        A2d->m_loc = row_disp[grid3d->npdep];
+        A2d.nnz_loc = nnz_disp[grid3d->npdep];
+        A2d.m_loc = row_disp[grid3d->npdep];
 #if 0	
-        A2d->fst_row = A->fst_row; // This is a bug
+        A2d.fst_row = A->fst_row; // This is a bug
 #else
         gridinfo_t *grid2d = &(grid3d->grid2d);
         int procs2d = grid2d->nprow * grid2d->npcol;
-        int m_loc_2d = A2d->m_loc;
+        int m_loc_2d = A2d.m_loc;
         int *m_loc_2d_counts = SUPERLU_MALLOC(procs2d * sizeof(int));
 
         MPI_Allgather(&m_loc_2d, 1, MPI_INT, m_loc_2d_counts, 1, MPI_INT, grid2d->comm);
@@ -294,7 +330,7 @@ NRformat_loc3d *dGatherNRformat_loc3d(NRformat_loc *A,
         for (int p = 0; p < procs2d; ++p)
         {
             if (grid2d->iam == p)
-                A2d->fst_row = fst_row;
+                A2d.fst_row = fst_row;
             fst_row += m_loc_2d_counts[p];
         }
 
@@ -310,8 +346,8 @@ NRformat_loc3d *dGatherNRformat_loc3d(NRformat_loc *A,
     double *B1;
     if (grid3d->zscp.Iam == 0)
     {
-        B1 = SUPERLU_MALLOC(A2d->m_loc * nrhs * sizeof(double));
-        A3d->B2d = SUPERLU_MALLOC(A2d->m_loc * nrhs * sizeof(double));
+        B1 = SUPERLU_MALLOC(A2d.m_loc * nrhs * sizeof(double));
+        *B2d = SUPERLU_MALLOC(A2d.m_loc * nrhs * sizeof(double));
     }
 
     // B1 <- gatherv(Btmp)
@@ -325,19 +361,14 @@ NRformat_loc3d *dGatherNRformat_loc3d(NRformat_loc *A,
         for (int i = 0; i < grid3d->npdep; ++i)
         {
             /* code */
-            matCopy(row_counts_int[i], nrhs, A3d->B2d + row_disp[i], A2d->m_loc,
+            matCopy(row_counts_int[i], nrhs, *B2d + row_disp[i], A2d.m_loc,
                     B1 + nrhs * row_disp[i], row_counts_int[i]);
         }
 
         SUPERLU_FREE(B1);
     }
 
-    A3d->A_nfmt = A2d;
-    A3d->b_counts_int = b_counts_int;
-    A3d->b_disp = b_disp;
-    A3d->row_counts_int = row_counts_int;
-    A3d->row_disp = row_disp;
 
-    return A3d;
-    // , double **B2d,
-}
\ No newline at end of file
+    return A2d;
+}
+
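The `#else` branch above replaces the incorrect `fst_row = A->fst_row` with an exclusive prefix sum over the per-process row counts of the 2D grid, obtained with MPI_Allgather. A minimal standalone sketch of the same pattern, assuming plain MPI_COMM_WORLD and a made-up m_loc in place of grid2d->comm and A2d->m_loc:

    #include <mpi.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main(int argc, char **argv)
    {
        int rank, nprocs;
        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

        int m_loc = 100 + rank;                 /* stand-in for A2d->m_loc */
        int *counts = malloc(nprocs * sizeof(int));

        /* every rank learns every other rank's local row count */
        MPI_Allgather(&m_loc, 1, MPI_INT, counts, 1, MPI_INT, MPI_COMM_WORLD);

        /* exclusive prefix sum: global index of this rank's first row */
        int fst_row = 0;
        for (int p = 0; p < rank; ++p)
            fst_row += counts[p];

        printf("rank %d: m_loc = %d, fst_row = %d\n", rank, m_loc, fst_row);

        free(counts);
        MPI_Finalize();
        return 0;
    }

MPI_Exscan would compute the same offset in a single collective; the Allgather form used in the patch additionally leaves the full counts array on every rank, where it can be reused for later counts/displacements.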
diff --git a/SRC/pdgssvx3d.c b/SRC/pdgssvx3d.c
index c7833a3a..0dd75612 100644
--- a/SRC/pdgssvx3d.c
+++ b/SRC/pdgssvx3d.c
@@ -633,7 +633,7 @@ void pdgssvx3d(superlu_dist_options_t *options, SuperMatrix *A,
 	NRformat_loc *Astore0 = &Atmp; // Astore0 is on 2D
 #else
 	NRformat_loc3d *A3d = dGatherNRformat_loc3d(
-		(NRformat_loc *)A->Store,
+	(NRformat_loc *)A->Store,
 		B, ldb, nrhs, grid3d);
 	B2d = A3d->B2d; 
 	NRformat_loc *Astore0 = A3d->A_nfmt; 
@@ -1746,7 +1746,7 @@ void pdgssvx3d(superlu_dist_options_t *options, SuperMatrix *A,
 	} /* process layer 0 done solve */
 
 #ifdef NRFRMT
-dScatterB3d_(A3d, grid3d);
+	dScatter_B3d(A3d, grid3d);
 	// dScatterB3d(*(A3d->A_nfmt),
 	// 			A_orig,
 	// 			B3d, ldb, nrhs, B2d,
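Taken together, the hunks above reduce the driver's 3D pre/post-processing to a gather / layer-0 solve / scatter sequence around the new NRformat_loc3d structure. A minimal sketch of that calling pattern, with the actual factorization and triangular solves elided behind a hypothetical solve_on_layer0 placeholder (not a SuperLU_DIST routine):

    #include "superlu_ddefs.h"

    /* Hypothetical stand-in for the 2D factorization/solve done on
     * layer 0; not a SuperLU_DIST routine. */
    extern void solve_on_layer0(NRformat_loc *A2d, double *B2d, int nrhs,
                                gridinfo3d_t *grid3d);

    void solve_3d_sketch(SuperMatrix *A, double *B, int ldb, int nrhs,
                         gridinfo3d_t *grid3d)
    {
        /* gather {A, B} from the 3D grid onto 2D process layer 0 */
        NRformat_loc3d *A3d = dGatherNRformat_loc3d((NRformat_loc *) A->Store,
                                                    B, ldb, nrhs, grid3d);

        if (grid3d->zscp.Iam == 0) {
            /* only layer 0 owns the gathered A3d->A_nfmt and A3d->B2d */
            solve_on_layer0(A3d->A_nfmt, (double *) A3d->B2d, nrhs, grid3d);
        }

        /* scatter the layer-0 solution in B2d back into B on the 3D grid */
        dScatter_B3d(A3d, grid3d);
    }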
diff --git a/SRC/superlu_ddefs.h b/SRC/superlu_ddefs.h
index 2bf22496..28c2559a 100644
--- a/SRC/superlu_ddefs.h
+++ b/SRC/superlu_ddefs.h
@@ -340,6 +340,14 @@ extern int dcreate_matrix3d(SuperMatrix *A, int nrhs, double **rhs,
 extern int dcreate_matrix_postfix3d(SuperMatrix *A, int nrhs, double **rhs,
                            int *ldb, double **x, int *ldx,
                            FILE *fp, char * postfix, gridinfo3d_t *grid3d);
+    
+/* Gathers a matrix distributed in NRformat_loc on the 3D process grid
+   into an NRformat_loc matrix distributed on the 2D grid in grid-0 */
+extern NRformat_loc3d *dGatherNRformat_loc3d(NRformat_loc *A, double *B,
+					     int ldb, int nrhs,
+					     gridinfo3d_t *grid3d);
+extern int dScatter_B3d(NRformat_loc3d *A3d, gridinfo3d_t *grid3d);
+    
 
 /* Driver related */
 extern void    dgsequ_dist (SuperMatrix *, double *, double *, double *,
diff --git a/SRC/superlu_defs.h b/SRC/superlu_defs.h
index 713599be..909f90f6 100644
--- a/SRC/superlu_defs.h
+++ b/SRC/superlu_defs.h
@@ -1105,13 +1105,14 @@ NRformat_loc dGatherNRformat_loc(NRformat_loc *A,
     gridinfo3d_t *grid3d);
 
 int dScatterB3d(NRformat_loc A2d, NRformat_loc *A,
-                         double *B, int ldb, int nrhs, double *B2d,
+		 double *B, int ldb, int nrhs, double *B2d,
                          gridinfo3d_t *grid3d);
 
 NRformat_loc3d*  dGatherNRformat_loc3d(NRformat_loc *A,
                                  double *B, int ldb, int nrhs,
                                  gridinfo3d_t *grid3d);
 extern int dScatterB3d_(NRformat_loc3d *A3d, gridinfo3d_t *grid3d);
+    
 extern void DistPrint(char* function_name,  double value, char* Units, gridinfo_t* grid);
 extern void DistPrint3D(char* function_name,  double value, char* Units, gridinfo3d_t* grid3d);
 extern void treeImbalance3D(gridinfo3d_t *grid3d, SCT_t* SCT);
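dGatherNRformat_loc3d returns B2d as one column-major array with leading dimension A2d->m_loc, assembled by interleaving the per-layer blocks that MPI_Gatherv delivered back to back in B1. A self-contained toy run of that strided interleave; the matCopy loop matches the static one in dnrformat_loc3d.c, while the sizes and values are made up:

    #include <stdio.h>

    /* same strided copy as the static matCopy in dnrformat_loc3d.c */
    static void matCopy(int n, int m, double *Dst, int lddst,
                        double *Src, int ldsrc)
    {
        for (int j = 0; j < m; j++)
            for (int i = 0; i < n; i++)
                Dst[i + lddst * j] = Src[i + ldsrc * j];
    }

    int main(void)
    {
        /* two gathered layers: 2 rows then 1 row, nrhs = 2; B1 holds the
           blocks back to back, each column-major with its own leading dim */
        int nrhs = 2, m2d = 3;
        int row_counts_int[] = { 2, 1 }, row_disp[] = { 0, 2 };
        double B1[] = { 1, 2, 10, 20,  /* layer 0: cols {1,2} and {10,20} */
                        3, 30 };       /* layer 1: cols {3} and {30}      */
        double B2d[6];

        /* interleave the blocks into one column-major array with ld = m2d */
        for (int i = 0; i < 2; i++)
            matCopy(row_counts_int[i], nrhs, B2d + row_disp[i], m2d,
                    B1 + nrhs * row_disp[i], row_counts_int[i]);

        for (int j = 0; j < nrhs; j++)
            for (int i = 0; i < m2d; i++)
                printf("B2d(%d,%d) = %g\n", i, j, B2d[i + m2d * j]);
        /* prints columns {1,2,3} and {10,20,30} */
        return 0;
    }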
diff --git a/SRC/supermatrix.h b/SRC/supermatrix.h
index 955453b4..f1c8c8b1 100644
--- a/SRC/supermatrix.h
+++ b/SRC/supermatrix.h
@@ -188,16 +188,19 @@ typedef struct {
 } NRformat_loc;
 
 
+/* Data structure for the 3D-distributed matrix and RHS gathered onto layer 0 of the 2D process grid */
 typedef struct NRformat_loc3d
 {
     NRformat_loc* A_nfmt; 
-    double* B;
+    void* B;   // distributed on 3D process grid
+    //double* B;   // distributed on 3D process grid
     int  ldb;
     int nrhs;
     int m_loc; 
-    double* B2d;
+    void* B2d; // on 2D process layer
+    //double* B2d; // on 2D process layer
 
-    int* row_counts_int;
+    int* row_counts_int; // these counts are for {A, B} distributed on 2D layer 0
     int* row_disp;
     int* b_counts_int;
     int* b_disp;
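Switching B and B2d from double* to void* lets a single NRformat_loc3d carry either real or complex right-hand sides; each precision-specific routine casts on use, exactly as dnrformat_loc3d.c and the new znrformat_loc3d.c below do. A small sketch of the idea; the precision tag argument is purely illustrative, since the library itself dispatches by routine name (d-prefix vs. z-prefix), not by a tag:

    #include <stdio.h>
    #include "superlu_zdefs.h"   /* doublecomplex, NRformat_loc3d */

    /* Illustrative only: SuperLU_DIST has no such tag; the d- and z-named
       routines each know which element type A3d->B2d really holds. */
    static void print_first_rhs_entry(NRformat_loc3d *A3d, char precision)
    {
        if (precision == 'z') {
            doublecomplex *B2d = (doublecomplex *) A3d->B2d; /* cast on use */
            printf("B2d[0] = %e + %e i\n", B2d[0].r, B2d[0].i);
        } else {
            double *B2d = (double *) A3d->B2d;
            printf("B2d[0] = %e\n", B2d[0]);
        }
    }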
diff --git a/SRC/znrformat_loc3d.c b/SRC/znrformat_loc3d.c
new file mode 100644
index 00000000..b2a70a86
--- /dev/null
+++ b/SRC/znrformat_loc3d.c
@@ -0,0 +1,373 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Preprocessing routines for the 3D factorization/solve codes:
+ *        - Gather {A,B} from 3D grid to 2D process layer 0
+ *        - Scatter B (solution) from 2D process layer 0 to 3D grid
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * Oak Ridge National Lab.
+ * October 22, 2020
+ */
+
+#include "superlu_zdefs.h"
+
+static void matCopy(int n, int m, doublecomplex *Dst, int lddst, doublecomplex *Src, int ldsrc)
+{
+    for (int j = 0; j < m; j++)
+        for (int i = 0; i < n; ++i)
+        {
+            Dst[i + lddst * j] = Src[i + ldsrc * j];
+        }
+
+    return;
+}
+
+/*
+ * Gather {A,B} from 3D grid to 2D process layer 0
+ *     Input:  {A, B, ldb} are distributed on 3D process grid
+ *     Output: {A2d, B2d} are distributed on layer 0 2D process grid
+ *             output is in the returned A3d->{} structure.
+ */
+NRformat_loc3d *zGatherNRformat_loc3d(NRformat_loc *A, // input
+                                      doublecomplex *B,       // input
+				      int ldb, int nrhs, // input
+                                      gridinfo3d_t *grid3d)
+{
+    NRformat_loc3d *A3d = SUPERLU_MALLOC(sizeof(NRformat_loc3d));
+    NRformat_loc *A2d = SUPERLU_MALLOC(sizeof(NRformat_loc));
+    A3d->m_loc = A->m_loc;
+    A3d->B = (doublecomplex *) B; // on 3D process grid
+    A3d->ldb = ldb;
+    A3d->nrhs = nrhs;
+
+    // find number of nnzs
+    int_t *nnz_counts, *row_counts;
+    int *nnz_disp, *row_disp, *nnz_counts_int, *row_counts_int;
+    int *b_counts_int, *b_disp;
+    nnz_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t));
+    row_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t));
+    nnz_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
+    row_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
+    b_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
+    MPI_Gather(&A->nnz_loc, 1, mpi_int_t, nnz_counts,
+               1, mpi_int_t, 0, grid3d->zscp.comm);
+    MPI_Gather(&A->m_loc, 1, mpi_int_t, row_counts,
+               1, mpi_int_t, 0, grid3d->zscp.comm);
+    nnz_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
+    row_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
+    b_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
+
+    nnz_disp[0] = 0;
+    row_disp[0] = 0;
+    b_disp[0] = 0;
+    for (int i = 0; i < grid3d->npdep; i++)
+    {
+        nnz_disp[i + 1] = nnz_disp[i] + nnz_counts[i];
+        row_disp[i + 1] = row_disp[i] + row_counts[i];
+        b_disp[i + 1] = nrhs * row_disp[i + 1];
+        nnz_counts_int[i] = nnz_counts[i];
+        row_counts_int[i] = row_counts[i];
+        b_counts_int[i] = nrhs * row_counts[i];
+    }
+
+    if (grid3d->zscp.Iam == 0)
+    {
+        A2d->colind = SUPERLU_MALLOC(nnz_disp[grid3d->npdep] * sizeof(int_t));
+        A2d->nzval = SUPERLU_MALLOC(nnz_disp[grid3d->npdep] * sizeof(doublecomplex));
+        A2d->rowptr = SUPERLU_MALLOC((row_disp[grid3d->npdep] + 1) * sizeof(int_t));
+        A2d->rowptr[0] = 0;
+    }
+
+    MPI_Gatherv(A->nzval, A->nnz_loc, SuperLU_MPI_DOUBLE_COMPLEX, A2d->nzval,
+                nnz_counts_int, nnz_disp,
+                SuperLU_MPI_DOUBLE_COMPLEX, 0, grid3d->zscp.comm);
+    MPI_Gatherv(A->colind, A->nnz_loc, mpi_int_t, A2d->colind,
+                nnz_counts_int, nnz_disp,
+                mpi_int_t, 0, grid3d->zscp.comm);
+    MPI_Gatherv(&A->rowptr[1], A->m_loc, mpi_int_t, &A2d->rowptr[1],
+                row_counts_int, row_disp,
+                mpi_int_t, 0, grid3d->zscp.comm);
+
+    if (grid3d->zscp.Iam == 0)
+    {
+        for (int i = 0; i < grid3d->npdep; i++)
+        {
+            for (int j = row_disp[i] + 1; j < row_disp[i + 1] + 1; j++)
+            {
+                // A2d->rowptr[j] += row_disp[i];
+                A2d->rowptr[j] += nnz_disp[i];
+            }
+        }
+        A2d->nnz_loc = nnz_disp[grid3d->npdep];
+        A2d->m_loc = row_disp[grid3d->npdep];
+#if 0	
+        A2d->fst_row = A->fst_row; // This is a bug
+#else
+        gridinfo_t *grid2d = &(grid3d->grid2d);
+        int procs2d = grid2d->nprow * grid2d->npcol;
+        int m_loc_2d = A2d->m_loc;
+        int *m_loc_2d_counts = SUPERLU_MALLOC(procs2d * sizeof(int));
+
+        MPI_Allgather(&m_loc_2d, 1, MPI_INT, m_loc_2d_counts, 1, MPI_INT, grid2d->comm);
+
+        int fst_row = 0;
+        for (int p = 0; p < procs2d; ++p)
+        {
+            if (grid2d->iam == p)
+                A2d->fst_row = fst_row;
+            fst_row += m_loc_2d_counts[p];
+        }
+
+        SUPERLU_FREE(m_loc_2d_counts);
+#endif
+    }
+    // Btmp <- compact(B)
+    // compacting B
+    doublecomplex *Btmp;
+    Btmp = SUPERLU_MALLOC(A->m_loc * nrhs * sizeof(doublecomplex));
+    matCopy(A->m_loc, nrhs, Btmp, A->m_loc, B, ldb);
+
+    doublecomplex *B1;
+    if (grid3d->zscp.Iam == 0)
+    {
+        B1 = SUPERLU_MALLOC(A2d->m_loc * nrhs * sizeof(doublecomplex));
+        A3d->B2d = (doublecomplex *) SUPERLU_MALLOC(A2d->m_loc * nrhs * sizeof(doublecomplex));
+    }
+
+    // B1 <- gatherv(Btmp)
+    MPI_Gatherv(Btmp, nrhs * A->m_loc, SuperLU_MPI_DOUBLE_COMPLEX, B1,
+                b_counts_int, b_disp,
+                SuperLU_MPI_DOUBLE_COMPLEX, 0, grid3d->zscp.comm);
+
+    // B2d <- colMajor(B1)
+    if (grid3d->zscp.Iam == 0)
+    {
+        for (int i = 0; i < grid3d->npdep; ++i)
+        {
+            /* code */
+            matCopy(row_counts_int[i], nrhs, ((doublecomplex*)A3d->B2d) + row_disp[i],
+		    A2d->m_loc, B1 + nrhs * row_disp[i], row_counts_int[i]);
+        }
+
+        SUPERLU_FREE(B1);
+    }
+
+    A3d->A_nfmt = A2d;
+    A3d->b_counts_int = b_counts_int;
+    A3d->b_disp = b_disp;
+    A3d->row_counts_int = row_counts_int;
+    A3d->row_disp = row_disp;
+
+#if 0
+    /* free storage */
+    SUPERLU_FREE(nnz_counts);
+    SUPERLU_FREE(nnz_counts_int);
+    SUPERLU_FREE(row_counts);
+    SUPERLU_FREE(nnz_disp);
+#endif
+
+    return A3d;
+
+} /* zGatherNRformat_loc3d */
+
+/*
+ * Scatter B (solution) from 2D process layer 0 to 3D grid
+ *   Output: X2d <- A^{-1} B2d
+ */
+int zScatter_B3d(NRformat_loc3d *A3d,  // modified
+		 gridinfo3d_t *grid3d)
+{
+
+    doublecomplex *B = (doublecomplex *) A3d->B;
+    int ldb = A3d->ldb;
+    int nrhs = A3d->nrhs;
+    doublecomplex *B2d = (doublecomplex *) A3d->B2d;
+    NRformat_loc A2d = *(A3d->A_nfmt);
+    int m_loc = A3d->m_loc;
+    int *b_counts_int = A3d->b_counts_int;
+    int *b_disp = A3d->b_disp;
+    int *row_counts_int = A3d->row_counts_int;
+    int *row_disp = A3d->row_disp;
+
+    doublecomplex *B1;
+    if (grid3d->zscp.Iam == 0)
+    {
+        B1 = SUPERLU_MALLOC(A2d.m_loc * nrhs * sizeof(doublecomplex));
+    }
+
+    // B1 <- blockByBlock(B2d)
+    if (grid3d->zscp.Iam == 0)
+    {
+        for (int i = 0; i < grid3d->npdep; ++i)
+        {
+            /* code */
+            matCopy(row_counts_int[i], nrhs, B1 + nrhs * row_disp[i], row_counts_int[i],
+                    B2d + row_disp[i], A2d.m_loc);
+        }
+    }
+
+    // Btmp: receive buffer for this process's rows of the solution
+    doublecomplex *Btmp;
+    Btmp = SUPERLU_MALLOC(A3d->m_loc * nrhs * sizeof(doublecomplex));
+
+    // Btmp <- scatterv(B1)
+    MPI_Scatterv(B1, b_counts_int, b_disp, SuperLU_MPI_DOUBLE_COMPLEX,
+                 Btmp, nrhs * A3d->m_loc, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid3d->zscp.comm);
+
+    // B <- colMajor(Btmp)
+    matCopy(A3d->m_loc, nrhs, B, ldb, Btmp, A3d->m_loc);
+
+#if 0
+    /* free storage */
+    SUPERLU_FREE(A3d->b_counts_int);
+    SUPERLU_FREE(A3d->b_disp);
+    SUPERLU_FREE(A3d->row_counts_int);
+    SUPERLU_FREE(A3d->row_disp);
+#endif
+
+    return 0;
+} /* zScatter_B3d */
+
+
+/*
+ * THIS ROUTINE IS NOT USED ANYMORE.
+ */
+NRformat_loc dGatherNRformat_loc(
+				 NRformat_loc *A, // input 
+                                 double *B,       // input
+				 int ldb,
+				 int nrhs,
+				 double **B2d,    // output
+                                 gridinfo3d_t *grid3d)
+{
+    NRformat_loc A2d;
+
+    // find number of nnzs
+    int_t *nnz_counts, *row_counts;
+    int *nnz_disp, *row_disp, *nnz_counts_int, *row_counts_int;
+    int *b_counts_int, *b_disp;
+    nnz_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t));
+    row_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t));
+    nnz_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
+    row_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
+    b_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
+    MPI_Gather(&A->nnz_loc, 1, mpi_int_t, nnz_counts,
+               1, mpi_int_t, 0, grid3d->zscp.comm);
+    MPI_Gather(&A->m_loc, 1, mpi_int_t, row_counts,
+               1, mpi_int_t, 0, grid3d->zscp.comm);
+    nnz_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
+    row_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
+    b_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
+
+    nnz_disp[0] = 0;
+    row_disp[0] = 0;
+    b_disp[0] = 0;
+    for (int i = 0; i < grid3d->npdep; i++)
+    {
+        nnz_disp[i + 1] = nnz_disp[i] + nnz_counts[i];
+        row_disp[i + 1] = row_disp[i] + row_counts[i];
+        b_disp[i + 1] = nrhs * row_disp[i + 1];
+        nnz_counts_int[i] = nnz_counts[i];
+        row_counts_int[i] = row_counts[i];
+        b_counts_int[i] = nrhs * row_counts[i];
+    }
+
+    if (grid3d->zscp.Iam == 0)
+    {
+        A2d.colind = SUPERLU_MALLOC(nnz_disp[grid3d->npdep] * sizeof(int_t));
+        A2d.nzval = SUPERLU_MALLOC(nnz_disp[grid3d->npdep] * sizeof(double));
+        A2d.rowptr = SUPERLU_MALLOC((row_disp[grid3d->npdep] + 1) * sizeof(int_t));
+        A2d.rowptr[0] = 0;
+    }
+
+    MPI_Gatherv(A->nzval, A->nnz_loc, MPI_DOUBLE, A2d.nzval,
+                nnz_counts_int, nnz_disp,
+                MPI_DOUBLE, 0, grid3d->zscp.comm);
+    MPI_Gatherv(A->colind, A->nnz_loc, mpi_int_t, A2d.colind,
+                nnz_counts_int, nnz_disp,
+                mpi_int_t, 0, grid3d->zscp.comm);
+    MPI_Gatherv(&A->rowptr[1], A->m_loc, mpi_int_t, &A2d.rowptr[1],
+                row_counts_int, row_disp,
+                mpi_int_t, 0, grid3d->zscp.comm);
+
+    if (grid3d->zscp.Iam == 0)
+    {
+        for (int i = 0; i < grid3d->npdep; i++)
+        {
+            for (int j = row_disp[i] + 1; j < row_disp[i + 1] + 1; j++)
+            {
+                // A2d.rowptr[j] += row_disp[i];
+                A2d.rowptr[j] += nnz_disp[i];
+            }
+        }
+        A2d.nnz_loc = nnz_disp[grid3d->npdep];
+        A2d.m_loc = row_disp[grid3d->npdep];
+#if 0	
+        A2d.fst_row = A->fst_row; // This is a bug
+#else
+        gridinfo_t *grid2d = &(grid3d->grid2d);
+        int procs2d = grid2d->nprow * grid2d->npcol;
+        int m_loc_2d = A2d.m_loc;
+        int *m_loc_2d_counts = SUPERLU_MALLOC(procs2d * sizeof(int));
+
+        MPI_Allgather(&m_loc_2d, 1, MPI_INT, m_loc_2d_counts, 1, MPI_INT, grid2d->comm);
+
+        int fst_row = 0;
+        for (int p = 0; p < procs2d; ++p)
+        {
+            if (grid2d->iam == p)
+                A2d.fst_row = fst_row;
+            fst_row += m_loc_2d_counts[p];
+        }
+
+        SUPERLU_FREE(m_loc_2d_counts);
+#endif
+    }
+    // Btmp <- compact(B)
+    // compacting B
+    double *Btmp;
+    Btmp = SUPERLU_MALLOC(A->m_loc * nrhs * sizeof(double));
+    matCopy(A->m_loc, nrhs, Btmp, A->m_loc, B, ldb);
+
+    double *B1;
+    if (grid3d->zscp.Iam == 0)
+    {
+        B1 = SUPERLU_MALLOC(A2d.m_loc * nrhs * sizeof(double));
+        *B2d = SUPERLU_MALLOC(A2d.m_loc * nrhs * sizeof(double));
+    }
+
+    // B1 <- gatherv(Btmp)
+    MPI_Gatherv(Btmp, nrhs * A->m_loc, MPI_DOUBLE, B1,
+                b_counts_int, b_disp,
+                MPI_DOUBLE, 0, grid3d->zscp.comm);
+
+    // B2d <- colMajor(B1)
+    if (grid3d->zscp.Iam == 0)
+    {
+        for (int i = 0; i < grid3d->npdep; ++i)
+        {
+            /* code */
+            matCopy(row_counts_int[i], nrhs, *B2d + row_disp[i], A2d.m_loc,
+                    B1 + nrhs * row_disp[i], row_counts_int[i]);
+        }
+
+        SUPERLU_FREE(B1);
+    }
+
+
+    return A2d;
+}
+
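After MPI_Gatherv concatenates the per-layer rowptr tails above, each gathered segment still counts nonzeros from its own zero, so layer 0 shifts segment i by nnz_disp[i] (the double loop over j = row_disp[i]+1 .. row_disp[i+1]). A self-contained toy run of that adjustment, with two layers and made-up counts:

    #include <stdio.h>

    int main(void)
    {
        /* two "layers": 3 rows with 5 nnz, then 2 rows with 4 nnz */
        int npdep = 2;
        int row_disp[] = { 0, 3, 5 };   /* row offsets of each layer */
        int nnz_disp[] = { 0, 5, 9 };   /* nnz offsets of each layer */

        /* gathered rowptr: entry 0 plus each layer's local rowptr[1..m_loc] */
        int rowptr[] = { 0, 2, 4, 5,    /* layer 0, already global     */
                         1, 4 };        /* layer 1, still local counts */

        /* shift every entry of segment i by the nonzeros before it */
        for (int i = 0; i < npdep; i++)
            for (int j = row_disp[i] + 1; j < row_disp[i + 1] + 1; j++)
                rowptr[j] += nnz_disp[i];

        for (int j = 0; j <= row_disp[npdep]; j++)
            printf("rowptr[%d] = %d\n", j, rowptr[j]);
        /* prints 0 2 4 5 6 9: a valid CSR rowptr for the merged matrix */
        return 0;
    }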

From e7c8193844ecc0c9bd3c58670beaab2c3790d81d Mon Sep 17 00:00:00 2001
From: Sherry Li 
Date: Wed, 28 Oct 2020 13:16:06 -0700
Subject: [PATCH 033/147] Generated the complex version for the new 3D-to-2D
 gathering code.

---
 EXAMPLE/CMakeLists.txt     |   2 +-
 EXAMPLE/Makefile           |   2 +-
 EXAMPLE/dcreate_matrix3d.c |   8 +-
 EXAMPLE/pddrive3d.c        | 164 ++++++-------
 EXAMPLE/pzdrive3d.c        | 111 ++++++++-
 EXAMPLE/zcreate_matrix3d.c | 462 +++++++++++++++++++++++++++++++++++++
 SRC/dnrformat_loc3d.c      | 131 +----------
 SRC/pdgssvx3d.c            |  58 ++---
 SRC/pzgssvx3d.c            |  65 ++++--
 SRC/superlu_ddefs.h        |   3 +-
 SRC/superlu_dist_config.h  |  16 +-
 SRC/superlu_zdefs.h        |  18 +-
 SRC/znrformat_loc3d.c      | 133 +----------
 13 files changed, 741 insertions(+), 432 deletions(-)
 create mode 100644 EXAMPLE/zcreate_matrix3d.c

diff --git a/EXAMPLE/CMakeLists.txt b/EXAMPLE/CMakeLists.txt
index 0126dc41..fd415766 100644
--- a/EXAMPLE/CMakeLists.txt
+++ b/EXAMPLE/CMakeLists.txt
@@ -108,7 +108,7 @@ if(enable_complex16)
   add_executable(pzdrive4 ${ZEXM4})
   target_link_libraries(pzdrive4 ${all_link_libs})
 
-  set(ZEXM3D pzdrive3d.c zcreate_matrix.c)
+  set(ZEXM3D pzdrive3d.c zcreate_matrix.c zcreate_matrix3d.c)
   add_executable(pzdrive3d ${ZEXM3D})
   target_link_libraries(pzdrive3d ${all_link_libs})
 
diff --git a/EXAMPLE/Makefile b/EXAMPLE/Makefile
index b8d999a1..f453e084 100644
--- a/EXAMPLE/Makefile
+++ b/EXAMPLE/Makefile
@@ -50,7 +50,7 @@ ZEXM1	= pzdrive1.o zcreate_matrix.o
 ZEXM2	= pzdrive2.o zcreate_matrix.o zcreate_matrix_perturbed.o
 ZEXM3	= pzdrive3.o zcreate_matrix.o
 ZEXM4	= pzdrive4.o zcreate_matrix.o
-ZEXM3D	= pzdrive3d.o zcreate_matrix.o
+ZEXM3D	= pzdrive3d.o zcreate_matrix.o zcreate_matrix3d.o
 ZEXMG	= pzdrive_ABglobal.o
 ZEXMG1	= pzdrive1_ABglobal.o
 ZEXMG2	= pzdrive2_ABglobal.o
diff --git a/EXAMPLE/dcreate_matrix3d.c b/EXAMPLE/dcreate_matrix3d.c
index 5725d543..3ad83de9 100644
--- a/EXAMPLE/dcreate_matrix3d.c
+++ b/EXAMPLE/dcreate_matrix3d.c
@@ -10,13 +10,15 @@ at the top-level directory.
 */
 
 
+
 /*! @file
  * \brief Read the matrix from data file
  *
  * 
- * -- Distributed SuperLU routine (version 2.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * March 15, 2003
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley,
+ * Oak Ridge National Lab.
+ * October 26, 2020
  * 
*/ #include diff --git a/EXAMPLE/pddrive3d.c b/EXAMPLE/pddrive3d.c index ea70ce5b..553b7741 100644 --- a/EXAMPLE/pddrive3d.c +++ b/EXAMPLE/pddrive3d.c @@ -1,9 +1,9 @@ /*! \file Copyright (c) 2003, The Regents of the University of California, through -Lawrence Berkeley National Laboratory (subject to receipt of any required -approvals from U.S. Dept. of Energy) +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) -All rights reserved. +All rights reserved. The source code is distributed under BSD license, see the file License.txt at the top-level directory. @@ -19,8 +19,7 @@ at the top-level directory. * May 10, 2019 * */ -#include "superlu_ddefs.h" -#include +#include "superlu_ddefs.h" /*! \brief * @@ -48,21 +47,19 @@ at the top-level directory. * *
*/ - -static void matCheck(int n, int m, double* A, int LDA - , double* B, int LDB) + +static void matCheck(int n, int m, double* A, int LDA, + double* B, int LDB) { for(int j=0; jnnz_loc, Bval); fflush(stdout); #endif - + + double * Aval = (double *) A->nzval; + double * Bval = (double *) B->nzval; for (int_t i = 0; i < A->nnz_loc; i++) { - assert(((double *)A->nzval)[i] == ((double *)B->nzval)[i]); + assert( Aval[i] == Bval[i] ); assert((A->colind)[i] == (B->colind)[i]); printf("colind[] correct\n"); } for (int_t i = 0; i < A->m_loc + 1; i++) { - // assert(((double *)A->nzval)[i] ==((double *)B->nzval)[i]); assert((A->rowptr)[i] == (B->rowptr)[i]); } @@ -106,7 +104,7 @@ main (int argc, char *argv[]) { superlu_dist_options_t options; SuperLUStat_t stat; - SuperMatrix A; // Now, A is on all 3D processes + SuperMatrix A; // Now, A is on all 3D processes ScalePermstruct_t ScalePermstruct; LUstruct_t LUstruct; SOLVEstruct_t SOLVEstruct; @@ -118,7 +116,6 @@ main (int argc, char *argv[]) int iam, info, ldb, ldx, nrhs; char **cpp, c, *suffix; FILE *fp, *fopen (); - FILE *fp0; extern int cpp_defs (); int ii, omp_mpi_level; @@ -169,11 +166,8 @@ main (int argc, char *argv[]) } } else - { - /* Last arg is considered a filename */ - if (!(fp = fopen (*cpp, "r")) - || !(fp0 = fopen (*cpp, "r")) - ) + { /* Last arg is considered a filename */ + if (!(fp = fopen (*cpp, "r"))) { ABORT ("File does not exist"); } @@ -186,49 +180,46 @@ main (int argc, char *argv[]) ------------------------------------------------------------ */ superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid); - if (grid.iam == 0) - { - MPI_Query_thread(&omp_mpi_level); - switch (omp_mpi_level) - { - case MPI_THREAD_SINGLE: - printf("MPI_Query_thread with MPI_THREAD_SINGLE\n"); - fflush(stdout); - break; - case MPI_THREAD_FUNNELED: - printf("MPI_Query_thread with MPI_THREAD_FUNNELED\n"); - fflush(stdout); - break; - case MPI_THREAD_SERIALIZED: - printf("MPI_Query_thread with MPI_THREAD_SERIALIZED\n"); - fflush(stdout); - break; - case MPI_THREAD_MULTIPLE: - printf("MPI_Query_thread with MPI_THREAD_MULTIPLE\n"); - fflush(stdout); - break; - } + if(grid.iam==0) { + MPI_Query_thread(&omp_mpi_level); + switch (omp_mpi_level) { + case MPI_THREAD_SINGLE: + printf("MPI_Query_thread with MPI_THREAD_SINGLE\n"); + fflush(stdout); + break; + case MPI_THREAD_FUNNELED: + printf("MPI_Query_thread with MPI_THREAD_FUNNELED\n"); + fflush(stdout); + break; + case MPI_THREAD_SERIALIZED: + printf("MPI_Query_thread with MPI_THREAD_SERIALIZED\n"); + fflush(stdout); + break; + case MPI_THREAD_MULTIPLE: + printf("MPI_Query_thread with MPI_THREAD_MULTIPLE\n"); + fflush(stdout); + break; + } } - + /* Bail out if I do not belong in the grid. 
*/ iam = grid.iam; - if (iam >= nprow * npcol * npdep) + if (iam >= nprow * npcol *npdep) goto out; - if (!iam) - { - int v_major, v_minor, v_bugfix; + if (!iam) { + int v_major, v_minor, v_bugfix; #ifdef __INTEL_COMPILER - printf("__INTEL_COMPILER is defined\n"); + printf("__INTEL_COMPILER is defined\n"); #endif - printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__); + printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__); - superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix); - printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix); + superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix); + printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix); - printf("Input matrix file:\t%s\n", *cpp); - printf("3D process grid: %d X %d X %d\n", nprow, npcol, npdep); + printf("Input matrix file:\t%s\n", *cpp); + printf("3D process grid: %d X %d X %d\n", nprow, npcol, npdep); printf("2D Process grid: %d X %d\n", (int)grid.nprow, (int)grid.npcol); - fflush(stdout); + fflush(stdout); } #if ( DEBUGlevel>=1 ) @@ -238,19 +229,18 @@ main (int argc, char *argv[]) /* ------------------------------------------------------------ GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. ------------------------------------------------------------ */ - for (ii = 0; ii < strlen(*cpp); ii++) - { - if ((*cpp)[ii] == '.') - { - suffix = &((*cpp)[ii + 1]); - // printf("%s\n", suffix); - } + for (ii = 0; iim_loc, nrhs, b, ldb, xtrue, ldx, &(grid.grid2d)); fflush(stdout); @@ -372,19 +357,16 @@ main (int argc, char *argv[]) /* ------------------------------------------------------------ DEALLOCATE STORAGE. ------------------------------------------------------------ */ - if ( grid.zscp.Iam == 0 ) // process layer 0 - { - PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + if ( grid.zscp.Iam == 0 ) { // process layer 0 + + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ Destroy_LU (n, &(grid.grid2d), &LUstruct); - if (options.SolveInitialized) - { + if (options.SolveInitialized) { dSolveFinalize (&options, &SOLVEstruct); } - } - else // Process layers not equal 0 - { + } else { // Process layers not equal 0 DeAllocLlu_3d(n, &LUstruct, &grid); DeAllocGlu_3d(&LUstruct); } diff --git a/EXAMPLE/pzdrive3d.c b/EXAMPLE/pzdrive3d.c index 2ab20daa..be3522fb 100644 --- a/EXAMPLE/pzdrive3d.c +++ b/EXAMPLE/pzdrive3d.c @@ -46,13 +46,65 @@ at the top-level directory. * *
*/ + +static void matCheck(int n, int m, doublecomplex* A, int LDA, + doublecomplex* B, int LDB) +{ + for(int j=0; jnnz_loc == B->nnz_loc); + assert(A->m_loc == B->m_loc); + assert(A->fst_row == B->fst_row); + +#if 0 + double *Aval = (double *)A->nzval, *Bval = (double *)B->nzval; + PrintDouble5("A", A->nnz_loc, Aval); + PrintDouble5("B", B->nnz_loc, Bval); + fflush(stdout); +#endif + + doublecomplex * Aval = (doublecomplex *) A->nzval; + doublecomplex * Bval = (doublecomplex *) B->nzval; + for (int_t i = 0; i < A->nnz_loc; i++) + { + assert( (Aval[i].r == Bval[i].r) && (Aval[i].i == Bval[i].i) ); + assert((A->colind)[i] == (B->colind)[i]); + printf("colind[] correct\n"); + } + + for (int_t i = 0; i < A->m_loc + 1; i++) + { + assert((A->rowptr)[i] == (B->rowptr)[i]); + } + + printf("Matrix check passed\n"); + +} int main (int argc, char *argv[]) { superlu_dist_options_t options; SuperLUStat_t stat; - SuperMatrix A; // only on process layer 0 + SuperMatrix A; // Now, A is on all 3D processes ScalePermstruct_t ScalePermstruct; LUstruct_t LUstruct; SOLVEstruct_t SOLVEstruct; @@ -183,8 +235,52 @@ main (int argc, char *argv[]) // printf("%s\n", suffix); } } + +#define NRFRMT +#ifndef NRFRMT if ( grid.zscp.Iam == 0 ) // only in process layer 0 zcreate_matrix_postfix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, suffix, &(grid.grid2d)); + +#else + NRformat_loc *Astore, *Astore0; + + // *fp0 = *fp; + zcreate_matrix_postfix3d(&A, nrhs, &b, &ldb, + &xtrue, &ldx, fp, suffix, &(grid)); + //printf("ldx %d, ldb %d\n", ldx, ldb); + +#if 0 // following code is only for checking *Gather* routine + doublecomplex* B2d; + NRformat_loc Atmp = dGatherNRformat_loc( + (NRformat_loc *) A.Store, + b, ldb, nrhs, &B2d, + &grid); + Astore = &Atmp; + SuperMatrix Aref; + doublecomplex *bref, *xtrueref; + if ( grid.zscp.Iam == 0 ) // only in process layer 0 + { + zcreate_matrix_postfix(&Aref, nrhs, &bref, &ldb, + &xtrueref, &ldx, fp0, + suffix, &(grid.grid2d)); + Astore0 = (NRformat_loc *) Aref.Store; + + /* + if ( (grid.grid2d).iam == 0 ) { + printf(" iam %d\n", 0); + checkNRFMT(Astore, Astore0); + } else if ((grid.grid2d).iam == 1 ) { + printf(" iam %d\n", 1); + checkNRFMT(Astore, Astore0); + } + */ + + // bref, xtrueref are created on 2D + matCheck(Astore->m_loc, nrhs, B2d, Astore->m_loc, bref, ldb); + } + // MPI_Finalize(); exit(0); + #endif +#endif if (!(berr = doubleMalloc_dist (nrhs))) ABORT ("Malloc fails for berr[]."); @@ -225,6 +321,10 @@ main (int argc, char *argv[]) fflush(stdout); } +#ifdef NRFRMT // matrix is on 3D process grid + m = A.nrow; + n = A.ncol; +#else if ( grid.zscp.Iam == 0 ) // Process layer 0 { m = A.nrow; @@ -233,6 +333,7 @@ main (int argc, char *argv[]) // broadcast m, n to all the process layers; MPI_Bcast( &m, 1, mpi_int_t, 0, grid.zscp.comm); MPI_Bcast( &n, 1, mpi_int_t, 0, grid.zscp.comm); +#endif /* Initialize ScalePermstruct and LUstruct. */ ScalePermstructInit (m, n, &ScalePermstruct); @@ -246,7 +347,9 @@ main (int argc, char *argv[]) &LUstruct, &SOLVEstruct, berr, &stat, &info); /* Check the accuracy of the solution. 
*/ +#ifndef NRFRMT if ( grid.zscp.Iam == 0 ) // Process layer 0 +#endif pzinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, nrhs, b, ldb, xtrue, ldx, &(grid.grid2d)); fflush(stdout); @@ -259,10 +362,7 @@ main (int argc, char *argv[]) PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ - Destroy_CompRowLoc_Matrix_dist (&A); Destroy_LU (n, &(grid.grid2d), &LUstruct); - SUPERLU_FREE (b); - SUPERLU_FREE (xtrue); if (options.SolveInitialized) { zSolveFinalize (&options, &SOLVEstruct); } @@ -271,6 +371,9 @@ main (int argc, char *argv[]) DeAllocGlu_3d(&LUstruct); } + Destroy_CompRowLoc_Matrix_dist (&A); + SUPERLU_FREE (b); + SUPERLU_FREE (xtrue); SUPERLU_FREE (berr); ScalePermstructFree (&ScalePermstruct); LUstructFree (&LUstruct); diff --git a/EXAMPLE/zcreate_matrix3d.c b/EXAMPLE/zcreate_matrix3d.c new file mode 100644 index 00000000..b3c43ffd --- /dev/null +++ b/EXAMPLE/zcreate_matrix3d.c @@ -0,0 +1,462 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Read the matrix from data file + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley,
+ * Oak Ridge National Lab.
+ * October 26, 2020
+ * 
+ */ +#include +#include "superlu_zdefs.h" + +/* \brief + * + *
+ * Purpose
+ * =======
+ *
+ * ZCREATE_MATRIX3D reads the matrix from a data file in Harwell-Boeing format,
+ * and distributes it to the processes in distributed compressed row format.
+ * It also generates the distributed true solution X and the right-hand
+ * side RHS.
+ *
+ *
+ * Arguments
+ * =========
+ *
+ * A     (output) SuperMatrix*
+ *       Local matrix A in NR_loc format.
+ *
+ * NRHS  (input) int_t
+ *       Number of right-hand sides.
+ *
+ * RHS   (output) doublecomplex**
+ *       The right-hand side matrix.
+ *
+ * LDB   (output) int*
+ *       Leading dimension of the right-hand side matrix.
+ *
+ * X     (output) doublecomplex**
+ *       The true solution matrix.
+ *
+ * LDX   (output) int*
+ *       The leading dimension of the true solution matrix.
+ *
+ * FP    (input) FILE*
+ *       The matrix file pointer.
+ *
+ * GRID  (input) gridinfo3d_t*
+ *       The 3D process grid.
+ * 
+ */ + +int zcreate_matrix3d(SuperMatrix *A, int nrhs, doublecomplex **rhs, + int *ldb, doublecomplex **x, int *ldx, + FILE *fp, gridinfo3d_t *grid3d) +{ + SuperMatrix GA; /* global A */ + doublecomplex *b_global, *xtrue_global; /* replicated on all processes */ + int_t *rowind, *colptr; /* global */ + doublecomplex *nzval; /* global */ + doublecomplex *nzval_loc; /* local */ + int_t *colind, *rowptr; /* local */ + int_t m, n, nnz; + int_t m_loc, fst_row, nnz_loc; + int_t m_loc_fst; /* Record m_loc of the first p-1 processors, + when mod(m, p) is not zero. */ + int_t row, col, i, j, relpos; + int iam; + char trans[1]; + int_t *marker; + + iam = grid3d->iam; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter dcreate_matrix3d()"); +#endif + + if ( !iam ) + { + double t = SuperLU_timer_(); + + /* Read the matrix stored on disk in Harwell-Boeing format. */ + zreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + + printf("Time to read and distribute matrix %.2f\n", + SuperLU_timer_() - t); fflush(stdout); + + /* Broadcast matrix A to the other PEs. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( nzval, nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid3d->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( colptr, n + 1, mpi_int_t, 0, grid3d->comm ); + } + else + { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid3d->comm ); + + /* Allocate storage for compressed column representation. */ + zallocateA_dist(n, nnz, &nzval, &rowind, &colptr); + + MPI_Bcast( nzval, nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid3d->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( colptr, n + 1, mpi_int_t, 0, grid3d->comm ); + } + +#if 0 + nzval[0] = 0.1; +#endif + + /* Compute the number of rows to be distributed to local process */ + m_loc = m / (grid3d->nprow * grid3d->npcol* grid3d->npdep); + m_loc_fst = m_loc; + /* When m / procs is not an integer */ + if ((m_loc * grid3d->nprow * grid3d->npcol* grid3d->npdep) != m) + { + /*m_loc = m_loc+1; + m_loc_fst = m_loc;*/ + if (iam == (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1)) /* last proc. gets all*/ + m_loc = m - m_loc * (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1); + } + + /* Create compressed column matrix for GA. */ + zCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr, + SLU_NC, SLU_Z, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. 
*/ + if ( !(b_global = doublecomplexMalloc_dist(m * nrhs)) ) + ABORT("Malloc fails for b[]"); + if ( !(xtrue_global = doublecomplexMalloc_dist(n * nrhs)) ) + ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + + zGenXtrue_dist(n, nrhs, xtrue_global, n); + zFillRHS_dist(trans, nrhs, xtrue_global, n, &GA, b_global, m); + + /************************************************* + * Change GA to a local A with NR_loc format * + *************************************************/ + + rowptr = (int_t *) intMalloc_dist(m_loc + 1); + marker = (int_t *) intCalloc_dist(n); + + /* Get counts of each row of GA */ + for (i = 0; i < n; ++i) + for (j = colptr[i]; j < colptr[i + 1]; ++j) ++marker[rowind[j]]; + /* Set up row pointers */ + rowptr[0] = 0; + fst_row = iam * m_loc_fst; + nnz_loc = 0; + for (j = 0; j < m_loc; ++j) + { + row = fst_row + j; + rowptr[j + 1] = rowptr[j] + marker[row]; + marker[j] = rowptr[j]; + } + nnz_loc = rowptr[m_loc]; + + nzval_loc = (doublecomplex *) doublecomplexMalloc_dist(nnz_loc); + colind = (int_t *) intMalloc_dist(nnz_loc); + + /* Transfer the matrix into the compressed row storage */ + for (i = 0; i < n; ++i) + { + for (j = colptr[i]; j < colptr[i + 1]; ++j) + { + row = rowind[j]; + if ( (row >= fst_row) && (row < fst_row + m_loc) ) + { + row = row - fst_row; + relpos = marker[row]; + colind[relpos] = i; + nzval_loc[relpos] = nzval[j]; + ++marker[row]; + } + } + } + +#if ( DEBUGlevel>=2 ) + if ( !iam ) dPrint_CompCol_Matrix_dist(&GA); +#endif + + /* Destroy GA */ + Destroy_CompCol_Matrix_dist(&GA); + + /******************************************************/ + /* Change GA to a local A with NR_loc format */ + /******************************************************/ + + /* Set up the local A in NR_loc format */ + zCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row, + nzval_loc, colind, rowptr, + SLU_NR_loc, SLU_D, SLU_GE); + + /* Get the local B */ + if ( !((*rhs) = doublecomplexMalloc_dist(m_loc * nrhs)) ) + ABORT("Malloc fails for rhs[]"); + for (j = 0; j < nrhs; ++j) + { + for (i = 0; i < m_loc; ++i) + { + row = fst_row + i; + (*rhs)[j * m_loc + i] = b_global[j * n + row]; + } + } + *ldb = m_loc; + + /* Set the true X */ + *ldx = m_loc; + if ( !((*x) = doublecomplexMalloc_dist(*ldx * nrhs)) ) + ABORT("Malloc fails for x_loc[]"); + + /* Get the local part of xtrue_global */ + for (j = 0; j < nrhs; ++j) + { + for (i = 0; i < m_loc; ++i) + (*x)[i + j * (*ldx)] = xtrue_global[i + fst_row + j * n]; + } + + SUPERLU_FREE(b_global); + SUPERLU_FREE(xtrue_global); + SUPERLU_FREE(marker); + +#if ( DEBUGlevel>=1 ) + printf("sizeof(NRforamt_loc) %lu\n", sizeof(NRformat_loc)); + CHECK_MALLOC(iam, "Exit dcreate_matrix()"); +#endif + return 0; +} + + +int zcreate_matrix_postfix3d(SuperMatrix *A, int nrhs, doublecomplex **rhs, + int *ldb, doublecomplex **x, int *ldx, + FILE *fp, char * postfix, gridinfo3d_t *grid3d) +{ + SuperMatrix GA; /* global A */ + doublecomplex *b_global, *xtrue_global; /* replicated on all processes */ + int_t *rowind, *colptr; /* global */ + doublecomplex *nzval; /* global */ + doublecomplex *nzval_loc; /* local */ + int_t *colind, *rowptr; /* local */ + int_t m, n, nnz; + int_t m_loc, fst_row, nnz_loc; + int_t m_loc_fst; /* Record m_loc of the first p-1 processors, + when mod(m, p) is not zero. 
*/ + int_t row, col, i, j, relpos; + int iam; + char trans[1]; + int_t *marker; + + iam = grid3d->iam; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter dcreate_matrix()"); +#endif + + if ( !iam ) + { + double t = SuperLU_timer_(); + + if(!strcmp(postfix,"cua")) + { + /* Read the matrix stored on disk in Harwell-Boeing format. */ + zreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + } + else if (!strcmp(postfix, "mtx")) + { + /* Read the matrix stored on disk in Matrix Market format. */ + zreadMM_dist(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + } + else if (!strcmp(postfix, "rb")) + { + /* Read the matrix stored on disk in Rutherford-Boeing format. */ + zreadrb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + } + else if (!strcmp(postfix, "dat")) + { + /* Read the matrix stored on disk in triplet format. */ + zreadtriple_dist(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + } + else if (!strcmp(postfix, "datnh")) + { + /* Read the matrix stored on disk in triplet format (without header). */ + zreadtriple_noheader(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + } + else if (!strcmp(postfix, "bin")) + { + /* Read the matrix stored on disk in binary format. */ + zread_binary(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + } + else + { + ABORT("File format not known"); + } + + printf("Time to read and distribute matrix %.2f\n", + SuperLU_timer_() - t); fflush(stdout); + + /* Broadcast matrix A to the other PEs. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( nzval, nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid3d->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( colptr, n + 1, mpi_int_t, 0, grid3d->comm ); + } + else + { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid3d->comm ); + + /* Allocate storage for compressed column representation. */ + zallocateA_dist(n, nnz, &nzval, &rowind, &colptr); + + MPI_Bcast( nzval, nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid3d->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( colptr, n + 1, mpi_int_t, 0, grid3d->comm ); + } + +#if 0 + nzval[0] = 0.1; +#endif + + /* Compute the number of rows to be distributed to local process */ + m_loc = m / (grid3d->nprow * grid3d->npcol* grid3d->npdep); + m_loc_fst = m_loc; + /* When m / procs is not an integer */ + if ((m_loc * grid3d->nprow * grid3d->npcol* grid3d->npdep) != m) + { + /*m_loc = m_loc+1; + m_loc_fst = m_loc;*/ + if (iam == (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1)) /* last proc. gets all*/ + m_loc = m - m_loc * (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1); + } + + /* Create compressed column matrix for GA. */ + zCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr, + SLU_NC, SLU_D, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. 
*/ + if ( !(b_global = doublecomplexMalloc_dist(m * nrhs)) ) + ABORT("Malloc fails for b[]"); + if ( !(xtrue_global = doublecomplexMalloc_dist(n * nrhs)) ) + ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + + zGenXtrue_dist(n, nrhs, xtrue_global, n); + zFillRHS_dist(trans, nrhs, xtrue_global, n, &GA, b_global, m); + + /************************************************* + * Change GA to a local A with NR_loc format * + *************************************************/ + + rowptr = (int_t *) intMalloc_dist(m_loc + 1); + marker = (int_t *) intCalloc_dist(n); + + /* Get counts of each row of GA */ + for (i = 0; i < n; ++i) + for (j = colptr[i]; j < colptr[i + 1]; ++j) ++marker[rowind[j]]; + /* Set up row pointers */ + rowptr[0] = 0; + fst_row = iam * m_loc_fst; + nnz_loc = 0; + for (j = 0; j < m_loc; ++j) + { + row = fst_row + j; + rowptr[j + 1] = rowptr[j] + marker[row]; + marker[j] = rowptr[j]; + } + nnz_loc = rowptr[m_loc]; + + nzval_loc = (doublecomplex *) doublecomplexMalloc_dist(nnz_loc); + colind = (int_t *) intMalloc_dist(nnz_loc); + + /* Transfer the matrix into the compressed row storage */ + for (i = 0; i < n; ++i) + { + for (j = colptr[i]; j < colptr[i + 1]; ++j) + { + row = rowind[j]; + if ( (row >= fst_row) && (row < fst_row + m_loc) ) + { + row = row - fst_row; + relpos = marker[row]; + colind[relpos] = i; + nzval_loc[relpos] = nzval[j]; + ++marker[row]; + } + } + } + +#if ( DEBUGlevel>=2 ) + if ( !iam ) dPrint_CompCol_Matrix_dist(&GA); +#endif + + /* Destroy GA */ + Destroy_CompCol_Matrix_dist(&GA); + + /******************************************************/ + /* Change GA to a local A with NR_loc format */ + /******************************************************/ + + /* Set up the local A in NR_loc format */ + zCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row, + nzval_loc, colind, rowptr, + SLU_NR_loc, SLU_Z, SLU_GE); + + /* Get the local B */ + if ( !((*rhs) = doublecomplexMalloc_dist(m_loc * nrhs)) ) + ABORT("Malloc fails for rhs[]"); + for (j = 0; j < nrhs; ++j) + { + for (i = 0; i < m_loc; ++i) + { + row = fst_row + i; + (*rhs)[j * m_loc + i] = b_global[j * n + row]; + } + } + *ldb = m_loc; + + /* Set the true X */ + *ldx = m_loc; + if ( !((*x) = doublecomplexMalloc_dist(*ldx * nrhs)) ) + ABORT("Malloc fails for x_loc[]"); + + /* Get the local part of xtrue_global */ + for (j = 0; j < nrhs; ++j) + { + for (i = 0; i < m_loc; ++i) + (*x)[i + j * (*ldx)] = xtrue_global[i + fst_row + j * n]; + } + + SUPERLU_FREE(b_global); + SUPERLU_FREE(xtrue_global); + SUPERLU_FREE(marker); + +#if ( DEBUGlevel>=1 ) + printf("sizeof(NRforamt_loc) %lu\n", sizeof(NRformat_loc)); + CHECK_MALLOC(iam, "Exit dcreate_matrix()"); +#endif + return 0; +} diff --git a/SRC/dnrformat_loc3d.c b/SRC/dnrformat_loc3d.c index 71db5247..99add3f3 100644 --- a/SRC/dnrformat_loc3d.c +++ b/SRC/dnrformat_loc3d.c @@ -18,7 +18,7 @@ at the top-level directory. * *
  * -- Distributed SuperLU routine (version 7.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley,
  * Oak Ridge National Lab.
  * October 22, 2020
  */
@@ -243,132 +243,3 @@ int dScatter_B3d(NRformat_loc3d *A3d,  // modified
 } /* dScatter_B3d */
 
 
-/*
- * THIS ROUTINE IS NOT USED ANYMORE.
- */
-NRformat_loc dGatherNRformat_loc(
-				 NRformat_loc *A, // input 
-                                 double *B,       // input
-				 int ldb,
-				 int nrhs,
-				 double **B2d,    // output
-                                 gridinfo3d_t *grid3d)
-{
-    NRformat_loc A2d;
-
-    // find number of nnzs
-    int_t *nnz_counts, *row_counts;
-    int *nnz_disp, *row_disp, *nnz_counts_int, *row_counts_int;
-    int *b_counts_int, *b_disp;
-    nnz_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t));
-    row_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t));
-    nnz_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
-    row_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
-    b_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
-    MPI_Gather(&A->nnz_loc, 1, mpi_int_t, nnz_counts,
-               1, mpi_int_t, 0, grid3d->zscp.comm);
-    MPI_Gather(&A->m_loc, 1, mpi_int_t, row_counts,
-               1, mpi_int_t, 0, grid3d->zscp.comm);
-    nnz_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
-    row_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
-    b_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
-
-    nnz_disp[0] = 0;
-    row_disp[0] = 0;
-    b_disp[0] = 0;
-    for (int i = 0; i < grid3d->npdep; i++)
-    {
-        nnz_disp[i + 1] = nnz_disp[i] + nnz_counts[i];
-        row_disp[i + 1] = row_disp[i] + row_counts[i];
-        b_disp[i + 1] = nrhs * row_disp[i + 1];
-        nnz_counts_int[i] = nnz_counts[i];
-        row_counts_int[i] = row_counts[i];
-        b_counts_int[i] = nrhs * row_counts[i];
-    }
-
-    if (grid3d->zscp.Iam == 0)
-    {
-        A2d.colind = SUPERLU_MALLOC(nnz_disp[grid3d->npdep] * sizeof(int_t));
-        A2d.nzval = SUPERLU_MALLOC(nnz_disp[grid3d->npdep] * sizeof(double));
-        A2d.rowptr = SUPERLU_MALLOC((row_disp[grid3d->npdep] + 1) * sizeof(int_t));
-        A2d.rowptr[0] = 0;
-    }
-
-    MPI_Gatherv(A->nzval, A->nnz_loc, MPI_DOUBLE, A2d.nzval,
-                nnz_counts_int, nnz_disp,
-                MPI_DOUBLE, 0, grid3d->zscp.comm);
-    MPI_Gatherv(A->colind, A->nnz_loc, mpi_int_t, A2d.colind,
-                nnz_counts_int, nnz_disp,
-                mpi_int_t, 0, grid3d->zscp.comm);
-    MPI_Gatherv(&A->rowptr[1], A->m_loc, mpi_int_t, &A2d.rowptr[1],
-                row_counts_int, row_disp,
-                mpi_int_t, 0, grid3d->zscp.comm);
-
-    if (grid3d->zscp.Iam == 0)
-    {
-        for (int i = 0; i < grid3d->npdep; i++)
-        {
-            for (int j = row_disp[i] + 1; j < row_disp[i + 1] + 1; j++)
-            {
-                // A2d.rowptr[j] += row_disp[i];
-                A2d.rowptr[j] += nnz_disp[i];
-            }
-        }
-        A2d.nnz_loc = nnz_disp[grid3d->npdep];
-        A2d.m_loc = row_disp[grid3d->npdep];
-#if 0	
-        A2d.fst_row = A->fst_row; // This is a bug
-#else
-        gridinfo_t *grid2d = &(grid3d->grid2d);
-        int procs2d = grid2d->nprow * grid2d->npcol;
-        int m_loc_2d = A2d.m_loc;
-        int *m_loc_2d_counts = SUPERLU_MALLOC(procs2d * sizeof(int));
-
-        MPI_Allgather(&m_loc_2d, 1, MPI_INT, m_loc_2d_counts, 1, MPI_INT, grid2d->comm);
-
-        int fst_row = 0;
-        for (int p = 0; p < procs2d; ++p)
-        {
-            if (grid2d->iam == p)
-                A2d.fst_row = fst_row;
-            fst_row += m_loc_2d_counts[p];
-        }
-
-        SUPERLU_FREE(m_loc_2d_counts);
-#endif
-    }
-    // Btmp <- compact(B)
-    // compacting B
-    double *Btmp;
-    Btmp = SUPERLU_MALLOC(A->m_loc * nrhs * sizeof(double));
-    matCopy(A->m_loc, nrhs, Btmp, A->m_loc, B, ldb);
-
-    double *B1;
-    if (grid3d->zscp.Iam == 0)
-    {
-        B1 = SUPERLU_MALLOC(A2d.m_loc * nrhs * sizeof(double));
-        *B2d = SUPERLU_MALLOC(A2d.m_loc * nrhs * sizeof(double));
-    }
-
-    // B1 <- gatherv(Btmp)
-    MPI_Gatherv(Btmp, nrhs * A->m_loc, MPI_DOUBLE, B1,
-                b_counts_int, b_disp,
-                MPI_DOUBLE, 0, grid3d->zscp.comm);
-
-    // B2d <- colMajor(B1)
-    if (grid3d->zscp.Iam == 0)
-    {
-        for (int i = 0; i < grid3d->npdep; ++i)
-        {
-            /* code */
-            matCopy(row_counts_int[i], nrhs, *B2d + row_disp[i], A2d.m_loc,
-                    B1 + nrhs * row_disp[i], row_counts_int[i]);
-        }
-
-        SUPERLU_FREE(B1);
-    }
-
-
-    return A2d;
-}
-
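
The routine removed above and its replacement dGatherNRformat_loc3d both rest on the same MPI counts/displacements idiom: gather every rank's length onto the root, prefix-sum the lengths into displacements, then issue a single variable-length gather. A minimal self-contained sketch of the idiom (the function and variable names here are illustrative, not SuperLU_DIST API):

    #include <mpi.h>
    #include <stdlib.h>

    /* Gather variable-length per-rank arrays onto rank 0. */
    void gather_rows(double *local, int n_loc, MPI_Comm comm)
    {
        int rank, nprocs;
        MPI_Comm_rank(comm, &rank);
        MPI_Comm_size(comm, &nprocs);

        /* 1. The root learns each rank's contribution
              (cf. the MPI_Gather of nnz_loc and m_loc above). */
        int *counts = malloc(nprocs * sizeof(int));
        MPI_Gather(&n_loc, 1, MPI_INT, counts, 1, MPI_INT, 0, comm);

        /* 2. Prefix-sum the counts into displacements (cf. nnz_disp, row_disp). */
        int *disp = malloc((nprocs + 1) * sizeof(int));
        double *global = NULL;
        if (rank == 0) {
            disp[0] = 0;
            for (int p = 0; p < nprocs; ++p) disp[p + 1] = disp[p] + counts[p];
            global = malloc(disp[nprocs] * sizeof(double));
        }

        /* 3. One variable-length gather; counts/disp are read only on the root. */
        MPI_Gatherv(local, n_loc, MPI_DOUBLE,
                    global, counts, disp, MPI_DOUBLE, 0, comm);

        if (rank == 0) free(global);
        free(counts);
        free(disp);
    }

The one extra step the SuperLU routines need is stitching the gathered rowptr segments together: each rank's segment counts nonzeros from zero, so segment i must be shifted by the nonzero displacement of its owner, which is what the A2d.rowptr[j] += nnz_disp[i] loop above does.
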
diff --git a/SRC/pdgssvx3d.c b/SRC/pdgssvx3d.c
index 0dd75612..71d3e2fb 100644
--- a/SRC/pdgssvx3d.c
+++ b/SRC/pdgssvx3d.c
@@ -578,6 +578,17 @@ void pdgssvx3d(superlu_dist_options_t *options, SuperMatrix *A,
 	iam = grid->iam;
 
 	/* Initialization. */
+	/* Save the inputs: ldb -> ldb3d, and B -> B3d, Astore -> Astore3d 
+	   B3d and Astore3d will be restored on return  */
+	int ldb3d = ldb;
+	double *B3d = B;
+	NRformat_loc *Astore3d = (NRformat_loc *)A->Store;
+	double *B2d;
+	NRformat_loc3d *A3d = dGatherNRformat_loc3d((NRformat_loc *)A->Store,
+						    B, ldb, nrhs, grid3d);
+	B2d = A3d->B2d; 
+	NRformat_loc *Astore0 = A3d->A_nfmt; 
+	NRformat_loc *A_orig = A->Store;
 
 	/* definition of factored seen by each process layer */
 	Fact = options->Fact;
@@ -600,6 +611,14 @@ void pdgssvx3d(superlu_dist_options_t *options, SuperMatrix *A,
 		fprintf(stderr,
 				"Extra precise iterative refinement yet to support.");
 	}
+	/* Test the other input parameters. */
+	else if (A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc || A->Dtype != SLU_D || A->Mtype != SLU_GE)
+	    *info = -2;
+	else if (ldb < Astore3d->m_loc)
+	    *info = -5;
+	else if (nrhs < 0) {
+	    *info = -6;
+	}
 	if (*info)
 	{
 		i = -(*info);
@@ -615,31 +634,6 @@ void pdgssvx3d(superlu_dist_options_t *options, SuperMatrix *A,
 	   ordering, symbolic factorization, distribution of L & U */
 #define NRFRMT
 
-	/* Save the inputs: ldb -> ldb3d, and B -> B3d, Astore -> Astore3d 
-	   B3d and Astore3d will be restored on return  */
-	int ldb3d = ldb;
-	double *B3d = B;
-
-	NRformat_loc *Astore3d = (NRformat_loc *)A->Store;
-
-	double *B2d;
-//int ldb2d;  // not used
-#if 0
-	NRformat_loc Atmp = dGatherNRformat_loc(
-		(NRformat_loc *)A->Store,
-		B, ldb, nrhs, &B2d,
-		grid3d);
-
-	NRformat_loc *Astore0 = &Atmp; // Astore0 is on 2D
-#else
-	NRformat_loc3d *A3d = dGatherNRformat_loc3d(
-	(NRformat_loc *)A->Store,
-		B, ldb, nrhs, grid3d);
-	B2d = A3d->B2d; 
-	NRformat_loc *Astore0 = A3d->A_nfmt; 
-#endif
-
-	NRformat_loc *A_orig = A->Store;
 	if (grid3d->zscp.Iam == 0)
 	{
 		m = A->nrow;
@@ -673,20 +667,6 @@ void pdgssvx3d(superlu_dist_options_t *options, SuperMatrix *A,
 		rowptr = Astore->rowptr;
 		colind = Astore->colind;
 
-		/* Test the other input parameters. */
-		if (A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc || A->Dtype != SLU_D || A->Mtype != SLU_GE)
-			*info = -2;
-		else if (ldb < m_loc)
-			*info = -5;
-		else if (nrhs < 0)
-			*info = -6;
-		if (*info)
-		{
-			i = -(*info);
-			pxerr_dist("pdgssvx3d", grid, -*info);
-			return;
-		}
-
 		/* Structures needed for parallel symbolic factorization */
 		int_t *sizes, *fstVtxSep;
 		int noDomains, nprocs_num;
diff --git a/SRC/pzgssvx3d.c b/SRC/pzgssvx3d.c
index 9a73cdad..7d59d942 100644
--- a/SRC/pzgssvx3d.c
+++ b/SRC/pzgssvx3d.c
@@ -547,7 +547,18 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
     iam = grid->iam;
     
     /* Initialization. */
-
+    /* Save the inputs: ldb -> ldb3d, and B -> B3d, Astore -> Astore3d 
+       B3d and Astore3d will be restored on return  */
+    int ldb3d = ldb;
+    doublecomplex *B3d = B;
+    NRformat_loc *Astore3d = (NRformat_loc *)A->Store;
+    doublecomplex *B2d;
+    NRformat_loc3d *A3d = zGatherNRformat_loc3d((NRformat_loc *)A->Store,
+		   	  			B, ldb, nrhs, grid3d);
+    B2d = (doublecomplex *) A3d->B2d; 
+    NRformat_loc *Astore0 = A3d->A_nfmt; // on 2D grid-0
+    NRformat_loc *A_orig = A->Store;
+	
     /* definition of factored seen by each process layer */
     Fact = options->Fact;
     factored = (Fact == FACTORED);
@@ -568,6 +579,15 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
         fprintf (stderr,
 	         "Extra precise iterative refinement yet to support.");
     }
+    /* Test the other input parameters. */
+    else if (A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc
+	     || A->Dtype != SLU_Z || A->Mtype != SLU_GE)
+	     *info = -2;
+    else if (ldb < Astore3d->m_loc)
+	     *info = -5;
+    else if (nrhs < 0) {
+	     *info = -6;
+    }
     if (*info) {
 	i = -(*info);
 	pxerr_dist ("pzgssvx3d", grid, -*info);
@@ -580,10 +600,22 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 	
     /* Perform preprocessing steps on process layer zero, including:
        ordering, symbolic factorization, distribution of L & U */
+	
+#define NRFRMT
+
     if (grid3d->zscp.Iam == 0)
     {
         m = A->nrow;
     	n = A->ncol;
+#ifdef NRFRMT
+	// On input, A->Store is on 3D, now A->Store is re-assigned to 2D store
+	A->Store = Astore0;
+	ldb = Astore0->m_loc;
+	B = B2d; // B is now re-assigned to B2d
+	//PrintDouble5("after gather B=B2d", ldb, B);
+#endif
+
+	/* The following code now works on 2D grid-0 */
     	Astore = (NRformat_loc *) A->Store;
     	nnz_loc = Astore->nnz_loc;
     	m_loc = Astore->m_loc;
@@ -592,20 +624,6 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
     	rowptr = Astore->rowptr;
     	colind = Astore->colind;
 
-	/* Test the other input parameters. */
-	if (A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc
-	     || A->Dtype != SLU_Z || A->Mtype != SLU_GE)
-	     *info = -2;
-    	else if (ldb < m_loc)
-	     *info = -5;
-    	else if (nrhs < 0)
-	     *info = -6;
-	if (*info) {
-	   i = -(*info);
-	   pxerr_dist ("pzgssvx3d", grid, -*info);
-	   return;
-	}
-
         /* Structures needed for parallel symbolic factorization */
     	int_t *sizes, *fstVtxSep;
 	int noDomains, nprocs_num;
@@ -1540,6 +1558,23 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 
     } /* process layer 0 done solve */
 
+#ifdef NRFRMT
+	zScatter_B3d(A3d, grid3d);
+	
+	/* free storage, which are allocated only in layer 0 */
+	if (grid3d->zscp.Iam == 0)
+	{ // free matrix A and B2d on 2D
+		// SUPERLU_FREE(Atmp.rowptr);
+		// SUPERLU_FREE(Atmp.colind);
+		// SUPERLU_FREE(Atmp.nzval);
+		// SUPERLU_FREE(B2d);
+	}
+
+	A->Store = Astore3d; // restore Astore to 3D
+	
+#endif
+
+    
 #if ( DEBUGlevel>=1 )
 	CHECK_MALLOC (iam, "Exit pzgssvx3d()");
 #endif
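
The pzgssvx3d hunks above wrap the existing 2D driver in a gather/solve/scatter/restore bracket. Stripped of error handling and the solver body, the control flow is roughly the following sketch (it assumes the declarations from superlu_zdefs.h and omits the other driver arguments):

    #include "superlu_zdefs.h"

    void solve_via_layer0(SuperMatrix *A, doublecomplex *B, int ldb, int nrhs,
                          gridinfo3d_t *grid3d)
    {
        /* Save the 3D views so they can be restored on return. */
        NRformat_loc *Astore3d = (NRformat_loc *) A->Store;
        doublecomplex *B3d = B;
        int ldb3d = ldb;

        /* Gather {A, B} from the 3D grid onto 2D process layer 0. */
        NRformat_loc3d *A3d = zGatherNRformat_loc3d(Astore3d, B, ldb, nrhs, grid3d);

        if (grid3d->zscp.Iam == 0) {
            /* Layer 0 re-points the inputs at the gathered 2D copies. */
            A->Store = A3d->A_nfmt;
            B = (doublecomplex *) A3d->B2d;
            ldb = A3d->A_nfmt->m_loc;
            /* ... ordering, factorization, and triangular solves on 2D grid-0 ... */
        }

        /* Scatter the solution back to the 3D grid and undo the swap. */
        zScatter_B3d(A3d, grid3d);
        A->Store = Astore3d;   /* restore the caller's 3D storage */
        B = B3d;               /* the caller's B now holds the solution */
        ldb = ldb3d;
    }

This is also why the input tests are hoisted ahead of the layer-0 block in the hunks above: they now run on every rank against the caller's 3D-distributed inputs (ldb is checked against Astore3d->m_loc) rather than against the re-pointed 2D views.
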
diff --git a/SRC/superlu_ddefs.h b/SRC/superlu_ddefs.h
index 28c2559a..6ecc0881 100644
--- a/SRC/superlu_ddefs.h
+++ b/SRC/superlu_ddefs.h
@@ -333,7 +333,8 @@ extern int     dcreate_matrix_dat(SuperMatrix *, int, double **, int *,
 			      double **, int *, FILE *, gridinfo_t *);
 extern int 	   dcreate_matrix_postfix(SuperMatrix *, int, double **, int *,
 				  double **, int *, FILE *, char *, gridinfo_t *);
-/*For 3D code */
+
+/*==== For 3D code ====*/
 extern int dcreate_matrix3d(SuperMatrix *A, int nrhs, double **rhs,
                      int *ldb, double **x, int *ldx,
                      FILE *fp, gridinfo3d_t *grid3d);
diff --git a/SRC/superlu_dist_config.h b/SRC/superlu_dist_config.h
index 2416fef5..ec3d9f9a 100644
--- a/SRC/superlu_dist_config.h
+++ b/SRC/superlu_dist_config.h
@@ -1,17 +1,7 @@
-/* superlu_dist_config.h.in */
-
-/* Enable parmetis */
+/* #define XSDK_INDEX_SIZE 64 */
+/* #define SLU_HAVE_LAPACK TRUE */
 #define HAVE_PARMETIS TRUE
-
-/* Enable LAPACK */
-/* #undef SLU_HAVE_LAPACK */
-
-/* Enable CombBLAS */
-/* #undef HAVE_COMBBLAS */
-
-/* enable 64bit index mode */
-/* #undef XSDK_INDEX_SIZE */
-
+/* #define HAVE_COMBBLAS TRUE */
 #if (XSDK_INDEX_SIZE == 64)
 #define _LONGINT 1
 #endif
diff --git a/SRC/superlu_zdefs.h b/SRC/superlu_zdefs.h
index f4c86e0b..710eeeb3 100644
--- a/SRC/superlu_zdefs.h
+++ b/SRC/superlu_zdefs.h
@@ -331,10 +331,9 @@ extern int     zcreate_matrix_rb(SuperMatrix *, int, doublecomplex **, int *,
 			      doublecomplex **, int *, FILE *, gridinfo_t *);
 extern int     zcreate_matrix_dat(SuperMatrix *, int, doublecomplex **, int *,
 			      doublecomplex **, int *, FILE *, gridinfo_t *);
-extern int 	   zcreate_matrix_postfix(SuperMatrix *, int, doublecomplex **, int *,
+extern int zcreate_matrix_postfix(SuperMatrix *, int, doublecomplex **, int *,
 				  doublecomplex **, int *, FILE *, char *, gridinfo_t *);
 
-
 /* Driver related */
 extern void    zgsequ_dist (SuperMatrix *, double *, double *, double *,
 			    double *, double *, int_t *);
@@ -556,7 +555,20 @@ extern void ztrtri_(char*, char*, int*, doublecomplex*, int*, int*);
 
 
 /*==== For 3D code ====*/
-
+extern int zcreate_matrix3d(SuperMatrix *A, int nrhs, doublecomplex **rhs,
+                     int *ldb, doublecomplex **x, int *ldx,
+                     FILE *fp, gridinfo3d_t *grid3d);
+extern int zcreate_matrix_postfix3d(SuperMatrix *A, int nrhs,
+             doublecomplex **rhs, int *ldb, doublecomplex **x, int *ldx,
+	     FILE *fp, char * postfix, gridinfo3d_t *grid3d);
+    
+/* Gather a matrix distributed in NRformat_loc on the 3D process grid
+   into an NRformat_loc distributed on 2D grid-0 */
+extern NRformat_loc3d *zGatherNRformat_loc3d(NRformat_loc *A, doublecomplex *B,
+					     int ldb, int nrhs,
+					     gridinfo3d_t *grid3d);
+extern int zScatter_B3d(NRformat_loc3d *A3d, gridinfo3d_t *grid3d);
+    
 extern void pzgssvx3d (superlu_dist_options_t *, SuperMatrix *,
 		       ScalePermstruct_t *, doublecomplex B[], int ldb, int nrhs,
 		       gridinfo3d_t *, LUstruct_t *, SOLVEstruct_t *, 
diff --git a/SRC/znrformat_loc3d.c b/SRC/znrformat_loc3d.c
index b2a70a86..ecb0d64c 100644
--- a/SRC/znrformat_loc3d.c
+++ b/SRC/znrformat_loc3d.c
@@ -17,7 +17,7 @@ at the top-level directory.
  *
  * 
  * -- Distributed SuperLU routine (version 7.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley,
  * Oak Ridge National Lab.
  * October 22, 2020
  */
@@ -181,7 +181,7 @@ NRformat_loc3d *zGatherNRformat_loc3d(NRformat_loc *A, // input
 
     return A3d;
 
-} /* dGatherNRformat_loc3d */
+} /* zGatherNRformat_loc3d */
 
 /*
  * Scatter B (solution) from 2D process layer 0 to 3D grid
@@ -242,132 +242,3 @@ int zScatter_B3d(NRformat_loc3d *A3d,  // modified
 } /* dScatter_B3d */
 
 
-/*
- * THIS ROUTINE IS NOT USED ANYMORE.
- */
-NRformat_loc dGatherNRformat_loc(
-				 NRformat_loc *A, // input 
-                                 double *B,       // input
-				 int ldb,
-				 int nrhs,
-				 double **B2d,    // output
-                                 gridinfo3d_t *grid3d)
-{
-    NRformat_loc A2d;
-
-    // find number of nnzs
-    int_t *nnz_counts, *row_counts;
-    int *nnz_disp, *row_disp, *nnz_counts_int, *row_counts_int;
-    int *b_counts_int, *b_disp;
-    nnz_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t));
-    row_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t));
-    nnz_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
-    row_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
-    b_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
-    MPI_Gather(&A->nnz_loc, 1, mpi_int_t, nnz_counts,
-               1, mpi_int_t, 0, grid3d->zscp.comm);
-    MPI_Gather(&A->m_loc, 1, mpi_int_t, row_counts,
-               1, mpi_int_t, 0, grid3d->zscp.comm);
-    nnz_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
-    row_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
-    b_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
-
-    nnz_disp[0] = 0;
-    row_disp[0] = 0;
-    b_disp[0] = 0;
-    for (int i = 0; i < grid3d->npdep; i++)
-    {
-        nnz_disp[i + 1] = nnz_disp[i] + nnz_counts[i];
-        row_disp[i + 1] = row_disp[i] + row_counts[i];
-        b_disp[i + 1] = nrhs * row_disp[i + 1];
-        nnz_counts_int[i] = nnz_counts[i];
-        row_counts_int[i] = row_counts[i];
-        b_counts_int[i] = nrhs * row_counts[i];
-    }
-
-    if (grid3d->zscp.Iam == 0)
-    {
-        A2d.colind = SUPERLU_MALLOC(nnz_disp[grid3d->npdep] * sizeof(int_t));
-        A2d.nzval = SUPERLU_MALLOC(nnz_disp[grid3d->npdep] * sizeof(double));
-        A2d.rowptr = SUPERLU_MALLOC((row_disp[grid3d->npdep] + 1) * sizeof(int_t));
-        A2d.rowptr[0] = 0;
-    }
-
-    MPI_Gatherv(A->nzval, A->nnz_loc, MPI_DOUBLE, A2d.nzval,
-                nnz_counts_int, nnz_disp,
-                MPI_DOUBLE, 0, grid3d->zscp.comm);
-    MPI_Gatherv(A->colind, A->nnz_loc, mpi_int_t, A2d.colind,
-                nnz_counts_int, nnz_disp,
-                mpi_int_t, 0, grid3d->zscp.comm);
-    MPI_Gatherv(&A->rowptr[1], A->m_loc, mpi_int_t, &A2d.rowptr[1],
-                row_counts_int, row_disp,
-                mpi_int_t, 0, grid3d->zscp.comm);
-
-    if (grid3d->zscp.Iam == 0)
-    {
-        for (int i = 0; i < grid3d->npdep; i++)
-        {
-            for (int j = row_disp[i] + 1; j < row_disp[i + 1] + 1; j++)
-            {
-                // A2d.rowptr[j] += row_disp[i];
-                A2d.rowptr[j] += nnz_disp[i];
-            }
-        }
-        A2d.nnz_loc = nnz_disp[grid3d->npdep];
-        A2d.m_loc = row_disp[grid3d->npdep];
-#if 0	
-        A2d.fst_row = A->fst_row; // This is a bug
-#else
-        gridinfo_t *grid2d = &(grid3d->grid2d);
-        int procs2d = grid2d->nprow * grid2d->npcol;
-        int m_loc_2d = A2d.m_loc;
-        int *m_loc_2d_counts = SUPERLU_MALLOC(procs2d * sizeof(int));
-
-        MPI_Allgather(&m_loc_2d, 1, MPI_INT, m_loc_2d_counts, 1, MPI_INT, grid2d->comm);
-
-        int fst_row = 0;
-        for (int p = 0; p < procs2d; ++p)
-        {
-            if (grid2d->iam == p)
-                A2d.fst_row = fst_row;
-            fst_row += m_loc_2d_counts[p];
-        }
-
-        SUPERLU_FREE(m_loc_2d_counts);
-#endif
-    }
-    // Btmp <- compact(B)
-    // compacting B
-    double *Btmp;
-    Btmp = SUPERLU_MALLOC(A->m_loc * nrhs * sizeof(double));
-    matCopy(A->m_loc, nrhs, Btmp, A->m_loc, B, ldb);
-
-    double *B1;
-    if (grid3d->zscp.Iam == 0)
-    {
-        B1 = SUPERLU_MALLOC(A2d.m_loc * nrhs * sizeof(double));
-        *B2d = SUPERLU_MALLOC(A2d.m_loc * nrhs * sizeof(double));
-    }
-
-    // B1 <- gatherv(Btmp)
-    MPI_Gatherv(Btmp, nrhs * A->m_loc, MPI_DOUBLE, B1,
-                b_counts_int, b_disp,
-                MPI_DOUBLE, 0, grid3d->zscp.comm);
-
-    // B2d <- colMajor(B1)
-    if (grid3d->zscp.Iam == 0)
-    {
-        for (int i = 0; i < grid3d->npdep; ++i)
-        {
-            /* code */
-            matCopy(row_counts_int[i], nrhs, *B2d + row_disp[i], A2d.m_loc,
-                    B1 + nrhs * row_disp[i], row_counts_int[i]);
-        }
-
-        SUPERLU_FREE(B1);
-    }
-
-
-    return A2d;
-}
-

From 3c2484b4a601fa462b5d5c2a60d9b4262a26d97e Mon Sep 17 00:00:00 2001
From: Sherry Li 
Date: Sat, 31 Oct 2020 09:55:12 -0700
Subject: [PATCH 034/147] Working version of 3D code: 64bit indexing and
 complex. Added SUPERLU_FREE() at the end of the routines in
 xnrformat_loc3d() and pxgssvx3d().

---
 EXAMPLE/Makefile          |  2 +-
 SRC/dnrformat_loc3d.c     | 11 ++++++++---
 SRC/pdgssvx3d.c           | 22 ++++++++++++----------
 SRC/pzgssvx3d.c           | 19 ++++++++++---------
 SRC/superlu_dist_config.h | 16 +++++++++++++---
 SRC/supermatrix.h         |  4 +---
 SRC/znrformat_loc3d.c     |  9 +++++----
 7 files changed, 50 insertions(+), 33 deletions(-)

diff --git a/EXAMPLE/Makefile b/EXAMPLE/Makefile
index 0b7f30a1..f11ef225 100644
--- a/EXAMPLE/Makefile
+++ b/EXAMPLE/Makefile
@@ -38,7 +38,7 @@ DEXM2	= pddrive2.o dcreate_matrix.o dcreate_matrix_perturbed.o
 DEXM3	= pddrive3.o dcreate_matrix.o
 DEXM4	= pddrive4.o dcreate_matrix.o
 
-DEXM3D	= pddrive3d.o dcreate_matrix.o  dscatter3d.o dcreate_matrix3d.o 
+DEXM3D	= pddrive3d.o dcreate_matrix.o dcreate_matrix3d.o 
 
 #	   dtrfAux.o dtreeFactorization.o treeFactorization.o pd3dcomm.o superlu_grid3d.o pdgstrf3d.o
 DEXMG	= pddrive_ABglobal.o
diff --git a/SRC/dnrformat_loc3d.c b/SRC/dnrformat_loc3d.c
index 99add3f3..04707910 100644
--- a/SRC/dnrformat_loc3d.c
+++ b/SRC/dnrformat_loc3d.c
@@ -41,6 +41,7 @@ static void matCopy(int n, int m, double *Dst, int lddst, double *Src, int ldsrc
  *     Input:  {A, B, ldb} are distributed on 3D process grid
  *     Output: {A2d, B2d} are distributed on layer 0 2D process grid
  *             output is in the returned A3d->{} structure.
+ *             See supermatrix.h for the NRformat_loc3d{} structure.
  */
 NRformat_loc3d *dGatherNRformat_loc3d(NRformat_loc *A, // input
                                       double *B,       // input
@@ -172,12 +173,14 @@ NRformat_loc3d *dGatherNRformat_loc3d(NRformat_loc *A, // input
     A3d->row_counts_int = row_counts_int;
     A3d->row_disp = row_disp;
 
-#if 0
+#if 1
     /* free storage */
     SUPERLU_FREE(nnz_counts);
     SUPERLU_FREE(nnz_counts_int);
     SUPERLU_FREE(row_counts);
     SUPERLU_FREE(nnz_disp);
+    SUPERLU_FREE(Btmp);
+    SUPERLU_FREE(B1);
 #endif
 
     return A3d;
@@ -224,19 +227,21 @@ int dScatter_B3d(NRformat_loc3d *A3d,  // modified
     double *Btmp;
     Btmp = SUPERLU_MALLOC(A3d->m_loc * nrhs * sizeof(double));
 
-    // Bttmp <- scatterv(B1)
+    // Btmp <- scatterv(B1)
     MPI_Scatterv(B1, b_counts_int, b_disp, MPI_DOUBLE,
                  Btmp, nrhs * A3d->m_loc, MPI_DOUBLE, 0, grid3d->zscp.comm);
 
     // B <- colMajor(Btmp)
     matCopy(A3d->m_loc, nrhs, B, ldb, Btmp, A3d->m_loc);
 
-#if 0
+#if 1
     /* free storage */
     SUPERLU_FREE(A3d->b_counts_int);
     SUPERLU_FREE(A3d->b_disp);
     SUPERLU_FREE(A3d->row_counts_int);
     SUPERLU_FREE(A3d->row_disp);
+    SUPERLU_FREE(B1);
+    SUPERLU_FREE(Btmp);
 #endif
 
     return 0;
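
Both directions of this data movement lean on the file-static matCopy helper whose signature appears in the hunk headers above. Judging from calls such as matCopy(A3d->m_loc, nrhs, B, ldb, Btmp, A3d->m_loc), it is a plain column-major block copy between arrays with different leading dimensions; a plausible implementation consistent with those calls (a sketch, not necessarily the exact body):

    /* Copy an n-by-m column-major block from Src (leading dimension ldsrc)
       to Dst (leading dimension lddst). */
    static void matCopy(int n, int m, double *Dst, int lddst,
                        double *Src, int ldsrc)
    {
        for (int j = 0; j < m; ++j)        /* one right-hand side per column */
            for (int i = 0; i < n; ++i)
                Dst[i + j * lddst] = Src[i + j * ldsrc];
    }

The compaction is necessary because each rank's slice of B sits ldb-strided inside the caller's array, while MPI_Gatherv and MPI_Scatterv need the nrhs columns of a slice packed contiguously.
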
diff --git a/SRC/pdgssvx3d.c b/SRC/pdgssvx3d.c
index 71d3e2fb..f9f95af3 100644
--- a/SRC/pdgssvx3d.c
+++ b/SRC/pdgssvx3d.c
@@ -586,7 +586,7 @@ void pdgssvx3d(superlu_dist_options_t *options, SuperMatrix *A,
 	double *B2d;
 	NRformat_loc3d *A3d = dGatherNRformat_loc3d((NRformat_loc *)A->Store,
 						    B, ldb, nrhs, grid3d);
-	B2d = A3d->B2d; 
+	B2d = (double *) A3d->B2d; 
 	NRformat_loc *Astore0 = A3d->A_nfmt; 
 	NRformat_loc *A_orig = A->Store;
 
@@ -1751,17 +1751,19 @@ void pdgssvx3d(superlu_dist_options_t *options, SuperMatrix *A,
 	fflush(stdout);
 #endif
 
-	/* free storage, which are allocated only in layer 0 */
-	if (grid3d->zscp.Iam == 0)
-	{ // free matrix A and B2d on 2D
-		// SUPERLU_FREE(Atmp.rowptr);
-		// SUPERLU_FREE(Atmp.colind);
-		// SUPERLU_FREE(Atmp.nzval);
-		// SUPERLU_FREE(B2d);
-	}
-
 	A->Store = Astore3d; // restore Astore to 3D
 
+    /* free A2d and B2d, which are allocated only in 2D layer Grid_0 */
+    if (grid3d->zscp.Iam == 0) {
+       NRformat_loc *A2d = A3d->A_nfmt;
+       SUPERLU_FREE( A2d->rowptr );
+       SUPERLU_FREE( A2d->colind );
+       SUPERLU_FREE( A2d->nzval );
+       SUPERLU_FREE( A2d );         // free 2D structure
+       SUPERLU_FREE(A3d->B2d);
+       SUPERLU_FREE(A3d);           // free 3D structure
+    }
+    
 #if (DEBUGlevel >= 1)
 	CHECK_MALLOC(iam, "Exit pdgssvx3d()");
 #endif
diff --git a/SRC/pzgssvx3d.c b/SRC/pzgssvx3d.c
index 6b6d5a17..59e40a7f 100644
--- a/SRC/pzgssvx3d.c
+++ b/SRC/pzgssvx3d.c
@@ -1563,17 +1563,18 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 #ifdef NRFRMT
 	zScatter_B3d(A3d, grid3d);
 	
-	/* free storage, which are allocated only in layer 0 */
-	if (grid3d->zscp.Iam == 0)
-	{ // free matrix A and B2d on 2D
-		// SUPERLU_FREE(Atmp.rowptr);
-		// SUPERLU_FREE(Atmp.colind);
-		// SUPERLU_FREE(Atmp.nzval);
-		// SUPERLU_FREE(B2d);
-	}
-
 	A->Store = Astore3d; // restore Astore to 3D
 	
+	/* free A2d and B2d, which are allocated only in 2D layer Grid_0 */
+	if (grid3d->zscp.Iam == 0) {
+	    NRformat_loc *A2d = A3d->A_nfmt;
+	    SUPERLU_FREE( A2d->rowptr );
+	    SUPERLU_FREE( A2d->colind );
+	    SUPERLU_FREE( A2d->nzval );
+	    SUPERLU_FREE( A2d );         // free 2D structure
+	    SUPERLU_FREE(A3d->B2d);
+	    SUPERLU_FREE(A3d);           // free 3D structure
+	}
 #endif
 
     
diff --git a/SRC/superlu_dist_config.h b/SRC/superlu_dist_config.h
index ec3d9f9a..c3def71c 100644
--- a/SRC/superlu_dist_config.h
+++ b/SRC/superlu_dist_config.h
@@ -1,7 +1,17 @@
-/* #define XSDK_INDEX_SIZE 64 */
-/* #define SLU_HAVE_LAPACK TRUE */
+/* superlu_dist_config.h.in */
+
+/* Enable parmetis */
 #define HAVE_PARMETIS TRUE
-/* #define HAVE_COMBBLAS TRUE */
+
+/* Enable LAPACK */
+/* #undef SLU_HAVE_LAPACK */
+
+/* Enable CombBLAS */
+/* #undef HAVE_COMBBLAS */
+
+/* enable 64bit index mode */
+#define XSDK_INDEX_SIZE 64
+
 #if (XSDK_INDEX_SIZE == 64)
 #define _LONGINT 1
 #endif
diff --git a/SRC/supermatrix.h b/SRC/supermatrix.h
index f1c8c8b1..db70575d 100644
--- a/SRC/supermatrix.h
+++ b/SRC/supermatrix.h
@@ -193,12 +193,10 @@ typedef struct NRformat_loc3d
 {
     NRformat_loc* A_nfmt; 
     void* B;   // distributed on 3D process grid
-    //double* B;   // distributed on 3D process grid
     int  ldb;
     int nrhs;
     int m_loc; 
-    void* B2d; // on 2D process layer
-    //double* B2d; // on 2D process layer
+    void* B2d; // on 2D process layer Grid_0
 
     int* row_counts_int; // these counts are for {A, B} distributed on 2D layer 0
     int* row_disp;
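
Declaring B and B2d as void* is what lets a single NRformat_loc3d definition serve both the real and complex drivers; each precision casts at the point of use, as pdgssvx3d and pzgssvx3d do:

    double        *B2d_d = (double *)        A3d->B2d;   /* in d-precision code */
    doublecomplex *B2d_z = (doublecomplex *) A3d->B2d;   /* in z-precision code */

The alternative, a per-precision struct, would have duplicated the counts/displacement bookkeeping fields that the scatter step carries between the two calls.
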
diff --git a/SRC/znrformat_loc3d.c b/SRC/znrformat_loc3d.c
index ecb0d64c..5a246101 100644
--- a/SRC/znrformat_loc3d.c
+++ b/SRC/znrformat_loc3d.c
@@ -40,6 +40,7 @@ static void matCopy(int n, int m, doublecomplex *Dst, int lddst, doublecomplex *
  *     Input:  {A, B, ldb} are distributed on 3D process grid
  *     Output: {A2d, B2d} are distributed on layer 0 2D process grid
  *             output is in the returned A3d->{} structure.
+ *             See supermatrix.h for the NRformat_loc3d{} structure.
  */
 NRformat_loc3d *zGatherNRformat_loc3d(NRformat_loc *A, // input
                                       doublecomplex *B,       // input
@@ -171,13 +172,13 @@ NRformat_loc3d *zGatherNRformat_loc3d(NRformat_loc *A, // input
     A3d->row_counts_int = row_counts_int;
     A3d->row_disp = row_disp;
 
-#if 0
     /* free storage */
     SUPERLU_FREE(nnz_counts);
     SUPERLU_FREE(nnz_counts_int);
     SUPERLU_FREE(row_counts);
     SUPERLU_FREE(nnz_disp);
-#endif
+    SUPERLU_FREE(Btmp);
+    SUPERLU_FREE(B1);
 
     return A3d;
 
@@ -230,13 +231,13 @@ int zScatter_B3d(NRformat_loc3d *A3d,  // modified
     // B <- colMajor(Btmp)
     matCopy(A3d->m_loc, nrhs, B, ldb, Btmp, A3d->m_loc);
 
-#if 0
     /* free storage */
     SUPERLU_FREE(A3d->b_counts_int);
     SUPERLU_FREE(A3d->b_disp);
     SUPERLU_FREE(A3d->row_counts_int);
     SUPERLU_FREE(A3d->row_disp);
-#endif
+    SUPERLU_FREE(Btmp);
+    SUPERLU_FREE(B1);
 
     return 0;
 } /* dScatter_B3d */

From 9d37d26de53a0062c193fa37416270057325001c Mon Sep 17 00:00:00 2001
From: 7ps 
Date: Thu, 5 Nov 2020 15:17:31 -0500
Subject: [PATCH 035/147] Fixing some double frees

---
 SRC/dnrformat_loc3d.c     | 31 ++++++++++++++-----------------
 SRC/superlu_dist_config.h |  2 +-
 2 files changed, 15 insertions(+), 18 deletions(-)

diff --git a/SRC/dnrformat_loc3d.c b/SRC/dnrformat_loc3d.c
index 04707910..439a51a3 100644
--- a/SRC/dnrformat_loc3d.c
+++ b/SRC/dnrformat_loc3d.c
@@ -9,8 +9,6 @@ The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
-
-
 /*! @file
  * \brief Preprocessing routines for the 3D factorization/solve codes:
  *        - Gather {A,B} from 3D grid to 2D process layer 0
@@ -43,15 +41,15 @@ static void matCopy(int n, int m, double *Dst, int lddst, double *Src, int ldsrc
  *             output is in the returned A3d->{} structure.
  *             see supermatrix.h for nrformat_loc3d{} structure.
  */
-NRformat_loc3d *dGatherNRformat_loc3d(NRformat_loc *A, // input
-                                      double *B,       // input
-				      int ldb, int nrhs, // input
+NRformat_loc3d *dGatherNRformat_loc3d(NRformat_loc *A,   // input
+                                      double *B,         // input
+                                      int ldb, int nrhs, // input
                                       gridinfo3d_t *grid3d)
 {
     NRformat_loc3d *A3d = SUPERLU_MALLOC(sizeof(NRformat_loc3d));
     NRformat_loc *A2d = SUPERLU_MALLOC(sizeof(NRformat_loc));
     A3d->m_loc = A->m_loc;
-    A3d->B = (double *) B; // on 3D process grid
+    A3d->B = (double *)B; // on 3D process grid
     A3d->ldb = ldb;
     A3d->nrhs = nrhs;
 
@@ -146,7 +144,7 @@ NRformat_loc3d *dGatherNRformat_loc3d(NRformat_loc *A, // input
     if (grid3d->zscp.Iam == 0)
     {
         B1 = SUPERLU_MALLOC(A2d->m_loc * nrhs * sizeof(double));
-        A3d->B2d = (double *) SUPERLU_MALLOC(A2d->m_loc * nrhs * sizeof(double));
+        A3d->B2d = (double *)SUPERLU_MALLOC(A2d->m_loc * nrhs * sizeof(double));
     }
 
     // B1 <- gatherv(Btmp)
@@ -160,8 +158,8 @@ NRformat_loc3d *dGatherNRformat_loc3d(NRformat_loc *A, // input
         for (int i = 0; i < grid3d->npdep; ++i)
         {
             /* code */
-            matCopy(row_counts_int[i], nrhs, ((double*)A3d->B2d) + row_disp[i],
-		    A2d->m_loc, B1 + nrhs * row_disp[i], row_counts_int[i]);
+            matCopy(row_counts_int[i], nrhs, ((double *)A3d->B2d) + row_disp[i],
+                    A2d->m_loc, B1 + nrhs * row_disp[i], row_counts_int[i]);
         }
 
         SUPERLU_FREE(B1);
@@ -180,7 +178,7 @@ NRformat_loc3d *dGatherNRformat_loc3d(NRformat_loc *A, // input
     SUPERLU_FREE(row_counts);
     SUPERLU_FREE(nnz_disp);
     SUPERLU_FREE(Btmp);
-    SUPERLU_FREE(B1);
+    // SUPERLU_FREE(B1);
 #endif
 
     return A3d;
@@ -191,14 +189,14 @@ NRformat_loc3d *dGatherNRformat_loc3d(NRformat_loc *A, // input
  * Scatter B (solution) from 2D process layer 0 to 3D grid
  *   Output: X2d <- A^{-1} B2d
  */
-int dScatter_B3d(NRformat_loc3d *A3d,  // modified
-		 gridinfo3d_t *grid3d)
+int dScatter_B3d(NRformat_loc3d *A3d, // modified
+                 gridinfo3d_t *grid3d)
 {
 
-    double *B = (double *) A3d->B;
+    double *B = (double *)A3d->B;
     int ldb = A3d->ldb;
     int nrhs = A3d->nrhs;
-    double *B2d = (double *) A3d->B2d;
+    double *B2d = (double *)A3d->B2d;
     NRformat_loc A2d = *(A3d->A_nfmt);
     int m_loc = A3d->m_loc;
     int *b_counts_int = A3d->b_counts_int;
@@ -240,11 +238,10 @@ int dScatter_B3d(NRformat_loc3d *A3d,  // modified
     SUPERLU_FREE(A3d->b_disp);
     SUPERLU_FREE(A3d->row_counts_int);
     SUPERLU_FREE(A3d->row_disp);
-    SUPERLU_FREE(B1);
+    if (grid3d->zscp.Iam == 0)
+        SUPERLU_FREE(B1);
     SUPERLU_FREE(Btmp);
 #endif
 
     return 0;
 } /* dScatter_B3d */
-
-
diff --git a/SRC/superlu_dist_config.h b/SRC/superlu_dist_config.h
index c3def71c..5bca8f26 100644
--- a/SRC/superlu_dist_config.h
+++ b/SRC/superlu_dist_config.h
@@ -10,7 +10,7 @@
 /* #undef HAVE_COMBBLAS */
 
 /* enable 64bit index mode */
-#define XSDK_INDEX_SIZE 64
+// #define XSDK_INDEX_SIZE 64
 
 #if (XSDK_INDEX_SIZE == 64)
 #define _LONGINT 1

From 8a3e0b31cd1bae49616088cab1066d1120c560c3 Mon Sep 17 00:00:00 2001
From: 7ps 
Date: Thu, 5 Nov 2020 15:55:50 -0500
Subject: [PATCH 036/147] Fixing the double free in the complex version

---
 SRC/znrformat_loc3d.c | 30 ++++++++++++++----------------
 1 file changed, 14 insertions(+), 16 deletions(-)

diff --git a/SRC/znrformat_loc3d.c b/SRC/znrformat_loc3d.c
index 5a246101..7c9c0ba2 100644
--- a/SRC/znrformat_loc3d.c
+++ b/SRC/znrformat_loc3d.c
@@ -9,7 +9,6 @@ The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
-
 /*! @file
  * \brief Preprocessing routines for the 3D factorization/solve codes:
  *        - Gather {A,B} from 3D grid to 2D process layer 0
@@ -42,15 +41,15 @@ static void matCopy(int n, int m, doublecomplex *Dst, int lddst, doublecomplex *
  *             output is in the returned A3d->{} structure.
  *             see supermatrix.h for nrformat_loc3d{} structure.
  */
-NRformat_loc3d *zGatherNRformat_loc3d(NRformat_loc *A, // input
-                                      doublecomplex *B,       // input
-				      int ldb, int nrhs, // input
+NRformat_loc3d *zGatherNRformat_loc3d(NRformat_loc *A,   // input
+                                      doublecomplex *B,  // input
+                                      int ldb, int nrhs, // input
                                       gridinfo3d_t *grid3d)
 {
     NRformat_loc3d *A3d = SUPERLU_MALLOC(sizeof(NRformat_loc3d));
     NRformat_loc *A2d = SUPERLU_MALLOC(sizeof(NRformat_loc));
     A3d->m_loc = A->m_loc;
-    A3d->B = (doublecomplex *) B; // on 3D process grid
+    A3d->B = (doublecomplex *)B; // on 3D process grid
     A3d->ldb = ldb;
     A3d->nrhs = nrhs;
 
@@ -145,7 +144,7 @@ NRformat_loc3d *zGatherNRformat_loc3d(NRformat_loc *A, // input
     if (grid3d->zscp.Iam == 0)
     {
         B1 = SUPERLU_MALLOC(A2d->m_loc * nrhs * sizeof(doublecomplex));
-        A3d->B2d = (doublecomplex *) SUPERLU_MALLOC(A2d->m_loc * nrhs * sizeof(doublecomplex));
+        A3d->B2d = (doublecomplex *)SUPERLU_MALLOC(A2d->m_loc * nrhs * sizeof(doublecomplex));
     }
 
     // B1 <- gatherv(Btmp)
@@ -159,8 +158,8 @@ NRformat_loc3d *zGatherNRformat_loc3d(NRformat_loc *A, // input
         for (int i = 0; i < grid3d->npdep; ++i)
         {
             /* code */
-            matCopy(row_counts_int[i], nrhs, ((doublecomplex*)A3d->B2d) + row_disp[i],
-		    A2d->m_loc, B1 + nrhs * row_disp[i], row_counts_int[i]);
+            matCopy(row_counts_int[i], nrhs, ((doublecomplex *)A3d->B2d) + row_disp[i],
+                    A2d->m_loc, B1 + nrhs * row_disp[i], row_counts_int[i]);
         }
 
         SUPERLU_FREE(B1);
@@ -178,7 +177,7 @@ NRformat_loc3d *zGatherNRformat_loc3d(NRformat_loc *A, // input
     SUPERLU_FREE(row_counts);
     SUPERLU_FREE(nnz_disp);
     SUPERLU_FREE(Btmp);
-    SUPERLU_FREE(B1);
+    // SUPERLU_FREE(B1);
 
     return A3d;
 
@@ -188,14 +187,14 @@ NRformat_loc3d *zGatherNRformat_loc3d(NRformat_loc *A, // input
  * Scatter B (solution) from 2D process layer 0 to 3D grid
  *   Output: X2d <- A^{-1} B2d
  */
-int zScatter_B3d(NRformat_loc3d *A3d,  // modified
-		 gridinfo3d_t *grid3d)
+int zScatter_B3d(NRformat_loc3d *A3d, // modified
+                 gridinfo3d_t *grid3d)
 {
 
-    doublecomplex *B = (doublecomplex *) A3d->B;
+    doublecomplex *B = (doublecomplex *)A3d->B;
     int ldb = A3d->ldb;
     int nrhs = A3d->nrhs;
-    doublecomplex *B2d = (doublecomplex *) A3d->B2d;
+    doublecomplex *B2d = (doublecomplex *)A3d->B2d;
     NRformat_loc A2d = *(A3d->A_nfmt);
     int m_loc = A3d->m_loc;
     int *b_counts_int = A3d->b_counts_int;
@@ -237,9 +236,8 @@ int zScatter_B3d(NRformat_loc3d *A3d,  // modified
     SUPERLU_FREE(A3d->row_counts_int);
     SUPERLU_FREE(A3d->row_disp);
     SUPERLU_FREE(Btmp);
-    SUPERLU_FREE(B1);
+    if (grid3d->zscp.Iam == 0)
+        SUPERLU_FREE(B1);
 
     return 0;
 } /* dScatter_B3d */
-
-

From 572e62e9d3954f5a95051fd811c15c08f067774b Mon Sep 17 00:00:00 2001
From: piyush sao 
Date: Thu, 5 Nov 2020 19:50:31 -0500
Subject: [PATCH 037/147] changing it back

---
 SRC/superlu_dist_config.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/SRC/superlu_dist_config.h b/SRC/superlu_dist_config.h
index 5bca8f26..c3def71c 100644
--- a/SRC/superlu_dist_config.h
+++ b/SRC/superlu_dist_config.h
@@ -10,7 +10,7 @@
 /* #undef HAVE_COMBBLAS */
 
 /* enable 64bit index mode */
-// #define XSDK_INDEX_SIZE 64
+#define XSDK_INDEX_SIZE 64
 
 #if (XSDK_INDEX_SIZE == 64)
 #define _LONGINT 1

From 4fb315c58acac3a85c29fd18725bee51744d5aa4 Mon Sep 17 00:00:00 2001
From: Sherry Li 
Date: Fri, 6 Nov 2020 17:55:08 -0800
Subject: [PATCH 038/147] Modify superlu_gridmap() and superlu_gridinit() so
 that they work correctly when the superlu group does not include all the
 processes. Update SRC/CMakeLists.txt: add ${MPI_CXX_LIBRARIES} in
 set(superlu_dist_libs ...). Some changes in pxdistribute.c to avoid valgrind
 complaints about conditional jumps or moves depending on uninitialized
 memory.

---
 EXAMPLE/Makefile            |  7 +++----
 EXAMPLE/pddrive.c           |  2 +-
 EXAMPLE/pddrive1.c          |  2 +-
 EXAMPLE/pddrive1_ABglobal.c |  3 +--
 EXAMPLE/pddrive2.c          |  2 +-
 EXAMPLE/pddrive2_ABglobal.c |  3 +--
 EXAMPLE/pddrive3.c          |  2 +-
 EXAMPLE/pddrive3_ABglobal.c |  3 +--
 EXAMPLE/pddrive4.c          |  2 +-
 EXAMPLE/pddrive4_ABglobal.c |  2 +-
 EXAMPLE/pddrive_ABglobal.c  |  3 +--
 EXAMPLE/pddrive_spawn.c     |  2 +-
 EXAMPLE/pzdrive.c           |  2 +-
 EXAMPLE/pzdrive1.c          |  2 +-
 EXAMPLE/pzdrive1_ABglobal.c |  3 +--
 EXAMPLE/pzdrive2.c          |  2 +-
 EXAMPLE/pzdrive2_ABglobal.c |  3 +--
 EXAMPLE/pzdrive3.c          |  2 +-
 EXAMPLE/pzdrive3_ABglobal.c |  3 +--
 EXAMPLE/pzdrive4_ABglobal.c |  2 +-
 EXAMPLE/pzdrive_ABglobal.c  |  4 +---
 EXAMPLE/pzdrive_spawn.c     |  2 +-
 SRC/get_perm_c_parmetis.c   |  8 +++++---
 SRC/pddistribute.c          | 13 ++++++++-----
 SRC/pzdistribute.c          | 13 ++++++++-----
 SRC/superlu_dist_config.h   |  2 +-
 SRC/superlu_grid.c          | 32 ++++++++++++++++++++++----------
 27 files changed, 68 insertions(+), 58 deletions(-)

diff --git a/EXAMPLE/Makefile b/EXAMPLE/Makefile
index d74d31bb..db8da572 100644
--- a/EXAMPLE/Makefile
+++ b/EXAMPLE/Makefile
@@ -30,9 +30,8 @@
 #######################################################################
 include ../make.inc
 
-DEXM	= pddrive.o dcreate_matrix.o sp_ienv.o #pdgstrf2.o 
-#pdgssvx.o
-# pdgstrs_lsum_X1.o pdgstrf_X1.o
+DEXM	= pddrive.o dcreate_matrix.o sp_ienv.o 
+	#pdgssvx.o pdgstrf2.o
 DEXM1	= pddrive1.o dcreate_matrix.o
 DEXM2	= pddrive2.o dcreate_matrix.o dcreate_matrix_perturbed.o
 DEXM3	= pddrive3.o dcreate_matrix.o
@@ -65,7 +64,7 @@ complex16: pzdrive pzdrive1 pzdrive2 pzdrive3 pzdrive4 \
 	   pzdrive_ABglobal pzdrive1_ABglobal pzdrive2_ABglobal \
 	   pzdrive3_ABglobal pzdrive4_ABglobal
 
-pddrive: $(DEXM) $(DSUPERLULIB)
+pddrive: $(DEXM) $(DSUPERLULIB) 
 	$(LOADER) $(LOADOPTS) $(DEXM) $(LIBS) -lm -o $@
 
 pddrive1: $(DEXM1) $(DSUPERLULIB)
diff --git a/EXAMPLE/pddrive.c b/EXAMPLE/pddrive.c
index fd143ef7..0e199880 100644
--- a/EXAMPLE/pddrive.c
+++ b/EXAMPLE/pddrive.c
@@ -140,7 +140,7 @@ int main(int argc, char *argv[])
 	
     /* Bail out if I do not belong in the grid. */
     iam = grid.iam;
-    if ( iam >= nprow * npcol )	goto out;
+    if ( iam >= nprow * npcol || iam ==-1 ) goto out;
     if ( !iam ) {
 	int v_major, v_minor, v_bugfix;
 #ifdef __INTEL_COMPILER
diff --git a/EXAMPLE/pddrive1.c b/EXAMPLE/pddrive1.c
index 8e5396a9..a3ddd3da 100644
--- a/EXAMPLE/pddrive1.c
+++ b/EXAMPLE/pddrive1.c
@@ -104,7 +104,7 @@ int main(int argc, char *argv[])
 
     /* Bail out if I do not belong in the grid. */
     iam = grid.iam;
-    if ( iam >= nprow * npcol )	goto out;
+    if ( iam == -1 ) goto out;
     if ( !iam ) {
 	int v_major, v_minor, v_bugfix;
 #ifdef __INTEL_COMPILER
diff --git a/EXAMPLE/pddrive1_ABglobal.c b/EXAMPLE/pddrive1_ABglobal.c
index ad74edc6..7f06b70e 100644
--- a/EXAMPLE/pddrive1_ABglobal.c
+++ b/EXAMPLE/pddrive1_ABglobal.c
@@ -105,8 +105,7 @@ int main(int argc, char *argv[])
 
     /* Bail out if I do not belong in the grid. */
     iam = grid.iam;
-    if ( iam >= nprow * npcol )
-	goto out;
+    if ( iam == -1 ) goto out;
     
 #if ( DEBUGlevel>=1 )
     CHECK_MALLOC(iam, "Enter main()");
diff --git a/EXAMPLE/pddrive2.c b/EXAMPLE/pddrive2.c
index 5ad5a3ae..8007cdf2 100644
--- a/EXAMPLE/pddrive2.c
+++ b/EXAMPLE/pddrive2.c
@@ -116,7 +116,7 @@ int main(int argc, char *argv[])
 
     /* Bail out if I do not belong in the grid. */
     iam = grid.iam;
-    if ( iam >= nprow * npcol )	goto out;
+    if ( iam == -1 )	goto out;
     if ( !iam ) {
 	int v_major, v_minor, v_bugfix;
 #ifdef __INTEL_COMPILER
diff --git a/EXAMPLE/pddrive2_ABglobal.c b/EXAMPLE/pddrive2_ABglobal.c
index fad13d43..57ebadf3 100644
--- a/EXAMPLE/pddrive2_ABglobal.c
+++ b/EXAMPLE/pddrive2_ABglobal.c
@@ -105,8 +105,7 @@ int main(int argc, char *argv[])
 
     /* Bail out if I do not belong in the grid. */
     iam = grid.iam;
-    if ( iam >= nprow * npcol )
-	goto out;
+    if ( iam == -1 )	goto out;
     
 #if ( DEBUGlevel>=1 )
     CHECK_MALLOC(iam, "Enter main()");
diff --git a/EXAMPLE/pddrive3.c b/EXAMPLE/pddrive3.c
index 4287cae0..88a54b05 100644
--- a/EXAMPLE/pddrive3.c
+++ b/EXAMPLE/pddrive3.c
@@ -113,7 +113,7 @@ int main(int argc, char *argv[])
 
     /* Bail out if I do not belong in the grid. */
     iam = grid.iam;
-    if ( iam >= nprow * npcol )	goto out;
+    if ( iam == -1 )	goto out;
     if ( !iam ) {
 	int v_major, v_minor, v_bugfix;
 #ifdef __INTEL_COMPILER
diff --git a/EXAMPLE/pddrive3_ABglobal.c b/EXAMPLE/pddrive3_ABglobal.c
index 775dc5a7..2e2a7433 100644
--- a/EXAMPLE/pddrive3_ABglobal.c
+++ b/EXAMPLE/pddrive3_ABglobal.c
@@ -111,8 +111,7 @@ int main(int argc, char *argv[])
 
     /* Bail out if I do not belong in the grid. */
     iam = grid.iam;
-    if ( iam >= nprow * npcol )
-	goto out;
+    if ( iam == -1 )	goto out;
     
 #if ( DEBUGlevel>=1 )
     CHECK_MALLOC(iam, "Enter main()");
diff --git a/EXAMPLE/pddrive4.c b/EXAMPLE/pddrive4.c
index d7289de5..cc5caf63 100644
--- a/EXAMPLE/pddrive4.c
+++ b/EXAMPLE/pddrive4.c
@@ -130,7 +130,7 @@ int main(int argc, char *argv[])
 
     /* Bail out if I do not belong in any of the 2 grids. */
     MPI_Comm_rank( MPI_COMM_WORLD, &iam );
-    if ( iam >= 10 ) goto out;
+    if ( iam == -1 ) goto out;
     
 #if ( DEBUGlevel>=1 )
     CHECK_MALLOC(iam, "Enter main()");
diff --git a/EXAMPLE/pddrive4_ABglobal.c b/EXAMPLE/pddrive4_ABglobal.c
index f870de85..7ec43282 100644
--- a/EXAMPLE/pddrive4_ABglobal.c
+++ b/EXAMPLE/pddrive4_ABglobal.c
@@ -126,7 +126,7 @@ int main(int argc, char *argv[])
 
     /* Bail out if I do not belong in any of the 2 grids. */
     MPI_Comm_rank( MPI_COMM_WORLD, &iam );
-    if ( iam >= 10 ) goto out;
+    if ( iam == -1 ) goto out;
     
 #if ( DEBUGlevel>=1 )
     CHECK_MALLOC(iam, "Enter main()");
diff --git a/EXAMPLE/pddrive_ABglobal.c b/EXAMPLE/pddrive_ABglobal.c
index d6719e41..a47388b5 100644
--- a/EXAMPLE/pddrive_ABglobal.c
+++ b/EXAMPLE/pddrive_ABglobal.c
@@ -106,8 +106,7 @@ int main(int argc, char *argv[])
 
     /* Bail out if I do not belong in the grid. */
     iam = grid.iam;
-    if ( iam >= nprow * npcol )
-	goto out;
+    if ( iam == -1 )	goto out;
 
 #if ( VAMPIR>=1 )
     VT_traceoff();
diff --git a/EXAMPLE/pddrive_spawn.c b/EXAMPLE/pddrive_spawn.c
index 47b04729..97803d87 100755
--- a/EXAMPLE/pddrive_spawn.c
+++ b/EXAMPLE/pddrive_spawn.c
@@ -151,7 +151,7 @@ int main(int argc, char *argv[])
 	
     /* Bail out if I do not belong in the grid. */
     iam = grid.iam;
-    if ( iam >= nprow * npcol )	goto out;
+    if ( iam == -1 )	goto out;
     if ( !iam ) {
 	int v_major, v_minor, v_bugfix;
 #ifdef __INTEL_COMPILER
diff --git a/EXAMPLE/pzdrive.c b/EXAMPLE/pzdrive.c
index bf6c3e9b..927a059d 100644
--- a/EXAMPLE/pzdrive.c
+++ b/EXAMPLE/pzdrive.c
@@ -138,7 +138,7 @@ int main(int argc, char *argv[])
 	
     /* Bail out if I do not belong in the grid. */
     iam = grid.iam;
-    if ( iam >= nprow * npcol )	goto out;
+    if ( iam == -1 )	goto out;
     if ( !iam ) {
 	int v_major, v_minor, v_bugfix;
 #ifdef __INTEL_COMPILER
diff --git a/EXAMPLE/pzdrive1.c b/EXAMPLE/pzdrive1.c
index 6718cd0e..c0713bad 100644
--- a/EXAMPLE/pzdrive1.c
+++ b/EXAMPLE/pzdrive1.c
@@ -103,7 +103,7 @@ int main(int argc, char *argv[])
 
     /* Bail out if I do not belong in the grid. */
     iam = grid.iam;
-    if ( iam >= nprow * npcol )	goto out;
+    if ( iam == -1 )	goto out;
     if ( !iam ) {
 	int v_major, v_minor, v_bugfix;
 #ifdef __INTEL_COMPILER
diff --git a/EXAMPLE/pzdrive1_ABglobal.c b/EXAMPLE/pzdrive1_ABglobal.c
index d2a048d4..4437e4a8 100644
--- a/EXAMPLE/pzdrive1_ABglobal.c
+++ b/EXAMPLE/pzdrive1_ABglobal.c
@@ -104,8 +104,7 @@ int main(int argc, char *argv[])
 
     /* Bail out if I do not belong in the grid. */
     iam = grid.iam;
-    if ( iam >= nprow * npcol )
-	goto out;
+    if ( iam == -1 )	goto out;
     
 #if ( DEBUGlevel>=1 )
     CHECK_MALLOC(iam, "Enter main()");
diff --git a/EXAMPLE/pzdrive2.c b/EXAMPLE/pzdrive2.c
index df9b2263..000d847f 100644
--- a/EXAMPLE/pzdrive2.c
+++ b/EXAMPLE/pzdrive2.c
@@ -115,7 +115,7 @@ int main(int argc, char *argv[])
 
     /* Bail out if I do not belong in the grid. */
     iam = grid.iam;
-    if ( iam >= nprow * npcol )	goto out;
+    if ( iam == -1 )	goto out;
     if ( !iam ) {
 	int v_major, v_minor, v_bugfix;
 #ifdef __INTEL_COMPILER
diff --git a/EXAMPLE/pzdrive2_ABglobal.c b/EXAMPLE/pzdrive2_ABglobal.c
index b6c00cbb..9959465b 100644
--- a/EXAMPLE/pzdrive2_ABglobal.c
+++ b/EXAMPLE/pzdrive2_ABglobal.c
@@ -104,8 +104,7 @@ int main(int argc, char *argv[])
 
     /* Bail out if I do not belong in the grid. */
     iam = grid.iam;
-    if ( iam >= nprow * npcol )
-	goto out;
+    if ( iam == -1 )	goto out;
     
 #if ( DEBUGlevel>=1 )
     CHECK_MALLOC(iam, "Enter main()");
diff --git a/EXAMPLE/pzdrive3.c b/EXAMPLE/pzdrive3.c
index c00415d6..3c2fb931 100644
--- a/EXAMPLE/pzdrive3.c
+++ b/EXAMPLE/pzdrive3.c
@@ -112,7 +112,7 @@ int main(int argc, char *argv[])
 
     /* Bail out if I do not belong in the grid. */
     iam = grid.iam;
-    if ( iam >= nprow * npcol )	goto out;
+    if ( iam == -1 )	goto out;
     if ( !iam ) {
 	int v_major, v_minor, v_bugfix;
 #ifdef __INTEL_COMPILER
diff --git a/EXAMPLE/pzdrive3_ABglobal.c b/EXAMPLE/pzdrive3_ABglobal.c
index e1f21ce1..c83cf1a3 100644
--- a/EXAMPLE/pzdrive3_ABglobal.c
+++ b/EXAMPLE/pzdrive3_ABglobal.c
@@ -110,8 +110,7 @@ int main(int argc, char *argv[])
 
     /* Bail out if I do not belong in the grid. */
     iam = grid.iam;
-    if ( iam >= nprow * npcol )
-	goto out;
+    if ( iam == -1 ) goto out;
     
 #if ( DEBUGlevel>=1 )
     CHECK_MALLOC(iam, "Enter main()");
diff --git a/EXAMPLE/pzdrive4_ABglobal.c b/EXAMPLE/pzdrive4_ABglobal.c
index 4ec17583..d65aa74f 100644
--- a/EXAMPLE/pzdrive4_ABglobal.c
+++ b/EXAMPLE/pzdrive4_ABglobal.c
@@ -125,7 +125,7 @@ int main(int argc, char *argv[])
 
     /* Bail out if I do not belong in any of the 2 grids. */
     MPI_Comm_rank( MPI_COMM_WORLD, &iam );
-    if ( iam >= 10 ) goto out;
+    if ( iam == -1 ) goto out;
     
 #if ( DEBUGlevel>=1 )
     CHECK_MALLOC(iam, "Enter main()");
diff --git a/EXAMPLE/pzdrive_ABglobal.c b/EXAMPLE/pzdrive_ABglobal.c
index 5871f7b8..c3d798c1 100644
--- a/EXAMPLE/pzdrive_ABglobal.c
+++ b/EXAMPLE/pzdrive_ABglobal.c
@@ -105,9 +105,7 @@ int main(int argc, char *argv[])
 
     /* Bail out if I do not belong in the grid. */
     iam = grid.iam;
-    if ( iam >= nprow * npcol )
-	goto out;
-
+    if ( iam == -1 )	goto out;
 
 #if ( DEBUGlevel>=1 )
     CHECK_MALLOC(iam, "Enter main()");
diff --git a/EXAMPLE/pzdrive_spawn.c b/EXAMPLE/pzdrive_spawn.c
index faf725c6..c266971d 100755
--- a/EXAMPLE/pzdrive_spawn.c
+++ b/EXAMPLE/pzdrive_spawn.c
@@ -151,7 +151,7 @@ int main(int argc, char *argv[])
 	
     /* Bail out if I do not belong in the grid. */
     iam = grid.iam;
-    if ( iam >= nprow * npcol )	goto out;
+    if ( iam == -1 )	goto out;
     if ( !iam ) {
 	int v_major, v_minor, v_bugfix;
 #ifdef __INTEL_COMPILER
diff --git a/SRC/get_perm_c_parmetis.c b/SRC/get_perm_c_parmetis.c
index 6381f4e5..b08c35ab 100644
--- a/SRC/get_perm_c_parmetis.c
+++ b/SRC/get_perm_c_parmetis.c
@@ -122,7 +122,7 @@ get_perm_c_parmetis (SuperMatrix *A, int_t *perm_r, int_t *perm_c,
   int_t m, n, bnz, i, j;
   int_t *rowptr, *colind, *l_fstVtxSep, *l_sizes;
   int_t *b_rowptr, *b_colind;
-  int_t *dist_order;
+  int_t *dist_order = NULL;
   int  *recvcnts, *displs;
   /* first row index on each processor when the matrix is distributed
      on nprocs (vtxdist_i) or noDomains processors (vtxdist_o) */
@@ -260,7 +260,8 @@ get_perm_c_parmetis (SuperMatrix *A, int_t *perm_r, int_t *perm_c,
   if (bnz) SUPERLU_FREE (b_colind);
   SUPERLU_FREE (b_rowptr);
 
-#if 0  
+#if 0  /* The following code is not needed anymore, because parmetis
+	  now supports 64bit indexing */
   if ( iam < noDomains) {
     SUPERLU_FREE (options);
   }
@@ -283,7 +284,8 @@ get_perm_c_parmetis (SuperMatrix *A, int_t *perm_r, int_t *perm_c,
   dist_order = dist_order_int;
 #endif
 
-#endif
+#endif  /* not needed any more */
+
   
   /* Allgatherv dist_order to get perm_c */
   if (!(displs = (int *) SUPERLU_MALLOC (nprocs_i * sizeof(int))))
diff --git a/SRC/pddistribute.c b/SRC/pddistribute.c
index cc792462..17f770d7 100644
--- a/SRC/pddistribute.c
+++ b/SRC/pddistribute.c
@@ -73,9 +73,9 @@ dReDistribute_A(SuperMatrix *A, dScalePermstruct_t *ScalePermstruct,
     int_t  SendCnt; /* number of remote nonzeros to be sent */
     int_t  RecvCnt; /* number of remote nonzeros to be sent */
     int_t  *nnzToSend, *nnzToRecv, maxnnzToRecv;
-    int_t  *ia, *ja, **ia_send, *index, *itemp;
+    int_t  *ia, *ja, **ia_send, *index, *itemp = NULL;
     int_t  *ptr_to_send;
-    double *aij, **aij_send, *nzval, *dtemp;
+    double *aij, **aij_send, *nzval, *dtemp = NULL;
     double *nzval_a;
 	double asum,asum_tot;
     int    iam, it, p, procs, iam_g;
@@ -216,7 +216,8 @@ dReDistribute_A(SuperMatrix *A, dScalePermstruct_t *ScalePermstruct,
        NOTE: Can possibly use MPI_Alltoallv.
        ------------------------------------------------------------*/
     for (p = 0; p < procs; ++p) {
-        if ( p != iam ) {
+	if ( p != iam && nnzToSend[p]>0 ) {  // cause two of the tests to hang
+	//	if ( p != iam ) {
 	    it = 2*nnzToSend[p];
 	    MPI_Isend( ia_send[p], it, mpi_int_t,
 		       p, iam, grid->comm, &send_req[p] );
@@ -227,7 +228,8 @@ dReDistribute_A(SuperMatrix *A, dScalePermstruct_t *ScalePermstruct,
     }
 
     for (p = 0; p < procs; ++p) {
-        if ( p != iam ) {
+	if ( p != iam && nnzToRecv[p]>0 ) {
+	    //if ( p != iam ) {
 	    it = 2*nnzToRecv[p];
 	    MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status );
 	    it = nnzToRecv[p];
@@ -246,7 +248,8 @@ dReDistribute_A(SuperMatrix *A, dScalePermstruct_t *ScalePermstruct,
     }
 
     for (p = 0; p < procs; ++p) {
-        if ( p != iam ) {
+        if ( p != iam && nnzToSend[p] > 0 ) {
+	    //if ( p != iam ) {
 	    MPI_Wait( &send_req[p], &status);
 	    MPI_Wait( &send_req[procs+p], &status);
 	}
diff --git a/SRC/pzdistribute.c b/SRC/pzdistribute.c
index b7189d95..fbbdff99 100644
--- a/SRC/pzdistribute.c
+++ b/SRC/pzdistribute.c
@@ -72,9 +72,9 @@ zReDistribute_A(SuperMatrix *A, zScalePermstruct_t *ScalePermstruct,
     int_t  SendCnt; /* number of remote nonzeros to be sent */
     int_t  RecvCnt; /* number of remote nonzeros to be sent */
     int_t  *nnzToSend, *nnzToRecv, maxnnzToRecv;
-    int_t  *ia, *ja, **ia_send, *index, *itemp;
+    int_t  *ia, *ja, **ia_send, *index, *itemp = NULL;
     int_t  *ptr_to_send;
-    doublecomplex *aij, **aij_send, *nzval, *dtemp;
+    doublecomplex *aij, **aij_send, *nzval, *dtemp = NULL;
     doublecomplex *nzval_a;
 	doublecomplex asum,asum_tot;
     int    iam, it, p, procs, iam_g;
@@ -215,7 +215,8 @@ zReDistribute_A(SuperMatrix *A, zScalePermstruct_t *ScalePermstruct,
        NOTE: Can possibly use MPI_Alltoallv.
        ------------------------------------------------------------*/
     for (p = 0; p < procs; ++p) {
-        if ( p != iam ) {
+	if ( p != iam && nnzToSend[p] > 0 ) {
+	//if ( p != iam ) {
 	    it = 2*nnzToSend[p];
 	    MPI_Isend( ia_send[p], it, mpi_int_t,
 		       p, iam, grid->comm, &send_req[p] );
@@ -226,7 +227,8 @@ zReDistribute_A(SuperMatrix *A, zScalePermstruct_t *ScalePermstruct,
     }
 
     for (p = 0; p < procs; ++p) {
-        if ( p != iam ) {
+	if ( p != iam && nnzToRecv[p] > 0 ) {
+	    //if ( p != iam ) {
 	    it = 2*nnzToRecv[p];
 	    MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status );
 	    it = nnzToRecv[p];
@@ -245,7 +247,8 @@ zReDistribute_A(SuperMatrix *A, zScalePermstruct_t *ScalePermstruct,
     }
 
     for (p = 0; p < procs; ++p) {
-        if ( p != iam ) {
+        if ( p != iam && nnzToSend[p] > 0 ) {
+	    //if ( p != iam ) {
 	    MPI_Wait( &send_req[p], &status);
 	    MPI_Wait( &send_req[procs+p], &status);
 	}
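
A note on why these guards are deadlock-safe: skipping empty messages only works if sender and receiver skip the same set. Here the counts were exchanged earlier in the routine, so every rank knows both nnzToSend and nnzToRecv; rank p skips the Isend to q exactly when q skips the matching Recv from p, and the Waits are guarded by the same predicate as the Isends. Schematically (declarations as in xReDistribute_A; the value messages posted with send_req[procs+p] follow the same pattern):

    for (int p = 0; p < procs; ++p)
        if (p != iam && nnzToSend[p] > 0)      /* sender skips empty messages */
            MPI_Isend(ia_send[p], 2 * nnzToSend[p], mpi_int_t,
                      p, iam, grid->comm, &send_req[p]);

    for (int p = 0; p < procs; ++p)
        if (p != iam && nnzToRecv[p] > 0)      /* receiver skips the same set */
            MPI_Recv(itemp, 2 * nnzToRecv[p], mpi_int_t,
                     p, p, grid->comm, &status);

    for (int p = 0; p < procs; ++p)
        if (p != iam && nnzToSend[p] > 0)      /* wait only on posted sends */
            MPI_Wait(&send_req[p], &status);

If the predicates ever disagree (one side guarded, the other not), an unmatched Recv blocks forever; that is presumably what the "two of the tests hang" remark in the pddistribute.c hunk records.
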
diff --git a/SRC/superlu_dist_config.h b/SRC/superlu_dist_config.h
index 12924438..d7eb53ac 100644
--- a/SRC/superlu_dist_config.h
+++ b/SRC/superlu_dist_config.h
@@ -13,7 +13,7 @@
 /* #undef HAVE_COMBBLAS */
 
 /* enable 64bit index mode */
-/* #undef XSDK_INDEX_SIZE */
+#define XSDK_INDEX_SIZE 64
 
 #if (XSDK_INDEX_SIZE == 64)
 #define _LONGINT 1
diff --git a/SRC/superlu_grid.c b/SRC/superlu_grid.c
index 2b7600d2..32303e4c 100644
--- a/SRC/superlu_grid.c
+++ b/SRC/superlu_grid.c
@@ -27,6 +27,11 @@ MPI_Datatype SuperLU_MPI_DOUBLE_COMPLEX = MPI_DATATYPE_NULL;
 #endif
 
 /*! \brief All processes in the MPI communicator must call this routine.
+ * 
+ *  On output, if a process is not in the SuperLU group, the following 
+ *  values are assigned to it:
+ *      grid->comm = MPI_COMM_NULL
+ *      grid->iam = -1
  */
 void superlu_gridinit(MPI_Comm Bcomm, /* The base communicator upon which
 					 the new grid is formed. */
@@ -59,6 +64,11 @@ void superlu_gridinit(MPI_Comm Bcomm, /* The base communicator upon which
 
 
 /*! \brief All processes in the MPI communicator must call this routine.
+ *
+ *  On output, if a process is not in the SuperLU group, the following 
+ *  values are assigned to it:
+ *      grid->comm = MPI_COMM_NULL
+ *      grid->iam = -1
  */
 void superlu_gridmap(
 		     MPI_Comm Bcomm, /* The base communicator upon which
@@ -108,17 +118,17 @@ void superlu_gridmap(
     MPI_Group_incl( mpi_base_group, Np, pranks, &superlu_grp );
     /* Create the new communicator. */
     /* NOTE: The call is to be executed by all processes in Bcomm,
-       even if they do not belong in the new group -- superlu_grp. */
+       even if they do not belong in the new group -- superlu_grp.
+       The function returns MPI_COMM_NULL to processes that are not in superlu_grp. */
     MPI_Comm_create( Bcomm, superlu_grp, &grid->comm );
 
-    /* Bail out if I am not in the group, superlu_group. */
+    /* Bail out if I am not in the group "superlu_grp". */
     if ( grid->comm == MPI_COMM_NULL ) {
-	grid->comm = Bcomm;
-	MPI_Comm_rank( Bcomm, &i );
-	grid->iam = i;
-	/*grid->iam = -1;*/
-	SUPERLU_FREE(pranks);
-	return;
+	// grid->comm = Bcomm;  do not need to reassign to a valid communicator
+	grid->iam = -1;
+	//SUPERLU_FREE(pranks);
+	//return;
+	goto gridmap_out;
     }
 
     MPI_Comm_rank( grid->comm, &(grid->iam) );
@@ -166,14 +176,16 @@ void superlu_gridmap(
     }
 #endif
 
+ gridmap_out:        
     SUPERLU_FREE(pranks);
     MPI_Group_free(&superlu_grp);
     MPI_Group_free(&mpi_base_group);
-}
+    
+} /* superlu_gridmap */
 
 void superlu_gridexit(gridinfo_t *grid)
 {
-    if ( grid->comm != MPI_COMM_NULL && grid->comm != MPI_COMM_WORLD ) {
+    if ( grid->comm != MPI_COMM_NULL ) {
 	/* Marks the communicator objects for deallocation. */
 	MPI_Comm_free( &grid->rscp.comm );
 	MPI_Comm_free( &grid->cscp.comm );
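
After this change the grid routines have one uniform contract: a rank outside the SuperLU group comes back with grid->comm == MPI_COMM_NULL and grid->iam == -1, which is exactly what the EXAMPLE drivers above now test. The caller-side pattern, sketched:

    superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid);
    iam = grid.iam;
    if (iam == -1) goto out;       /* this rank is not in the SuperLU grid */
    /* ... create matrix, call the driver, print stats ... */
out:
    superlu_gridexit(&grid);       /* safe for excluded ranks too, since it now
                                      frees only when comm != MPI_COMM_NULL */

This is also why the drivers switch from testing iam >= nprow * npcol to iam == -1: the old test depended on superlu_gridmap handing excluded ranks their rank in the base communicator, which it no longer does.
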

From 471866bf9f9333f0c9c2cc8210588e8a59dc439d Mon Sep 17 00:00:00 2001
From: 7ps 
Date: Wed, 11 Nov 2020 21:22:12 -0500
Subject: [PATCH 039/147] Free a3d and A2d on all process grids

---
 SRC/pdgssvx3d.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/SRC/pdgssvx3d.c b/SRC/pdgssvx3d.c
index f9f95af3..20b9e790 100644
--- a/SRC/pdgssvx3d.c
+++ b/SRC/pdgssvx3d.c
@@ -1754,16 +1754,18 @@ void pdgssvx3d(superlu_dist_options_t *options, SuperMatrix *A,
 	A->Store = Astore3d; // restore Astore to 3D
 
     /* free A2d and B2d, which are allocated only in 2D layer Grid_0 */
+    NRformat_loc *A2d = A3d->A_nfmt;
     if (grid3d->zscp.Iam == 0) {
-       NRformat_loc *A2d = A3d->A_nfmt;
+       
        SUPERLU_FREE( A2d->rowptr );
        SUPERLU_FREE( A2d->colind );
        SUPERLU_FREE( A2d->nzval );
-       SUPERLU_FREE( A2d );         // free 2D structure
+       
        SUPERLU_FREE(A3d->B2d);
-       SUPERLU_FREE(A3d);           // free 3D structure
+       
     }
-    
+    SUPERLU_FREE( A2d );         // free 2D structure
+    SUPERLU_FREE( A3d );         // free 3D structure
 #if (DEBUGlevel >= 1)
 	CHECK_MALLOC(iam, "Exit pdgssvx3d()");
 #endif
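
This hunk completes patch 035's rule with its complement: dGatherNRformat_loc3d SUPERLU_MALLOCs the A3d and A2d structs on every rank of zscp, but fills in A2d's arrays (rowptr, colind, nzval) and B2d only on zscp rank 0, so the frees must mirror that split. Consolidated, the cleanup reads:

    NRformat_loc *A2d = A3d->A_nfmt;
    if (grid3d->zscp.Iam == 0) {      /* the arrays exist only on layer 0 */
        SUPERLU_FREE(A2d->rowptr);
        SUPERLU_FREE(A2d->colind);
        SUPERLU_FREE(A2d->nzval);
        SUPERLU_FREE(A3d->B2d);
    }
    SUPERLU_FREE(A2d);                /* the structs exist on every rank */
    SUPERLU_FREE(A3d);
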

From 7b0efdb050f4cf6b0e8137fa2cf3cb2151faaf26 Mon Sep 17 00:00:00 2001
From: piyush sao 
Date: Thu, 5 Nov 2020 20:05:03 -0500
Subject: [PATCH 040/147] Adding empty file to mark the new branch

---
 SRC/dsuperlu_blas.c | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 SRC/dsuperlu_blas.c

diff --git a/SRC/dsuperlu_blas.c b/SRC/dsuperlu_blas.c
new file mode 100644
index 00000000..e69de29b

From cf9c83a0817db124391b1f8e87b2f6324b82aa54 Mon Sep 17 00:00:00 2001
From: 7ps 
Date: Wed, 11 Nov 2020 21:17:14 -0500
Subject: [PATCH 041/147] adding superlu_dgemm

---
 SRC/dsuperlu_blas.c | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/SRC/dsuperlu_blas.c b/SRC/dsuperlu_blas.c
index e69de29b..c3c1dab9 100644
--- a/SRC/dsuperlu_blas.c
+++ b/SRC/dsuperlu_blas.c
@@ -0,0 +1,32 @@
+
+
+#ifdef _CRAY
+_fcd ftcs = _cptofcd("N", strlen("N"));
+_fcd ftcs1 = _cptofcd("L", strlen("L"));
+_fcd ftcs2 = _cptofcd("N", strlen("N"));
+_fcd ftcs3 = _cptofcd("U", strlen("U"));
+#endif
+
+int superlu_dgemm(const char *transa, const char *transb,
+                  int m, int n, int k, double alpha, double *a, 
+                  int lda, double *b, int ldb, double beta, double *c, int ldc)
+{
+#ifdef _CRAY
+    _fcd ftcs = _cptofcd(transa, strlen(transa));
+    _fcd ftcs1 = _cptofcd(transb, strlen(transb));
+    return SGEMM(ftcs, ftcs1, &m, &n, &k,
+                 &alpha, a, &lda, b, &ldb, &beta, c, &ldc);
+#elif defined(USE_VENDOR_BLAS)
+    return dgemm_(transa, transb, &m, &n, &k,
+                  &alpha, a, &lda, b, &ldb, &beta, c, &ldc, 1, 1);
+#else
+    return dgemm_(transa, transb, &m, &n, &k,
+                  &alpha, a, &lda, b, &ldb, &beta, c, &ldc);
+#endif
+}
+
+
+// dtrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha,
+//   179  			ublk_ptr, &ld_ujrow, &lusup[off], &nsupr,
+
+//    dtrsv_ ("L", "N", "U", &segsize, &lusup[luptr], &nsupr
\ No newline at end of file
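
The wrapper hides three BLAS calling conventions behind one portable call:
Cray _fcd character descriptors, vendor BLAS with trailing hidden
string-length arguments, and plain reference BLAS. A small column-major
usage sketch (not from the patch):

    #include "superlu_ddefs.h"  /* declares superlu_dgemm once patch 042 lands */

    void gemm_demo(void)
    {
        double A[4] = {1, 3, 2, 4};  /* 2x2 [1 2; 3 4], column-major */
        double B[4] = {5, 7, 6, 8};  /* 2x2 [5 6; 7 8] */
        double C[4] = {0, 0, 0, 0};
        /* C := 1.0 * A * B + 0.0 * C */
        superlu_dgemm("N", "N", 2, 2, 2, 1.0, A, 2, B, 2, 0.0, C, 2);
    }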

From 2e43b0afc65b73f4d982eed03d8465b789107307 Mon Sep 17 00:00:00 2001
From: 7ps 
Date: Thu, 12 Nov 2020 20:36:02 -0500
Subject: [PATCH 042/147] Adding superlu_dgemm and calling it from
 dblock_gemm_scatter

---
 SRC/CMakeLists.txt  |  1 +
 SRC/dscatter3d.c    | 35 +++++------------------------------
 SRC/dsuperlu_blas.c |  5 +++--
 SRC/superlu_ddefs.h |  5 +++++
 4 files changed, 14 insertions(+), 32 deletions(-)

diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt
index 576412cc..bef541f5 100644
--- a/SRC/CMakeLists.txt
+++ b/SRC/CMakeLists.txt
@@ -112,6 +112,7 @@ if(enable_double)
     dtrfAux.c	
     dcommunication_aux.c 
     dtrfCommWrapper.c
+    dsuperlu_blas.c
   )
 endif()
 
diff --git a/SRC/dscatter3d.c b/SRC/dscatter3d.c
index a71a1f8e..c2edb2b9 100644
--- a/SRC/dscatter3d.c
+++ b/SRC/dscatter3d.c
@@ -142,27 +142,13 @@ dblock_gemm_scatter( int_t lb, int_t j,
     double alpha = 1.0;
     double beta = 0.0;
 
-    /* calling DGEMM */
-    // printf(" m %d n %d k %d ldu %d ldl %d st_col %d \n",temp_nbrow,ncols,ldu,ldl,st_col );
-#if 1
-  #if defined (USE_VENDOR_BLAS)
-    dgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
-          &L_mat[(knsupc - ldu)*ldl + cum_nrow], &ldl,
-          &U_mat[st_col * ldu], &ldu, &beta, tempv1, &temp_nbrow, 1, 1);
-  #else
-    dgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
-          &L_mat[(knsupc - ldu)*ldl + cum_nrow], &ldl,
-          &U_mat[st_col * ldu], &ldu, &beta, tempv1, &temp_nbrow);
-  #endif
-#else
-    // printf("%d %d %d %d  %d %d %d %d\n", temp_nbrow, ncols, ldu,  ldl,st_col,(knsupc - ldu)*ldl + cum_nrow,cum_nrow,st_col);
 
-    cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
+    superlu_dgemm("N", "N",
                 temp_nbrow, ncols, ldu, alpha,
                 &L_mat[(knsupc - ldu)*ldl + cum_nrow], ldl,
                 &U_mat[st_col * ldu], ldu,
                 beta, tempv1, temp_nbrow);
-#endif
+
 
     // printf("SCU update: (%d, %d)\n",ib,jb );
 #ifdef SCATTER_PROFILE
@@ -255,24 +241,13 @@ dblock_gemm_scatter_lock( int_t lb, int_t j,
     double alpha = 1.0;  double beta = 0.0;
 
     /* calling DGEMM */
-#if 1
-  #if defined (USE_VENDOR_BLAS)
-    // printf(" m %d n %d k %d ldl %d st_col %d \n",temp_nbrow,ncols,ldu,ldl,st_col );
-    dgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
-           &L_mat[(knsupc - ldu)*ldl + cum_nrow], &ldl,
-           &U_mat[st_col * ldu], &ldu, &beta, tempv1, &temp_nbrow, 1, 1);
-  #else
-    dgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
-           &L_mat[(knsupc - ldu)*ldl + cum_nrow], &ldl,
-           &U_mat[st_col * ldu], &ldu, &beta, tempv1, &temp_nbrow);
-  #endif
-#else
-    cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
+
+    superlu_dgemm("N", "N",
                 temp_nbrow, ncols, ldu, alpha,
                 &L_mat[(knsupc - ldu)*ldl + cum_nrow], ldl,
                 &U_mat[st_col * ldu], ldu,
                 beta, tempv1, temp_nbrow);
-#endif
+
 
     /*try to get the lock for the block*/
     if (lock)       /*lock is not null*/
diff --git a/SRC/dsuperlu_blas.c b/SRC/dsuperlu_blas.c
index c3c1dab9..f7893533 100644
--- a/SRC/dsuperlu_blas.c
+++ b/SRC/dsuperlu_blas.c
@@ -1,4 +1,4 @@
-
+#include "superlu_ddefs.h"
 
 #ifdef _CRAY
 _fcd ftcs = _cptofcd("N", strlen("N"));
@@ -17,8 +17,9 @@ int superlu_dgemm(const char *transa, const char *transb,
     return SGEMM(ftcs, ftcs1, &m, &n, &k,
                  &alpha, a, &lda, b, &ldb, &beta, c, &ldc);
 #elif defined(USE_VENDOR_BLAS)
-    return dgemm_(transa, transb, &m, &n, &k,
+    dgemm_(transa, transb, &m, &n, &k,
                   &alpha, a, &lda, b, &ldb, &beta, c, &ldc, 1, 1);
+    return 0;
 #else
     return dgemm_(transa, transb, &m, &n, &k,
                   &alpha, a, &lda, b, &ldb, &beta, c, &ldc);
diff --git a/SRC/superlu_ddefs.h b/SRC/superlu_ddefs.h
index fa1d9d6a..7db2a32e 100644
--- a/SRC/superlu_ddefs.h
+++ b/SRC/superlu_ddefs.h
@@ -566,6 +566,11 @@ extern int daxpy_(int *n, double *za, double *zx,
 // LAPACK routine
 extern void dtrtri_(char*, char*, int*, double*, int*, int*);
 
+// SuperLU BLAS routines
+extern int superlu_dgemm(const char *transa, const char *transb,
+                  int m, int n, int k, double alpha, double *a, 
+                  int lda, double *b, int ldb, double beta, double *c, int ldc);
+
 
 /*==== For 3D code ====*/
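
Two details in the USE_VENDOR_BLAS branch above are easy to miss: the
trailing "1, 1" arguments are the hidden by-value lengths of the two
CHARACTER arguments that many Fortran calling conventions append, and the
explicit "return 0" is needed because a Fortran SUBROUTINE such as DGEMM
maps to a void C function, so its call cannot appear in a return
expression. Schematically (this prototype is illustrative only; the real
declaration comes from the BLAS library in use):

    /* Vendor Fortran BLAS: SUBROUTINE => void, hidden lengths appended. */
    extern void dgemm_(const char *, const char *, const int *, const int *,
                       const int *, const double *, const double *, const int *,
                       const double *, const int *, const double *, double *,
                       const int *, int, int);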
 

From a512891e536a8819627d3b22adc3ce93e1a06399 Mon Sep 17 00:00:00 2001
From: 7ps 
Date: Thu, 19 Nov 2020 15:27:04 -0500
Subject: [PATCH 043/147] minor cleanup

---
 SRC/pdgssvx3d.c | 19 +------------------
 1 file changed, 1 insertion(+), 18 deletions(-)

diff --git a/SRC/pdgssvx3d.c b/SRC/pdgssvx3d.c
index 20b9e790..90674027 100644
--- a/SRC/pdgssvx3d.c
+++ b/SRC/pdgssvx3d.c
@@ -1717,20 +1717,11 @@ void pdgssvx3d(superlu_dist_options_t *options, SuperMatrix *A,
 				break;
 			}
 		}
-
-#if 0
-		if (!factored && Fact != SamePattern_SameRowPerm && !parSymbFact)
-			Destroy_CompCol_Permuted_dist (&GAC);
-#endif
-
 	} /* process layer 0 done solve */
 
 #ifdef NRFRMT
 	dScatter_B3d(A3d, grid3d);
-	// dScatterB3d(*(A3d->A_nfmt),
-	// 			A_orig,
-	// 			B3d, ldb, nrhs, B2d,
-	// 			grid3d);
+	
 #endif
 	//  double *B, int ldb, int nrhs, double *B2d,
 	//  gridinfo3d_t *grid3d);
@@ -1742,14 +1733,6 @@ void pdgssvx3d(superlu_dist_options_t *options, SuperMatrix *A,
 	//    - scatter the solution from 2D back to 3D: {B2d,ldb} -> {B3d,ldb3d}
 	//      (can we reuse b_count[] and b_disp[] already computed in 'Gather' routine?)
 
-#if 0 // for debugging	
-	if ( grid3d->zscp.Iam == 0 ) {  // only process layer 0
-	    PrintDouble5("Before exit pdgssvx3d, solution B2d", ldb, B2d);
-	    PrintDouble5("Before exit pdgssvx3d, solution B", ldb, B);
-	}
-	PrintDouble5("Before exit pdgssvx3d, solution B3d", ldb3d, B3d);
-	fflush(stdout);
-#endif
 
 	A->Store = Astore3d; // restore Astore to 3D
 

From b94d6e73f726449bde7a574ad34e5c82a2aeb2ce Mon Sep 17 00:00:00 2001
From: 7ps 
Date: Sun, 29 Nov 2020 12:14:24 -0500
Subject: [PATCH 044/147] Adding new SuperLU BLAS wrappers

---
 SRC/dsuperlu_blas.c   |  82 +++++++++-
 SRC/dtrfCommWrapper.c | 368 +++++++++++++++++++++---------------------
 SRC/pdgstrf2.c        |  56 ++++---
 SRC/superlu_ddefs.h   |   9 ++
 4 files changed, 300 insertions(+), 215 deletions(-)

diff --git a/SRC/dsuperlu_blas.c b/SRC/dsuperlu_blas.c
index f7893533..1488f427 100644
--- a/SRC/dsuperlu_blas.c
+++ b/SRC/dsuperlu_blas.c
@@ -8,7 +8,7 @@ _fcd ftcs3 = _cptofcd("U", strlen("U"));
 #endif
 
 int superlu_dgemm(const char *transa, const char *transb,
-                  int m, int n, int k, double alpha, double *a, 
+                  int m, int n, int k, double alpha, double *a,
                   int lda, double *b, int ldb, double beta, double *c, int ldc)
 {
 #ifdef _CRAY
@@ -18,7 +18,7 @@ int superlu_dgemm(const char *transa, const char *transb,
                  &alpha, a, &lda, b, &ldb, &beta, c, &ldc);
 #elif defined(USE_VENDOR_BLAS)
     dgemm_(transa, transb, &m, &n, &k,
-                  &alpha, a, &lda, b, &ldb, &beta, c, &ldc, 1, 1);
+           &alpha, a, &lda, b, &ldb, &beta, c, &ldc, 1, 1);
     return 0;
 #else
     return dgemm_(transa, transb, &m, &n, &k,
@@ -26,8 +26,80 @@ int superlu_dgemm(const char *transa, const char *transb,
 #endif
 }
 
+int superlu_dtrsm(const char *sideRL, const char *uplo,
+                  const char *transa, const char *diag,
+                  const int m, const int n,
+                  const double alpha, const double *a,
+                  const int lda, double *b, const int ldb)
 
-// dtrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha,
-//   179  			ublk_ptr, &ld_ujrow, &lusup[off], &nsupr,
+{
+#if defined(USE_VENDOR_BLAS)
+    dtrsm_(sideRL, uplo, transa, diag,
+           &m, &n, &alpha, a, &lda, b, &ldb,
+           1, 1, 1, 1);
+    return 0;
+#else
+    return dtrsm_(sideRL, uplo, transa, diag,
+                  &m, &n, &alpha, a, &lda, b, &ldb);
+#endif
+}
+
+int superlu_dger(const int m, const int n, const double alpha,
+                 const double *x, const int incx, const double *y,
+                 const int incy, double *a, const int lda)
+{
+#ifdef _CRAY
+    SGER(&m, &n, &alpha, x, &incx, y, &incy, a, &lda);
+#else
+    dger_(&m, &n, &alpha, x, &incx, y, &incy, a, &lda);
+#endif
+
+    return 0;
+}
+
+int superlu_dscal(const int n, const double alpha, double *x, const int incx)
+{
+    dscal_(&n, &alpha, x, &incx);
+    return 0;
+}
 
-//    dtrsv_ ("L", "N", "U", &segsize, &lusup[luptr], &nsupr
\ No newline at end of file
+int superlu_daxpy(const int n, const double alpha, const double *x, const int incx, double *y, const int incy)
+{
+    daxpy_(&n, &alpha, x, &incx, y, &incy);
+    return 0;
+}
+
+int superlu_dgemv(const char *trans, const int m,
+                  const int n, const double alpha, const double *a,
+                  const int lda, const double *x, const int incx,
+                  const double beta, double *y, const int incy)
+{
+
+#ifdef USE_VENDOR_BLAS
+		dgemv_(trans, &m, &n, &alpha, a,
+           &lda, x, &incx, 
+           &beta, y, &incy, 1);
+#else
+		dgemv_(trans, &m, &n, &alpha, a,
+           &lda, x, &incx, 
+           &beta, y, &incy);
+#endif
+    
+    return 0;
+}
+
+int superlu_dtrsv(char *uplo, char *trans, char *diag,
+                  int n, double *a, int lda, double *x, int incx)
+{
+#ifdef _CRAY
+    // _fcd ftcs = _cptofcd("N", strlen("N"));
+		STRSV(_cptofcd(uplo, strlen(uplo)), _cptofcd(trans, strlen(trans)), _cptofcd(diag, strlen(diag)), 
+         &n, a, &lda, x, &incx);
+#elif defined (USE_VENDOR_BLAS)
+		dtrsv_(uplo, trans, diag, &n, a, &lda, x, &incx, 1, 1, 1);
+#else
+		dtrsv_(uplo, trans, diag, &n, a, &lda, x, &incx);
+#endif
+    
+    return 0;
+}
\ No newline at end of file
diff --git a/SRC/dtrfCommWrapper.c b/SRC/dtrfCommWrapper.c
index 13455b08..8ab915bd 100644
--- a/SRC/dtrfCommWrapper.c
+++ b/SRC/dtrfCommWrapper.c
@@ -9,7 +9,6 @@ The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
-
 /*! @file
  * \brief Communication wrapper routines for 2D factorization.
  *
@@ -30,57 +29,56 @@ at the top-level directory.
 #include "mkl.h"
 #else
 //#include "cblas.h"
-#endif 
+#endif
 
-int_t dDiagFactIBCast(int_t k,  int_t k0,      // supernode to be factored
-                     double *BlockUFactor,
-                     double *BlockLFactor,
-                     int_t* IrecvPlcd_D,
-                     MPI_Request *U_diag_blk_recv_req,
-                     MPI_Request *L_diag_blk_recv_req,
-                     MPI_Request *U_diag_blk_send_req,
-                     MPI_Request *L_diag_blk_send_req,
-                     gridinfo_t *grid,
-                     superlu_dist_options_t *options,
-                     double thresh,
-                     LUstruct_t *LUstruct,
-                     SuperLUStat_t *stat, int *info,
-                     SCT_t *SCT,
-		     int tag_ub
-                    )
+int_t dDiagFactIBCast(int_t k, int_t k0, // supernode to be factored
+                      double *BlockUFactor,
+                      double *BlockLFactor,
+                      int_t *IrecvPlcd_D,
+                      MPI_Request *U_diag_blk_recv_req,
+                      MPI_Request *L_diag_blk_recv_req,
+                      MPI_Request *U_diag_blk_send_req,
+                      MPI_Request *L_diag_blk_send_req,
+                      gridinfo_t *grid,
+                      superlu_dist_options_t *options,
+                      double thresh,
+                      LUstruct_t *LUstruct,
+                      SuperLUStat_t *stat, int *info,
+                      SCT_t *SCT,
+                      int tag_ub)
 {
     // unpacking variables
     Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
     LocalLU_t *Llu = LUstruct->Llu;
-    int_t* xsup = Glu_persist->xsup;
+    int_t *xsup = Glu_persist->xsup;
 
     int_t iam = grid->iam;
     int_t Pc = grid->npcol;
     int_t Pr = grid->nprow;
-    int_t myrow = MYROW (iam, grid);
-    int_t mycol = MYCOL (iam, grid);
-    int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
-    int_t krow = PROW (k, grid);
-    int_t kcol = PCOL (k, grid);
+    int_t myrow = MYROW(iam, grid);
+    int_t mycol = MYCOL(iam, grid);
+    int_t pkk = PNUM(PROW(k, grid), PCOL(k, grid), grid);
+    int_t krow = PROW(k, grid);
+    int_t kcol = PCOL(k, grid);
 
     //xsup for supersize
 
     /*Place Irecvs first*/
     // if (IrecvPlcd_D[k] == 0 )
     // {
-    int_t nsupc = SuperSize (k);
+    int_t nsupc = SuperSize(k);
     if (mycol == kcol && iam != pkk)
     {
-        dIRecv_UDiagBlock(k0, BlockUFactor,  /*pointer for the diagonal block*/
-                         nsupc * nsupc, krow,
-                         U_diag_blk_recv_req, grid, SCT, tag_ub);
+        dIRecv_UDiagBlock(k0, BlockUFactor, /*pointer for the diagonal block*/
+                          nsupc * nsupc, krow,
+                          U_diag_blk_recv_req, grid, SCT, tag_ub);
     }
 
     if (myrow == krow && iam != pkk)
     {
-        dIRecv_LDiagBlock(k0, BlockLFactor,  /*pointer for the diagonal block*/
-                         nsupc * nsupc, kcol,
-                         L_diag_blk_recv_req, grid, SCT, tag_ub);
+        dIRecv_LDiagBlock(k0, BlockLFactor, /*pointer for the diagonal block*/
+                          nsupc * nsupc, kcol,
+                          L_diag_blk_recv_req, grid, SCT, tag_ub);
     }
     IrecvPlcd_D[k] = 1;
     // }
@@ -109,39 +107,39 @@ int_t dDiagFactIBCast(int_t k,  int_t k0,      // supernode to be factored
         /*Isend U blocks to the process row*/
         int_t nsupc = SuperSize(k);
         dISend_UDiagBlock(k0, BlockLFactor,
-                         nsupc * nsupc, U_diag_blk_send_req , grid, tag_ub);
+                          nsupc * nsupc, U_diag_blk_send_req, grid, tag_ub);
 
         /*Isend L blocks to the process col*/
         dISend_LDiagBlock(k0, BlockLFactor,
-                         nsupc * nsupc, L_diag_blk_send_req, grid, tag_ub);
+                          nsupc * nsupc, L_diag_blk_send_req, grid, tag_ub);
         SCT->commVolFactor += 1.0 * nsupc * nsupc * (Pr + Pc);
     }
     // }
     return 0;
 }
 
-int_t dLPanelTrSolve( int_t k,   int_t* factored_L,
-		      double* BlockUFactor,
-		      gridinfo_t *grid,
-		      LUstruct_t *LUstruct)
+int_t dLPanelTrSolve(int_t k, int_t *factored_L,
+                     double *BlockUFactor,
+                     gridinfo_t *grid,
+                     LUstruct_t *LUstruct)
 {
     double alpha = 1.0;
     Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
     LocalLU_t *Llu = LUstruct->Llu;
-    int_t* xsup = Glu_persist->xsup;
+    int_t *xsup = Glu_persist->xsup;
 
     int_t iam = grid->iam;
 
-    int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
-    int_t kcol = PCOL (k, grid);
-    int_t mycol = MYCOL (iam, grid);
+    int_t pkk = PNUM(PROW(k, grid), PCOL(k, grid), grid);
+    int_t kcol = PCOL(k, grid);
+    int_t mycol = MYCOL(iam, grid);
     int nsupc = SuperSize(k);
 
     /*factor the L panel*/
-    if (mycol == kcol  && iam != pkk)
+    if (mycol == kcol && iam != pkk)
     {
         // factored_L[k] = 1;
-        int_t lk = LBj (k, grid);
+        int_t lk = LBj(k, grid);
         double *lusup = Llu->Lnzval_bc_ptr[lk];
         int nsupr;
         if (Llu->Lrowind_bc_ptr[lk])
@@ -158,34 +156,36 @@ int_t dLPanelTrSolve( int_t k,   int_t* factored_L,
         // }
 
         int_t l = nsupr;
-        double* ublk_ptr = BlockUFactor;
+        double *ublk_ptr = BlockUFactor;
         int ld_ujrow = nsupc;
 
         // unsigned long long t1 = _rdtsc();
 
         // #pragma omp for schedule(dynamic) nowait
-#define BL  32
+#define BL 32
         for (int i = 0; i < CEILING(l, BL); ++i)
         {
-            #pragma omp task
+#pragma omp task
             {
                 int_t off = i * BL;
                 // Sherry: int_t len = MY_MIN(BL, l - i * BL);
                 int len = SUPERLU_MIN(BL, l - i * BL);
 
-#if 1
-  #if defined (USE_VENDOR_BLAS)
-		dtrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha,
-			ublk_ptr, &ld_ujrow, &lusup[off], &nsupr,
-			1, 1, 1, 1);
-  #else
-		dtrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha,
-			ublk_ptr, &ld_ujrow, &lusup[off], &nsupr);
-  #endif
-#else
-                cblas_dtrsm (CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit,
-                len, nsupc, alpha, ublk_ptr, ld_ujrow, &lusup[off], nsupr);
-#endif
+                // #if 1
+                //   #if defined (USE_VENDOR_BLAS)
+                // 		dtrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha,
+                // 			ublk_ptr, &ld_ujrow, &lusup[off], &nsupr,
+                // 			1, 1, 1, 1);
+                //   #else
+                // 		dtrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha,
+                // 			ublk_ptr, &ld_ujrow, &lusup[off], &nsupr);
+                //   #endif
+                // #else
+                //                 cblas_dtrsm (CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit,
+                //                 len, nsupc, alpha, ublk_ptr, ld_ujrow, &lusup[off], nsupr);
+                // #endif
+                superlu_dtrsm("R", "U", "N", "N",
+                              len, nsupc, alpha, ublk_ptr, ld_ujrow, &lusup[off], nsupr);
             }
         }
     }
@@ -196,7 +196,7 @@ int_t dLPanelTrSolve( int_t k,   int_t* factored_L,
          { */
         /* code */
         factored_L[k] = 1;
-        int_t lk = LBj (k, grid);
+        int_t lk = LBj(k, grid);
         double *lusup = Llu->Lnzval_bc_ptr[lk];
         int nsupr;
         if (Llu->Lrowind_bc_ptr[lk])
@@ -208,85 +208,87 @@ int_t dLPanelTrSolve( int_t k,   int_t* factored_L,
 
         int_t l = nsupr - nsupc;
 
-        double* ublk_ptr = BlockUFactor;
+        double *ublk_ptr = BlockUFactor;
         int ld_ujrow = nsupc;
         // printf("%d: L update \n",k );
 
-#define BL  32
+#define BL 32
         // #pragma omp parallel for
         for (int i = 0; i < CEILING(l, BL); ++i)
         {
             int_t off = i * BL;
             // Sherry: int_t len = MY_MIN(BL, l - i * BL);
             int len = SUPERLU_MIN(BL, (l - i * BL));
-            #pragma omp task
+#pragma omp task
             {
-#if 1
-  #if defined (USE_VENDOR_BLAS)
-		dtrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha,
-			ublk_ptr, &ld_ujrow, &lusup[nsupc + off], &nsupr,
-			1, 1, 1, 1);
-  #else
-		dtrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha,
-			ublk_ptr, &ld_ujrow, &lusup[nsupc + off], &nsupr);
-  #endif
-#else
-                cblas_dtrsm (CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit,
-                             len, nsupc, alpha, ublk_ptr, ld_ujrow, &lusup[nsupc + off], nsupr);
-#endif
-
+                // #if 1
+                //   #if defined (USE_VENDOR_BLAS)
+                // 		dtrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha,
+                // 			ublk_ptr, &ld_ujrow, &lusup[nsupc + off], &nsupr,
+                // 			1, 1, 1, 1);
+                //   #else
+                // 		dtrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha,
+                // 			ublk_ptr, &ld_ujrow, &lusup[nsupc + off], &nsupr);
+                //   #endif
+                // #else
+                //                 cblas_dtrsm (CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit,
+                //                              len, nsupc, alpha, ublk_ptr, ld_ujrow, &lusup[nsupc + off], nsupr);
+                // #endif
+
+                superlu_dtrsm("R", "U", "N", "N",
+                              len, nsupc, alpha, ublk_ptr, ld_ujrow, &lusup[nsupc + off], nsupr);
             }
         }
     }
 
     return 0;
-}  /* dLPanelTrSolve */
+} /* dLPanelTrSolve */
 
-int_t dLPanelUpdate( int_t k,  int_t* IrecvPlcd_D, int_t* factored_L,
-                    MPI_Request * U_diag_blk_recv_req,
-                    double* BlockUFactor,
+int_t dLPanelUpdate(int_t k, int_t *IrecvPlcd_D, int_t *factored_L,
+                    MPI_Request *U_diag_blk_recv_req,
+                    double *BlockUFactor,
                     gridinfo_t *grid,
                     LUstruct_t *LUstruct, SCT_t *SCT)
 {
 
-    dUDiagBlockRecvWait( k,  IrecvPlcd_D, factored_L,
-                         U_diag_blk_recv_req, grid, LUstruct, SCT);
+    dUDiagBlockRecvWait(k, IrecvPlcd_D, factored_L,
+                        U_diag_blk_recv_req, grid, LUstruct, SCT);
 
-    dLPanelTrSolve( k, factored_L, BlockUFactor, grid, LUstruct );
+    dLPanelTrSolve(k, factored_L, BlockUFactor, grid, LUstruct);
 
     return 0;
-}  /* dLPanelUpdate */
+} /* dLPanelUpdate */
 
-#define BL  32
+#define BL 32
 
-int_t dUPanelTrSolve( int_t k,  
-                     double* BlockLFactor,
-                     double* bigV,
+int_t dUPanelTrSolve(int_t k,
+                     double *BlockLFactor,
+                     double *bigV,
                      int_t ldt,
-                     Ublock_info_t* Ublock_info,
+                     Ublock_info_t *Ublock_info,
                      gridinfo_t *grid,
                      LUstruct_t *LUstruct,
                      SuperLUStat_t *stat, SCT_t *SCT)
 {
     Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
     LocalLU_t *Llu = LUstruct->Llu;
-    int_t* xsup = Glu_persist->xsup;
+    int_t *xsup = Glu_persist->xsup;
     int_t iam = grid->iam;
-    int_t myrow = MYROW (iam, grid);
-    int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
-    int_t krow = PROW (k, grid);
+    int_t myrow = MYROW(iam, grid);
+    int_t pkk = PNUM(PROW(k, grid), PCOL(k, grid), grid);
+    int_t krow = PROW(k, grid);
     int_t nsupc = SuperSize(k);
 
     /*factor the U panel*/
-    if (myrow == krow  && iam != pkk)
+    if (myrow == krow && iam != pkk)
     {
-        int_t lk = LBi (k, grid);         /* Local block number */
+        int_t lk = LBi(k, grid); /* Local block number */
         if (!Llu->Unzval_br_ptr[lk])
             return 0;
         /* Initialization. */
-        int_t klst = FstBlockC (k + 1);
+        int_t klst = FstBlockC(k + 1);
 
-        int_t *usub = Llu->Ufstnz_br_ptr[lk];  /* index[] of block row U(k,:) */
+        int_t *usub = Llu->Ufstnz_br_ptr[lk]; /* index[] of block row U(k,:) */
         double *uval = Llu->Unzval_br_ptr[lk];
         int_t nb = usub[0];
 
@@ -294,22 +296,22 @@ int_t dUPanelTrSolve( int_t k,
         double *lusup = BlockLFactor;
 
         /* Loop through all the row blocks. to get the iukp and rukp*/
-        Trs2_InitUblock_info(klst, nb, Ublock_info, usub, Glu_persist, stat );
+        Trs2_InitUblock_info(klst, nb, Ublock_info, usub, Glu_persist, stat);
 
         /* Loop through all the row blocks. */
         // #pragma omp for schedule(dynamic,2) nowait
         for (int_t b = 0; b < nb; ++b)
         {
- #pragma omp task
+#pragma omp task
             {
 #ifdef _OPENMP
                 int_t thread_id = omp_get_thread_num();
 #else
                 int_t thread_id = 0;
 #endif
-                double *tempv = bigV +  thread_id * ldt * ldt;
+                double *tempv = bigV + thread_id * ldt * ldt;
                 dTrs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp,
-				       usub, uval, tempv, nsupc, nsupc, lusup, Glu_persist);
+                                        usub, uval, tempv, nsupc, nsupc, lusup, Glu_persist);
             }
         }
     }
@@ -321,48 +323,46 @@ int_t dUPanelTrSolve( int_t k,
         // factored_U[k] = 1;
         int_t *Lsub_buf;
         double *Lval_buf;
-        int_t lk = LBj (k, grid);
+        int_t lk = LBj(k, grid);
         Lsub_buf = Llu->Lrowind_bc_ptr[lk];
         Lval_buf = Llu->Lnzval_bc_ptr[lk];
 
-
         /* calculate U panel */
         // PDGSTRS2 (n, k0, k, Lsub_buf, Lval_buf, Glu_persist, grid, Llu,
         //           stat, HyP->Ublock_info, bigV, ldt, SCT);
 
-        lk = LBi (k, grid);         /* Local block number */
+        lk = LBi(k, grid); /* Local block number */
         if (Llu->Unzval_br_ptr[lk])
         {
             /* Initialization. */
-            int_t klst = FstBlockC (k + 1);
+            int_t klst = FstBlockC(k + 1);
 
-            int_t *usub = Llu->Ufstnz_br_ptr[lk];  /* index[] of block row U(k,:) */
+            int_t *usub = Llu->Ufstnz_br_ptr[lk]; /* index[] of block row U(k,:) */
             double *uval = Llu->Unzval_br_ptr[lk];
             int_t nb = usub[0];
 
             // int_t nsupr = Lsub_buf[1];   /* LDA of lusup[] */
-            int_t nsupr = Lsub_buf[1];   /* LDA of lusup[] */
+            int_t nsupr = Lsub_buf[1]; /* LDA of lusup[] */
             double *lusup = Lval_buf;
 
             /* Loop through all the row blocks. to get the iukp and rukp*/
-            Trs2_InitUblock_info(klst, nb, Ublock_info, usub, Glu_persist, stat );
+            Trs2_InitUblock_info(klst, nb, Ublock_info, usub, Glu_persist, stat);
 
             /* Loop through all the row blocks. */
             // printf("%d :U update \n", k);
             for (int_t b = 0; b < nb; ++b)
             {
- #pragma omp task
+#pragma omp task
                 {
 #ifdef _OPENMP
                     int_t thread_id = omp_get_thread_num();
 #else
                     int_t thread_id = 0;
 #endif
-                    double *tempv = bigV +  thread_id * ldt * ldt;
+                    double *tempv = bigV + thread_id * ldt * ldt;
                     dTrs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp,
-					   usub, uval, tempv, nsupc, nsupr, lusup, Glu_persist);
+                                            usub, uval, tempv, nsupc, nsupr, lusup, Glu_persist);
                 }
-
             }
         }
     }
@@ -370,116 +370,114 @@ int_t dUPanelTrSolve( int_t k,
     return 0;
 } /* dUPanelTrSolve */
 
-int_t dUPanelUpdate( int_t k,  int_t* factored_U,
-                    MPI_Request * L_diag_blk_recv_req,
-                    double* BlockLFactor,
-                    double* bigV,
+int_t dUPanelUpdate(int_t k, int_t *factored_U,
+                    MPI_Request *L_diag_blk_recv_req,
+                    double *BlockLFactor,
+                    double *bigV,
                     int_t ldt,
-                    Ublock_info_t* Ublock_info,
+                    Ublock_info_t *Ublock_info,
                     gridinfo_t *grid,
                     LUstruct_t *LUstruct,
                     SuperLUStat_t *stat, SCT_t *SCT)
 {
 
-    LDiagBlockRecvWait( k, factored_U, L_diag_blk_recv_req, grid);
+    LDiagBlockRecvWait(k, factored_U, L_diag_blk_recv_req, grid);
 
-    dUPanelTrSolve( k, BlockLFactor, bigV, ldt, Ublock_info, grid,
-                       LUstruct, stat, SCT);
+    dUPanelTrSolve(k, BlockLFactor, bigV, ldt, Ublock_info, grid,
+                   LUstruct, stat, SCT);
     return 0;
 }
 
 int_t dIBcastRecvLPanel(
     int_t k,
     int_t k0,
-    int* msgcnt,
+    int *msgcnt,
     MPI_Request *send_req,
-    MPI_Request *recv_req ,
-    int_t* Lsub_buf,
-    double* Lval_buf,
-    int_t * factored,
+    MPI_Request *recv_req,
+    int_t *Lsub_buf,
+    double *Lval_buf,
+    int_t *factored,
     gridinfo_t *grid,
     LUstruct_t *LUstruct,
     SCT_t *SCT,
-    int tag_ub
-)
+    int tag_ub)
 {
     Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
     LocalLU_t *Llu = LUstruct->Llu;
-    int_t* xsup = Glu_persist->xsup;
-    int** ToSendR = Llu->ToSendR;
-    int* ToRecv = Llu->ToRecv;
+    int_t *xsup = Glu_persist->xsup;
+    int **ToSendR = Llu->ToSendR;
+    int *ToRecv = Llu->ToRecv;
     int_t iam = grid->iam;
     int_t Pc = grid->npcol;
-    int_t mycol = MYCOL (iam, grid);
-    int_t kcol = PCOL (k, grid);
-    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
-    double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    int_t mycol = MYCOL(iam, grid);
+    int_t kcol = PCOL(k, grid);
+    int_t **Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    double **Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
     /* code */
     if (mycol == kcol)
     {
         /*send the L panel to myrow*/
 
-        int_t lk = LBj (k, grid);     /* Local block number. */
-        int_t* lsub = Lrowind_bc_ptr[lk];
-        double* lusup = Lnzval_bc_ptr[lk];
+        int_t lk = LBj(k, grid); /* Local block number. */
+        int_t *lsub = Lrowind_bc_ptr[lk];
+        double *lusup = Lnzval_bc_ptr[lk];
 
-        dIBcast_LPanel (k, k0, lsub, lusup, grid, msgcnt, send_req,
-		       ToSendR, xsup, tag_ub);
+        dIBcast_LPanel(k, k0, lsub, lusup, grid, msgcnt, send_req,
+                       ToSendR, xsup, tag_ub);
 
         if (lsub)
         {
-            int_t nrbl  =   lsub[0]; /*number of L blocks */
-            int_t   len   = lsub[1];       /* LDA of the nzval[] */
-            int_t len1  = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
-            int_t len2  = SuperSize(lk) * len;
+            int_t nrbl = lsub[0]; /*number of L blocks */
+            int_t len = lsub[1];  /* LDA of the nzval[] */
+            int_t len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
+            int_t len2 = SuperSize(lk) * len;
             SCT->commVolFactor += 1.0 * (Pc - 1) * (len1 * sizeof(int_t) + len2 * sizeof(double));
         }
     }
     else
     {
         /*receive factored L panels*/
-        if (ToRecv[k] >= 1)     /* Recv block column L(:,0). */
+        if (ToRecv[k] >= 1) /* Recv block column L(:,0). */
         {
             /*place Irecv*/
-            dIrecv_LPanel (k, k0, Lsub_buf, Lval_buf, grid, recv_req, Llu, tag_ub);
+            dIrecv_LPanel(k, k0, Lsub_buf, Lval_buf, grid, recv_req, Llu, tag_ub);
         }
         else
         {
             msgcnt[0] = 0;
         }
-
     }
     factored[k] = 0;
 
     return 0;
 }
 
-int_t dIBcastRecvUPanel(int_t k, int_t k0, int* msgcnt,
-    			     MPI_Request *send_requ,
-    			     MPI_Request *recv_requ,
-    			     int_t* Usub_buf, double* Uval_buf,
-    			     gridinfo_t *grid, LUstruct_t *LUstruct,
-    			     SCT_t *SCT, int tag_ub)
+int_t dIBcastRecvUPanel(int_t k, int_t k0, int *msgcnt,
+                        MPI_Request *send_requ,
+                        MPI_Request *recv_requ,
+                        int_t *Usub_buf, double *Uval_buf,
+                        gridinfo_t *grid, LUstruct_t *LUstruct,
+                        SCT_t *SCT, int tag_ub)
 {
     LocalLU_t *Llu = LUstruct->Llu;
 
-    int* ToSendD = Llu->ToSendD;
-    int* ToRecv = Llu->ToRecv;
+    int *ToSendD = Llu->ToSendD;
+    int *ToRecv = Llu->ToRecv;
     int_t iam = grid->iam;
     int_t Pr = grid->nprow;
-    int_t myrow = MYROW (iam, grid);
-    int_t krow = PROW (k, grid);
+    int_t myrow = MYROW(iam, grid);
+    int_t krow = PROW(k, grid);
 
-    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
-    double** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    double **Unzval_br_ptr = Llu->Unzval_br_ptr;
     if (myrow == krow)
     {
         /*send U panel to myrow*/
-        int_t   lk = LBi (k, grid);
-        int_t*  usub = Ufstnz_br_ptr[lk];
-        double* uval = Unzval_br_ptr[lk];
+        int_t lk = LBi(k, grid);
+        int_t *usub = Ufstnz_br_ptr[lk];
+        double *uval = Unzval_br_ptr[lk];
         dIBcast_UPanel(k, k0, usub, uval, grid, msgcnt,
-                        send_requ, ToSendD, tag_ub);
+                       send_requ, ToSendD, tag_ub);
         if (usub)
         {
             /* code */
@@ -491,9 +489,9 @@ int_t dIBcastRecvUPanel(int_t k, int_t k0, int* msgcnt,
     else
     {
         /*receive U panels */
-        if (ToRecv[k] == 2)     /* Recv block row U(k,:). */
+        if (ToRecv[k] == 2) /* Recv block row U(k,:). */
         {
-            dIrecv_UPanel (k, k0, Usub_buf, Uval_buf, Llu, grid, recv_requ, tag_ub);
+            dIrecv_UPanel(k, k0, Usub_buf, Uval_buf, Llu, grid, recv_requ, tag_ub);
         }
         else
         {
@@ -504,58 +502,58 @@ int_t dIBcastRecvUPanel(int_t k, int_t k0, int* msgcnt,
     return 0;
 }
 
-int_t dWaitL( int_t k, int* msgcnt, int* msgcntU,
-              MPI_Request *send_req, MPI_Request *recv_req,
-    	      gridinfo_t *grid, LUstruct_t *LUstruct, SCT_t *SCT)
+int_t dWaitL(int_t k, int *msgcnt, int *msgcntU,
+             MPI_Request *send_req, MPI_Request *recv_req,
+             gridinfo_t *grid, LUstruct_t *LUstruct, SCT_t *SCT)
 {
     LocalLU_t *Llu = LUstruct->Llu;
-    int** ToSendR = Llu->ToSendR;
-    int* ToRecv = Llu->ToRecv;
+    int **ToSendR = Llu->ToSendR;
+    int *ToRecv = Llu->ToRecv;
     int_t iam = grid->iam;
-    int_t mycol = MYCOL (iam, grid);
-    int_t kcol = PCOL (k, grid);
+    int_t mycol = MYCOL(iam, grid);
+    int_t kcol = PCOL(k, grid);
     if (mycol == kcol)
     {
         /*send the L panel to myrow*/
-        Wait_LSend (k, grid, ToSendR, send_req, SCT);
+        Wait_LSend(k, grid, ToSendR, send_req, SCT);
     }
     else
     {
         /*receive factored L panels*/
-        if (ToRecv[k] >= 1)     /* Recv block column L(:,0). */
+        if (ToRecv[k] >= 1) /* Recv block column L(:,0). */
         {
             /*force wait for I recv to complete*/
-            dWait_LRecv( recv_req,  msgcnt, msgcntU, grid, SCT);
+            dWait_LRecv(recv_req, msgcnt, msgcntU, grid, SCT);
         }
     }
 
     return 0;
 }
 
-int_t dWaitU( int_t k, int* msgcnt,
-              MPI_Request *send_requ, MPI_Request *recv_requ,
-    	      gridinfo_t *grid, LUstruct_t *LUstruct, SCT_t *SCT)
+int_t dWaitU(int_t k, int *msgcnt,
+             MPI_Request *send_requ, MPI_Request *recv_requ,
+             gridinfo_t *grid, LUstruct_t *LUstruct, SCT_t *SCT)
 {
     LocalLU_t *Llu = LUstruct->Llu;
 
-    int* ToRecv = Llu->ToRecv;
-    int* ToSendD = Llu->ToSendD;
+    int *ToRecv = Llu->ToRecv;
+    int *ToSendD = Llu->ToSendD;
     int_t iam = grid->iam;
-    int_t myrow = MYROW (iam, grid);
-    int_t krow = PROW (k, grid);
+    int_t myrow = MYROW(iam, grid);
+    int_t krow = PROW(k, grid);
     if (myrow == krow)
     {
-        int_t lk = LBi (k, grid);
+        int_t lk = LBi(k, grid);
         if (ToSendD[lk] == YES)
             Wait_USend(send_requ, grid, SCT);
     }
     else
     {
         /*receive U panels */
-        if (ToRecv[k] == 2)     /* Recv block row U(k,:). */
+        if (ToRecv[k] == 2) /* Recv block row U(k,:). */
         {
             /*force wait*/
-            dWait_URecv( recv_requ, msgcnt, SCT);
+            dWait_URecv(recv_requ, msgcnt, SCT);
         }
     }
     return 0;
diff --git a/SRC/pdgstrf2.c b/SRC/pdgstrf2.c
index 4b1b63de..1064dca0 100644
--- a/SRC/pdgstrf2.c
+++ b/SRC/pdgstrf2.c
@@ -378,19 +378,22 @@ int_t LpanelUpdate(int off0,  int nsupc, double* ublk_ptr, int ld_ujrow,
     {
         int_t off = i * GT;
         int len = SUPERLU_MIN(GT, l - i * GT);
-#if 1
-  #if defined (USE_VENDOR_BLAS)
-        dtrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha,
-		ublk_ptr, &ld_ujrow, &lusup[off0 + off], &nsupr,
-		1, 1, 1, 1);
-  #else
-        dtrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha,
-		ublk_ptr, &ld_ujrow, &lusup[off0 + off], &nsupr);
-  #endif
-#else
-        cblas_dtrsm (CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit,
+// #if 1
+//   #if defined (USE_VENDOR_BLAS)
+//         dtrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha,
+// 		ublk_ptr, &ld_ujrow, &lusup[off0 + off], &nsupr,
+// 		1, 1, 1, 1);
+//   #else
+//         dtrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha,
+// 		ublk_ptr, &ld_ujrow, &lusup[off0 + off], &nsupr);
+//   #endif
+// #else
+//         cblas_dtrsm (CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit,
+//                      len, nsupc, alpha, ublk_ptr, ld_ujrow, &lusup[off0 + off], nsupr);
+// #endif
+    superlu_dtrsm ("R", "U", "N", "N",
                      len, nsupc, alpha, ublk_ptr, ld_ujrow, &lusup[off0 + off], nsupr);
-#endif
+
 
     } /* for i = ... */
 
@@ -735,20 +738,23 @@ int_t dTrs2_GatherTrsmScatter(int_t klst, int_t iukp, int_t rukp,
     int_t luptr = (knsupc - ldu) * (nsupr + 1);
     // if(ldu>nsupr) printf("nsupr %d ldu %d\n",nsupr,ldu );
 
-#if 1
-  #if defined (USE_VENDOR_BLAS)
-     dtrsm_ ("L", "L", "N", "U", &ldu, &ncols, &alpha,
-	     &lusup[luptr], &nsupr, tempv, &ldu,
-	     1, 1, 1, 1);
-  #else
-     dtrsm_ ("L", "L", "N", "U", &ldu, &ncols, &alpha,
-	     &lusup[luptr], &nsupr, tempv, &ldu);
-  #endif
-#else
-
-    cblas_dtrsm (CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasUnit,
+// #if 1
+//   #if defined (USE_VENDOR_BLAS)
+//      dtrsm_ ("L", "L", "N", "U", &ldu, &ncols, &alpha,
+// 	     &lusup[luptr], &nsupr, tempv, &ldu,
+// 	     1, 1, 1, 1);
+//   #else
+//      dtrsm_ ("L", "L", "N", "U", &ldu, &ncols, &alpha,
+// 	     &lusup[luptr], &nsupr, tempv, &ldu);
+//   #endif
+// #else
+
+//     cblas_dtrsm (CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasUnit,
+//                  ldu, ncols, alpha, &lusup[luptr], nsupr, tempv, ldu);
+// #endif
+
+    superlu_dtrsm ("L", "L", "N", "U",
                  ldu, ncols, alpha, &lusup[luptr], nsupr, tempv, ldu);
-#endif
 
     /*now scatter the output into sparse U block*/
     dTrs2_ScatterU(iukp, rukp, klst, nsupc, ldu, usub, uval, tempv);
diff --git a/SRC/superlu_ddefs.h b/SRC/superlu_ddefs.h
index 7db2a32e..9173c7e1 100644
--- a/SRC/superlu_ddefs.h
+++ b/SRC/superlu_ddefs.h
@@ -571,6 +571,15 @@ extern int superlu_dgemm(const char *transa, const char *transb,
                   int m, int n, int k, double alpha, double *a, 
                   int lda, double *b, int ldb, double beta, double *c, int ldc);
 
+extern int superlu_dtrsm(const char *sideRL, const char *uplo,
+                  const char *transa, const char *diag,
+                  const int m, const int n,
+                  const double alpha, const double *a,
+                  const int lda, double *b, const int ldb);
+extern int superlu_dger(const int m, const int n, const double alpha,
+                 const double *x, const int incx, const double *y,
+                 const int incy, double *a, const int lda);
+
 
 /*==== For 3D code ====*/
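
Like superlu_dgemm, these wrappers fold the per-convention dtrsm_/dger_
variants into one portable call. A small sketch of superlu_dtrsm solving
X * U = B in place, mirroring the "R", "U", "N", "N" calls in
dtrfCommWrapper.c (not from the patch):

    #include "superlu_ddefs.h"

    void trsm_demo(void)
    {
        double U[4] = {2, 0, 1, 4};   /* 2x2 upper triangular [2 1; 0 4] */
        double B[4] = {2, 4, 5, 10};  /* column-major; overwritten with X */
        /* side="R", uplo="U", no transpose, non-unit diagonal */
        superlu_dtrsm("R", "U", "N", "N", 2, 2, 1.0, U, 2, B, 2);
        /* X = [1 1; 2 2], since [1 1; 2 2] * [2 1; 0 4] = [2 5; 4 10]. */
    }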
 

From d873f5a5ef1178f2b1d880d0e80e37afb0aaa306 Mon Sep 17 00:00:00 2001
From: 7ps 
Date: Sun, 29 Nov 2020 12:19:31 -0500
Subject: [PATCH 045/147] Adding new defs

---
 SRC/dsuperlu_blas.c |    3 +-
 SRC/superlu_ddefs.h | 1585 +++++++++++++++++++++----------------------
 2 files changed, 791 insertions(+), 797 deletions(-)

diff --git a/SRC/dsuperlu_blas.c b/SRC/dsuperlu_blas.c
index 1488f427..090a77cb 100644
--- a/SRC/dsuperlu_blas.c
+++ b/SRC/dsuperlu_blas.c
@@ -102,4 +102,5 @@ int superlu_dtrsv(char *uplo, char *trans, char *diag,
 #endif
     
     return 0;
-}
\ No newline at end of file
+}
+
diff --git a/SRC/superlu_ddefs.h b/SRC/superlu_ddefs.h
index 9173c7e1..a132f0ae 100644
--- a/SRC/superlu_ddefs.h
+++ b/SRC/superlu_ddefs.h
@@ -9,7 +9,6 @@ The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
-
 /*! @file
  * \brief  Distributed SuperLU data types and function prototypes
  *
@@ -37,7 +36,8 @@ at the top-level directory.
 #include "superlu_defs.h"
 
 /*-- Auxiliary data type used in PxGSTRS/PxGSTRS1. */
-typedef struct {
+typedef struct
+{
     int_t lbnum;  /* Row block number (local).      */
     int_t indpos; /* Starting position in Uindex[]. */
 } Ucb_indptr_t;
@@ -47,33 +47,34 @@ typedef struct {
  * column format, the blocks in U are stored in compressed block row format.
  */
 #define MAX_LOOKAHEADS 50
-typedef struct {
-    int_t   **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc)                 */
-    double **Lnzval_bc_ptr;  /* size ceil(NSUPERS/Pc)                 */
-    double **Linv_bc_ptr;  /* size ceil(NSUPERS/Pc)                 */
-    int_t   **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc)  pointers to locations in Lrowind_bc_ptr and Lnzval_bc_ptr */
-    int_t   *Unnz; /* number of nonzeros per block column in U*/
-	int_t   **Lrowind_bc_2_lsum; /* size ceil(NSUPERS/Pc)  map indices of Lrowind_bc_ptr to indices of lsum  */
-    double  **Uinv_bc_ptr;  /* size ceil(NSUPERS/Pc)     	*/
-    int_t   **Ufstnz_br_ptr;  /* size ceil(NSUPERS/Pr)                 */
-    double  **Unzval_br_ptr;  /* size ceil(NSUPERS/Pr)                 */
-        /*-- Data structures used for broadcast and reduction trees. --*/
-    BcTree  *LBtree_ptr;       /* size ceil(NSUPERS/Pc)                */
-    RdTree  *LRtree_ptr;       /* size ceil(NSUPERS/Pr)                */
-    BcTree  *UBtree_ptr;       /* size ceil(NSUPERS/Pc)                */
-    RdTree  *URtree_ptr;       /* size ceil(NSUPERS/Pr)			*/
+typedef struct
+{
+    int_t **Lrowind_bc_ptr;     /* size ceil(NSUPERS/Pc)                 */
+    double **Lnzval_bc_ptr;     /* size ceil(NSUPERS/Pc)                 */
+    double **Linv_bc_ptr;       /* size ceil(NSUPERS/Pc)                 */
+    int_t **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc)  pointers to locations in Lrowind_bc_ptr and Lnzval_bc_ptr */
+    int_t *Unnz;                /* number of nonzeros per block column in U*/
+    int_t **Lrowind_bc_2_lsum;  /* size ceil(NSUPERS/Pc)  map indices of Lrowind_bc_ptr to indices of lsum  */
+    double **Uinv_bc_ptr;       /* size ceil(NSUPERS/Pc)     	*/
+    int_t **Ufstnz_br_ptr;      /* size ceil(NSUPERS/Pr)                 */
+    double **Unzval_br_ptr;     /* size ceil(NSUPERS/Pr)                 */
+    /*-- Data structures used for broadcast and reduction trees. --*/
+    BcTree *LBtree_ptr; /* size ceil(NSUPERS/Pc)                */
+    RdTree *LRtree_ptr; /* size ceil(NSUPERS/Pr)                */
+    BcTree *UBtree_ptr; /* size ceil(NSUPERS/Pc)                */
+    RdTree *URtree_ptr; /* size ceil(NSUPERS/Pr)			*/
 #if 0
     int_t   *Lsub_buf;        /* Buffer for the remote subscripts of L */
     double  *Lval_buf;        /* Buffer for the remote nonzeros of L   */
     int_t   *Usub_buf;        /* Buffer for the remote subscripts of U */
     double  *Uval_buf;        /* Buffer for the remote nonzeros of U   */
 #endif
-    int_t   *Lsub_buf_2[MAX_LOOKAHEADS];   /* Buffers for the remote subscripts of L*/
-    double  *Lval_buf_2[MAX_LOOKAHEADS];   /* Buffers for the remote nonzeros of L  */
-    int_t   *Usub_buf_2[MAX_LOOKAHEADS];   /* Buffer for the remote subscripts of U */
-    double  *Uval_buf_2[MAX_LOOKAHEADS];   /* Buffer for the remote nonzeros of U   */
-    double  *ujrow;           /* used in panel factorization.          */
-    int_t   bufmax[NBUFFERS]; /* Maximum buffer size across all MPI ranks:
+    int_t *Lsub_buf_2[MAX_LOOKAHEADS];  /* Buffers for the remote subscripts of L*/
+    double *Lval_buf_2[MAX_LOOKAHEADS]; /* Buffers for the remote nonzeros of L  */
+    int_t *Usub_buf_2[MAX_LOOKAHEADS];  /* Buffer for the remote subscripts of U */
+    double *Uval_buf_2[MAX_LOOKAHEADS]; /* Buffer for the remote nonzeros of U   */
+    double *ujrow;                      /* used in panel factorization.          */
+    int_t bufmax[NBUFFERS];             /* Maximum buffer size across all MPI ranks:
 			       *  0 : maximum size of Lsub_buf[]
 			       *  1 : maximum size of Lval_buf[]
 			       *  2 : maximum size of Usub_buf[]
@@ -82,30 +83,29 @@ typedef struct {
 			       */
 
     /*-- Record communication schedule for factorization. --*/
-    int   *ToRecv;          /* Recv from no one (0), left (1), and up (2).*/
-    int   *ToSendD;         /* Whether need to send down block row.       */
-    int   **ToSendR;        /* List of processes to send right block col. */
+    int *ToRecv;   /* Recv from no one (0), left (1), and up (2).*/
+    int *ToSendD;  /* Whether need to send down block row.       */
+    int **ToSendR; /* List of processes to send right block col. */
 
     /*-- Record communication schedule for forward/back solves. --*/
-    int_t   *fmod;            /* Modification count for L-solve            */
-    int_t   **fsendx_plist;   /* Column process list to send down Xk       */
-    int_t   *frecv;           /* Modifications to be recv'd in proc row    */
-    int_t   nfrecvx;          /* Number of Xk I will receive in L-solve    */
-    int_t   nfsendx;          /* Number of Xk I will send in L-solve       */
-    int_t   *bmod;            /* Modification count for U-solve            */
-    int_t   **bsendx_plist;   /* Column process list to send down Xk       */
-    int_t   *brecv;           /* Modifications to be recv'd in proc row    */
-    int_t   nbrecvx;          /* Number of Xk I will receive in U-solve    */
-    int_t   nbsendx;          /* Number of Xk I will send in U-solve       */
-    int_t   *mod_bit;         /* Flag contribution from each row blocks    */
+    int_t *fmod;          /* Modification count for L-solve            */
+    int_t **fsendx_plist; /* Column process list to send down Xk       */
+    int_t *frecv;         /* Modifications to be recv'd in proc row    */
+    int_t nfrecvx;        /* Number of Xk I will receive in L-solve    */
+    int_t nfsendx;        /* Number of Xk I will send in L-solve       */
+    int_t *bmod;          /* Modification count for U-solve            */
+    int_t **bsendx_plist; /* Column process list to send down Xk       */
+    int_t *brecv;         /* Modifications to be recv'd in proc row    */
+    int_t nbrecvx;        /* Number of Xk I will receive in U-solve    */
+    int_t nbsendx;        /* Number of Xk I will send in U-solve       */
+    int_t *mod_bit;       /* Flag contribution from each row block     */
 
     /*-- Auxiliary arrays used for forward/back solves. --*/
-    int_t   *ilsum;           /* Starting position of each supernode in lsum
+    int_t *ilsum;       /* Starting position of each supernode in lsum
 				 (local)  */
-    int_t   ldalsum;          /* LDA of lsum (local) */
-    int_t   SolveMsgSent;     /* Number of actual messages sent in LU-solve */
-    int_t   SolveMsgVol;      /* Volume of messages sent in the solve phase */
-
+    int_t ldalsum;      /* LDA of lsum (local) */
+    int_t SolveMsgSent; /* Number of actual messages sent in LU-solve */
+    int_t SolveMsgVol;  /* Volume of messages sent in the solve phase */
 
     /*********************/
     /* The following variables are used in the hybrid solver */
@@ -114,19 +114,19 @@ typedef struct {
     int_t UT_SOLVE;
     int_t L_SOLVE;
     int_t FRECV;
-    int_t ut_ldalsum;        /* LDA of lsum (local) */
-    int_t *ut_ilsum;         /* ilsum in column-wise                        */
-    int_t *utmod;            /* Modification count for Ut-solve.            */
-    int_t **ut_sendx_plist;  /* Row process list to send down Xk            */
-    int_t *utrecv;           /* Modifications to be recev'd in proc column. */
-    int_t n_utsendx;         /* Number of Xk I will receive                 */
-    int_t n_utrecvx;         /* Number of Xk I will send                    */
+    int_t ut_ldalsum;       /* LDA of lsum (local) */
+    int_t *ut_ilsum;        /* ilsum in column-wise                        */
+    int_t *utmod;           /* Modification count for Ut-solve.            */
+    int_t **ut_sendx_plist; /* Row process list to send down Xk            */
+    int_t *utrecv;          /* Modifications to be recv'd in proc column.  */
+    int_t n_utsendx;        /* Number of Xk I will receive                 */
+    int_t n_utrecvx;        /* Number of Xk I will send                    */
     int_t n_utrecvmod;
     int_t nroot;
     int_t *ut_modbit;
     int_t *Urbs;
-    Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */
-    int_t  **Ucb_valptr;      /* Vertical linked list pointing to Unzval[] */
+    Ucb_indptr_t **Ucb_indptr; /* Vertical linked list pointing to Uindex[] */
+    int_t **Ucb_valptr;        /* Vertical linked list pointing to Unzval[] */
 
     /* some additional counters for L solve */
     int_t n;
@@ -135,63 +135,63 @@ typedef struct {
     int_t inv; /* whether the diagonal block is inverted*/
 } LocalLU_t;
 
-
-typedef struct {
+typedef struct
+{
     int_t *etree;
     Glu_persist_t *Glu_persist;
     LocalLU_t *Llu;
-	char dt;
+    char dt;
 } LUstruct_t;
 
-
 /*-- Data structure for communication during matrix-vector multiplication. */
-typedef struct {
+typedef struct
+{
     int_t *extern_start;
-    int_t *ind_tosend;    /* X indeices to be sent to other processes */
-    int_t *ind_torecv;    /* X indeices to be received from other processes */
-    int_t *ptr_ind_tosend;/* Printers to ind_tosend[] (Size procs)
+    int_t *ind_tosend;     /* X indices to be sent to other processes */
+    int_t *ind_torecv;     /* X indices to be received from other processes */
+    int_t *ptr_ind_tosend; /* Pointers to ind_tosend[] (Size procs)
 			     (also point to val_torecv) */
-    int_t *ptr_ind_torecv;/* Printers to ind_torecv[] (Size procs)
+    int_t *ptr_ind_torecv; /* Pointers to ind_torecv[] (Size procs)
 			     (also point to val_tosend) */
-    int   *SendCounts;    /* Numbers of X indices to be sent
+    int *SendCounts;       /* Numbers of X indices to be sent
 			     (also numbers of X values to be received) */
-    int   *RecvCounts;    /* Numbers of X indices to be received
+    int *RecvCounts;       /* Numbers of X indices to be received
 			     (also numbers of X values to be sent) */
-    double *val_tosend;   /* X values to be sent to other processes */
-    double *val_torecv;   /* X values to be received from other processes */
-    int_t TotalIndSend;   /* Total number of indices to be sent
+    double *val_tosend;    /* X values to be sent to other processes */
+    double *val_torecv;    /* X values to be received from other processes */
+    int_t TotalIndSend;    /* Total number of indices to be sent
 			     (also total number of values to be received) */
-    int_t TotalValSend;   /* Total number of values to be sent.
+    int_t TotalValSend;    /* Total number of values to be sent.
 			     (also total number of indices to be received) */
 } pdgsmv_comm_t;
 
 /*-- Data structure holding the information for the solution phase --*/
-typedef struct {
+typedef struct
+{
     int_t *row_to_proc;
     int_t *inv_perm_c;
     int_t num_diag_procs, *diag_procs, *diag_len;
-    pdgsmv_comm_t *gsmv_comm; /* communication metadata for SpMV,
+    pdgsmv_comm_t *gsmv_comm;   /* communication metadata for SpMV,
          	       		      required by IterRefine.          */
-    pxgstrs_comm_t *gstrs_comm;  /* communication metadata for SpTRSV. */
-    int_t *A_colind_gsmv; /* After pdgsmv_init(), the global column
+    pxgstrs_comm_t *gstrs_comm; /* communication metadata for SpTRSV. */
+    int_t *A_colind_gsmv;       /* After pdgsmv_init(), the global column
                              indices of A are translated into the relative
                              positions in the gathered x-vector.
                              This is re-used in repeated calls to pdgsmv() */
-    int_t *xrow_to_proc; /* used by PDSLin */
+    int_t *xrow_to_proc;        /* used by PDSLin */
 } SOLVEstruct_t;
 
-
 /*==== For 3D code ====*/
 
-// new structures for pdgstrf_4_8 
+// new structures for pdgstrf_4_8
 
 typedef struct
 {
     int_t nub;
     int_t klst;
     int_t ldu;
-    int_t* usub;
-    double* uval;
+    int_t *usub;
+    double *uval;
 } uPanelInfo_t;
 
 typedef struct
@@ -199,25 +199,23 @@ typedef struct
     int_t *lsub;
     double *lusup;
     int_t luptr0;
-    int_t nlb;  //number of l blocks
+    int_t nlb; //number of l blocks
     int_t nsupr;
 } lPanelInfo_t;
 
- 
-
 /* HyP_t is the data structure to assist HALO offload of Schur-complement. */
 typedef struct
 {
     Remain_info_t *lookAhead_info, *Remain_info;
     Ublock_info_t *Ublock_info, *Ublock_info_Phi;
-    
-    int_t first_l_block_acc , first_u_block_acc;
-    int_t last_offload ;
-    int_t *Lblock_dirty_bit, * Ublock_dirty_bit;
+
+    int_t first_l_block_acc, first_u_block_acc;
+    int_t last_offload;
+    int_t *Lblock_dirty_bit, *Ublock_dirty_bit;
     double *lookAhead_L_buff, *Remain_L_buff;
-    int_t lookAheadBlk;  /* number of blocks in look-ahead window */
-    int_t RemainBlk ;    /* number of blocks outside look-ahead window */
-    int_t  num_look_aheads, nsupers;
+    int_t lookAheadBlk; /* number of blocks in look-ahead window */
+    int_t RemainBlk;    /* number of blocks outside look-ahead window */
+    int_t num_look_aheads, nsupers;
     int_t ldu, ldu_Phi;
     int_t num_u_blks, num_u_blks_Phi;
 
@@ -234,32 +232,31 @@ typedef struct
     int_t nCudaStreams;
 } HyP_t;
 
-typedef struct 
+typedef struct
 {
-    int_t * Lsub_buf ;
-    double * Lval_buf ;
-    int_t * Usub_buf ;
-    double * Uval_buf ;
+    int_t *Lsub_buf;
+    double *Lval_buf;
+    int_t *Usub_buf;
+    double *Uval_buf;
 } dLUValSubBuf_t;
 
 int_t scuStatUpdate(
     int_t knsupc,
-    HyP_t* HyP, 
-    SCT_t* SCT,
-    SuperLUStat_t *stat
-    );
+    HyP_t *HyP,
+    SCT_t *SCT,
+    SuperLUStat_t *stat);
 
 typedef struct
 {
     gEtreeInfo_t gEtreeInfo;
-    int_t* iperm_c_supno;
-    int_t* myNodeCount;
-    int_t* myTreeIdxs;
-    int_t* myZeroTrIdxs;
-    int_t** treePerm;
-    sForest_t** sForests;
-    int_t* supernode2treeMap;
-    dLUValSubBuf_t  *LUvsb;
+    int_t *iperm_c_supno;
+    int_t *myNodeCount;
+    int_t *myTreeIdxs;
+    int_t *myZeroTrIdxs;
+    int_t **treePerm;
+    sForest_t **sForests;
+    int_t *supernode2treeMap;
+    dLUValSubBuf_t *LUvsb;
 } trf3Dpartition_t;
 
 typedef struct
@@ -269,20 +266,19 @@ typedef struct
 } scuBufs_t;
 
 typedef struct
-{   
-    double* BlockLFactor;
-    double* BlockUFactor;
+{
+    double *BlockLFactor;
+    double *BlockUFactor;
 } diagFactBufs_t;
 
 typedef struct
 {
-    Ublock_info_t* Ublock_info;
-    Remain_info_t*  Remain_info;
-    uPanelInfo_t* uPanelInfo;
-    lPanelInfo_t* lPanelInfo;
+    Ublock_info_t *Ublock_info;
+    Remain_info_t *Remain_info;
+    uPanelInfo_t *uPanelInfo;
+    lPanelInfo_t *lPanelInfo;
 } packLUInfo_t;
 
-
 /*=====================*/
 
 /***********************************************************************
@@ -290,420 +286,425 @@ typedef struct
  ***********************************************************************/
 
 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif
 
-
-/* Supernodal LU factor related */
-extern void
-dCreate_CompCol_Matrix_dist(SuperMatrix *, int_t, int_t, int_t, double *,
-			    int_t *, int_t *, Stype_t, Dtype_t, Mtype_t);
-extern void
-dCreate_CompRowLoc_Matrix_dist(SuperMatrix *, int_t, int_t, int_t, int_t,
-			       int_t, double *, int_t *, int_t *,
-			       Stype_t, Dtype_t, Mtype_t);
-extern void
-dCompRow_to_CompCol_dist(int_t, int_t, int_t, double *, int_t *, int_t *,
-                         double **, int_t **, int_t **);
-extern int
-pdCompRow_loc_to_CompCol_global(int_t, SuperMatrix *, gridinfo_t *,
-	 		        SuperMatrix *);
-extern void
-dCopy_CompCol_Matrix_dist(SuperMatrix *, SuperMatrix *);
-extern void
-dCreate_Dense_Matrix_dist(SuperMatrix *, int_t, int_t, double *, int_t,
-			  Stype_t, Dtype_t, Mtype_t);
-extern void
-dCreate_SuperNode_Matrix_dist(SuperMatrix *, int_t, int_t, int_t, double *,
-			      int_t *, int_t *, int_t *, int_t *, int_t *,
-			      Stype_t, Dtype_t, Mtype_t);
-extern void
-dCopy_Dense_Matrix_dist(int_t, int_t, double *, int_t,
-                        double *, int_t);
-
-extern void    dallocateA_dist (int_t, int_t, double **, int_t **, int_t **);
-extern void    dGenXtrue_dist (int_t, int_t, double *, int_t);
-extern void    dFillRHS_dist (char *, int_t, double *, int_t,
+    /* Supernodal LU factor related */
+    extern void
+    dCreate_CompCol_Matrix_dist(SuperMatrix *, int_t, int_t, int_t, double *,
+                                int_t *, int_t *, Stype_t, Dtype_t, Mtype_t);
+    extern void
+    dCreate_CompRowLoc_Matrix_dist(SuperMatrix *, int_t, int_t, int_t, int_t,
+                                   int_t, double *, int_t *, int_t *,
+                                   Stype_t, Dtype_t, Mtype_t);
+    extern void
+    dCompRow_to_CompCol_dist(int_t, int_t, int_t, double *, int_t *, int_t *,
+                             double **, int_t **, int_t **);
+    extern int
+    pdCompRow_loc_to_CompCol_global(int_t, SuperMatrix *, gridinfo_t *,
+                                    SuperMatrix *);
+    extern void
+    dCopy_CompCol_Matrix_dist(SuperMatrix *, SuperMatrix *);
+    extern void
+    dCreate_Dense_Matrix_dist(SuperMatrix *, int_t, int_t, double *, int_t,
+                              Stype_t, Dtype_t, Mtype_t);
+    extern void
+    dCreate_SuperNode_Matrix_dist(SuperMatrix *, int_t, int_t, int_t, double *,
+                                  int_t *, int_t *, int_t *, int_t *, int_t *,
+                                  Stype_t, Dtype_t, Mtype_t);
+    extern void
+    dCopy_Dense_Matrix_dist(int_t, int_t, double *, int_t,
+                            double *, int_t);
+
+    extern void dallocateA_dist(int_t, int_t, double **, int_t **, int_t **);
+    extern void dGenXtrue_dist(int_t, int_t, double *, int_t);
+    extern void dFillRHS_dist(char *, int_t, double *, int_t,
                               SuperMatrix *, double *, int_t);
-extern int     dcreate_matrix(SuperMatrix *, int, double **, int *,
-			      double **, int *, FILE *, gridinfo_t *);
-extern int     dcreate_matrix_rb(SuperMatrix *, int, double **, int *,
-			      double **, int *, FILE *, gridinfo_t *);
-extern int     dcreate_matrix_dat(SuperMatrix *, int, double **, int *,
-			      double **, int *, FILE *, gridinfo_t *);
-extern int 	   dcreate_matrix_postfix(SuperMatrix *, int, double **, int *,
-				  double **, int *, FILE *, char *, gridinfo_t *);
-
-/*==== For 3D code ====*/
-extern int dcreate_matrix3d(SuperMatrix *A, int nrhs, double **rhs,
-                     int *ldb, double **x, int *ldx,
-                     FILE *fp, gridinfo3d_t *grid3d);
-extern int dcreate_matrix_postfix3d(SuperMatrix *A, int nrhs, double **rhs,
-                           int *ldb, double **x, int *ldx,
-                           FILE *fp, char * postfix, gridinfo3d_t *grid3d);
-    
-/* Matrix distributed in NRformat_loc in 3D process grid. It converts 
+    extern int dcreate_matrix(SuperMatrix *, int, double **, int *,
+                              double **, int *, FILE *, gridinfo_t *);
+    extern int dcreate_matrix_rb(SuperMatrix *, int, double **, int *,
+                                 double **, int *, FILE *, gridinfo_t *);
+    extern int dcreate_matrix_dat(SuperMatrix *, int, double **, int *,
+                                  double **, int *, FILE *, gridinfo_t *);
+    extern int dcreate_matrix_postfix(SuperMatrix *, int, double **, int *,
+                                      double **, int *, FILE *, char *, gridinfo_t *);
+
+    /*==== For 3D code ====*/
+    extern int dcreate_matrix3d(SuperMatrix *A, int nrhs, double **rhs,
+                                int *ldb, double **x, int *ldx,
+                                FILE *fp, gridinfo3d_t *grid3d);
+    extern int dcreate_matrix_postfix3d(SuperMatrix *A, int nrhs, double **rhs,
+                                        int *ldb, double **x, int *ldx,
+                                        FILE *fp, char *postfix, gridinfo3d_t *grid3d);
+
+    /* Matrix distributed in NRformat_loc on the 3D process grid; this converts
   it to an NRformat_loc distributed on the 2D grid in grid-0 */
-extern NRformat_loc3d *dGatherNRformat_loc3d(NRformat_loc *A, double *B,
-					     int ldb, int nrhs,
-					     gridinfo3d_t *grid3d);
-extern int dScatter_B3d(NRformat_loc3d *A3d, gridinfo3d_t *grid3d);
-    
-
-/* Driver related */
-extern void    dgsequ_dist (SuperMatrix *, double *, double *, double *,
-			    double *, double *, int_t *);
-extern double  dlangs_dist (char *, SuperMatrix *);
-extern void    dlaqgs_dist (SuperMatrix *, double *, double *, double,
-			    double, double, char *);
-extern void    pdgsequ (SuperMatrix *, double *, double *, double *,
-			double *, double *, int_t *, gridinfo_t *);
-extern double  pdlangs (char *, SuperMatrix *, gridinfo_t *);
-extern void    pdlaqgs (SuperMatrix *, double *, double *, double,
-			double, double, char *);
-extern int     pdPermute_Dense_Matrix(int_t, int_t, int_t [], int_t[],
-				      double [], int, double [], int, int,
-				      gridinfo_t *);
-
-extern int     sp_dtrsv_dist (char *, char *, char *, SuperMatrix *,
-			      SuperMatrix *, double *, int *);
-extern int     sp_dgemv_dist (char *, double, SuperMatrix *, double *,
-			      int, double, double *, int);
-extern int     sp_dgemm_dist (char *, int, double, SuperMatrix *,
-                        double *, int, double, double *, int);
-
-extern float ddistribute(fact_t, int_t, SuperMatrix *, Glu_freeable_t *,
-			 LUstruct_t *, gridinfo_t *);
-extern void  pdgssvx_ABglobal(superlu_dist_options_t *, SuperMatrix *,
-			      ScalePermstruct_t *, double *,
-			      int, int, gridinfo_t *, LUstruct_t *, double *,
-			      SuperLUStat_t *, int *);
-extern float pddistribute(fact_t, int_t, SuperMatrix *,
-			 ScalePermstruct_t *, Glu_freeable_t *,
-			 LUstruct_t *, gridinfo_t *);
-extern void  pdgssvx(superlu_dist_options_t *, SuperMatrix *,
-		     ScalePermstruct_t *, double *,
-		     int, int, gridinfo_t *, LUstruct_t *,
-		     SOLVEstruct_t *, double *, SuperLUStat_t *, int *);
-extern void  pdCompute_Diag_Inv(int_t, LUstruct_t *,gridinfo_t *, SuperLUStat_t *, int *);
-extern int  dSolveInit(superlu_dist_options_t *, SuperMatrix *, int_t [], int_t [],
-		       int_t, LUstruct_t *, gridinfo_t *, SOLVEstruct_t *);
-extern void dSolveFinalize(superlu_dist_options_t *, SOLVEstruct_t *);
-extern int_t pxgstrs_init(int_t, int_t, int_t, int_t,
-                          int_t [], int_t [], gridinfo_t *grid,
-	                  Glu_persist_t *, SOLVEstruct_t *);
-extern void pxgstrs_finalize(pxgstrs_comm_t *);
-extern int  dldperm_dist(int_t, int_t, int_t, int_t [], int_t [],
-		    double [], int_t *, double [], double []);
-extern int  static_schedule(superlu_dist_options_t *, int, int,
-		            LUstruct_t *, gridinfo_t *, SuperLUStat_t *,
-			    int_t *, int_t *, int *);
-extern void LUstructInit(const int_t, LUstruct_t *);
-extern void LUstructFree(LUstruct_t *);
-extern void Destroy_LU(int_t, gridinfo_t *, LUstruct_t *);
-extern void Destroy_Tree(int_t, gridinfo_t *, LUstruct_t *);
-extern void dscatter_l (int ib, int ljb, int nsupc, int_t iukp, int_t* xsup,
-			int klst, int nbrow, int_t lptr, int temp_nbrow,
-			int_t* usub, int_t* lsub, double *tempv,
-			int* indirect_thread, int* indirect2,
-			int_t ** Lrowind_bc_ptr, double **Lnzval_bc_ptr,
-			gridinfo_t * grid);
-extern void dscatter_u (int ib, int jb, int nsupc, int_t iukp, int_t * xsup,
-                        int klst, int nbrow, int_t lptr, int temp_nbrow,
-                        int_t* lsub, int_t* usub, double* tempv,
-                        int_t ** Ufstnz_br_ptr, double **Unzval_br_ptr,
-                        gridinfo_t * grid);
-extern int_t pdgstrf(superlu_dist_options_t *, int, int, double,
-		    LUstruct_t*, gridinfo_t*, SuperLUStat_t*, int*);
-
-/* #define GPU_PROF
+    extern NRformat_loc3d *dGatherNRformat_loc3d(NRformat_loc *A, double *B,
+                                                 int ldb, int nrhs,
+                                                 gridinfo3d_t *grid3d);
+    extern int dScatter_B3d(NRformat_loc3d *A3d, gridinfo3d_t *grid3d);
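A minimal sketch of the gather/solve/scatter pattern these two routines support, assuming A_loc, B, and grid3d are already set up; the grid-0 solve itself is elided, and the zscp.Iam test for "am I on layer 0" is an assumption about the grid layout:

#include "superlu_ddefs.h"

void solve_on_grid0_sketch(NRformat_loc *A_loc, double *B, int ldb,
                           int nrhs, gridinfo3d_t *grid3d)
{
    /* Collapse the 3D-distributed right-hand side onto grid-0. */
    NRformat_loc3d *A3d = dGatherNRformat_loc3d(A_loc, B, ldb, nrhs, grid3d);

    if (grid3d->zscp.Iam == 0) {
        /* ... factor and solve on the 2D grid-0 layer ... */
    }

    dScatter_B3d(A3d, grid3d);  /* redistribute the solution to the 3D grid */
}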
+
+    /* Driver related */
+    extern void dgsequ_dist(SuperMatrix *, double *, double *, double *,
+                            double *, double *, int_t *);
+    extern double dlangs_dist(char *, SuperMatrix *);
+    extern void dlaqgs_dist(SuperMatrix *, double *, double *, double,
+                            double, double, char *);
+    extern void pdgsequ(SuperMatrix *, double *, double *, double *,
+                        double *, double *, int_t *, gridinfo_t *);
+    extern double pdlangs(char *, SuperMatrix *, gridinfo_t *);
+    extern void pdlaqgs(SuperMatrix *, double *, double *, double,
+                        double, double, char *);
+    extern int pdPermute_Dense_Matrix(int_t, int_t, int_t[], int_t[],
+                                      double[], int, double[], int, int,
+                                      gridinfo_t *);
+
+    extern int sp_dtrsv_dist(char *, char *, char *, SuperMatrix *,
+                             SuperMatrix *, double *, int *);
+    extern int sp_dgemv_dist(char *, double, SuperMatrix *, double *,
+                             int, double, double *, int);
+    extern int sp_dgemm_dist(char *, int, double, SuperMatrix *,
+                             double *, int, double, double *, int);
+
+    extern float ddistribute(fact_t, int_t, SuperMatrix *, Glu_freeable_t *,
+                             LUstruct_t *, gridinfo_t *);
+    extern void pdgssvx_ABglobal(superlu_dist_options_t *, SuperMatrix *,
+                                 ScalePermstruct_t *, double *,
+                                 int, int, gridinfo_t *, LUstruct_t *, double *,
+                                 SuperLUStat_t *, int *);
+    extern float pddistribute(fact_t, int_t, SuperMatrix *,
+                              ScalePermstruct_t *, Glu_freeable_t *,
+                              LUstruct_t *, gridinfo_t *);
+    extern void pdgssvx(superlu_dist_options_t *, SuperMatrix *,
+                        ScalePermstruct_t *, double *,
+                        int, int, gridinfo_t *, LUstruct_t *,
+                        SOLVEstruct_t *, double *, SuperLUStat_t *, int *);
+    extern void pdCompute_Diag_Inv(int_t, LUstruct_t *, gridinfo_t *, SuperLUStat_t *, int *);
+    extern int dSolveInit(superlu_dist_options_t *, SuperMatrix *, int_t[], int_t[],
+                          int_t, LUstruct_t *, gridinfo_t *, SOLVEstruct_t *);
+    extern void dSolveFinalize(superlu_dist_options_t *, SOLVEstruct_t *);
+    extern int_t pxgstrs_init(int_t, int_t, int_t, int_t,
+                              int_t[], int_t[], gridinfo_t *grid,
+                              Glu_persist_t *, SOLVEstruct_t *);
+    extern void pxgstrs_finalize(pxgstrs_comm_t *);
+    extern int dldperm_dist(int_t, int_t, int_t, int_t[], int_t[],
+                            double[], int_t *, double[], double[]);
+    extern int static_schedule(superlu_dist_options_t *, int, int,
+                               LUstruct_t *, gridinfo_t *, SuperLUStat_t *,
+                               int_t *, int_t *, int *);
+    extern void LUstructInit(const int_t, LUstruct_t *);
+    extern void LUstructFree(LUstruct_t *);
+    extern void Destroy_LU(int_t, gridinfo_t *, LUstruct_t *);
+    extern void Destroy_Tree(int_t, gridinfo_t *, LUstruct_t *);
+    extern void dscatter_l(int ib, int ljb, int nsupc, int_t iukp, int_t *xsup,
+                           int klst, int nbrow, int_t lptr, int temp_nbrow,
+                           int_t *usub, int_t *lsub, double *tempv,
+                           int *indirect_thread, int *indirect2,
+                           int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr,
+                           gridinfo_t *grid);
+    extern void dscatter_u(int ib, int jb, int nsupc, int_t iukp, int_t *xsup,
+                           int klst, int nbrow, int_t lptr, int temp_nbrow,
+                           int_t *lsub, int_t *usub, double *tempv,
+                           int_t **Ufstnz_br_ptr, double **Unzval_br_ptr,
+                           gridinfo_t *grid);
+    extern int_t pdgstrf(superlu_dist_options_t *, int, int, double,
+                         LUstruct_t *, gridinfo_t *, SuperLUStat_t *, int *);
+
+    /* #define GPU_PROF
 #define IPM_PROF */
 
-/* Solve related */
-extern void pdgstrs_Bglobal(int_t, LUstruct_t *, gridinfo_t *,
-			     double *, int_t, int, SuperLUStat_t *, int *);
-extern void pdgstrs(int_t, LUstruct_t *, ScalePermstruct_t *, gridinfo_t *,
-		    double *, int_t, int_t, int_t, int, SOLVEstruct_t *,
-		    SuperLUStat_t *, int *);
-extern int_t pdReDistribute_B_to_X(double *B, int_t m_loc, int nrhs, int_t ldb,
-				   int_t fst_row, int_t *ilsum, double *x,
-				   ScalePermstruct_t *, Glu_persist_t *,
-				   gridinfo_t *, SOLVEstruct_t *);
-extern void dlsum_fmod(double *, double *, double *, double *,
-		       int, int, int_t , int_t *, int_t, int_t, int_t,
-		       int_t *, gridinfo_t *, LocalLU_t *,
-		       MPI_Request [], SuperLUStat_t *);
-extern void dlsum_bmod(double *, double *, double *,
-                       int, int_t, int_t *, int_t *, Ucb_indptr_t **,
-                       int_t **, int_t *, gridinfo_t *, LocalLU_t *,
-		       MPI_Request [], SuperLUStat_t *);
-
-extern void dlsum_fmod_inv(double *, double *, double *, double *,
-		       int, int_t , int_t *,
-		       int_t *, gridinfo_t *, LocalLU_t *,
-		       SuperLUStat_t **, int_t *, int_t *, int_t, int_t, int_t, int_t, int, int);
-extern void dlsum_fmod_inv_master(double *, double *, double *, double *,
-		       int, int, int_t , int_t *, int_t,
-		       int_t *, gridinfo_t *, LocalLU_t *,
-		       SuperLUStat_t **, int_t, int_t, int_t, int_t, int, int);
-extern void dlsum_bmod_inv(double *, double *, double *, double *,
-                       int, int_t, int_t *, int_t *, Ucb_indptr_t **,
-                       int_t **, int_t *, gridinfo_t *, LocalLU_t *,
-		       SuperLUStat_t **, int_t *, int_t *, int_t, int_t, int, int);
-extern void dlsum_bmod_inv_master(double *, double *, double *, double *,
-                       int, int_t, int_t *, int_t *, Ucb_indptr_t **,
-                       int_t **, int_t *, gridinfo_t *, LocalLU_t *,
-		       SuperLUStat_t **, int_t, int_t, int, int);
-
-extern void pdgsrfs(int_t, SuperMatrix *, double, LUstruct_t *,
-		    ScalePermstruct_t *, gridinfo_t *,
-		    double [], int_t, double [], int_t, int,
-		    SOLVEstruct_t *, double *, SuperLUStat_t *, int *);
-extern void pdgsrfs_ABXglobal(int_t, SuperMatrix *, double, LUstruct_t *,
-		  gridinfo_t *, double *, int_t, double *, int_t,
-		  int, double *, SuperLUStat_t *, int *);
-extern int   pdgsmv_AXglobal_setup(SuperMatrix *, Glu_persist_t *,
-				   gridinfo_t *, int_t *, int_t *[],
-				   double *[], int_t *[], int_t []);
-extern int  pdgsmv_AXglobal(int_t, int_t [], double [], int_t [],
-	                       double [], double []);
-extern int  pdgsmv_AXglobal_abs(int_t, int_t [], double [], int_t [],
-				 double [], double []);
-extern void pdgsmv_init(SuperMatrix *, int_t *, gridinfo_t *,
-			pdgsmv_comm_t *);
-extern void pdgsmv(int_t, SuperMatrix *, gridinfo_t *, pdgsmv_comm_t *,
-		   double x[], double ax[]);
-extern void pdgsmv_finalize(pdgsmv_comm_t *);
-
-/* Memory-related */
-extern double  *doubleMalloc_dist(int_t);
-extern double  *doubleCalloc_dist(int_t);
-extern void  *duser_malloc_dist (int_t, int_t);
-extern void  duser_free_dist (int_t, int_t);
-extern int_t dQuerySpace_dist(int_t, LUstruct_t *, gridinfo_t *,
-			      SuperLUStat_t *, superlu_dist_mem_usage_t *);
-
-/* Auxiliary routines */
-
-extern void dClone_CompRowLoc_Matrix_dist(SuperMatrix *, SuperMatrix *);
-extern void dCopy_CompRowLoc_Matrix_dist(SuperMatrix *, SuperMatrix *);
-extern void dZero_CompRowLoc_Matrix_dist(SuperMatrix *);
-extern void dScaleAddId_CompRowLoc_Matrix_dist(SuperMatrix *, double);
-extern void dScaleAdd_CompRowLoc_Matrix_dist(SuperMatrix *, SuperMatrix *, double);
-extern void dZeroLblocks(int, int_t, gridinfo_t *, LUstruct_t *);
-extern void    dfill_dist (double *, int_t, double);
-extern void    dinf_norm_error_dist (int_t, int_t, double*, int_t,
-                                     double*, int_t, gridinfo_t*);
-extern void    pdinf_norm_error(int, int_t, int_t, double [], int_t,
-				double [], int_t , gridinfo_t *);
-extern void  dreadhb_dist (int, FILE *, int_t *, int_t *, int_t *,
-			   double **, int_t **, int_t **);
-extern void  dreadtriple_dist(FILE *, int_t *, int_t *, int_t *,
-			 double **, int_t **, int_t **);
-extern void  dreadtriple_noheader(FILE *, int_t *, int_t *, int_t *,
-			 double **, int_t **, int_t **);
-extern void  dreadrb_dist(int, FILE *, int_t *, int_t *, int_t *,
-		     double **, int_t **, int_t **);
-extern void  dreadMM_dist(FILE *, int_t *, int_t *, int_t *,
-	                  double **, int_t **, int_t **);
-extern int  dread_binary(FILE *, int_t *, int_t *, int_t *,
-	                  double **, int_t **, int_t **);
-
-/* Distribute the data for numerical factorization */
-extern float ddist_psymbtonum(fact_t, int_t, SuperMatrix *,
-                                ScalePermstruct_t *, Pslu_freeable_t *,
-                                LUstruct_t *, gridinfo_t *);
-extern void pdGetDiagU(int_t, LUstruct_t *, gridinfo_t *, double *);
-
-
-/* Routines for debugging */
-extern void  dPrintLblocks(int, int_t, gridinfo_t *, Glu_persist_t *,
-		 	   LocalLU_t *);
-extern void  dPrintUblocks(int, int_t, gridinfo_t *, Glu_persist_t *,
-			   LocalLU_t *);
-extern void  dPrint_CompCol_Matrix_dist(SuperMatrix *);
-extern void  dPrint_Dense_Matrix_dist(SuperMatrix *);
-extern int   dPrint_CompRowLoc_Matrix_dist(SuperMatrix *);
-extern int   file_dPrint_CompRowLoc_Matrix_dist(FILE *fp, SuperMatrix *A);
-extern int   file_PrintDouble5(FILE *, char *, int_t, double *);
-
-
-/* BLAS */
+    /* Solve related */
+    extern void pdgstrs_Bglobal(int_t, LUstruct_t *, gridinfo_t *,
+                                double *, int_t, int, SuperLUStat_t *, int *);
+    extern void pdgstrs(int_t, LUstruct_t *, ScalePermstruct_t *, gridinfo_t *,
+                        double *, int_t, int_t, int_t, int, SOLVEstruct_t *,
+                        SuperLUStat_t *, int *);
+    extern int_t pdReDistribute_B_to_X(double *B, int_t m_loc, int nrhs, int_t ldb,
+                                       int_t fst_row, int_t *ilsum, double *x,
+                                       ScalePermstruct_t *, Glu_persist_t *,
+                                       gridinfo_t *, SOLVEstruct_t *);
+    extern void dlsum_fmod(double *, double *, double *, double *,
+                           int, int, int_t, int_t *, int_t, int_t, int_t,
+                           int_t *, gridinfo_t *, LocalLU_t *,
+                           MPI_Request[], SuperLUStat_t *);
+    extern void dlsum_bmod(double *, double *, double *,
+                           int, int_t, int_t *, int_t *, Ucb_indptr_t **,
+                           int_t **, int_t *, gridinfo_t *, LocalLU_t *,
+                           MPI_Request[], SuperLUStat_t *);
+
+    extern void dlsum_fmod_inv(double *, double *, double *, double *,
+                               int, int_t, int_t *,
+                               int_t *, gridinfo_t *, LocalLU_t *,
+                               SuperLUStat_t **, int_t *, int_t *, int_t, int_t, int_t, int_t, int, int);
+    extern void dlsum_fmod_inv_master(double *, double *, double *, double *,
+                                      int, int, int_t, int_t *, int_t,
+                                      int_t *, gridinfo_t *, LocalLU_t *,
+                                      SuperLUStat_t **, int_t, int_t, int_t, int_t, int, int);
+    extern void dlsum_bmod_inv(double *, double *, double *, double *,
+                               int, int_t, int_t *, int_t *, Ucb_indptr_t **,
+                               int_t **, int_t *, gridinfo_t *, LocalLU_t *,
+                               SuperLUStat_t **, int_t *, int_t *, int_t, int_t, int, int);
+    extern void dlsum_bmod_inv_master(double *, double *, double *, double *,
+                                      int, int_t, int_t *, int_t *, Ucb_indptr_t **,
+                                      int_t **, int_t *, gridinfo_t *, LocalLU_t *,
+                                      SuperLUStat_t **, int_t, int_t, int, int);
+
+    extern void pdgsrfs(int_t, SuperMatrix *, double, LUstruct_t *,
+                        ScalePermstruct_t *, gridinfo_t *,
+                        double[], int_t, double[], int_t, int,
+                        SOLVEstruct_t *, double *, SuperLUStat_t *, int *);
+    extern void pdgsrfs_ABXglobal(int_t, SuperMatrix *, double, LUstruct_t *,
+                                  gridinfo_t *, double *, int_t, double *, int_t,
+                                  int, double *, SuperLUStat_t *, int *);
+    extern int pdgsmv_AXglobal_setup(SuperMatrix *, Glu_persist_t *,
+                                     gridinfo_t *, int_t *, int_t *[],
+                                     double *[], int_t *[], int_t[]);
+    extern int pdgsmv_AXglobal(int_t, int_t[], double[], int_t[],
+                               double[], double[]);
+    extern int pdgsmv_AXglobal_abs(int_t, int_t[], double[], int_t[],
+                                   double[], double[]);
+    extern void pdgsmv_init(SuperMatrix *, int_t *, gridinfo_t *,
+                            pdgsmv_comm_t *);
+    extern void pdgsmv(int_t, SuperMatrix *, gridinfo_t *, pdgsmv_comm_t *,
+                       double x[], double ax[]);
+    extern void pdgsmv_finalize(pdgsmv_comm_t *);
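The pdgsmv trio follows an init/apply/finalize pattern. A minimal sketch, assuming the second argument of pdgsmv_init is a row-to-process map (called row_to_proc here for illustration) and that a leading flag of 0 requests the plain product A*x rather than |A|*|x|:

#include "superlu_ddefs.h"

void matvec_sketch(SuperMatrix *A, int_t *row_to_proc, gridinfo_t *grid,
                   double x[], double ax[])
{
    pdgsmv_comm_t gsmv_comm;
    pdgsmv_init(A, row_to_proc, grid, &gsmv_comm); /* set up communication once */
    pdgsmv(0, A, grid, &gsmv_comm, x, ax);         /* ax := A*x; flag 0 assumed */
    pdgsmv_finalize(&gsmv_comm);                   /* release comm buffers */
}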
+
+    /* Memory-related */
+    extern double *doubleMalloc_dist(int_t);
+    extern double *doubleCalloc_dist(int_t);
+    extern void *duser_malloc_dist(int_t, int_t);
+    extern void duser_free_dist(int_t, int_t);
+    extern int_t dQuerySpace_dist(int_t, LUstruct_t *, gridinfo_t *,
+                                  SuperLUStat_t *, superlu_dist_mem_usage_t *);
+
+    /* Auxiliary routines */
+
+    extern void dClone_CompRowLoc_Matrix_dist(SuperMatrix *, SuperMatrix *);
+    extern void dCopy_CompRowLoc_Matrix_dist(SuperMatrix *, SuperMatrix *);
+    extern void dZero_CompRowLoc_Matrix_dist(SuperMatrix *);
+    extern void dScaleAddId_CompRowLoc_Matrix_dist(SuperMatrix *, double);
+    extern void dScaleAdd_CompRowLoc_Matrix_dist(SuperMatrix *, SuperMatrix *, double);
+    extern void dZeroLblocks(int, int_t, gridinfo_t *, LUstruct_t *);
+    extern void dfill_dist(double *, int_t, double);
+    extern void dinf_norm_error_dist(int_t, int_t, double *, int_t,
+                                     double *, int_t, gridinfo_t *);
+    extern void pdinf_norm_error(int, int_t, int_t, double[], int_t,
+                                 double[], int_t, gridinfo_t *);
+    extern void dreadhb_dist(int, FILE *, int_t *, int_t *, int_t *,
+                             double **, int_t **, int_t **);
+    extern void dreadtriple_dist(FILE *, int_t *, int_t *, int_t *,
+                                 double **, int_t **, int_t **);
+    extern void dreadtriple_noheader(FILE *, int_t *, int_t *, int_t *,
+                                     double **, int_t **, int_t **);
+    extern void dreadrb_dist(int, FILE *, int_t *, int_t *, int_t *,
+                             double **, int_t **, int_t **);
+    extern void dreadMM_dist(FILE *, int_t *, int_t *, int_t *,
+                             double **, int_t **, int_t **);
+    extern int dread_binary(FILE *, int_t *, int_t *, int_t *,
+                            double **, int_t **, int_t **);
+
+    /* Distribute the data for numerical factorization */
+    extern float ddist_psymbtonum(fact_t, int_t, SuperMatrix *,
+                                  ScalePermstruct_t *, Pslu_freeable_t *,
+                                  LUstruct_t *, gridinfo_t *);
+    extern void pdGetDiagU(int_t, LUstruct_t *, gridinfo_t *, double *);
+
+    /* Routines for debugging */
+    extern void dPrintLblocks(int, int_t, gridinfo_t *, Glu_persist_t *,
+                              LocalLU_t *);
+    extern void dPrintUblocks(int, int_t, gridinfo_t *, Glu_persist_t *,
+                              LocalLU_t *);
+    extern void dPrint_CompCol_Matrix_dist(SuperMatrix *);
+    extern void dPrint_Dense_Matrix_dist(SuperMatrix *);
+    extern int dPrint_CompRowLoc_Matrix_dist(SuperMatrix *);
+    extern int file_dPrint_CompRowLoc_Matrix_dist(FILE *fp, SuperMatrix *A);
+    extern int file_PrintDouble5(FILE *, char *, int_t, double *);
+
+    /* BLAS */
 
 #ifdef USE_VENDOR_BLAS
-extern void dgemm_(const char*, const char*, const int*, const int*, const int*,
-                  const double*, const double*, const int*, const double*,
-                  const int*, const double*, double*, const int*, int, int);
-extern void dtrsv_(char*, char*, char*, int*, double*, int*,
-                  double*, int*, int, int, int);
-extern void dtrsm_(char*, char*, char*, char*, int*, int*,
-                  double*, double*, int*, double*,
-                  int*, int, int, int, int);
-extern void dgemv_(char *, int *, int *, double *, double *a, int *,
-                  double *, int *, double *, double *, int *, int);
-
-extern void dger_(int*, int*, double*, double*, int*,
-                 double*, int*, double*, int*);
+    extern void dgemm_(const char *, const char *, const int *, const int *, const int *,
+                       const double *, const double *, const int *, const double *,
+                       const int *, const double *, double *, const int *, int, int);
+    extern void dtrsv_(char *, char *, char *, int *, double *, int *,
+                       double *, int *, int, int, int);
+    extern void dtrsm_(char *, char *, char *, char *, int *, int *,
+                       double *, double *, int *, double *,
+                       int *, int, int, int, int);
+    extern void dgemv_(char *, int *, int *, double *, double *a, int *,
+                       double *, int *, double *, double *, int *, int);
+
+    extern void dger_(int *, int *, double *, double *, int *,
+                      double *, int *, double *, int *);
 
 #else
-extern int dgemm_(const char*, const char*, const int*, const int*, const int*,
-                   const double*,  const double*,  const int*,  const double*,
-                   const int*,  const double*, double*, const int*);
-extern int dtrsv_(char*, char*, char*, int*, double*, int*,
-                  double*, int*);
-extern int dtrsm_(char*, char*, char*, char*, int*, int*,
-                  double*, double*, int*, double*, int*);
+extern int dgemm_(const char *, const char *, const int *, const int *, const int *,
+                  const double *, const double *, const int *, const double *,
+                  const int *, const double *, double *, const int *);
+extern int dtrsv_(char *, char *, char *, int *, double *, int *,
+                  double *, int *);
+extern int dtrsm_(char *, char *, char *, char *, int *, int *,
+                  double *, double *, int *, double *, int *);
 extern int dgemv_(char *, int *, int *, double *, double *a, int *,
                   double *, int *, double *, double *, int *);
-extern void dger_(int*, int*, double*, double*, int*,
-                 double*, int*, double*, int*);
+extern void dger_(int *, int *, double *, double *, int *,
+                  double *, int *, double *, int *);
 
 #endif
 
-extern int dscal_(int *n, double *da, double *dx, int *incx);
-extern int daxpy_(int *n, double *za, double *zx, 
-	               int *incx, double *zy, int *incy);
-// LAPACK routine
-extern void dtrtri_(char*, char*, int*, double*, int*, int*);
-
-// Superlu blas routines
-extern int superlu_dgemm(const char *transa, const char *transb,
-                  int m, int n, int k, double alpha, double *a, 
-                  int lda, double *b, int ldb, double beta, double *c, int ldc);
-
-extern int superlu_dtrsm(const char *sideRL, const char *uplo,
-                  const char *transa, const char *diag,
-                  const int m, const int n,
-                  const double alpha, const double *a,
-                  const int lda, double *b, const int ldb);
-extern int superlu_dger(const int m, const int n, const double alpha,
-                 const double *x, const int incx, const double *y,
-                 const int incy, double *a, const int lda);
-
-
-/*==== For 3D code ====*/
-
-extern void pdgssvx3d (superlu_dist_options_t *, SuperMatrix *,
-		       ScalePermstruct_t *, double B[], int ldb, int nrhs,
-		       gridinfo3d_t *, LUstruct_t *, SOLVEstruct_t *, 
-		       double *berr, SuperLUStat_t *, int *info);
-extern int_t pdgstrf3d(superlu_dist_options_t *, int m, int n, double anorm,
-		       trf3Dpartition_t*, SCT_t *, LUstruct_t *,
-		       gridinfo3d_t *, SuperLUStat_t *, int *);
-extern void dInit_HyP(HyP_t* HyP, LocalLU_t *Llu, int_t mcb, int_t mrb );
-extern void Free_HyP(HyP_t* HyP);
-extern int updateDirtyBit(int_t k0, HyP_t* HyP, gridinfo_t* grid);
+    extern int dscal_(int *n, double *da, double *dx, int *incx);
+    extern int daxpy_(int *n, double *za, double *zx,
+                      int *incx, double *zy, int *incy);
+    // LAPACK routine
+    extern void dtrtri_(char *, char *, int *, double *, int *, int *);
+
+    // SuperLU BLAS routines
+    extern int superlu_dgemm(const char *transa, const char *transb,
+                             int m, int n, int k, double alpha, double *a,
+                             int lda, double *b, int ldb, double beta, double *c, int ldc);
+
+    extern int superlu_dtrsm(const char *sideRL, const char *uplo,
+                             const char *transa, const char *diag,
+                             const int m, const int n,
+                             const double alpha, const double *a,
+                             const int lda, double *b, const int ldb);
+    extern int superlu_dger(const int m, const int n, const double alpha,
+                            const double *x, const int incx, const double *y,
+                            const int incy, double *a, const int lda);
+
+    extern int superlu_dscal(const int n, const double alpha, double *x, const int incx);
+    extern int superlu_dtrsv(char *uplo, char *trans, char *diag,
+                             int n, double *a, int lda, double *x, int incx);
+    extern int superlu_dgemv(const char *trans, const int m,
+                             const int n, const double alpha, const double *a,
+                             const int lda, const double *x, const int incx,
+                             const double beta, double *y, const int incy);
+    extern int superlu_daxpy(const int n, const double alpha, const double *x, const int incx, double *y, const int incy);
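The superlu_* wrappers route small dense kernels through one interface regardless of the backing BLAS. A minimal usage sketch of superlu_dgemm on made-up 2-by-2 column-major operands:

#include "superlu_ddefs.h"

/* C := 1.0 * A * B + 0.0 * C, all 2-by-2, column-major. */
void wrapper_gemm_demo(void)
{
    double A[4] = {1, 2, 3, 4};
    double B[4] = {5, 6, 7, 8};
    double C[4] = {0, 0, 0, 0};
    superlu_dgemm("N", "N", 2, 2, 2, 1.0, A, 2, B, 2, 0.0, C, 2);
}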
+    /*==== For 3D code ====*/
+
+    extern void pdgssvx3d(superlu_dist_options_t *, SuperMatrix *,
+                          ScalePermstruct_t *, double B[], int ldb, int nrhs,
+                          gridinfo3d_t *, LUstruct_t *, SOLVEstruct_t *,
+                          double *berr, SuperLUStat_t *, int *info);
+    extern int_t pdgstrf3d(superlu_dist_options_t *, int m, int n, double anorm,
+                           trf3Dpartition_t *, SCT_t *, LUstruct_t *,
+                           gridinfo3d_t *, SuperLUStat_t *, int *);
+    extern void dInit_HyP(HyP_t *HyP, LocalLU_t *Llu, int_t mcb, int_t mrb);
+    extern void Free_HyP(HyP_t *HyP);
+    extern int updateDirtyBit(int_t k0, HyP_t *HyP, gridinfo_t *grid);
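A hedged sketch of a minimal pdgssvx3d call sequence, modeled on the 2D driver; matrix, grid, and option setup are assumed done by the caller, and error handling plus full cleanup are abbreviated:

#include "superlu_ddefs.h"

void solve3d_sketch(SuperMatrix *A, double *B, int ldb, int nrhs,
                    gridinfo3d_t *grid3d)
{
    superlu_dist_options_t options;
    ScalePermstruct_t ScalePermstruct;
    LUstruct_t LUstruct;
    SOLVEstruct_t SOLVEstruct;
    SuperLUStat_t stat;
    int info;
    double *berr = doubleMalloc_dist(nrhs); /* one entry per right-hand side */

    set_default_options_dist(&options);
    ScalePermstructInit(A->nrow, A->ncol, &ScalePermstruct);
    LUstructInit(A->ncol, &LUstruct);
    PStatInit(&stat);

    /* Factor A and solve A*X = B on the 3D process grid. */
    pdgssvx3d(&options, A, &ScalePermstruct, B, ldb, nrhs,
              grid3d, &LUstruct, &SOLVEstruct, berr, &stat, &info);

    PStatFree(&stat);
    SUPERLU_FREE(berr);
}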
 
     /* from scatter.h */
-extern void
-dblock_gemm_scatter( int_t lb, int_t j, Ublock_info_t *Ublock_info,
-                    Remain_info_t *Remain_info, double *L_mat, int ldl,
-                    double *U_mat, int ldu,  double *bigV,
-                    // int_t jj0,
-                    int_t knsupc,  int_t klst,
-                    int_t *lsub, int_t *usub, int_t ldt,
-                    int_t thread_id,
-                    int *indirect, int *indirect2,
-                    int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr,
-                    int_t **Ufstnz_br_ptr, double **Unzval_br_ptr,
-                    int_t *xsup, gridinfo_t *, SuperLUStat_t *
+    extern void
+    dblock_gemm_scatter(int_t lb, int_t j, Ublock_info_t *Ublock_info,
+                        Remain_info_t *Remain_info, double *L_mat, int ldl,
+                        double *U_mat, int ldu, double *bigV,
+                        // int_t jj0,
+                        int_t knsupc, int_t klst,
+                        int_t *lsub, int_t *usub, int_t ldt,
+                        int_t thread_id,
+                        int *indirect, int *indirect2,
+                        int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr,
+                        int_t **Ufstnz_br_ptr, double **Unzval_br_ptr,
+                        int_t *xsup, gridinfo_t *, SuperLUStat_t *
 #ifdef SCATTER_PROFILE
-                    , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer
+                        ,
+                        double *Host_TheadScatterMOP, double *Host_TheadScatterTimer
 #endif
-                  );
-
-#ifdef _OPENMP    
-/*this version uses a lock to prevent multiple thread updating the same block*/
-extern void
-dblock_gemm_scatter_lock( int_t lb, int_t j, omp_lock_t* lock,
-                         Ublock_info_t *Ublock_info,  Remain_info_t *Remain_info,
-                         double *L_mat, int_t ldl, double *U_mat, int_t ldu,
-                         double *bigV,
-                         // int_t jj0,
-                         int_t knsupc,  int_t klst,
-                         int_t *lsub, int_t *usub, int_t ldt,
-                         int_t thread_id,
-                         int *indirect, int *indirect2,
-                         int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr,
-                         int_t **Ufstnz_br_ptr, double **Unzval_br_ptr,
-                         int_t *xsup, gridinfo_t *
+    );
+
+#ifdef _OPENMP
+    /* This version uses a lock to prevent multiple threads from updating the same block */
+    extern void
+    dblock_gemm_scatter_lock(int_t lb, int_t j, omp_lock_t *lock,
+                             Ublock_info_t *Ublock_info, Remain_info_t *Remain_info,
+                             double *L_mat, int_t ldl, double *U_mat, int_t ldu,
+                             double *bigV,
+                             // int_t jj0,
+                             int_t knsupc, int_t klst,
+                             int_t *lsub, int_t *usub, int_t ldt,
+                             int_t thread_id,
+                             int *indirect, int *indirect2,
+                             int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr,
+                             int_t **Ufstnz_br_ptr, double **Unzval_br_ptr,
+                             int_t *xsup, gridinfo_t *
 #ifdef SCATTER_PROFILE
-                         , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer
+                             ,
+                             double *Host_TheadScatterMOP, double *Host_TheadScatterTimer
 #endif
-                       );
+    );
 #endif
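A generic sketch of the lock-per-block idea named in the comment above, independent of the SuperLU data structures: each destination block owns an OpenMP lock, and a thread holds it only for the duration of its scatter-add into that block.

#include <omp.h>

void scatter_add_locked(double *blk, const double *contrib, int n,
                        omp_lock_t *lock)
{
    omp_set_lock(lock);            /* exclusive access to this block */
    for (int i = 0; i < n; ++i)
        blk[i] += contrib[i];
    omp_unset_lock(lock);
}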
-    
-extern int_t
-dblock_gemm_scatterTopLeft( int_t lb,  int_t j, double* bigV,
-				 int_t knsupc,  int_t klst, int_t* lsub,
-                                 int_t * usub, int_t ldt,
-				 int* indirect, int* indirect2,
-                                 HyP_t* HyP, LUstruct_t *, gridinfo_t*,
-                                 SCT_t*SCT, SuperLUStat_t *
-                               );
-extern int_t 
-dblock_gemm_scatterTopRight( int_t lb,  int_t j, double* bigV,
-				  int_t knsupc,  int_t klst, int_t* lsub,
-                                  int_t * usub, int_t ldt,
-				  int* indirect, int* indirect2,
-                                  HyP_t* HyP, LUstruct_t *, gridinfo_t*,
-                                  SCT_t*SCT, SuperLUStat_t * );
-extern int_t
-dblock_gemm_scatterBottomLeft( int_t lb,  int_t j, double* bigV,
-				    int_t knsupc,  int_t klst, int_t* lsub,
-                                    int_t * usub, int_t ldt, 
-				    int* indirect, int* indirect2,
-                                    HyP_t* HyP, LUstruct_t *, gridinfo_t*,
-                                    SCT_t*SCT, SuperLUStat_t * );
-extern int_t 
-dblock_gemm_scatterBottomRight( int_t lb,  int_t j, double* bigV,
-				     int_t knsupc,  int_t klst, int_t* lsub,
-                                     int_t * usub, int_t ldt,
-				     int* indirect, int* indirect2,
-                                     HyP_t* HyP, LUstruct_t *, gridinfo_t*,
-                                     SCT_t*SCT, SuperLUStat_t * );
+
+    extern int_t
+    dblock_gemm_scatterTopLeft(int_t lb, int_t j, double *bigV,
+                               int_t knsupc, int_t klst, int_t *lsub,
+                               int_t *usub, int_t ldt,
+                               int *indirect, int *indirect2,
+                               HyP_t *HyP, LUstruct_t *, gridinfo_t *,
+                               SCT_t *SCT, SuperLUStat_t *);
+    extern int_t
+    dblock_gemm_scatterTopRight(int_t lb, int_t j, double *bigV,
+                                int_t knsupc, int_t klst, int_t *lsub,
+                                int_t *usub, int_t ldt,
+                                int *indirect, int *indirect2,
+                                HyP_t *HyP, LUstruct_t *, gridinfo_t *,
+                                SCT_t *SCT, SuperLUStat_t *);
+    extern int_t
+    dblock_gemm_scatterBottomLeft(int_t lb, int_t j, double *bigV,
+                                  int_t knsupc, int_t klst, int_t *lsub,
+                                  int_t *usub, int_t ldt,
+                                  int *indirect, int *indirect2,
+                                  HyP_t *HyP, LUstruct_t *, gridinfo_t *,
+                                  SCT_t *SCT, SuperLUStat_t *);
+    extern int_t
+    dblock_gemm_scatterBottomRight(int_t lb, int_t j, double *bigV,
+                                   int_t knsupc, int_t klst, int_t *lsub,
+                                   int_t *usub, int_t ldt,
+                                   int *indirect, int *indirect2,
+                                   HyP_t *HyP, LUstruct_t *, gridinfo_t *,
+                                   SCT_t *SCT, SuperLUStat_t *);
 
     /* from gather.h */
-extern void dgather_u(int_t num_u_blks,
-              Ublock_info_t *Ublock_info, int_t * usub,
-              double *uval,  double *bigU,  int_t ldu,
-              int_t *xsup, int_t klst                /* for SuperSize */
-             );
-
-extern void dgather_l( int_t num_LBlk, int_t knsupc,
-               Remain_info_t *L_info,
-               double * lval, int_t LD_lval,
-               double * L_buff );
-
-extern void dRgather_L(int_t k, int_t *lsub, double *lusup, gEtreeInfo_t*,
-		      Glu_persist_t *, gridinfo_t *, HyP_t *,
-		      int_t *myIperm, int_t *iperm_c_supno );
-extern void dRgather_U(int_t k, int_t jj0, int_t *usub, double *uval,
-		      double *bigU, gEtreeInfo_t*, Glu_persist_t *,
-		      gridinfo_t *, HyP_t *, int_t *myIperm,
-		      int_t *iperm_c_supno, int_t *perm_u);
+    extern void dgather_u(int_t num_u_blks,
+                          Ublock_info_t *Ublock_info, int_t *usub,
+                          double *uval, double *bigU, int_t ldu,
+                          int_t *xsup, int_t klst /* for SuperSize */
+    );
+
+    extern void dgather_l(int_t num_LBlk, int_t knsupc,
+                          Remain_info_t *L_info,
+                          double *lval, int_t LD_lval,
+                          double *L_buff);
+
+    extern void dRgather_L(int_t k, int_t *lsub, double *lusup, gEtreeInfo_t *,
+                           Glu_persist_t *, gridinfo_t *, HyP_t *,
+                           int_t *myIperm, int_t *iperm_c_supno);
+    extern void dRgather_U(int_t k, int_t jj0, int_t *usub, double *uval,
+                           double *bigU, gEtreeInfo_t *, Glu_persist_t *,
+                           gridinfo_t *, HyP_t *, int_t *myIperm,
+                           int_t *iperm_c_supno, int_t *perm_u);
 
     /* from xtrf3Dpartition.h */
-extern trf3Dpartition_t* dinitTrf3Dpartition(int_t nsupers,
-					     superlu_dist_options_t *options,
-					     LUstruct_t *LUstruct, gridinfo3d_t * grid3d);
-extern void dDestroy_trf3Dpartition(trf3Dpartition_t *trf3Dpartition, gridinfo3d_t *grid3d);
+    extern trf3Dpartition_t *dinitTrf3Dpartition(int_t nsupers,
+                                                 superlu_dist_options_t *options,
+                                                 LUstruct_t *LUstruct, gridinfo3d_t *grid3d);
+    extern void dDestroy_trf3Dpartition(trf3Dpartition_t *trf3Dpartition, gridinfo3d_t *grid3d);
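A minimal sketch of the partition lifecycle implied by these two prototypes; nsupers, options, LUstruct, and grid3d are assumed to come from the usual setup:

#include "superlu_ddefs.h"

/* Build the 3D tree partition, hand it to the 3D factorization,
 * then release it. */
void partition_lifecycle_sketch(int_t nsupers, superlu_dist_options_t *options,
                                LUstruct_t *LUstruct, gridinfo3d_t *grid3d)
{
    trf3Dpartition_t *part =
        dinitTrf3Dpartition(nsupers, options, LUstruct, grid3d);
    /* ... pdgstrf3d() would consume `part` here ... */
    dDestroy_trf3Dpartition(part, grid3d);
}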
 
-extern void d3D_printMemUse(trf3Dpartition_t*  trf3Dpartition,
-			    LUstruct_t *LUstruct, gridinfo3d_t * grid3d);
+    extern void d3D_printMemUse(trf3Dpartition_t *trf3Dpartition,
+                                LUstruct_t *LUstruct, gridinfo3d_t *grid3d);
 
-extern int* getLastDep(gridinfo_t *grid, SuperLUStat_t *stat,
-		       superlu_dist_options_t *options, LocalLU_t *Llu,
-		       int_t* xsup, int_t num_look_aheads, int_t nsupers,
-		       int_t * iperm_c_supno);
+    extern int *getLastDep(gridinfo_t *grid, SuperLUStat_t *stat,
+                           superlu_dist_options_t *options, LocalLU_t *Llu,
+                           int_t *xsup, int_t num_look_aheads, int_t nsupers,
+                           int_t *iperm_c_supno);
 
-extern void dinit3DLUstructForest( int_t* myTreeIdxs, int_t* myZeroTrIdxs,
-				  sForest_t**  sForests, LUstruct_t* LUstruct,
-				  gridinfo3d_t* grid3d);
+    extern void dinit3DLUstructForest(int_t *myTreeIdxs, int_t *myZeroTrIdxs,
+                                      sForest_t **sForests, LUstruct_t *LUstruct,
+                                      gridinfo3d_t *grid3d);
 
-extern int_t dgatherAllFactoredLUFr(int_t* myZeroTrIdxs, sForest_t* sForests,
-				   LUstruct_t* LUstruct, gridinfo3d_t* grid3d,
-				   SCT_t* SCT );
+    extern int_t dgatherAllFactoredLUFr(int_t *myZeroTrIdxs, sForest_t *sForests,
+                                        LUstruct_t *LUstruct, gridinfo3d_t *grid3d,
+                                        SCT_t *SCT);
 
     /* The following are from pdgstrf2.h */
 #if 0 // Sherry: same routine names, but different code !!!!!!!
@@ -719,341 +720,333 @@ void pdgstrs2_omp (int_t, int_t, int_t, int_t *, double*, Glu_persist_t *, gridi
                       LocalLU_t *, SuperLUStat_t *, Ublock_info_t *, double *bigV, int_t ldt, SCT_t *SCT );
 #endif
 
-#else 
-extern void pdgstrf2_trsm(superlu_dist_options_t * options, int_t k0, int_t k,
-			  double thresh, Glu_persist_t *, gridinfo_t *,
-			  LocalLU_t *, MPI_Request *, int tag_ub,
-			  SuperLUStat_t *, int *info);
+#else
+extern void pdgstrf2_trsm(superlu_dist_options_t *options, int_t k0, int_t k,
+                          double thresh, Glu_persist_t *, gridinfo_t *,
+                          LocalLU_t *, MPI_Request *, int tag_ub,
+                          SuperLUStat_t *, int *info);
 extern void pdgstrs2_omp(int_t k0, int_t k, Glu_persist_t *, gridinfo_t *,
-			 LocalLU_t *, Ublock_info_t *, SuperLUStat_t *);
+                         LocalLU_t *, Ublock_info_t *, SuperLUStat_t *);
 #endif // same routine names   !!!!!!!!
 
-extern int_t dLpanelUpdate(int_t off0, int_t nsupc, double* ublk_ptr,
-			  int_t ld_ujrow, double* lusup, int_t nsupr, SCT_t*);
-extern void Local_Dgstrf2(superlu_dist_options_t *options, int_t k,
-			  double thresh, double *BlockUFactor, Glu_persist_t *,
-			  gridinfo_t *, LocalLU_t *,
-                          SuperLUStat_t *, int *info, SCT_t*);
-extern int_t dTrs2_GatherU(int_t iukp, int_t rukp, int_t klst,
-			  int_t nsupc, int_t ldu, int_t *usub,
-			  double* uval, double *tempv);
-extern int_t dTrs2_ScatterU(int_t iukp, int_t rukp, int_t klst,
-			   int_t nsupc, int_t ldu, int_t *usub,
-			   double* uval, double *tempv);
-extern int_t dTrs2_GatherTrsmScatter(int_t klst, int_t iukp, int_t rukp,
-                             int_t *usub, double* uval, double *tempv,
-                             int_t knsupc, int nsupr, double* lusup,
-                             Glu_persist_t *Glu_persist)  ;
-extern void pdgstrs2
+    extern int_t dLpanelUpdate(int_t off0, int_t nsupc, double *ublk_ptr,
+                               int_t ld_ujrow, double *lusup, int_t nsupr, SCT_t *);
+    extern void Local_Dgstrf2(superlu_dist_options_t *options, int_t k,
+                              double thresh, double *BlockUFactor, Glu_persist_t *,
+                              gridinfo_t *, LocalLU_t *,
+                              SuperLUStat_t *, int *info, SCT_t *);
+    extern int_t dTrs2_GatherU(int_t iukp, int_t rukp, int_t klst,
+                               int_t nsupc, int_t ldu, int_t *usub,
+                               double *uval, double *tempv);
+    extern int_t dTrs2_ScatterU(int_t iukp, int_t rukp, int_t klst,
+                                int_t nsupc, int_t ldu, int_t *usub,
+                                double *uval, double *tempv);
+    extern int_t dTrs2_GatherTrsmScatter(int_t klst, int_t iukp, int_t rukp,
+                                         int_t *usub, double *uval, double *tempv,
+                                         int_t knsupc, int nsupr, double *lusup,
+                                         Glu_persist_t *Glu_persist);
+    extern void pdgstrs2
 #ifdef _CRAY
-(
-    int_t m, int_t k0, int_t k, Glu_persist_t *Glu_persist, gridinfo_t *grid,
-    LocalLU_t *Llu, SuperLUStat_t *stat, _fcd ftcs1, _fcd ftcs2, _fcd ftcs3
-);
+        (
+            int_t m, int_t k0, int_t k, Glu_persist_t *Glu_persist, gridinfo_t *grid,
+            LocalLU_t *Llu, SuperLUStat_t *stat, _fcd ftcs1, _fcd ftcs2, _fcd ftcs3);
 #else
-(
-    int_t m, int_t k0, int_t k, Glu_persist_t *Glu_persist, gridinfo_t *grid,
-    LocalLU_t *Llu, SuperLUStat_t *stat
-);
+    (
+        int_t m, int_t k0, int_t k, Glu_persist_t *Glu_persist, gridinfo_t *grid,
+        LocalLU_t *Llu, SuperLUStat_t *stat);
 #endif
 
-extern void pdgstrf2(superlu_dist_options_t *, int_t nsupers, int_t k0,
-		     int_t k, double thresh, Glu_persist_t *, gridinfo_t *,
-		     LocalLU_t *, MPI_Request *, int, SuperLUStat_t *, int *);
+    extern void pdgstrf2(superlu_dist_options_t *, int_t nsupers, int_t k0,
+                         int_t k, double thresh, Glu_persist_t *, gridinfo_t *,
+                         LocalLU_t *, MPI_Request *, int, SuperLUStat_t *, int *);
 
     /* from p3dcomm.h */
-extern int_t dAllocLlu_3d(int_t nsupers, LUstruct_t * LUstruct, gridinfo3d_t* grid3d);
-extern int_t dp3dScatter(int_t n, LUstruct_t * LUstruct, gridinfo3d_t* grid3d);
-extern int_t dscatter3dLPanels(int_t nsupers,
-                       LUstruct_t * LUstruct, gridinfo3d_t* grid3d);
-extern int_t dscatter3dUPanels(int_t nsupers,
-                       LUstruct_t * LUstruct, gridinfo3d_t* grid3d);
-extern int_t dcollect3dLpanels(int_t layer, int_t nsupers, LUstruct_t * LUstruct, gridinfo3d_t* grid3d);
-extern int_t dcollect3dUpanels(int_t layer, int_t nsupers, LUstruct_t * LUstruct, gridinfo3d_t* grid3d);
-extern int_t dp3dCollect(int_t layer, int_t n, LUstruct_t * LUstruct, gridinfo3d_t* grid3d);
-/*zero out LU non zero entries*/
-extern int_t dzeroSetLU(int_t nnodes, int_t* nodeList , LUstruct_t *, gridinfo3d_t*);
-extern int AllocGlu_3d(int_t n, int_t nsupers, LUstruct_t *);
-extern int DeAllocLlu_3d(int_t n, LUstruct_t *, gridinfo3d_t*);
-extern int DeAllocGlu_3d(LUstruct_t *);
-
-/* Reduces L and U panels of nodes in the List nodeList (size=nnnodes)
+    extern int_t dAllocLlu_3d(int_t nsupers, LUstruct_t *LUstruct, gridinfo3d_t *grid3d);
+    extern int_t dp3dScatter(int_t n, LUstruct_t *LUstruct, gridinfo3d_t *grid3d);
+    extern int_t dscatter3dLPanels(int_t nsupers,
+                                   LUstruct_t *LUstruct, gridinfo3d_t *grid3d);
+    extern int_t dscatter3dUPanels(int_t nsupers,
+                                   LUstruct_t *LUstruct, gridinfo3d_t *grid3d);
+    extern int_t dcollect3dLpanels(int_t layer, int_t nsupers, LUstruct_t *LUstruct, gridinfo3d_t *grid3d);
+    extern int_t dcollect3dUpanels(int_t layer, int_t nsupers, LUstruct_t *LUstruct, gridinfo3d_t *grid3d);
+    extern int_t dp3dCollect(int_t layer, int_t n, LUstruct_t *LUstruct, gridinfo3d_t *grid3d);
+    /* Zero out LU nonzero entries */
+    extern int_t dzeroSetLU(int_t nnodes, int_t *nodeList, LUstruct_t *, gridinfo3d_t *);
+    extern int AllocGlu_3d(int_t n, int_t nsupers, LUstruct_t *);
+    extern int DeAllocLlu_3d(int_t n, LUstruct_t *, gridinfo3d_t *);
+    extern int DeAllocGlu_3d(LUstruct_t *);
+
+    /* Reduces L and U panels of nodes in the list nodeList (size = nnodes):
 receiver[L(nodeList)] = sender[L(nodeList)] + receiver[L(nodeList)]
 receiver[U(nodeList)] = sender[U(nodeList)] + receiver[U(nodeList)]
 */
-int_t dreduceAncestors3d(int_t sender, int_t receiver,
-                        int_t nnodes, int_t* nodeList,
-                        double* Lval_buf, double* Uval_buf,
-                        LUstruct_t* LUstruct,  gridinfo3d_t* grid3d, SCT_t* SCT);
-/*reduces all nodelists required in a level*/
-int_t dreduceAllAncestors3d(int_t ilvl, int_t* myNodeCount,
-                           int_t** treePerm,
-                           dLUValSubBuf_t* LUvsb,
-                           LUstruct_t* LUstruct,
-                           gridinfo3d_t* grid3d,
-                           SCT_t* SCT );
-/*
+    int_t dreduceAncestors3d(int_t sender, int_t receiver,
+                             int_t nnodes, int_t *nodeList,
+                             double *Lval_buf, double *Uval_buf,
+                             LUstruct_t *LUstruct, gridinfo3d_t *grid3d, SCT_t *SCT);
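The reduction documented above is an elementwise add over each panel's nonzeros. In scalar form, for one value buffer of nnz entries (a sketch, not library code):

#include "superlu_defs.h"

/* receiver[i] += sender[i] over a panel's nonzero slots. */
static void reduce_panel_sketch(double *recv_val, const double *send_val,
                                int_t nnz)
{
    for (int_t i = 0; i < nnz; ++i)
        recv_val[i] += send_val[i];
}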
+    /* Reduces all node lists required in a level */
+    int_t dreduceAllAncestors3d(int_t ilvl, int_t *myNodeCount,
+                                int_t **treePerm,
+                                dLUValSubBuf_t *LUvsb,
+                                LUstruct_t *LUstruct,
+                                gridinfo3d_t *grid3d,
+                                SCT_t *SCT);
+    /*
 	Copies factored L and U panels from sender grid to receiver grid
 	receiver[L(nodelist)] <-- sender[L(nodelist)];
 	receiver[U(nodelist)] <-- sender[U(nodelist)];
 */
-int_t dgatherFactoredLU(int_t sender, int_t receiver,
-                       int_t nnodes, int_t *nodeList, dLUValSubBuf_t*  LUvsb,
-                       LUstruct_t* LUstruct, gridinfo3d_t* grid3d,SCT_t* SCT );
+    int_t dgatherFactoredLU(int_t sender, int_t receiver,
+                            int_t nnodes, int_t *nodeList, dLUValSubBuf_t *LUvsb,
+                            LUstruct_t *LUstruct, gridinfo3d_t *grid3d, SCT_t *SCT);
 
-/*Gathers all the L and U factors to grid 0 for solve stage 
+    /* Gathers all the L and U factors to grid 0 for the solve stage
 	by repeatedly calling the above function */
-int_t dgatherAllFactoredLU(trf3Dpartition_t*  trf3Dpartition, LUstruct_t* LUstruct,
-			   gridinfo3d_t* grid3d, SCT_t* SCT );
+    int_t dgatherAllFactoredLU(trf3Dpartition_t *trf3Dpartition, LUstruct_t *LUstruct,
+                               gridinfo3d_t *grid3d, SCT_t *SCT);
 
-/*Distributes data in each layer and initilizes ancestors
+    /* Distributes data in each layer and initializes ancestors
  as zero in the required nodes */
-int_t dinit3DLUstruct( int_t* myTreeIdxs, int_t* myZeroTrIdxs,
-                      int_t* nodeCount, int_t** nodeList,
-                      LUstruct_t* LUstruct, gridinfo3d_t* grid3d);
-
-int_t dzSendLPanel(int_t k, int_t receiver,
-		   LUstruct_t* LUstruct,  gridinfo3d_t* grid3d, SCT_t* SCT);
-int_t dzRecvLPanel(int_t k, int_t sender, double alpha, 
-                   double beta, double* Lval_buf,
-		   LUstruct_t* LUstruct,  gridinfo3d_t* grid3d, SCT_t* SCT);
-int_t dzSendUPanel(int_t k, int_t receiver,
-		   LUstruct_t* LUstruct,  gridinfo3d_t* grid3d, SCT_t* SCT);
-int_t dzRecvUPanel(int_t k, int_t sender, double alpha,
-                   double beta, double* Uval_buf,
-		   LUstruct_t* LUstruct,  gridinfo3d_t* grid3d, SCT_t* SCT);
+    int_t dinit3DLUstruct(int_t *myTreeIdxs, int_t *myZeroTrIdxs,
+                          int_t *nodeCount, int_t **nodeList,
+                          LUstruct_t *LUstruct, gridinfo3d_t *grid3d);
+
+    int_t dzSendLPanel(int_t k, int_t receiver,
+                       LUstruct_t *LUstruct, gridinfo3d_t *grid3d, SCT_t *SCT);
+    int_t dzRecvLPanel(int_t k, int_t sender, double alpha,
+                       double beta, double *Lval_buf,
+                       LUstruct_t *LUstruct, gridinfo3d_t *grid3d, SCT_t *SCT);
+    int_t dzSendUPanel(int_t k, int_t receiver,
+                       LUstruct_t *LUstruct, gridinfo3d_t *grid3d, SCT_t *SCT);
+    int_t dzRecvUPanel(int_t k, int_t sender, double alpha,
+                       double beta, double *Uval_buf,
+                       LUstruct_t *LUstruct, gridinfo3d_t *grid3d, SCT_t *SCT);
 
     /* from communication_aux.h */
-extern int_t dIBcast_LPanel (int_t k, int_t k0, int_t* lsub, double* lusup,
-			     gridinfo_t *, int* msgcnt, MPI_Request *,
-			     int **ToSendR, int_t *xsup, int );
-extern int_t dBcast_LPanel(int_t k, int_t k0, int_t* lsub, double* lusup,
-			   gridinfo_t *, int* msgcnt, int **ToSendR,
-			   int_t *xsup , SCT_t*, int);
-extern int_t dIBcast_UPanel(int_t k, int_t k0, int_t* usub, double* uval,
-			    gridinfo_t *, int* msgcnt, MPI_Request *,
-			    int *ToSendD, int );
-extern int_t dBcast_UPanel(int_t k, int_t k0, int_t* usub, double* uval,
-			   gridinfo_t *, int* msgcnt, int *ToSendD, SCT_t*, int);
-extern int_t dIrecv_LPanel (int_t k, int_t k0,  int_t* Lsub_buf, 
-			    double* Lval_buf, gridinfo_t *,
-			    MPI_Request *, LocalLU_t *, int);
-extern int_t dIrecv_UPanel(int_t k, int_t k0, int_t* Usub_buf, double*,
-			   LocalLU_t *, gridinfo_t*, MPI_Request *, int);
-extern int_t Wait_LSend(int_t k, gridinfo_t *grid, int **ToSendR,
-			MPI_Request *s, SCT_t*);
-extern int_t Wait_USend(MPI_Request *, gridinfo_t *, SCT_t *);
-extern int_t dWait_URecv(MPI_Request *, int* msgcnt, SCT_t *);
-extern int_t Check_LRecv(MPI_Request*, int* msgcnt);
-extern int_t dWait_LRecv(MPI_Request*, int* msgcnt, int* msgcntsU,
-			 gridinfo_t *, SCT_t*);
-extern int_t dISend_UDiagBlock(int_t k0, double *ublk_ptr, int_t size,
-			       MPI_Request *, gridinfo_t *, int);
-extern int_t dRecv_UDiagBlock(int_t k0, double *ublk_ptr, int_t size,
-			      int_t src, gridinfo_t *, SCT_t*, int);
-extern int_t Wait_UDiagBlockSend(MPI_Request *, gridinfo_t *, SCT_t *);
-extern int_t Wait_LDiagBlockSend(MPI_Request *, gridinfo_t *, SCT_t *);
-extern int_t dPackLBlock(int_t k, double* Dest, Glu_persist_t *,
-			 gridinfo_t *, LocalLU_t *);
-extern int_t dISend_LDiagBlock(int_t k0, double *lblk_ptr, int_t size,
-			       MPI_Request *, gridinfo_t *, int);
-extern int_t dIRecv_UDiagBlock(int_t k0, double *ublk_ptr, int_t size,
-			       int_t src, MPI_Request *, gridinfo_t *,
-			       SCT_t*, int);
-extern int_t Wait_UDiagBlock_Recv(MPI_Request *, SCT_t *);
-extern int_t Test_UDiagBlock_Recv(MPI_Request *, SCT_t *);
-extern int_t dIRecv_LDiagBlock(int_t k0, double *L_blk_ptr, int_t size,
-			       int_t src, MPI_Request *, gridinfo_t*, SCT_t*, int);
-extern int_t Wait_LDiagBlock_Recv(MPI_Request *, SCT_t *);
-extern int_t Test_LDiagBlock_Recv(MPI_Request *, SCT_t *);
-
-extern int_t dUDiagBlockRecvWait( int_t k,  int_t* IrecvPlcd_D, int_t* factored_L,
-				  MPI_Request *, gridinfo_t *, LUstruct_t *, SCT_t *);
-extern int_t LDiagBlockRecvWait( int_t k, int_t* factored_U, MPI_Request *, gridinfo_t *);
-#if (MPI_VERSION>2)
-extern int_t dIBcast_UDiagBlock(int_t k, double *ublk_ptr, int_t size,
-				MPI_Request *, gridinfo_t *);
-extern int_t dIBcast_LDiagBlock(int_t k, double *lblk_ptr, int_t size,
-			       MPI_Request *, gridinfo_t *);
+    extern int_t dIBcast_LPanel(int_t k, int_t k0, int_t *lsub, double *lusup,
+                                gridinfo_t *, int *msgcnt, MPI_Request *,
+                                int **ToSendR, int_t *xsup, int);
+    extern int_t dBcast_LPanel(int_t k, int_t k0, int_t *lsub, double *lusup,
+                               gridinfo_t *, int *msgcnt, int **ToSendR,
+                               int_t *xsup, SCT_t *, int);
+    extern int_t dIBcast_UPanel(int_t k, int_t k0, int_t *usub, double *uval,
+                                gridinfo_t *, int *msgcnt, MPI_Request *,
+                                int *ToSendD, int);
+    extern int_t dBcast_UPanel(int_t k, int_t k0, int_t *usub, double *uval,
+                               gridinfo_t *, int *msgcnt, int *ToSendD, SCT_t *, int);
+    extern int_t dIrecv_LPanel(int_t k, int_t k0, int_t *Lsub_buf,
+                               double *Lval_buf, gridinfo_t *,
+                               MPI_Request *, LocalLU_t *, int);
+    extern int_t dIrecv_UPanel(int_t k, int_t k0, int_t *Usub_buf, double *,
+                               LocalLU_t *, gridinfo_t *, MPI_Request *, int);
+    extern int_t Wait_LSend(int_t k, gridinfo_t *grid, int **ToSendR,
+                            MPI_Request *s, SCT_t *);
+    extern int_t Wait_USend(MPI_Request *, gridinfo_t *, SCT_t *);
+    extern int_t dWait_URecv(MPI_Request *, int *msgcnt, SCT_t *);
+    extern int_t Check_LRecv(MPI_Request *, int *msgcnt);
+    extern int_t dWait_LRecv(MPI_Request *, int *msgcnt, int *msgcntsU,
+                             gridinfo_t *, SCT_t *);
+    extern int_t dISend_UDiagBlock(int_t k0, double *ublk_ptr, int_t size,
+                                   MPI_Request *, gridinfo_t *, int);
+    extern int_t dRecv_UDiagBlock(int_t k0, double *ublk_ptr, int_t size,
+                                  int_t src, gridinfo_t *, SCT_t *, int);
+    extern int_t Wait_UDiagBlockSend(MPI_Request *, gridinfo_t *, SCT_t *);
+    extern int_t Wait_LDiagBlockSend(MPI_Request *, gridinfo_t *, SCT_t *);
+    extern int_t dPackLBlock(int_t k, double *Dest, Glu_persist_t *,
+                             gridinfo_t *, LocalLU_t *);
+    extern int_t dISend_LDiagBlock(int_t k0, double *lblk_ptr, int_t size,
+                                   MPI_Request *, gridinfo_t *, int);
+    extern int_t dIRecv_UDiagBlock(int_t k0, double *ublk_ptr, int_t size,
+                                   int_t src, MPI_Request *, gridinfo_t *,
+                                   SCT_t *, int);
+    extern int_t Wait_UDiagBlock_Recv(MPI_Request *, SCT_t *);
+    extern int_t Test_UDiagBlock_Recv(MPI_Request *, SCT_t *);
+    extern int_t dIRecv_LDiagBlock(int_t k0, double *L_blk_ptr, int_t size,
+                                   int_t src, MPI_Request *, gridinfo_t *, SCT_t *, int);
+    extern int_t Wait_LDiagBlock_Recv(MPI_Request *, SCT_t *);
+    extern int_t Test_LDiagBlock_Recv(MPI_Request *, SCT_t *);
+
+    extern int_t dUDiagBlockRecvWait(int_t k, int_t *IrecvPlcd_D, int_t *factored_L,
+                                     MPI_Request *, gridinfo_t *, LUstruct_t *, SCT_t *);
+    extern int_t LDiagBlockRecvWait(int_t k, int_t *factored_U, MPI_Request *, gridinfo_t *);
+#if (MPI_VERSION > 2)
+    extern int_t dIBcast_UDiagBlock(int_t k, double *ublk_ptr, int_t size,
+                                    MPI_Request *, gridinfo_t *);
+    extern int_t dIBcast_LDiagBlock(int_t k, double *lblk_ptr, int_t size,
+                                    MPI_Request *, gridinfo_t *);
 #endif
 
     /* from trfCommWrapper.h */
-extern int_t dDiagFactIBCast(int_t k,  int_t k0,
-			     double *BlockUFactor, double *BlockLFactor,
-			     int_t* IrecvPlcd_D, MPI_Request *, MPI_Request *,
-			     MPI_Request *, MPI_Request *, gridinfo_t *,
-			     superlu_dist_options_t *, double thresh,
-			     LUstruct_t *LUstruct, SuperLUStat_t *, int *info,
-			     SCT_t *, int tag_ub);
-extern int_t dUPanelTrSolve( int_t k, double* BlockLFactor, double* bigV,
-			     int_t ldt, Ublock_info_t*, gridinfo_t *,
-			     LUstruct_t *, SuperLUStat_t *, SCT_t *);
-extern int_t Wait_LUDiagSend(int_t k, MPI_Request *, MPI_Request *,
-			     gridinfo_t *, SCT_t *);
-extern int_t dLPanelUpdate(int_t k,  int_t* IrecvPlcd_D, int_t* factored_L,
-			   MPI_Request *, double* BlockUFactor, gridinfo_t *,
-			   LUstruct_t *, SCT_t *);
-extern int_t dUPanelUpdate(int_t k, int_t* factored_U, MPI_Request *,
-			   double* BlockLFactor, double* bigV,
-			   int_t ldt, Ublock_info_t*, gridinfo_t *,
-			   LUstruct_t *, SuperLUStat_t *, SCT_t *);
-extern int_t dIBcastRecvLPanel(int_t k, int_t k0, int* msgcnt,
-			       MPI_Request *, MPI_Request *,
-			       int_t* Lsub_buf, double* Lval_buf,
-			      int_t * factored, gridinfo_t *, LUstruct_t *,
-			      SCT_t *, int tag_ub);
-extern int_t dIBcastRecvUPanel(int_t k, int_t k0, int* msgcnt, MPI_Request *,
-			       MPI_Request *, int_t* Usub_buf, double* Uval_buf,
-			       gridinfo_t *, LUstruct_t *, SCT_t *, int tag_ub);
-extern int_t dWaitL(int_t k, int* msgcnt, int* msgcntU, MPI_Request *,
-		    MPI_Request *, gridinfo_t *, LUstruct_t *, SCT_t *);
-extern int_t dWaitU(int_t k, int* msgcnt, MPI_Request *, MPI_Request *,
-		   gridinfo_t *, LUstruct_t *, SCT_t *);
-extern int_t dLPanelTrSolve(int_t k, int_t* factored_L, double* BlockUFactor,
-			    gridinfo_t *, LUstruct_t *);
+    extern int_t dDiagFactIBCast(int_t k, int_t k0,
+                                 double *BlockUFactor, double *BlockLFactor,
+                                 int_t *IrecvPlcd_D, MPI_Request *, MPI_Request *,
+                                 MPI_Request *, MPI_Request *, gridinfo_t *,
+                                 superlu_dist_options_t *, double thresh,
+                                 LUstruct_t *LUstruct, SuperLUStat_t *, int *info,
+                                 SCT_t *, int tag_ub);
+    extern int_t dUPanelTrSolve(int_t k, double *BlockLFactor, double *bigV,
+                                int_t ldt, Ublock_info_t *, gridinfo_t *,
+                                LUstruct_t *, SuperLUStat_t *, SCT_t *);
+    extern int_t Wait_LUDiagSend(int_t k, MPI_Request *, MPI_Request *,
+                                 gridinfo_t *, SCT_t *);
+    extern int_t dLPanelUpdate(int_t k, int_t *IrecvPlcd_D, int_t *factored_L,
+                               MPI_Request *, double *BlockUFactor, gridinfo_t *,
+                               LUstruct_t *, SCT_t *);
+    extern int_t dUPanelUpdate(int_t k, int_t *factored_U, MPI_Request *,
+                               double *BlockLFactor, double *bigV,
+                               int_t ldt, Ublock_info_t *, gridinfo_t *,
+                               LUstruct_t *, SuperLUStat_t *, SCT_t *);
+    extern int_t dIBcastRecvLPanel(int_t k, int_t k0, int *msgcnt,
+                                   MPI_Request *, MPI_Request *,
+                                   int_t *Lsub_buf, double *Lval_buf,
+                                   int_t *factored, gridinfo_t *, LUstruct_t *,
+                                   SCT_t *, int tag_ub);
+    extern int_t dIBcastRecvUPanel(int_t k, int_t k0, int *msgcnt, MPI_Request *,
+                                   MPI_Request *, int_t *Usub_buf, double *Uval_buf,
+                                   gridinfo_t *, LUstruct_t *, SCT_t *, int tag_ub);
+    extern int_t dWaitL(int_t k, int *msgcnt, int *msgcntU, MPI_Request *,
+                        MPI_Request *, gridinfo_t *, LUstruct_t *, SCT_t *);
+    extern int_t dWaitU(int_t k, int *msgcnt, MPI_Request *, MPI_Request *,
+                        gridinfo_t *, LUstruct_t *, SCT_t *);
+    extern int_t dLPanelTrSolve(int_t k, int_t *factored_L, double *BlockUFactor,
+                                gridinfo_t *, LUstruct_t *);
 
     /* from trfAux.h */
-extern int_t getNsupers(int, LUstruct_t *);
-extern int_t initPackLUInfo(int_t nsupers, packLUInfo_t* packLUInfo);
-extern int   freePackLUInfo(packLUInfo_t* packLUInfo);
-extern int_t dSchurComplementSetup(int_t k, int *msgcnt, Ublock_info_t*,
-				   Remain_info_t*, uPanelInfo_t *,
-				   lPanelInfo_t *, int_t*, int_t *, int_t *,
-				   double *bigU, int_t* Lsub_buf,
-				   double* Lval_buf, int_t* Usub_buf,
-				   double* Uval_buf, gridinfo_t *, LUstruct_t *);
-extern int_t dSchurComplementSetupGPU(int_t k, msgs_t* msgs, packLUInfo_t*,
-				      int_t*, int_t*, int_t*, gEtreeInfo_t*,
-				      factNodelists_t*, scuBufs_t*,
-				      dLUValSubBuf_t* LUvsb, gridinfo_t *,
-				      LUstruct_t *, HyP_t*);
-extern double* dgetBigV(int_t, int_t);
-extern double* dgetBigU(int_t, gridinfo_t *, LUstruct_t *);
-extern int_t getBigUSize(int_t, gridinfo_t *, LUstruct_t *);
-// permutation from superLU default
-extern int_t* getPerm_c_supno(int_t nsupers, superlu_dist_options_t *,
-			      LUstruct_t *, gridinfo_t *);
-extern void getSCUweight(int_t nsupers, treeList_t* treeList, LUstruct_t *, gridinfo3d_t *);
+    extern int_t getNsupers(int, LUstruct_t *);
+    extern int_t initPackLUInfo(int_t nsupers, packLUInfo_t *packLUInfo);
+    extern int freePackLUInfo(packLUInfo_t *packLUInfo);
+    extern int_t dSchurComplementSetup(int_t k, int *msgcnt, Ublock_info_t *,
+                                       Remain_info_t *, uPanelInfo_t *,
+                                       lPanelInfo_t *, int_t *, int_t *, int_t *,
+                                       double *bigU, int_t *Lsub_buf,
+                                       double *Lval_buf, int_t *Usub_buf,
+                                       double *Uval_buf, gridinfo_t *, LUstruct_t *);
+    extern int_t dSchurComplementSetupGPU(int_t k, msgs_t *msgs, packLUInfo_t *,
+                                          int_t *, int_t *, int_t *, gEtreeInfo_t *,
+                                          factNodelists_t *, scuBufs_t *,
+                                          dLUValSubBuf_t *LUvsb, gridinfo_t *,
+                                          LUstruct_t *, HyP_t *);
+    extern double *dgetBigV(int_t, int_t);
+    extern double *dgetBigU(int_t, gridinfo_t *, LUstruct_t *);
+    extern int_t getBigUSize(int_t, gridinfo_t *, LUstruct_t *);
+    // permutation from superLU default
+    extern int_t *getPerm_c_supno(int_t nsupers, superlu_dist_options_t *,
+                                  LUstruct_t *, gridinfo_t *);
+    extern void getSCUweight(int_t nsupers, treeList_t *treeList, LUstruct_t *, gridinfo3d_t *);
 
     /* from treeFactorization.h */
-extern int_t dLluBufInit(dLUValSubBuf_t*, LUstruct_t *);
-extern int_t dinitScuBufs(int_t ldt, int_t num_threads, int_t nsupers,
-			  scuBufs_t*, LUstruct_t*, gridinfo_t *);
-extern int dfreeScuBufs(scuBufs_t* scuBufs);
-
-// the generic tree factoring code 
-extern int_t treeFactor(
-    int_t nnnodes,          // number of nodes in the tree
-    int_t *perm_c_supno,    // list of nodes in the order of factorization
-    commRequests_t *comReqs,    // lists of communication requests
-    scuBufs_t *scuBufs,          // contains buffers for schur complement update
-    packLUInfo_t*packLUInfo,
-    msgs_t*msgs,
-    dLUValSubBuf_t* LUvsb,
-    diagFactBufs_t *dFBuf,
-    factStat_t *factStat,
-    factNodelists_t  *fNlists,
-    superlu_dist_options_t *options,
-    int_t * gIperm_c_supno,
-    int_t ldt,
-    LUstruct_t *LUstruct, gridinfo3d_t * grid3d, SuperLUStat_t *stat,
-    double thresh,  SCT_t *SCT,
-    int *info
-);
-
-extern int_t dsparseTreeFactor(
-    int_t nnodes,          // number of nodes in the tree
-    int_t *perm_c_supno,    // list of nodes in the order of factorization
-    treeTopoInfo_t* treeTopoInfo,
-    commRequests_t *comReqs,    // lists of communication requests
-    scuBufs_t *scuBufs,          // contains buffers for schur complement update
-    packLUInfo_t*packLUInfo,
-    msgs_t*msgs,
-    dLUValSubBuf_t* LUvsb,
-    diagFactBufs_t *dFBuf,
-    factStat_t *factStat,
-    factNodelists_t  *fNlists,
-    superlu_dist_options_t *options,
-    int_t * gIperm_c_supno,
-    int_t ldt,
-    LUstruct_t *LUstruct, gridinfo3d_t * grid3d, SuperLUStat_t *stat,
-    double thresh,  SCT_t *SCT,
-    int *info
-);
-
-extern int_t ddenseTreeFactor(
-    int_t nnnodes,          // number of nodes in the tree
-    int_t *perm_c_supno,    // list of nodes in the order of factorization
-    commRequests_t *comReqs,    // lists of communication requests
-    scuBufs_t *scuBufs,          // contains buffers for schur complement update
-    packLUInfo_t*packLUInfo,
-    msgs_t*msgs,
-    dLUValSubBuf_t* LUvsb,
-    diagFactBufs_t *dFBuf,
-    factStat_t *factStat,
-    factNodelists_t  *fNlists,
-    superlu_dist_options_t *options,
-    int_t * gIperm_c_supno,
-    int_t ldt,
-    LUstruct_t *LUstruct, gridinfo3d_t * grid3d, SuperLUStat_t *stat,
-    double thresh,  SCT_t *SCT, int tag_ub,
-    int *info
-);
-
-extern int_t dsparseTreeFactor_ASYNC(
-    sForest_t* sforest,
-    commRequests_t **comReqss,    // lists of communication requests // size maxEtree level
-    scuBufs_t *scuBufs,          // contains buffers for schur complement update
-    packLUInfo_t*packLUInfo,
-    msgs_t**msgss,                  // size=num Look ahead
-    dLUValSubBuf_t** LUvsbs,          // size=num Look ahead
-    diagFactBufs_t **dFBufs,         // size maxEtree level
-    factStat_t *factStat,
-    factNodelists_t  *fNlists,
-    gEtreeInfo_t*   gEtreeInfo,        // global etree info
-    superlu_dist_options_t *options,
-    int_t * gIperm_c_supno,
-    int_t ldt,
-    HyP_t* HyP,
-    LUstruct_t *LUstruct, gridinfo3d_t * grid3d, SuperLUStat_t *stat,
-    double thresh,  SCT_t *SCT, int tag_ub,
-    int *info
-);
-extern dLUValSubBuf_t** dLluBufInitArr(int_t numLA, LUstruct_t *LUstruct);
-extern int dLluBufFreeArr(int_t numLA, dLUValSubBuf_t **LUvsbs);
-extern diagFactBufs_t** dinitDiagFactBufsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* grid);
-extern int dfreeDiagFactBufsArr(int_t mxLeafNode, diagFactBufs_t** dFBufs);
-extern int_t dinitDiagFactBufs(int_t ldt, diagFactBufs_t* dFBuf);
-extern int_t checkRecvUDiag(int_t k, commRequests_t *comReqs,
-			    gridinfo_t *grid, SCT_t *SCT);
-extern int_t checkRecvLDiag(int_t k, commRequests_t *comReqs, gridinfo_t *, SCT_t *);
+    extern int_t dLluBufInit(dLUValSubBuf_t *, LUstruct_t *);
+    extern int_t dinitScuBufs(int_t ldt, int_t num_threads, int_t nsupers,
+                              scuBufs_t *, LUstruct_t *, gridinfo_t *);
+    extern int dfreeScuBufs(scuBufs_t *scuBufs);
+
+    // the generic tree factoring code
+    extern int_t treeFactor(
+        int_t nnnodes,           // number of nodes in the tree
+        int_t *perm_c_supno,     // list of nodes in the order of factorization
+        commRequests_t *comReqs, // lists of communication requests
+        scuBufs_t *scuBufs,      // contains buffers for schur complement update
+        packLUInfo_t *packLUInfo,
+        msgs_t *msgs,
+        dLUValSubBuf_t *LUvsb,
+        diagFactBufs_t *dFBuf,
+        factStat_t *factStat,
+        factNodelists_t *fNlists,
+        superlu_dist_options_t *options,
+        int_t *gIperm_c_supno,
+        int_t ldt,
+        LUstruct_t *LUstruct, gridinfo3d_t *grid3d, SuperLUStat_t *stat,
+        double thresh, SCT_t *SCT,
+        int *info);
+
+    extern int_t dsparseTreeFactor(
+        int_t nnodes,        // number of nodes in the tree
+        int_t *perm_c_supno, // list of nodes in the order of factorization
+        treeTopoInfo_t *treeTopoInfo,
+        commRequests_t *comReqs, // lists of communication requests
+        scuBufs_t *scuBufs,      // contains buffers for schur complement update
+        packLUInfo_t *packLUInfo,
+        msgs_t *msgs,
+        dLUValSubBuf_t *LUvsb,
+        diagFactBufs_t *dFBuf,
+        factStat_t *factStat,
+        factNodelists_t *fNlists,
+        superlu_dist_options_t *options,
+        int_t *gIperm_c_supno,
+        int_t ldt,
+        LUstruct_t *LUstruct, gridinfo3d_t *grid3d, SuperLUStat_t *stat,
+        double thresh, SCT_t *SCT,
+        int *info);
+
+    extern int_t ddenseTreeFactor(
+        int_t nnnodes,           // number of nodes in the tree
+        int_t *perm_c_supno,     // list of nodes in the order of factorization
+        commRequests_t *comReqs, // lists of communication requests
+        scuBufs_t *scuBufs,      // contains buffers for schur complement update
+        packLUInfo_t *packLUInfo,
+        msgs_t *msgs,
+        dLUValSubBuf_t *LUvsb,
+        diagFactBufs_t *dFBuf,
+        factStat_t *factStat,
+        factNodelists_t *fNlists,
+        superlu_dist_options_t *options,
+        int_t *gIperm_c_supno,
+        int_t ldt,
+        LUstruct_t *LUstruct, gridinfo3d_t *grid3d, SuperLUStat_t *stat,
+        double thresh, SCT_t *SCT, int tag_ub,
+        int *info);
+
+    extern int_t dsparseTreeFactor_ASYNC(
+        sForest_t *sforest,
+        commRequests_t **comReqss, // lists of communication requests // size maxEtree level
+        scuBufs_t *scuBufs,        // contains buffers for schur complement update
+        packLUInfo_t *packLUInfo,
+        msgs_t **msgss,          // size=num Look ahead
+        dLUValSubBuf_t **LUvsbs, // size=num Look ahead
+        diagFactBufs_t **dFBufs, // size maxEtree level
+        factStat_t *factStat,
+        factNodelists_t *fNlists,
+        gEtreeInfo_t *gEtreeInfo, // global etree info
+        superlu_dist_options_t *options,
+        int_t *gIperm_c_supno,
+        int_t ldt,
+        HyP_t *HyP,
+        LUstruct_t *LUstruct, gridinfo3d_t *grid3d, SuperLUStat_t *stat,
+        double thresh, SCT_t *SCT, int tag_ub,
+        int *info);
+    extern dLUValSubBuf_t **dLluBufInitArr(int_t numLA, LUstruct_t *LUstruct);
+    extern int dLluBufFreeArr(int_t numLA, dLUValSubBuf_t **LUvsbs);
+    extern diagFactBufs_t **dinitDiagFactBufsArr(int_t mxLeafNode, int_t ldt, gridinfo_t *grid);
+    extern int dfreeDiagFactBufsArr(int_t mxLeafNode, diagFactBufs_t **dFBufs);
+    extern int_t dinitDiagFactBufs(int_t ldt, diagFactBufs_t *dFBuf);
+    extern int_t checkRecvUDiag(int_t k, commRequests_t *comReqs,
+                                gridinfo_t *grid, SCT_t *SCT);
+    extern int_t checkRecvLDiag(int_t k, commRequests_t *comReqs, gridinfo_t *, SCT_t *);
 
     /* from ancFactorization.h (not called) */
-extern int_t ancestorFactor(
-    int_t ilvl,             // level of factorization 
-    sForest_t* sforest,
-    commRequests_t **comReqss,    // lists of communication requests // size maxEtree level
-    scuBufs_t *scuBufs,          // contains buffers for schur complement update
-    packLUInfo_t*packLUInfo,
-    msgs_t**msgss,                  // size=num Look ahead
-    dLUValSubBuf_t** LUvsbs,          // size=num Look ahead
-    diagFactBufs_t **dFBufs,         // size maxEtree level
-    factStat_t *factStat,
-    factNodelists_t  *fNlists,
-    gEtreeInfo_t*   gEtreeInfo,        // global etree info
-    superlu_dist_options_t *options,
-    int_t * gIperm_c_supno,
-    int_t ldt,
-    HyP_t* HyP,
-    LUstruct_t *LUstruct, gridinfo3d_t * grid3d, SuperLUStat_t *stat,
-    double thresh,  SCT_t *SCT, int tag_ub, int *info
-);
-
-/*=====================*/
+    extern int_t ancestorFactor(
+        int_t ilvl, // level of factorization
+        sForest_t *sforest,
+        commRequests_t **comReqss, // lists of communication requests // size maxEtree level
+        scuBufs_t *scuBufs,        // contains buffers for schur complement update
+        packLUInfo_t *packLUInfo,
+        msgs_t **msgss,          // size=num Look ahead
+        dLUValSubBuf_t **LUvsbs, // size=num Look ahead
+        diagFactBufs_t **dFBufs, // size maxEtree level
+        factStat_t *factStat,
+        factNodelists_t *fNlists,
+        gEtreeInfo_t *gEtreeInfo, // global etree info
+        superlu_dist_options_t *options,
+        int_t *gIperm_c_supno,
+        int_t ldt,
+        HyP_t *HyP,
+        LUstruct_t *LUstruct, gridinfo3d_t *grid3d, SuperLUStat_t *stat,
+        double thresh, SCT_t *SCT, int tag_ub, int *info);
+
+    /*=====================*/
 
 #ifdef __cplusplus
-  }
+}
 #endif
 
 #endif /* __SUPERLU_dDEFS */
-

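For orientation, a minimal sketch of how the lookahead and diagonal-factor buffer arrays prototyped above are allocated and released in matched pairs. The sizes are assumptions taken from the inline comments in the prototypes (numLA = number of look-aheads, mxLeafNode = maximum leaf count per etree level, ldt = maximum supernode size), and the helper function itself is hypothetical:

    #include "superlu_ddefs.h"

    /* Hypothetical round-trip: each *InitArr routine prototyped above is
       paired with the matching *FreeArr routine, called with the same
       leading size argument. */
    static void lookaheadBufsRoundTrip(int_t numLA, int_t mxLeafNode, int_t ldt,
                                       LUstruct_t *LUstruct, gridinfo_t *grid)
    {
        dLUValSubBuf_t **LUvsbs = dLluBufInitArr(numLA, LUstruct);
        diagFactBufs_t **dFBufs = dinitDiagFactBufsArr(mxLeafNode, ldt, grid);

        /* ... tree factorization would use LUvsbs and dFBufs here ... */

        dfreeDiagFactBufsArr(mxLeafNode, dFBufs);
        dLluBufFreeArr(numLA, LUvsbs);
    }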
From 8eeaa09a4757b8e4c20f1febbd6eb4fea9d31a5a Mon Sep 17 00:00:00 2001
From: 7ps 
Date: Sun, 29 Nov 2020 12:29:45 -0500
Subject: [PATCH 046/147] replacing instances of cblas with superlu_blas

---
 SRC/dtrfCommWrapper.c |   33 +-
 SRC/pd3dcomm.c        | 1335 ++++++++++++++++++++---------------------
 SRC/pdgstrf2.c        |  568 +++++++++---------
 3 files changed, 940 insertions(+), 996 deletions(-)
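In a nutshell, the substitution this patch applies: the Fortran-style calls with pointer arguments (dscal_, daxpy_, dtrsm_) and their commented-out cblas_* alternatives both collapse into single superlu_* wrapper calls taking plain int/double arguments. A minimal sketch, with the call shapes copied from the hunks below; the include is assumed to provide the wrapper prototypes, and the helper itself is hypothetical:

    #include "superlu_ddefs.h"  /* assumed to declare the superlu_* BLAS wrappers */

    /* Hypothetical helper: the panel reduction lnzval = alpha*lnzval + beta*Lval_buf,
       written once with the wrappers instead of three #if-guarded variants. */
    static void reduce_panel(int len2, double alpha, double beta,
                             double *lnzval, double *Lval_buf)
    {
        /* before: dscal_(&len2, &alpha, lnzval, &inc);  or cblas_dscal(...) */
        superlu_dscal(len2, alpha, lnzval, 1);
        /* before: daxpy_(&len2, &beta, Lval_buf, &inc, lnzval, &inc);  or cblas_daxpy(...) */
        superlu_daxpy(len2, beta, Lval_buf, 1, lnzval, 1);
    }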

diff --git a/SRC/dtrfCommWrapper.c b/SRC/dtrfCommWrapper.c
index 8ab915bd..4bbf623e 100644
--- a/SRC/dtrfCommWrapper.c
+++ b/SRC/dtrfCommWrapper.c
@@ -170,22 +170,9 @@ int_t dLPanelTrSolve(int_t k, int_t *factored_L,
                 int_t off = i * BL;
                 // Sherry: int_t len = MY_MIN(BL, l - i * BL);
                 int len = SUPERLU_MIN(BL, l - i * BL);
-
-                // #if 1
-                //   #if defined (USE_VENDOR_BLAS)
-                // 		dtrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha,
-                // 			ublk_ptr, &ld_ujrow, &lusup[off], &nsupr,
-                // 			1, 1, 1, 1);
-                //   #else
-                // 		dtrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha,
-                // 			ublk_ptr, &ld_ujrow, &lusup[off], &nsupr);
-                //   #endif
-                // #else
-                //                 cblas_dtrsm (CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit,
-                //                 len, nsupc, alpha, ublk_ptr, ld_ujrow, &lusup[off], nsupr);
-                // #endif
                 superlu_dtrsm("R", "U", "N", "N",
-                              len, nsupc, alpha, ublk_ptr, ld_ujrow, &lusup[off], nsupr);
+                              len, nsupc, alpha, ublk_ptr,
+                              ld_ujrow, &lusup[off], nsupr);
             }
         }
     }
@@ -221,22 +208,10 @@ int_t dLPanelTrSolve(int_t k, int_t *factored_L,
             int len = SUPERLU_MIN(BL, (l - i * BL));
 #pragma omp task
             {
-                // #if 1
-                //   #if defined (USE_VENDOR_BLAS)
-                // 		dtrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha,
-                // 			ublk_ptr, &ld_ujrow, &lusup[nsupc + off], &nsupr,
-                // 			1, 1, 1, 1);
-                //   #else
-                // 		dtrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha,
-                // 			ublk_ptr, &ld_ujrow, &lusup[nsupc + off], &nsupr);
-                //   #endif
-                // #else
-                //                 cblas_dtrsm (CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit,
-                //                              len, nsupc, alpha, ublk_ptr, ld_ujrow, &lusup[nsupc + off], nsupr);
-                // #endif
 
                 superlu_dtrsm("R", "U", "N", "N",
-                              len, nsupc, alpha, ublk_ptr, ld_ujrow, &lusup[nsupc + off], nsupr);
+                              len, nsupc, alpha, ublk_ptr,
+                              ld_ujrow, &lusup[nsupc + off], nsupr);
             }
         }
     }
diff --git a/SRC/pd3dcomm.c b/SRC/pd3dcomm.c
index 98ecd37b..e11fb9af 100644
--- a/SRC/pd3dcomm.c
+++ b/SRC/pd3dcomm.c
@@ -9,7 +9,6 @@ The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
-
 /*! @file
  * \brief Communication routines for the 3D algorithm.
  *
@@ -32,45 +31,45 @@ at the top-level directory.
 #include "xtrf3Dpartition.h"
 #endif
 
-#define INT_T_ALLOC(x)  ((int_t *) SUPERLU_MALLOC ( (x) * sizeof (int_t)))
-#define DOUBLE_ALLOC(x)  ((double *) SUPERLU_MALLOC ( (x) * sizeof (double)))
+#define INT_T_ALLOC(x) ((int_t *)SUPERLU_MALLOC((x) * sizeof(int_t)))
+#define DOUBLE_ALLOC(x) ((double *)SUPERLU_MALLOC((x) * sizeof(double)))
 
 // #define MPI_MALLOC
-#define MPI_INT_ALLOC(a, b) (MPI_Alloc_mem( (b)*sizeof(int_t), MPI_INFO_NULL, &(a) ))
-#define MPI_DATATYPE_ALLOC(a, b) (MPI_Alloc_mem((b)*sizeof(double), MPI_INFO_NULL, &(a)))
+#define MPI_INT_ALLOC(a, b) (MPI_Alloc_mem((b) * sizeof(int_t), MPI_INFO_NULL, &(a)))
+#define MPI_DATATYPE_ALLOC(a, b) (MPI_Alloc_mem((b) * sizeof(double), MPI_INFO_NULL, &(a)))
 
-int_t dAllocLlu(int_t nsupers, LUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+int_t dAllocLlu(int_t nsupers, LUstruct_t *LUstruct, gridinfo3d_t *grid3d)
 {
-    int i;
-    int_t Pc = grid3d->npcol;
-    int_t Pr = grid3d->nprow;
-    
-    int_t nbc = CEILING(nsupers, Pc);
-    int_t nbr = CEILING(nsupers, Pr);
-    
-    LocalLU_t *Llu = LUstruct->Llu;
-    int_t   **Lrowind_bc_ptr =
-	(int_t**) SUPERLU_MALLOC(sizeof(int_t*)*nbc); 	/* size ceil(NSUPERS/Pc) */
-    double  **Lnzval_bc_ptr =
-	(double **) SUPERLU_MALLOC(sizeof(double*)*nbc);  /* size ceil(NSUPERS/Pc) */
-
-    for (i = 0; i < nbc ; ++i)
+	int i;
+	int_t Pc = grid3d->npcol;
+	int_t Pr = grid3d->nprow;
+
+	int_t nbc = CEILING(nsupers, Pc);
+	int_t nbr = CEILING(nsupers, Pr);
+
+	LocalLU_t *Llu = LUstruct->Llu;
+	int_t **Lrowind_bc_ptr =
+		(int_t **)SUPERLU_MALLOC(sizeof(int_t *) * nbc); /* size ceil(NSUPERS/Pc) */
+	double **Lnzval_bc_ptr =
+		(double **)SUPERLU_MALLOC(sizeof(double *) * nbc); /* size ceil(NSUPERS/Pc) */
+
+	for (i = 0; i < nbc; ++i)
 	{
-	    /* code */
-	    Lrowind_bc_ptr[i] = NULL;
-	    Lnzval_bc_ptr[i] = NULL;
+		/* code */
+		Lrowind_bc_ptr[i] = NULL;
+		Lnzval_bc_ptr[i] = NULL;
 	}
-    
-    int_t   **Ufstnz_br_ptr =
-	(int_t**) SUPERLU_MALLOC(sizeof(int_t*)*nbr); /* size ceil(NSUPERS/Pr) */
-    double  **Unzval_br_ptr =
-	(double **) SUPERLU_MALLOC(sizeof(double*)*nbr); /* size ceil(NSUPERS/Pr) */
-    
-    for (i = 0; i < nbr ; ++i)
+
+	int_t **Ufstnz_br_ptr =
+		(int_t **)SUPERLU_MALLOC(sizeof(int_t *) * nbr); /* size ceil(NSUPERS/Pr) */
+	double **Unzval_br_ptr =
+		(double **)SUPERLU_MALLOC(sizeof(double *) * nbr); /* size ceil(NSUPERS/Pr) */
+
+	for (i = 0; i < nbr; ++i)
 	{
-	    /* code */
-	    Ufstnz_br_ptr[i] = NULL;
-	    Unzval_br_ptr[i] = NULL;
+		/* code */
+		Ufstnz_br_ptr[i] = NULL;
+		Unzval_br_ptr[i] = NULL;
 	}
 
 #if 0 // Sherry: change to int type    
@@ -78,818 +77,796 @@ int_t dAllocLlu(int_t nsupers, LUstruct_t * LUstruct, gridinfo3d_t* grid3d)
     int_t *ToSendD = intCalloc_dist(nbr); /* Whether need to send down block row. */
     int_t **ToSendR = (int_t **) SUPERLU_MALLOC(nbc * sizeof(int_t*)); /* List of processes to send right block col. */
 #else
-                  /* Recv from no one (0), left (1), and up (2).*/
-    int *ToRecv = SUPERLU_MALLOC(nsupers * sizeof(int));
-    for (i = 0; i < nsupers; ++i) ToRecv[i] = 0;
-                  /* Whether need to send down block row. */
-    int *ToSendD = SUPERLU_MALLOC(nbr * sizeof(int));
-    for (i = 0; i < nbr; ++i) ToSendD[i] = 0;
-                  /* List of processes to send right block col. */
-    int **ToSendR = (int **) SUPERLU_MALLOC(nbc * sizeof(int*));
-#endif    
-
-    for (i = 0; i < nbc; ++i)
+	/* Recv from no one (0), left (1), and up (2).*/
+	int *ToRecv = SUPERLU_MALLOC(nsupers * sizeof(int));
+	for (i = 0; i < nsupers; ++i)
+		ToRecv[i] = 0;
+	/* Whether need to send down block row. */
+	int *ToSendD = SUPERLU_MALLOC(nbr * sizeof(int));
+	for (i = 0; i < nbr; ++i)
+		ToSendD[i] = 0;
+	/* List of processes to send right block col. */
+	int **ToSendR = (int **)SUPERLU_MALLOC(nbc * sizeof(int *));
+#endif
+
+	for (i = 0; i < nbc; ++i)
 	{
-	    /* code */
-	    //ToSendR[i] = INT_T_ALLOC(Pc);
-	    ToSendR[i] = SUPERLU_MALLOC(Pc * sizeof(int));
+		/* code */
+		//ToSendR[i] = INT_T_ALLOC(Pc);
+		ToSendR[i] = SUPERLU_MALLOC(Pc * sizeof(int));
 	}
-    
-    /*now setup the pointers*/
-    Llu->Lrowind_bc_ptr = Lrowind_bc_ptr ;
-    Llu->Lnzval_bc_ptr = Lnzval_bc_ptr ;
-    Llu->Ufstnz_br_ptr = Ufstnz_br_ptr ;
-    Llu->Unzval_br_ptr = Unzval_br_ptr ;
-    Llu->ToRecv = ToRecv ;
-    Llu->ToSendD = ToSendD ;
-    Llu->ToSendR = ToSendR ;
-    
-    return 0;
+
+	/*now setup the pointers*/
+	Llu->Lrowind_bc_ptr = Lrowind_bc_ptr;
+	Llu->Lnzval_bc_ptr = Lnzval_bc_ptr;
+	Llu->Ufstnz_br_ptr = Ufstnz_br_ptr;
+	Llu->Unzval_br_ptr = Unzval_br_ptr;
+	Llu->ToRecv = ToRecv;
+	Llu->ToSendD = ToSendD;
+	Llu->ToSendR = ToSendR;
+
+	return 0;
 } /* dAllocLlu */
 
-int_t dmpiMallocLUStruct(int_t nsupers, LUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+int_t dmpiMallocLUStruct(int_t nsupers, LUstruct_t *LUstruct, gridinfo3d_t *grid3d)
 {
-    LocalLU_t *Llu = LUstruct->Llu;
-    int_t* xsup = LUstruct->Glu_persist->xsup;
-    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
-    double** Unzval_br_ptr = Llu->Unzval_br_ptr;
-    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
-    double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
-    gridinfo_t* grid = &(grid3d->grid2d);
-    
-    int_t k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
-    for ( int_t lb = 0; lb < k; ++lb)
+	LocalLU_t *Llu = LUstruct->Llu;
+	int_t *xsup = LUstruct->Glu_persist->xsup;
+	int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+	double **Unzval_br_ptr = Llu->Unzval_br_ptr;
+	int_t **Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+	double **Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+	gridinfo_t *grid = &(grid3d->grid2d);
+
+	int_t k = CEILING(nsupers, grid->nprow); /* Number of local block rows */
+	for (int_t lb = 0; lb < k; ++lb)
 	{
-	    int_t *usub, *usub_new;
-	    usub =  Ufstnz_br_ptr[lb];
-	    
-	    double * uval = Unzval_br_ptr[lb];
-	    double * uval_new;
-	    
-	    /*if non empty set the flag*/
-	    if (usub != NULL)
+		int_t *usub, *usub_new;
+		usub = Ufstnz_br_ptr[lb];
+
+		double *uval = Unzval_br_ptr[lb];
+		double *uval_new;
+
+		/*if non empty set the flag*/
+		if (usub != NULL)
 		{
-		    int_t lenv, lens;
-		    lenv = usub[1];
-		    lens = usub[2];
-		    
-		    MPI_INT_ALLOC(usub_new, lens);
-		    memcpy( usub_new, usub, lens * sizeof(int_t));
-		    MPI_DATATYPE_ALLOC(uval_new, lenv);
-		    memcpy( uval_new, uval, lenv * sizeof(double));
-		    Ufstnz_br_ptr[lb] = usub_new;
-		    Unzval_br_ptr[lb] = uval_new;
-		    SUPERLU_FREE(usub);
-		    SUPERLU_FREE(uval);
+			int_t lenv, lens;
+			lenv = usub[1];
+			lens = usub[2];
+
+			MPI_INT_ALLOC(usub_new, lens);
+			memcpy(usub_new, usub, lens * sizeof(int_t));
+			MPI_DATATYPE_ALLOC(uval_new, lenv);
+			memcpy(uval_new, uval, lenv * sizeof(double));
+			Ufstnz_br_ptr[lb] = usub_new;
+			Unzval_br_ptr[lb] = uval_new;
+			SUPERLU_FREE(usub);
+			SUPERLU_FREE(uval);
 		}
 	} /*for ( int_t lb = 0; lb < k; ++lb)*/
-    
-    int_t iam = grid->iam;
-    int_t mycol = MYCOL (iam, grid);
-    
-    /*start broadcasting blocks*/
-    for (int_t jb = 0; jb < nsupers; ++jb)   /* for each block column ... */
+
+	int_t iam = grid->iam;
+	int_t mycol = MYCOL(iam, grid);
+
+	/*start broadcasting blocks*/
+	for (int_t jb = 0; jb < nsupers; ++jb) /* for each block column ... */
 	{
-	    int_t pc = PCOL( jb, grid );
-	    if (mycol == pc)
+		int_t pc = PCOL(jb, grid);
+		if (mycol == pc)
 		{
-		    int_t ljb = LBj( jb, grid ); /* Local block number */
-		    int_t  *lsub , *lsub_new;
-		    double *lnzval, *lnzval_new;
-		    lsub = Lrowind_bc_ptr[ljb];
-		    lnzval = Lnzval_bc_ptr[ljb];
-		    
-		    if (lsub)
+			int_t ljb = LBj(jb, grid); /* Local block number */
+			int_t *lsub, *lsub_new;
+			double *lnzval, *lnzval_new;
+			lsub = Lrowind_bc_ptr[ljb];
+			lnzval = Lnzval_bc_ptr[ljb];
+
+			if (lsub)
 			{
-			    int_t nrbl, len, len1, len2;
-			    
-			    nrbl  =   lsub[0]; /*number of L blocks */
-			    len   = lsub[1];       /* LDA of the nzval[] */
-			    len1  = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
-			    len2  = SuperSize(jb) * len;
-			    
-			    MPI_INT_ALLOC(lsub_new, len1);
-			    memcpy( lsub_new, lsub, len1 * sizeof(int_t));
-			    MPI_DATATYPE_ALLOC(lnzval_new, len2);
-			    memcpy( lnzval_new, lnzval, len2 * sizeof(double));
-			    Lrowind_bc_ptr[ljb] = lsub_new;
-			    SUPERLU_FREE(lsub );
-			    Lnzval_bc_ptr[ljb] = lnzval_new;
-			    SUPERLU_FREE(lnzval );
+				int_t nrbl, len, len1, len2;
+
+				nrbl = lsub[0]; /*number of L blocks */
+				len = lsub[1];	/* LDA of the nzval[] */
+				len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
+				len2 = SuperSize(jb) * len;
+
+				MPI_INT_ALLOC(lsub_new, len1);
+				memcpy(lsub_new, lsub, len1 * sizeof(int_t));
+				MPI_DATATYPE_ALLOC(lnzval_new, len2);
+				memcpy(lnzval_new, lnzval, len2 * sizeof(double));
+				Lrowind_bc_ptr[ljb] = lsub_new;
+				SUPERLU_FREE(lsub);
+				Lnzval_bc_ptr[ljb] = lnzval_new;
+				SUPERLU_FREE(lnzval);
 			}
 		} /* if mycol == pc ... */
-	} /* for jb ... */
-    
-    return 0;
-}
+	}	  /* for jb ... */
 
+	return 0;
+}
 
 int_t dzSendLPanel(int_t k, int_t receiver,
-                   LUstruct_t* LUstruct,  gridinfo3d_t* grid3d, SCT_t* SCT)
+				   LUstruct_t *LUstruct, gridinfo3d_t *grid3d, SCT_t *SCT)
 {
-    LocalLU_t *Llu = LUstruct->Llu;
-    int_t* xsup = LUstruct->Glu_persist->xsup;
-    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
-    double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
-    gridinfo_t* grid = &(grid3d->grid2d);
-    int_t iam = grid->iam;
-    int_t mycol = MYCOL (iam, grid);
-
-    int_t pc = PCOL( k, grid );
-    if (mycol == pc)
+	LocalLU_t *Llu = LUstruct->Llu;
+	int_t *xsup = LUstruct->Glu_persist->xsup;
+	int_t **Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+	double **Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+	gridinfo_t *grid = &(grid3d->grid2d);
+	int_t iam = grid->iam;
+	int_t mycol = MYCOL(iam, grid);
+
+	int_t pc = PCOL(k, grid);
+	if (mycol == pc)
 	{
-	    int_t lk = LBj( k, grid ); /* Local block number */
-	    int_t  *lsub;
-	    double* lnzval;
-	    lsub = Lrowind_bc_ptr[lk];
-	    lnzval = Lnzval_bc_ptr[lk];
-	    
-	    if (lsub != NULL)
+		int_t lk = LBj(k, grid); /* Local block number */
+		int_t *lsub;
+		double *lnzval;
+		lsub = Lrowind_bc_ptr[lk];
+		lnzval = Lnzval_bc_ptr[lk];
+
+		if (lsub != NULL)
 		{
-		    int_t len   = lsub[1];       /* LDA of the nzval[] */
-		    int_t len2  = SuperSize(k) * len; /* size of nzval of L panel */
-		    
-		    MPI_Send(lnzval, len2, MPI_DOUBLE, receiver, k, grid3d->zscp.comm);
-		    SCT->commVolRed += len2 * sizeof(double);
+			int_t len = lsub[1];			 /* LDA of the nzval[] */
+			int_t len2 = SuperSize(k) * len; /* size of nzval of L panel */
+
+			MPI_Send(lnzval, len2, MPI_DOUBLE, receiver, k, grid3d->zscp.comm);
+			SCT->commVolRed += len2 * sizeof(double);
 		}
 	}
-    return 0;
+	return 0;
 }
 
-
 int_t dzRecvLPanel(int_t k, int_t sender, double alpha, double beta,
-                    double* Lval_buf,
-                    LUstruct_t* LUstruct,  gridinfo3d_t* grid3d, SCT_t* SCT)
+				   double *Lval_buf,
+				   LUstruct_t *LUstruct, gridinfo3d_t *grid3d, SCT_t *SCT)
 {
-    
-    // A(k) = alpha*A(k) + beta* A^{sender}(k)
-    LocalLU_t *Llu = LUstruct->Llu;
-    int_t* xsup = LUstruct->Glu_persist->xsup;
-    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
-    double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
-    gridinfo_t* grid = &(grid3d->grid2d);
-    int inc = 1;    
-    int_t iam = grid->iam;
-    int_t mycol = MYCOL (iam, grid);
-    
-    int_t pc = PCOL( k, grid );
-    if (mycol == pc)
+
+	// A(k) = alpha*A(k) + beta* A^{sender}(k)
+	LocalLU_t *Llu = LUstruct->Llu;
+	int_t *xsup = LUstruct->Glu_persist->xsup;
+	int_t **Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+	double **Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+	gridinfo_t *grid = &(grid3d->grid2d);
+	int inc = 1;
+	int_t iam = grid->iam;
+	int_t mycol = MYCOL(iam, grid);
+
+	int_t pc = PCOL(k, grid);
+	if (mycol == pc)
 	{
-	    int_t lk = LBj( k, grid ); /* Local block number */
-	    int_t  *lsub;
-	    double* lnzval;
-	    lsub = Lrowind_bc_ptr[lk];
-	    lnzval = Lnzval_bc_ptr[lk];
-	    
-	    if (lsub != NULL)
+		int_t lk = LBj(k, grid); /* Local block number */
+		int_t *lsub;
+		double *lnzval;
+		lsub = Lrowind_bc_ptr[lk];
+		lnzval = Lnzval_bc_ptr[lk];
+
+		if (lsub != NULL)
 		{
-		    
-		    int len   = lsub[1];       /* LDA of the nzval[] */
-		    int len2  = SuperSize(k) * len;	/*size of nzval of L panels*/
-		    
-		    MPI_Status status;
-		    MPI_Recv(Lval_buf , len2, MPI_DOUBLE, sender, k,
-			     grid3d->zscp.comm, &status);
-		    
-		    /*reduce the updates*/
-#if 1
-		    dscal_(&len2, &alpha, lnzval, &inc);
-		    daxpy_(&len2, &beta, Lval_buf, &inc, lnzval, &inc);
-#else
-		    cblas_dscal (len2, alpha, lnzval, 1);
-		    cblas_daxpy (len2, beta, Lval_buf, 1, lnzval, 1);
-#endif
+
+			int len = lsub[1];			   /* LDA of the nzval[] */
+			int len2 = SuperSize(k) * len; /*size of nzval of L panels*/
+
+			MPI_Status status;
+			MPI_Recv(Lval_buf, len2, MPI_DOUBLE, sender, k,
+					 grid3d->zscp.comm, &status);
+
+			/*reduce the updates*/
+			superlu_dscal(len2, alpha, lnzval, 1);
+			superlu_daxpy(len2, beta, Lval_buf, 1, lnzval, 1);
 		}
 	}
 
-    return 0;
+	return 0;
 }
 
 int_t dzSendUPanel(int_t k, int_t receiver,
-                    LUstruct_t* LUstruct,  gridinfo3d_t* grid3d, SCT_t* SCT)
+				   LUstruct_t *LUstruct, gridinfo3d_t *grid3d, SCT_t *SCT)
 {
-    LocalLU_t *Llu = LUstruct->Llu;
-    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
-    double** Unzval_br_ptr = Llu->Unzval_br_ptr;
-    gridinfo_t* grid = &(grid3d->grid2d);
-    int_t iam = grid->iam;
-
-    int_t myrow = MYROW (iam, grid);
-    int_t pr = PROW( k, grid );
-    if (myrow == pr)
+	LocalLU_t *Llu = LUstruct->Llu;
+	int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+	double **Unzval_br_ptr = Llu->Unzval_br_ptr;
+	gridinfo_t *grid = &(grid3d->grid2d);
+	int_t iam = grid->iam;
+
+	int_t myrow = MYROW(iam, grid);
+	int_t pr = PROW(k, grid);
+	if (myrow == pr)
 	{
-	    int_t lk = LBi( k, grid ); /* Local block number */
-	    int_t  *usub;
-	    double* unzval;
-	    usub = Ufstnz_br_ptr[lk];
-	    unzval = Unzval_br_ptr[lk];
-	    
-	    if (usub != NULL)
+		int_t lk = LBi(k, grid); /* Local block number */
+		int_t *usub;
+		double *unzval;
+		usub = Ufstnz_br_ptr[lk];
+		unzval = Unzval_br_ptr[lk];
+
+		if (usub != NULL)
 		{
-		    int lenv = usub[1];
-		    
-		    /* code */
-		    MPI_Send(unzval, lenv, MPI_DOUBLE, receiver, k, grid3d->zscp.comm);
-		    SCT->commVolRed += lenv * sizeof(double);
+			int lenv = usub[1];
+
+			/* code */
+			MPI_Send(unzval, lenv, MPI_DOUBLE, receiver, k, grid3d->zscp.comm);
+			SCT->commVolRed += lenv * sizeof(double);
 		}
 	}
-	
-    return 0;
-}
 
+	return 0;
+}
 
 int_t dzRecvUPanel(int_t k, int_t sender, double alpha, double beta,
-                    double* Uval_buf, LUstruct_t* LUstruct,
-                    gridinfo3d_t* grid3d, SCT_t* SCT)
+				   double *Uval_buf, LUstruct_t *LUstruct,
+				   gridinfo3d_t *grid3d, SCT_t *SCT)
 {
-    LocalLU_t *Llu = LUstruct->Llu;
-    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
-    double** Unzval_br_ptr = Llu->Unzval_br_ptr;
-    gridinfo_t* grid = &(grid3d->grid2d);
-    int inc = 1;
-    int_t iam = grid->iam;
-    int_t myrow = MYROW (iam, grid);
-    int_t pr = PROW( k, grid );
-
-    if (myrow == pr)
+	LocalLU_t *Llu = LUstruct->Llu;
+	int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+	double **Unzval_br_ptr = Llu->Unzval_br_ptr;
+	gridinfo_t *grid = &(grid3d->grid2d);
+	int inc = 1;
+	int_t iam = grid->iam;
+	int_t myrow = MYROW(iam, grid);
+	int_t pr = PROW(k, grid);
+
+	if (myrow == pr)
 	{
-	    int_t lk = LBi( k, grid ); /* Local block number */
-	    int_t  *usub;
-	    double* unzval;
-	    usub = Ufstnz_br_ptr[lk];
-	    unzval = Unzval_br_ptr[lk];
-	    
-	    if (usub != NULL)
+		int_t lk = LBi(k, grid); /* Local block number */
+		int_t *usub;
+		double *unzval;
+		usub = Ufstnz_br_ptr[lk];
+		unzval = Unzval_br_ptr[lk];
+
+		if (usub != NULL)
 		{
-		    int lenv = usub[1];
-		    MPI_Status status;
-		    MPI_Recv(Uval_buf , lenv, MPI_DOUBLE, sender, k,
-			     grid3d->zscp.comm, &status);
-		    
-		    /*reduce the updates*/
-#if 1
-		    dscal_(&lenv, &alpha, unzval, &inc);
-		    daxpy_(&lenv, &beta, Uval_buf, &inc, unzval, &inc);
-#else
-		    cblas_dscal (lenv, alpha, unzval, 1);
-		    cblas_daxpy (lenv, beta, Uval_buf, 1, unzval, 1);
-#endif
+			int lenv = usub[1];
+			MPI_Status status;
+			MPI_Recv(Uval_buf, lenv, MPI_DOUBLE, sender, k,
+					 grid3d->zscp.comm, &status);
+
+			/*reduce the updates*/
+			superlu_dscal(lenv, alpha, unzval, 1);
+			superlu_daxpy(lenv, beta, Uval_buf, 1, unzval, 1);
 		}
 	}
-    return 0;
+	return 0;
 }
 
-
-int_t dp3dScatter(int_t n, LUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+int_t dp3dScatter(int_t n, LUstruct_t *LUstruct, gridinfo3d_t *grid3d)
 /* Copies LU structure from layer 0 to all the layers */
 {
-    gridinfo_t* grid = &(grid3d->grid2d);
-    int_t Pc = grid->npcol;
-    int_t Pr = grid->nprow;
-    
-    /* broadcast etree */
-    int_t *etree = LUstruct->etree;
-    MPI_Bcast( etree, n, mpi_int_t, 0,  grid3d->zscp.comm);
-    
-    int_t nsupers;
-    
-    if (!grid3d->zscp.Iam)
-	nsupers = getNsupers(n, LUstruct);
-    
-    /* broadcast nsupers */
-    MPI_Bcast( &nsupers, 1, mpi_int_t, 0,  grid3d->zscp.comm);
-    
-    /* Scatter and alloc Glu_persist */
-    if ( grid3d->zscp.Iam ) // all other process layers not equal 0
-	AllocGlu_3d(n, nsupers, LUstruct);
-    
-    /* broadcast Glu_persist */
-    int_t *xsup = LUstruct->Glu_persist->xsup;
-    MPI_Bcast( xsup, nsupers + 1, mpi_int_t, 0,  grid3d->zscp.comm);
-    
-    int_t *supno = LUstruct->Glu_persist->supno;
-    MPI_Bcast( supno, n, mpi_int_t, 0,  grid3d->zscp.comm);
-    
-    /* now broadcast localLu_t */
-    /* first allocating space for it */
-    if ( grid3d->zscp.Iam ) // all other process layers not equal 0
-	dAllocLlu(nsupers, LUstruct, grid3d);
-    
-    LocalLU_t *Llu = LUstruct->Llu;
-    
-    /*scatter all the L blocks and indexes*/
-    dscatter3dLPanels( nsupers, LUstruct, grid3d);
-
-    /*scatter all the U blocks and indexes*/
-    dscatter3dUPanels( nsupers, LUstruct, grid3d);
-    
-    int_t* bufmax = Llu->bufmax;
-    MPI_Bcast( bufmax, NBUFFERS, mpi_int_t, 0,  grid3d->zscp.comm);
-    
-    /* now sending tosendR etc */
-    int** ToSendR = Llu->ToSendR;
-    int* ToRecv = Llu->ToRecv;
-    int* ToSendD = Llu->ToSendD;
-    
-    int_t nbr = CEILING(nsupers, Pr);
-    int_t nbc = CEILING(nsupers, Pc);
-    //Sherry MPI_Bcast( ToRecv, nsupers, mpi_int_t, 0,  grid3d->zscp.comm);
-    MPI_Bcast( ToRecv, nsupers, MPI_INT, 0,  grid3d->zscp.comm);
-    
-    MPI_Bcast( ToSendD, nbr, MPI_INT, 0,  grid3d->zscp.comm);
-    for (int i = 0; i < nbc; ++i)
+	gridinfo_t *grid = &(grid3d->grid2d);
+	int_t Pc = grid->npcol;
+	int_t Pr = grid->nprow;
+
+	/* broadcast etree */
+	int_t *etree = LUstruct->etree;
+	MPI_Bcast(etree, n, mpi_int_t, 0, grid3d->zscp.comm);
+
+	int_t nsupers;
+
+	if (!grid3d->zscp.Iam)
+		nsupers = getNsupers(n, LUstruct);
+
+	/* broadcast nsupers */
+	MPI_Bcast(&nsupers, 1, mpi_int_t, 0, grid3d->zscp.comm);
+
+	/* Scatter and alloc Glu_persist */
+	if (grid3d->zscp.Iam) // all other process layers not equal 0
+		AllocGlu_3d(n, nsupers, LUstruct);
+
+	/* broadcast Glu_persist */
+	int_t *xsup = LUstruct->Glu_persist->xsup;
+	MPI_Bcast(xsup, nsupers + 1, mpi_int_t, 0, grid3d->zscp.comm);
+
+	int_t *supno = LUstruct->Glu_persist->supno;
+	MPI_Bcast(supno, n, mpi_int_t, 0, grid3d->zscp.comm);
+
+	/* now broadcast localLu_t */
+	/* first allocating space for it */
+	if (grid3d->zscp.Iam) // all other process layers not equal 0
+		dAllocLlu(nsupers, LUstruct, grid3d);
+
+	LocalLU_t *Llu = LUstruct->Llu;
+
+	/*scatter all the L blocks and indexes*/
+	dscatter3dLPanels(nsupers, LUstruct, grid3d);
+
+	/*scatter all the U blocks and indexes*/
+	dscatter3dUPanels(nsupers, LUstruct, grid3d);
+
+	int_t *bufmax = Llu->bufmax;
+	MPI_Bcast(bufmax, NBUFFERS, mpi_int_t, 0, grid3d->zscp.comm);
+
+	/* now sending tosendR etc */
+	int **ToSendR = Llu->ToSendR;
+	int *ToRecv = Llu->ToRecv;
+	int *ToSendD = Llu->ToSendD;
+
+	int_t nbr = CEILING(nsupers, Pr);
+	int_t nbc = CEILING(nsupers, Pc);
+	//Sherry MPI_Bcast( ToRecv, nsupers, mpi_int_t, 0,  grid3d->zscp.comm);
+	MPI_Bcast(ToRecv, nsupers, MPI_INT, 0, grid3d->zscp.comm);
+
+	MPI_Bcast(ToSendD, nbr, MPI_INT, 0, grid3d->zscp.comm);
+	for (int i = 0; i < nbc; ++i)
 	{
-	    /* code */
-	    MPI_Bcast( ToSendR[i], Pc, MPI_INT, 0,  grid3d->zscp.comm);
+		/* code */
+		MPI_Bcast(ToSendR[i], Pc, MPI_INT, 0, grid3d->zscp.comm);
 	}
-    
-    //
+
+	//
 #ifdef MPI_MALLOC
-    // change MY LU struct into MPI malloc based
-    if (!grid3d->zscp.Iam)
-	mpiMallocLUStruct(nsupers, LUstruct, grid3d);
+	// change MY LU struct into MPI malloc based
+	if (!grid3d->zscp.Iam)
+		mpiMallocLUStruct(nsupers, LUstruct, grid3d);
 #endif
-    return 0;
+	return 0;
 } /* dp3dScatter */
 
-
 int_t dscatter3dUPanels(int_t nsupers,
-		       LUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+						LUstruct_t *LUstruct, gridinfo3d_t *grid3d)
 {
 
-    LocalLU_t *Llu = LUstruct->Llu;
-    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
-    double** Unzval_br_ptr = Llu->Unzval_br_ptr;
-    gridinfo_t* grid = &(grid3d->grid2d);
-    
-    int_t k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
-    for ( int_t lb = 0; lb < k; ++lb) {
-	int_t *usub;
-	usub =  Ufstnz_br_ptr[lb];
-	
-	double * uval = Unzval_br_ptr[lb];
-	
-	int_t flag = 0;
-	/*if non empty set the flag*/
-	if (!grid3d->zscp.Iam && usub != NULL)
-	    flag = 1;
-	/*bcast the flag*/
-	MPI_Bcast( &flag, 1, mpi_int_t, 0,  grid3d->zscp.comm);
-	
-	if (flag) {
-	    int_t lenv, lens;
-	    lenv = 0;
-	    lens = 0;
-	    
-	    if (!grid3d->zscp.Iam)
+	LocalLU_t *Llu = LUstruct->Llu;
+	int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+	double **Unzval_br_ptr = Llu->Unzval_br_ptr;
+	gridinfo_t *grid = &(grid3d->grid2d);
+
+	int_t k = CEILING(nsupers, grid->nprow); /* Number of local block rows */
+	for (int_t lb = 0; lb < k; ++lb)
+	{
+		int_t *usub;
+		usub = Ufstnz_br_ptr[lb];
+
+		double *uval = Unzval_br_ptr[lb];
+
+		int_t flag = 0;
+		/*if non empty set the flag*/
+		if (!grid3d->zscp.Iam && usub != NULL)
+			flag = 1;
+		/*bcast the flag*/
+		MPI_Bcast(&flag, 1, mpi_int_t, 0, grid3d->zscp.comm);
+
+		if (flag)
 		{
-		    lenv = usub[1];
-		    lens = usub[2];
-		}
-	    
-	    /*broadcast the size of sub array*/
-	    MPI_Bcast( &lens, 1, mpi_int_t, 0,  grid3d->zscp.comm);
-	    MPI_Bcast( &lenv, 1, mpi_int_t, 0,  grid3d->zscp.comm);
-	    
-	    /*allocate lsub*/
-	    if (grid3d->zscp.Iam)
+			int_t lenv, lens;
+			lenv = 0;
+			lens = 0;
+
+			if (!grid3d->zscp.Iam)
+			{
+				lenv = usub[1];
+				lens = usub[2];
+			}
+
+			/*broadcast the size of sub array*/
+			MPI_Bcast(&lens, 1, mpi_int_t, 0, grid3d->zscp.comm);
+			MPI_Bcast(&lenv, 1, mpi_int_t, 0, grid3d->zscp.comm);
+
+			/*allocate lsub*/
+			if (grid3d->zscp.Iam)
 #ifdef MPI_MALLOC
-		MPI_INT_ALLOC(usub, lens);
+				MPI_INT_ALLOC(usub, lens);
 #else
- 	        usub = INT_T_ALLOC(lens);
+				usub = INT_T_ALLOC(lens);
 #endif
 
-	    /*bcast usub*/
-	    MPI_Bcast( usub, lens, mpi_int_t, 0,  grid3d->zscp.comm);
+			/*bcast usub*/
+			MPI_Bcast(usub, lens, mpi_int_t, 0, grid3d->zscp.comm);
 
-	    /*allocate uval*/
-	    if (grid3d->zscp.Iam)
+			/*allocate uval*/
+			if (grid3d->zscp.Iam)
 #ifdef MPI_MALLOC
-		MPI_DATATYPE_ALLOC(uval, lenv);
+				MPI_DATATYPE_ALLOC(uval, lenv);
 #else
-	        uval = doubleMalloc_dist(lenv); //DOUBLE_ALLOC(lenv);
+				uval = doubleMalloc_dist(lenv); //DOUBLE_ALLOC(lenv);
 #endif
-	    /*broadcast uval*/
-	    MPI_Bcast( uval, lenv, MPI_DOUBLE, 0,  grid3d->zscp.comm);
-	    
-	    /*setup the pointer*/
-	    Unzval_br_ptr[lb] = uval;
-	    Ufstnz_br_ptr[lb] = usub;
-	} /* end if flag */
-
-    } /* end for lb ... */
-    return 0;
-} /* end dScatter3dUPanels */
+			/*broadcast uval*/
+			MPI_Bcast(uval, lenv, MPI_DOUBLE, 0, grid3d->zscp.comm);
+
+			/*setup the pointer*/
+			Unzval_br_ptr[lb] = uval;
+			Ufstnz_br_ptr[lb] = usub;
+		} /* end if flag */
 
+	} /* end for lb ... */
+	return 0;
+} /* end dScatter3dUPanels */
 
 int_t dscatter3dLPanels(int_t nsupers,
-                       LUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+						LUstruct_t *LUstruct, gridinfo3d_t *grid3d)
 {
-    LocalLU_t *Llu = LUstruct->Llu;
-    int_t* xsup = LUstruct->Glu_persist->xsup;
-    gridinfo_t* grid = &(grid3d->grid2d);
-    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
-    double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
-    int_t iam = grid->iam;
-    
-    int_t mycol = MYCOL (iam, grid);
-    
-    /*start broadcasting blocks*/
-    for (int_t jb = 0; jb < nsupers; ++jb)   /* for each block column ... */
-    {
-	int_t pc = PCOL( jb, grid );
-	if (mycol == pc)
-        {
-	    int_t ljb = LBj( jb, grid ); /* Local block number */
-	    int_t  *lsub;
-	    double* lnzval;
-	    lsub = Lrowind_bc_ptr[ljb];
-	    lnzval = Lnzval_bc_ptr[ljb];
-		
-	    int_t flag = 0;
-	    /*if non empty set the flag*/
-	    if (!grid3d->zscp.Iam && lsub != NULL)
-		    flag = 1;
-            /*bcast the flag*/
-	    MPI_Bcast( &flag, 1, mpi_int_t, 0,  grid3d->zscp.comm);
-		
-            if (flag) {
-		int_t nrbl, len, len1, len2;
-		if (!grid3d->zscp.Iam)
-		    {
-			nrbl  =   lsub[0]; /*number of L blocks */
-			len   = lsub[1];   /* LDA of the nzval[] */
-			len1  = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
-			len2  = SuperSize(jb) * len;
-		    }
-
-		/*bcast lsub len*/
-		MPI_Bcast( &len1, 1, mpi_int_t, 0,  grid3d->zscp.comm);
-		    
-   	        /*allocate lsub*/
-		if (grid3d->zscp.Iam)
+	LocalLU_t *Llu = LUstruct->Llu;
+	int_t *xsup = LUstruct->Glu_persist->xsup;
+	gridinfo_t *grid = &(grid3d->grid2d);
+	int_t **Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+	double **Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+	int_t iam = grid->iam;
+
+	int_t mycol = MYCOL(iam, grid);
+
+	/*start broadcasting blocks*/
+	for (int_t jb = 0; jb < nsupers; ++jb) /* for each block column ... */
+	{
+		int_t pc = PCOL(jb, grid);
+		if (mycol == pc)
+		{
+			int_t ljb = LBj(jb, grid); /* Local block number */
+			int_t *lsub;
+			double *lnzval;
+			lsub = Lrowind_bc_ptr[ljb];
+			lnzval = Lnzval_bc_ptr[ljb];
+
+			int_t flag = 0;
+			/*if non empty set the flag*/
+			if (!grid3d->zscp.Iam && lsub != NULL)
+				flag = 1;
+			/*bcast the flag*/
+			MPI_Bcast(&flag, 1, mpi_int_t, 0, grid3d->zscp.comm);
+
+			if (flag)
+			{
+				int_t nrbl, len, len1, len2;
+				if (!grid3d->zscp.Iam)
+				{
+					nrbl = lsub[0]; /*number of L blocks */
+					len = lsub[1];	/* LDA of the nzval[] */
+					len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
+					len2 = SuperSize(jb) * len;
+				}
+
+				/*bcast lsub len*/
+				MPI_Bcast(&len1, 1, mpi_int_t, 0, grid3d->zscp.comm);
+
+				/*allocate lsub*/
+				if (grid3d->zscp.Iam)
 #ifdef MPI_MALLOC
-		    MPI_INT_ALLOC(lsub, len1);
+					MPI_INT_ALLOC(lsub, len1);
 #else
-		    
-		    lsub = INT_T_ALLOC(len1);
+
+					lsub = INT_T_ALLOC(len1);
 #endif
-		    /*now broadcast lsub*/
-		    MPI_Bcast( lsub, len1, mpi_int_t, 0,  grid3d->zscp.comm);
-
-		    /*set up pointer*/
-		    Lrowind_bc_ptr[ljb] = lsub;
-		    
-		    /*bcast lnzval len*/
-		    MPI_Bcast( &len2, 1, mpi_int_t, 0,  grid3d->zscp.comm);
-		    
-		    /*allocate space for nzval*/
-		    if (grid3d->zscp.Iam)
+				/*now broadcast lsub*/
+				MPI_Bcast(lsub, len1, mpi_int_t, 0, grid3d->zscp.comm);
+
+				/*set up pointer*/
+				Lrowind_bc_ptr[ljb] = lsub;
+
+				/*bcast lnzval len*/
+				MPI_Bcast(&len2, 1, mpi_int_t, 0, grid3d->zscp.comm);
+
+				/*allocate space for nzval*/
+				if (grid3d->zscp.Iam)
 #ifdef MPI_MALLOC
-			MPI_DATATYPE_ALLOC(lnzval, len2);
+					MPI_DATATYPE_ALLOC(lnzval, len2);
 #else
-		        lnzval = doubleCalloc_dist(len2);
+					lnzval = doubleCalloc_dist(len2);
 #endif
-		    
-		    /*bcast nonzero values*/
-		    MPI_Bcast( lnzval, len2, MPI_DOUBLE, 0,  grid3d->zscp.comm);
-		    
-		    /*setup the pointers*/
-		    Lnzval_bc_ptr[ljb] = lnzval;
 
-		} /* end if flag */
+				/*bcast nonzero values*/
+				MPI_Bcast(lnzval, len2, MPI_DOUBLE, 0, grid3d->zscp.comm);
 
-	} /* end if mycol == pc */
-    } /* end for jb ... */
+				/*setup the pointers*/
+				Lnzval_bc_ptr[ljb] = lnzval;
 
-    return 0;
+			} /* end if flag */
+
+		} /* end if mycol == pc */
+	}	  /* end for jb ... */
+
+	return 0;
 } /* dscatter3dLPanels */
 
-int_t dcollect3dLpanels(int_t layer, int_t nsupers, LUstruct_t * LUstruct,
-		       gridinfo3d_t* grid3d)
+int_t dcollect3dLpanels(int_t layer, int_t nsupers, LUstruct_t *LUstruct,
+						gridinfo3d_t *grid3d)
 {
 
-    LocalLU_t *Llu = LUstruct->Llu;
-    int_t* xsup = LUstruct->Glu_persist->xsup;
-    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
-    double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
-    gridinfo_t* grid = &(grid3d->grid2d);
+	LocalLU_t *Llu = LUstruct->Llu;
+	int_t *xsup = LUstruct->Glu_persist->xsup;
+	int_t **Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+	double **Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+	gridinfo_t *grid = &(grid3d->grid2d);
 
-    int_t iam = grid->iam;
-    int_t mycol = MYCOL (iam, grid);
+	int_t iam = grid->iam;
+	int_t mycol = MYCOL(iam, grid);
 
-    /*start broadcasting blocks*/
-    for (int_t jb = 0; jb < nsupers; ++jb)   /* for each block column ... */
-    {
-	int_t pc = PCOL( jb, grid );
-	if (mycol == pc)
+	/*start broadcasting blocks*/
+	for (int_t jb = 0; jb < nsupers; ++jb) /* for each block column ... */
 	{
-	    int_t ljb = LBj( jb, grid ); /* Local block number */
-	    int_t  *lsub;
-	    double* lnzval;
-	    lsub = Lrowind_bc_ptr[ljb];
-	    lnzval = Lnzval_bc_ptr[ljb];
-		    
-	    if (lsub != NULL)
-	    {
-	        int_t len   = lsub[1];       /* LDA of the nzval[] */
-		int_t len2  = SuperSize(jb) * len; /*size of nzval of L panel */
-			    
-	        if (grid3d->zscp.Iam == layer)
-		{
-		    MPI_Send(lnzval, len2, MPI_DOUBLE, 0, jb, grid3d->zscp.comm);
-		}
-		if (!grid3d->zscp.Iam)
+		int_t pc = PCOL(jb, grid);
+		if (mycol == pc)
 		{
-		    MPI_Status status;
-		    MPI_Recv(lnzval, len2, MPI_DOUBLE, layer, jb, grid3d->zscp.comm, &status);
+			int_t ljb = LBj(jb, grid); /* Local block number */
+			int_t *lsub;
+			double *lnzval;
+			lsub = Lrowind_bc_ptr[ljb];
+			lnzval = Lnzval_bc_ptr[ljb];
+
+			if (lsub != NULL)
+			{
+				int_t len = lsub[1];			  /* LDA of the nzval[] */
+				int_t len2 = SuperSize(jb) * len; /*size of nzval of L panel */
+
+				if (grid3d->zscp.Iam == layer)
+				{
+					MPI_Send(lnzval, len2, MPI_DOUBLE, 0, jb, grid3d->zscp.comm);
+				}
+				if (!grid3d->zscp.Iam)
+				{
+					MPI_Status status;
+					MPI_Recv(lnzval, len2, MPI_DOUBLE, layer, jb, grid3d->zscp.comm, &status);
+				}
+			}
 		}
-	     }
-	}
-    } /* for jb ... */
-    return 0;
+	} /* for jb ... */
+	return 0;
 }
 
-int_t dcollect3dUpanels(int_t layer, int_t nsupers, LUstruct_t * LUstruct,
-      			 gridinfo3d_t* grid3d)
+int_t dcollect3dUpanels(int_t layer, int_t nsupers, LUstruct_t *LUstruct,
+						gridinfo3d_t *grid3d)
 {
-    LocalLU_t *Llu = LUstruct->Llu;
-    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
-    double** Unzval_br_ptr = Llu->Unzval_br_ptr;
-    gridinfo_t* grid = &(grid3d->grid2d);
-    
-    int_t k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
-    for ( int_t lb = 0; lb < k; ++lb)
-    {
-	int_t *usub;
-	usub =  Ufstnz_br_ptr[lb];
-	double * uval = Unzval_br_ptr[lb];
-	    
-	if (usub)
+	LocalLU_t *Llu = LUstruct->Llu;
+	int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+	double **Unzval_br_ptr = Llu->Unzval_br_ptr;
+	gridinfo_t *grid = &(grid3d->grid2d);
+
+	int_t k = CEILING(nsupers, grid->nprow); /* Number of local block rows */
+	for (int_t lb = 0; lb < k; ++lb)
 	{
-	    /* code */
-	    int lenv = usub[1];
-	    if (grid3d->zscp.Iam == layer)
-		{
-		    MPI_Send(uval, lenv, MPI_DOUBLE, 0, lb, grid3d->zscp.comm);
-		}
-		    
-	    if (!grid3d->zscp.Iam)
+		int_t *usub;
+		usub = Ufstnz_br_ptr[lb];
+		double *uval = Unzval_br_ptr[lb];
+
+		if (usub)
 		{
-		    MPI_Status status;
-		    MPI_Recv(uval, lenv, MPI_DOUBLE, layer, lb, grid3d->zscp.comm, &status);
+			/* code */
+			int lenv = usub[1];
+			if (grid3d->zscp.Iam == layer)
+			{
+				MPI_Send(uval, lenv, MPI_DOUBLE, 0, lb, grid3d->zscp.comm);
+			}
+
+			if (!grid3d->zscp.Iam)
+			{
+				MPI_Status status;
+				MPI_Recv(uval, lenv, MPI_DOUBLE, layer, lb, grid3d->zscp.comm, &status);
+			}
 		}
-	}
-    } /* for lb ... */
-    return 0;
+	} /* for lb ... */
+	return 0;
 }
 
 /* Gather the LU factors on layer-0 */
-int_t dp3dCollect(int_t layer, int_t n, LUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+int_t dp3dCollect(int_t layer, int_t n, LUstruct_t *LUstruct, gridinfo3d_t *grid3d)
 {
-    int_t nsupers = getNsupers(n, LUstruct);
-    dcollect3dLpanels(layer, nsupers,  LUstruct, grid3d);
-    dcollect3dUpanels(layer,  nsupers, LUstruct, grid3d);
-    return 0;
+	int_t nsupers = getNsupers(n, LUstruct);
+	dcollect3dLpanels(layer, nsupers, LUstruct, grid3d);
+	dcollect3dUpanels(layer, nsupers, LUstruct, grid3d);
+	return 0;
 }
 
-
 /* Zero out LU non zero entries */
-int_t dzeroSetLU(int_t nnodes, int_t* nodeList, LUstruct_t *LUstruct,
-      		 gridinfo3d_t* grid3d)
+int_t dzeroSetLU(int_t nnodes, int_t *nodeList, LUstruct_t *LUstruct,
+				 gridinfo3d_t *grid3d)
 {
-    LocalLU_t *Llu = LUstruct->Llu;
-    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
-    double** Unzval_br_ptr = Llu->Unzval_br_ptr;
-    
-    int_t* xsup = LUstruct->Glu_persist->xsup;
-    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
-    double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
-    gridinfo_t* grid = &(grid3d->grid2d);
-    
-    int_t iam = grid->iam;
-    
-    int_t myrow = MYROW (iam, grid);
-    int_t mycol = MYCOL (iam, grid);
-    
-    /*first setting the L blocks to zero*/
-    for (int_t node = 0; node < nnodes; ++node)   /* for each block column ... */
+	LocalLU_t *Llu = LUstruct->Llu;
+	int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+	double **Unzval_br_ptr = Llu->Unzval_br_ptr;
+
+	int_t *xsup = LUstruct->Glu_persist->xsup;
+	int_t **Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+	double **Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+	gridinfo_t *grid = &(grid3d->grid2d);
+
+	int_t iam = grid->iam;
+
+	int_t myrow = MYROW(iam, grid);
+	int_t mycol = MYCOL(iam, grid);
+
+	/*first setting the L blocks to zero*/
+	for (int_t node = 0; node < nnodes; ++node) /* for each block column ... */
 	{
-	    
-	    int_t jb = nodeList[node];
-	    int_t pc = PCOL( jb, grid );
-	    if (mycol == pc)
+
+		int_t jb = nodeList[node];
+		int_t pc = PCOL(jb, grid);
+		if (mycol == pc)
 		{
-		    int_t ljb = LBj( jb, grid ); /* Local block number */
-		    int_t  *lsub;
-		    double* lnzval;
-		    lsub = Lrowind_bc_ptr[ljb];
-		    lnzval = Lnzval_bc_ptr[ljb];
-		    
-		    if (lsub != NULL)
+			int_t ljb = LBj(jb, grid); /* Local block number */
+			int_t *lsub;
+			double *lnzval;
+			lsub = Lrowind_bc_ptr[ljb];
+			lnzval = Lnzval_bc_ptr[ljb];
+
+			if (lsub != NULL)
 			{
-			    int_t len   = lsub[1];       /* LDA of the nzval[] */
-			    int_t len2  = SuperSize(jb) * len;	/*size of nzval of L panel */
-			    memset( lnzval, 0, len2 * sizeof(double) );
+				int_t len = lsub[1];			  /* LDA of the nzval[] */
+				int_t len2 = SuperSize(jb) * len; /*size of nzval of L panel */
+				memset(lnzval, 0, len2 * sizeof(double));
 			}
 		}
 	}
 
-    for (int_t node = 0; node < nnodes; ++node)   /* for each block column ... */
+	for (int_t node = 0; node < nnodes; ++node) /* for each block column ... */
 	{
-	    
-	    int_t ib = nodeList[node];
-	    int_t pr = PROW( ib, grid );
-	    if (myrow == pr)
+
+		int_t ib = nodeList[node];
+		int_t pr = PROW(ib, grid);
+		if (myrow == pr)
 		{
-		    int_t lib = LBi( ib, grid ); /* Local block number */
-		    int_t  *usub;
-		    double* unzval;
-		    usub = Ufstnz_br_ptr[lib];
-		    unzval = Unzval_br_ptr[lib];
-		    
-		    if (usub != NULL)
+			int_t lib = LBi(ib, grid); /* Local block number */
+			int_t *usub;
+			double *unzval;
+			usub = Ufstnz_br_ptr[lib];
+			unzval = Unzval_br_ptr[lib];
+
+			if (usub != NULL)
 			{
-			    int lenv = usub[1];
-			    memset( unzval, 0, lenv * sizeof(double) );
+				int lenv = usub[1];
+				memset(unzval, 0, lenv * sizeof(double));
 			}
 		}
 	}
-    
-    return 0;
-}
 
+	return 0;
+}
 
 int_t dreduceAncestors3d(int_t sender, int_t receiver,
-                        int_t nnodes, int_t* nodeList,
-                        double* Lval_buf, double* Uval_buf,
-                        LUstruct_t* LUstruct,  gridinfo3d_t* grid3d, SCT_t* SCT)
+						 int_t nnodes, int_t *nodeList,
+						 double *Lval_buf, double *Uval_buf,
+						 LUstruct_t *LUstruct, gridinfo3d_t *grid3d, SCT_t *SCT)
 {
-    double alpha = 1.0, beta = 1.0;	
-    int_t myGrid = grid3d->zscp.Iam;
-    
-    /*first setting the L blocks to zero*/
-    for (int_t node = 0; node < nnodes; ++node)   /* for each block column ... */
+	double alpha = 1.0, beta = 1.0;
+	int_t myGrid = grid3d->zscp.Iam;
+
+	/*first setting the L blocks to zero*/
+	for (int_t node = 0; node < nnodes; ++node) /* for each block column ... */
 	{
-	    int_t jb = nodeList[node];
-	    
-	    if (myGrid == sender)
+		int_t jb = nodeList[node];
+
+		if (myGrid == sender)
 		{
-		    dzSendLPanel(jb, receiver, LUstruct,  grid3d, SCT);
-		    dzSendUPanel(jb, receiver, LUstruct,  grid3d, SCT);
+			dzSendLPanel(jb, receiver, LUstruct, grid3d, SCT);
+			dzSendUPanel(jb, receiver, LUstruct, grid3d, SCT);
+		}
+		else
+		{
+			dzRecvLPanel(jb, sender, alpha, beta, Lval_buf,
+						 LUstruct, grid3d, SCT);
+			dzRecvUPanel(jb, sender, alpha, beta, Uval_buf,
+						 LUstruct, grid3d, SCT);
 		}
-	    else {
-	        dzRecvLPanel(jb, sender, alpha, beta, Lval_buf,
-                                LUstruct, grid3d, SCT);
-		dzRecvUPanel(jb, sender, alpha, beta, Uval_buf,
-                                LUstruct,  grid3d, SCT);
-	    }
-	    
 	}
-    return 0;
-    
+	return 0;
 }
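
A note on the alpha/beta pair threaded through the receive path above: dzRecvLPanel and dzRecvUPanel fold the incoming buffer into the local panel as local := alpha * local + beta * received (the complex variants later in this series spell this out as superlu_zscal followed by superlu_zaxpy). With alpha = beta = 1.0, as passed here, ancestor updates accumulate; dgatherFactoredLU below passes alpha = 0.0, beta = 1.0, so the received factored panel simply overwrites the local copy. A minimal sketch of that reduce step, with illustrative names only:

static void reduce_panel(double alpha, double beta,
                         double *local, const double *received, int len)
{
    /* local := alpha * local + beta * received, elementwise */
    for (int i = 0; i < len; ++i)
        local[i] = alpha * local[i] + beta * received[i];
}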
 
-
 int_t dgatherFactoredLU(int_t sender, int_t receiver,
-                        int_t nnodes, int_t *nodeList,
-                        dLUValSubBuf_t* LUvsb,
-                        LUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT)
+						int_t nnodes, int_t *nodeList,
+						dLUValSubBuf_t *LUvsb,
+						LUstruct_t *LUstruct, gridinfo3d_t *grid3d, SCT_t *SCT)
 {
-    double alpha = 0.0, beta = 1.0;	
-    double * Lval_buf  = LUvsb->Lval_buf;
-    double * Uval_buf  = LUvsb->Uval_buf;
-    int_t myGrid = grid3d->zscp.Iam;
-    for (int_t node = 0; node < nnodes; ++node)   /* for each block column ... */
+	double alpha = 0.0, beta = 1.0;
+	double *Lval_buf = LUvsb->Lval_buf;
+	double *Uval_buf = LUvsb->Uval_buf;
+	int_t myGrid = grid3d->zscp.Iam;
+	for (int_t node = 0; node < nnodes; ++node) /* for each block column ... */
 	{
-	    int_t jb = nodeList[node];
-	    if (myGrid == sender)
+		int_t jb = nodeList[node];
+		if (myGrid == sender)
 		{
-		    dzSendLPanel(jb, receiver, LUstruct,  grid3d, SCT);
-		    dzSendUPanel(jb, receiver, LUstruct,  grid3d, SCT);
-		    
+			dzSendLPanel(jb, receiver, LUstruct, grid3d, SCT);
+			dzSendUPanel(jb, receiver, LUstruct, grid3d, SCT);
 		}
-	    else
+		else
 		{
-		    dzRecvLPanel(jb, sender, alpha, beta, Lval_buf,
-                                     LUstruct, grid3d, SCT);
-		    dzRecvUPanel(jb, sender, alpha, beta, Uval_buf,
-                                     LUstruct, grid3d, SCT);
+			dzRecvLPanel(jb, sender, alpha, beta, Lval_buf,
+						 LUstruct, grid3d, SCT);
+			dzRecvUPanel(jb, sender, alpha, beta, Uval_buf,
+						 LUstruct, grid3d, SCT);
 		}
 	}
-    return 0;
-    
+	return 0;
 }
 
-
-int_t dinit3DLUstruct( int_t* myTreeIdxs, int_t* myZeroTrIdxs,
-                      int_t* nodeCount, int_t** nodeList, LUstruct_t* LUstruct,
-		      gridinfo3d_t* grid3d)
+int_t dinit3DLUstruct(int_t *myTreeIdxs, int_t *myZeroTrIdxs,
+					  int_t *nodeCount, int_t **nodeList, LUstruct_t *LUstruct,
+					  gridinfo3d_t *grid3d)
 {
-    int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
-    
-    for (int_t lvl = 0; lvl < maxLvl; lvl++)
+	int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
+
+	for (int_t lvl = 0; lvl < maxLvl; lvl++)
 	{
-	    if (myZeroTrIdxs[lvl])
+		if (myZeroTrIdxs[lvl])
 		{
-		    /* code */
-		    int_t treeId = myTreeIdxs[lvl];
-		    dzeroSetLU(nodeCount[treeId], nodeList[treeId], LUstruct, grid3d);
+			/* code */
+			int_t treeId = myTreeIdxs[lvl];
+			dzeroSetLU(nodeCount[treeId], nodeList[treeId], LUstruct, grid3d);
 		}
 	}
-    
-    return 0;
-}
 
+	return 0;
+}
 
-int_t dreduceAllAncestors3d(int_t ilvl, int_t* myNodeCount, int_t** treePerm,
-                             dLUValSubBuf_t* LUvsb, LUstruct_t* LUstruct,
-                             gridinfo3d_t* grid3d, SCT_t* SCT )
+int_t dreduceAllAncestors3d(int_t ilvl, int_t *myNodeCount, int_t **treePerm,
+							dLUValSubBuf_t *LUvsb, LUstruct_t *LUstruct,
+							gridinfo3d_t *grid3d, SCT_t *SCT)
 {
-    double * Lval_buf  = LUvsb->Lval_buf;
-    double * Uval_buf  = LUvsb->Uval_buf;
-    int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
-    int_t myGrid = grid3d->zscp.Iam;
-    
-    int_t sender, receiver;
-    if ((myGrid % (1 << (ilvl + 1))) == 0)
+	double *Lval_buf = LUvsb->Lval_buf;
+	double *Uval_buf = LUvsb->Uval_buf;
+	int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
+	int_t myGrid = grid3d->zscp.Iam;
+
+	int_t sender, receiver;
+	if ((myGrid % (1 << (ilvl + 1))) == 0)
 	{
-	    sender = myGrid + (1 << ilvl);
-	    receiver = myGrid;
+		sender = myGrid + (1 << ilvl);
+		receiver = myGrid;
 	}
-    else
+	else
 	{
-	    sender = myGrid;
-	    receiver = myGrid - (1 << ilvl);
+		sender = myGrid;
+		receiver = myGrid - (1 << ilvl);
 	}
-    
-    /*Reduce all the ancestors*/
-    for (int_t alvl = ilvl + 1; alvl < maxLvl; ++alvl)
+
+	/*Reduce all the ancestors*/
+	for (int_t alvl = ilvl + 1; alvl < maxLvl; ++alvl)
 	{
-	    /* code */
-	    // int_t atree = myTreeIdxs[alvl];
-	    int_t nsAncestor = myNodeCount[alvl];
-	    int_t* cAncestorList = treePerm[alvl];
-	    double treduce = SuperLU_timer_();
-	    dreduceAncestors3d(sender, receiver, nsAncestor, cAncestorList,
-			        Lval_buf, Uval_buf, LUstruct, grid3d, SCT);
-	    SCT->ancsReduce += SuperLU_timer_() - treduce;
-	    
+		/* code */
+		// int_t atree = myTreeIdxs[alvl];
+		int_t nsAncestor = myNodeCount[alvl];
+		int_t *cAncestorList = treePerm[alvl];
+		double treduce = SuperLU_timer_();
+		dreduceAncestors3d(sender, receiver, nsAncestor, cAncestorList,
+						   Lval_buf, Uval_buf, LUstruct, grid3d, SCT);
+		SCT->ancsReduce += SuperLU_timer_() - treduce;
 	}
-    return 0;
+	return 0;
 }
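
The sender/receiver arithmetic above pairs z-grids as a binary reduction tree: at level ilvl, every grid whose rank is a multiple of 2^(ilvl+1) receives from the partner 2^ilvl above it, and that partner sends. A standalone sketch (assuming Np is a power of two, as the 3D code requires) that prints the resulting schedule:

#include <stdio.h>

int main(void)
{
    int Np = 8; /* number of z-grids; illustrative */
    for (int ilvl = 0; (1 << ilvl) < Np; ++ilvl)
        for (int r = 0; r < Np; r += 1 << (ilvl + 1))
            printf("level %d: grid %d sends to grid %d\n",
                   ilvl, r + (1 << ilvl), r);
    return 0;
}

After the last level only grid 0 remains a receiver, which is why the gathered factors end up on layer 0.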
 
-int_t dgatherAllFactoredLU( trf3Dpartition_t*  trf3Dpartition,
-			   LUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT )
+int_t dgatherAllFactoredLU(trf3Dpartition_t *trf3Dpartition,
+						   LUstruct_t *LUstruct, gridinfo3d_t *grid3d, SCT_t *SCT)
 {
-    int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
-    int_t myGrid = grid3d->zscp.Iam;
-    int_t* myZeroTrIdxs = trf3Dpartition->myZeroTrIdxs;
-    sForest_t** sForests = trf3Dpartition->sForests;
-    dLUValSubBuf_t*  LUvsb =  trf3Dpartition->LUvsb;
-    int_t*  gNodeCount = getNodeCountsFr(maxLvl, sForests);
-    int_t** gNodeLists = getNodeListFr(maxLvl, sForests);
-    
-    for (int_t ilvl = 0; ilvl < maxLvl - 1; ++ilvl)
+	int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
+	int_t myGrid = grid3d->zscp.Iam;
+	int_t *myZeroTrIdxs = trf3Dpartition->myZeroTrIdxs;
+	sForest_t **sForests = trf3Dpartition->sForests;
+	dLUValSubBuf_t *LUvsb = trf3Dpartition->LUvsb;
+	int_t *gNodeCount = getNodeCountsFr(maxLvl, sForests);
+	int_t **gNodeLists = getNodeListFr(maxLvl, sForests);
+
+	for (int_t ilvl = 0; ilvl < maxLvl - 1; ++ilvl)
 	{
-	    /* code */
-	    int_t sender, receiver;
-	    if (!myZeroTrIdxs[ilvl])
+		/* code */
+		int_t sender, receiver;
+		if (!myZeroTrIdxs[ilvl])
 		{
-		    if ((myGrid % (1 << (ilvl + 1))) == 0)
+			if ((myGrid % (1 << (ilvl + 1))) == 0)
 			{
-			    sender = myGrid + (1 << ilvl);
-			    receiver = myGrid;
+				sender = myGrid + (1 << ilvl);
+				receiver = myGrid;
 			}
-		    else
+			else
 			{
-			    sender = myGrid;
-			    receiver = myGrid - (1 << ilvl);
+				sender = myGrid;
+				receiver = myGrid - (1 << ilvl);
 			}
-		    
-		    for (int_t alvl = 0; alvl <= ilvl; alvl++)
+
+			for (int_t alvl = 0; alvl <= ilvl; alvl++)
 			{
-			    int_t diffLvl  = ilvl - alvl;
-			    int_t numTrees = 1 << diffLvl;
-			    int_t blvl = maxLvl - alvl - 1;
-			    int_t st = (1 << blvl) - 1 + (sender >> alvl);
-			    
-			    for (int_t tr = st; tr < st + numTrees; ++tr)
+				int_t diffLvl = ilvl - alvl;
+				int_t numTrees = 1 << diffLvl;
+				int_t blvl = maxLvl - alvl - 1;
+				int_t st = (1 << blvl) - 1 + (sender >> alvl);
+
+				for (int_t tr = st; tr < st + numTrees; ++tr)
 				{
-				    /* code */
-				    dgatherFactoredLU(sender, receiver,
-						     gNodeCount[tr], gNodeLists[tr],
-						     LUvsb,
-						     LUstruct, grid3d, SCT );
+					/* code */
+					dgatherFactoredLU(sender, receiver,
+									  gNodeCount[tr], gNodeLists[tr],
+									  LUvsb,
+									  LUstruct, grid3d, SCT);
 				}
 			}
-		    
 		}
 	} /* for ilvl ... */
-    	
-    SUPERLU_FREE(gNodeCount); // sherry added
-    SUPERLU_FREE(gNodeLists);
 
-    return 0;
-} /* dgatherAllFactoredLU */
+	SUPERLU_FREE(gNodeCount); // sherry added
+	SUPERLU_FREE(gNodeLists);
 
+	return 0;
+} /* dgatherAllFactoredLU */
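
The tree-index arithmetic in the inner loops above appears to follow a heap-style, breadth-first numbering of the supernodal forests: with maxLvl levels, level blvl holds trees (1 << blvl) - 1 through (1 << (blvl+1)) - 2, so the subtree that z-grid `sender` covers at ancestor level alvl starts at st = (1 << blvl) - 1 + (sender >> alvl), and the 2^(ilvl-alvl) trees from st onward are the ones whose panels dgatherFactoredLU forwards for that pair. A small standalone check of that indexing, under the assumptions just stated (illustrative values only):

#include <stdio.h>

int main(void)
{
    int maxLvl = 3; /* forest of 2^maxLvl - 1 = 7 trees, z-grids 0..3 */
    for (int ilvl = 0; ilvl < maxLvl - 1; ++ilvl)
        for (int sender = 0; sender < (1 << (maxLvl - 1)); ++sender)
            for (int alvl = 0; alvl <= ilvl; ++alvl)
            {
                int numTrees = 1 << (ilvl - alvl);
                int blvl = maxLvl - alvl - 1;
                int st = (1 << blvl) - 1 + (sender >> alvl);
                printf("ilvl %d, sender %d, alvl %d: trees %d..%d\n",
                       ilvl, sender, alvl, st, st + numTrees - 1);
            }
    return 0;
}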
diff --git a/SRC/pdgstrf2.c b/SRC/pdgstrf2.c
index 1064dca0..2c94f51c 100644
--- a/SRC/pdgstrf2.c
+++ b/SRC/pdgstrf2.c
@@ -9,7 +9,6 @@ The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
-
 /*! @file
  * \brief Performs panel LU factorization.
  *
@@ -137,24 +136,22 @@ at the top-level directory.
  * 
 */
 /* This pdgstrf2 is based on TRSM function */
-void
-pdgstrf2_trsm
-    (superlu_dist_options_t * options, int_t k0, int_t k, double thresh,
-     Glu_persist_t * Glu_persist, gridinfo_t * grid, LocalLU_t * Llu,
-     MPI_Request * U_diag_blk_send_req, int tag_ub,
-     SuperLUStat_t * stat, int *info)
+void pdgstrf2_trsm(superlu_dist_options_t *options, int_t k0, int_t k, double thresh,
+                   Glu_persist_t *Glu_persist, gridinfo_t *grid, LocalLU_t *Llu,
+                   MPI_Request *U_diag_blk_send_req, int tag_ub,
+                   SuperLUStat_t *stat, int *info)
 {
     /* printf("entering pdgstrf2 %d \n", grid->iam); */
     int cols_left, iam, l, pkk, pr;
     int incx = 1, incy = 1;
-    int nsupr;              /* number of rows in the block (LDA) */
-    int nsupc;              /* number of columns in the block */
+    int nsupr; /* number of rows in the block (LDA) */
+    int nsupc; /* number of columns in the block */
     int luptr;
     int_t i, myrow, krow, j, jfst, jlst, u_diag_cnt;
     int_t *xsup = Glu_persist->xsup;
     double *lusup, temp;
-    double *ujrow, *ublk_ptr;   /* pointer to the U block */
+    double *ujrow, *ublk_ptr; /* pointer to the U block */
     double alpha = -1, zero = 0.0;
     int_t Pr;
     MPI_Status status;
@@ -164,68 +161,77 @@ pdgstrf2_trsm
     /* Initialization. */
     iam = grid->iam;
     Pr = grid->nprow;
-    myrow = MYROW (iam, grid);
-    krow = PROW (k, grid);
-    pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
-    j = LBj (k, grid);      /* Local block number */
-    jfst = FstBlockC (k);
-    jlst = FstBlockC (k + 1);
+    myrow = MYROW(iam, grid);
+    krow = PROW(k, grid);
+    pkk = PNUM(PROW(k, grid), PCOL(k, grid), grid);
+    j = LBj(k, grid); /* Local block number */
+    jfst = FstBlockC(k);
+    jlst = FstBlockC(k + 1);
     lusup = Llu->Lnzval_bc_ptr[j];
-    nsupc = SuperSize (k);
+    nsupc = SuperSize(k);
     if (Llu->Lrowind_bc_ptr[j])
         nsupr = Llu->Lrowind_bc_ptr[j][1];
     else
         nsupr = 0;
 #ifdef PI_DEBUG
-    printf ("rank %d Iter %d k=%d \t dtrsm nsuper %d \n",
-            iam, k0, k, nsupr);
+    printf("rank %d Iter %d k=%d \t dtrsm nsuper %d \n",
+           iam, k0, k, nsupr);
 #endif
     ublk_ptr = ujrow = Llu->ujrow;
-    luptr = 0;              /* Point to the diagonal entries. */
-    cols_left = nsupc;      /* supernode size */
-    int ld_ujrow = nsupc;   /* leading dimension of ujrow */
+    luptr = 0;            /* Point to the diagonal entries. */
+    cols_left = nsupc;    /* supernode size */
+    int ld_ujrow = nsupc; /* leading dimension of ujrow */
     u_diag_cnt = 0;
     incy = ld_ujrow;
 
-    if ( U_diag_blk_send_req &&
-         U_diag_blk_send_req[myrow] != MPI_REQUEST_NULL ) {
+    if (U_diag_blk_send_req &&
+        U_diag_blk_send_req[myrow] != MPI_REQUEST_NULL)
+    {
         /* There are pending sends - wait for all Isend to complete */
-#if ( PROFlevel>=1 )
-        TIC (t1);
+#if (PROFlevel >= 1)
+        TIC(t1);
 #endif
-        for (pr = 0; pr < Pr; ++pr) {
-            if (pr != myrow) {
-                MPI_Wait (U_diag_blk_send_req + pr, &status);
+        for (pr = 0; pr < Pr; ++pr)
+        {
+            if (pr != myrow)
+            {
+                MPI_Wait(U_diag_blk_send_req + pr, &status);
             }
-        }
-#if ( PROFlevel>=1 )
-        TOC (t2, t1);
-        stat->utime[COMM] += t2;
-        stat->utime[COMM_DIAG] += t2;
+        }
+#if (PROFlevel >= 1)
+        TOC(t2, t1);
+        stat->utime[COMM] += t2;
+        stat->utime[COMM_DIAG] += t2;
 #endif
-        /* flag no more outstanding send request. */
-        U_diag_blk_send_req[myrow] = MPI_REQUEST_NULL;
+        /* flag no more outstanding send request. */
+        U_diag_blk_send_req[myrow] = MPI_REQUEST_NULL;
     }
 
-    if (iam == pkk) {           /* diagonal process */
-        /* ++++ First step compute diagonal block ++++++++++ */
-        for (j = 0; j < jlst - jfst; ++j) { /* for each column in panel */
+    if (iam == pkk)
+    { /* diagonal process */
+        /* ++++ First step compute diagonal block ++++++++++ */
+        for (j = 0; j < jlst - jfst; ++j)
+        { /* for each column in panel */
            /* Diagonal pivot */
            i = luptr;
            /* Not to replace zero pivot.  */
-            if (options->ReplaceTinyPivot == YES && lusup[i] != 0.0 ) {
-                if (fabs (lusup[i]) < thresh) { /* Diagonal */
+            if (options->ReplaceTinyPivot == YES && lusup[i] != 0.0)
+            {
+                if (fabs(lusup[i]) < thresh)
+                { /* Diagonal */
 
-#if ( PRNTlevel>=2 )
-                    printf ("(%d) .. col %d, tiny pivot %e ",
-                            iam, jfst + j, lusup[i]);
+#if (PRNTlevel >= 2)
+                    printf("(%d) .. col %d, tiny pivot %e ",
+                           iam, jfst + j, lusup[i]);
 #endif
                    /* Keep the new diagonal entry with the same sign. */
-                    if (lusup[i] < 0) lusup[i] = -thresh;
-                    else lusup[i] = thresh;
-#if ( PRNTlevel>=2 )
-                    printf ("replaced by %e\n", lusup[i]);
+                    if (lusup[i] < 0)
+                        lusup[i] = -thresh;
+                    else
+                        lusup[i] = thresh;
+#if (PRNTlevel >= 2)
+                    printf("replaced by %e\n", lusup[i]);
 #endif
                    ++(stat->TinyPivots);
                }
@@ -238,63 +244,70 @@ pdgstrf2_trsm
            /* storing U in full form */
            int st;
-            for (l = 0; l < cols_left; ++l, i += nsupr, ++u_diag_cnt) {
+            for (l = 0; l < cols_left; ++l, i += nsupr, ++u_diag_cnt)
+            {
                st = j * ld_ujrow + j;
                ublk_ptr[st + l * ld_ujrow] = lusup[i]; /* copy one row of U */
            }
 
-            if ( ujrow[0] == zero ) { /* Test for singularity. */
+            if (ujrow[0] == zero)
+            { /* Test for singularity. */
                *info = j + jfst + 1;
-            } else {            /* Scale the j-th column within diag. block. */
+            }
+            else
+            { /* Scale the j-th column within diag. block. */
                temp = 1.0 / ujrow[0];
                for (i = luptr + 1; i < luptr - j + nsupc; ++i)
-                    lusup[i] *= temp;
+                    lusup[i] *= temp;
                stat->ops[FACT] += nsupc - j - 1;
            }
 
            /* Rank-1 update of the trailing submatrix within diag. block. */
-            if (--cols_left) {
+            if (--cols_left)
+            {
                /* l = nsupr - j - 1; */
-                l = nsupc - j - 1;      /* Piyush */
-                dger_ (&l, &cols_left, &alpha, &lusup[luptr + 1], &incx,
-                       &ujrow[ld_ujrow], &incy, &lusup[luptr + nsupr + 1],
-                       &nsupr);
+                l = nsupc - j - 1; /* Piyush */
+                dger_(&l, &cols_left, &alpha, &lusup[luptr + 1], &incx,
+                      &ujrow[ld_ujrow], &incy, &lusup[luptr + nsupr + 1],
+                      &nsupr);
                stat->ops[FACT] += 2 * l * cols_left;
            }
 
            /* ujrow = ublk_ptr + u_diag_cnt; */
            ujrow = ujrow + ld_ujrow + 1; /* move to next row of U */
-            luptr += nsupr + 1;           /* move to next column */
+            luptr += nsupr + 1; /* move to next column */
 
-        }                       /* for column j ... first loop */
+        } /* for column j ... first loop */
-        /* ++++ Second step compute off-diagonal block with communication ++*/
+        /* ++++ Second step compute off-diagonal block with communication ++*/
         ublk_ptr = ujrow = Llu->ujrow;
 
-        if (U_diag_blk_send_req && iam == pkk) { /* Send the U block downward */
+        if (U_diag_blk_send_req && iam == pkk)
+        { /* Send the U block downward */
            /** ALWAYS SEND TO ALL OTHERS - TO FIX **/
-#if ( PROFlevel>=1 )
-            TIC (t1);
+#if (PROFlevel >= 1)
+            TIC(t1);
 #endif
-            for (pr = 0; pr < Pr; ++pr) {
-                if (pr != krow) {
+            for (pr = 0; pr < Pr; ++pr)
+            {
+                if (pr != krow)
+                {
                    /* tag = ((k0<<2)+2) % tag_ub;        */
                    /* tag = (4*(nsupers+k0)+2) % tag_ub; */
-                    MPI_Isend (ublk_ptr, nsupc * nsupc, MPI_DOUBLE, pr,
-                               SLU_MPI_TAG (4, k0) /* tag */ ,
-                               comm, U_diag_blk_send_req + pr);
-
+                    MPI_Isend(ublk_ptr, nsupc * nsupc, MPI_DOUBLE, pr,
+                              SLU_MPI_TAG(4, k0) /* tag */,
+                              comm, U_diag_blk_send_req + pr);
                }
            }
-#if ( PROFlevel>=1 )
-            TOC (t2, t1);
-            stat->utime[COMM] += t2;
-            stat->utime[COMM_DIAG] += t2;
+#if (PROFlevel >= 1)
+            TOC(t2, t1);
+            stat->utime[COMM] += t2;
+            stat->utime[COMM_DIAG] += t2;
 #endif
-            /* flag outstanding Isend */
-            U_diag_blk_send_req[krow] = (MPI_Request) TRUE; /* Sherry */
+            /* flag outstanding Isend */
+            U_diag_blk_send_req[krow] = (MPI_Request)TRUE; /* Sherry */
         }
 
         /* pragma below would be changed by an MKL call */
@@ -303,20 +316,22 @@ pdgstrf2_trsm
         // n = nsupc;
         double alpha = 1.0;
 #ifdef PI_DEBUG
-        printf ("calling dtrsm\n");
-        printf ("dtrsm diagonal param 11: %d \n", nsupr);
+        printf("calling dtrsm\n");
+        printf("dtrsm diagonal param 11: %d \n", nsupr);
 #endif
-#if defined (USE_VENDOR_BLAS)
-        dtrsm_ ("R", "U", "N", "N", &l, &nsupc,
-                &alpha, ublk_ptr, &ld_ujrow, &lusup[nsupc], &nsupr,
-                1, 1, 1, 1);
+#if defined(USE_VENDOR_BLAS)
+        dtrsm_("R", "U", "N", "N", &l, &nsupc,
+               &alpha, ublk_ptr, &ld_ujrow, &lusup[nsupc], &nsupr,
+               1, 1, 1, 1);
 #else
-        dtrsm_ ("R", "U", "N", "N", &l, &nsupc,
-                &alpha, ublk_ptr, &ld_ujrow, &lusup[nsupc], &nsupr);
+        dtrsm_("R", "U", "N", "N", &l, &nsupc,
+               &alpha, ublk_ptr, &ld_ujrow, &lusup[nsupc], &nsupr);
 #endif
-        stat->ops[FACT] += (flops_t) nsupc * (nsupc+1) * l;
-    } else {  /* non-diagonal process */
+        stat->ops[FACT] += (flops_t)nsupc * (nsupc + 1) * l;
+    }
+    else
+    { /* non-diagonal process */
         /* ================================================================== *
          * Receive the diagonal block of U for panel factorization of L(:,k). *
         * Note: we block for panel factorization of L(:,k), but panel        *
@@ -326,92 +341,79 @@ pdgstrf2_trsm
         /* tag = ((k0<<2)+2) % tag_ub;        */
         /* tag = (4*(nsupers+k0)+2) % tag_ub; */
         // printf("hello message receiving%d %d\n",(nsupc*(nsupc+1))>>1,SLU_MPI_TAG(4,k0));
-#if ( PROFlevel>=1 )
-        TIC (t1);
+#if (PROFlevel >= 1)
+        TIC(t1);
 #endif
-        MPI_Recv (ublk_ptr, (nsupc * nsupc), MPI_DOUBLE, krow,
-                  SLU_MPI_TAG (4, k0) /* tag */ ,
-                  comm, &status);
-#if ( PROFlevel>=1 )
-        TOC (t2, t1);
-        stat->utime[COMM] += t2;
-        stat->utime[COMM_DIAG] += t2;
+        MPI_Recv(ublk_ptr, (nsupc * nsupc), MPI_DOUBLE, krow,
+                 SLU_MPI_TAG(4, k0) /* tag */,
+                 comm, &status);
+#if (PROFlevel >= 1)
+        TOC(t2, t1);
+        stat->utime[COMM] += t2;
+        stat->utime[COMM_DIAG] += t2;
 #endif
-        if (nsupr > 0) {
+        if (nsupr > 0)
+        {
            double alpha = 1.0;
 #ifdef PI_DEBUG
-            printf ("dtrsm non diagonal param 11: %d \n", nsupr);
+            printf("dtrsm non diagonal param 11: %d \n", nsupr);
            if (!lusup)
-                printf (" Rank :%d \t Empty block column occurred :\n", iam);
+                printf(" Rank :%d \t Empty block column occurred :\n", iam);
 #endif
-#if defined (USE_VENDOR_BLAS)
-            dtrsm_ ("R", "U", "N", "N", &nsupr, &nsupc,
-                    &alpha, ublk_ptr, &ld_ujrow, lusup, &nsupr, 1, 1, 1, 1);
+#if defined(USE_VENDOR_BLAS)
+            dtrsm_("R", "U", "N", "N", &nsupr, &nsupc,
+                   &alpha, ublk_ptr, &ld_ujrow, lusup, &nsupr, 1, 1, 1, 1);
 #else
-            dtrsm_ ("R", "U", "N", "N", &nsupr, &nsupc,
-                    &alpha, ublk_ptr, &ld_ujrow, lusup, &nsupr);
+            dtrsm_("R", "U", "N", "N", &nsupr, &nsupc,
+                   &alpha, ublk_ptr, &ld_ujrow, lusup, &nsupr);
 #endif
-            stat->ops[FACT] += (flops_t) nsupc * (nsupc+1) * nsupr;
+            stat->ops[FACT] += (flops_t)nsupc * (nsupc + 1) * nsupr;
        }
     } /* end if pkk ... */
 
     /* printf("exiting pdgstrf2 %d \n", grid->iam); */
-}  /* PDGSTRF2_trsm */
+} /* PDGSTRF2_trsm */
 
 /*****************************************************************************
  * The following functions are for the new pdgstrf2_dtrsm in the 3D code.
  *****************************************************************************/
-static
-int_t LpanelUpdate(int off0, int nsupc, double* ublk_ptr, int ld_ujrow,
-                   double* lusup, int nsupr, SCT_t* SCT)
+static int_t LpanelUpdate(int off0, int nsupc, double *ublk_ptr, int ld_ujrow,
+                          double *lusup, int nsupr, SCT_t *SCT)
 {
     int_t l = nsupr - off0;
     double alpha = 1.0;
     unsigned long long t1 = _rdtsc();
 
-#define GT 32
+#define GT 32
 #pragma omp parallel for
     for (int i = 0; i < CEILING(l, GT); ++i)
     {
        int_t off = i * GT;
        int len = SUPERLU_MIN(GT, l - i * GT);
-// #if 1
-// #if defined (USE_VENDOR_BLAS)
-//         dtrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha,
-//                 ublk_ptr, &ld_ujrow, &lusup[off0 + off], &nsupr,
-//                 1, 1, 1, 1);
-// #else
-//         dtrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha,
-//                 ublk_ptr, &ld_ujrow, &lusup[off0 + off], &nsupr);
-// #endif
-// #else
-//         cblas_dtrsm (CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit,
-//                      len, nsupc, alpha, ublk_ptr, ld_ujrow, &lusup[off0 + off], nsupr);
-// #endif
-        superlu_dtrsm ("R", "U", "N", "N",
-                       len, nsupc, alpha, ublk_ptr, ld_ujrow, &lusup[off0 + off], nsupr);
+        superlu_dtrsm("R", "U", "N", "N",
+                      len, nsupc, alpha, ublk_ptr, ld_ujrow, &lusup[off0 + off], nsupr);
     } /* for i = ... */
 
     t1 = _rdtsc() - t1;
-    SCT->trf2_flops += (double) l * (double)nsupc * (double)nsupc;
+    SCT->trf2_flops += (double)l * (double)nsupc * (double)nsupc;
     SCT->trf2_time += t1;
     SCT->L_PanelUpdate_tl += t1;
 
     return 0;
 }
 
 #pragma GCC push_options
-#pragma GCC optimize ("O0")
+#pragma GCC optimize("O0")
 /*factorizes the diagonal block; called from process that owns the (k,k) block*/
 void Local_Dgstrf2(superlu_dist_options_t *options, int_t k, double thresh,
                    double *BlockUFactor, /*factored U is overwritten here*/
                    Glu_persist_t *Glu_persist, gridinfo_t *grid, LocalLU_t *Llu,
-                   SuperLUStat_t *stat, int *info, SCT_t* SCT)
+                   SuperLUStat_t *stat, int *info, SCT_t *SCT)
 {
     //unsigned long long t1 = _rdtsc();
     int_t *xsup = Glu_persist->xsup;
@@ -419,11 +421,11 @@ void Local_Dgstrf2(superlu_dist_options_t *options, int_t k, double thresh,
     // printf("Entering dgetrf2 %d \n", k);
     /* Initialization. */
-    int_t lk = LBj (k, grid);   /* Local block number */
-    int_t jfst = FstBlockC (k);
-    int_t jlst = FstBlockC (k + 1);
+    int_t lk = LBj(k, grid); /* Local block number */
+    int_t jfst = FstBlockC(k);
+    int_t jlst = FstBlockC(k + 1);
     double *lusup = Llu->Lnzval_bc_ptr[lk];
-    int_t nsupc = SuperSize (k);
+    int_t nsupc = SuperSize(k);
     int nsupr;
     if (Llu->Lrowind_bc_ptr[lk])
         nsupr = Llu->Lrowind_bc_ptr[lk][1];
@@ -431,31 +433,34 @@ void Local_Dgstrf2(superlu_dist_options_t *options, int_t k, double thresh,
         nsupr = 0;
     double *ublk_ptr = BlockUFactor;
     double *ujrow = BlockUFactor;
-    int_t luptr = 0;            /* Point_t to the diagonal entries. */
-    int cols_left = nsupc;      /* supernode size */
+    int_t luptr = 0;       /* Point_t to the diagonal entries. */
+    int cols_left = nsupc; /* supernode size */
     int_t u_diag_cnt = 0;
-    int_t ld_ujrow = nsupc;     /* leading dimension of ujrow */
+    int_t ld_ujrow = nsupc; /* leading dimension of ujrow */
     int incx = 1;
     int incy = ld_ujrow;
 
-    for (int_t j = 0; j < jlst - jfst; ++j)   /* for each column in panel */
+    for (int_t j = 0; j < jlst - jfst; ++j) /* for each column in panel */
     {
        /* Diagonal pivot */
        int_t i = luptr;
        /* Not to replace zero pivot.  */
        if (options->ReplaceTinyPivot == YES && lusup[i] != 0.0)
        {
-            if (fabs (lusup[i]) < thresh) {  /* Diagonal */
+            if (fabs(lusup[i]) < thresh)
+            { /* Diagonal */
 
-#if ( PRNTlevel>=2 )
-                printf ("(%d) .. col %d, tiny pivot %e ",
-                        iam, jfst + j, lusup[i]);
+#if (PRNTlevel >= 2)
+                printf("(%d) .. col %d, tiny pivot %e ",
+                       iam, jfst + j, lusup[i]);
 #endif
                /* Keep the new diagonal entry with the same sign. */
-                if (lusup[i] < 0) lusup[i] = -thresh;
-                else lusup[i] = thresh;
-#if ( PRNTlevel>=2 )
-                printf ("replaced by %e\n", lusup[i]);
+                if (lusup[i] < 0)
+                    lusup[i] = -thresh;
+                else
+                    lusup[i] = thresh;
+#if (PRNTlevel >= 2)
+                printf("replaced by %e\n", lusup[i]);
 #endif
                ++(stat->TinyPivots);
            }
@@ -467,11 +472,11 @@ void Local_Dgstrf2(superlu_dist_options_t *options, int_t k, double thresh,
            ublk_ptr[st + l * ld_ujrow] = lusup[i]; /* copy one row of U */
        }
 
-        if (ujrow[0] == zero)   /* Test for singularity. */
+        if (ujrow[0] == zero) /* Test for singularity. */
        {
            *info = j + jfst + 1;
        }
-        else                    /* Scale the j-th column. */
+        else /* Scale the j-th column. */
        {
            double temp;
           temp = 1.0 / ujrow[0];
@@ -486,23 +491,18 @@ void Local_Dgstrf2(superlu_dist_options_t *options, int_t k, double thresh,
           /*following must be int*/
           int l = nsupc - j - 1;
 
-            /* Rank-1 update */
-#if 1
-            dger_ (&l, &cols_left, &alpha, &lusup[luptr + 1], &incx,
-                   &ujrow[ld_ujrow], &incy, &lusup[luptr + nsupr + 1], &nsupr);
-#else
-            cblas_dger (CblasColMajor, l, cols_left, alpha, &lusup[luptr + 1], incx,
-                        &ujrow[ld_ujrow], incy, &lusup[luptr + nsupr + 1],
-                        nsupr);
-#endif
+            /* Rank-1 update */
+
+            superlu_dger(l, cols_left, alpha, &lusup[luptr + 1], incx,
+                         &ujrow[ld_ujrow], incy, &lusup[luptr + nsupr + 1],
+                         nsupr);
            stat->ops[FACT] += 2 * l * cols_left;
        }
 
        ujrow = ujrow + ld_ujrow + 1; /* move to next row of U */
        luptr += nsupr + 1;           /* move to next column */
 
-    }                       /* for column j ... first loop */
-
+    } /* for column j ... first loop */
 
     //int_t thread_id = omp_get_thread_num();
     // SCT->Local_Dgstrf2_Thread_tl[thread_id * CACHE_LINE_SIZE] += (double) ( _rdtsc() - t1);
@@ -571,22 +571,21 @@ void Local_Dgstrf2(superlu_dist_options_t *options, int_t k, double thresh,
  *
  * 
 */
-void pdgstrf2_xtrsm
-(superlu_dist_options_t *options, int_t nsupers,
- int_t k0, int_t k, double thresh, Glu_persist_t *Glu_persist,
- gridinfo_t *grid, LocalLU_t *Llu, MPI_Request *U_diag_blk_send_req,
- int tag_ub, SuperLUStat_t *stat, int *info, SCT_t *SCT)
+void pdgstrf2_xtrsm(superlu_dist_options_t *options, int_t nsupers,
+                    int_t k0, int_t k, double thresh, Glu_persist_t *Glu_persist,
+                    gridinfo_t *grid, LocalLU_t *Llu, MPI_Request *U_diag_blk_send_req,
+                    int tag_ub, SuperLUStat_t *stat, int *info, SCT_t *SCT)
 {
     int cols_left, iam, pkk;
     int incy = 1;
-    int nsupr;              /* number of rows in the block (LDA) */
+    int nsupr; /* number of rows in the block (LDA) */
     int luptr;
     int_t myrow, krow, j, jfst, jlst, u_diag_cnt;
-    int nsupc;              /* number of columns in the block */
+    int nsupc; /* number of columns in the block */
     int_t *xsup = Glu_persist->xsup;
     double *lusup;
-    double *ujrow, *ublk_ptr;   /* pointer to the U block */
+    double *ujrow, *ublk_ptr; /* pointer to the U block */
     int_t Pr;
 
     /* Quick return. */
@@ -595,23 +594,23 @@ void pdgstrf2_xtrsm
     /* Initialization. */
     iam = grid->iam;
     Pr = grid->nprow;
-    myrow = MYROW (iam, grid);
-    krow = PROW (k, grid);
-    pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
-    j = LBj (k, grid);      /* Local block number */
-    jfst = FstBlockC (k);
-    jlst = FstBlockC (k + 1);
+    myrow = MYROW(iam, grid);
+    krow = PROW(k, grid);
+    pkk = PNUM(PROW(k, grid), PCOL(k, grid), grid);
+    j = LBj(k, grid); /* Local block number */
+    jfst = FstBlockC(k);
+    jlst = FstBlockC(k + 1);
     lusup = Llu->Lnzval_bc_ptr[j];
-    nsupc = SuperSize (k);
+    nsupc = SuperSize(k);
     if (Llu->Lrowind_bc_ptr[j])
         nsupr = Llu->Lrowind_bc_ptr[j][1];
     else
         nsupr = 0;
     ublk_ptr = ujrow = Llu->ujrow;
-    luptr = 0;              /* Point to the diagonal entries. */
-    cols_left = nsupc;      /* supernode size */
-    int ld_ujrow = nsupc;   /* leading dimension of ujrow */
+    luptr = 0;            /* Point to the diagonal entries. */
+    cols_left = nsupc;    /* supernode size */
+    int ld_ujrow = nsupc; /* leading dimension of ujrow */
     u_diag_cnt = 0;
     incy = ld_ujrow;
 
@@ -621,23 +620,23 @@ void pdgstrf2_xtrsm
         Wait_UDiagBlockSend(U_diag_blk_send_req, grid, SCT);
     }
 
-    if (iam == pkk)     /* diagonal process */
+    if (iam == pkk) /* diagonal process */
     {
        /*factorize the diagonal block*/
        Local_Dgstrf2(options, k, thresh, Llu->ujrow, Glu_persist,
                      grid, Llu, stat, info, SCT);
        ublk_ptr = ujrow = Llu->ujrow;
 
-        if (U_diag_blk_send_req && iam == pkk)    /* Send the U block */
+        if (U_diag_blk_send_req && iam == pkk) /* Send the U block */
        {
           dISend_UDiagBlock(k0, ublk_ptr, nsupc * nsupc, U_diag_blk_send_req,
-                             grid, tag_ub);
-            U_diag_blk_send_req[krow] = (MPI_Request) TRUE; /* flag outstanding Isend */
+                              grid, tag_ub);
+            U_diag_blk_send_req[krow] = (MPI_Request)TRUE; /* flag outstanding Isend */
        }
 
-        LpanelUpdate(nsupc, nsupc, ublk_ptr, ld_ujrow, lusup, nsupr, SCT);
+        LpanelUpdate(nsupc, nsupc, ublk_ptr, ld_ujrow, lusup, nsupr, SCT);
     }
-    else    /* non-diagonal process */
+    else /* non-diagonal process */
     {
        /* ================================================ *
         * Receive the diagonal block of U                  *
@@ -646,11 +645,11 @@ void pdgstrf2_xtrsm
         * but panel factorization of U(:,k) don't          *
         * ================================================ */
-        dRecv_UDiagBlock( k0, ublk_ptr, (nsupc * nsupc), krow, grid, SCT, tag_ub);
+        dRecv_UDiagBlock(k0, ublk_ptr, (nsupc * nsupc), krow, grid, SCT, tag_ub);
 
        if (nsupr > 0)
        {
-            LpanelUpdate(0, nsupc, ublk_ptr, ld_ujrow, lusup, nsupr, SCT);
+            LpanelUpdate(0, nsupc, ublk_ptr, ld_ujrow, lusup, nsupr, SCT);
        }
     } /* end if pkk ... */
 
@@ -663,19 +662,20 @@ void pdgstrf2_xtrsm
 
 /* PDGSTRS2 helping kernels*/
 
 int_t dTrs2_GatherU(int_t iukp, int_t rukp, int_t klst,
-                    int_t nsupc, int_t ldu,
-                    int_t *usub,
-                    double* uval, double *tempv)
+                    int_t nsupc, int_t ldu,
+                    int_t *usub,
+                    double *uval, double *tempv)
 {
     double zero = 0.0;
     int_t ncols = 0;
     for (int_t jj = iukp; jj < iukp + nsupc; ++jj)
     {
       int_t segsize = klst - usub[jj];
-        if ( segsize )
+        if (segsize)
       {
          int_t lead_zero = ldu - segsize;
-            for (int_t i = 0; i < lead_zero; ++i) tempv[i] = zero;
+            for (int_t i = 0; i < lead_zero; ++i)
+                tempv[i] = zero;
          tempv += lead_zero;
          for (int_t i = 0; i < segsize; ++i)
             tempv[i] = uval[rukp + i];
@@ -688,8 +688,8 @@ int_t dTrs2_GatherU(int_t iukp, int_t rukp, int_t klst,
 }
 
 int_t dTrs2_ScatterU(int_t iukp, int_t rukp, int_t klst,
-                     int_t nsupc, int_t ldu,
-                     int_t *usub, double* uval, double *tempv)
+                     int_t nsupc, int_t ldu,
+                     int_t *usub, double *uval, double *tempv)
 {
     for (int_t jj = 0; jj < nsupc; ++jj)
     {
@@ -710,16 +710,16 @@ int_t dTrs2_ScatterU(int_t iukp, int_t rukp, int_t klst,
 }
 
 int_t dTrs2_GatherTrsmScatter(int_t klst, int_t iukp, int_t rukp,
-                              int_t *usub, double *uval, double *tempv,
-                              int_t knsupc, int nsupr, double *lusup,
-                              Glu_persist_t *Glu_persist)  /*glupersist for xsup for supersize*/
+                              int_t *usub, double *uval, double *tempv,
+                              int_t knsupc, int nsupr, double *lusup,
+                              Glu_persist_t *Glu_persist) /*glupersist for xsup for supersize*/
 {
     double alpha = 1.0;
     int_t *xsup = Glu_persist->xsup;
     // int_t iukp = Ublock_info.iukp;
     // int_t rukp = Ublock_info.rukp;
     int_t gb = usub[iukp];
-    int_t nsupc = SuperSize (gb);
+    int_t nsupc = SuperSize(gb);
     iukp += UB_DESCRIPTOR;
 
     // printf("klst inside task%d\n", );
@@ -727,34 +727,18 @@ int_t dTrs2_GatherTrsmScatter(int_t klst, int_t iukp, int_t rukp,
     int ldu = 0;
     for (int_t jj = iukp; jj < iukp + nsupc; ++jj)
     {
-        ldu = SUPERLU_MAX( klst - usub[jj], ldu) ;
+        ldu = SUPERLU_MAX(klst - usub[jj], ldu);
     }
 
     /*pack U block into a dense Block*/
     int ncols = dTrs2_GatherU(iukp, rukp, klst, nsupc, ldu, usub,
-                              uval, tempv);
+                              uval, tempv);
 
     /*now call dtrsm on packed dense block*/
     int_t luptr = (knsupc - ldu) * (nsupr + 1);
-    // if(ldu>nsupr) printf("nsupr %d ldu %d\n",nsupr,ldu );
-
-// #if 1
-// #if defined (USE_VENDOR_BLAS)
-//     dtrsm_ ("L", "L", "N", "U", &ldu, &ncols, &alpha,
-//             &lusup[luptr], &nsupr, tempv, &ldu,
-//             1, 1, 1, 1);
-// #else
-//     dtrsm_ ("L", "L", "N", "U", &ldu, &ncols, &alpha,
-//             &lusup[luptr], &nsupr, tempv, &ldu);
-// #endif
-// #else
-
-//     cblas_dtrsm (CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasUnit,
-//                  ldu, ncols, alpha, &lusup[luptr], nsupr, tempv, ldu);
-// #endif
-
-    superlu_dtrsm ("L", "L", "N", "U",
-                   ldu, ncols, alpha, &lusup[luptr], nsupr, tempv, ldu);
+
+    superlu_dtrsm("L", "L", "N", "U",
+                  ldu, ncols, alpha, &lusup[luptr], nsupr, tempv, ldu);
 
     /*now scatter the output into sparse U block*/
     dTrs2_ScatterU(iukp, rukp, klst, nsupc, ldu, usub, uval, tempv);
@@ -762,23 +746,22 @@ int_t dTrs2_GatherTrsmScatter(int_t klst, int_t iukp, int_t rukp,
 
     return 0;
 }
 
-#if 1
+#if 1
 /*****************************************************************************
  * The following pdgstrs2_omp is improved for KNL, since Version 5.2.0.
 *****************************************************************************/
-void pdgstrs2_omp
-(int_t k0, int_t k, Glu_persist_t * Glu_persist, gridinfo_t * grid,
- LocalLU_t * Llu, Ublock_info_t *Ublock_info, SuperLUStat_t * stat)
+void pdgstrs2_omp(int_t k0, int_t k, Glu_persist_t *Glu_persist, gridinfo_t *grid,
+                  LocalLU_t *Llu, Ublock_info_t *Ublock_info, SuperLUStat_t *stat)
 {
 #ifdef PI_DEBUG
     printf("====Entering pdgstrs2==== \n");
 #endif
     int iam, pkk;
     int incx = 1;
-    int nsupr;              /* number of rows in the block L(:,k) (LDA) */
+    int nsupr; /* number of rows in the block L(:,k) (LDA) */
     int segsize;
-    int nsupc;              /* number of columns in the block */
+    int nsupc; /* number of columns in the block */
     int_t luptr, iukp, rukp;
     int_t b, gb, j, klst, knsupc, lk, nb;
     int_t *xsup = Glu_persist->xsup;
@@ -792,24 +775,28 @@ void pdgstrs2_omp
 #endif
     /* Quick return. */
-    lk = LBi (k, grid);         /* Local block number */
-    if (!Llu->Unzval_br_ptr[lk]) return;
+    lk = LBi(k, grid); /* Local block number */
+    if (!Llu->Unzval_br_ptr[lk])
+        return;
 
     /* Initialization. */
     iam = grid->iam;
-    pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
+    pkk = PNUM(PROW(k, grid), PCOL(k, grid), grid);
     //int k_row_cycle = k / grid->nprow;  /* for which cycle k exist (to assign rowwise thread blocking) */
     //int gb_col_cycle;  /* cycle through block columns */
-    klst = FstBlockC (k + 1);
-    knsupc = SuperSize (k);
-    usub = Llu->Ufstnz_br_ptr[lk];  /* index[] of block row U(k,:) */
+    klst = FstBlockC(k + 1);
+    knsupc = SuperSize(k);
+    usub = Llu->Ufstnz_br_ptr[lk]; /* index[] of block row U(k,:) */
     uval = Llu->Unzval_br_ptr[lk];
-    if (iam == pkk) {
-        lk = LBj (k, grid);
+    if (iam == pkk)
+    {
+        lk = LBj(k, grid);
        nsupr = Llu->Lrowind_bc_ptr[lk][1]; /* LDA of lusup[] */
        lusup = Llu->Lnzval_bc_ptr[lk];
-    } else {
-        nsupr = Llu->Lsub_buf_2[k0 % (1 + stat->num_look_aheads)][1];   /* LDA of lusup[] */
+    }
+    else
+    {
+        nsupr = Llu->Lsub_buf_2[k0 % (1 + stat->num_look_aheads)][1]; /* LDA of lusup[] */
        lusup = Llu->Lval_buf_2[k0 % (1 + stat->num_look_aheads)];
     }
 
@@ -826,68 +813,72 @@ void pdgstrs2_omp
 #undef USE_Ublock_info
 #ifdef USE_Ublock_info /** 4/19/2019 **/
     /* Loop through all the row blocks. to get the iukp and rukp*/
-    Trs2_InitUblock_info(klst, nb, Ublock_info, usub, Glu_persist, stat );
+    Trs2_InitUblock_info(klst, nb, Ublock_info, usub, Glu_persist, stat);
 #else
-    int* blocks_index_pointers = SUPERLU_MALLOC (3 * nb * sizeof(int));
-    int* blocks_value_pointers = blocks_index_pointers + nb;
-    int* nsupc_temp = blocks_value_pointers + nb;
-    for (b = 0; b < nb; b++) { /* set up pointers to each block */
-        blocks_index_pointers[b] = iukp + UB_DESCRIPTOR;
-        blocks_value_pointers[b] = rukp;
-        gb = usub[iukp];
-        rukp += usub[iukp+1];
-        nsupc = SuperSize( gb );
-        nsupc_temp[b] = nsupc;
-        iukp += (UB_DESCRIPTOR + nsupc); /* move to the next block */
+    int *blocks_index_pointers = SUPERLU_MALLOC(3 * nb * sizeof(int));
+    int *blocks_value_pointers = blocks_index_pointers + nb;
+    int *nsupc_temp = blocks_value_pointers + nb;
+    for (b = 0; b < nb; b++)
+    { /* set up pointers to each block */
+        blocks_index_pointers[b] = iukp + UB_DESCRIPTOR;
+        blocks_value_pointers[b] = rukp;
+        gb = usub[iukp];
+        rukp += usub[iukp + 1];
+        nsupc = SuperSize(gb);
+        nsupc_temp[b] = nsupc;
+        iukp += (UB_DESCRIPTOR + nsupc); /* move to the next block */
     }
 #endif
 
     // Sherry: this version is more NUMA friendly compared to pdgstrf2_v2.c
     // https://stackoverflow.com/questions/13065943/task-based-programming-pragma-omp-task-versus-pragma-omp-parallel-for
-#pragma omp parallel for schedule(static) default(shared) \
-    private(b,j,iukp,rukp,segsize)
+#pragma omp parallel for schedule(static) default(shared) private(b, j, iukp, rukp, segsize)
     /* Loop through all the blocks in the row. */
-    for (b = 0; b < nb; ++b) {
+    for (b = 0; b < nb; ++b)
+    {
 #ifdef USE_Ublock_info
-        iukp = Ublock_info[b].iukp;
-        rukp = Ublock_info[b].rukp;
+        iukp = Ublock_info[b].iukp;
+        rukp = Ublock_info[b].rukp;
 #else
-        iukp = blocks_index_pointers[b];
-        rukp = blocks_value_pointers[b];
+        iukp = blocks_index_pointers[b];
+        rukp = blocks_value_pointers[b];
 #endif
 
        /* Loop through all the segments in the block. */
 #ifdef USE_Ublock_info
-        gb = usub[iukp];
-        nsupc = SuperSize( gb );
-        iukp += UB_DESCRIPTOR;
-        for (j = 0; j < nsupc; j++) {
-#else
-        for (j = 0; j < nsupc_temp[b]; j++) {
+        gb = usub[iukp];
+        nsupc = SuperSize(gb);
+        iukp += UB_DESCRIPTOR;
+        for (j = 0; j < nsupc; j++)
+        {
+#else
+        for (j = 0; j < nsupc_temp[b]; j++)
+        {
 #endif
            segsize = klst - usub[iukp++];
-           if (segsize) {
-#pragma omp task default(shared) firstprivate(segsize,rukp) if (segsize > 30)
-               { /* Nonzero segment. */
-                   int_t luptr = (knsupc - segsize) * (nsupr + 1);
-                   //printf("[2] segsize %d, nsupr %d\n", segsize, nsupr);
-
-#if defined (USE_VENDOR_BLAS)
-                   dtrsv_ ("L", "N", "U", &segsize, &lusup[luptr], &nsupr,
-                           &uval[rukp], &incx, 1, 1, 1);
+            if (segsize)
+            {
+#pragma omp task default(shared) firstprivate(segsize, rukp) if (segsize > 30)
+                { /* Nonzero segment. */
+                    int_t luptr = (knsupc - segsize) * (nsupr + 1);
+                    //printf("[2] segsize %d, nsupr %d\n", segsize, nsupr);
+
+#if defined(USE_VENDOR_BLAS)
+                    dtrsv_("L", "N", "U", &segsize, &lusup[luptr], &nsupr,
+                           &uval[rukp], &incx, 1, 1, 1);
 #else
-                   dtrsv_ ("L", "N", "U", &segsize, &lusup[luptr], &nsupr,
-                           &uval[rukp], &incx);
+                    dtrsv_("L", "N", "U", &segsize, &lusup[luptr], &nsupr,
+                           &uval[rukp], &incx);
 #endif
-               } /* end task */
-               rukp += segsize;
+                } /* end task */
+                rukp += segsize;
 #ifndef USE_Ublock_info
-               stat->ops[FACT] += segsize * (segsize + 1);
+                stat->ops[FACT] += segsize * (segsize + 1);
 #endif
-           } /* end if segsize > 0 */
-        } /* end for j in parallel ... */
-/* #pragma omp taskwait */
-    } /* end for b ... */
+            } /* end if segsize > 0 */
+        } /* end for j in parallel ... */
+        /* #pragma omp taskwait */
+    } /* end for b ... */
 
 #ifndef USE_Ublock_info
     /* Deallocate memory */
@@ -902,44 +893,45 @@
 } /* pdgstrs2_omp */
 
-#else /*==== new version from Piyush ====*/
+#else /*==== new version from Piyush ====*/
 
-void pdgstrs2_omp(int_t k0, int_t k, int_t* Lsub_buf,
-                  double *Lval_buf, Glu_persist_t *Glu_persist,
-                  gridinfo_t *grid, LocalLU_t *Llu, SuperLUStat_t *stat,
-                  Ublock_info_t *Ublock_info, double *bigV, int_t ldt, SCT_t *SCT)
+void pdgstrs2_omp(int_t k0, int_t k, int_t *Lsub_buf,
+                  double *Lval_buf, Glu_persist_t *Glu_persist,
+                  gridinfo_t *grid, LocalLU_t *Llu, SuperLUStat_t *stat,
+                  Ublock_info_t *Ublock_info, double *bigV, int_t ldt, SCT_t *SCT)
 {
     unsigned long long t1 = _rdtsc();
     int_t *xsup = Glu_persist->xsup;
 
     /* Quick return. */
-    int_t lk = LBi (k, grid);   /* Local block number */
-    if (!Llu->Unzval_br_ptr[lk]) return;
+    int_t lk = LBi(k, grid); /* Local block number */
+    if (!Llu->Unzval_br_ptr[lk])
+        return;
 
     /* Initialization. */
-    int_t klst = FstBlockC (k + 1);
-    int_t knsupc = SuperSize (k);
-    int_t *usub = Llu->Ufstnz_br_ptr[lk];   /* index[] of block row U(k,:) */
+    int_t klst = FstBlockC(k + 1);
+    int_t knsupc = SuperSize(k);
+    int_t *usub = Llu->Ufstnz_br_ptr[lk]; /* index[] of block row U(k,:) */
     double *uval = Llu->Unzval_br_ptr[lk];
     int_t nb = usub[0];
-    int_t nsupr = Lsub_buf[1];  /* LDA of lusup[] */
+    int_t nsupr = Lsub_buf[1]; /* LDA of lusup[] */
     double *lusup = Lval_buf;
 
     /* Loop through all the row blocks. to get the iukp and rukp*/
-    Trs2_InitUbloc_info(klst, nb, Ublock_info, usub, Glu_persist, stat );
+    Trs2_InitUbloc_info(klst, nb, Ublock_info, usub, Glu_persist, stat);
 
     /* Loop through all the row blocks. */
-#pragma omp parallel for schedule(dynamic,2)
+#pragma omp parallel for schedule(dynamic, 2)
     for (int_t b = 0; b < nb; ++b)
     {
       int_t thread_id = omp_get_thread_num();
-        double *tempv = bigV +  thread_id * ldt * ldt;
+        double *tempv = bigV + thread_id * ldt * ldt;
        dTrs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp,
-                                 usub, uval, tempv, knsupc, nsupr, lusup, Glu_persist);
+                                usub, uval, tempv, knsupc, nsupr, lusup, Glu_persist);
     } /* for b ... */
 
-    SCT->PDGSTRS2_tl += (double) ( _rdtsc() - t1);
+    SCT->PDGSTRS2_tl += (double)(_rdtsc() - t1);
 } /* pdgstrs2_omp new version from Piyush */
 
 #endif

From 385734212e1e6d68df19675279525c2581d91974 Mon Sep 17 00:00:00 2001
From: Sherry Li
Date: Sun, 6 Dec 2020 12:46:59 -0800
Subject: [PATCH 047/147] For 3D complex code: replaced direct BLAS calls by
 the superlu_ wrapper calls in file zsuperlu_blas.c.
---
 SRC/CMakeLists.txt    |   1 +
 SRC/Makefile          |   4 +-
 SRC/pz3dcomm.c        |   8 +++
 SRC/pzgstrf2.c        |  18 ++++++-
 SRC/superlu_ddefs.h   | 109 +++++++++++++++++++------------------
 SRC/superlu_zdefs.h   |  61 ++++++++++++++-------
 SRC/zscatter3d.c      |  16 +++++-
 SRC/zsuperlu_blas.c   | 122 ++++++++++++++++++++++++++++++++++++++++++
 SRC/ztrfCommWrapper.c |  12 ++++-
 9 files changed, 274 insertions(+), 77 deletions(-)
 create mode 100644 SRC/zsuperlu_blas.c

diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt
index bef541f5..3d059586 100644
--- a/SRC/CMakeLists.txt
+++ b/SRC/CMakeLists.txt
@@ -165,6 +165,7 @@ if(enable_complex16)
     pz3dcomm.c ztrfAux.c
     zcommunication_aux.c ztrfCommWrapper.c
+    zsuperlu_blas.c
   )
 endif()
diff --git a/SRC/Makefile b/SRC/Makefile
index 860c638a..f3c73168 100644
--- a/SRC/Makefile
+++ b/SRC/Makefile
@@ -71,7 +71,7 @@ DPLUSRC = pdgssvx.o pdgssvx_ABglobal.o \
 	pdgstrf.o pdgstrf2.o pdGetDiagU.o \
 	pdgstrs.o pdgstrs1.o pdgstrs_lsum.o pdgstrs_Bglobal.o \
 	pdgsrfs.o pdgsmv.o pdgsrfs_ABXglobal.o pdgsmv_AXglobal.o \
-	dreadtriple_noheader.o
+	dreadtriple_noheader.o dsuperlu_blas.o
 # from 3D code
 DPLUSRC += pdgssvx3d.o pdgstrf3d.o dtreeFactorization.o dscatter3d.o \
 	dgather.o pd3dcomm.o dtrfAux.o dcommunication_aux.o dtrfCommWrapper.o \
@@ -86,7 +86,7 @@ ZPLUSRC = pzgssvx.o pzgssvx_ABglobal.o \
 	pzgstrf.o pzgstrf2.o pzGetDiagU.o \
 	pzgstrs.o pzgstrs1.o pzgstrs_lsum.o pzgstrs_Bglobal.o \
 	pzgsrfs.o pzgsmv.o pzgsrfs_ABXglobal.o pzgsmv_AXglobal.o \
-	zreadtriple_noheader.o
+	zreadtriple_noheader.o zsuperlu_blas.o
 # from 3D code
 ZPLUSRC += pzgssvx3d.o pzgstrf3d.o ztreeFactorization.o zscatter3d.o \
 	zgather.o pz3dcomm.o ztrfAux.o zcommunication_aux.o ztrfCommWrapper.o \
diff --git a/SRC/pz3dcomm.c b/SRC/pz3dcomm.c
index fc79bc15..c1627c4c 100644
--- a/SRC/pz3dcomm.c
+++ b/SRC/pz3dcomm.c
@@ -250,6 +250,9 @@ int_t zzRecvLPanel(int_t k, int_t sender, doublecomplex alpha, doublecomplex bet
                      grid3d->zscp.comm, &status);
 
            /*reduce the updates*/
+           superlu_zscal(len2, alpha, lnzval, 1);
+           superlu_zaxpy(len2, beta, Lval_buf, 1, lnzval, 1);
+#if 0 // replaced
 #if 1
            zscal_(&len2, &alpha, lnzval, &inc);
            zaxpy_(&len2, &beta, Lval_buf, &inc, lnzval, &inc);
@@ -257,6 +260,7 @@
            cblas_zscal (len2, (void*) &alpha, lnzval, 1);
            cblas_zaxpy (len2, (void*) &beta, Lval_buf, 1, lnzval, 1);
 #endif
+#endif
        }
     }
@@ -325,12 +329,16 @@ int_t zzRecvUPanel(int_t k, int_t sender, doublecomplex alpha, doublecomplex bet
                      grid3d->zscp.comm, &status);
 
            /*reduce the updates*/
+           superlu_zscal(lenv, alpha, unzval, 1);
+           superlu_zaxpy(lenv, beta, Uval_buf, 1, unzval, 1);
+#if 0 // replaced
 #if 1
            zscal_(&lenv, &alpha, unzval, &inc);
            zaxpy_(&lenv, &beta, Uval_buf, &inc, unzval, &inc);
 #else
            cblas_zscal (lenv, (void*) &alpha, unzval, 1);
            cblas_zaxpy (lenv, (void*) &beta, Uval_buf, 1, unzval, 1);
 #endif
+#endif
        }
     }
diff --git a/SRC/pzgstrf2.c b/SRC/pzgstrf2.c
index 2116d03a..d7d27495 100644
--- a/SRC/pzgstrf2.c
+++ b/SRC/pzgstrf2.c
@@ -379,6 +379,11 @@ int_t LpanelUpdate(int_t off0, int nsupc, doublecomplex* ublk_ptr, int ld_ujrow
     {
        int_t off = i * GT;
        int len = SUPERLU_MIN(GT, l - i * GT);
+
+       superlu_ztrsm("R", "U", "N", "N", len, nsupc, alpha,
+                     ublk_ptr, ld_ujrow, &lusup[off0 + off], nsupr);
+
+#if 0 // replaced by superlu_ztrsm
 #if 1
 #if defined (USE_VENDOR_BLAS)
        ztrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha,
@@ -392,6 +397,7 @@ int_t LpanelUpdate(int_t off0, int nsupc, doublecomplex* ublk_ptr, int ld_ujrow
        cblas_ztrsm (CblasColMajor, CblasRight, CblasUpper, CblasNoTrans,
                     CblasNonUnit, len, nsupc, (void*) &alpha,
                     ublk_ptr, ld_ujrow, &lusup[off0 + off], nsupr);
 #endif
+#endif // replaced by superlu_ztrsm
 
     } /* for i = ... */
@@ -486,6 +492,10 @@ void Local_Zgstrf2(superlu_dist_options_t *options, int_t k, double thresh,
            int l = nsupc - j - 1;
 
            /* Rank-1 update */
+           superlu_zger(l, cols_left, alpha, &lusup[luptr + 1], incx,
+                        &ujrow[ld_ujrow], incy, &lusup[luptr + nsupr + 1], nsupr);
+
+#if 0 // replaced by superlu_
 #if 1
            zgeru_ (&l, &cols_left, &alpha, &lusup[luptr + 1], &incx,
                    &ujrow[ld_ujrow], &incy, &lusup[luptr + nsupr + 1],
@@ -495,6 +505,8 @@ void Local_Zgstrf2(superlu_dist_options_t *options, int_t k, double thresh,
                    &ujrow[ld_ujrow], incy, &lusup[luptr + nsupr + 1],
                    nsupr);
 #endif
+#endif // replaced by superlu_
+
            stat->ops[FACT] += 8 * l * cols_left;
        }
@@ -737,7 +749,10 @@ int_t zTrs2_GatherTrsmScatter(int_t klst, int_t iukp, int_t rukp,
     /*now call ztrsm on packed dense block*/
     int_t luptr = (knsupc - ldu) * (nsupr + 1);
     // if(ldu>nsupr) printf("nsupr %d ldu %d\n",nsupr,ldu );
-
+
+    superlu_ztrsm("L", "L", "N", "U", ldu, ncols, alpha,
+                  &lusup[luptr], nsupr, tempv, ldu);
+#if 0 // replaced by superlu_
 #if 1
 #if defined (USE_VENDOR_BLAS)
     ztrsm_ ("L", "L", "N", "U", &ldu, &ncols, &alpha,
@@ -752,6 +767,7 @@ int_t zTrs2_GatherTrsmScatter(int_t klst, int_t iukp, int_t rukp,
     cblas_ztrsm (CblasColMajor, CblasLeft, CblasLower, CblasNoTrans, CblasUnit,
                  ldu, ncols, (void*) &alpha, &lusup[luptr], nsupr, tempv, ldu);
 #endif
+#endif // replaced by superlu_
 
     /*now scatter the output into sparse U block*/
     zTrs2_ScatterU(iukp, rukp, klst, nsupc, ldu, usub, uval, tempv);
diff --git a/SRC/superlu_ddefs.h b/SRC/superlu_ddefs.h
index a132f0ae..18f8b46e 100644
--- a/SRC/superlu_ddefs.h
+++ b/SRC/superlu_ddefs.h
@@ -524,63 +524,66 @@ extern "C"
 
 /* BLAS */
 #ifdef USE_VENDOR_BLAS
-    extern void dgemm_(const char *, const char *, const int *, const int *, const int *,
-                       const double *, const double *, const int *, const double *,
-                       const int *, const double *, double *, const int *, int, int);
-    extern void dtrsv_(char *, char *, char *, int *, double *, int *,
-                       double *, int *, int, int, int);
-    extern void dtrsm_(char *, char *, char *, char *, int *, int *,
-                       double *, double *, int *, double *,
-                       int *, int, int, int, int);
-    extern void dgemv_(char *, int *, int *, double *, double *a, int *,
-                       double *, int *, double *, double *, int *, int);
-
-    extern void dger_(int *, int *, double *, double *, int *,
-                      double *, int *, double *, int *);
+extern void dgemm_(const char*, const char*, const int*, const int*, const int*,
+                   const double*, const double*, const int*, const double*,
+                   const int*, const double*, double*, const int*, int, int);
+extern void dtrsv_(char*, char*, char*, int*, double*, int*,
+                   double*, int*, int, int, int);
+extern void dtrsm_(const char*, const char*, const char*, const char*,
+                   const int*, const int*, const double*, const double*, const int*,
+                   double*, const int*, int, int, int, int);
+extern void dgemv_(const char *, const int *, const int *, const double *,
+                   const double *a, const int *, const double *, const int *,
+                   const double *, double *, const int *, int);
 #else
-extern int dgemm_(const char *, const char *, const int *, const int *, const int *,
-                  const double *, const double *, const int *, const double *,
-                  const int *, const double *, double *, const int *);
-extern int dtrsv_(char *, char *, char *, int *, double *, int *,
-                  double *, int *);
-extern int dtrsm_(char *, char *, char *, char *, int *, int *,
-                  double *, double *, int *, double *, int *);
-extern int dgemv_(char *, int *, int *, double *, double *a, int *,
-                  double *, int *, double *, double *, int *);
-extern void dger_(int *, int *, double *, double *, int *,
-                  double *, int *, double *, int *);
-
+extern int dgemm_(const char*, const char*, const int*, const int*, const int*,
+                  const double*, const double*, const int*, const double*,
+                  const int*, const double*, double*, const int*);
+extern int dtrsv_(char*, char*, char*, int*, double*, int*,
+                  double*, int*);
+extern int dtrsm_(const char*, const char*, const char*, const char*,
+                  const int*, const int*, const double*, const double*, const int*,
+                  double*, const int*);
+extern void dgemv_(const char *, const int *, const int *, const double *,
+                   const double *a, const int *, const double *, const int *,
+                   const double *, double *, const int *);
 #endif
 
-extern int dscal_(int *n, double *da, double *dx, int *incx);
-extern int daxpy_(int *n, double *za, double *zx,
-                  int *incx, double *zy, int *incy);
-
-// LAPACK routine
-extern void dtrtri_(char *, char *, int *, double *, int *, int *);
-
-// Superlu blas routines
-extern int superlu_dgemm(const char *transa, const char *transb,
-                         int m, int n, int k, double alpha, double *a,
-                         int lda, double *b, int ldb, double beta, double *c, int ldc);
-
-extern int superlu_dtrsm(const char *sideRL, const char *uplo,
-                         const char *transa, const char *diag,
-                         const int m, const int n,
-                         const double alpha, const double *a,
-                         const int lda, double *b, const int ldb);
-extern int superlu_dger(const int m, const int n, const double alpha,
-                        const double *x, const int incx, const double *y,
-                        const int incy, double *a, const int lda);
-
-extern int superlu_dscal(const int n, const double alpha, double *x, const int incx);
-extern int superlu_dtrsv(char *uplo, char *trans, char *diag,
-                         int n, double *a, int lda, double *x, int incx);
-extern int superlu_dgemv(const char *trans, const int m,
-                         const int n, const double alpha, const double *a,
-                         const int lda, const double *x, const int incx,
-                         const double beta, double *y, const int incy);
-extern int superlu_daxpy(const int n, const double alpha, const double *x, const int incx, double *y, const int incy);
+extern void dger_(const int*, const int*, const double*,
+                  const double*, const int*, const double*, const int*,
+                  double*, const int*);
+
+extern int dscal_(const int *n, const double *alpha, double *dx, const int *incx);
+extern int daxpy_(const int *n, const double *alpha, const double *x,
+                  const int *incx, double *y, const int *incy);
+
+/* SuperLU BLAS interface: dsuperlu_blas.c */
+extern int superlu_dgemm(const char *transa, const char *transb,
+                         int m, int n, int k, double alpha, double *a,
+                         int lda, double *b, int ldb, double beta, double *c, int ldc);
+extern int superlu_dtrsm(const char *sideRL, const char *uplo,
+                         const char *transa, const char *diag, const int m, const int n,
+                         const double alpha, const double *a,
+                         const int lda, double *b, const int ldb);
+extern int superlu_dger(const int m, const int n, const double alpha,
+                        const double *x, const int incx, const double *y,
+                        const int incy, double *a, const int lda);
+extern int superlu_dscal(const int n, const double alpha, double *x, const int incx);
+extern int superlu_daxpy(const int n, const double alpha,
+                         const double *x, const int incx, double *y, const int incy);
+extern int superlu_dgemv(const char *trans, const int m,
+                         const int n, const double alpha, const double *a,
+                         const int lda, const double *x, const int incx,
+                         const double beta, double *y, const int incy);
+extern int superlu_dtrsv(char *uplo, char *trans, char *diag,
+                         int n, double *a, int lda, double *x, int incx);
+
+// LAPACK routine
+extern void dtrtri_(char *, char *, int *, double *, int *, int *);
+
+
 
 /*==== For 3D code ====*/
 
 extern void pdgssvx3d(superlu_dist_options_t *, SuperMatrix *,
diff --git a/SRC/superlu_zdefs.h b/SRC/superlu_zdefs.h
index 345b4d71..903045ed 100644
--- a/SRC/superlu_zdefs.h
+++ b/SRC/superlu_zdefs.h
@@ -516,21 +516,19 @@ extern int file_PrintDoublecomplex(FILE *fp, char *, int_t, doublecomplex *);
 
 /* BLAS */
-
+
 #ifdef USE_VENDOR_BLAS
 extern void zgemm_(const char*, const char*, const int*, const int*, const int*,
                    const doublecomplex*, const doublecomplex*, const int*, const doublecomplex*,
                    const int*, const doublecomplex*, doublecomplex*, const int*, int, int);
 extern void ztrsv_(char*, char*, char*, int*, doublecomplex*, int*,
                    doublecomplex*, int*, int, int, int);
-extern void ztrsm_(char*, char*, char*, char*, int*, int*,
-                   doublecomplex*, doublecomplex*, int*, doublecomplex*,
-                   int*, int, int, int, int);
-extern void zgemv_(char *, int *, int *, doublecomplex *, doublecomplex *a, int *,
-                   doublecomplex *, int *, doublecomplex *, doublecomplex *, int *, int);
-
-extern void zgeru_(int*, int*, doublecomplex*, doublecomplex*, int*,
-                   doublecomplex*, int*, doublecomplex*, int*);
+extern void ztrsm_(const char*, const char*, const char*, const char*,
+                   const int*, const int*, const doublecomplex*, const doublecomplex*, const int*,
+                   doublecomplex*, const int*, int, int, int, int);
+extern void zgemv_(const char *, const int *, const int *, const doublecomplex *,
+                   const doublecomplex *a, const int *, const doublecomplex *, const int *,
+                   const doublecomplex *, doublecomplex *, const int *, int);
 #else
 extern int zgemm_(const char*, const char*, const int*, const int*, const int*,
@@ -538,18 +536,43 @@ extern int zgemm_(const char*, const char*, const int*, const int*, const int*,
                   const int*, const doublecomplex*, doublecomplex*, const int*);
 extern int ztrsv_(char*, char*, char*, int*, doublecomplex*, int*,
                   doublecomplex*, int*);
-extern int ztrsm_(char*, char*, char*, char*, int*, int*,
-                  doublecomplex*, doublecomplex*, int*, doublecomplex*, int*);
-extern int zgemv_(char *, int *, int *, doublecomplex *, doublecomplex *a, int *,
-                  doublecomplex *, int *, doublecomplex *, doublecomplex *, int *);
-extern int zgeru_(int*, int*, doublecomplex*, doublecomplex*, int*,
-                  doublecomplex*, int*, doublecomplex*, int*);
-
+extern int ztrsm_(const char*, const char*, const char*, const char*,
+                  const int*, const int*, const doublecomplex*, const doublecomplex*, const int*,
+                  doublecomplex*, const int*);
+extern void zgemv_(const char *, const int *, const int *, const doublecomplex *,
+                   const doublecomplex *a, const int *, const doublecomplex *, const int *,
+                   const doublecomplex *, doublecomplex *, const int *);
 #endif
 
-extern int zscal_(int *n, doublecomplex *da, doublecomplex *dx, int *incx);
-extern int zaxpy_(int *n, doublecomplex *za, doublecomplex *zx,
-                  int *incx, doublecomplex *zy, int *incy);
+extern void zgeru_(const int*, const int*, const doublecomplex*,
+                   const doublecomplex*, const int*, const doublecomplex*, const int*,
+                   doublecomplex*, const int*);
+
+extern int zscal_(const int *n, const doublecomplex *alpha, doublecomplex *dx, const int *incx);
+extern int zaxpy_(const int *n, const doublecomplex *alpha, const doublecomplex *x,
+                  const int *incx, doublecomplex *y, const int *incy);
+
+/* SuperLU BLAS interface: xsuperlu_blas.c.base */
+extern int superlu_zgemm(const char *transa, const char *transb,
+                         int m, int n, int k, doublecomplex alpha, doublecomplex *a,
+                         int lda, doublecomplex *b, int ldb, doublecomplex beta, doublecomplex *c, int ldc);
+extern int superlu_ztrsm(const char *sideRL, const char *uplo,
+                         const char *transa, const char *diag, const int m, const int n,
+                         const doublecomplex alpha, const doublecomplex *a,
+                         const int lda, doublecomplex *b, const int ldb);
+extern int superlu_zger(const int m, const int n, const doublecomplex alpha,
+                        const doublecomplex *x, const int incx, const doublecomplex *y,
+                        const int incy, doublecomplex *a, const int lda);
+extern int superlu_zscal(const int n, const doublecomplex alpha, doublecomplex *x, const int incx);
+extern int superlu_zaxpy(const int n, const doublecomplex alpha,
+                         const doublecomplex *x, const int incx, doublecomplex *y, const int incy);
+extern int superlu_zgemv(const char *trans, const int m,
+                         const int n, const doublecomplex alpha, const doublecomplex *a,
+                         const int lda, const doublecomplex *x, const int incx,
+                         const doublecomplex beta, doublecomplex *y, const int incy);
+extern int superlu_ztrsv(char *uplo, char *trans, char *diag,
+                         int n, doublecomplex *a, int lda, doublecomplex *x, int incx);
+
 // LAPACK routine
 extern void ztrtri_(char*, char*, int*, doublecomplex*, int*, int*);
 
diff --git a/SRC/zscatter3d.c b/SRC/zscatter3d.c
index 00fef67b..e7a1ae64 100644
--- a/SRC/zscatter3d.c
+++ b/SRC/zscatter3d.c
@@ -142,6 +142,13 @@ zblock_gemm_scatter( int_t lb, int_t j,
     /* calling ZGEMM */
     // printf(" m %d n %d k %d ldu %d ldl %d st_col %d \n",temp_nbrow,ncols,ldu,ldl,st_col );
+
+    superlu_zgemm("N", "N", temp_nbrow, ncols, ldu, alpha,
+                  &L_mat[(knsupc - ldu)*ldl + cum_nrow], ldl,
+                  &U_mat[st_col * ldu], ldu,
+                  beta, tempv1, temp_nbrow);
+
+#if 0 // ** replaced by superlu_zgemm
 #if 1
 #if defined (USE_VENDOR_BLAS)
     zgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
@@ -161,7 +168,8 @@ zblock_gemm_scatter( int_t lb, int_t j,
            &U_mat[st_col * ldu], ldu,
            beta, tempv1, temp_nbrow);
 #endif
-
+#endif // ** replaced by superlu_zgemm
+
     // printf("SCU update: (%d, %d)\n",ib,jb );
 #ifdef SCATTER_PROFILE
     unsigned long long ttx = __rdtsc();
@@ -253,6 +261,11 @@ zblock_gemm_scatter_lock( int_t lb, int_t j,
     doublecomplex alpha = {1.0, 0.0}, beta = {0.0, 0.0};
 
     /* calling ZGEMM */
+    superlu_zgemm("N", "N", temp_nbrow, ncols, ldu, alpha,
+                  &L_mat[(knsupc - ldu)*ldl + cum_nrow], ldl,
+                  &U_mat[st_col * ldu], ldu, beta, tempv1, temp_nbrow);
+
+#if 0 // replaced by superlu_zgemm
 #if 1
 #if defined (USE_VENDOR_BLAS)
     // printf(" m %d n %d k %d ldl %d st_col %d \n",temp_nbrow,ncols,ldu,ldl,st_col );
@@ -271,6 +284,7 @@ zblock_gemm_scatter_lock( int_t lb, int_t j,
            &U_mat[st_col * ldu], ldu,
           beta, tempv1, temp_nbrow);
 #endif
+#endif // replaced by superlu_zgemm
 
     /*try to get the lock for the block*/
     if (lock)  /*lock is not null*/
diff --git a/SRC/zsuperlu_blas.c b/SRC/zsuperlu_blas.c
new file mode 100644
index 00000000..2d413761
--- /dev/null
+++ b/SRC/zsuperlu_blas.c
@@ -0,0 +1,122 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file
+ * \brief Wrapper functions to call BLAS.
+ *
+ *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Oak Ridge National Lab
+ * December 6, 2020
+ */
+
+#include "superlu_zdefs.h"
+
+#ifdef _CRAY
+_fcd ftcs = _cptofcd("N", strlen("N"));
+_fcd ftcs1 = _cptofcd("L", strlen("L"));
+_fcd ftcs2 = _cptofcd("N", strlen("N"));
+_fcd ftcs3 = _cptofcd("U", strlen("U"));
+#endif
+
+int superlu_zgemm(const char *transa, const char *transb,
+                  int m, int n, int k, doublecomplex alpha, doublecomplex *a,
+                  int lda, doublecomplex *b, int ldb, doublecomplex beta, doublecomplex *c, int ldc)
+{
+#ifdef _CRAY
+    _fcd ftcs = _cptofcd(transa, strlen(transa));
+    _fcd ftcs1 = _cptofcd(transb, strlen(transb));
+    return CGEMM(ftcs, ftcs1, &m, &n, &k,
+                 &alpha, a, &lda, b, &ldb, &beta, c, &ldc);
+#elif defined(USE_VENDOR_BLAS)
+    zgemm_(transa, transb, &m, &n, &k,
+           &alpha, a, &lda, b, &ldb, &beta, c, &ldc, 1, 1);
+    return 0;
+#else
+    return zgemm_(transa, transb, &m, &n, &k,
+                  &alpha, a, &lda, b, &ldb, &beta, c, &ldc);
+#endif
+}
+
+int superlu_ztrsm(const char *sideRL, const char *uplo,
+                  const char *transa, const char *diag,
+                  const int m, const int n,
+                  const doublecomplex alpha, const doublecomplex *a,
+                  const int lda, doublecomplex *b, const int ldb)
+
+{
+#if defined(USE_VENDOR_BLAS)
+    ztrsm_(sideRL, uplo, transa, diag,
+           &m, &n, &alpha, a, &lda, b, &ldb,
+           1, 1, 1, 1);
+    return 0;
+#else
+    return ztrsm_(sideRL, uplo, transa, diag,
+                  &m, &n, &alpha, a, &lda, b, &ldb);
+#endif
+}
+
+int superlu_zger(const int m, const int n, const doublecomplex alpha,
+                 const doublecomplex *x, const int incx, const doublecomplex *y,
+                 const int incy, doublecomplex *a, const int lda)
+{
+#ifdef _CRAY
+    CGERU(&m, &n, &alpha, x, &incx, y, &incy, a, &lda);
+#else
+    zgeru_(&m, &n, &alpha, x, &incx, y, &incy, a, &lda);
+#endif
+
+    return 0;
+}
+
+int superlu_zscal(const int n, const doublecomplex alpha, doublecomplex *x, const int incx)
+{
+    zscal_(&n, &alpha, x, &incx);
+    return 0;
+}
+
+int superlu_zaxpy(const int n, const doublecomplex alpha,
+    const doublecomplex *x, const int incx, doublecomplex *y, const int incy)
+{
+    zaxpy_(&n, &alpha, x, &incx, y, &incy);
+    return 0;
+}
+
+int superlu_zgemv(const char *trans, const int m,
+                  const int n, const doublecomplex alpha, const doublecomplex *a,
+                  const int lda, const doublecomplex *x, const int incx,
+                  const doublecomplex beta, doublecomplex *y, const int incy)
+{
+#ifdef USE_VENDOR_BLAS
+    zgemv_(trans, &m, &n, &alpha, a, &lda, x, &incx, &beta, y, &incy, 1);
+#else
+    zgemv_(trans, &m, &n, &alpha, a, &lda, x, &incx, &beta, y, &incy);
+#endif
+    
+    return 0;
+}
+
+int superlu_ztrsv(char *uplo, char *trans, char *diag,
+                  int n, doublecomplex *a, int lda, doublecomplex *x, int incx)
+{
+#ifdef _CRAY
+    // _fcd ftcs = _cptofcd("N", strlen("N"));
+    CTRSV(_cptofcd(uplo, strlen(uplo)), _cptofcd(trans, strlen(trans)), _cptofcd(diag, strlen(diag)), 
+         &n, a, &lda, x, &incx);
+#elif defined (USE_VENDOR_BLAS)
+    ztrsv_(uplo, trans, diag, &n, a, &lda, x, &incx, 1, 1, 1);
+#else
+    ztrsv_(uplo, trans, diag, &n, a, &lda, x, &incx);
+#endif
+    
+    return 0;
+}
+
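
A minimal usage sketch (not part of the patch): with zsuperlu_blas.c in place, a
call site passes scalars by value and option strings directly, and the
Cray/vendor/reference dispatch lives in one file instead of an #if ladder at
every call. The helper below is hypothetical and assumes column-major storage
with superlu_zdefs.h on the include path.

    #include "superlu_zdefs.h"

    /* C(m x n) := L(m x k) * U(k x n) + C, column-major; a sketch only */
    static void sketch_block_update(doublecomplex *L, int ldl,
                                    doublecomplex *U, int ldu,
                                    doublecomplex *C, int m, int n, int k)
    {
        doublecomplex alpha = {1.0, 0.0}, beta = {1.0, 0.0};
        superlu_zgemm("N", "N", m, n, k, alpha, L, ldl, U, ldu, beta, C, m);
    }
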
diff --git a/SRC/ztrfCommWrapper.c b/SRC/ztrfCommWrapper.c
index 11d65284..98efab03 100644
--- a/SRC/ztrfCommWrapper.c
+++ b/SRC/ztrfCommWrapper.c
@@ -172,6 +172,10 @@ int_t zLPanelTrSolve( int_t k,   int_t* factored_L,
                 // Sherry: int_t len = MY_MIN(BL, l - i * BL);
                 int len = SUPERLU_MIN(BL, l - i * BL);
 
+                superlu_ztrsm("R", "U", "N", "N", len, nsupc, alpha,
+			      ublk_ptr, ld_ujrow, &lusup[off], nsupr);
+		
+#if 0 // ** replaced by superlu_ztrsm 		
 #if 1
   #if defined (USE_VENDOR_BLAS)
 		ztrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha,
@@ -185,6 +189,8 @@ int_t zLPanelTrSolve( int_t k,   int_t* factored_L,
                 cblas_ztrsm (CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit,
                 len, nsupc, (void*) &alpha, ublk_ptr, ld_ujrow, &lusup[off], nsupr);
 #endif
+#endif // ** replaced by superlu_ztrsm 		
+		
             }
         }
     }
@@ -218,8 +224,11 @@ int_t zLPanelTrSolve( int_t k,   int_t* factored_L,
             int_t off = i * BL;
             // Sherry: int_t len = MY_MIN(BL, l - i * BL);
             int len = SUPERLU_MIN(BL, (l - i * BL));
-            #pragma omp task
+#pragma omp task
             {
+                superlu_ztrsm("R", "U", "N", "N", len, nsupc, alpha,
+			      ublk_ptr, ld_ujrow, &lusup[nsupc + off], nsupr);
+#if 0 // ** replaced by superlu_ztrsm
 #if 1
   #if defined (USE_VENDOR_BLAS)
 		ztrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha,
@@ -233,6 +242,7 @@ int_t zLPanelTrSolve( int_t k,   int_t* factored_L,
                 cblas_ztrsm (CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit,
                              len, nsupc, (void*) &alpha, ublk_ptr, ld_ujrow, &lusup[nsupc + off], nsupr);
 #endif
+#endif // ** replaced by superlu_ztrsm
 
             }
         }

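The same wrapper collapses the triangular solves in ztrfCommWrapper.c: one
superlu_ztrsm call replaces the vendor/CBLAS/reference #if ladder. The flag
characters spell out the operation: "R" applies the triangular factor from the
right, "U" marks it upper triangular, the first "N" means no transpose, the
second "N" a non-unit diagonal. A sketch of the solve performed above, with
names taken from the hunk:

    doublecomplex alpha = {1.0, 0.0};
    /* X := alpha * X * inv(U): len rows of the L panel held in lusup are
       solved against the nsupc x nsupc upper-triangular block ublk_ptr */
    superlu_ztrsm("R", "U", "N", "N", len, nsupc, alpha,
                  ublk_ptr, ld_ujrow, &lusup[off], nsupr);
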
From 9397bcad6b6320b4341ce70c500dfda2153d0e67 Mon Sep 17 00:00:00 2001
From: Sherry Li 
Date: Sun, 6 Dec 2020 21:54:06 -0800
Subject: [PATCH 048/147] Use dsuperlu_blas.c generated from the base file,
 with a proper leading comment.

---
 SRC/dsuperlu_blas.c | 39 ++++++++++++++++++++++++++++-----------
 1 file changed, 28 insertions(+), 11 deletions(-)

diff --git a/SRC/dsuperlu_blas.c b/SRC/dsuperlu_blas.c
index 090a77cb..9b4cb79f 100644
--- a/SRC/dsuperlu_blas.c
+++ b/SRC/dsuperlu_blas.c
@@ -1,3 +1,24 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Wrapper functions to call BLAS.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Oak Ridge National Lab
+ * December 6, 2020
+ */
+
 #include "superlu_ddefs.h"
 
 #ifdef _CRAY
@@ -63,7 +84,8 @@ int superlu_dscal(const int n, const double alpha, double *x, const int incx)
     return 0;
 }
 
-int superlu_daxpy(const int n, const double alpha, const double *x, const int incx, double *y, const int incy)
+int superlu_daxpy(const int n, const double alpha,
+    const double *x, const int incx, double *y, const int incy)
 {
     daxpy_(&n, &alpha, x, &incx, y, &incy);
     return 0;
@@ -74,15 +96,10 @@ int superlu_dgemv(const char *trans, const int m,
                   const int lda, const double *x, const int incx,
                   const double beta, double *y, const int incy)
 {
-
 #ifdef USE_VENDOR_BLAS
-		dgemv_(trans, &m, &n, &alpha, a,
-           &lda, x, &incx, 
-           &beta, y, &incy, 1);
+    dgemv_(trans, &m, &n, &alpha, a, &lda, x, &incx, &beta, y, &incy, 1);
 #else
-		dgemv_(trans, &m, &n, &alpha, a,
-           &lda, x, &incx, 
-           &beta, y, &incy);
+    dgemv_(trans, &m, &n, &alpha, a, &lda, x, &incx, &beta, y, &incy);
 #endif
     
     return 0;
@@ -93,12 +110,12 @@ int superlu_dtrsv(char *uplo, char *trans, char *diag,
 {
 #ifdef _CRAY
     // _fcd ftcs = _cptofcd("N", strlen("N"));
-		STRSV(_cptofcd(uplo, strlen(uplo)), _cptofcd(trans, strlen(trans)), _cptofcd(diag, strlen(diag)), 
+    STRSV(_cptofcd(uplo, strlen(uplo)), _cptofcd(trans, strlen(trans)), _cptofcd(diag, strlen(diag)), 
          &n, a, &lda, x, &incx);
 #elif defined (USE_VENDOR_BLAS)
-		dtrsv_(uplo, trans, diag, &n, a, &lda, x, &incx, 1, 1, 1);
+    dtrsv_(uplo, trans, diag, &n, a, &lda, x, &incx, 1, 1, 1);
 #else
-		dtrsv_(uplo, trans, diag, &n, a, &lda, x, &incx);
+    dtrsv_(uplo, trans, diag, &n, a, &lda, x, &incx);
 #endif
     
     return 0;

From 1e94597a9d7c130534a56a7bdedef4e5571e1509 Mon Sep 17 00:00:00 2001
From: 7ps 
Date: Thu, 10 Dec 2020 13:46:08 -0500
Subject: [PATCH 049/147] Fix a compilation error; working with
 (r,c,d)=(2,2,2) on cg20

---
 SRC/zscatter3d.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/SRC/zscatter3d.c b/SRC/zscatter3d.c
index e7a1ae64..eeece774 100644
--- a/SRC/zscatter3d.c
+++ b/SRC/zscatter3d.c
@@ -262,8 +262,8 @@ zblock_gemm_scatter_lock( int_t lb, int_t j,
 
     /* calling ZGEMM */
     superlu_zgemm("N", "N", temp_nbrow, ncols, ldu, alpha,
-           L_mat[(knsupc - ldu)*ldl + cum_nrow], ldl,
-           U_mat[st_col * ldu], ldu, beta, tempv1, temp_nbrow);
+           &L_mat[(knsupc - ldu)*ldl + cum_nrow], ldl,
+           &U_mat[st_col * ldu], ldu, beta, tempv1, temp_nbrow);
     
 #if 0 // replaced by superlu_zgemm    
 #if 1

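The one-character fix above is a type repair: superlu_zgemm takes
doublecomplex* matrix arguments, while L_mat[(knsupc - ldu)*ldl + cum_nrow] is
a doublecomplex value, so the earlier call in zblock_gemm_scatter_lock did not
compile. Taking the address of the indexed element yields the pointer to the
sub-block, sketched with the names from the hunk:

    doublecomplex *Lsub = &L_mat[(knsupc - ldu)*ldl + cum_nrow]; /* OK: pointer */
    doublecomplex *Usub = &U_mat[st_col * ldu];
    /* L_mat[...] alone has type doublecomplex and cannot be passed
       where doublecomplex* is expected */
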
From d547f2b7dbcadea79d990cff750e1de67e5c002b Mon Sep 17 00:00:00 2001
From: 7ps 
Date: Thu, 10 Dec 2020 15:53:40 -0500
Subject: [PATCH 050/147] Add GPU compilation flags

---
 CMakeLists.txt | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index dc67bd12..a21b1b5a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -55,6 +55,15 @@ set(CMAKE_INSTALL_NAME_DIR "${CMAKE_INSTALL_PREFIX}/lib")
 # -fPIC is added in compiler flag.
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 
+find_package(CUDA REQUIRED)
+if (CUDA_FOUND)
+  if (NOT CMAKE_CUDA_FLAGS)
+    cuda_select_nvcc_arch_flags(CUDA_ARCH_FLAGS Auto)
+  endif()
+  set(CUDA_NVCC_FLAGS_RELEASE "-O3 --expt-relaxed-constexpr -DNDEBUG ${CMAKE_CUDA_FLAGS} ${CUDA_ARCH_FLAGS}")
+  set(CUDA_NVCC_FLAGS_DEBUG "-O0 --expt-relaxed-constexpr -DDEBUG -g ${CMAKE_CUDA_FLAGS} ${CUDA_ARCH_FLAGS}")
+endif()
+
 
 #---- For shared library
 

From 8ae91b3cac18a41a04dfc57524ddee39093832c4 Mon Sep 17 00:00:00 2001
From: Sherry Li 
Date: Sat, 12 Dec 2020 22:15:24 -0800
Subject: [PATCH 051/147] Correctly free A2d and A3d at the end of pzgssvx3d.c

---
 SRC/pzgssvx3d.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/SRC/pzgssvx3d.c b/SRC/pzgssvx3d.c
index 59e40a7f..34822fa0 100644
--- a/SRC/pzgssvx3d.c
+++ b/SRC/pzgssvx3d.c
@@ -1566,17 +1566,16 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 	A->Store = Astore3d; // restore Astore to 3D
 	
 	/* free A2d and B2d, which are allocated only in 2D layer Grid_0 */
+	NRformat_loc *A2d = A3d->A_nfmt;
 	if (grid3d->zscp.Iam == 0) {
-	    NRformat_loc *A2d = A3d->A_nfmt;
 	    SUPERLU_FREE( A2d->rowptr );
 	    SUPERLU_FREE( A2d->colind );
 	    SUPERLU_FREE( A2d->nzval );
-	    SUPERLU_FREE( A2d );         // free 2D structure
-	    SUPERLU_FREE(A3d->B2d);
-	    SUPERLU_FREE(A3d);           // free 3D structure
+	    SUPERLU_FREE( A3d->B2d );
 	}
+	SUPERLU_FREE( A2d );         // free 2D structure
+	SUPERLU_FREE( A3d );         // free 3D structure
 #endif
-
     
 #if ( DEBUGlevel>=1 )
 	CHECK_MALLOC (iam, "Exit pzgssvx3d()");

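The corrected epilogue separates payload from container: rowptr, colind, nzval,
and B2d are allocated only on the 2D layer (grid3d->zscp.Iam == 0), while the
A2d and A3d structs themselves exist on every layer, so only the payload frees
stay inside the guard. A condensed sketch, names from the hunk above:

    NRformat_loc *A2d = A3d->A_nfmt;
    if (grid3d->zscp.Iam == 0) {     /* payload lives only on layer 0 */
        SUPERLU_FREE(A2d->rowptr);
        SUPERLU_FREE(A2d->colind);
        SUPERLU_FREE(A2d->nzval);
        SUPERLU_FREE(A3d->B2d);
    }
    SUPERLU_FREE(A2d);               /* containers exist on all layers */
    SUPERLU_FREE(A3d);
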
From 639a8a7162ff64b0ccacf06a5efcb59801a350d9 Mon Sep 17 00:00:00 2001
From: Sherry Li 
Date: Wed, 16 Dec 2020 09:39:48 -0800
Subject: [PATCH 052/147] Resolved all the conflicts between Master and
 Version-7 branches.

---
 EXAMPLE/pddrive3d.c       |  20 ++---
 EXAMPLE/pzdrive3d.c       |  20 ++---
 SRC/communication_aux.c   |  35 ++++-----
 SRC/dcommunication_aux.c  |  52 ++++++++-----
 SRC/dreadMM.c             |   2 +-
 SRC/dscatter3d.c          |  52 ++++++++-----
 SRC/dtreeFactorization.c  |  14 ++--
 SRC/dtrfAux.c             |  29 +++++---
 SRC/dtrfCommWrapper.c     |  32 ++++----
 SRC/dutil_dist.c          |  62 +++++++++++++++-
 SRC/pd3dcomm.c            |  85 +++++++++++----------
 SRC/pdgssvx.c             |   1 +
 SRC/pdgssvx3d.c           |  26 ++++---
 SRC/pdgssvx_ABglobal.c    |   2 +
 SRC/pdgstrf2.c            |  14 ++--
 SRC/pdgstrf3d.c           |  15 ++--
 SRC/psymbfact.c           |   6 +-
 SRC/pz3dcomm.c            |  71 +++++++++---------
 SRC/pzgssvx3d.c           |  22 +++---
 SRC/pzgstrf2.c            |  18 ++---
 SRC/pzgstrf3d.c           |  15 ++--
 SRC/sec_structs.c         |   1 -
 SRC/superlu_ddefs.h       |  52 ++++++-------
 SRC/superlu_defs.h        |  25 +++----
 SRC/superlu_dist_config.h |   2 +-
 SRC/superlu_grid3d.c      |   8 +-
 SRC/superlu_zdefs.h       | 151 +++++---------------------------------
 SRC/supernodal_etree.c    |  23 +++---
 SRC/trfAux.c              | 134 ++++++++++++++++++---------------
 SRC/util.c                |  57 --------------
 SRC/zcommunication_aux.c  |  52 ++++++++-----
 SRC/zreadMM.c             |   2 +-
 SRC/zscatter3d.c          |  40 +++++-----
 SRC/ztreeFactorization.c  |  14 ++--
 SRC/ztrfAux.c             |  28 ++++---
 SRC/ztrfCommWrapper.c     |  32 ++++----
 SRC/zutil_dist.c          |  57 ++++++++++++++
 37 files changed, 642 insertions(+), 629 deletions(-)

diff --git a/EXAMPLE/pddrive3d.c b/EXAMPLE/pddrive3d.c
index 553b7741..4f4f1a4b 100644
--- a/EXAMPLE/pddrive3d.c
+++ b/EXAMPLE/pddrive3d.c
@@ -105,9 +105,9 @@ main (int argc, char *argv[])
     superlu_dist_options_t options;
     SuperLUStat_t stat;
     SuperMatrix A;  // Now, A is on all 3D processes  
-    ScalePermstruct_t ScalePermstruct;
-    LUstruct_t LUstruct;
-    SOLVEstruct_t SOLVEstruct;
+    dScalePermstruct_t ScalePermstruct;
+    dLUstruct_t LUstruct;
+    dSOLVEstruct_t SOLVEstruct;
     gridinfo3d_t grid;
     double *berr;
     double *b, *xtrue;
@@ -336,8 +336,8 @@ main (int argc, char *argv[])
 #endif    
 
     /* Initialize ScalePermstruct and LUstruct. */
-    ScalePermstructInit (m, n, &ScalePermstruct);
-    LUstructInit (n, &LUstruct);
+    dScalePermstructInit (m, n, &ScalePermstruct);
+    dLUstructInit (n, &LUstruct);
 
     /* Initialize the statistics variables. */
     PStatInit (&stat);
@@ -362,21 +362,21 @@ main (int argc, char *argv[])
 
 	PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/
 
-        Destroy_LU (n, &(grid.grid2d), &LUstruct);
+        dDestroy_LU (n, &(grid.grid2d), &LUstruct);
         if (options.SolveInitialized) {
             dSolveFinalize (&options, &SOLVEstruct);
         }
     } else { // Process layers not equal 0
-        DeAllocLlu_3d(n, &LUstruct, &grid);
-        DeAllocGlu_3d(&LUstruct);
+        dDeAllocLlu_3d(n, &LUstruct, &grid);
+        dDeAllocGlu_3d(&LUstruct);
     }
 
     Destroy_CompRowLoc_Matrix_dist (&A);
     SUPERLU_FREE (b);
     SUPERLU_FREE (xtrue);
     SUPERLU_FREE (berr);
-    ScalePermstructFree (&ScalePermstruct);
-    LUstructFree (&LUstruct);
+    dScalePermstructFree (&ScalePermstruct);
+    dLUstructFree (&LUstruct);
     PStatFree (&stat);
 
     /* ------------------------------------------------------------
diff --git a/EXAMPLE/pzdrive3d.c b/EXAMPLE/pzdrive3d.c
index be3522fb..3cd975e8 100644
--- a/EXAMPLE/pzdrive3d.c
+++ b/EXAMPLE/pzdrive3d.c
@@ -105,9 +105,9 @@ main (int argc, char *argv[])
     superlu_dist_options_t options;
     SuperLUStat_t stat;
     SuperMatrix A;  // Now, A is on all 3D processes  
-    ScalePermstruct_t ScalePermstruct;
-    LUstruct_t LUstruct;
-    SOLVEstruct_t SOLVEstruct;
+    zScalePermstruct_t ScalePermstruct;
+    zLUstruct_t LUstruct;
+    zSOLVEstruct_t SOLVEstruct;
     gridinfo3d_t grid;
     double *berr;
     doublecomplex *b, *xtrue;
@@ -336,8 +336,8 @@ main (int argc, char *argv[])
 #endif    
 
     /* Initialize ScalePermstruct and LUstruct. */
-    ScalePermstructInit (m, n, &ScalePermstruct);
-    LUstructInit (n, &LUstruct);
+    zScalePermstructInit (m, n, &ScalePermstruct);
+    zLUstructInit (n, &LUstruct);
 
     /* Initialize the statistics variables. */
     PStatInit (&stat);
@@ -362,21 +362,21 @@ main (int argc, char *argv[])
 
 	PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/
 
-        Destroy_LU (n, &(grid.grid2d), &LUstruct);
+        zDestroy_LU (n, &(grid.grid2d), &LUstruct);
         if (options.SolveInitialized) {
             zSolveFinalize (&options, &SOLVEstruct);
         }
     } else { // Process layers not equal 0
-        DeAllocLlu_3d(n, &LUstruct, &grid);
-        DeAllocGlu_3d(&LUstruct);
+        zDeAllocLlu_3d(n, &LUstruct, &grid);
+        zDeAllocGlu_3d(&LUstruct);
     }
 
     Destroy_CompRowLoc_Matrix_dist (&A);
     SUPERLU_FREE (b);
     SUPERLU_FREE (xtrue);
     SUPERLU_FREE (berr);
-    ScalePermstructFree (&ScalePermstruct);
-    LUstructFree (&LUstruct);
+    zScalePermstructFree (&ScalePermstruct);
+    zLUstructFree (&LUstruct);
     PStatFree (&stat);
 
     /* ------------------------------------------------------------
diff --git a/SRC/communication_aux.c b/SRC/communication_aux.c
index 9529bcdd..f3b3791e 100644
--- a/SRC/communication_aux.c
+++ b/SRC/communication_aux.c
@@ -21,7 +21,7 @@ int_t Wait_LSend
 /*wait till broadcast of L finished*/
 (int_t k, gridinfo_t *grid, int **ToSendR, MPI_Request *send_req, SCT_t* SCT)
 {
-    unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
     int_t Pc = grid->npcol;
     int_t iam = grid->iam;
     int_t lk = LBj (k, grid);
@@ -36,7 +36,7 @@ int_t Wait_LSend
             MPI_Wait (&send_req[pj + Pc], &status);
         }
     }
-    SCT->Wait_LSend_tl += (double) ( _rdtsc() - t1);
+    SCT->Wait_LSend_tl += ( SuperLU_timer_() - t1);
     return 0;
 }
 
@@ -45,7 +45,7 @@ int_t Wait_USend
 /*wait till broadcast of U panels finished*/
 ( MPI_Request *send_req, gridinfo_t *grid, SCT_t* SCT)
 {
-    unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
     int_t iam = grid->iam;
     int_t Pr = grid->nprow;
     int_t myrow = MYROW (iam, grid);
@@ -58,7 +58,7 @@ int_t Wait_USend
             MPI_Wait (&send_req[pi + Pr], &status);
         }
     }
-    SCT->Wait_USend_tl += (double) ( _rdtsc() - t1);
+    SCT->Wait_USend_tl += (double) (SuperLU_timer_() - t1);
     return 0;
 }
 
@@ -103,7 +103,7 @@ int_t Wait_UDiagBlockSend(MPI_Request *U_diag_blk_send_req,
                           gridinfo_t * grid, SCT_t* SCT)
 {
 
-    unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
     int_t iam = grid->iam;
     int_t Pr = grid->nprow;
     int_t myrow = MYROW (iam, grid);
@@ -115,15 +115,14 @@ int_t Wait_UDiagBlockSend(MPI_Request *U_diag_blk_send_req,
             MPI_Wait (U_diag_blk_send_req + pr, &status);
         }
     }
-    SCT->Wait_UDiagBlockSend_tl += (double) ( _rdtsc() - t1);
+    SCT->Wait_UDiagBlockSend_tl += (double) ( SuperLU_timer_() - t1);
     return 0;
 }
 
 int_t Wait_LDiagBlockSend(MPI_Request *L_diag_blk_send_req,
                           gridinfo_t * grid, SCT_t* SCT)
 {
-
-    unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
     int_t iam = grid->iam;
     int_t Pc = grid->npcol;
     int_t mycol = MYCOL (iam, grid);
@@ -135,51 +134,49 @@ int_t Wait_LDiagBlockSend(MPI_Request *L_diag_blk_send_req,
             MPI_Wait (L_diag_blk_send_req + pc, &status);
         }
     }
-    SCT->Wait_UDiagBlockSend_tl += (double) ( _rdtsc() - t1);
+    SCT->Wait_UDiagBlockSend_tl += (double) ( SuperLU_timer_() - t1);
     return 0;
 }
 
 
 int_t Wait_UDiagBlock_Recv( MPI_Request *request, SCT_t* SCT)
 {
-    unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
     MPI_Status status;
     MPI_Wait(request, &status);
-    SCT->Wait_UDiagBlock_Recv_tl += (double) ( _rdtsc() - t1);
+    SCT->Wait_UDiagBlock_Recv_tl += (double) ( SuperLU_timer_() - t1);
     return 0;
-
 }
 
 int_t Test_UDiagBlock_Recv( MPI_Request *request, SCT_t* SCT)
 {
-    unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
     MPI_Status status;
     int flag;
     MPI_Test(request,&flag, &status);
-    SCT->Wait_UDiagBlock_Recv_tl += (double) ( _rdtsc() - t1);
+    SCT->Wait_UDiagBlock_Recv_tl += (double) ( SuperLU_timer_() - t1);
     return flag;
 
 }
 
 int_t Wait_LDiagBlock_Recv( MPI_Request *request, SCT_t* SCT)
 {
-    unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
     MPI_Status status;
     MPI_Wait(request, &status);
-    SCT->Wait_LDiagBlock_Recv_tl += (double) ( _rdtsc() - t1);
+    SCT->Wait_LDiagBlock_Recv_tl += (double) ( SuperLU_timer_() - t1);
     return 0;
 
 }
 
 int_t Test_LDiagBlock_Recv( MPI_Request *request, SCT_t* SCT)
 {
-    unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
     MPI_Status status;
     int flag;
     MPI_Test(request, &flag, &status);
-    SCT->Wait_LDiagBlock_Recv_tl += (double) ( _rdtsc() - t1);
+    SCT->Wait_LDiagBlock_Recv_tl += (double) ( SuperLU_timer_() - t1);
     return flag;
-
 }
 
 /*
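
A recurring change throughout this patch: the x86-only _rdtsc() cycle counter
gives way to the portable SuperLU_timer_() wall clock. SuperLU_timer_() returns
seconds as a double, so the remaining (double) casts are redundant rather than
wrong. The pattern, sketched:

    double t1 = SuperLU_timer_();   /* seconds, portable across platforms */
    MPI_Wait(request, &status);
    SCT->Wait_UDiagBlock_Recv_tl += SuperLU_timer_() - t1;
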
diff --git a/SRC/dcommunication_aux.c b/SRC/dcommunication_aux.c
index 608fba76..ef9d6da6 100644
--- a/SRC/dcommunication_aux.c
+++ b/SRC/dcommunication_aux.c
@@ -72,7 +72,8 @@ int_t dBcast_LPanel
  int* msgcnt,  int **ToSendR, int_t *xsup , SCT_t* SCT,
  int tag_ub)
 {
-    unsigned long long t1 = _rdtsc();
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
     int_t Pc = grid->npcol;
     int_t lk = LBj (k, grid);
     superlu_scope_t *scp = &grid->rscp;  /* The scope of process row. */
@@ -101,7 +102,8 @@ int_t dBcast_LPanel
 
         }
     }
-    SCT->Bcast_UPanel_tl += (double) ( _rdtsc() - t1);
+    //SCT->Bcast_UPanel_tl += (double) ( _rdtsc() - t1);
+    SCT->Bcast_UPanel_tl +=  SuperLU_timer_() - t1;
     return 0;
 }
 
@@ -156,7 +158,8 @@ int_t dBcast_UPanel(int_t k, int_t k0, int_t* usub,
 		    int* msgcnt, int *ToSendD, SCT_t* SCT, int tag_ub)
 
 {
-    unsigned long long t1 = _rdtsc();
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
     int_t iam = grid->iam;
     int_t lk = LBi (k, grid);
     int_t Pr = grid->nprow;
@@ -188,14 +191,15 @@ int_t dBcast_UPanel(int_t k, int_t k0, int_t* usub,
             }       /* if pi ... */
         }           /* for pi ... */
     }
-    SCT->Bcast_UPanel_tl += (double) ( _rdtsc() - t1);
+    //SCT->Bcast_UPanel_tl += (double) ( _rdtsc() - t1);
+    SCT->Bcast_UPanel_tl += SuperLU_timer_() - t1;
     return 0;
 }
 
 int_t dIrecv_LPanel
 /*it places Irecv call for L panel*/
 (int_t k, int_t k0,  int_t* Lsub_buf, double* Lval_buf,
- gridinfo_t *grid, MPI_Request *recv_req, LocalLU_t *Llu, int tag_ub )
+ gridinfo_t *grid, MPI_Request *recv_req, dLocalLU_t *Llu, int tag_ub )
 {
     int_t kcol = PCOL (k, grid);
 
@@ -212,7 +216,7 @@ int_t dIrecv_LPanel
 
 int_t dIrecv_UPanel
 /*it places Irecv calls to receive U panels*/
-(int_t k, int_t k0, int_t* Usub_buf, double* Uval_buf, LocalLU_t *Llu,
+(int_t k, int_t k0, int_t* Usub_buf, double* Uval_buf, dLocalLU_t *Llu,
  gridinfo_t* grid, MPI_Request *recv_req_u, int tag_ub )
 {
     int_t krow = PROW (k, grid);
@@ -230,13 +234,15 @@ int_t dIrecv_UPanel
 int_t dWait_URecv
 ( MPI_Request *recv_req, int* msgcnt, SCT_t* SCT)
 {
-    unsigned long long t1 = _rdtsc();
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
     MPI_Status status;
     MPI_Wait (&recv_req[0], &status);
     MPI_Get_count (&status, mpi_int_t, &msgcnt[2]);
     MPI_Wait (&recv_req[1], &status);
     MPI_Get_count (&status, MPI_DOUBLE, &msgcnt[3]);
-    SCT->Wait_URecv_tl += (double) ( _rdtsc() - t1);
+    //SCT->Wait_URecv_tl += (double) ( _rdtsc() - t1);
+    SCT->Wait_URecv_tl +=  SuperLU_timer_() - t1;
     return 0;
 }
 
@@ -244,7 +250,8 @@ int_t dWait_LRecv
 /*waits till L blocks have been received*/
 (  MPI_Request* recv_req, int* msgcnt, int* msgcntsU, gridinfo_t * grid, SCT_t* SCT)
 {
-    unsigned long long t1 = _rdtsc();
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
     MPI_Status status;
     
     if (recv_req[0] != MPI_REQUEST_NULL)
@@ -268,7 +275,8 @@ int_t dWait_LRecv
     {
         msgcnt[1] = msgcntsU[1];
     }
-    SCT->Wait_LRecv_tl += (double) ( _rdtsc() - t1);
+    //SCT->Wait_LRecv_tl += (double) ( _rdtsc() - t1);
+    SCT->Wait_LRecv_tl +=  SuperLU_timer_() - t1;
     return 0;
 }
 
@@ -304,7 +312,8 @@ int_t dRecv_UDiagBlock(int_t k0, double *ublk_ptr, /*pointer for the diagonal bl
                       int_t src,
                       gridinfo_t * grid, SCT_t* SCT, int tag_ub)
 {
-    unsigned long long t1 = _rdtsc();
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
     MPI_Status status;
     MPI_Comm comm = (grid->cscp).comm;
     /* tag = ((k0<<2)+2) % tag_ub;        */
@@ -312,13 +321,14 @@ int_t dRecv_UDiagBlock(int_t k0, double *ublk_ptr, /*pointer for the diagonal bl
 
     MPI_Recv (ublk_ptr, size, MPI_DOUBLE, src,
               SLU_MPI_TAG (4, k0), comm, &status);
-    SCT->Recv_UDiagBlock_tl += (double) ( _rdtsc() - t1);
+    //SCT->Recv_UDiagBlock_tl += (double) ( _rdtsc() - t1);
+    SCT->Recv_UDiagBlock_tl +=  SuperLU_timer_() - t1;
     return 0;
 }
 
 
 int_t dPackLBlock(int_t k, double* Dest, Glu_persist_t *Glu_persist,
-                  gridinfo_t *grid, LocalLU_t *Llu)
+                  gridinfo_t *grid, dLocalLU_t *Llu)
 /*Copies src matrix into dest matrix*/
 {
     /* Initialization. */
@@ -375,7 +385,8 @@ int_t dIRecv_UDiagBlock(int_t k0, double *ublk_ptr, /*pointer for the diagonal b
                        MPI_Request *U_diag_blk_recv_req,
                        gridinfo_t * grid, SCT_t* SCT, int tag_ub)
 {
-    unsigned long long t1 = _rdtsc();
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
     MPI_Comm comm = (grid->cscp).comm;
     /* tag = ((k0<<2)+2) % tag_ub;        */
     /* tag = (4*(nsupers+k0)+2) % tag_ub; */
@@ -386,7 +397,8 @@ int_t dIRecv_UDiagBlock(int_t k0, double *ublk_ptr, /*pointer for the diagonal b
     {
         printf("Error in IRecv_UDiagBlock count\n");
     }
-    SCT->Recv_UDiagBlock_tl += (double) ( _rdtsc() - t1);
+    //SCT->Recv_UDiagBlock_tl += (double) ( _rdtsc() - t1);
+    SCT->Recv_UDiagBlock_tl += SuperLU_timer_() - t1;
     return 0;
 }
 
@@ -396,7 +408,8 @@ int_t dIRecv_LDiagBlock(int_t k0, double *L_blk_ptr, /*pointer for the diagonal
                        MPI_Request *L_diag_blk_recv_req,
                        gridinfo_t * grid, SCT_t* SCT, int tag_ub)
 {
-    unsigned long long t1 = _rdtsc();
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
     MPI_Comm comm = (grid->rscp).comm;
     /* tag = ((k0<<2)+2) % tag_ub;        */
     /* tag = (4*(nsupers+k0)+2) % tag_ub; */
@@ -408,7 +421,8 @@ int_t dIRecv_LDiagBlock(int_t k0, double *L_blk_ptr, /*pointer for the diagonal
     {
         printf("Error in IRecv_lDiagBlock count\n");
     }
-    SCT->Recv_UDiagBlock_tl += (double) ( _rdtsc() - t1);
+    //SCT->Recv_UDiagBlock_tl += (double) ( _rdtsc() - t1);
+    SCT->Recv_UDiagBlock_tl +=  SuperLU_timer_() - t1;
     return 0;
 }
 
@@ -449,9 +463,9 @@ int_t dIBcast_LDiagBlock(int_t k, double *lblk_ptr, /*pointer for the diagonal b
 int_t dUDiagBlockRecvWait( int_t k,  int_t* IrecvPlcd_D, int_t* factored_L,
                            MPI_Request * U_diag_blk_recv_req,
                            gridinfo_t *grid,
-                           LUstruct_t *LUstruct, SCT_t *SCT)
+                           dLUstruct_t *LUstruct, SCT_t *SCT)
 {
-    LocalLU_t *Llu = LUstruct->Llu;
+    dLocalLU_t *Llu = LUstruct->Llu;
 
     int_t iam = grid->iam;
 
diff --git a/SRC/dreadMM.c b/SRC/dreadMM.c
index 53b02ab0..b36712cc 100644
--- a/SRC/dreadMM.c
+++ b/SRC/dreadMM.c
@@ -61,7 +61,7 @@ dreadMM_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
 
      if (sscanf(line, "%s %s %s %s %s", banner, mtx, crd, arith, sym) != 5) {
        printf("Invalid header (first line does not contain 5 tokens)\n");
-       exit;
+       exit(-1);
      }
 
      if(strcmp(banner,"%%matrixmarket")) {
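
A note on the dreadMM.c hunk: the original statement "exit;" only evaluates the
address of exit() and discards it (a no-op most compilers flag with
-Wunused-value), so the invalid-header path fell through; exit(-1) actually
terminates:

    if (sscanf(line, "%s %s %s %s %s", banner, mtx, crd, arith, sym) != 5) {
        printf("Invalid header (first line does not contain 5 tokens)\n");
        exit(-1);   /* "exit;" alone was a no-op expression statement */
    }
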
diff --git a/SRC/dscatter3d.c b/SRC/dscatter3d.c
index c2edb2b9..73d4df7d 100644
--- a/SRC/dscatter3d.c
+++ b/SRC/dscatter3d.c
@@ -152,7 +152,8 @@ dblock_gemm_scatter( int_t lb, int_t j,
 
     // printf("SCU update: (%d, %d)\n",ib,jb );
 #ifdef SCATTER_PROFILE
-    unsigned long long ttx = __rdtsc();
+    //unsigned long long ttx = __rdtsc();
+    double ttx = SuperLU_timer_();
 #endif
     /*Now scattering the block*/
     if (ib < jb)
@@ -185,7 +186,8 @@ dblock_gemm_scatter( int_t lb, int_t j,
     // stat->ops[FACT] += 2*temp_nbrow*ncols*ldu + temp_nbrow*ncols;
 
 #ifdef SCATTER_PROFILE
-    double t_s = (double) __rdtsc() - ttx;
+    //double t_s = (double) __rdtsc() - ttx;
+    double t_s = SuperLU_timer_() - ttx;
     Host_TheadScatterMOP[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
     += 3.0 * (double ) temp_nbrow * (double ) ncols;
     Host_TheadScatterTimer[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
@@ -256,7 +258,8 @@ dblock_gemm_scatter_lock( int_t lb, int_t j,
         }
 
 #ifdef SCATTER_PROFILE
-    unsigned long long ttx = __rdtsc();
+    //unsigned long long ttx = __rdtsc();
+    double ttx = SuperLU_timer_();
 #endif
     /*Now scattering the block*/
     if (ib < jb)
@@ -289,7 +292,8 @@ dblock_gemm_scatter_lock( int_t lb, int_t j,
         omp_unset_lock(lock);
 
 #ifdef SCATTER_PROFILE
-    double t_s = (double) __rdtsc() - ttx;
+    //double t_s = (double) __rdtsc() - ttx;
+    double t_s = SuperLU_timer_() - ttx;
     Host_TheadScatterMOP[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
     += 3.0 * (double ) temp_nbrow * (double ) ncols;
     Host_TheadScatterTimer[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
@@ -325,13 +329,13 @@ int_t dblock_gemm_scatterTopLeft( int_t lb, /* block number in L */
                                  double* bigV, int_t knsupc,  int_t klst,
 				 int_t* lsub, int_t * usub, int_t ldt,
 				 int* indirect, int* indirect2, HyP_t* HyP,
-                                 LUstruct_t *LUstruct,
+                                 dLUstruct_t *LUstruct,
                                  gridinfo_t* grid,
                                  SCT_t*SCT, SuperLUStat_t *stat
                                )
 {
     Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    LocalLU_t *Llu = LUstruct->Llu;
+    dLocalLU_t *Llu = LUstruct->Llu;
     int_t* xsup = Glu_persist->xsup;
     int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
     int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
@@ -344,7 +348,8 @@ int_t dblock_gemm_scatterTopLeft( int_t lb, /* block number in L */
 #endif
     
 //    printf("Thread's ID %lld \n", thread_id);
-    unsigned long long t1 = _rdtsc();
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
     dblock_gemm_scatter( lb, j, HyP->Ublock_info, HyP->lookAhead_info,
 			HyP->lookAhead_L_buff, HyP->Lnbrow,
                         HyP->bigU_host, HyP->ldu,
@@ -356,7 +361,8 @@ int_t dblock_gemm_scatterTopLeft( int_t lb, /* block number in L */
                         , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
 #endif
                       );
-    unsigned long long t2 = _rdtsc();
+    //unsigned long long t2 = _rdtsc();
+    double t2 = SuperLU_timer_();
     SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1);
     return 0;
 } /* dgemm_scatterTopLeft */
@@ -365,13 +371,13 @@ int_t dblock_gemm_scatterTopRight( int_t lb,  int_t j,
                                   double* bigV, int_t knsupc,  int_t klst, int_t* lsub,
                                   int_t * usub, int_t ldt,  int* indirect, int* indirect2,
                                   HyP_t* HyP,
-                                  LUstruct_t *LUstruct,
+                                  dLUstruct_t *LUstruct,
                                   gridinfo_t* grid,
                                   SCT_t*SCT, SuperLUStat_t *stat
                                 )
 {
     Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    LocalLU_t *Llu = LUstruct->Llu;
+    dLocalLU_t *Llu = LUstruct->Llu;
     int_t* xsup = Glu_persist->xsup;
     int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
     int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
@@ -382,7 +388,8 @@ int_t dblock_gemm_scatterTopRight( int_t lb,  int_t j,
 #else
     volatile  int_t thread_id = 0;
 #endif
-    unsigned long long t1 = _rdtsc();
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
     dblock_gemm_scatter( lb, j, HyP->Ublock_info_Phi, HyP->lookAhead_info, HyP->lookAhead_L_buff, HyP->Lnbrow,
                         HyP->bigU_Phi, HyP->ldu_Phi,
                         bigV, knsupc,  klst, lsub,  usub, ldt, thread_id, indirect, indirect2,
@@ -391,7 +398,8 @@ int_t dblock_gemm_scatterTopRight( int_t lb,  int_t j,
                         , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
 #endif
                       );
-    unsigned long long t2 = _rdtsc();
+    //unsigned long long t2 = _rdtsc();
+    double t2 = SuperLU_timer_();
     SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1);
     return 0;
 } /* dblock_gemm_scatterTopRight */
@@ -400,13 +408,13 @@ int_t dblock_gemm_scatterBottomLeft( int_t lb,  int_t j,
                                     double* bigV, int_t knsupc,  int_t klst, int_t* lsub,
                                     int_t * usub, int_t ldt,  int* indirect, int* indirect2,
                                     HyP_t* HyP,
-                                    LUstruct_t *LUstruct,
+                                    dLUstruct_t *LUstruct,
                                     gridinfo_t* grid,
                                     SCT_t*SCT, SuperLUStat_t *stat
                                   )
 {
     Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    LocalLU_t *Llu = LUstruct->Llu;
+    dLocalLU_t *Llu = LUstruct->Llu;
     int_t* xsup = Glu_persist->xsup;
     int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
     int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
@@ -418,7 +426,8 @@ int_t dblock_gemm_scatterBottomLeft( int_t lb,  int_t j,
     volatile int_t thread_id = 0;
 #endif
     //printf("Thread's ID %lld \n", thread_id);
-    unsigned long long t1 = _rdtsc();
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
     dblock_gemm_scatter( lb, j, HyP->Ublock_info, HyP->Remain_info, HyP->Remain_L_buff, HyP->Rnbrow,
                         HyP->bigU_host, HyP->ldu,
                         bigV, knsupc,  klst, lsub,  usub, ldt, thread_id, indirect, indirect2,
@@ -427,7 +436,8 @@ int_t dblock_gemm_scatterBottomLeft( int_t lb,  int_t j,
                         , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
 #endif
                       );
-    unsigned long long t2 = _rdtsc();
+    //unsigned long long t2 = _rdtsc();
+    double t2 = SuperLU_timer_();
     SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1);
     return 0;
 
@@ -437,13 +447,13 @@ int_t dblock_gemm_scatterBottomRight( int_t lb,  int_t j,
                                      double* bigV, int_t knsupc,  int_t klst, int_t* lsub,
                                      int_t * usub, int_t ldt,  int* indirect, int* indirect2,
                                      HyP_t* HyP,
-                                     LUstruct_t *LUstruct,
+                                     dLUstruct_t *LUstruct,
                                      gridinfo_t* grid,
                                      SCT_t*SCT, SuperLUStat_t *stat
                                    )
 {
     Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    LocalLU_t *Llu = LUstruct->Llu;
+    dLocalLU_t *Llu = LUstruct->Llu;
     int_t* xsup = Glu_persist->xsup;
     int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
     int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
@@ -455,7 +465,8 @@ int_t dblock_gemm_scatterBottomRight( int_t lb,  int_t j,
     volatile  int_t thread_id = 0;
 #endif
    // printf("Thread's ID %lld \n", thread_id);
-    unsigned long long t1 = _rdtsc();
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
     dblock_gemm_scatter( lb, j, HyP->Ublock_info_Phi, HyP->Remain_info, HyP->Remain_L_buff, HyP->Rnbrow,
                         HyP->bigU_Phi, HyP->ldu_Phi,
                         bigV, knsupc,  klst, lsub,  usub, ldt, thread_id, indirect, indirect2,
@@ -465,7 +476,8 @@ int_t dblock_gemm_scatterBottomRight( int_t lb,  int_t j,
 #endif
                       );
 
-    unsigned long long t2 = _rdtsc();
+    //unsigned long long t2 = _rdtsc();
+    double t2 = SuperLU_timer_();    
     SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1);
     return 0;
 
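
Each dblock_gemm_scatter* variant times its GEMM-plus-scatter with the same
wall clock. Note the thread_id * CACHE_LINE_SIZE stride: per-thread accumulators
are padded onto separate cache lines so that concurrent updates from different
threads do not false-share. The timing pattern, sketched:

    double t1 = SuperLU_timer_();
    /* ... GEMM + scatter for this (lb, j) block pair ... */
    double t2 = SuperLU_timer_();
    SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += t2 - t1;
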
diff --git a/SRC/dtreeFactorization.c b/SRC/dtreeFactorization.c
index 43ef7de2..78cf5ec2 100644
--- a/SRC/dtreeFactorization.c
+++ b/SRC/dtreeFactorization.c
@@ -24,9 +24,9 @@ at the top-level directory.
 #include "trfCommWrapper.h"
 #endif
 
-int_t dLluBufInit(dLUValSubBuf_t* LUvsb, LUstruct_t *LUstruct)
+int_t dLluBufInit(dLUValSubBuf_t* LUvsb, dLUstruct_t *LUstruct)
 {
-    LocalLU_t *Llu = LUstruct->Llu;
+    dLocalLU_t *Llu = LUstruct->Llu;
     LUvsb->Lsub_buf = intMalloc_dist(Llu->bufmax[0]); //INT_T_ALLOC(Llu->bufmax[0]);
     LUvsb->Lval_buf = doubleMalloc_dist(Llu->bufmax[1]); //DOUBLE_ALLOC(Llu->bufmax[1]);
     LUvsb->Usub_buf = intMalloc_dist(Llu->bufmax[2]); //INT_T_ALLOC(Llu->bufmax[2]);
@@ -71,7 +71,7 @@ int dfreeDiagFactBufsArr(int_t mxLeafNode, diagFactBufs_t** dFBufs)
     return 0;
 }
 
-dLUValSubBuf_t** dLluBufInitArr(int_t numLA, LUstruct_t *LUstruct)
+dLUValSubBuf_t** dLluBufInitArr(int_t numLA, dLUstruct_t *LUstruct)
 {
     dLUValSubBuf_t** LUvsbs = (dLUValSubBuf_t**) SUPERLU_MALLOC(numLA * sizeof(dLUValSubBuf_t*));
     for (int_t i = 0; i < numLA; ++i)
@@ -100,7 +100,7 @@ int dLluBufFreeArr(int_t numLA, dLUValSubBuf_t **LUvsbs)
 
 int_t dinitScuBufs(int_t ldt, int_t num_threads, int_t nsupers,
                   scuBufs_t* scuBufs,
-                  LUstruct_t* LUstruct,
+                  dLUstruct_t* LUstruct,
                   gridinfo_t * grid)
 {
     scuBufs->bigV = dgetBigV(ldt, num_threads);
@@ -137,13 +137,13 @@ int_t ddenseTreeFactor(
     superlu_dist_options_t *options,
     int_t * gIperm_c_supno,
     int_t ldt,
-    LUstruct_t *LUstruct, gridinfo3d_t * grid3d, SuperLUStat_t *stat,
+    dLUstruct_t *LUstruct, gridinfo3d_t * grid3d, SuperLUStat_t *stat,
     double thresh,  SCT_t *SCT, int tag_ub,
     int *info
 )
 {
     gridinfo_t* grid = &(grid3d->grid2d);
-    LocalLU_t *Llu = LUstruct->Llu;
+    dLocalLU_t *Llu = LUstruct->Llu;
 
     /*main loop over all the super nodes*/
     for (int_t k0 = 0; k0 < nnodes   ; ++k0)
@@ -300,7 +300,7 @@ int_t dsparseTreeFactor_ASYNC(
     int_t * gIperm_c_supno,
     int_t ldt,
     HyP_t* HyP,
-    LUstruct_t *LUstruct, gridinfo3d_t * grid3d, SuperLUStat_t *stat,
+    dLUstruct_t *LUstruct, gridinfo3d_t * grid3d, SuperLUStat_t *stat,
     double thresh,  SCT_t *SCT, int tag_ub,
     int *info
 )
diff --git a/SRC/dtrfAux.c b/SRC/dtrfAux.c
index 869d2fbb..ad08eef4 100644
--- a/SRC/dtrfAux.c
+++ b/SRC/dtrfAux.c
@@ -27,7 +27,7 @@ at the top-level directory.
 #endif
 
 /* Initialize the data structure to assist HALO offload of Schur-complement. */
-void dInit_HyP(HyP_t* HyP, LocalLU_t *Llu, int_t mcb, int_t mrb )
+void dInit_HyP(HyP_t* HyP, dLocalLU_t *Llu, int_t mcb, int_t mrb )
 {
     HyP->last_offload = -1;
 #if 0
@@ -70,7 +70,7 @@ void dInit_HyP(HyP_t* HyP, LocalLU_t *Llu, int_t mcb, int_t mrb )
 
 /*init3DLUstruct with forest interface */
 void dinit3DLUstructForest( int_t* myTreeIdxs, int_t* myZeroTrIdxs,
-                           sForest_t**  sForests, LUstruct_t* LUstruct,
+                           sForest_t**  sForests, dLUstruct_t* LUstruct,
                            gridinfo3d_t* grid3d)
 {
     int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
@@ -114,11 +114,11 @@ int_t dSchurComplementSetup(
     int_t* Usub_buf,
     double *Uval_buf,
     gridinfo_t *grid,
-    LUstruct_t *LUstruct
+    dLUstruct_t *LUstruct
 )
 {
     Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    LocalLU_t *Llu = LUstruct->Llu;
+    dLocalLU_t *Llu = LUstruct->Llu;
     int_t* xsup = Glu_persist->xsup;
 
     int* ToRecv = Llu->ToRecv;
@@ -338,7 +338,7 @@ int_t dSchurComplementSetupGPU(
     int_t* iperm_c_supno, int_t*perm_c_supno,
     gEtreeInfo_t*   gEtreeInfo, factNodelists_t* fNlists,
     scuBufs_t* scuBufs, dLUValSubBuf_t* LUvsb,
-    gridinfo_t *grid, LUstruct_t *LUstruct,
+    gridinfo_t *grid, dLUstruct_t *LUstruct,
     HyP_t* HyP)
 {
     int_t * Lsub_buf  = LUvsb->Lsub_buf;
@@ -353,7 +353,7 @@ int_t dSchurComplementSetupGPU(
     double* bigU = scuBufs->bigU;
 
     Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    LocalLU_t *Llu = LUstruct->Llu;
+    dLocalLU_t *Llu = LUstruct->Llu;
     int_t* xsup = Glu_persist->xsup;
 
     int* ToRecv = Llu->ToRecv;
@@ -497,8 +497,7 @@ double* dgetBigV(int_t ldt, int_t num_threads)
     return bigV;
 }
 
-double* dgetBigU(int_t nsupers, gridinfo_t *grid,
-                    LUstruct_t *LUstruct)
+double* dgetBigU(int_t nsupers, gridinfo_t *grid, dLUstruct_t *LUstruct)
 {
     int_t Pr = grid->nprow;
     int_t Pc = grid->npcol;
@@ -542,9 +541,10 @@ double* dgetBigU(int_t nsupers, gridinfo_t *grid,
     return bigU;
 } /* dgetBigU */
 
+
 trf3Dpartition_t* dinitTrf3Dpartition(int_t nsupers,
 				      superlu_dist_options_t *options,
-				      LUstruct_t *LUstruct, gridinfo3d_t * grid3d
+				      dLUstruct_t *LUstruct, gridinfo3d_t * grid3d
 				      )
 {
     gridinfo_t* grid = &(grid3d->grid2d);
@@ -553,7 +553,12 @@ trf3Dpartition_t* dinitTrf3Dpartition(int_t nsupers,
     int iam = grid3d->iam;
     CHECK_MALLOC (iam, "Enter dinitTrf3Dpartition()");
 #endif
-    int_t* perm_c_supno = getPerm_c_supno(nsupers, options, LUstruct, grid);
+    int_t* perm_c_supno = getPerm_c_supno(nsupers, options,
+					  LUstruct->etree,
+					  LUstruct->Glu_persist,
+					  LUstruct->Llu->Lrowind_bc_ptr,
+					  LUstruct->Llu->Ufstnz_br_ptr,
+					  grid);
     int_t* iperm_c_supno = getFactIperm(perm_c_supno, nsupers);
 
     // calculating tree factorization
@@ -561,7 +566,9 @@ trf3Dpartition_t* dinitTrf3Dpartition(int_t nsupers,
     treeList_t* treeList = setree2list(nsupers, setree );
 
     /*update treelist with weight and depth*/
-    getSCUweight(nsupers, treeList, LUstruct, grid3d);
+    getSCUweight(nsupers, treeList, LUstruct->Glu_persist->xsup,
+		  LUstruct->Llu->Lrowind_bc_ptr, LUstruct->Llu->Ufstnz_br_ptr,
+		  grid3d);
 
     calcTreeWeight(nsupers, setree, treeList, LUstruct->Glu_persist->xsup);
 
diff --git a/SRC/dtrfCommWrapper.c b/SRC/dtrfCommWrapper.c
index 4bbf623e..69721631 100644
--- a/SRC/dtrfCommWrapper.c
+++ b/SRC/dtrfCommWrapper.c
@@ -42,14 +42,14 @@ int_t dDiagFactIBCast(int_t k, int_t k0, // supernode to be factored
                       gridinfo_t *grid,
                       superlu_dist_options_t *options,
                       double thresh,
-                      LUstruct_t *LUstruct,
+                      dLUstruct_t *LUstruct,
                       SuperLUStat_t *stat, int *info,
                       SCT_t *SCT,
                       int tag_ub)
 {
     // unpacking variables
     Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    LocalLU_t *Llu = LUstruct->Llu;
+    dLocalLU_t *Llu = LUstruct->Llu;
     int_t *xsup = Glu_persist->xsup;
 
     int_t iam = grid->iam;
@@ -121,11 +121,11 @@ int_t dDiagFactIBCast(int_t k, int_t k0, // supernode to be factored
 int_t dLPanelTrSolve(int_t k, int_t *factored_L,
                      double *BlockUFactor,
                      gridinfo_t *grid,
-                     LUstruct_t *LUstruct)
+                     dLUstruct_t *LUstruct)
 {
     double alpha = 1.0;
     Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    LocalLU_t *Llu = LUstruct->Llu;
+    dLocalLU_t *Llu = LUstruct->Llu;
     int_t *xsup = Glu_persist->xsup;
 
     int_t iam = grid->iam;
@@ -223,7 +223,7 @@ int_t dLPanelUpdate(int_t k, int_t *IrecvPlcd_D, int_t *factored_L,
                     MPI_Request *U_diag_blk_recv_req,
                     double *BlockUFactor,
                     gridinfo_t *grid,
-                    LUstruct_t *LUstruct, SCT_t *SCT)
+                    dLUstruct_t *LUstruct, SCT_t *SCT)
 {
 
     dUDiagBlockRecvWait(k, IrecvPlcd_D, factored_L,
@@ -242,11 +242,11 @@ int_t dUPanelTrSolve(int_t k,
                      int_t ldt,
                      Ublock_info_t *Ublock_info,
                      gridinfo_t *grid,
-                     LUstruct_t *LUstruct,
+                     dLUstruct_t *LUstruct,
                      SuperLUStat_t *stat, SCT_t *SCT)
 {
     Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    LocalLU_t *Llu = LUstruct->Llu;
+    dLocalLU_t *Llu = LUstruct->Llu;
     int_t *xsup = Glu_persist->xsup;
     int_t iam = grid->iam;
     int_t myrow = MYROW(iam, grid);
@@ -352,7 +352,7 @@ int_t dUPanelUpdate(int_t k, int_t *factored_U,
                     int_t ldt,
                     Ublock_info_t *Ublock_info,
                     gridinfo_t *grid,
-                    LUstruct_t *LUstruct,
+                    dLUstruct_t *LUstruct,
                     SuperLUStat_t *stat, SCT_t *SCT)
 {
 
@@ -373,12 +373,12 @@ int_t dIBcastRecvLPanel(
     double *Lval_buf,
     int_t *factored,
     gridinfo_t *grid,
-    LUstruct_t *LUstruct,
+    dLUstruct_t *LUstruct,
     SCT_t *SCT,
     int tag_ub)
 {
     Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    LocalLU_t *Llu = LUstruct->Llu;
+    dLocalLU_t *Llu = LUstruct->Llu;
     int_t *xsup = Glu_persist->xsup;
     int **ToSendR = Llu->ToSendR;
     int *ToRecv = Llu->ToRecv;
@@ -431,10 +431,10 @@ int_t dIBcastRecvUPanel(int_t k, int_t k0, int *msgcnt,
                         MPI_Request *send_requ,
                         MPI_Request *recv_requ,
                         int_t *Usub_buf, double *Uval_buf,
-                        gridinfo_t *grid, LUstruct_t *LUstruct,
+                        gridinfo_t *grid, dLUstruct_t *LUstruct,
                         SCT_t *SCT, int tag_ub)
 {
-    LocalLU_t *Llu = LUstruct->Llu;
+    dLocalLU_t *Llu = LUstruct->Llu;
 
     int *ToSendD = Llu->ToSendD;
     int *ToRecv = Llu->ToRecv;
@@ -479,9 +479,9 @@ int_t dIBcastRecvUPanel(int_t k, int_t k0, int *msgcnt,
 
 int_t dWaitL(int_t k, int *msgcnt, int *msgcntU,
              MPI_Request *send_req, MPI_Request *recv_req,
-             gridinfo_t *grid, LUstruct_t *LUstruct, SCT_t *SCT)
+             gridinfo_t *grid, dLUstruct_t *LUstruct, SCT_t *SCT)
 {
-    LocalLU_t *Llu = LUstruct->Llu;
+    dLocalLU_t *Llu = LUstruct->Llu;
     int **ToSendR = Llu->ToSendR;
     int *ToRecv = Llu->ToRecv;
     int_t iam = grid->iam;
@@ -507,9 +507,9 @@ int_t dWaitL(int_t k, int *msgcnt, int *msgcntU,
 
 int_t dWaitU(int_t k, int *msgcnt,
              MPI_Request *send_requ, MPI_Request *recv_requ,
-             gridinfo_t *grid, LUstruct_t *LUstruct, SCT_t *SCT)
+             gridinfo_t *grid, dLUstruct_t *LUstruct, SCT_t *SCT)
 {
-    LocalLU_t *Llu = LUstruct->Llu;
+    dLocalLU_t *Llu = LUstruct->Llu;
 
     int *ToRecv = Llu->ToRecv;
     int *ToSendD = Llu->ToSendD;
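
The bulk of this patch threads precision through the type names: dLUstruct_t
and dLocalLU_t carry double payloads, the z-variants carry doublecomplex, so a
routine can no longer be handed the wrong precision's factors unnoticed. The
shape, abbreviated to two fields that appear in the hunks (a sketch, not the
full definition):

    typedef struct {
        int_t  **Lrowind_bc_ptr;  /* index structure: precision-independent */
        double **Lnzval_bc_ptr;   /* numerical values: double here,
                                     doublecomplex in zLocalLU_t */
        /* ... */
    } dLocalLU_t;
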
diff --git a/SRC/dutil_dist.c b/SRC/dutil_dist.c
index 2ed4aca4..8be5c73d 100644
--- a/SRC/dutil_dist.c
+++ b/SRC/dutil_dist.c
@@ -420,9 +420,67 @@ void dScalePermstructFree(dScalePermstruct_t *ScalePermstruct)
         SUPERLU_FREE(ScalePermstruct->R);
         SUPERLU_FREE(ScalePermstruct->C);
         break;
+      default: break;
     }
 }
 
+/*
+ * The following are from 3D code p3dcomm.c
+ */
+
+int dAllocGlu_3d(int_t n, int_t nsupers, dLUstruct_t * LUstruct)
+{
+    /*broadcasting Glu_persist*/
+    LUstruct->Glu_persist->xsup  = intMalloc_dist(nsupers+1); //INT_T_ALLOC(nsupers+1);
+    LUstruct->Glu_persist->supno = intMalloc_dist(n); //INT_T_ALLOC(n);
+    return 0;
+}
+
+// Sherry added
+int dDeAllocGlu_3d(dLUstruct_t * LUstruct)
+{
+    SUPERLU_FREE(LUstruct->Glu_persist->xsup);
+    SUPERLU_FREE(LUstruct->Glu_persist->supno);
+    return 0;
+}
+
+int dDeAllocLlu_3d(int_t n, dLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+{
+    int i, nbc, nbr, nsupers;
+    dLocalLU_t *Llu = LUstruct->Llu;
+
+    nsupers = (LUstruct->Glu_persist)->supno[n-1] + 1;
+
+    nbc = CEILING(nsupers, grid3d->npcol);
+    for (i = 0; i < nbc; ++i) 
+	if ( Llu->Lrowind_bc_ptr[i] ) {
+	    SUPERLU_FREE (Llu->Lrowind_bc_ptr[i]);
+#ifdef GPU_ACC
+	    checkCuda(cudaFreeHost(Llu->Lnzval_bc_ptr[i]));
+#else
+	    SUPERLU_FREE (Llu->Lnzval_bc_ptr[i]);
+#endif
+	}
+    SUPERLU_FREE (Llu->Lrowind_bc_ptr);
+    SUPERLU_FREE (Llu->Lnzval_bc_ptr);
+
+    nbr = CEILING(nsupers, grid3d->nprow);
+    for (i = 0; i < nbr; ++i)
+	if ( Llu->Ufstnz_br_ptr[i] ) {
+	    SUPERLU_FREE (Llu->Ufstnz_br_ptr[i]);
+	    SUPERLU_FREE (Llu->Unzval_br_ptr[i]);
+	}
+    SUPERLU_FREE (Llu->Ufstnz_br_ptr);
+    SUPERLU_FREE (Llu->Unzval_br_ptr);
+
+    /* The following can be freed after factorization. */
+    SUPERLU_FREE(Llu->ToRecv);
+    SUPERLU_FREE(Llu->ToSendD);
+    for (i = 0; i < nbc; ++i) SUPERLU_FREE(Llu->ToSendR[i]);
+    SUPERLU_FREE(Llu->ToSendR);
+    return 0;
+} /* dDeAllocLlu_3d */
+
 
 /**** Other utilities ****/
 void
@@ -598,8 +656,8 @@ void dDumpLblocks(int iam, int_t nsupers, gridinfo_t *grid,
 		  Glu_persist_t *Glu_persist, dLocalLU_t *Llu)
 {
     register int c, extra, gb, j, i, lb, nsupc, nsupr, len, nb, ncb;
-    register int_t k, mycol, r;
-	int_t nnzL, n,nmax;
+    int k, mycol, r, n, nmax;
+    int_t nnzL;
     int_t *xsup = Glu_persist->xsup;
     int_t *index;
     double *nzval;
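
dDeAllocLlu_3d undoes dAllocLlu block for block: L panels are indexed by block
column, U panels by block row, and under GPU_ACC the L values were allocated as
pinned host memory, so they are released with cudaFreeHost rather than
SUPERLU_FREE. The indexing, sketched with the grid fields from the hunk:

    int nbc = CEILING(nsupers, grid3d->npcol);  /* local block columns: L panels */
    int nbr = CEILING(nsupers, grid3d->nprow);  /* local block rows:    U panels */
    /* free Lrowind_bc_ptr[0..nbc) and Lnzval_bc_ptr[0..nbc) (cudaFreeHost when
       pinned), then Ufstnz_br_ptr[0..nbr) and Unzval_br_ptr[0..nbr), and
       finally the pointer arrays themselves */
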
diff --git a/SRC/pd3dcomm.c b/SRC/pd3dcomm.c
index e11fb9af..96dc1815 100644
--- a/SRC/pd3dcomm.c
+++ b/SRC/pd3dcomm.c
@@ -31,14 +31,11 @@ at the top-level directory.
 #include "xtrf3Dpartition.h"
 #endif
 
-#define INT_T_ALLOC(x) ((int_t *)SUPERLU_MALLOC((x) * sizeof(int_t)))
-#define DOUBLE_ALLOC(x) ((double *)SUPERLU_MALLOC((x) * sizeof(double)))
-
 // #define MPI_MALLOC
 #define MPI_INT_ALLOC(a, b) (MPI_Alloc_mem((b) * sizeof(int_t), MPI_INFO_NULL, &(a)))
 #define MPI_DATATYPE_ALLOC(a, b) (MPI_Alloc_mem((b) * sizeof(double), MPI_INFO_NULL, &(a)))
 
-int_t dAllocLlu(int_t nsupers, LUstruct_t *LUstruct, gridinfo3d_t *grid3d)
+int_t dAllocLlu(int_t nsupers, dLUstruct_t *LUstruct, gridinfo3d_t *grid3d)
 {
 	int i;
 	int_t Pc = grid3d->npcol;
@@ -47,7 +44,7 @@ int_t dAllocLlu(int_t nsupers, LUstruct_t *LUstruct, gridinfo3d_t *grid3d)
 	int_t nbc = CEILING(nsupers, Pc);
 	int_t nbr = CEILING(nsupers, Pr);
 
-	LocalLU_t *Llu = LUstruct->Llu;
+	dLocalLU_t *Llu = LUstruct->Llu;
 	int_t **Lrowind_bc_ptr =
 		(int_t **)SUPERLU_MALLOC(sizeof(int_t *) * nbc); /* size ceil(NSUPERS/Pc) */
 	double **Lnzval_bc_ptr =
@@ -108,9 +105,9 @@ int_t dAllocLlu(int_t nsupers, LUstruct_t *LUstruct, gridinfo3d_t *grid3d)
 	return 0;
 } /* dAllocLlu */
 
-int_t dmpiMallocLUStruct(int_t nsupers, LUstruct_t *LUstruct, gridinfo3d_t *grid3d)
+int_t dmpiMallocLUStruct(int_t nsupers, dLUstruct_t *LUstruct, gridinfo3d_t *grid3d)
 {
-	LocalLU_t *Llu = LUstruct->Llu;
+	dLocalLU_t *Llu = LUstruct->Llu;
 	int_t *xsup = LUstruct->Glu_persist->xsup;
 	int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
 	double **Unzval_br_ptr = Llu->Unzval_br_ptr;
@@ -185,9 +182,9 @@ int_t dmpiMallocLUStruct(int_t nsupers, LUstruct_t *LUstruct, gridinfo3d_t *grid
 }
 
 int_t dzSendLPanel(int_t k, int_t receiver,
-				   LUstruct_t *LUstruct, gridinfo3d_t *grid3d, SCT_t *SCT)
+		   dLUstruct_t *LUstruct, gridinfo3d_t *grid3d, SCT_t *SCT)
 {
-	LocalLU_t *Llu = LUstruct->Llu;
+	dLocalLU_t *Llu = LUstruct->Llu;
 	int_t *xsup = LUstruct->Glu_persist->xsup;
 	int_t **Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
 	double **Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
@@ -217,12 +214,12 @@ int_t dzSendLPanel(int_t k, int_t receiver,
 }
 
 int_t dzRecvLPanel(int_t k, int_t sender, double alpha, double beta,
-				   double *Lval_buf,
-				   LUstruct_t *LUstruct, gridinfo3d_t *grid3d, SCT_t *SCT)
+		   double *Lval_buf,
+		   dLUstruct_t *LUstruct, gridinfo3d_t *grid3d, SCT_t *SCT)
 {
 
 	// A(k) = alpha*A(k) + beta* A^{sender}(k)
-	LocalLU_t *Llu = LUstruct->Llu;
+	dLocalLU_t *Llu = LUstruct->Llu;
 	int_t *xsup = LUstruct->Glu_persist->xsup;
 	int_t **Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
 	double **Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
@@ -260,9 +257,9 @@ int_t dzRecvLPanel(int_t k, int_t sender, double alpha, double beta,
 }
 
 int_t dzSendUPanel(int_t k, int_t receiver,
-				   LUstruct_t *LUstruct, gridinfo3d_t *grid3d, SCT_t *SCT)
+		   dLUstruct_t *LUstruct, gridinfo3d_t *grid3d, SCT_t *SCT)
 {
-	LocalLU_t *Llu = LUstruct->Llu;
+	dLocalLU_t *Llu = LUstruct->Llu;
 	int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
 	double **Unzval_br_ptr = Llu->Unzval_br_ptr;
 	gridinfo_t *grid = &(grid3d->grid2d);
@@ -292,10 +289,10 @@ int_t dzSendUPanel(int_t k, int_t receiver,
 }
 
 int_t dzRecvUPanel(int_t k, int_t sender, double alpha, double beta,
-				   double *Uval_buf, LUstruct_t *LUstruct,
+				   double *Uval_buf, dLUstruct_t *LUstruct,
 				   gridinfo3d_t *grid3d, SCT_t *SCT)
 {
-	LocalLU_t *Llu = LUstruct->Llu;
+	dLocalLU_t *Llu = LUstruct->Llu;
 	int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
 	double **Unzval_br_ptr = Llu->Unzval_br_ptr;
 	gridinfo_t *grid = &(grid3d->grid2d);
@@ -327,7 +324,7 @@ int_t dzRecvUPanel(int_t k, int_t sender, double alpha, double beta,
 	return 0;
 }
 
-int_t dp3dScatter(int_t n, LUstruct_t *LUstruct, gridinfo3d_t *grid3d)
+int_t dp3dScatter(int_t n, dLUstruct_t *LUstruct, gridinfo3d_t *grid3d)
 /* Copies LU structure from layer 0 to all the layers */
 {
 	gridinfo_t *grid = &(grid3d->grid2d);
@@ -341,14 +338,14 @@ int_t dp3dScatter(int_t n, LUstruct_t *LUstruct, gridinfo3d_t *grid3d)
 	int_t nsupers;
 
 	if (!grid3d->zscp.Iam)
-		nsupers = getNsupers(n, LUstruct);
+		nsupers = getNsupers(n, LUstruct->Glu_persist);
 
 	/* broadcast nsupers */
 	MPI_Bcast(&nsupers, 1, mpi_int_t, 0, grid3d->zscp.comm);
 
 	/* Scatter and alloc Glu_persist */
 	if (grid3d->zscp.Iam) // all other process layers not equal 0
-		AllocGlu_3d(n, nsupers, LUstruct);
+		dAllocGlu_3d(n, nsupers, LUstruct);
 
 	/* broadcast Glu_persist */
 	int_t *xsup = LUstruct->Glu_persist->xsup;
@@ -357,12 +354,12 @@ int_t dp3dScatter(int_t n, LUstruct_t *LUstruct, gridinfo3d_t *grid3d)
 	int_t *supno = LUstruct->Glu_persist->supno;
 	MPI_Bcast(supno, n, mpi_int_t, 0, grid3d->zscp.comm);
 
-	/* now broadcast localLu_t */
+	/* now broadcast localLu */
 	/* first allocating space for it */
 	if (grid3d->zscp.Iam) // all other process layers not equal 0
 		dAllocLlu(nsupers, LUstruct, grid3d);
 
-	LocalLU_t *Llu = LUstruct->Llu;
+	dLocalLU_t *Llu = LUstruct->Llu;
 
 	/*scatter all the L blocks and indexes*/
 	dscatter3dLPanels(nsupers, LUstruct, grid3d);
@@ -400,10 +397,10 @@ int_t dp3dScatter(int_t n, LUstruct_t *LUstruct, gridinfo3d_t *grid3d)
 } /* dp3dScatter */
 
 int_t dscatter3dUPanels(int_t nsupers,
-						LUstruct_t *LUstruct, gridinfo3d_t *grid3d)
+			dLUstruct_t *LUstruct, gridinfo3d_t *grid3d)
 {
 
-	LocalLU_t *Llu = LUstruct->Llu;
+	dLocalLU_t *Llu = LUstruct->Llu;
 	int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
 	double **Unzval_br_ptr = Llu->Unzval_br_ptr;
 	gridinfo_t *grid = &(grid3d->grid2d);
@@ -470,9 +467,9 @@ int_t dscatter3dUPanels(int_t nsupers,
 } /* end dScatter3dUPanels */
 
 int_t dscatter3dLPanels(int_t nsupers,
-						LUstruct_t *LUstruct, gridinfo3d_t *grid3d)
+			dLUstruct_t *LUstruct, gridinfo3d_t *grid3d)
 {
-	LocalLU_t *Llu = LUstruct->Llu;
+	dLocalLU_t *Llu = LUstruct->Llu;
 	int_t *xsup = LUstruct->Glu_persist->xsup;
 	gridinfo_t *grid = &(grid3d->grid2d);
 	int_t **Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
@@ -553,11 +550,11 @@ int_t dscatter3dLPanels(int_t nsupers,
 	return 0;
 } /* dscatter3dLPanels */
 
-int_t dcollect3dLpanels(int_t layer, int_t nsupers, LUstruct_t *LUstruct,
+int_t dcollect3dLpanels(int_t layer, int_t nsupers, dLUstruct_t *LUstruct,
 						gridinfo3d_t *grid3d)
 {
 
-	LocalLU_t *Llu = LUstruct->Llu;
+	dLocalLU_t *Llu = LUstruct->Llu;
 	int_t *xsup = LUstruct->Glu_persist->xsup;
 	int_t **Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
 	double **Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
@@ -598,10 +595,10 @@ int_t dcollect3dLpanels(int_t layer, int_t nsupers, LUstruct_t *LUstruct,
 	return 0;
 }
 
-int_t dcollect3dUpanels(int_t layer, int_t nsupers, LUstruct_t *LUstruct,
+int_t dcollect3dUpanels(int_t layer, int_t nsupers, dLUstruct_t *LUstruct,
 						gridinfo3d_t *grid3d)
 {
-	LocalLU_t *Llu = LUstruct->Llu;
+	dLocalLU_t *Llu = LUstruct->Llu;
 	int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
 	double **Unzval_br_ptr = Llu->Unzval_br_ptr;
 	gridinfo_t *grid = &(grid3d->grid2d);
@@ -633,19 +630,19 @@ int_t dcollect3dUpanels(int_t layer, int_t nsupers, LUstruct_t *LUstruct,
 }
 
 /* Gather the LU factors on layer-0 */
-int_t dp3dCollect(int_t layer, int_t n, LUstruct_t *LUstruct, gridinfo3d_t *grid3d)
+int_t dp3dCollect(int_t layer, int_t n, dLUstruct_t *LUstruct, gridinfo3d_t *grid3d)
 {
-	int_t nsupers = getNsupers(n, LUstruct);
+	int_t nsupers = getNsupers(n, LUstruct->Glu_persist);
 	dcollect3dLpanels(layer, nsupers, LUstruct, grid3d);
 	dcollect3dUpanels(layer, nsupers, LUstruct, grid3d);
 	return 0;
 }
 
 /* Zero out LU non zero entries */
-int_t dzeroSetLU(int_t nnodes, int_t *nodeList, LUstruct_t *LUstruct,
+int_t dzeroSetLU(int_t nnodes, int_t *nodeList, dLUstruct_t *LUstruct,
 				 gridinfo3d_t *grid3d)
 {
-	LocalLU_t *Llu = LUstruct->Llu;
+	dLocalLU_t *Llu = LUstruct->Llu;
 	int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
 	double **Unzval_br_ptr = Llu->Unzval_br_ptr;
 
@@ -707,9 +704,9 @@ int_t dzeroSetLU(int_t nnodes, int_t *nodeList, LUstruct_t *LUstruct,
 }
 
 int_t dreduceAncestors3d(int_t sender, int_t receiver,
-						 int_t nnodes, int_t *nodeList,
-						 double *Lval_buf, double *Uval_buf,
-						 LUstruct_t *LUstruct, gridinfo3d_t *grid3d, SCT_t *SCT)
+			 int_t nnodes, int_t *nodeList,
+			 double *Lval_buf, double *Uval_buf,
+			 dLUstruct_t *LUstruct, gridinfo3d_t *grid3d, SCT_t *SCT)
 {
 	double alpha = 1.0, beta = 1.0;
 	int_t myGrid = grid3d->zscp.Iam;
@@ -736,9 +733,9 @@ int_t dreduceAncestors3d(int_t sender, int_t receiver,
 }
 
 int_t dgatherFactoredLU(int_t sender, int_t receiver,
-						int_t nnodes, int_t *nodeList,
-						dLUValSubBuf_t *LUvsb,
-						LUstruct_t *LUstruct, gridinfo3d_t *grid3d, SCT_t *SCT)
+			int_t nnodes, int_t *nodeList,
+			dLUValSubBuf_t *LUvsb,
+			dLUstruct_t *LUstruct, gridinfo3d_t *grid3d, SCT_t *SCT)
 {
 	double alpha = 0.0, beta = 1.0;
 	double *Lval_buf = LUvsb->Lval_buf;
@@ -764,8 +761,8 @@ int_t dgatherFactoredLU(int_t sender, int_t receiver,
 }
 
 int_t dinit3DLUstruct(int_t *myTreeIdxs, int_t *myZeroTrIdxs,
-					  int_t *nodeCount, int_t **nodeList, LUstruct_t *LUstruct,
-					  gridinfo3d_t *grid3d)
+		      int_t *nodeCount, int_t **nodeList, dLUstruct_t *LUstruct,
+		      gridinfo3d_t *grid3d)
 {
 	int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
 
@@ -783,8 +780,8 @@ int_t dinit3DLUstruct(int_t *myTreeIdxs, int_t *myZeroTrIdxs,
 }
 
 int_t dreduceAllAncestors3d(int_t ilvl, int_t *myNodeCount, int_t **treePerm,
-							dLUValSubBuf_t *LUvsb, LUstruct_t *LUstruct,
-							gridinfo3d_t *grid3d, SCT_t *SCT)
+			    dLUValSubBuf_t *LUvsb, dLUstruct_t *LUstruct,
+			    gridinfo3d_t *grid3d, SCT_t *SCT)
 {
 	double *Lval_buf = LUvsb->Lval_buf;
 	double *Uval_buf = LUvsb->Uval_buf;
@@ -819,7 +816,7 @@ int_t dreduceAllAncestors3d(int_t ilvl, int_t *myNodeCount, int_t **treePerm,
 }
 
 int_t dgatherAllFactoredLU(trf3Dpartition_t *trf3Dpartition,
-						   LUstruct_t *LUstruct, gridinfo3d_t *grid3d, SCT_t *SCT)
+			   dLUstruct_t *LUstruct, gridinfo3d_t *grid3d, SCT_t *SCT)
 {
 	int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
 	int_t myGrid = grid3d->zscp.Iam;
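[Note on the hunks above] The d-precision changes in this file all follow one replication idiom: layer 0 of the 3D grid owns the LU data, every other z-layer allocates matching storage (dAllocGlu_3d, dAllocLlu), and the index arrays are broadcast over grid3d->zscp.comm. A minimal sketch of that idiom, assuming SuperLU_DIST's intMalloc_dist and mpi_int_t; the helper name replicate_xsup is hypothetical:

#include "superlu_ddefs.h"  /* gridinfo3d_t, int_t, mpi_int_t, intMalloc_dist */

/* Mirrors the xsup broadcast inside dp3dScatter: after the call, every
 * z-layer holds a private copy of layer 0's supernode boundaries. */
static void replicate_xsup(int_t **xsup, int_t nsupers, gridinfo3d_t *grid3d)
{
    if (grid3d->zscp.Iam)                    /* layers 1..Pz-1 make room;   */
        *xsup = intMalloc_dist(nsupers + 1); /* layer 0 already owns xsup[] */
    MPI_Bcast(*xsup, nsupers + 1, mpi_int_t, 0, grid3d->zscp.comm);
}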
diff --git a/SRC/pdgssvx.c b/SRC/pdgssvx.c
index a130ed78..94bf18ff 100644
--- a/SRC/pdgssvx.c
+++ b/SRC/pdgssvx.c
@@ -1568,6 +1568,7 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A,
 	    case COL:
 		SUPERLU_FREE(R);
 		break;
+	    default: break;
 	}
     }
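[Note] The added default arm is a warning fix: a switch over an enum without a default trips GCC's -Wswitch whenever some enumerator has no case label. A self-contained sketch of the pattern being silenced; the enumerators mirror SuperLU's DiagScale_t, but free_scaling itself is a hypothetical stand-in for the cleanup code above (free replaces SUPERLU_FREE for self-containment):

#include <stdlib.h>

typedef enum { NOEQUIL, ROW, COL, BOTH } DiagScale_t;

static void free_scaling(DiagScale_t d, double *R, double *C)
{
    switch (d) {
    case ROW:  free(C);          break;
    case COL:  free(R);          break;
    case BOTH: free(R); free(C); break;
    default:   break;  /* NOEQUIL: nothing was allocated; the arm keeps
                          -Wswitch quiet */
    }
}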
 
diff --git a/SRC/pdgssvx3d.c b/SRC/pdgssvx3d.c
index 90674027..ae67abe3 100644
--- a/SRC/pdgssvx3d.c
+++ b/SRC/pdgssvx3d.c
@@ -392,7 +392,7 @@ static void checkNRFMT(NRformat_loc *A, NRformat_loc *B)
 *           If all the above conditions are true, the LU decomposition is
  *           performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T.
  *
- * ScalePermstruct (input/output) ScalePermstruct_t* (global)
+ * ScalePermstruct (input/output) dScalePermstruct_t* (global)
  *         The data structure to store the scaling and permutation vectors
  *         describing the transformations performed to the matrix A.
  *         It contains the following fields:
@@ -467,7 +467,7 @@ static void checkNRFMT(NRformat_loc *A, NRformat_loc *B)
  *         Grid can be initialized by subroutine SUPERLU_GRIDINIT.
  *         See superlu_ddefs.h for the definition of 'gridinfo_t'.
  *
- * LUstruct (input/output) LUstruct_t*
+ * LUstruct (input/output) dLUstruct_t*
  *         The data structures to store the distributed L and U factors.
  *         It contains the following fields:
  *
@@ -494,13 +494,13 @@ static void checkNRFMT(NRformat_loc *A, NRformat_loc *B)
  *           The distributed data structures to store L and U factors.
  *           See superlu_ddefs.h for the definition of 'LocalLU_t'.
  *
- * SOLVEstruct (input/output) SOLVEstruct_t*
+ * SOLVEstruct (input/output) dSOLVEstruct_t*
  *         The data structure to hold the communication pattern used
  *         in the phases of triangular solution and iterative refinement.
 *         This pattern should be initialized only once for repeated solutions.
  *         If options->SolveInitialized = YES, it is an input argument.
  *         If options->SolveInitialized = NO and nrhs != 0, it is an output
- *         argument. See superlu_ddefs.h for the definition of 'SOLVEstruct_t'.
+ *         argument. See superlu_ddefs.h for the definition of 'dSOLVEstruct_t'.
  *
  * berr    (output) double*, dimension (nrhs) (global)
  *         The componentwise relative backward error of each solution
@@ -525,9 +525,9 @@ static void checkNRFMT(NRformat_loc *A, NRformat_loc *B)
  */
 
 void pdgssvx3d(superlu_dist_options_t *options, SuperMatrix *A,
-			   ScalePermstruct_t *ScalePermstruct,
+			   dScalePermstruct_t *ScalePermstruct,
 			   double B[], int ldb, int nrhs, gridinfo3d_t *grid3d,
-			   LUstruct_t *LUstruct, SOLVEstruct_t *SOLVEstruct,
+			   dLUstruct_t *LUstruct, dSOLVEstruct_t *SOLVEstruct,
 			   double *berr, SuperLUStat_t *stat, int *info)
 {
 	NRformat_loc *Astore;
@@ -726,6 +726,7 @@ void pdgssvx3d(superlu_dist_options_t *options, SuperMatrix *A,
 					ABORT("Malloc fails for R[].");
 				ScalePermstruct->R = R;
 				break;
+			default: break;
 			}
 		}
 
@@ -1351,7 +1352,7 @@ void pdgssvx3d(superlu_dist_options_t *options, SuperMatrix *A,
 		/* send the LU structure to all the grids */
 		dp3dScatter(n, LUstruct, grid3d);
 
-		int_t nsupers = getNsupers(n, LUstruct);
+		int_t nsupers = getNsupers(n, LUstruct->Glu_persist);
 		trf3Dpartition = dinitTrf3Dpartition(nsupers, options, LUstruct, grid3d);
 
 		SCT_t *SCT = (SCT_t *)SUPERLU_MALLOC(sizeof(SCT_t));
@@ -1549,7 +1550,7 @@ void pdgssvx3d(superlu_dist_options_t *options, SuperMatrix *A,
 			{
 				/* Improve the solution by iterative refinement. */
 				int_t *it, *colind_gsmv = SOLVEstruct->A_colind_gsmv;
-				SOLVEstruct_t *SOLVEstruct1; /* Used by refinement. */
+				dSOLVEstruct_t *SOLVEstruct1; /* Used by refinement */
 
 				t = SuperLU_timer_();
 				if (options->RefineInitialized == NO || Fact == DOFACT)
@@ -1613,8 +1614,8 @@ void pdgssvx3d(superlu_dist_options_t *options, SuperMatrix *A,
 						is different than the solve with nrhs RHS.
 						So we use SOLVEstruct1 for the refinement step.
 					      */
-					if (!(SOLVEstruct1 = (SOLVEstruct_t *)
-							  SUPERLU_MALLOC(sizeof(SOLVEstruct_t))))
+					if (!(SOLVEstruct1 = (dSOLVEstruct_t *)
+					      SUPERLU_MALLOC(sizeof(dSOLVEstruct_t))))
 						ABORT("Malloc fails for SOLVEstruct1");
 					/* Copy the same stuff */
 					SOLVEstruct1->row_to_proc = SOLVEstruct->row_to_proc;
@@ -1627,9 +1628,9 @@ void pdgssvx3d(superlu_dist_options_t *options, SuperMatrix *A,
 
 					/* Initialize the *gstrs_comm for 1 RHS. */
 					if (!(SOLVEstruct1->gstrs_comm = (pxgstrs_comm_t *)
-							  SUPERLU_MALLOC(sizeof(pxgstrs_comm_t))))
+						  SUPERLU_MALLOC(sizeof(pxgstrs_comm_t))))
 						ABORT("Malloc fails for gstrs_comm[]");
-					pxgstrs_init(n, m_loc, 1, fst_row, perm_r, perm_c, grid,
+					pdgstrs_init(n, m_loc, 1, fst_row, perm_r, perm_c, grid,
 								 Glu_persist, SOLVEstruct1);
 				}
 
@@ -1715,6 +1716,7 @@ void pdgssvx3d(superlu_dist_options_t *options, SuperMatrix *A,
 			case COL:
 				SUPERLU_FREE(R);
 				break;
+			default: break;
 			}
 		}
 	} /* process layer 0 done solve */
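[Note on the refinement hunk] One behavioral detail worth keeping in mind: the communication pattern set up for an nrhs-RHS solve cannot be reused for the single-RHS solves inside iterative refinement, so a second solve structure is shallow-copied and re-initialized with nrhs = 1 via the now precision-specific pdgstrs_init. A condensed sketch under that reading; make_refine_struct is a hypothetical wrapper and error checks are elided:

#include "superlu_ddefs.h"  /* dSOLVEstruct_t, pxgstrs_comm_t, pdgstrs_init */

static dSOLVEstruct_t *make_refine_struct(int_t n, int_t m_loc, int_t fst_row,
                                          int_t *perm_r, int_t *perm_c,
                                          gridinfo_t *grid,
                                          Glu_persist_t *Glu_persist,
                                          dSOLVEstruct_t *SOLVEstruct)
{
    dSOLVEstruct_t *S1 = SUPERLU_MALLOC(sizeof(dSOLVEstruct_t));
    *S1 = *SOLVEstruct;  /* share row_to_proc, inv_perm_c, ... as above */
    S1->gstrs_comm = SUPERLU_MALLOC(sizeof(pxgstrs_comm_t));
    pdgstrs_init(n, m_loc, 1 /* nrhs */, fst_row, perm_r, perm_c, grid,
                 Glu_persist, S1);
    return S1;
}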
diff --git a/SRC/pdgssvx_ABglobal.c b/SRC/pdgssvx_ABglobal.c
index f988bf94..8cdc7c66 100644
--- a/SRC/pdgssvx_ABglobal.c
+++ b/SRC/pdgssvx_ABglobal.c
@@ -588,6 +588,7 @@ pdgssvx_ABglobal(superlu_dist_options_t *options, SuperMatrix *A,
 		    ABORT("Malloc fails for R[].");
 		ScalePermstruct->R = R;
 		break;
+	    default: break;
 	}
     }
 
@@ -1098,6 +1099,7 @@ pdgssvx_ABglobal(superlu_dist_options_t *options, SuperMatrix *A,
 	    case COL:
 		SUPERLU_FREE(R);
 		break;
+	    default: break;
 	}
     }
     if ( !factored || (factored && options->IterRefine) )
diff --git a/SRC/pdgstrf2.c b/SRC/pdgstrf2.c
index 019512f8..b1c311b8 100644
--- a/SRC/pdgstrf2.c
+++ b/SRC/pdgstrf2.c
@@ -378,7 +378,6 @@ pdgstrf2_trsm
 
 } /* PDGSTRF2_trsm */
 
-#if 0 /* COMMENT OUT 3D CODE FOR NOW */
 
 /*****************************************************************************
  * The following functions are for the new pdgstrf2_dtrsm in the 3D code.
@@ -388,7 +387,7 @@ static int_t LpanelUpdate(int off0, int nsupc, double *ublk_ptr, int ld_ujrow,
 {
     int_t l = nsupr - off0;
     double alpha = 1.0;
-    unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
 
 #define GT 32
 #pragma omp parallel for
@@ -402,7 +401,7 @@ static int_t LpanelUpdate(int off0, int nsupc, double *ublk_ptr, int ld_ujrow,
 
     } /* for i = ... */
 
-    t1 = _rdtsc() - t1;
+    t1 = SuperLU_timer_() - t1;
 
     SCT->trf2_flops += (double)l * (double)nsupc * (double)nsupc;
     SCT->trf2_time += t1;
@@ -418,7 +417,7 @@ void Local_Dgstrf2(superlu_dist_options_t *options, int_t k, double thresh,
                    Glu_persist_t *Glu_persist, gridinfo_t *grid, dLocalLU_t *Llu,
                    SuperLUStat_t *stat, int *info, SCT_t* SCT)
 {
-    //unsigned long long t1 = _rdtsc();
+    //double t1 = SuperLU_timer_();
     int_t *xsup = Glu_persist->xsup;
     double alpha = -1, zero = 0.0;
 
@@ -750,9 +749,9 @@ int_t dTrs2_GatherTrsmScatter(int_t klst, int_t iukp, int_t rukp,
     return 0;
 } /* dTrs2_GatherTrsmScatter */
 
-#endif /* END 3D CODE */
 /* +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */
 
+#if 1
 
 /*****************************************************************************
  * The following pdgstrf2_omp is improved for KNL, since Version 5.2.0.
@@ -907,7 +906,7 @@ void pdgstrs2_omp(int_t k0, int_t k, int_t* Lsub_buf,
 		  gridinfo_t *grid, dLocalLU_t *Llu, SuperLUStat_t *stat,
 		  Ublock_info_t *Ublock_info, double *bigV, int_t ldt, SCT_t *SCT)
 {
-    unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
     int_t *xsup = Glu_persist->xsup;
     /* Quick return. */
     int_t lk = LBi (k, grid);         /* Local block number */
@@ -941,7 +940,8 @@ void pdgstrs2_omp(int_t k0, int_t k, int_t* Lsub_buf,
 				usub, uval, tempv, knsupc, nsupr, lusup, Glu_persist);
     } /* for b ... */
 
-    SCT->PDGSTRS2_tl += (double) ( _rdtsc() - t1);
+    SCT->PDGSTRS2_tl += (double) ( SuperLU_timer_() - t1);
 
 } /* pdgstrs2_omp new version from Piyush */
 
+#endif /* there are 2 versions of pdgstrs2_omp */
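[Note] The timing edits in this file all make the same substitution: the x86-only _rdtsc() cycle counter is replaced by SuperLU_timer_(), so the SCT_t accumulators hold portable wall-clock seconds instead of platform-dependent cycle counts. A minimal sketch of the resulting pattern (do_work is a placeholder):

#include "superlu_defs.h"   /* double SuperLU_timer_() */

static double timed_seconds(void (*do_work)(void))
{
    double t1 = SuperLU_timer_();    /* seconds, not cycles */
    do_work();
    return SuperLU_timer_() - t1;    /* elapsed wall-clock time */
}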
diff --git a/SRC/pdgstrf3d.c b/SRC/pdgstrf3d.c
index ed8cc562..675610a4 100644
--- a/SRC/pdgstrf3d.c
+++ b/SRC/pdgstrf3d.c
@@ -80,7 +80,7 @@ at the top-level directory.
  * SCT    (input/output) SCT_t*
  *        Various statistics of 3D factorization.
  *
- * LUstruct (input/output) LUstruct_t*
+ * LUstruct (input/output) dLUstruct_t*
  *         The data structures to store the distributed L and U factors.
  *         The following fields should be defined:
  *
@@ -91,9 +91,9 @@ at the top-level directory.
  *         xsup[s] is the leading column of the s-th supernode,
  *             supno[i] is the supernode number to which column i belongs.
  *
- *         o Llu (input/output) LocalLU_t*
+ *         o Llu (input/output) dLocalLU_t*
  *           The distributed data structures to store L and U factors.
- *           See superlu_ddefs.h for the definition of 'LocalLU_t'.
+ *           See superlu_ddefs.h for the definition of 'dLocalLU_t'.
  *
  * grid3d (input) gridinfo3d_t*
  *        The 3D process mesh. It contains the MPI communicator, the number
@@ -118,11 +118,11 @@ at the top-level directory.
  */
 int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
 		trf3Dpartition_t*  trf3Dpartition, SCT_t *SCT,
-		LUstruct_t *LUstruct, gridinfo3d_t * grid3d,
+		dLUstruct_t *LUstruct, gridinfo3d_t * grid3d,
 		SuperLUStat_t *stat, int *info)
 {
     gridinfo_t* grid = &(grid3d->grid2d);
-    LocalLU_t *Llu = LUstruct->Llu;
+    dLocalLU_t *Llu = LUstruct->Llu;
 
    // problem-specific constants
     int_t ldt = sp_ienv_dist (3);     /* Size of maximum supernode */
@@ -142,7 +142,7 @@ int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
     //if (!grid3d->zscp.Iam && !grid3d->iam) printf("Using NSUP=%d\n", (int) ldt);
 
     //getting Nsupers
-    int_t nsupers = getNsupers(n, LUstruct);
+    int_t nsupers = getNsupers(n, LUstruct->Glu_persist);
 
     // Grid related Variables
     int_t iam = grid->iam; // in 2D grid
@@ -219,7 +219,8 @@ int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
 
     HyP->first_l_block_acc = first_l_block_acc;
     HyP->first_u_block_acc = first_u_block_acc;
-    int_t bigu_size = getBigUSize(nsupers, grid, LUstruct);
+    //int_t bigu_size = getBigUSize(nsupers, grid, LUstruct);
+    int_t bigu_size = getBigUSize(nsupers, grid, LUstruct->Llu->Lrowind_bc_ptr);
     // int_t buffer_size = get_max_buffer_size ();
     // HyP->buffer_size = buffer_size;
     HyP->bigu_size = bigu_size;
diff --git a/SRC/psymbfact.c b/SRC/psymbfact.c
index b42af073..fdade46e 100644
--- a/SRC/psymbfact.c
+++ b/SRC/psymbfact.c
@@ -3157,7 +3157,8 @@ expand_RL
 	
 	if (!computeL)
 	  marker[vtx] = markl;
-	for (ii; ii < mpnelts; ii++) {
+	//for (ii; ii < mpnelts; ii++) {  // Sherry: compiler warning
+	for (; ii < mpnelts; ii++) {
 	  elt = lsub_rcvd[ii];
 	  if (elt >= vtx) {
 	    if (marker[elt] != markl) {
@@ -3456,7 +3457,8 @@ rl_update
 	if (!computeL)
 	  marker[vtx] = markl;
 	PS->nops += mpnelts - ii;
-	for (ii; ii < mpnelts; ii++) {
+	//for (ii; ii < mpnelts; ii++) { // Sherry: compiler warning
+	for (; ii < mpnelts; ii++) {
 	  elt = lsub_rcvd[ii];
 	  if (elt >= vtx) {
 	    if (marker[elt] != markl) {
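[Note] Both psymbfact.c hunks fix the same warning: in for (ii; ii < mpnelts; ii++), the init clause is an expression statement whose value is discarded, so a bare ii has no effect and the compiler flags it. Dropping the clause preserves the intent, which is to resume from the current ii. A tiny standalone illustration:

#include <stdio.h>

int main(void)
{
    int ii = 3, n = 6;
    /* for (ii; ii < n; ii++)  -- bare `ii` has no effect; compilers warn */
    for (; ii < n; ii++)        /* empty init clause: resume from current ii */
        printf("%d\n", ii);
    return 0;
}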
diff --git a/SRC/pz3dcomm.c b/SRC/pz3dcomm.c
index c1627c4c..e72fad7e 100644
--- a/SRC/pz3dcomm.c
+++ b/SRC/pz3dcomm.c
@@ -31,14 +31,11 @@ at the top-level directory.
 #include "xtrf3Dpartition.h"
 #endif
 
-#define INT_T_ALLOC(x)  ((int_t *) SUPERLU_MALLOC ( (x) * sizeof (int_t)))
-#define DOUBLE_ALLOC(x)  ((double *) SUPERLU_MALLOC ( (x) * sizeof (double)))
-
 // #define MPI_MALLOC
 #define MPI_INT_ALLOC(a, b) (MPI_Alloc_mem( (b)*sizeof(int_t), MPI_INFO_NULL, &(a) ))
 #define MPI_DATATYPE_ALLOC(a, b) (MPI_Alloc_mem((b)*sizeof(doublecomplex), MPI_INFO_NULL, &(a)))
 
-int_t zAllocLlu(int_t nsupers, LUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+int_t zAllocLlu(int_t nsupers, zLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
 {
     int i;
     int_t Pc = grid3d->npcol;
@@ -47,7 +44,7 @@ int_t zAllocLlu(int_t nsupers, LUstruct_t * LUstruct, gridinfo3d_t* grid3d)
     int_t nbc = CEILING(nsupers, Pc);
     int_t nbr = CEILING(nsupers, Pr);
     
-    LocalLU_t *Llu = LUstruct->Llu;
+    zLocalLU_t *Llu = LUstruct->Llu;
     int_t   **Lrowind_bc_ptr =
 	(int_t**) SUPERLU_MALLOC(sizeof(int_t*)*nbc); 	/* size ceil(NSUPERS/Pc) */
     doublecomplex  **Lnzval_bc_ptr =
@@ -106,9 +103,9 @@ int_t zAllocLlu(int_t nsupers, LUstruct_t * LUstruct, gridinfo3d_t* grid3d)
     return 0;
 } /* zAllocLlu */
 
-int_t zmpiMallocLUStruct(int_t nsupers, LUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+int_t zmpiMallocLUStruct(int_t nsupers, zLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
 {
-    LocalLU_t *Llu = LUstruct->Llu;
+    zLocalLU_t *Llu = LUstruct->Llu;
     int_t* xsup = LUstruct->Glu_persist->xsup;
     int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
     doublecomplex** Unzval_br_ptr = Llu->Unzval_br_ptr;
@@ -184,9 +181,9 @@ int_t zmpiMallocLUStruct(int_t nsupers, LUstruct_t * LUstruct, gridinfo3d_t* gri
 
 
 int_t zzSendLPanel(int_t k, int_t receiver,
-                   LUstruct_t* LUstruct,  gridinfo3d_t* grid3d, SCT_t* SCT)
+                   zLUstruct_t* LUstruct,  gridinfo3d_t* grid3d, SCT_t* SCT)
 {
-    LocalLU_t *Llu = LUstruct->Llu;
+    zLocalLU_t *Llu = LUstruct->Llu;
     int_t* xsup = LUstruct->Glu_persist->xsup;
     int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
     doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
@@ -218,11 +215,11 @@ int_t zzSendLPanel(int_t k, int_t receiver,
 
 int_t zzRecvLPanel(int_t k, int_t sender, doublecomplex alpha, doublecomplex beta,
                     doublecomplex* Lval_buf,
-                    LUstruct_t* LUstruct,  gridinfo3d_t* grid3d, SCT_t* SCT)
+                    zLUstruct_t* LUstruct,  gridinfo3d_t* grid3d, SCT_t* SCT)
 {
     
     // A(k) = alpha*A(k) + beta* A^{sender}(k)
-    LocalLU_t *Llu = LUstruct->Llu;
+    zLocalLU_t *Llu = LUstruct->Llu;
     int_t* xsup = LUstruct->Glu_persist->xsup;
     int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
     doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
@@ -268,9 +265,9 @@ int_t zzRecvLPanel(int_t k, int_t sender, doublecomplex alpha, doublecomplex bet
 }
 
 int_t zzSendUPanel(int_t k, int_t receiver,
-                    LUstruct_t* LUstruct,  gridinfo3d_t* grid3d, SCT_t* SCT)
+		   zLUstruct_t* LUstruct,  gridinfo3d_t* grid3d, SCT_t* SCT)
 {
-    LocalLU_t *Llu = LUstruct->Llu;
+    zLocalLU_t *Llu = LUstruct->Llu;
     int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
     doublecomplex** Unzval_br_ptr = Llu->Unzval_br_ptr;
     gridinfo_t* grid = &(grid3d->grid2d);
@@ -301,10 +298,10 @@ int_t zzSendUPanel(int_t k, int_t receiver,
 
 
 int_t zzRecvUPanel(int_t k, int_t sender, doublecomplex alpha, doublecomplex beta,
-                    doublecomplex* Uval_buf, LUstruct_t* LUstruct,
+                    doublecomplex* Uval_buf, zLUstruct_t* LUstruct,
                     gridinfo3d_t* grid3d, SCT_t* SCT)
 {
-    LocalLU_t *Llu = LUstruct->Llu;
+    zLocalLU_t *Llu = LUstruct->Llu;
     int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
     doublecomplex** Unzval_br_ptr = Llu->Unzval_br_ptr;
     gridinfo_t* grid = &(grid3d->grid2d);
@@ -346,7 +343,7 @@ int_t zzRecvUPanel(int_t k, int_t sender, doublecomplex alpha, doublecomplex bet
 }
 
 
-int_t zp3dScatter(int_t n, LUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+int_t zp3dScatter(int_t n, zLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
 /* Copies LU structure from layer 0 to all the layers */
 {
     gridinfo_t* grid = &(grid3d->grid2d);
@@ -360,14 +357,14 @@ int_t zp3dScatter(int_t n, LUstruct_t * LUstruct, gridinfo3d_t* grid3d)
     int_t nsupers;
     
     if (!grid3d->zscp.Iam)
-	nsupers = getNsupers(n, LUstruct);
+	nsupers = getNsupers(n, LUstruct->Glu_persist);
     
     /* broadcast nsupers */
     MPI_Bcast( &nsupers, 1, mpi_int_t, 0,  grid3d->zscp.comm);
     
     /* Scatter and alloc Glu_persist */
     if ( grid3d->zscp.Iam ) // all other process layers not equal 0
-	AllocGlu_3d(n, nsupers, LUstruct);
+	zAllocGlu_3d(n, nsupers, LUstruct);
     
     /* broadcast Glu_persist */
     int_t *xsup = LUstruct->Glu_persist->xsup;
@@ -376,12 +373,12 @@ int_t zp3dScatter(int_t n, LUstruct_t * LUstruct, gridinfo3d_t* grid3d)
     int_t *supno = LUstruct->Glu_persist->supno;
     MPI_Bcast( supno, n, mpi_int_t, 0,  grid3d->zscp.comm);
     
-    /* now broadcast localLu_t */
+    /* now broadcast localLu */
     /* first allocating space for it */
     if ( grid3d->zscp.Iam ) // all other process layers not equal 0
 	zAllocLlu(nsupers, LUstruct, grid3d);
     
-    LocalLU_t *Llu = LUstruct->Llu;
+    zLocalLU_t *Llu = LUstruct->Llu;
     
     /*scatter all the L blocks and indexes*/
     zscatter3dLPanels( nsupers, LUstruct, grid3d);
@@ -419,10 +416,10 @@ int_t zp3dScatter(int_t n, LUstruct_t * LUstruct, gridinfo3d_t* grid3d)
 
 
 int_t zscatter3dUPanels(int_t nsupers,
-		       LUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+			zLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
 {
 
-    LocalLU_t *Llu = LUstruct->Llu;
+    zLocalLU_t *Llu = LUstruct->Llu;
     int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
     doublecomplex** Unzval_br_ptr = Llu->Unzval_br_ptr;
     gridinfo_t* grid = &(grid3d->grid2d);
@@ -488,9 +485,9 @@ int_t zscatter3dUPanels(int_t nsupers,
 
 
 int_t zscatter3dLPanels(int_t nsupers,
-                       LUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+			zLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
 {
-    LocalLU_t *Llu = LUstruct->Llu;
+    zLocalLU_t *Llu = LUstruct->Llu;
     int_t* xsup = LUstruct->Glu_persist->xsup;
     gridinfo_t* grid = &(grid3d->grid2d);
     int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
@@ -570,11 +567,11 @@ int_t zscatter3dLPanels(int_t nsupers,
     return 0;
 } /* zscatter3dLPanels */
 
-int_t zcollect3dLpanels(int_t layer, int_t nsupers, LUstruct_t * LUstruct,
+int_t zcollect3dLpanels(int_t layer, int_t nsupers, zLUstruct_t * LUstruct,
 		       gridinfo3d_t* grid3d)
 {
 
-    LocalLU_t *Llu = LUstruct->Llu;
+    zLocalLU_t *Llu = LUstruct->Llu;
     int_t* xsup = LUstruct->Glu_persist->xsup;
     int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
     doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
@@ -615,10 +612,10 @@ int_t zcollect3dLpanels(int_t layer, int_t nsupers, LUstruct_t * LUstruct,
     return 0;
 }
 
-int_t zcollect3dUpanels(int_t layer, int_t nsupers, LUstruct_t * LUstruct,
+int_t zcollect3dUpanels(int_t layer, int_t nsupers, zLUstruct_t * LUstruct,
       			 gridinfo3d_t* grid3d)
 {
-    LocalLU_t *Llu = LUstruct->Llu;
+    zLocalLU_t *Llu = LUstruct->Llu;
     int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
     doublecomplex** Unzval_br_ptr = Llu->Unzval_br_ptr;
     gridinfo_t* grid = &(grid3d->grid2d);
@@ -650,9 +647,9 @@ int_t zcollect3dUpanels(int_t layer, int_t nsupers, LUstruct_t * LUstruct,
 }
 
 /* Gather the LU factors on layer-0 */
-int_t zp3dCollect(int_t layer, int_t n, LUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+int_t zp3dCollect(int_t layer, int_t n, zLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
 {
-    int_t nsupers = getNsupers(n, LUstruct);
+    int_t nsupers = getNsupers(n, LUstruct->Glu_persist);
     zcollect3dLpanels(layer, nsupers,  LUstruct, grid3d);
     zcollect3dUpanels(layer,  nsupers, LUstruct, grid3d);
     return 0;
@@ -660,10 +657,10 @@ int_t zp3dCollect(int_t layer, int_t n, LUstruct_t * LUstruct, gridinfo3d_t* gri
 
 
 /* Zero out LU non zero entries */
-int_t zzeroSetLU(int_t nnodes, int_t* nodeList, LUstruct_t *LUstruct,
+int_t zzeroSetLU(int_t nnodes, int_t* nodeList, zLUstruct_t *LUstruct,
       		 gridinfo3d_t* grid3d)
 {
-    LocalLU_t *Llu = LUstruct->Llu;
+    zLocalLU_t *Llu = LUstruct->Llu;
     int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
     doublecomplex** Unzval_br_ptr = Llu->Unzval_br_ptr;
     
@@ -728,7 +725,7 @@ int_t zzeroSetLU(int_t nnodes, int_t* nodeList, LUstruct_t *LUstruct,
 int_t zreduceAncestors3d(int_t sender, int_t receiver,
                         int_t nnodes, int_t* nodeList,
                         doublecomplex* Lval_buf, doublecomplex* Uval_buf,
-                        LUstruct_t* LUstruct,  gridinfo3d_t* grid3d, SCT_t* SCT)
+                        zLUstruct_t* LUstruct,  gridinfo3d_t* grid3d, SCT_t* SCT)
 {
     doublecomplex alpha = {1.0, 0.0}, beta = {1.0, 0.0};
     int_t myGrid = grid3d->zscp.Iam;
@@ -759,7 +756,7 @@ int_t zreduceAncestors3d(int_t sender, int_t receiver,
 int_t zgatherFactoredLU(int_t sender, int_t receiver,
                         int_t nnodes, int_t *nodeList,
                         zLUValSubBuf_t* LUvsb,
-                        LUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT)
+                        zLUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT)
 {
     doublecomplex alpha = {0.0, 0.0}, beta = {1.0, 0.0};
     doublecomplex * Lval_buf  = LUvsb->Lval_buf;
@@ -788,7 +785,7 @@ int_t zgatherFactoredLU(int_t sender, int_t receiver,
 
 
 int_t zinit3DLUstruct( int_t* myTreeIdxs, int_t* myZeroTrIdxs,
-                      int_t* nodeCount, int_t** nodeList, LUstruct_t* LUstruct,
+                      int_t* nodeCount, int_t** nodeList, zLUstruct_t* LUstruct,
 		      gridinfo3d_t* grid3d)
 {
     int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
@@ -808,7 +805,7 @@ int_t zinit3DLUstruct( int_t* myTreeIdxs, int_t* myZeroTrIdxs,
 
 
 int_t zreduceAllAncestors3d(int_t ilvl, int_t* myNodeCount, int_t** treePerm,
-                             zLUValSubBuf_t* LUvsb, LUstruct_t* LUstruct,
+                             zLUValSubBuf_t* LUvsb, zLUstruct_t* LUstruct,
                              gridinfo3d_t* grid3d, SCT_t* SCT )
 {
     doublecomplex * Lval_buf  = LUvsb->Lval_buf;
@@ -845,7 +842,7 @@ int_t zreduceAllAncestors3d(int_t ilvl, int_t* myNodeCount, int_t** treePerm,
 }
 
 int_t zgatherAllFactoredLU( trf3Dpartition_t*  trf3Dpartition,
-			   LUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT )
+			    zLUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT )
 {
     int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
     int_t myGrid = grid3d->zscp.Iam;
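[Note on the alpha/beta convention] The scalar pairs in the z hunks above, and in their d counterparts earlier, select between two uses of one update, A(k) = alpha*A(k) + beta*A^{sender}(k): the ancestor reduction passes alpha = beta = 1 to accumulate partial updates, while the gather of factored panels passes alpha = 0, beta = 1 to overwrite with the sender's copy. A scalar sketch of that axpby update in the real case (the complex variant applies the same formula componentwise); axpby itself is illustrative:

#include "superlu_defs.h"   /* int_t */

static void axpby(int_t len, double alpha, double beta,
                  double *Ak, const double *Ak_sender)
{
    for (int_t i = 0; i < len; ++i)   /* A(k) = alpha*A(k) + beta*A^{sender}(k) */
        Ak[i] = alpha * Ak[i] + beta * Ak_sender[i];
}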
diff --git a/SRC/pzgssvx3d.c b/SRC/pzgssvx3d.c
index 34822fa0..c01e3bcd 100644
--- a/SRC/pzgssvx3d.c
+++ b/SRC/pzgssvx3d.c
@@ -360,7 +360,7 @@ at the top-level directory.
 *           If all the above conditions are true, the LU decomposition is
  *           performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T.
  *
- * ScalePermstruct (input/output) ScalePermstruct_t* (global)
+ * ScalePermstruct (input/output) dScalePermstruct_t* (global)
  *         The data structure to store the scaling and permutation vectors
  *         describing the transformations performed to the matrix A.
  *         It contains the following fields:
@@ -435,7 +435,7 @@ at the top-level directory.
  *         Grid can be initialized by subroutine SUPERLU_GRIDINIT.
  *         See superlu_ddefs.h for the definition of 'gridinfo_t'.
  *
- * LUstruct (input/output) LUstruct_t*
+ * LUstruct (input/output) zLUstruct_t*
  *         The data structures to store the distributed L and U factors.
  *         It contains the following fields:
  *
@@ -462,13 +462,13 @@ at the top-level directory.
  *           The distributed data structures to store L and U factors.
  *           See superlu_ddefs.h for the definition of 'LocalLU_t'.
  *
- * SOLVEstruct (input/output) SOLVEstruct_t*
+ * SOLVEstruct (input/output) zSOLVEstruct_t*
  *         The data structure to hold the communication pattern used
  *         in the phases of triangular solution and iterative refinement.
 *         This pattern should be initialized only once for repeated solutions.
  *         If options->SolveInitialized = YES, it is an input argument.
  *         If options->SolveInitialized = NO and nrhs != 0, it is an output
- *         argument. See superlu_ddefs.h for the definition of 'SOLVEstruct_t'.
+ *         argument. See superlu_zdefs.h for the definition of 'zSOLVEstruct_t'.
  *
  * berr    (output) double*, dimension (nrhs) (global)
  *         The componentwise relative backward error of each solution
@@ -494,9 +494,9 @@ at the top-level directory.
 
 void
 pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
-           ScalePermstruct_t * ScalePermstruct,
+           zScalePermstruct_t * ScalePermstruct,
            doublecomplex B[], int ldb, int nrhs, gridinfo3d_t * grid3d,
-           LUstruct_t * LUstruct, SOLVEstruct_t * SOLVEstruct,
+           zLUstruct_t * LUstruct, zSOLVEstruct_t * SOLVEstruct,
            double *berr, SuperLUStat_t * stat, int *info)
 {
     NRformat_loc *Astore;
@@ -1211,7 +1211,7 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 	/* send the LU structure to all the grids */
 	zp3dScatter(n, LUstruct, grid3d);
 
-	int_t nsupers = getNsupers(n, LUstruct);
+	int_t nsupers = getNsupers(n, LUstruct->Glu_persist);
 	trf3Dpartition = zinitTrf3Dpartition(nsupers, options, LUstruct, grid3d);
 
 	SCT_t *SCT = (SCT_t *) SUPERLU_MALLOC(sizeof(SCT_t));
@@ -1391,7 +1391,7 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 		    {
 			/* Improve the solution by iterative refinement. */
 			int_t *it, *colind_gsmv = SOLVEstruct->A_colind_gsmv;
-			SOLVEstruct_t *SOLVEstruct1;    /* Used by refinement. */
+			zSOLVEstruct_t *SOLVEstruct1; /* Used by refinement. */
 
 			t = SuperLU_timer_ ();
 			if (options->RefineInitialized == NO || Fact == DOFACT) {
@@ -1447,8 +1447,8 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 		is different than the solve with nrhs RHS.
 		So we use SOLVEstruct1 for the refinement step.
 	      */
-				if (!(SOLVEstruct1 = (SOLVEstruct_t *)
-				      SUPERLU_MALLOC (sizeof (SOLVEstruct_t))))
+				if (!(SOLVEstruct1 = (zSOLVEstruct_t *)
+				      SUPERLU_MALLOC (sizeof (zSOLVEstruct_t))))
 				    ABORT ("Malloc fails for SOLVEstruct1");
 				/* Copy the same stuff */
 				SOLVEstruct1->row_to_proc = SOLVEstruct->row_to_proc;
@@ -1463,7 +1463,7 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 				if (!(SOLVEstruct1->gstrs_comm = (pxgstrs_comm_t *)
 				      SUPERLU_MALLOC (sizeof (pxgstrs_comm_t))))
 				    ABORT ("Malloc fails for gstrs_comm[]");
-				pxgstrs_init (n, m_loc, 1, fst_row, perm_r, perm_c, grid,
+				pzgstrs_init (n, m_loc, 1, fst_row, perm_r, perm_c, grid,
 					      Glu_persist, SOLVEstruct1);
 			    }
 			
diff --git a/SRC/pzgstrf2.c b/SRC/pzgstrf2.c
index bafeec54..b6193bad 100644
--- a/SRC/pzgstrf2.c
+++ b/SRC/pzgstrf2.c
@@ -362,7 +362,6 @@ pzgstrf2_trsm
 }  /* PZGSTRF2_trsm */
 
 	
-#if 0 /* COMMENT OUT 3D CODE FOR NOW */
 
 /*****************************************************************************
  * The following functions are for the new pdgstrf2_ztrsm in the 3D code.
@@ -373,7 +372,7 @@ int_t LpanelUpdate(int_t off0,  int_t nsupc, doublecomplex* ublk_ptr, int_t ld_u
 {
     int_t l = nsupr - off0;
     doublecomplex alpha = {1.0, 0.0};
-    unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
 
 #define GT  32
 #pragma omp parallel for
@@ -397,13 +396,13 @@ int_t LpanelUpdate(int_t off0,  int_t nsupc, doublecomplex* ublk_ptr, int_t ld_u
 
     } /* for i = ... */
 
-    t1 = _rdtsc() - t1;
+    t1 = SuperLU_timer_() - t1;
 
     SCT->trf2_flops += (double) l * (double)nsupc * (double)nsupc;
     SCT->trf2_time += t1;
     SCT->L_PanelUpdate_tl += t1;
     return 0;
-}
+} /* LpanelUpdate */
 
 #pragma GCC push_options
 #pragma GCC optimize ("O0")
@@ -413,7 +412,7 @@ void Local_Zgstrf2(superlu_dist_options_t *options, int_t k, double thresh,
                    Glu_persist_t *Glu_persist, gridinfo_t *grid, zLocalLU_t *Llu,
                    SuperLUStat_t *stat, int *info, SCT_t* SCT)
 {
-    //unsigned long long t1 = _rdtsc();
+    //double t1 = SuperLU_timer_();
     int_t *xsup = Glu_persist->xsup;
     doublecomplex alpha = {-1.0, 0.0}, zero = {0.0, 0.0}, one = {1.0, 0.0};
 
@@ -501,7 +500,7 @@ void Local_Zgstrf2(superlu_dist_options_t *options, int_t k, double thresh,
 
 
     //int_t thread_id = omp_get_thread_num();
-    // SCT->Local_Dgstrf2_Thread_tl[thread_id * CACHE_LINE_SIZE] += (double) ( _rdtsc() - t1);
+    // SCT->Local_Dgstrf2_Thread_tl[thread_id * CACHE_LINE_SIZE] += SuperLU_timer_() - t1;
 } /* Local_Zgstrf2 */
 
 #pragma GCC pop_options
@@ -743,9 +742,9 @@ int_t zTrs2_GatherTrsmScatter(int_t klst, int_t iukp, int_t rukp,
     return 0;
 }
 
-#endif /* END 3D CODE */
 /* +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */
 
+#if 1
 
 /*****************************************************************************
  * The following pdgstrf2_omp is improved for KNL, since Version 5.2.0.
@@ -894,7 +893,7 @@ void pzgstrs2_omp(int_t k0, int_t k, int_t* Lsub_buf,
 		  gridinfo_t *grid, zLocalLU_t *Llu, SuperLUStat_t *stat,
 		  Ublock_info_t *Ublock_info, doublecomplex *bigV, int_t ldt, SCT_t *SCT)
 {
-    unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
     int_t *xsup = Glu_persist->xsup;
     /* Quick return. */
     int_t lk = LBi (k, grid);         /* Local block number */
@@ -928,6 +927,7 @@ void pzgstrs2_omp(int_t k0, int_t k, int_t* Lsub_buf,
 				usub, uval, tempv, knsupc, nsupr, lusup, Glu_persist);
     } /* for b ... */
 
-    SCT->PDGSTRS2_tl += (double) ( _rdtsc() - t1);
+    SCT->PDGSTRS2_tl += (double) ( SuperLU_timer_() - t1);
 } /* pdgstrs2_omp new version from Piyush */
 
+#endif
diff --git a/SRC/pzgstrf3d.c b/SRC/pzgstrf3d.c
index e33c4a23..bc40ab75 100644
--- a/SRC/pzgstrf3d.c
+++ b/SRC/pzgstrf3d.c
@@ -79,7 +79,7 @@ at the top-level directory.
  * SCT    (input/output) SCT_t*
  *        Various statistics of 3D factorization.
  *
- * LUstruct (input/output) LUstruct_t*
+ * LUstruct (input/output) zLUstruct_t*
  *         The data structures to store the distributed L and U factors.
  *         The following fields should be defined:
  *
@@ -90,9 +90,9 @@ at the top-level directory.
  *         xsup[s] is the leading column of the s-th supernode,
  *             supno[i] is the supernode number to which column i belongs.
  *
- *         o Llu (input/output) LocalLU_t*
+ *         o Llu (input/output) zLocalLU_t*
  *           The distributed data structures to store L and U factors.
- *           See superlu_ddefs.h for the definition of 'LocalLU_t'.
+ *           See superlu_zdefs.h for the definition of 'zLocalLU_t'.
  *
  * grid3d (input) gridinfo3d_t*
  *        The 3D process mesh. It contains the MPI communicator, the number
@@ -117,11 +117,11 @@ at the top-level directory.
  */
 int_t pzgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
 		trf3Dpartition_t*  trf3Dpartition, SCT_t *SCT,
-		LUstruct_t *LUstruct, gridinfo3d_t * grid3d,
+		zLUstruct_t *LUstruct, gridinfo3d_t * grid3d,
 		SuperLUStat_t *stat, int *info)
 {
     gridinfo_t* grid = &(grid3d->grid2d);
-    LocalLU_t *Llu = LUstruct->Llu;
+    zLocalLU_t *Llu = LUstruct->Llu;
 
    // problem-specific constants
     int_t ldt = sp_ienv_dist (3);     /* Size of maximum supernode */
@@ -141,7 +141,7 @@ int_t pzgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
     //if (!grid3d->zscp.Iam && !grid3d->iam) printf("Using NSUP=%d\n", (int) ldt);
 
     //getting Nsupers
-    int_t nsupers = getNsupers(n, LUstruct);
+    int_t nsupers = getNsupers(n, LUstruct->Glu_persist);
 
     // Grid related Variables
     int_t iam = grid->iam; // in 2D grid
@@ -218,7 +218,8 @@ int_t pzgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
 
     HyP->first_l_block_acc = first_l_block_acc;
     HyP->first_u_block_acc = first_u_block_acc;
-    int_t bigu_size = getBigUSize(nsupers, grid, LUstruct);
+    //int_t bigu_size = getBigUSize(nsupers, grid, LUstruct);
+    int_t bigu_size = getBigUSize(nsupers, grid, LUstruct->Llu->Lrowind_bc_ptr);
     // int_t buffer_size = get_max_buffer_size ();
     // HyP->buffer_size = buffer_size;
     HyP->bigu_size = bigu_size;
diff --git a/SRC/sec_structs.c b/SRC/sec_structs.c
index 7b5ea143..1ae6ff80 100644
--- a/SRC/sec_structs.c
+++ b/SRC/sec_structs.c
@@ -184,7 +184,6 @@ void SCT_init(SCT_t* SCT)
     SCT->offloadable_flops = 0.0;
     SCT->offloadable_mops = 0.0;
 
-    SCT->SchurCompUdtThreadTime;
 #if 0
     SCT->SchurCompUdtThreadTime = (double *) _mm_malloc(num_threads * CACHE_LINE_SIZE * sizeof(double), 64);
 #else
diff --git a/SRC/superlu_ddefs.h b/SRC/superlu_ddefs.h
index 8f3e6570..d5e7fb2e 100644
--- a/SRC/superlu_ddefs.h
+++ b/SRC/superlu_ddefs.h
@@ -648,15 +648,15 @@ extern NRformat_loc3d *dGatherNRformat_loc3d(NRformat_loc *A, double *B,
 extern int dScatter_B3d(NRformat_loc3d *A3d, gridinfo3d_t *grid3d);
 
 extern void pdgssvx3d(superlu_dist_options_t *, SuperMatrix *,
-		      ScalePermstruct_t *, double B[], int ldb, int nrhs,
-		      gridinfo3d_t *, LUstruct_t *, SOLVEstruct_t *,
+		      dScalePermstruct_t *, double B[], int ldb, int nrhs,
+		      gridinfo3d_t *, dLUstruct_t *, dSOLVEstruct_t *,
 		      double *berr, SuperLUStat_t *, int *info);
 extern int_t pdgstrf3d(superlu_dist_options_t *, int m, int n, double anorm,
-		       trf3Dpartition_t *, SCT_t *, LUstruct_t *,
+		       trf3Dpartition_t *, SCT_t *, dLUstruct_t *,
 		       gridinfo3d_t *, SuperLUStat_t *, int *);
-    extern void dInit_HyP(HyP_t *HyP, LocalLU_t *Llu, int_t mcb, int_t mrb);
-    extern void Free_HyP(HyP_t *HyP);
-    extern int updateDirtyBit(int_t k0, HyP_t *HyP, gridinfo_t *grid);
+extern void dInit_HyP(HyP_t *HyP, dLocalLU_t *Llu, int_t mcb, int_t mrb);
+extern void Free_HyP(HyP_t *HyP);
+extern int updateDirtyBit(int_t k0, HyP_t *HyP, gridinfo_t *grid);
 
     /* from scatter.h */
     extern void
@@ -704,28 +704,28 @@ extern int_t pdgstrf3d(superlu_dist_options_t *, int m, int n, double anorm,
                                int_t knsupc, int_t klst, int_t *lsub,
                                int_t *usub, int_t ldt,
                                int *indirect, int *indirect2,
-                               HyP_t *HyP, LUstruct_t *, gridinfo_t *,
+                               HyP_t *HyP, dLUstruct_t *, gridinfo_t *,
                                SCT_t *SCT, SuperLUStat_t *);
     extern int_t
     dblock_gemm_scatterTopRight(int_t lb, int_t j, double *bigV,
                                 int_t knsupc, int_t klst, int_t *lsub,
                                 int_t *usub, int_t ldt,
                                 int *indirect, int *indirect2,
-                                HyP_t *HyP, LUstruct_t *, gridinfo_t *,
+                                HyP_t *HyP, dLUstruct_t *, gridinfo_t *,
                                 SCT_t *SCT, SuperLUStat_t *);
     extern int_t
     dblock_gemm_scatterBottomLeft(int_t lb, int_t j, double *bigV,
                                   int_t knsupc, int_t klst, int_t *lsub,
                                   int_t *usub, int_t ldt,
                                   int *indirect, int *indirect2,
-                                  HyP_t *HyP, LUstruct_t *, gridinfo_t *,
+                                  HyP_t *HyP, dLUstruct_t *, gridinfo_t *,
                                   SCT_t *SCT, SuperLUStat_t *);
     extern int_t
     dblock_gemm_scatterBottomRight(int_t lb, int_t j, double *bigV,
                                    int_t knsupc, int_t klst, int_t *lsub,
                                    int_t *usub, int_t ldt,
                                    int *indirect, int *indirect2,
-                                   HyP_t *HyP, LUstruct_t *, gridinfo_t *,
+                                   HyP_t *HyP, dLUstruct_t *, gridinfo_t *,
                                    SCT_t *SCT, SuperLUStat_t *);
 
     /* from gather.h */
@@ -757,10 +757,10 @@ extern int_t pdgstrf3d(superlu_dist_options_t *, int m, int n, double anorm,
     extern void d3D_printMemUse(trf3Dpartition_t *trf3Dpartition,
                                 dLUstruct_t *LUstruct, gridinfo3d_t *grid3d);
 
-    extern int *getLastDep(gridinfo_t *grid, SuperLUStat_t *stat,
-                           superlu_dist_options_t *options, dLocalLU_t *Llu,
-                           int_t *xsup, int_t num_look_aheads, int_t nsupers,
-                           int_t *iperm_c_supno);
+    //extern int *getLastDep(gridinfo_t *grid, SuperLUStat_t *stat,
+    //                       superlu_dist_options_t *options, dLocalLU_t *Llu,
+    //                       int_t *xsup, int_t num_look_aheads, int_t nsupers,
+    //                       int_t *iperm_c_supno);
 
     extern void dinit3DLUstructForest(int_t *myTreeIdxs, int_t *myZeroTrIdxs,
                                       sForest_t **sForests, dLUstruct_t *LUstruct,
@@ -823,9 +823,9 @@ extern int_t dcollect3dUpanels(int_t layer, int_t nsupers, dLUstruct_t * LUstruc
 extern int_t dp3dCollect(int_t layer, int_t n, dLUstruct_t * LUstruct, gridinfo3d_t* grid3d);
 /*zero out LU non zero entries*/
 extern int_t dzeroSetLU(int_t nnodes, int_t* nodeList , dLUstruct_t *, gridinfo3d_t*);
-extern int AllocGlu_3d(int_t n, int_t nsupers, dLUstruct_t *);
-extern int DeAllocLlu_3d(int_t n, dLUstruct_t *, gridinfo3d_t*);
-extern int DeAllocGlu_3d(dLUstruct_t *);
+extern int dAllocGlu_3d(int_t n, int_t nsupers, dLUstruct_t *);
+extern int dDeAllocLlu_3d(int_t n, dLUstruct_t *, gridinfo3d_t*);
+extern int dDeAllocGlu_3d(dLUstruct_t *);
 
 /* Reduces L and U panels of nodes in the List nodeList (size=nnnodes)
 receiver[L(nodelist)] =sender[L(nodelist)] +receiver[L(nodelist)]
@@ -876,21 +876,21 @@ int_t dzRecvUPanel(int_t k, int_t sender, double alpha,
     /* from communication_aux.h */
 extern int_t dIBcast_LPanel (int_t k, int_t k0, int_t* lsub, double* lusup,
 			     gridinfo_t *, int* msgcnt, MPI_Request *,
-			     int_t **ToSendR, int_t *xsup, int );
+			     int **ToSendR, int_t *xsup, int );
 extern int_t dBcast_LPanel(int_t k, int_t k0, int_t* lsub, double* lusup,
-			   gridinfo_t *, int* msgcnt, int_t **ToSendR,
+			   gridinfo_t *, int* msgcnt, int **ToSendR,
 			   int_t *xsup , SCT_t*, int);
 extern int_t dIBcast_UPanel(int_t k, int_t k0, int_t* usub, double* uval,
 			    gridinfo_t *, int* msgcnt, MPI_Request *,
-			    int_t *ToSendD, int );
+			    int *ToSendD, int );
 extern int_t dBcast_UPanel(int_t k, int_t k0, int_t* usub, double* uval,
-			   gridinfo_t *, int* msgcnt, int_t *ToSendD, SCT_t*, int);
+			   gridinfo_t *, int* msgcnt, int *ToSendD, SCT_t*, int);
 extern int_t dIrecv_LPanel (int_t k, int_t k0,  int_t* Lsub_buf, 
 			    double* Lval_buf, gridinfo_t *,
 			    MPI_Request *, dLocalLU_t *, int);
 extern int_t dIrecv_UPanel(int_t k, int_t k0, int_t* Usub_buf, double*,
 			   dLocalLU_t *, gridinfo_t*, MPI_Request *, int);
-extern int_t Wait_LSend(int_t k, gridinfo_t *grid, int_t **ToSendR,
+extern int_t Wait_LSend(int_t k, gridinfo_t *grid, int **ToSendR,
 			MPI_Request *s, SCT_t*);
 extern int_t Wait_USend(MPI_Request *, gridinfo_t *, SCT_t *);
 extern int_t dWait_URecv(MPI_Request *, int* msgcnt, SCT_t *);
@@ -963,7 +963,6 @@ extern int_t dLPanelTrSolve(int_t k, int_t* factored_L, double* BlockUFactor,
 			    gridinfo_t *, dLUstruct_t *);
 
     /* from trfAux.h */
-extern int_t getNsupers(int, dLUstruct_t *);
 extern int_t initPackLUInfo(int_t nsupers, packLUInfo_t* packLUInfo);
 extern int   freePackLUInfo(packLUInfo_t* packLUInfo);
 extern int_t dSchurComplementSetup(int_t k, int *msgcnt, Ublock_info_t*,
@@ -979,12 +978,7 @@ extern int_t dSchurComplementSetupGPU(int_t k, msgs_t* msgs, packLUInfo_t*,
 				      dLUstruct_t *, HyP_t*);
 extern double* dgetBigV(int_t, int_t);
 extern double* dgetBigU(int_t, gridinfo_t *, dLUstruct_t *);
-extern int_t getBigUSize(int_t, gridinfo_t *, dLUstruct_t *);
-// permutation from superLU default
-extern int_t* getPerm_c_supno(int_t nsupers, superlu_dist_options_t *,
-			      dLUstruct_t *, gridinfo_t *);
-extern void getSCUweight(int_t nsupers, treeList_t* treeList, dLUstruct_t *, gridinfo3d_t *);
-
+    
     /* from treeFactorization.h */
 extern int_t dLluBufInit(dLUValSubBuf_t*, dLUstruct_t *);
 extern int_t dinitScuBufs(int_t ldt, int_t num_threads, int_t nsupers,
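[Note on the header churn] The prototype moves all point one direction: helpers that never touch numerical values (getNsupers, getBigUSize, getPerm_c_supno, getSCUweight) lose their dLUstruct_t/zLUstruct_t arguments in favor of the precision-independent pieces, so a single declaration in superlu_defs.h can serve both real and complex code. A sketch of the new call sites, assuming the signatures declared in these hunks; sizes_3d is a hypothetical wrapper:

#include "superlu_ddefs.h"

static int_t sizes_3d(int n, dLUstruct_t *LUstruct, gridinfo_t *grid,
                      int *nsupers_out)
{
    int nsupers = getNsupers(n, LUstruct->Glu_persist);   /* was (n, LUstruct) */
    *nsupers_out = nsupers;
    return getBigUSize(nsupers, grid,
                       LUstruct->Llu->Lrowind_bc_ptr);    /* was (.., LUstruct) */
}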
diff --git a/SRC/superlu_defs.h b/SRC/superlu_defs.h
index db670222..193cba19 100644
--- a/SRC/superlu_defs.h
+++ b/SRC/superlu_defs.h
@@ -1114,26 +1114,18 @@ extern int_t 	   	StdList_Size(StdList lst);
 yes_no_t 		StdList_Empty(StdList lst);
 
 /*==== For 3D code ====*/
-/* Matrix distributed in NRformat_loc in 3D process grid, it converts 
-it to a NRformat_loc distributed in two-D grid in grid-0 */
-NRformat_loc dGatherNRformat_loc(NRformat_loc *A, 
-    double* B, int ldb, int nrhs, double** B2d,
-    gridinfo3d_t *grid3d);
-
-int dScatterB3d(NRformat_loc A2d, NRformat_loc *A,
-		 double *B, int ldb, int nrhs, double *B2d,
-                         gridinfo3d_t *grid3d);
-
-NRformat_loc3d*  dGatherNRformat_loc3d(NRformat_loc *A,
-                                 double *B, int ldb, int nrhs,
-                                 gridinfo3d_t *grid3d);
-extern int dScatterB3d_(NRformat_loc3d *A3d, gridinfo3d_t *grid3d);
     
 extern void DistPrint(char* function_name,  double value, char* Units, gridinfo_t* grid);
 extern void DistPrint3D(char* function_name,  double value, char* Units, gridinfo3d_t* grid3d);
 extern void treeImbalance3D(gridinfo3d_t *grid3d, SCT_t* SCT);
 extern void SCT_printComm3D(gridinfo3d_t *grid3d, SCT_t* SCT);
 
+// permutation from superLU default
+extern int_t* getPerm_c_supno(int_t nsupers, superlu_dist_options_t *,
+			      int_t *etree, Glu_persist_t *Glu_persist, 
+			      int_t** Lrowind_bc_ptr, int_t** Ufstnz_br_ptr,
+			      gridinfo_t *);
+
 /* Manipulate counters */
 extern void SCT_init(SCT_t*);
 extern void SCT_print(gridinfo_t *grid, SCT_t* SCT);
@@ -1224,6 +1216,11 @@ extern sForest_t**  getGreedyLoadBalForests( int_t maxLvl, int_t nsupers, int_t*
 extern sForest_t**  getForests( int_t maxLvl, int_t nsupers, int_t*setree, treeList_t* treeList);
 
     /* from trfAux.h */
+extern int_t getBigUSize(int_t nsupers, gridinfo_t *grid, int_t **Lrowind_bc_ptr);
+extern void getSCUweight(int_t nsupers, treeList_t* treeList, int_t* xsup,
+			 int_t** Lrowind_bc_ptr, int_t** Ufstnz_br_ptr,
+			 gridinfo3d_t * grid3d);
+extern int getNsupers(int n, Glu_persist_t *Glu_persist);
 extern int set_tag_ub();
 extern int getNumThreads(int);
 extern int_t num_full_cols_U(int_t kk, int_t **Ufstnz_br_ptr, int_t *xsup,
diff --git a/SRC/superlu_dist_config.h b/SRC/superlu_dist_config.h
index 4ea5a7ff..ffd061a2 100644
--- a/SRC/superlu_dist_config.h
+++ b/SRC/superlu_dist_config.h
@@ -13,7 +13,7 @@
 /* #undef HAVE_COMBBLAS */
 
 /* enable 64bit index mode */
-#define XSDK_INDEX_SIZE 64
+/* #undef XSDK_INDEX_SIZE */
 
 #if (XSDK_INDEX_SIZE == 64)
 #define _LONGINT 1
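[Note] Reverting XSDK_INDEX_SIZE to undefined switches the build back to 32-bit indexing; the macro feeds _LONGINT, which selects the width of int_t. A hedged reconstruction of that chain (the authoritative typedef lives in superlu_defs.h):

#if (XSDK_INDEX_SIZE == 64)
#define _LONGINT 1
#endif

#ifdef _LONGINT
typedef long long int int_t;   /* 64-bit indices for very large problems */
#else
typedef int int_t;             /* default 32-bit indices */
#endif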
diff --git a/SRC/superlu_grid3d.c b/SRC/superlu_grid3d.c
index 3a5ff2de..7c9be57f 100644
--- a/SRC/superlu_grid3d.c
+++ b/SRC/superlu_grid3d.c
@@ -65,12 +65,14 @@ void superlu_gridmap3d(
     int *pranks;
     int i, j, info;
 
+#if 0 // older MPI doesn't support complex in C    
     /* Create datatype in C for MPI complex. */
     if ( SuperLU_MPI_DOUBLE_COMPLEX == MPI_DATATYPE_NULL ) {
         MPI_Type_contiguous( 2, MPI_DOUBLE, &SuperLU_MPI_DOUBLE_COMPLEX );
         MPI_Type_commit( &SuperLU_MPI_DOUBLE_COMPLEX );
     }
-
+#endif
+    
     /* Check MPI environment initialization. */
     MPI_Initialized( &info );
     if ( !info )
@@ -283,7 +285,11 @@ void superlu_gridexit3d(gridinfo3d_t *grid)
         MPI_Comm_free( &grid->grid2d.comm );
         MPI_Comm_free( &grid->comm );
     }
+#if 0    
     if ( SuperLU_MPI_DOUBLE_COMPLEX != MPI_DATATYPE_NULL ) {
         MPI_Type_free( &SuperLU_MPI_DOUBLE_COMPLEX );
+	SuperLU_MPI_DOUBLE_COMPLEX = MPI_DATATYPE_NULL; /* some MPI implementations do not
+							   reset this to NULL in MPI_Type_free */
     }
+#endif    
 }
diff --git a/SRC/superlu_zdefs.h b/SRC/superlu_zdefs.h
index 43c283b4..bb4ae8e7 100644
--- a/SRC/superlu_zdefs.h
+++ b/SRC/superlu_zdefs.h
@@ -329,112 +329,7 @@ typedef struct
     lPanelInfo_t* lPanelInfo;
 } packLUInfo_t;
 
-#endif
-/*=====================*/
-
-/*==== For 3D code ====*/
-
-// new structures for pdgstrf_4_8 
-
-typedef struct
-{
-    int_t nub;
-    int_t klst;
-    int_t ldu;
-    int_t* usub;
-    doublecomplex* uval;
-} uPanelInfo_t;
-
-typedef struct
-{
-    int_t *lsub;
-    doublecomplex *lusup;
-    int_t luptr0;
-    int_t nlb;  //number of l blocks
-    int_t nsupr;
-} lPanelInfo_t;
-
- 
-
-/* HyP_t is the data structure to assist HALO offload of Schur-complement. */
-typedef struct
-{
-    Remain_info_t *lookAhead_info, *Remain_info;
-    Ublock_info_t *Ublock_info, *Ublock_info_Phi;
-    
-    int_t first_l_block_acc , first_u_block_acc;
-    int_t last_offload ;
-    int_t *Lblock_dirty_bit, * Ublock_dirty_bit;
-    doublecomplex *lookAhead_L_buff, *Remain_L_buff;
-    int_t lookAheadBlk;  /* number of blocks in look-ahead window */
-    int_t RemainBlk ;    /* number of blocks outside look-ahead window */
-    int_t  num_look_aheads, nsupers;
-    int_t ldu, ldu_Phi;
-    int_t num_u_blks, num_u_blks_Phi;
-
-    int_t jj_cpu;
-    doublecomplex *bigU_Phi;
-    doublecomplex *bigU_host;
-    int_t Lnbrow;
-    int_t Rnbrow;
-
-    int_t buffer_size;
-    int_t bigu_size;
-    int_t offloadCondition;
-    int_t superlu_acc_offload;
-    int_t nCudaStreams;
-} HyP_t;
-
-typedef struct 
-{
-    int_t * Lsub_buf ;
-    doublecomplex * Lval_buf ;
-    int_t * Usub_buf ;
-    doublecomplex * Uval_buf ;
-} zLUValSubBuf_t;
-
-int_t scuStatUpdate(
-    int_t knsupc,
-    HyP_t* HyP, 
-    SCT_t* SCT,
-    SuperLUStat_t *stat
-    );
-
-typedef struct
-{
-    gEtreeInfo_t gEtreeInfo;
-    int_t* iperm_c_supno;
-    int_t* myNodeCount;
-    int_t* myTreeIdxs;
-    int_t* myZeroTrIdxs;
-    int_t** treePerm;
-    sForest_t** sForests;
-    int_t* supernode2treeMap;
-    zLUValSubBuf_t  *LUvsb;
-} trf3Dpartition_t;
-
-typedef struct
-{
-    doublecomplex *bigU;
-    doublecomplex *bigV;
-} scuBufs_t;
-
-typedef struct
-{   
-    doublecomplex* BlockLFactor;
-    doublecomplex* BlockUFactor;
-} diagFactBufs_t;
-
-typedef struct
-{
-    Ublock_info_t* Ublock_info;
-    Remain_info_t*  Remain_info;
-    uPanelInfo_t* uPanelInfo;
-    lPanelInfo_t* lPanelInfo;
-} packLUInfo_t;
-
-
-/*=====================*/
+/*==== End 3D structures ============*/
 
 /***********************************************************************
  * Function prototypes
@@ -757,13 +652,13 @@ extern NRformat_loc3d *zGatherNRformat_loc3d(NRformat_loc *A, doublecomplex *B,
 extern int zScatter_B3d(NRformat_loc3d *A3d, gridinfo3d_t *grid3d);
     
 extern void pzgssvx3d (superlu_dist_options_t *, SuperMatrix *,
-		       ScalePermstruct_t *, doublecomplex B[], int ldb, int nrhs,
-		       gridinfo3d_t *, LUstruct_t *, SOLVEstruct_t *, 
+		       zScalePermstruct_t *, doublecomplex B[], int ldb, int nrhs,
+		       gridinfo3d_t *, zLUstruct_t *, zSOLVEstruct_t *, 
 		       double *berr, SuperLUStat_t *, int *info);
 extern int_t pzgstrf3d(superlu_dist_options_t *, int m, int n, double anorm,
-		       trf3Dpartition_t*, SCT_t *, LUstruct_t *,
+		       trf3Dpartition_t*, SCT_t *, zLUstruct_t *,
 		       gridinfo3d_t *, SuperLUStat_t *, int *);
-extern void zInit_HyP(HyP_t* HyP, LocalLU_t *Llu, int_t mcb, int_t mrb );
+extern void zInit_HyP(HyP_t* HyP, zLocalLU_t *Llu, int_t mcb, int_t mrb );
 extern void Free_HyP(HyP_t* HyP);
 extern int updateDirtyBit(int_t k0, HyP_t* HyP, gridinfo_t* grid);
 
@@ -811,7 +706,7 @@ zblock_gemm_scatterTopLeft( int_t lb,  int_t j, doublecomplex* bigV,
 				 int_t knsupc,  int_t klst, int_t* lsub,
                                  int_t * usub, int_t ldt,
 				 int* indirect, int* indirect2,
-                                 HyP_t* HyP, LUstruct_t *, gridinfo_t*,
+                                 HyP_t* HyP, zLUstruct_t *, gridinfo_t*,
                                  SCT_t*SCT, SuperLUStat_t *
                                );
 extern int_t 
@@ -819,21 +714,21 @@ zblock_gemm_scatterTopRight( int_t lb,  int_t j, doublecomplex* bigV,
 				  int_t knsupc,  int_t klst, int_t* lsub,
                                   int_t * usub, int_t ldt,
 				  int* indirect, int* indirect2,
-                                  HyP_t* HyP, LUstruct_t *, gridinfo_t*,
+                                  HyP_t* HyP, zLUstruct_t *, gridinfo_t*,
                                   SCT_t*SCT, SuperLUStat_t * );
 extern int_t
 zblock_gemm_scatterBottomLeft( int_t lb,  int_t j, doublecomplex* bigV,
 				    int_t knsupc,  int_t klst, int_t* lsub,
                                     int_t * usub, int_t ldt, 
 				    int* indirect, int* indirect2,
-                                    HyP_t* HyP, LUstruct_t *, gridinfo_t*,
+                                    HyP_t* HyP, zLUstruct_t *, gridinfo_t*,
                                     SCT_t*SCT, SuperLUStat_t * );
 extern int_t 
 zblock_gemm_scatterBottomRight( int_t lb,  int_t j, doublecomplex* bigV,
 				     int_t knsupc,  int_t klst, int_t* lsub,
                                      int_t * usub, int_t ldt,
 				     int* indirect, int* indirect2,
-                                     HyP_t* HyP, LUstruct_t *, gridinfo_t*,
+                                     HyP_t* HyP, zLUstruct_t *, gridinfo_t*,
                                      SCT_t*SCT, SuperLUStat_t * );
 
     /* from gather.h */
@@ -859,16 +754,16 @@ extern void zRgather_U(int_t k, int_t jj0, int_t *usub, doublecomplex *uval,
     /* from xtrf3Dpartition.h */
 extern trf3Dpartition_t* zinitTrf3Dpartition(int_t nsupers,
 					     superlu_dist_options_t *options,
-					     LUstruct_t *LUstruct, gridinfo3d_t * grid3d);
+					     zLUstruct_t *LUstruct, gridinfo3d_t * grid3d);
 extern void zDestroy_trf3Dpartition(trf3Dpartition_t *trf3Dpartition, gridinfo3d_t *grid3d);
 
 extern void z3D_printMemUse(trf3Dpartition_t*  trf3Dpartition,
-			    LUstruct_t *LUstruct, gridinfo3d_t * grid3d);
+			    zLUstruct_t *LUstruct, gridinfo3d_t * grid3d);
 
-extern int* getLastDep(gridinfo_t *grid, SuperLUStat_t *stat,
-		       superlu_dist_options_t *options, LocalLU_t *Llu,
-		       int_t* xsup, int_t num_look_aheads, int_t nsupers,
-		       int_t * iperm_c_supno);
+    //extern int* getLastDep(gridinfo_t *grid, SuperLUStat_t *stat,
+    //		       superlu_dist_options_t *options, LocalLU_t *Llu,
+    //		       int_t* xsup, int_t num_look_aheads, int_t nsupers,
+    //		       int_t * iperm_c_supno);
 
 extern int_t zLpanelUpdate(int_t off0, int_t nsupc, doublecomplex* ublk_ptr,
 			  int_t ld_ujrow, doublecomplex* lusup, int_t nsupr, SCT_t*);
@@ -915,9 +810,9 @@ extern int_t zcollect3dUpanels(int_t layer, int_t nsupers, zLUstruct_t * LUstruc
 extern int_t zp3dCollect(int_t layer, int_t n, zLUstruct_t * LUstruct, gridinfo3d_t* grid3d);
 /*zero out LU non zero entries*/
 extern int_t zzeroSetLU(int_t nnodes, int_t* nodeList , zLUstruct_t *, gridinfo3d_t*);
-extern int AllocGlu_3d(int_t n, int_t nsupers, zLUstruct_t *);
-extern int DeAllocLlu_3d(int_t n, zLUstruct_t *, gridinfo3d_t*);
-extern int DeAllocGlu_3d(zLUstruct_t *);
+extern int zAllocGlu_3d(int_t n, int_t nsupers, zLUstruct_t *);
+extern int zDeAllocLlu_3d(int_t n, zLUstruct_t *, gridinfo3d_t*);
+extern int zDeAllocGlu_3d(zLUstruct_t *);
 
 /* Reduces L and U panels of nodes in the List nodeList (size=nnnodes)
 receiver[L(nodelist)] =sender[L(nodelist)] +receiver[L(nodelist)]
@@ -980,9 +875,9 @@ extern int_t zBcast_UPanel(int_t k, int_t k0, int_t* usub, doublecomplex* uval,
 			   gridinfo_t *, int* msgcnt, int *ToSendD, SCT_t*, int);
 extern int_t zIrecv_LPanel (int_t k, int_t k0,  int_t* Lsub_buf, 
 			    doublecomplex* Lval_buf, gridinfo_t *,
-			    MPI_Request *, LocalLU_t *, int);
+			    MPI_Request *, zLocalLU_t *, int);
 extern int_t zIrecv_UPanel(int_t k, int_t k0, int_t* Usub_buf, doublecomplex*,
-			   LocalLU_t *, gridinfo_t*, MPI_Request *, int);
+			   zLocalLU_t *, gridinfo_t*, MPI_Request *, int);
 extern int_t Wait_LSend(int_t k, gridinfo_t *grid, int **ToSendR,
 			MPI_Request *s, SCT_t*);
 extern int_t Wait_USend(MPI_Request *, gridinfo_t *, SCT_t *);
@@ -1056,7 +951,6 @@ extern int_t zLPanelTrSolve(int_t k, int_t* factored_L, doublecomplex* BlockUFac
 			    gridinfo_t *, zLUstruct_t *);
 
     /* from trfAux.h */
-extern int_t getNsupers(int, zLUstruct_t *);
 extern int_t initPackLUInfo(int_t nsupers, packLUInfo_t* packLUInfo);
 extern int   freePackLUInfo(packLUInfo_t* packLUInfo);
 extern int_t zSchurComplementSetup(int_t k, int *msgcnt, Ublock_info_t*,
@@ -1072,11 +966,6 @@ extern int_t zSchurComplementSetupGPU(int_t k, msgs_t* msgs, packLUInfo_t*,
 				      zLUstruct_t *, HyP_t*);
 extern doublecomplex* zgetBigV(int_t, int_t);
 extern doublecomplex* zgetBigU(int_t, gridinfo_t *, zLUstruct_t *);
-extern int_t getBigUSize(int_t, gridinfo_t *, zLUstruct_t *);
-// permutation from superLU default
-extern int_t* getPerm_c_supno(int_t nsupers, superlu_dist_options_t *,
-			      zLUstruct_t *, gridinfo_t *);
-extern void getSCUweight(int_t nsupers, treeList_t* treeList, zLUstruct_t *, gridinfo3d_t *);
 
     /* from treeFactorization.h */
 extern int_t zLluBufInit(zLUValSubBuf_t*, zLUstruct_t *);
diff --git a/SRC/supernodal_etree.c b/SRC/supernodal_etree.c
index 225d224d..a282187e 100644
--- a/SRC/supernodal_etree.c
+++ b/SRC/supernodal_etree.c
@@ -244,6 +244,7 @@ int_t printFileList(char* sname, int_t nnodes, int_t*dlist, int_t*setree)
 	fprintf(fp, "}\n");
 	fprintf(fp, "//EOF\n");
 	fclose(fp);
+	return 0;
 }
 
 int_t getDescendList(int_t k, int_t*dlist,  treeList_t* treeList)
@@ -330,10 +331,10 @@ int_t* getEtreeLB(int_t nnodes, int_t* perm_l, int_t* gTopOrder)
 // calculates EtreeLB boundaries for given list of nodes, via perm_l
 {
 	//calculate minimum and maximum topOrder
-	int_t minTop, maxTop;
+	int minTop, maxTop;
 	minTop = gTopOrder[perm_l[0]];
 	maxTop = gTopOrder[perm_l[nnodes - 1]];
-	int_t numLB = maxTop - minTop + 2;
+	int numLB = maxTop - minTop + 2;
 	//int_t* lEtreeLB = (int_t *) malloc( sizeof(int_t) * numLB);
 	int_t* lEtreeLB = (int_t *) intMalloc_dist(numLB); // Sherry fix
 	for (int i = 0; i < numLB; ++i)
@@ -342,8 +343,8 @@ int_t* getEtreeLB(int_t nnodes, int_t* perm_l, int_t* gTopOrder)
 		lEtreeLB[i] = 0;
 	}
 	lEtreeLB[0] = 0;
-	int_t curLevel = minTop;
-	int_t curPtr = 1;
+	int curLevel = minTop;
+	int curPtr = 1;
 	for (int i = 0; i < nnodes ; ++i)
 	{
 		/* code */
@@ -426,7 +427,7 @@ int_t testSubtreeNodelist(int_t nsupers, int_t numList, int_t** nodeList, int_t*
 int_t testListPerm(int_t nodeCount, int_t* nodeList, int_t* permList, int_t* gTopLevel)
 {
 	// checking monotonicity
-	for (int_t i = 0; i < nodeCount - 1; ++i)
+	for (int i = 0; i < nodeCount - 1; ++i)
 	{
 		if (!( gTopLevel[permList[i]] <= gTopLevel[permList[i + 1]]))
 		{
@@ -485,10 +486,10 @@ int_t* merg_perms(int_t nperms, int_t* nnodes, int_t** perms)
 
 	//now concatenate the arrays
 	int_t ptr = 0;
-	for (int_t tr = 0; tr < nperms; ++tr)
+	for (int tr = 0; tr < nperms; ++tr)
 	{
 		/* code */
-		for (int_t nd = 0; nd < nnodes[tr]; ++nd)
+		for (int nd = 0; nd < nnodes[tr]; ++nd)
 		{
 			/* code */
 			gperm[ptr] = perms[tr][nd];
@@ -533,6 +534,7 @@ int_t mergPermTest(int_t nperms, int_t* gperms, int_t* nnodes)
 	return nn;
 } /* mergPermTest */
 
+#if 0 // Sherry: not called anymore
 int* getLastDep(gridinfo_t *grid, SuperLUStat_t *stat,
 		superlu_dist_options_t *options,
                 LocalLU_t *Llu, int_t* xsup,
@@ -662,6 +664,9 @@ int* getLastDepBtree( int_t nsupers, treeList_t* treeList)
 	return look_ahead;
 }
 
+#endif // Sherry: not called anymore
+
+
 int_t* getGlobal_iperm(int_t nsupers, int_t nperms,  // number of permutations
                        int_t** perms, 		// array of permutations
                        int_t* nnodes 		// number of nodes in each permutation
@@ -961,8 +966,8 @@ void Print_EtreeLevelBoundry(int_t *Etree_LvlBdry, int_t max_level, int_t nsuper
 {
 	for (int i = 0; i < max_level; ++i)
 	{
-		int_t st = 0;
-		int_t ed = nsuper;
+		int st = 0;
+		int ed = nsuper;
 		st = Etree_LvlBdry[i];
 		ed = Etree_LvlBdry[i + 1];
 		printf("Level %d, NumSuperNodes=%d,\t Start=%d end=%d\n", i, ed - st, st, ed);
diff --git a/SRC/trfAux.c b/SRC/trfAux.c
index c6ef1535..d0280189 100644
--- a/SRC/trfAux.c
+++ b/SRC/trfAux.c
@@ -29,10 +29,9 @@ int_t getslu25D_enabled()
     }
 }
 
-int_t getNsupers(int n, LUstruct_t *LUstruct)
+int getNsupers(int n, Glu_persist_t *Glu_persist)
 {
-    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    int_t nsupers = Glu_persist->supno[n - 1] + 1;
+    int nsupers = Glu_persist->supno[n - 1] + 1;
     return nsupers;
 }
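This hunk belongs to a de-templating pass that runs through the rest of the
patch: routines touching only precision-independent data (supno, xsup, the
index arrays) now take those components directly instead of a typed
LUstruct_t, so a single compiled copy serves both the d and z variants. A
before/after sketch of a call site, using the names from this patch:

    /* before: bound to one precision's LUstruct_t */
    int_t nsupers = getNsupers(n, LUstruct);
    /* after: any precision can pass its own Glu_persist */
    int nsupers = getNsupers(n, LUstruct->Glu_persist);
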
 
@@ -150,8 +149,8 @@ int_t estimate_bigu_size( int_t nsupers, int_t ldt, int_t**Ufstnz_br_ptr,
 } /* old estimate_bigu_size. New one is in util.c */
 #endif /**** end old ones ****/
 
-int_t getBigUSize(int_t nsupers, gridinfo_t *grid,
-                  LUstruct_t *LUstruct)
+int_t getBigUSize(int_t nsupers, gridinfo_t *grid, int_t **Lrowind_bc_ptr)
+//LUstruct_t *LUstruct)
 {
 
     int_t Pr = grid->nprow;
@@ -170,7 +169,8 @@ int_t getBigUSize(int_t nsupers, gridinfo_t *grid,
         if (mycol == tpc)
         {
             int_t lk = LBj (i, grid);
-            int_t* lsub = LUstruct->Llu->Lrowind_bc_ptr[lk];
+            //int_t* lsub = LUstruct->Llu->Lrowind_bc_ptr[lk];
+            int_t* lsub = Lrowind_bc_ptr[lk];
             if (lsub != NULL)
             {
                 local_max_row_size = SUPERLU_MAX (local_max_row_size, lsub[1]);
@@ -221,17 +221,18 @@ int_t* getFactIperm(int_t* perm, int_t nsupers)
     return iperm;
 }
 
-int_t* getPerm_c_supno(int_t nsupers,
-                       superlu_dist_options_t *options,
-                       LUstruct_t *LUstruct, gridinfo_t *grid)
+int_t* getPerm_c_supno(int_t nsupers, superlu_dist_options_t *options,
+		       int_t *etree, Glu_persist_t *Glu_persist,
+		       int_t** Lrowind_bc_ptr, int_t** Ufstnz_br_ptr,
+		       gridinfo_t *grid)
 
 {
     /*I do not understand the following code in detail,
     I have just written a wrapper around it*/
 
     int_t* perm_c_supno;
-    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    LocalLU_t *Llu = LUstruct->Llu;
+    //Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    //LocalLU_t *Llu = LUstruct->Llu;
     int_t* xsup = Glu_persist->xsup;
 
     int_t iam = grid->iam;
@@ -270,7 +271,7 @@ int_t* getPerm_c_supno(int_t nsupers,
         {
             /* Use the etree computed from serial symb. fact., and turn it
             into supernodal tree.  */
-            int_t *etree = LUstruct->etree;
+            //int_t *etree = LUstruct->etree;
 #if ( PRNTlevel>=1 )
             if ( grid->iam == 0 ) printf( " === using column e-tree ===\n" );
 #endif
@@ -302,7 +303,8 @@ int_t* getPerm_c_supno(int_t nsupers,
             for ( lb = 0; lb < ncb; lb++ )
             {
                 jb = lb * grid->npcol + mycol;
-                index = Llu->Lrowind_bc_ptr[lb];
+                //index = Llu->Lrowind_bc_ptr[lb];
+                index = Lrowind_bc_ptr[lb];
                 if ( index )   /* Not an empty column */
                 {
                     i = index[0];
@@ -330,7 +332,8 @@ int_t* getPerm_c_supno(int_t nsupers,
             if ( mycol < nsupers % grid->npcol )
             {
                 jb = ncb * grid->npcol + mycol;
-                index = Llu->Lrowind_bc_ptr[ncb];
+                //index = Llu->Lrowind_bc_ptr[ncb];
+                index = Lrowind_bc_ptr[ncb];
                 if ( index )   /* Not an empty column */
                 {
                     i = index[0];
@@ -466,7 +469,8 @@ int_t* getPerm_c_supno(int_t nsupers,
         {
             jb = lb * Pr + myrow;
             pc = jb % Pc;
-            index = Llu->Ufstnz_br_ptr[lb];
+            //index = Llu->Ufstnz_br_ptr[lb];
+            index = Ufstnz_br_ptr[lb];
 
             if ( index )   /* Not an empty row */
             {
@@ -488,7 +492,8 @@ int_t* getPerm_c_supno(int_t nsupers,
         {
             jb = nrb * Pr + myrow;
             pc = jb % Pc;
-            index = Llu->Ufstnz_br_ptr[nrb];
+            //index = Llu->Ufstnz_br_ptr[nrb];
+	    index = Ufstnz_br_ptr[nrb];
 
             if ( index )   /* Not an empty row */
             {
@@ -514,7 +519,8 @@ int_t* getPerm_c_supno(int_t nsupers,
         {
             jb = lb * Pr + myrow;
             pc = jb % Pc;
-            index = Llu->Ufstnz_br_ptr[lb];
+            //index = Llu->Ufstnz_br_ptr[lb];
+            index = Ufstnz_br_ptr[lb];
 
             if ( index )   /* Not an empty row */
             {
@@ -535,7 +541,8 @@ int_t* getPerm_c_supno(int_t nsupers,
         {
             jb = nrb * Pr + myrow;
             pc = jb % Pc;
-            index = Llu->Ufstnz_br_ptr[nrb];
+            //index = Llu->Ufstnz_br_ptr[nrb];
+            index = Ufstnz_br_ptr[nrb];
 
             if ( index )   /* Not an empty row */
             {
@@ -631,7 +638,8 @@ int_t* getPerm_c_supno(int_t nsupers,
         for ( lb = 0; lb < ncb; lb++ )
         {
             jb = lb * Pc + mycol;
-            index = Llu->Lrowind_bc_ptr[lb];
+            //index = Llu->Lrowind_bc_ptr[lb];
+            index = Lrowind_bc_ptr[lb];
             if ( index )   /* Not an empty column */
             {
                 nblocks += index[0];
@@ -640,7 +648,8 @@ int_t* getPerm_c_supno(int_t nsupers,
         if ( mycol < nsupers % grid->npcol )
         {
             jb = ncb * Pc + mycol;
-            index = Llu->Lrowind_bc_ptr[ncb];
+            //index = Llu->Lrowind_bc_ptr[ncb];
+            index = Lrowind_bc_ptr[ncb];
             if ( index )   /* Not an empty column */
             {
                 nblocks += index[0];
@@ -656,7 +665,8 @@ int_t* getPerm_c_supno(int_t nsupers,
             Lrows[lb] = 0;
 
             jb = lb * Pc + mycol;
-            index = Llu->Lrowind_bc_ptr[lb];
+            //index = Llu->Lrowind_bc_ptr[lb];
+            index = Lrowind_bc_ptr[lb];
             if ( index )   /* Not an empty column */
             {
                 i = index[0];
@@ -687,7 +697,8 @@ int_t* getPerm_c_supno(int_t nsupers,
         {
             Lrows[ncb] = 0;
             jb = ncb * Pc + mycol;
-            index = Llu->Lrowind_bc_ptr[ncb];
+            //index = Llu->Lrowind_bc_ptr[ncb];
+            index = Lrowind_bc_ptr[ncb];
             if ( index )   /* Not an empty column */
             {
                 i = index[0];
@@ -1148,14 +1159,48 @@ int_t* getPerm_c_supno(int_t nsupers,
 } /* getPerm_c_supno */
 
 
-void getSCUweight(int_t nsupers, treeList_t* treeList,
-                  LUstruct_t *LUstruct, gridinfo3d_t * grid3d
-                 )
+int_t Trs2_InitUblock_info(int_t klst, int_t nb,
+			    Ublock_info_t *Ublock_info,
+			    int_t *usub,
+			    Glu_persist_t *Glu_persist, SuperLUStat_t *stat )
+{
+    int_t *xsup = Glu_persist->xsup;
+    int_t iukp, rukp;
+    iukp = BR_HEADER;
+    rukp = 0;
+
+    for (int_t b = 0; b < nb; ++b)
+    {
+        int_t gb = usub[iukp];
+        int_t nsupc = SuperSize (gb);
+
+        Ublock_info[b].iukp = iukp;
+        Ublock_info[b].rukp = rukp;
+        // Ublock_info[b].nsupc = nsupc;
+
+        iukp += UB_DESCRIPTOR;
+	/* Sherry: can remove this loop for rukp
+	   rukp += usub[iukp-1];
+	 */
+       for (int_t j = 0; j < nsupc; ++j)
+        {
+            int_t segsize = klst - usub[iukp++];
+            rukp += segsize;
+            stat->ops[FACT] += segsize * (segsize + 1);
+        }
+    }
+    return 0;
+}
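+
+/* Layout reminder for the loop above: usub opens with BR_HEADER, then each
+   U block stores its global block number, a UB_DESCRIPTOR, and one first-row
+   index per column; a segment's length is klst minus that index.  Toy walk
+   (numbers invented for illustration): klst = 100, one block with two
+   columns whose first rows are 97 and 95 -> segment sizes 3 and 5, so rukp
+   advances by 8 and stat->ops[FACT] grows by 3*4 + 5*6 = 42.              */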
+
+void getSCUweight(int_t nsupers, treeList_t* treeList, int_t* xsup,
+		  int_t** Lrowind_bc_ptr, int_t** Ufstnz_br_ptr,
+		  gridinfo3d_t * grid3d
+		  )
 {
     gridinfo_t* grid = &(grid3d->grid2d);
-    int_t** Lrowind_bc_ptr = LUstruct->Llu->Lrowind_bc_ptr;
-    int_t** Ufstnz_br_ptr = LUstruct->Llu->Ufstnz_br_ptr;
-    int_t* xsup = LUstruct->Glu_persist->xsup;
+    //int_t** Lrowind_bc_ptr = LUstruct->Llu->Lrowind_bc_ptr;
+    //int_t** Ufstnz_br_ptr = LUstruct->Llu->Ufstnz_br_ptr;
+    //int_t* xsup = LUstruct->Glu_persist->xsup;
 
     int_t * perm_u = INT_T_ALLOC(nsupers);
     int_t * mylsize = INT_T_ALLOC(nsupers);
@@ -1223,36 +1268,3 @@ void getSCUweight(int_t nsupers, treeList_t* treeList,
 
 } /* getSCUweight */
 
-int_t Trs2_InitUblock_info(int_t klst, int_t nb,
-			    Ublock_info_t *Ublock_info,
-			    int_t *usub,
-			    Glu_persist_t *Glu_persist, SuperLUStat_t *stat )
-{
-    int_t *xsup = Glu_persist->xsup;
-    int_t iukp, rukp;
-    iukp = BR_HEADER;
-    rukp = 0;
-
-    for (int_t b = 0; b < nb; ++b)
-    {
-        int_t gb = usub[iukp];
-        int_t nsupc = SuperSize (gb);
-
-        Ublock_info[b].iukp = iukp;
-        Ublock_info[b].rukp = rukp;
-        // Ublock_info[b].nsupc = nsupc;
-
-        iukp += UB_DESCRIPTOR;
-	/* Sherry: can remove this loop for rukp
-	   rukp += usub[iukp-1];
-	 */
-       for (int_t j = 0; j < nsupc; ++j)
-        {
-            int_t segsize = klst - usub[iukp++];
-            rukp += segsize;
-            stat->ops[FACT] += segsize * (segsize + 1);
-        }
-    }
-    return 0;
-}
-
diff --git a/SRC/util.c b/SRC/util.c
index 0ddbbc57..6acca0b7 100644
--- a/SRC/util.c
+++ b/SRC/util.c
@@ -1092,63 +1092,6 @@ int_t partitionM( int_t* a, int_t l, int_t r, int_t lda, int_t dir, int_t dims)
 } /* partitionM */
 
 
-/*
- * The following are from 3D code p3dcomm.c
- */
-
-int AllocGlu_3d(int_t n, int_t nsupers, LUstruct_t * LUstruct)
-{
-    /*broadcasting Glu_persist*/
-    LUstruct->Glu_persist->xsup  = intMalloc_dist(nsupers+1); //INT_T_ALLOC(nsupers+1);
-    LUstruct->Glu_persist->supno = intMalloc_dist(n); //INT_T_ALLOC(n);
-    return 0;
-}
-
-// Sherry added
-int DeAllocGlu_3d(LUstruct_t * LUstruct)
-{
-    SUPERLU_FREE(LUstruct->Glu_persist->xsup);
-    SUPERLU_FREE(LUstruct->Glu_persist->supno);
-    return 0;
-}
-
-int DeAllocLlu_3d(int_t n, LUstruct_t * LUstruct, gridinfo3d_t* grid3d)
-{
-    int i, nbc, nbr, nsupers;
-    LocalLU_t *Llu = LUstruct->Llu;
-
-    nsupers = (LUstruct->Glu_persist)->supno[n-1] + 1;
-
-    nbc = CEILING(nsupers, grid3d->npcol);
-    for (i = 0; i < nbc; ++i) 
-	if ( Llu->Lrowind_bc_ptr[i] ) {
-	    SUPERLU_FREE (Llu->Lrowind_bc_ptr[i]);
-#ifdef GPU_ACC
-	    checkCuda(cudaFreeHost(Llu->Lnzval_bc_ptr[i]));
-#else
-	    SUPERLU_FREE (Llu->Lnzval_bc_ptr[i]);
-#endif
-	}
-    SUPERLU_FREE (Llu->Lrowind_bc_ptr);
-    SUPERLU_FREE (Llu->Lnzval_bc_ptr);
-
-    nbr = CEILING(nsupers, grid3d->nprow);
-    for (i = 0; i < nbr; ++i)
-	if ( Llu->Ufstnz_br_ptr[i] ) {
-	    SUPERLU_FREE (Llu->Ufstnz_br_ptr[i]);
-	    SUPERLU_FREE (Llu->Unzval_br_ptr[i]);
-	}
-    SUPERLU_FREE (Llu->Ufstnz_br_ptr);
-    SUPERLU_FREE (Llu->Unzval_br_ptr);
-
-    /* The following can be freed after factorization. */
-    SUPERLU_FREE(Llu->ToRecv);
-    SUPERLU_FREE(Llu->ToSendD);
-    for (i = 0; i < nbc; ++i) SUPERLU_FREE(Llu->ToSendR[i]);
-    SUPERLU_FREE(Llu->ToSendR);
-    return 0;
-} /* DeAllocLlu_3d */
-
 int_t** getTreePerm( int_t* myTreeIdxs, int_t* myZeroTrIdxs,
                      int_t* nodeCount, int_t** nodeList,
                      int_t* perm_c_supno, int_t* iperm_c_supno,
diff --git a/SRC/zcommunication_aux.c b/SRC/zcommunication_aux.c
index 2410057a..55f9c435 100644
--- a/SRC/zcommunication_aux.c
+++ b/SRC/zcommunication_aux.c
@@ -71,7 +71,8 @@ int_t zBcast_LPanel
  int* msgcnt,  int **ToSendR, int_t *xsup , SCT_t* SCT,
  int tag_ub)
 {
-    unsigned long long t1 = _rdtsc();
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
     int_t Pc = grid->npcol;
     int_t lk = LBj (k, grid);
     superlu_scope_t *scp = &grid->rscp;  /* The scope of process row. */
@@ -100,7 +101,8 @@ int_t zBcast_LPanel
 
         }
     }
-    SCT->Bcast_UPanel_tl += (double) ( _rdtsc() - t1);
+    //SCT->Bcast_UPanel_tl += (double) ( _rdtsc() - t1);
+    SCT->Bcast_UPanel_tl +=  SuperLU_timer_() - t1;
     return 0;
 }
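The _rdtsc() to SuperLU_timer_() substitution recurs through the rest of this
patch: the TSC intrinsic is x86-only (the 3D solver also targets POWER9
systems such as Summit) and counts cycles, while the SCT_t fields accumulate
seconds. Every hunk follows the same pattern; a minimal sketch, assuming
SuperLU_timer_() returns wall-clock seconds as a double:

    double t1 = SuperLU_timer_();                   /* start stamp, seconds */
    /* ... timed communication ... */
    SCT->Bcast_UPanel_tl += SuperLU_timer_() - t1;  /* elapsed seconds */
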
 
@@ -155,7 +157,8 @@ int_t zBcast_UPanel(int_t k, int_t k0, int_t* usub,
 		   int* msgcnt, int *ToSendD, SCT_t* SCT, int tag_ub)
 
 {
-    unsigned long long t1 = _rdtsc();
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
     int_t iam = grid->iam;
     int_t lk = LBi (k, grid);
     int_t Pr = grid->nprow;
@@ -187,14 +190,15 @@ int_t zBcast_UPanel(int_t k, int_t k0, int_t* usub,
             }       /* if pi ... */
         }           /* for pi ... */
     }
-    SCT->Bcast_UPanel_tl += (double) ( _rdtsc() - t1);
+    //SCT->Bcast_UPanel_tl += (double) ( _rdtsc() - t1);
+    SCT->Bcast_UPanel_tl += SuperLU_timer_() - t1;
     return 0;
 }
 
 int_t zIrecv_LPanel
 /*it places Irecv call for L panel*/
 (int_t k, int_t k0,  int_t* Lsub_buf, doublecomplex* Lval_buf,
- gridinfo_t *grid, MPI_Request *recv_req, LocalLU_t *Llu, int tag_ub )
+ gridinfo_t *grid, MPI_Request *recv_req, zLocalLU_t *Llu, int tag_ub )
 {
     int_t kcol = PCOL (k, grid);
 
@@ -211,7 +215,7 @@ int_t zIrecv_LPanel
 
 int_t zIrecv_UPanel
 /*it places Irecv calls to receive U panels*/
-(int_t k, int_t k0, int_t* Usub_buf, doublecomplex* Uval_buf, LocalLU_t *Llu,
+(int_t k, int_t k0, int_t* Usub_buf, doublecomplex* Uval_buf, zLocalLU_t *Llu,
  gridinfo_t* grid, MPI_Request *recv_req_u, int tag_ub )
 {
     int_t krow = PROW (k, grid);
@@ -229,13 +233,15 @@ int_t zIrecv_UPanel
 int_t zWait_URecv
 ( MPI_Request *recv_req, int* msgcnt, SCT_t* SCT)
 {
-    unsigned long long t1 = _rdtsc();
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
     MPI_Status status;
     MPI_Wait (&recv_req[0], &status);
     MPI_Get_count (&status, mpi_int_t, &msgcnt[2]);
     MPI_Wait (&recv_req[1], &status);
     MPI_Get_count (&status, SuperLU_MPI_DOUBLE_COMPLEX, &msgcnt[3]);
-    SCT->Wait_URecv_tl += (double) ( _rdtsc() - t1);
+    //SCT->Wait_URecv_tl += (double) ( _rdtsc() - t1);
+    SCT->Wait_URecv_tl +=  SuperLU_timer_() - t1;
     return 0;
 }
 
@@ -243,7 +249,8 @@ int_t zWait_LRecv
 /*waits till L blocks have been received*/
 (  MPI_Request* recv_req, int* msgcnt, int* msgcntsU, gridinfo_t * grid, SCT_t* SCT)
 {
-    unsigned long long t1 = _rdtsc();
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
     MPI_Status status;
     
     if (recv_req[0] != MPI_REQUEST_NULL)
@@ -267,7 +274,8 @@ int_t zWait_LRecv
     {
         msgcnt[1] = msgcntsU[1];
     }
-    SCT->Wait_LRecv_tl += (double) ( _rdtsc() - t1);
+    //SCT->Wait_LRecv_tl += (double) ( _rdtsc() - t1);
+    SCT->Wait_LRecv_tl +=  SuperLU_timer_() - t1;
     return 0;
 }
 
@@ -303,7 +311,8 @@ int_t zRecv_UDiagBlock(int_t k0, doublecomplex *ublk_ptr, /*pointer for the diag
                       int_t src,
                       gridinfo_t * grid, SCT_t* SCT, int tag_ub)
 {
-    unsigned long long t1 = _rdtsc();
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
     MPI_Status status;
     MPI_Comm comm = (grid->cscp).comm;
     /* tag = ((k0<<2)+2) % tag_ub;        */
@@ -311,13 +320,14 @@ int_t zRecv_UDiagBlock(int_t k0, doublecomplex *ublk_ptr, /*pointer for the diag
 
     MPI_Recv (ublk_ptr, size, SuperLU_MPI_DOUBLE_COMPLEX, src,
               SLU_MPI_TAG (4, k0), comm, &status);
-    SCT->Recv_UDiagBlock_tl += (double) ( _rdtsc() - t1);
+    //SCT->Recv_UDiagBlock_tl += (double) ( _rdtsc() - t1);
+    SCT->Recv_UDiagBlock_tl +=  SuperLU_timer_() - t1;
     return 0;
 }
 
 
 int_t zPackLBlock(int_t k, doublecomplex* Dest, Glu_persist_t *Glu_persist,
-                  gridinfo_t *grid, LocalLU_t *Llu)
+                  gridinfo_t *grid, zLocalLU_t *Llu)
 /*Copies src matrix into dest matrix*/
 {
     /* Initialization. */
@@ -374,7 +384,8 @@ int_t zIRecv_UDiagBlock(int_t k0, doublecomplex *ublk_ptr, /*pointer for the dia
                        MPI_Request *U_diag_blk_recv_req,
                        gridinfo_t * grid, SCT_t* SCT, int tag_ub)
 {
-    unsigned long long t1 = _rdtsc();
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
     MPI_Comm comm = (grid->cscp).comm;
     /* tag = ((k0<<2)+2) % tag_ub;        */
     /* tag = (4*(nsupers+k0)+2) % tag_ub; */
@@ -385,7 +396,8 @@ int_t zIRecv_UDiagBlock(int_t k0, doublecomplex *ublk_ptr, /*pointer for the dia
     {
         printf("Error in IRecv_UDiagBlock count\n");
     }
-    SCT->Recv_UDiagBlock_tl += (double) ( _rdtsc() - t1);
+    //SCT->Recv_UDiagBlock_tl += (double) ( _rdtsc() - t1);
+    SCT->Recv_UDiagBlock_tl += SuperLU_timer_() - t1;
     return 0;
 }
 
@@ -395,7 +407,8 @@ int_t zIRecv_LDiagBlock(int_t k0, doublecomplex *L_blk_ptr, /*pointer for the di
                        MPI_Request *L_diag_blk_recv_req,
                        gridinfo_t * grid, SCT_t* SCT, int tag_ub)
 {
-    unsigned long long t1 = _rdtsc();
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
     MPI_Comm comm = (grid->rscp).comm;
     /* tag = ((k0<<2)+2) % tag_ub;        */
     /* tag = (4*(nsupers+k0)+2) % tag_ub; */
@@ -407,7 +420,8 @@ int_t zIRecv_LDiagBlock(int_t k0, doublecomplex *L_blk_ptr, /*pointer for the di
     {
         printf("Error in IRecv_lDiagBlock count\n");
     }
-    SCT->Recv_UDiagBlock_tl += (double) ( _rdtsc() - t1);
+    //SCT->Recv_UDiagBlock_tl += (double) ( _rdtsc() - t1);
+    SCT->Recv_UDiagBlock_tl += SuperLU_timer_() - t1;
     return 0;
 }
 
@@ -448,9 +462,9 @@ int_t zIBcast_LDiagBlock(int_t k, doublecomplex *lblk_ptr, /*pointer for the dia
 int_t zUDiagBlockRecvWait( int_t k,  int_t* IrecvPlcd_D, int_t* factored_L,
                            MPI_Request * U_diag_blk_recv_req,
                            gridinfo_t *grid,
-                           LUstruct_t *LUstruct, SCT_t *SCT)
+                           zLUstruct_t *LUstruct, SCT_t *SCT)
 {
-    LocalLU_t *Llu = LUstruct->Llu;
+    zLocalLU_t *Llu = LUstruct->Llu;
 
     int_t iam = grid->iam;
 
diff --git a/SRC/zreadMM.c b/SRC/zreadMM.c
index 526641aa..3d0048cf 100644
--- a/SRC/zreadMM.c
+++ b/SRC/zreadMM.c
@@ -60,7 +60,7 @@ zreadMM_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
 
      if (sscanf(line, "%s %s %s %s %s", banner, mtx, crd, arith, sym) != 5) {
        printf("Invalid header (first line does not contain 5 tokens)\n");
-       exit;
+       exit(-1);
      }
 
      if(strcmp(banner,"%%matrixmarket")) {
diff --git a/SRC/zscatter3d.c b/SRC/zscatter3d.c
index eeece774..62e8603a 100644
--- a/SRC/zscatter3d.c
+++ b/SRC/zscatter3d.c
@@ -172,7 +172,7 @@ zblock_gemm_scatter( int_t lb, int_t j,
     
     // printf("SCU update: (%d, %d)\n",ib,jb );
 #ifdef SCATTER_PROFILE
-    unsigned long long ttx = __rdtsc();
+    double ttx = SuperLU_timer_();
 #endif
     /*Now scattering the block*/
     if (ib < jb)
@@ -205,7 +205,7 @@ zblock_gemm_scatter( int_t lb, int_t j,
     // stat->ops[FACT] += 2*temp_nbrow*ncols*ldu + temp_nbrow*ncols;
 
 #ifdef SCATTER_PROFILE
-    double t_s = (double) __rdtsc() - ttx;
+    double t_s = SuperLU_timer_() - ttx;
     Host_TheadScatterMOP[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
     += 3.0 * (double ) temp_nbrow * (double ) ncols;
     Host_TheadScatterTimer[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
@@ -293,7 +293,7 @@ zblock_gemm_scatter_lock( int_t lb, int_t j,
         }
 
 #ifdef SCATTER_PROFILE
-    unsigned long long ttx = __rdtsc();
+    double ttx = SuperLU_timer_();
 #endif
     /*Now scattering the block*/
     if (ib < jb)
@@ -326,7 +326,7 @@ zblock_gemm_scatter_lock( int_t lb, int_t j,
         omp_unset_lock(lock);
 
 #ifdef SCATTER_PROFILE
-    double t_s = (double) __rdtsc() - ttx;
+    double t_s = SuperLU_timer_() - ttx;
     Host_TheadScatterMOP[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
     += 3.0 * (double ) temp_nbrow * (double ) ncols;
     Host_TheadScatterTimer[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
@@ -363,13 +363,13 @@ int_t zblock_gemm_scatterTopLeft( int_t lb, /* block number in L */
                                  doublecomplex* bigV, int_t knsupc,  int_t klst,
 				 int_t* lsub, int_t * usub, int_t ldt,
 				 int* indirect, int* indirect2, HyP_t* HyP,
-                                 LUstruct_t *LUstruct,
+                                 zLUstruct_t *LUstruct,
                                  gridinfo_t* grid,
                                  SCT_t*SCT, SuperLUStat_t *stat
                                )
 {
     Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    LocalLU_t *Llu = LUstruct->Llu;
+    zLocalLU_t *Llu = LUstruct->Llu;
     int_t* xsup = Glu_persist->xsup;
     int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
     int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
@@ -382,7 +382,7 @@ int_t zblock_gemm_scatterTopLeft( int_t lb, /* block number in L */
 #endif
 
 //    printf("Thread's ID %lld \n", thread_id);
-    unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
     zblock_gemm_scatter( lb, j, HyP->Ublock_info, HyP->lookAhead_info,
 			HyP->lookAhead_L_buff, HyP->Lnbrow,
                         HyP->bigU_host, HyP->ldu,
@@ -394,7 +394,7 @@ int_t zblock_gemm_scatterTopLeft( int_t lb, /* block number in L */
                         , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
 #endif
                       );
-    unsigned long long t2 = _rdtsc();
+    double t2 = SuperLU_timer_();
     SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1);
     return 0;
 } /* zgemm_scatterTopLeft */
@@ -403,13 +403,13 @@ int_t zblock_gemm_scatterTopRight( int_t lb,  int_t j,
                                   doublecomplex* bigV, int_t knsupc,  int_t klst, int_t* lsub,
                                   int_t * usub, int_t ldt,  int* indirect, int* indirect2,
                                   HyP_t* HyP,
-                                  LUstruct_t *LUstruct,
+                                  zLUstruct_t *LUstruct,
                                   gridinfo_t* grid,
                                   SCT_t*SCT, SuperLUStat_t *stat
                                 )
 {
     Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    LocalLU_t *Llu = LUstruct->Llu;
+    zLocalLU_t *Llu = LUstruct->Llu;
     int_t* xsup = Glu_persist->xsup;
     int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
     int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
@@ -420,7 +420,7 @@ int_t zblock_gemm_scatterTopRight( int_t lb,  int_t j,
 #else
     volatile  int_t thread_id = 0;
 #endif
-    unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
     zblock_gemm_scatter( lb, j, HyP->Ublock_info_Phi, HyP->lookAhead_info, HyP->lookAhead_L_buff, HyP->Lnbrow,
                         HyP->bigU_Phi, HyP->ldu_Phi,
                         bigV, knsupc,  klst, lsub,  usub, ldt, thread_id, indirect, indirect2,
@@ -429,7 +429,7 @@ int_t zblock_gemm_scatterTopRight( int_t lb,  int_t j,
                         , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
 #endif
                       );
-    unsigned long long t2 = _rdtsc();
+    double t2 = SuperLU_timer_();
     SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1);
     return 0;
 } /* zblock_gemm_scatterTopRight */
@@ -438,13 +438,13 @@ int_t zblock_gemm_scatterBottomLeft( int_t lb,  int_t j,
                                     doublecomplex* bigV, int_t knsupc,  int_t klst, int_t* lsub,
                                     int_t * usub, int_t ldt,  int* indirect, int* indirect2,
                                     HyP_t* HyP,
-                                    LUstruct_t *LUstruct,
+                                    zLUstruct_t *LUstruct,
                                     gridinfo_t* grid,
                                     SCT_t*SCT, SuperLUStat_t *stat
                                   )
 {
     Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    LocalLU_t *Llu = LUstruct->Llu;
+    zLocalLU_t *Llu = LUstruct->Llu;
     int_t* xsup = Glu_persist->xsup;
     int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
     int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
@@ -456,7 +456,7 @@ int_t zblock_gemm_scatterBottomLeft( int_t lb,  int_t j,
     volatile int_t thread_id = 0;
 #endif
     //printf("Thread's ID %lld \n", thread_id);
-    unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
     zblock_gemm_scatter( lb, j, HyP->Ublock_info, HyP->Remain_info, HyP->Remain_L_buff, HyP->Rnbrow,
                         HyP->bigU_host, HyP->ldu,
                         bigV, knsupc,  klst, lsub,  usub, ldt, thread_id, indirect, indirect2,
@@ -465,7 +465,7 @@ int_t zblock_gemm_scatterBottomLeft( int_t lb,  int_t j,
                         , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
 #endif
                       );
-    unsigned long long t2 = _rdtsc();
+    double t2 = SuperLU_timer_();
     SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1);
     return 0;
 
@@ -475,13 +475,13 @@ int_t zblock_gemm_scatterBottomRight( int_t lb,  int_t j,
                                      doublecomplex* bigV, int_t knsupc,  int_t klst, int_t* lsub,
                                      int_t * usub, int_t ldt,  int* indirect, int* indirect2,
                                      HyP_t* HyP,
-                                     LUstruct_t *LUstruct,
+                                     zLUstruct_t *LUstruct,
                                      gridinfo_t* grid,
                                      SCT_t*SCT, SuperLUStat_t *stat
                                    )
 {
     Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    LocalLU_t *Llu = LUstruct->Llu;
+    zLocalLU_t *Llu = LUstruct->Llu;
     int_t* xsup = Glu_persist->xsup;
     int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
     int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
@@ -493,7 +493,7 @@ int_t zblock_gemm_scatterBottomRight( int_t lb,  int_t j,
     volatile  int_t thread_id = 0;
 #endif
    // printf("Thread's ID %lld \n", thread_id);
-    unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
     zblock_gemm_scatter( lb, j, HyP->Ublock_info_Phi, HyP->Remain_info, HyP->Remain_L_buff, HyP->Rnbrow,
                         HyP->bigU_Phi, HyP->ldu_Phi,
                         bigV, knsupc,  klst, lsub,  usub, ldt, thread_id, indirect, indirect2,
@@ -503,7 +503,7 @@ int_t zblock_gemm_scatterBottomRight( int_t lb,  int_t j,
 #endif
                       );
 
-    unsigned long long t2 = _rdtsc();
+    double t2 = SuperLU_timer_();
     SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1);
     return 0;
 
diff --git a/SRC/ztreeFactorization.c b/SRC/ztreeFactorization.c
index 0cda8e56..f0e65f70 100644
--- a/SRC/ztreeFactorization.c
+++ b/SRC/ztreeFactorization.c
@@ -23,9 +23,9 @@ at the top-level directory.
 #include "trfCommWrapper.h"
 #endif
 
-int_t zLluBufInit(zLUValSubBuf_t* LUvsb, LUstruct_t *LUstruct)
+int_t zLluBufInit(zLUValSubBuf_t* LUvsb, zLUstruct_t *LUstruct)
 {
-    LocalLU_t *Llu = LUstruct->Llu;
+    zLocalLU_t *Llu = LUstruct->Llu;
     LUvsb->Lsub_buf = intMalloc_dist(Llu->bufmax[0]); //INT_T_ALLOC(Llu->bufmax[0]);
     LUvsb->Lval_buf = doublecomplexMalloc_dist(Llu->bufmax[1]); //DOUBLE_ALLOC(Llu->bufmax[1]);
     LUvsb->Usub_buf = intMalloc_dist(Llu->bufmax[2]); //INT_T_ALLOC(Llu->bufmax[2]);
@@ -70,7 +70,7 @@ int zfreeDiagFactBufsArr(int_t mxLeafNode, diagFactBufs_t** dFBufs)
     return 0;
 }
 
-zLUValSubBuf_t** zLluBufInitArr(int_t numLA, LUstruct_t *LUstruct)
+zLUValSubBuf_t** zLluBufInitArr(int_t numLA, zLUstruct_t *LUstruct)
 {
     zLUValSubBuf_t** LUvsbs = (zLUValSubBuf_t**) SUPERLU_MALLOC(numLA * sizeof(zLUValSubBuf_t*));
     for (int_t i = 0; i < numLA; ++i)
@@ -100,7 +100,7 @@ int zLluBufFreeArr(int_t numLA, zLUValSubBuf_t **LUvsbs)
 
 int_t zinitScuBufs(int_t ldt, int_t num_threads, int_t nsupers,
                   scuBufs_t* scuBufs,
-                  LUstruct_t* LUstruct,
+                  zLUstruct_t* LUstruct,
                   gridinfo_t * grid)
 {
     scuBufs->bigV = zgetBigV(ldt, num_threads);
@@ -137,13 +137,13 @@ int_t zdenseTreeFactor(
     superlu_dist_options_t *options,
     int_t * gIperm_c_supno,
     int_t ldt,
-    LUstruct_t *LUstruct, gridinfo3d_t * grid3d, SuperLUStat_t *stat,
+    zLUstruct_t *LUstruct, gridinfo3d_t * grid3d, SuperLUStat_t *stat,
     double thresh,  SCT_t *SCT, int tag_ub,
     int *info
 )
 {
     gridinfo_t* grid = &(grid3d->grid2d);
-    LocalLU_t *Llu = LUstruct->Llu;
+    zLocalLU_t *Llu = LUstruct->Llu;
 
     /*main loop over all the super nodes*/
     for (int_t k0 = 0; k0 < nnodes   ; ++k0)
@@ -300,7 +300,7 @@ int_t zsparseTreeFactor_ASYNC(
     int_t * gIperm_c_supno,
     int_t ldt,
     HyP_t* HyP,
-    LUstruct_t *LUstruct, gridinfo3d_t * grid3d, SuperLUStat_t *stat,
+    zLUstruct_t *LUstruct, gridinfo3d_t * grid3d, SuperLUStat_t *stat,
     double thresh,  SCT_t *SCT, int tag_ub,
     int *info
 )
diff --git a/SRC/ztrfAux.c b/SRC/ztrfAux.c
index 921ccc89..787d48a7 100644
--- a/SRC/ztrfAux.c
+++ b/SRC/ztrfAux.c
@@ -26,7 +26,7 @@ at the top-level directory.
 #endif
 
 /* Initialize the data structure to assist HALO offload of the Schur complement. */
-void zInit_HyP(HyP_t* HyP, LocalLU_t *Llu, int_t mcb, int_t mrb )
+void zInit_HyP(HyP_t* HyP, zLocalLU_t *Llu, int_t mcb, int_t mrb )
 {
     HyP->last_offload = -1;
 #if 0
@@ -69,7 +69,7 @@ void zInit_HyP(HyP_t* HyP, LocalLU_t *Llu, int_t mcb, int_t mrb )
 
 /*init3DLUstruct with forest interface */
 void zinit3DLUstructForest( int_t* myTreeIdxs, int_t* myZeroTrIdxs,
-                           sForest_t**  sForests, LUstruct_t* LUstruct,
+                           sForest_t**  sForests, zLUstruct_t* LUstruct,
                            gridinfo3d_t* grid3d)
 {
     int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
@@ -113,11 +113,11 @@ int_t zSchurComplementSetup(
     int_t* Usub_buf,
     doublecomplex *Uval_buf,
     gridinfo_t *grid,
-    LUstruct_t *LUstruct
+    zLUstruct_t *LUstruct
 )
 {
     Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    LocalLU_t *Llu = LUstruct->Llu;
+    zLocalLU_t *Llu = LUstruct->Llu;
     int_t* xsup = Glu_persist->xsup;
 
     int* ToRecv = Llu->ToRecv;
@@ -337,7 +337,7 @@ int_t zSchurComplementSetupGPU(
     int_t* iperm_c_supno, int_t*perm_c_supno,
     gEtreeInfo_t*   gEtreeInfo, factNodelists_t* fNlists,
     scuBufs_t* scuBufs, zLUValSubBuf_t* LUvsb,
-    gridinfo_t *grid, LUstruct_t *LUstruct,
+    gridinfo_t *grid, zLUstruct_t *LUstruct,
     HyP_t* HyP)
 {
     int_t * Lsub_buf  = LUvsb->Lsub_buf;
@@ -352,7 +352,7 @@ int_t zSchurComplementSetupGPU(
     doublecomplex* bigU = scuBufs->bigU;
 
     Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    LocalLU_t *Llu = LUstruct->Llu;
+    zLocalLU_t *Llu = LUstruct->Llu;
     int_t* xsup = Glu_persist->xsup;
 
     int* ToRecv = Llu->ToRecv;
@@ -496,8 +496,7 @@ doublecomplex* zgetBigV(int_t ldt, int_t num_threads)
     return bigV;
 }
 
-doublecomplex* zgetBigU(int_t nsupers, gridinfo_t *grid,
-                    LUstruct_t *LUstruct)
+doublecomplex* zgetBigU(int_t nsupers, gridinfo_t *grid, zLUstruct_t *LUstruct)
 {
     int_t Pr = grid->nprow;
     int_t Pc = grid->npcol;
@@ -541,9 +540,10 @@ doublecomplex* zgetBigU(int_t nsupers, gridinfo_t *grid,
     return bigU;
 } /* zgetBigU */
 
+
 trf3Dpartition_t* zinitTrf3Dpartition(int_t nsupers,
 				      superlu_dist_options_t *options,
-				      LUstruct_t *LUstruct, gridinfo3d_t * grid3d
+				      zLUstruct_t *LUstruct, gridinfo3d_t * grid3d
 				      )
 {
     gridinfo_t* grid = &(grid3d->grid2d);
@@ -552,7 +552,11 @@ trf3Dpartition_t* zinitTrf3Dpartition(int_t nsupers,
     int iam = grid3d->iam;
     CHECK_MALLOC (iam, "Enter zinitTrf3Dpartition()");
 #endif
-    int_t* perm_c_supno = getPerm_c_supno(nsupers, options, LUstruct, grid);
+    int_t* perm_c_supno = getPerm_c_supno(nsupers, options,
+					  LUstruct->etree,
+					  LUstruct->Glu_persist,
+					  LUstruct->Llu->Lrowind_bc_ptr,
+					  LUstruct->Llu->Ufstnz_br_ptr, grid);
     int_t* iperm_c_supno = getFactIperm(perm_c_supno, nsupers);
 
     // calculating tree factorization
@@ -560,7 +564,9 @@ trf3Dpartition_t* zinitTrf3Dpartition(int_t nsupers,
     treeList_t* treeList = setree2list(nsupers, setree );
 
     /*update treelist with weight and depth*/
-    getSCUweight(nsupers, treeList, LUstruct, grid3d);
+    getSCUweight(nsupers, treeList, LUstruct->Glu_persist->xsup,
+		 LUstruct->Llu->Lrowind_bc_ptr, LUstruct->Llu->Ufstnz_br_ptr,
+		 grid3d);
 
     calcTreeWeight(nsupers, setree, treeList, LUstruct->Glu_persist->xsup);
 
diff --git a/SRC/ztrfCommWrapper.c b/SRC/ztrfCommWrapper.c
index 98efab03..b37fbb4a 100644
--- a/SRC/ztrfCommWrapper.c
+++ b/SRC/ztrfCommWrapper.c
@@ -42,7 +42,7 @@ int_t zDiagFactIBCast(int_t k,  int_t k0,      // supernode to be factored
                      gridinfo_t *grid,
                      superlu_dist_options_t *options,
                      double thresh,
-                     LUstruct_t *LUstruct,
+                     zLUstruct_t *LUstruct,
                      SuperLUStat_t *stat, int *info,
                      SCT_t *SCT,
 		     int tag_ub
@@ -50,7 +50,7 @@ int_t zDiagFactIBCast(int_t k,  int_t k0,      // supernode to be factored
 {
     // unpacking variables
     Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    LocalLU_t *Llu = LUstruct->Llu;
+    zLocalLU_t *Llu = LUstruct->Llu;
     int_t* xsup = Glu_persist->xsup;
 
     int_t iam = grid->iam;
@@ -122,11 +122,11 @@ int_t zDiagFactIBCast(int_t k,  int_t k0,      // supernode to be factored
 int_t zLPanelTrSolve( int_t k,   int_t* factored_L,
 		      doublecomplex* BlockUFactor,
 		      gridinfo_t *grid,
-		      LUstruct_t *LUstruct)
+		      zLUstruct_t *LUstruct)
 {
     doublecomplex alpha = {1.0, 0.0};
     Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    LocalLU_t *Llu = LUstruct->Llu;
+    zLocalLU_t *Llu = LUstruct->Llu;
     int_t* xsup = Glu_persist->xsup;
 
     int_t iam = grid->iam;
@@ -255,7 +255,7 @@ int_t zLPanelUpdate( int_t k,  int_t* IrecvPlcd_D, int_t* factored_L,
                     MPI_Request * U_diag_blk_recv_req,
                     doublecomplex* BlockUFactor,
                     gridinfo_t *grid,
-                    LUstruct_t *LUstruct, SCT_t *SCT)
+                    zLUstruct_t *LUstruct, SCT_t *SCT)
 {
 
     zUDiagBlockRecvWait( k,  IrecvPlcd_D, factored_L,
@@ -274,11 +274,11 @@ int_t zUPanelTrSolve( int_t k,
                      int_t ldt,
                      Ublock_info_t* Ublock_info,
                      gridinfo_t *grid,
-                     LUstruct_t *LUstruct,
+                     zLUstruct_t *LUstruct,
                      SuperLUStat_t *stat, SCT_t *SCT)
 {
     Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    LocalLU_t *Llu = LUstruct->Llu;
+    zLocalLU_t *Llu = LUstruct->Llu;
     int_t* xsup = Glu_persist->xsup;
     int_t iam = grid->iam;
     int_t myrow = MYROW (iam, grid);
@@ -386,7 +386,7 @@ int_t zUPanelUpdate( int_t k,  int_t* factored_U,
                     int_t ldt,
                     Ublock_info_t* Ublock_info,
                     gridinfo_t *grid,
-                    LUstruct_t *LUstruct,
+                    zLUstruct_t *LUstruct,
                     SuperLUStat_t *stat, SCT_t *SCT)
 {
 
@@ -407,13 +407,13 @@ int_t zIBcastRecvLPanel(
     doublecomplex* Lval_buf,
     int_t * factored,
     gridinfo_t *grid,
-    LUstruct_t *LUstruct,
+    zLUstruct_t *LUstruct,
     SCT_t *SCT,
     int tag_ub
 )
 {
     Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    LocalLU_t *Llu = LUstruct->Llu;
+    zLocalLU_t *Llu = LUstruct->Llu;
     int_t* xsup = Glu_persist->xsup;
     int** ToSendR = Llu->ToSendR;
     int* ToRecv = Llu->ToRecv;
@@ -467,10 +467,10 @@ int_t zIBcastRecvUPanel(int_t k, int_t k0, int* msgcnt,
     			     MPI_Request *send_requ,
     			     MPI_Request *recv_requ,
     			     int_t* Usub_buf, doublecomplex* Uval_buf,
-    			     gridinfo_t *grid, LUstruct_t *LUstruct,
+    			     gridinfo_t *grid, zLUstruct_t *LUstruct,
     			     SCT_t *SCT, int tag_ub)
 {
-    LocalLU_t *Llu = LUstruct->Llu;
+    zLocalLU_t *Llu = LUstruct->Llu;
 
     int* ToSendD = Llu->ToSendD;
     int* ToRecv = Llu->ToRecv;
@@ -515,9 +515,9 @@ int_t zIBcastRecvUPanel(int_t k, int_t k0, int* msgcnt,
 
 int_t zWaitL( int_t k, int* msgcnt, int* msgcntU,
               MPI_Request *send_req, MPI_Request *recv_req,
-    	      gridinfo_t *grid, LUstruct_t *LUstruct, SCT_t *SCT)
+    	      gridinfo_t *grid, zLUstruct_t *LUstruct, SCT_t *SCT)
 {
-    LocalLU_t *Llu = LUstruct->Llu;
+    zLocalLU_t *Llu = LUstruct->Llu;
     int** ToSendR = Llu->ToSendR;
     int* ToRecv = Llu->ToRecv;
     int_t iam = grid->iam;
@@ -543,9 +543,9 @@ int_t zWaitL( int_t k, int* msgcnt, int* msgcntU,
 
 int_t zWaitU( int_t k, int* msgcnt,
               MPI_Request *send_requ, MPI_Request *recv_requ,
-    	      gridinfo_t *grid, LUstruct_t *LUstruct, SCT_t *SCT)
+    	      gridinfo_t *grid, zLUstruct_t *LUstruct, SCT_t *SCT)
 {
-    LocalLU_t *Llu = LUstruct->Llu;
+    zLocalLU_t *Llu = LUstruct->Llu;
 
     int* ToRecv = Llu->ToRecv;
     int* ToSendD = Llu->ToSendD;
diff --git a/SRC/zutil_dist.c b/SRC/zutil_dist.c
index 6ed2eb97..58b62d12 100644
--- a/SRC/zutil_dist.c
+++ b/SRC/zutil_dist.c
@@ -424,6 +424,63 @@ void zScalePermstructFree(zScalePermstruct_t *ScalePermstruct)
     }
 }
 
+/*
+ * The following are from 3D code p3dcomm.c
+ */
+
+int zAllocGlu_3d(int_t n, int_t nsupers, zLUstruct_t * LUstruct)
+{
+    /*broadcasting Glu_persist*/
+    LUstruct->Glu_persist->xsup  = intMalloc_dist(nsupers+1); //INT_T_ALLOC(nsupers+1);
+    LUstruct->Glu_persist->supno = intMalloc_dist(n); //INT_T_ALLOC(n);
+    return 0;
+}
+
+// Sherry added
+int zDeAllocGlu_3d(zLUstruct_t * LUstruct)
+{
+    SUPERLU_FREE(LUstruct->Glu_persist->xsup);
+    SUPERLU_FREE(LUstruct->Glu_persist->supno);
+    return 0;
+}
+
+int zDeAllocLlu_3d(int_t n, zLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+{
+    int i, nbc, nbr, nsupers;
+    zLocalLU_t *Llu = LUstruct->Llu;
+
+    nsupers = (LUstruct->Glu_persist)->supno[n-1] + 1;
+
+    nbc = CEILING(nsupers, grid3d->npcol);
+    for (i = 0; i < nbc; ++i) 
+	if ( Llu->Lrowind_bc_ptr[i] ) {
+	    SUPERLU_FREE (Llu->Lrowind_bc_ptr[i]);
+#ifdef GPU_ACC
+	    checkCuda(cudaFreeHost(Llu->Lnzval_bc_ptr[i]));
+#else
+	    SUPERLU_FREE (Llu->Lnzval_bc_ptr[i]);
+#endif
+	}
+    SUPERLU_FREE (Llu->Lrowind_bc_ptr);
+    SUPERLU_FREE (Llu->Lnzval_bc_ptr);
+
+    nbr = CEILING(nsupers, grid3d->nprow);
+    for (i = 0; i < nbr; ++i)
+	if ( Llu->Ufstnz_br_ptr[i] ) {
+	    SUPERLU_FREE (Llu->Ufstnz_br_ptr[i]);
+	    SUPERLU_FREE (Llu->Unzval_br_ptr[i]);
+	}
+    SUPERLU_FREE (Llu->Ufstnz_br_ptr);
+    SUPERLU_FREE (Llu->Unzval_br_ptr);
+
+    /* The following can be freed after factorization. */
+    SUPERLU_FREE(Llu->ToRecv);
+    SUPERLU_FREE(Llu->ToSendD);
+    for (i = 0; i < nbc; ++i) SUPERLU_FREE(Llu->ToSendR[i]);
+    SUPERLU_FREE(Llu->ToSendR);
+    return 0;
+} /* zDeAllocLlu_3d */
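+
+/* The GPU_ACC branch above matters: when built with CUDA, the Lnzval_bc_ptr
+   buffers are page-locked host memory, and pinned memory must be released
+   with cudaFreeHost().  A minimal sketch of the pairing, assuming the
+   allocation side used cudaMallocHost():
+
+       #include <cuda_runtime.h>
+       double *buf;
+       cudaMallocHost((void **)&buf, 1024 * sizeof(double));  // pinned alloc
+       ... asynchronous H2D/D2H copies may use buf ...
+       cudaFreeHost(buf);   // must match cudaMallocHost, not free()
+*/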
+
 
 /**** Other utilities ****/
 void

From 2cb3a94c44be111414ab218c764b22107f918a0d Mon Sep 17 00:00:00 2001
From: 7ps 
Date: Tue, 22 Dec 2020 00:51:15 -0500
Subject: [PATCH 053/147] pddrive compiles with the new CMake flags, but
 pddrive3d does not yet

---
 CMakeLists.txt             |    3 +-
 SRC/CMakeLists.txt         |    4 +-
 SRC/acc_aux.c              |  665 +++++++++++++
 SRC/acc_aux.h              |   48 +
 SRC/lustruct_gpu.h         |  258 +++++
 SRC/pdgstrf3d.c            |   49 +-
 SRC/superlu_gpu.cu         | 1856 ++++++++++++++++++++++++++++++++++++
 SRC/treeFactorizationGPU.c |  583 +++++++++++
 8 files changed, 3445 insertions(+), 21 deletions(-)
 create mode 100644 SRC/acc_aux.c
 create mode 100644 SRC/acc_aux.h
 create mode 100644 SRC/lustruct_gpu.h
 create mode 100644 SRC/superlu_gpu.cu
 create mode 100644 SRC/treeFactorizationGPU.c

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a21b1b5a..683d7e94 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -19,7 +19,8 @@ list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
 # Set up options
 option(enable_doc       "Build doxygen documentation" OFF)
 option(enable_double    "Enable double precision library" ON)
-option(enable_complex16 "Enable complex16 precision library" ON)
+#option(enable_complex16 "Enable complex16 precision library" ON)
+option(enable_complex16 "Enable complex16 precision library" OFF)
 option(enable_tests  "Build tests" ON)
 option(enable_examples  "Build examples" ON)
 option(TPL_ENABLE_BLASLIB  "Build the CBLAS library" ${enable_blaslib_DEFAULT})
diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt
index 3d059586..f9b2d974 100644
--- a/SRC/CMakeLists.txt
+++ b/SRC/CMakeLists.txt
@@ -52,6 +52,7 @@ set(sources
   communication_aux.c
   treeFactorization.c 
   sec_structs.c
+  cublas_utils.c
 )
 if (MSVC)
   list(APPEND sources wingetopt.c)
@@ -113,6 +114,7 @@ if(enable_double)
     dcommunication_aux.c 
     dtrfCommWrapper.c
     dsuperlu_blas.c
+    superlu_gpu.cu 
   )
 endif()
 
@@ -178,7 +180,7 @@ if (BUILD_SHARED_LIBS AND BUILD_STATIC_LIBS)
 endif()
 
 set(superlu_dist_libs ${MPI_C_LIBRARIES} ${BLAS_LIB} ${LAPACK_LIB}
-    ${PARMETIS_LIB} ${COMBBLAS_LIB})
+    ${PARMETIS_LIB} ${COMBBLAS_LIB} ${CUDA_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES})
 
 if (NOT MSVC)
   list(APPEND superlu_dist_libs m)
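Note that adding superlu_gpu.cu to the C source list only builds if the CUDA
toolchain is enabled elsewhere in the CMake tree; a hedged sketch of what that
typically requires (CMake >= 3.17, not shown in this patch — the
${CUDA_CUBLAS_LIBRARIES} variable used above comes from the older
find_package(CUDA) module):

    enable_language(CUDA)
    find_package(CUDAToolkit REQUIRED)  # provides CUDA::cudart, CUDA::cublas
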
diff --git a/SRC/acc_aux.c b/SRC/acc_aux.c
new file mode 100644
index 00000000..c8a1c634
--- /dev/null
+++ b/SRC/acc_aux.c
@@ -0,0 +1,665 @@
+#include "acc_aux.h"
+
+#define CLAMP(x, low, high)  (((x) > (high)) ? (high) : (((x) < (low)) ? (low) : (x)))
+
+int
+get_thread_per_process ()
+{
+    char *ttemp;
+    ttemp = getenv ("THREAD_PER_PROCESS");
+
+    if (ttemp)
+        return atoi (ttemp);
+    else
+        return 1;
+}
+
+
+static inline double
+load_imb (double *A, int nthreads)
+{
+    int i;
+    double _mx, _avg;
+    _mx = 0;
+    _avg = 0;
+    for (i = 0; i < nthreads; i++)
+    {
+        _mx = (((_mx) > (A[i])) ? (_mx) : (A[i]));
+        _avg += A[i];
+    }
+    _avg = _avg / (double) nthreads;
+    return _mx - _avg;
+}
+
+
+
+int_t
+get_max_buffer_size ()
+{
+    char *ttemp;
+    ttemp = getenv ("MAX_BUFFER_SIZE");
+    if (ttemp)
+        return atoi (ttemp);
+    else
+        return 5000000;
+}
+
+
+// #define ACC_ASYNC_COST 3.79e-3
+
+#define  MAX_DIM 12800
+#define  MAX_IN_DIM 256
+#define  LOG_2_MAX_IN_DIM 8
+#define  LOG_2_MAX_DIM 7
+
+
+double get_acc_async_cost()
+{
+    char *ttemp;
+    ttemp = getenv ("ACC_ASYNC_COST");
+    if (ttemp)
+        return (double) atof (ttemp);
+    else
+        return 4e-3;
+}
+
+// #define  CPU_BANDWIDTH 35.0
+
+double cpu_bandwidth;
+int communication_overlap;
+double acc_async_cost;
+
+
+int_t fixed_partition;
+double frac;
+
+/* Sherry: these lookup tables are not needed on Titan or Summit */
+double CpuDgemmLookUp[8][8][9];
+double PhiDgemmLookUp[8][8][9];
+double PhiBWLookUp[8];       // not used?
+double MicPciBandwidth[18];  // not used?
+double MicScatterBW[24][24];
+
+#ifdef OFFLOAD_PROFILE
+double MicScatterTLI[MAX_BLOCK_SIZE / STEPPING][MAX_BLOCK_SIZE / STEPPING];
+double host_scu_flop_rate[CBLOCK / CSTEPPING][CBLOCK / CSTEPPING][CBLOCK / CSTEPPING];
+#endif
+
+static inline unsigned int next_power_2(unsigned int v)
+{
+    v--;
+    v |= v >> 1;
+    v |= v >> 2;
+    v |= v >> 4;
+    v |= v >> 8;
+    v |= v >> 16;
+    v++;
+    return v;
+}
+
+
+static inline unsigned int previous_power_2(unsigned int v)
+{
+    v--;
+    v |= v >> 1;
+    v |= v >> 2;
+    v |= v >> 4;
+    v |= v >> 8;
+    v |= v >> 16;
+    v++;
+    return v / 2;
+}
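+
+/* Both helpers use the classic OR-cascade: after the shifts, every bit below
+   the most-significant set bit of v-1 is set, so the final v++ lands exactly
+   on the next power of two.  Worked example:
+     next_power_2(37):     36 = 0b100100 -> cascade -> 0b111111 = 63 -> 64
+     previous_power_2(37): same cascade, then 64 / 2 = 32                  */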
+
+
+#include <math.h>
+// static inline uint32_t my_log2(const uint32_t x) {
+//     uint32_t y;
+//     asm ( "\tbsr %1, %0\n"
+//           : "=r"(y)
+//           : "r" (x)
+//         );
+//     return y;
+// }
+
+static inline uint32_t my_log2(const uint32_t x)
+{
+    return (uint32_t) log2((double) x);
+}
+
+static inline unsigned int nearst_2_100(unsigned int v)
+{
+    v = (v + 99) / 100;
+    v--;
+    v |= v >> 1;
+    v |= v >> 2;
+    v |= v >> 4;
+    v |= v >> 8;
+    v |= v >> 16;
+    v++;
+    return SUPERLU_MIN(my_log2(v), LOG_2_MAX_DIM) ;
+}
+
+static inline unsigned int nearst_k(unsigned int v)
+{
+
+    v--;
+    v |= v >> 1;
+    v |= v >> 2;
+    v |= v >> 4;
+    v |= v >> 8;
+    v |= v >> 16;
+    v++;
+    return SUPERLU_MIN(my_log2(v), LOG_2_MAX_IN_DIM) ;
+}
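+
+/* These helpers map a dimension onto a small lookup-table index:
+   nearst_2_100 rounds m up to a multiple of 100, then to a power of two,
+   then takes log2 (clamped to LOG_2_MAX_DIM); nearst_k skips the /100 step
+   and clamps to LOG_2_MAX_IN_DIM.  Worked examples:
+     nearst_2_100(1500): ceil(1500/100) = 15 -> next power of two 16 -> 4
+     nearst_k(48):       next power of two 64 -> log2 = 6                 */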
+
+
+
+double estimate_acc_time(int m, int n , int k)
+{
+    double flop_rate = PhiDgemmLookUp[nearst_2_100(m)][nearst_2_100(m)][nearst_k(k)];
+    double gemm_time = 2e-9 * (double) m * (double)n * (double)k / (flop_rate);
+
+    double mop_rate = PhiBWLookUp[nearst_2_100( sqrt((double) m * (double)n))];
+
+    double scatter_time = 24e-9 * (double) m * (double)n / mop_rate ;
+    // printf("gemm_time %.2e scatter_time %.2e, flop_rate %lf mop_rate %lf ",gemm_time, scatter_time, flop_rate,mop_rate);
+    if (gemm_time < 0)
+    {
+        /* code */
+        printf(" m %d n %d k %d \n", m, n, k);
+        exit(0);
+    }
+
+    double eta = 1;       /* allow more computation on the ACC; only applicable to MPI cases */
+    // if(m>1024 && k>32) eta=1.5;
+    if (communication_overlap)
+    {
+        if (m > 2048 && k > 32) eta = 5.0;
+        if (m > 4096 && k > 32) eta = 6.0;
+        if (m > 4096 && k > 64) eta = 8.0;
+    }
+
+
+    return (gemm_time + scatter_time) / eta;
+}
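+
+/* The model above charges 2*m*n*k flops for the GEMM (hence the 2e-9 factor
+   against a GFLOP/s rate) and roughly 24 bytes of traffic per C(i,j) entry
+   for the scatter.  Sanity check: m = n = k = 1024 at 100 GFLOP/s gives
+   gemm_time = 2e-9 * 1024^3 / 100 ~= 0.021 s.                             */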
+
+
+
+double estimate_acc_gemm_time(int m, int n , int k)
+{
+    double flop_rate = PhiDgemmLookUp[nearst_2_100(m)][nearst_2_100(m)][nearst_k(k)];
+    double gemm_time = 2e-9 * (double) m * (double)n * (double)k / (flop_rate);
+
+
+    double eta = 1;       /* allow more computation on the ACC; only applicable to MPI cases */
+    // if(m>1024 && k>32) eta=1.5;
+    if (communication_overlap)
+    {
+        if (m > 2048 && k > 32) eta = 5.0;
+        if (m > 4096 && k > 32) eta = 6.0;
+        if (m > 4096 && k > 64) eta = 8.0;
+    }
+
+
+    return (gemm_time) / eta;
+}
+
+
+double estimate_acc_scatter_time(int m, int n , int k)
+{
+
+    double mop_rate = PhiBWLookUp[nearst_2_100( sqrt((double) m * (double)n))];
+
+    double scatter_time = 24e-9 * (double) m * (double)n / mop_rate ;
+
+    double eta = 1;       /* allow more computation on the ACC; only applicable to MPI cases */
+    // if(m>1024 && k>32) eta=1.5;
+    if (communication_overlap)
+    {
+        if (m > 2048 && k > 32) eta = 5.0;
+        if (m > 4096 && k > 32) eta = 6.0;
+        if (m > 4096 && k > 64) eta = 8.0;
+    }
+
+
+    return (scatter_time) / eta;
+}
+
+double estimate_cpu_time(int m, int n , int k)
+{
+    if (m == 0 || n == 0 || k == 0)
+    {
+        return 0;
+    }
+    double flop_rate = CpuDgemmLookUp[nearst_2_100(m)][nearst_2_100(m)][nearst_k(k)];
+    double gemm_time = 2e-9 * (double) m * (double)n * (double)k / (flop_rate);
+    double scatter_time = 24e-9 * (double) m * (double)n / cpu_bandwidth ;
+    return gemm_time + scatter_time;
+}
+
+
+double acc_data_send_time(size_t sz)
+{
+    if (my_log2((sz + 999) / 1000) > 17 ) return 1e-9 * (double) sz / MicPciBandwidth[17];
+    return 1e-9 * (double) sz / MicPciBandwidth[my_log2((sz + 999) / 1000)];
+}
+
+
+void LookUpTableInit(int my_rank)
+{
+    char *ttemp;
+    char buffer[1024];
+    char *line;
+    FILE *fp;
+
+    ttemp = getenv("CPU_BANDWIDTH");
+    if (ttemp)
+    {
+        cpu_bandwidth = atof(ttemp);
+#ifdef GPU_DEBUG
+        if (!my_rank) printf("Bandwidth of CP is %lf \n", cpu_bandwidth );
+#endif
+    }
+    else
+    {
+        printf("Please set CPU_BANDWIDTH : bbye\n");
+        exit(0);
+
+    }
+
+    // ttemp = getenv("SLU_ACC_COMM_OVERLAP");
+    // if (ttemp)
+    // {
+    //     communication_overlap = atoi(ttemp);
+    //     if (!my_rank && communication_overlap ) printf("Using communication computation overlap version\n");
+    // }
+    // else
+    // {
+    //     printf("Please set SLU_ACC_COMM_OVERLAP : bbye\n");
+    //     exit(0);
+    // }
+
+
+    /*Reading CPU performance table */
+    ttemp = getenv("CPU_DGEMM_PERF_TABLE");
+    if (ttemp)
+    {
+        fp = fopen(ttemp, "r");
+        double max_flop_rate = 0;
+        if (!fp)
+        {
+            if (!my_rank) printf("can not open %s: exiting  \n", ttemp);
+            exit(0);
+        }
+
+        while ((line = fgets(buffer, sizeof(buffer), fp)) != NULL)
+        {
+
+            int m, n, k;
+            double flop_rate;
+            sscanf(line, "%d, %d, %d, %lf ", &m, &n, &k, &flop_rate);
+            CpuDgemmLookUp[nearst_2_100(m)][nearst_2_100(m)][nearst_k(k)] = flop_rate;
+            max_flop_rate = SUPERLU_MAX(flop_rate, max_flop_rate);
+        }
+        fclose(fp);
+        // printf("CPU: MAX FLOP Rate %lf GFLOP/Sec\n",max_flop_rate );
+    }
+    else
+    {
+        printf("Assign CPU performance table \n");
+        exit(0);
+    }
+
+    ttemp = getenv("ACC_DGEMM_PERF_TABLE");
+    if (ttemp)
+    {
+        fp = fopen(ttemp, "r");
+        if (!fp)
+        {
+            printf("can not open %s: exiting  \n", ttemp);
+            exit(0);
+        }
+        double max_flop_rate = 0;
+        while ((line = fgets(buffer, sizeof(buffer), fp)) != NULL)
+        {
+
+            int m, n, k;
+            double flop_rate;
+            sscanf(line, "%d, %d, %d, %lf ", &m, &n, &k, &flop_rate);
+            PhiDgemmLookUp[nearst_2_100(m)][nearst_2_100(m)][nearst_k(k)] = flop_rate;
+            max_flop_rate = SUPERLU_MAX(flop_rate, max_flop_rate);
+        }
+        fclose(fp);
+#ifdef GPU_DEBUG
+        if (!my_rank) printf("ACC: MAX FLOP Rate %lf GFLOP/Sec\n", max_flop_rate );
+#endif
+    }
+    else
+    {
+        printf("Assign ACC DGEMM performance table \n");
+        exit(0);
+    }
+
+    ttemp = getenv("ACC_SCATTER_PERF_TABLE");
+    if (ttemp)
+    {
+        fp = fopen(ttemp, "r");
+        double max_mop_rate = 0;
+        while ((line = fgets(buffer, sizeof(buffer), fp)) != NULL)
+        {
+
+            int m;
+            double mop_rate, sd;
+            sscanf(line, "%d, %lf, %lf", &m, &mop_rate, &sd);
+            PhiBWLookUp[nearst_2_100(m)] = mop_rate;
+            max_mop_rate = SUPERLU_MAX(mop_rate, max_mop_rate);
+        }
+        fclose(fp);
+#ifdef GPU_DEBUG
+        if (!my_rank) printf("ACC: MAX MOP Rate %lf GFLOP/Sec\n", max_mop_rate );
+#endif
+    }
+    else
+    {
+        printf("Assign ACC DGEMM performance table \n");
+        exit(0);
+    }
+
+
+    ttemp = getenv("ACC_PCI_BW_TABLE");
+    if (ttemp)
+    {
+        fp = fopen(ttemp, "r");
+
+        while ((line = fgets(buffer, sizeof(buffer), fp)) != NULL)
+        {
+
+            int m;
+            double bw;
+            sscanf(line, "%d,%lf", &m, &bw);
+            MicPciBandwidth[my_log2(m / 1000)] = bw;
+
+        }
+        fclose(fp);
+    }
+    else
+    {
+        printf("Assign ACC_PCI_BW_TABLE \n");
+        exit(0);
+    }
+
+    ttemp = getenv("ACC_SCATTER_BW_TABLE");
+    if (ttemp)
+    {
+        fp = fopen(ttemp, "r");
+
+
+        for (int i = 0; i < 24; ++i)
+        {
+            for (int j = 0; j < 24; ++j)
+            {
+                fscanf(fp, "%lf", &MicScatterBW[i][j]);
+                // printf("%d  %d %lf\n",i,j,MicScatterBW[i][j] );
+            }
+        }
+
+
+        fclose(fp);
+    }
+    else
+    {
+        printf("Assign ACC_SCATTER_BW_TABLE \n");
+        exit(0);
+    }
+
+#ifdef OFFLOAD_PROFILE
+    ttemp = getenv("ACC_SCATTER_TLI_TABLE");
+    if (ttemp)
+    {
+        fp = fopen(ttemp, "r");
+        double max_mop_rate = 0;
+
+        for (int i = 0; i < MAX_BLOCK_SIZE / STEPPING; ++i)
+        {
+            for (int j = 0; j < MAX_BLOCK_SIZE / STEPPING; ++j)
+            {
+
+                fscanf(fp, "%lf", &MicScatterTLI[i][j]);
+                if (MicScatterTLI[i][j] > 2)
+                {
+                    MicScatterTLI[i][j] = 2;
+                }
+                // printf("%lf \n", MicScatterTLI[i][j]);
+            }
+        }
+
+
+        fclose(fp);
+    }
+    else
+    {
+        printf("ACC_SCATTER_TLI_TABLE \n");
+        exit(0);
+    }
+
+    ttemp = getenv("HOST_SCU_PERF_TABLE");
+    if (ttemp)
+    {
+        fp = fopen(ttemp, "r");
+        for (int_t k = 0; k < CBLOCK / CSTEPPING; ++k)
+        {
+
+            for (int_t i = 0; i < CBLOCK / CSTEPPING; ++i)
+            {
+                for (int_t j = 0; j < CBLOCK / CSTEPPING; ++j)
+                {
+                    fscanf(fp, "%lf", &host_scu_flop_rate[k][i][j]);
+
+                }
+
+            }
+        }
+        fclose(fp);
+    }
+    else
+    {
+        printf("please assign HOST_SCU_PERF_TABLE \n");
+        exit(0);
+    }
+
+#endif
+
+    ttemp = getenv("FIXED_PARTITION");
+    if (ttemp)
+    {
+        fixed_partition = atoi(ttemp);
+        if (fixed_partition)
+        {
+            printf("Using fixed workload partition \n");
+            ttemp = getenv("CPU_ACC_WORK_PARTITION");
+            if (ttemp)
+            {
+                frac  = atof (ttemp);
+            }
+            else
+            {
+                frac = 1;
+            }
+
+        }
+
+    }
+    else
+    {
+        fixed_partition = 0;
+    }
+
+} /* end LookUpTableInit */
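+
+/* Usage sketch: every knob above comes from the environment, so a driver
+   must export them before the first factorization.  The paths below are
+   hypothetical; the files hold the "m, n, k, rate" CSV lines parsed above. */
+#if 0   /* illustrative only */
+#include <stdlib.h>
+static void example_init(void)
+{
+    setenv("CPU_BANDWIDTH",          "35.0", 1);
+    setenv("CPU_DGEMM_PERF_TABLE",   "/tmp/cpu_dgemm.csv", 1);
+    setenv("ACC_DGEMM_PERF_TABLE",   "/tmp/acc_dgemm.csv", 1);
+    setenv("ACC_SCATTER_PERF_TABLE", "/tmp/acc_scatter.csv", 1);
+    setenv("ACC_PCI_BW_TABLE",       "/tmp/acc_pci.csv", 1);
+    setenv("ACC_SCATTER_BW_TABLE",   "/tmp/acc_scatter_bw.txt", 1);
+    LookUpTableInit(0 /* my_rank */);
+}
+#endif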
+
+
+double l_count[24];    /* bucketed row counts of the L blocks */
+double u_count[24];    /* bucketed column counts of the U blocks */
+
+double
+estimate_acc_scatter_time_strat1(Ublock_info_t* Ublock_info, int_t nub, Remain_info_t* Lblock_info, int_t nlb )
+{
+    for (int i = 0; i < 24; ++i)
+    {
+        l_count[i] = 0;
+        u_count[i] = 0;
+    }
+
+    int_t cum_nrows = 0;
+    int_t cum_ncols = 0;
+    for (int i = 0; i < nub; ++i)
+    {
+        int_t ncols = Ublock_info[i].ncols;
+        int_t ind = SUPERLU_MAX(CEILING(ncols, 8) - 1, 0);
+        u_count[ind] += (double) ncols;
+        cum_ncols += ncols;
+
+    }
+
+
+    for (int i = 0; i < nlb; ++i)
+    {
+        int_t nrows = Lblock_info[i].nrows;
+        int_t ind = SUPERLU_MAX(CEILING(nrows, 8) - 1, 0);
+        l_count[ind] += (double) nrows;
+
+        cum_nrows += nrows;
+
+    }
+
+    double ttime = 0;
+    for (int i = 0; i < 24; ++i)
+    {
+        for (int j = 0; j < 24; ++j)
+        {
+            /* code */
+            ttime += 8 * 3e-9 * l_count[i] * u_count[j] / MicScatterBW[i][j];
+        }
+    }
+
+    // ttime *= (MicScatterTLI[CLAMP( CEILING(cum_nrows, STEPPING) ,0 , MAX_BLOCK_SIZE/STEPPING -1 )][CLAMP( CEILING(cum_ncols, STEPPING) ,0 , MAX_BLOCK_SIZE/STEPPING -1 )]) ;
+    ttime *= SUPERLU_MIN(nub * nlb / 240 , 1);
+    return ttime;
+}
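+
+/* The 24 buckets cover block heights/widths in steps of 8 up to 192,
+   matching the MicScatterBW table: ind = CEILING(n,8) - 1, floored at 0.
+   E.g. a 20-row L block lands in bucket 2 and a 33-column U block in
+   bucket 4, so that pair contributes 24e-9 * 20 * 33 / MicScatterBW[2][4]
+   seconds to the estimate.                                              */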
+
+#ifdef OFFLOAD_PROFILE
+/* The following is a good strategy: it gives good predictions, but for some
+   reason I do not see an overall performance improvement, so it is omitted. */
+double
+estimate_cpu_sc_time_strat1(int_t ldu, Ublock_info_t* Ublock_info, int_t nub, Remain_info_t* Lblock_info, int_t nlb )
+{
+    int_t ind_k = SUPERLU_MAX(CEILING(ldu, 8) - 1, 0);
+    for (int i = 0; i < 24; ++i)
+    {
+        l_count[i] = 0;
+        u_count[i] = 0;
+    }
+
+    int_t cum_nrows = 0;
+    int_t cum_ncols = 0;
+    for (int i = 0; i < nub; ++i)
+    {
+        int_t ncols = Ublock_info[i].ncols;
+        int_t ind = SUPERLU_MAX(CEILING(ncols, 8) - 1, 0);
+        u_count[ind] += (double) ncols;
+        cum_ncols += ncols;
+
+    }
+
+
+    for (int i = 0; i < nlb; ++i)
+    {
+        int_t nrows = Lblock_info[i].nrows;
+        int_t ind = SUPERLU_MAX(CEILING(nrows, 8) - 1, 0);
+        l_count[ind] += (double) nrows;
+        cum_nrows += nrows;
+    }
+
+    double ttime = 0;
+    for (int i = 0; i < 24; ++i)
+    {
+        for (int j = 0; j < 24; ++j)
+        {
+            /* flop rate is in gf/sec */
+            ttime += 2e-9 * ldu * l_count[i] * u_count[j] / host_scu_flop_rate[ind_k][i][j];
+        }
+    }
+
+    return ttime;
+}
+
+#endif
+
+/* Sherry: this routine is not called */
+int_t fixed_cpu_acc_partition (Ublock_info_t *Ublock_info_Phi, int_t num_u_blks_Phi , int_t Rnbrow, int_t ldu_Phi)
+{
+    int_t acc_cols, cpu_cols;
+    int_t total_cols = Ublock_info_Phi[num_u_blks_Phi - 1].full_u_cols;
+    if (frac == 0)
+    {
+        return num_u_blks_Phi;
+    }
+    else if (frac == 1)
+    {
+        return 0;
+    }
+
+    for (int_t j = num_u_blks_Phi - 1; j > -1; --j)      // ###
+    {
+
+        acc_cols = (j == 0) ? 0 : Ublock_info_Phi[j - 1].full_u_cols ;
+        cpu_cols = total_cols - acc_cols;
+
+        if (estimate_acc_time (Rnbrow, acc_cols, ldu_Phi) < acc_async_cost)
+        {
+            break;
+        }
+        if (cpu_cols > frac * total_cols )
+        {
+            return j;
+        }
+
+    }
+
+    return 0;
+}
+
+
+/* Partition the "num_u_blks_Phi" portion into GPU and CPU part,
+   based on the estimated computational cost on CPU and GPU.
+   This was useful for the old Intel Phi architecture, but for the
+   new architecture, such as Titan and Summit, we can give everything
+   to GPU.
+*/
+int_t tuned_partition(int_t num_u_blks_Phi, Ublock_info_t *Ublock_info_Phi, Remain_info_t* Remain_info, int_t RemainBlk, double cpu_time_0, int_t Rnbrow, int_t ldu_Phi )
+{
+    double cpu_time, acc_time;
+    int_t acc_cols, cpu_cols;
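+    /* Scan the U blocks from right to left: blocks [0, j) go to the
+       accelerator, blocks [j, num_u_blks_Phi) stay on the CPU.  Return the
+       first j at which the CPU side becomes the bottleneck. */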
+
+    for (int_t j = num_u_blks_Phi - 1; j > -1; --j)      // ###
+    {
+
+        acc_cols = (j == 0) ? 0 : Ublock_info_Phi[j - 1].full_u_cols ;
+        cpu_cols = Ublock_info_Phi[num_u_blks_Phi - 1].full_u_cols - acc_cols;
+        acc_time = estimate_acc_scatter_time_strat1(&Ublock_info_Phi[0], j,
+                   Remain_info,  RemainBlk ) + estimate_acc_gemm_time(Rnbrow, acc_cols, ldu_Phi);
+
+        cpu_time = estimate_cpu_time(Rnbrow, cpu_cols, ldu_Phi) + cpu_time_0;
+
+
+        // SCT.Predicted_host_sch_time[k0] = cpu_time_without_offload;
+        if (cpu_time > acc_time + acc_async_cost)
+        {
+            return j;
+
+        }
+    }
+
+    return 0; /*default value is zero */
+}
+
+
diff --git a/SRC/acc_aux.h b/SRC/acc_aux.h
new file mode 100644
index 00000000..b63ca119
--- /dev/null
+++ b/SRC/acc_aux.h
@@ -0,0 +1,48 @@
+#pragma once
+
+// #include "pdgstrf.h"
+
+
+typedef struct mdwin_t
+{
+	double cpu_bandwidth;
+	int communication_overlap;
+	double acc_async_cost;
+
+
+	int_t fixed_partition;
+	double frac;
+
+	double CpuDgemmLookUp[8][8][9];
+	double PhiDgemmLookUp[8][8][9];
+	double PhiBWLookUp[8];
+	double MicPciBandwidth[18];
+	double MicScatterBW[24][24];
+
+#ifdef OFFLOAD_PROFILE
+	double MicScatterTLI[MAX_BLOCK_SIZE / STEPPING][MAX_BLOCK_SIZE / STEPPING];
+	double host_scu_flop_rate[CBLOCK / CSTEPPING][CBLOCK / CSTEPPING][CBLOCK / CSTEPPING];
+#endif
+} mdwin_t;
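+/* Note: mdwin_t appears to gather into one struct the tuning state that
+   acc_aux.c keeps in file-scope globals (MicScatterBW, MicScatterTLI,
+   host_scu_flop_rate, fixed_partition, frac, ...); none of the routines
+   declared below take it as an argument yet. */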
+
+int_t
+get_max_buffer_size ();
+
+double get_acc_async_cost();
+
+double estimate_acc_time(int m, int n , int k);
+
+double estimate_acc_gemm_time(int m, int n , int k);
+
+double estimate_acc_scatter_time(int m, int n , int k);
+
+double estimate_cpu_time(int m, int n , int k);
+
+double acc_data_send_time(size_t sz);
+
+void LookUpTableInit(int my_rank);
+
+
+int_t fixed_cpu_acc_partition (Ublock_info_t *Ublock_info_Phi, int_t num_u_blks_Phi , int_t Rnbrow, int_t ldu_Phi);
+int_t tuned_partition(int_t num_u_blks_Phi, Ublock_info_t *Ublock_info_Phi, Remain_info_t* Remain_info,
+                      int_t RemainBlk, double cpu_time_0, int_t Rnbrow, int_t ldu_Phi );
\ No newline at end of file
diff --git a/SRC/lustruct_gpu.h b/SRC/lustruct_gpu.h
new file mode 100644
index 00000000..8e3a7443
--- /dev/null
+++ b/SRC/lustruct_gpu.h
@@ -0,0 +1,258 @@
+// Descriptions and declarations for structures and routines used on the GPU.
+#pragma once // Include this header only once per translation unit
+
+#define DEBUG
+// #ifdef DEBUG
+// #include 
+// #endif
+// #include 
+// #include "mkl.h"
+
+// #define USE_VENDOR_BLAS
+
+
+/* The bracketed header names were lost from the patch; cuda_runtime.h and
+   cublas_v2.h are educated guesses based on the CUDA/cuBLAS types used below. */
+#include <cuda_runtime.h>
+#include <cublas_v2.h>
+#include "superlu_ddefs.h"
+// #include "sec_structs.h"
+// #include "supernodal_etree.h"
+
+
+#define SLU_TARGET_GPU 0
+
+#define MAX_BLOCK_SIZE 10000
+
+
+static
+void check(cudaError_t result, char const *const func, const char *const file, int const line)
+{
+    if (result)
+    {
+        fprintf(stderr, "CUDA error at file %s: line %d code=(%s) \"%s\" \n",
+                file, line, cudaGetErrorString(result), func);
+
+        // Make sure we call CUDA Device Reset before exiting
+        exit(EXIT_FAILURE);
+    }
+}
+
+#define checkCudaErrors(val)           check ( (val), #val, __FILE__, __LINE__ )
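+/* Usage sketch: wrap any CUDA runtime call, e.g.
+       checkCudaErrors(cudaMalloc(&ptr, bytes));
+   On error, the failing call, file, and line are printed and the
+   process exits. */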
+
+typedef struct SCUbuf_gpu_
+{
+    /* Information for the various buffers */
+    double *bigV;
+    double *bigU;
+    double *bigU_host;      /*pinned location*/
+    int_t *indirect;        /*for indirect address calculations*/
+    int_t *indirect2;       /*for indirect address calculations*/
+
+    double *Remain_L_buff;  /* on GPU */
+    double *Remain_L_buff_host; /* Sherry: this memory is page-locked, why need another copy on GPU ? */
+    
+    int_t *lsub;
+    int_t *usub;
+
+    int_t *lsub_buf, *usub_buf;
+    
+    Ublock_info_t *Ublock_info; /* on GPU */
+    Remain_info_t *Remain_info;
+    Ublock_info_t *Ublock_info_host;
+    Remain_info_t *Remain_info_host;
+
+    int_t* usub_IndirectJ3;  /* on GPU */
+    int_t* usub_IndirectJ3_host;
+
+} SCUbuf_gpu_t;
+
+
+#define MAX_NCUDA_STREAMS 32
+
+typedef struct LUstruct_gpu_ 
+{
+
+    int_t   *LrowindVec;      /* A single vector */
+    int_t   *LrowindPtr;      /* A single vector */
+
+    double  *LnzvalVec;       /* A single vector */
+    int_t   *LnzvalPtr;       /* A single vector */
+    int_t   *LnzvalPtr_host;  /* A single vector */
+
+    int_t   *UrowindVec;            /* A single vector */
+    int_t   *UrowindPtr;            /* A single vector */
+    int_t   *UrowindPtr_host;       /* A single vector */
+    int_t   *UnzvalPtr_host;
+
+    double  *UnzvalVec;       /* A single vector */
+    int_t   *UnzvalPtr;      /* A single vector */
+    /*gpu pointers for easy block accesses */
+    local_l_blk_info_t *local_l_blk_infoVec;
+    int_t *local_l_blk_infoPtr;
+    int_t *jib_lookupVec;
+    int_t *jib_lookupPtr;
+    local_u_blk_info_t *local_u_blk_infoVec;
+
+    int_t *local_u_blk_infoPtr;
+    int_t *ijb_lookupVec;
+    int_t *ijb_lookupPtr;
+
+    // GPU buffers for performing Schur Complement Update on GPU
+    SCUbuf_gpu_t scubufs[MAX_NCUDA_STREAMS];
+    double *acc_L_buff, *acc_U_buff;
+
+    /* Information for the various buffers */
+    int_t buffer_size;      /* number of doubles in each bigV buffer */
+    int_t nsupers;          /* number of supernodes */
+    int_t *xsup;
+    gridinfo_t *grid;
+
+
+    double ScatterMOPCounter;
+    double ScatterMOPTimer;
+    double GemmFLOPCounter;
+    double GemmFLOPTimer;
+
+    double cPCIeH2D;
+    double cPCIeD2H;
+    double tHost_PCIeH2D;
+    double tHost_PCIeD2H;
+
+
+    /*cuda events to measure DGEMM and SCATTER timing */
+    int_t *isOffloaded;       /* isOffloaded[k0] == 1 if iteration k0 was offloaded to the GPU */
+    cudaEvent_t *GemmStart, *GemmEnd, *ScatterEnd;  /*cuda events to store gemm and scatter's begin and end*/
+    cudaEvent_t *ePCIeH2D;
+    cudaEvent_t *ePCIeD2H_Start;
+    cudaEvent_t *ePCIeD2H_End;
+
+    int_t *xsup_host;
+    int_t* perm_c_supno;
+    int_t first_l_block_gpu, first_u_block_gpu;
+} LUstruct_gpu;
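+/* Note: sluGPU_t below holds two handles to this structure: A_gpu, a
+   host-resident copy whose pointer members reference device memory, and
+   dA_gpu, apparently the device-resident copy that is passed to kernels
+   (see the Scatter_GPU_kernel launch in superlu_gpu.cu). */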
+
+
+typedef struct sluGPU_t_
+{
+
+    int_t gpuId;        // if there are multiple GPUs
+    LUstruct_gpu *A_gpu, *dA_gpu;
+    cudaStream_t funCallStreams[MAX_NCUDA_STREAMS], CopyStream;
+    cublasHandle_t cublasHandles[MAX_NCUDA_STREAMS];
+    int_t lastOffloadStream[MAX_NCUDA_STREAMS];
+    int_t nCudaStreams;
+    int_t* isNodeInMyGrid;
+    double acc_async_cost;
+
+} sluGPU_t;
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+int_t initD2Hreduce(
+    int_t next_k,
+    d2Hreduce_t* d2Hred,
+    int_t last_flag,
+    // int_t *perm_c_supno,
+    HyP_t* HyP,
+    sluGPU_t *sluGPU,
+    gridinfo_t *grid,
+    LUstruct_t *LUstruct
+    ,SCT_t* SCT
+);
+
+int_t reduceGPUlu(
+    
+    int_t last_flag,
+    d2Hreduce_t* d2Hred,
+    sluGPU_t *sluGPU,
+    SCT_t *SCT,
+    gridinfo_t *grid,
+    LUstruct_t *LUstruct
+);
+
+int_t waitGPUscu(int_t streamId, sluGPU_t *sluGPU, SCT_t *SCT);
+int_t sendLUpanelGPU2HOST( int_t k0, d2Hreduce_t* d2Hred, sluGPU_t *sluGPU);
+int_t sendSCUdataHost2GPU(
+    int_t streamId,
+    int_t* lsub,
+    int_t* usub,
+    double* bigU,
+    int_t bigu_send_size,
+    int_t Remain_lbuf_send_size,
+    sluGPU_t *sluGPU,
+    HyP_t* HyP
+);
+
+int_t initSluGPU3D_t(
+    
+    sluGPU_t *sluGPU,
+    LUstruct_t *LUstruct,
+    gridinfo3d_t * grid3d,
+    int_t* perm_c_supno,
+    int_t n,
+    int_t buffer_size,
+    int_t bigu_size,
+    int_t ldt
+);
+int_t SchurCompUpdate_GPU(
+    int_t streamId,
+    int_t jj_cpu, int_t nub, int_t klst, int_t knsupc,
+    int_t Rnbrow, int_t RemainBlk,
+    int_t Remain_lbuf_send_size,
+    int_t bigu_send_size, int_t ldu,
+    int_t mcb,
+    int_t buffer_size, int_t lsub_len, int_t usub_len,
+    int_t ldt, int_t k0,
+    sluGPU_t *sluGPU, gridinfo_t *grid
+);
+
+
+
+void CopyLUToGPU3D (
+    int_t* isNodeInMyGrid,
+    LocalLU_t *A_host,
+    sluGPU_t *sluGPU,
+    Glu_persist_t *Glu_persist, int_t n,
+    gridinfo3d_t *grid3d,
+    int_t buffer_size,
+    int_t bigu_size,
+    int_t ldt
+);
+
+int_t reduceAllAncestors3d_GPU(
+    int_t ilvl, int_t* myNodeCount,
+    int_t** treePerm,
+    dLUValSubBuf_t*LUvsb,
+    LUstruct_t* LUstruct,
+    gridinfo3d_t* grid3d,
+    sluGPU_t *sluGPU,
+    d2Hreduce_t* d2Hred,
+    factStat_t *factStat,
+    HyP_t* HyP,
+    SCT_t* SCT );
+
+
+void syncAllfunCallStreams(sluGPU_t* sluGPU, SCT_t* SCT);
+int_t free_LUstruct_gpu (LUstruct_gpu *A_gpu);
+
+int_t freeSluGPU(sluGPU_t *sluGPU);
+
+cublasStatus_t checkCublas(cublasStatus_t result);
+// cudaError_t checkCuda(cudaError_t result);
+
+void dPrint_matrix( char *desc, int_t m, int_t n, double *dA, int_t lda );
+
+/*to print out various statistics*/
+void printGPUStats(LUstruct_gpu *A_gpu);
+
+#ifdef __cplusplus
+}
+#endif
+
+#undef DEBUG
diff --git a/SRC/pdgstrf3d.c b/SRC/pdgstrf3d.c
index ed8cc562..b9a884dd 100644
--- a/SRC/pdgstrf3d.c
+++ b/SRC/pdgstrf3d.c
@@ -206,27 +206,19 @@ int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
     commRequests_t** comReqss = initCommRequestsArr(SUPERLU_MAX(mxLeafNode, numLA), ldt, grid);
 
     /* Setting up GPU related data structures */
-
+#define GPU_FRAMEWORK
+#ifdef GPU_FRAMEWORK
     int_t first_l_block_acc = 0;
     int_t first_u_block_acc = 0;
-    int_t Pc = grid->npcol;
+    int_t Pc = grid->npcol; 
     int_t Pr = grid->nprow;
-    int_t mrb =    (nsupers + Pr - 1) / Pr;
+    int_t mrb =    (nsupers + Pr - 1) / Pr;  // Sherry check ... use ceiling
     int_t mcb =    (nsupers + Pc - 1) / Pc;
-    HyP_t *HyP = (HyP_t *) SUPERLU_MALLOC(sizeof(HyP_t));
-
-    dInit_HyP(HyP, Llu, mcb, mrb);
-
+    HyP_t *HyP = (HyP_t *) malloc(sizeof(HyP_t));
+    Init_HyP(HyP, Llu, mcb, mrb);
     HyP->first_l_block_acc = first_l_block_acc;
     HyP->first_u_block_acc = first_u_block_acc;
-    int_t bigu_size = getBigUSize(nsupers, grid, LUstruct);
-    // int_t buffer_size = get_max_buffer_size ();
-    // HyP->buffer_size = buffer_size;
-    HyP->bigu_size = bigu_size;
-    HyP->nsupers = nsupers;
-
-#ifdef GPU_ACC
-
+    int_t superlu_acc_offload = HyP->superlu_acc_offload;
     /*Now initialize the GPU data structure*/
     LUstruct_gpu *A_gpu, *dA_gpu;
 
@@ -235,24 +227,43 @@ int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
     sluGPU_t sluGPUobj;
     sluGPU_t *sluGPU = &sluGPUobj;
     sluGPU->isNodeInMyGrid = getIsNodeInMyGrid(nsupers, maxLvl, myNodeCount, treePerm);
+
+    int_t bigu_size = getBigUSize(nsupers, grid, LUstruct);
+    int_t buffer_size = get_max_buffer_size ();
+    HyP->buffer_size = buffer_size;
+    HyP->bigu_size = bigu_size;
+    HyP->nsupers = nsupers;
+
     if (superlu_acc_offload)
     {
+
+#if 0 	/* Sherry: For the GPU code on Titan we do not need the performance
+	   lookup tables: given the CPU-GPU performance gap, it did not make
+	   much sense to do any Schur-complement update on the CPU, except for
+	   the lookahead update. The same should hold for Summit. (from Piyush) */
+
         /*Initilize the lookup tables */
         LookUpTableInit(iam);
         acc_async_cost = get_acc_async_cost();
 #ifdef GPU_DEBUG
         if (!iam) printf("Using MIC async cost of %lf \n", acc_async_cost);
 #endif
-
+#endif
+	int_t* perm_c_supno = getPerm_c_supno(nsupers, options, LUstruct, grid);
+	/* Initialize GPU data structures */
         initSluGPU3D_t(sluGPU, LUstruct, grid3d, perm_c_supno,
                        n, buffer_size, bigu_size, ldt);
-
+        
         HyP->first_u_block_acc = sluGPU->A_gpu->first_u_block_gpu;
         HyP->first_l_block_acc = sluGPU->A_gpu->first_l_block_gpu;
         HyP->nCudaStreams = sluGPU->nCudaStreams;
-    }
 
-#endif  // end GPU_ACC
+    } /* end if superlu_acc_offload */
+
+#endif
+
+
 
     /*====  starting main factorization loop =====*/
     MPI_Barrier( grid3d->comm);
diff --git a/SRC/superlu_gpu.cu b/SRC/superlu_gpu.cu
new file mode 100644
index 00000000..241e7362
--- /dev/null
+++ b/SRC/superlu_gpu.cu
@@ -0,0 +1,1856 @@
+#define GPU_DEBUG
+
+#include "mpi.h"
+#include "omp.h"
+#include "sec_structs.h"
+/* The bracketed header names were lost from the patch; stdio.h, math.h,
+   assert.h, and cub/cub.cuh are educated guesses based on the printf, sqrt,
+   assert, and cub::BlockScan usage below. */
+#include <stdio.h>
+#include <math.h>
+#include <assert.h>
+#undef Reduce
+#include <cub/cub.cuh>
+#include "lustruct_gpu.h"
+#include "p3dcomm.h"
+// #include "mkl_cblas.h"
+
+extern "C" {
+	void cblas_daxpy(const int N, const double alpha, const double *X,
+	                 const int incX, double *Y, const int incY);
+}
+
+/*error reporting functions */
+static
+cudaError_t checkCuda(cudaError_t result)
+{
+#if defined(DEBUG) || defined(_DEBUG)
+	if (result != cudaSuccess)
+	{
+		fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
+		assert(result == cudaSuccess);
+	}
+#endif
+	return result;
+}
+
+
+cublasStatus_t checkCublas(cublasStatus_t result)
+{
+#if defined(DEBUG) || defined(_DEBUG)
+	if (result != CUBLAS_STATUS_SUCCESS)
+	{
+		fprintf(stderr, "CUDA Blas Runtime Error: %s\n", cublasGetErrorString(result));
+		assert(result == CUBLAS_STATUS_SUCCESS);
+	}
+#endif
+	return result;
+}
+
+
+int_t getnCudaStreams()
+{
+        char *ttemp;
+	ttemp = getenv ("N_CUDA_STREAMS");
+
+	if (ttemp)
+		return atoi (ttemp);
+	else
+		return 1;
+}
+
+
+// #define UNIT_STRIDE
+
+__device__ inline
+void device_scatter_l (int_t thread_id,
+                       int_t nsupc, int_t temp_nbrow,
+                       int_t *usub, int_t iukp, int_t klst,
+                       double *nzval, int_t ldv,
+                       double *tempv, int_t nbrow,
+                       // int_t *indirect2_thread
+                       int *indirect2_thread
+                      )
+{
+
+
+	int_t segsize, jj;
+
+	for (jj = 0; jj < nsupc; ++jj)
+	{
+		segsize = klst - usub[iukp + jj];
+		if (segsize)
+		{
+			if (thread_id < temp_nbrow)
+			{
+
+#ifndef UNIT_STRIDE
+				nzval[indirect2_thread[thread_id]] -= tempv[thread_id];
+#else
+				nzval[thread_id] -= tempv[thread_id]; /*making access unit strided*/
+#endif
+			}
+			tempv += nbrow;
+		}
+		nzval += ldv;
+	}
+}
+
+#define THREAD_BLOCK_SIZE  512  /* Sherry: was 192 on Titan */
+
+#define MAX_SUPER_SIZE   512    /* Sherry: was 192. Must be <= THREAD_BLOCK_SIZE */
+
+
+__device__ inline
+void device_scatter_l_2D (int thread_id,
+                          int nsupc, int temp_nbrow,
+                          int_t *usub, int iukp, int_t klst,
+                          double *nzval, int ldv,
+                          const double *tempv, int nbrow,
+                          // int_t *indirect2_thread
+                          int *indirect2_thread,
+                          int nnz_cols, int ColPerBlock,
+                          int *IndirectJ3
+                         )
+{
+    if ( thread_id < temp_nbrow * ColPerBlock )
+    {
+	int thread_id_x  = thread_id % temp_nbrow;
+	int thread_id_y  = thread_id / temp_nbrow;
+
+#define UNROLL_ITER 8
+
+#pragma unroll 4
+	for (int col = thread_id_y; col < nnz_cols ; col += ColPerBlock)
+	{
+	    nzval[ldv * IndirectJ3[col] + indirect2_thread[thread_id_x]]
+		-= tempv[nbrow * col + thread_id_x];
+	}
+    }
+}
+
+/* Sherry: this routine is not used */
+__global__
+void cub_scan_test(void)
+{
+    int thread_id = threadIdx.x;
+    typedef cub::BlockScan<int, THREAD_BLOCK_SIZE> BlockScan; /* 1-D int data; template arguments restored (lost from the patch) */
+
+    __shared__ typename BlockScan::TempStorage temp_storage; /*storage temp*/
+    
+    __shared__ int IndirectJ1[MAX_SUPER_SIZE];
+    __shared__ int IndirectJ2[MAX_SUPER_SIZE];
+
+    if (thread_id < MAX_SUPER_SIZE)
+    {
+	IndirectJ1[thread_id] = (thread_id + 1) % 2;
+    }
+
+    __syncthreads();
+    if (thread_id < MAX_SUPER_SIZE)
+	BlockScan(temp_storage).InclusiveSum (IndirectJ1[thread_id], IndirectJ2[thread_id]);
+
+
+    if (thread_id < MAX_SUPER_SIZE)
+	printf("%d %d\n", thread_id, IndirectJ2[thread_id]);
+
+}
+
+__device__ inline
+void device_scatter_u_2D (int thread_id,
+                          int temp_nbrow,  int nsupc,
+                          double * ucol,
+                          int_t * usub, int iukp,
+                          int_t ilst, int_t klst,
+                          int_t * index, int iuip_lib,
+                          double * tempv, int nbrow,
+                          int *indirect,
+                          int nnz_cols, int ColPerBlock,
+                          int *IndirectJ1,
+                          int *IndirectJ3
+                         )
+{
+    if ( thread_id < temp_nbrow * ColPerBlock )
+    {
+	/* 1D threads are logically arranged in 2D shape. */
+	int thread_id_x  = thread_id % temp_nbrow;
+	int thread_id_y  = thread_id / temp_nbrow;
+
+#pragma unroll 4
+	for (int col = thread_id_y; col < nnz_cols ; col += ColPerBlock)
+	{
+	    ucol[IndirectJ1[IndirectJ3[col]] + indirect[thread_id_x]]
+		-= tempv[nbrow * col + thread_id_x];
+	}
+    }
+}
+
+
+__device__ inline
+void device_scatter_u (int_t thread_id,
+                       int_t temp_nbrow,  int_t nsupc,
+                       double * ucol,
+                       int_t * usub, int_t iukp,
+                       int_t ilst, int_t klst,
+                       int_t * index, int_t iuip_lib,
+                       double * tempv, int_t nbrow,
+                       // int_t *indirect
+                       int *indirect
+                      )
+{
+	int_t segsize, fnz, jj;
+	for (jj = 0; jj < nsupc; ++jj)
+	{
+		segsize = klst - usub[iukp + jj];
+		fnz = index[iuip_lib++];
+		ucol -= fnz;
+		if (segsize)            /* Nonzero segment in U(k.j). */
+		{
+
+
+			if (thread_id < temp_nbrow)
+			{
+#ifndef UNIT_STRIDE
+				ucol[indirect[thread_id]] -= tempv[thread_id];
+#else
+				/* making access unit-strided; this is incorrect
+				   and is only used for bandwidth measurements */
+				ucol[thread_id] -= tempv[thread_id];
+#endif
+			}
+			tempv += nbrow;
+		}
+		ucol += ilst ;
+	}
+}
+
+
+__global__
+void Scatter_GPU_kernel(
+    int_t streamId,
+    int_t ii_st, int_t ii_end, 
+    int_t jj_st, int_t jj_end, /* defines the rectangular Schur block to be scattered */
+    int_t klst,
+    int_t jj0,   /* 0 on entry */
+    int_t nrows, int_t ldt, int_t npcol, int_t nprow,
+    LUstruct_gpu * A_gpu)
+{
+
+    /* initializing pointers */
+    int_t *xsup = A_gpu->xsup;
+    int_t *UrowindPtr = A_gpu->UrowindPtr;
+    int_t *UrowindVec = A_gpu->UrowindVec;
+    int_t *UnzvalPtr = A_gpu->UnzvalPtr;
+    double *UnzvalVec = A_gpu->UnzvalVec;
+    int_t *LrowindPtr = A_gpu->LrowindPtr;
+    int_t *LrowindVec = A_gpu->LrowindVec;
+    int_t *LnzvalPtr = A_gpu->LnzvalPtr;
+    double *LnzvalVec = A_gpu->LnzvalVec;
+    double *bigV = A_gpu->scubufs[streamId].bigV;
+    local_l_blk_info_t *local_l_blk_infoVec = A_gpu->local_l_blk_infoVec;
+    local_u_blk_info_t *local_u_blk_infoVec = A_gpu->local_u_blk_infoVec;
+    int_t *local_l_blk_infoPtr = A_gpu->local_l_blk_infoPtr;
+    int_t *local_u_blk_infoPtr = A_gpu->local_u_blk_infoPtr;
+    Remain_info_t *Remain_info = A_gpu->scubufs[streamId].Remain_info;
+    Ublock_info_t *Ublock_info = A_gpu->scubufs[streamId].Ublock_info;
+    int_t *lsub  = A_gpu->scubufs[streamId].lsub;
+    int_t *usub  = A_gpu->scubufs[streamId].usub;
+
+    /* thread block assignment: this thread block is 
+       assigned to block (lb, j) in 2D grid */
+    int lb = blockIdx.x + ii_st;
+    int j  = blockIdx.y + jj_st;
+    __shared__ int indirect_thread[MAX_SUPER_SIZE];  /* row-wise */
+    __shared__ int indirect2_thread[MAX_SUPER_SIZE]; /* row-wise */
+    __shared__ int IndirectJ1[THREAD_BLOCK_SIZE];    /* column-wise */
+    __shared__ int IndirectJ3[THREAD_BLOCK_SIZE];    /* column-wise */
+
+    /* see CUB page https://nvlabs.github.io/cub/. Implement threads collectives */
+    typedef cub::BlockScan<int, THREAD_BLOCK_SIZE> BlockScan; /* 1-D int data; template arguments restored (lost from the patch) */
+    __shared__ typename BlockScan::TempStorage temp_storage; /*storage temp*/
+
+    int thread_id = threadIdx.x;
+
+    int iukp = Ublock_info[j].iukp;
+    int jb = Ublock_info[j].jb;
+    int nsupc = SuperSize (jb);
+    int ljb = jb / npcol;
+
+    double *tempv1;
+    if (jj_st == jj0)
+    {
+	tempv1 = (j == jj_st) ? bigV
+	                      : bigV + Ublock_info[j - 1].full_u_cols * nrows;
+    }
+    else
+    {
+	tempv1 = (j == jj_st) ? bigV 
+	                      : bigV + (Ublock_info[j - 1].full_u_cols -
+					Ublock_info[jj_st - 1].full_u_cols) * nrows;
+    }
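+    /* full_u_cols is a prefix sum over U blocks, so the difference above
+       gives the number of buffer columns preceding block j.  When jj_st > jj0
+       the GEMM buffer holds only the columns of blocks jj_st onward, hence
+       the extra subtraction of Ublock_info[jj_st - 1].full_u_cols. */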
+
+    /* # of nonzero columns in block j  */
+    int nnz_cols = (j == 0) ? Ublock_info[j].full_u_cols
+	: (Ublock_info[j].full_u_cols - Ublock_info[j - 1].full_u_cols);
+    int cum_ncol = (j == 0) ? 0	: Ublock_info[j - 1].full_u_cols;
+
+    int lptr = Remain_info[lb].lptr; 
+    int ib   = Remain_info[lb].ib;
+    int temp_nbrow = lsub[lptr + 1]; /* number of rows in the current L block */
+    lptr += LB_DESCRIPTOR;
+
+    int_t cum_nrow;
+    if (ii_st == 0)
+    {
+	cum_nrow = (lb == 0 ? 0 : Remain_info[lb - 1].FullRow);
+    }
+    else
+    {
+	cum_nrow = (lb == 0 ? 0 : Remain_info[lb - 1].FullRow - Remain_info[ii_st - 1].FullRow);
+    }
+
+    tempv1 += cum_nrow;
+
+    if (ib < jb)  /*scatter U code */
+    {
+	int ilst = FstBlockC (ib + 1);
+	int lib =  ib / nprow;   /* local index of row block ib */
+	int_t *index = &UrowindVec[UrowindPtr[lib]];
+
+	int num_u_blocks = index[0];
+
+	int ljb = (jb) / npcol; /* local index of column block jb */
+
+	/* Each thread is responsible for one block column */
+	__shared__ int ljb_ind;
+	/*do a search ljb_ind at local row lib*/
+	int blks_per_threads = CEILING(num_u_blocks, THREAD_BLOCK_SIZE);
+	for (int i = 0; i < blks_per_threads; ++i) 
+	    /* each thread is assigned a chunk of consecutive U blocks to search */
+	{   
+	    /* only one thread finds the block index matching ljb */
+	    if (thread_id * blks_per_threads + i < num_u_blocks &&
+		local_u_blk_infoVec[ local_u_blk_infoPtr[lib] + thread_id * blks_per_threads + i ].ljb == ljb)
+	    {
+		ljb_ind = thread_id * blks_per_threads + i; 
+	    }
+	}
+	__syncthreads();
+
+	int iuip_lib = local_u_blk_infoVec[ local_u_blk_infoPtr[lib] + ljb_ind].iuip;
+	int ruip_lib = local_u_blk_infoVec[ local_u_blk_infoPtr[lib] + ljb_ind].ruip;
+	iuip_lib += UB_DESCRIPTOR;
+	double *Unzval_lib = &UnzvalVec[UnzvalPtr[lib]];
+	double *ucol = &Unzval_lib[ruip_lib];
+
+	if (thread_id < temp_nbrow) /* row-wise */
+	{   /* cyclically map each thread to a row */
+	    indirect_thread[thread_id] = (int) lsub[lptr + thread_id];
+	}
+
+	/* column-wise: each thread is assigned one column */
+	if (thread_id < nnz_cols)
+	    IndirectJ3[thread_id] = A_gpu->scubufs[streamId].usub_IndirectJ3[cum_ncol + thread_id];
+	/* indirectJ3[j] == kk means the j-th nonzero segment
+	   points to column kk in this supernode */
+
+	__syncthreads();
+
+	/* threads are divided into multiple columns */
+	int ColPerBlock = THREAD_BLOCK_SIZE / temp_nbrow;
+	
+	if (thread_id < THREAD_BLOCK_SIZE)
+	    IndirectJ1[thread_id] = 0;
+
+	if (thread_id < THREAD_BLOCK_SIZE)
+	{
+	    if (thread_id < nsupc)
+	    {
+		/* fstnz subscript of each column in the block */
+		IndirectJ1[thread_id] = index[iuip_lib + thread_id];
+	    }
+	}
+
+	/* perform an inclusive block-wide prefix sum among all threads */
+	if (thread_id < THREAD_BLOCK_SIZE)
+	    BlockScan(temp_storage).InclusiveSum(IndirectJ1[thread_id], IndirectJ1[thread_id]);
+
+	if (thread_id < THREAD_BLOCK_SIZE)
+	    IndirectJ1[thread_id] = -IndirectJ1[thread_id] + ilst * thread_id;
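+	/* After the scan, IndirectJ1[t] = ilst * t - sum_{k <= t} fnz_k, the
+	   offset of column t's segment within ucol.  This matches the pointer
+	   arithmetic "ucol -= fnz; ... ucol += ilst" in device_scatter_u. */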
+
+	__syncthreads();
+
+	device_scatter_u_2D (
+			     thread_id,
+			     temp_nbrow,  nsupc,
+			     ucol,
+			     usub, iukp,
+			     ilst, klst,
+			     index, iuip_lib,
+			     tempv1, nrows,
+			     indirect_thread,
+			     nnz_cols, ColPerBlock,
+			     IndirectJ1,
+			     IndirectJ3 );
+
+    } else { /* ib >= jb, scatter L code */
+	
+	int rel;
+	double *nzval;
+	int_t *index = &LrowindVec[LrowindPtr[ljb]];
+	int num_l_blocks = index[0];
+	int ldv = index[1];
+
+	int fnz = FstBlockC (ib);
+	int lib = ib / nprow;
+
+	__shared__ int lib_ind;
+	/*do a search lib_ind for lib*/
+	int blks_per_threads = CEILING(num_l_blocks, THREAD_BLOCK_SIZE);
+	for (int i = 0; i < blks_per_threads; ++i)
+	{
+	    if (thread_id * blks_per_threads + i < num_l_blocks &&
+		local_l_blk_infoVec[ local_l_blk_infoPtr[ljb] + thread_id * blks_per_threads + i ].lib == lib)
+	    {
+		lib_ind = thread_id * blks_per_threads + i;
+	    }
+	}
+	__syncthreads();
+
+	int lptrj = local_l_blk_infoVec[ local_l_blk_infoPtr[ljb] + lib_ind].lptrj;
+	int luptrj = local_l_blk_infoVec[ local_l_blk_infoPtr[ljb] + lib_ind].luptrj;
+	lptrj += LB_DESCRIPTOR;
+	int dest_nbrow = index[lptrj - 1];
+
+	if (thread_id < dest_nbrow)
+	{
+	    rel = index[lptrj + thread_id] - fnz;
+	    indirect_thread[rel] = thread_id;
+	}
+	__syncthreads();
+
+	/* can be precalculated */
+	if (thread_id < temp_nbrow)
+	{
+	    rel = lsub[lptr + thread_id] - fnz;
+	    indirect2_thread[thread_id] = indirect_thread[rel];
+	}
+	if (thread_id < nnz_cols)
+	    IndirectJ3[thread_id] = (int) A_gpu->scubufs[streamId].usub_IndirectJ3[cum_ncol + thread_id];
+	__syncthreads();
+
+	int ColPerBlock = THREAD_BLOCK_SIZE / temp_nbrow;
+
+	nzval = &LnzvalVec[LnzvalPtr[ljb]] + luptrj;
+	device_scatter_l_2D(
+			    thread_id,
+			    nsupc, temp_nbrow,
+			    usub, iukp, klst,
+			    nzval, ldv,
+			    tempv1, nrows, indirect2_thread,
+			    nnz_cols, ColPerBlock,
+			    IndirectJ3);
+    } /* end else ib >= jb */
+
+} /* end Scatter_GPU_kernel */
+
+
+#define GPU_2D_SCHUDT  /* Not used */
+
+int_t SchurCompUpdate_GPU(
+    int_t streamId,
+    int_t jj_cpu, /* 0 on entry, pointing to the start of Phi part */
+    int_t nub,    /* jj_cpu on entry, pointing to the end of the Phi part */
+    int_t klst, int_t knsupc,
+    int_t Rnbrow, int_t RemainBlk,
+    int_t Remain_lbuf_send_size,
+    int_t bigu_send_size, int_t ldu,
+    int_t mcb,    /* num_u_blks_hi */
+    int_t buffer_size, int_t lsub_len, int_t usub_len,
+    int_t ldt, int_t k0,
+    sluGPU_t *sluGPU, gridinfo_t *grid
+)
+{
+
+    LUstruct_gpu * A_gpu = sluGPU->A_gpu;
+    LUstruct_gpu * dA_gpu = sluGPU->dA_gpu;
+    int_t nprow = grid->nprow;
+    int_t npcol = grid->npcol;
+
+    cudaStream_t FunCallStream = sluGPU->funCallStreams[streamId];
+    cublasHandle_t cublas_handle0 = sluGPU->cublasHandles[streamId];
+    int_t * lsub = A_gpu->scubufs[streamId].lsub_buf;
+    int_t * usub = A_gpu->scubufs[streamId].usub_buf;
+    Remain_info_t *Remain_info = A_gpu->scubufs[streamId].Remain_info_host;
+    double * Remain_L_buff = A_gpu->scubufs[streamId].Remain_L_buff_host;
+    Ublock_info_t *Ublock_info = A_gpu->scubufs[streamId].Ublock_info_host;
+    double * bigU = A_gpu->scubufs[streamId].bigU_host;
+
+    A_gpu->isOffloaded[k0] = 1;
+    /* start by sending data to the GPU */
+    int_t *xsup = A_gpu->xsup_host;
+    int_t col_back = (jj_cpu == 0) ? 0 : Ublock_info[jj_cpu - 1].full_u_cols;
+    // if(nub<1) return;
+    int_t ncols  = Ublock_info[nub - 1].full_u_cols - col_back;
+
+    /* Sherry: can get max_super_size from sp_ienv(3) */
+    int_t indirectJ1[MAX_SUPER_SIZE]; // 0 indicates an empty segment
+    int_t indirectJ2[MAX_SUPER_SIZE]; // # of nonzero segments so far 
+    int_t indirectJ3[MAX_SUPER_SIZE]; /* indirectJ3[j] == k means the
+					 j-th nonzero segment points
+					 to column k in this supernode */
+    /* calculate usub_indirect */
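+    /* Worked example (minimal sketch): for nsupc = 4 with nonzero segments
+       in columns 0 and 2, indirectJ1 = {1,0,1,0}, indirectJ2 = {1,1,2,2},
+       and indirectJ3 = {0,2}: segment 0 maps to column 0, segment 1 to
+       column 2. */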
+    for (int jj = jj_cpu; jj < nub; ++jj)
+    {
+	int_t iukp = Ublock_info[jj].iukp;
+	int_t jb = Ublock_info[jj].jb;
+	int_t nsupc = SuperSize (jb);
+	int_t addr = (jj == 0) ? 0
+	                       : Ublock_info[jj - 1].full_u_cols - col_back;
+
+	for (int_t kk = 0; kk < MAX_SUPER_SIZE; ++kk)
+	{
+	    indirectJ1[kk] = 0;
+	}
+
+	for (int_t kk = 0; kk < nsupc; ++kk)
+	{
+	    indirectJ1[kk] = ((klst - usub[iukp + kk]) == 0) ? 0 : 1;
+	}
+
+	/*prefix sum - indicates # of nonzero segments up to column kk */
+	indirectJ2[0] = indirectJ1[0];
+	for (int_t kk = 1; kk < MAX_SUPER_SIZE; ++kk)
+	{
+	    indirectJ2[kk] = indirectJ2[kk - 1] + indirectJ1[kk];
+	}
+
+	/* total number of nonzero segments in this supernode */
+	int nnz_col = indirectJ2[MAX_SUPER_SIZE - 1];
+
+	/* compaction: gather the column indices of the nonzero segments */
+	for (int_t kk = 0; kk < MAX_SUPER_SIZE; ++kk)
+	{
+	    if (indirectJ1[kk]) /* kk is a nonzero segment */
+	    {
+		/* indirectJ3[j] == kk means the j-th nonzero segment
+		   points to column kk in this supernode */
+		indirectJ3[indirectJ2[kk] - 1] = kk;
+	    }
+	}
+
+	for (int i = 0; i < nnz_col; ++i)
+	{
+	    /* addr == total # of full columns before current block jj */
+	    A_gpu->scubufs[streamId].usub_IndirectJ3_host[addr + i] = indirectJ3[i];
+	}
+    } /* end for jj ... calculate usub_indirect */
+
+    //printf("SchurCompUpdate_GPU[3]: jj_cpu %d, nub %d\n", jj_cpu, nub); fflush(stdout);
+
+    /*sizeof RemainLbuf = Rnbuf*knsupc */
+    double tTmp = SuperLU_timer_();
+    cudaEventRecord(A_gpu->ePCIeH2D[k0], FunCallStream);
+
+    checkCuda(cudaMemcpyAsync(A_gpu->scubufs[streamId].usub_IndirectJ3,
+			      A_gpu->scubufs[streamId].usub_IndirectJ3_host,
+			      ncols * sizeof(int_t), cudaMemcpyHostToDevice,
+			      FunCallStream)) ;
+
+    checkCuda(cudaMemcpyAsync(A_gpu->scubufs[streamId].Remain_L_buff, Remain_L_buff,
+			      Remain_lbuf_send_size * sizeof(double),
+			      cudaMemcpyHostToDevice, FunCallStream)) ;
+
+    checkCuda(cudaMemcpyAsync(A_gpu->scubufs[streamId].bigU, bigU,
+			      bigu_send_size * sizeof(double),
+			      cudaMemcpyHostToDevice, FunCallStream) );
+
+    checkCuda(cudaMemcpyAsync(A_gpu->scubufs[streamId].Remain_info, Remain_info,
+			      RemainBlk * sizeof(Remain_info_t),
+			      cudaMemcpyHostToDevice, FunCallStream) );
+
+    checkCuda(cudaMemcpyAsync(A_gpu->scubufs[streamId].Ublock_info, Ublock_info,
+			      mcb * sizeof(Ublock_info_t), cudaMemcpyHostToDevice,
+			      FunCallStream) );
+
+    checkCuda(cudaMemcpyAsync(A_gpu->scubufs[streamId].lsub, lsub,
+			      lsub_len * sizeof(int_t), cudaMemcpyHostToDevice,
+			      FunCallStream) );
+
+    checkCuda(cudaMemcpyAsync(A_gpu->scubufs[streamId].usub, usub,
+			      usub_len * sizeof(int_t), cudaMemcpyHostToDevice,
+			      FunCallStream) );
+
+    A_gpu->tHost_PCIeH2D += SuperLU_timer_() - tTmp;
+    A_gpu->cPCIeH2D += Remain_lbuf_send_size * sizeof(double)
+	             + bigu_send_size * sizeof(double)
+	             + RemainBlk * sizeof(Remain_info_t)
+	             + mcb * sizeof(Ublock_info_t)
+	             + lsub_len * sizeof(int_t)
+	             + usub_len * sizeof(int_t);
+
+    double alpha = 1.0, beta = 0.0;
+
+    int_t ii_st  = 0;
+    int_t ii_end = 0;
+    int_t maxGemmBlockDim = (int) sqrt((double) buffer_size);
+    // int_t maxGemmBlockDim = 8000;
+
+    /* Organize GEMM by blocks of [ii_st : ii_end, jj_st : jj_end] that
+       fits in the buffer_size  */
+    while (ii_end < RemainBlk)
+    {
+	ii_st = ii_end;
+	ii_end = RemainBlk;
+	int_t nrow_max = maxGemmBlockDim;
+// nrow_max = Rnbrow;
+	int_t remaining_rows = (ii_st == 0) ? Rnbrow : Rnbrow - Remain_info[ii_st - 1].FullRow;
+	nrow_max = (remaining_rows / nrow_max) > 0 ? remaining_rows / CEILING(remaining_rows,  nrow_max) : nrow_max;
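+	/* Example: remaining_rows = 1000 and maxGemmBlockDim = 300 give
+	   CEILING(1000, 300) = 4 and nrow_max = 1000/4 = 250, splitting the
+	   rows into equal chunks no larger than maxGemmBlockDim. */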
+
+	int_t ResRow = (ii_st == 0) ? 0 : Remain_info[ii_st - 1].FullRow;
+	for (int_t i = ii_st; i < RemainBlk - 1; ++i)
+	{
+	    if ( Remain_info[i + 1].FullRow > ResRow + nrow_max)
+		{
+		    ii_end = i;
+		    break;  /* row dimension reaches nrow_max */
+		}
+	}
+
+	int_t nrows;   /* actual row dimension for GEMM */
+	int_t st_row;
+	if (ii_st > 0)
+	{
+	    nrows = Remain_info[ii_end - 1].FullRow - Remain_info[ii_st - 1].FullRow;
+	    st_row = Remain_info[ii_st - 1].FullRow;
+	}
+	else
+	{
+	    nrows = Remain_info[ii_end - 1].FullRow;
+	    st_row = 0;
+	}
+
+	int_t jj_st = jj_cpu;
+	int_t jj_end = jj_cpu;
+
+	while (jj_end < nub && nrows > 0 )
+	{
+	    int_t remaining_cols = (jj_st == jj_cpu) ? ncols : ncols - Ublock_info[jj_st - 1].full_u_cols;
+	    if ( remaining_cols * nrows < buffer_size)
+	    {
+		jj_st = jj_end;
+		jj_end = nub;
+	    }
+	    else  /* C matrix cannot fit in buffer, need to break into pieces */
+	    {
+		int_t ncol_max = buffer_size / nrows;
+		/** Must revisit **/
+		ncol_max = SUPERLU_MIN(ncol_max, maxGemmBlockDim);
+		ncol_max = (remaining_cols / ncol_max) > 0 ?
+		    remaining_cols / CEILING(remaining_cols,  ncol_max)
+		    : ncol_max;
+
+		jj_st = jj_end;
+		jj_end = nub;
+
+		int_t ResCol = (jj_st == 0) ? 0 : Ublock_info[jj_st - 1].full_u_cols;
+		for (int_t j = jj_st; j < nub - 1; ++j)
+		{
+		    if (Ublock_info[j + 1].full_u_cols > ResCol + ncol_max)
+		    {
+			jj_end = j;
+			break;
+		    }
+		}
+	    } /* end-if-else */
+
+	    int_t ncols;
+	    int_t st_col;
+	    if (jj_st > 0)
+	    {
+		ncols = Ublock_info[jj_end - 1].full_u_cols - Ublock_info[jj_st - 1].full_u_cols;
+		st_col = Ublock_info[jj_st - 1].full_u_cols;
+		if (ncols == 0) exit(EXIT_FAILURE); /* should never happen */
+	    }
+	    else
+	    {
+		ncols = Ublock_info[jj_end - 1].full_u_cols;
+		st_col = 0;
+	    }
+
+	    /* none of the matrix dimension is zero. */
+	    if (nrows > 0 && ldu > 0 && ncols > 0)
+	    {
+		if (nrows * ncols > buffer_size) {
+		    printf("!! Matrix size %lld x %lld exceeds buffer_size %lld \n",
+			   (long long) nrows, (long long) ncols, (long long) buffer_size);
+		    fflush(stdout);
+		}
+		assert(nrows * ncols <= buffer_size);
+		cublasSetStream(cublas_handle0, FunCallStream);
+		cudaEventRecord(A_gpu->GemmStart[k0], FunCallStream);
+		cublasDgemm(cublas_handle0, CUBLAS_OP_N, CUBLAS_OP_N,
+			    nrows, ncols, ldu, &alpha,
+			    &A_gpu->scubufs[streamId].Remain_L_buff[(knsupc - ldu) * Rnbrow + st_row], Rnbrow,
+			    &A_gpu->scubufs[streamId].bigU[st_col * ldu], ldu,
+			    &beta, A_gpu->scubufs[streamId].bigV, nrows);
+
+// #define SCATTER_OPT
+#ifdef SCATTER_OPT
+		cudaStreamSynchronize(FunCallStream);
+#warning this function is synchronous
+#endif
+		cudaEventRecord(A_gpu->GemmEnd[k0], FunCallStream);
+
+		A_gpu->GemmFLOPCounter += 2.0 * (double) nrows * ncols * ldu ;
+
+		/*
+		 * Scattering the output
+		 */
+		dim3 dimBlock(THREAD_BLOCK_SIZE);   // 1d thread
+
+		dim3 dimGrid(ii_end - ii_st, jj_end - jj_st);
+
+		Scatter_GPU_kernel <<< dimGrid, dimBlock, 0, FunCallStream>>>
+		    (streamId, ii_st, ii_end,  jj_st, jj_end, klst,
+		     0, nrows, ldt, npcol, nprow, dA_gpu);
+#ifdef SCATTER_OPT
+		cudaStreamSynchronize(FunCallStream);
+#warning this function is synchronous
+#endif
+
+		cudaEventRecord(A_gpu->ScatterEnd[k0], FunCallStream);
+
+		A_gpu->ScatterMOPCounter +=  3.0 * (double) nrows * ncols;
+	    } /* endif ... none of the matrix dimension is zero. */
+
+	} /* end while jj_end < nub */
+
+    } /* end while (ii_end < RemainBlk) */
+
+    return 0;
+} /* end SchurCompUpdate_GPU */
+
+
+void print_occupany()
+{
+	int blockSize;   // The launch configurator returned block size
+	int minGridSize; /* The minimum grid size needed to achieve the
+			    best potential occupancy  */
+
+	cudaOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize,
+	                                    Scatter_GPU_kernel, 0, 0);
+#if (PRNTlevel>=1)
+	printf("Occupancy: MinGridSize %d blocksize %d \n", minGridSize, blockSize);
+#endif
+}
+
+void printDevProp(cudaDeviceProp devProp)
+{
+	size_t mfree, mtotal;
+	cudaMemGetInfo	(&mfree, &mtotal);
+
+	printf("GPU Name:                      %s\n",  devProp.name);
+	printf("Total global memory:           %zu\n",  devProp.totalGlobalMem);
+	printf("Total free memory:             %zu\n",  mfree);
+	printf("Clock rate:                    %d\n",  devProp.clockRate);
+
+	return;
+}
+
+
+int
+get_mpi_process_per_gpu ()
+{
+
+
+	char *ttemp;
+	ttemp = getenv ("MPI_PROCESS_PER_GPU");
+
+	if (ttemp)
+		return atoi (ttemp);  /* the function returns int */
+	else
+	{
+		printf("MPI_PROCESS_PER_GPU is not set; Using default 1 \n");
+		return 1;
+	}
+}
+
+size_t
+get_acc_memory ()
+{
+
+	size_t mfree, mtotal;
+	cudaMemGetInfo	(&mfree, &mtotal);
+#if 0
+	printf("Total memory %zu & free memory %zu\n", mtotal, mfree);
+#endif
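+	/* Reserve 10% headroom and split the free device memory evenly among
+	   the MPI ranks sharing this GPU. */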
+	return (size_t) (0.9 * (double) mfree) / get_mpi_process_per_gpu ();
+
+
+}
+
+
+int_t free_LUstruct_gpu (LUstruct_gpu * A_gpu)
+{
+	checkCuda(cudaFree(A_gpu->LrowindVec));
+	checkCuda(cudaFree(A_gpu->LrowindPtr));
+
+	checkCuda(cudaFree(A_gpu->LnzvalVec));
+	checkCuda(cudaFree(A_gpu->LnzvalPtr));
+	free(A_gpu->LnzvalPtr_host);
+	/*freeing the pinned memory*/
+	int_t streamId = 0;
+	checkCuda (cudaFreeHost (A_gpu->scubufs[streamId].Remain_info_host));
+	checkCuda (cudaFreeHost (A_gpu->scubufs[streamId].Ublock_info_host));
+	checkCuda (cudaFreeHost (A_gpu->scubufs[streamId].Remain_L_buff_host));
+	checkCuda (cudaFreeHost (A_gpu->scubufs[streamId].bigU_host));
+
+	checkCuda(cudaFreeHost(A_gpu->acc_L_buff));
+	checkCuda(cudaFreeHost(A_gpu->acc_U_buff));
+	checkCuda(cudaFreeHost(A_gpu->scubufs[streamId].lsub_buf));
+	checkCuda(cudaFreeHost(A_gpu->scubufs[streamId].usub_buf));
+
+
+	free(A_gpu->isOffloaded);
+	free(A_gpu->GemmStart);
+	free(A_gpu->GemmEnd);
+	free(A_gpu->ScatterEnd);
+	free(A_gpu->ePCIeH2D);
+
+	free(A_gpu->ePCIeD2H_Start);
+	free(A_gpu->ePCIeD2H_End);
+
+	checkCuda(cudaFree(A_gpu->UrowindVec));
+	checkCuda(cudaFree(A_gpu->UrowindPtr));
+
+	free(A_gpu->UrowindPtr_host);
+
+	checkCuda(cudaFree(A_gpu->UnzvalVec));
+	checkCuda(cudaFree(A_gpu->UnzvalPtr));
+
+	checkCuda(cudaFree(A_gpu->grid));
+
+
+
+	checkCuda(cudaFree(A_gpu->scubufs[streamId].bigV));
+	checkCuda(cudaFree(A_gpu->scubufs[streamId].bigU));
+
+	checkCuda(cudaFree(A_gpu->scubufs[streamId].Remain_L_buff));
+	checkCuda(cudaFree(A_gpu->scubufs[streamId].Ublock_info));
+	checkCuda(cudaFree(A_gpu->scubufs[streamId].Remain_info));
+
+	// checkCuda(cudaFree(A_gpu->indirect));
+	// checkCuda(cudaFree(A_gpu->indirect2));
+	checkCuda(cudaFree(A_gpu->xsup));
+
+	checkCuda(cudaFree(A_gpu->scubufs[streamId].lsub));
+	checkCuda(cudaFree(A_gpu->scubufs[streamId].usub));
+
+
+	checkCuda(cudaFree(A_gpu->local_l_blk_infoVec));
+	checkCuda(cudaFree(A_gpu->local_l_blk_infoPtr));
+	checkCuda(cudaFree(A_gpu->jib_lookupVec));
+	checkCuda(cudaFree(A_gpu->jib_lookupPtr));
+	checkCuda(cudaFree(A_gpu->local_u_blk_infoVec));
+	checkCuda(cudaFree(A_gpu->local_u_blk_infoPtr));
+	checkCuda(cudaFree(A_gpu->ijb_lookupVec));
+	checkCuda(cudaFree(A_gpu->ijb_lookupPtr));
+
+	return 0;
+
+}
+
+
+
+void dPrint_matrix( char *desc, int_t m, int_t n, double * dA, int_t lda ) {
+	double *cPtr = (double *) malloc(sizeof(double) * lda * n);
+	checkCuda(cudaMemcpy( cPtr, dA,
+	                      lda * n * sizeof(double), cudaMemcpyDeviceToHost)) ;
+
+	int_t i, j;
+	printf( "\n %s\n", desc );
+	for ( i = 0; i < m; i++ ) {
+		for ( j = 0; j < n; j++ ) printf( " %.3e", cPtr[i + j * lda] );
+		printf( "\n" );
+	}
+	free(cPtr);
+}
+
+void printGPUStats(LUstruct_gpu * A_gpu)
+{
+    double tGemm = 0;
+    double tScatter = 0;
+    double tPCIeH2D = 0;
+    double tPCIeD2H = 0;
+
+    for (int_t i = 0; i < A_gpu->nsupers; ++i)
+    {
+	float milliseconds = 0;
+
+	if (A_gpu->isOffloaded[i])
+	{
+	    cudaEventElapsedTime(&milliseconds, A_gpu->ePCIeH2D[i], A_gpu->GemmStart[i]);
+	    tPCIeH2D += 1e-3 * (double) milliseconds;
+	    milliseconds = 0;
+	    cudaEventElapsedTime(&milliseconds, A_gpu->GemmStart[i], A_gpu->GemmEnd[i]);
+	    tGemm += 1e-3 * (double) milliseconds;
+	    milliseconds = 0;
+	    cudaEventElapsedTime(&milliseconds, A_gpu->GemmEnd[i], A_gpu->ScatterEnd[i]);
+	    tScatter += 1e-3 * (double) milliseconds;
+	}
+
+	milliseconds = 0;
+	cudaEventElapsedTime(&milliseconds, A_gpu->ePCIeD2H_Start[i], A_gpu->ePCIeD2H_End[i]);
+	tPCIeD2H += 1e-3 * (double) milliseconds;
+    }
+
+    printf("GPU: Flops offloaded %.3e Time spent %lf Flop rate %lf GF/sec \n",
+	   A_gpu->GemmFLOPCounter, tGemm, 1e-9 * A_gpu->GemmFLOPCounter / tGemm  );
+    printf("GPU: Mop offloaded %.3e Time spent %lf Bandwidth %lf GByte/sec \n",
+	   A_gpu->ScatterMOPCounter, tScatter, 8e-9 * A_gpu->ScatterMOPCounter / tScatter  );
+    printf("PCIe Data Transfer H2D:\n\tData Sent %.3e(GB)\n\tTime observed from CPU %lf\n\tActual time spent %lf\n\tBandwidth %lf GByte/sec \n",
+	   1e-9 * A_gpu->cPCIeH2D, A_gpu->tHost_PCIeH2D, tPCIeH2D, 1e-9 * A_gpu->cPCIeH2D / tPCIeH2D  );
+    printf("PCIe Data Transfer D2H:\n\tData Sent %.3e(GB)\n\tTime observed from CPU %lf\n\tActual time spent %lf\n\tBandwidth %lf GByte/sec \n",
+	   1e-9 * A_gpu->cPCIeD2H, A_gpu->tHost_PCIeD2H, tPCIeD2H, 1e-9 * A_gpu->cPCIeD2H / tPCIeD2H  );
+    fflush(stdout);
+
+} /* end printGPUStats */
+
+
+int_t initSluGPU3D_t(
+    sluGPU_t *sluGPU,
+    LUstruct_t *LUstruct,
+    gridinfo3d_t * grid3d,
+    int_t* perm_c_supno,
+    int_t n,
+    int_t buffer_size,    /* read from env variable MAX_BUFFER_SIZE */
+    int_t bigu_size,
+    int_t ldt             /* NSUP read from sp_ienv(3) */
+)
+{
+	gridinfo_t* grid = &(grid3d->grid2d);
+	checkCudaErrors(cudaDeviceReset ())     ;
+	Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+	LocalLU_t *Llu = LUstruct->Llu;
+	int_t* isNodeInMyGrid = sluGPU->isNodeInMyGrid;
+
+	sluGPU->nCudaStreams = getnCudaStreams();
+	if (!grid->iam) {
+	    printf("initSluGPU3D_t: Using hardware acceleration, with %d cuda streams \n", sluGPU->nCudaStreams);
+	    fflush(stdout);
+	    if ( MAX_SUPER_SIZE < ldt ) {
+	        ABORT("MAX_SUPER_SIZE smaller than requested NSUP");
+	    }
+	}
+
+	cudaStreamCreate(&(sluGPU->CopyStream));
+
+	for (int_t streamId = 0; streamId < sluGPU->nCudaStreams; streamId++)
+	{
+		cudaStreamCreate(&(sluGPU->funCallStreams[streamId]));
+		cublasCreate(&(sluGPU->cublasHandles[streamId]));
+		sluGPU->lastOffloadStream[streamId] = -1;
+	}
+
+	sluGPU->A_gpu = (LUstruct_gpu *) malloc (sizeof(LUstruct_gpu));
+	sluGPU->A_gpu->perm_c_supno = perm_c_supno;
+	CopyLUToGPU3D (
+	    isNodeInMyGrid,
+	    Llu,             /* referred to as A_host */
+	    sluGPU,
+	    Glu_persist, n,
+	    grid3d,
+	    buffer_size,
+	    bigu_size,
+	    ldt
+	);
+
+	return 0;
+} /* end initSluGPU3D_t */
+
+int_t initD2Hreduce(
+    int_t next_k,
+    d2Hreduce_t* d2Hred,
+    int_t last_flag,
+    HyP_t* HyP,
+    sluGPU_t *sluGPU,
+    gridinfo_t *grid,
+    LUstruct_t *LUstruct
+    , SCT_t* SCT
+)
+{
+	Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+	LocalLU_t *Llu = LUstruct->Llu;
+	int_t* xsup = Glu_persist->xsup;
+	int_t iam = grid->iam;
+	int_t myrow = MYROW (iam, grid);
+	int_t mycol = MYCOL (iam, grid);
+	int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+	int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+
+
+	// int_t next_col = SUPERLU_MIN (k0 + num_look_aheads + 1, nsupers - 1);
+	// int_t next_k = perm_c_supno[next_col];  /* global block number for next colum*/
+	int_t mkcol, mkrow;
+
+	int_t kljb = LBj( next_k, grid );   /*local block number for next block*/
+	int_t kijb = LBi( next_k, grid );   /*local block number for next block*/
+
+	int_t *kindexL ;                     /*for storing index vectors*/
+	int_t *kindexU ;
+	mkrow = PROW (next_k, grid);
+	mkcol = PCOL (next_k, grid);
+	int_t ksup_size = SuperSize(next_k);
+
+	int_t copyL_kljb = 0;
+	int_t copyU_kljb = 0;
+	int_t l_copy_len = 0;
+	int_t u_copy_len = 0;
+
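+	/* Lblock_dirty_bit[kljb] / Ublock_dirty_bit[kijb] record the k0 of the
+	   last offload that updated the block on the GPU (-1 if clean); before
+	   copying a dirty block back we must wait for that offload's stream. */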
+	if (mkcol == mycol &&  Lrowind_bc_ptr[kljb] != NULL  && last_flag)
+	{
+		if (HyP->Lblock_dirty_bit[kljb] > -1)
+		{
+			copyL_kljb = 1;
+			int_t lastk0 = HyP->Lblock_dirty_bit[kljb];
+			int_t streamIdk0Offload =  lastk0 % sluGPU->nCudaStreams;
+			if (sluGPU->lastOffloadStream[streamIdk0Offload] == lastk0 && lastk0 != -1)
+			{
+				// printf("Waiting for Offload =%d to finish StreamId=%d\n", lastk0, streamIdk0Offload);
+				double ttx = SuperLU_timer_();
+				cudaStreamSynchronize(sluGPU->funCallStreams[streamIdk0Offload]);
+				SCT->PhiWaitTimer += SuperLU_timer_() - ttx;
+				sluGPU->lastOffloadStream[streamIdk0Offload] = -1;
+			}
+		}
+
+		kindexL = Lrowind_bc_ptr[kljb];
+		l_copy_len = kindexL[1] * ksup_size;
+	}
+
+	if ( mkrow == myrow && Ufstnz_br_ptr[kijb] != NULL    && last_flag )
+	{
+		if (HyP->Ublock_dirty_bit[kijb] > -1)
+		{
+			copyU_kljb = 1;
+			int_t lastk0 = HyP->Ublock_dirty_bit[kijb];
+			int_t streamIdk0Offload =  lastk0 % sluGPU->nCudaStreams;
+			if (sluGPU->lastOffloadStream[streamIdk0Offload] == lastk0 && lastk0 != -1)
+			{
+				// printf("Waiting for Offload =%d to finish StreamId=%d\n", lastk0, streamIdk0Offload);
+				double ttx = SuperLU_timer_();
+				cudaStreamSynchronize(sluGPU->funCallStreams[streamIdk0Offload]);
+				SCT->PhiWaitTimer += SuperLU_timer_() - ttx;
+				sluGPU->lastOffloadStream[streamIdk0Offload] = -1;
+			}
+
+		}
+		// copyU_kljb = HyP->Ublock_dirty_bit[kijb]>-1? 1: 0;
+		kindexU = Ufstnz_br_ptr[kijb];
+		u_copy_len = kindexU[1];
+	}
+
+	// wait for streams if they have not been finished
+
+	// d2Hred->next_col = next_col;
+	d2Hred->next_k = next_k;
+	d2Hred->kljb = kljb;
+	d2Hred->kijb = kijb;
+	d2Hred->copyL_kljb = copyL_kljb;
+	d2Hred->copyU_kljb = copyU_kljb;
+	d2Hred->l_copy_len = l_copy_len;
+	d2Hred->u_copy_len = u_copy_len;
+	d2Hred->kindexU = kindexU;
+	d2Hred->kindexL = kindexL;
+	d2Hred->mkrow = mkrow;
+	d2Hred->mkcol = mkcol;
+	d2Hred->ksup_size = ksup_size;
+	return 0;
+}
+
+int_t reduceGPUlu(
+
+    int_t last_flag,
+    d2Hreduce_t* d2Hred,
+    sluGPU_t *sluGPU,
+    SCT_t *SCT,
+    gridinfo_t *grid,
+    LUstruct_t *LUstruct
+)
+{
+
+	LocalLU_t *Llu = LUstruct->Llu;
+	int_t iam = grid->iam;
+	int_t myrow = MYROW (iam, grid);
+	int_t mycol = MYCOL (iam, grid);
+	int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+	double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+	int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+	double** Unzval_br_ptr = Llu->Unzval_br_ptr;
+
+	cudaStream_t CopyStream;
+	LUstruct_gpu *A_gpu;
+	A_gpu = sluGPU->A_gpu;
+	CopyStream = sluGPU->CopyStream;
+
+	int_t kljb = d2Hred->kljb;
+	int_t kijb = d2Hred->kijb;
+	int_t copyL_kljb = d2Hred->copyL_kljb;
+	int_t copyU_kljb = d2Hred->copyU_kljb;
+	int_t mkrow = d2Hred->mkrow;
+	int_t mkcol = d2Hred->mkcol;
+	int_t ksup_size = d2Hred->ksup_size;
+	int_t *kindex;
+	if ((copyL_kljb || copyU_kljb) && last_flag )
+	{
+		double ttx = SuperLU_timer_();
+		cudaStreamSynchronize(CopyStream);
+		SCT->PhiWaitTimer_2 += SuperLU_timer_() - ttx;
+	}
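+	/* The D2H copies into acc_L_buff / acc_U_buff were issued on CopyStream
+	   by sendLUpanelGPU2HOST; once they complete, accumulate the GPU panels
+	   into the host L and U factors with the daxpy calls below. */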
+
+
+	double tt_start = SuperLU_timer_();
+
+
+	if (last_flag)
+	{
+
+		if (mkcol == mycol && Lrowind_bc_ptr[kljb] != NULL )
+		{
+
+			kindex = Lrowind_bc_ptr[kljb];
+			int_t len = kindex[1];
+
+			if (copyL_kljb)
+			{
+
+				double *nzval_host;
+				nzval_host = Lnzval_bc_ptr[kljb];
+				int_t llen = ksup_size * len;
+
+				double alpha = 1;
+				cblas_daxpy (llen, alpha, A_gpu->acc_L_buff, 1, nzval_host, 1);
+			}
+
+		}
+	}
+	if (last_flag)
+	{
+		if (mkrow == myrow && Ufstnz_br_ptr[kijb] != NULL )
+		{
+
+			kindex = Ufstnz_br_ptr[kijb];
+			int_t len = kindex[1];
+
+			if (copyU_kljb)
+			{
+
+				double *nzval_host;
+				nzval_host = Unzval_br_ptr[kijb];
+
+				double alpha = 1;
+				cblas_daxpy (len, alpha, A_gpu->acc_U_buff, 1, nzval_host, 1);
+			}
+
+		}
+	}
+
+	double tt_end = SuperLU_timer_();
+	SCT->AssemblyTimer += tt_end - tt_start;
+	return 0;
+}
+
+
+int_t waitGPUscu(int_t streamId,  sluGPU_t *sluGPU, SCT_t *SCT)
+{
+	double ttx = SuperLU_timer_();
+	cudaStreamSynchronize(sluGPU->funCallStreams[streamId]);
+	SCT->PhiWaitTimer += SuperLU_timer_() - ttx;
+	return 0;
+}
+
+int_t sendLUpanelGPU2HOST(
+    int_t k0,
+    d2Hreduce_t* d2Hred,
+    sluGPU_t *sluGPU
+)
+{
+
+	int_t kljb = d2Hred->kljb;
+	int_t kijb = d2Hred->kijb;
+	int_t copyL_kljb = d2Hred->copyL_kljb;
+	int_t copyU_kljb = d2Hred->copyU_kljb;
+	int_t l_copy_len = d2Hred->l_copy_len;
+	int_t u_copy_len = d2Hred->u_copy_len;
+	cudaStream_t CopyStream = sluGPU->CopyStream;
+	LUstruct_gpu *A_gpu = sluGPU->A_gpu;
+	double tty = SuperLU_timer_();
+	cudaEventRecord(A_gpu->ePCIeD2H_Start[k0], CopyStream);
+	if (copyL_kljb)
+		checkCuda(cudaMemcpyAsync(A_gpu->acc_L_buff, &A_gpu->LnzvalVec[A_gpu->LnzvalPtr_host[kljb]],
+		                          l_copy_len * sizeof(double), cudaMemcpyDeviceToHost, CopyStream ) );
+
+	if (copyU_kljb)
+		checkCuda(cudaMemcpyAsync(A_gpu->acc_U_buff, &A_gpu->UnzvalVec[A_gpu->UnzvalPtr_host[kijb]],
+		                          u_copy_len * sizeof(double), cudaMemcpyDeviceToHost, CopyStream ) );
+	cudaEventRecord(A_gpu->ePCIeD2H_End[k0], CopyStream);
+	A_gpu->tHost_PCIeD2H += SuperLU_timer_() - tty;
+	A_gpu->cPCIeD2H += u_copy_len * sizeof(double) + l_copy_len * sizeof(double);
+
+	return 0;
+}
+
+/* Copy L and U panel data structures from host to the host part of the
+   data structures in A_gpu.
+   GPU is not involved in this routine. */
+int_t sendSCUdataHost2GPU(
+    int_t streamId,
+    int_t* lsub,
+    int_t* usub,
+    double* bigU,
+    int_t bigu_send_size,
+    int_t Remain_lbuf_send_size,
+    sluGPU_t *sluGPU,
+    HyP_t* HyP
+)
+{
+    //{printf("....[enter] sendSCUdataHost2GPU, bigu_send_size %d\n", bigu_send_size); fflush(stdout);}
+
+	int_t usub_len = usub[2];
+	int_t lsub_len = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR;
+    //{printf("....[2] in sendSCUdataHost2GPU, lsub_len %d\n", lsub_len); fflush(stdout);}
+	LUstruct_gpu *A_gpu = sluGPU->A_gpu;
+	memcpy(A_gpu->scubufs[streamId].lsub_buf, lsub, sizeof(int_t)*lsub_len);
+	memcpy(A_gpu->scubufs[streamId].usub_buf, usub, sizeof(int_t)*usub_len);
+	memcpy(A_gpu->scubufs[streamId].Remain_info_host, HyP->Remain_info,
+	       sizeof(Remain_info_t)*HyP->RemainBlk);
+	memcpy(A_gpu->scubufs[streamId].Ublock_info_host, HyP->Ublock_info_Phi,
+	       sizeof(Ublock_info_t)*HyP->num_u_blks_Phi);
+	memcpy(A_gpu->scubufs[streamId].Remain_L_buff_host, HyP->Remain_L_buff,
+	       sizeof(double)*Remain_lbuf_send_size);
+	memcpy(A_gpu->scubufs[streamId].bigU_host, bigU,
+	       sizeof(double)*bigu_send_size);
+
+	return 0;
+}
+
+
+int_t freeSluGPU(sluGPU_t *sluGPU)
+{
+	return 0;
+}
+
+
+void CopyLUToGPU3D (
+    int_t* isNodeInMyGrid,
+    LocalLU_t *A_host, /* distributed LU structure on host */
+    sluGPU_t *sluGPU,
+    Glu_persist_t *Glu_persist, int_t n,
+    gridinfo3d_t *grid3d,
+    int_t buffer_size, /* bigV size on GPU for Schur complement update */
+    int_t bigu_size,
+    int_t ldt
+)
+{
+    gridinfo_t* grid = &(grid3d->grid2d);
+    LUstruct_gpu * A_gpu =  sluGPU->A_gpu;
+    LUstruct_gpu **dA_gpu =  &(sluGPU->dA_gpu);
+
+#ifdef GPU_DEBUG
+    if ( grid3d->iam==0 ) {
+	print_occupany();
+	cudaDeviceProp devProp;
+	cudaGetDeviceProperties(&devProp, 0);
+	printDevProp(devProp);
+    }
+#endif
+	int_t *xsup ;
+	xsup = Glu_persist->xsup;
+	int_t iam = grid->iam;
+	int_t nsupers = Glu_persist->supno[n - 1] + 1;
+	int_t Pc = grid->npcol;
+	int_t Pr = grid->nprow;
+	int_t myrow = MYROW (iam, grid);
+	int_t mycol = MYCOL (iam, grid);
+	int_t mrb =    (nsupers + Pr - 1) / Pr;
+	int_t mcb =    (nsupers + Pc - 1) / Pc;
+	int_t remain_l_max = A_host->bufmax[1];
+
+	/*copies of scalars for easy access*/
+	A_gpu->nsupers = nsupers;
+	A_gpu->ScatterMOPCounter = 0;
+	A_gpu->GemmFLOPCounter = 0;
+	A_gpu->cPCIeH2D = 0;
+	A_gpu->cPCIeD2H = 0;
+	A_gpu->tHost_PCIeH2D = 0;
+	A_gpu->tHost_PCIeD2H = 0;
+
+	/*initializing memory*/
+	size_t max_gpu_memory = get_acc_memory ();
+	size_t gpu_mem_used = 0;
+
+	void *tmp_ptr;
+
+	A_gpu->xsup_host = xsup;
+
+	int_t nCudaStreams = sluGPU->nCudaStreams;
+	/*pinned memory allocations.
+          Paged-locked memory by cudaMallocHost is accessible to the device.*/
+	for (int_t streamId = 0; streamId < nCudaStreams; streamId++ )
+	{
+	    void *tmp_ptr;
+	    checkCudaErrors(cudaMallocHost(  &tmp_ptr, (n) * sizeof(int_t) )) ;
+	    A_gpu->scubufs[streamId].usub_IndirectJ3_host = (int_t*) tmp_ptr;
+
+	    checkCudaErrors(cudaMalloc( &tmp_ptr,  ( n) * sizeof(int_t) ));
+	    A_gpu->scubufs[streamId].usub_IndirectJ3 =  (int_t*) tmp_ptr;
+	    gpu_mem_used += ( n) * sizeof(int_t);
+	    checkCudaErrors(cudaMallocHost(  &tmp_ptr, mrb * sizeof(Remain_info_t) )) ;
+	    A_gpu->scubufs[streamId].Remain_info_host = (Remain_info_t*)tmp_ptr;
+	    checkCudaErrors(cudaMallocHost(  &tmp_ptr, mcb * sizeof(Ublock_info_t) )) ;
+	    A_gpu->scubufs[streamId].Ublock_info_host = (Ublock_info_t*)tmp_ptr;
+	    checkCudaErrors(cudaMallocHost(  &tmp_ptr,  remain_l_max * sizeof(double) )) ;
+	    A_gpu->scubufs[streamId].Remain_L_buff_host = (double *) tmp_ptr;
+	    checkCudaErrors(cudaMallocHost(  &tmp_ptr,  bigu_size * sizeof(double) )) ;
+	    A_gpu->scubufs[streamId].bigU_host = (double *) tmp_ptr;
+
+	    cudaMallocHost ( &tmp_ptr, sizeof(double) * (A_host->bufmax[1]));
+	    A_gpu->acc_L_buff = (double *) tmp_ptr;
+	    cudaMallocHost ( &tmp_ptr, sizeof(double) * (A_host->bufmax[3]));
+	    A_gpu->acc_U_buff = (double *) tmp_ptr;
+	    cudaMallocHost ( &tmp_ptr, sizeof(int_t) * (A_host->bufmax[0]));
+	    A_gpu->scubufs[streamId].lsub_buf =  (int_t *) tmp_ptr;
+	    cudaMallocHost ( &tmp_ptr, sizeof(int_t) * (A_host->bufmax[2]));
+	    A_gpu->scubufs[streamId].usub_buf = (int_t *) tmp_ptr;
+
+	    checkCudaErrors(cudaMalloc(  &tmp_ptr,  remain_l_max * sizeof(double) )) ;
+	    A_gpu->scubufs[streamId].Remain_L_buff = (double *) tmp_ptr;
+	    gpu_mem_used += remain_l_max * sizeof(double);
+	    checkCudaErrors(cudaMalloc(  &tmp_ptr,  bigu_size * sizeof(double) )) ;
+	    A_gpu->scubufs[streamId].bigU = (double *) tmp_ptr;
+	    gpu_mem_used += bigu_size * sizeof(double);
+	    checkCudaErrors(cudaMalloc(  &tmp_ptr,  mcb * sizeof(Ublock_info_t) )) ;
+	    A_gpu->scubufs[streamId].Ublock_info = (Ublock_info_t *) tmp_ptr;
+	    gpu_mem_used += mcb * sizeof(Ublock_info_t);
+	    checkCudaErrors(cudaMalloc(  &tmp_ptr,  mrb * sizeof(Remain_info_t) )) ;
+	    A_gpu->scubufs[streamId].Remain_info = (Remain_info_t *) tmp_ptr;
+	    gpu_mem_used += mrb * sizeof(Remain_info_t);
+	    checkCudaErrors(cudaMalloc(  &tmp_ptr,  buffer_size * sizeof(double))) ;
+	    A_gpu->scubufs[streamId].bigV = (double *) tmp_ptr;
+	    gpu_mem_used += buffer_size * sizeof(double);
+	    checkCudaErrors(cudaMalloc(  &tmp_ptr,  A_host->bufmax[0]*sizeof(int_t))) ;
+	    A_gpu->scubufs[streamId].lsub = (int_t *) tmp_ptr;
+	    gpu_mem_used += A_host->bufmax[0] * sizeof(int_t);
+	    checkCudaErrors(cudaMalloc(  &tmp_ptr,  A_host->bufmax[2]*sizeof(int_t))) ;
+	    A_gpu->scubufs[streamId].usub = (int_t *) tmp_ptr;
+	    gpu_mem_used += A_host->bufmax[2] * sizeof(int_t);
+
+	} /* endfor streamID ... allocate paged-locked memory */
+
+	A_gpu->isOffloaded = (int_t *) malloc (sizeof(int_t) * nsupers);
+	A_gpu->GemmStart  = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers);
+	A_gpu->GemmEnd  = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers);
+	A_gpu->ScatterEnd  = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers);
+	A_gpu->ePCIeH2D = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers);
+	A_gpu->ePCIeD2H_Start = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers);
+	A_gpu->ePCIeD2H_End = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers);
+
+	for (int_t i = 0; i < nsupers; ++i)
+	{
+	    A_gpu->isOffloaded[i] = 0;
+	    checkCudaErrors(cudaEventCreate(&(A_gpu->GemmStart[i])));
+	    checkCudaErrors(cudaEventCreate(&(A_gpu->GemmEnd[i])));
+	    checkCudaErrors(cudaEventCreate(&(A_gpu->ScatterEnd[i])));
+	    checkCudaErrors(cudaEventCreate(&(A_gpu->ePCIeH2D[i])));
+	    checkCudaErrors(cudaEventCreate(&(A_gpu->ePCIeD2H_Start[i])));
+	    checkCudaErrors(cudaEventCreate(&(A_gpu->ePCIeD2H_End[i])));
+	}
+
+	/*---- Copy L data structure to GPU ----*/
+
+	/*pointers and address of local blocks for easy accessibility */
+	local_l_blk_info_t  *local_l_blk_infoVec;
+	int_t  * local_l_blk_infoPtr;
+	local_l_blk_infoPtr =  (int_t *) malloc( CEILING(nsupers, Pc) * sizeof(int_t ) );
+
+	/* First pass: count total L blocks */
+	int_t cum_num_l_blocks = 0;  /* total number of L blocks I own */
+	for (int_t i = 0; i < CEILING(nsupers, Pc); ++i)
+	{  /* going through each block column I own */
+
+	    if (A_host->Lrowind_bc_ptr[i] != NULL && isNodeInMyGrid[i * Pc + mycol] == 1)
+	    {
+		int_t *index = A_host->Lrowind_bc_ptr[i];
+		int_t num_l_blocks = index[0];
+		cum_num_l_blocks += num_l_blocks;
+	    }
+	}
+
+	/*allocating memory*/
+	local_l_blk_infoVec =  (local_l_blk_info_t *) malloc(cum_num_l_blocks * sizeof(local_l_blk_info_t));
+
+	/* Second pass: set up the meta-data for the L structure */
+	cum_num_l_blocks = 0;
+
+	/* initializing vectors */
+	for (int_t i = 0; i < CEILING(nsupers, Pc); ++i)
+	{
+	    if (A_host->Lrowind_bc_ptr[i] != NULL && isNodeInMyGrid[i * Pc + mycol] == 1)
+	    {
+		int_t *index = A_host->Lrowind_bc_ptr[i];
+		int_t num_l_blocks = index[0]; /* # L blocks in this column */
+
+		if (num_l_blocks > 0)
+		{
+
+		    local_l_blk_info_t *local_l_blk_info_i = local_l_blk_infoVec + cum_num_l_blocks;
+		    local_l_blk_infoPtr[i] = cum_num_l_blocks;
+
+		    int_t lptrj = BC_HEADER;
+		    int_t luptrj = 0;
+				
+		    for (int_t j = 0; j < num_l_blocks ; ++j)
+		    {
+
+			int_t ijb = index[lptrj];
+
+			local_l_blk_info_i[j].lib = ijb / Pr;
+			local_l_blk_info_i[j].lptrj = lptrj;
+			local_l_blk_info_i[j].luptrj = luptrj;
+			luptrj += index[lptrj + 1];
+			lptrj += LB_DESCRIPTOR + index[lptrj + 1];
+			
+		    }
+		}
+		cum_num_l_blocks += num_l_blocks;
+	    }
+
+	} /* endfor all block columns */
+
+
+	/* Allocate L memory on GPU, and copy the values from CPU to GPU */
+	checkCudaErrors(cudaMalloc(  &tmp_ptr,  cum_num_l_blocks * sizeof(local_l_blk_info_t))) ;
+	A_gpu->local_l_blk_infoVec = (local_l_blk_info_t *) tmp_ptr;
+	gpu_mem_used += cum_num_l_blocks * sizeof(local_l_blk_info_t);
+	checkCudaErrors(cudaMemcpy( (A_gpu->local_l_blk_infoVec), local_l_blk_infoVec, cum_num_l_blocks * sizeof(local_l_blk_info_t), cudaMemcpyHostToDevice)) ;
+
+	checkCudaErrors(cudaMalloc(  &tmp_ptr,  CEILING(nsupers, Pc)*sizeof(int_t))) ;
+	A_gpu->local_l_blk_infoPtr = (int_t *) tmp_ptr;
+	gpu_mem_used += CEILING(nsupers, Pc) * sizeof(int_t);
+	checkCudaErrors(cudaMemcpy( (A_gpu->local_l_blk_infoPtr), local_l_blk_infoPtr, CEILING(nsupers, Pc)*sizeof(int_t), cudaMemcpyHostToDevice)) ;
+
+
+	/*---- Copy U data structure to GPU ----*/
+
+	local_u_blk_info_t  *local_u_blk_infoVec;
+	int_t  * local_u_blk_infoPtr;
+	local_u_blk_infoPtr =  (int_t *) malloc( CEILING(nsupers, Pr) * sizeof(int_t ) );
+
+	/* First pass: count total U blocks */
+	int_t cum_num_u_blocks = 0;
+
+	for (int_t i = 0; i < CEILING(nsupers, Pr); ++i)
+	{
+
+	    if (A_host->Ufstnz_br_ptr[i] != NULL && isNodeInMyGrid[i * Pr + myrow] == 1)
+	    {
+		int_t *index = A_host->Ufstnz_br_ptr[i];
+		int_t num_u_blocks = index[0];
+		cum_num_u_blocks += num_u_blocks;
+
+	    }
+
+	}
+
+	local_u_blk_infoVec =  (local_u_blk_info_t *) malloc(cum_num_u_blocks * sizeof(local_u_blk_info_t));
+
+	/* Second pass: set up the meta-data for the U structure */
+	cum_num_u_blocks = 0;
+
+	for (int_t i = 0; i < CEILING(nsupers, Pr); ++i)
+	{
+	    if (A_host->Ufstnz_br_ptr[i] != NULL && isNodeInMyGrid[i * Pr + myrow] == 1)
+	    {
+		int_t *index = A_host->Ufstnz_br_ptr[i];
+		int_t num_u_blocks = index[0];
+
+		if (num_u_blocks > 0)
+		{
+		    local_u_blk_info_t  *local_u_blk_info_i = local_u_blk_infoVec + cum_num_u_blocks;
+		    local_u_blk_infoPtr[i] = cum_num_u_blocks;
+
+		    int_t iuip_lib, ruip_lib;
+		    iuip_lib = BR_HEADER;
+		    ruip_lib = 0;
+
+		    for (int_t j = 0; j < num_u_blocks ; ++j)
+		    {
+
+			int_t ijb = index[iuip_lib];
+			local_u_blk_info_i[j].ljb = ijb / Pc;
+			local_u_blk_info_i[j].iuip = iuip_lib;
+			local_u_blk_info_i[j].ruip = ruip_lib;
+
+			ruip_lib += index[iuip_lib + 1];
+			iuip_lib += UB_DESCRIPTOR + SuperSize (ijb);
+
+		    }
+		}
+		cum_num_u_blocks +=  num_u_blocks;
+	    }
+
+	}
+
+	checkCudaErrors(cudaMalloc( &tmp_ptr,  cum_num_u_blocks * sizeof(local_u_blk_info_t))) ;
+	A_gpu->local_u_blk_infoVec = (local_u_blk_info_t *) tmp_ptr;
+	gpu_mem_used += cum_num_u_blocks * sizeof(local_u_blk_info_t);
+	checkCudaErrors(cudaMemcpy( (A_gpu->local_u_blk_infoVec), local_u_blk_infoVec, cum_num_u_blocks * sizeof(local_u_blk_info_t), cudaMemcpyHostToDevice)) ;
+
+	checkCudaErrors(cudaMalloc( &tmp_ptr,  CEILING(nsupers, Pr)*sizeof(int_t))) ;
+	A_gpu->local_u_blk_infoPtr = (int_t *) tmp_ptr;
+	gpu_mem_used += CEILING(nsupers, Pr) * sizeof(int_t);
+	checkCudaErrors(cudaMemcpy( (A_gpu->local_u_blk_infoPtr), local_u_blk_infoPtr, CEILING(nsupers, Pr)*sizeof(int_t), cudaMemcpyHostToDevice)) ;
+
+
+	/* Copy the actual L indices and values */
+	int_t l_k = CEILING( nsupers, grid->npcol ); /* # of local block columns */
+	int_t *temp_LrowindPtr    = (int_t *) malloc(sizeof(int_t) * l_k);
+	int_t *temp_LnzvalPtr     = (int_t *) malloc(sizeof(int_t) * l_k);
+	int_t *Lnzval_size = (int_t *) malloc(sizeof(int_t) * l_k);
+	int_t l_ind_len = 0;
+	int_t l_val_len = 0;
+	for (int_t jb = 0; jb < nsupers; ++jb) /* for each block column ... */
+	{
+	    int_t pc = PCOL( jb, grid );
+	    if (mycol == pc && isNodeInMyGrid[jb] == 1)
+	    {
+		int_t ljb = LBj( jb, grid ); /* Local block number */
+		int_t  *index_host;
+		index_host = A_host->Lrowind_bc_ptr[ljb];
+
+		temp_LrowindPtr[ljb] = l_ind_len;
+		temp_LnzvalPtr[ljb] = l_val_len;        // ###
+		Lnzval_size[ljb] = 0;       //###
+		if (index_host != NULL)
+		{
+		    int_t nrbl  = index_host[0];   /* number of L blocks */
+		    int_t len   = index_host[1];   /* LDA of the nzval[] */
+		    int_t len1  = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
+
+		    /* Global block number is mycol +  ljb*Pc */
+		    int_t nsupc = SuperSize(jb);
+
+		    l_ind_len += len1;
+		    l_val_len += len * nsupc;
+		    Lnzval_size[ljb] = len * nsupc ; // ###
+		}
+		else
+		{
+		    Lnzval_size[ljb] = 0 ; // ###
+		}
+		
+	    }
+	} /* endfor jb = 0 ... */
+
+	/* Copy the actual U indices and values */
+	int_t u_k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
+	int_t *temp_UrowindPtr    = (int_t *) malloc(sizeof(int_t) * u_k);
+	int_t *temp_UnzvalPtr     = (int_t *) malloc(sizeof(int_t) * u_k);
+	int_t *Unzval_size = (int_t *) malloc(sizeof(int_t) * u_k);
+	int_t u_ind_len = 0;
+	int_t u_val_len = 0;
+	for ( int_t lb = 0; lb < u_k; ++lb)
+	{
+	    int_t *index_host;
+	    index_host =  A_host->Ufstnz_br_ptr[lb];
+	    temp_UrowindPtr[lb] = u_ind_len;
+	    temp_UnzvalPtr[lb] = u_val_len;
+	    Unzval_size[lb] = 0;
+	    if (index_host != NULL && isNodeInMyGrid[lb * Pr + myrow] == 1)
+	    {
+		int_t len = index_host[1];
+		int_t len1 = index_host[2];
+
+		u_ind_len += len1;
+		u_val_len += len;
+		Unzval_size[lb] = len;
+	    }
+	    else
+	    {
+		Unzval_size[lb] = 0;
+	    }
+	}
+
+	gpu_mem_used += l_ind_len * sizeof(int_t);
+	gpu_mem_used += 2 * l_k * sizeof(int_t);
+	gpu_mem_used += u_ind_len * sizeof(int_t);
+	gpu_mem_used += 2 * u_k * sizeof(int_t);
+
+	/* The remaining GPU memory will be divided between the L and U value arrays. */
+
+	for (int_t i = 0;  i < l_k; ++i)
+	{
+	    temp_LnzvalPtr[i] = -1;
+	}
+
+	for (int_t i = 0; i < u_k; ++i)
+	{
+	    temp_UnzvalPtr[i] = -1;
+	}
+
+	/* Reset the running offsets into the L and U value arrays. */
+	l_val_len = 0;
+	u_val_len = 0;
+
+	int_t num_gpu_l_blocks = 0;
+	int_t num_gpu_u_blocks = 0;
+	size_t mem_l_block, mem_u_block;
+
+	/* Find the trailing matrix size that can fit into GPU memory */
+	for (int_t i = nsupers - 1; i > -1; --i)
+	{
+	    /* Walk the elimination tree backwards, i.e. in bottom-up order. */
+	    int_t i_sup = A_gpu->perm_c_supno[i];
+
+	    int_t pc = PCOL( i_sup, grid );
+	    if (isNodeInMyGrid[i_sup] == 1)
+	    {
+		if (mycol == pc )
+		{
+		    int_t ljb  = LBj(i_sup, grid);
+		    mem_l_block = sizeof(double) * Lnzval_size[ljb];
+		    if (gpu_mem_used + mem_l_block > max_gpu_memory)
+		    {
+			break;
+		    }
+		    else
+		    {
+			gpu_mem_used += mem_l_block;
+			temp_LnzvalPtr[ljb] = l_val_len;
+			l_val_len += Lnzval_size[ljb];
+			num_gpu_l_blocks++;
+			A_gpu->first_l_block_gpu = i;
+		    }
+		}
+
+		int_t pr = PROW( i_sup, grid );
+		if (myrow == pr)
+		{
+		    int_t lib  = LBi(i_sup, grid);
+		    mem_u_block = sizeof(double) * Unzval_size[lib];
+		    if (gpu_mem_used + mem_u_block > max_gpu_memory)
+		    {
+			break;
+		    }
+		    else
+		    {
+			gpu_mem_used += mem_u_block;
+			temp_UnzvalPtr[lib] = u_val_len;
+			u_val_len += Unzval_size[lib];
+			num_gpu_u_blocks++;
+			A_gpu->first_u_block_gpu = i;
+		    }
+
+		}
+	    } /* endif */
+
+	} /* endfor i .... nsupers */
+
+#if (PRNTlevel>=1)
+	printf("(%d) Number of L blocks in GPU %d, U blocks %d\n", 
+	       grid3d->iam, num_gpu_l_blocks, num_gpu_u_blocks );
+	printf("(%d) elimination order of first block in GPU: L block %d, U block %d\n", 
+	       grid3d->iam, A_gpu->first_l_block_gpu, A_gpu->first_u_block_gpu);
+	printf("(%d) Memory of L %.1f GB, memory for U %.1f GB, Total device memory used %.1f GB, Memory allowed %.1f GB \n", grid3d->iam,
+	       l_val_len * sizeof(double)*1e-9, 
+	       u_val_len * sizeof(double)*1e-9, 
+	       gpu_mem_used*1e-9, max_gpu_memory*1e-9);
+	fflush(stdout);
+#endif
+
+	/* Assemble index vector on temp */
+	int_t *indtemp = (int_t *) malloc(sizeof(int_t) * l_ind_len);
+	for (int_t jb = 0; jb < nsupers; ++jb)   /* for each block column ... */
+	{
+	    int_t pc = PCOL( jb, grid );
+	    if (mycol == pc && isNodeInMyGrid[jb] == 1)
+	    {
+		int_t ljb = LBj( jb, grid ); /* Local block number */
+		int_t  *index_host;
+		index_host = A_host->Lrowind_bc_ptr[ljb];
+
+		if (index_host != NULL)
+		{
+		    int_t nrbl  =   index_host[0]; /* number of L blocks */
+		    int_t len   = index_host[1];   /* LDA of the nzval[] */
+		    int_t len1  = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
+
+		    memcpy(&indtemp[temp_LrowindPtr[ljb]] , index_host, len1 * sizeof(int_t)) ;
+		}
+	    }
+	}
+
+	checkCudaErrors(cudaMalloc( &tmp_ptr,  l_ind_len * sizeof(int_t))) ;
+	A_gpu->LrowindVec = (int_t *) tmp_ptr;
+	checkCudaErrors(cudaMemcpy( (A_gpu->LrowindVec), indtemp, l_ind_len * sizeof(int_t), cudaMemcpyHostToDevice)) ;
+
+	checkCudaErrors(cudaMalloc(  &tmp_ptr,  l_val_len * sizeof(double)));
+	A_gpu->LnzvalVec = (double *) tmp_ptr;
+	checkCudaErrors(cudaMemset( (A_gpu->LnzvalVec), 0, l_val_len * sizeof(double)));
+
+	checkCudaErrors(cudaMalloc(  &tmp_ptr,  l_k * sizeof(int_t))) ;
+	A_gpu->LrowindPtr = (int_t *) tmp_ptr;
+	checkCudaErrors(cudaMemcpy( (A_gpu->LrowindPtr), temp_LrowindPtr, l_k * sizeof(int_t), cudaMemcpyHostToDevice)) ;
+
+	checkCudaErrors(cudaMalloc(  &tmp_ptr,  l_k * sizeof(int_t))) ;
+	A_gpu->LnzvalPtr = (int_t *) tmp_ptr;
+	checkCudaErrors(cudaMemcpy( (A_gpu->LnzvalPtr), temp_LnzvalPtr, l_k * sizeof(int_t), cudaMemcpyHostToDevice)) ;
+
+	A_gpu->LnzvalPtr_host = temp_LnzvalPtr;
+
+	int_t *indtemp1 = (int_t *) malloc(sizeof(int_t) * u_ind_len);
+	for ( int_t lb = 0; lb < u_k; ++lb)
+	{
+		int_t *index_host;
+		index_host =  A_host->Ufstnz_br_ptr[lb];
+
+		if (index_host != NULL && isNodeInMyGrid[lb * Pr + myrow] == 1)
+		{
+			int_t len1 = index_host[2];
+			memcpy(&indtemp1[temp_UrowindPtr[lb]] , index_host, sizeof(int_t)*len1);
+
+		}
+	}
+
+	checkCudaErrors(cudaMalloc(  &tmp_ptr,  u_ind_len * sizeof(int_t))) ;
+	A_gpu->UrowindVec = (int_t *) tmp_ptr;
+	checkCudaErrors(cudaMemcpy( (A_gpu->UrowindVec), indtemp1, u_ind_len * sizeof(int_t), cudaMemcpyHostToDevice)) ;
+
+	checkCudaErrors(cudaMalloc(  &tmp_ptr,  u_val_len * sizeof(double)));
+	A_gpu->UnzvalVec = (double *) tmp_ptr;
+	checkCudaErrors(cudaMemset( (A_gpu->UnzvalVec), 0, u_val_len * sizeof(double)));
+
+	checkCudaErrors(cudaMalloc(  &tmp_ptr,  u_k * sizeof(int_t))) ;
+	A_gpu->UrowindPtr = (int_t *) tmp_ptr;
+	checkCudaErrors(cudaMemcpy( (A_gpu->UrowindPtr), temp_UrowindPtr, u_k * sizeof(int_t), cudaMemcpyHostToDevice)) ;
+
+	A_gpu->UnzvalPtr_host = temp_UnzvalPtr;
+
+	checkCudaErrors(cudaMalloc(  &tmp_ptr,  u_k * sizeof(int_t))) ;
+	A_gpu->UnzvalPtr = (int_t *) tmp_ptr;
+	checkCudaErrors(cudaMemcpy( (A_gpu->UnzvalPtr), temp_UnzvalPtr, u_k * sizeof(int_t), cudaMemcpyHostToDevice)) ;
+
+	checkCudaErrors(cudaMalloc(  &tmp_ptr,  (nsupers + 1)*sizeof(int_t))) ;
+	A_gpu->xsup = (int_t *) tmp_ptr;
+	checkCudaErrors(cudaMemcpy( (A_gpu->xsup), xsup, (nsupers + 1)*sizeof(int_t), cudaMemcpyHostToDevice)) ;
+
+	checkCudaErrors(cudaMalloc( &tmp_ptr,  sizeof(LUstruct_gpu))) ;
+	*dA_gpu = (LUstruct_gpu *) tmp_ptr;
+	checkCudaErrors(cudaMemcpy( *dA_gpu, A_gpu, sizeof(LUstruct_gpu), cudaMemcpyHostToDevice)) ;
+
+	/* Free the host-side staging arrays.  temp_LnzvalPtr / temp_UnzvalPtr
+	   are NOT freed: they were saved in A_gpu->{L,U}nzvalPtr_host above. */
+	free (local_l_blk_infoVec);
+	free (local_l_blk_infoPtr);
+	free (local_u_blk_infoVec);
+	free (local_u_blk_infoPtr);
+	free (Lnzval_size);
+	free (Unzval_size);
+	free (temp_LrowindPtr);
+	free (temp_UrowindPtr);
+	free (indtemp1);
+	free (indtemp);
+
+} /* end CopyLUToGPU3D */
+
+
+
+int_t reduceAllAncestors3d_GPU(int_t ilvl, int_t* myNodeCount,
+                               int_t** treePerm,
+                               LUValSubBuf_t*LUvsb,
+                               LUstruct_t* LUstruct,
+                               gridinfo3d_t* grid3d,
+                               sluGPU_t *sluGPU,
+                               d2Hreduce_t* d2Hred,
+                               factStat_t *factStat,
+                               HyP_t* HyP,
+                               SCT_t* SCT )
+{
+	int_t superlu_acc_offload = HyP->superlu_acc_offload;
+	int_t maxLvl = log2i( (int_t) grid3d->zscp.Np) + 1;
+	int_t myGrid = grid3d->zscp.Iam;
+	gridinfo_t* grid = &(grid3d->grid2d);
+	int_t* gpuLUreduced = factStat->gpuLUreduced;
+
+	int_t sender;
+	if ((myGrid % (1 << (ilvl + 1))) == 0)
+	{
+		sender = myGrid + (1 << ilvl);
+	}
+	else
+	{
+		sender = myGrid;
+	}
+
+	/* Reduce all the ancestors from the GPU; first synchronize all CUDA
+	   streams so that pending offloads have completed. */
+	if (myGrid == sender && superlu_acc_offload)
+	{
+		for (int_t streamId = 0; streamId < sluGPU->nCudaStreams; streamId++)
+		{
+			double ttx = SuperLU_timer_();
+			cudaStreamSynchronize(sluGPU->funCallStreams[streamId]);
+			SCT->PhiWaitTimer += SuperLU_timer_() - ttx;
+			sluGPU->lastOffloadStream[streamId] = -1;
+		}
+
+		for (int_t alvl = ilvl + 1; alvl < maxLvl; ++alvl)
+		{
+			/* code */
+			// int_t atree = myTreeIdxs[alvl];
+			int_t nsAncestor = myNodeCount[alvl];
+			int_t* cAncestorList = treePerm[alvl];
+
+			for (int_t node = 0; node < nsAncestor; node++ )
+			{
+				int_t k = cAncestorList[node];
+				if (!gpuLUreduced[k])
+				{
+
+					initD2Hreduce(k, d2Hred, 1,
+					              HyP, sluGPU, grid, LUstruct, SCT);
+					int_t copyL_kljb = d2Hred->copyL_kljb;
+					int_t copyU_kljb = d2Hred->copyU_kljb;
+
+					double tt_start1 = SuperLU_timer_();
+					if (copyL_kljb || copyU_kljb) SCT->PhiMemCpyCounter++;
+					sendLUpanelGPU2HOST(k, d2Hred, sluGPU);
+					/* Reduce the LU panels from GPU */
+					reduceGPUlu(1, d2Hred,
+					            sluGPU, SCT, grid, LUstruct);
+					SCT->PhiMemCpyTimer += SuperLU_timer_() - tt_start1;
+
+					gpuLUreduced[k] = 1;
+				}
+			}
+		}
+	} /*if (myGrid == sender)*/
+
+	reduceAllAncestors3d(ilvl, myNodeCount, treePerm,
+	                     LUvsb, LUstruct, grid3d, SCT );
+	return 0;
+}
+
+
+void syncAllfunCallStreams(sluGPU_t* sluGPU, SCT_t* SCT)
+{
+	for (int_t streamId = 0; streamId < sluGPU->nCudaStreams; streamId++)
+	{
+		double ttx = SuperLU_timer_();
+		cudaStreamSynchronize(sluGPU->funCallStreams[streamId]);
+		SCT->PhiWaitTimer += SuperLU_timer_() - ttx;
+		sluGPU->lastOffloadStream[streamId] = -1;
+	}
+}
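
For reference, the trailing-matrix admission loop in CopyLUToGPU3D above
reduces to a greedy scan over supernodes in reverse elimination order: keep
assigning panels to the GPU until the byte budget runs out.  A minimal
standalone sketch of that policy (panel_sz, budget, and first_on_gpu are
illustrative names, not SuperLU types):

    #include <stdio.h>

    int main(void)
    {
        const int    nsupers      = 8;
        const size_t budget       = 600;   /* bytes available on the GPU */
        const size_t panel_sz[8]  = {50, 80, 120, 90, 200, 60, 150, 100};

        size_t used = 0;
        int first_on_gpu = nsupers;        /* elimination index of first GPU panel */

        for (int i = nsupers - 1; i >= 0; --i)  /* bottom-up: trailing matrix first */
        {
            if (used + panel_sz[i] > budget)
                break;                     /* budget exhausted: the rest stays on CPU */
            used += panel_sz[i];
            first_on_gpu = i;
        }
        printf("panels %d..%d on GPU, %zu of %zu bytes used\n",
               first_on_gpu, nsupers - 1, used, budget);
        return 0;
    }

Panels past the break stay on the host, which is why the temp_LnzvalPtr /
temp_UnzvalPtr entries above keep their initial -1 for blocks that never fit.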
diff --git a/SRC/treeFactorizationGPU.c b/SRC/treeFactorizationGPU.c
new file mode 100644
index 00000000..43aa7089
--- /dev/null
+++ b/SRC/treeFactorizationGPU.c
@@ -0,0 +1,583 @@
+#include "treeFactorization.h"
+#include "trfCommWrapper.h"
+#include "lustruct_gpu.h"
+#ifdef __INTEL_COMPILER
+#include "mkl.h"
+#else
+//#include "cblas.h"
+#endif
+
+
+/* 
+/-- num_u_blks--\ /-- num_u_blks_Phi --\
+----------------------------------------
+|  host_cols    ||    GPU   |   host   |
+----------------------------------------
+                  ^          ^
+                  0          jj_cpu
+*/
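+/* Worked example (illustrative numbers): with num_u_blks = 3 host columns
+   and num_u_blks_Phi = 5 Phi columns, jj_cpu = 5 offloads every Phi column
+   to the GPU, while jj_cpu = 2 would leave Phi columns 2..4 to the CPU
+   fallback loop over (num_u_blks_Phi - jj_cpu) further below. */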
+static int_t getAccUPartition( HyP_t* HyP )
+{
+    /* Sherry: what if num_u_blks_Phi == 0 ? Need to fix the bug */
+    int_t total_cols_1  = HyP->Ublock_info_Phi[HyP->num_u_blks_Phi - 1].full_u_cols;
+    
+    int_t host_cols     = HyP->Ublock_info[HyP->num_u_blks - 1].full_u_cols;
+    double cpu_time_0   = estimate_cpu_time(HyP->Lnbrow, total_cols_1, HyP->ldu_Phi) +
+                          estimate_cpu_time(HyP->Rnbrow, host_cols, HyP->ldu)
+                          + estimate_cpu_time(HyP->Lnbrow, host_cols, HyP->ldu) ;
+
+    int jj_cpu;
+
+#if 0  /* Ignore these estimates */
+    jj_cpu = tuned_partition(HyP->num_u_blks_Phi, HyP->Ublock_info_Phi,
+                                   HyP->Remain_info, HyP->RemainBlk, cpu_time_0, HyP->Rnbrow, HyP->ldu_Phi );
+#else /* Sherry: new */
+    jj_cpu = HyP->num_u_blks_Phi;
+#endif
+
+    if (jj_cpu != 0  && HyP->Rnbrow > 0)  // ###
+    {
+        HyP->offloadCondition = 1;
+    }
+    else
+    {
+        HyP->offloadCondition = 0;
+        jj_cpu = 0;        // ###
+    }
+
+    return jj_cpu;
+
+}
+
+int_t sparseTreeFactor_ASYNC_GPU(
+    sForest_t* sforest,
+    commRequests_t **comReqss, // lists of communication requests,
+                               // size = maxEtree level
+    scuBufs_t *scuBufs, // contains buffers for schur complement update
+    packLUInfo_t*packLUInfo,
+    msgs_t**msgss,                  // size = num Look ahead
+    LUValSubBuf_t**LUvsbs,          // size = num Look ahead
+    diagFactBufs_t **dFBufs,        // size = maxEtree level
+    factStat_t *factStat,
+    factNodelists_t  *fNlists,
+    gEtreeInfo_t*   gEtreeInfo,     // global etree info
+    superlu_options_t *options,
+    int_t * gIperm_c_supno,
+    int_t ldt,
+    sluGPU_t *sluGPU,
+    d2Hreduce_t* d2Hred,
+    HyP_t* HyP,
+    LUstruct_t *LUstruct, gridinfo3d_t * grid3d, SuperLUStat_t *stat,
+    double thresh,  SCT_t *SCT,
+    int *info
+)
+{
+// sforest.nNodes, sforest.nodeList,
+    // &sforest.topoInfo,
+    int_t nnodes = sforest->nNodes ; // number of nodes in supernodal etree
+    if (nnodes < 1)
+    {
+        return 1;
+    }
+
+    int_t *perm_c_supno = sforest->nodeList ;  // list of nodes in the order of factorization
+    treeTopoInfo_t* treeTopoInfo = &sforest->topoInfo;
+    int_t* myIperm = treeTopoInfo->myIperm;
+
+    gridinfo_t* grid = &(grid3d->grid2d);
+    /*main loop over all the levels*/
+
+    int_t maxTopoLevel = treeTopoInfo->numLvl;
+    int_t* eTreeTopLims = treeTopoInfo->eTreeTopLims;
+    int_t * IrecvPlcd_D = factStat->IrecvPlcd_D;
+    int_t* factored_D = factStat->factored_D;
+    int_t * factored_L = factStat->factored_L;
+    int_t * factored_U = factStat->factored_U;
+    int_t* IbcastPanel_L = factStat->IbcastPanel_L;
+    int_t* IbcastPanel_U = factStat->IbcastPanel_U;
+    int_t* gpuLUreduced = factStat->gpuLUreduced;
+    int_t* xsup = LUstruct->Glu_persist->xsup;
+
+    int_t numLAMax = getNumLookAhead();
+    int_t numLA = numLAMax;  // number of look-ahead panels
+    int_t superlu_acc_offload = HyP->superlu_acc_offload;
+    int_t last_flag = 1;  /* for updating nsuper-1 only once */
+    int_t nCudaStreams = sluGPU->nCudaStreams; // number of cuda streams
+
+    if (superlu_acc_offload) syncAllfunCallStreams(sluGPU, SCT);
+
+    /* Go through each leaf node */
+    for (int_t k0 = 0; k0 < eTreeTopLims[1]; ++k0)
+    {
+        int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+        int_t offset = k0;
+        /* k-th diagonal factorization */
+
+        /* If LU panels from GPU are not reduced, then reduce
+	   them before diagonal factorization */
+        if (!gpuLUreduced[k] && superlu_acc_offload)
+        {
+            double tt_start1 = SuperLU_timer_();
+
+            initD2Hreduce(k, d2Hred, last_flag,
+                          HyP, sluGPU, grid, LUstruct, SCT);
+            int_t copyL_kljb = d2Hred->copyL_kljb;
+            int_t copyU_kljb = d2Hred->copyU_kljb;
+
+            if (copyL_kljb || copyU_kljb) SCT->PhiMemCpyCounter++;
+            sendLUpanelGPU2HOST( k, d2Hred, sluGPU);
+
+            reduceGPUlu(last_flag,  d2Hred,
+                        sluGPU, SCT, grid, LUstruct);
+
+            gpuLUreduced[k] = 1;
+            SCT->PhiMemCpyTimer += SuperLU_timer_() - tt_start1;
+        }
+
+	double t1 = SuperLU_timer_();
+
+        /*Now factor and broadcast diagonal block*/
+        sDiagFactIBCast(k,  dFBufs[offset], factStat, comReqss[offset], grid,
+                        options, thresh, LUstruct, stat, info, SCT );
+        factored_D[k] = 1;
+	
+	SCT->pdgstrf2_timer += (SuperLU_timer_() - t1);
+    } /* for all leaves ... */
+
+    //printf(".. SparseFactor_GPU: after leaves\n"); fflush(stdout);
+
+    /* Process supernodal etree level by level */
+    for (int_t topoLvl = 0; topoLvl < maxTopoLevel; ++topoLvl)
+        // for (int_t topoLvl = 0; topoLvl < 1; ++topoLvl)
+    {
+      //      printf("(%d) factor level %d, maxTopoLevel %d\n",grid3d->iam,topoLvl,maxTopoLevel); fflush(stdout);
+        /* code */
+        int_t k_st = eTreeTopLims[topoLvl];
+        int_t k_end = eTreeTopLims[topoLvl + 1];
+	
+	/* Process all the nodes in 'topoLvl': diagonal factorization */
+        for (int_t k0 = k_st; k0 < k_end; ++k0)
+        {
+            int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+            int_t offset = k0 - k_st;
+
+            if (!factored_D[k] )
+            {
+                /*If LU panels from GPU are not reduced then reduce
+		  them before diagonal factorization*/
+                if (!gpuLUreduced[k] && superlu_acc_offload)
+                {
+                    double tt_start1 = SuperLU_timer_();
+                    initD2Hreduce(k, d2Hred, last_flag,
+                                  HyP, sluGPU, grid, LUstruct, SCT);
+                    int_t copyL_kljb = d2Hred->copyL_kljb;
+                    int_t copyU_kljb = d2Hred->copyU_kljb;
+
+                    if (copyL_kljb || copyU_kljb) SCT->PhiMemCpyCounter++;
+                    sendLUpanelGPU2HOST( k, d2Hred, sluGPU);
+                    /*
+                        Reduce the LU panels from GPU
+                    */
+                    reduceGPUlu(last_flag, d2Hred,
+                                sluGPU, SCT, grid, LUstruct);
+
+                    gpuLUreduced[k] = 1;
+                    SCT->PhiMemCpyTimer += SuperLU_timer_() - tt_start1;
+                }
+
+		double t1 = SuperLU_timer_();
+		/* Factor diagonal block on CPU */
+                sDiagFactIBCast(k,  dFBufs[offset], factStat, comReqss[offset], grid,
+                                options, thresh, LUstruct, stat, info, SCT );
+		SCT->pdgstrf2_timer += (SuperLU_timer_() - t1);
+            }
+        } /* for all nodes in this level */
+
+	//printf(".. SparseFactor_GPU: after diag factorization\n"); fflush(stdout);
+
+        double t_apt = SuperLU_timer_(); /* Async Pipe Timer */
+
+	/* Process all the nodes in 'topoLvl': panel updates on CPU */
+        for (int_t k0 = k_st; k0 < k_end; ++k0)
+        {
+            int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+            int_t offset = k0 - k_st;
+
+            /*L update */
+            if (factored_L[k] == 0)
+            {   sLPanelUpdate(k,  dFBufs[offset], factStat,
+                              comReqss[offset], grid, LUstruct, SCT);
+                factored_L[k] = 1;
+            }
+            /*U update*/
+            if (factored_U[k] == 0)
+            {   sUPanelUpdate(k, ldt, dFBufs[offset], factStat, comReqss[offset], scuBufs,
+                              packLUInfo, grid, LUstruct, stat, SCT);
+                factored_U[k] = 1;
+            }
+        } /* end panel update */
+
+	//printf(".. after CPU panel updates. numLA %d\n", numLA); fflush(stdout);
+
+	/* Process all the panels in look-ahead window: 
+	   broadcast L and U panels. */
+        for (int_t k0 = k_st; k0 < SUPERLU_MIN(k_end, k_st + numLA); ++k0)
+        {
+            int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+            int_t offset = k0 % numLA;
+            /* diagonal factorization */
+
+            /*L Ibcast*/
+            if (IbcastPanel_L[k] == 0)
+            {
+                sIBcastRecvLPanel( k, comReqss[offset],  LUvsbs[offset],
+                                   msgss[offset], factStat, grid,
+				   LUstruct, SCT );
+                IbcastPanel_L[k] = 1; /*for consistency; unused later*/
+            }
+
+            /*U Ibcast*/
+            if (IbcastPanel_U[k] == 0)
+            {
+                sIBcastRecvUPanel( k, comReqss[offset],  LUvsbs[offset],
+                                   msgss[offset], factStat, grid,
+				   LUstruct, SCT );
+                IbcastPanel_U[k] = 1;
+            }
+        } /* end for panels in look-ahead window */
+
+	//printf(".. after CPU look-ahead updates\n"); fflush(stdout);
+
+        // if (topoLvl) SCT->tAsyncPipeTail += SuperLU_timer_() - t_apt;
+        SCT->tAsyncPipeTail += (SuperLU_timer_() - t_apt);
+
+	/* Process all the nodes in level 'topoLvl': Schur complement update
+	   (no MPI communication)  */
+        for (int_t k0 = k_st; k0 < k_end; ++k0)
+        {
+            int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+            int_t offset = k0 % numLA;
+
+            double tsch = SuperLU_timer_();
+
+            /*Wait for L panel*/
+            sWaitL(k, comReqss[offset], msgss[offset], grid, LUstruct, SCT);
+
+            /*Wait for U panel*/
+            sWaitU(k, comReqss[offset], msgss[offset], grid, LUstruct, SCT);
+
+            int_t LU_nonempty = SchurComplementSetupGPU(k,
+                                msgss[offset], packLUInfo,
+                                myIperm, gIperm_c_supno, perm_c_supno,
+                                gEtreeInfo, fNlists, scuBufs,
+				LUvsbs[offset], grid, LUstruct, HyP);
+            // initializing D2H data transfer. D2H = Device To Host.
+            int_t jj_cpu; /* limit between CPU and GPU */
+            if (superlu_acc_offload)
+            {
+                jj_cpu = getAccUPartition (HyP);
+
+                if (jj_cpu > 0)
+                    jj_cpu = HyP->num_u_blks_Phi;
+
+		/* Sherry force this --> */
+		jj_cpu = HyP->num_u_blks_Phi;  // -1 ??
+		HyP->offloadCondition = 1;
+            }
+            else
+            {
+                jj_cpu = 0;
+            }
+
+            // int_t jj_cpu = HyP->num_u_blks_Phi-1;
+            // if (HyP->Rnbrow > 0 && jj_cpu>=0)
+            //     HyP->offloadCondition = 1;
+            // else
+            //     HyP->offloadCondition = 0;
+            //     jj_cpu=0;
+#if 0
+	    if ( HyP->offloadCondition ) {
+	    printf("(%d) k=%d, nub=%d, nub_host=%d, nub_phi=%d, jj_cpu %d, offloadCondition %d\n",
+		   grid3d->iam, k, HyP->num_u_blks+HyP->num_u_blks_Phi ,
+		   HyP->num_u_blks, HyP->num_u_blks_Phi,
+		   jj_cpu, HyP->offloadCondition);
+	    fflush(stdout);
+	    }
+#endif
+            scuStatUpdate( SuperSize(k), HyP,  SCT, stat);
+
+            int_t offload_condition = HyP->offloadCondition;
+            uPanelInfo_t* uPanelInfo = packLUInfo->uPanelInfo;
+            lPanelInfo_t* lPanelInfo = packLUInfo->lPanelInfo;
+            int_t *lsub = lPanelInfo->lsub;
+            int_t *usub = uPanelInfo->usub;
+            int_t* indirect  = fNlists->indirect;
+            int_t* indirect2  = fNlists->indirect2;
+
+            /* Schur Complement Update */
+
+            int_t knsupc = SuperSize(k);
+            int_t klst = FstBlockC (k + 1);
+
+            double* bigV = scuBufs->bigV;
+            double* bigU = scuBufs->bigU;
+
+	    double t1 = SuperLU_timer_();
+
+#pragma omp parallel  /* Look-ahead update on CPU */
+            {
+		int_t thread_id = omp_get_thread_num();
+
+#pragma omp for	 schedule(dynamic,2) nowait
+                for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks; ++ij)
+                {
+                    int_t j   = ij / HyP->lookAheadBlk ;
+                    int_t lb  = ij % HyP->lookAheadBlk;
+                    block_gemm_scatterTopLeft( lb,  j, bigV, knsupc, klst, lsub,
+                                               usub, ldt,  indirect, indirect2, HyP, LUstruct, grid, SCT, stat);
+                }
+
+#pragma omp for  schedule(dynamic,2) nowait
+                for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks_Phi; ++ij)
+		{
+		    int_t j   = ij / HyP->lookAheadBlk ;
+                    int_t lb  = ij % HyP->lookAheadBlk;
+                    block_gemm_scatterTopRight( lb,  j, bigV, knsupc, klst, lsub,
+                                                usub, ldt,  indirect, indirect2, HyP, LUstruct, grid, SCT, stat);
+                }
+
+#pragma omp for	 schedule(dynamic,2) nowait
+                for (int_t ij = 0; ij < HyP->RemainBlk * HyP->num_u_blks; ++ij)
+                {
+                    int_t j   = ij / HyP->RemainBlk;
+                    int_t lb  = ij % HyP->RemainBlk;
+                    block_gemm_scatterBottomLeft( lb,  j, bigV, knsupc, klst, lsub,
+						  usub, ldt,  indirect, indirect2, HyP, LUstruct, grid, SCT, stat);
+                } /* for int_t ij = ... */
+            } /* end parallel region ... end look-ahead update */
+
+	    SCT->lookaheadupdatetimer += (SuperLU_timer_() - t1);
+
+	    //printf("... after look-ahead update, topoLvl %d\t maxTopoLevel %d\n", topoLvl, maxTopoLevel); fflush(stdout);
+
+	    /* Reduce the L & U panels from GPU to CPU.       */
+            if (topoLvl < maxTopoLevel - 1) { /* Not the root */
+	        int_t k_parent = gEtreeInfo->setree[k];
+                gEtreeInfo->numChildLeft[k_parent]--;
+                if (gEtreeInfo->numChildLeft[k_parent] == 0 && k_parent < nnodes)
+		{   /* if k is the last child in this level */
+                    int_t k0_parent =  myIperm[k_parent];
+                    if (k0_parent > 0)
+		    {
+                        /* code */
+      //      printf("Before assert: iam %d, k %d, k_parent %d, k0_parent %d, nnodes %d\n", grid3d->iam, k, k_parent, k0_parent, nnodes); fflush(stdout);
+	//	      exit(-1);
+			assert(k0_parent < nnodes);
+                        int_t offset = k0_parent - k_end;
+                        if (!gpuLUreduced[k_parent] && superlu_acc_offload)
+                        {
+                            double tt_start1 = SuperLU_timer_();
+
+                            initD2Hreduce(k_parent, d2Hred, last_flag,
+                                          HyP, sluGPU, grid, LUstruct, SCT);
+                            int_t copyL_kljb = d2Hred->copyL_kljb;
+                            int_t copyU_kljb = d2Hred->copyU_kljb;
+
+                            if (copyL_kljb || copyU_kljb) SCT->PhiMemCpyCounter++;
+                            sendLUpanelGPU2HOST( k_parent, d2Hred, sluGPU);
+
+                            /* Reduce the LU panels from GPU */
+                            reduceGPUlu(last_flag,  d2Hred,
+                                        sluGPU, SCT, grid, LUstruct);
+
+                            gpuLUreduced[k_parent] = 1;
+                            SCT->PhiMemCpyTimer += SuperLU_timer_() - tt_start1;
+			}
+			
+			/* Factorize diagonal block on CPU */
+                        sDiagFactIBCast(k_parent,  dFBufs[offset],
+					factStat, comReqss[offset], grid,
+                                        options, thresh, LUstruct, stat,
+					info, SCT );
+                        factored_D[k_parent] = 1;
+		    } /* end if k0_parent > 0 */
+
+                } /* end if all children are done */
+            } /* end if non-root */
+
+#pragma omp parallel
+            {
+		/* Master thread performs Schur complement update on GPU. */
+#pragma omp master
+                {
+                    if (superlu_acc_offload)
+                    {
+                        int_t thread_id = omp_get_thread_num();
+			double t1 = SuperLU_timer_();
+
+                        if (offload_condition)
+                        {
+			    SCT->datatransfer_count++;
+                            int_t streamId  = k0 % nCudaStreams;
+
+                            /*wait for previous offload to get finished*/
+                            if (sluGPU->lastOffloadStream[streamId] != -1)
+                            {
+                                waitGPUscu(streamId, sluGPU, SCT);
+                                sluGPU->lastOffloadStream[streamId] = -1;
+                            }
+
+                            int_t Remain_lbuf_send_size = knsupc * HyP->Rnbrow;
+                            int_t bigu_send_size = jj_cpu < 1 ? 0 : HyP->ldu_Phi * HyP->Ublock_info_Phi[jj_cpu - 1].full_u_cols;
+                            assert(bigu_send_size < HyP->bigu_size);
+
+                            /* !! Sherry add the test to avoid seg_fault inside sendSCUdataHost2GPU */
+                            if ( bigu_send_size > 0 ) {
+                                sendSCUdataHost2GPU(streamId, lsub, usub, bigU, bigu_send_size,
+                                                    Remain_lbuf_send_size, sluGPU, HyP);
+
+                                sluGPU->lastOffloadStream[streamId] = k0;
+                                int_t usub_len = usub[2];
+                                int_t lsub_len = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR;
+                                //{printf("... before SchurCompUpdate_GPU, bigu_send_size %d\n", bigu_send_size); fflush(stdout);}
+
+                                SchurCompUpdate_GPU(
+                                    streamId, 0, jj_cpu, klst, knsupc, HyP->Rnbrow, HyP->RemainBlk,
+                                    Remain_lbuf_send_size, bigu_send_size, HyP->ldu_Phi, HyP->num_u_blks_Phi,
+                                    HyP->buffer_size, lsub_len, usub_len, ldt, k0, sluGPU, grid);
+                            } /* endif bigu_send_size > 0 */
+
+                            // sendLUpanelGPU2HOST( k0, d2Hred, sluGPU);
+
+                            SCT->schurPhiCallCount ++;
+                            HyP->jj_cpu = jj_cpu;
+                            updateDirtyBit(k0, HyP, grid);
+                        } /* endif (offload_condition) */
+
+			double t2 = SuperLU_timer_();
+                        SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1); /* not used */
+                        SCT->CPUOffloadTimer += (double) (t2 - t1); // Sherry added
+
+                    } /* endif (superlu_acc_offload) */
+
+                } /* end omp master thread */
+
+#pragma omp for  schedule(dynamic,2) nowait
+		/* The following update is on CPU. Should not be necessary now,
+		   because we set jj_cpu equal to num_u_blks_Phi.      		*/
+                for (int_t ij = 0; ij < HyP->RemainBlk * (HyP->num_u_blks_Phi - jj_cpu) ; ++ij)
+                {
+		    //printf(".. WARNING: should NOT get here\n");
+                    int_t j   = ij / HyP->RemainBlk + jj_cpu;
+                    int_t lb  = ij % HyP->RemainBlk;
+                    block_gemm_scatterBottomRight( lb,  j, bigV, knsupc, klst, lsub,
+                                                   usub, ldt,  indirect, indirect2, HyP, LUstruct, grid, SCT, stat);
+                } /* for int_t ij = ... */
+
+            } /* end omp parallel region */
+
+            //SCT->NetSchurUpTimer += SuperLU_timer_() - tsch;
+
+            // finish waiting for diag block send
+            int_t abs_offset = k0 - k_st;
+            sWait_LUDiagSend(k,  comReqss[abs_offset], grid, SCT);
+
+            /*Schedule next I bcasts within look-ahead window */
+            for (int_t next_k0 = k0 + 1; next_k0 < SUPERLU_MIN( k0 + 1 + numLA, nnodes); ++next_k0)
+            {
+                /* code */
+                int_t next_k = perm_c_supno[next_k0];
+                int_t offset = next_k0 % numLA;
+
+                /*L Ibcast*/
+                if (IbcastPanel_L[next_k] == 0 && factored_L[next_k])
+                {
+                    sIBcastRecvLPanel( next_k, comReqss[offset],
+				       LUvsbs[offset],  msgss[offset],
+				       factStat, grid, LUstruct, SCT );
+                    IbcastPanel_L[next_k] = 1; /*will be used later*/
+                }
+                /*U Ibcast*/
+                if (IbcastPanel_U[next_k] == 0 && factored_U[next_k])
+                {
+                    sIBcastRecvUPanel( next_k, comReqss[offset],
+				       LUvsbs[offset],  msgss[offset],
+				       factStat, grid, LUstruct, SCT );
+                    IbcastPanel_U[next_k] = 1;
+                }
+            } /* end for look-ahead window */
+
+            if (topoLvl < maxTopoLevel - 1) /* not root */
+            {
+                /*look-ahead LU factorization*/
+                int_t kx_st = eTreeTopLims[topoLvl + 1];
+                int_t kx_end = eTreeTopLims[topoLvl + 2];
+                for (int_t k0x = kx_st; k0x < kx_end; k0x++)
+                {
+                    /* code */
+                    int_t kx = perm_c_supno[k0x];
+                    int_t offset = k0x - kx_st;
+                    if (IrecvPlcd_D[kx] && !factored_L[kx])
+                    {
+                        /*check if received*/
+                        int_t recvUDiag = checkRecvUDiag(kx, comReqss[offset],
+                                                         grid, SCT);
+                        if (recvUDiag)
+                        {
+                            sLPanelTrSolve( kx,  dFBufs[offset],
+                                            factStat, comReqss[offset],
+                                            grid, LUstruct, SCT);
+                            factored_L[kx] = 1;
+
+                            /*check if an L_Ibcast is possible*/
+
+                            if (IbcastPanel_L[kx] == 0 &&
+                                    k0x - k0 < numLA + 1  && // is within look-ahead window
+                                    factored_L[kx])
+                            {
+                                int_t offset1 = k0x % numLA;
+                                sIBcastRecvLPanel( kx, comReqss[offset1],  LUvsbs[offset1],
+                                                   msgss[offset1], factStat, grid, LUstruct, SCT );
+                                IbcastPanel_L[kx] = 1; /*will be used later*/
+                            }
+
+                        }
+                    }
+
+                    if (IrecvPlcd_D[kx] && !factored_U[kx])
+                    {
+                        /*check if received*/
+                        int_t recvLDiag = checkRecvLDiag( kx, comReqss[offset],
+                                                          grid, SCT);
+                        if (recvLDiag)
+                        {
+                            sUPanelTrSolve( kx, ldt, dFBufs[offset], scuBufs, packLUInfo,
+                                            grid, LUstruct, stat, SCT);
+                            factored_U[kx] = 1;
+                            /*check if a U_Ibcast is possible*/
+
+                            if (IbcastPanel_U[kx] == 0 &&
+                                    k0x - k0 < numLA + 1  && // is within lookahead window
+                                    factored_U[kx])
+                            {
+                                int_t offset = k0x % numLA;
+                                sIBcastRecvUPanel( kx, comReqss[offset],  LUvsbs[offset],  msgss[offset], factStat, grid, LUstruct, SCT );
+                                IbcastPanel_U[kx] = 1; /*will be used later*/
+                            }
+                        }
+                    }
+                } /* end look-ahead */
+
+            } /* end if non-root level */
+	    
+	    /* end Schur complement update */
+            SCT->NetSchurUpTimer += SuperLU_timer_() - tsch;
+
+        } /* end Schur update for all the nodes in level 'topoLvl' */
+
+    } /* end for all levels of the tree */
+
+    return 0;
+} /* end sparseTreeFactor_ASYNC_GPU */
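
The look-ahead bookkeeping above hinges on offset = k0 % numLA: panel k0
always reuses communication slot k0 mod numLA (comReqss[offset],
msgss[offset], LUvsbs[offset]), so at most numLA panel broadcasts are in
flight at once, and a slot is recycled only after the panel numLA steps
earlier has been waited on.  A self-contained sketch of the slot cycling
(NUM_LA and in_flight are illustrative names, not library identifiers):

    #include <stdio.h>

    #define NUM_LA 4                    /* number of look-ahead panels (assumed) */

    int main(void)
    {
        int in_flight[NUM_LA];          /* which panel currently owns each slot */
        for (int i = 0; i < NUM_LA; ++i) in_flight[i] = -1;

        for (int k0 = 0; k0 < 10; ++k0) {
            int offset = k0 % NUM_LA;   /* same indexing as comReqss[offset] */
            if (in_flight[offset] != -1)
                printf("panel %d must wait for panel %d to release slot %d\n",
                       k0, in_flight[offset], offset);
            in_flight[offset] = k0;     /* slot now owned by panel k0 */
        }
        return 0;
    }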

From d0ff65ef510a321a62776f3c58605272137270bf Mon Sep 17 00:00:00 2001
From: 7ps 
Date: Tue, 22 Dec 2020 16:22:19 -0500
Subject: [PATCH 054/147] Compiling on thimble but getting stuck with 1
 processor

---
 CMakeLists.txt             |   2 +-
 SRC/CMakeLists.txt         |   4 +-
 SRC/acc_aux.c              |  42 +--
 SRC/pdgstrf3d.c            |   8 +-
 SRC/superlu_gpu.cu         |  33 +--
 SRC/treeFactorizationGPU.c | 520 +++++++++++++++++++++++--------------
 6 files changed, 367 insertions(+), 242 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 683d7e94..80089345 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,7 +8,7 @@
 cmake_minimum_required(VERSION 2.8.12 FATAL_ERROR)
 
 # Project version numbers
-project(SuperLU_DIST C CXX)
+project(SuperLU_DIST C CXX CUDA)
 set(VERSION_MAJOR "7")
 set(VERSION_MINOR "0")
 set(VERSION_BugFix "0")
diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt
index f9b2d974..916ce8df 100644
--- a/SRC/CMakeLists.txt
+++ b/SRC/CMakeLists.txt
@@ -14,6 +14,7 @@ set(headers
     TreeBcast_slu_impl.hpp
     TreeReduce_slu_impl.hpp	
     ${CMAKE_CURRENT_BINARY_DIR}/superlu_dist_config.h
+    cub/cub.cuh
 )
 if (MSVC)
   list(APPEND headers wingetopt.h)
@@ -50,7 +51,8 @@ set(sources
   supernodalForest.c
   trfAux.c 
   communication_aux.c
-  treeFactorization.c 
+  treeFactorization.c
+  treeFactorizationGPU.c 
   sec_structs.c
   cublas_utils.c
 )
diff --git a/SRC/acc_aux.c b/SRC/acc_aux.c
index c8a1c634..d62d0c4e 100644
--- a/SRC/acc_aux.c
+++ b/SRC/acc_aux.c
@@ -2,17 +2,17 @@
 
 #define CLAMP(x, low, high)  (((x) > (high)) ? (high) : (((x) < (low)) ? (low) : (x)))
 
-int
-get_thread_per_process ()
-{
-    char *ttemp;
-    ttemp = getenv ("THREAD_PER_PROCESS");
-
-    if (ttemp)
-        return atoi (ttemp);
-    else
-        return 1;
-}
+// int
+// get_thread_per_process ()
+// {
+//     char *ttemp;
+//     ttemp = getenv ("THREAD_PER_PROCESS");
+
+//     if (ttemp)
+//         return atoi (ttemp);
+//     else
+//         return 1;
+// }
 
 
 static inline double
@@ -33,16 +33,16 @@ load_imb (double *A, int nthreads)
 
 
 
-int_t
-get_max_buffer_size ()
-{
-    char *ttemp;
-    ttemp = getenv ("MAX_BUFFER_SIZE");
-    if (ttemp)
-        return atoi (ttemp);
-    else
-        return 5000000;
-}
+// int_t
+// get_max_buffer_size ()
+// {
+//     char *ttemp;
+//     ttemp = getenv ("MAX_BUFFER_SIZE");
+//     if (ttemp)
+//         return atoi (ttemp);
+//     else
+//         return 5000000;
+// }
 
 
 // #define ACC_ASYNC_COST 3.79e-3
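
Both helpers commented out in this hunk follow the same getenv-with-default
idiom.  A generic form of the pattern (env_int_or is a hypothetical name, not
part of the patch):

    #include <stdio.h>
    #include <stdlib.h>

    /* Read an integer from the environment, falling back to a default. */
    static int env_int_or(const char *name, int fallback)
    {
        const char *s = getenv(name);
        return s ? atoi(s) : fallback;
    }

    int main(void)
    {
        /* Defaults match the removed get_thread_per_process /
           get_max_buffer_size helpers. */
        printf("threads: %d\n", env_int_or("THREAD_PER_PROCESS", 1));
        printf("max buffer: %d\n", env_int_or("MAX_BUFFER_SIZE", 5000000));
        return 0;
    }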
diff --git a/SRC/pdgstrf3d.c b/SRC/pdgstrf3d.c
index b9a884dd..843d257f 100644
--- a/SRC/pdgstrf3d.c
+++ b/SRC/pdgstrf3d.c
@@ -214,8 +214,8 @@ int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
     int_t Pr = grid->nprow;
     int_t mrb =    (nsupers + Pr - 1) / Pr;  // Sherry check ... use ceiling
     int_t mcb =    (nsupers + Pc - 1) / Pc;
-    HyP_t *HyP = (HyP_t *) malloc(sizeof(HyP_t));
-    Init_HyP(HyP, Llu, mcb, mrb);
+    HyP_t *HyP = (HyP_t *) SUPERLU_MALLOC(sizeof(HyP_t));
+    dInit_HyP(HyP, Llu, mcb, mrb);
     HyP->first_l_block_acc = first_l_block_acc;
     HyP->first_u_block_acc = first_u_block_acc;
     int_t superlu_acc_offload = HyP->superlu_acc_offload;
@@ -292,7 +292,7 @@ int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
             {
                 double tilvl = SuperLU_timer_();
 #ifdef GPU_ACC
-                dsparseTreeFactor_ASYNC_GPU(
+                sparseTreeFactor_ASYNC_GPU(
                     sforest,
                     comReqss, &scuBufs,  &packLUInfo,
                     msgss, LUvsbs, dFBufs,  &factStat, &fNlists,
@@ -315,7 +315,7 @@ int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
             if (ilvl < maxLvl - 1)     /*then reduce before factorization*/
             {
 #ifdef GPU_ACC
-                dreduceAllAncestors3d_GPU(
+                reduceAllAncestors3d_GPU(
                     ilvl, myNodeCount, treePerm, LUvsb,
                     LUstruct, grid3d, sluGPU, d2Hred, &factStat, HyP,
                     SCT );
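
The mrb/mcb lines above, flagged "Sherry check ... use ceiling", already
compute integer ceiling divisions; the CEILING macro used elsewhere in this
patch series (e.g. in CopyLUToGPU3D) expresses the same thing.  A quick check:

    #include <assert.h>

    #define CEILING(a, b) (((a) + (b) - 1) / (b))

    int main(void)
    {
        /* e.g. nsupers = 10 supernodes over Pr = 4 process rows */
        assert(CEILING(10, 4) == (10 + 4 - 1) / 4);  /* both yield 3 */
        assert(CEILING(8, 4) == 2);                  /* exact division unchanged */
        return 0;
    }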
diff --git a/SRC/superlu_gpu.cu b/SRC/superlu_gpu.cu
index 241e7362..e8eaa92c 100644
--- a/SRC/superlu_gpu.cu
+++ b/SRC/superlu_gpu.cu
@@ -2,14 +2,15 @@
 
 #include "mpi.h"
 #include "omp.h"
-#include "sec_structs.h"
+// #include "sec_structs.h"
 #include 
 #include 
 #include 
 #undef Reduce
-#include 
+// #include 
+#include "cub/cub.cuh"
 #include "lustruct_gpu.h"
-#include "p3dcomm.h"
+// #include "p3dcomm.h"
 // #include "mkl_cblas.h"
 
 extern "C" {
@@ -32,17 +33,17 @@ cudaError_t checkCuda(cudaError_t result)
 }
 
 
-cublasStatus_t checkCublas(cublasStatus_t result)
-{
-#if defined(DEBUG) || defined(_DEBUG)
-	if (result != CUBLAS_STATUS_SUCCESS)
-	{
-		fprintf(stderr, "CUDA Blas Runtime Error: %s\n", cublasGetErrorString(result));
-		assert(result == CUBLAS_STATUS_SUCCESS);
-	}
-#endif
-	return result;
-}
+// cublasStatus_t checkCublas(cublasStatus_t result)
+// {
+// #if defined(DEBUG) || defined(_DEBUG)
+// 	if (result != CUBLAS_STATUS_SUCCESS)
+// 	{
+// 		fprintf(stderr, "CUDA Blas Runtime Error: %s\n", cublasGetErrorString(result));
+// 		assert(result == CUBLAS_STATUS_SUCCESS);
+// 	}
+// #endif
+// 	return result;
+// }
 
 
 int_t getnCudaStreams()
@@ -1759,7 +1760,7 @@ void CopyLUToGPU3D (
 
 int_t reduceAllAncestors3d_GPU(int_t ilvl, int_t* myNodeCount,
                                int_t** treePerm,
-                               LUValSubBuf_t*LUvsb,
+                               dLUValSubBuf_t*LUvsb,
                                LUstruct_t* LUstruct,
                                gridinfo3d_t* grid3d,
                                sluGPU_t *sluGPU,
@@ -1838,7 +1839,7 @@ int_t reduceAllAncestors3d_GPU(int_t ilvl, int_t* myNodeCount,
 		}
 	} /*if (myGrid == sender)*/
 
-	reduceAllAncestors3d(ilvl, myNodeCount, treePerm,
+	dreduceAllAncestors3d(ilvl, myNodeCount, treePerm,
 	                     LUvsb, LUstruct, grid3d, SCT );
 	return 0;
 }
diff --git a/SRC/treeFactorizationGPU.c b/SRC/treeFactorizationGPU.c
index 43aa7089..33bdd56d 100644
--- a/SRC/treeFactorizationGPU.c
+++ b/SRC/treeFactorizationGPU.c
@@ -1,5 +1,5 @@
-#include "treeFactorization.h"
-#include "trfCommWrapper.h"
+// #include "treeFactorization.h"
+// #include "trfCommWrapper.h"
 #include "lustruct_gpu.h"
 #ifdef __INTEL_COMPILER
 #include "mkl.h"
@@ -7,7 +7,6 @@
 //#include "cblas.h"
 #endif
 
-
 /* 
 /-- num_u_blks--\ /-- num_u_blks_Phi --\
 ----------------------------------------
@@ -16,100 +15,99 @@
                   ^          ^
                   0          jj_cpu
 */
-static int_t getAccUPartition( HyP_t* HyP )
+static int_t getAccUPartition(HyP_t *HyP)
 {
     /* Sherry: what if num_u_blks_Phi == 0 ? Need to fix the bug */
-    int_t total_cols_1  = HyP->Ublock_info_Phi[HyP->num_u_blks_Phi - 1].full_u_cols;
-    
-    int_t host_cols     = HyP->Ublock_info[HyP->num_u_blks - 1].full_u_cols;
-    double cpu_time_0   = estimate_cpu_time(HyP->Lnbrow, total_cols_1, HyP->ldu_Phi) +
-                          estimate_cpu_time(HyP->Rnbrow, host_cols, HyP->ldu)
-                          + estimate_cpu_time(HyP->Lnbrow, host_cols, HyP->ldu) ;
+    int_t total_cols_1 = HyP->Ublock_info_Phi[HyP->num_u_blks_Phi - 1].full_u_cols;
+
+    int_t host_cols = HyP->Ublock_info[HyP->num_u_blks - 1].full_u_cols;
+    double cpu_time_0 = estimate_cpu_time(HyP->Lnbrow, total_cols_1, HyP->ldu_Phi) +
+                        estimate_cpu_time(HyP->Rnbrow, host_cols, HyP->ldu) + estimate_cpu_time(HyP->Lnbrow, host_cols, HyP->ldu);
 
     int jj_cpu;
 
-#if 0  /* Ignore these estimates */
+#if 0 /* Ignore these estimates */
     jj_cpu = tuned_partition(HyP->num_u_blks_Phi, HyP->Ublock_info_Phi,
                                    HyP->Remain_info, HyP->RemainBlk, cpu_time_0, HyP->Rnbrow, HyP->ldu_Phi );
 #else /* Sherry: new */
     jj_cpu = HyP->num_u_blks_Phi;
 #endif
 
-    if (jj_cpu != 0  && HyP->Rnbrow > 0)  // ###
+    if (jj_cpu != 0 && HyP->Rnbrow > 0) // ###
     {
         HyP->offloadCondition = 1;
     }
     else
     {
         HyP->offloadCondition = 0;
-        jj_cpu = 0;        // ###
+        jj_cpu = 0; // ###
     }
 
     return jj_cpu;
-
 }
 
 int_t sparseTreeFactor_ASYNC_GPU(
-    sForest_t* sforest,
+    sForest_t *sforest,
     commRequests_t **comReqss, // lists of communication requests,
                                // size = maxEtree level
-    scuBufs_t *scuBufs, // contains buffers for schur complement update
-    packLUInfo_t*packLUInfo,
-    msgs_t**msgss,                  // size = num Look ahead
-    LUValSubBuf_t**LUvsbs,          // size = num Look ahead
-    diagFactBufs_t **dFBufs,        // size = maxEtree level
+    scuBufs_t *scuBufs,        // contains buffers for schur complement update
+    packLUInfo_t *packLUInfo,
+    msgs_t **msgss,          // size = num Look ahead
+    dLUValSubBuf_t **LUvsbs, // size = num Look ahead
+    diagFactBufs_t **dFBufs, // size = maxEtree level
     factStat_t *factStat,
-    factNodelists_t  *fNlists,
-    gEtreeInfo_t*   gEtreeInfo,     // global etree info
-    superlu_options_t *options,
-    int_t * gIperm_c_supno,
+    factNodelists_t *fNlists,
+    gEtreeInfo_t *gEtreeInfo, // global etree info
+    superlu_dist_options_t *options,
+    int_t *gIperm_c_supno,
     int_t ldt,
     sluGPU_t *sluGPU,
-    d2Hreduce_t* d2Hred,
-    HyP_t* HyP,
-    LUstruct_t *LUstruct, gridinfo3d_t * grid3d, SuperLUStat_t *stat,
-    double thresh,  SCT_t *SCT,
-    int *info
-)
+    d2Hreduce_t *d2Hred,
+    HyP_t *HyP,
+    LUstruct_t *LUstruct, gridinfo3d_t *grid3d, SuperLUStat_t *stat,
+    double thresh, SCT_t *SCT, int tag_ub,
+    int *info)
 {
-// sforest.nNodes, sforest.nodeList,
+    // sforest.nNodes, sforest.nodeList,
     // &sforest.topoInfo,
-    int_t nnodes = sforest->nNodes ; // number of nodes in supernodal etree
+    int_t nnodes = sforest->nNodes; // number of nodes in supernodal etree
     if (nnodes < 1)
     {
         return 1;
     }
 
-    int_t *perm_c_supno = sforest->nodeList ;  // list of nodes in the order of factorization
-    treeTopoInfo_t* treeTopoInfo = &sforest->topoInfo;
-    int_t* myIperm = treeTopoInfo->myIperm;
+    int_t *perm_c_supno = sforest->nodeList; // list of nodes in the order of factorization
+    treeTopoInfo_t *treeTopoInfo = &sforest->topoInfo;
+    int_t *myIperm = treeTopoInfo->myIperm;
 
-    gridinfo_t* grid = &(grid3d->grid2d);
+    gridinfo_t *grid = &(grid3d->grid2d);
     /*main loop over all the levels*/
 
     int_t maxTopoLevel = treeTopoInfo->numLvl;
-    int_t* eTreeTopLims = treeTopoInfo->eTreeTopLims;
-    int_t * IrecvPlcd_D = factStat->IrecvPlcd_D;
-    int_t* factored_D = factStat->factored_D;
-    int_t * factored_L = factStat->factored_L;
-    int_t * factored_U = factStat->factored_U;
-    int_t* IbcastPanel_L = factStat->IbcastPanel_L;
-    int_t* IbcastPanel_U = factStat->IbcastPanel_U;
-    int_t* gpuLUreduced = factStat->gpuLUreduced;
-    int_t* xsup = LUstruct->Glu_persist->xsup;
-
-    int_t numLAMax = getNumLookAhead();
-    int_t numLA = numLAMax;  // number of look-ahead panels
+    int_t *eTreeTopLims = treeTopoInfo->eTreeTopLims;
+    int_t *IrecvPlcd_D = factStat->IrecvPlcd_D;
+    int_t *factored_D = factStat->factored_D;
+    int_t *factored_L = factStat->factored_L;
+    int_t *factored_U = factStat->factored_U;
+    int_t *IbcastPanel_L = factStat->IbcastPanel_L;
+    int_t *IbcastPanel_U = factStat->IbcastPanel_U;
+    int_t *gpuLUreduced = factStat->gpuLUreduced;
+    int_t *xsup = LUstruct->Glu_persist->xsup;
+
+    // int_t numLAMax = getNumLookAhead();
+    int_t numLAMax = getNumLookAhead(options);
+    int_t numLA = numLAMax; // number of look-ahead panels
     int_t superlu_acc_offload = HyP->superlu_acc_offload;
-    int_t last_flag = 1;  /* for updating nsuper-1 only once */
+    int_t last_flag = 1;                       /* for updating nsuper-1 only once */
     int_t nCudaStreams = sluGPU->nCudaStreams; // number of cuda streams
 
-    if (superlu_acc_offload) syncAllfunCallStreams(sluGPU, SCT);
+    if (superlu_acc_offload)
+        syncAllfunCallStreams(sluGPU, SCT);
 
     /* Go through each leaf node */
     for (int_t k0 = 0; k0 < eTreeTopLims[1]; ++k0)
     {
-        int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+        int_t k = perm_c_supno[k0]; // direct computation no perm_c_supno
         int_t offset = k0;
         /* k-th diagonal factorization */
 
@@ -124,44 +122,58 @@ int_t sparseTreeFactor_ASYNC_GPU(
             int_t copyL_kljb = d2Hred->copyL_kljb;
             int_t copyU_kljb = d2Hred->copyU_kljb;
 
-            if (copyL_kljb || copyU_kljb) SCT->PhiMemCpyCounter++;
-            sendLUpanelGPU2HOST( k, d2Hred, sluGPU);
+            if (copyL_kljb || copyU_kljb)
+                SCT->PhiMemCpyCounter++;
+            sendLUpanelGPU2HOST(k, d2Hred, sluGPU);
 
-            reduceGPUlu(last_flag,  d2Hred,
+            reduceGPUlu(last_flag, d2Hred,
                         sluGPU, SCT, grid, LUstruct);
 
             gpuLUreduced[k] = 1;
             SCT->PhiMemCpyTimer += SuperLU_timer_() - tt_start1;
         }
 
-	double t1 = SuperLU_timer_();
+        double t1 = SuperLU_timer_();
 
         /*Now factor and broadcast diagonal block*/
+        // sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid,
+        //                 options, thresh, LUstruct, stat, info, SCT);
+
+#if 0
         sDiagFactIBCast(k,  dFBufs[offset], factStat, comReqss[offset], grid,
-                        options, thresh, LUstruct, stat, info, SCT );
+                        options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#else
+        dDiagFactIBCast(k, k, dFBufs[offset]->BlockUFactor, dFBufs[offset]->BlockLFactor,
+                        factStat->IrecvPlcd_D,
+                        comReqss[offset]->U_diag_blk_recv_req,
+                        comReqss[offset]->L_diag_blk_recv_req,
+                        comReqss[offset]->U_diag_blk_send_req,
+                        comReqss[offset]->L_diag_blk_send_req,
+                        grid, options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#endif
         factored_D[k] = 1;
-	
-	SCT->pdgstrf2_timer += (SuperLU_timer_() - t1);
+
+        SCT->pdgstrf2_timer += (SuperLU_timer_() - t1);
     } /* for all leaves ... */
 
     //printf(".. SparseFactor_GPU: after leaves\n"); fflush(stdout);
 
     /* Process supernodal etree level by level */
     for (int_t topoLvl = 0; topoLvl < maxTopoLevel; ++topoLvl)
-        // for (int_t topoLvl = 0; topoLvl < 1; ++topoLvl)
+    // for (int_t topoLvl = 0; topoLvl < 1; ++topoLvl)
     {
-      //      printf("(%d) factor level %d, maxTopoLevel %d\n",grid3d->iam,topoLvl,maxTopoLevel); fflush(stdout);
+        //      printf("(%d) factor level %d, maxTopoLevel %d\n",grid3d->iam,topoLvl,maxTopoLevel); fflush(stdout);
         /* code */
         int_t k_st = eTreeTopLims[topoLvl];
         int_t k_end = eTreeTopLims[topoLvl + 1];
-	
-	/* Process all the nodes in 'topoLvl': diagonal factorization */
+
+        /* Process all the nodes in 'topoLvl': diagonal factorization */
         for (int_t k0 = k_st; k0 < k_end; ++k0)
         {
-            int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+            int_t k = perm_c_supno[k0]; // direct computation no perm_c_supno
             int_t offset = k0 - k_st;
 
-            if (!factored_D[k] )
+            if (!factored_D[k])
             {
                 /*If LU panels from GPU are not reduced then reduce
 		  them before diagonal factorization*/
@@ -173,8 +185,9 @@ int_t sparseTreeFactor_ASYNC_GPU(
                     int_t copyL_kljb = d2Hred->copyL_kljb;
                     int_t copyU_kljb = d2Hred->copyU_kljb;
 
-                    if (copyL_kljb || copyU_kljb) SCT->PhiMemCpyCounter++;
-                    sendLUpanelGPU2HOST( k, d2Hred, sluGPU);
+                    if (copyL_kljb || copyU_kljb)
+                        SCT->PhiMemCpyCounter++;
+                    sendLUpanelGPU2HOST(k, d2Hred, sluGPU);
                     /*
                         Reduce the LU panels from GPU
                     */
@@ -185,104 +198,148 @@ int_t sparseTreeFactor_ASYNC_GPU(
                     SCT->PhiMemCpyTimer += SuperLU_timer_() - tt_start1;
                 }
 
-		double t1 = SuperLU_timer_();
-		/* Factor diagonal block on CPU */
-                sDiagFactIBCast(k,  dFBufs[offset], factStat, comReqss[offset], grid,
-                                options, thresh, LUstruct, stat, info, SCT );
-		SCT->pdgstrf2_timer += (SuperLU_timer_() - t1);
+                double t1 = SuperLU_timer_();
+                /* Factor diagonal block on CPU */
+                // sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid,
+                //                 options, thresh, LUstruct, stat, info, SCT);
+#if 0
+        sDiagFactIBCast(k,  dFBufs[offset], factStat, comReqss[offset], grid,
+                        options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#else
+                dDiagFactIBCast(k, k, dFBufs[offset]->BlockUFactor, dFBufs[offset]->BlockLFactor,
+                                factStat->IrecvPlcd_D,
+                                comReqss[offset]->U_diag_blk_recv_req,
+                                comReqss[offset]->L_diag_blk_recv_req,
+                                comReqss[offset]->U_diag_blk_send_req,
+                                comReqss[offset]->L_diag_blk_send_req,
+                                grid, options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#endif
+                SCT->pdgstrf2_timer += (SuperLU_timer_() - t1);
             }
         } /* for all nodes in this level */
 
-	//printf(".. SparseFactor_GPU: after diag factorization\n"); fflush(stdout);
+        //printf(".. SparseFactor_GPU: after diag factorization\n"); fflush(stdout);
 
         double t_apt = SuperLU_timer_(); /* Async Pipe Timer */
 
-	/* Process all the nodes in 'topoLvl': panel updates on CPU */
+        /* Process all the nodes in 'topoLvl': panel updates on CPU */
         for (int_t k0 = k_st; k0 < k_end; ++k0)
         {
-            int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+            int_t k = perm_c_supno[k0]; // direct computation no perm_c_supno
             int_t offset = k0 - k_st;
 
             /*L update */
             if (factored_L[k] == 0)
-            {   sLPanelUpdate(k,  dFBufs[offset], factStat,
-                              comReqss[offset], grid, LUstruct, SCT);
+            {
+#if 0
+		sLPanelUpdate(k, dFBufs[offset], factStat, comReqss[offset],
+			      grid, LUstruct, SCT);
+#else
+                dLPanelUpdate(k, factStat->IrecvPlcd_D, factStat->factored_L,
+                              comReqss[offset]->U_diag_blk_recv_req,
+                              dFBufs[offset]->BlockUFactor, grid, LUstruct, SCT);
+#endif
+
                 factored_L[k] = 1;
             }
             /*U update*/
             if (factored_U[k] == 0)
-            {   sUPanelUpdate(k, ldt, dFBufs[offset], factStat, comReqss[offset], scuBufs,
-                              packLUInfo, grid, LUstruct, stat, SCT);
+            {
+#if 0
+		sUPanelUpdate(k, ldt, dFBufs[offset], factStat, comReqss[offset],
+			      scuBufs, packLUInfo, grid, LUstruct, stat, SCT);
+#else
+                dUPanelUpdate(k, factStat->factored_U, comReqss[offset]->L_diag_blk_recv_req,
+                              dFBufs[offset]->BlockLFactor, scuBufs->bigV, ldt,
+                              packLUInfo->Ublock_info, grid, LUstruct, stat, SCT);
+#endif
                 factored_U[k] = 1;
             }
         } /* end panel update */
 
-	//printf(".. after CPU panel updates. numLA %d\n", numLA); fflush(stdout);
+        //printf(".. after CPU panel updates. numLA %d\n", numLA); fflush(stdout);
 
-	/* Process all the panels in look-ahead window: 
+        /* Process all the panels in look-ahead window: 
 	   broadcast L and U panels. */
         for (int_t k0 = k_st; k0 < SUPERLU_MIN(k_end, k_st + numLA); ++k0)
         {
-            int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+            int_t k = perm_c_supno[k0]; // direct computation no perm_c_supno
             int_t offset = k0 % numLA;
             /* diagonal factorization */
 
             /*L Ibcast*/
             if (IbcastPanel_L[k] == 0)
             {
+#if 0
                 sIBcastRecvLPanel( k, comReqss[offset],  LUvsbs[offset],
-                                   msgss[offset], factStat, grid,
-				   LUstruct, SCT );
+                                   msgss[offset], factStat, grid, LUstruct, SCT, tag_ub );
+#else
+                dIBcastRecvLPanel(k, k, msgss[offset]->msgcnt, comReqss[offset]->send_req,
+                                  comReqss[offset]->recv_req, LUvsbs[offset]->Lsub_buf,
+                                  LUvsbs[offset]->Lval_buf, factStat->factored,
+                                  grid, LUstruct, SCT, tag_ub);
+#endif
                IbcastPanel_L[k] = 1; /*for consistency; unused later*/
             }
 
             /*U Ibcast*/
             if (IbcastPanel_U[k] == 0)
             {
+#if 0
                 sIBcastRecvUPanel( k, comReqss[offset],  LUvsbs[offset],
-                                   msgss[offset], factStat, grid,
-				   LUstruct, SCT );
+                                   msgss[offset], factStat, grid, LUstruct, SCT, tag_ub );
+#else
+                dIBcastRecvUPanel(k, k, msgss[offset]->msgcnt, comReqss[offset]->send_requ,
+                                  comReqss[offset]->recv_requ, LUvsbs[offset]->Usub_buf,
+                                  LUvsbs[offset]->Uval_buf, grid, LUstruct, SCT, tag_ub);
+#endif
                 IbcastPanel_U[k] = 1;
             }
         } /* end for panels in look-ahead window */
 
-	//printf(".. after CPU look-ahead updates\n"); fflush(stdout);
+        //printf(".. after CPU look-ahead updates\n"); fflush(stdout);
 
         // if (topoLvl) SCT->tAsyncPipeTail += SuperLU_timer_() - t_apt;
         SCT->tAsyncPipeTail += (SuperLU_timer_() - t_apt);
 
-	/* Process all the nodes in level 'topoLvl': Schur complement update
+        /* Process all the nodes in level 'topoLvl': Schur complement update
 	   (no MPI communication)  */
         for (int_t k0 = k_st; k0 < k_end; ++k0)
         {
-            int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+            int_t k = perm_c_supno[k0]; // direct computation no perm_c_supno
             int_t offset = k0 % numLA;
 
             double tsch = SuperLU_timer_();
 
-            /*Wait for L panel*/
+#if 0
             sWaitL(k, comReqss[offset], msgss[offset], grid, LUstruct, SCT);
-
             /*Wait for U panel*/
             sWaitU(k, comReqss[offset], msgss[offset], grid, LUstruct, SCT);
+#else
+            dWaitL(k, msgss[offset]->msgcnt, msgss[offset]->msgcntU,
+                   comReqss[offset]->send_req, comReqss[offset]->recv_req,
+                   grid, LUstruct, SCT);
+            dWaitU(k, msgss[offset]->msgcnt, comReqss[offset]->send_requ,
+                   comReqss[offset]->recv_requ, grid, LUstruct, SCT);
+#endif
 
-            int_t LU_nonempty = SchurComplementSetupGPU(k,
-                                msgss[offset], packLUInfo,
-                                myIperm, gIperm_c_supno, perm_c_supno,
-                                gEtreeInfo, fNlists, scuBufs,
-				LUvsbs[offset], grid, LUstruct, HyP);
+            int_t LU_nonempty = dSchurComplementSetupGPU(k,
+                                                        msgss[offset], packLUInfo,
+                                                        myIperm, gIperm_c_supno, perm_c_supno,
+                                                        gEtreeInfo, fNlists, scuBufs,
+                                                        LUvsbs[offset], grid, LUstruct, HyP);
             // initializing D2H data transfer. D2H = Device To Host.
             int_t jj_cpu; /* limit between CPU and GPU */
             if (superlu_acc_offload)
             {
-                jj_cpu = getAccUPartition (HyP);
+                jj_cpu = getAccUPartition(HyP);
 
                 if (jj_cpu > 0)
                     jj_cpu = HyP->num_u_blks_Phi;
 
-		/* Sherry force this --> */
-		jj_cpu = HyP->num_u_blks_Phi;  // -1 ??
-		HyP->offloadCondition = 1;
+                /* Sherry force this --> */
+                jj_cpu = HyP->num_u_blks_Phi; // -1 ??
+                HyP->offloadCondition = 1;
             }
             else
             {
@@ -304,75 +361,76 @@ int_t sparseTreeFactor_ASYNC_GPU(
 	    fflush(stdout);
 	    }
 #endif
-            scuStatUpdate( SuperSize(k), HyP,  SCT, stat);
+            scuStatUpdate(SuperSize(k), HyP, SCT, stat);
 
             int_t offload_condition = HyP->offloadCondition;
-            uPanelInfo_t* uPanelInfo = packLUInfo->uPanelInfo;
-            lPanelInfo_t* lPanelInfo = packLUInfo->lPanelInfo;
+            uPanelInfo_t *uPanelInfo = packLUInfo->uPanelInfo;
+            lPanelInfo_t *lPanelInfo = packLUInfo->lPanelInfo;
             int_t *lsub = lPanelInfo->lsub;
             int_t *usub = uPanelInfo->usub;
-            int_t* indirect  = fNlists->indirect;
-            int_t* indirect2  = fNlists->indirect2;
+            int_t *indirect = fNlists->indirect;
+            int_t *indirect2 = fNlists->indirect2;
 
             /* Schur Complement Update */
 
             int_t knsupc = SuperSize(k);
-            int_t klst = FstBlockC (k + 1);
+            int_t klst = FstBlockC(k + 1);
 
-            double* bigV = scuBufs->bigV;
-            double* bigU = scuBufs->bigU;
+            double *bigV = scuBufs->bigV;
+            double *bigU = scuBufs->bigU;
 
-	    double t1 = SuperLU_timer_();
+            double t1 = SuperLU_timer_();
 
-#pragma omp parallel  /* Look-ahead update on CPU */
+#pragma omp parallel /* Look-ahead update on CPU */
             {
-		int_t thread_id = omp_get_thread_num();
+                int_t thread_id = omp_get_thread_num();
 
-#pragma omp for	 schedule(dynamic,2) nowait
+#pragma omp for
                 for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks; ++ij)
                 {
-                    int_t j   = ij / HyP->lookAheadBlk ;
-                    int_t lb  = ij % HyP->lookAheadBlk;
-                    block_gemm_scatterTopLeft( lb,  j, bigV, knsupc, klst, lsub,
-                                               usub, ldt,  indirect, indirect2, HyP, LUstruct, grid, SCT, stat);
+                    int_t j = ij / HyP->lookAheadBlk;
+                    int_t lb = ij % HyP->lookAheadBlk;
+                    dblock_gemm_scatterTopLeft(lb, j, bigV, knsupc, klst, lsub,
+                                              usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat);
                 }
 
-#pragma omp for  schedule(dynamic,2) nowait
+#pragma omp for
                 for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks_Phi; ++ij)
-		{
-		    int_t j   = ij / HyP->lookAheadBlk ;
-                    int_t lb  = ij % HyP->lookAheadBlk;
-                    block_gemm_scatterTopRight( lb,  j, bigV, knsupc, klst, lsub,
-                                                usub, ldt,  indirect, indirect2, HyP, LUstruct, grid, SCT, stat);
+                {
+                    int_t j = ij / HyP->lookAheadBlk;
+                    int_t lb = ij % HyP->lookAheadBlk;
+                    dblock_gemm_scatterTopRight(lb, j, bigV, knsupc, klst, lsub,
+                                               usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat);
                 }
 
-#pragma omp for	 schedule(dynamic,2) nowait
+#pragma omp for
                 for (int_t ij = 0; ij < HyP->RemainBlk * HyP->num_u_blks; ++ij)
                 {
-                    int_t j   = ij / HyP->RemainBlk;
-                    int_t lb  = ij % HyP->RemainBlk;
-                    block_gemm_scatterBottomLeft( lb,  j, bigV, knsupc, klst, lsub,
-						  usub, ldt,  indirect, indirect2, HyP, LUstruct, grid, SCT, stat);
+                    int_t j = ij / HyP->RemainBlk;
+                    int_t lb = ij % HyP->RemainBlk;
+                    dblock_gemm_scatterBottomLeft(lb, j, bigV, knsupc, klst, lsub,
+                                                 usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat);
                 } /* for int_t ij = ... */
-            } /* end parallel region ... end look-ahead update */
+            }     /* end parallel region ... end look-ahead update */
 
-	    SCT->lookaheadupdatetimer += (SuperLU_timer_() - t1);
+            SCT->lookaheadupdatetimer += (SuperLU_timer_() - t1);
 
-	    //printf("... after look-ahead update, topoLvl %d\t maxTopoLevel %d\n", topoLvl, maxTopoLevel); fflush(stdout);
+            //printf("... after look-ahead update, topoLvl %d\t maxTopoLevel %d\n", topoLvl, maxTopoLevel); fflush(stdout);
 
-	    /* Reduce the L & U panels from GPU to CPU.       */
-            if (topoLvl < maxTopoLevel - 1) { /* Not the root */
-	        int_t k_parent = gEtreeInfo->setree[k];
+            /* Reduce the L & U panels from GPU to CPU.       */
+            if (topoLvl < maxTopoLevel - 1)
+            { /* Not the root */
+                int_t k_parent = gEtreeInfo->setree[k];
                 gEtreeInfo->numChildLeft[k_parent]--;
                 if (gEtreeInfo->numChildLeft[k_parent] == 0 && k_parent < nnodes)
-		{   /* if k is the last child in this level */
-                    int_t k0_parent =  myIperm[k_parent];
+                { /* if k is the last child in this level */
+                    int_t k0_parent = myIperm[k_parent];
                     if (k0_parent > 0)
-		    {
+                    {
                         /* code */
-      //      printf("Before assert: iam %d, k %d, k_parent %d, k0_parent %d, nnodes %d\n", grid3d->iam, k, k_parent, k0_parent, nnodes); fflush(stdout);
-	//	      exit(-1);
-			assert(k0_parent < nnodes);
+                        //      printf("Before assert: iam %d, k %d, k_parent %d, k0_parent %d, nnodes %d\n", grid3d->iam, k, k_parent, k0_parent, nnodes); fflush(stdout);
+                        //	      exit(-1);
+                        assert(k0_parent < nnodes);
                         int_t offset = k0_parent - k_end;
                         if (!gpuLUreduced[k_parent] && superlu_acc_offload)
                         {
@@ -383,42 +441,52 @@ int_t sparseTreeFactor_ASYNC_GPU(
                             int_t copyL_kljb = d2Hred->copyL_kljb;
                             int_t copyU_kljb = d2Hred->copyU_kljb;
 
-                            if (copyL_kljb || copyU_kljb) SCT->PhiMemCpyCounter++;
-                            sendLUpanelGPU2HOST( k_parent, d2Hred, sluGPU);
+                            if (copyL_kljb || copyU_kljb)
+                                SCT->PhiMemCpyCounter++;
+                            sendLUpanelGPU2HOST(k_parent, d2Hred, sluGPU);
 
                             /* Reduce the LU panels from GPU */
-                            reduceGPUlu(last_flag,  d2Hred,
+                            reduceGPUlu(last_flag, d2Hred,
                                         sluGPU, SCT, grid, LUstruct);
 
                             gpuLUreduced[k_parent] = 1;
                             SCT->PhiMemCpyTimer += SuperLU_timer_() - tt_start1;
-			}
-			
-			/* Factorize diagonal block on CPU */
-                        sDiagFactIBCast(k_parent,  dFBufs[offset],
-					factStat, comReqss[offset], grid,
-                                        options, thresh, LUstruct, stat,
-					info, SCT );
+                        }
+
+                        /* Factorize diagonal block on CPU */
+#if 0
+                        sDiagFactIBCast(k_parent,  dFBufs[offset], factStat,
+					comReqss[offset], grid, options, thresh,
+					LUstruct, stat, info, SCT, tag_ub);
+#else
+                        dDiagFactIBCast(k_parent, k_parent, dFBufs[offset]->BlockUFactor,
+                                        dFBufs[offset]->BlockLFactor, factStat->IrecvPlcd_D,
+                                        comReqss[offset]->U_diag_blk_recv_req,
+                                        comReqss[offset]->L_diag_blk_recv_req,
+                                        comReqss[offset]->U_diag_blk_send_req,
+                                        comReqss[offset]->L_diag_blk_send_req,
+                                        grid, options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#endif
                         factored_D[k_parent] = 1;
-		    } /* end if k0_parent > 0 */
+                    } /* end if k0_parent > 0 */
 
                 } /* end if all children are done */
-            } /* end if non-root */
+            }     /* end if non-root */
 
 #pragma omp parallel
             {
-		/* Master thread performs Schur complement update on GPU. */
+                /* Master thread performs Schur complement update on GPU. */
 #pragma omp master
                 {
                     if (superlu_acc_offload)
                     {
                         int_t thread_id = omp_get_thread_num();
-			double t1 = SuperLU_timer_();
+                        double t1 = SuperLU_timer_();
 
                         if (offload_condition)
                         {
-			    SCT->datatransfer_count++;
-                            int_t streamId  = k0 % nCudaStreams;
+                            SCT->datatransfer_count++;
+                            int_t streamId = k0 % nCudaStreams;
 
                             /*wait for previous offload to get finished*/
                             if (sluGPU->lastOffloadStream[streamId] != -1)
@@ -431,49 +499,48 @@ int_t sparseTreeFactor_ASYNC_GPU(
                             int_t bigu_send_size = jj_cpu < 1 ? 0 : HyP->ldu_Phi * HyP->Ublock_info_Phi[jj_cpu - 1].full_u_cols;
                             assert(bigu_send_size < HyP->bigu_size);
 
-	    /* !! Sherry add the test to avoid seg_fault inside sendSCUdataHost2GPU */
-	    if ( bigu_send_size > 0 ) {
-                            sendSCUdataHost2GPU(streamId, lsub, usub, bigU, bigu_send_size,
-                                                Remain_lbuf_send_size, sluGPU, HyP
-                                               );
+                            /* !! Sherry add the test to avoid seg_fault inside sendSCUdataHost2GPU */
+                            if (bigu_send_size > 0)
+                            {
+                                sendSCUdataHost2GPU(streamId, lsub, usub, bigU, bigu_send_size,
+                                                    Remain_lbuf_send_size, sluGPU, HyP);
 
-                            sluGPU->lastOffloadStream[streamId] = k0;
-                            int_t usub_len = usub[2];
-                            int_t lsub_len = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR;
-			    //{printf("... before SchurCompUpdate_GPU, bigu_send_size %d\n", bigu_send_size); fflush(stdout);}
+                                sluGPU->lastOffloadStream[streamId] = k0;
+                                int_t usub_len = usub[2];
+                                int_t lsub_len = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR;
+                                //{printf("... before SchurCompUpdate_GPU, bigu_send_size %d\n", bigu_send_size); fflush(stdout);}
 
-                            SchurCompUpdate_GPU(
-                                streamId, 0, jj_cpu, klst, knsupc, HyP->Rnbrow, HyP->RemainBlk,
-                                Remain_lbuf_send_size, bigu_send_size, HyP->ldu_Phi, HyP->num_u_blks_Phi,
-                                HyP->buffer_size, lsub_len, usub_len, ldt, k0, sluGPU, grid
-                            );
-	    } /* endif bigu_send_size > 0 */
+                                SchurCompUpdate_GPU(
+                                    streamId, 0, jj_cpu, klst, knsupc, HyP->Rnbrow, HyP->RemainBlk,
+                                    Remain_lbuf_send_size, bigu_send_size, HyP->ldu_Phi, HyP->num_u_blks_Phi,
+                                    HyP->buffer_size, lsub_len, usub_len, ldt, k0, sluGPU, grid);
+                            } /* endif bigu_send_size > 0 */
 
                             // sendLUpanelGPU2HOST( k0, d2Hred, sluGPU);
 
-                            SCT->schurPhiCallCount ++;
+                            SCT->schurPhiCallCount++;
                             HyP->jj_cpu = jj_cpu;
                             updateDirtyBit(k0, HyP, grid);
                         } /* endif (offload_condition) */
 
-			double t2 = SuperLU_timer_();
-                        SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1); /* not used */
-                        SCT->CPUOffloadTimer += (double) (t2 - t1); // Sherry added
+                        double t2 = SuperLU_timer_();
+                        SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double)(t2 - t1); /* not used */
+                        SCT->CPUOffloadTimer += (double)(t2 - t1);                                     // Sherry added
 
                     } /* endif (superlu_acc_offload) */
 
                 } /* end omp master thread */
 
-#pragma omp for  schedule(dynamic,2) nowait
-		/* The following update is on CPU. Should not be necessary now,
+#pragma omp for
+                /* The following update is on CPU. Should not be necessary now,
 		   because we set jj_cpu equal to num_u_blks_Phi.      		*/
-                for (int_t ij = 0; ij < HyP->RemainBlk * (HyP->num_u_blks_Phi - jj_cpu) ; ++ij)
+                for (int_t ij = 0; ij < HyP->RemainBlk * (HyP->num_u_blks_Phi - jj_cpu); ++ij)
                 {
-		    //printf(".. WARNING: should NOT get here\n");
-                    int_t j   = ij / HyP->RemainBlk + jj_cpu;
-                    int_t lb  = ij % HyP->RemainBlk;
-                    block_gemm_scatterBottomRight( lb,  j, bigV, knsupc, klst, lsub,
-                                                   usub, ldt,  indirect, indirect2, HyP, LUstruct, grid, SCT, stat);
+                    //printf(".. WARNING: should NOT get here\n");
+                    int_t j = ij / HyP->RemainBlk + jj_cpu;
+                    int_t lb = ij % HyP->RemainBlk;
+                    dblock_gemm_scatterBottomRight(lb, j, bigV, knsupc, klst, lsub,
+                                                  usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat);
                 } /* for int_t ij = ... */
 
             } /* end omp parallel region */
@@ -482,10 +549,16 @@ int_t sparseTreeFactor_ASYNC_GPU(
 
             // finish waiting for diag block send
             int_t abs_offset = k0 - k_st;
+#if 0
             sWait_LUDiagSend(k,  comReqss[abs_offset], grid, SCT);
+#else
+            Wait_LUDiagSend(k, comReqss[abs_offset]->U_diag_blk_send_req,
+                            comReqss[abs_offset]->L_diag_blk_send_req,
+                            grid, SCT);
+#endif
 
             /*Schedule next I bcasts within look-ahead window */
-            for (int_t next_k0 = k0 + 1; next_k0 < SUPERLU_MIN( k0 + 1 + numLA, nnodes); ++next_k0)
+            for (int_t next_k0 = k0 + 1; next_k0 < SUPERLU_MIN(k0 + 1 + numLA, nnodes); ++next_k0)
             {
                 /* code */
                 int_t next_k = perm_c_supno[next_k0];
@@ -494,17 +567,31 @@ int_t sparseTreeFactor_ASYNC_GPU(
                 /*L Ibcast*/
                 if (IbcastPanel_L[next_k] == 0 && factored_L[next_k])
                 {
-                    sIBcastRecvLPanel( next_k, comReqss[offset],
-				       LUvsbs[offset],  msgss[offset],
-				       factStat, grid, LUstruct, SCT );
+#if 0
+                    sIBcastRecvLPanel( next_k, comReqss[offset], 
+				       LUvsbs[offset], msgss[offset], factStat,
+				       grid, LUstruct, SCT, tag_ub );
+#else
+                    dIBcastRecvLPanel(next_k, next_k, msgss[offset]->msgcnt,
+                                      comReqss[offset]->send_req, comReqss[offset]->recv_req,
+                                      LUvsbs[offset]->Lsub_buf, LUvsbs[offset]->Lval_buf,
+                                      factStat->factored, grid, LUstruct, SCT, tag_ub);
+#endif
                     IbcastPanel_L[next_k] = 1; /*will be used later*/
                 }
                 /*U Ibcast*/
                 if (IbcastPanel_U[next_k] == 0 && factored_U[next_k])
                 {
+#if 0
                     sIBcastRecvUPanel( next_k, comReqss[offset],
-				       LUvsbs[offset],  msgss[offset],
-				       factStat, grid, LUstruct, SCT );
+				       LUvsbs[offset], msgss[offset], factStat,
+				       grid, LUstruct, SCT, tag_ub );
+#else
+                    dIBcastRecvUPanel(next_k, next_k, msgss[offset]->msgcnt,
+                                      comReqss[offset]->send_requ, comReqss[offset]->recv_requ,
+                                      LUvsbs[offset]->Usub_buf, LUvsbs[offset]->Uval_buf,
+                                      grid, LUstruct, SCT, tag_ub);
+#endif
                     IbcastPanel_U[next_k] = 1;
                 }
             } /* end for look-ahead window */
@@ -526,44 +613,79 @@ int_t sparseTreeFactor_ASYNC_GPU(
                                                          grid, SCT);
                         if (recvUDiag)
                         {
+#if 0
                             sLPanelTrSolve( kx,  dFBufs[offset],
                                             factStat, comReqss[offset],
                                             grid, LUstruct, SCT);
+#else
+                            dLPanelTrSolve(kx, factStat->factored_L,
+                                           dFBufs[offset]->BlockUFactor, grid, LUstruct);
+#endif
+
                             factored_L[kx] = 1;
 
                             /*check if an L_Ibcast is possible*/
 
                             if (IbcastPanel_L[kx] == 0 &&
-                                    k0x - k0 < numLA + 1  && // is within look-ahead window
-                                    factored_L[kx])
+                                k0x - k0 < numLA + 1 && // is within look-ahead window
+                                factored_L[kx])
                             {
                                 int_t offset1 = k0x % numLA;
-                                sIBcastRecvLPanel( kx, comReqss[offset1],  LUvsbs[offset1],
-                                                   msgss[offset1], factStat, grid, LUstruct, SCT );
+#if 0
+                                sIBcastRecvLPanel( kx, comReqss[offset1], LUvsbs[offset1],
+                                                   msgss[offset1], factStat,
+						   grid, LUstruct, SCT, tag_ub);
+#else
+                                dIBcastRecvLPanel(kx, kx, msgss[offset1]->msgcnt,
+                                                  comReqss[offset1]->send_req,
+                                                  comReqss[offset1]->recv_req,
+                                                  LUvsbs[offset1]->Lsub_buf,
+                                                  LUvsbs[offset1]->Lval_buf,
+                                                  factStat->factored,
+                                                  grid, LUstruct, SCT, tag_ub);
+#endif
                                 IbcastPanel_L[kx] = 1; /*will be used later*/
                             }
-
                         }
                     }
 
                     if (IrecvPlcd_D[kx] && !factored_U[kx])
                     {
                         /*check if received*/
-                        int_t recvLDiag = checkRecvLDiag( kx, comReqss[offset],
-                                                          grid, SCT);
+                        int_t recvLDiag = checkRecvLDiag(kx, comReqss[offset],
+                                                         grid, SCT);
                         if (recvLDiag)
                         {
+#if 0
                             sUPanelTrSolve( kx, ldt, dFBufs[offset], scuBufs, packLUInfo,
                                             grid, LUstruct, stat, SCT);
+#else
+                            dUPanelTrSolve(kx, dFBufs[offset]->BlockLFactor,
+                                           scuBufs->bigV,
+                                           ldt, packLUInfo->Ublock_info,
+                                           grid, LUstruct, stat, SCT);
+#endif
                             factored_U[kx] = 1;
                             /*check if an L_Ibcast is possible*/
 
                             if (IbcastPanel_U[kx] == 0 &&
-                                    k0x - k0 < numLA + 1  && // is within lookahead window
-                                    factored_U[kx])
+                                k0x - k0 < numLA + 1 && // is within lookahead window
+                                factored_U[kx])
                             {
                                 int_t offset = k0x % numLA;
-                                sIBcastRecvUPanel( kx, comReqss[offset],  LUvsbs[offset],  msgss[offset], factStat, grid, LUstruct, SCT );
+#if 0
+                                sIBcastRecvUPanel( kx, comReqss[offset],
+						   LUvsbs[offset],
+						   msgss[offset], factStat,
+						   grid, LUstruct, SCT, tag_ub);
+#else
+                                dIBcastRecvUPanel(kx, kx, msgss[offset]->msgcnt,
+                                                  comReqss[offset]->send_requ,
+                                                  comReqss[offset]->recv_requ,
+                                                  LUvsbs[offset]->Usub_buf,
+                                                  LUvsbs[offset]->Uval_buf,
+                                                  grid, LUstruct, SCT, tag_ub);
+#endif
                                 IbcastPanel_U[kx] = 1; /*will be used later*/
                             }
                         }
@@ -571,8 +693,8 @@ int_t sparseTreeFactor_ASYNC_GPU(
                 } /* end look-ahead */
 
             } /* end if non-root level */
-	    
-	    /* end Schur complement update */
+
+            /* end Schur complement update */
             SCT->NetSchurUpTimer += SuperLU_timer_() - tsch;
 
         } /* end Schur update for all the nodes in level 'topoLvl' */
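
The Schur-complement loops above walk a two-dimensional grid of (L-block, U-block) pairs through a single flattened index, splitting it as j = ij / nblk and lb = ij % nblk so that one worksharing directive covers the whole product space. A minimal standalone sketch of that decomposition, with nblk and nu standing in for HyP->lookAheadBlk and HyP->num_u_blks (everything else here is illustrative, not library code):

    #include <stdio.h>

    int main(void) {
        const int nblk = 3, nu = 4;       /* stand-ins for lookAheadBlk, num_u_blks */
        int visited[3][4] = {{0}};

        /* one flattened loop covers the whole (lb, j) grid, so a single
           worksharing directive hands out all pairs */
    #pragma omp parallel for
        for (int ij = 0; ij < nblk * nu; ++ij) {
            int j  = ij / nblk;           /* U-block column, as in the patch */
            int lb = ij % nblk;           /* L-block row */
            visited[lb][j] = 1;           /* the block GEMM-scatter runs here */
        }

        int covered = 0;
        for (int lb = 0; lb < nblk; ++lb)
            for (int j = 0; j < nu; ++j)
                covered += visited[lb][j];
        printf("covered %d of %d (lb, j) pairs\n", covered, nblk * nu);
        return 0;
    }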

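The same loop nest recycles a fixed pool of communication slots and CUDA streams round-robin: offset = k0 % numLA selects the look-ahead buffers and streamId = k0 % nCudaStreams selects the offload stream, and a slot's previous occupant is waited on (lastOffloadStream) before reuse. A small sketch of that recycling discipline, with a plain array standing in for the stream handles:

    #include <stdio.h>

    #define NSLOTS 4                      /* stand-in for numLA / nCudaStreams */

    int main(void) {
        int lastOwner[NSLOTS];
        for (int s = 0; s < NSLOTS; ++s) lastOwner[s] = -1;  /* -1: never used */

        for (int k0 = 0; k0 < 10; ++k0) {
            int slot = k0 % NSLOTS;       /* same round-robin as offset/streamId */
            if (lastOwner[slot] != -1)    /* mirrors the lastOffloadStream check */
                printf("k0=%d waits for k0=%d to vacate slot %d\n",
                       k0, lastOwner[slot], slot);
            /* ... post the Ibcast / launch the offload for k0 on this slot ... */
            lastOwner[slot] = k0;
        }
        return 0;
    }
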
From 3b11e7d2029ff4c043b4337247d2354a0992da5d Mon Sep 17 00:00:00 2001
From: Xiaoye Li 
Date: Thu, 31 Dec 2020 01:20:26 -0500
Subject: [PATCH 055/147] Put GPU DisplayHeader() inside #if (PRNTlevel>=1)

---
 SRC/pdgstrf.c             | 7 ++++---
 SRC/pzgstrf.c             | 7 ++++---
 SRC/superlu_dist_config.h | 6 +++---
 SRC/util.c                | 2 +-
 make.inc.in               | 2 +-
 5 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/SRC/pdgstrf.c b/SRC/pdgstrf.c
index 1d60904d..39888c37 100644
--- a/SRC/pdgstrf.c
+++ b/SRC/pdgstrf.c
@@ -834,10 +834,11 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     if ( checkCuda(cudaHostAlloc((void**)&bigV, bigv_size * sizeof(double) ,cudaHostAllocDefault)) )
         ABORT("Malloc fails for dgemm buffer V");
 
-    DisplayHeader();
-
 #if ( PRNTlevel>=1 )
-    printf(" Starting with %d Cuda Streams \n",nstreams );
+    if (iam==0) {
+      DisplayHeader();
+      printf(" Starting with %d Cuda Streams \n",nstreams );
+    }
 #endif
 
     cublasHandle_t *handle;
diff --git a/SRC/pzgstrf.c b/SRC/pzgstrf.c
index 20d32ad8..7246b968 100644
--- a/SRC/pzgstrf.c
+++ b/SRC/pzgstrf.c
@@ -834,10 +834,11 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     if ( checkCuda(cudaHostAlloc((void**)&bigV, bigv_size * sizeof(doublecomplex) ,cudaHostAllocDefault)) )
         ABORT("Malloc fails for zgemm buffer V");
 
-    DisplayHeader();
-
 #if ( PRNTlevel>=1 )
-    printf(" Starting with %d Cuda Streams \n",nstreams );
+    if (iam==0) {
+      DisplayHeader();
+      printf(" Starting with %d Cuda Streams \n",nstreams );
+    }
 #endif
 
     cublasHandle_t *handle;
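
Both hunks above apply the same pattern: the GPU banner is compiled only when PRNTlevel>=1 and printed only by rank 0, so an N-rank run emits one header instead of N copies. A self-contained MPI sketch of the guard (PRNTlevel follows the library's convention; the rest is illustrative):

    #include <mpi.h>
    #include <stdio.h>

    #ifndef PRNTlevel
    #define PRNTlevel 1              /* normally set on the compile line */
    #endif

    int main(int argc, char *argv[]) {
        int iam, nstreams = 8;       /* illustrative stream count */
        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &iam);
    #if ( PRNTlevel>=1 )
        if (iam == 0) {              /* only rank 0 prints the banner */
            printf("GPU build banner would go here\n");
            printf(" Starting with %d Cuda Streams \n", nstreams);
        }
    #endif
        MPI_Finalize();
        return 0;
    }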
diff --git a/SRC/superlu_dist_config.h b/SRC/superlu_dist_config.h
index d7eb53ac..e966ac45 100644
--- a/SRC/superlu_dist_config.h
+++ b/SRC/superlu_dist_config.h
@@ -1,19 +1,19 @@
 /* superlu_dist_config.h.in */
 
 /* Enable CUDA */
-/* #undef HAVE_CUDA */
+#define HAVE_CUDA TRUE
 
 /* Enable parmetis */
 #define HAVE_PARMETIS TRUE
 
 /* Enable LAPACK */
-/* #undef SLU_HAVE_LAPACK */
+#define SLU_HAVE_LAPACK TRUE
 
 /* Enable CombBLAS */
 /* #undef HAVE_COMBBLAS */
 
 /* enable 64bit index mode */
-#define XSDK_INDEX_SIZE 64
+/* #undef XSDK_INDEX_SIZE */
 
 #if (XSDK_INDEX_SIZE == 64)
 #define _LONGINT 1
diff --git a/SRC/util.c b/SRC/util.c
index b5e1c5fe..9edfcd52 100644
--- a/SRC/util.c
+++ b/SRC/util.c
@@ -434,7 +434,7 @@ void print_sp_ienv_dist(superlu_dist_options_t *options)
     printf("**    relaxation                 : " IFMT "\n", sp_ienv_dist(2));
     printf("**    max supernode              : " IFMT "\n", sp_ienv_dist(3));
     printf("**    estimated fill ratio       : " IFMT "\n", sp_ienv_dist(6));
-    printf("**    min GEMM dimension for GPU : " IFMT "\n", sp_ienv_dist(7));
+    printf("**    min GEMM m*k*n to use GPU  : " IFMT "\n", sp_ienv_dist(7));
     printf("**************************************************\n");
 }
 
diff --git a/make.inc.in b/make.inc.in
index bb82a7e4..0169a09f 100644
--- a/make.inc.in
+++ b/make.inc.in
@@ -29,7 +29,7 @@ LIBS 	    = $(DSUPERLULIB) ${BLAS_LIB_EXPORT} -lm #-lmpi
 LIBS	    += ${LAPACK_LIB_EXPORT}
 LIBS	    += ${PARMETIS_LIB_EXPORT}
 LIBS 	    += ${COMBBLAS_LIB_EXPORT}
-LIBS 	    += ${EXTRA_LIB_EXPORT}
+# LIBS 	    += ${EXTRA_LIB_EXPORT}
 LIBS        += ${EXTRA_FLIB_EXPORT}
 CUDALIBS    = ${CUDA_LIB_EXPORT}
 LIBS        += ${CUDA_LIB_EXPORT}

From 40f87eb455fb0ef59fa1ad165563c3e7c2cd757e Mon Sep 17 00:00:00 2001
From: Sherry Li 
Date: Sat, 2 Jan 2021 15:30:41 -0800
Subject: [PATCH 056/147] Created the new FORTRAN interface for the 3D code.
 Improved the FORTRAN interface for the 2D code.

---
 CMakeLists.txt                |   4 +-
 EXAMPLE/pddrive.c             |  11 +-
 EXAMPLE/pddrive1.c            |   6 +-
 EXAMPLE/pddrive2.c            |   6 +-
 EXAMPLE/pddrive3.c            |   6 +-
 EXAMPLE/pddrive3d.c           |  13 +-
 EXAMPLE/pddrive4.c            |   6 +-
 EXAMPLE/pzdrive.c             |   6 +-
 EXAMPLE/pzdrive1.c            |   6 +-
 EXAMPLE/pzdrive2.c            |   6 +-
 EXAMPLE/pzdrive3.c            |   6 +-
 EXAMPLE/pzdrive3d.c           |  13 +-
 EXAMPLE/pzdrive4.c            |   6 +-
 FORTRAN/CMakeLists.txt        |  25 ++-
 FORTRAN/Makefile              |  21 +-
 FORTRAN/README                |  12 +-
 FORTRAN/dcreate_dist_matrix.c |   3 +
 FORTRAN/f_5x5.f90             |  19 +-
 FORTRAN/f_pddrive.f90         |  50 ++---
 FORTRAN/f_pzdrive.f90         |  51 ++---
 FORTRAN/superlu_c2f_dwrap.c   | 117 ++++++++--
 FORTRAN/superlu_c2f_zwrap.c   | 117 ++++++++--
 FORTRAN/superlu_mod.f90       |  24 +-
 FORTRAN/superlupara.f90       |  44 ++--
 SRC/pdutil.c                  | 401 +++++++++++++++-------------------
 SRC/pzutil.c                  |  12 +-
 SRC/superlu_FCnames.h         |  95 ++++----
 SRC/superlu_ddefs.h           |   2 +-
 SRC/superlu_dist_config.h     |  20 --
 SRC/superlu_grid.c            |  32 ++-
 SRC/superlu_grid3d.c          |  68 +-----
 SRC/superlu_zdefs.h           |   2 +-
 SRC/util.c                    |  12 +-
 TEST/pdtest.c                 |   4 +-
 TEST/pztest.c                 |   4 +-
 35 files changed, 656 insertions(+), 574 deletions(-)
 delete mode 100644 SRC/superlu_dist_config.h

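One detail of the new interface worth noting before the diffs: the Fortran drivers hand file names to C with an explicit terminator, e.g. fname = '../EXAMPLE/g20.rua'//char(0) in the f_pddrive.f90 hunk near the end of this patch, because a Fortran CHARACTER buffer is blank-padded rather than NUL-terminated. A hedged C-side sketch of why that matters (open_matrix_file is a hypothetical stand-in for the c2f wrapper):

    #include <stdio.h>
    #include <string.h>

    /* hypothetical stand-in for the C side of f_dcreate_matrix_x_b; it may
       use strlen/fopen only because the caller NUL-terminated the buffer */
    static void open_matrix_file(const char *fname) {
        printf("C side sees '%s' (%zu chars)\n", fname, strlen(fname));
    }

    int main(void) {
        /* in C a string literal is NUL-terminated already; the Fortran
           driver gets the same effect by appending char(0) */
        char fname[] = "../EXAMPLE/g20.rua";
        open_matrix_file(fname);
        return 0;
    }
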
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6e518563..3b145e7f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -137,8 +137,8 @@ if(XSDK_INDEX_SIZE EQUAL 64)
     message("-- Using 64 bit integer for index size.")
 endif()	
 set(CMAKE_C_FLAGS_RELEASE "-O3 -g" CACHE STRING "")
-message("cmake_c_flags_release '${CMAKE_C_FLAGS_RELEASE}'")
-message("cmake_shared_library_c_flags '${CMAKE_SHARED_LIBRARY_C_FLAGS}'")
+#message("cmake_c_flags_release '${CMAKE_C_FLAGS_RELEASE}'")
+#message("cmake_shared_library_c_flags '${CMAKE_SHARED_LIBRARY_C_FLAGS}'")
 set(CMAKE_CXX_FLAGS_RELEASE "-O3 -g" CACHE STRING "")
 
 ######################################################################
diff --git a/EXAMPLE/pddrive.c b/EXAMPLE/pddrive.c
index fd143ef7..91e6bea1 100644
--- a/EXAMPLE/pddrive.c
+++ b/EXAMPLE/pddrive.c
@@ -23,7 +23,6 @@ at the top-level directory.
 
 #include 
 #include "superlu_ddefs.h"
-//#include "superlu_zdefs.h"
 
 /*! \brief
  *
@@ -140,7 +139,7 @@ int main(int argc, char *argv[])
 	
     /* Bail out if I do not belong in the grid. */
     iam = grid.iam;
-    if ( iam >= nprow * npcol )	goto out;
+    if ( iam == -1 )	goto out;
     if ( !iam ) {
 	int v_major, v_minor, v_bugfix;
 #ifdef __INTEL_COMPILER
@@ -198,12 +197,8 @@ int main(int argc, char *argv[])
 	options.DiagInv           = NO;
      */
     set_default_options_dist(&options);
-
-    options.ReplaceTinyPivot = YES;
-    options.IterRefine = NOREFINE;
-
 #if 0
-    options.RowPerm = LargeDiag_HWPM;
+    options.RowPerm = NOROWPERM;
     options.IterRefine = NOREFINE;
     options.ColPerm = NATURAL;
     options.Equil = NO; 
@@ -233,7 +228,7 @@ int main(int argc, char *argv[])
 
     /* Check the accuracy of the solution. */
     pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
-		     nrhs, b, ldb, xtrue, ldx, &grid);
+		     nrhs, b, ldb, xtrue, ldx, grid.comm);
 
     PStatPrint(&options, &stat, &grid);        /* Print the statistics. */
 
diff --git a/EXAMPLE/pddrive1.c b/EXAMPLE/pddrive1.c
index 8e5396a9..f11a2799 100644
--- a/EXAMPLE/pddrive1.c
+++ b/EXAMPLE/pddrive1.c
@@ -104,7 +104,7 @@ int main(int argc, char *argv[])
 
     /* Bail out if I do not belong in the grid. */
     iam = grid.iam;
-    if ( iam >= nprow * npcol )	goto out;
+    if ( iam == -1 )	goto out;
     if ( !iam ) {
 	int v_major, v_minor, v_bugfix;
 #ifdef __INTEL_COMPILER
@@ -189,7 +189,7 @@ int main(int argc, char *argv[])
     /* Check the accuracy of the solution. */
     if ( !iam ) printf("\tSolve the first system:\n");
     pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
-		     nrhs, b, ldb, xtrue, ldx, &grid);
+		     nrhs, b, ldb, xtrue, ldx, grid.comm);
 
     PStatPrint(&options, &stat, &grid);        /* Print the statistics. */
     PStatFree(&stat);
@@ -208,7 +208,7 @@ int main(int argc, char *argv[])
     /* Check the accuracy of the solution. */
     if ( !iam ) printf("\tSolve the system with a different B:\n");
     pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
-		     nrhs, b1, ldb, xtrue, ldx, &grid);
+		     nrhs, b1, ldb, xtrue, ldx, grid.comm);
 
     PStatPrint(&options, &stat, &grid);        /* Print the statistics. */
 
diff --git a/EXAMPLE/pddrive2.c b/EXAMPLE/pddrive2.c
index 5ad5a3ae..f60326bb 100644
--- a/EXAMPLE/pddrive2.c
+++ b/EXAMPLE/pddrive2.c
@@ -116,7 +116,7 @@ int main(int argc, char *argv[])
 
     /* Bail out if I do not belong in the grid. */
     iam = grid.iam;
-    if ( iam >= nprow * npcol )	goto out;
+    if ( iam == -1 )	goto out;
     if ( !iam ) {
 	int v_major, v_minor, v_bugfix;
 #ifdef __INTEL_COMPILER
@@ -191,7 +191,7 @@ int main(int argc, char *argv[])
             &LUstruct, &SOLVEstruct, berr, &stat, &info);
 
     /* Check the accuracy of the solution. */
-    pdinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, &grid);
+    pdinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, grid.comm);
     
     PStatPrint(&options, &stat, &grid);        /* Print the statistics. */
     PStatFree(&stat);
@@ -228,7 +228,7 @@ int main(int argc, char *argv[])
 
     /* Check the accuracy of the solution. */
     if ( !iam ) printf("Solve the system with the same sparsity pattern.\n");
-    pdinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue1, ldx, &grid);
+    pdinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue1, ldx, grid.comm);
 
 #if ( PRNTlevel>=2 )
     if (iam==0) {
diff --git a/EXAMPLE/pddrive3.c b/EXAMPLE/pddrive3.c
index 4287cae0..f2886945 100644
--- a/EXAMPLE/pddrive3.c
+++ b/EXAMPLE/pddrive3.c
@@ -113,7 +113,7 @@ int main(int argc, char *argv[])
 
     /* Bail out if I do not belong in the grid. */
     iam = grid.iam;
-    if ( iam >= nprow * npcol )	goto out;
+    if ( iam == -1 )	goto out;
     if ( !iam ) {
 	int v_major, v_minor, v_bugfix;
 #ifdef __INTEL_COMPILER
@@ -207,7 +207,7 @@ int main(int argc, char *argv[])
             &LUstruct, &SOLVEstruct, berr, &stat, &info);
 
     /* Check the accuracy of the solution. */
-    pdinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, &grid);
+    pdinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, grid.comm);
     
     PStatPrint(&options, &stat, &grid);        /* Print the statistics. */
     PStatFree(&stat);
@@ -245,7 +245,7 @@ int main(int argc, char *argv[])
     /* Check the accuracy of the solution. */
     if ( !iam )
         printf("Solve a system with the same pattern and similar values.\n");
-    pdinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue, ldx, &grid);
+    pdinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue, ldx, grid.comm);
 
     /* Print the statistics. */
     PStatPrint(&options, &stat, &grid);
diff --git a/EXAMPLE/pddrive3d.c b/EXAMPLE/pddrive3d.c
index 4f4f1a4b..1e4b0db5 100644
--- a/EXAMPLE/pddrive3d.c
+++ b/EXAMPLE/pddrive3d.c
@@ -204,8 +204,7 @@ main (int argc, char *argv[])
 	
     /* Bail out if I do not belong in the grid. */
     iam = grid.iam;
-    if (iam >= nprow * npcol *npdep)
-        goto out;
+    if (iam == -1)     goto out;
     if (!iam) {
 	int v_major, v_minor, v_bugfix;
 #ifdef __INTEL_COMPILER
@@ -242,14 +241,13 @@ main (int argc, char *argv[])
 	dcreate_matrix_postfix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, suffix, &(grid.grid2d));
 	
 #else
-    NRformat_loc *Astore, *Astore0;
-
     // *fp0 = *fp;
     dcreate_matrix_postfix3d(&A, nrhs, &b, &ldb,
                              &xtrue, &ldx, fp, suffix, &(grid));
     //printf("ldx %d, ldb %d\n", ldx, ldb);
     
 #if 0  // following code is only for checking *Gather* routine
+    NRformat_loc *Astore, *Astore0;
     double* B2d;
     NRformat_loc Atmp = dGatherNRformat_loc(
                             (NRformat_loc *) A.Store,
@@ -347,11 +345,8 @@ main (int argc, char *argv[])
                &LUstruct, &SOLVEstruct, berr, &stat, &info);
 
     /* Check the accuracy of the solution. */
-#ifndef NRFRMT
-    if ( grid.zscp.Iam == 0 )  // Process layer 0
-#endif    
-        pdinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc,
-                          nrhs, b, ldb, xtrue, ldx, &(grid.grid2d));
+    pdinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc,
+                          nrhs, b, ldb, xtrue, ldx, grid.comm);
     fflush(stdout);
 
     /* ------------------------------------------------------------
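
pdinf_norm_error now takes a bare MPI communicator (grid.comm) instead of a grid struct, which also lets the 3D driver run the check on every rank rather than only on process layer 0. An illustrative sketch, not the routine's actual implementation, of an inf-norm error check reduced over a communicator:

    #include <mpi.h>
    #include <math.h>
    #include <stdio.h>

    /* largest |b[i] - xtrue[i]| across all ranks, reduced over comm */
    static double inf_norm_error(int m_loc, const double *b,
                                 const double *xtrue, MPI_Comm comm) {
        double local = 0.0, global;
        for (int i = 0; i < m_loc; ++i) {
            double d = fabs(b[i] - xtrue[i]);
            if (d > local) local = d;
        }
        MPI_Allreduce(&local, &global, 1, MPI_DOUBLE, MPI_MAX, comm);
        return global;
    }

    int main(int argc, char *argv[]) {
        int rank;
        double b[3] = {1.0, 2.0, 3.0}, x[3] = {1.0, 2.0, 3.5};
        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        double err = inf_norm_error(3, b, x, MPI_COMM_WORLD);
        if (rank == 0) printf("max |b - xtrue| = %g\n", err);
        MPI_Finalize();
        return 0;
    }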
diff --git a/EXAMPLE/pddrive4.c b/EXAMPLE/pddrive4.c
index d7289de5..b7c35ba0 100644
--- a/EXAMPLE/pddrive4.c
+++ b/EXAMPLE/pddrive4.c
@@ -130,7 +130,7 @@ int main(int argc, char *argv[])
 
     /* Bail out if I do not belong in any of the 2 grids. */
     MPI_Comm_rank( MPI_COMM_WORLD, &iam );
-    if ( iam >= 10 ) goto out;
+    if ( iam == -1 ) goto out;
     
 #if ( DEBUGlevel>=1 )
     CHECK_MALLOC(iam, "Enter main()");
@@ -193,7 +193,7 @@ int main(int argc, char *argv[])
 
         /* Check the accuracy of the solution. */
         pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
-                         nrhs, b, ldb, xtrue, ldx, &grid1);
+                         nrhs, b, ldb, xtrue, ldx, grid1.comm);
     
 	/* Print the statistics. */
 	PStatPrint(&options, &stat, &grid1);
@@ -258,7 +258,7 @@ int main(int argc, char *argv[])
 
         /* Check the accuracy of the solution. */
         pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
-                         nrhs, b, ldb, xtrue, ldx, &grid2);
+                         nrhs, b, ldb, xtrue, ldx, grid2.comm);
     
 	/* Print the statistics. */
 	PStatPrint(&options, &stat, &grid2);
diff --git a/EXAMPLE/pzdrive.c b/EXAMPLE/pzdrive.c
index bf6c3e9b..3878558d 100644
--- a/EXAMPLE/pzdrive.c
+++ b/EXAMPLE/pzdrive.c
@@ -138,7 +138,7 @@ int main(int argc, char *argv[])
 	
     /* Bail out if I do not belong in the grid. */
     iam = grid.iam;
-    if ( iam >= nprow * npcol )	goto out;
+    if ( iam == -1 )	goto out;
     if ( !iam ) {
 	int v_major, v_minor, v_bugfix;
 #ifdef __INTEL_COMPILER
@@ -197,7 +197,7 @@ int main(int argc, char *argv[])
      */
     set_default_options_dist(&options);
 #if 0
-    options.RowPerm = LargeDiag_HWPM;
+    options.RowPerm = NOROWPERM;
     options.IterRefine = NOREFINE;
     options.ColPerm = NATURAL;
     options.Equil = NO; 
@@ -227,7 +227,7 @@ int main(int argc, char *argv[])
 
     /* Check the accuracy of the solution. */
     pzinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
-		     nrhs, b, ldb, xtrue, ldx, &grid);
+		     nrhs, b, ldb, xtrue, ldx, grid.comm);
 
     PStatPrint(&options, &stat, &grid);        /* Print the statistics. */
 
diff --git a/EXAMPLE/pzdrive1.c b/EXAMPLE/pzdrive1.c
index 6718cd0e..f01bc447 100644
--- a/EXAMPLE/pzdrive1.c
+++ b/EXAMPLE/pzdrive1.c
@@ -103,7 +103,7 @@ int main(int argc, char *argv[])
 
     /* Bail out if I do not belong in the grid. */
     iam = grid.iam;
-    if ( iam >= nprow * npcol )	goto out;
+    if ( iam == -1 )	goto out;
     if ( !iam ) {
 	int v_major, v_minor, v_bugfix;
 #ifdef __INTEL_COMPILER
@@ -188,7 +188,7 @@ int main(int argc, char *argv[])
     /* Check the accuracy of the solution. */
     if ( !iam ) printf("\tSolve the first system:\n");
     pzinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
-		     nrhs, b, ldb, xtrue, ldx, &grid);
+		     nrhs, b, ldb, xtrue, ldx, grid.comm);
 
     PStatPrint(&options, &stat, &grid);        /* Print the statistics. */
     PStatFree(&stat);
@@ -207,7 +207,7 @@ int main(int argc, char *argv[])
     /* Check the accuracy of the solution. */
     if ( !iam ) printf("\tSolve the system with a different B:\n");
     pzinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
-		     nrhs, b1, ldb, xtrue, ldx, &grid);
+		     nrhs, b1, ldb, xtrue, ldx, grid.comm);
 
     PStatPrint(&options, &stat, &grid);        /* Print the statistics. */
 
diff --git a/EXAMPLE/pzdrive2.c b/EXAMPLE/pzdrive2.c
index df9b2263..ce28390e 100644
--- a/EXAMPLE/pzdrive2.c
+++ b/EXAMPLE/pzdrive2.c
@@ -115,7 +115,7 @@ int main(int argc, char *argv[])
 
     /* Bail out if I do not belong in the grid. */
     iam = grid.iam;
-    if ( iam >= nprow * npcol )	goto out;
+    if ( iam == -1 )	goto out;
     if ( !iam ) {
 	int v_major, v_minor, v_bugfix;
 #ifdef __INTEL_COMPILER
@@ -190,7 +190,7 @@ int main(int argc, char *argv[])
             &LUstruct, &SOLVEstruct, berr, &stat, &info);
 
     /* Check the accuracy of the solution. */
-    pzinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, &grid);
+    pzinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, grid.comm);
     
     PStatPrint(&options, &stat, &grid);        /* Print the statistics. */
     PStatFree(&stat);
@@ -227,7 +227,7 @@ int main(int argc, char *argv[])
 
     /* Check the accuracy of the solution. */
     if ( !iam ) printf("Solve the system with the same sparsity pattern.\n");
-    pzinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue1, ldx, &grid);
+    pzinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue1, ldx, grid.comm);
 
 #if ( PRNTlevel>=2 )
     if (iam==0) {
diff --git a/EXAMPLE/pzdrive3.c b/EXAMPLE/pzdrive3.c
index c00415d6..47b33b43 100644
--- a/EXAMPLE/pzdrive3.c
+++ b/EXAMPLE/pzdrive3.c
@@ -112,7 +112,7 @@ int main(int argc, char *argv[])
 
     /* Bail out if I do not belong in the grid. */
     iam = grid.iam;
-    if ( iam >= nprow * npcol )	goto out;
+    if ( iam == -1 )	goto out;
     if ( !iam ) {
 	int v_major, v_minor, v_bugfix;
 #ifdef __INTEL_COMPILER
@@ -206,7 +206,7 @@ int main(int argc, char *argv[])
             &LUstruct, &SOLVEstruct, berr, &stat, &info);
 
     /* Check the accuracy of the solution. */
-    pzinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, &grid);
+    pzinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, grid.comm);
     
     PStatPrint(&options, &stat, &grid);        /* Print the statistics. */
     PStatFree(&stat);
@@ -244,7 +244,7 @@ int main(int argc, char *argv[])
     /* Check the accuracy of the solution. */
     if ( !iam )
         printf("Solve a system with the same pattern and similar values.\n");
-    pzinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue, ldx, &grid);
+    pzinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue, ldx, grid.comm);
 
     /* Print the statistics. */
     PStatPrint(&options, &stat, &grid);
diff --git a/EXAMPLE/pzdrive3d.c b/EXAMPLE/pzdrive3d.c
index 3cd975e8..24ca2777 100644
--- a/EXAMPLE/pzdrive3d.c
+++ b/EXAMPLE/pzdrive3d.c
@@ -204,8 +204,7 @@ main (int argc, char *argv[])
 	
     /* Bail out if I do not belong in the grid. */
     iam = grid.iam;
-    if (iam >= nprow * npcol *npdep)
-        goto out;
+    if (iam == -1)     goto out;
     if (!iam) {
 	int v_major, v_minor, v_bugfix;
 #ifdef __INTEL_COMPILER
@@ -242,14 +241,13 @@ main (int argc, char *argv[])
 	zcreate_matrix_postfix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, suffix, &(grid.grid2d));
 	
 #else
-    NRformat_loc *Astore, *Astore0;
-
     // *fp0 = *fp;
     zcreate_matrix_postfix3d(&A, nrhs, &b, &ldb,
                              &xtrue, &ldx, fp, suffix, &(grid));
     //printf("ldx %d, ldb %d\n", ldx, ldb);
     
 #if 0  // following code is only for checking *Gather* routine
+    NRformat_loc *Astore, *Astore0;
     doublecomplex* B2d;
     NRformat_loc Atmp = dGatherNRformat_loc(
                             (NRformat_loc *) A.Store,
@@ -347,11 +345,8 @@ main (int argc, char *argv[])
                &LUstruct, &SOLVEstruct, berr, &stat, &info);
 
     /* Check the accuracy of the solution. */
-#ifndef NRFRMT
-    if ( grid.zscp.Iam == 0 )  // Process layer 0
-#endif    
-        pzinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc,
-                          nrhs, b, ldb, xtrue, ldx, &(grid.grid2d));
+    pzinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc,
+                          nrhs, b, ldb, xtrue, ldx, grid.comm);
     fflush(stdout);
 
     /* ------------------------------------------------------------
diff --git a/EXAMPLE/pzdrive4.c b/EXAMPLE/pzdrive4.c
index 3b768f81..53e8c06c 100644
--- a/EXAMPLE/pzdrive4.c
+++ b/EXAMPLE/pzdrive4.c
@@ -129,7 +129,7 @@ int main(int argc, char *argv[])
 
     /* Bail out if I do not belong in any of the 2 grids. */
     MPI_Comm_rank( MPI_COMM_WORLD, &iam );
-    if ( iam >= 10 ) goto out;
+    if ( iam == -1 ) goto out;
     
 #if ( DEBUGlevel>=1 )
     CHECK_MALLOC(iam, "Enter main()");
@@ -192,7 +192,7 @@ int main(int argc, char *argv[])
 
         /* Check the accuracy of the solution. */
         pzinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
-                         nrhs, b, ldb, xtrue, ldx, &grid1);
+                         nrhs, b, ldb, xtrue, ldx, grid1.comm);
     
 	/* Print the statistics. */
 	PStatPrint(&options, &stat, &grid1);
@@ -257,7 +257,7 @@ int main(int argc, char *argv[])
 
         /* Check the accuracy of the solution. */
         pzinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
-                         nrhs, b, ldb, xtrue, ldx, &grid2);
+                         nrhs, b, ldb, xtrue, ldx, grid2.comm);
     
 	/* Print the statistics. */
 	PStatPrint(&options, &stat, &grid2);
diff --git a/FORTRAN/CMakeLists.txt b/FORTRAN/CMakeLists.txt
index 16c9c615..0fa977b5 100644
--- a/FORTRAN/CMakeLists.txt
+++ b/FORTRAN/CMakeLists.txt
@@ -10,7 +10,7 @@ link_directories(${MPI_Fortran_LIBRARIES})
 set(all_link_libs superlu_dist ${MPI_Fortran_LIBRARIES} ${BLAS_LIB} ${CMAKE_Fortran_IMPLICIT_LINK_LIBRARIES})
 #message("!!! in Fortran: MPI_Fortran_LINK_FLAGS='${MPI_Fortran_LINK_FLAGS}'")
 #message("!!! in Fortran: all_link_libs='${all_link_libs}'")
-message("!!! in Fortran: cxx_implicit='${CMAKE_CXX_IMPLICIT_LINK_LIBRARIES}'")
+#message("!!! in Fortran: cxx_implicit='${CMAKE_CXX_IMPLICIT_LINK_LIBRARIES}'")
 if (NOT MSVC)
   list(APPEND all_link_libs m)
 endif ()
@@ -18,24 +18,37 @@ endif ()
 set(F_MOD superlupara.f90 superlu_mod.f90)
 
 if(enable_double)
-  set(C_DWRAP dcreate_dist_matrix.c superlu_c2f_dwrap.c)
-  set(F_DEXM ${F_MOD} dhbcode1.f90 f_pddrive.f90 ${C_DWRAP})
+  set(C_DWRAP c2f_dcreate_matrix_x_b.c superlu_c2f_dwrap.c)
+  set(F_DEXM ${F_MOD} f_pddrive.f90 ${C_DWRAP})
   add_executable(f_pddrive ${F_DEXM})
   target_link_libraries(f_pddrive ${all_link_libs})
+#  set_target_properties(f_pddrive PROPERTIES LINKER_LANGUAGE Fortran)
   set_target_properties(f_pddrive PROPERTIES LINKER_LANGUAGE CXX LINK_FLAGS "${MPI_Fortran_LINK_FLAGS}")
   
+  set(F_DEXM3D ${F_MOD} f_pddrive3d.f90 ${C_DWRAP})
+  add_executable(f_pddrive3d ${F_DEXM3D})
+  target_link_libraries(f_pddrive3d ${all_link_libs})
+  set_target_properties(f_pddrive3d PROPERTIES LINKER_LANGUAGE CXX LINK_FLAGS "${MPI_Fortran_LINK_FLAGS}")
+  
   set(F_5x5 ${F_MOD} f_5x5.f90 sp_ienv.c ${C_DWRAP})
   add_executable(f_5x5 ${F_5x5})
   target_link_libraries(f_5x5 ${all_link_libs})
-#  set_target_properties(f_5x5 PROPERTIES LINKER_LANGUAGE Fortran)
   set_target_properties(f_5x5 PROPERTIES LINKER_LANGUAGE CXX LINK_FLAGS "${MPI_Fortran_LINK_FLAGS}")
+  
 endif()
 
 if(enable_complex16)
-  set(C_ZWRAP zcreate_dist_matrix.c superlu_c2f_zwrap.c)
-  set(F_ZEXM ${F_MOD} zhbcode1.f90 f_pzdrive.f90 ${C_ZWRAP})
+  set(C_ZWRAP c2f_zcreate_matrix_x_b.c superlu_c2f_zwrap.c)
+  
+  set(F_ZEXM ${F_MOD} f_pzdrive.f90 ${C_ZWRAP})
   add_executable(f_pzdrive ${F_ZEXM})
   target_link_libraries(f_pzdrive ${all_link_libs})
 #  set_target_properties(f_pzdrive PROPERTIES LINKER_LANGUAGE Fortran)
   set_target_properties(f_pzdrive PROPERTIES LINKER_LANGUAGE CXX LINK_FLAGS "${MPI_Fortran_LINK_FLAGS}")
+
+  set(F_ZEXM3D ${F_MOD} f_pzdrive3d.f90 ${C_ZWRAP})
+  add_executable(f_pzdrive3d ${F_ZEXM3D})
+  target_link_libraries(f_pzdrive3d ${all_link_libs})
+  set_target_properties(f_pzdrive3d PROPERTIES LINKER_LANGUAGE CXX LINK_FLAGS "${MPI_Fortran_LINK_FLAGS}")
+  
 endif()
diff --git a/FORTRAN/Makefile b/FORTRAN/Makefile
index d275c360..7b9a5cb0 100644
--- a/FORTRAN/Makefile
+++ b/FORTRAN/Makefile
@@ -14,14 +14,17 @@ include ../make.inc
 #F90FLAGS	= $(FFLAGS) -qfree -qsuffix=f=f90  -qflag=w:w
 
 F_MOD	= superlupara.o superlu_mod.o
-C_DWRAP	= dcreate_dist_matrix.o superlu_c2f_dwrap.o
-C_ZWRAP	= zcreate_dist_matrix.o superlu_c2f_zwrap.o
+C_DWRAP	= c2f_dcreate_matrix_x_b.o superlu_c2f_dwrap.o #dcreate_dist_matrix.o
+C_ZWRAP	= c2f_zcreate_matrix_x_b.o superlu_c2f_zwrap.o
 
-F_DEXM	= $(F_MOD) dhbcode1.o f_pddrive.o
-F_ZEXM	= $(F_MOD) zhbcode1.o f_pzdrive.o
-F_5x5 	= $(F_MOD) f_5x5.o sp_ienv.o 
+F_DEXM	= $(F_MOD) f_pddrive.o
+F_DEXM3D= $(F_MOD) f_pddrive3d.o
+F_ZEXM	= $(F_MOD) f_pzdrive.o
+F_ZEXM3D= $(F_MOD) f_pzdrive3d.o
+F_5x5 	= $(F_MOD) f_5x5.o sp_ienv.o
 
-all: f_pddrive f_pzdrive f_5x5
+
+all: f_pddrive f_pddrive3d f_pzdrive f_pzdrive3d f_5x5
 
 f_5x5: $(F_5x5) $(C_DWRAP) $(DSUPERLULIB)
 	$(LOADER) $(LOADOPTS) $(F_5x5) $(C_DWRAP) $(LIBS) -o $@
@@ -29,9 +32,15 @@ f_5x5: $(F_5x5) $(C_DWRAP) $(DSUPERLULIB)
 f_pddrive: $(F_DEXM) $(C_DWRAP) $(DSUPERLULIB)
 	$(LOADER) $(LOADOPTS) $(F_DEXM) $(C_DWRAP) $(LIBS) -o $@
 
+f_pddrive3d: $(F_DEXM3D) $(C_DWRAP) $(DSUPERLULIB)
+	$(LOADER) $(LOADOPTS) $(F_DEXM3D) $(C_DWRAP) $(LIBS) -o $@
+
 f_pzdrive: $(F_ZEXM) $(C_ZWRAP) $(DSUPERLULIB)
 	$(LOADER) $(LOADOPTS) $(F_ZEXM) $(C_ZWRAP) $(LIBS) -o $@
 
+f_pzdrive3d: $(F_ZEXM3D) $(C_ZWRAP) $(DSUPERLULIB)
+	$(LOADER) $(LOADOPTS) $(F_ZEXM3D) $(C_ZWRAP) $(LIBS) -o $@
+
 .c.o:
 	$(CC) $(CFLAGS) $(CDEFS) $(BLASDEF) -I$(INCLUDEDIR) -c $< $(VERBOSE)
 
diff --git a/FORTRAN/README b/FORTRAN/README
index 95187ab8..c26ef8c4 100644
--- a/FORTRAN/README
+++ b/FORTRAN/README
@@ -13,22 +13,22 @@ To compile the code, type 'make'
 
 There are three examples in the directory.
 
-1. f_5x5.f90:
+1. f_5x5.f90
    A small 5x5 example appeared in the SuperLU Users Guide, Section 2.2.
    To run the code, type:
       mpiexec -n 2 f_5x5
    (The example is set up to use 2 processors.)
 
-2. f_pddrive.f90
+2. f_pddrive.f90, f_pddrive3d.f90
    A real example Fortran driver routine that reads a matrix from a file
    'g20.rua' in Harwell-Boeing format.
    To run the code, type:
-      mpiexec -n 4 f_pddrive
-   (The example is set up to use 4 processors.)
+      mpiexec -n 4 f_pddrive (or f_pddrive3d)
+   (The example is set up to use 4 MPI processes.)
 
 3. f_pzdrive.f90
    A complex example Fortran driver routine that reads a matrix from a file
    'cg20.cua' in Harwell-Boeing format.
    To run the code, type:
-      mpiexec -n 4 f_pzdrive
-   (The example is set up to use 4 processors.)
+      mpiexec -n 4 f_pzdrive (or f_pzdrive3d)
+   (The example is set up to use 4 MPI processes.)
diff --git a/FORTRAN/dcreate_dist_matrix.c b/FORTRAN/dcreate_dist_matrix.c
index 3a6dde99..bf451cb9 100644
--- a/FORTRAN/dcreate_dist_matrix.c
+++ b/FORTRAN/dcreate_dist_matrix.c
@@ -118,6 +118,9 @@ int dcreate_dist_matrix(SuperMatrix *A, int_t m, int_t n, int_t nnz,
 	MPI_Bcast( colptr,  n+1, mpi_int_t,  0, grid->comm );
     }
 
+    if (iam==0) {printf("after broadcast: m %d, nnz %d\n", m,nnz); fflush(stdout);}
+
 #if 0
     nzval[0]=0.1;
 #endif
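
The hunk above sits in dcreate_dist_matrix()'s read-and-broadcast stage: process 0 reads the Harwell-Boeing file and broadcasts the global sizes and index arrays before the block rows are distributed. A minimal sketch of that pattern in plain MPI C follows; bcast_csc_skeleton() and its argument names are illustrative, not the routine's actual internals, and plain int stands in for int_t/mpi_int_t.

    #include <mpi.h>
    #include <stdlib.h>

    /* Root already holds m, n, nnz and colptr from the file; the other
       ranks learn the sizes first, allocate, then receive the pointers. */
    static void bcast_csc_skeleton(int iam, int *m, int *n, int *nnz,
                                   int **colptr, MPI_Comm comm)
    {
        MPI_Bcast(m,   1, MPI_INT, 0, comm);
        MPI_Bcast(n,   1, MPI_INT, 0, comm);
        MPI_Bcast(nnz, 1, MPI_INT, 0, comm);
        if (iam != 0)   /* non-root ranks allocate before receiving */
            *colptr = (int *) malloc((*n + 1) * sizeof(int));
        MPI_Bcast(*colptr, *n + 1, MPI_INT, 0, comm);
    }
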
diff --git a/FORTRAN/f_5x5.f90 b/FORTRAN/f_5x5.f90
index fec77adc..48e391e8 100644
--- a/FORTRAN/f_5x5.f90
+++ b/FORTRAN/f_5x5.f90
@@ -40,8 +40,8 @@ program f_5x5
       parameter ( maxn = 10, maxnz = 100, maxnrhs = 10 )
       integer colind(maxnz), rowptr(maxn+1)
       real*8  nzval(maxnz), b(maxn), berr(maxnrhs)
-      integer n, m, nnz, nrhs, ldb, nprow, npcol, init
-      integer*4 iam, info, i, ierr, ldb4
+      integer n, m, nnz, nrhs, nprow, npcol, init
+      integer*4 iam, info, i, ierr, ldb
       integer nnz_loc, m_loc, fst_row
       real*8  s, u, p, e, r, l
 
@@ -166,7 +166,6 @@ program f_5x5
          b(i) = 1.0
       enddo
       nrhs = 1
-      ldb4 = ldb
 
 ! Set the default input options
       call f_set_default_options(options)
@@ -184,7 +183,7 @@ program f_5x5
       call f_PStatInit(stat)
 
 ! Call the linear equation solver
-      call f_pdgssvx(options, A, ScalePermstruct, b, ldb4, nrhs, &
+      call f_pdgssvx(options, A, ScalePermstruct, b, ldb, nrhs, &
                      grid, LUstruct, SOLVEstruct, berr, stat, info)
 
       if (info == 0 .and. iam == 1) then
@@ -197,12 +196,12 @@ program f_5x5
       call f_PStatFree(stat)
       call f_Destroy_SuperMat_Store_dist(A)
       call f_ScalePermstructFree(ScalePermstruct)
-      call f_Destroy_LU(n, grid, LUstruct)
-      call f_LUstructFree(LUstruct)
-      call get_superlu_options(options, SolveInitialized=init)
-      if (init == YES) then
-         call f_dSolveFinalize(options, SOLVEstruct)
-      endif
+      call f_Destroy_LU_SOLVE_struct(options, n, grid, LUstruct, SOLVEstruct)
+!      call f_LUstructFree(LUstruct)
+!      call get_superlu_options(options, SolveInitialized=init)
+!      if (init == YES) then
+!         call f_dSolveFinalize(options, SOLVEstruct)
+!      endif
 
 ! Release the SuperLU process grid
 100   call f_superlu_gridexit(grid)
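
The four-step teardown deleted above (Destroy_LU, LUstructFree, option query, conditional SolveFinalize) is collapsed into the single f_Destroy_LU_SOLVE_struct call; the wrapper's C body appears later in this patch. As a sketch of the guarded pattern it implements, assuming the declarations from superlu_ddefs.h:

    #include "superlu_ddefs.h"

    /* Free the LU factors and the LUstruct unconditionally, but finalize
       the solve structure only if a solve was actually initialized. */
    static void destroy_lu_solve_sketch(superlu_dist_options_t *opt, int_t n,
                                        gridinfo_t *grid, dLUstruct_t *lu,
                                        dSOLVEstruct_t *solve)
    {
        dDestroy_LU(n, grid, lu);
        dLUstructFree(lu);
        if ( opt->SolveInitialized )
            dSolveFinalize(opt, solve);
    }
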
diff --git a/FORTRAN/f_pddrive.f90 b/FORTRAN/f_pddrive.f90
index 33803d99..aa8f7e22 100644
--- a/FORTRAN/f_pddrive.f90
+++ b/FORTRAN/f_pddrive.f90
@@ -35,9 +35,10 @@ program f_pddrive
       integer maxn, maxnz, maxnrhs
       parameter ( maxn = 10000, maxnz = 100000, maxnrhs = 10 )
       integer rowind(maxnz), colptr(maxn)
-      real*8  values(maxnz), b(maxn), berr(maxnrhs)
-      integer n, m, nnz, nprow, npcol, ldb, init
-      integer*4 iam, info, i, ierr, ldb4, nrhs
+      real*8  values(maxnz), b(maxn), berr(maxnrhs), xtrue(maxn)
+      integer n, m, nnz, nprow, npcol
+      integer*4 iam, info, i, ierr, ldb, ldx, nrhs
+      character*80 fname
 
       integer(superlu_ptr) :: grid
       integer(superlu_ptr) :: options
@@ -76,32 +77,16 @@ program f_pddrive
          write(*,*) ' Process grid ', nprow, ' X ', npcol
       endif
 
-! Read Harwell-Boeing matrix, and adjust the pointers and indices
-! to 0-based indexing, as required by C routines.
+! Read and distribute the matrix to the process grid
+      nrhs = 1
+      fname = '../EXAMPLE/g20.rua'//char(0)  !! make the string null-terminated
+      call  f_dcreate_matrix_x_b(fname, A, m, n, nnz, &
+      	                            nrhs, b, ldb, xtrue, ldx, grid)
+
       if ( iam == 0 ) then 
-         open(file = "../EXAMPLE/g20.rua", status = "old", unit = 5)
-         call dhbcode1(m, n, nnz, values, rowind, colptr)
-         close(unit = 5)
-!
-         do i = 1, n+1
-            colptr(i) = colptr(i) - 1
-         enddo
-         do i = 1, nnz
-            rowind(i) = rowind(i) - 1
-         enddo
+         write(*,*) ' Matrix A was set up: m ', m, ' nnz ', nnz
       endif
 
-! Distribute the matrix to the process gird
-      call  f_dcreate_dist_matrix(A, m, n, nnz, values, rowind, colptr, grid)
-
-! Setup the right hand side
-      call get_CompRowLoc_Matrix(A, nrow_loc=ldb)
-      do i = 1, ldb
-         b(i) = 1.0
-      enddo
-      nrhs = 1
-      ldb4 = ldb
-
 ! Set the default input options
       call f_set_default_options(options)
 
@@ -118,11 +103,13 @@ program f_pddrive
       call f_PStatInit(stat)
 
 ! Call the linear equation solver
-      call f_pdgssvx(options, A, ScalePermstruct, b, ldb4, nrhs, &
+      call f_pdgssvx(options, A, ScalePermstruct, b, ldb, nrhs, &
                      grid, LUstruct, SOLVEstruct, berr, stat, info)
 
       if (info == 0) then
-         write (*,*) 'Backward error: ', (berr(i), i = 1, nrhs)
+         if ( iam == 0 ) then
+            write (*,*) 'Backward error: ', (berr(i), i = 1, nrhs)
+         endif
       else
          write(*,*) 'INFO from f_pdgssvx = ', info
       endif
@@ -131,12 +118,7 @@ program f_pddrive
       call f_PStatFree(stat)
       call f_Destroy_CompRowLoc_Mat_dist(A)
       call f_ScalePermstructFree(ScalePermstruct)
-      call f_Destroy_LU(n, grid, LUstruct)
-      call f_LUstructFree(LUstruct)
-      call get_superlu_options(options, SolveInitialized=init)
-      if (init == YES) then
-         call f_dSolveFinalize(options, SOLVEstruct)
-      endif
+      call f_Destroy_LU_SOLVE_struct(options, n, grid, LUstruct, SOLVEstruct)
 
 ! Release the SuperLU process grid
 100   call f_superlu_gridexit(grid)
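
For orientation, here is the same sequence the rewritten driver performs, expressed against the C API. This is a sketch only: matrix and right-hand-side setup (handled by c2f_dcreate_matrix_x_b in this patch) and error handling are elided, and the v7-style d-prefixed struct init/free names are assumed.

    #include "superlu_ddefs.h"

    /* Grid -> options -> structs -> solve -> combined teardown -> exit. */
    static void pddrive_sketch(MPI_Comm world, int nprow, int npcol,
                               SuperMatrix *A, int m, int n,
                               double *b, int ldb, int nrhs, double *berr)
    {
        gridinfo_t grid;
        superlu_dist_options_t options;
        dScalePermstruct_t ScalePermstruct;
        dLUstruct_t LUstruct;
        dSOLVEstruct_t SOLVEstruct;
        SuperLUStat_t stat;
        int info;

        superlu_gridinit(world, nprow, npcol, &grid);
        if ( grid.iam == -1 ) return;   /* not in the process grid */

        set_default_options_dist(&options);
        dScalePermstructInit(m, n, &ScalePermstruct);
        dLUstructInit(n, &LUstruct);
        PStatInit(&stat);

        pdgssvx(&options, A, &ScalePermstruct, b, ldb, nrhs,
                &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info);

        PStatFree(&stat);
        dScalePermstructFree(&ScalePermstruct);
        dDestroy_LU(n, &grid, &LUstruct);   /* what f_Destroy_LU_SOLVE_struct */
        dLUstructFree(&LUstruct);           /* bundles on the C side */
        if ( options.SolveInitialized )
            dSolveFinalize(&options, &SOLVEstruct);
        superlu_gridexit(&grid);
    }
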
diff --git a/FORTRAN/f_pzdrive.f90 b/FORTRAN/f_pzdrive.f90
index 9c9db5b0..8609621d 100644
--- a/FORTRAN/f_pzdrive.f90
+++ b/FORTRAN/f_pzdrive.f90
@@ -34,9 +34,11 @@ program f_pzdrive
       integer maxn, maxnz, maxnrhs
       parameter ( maxn = 10000, maxnz = 100000, maxnrhs = 10 )
       integer rowind(maxnz), colptr(maxn)
-      double complex  values(maxnz), b(maxn), berr(maxnrhs)
-      integer n, m, nnz, nprow, npcol, ldb, init
-      integer*4 iam, info, i, ierr, ldb4, nrhs
+      double complex  values(maxnz), b(maxn), xtrue(maxn)
+      real*8 berr(maxnrhs)
+      integer n, m, nnz, nprow, npcol
+      integer*4 iam, info, i, ierr, ldb, ldx, nrhs
+      character*80 fname
 
       integer(superlu_ptr) :: grid
       integer(superlu_ptr) :: options
@@ -75,32 +77,16 @@ program f_pzdrive
          write(*,*) ' Process grid ', nprow, ' X ', npcol
       endif
 
-! Read Harwell-Boeing matrix, and adjust the pointers and indices
-! to 0-based indexing, as required by C routines.
+! Read and distribute the matrix to the process grid
+      nrhs = 1
+      fname = '../EXAMPLE/cg20.cua'//char(0)  !! make the string null-terminated
+      call  f_zcreate_matrix_x_b(fname, A, m, n, nnz, &
+      	                            nrhs, b, ldb, xtrue, ldx, grid)
+
       if ( iam == 0 ) then 
-         open(file = "../EXAMPLE/cg20.cua", status = "old", unit = 5)
-         call zhbcode1(m, n, nnz, values, rowind, colptr)
-         close(unit = 5)
-!
-         do i = 1, n+1
-            colptr(i) = colptr(i) - 1
-         enddo
-         do i = 1, nnz
-            rowind(i) = rowind(i) - 1
-         enddo
+         write(*,*) ' Matrix A was set up: m ', m, ' nnz ', nnz
       endif
 
-! Distribute the matrix to the process gird
-      call  f_zcreate_dist_matrix(A, m, n, nnz, values, rowind, colptr, grid)
-
-! Setup the right hand side
-      call get_CompRowLoc_Matrix(A, nrow_loc=ldb)
-      do i = 1, ldb
-         b(i) = 1.0
-      enddo
-      nrhs = 1
-      ldb4 = ldb
-
 ! Set the default input options
       call f_set_default_options(options)
 
@@ -117,11 +103,13 @@ program f_pzdrive
       call f_PStatInit(stat)
 
 ! Call the linear equation solver
-      call f_pzgssvx(options, A, ScalePermstruct, b, ldb4, nrhs, &
+      call f_pzgssvx(options, A, ScalePermstruct, b, ldb, nrhs, &
                      grid, LUstruct, SOLVEstruct, berr, stat, info)
 
       if (info == 0) then
-         write (*,*) 'Backward error: ', (berr(i), i = 1, nrhs)
+         if ( iam == 0 ) then
+            write (*,*) 'Backward error: ', (berr(i), i = 1, nrhs)
+         endif
       else
          write(*,*) 'INFO from f_pdgssvx = ', info
       endif
@@ -130,12 +118,7 @@ program f_pzdrive
       call f_PStatFree(stat)
       call f_Destroy_CompRowLoc_Mat_dist(A)
       call f_ScalePermstructFree(ScalePermstruct)
-      call f_Destroy_LU(n, grid, LUstruct)
-      call f_LUstructFree(LUstruct)
-      call get_superlu_options(options, SolveInitialized=init)
-      if (init == YES) then
-         call f_zSolveFinalize(options, SOLVEstruct)
-      endif
+      call f_Destroy_LU_SOLVE_struct(options, n, grid, LUstruct, SOLVEstruct)
 
 ! Release the SuperLU process grid
 100   call f_superlu_gridexit(grid)
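
Both rewritten drivers now receive xtrue from the create routines, and later in this patch pdinf_norm_error/pzinf_norm_error switch from a gridinfo_t* to a bare MPI_Comm, so one check serves the 2D and 3D drivers alike. Stripped of SuperLU types, the check reduces to the following self-contained sketch (the helper name is illustrative):

    #include <math.h>
    #include <mpi.h>

    /* Each rank finds its local max |x - xtrue| and max |x|; two MPI_MAX
       all-reduces turn those into the global relative error. */
    static double inf_norm_rel_error(int n_loc, const double *x,
                                     const double *xtrue, MPI_Comm comm)
    {
        double err = 0.0, xnorm = 0.0;
        for (int i = 0; i < n_loc; ++i) {
            double d = fabs(x[i] - xtrue[i]);
            if (d > err) err = d;
            if (fabs(x[i]) > xnorm) xnorm = fabs(x[i]);
        }
        MPI_Allreduce(MPI_IN_PLACE, &err,   1, MPI_DOUBLE, MPI_MAX, comm);
        MPI_Allreduce(MPI_IN_PLACE, &xnorm, 1, MPI_DOUBLE, MPI_MAX, comm);
        return err / xnorm;
    }
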
diff --git a/FORTRAN/superlu_c2f_dwrap.c b/FORTRAN/superlu_c2f_dwrap.c
index da0e5de7..616f8729 100644
--- a/FORTRAN/superlu_c2f_dwrap.c
+++ b/FORTRAN/superlu_c2f_dwrap.c
@@ -53,6 +53,11 @@ void f_create_gridinfo_handle(fptr *handle)
    *handle = (fptr) SUPERLU_MALLOC(sizeof(gridinfo_t));
 }
 
+void f_create_gridinfo3d_handle(fptr *handle)
+{
+   *handle = (fptr) SUPERLU_MALLOC(sizeof(gridinfo3d_t));
+}
+
 void f_create_options_handle(fptr *handle)
 {
    *handle = (fptr) SUPERLU_MALLOC(sizeof(superlu_dist_options_t));
@@ -131,6 +136,15 @@ void f_get_gridinfo(fptr *grid, int *iam, int_t *nprow, int_t *npcol)
   *nprow=((gridinfo_t *) *grid)->nprow;
 }
 
+void f_get_gridinfo3d(fptr *grid, int *iam,
+         	      int_t *nprow, int_t *npcol, int_t *npdep)
+{
+  *iam=((gridinfo3d_t *) *grid)->iam;
+  *npcol=((gridinfo3d_t *) *grid)->npcol;
+  *nprow=((gridinfo3d_t *) *grid)->nprow;
+  *npdep=((gridinfo3d_t *) *grid)->npdep;
+}
+
 void f_get_SuperMatrix(fptr *A, int_t *nrow, int_t *ncol)
 {
    *nrow = ((SuperMatrix *) *A)->nrow;
@@ -211,13 +225,17 @@ void f_set_default_options(fptr *options)
 
 void f_superlu_gridinit(int *Bcomm, int_t *nprow, int_t *npcol, fptr *grid)
 {
-  
    superlu_gridinit(f2c_comm(Bcomm), *nprow, *npcol, (gridinfo_t *) *grid);
 }
 
+void f_superlu_gridinit3d(int *Bcomm, int_t *nprow, int_t *npcol,
+   			  int_t *npdep, fptr *grid)
+{
+    superlu_gridinit3d(f2c_comm(Bcomm), *nprow, *npcol, *npdep, (gridinfo3d_t *) *grid);
+}
+
 void f_superlu_gridmap(int *Bcomm, int_t *nprow, int_t *npcol, 
-                       int_t *usermap, int_t *ldumap,
-	 fptr *grid)
+                       int_t *usermap, int_t *ldumap, fptr *grid)
 {
    superlu_gridmap(f2c_comm(Bcomm), *nprow, *npcol, usermap, *ldumap, (gridinfo_t *) *grid);
 }
@@ -261,9 +279,35 @@ void f_LUstructFree(fptr *LUstruct)
    dLUstructFree((dLUstruct_t *) *LUstruct);
 }
 
-void f_Destroy_LU(int_t *n, fptr *grid, fptr *LUstruct)
+void f_Destroy_LU_SOLVE_struct(fptr *options, int_t *n, fptr *grid,
+                               fptr *LUstruct, fptr *SOLVEstruct)
 {
-   dDestroy_LU(*n, (gridinfo_t *) *grid, (dLUstruct_t *) *LUstruct);
+    superlu_dist_options_t *opt = (superlu_dist_options_t *) *options;
+    dDestroy_LU(*n, (gridinfo_t *) *grid, (dLUstruct_t *) *LUstruct);
+    dLUstructFree((dLUstruct_t *) *LUstruct);
+    if ( opt->SolveInitialized ) {
+        dSolveFinalize(opt, (dSOLVEstruct_t *) *SOLVEstruct);
+    }
+}
+
+void f_Destroy_LU_SOLVE_struct_3d(fptr *options, int_t *n, fptr *grid,
+		                  fptr *LUstruct, fptr *SOLVEstruct)
+{
+    gridinfo3d_t *grid3d = (gridinfo3d_t *) *grid;
+    superlu_dist_options_t *opt = (superlu_dist_options_t *) *options;
+    dLUstruct_t *LUstruct_ptr = (dLUstruct_t *) *LUstruct;
+    
+    if ( grid3d->zscp.Iam == 0 ) { // process layer 0
+	dDestroy_LU(*n, &(grid3d->grid2d), LUstruct_ptr);
+	if ( opt->SolveInitialized ) {
+	    dSolveFinalize(opt, (dSOLVEstruct_t *) *SOLVEstruct);
+	}
+    } else { // process layers not equal 0
+        dDeAllocLlu_3d(*n, LUstruct_ptr, grid3d);
+        dDeAllocGlu_3d(LUstruct_ptr);
+    }
+    
+    dLUstructFree(LUstruct_ptr);
 }
 
 void f_dCreate_CompRowLoc_Mat_dist(fptr *A, int_t *m, int_t *n, int_t *nnz_loc,
@@ -307,19 +351,62 @@ void f_pdgssvx(fptr *options, fptr *A, fptr *ScalePermstruct, double *B,
 	       (gridinfo_t *) *grid);
 }
 
-/* Create the distributed matrix */
-
-void f_dcreate_dist_matrix(fptr *A, int_t *m, int_t *n, int_t *nnz,
-			   double *nzval, int_t *rowind, int_t *colptr,
-			   fptr *grid)
+void f_pdgssvx3d(fptr *options, fptr *A, fptr *ScalePermstruct,
+                 double *B, int *ldb, int *nrhs,
+                 fptr *grid, fptr *LUstruct, fptr *SOLVEstruct,
+                 double *berr, fptr *stat, int *info)
 {
-   int dcreate_dist_matrix(SuperMatrix *, int_t, int_t, int_t, double *,
-			   int_t * , int_t *, gridinfo_t *);
+    gridinfo3d_t *grid3d = (gridinfo3d_t *) *grid;
+    pdgssvx3d((superlu_dist_options_t *) *options, (SuperMatrix *) *A,
+	      (dScalePermstruct_t *) *ScalePermstruct, B, *ldb, *nrhs,
+	      grid3d, (dLUstruct_t *) *LUstruct,
+	      (dSOLVEstruct_t *) *SOLVEstruct, berr,
+	      (SuperLUStat_t *) *stat, info);
+
+    if ( grid3d->zscp.Iam == 0 ) {
+	PStatPrint((superlu_dist_options_t *) *options,
+		   (SuperLUStat_t *) *stat, &(grid3d->grid2d));
+    }
+}
 
-   dcreate_dist_matrix((SuperMatrix *) *A, (int_t) *m, *n, *nnz, 
-		       (double *) nzval, (int_t *) rowind, (int_t *) colptr,
-		       (gridinfo_t *) *grid);
+/* Create the distributed matrix */
 
+void f_dcreate_matrix_x_b(char *fname, fptr *A, int *m, int *n, int_t *nnz,
+		           int *nrhs, double *b, int *ldb,
+		           double *xtrue, int *ldx, fptr *grid)
+{
+    extern int c2f_dcreate_matrix_x_b(char *fname, int nrhs, int nprocs,
+    	                   MPI_Comm, SuperMatrix *A, int *m_g, int *n_g,
+			   int_t *nnz_g, double *rhs, int *ldb,
+			   double *x, int *ldx);
+
+    int iam, nprocs;
+    int_t nprow, npcol;
+    MPI_Comm slucomm = ((gridinfo_t *) *grid)->comm;
+    f_get_gridinfo(grid, &iam, &nprow, &npcol);
+    nprocs = nprow * npcol;
+			   
+    c2f_dcreate_matrix_x_b(fname, *nrhs, nprocs, slucomm,
+    	                   (SuperMatrix *) *A, m, n, nnz, b, ldb, xtrue, ldx);
+}
+
+void f_dcreate_matrix_x_b_3d(char *fname, fptr *A, int *m, int *n, int_t *nnz,
+		           int *nrhs, double *b, int *ldb,
+		           double *xtrue, int *ldx, fptr *grid)
+{
+    extern int c2f_dcreate_matrix_x_b(char *fname, int nrhs, int nprocs,
+    	                   MPI_Comm, SuperMatrix *A, int *m_g, int *n_g,
+			   int_t *nnz_g, double *rhs, int *ldb,
+			   double *x, int *ldx);
+
+    int iam, nprocs;
+    int_t nprow, npcol, npdep;
+    MPI_Comm slucomm = ((gridinfo3d_t *) *grid)->comm;
+    f_get_gridinfo3d(grid, &iam, &nprow, &npcol, &npdep);
+    nprocs = nprow * npcol * npdep;
+			   
+    c2f_dcreate_matrix_x_b(fname, *nrhs, nprocs, slucomm,
+    	                   (SuperMatrix *) *A, m, n, nnz, b, ldb, xtrue, ldx);
 }
 
 /* Check malloc */
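
All of the f_create_*_handle / f_get_* functions above rely on one idiom: Fortran holds only an integer(superlu_ptr) wide enough to store a C pointer, and the C side casts it back on every call. Reduced to its essentials (the stub type and names below are illustrative, not SuperLU's):

    #include <stdint.h>
    #include <stdlib.h>

    typedef int64_t fptr_t;   /* must be at least pointer-sized, like fptr */

    typedef struct { int nprow, npcol, npdep; } grid3d_stub_t;

    /* Mirrors f_create_gridinfo3d_handle: allocate, store the address. */
    static void create_handle(fptr_t *handle)
    {
        *handle = (fptr_t) malloc(sizeof(grid3d_stub_t));
    }

    /* Mirrors f_get_gridinfo3d: cast the stored address back and read. */
    static void get_npdep(fptr_t *handle, int *npdep)
    {
        *npdep = ((grid3d_stub_t *) *handle)->npdep;
    }
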
diff --git a/FORTRAN/superlu_c2f_zwrap.c b/FORTRAN/superlu_c2f_zwrap.c
index ee963113..7f5c990a 100644
--- a/FORTRAN/superlu_c2f_zwrap.c
+++ b/FORTRAN/superlu_c2f_zwrap.c
@@ -52,6 +52,11 @@ void f_create_gridinfo_handle(fptr *handle)
    *handle = (fptr) SUPERLU_MALLOC(sizeof(gridinfo_t));
 }
 
+void f_create_gridinfo3d_handle(fptr *handle)
+{
+   *handle = (fptr) SUPERLU_MALLOC(sizeof(gridinfo3d_t));
+}
+
 void f_create_options_handle(fptr *handle)
 {
    *handle = (fptr) SUPERLU_MALLOC(sizeof(superlu_dist_options_t));
@@ -130,6 +135,15 @@ void f_get_gridinfo(fptr *grid, int *iam, int_t *nprow, int_t *npcol)
   *nprow=((gridinfo_t *) *grid)->nprow;
 }
 
+void f_get_gridinfo3d(fptr *grid, int *iam,
+         	      int_t *nprow, int_t *npcol, int_t *npdep)
+{
+  *iam=((gridinfo3d_t *) *grid)->iam;
+  *npcol=((gridinfo3d_t *) *grid)->npcol;
+  *nprow=((gridinfo3d_t *) *grid)->nprow;
+  *npdep=((gridinfo3d_t *) *grid)->npdep;
+}
+
 void f_get_SuperMatrix(fptr *A, int_t *nrow, int_t *ncol)
 {
    *nrow = ((SuperMatrix *) *A)->nrow;
@@ -210,13 +224,17 @@ void f_set_default_options(fptr *options)
 
 void f_superlu_gridinit(int *Bcomm, int_t *nprow, int_t *npcol, fptr *grid)
 {
-  
    superlu_gridinit(f2c_comm(Bcomm), *nprow, *npcol, (gridinfo_t *) *grid);
 }
 
+void f_superlu_gridinit3d(int *Bcomm, int_t *nprow, int_t *npcol,
+   			  int_t *npdep, fptr *grid)
+{
+    superlu_gridinit3d(f2c_comm(Bcomm), *nprow, *npcol, *npdep, (gridinfo3d_t *) *grid);
+}
+
 void f_superlu_gridmap(int *Bcomm, int_t *nprow, int_t *npcol, 
-                       int_t *usermap, int_t *ldumap,
-	 fptr *grid)
+                       int_t *usermap, int_t *ldumap, fptr *grid)
 {
    superlu_gridmap(f2c_comm(Bcomm), *nprow, *npcol, usermap, *ldumap, (gridinfo_t *) *grid);
 }
@@ -260,9 +278,35 @@ void f_LUstructFree(fptr *LUstruct)
    zLUstructFree((zLUstruct_t *) *LUstruct);
 }
 
-void f_Destroy_LU(int_t *n, fptr *grid, fptr *LUstruct)
+void f_Destroy_LU_SOLVE_struct(fptr *options, int_t *n, fptr *grid,
+                               fptr *LUstruct, fptr *SOLVEstruct)
 {
-   zDestroy_LU(*n, (gridinfo_t *) *grid, (zLUstruct_t *) *LUstruct);
+    superlu_dist_options_t *opt = (superlu_dist_options_t *) *options;
+    zDestroy_LU(*n, (gridinfo_t *) *grid, (zLUstruct_t *) *LUstruct);
+    zLUstructFree((zLUstruct_t *) *LUstruct);
+    if ( opt->SolveInitialized ) {
+        zSolveFinalize(opt, (zSOLVEstruct_t *) *SOLVEstruct);
+    }
+}
+
+void f_Destroy_LU_SOLVE_struct_3d(fptr *options, int_t *n, fptr *grid,
+		                  fptr *LUstruct, fptr *SOLVEstruct)
+{
+    gridinfo3d_t *grid3d = (gridinfo3d_t *) *grid;
+    superlu_dist_options_t *opt = (superlu_dist_options_t *) *options;
+    zLUstruct_t *LUstruct_ptr = (zLUstruct_t *) *LUstruct;
+    
+    if ( grid3d->zscp.Iam == 0 ) { // process layer 0
+	zDestroy_LU(*n, &(grid3d->grid2d), LUstruct_ptr);
+	if ( opt->SolveInitialized ) {
+	    zSolveFinalize(opt, (zSOLVEstruct_t *) *SOLVEstruct);
+	}
+    } else { // process layers not equal 0
+        zDeAllocLlu_3d(*n, LUstruct_ptr, grid3d);
+        zDeAllocGlu_3d(LUstruct_ptr);
+    }
+    
+    zLUstructFree(LUstruct_ptr);
 }
 
 void f_zCreate_CompRowLoc_Mat_dist(fptr *A, int_t *m, int_t *n, int_t *nnz_loc,
@@ -306,19 +350,62 @@ void f_pzgssvx(fptr *options, fptr *A, fptr *ScalePermstruct, doublecomplex *B,
 	       (gridinfo_t *) *grid);
 }
 
-/* Create the distributed matrix */
-
-void f_zcreate_dist_matrix(fptr *A, int_t *m, int_t *n, int_t *nnz,
-			   doublecomplex *nzval, int_t *rowind, int_t *colptr,
-			   fptr *grid)
+void f_pzgssvx3d(fptr *options, fptr *A, fptr *ScalePermstruct,
+                 doublecomplex *B, int *ldb, int *nrhs,
+                 fptr *grid, fptr *LUstruct, fptr *SOLVEstruct,
+                 double *berr, fptr *stat, int *info)
 {
-   int zcreate_dist_matrix(SuperMatrix *, int_t, int_t, int_t, doublecomplex *,
-			   int_t * , int_t *, gridinfo_t *);
+    gridinfo3d_t *grid3d = (gridinfo3d_t *) *grid;
+    pzgssvx3d((superlu_dist_options_t *) *options, (SuperMatrix *) *A,
+	      (zScalePermstruct_t *) *ScalePermstruct, B, *ldb, *nrhs,
+	      grid3d, (zLUstruct_t *) *LUstruct,
+	      (zSOLVEstruct_t *) *SOLVEstruct, berr,
+	      (SuperLUStat_t *) *stat, info);
+
+    if ( grid3d->zscp.Iam == 0 ) {
+	PStatPrint((superlu_dist_options_t *) *options,
+		   (SuperLUStat_t *) *stat, &(grid3d->grid2d));
+    }
+}
 
-   zcreate_dist_matrix((SuperMatrix *) *A, (int_t) *m, *n, *nnz, 
-		       (doublecomplex *) nzval, (int_t *) rowind, (int_t *) colptr,
-		       (gridinfo_t *) *grid);
+/* Create the distributed matrix */
 
+void f_zcreate_matrix_x_b(char *fname, fptr *A, int *m, int *n, int_t *nnz,
+		           int *nrhs, doublecomplex *b, int *ldb,
+		           doublecomplex *xtrue, int *ldx, fptr *grid)
+{
+    extern int c2f_zcreate_matrix_x_b(char *fname, int nrhs, int nprocs,
+    	                   MPI_Comm, SuperMatrix *A, int *m_g, int *n_g,
+			   int_t *nnz_g, doublecomplex *rhs, int *ldb,
+			   doublecomplex *x, int *ldx);
+
+    int iam, nprocs;
+    int_t nprow, npcol;
+    MPI_Comm slucomm = ((gridinfo_t *) *grid)->comm;
+    f_get_gridinfo(grid, &iam, &nprow, &npcol);
+    nprocs = nprow * npcol;
+			   
+    c2f_zcreate_matrix_x_b(fname, *nrhs, nprocs, slucomm,
+    	                   (SuperMatrix *) *A, m, n, nnz, b, ldb, xtrue, ldx);
+}
+
+void f_zcreate_matrix_x_b_3d(char *fname, fptr *A, int *m, int *n, int_t *nnz,
+		           int *nrhs, doublecomplex *b, int *ldb,
+		           doublecomplex *xtrue, int *ldx, fptr *grid)
+{
+    extern int c2f_zcreate_matrix_x_b(char *fname, int nrhs, int nprocs,
+    	                   MPI_Comm, SuperMatrix *A, int *m_g, int *n_g,
+			   int_t *nnz_g, doublecomplex *rhs, int *ldb,
+			   doublecomplex *x, int *ldx);
+
+    int iam, nprocs;
+    int_t nprow, npcol, npdep;
+    MPI_Comm slucomm = ((gridinfo3d_t *) *grid)->comm;
+    f_get_gridinfo3d(grid, &iam, &nprow, &npcol, &npdep);
+    nprocs = nprow * npcol * npdep;
+			   
+    c2f_zcreate_matrix_x_b(fname, *nrhs, nprocs, slucomm,
+    	                   (SuperMatrix *) *A, m, n, nnz, b, ldb, xtrue, ldx);
 }
 
 /* Check malloc */
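
The complex wrapper receives fname exactly as the double one does: straight from Fortran, where CHARACTER data is blank-padded rather than null-terminated. The char(0) the drivers append is what lets the C side treat the buffer as an ordinary string, e.g. (illustrative helper, not the wrapper itself):

    #include <stdio.h>

    /* Safe only because the Fortran caller appended an explicit '\0'. */
    static int open_matrix_file(const char *fname)
    {
        FILE *fp = fopen(fname, "r");
        if (fp == NULL) { perror(fname); return -1; }
        /* ... read the Harwell-Boeing header here ... */
        fclose(fp);
        return 0;
    }
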
diff --git a/FORTRAN/superlu_mod.f90 b/FORTRAN/superlu_mod.f90
index bdfa8191..4a70a8f4 100644
--- a/FORTRAN/superlu_mod.f90
+++ b/FORTRAN/superlu_mod.f90
@@ -1,7 +1,12 @@
 !> @file
 !! \brief This module contains Fortran-side wrappers for the SuperLU
 !! get/set functions.
-!
+!!
+!! 
+!! -- Distributed SuperLU routine (version 7.0) --
+!! Lawrence Berkeley National Lab, Univ. of California Berkeley.
+!! Last update: December 31, 2020
+!! 
 module superlu_mod

@@ -16,13 +21,18 @@ module superlu_mod
 implicit none
 contains

-subroutine get_GridInfo(grid, iam, nprow, npcol)
-  integer(superlu_ptr) :: grid
+subroutine get_GridInfo(grid, iam, nprow, npcol, npdep)
+  integer(superlu_ptr) :: grid    !! can be 2D or 3D grid
   integer*4, optional :: iam
-  integer, optional :: nprow, npcol
-  integer :: l_iam, l_nprow, l_npcol
-
-  call f_get_gridinfo(grid, l_iam, l_nprow, l_npcol)
+  integer, optional :: nprow, npcol, npdep
+  integer :: l_iam, l_nprow, l_npcol, l_npdep
+
+  if (present(npdep)) then
+     call f_get_gridinfo3d(grid, l_iam, l_nprow, l_npcol, l_npdep)
+     npdep = l_npdep
+  else
+     call f_get_gridinfo(grid, l_iam, l_nprow, l_npcol)
+  endif

   if (present(iam)) iam = l_iam
   if (present(nprow)) nprow = l_nprow
diff --git a/FORTRAN/superlupara.f90 b/FORTRAN/superlupara.f90
index d246ae88..122265ad 100644
--- a/FORTRAN/superlupara.f90
+++ b/FORTRAN/superlupara.f90
@@ -2,6 +2,11 @@
 !! \brief This module contains some parameter used in SuperLU for
 !! Fortran90 user.
 !
+!!
+!! -- Distributed SuperLU routine (version 7.0) --
+!! Lawrence Berkeley National Lab, Univ. of California Berkeley.
+!! Last update: December 31, 2020
+!! 
module superlupara_mod @@ -24,8 +29,8 @@ module superlupara_mod !---------------------------------------------------- ! The following parameters are defined: -! These values come from superlu_defs.h. If the values in there change with -! the version of SuperLU, then they need to be changed here, too. +! These values come from superlu_enum_consts.h. If the values in there +! change, then they need to be changed here, too. integer, parameter, public :: & NO = 0, & ! yes_no_t @@ -35,8 +40,9 @@ module superlupara_mod SamePattern_SameRowPerm = 2, & FACTORED = 3, & NOROWPERM = 0, & ! rowperm_t - LargeDiag = 1, & - MY_PERMR = 2, & + LargeDiag_MC64 = 1, & + LargeDiag_HWPM = 2, & + MY_PERMR = 3, & NATURAL = 0, & ! colperm_t MMD_ATA = 1, & MMD_AT_PLUS_A = 2, & @@ -53,15 +59,26 @@ module superlupara_mod COL = 2, & BOTH = 3, & NOREFINE = 0, & ! IterRefine_t - SINGLE = 1, & - DOUBLE = 2, & - EXTRA = 3, & - LUSUP = 0, & ! MemType Need? - UCOL = 1, & - LSUB = 2, & - USUB = 3, & - SYSTEM = 0, & ! LU_space_t Need? - USER = 1 + SLU_SINGLE = 1, & + SLU_DOUBLE = 2, & + SLU_EXTRA = 3, & + USUB = 0, & ! MemType + LSUB = 1, & + UCOL = 2, & + LUSUP = 3, & + LLVL = 4, & + ULVL = 5, & + NO_MEMTYPE = 6, & + SYSTEM = 0, & ! LU_space_t + USER = 1, & + SILU = 0, & ! milu_t + SMILU_1 = 1, & + SMILU_2 = 2, & + SMILU_3 = 3 + +! These values come from supermatrix.h. If the values in there +! change, then they need to be changed here, too. + integer, parameter, public :: & SLU_NC = 0, & ! Stype_t SLU_NCP = 1, & @@ -85,7 +102,6 @@ module superlupara_mod SLU_HEL = 7, & SLU_HEU = 8 - !---------------------------------------------------- end module superlupara_mod diff --git a/SRC/pdutil.c b/SRC/pdutil.c index 1a19ba59..591c5f2a 100644 --- a/SRC/pdutil.c +++ b/SRC/pdutil.c @@ -27,10 +27,10 @@ at the top-level directory. */ int pdCompRow_loc_to_CompCol_global ( - int_t need_value, /* Input. Whether need to gather numerical values */ - SuperMatrix *A, /* Input. Distributed matrix in NRformat_loc format. */ - gridinfo_t *grid, /* Input */ - SuperMatrix *GA /* Output */ + int_t need_value, /* Input. Whether need to gather numerical values */ + SuperMatrix *A, /* Input. Distributed matrix in NRformat_loc format. */ + gridinfo_t *grid, /* Input */ + SuperMatrix *GA /* Output */ ) { NRformat_loc *Astore; @@ -44,11 +44,11 @@ int pdCompRow_loc_to_CompCol_global double *a_buf; /* Buffer to merge blocks into block columns. */ int_t *itemp; int_t *colptr_send; /* Buffer to redistribute the column pointers of the - local block rows. - Use n_loc+1 pointers for each block. */ + local block rows. + Use n_loc+1 pointers for each block. */ int_t *colptr_blk; /* The column pointers for each block, after - redistribution to the local block columns. - Use n_loc+1 pointers for each block. */ + redistribution to the local block columns. + Use n_loc+1 pointers for each block. */ int_t *rowind_recv; /* Buffer to receive the blocks of row indices. */ int_t *rowind_buf; /* Buffer to merge blocks into block columns. 
*/ int_t *fst_rows, *n_locs; @@ -81,20 +81,19 @@ int pdCompRow_loc_to_CompCol_global #if ( DEBUGlevel>=2 ) printf("Proc %d\n", grid->iam); PrintInt10("rowind_loc", nnz_loc, rowind_loc); - PrintInt10("colptr_loc", n + 1, colptr_loc); - fflush(stdout); + PrintInt10("colptr_loc", n+1, colptr_loc); #endif procs = grid->nprow * grid->npcol; - if ( !(fst_rows = (int_t *) intMalloc_dist(2 * procs)) ) - ABORT("Malloc fails for fst_rows[]"); + if ( !(fst_rows = (int_t *) intMalloc_dist(2*procs)) ) + ABORT("Malloc fails for fst_rows[]"); n_locs = fst_rows + procs; MPI_Allgather(&fst_row, 1, mpi_int_t, fst_rows, 1, mpi_int_t, - grid->comm); - for (i = 0; i < procs - 1; ++i) n_locs[i] = fst_rows[i + 1] - fst_rows[i]; - n_locs[procs - 1] = n - fst_rows[procs - 1]; - if ( !(recvcnts = SUPERLU_MALLOC(5 * procs * sizeof(int))) ) - ABORT("Malloc fails for recvcnts[]"); + grid->comm); + for (i = 0; i < procs-1; ++i) n_locs[i] = fst_rows[i+1] - fst_rows[i]; + n_locs[procs-1] = n - fst_rows[procs-1]; + if ( !(recvcnts = SUPERLU_MALLOC(5*procs * sizeof(int))) ) + ABORT("Malloc fails for recvcnts[]"); sendcnts = recvcnts + procs; rdispls = sendcnts + procs; sdispls = rdispls + procs; @@ -104,72 +103,65 @@ int pdCompRow_loc_to_CompCol_global Now the matrix view is P-by-P block-partition. */ /* n column starts for each column, and procs column ends for each block */ if ( !(colptr_send = intMalloc_dist(n + procs)) ) - ABORT("Malloc fails for colptr_send[]"); - if ( !(colptr_blk = intMalloc_dist( (((size_t) n_loc) + 1) * procs)) ) - ABORT("Malloc fails for colptr_blk[]"); - for (i = 0, j = 0; i < procs; ++i) - { - for (k = j; k < j + n_locs[i]; ++k) colptr_send[i + k] = colptr_loc[k]; - colptr_send[i + k] = colptr_loc[k]; /* Add an END marker */ - sendcnts[i] = n_locs[i] + 1; + ABORT("Malloc fails for colptr_send[]"); + if ( !(colptr_blk = intMalloc_dist( (((size_t) n_loc)+1)*procs)) ) + ABORT("Malloc fails for colptr_blk[]"); + for (i = 0, j = 0; i < procs; ++i) { + for (k = j; k < j + n_locs[i]; ++k) colptr_send[i+k] = colptr_loc[k]; + colptr_send[i+k] = colptr_loc[k]; /* Add an END marker */ + sendcnts[i] = n_locs[i] + 1; #if ( DEBUGlevel>=1 ) - assert(j == fst_rows[i]); + assert(j == fst_rows[i]); #endif - sdispls[i] = j + i; - recvcnts[i] = n_loc + 1; - rdispls[i] = i * (n_loc + 1); - j += n_locs[i]; /* First column of next block in colptr_loc[] */ + sdispls[i] = j + i; + recvcnts[i] = n_loc + 1; + rdispls[i] = i * (n_loc + 1); + j += n_locs[i]; /* First column of next block in colptr_loc[] */ } - MPI_Alltoallv(colptr_send, sendcnts, sdispls, mpi_int_t, - colptr_blk, recvcnts, rdispls, mpi_int_t, grid->comm); - + colptr_blk, recvcnts, rdispls, mpi_int_t, grid->comm); + /* Adjust colptr_blk[] so that they contain the local indices of the column pointers in the receive buffer. 
*/ nnz = 0; /* The running sum of the nonzeros counted by far */ k = 0; - for (i = 0; i < procs; ++i) - { - for (j = rdispls[i]; j < rdispls[i] + n_loc; ++j) - { - colnnz = colptr_blk[j + 1] - colptr_blk[j]; - /*assert(k<=j);*/ - colptr_blk[k] = nnz; - nnz += colnnz; /* Start of the next column */ - ++k; - } - colptr_blk[k++] = nnz; /* Add an END marker for each block */ + for (i = 0; i < procs; ++i) { + for (j = rdispls[i]; j < rdispls[i] + n_loc; ++j) { + colnnz = colptr_blk[j+1] - colptr_blk[j]; + /*assert(k<=j);*/ + colptr_blk[k] = nnz; + nnz += colnnz; /* Start of the next column */ + ++k; + } + colptr_blk[k++] = nnz; /* Add an END marker for each block */ } /*assert(k == (n_loc+1)*procs);*/ /* Now prepare to transfer row indices and values. */ sdispls[0] = 0; - for (i = 0; i < procs - 1; ++i) - { - sendcnts[i] = colptr_loc[fst_rows[i + 1]] - colptr_loc[fst_rows[i]]; - sdispls[i + 1] = sdispls[i] + sendcnts[i]; + for (i = 0; i < procs-1; ++i) { + sendcnts[i] = colptr_loc[fst_rows[i+1]] - colptr_loc[fst_rows[i]]; + sdispls[i+1] = sdispls[i] + sendcnts[i]; } - sendcnts[procs - 1] = colptr_loc[n] - colptr_loc[fst_rows[procs - 1]]; - for (i = 0; i < procs; ++i) - { + sendcnts[procs-1] = colptr_loc[n] - colptr_loc[fst_rows[procs-1]]; + for (i = 0; i < procs; ++i) { j = rdispls[i]; /* Point to this block in colptr_blk[]. */ - recvcnts[i] = colptr_blk[j + n_loc] - colptr_blk[j]; + recvcnts[i] = colptr_blk[j+n_loc] - colptr_blk[j]; } rdispls[0] = 0; /* Recompute rdispls[] for row indices. */ - for (i = 0; i < procs - 1; ++i) rdispls[i + 1] = rdispls[i] + recvcnts[i]; + for (i = 0; i < procs-1; ++i) rdispls[i+1] = rdispls[i] + recvcnts[i]; - k = rdispls[procs - 1] + recvcnts[procs - 1]; /* Total received */ - if ( !(rowind_recv = (int_t *) intMalloc_dist(2 * k)) ) + k = rdispls[procs-1] + recvcnts[procs-1]; /* Total received */ + if ( !(rowind_recv = (int_t *) intMalloc_dist(2*k)) ) ABORT("Malloc fails for rowind_recv[]"); rowind_buf = rowind_recv + k; MPI_Alltoallv(rowind_loc, sendcnts, sdispls, mpi_int_t, - rowind_recv, recvcnts, rdispls, mpi_int_t, grid->comm); - if ( need_value ) - { - if ( !(a_recv = (double *) doubleMalloc_dist(2 * k)) ) - ABORT("Malloc fails for rowind_recv[]"); - a_buf = a_recv + k; - MPI_Alltoallv(a_loc, sendcnts, sdispls, MPI_DOUBLE, + rowind_recv, recvcnts, rdispls, mpi_int_t, grid->comm); + if ( need_value ) { + if ( !(a_recv = (double *) doubleMalloc_dist(2*k)) ) + ABORT("Malloc fails for rowind_recv[]"); + a_buf = a_recv + k; + MPI_Alltoallv(a_loc, sendcnts, sdispls, MPI_DOUBLE, a_recv, recvcnts, rdispls, MPI_DOUBLE, grid->comm); } @@ -177,48 +169,39 @@ int pdCompRow_loc_to_CompCol_global /* Reset colptr_loc[] to point to the n_loc global columns. */ colptr_loc[0] = 0; itemp = colptr_send; - for (j = 0; j < n_loc; ++j) - { + for (j = 0; j < n_loc; ++j) { colnnz = 0; - for (i = 0; i < procs; ++i) - { - k = i * (n_loc + 1) + j; /* j-th column in i-th block */ - colnnz += colptr_blk[k + 1] - colptr_blk[k]; - } - colptr_loc[j + 1] = colptr_loc[j] + colnnz; - itemp[j] = colptr_loc[j]; /* Save a copy of the column starts */ + for (i = 0; i < procs; ++i) { + k = i * (n_loc + 1) + j; /* j-th column in i-th block */ + colnnz += colptr_blk[k+1] - colptr_blk[k]; + } + colptr_loc[j+1] = colptr_loc[j] + colnnz; + itemp[j] = colptr_loc[j]; /* Save a copy of the column starts */ } itemp[n_loc] = colptr_loc[n_loc]; /* Merge blocks of row indices into columns of row indices. 
*/ - for (i = 0; i < procs; ++i) - { + for (i = 0; i < procs; ++i) { k = i * (n_loc + 1); - for (j = 0; j < n_loc; ++j) /* i-th block */ - { - for (l = colptr_blk[k + j]; l < colptr_blk[k + j + 1]; ++l) - { - rowind_buf[itemp[j]] = rowind_recv[l]; - ++itemp[j]; - } - } + for (j = 0; j < n_loc; ++j) { /* i-th block */ + for (l = colptr_blk[k+j]; l < colptr_blk[k+j+1]; ++l) { + rowind_buf[itemp[j]] = rowind_recv[l]; + ++itemp[j]; + } + } } - if ( need_value ) - { - for (j = 0; j < n_loc + 1; ++j) itemp[j] = colptr_loc[j]; - for (i = 0; i < procs; ++i) - { - k = i * (n_loc + 1); - for (j = 0; j < n_loc; ++j) /* i-th block */ - { - for (l = colptr_blk[k + j]; l < colptr_blk[k + j + 1]; ++l) - { - a_buf[itemp[j]] = a_recv[l]; - ++itemp[j]; - } - } - } + if ( need_value ) { + for (j = 0; j < n_loc+1; ++j) itemp[j] = colptr_loc[j]; + for (i = 0; i < procs; ++i) { + k = i * (n_loc + 1); + for (j = 0; j < n_loc; ++j) { /* i-th block */ + for (l = colptr_blk[k+j]; l < colptr_blk[k+j+1]; ++l) { + a_buf[itemp[j]] = a_recv[l]; + ++itemp[j]; + } + } + } } /* ------------------------------------------------------------ @@ -240,54 +223,48 @@ int pdCompRow_loc_to_CompCol_global if ( !(GAstore->rowind = (int_t *) intMalloc_dist (nnz)) ) ABORT ("SUPERLU_MALLOC fails for GAstore->rowind[]"); - if ( !(GAstore->colptr = (int_t *) intMalloc_dist (n + 1)) ) + if ( !(GAstore->colptr = (int_t *) intMalloc_dist (n+1)) ) ABORT ("SUPERLU_MALLOC fails for GAstore->colptr[]"); /* Allgatherv for row indices. */ rdispls[0] = 0; - for (i = 0; i < procs - 1; ++i) - { - rdispls[i + 1] = rdispls[i] + itemp[i]; + for (i = 0; i < procs-1; ++i) { + rdispls[i+1] = rdispls[i] + itemp[i]; itemp_32[i] = itemp[i]; } - itemp_32[procs - 1] = itemp[procs - 1]; + itemp_32[procs-1] = itemp[procs-1]; it = nnz_loc; MPI_Allgatherv(rowind_buf, it, mpi_int_t, GAstore->rowind, - itemp_32, rdispls, mpi_int_t, grid->comm); - if ( need_value ) - { - if ( !(GAstore->nzval = (double *) doubleMalloc_dist (nnz)) ) - ABORT ("SUPERLU_MALLOC fails for GAstore->rnzval[]"); - MPI_Allgatherv(a_buf, it, MPI_DOUBLE, GAstore->nzval, - itemp_32, rdispls, MPI_DOUBLE, grid->comm); - } - else GAstore->nzval = NULL; + itemp_32, rdispls, mpi_int_t, grid->comm); + if ( need_value ) { + if ( !(GAstore->nzval = (double *) doubleMalloc_dist (nnz)) ) + ABORT ("SUPERLU_MALLOC fails for GAstore->rnzval[]"); + MPI_Allgatherv(a_buf, it, MPI_DOUBLE, GAstore->nzval, + itemp_32, rdispls, MPI_DOUBLE, grid->comm); + } else GAstore->nzval = NULL; /* Now gather the column pointers. */ rdispls[0] = 0; - for (i = 0; i < procs - 1; ++i) - { - rdispls[i + 1] = rdispls[i] + n_locs[i]; + for (i = 0; i < procs-1; ++i) { + rdispls[i+1] = rdispls[i] + n_locs[i]; itemp_32[i] = n_locs[i]; } - itemp_32[procs - 1] = n_locs[procs - 1]; + itemp_32[procs-1] = n_locs[procs-1]; MPI_Allgatherv(colptr_loc, n_loc, mpi_int_t, GAstore->colptr, - itemp_32, rdispls, mpi_int_t, grid->comm); + itemp_32, rdispls, mpi_int_t, grid->comm); /* Recompute column pointers. 
*/ - for (i = 1; i < procs; ++i) - { + for (i = 1; i < procs; ++i) { k = rdispls[i]; - for (j = 0; j < n_locs[i]; ++j) GAstore->colptr[k++] += itemp[i - 1]; - itemp[i] += itemp[i - 1]; /* prefix sum */ + for (j = 0; j < n_locs[i]; ++j) GAstore->colptr[k++] += itemp[i-1]; + itemp[i] += itemp[i-1]; /* prefix sum */ } GAstore->colptr[n] = nnz; #if ( DEBUGlevel>=2 ) - if ( !grid->iam ) - { + if ( !grid->iam ) { printf("After pdCompRow_loc_to_CompCol_global()\n"); - dPrint_CompCol_Matrix_dist(GA); + dPrint_CompCol_Matrix_dist(GA); } #endif @@ -312,14 +289,14 @@ int pdCompRow_loc_to_CompCol_global */ int pdPermute_Dense_Matrix ( - int_t fst_row, - int_t m_loc, - int_t row_to_proc[], - int_t perm[], - double X[], int ldx, - double B[], int ldb, - int nrhs, - gridinfo_t *grid + int_t fst_row, + int_t m_loc, + int_t row_to_proc[], + int_t perm[], + double X[], int ldx, + double B[], int ldb, + int nrhs, + gridinfo_t *grid ) { int_t i, j, k, l; @@ -335,7 +312,7 @@ int pdPermute_Dense_Matrix #endif procs = grid->nprow * grid->npcol; - if ( !(sendcnts = SUPERLU_MALLOC(10 * procs * sizeof(int))) ) + if ( !(sendcnts = SUPERLU_MALLOC(10*procs * sizeof(int))) ) ABORT("Malloc fails for sendcnts[]."); sendcnts_nrhs = sendcnts + procs; recvcnts = sendcnts_nrhs + procs; @@ -350,72 +327,65 @@ int pdPermute_Dense_Matrix for (i = 0; i < procs; ++i) sendcnts[i] = 0; /* Count the number of X entries to be sent to each process.*/ - for (i = fst_row; i < fst_row + m_loc; ++i) - { + for (i = fst_row; i < fst_row + m_loc; ++i) { p = row_to_proc[perm[i]]; - ++sendcnts[p]; + ++sendcnts[p]; } MPI_Alltoall(sendcnts, 1, MPI_INT, recvcnts, 1, MPI_INT, grid->comm); sdispls[0] = rdispls[0] = 0; sdispls_nrhs[0] = rdispls_nrhs[0] = 0; sendcnts_nrhs[0] = sendcnts[0] * nrhs; recvcnts_nrhs[0] = recvcnts[0] * nrhs; - for (i = 1; i < procs; ++i) - { - sdispls[i] = sdispls[i - 1] + sendcnts[i - 1]; - sdispls_nrhs[i] = sdispls[i] * nrhs; - rdispls[i] = rdispls[i - 1] + recvcnts[i - 1]; - rdispls_nrhs[i] = rdispls[i] * nrhs; - sendcnts_nrhs[i] = sendcnts[i] * nrhs; - recvcnts_nrhs[i] = recvcnts[i] * nrhs; + for (i = 1; i < procs; ++i) { + sdispls[i] = sdispls[i-1] + sendcnts[i-1]; + sdispls_nrhs[i] = sdispls[i] * nrhs; + rdispls[i] = rdispls[i-1] + recvcnts[i-1]; + rdispls_nrhs[i] = rdispls[i] * nrhs; + sendcnts_nrhs[i] = sendcnts[i] * nrhs; + recvcnts_nrhs[i] = recvcnts[i] * nrhs; } - k = sdispls[procs - 1] + sendcnts[procs - 1]; /* Total number of sends */ - l = rdispls[procs - 1] + recvcnts[procs - 1]; /* Total number of recvs */ + k = sdispls[procs-1] + sendcnts[procs-1];/* Total number of sends */ + l = rdispls[procs-1] + recvcnts[procs-1];/* Total number of recvs */ /*assert(k == m_loc);*/ /*assert(l == m_loc);*/ if ( !(send_ibuf = intMalloc_dist(k + l)) ) ABORT("Malloc fails for send_ibuf[]."); recv_ibuf = send_ibuf + k; - if ( !(send_dbuf = doubleMalloc_dist((k + l) * nrhs)) ) + if ( !(send_dbuf = doubleMalloc_dist((k + l)*nrhs)) ) ABORT("Malloc fails for send_dbuf[]."); recv_dbuf = send_dbuf + k * nrhs; - for (i = 0; i < procs; ++i) - { + for (i = 0; i < procs; ++i) { ptr_to_ibuf[i] = sdispls[i]; - ptr_to_dbuf[i] = sdispls_nrhs[i]; + ptr_to_dbuf[i] = sdispls_nrhs[i]; } /* Fill in the send buffers: send_ibuf[] and send_dbuf[]. 
*/ - for (i = fst_row; i < fst_row + m_loc; ++i) - { + for (i = fst_row; i < fst_row + m_loc; ++i) { j = perm[i]; - p = row_to_proc[j]; - send_ibuf[ptr_to_ibuf[p]] = j; - j = ptr_to_dbuf[p]; - RHS_ITERATE(k) /* RHS stored in row major in the buffer */ - { - send_dbuf[j++] = X[i - fst_row + k * ldx]; - } - ++ptr_to_ibuf[p]; - ptr_to_dbuf[p] += nrhs; + p = row_to_proc[j]; + send_ibuf[ptr_to_ibuf[p]] = j; + j = ptr_to_dbuf[p]; + RHS_ITERATE(k) { /* RHS stored in row major in the buffer */ + send_dbuf[j++] = X[i-fst_row + k*ldx]; + } + ++ptr_to_ibuf[p]; + ptr_to_dbuf[p] += nrhs; } /* Transfer the (permuted) row indices and numerical values. */ MPI_Alltoallv(send_ibuf, sendcnts, sdispls, mpi_int_t, - recv_ibuf, recvcnts, rdispls, mpi_int_t, grid->comm); + recv_ibuf, recvcnts, rdispls, mpi_int_t, grid->comm); MPI_Alltoallv(send_dbuf, sendcnts_nrhs, sdispls_nrhs, MPI_DOUBLE, - recv_dbuf, recvcnts_nrhs, rdispls_nrhs, MPI_DOUBLE, - grid->comm); + recv_dbuf, recvcnts_nrhs, rdispls_nrhs, MPI_DOUBLE, + grid->comm); /* Copy the buffer into b. */ - for (i = 0, l = 0; i < m_loc; ++i) - { + for (i = 0, l = 0; i < m_loc; ++i) { j = recv_ibuf[i] - fst_row; /* Relative row number */ - RHS_ITERATE(k) /* RHS stored in row major in the buffer */ - { - B[j + k * ldb] = recv_dbuf[l++]; - } + RHS_ITERATE(k) { /* RHS stored in row major in the buffer */ + B[j + k*ldb] = recv_dbuf[l++]; + } } SUPERLU_FREE(sendcnts); @@ -733,7 +703,7 @@ int dSolveInit(superlu_dist_options_t *options, SuperMatrix *A, procs = grid->nprow * grid->npcol; if ( !(row_to_proc = intMalloc_dist(A->nrow)) ) - ABORT("Malloc fails for row_to_proc[]"); + ABORT("Malloc fails for row_to_proc[]"); SOLVEstruct->row_to_proc = row_to_proc; if ( !(inv_perm_c = intMalloc_dist(A->ncol)) ) ABORT("Malloc fails for inv_perm_c[]."); @@ -747,21 +717,19 @@ int dSolveInit(superlu_dist_options_t *options, SuperMatrix *A, NOTE: For those processes that do not own any row, it must must be set so that fst_row == A->nrow. ------------------------------------------------------------*/ - if ( !(itemp = intMalloc_dist(procs + 1)) ) + if ( !(itemp = intMalloc_dist(procs+1)) ) ABORT("Malloc fails for itemp[]"); MPI_Allgather(&fst_row, 1, mpi_int_t, itemp, 1, mpi_int_t, - grid->comm); + grid->comm); itemp[procs] = A->nrow; - for (p = 0; p < procs; ++p) - { - for (i = itemp[p] ; i < itemp[p + 1]; ++i) row_to_proc[i] = p; + for (p = 0; p < procs; ++p) { + for (i = itemp[p] ; i < itemp[p+1]; ++i) row_to_proc[i] = p; } #if ( DEBUGlevel>=2 ) - if ( !grid->iam ) - { - printf("fst_row = %d\n", fst_row); - PrintInt10("row_to_proc", A->nrow, row_to_proc); - PrintInt10("inv_perm_c", A->ncol, inv_perm_c); + if ( !grid->iam ) { + printf("fst_row = %d\n", fst_row); + PrintInt10("row_to_proc", A->nrow, row_to_proc); + PrintInt10("inv_perm_c", A->ncol, inv_perm_c); } #endif SUPERLU_FREE(itemp); @@ -772,34 +740,32 @@ int dSolveInit(superlu_dist_options_t *options, SuperMatrix *A, than total Procs? For the processes without any row, let fst_row be EMPTY (-1). Make sure this case works! 
*/ MPI_Allgather(&fst_row, 1, mpi_int_t, itemp, 1, mpi_int_t, - grid->comm); + grid->comm); itemp[procs] = n; - for (p = 0; p < procs; ++p) - { + for (p = 0; p < procs; ++p) { j = itemp[p]; - if ( j != EMPTY ) - { - k = itemp[p + 1]; - if ( k == EMPTY ) k = n; - for (i = j ; i < k; ++i) row_to_proc[i] = p; - } + if ( j != EMPTY ) { + k = itemp[p+1]; + if ( k == EMPTY ) k = n; + for (i = j ; i < k; ++i) row_to_proc[i] = p; + } } #endif get_diag_procs(A->ncol, LUstruct->Glu_persist, grid, - &SOLVEstruct->num_diag_procs, - &SOLVEstruct->diag_procs, - &SOLVEstruct->diag_len); + &SOLVEstruct->num_diag_procs, + &SOLVEstruct->diag_procs, + &SOLVEstruct->diag_len); /* Setup communication pattern for redistribution of B and X. */ if ( !(SOLVEstruct->gstrs_comm = (pxgstrs_comm_t *) - SUPERLU_MALLOC(sizeof(pxgstrs_comm_t))) ) + SUPERLU_MALLOC(sizeof(pxgstrs_comm_t))) ) ABORT("Malloc fails for gstrs_comm[]"); pdgstrs_init(A->ncol, m_loc, nrhs, fst_row, perm_r, perm_c, grid, LUstruct->Glu_persist, SOLVEstruct); if ( !(SOLVEstruct->gsmv_comm = (pdgsmv_comm_t *) - SUPERLU_MALLOC(sizeof(pdgsmv_comm_t))) ) + SUPERLU_MALLOC(sizeof(pdgsmv_comm_t))) ) ABORT("Malloc fails for gsmv_comm[]"); SOLVEstruct->A_colind_gsmv = NULL; @@ -811,56 +777,51 @@ int dSolveInit(superlu_dist_options_t *options, SuperMatrix *A, */ void dSolveFinalize(superlu_dist_options_t *options, dSOLVEstruct_t *SOLVEstruct) { - int_t *it; - pxgstrs_finalize(SOLVEstruct->gstrs_comm); - if ( options->RefineInitialized ) - { + if ( options->RefineInitialized ) { pdgsmv_finalize(SOLVEstruct->gsmv_comm); - options->RefineInitialized = NO; + options->RefineInitialized = NO; } SUPERLU_FREE(SOLVEstruct->gsmv_comm); SUPERLU_FREE(SOLVEstruct->row_to_proc); SUPERLU_FREE(SOLVEstruct->inv_perm_c); SUPERLU_FREE(SOLVEstruct->diag_procs); SUPERLU_FREE(SOLVEstruct->diag_len); - if ( (it = SOLVEstruct->A_colind_gsmv) ) SUPERLU_FREE(it); + if ( SOLVEstruct->A_colind_gsmv ) SUPERLU_FREE(SOLVEstruct->A_colind_gsmv); options->SolveInitialized = NO; } /* dSolveFinalize */ /*! \brief Check the inf-norm of the error vector */ void pdinf_norm_error(int iam, int_t n, int_t nrhs, double x[], int_t ldx, - double xtrue[], int_t ldxtrue, gridinfo_t *grid) + double xtrue[], int_t ldxtrue, MPI_Comm slucomm) { double err, xnorm, temperr, tempxnorm; double *x_work, *xtrue_work; int i, j; - for (j = 0; j < nrhs; j++) - { - x_work = &x[j * ldx]; - xtrue_work = &xtrue[j * ldxtrue]; - err = xnorm = 0.0; - for (i = 0; i < n; i++) - { - err = SUPERLU_MAX(err, fabs(x_work[i] - xtrue_work[i])); - xnorm = SUPERLU_MAX(xnorm, fabs(x_work[i])); - } - - /* get the golbal max err & xnrom */ - temperr = err; - tempxnorm = xnorm; - MPI_Allreduce( &temperr, &err, 1, MPI_DOUBLE, MPI_MAX, grid->comm); - MPI_Allreduce( &tempxnorm, &xnorm, 1, MPI_DOUBLE, MPI_MAX, grid->comm); - - err = err / xnorm; - if ( !iam ) printf("\tSol %2d: ||X-Xtrue||/||X|| = %e\n", j, err); + for (j = 0; j < nrhs; j++) { + x_work = &x[j*ldx]; + xtrue_work = &xtrue[j*ldxtrue]; + err = xnorm = 0.0; + for (i = 0; i < n; i++) { + err = SUPERLU_MAX(err, fabs(x_work[i] - xtrue_work[i])); + xnorm = SUPERLU_MAX(xnorm, fabs(x_work[i])); + } + + /* get the golbal max err & xnrom */ + temperr = err; + tempxnorm = xnorm; + MPI_Allreduce( &temperr, &err, 1, MPI_DOUBLE, MPI_MAX, slucomm); + MPI_Allreduce( &tempxnorm, &xnorm, 1, MPI_DOUBLE, MPI_MAX, slucomm); + + err = err / xnorm; + if ( !iam ) printf("\tSol %2d: ||X-Xtrue||/||X|| = %e\n", j, err); } } -/*! \brief Destroy distributed L & U matrices. */ +/*! 
\brief Destroy broadcast and reduction trees used in triangular solve */ void dDestroy_Tree(int_t n, gridinfo_t *grid, dLUstruct_t *LUstruct) { diff --git a/SRC/pzutil.c b/SRC/pzutil.c index 8fdedc7f..bd77323e 100644 --- a/SRC/pzutil.c +++ b/SRC/pzutil.c @@ -776,8 +776,6 @@ int zSolveInit(superlu_dist_options_t *options, SuperMatrix *A, */ void zSolveFinalize(superlu_dist_options_t *options, zSOLVEstruct_t *SOLVEstruct) { - int_t *it; - pxgstrs_finalize(SOLVEstruct->gstrs_comm); if ( options->RefineInitialized ) { @@ -789,14 +787,14 @@ void zSolveFinalize(superlu_dist_options_t *options, zSOLVEstruct_t *SOLVEstruct SUPERLU_FREE(SOLVEstruct->inv_perm_c); SUPERLU_FREE(SOLVEstruct->diag_procs); SUPERLU_FREE(SOLVEstruct->diag_len); - if ( (it = SOLVEstruct->A_colind_gsmv) ) SUPERLU_FREE(it); + if ( SOLVEstruct->A_colind_gsmv ) SUPERLU_FREE(SOLVEstruct->A_colind_gsmv); options->SolveInitialized = NO; } /* zSolveFinalize */ /*! \brief Check the inf-norm of the error vector */ void pzinf_norm_error(int iam, int_t n, int_t nrhs, doublecomplex x[], int_t ldx, - doublecomplex xtrue[], int_t ldxtrue, gridinfo_t *grid) + doublecomplex xtrue[], int_t ldxtrue, MPI_Comm slucomm) { double err, xnorm, temperr, tempxnorm; doublecomplex *x_work, *xtrue_work; @@ -816,15 +814,15 @@ void pzinf_norm_error(int iam, int_t n, int_t nrhs, doublecomplex x[], int_t ldx /* get the golbal max err & xnrom */ temperr = err; tempxnorm = xnorm; - MPI_Allreduce( &temperr, &err, 1, MPI_DOUBLE, MPI_MAX, grid->comm); - MPI_Allreduce( &tempxnorm, &xnorm, 1, MPI_DOUBLE, MPI_MAX, grid->comm); + MPI_Allreduce( &temperr, &err, 1, MPI_DOUBLE, MPI_MAX, slucomm); + MPI_Allreduce( &tempxnorm, &xnorm, 1, MPI_DOUBLE, MPI_MAX, slucomm); err = err / xnorm; if ( !iam ) printf("\tSol %2d: ||X-Xtrue||/||X|| = %e\n", j, err); } } -/*! \brief Destroy distributed L & U matrices. */ +/*! \brief Destroy broadcast and reduction trees used in triangular solve */ void zDestroy_Tree(int_t n, gridinfo_t *grid, zLUstruct_t *LUstruct) { diff --git a/SRC/superlu_FCnames.h b/SRC/superlu_FCnames.h index 9bd95e2a..7b5d2825 100644 --- a/SRC/superlu_FCnames.h +++ b/SRC/superlu_FCnames.h @@ -24,49 +24,58 @@ at the top-level directory. 
#define __SUPERLU_FCNAMES /* These are the functions defined in F90 wraper */ -#define f_create_gridinfo_handle FC_GLOBAL(f_create_gridinfo_handle,F_CREATE_GRIDINFO_HANDLE) -#define f_create_options_handle FC_GLOBAL(f_create_options_handle,F_CREATE_OPTIONS_HANDLE) -#define f_create_ScalePerm_handle FC_GLOBAL(f_create_scaleperm_handle,F_CREATE_SCALEPERM_HANDLE) -#define f_create_LUstruct_handle FC_GLOBAL(f_create_lustruct_handle,F_CREATE_LUSTRUCT_HANDLE) -#define f_create_SOLVEstruct_handle FC_GLOBAL(f_create_solvestruct_handle,F_CREATE_SOLVESTRUCT_HANDLE) -#define f_create_SuperMatrix_handle FC_GLOBAL(f_create_supermatrix_handle,F_CREATE_SUPERMATRIX_HANDLE) -#define f_destroy_gridinfo_handle FC_GLOBAL(f_destroy_gridinfo_handle,F_DESTROY_GRIDINFO_HANDLE) -#define f_destroy_options_handle FC_GLOBAL(f_destroy_options_handle,F_DESTROY_OPTIONS_HANDLE) -#define f_destroy_ScalePerm_handle FC_GLOBAL(f_destroy_scaleperm_handle,F_DESTROY_SCALEPERM_HANDLE) -#define f_destroy_LUstruct_handle FC_GLOBAL(f_destroy_lustruct_handle,F_DESTROY_LUSTRUCT_HANDLE) -#define f_destroy_SOLVEstruct_handle FC_GLOBAL(f_destroy_solvestruct_handle,F_DESTROY_SOLVESTRUCT_HANDLE) -#define f_destroy_SuperMatrix_handle FC_GLOBAL(f_destroy_supermatrix_handle,F_DESTROY_SUPERMATRIX_HANDLE) -#define f_create_SuperLUStat_handle FC_GLOBAL(f_create_superlustat_handle,F_CREATE_SUPERLUSTAT_HANDLE) -#define f_destroy_SuperLUStat_handle FC_GLOBAL(f_destroy_superlustat_handle,F_DESTROY_SUPERLUSTAT_HANDLE) -#define f_get_gridinfo FC_GLOBAL(f_get_gridinfo,F_GET_GRIDINFO) -#define f_get_SuperMatrix FC_GLOBAL(f_get_supermatrix,F_GET_SUPERMATRIX) -#define f_set_SuperMatrix FC_GLOBAL(f_set_supermatrix,F_SET_SUPERMATRIX) -#define f_get_CompRowLoc_Matrix FC_GLOBAL(f_get_comprowloc_matrix,F_GET_COMPROWLOC_MATRIX) -#define f_set_CompRowLoc_Matrix FC_GLOBAL(f_set_comprowloc_matrix,F_SET_COMPROWLOC_MATRIX) -#define f_get_superlu_options FC_GLOBAL(f_get_superlu_options,F_GET_SUPERLU_OPTIONS) -#define f_set_superlu_options FC_GLOBAL(f_set_superlu_options,F_SET_SUPERLU_OPTIONS) -#define f_set_default_options FC_GLOBAL(f_set_default_options,F_SET_DEFAULT_OPTIONS) -#define f_superlu_gridinit FC_GLOBAL(f_superlu_gridinit,F_SUPERLU_GRIDINIT) -#define f_superlu_gridmap FC_GLOBAL(f_superlu_gridmap,F_SUPERLU_GRIDMAP) -#define f_superlu_gridexit FC_GLOBAL(f_superlu_gridexit,F_SUPERLU_GRIDEXIT) -#define f_ScalePermstructInit FC_GLOBAL(f_scalepermstructinit,F_SCALEPERMSTRUCTINIT) -#define f_ScalePermstructFree FC_GLOBAL(f_scalepermstructfree,F_SCALEPERMSTRUCTFREE) -#define f_PStatInit FC_GLOBAL(f_pstatinit,F_PSTATINIT) -#define f_PStatFree FC_GLOBAL(f_pstatfree,F_PSTATFREE) -#define f_LUstructInit FC_GLOBAL(f_lustructinit,F_LUSTRUCTINIT) -#define f_LUstructFree FC_GLOBAL(f_lustructfree,F_LUSTRUCTFREE) -#define f_Destroy_LU FC_GLOBAL(f_destroy_lu,F_DESTROY_LU) -#define f_dCreate_CompRowLoc_Mat_dist FC_GLOBAL(f_dcreate_comprowloc_mat_dist,F_DCREATE_COMPROWLOC_MAT_DIST) -#define f_zCreate_CompRowLoc_Mat_dist FC_GLOBAL(f_zcreate_comprowloc_mat_dist,F_ZCREATE_COMPROWLOC_MAT_DIST) -#define f_Destroy_CompRowLoc_Mat_dist FC_GLOBAL(f_destroy_comprowloc_mat_dist,F_DESTROY_COMPROWLOC_MAT_DIST) -#define f_Destroy_SuperMat_Store_dist FC_GLOBAL(f_destroy_supermat_store_dist,F_DESTROY_SUPERMAT_STORE_DIST) -#define f_dSolveFinalize FC_GLOBAL(f_dsolvefinalize,F_DSOLVEFINALIZE) -#define f_zSolveFinalize FC_GLOBAL(f_zsolvefinalize,F_ZSOLVEFINALIZE) -#define f_pdgssvx FC_GLOBAL(f_pdgssvx,F_PDGSSVX) -#define f_pzgssvx FC_GLOBAL(f_pzgssvx,F_PZGSSVX) -#define f_dcreate_dist_matrix 
FC_GLOBAL(f_dcreate_dist_matrix,F_DCREATE_DIST_MATRIX) -#define f_zcreate_dist_matrix FC_GLOBAL(f_zcreate_dist_matrix,F_ZCREATE_DIST_MATRIX) -#define f_check_malloc FC_GLOBAL(f_check_malloc,F_CHECK_MALLOC) +#define f_create_gridinfo_handle FC_GLOBAL(f_create_gridinfo_handle,F_CREATE_GRIDINFO_HANDLE) +#define f_create_gridinfo3d_handle FC_GLOBAL(f_create_gridinfo3d_handle,F_CREATE_GRIDINFO3D_HANDLE) +#define f_create_options_handle FC_GLOBAL(f_create_options_handle,F_CREATE_OPTIONS_HANDLE) +#define f_create_ScalePerm_handle FC_GLOBAL(f_create_scaleperm_handle,F_CREATE_SCALEPERM_HANDLE) +#define f_create_LUstruct_handle FC_GLOBAL(f_create_lustruct_handle,F_CREATE_LUSTRUCT_HANDLE) +#define f_create_SOLVEstruct_handle FC_GLOBAL(f_create_solvestruct_handle,F_CREATE_SOLVESTRUCT_HANDLE) +#define f_create_SuperMatrix_handle FC_GLOBAL(f_create_supermatrix_handle,F_CREATE_SUPERMATRIX_HANDLE) +#define f_destroy_gridinfo_handle FC_GLOBAL(f_destroy_gridinfo_handle,F_DESTROY_GRIDINFO_HANDLE) +#define f_destroy_options_handle FC_GLOBAL(f_destroy_options_handle,F_DESTROY_OPTIONS_HANDLE) +#define f_destroy_ScalePerm_handle FC_GLOBAL(f_destroy_scaleperm_handle,F_DESTROY_SCALEPERM_HANDLE) +#define f_destroy_LUstruct_handle FC_GLOBAL(f_destroy_lustruct_handle,F_DESTROY_LUSTRUCT_HANDLE) +#define f_destroy_SOLVEstruct_handle FC_GLOBAL(f_destroy_solvestruct_handle,F_DESTROY_SOLVESTRUCT_HANDLE) +#define f_destroy_SuperMatrix_handle FC_GLOBAL(f_destroy_supermatrix_handle,F_DESTROY_SUPERMATRIX_HANDLE) +#define f_create_SuperLUStat_handle FC_GLOBAL(f_create_superlustat_handle,F_CREATE_SUPERLUSTAT_HANDLE) +#define f_destroy_SuperLUStat_handle FC_GLOBAL(f_destroy_superlustat_handle,F_DESTROY_SUPERLUSTAT_HANDLE) +#define f_get_gridinfo FC_GLOBAL(f_get_gridinfo,F_GET_GRIDINFO) +#define f_get_gridinfo3d FC_GLOBAL(f_get_gridinfo3d,F_GET_GRIDINFO3D) +#define f_get_SuperMatrix FC_GLOBAL(f_get_supermatrix,F_GET_SUPERMATRIX) +#define f_set_SuperMatrix FC_GLOBAL(f_set_supermatrix,F_SET_SUPERMATRIX) +#define f_get_CompRowLoc_Matrix FC_GLOBAL(f_get_comprowloc_matrix,F_GET_COMPROWLOC_MATRIX) +#define f_set_CompRowLoc_Matrix FC_GLOBAL(f_set_comprowloc_matrix,F_SET_COMPROWLOC_MATRIX) +#define f_get_superlu_options FC_GLOBAL(f_get_superlu_options,F_GET_SUPERLU_OPTIONS) +#define f_set_superlu_options FC_GLOBAL(f_set_superlu_options,F_SET_SUPERLU_OPTIONS) +#define f_set_default_options FC_GLOBAL(f_set_default_options,F_SET_DEFAULT_OPTIONS) +#define f_superlu_gridinit FC_GLOBAL(f_superlu_gridinit,F_SUPERLU_GRIDINIT) +#define f_superlu_gridinit3d FC_GLOBAL(f_superlu_gridinit3d,F_SUPERLU_GRIDINIT3D) +#define f_superlu_gridmap FC_GLOBAL(f_superlu_gridmap,F_SUPERLU_GRIDMAP) +#define f_superlu_gridexit FC_GLOBAL(f_superlu_gridexit,F_SUPERLU_GRIDEXIT) +#define f_ScalePermstructInit FC_GLOBAL(f_scalepermstructinit,F_SCALEPERMSTRUCTINIT) +#define f_ScalePermstructFree FC_GLOBAL(f_scalepermstructfree,F_SCALEPERMSTRUCTFREE) +#define f_PStatInit FC_GLOBAL(f_pstatinit,F_PSTATINIT) +#define f_PStatFree FC_GLOBAL(f_pstatfree,F_PSTATFREE) +#define f_LUstructInit FC_GLOBAL(f_lustructinit,F_LUSTRUCTINIT) +#define f_LUstructFree FC_GLOBAL(f_lustructfree,F_LUSTRUCTFREE) +#define f_Destroy_LU_SOLVE_struct FC_GLOBAL(f_destroy_lu_solve_struct,F_DESTROY_LU_SOLVE_STRUCT) +#define f_Destroy_LU_SOLVE_struct_3d FC_GLOBAL(f_destroy_lu_solve_struct_3d,F_DESTROY_LU_SOLVE_STRUCT_3D) +#define f_dCreate_CompRowLoc_Mat_dist FC_GLOBAL(f_dcreate_comprowloc_mat_dist,F_DCREATE_COMPROWLOC_MAT_DIST) +#define f_zCreate_CompRowLoc_Mat_dist 
FC_GLOBAL(f_zcreate_comprowloc_mat_dist,F_ZCREATE_COMPROWLOC_MAT_DIST) +#define f_Destroy_CompRowLoc_Mat_dist FC_GLOBAL(f_destroy_comprowloc_mat_dist,F_DESTROY_COMPROWLOC_MAT_DIST) +#define f_Destroy_SuperMat_Store_dist FC_GLOBAL(f_destroy_supermat_store_dist,F_DESTROY_SUPERMAT_STORE_DIST) +#define f_dSolveFinalize FC_GLOBAL(f_dsolvefinalize,F_DSOLVEFINALIZE) +#define f_zSolveFinalize FC_GLOBAL(f_zsolvefinalize,F_ZSOLVEFINALIZE) +#define f_pdgssvx FC_GLOBAL(f_pdgssvx,F_PDGSSVX) +#define f_pdgssvx3d FC_GLOBAL(f_pdgssvx3d,F_PDGSSVX3D) +#define f_pzgssvx FC_GLOBAL(f_pzgssvx,F_PZGSSVX) +#define f_pzgssvx3d FC_GLOBAL(f_pzgssvx3d,F_PZGSSVX3D) +#define f_dcreate_dist_matrix FC_GLOBAL(f_dcreate_dist_matrix,F_DCREATE_DIST_MATRIX) +#define f_dcreate_matrix_x_b FC_GLOBAL(f_dcreate_matrix_x_b,F_DCREATE_MATRIX_X_B) +#define f_dcreate_matrix_x_b_3d FC_GLOBAL(f_dcreate_matrix_x_b_3d,F_DCREATE_MATRIX_X_B_3D) +#define f_zcreate_matrix_x_b FC_GLOBAL(f_zcreate_matrix_x_b,F_ZCREATE_MATRIX_X_B) +#define f_zcreate_matrix_x_b_3d FC_GLOBAL(f_zcreate_matrix_x_b_3d,F_ZCREATE_MATRIX_X_B_3D) +#define f_check_malloc FC_GLOBAL(f_check_malloc,F_CHECK_MALLOC) /* BLAS */ #define sasum_ FC_GLOBAL(sasum,SASUM) diff --git a/SRC/superlu_ddefs.h b/SRC/superlu_ddefs.h index d5e7fb2e..32a26aea 100644 --- a/SRC/superlu_ddefs.h +++ b/SRC/superlu_ddefs.h @@ -536,7 +536,7 @@ extern void dfill_dist (double *, int_t, double); extern void dinf_norm_error_dist (int_t, int_t, double*, int_t, double*, int_t, gridinfo_t*); extern void pdinf_norm_error(int, int_t, int_t, double [], int_t, - double [], int_t , gridinfo_t *); + double [], int_t , MPI_Comm); extern void dreadhb_dist (int, FILE *, int_t *, int_t *, int_t *, double **, int_t **, int_t **); extern void dreadtriple_dist(FILE *, int_t *, int_t *, int_t *, diff --git a/SRC/superlu_dist_config.h b/SRC/superlu_dist_config.h deleted file mode 100644 index ffd061a2..00000000 --- a/SRC/superlu_dist_config.h +++ /dev/null @@ -1,20 +0,0 @@ -/* superlu_dist_config.h.in */ - -/* Enable CUDA */ -/* #undef HAVE_CUDA */ - -/* Enable parmetis */ -#define HAVE_PARMETIS TRUE - -/* Enable LAPACK */ -/* #undef SLU_HAVE_LAPACK */ - -/* Enable CombBLAS */ -/* #undef HAVE_COMBBLAS */ - -/* enable 64bit index mode */ -/* #undef XSDK_INDEX_SIZE */ - -#if (XSDK_INDEX_SIZE == 64) -#define _LONGINT 1 -#endif diff --git a/SRC/superlu_grid.c b/SRC/superlu_grid.c index 2b7600d2..32303e4c 100644 --- a/SRC/superlu_grid.c +++ b/SRC/superlu_grid.c @@ -27,6 +27,11 @@ MPI_Datatype SuperLU_MPI_DOUBLE_COMPLEX = MPI_DATATYPE_NULL; #endif /*! \brief All processes in the MPI communicator must call this routine. + * + * On output, if a process is not in the SuperLU group, the following + * values are assigned to it: + * grid->comm = MPI_COMM_NULL + * grid->iam = -1 */ void superlu_gridinit(MPI_Comm Bcomm, /* The base communicator upon which the new grid is formed. */ @@ -59,6 +64,11 @@ void superlu_gridinit(MPI_Comm Bcomm, /* The base communicator upon which /*! \brief All processes in the MPI communicator must call this routine. + * + * On output, if a process is not in the SuperLU group, the following + * values are assigned to it: + * grid->comm = MPI_COMM_NULL + * grid->iam = -1 */ void superlu_gridmap( MPI_Comm Bcomm, /* The base communicator upon which @@ -108,17 +118,17 @@ void superlu_gridmap( MPI_Group_incl( mpi_base_group, Np, pranks, &superlu_grp ); /* Create the new communicator. */ /* NOTE: The call is to be executed by all processes in Bcomm, - even if they do not belong in the new group -- superlu_grp. 
*/ + even if they do not belong in the new group -- superlu_grp. + The function returns MPI_COMM_NULL to processes that are not in superlu_grp. */ MPI_Comm_create( Bcomm, superlu_grp, &grid->comm ); - /* Bail out if I am not in the group, superlu_group. */ + /* Bail out if I am not in the group "superlu_grp". */ if ( grid->comm == MPI_COMM_NULL ) { - grid->comm = Bcomm; - MPI_Comm_rank( Bcomm, &i ); - grid->iam = i; - /*grid->iam = -1;*/ - SUPERLU_FREE(pranks); - return; + // grid->comm = Bcomm; do not need to reassign to a valid communicator + grid->iam = -1; + //SUPERLU_FREE(pranks); + //return; + goto gridmap_out; } MPI_Comm_rank( grid->comm, &(grid->iam) ); @@ -166,14 +176,16 @@ void superlu_gridmap( } #endif + gridmap_out: SUPERLU_FREE(pranks); MPI_Group_free(&superlu_grp); MPI_Group_free(&mpi_base_group); -} + +} /* superlu_gridmap */ void superlu_gridexit(gridinfo_t *grid) { - if ( grid->comm != MPI_COMM_NULL && grid->comm != MPI_COMM_WORLD ) { + if ( grid->comm != MPI_COMM_NULL ) { /* Marks the communicator objects for deallocation. */ MPI_Comm_free( &grid->rscp.comm ); MPI_Comm_free( &grid->cscp.comm ); diff --git a/SRC/superlu_grid3d.c b/SRC/superlu_grid3d.c index 7c9be57f..8dd12462 100644 --- a/SRC/superlu_grid3d.c +++ b/SRC/superlu_grid3d.c @@ -92,17 +92,17 @@ void superlu_gridmap3d( MPI_Group_incl( mpi_base_group, Np, pranks, &superlu_grp ); /* Create the new communicator. */ /* NOTE: The call is to be executed by all processes in Bcomm, - even if they do not belong in the new group -- superlu_grp. */ + even if they do not belong in the new group -- superlu_grp. + The function returns MPI_COMM_NULL to processes that are not in superlu_grp. */ MPI_Comm_create( Bcomm, superlu_grp, &grid->comm ); /* Bail out if I am not in the group, superlu_group. */ if ( grid->comm == MPI_COMM_NULL ) { - grid->comm = Bcomm; - MPI_Comm_rank( Bcomm, &i ); - grid->iam = i; - /*grid->iam = -1;*/ - SUPERLU_FREE(pranks); - return; + //grid->comm = Bcomm; do not need to reassign to a valid communicator + grid->iam = -1; + //SUPERLU_FREE(pranks); + //return; + goto gridmap_out; } grid->nprow = nprow; @@ -214,61 +214,9 @@ void superlu_gridmap3d( grid->grid2d.npcol = npcol; MPI_Comm_rank( grid->grid2d.comm, &(grid->grid2d.iam)); - // grid->grid2d.cscp = grid->cscp; -#if 0 - /* Make a list of the processes in the new communicator. */ - pranks = (int *) SUPERLU_MALLOC(Np * sizeof(int)); - for (j = 0; j < npcol; ++j) - for (i = 0; i < nprow; ++i) - pranks[i * npcol + j] = usermap[j * ldumap + i]; - - /* - * Form MPI communicator for all. - */ - /* Get the group underlying Bcomm. */ - MPI_Comm_group( Bcomm, &mpi_base_group ); - /* Create the new group. */ - MPI_Group_incl( mpi_base_group, Np, pranks, &superlu_grp ); - /* Create the new communicator. */ - /* NOTE: The call is to be executed by all processes in Bcomm, - even if they do not belong in the new group -- superlu_grp. */ - MPI_Comm_create( Bcomm, superlu_grp, &grid->comm ); - - /* Bail out if I am not in the group, superlu_group. */ - if ( grid->comm == MPI_COMM_NULL ) { - grid->comm = Bcomm; - MPI_Comm_rank( Bcomm, &i ); - grid->iam = i; - /*grid->iam = -1;*/ - SUPERLU_FREE(pranks); - return; - } - - MPI_Comm_rank( grid->comm, &(grid->iam) ); - myrow = grid->iam / npcol; - mycol = grid->iam % npcol; - - /* - * Form MPI communicator for myrow, scope = COMM_ROW. - */ - - MPI_Comm_split(grid->comm, myrow, mycol, &(grid->rscp.comm)); - - - /* - * Form MPI communicator for mycol, scope = COMM_COLUMN. 
- */ - MPI_Comm_split(grid->comm, mycol, myrow, &(grid->cscp.comm)); - - - grid->rscp.Np = npcol; - grid->rscp.Iam = mycol; - grid->cscp.Np = nprow; - grid->cscp.Iam = myrow; -#endif - + gridmap_out: SUPERLU_FREE(pranks); MPI_Group_free( &superlu_grp ); MPI_Group_free( &mpi_base_group ); diff --git a/SRC/superlu_zdefs.h b/SRC/superlu_zdefs.h index bb4ae8e7..fb17eacc 100644 --- a/SRC/superlu_zdefs.h +++ b/SRC/superlu_zdefs.h @@ -537,7 +537,7 @@ extern void zfill_dist (doublecomplex *, int_t, doublecomplex); extern void zinf_norm_error_dist (int_t, int_t, doublecomplex*, int_t, doublecomplex*, int_t, gridinfo_t*); extern void pzinf_norm_error(int, int_t, int_t, doublecomplex [], int_t, - doublecomplex [], int_t , gridinfo_t *); + doublecomplex [], int_t , MPI_Comm); extern void zreadhb_dist (int, FILE *, int_t *, int_t *, int_t *, doublecomplex **, int_t **, int_t **); extern void zreadtriple_dist(FILE *, int_t *, int_t *, int_t *, diff --git a/SRC/util.c b/SRC/util.c index 6acca0b7..665d3354 100644 --- a/SRC/util.c +++ b/SRC/util.c @@ -319,13 +319,13 @@ PStatPrint(superlu_dist_options_t *options, SuperLUStat_t *stat, gridinfo_t *gri printf("**** Time (seconds) ****\n"); if ( options->Equil != NO ) - printf("\tEQUIL time %8.2f\n", utime[EQUIL]); + printf("\tEQUIL time %8.3f\n", utime[EQUIL]); if ( options->RowPerm != NOROWPERM ) - printf("\tROWPERM time %8.2f\n", utime[ROWPERM]); + printf("\tROWPERM time %8.3f\n", utime[ROWPERM]); if ( options->ColPerm != NATURAL ) - printf("\tCOLPERM time %8.2f\n", utime[COLPERM]); - printf("\tSYMBFACT time %8.2f\n", utime[SYMBFAC]); - printf("\tDISTRIBUTE time %8.2f\n", utime[DIST]); + printf("\tCOLPERM time %8.3f\n", utime[COLPERM]); + printf("\tSYMBFACT time %8.3f\n", utime[SYMBFAC]); + printf("\tDISTRIBUTE time %8.3f\n", utime[DIST]); } @@ -333,7 +333,7 @@ PStatPrint(superlu_dist_options_t *options, SuperLUStat_t *stat, gridinfo_t *gri 0, grid->comm); factflop = flopcnt; if ( !iam && options->Fact != FACTORED ) { - printf("\tFACTOR time %8.2f\n", utime[FACT]); + printf("\tFACTOR time %8.3f\n", utime[FACT]); if ( utime[FACT] != 0.0 ) printf("\tFactor flops\t%e\tMflops \t%8.2f\n", flopcnt, diff --git a/TEST/pdtest.c b/TEST/pdtest.c index c10fc127..fd0dcf24 100644 --- a/TEST/pdtest.c +++ b/TEST/pdtest.c @@ -357,7 +357,7 @@ int main(int argc, char *argv[]) PStatFree(&stat); #if 0 pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, - nrhs, b, ldb, xtrue, ldx, &grid); + nrhs, b, ldb, xtrue, ldx, grid.comm); #endif if ( info ) { printf(FMT3, "pdgssvx",info,izero,n,nrhs,imat,nfail); @@ -375,7 +375,7 @@ int main(int argc, char *argv[]) dgst04(n, nrhs, solx, ldx, xact, ldx, rcond, &result[2]); pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, - nrhs, b, ldb, xtrue, ldx, &grid); + nrhs, b, ldb, xtrue, ldx, grid.comm); #endif /* Print information about the tests that did diff --git a/TEST/pztest.c b/TEST/pztest.c index 6b430749..e3f57bc5 100644 --- a/TEST/pztest.c +++ b/TEST/pztest.c @@ -357,7 +357,7 @@ int main(int argc, char *argv[]) PStatFree(&stat); #if 0 pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, - nrhs, b, ldb, xtrue, ldx, &grid); + nrhs, b, ldb, xtrue, ldx, grid.comm); #endif if ( info ) { printf(FMT3, "pzgssvx",info,izero,n,nrhs,imat,nfail); @@ -375,7 +375,7 @@ int main(int argc, char *argv[]) dgst04(n, nrhs, solx, ldx, xact, ldx, rcond, &result[2]); pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, - nrhs, b, ldb, xtrue, ldx, &grid); + nrhs, b, ldb, xtrue, ldx, grid.comm); #endif /* Print information about the tests that did 
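The grid-creation change above has a visible consequence for callers: a rank that is not included in the new grid now comes back with grid->comm == MPI_COMM_NULL and grid->iam == -1, instead of being silently remapped onto the base communicator. A minimal caller-side sketch of the new contract (the wrapper name solve_on_subgrid is illustrative, not part of the library):

    #include "superlu_ddefs.h"

    /* Run SuperLU_DIST on an nprow-by-npcol subset of `world`; ranks that
     * do not fit in the grid must skip all subsequent solver calls. */
    void solve_on_subgrid(MPI_Comm world, int nprow, int npcol)
    {
        gridinfo_t grid;
        superlu_gridinit(world, nprow, npcol, &grid);
        if ( grid.iam == -1 ) {  /* excluded rank: grid.comm == MPI_COMM_NULL */
            return;              /* with the check added above, superlu_gridexit()
                                  * would also be a harmless no-op on this rank */
        }
        /* ... create A and b, call pdgssvx(), ... */
        superlu_gridexit(&grid);
    }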
From b28dfc1ce9ff97f26c993ef743c8225d6dddade9 Mon Sep 17 00:00:00 2001 From: Sherry Li Date: Mon, 4 Jan 2021 13:53:45 -0800 Subject: [PATCH 057/147] Modified EXAMPLE/pddrive1.c, illustrate how to use the same LU factor to different RHS, and diffferent number of RHS's. --- EXAMPLE/pddrive1.c | 73 +++++++++++++++++++++++++++++++++++----------- EXAMPLE/pzdrive1.c | 73 +++++++++++++++++++++++++++++++++++----------- 2 files changed, 112 insertions(+), 34 deletions(-) diff --git a/EXAMPLE/pddrive1.c b/EXAMPLE/pddrive1.c index f11a2799..686ecc8c 100644 --- a/EXAMPLE/pddrive1.c +++ b/EXAMPLE/pddrive1.c @@ -14,10 +14,11 @@ at the top-level directory. * \brief Driver program for PDGSSVX example * *
- * -- Distributed SuperLU routine (version 6.1) --
+ * -- Distributed SuperLU routine (version 7.0) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * March 15, 2003
  * April 5, 2015
+ * January 4, 2021
  *
*/ @@ -33,7 +34,8 @@ at the top-level directory. * The driver program PDDRIVE1. * * This example illustrates how to use PDGSSVX to - * solve systems with the same A but different right-hand side. + * solve systems with the same A but different right-hand side, + * possibly with different number of right-hand sides. * In this case, we factorize A only once in the first call to * PDGSSVX, and reuse the following data structures * in the subsequent call to PDGSSVX: @@ -54,8 +56,8 @@ int main(int argc, char *argv[]) dSOLVEstruct_t SOLVEstruct; gridinfo_t grid; double *berr; - double *b, *xtrue, *b1; - int i, j, m, n; + double *b, *xtrue, *b1, *b2; + int i, j, m, n, m_loc; int nprow, npcol; int iam, info, ldb, ldx, nrhs; char **cpp, c, *postfix; @@ -65,7 +67,7 @@ int main(int argc, char *argv[]) nprow = 1; /* Default process rows. */ npcol = 1; /* Default process columns. */ - nrhs = 1; /* Number of right-hand side. */ + nrhs = 3; /* Max. number of right-hand sides. */ /* ------------------------------------------------------------ INITIALIZE MPI ENVIRONMENT. @@ -141,14 +143,24 @@ int main(int argc, char *argv[]) dcreate_matrix_postfix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, postfix, &grid); if ( !(b1 = doubleMalloc_dist(ldb * nrhs)) ) ABORT("Malloc fails for b1[]"); - for (j = 0; j < nrhs; ++j) - for (i = 0; i < ldb; ++i) b1[i+j*ldb] = b[i+j*ldb]; + if ( !(b2 = doubleMalloc_dist(ldb * nrhs)) ) + ABORT("Malloc fails for b1[]"); + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < ldb; ++i) { + b1[i+j*ldb] = b[i+j*ldb]; + b2[i+j*ldb] = b[i+j*ldb]; + } + } if ( !(berr = doubleMalloc_dist(nrhs)) ) ABORT("Malloc fails for berr[]."); + m = A.nrow; + n = A.ncol; + m_loc = ((NRformat_loc *)A.Store)->m_loc; + /* ------------------------------------------------------------ - WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME. + 1. SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME, WITH 1 RHS. ------------------------------------------------------------*/ /* Set the default input options: @@ -171,9 +183,6 @@ int main(int argc, char *argv[]) fflush(stdout); } - m = A.nrow; - n = A.ncol; - /* Initialize ScalePermstruct and LUstruct. */ dScalePermstructInit(m, n, &ScalePermstruct); dLUstructInit(n, &LUstruct); @@ -182,41 +191,70 @@ int main(int argc, char *argv[]) PStatInit(&stat); /* Call the linear equation solver. */ + nrhs = 1; pdgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); /* Check the accuracy of the solution. */ if ( !iam ) printf("\tSolve the first system:\n"); - pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, - nrhs, b, ldb, xtrue, ldx, grid.comm); + pdinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, grid.comm); PStatPrint(&options, &stat, &grid); /* Print the statistics. */ PStatFree(&stat); /* ------------------------------------------------------------ - NOW WE SOLVE ANOTHER SYSTEM WITH THE SAME A BUT DIFFERENT + 2. NOW SOLVE ANOTHER SYSTEM WITH THE SAME A BUT DIFFERENT RIGHT-HAND SIDE, WE WILL USE THE EXISTING L AND U FACTORS IN LUSTRUCT OBTAINED FROM A PREVIOUS FATORIZATION. ------------------------------------------------------------*/ options.Fact = FACTORED; /* Indicate the factored form of A is supplied. */ PStatInit(&stat); /* Initialize the statistics variables. */ + nrhs = 1; pdgssvx(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); /* Check the accuracy of the solution. 
*/ if ( !iam ) printf("\tSolve the system with a different B:\n"); - pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, - nrhs, b1, ldb, xtrue, ldx, grid.comm); + pdinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue, ldx, grid.comm); PStatPrint(&options, &stat, &grid); /* Print the statistics. */ + PStatFree(&stat); + /* ------------------------------------------------------------ + 3. SOLVE ANOTHER SYSTEM WITH THE SAME A BUT DIFFERENT + NUMBER OF RIGHT-HAND SIDES, WE WILL USE THE EXISTING L AND U + FACTORS IN LUSTRUCT OBTAINED FROM A PREVIOUS FATORIZATION. + ------------------------------------------------------------*/ + options.Fact = FACTORED; /* Indicate the factored form of A is supplied. */ + PStatInit(&stat); /* Initialize the statistics variables. */ + + nrhs = 3; + + /* When changing the number of RHS's, the following counters + for communication messages must be reset. */ + pxgstrs_comm_t *gstrs_comm = SOLVEstruct.gstrs_comm; + SUPERLU_FREE(gstrs_comm->B_to_X_SendCnt); + SUPERLU_FREE(gstrs_comm->X_to_B_SendCnt); + SUPERLU_FREE(gstrs_comm->ptr_to_ibuf); + pdgstrs_init(n, m_loc, nrhs, ((NRformat_loc *)A.Store)->fst_row, + ScalePermstruct.perm_r, ScalePermstruct.perm_c, &grid, + LUstruct.Glu_persist, &SOLVEstruct); + + pdgssvx(&options, &A, &ScalePermstruct, b2, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + /* Check the accuracy of the solution. */ + if ( !iam ) printf("\tSolve the system with 3 RHS's:\n"); + pdinf_norm_error(iam, m_loc, nrhs, b2, ldb, xtrue, ldx, grid.comm); + + PStatPrint(&options, &stat, &grid); /* Print the statistics. */ + PStatFree(&stat); /* ------------------------------------------------------------ DEALLOCATE STORAGE. ------------------------------------------------------------*/ - PStatFree(&stat); Destroy_CompRowLoc_Matrix_dist(&A); dScalePermstructFree(&ScalePermstruct); dDestroy_LU(n, &grid, &LUstruct); @@ -226,6 +264,7 @@ int main(int argc, char *argv[]) } SUPERLU_FREE(b); SUPERLU_FREE(b1); + SUPERLU_FREE(b2); SUPERLU_FREE(xtrue); SUPERLU_FREE(berr); fclose(fp); diff --git a/EXAMPLE/pzdrive1.c b/EXAMPLE/pzdrive1.c index f01bc447..b65733b2 100644 --- a/EXAMPLE/pzdrive1.c +++ b/EXAMPLE/pzdrive1.c @@ -13,10 +13,11 @@ at the top-level directory. * \brief Driver program for PZGSSVX example * *
- * -- Distributed SuperLU routine (version 6.1) --
+ * -- Distributed SuperLU routine (version 7.0) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * March 15, 2003
  * April 5, 2015
+ * January 4, 2021
  *
*/ @@ -32,7 +33,8 @@ at the top-level directory. * The driver program PZDRIVE1. * * This example illustrates how to use PZGSSVX to - * solve systems with the same A but different right-hand side. + * solve systems with the same A but different right-hand side, + * possibly with different number of right-hand sides. * In this case, we factorize A only once in the first call to * PZGSSVX, and reuse the following data structures * in the subsequent call to PZGSSVX: @@ -53,8 +55,8 @@ int main(int argc, char *argv[]) zSOLVEstruct_t SOLVEstruct; gridinfo_t grid; double *berr; - doublecomplex *b, *xtrue, *b1; - int i, j, m, n; + doublecomplex *b, *xtrue, *b1, *b2; + int i, j, m, n, m_loc; int nprow, npcol; int iam, info, ldb, ldx, nrhs; char **cpp, c, *postfix; @@ -64,7 +66,7 @@ int main(int argc, char *argv[]) nprow = 1; /* Default process rows. */ npcol = 1; /* Default process columns. */ - nrhs = 1; /* Number of right-hand side. */ + nrhs = 3; /* Max. number of right-hand sides. */ /* ------------------------------------------------------------ INITIALIZE MPI ENVIRONMENT. @@ -140,14 +142,24 @@ int main(int argc, char *argv[]) zcreate_matrix_postfix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, postfix, &grid); if ( !(b1 = doublecomplexMalloc_dist(ldb * nrhs)) ) ABORT("Malloc fails for b1[]"); - for (j = 0; j < nrhs; ++j) - for (i = 0; i < ldb; ++i) b1[i+j*ldb] = b[i+j*ldb]; + if ( !(b2 = doublecomplexMalloc_dist(ldb * nrhs)) ) + ABORT("Malloc fails for b1[]"); + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < ldb; ++i) { + b1[i+j*ldb] = b[i+j*ldb]; + b2[i+j*ldb] = b[i+j*ldb]; + } + } if ( !(berr = doubleMalloc_dist(nrhs)) ) ABORT("Malloc fails for berr[]."); + m = A.nrow; + n = A.ncol; + m_loc = ((NRformat_loc *)A.Store)->m_loc; + /* ------------------------------------------------------------ - WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME. + 1. SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME, WITH 1 RHS. ------------------------------------------------------------*/ /* Set the default input options: @@ -170,9 +182,6 @@ int main(int argc, char *argv[]) fflush(stdout); } - m = A.nrow; - n = A.ncol; - /* Initialize ScalePermstruct and LUstruct. */ zScalePermstructInit(m, n, &ScalePermstruct); zLUstructInit(n, &LUstruct); @@ -181,41 +190,70 @@ int main(int argc, char *argv[]) PStatInit(&stat); /* Call the linear equation solver. */ + nrhs = 1; pzgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); /* Check the accuracy of the solution. */ if ( !iam ) printf("\tSolve the first system:\n"); - pzinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, - nrhs, b, ldb, xtrue, ldx, grid.comm); + pzinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, grid.comm); PStatPrint(&options, &stat, &grid); /* Print the statistics. */ PStatFree(&stat); /* ------------------------------------------------------------ - NOW WE SOLVE ANOTHER SYSTEM WITH THE SAME A BUT DIFFERENT + 2. NOW SOLVE ANOTHER SYSTEM WITH THE SAME A BUT DIFFERENT RIGHT-HAND SIDE, WE WILL USE THE EXISTING L AND U FACTORS IN LUSTRUCT OBTAINED FROM A PREVIOUS FATORIZATION. ------------------------------------------------------------*/ options.Fact = FACTORED; /* Indicate the factored form of A is supplied. */ PStatInit(&stat); /* Initialize the statistics variables. */ + nrhs = 1; pzgssvx(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); /* Check the accuracy of the solution. 
*/ if ( !iam ) printf("\tSolve the system with a different B:\n"); - pzinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, - nrhs, b1, ldb, xtrue, ldx, grid.comm); + pzinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue, ldx, grid.comm); PStatPrint(&options, &stat, &grid); /* Print the statistics. */ + PStatFree(&stat); + /* ------------------------------------------------------------ + 3. SOLVE ANOTHER SYSTEM WITH THE SAME A BUT DIFFERENT + NUMBER OF RIGHT-HAND SIDES, WE WILL USE THE EXISTING L AND U + FACTORS IN LUSTRUCT OBTAINED FROM A PREVIOUS FATORIZATION. + ------------------------------------------------------------*/ + options.Fact = FACTORED; /* Indicate the factored form of A is supplied. */ + PStatInit(&stat); /* Initialize the statistics variables. */ + + nrhs = 3; + + /* When changing the number of RHS's, the following counters + for communication messages must be reset. */ + pxgstrs_comm_t *gstrs_comm = SOLVEstruct.gstrs_comm; + SUPERLU_FREE(gstrs_comm->B_to_X_SendCnt); + SUPERLU_FREE(gstrs_comm->X_to_B_SendCnt); + SUPERLU_FREE(gstrs_comm->ptr_to_ibuf); + pzgstrs_init(n, m_loc, nrhs, ((NRformat_loc *)A.Store)->fst_row, + ScalePermstruct.perm_r, ScalePermstruct.perm_c, &grid, + LUstruct.Glu_persist, &SOLVEstruct); + + pzgssvx(&options, &A, &ScalePermstruct, b2, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + /* Check the accuracy of the solution. */ + if ( !iam ) printf("\tSolve the system with 3 RHS's:\n"); + pzinf_norm_error(iam, m_loc, nrhs, b2, ldb, xtrue, ldx, grid.comm); + + PStatPrint(&options, &stat, &grid); /* Print the statistics. */ + PStatFree(&stat); /* ------------------------------------------------------------ DEALLOCATE STORAGE. ------------------------------------------------------------*/ - PStatFree(&stat); Destroy_CompRowLoc_Matrix_dist(&A); zScalePermstructFree(&ScalePermstruct); zDestroy_LU(n, &grid, &LUstruct); @@ -225,6 +263,7 @@ int main(int argc, char *argv[]) } SUPERLU_FREE(b); SUPERLU_FREE(b1); + SUPERLU_FREE(b2); SUPERLU_FREE(xtrue); SUPERLU_FREE(berr); fclose(fp); From c13fda170a4b1cd269ea530e7344039a91d6c8c3 Mon Sep 17 00:00:00 2001 From: Sherry Li Date: Fri, 8 Jan 2021 13:41:30 -0800 Subject: [PATCH 058/147] Add the newly created files in FORTRAN/ folder. Install the compiled FORTRAN/*.mod files into include/ folder. --- FORTRAN/CMakeLists.txt | 10 ++ FORTRAN/c2f_dcreate_matrix_x_b.c | 282 +++++++++++++++++++++++++++++++ FORTRAN/c2f_zcreate_matrix_x_b.c | 281 ++++++++++++++++++++++++++++++ FORTRAN/f_pddrive3d.f90 | 161 ++++++++++++++++++ FORTRAN/f_pzdrive3d.f90 | 161 ++++++++++++++++++ 5 files changed, 895 insertions(+) create mode 100644 FORTRAN/c2f_dcreate_matrix_x_b.c create mode 100644 FORTRAN/c2f_zcreate_matrix_x_b.c create mode 100644 FORTRAN/f_pddrive3d.f90 create mode 100644 FORTRAN/f_pzdrive3d.f90 diff --git a/FORTRAN/CMakeLists.txt b/FORTRAN/CMakeLists.txt index 0fa977b5..1f8a836d 100644 --- a/FORTRAN/CMakeLists.txt +++ b/FORTRAN/CMakeLists.txt @@ -1,6 +1,11 @@ # Sherry; may not need it? 
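Distilled from the pddrive1.c/pzdrive1.c hunks above, the pattern for re-solving with a different number of right-hand sides is the following (a sketch assembled only from calls that appear in the diff, reusing the example's own variable names):

    /* Reuse the existing L and U factors but change nrhs: the gstrs_comm
     * message counters were sized for the old nrhs and must be rebuilt. */
    options.Fact = FACTORED;
    nrhs = 3;
    pxgstrs_comm_t *gstrs_comm = SOLVEstruct.gstrs_comm;
    SUPERLU_FREE(gstrs_comm->B_to_X_SendCnt);
    SUPERLU_FREE(gstrs_comm->X_to_B_SendCnt);
    SUPERLU_FREE(gstrs_comm->ptr_to_ibuf);
    pdgstrs_init(n, m_loc, nrhs, ((NRformat_loc *)A.Store)->fst_row,
                 ScalePermstruct.perm_r, ScalePermstruct.perm_c, &grid,
                 LUstruct.Glu_persist, &SOLVEstruct);
    pdgssvx(&options, &A, &ScalePermstruct, b2, ldb, nrhs, &grid,
            &LUstruct, &SOLVEstruct, berr, &stat, &info);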
include_directories(${SuperLU_DIST_SOURCE_DIR}/SRC) +set(headers + ${CMAKE_INSTALL_PREFIX}/FORTRAN/superlu_mod.mod + ${CMAKE_INSTALL_PREFIX}/FORTRAN/superlupara_mod.mod + ) + # Fortran stuff add_definitions(${MPI_Fortran_COMPILE_FLAGS}) include_directories(${MPI_Fortran_INCLUDE_PATH}) @@ -52,3 +57,8 @@ if(enable_complex16) set_target_properties(f_pzdrive3d PROPERTIES LINKER_LANGUAGE CXX LINK_FLAGS "${MPI_Fortran_LINK_FLAGS}") endif() + +install(FILES ${headers} +# DESTINATION ${CMAKE_INSTALL_PREFIX}/include) + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} +) diff --git a/FORTRAN/c2f_dcreate_matrix_x_b.c b/FORTRAN/c2f_dcreate_matrix_x_b.c new file mode 100644 index 00000000..e35f85c9 --- /dev/null +++ b/FORTRAN/c2f_dcreate_matrix_x_b.c @@ -0,0 +1,282 @@ + + +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + +/*! @file + * \brief Read the matrix from data file, then distribute it in a + * distributed CSR format. + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ * Last update: December 31, 2020
+ *
+ */ +#include <math.h> +#include "superlu_ddefs.h" + +/* \brief + * + *
+ * Purpose
+ * =======
+ *
+ * C2F_DCREATE_MATRIX_X_B reads the matrix from a data file in various formats,
+ * and distributes it to the processors in a distributed compressed row format.
+ * It also generates the distributed true solution X and the right-hand
+ * side RHS.
+ *
+ * Arguments
+ * =========
+ *
+ * FNAME (input) char*
+ *       File name as a character string.
+ *
+ * NRHS  (input) int
+ *       Number of right-hand sides.
+ *
+ * nprocs (input) int
+ *       Total number of MPI processes.
+ *
+ * slucomm (input) MPI_Comm
+ *       SuperLU's communicator.
+ *
+ * A     (output) SuperMatrix*
+ *       Local matrix A in NR_loc format.
+ *
+ * m_g   (output) int*
+ *       Global matrix row dimension.
+ *
+ * n_g   (output) int*
+ *       Global matrix column dimension.
+ *
+ * nnz_g (output) int_t*
+ *       Number of nonzeros in the global matrix.
+ *
+ * rhs   (output) double*
+ *       The right-hand side matrix.
+ *
+ * ldb   (output) int*
+ *       Leading dimension of the right-hand side matrix.
+ *
+ * x     (output) double*
+ *       The true solution matrix.
+ *
+ * ldx   (output) int*
+ *       The leading dimension of the true solution matrix.
+ *
+ *
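+ * Note: rows are handed out in contiguous blocks of m_loc = m / nprocs;
+ * when nprocs does not divide m, the last process takes the remainder,
+ * e.g. m = 10 on nprocs = 4 gives local row counts 2, 2, 2, 4.
+ *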
+ */ + +int c2f_dcreate_matrix_x_b(char *fname, int nrhs, int nprocs, + MPI_Comm slucomm, SuperMatrix *A, + int *m_g, int *n_g, int_t *nnz_g, + double *rhs, int *ldb, double *x, int *ldx) +{ + SuperMatrix GA; /* global A */ + double *b_global, *xtrue_global; /* replicated on all processes */ + int_t *rowind, *colptr; /* global */ + double *nzval; /* global */ + double *nzval_loc; /* local */ + int_t *colind, *rowptr; /* local */ + int_t *marker; + int_t nnz, nnz_loc; + int m, n; + int m_loc, fst_row; + int m_loc_fst; /* Record m_loc of the first p-1 processors, + when mod(m, p) is not zero. */ + int row, col, i, j, relpos; + int iam; + char trans[1]; + + char **cpp, c, *postfix;; + FILE *fp, *fopen(); + + MPI_Comm_rank(slucomm, &iam); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter c2f_dreate_matrix_x_b()"); +#endif + + if ( iam==0 ) { + double t = SuperLU_timer_(); + + if ( !(fp = fopen(fname, "r")) ) { + ABORT("File does not exist"); + } + for (i = 0; i < strlen(fname); i++) { + if (fname[i]=='.') { + postfix = &(fname[i+1]); + } + } + if(!strcmp(postfix,"rua")){ + /* Read the matrix stored on disk in Harwell-Boeing format. */ + dreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"mtx")){ + /* Read the matrix stored on disk in Matrix Market format. */ + dreadMM_dist(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"rb")){ + /* Read the matrix stored on disk in Rutherford-Boeing format. */ + dreadrb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"dat")){ + /* Read the matrix stored on disk in triplet format. */ + dreadtriple_dist(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"datnh")){ + /* Read the matrix stored on disk in triplet format (without header). */ + dreadtriple_noheader(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"bin")){ + /* Read the matrix stored on disk in binary format. */ + dread_binary(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else { + ABORT("File format not known"); + } + + fclose(fp); + printf("Time to read and distribute matrix %.2f\n", + SuperLU_timer_() - t); fflush(stdout); + + /* Broadcast matrix A to the other PEs. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, slucomm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, slucomm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, slucomm ); + MPI_Bcast( nzval, nnz, MPI_DOUBLE, 0, slucomm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, slucomm ); + MPI_Bcast( colptr, n+1, mpi_int_t, 0, slucomm ); + } else { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, slucomm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, slucomm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, slucomm ); + + /* Allocate storage for compressed column representation. */ + dallocateA_dist(n, nnz, &nzval, &rowind, &colptr); + + MPI_Bcast( nzval, nnz, MPI_DOUBLE, 0, slucomm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, slucomm ); + MPI_Bcast( colptr, n+1, mpi_int_t, 0, slucomm ); + } + +#if 0 + nzval[0]=0.1; +#endif + + /* Compute the number of rows to be distributed to local process */ + m_loc = m / nprocs; //(grid->nprow * grid->npcol); + m_loc_fst = m_loc; + /* When m / procs is not an integer */ + if ((m_loc * nprocs) != m) { + /*m_loc = m_loc+1; + m_loc_fst = m_loc;*/ + if (iam == (nprocs - 1)) /* last proc. gets all*/ + m_loc = m - m_loc * (nprocs - 1); + } + + /* Create compressed column matrix for GA. 
*/ + dCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr, + SLU_NC, SLU_D, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. */ + if ( !(b_global = doubleMalloc_dist(m*nrhs)) ) + ABORT("Malloc fails for b[]"); + if ( !(xtrue_global = doubleMalloc_dist(n*nrhs)) ) + ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + + dGenXtrue_dist(n, nrhs, xtrue_global, n); + dFillRHS_dist(trans, nrhs, xtrue_global, n, &GA, b_global, m); + + /************************************************* + * Change GA to a local A with NR_loc format * + *************************************************/ + + rowptr = (int_t *) intMalloc_dist(m_loc+1); + marker = (int_t *) intCalloc_dist(n); + + /* Get counts of each row of GA */ + for (i = 0; i < n; ++i) + for (j = colptr[i]; j < colptr[i+1]; ++j) ++marker[rowind[j]]; + /* Set up row pointers */ + rowptr[0] = 0; + fst_row = iam * m_loc_fst; + nnz_loc = 0; + for (j = 0; j < m_loc; ++j) { + row = fst_row + j; + rowptr[j+1] = rowptr[j] + marker[row]; + marker[j] = rowptr[j]; + } + nnz_loc = rowptr[m_loc]; + + nzval_loc = (double *) doubleMalloc_dist(nnz_loc); + colind = (int_t *) intMalloc_dist(nnz_loc); + + /* Transfer the matrix into the compressed row storage */ + for (i = 0; i < n; ++i) { + for (j = colptr[i]; j < colptr[i+1]; ++j) { + row = rowind[j]; + if ( (row>=fst_row) && (row<fst_row+m_loc) ) { + row = row - fst_row; + relpos = marker[row]; + colind[relpos] = i; + nzval_loc[relpos] = nzval[j]; + ++marker[row]; + } + } + } + +#if ( DEBUGlevel>=2 ) + if ( !iam ) dPrint_CompCol_Matrix_dist(&GA); +#endif + + /* Destroy GA */ + Destroy_CompCol_Matrix_dist(&GA); + + /******************************************************/ + /* Change GA to a local A with NR_loc format */ + /******************************************************/ + + /* Set up the local A in NR_loc format */ + dCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row, + nzval_loc, colind, rowptr, + SLU_NR_loc, SLU_D, SLU_GE); + + /* Get the local B */ + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < m_loc; ++i) { + row = fst_row + i; + rhs[j*m_loc+i] = b_global[j*n+row]; + } + } + *ldb = m_loc; + *ldx = m_loc; + + /* Set the true X */ + /* Get the local part of xtrue_global */ + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < m_loc; ++i) + x[i + j*(*ldx)] = xtrue_global[i + fst_row + j*n]; + } + + SUPERLU_FREE(b_global); + SUPERLU_FREE(xtrue_global); + SUPERLU_FREE(marker); + +#if ( DEBUGlevel>=1 ) + printf("sizeof(NRformat_loc) %lu\n", sizeof(NRformat_loc)); + CHECK_MALLOC(iam, "Exit c2f_dcreate_matrix_x_b()"); +#endif + + *m_g = m; + *n_g = n; + *nnz_g = nnz; + return 0; +} diff --git a/FORTRAN/c2f_zcreate_matrix_x_b.c b/FORTRAN/c2f_zcreate_matrix_x_b.c new file mode 100644 index 00000000..e23428ee --- /dev/null +++ b/FORTRAN/c2f_zcreate_matrix_x_b.c @@ -0,0 +1,281 @@ + +/*! \file Copyright (c) 2003, The Regents of the University of California, through Lawrence Berkeley National Laboratory (subject to receipt of any required approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt at the top-level directory. +*/ + +/*! @file + * \brief Read the matrix from a data file, then distribute it in a + * distributed CSR format. + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ * Last update: December 31, 2020
+ *
+ */ +#include <math.h> +#include "superlu_zdefs.h" + +/* \brief + * + *
+ * Purpose
+ * =======
+ *
+ * C2F_ZCREATE_MATRIX_X_B reads the matrix from a data file in various formats,
+ * and distributes it to the processors in a distributed compressed row format.
+ * It also generates the distributed true solution X and the right-hand
+ * side RHS.
+ *
+ * Arguments
+ * =========
+ *
+ * FNAME (input) char*
+ *       File name as a character string.
+ *
+ * NRHS  (input) int
+ *       Number of right-hand sides.
+ *
+ * nprocs (input) int
+ *       Total number of MPI processes.
+ *
+ * slucomm (input) MPI_Comm
+ *       SuperLU's communicator.
+ *
+ * A     (output) SuperMatrix*
+ *       Local matrix A in NR_loc format.
+ *
+ * m_g   (output) int*
+ *       Global matrix row dimension.
+ *
+ * n_g   (output) int*
+ *       Global matrix column dimension.
+ *
+ * nnz_g (output) int_t*
+ *       Number of nonzeros in the global matrix.
+ *
+ * rhs   (output) doublecomplex*
+ *       The right-hand side matrix.
+ *
+ * ldb   (output) int*
+ *       Leading dimension of the right-hand side matrix.
+ *
+ * x     (output) doublecomplex*
+ *       The true solution matrix.
+ *
+ * ldx   (output) int*
+ *       The leading dimension of the true solution matrix.
+ *
+ *
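+ * Note: rhs and x must be preallocated by the caller; on return
+ * *ldb = *ldx = m_loc, the number of local rows, so each buffer needs
+ * room for at least m_loc * NRHS doublecomplex entries.
+ *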
+ */ + +int c2f_zcreate_matrix_x_b(char *fname, int nrhs, int nprocs, + MPI_Comm slucomm, SuperMatrix *A, + int *m_g, int *n_g, int_t *nnz_g, + doublecomplex *rhs, int *ldb, doublecomplex *x, int *ldx) +{ + SuperMatrix GA; /* global A */ + doublecomplex *b_global, *xtrue_global; /* replicated on all processes */ + int_t *rowind, *colptr; /* global */ + doublecomplex *nzval; /* global */ + doublecomplex *nzval_loc; /* local */ + int_t *colind, *rowptr; /* local */ + int_t *marker; + int_t nnz, nnz_loc; + int m, n; + int m_loc, fst_row; + int m_loc_fst; /* Record m_loc of the first p-1 processors, + when mod(m, p) is not zero. */ + int row, col, i, j, relpos; + int iam; + char trans[1]; + + char **cpp, c, *postfix;; + FILE *fp, *fopen(); + + MPI_Comm_rank(slucomm, &iam); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter c2f_zreate_matrix_x_b()"); +#endif + + if ( iam==0 ) { + double t = SuperLU_timer_(); + + if ( !(fp = fopen(fname, "r")) ) { + ABORT("File does not exist"); + } + for (i = 0; i < strlen(fname); i++) { + if (fname[i]=='.') { + postfix = &(fname[i+1]); + } + } + if(!strcmp(postfix,"cua")){ + /* Read the matrix stored on disk in Harwell-Boeing format. */ + zreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"mtx")){ + /* Read the matrix stored on disk in Matrix Market format. */ + zreadMM_dist(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"rb")){ + /* Read the matrix stored on disk in Rutherford-Boeing format. */ + zreadrb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"dat")){ + /* Read the matrix stored on disk in triplet format. */ + zreadtriple_dist(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"datnh")){ + /* Read the matrix stored on disk in triplet format (without header). */ + zreadtriple_noheader(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"bin")){ + /* Read the matrix stored on disk in binary format. */ + zread_binary(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else { + ABORT("File format not known"); + } + + fclose(fp); + printf("Time to read and distribute matrix %.2f\n", + SuperLU_timer_() - t); fflush(stdout); + + /* Broadcast matrix A to the other PEs. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, slucomm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, slucomm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, slucomm ); + MPI_Bcast( nzval, nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, slucomm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, slucomm ); + MPI_Bcast( colptr, n+1, mpi_int_t, 0, slucomm ); + } else { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, slucomm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, slucomm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, slucomm ); + + /* Allocate storage for compressed column representation. */ + zallocateA_dist(n, nnz, &nzval, &rowind, &colptr); + + MPI_Bcast( nzval, nnz, SuperLU_MPI_DOUBLE_COMPLEX, 0, slucomm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, slucomm ); + MPI_Bcast( colptr, n+1, mpi_int_t, 0, slucomm ); + } + +#if 0 + nzval[0]=0.1; +#endif + + /* Compute the number of rows to be distributed to local process */ + m_loc = m / nprocs; //(grid->nprow * grid->npcol); + m_loc_fst = m_loc; + /* When m / procs is not an integer */ + if ((m_loc * nprocs) != m) { + /*m_loc = m_loc+1; + m_loc_fst = m_loc;*/ + if (iam == (nprocs - 1)) /* last proc. gets all*/ + m_loc = m - m_loc * (nprocs - 1); + } + + /* Create compressed column matrix for GA. 
*/ + zCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr, + SLU_NC, SLU_Z, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. */ + if ( !(b_global = doublecomplexMalloc_dist(m*nrhs)) ) + ABORT("Malloc fails for b[]"); + if ( !(xtrue_global = doublecomplexMalloc_dist(n*nrhs)) ) + ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + + zGenXtrue_dist(n, nrhs, xtrue_global, n); + zFillRHS_dist(trans, nrhs, xtrue_global, n, &GA, b_global, m); + + /************************************************* + * Change GA to a local A with NR_loc format * + *************************************************/ + + rowptr = (int_t *) intMalloc_dist(m_loc+1); + marker = (int_t *) intCalloc_dist(n); + + /* Get counts of each row of GA */ + for (i = 0; i < n; ++i) + for (j = colptr[i]; j < colptr[i+1]; ++j) ++marker[rowind[j]]; + /* Set up row pointers */ + rowptr[0] = 0; + fst_row = iam * m_loc_fst; + nnz_loc = 0; + for (j = 0; j < m_loc; ++j) { + row = fst_row + j; + rowptr[j+1] = rowptr[j] + marker[row]; + marker[j] = rowptr[j]; + } + nnz_loc = rowptr[m_loc]; + + nzval_loc = (doublecomplex *) doublecomplexMalloc_dist(nnz_loc); + colind = (int_t *) intMalloc_dist(nnz_loc); + + /* Transfer the matrix into the compressed row storage */ + for (i = 0; i < n; ++i) { + for (j = colptr[i]; j < colptr[i+1]; ++j) { + row = rowind[j]; + if ( (row>=fst_row) && (row<fst_row+m_loc) ) { + row = row - fst_row; + relpos = marker[row]; + colind[relpos] = i; + nzval_loc[relpos] = nzval[j]; + ++marker[row]; + } + } + } + +#if ( DEBUGlevel>=2 ) + if ( !iam ) zPrint_CompCol_Matrix_dist(&GA); +#endif + + /* Destroy GA */ + Destroy_CompCol_Matrix_dist(&GA); + + /******************************************************/ + /* Change GA to a local A with NR_loc format */ + /******************************************************/ + + /* Set up the local A in NR_loc format */ + zCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row, + nzval_loc, colind, rowptr, + SLU_NR_loc, SLU_Z, SLU_GE); + + /* Get the local B */ + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < m_loc; ++i) { + row = fst_row + i; + rhs[j*m_loc+i] = b_global[j*n+row]; + } + } + *ldb = m_loc; + *ldx = m_loc; + + /* Set the true X */ + /* Get the local part of xtrue_global */ + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < m_loc; ++i) + x[i + j*(*ldx)] = xtrue_global[i + fst_row + j*n]; + } + + SUPERLU_FREE(b_global); + SUPERLU_FREE(xtrue_global); + SUPERLU_FREE(marker); + +#if ( DEBUGlevel>=1 ) + printf("sizeof(NRformat_loc) %lu\n", sizeof(NRformat_loc)); + CHECK_MALLOC(iam, "Exit c2f_zcreate_matrix_x_b()"); +#endif + + *m_g = m; + *n_g = n; + *nnz_g = nnz; + return 0; +} diff --git a/FORTRAN/f_pddrive3d.f90 b/FORTRAN/f_pddrive3d.f90 new file mode 100644 index 00000000..b124af36 --- /dev/null +++ b/FORTRAN/f_pddrive3d.f90 @@ -0,0 +1,161 @@ + + +!> @file ! Copyright (c) 2003, The Regents of the University of California, through ! Lawrence Berkeley National Laboratory (subject to receipt of any required ! approvals from U.S. Dept. of Energy) ! ! All rights reserved. ! ! The source code is distributed under BSD license, see the file License.txt ! at the top-level directory. ! !> @file !! \brief The driver program to solve a linear system with default options. !! !!
+!! -- Distributed SuperLU routine (version 7.0) --
+!! Lawrence Berkeley National Lab, Univ. of California Berkeley.
+!! January 2, 2021
+!!
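+!! Note: the driver builds a 2 x 2 x 2 process grid, so it expects
+!! nprow * npcol * npdep = 8 MPI ranks; any surplus ranks fail the grid
+!! membership test below and jump straight to releasing the grid.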
+! + program f_pddrive3d +! +! Purpose +! ======= +! +! The driver program F_PDDRIVE3D. +! +! This example illustrates how to use F_PDGSSVX3D with the full +! (default) options to solve a linear system. +! +! Seven basic steps are required: +! 1. Create C structures used in SuperLU_DIST +! 2. Initialize the MPI environment and the SuperLU process grid +! 3. Set up the input matrix and the right-hand side +! 4. Set the options argument +! 5. Call f_pdgssvx3d +! 6. Release the process grid and terminate the MPI environment +! 7. Release all structures +! +! The program may be run by typing +! mpiexec -np 8 f_pddrive3d +! + use superlu_mod +! implicit none + include 'mpif.h' + integer maxn, maxnz, maxnrhs + parameter ( maxn = 10000, maxnz = 100000, maxnrhs = 10 ) + integer rowind(maxnz), colptr(maxn) + real*8 values(maxnz), b(maxn), berr(maxnrhs), xtrue(maxn) + integer n, m, nnz, nprow, npcol, npdep, init + integer*4 iam, info, i, ierr, ldb, nrhs + character*80 fname + + integer(superlu_ptr) :: grid ! 3D process grid + integer(superlu_ptr) :: options + integer(superlu_ptr) :: ScalePermstruct + integer(superlu_ptr) :: LUstruct + integer(superlu_ptr) :: SOLVEstruct + integer(superlu_ptr) :: A ! A is on all 3D processes + integer(superlu_ptr) :: stat + +! Initialize MPI environment + call mpi_init(ierr) + +! Check malloc +! call f_check_malloc(iam) + +! Create Fortran handles for the C structures used in SuperLU_DIST + call f_create_gridinfo3d_handle(grid) + call f_create_options_handle(options) + call f_create_ScalePerm_handle(ScalePermstruct) + call f_create_LUstruct_handle(LUstruct) + call f_create_SOLVEstruct_handle(SOLVEstruct) + call f_create_SuperMatrix_handle(A) + call f_create_SuperLUStat_handle(stat) + +! Initialize the SuperLU_DIST process grid + nprow = 2 + npcol = 2 + npdep = 2 + call f_superlu_gridinit3d(MPI_COMM_WORLD, nprow, npcol, npdep, grid) + +! Bail out if I do not belong in the grid. + call get_GridInfo(grid, iam=iam, npdep=npdep) + if ( iam >= (nprow * npcol * npdep) ) then + go to 100 + endif + if ( iam == 0 ) then + write(*,*) ' Process grid: ', nprow, ' X', npcol, ' X', npdep + endif + +! Read and distribute the matrix to the process gird + nrhs = 1 + fname = '../EXAMPLE/g20.rua'//char(0) !! make the string null-ended + call f_dcreate_matrix_x_b_3d(fname, A, m, n, nnz, & + nrhs, b, ldb, xtrue, ldx, grid) + + if ( iam == 0 ) then + write(*,*) ' Matrix A was set up: m ', m, ' nnz ', nnz + endif + +! Set the default input options + call f_set_default_options(options) + +! Change one or more options +! call set_superlu_options(options,Fact=FACTORED) +! call set_superlu_options(options,ParSymbFact=YES) + +! Initialize ScalePermstruct and LUstruct + call get_SuperMatrix(A, nrow=m, ncol=n) + call f_ScalePermstructInit(m, n, ScalePermstruct) + call f_LUstructInit(m, n, LUstruct) + +! Initialize the statistics variables + call f_PStatInit(stat) + +! Call the linear equation solver + call f_pdgssvx3d(options, A, ScalePermstruct, b, ldb, nrhs, & + grid, LUstruct, SOLVEstruct, berr, stat, info) + + if (info == 0) then + if ( iam == 0 ) then + write (*,*) 'Backward error: ', (berr(i), i = 1, nrhs) + endif + else + write(*,*) 'INFO from f_pdgssvx = ', info + endif + +! Deallocate the storage allocated by SuperLU_DIST + call f_PStatFree(stat) + call f_Destroy_CompRowLoc_Mat_dist(A) + call f_ScalePermstructFree(ScalePermstruct) + call f_Destroy_LU_SOLVE_struct_3d(options, n, grid, LUstruct, SOLVEstruct) +! call f_LUstructFree(LUstruct) +! 
call get_superlu_options(options, SolveInitialized=init) +! if (init == YES) then +! call f_dSolveFinalize(options, SOLVEstruct) +! endif + +! Release the SuperLU process grid +100 call f_superlu_gridexit(grid) + +! Deallocate the C structures pointed to by the Fortran handles + call f_destroy_gridinfo_handle(grid) + call f_destroy_options_handle(options) + call f_destroy_ScalePerm_handle(ScalePermstruct) + call f_destroy_LUstruct_handle(LUstruct) + call f_destroy_SOLVEstruct_handle(SOLVEstruct) + call f_destroy_SuperMatrix_handle(A) + call f_destroy_SuperLUStat_handle(stat) + +! Check malloc +! call f_check_malloc(iam) + + +! Terminate the MPI execution environment + call mpi_finalize(ierr) + + stop + end diff --git a/FORTRAN/f_pzdrive3d.f90 b/FORTRAN/f_pzdrive3d.f90 new file mode 100644 index 00000000..42a0a12d --- /dev/null +++ b/FORTRAN/f_pzdrive3d.f90 @@ -0,0 +1,161 @@ + +!> @file +! Copyright (c) 2003, The Regents of the University of California, through +! Lawrence Berkeley National Laboratory (subject to receipt of any required +! approvals from U.S. Dept. of Energy) +! +! All rights reserved. +! +! The source code is distributed under BSD license, see the file License.txt +! at the top-level directory. +! +!> @file +!! \brief The driver program to solve a linear system with default options. +!! +!!
+!! -- Distributed SuperLU routine (version 7.0) --
+!! Lawrence Berkeley National Lab, Univ. of California Berkeley.
+!! January 2, 2021
+!!
+! + program f_pzdrive3d +! +! Purpose +! ======= +! +! The driver program F_PZDRIVE3D. +! +! This example illustrates how to use F_PZGSSVX3D with the full +! (default) options to solve a linear system. +! +! Seven basic steps are required: +! 1. Create C structures used in SuperLU_DIST +! 2. Initialize the MPI environment and the SuperLU process grid +! 3. Set up the input matrix and the right-hand side +! 4. Set the options argument +! 5. Call f_pzgssvx3d +! 6. Release the process grid and terminate the MPI environment +! 7. Release all structures +! +! The program may be run by typing +! mpiexec -np 8 f_pzdrive3d +! + use superlu_mod +! implicit none + include 'mpif.h' + integer maxn, maxnz, maxnrhs + parameter ( maxn = 10000, maxnz = 100000, maxnrhs = 10 ) + integer rowind(maxnz), colptr(maxn) + double complex values(maxnz), b(maxn), xtrue(maxn) + real*8 berr(maxnrhs) + integer n, m, nnz, nprow, npcol, npdep, init + integer*4 iam, info, i, ierr, ldb, nrhs + character*80 fname + + integer(superlu_ptr) :: grid ! 3D process grid + integer(superlu_ptr) :: options + integer(superlu_ptr) :: ScalePermstruct + integer(superlu_ptr) :: LUstruct + integer(superlu_ptr) :: SOLVEstruct + integer(superlu_ptr) :: A ! A is on all 3D processes + integer(superlu_ptr) :: stat + +! Initialize MPI environment + call mpi_init(ierr) + +! Check malloc +! call f_check_malloc(iam) + +! Create Fortran handles for the C structures used in SuperLU_DIST + call f_create_gridinfo3d_handle(grid) + call f_create_options_handle(options) + call f_create_ScalePerm_handle(ScalePermstruct) + call f_create_LUstruct_handle(LUstruct) + call f_create_SOLVEstruct_handle(SOLVEstruct) + call f_create_SuperMatrix_handle(A) + call f_create_SuperLUStat_handle(stat) + +! Initialize the SuperLU_DIST process grid + nprow = 2 + npcol = 2 + npdep = 2 + call f_superlu_gridinit3d(MPI_COMM_WORLD, nprow, npcol, npdep, grid) + +! Bail out if I do not belong in the grid. + call get_GridInfo(grid, iam=iam, npdep=npdep) + if ( iam >= (nprow * npcol * npdep) ) then + go to 100 + endif + if ( iam == 0 ) then + write(*,*) ' Process grid: ', nprow, ' X', npcol, ' X', npdep + endif + +! Read and distribute the matrix to the process gird + nrhs = 1 + fname = '../EXAMPLE/cg20.cua'//char(0) !! make the string null-ended + call f_zcreate_matrix_x_b_3d(fname, A, m, n, nnz, & + nrhs, b, ldb, xtrue, ldx, grid) + + if ( iam == 0 ) then + write(*,*) ' Matrix A was set up: m ', m, ' nnz ', nnz + endif + +! Set the default input options + call f_set_default_options(options) + +! Change one or more options +! call set_superlu_options(options,Fact=FACTORED) +! call set_superlu_options(options,ParSymbFact=YES) + +! Initialize ScalePermstruct and LUstruct + call get_SuperMatrix(A, nrow=m, ncol=n) + call f_ScalePermstructInit(m, n, ScalePermstruct) + call f_LUstructInit(m, n, LUstruct) + +! Initialize the statistics variables + call f_PStatInit(stat) + +! Call the linear equation solver + call f_pzgssvx3d(options, A, ScalePermstruct, b, ldb, nrhs, & + grid, LUstruct, SOLVEstruct, berr, stat, info) + + if (info == 0) then + if ( iam == 0 ) then + write (*,*) 'Backward error: ', (berr(i), i = 1, nrhs) + endif + else + write(*,*) 'INFO from f_pdgssvx = ', info + endif + +! Deallocate the storage allocated by SuperLU_DIST + call f_PStatFree(stat) + call f_Destroy_CompRowLoc_Mat_dist(A) + call f_ScalePermstructFree(ScalePermstruct) + call f_Destroy_LU_SOLVE_struct_3d(options, n, grid, LUstruct, SOLVEstruct) +! call f_LUstructFree(LUstruct) +! 
call get_superlu_options(options, SolveInitialized=init) +! if (init == YES) then +! call f_dSolveFinalize(options, SOLVEstruct) +! endif + +! Release the SuperLU process grid +100 call f_superlu_gridexit(grid) + +! Deallocate the C structures pointed to by the Fortran handles + call f_destroy_gridinfo_handle(grid) + call f_destroy_options_handle(options) + call f_destroy_ScalePerm_handle(ScalePermstruct) + call f_destroy_LUstruct_handle(LUstruct) + call f_destroy_SOLVEstruct_handle(SOLVEstruct) + call f_destroy_SuperMatrix_handle(A) + call f_destroy_SuperLUStat_handle(stat) + +! Check malloc +! call f_check_malloc(iam) + + +! Terminate the MPI execution environment + call mpi_finalize(ierr) + + stop + end From 02cc861307763959a5e660ba3c863606f95ab8af Mon Sep 17 00:00:00 2001 From: Xiaoye Li Date: Thu, 21 Jan 2021 16:30:58 -0500 Subject: [PATCH 059/147] small fixes related to Intel. --- SRC/sec_structs.c | 2 +- SRC/superlu_defs.h | 3 ++- SRC/superlu_dist_config.h | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/SRC/sec_structs.c b/SRC/sec_structs.c index 7b5ea143..7085cbfe 100644 --- a/SRC/sec_structs.c +++ b/SRC/sec_structs.c @@ -1,4 +1,4 @@ -#include +//#include #include "superlu_ddefs.h" #if 0 #include "sec_structs.h" diff --git a/SRC/superlu_defs.h b/SRC/superlu_defs.h index 5f400652..690be21b 100644 --- a/SRC/superlu_defs.h +++ b/SRC/superlu_defs.h @@ -107,7 +107,8 @@ extern MPI_Datatype SuperLU_MPI_DOUBLE_COMPLEX; static __inline__ unsigned long long _rdtsc(void) { unsigned long long int x; - __asm__ volatile (".byte 0x0f, 0x31" : "=A" (x)); + // __asm__ volatile (".byte 0x0f, 0x31" : "=A" (x)); + x = 0; return x; } #endif diff --git a/SRC/superlu_dist_config.h b/SRC/superlu_dist_config.h index c3def71c..2416fef5 100644 --- a/SRC/superlu_dist_config.h +++ b/SRC/superlu_dist_config.h @@ -10,7 +10,7 @@ /* #undef HAVE_COMBBLAS */ /* enable 64bit index mode */ -#define XSDK_INDEX_SIZE 64 +/* #undef XSDK_INDEX_SIZE */ #if (XSDK_INDEX_SIZE == 64) #define _LONGINT 1 From 8583a62d94b735904c276e2bf400be63e0bd4743 Mon Sep 17 00:00:00 2001 From: piyush sao Date: Thu, 21 Jan 2021 16:31:54 -0500 Subject: [PATCH 060/147] Adding cub. 
to the tracking --- SRC/cub/block/block_discontinuity.cuh | 593 +++++ SRC/cub/block/block_exchange.cuh | 988 +++++++ SRC/cub/block/block_histogram.cuh | 415 +++ SRC/cub/block/block_load.cuh | 1086 ++++++++ SRC/cub/block/block_radix_rank.cuh | 485 ++++ SRC/cub/block/block_radix_sort.cuh | 863 ++++++ SRC/cub/block/block_raking_layout.cuh | 149 ++ SRC/cub/block/block_reduce.cuh | 607 +++++ SRC/cub/block/block_scan.cuh | 2318 +++++++++++++++++ SRC/cub/block/block_shift.cuh | 325 +++ SRC/cub/block/block_store.cuh | 892 +++++++ .../block_histogram_atomic.cuh | 82 + .../specializations/block_histogram_sort.cuh | 226 ++ .../specializations/block_reduce_raking.cuh | 247 ++ .../block_reduce_raking_commutative_only.cuh | 202 ++ .../block_reduce_warp_reductions.cuh | 222 ++ .../specializations/block_scan_raking.cuh | 788 ++++++ .../specializations/block_scan_warp_scans.cuh | 421 +++ SRC/cub/block_range/block_range_histo.cuh | 319 +++ .../block_range_radix_sort_downsweep.cuh | 744 ++++++ .../block_range_radix_sort_upsweep.cuh | 450 ++++ SRC/cub/block_range/block_range_reduce.cuh | 430 +++ .../block_range/block_range_reduce_by_key.cuh | 1034 ++++++++ SRC/cub/block_range/block_range_scan.cuh | 538 ++++ SRC/cub/block_range/block_range_select.cuh | 735 ++++++ .../block_scan_prefix_operators.cuh | 566 ++++ .../block_range_histo_gatomic.cuh | 184 ++ .../block_range_histo_satomic.cuh | 245 ++ .../block_range_histo_sort.cuh | 364 +++ SRC/cub/cub.cuh | 95 + SRC/cub/device/device_histogram.cuh | 653 +++++ SRC/cub/device/device_partition.cuh | 275 ++ SRC/cub/device/device_radix_sort.cuh | 420 +++ SRC/cub/device/device_reduce.cuh | 804 ++++++ SRC/cub/device/device_scan.cuh | 419 +++ SRC/cub/device/device_select.cuh | 372 +++ .../dispatch/device_histogram_dispatch.cuh | 554 ++++ .../dispatch/device_radix_sort_dispatch.cuh | 939 +++++++ .../device_reduce_by_key_dispatch.cuh | 594 +++++ .../dispatch/device_reduce_dispatch.cuh | 743 ++++++ .../device/dispatch/device_scan_dispatch.cuh | 565 ++++ .../dispatch/device_select_dispatch.cuh | 564 ++++ SRC/cub/grid/grid_barrier.cuh | 211 ++ SRC/cub/grid/grid_even_share.cuh | 185 ++ SRC/cub/grid/grid_mapping.cuh | 95 + SRC/cub/grid/grid_queue.cuh | 216 ++ SRC/cub/host/spinlock.cuh | 123 + SRC/cub/iterator/arg_index_input_iterator.cuh | 255 ++ .../cache_modified_input_iterator.cuh | 240 ++ .../cache_modified_output_iterator.cuh | 253 ++ SRC/cub/iterator/constant_input_iterator.cuh | 235 ++ SRC/cub/iterator/counting_input_iterator.cuh | 228 ++ SRC/cub/iterator/tex_obj_input_iterator.cuh | 308 +++ SRC/cub/iterator/tex_ref_input_iterator.cuh | 370 +++ SRC/cub/iterator/transform_input_iterator.cuh | 252 ++ SRC/cub/thread/thread_load.cuh | 444 ++++ SRC/cub/thread/thread_operators.cuh | 206 ++ SRC/cub/thread/thread_reduce.cuh | 169 ++ SRC/cub/thread/thread_scan.cuh | 283 ++ SRC/cub/thread/thread_store.cuh | 414 +++ SRC/cub/util_allocator.cuh | 664 +++++ SRC/cub/util_arch.cuh | 197 ++ SRC/cub/util_debug.cuh | 115 + SRC/cub/util_device.cuh | 372 +++ SRC/cub/util_macro.cuh | 107 + SRC/cub/util_namespace.cuh | 41 + SRC/cub/util_ptx.cuh | 606 +++++ SRC/cub/util_type.cuh | 1027 ++++++++ .../warp/specializations/warp_reduce_shfl.cuh | 330 +++ .../warp/specializations/warp_reduce_smem.cuh | 358 +++ .../warp/specializations/warp_scan_shfl.cuh | 401 +++ .../warp/specializations/warp_scan_smem.cuh | 319 +++ SRC/cub/warp/warp_reduce.cuh | 627 +++++ SRC/cub/warp/warp_scan.cuh | 1451 +++++++++++ 74 files changed, 34617 insertions(+) create mode 100644 SRC/cub/block/block_discontinuity.cuh create 
mode 100644 SRC/cub/block/block_exchange.cuh create mode 100644 SRC/cub/block/block_histogram.cuh create mode 100644 SRC/cub/block/block_load.cuh create mode 100644 SRC/cub/block/block_radix_rank.cuh create mode 100644 SRC/cub/block/block_radix_sort.cuh create mode 100644 SRC/cub/block/block_raking_layout.cuh create mode 100644 SRC/cub/block/block_reduce.cuh create mode 100644 SRC/cub/block/block_scan.cuh create mode 100644 SRC/cub/block/block_shift.cuh create mode 100644 SRC/cub/block/block_store.cuh create mode 100644 SRC/cub/block/specializations/block_histogram_atomic.cuh create mode 100644 SRC/cub/block/specializations/block_histogram_sort.cuh create mode 100644 SRC/cub/block/specializations/block_reduce_raking.cuh create mode 100644 SRC/cub/block/specializations/block_reduce_raking_commutative_only.cuh create mode 100644 SRC/cub/block/specializations/block_reduce_warp_reductions.cuh create mode 100644 SRC/cub/block/specializations/block_scan_raking.cuh create mode 100644 SRC/cub/block/specializations/block_scan_warp_scans.cuh create mode 100644 SRC/cub/block_range/block_range_histo.cuh create mode 100644 SRC/cub/block_range/block_range_radix_sort_downsweep.cuh create mode 100644 SRC/cub/block_range/block_range_radix_sort_upsweep.cuh create mode 100644 SRC/cub/block_range/block_range_reduce.cuh create mode 100644 SRC/cub/block_range/block_range_reduce_by_key.cuh create mode 100644 SRC/cub/block_range/block_range_scan.cuh create mode 100644 SRC/cub/block_range/block_range_select.cuh create mode 100644 SRC/cub/block_range/block_scan_prefix_operators.cuh create mode 100644 SRC/cub/block_range/specializations/block_range_histo_gatomic.cuh create mode 100644 SRC/cub/block_range/specializations/block_range_histo_satomic.cuh create mode 100644 SRC/cub/block_range/specializations/block_range_histo_sort.cuh create mode 100644 SRC/cub/cub.cuh create mode 100644 SRC/cub/device/device_histogram.cuh create mode 100644 SRC/cub/device/device_partition.cuh create mode 100644 SRC/cub/device/device_radix_sort.cuh create mode 100644 SRC/cub/device/device_reduce.cuh create mode 100644 SRC/cub/device/device_scan.cuh create mode 100644 SRC/cub/device/device_select.cuh create mode 100644 SRC/cub/device/dispatch/device_histogram_dispatch.cuh create mode 100644 SRC/cub/device/dispatch/device_radix_sort_dispatch.cuh create mode 100644 SRC/cub/device/dispatch/device_reduce_by_key_dispatch.cuh create mode 100644 SRC/cub/device/dispatch/device_reduce_dispatch.cuh create mode 100644 SRC/cub/device/dispatch/device_scan_dispatch.cuh create mode 100644 SRC/cub/device/dispatch/device_select_dispatch.cuh create mode 100644 SRC/cub/grid/grid_barrier.cuh create mode 100644 SRC/cub/grid/grid_even_share.cuh create mode 100644 SRC/cub/grid/grid_mapping.cuh create mode 100644 SRC/cub/grid/grid_queue.cuh create mode 100644 SRC/cub/host/spinlock.cuh create mode 100644 SRC/cub/iterator/arg_index_input_iterator.cuh create mode 100644 SRC/cub/iterator/cache_modified_input_iterator.cuh create mode 100644 SRC/cub/iterator/cache_modified_output_iterator.cuh create mode 100644 SRC/cub/iterator/constant_input_iterator.cuh create mode 100644 SRC/cub/iterator/counting_input_iterator.cuh create mode 100644 SRC/cub/iterator/tex_obj_input_iterator.cuh create mode 100644 SRC/cub/iterator/tex_ref_input_iterator.cuh create mode 100644 SRC/cub/iterator/transform_input_iterator.cuh create mode 100644 SRC/cub/thread/thread_load.cuh create mode 100644 SRC/cub/thread/thread_operators.cuh create mode 100644 SRC/cub/thread/thread_reduce.cuh create 
mode 100644 SRC/cub/thread/thread_scan.cuh create mode 100644 SRC/cub/thread/thread_store.cuh create mode 100644 SRC/cub/util_allocator.cuh create mode 100644 SRC/cub/util_arch.cuh create mode 100644 SRC/cub/util_debug.cuh create mode 100644 SRC/cub/util_device.cuh create mode 100644 SRC/cub/util_macro.cuh create mode 100644 SRC/cub/util_namespace.cuh create mode 100644 SRC/cub/util_ptx.cuh create mode 100644 SRC/cub/util_type.cuh create mode 100644 SRC/cub/warp/specializations/warp_reduce_shfl.cuh create mode 100644 SRC/cub/warp/specializations/warp_reduce_smem.cuh create mode 100644 SRC/cub/warp/specializations/warp_scan_shfl.cuh create mode 100644 SRC/cub/warp/specializations/warp_scan_smem.cuh create mode 100644 SRC/cub/warp/warp_reduce.cuh create mode 100644 SRC/cub/warp/warp_scan.cuh diff --git a/SRC/cub/block/block_discontinuity.cuh b/SRC/cub/block/block_discontinuity.cuh new file mode 100644 index 00000000..6b2f8c78 --- /dev/null +++ b/SRC/cub/block/block_discontinuity.cuh @@ -0,0 +1,593 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockDiscontinuity class provides [collective](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block. + */ + +#pragma once + +#include "../util_type.cuh" +#include "../util_ptx.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief The BlockDiscontinuity class provides [collective](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block. ![](discont_logo.png) + * \ingroup BlockModule + * + * \tparam T The data type to be flagged. 
+ * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - A set of "head flags" (or "tail flags") is often used to indicate corresponding items + * that differ from their predecessors (or successors). For example, head flags are convenient + * for demarcating disjoint data segments as part of a segmented scan or reduction. + * - \blocked + * + * \par Performance Considerations + * - \granularity + * + * \par A Simple Example + * \blockcollective{BlockDiscontinuity} + * \par + * The code snippet below illustrates the head flagging of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int + * typedef cub::BlockDiscontinuity BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute head flags for discontinuities in the segment + * int head_flags[4]; + * BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }. + * The corresponding output \p head_flags in those threads will be + * { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. 
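+ *
+ * \par
+ * (Editorial sketch; not part of the upstream CUB documentation) The same
+ * pattern in a complete, launchable kernel -- the buffer names \p d_in and
+ * \p d_flags below are hypothetical:
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * // Launch with 128 threads per block; each thread owns 4 consecutive items
+ * __global__ void HeadFlagsKernel(int *d_in, int *d_flags)
+ * {
+ *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
+ *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+ *
+ *     // Load a blocked arrangement of 4 items per thread
+ *     int thread_data[4];
+ *     for (int i = 0; i < 4; ++i)
+ *         thread_data[i] = d_in[threadIdx.x * 4 + i];
+ *
+ *     // Flag the first item of every run of equal values
+ *     int head_flags[4];
+ *     BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());
+ *
+ *     // Write the flags out in the same blocked order
+ *     for (int i = 0; i < 4; ++i)
+ *         d_flags[threadIdx.x * 4 + i] = head_flags[i];
+ * }
+ * \endcode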
+ * + * \par Performance Considerations + * - Incurs zero bank conflicts for most types + * + */ +template < + typename T, + int BLOCK_DIM_X, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockDiscontinuity +{ +private: + + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + + /// Shared memory storage layout type (last element from each thread's input) + typedef T _TempStorage[BLOCK_THREADS]; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /// Specialization for when FlagOp has third index param + template ::HAS_PARAM> + struct ApplyOp + { + // Apply flag operator + static __device__ __forceinline__ bool Flag(FlagOp flag_op, const T &a, const T &b, int idx) + { + return flag_op(a, b, idx); + } + }; + + /// Specialization for when FlagOp does not have a third index param + template + struct ApplyOp + { + // Apply flag operator + static __device__ __forceinline__ bool Flag(FlagOp flag_op, const T &a, const T &b, int idx) + { + return flag_op(a, b); + } + }; + + /// Templated unrolling of item comparison (inductive case) + template + struct Iterate + { + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + static __device__ __forceinline__ void FlagItems( + int linear_tid, + FlagT (&flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + flags[ITERATION] = ApplyOp::Flag( + flag_op, + input[ITERATION - 1], + input[ITERATION], + (linear_tid * ITEMS_PER_THREAD) + ITERATION); + + Iterate::FlagItems(linear_tid, flags, input, flag_op); + } + }; + + /// Templated unrolling of item comparison (termination case) + template + struct Iterate + { + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + static __device__ __forceinline__ void FlagItems( + int linear_tid, + FlagT (&flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + {} + }; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + +public: + + /// \smemstorage{BlockDiscontinuity} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. 
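+     *
+     * \par
+     * (Editorial note) With this constructor the caller declares no
+     * TempStorage of its own; a minimal hypothetical sketch:
+     * \code
+     * int head_flags[4];
+     * cub::BlockDiscontinuity<int, 128>().FlagHeads(head_flags, thread_data, cub::Inequality());
+     * \endcode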
+ */ + __device__ __forceinline__ BlockDiscontinuity() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. + */ + __device__ __forceinline__ BlockDiscontinuity( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Head flag operations + *********************************************************************/ + //@{ + + + /** + * \brief Sets head flags indicating discontinuities between items partitioned across the thread block, for which the first item has no reference and is always flagged. + * + * \par + * - The flag head_flagsi is set for item + * inputi when + * flag_op(previous-item, inputi) + * returns \p true (where previous-item is either the preceding item + * in the same thread or the last item in the previous thread). + * - For thread0, item input0 is always flagged. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the head-flagging of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int + * typedef cub::BlockDiscontinuity BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute head flags for discontinuities in the segment + * int head_flags[4]; + * BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }. + * The corresponding output \p head_flags in those threads will be + * { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam FlagT [inferred] The flag type (must be an integer type) + * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + */ + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeads( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share last item + temp_storage[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + __syncthreads(); + + // Set flag for first item + head_flags[0] = (linear_tid == 0) ? 
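+            // (Editorial note) Every thread published its last item into
+            // temp_storage above, so temp_storage[linear_tid - 1] holds this
+            // thread's predecessor; thread 0 has no predecessor and is
+            // flagged unconditionally.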
+                1 :                                 // First thread
+                ApplyOp<FlagOp>::Flag(
+                    flag_op,
+                    temp_storage[linear_tid - 1],
+                    input[0],
+                    linear_tid * ITEMS_PER_THREAD);
+
+        // Set head_flags for remaining items
+        Iterate<1, ITEMS_PER_THREAD>::FlagItems(linear_tid, head_flags, input, flag_op);
+    }
+
+
+    /**
+     * \brief Sets head flags indicating discontinuities between items partitioned across the thread block.
+     *
+     * \par
+     * - The flag head_flags[i] is set for item input[i] when
+     *   flag_op(previous-item, input[i]) returns \p true (where
+     *   previous-item is either the preceding item in the same thread or the
+     *   last item in the previous thread).
+     * - For thread 0, item input[0] is compared against \p tile_predecessor_item.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the head-flagging of 512 integer items that
+     * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
+     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
+     *
+     *     // Allocate shared memory for BlockDiscontinuity
+     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Have thread 0 obtain the predecessor item for the entire tile
+     *     int tile_predecessor_item;
+     *     if (threadIdx.x == 0) tile_predecessor_item = ...
+     *
+     *     // Collectively compute head flags for discontinuities in the segment
+     *     int head_flags[4];
+     *     BlockDiscontinuity(temp_storage).FlagHeads(
+     *         head_flags, thread_data, cub::Inequality(), tile_predecessor_item);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is
+     * { [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... },
+     * and that \p tile_predecessor_item is \p 0.  The corresponding output \p head_flags
+     * in those threads will be
+     * { [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }.
+     *
+     * \tparam ITEMS_PER_THREAD    [inferred] The number of consecutive items partitioned onto each thread.
+     * \tparam FlagT               [inferred] The flag type (must be an integer type)
+     * \tparam FlagOp              [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeads(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op,                            ///< [in] Binary boolean flag predicate
+        T               tile_predecessor_item)              ///< [in] [thread 0 only] Item with which to compare the first tile item (input[0] from thread 0).
+    {
+        // Share last item
+        temp_storage[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        __syncthreads();
+
+        // Set flag for first item
+        T predecessor_item = (linear_tid == 0) ?
+ tile_predecessor_item : // First thread + temp_storage[linear_tid - 1]; + + head_flags[0] = ApplyOp::Flag( + flag_op, + predecessor_item, + input[0], + linear_tid * ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagItems(linear_tid, head_flags, input, flag_op); + } + + + //@} end member group + /******************************************************************//** + * \name Tail flag operations + *********************************************************************/ + //@{ + + + /** + * \brief Sets tail flags indicating discontinuities between items partitioned across the thread block, for which the last item has no reference and is always flagged. + * + * \par + * - The flag tail_flagsi is set for item + * inputi when + * flag_op(inputi, next-item) + * returns \p true (where next-item is either the next item + * in the same thread or the first item in the next thread). + * - For threadBLOCK_THREADS-1, item + * inputITEMS_PER_THREAD-1 is always flagged. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the tail-flagging of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int + * typedef cub::BlockDiscontinuity BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute tail flags for discontinuities in the segment + * int tail_flags[4]; + * BlockDiscontinuity(temp_storage).FlagTails(tail_flags, thread_data, cub::Inequality()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }. + * The corresponding output \p tail_flags in those threads will be + * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam FlagT [inferred] The flag type (must be an integer type) + * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + */ + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagTails( + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first item + temp_storage[linear_tid] = input[0]; + + __syncthreads(); + + // Set flag for last item + tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? 
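+            // (Editorial note) Every thread published its first item into
+            // temp_storage above, so temp_storage[linear_tid + 1] holds this
+            // thread's successor; the last thread has no successor and is
+            // flagged unconditionally.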
+                1 :                                 // Last thread
+                ApplyOp<FlagOp>::Flag(
+                    flag_op,
+                    input[ITEMS_PER_THREAD - 1],
+                    temp_storage[linear_tid + 1],
+                    (linear_tid * ITEMS_PER_THREAD) + (ITEMS_PER_THREAD - 1));
+
+        // Set tail_flags for remaining items
+        Iterate<0, ITEMS_PER_THREAD - 1>::FlagItems(linear_tid, tail_flags, input, flag_op);
+    }
+
+
+    /**
+     * \brief Sets tail flags indicating discontinuities between items partitioned across the thread block.
+     *
+     * \par
+     * - The flag tail_flags[i] is set for item input[i] when
+     *   flag_op(input[i], next-item) returns \p true (where next-item is
+     *   either the next item in the same thread or the first item in the
+     *   next thread).
+     * - For thread BLOCK_THREADS-1, item input[ITEMS_PER_THREAD-1] is
+     *   compared against \p tile_successor_item.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the tail-flagging of 512 integer items that
+     * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
+     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
+     *
+     *     // Allocate shared memory for BlockDiscontinuity
+     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Have thread 127 obtain the successor item for the entire tile
+     *     int tile_successor_item;
+     *     if (threadIdx.x == 127) tile_successor_item = ...
+     *
+     *     // Collectively compute tail flags for discontinuities in the segment
+     *     int tail_flags[4];
+     *     BlockDiscontinuity(temp_storage).FlagTails(
+     *         tail_flags, thread_data, cub::Inequality(), tile_successor_item);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is
+     * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }
+     * and that \p tile_successor_item is \p 125.  The corresponding output \p tail_flags
+     * in those threads will be
+     * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }.
+     *
+     * \tparam ITEMS_PER_THREAD    [inferred] The number of consecutive items partitioned onto each thread.
+     * \tparam FlagT               [inferred] The flag type (must be an integer type)
+     * \tparam FlagOp              [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagTails(
+        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op,                            ///< [in] Binary boolean flag predicate
+        T               tile_successor_item)                ///< [in] [thread BLOCK_THREADS-1 only] Item with which to compare the last tile item (input[ITEMS_PER_THREAD-1] from thread BLOCK_THREADS-1).
+    {
+        // Share first item
+        temp_storage[linear_tid] = input[0];
+
+        __syncthreads();
+
+        // Set flag for last item
+        T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
+ tile_successor_item : // Last thread + temp_storage[linear_tid + 1]; + + tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::Flag( + flag_op, + input[ITEMS_PER_THREAD - 1], + successor_item, + (linear_tid * ITEMS_PER_THREAD) + (ITEMS_PER_THREAD - 1)); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagItems(linear_tid, tail_flags, input, flag_op); + } + + //@} end member group + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/block/block_exchange.cuh b/SRC/cub/block/block_exchange.cuh new file mode 100644 index 00000000..1eb4c5f4 --- /dev/null +++ b/SRC/cub/block/block_exchange.cuh @@ -0,0 +1,988 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockExchange class provides [collective](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block. + */ + +#pragma once + +#include "../util_ptx.cuh" +#include "../util_arch.cuh" +#include "../util_macro.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief The BlockExchange class provides [collective](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block. ![](transpose_logo.png) + * \ingroup BlockModule + * + * \tparam T The data type to be exchanged. + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam ITEMS_PER_THREAD The number of items partitioned onto each thread. + * \tparam WARP_TIME_SLICING [optional] When \p true, only use enough shared memory for a single warp's worth of tile data, time-slicing the block-wide exchange over multiple synchronized rounds. 
Yields a smaller memory footprint at the expense of decreased parallelism. (Default: false) + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - It is commonplace for blocks of threads to rearrange data items between + * threads. For example, the global memory subsystem prefers access patterns + * where data items are "striped" across threads (where consecutive threads access consecutive items), + * yet most block-wide operations prefer a "blocked" partitioning of items across threads + * (where consecutive items belong to a single thread). + * - BlockExchange supports the following types of data exchanges: + * - Transposing between [blocked](index.html#sec5sec3) and [striped](index.html#sec5sec3) arrangements + * - Transposing between [blocked](index.html#sec5sec3) and [warp-striped](index.html#sec5sec3) arrangements + * - Scattering ranked items to a [blocked arrangement](index.html#sec5sec3) + * - Scattering ranked items to a [striped arrangement](index.html#sec5sec3) + * - \blocked + * + * \par A Simple Example + * \blockcollective{BlockExchange} + * \par + * The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement + * of 512 integer items partitioned across 128 threads where each thread owns 4 items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) + * { + * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockExchange BlockExchange; + * + * // Allocate shared memory for BlockExchange + * __shared__ typename BlockExchange::TempStorage temp_storage; + * + * // Load a tile of data striped across threads + * int thread_data[4]; + * cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data); + * + * // Collectively exchange data into a blocked arrangement across threads + * BlockExchange(temp_storage).StripedToBlocked(thread_data); + * + * \endcode + * \par + * Suppose the set of striped input \p thread_data across the block of threads is + * { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }. + * The corresponding output \p thread_data in those threads will be + * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. + * + * \par Performance Considerations + * - Proper device-specific padding ensures zero bank conflicts for most types. + * + */ +template < + typename T, + int BLOCK_DIM_X, + int ITEMS_PER_THREAD, + bool WARP_TIME_SLICING = false, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockExchange +{ +private: + + /****************************************************************************** + * Constants + ******************************************************************************/ + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH), + WARP_THREADS = 1 << LOG_WARP_THREADS, + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + + LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(PTX_ARCH), + SMEM_BANKS = 1 << LOG_SMEM_BANKS, + + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + + TIME_SLICES = (WARP_TIME_SLICING) ? WARPS : 1, + + TIME_SLICED_THREADS = (WARP_TIME_SLICING) ? 
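+        // (Editorial worked example) For a 128-thread block of 32-thread warps
+        // with 4 items per thread: WARPS = 4 and TILE_ITEMS = 512.  With
+        // WARP_TIME_SLICING enabled, TIME_SLICED_THREADS = min(128, 32) = 32
+        // and TIME_SLICED_ITEMS = 128, so the shared array declared below
+        // holds one warp's slice of the tile rather than all 512 items.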
CUB_MIN(BLOCK_THREADS, WARP_THREADS) : BLOCK_THREADS, + TIME_SLICED_ITEMS = TIME_SLICED_THREADS * ITEMS_PER_THREAD, + + WARP_TIME_SLICED_THREADS = CUB_MIN(BLOCK_THREADS, WARP_THREADS), + WARP_TIME_SLICED_ITEMS = WARP_TIME_SLICED_THREADS * ITEMS_PER_THREAD, + + // Insert padding if the number of items per thread is a power of two + INSERT_PADDING = 0, // Mooch PowerOfTwo::VALUE, + PADDING_ITEMS = (INSERT_PADDING) ? (TIME_SLICED_ITEMS >> LOG_SMEM_BANKS) : 0, + }; + + /****************************************************************************** + * Type definitions + ******************************************************************************/ + + /// Shared memory storage layout type + typedef T _TempStorage[TIME_SLICED_ITEMS + PADDING_ITEMS]; + +public: + + /// \smemstorage{BlockExchange} + struct TempStorage : Uninitialized<_TempStorage> {}; + +private: + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + int lane_id; + int warp_id; + int warp_offset; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /** + * Transposes data items from blocked arrangement to striped arrangement. Specialized for no timeslicing. + */ + __device__ __forceinline__ void BlockedToStriped( + T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between blocked and striped arrangements. + Int2Type time_slicing) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage[item_offset] = items[ITEM]; + } + + __syncthreads(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + items[ITEM] = temp_storage[item_offset]; + } + } + + + /** + * Transposes data items from blocked arrangement to striped arrangement. Specialized for warp-timeslicing. + */ + __device__ __forceinline__ void BlockedToStriped( + T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between blocked and striped arrangements. 
+ Int2Type time_slicing) + { + T temp_items[ITEMS_PER_THREAD]; + + #pragma unroll + for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) + { + const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS; + const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS; + + __syncthreads(); + + if (warp_id == SLICE) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage[item_offset] = items[ITEM]; + } + } + + __syncthreads(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + // Read a strip of items + const int STRIP_OFFSET = ITEM * BLOCK_THREADS; + const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS; + + if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET)) + { + int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET; + if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS)) + { + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_items[ITEM] = temp_storage[item_offset]; + } + } + } + } + + // Copy + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + items[ITEM] = temp_items[ITEM]; + } + } + + + /** + * Transposes data items from blocked arrangement to warp-striped arrangement. Specialized for no timeslicing + */ + __device__ __forceinline__ void BlockedToWarpStriped( + T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between blocked and warp-striped arrangements. + Int2Type time_slicing) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD); + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage[item_offset] = items[ITEM]; + } + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + items[ITEM] = temp_storage[item_offset]; + } + } + + /** + * Transposes data items from blocked arrangement to warp-striped arrangement. Specialized for warp-timeslicing + */ + __device__ __forceinline__ void BlockedToWarpStriped( + T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between blocked and warp-striped arrangements. + Int2Type time_slicing) + { + #pragma unroll + for (int SLICE = 0; SLICE < TIME_SLICES; ++SLICE) + { + __syncthreads(); + + if (warp_id == SLICE) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD); + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage[item_offset] = items[ITEM]; + } + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + items[ITEM] = temp_storage[item_offset]; + } + } + } + } + + + /** + * Transposes data items from striped arrangement to blocked arrangement. Specialized for no timeslicing. + */ + __device__ __forceinline__ void StripedToBlocked( + T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between striped and blocked arrangements. 
+ Int2Type time_slicing) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage[item_offset] = items[ITEM]; + } + + __syncthreads(); + + // No timeslicing + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + items[ITEM] = temp_storage[item_offset]; + } + } + + + /** + * Transposes data items from striped arrangement to blocked arrangement. Specialized for warp-timeslicing. + */ + __device__ __forceinline__ void StripedToBlocked( + T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between striped and blocked arrangements. + Int2Type time_slicing) + { + // Warp time-slicing + T temp_items[ITEMS_PER_THREAD]; + + #pragma unroll + for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) + { + const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS; + const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS; + + __syncthreads(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + // Write a strip of items + const int STRIP_OFFSET = ITEM * BLOCK_THREADS; + const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS; + + if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET)) + { + int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET; + if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS)) + { + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage[item_offset] = items[ITEM]; + } + } + } + + __syncthreads(); + + if (warp_id == SLICE) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_items[ITEM] = temp_storage[item_offset]; + } + } + } + + // Copy + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + items[ITEM] = temp_items[ITEM]; + } + } + + + /** + * Transposes data items from warp-striped arrangement to blocked arrangement. Specialized for no timeslicing + */ + __device__ __forceinline__ void WarpStripedToBlocked( + T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between warp-striped and blocked arrangements. + Int2Type time_slicing) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage[item_offset] = items[ITEM]; + } + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD); + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + items[ITEM] = temp_storage[item_offset]; + } + } + + + /** + * Transposes data items from warp-striped arrangement to blocked arrangement. Specialized for warp-timeslicing + */ + __device__ __forceinline__ void WarpStripedToBlocked( + T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between warp-striped and blocked arrangements. 
+ Int2Type time_slicing) + { + #pragma unroll + for (int SLICE = 0; SLICE < TIME_SLICES; ++SLICE) + { + __syncthreads(); + + if (warp_id == SLICE) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage[item_offset] = items[ITEM]; + } + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD); + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + items[ITEM] = temp_storage[item_offset]; + } + } + } + } + + + /** + * Exchanges data items annotated by rank into blocked arrangement. Specialized for no timeslicing. + */ + template + __device__ __forceinline__ void ScatterToBlocked( + T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange + Offset ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks + Int2Type time_slicing) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ranks[ITEM]; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + temp_storage[item_offset] = items[ITEM]; + } + + __syncthreads(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + items[ITEM] = temp_storage[item_offset]; + } + } + + /** + * Exchanges data items annotated by rank into blocked arrangement. Specialized for warp-timeslicing. + */ + template + __device__ __forceinline__ void ScatterToBlocked( + T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange + Offset ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks + Int2Type time_slicing) + { + T temp_items[ITEMS_PER_THREAD]; + + #pragma unroll + for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) + { + __syncthreads(); + + const int SLICE_OFFSET = TIME_SLICED_ITEMS * SLICE; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ranks[ITEM] - SLICE_OFFSET; + if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS)) + { + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + temp_storage[item_offset] = items[ITEM]; + } + } + + __syncthreads(); + + if (warp_id == SLICE) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + temp_items[ITEM] = temp_storage[item_offset]; + } + } + } + + // Copy + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + items[ITEM] = temp_items[ITEM]; + } + } + + + /** + * Exchanges data items annotated by rank into striped arrangement. Specialized for no timeslicing. 
+ */ + template + __device__ __forceinline__ void ScatterToStriped( + T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange + Offset ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks + Int2Type time_slicing) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ranks[ITEM]; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + temp_storage[item_offset] = items[ITEM]; + } + + __syncthreads(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + items[ITEM] = temp_storage[item_offset]; + } + } + + + /** + * Exchanges data items annotated by rank into striped arrangement. Specialized for warp-timeslicing. + */ + template + __device__ __forceinline__ void ScatterToStriped( + T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange + Offset ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks + Int2Type time_slicing) + { + T temp_items[ITEMS_PER_THREAD]; + + #pragma unroll + for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) + { + const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS; + const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS; + + __syncthreads(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ranks[ITEM] - SLICE_OFFSET; + if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS)) + { + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + temp_storage[item_offset] = items[ITEM]; + } + } + + __syncthreads(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + // Read a strip of items + const int STRIP_OFFSET = ITEM * BLOCK_THREADS; + const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS; + + if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET)) + { + int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET; + if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS)) + { + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_items[ITEM] = temp_storage[item_offset]; + } + } + } + } + + // Copy + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + items[ITEM] = temp_items[ITEM]; + } + } + + +public: + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockExchange() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), + warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS), + lane_id(LaneId()), + warp_offset(warp_id * WARP_TIME_SLICED_ITEMS) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. + */ + __device__ __forceinline__ BlockExchange( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), + warp_id((WARPS == 1) ? 
0 : linear_tid / WARP_THREADS), + lane_id(LaneId()), + warp_offset(warp_id * WARP_TIME_SLICED_ITEMS) + {} + + + //@} end member group + /******************************************************************//** + * \name Structured exchanges + *********************************************************************/ + //@{ + + /** + * \brief Transposes data items from striped arrangement to blocked arrangement. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the conversion from a "striped" to a "blocked" arrangement + * of 512 integer items partitioned across 128 threads where each thread owns 4 items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) + * { + * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockExchange BlockExchange; + * + * // Allocate shared memory for BlockExchange + * __shared__ typename BlockExchange::TempStorage temp_storage; + * + * // Load a tile of ordered data into a striped arrangement across block threads + * int thread_data[4]; + * cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data); + * + * // Collectively exchange data into a blocked arrangement across threads + * BlockExchange(temp_storage).StripedToBlocked(thread_data); + * + * \endcode + * \par + * Suppose the set of striped input \p thread_data across the block of threads is + * { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] } after loading from global memory. + * The corresponding output \p thread_data in those threads will be + * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. + * + */ + __device__ __forceinline__ void StripedToBlocked( + T items[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between striped and blocked arrangements. + { + StripedToBlocked(items, Int2Type()); + } + + /** + * \brief Transposes data items from blocked arrangement to striped arrangement. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement + * of 512 integer items partitioned across 128 threads where each thread owns 4 items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) + * { + * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockExchange BlockExchange; + * + * // Allocate shared memory for BlockExchange + * __shared__ typename BlockExchange::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively exchange data into a striped arrangement across threads + * BlockExchange(temp_storage).BlockedToStriped(thread_data); + * + * // Store data striped across block threads into an ordered tile + * cub::StoreDirectStriped(threadIdx.x, d_data, thread_data); + * + * \endcode + * \par + * Suppose the set of blocked input \p thread_data across the block of threads is + * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. + * The corresponding output \p thread_data in those threads will be + * { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] } in + * preparation for storing to global memory. + * + */ + __device__ __forceinline__ void BlockedToStriped( + T items[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between blocked and striped arrangements. 
+ { + BlockedToStriped(items, Int2Type()); + } + + + /** + * \brief Transposes data items from warp-striped arrangement to blocked arrangement. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the conversion from a "warp-striped" to a "blocked" arrangement + * of 512 integer items partitioned across 128 threads where each thread owns 4 items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) + * { + * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockExchange BlockExchange; + * + * // Allocate shared memory for BlockExchange + * __shared__ typename BlockExchange::TempStorage temp_storage; + * + * // Load a tile of ordered data into a warp-striped arrangement across warp threads + * int thread_data[4]; + * cub::LoadSWarptriped(threadIdx.x, d_data, thread_data); + * + * // Collectively exchange data into a blocked arrangement across threads + * BlockExchange(temp_storage).WarpStripedToBlocked(thread_data); + * + * \endcode + * \par + * Suppose the set of warp-striped input \p thread_data across the block of threads is + * { [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] } + * after loading from global memory. (The first 128 items are striped across + * the first warp of 32 threads, the second 128 items are striped across the second warp, etc.) + * The corresponding output \p thread_data in those threads will be + * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. + * + */ + __device__ __forceinline__ void WarpStripedToBlocked( + T items[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between warp-striped and blocked arrangements. + { + WarpStripedToBlocked(items, Int2Type()); + } + + /** + * \brief Transposes data items from blocked arrangement to warp-striped arrangement. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the conversion from a "blocked" to a "warp-striped" arrangement + * of 512 integer items partitioned across 128 threads where each thread owns 4 items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) + * { + * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockExchange BlockExchange; + * + * // Allocate shared memory for BlockExchange + * __shared__ typename BlockExchange::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively exchange data into a warp-striped arrangement across threads + * BlockExchange(temp_storage).BlockedToWarpStriped(thread_data); + * + * // Store data striped across warp threads into an ordered tile + * cub::StoreDirectStriped(threadIdx.x, d_data, thread_data); + * + * \endcode + * \par + * Suppose the set of blocked input \p thread_data across the block of threads is + * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. + * The corresponding output \p thread_data in those threads will be + * { [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] } + * in preparation for storing to global memory. (The first 128 items are striped across + * the first warp of 32 threads, the second 128 items are striped across the second warp, etc.) 
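+ *
+ * \par
+ * (Editorial note) Because each warp exchanges only its own items through
+ * shared memory, the non-time-sliced specialization of this transpose issues
+ * no block-wide __syncthreads(), relying instead on the implicit
+ * warp-synchronous execution of the targeted architectures; this is what
+ * makes a warp-striped staging layout cheaper than a fully striped one when
+ * it is only needed for coalescing global accesses.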
+ * + */ + __device__ __forceinline__ void BlockedToWarpStriped( + T items[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between blocked and warp-striped arrangements. + { + BlockedToWarpStriped(items, Int2Type()); + } + + + //@} end member group + /******************************************************************//** + * \name Scatter exchanges + *********************************************************************/ + //@{ + + + /** + * \brief Exchanges data items annotated by rank into blocked arrangement. + * + * \par + * - \smemreuse + * + * \tparam Offset [inferred] Signed integer type for local offsets + */ + template + __device__ __forceinline__ void ScatterToBlocked( + T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange + Offset ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + { + ScatterToBlocked(items, ranks, Int2Type()); + } + + + /** + * \brief Exchanges data items annotated by rank into striped arrangement. + * + * \par + * - \smemreuse + * + * \tparam Offset [inferred] Signed integer type for local offsets + */ + template + __device__ __forceinline__ void ScatterToStriped( + T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange + Offset ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + { + ScatterToStriped(items, ranks, Int2Type()); + } + + + /** + * \brief Exchanges data items annotated by rank into striped arrangement. Items with rank -1 are not exchanged. + * + * \par + * - \smemreuse + * + * \tparam Offset [inferred] Signed integer type for local offsets + */ + template + __device__ __forceinline__ void ScatterToStripedGuarded( + T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange + Offset ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ranks[ITEM]; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + if (ranks[ITEM] >= 0) + temp_storage[item_offset] = items[ITEM]; + } + + __syncthreads(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + items[ITEM] = temp_storage[item_offset]; + } + } + + /** + * \brief Exchanges valid data items annotated by rank into striped arrangement. 
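+ *
+ * \par
+ * (Editorial sketch) The scatter ranks are typically an exclusive prefix sum
+ * of the validity flags, e.g. computed with cub::BlockScan, so that valid
+ * items are compacted into the low ranks of the tile; \p scan_storage and
+ * \p exchange_storage below are assumed allocations:
+ * \code
+ * int flags[4];   // 1 where the item should be kept, 0 elsewhere
+ * int ranks[4];   // scatter destinations derived from the flags
+ * cub::BlockScan<int, 128>(scan_storage).ExclusiveSum(flags, ranks);
+ * BlockExchange(exchange_storage).ScatterToStriped(items, ranks, flags);
+ * \endcode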
+ * + * \par + * - \smemreuse + * + * \tparam Offset [inferred] Signed integer type for local offsets + * \tparam ValidFlag [inferred] Flag type denoting which items are valid + */ + template + __device__ __forceinline__ void ScatterToStriped( + T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange + Offset ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks + ValidFlag is_valid[ITEMS_PER_THREAD]) ///< [in] Corresponding flag denoting item validity + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ranks[ITEM]; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + if (is_valid[ITEM]) + temp_storage[item_offset] = items[ITEM]; + } + + __syncthreads(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + items[ITEM] = temp_storage[item_offset]; + } + } + + //@} end member group + + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/SRC/cub/block/block_histogram.cuh b/SRC/cub/block/block_histogram.cuh new file mode 100644 index 00000000..1ec78388 --- /dev/null +++ b/SRC/cub/block/block_histogram.cuh @@ -0,0 +1,415 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockHistogram class provides [collective](index.html#sec0) methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 
+ */ + +#pragma once + +#include "specializations/block_histogram_sort.cuh" +#include "specializations/block_histogram_atomic.cuh" +#include "../util_ptx.cuh" +#include "../util_arch.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Algorithmic variants + ******************************************************************************/ + +/** + * \brief BlockHistogramAlgorithm enumerates alternative algorithms for the parallel construction of block-wide histograms. + */ +enum BlockHistogramAlgorithm +{ + + /** + * \par Overview + * Sorting followed by differentiation. Execution is comprised of two phases: + * -# Sort the data using efficient radix sort + * -# Look for "runs" of same-valued keys by detecting discontinuities; the run-lengths are histogram bin counts. + * + * \par Performance Considerations + * Delivers consistent throughput regardless of sample bin distribution. + */ + BLOCK_HISTO_SORT, + + + /** + * \par Overview + * Use atomic addition to update byte counts directly + * + * \par Performance Considerations + * Performance is strongly tied to the hardware implementation of atomic + * addition, and may be significantly degraded for non uniformly-random + * input distributions where many concurrent updates are likely to be + * made to the same bin counter. + */ + BLOCK_HISTO_ATOMIC, +}; + + + +/****************************************************************************** + * Block histogram + ******************************************************************************/ + + +/** + * \brief The BlockHistogram class provides [collective](index.html#sec0) methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. ![](histogram_logo.png) + * \ingroup BlockModule + * + * \tparam T The sample type being histogrammed (must be castable to an integer bin identifier) + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam ITEMS_PER_THREAD The number of items per thread + * \tparam BINS The number bins within the histogram + * \tparam ALGORITHM [optional] cub::BlockHistogramAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_HISTO_SORT) + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - A histogram + * counts the number of observations that fall into each of the disjoint categories (known as bins). + * - BlockHistogram can be optionally specialized to use different algorithms: + * -# cub::BLOCK_HISTO_SORT. Sorting followed by differentiation. [More...](\ref cub::BlockHistogramAlgorithm) + * -# cub::BLOCK_HISTO_ATOMIC. Use atomic addition to update byte counts directly. [More...](\ref cub::BlockHistogramAlgorithm) + * + * \par Performance Considerations + * - \granularity + * + * \par A Simple Example + * \blockcollective{BlockHistogram} + * \par + * The code snippet below illustrates a 256-bin histogram of 512 integer samples that + * are partitioned across 128 threads where each thread owns 4 samples. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) 
+ * {
+ * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each
+ * typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
+ *
+ * // Allocate shared memory for BlockHistogram
+ * __shared__ typename BlockHistogram::TempStorage temp_storage;
+ *
+ * // Allocate shared memory for block-wide histogram bin counts
+ * __shared__ unsigned int smem_histogram[256];
+ *
+ * // Obtain input samples per thread
+ * unsigned char data[4];
+ * ...
+ *
+ * // Compute the block-wide histogram
+ * BlockHistogram(temp_storage).Histogram(data, smem_histogram);
+ *
+ * \endcode
+ *
+ * \par Performance and Usage Considerations
+ * - The histogram output can be constructed in shared or global memory
+ * - See cub::BlockHistogramAlgorithm for performance details regarding algorithmic alternatives
+ *
+ */
+template <
+ typename T,
+ int BLOCK_DIM_X,
+ int ITEMS_PER_THREAD,
+ int BINS,
+ BlockHistogramAlgorithm ALGORITHM = BLOCK_HISTO_SORT,
+ int BLOCK_DIM_Y = 1,
+ int BLOCK_DIM_Z = 1,
+ int PTX_ARCH = CUB_PTX_ARCH>
+class BlockHistogram
+{
+private:
+
+ /******************************************************************************
+ * Constants and type definitions
+ ******************************************************************************/
+
+ /// Constants
+ enum
+ {
+ /// The thread block size in threads
+ BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+ };
+
+ /**
+ * Ensure the template parameterization meets the requirements of the
+ * targeted device architecture. BLOCK_HISTO_ATOMIC can only be used
+ * on version SM120 or later. Otherwise BLOCK_HISTO_SORT is used
+ * regardless.
+ */
+ static const BlockHistogramAlgorithm SAFE_ALGORITHM =
+ ((ALGORITHM == BLOCK_HISTO_ATOMIC) && (PTX_ARCH < 120)) ?
+ BLOCK_HISTO_SORT :
+ ALGORITHM;
+
+ /// Internal specialization.
+ typedef typename If<(SAFE_ALGORITHM == BLOCK_HISTO_SORT),
+ BlockHistogramSort<T, BLOCK_DIM_X, ITEMS_PER_THREAD, BINS, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH>,
+ BlockHistogramAtomic<BINS> >::Type InternalBlockHistogram;
+
+ /// Shared memory storage layout type for BlockHistogram
+ typedef typename InternalBlockHistogram::TempStorage _TempStorage;
+
+
+ /******************************************************************************
+ * Thread fields
+ ******************************************************************************/
+
+ /// Shared storage reference
+ _TempStorage &temp_storage;
+
+ /// Linear thread-id
+ int linear_tid;
+
+
+ /******************************************************************************
+ * Utility methods
+ ******************************************************************************/
+
+ /// Internal storage allocator
+ __device__ __forceinline__ _TempStorage& PrivateStorage()
+ {
+ __shared__ _TempStorage private_storage;
+ return private_storage;
+ }
+
+
+public:
+
+ /// \smemstorage{BlockHistogram}
+ struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+ /******************************************************************//**
+ * \name Collective constructors
+ *********************************************************************/
+ //@{
+
+ /**
+ * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
+ */
+ __device__ __forceinline__ BlockHistogram()
+ :
+ temp_storage(PrivateStorage()),
+ linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+ {}
+
+
+ /**
+ * \brief Collective constructor using the specified memory allocation as temporary storage.
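+ *
+ * \par
+ * Because the temporary storage is only live during the collective call, an
+ * externally declared allocation can be unioned with other shared buffers.
+ * One possible pattern (a sketch; the type and member names are hypothetical):
+ * \code
+ * __shared__ union
+ * {
+ *     typename MyBlockHistogram::TempStorage histogram;
+ *     unsigned int                           other_buffer[256];
+ * } shared_storage;
+ *
+ * MyBlockHistogram(shared_storage.histogram).Histogram(data, smem_histogram);
+ * \endcode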
+ */
+ __device__ __forceinline__ BlockHistogram(
+ TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage
+ :
+ temp_storage(temp_storage.Alias()),
+ linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+ {}
+
+
+ //@} end member group
+ /******************************************************************//**
+ * \name Histogram operations
+ *********************************************************************/
+ //@{
+
+
+ /**
+ * \brief Initialize the shared histogram counters to zero.
+ *
+ * \par Snippet
+ * The code snippet below illustrates the initialization and update of a
+ * histogram of 512 integer samples that are partitioned across 128 threads
+ * where each thread owns 4 samples.
+ * \par
+ * \code
+ * #include <cub/cub.cuh> // or equivalently <cub/block/block_histogram.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each
+ * typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
+ *
+ * // Allocate shared memory for BlockHistogram
+ * __shared__ typename BlockHistogram::TempStorage temp_storage;
+ *
+ * // Allocate shared memory for block-wide histogram bin counts
+ * __shared__ unsigned int smem_histogram[256];
+ *
+ * // Obtain input samples per thread
+ * unsigned char thread_samples[4];
+ * ...
+ *
+ * // Initialize the block-wide histogram
+ * BlockHistogram(temp_storage).InitHistogram(smem_histogram);
+ *
+ * // Update the block-wide histogram
+ * BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram);
+ *
+ * \endcode
+ *
+ * \tparam HistoCounter [inferred] Histogram counter type
+ */
+ template <typename HistoCounter>
+ __device__ __forceinline__ void InitHistogram(HistoCounter histogram[BINS])
+ {
+ // Initialize histogram bin counts to zeros
+ int histo_offset = 0;
+
+ #pragma unroll
+ for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS)
+ {
+ histogram[histo_offset + linear_tid] = 0;
+ }
+ // Finish up with guarded initialization if necessary
+ if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS))
+ {
+ histogram[histo_offset + linear_tid] = 0;
+ }
+ }
+
+
+ /**
+ * \brief Constructs a block-wide histogram in shared/global memory. Each thread contributes an array of input elements.
+ *
+ * \par
+ * - \granularity
+ * - \smemreuse
+ *
+ * \par Snippet
+ * The code snippet below illustrates a 256-bin histogram of 512 integer samples that
+ * are partitioned across 128 threads where each thread owns 4 samples.
+ * \par
+ * \code
+ * #include <cub/cub.cuh> // or equivalently <cub/block/block_histogram.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each
+ * typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
+ *
+ * // Allocate shared memory for BlockHistogram
+ * __shared__ typename BlockHistogram::TempStorage temp_storage;
+ *
+ * // Allocate shared memory for block-wide histogram bin counts
+ * __shared__ unsigned int smem_histogram[256];
+ *
+ * // Obtain input samples per thread
+ * unsigned char thread_samples[4];
+ * ...
+ *
+ * // Compute the block-wide histogram
+ * BlockHistogram(temp_storage).Histogram(thread_samples, smem_histogram);
+ *
+ * \endcode
+ *
+ * \tparam HistoCounter [inferred] Histogram counter type
+ */
+ template <
+ typename HistoCounter>
+ __device__ __forceinline__ void Histogram(
+ T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram
+ HistoCounter histogram[BINS]) ///< [out] Reference to shared/global memory histogram
+ {
+ // Initialize histogram bin counts to zeros
+ InitHistogram(histogram);
+
+ __syncthreads();
+
+ // Composite the histogram
+ InternalBlockHistogram(temp_storage).Composite(items, histogram);
+ }
+
+
+
+ /**
+ * \brief Updates an existing block-wide histogram in shared/global memory. Each thread composites an array of input elements.
+ *
+ * \par
+ * - \granularity
+ * - \smemreuse
+ *
+ * \par Snippet
+ * The code snippet below illustrates the initialization and update of a
+ * histogram of 512 integer samples that are partitioned across 128 threads
+ * where each thread owns 4 samples.
+ * \par
+ * \code
+ * #include <cub/cub.cuh> // or equivalently <cub/block/block_histogram.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each
+ * typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
+ *
+ * // Allocate shared memory for BlockHistogram
+ * __shared__ typename BlockHistogram::TempStorage temp_storage;
+ *
+ * // Allocate shared memory for block-wide histogram bin counts
+ * __shared__ unsigned int smem_histogram[256];
+ *
+ * // Obtain input samples per thread
+ * unsigned char thread_samples[4];
+ * ...
+ *
+ * // Initialize the block-wide histogram
+ * BlockHistogram(temp_storage).InitHistogram(smem_histogram);
+ *
+ * // Update the block-wide histogram
+ * BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram);
+ *
+ * \endcode
+ *
+ * \tparam HistoCounter [inferred] Histogram counter type
+ */
+ template <
+ typename HistoCounter>
+ __device__ __forceinline__ void Composite(
+ T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram
+ HistoCounter histogram[BINS]) ///< [out] Reference to shared/global memory histogram
+ {
+ InternalBlockHistogram(temp_storage).Composite(items, histogram);
+ }
+
+};
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
+
diff --git a/SRC/cub/block/block_load.cuh b/SRC/cub/block/block_load.cuh
new file mode 100644
index 00000000..afa8ff7c
--- /dev/null
+++ b/SRC/cub/block/block_load.cuh
@@ -0,0 +1,1086 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Operations for reading linear tiles of data into the CUDA thread block.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "block_exchange.cuh"
+#include "../util_ptx.cuh"
+#include "../util_macro.cuh"
+#include "../util_type.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \addtogroup UtilIo
+ * @{
+ */
+
+
+/******************************************************************//**
+ * \name Blocked arrangement I/O (direct)
+ *********************************************************************/
+//@{
+
+
+/**
+ * \brief Load a linear segment of items into a blocked arrangement across the thread block.
+ *
+ * \blocked
+ *
+ * \tparam T [inferred] The data type to load.
+ * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
+ * \tparam InputIterator [inferred] The random-access iterator type for input \iterator.
+ */
+template <
+ typename T,
+ int ITEMS_PER_THREAD,
+ typename InputIterator>
+__device__ __forceinline__ void LoadDirectBlocked(
+ int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
+ InputIterator block_itr, ///< [in] The thread block's base input iterator for loading from
+ T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+{
+ // Load directly in thread-blocked order
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+ {
+ items[ITEM] = block_itr[(linear_tid * ITEMS_PER_THREAD) + ITEM];
+ }
+}
+
+
+/**
+ * \brief Load a linear segment of items into a blocked arrangement across the thread block, guarded by range.
+ *
+ * \blocked
+ *
+ * \tparam T [inferred] The data type to load.
+ * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
+ * \tparam InputIterator [inferred] The random-access iterator type for input \iterator.
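+ *
+ * \par
+ * For instance, a final partial tile can be guarded as follows (a sketch;
+ * \p d_in, \p block_offset, and \p num_items are placeholder names):
+ * \code
+ * int items[4];
+ * cub::LoadDirectBlocked(threadIdx.x, d_in + block_offset, items, num_items - block_offset);
+ * \endcode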
+ */
+template <
+ typename T,
+ int ITEMS_PER_THREAD,
+ typename InputIterator>
+__device__ __forceinline__ void LoadDirectBlocked(
+ int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
+ InputIterator block_itr, ///< [in] The thread block's base input iterator for loading from
+ T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+ int valid_items) ///< [in] Number of valid items to load
+{
+ int bounds = valid_items - (linear_tid * ITEMS_PER_THREAD);
+
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+ {
+ if (ITEM < bounds)
+ {
+ items[ITEM] = block_itr[(linear_tid * ITEMS_PER_THREAD) + ITEM];
+ }
+ }
+}
+
+
+/**
+ * \brief Load a linear segment of items into a blocked arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.
+ *
+ * \blocked
+ *
+ * \tparam T [inferred] The data type to load.
+ * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
+ * \tparam InputIterator [inferred] The random-access iterator type for input \iterator.
+ */
+template <
+ typename T,
+ int ITEMS_PER_THREAD,
+ typename InputIterator>
+__device__ __forceinline__ void LoadDirectBlocked(
+ int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
+ InputIterator block_itr, ///< [in] The thread block's base input iterator for loading from
+ T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+ int valid_items, ///< [in] Number of valid items to load
+ T oob_default) ///< [in] Default value to assign out-of-bound items
+{
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+ {
+ items[ITEM] = oob_default;
+ }
+
+ LoadDirectBlocked(linear_tid, block_itr, items, valid_items);
+}
+
+
+/**
+ * \brief Load a linear segment of items into a blocked arrangement across the thread block.
+ *
+ * \blocked
+ *
+ * The input offset (\p block_ptr + \p block_offset) must be quad-item aligned.
+ *
+ * The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT:
+ * - \p ITEMS_PER_THREAD is odd
+ * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.)
+ *
+ * \tparam T [inferred] The data type to load.
+ * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
+ */
+template <
+ typename T,
+ int ITEMS_PER_THREAD>
+__device__ __forceinline__ void LoadDirectBlockedVectorized(
+ int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
+ T *block_ptr, ///< [in] Input pointer for loading from
+ T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+{
+ enum
+ {
+ // Maximum CUDA vector size is 4 elements
+ MAX_VEC_SIZE = CUB_MIN(4, ITEMS_PER_THREAD),
+
+ // Vector size must be a power of two and an even divisor of the items per thread
+ VEC_SIZE = ((((MAX_VEC_SIZE - 1) & MAX_VEC_SIZE) == 0) && ((ITEMS_PER_THREAD % MAX_VEC_SIZE) == 0)) ?
+ MAX_VEC_SIZE :
+ 1,
+
+ VECTORS_PER_THREAD = ITEMS_PER_THREAD / VEC_SIZE,
+ };
+
+ // Vector type
+ typedef typename CubVector<T, VEC_SIZE>::Type Vector;
+
+ // Vector items
+ Vector vec_items[VECTORS_PER_THREAD];
+
+ // Aliased input ptr
+ Vector *ptr = reinterpret_cast<Vector*>(block_ptr + (linear_tid * VEC_SIZE * VECTORS_PER_THREAD));
+
+ // Load directly in thread-blocked order
+ #pragma unroll
+ for (int ITEM = 0; ITEM < VECTORS_PER_THREAD; ITEM++)
+ {
+ vec_items[ITEM] = ptr[ITEM];
+ }
+
+ // Copy
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+ {
+ items[ITEM] = reinterpret_cast<T*>(vec_items)[ITEM];
+ }
+}
+
+
+
+//@} end member group
+/******************************************************************//**
+ * \name Striped arrangement I/O (direct)
+ *********************************************************************/
+//@{
+
+
+/**
+ * \brief Load a linear segment of items into a striped arrangement across the thread block.
+ *
+ * \striped
+ *
+ * \tparam BLOCK_THREADS The thread block size in threads
+ * \tparam T [inferred] The data type to load.
+ * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
+ * \tparam InputIterator [inferred] The random-access iterator type for input \iterator.
+ */
+template <
+ int BLOCK_THREADS,
+ typename T,
+ int ITEMS_PER_THREAD,
+ typename InputIterator>
+__device__ __forceinline__ void LoadDirectStriped(
+ int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
+ InputIterator block_itr, ///< [in] The thread block's base input iterator for loading from
+ T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+{
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+ {
+ items[ITEM] = block_itr[(ITEM * BLOCK_THREADS) + linear_tid];
+ }
+}
+
+
+/**
+ * \brief Load a linear segment of items into a striped arrangement across the thread block, guarded by range
+ *
+ * \striped
+ *
+ * \tparam BLOCK_THREADS The thread block size in threads
+ * \tparam T [inferred] The data type to load.
+ * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
+ * \tparam InputIterator [inferred] The random-access iterator type for input \iterator.
+ */
+template <
+ int BLOCK_THREADS,
+ typename T,
+ int ITEMS_PER_THREAD,
+ typename InputIterator>
+__device__ __forceinline__ void LoadDirectStriped(
+ int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
+ InputIterator block_itr, ///< [in] The thread block's base input iterator for loading from
+ T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+ int valid_items) ///< [in] Number of valid items to load
+{
+ int bounds = valid_items - linear_tid;
+
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+ {
+ if (ITEM * BLOCK_THREADS < bounds)
+ {
+ items[ITEM] = block_itr[linear_tid + (ITEM * BLOCK_THREADS)];
+ }
+ }
+}
+
+
+/**
+ * \brief Load a linear segment of items into a striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.
+ *
+ * \striped
+ *
+ * \tparam BLOCK_THREADS The thread block size in threads
+ * \tparam T [inferred] The data type to load.
+ * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
+ * \tparam InputIterator [inferred] The random-access iterator type for input \iterator.
+ */
+template <
+ int BLOCK_THREADS,
+ typename T,
+ int ITEMS_PER_THREAD,
+ typename InputIterator>
+__device__ __forceinline__ void LoadDirectStriped(
+ int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
+ InputIterator block_itr, ///< [in] The thread block's base input iterator for loading from
+ T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+ int valid_items, ///< [in] Number of valid items to load
+ T oob_default) ///< [in] Default value to assign out-of-bound items
+{
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+ {
+ items[ITEM] = oob_default;
+ }
+
+ LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items);
+}
+
+
+
+//@} end member group
+/******************************************************************//**
+ * \name Warp-striped arrangement I/O (direct)
+ *********************************************************************/
+//@{
+
+
+/**
+ * \brief Load a linear segment of items into a warp-striped arrangement across the thread block.
+ *
+ * \warpstriped
+ *
+ * \par Usage Considerations
+ * The number of threads in the thread block must be a multiple of the architecture's warp size.
+ *
+ * \tparam T [inferred] The data type to load.
+ * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
+ * \tparam InputIterator [inferred] The random-access iterator type for input \iterator.
+ */
+template <
+ typename T,
+ int ITEMS_PER_THREAD,
+ typename InputIterator>
+__device__ __forceinline__ void LoadDirectWarpStriped(
+ int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
+ InputIterator block_itr, ///< [in] The thread block's base input iterator for loading from
+ T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+{
+ int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1);
+ int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
+ int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;
+
+ // Load directly in warp-striped order
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+ {
+ items[ITEM] = block_itr[warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)];
+ }
+}
+
+
+/**
+ * \brief Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range
+ *
+ * \warpstriped
+ *
+ * \par Usage Considerations
+ * The number of threads in the thread block must be a multiple of the architecture's warp size.
+ *
+ * \tparam T [inferred] The data type to load.
+ * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
+ * \tparam InputIterator [inferred] The random-access iterator type for input \iterator.
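+ *
+ * \par
+ * As a sketch (placeholder names), the guarded variant follows the same calling
+ * convention as the unguarded one, with a trailing element count:
+ * \code
+ * int items[4];
+ * cub::LoadDirectWarpStriped(threadIdx.x, d_in + block_offset, items, num_items - block_offset);
+ * \endcode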
+ */
+template <
+ typename T,
+ int ITEMS_PER_THREAD,
+ typename InputIterator>
+__device__ __forceinline__ void LoadDirectWarpStriped(
+ int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
+ InputIterator block_itr, ///< [in] The thread block's base input iterator for loading from
+ T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+ int valid_items) ///< [in] Number of valid items to load
+{
+ int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1);
+ int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
+ int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;
+ int bounds = valid_items - warp_offset - tid;
+
+ // Load directly in warp-striped order
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+ {
+ if ((ITEM * CUB_PTX_WARP_THREADS) < bounds)
+ {
+ items[ITEM] = block_itr[warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)];
+ }
+ }
+}
+
+
+/**
+ * \brief Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.
+ *
+ * \warpstriped
+ *
+ * \par Usage Considerations
+ * The number of threads in the thread block must be a multiple of the architecture's warp size.
+ *
+ * \tparam T [inferred] The data type to load.
+ * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
+ * \tparam InputIterator [inferred] The random-access iterator type for input \iterator.
+ */
+template <
+ typename T,
+ int ITEMS_PER_THREAD,
+ typename InputIterator>
+__device__ __forceinline__ void LoadDirectWarpStriped(
+ int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + threadIdx.x for 2D thread blocks)
+ InputIterator block_itr, ///< [in] The thread block's base input iterator for loading from
+ T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+ int valid_items, ///< [in] Number of valid items to load
+ T oob_default) ///< [in] Default value to assign out-of-bound items
+{
+ #pragma unroll
+ for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+ {
+ items[ITEM] = oob_default;
+ }
+
+ LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items);
+}
+
+
+//@} end member group
+
+/** @} */ // end group UtilIo
+
+
+
+//-----------------------------------------------------------------------------
+// Generic BlockLoad abstraction
+//-----------------------------------------------------------------------------
+
+/**
+ * \brief cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data from memory into a blocked arrangement across a CUDA thread block.
+ */
+enum BlockLoadAlgorithm
+{
+ /**
+ * \par Overview
+ *
+ * A [blocked arrangement](index.html#sec5sec3) of data is read
+ * directly from memory. The thread block reads items in a parallel "raking" fashion: thread i
+ * reads the i-th segment of consecutive elements.
+ *
+ * \par Performance Considerations
+ * - The utilization of memory transactions (coalescing) decreases as the
+ * access stride between threads increases (i.e., the number of items per thread).
+ */
+ BLOCK_LOAD_DIRECT,
+
+ /**
+ * \par Overview
+ *
+ * A [blocked arrangement](index.html#sec5sec3) of data is read directly
+ * from memory using CUDA's built-in vectorized loads as a coalescing optimization.
+ * The thread block reads items in a parallel "raking" fashion: thread i uses vector loads to
+ * read the i-th segment of consecutive elements.
+ *
+ * For example, ld.global.v4.s32 instructions will be generated when \p T = \p int and \p ITEMS_PER_THREAD is a multiple of 4.
+ *
+ * \par Performance Considerations
+ * - The utilization of memory transactions (coalescing) remains high until the
+ * access stride between threads (i.e., the number of items per thread) exceeds the
+ * maximum vector load width (typically 4 items or 64B, whichever is lower).
+ * - The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT:
+ * - \p ITEMS_PER_THREAD is odd
+ * - The \p InputIterator is not a simple pointer type
+ * - The block input offset is not quadword-aligned
+ * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.)
+ */
+ BLOCK_LOAD_VECTORIZE,
+
+ /**
+ * \par Overview
+ *
+ * A [striped arrangement](index.html#sec5sec3) of data is read
+ * directly from memory and then is locally transposed into a
+ * [blocked arrangement](index.html#sec5sec3). The thread block
+ * reads items in a parallel "strip-mining" fashion:
+ * thread i reads items having stride \p BLOCK_THREADS
+ * between them. cub::BlockExchange is then used to locally reorder the items
+ * into a [blocked arrangement](index.html#sec5sec3).
+ *
+ * \par Performance Considerations
+ * - The utilization of memory transactions (coalescing) remains high regardless
+ * of items loaded per thread.
+ * - The local reordering incurs slightly longer latencies and throughput than the
+ * direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives.
+ */
+ BLOCK_LOAD_TRANSPOSE,
+
+
+ /**
+ * \par Overview
+ *
+ * A [warp-striped arrangement](index.html#sec5sec3) of data is read
+ * directly from memory and then is locally transposed into a
+ * [blocked arrangement](index.html#sec5sec3). Each warp reads its own
+ * contiguous segment in a parallel "strip-mining" fashion: lane i
+ * reads items having stride \p WARP_THREADS between them. cub::BlockExchange
+ * is then used to locally reorder the items into a
+ * [blocked arrangement](index.html#sec5sec3).
+ *
+ * \par Usage Considerations
+ * - BLOCK_THREADS must be a multiple of WARP_THREADS
+ *
+ * \par Performance Considerations
+ * - The utilization of memory transactions (coalescing) remains high regardless
+ * of items loaded per thread.
+ * - The local reordering incurs slightly longer latencies and throughput than the
+ * direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives.
+ */
+ BLOCK_LOAD_WARP_TRANSPOSE,
+};
+
+
+/**
+ * \brief The BlockLoad class provides [collective](index.html#sec0) data movement methods for loading a linear segment of items from memory into a [blocked arrangement](index.html#sec5sec3) across a CUDA thread block. ![](block_load_logo.png)
+ * \ingroup BlockModule
+ * \ingroup UtilIo
+ *
+ * \tparam InputIterator The input iterator type \iterator.
+ * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension
+ * \tparam ITEMS_PER_THREAD The number of consecutive items partitioned onto each thread.
+ * \tparam ALGORITHM [optional] cub::BlockLoadAlgorithm tuning policy. default: cub::BLOCK_LOAD_DIRECT.
+ * \tparam WARP_TIME_SLICING [optional] Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage). (default: false)
+ * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1)
+ * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1)
+ * \tparam PTX_ARCH [optional] \ptxversion
+ *
+ * \par Overview
+ * - The BlockLoad class provides a single data movement abstraction that can be specialized
+ * to implement different cub::BlockLoadAlgorithm strategies. This facilitates different
+ * performance policies for different architectures, data types, granularity sizes, etc.
+ * - BlockLoad can be optionally specialized by different data movement strategies:
+ * -# cub::BLOCK_LOAD_DIRECT. A [blocked arrangement](index.html#sec5sec3)
+ * of data is read directly from memory. [More...](\ref cub::BlockLoadAlgorithm)
+ * -# cub::BLOCK_LOAD_VECTORIZE. A [blocked arrangement](index.html#sec5sec3)
+ * of data is read directly from memory using CUDA's built-in vectorized loads as a
+ * coalescing optimization. [More...](\ref cub::BlockLoadAlgorithm)
+ * -# cub::BLOCK_LOAD_TRANSPOSE. A [striped arrangement](index.html#sec5sec3)
+ * of data is read directly from memory and is then locally transposed into a
+ * [blocked arrangement](index.html#sec5sec3). [More...](\ref cub::BlockLoadAlgorithm)
+ * -# cub::BLOCK_LOAD_WARP_TRANSPOSE. A [warp-striped arrangement](index.html#sec5sec3)
+ * of data is read directly from memory and is then locally transposed into a
+ * [blocked arrangement](index.html#sec5sec3). [More...](\ref cub::BlockLoadAlgorithm)
+ * - \rowmajor
+ *
+ * \par A Simple Example
+ * \blockcollective{BlockLoad}
+ * \par
+ * The code snippet below illustrates the loading of a linear
+ * segment of 512 integers into a "blocked" arrangement across 128 threads where each
+ * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE,
+ * meaning memory references are efficiently coalesced using a warp-striped access
+ * pattern (after which items are locally reordered among threads).
+ * \par
+ * \code
+ * #include <cub/cub.cuh> // or equivalently <cub/block/block_load.cuh>
+ *
+ * __global__ void ExampleKernel(int *d_data, ...)
+ * {
+ * // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
+ * typedef cub::BlockLoad<int*, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
+ *
+ * // Allocate shared memory for BlockLoad
+ * __shared__ typename BlockLoad::TempStorage temp_storage;
+ *
+ * // Load a segment of consecutive items that are blocked across threads
+ * int thread_data[4];
+ * BlockLoad(temp_storage).Load(d_data, thread_data);
+ *
+ * \endcode
+ * \par
+ * Suppose the input \p d_data is 0, 1, 2, 3, 4, 5, ....
+ * The set of \p thread_data across the block of threads will then be
+ * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }.
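+ *
+ * \par
+ * A common follow-on pattern (a sketch; \p d_data and \p num_items are
+ * placeholders) is a grid-stride loop over tiles that uses the guarded,
+ * defaulting overload for the final partial tile:
+ * \code
+ * for (int block_offset = blockIdx.x * 512;
+ *      block_offset < num_items;
+ *      block_offset += gridDim.x * 512)
+ * {
+ *     int thread_data[4];
+ *     BlockLoad(temp_storage).Load(d_data + block_offset, thread_data, num_items - block_offset, -1);
+ *     __syncthreads();    // needed if temp_storage is reused across iterations
+ * }
+ * \endcode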
+ *
+ */
+template <
+ typename InputIterator,
+ int BLOCK_DIM_X,
+ int ITEMS_PER_THREAD,
+ BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT,
+ bool WARP_TIME_SLICING = false,
+ int BLOCK_DIM_Y = 1,
+ int BLOCK_DIM_Z = 1,
+ int PTX_ARCH = CUB_PTX_ARCH>
+class BlockLoad
+{
+private:
+
+ /******************************************************************************
+ * Constants and type definitions
+ ******************************************************************************/
+
+ /// Constants
+ enum
+ {
+ /// The thread block size in threads
+ BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+ };
+
+ // Data type of input iterator
+ typedef typename std::iterator_traits<InputIterator>::value_type T;
+
+
+ /******************************************************************************
+ * Algorithmic variants
+ ******************************************************************************/
+
+ /// Load helper
+ template <BlockLoadAlgorithm _POLICY, int DUMMY>
+ struct LoadInternal;
+
+
+ /**
+ * BLOCK_LOAD_DIRECT specialization of load helper
+ */
+ template <int DUMMY>
+ struct LoadInternal<BLOCK_LOAD_DIRECT, DUMMY>
+ {
+ /// Shared memory storage layout type
+ typedef NullType TempStorage;
+
+ /// Linear thread-id
+ int linear_tid;
+
+ /// Constructor
+ __device__ __forceinline__ LoadInternal(
+ TempStorage &temp_storage,
+ int linear_tid)
+ :
+ linear_tid(linear_tid)
+ {}
+
+ /// Load a linear segment of items from memory
+ __device__ __forceinline__ void Load(
+ InputIterator block_itr, ///< [in] The thread block's base input iterator for loading from
+ T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+ {
+ LoadDirectBlocked(linear_tid, block_itr, items);
+ }
+
+ /// Load a linear segment of items from memory, guarded by range
+ __device__ __forceinline__ void Load(
+ InputIterator block_itr, ///< [in] The thread block's base input iterator for loading from
+ T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+ int valid_items) ///< [in] Number of valid items to load
+ {
+ LoadDirectBlocked(linear_tid, block_itr, items, valid_items);
+ }
+
+ /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
+ __device__ __forceinline__ void Load(
+ InputIterator block_itr, ///< [in] The thread block's base input iterator for loading from
+ T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+ int valid_items, ///< [in] Number of valid items to load
+ T oob_default) ///< [in] Default value to assign out-of-bound items
+ {
+ LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default);
+ }
+
+ };
+
+
+ /**
+ * BLOCK_LOAD_VECTORIZE specialization of load helper
+ */
+ template <int DUMMY>
+ struct LoadInternal<BLOCK_LOAD_VECTORIZE, DUMMY>
+ {
+ /// Shared memory storage layout type
+ typedef NullType TempStorage;
+
+ /// Linear thread-id
+ int linear_tid;
+
+ /// Constructor
+ __device__ __forceinline__ LoadInternal(
+ TempStorage &temp_storage,
+ int linear_tid)
+ :
+ linear_tid(linear_tid)
+ {}
+
+ /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization)
+ __device__ __forceinline__ void Load(
+ T *block_ptr, ///< [in] The thread block's base input iterator for loading from
+ T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+ {
+ LoadDirectBlockedVectorized(linear_tid, block_ptr, items);
+ }
+
+ /// Load a linear segment of items from memory, specialized for opaque input iterators (skips vectorization)
+ template <
+ typename T,
+ typename _InputIterator>
+ __device__ __forceinline__ void Load(
+ _InputIterator block_itr, ///< [in] The thread block's base input iterator for loading from
+ T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+ {
+ LoadDirectBlocked(linear_tid, block_itr, items);
+ }
+
+ /// Load a linear segment of items from memory, guarded by range (skips vectorization)
+ __device__ __forceinline__ void Load(
+ InputIterator block_itr, ///< [in] The thread block's base input iterator for loading from
+ T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+ int valid_items) ///< [in] Number of valid items to load
+ {
+ LoadDirectBlocked(linear_tid, block_itr, items, valid_items);
+ }
+
+ /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements (skips vectorization)
+ __device__ __forceinline__ void Load(
+ InputIterator block_itr, ///< [in] The thread block's base input iterator for loading from
+ T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+ int valid_items, ///< [in] Number of valid items to load
+ T oob_default) ///< [in] Default value to assign out-of-bound items
+ {
+ LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default);
+ }
+
+ };
+
+
+ /**
+ * BLOCK_LOAD_TRANSPOSE specialization of load helper
+ */
+ template <int DUMMY>
+ struct LoadInternal<BLOCK_LOAD_TRANSPOSE, DUMMY>
+ {
+ // BlockExchange utility type for keys
+ typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, WARP_TIME_SLICING, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
+
+ /// Shared memory storage layout type
+ typedef typename BlockExchange::TempStorage _TempStorage;
+
+ /// Alias wrapper allowing storage to be unioned
+ struct TempStorage : Uninitialized<_TempStorage> {};
+
+ /// Thread reference to shared storage
+ _TempStorage &temp_storage;
+
+ /// Linear thread-id
+ int linear_tid;
+
+ /// Constructor
+ __device__ __forceinline__ LoadInternal(
+ TempStorage &temp_storage,
+ int linear_tid)
+ :
+ temp_storage(temp_storage.Alias()),
+ linear_tid(linear_tid)
+ {}
+
+ /// Load a linear segment of items from memory
+ __device__ __forceinline__ void Load(
+ InputIterator block_itr, ///< [in] The thread block's base input iterator for loading from
+ T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+ {
+ LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items);
+ BlockExchange(temp_storage).StripedToBlocked(items);
+ }
+
+ /// Load a linear segment of items from memory, guarded by range
+ __device__ __forceinline__ void Load(
+ InputIterator block_itr, ///< [in] The thread block's base input iterator for loading from
+ T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+ int valid_items) ///< [in] Number of valid items to load
+ {
+ LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items);
+ BlockExchange(temp_storage).StripedToBlocked(items);
+ }
+
+ /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
+ __device__ __forceinline__ void Load(
+ InputIterator block_itr, ///< [in] The thread block's base input iterator for loading from
+ T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+ int valid_items, ///< [in] Number of valid items to load
+ T oob_default) ///< [in] Default value to assign out-of-bound items
+ {
+ LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items, oob_default);
+ BlockExchange(temp_storage).StripedToBlocked(items);
+ }
+
+ };
+
+
+ /**
+ * BLOCK_LOAD_WARP_TRANSPOSE specialization of load helper
+ */
+ template <int DUMMY>
+ struct LoadInternal<BLOCK_LOAD_WARP_TRANSPOSE, DUMMY>
+ {
+ enum
+ {
+ WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
+ };
+
+ // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
+ CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");
+
+ // BlockExchange utility type for keys
+ typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, WARP_TIME_SLICING, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
+
+ /// Shared memory storage layout type
+ typedef typename BlockExchange::TempStorage _TempStorage;
+
+ /// Alias wrapper allowing storage to be unioned
+ struct TempStorage : Uninitialized<_TempStorage> {};
+
+ /// Thread reference to shared storage
+ _TempStorage &temp_storage;
+
+ /// Linear thread-id
+ int linear_tid;
+
+ /// Constructor
+ __device__ __forceinline__ LoadInternal(
+ TempStorage &temp_storage,
+ int linear_tid)
+ :
+ temp_storage(temp_storage.Alias()),
+ linear_tid(linear_tid)
+ {}
+
+ /// Load a linear segment of items from memory
+ __device__ __forceinline__ void Load(
+ InputIterator block_itr, ///< [in] The thread block's base input iterator for loading from
+ T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+ {
+ LoadDirectWarpStriped(linear_tid, block_itr, items);
+ BlockExchange(temp_storage).WarpStripedToBlocked(items);
+ }
+
+ /// Load a linear segment of items from memory, guarded by range
+ __device__ __forceinline__ void Load(
+ InputIterator block_itr, ///< [in] The thread block's base input iterator for loading from
+ T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+ int valid_items) ///< [in] Number of valid items to load
+ {
+ LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items);
+ BlockExchange(temp_storage).WarpStripedToBlocked(items);
+ }
+
+
+ /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
+ __device__ __forceinline__ void Load(
+ InputIterator block_itr, ///< [in] The thread block's base input iterator for loading from
+ T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+ int valid_items, ///< [in] Number of valid items to load
+ T oob_default) ///< [in] Default value to assign out-of-bound items
+ {
+ LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items, oob_default);
+ BlockExchange(temp_storage).WarpStripedToBlocked(items);
+ }
+ };
+
+
+ /******************************************************************************
+ * Type definitions
+ ******************************************************************************/
+
+ /// Internal load implementation to use
+ typedef LoadInternal<ALGORITHM, 0> InternalLoad;
+
+
+ /// Shared memory storage layout type
+ typedef typename InternalLoad::TempStorage _TempStorage;
+
+
+ /******************************************************************************
+ * Utility methods
+ ******************************************************************************/
+
+ /// Internal storage allocator
+ __device__ __forceinline__ _TempStorage& PrivateStorage()
+ {
+ __shared__ _TempStorage private_storage;
+ return private_storage;
+ }
+
+
+ /******************************************************************************
+ * Thread fields
+ ******************************************************************************/
+
+ /// Thread reference to shared storage
+ _TempStorage &temp_storage;
+
+ /// Linear thread-id
+ int linear_tid;
+
+public:
+
+ /// \smemstorage{BlockLoad}
+ struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+ /******************************************************************//**
+ * \name Collective constructors
+ *********************************************************************/
+ //@{
+
+ /**
+ * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
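+ *
+ * \par
+ * With this constructor every BlockLoad instance statically allocates its own
+ * shared storage, so composing several collectives in one kernel is usually
+ * better served by the other constructor with a single, explicitly declared
+ * (and possibly unioned) \p TempStorage. A minimal sketch (placeholder names):
+ * \code
+ * int thread_data[4];
+ * cub::BlockLoad<int*, 128, 4>().Load(d_data, thread_data);
+ * \endcode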
+ */ + __device__ __forceinline__ BlockLoad() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. + */ + __device__ __forceinline__ BlockLoad( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + + + //@} end member group + /******************************************************************//** + * \name Data movement + *********************************************************************/ + //@{ + + + /** + * \brief Load a linear segment of items from memory. + * + * \par + * - \blocked + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the loading of a linear + * segment of 512 integers into a "blocked" arrangement across 128 threads where each + * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE, + * meaning memory references are efficiently coalesced using a warp-striped access + * pattern (after which items are locally reordered among threads). + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) + * { + * // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockLoad BlockLoad; + * + * // Allocate shared memory for BlockLoad + * __shared__ typename BlockLoad::TempStorage temp_storage; + * + * // Load a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * BlockLoad(temp_storage).Load(d_data, thread_data); + * + * \endcode + * \par + * Suppose the input \p d_data is 0, 1, 2, 3, 4, 5, .... + * The set of \p thread_data across the block of threads in those threads will be + * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. + * + */ + __device__ __forceinline__ void Load( + InputIterator block_itr, ///< [in] The thread block's base input iterator for loading from + T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + { + InternalLoad(temp_storage, linear_tid).Load(block_itr, items); + } + + + /** + * \brief Load a linear segment of items from memory, guarded by range. + * + * \par + * - \blocked + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the guarded loading of a linear + * segment of 512 integers into a "blocked" arrangement across 128 threads where each + * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE, + * meaning memory references are efficiently coalesced using a warp-striped access + * pattern (after which items are locally reordered among threads). + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, int valid_items, ...) + * { + * // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockLoad BlockLoad; + * + * // Allocate shared memory for BlockLoad + * __shared__ typename BlockLoad::TempStorage temp_storage; + * + * // Load a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * BlockLoad(temp_storage).Load(d_data, thread_data, valid_items); + * + * \endcode + * \par + * Suppose the input \p d_data is 0, 1, 2, 3, 4, 5, 6... and \p valid_items is \p 5. 
+ * The set of \p thread_data across the block of threads in those threads will be + * { [0,1,2,3], [4,?,?,?], ..., [?,?,?,?] }, with only the first two threads + * being unmasked to load portions of valid data (and other items remaining unassigned). + * + */ + __device__ __forceinline__ void Load( + InputIterator block_itr, ///< [in] The thread block's base input iterator for loading from + T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load + { + InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items); + } + + + /** + * \brief Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements + * + * \par + * - \blocked + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the guarded loading of a linear + * segment of 512 integers into a "blocked" arrangement across 128 threads where each + * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE, + * meaning memory references are efficiently coalesced using a warp-striped access + * pattern (after which items are locally reordered among threads). + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, int valid_items, ...) + * { + * // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockLoad BlockLoad; + * + * // Allocate shared memory for BlockLoad + * __shared__ typename BlockLoad::TempStorage temp_storage; + * + * // Load a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * BlockLoad(temp_storage).Load(d_data, thread_data, valid_items, -1); + * + * \endcode + * \par + * Suppose the input \p d_data is 0, 1, 2, 3, 4, 5, 6..., + * \p valid_items is \p 5, and the out-of-bounds default is \p -1. + * The set of \p thread_data across the block of threads in those threads will be + * { [0,1,2,3], [4,-1,-1,-1], ..., [-1,-1,-1,-1] }, with only the first two threads + * being unmasked to load portions of valid data (and other items are assigned \p -1) + * + */ + __device__ __forceinline__ void Load( + InputIterator block_itr, ///< [in] The thread block's base input iterator for loading from + T (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + T oob_default) ///< [in] Default value to assign out-of-bound items + { + InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items, oob_default); + } + + + //@} end member group + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/SRC/cub/block/block_radix_rank.cuh b/SRC/cub/block/block_radix_rank.cuh new file mode 100644 index 00000000..4b5a6a76 --- /dev/null +++ b/SRC/cub/block/block_radix_rank.cuh @@ -0,0 +1,485 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockRadixRank provides operations for ranking unsigned integer types within a CUDA threadblock
+ */
+
+#pragma once
+
+#include "../thread/thread_reduce.cuh"
+#include "../thread/thread_scan.cuh"
+#include "../block/block_scan.cuh"
+#include "../util_ptx.cuh"
+#include "../util_arch.cuh"
+#include "../util_type.cuh"
+#include "../util_namespace.cuh"
+
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief BlockRadixRank provides operations for ranking unsigned integer types within a CUDA threadblock.
+ * \ingroup BlockModule
+ *
+ * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension
+ * \tparam RADIX_BITS The number of radix bits per digit place
+ * \tparam DESCENDING Whether or not the sorted-order is high-to-low
+ * \tparam MEMOIZE_OUTER_SCAN [optional] Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise). See BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE for more details.
+ * \tparam INNER_SCAN_ALGORITHM [optional] The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS)
+ * \tparam SMEM_CONFIG [optional] Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte)
+ * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1)
+ * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1)
+ * \tparam PTX_ARCH [optional] \ptxversion
+ *
+ * \par Overview
+ * BlockRadixRank computes, for each key in a tile of keys partitioned across
+ * the thread block, the key's rank (output position) with respect to the
+ * digit at the current bit-place:
+ * - Keys must be in a form suitable for radix ranking (i.e., unsigned bits).
+ * - \blocked
+ *
+ * \par Performance Considerations
+ * - \granularity
+ *
+ * \par Examples
+ * \par
+ * - Example 1: Simple radix rank of 32-bit integer keys
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * template <int BLOCK_THREADS>
+ * __global__ void ExampleKernel(...)
+ * {
+ *
+ * \endcode
+ */
+template <
+ int BLOCK_DIM_X,
+ int RADIX_BITS,
+ bool DESCENDING,
+ bool MEMOIZE_OUTER_SCAN = (CUB_PTX_ARCH >= 350) ? true : false,
+ BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS,
+ cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte,
+ int BLOCK_DIM_Y = 1,
+ int BLOCK_DIM_Z = 1,
+ int PTX_ARCH = CUB_PTX_ARCH>
+class BlockRadixRank
+{
+private:
+
+ /******************************************************************************
+ * Type definitions and constants
+ ******************************************************************************/
+
+ // Integer type for digit counters (to be packed into words of type PackedCounters)
+ typedef unsigned short DigitCounter;
+
+ // Integer type for packing DigitCounters into columns of shared memory banks
+ typedef typename If<(SMEM_CONFIG == cudaSharedMemBankSizeEightByte),
+ unsigned long long,
+ unsigned int>::Type PackedCounter;
+
+ enum
+ {
+ // The thread block size in threads
+ BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+
+ RADIX_DIGITS = 1 << RADIX_BITS,
+
+ LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH),
+ WARP_THREADS = 1 << LOG_WARP_THREADS,
+ WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
+
+ BYTES_PER_COUNTER = sizeof(DigitCounter),
+ LOG_BYTES_PER_COUNTER = Log2<BYTES_PER_COUNTER>::VALUE,
+
+ PACKING_RATIO = sizeof(PackedCounter) / sizeof(DigitCounter),
+ LOG_PACKING_RATIO = Log2<PACKING_RATIO>::VALUE,
+
+ LOG_COUNTER_LANES = CUB_MAX((RADIX_BITS - LOG_PACKING_RATIO), 0), // Always at least one lane
+ COUNTER_LANES = 1 << LOG_COUNTER_LANES,
+
+ // The number of packed counters per thread (plus one for padding)
+ RAKING_SEGMENT = COUNTER_LANES + 1,
+
+ LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(PTX_ARCH),
+ SMEM_BANKS = 1 << LOG_SMEM_BANKS,
+ };
+
+
+ /// BlockScan type
+ typedef BlockScan<
+ PackedCounter,
+ BLOCK_DIM_X,
+ INNER_SCAN_ALGORITHM,
+ BLOCK_DIM_Y,
+ BLOCK_DIM_Z,
+ PTX_ARCH>
+ BlockScan;
+
+
+ /// Shared memory storage layout type for BlockRadixRank
+ struct _TempStorage
+ {
+ // Storage for scanning local ranks
+ typename BlockScan::TempStorage block_scan;
+
+ union
+ {
+ DigitCounter digit_counters[COUNTER_LANES + 1][BLOCK_THREADS][PACKING_RATIO];
+ PackedCounter raking_grid[BLOCK_THREADS][RAKING_SEGMENT];
+ };
+ };
+
+
+ /******************************************************************************
+ * Thread fields
+ ******************************************************************************/
+
+ /// Shared storage reference
+ _TempStorage &temp_storage;
+
+ /// Linear thread-id
+ int linear_tid;
+
+ /// Copy of raking segment, promoted to registers
+ PackedCounter cached_segment[RAKING_SEGMENT];
+
+
+ /******************************************************************************
+ * Templated iteration
+ ******************************************************************************/
+
+ // General template iteration
+ template <int COUNT, int MAX>
+ struct Iterate
+ {
+ /**
+ * Decode keys. Decodes the radix digit from the current digit place
+ * and increments the thread's corresponding counter in shared
+ * memory for that digit.
+ *
+ * Saves both (1) the prior value of that counter (the key's
+ * thread-local exclusive prefix sum for that digit), and (2) the shared
+ * memory offset of the counter (for later use).
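+ *
+ * As a worked example (values illustrative, not prescriptive): with
+ * RADIX_BITS = 5 and 4-byte packed counters (PACKING_RATIO = 2, so
+ * LOG_COUNTER_LANES = 4), a decoded digit of 19 (binary 10011) selects
+ * sub_counter = 19 >> 4 = 1 and counter_lane = 19 & 15 = 3.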
+ */
+ template <typename UnsignedBits, int KEYS_PER_THREAD>
+ static __device__ __forceinline__ void DecodeKeys(
+ BlockRadixRank &cta, // BlockRadixRank instance
+ UnsignedBits (&keys)[KEYS_PER_THREAD], // Key to decode
+ DigitCounter (&thread_prefixes)[KEYS_PER_THREAD], // Prefix counter value (out parameter)
+ DigitCounter* (&digit_counters)[KEYS_PER_THREAD], // Counter smem offset (out parameter)
+ int current_bit, // The least-significant bit position of the current digit to extract
+ int num_bits) // The number of bits in the current digit
+ {
+ // Get digit
+ UnsignedBits digit = BFE(keys[COUNT], current_bit, num_bits);
+
+ // Get sub-counter
+ UnsignedBits sub_counter = digit >> LOG_COUNTER_LANES;
+
+ // Get counter lane
+ UnsignedBits counter_lane = digit & (COUNTER_LANES - 1);
+
+ if (DESCENDING)
+ {
+ sub_counter = PACKING_RATIO - 1 - sub_counter;
+ counter_lane = COUNTER_LANES - 1 - counter_lane;
+ }
+
+ // Pointer to smem digit counter
+ digit_counters[COUNT] = &cta.temp_storage.digit_counters[counter_lane][cta.linear_tid][sub_counter];
+
+ // Load thread-exclusive prefix
+ thread_prefixes[COUNT] = *digit_counters[COUNT];
+
+ // Store inclusive prefix
+ *digit_counters[COUNT] = thread_prefixes[COUNT] + 1;
+
+ // Iterate next key
+ Iterate<COUNT + 1, MAX>::DecodeKeys(cta, keys, thread_prefixes, digit_counters, current_bit, num_bits);
+ }
+
+
+ // Update ranks
+ template <int KEYS_PER_THREAD>
+ static __device__ __forceinline__ void UpdateRanks(
+ int (&ranks)[KEYS_PER_THREAD], // Local ranks (out parameter)
+ DigitCounter (&thread_prefixes)[KEYS_PER_THREAD], // Prefix counter value
+ DigitCounter* (&digit_counters)[KEYS_PER_THREAD]) // Counter smem offset
+ {
+ // Add in threadblock exclusive prefix
+ ranks[COUNT] = thread_prefixes[COUNT] + *digit_counters[COUNT];
+
+ // Iterate next key
+ Iterate<COUNT + 1, MAX>::UpdateRanks(ranks, thread_prefixes, digit_counters);
+ }
+ };
+
+
+ // Termination
+ template <int MAX>
+ struct Iterate<MAX, MAX>
+ {
+ // DecodeKeys
+ template <typename UnsignedBits, int KEYS_PER_THREAD>
+ static __device__ __forceinline__ void DecodeKeys(
+ BlockRadixRank &cta,
+ UnsignedBits (&keys)[KEYS_PER_THREAD],
+ DigitCounter (&thread_prefixes)[KEYS_PER_THREAD],
+ DigitCounter* (&digit_counters)[KEYS_PER_THREAD],
+ int current_bit, // The least-significant bit position of the current digit to extract
+ int num_bits) // The number of bits in the current digit
+ {}
+
+
+ // UpdateRanks
+ template <int KEYS_PER_THREAD>
+ static __device__ __forceinline__ void UpdateRanks(
+ int (&ranks)[KEYS_PER_THREAD],
+ DigitCounter (&thread_prefixes)[KEYS_PER_THREAD],
+ DigitCounter *(&digit_counters)[KEYS_PER_THREAD])
+ {}
+ };
+
+
+ /******************************************************************************
+ * Utility methods
+ ******************************************************************************/
+
+ /**
+ * Internal storage allocator
+ */
+ __device__ __forceinline__ _TempStorage& PrivateStorage()
+ {
+ __shared__ _TempStorage private_storage;
+ return private_storage;
+ }
+
+
+ /**
+ * Performs upsweep raking reduction, returning the aggregate
+ */
+ __device__ __forceinline__ PackedCounter Upsweep()
+ {
+ PackedCounter *smem_raking_ptr = temp_storage.raking_grid[linear_tid];
+ PackedCounter *raking_ptr;
+
+ if (MEMOIZE_OUTER_SCAN)
+ {
+ // Copy data into registers
+ #pragma unroll
+ for (int i = 0; i < RAKING_SEGMENT; i++)
+ {
+ cached_segment[i] = smem_raking_ptr[i];
+ }
+ raking_ptr = cached_segment;
+ }
+ else
+ {
+ raking_ptr = smem_raking_ptr;
+ }
+
+ return ThreadReduce<RAKING_SEGMENT>(raking_ptr, Sum());
+ }
+
+
+ /// Performs exclusive downsweep raking scan
+ __device__ __forceinline__ void ExclusiveDownsweep(
+ PackedCounter raking_partial)
{ + PackedCounter *smem_raking_ptr = temp_storage.raking_grid[linear_tid]; + + PackedCounter *raking_ptr = (MEMOIZE_OUTER_SCAN) ? + cached_segment : + smem_raking_ptr; + + // Exclusive raking downsweep scan + ThreadScanExclusive(raking_ptr, raking_ptr, Sum(), raking_partial); + + if (MEMOIZE_OUTER_SCAN) + { + // Copy data back to smem + #pragma unroll + for (int i = 0; i < RAKING_SEGMENT; i++) + { + smem_raking_ptr[i] = cached_segment[i]; + } + } + } + + + /** + * Reset shared memory digit counters + */ + __device__ __forceinline__ void ResetCounters() + { + // Reset shared memory digit counters + #pragma unroll + for (int LANE = 0; LANE < COUNTER_LANES + 1; LANE++) + { + *((PackedCounter*) temp_storage.digit_counters[LANE][linear_tid]) = 0; + } + } + + + /** + * Scan shared memory digit counters. + */ + __device__ __forceinline__ void ScanCounters() + { + // Upsweep scan + PackedCounter raking_partial = Upsweep(); + + // Compute exclusive sum + PackedCounter exclusive_partial; + PackedCounter packed_aggregate; + BlockScan(temp_storage.block_scan).ExclusiveSum(raking_partial, exclusive_partial, packed_aggregate); + + // Propagate totals in packed fields + #pragma unroll + for (int PACKED = 1; PACKED < PACKING_RATIO; PACKED++) + { + exclusive_partial += packed_aggregate << (sizeof(DigitCounter) * 8 * PACKED); + } + + // Downsweep scan with exclusive partial + ExclusiveDownsweep(exclusive_partial); + } + +public: + + /// \smemstorage{BlockScan} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockRadixRank() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. + */ + __device__ __forceinline__ BlockRadixRank( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Raking + *********************************************************************/ + //@{ + + /** + * \brief Rank keys. 
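+     *
+     * \par
+     * A minimal usage sketch (hypothetical kernel; the block size, digit width,
+     * and array names here are illustrative assumptions, not part of this header):
+     * \code
+     * #include <cub/block/block_radix_rank.cuh>
+     *
+     * __global__ void ExampleRankKernel(unsigned int *d_keys, int *d_ranks)
+     * {
+     *     // 128 threads, 4-bit digits, ascending order
+     *     typedef cub::BlockRadixRank<128, 4, false> BlockRadixRank;
+     *     __shared__ typename BlockRadixRank::TempStorage temp_storage;
+     *
+     *     // Each thread owns 4 consecutive keys of the tile
+     *     unsigned int thread_keys[4];
+     *     for (int ITEM = 0; ITEM < 4; ++ITEM)
+     *         thread_keys[ITEM] = d_keys[threadIdx.x * 4 + ITEM];
+     *
+     *     // Compute each key's local rank within the tile for digit bits [0,4)
+     *     int thread_ranks[4];
+     *     BlockRadixRank(temp_storage).RankKeys(thread_keys, thread_ranks, 0, 4);
+     *
+     *     for (int ITEM = 0; ITEM < 4; ++ITEM)
+     *         d_ranks[threadIdx.x * 4 + ITEM] = thread_ranks[ITEM];
+     * }
+     * \endcode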
+ */ + template < + typename UnsignedBits, + int KEYS_PER_THREAD> + __device__ __forceinline__ void RankKeys( + UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile + int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile + int current_bit, ///< [in] The least-significant bit position of the current digit to extract + int num_bits) ///< [in] The number of bits in the current digit + { + DigitCounter thread_prefixes[KEYS_PER_THREAD]; // For each key, the count of previous keys in this tile having the same digit + DigitCounter* digit_counters[KEYS_PER_THREAD]; // For each key, the byte-offset of its corresponding digit counter in smem + + // Reset shared memory digit counters + ResetCounters(); + + // Decode keys and update digit counters + Iterate<0, KEYS_PER_THREAD>::DecodeKeys(*this, keys, thread_prefixes, digit_counters, current_bit, num_bits); + + __syncthreads(); + + // Scan shared memory counters + ScanCounters(); + + __syncthreads(); + + // Extract the local ranks of each key + Iterate<0, KEYS_PER_THREAD>::UpdateRanks(ranks, thread_prefixes, digit_counters); + } + + + /** + * \brief Rank keys. For the lower \p RADIX_DIGITS threads, digit counts for each digit are provided for the corresponding thread. + */ + template < + typename UnsignedBits, + int KEYS_PER_THREAD> + __device__ __forceinline__ void RankKeys( + UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile + int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile (out parameter) + int current_bit, ///< [in] The least-significant bit position of the current digit to extract + int num_bits, ///< [in] The number of bits in the current digit + int &inclusive_digit_prefix) ///< [out] The incluisve prefix sum for the digit threadIdx.x + { + // Rank keys + RankKeys(keys, ranks, current_bit, num_bits); + + // Get the inclusive and exclusive digit totals corresponding to the calling thread. + if ((BLOCK_THREADS == RADIX_DIGITS) || (linear_tid < RADIX_DIGITS)) + { + int bin_idx = (DESCENDING) ? + RADIX_DIGITS - linear_tid - 1 : + linear_tid; + + // Obtain ex/inclusive digit counts. (Unfortunately these all reside in the + // first counter column, resulting in unavoidable bank conflicts.) + int counter_lane = (bin_idx & (COUNTER_LANES - 1)); + int sub_counter = bin_idx >> (LOG_COUNTER_LANES); + inclusive_digit_prefix = temp_storage.digit_counters[counter_lane + 1][0][sub_counter]; + } + } +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/SRC/cub/block/block_radix_sort.cuh b/SRC/cub/block/block_radix_sort.cuh new file mode 100644 index 00000000..032f3678 --- /dev/null +++ b/SRC/cub/block/block_radix_sort.cuh @@ -0,0 +1,863 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockRadixSort class provides [collective](index.html#sec0) methods for radix sorting of items partitioned across a CUDA thread block. + */ + + +#pragma once + +#include "block_exchange.cuh" +#include "block_radix_rank.cuh" +#include "../util_ptx.cuh" +#include "../util_arch.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief The BlockRadixSort class provides [collective](index.html#sec0) methods for sorting items partitioned across a CUDA thread block using a radix sorting method. ![](sorting_logo.png) + * \ingroup BlockModule + * + * \tparam Key Key type + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam ITEMS_PER_THREAD The number of items per thread + * \tparam Value [optional] Value type (default: cub::NullType, which indicates a keys-only sort) + * \tparam RADIX_BITS [optional] The number of radix bits per digit place (default: 4 bits) + * \tparam MEMOIZE_OUTER_SCAN [optional] Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise). + * \tparam INNER_SCAN_ALGORITHM [optional] The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS) + * \tparam SMEM_CONFIG [optional] Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte) + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - The [radix sorting method](http://en.wikipedia.org/wiki/Radix_sort) arranges + * items into ascending order. It relies upon a positional representation for + * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits, + * characters, etc.) specified from least-significant to most-significant. For a + * given input sequence of keys and a set of rules specifying a total ordering + * of the symbolic alphabet, the radix sorting method produces a lexicographic + * ordering of those keys. + * - BlockRadixSort can sort all of the built-in C++ numeric primitive types, e.g.: + * unsigned char, \p int, \p double, etc. 
Within each key, the implementation treats fixed-length + * bit-sequences of \p RADIX_BITS as radix digit places. Although the direct radix sorting + * method can only be applied to unsigned integral types, BlockRadixSort + * is able to sort signed and floating-point types via simple bit-wise transformations + * that ensure lexicographic key ordering. + * - \rowmajor + * + * \par Performance Considerations + * - \granularity + * + * \par A Simple Example + * \blockcollective{BlockRadixSort} + * \par + * The code snippet below illustrates a sort of 512 integer keys that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * ... + * + * // Collectively sort the keys + * BlockRadixSort(temp_storage).Sort(thread_keys); + * + * ... + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The + * corresponding output \p thread_keys in those threads will be + * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. + * + */ +template < + typename Key, + int BLOCK_DIM_X, + int ITEMS_PER_THREAD, + typename Value = NullType, + int RADIX_BITS = 4, + bool MEMOIZE_OUTER_SCAN = (CUB_PTX_ARCH >= 350) ? true : false, + BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, + cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockRadixSort +{ +private: + + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + enum + { + // The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + // Whether or not there are values to be trucked along with keys + KEYS_ONLY = Equals::VALUE, + }; + + // Key traits and unsigned bits type + typedef NumericTraits KeyTraits; + typedef typename KeyTraits::UnsignedBits UnsignedBits; + + /// Ascending BlockRadixRank utility type + typedef BlockRadixRank< + BLOCK_DIM_X, + RADIX_BITS, + false, + MEMOIZE_OUTER_SCAN, + INNER_SCAN_ALGORITHM, + SMEM_CONFIG, + BLOCK_DIM_Y, + BLOCK_DIM_Z, + PTX_ARCH> + AscendingBlockRadixRank; + + /// Descending BlockRadixRank utility type + typedef BlockRadixRank< + BLOCK_DIM_X, + RADIX_BITS, + true, + MEMOIZE_OUTER_SCAN, + INNER_SCAN_ALGORITHM, + SMEM_CONFIG, + BLOCK_DIM_Y, + BLOCK_DIM_Z, + PTX_ARCH> + DescendingBlockRadixRank; + + /// BlockExchange utility type for keys + typedef BlockExchange BlockExchangeKeys; + + /// BlockExchange utility type for values + typedef BlockExchange BlockExchangeValues; + + /// Shared memory storage layout type + struct _TempStorage + { + union + { + typename AscendingBlockRadixRank::TempStorage asending_ranking_storage; + typename DescendingBlockRadixRank::TempStorage descending_ranking_storage; + typename BlockExchangeKeys::TempStorage exchange_keys; + typename 
BlockExchangeValues::TempStorage exchange_values; + }; + }; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + /// Rank keys (specialized for ascending sort) + __device__ __forceinline__ void RankKeys( + UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + int begin_bit, + int pass_bits, + Int2Type is_descending) + { + AscendingBlockRadixRank(temp_storage.asending_ranking_storage).RankKeys( + unsigned_keys, + ranks, + begin_bit, + pass_bits); + } + + /// Rank keys (specialized for descending sort) + __device__ __forceinline__ void RankKeys( + UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + int begin_bit, + int pass_bits, + Int2Type is_descending) + { + DescendingBlockRadixRank(temp_storage.descending_ranking_storage).RankKeys( + unsigned_keys, + ranks, + begin_bit, + pass_bits); + } + + /// ExchangeValues (specialized for key-value sort, to-blocked arrangement) + __device__ __forceinline__ void ExchangeValues( + Value (&values)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + Int2Type is_keys_only, + Int2Type is_blocked) + { + __syncthreads(); + + // Exchange values through shared memory in blocked arrangement + BlockExchangeValues(temp_storage.exchange_values).ScatterToBlocked(values, ranks); + } + + /// ExchangeValues (specialized for key-value sort, to-striped arrangement) + __device__ __forceinline__ void ExchangeValues( + Value (&values)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + Int2Type is_keys_only, + Int2Type is_blocked) + { + __syncthreads(); + + // Exchange values through shared memory in blocked arrangement + BlockExchangeValues(temp_storage.exchange_values).ScatterToStriped(values, ranks); + } + + /// ExchangeValues (specialized for keys-only sort) + template + __device__ __forceinline__ void ExchangeValues( + Value (&values)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + Int2Type is_keys_only, + Int2Type is_blocked) + {} + + /// Sort blocked arrangement + template + __device__ __forceinline__ void SortBlocked( + Key (&keys)[ITEMS_PER_THREAD], ///< Keys to sort + Value (&values)[ITEMS_PER_THREAD], ///< Values to sort + int begin_bit, ///< The beginning (least-significant) bit index needed for key comparison + int end_bit, ///< The past-the-end (most-significant) bit index needed for key comparison + Int2Type is_descending, ///< Tag whether is a descending-order sort + Int2Type is_keys_only) ///< Tag whether is keys-only sort + { + UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] = + reinterpret_cast(keys); + + // Twiddle bits if necessary + #pragma unroll + for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) + { + unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]); + } + + // Radix sorting passes + while (true) + { + int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit); + + // Rank the blocked keys + int ranks[ITEMS_PER_THREAD]; + RankKeys(unsigned_keys, ranks, begin_bit, 
pass_bits, is_descending); + begin_bit += RADIX_BITS; + + __syncthreads(); + + // Exchange keys through shared memory in blocked arrangement + BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks); + + // Exchange values through shared memory in blocked arrangement + ExchangeValues(values, ranks, is_keys_only, Int2Type()); + + // Quit if done + if (begin_bit >= end_bit) break; + + __syncthreads(); + } + + // Untwiddle bits if necessary + #pragma unroll + for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) + { + unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]); + } + } + + /// Sort blocked -> striped arrangement + template + __device__ __forceinline__ void SortBlockedToStriped( + Key (&keys)[ITEMS_PER_THREAD], ///< Keys to sort + Value (&values)[ITEMS_PER_THREAD], ///< Values to sort + int begin_bit, ///< The beginning (least-significant) bit index needed for key comparison + int end_bit, ///< The past-the-end (most-significant) bit index needed for key comparison + Int2Type is_descending, ///< Tag whether is a descending-order sort + Int2Type is_keys_only) ///< Tag whether is keys-only sort + { + UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] = + reinterpret_cast(keys); + + // Twiddle bits if necessary + #pragma unroll + for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) + { + unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]); + } + + // Radix sorting passes + while (true) + { + int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit); + + // Rank the blocked keys + int ranks[ITEMS_PER_THREAD]; + RankKeys(unsigned_keys, ranks, begin_bit, pass_bits, is_descending); + begin_bit += RADIX_BITS; + + __syncthreads(); + + // Check if this is the last pass + if (begin_bit >= end_bit) + { + // Last pass exchanges keys through shared memory in striped arrangement + BlockExchangeKeys(temp_storage.exchange_keys).ScatterToStriped(keys, ranks); + + // Last pass exchanges through shared memory in striped arrangement + ExchangeValues(values, ranks, is_keys_only, Int2Type()); + + // Quit + break; + } + + // Exchange keys through shared memory in blocked arrangement + BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks); + + // Exchange values through shared memory in blocked arrangement + ExchangeValues(values, ranks, is_keys_only, Int2Type()); + + __syncthreads(); + } + + // Untwiddle bits if necessary + #pragma unroll + for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) + { + unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]); + } + } + + + +public: + + /// \smemstorage{BlockScan} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockRadixSort() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. 
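+     *
+     * \par
+     * A minimal sketch (hypothetical kernel; names are illustrative) of passing
+     * explicitly-declared temporary storage, which allows that storage to be
+     * reused or unioned with the storage of other collectives:
+     * \code
+     * #include <cub/block/block_radix_sort.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_keys)
+     * {
+     *     typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;
+     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
+     *
+     *     int thread_keys[4];
+     *     for (int ITEM = 0; ITEM < 4; ++ITEM)
+     *         thread_keys[ITEM] = d_keys[threadIdx.x * 4 + ITEM];
+     *
+     *     // Construct with the explicitly-provided temporary storage
+     *     BlockRadixSort(temp_storage).Sort(thread_keys);
+     *
+     *     for (int ITEM = 0; ITEM < 4; ++ITEM)
+     *         d_keys[threadIdx.x * 4 + ITEM] = thread_keys[ITEM];
+     * }
+     * \endcode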
+ */ + __device__ __forceinline__ BlockRadixSort( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Sorting (blocked arrangements) + *********************************************************************/ + //@{ + + /** + * \brief Performs an ascending block-wide radix sort over a [blocked arrangement](index.html#sec5sec3) of keys. + * + * \par + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sort of 512 integer keys that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive keys. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * ... + * + * // Collectively sort the keys + * BlockRadixSort(temp_storage).Sort(thread_keys); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. + * The corresponding output \p thread_keys in those threads will be + * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. + */ + __device__ __forceinline__ void Sort( + Key (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(Key) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + { + NullType values[ITEMS_PER_THREAD]; + + SortBlocked(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); + } + + + /** + * \brief Performs an ascending block-wide radix sort across a [blocked arrangement](index.html#sec5sec3) of keys and values. + * + * \par + * - BlockRadixSort can only accommodate one associated tile of values. To "truck along" + * more than one tile of values, simply perform a key-value sort of the keys paired + * with a temporary value array that enumerates the key indices. The reordered indices + * can then be used as a gather-vector for exchanging other associated tile data through + * shared memory. + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sort of 512 integer keys and values that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive pairs. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * int thread_values[4]; + * ... 
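+     *     // ('...' above stands for populating thread_keys / thread_values,
+     *     //  e.g., a blocked load from hypothetical arrays d_keys / d_values --
+     *     //  illustrative, not prescribed by this interface)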
+ * + * // Collectively sort the keys and values among block threads + * BlockRadixSort(temp_storage).Sort(thread_keys, thread_values); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The + * corresponding output \p thread_keys in those threads will be + * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. + * + */ + __device__ __forceinline__ void Sort( + Key (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + Value (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort + int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(Key) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + { + SortBlocked(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); + } + + /** + * \brief Performs a descending block-wide radix sort over a [blocked arrangement](index.html#sec5sec3) of keys. + * + * \par + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sort of 512 integer keys that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive keys. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * ... + * + * // Collectively sort the keys + * BlockRadixSort(temp_storage).Sort(thread_keys); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. + * The corresponding output \p thread_keys in those threads will be + * { [511,510,509,508], [11,10,9,8], [7,6,5,4], ..., [3,2,1,0] }. + */ + __device__ __forceinline__ void SortDescending( + Key (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(Key) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + { + NullType values[ITEMS_PER_THREAD]; + + SortBlocked(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); + } + + + /** + * \brief Performs a descending block-wide radix sort across a [blocked arrangement](index.html#sec5sec3) of keys and values. + * + * \par + * - BlockRadixSort can only accommodate one associated tile of values. To "truck along" + * more than one tile of values, simply perform a key-value sort of the keys paired + * with a temporary value array that enumerates the key indices. The reordered indices + * can then be used as a gather-vector for exchanging other associated tile data through + * shared memory. + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sort of 512 integer keys and values that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive pairs. 
+ * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * int thread_values[4]; + * ... + * + * // Collectively sort the keys and values among block threads + * BlockRadixSort(temp_storage).Sort(thread_keys, thread_values); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The + * corresponding output \p thread_keys in those threads will be + * { [511,510,509,508], [11,10,9,8], [7,6,5,4], ..., [3,2,1,0] }. + * + */ + __device__ __forceinline__ void SortDescending( + Key (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + Value (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort + int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(Key) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + { + SortBlocked(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); + } + + + //@} end member group + /******************************************************************//** + * \name Sorting (blocked arrangement -> striped arrangement) + *********************************************************************/ + //@{ + + + /** + * \brief Performs an ascending radix sort across a [blocked arrangement](index.html#sec5sec3) of keys, leaving them in a [striped arrangement](index.html#sec5sec3). + * + * \par + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sort of 512 integer keys that + * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive keys. The final partitioning is striped. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * ... + * + * // Collectively sort the keys + * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The + * corresponding output \p thread_keys in those threads will be + * { [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }. 
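+     *
+     * \par
+     * The striped result is convenient for coalesced global stores. A minimal
+     * sketch (hypothetical kernel; \p d_in / \p d_out are illustrative names):
+     * \code
+     * #include <cub/block/block_radix_sort.cuh>
+     *
+     * __global__ void ExampleStoreKernel(int *d_in, int *d_out)
+     * {
+     *     typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;
+     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
+     *
+     *     // Blocked load: thread i reads items 4*i .. 4*i+3
+     *     int thread_keys[4];
+     *     for (int ITEM = 0; ITEM < 4; ++ITEM)
+     *         thread_keys[ITEM] = d_in[threadIdx.x * 4 + ITEM];
+     *
+     *     BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys);
+     *
+     *     // Striped store: on each iteration, consecutive threads write
+     *     // consecutive addresses
+     *     for (int ITEM = 0; ITEM < 4; ++ITEM)
+     *         d_out[(ITEM * 128) + threadIdx.x] = thread_keys[ITEM];
+     * }
+     * \endcode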
+ * + */ + __device__ __forceinline__ void SortBlockedToStriped( + Key (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(Key) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + { + NullType values[ITEMS_PER_THREAD]; + + SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); + } + + + /** + * \brief Performs an ascending radix sort across a [blocked arrangement](index.html#sec5sec3) of keys and values, leaving them in a [striped arrangement](index.html#sec5sec3). + * + * \par + * - BlockRadixSort can only accommodate one associated tile of values. To "truck along" + * more than one tile of values, simply perform a key-value sort of the keys paired + * with a temporary value array that enumerates the key indices. The reordered indices + * can then be used as a gather-vector for exchanging other associated tile data through + * shared memory. + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sort of 512 integer keys and values that + * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive pairs. The final partitioning is striped. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * int thread_values[4]; + * ... + * + * // Collectively sort the keys and values among block threads + * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_values); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The + * corresponding output \p thread_keys in those threads will be + * { [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }. + * + */ + __device__ __forceinline__ void SortBlockedToStriped( + Key (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + Value (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort + int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(Key) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + { + SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); + } + + + /** + * \brief Performs a descending radix sort across a [blocked arrangement](index.html#sec5sec3) of keys, leaving them in a [striped arrangement](index.html#sec5sec3). + * + * \par + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sort of 512 integer keys that + * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive keys. The final partitioning is striped. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * ... + * + * // Collectively sort the keys + * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The + * corresponding output \p thread_keys in those threads will be + * { [511,383,255,127], [386,258,130,2], [385,257,128,1], ..., [384,256,128,0] }. + * + */ + __device__ __forceinline__ void SortDescendingBlockedToStriped( + Key (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(Key) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + { + NullType values[ITEMS_PER_THREAD]; + + SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); + } + + + /** + * \brief Performs a descending radix sort across a [blocked arrangement](index.html#sec5sec3) of keys and values, leaving them in a [striped arrangement](index.html#sec5sec3). + * + * \par + * - BlockRadixSort can only accommodate one associated tile of values. To "truck along" + * more than one tile of values, simply perform a key-value sort of the keys paired + * with a temporary value array that enumerates the key indices. The reordered indices + * can then be used as a gather-vector for exchanging other associated tile data through + * shared memory. + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sort of 512 integer keys and values that + * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive pairs. The final partitioning is striped. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * int thread_values[4]; + * ... + * + * // Collectively sort the keys and values among block threads + * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_values); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The + * corresponding output \p thread_keys in those threads will be + * { [511,383,255,127], [386,258,130,2], [385,257,128,1], ..., [384,256,128,0] }. 
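+     *
+     * \par
+     * The optional \p begin_bit / \p end_bit arguments restrict comparison to a
+     * sub-range of key bits, reducing the number of radix passes. A minimal sketch
+     * (hypothetical kernel; array names are illustrative), for keys known to
+     * occupy only their low 16 bits:
+     * \code
+     * __global__ void ExampleLowBitsKernel(unsigned int *d_keys, int *d_values)
+     * {
+     *     typedef cub::BlockRadixSort<unsigned int, 128, 4, int> BlockRadixSort;
+     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
+     *
+     *     unsigned int thread_keys[4];
+     *     int          thread_values[4];
+     *     for (int ITEM = 0; ITEM < 4; ++ITEM)
+     *     {
+     *         thread_keys[ITEM]   = d_keys[threadIdx.x * 4 + ITEM];
+     *         thread_values[ITEM] = d_values[threadIdx.x * 4 + ITEM];
+     *     }
+     *
+     *     // Only bits [0,16) are significant: four 4-bit passes instead of eight
+     *     BlockRadixSort(temp_storage).SortDescendingBlockedToStriped(
+     *         thread_keys, thread_values, 0, 16);
+     *
+     *     for (int ITEM = 0; ITEM < 4; ++ITEM)
+     *     {
+     *         d_keys[(ITEM * 128) + threadIdx.x]   = thread_keys[ITEM];
+     *         d_values[(ITEM * 128) + threadIdx.x] = thread_values[ITEM];
+     *     }
+     * }
+     * \endcode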
+ * + */ + __device__ __forceinline__ void SortDescendingBlockedToStriped( + Key (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + Value (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort + int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(Key) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + { + SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); + } + + + //@} end member group + +}; + +/** + * \example example_block_radix_sort.cu + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/SRC/cub/block/block_raking_layout.cuh b/SRC/cub/block/block_raking_layout.cuh new file mode 100644 index 00000000..9c01f255 --- /dev/null +++ b/SRC/cub/block/block_raking_layout.cuh @@ -0,0 +1,149 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockRakingLayout provides a conflict-free shared memory layout abstraction for warp-raking across thread block data. + */ + + +#pragma once + +#include "../util_macro.cuh" +#include "../util_arch.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief BlockRakingLayout provides a conflict-free shared memory layout abstraction for 1D raking across thread block data. ![](raking.png) + * \ingroup BlockModule + * + * \par Overview + * This type facilitates a shared memory usage pattern where a block of CUDA + * threads places elements into shared memory and then reduces the active + * parallelism to one "raking" warp of threads for serially aggregating consecutive + * sequences of shared items. 
Padding is inserted to eliminate bank conflicts + * (for most data types). + * + * \tparam T The data type to be exchanged. + * \tparam BLOCK_THREADS The thread block size in threads. + * \tparam PTX_ARCH [optional] \ptxversion + */ +template < + typename T, + int BLOCK_THREADS, + int PTX_ARCH = CUB_PTX_ARCH> +struct BlockRakingLayout +{ + //--------------------------------------------------------------------- + // Constants and type definitions + //--------------------------------------------------------------------- + + enum + { + /// The total number of elements that need to be cooperatively reduced + SHARED_ELEMENTS = BLOCK_THREADS, + + /// Maximum number of warp-synchronous raking threads + MAX_RAKING_THREADS = CUB_MIN(BLOCK_THREADS, CUB_WARP_THREADS(PTX_ARCH)), + + /// Number of raking elements per warp-synchronous raking thread (rounded up) + SEGMENT_LENGTH = (SHARED_ELEMENTS + MAX_RAKING_THREADS - 1) / MAX_RAKING_THREADS, + + /// Never use a raking thread that will have no valid data (e.g., when BLOCK_THREADS is 62 and SEGMENT_LENGTH is 2, we should only use 31 raking threads) + RAKING_THREADS = (SHARED_ELEMENTS + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH, + + /// Whether we will have bank conflicts (technically we should find out if the GCD is > 1) + HAS_CONFLICTS = (CUB_SMEM_BANKS(PTX_ARCH) % SEGMENT_LENGTH == 0), + + /// Degree of bank conflicts (e.g., 4-way) + CONFLICT_DEGREE = (HAS_CONFLICTS) ? + (MAX_RAKING_THREADS * SEGMENT_LENGTH) / CUB_SMEM_BANKS(PTX_ARCH) : + 1, + + /// Pad each segment length with one element if degree of bank conflicts is greater than 4-way (heuristic) + SEGMENT_PADDING = (CONFLICT_DEGREE > CUB_PREFER_CONFLICT_OVER_PADDING(PTX_ARCH)) ? 1 : 0, +// SEGMENT_PADDING = (HAS_CONFLICTS) ? 1 : 0, + + /// Total number of elements in the raking grid + GRID_ELEMENTS = RAKING_THREADS * (SEGMENT_LENGTH + SEGMENT_PADDING), + + /// Whether or not we need bounds checking during raking (the number of reduction elements is not a multiple of the number of raking threads) + UNGUARDED = (SHARED_ELEMENTS % RAKING_THREADS == 0), + }; + + + /** + * \brief Shared memory storage type + */ + typedef T _TempStorage[BlockRakingLayout::GRID_ELEMENTS]; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /** + * \brief Returns the location for the calling thread to place data into the grid + */ + static __device__ __forceinline__ T* PlacementPtr( + TempStorage &temp_storage, + int linear_tid) + { + // Offset for partial + unsigned int offset = linear_tid; + + // Add in one padding element for every segment + if (SEGMENT_PADDING > 0) + { + offset += offset / SEGMENT_LENGTH; + } + + // Incorporating a block of padding partials every shared memory segment + return temp_storage.Alias() + offset; + } + + + /** + * \brief Returns the location for the calling thread to begin sequential raking + */ + static __device__ __forceinline__ T* RakingPtr( + TempStorage &temp_storage, + int linear_tid) + { + return temp_storage.Alias() + (linear_tid * (SEGMENT_LENGTH + SEGMENT_PADDING)); + } +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/SRC/cub/block/block_reduce.cuh b/SRC/cub/block/block_reduce.cuh new file mode 100644 index 00000000..8e3124c9 --- /dev/null +++ b/SRC/cub/block/block_reduce.cuh @@ -0,0 +1,607 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. 
All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block. + */ + +#pragma once + +#include "specializations/block_reduce_raking.cuh" +#include "specializations/block_reduce_raking_commutative_only.cuh" +#include "specializations/block_reduce_warp_reductions.cuh" +#include "../util_ptx.cuh" +#include "../util_type.cuh" +#include "../thread/thread_operators.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + + +/****************************************************************************** + * Algorithmic variants + ******************************************************************************/ + +/** + * BlockReduceAlgorithm enumerates alternative algorithms for parallel + * reduction across a CUDA threadblock. + */ +enum BlockReduceAlgorithm +{ + + /** + * \par Overview + * An efficient "raking" reduction algorithm that only supports commutative + * reduction operators (true for most operations, e.g., addition). + * + * \par + * Execution is comprised of three phases: + * -# Upsweep sequential reduction in registers (if threads contribute more + * than one input each). Threads in warps other than the first warp place + * their partial reductions into shared memory. + * -# Upsweep sequential reduction in shared memory. Threads within the first + * warp continue to accumulate by raking across segments of shared partial reductions + * -# A warp-synchronous Kogge-Stone style reduction within the raking warp. + * + * \par + * \image html block_reduce.png + *
+     * <div class="centercaption">\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.</div>
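+     *
+     * \par
+     * Selecting this variant is done through the \p ALGORITHM template parameter
+     * of cub::BlockReduce. A minimal sketch (hypothetical kernel; array names are
+     * illustrative), valid because addition is commutative:
+     * \code
+     * __global__ void ExampleSumKernel(int *d_in, int *d_block_sums)
+     * {
+     *     typedef cub::BlockReduce<int, 128,
+     *         cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY> BlockReduce;
+     *     __shared__ typename BlockReduce::TempStorage temp_storage;
+     *
+     *     int thread_data = d_in[blockIdx.x * 128 + threadIdx.x];
+     *
+     *     // Block-wide sum (valid in thread0 only)
+     *     int block_sum = BlockReduce(temp_storage).Sum(thread_data);
+     *
+     *     if (threadIdx.x == 0) d_block_sums[blockIdx.x] = block_sum;
+     * }
+     * \endcode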
+     *
+     * \par Performance Considerations
+     * - This variant performs less communication than BLOCK_REDUCE_RAKING
+     *   and is preferable when the reduction operator is commutative.  This variant
+     *   applies fewer reduction operators than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall
+     *   throughput across the GPU when suitably occupied.  However, turn-around latency may be
+     *   higher than that of BLOCK_REDUCE_WARP_REDUCTIONS, making it less desirable
+     *   when the GPU is under-occupied.
+     */
+    BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY,
+
+
+    /**
+     * \par Overview
+     * An efficient "raking" reduction algorithm that supports commutative
+     * (e.g., addition) and non-commutative (e.g., string concatenation) reduction
+     * operators. \blocked.
+     *
+     * \par
+     * Execution is comprised of three phases:
+     * -# Upsweep sequential reduction in registers (if threads contribute more
+     *    than one input each).  Each thread then places the partial reduction
+     *    of its item(s) into shared memory.
+     * -# Upsweep sequential reduction in shared memory.  Threads within a
+     *    single warp rake across segments of shared partial reductions.
+     * -# A warp-synchronous Kogge-Stone style reduction within the raking warp.
+     *
+     * \par
+     * \image html block_reduce.png
+     * <div class="centercaption">\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.</div>
+     *
+     * \par Performance Considerations
+     * - This variant performs more communication than BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY
+     *   and is only preferable when the reduction operator is non-commutative.  This variant
+     *   applies fewer reduction operators than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall
+     *   throughput across the GPU when suitably occupied.  However, turn-around latency may be
+     *   higher than that of BLOCK_REDUCE_WARP_REDUCTIONS, making it less desirable
+     *   when the GPU is under-occupied.
+     */
+    BLOCK_REDUCE_RAKING,
+
+
+    /**
+     * \par Overview
+     * A quick "tiled warp-reductions" reduction algorithm that supports commutative
+     * (e.g., addition) and non-commutative (e.g., string concatenation) reduction
+     * operators.
+     *
+     * \par
+     * Execution is comprised of three phases:
+     * -# Upsweep sequential reduction in registers (if threads contribute more
+     *    than one input each).  Each thread then places the partial reduction
+     *    of its item(s) into shared memory.
+     * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style
+     *    reduction within each warp.
+     * -# A propagation phase where the warp reduction outputs in each warp are
+     *    updated with the aggregate from each preceding warp.
+     *
+     * \par
+     * \image html block_scan_warpscans.png
+     * <div class="centercaption">\p BLOCK_REDUCE_WARP_REDUCTIONS data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.</div>
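+     *
+     * \par
+     * This is the default \p ALGORITHM for cub::BlockReduce, so the two
+     * specializations below are equivalent (illustrative typedefs):
+     * \code
+     * typedef cub::BlockReduce<int, 128> BlockReduceDefault;
+     * typedef cub::BlockReduce<int, 128,
+     *     cub::BLOCK_REDUCE_WARP_REDUCTIONS> BlockReduceExplicit;
+     * \endcode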
+     *
+     * \par Performance Considerations
+     * - This variant applies more reduction operators than BLOCK_REDUCE_RAKING
+     *   or BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY, which may result in lower overall
+     *   throughput across the GPU.  However, turn-around latency may be lower and
+     *   thus more useful when the GPU is under-occupied.
+     */
+    BLOCK_REDUCE_WARP_REDUCTIONS,
+};
+
+
+/******************************************************************************
+ * Block reduce
+ ******************************************************************************/
+
+/**
+ * \brief The BlockReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block. ![](reduce_logo.png)
+ * \ingroup BlockModule
+ *
+ * \tparam T             Data type being reduced
+ * \tparam BLOCK_DIM_X   The thread block length in threads along the X dimension
+ * \tparam ALGORITHM     [optional] cub::BlockReduceAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_REDUCE_WARP_REDUCTIONS)
+ * \tparam BLOCK_DIM_Y   [optional] The thread block length in threads along the Y dimension (default: 1)
+ * \tparam BLOCK_DIM_Z   [optional] The thread block length in threads along the Z dimension (default: 1)
+ * \tparam PTX_ARCH      [optional] \ptxversion
+ *
+ * \par Overview
+ * - A reduction (or fold)
+ *   uses a binary combining operator to compute a single aggregate from a list of input elements.
+ * - \rowmajor
+ * - BlockReduce can be optionally specialized by algorithm to accommodate different latency/throughput workload profiles:
+ *   -# cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY.  An efficient "raking" reduction algorithm that only supports commutative reduction operators.  [More...](\ref cub::BlockReduceAlgorithm)
+ *   -# cub::BLOCK_REDUCE_RAKING.  An efficient "raking" reduction algorithm that supports commutative and non-commutative reduction operators.  [More...](\ref cub::BlockReduceAlgorithm)
+ *   -# cub::BLOCK_REDUCE_WARP_REDUCTIONS.  A quick "tiled warp-reductions" reduction algorithm that supports commutative and non-commutative reduction operators.  [More...](\ref cub::BlockReduceAlgorithm)
+ *
+ * \par Performance Considerations
+ * - \granularity
+ * - Very efficient (only one synchronization barrier).
+ * - Incurs zero bank conflicts for most types
+ * - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
+ *   - Summation (vs. generic reduction)
+ *   - \p BLOCK_THREADS is a multiple of the architecture's warp size
+ *   - Every thread has a valid input (i.e., full vs. partial-tiles)
+ * - See cub::BlockReduceAlgorithm for performance details regarding algorithmic alternatives
+ *
+ * \par A Simple Example
+ * \blockcollective{BlockReduce}
+ * \par
+ * The code snippet below illustrates a sum reduction of 512 integer items that
+ * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads
+ * where each thread owns 4 consecutive items.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Specialize BlockReduce for a 1D block of 128 threads on type int
+ *     typedef cub::BlockReduce<int, 128> BlockReduce;
+ *
+ *     // Allocate shared memory for BlockReduce
+ *     __shared__ typename BlockReduce::TempStorage temp_storage;
+ *
+ *     // Obtain a segment of consecutive items that are blocked across threads
+ *     int thread_data[4];
+ *     ...
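+ *     // ('...' above stands for populating thread_data, e.g., a blocked load
+ *     //  from a hypothetical input array d_in -- illustrative)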
+ * + * // Compute the block-wide sum for thread0 + * int aggregate = BlockReduce(temp_storage).Sum(thread_data); + * + * \endcode + * + */ +template < + typename T, + int BLOCK_DIM_X, + BlockReduceAlgorithm ALGORITHM = BLOCK_REDUCE_WARP_REDUCTIONS, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockReduce +{ +private: + + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + typedef BlockReduceWarpReductions WarpReductions; + typedef BlockReduceRakingCommutativeOnly RakingCommutativeOnly; + typedef BlockReduceRaking Raking; + + /// Internal specialization type + typedef typename If<(ALGORITHM == BLOCK_REDUCE_WARP_REDUCTIONS), + WarpReductions, + typename If<(ALGORITHM == BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY), + RakingCommutativeOnly, + Raking>::Type>::Type InternalBlockReduce; // BlockReduceRaking + + /// Shared memory storage layout type for BlockReduce + typedef typename InternalBlockReduce::TempStorage _TempStorage; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + +public: + + /// \smemstorage{BlockReduce} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockReduce() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. + */ + __device__ __forceinline__ BlockReduce( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Generic reductions + *********************************************************************/ + //@{ + + + /** + * \brief Computes a block-wide reduction for thread0 using the specified binary reduction functor. Each thread contributes one input element. + * + * \par + * - The return value is undefined in threads other than thread0. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a max reduction of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize BlockReduce for a 1D block of 128 threads on type int + * typedef cub::BlockReduce BlockReduce; + * + * // Allocate shared memory for BlockReduce + * __shared__ typename BlockReduce::TempStorage temp_storage; + * + * // Each thread obtains an input item + * int thread_data; + * ... + * + * // Compute the block-wide max for thread0 + * int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max()); + * + * \endcode + * + * \tparam ReductionOp [inferred] Binary reduction functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ T Reduce( + T input, ///< [in] Calling thread's input + ReductionOp reduction_op) ///< [in] Binary reduction functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) + { + return InternalBlockReduce(temp_storage).template Reduce(input, BLOCK_THREADS, reduction_op); + } + + + /** + * \brief Computes a block-wide reduction for thread0 using the specified binary reduction functor. Each thread contributes an array of consecutive input elements. + * + * \par + * - The return value is undefined in threads other than thread0. + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a max reduction of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockReduce for a 1D block of 128 threads on type int + * typedef cub::BlockReduce BlockReduce; + * + * // Allocate shared memory for BlockReduce + * __shared__ typename BlockReduce::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Compute the block-wide max for thread0 + * int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max()); + * + * \endcode + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam ReductionOp [inferred] Binary reduction functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member T operator()(const T &a, const T &b) + */ + template < + int ITEMS_PER_THREAD, + typename ReductionOp> + __device__ __forceinline__ T Reduce( + T (&inputs)[ITEMS_PER_THREAD], ///< [in] Calling thread's input segment + ReductionOp reduction_op) ///< [in] Binary reduction functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) + { + // Reduce partials + T partial = ThreadReduce(inputs, reduction_op); + return Reduce(partial, reduction_op); + } + + + /** + * \brief Computes a block-wide reduction for thread0 using the specified binary reduction functor. The first \p num_valid threads each contribute one input element. + * + * \par + * - The return value is undefined in threads other than thread0. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a max reduction of a partially-full tile of integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int num_valid, ...) 
+
+
+    /**
+     * \brief Computes a block-wide reduction for thread0 using the specified binary reduction functor.  The first \p num_valid threads each contribute one input element.
+     *
+     * \par
+     * - The return value is undefined in threads other than thread0.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a max reduction of a partially-full tile of integer items that
+     * are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
+     *
+     * __global__ void ExampleKernel(int num_valid, ...)
+     * {
+     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
+     *     typedef cub::BlockReduce<int, 128> BlockReduce;
+     *
+     *     // Allocate shared memory for BlockReduce
+     *     __shared__ typename BlockReduce::TempStorage temp_storage;
+     *
+     *     // Each thread obtains an input item
+     *     int thread_data;
+     *     if (threadIdx.x < num_valid) thread_data = ...
+     *
+     *     // Compute the block-wide max for thread0
+     *     int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max(), num_valid);
+     *
+     * \endcode
+     *
+     * \tparam ReductionOp      [inferred] Binary reduction functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member T operator()(const T &a, const T &b)
+     */
+    template <typename ReductionOp>
+    __device__ __forceinline__ T Reduce(
+        T               input,              ///< [in] Calling thread's input
+        ReductionOp     reduction_op,       ///< [in] Binary reduction functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
+        int             num_valid)          ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS)
+    {
+        // Determine whether we can skip bounds checking
+        if (num_valid >= BLOCK_THREADS)
+        {
+            return InternalBlockReduce(temp_storage).template Reduce<true>(input, num_valid, reduction_op);
+        }
+        else
+        {
+            return InternalBlockReduce(temp_storage).template Reduce<false>(input, num_valid, reduction_op);
+        }
+    }
+
+
+    //@}  end member group
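+
+    // ------------------------------------------------------------------
+    // Editorial sketch (not part of the original CUB header): the
+    // num_valid overloads only consume inputs from the first num_valid
+    // threads, so a tail block can reduce safely without padding the
+    // input.  Hypothetical tail-block usage:
+    //
+    //   int remaining = total_items - blockIdx.x * 128;   // may be < 128
+    //   int thread_data = 0;
+    //   if (threadIdx.x < remaining)
+    //       thread_data = d_in[base + threadIdx.x];
+    //   int sum = BlockReduce(temp_storage).Sum(thread_data, remaining);
+    // ------------------------------------------------------------------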
+    /******************************************************************//**
+     * \name Summation reductions
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes a block-wide reduction for thread0 using addition (+) as the reduction operator.  Each thread contributes one input element.
+     *
+     * \par
+     * - The return value is undefined in threads other than thread0.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a sum reduction of 128 integer items that
+     * are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
+     *     typedef cub::BlockReduce<int, 128> BlockReduce;
+     *
+     *     // Allocate shared memory for BlockReduce
+     *     __shared__ typename BlockReduce::TempStorage temp_storage;
+     *
+     *     // Each thread obtains an input item
+     *     int thread_data;
+     *     ...
+     *
+     *     // Compute the block-wide sum for thread0
+     *     int aggregate = BlockReduce(temp_storage).Sum(thread_data);
+     *
+     * \endcode
+     *
+     */
+    __device__ __forceinline__ T Sum(
+        T   input)                      ///< [in] Calling thread's input
+    {
+        return InternalBlockReduce(temp_storage).template Sum<true>(input, BLOCK_THREADS);
+    }
+
+    /**
+     * \brief Computes a block-wide reduction for thread0 using addition (+) as the reduction operator.  Each thread contributes an array of consecutive input elements.
+     *
+     * \par
+     * - The return value is undefined in threads other than thread0.
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a sum reduction of 512 integer items that
+     * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
+     *     typedef cub::BlockReduce<int, 128> BlockReduce;
+     *
+     *     // Allocate shared memory for BlockReduce
+     *     __shared__ typename BlockReduce::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Compute the block-wide sum for thread0
+     *     int aggregate = BlockReduce(temp_storage).Sum(thread_data);
+     *
+     * \endcode
+     *
+     * \tparam ITEMS_PER_THREAD     [inferred] The number of consecutive items partitioned onto each thread.
+     */
+    template <int ITEMS_PER_THREAD>
+    __device__ __forceinline__ T Sum(
+        T   (&inputs)[ITEMS_PER_THREAD])    ///< [in] Calling thread's input segment
+    {
+        // Reduce partials
+        T partial = ThreadReduce(inputs, cub::Sum());
+        return Sum(partial);
+    }
+
+
+    /**
+     * \brief Computes a block-wide reduction for thread0 using addition (+) as the reduction operator.  The first \p num_valid threads each contribute one input element.
+     *
+     * \par
+     * - The return value is undefined in threads other than thread0.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a sum reduction of a partially-full tile of integer items that
+     * are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
+     *
+     * __global__ void ExampleKernel(int num_valid, ...)
+     * {
+     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
+     *     typedef cub::BlockReduce<int, 128> BlockReduce;
+     *
+     *     // Allocate shared memory for BlockReduce
+     *     __shared__ typename BlockReduce::TempStorage temp_storage;
+     *
+     *     // Each thread obtains an input item (up to num_items)
+     *     int thread_data;
+     *     if (threadIdx.x < num_valid)
+     *         thread_data = ...
+     *
+     *     // Compute the block-wide sum for thread0
+     *     int aggregate = BlockReduce(temp_storage).Sum(thread_data, num_valid);
+     *
+     * \endcode
+     *
+     */
+    __device__ __forceinline__ T Sum(
+        T   input,                  ///< [in] Calling thread's input
+        int num_valid)              ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS)
+    {
+        // Determine whether we can skip bounds checking
+        if (num_valid >= BLOCK_THREADS)
+        {
+            return InternalBlockReduce(temp_storage).template Sum<true>(input, num_valid);
+        }
+        else
+        {
+            return InternalBlockReduce(temp_storage).template Sum<false>(input, num_valid);
+        }
+    }
+
+
+    //@}  end member group
+};
+
+/**
+ * \example example_block_reduce.cu
+ */
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
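A minimal, self-contained sketch of driving the BlockReduce primitive defined above from a complete kernel; the kernel and buffer names here are illustrative only and are not part of CUB or of this patch:

#include <cub/cub.cuh>

// Hypothetical kernel: each 128-thread block reduces 128 ints and
// writes its partial sum; a second pass (or another reduction) would
// combine the per-block results.
__global__ void BlockSumKernel(const int *d_in, int *d_block_sums)
{
    typedef cub::BlockReduce<int, 128> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;

    int thread_data = d_in[blockIdx.x * 128 + threadIdx.x];

    // The aggregate is only defined for thread0 of each block.
    int aggregate = BlockReduce(temp_storage).Sum(thread_data);
    if (threadIdx.x == 0)
        d_block_sums[blockIdx.x] = aggregate;
}
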
diff --git a/SRC/cub/block/block_scan.cuh b/SRC/cub/block/block_scan.cuh
new file mode 100644
index 00000000..84e58302
--- /dev/null
+++ b/SRC/cub/block/block_scan.cuh
@@ -0,0 +1,2318 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::BlockScan class provides [collective](index.html#sec0) methods for computing a parallel prefix sum/scan of items partitioned across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "specializations/block_scan_raking.cuh"
+#include "specializations/block_scan_warp_scans.cuh"
+#include "../util_arch.cuh"
+#include "../util_type.cuh"
+#include "../util_ptx.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+
+/******************************************************************************
+ * Scan utility types
+ ******************************************************************************/
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+/**
+ * Reduce-value-by-ID scan operator
+ */
+template <typename ReductionOp>     ///< Wrapped reduction operator type
+struct ReduceByKeyOp
+{
+    ReductionOp op;                 ///< Wrapped reduction operator
+
+    /// Constructor
+    __device__ __forceinline__ ReduceByKeyOp(ReductionOp op) : op(op) {}
+
+    /// Scan operator
+    template <typename KeyValuePair>
+    __device__ __forceinline__ KeyValuePair operator()(
+        const KeyValuePair &first,
+        const KeyValuePair &second)
+    {
+        KeyValuePair retval;
+
+        retval.value = (second.key != first.key) ?
+            second.value :                      // The second value is for a different ID, return only that value
+            op(first.value, second.value);      // The values are for the same ID so reduce them
+
+        retval.key = second.key;
+        return retval;
+    }
+};
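+
+// ---------------------------------------------------------------------
+// Editorial sketch (not part of the original CUB header): an inclusive
+// scan of the hypothetical (key,value) sequence below with
+// ReduceByKeyOp<cub::Sum> restarts the running sum whenever the key
+// changes:
+//
+//   in:  (0,1) (0,1) (0,1) (1,1) (1,1) (2,1)
+//   out: (0,1) (0,2) (0,3) (1,1) (1,2) (2,1)
+// ---------------------------------------------------------------------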
+
+
+/**
+ * Segmented scan operator
+ */
+template <typename ReductionOp>     ///< Wrapped reduction operator type
+struct SegmentedOp
+{
+    ReductionOp op;                 ///< Wrapped reduction operator
+
+    /// Constructor
+    __device__ __forceinline__ SegmentedOp(ReductionOp op) : op(op) {}
+
+    /// Scan operator
+    template <typename KeyValuePair>
+    __device__ __forceinline__ KeyValuePair operator()(
+        const KeyValuePair &first,
+        const KeyValuePair &second)
+    {
+        if (second.key) {
+            KeyValuePair retval;
+            retval.value = second.value;
+            retval.key = first.key + second.key;
+            return retval;
+        } else {
+            KeyValuePair retval;
+            retval.value = op(first.value, second.value);
+            retval.key = first.key + second.key;
+            return retval;
+        }
+    }
+};
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+
+/******************************************************************************
+ * Algorithmic variants
+ ******************************************************************************/
+
+/**
+ * \brief BlockScanAlgorithm enumerates alternative algorithms for cub::BlockScan to compute a parallel prefix scan across a CUDA thread block.
+ */
+enum BlockScanAlgorithm
+{
+
+    /**
+     * \par Overview
+     * An efficient "raking reduce-then-scan" prefix scan algorithm.  Execution is comprised of five phases:
+     * -# Upsweep sequential reduction in registers (if threads contribute more than one input each).  Each thread then places the partial reduction of its item(s) into shared memory.
+     * -# Upsweep sequential reduction in shared memory.  Threads within a single warp rake across segments of shared partial reductions.
+     * -# A warp-synchronous Kogge-Stone style exclusive scan within the raking warp.
+     * -# Downsweep sequential exclusive scan in shared memory.  Threads within a single warp rake across segments of shared partial reductions, seeded with the warp-scan output.
+     * -# Downsweep sequential scan in registers (if threads contribute more than one input), seeded with the raking scan output.
+     *
+     * \par
+     * \image html block_scan_raking.png
+     * \p BLOCK_SCAN_RAKING data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.
+     *
+     * \par Performance Considerations
+     * - Although this variant may suffer longer turnaround latencies when the
+     *   GPU is under-occupied, it can often provide higher overall throughput
+     *   across the GPU when suitably occupied.
+     */
+    BLOCK_SCAN_RAKING,
+
+
+    /**
+     * \par Overview
+     * Similar to cub::BLOCK_SCAN_RAKING, but with fewer shared memory reads at
+     * the expense of higher register pressure.  Raking threads preserve their
+     * "upsweep" segment of values in registers while performing warp-synchronous
+     * scan, allowing the "downsweep" not to re-read them from shared memory.
+     */
+    BLOCK_SCAN_RAKING_MEMOIZE,
+
+
+    /**
+     * \par Overview
+     * A quick "tiled warpscans" prefix scan algorithm.  Execution is comprised of four phases:
+     * -# Upsweep sequential reduction in registers (if threads contribute more than one input each).  Each thread then places the partial reduction of its item(s) into shared memory.
+     * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style scan within each warp.
+     * -# A propagation phase where the warp scan outputs in each warp are updated with the aggregate from each preceding warp.
+     * -# Downsweep sequential scan in registers (if threads contribute more than one input), seeded with the raking scan output.
+     *
+     * \par
+     * \image html block_scan_warpscans.png
+     * \p BLOCK_SCAN_WARP_SCANS data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.
+     *
+     * \par Performance Considerations
+     * - Although this variant may suffer lower overall throughput across the
+     *   GPU because of a heavy reliance on inefficient warpscans, it can
+     *   often provide lower turnaround latencies when the GPU is under-occupied.
+     */
+    BLOCK_SCAN_WARP_SCANS,
+};
+
+
+/******************************************************************************
+ * Block scan
+ ******************************************************************************/
+
+/**
+ * \brief The BlockScan class provides [collective](index.html#sec0) methods for computing a parallel prefix sum/scan of items partitioned across a CUDA thread block. ![](block_scan_logo.png)
+ * \ingroup BlockModule
+ *
+ * \tparam T                Data type being scanned
+ * \tparam BLOCK_DIM_X      The thread block length in threads along the X dimension
+ * \tparam ALGORITHM        [optional] cub::BlockScanAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_SCAN_RAKING)
+ * \tparam BLOCK_DIM_Y      [optional] The thread block length in threads along the Y dimension (default: 1)
+ * \tparam BLOCK_DIM_Z      [optional] The thread block length in threads along the Z dimension (default: 1)
+ * \tparam PTX_ARCH         [optional] \ptxversion
+ *
+ * \par Overview
+ * - Given a list of input elements and a binary reduction operator, a [prefix scan](http://en.wikipedia.org/wiki/Prefix_sum)
+ *   produces an output list where each element is computed to be the reduction
+ *   of the elements occurring earlier in the input list.  Prefix sum
+ *   connotes a prefix scan with the addition operator.  The term \em inclusive indicates
+ *   that the ith output reduction incorporates the ith input.
+ *   The term \em exclusive indicates the ith input is not incorporated into
+ *   the ith output reduction.
+ * - \rowmajor
+ * - BlockScan can be optionally specialized by algorithm to accommodate different workload profiles:
+ *   -# cub::BLOCK_SCAN_RAKING.  An efficient (high throughput) "raking reduce-then-scan" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm)
+ *   -# cub::BLOCK_SCAN_RAKING_MEMOIZE.  Similar to cub::BLOCK_SCAN_RAKING, but having higher throughput at the expense of additional register pressure for intermediate storage. [More...](\ref cub::BlockScanAlgorithm)
+ *   -# cub::BLOCK_SCAN_WARP_SCANS.  A quick (low latency) "tiled warpscans" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm)
+ *
+ * \par Performance Considerations
+ * - \granularity
+ * - Uses special instructions when applicable (e.g., warp \p SHFL)
+ * - Uses synchronization-free communication between warp lanes when applicable
+ * - Invokes a minimal number of minimal block-wide synchronization barriers (only
+ *   one or two depending on algorithm selection)
+ * - Incurs zero bank conflicts for most types
+ * - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
+ *   - Prefix sum variants (vs. generic scan)
+ *   - \blocksize
+ * - See cub::BlockScanAlgorithm for performance details regarding algorithmic alternatives
+ *
+ * \par A Simple Example
+ * \blockcollective{BlockScan}
+ * \par
+ * The code snippet below illustrates an exclusive prefix sum of 512 integer items that
+ * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads
+ * where each thread owns 4 consecutive items.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Specialize BlockScan for a 1D block of 128 threads on type int
+ *     typedef cub::BlockScan<int, 128> BlockScan;
+ *
+ *     // Allocate shared memory for BlockScan
+ *     __shared__ typename BlockScan::TempStorage temp_storage;
+ *
+ *     // Obtain a segment of consecutive items that are blocked across threads
+ *     int thread_data[4];
+ *     ...
+ *
+ *     // Collectively compute the block-wide exclusive prefix sum
+ *     BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is
+ * {[1,1,1,1], [1,1,1,1], ..., [1,1,1,1]}.
+ * The corresponding output \p thread_data in those threads will be
+ * {[0,1,2,3], [4,5,6,7], ..., [508,509,510,511]}.
+ *
+ */
+template <
+    typename            T,
+    int                 BLOCK_DIM_X,
+    BlockScanAlgorithm  ALGORITHM       = BLOCK_SCAN_RAKING,
+    int                 BLOCK_DIM_Y     = 1,
+    int                 BLOCK_DIM_Z     = 1,
+    int                 PTX_ARCH        = CUB_PTX_ARCH>
+class BlockScan
+{
+private:
+
+    /******************************************************************************
+     * Constants and type definitions
+     ******************************************************************************/
+
+    /// Constants
+    enum
+    {
+        /// The thread block size in threads
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+    };
+
+    /**
+     * Ensure the template parameterization meets the requirements of the
+     * specified algorithm.  Currently, the BLOCK_SCAN_WARP_SCANS policy
+     * cannot be used with threadblock sizes not a multiple of the
+     * architectural warp size.
+     */
+    static const BlockScanAlgorithm SAFE_ALGORITHM =
+        ((ALGORITHM == BLOCK_SCAN_WARP_SCANS) && (BLOCK_THREADS % CUB_WARP_THREADS(PTX_ARCH) != 0)) ?
+            BLOCK_SCAN_RAKING :
+            ALGORITHM;
+
+    typedef BlockScanWarpScans<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> WarpScans;
+    typedef BlockScanRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, (SAFE_ALGORITHM == BLOCK_SCAN_RAKING_MEMOIZE), PTX_ARCH> Raking;
+
+    /// Define the delegate type for the desired algorithm
+    typedef typename If<(SAFE_ALGORITHM == BLOCK_SCAN_WARP_SCANS),
+        WarpScans,
+        Raking>::Type InternalBlockScan;
+
+    /// Shared memory storage layout type for BlockScan
+    typedef typename InternalBlockScan::TempStorage _TempStorage;
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Shared storage reference
+    _TempStorage &temp_storage;
+
+    /// Linear thread-id
+    int linear_tid;
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /// Internal storage allocator
+    __device__ __forceinline__ _TempStorage& PrivateStorage()
+    {
+        __shared__ _TempStorage private_storage;
+        return private_storage;
+    }
+
+
+public:
+
+    /// \smemstorage{BlockScan}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
+     */
+    __device__ __forceinline__ BlockScan()
+    :
+        temp_storage(PrivateStorage()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.
+ */ + __device__ __forceinline__ BlockScan( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + + + + + //@} end member group + /******************************************************************//** + * \name Exclusive prefix sum operations + *********************************************************************/ + //@{ + + + /** + * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. + * + * \par + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an exclusive prefix sum of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... + * + * // Collectively compute the block-wide exclusive prefix sum + * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The + * corresponding output \p thread_data in those threads will be 0, 1, ..., 127. + * + */ + __device__ __forceinline__ void ExclusiveSum( + T input, ///< [in] Calling thread's input item + T &output) ///< [out] Calling thread's output item (may be aliased to \p input) + { + T block_aggregate; + InternalBlockScan(temp_storage).ExclusiveSum(input, output, block_aggregate); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an exclusive prefix sum of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... + * + * // Collectively compute the block-wide exclusive prefix sum + * int block_aggregate; + * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The + * corresponding output \p thread_data in those threads will be 0, 1, ..., 127. + * Furthermore the value \p 128 will be stored in \p block_aggregate for all threads. 
+ * + */ + __device__ __forceinline__ void ExclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + InternalBlockScan(temp_storage).ExclusiveSum(input, output, block_aggregate); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block that progressively + * computes an exclusive prefix sum over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 128 integer items that are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. + * struct BlockPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. + * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total += block_aggregate; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) + * { + * // Specialize BlockScan for a 1D block of 128 threads + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Initialize running total + * BlockPrefixCallbackOp prefix_op(0); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data = d_data[block_offset]; + * + * // Collectively compute the block-wide exclusive prefix sum + * int block_aggregate; + * BlockScan(temp_storage).ExclusiveSum( + * thread_data, thread_data, block_aggregate, prefix_op); + * __syncthreads(); + * + * // Store scanned items to output segment + * d_data[block_offset] = thread_data; + * } + * \endcode + * \par + * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... + * The corresponding output for the first segment will be 0, 1, ..., 127. + * The output for the second segment will be 128, 129, ..., 255. 
Furthermore,
+     * the value \p 128 will be stored in \p block_aggregate for all threads after each scan.
+     *
+     * \tparam BlockPrefixCallbackOp        [inferred] Call-back functor type having member T operator()(T block_aggregate)
+     */
+    template <typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void ExclusiveSum(
+        T                       input,                          ///< [in] Calling thread's input item
+        T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
+        T                       &block_aggregate,               ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value)
+        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to all inputs.
+    {
+        InternalBlockScan(temp_storage).ExclusiveSum(input, output, block_aggregate, block_prefix_callback_op);
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Exclusive prefix sum operations (multiple data per thread)
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes an array of consecutive input elements.
+     *
+     * \par
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an exclusive prefix sum of 512 integer items that
+     * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Collectively compute the block-wide exclusive prefix sum
+     *     BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }.  The
+     * corresponding output \p thread_data in those threads will be { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }.
+     *
+     * \tparam ITEMS_PER_THREAD     [inferred] The number of consecutive items partitioned onto each thread.
+     */
+    template <int ITEMS_PER_THREAD>
+    __device__ __forceinline__ void ExclusiveSum(
+        T   (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
+        T   (&output)[ITEMS_PER_THREAD])    ///< [out] Calling thread's output items (may be aliased to \p input)
+    {
+        // Reduce consecutive thread items in registers
+        Sum scan_op;
+        T thread_partial = ThreadReduce(input, scan_op);
+
+        // Exclusive threadblock-scan
+        ExclusiveSum(thread_partial, thread_partial);
+
+        // Exclusive scan in registers with prefix
+        ThreadScanExclusive(input, output, scan_op, thread_partial);
+    }
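+
+    // ------------------------------------------------------------------
+    // Editorial sketch (not part of the original CUB header): the
+    // three-step pattern above generalizes the scan to multiple items
+    // per thread.  For a hypothetical thread t holding inputs [1,1,1,1]:
+    //
+    //   1. ThreadReduce        -> per-thread partial 4
+    //   2. block ExclusiveSum  -> thread prefix 4*t
+    //   3. ThreadScanExclusive -> outputs [4t, 4t+1, 4t+2, 4t+3]
+    // ------------------------------------------------------------------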
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes an array of consecutive input elements.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * \par
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an exclusive prefix sum of 512 integer items that
+     * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Collectively compute the block-wide exclusive prefix sum
+     *     int block_aggregate;
+     *     BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }.  The
+     * corresponding output \p thread_data in those threads will be { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }.
+     * Furthermore the value \p 512 will be stored in \p block_aggregate for all threads.
+     *
+     * \tparam ITEMS_PER_THREAD     [inferred] The number of consecutive items partitioned onto each thread.
+     */
+    template <int ITEMS_PER_THREAD>
+    __device__ __forceinline__ void ExclusiveSum(
+        T   (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
+        T   (&output)[ITEMS_PER_THREAD],    ///< [out] Calling thread's output items (may be aliased to \p input)
+        T   &block_aggregate)               ///< [out] block-wide aggregate reduction of input items
+    {
+        // Reduce consecutive thread items in registers
+        Sum scan_op;
+        T thread_partial = ThreadReduce(input, scan_op);
+
+        // Exclusive threadblock-scan
+        ExclusiveSum(thread_partial, thread_partial, block_aggregate);
+
+        // Exclusive scan in registers with prefix
+        ThreadScanExclusive(input, output, scan_op, thread_partial);
+    }
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes an array of consecutive input elements.  Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * \par
+     * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate).
+     *   The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
+     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
+     *   lane0 is applied as the block-wide prefix.  Can be stateful.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a single thread block that progressively
+     * computes an exclusive prefix sum over multiple "tiles" of input using a
+     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
+     * of 512 integer items that are partitioned in a [blocked arrangement](index.html#sec5sec3)
+     * across 128 threads where each thread owns 4 consecutive items.
+ * \par + * \code + * #include // or equivalently + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. + * struct BlockPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. + * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total += block_aggregate; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) + * { + * // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread + * typedef cub::BlockLoad BlockLoad; + * typedef cub::BlockStore BlockStore; + * typedef cub::BlockScan BlockScan; + * + * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan + * __shared__ union { + * typename BlockLoad::TempStorage load; + * typename BlockScan::TempStorage scan; + * typename BlockStore::TempStorage store; + * } temp_storage; + * + * // Initialize running total + * BlockPrefixCallbackOp prefix_op(0); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); + * __syncthreads(); + * + * // Collectively compute the block-wide exclusive prefix sum + * int block_aggregate; + * BlockScan(temp_storage.scan).ExclusiveSum( + * thread_data, thread_data, block_aggregate, prefix_op); + * __syncthreads(); + * + * // Store scanned items to output segment + * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); + * __syncthreads(); + * } + * \endcode + * \par + * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... + * The corresponding output for the first segment will be 0, 1, 2, 3, ..., 510, 511. + * The output for the second segment will be 512, 513, 514, 515, ..., 1022, 1023. Furthermore, + * the value \p 512 will be stored in \p block_aggregate for all threads after each scan. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + */ + template < + int ITEMS_PER_THREAD, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveSum( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + T &block_aggregate, ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value) + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to all inputs. 
+    {
+        // Reduce consecutive thread items in registers
+        Sum scan_op;
+        T thread_partial = ThreadReduce(input, scan_op);
+
+        // Exclusive threadblock-scan
+        ExclusiveSum(thread_partial, thread_partial, block_aggregate, block_prefix_callback_op);
+
+        // Exclusive scan in registers with prefix
+        ThreadScanExclusive(input, output, scan_op, thread_partial);
+    }
+
+
+
+    //@}  end member group        // Exclusive prefix sums
+    /******************************************************************//**
+     * \name Exclusive prefix scan operations
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an exclusive prefix max scan of 128 integer items that
+     * are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain input item for each thread
+     *     int thread_data;
+     *     ...
+     *
+     *     // Collectively compute the block-wide exclusive prefix max scan
+     *     BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127.  The
+     * corresponding output \p thread_data in those threads will be INT_MIN, 0, 0, 2, ..., 124, 126.
+     *
+     * \tparam ScanOp   [inferred] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member T operator()(const T &a, const T &b)
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T       input,          ///< [in] Calling thread's input item
+        T       &output,        ///< [out] Calling thread's output item (may be aliased to \p input)
+        T       identity,       ///< [in] Identity value
+        ScanOp  scan_op)        ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
+    {
+        T block_aggregate;
+        InternalBlockScan(temp_storage).ExclusiveScan(input, output, identity, scan_op, block_aggregate);
+    }
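+
+    // ------------------------------------------------------------------
+    // Editorial sketch (not part of the original CUB header): the
+    // identity argument seeds thread0's exclusive output and is normally
+    // the neutral element of scan_op.  Hypothetical pairings:
+    //
+    //   BlockScan(ts).ExclusiveScan(x, y, INT_MIN, cub::Max());  // max
+    //   BlockScan(ts).ExclusiveScan(x, y, 0,       cub::Sum());  // sum
+    // ------------------------------------------------------------------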
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an exclusive prefix max scan of 128 integer items that
+     * are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain input item for each thread
+     *     int thread_data;
+     *     ...
+     *
+     *     // Collectively compute the block-wide exclusive prefix max scan
+     *     int block_aggregate;
+     *     BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127.  The
+     * corresponding output \p thread_data in those threads will be INT_MIN, 0, 0, 2, ..., 124, 126.
+     * Furthermore the value \p 126 will be stored in \p block_aggregate for all threads.
+     *
+     * \tparam ScanOp   [inferred] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member T operator()(const T &a, const T &b)
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T       input,              ///< [in] Calling thread's input item
+        T       &output,            ///< [out] Calling thread's output item (may be aliased to \p input)
+        T       identity,           ///< [in] Identity value
+        ScanOp  scan_op,            ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
+        T       &block_aggregate)   ///< [out] block-wide aggregate reduction of input items
+    {
+        InternalBlockScan(temp_storage).ExclusiveScan(input, output, identity, scan_op, block_aggregate);
+    }
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  The call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * \par
+     * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate).
+     *   The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
+     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
+     *   lane0 is applied as the block-wide prefix.  Can be stateful.
+     * - Supports non-commutative scan operators.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a single thread block that progressively
+     * computes an exclusive prefix max scan over multiple "tiles" of input using a
+     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
+     * of 128 integer items that are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * // A stateful callback functor that maintains a running prefix to be applied
+     * // during consecutive scan operations.
+     * struct BlockPrefixCallbackOp
+     * {
+     *     // Running prefix
+     *     int running_total;
+     *
+     *     // Constructor
+     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
+     *
+     *     // Callback operator to be entered by the first warp of threads in the block.
+     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
+     *     __device__ int operator()(int block_aggregate)
+     *     {
+     *         int old_prefix = running_total;
+     *         running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
+     *         return old_prefix;
+     *     }
+     * };
+     *
+     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
+ * { + * // Specialize BlockScan for a 1D block of 128 threads + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Initialize running total + * BlockPrefixCallbackOp prefix_op(INT_MIN); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data = d_data[block_offset]; + * + * // Collectively compute the block-wide exclusive prefix max scan + * int block_aggregate; + * BlockScan(temp_storage).ExclusiveScan( + * thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate, prefix_op); + * __syncthreads(); + * + * // Store scanned items to output segment + * d_data[block_offset] = thread_data; + * } + * \endcode + * \par + * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... + * The corresponding output for the first segment will be INT_MIN, 0, 0, 2, ..., 124, 126. + * The output for the second segment will be 126, 128, 128, 130, ..., 252, 254. Furthermore, + * \p block_aggregate will be assigned \p 126 in all threads after the first scan, assigned \p 254 after the second + * scan, etc. + * + * \tparam ScanOp [inferred] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member T operator()(const T &a, const T &b) + * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + */ + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T identity, ///< [in] Identity value + ScanOp scan_op, ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) + T &block_aggregate, ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value) + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to all inputs. + { + InternalBlockScan(temp_storage).ExclusiveScan(input, output, identity, scan_op, block_aggregate, block_prefix_callback_op); + } + + + //@} end member group // Inclusive prefix sums + /******************************************************************//** + * \name Exclusive prefix scan operations (multiple data per thread) + *********************************************************************/ + //@{ + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. + * + * \par + * - Supports non-commutative scan operators. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an exclusive prefix max scan of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide exclusive prefix max scan + * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. + * The corresponding output \p thread_data in those threads will be + * { [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam ScanOp [inferred] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member T operator()(const T &a, const T &b) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + T identity, ///< [in] Identity value + ScanOp scan_op) ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) + { + // Reduce consecutive thread items in registers + T thread_partial = ThreadReduce(input, scan_op); + + // Exclusive threadblock-scan + ExclusiveScan(thread_partial, thread_partial, identity, scan_op); + + // Exclusive scan in registers with prefix + ThreadScanExclusive(input, output, scan_op, thread_partial); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - Supports non-commutative scan operators. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an exclusive prefix max scan of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide exclusive prefix max scan + * int block_aggregate; + * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. The + * corresponding output \p thread_data in those threads will be { [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }. + * Furthermore the value \p 510 will be stored in \p block_aggregate for all threads. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
+ * \tparam ScanOp [inferred] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member T operator()(const T &a, const T &b) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + T identity, ///< [in] Identity value + ScanOp scan_op, ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + // Reduce consecutive thread items in registers + T thread_partial = ThreadReduce(input, scan_op); + + // Exclusive threadblock-scan + ExclusiveScan(thread_partial, thread_partial, identity, scan_op, block_aggregate); + + // Exclusive scan in registers with prefix + ThreadScanExclusive(input, output, scan_op, thread_partial); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. + * - Supports non-commutative scan operators. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block that progressively + * computes an exclusive prefix max scan over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 128 integer items that are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. + * struct BlockPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. + * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) 
+ * { + * // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread + * typedef cub::BlockLoad BlockLoad; + * typedef cub::BlockStore BlockStore; + * typedef cub::BlockScan BlockScan; + * + * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan + * __shared__ union { + * typename BlockLoad::TempStorage load; + * typename BlockScan::TempStorage scan; + * typename BlockStore::TempStorage store; + * } temp_storage; + * + * // Initialize running total + * BlockPrefixCallbackOp prefix_op(0); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); + * __syncthreads(); + * + * // Collectively compute the block-wide exclusive prefix max scan + * int block_aggregate; + * BlockScan(temp_storage.scan).ExclusiveScan( + * thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate, prefix_op); + * __syncthreads(); + * + * // Store scanned items to output segment + * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); + * __syncthreads(); + * } + * \endcode + * \par + * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... + * The corresponding output for the first segment will be INT_MIN, 0, 0, 2, 2, 4, ..., 508, 510. + * The output for the second segment will be 510, 512, 512, 514, 514, 516, ..., 1020, 1022. Furthermore, + * \p block_aggregate will be assigned \p 510 in all threads after the first scan, assigned \p 1022 after the second + * scan, etc. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam ScanOp [inferred] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member T operator()(const T &a, const T &b) + * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + T identity, ///< [in] Identity value + ScanOp scan_op, ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) + T &block_aggregate, ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value) + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to all inputs. 
+    {
+        // Reduce consecutive thread items in registers
+        T thread_partial = ThreadReduce(input, scan_op);
+
+        // Exclusive threadblock-scan
+        ExclusiveScan(thread_partial, thread_partial, identity, scan_op, block_aggregate, block_prefix_callback_op);
+
+        // Exclusive scan in registers with prefix
+        ThreadScanExclusive(input, output, scan_op, thread_partial);
+    }
+
+
+    //@}  end member group
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+    /******************************************************************//**
+     * \name Exclusive prefix scan operations (identityless, single datum per thread)
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  With no identity value, the output computed for thread0 is undefined.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \tparam ScanOp   [inferred] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member T operator()(const T &a, const T &b)
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T       input,          ///< [in] Calling thread's input item
+        T       &output,        ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp  scan_op)        ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
+    {
+        T block_aggregate;
+        InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_aggregate);
+    }
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.  With no identity value, the output computed for thread0 is undefined.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \tparam ScanOp   [inferred] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member T operator()(const T &a, const T &b)
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T       input,              ///< [in] Calling thread's input item
+        T       &output,            ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp  scan_op,            ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
+        T       &block_aggregate)   ///< [out] block-wide aggregate reduction of input items
+    {
+        InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_aggregate);
+    }
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  The call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate).
+     * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
+     * The functor will be invoked by the first warp of threads in the block, however only the return value from
+     * lane0 is applied as the block-wide prefix.  Can be stateful.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. The call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate).
+     * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
+     * The functor will be invoked by the first warp of threads in the block, however only the return value from
+     * lane0 is applied as the block-wide prefix. Can be stateful.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \tparam ScanOp [inferred] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member T operator()(const T &a, const T &b)
+     * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate)
+     */
+    template <
+        typename ScanOp,
+        typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T input, ///< [in] Calling thread's input item
+        T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp scan_op, ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
+        T &block_aggregate, ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value)
+        BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to all inputs.
+    {
+        InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_aggregate, block_prefix_callback_op);
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Exclusive prefix scan operations (identityless, multiple data per thread)
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. With no identity value, the output computed for thread0 is undefined.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
+     * \tparam ScanOp [inferred] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member T operator()(const T &a, const T &b)
+     */
+    template <
+        int ITEMS_PER_THREAD,
+        typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
+        T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input)
+        ScanOp scan_op) ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
+    {
+        // Reduce consecutive thread items in registers
+        T thread_partial = ThreadReduce(input, scan_op);
+
+        // Exclusive threadblock-scan
+        ExclusiveScan(thread_partial, thread_partial, scan_op);
+
+        // Exclusive scan in registers with prefix
+        ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
+    }
+
+
+    /**
+     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no identity value, the output computed for thread0 is undefined.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
+     * \tparam ScanOp [inferred] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
type having member T operator()(const T &a, const T &b) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + // Reduce consecutive thread items in registers + T thread_partial = ThreadReduce(input, scan_op); + + // Exclusive threadblock-scan + ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate); + + // Exclusive scan in registers with prefix + ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0)); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. + * + * \par + * - Supports non-commutative scan operators. + * - \blocked + * - \granularity + * - \smemreuse + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam ScanOp [inferred] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member T operator()(const T &a, const T &b) + * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) + T &block_aggregate, ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value) + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to all inputs. 
+ { + // Reduce consecutive thread items in registers + T thread_partial = ThreadReduce(input, scan_op); + + // Exclusive threadblock-scan + ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate, block_prefix_callback_op); + + // Exclusive scan in registers with prefix + ThreadScanExclusive(input, output, scan_op, thread_partial); + } + + + //@} end member group + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + /******************************************************************//** + * \name Inclusive prefix sum operations + *********************************************************************/ + //@{ + + + /** + * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. + * + * \par + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an inclusive prefix sum of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... + * + * // Collectively compute the block-wide inclusive prefix sum + * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The + * corresponding output \p thread_data in those threads will be 1, 2, ..., 128. + * + */ + __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item + T &output) ///< [out] Calling thread's output item (may be aliased to \p input) + { + T block_aggregate; + InternalBlockScan(temp_storage).InclusiveSum(input, output, block_aggregate); + } + + + /** + * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an inclusive prefix sum of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... + * + * // Collectively compute the block-wide inclusive prefix sum + * int block_aggregate; + * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The + * corresponding output \p thread_data in those threads will be 1, 2, ..., 128. + * Furthermore the value \p 128 will be stored in \p block_aggregate for all threads. 
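+     * \par
+     * As a minimal follow-on sketch (\p d_block_totals is a hypothetical
+     * global-memory array, not part of this interface), the aggregate that is
+     * returned to every thread can be used to publish one total per block:
+     * \code
+     * if (threadIdx.x == 0) d_block_totals[blockIdx.x] = block_aggregate;
+     * \endcode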
+ * + */ + __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + InternalBlockScan(temp_storage).InclusiveSum(input, output, block_aggregate); + } + + + + /** + * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block that progressively + * computes an inclusive prefix sum over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 128 integer items that are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. + * struct BlockPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. + * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total += block_aggregate; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) + * { + * // Specialize BlockScan for a 1D block of 128 threads + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Initialize running total + * BlockPrefixCallbackOp prefix_op(0); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data = d_data[block_offset]; + * + * // Collectively compute the block-wide inclusive prefix sum + * int block_aggregate; + * BlockScan(temp_storage).InclusiveSum( + * thread_data, thread_data, block_aggregate, prefix_op); + * __syncthreads(); + * + * // Store scanned items to output segment + * d_data[block_offset] = thread_data; + * } + * \endcode + * \par + * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... + * The corresponding output for the first segment will be 1, 2, ..., 128. + * The output for the second segment will be 129, 130, ..., 256. 
Furthermore, + * the value \p 128 will be stored in \p block_aggregate for all threads after each scan. + * + * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + */ + template + __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T &block_aggregate, ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value) + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to all inputs. + { + InternalBlockScan(temp_storage).InclusiveSum(input, output, block_aggregate, block_prefix_callback_op); + } + + + //@} end member group + /******************************************************************//** + * \name Inclusive prefix sum operations (multiple data per thread) + *********************************************************************/ + //@{ + + + /** + * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. + * + * \par + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an inclusive prefix sum of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide inclusive prefix sum + * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The + * corresponding output \p thread_data in those threads will be { [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + */ + template + __device__ __forceinline__ void InclusiveSum( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD]) ///< [out] Calling thread's output items (may be aliased to \p input) + { + if (ITEMS_PER_THREAD == 1) + { + InclusiveSum(input[0], output[0]); + } + else + { + // Reduce consecutive thread items in registers + Sum scan_op; + T thread_partial = ThreadReduce(input, scan_op); + + // Exclusive threadblock-scan + ExclusiveSum(thread_partial, thread_partial); + + // Inclusive scan in registers with prefix + ThreadScanInclusive(input, output, scan_op, thread_partial, (linear_tid != 0)); + } + } + + + /** + * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
+ * + * \par + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an inclusive prefix sum of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide inclusive prefix sum + * int block_aggregate; + * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The + * corresponding output \p thread_data in those threads will be + * { [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }. + * Furthermore the value \p 512 will be stored in \p block_aggregate for all threads. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam ScanOp [inferred] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void InclusiveSum( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + if (ITEMS_PER_THREAD == 1) + { + InclusiveSum(input[0], output[0], block_aggregate); + } + else + { + // Reduce consecutive thread items in registers + Sum scan_op; + T thread_partial = ThreadReduce(input, scan_op); + + // Exclusive threadblock-scan + ExclusiveSum(thread_partial, thread_partial, block_aggregate); + + // Inclusive scan in registers with prefix + ThreadScanInclusive(input, output, scan_op, thread_partial, (linear_tid != 0)); + } + } + + + /** + * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block that progressively + * computes an inclusive prefix sum over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. 
Each tile consists
+     * of 512 integer items that are partitioned in a [blocked arrangement](index.html#sec5sec3)
+     * across 128 threads where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * // A stateful callback functor that maintains a running prefix to be applied
+     * // during consecutive scan operations.
+     * struct BlockPrefixCallbackOp
+     * {
+     *     // Running prefix
+     *     int running_total;
+     *
+     *     // Constructor
+     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
+     *
+     *     // Callback operator to be entered by the first warp of threads in the block.
+     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
+     *     __device__ int operator()(int block_aggregate)
+     *     {
+     *         int old_prefix = running_total;
+     *         running_total += block_aggregate;
+     *         return old_prefix;
+     *     }
+     * };
+     *
+     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
+     * {
+     *     // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread
+     *     typedef cub::BlockLoad<int*, 128, 4, BLOCK_LOAD_TRANSPOSE>   BlockLoad;
+     *     typedef cub::BlockStore<int*, 128, 4, BLOCK_STORE_TRANSPOSE> BlockStore;
+     *     typedef cub::BlockScan<int, 128>                             BlockScan;
+     *
+     *     // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
+     *     __shared__ union {
+     *         typename BlockLoad::TempStorage  load;
+     *         typename BlockScan::TempStorage  scan;
+     *         typename BlockStore::TempStorage store;
+     *     } temp_storage;
+     *
+     *     // Initialize running total
+     *     BlockPrefixCallbackOp prefix_op(0);
+     *
+     *     // Have the block iterate over segments of items
+     *     for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
+     *     {
+     *         // Load a segment of consecutive items that are blocked across threads
+     *         int thread_data[4];
+     *         BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
+     *         __syncthreads();
+     *
+     *         // Collectively compute the block-wide inclusive prefix sum
+     *         int block_aggregate;
+     *         BlockScan(temp_storage.scan).InclusiveSum(
+     *             thread_data, thread_data, block_aggregate, prefix_op);
+     *         __syncthreads();
+     *
+     *         // Store scanned items to output segment
+     *         BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
+     *         __syncthreads();
+     *     }
+     * \endcode
+     * \par
+     * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, ....
+     * The corresponding output for the first segment will be 1, 2, 3, 4, ..., 511, 512.
+     * The output for the second segment will be 513, 514, 515, 516, ..., 1023, 1024. Furthermore,
+     * the value \p 512 will be stored in \p block_aggregate for all threads after each scan.
+     *
+     * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
+     * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate)
+     */
+    template <
+        int ITEMS_PER_THREAD,
+        typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void InclusiveSum(
+        T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
+        T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input)
+        T &block_aggregate, ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value)
+        BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to all inputs.
+    {
+        if (ITEMS_PER_THREAD == 1)
+        {
+            InclusiveSum(input[0], output[0], block_aggregate, block_prefix_callback_op);
+        }
+        else
+        {
+            // Reduce consecutive thread items in registers
+            Sum scan_op;
+            T thread_partial = ThreadReduce(input, scan_op);
+
+            // Exclusive threadblock-scan
+            ExclusiveSum(thread_partial, thread_partial, block_aggregate, block_prefix_callback_op);
+
+            // Inclusive scan in registers with prefix
+            ThreadScanInclusive(input, output, scan_op, thread_partial);
+        }
+    }
+
+
+    //@}  end member group
+
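+    // A minimal sketch of a common use of the tiled prefix-sum members above
+    // (a hedged illustration; d_flags, d_in, and d_out are hypothetical
+    // buffers): inclusively summing 0/1 selection flags yields scatter
+    // offsets for stream compaction.
+    //
+    //     int flag = d_flags[block_offset + threadIdx.x];   // 0 or 1 per item
+    //     int offset;
+    //     BlockScan(temp_storage).InclusiveSum(flag, offset, block_aggregate, prefix_op);
+    //     if (flag) d_out[offset - 1] = d_in[block_offset + threadIdx.x];
+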
+    /******************************************************************//**
+     * \name Inclusive prefix scan operations
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an inclusive prefix max scan of 128 integer items that
+     * are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain input item for each thread
+     *     int thread_data;
+     *     ...
+     *
+     *     // Collectively compute the block-wide inclusive prefix max scan
+     *     BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The
+     * corresponding output \p thread_data in those threads will be 0, 0, 2, 2, ..., 126, 126.
+     *
+     * \tparam ScanOp [inferred] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member T operator()(const T &a, const T &b)
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T input, ///< [in] Calling thread's input item
+        T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp scan_op) ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
+    {
+        T block_aggregate;
+        InclusiveScan(input, output, scan_op, block_aggregate);
+    }
+
+
+    /**
+     * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * \par
+     * - Supports non-commutative scan operators.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates an inclusive prefix max scan of 128 integer items that
+     * are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockScan for a 1D block of 128 threads on type int
+     *     typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     *     // Allocate shared memory for BlockScan
+     *     __shared__ typename BlockScan::TempStorage temp_storage;
+     *
+     *     // Obtain input item for each thread
+     *     int thread_data;
+     *     ...
+     *
+     *     // Collectively compute the block-wide inclusive prefix max scan
+     *     int block_aggregate;
+     *     BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The
+     * corresponding output \p thread_data in those threads will be 0, 0, 2, 2, ..., 126, 126.
+     * Furthermore the value \p 126 will be stored in \p block_aggregate for all threads.
+     *
+     * \tparam ScanOp [inferred] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member T operator()(const T &a, const T &b)
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T input, ///< [in] Calling thread's input item
+        T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp scan_op, ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
+        T &block_aggregate) ///< [out] block-wide aggregate reduction of input items
+    {
+        InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_aggregate);
+    }
+
+
+    /**
+     * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. The call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+     *
+     * \par
+     * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate).
+     *   The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
+     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
+     *   lane0 is applied as the block-wide prefix. Can be stateful.
+     * - Supports non-commutative scan operators.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a single thread block that progressively
+     * computes an inclusive prefix max scan over multiple "tiles" of input using a
+     * prefix functor to maintain a running total between block-wide scans. Each tile consists
+     * of 128 integer items that are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+     *
+     * // A stateful callback functor that maintains a running prefix to be applied
+     * // during consecutive scan operations.
+     * struct BlockPrefixCallbackOp
+     * {
+     *     // Running prefix
+     *     int running_total;
+     *
+     *     // Constructor
+     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
+     *
+     *     // Callback operator to be entered by the first warp of threads in the block.
+     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
+     *     __device__ int operator()(int block_aggregate)
+     *     {
+     *         int old_prefix = running_total;
+     *         running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
+     *         return old_prefix;
+     *     }
+     * };
+     *
+     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
+ * { + * // Specialize BlockScan for a 1D block of 128 threads + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Initialize running total + * BlockPrefixCallbackOp prefix_op(INT_MIN); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data = d_data[block_offset]; + * + * // Collectively compute the block-wide inclusive prefix max scan + * int block_aggregate; + * BlockScan(temp_storage).InclusiveScan( + * thread_data, thread_data, cub::Max(), block_aggregate, prefix_op); + * __syncthreads(); + * + * // Store scanned items to output segment + * d_data[block_offset] = thread_data; + * } + * \endcode + * \par + * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... + * The corresponding output for the first segment will be 0, 0, 2, 2, ..., 126, 126. + * The output for the second segment will be 128, 128, 130, 130, ..., 254, 254. Furthermore, + * \p block_aggregate will be assigned \p 126 in all threads after the first scan, assigned \p 254 after the second + * scan, etc. + * + * \tparam ScanOp [inferred] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member T operator()(const T &a, const T &b) + * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + */ + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) + T &block_aggregate, ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value) + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to all inputs. + { + InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_aggregate, block_prefix_callback_op); + } + + + //@} end member group + /******************************************************************//** + * \name Inclusive prefix scan operations (multiple data per thread) + *********************************************************************/ + //@{ + + + /** + * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. + * + * \par + * - Supports non-commutative scan operators. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an inclusive prefix max scan of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... 
+ * + * // Collectively compute the block-wide inclusive prefix max scan + * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. The + * corresponding output \p thread_data in those threads will be { [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam ScanOp [inferred] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member T operator()(const T &a, const T &b) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp> + __device__ __forceinline__ void InclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) + { + if (ITEMS_PER_THREAD == 1) + { + InclusiveScan(input[0], output[0], scan_op); + } + else + { + // Reduce consecutive thread items in registers + T thread_partial = ThreadReduce(input, scan_op); + + // Exclusive threadblock-scan + ExclusiveScan(thread_partial, thread_partial, scan_op); + + // Inclusive scan in registers with prefix + ThreadScanInclusive(input, output, scan_op, thread_partial, (linear_tid != 0)); + } + } + + + /** + * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - Supports non-commutative scan operators. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an inclusive prefix max scan of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide inclusive prefix max scan + * int block_aggregate; + * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. + * The corresponding output \p thread_data in those threads will be + * { [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }. + * Furthermore the value \p 510 will be stored in \p block_aggregate for all threads. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam ScanOp [inferred] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) 
type having member T operator()(const T &a, const T &b) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp> + __device__ __forceinline__ void InclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + if (ITEMS_PER_THREAD == 1) + { + InclusiveScan(input[0], output[0], scan_op, block_aggregate); + } + else + { + // Reduce consecutive thread items in registers + T thread_partial = ThreadReduce(input, scan_op); + + // Exclusive threadblock-scan + ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate); + + // Inclusive scan in registers with prefix + ThreadScanInclusive(input, output, scan_op, thread_partial, (linear_tid != 0)); + } + } + + + /** + * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. + * - Supports non-commutative scan operators. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block that progressively + * computes an inclusive prefix max scan over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 128 integer items that are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. + * struct BlockPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. + * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) 
+ * { + * // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread + * typedef cub::BlockLoad BlockLoad; + * typedef cub::BlockStore BlockStore; + * typedef cub::BlockScan BlockScan; + * + * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan + * __shared__ union { + * typename BlockLoad::TempStorage load; + * typename BlockScan::TempStorage scan; + * typename BlockStore::TempStorage store; + * } temp_storage; + * + * // Initialize running total + * BlockPrefixCallbackOp prefix_op(0); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); + * __syncthreads(); + * + * // Collectively compute the block-wide inclusive prefix max scan + * int block_aggregate; + * BlockScan(temp_storage.scan).InclusiveScan( + * thread_data, thread_data, cub::Max(), block_aggregate, prefix_op); + * __syncthreads(); + * + * // Store scanned items to output segment + * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); + * __syncthreads(); + * } + * \endcode + * \par + * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... + * The corresponding output for the first segment will be 0, 0, 2, 2, 4, 4, ..., 510, 510. + * The output for the second segment will be 512, 512, 514, 514, 516, 516, ..., 1022, 1022. Furthermore, + * \p block_aggregate will be assigned \p 510 in all threads after the first scan, assigned \p 1022 after the second + * scan, etc. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam ScanOp [inferred] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member T operator()(const T &a, const T &b) + * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void InclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) + T &block_aggregate, ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value) + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to all inputs. 
+ { + if (ITEMS_PER_THREAD == 1) + { + InclusiveScan(input[0], output[0], scan_op, block_aggregate, block_prefix_callback_op); + } + else + { + // Reduce consecutive thread items in registers + T thread_partial = ThreadReduce(input, scan_op); + + // Exclusive threadblock-scan + ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate, block_prefix_callback_op); + + // Inclusive scan in registers with prefix + ThreadScanInclusive(input, output, scan_op, thread_partial); + } + } + + //@} end member group + + +}; + +/** + * \example example_block_scan.cu + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/SRC/cub/block/block_shift.cuh b/SRC/cub/block/block_shift.cuh new file mode 100644 index 00000000..3cd09222 --- /dev/null +++ b/SRC/cub/block/block_shift.cuh @@ -0,0 +1,325 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockShift class provides [collective](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block. + */ + +#pragma once + +#include "../util_arch.cuh" +#include "../util_ptx.cuh" +#include "../util_macro.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief The BlockShift class provides [collective](index.html#sec0) methods for shifting data partitioned across a CUDA thread block. ![](transpose_logo.png) + * \ingroup BlockModule + * + * \tparam T The data type to be exchanged. 
+ * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * It is commonplace for blocks of threads to rearrange data items between + * threads. The BlockShift abstraction allows threads to efficiently shift items + * either (a) up to their successor or (b) down to their predecessor. + * + */ +template < + typename T, + int BLOCK_DIM_X, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockShift +{ +private: + + /****************************************************************************** + * Constants + ******************************************************************************/ + + enum + { + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH), + WARP_THREADS = 1 << LOG_WARP_THREADS, + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + }; + + /****************************************************************************** + * Type definitions + ******************************************************************************/ + + /// Shared memory storage layout type + typedef typename If<(PTX_ARCH >= 300), + T[WARPS], // Kepler+ only needs smem to share between warps + T[BLOCK_THREADS] >::Type _TempStorage; + +public: + + /// \smemstorage{BlockShift} + struct TempStorage : Uninitialized<_TempStorage> {}; + +private: + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + int lane_id; + int warp_id; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + +public: + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockShift() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), + warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS), + lane_id(LaneId()) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. + */ + __device__ __forceinline__ BlockShift( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), + warp_id((WARPS == 1) ? 
0 : linear_tid / WARP_THREADS), + lane_id(LaneId()) + {} + + + //@} end member group + /******************************************************************//** + * \name Shift exchanges + *********************************************************************/ + //@{ + + + /** + * \brief Each thread obtains the \p input provided by its predecessor. The first thread receives \p block_prefix. + * + * \par + * - \smemreuse + */ + __device__ __forceinline__ void Up( + T input, ///< [in] Input item + T &output, ///< [out] Output item + T block_prefix) ///< [in] Prefix item to be provided to thread0 + { +#if CUB_PTX_ARCH >= 300 + if (lane_id == WARP_THREADS - 1) + temp_storage[warp_id] = input; + + __syncthreads(); + + output = ShuffleUp(input, 1); + if (lane_id == 0) + { + output = (linear_tid == 0) ? + block_prefix : + temp_storage[warp_id - 1]; + } +#else + temp_storage[linear_tid] = input; + + __syncthreads(); + + output = (linear_tid == 0) ? + block_prefix : + temp_storage[linear_tid - 1]; +#endif + } + + + /** + * \brief Each thread receives the \p input provided by its predecessor. The first thread receives \p block_prefix. All threads receive the \p input provided by threadBLOCK_THREADS-1. + * + * \par + * - \smemreuse + */ + __device__ __forceinline__ void Up( + T input, ///< [in] Input item + T &output, ///< [out] Output item + T block_prefix, ///< [in] Prefix item to be provided to thread0 + T &block_suffix) ///< [out] Suffix item shifted out by the threadBLOCK_THREADS-1 to be provided to all threads + { +#if CUB_PTX_ARCH >= 300 + if (lane_id == WARP_THREADS - 1) + temp_storage[warp_id] = input; + + __syncthreads(); + + output = ShuffleUp(input, 1); + if (lane_id == 0) + { + output = (linear_tid == 0) ? + block_prefix : + temp_storage[warp_id - 1]; + } + block_suffix = temp_storage[WARPS - 1]; +#else + temp_storage[linear_tid] = input; + + __syncthreads(); + + output = (linear_tid == 0) ? + block_prefix : + temp_storage[linear_tid - 1]; + + block_suffix = temp_storage[BLOCK_THREADS - 1]; +#endif + } + + + /** + * \brief Each thread obtains the \p input provided by its successor. The last thread receives \p block_suffix. + * + * \par + * - \smemreuse + */ + __device__ __forceinline__ void Down( + T input, ///< [in] Input item + T &output, ///< [out] Output item + T block_suffix) ///< [in] Suffix item to be provided to threadBLOCK_THREADS-1 + { +#if CUB_PTX_ARCH >= 300 + if (lane_id == 0) + temp_storage[warp_id] = input; + + __syncthreads(); + + output = ShuffleDown(input, 1); + if (lane_id == WARP_THREADS - 1) + { + output = (linear_tid == BLOCK_THREADS - 1) ? + block_suffix : + temp_storage[warp_id + 1]; + } +#else + temp_storage[linear_tid] = input; + + __syncthreads(); + + output = (linear_tid == BLOCK_THREADS - 1) ? + block_suffix : + temp_storage[linear_tid + 1]; +#endif + } + + + /** + * \brief Each thread obtains the \p input provided by its successor. The last thread receives \p block_suffix. All threads receive the \p input provided by thread0. 
+ * + * \par + * - \smemreuse + */ + __device__ __forceinline__ void Down( + T input, ///< [in] Input item + T &output, ///< [out] Output item + T block_suffix, ///< [in] Suffix item to be provided to threadBLOCK_THREADS-1 + T &block_prefix) ///< [out] Prefix item shifted out by the thread0 to be provided to all threads + { +#if CUB_PTX_ARCH >= 300 + if (lane_id == 0) + temp_storage[warp_id] = input; + + __syncthreads(); + + output = ShuffleDown(input, 1); + if (lane_id == WARP_THREADS - 1) + { + output = (linear_tid == BLOCK_THREADS - 1) ? + block_suffix : + temp_storage[warp_id + 1]; + } +#else + temp_storage[linear_tid] = input; + + __syncthreads(); + + output = (linear_tid == BLOCK_THREADS - 1) ? + block_suffix : + temp_storage[linear_tid + 1]; +#endif + + block_prefix = temp_storage[0]; + } + + //@} end member group + + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/SRC/cub/block/block_store.cuh b/SRC/cub/block/block_store.cuh new file mode 100644 index 00000000..066541ad --- /dev/null +++ b/SRC/cub/block/block_store.cuh @@ -0,0 +1,892 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +/** + * \file + * Operations for writing linear segments of data from the CUDA thread block + */ + +#pragma once + +#include + +#include "block_exchange.cuh" +#include "../util_ptx.cuh" +#include "../util_macro.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilIo + * @{ + */ + + +/******************************************************************//** + * \name Blocked arrangement I/O (direct) + *********************************************************************/ +//@{ + +/** + * \brief Store a blocked arrangement of items across a thread block into a linear segment of items. + * + * \blocked + * + * \tparam T [inferred] The data type to store. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam OutputIterator [inferred] The random-access iterator type for output \iterator. + */ +template < + typename T, + int ITEMS_PER_THREAD, + typename OutputIterator> +__device__ __forceinline__ void StoreDirectBlocked( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + OutputIterator block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store +{ + // Store directly in thread-blocked order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + block_itr[(linear_tid * ITEMS_PER_THREAD) + ITEM] = items[ITEM]; + } +} + + +/** + * \brief Store a blocked arrangement of items across a thread block into a linear segment of items, guarded by range + * + * \blocked + * + * \tparam T [inferred] The data type to store. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam OutputIterator [inferred] The random-access iterator type for output \iterator. + */ +template < + typename T, + int ITEMS_PER_THREAD, + typename OutputIterator> +__device__ __forceinline__ void StoreDirectBlocked( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + OutputIterator block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write +{ + // Store directly in thread-blocked order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if (ITEM + (linear_tid * ITEMS_PER_THREAD) < valid_items) + { + block_itr[(linear_tid * ITEMS_PER_THREAD) + ITEM] = items[ITEM]; + } + } +} + + +/** + * \brief Store a blocked arrangement of items across a thread block into a linear segment of items. + * + * \blocked + * + * The output offset (\p block_ptr + \p block_offset) must be quad-item aligned, + * which is the default starting offset returned by \p cudaMalloc() + * + * \par + * The following conditions will prevent vectorization and storing will fall back to cub::BLOCK_STORE_DIRECT: + * - \p ITEMS_PER_THREAD is odd + * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.) + * + * \tparam T [inferred] The data type to store. 
+/**
+ * \brief Store a blocked arrangement of items across a thread block into a linear segment of items.
+ *
+ * \blocked
+ *
+ * The output offset (\p block_ptr + \p block_offset) must be quad-item aligned,
+ * which is the default starting offset returned by \p cudaMalloc()
+ *
+ * \par
+ * The following conditions will prevent vectorization and storing will fall back to cub::BLOCK_STORE_DIRECT:
+ *   - \p ITEMS_PER_THREAD is odd
+ *   - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.)
+ *
+ * \tparam T                    [inferred] The data type to store.
+ * \tparam ITEMS_PER_THREAD     [inferred] The number of consecutive items partitioned onto each thread.
+ */
+template <
+    typename    T,
+    int         ITEMS_PER_THREAD>
+__device__ __forceinline__ void StoreDirectBlockedVectorized(
+    int         linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks)
+    T           *block_ptr,                 ///< [in] Input pointer for storing from
+    T           (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+{
+    enum
+    {
+        // Maximum CUDA vector size is 4 elements
+        MAX_VEC_SIZE = CUB_MIN(4, ITEMS_PER_THREAD),
+
+        // Vector size must be a power of two and an even divisor of the items per thread
+        VEC_SIZE = ((((MAX_VEC_SIZE - 1) & MAX_VEC_SIZE) == 0) && ((ITEMS_PER_THREAD % MAX_VEC_SIZE) == 0)) ?
+            MAX_VEC_SIZE :
+            1,
+
+        VECTORS_PER_THREAD = ITEMS_PER_THREAD / VEC_SIZE,
+    };
+
+    // Vector type
+    typedef typename CubVector<T, VEC_SIZE>::Type Vector;
+
+    // Alias global pointer
+    Vector *block_ptr_vectors = reinterpret_cast<Vector*>(block_ptr);
+
+    // Alias pointers (use "raw" array here which should get optimized away to prevent conservative PTXAS lmem spilling)
+    Vector raw_vector[VECTORS_PER_THREAD];
+    T *raw_items = reinterpret_cast<T*>(raw_vector);
+
+    // Copy
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+    {
+        raw_items[ITEM] = items[ITEM];
+    }
+
+    // Direct-store using vector types
+    StoreDirectBlocked(linear_tid, block_ptr_vectors, raw_vector);
+}
+
+
+
+//@}  end member group
+/******************************************************************//**
+ * \name Striped arrangement I/O (direct)
+ *********************************************************************/
+//@{
+
+
+/**
+ * \brief Store a striped arrangement of data across the thread block into a linear segment of items.
+ *
+ * \striped
+ *
+ * \tparam BLOCK_THREADS        The thread block size in threads
+ * \tparam T                    [inferred] The data type to store.
+ * \tparam ITEMS_PER_THREAD     [inferred] The number of consecutive items partitioned onto each thread.
+ * \tparam OutputIterator       [inferred] The random-access iterator type for output \iterator.
+ */
+template <
+    int             BLOCK_THREADS,
+    typename        T,
+    int             ITEMS_PER_THREAD,
+    typename        OutputIterator>
+__device__ __forceinline__ void StoreDirectStriped(
+    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks)
+    OutputIterator  block_itr,                  ///< [in] The thread block's base output iterator for storing to
+    T               (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+{
+    // Store directly in striped order
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+    {
+        block_itr[(ITEM * BLOCK_THREADS) + linear_tid] = items[ITEM];
+    }
+}
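+
+/*
+ * A minimal usage sketch for StoreDirectStriped (illustrative only; the kernel
+ * name and d_out are placeholders).  With 128 threads and 4 items per thread,
+ * item ITEM of thread t lands at offset (ITEM * 128) + t:
+ *
+ *     __global__ void StripedStoreKernel(int *d_out)
+ *     {
+ *         int items[4];
+ *         #pragma unroll
+ *         for (int i = 0; i < 4; ++i)
+ *             items[i] = threadIdx.x + (i * 128);   // striped values
+ *
+ *         cub::StoreDirectStriped<128>(threadIdx.x, d_out, items);
+ *     }
+ */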
+/**
+ * \brief Store a striped arrangement of data across the thread block into a linear segment of items, guarded by range
+ *
+ * \striped
+ *
+ * \tparam BLOCK_THREADS        The thread block size in threads
+ * \tparam T                    [inferred] The data type to store.
+ * \tparam ITEMS_PER_THREAD     [inferred] The number of consecutive items partitioned onto each thread.
+ * \tparam OutputIterator       [inferred] The random-access iterator type for output \iterator.
+ */
+template <
+    int             BLOCK_THREADS,
+    typename        T,
+    int             ITEMS_PER_THREAD,
+    typename        OutputIterator>
+__device__ __forceinline__ void StoreDirectStriped(
+    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks)
+    OutputIterator  block_itr,                  ///< [in] The thread block's base output iterator for storing to
+    T               (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
+    int             valid_items)                ///< [in] Number of valid items to write
+{
+    // Store directly in striped order
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+    {
+        if ((ITEM * BLOCK_THREADS) + linear_tid < valid_items)
+        {
+            block_itr[(ITEM * BLOCK_THREADS) + linear_tid] = items[ITEM];
+        }
+    }
+}
+
+
+
+//@}  end member group
+/******************************************************************//**
+ * \name Warp-striped arrangement I/O (direct)
+ *********************************************************************/
+//@{
+
+
+/**
+ * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items.
+ *
+ * \warpstriped
+ *
+ * \par Usage Considerations
+ * The number of threads in the thread block must be a multiple of the architecture's warp size.
+ *
+ * \tparam T                    [inferred] The data type to store.
+ * \tparam ITEMS_PER_THREAD     [inferred] The number of consecutive items partitioned onto each thread.
+ * \tparam OutputIterator       [inferred] The random-access iterator type for output \iterator.
+ */
+template <
+    typename        T,
+    int             ITEMS_PER_THREAD,
+    typename        OutputIterator>
+__device__ __forceinline__ void StoreDirectWarpStriped(
+    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks)
+    OutputIterator  block_itr,                  ///< [in] The thread block's base output iterator for storing to
+    T               (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+{
+    int tid         = linear_tid & (CUB_PTX_WARP_THREADS - 1);
+    int wid         = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
+    int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;
+
+    // Store directly in warp-striped order
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+    {
+        block_itr[warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM];
+    }
+}
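+
+/*
+ * Worked example of the warp-striped mapping above (illustrative), assuming
+ * 32-thread warps and ITEMS_PER_THREAD = 4.  Thread (warp w, lane l) writes
+ * item I to offset (w * 128) + l + (I * 32):
+ *
+ *     warp 0, lane 0  : offsets   0,  32,  64,  96
+ *     warp 0, lane 31 : offsets  31,  63,  95, 127
+ *     warp 1, lane 0  : offsets 128, 160, 192, 224
+ */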
+/**
+ * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items, guarded by range
+ *
+ * \warpstriped
+ *
+ * \par Usage Considerations
+ * The number of threads in the thread block must be a multiple of the architecture's warp size.
+ *
+ * \tparam T                    [inferred] The data type to store.
+ * \tparam ITEMS_PER_THREAD     [inferred] The number of consecutive items partitioned onto each thread.
+ * \tparam OutputIterator       [inferred] The random-access iterator type for output \iterator.
+ */
+template <
+    typename        T,
+    int             ITEMS_PER_THREAD,
+    typename        OutputIterator>
+__device__ __forceinline__ void StoreDirectWarpStriped(
+    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks)
+    OutputIterator  block_itr,                  ///< [in] The thread block's base output iterator for storing to
+    T               (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
+    int             valid_items)                ///< [in] Number of valid items to write
+{
+    int tid         = linear_tid & (CUB_PTX_WARP_THREADS - 1);
+    int wid         = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
+    int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;
+
+    // Store directly in warp-striped order
+    #pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+    {
+        if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items)
+        {
+            block_itr[warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM];
+        }
+    }
+}
+
+
+//@}  end member group
+
+
+/** @} */       // end group UtilIo
+
+
+//-----------------------------------------------------------------------------
+// Generic BlockStore abstraction
+//-----------------------------------------------------------------------------
+
+/**
+ * \brief cub::BlockStoreAlgorithm enumerates alternative algorithms for cub::BlockStore to write a blocked arrangement of items across a CUDA thread block to a linear segment of memory.
+ */
+enum BlockStoreAlgorithm
+{
+    /**
+     * \par Overview
+     *
+     * A [blocked arrangement](index.html#sec5sec3) of data is written
+     * directly to memory.  The thread block writes items in a parallel "raking" fashion:
+     * thread<sub>i</sub> writes the i-th segment of consecutive elements.
+     *
+     * \par Performance Considerations
+     * - The utilization of memory transactions (coalescing) decreases as the
+     *   access stride between threads increases (i.e., the number items per thread).
+     */
+    BLOCK_STORE_DIRECT,
+
+    /**
+     * \par Overview
+     *
+     * A [blocked arrangement](index.html#sec5sec3) of data is written directly
+     * to memory using CUDA's built-in vectorized stores as a coalescing optimization.
+     * The thread block writes items in a parallel "raking" fashion: thread<sub>i</sub> uses vector stores to
+     * write the i-th segment of consecutive elements.
+     *
+     * For example, st.global.v4.s32 instructions will be generated when \p T = \p int and \p ITEMS_PER_THREAD % 4 == 0.
+     *
+     * \par Performance Considerations
+     * - The utilization of memory transactions (coalescing) remains high until the
+     *   access stride between threads (i.e., the number items per thread) exceeds the
+     *   maximum vector store width (typically 4 items or 64B, whichever is lower).
+     * - The following conditions will prevent vectorization and writing will fall back to cub::BLOCK_STORE_DIRECT:
+     *   - \p ITEMS_PER_THREAD is odd
+     *   - The \p OutputIterator is not a simple pointer type
+     *   - The block output offset is not quadword-aligned
+     *   - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.)
+     */
+    BLOCK_STORE_VECTORIZE,
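+
+    /*
+     * Worked example of the vectorization rule (illustrative): for T = int,
+     * ITEMS_PER_THREAD = 4 gives MAX_VEC_SIZE = 4, which is a power of two and
+     * divides 4 evenly, so each thread issues one st.global.v4.s32.  For
+     * ITEMS_PER_THREAD = 6, 6 % 4 != 0, so VEC_SIZE falls to 1 and the store
+     * degenerates to scalar BLOCK_STORE_DIRECT behavior.
+     */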
+    /**
+     * \par Overview
+     * A [blocked arrangement](index.html#sec5sec3) is locally
+     * transposed into a [striped arrangement](index.html#sec5sec3)
+     * which is then written to memory.  More specifically, cub::BlockExchange
+     * is used to locally reorder the items into a
+     * [striped arrangement](index.html#sec5sec3), after which the
+     * thread block writes items in a parallel "strip-mining" fashion: consecutive
+     * items owned by thread<sub>i</sub> are written to memory with
+     * stride \p BLOCK_THREADS between them.
+     *
+     * \par Performance Considerations
+     * - The utilization of memory transactions (coalescing) remains high regardless
+     *   of items written per thread.
+     * - The local reordering incurs slightly longer latencies and throughput than the
+     *   direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives.
+     */
+    BLOCK_STORE_TRANSPOSE,
+
+    /**
+     * \par Overview
+     * A [blocked arrangement](index.html#sec5sec3) is locally
+     * transposed into a [warp-striped arrangement](index.html#sec5sec3)
+     * which is then written to memory.  More specifically, cub::BlockExchange
+     * is used to locally reorder the items into a
+     * [warp-striped arrangement](index.html#sec5sec3), after which
+     * each warp writes its own contiguous segment in a parallel "strip-mining" fashion:
+     * consecutive items owned by lane<sub>i</sub> are written to memory
+     * with stride \p WARP_THREADS between them.
+     *
+     * \par Performance Considerations
+     * - The utilization of memory transactions (coalescing) remains high regardless
+     *   of items written per thread.
+     * - The local reordering incurs slightly longer latencies and throughput than the
+     *   direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives.
+     */
+    BLOCK_STORE_WARP_TRANSPOSE,
+};
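+
+/*
+ * A sketch of how the policy is selected in practice (illustrative typedefs;
+ * the parameterization <int*, 128, 4, ...> is just an example):
+ *
+ *     typedef cub::BlockStore<int*, 128, 4, cub::BLOCK_STORE_VECTORIZE>      VectorizedStore;
+ *     typedef cub::BlockStore<int*, 128, 4, cub::BLOCK_STORE_WARP_TRANSPOSE> TransposedStore;
+ *
+ * Both expose the same Store() interface; only the memory access strategy and
+ * the TempStorage footprint differ.
+ */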
+
+
+/**
+ * \brief The BlockStore class provides [collective](index.html#sec0) data movement methods for writing a [blocked arrangement](index.html#sec5sec3) of items partitioned across a CUDA thread block to a linear segment of memory.  ![](block_store_logo.png)
+ * \ingroup BlockModule
+ * \ingroup UtilIo
+ *
+ * \tparam OutputIterator       The output iterator type \iterator.
+ * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
+ * \tparam ITEMS_PER_THREAD     The number of consecutive items partitioned onto each thread.
+ * \tparam ALGORITHM            [optional] cub::BlockStoreAlgorithm tuning policy enumeration.  default: cub::BLOCK_STORE_DIRECT.
+ * \tparam WARP_TIME_SLICING    [optional] Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage). (default: false)
+ * \tparam BLOCK_DIM_Y          [optional] The thread block length in threads along the Y dimension (default: 1)
+ * \tparam BLOCK_DIM_Z          [optional] The thread block length in threads along the Z dimension (default: 1)
+ * \tparam PTX_ARCH             [optional] \ptxversion
+ *
+ * \par Overview
+ * - The BlockStore class provides a single data movement abstraction that can be specialized
+ *   to implement different cub::BlockStoreAlgorithm strategies.  This facilitates different
+ *   performance policies for different architectures, data types, granularity sizes, etc.
+ * - BlockStore can be optionally specialized by different data movement strategies:
+ *   -# cub::BLOCK_STORE_DIRECT.  A [blocked arrangement](index.html#sec5sec3) of data is written
+ *      directly to memory. [More...](\ref cub::BlockStoreAlgorithm)
+ *   -# cub::BLOCK_STORE_VECTORIZE.  A [blocked arrangement](index.html#sec5sec3)
+ *      of data is written directly to memory using CUDA's built-in vectorized stores as a
+ *      coalescing optimization. [More...](\ref cub::BlockStoreAlgorithm)
+ *   -# cub::BLOCK_STORE_TRANSPOSE.  A [blocked arrangement](index.html#sec5sec3)
+ *      is locally transposed into a [striped arrangement](index.html#sec5sec3) which is
+ *      then written to memory. [More...](\ref cub::BlockStoreAlgorithm)
+ *   -# cub::BLOCK_STORE_WARP_TRANSPOSE.  A [blocked arrangement](index.html#sec5sec3)
+ *      is locally transposed into a [warp-striped arrangement](index.html#sec5sec3) which is
+ *      then written to memory. [More...](\ref cub::BlockStoreAlgorithm)
+ * - \rowmajor
+ *
+ * \par A Simple Example
+ * \blockcollective{BlockStore}
+ * \par
+ * The code snippet below illustrates the storing of a "blocked" arrangement
+ * of 512 integers across 128 threads (where each thread owns 4 consecutive items)
+ * into a linear segment of memory.  The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE,
+ * meaning items are locally reordered among threads so that memory references will be
+ * efficiently coalesced using a warp-striped access pattern.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/block/block_store.cuh>
+ *
+ * __global__ void ExampleKernel(int *d_data, ...)
+ * {
+ *     // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each
+ *     typedef cub::BlockStore<int*, 128, 4, BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
+ *
+ *     // Allocate shared memory for BlockStore
+ *     __shared__ typename BlockStore::TempStorage temp_storage;
+ *
+ *     // Obtain a segment of consecutive items that are blocked across threads
+ *     int thread_data[4];
+ *     ...
+ *
+ *     // Store items to linear memory
+ *     BlockStore(temp_storage).Store(d_data, thread_data);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of \p thread_data across the block of threads is
+ * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }.
+ * The output \p d_data will be 0, 1, 2, 3, 4, 5, ....
+ *
+ */
+template <
+    typename            OutputIterator,
+    int                 BLOCK_DIM_X,
+    int                 ITEMS_PER_THREAD,
+    BlockStoreAlgorithm ALGORITHM           = BLOCK_STORE_DIRECT,
+    bool                WARP_TIME_SLICING   = false,
+    int                 BLOCK_DIM_Y         = 1,
+    int                 BLOCK_DIM_Z         = 1,
+    int                 PTX_ARCH            = CUB_PTX_ARCH>
+class BlockStore
+{
+private:
+    /******************************************************************************
+     * Constants and typed definitions
+     ******************************************************************************/
+
+    /// Constants
+    enum
+    {
+        /// The thread block size in threads
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+    };
+
+    // Data type of input iterator
+    typedef typename std::iterator_traits<OutputIterator>::value_type T;
+
+
+    /******************************************************************************
+     * Algorithmic variants
+     ******************************************************************************/
+
+    /// Store helper
+    template <BlockStoreAlgorithm _POLICY, int DUMMY>
+    struct StoreInternal;
+
+
+    /**
+     * BLOCK_STORE_DIRECT specialization of store helper
+     */
+    template <int DUMMY>
+    struct StoreInternal<BLOCK_STORE_DIRECT, DUMMY>
+    {
+        /// Shared memory storage layout type
+        typedef NullType TempStorage;
+
+        /// Linear thread-id
+        int linear_tid;
+
+        /// Constructor
+        __device__ __forceinline__ StoreInternal(
+            TempStorage &temp_storage,
+            int linear_tid)
+        :
+            linear_tid(linear_tid)
+        {}
+
+        /// Store items into a linear segment of memory
+        __device__ __forceinline__ void Store(
+            OutputIterator  block_itr,                  ///< [in] The thread block's base output iterator for storing to
+            T               (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+        {
+            StoreDirectBlocked(linear_tid, block_itr, items);
+        }
+
+        /// Store items into a linear segment of memory, guarded by range
+        __device__ __forceinline__ void Store(
+            OutputIterator  block_itr,                  ///< [in] The thread block's base output iterator for
storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write + { + StoreDirectBlocked(linear_tid, block_itr, items, valid_items); + } + }; + + + /** + * BLOCK_STORE_VECTORIZE specialization of store helper + */ + template + struct StoreInternal + { + /// Shared memory storage layout type + typedef NullType TempStorage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ StoreInternal( + TempStorage &temp_storage, + int linear_tid) + : + linear_tid(linear_tid) + {} + + /// Store items into a linear segment of memory, specialized for native pointer types (attempts vectorization) + __device__ __forceinline__ void Store( + T *block_ptr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + StoreDirectBlockedVectorized(linear_tid, block_ptr, items); + } + + /// Store items into a linear segment of memory, specialized for opaque input iterators (skips vectorization) + template + __device__ __forceinline__ void Store( + _OutputIterator block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + StoreDirectBlocked(linear_tid, block_itr, items); + } + + /// Store items into a linear segment of memory, guarded by range + __device__ __forceinline__ void Store( + OutputIterator block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write + { + StoreDirectBlocked(linear_tid, block_itr, items, valid_items); + } + }; + + + /** + * BLOCK_STORE_TRANSPOSE specialization of store helper + */ + template + struct StoreInternal + { + // BlockExchange utility type for keys + typedef BlockExchange BlockExchange; + + /// Shared memory storage layout type + typedef typename BlockExchange::TempStorage _TempStorage; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ StoreInternal( + TempStorage &temp_storage, + int linear_tid) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + /// Store items into a linear segment of memory + __device__ __forceinline__ void Store( + OutputIterator block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + BlockExchange(temp_storage).BlockedToStriped(items); + StoreDirectStriped(linear_tid, block_itr, items); + } + + /// Store items into a linear segment of memory, guarded by range + __device__ __forceinline__ void Store( + OutputIterator block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write + { + BlockExchange(temp_storage).BlockedToStriped(items); + StoreDirectStriped(linear_tid, block_itr, items, valid_items); + } + }; + + + /** + * BLOCK_STORE_WARP_TRANSPOSE specialization of store helper + */ + template + struct StoreInternal + { + enum + { + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH) + }; + + // Assert BLOCK_THREADS must be a multiple of WARP_THREADS + CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of 
WARP_THREADS"); + + // BlockExchange utility type for keys + typedef BlockExchange BlockExchange; + + /// Shared memory storage layout type + typedef typename BlockExchange::TempStorage _TempStorage; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ StoreInternal( + TempStorage &temp_storage, + int linear_tid) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + /// Store items into a linear segment of memory + __device__ __forceinline__ void Store( + OutputIterator block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + BlockExchange(temp_storage).BlockedToWarpStriped(items); + StoreDirectWarpStriped(linear_tid, block_itr, items); + } + + /// Store items into a linear segment of memory, guarded by range + __device__ __forceinline__ void Store( + OutputIterator block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write + { + BlockExchange(temp_storage).BlockedToWarpStriped(items); + StoreDirectWarpStriped(linear_tid, block_itr, items, valid_items); + } + }; + + /****************************************************************************** + * Type definitions + ******************************************************************************/ + + /// Internal load implementation to use + typedef StoreInternal InternalStore; + + + /// Shared memory storage layout type + typedef typename InternalStore::TempStorage _TempStorage; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + +public: + + + /// \smemstorage{BlockStore} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockStore() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. 
+     */
+    __device__ __forceinline__ BlockStore(
+        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Data movement
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Store items into a linear segment of memory.
+     *
+     * \par
+     * - \blocked
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the storing of a "blocked" arrangement
+     * of 512 integers across 128 threads (where each thread owns 4 consecutive items)
+     * into a linear segment of memory.  The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE,
+     * meaning items are locally reordered among threads so that memory references will be
+     * efficiently coalesced using a warp-striped access pattern.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_store.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, ...)
+     * {
+     *     // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each
+     *     typedef cub::BlockStore<int*, 128, 4, BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
+     *
+     *     // Allocate shared memory for BlockStore
+     *     __shared__ typename BlockStore::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Store items to linear memory
+     *     BlockStore(temp_storage).Store(d_data, thread_data);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of \p thread_data across the block of threads is
+     * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }.
+     * The output \p d_data will be 0, 1, 2, 3, 4, 5, ....
+     *
+     */
+    __device__ __forceinline__ void Store(
+        OutputIterator  block_itr,                  ///< [in] The thread block's base output iterator for storing to
+        T               (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
+    {
+        InternalStore(temp_storage, linear_tid).Store(block_itr, items);
+    }
+
+    /**
+     * \brief Store items into a linear segment of memory, guarded by range.
+     *
+     * \par
+     * - \blocked
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the guarded storing of a "blocked" arrangement
+     * of 512 integers across 128 threads (where each thread owns 4 consecutive items)
+     * into a linear segment of memory.  The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE,
+     * meaning items are locally reordered among threads so that memory references will be
+     * efficiently coalesced using a warp-striped access pattern.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_store.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, int valid_items, ...)
+     * {
+     *     // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each
+     *     typedef cub::BlockStore<int*, 128, 4, BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
+     *
+     *     // Allocate shared memory for BlockStore
+     *     __shared__ typename BlockStore::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Store items to linear memory
+     *     BlockStore(temp_storage).Store(d_data, thread_data, valid_items);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of \p thread_data across the block of threads is
+     * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] } and \p valid_items is \p 5.
+ * The output \p d_data will be 0, 1, 2, 3, 4, ?, ?, ?, ..., with + * only the first two threads being unmasked to store portions of valid data. + * + */ + __device__ __forceinline__ void Store( + OutputIterator block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write + { + InternalStore(temp_storage, linear_tid).Store(block_itr, items, valid_items); + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/SRC/cub/block/specializations/block_histogram_atomic.cuh b/SRC/cub/block/specializations/block_histogram_atomic.cuh new file mode 100644 index 00000000..ec4159ee --- /dev/null +++ b/SRC/cub/block/specializations/block_histogram_atomic.cuh @@ -0,0 +1,82 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. + */ + +#pragma once + +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief The BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 
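+ *
+ * \par
+ * The idea in miniature (an illustrative sketch, not a CUB API; smem_histogram
+ * and samples are placeholder names):
+ * \code
+ * __shared__ unsigned int smem_histogram[256];
+ * // ... zero smem_histogram cooperatively, then __syncthreads() ...
+ * unsigned char samples[4];
+ * // ... load this thread's samples ...
+ * for (int i = 0; i < 4; ++i)
+ *     atomicAdd(smem_histogram + samples[i], 1);   // one atomic bump per sample
+ * \endcode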
+ */
+template <int BINS>
+struct BlockHistogramAtomic
+{
+    /// Shared memory storage layout type
+    struct TempStorage {};
+
+
+    /// Constructor
+    __device__ __forceinline__ BlockHistogramAtomic(
+        TempStorage &temp_storage)
+    {}
+
+
+    /// Composite data onto an existing histogram
+    template <
+        typename            T,
+        typename            HistoCounter,
+        int                 ITEMS_PER_THREAD>
+    __device__ __forceinline__ void Composite(
+        T                   (&items)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input values to histogram
+        HistoCounter        histogram[BINS])                ///< [out] Reference to shared/global memory histogram
+    {
+        // Update histogram
+        #pragma unroll
+        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+        {
+            atomicAdd(histogram + items[i], 1);
+        }
+    }
+
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/SRC/cub/block/specializations/block_histogram_sort.cuh b/SRC/cub/block/specializations/block_histogram_sort.cuh
new file mode 100644
index 00000000..12766ae5
--- /dev/null
+++ b/SRC/cub/block/specializations/block_histogram_sort.cuh
@@ -0,0 +1,226 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "../../block/block_radix_sort.cuh"
+#include "../../block/block_discontinuity.cuh"
+#include "../../util_ptx.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+
+/**
+ * \brief The BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
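+ *
+ * \par
+ * The idea in miniature (illustrative): after the tile of samples is sorted,
+ * e.g. [0, 0, 1, 1, 1, 3], discontinuities mark where each run of equal bin
+ * values begins and ends, and each bin's count is run_end[b] - run_begin[b]:
+ * bin 0 -> 2, bin 1 -> 3, bin 2 -> 0 (absent bins keep both offsets at
+ * TILE_SIZE), bin 3 -> 1.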
+ */ +template < + typename T, ///< Sample type + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int ITEMS_PER_THREAD, ///< The number of samples per thread + int BINS, ///< The number of bins into which histogram samples may fall + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockHistogramSort +{ + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + // Parameterize BlockRadixSort type for our thread block + typedef BlockRadixSort< + T, + BLOCK_DIM_X, + ITEMS_PER_THREAD, + NullType, + 4, + (PTX_ARCH >= 350) ? true : false, + BLOCK_SCAN_WARP_SCANS, + (PTX_ARCH >= 350) ? cudaSharedMemBankSizeEightByte : cudaSharedMemBankSizeFourByte, + BLOCK_DIM_Y, + BLOCK_DIM_Z, + PTX_ARCH> + BlockRadixSortT; + + // Parameterize BlockDiscontinuity type for our thread block + typedef BlockDiscontinuity< + T, + BLOCK_DIM_X, + BLOCK_DIM_Y, + BLOCK_DIM_Z, + PTX_ARCH> + BlockDiscontinuityT; + + /// Shared memory + union _TempStorage + { + // Storage for sorting bin values + typename BlockRadixSortT::TempStorage sort; + + struct + { + // Storage for detecting discontinuities in the tile of sorted bin values + typename BlockDiscontinuityT::TempStorage flag; + + // Storage for noting begin/end offsets of bin runs in the tile of sorted bin values + unsigned int run_begin[BINS]; + unsigned int run_end[BINS]; + }; + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + // Thread fields + _TempStorage &temp_storage; + int linear_tid; + + + /// Constructor + __device__ __forceinline__ BlockHistogramSort( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + // Discontinuity functor + struct DiscontinuityOp + { + // Reference to temp_storage + _TempStorage &temp_storage; + + // Constructor + __device__ __forceinline__ DiscontinuityOp(_TempStorage &temp_storage) : + temp_storage(temp_storage) + {} + + // Discontinuity predicate + __device__ __forceinline__ bool operator()(const T &a, const T &b, unsigned int b_index) + { + if (a != b) + { + // Note the begin/end offsets in shared storage + temp_storage.run_begin[b] = b_index; + temp_storage.run_end[a] = b_index; + + return true; + } + else + { + return false; + } + } + }; + + + // Composite data onto an existing histogram + template < + typename HistoCounter> + __device__ __forceinline__ void Composite( + T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram + HistoCounter histogram[BINS]) ///< [out] Reference to shared/global memory histogram + { + enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD }; + + // Sort bytes in blocked arrangement + BlockRadixSortT(temp_storage.sort).Sort(items); + + __syncthreads(); + + // Initialize the shared memory's run_begin and run_end for each bin + int histo_offset = 0; + + #pragma unroll + for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) + { + temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE; + temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE; + } + // Finish up with guarded initialization if necessary + if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS)) + { 
+ temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE; + temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE; + } + + __syncthreads(); + + int flags[ITEMS_PER_THREAD]; // unused + + // Compute head flags to demarcate contiguous runs of the same bin in the sorted tile + DiscontinuityOp flag_op(temp_storage); + BlockDiscontinuityT(temp_storage.flag).FlagHeads(flags, items, flag_op); + + // Update begin for first item + if (linear_tid == 0) temp_storage.run_begin[items[0]] = 0; + + __syncthreads(); + + // Composite into histogram + histo_offset = 0; + + #pragma unroll + for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) + { + int thread_offset = histo_offset + linear_tid; + HistoCounter count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset]; + histogram[thread_offset] += count; + } + + // Finish up with guarded composition if necessary + if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS)) + { + int thread_offset = histo_offset + linear_tid; + HistoCounter count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset]; + histogram[thread_offset] += count; + } + } + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/SRC/cub/block/specializations/block_reduce_raking.cuh b/SRC/cub/block/specializations/block_reduce_raking.cuh new file mode 100644 index 00000000..3bddce65 --- /dev/null +++ b/SRC/cub/block/specializations/block_reduce_raking.cuh @@ -0,0 +1,247 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. 
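+ *
+ * \par
+ * Shape of the computation (illustrative, for BLOCK_THREADS = 128 and 32-thread
+ * warps): all 128 threads place their partials into a raking grid in shared
+ * memory; 32 raking threads then each serially reduce a segment of 4 partials;
+ * a single warp reduction produces the block-wide total.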
+ */
+
+#pragma once
+
+#include "../../block/block_raking_layout.cuh"
+#include "../../warp/warp_reduce.cuh"
+#include "../../thread/thread_reduce.cuh"
+#include "../../util_ptx.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block.  Supports non-commutative reduction operators.
+ *
+ * Supports non-commutative binary reduction operators.  Unlike commutative
+ * reduction operators (e.g., addition), the application of a non-commutative
+ * reduction operator (e.g., string concatenation) across a sequence of inputs must
+ * honor the relative ordering of items and partial reductions when applying the
+ * reduction operator.
+ *
+ * Compared to the implementation of BlockReduceRakingCommutativeOnly (which does
+ * not support non-commutative operators), this implementation requires a few extra
+ * rounds of inter-thread communication.
+ */
+template <
+    typename    T,              ///< Data type being reduced
+    int         BLOCK_DIM_X,    ///< The thread block length in threads along the X dimension
+    int         BLOCK_DIM_Y,    ///< The thread block length in threads along the Y dimension
+    int         BLOCK_DIM_Z,    ///< The thread block length in threads along the Z dimension
+    int         PTX_ARCH>       ///< The PTX compute capability for which to specialize this collective
+struct BlockReduceRaking
+{
+    /// Constants
+    enum
+    {
+        /// The thread block size in threads
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+    };
+
+    /// Layout type for padded thread block raking grid
+    typedef BlockRakingLayout<T, BLOCK_THREADS, PTX_ARCH> BlockRakingLayout;
+
+    ///  WarpReduce utility type
+    typedef typename WarpReduce<T, BlockRakingLayout::RAKING_THREADS, PTX_ARCH>::InternalWarpReduce WarpReduce;
+
+    /// Constants
+    enum
+    {
+        /// Number of raking threads
+        RAKING_THREADS = BlockRakingLayout::RAKING_THREADS,
+
+        /// Number of raking elements per warp synchronous raking thread
+        SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH,
+
+        /// Cooperative work can be entirely warp synchronous
+        WARP_SYNCHRONOUS = (RAKING_THREADS == BLOCK_THREADS),
+
+        /// Whether or not warp-synchronous reduction should be unguarded (i.e., the number of warp-reduction elements is a power of two)
+        WARP_SYNCHRONOUS_UNGUARDED = PowerOfTwo<RAKING_THREADS>::VALUE,
+
+        /// Whether or not accesses into smem are unguarded
+        RAKING_UNGUARDED = BlockRakingLayout::UNGUARDED,
+
+    };
+
+
+    /// Shared memory storage layout type
+    struct _TempStorage
+    {
+        typename WarpReduce::TempStorage            warp_storage;   ///< Storage for warp-synchronous reduction
+        typename BlockRakingLayout::TempStorage     raking_grid;    ///< Padded threadblock raking grid
+    };
+
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    // Thread fields
+    _TempStorage &temp_storage;
+    int linear_tid;
+
+
+    /// Constructor
+    __device__ __forceinline__ BlockReduceRaking(
+        TempStorage &temp_storage)
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    template <bool FULL_TILE, int ITERATION, typename ReductionOp>
+    __device__ __forceinline__ T RakingReduction(
+        ReductionOp         reduction_op,       ///< [in] Binary scan operator
+        T                   *raking_segment,
+        T                   partial,            ///< [in] [lane0 only] Warp-wide aggregate reduction of input items
+        int                 num_valid,          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+        Int2Type<ITERATION> iteration)
+    {
+        // Update partial if addend is in range
+        if ((FULL_TILE && RAKING_UNGUARDED) || ((linear_tid * SEGMENT_LENGTH) + ITERATION < num_valid))
+        {
+            T addend =
raking_segment[ITERATION]; + partial = reduction_op(partial, addend); + } + return RakingReduction(reduction_op, raking_segment, partial, num_valid, Int2Type()); + } + + template + __device__ __forceinline__ T RakingReduction( + ReductionOp reduction_op, ///< [in] Binary scan operator + T *raking_segment, + T partial, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items + int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + Int2Type iteration) + { + return partial; + } + + + /// Computes a threadblock-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. + template + __device__ __forceinline__ T Sum( + T partial, ///< [in] Calling thread's input partial reductions + int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + { + cub::Sum reduction_op; + + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp synchronous reduction (unguarded if active threads is a power-of-two) + partial = WarpReduce(temp_storage.warp_storage).template Sum( + partial, + num_valid); + } + else + { + // Place partial into shared memory grid. + *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid) = partial; + + __syncthreads(); + + // Reduce parallelism to one warp + if (linear_tid < RAKING_THREADS) + { + // Raking reduction in grid + T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); + partial = raking_segment[0]; + + partial = RakingReduction(reduction_op, raking_segment, partial, num_valid, Int2Type<1>()); + + partial = WarpReduce(temp_storage.warp_storage).template Sum( + partial, + num_valid); + } + } + + return partial; + } + + + /// Computes a threadblock-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. + template < + bool FULL_TILE, + typename ReductionOp> + __device__ __forceinline__ T Reduce( + T partial, ///< [in] Calling thread's input partial reductions + int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + ReductionOp reduction_op) ///< [in] Binary reduction operator + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp synchronous reduction (unguarded if active threads is a power-of-two) + partial = WarpReduce(temp_storage.warp_storage).template Reduce( + partial, + num_valid, + reduction_op); + } + else + { + // Place partial into shared memory grid. 
+ *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid) = partial; + + __syncthreads(); + + // Reduce parallelism to one warp + if (linear_tid < RAKING_THREADS) + { + // Raking reduction in grid + T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); + partial = raking_segment[0]; + + partial = RakingReduction(reduction_op, raking_segment, partial, num_valid, Int2Type<1>()); + + partial = WarpReduce(temp_storage.warp_storage).template Reduce( + partial, + num_valid, + reduction_op); + } + } + + return partial; + } + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/SRC/cub/block/specializations/block_reduce_raking_commutative_only.cuh b/SRC/cub/block/specializations/block_reduce_raking_commutative_only.cuh new file mode 100644 index 00000000..d0d73678 --- /dev/null +++ b/SRC/cub/block/specializations/block_reduce_raking_commutative_only.cuh @@ -0,0 +1,202 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block. Does not support non-commutative reduction operators. + */ + +#pragma once + +#include "block_reduce_raking.cuh" +#include "../../warp/warp_reduce.cuh" +#include "../../thread/thread_reduce.cuh" +#include "../../util_ptx.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block. Does not support non-commutative reduction operators. Does not support block sizes that are not a multiple of the warp size. 
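+ *
+ * \par
+ * The commutative-only trick (illustrative, for BLOCK_THREADS = 128 and
+ * 32-thread warps): the 32 raking threads keep their own partials in registers
+ * while the remaining SHARING_THREADS = 96 threads deposit theirs into the
+ * raking grid; each raking thread then folds in SEGMENT_LENGTH = 96 / 32 = 3
+ * shared partials before the final warp reduction.  Commutativity is what
+ * permits reordering the operands this way.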
+ */
+template <
+    typename    T,              ///< Data type being reduced
+    int         BLOCK_DIM_X,    ///< The thread block length in threads along the X dimension
+    int         BLOCK_DIM_Y,    ///< The thread block length in threads along the Y dimension
+    int         BLOCK_DIM_Z,    ///< The thread block length in threads along the Z dimension
+    int         PTX_ARCH>       ///< The PTX compute capability for which to specialize this collective
+struct BlockReduceRakingCommutativeOnly
+{
+    /// Constants
+    enum
+    {
+        /// The thread block size in threads
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+    };
+
+    // The fall-back implementation to use when BLOCK_THREADS is not a multiple of the warp size or not all threads have valid values
+    typedef BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> FallBack;
+
+    /// Constants
+    enum
+    {
+        /// Number of warp threads
+        WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),
+
+        /// Whether or not to use fall-back
+        USE_FALLBACK = ((BLOCK_THREADS % WARP_THREADS != 0) || (BLOCK_THREADS <= WARP_THREADS)),
+
+        /// Number of raking threads
+        RAKING_THREADS = WARP_THREADS,
+
+        /// Number of threads actually sharing items with the raking threads
+        SHARING_THREADS = CUB_MAX(1, BLOCK_THREADS - RAKING_THREADS),
+
+        /// Number of raking elements per warp synchronous raking thread
+        SEGMENT_LENGTH = SHARING_THREADS / WARP_THREADS,
+    };
+
+    ///  WarpReduce utility type
+    typedef WarpReduce<T, RAKING_THREADS, PTX_ARCH> WarpReduce;
+
+    /// Layout type for padded thread block raking grid
+    typedef BlockRakingLayout<T, SHARING_THREADS, PTX_ARCH> BlockRakingLayout;
+
+    /// Shared memory storage layout type
+    struct _TempStorage
+    {
+        union
+        {
+            struct
+            {
+                typename WarpReduce::TempStorage            warp_storage;       ///< Storage for warp-synchronous reduction
+                typename BlockRakingLayout::TempStorage     raking_grid;        ///< Padded threadblock raking grid
+            };
+            typename FallBack::TempStorage                  fallback_storage;   ///< Fall-back storage for non-commutative block scan
+        };
+    };
+
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    // Thread fields
+    _TempStorage &temp_storage;
+    int linear_tid;
+
+
+    /// Constructor
+    __device__ __forceinline__ BlockReduceRakingCommutativeOnly(
+        TempStorage &temp_storage)
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    /// Computes a threadblock-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial.  The return value is only valid for thread0.
+    template <bool FULL_TILE>
+    __device__ __forceinline__ T Sum(
+        T                   partial,        ///< [in] Calling thread's input partial reductions
+        int                 num_valid)      ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
+    {
+        if (USE_FALLBACK || !FULL_TILE)
+        {
+            return FallBack(temp_storage.fallback_storage).template Sum<FULL_TILE>(partial, num_valid);
+        }
+        else
+        {
+            // Place partial into shared memory grid
+            if (linear_tid >= RAKING_THREADS)
+                *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid - RAKING_THREADS) = partial;
+
+            __syncthreads();
+
+            // Reduce parallelism to one warp
+            if (linear_tid < RAKING_THREADS)
+            {
+                // Raking reduction in grid
+                T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
+                partial = ThreadReduce<SEGMENT_LENGTH>(raking_segment, cub::Sum(), partial);
+
+                // Warpscan
+                partial = WarpReduce(temp_storage.warp_storage).Sum(partial);
+            }
+        }
+
+        return partial;
+    }
+
+
+    /// Computes a threadblock-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial.
The return value is only valid for thread0. + template < + bool FULL_TILE, + typename ReductionOp> + __device__ __forceinline__ T Reduce( + T partial, ///< [in] Calling thread's input partial reductions + int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + ReductionOp reduction_op) ///< [in] Binary reduction operator + { + if (USE_FALLBACK || !FULL_TILE) + { + return FallBack(temp_storage.fallback_storage).template Reduce(partial, num_valid, reduction_op); + } + else + { + // Place partial into shared memory grid + if (linear_tid >= RAKING_THREADS) + *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid - RAKING_THREADS) = partial; + + __syncthreads(); + + // Reduce parallelism to one warp + if (linear_tid < RAKING_THREADS) + { + // Raking reduction in grid + T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); + partial = ThreadReduce(raking_segment, reduction_op, partial); + + // Warpscan + partial = WarpReduce(temp_storage.warp_storage).Reduce(partial, reduction_op); + } + } + + return partial; + } + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/SRC/cub/block/specializations/block_reduce_warp_reductions.cuh b/SRC/cub/block/specializations/block_reduce_warp_reductions.cuh new file mode 100644 index 00000000..648650f1 --- /dev/null +++ b/SRC/cub/block/specializations/block_reduce_warp_reductions.cuh @@ -0,0 +1,222 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA threadblock. Supports non-commutative reduction operators. 
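+ *
+ * \par
+ * Shape of the computation (illustrative, for BLOCK_THREADS = 128 and
+ * 32-thread warps): each of the four warps reduces its own 32 inputs
+ * independently; lane 0 of every warp publishes its warp aggregate to shared
+ * memory; thread 0 then folds the four warp aggregates, in warp order, into
+ * the block-wide total.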
+ */ + +#pragma once + +#include "../../warp/warp_reduce.cuh" +#include "../../util_ptx.cuh" +#include "../../util_arch.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA threadblock. Supports non-commutative reduction operators. + */ +template < + typename T, ///< Data type being reduced + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockReduceWarpReductions +{ + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + /// Number of warp threads + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), + + /// Number of active warps + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + + /// The logical warp size for warp reductions + LOGICAL_WARP_SIZE = CUB_MIN(BLOCK_THREADS, WARP_THREADS), + + /// Whether or not the logical warp size evenly divides the threadblock size + EVEN_WARP_MULTIPLE = (BLOCK_THREADS % LOGICAL_WARP_SIZE == 0) + }; + + + /// WarpReduce utility type + typedef typename WarpReduce::InternalWarpReduce WarpReduce; + + + /// Shared memory storage layout type + struct _TempStorage + { + typename WarpReduce::TempStorage warp_reduce[WARPS]; ///< Buffer for warp-synchronous scan + T warp_aggregates[WARPS]; ///< Shared totals from each warp-synchronous scan + T block_prefix; ///< Shared prefix for the entire threadblock + }; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + // Thread fields + _TempStorage &temp_storage; + int linear_tid; + int warp_id; + int lane_id; + + + /// Constructor + __device__ __forceinline__ BlockReduceWarpReductions( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), + warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS), + lane_id(LaneId()) + {} + + + template + __device__ __forceinline__ T ApplyWarpAggregates( + ReductionOp reduction_op, ///< [in] Binary scan operator + T warp_aggregate, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items + int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + Int2Type successor_warp) + { + if (FULL_TILE || (SUCCESSOR_WARP * LOGICAL_WARP_SIZE < num_valid)) + { + T addend = temp_storage.warp_aggregates[SUCCESSOR_WARP]; + warp_aggregate = reduction_op(warp_aggregate, addend); + } + return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid, Int2Type()); + } + + template + __device__ __forceinline__ T ApplyWarpAggregates( + ReductionOp reduction_op, ///< [in] Binary scan operator + T warp_aggregate, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items + int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + Int2Type successor_warp) + { + return warp_aggregate; + } + + + /// Returns block-wide aggregate in thread0. 
+ template < + bool FULL_TILE, + typename ReductionOp> + __device__ __forceinline__ T ApplyWarpAggregates( + ReductionOp reduction_op, ///< [in] Binary scan operator + T warp_aggregate, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items + int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + { + // Share lane aggregates + if (lane_id == 0) + { + temp_storage.warp_aggregates[warp_id] = warp_aggregate; + } + + __syncthreads(); + + // Update total aggregate in warp 0, lane 0 + if (linear_tid == 0) + { + warp_aggregate = ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid, Int2Type<1>()); + } + + return warp_aggregate; + } + + + /// Computes a threadblock-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. + template + __device__ __forceinline__ T Sum( + T input, ///< [in] Calling thread's input partial reductions + int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + { + cub::Sum reduction_op; + unsigned int warp_offset = warp_id * LOGICAL_WARP_SIZE; + unsigned int warp_num_valid = (FULL_TILE && EVEN_WARP_MULTIPLE) ? + LOGICAL_WARP_SIZE : + (warp_offset < num_valid) ? + num_valid - warp_offset : + 0; + + // Warp reduction in every warp + T warp_aggregate = WarpReduce(temp_storage.warp_reduce[warp_id]).template Sum<(FULL_TILE && EVEN_WARP_MULTIPLE), 1>( + input, + warp_num_valid); + + // Update outputs and block_aggregate with warp-wide aggregates from lane-0s + return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid); + } + + + /// Computes a threadblock-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. + template < + bool FULL_TILE, + typename ReductionOp> + __device__ __forceinline__ T Reduce( + T input, ///< [in] Calling thread's input partial reductions + int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + ReductionOp reduction_op) ///< [in] Binary reduction operator + { + unsigned int warp_id = (WARPS == 1) ? 0 : (linear_tid / LOGICAL_WARP_SIZE); + unsigned int warp_offset = warp_id * LOGICAL_WARP_SIZE; + unsigned int warp_num_valid = (FULL_TILE && EVEN_WARP_MULTIPLE) ? + LOGICAL_WARP_SIZE : + (warp_offset < num_valid) ? + num_valid - warp_offset : + 0; + + // Warp reduction in every warp + T warp_aggregate = WarpReduce(temp_storage.warp_reduce[warp_id]).template Reduce<(FULL_TILE && EVEN_WARP_MULTIPLE), 1>( + input, + warp_num_valid, + reduction_op); + + // Update outputs and block_aggregate with warp-wide aggregates from lane-0s + return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid); + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/SRC/cub/block/specializations/block_scan_raking.cuh b/SRC/cub/block/specializations/block_scan_raking.cuh new file mode 100644 index 00000000..8ae388da --- /dev/null +++ b/SRC/cub/block/specializations/block_scan_raking.cuh @@ -0,0 +1,788 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + + +/** + * \file + * cub::BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA threadblock. + */ + +#pragma once + +#include "../../util_ptx.cuh" +#include "../../util_arch.cuh" +#include "../../block/block_raking_layout.cuh" +#include "../../thread/thread_reduce.cuh" +#include "../../thread/thread_scan.cuh" +#include "../../warp/warp_scan.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA threadblock. 
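+ *
+ * Strategy implemented below: each thread deposits its partial into a padded
+ * shared-memory raking grid; a single warp of RAKING_THREADS then serially
+ * reduces SEGMENT_LENGTH-item segments of the grid (upsweep), scans the
+ * per-segment totals with a warp-synchronous WarpScan, and seeds a serial
+ * downsweep back through each segment. One-warp blocks (WARP_SYNCHRONOUS)
+ * short-circuit directly to WarpScan. Illustrative selection via the public
+ * wrapper (element type and block size are assumptions):
+ * \code
+ * typedef cub::BlockScan<int, 128, cub::BLOCK_SCAN_RAKING> BlockScan;
+ * __shared__ typename BlockScan::TempStorage temp_storage;
+ * int item = 1;
+ * BlockScan(temp_storage).ExclusiveSum(item, item);   // item <- linear thread rank
+ * \endcode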
+ */ +template < + typename T, ///< Data type being scanned + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + bool MEMOIZE, ///< Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockScanRaking +{ + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + /// Layout type for padded threadblock raking grid + typedef BlockRakingLayout BlockRakingLayout; + + /// Constants + enum + { + /// Number of raking threads + RAKING_THREADS = BlockRakingLayout::RAKING_THREADS, + + /// Number of raking elements per warp synchronous raking thread + SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH, + + /// Cooperative work can be entirely warp synchronous + WARP_SYNCHRONOUS = (BLOCK_THREADS == RAKING_THREADS), + }; + + /// WarpScan utility type + typedef WarpScan WarpScan; + + /// Shared memory storage layout type + struct _TempStorage + { + typename WarpScan::TempStorage warp_scan; ///< Buffer for warp-synchronous scan + typename BlockRakingLayout::TempStorage raking_grid; ///< Padded threadblock raking grid + T block_aggregate; ///< Block aggregate + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + // Thread fields + _TempStorage &temp_storage; + int linear_tid; + T cached_segment[SEGMENT_LENGTH]; + + + /// Constructor + __device__ __forceinline__ BlockScanRaking( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /// Templated reduction + template + __device__ __forceinline__ T GuardedReduce( + T* raking_ptr, ///< [in] Input array + ScanOp scan_op, ///< [in] Binary reduction operator + T raking_partial, ///< [in] Prefix to seed reduction with + Int2Type iteration) + { + if ((BlockRakingLayout::UNGUARDED) || (((linear_tid * SEGMENT_LENGTH) + ITERATION) < BLOCK_THREADS)) + { + T addend = raking_ptr[ITERATION]; + raking_partial = scan_op(raking_partial, addend); + } + + return GuardedReduce(raking_ptr, scan_op, raking_partial, Int2Type()); + } + + + /// Templated reduction (base case) + template + __device__ __forceinline__ T GuardedReduce( + T* raking_ptr, ///< [in] Input array + ScanOp scan_op, ///< [in] Binary reduction operator + T raking_partial, ///< [in] Prefix to seed reduction with + Int2Type iteration) + { + return raking_partial; + } + + + /// Templated copy + template + __device__ __forceinline__ void CopySegment( + T* out, ///< [out] Out array + T* in, ///< [in] Input array + Int2Type iteration) + { + out[ITERATION] = in[ITERATION]; + CopySegment(out, in, Int2Type()); + } + + + /// Templated copy (base case) + __device__ __forceinline__ void CopySegment( + T* out, ///< [out] Out array + T* in, ///< [in] Input array + Int2Type iteration) + {} + + + /// Performs upsweep raking reduction, returning the aggregate + template + __device__ __forceinline__ T Upsweep( + ScanOp scan_op) + { + T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); + + // Read data into registers + CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>()); + + T raking_partial = 
cached_segment[0]; + + return GuardedReduce(cached_segment, scan_op, raking_partial, Int2Type<1>()); + } + + + /// Performs exclusive downsweep raking scan + template + __device__ __forceinline__ void ExclusiveDownsweep( + ScanOp scan_op, + T raking_partial, + bool apply_prefix = true) + { + T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); + + // Read data back into registers + if (!MEMOIZE) + { + CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>()); + } + + ThreadScanExclusive(cached_segment, cached_segment, scan_op, raking_partial, apply_prefix); + + // Write data back to smem + CopySegment(smem_raking_ptr, cached_segment, Int2Type<0>()); + } + + + /// Performs inclusive downsweep raking scan + template + __device__ __forceinline__ void InclusiveDownsweep( + ScanOp scan_op, + T raking_partial, + bool apply_prefix = true) + { + T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); + + // Read data back into registers + if (!MEMOIZE) + { + CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>()); + } + + ThreadScanInclusive(cached_segment, cached_segment, scan_op, raking_partial, apply_prefix); + + // Write data back to smem + CopySegment(smem_raking_ptr, cached_segment, Int2Type<0>()); + } + + + /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &identity, ///< [in] Identity value + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp scan + WarpScan(temp_storage.warp_scan).ExclusiveScan( + input, + output, + identity, + scan_op, + block_aggregate); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + __syncthreads(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction in grid + T raking_partial = Upsweep(scan_op); + + // Exclusive warp synchronous scan + WarpScan(temp_storage.warp_scan).ExclusiveScan( + raking_partial, + raking_partial, + identity, + scan_op, + temp_storage.block_aggregate); + + // Exclusive raking downsweep scan + ExclusiveDownsweep(scan_op, raking_partial); + } + + __syncthreads(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + } + } + + + /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
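+    /// Illustrative sketch of a compatible prefix functor (the name
+    /// BlockRunningPrefix and the int payload are assumptions):
+    /// \code
+    /// struct BlockRunningPrefix
+    /// {
+    ///     int running_total;                       // carried across consecutive tiles
+    ///     __device__ BlockRunningPrefix(int init) : running_total(init) {}
+    ///     __device__ int operator()(int block_aggregate)
+    ///     {
+    ///         int old_prefix = running_total;      // seeds this tile's scan
+    ///         running_total += block_aggregate;    // accumulate for the next tile
+    ///         return old_prefix;
+    ///     }
+    /// };
+    /// \endcode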
+ template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T identity, ///< [in] Identity value + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value) + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs. + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp scan + WarpScan(temp_storage.warp_scan).ExclusiveScan( + input, + output, + identity, + scan_op, + block_aggregate, + block_prefix_callback_op); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + __syncthreads(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction in grid + T raking_partial = Upsweep(scan_op); + + // Exclusive warp synchronous scan + WarpScan(temp_storage.warp_scan).ExclusiveScan( + raking_partial, + raking_partial, + identity, + scan_op, + temp_storage.block_aggregate, + block_prefix_callback_op); + + // Exclusive raking downsweep scan + ExclusiveDownsweep(scan_op, raking_partial); + } + + __syncthreads(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + } + } + + + /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no identity value, the output computed for thread0 is undefined. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp scan + WarpScan(temp_storage.warp_scan).ExclusiveScan( + input, + output, + scan_op, + block_aggregate); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + __syncthreads(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction in grid + T raking_partial = Upsweep(scan_op); + + // Exclusive warp synchronous scan + WarpScan(temp_storage.warp_scan).ExclusiveScan( + raking_partial, + raking_partial, + scan_op, + temp_storage.block_aggregate); + + // Exclusive raking downsweep scan + ExclusiveDownsweep(scan_op, raking_partial, (linear_tid != 0)); + } + + __syncthreads(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + } + } + + + /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. 
the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value) + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs. + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp scan + WarpScan(temp_storage.warp_scan).ExclusiveScan( + input, + output, + scan_op, + block_aggregate, + block_prefix_callback_op); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + __syncthreads(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction in grid + T raking_partial = Upsweep(scan_op); + + // Exclusive warp synchronous scan + WarpScan(temp_storage.warp_scan).ExclusiveScan( + raking_partial, + raking_partial, + scan_op, + temp_storage.block_aggregate, + block_prefix_callback_op); + + // Exclusive raking downsweep scan + ExclusiveDownsweep(scan_op, raking_partial); + } + + __syncthreads(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + } + } + + + /// Computes an exclusive threadblock-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + __device__ __forceinline__ void ExclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp scan + WarpScan(temp_storage.warp_scan).ExclusiveSum( + input, + output, + block_aggregate); + } + else + { + // Raking scan + Sum scan_op; + + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + __syncthreads(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction in grid + T raking_partial = Upsweep(scan_op); + + // Exclusive warp synchronous scan + WarpScan(temp_storage.warp_scan).ExclusiveSum( + raking_partial, + raking_partial, + temp_storage.block_aggregate); + + // Exclusive raking downsweep scan + ExclusiveDownsweep(scan_op, raking_partial); + } + + __syncthreads(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + } + } + + + /// Computes an exclusive threadblock-wide prefix scan using addition (+) as the scan operator. 
Each thread contributes one input element. Instead of using 0 as the threadblock-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    template <typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void ExclusiveSum(
+        T input,                        ///< [in] Calling thread's input item
+        T &output,                      ///< [out] Calling thread's output item (may be aliased to \p input)
+        T &block_aggregate,             ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value)
+        BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs.
+    {
+        if (WARP_SYNCHRONOUS)
+        {
+            // Short-circuit directly to warp scan
+            WarpScan(temp_storage.warp_scan).ExclusiveSum(
+                input,
+                output,
+                block_aggregate,
+                block_prefix_callback_op);
+        }
+        else
+        {
+            // Raking scan
+            Sum scan_op;
+
+            // Place thread partial into shared memory raking grid
+            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
+            *placement_ptr = input;
+
+            __syncthreads();
+
+            // Reduce parallelism down to just raking threads
+            if (linear_tid < RAKING_THREADS)
+            {
+                // Raking upsweep reduction in grid
+                T raking_partial = Upsweep(scan_op);
+
+                // Exclusive warp synchronous scan
+                WarpScan(temp_storage.warp_scan).ExclusiveSum(
+                    raking_partial,
+                    raking_partial,
+                    temp_storage.block_aggregate,
+                    block_prefix_callback_op);
+
+                // Exclusive raking downsweep scan
+                ExclusiveDownsweep(scan_op, raking_partial);
+            }
+
+            __syncthreads();
+
+            // Grab thread prefix from shared memory
+            output = *placement_ptr;
+
+            // Retrieve block aggregate
+            block_aggregate = temp_storage.block_aggregate;
+        }
+    }
+
+
+    /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs.
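+    /// (Inclusivity is obtained below by scanning the raking partials with an
+    /// exclusive warp scan and then performing an inclusive serial downsweep;
+    /// raking thread 0 has no predecessor segment, so it skips the seed prefix
+    /// via apply_prefix = (linear_tid != 0).)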
+ template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp scan + WarpScan(temp_storage.warp_scan).InclusiveScan( + input, + output, + scan_op, + block_aggregate); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + __syncthreads(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction in grid + T raking_partial = Upsweep(scan_op); + + // Exclusive warp synchronous scan + WarpScan(temp_storage.warp_scan).ExclusiveScan( + raking_partial, + raking_partial, + scan_op, + temp_storage.block_aggregate); + + // Inclusive raking downsweep scan + InclusiveDownsweep(scan_op, raking_partial, (linear_tid != 0)); + } + + __syncthreads(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + } + } + + + /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value) + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs. + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp scan + WarpScan(temp_storage.warp_scan).InclusiveScan( + input, + output, + scan_op, + block_aggregate, + block_prefix_callback_op); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + __syncthreads(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction in grid + T raking_partial = Upsweep(scan_op); + + // Warp synchronous scan + WarpScan(temp_storage.warp_scan).ExclusiveScan( + raking_partial, + raking_partial, + scan_op, + temp_storage.block_aggregate, + block_prefix_callback_op); + + // Inclusive raking downsweep scan + InclusiveDownsweep(scan_op, raking_partial); + } + + __syncthreads(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + } + } + + + /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. 
Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    __device__ __forceinline__ void InclusiveSum(
+        T input,                        ///< [in] Calling thread's input item
+        T &output,                      ///< [out] Calling thread's output item (may be aliased to \p input)
+        T &block_aggregate)             ///< [out] Threadblock-wide aggregate reduction of input items
+    {
+        if (WARP_SYNCHRONOUS)
+        {
+            // Short-circuit directly to warp scan
+            WarpScan(temp_storage.warp_scan).InclusiveSum(
+                input,
+                output,
+                block_aggregate);
+        }
+        else
+        {
+            // Raking scan
+            Sum scan_op;
+
+            // Place thread partial into shared memory raking grid
+            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
+            *placement_ptr = input;
+
+            __syncthreads();
+
+            // Reduce parallelism down to just raking threads
+            if (linear_tid < RAKING_THREADS)
+            {
+                // Raking upsweep reduction in grid
+                T raking_partial = Upsweep(scan_op);
+
+                // Exclusive warp synchronous scan
+                WarpScan(temp_storage.warp_scan).ExclusiveSum(
+                    raking_partial,
+                    raking_partial,
+                    temp_storage.block_aggregate);
+
+                // Inclusive raking downsweep scan
+                InclusiveDownsweep(scan_op, raking_partial, (linear_tid != 0));
+            }
+
+            __syncthreads();
+
+            // Grab thread prefix from shared memory
+            output = *placement_ptr;
+
+            // Retrieve block aggregate
+            block_aggregate = temp_storage.block_aggregate;
+        }
+    }
+
+
+    /// Computes an inclusive threadblock-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Instead of using 0 as the threadblock-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+    template <typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void InclusiveSum(
+        T input,                        ///< [in] Calling thread's input item
+        T &output,                      ///< [out] Calling thread's output item (may be aliased to \p input)
+        T &block_aggregate,             ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value)
+        BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs.
+ { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp scan + WarpScan(temp_storage.warp_scan).InclusiveSum( + input, + output, + block_aggregate, + block_prefix_callback_op); + } + else + { + // Raking scan + Sum scan_op; + + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + __syncthreads(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction in grid + T raking_partial = Upsweep(scan_op); + + // Warp synchronous scan + WarpScan(temp_storage.warp_scan).ExclusiveSum( + raking_partial, + raking_partial, + temp_storage.block_aggregate, + block_prefix_callback_op); + + // Inclusive raking downsweep scan + InclusiveDownsweep(scan_op, raking_partial); + } + + __syncthreads(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + } + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/SRC/cub/block/specializations/block_scan_warp_scans.cuh b/SRC/cub/block/specializations/block_scan_warp_scans.cuh new file mode 100644 index 00000000..f2d06beb --- /dev/null +++ b/SRC/cub/block/specializations/block_scan_warp_scans.cuh @@ -0,0 +1,421 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockScanWarpscans provides warpscan-based variants of parallel prefix scan across a CUDA threadblock. 
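+ *
+ * In contrast to the raking variant, every warp scans its own inputs
+ * concurrently; per-warp aggregates are then shared through shared memory and
+ * folded, in warp order, into each warp's outputs. Illustrative selection via
+ * the public wrapper (element type and block size are assumptions):
+ * \code
+ * typedef cub::BlockScan<int, 256, cub::BLOCK_SCAN_WARP_SCANS> BlockScan;
+ * __shared__ typename BlockScan::TempStorage temp_storage;
+ * int item = 1;
+ * BlockScan(temp_storage).InclusiveSum(item, item);   // item <- rank + 1
+ * \endcode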
+ */ + +#pragma once + +#include "../../util_arch.cuh" +#include "../../util_ptx.cuh" +#include "../../warp/warp_scan.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA threadblock. + */ +template < + typename T, + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockScanWarpScans +{ + /// Constants + enum + { + /// Number of warp threads + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), + + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + /// Number of active warps + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + }; + + /// WarpScan utility type + typedef WarpScan WarpScan; + + /// Shared memory storage layout type + struct _TempStorage + { + typename WarpScan::TempStorage warp_scan[WARPS]; ///< Buffer for warp-synchronous scans + T warp_aggregates[WARPS]; ///< Shared totals from each warp-synchronous scan + T block_prefix; ///< Shared prefix for the entire threadblock + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + // Thread fields + _TempStorage &temp_storage; + int linear_tid; + int warp_id; + int lane_id; + + + /// Constructor + __device__ __forceinline__ BlockScanWarpScans( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), + warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS), + lane_id(LaneId()) + {} + + template + __device__ __forceinline__ void ApplyWarpAggregates( + T &partial, ///< [out] The calling thread's partial reduction + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items + bool lane_valid, ///< [in] Whether or not the partial belonging to the current thread is valid + Int2Type addend_warp) + { + T inclusive = scan_op(block_aggregate, partial); + if (warp_id == WARP) + { + partial = (lane_valid) ? + inclusive : + block_aggregate; + } + + T addend = temp_storage.warp_aggregates[WARP]; + block_aggregate = scan_op(block_aggregate, addend); + + ApplyWarpAggregates(partial, scan_op, block_aggregate, lane_valid, Int2Type()); + } + + template + __device__ __forceinline__ void ApplyWarpAggregates( + T &partial, ///< [out] The calling thread's partial reduction + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items + bool lane_valid, ///< [in] Whether or not the partial belonging to the current thread is valid + Int2Type addend_warp) + {} + + + /// Update the calling thread's partial reduction with the warp-wide aggregates from preceding warps. Also returns block-wide aggregate in thread0. 
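+    /// (The inclusive warp scan leaves each warp's total in its last lane, so
+    /// lane WARP_THREADS-1 publishes temp_storage.warp_aggregates[warp_id];
+    /// warp w then folds the aggregates of warps 0..w-1 into its partials while
+    /// block_aggregate accumulates the totals of all warps.)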
+ template + __device__ __forceinline__ void ApplyWarpAggregates( + T &partial, ///< [out] The calling thread's partial reduction + ScanOp scan_op, ///< [in] Binary scan operator + T warp_aggregate, ///< [in] [laneWARP_THREADS - 1 only] Warp-wide aggregate reduction of input items + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items + bool lane_valid = true) ///< [in] Whether or not the partial belonging to the current thread is valid + { + // Last lane in each warp shares its warp-aggregate + if (lane_id == WARP_THREADS - 1) + temp_storage.warp_aggregates[warp_id] = warp_aggregate; + + __syncthreads(); + + block_aggregate = temp_storage.warp_aggregates[0]; + +#if __CUDA_ARCH__ <= 130 + + // Use template unrolling for SM1x (since the PTX backend can't handle it) + ApplyWarpAggregates(partial, scan_op, block_aggregate, lane_valid, Int2Type<1>()); + +#else + + // Use the pragma unrolling (since it uses less registers) + #pragma unroll + for (int WARP = 1; WARP < WARPS; WARP++) + { + T inclusive = scan_op(block_aggregate, partial); + if (warp_id == WARP) + { + partial = (lane_valid) ? + inclusive : + block_aggregate; + } + + T addend = temp_storage.warp_aggregates[WARP]; + block_aggregate = scan_op(block_aggregate, addend); + } + +#endif + } + + + /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &identity, ///< [in] Identity value + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + T inclusive_output; + WarpScan(temp_storage.warp_scan[warp_id]).Scan(input, inclusive_output, output, identity, scan_op); + + // Update outputs and block_aggregate with warp-wide aggregates + ApplyWarpAggregates(output, scan_op, inclusive_output, block_aggregate); + } + + + /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T identity, ///< [in] Identity value + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value) + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs. 
+ { + ExclusiveScan(input, output, identity, scan_op, block_aggregate); + + // Use the first warp to determine the threadblock prefix, returning the result in lane0 + if (warp_id == 0) + { + T block_prefix = block_prefix_callback_op(block_aggregate); + if (lane_id == 0) + { + // Share the prefix with all threads + temp_storage.block_prefix = block_prefix; + } + } + + __syncthreads(); + + // Incorporate threadblock prefix into outputs + T block_prefix = temp_storage.block_prefix; + output = scan_op(block_prefix, output); + } + + + /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no identity value, the output computed for thread0 is undefined. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + T inclusive_output; + WarpScan(temp_storage.warp_scan[warp_id]).Scan(input, inclusive_output, output, scan_op); + + // Update outputs and block_aggregate with warp-wide aggregates + ApplyWarpAggregates(output, scan_op, inclusive_output, block_aggregate, (lane_id > 0)); + } + + + /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value) + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs. + { + ExclusiveScan(input, output, scan_op, block_aggregate); + + // Use the first warp to determine the threadblock prefix, returning the result in lane0 + if (warp_id == 0) + { + T block_prefix = block_prefix_callback_op(block_aggregate); + if (lane_id == 0) + { + // Share the prefix with all threads + temp_storage.block_prefix = block_prefix; + } + } + + __syncthreads(); + + // Incorporate threadblock prefix into outputs + T block_prefix = temp_storage.block_prefix; + output = (linear_tid == 0) ? + block_prefix : + scan_op(block_prefix, output); + } + + + /// Computes an exclusive threadblock-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
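+    /// (Each warp's WarpScan::Sum yields both an inclusive and an exclusive
+    /// output; ApplyWarpAggregates then folds the aggregates of the preceding
+    /// warps into the exclusive output held by every thread.)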
+ __device__ __forceinline__ void ExclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + Sum scan_op; + T inclusive_output; + + WarpScan(temp_storage.warp_scan[warp_id]).Sum(input, inclusive_output, output); + + // Update outputs and block_aggregate with warp-wide aggregates from lane WARP_THREADS-1 + ApplyWarpAggregates(output, scan_op, inclusive_output, block_aggregate); + } + + + /// Computes an exclusive threadblock-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Instead of using 0 as the threadblock-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template + __device__ __forceinline__ void ExclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value) + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs. + { + ExclusiveSum(input, output, block_aggregate); + + // Use the first warp to determine the threadblock prefix, returning the result in lane0 + if (warp_id == 0) + { + T block_prefix = block_prefix_callback_op(block_aggregate); + if (lane_id == 0) + { + // Share the prefix with all threads + temp_storage.block_prefix = block_prefix; + } + } + + __syncthreads(); + + // Incorporate threadblock prefix into outputs + Sum scan_op; + T block_prefix = temp_storage.block_prefix; + output = scan_op(block_prefix, output); + } + + + /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + WarpScan(temp_storage.warp_scan[warp_id]).InclusiveScan(input, output, scan_op); + + // Update outputs and block_aggregate with warp-wide aggregates from lane WARP_THREADS-1 + ApplyWarpAggregates(output, scan_op, output, block_aggregate); + + } + + + /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
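+    /// (This callback form is what enables scans over ranges larger than one
+    /// block-tile: a persistent functor, e.g. the running-prefix sketch shown
+    /// earlier in this patch, is invoked once per tile with that tile's
+    /// block_aggregate and returns the prefix carried in from preceding tiles.)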
+ template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value) + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs. + { + InclusiveScan(input, output, scan_op, block_aggregate); + + // Use the first warp to determine the threadblock prefix, returning the result in lane0 + if (warp_id == 0) + { + T block_prefix = block_prefix_callback_op(block_aggregate); + if (lane_id == 0) + { + // Share the prefix with all threads + temp_storage.block_prefix = block_prefix; + } + } + + __syncthreads(); + + // Incorporate threadblock prefix into outputs + T block_prefix = temp_storage.block_prefix; + output = scan_op(block_prefix, output); + } + + + /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + WarpScan(temp_storage.warp_scan[warp_id]).InclusiveSum(input, output); + + // Update outputs and block_aggregate with warp-wide aggregates from lane WARP_THREADS-1 + ApplyWarpAggregates(output, Sum(), output, block_aggregate); + } + + + /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Instead of using 0 as the threadblock-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template + __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value) + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs. 
+ { + InclusiveSum(input, output, block_aggregate); + + // Use the first warp to determine the threadblock prefix, returning the result in lane0 + if (warp_id == 0) + { + T block_prefix = block_prefix_callback_op(block_aggregate); + if (lane_id == 0) + { + // Share the prefix with all threads + temp_storage.block_prefix = block_prefix; + } + } + + __syncthreads(); + + // Incorporate threadblock prefix into outputs + Sum scan_op; + T block_prefix = temp_storage.block_prefix; + output = scan_op(block_prefix, output); + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/SRC/cub/block_range/block_range_histo.cuh b/SRC/cub/block_range/block_range_histo.cuh new file mode 100644 index 00000000..3ad884c1 --- /dev/null +++ b/SRC/cub/block_range/block_range_histo.cuh @@ -0,0 +1,319 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockRangeHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide selection across a range of tiles. + */ + +#pragma once + +#include + +#include "specializations/block_range_histo_gatomic.cuh" +#include "specializations/block_range_histo_satomic.cuh" +#include "specializations/block_range_histo_sort.cuh" +#include "../util_type.cuh" +#include "../grid/grid_mapping.cuh" +#include "../grid/grid_even_share.cuh" +#include "../grid/grid_queue.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Algorithmic variants + ******************************************************************************/ + + +/** + * \brief DeviceHistogramAlgorithm enumerates alternative algorithms for BlockRangeHistogram. 
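+ *
+ * Rule of thumb distilled from the per-variant notes below: DEVICE_HISTO_SORT
+ * delivers distribution-independent throughput; DEVICE_HISTO_SHARED_ATOMIC is
+ * typically fastest for well-spread samples with modest bin counts; and
+ * DEVICE_HISTO_GLOBAL_ATOMIC is the fallback when the bin count is too large
+ * to privatize in shared memory.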
+ */ +enum DeviceHistogramAlgorithm +{ + + /** + * \par Overview + * A two-kernel approach in which: + * -# Thread blocks in the first kernel aggregate their own privatized + * histograms using block-wide sorting (see BlockHistogramAlgorithm::BLOCK_HISTO_SORT). + * -# A single thread block in the second kernel reduces them into the output histogram(s). + * + * \par Performance Considerations + * Delivers consistent throughput regardless of sample bin distribution. + * + * However, because histograms are privatized in shared memory, a large + * number of bins (e.g., thousands) may adversely affect occupancy and + * performance (or even the ability to launch). + */ + DEVICE_HISTO_SORT, + + + /** + * \par Overview + * A two-kernel approach in which: + * -# Thread blocks in the first kernel aggregate their own privatized + * histograms using shared-memory \p atomicAdd(). + * -# A single thread block in the second kernel reduces them into the + * output histogram(s). + * + * \par Performance Considerations + * Performance is strongly tied to the hardware implementation of atomic + * addition, and may be significantly degraded for non uniformly-random + * input distributions where many concurrent updates are likely to be + * made to the same bin counter. + * + * However, because histograms are privatized in shared memory, a large + * number of bins (e.g., thousands) may adversely affect occupancy and + * performance (or even the ability to launch). + */ + DEVICE_HISTO_SHARED_ATOMIC, + + + /** + * \par Overview + * A single-kernel approach in which thread blocks update the output histogram(s) directly + * using global-memory \p atomicAdd(). + * + * \par Performance Considerations + * Performance is strongly tied to the hardware implementation of atomic + * addition, and may be significantly degraded for non uniformly-random + * input distributions where many concurrent updates are likely to be + * made to the same bin counter. + * + * Performance is not significantly impacted when computing histograms having large + * numbers of bins (e.g., thousands). + */ + DEVICE_HISTO_GLOBAL_ATOMIC, + +}; + + +/****************************************************************************** + * Tuning policy + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for BlockRangeHistogram + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + DeviceHistogramAlgorithm _HISTO_ALGORITHM, ///< Cooperative histogram algorithm to use + GridMappingStrategy _GRID_MAPPING> ///< How to map tiles of input onto thread blocks +struct BlockRangeHistogramPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + }; + + static const DeviceHistogramAlgorithm HISTO_ALGORITHM = _HISTO_ALGORITHM; ///< Cooperative histogram algorithm to use + static const GridMappingStrategy GRID_MAPPING = _GRID_MAPPING; ///< How to map tiles of input onto thread blocks +}; + + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief BlockRangeHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide selection across a range of tiles. 
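+ * (That is, device-wide histogram construction: tiles of multi-channel samples
+ * are consumed and routed to the sort-based, shared-atomic, or global-atomic
+ * delegate selected by the tuning policy's HISTO_ALGORITHM.)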
+ */ +template < + typename BlockRangeHistogramPolicy, ///< Parameterized BlockRangeHistogramPolicy tuning policy type + int BINS, ///< Number of histogram bins per channel + int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed) + int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename InputIterator, ///< Random-access input iterator type for reading samples. Must have an an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1] + typename HistoCounter, ///< Integer type for counting sample occurrences per histogram bin + typename Offset> ///< Signed integer type for global offsets +struct BlockRangeHistogram +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // Histogram grid algorithm + static const DeviceHistogramAlgorithm HISTO_ALGORITHM = BlockRangeHistogramPolicy::HISTO_ALGORITHM; + + // Alternative internal implementation types + typedef BlockRangeHistogramSort< BlockRangeHistogramPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIterator, HistoCounter, Offset> BlockRangeHistogramSortT; + typedef BlockRangeHistogramSharedAtomic< BlockRangeHistogramPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIterator, HistoCounter, Offset> BlockRangeHistogramSharedAtomicT; + typedef BlockRangeHistogramGlobalAtomic< BlockRangeHistogramPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIterator, HistoCounter, Offset> BlockRangeHistogramGlobalAtomicT; + + // Internal block sweep histogram type + typedef typename If<(HISTO_ALGORITHM == DEVICE_HISTO_SORT), + BlockRangeHistogramSortT, + typename If<(HISTO_ALGORITHM == DEVICE_HISTO_SHARED_ATOMIC), + BlockRangeHistogramSharedAtomicT, + BlockRangeHistogramGlobalAtomicT>::Type>::Type InternalBlockDelegate; + + enum + { + TILE_ITEMS = InternalBlockDelegate::TILE_ITEMS, + }; + + + // Temporary storage type + typedef typename InternalBlockDelegate::TempStorage TempStorage; + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + // Internal block delegate + InternalBlockDelegate internal_delegate; + + + //--------------------------------------------------------------------- + // Interface + //--------------------------------------------------------------------- + + /** + * Constructor + */ + __device__ __forceinline__ BlockRangeHistogram( + TempStorage &temp_storage, ///< Reference to temp_storage + InputIterator d_in, ///< Input data to reduce + HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]) ///< Reference to output histograms + : + internal_delegate(temp_storage, d_in, d_out_histograms) + {} + + + /** + * \brief Reduce a consecutive segment of input tiles + */ + __device__ __forceinline__ void ConsumeRange( + Offset block_offset, ///< [in] Threadblock begin offset (inclusive) + Offset block_end) ///< [in] Threadblock end offset (exclusive) + { + // Consume subsequent full tiles of input + while (block_offset + TILE_ITEMS <= block_end) + { + internal_delegate.ConsumeTile(block_offset); + block_offset += TILE_ITEMS; + } + + // Consume a partially-full tile + if (block_offset < block_end) + { + int valid_items = block_end - block_offset; + internal_delegate.ConsumeTile(block_offset, valid_items); + } + + // Aggregate output + internal_delegate.AggregateOutput(); + } + + + /** + 
+     * Reduce a consecutive segment of input tiles
+     */
+    __device__ __forceinline__ void ConsumeRange(
+        Offset                              num_items,      ///< [in] Total number of global input items
+        GridEvenShare<Offset>               &even_share,    ///< [in] GridEvenShare descriptor
+        GridQueue<Offset>                   &queue,         ///< [in,out] GridQueue descriptor
+        Int2Type<GRID_MAPPING_EVEN_SHARE>   is_even_share)  ///< [in] Marker type indicating this is an even-share mapping
+    {
+        even_share.BlockInit();
+        ConsumeRange(even_share.block_offset, even_share.block_end);
+    }
+
+
+    /**
+     * Dequeue and reduce tiles of items as part of an inter-block scan
+     */
+    __device__ __forceinline__ void ConsumeRange(
+        int                 num_items,      ///< Total number of input items
+        GridQueue<Offset>   queue)          ///< Queue descriptor for assigning tiles of work to thread blocks
+    {
+        // Shared block offset
+        __shared__ Offset shared_block_offset;
+
+        // We give each thread block at least one tile of input.
+        Offset block_offset     = blockIdx.x * TILE_ITEMS;
+        Offset even_share_base  = gridDim.x * TILE_ITEMS;
+
+        // Process full tiles of input
+        while (block_offset + TILE_ITEMS <= num_items)
+        {
+            internal_delegate.ConsumeTile(block_offset);
+
+            // Dequeue up to TILE_ITEMS
+            if (threadIdx.x == 0)
+                shared_block_offset = queue.Drain(TILE_ITEMS) + even_share_base;
+
+            __syncthreads();
+
+            block_offset = shared_block_offset;
+
+            __syncthreads();
+        }
+
+        // Consume a partially-full tile
+        if (block_offset < num_items)
+        {
+            int valid_items = num_items - block_offset;
+            internal_delegate.ConsumeTile(block_offset, valid_items);
+        }
+
+        // Aggregate output
+        internal_delegate.AggregateOutput();
+    }
+
+
+    /**
+     * Dequeue and reduce tiles of items as part of an inter-block scan
+     */
+    __device__ __forceinline__ void ConsumeRange(
+        Offset                          num_items,      ///< [in] Total number of global input items
+        GridEvenShare<Offset>           &even_share,    ///< [in] GridEvenShare descriptor
+        GridQueue<Offset>               &queue,         ///< [in,out] GridQueue descriptor
+        Int2Type<GRID_MAPPING_DYNAMIC>  is_dynamic)     ///< [in] Marker type indicating this is a dynamic mapping
+    {
+        ConsumeRange(num_items, queue);
+    }
+
+
+};
+
+
+
+
+} // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/SRC/cub/block_range/block_range_radix_sort_downsweep.cuh b/SRC/cub/block_range/block_range_radix_sort_downsweep.cuh
new file mode 100644
index 00000000..4141315e
--- /dev/null
+++ b/SRC/cub/block_range/block_range_radix_sort_downsweep.cuh
@@ -0,0 +1,744 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * BlockRangeRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep across a range of tiles. + */ + + +#pragma once + +#include "../thread/thread_load.cuh" +#include "../block/block_load.cuh" +#include "../block/block_store.cuh" +#include "../block/block_radix_rank.cuh" +#include "../block/block_exchange.cuh" +#include "../util_type.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Types of scattering strategies + */ +enum RadixSortScatterAlgorithm +{ + RADIX_SORT_SCATTER_DIRECT, ///< Scatter directly from registers to global bins + RADIX_SORT_SCATTER_TWO_PHASE, ///< First scatter from registers into shared memory bins, then into global bins +}; + + +/** + * Parameterizable tuning policy type for BlockRangeRadixSortDownsweep + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading keys (and values) + bool _EXCHANGE_TIME_SLICING, ///< Whether or not to time-slice key/value exchanges through shared memory to lower shared memory pressure + bool _MEMOIZE_OUTER_SCAN, ///< Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure. See BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE for more details. + BlockScanAlgorithm _INNER_SCAN_ALGORITHM, ///< The BlockScan algorithm algorithm to use + RadixSortScatterAlgorithm _SCATTER_ALGORITHM, ///< The scattering strategy to use + cudaSharedMemConfig _SMEM_CONFIG, ///< Shared memory bank mode + int _RADIX_BITS> ///< The number of radix bits, i.e., log2(bins) +struct BlockRangeRadixSortDownsweepPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + EXCHANGE_TIME_SLICING = _EXCHANGE_TIME_SLICING, ///< Whether or not to time-slice key/value exchanges through shared memory to lower shared memory pressure + RADIX_BITS = _RADIX_BITS, ///< The number of radix bits, i.e., log2(bins) + MEMOIZE_OUTER_SCAN = _MEMOIZE_OUTER_SCAN, ///< Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure. See BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE for more details. 
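+        // A hypothetical instantiation, for illustration only (the values
+        // below are not a tuned configuration):
+        //
+        //   BlockRangeRadixSortDownsweepPolicy<
+        //       128, 9,                          // 128 threads x 9 keys per thread
+        //       BLOCK_LOAD_DIRECT, LOAD_DEFAULT, // load algorithm / cache modifier
+        //       false, true,                     // no time-slicing; memoize outer scan
+        //       BLOCK_SCAN_WARP_SCANS,           // inner scan algorithm
+        //       RADIX_SORT_SCATTER_TWO_PHASE,    // scatter via shared memory first
+        //       cudaSharedMemBankSizeFourByte,   // shared memory bank mode
+        //       5>                               // 5 radix bits = 32 bins per pass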
+ }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading keys (and values) + static const BlockScanAlgorithm INNER_SCAN_ALGORITHM = _INNER_SCAN_ALGORITHM; ///< The BlockScan algorithm algorithm to use + static const RadixSortScatterAlgorithm SCATTER_ALGORITHM = _SCATTER_ALGORITHM; ///< The scattering strategy to use + static const cudaSharedMemConfig SMEM_CONFIG = _SMEM_CONFIG; ///< Shared memory bank mode +}; + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief BlockRangeRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep across a range of tiles. + */ +template < + typename BlockRangeRadixSortDownsweepPolicy, ///< Parameterized BlockRangeRadixSortDownsweepPolicy tuning policy type + bool DESCENDING, ///< Whether or not the sorted-order is high-to-low + typename Key, ///< Key type + typename Value, ///< Value type + typename Offset> ///< Signed integer type for global offsets +struct BlockRangeRadixSortDownsweep +{ + //--------------------------------------------------------------------- + // Type definitions and constants + //--------------------------------------------------------------------- + + // Appropriate unsigned-bits representation of Key + typedef typename Traits::UnsignedBits UnsignedBits; + + static const UnsignedBits MIN_KEY = Traits::MIN_KEY; + static const UnsignedBits MAX_KEY = Traits::MAX_KEY; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = BlockRangeRadixSortDownsweepPolicy::LOAD_ALGORITHM; + static const CacheLoadModifier LOAD_MODIFIER = BlockRangeRadixSortDownsweepPolicy::LOAD_MODIFIER; + static const BlockScanAlgorithm INNER_SCAN_ALGORITHM = BlockRangeRadixSortDownsweepPolicy::INNER_SCAN_ALGORITHM; + static const RadixSortScatterAlgorithm SCATTER_ALGORITHM = BlockRangeRadixSortDownsweepPolicy::SCATTER_ALGORITHM; + static const cudaSharedMemConfig SMEM_CONFIG = BlockRangeRadixSortDownsweepPolicy::SMEM_CONFIG; + + enum + { + BLOCK_THREADS = BlockRangeRadixSortDownsweepPolicy::BLOCK_THREADS, + ITEMS_PER_THREAD = BlockRangeRadixSortDownsweepPolicy::ITEMS_PER_THREAD, + EXCHANGE_TIME_SLICING = BlockRangeRadixSortDownsweepPolicy::EXCHANGE_TIME_SLICING, + RADIX_BITS = BlockRangeRadixSortDownsweepPolicy::RADIX_BITS, + MEMOIZE_OUTER_SCAN = BlockRangeRadixSortDownsweepPolicy::MEMOIZE_OUTER_SCAN, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + + RADIX_DIGITS = 1 << RADIX_BITS, + KEYS_ONLY = Equals::VALUE, + + WARP_THREADS = CUB_PTX_LOG_WARP_THREADS, + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + + BYTES_PER_SIZET = sizeof(Offset), + LOG_BYTES_PER_SIZET = Log2::VALUE, + + LOG_SMEM_BANKS = CUB_PTX_LOG_SMEM_BANKS, + SMEM_BANKS = 1 << LOG_SMEM_BANKS, + + DIGITS_PER_SCATTER_PASS = BLOCK_THREADS / SMEM_BANKS, + SCATTER_PASSES = RADIX_DIGITS / DIGITS_PER_SCATTER_PASS, + + LOG_STORE_TXN_THREADS = LOG_SMEM_BANKS, + STORE_TXN_THREADS = 1 << LOG_STORE_TXN_THREADS, + }; + + // Input iterator wrapper types + typedef CacheModifiedInputIterator KeysItr; + typedef CacheModifiedInputIterator ValuesItr; + + // BlockRadixRank type + typedef BlockRadixRank< + BLOCK_THREADS, + RADIX_BITS, + DESCENDING, + MEMOIZE_OUTER_SCAN, + INNER_SCAN_ALGORITHM, + SMEM_CONFIG> BlockRadixRank; + + // 
BlockLoad type (keys) + typedef BlockLoad< + KeysItr, + BLOCK_THREADS, + ITEMS_PER_THREAD, + LOAD_ALGORITHM, + EXCHANGE_TIME_SLICING> BlockLoadKeys; + + // BlockLoad type (values) + typedef BlockLoad< + ValuesItr, + BLOCK_THREADS, + ITEMS_PER_THREAD, + LOAD_ALGORITHM, + EXCHANGE_TIME_SLICING> BlockLoadValues; + + // BlockExchange type (keys) + typedef BlockExchange< + UnsignedBits, + BLOCK_THREADS, + ITEMS_PER_THREAD, + EXCHANGE_TIME_SLICING> BlockExchangeKeys; + + // BlockExchange type (values) + typedef BlockExchange< + Value, + BLOCK_THREADS, + ITEMS_PER_THREAD, + EXCHANGE_TIME_SLICING> BlockExchangeValues; + + + /** + * Shared memory storage layout + */ + struct _TempStorage + { + Offset relative_bin_offsets[RADIX_DIGITS + 1]; + bool short_circuit; + + union + { + typename BlockRadixRank::TempStorage ranking; + typename BlockLoadKeys::TempStorage load_keys; + typename BlockLoadValues::TempStorage load_values; + typename BlockExchangeKeys::TempStorage exchange_keys; + typename BlockExchangeValues::TempStorage exchange_values; + }; + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Thread fields + //--------------------------------------------------------------------- + + // Shared storage for this CTA + _TempStorage &temp_storage; + + // Input and output device pointers + KeysItr d_keys_in; + ValuesItr d_values_in; + UnsignedBits *d_keys_out; + Value *d_values_out; + + // The global scatter base offset for each digit (valid in the first RADIX_DIGITS threads) + Offset bin_offset; + + // The least-significant bit position of the current digit to extract + int current_bit; + + // Number of bits in current digit + int num_bits; + + // Whether to short-ciruit + bool short_circuit; + + + + //--------------------------------------------------------------------- + // Utility methods + //--------------------------------------------------------------------- + + /** + * Decodes given keys to lookup digit offsets in shared memory + */ + __device__ __forceinline__ void DecodeRelativeBinOffsets( + UnsignedBits (&twiddled_keys)[ITEMS_PER_THREAD], + Offset (&relative_bin_offsets)[ITEMS_PER_THREAD]) + { + #pragma unroll + for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) + { + UnsignedBits digit = BFE(twiddled_keys[KEY], current_bit, num_bits); + + // Lookup base digit offset from shared memory + relative_bin_offsets[KEY] = temp_storage.relative_bin_offsets[digit]; + } + } + + + /** + * Scatter ranked items to global memory + */ + template + __device__ __forceinline__ void ScatterItems( + T (&items)[ITEMS_PER_THREAD], + int (&local_ranks)[ITEMS_PER_THREAD], + Offset (&relative_bin_offsets)[ITEMS_PER_THREAD], + T *d_out, + Offset valid_items) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + // Scatter if not out-of-bounds + if (FULL_TILE || (local_ranks[ITEM] < valid_items)) + { + d_out[relative_bin_offsets[ITEM] + local_ranks[ITEM]] = items[ITEM]; + } + } + } + + + /** + * Scatter ranked keys directly to global memory + */ + template + __device__ __forceinline__ void ScatterKeys( + UnsignedBits (&twiddled_keys)[ITEMS_PER_THREAD], + Offset (&relative_bin_offsets)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + Offset valid_items, + Int2Type scatter_algorithm) + { + // Compute scatter offsets + DecodeRelativeBinOffsets(twiddled_keys, relative_bin_offsets); + + // Untwiddle keys before outputting + UnsignedBits 
keys[ITEMS_PER_THREAD]; + + #pragma unroll + for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) + { + keys[KEY] = Traits::TwiddleOut(twiddled_keys[KEY]); + } + + // Scatter to global + ScatterItems(keys, ranks, relative_bin_offsets, d_keys_out, valid_items); + } + + + /** + * Scatter ranked keys through shared memory, then to global memory + */ + template + __device__ __forceinline__ void ScatterKeys( + UnsignedBits (&twiddled_keys)[ITEMS_PER_THREAD], + Offset (&relative_bin_offsets)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + Offset valid_items, + Int2Type scatter_algorithm) + { + // Exchange keys through shared memory + BlockExchangeKeys(temp_storage.exchange_keys).ScatterToStriped(twiddled_keys, ranks); + + // Compute striped local ranks + int local_ranks[ITEMS_PER_THREAD]; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + local_ranks[ITEM] = threadIdx.x + (ITEM * BLOCK_THREADS); + } + + // Scatter directly + ScatterKeys( + twiddled_keys, + relative_bin_offsets, + local_ranks, + valid_items, + Int2Type()); + } + + + /** + * Scatter ranked values directly to global memory + */ + template + __device__ __forceinline__ void ScatterValues( + Value (&values)[ITEMS_PER_THREAD], + Offset (&relative_bin_offsets)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + Offset valid_items, + Int2Type scatter_algorithm) + { + // Scatter to global + ScatterItems(values, ranks, relative_bin_offsets, d_values_out, valid_items); + } + + + /** + * Scatter ranked values through shared memory, then to global memory + */ + template + __device__ __forceinline__ void ScatterValues( + Value (&values)[ITEMS_PER_THREAD], + Offset (&relative_bin_offsets)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + Offset valid_items, + Int2Type scatter_algorithm) + { + __syncthreads(); + + // Exchange keys through shared memory + BlockExchangeValues(temp_storage.exchange_values).ScatterToStriped(values, ranks); + + // Compute striped local ranks + int local_ranks[ITEMS_PER_THREAD]; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + local_ranks[ITEM] = threadIdx.x + (ITEM * BLOCK_THREADS); + } + + // Scatter directly + ScatterValues( + values, + relative_bin_offsets, + local_ranks, + valid_items, + Int2Type()); + } + + + /** + * Load a tile of items (specialized for full tile) + */ + template + __device__ __forceinline__ void LoadItems( + BlockLoadT &block_loader, + T (&items)[ITEMS_PER_THREAD], + InputIterator d_in, + Offset valid_items, + Int2Type is_full_tile) + { + block_loader.Load(d_in, items); + } + + + /** + * Load a tile of items (specialized for partial tile) + */ + template + __device__ __forceinline__ void LoadItems( + BlockLoadT &block_loader, + T (&items)[ITEMS_PER_THREAD], + InputIterator d_in, + Offset valid_items, + Int2Type is_full_tile) + { + block_loader.Load(d_in, items, valid_items); + } + + + /** + * Truck along associated values + */ + template + __device__ __forceinline__ void GatherScatterValues( + _Value (&values)[ITEMS_PER_THREAD], + Offset (&relative_bin_offsets)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + Offset block_offset, + Offset valid_items) + { + __syncthreads(); + + BlockLoadValues loader(temp_storage.load_values); + LoadItems( + loader, + values, + d_values_in + block_offset, + valid_items, + Int2Type()); + + ScatterValues( + values, + relative_bin_offsets, + ranks, + valid_items, + Int2Type()); + } + + + /** + * Truck along associated values (specialized for key-only sorting) + */ + template + __device__ 
__forceinline__ void GatherScatterValues( + NullType (&values)[ITEMS_PER_THREAD], + Offset (&relative_bin_offsets)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + Offset block_offset, + Offset valid_items) + {} + + + /** + * Process tile + */ + template + __device__ __forceinline__ void ProcessTile( + Offset block_offset, + const Offset &valid_items = TILE_ITEMS) + { + // Per-thread tile data + UnsignedBits keys[ITEMS_PER_THREAD]; // Keys + UnsignedBits twiddled_keys[ITEMS_PER_THREAD]; // Twiddled keys + int ranks[ITEMS_PER_THREAD]; // For each key, the local rank within the CTA + Offset relative_bin_offsets[ITEMS_PER_THREAD]; // For each key, the global scatter base offset of the corresponding digit + + // Assign max-key to all keys + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + keys[ITEM] = (DESCENDING) ? MIN_KEY : MAX_KEY; + } + + // Load tile of keys + BlockLoadKeys loader(temp_storage.load_keys); + LoadItems( + loader, + keys, + d_keys_in + block_offset, + valid_items, + Int2Type()); + + __syncthreads(); + + // Twiddle key bits if necessary + #pragma unroll + for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) + { + twiddled_keys[KEY] = Traits::TwiddleIn(keys[KEY]); + } + + // Rank the twiddled keys + int inclusive_digit_prefix; + BlockRadixRank(temp_storage.ranking).RankKeys( + twiddled_keys, + ranks, + current_bit, + num_bits, + inclusive_digit_prefix); + + // Update global scatter base offsets for each digit + if ((BLOCK_THREADS == RADIX_DIGITS) || (threadIdx.x < RADIX_DIGITS)) + { + int exclusive_digit_prefix; + + // Get exclusive digit prefix from inclusive prefix + if (DESCENDING) + { + // Get the prefix from the next thread (higher bins come first) +#if CUB_PTX_ARCH >= 300 + exclusive_digit_prefix = ShuffleDown(inclusive_digit_prefix, 1); + if (threadIdx.x == RADIX_DIGITS - 1) + exclusive_digit_prefix = 0; +#else + volatile int* exchange = reinterpret_cast(temp_storage.relative_bin_offsets); + exchange[threadIdx.x + 1] = 0; + exchange[threadIdx.x] = inclusive_digit_prefix; + exclusive_digit_prefix = exchange[threadIdx.x + 1]; +#endif + } + else + { + // Get the prefix from the previous thread (lower bins come first) +#if CUB_PTX_ARCH >= 300 + exclusive_digit_prefix = ShuffleUp(inclusive_digit_prefix, 1); + if (threadIdx.x == 0) + exclusive_digit_prefix = 0; +#else + volatile int* exchange = reinterpret_cast(temp_storage.relative_bin_offsets); + exchange[threadIdx.x] = 0; + exchange[threadIdx.x + 1] = inclusive_digit_prefix; + exclusive_digit_prefix = exchange[threadIdx.x]; +#endif + } + + bin_offset -= exclusive_digit_prefix; + temp_storage.relative_bin_offsets[threadIdx.x] = bin_offset; + bin_offset += inclusive_digit_prefix; + } + + __syncthreads(); + + // Scatter keys + ScatterKeys(twiddled_keys, relative_bin_offsets, ranks, valid_items, Int2Type()); + + // Gather/scatter values + Value values[ITEMS_PER_THREAD]; + GatherScatterValues(values, relative_bin_offsets, ranks, block_offset, valid_items); + } + + + /** + * Copy tiles within the range of input + */ + template < + typename InputIterator, + typename T> + __device__ __forceinline__ void Copy( + InputIterator d_in, + T *d_out, + Offset block_offset, + Offset block_end) + { + // Simply copy the input + while (block_offset + TILE_ITEMS <= block_end) + { + T items[ITEMS_PER_THREAD]; + + LoadDirectStriped(threadIdx.x, d_in + block_offset, items); + __syncthreads(); + StoreDirectStriped(threadIdx.x, d_out + block_offset, items); + + block_offset += TILE_ITEMS; + } + + // Clean up last partial 
tile with guarded-I/O + if (block_offset < block_end) + { + Offset valid_items = block_end - block_offset; + + T items[ITEMS_PER_THREAD]; + + LoadDirectStriped(threadIdx.x, d_in + block_offset, items, valid_items); + __syncthreads(); + StoreDirectStriped(threadIdx.x, d_out + block_offset, items, valid_items); + } + } + + + /** + * Copy tiles within the range of input (specialized for NullType) + */ + template + __device__ __forceinline__ void Copy( + InputIterator d_in, + NullType *d_out, + Offset block_offset, + Offset block_end) + {} + + + //--------------------------------------------------------------------- + // Interface + //--------------------------------------------------------------------- + + /** + * Constructor + */ + __device__ __forceinline__ BlockRangeRadixSortDownsweep( + TempStorage &temp_storage, + Offset bin_offset, + Key *d_keys_in, + Key *d_keys_out, + Value *d_values_in, + Value *d_values_out, + int current_bit, + int num_bits) + : + temp_storage(temp_storage.Alias()), + bin_offset(bin_offset), + d_keys_in(reinterpret_cast(d_keys_in)), + d_keys_out(reinterpret_cast(d_keys_out)), + d_values_in(d_values_in), + d_values_out(d_values_out), + current_bit(current_bit), + num_bits(num_bits), + short_circuit(false) + {} + + + /** + * Constructor + */ + __device__ __forceinline__ BlockRangeRadixSortDownsweep( + TempStorage &temp_storage, + Offset num_items, + Offset *d_spine, + Key *d_keys_in, + Key *d_keys_out, + Value *d_values_in, + Value *d_values_out, + int current_bit, + int num_bits) + : + temp_storage(temp_storage.Alias()), + d_keys_in(reinterpret_cast(d_keys_in)), + d_keys_out(reinterpret_cast(d_keys_out)), + d_values_in(d_values_in), + d_values_out(d_values_out), + current_bit(current_bit), + num_bits(num_bits) + { + // Load digit bin offsets (each of the first RADIX_DIGITS threads will load an offset for that digit) + if (threadIdx.x < RADIX_DIGITS) + { + int bin_idx = (DESCENDING) ? + RADIX_DIGITS - threadIdx.x - 1 : + threadIdx.x; + + // Short circuit if the first block's histogram has only bin counts of only zeros or problem-size + Offset first_block_bin_offset = d_spine[gridDim.x * bin_idx]; + int predicate = ((first_block_bin_offset == 0) || (first_block_bin_offset == num_items)); + this->temp_storage.short_circuit = WarpAll(predicate); + + // Load my block's bin offset for my bin + bin_offset = d_spine[(gridDim.x * bin_idx) + blockIdx.x]; + } + + __syncthreads(); + + short_circuit = this->temp_storage.short_circuit; + } + + + /** + * Distribute keys from a segment of input tiles. 
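+     *
+     * A sketch of how one full radix-sort pass is typically staged (kernel
+     * names are illustrative, not part of this file):
+     *
+     * \code
+     * for (int bit = begin_bit; bit < end_bit; bit += RADIX_BITS)
+     * {
+     *     upsweep_kernel<<<grid, threads>>>(...);    // per-block digit counts
+     *     scan_spine_kernel<<<1, threads>>>(...);    // prefix-sum the spine
+     *     downsweep_kernel<<<grid, threads>>>(...);  // rank and scatter keys
+     * }
+     * \endcode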
+ */ + __device__ __forceinline__ void ProcessRegion( + Offset block_offset, + const Offset &block_end) + { + if (short_circuit) + { + // Copy keys + Copy(d_keys_in, d_keys_out, block_offset, block_end); + + // Copy values + Copy(d_values_in, d_values_out, block_offset, block_end); + } + else + { + // Process full tiles of tile_items + while (block_offset + TILE_ITEMS <= block_end) + { + ProcessTile(block_offset); + block_offset += TILE_ITEMS; + + __syncthreads(); + } + + // Clean up last partial tile with guarded-I/O + if (block_offset < block_end) + { + ProcessTile(block_offset, block_end - block_offset); + } + } + } +}; + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/SRC/cub/block_range/block_range_radix_sort_upsweep.cuh b/SRC/cub/block_range/block_range_radix_sort_upsweep.cuh new file mode 100644 index 00000000..faadbd3f --- /dev/null +++ b/SRC/cub/block_range/block_range_radix_sort_upsweep.cuh @@ -0,0 +1,450 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * BlockRangeRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep across a range of tiles. 
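+ *
+ * The upsweep emits one count per (digit, block) pair in digit-major order,
+ * i.e. spine[digit * gridDim.x + block]; the downsweep in
+ * block_range_radix_sort_downsweep.cuh reads back the same layout.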
+ */ + +#pragma once + +#include "../thread/thread_reduce.cuh" +#include "../thread/thread_load.cuh" +#include "../block/block_load.cuh" +#include "../util_type.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for BlockRangeRadixSortUpsweep + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading keys + int _RADIX_BITS> ///< The number of radix bits, i.e., log2(bins) +struct BlockRangeRadixSortUpsweepPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + RADIX_BITS = _RADIX_BITS, ///< The number of radix bits, i.e., log2(bins) + }; + + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading keys +}; + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief BlockRangeRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep across a range of tiles. + */ +template < + typename BlockRangeRadixSortUpsweepPolicy, ///< Parameterized BlockRangeRadixSortUpsweepPolicy tuning policy type + typename Key, ///< Key type + typename Offset> ///< Signed integer type for global offsets +struct BlockRangeRadixSortUpsweep +{ + + //--------------------------------------------------------------------- + // Type definitions and constants + //--------------------------------------------------------------------- + + typedef typename Traits::UnsignedBits UnsignedBits; + + // Integer type for digit counters (to be packed into words of PackedCounters) + typedef unsigned char DigitCounter; + + // Integer type for packing DigitCounters into columns of shared memory banks + typedef unsigned int PackedCounter; + + static const CacheLoadModifier LOAD_MODIFIER = BlockRangeRadixSortUpsweepPolicy::LOAD_MODIFIER; + + enum + { + RADIX_BITS = BlockRangeRadixSortUpsweepPolicy::RADIX_BITS, + BLOCK_THREADS = BlockRangeRadixSortUpsweepPolicy::BLOCK_THREADS, + KEYS_PER_THREAD = BlockRangeRadixSortUpsweepPolicy::ITEMS_PER_THREAD, + + RADIX_DIGITS = 1 << RADIX_BITS, + + LOG_WARP_THREADS = CUB_PTX_LOG_WARP_THREADS, + WARP_THREADS = 1 << LOG_WARP_THREADS, + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + + TILE_ITEMS = BLOCK_THREADS * KEYS_PER_THREAD, + + BYTES_PER_COUNTER = sizeof(DigitCounter), + LOG_BYTES_PER_COUNTER = Log2::VALUE, + + PACKING_RATIO = sizeof(PackedCounter) / sizeof(DigitCounter), + LOG_PACKING_RATIO = Log2::VALUE, + + LOG_COUNTER_LANES = CUB_MAX(0, RADIX_BITS - LOG_PACKING_RATIO), + COUNTER_LANES = 1 << LOG_COUNTER_LANES, + + // To prevent counter overflow, we must periodically unpack and aggregate the + // digit counters back into registers. Each counter lane is assigned to a + // warp for aggregation. 
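+        // A worked example of that guarantee: DigitCounter is one byte, so a
+        // packed counter saturates after 255 increments. With KEYS_PER_THREAD = 4,
+        //   UNROLL_COUNT = CUB_MIN(64, 255 / 4) = 63 tiles per batch,
+        // so at most 63 * 4 = 252 <= 255 increments can hit one byte counter
+        // before UnpackDigitCounts() drains it into a full-width register.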
+ + LANES_PER_WARP = CUB_MAX(1, (COUNTER_LANES + WARPS - 1) / WARPS), + + // Unroll tiles in batches without risk of counter overflow + UNROLL_COUNT = CUB_MIN(64, 255 / KEYS_PER_THREAD), + UNROLLED_ELEMENTS = UNROLL_COUNT * TILE_ITEMS, + }; + + + // Input iterator wrapper types + typedef CacheModifiedInputIterator KeysItr; + + /** + * Shared memory storage layout + */ + struct _TempStorage + { + union + { + DigitCounter digit_counters[COUNTER_LANES][BLOCK_THREADS][PACKING_RATIO]; + PackedCounter packed_counters[COUNTER_LANES][BLOCK_THREADS]; + Offset digit_partials[RADIX_DIGITS][WARP_THREADS + 1]; + }; + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Thread fields (aggregate state bundle) + //--------------------------------------------------------------------- + + // Shared storage for this CTA + _TempStorage &temp_storage; + + // Thread-local counters for periodically aggregating composite-counter lanes + Offset local_counts[LANES_PER_WARP][PACKING_RATIO]; + + // Input and output device pointers + KeysItr d_keys_in; + + // The least-significant bit position of the current digit to extract + int current_bit; + + // Number of bits in current digit + int num_bits; + + + + //--------------------------------------------------------------------- + // Helper structure for templated iteration + //--------------------------------------------------------------------- + + // Iterate + template + struct Iterate + { + // BucketKeys + static __device__ __forceinline__ void BucketKeys( + BlockRangeRadixSortUpsweep &cta, + UnsignedBits keys[KEYS_PER_THREAD]) + { + cta.Bucket(keys[COUNT]); + + // Next + Iterate::BucketKeys(cta, keys); + } + }; + + // Terminate + template + struct Iterate + { + // BucketKeys + static __device__ __forceinline__ void BucketKeys(BlockRangeRadixSortUpsweep &cta, UnsignedBits keys[KEYS_PER_THREAD]) {} + }; + + + //--------------------------------------------------------------------- + // Utility methods + //--------------------------------------------------------------------- + + /** + * Decode a key and increment corresponding smem digit counter + */ + __device__ __forceinline__ void Bucket(UnsignedBits key) + { + // Perform transform op + UnsignedBits converted_key = Traits::TwiddleIn(key); + + // Extract current digit bits + UnsignedBits digit = BFE(converted_key, current_bit, num_bits); + + // Get sub-counter offset + UnsignedBits sub_counter = digit & (PACKING_RATIO - 1); + + // Get row offset + UnsignedBits row_offset = digit >> LOG_PACKING_RATIO; + + // Increment counter + temp_storage.digit_counters[row_offset][threadIdx.x][sub_counter]++; + } + + + /** + * Reset composite counters + */ + __device__ __forceinline__ void ResetDigitCounters() + { + #pragma unroll + for (int LANE = 0; LANE < COUNTER_LANES; LANE++) + { + temp_storage.packed_counters[LANE][threadIdx.x] = 0; + } + } + + + /** + * Reset the unpacked counters in each thread + */ + __device__ __forceinline__ void ResetUnpackedCounters() + { + #pragma unroll + for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) + { + #pragma unroll + for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) + { + local_counts[LANE][UNPACKED_COUNTER] = 0; + } + } + } + + + /** + * Extracts and aggregates the digit counters for each counter lane + * owned by this warp + */ + __device__ __forceinline__ void UnpackDigitCounts() + { + unsigned int warp_id = 
threadIdx.x >> LOG_WARP_THREADS; + unsigned int warp_tid = threadIdx.x & (WARP_THREADS - 1); + + #pragma unroll + for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) + { + const int counter_lane = (LANE * WARPS) + warp_id; + if (counter_lane < COUNTER_LANES) + { + #pragma unroll + for (int PACKED_COUNTER = 0; PACKED_COUNTER < BLOCK_THREADS; PACKED_COUNTER += WARP_THREADS) + { + #pragma unroll + for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) + { + Offset counter = temp_storage.digit_counters[counter_lane][warp_tid + PACKED_COUNTER][UNPACKED_COUNTER]; + local_counts[LANE][UNPACKED_COUNTER] += counter; + } + } + } + } + } + + + /** + * Places unpacked counters into smem for final digit reduction + */ + __device__ __forceinline__ void ReduceUnpackedCounts(Offset &bin_count) + { + unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; + unsigned int warp_tid = threadIdx.x & (WARP_THREADS - 1); + + // Place unpacked digit counters in shared memory + #pragma unroll + for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) + { + int counter_lane = (LANE * WARPS) + warp_id; + if (counter_lane < COUNTER_LANES) + { + int digit_row = counter_lane << LOG_PACKING_RATIO; + + #pragma unroll + for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) + { + temp_storage.digit_partials[digit_row + UNPACKED_COUNTER][warp_tid] = + local_counts[LANE][UNPACKED_COUNTER]; + } + } + } + + __syncthreads(); + + // Rake-reduce bin_count reductions + if (threadIdx.x < RADIX_DIGITS) + { + bin_count = ThreadReduce( + temp_storage.digit_partials[threadIdx.x], + Sum()); + } + } + + + /** + * Processes a single, full tile + */ + __device__ __forceinline__ void ProcessFullTile(Offset block_offset) + { + // Tile of keys + UnsignedBits keys[KEYS_PER_THREAD]; + + LoadDirectStriped(threadIdx.x, d_keys_in + block_offset, keys); + + // Prevent hoisting +// __threadfence_block(); +// __syncthreads(); + + // Bucket tile of keys + Iterate<0, KEYS_PER_THREAD>::BucketKeys(*this, keys); + } + + + /** + * Processes a single load (may have some threads masked off) + */ + __device__ __forceinline__ void ProcessPartialTile( + Offset block_offset, + const Offset &block_end) + { + // Process partial tile if necessary using single loads + block_offset += threadIdx.x; + while (block_offset < block_end) + { + // Load and bucket key + UnsignedBits key = d_keys_in[block_offset]; + Bucket(key); + block_offset += BLOCK_THREADS; + } + } + + + //--------------------------------------------------------------------- + // Interface + //--------------------------------------------------------------------- + + /** + * Constructor + */ + __device__ __forceinline__ BlockRangeRadixSortUpsweep( + TempStorage &temp_storage, + Key *d_keys_in, + int current_bit, + int num_bits) + : + temp_storage(temp_storage.Alias()), + d_keys_in(reinterpret_cast(d_keys_in)), + current_bit(current_bit), + num_bits(num_bits) + {} + + + /** + * Compute radix digit histograms from a segment of input tiles. 
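+     *
+     * A sketch of typical use inside an upsweep kernel (variable names are
+     * illustrative, not part of this file):
+     *
+     * \code
+     * __shared__ typename BlockRangeRadixSortUpsweepT::TempStorage temp_storage;
+     * Offset bin_count;
+     * BlockRangeRadixSortUpsweepT(temp_storage, d_keys, current_bit, num_bits)
+     *     .ProcessRegion(block_offset, block_end, bin_count);
+     * if (threadIdx.x < RADIX_DIGITS)
+     *     d_spine[(gridDim.x * threadIdx.x) + blockIdx.x] = bin_count;
+     * \endcode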
+ */ + __device__ __forceinline__ void ProcessRegion( + Offset block_offset, + const Offset &block_end, + Offset &bin_count) ///< [out] The digit count for tid'th bin (output param, valid in the first RADIX_DIGITS threads) + { + // Reset digit counters in smem and unpacked counters in registers + ResetDigitCounters(); + ResetUnpackedCounters(); + + // Unroll batches of full tiles + while (block_offset + UNROLLED_ELEMENTS <= block_end) + { + for (int i = 0; i < UNROLL_COUNT; ++i) + { + ProcessFullTile(block_offset); + block_offset += TILE_ITEMS; + } + + __syncthreads(); + + // Aggregate back into local_count registers to prevent overflow + UnpackDigitCounts(); + + __syncthreads(); + + // Reset composite counters in lanes + ResetDigitCounters(); + } + + // Unroll single full tiles + while (block_offset + TILE_ITEMS <= block_end) + { + ProcessFullTile(block_offset); + block_offset += TILE_ITEMS; + } + + // Process partial tile if necessary + ProcessPartialTile( + block_offset, + block_end); + + __syncthreads(); + + // Aggregate back into local_count registers + UnpackDigitCounts(); + + __syncthreads(); + + // Final raking reduction of counts by bin + ReduceUnpackedCounts(bin_count); + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/SRC/cub/block_range/block_range_reduce.cuh b/SRC/cub/block_range/block_range_reduce.cuh new file mode 100644 index 00000000..9e97f87b --- /dev/null +++ b/SRC/cub/block_range/block_range_reduce.cuh @@ -0,0 +1,430 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockRangeReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction across a range of tiles. 
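+ *
+ * A hypothetical tuning instance of the policy defined below, for
+ * illustration only (the values are not a tuned configuration):
+ *
+ * \code
+ * typedef BlockRangeReducePolicy<
+ *     256,                       // threads per block
+ *     8,                         // items per thread
+ *     4,                         // items per vectorized load
+ *     BLOCK_REDUCE_RAKING,       // block-wide reduction algorithm
+ *     LOAD_DEFAULT,              // cache load modifier
+ *     GRID_MAPPING_EVEN_SHARE>   // tile mapping strategy
+ *     ReducePolicy256;
+ * \endcode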
+ */ + +#pragma once + +#include + +#include "../block/block_load.cuh" +#include "../block/block_reduce.cuh" +#include "../grid/grid_mapping.cuh" +#include "../grid/grid_queue.cuh" +#include "../grid/grid_even_share.cuh" +#include "../util_type.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_namespace.cuh" + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for BlockRangeReduce + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + int _VECTOR_LOAD_LENGTH, ///< Number of items per vectorized load + BlockReduceAlgorithm _BLOCK_ALGORITHM, ///< Cooperative block-wide reduction algorithm to use + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements + GridMappingStrategy _GRID_MAPPING> ///< How to map tiles of input onto thread blocks +struct BlockRangeReducePolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + VECTOR_LOAD_LENGTH = _VECTOR_LOAD_LENGTH, ///< Number of items per vectorized load + }; + + static const BlockReduceAlgorithm BLOCK_ALGORITHM = _BLOCK_ALGORITHM; ///< Cooperative block-wide reduction algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements + static const GridMappingStrategy GRID_MAPPING = _GRID_MAPPING; ///< How to map tiles of input onto thread blocks +}; + + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief BlockRangeReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction across a range of tiles. + * + * Each thread reduces only the values it loads. If \p FIRST_TILE, this + * partial reduction is stored into \p thread_aggregate. Otherwise it is + * accumulated into \p thread_aggregate. 
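+ *
+ * A sketch of driving this abstraction from a kernel (names illustrative,
+ * not part of this file):
+ *
+ * \code
+ * __shared__ typename BlockRangeReduceT::TempStorage temp_storage;
+ * T block_aggregate;
+ * BlockRangeReduceT(temp_storage, d_in, cub::Sum()).ConsumeRange(
+ *     block_offset, block_end, block_aggregate);
+ * if (threadIdx.x == 0)
+ *     d_block_partials[blockIdx.x] = block_aggregate;
+ * \endcode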
+ */ +template < + typename BlockRangeReducePolicy, ///< Parameterized BlockRangeReducePolicy tuning policy type + typename InputIterator, ///< Random-access iterator type for input + typename Offset, ///< Signed integer type for global offsets + typename ReductionOp> ///< Binary reduction operator type having member T operator()(const T &a, const T &b) +struct BlockRangeReduce +{ + + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // The value type of the input iterator + typedef typename std::iterator_traits::value_type T; + + // Vector type of T for data movement + typedef typename CubVector::Type VectorT; + + // Input iterator wrapper type + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedInputIterator + InputIterator>::Type // Directly use the supplied input iterator type + WrappedInputIterator; + + // Constants + enum + { + BLOCK_THREADS = BlockRangeReducePolicy::BLOCK_THREADS, + ITEMS_PER_THREAD = BlockRangeReducePolicy::ITEMS_PER_THREAD, + VECTOR_LOAD_LENGTH = CUB_MIN(ITEMS_PER_THREAD, BlockRangeReducePolicy::VECTOR_LOAD_LENGTH), + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + + // Can vectorize according to the policy if the input iterator is a native pointer to a primitive type + CAN_VECTORIZE = (VECTOR_LOAD_LENGTH > 1) && + (IsPointer::VALUE) && + Traits::PRIMITIVE, + + }; + + static const CacheLoadModifier LOAD_MODIFIER = BlockRangeReducePolicy::LOAD_MODIFIER; + static const BlockReduceAlgorithm BLOCK_ALGORITHM = BlockRangeReducePolicy::BLOCK_ALGORITHM; + + // Parameterized BlockReduce primitive + typedef BlockReduce BlockReduceT; + + /// Shared memory type required by this thread block + typedef typename BlockReduceT::TempStorage _TempStorage; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + T thread_aggregate; ///< Each thread's partial reduction + _TempStorage& temp_storage; ///< Reference to temp_storage + InputIterator d_in; ///< Input data to reduce + WrappedInputIterator d_wrapped_in; ///< Wrapped input data to reduce + ReductionOp reduction_op; ///< Binary reduction operator + int first_tile_size; ///< Size of first tile consumed + bool is_aligned; ///< Whether or not input is vector-aligned + + + //--------------------------------------------------------------------- + // Interface + //--------------------------------------------------------------------- + + + // Whether or not the input is aligned with the vector type (specialized for types we can vectorize) + template + static __device__ __forceinline__ bool IsAligned( + Iterator d_in, + Int2Type can_vectorize) + { + return (size_t(d_in) & (sizeof(VectorT) - 1)) == 0; + } + + // Whether or not the input is aligned with the vector type (specialized for types we cannot vectorize) + template + static __device__ __forceinline__ bool IsAligned( + Iterator d_in, + Int2Type can_vectorize) + { + return false; + } + + + /** + * Constructor + */ + __device__ __forceinline__ BlockRangeReduce( + TempStorage& temp_storage, ///< Reference to temp_storage + InputIterator d_in, ///< Input data to reduce + ReductionOp reduction_op) ///< Binary reduction operator + : + temp_storage(temp_storage.Alias()), + 
d_in(d_in),
+        d_wrapped_in(d_in),
+        reduction_op(reduction_op),
+        first_tile_size(0),
+        is_aligned(IsAligned(d_in, Int2Type<CAN_VECTORIZE>()))
+    {}
+
+
+    /**
+     * Consume a full tile of input (specialized for cases where we cannot vectorize)
+     */
+    template <typename _Offset>
+    __device__ __forceinline__ T ConsumeFullTile(
+        _Offset             block_offset,       ///< The offset of the tile to consume
+        Int2Type<false>     can_vectorize)      ///< Whether or not we can vectorize loads
+    {
+        T items[ITEMS_PER_THREAD];
+
+        // Load items in striped fashion
+        LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_wrapped_in + block_offset, items);
+
+        // Reduce items within each thread stripe
+        return ThreadReduce(items, reduction_op);
+    }
+
+
+    /**
+     * Consume a full tile of input (specialized for cases where we can vectorize)
+     */
+    template <typename _Offset>
+    __device__ __forceinline__ T ConsumeFullTile(
+        _Offset             block_offset,       ///< The offset of the tile to consume
+        Int2Type<true>      can_vectorize)      ///< Whether or not we can vectorize loads
+    {
+        if (!is_aligned)
+        {
+            // Not aligned
+            return ConsumeFullTile(block_offset, Int2Type<false>());
+        }
+        else
+        {
+            // Alias items as an array of VectorT and load it in striped fashion
+            enum { WORDS = ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH };
+
+            T items[ITEMS_PER_THREAD];
+
+            VectorT *vec_items = reinterpret_cast<VectorT*>(items);
+
+            // Vector input iterator wrapper type
+            CacheModifiedInputIterator<LOAD_MODIFIER, VectorT, Offset> d_vec_in(
+                reinterpret_cast<VectorT*>(d_in + block_offset + (threadIdx.x * VECTOR_LOAD_LENGTH)));
+
+            #pragma unroll
+            for (int i = 0; i < WORDS; ++i)
+                vec_items[i] = d_vec_in[BLOCK_THREADS * i];
+
+            // Reduce items within each thread stripe
+            return ThreadReduce(items, reduction_op);
+        }
+    }
+
+
+
+    /**
+     * Process a single tile of input
+     */
+    template <bool FULL_TILE>
+    __device__ __forceinline__ void ConsumeTile(
+        Offset  block_offset,                   ///< The offset of the tile to consume
+        int     valid_items = TILE_ITEMS)       ///< The number of valid items in the tile
+    {
+        if (FULL_TILE)
+        {
+            // Full tile
+            T partial = ConsumeFullTile(block_offset, Int2Type<CAN_VECTORIZE>());
+
+            // Update running thread aggregate
+            thread_aggregate = (first_tile_size) ?
+ reduction_op(thread_aggregate, partial) : // Update + partial; // Assign + } + else + { + // Partial tile + int thread_offset = threadIdx.x; + + if (!first_tile_size && (thread_offset < valid_items)) + { + // Assign thread_aggregate + thread_aggregate = d_wrapped_in[block_offset + thread_offset]; + thread_offset += BLOCK_THREADS; + } + + while (thread_offset < valid_items) + { + // Update thread aggregate + T item = d_wrapped_in[block_offset + thread_offset]; + thread_aggregate = reduction_op(thread_aggregate, item); + thread_offset += BLOCK_THREADS; + } + } + + // Set first tile size if necessary + if (!first_tile_size) + first_tile_size = valid_items; + } + + + //--------------------------------------------------------------- + // Consume a contiguous segment of tiles + //--------------------------------------------------------------------- + + /** + * \brief Reduce a contiguous segment of input tiles + */ + __device__ __forceinline__ void ConsumeRange( + Offset block_offset, ///< [in] Threadblock begin offset (inclusive) + Offset block_end, ///< [in] Threadblock end offset (exclusive) + T &block_aggregate) ///< [out] Running total + { + // Consume subsequent full tiles of input + while (block_offset + TILE_ITEMS <= block_end) + { + ConsumeTile(block_offset); + block_offset += TILE_ITEMS; + } + + // Consume a partially-full tile + if (block_offset < block_end) + { + int valid_items = block_end - block_offset; + ConsumeTile(block_offset, valid_items); + } + + // Compute block-wide reduction + block_aggregate = (first_tile_size < TILE_ITEMS) ? + BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op, first_tile_size) : + BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op); + } + + + /** + * Reduce a contiguous segment of input tiles + */ + __device__ __forceinline__ void ConsumeRange( + Offset num_items, ///< [in] Total number of global input items + GridEvenShare &even_share, ///< [in] GridEvenShare descriptor + GridQueue &queue, ///< [in,out] GridQueue descriptor + T &block_aggregate, ///< [out] Running total + Int2Type is_even_share) ///< [in] Marker type indicating this is an even-share mapping + { + // Initialize even-share descriptor for this thread block + even_share.BlockInit(); + + // Consume input tiles + ConsumeRange(even_share.block_offset, even_share.block_end, block_aggregate); + } + + + //--------------------------------------------------------------------- + // Dynamically consume tiles + //--------------------------------------------------------------------- + + /** + * Dequeue and reduce tiles of items as part of a inter-block scan + */ + __device__ __forceinline__ void ConsumeRange( + int num_items, ///< Total number of input items + GridQueue queue, ///< Queue descriptor for assigning tiles of work to thread blocks + T &block_aggregate) ///< [out] Running total + { + // Shared dequeue offset + __shared__ Offset dequeue_offset; + + // We give each thread block at least one tile of input. 
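+        // The first gridDim.x tiles are assigned statically, one per block;
+        // queue.Drain(TILE_ITEMS) then hands out remaining tiles dynamically,
+        // which is why dequeued offsets are biased by even_share_base.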
+ Offset block_offset = blockIdx.x * TILE_ITEMS; + Offset even_share_base = gridDim.x * TILE_ITEMS; + + if (block_offset + TILE_ITEMS <= num_items) + { + // Consume full tile of input + ConsumeTile(block_offset); + + // Dequeue more tiles + while (true) + { + // Dequeue a tile of items + if (threadIdx.x == 0) + dequeue_offset = queue.Drain(TILE_ITEMS) + even_share_base; + + __syncthreads(); + + // Grab tile offset and check if we're done with full tiles + block_offset = dequeue_offset; + + __syncthreads(); + + if (block_offset + TILE_ITEMS > num_items) + break; + + // Consume a full tile + ConsumeTile(block_offset); + } + } + + if (block_offset < num_items) + { + int valid_items = num_items - block_offset; + ConsumeTile(block_offset, valid_items); + } + + // Compute block-wide reduction + block_aggregate = (first_tile_size < TILE_ITEMS) ? + BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op, first_tile_size) : + BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op); + } + + + /** + * Dequeue and reduce tiles of items as part of a inter-block scan + */ + __device__ __forceinline__ void ConsumeRange( + Offset num_items, ///< [in] Total number of global input items + GridEvenShare &even_share, ///< [in] GridEvenShare descriptor + GridQueue &queue, ///< [in,out] GridQueue descriptor + T &block_aggregate, ///< [out] Running total + Int2Type is_dynamic) ///< [in] Marker type indicating this is a dynamic mapping + { + ConsumeRange(num_items, queue, block_aggregate); + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/SRC/cub/block_range/block_range_reduce_by_key.cuh b/SRC/cub/block_range/block_range_reduce_by_key.cuh new file mode 100644 index 00000000..f56baaa0 --- /dev/null +++ b/SRC/cub/block_range/block_range_reduce_by_key.cuh @@ -0,0 +1,1034 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +/** + * \file + * cub::BlockRangeReduceByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key. + */ + +#pragma once + +#include + +#include "block_scan_prefix_operators.cuh" +#include "../block/block_load.cuh" +#include "../block/block_store.cuh" +#include "../block/block_scan.cuh" +#include "../block/block_exchange.cuh" +#include "../block/block_discontinuity.cuh" +#include "../grid/grid_queue.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../iterator/constant_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for BlockRangeReduceByKey + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements + bool _TWO_PHASE_SCATTER, ///< Whether or not to coalesce output values in shared memory before scattering them to global + BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use +struct BlockRangeReduceByKeyPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + TWO_PHASE_SCATTER = _TWO_PHASE_SCATTER, ///< Whether or not to coalesce output values in shared memory before scattering them to global + }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements + static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use +}; + + +/****************************************************************************** + * Tile status interface types + ******************************************************************************/ + +/** + * Tile status interface for reduction by key. + * + */ +template < + typename Value, + typename Offset, + bool SINGLE_WORD = (Traits::PRIMITIVE) && (sizeof(Value) + sizeof(Offset) < 16)> +struct ReduceByKeyScanTileState; + + +/** + * Tile status interface for reduction by key, specialized for scan status and value types that + * cannot be combined into one machine word. + */ +template < + typename Value, + typename Offset> +struct ReduceByKeyScanTileState : + ScanTileState > +{ + typedef ScanTileState > SuperClass; + + /// Constructor + __host__ __device__ __forceinline__ + ReduceByKeyScanTileState() : SuperClass() {} +}; + + +/** + * Tile status interface for reduction by key, specialized for scan status and value types that + * can be combined into one machine word that can be read/written coherently in a single access. 
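+ *
+ * A worked sizing example (assuming the elided Log2 argument in the enum
+ * below rounds PAIR_SIZE up to the next power of two): for Value = float and
+ * Offset = int, PAIR_SIZE = 8, TXN_WORD_SIZE = 16, and STATUS_WORD_SIZE = 8,
+ * so {offset, value, status} travel in one 16-byte TxnWord (longlong2) and a
+ * spin-waiting consumer in WaitForValid() never observes a torn descriptor.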
+ */
+template <
+    typename    Value,
+    typename    Offset>
+struct ReduceByKeyScanTileState<Value, Offset, true>
+{
+    typedef ItemOffsetPair<Value, Offset> ItemOffsetPair;
+
+    // Constants
+    enum
+    {
+        PAIR_SIZE           = sizeof(Value) + sizeof(Offset),
+        TXN_WORD_SIZE       = 1 << Log2<PAIR_SIZE + 1>::VALUE,
+        STATUS_WORD_SIZE    = TXN_WORD_SIZE - PAIR_SIZE,
+
+        TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS,
+    };
+
+    // Status word type
+    typedef typename If<(STATUS_WORD_SIZE == 8),
+        long long,
+        typename If<(STATUS_WORD_SIZE == 4),
+            int,
+            typename If<(STATUS_WORD_SIZE == 2),
+                short,
+                char>::Type>::Type>::Type StatusWord;
+
+    // Transaction word type
+    typedef typename If<(TXN_WORD_SIZE == 16),
+        longlong2,
+        typename If<(TXN_WORD_SIZE == 8),
+            long long,
+            int>::Type>::Type TxnWord;
+
+    // Device word type (for when sizeof(Value) == sizeof(Offset))
+    struct TileDescriptorBigStatus
+    {
+        Offset      offset;
+        Value       value;
+        StatusWord  status;
+    };
+
+    // Device word type (for when sizeof(Value) != sizeof(Offset))
+    struct TileDescriptorLittleStatus
+    {
+        Value       value;
+        StatusWord  status;
+        Offset      offset;
+    };
+
+    // Device word type
+    typedef typename If<
+            (sizeof(Value) == sizeof(Offset)),
+            TileDescriptorBigStatus,
+            TileDescriptorLittleStatus>::Type
+        TileDescriptor;
+
+
+    // Device storage
+    TileDescriptor *d_tile_status;
+
+
+    /// Constructor
+    __host__ __device__ __forceinline__
+    ReduceByKeyScanTileState()
+    :
+        d_tile_status(NULL)
+    {}
+
+
+    /// Initializer
+    __host__ __device__ __forceinline__
+    cudaError_t Init(
+        int     num_tiles,          ///< [in] Number of tiles
+        void    *d_temp_storage,    ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t  temp_storage_bytes) ///< [in] Size in bytes of \p d_temp_storage allocation
+    {
+        d_tile_status = reinterpret_cast<TileDescriptor*>(d_temp_storage);
+        return cudaSuccess;
+    }
+
+
+    /**
+     * Compute device memory needed for tile status
+     */
+    __host__ __device__ __forceinline__
+    static cudaError_t AllocationSize(
+        int     num_tiles,              ///< [in] Number of tiles
+        size_t  &temp_storage_bytes)    ///< [out] Size in bytes of \p d_temp_storage allocation
+    {
+        temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TileDescriptor);    // bytes needed for tile status descriptors
+        return cudaSuccess;
+    }
+
+
+    /**
+     * Initialize (from device)
+     */
+    __device__ __forceinline__ void InitializeStatus(int num_tiles)
+    {
+        int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
+        if (tile_idx < num_tiles)
+        {
+            // Not-yet-set
+            d_tile_status[TILE_STATUS_PADDING + tile_idx].status = StatusWord(SCAN_TILE_INVALID);
+        }
+
+        if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING))
+        {
+            // Padding
+            d_tile_status[threadIdx.x].status = StatusWord(SCAN_TILE_OOB);
+        }
+    }
+
+
+    /**
+     * Update the specified tile's inclusive value and corresponding status
+     */
+    __device__ __forceinline__ void SetInclusive(int tile_idx, ItemOffsetPair tile_inclusive)
+    {
+        TileDescriptor tile_descriptor;
+        tile_descriptor.status = SCAN_TILE_INCLUSIVE;
+        tile_descriptor.value = tile_inclusive.value;
+        tile_descriptor.offset = tile_inclusive.offset;
+
+        TxnWord alias;
+        *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
+        ThreadStore<STORE_CG>(reinterpret_cast<TxnWord*>(d_tile_status + TILE_STATUS_PADDING + tile_idx), alias);
+    }
+
+
+    /**
+     * Update the specified tile's partial value and corresponding status
+     */
+    __device__ __forceinline__ void SetPartial(int tile_idx, ItemOffsetPair tile_partial)
+    {
+        TileDescriptor tile_descriptor;
+        tile_descriptor.status = SCAN_TILE_PARTIAL;
+        tile_descriptor.value = tile_partial.value;
+        tile_descriptor.offset = tile_partial.offset;
+
+        TxnWord alias;
+        *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
+        ThreadStore<STORE_CG>(reinterpret_cast<TxnWord*>(d_tile_status + TILE_STATUS_PADDING + tile_idx), alias);
+    }
+
+    /**
+     * Wait for the corresponding tile to become non-invalid
+     */
+    __device__ __forceinline__ void WaitForValid(
+        int             tile_idx,
+        StatusWord      &status,
+        ItemOffsetPair  &value)
+    {
+        // Spin-wait until the corresponding tile descriptor is no longer invalid
+        TxnWord alias = ThreadLoad<LOAD_CG>(reinterpret_cast<TxnWord*>(d_tile_status + TILE_STATUS_PADDING + tile_idx));
+        TileDescriptor tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);
+
+        while (tile_descriptor.status == SCAN_TILE_INVALID)
+        {
+            alias = ThreadLoad<LOAD_CG>(reinterpret_cast<TxnWord*>(d_tile_status + TILE_STATUS_PADDING + tile_idx));
+            tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);
+        }
+
+        status = tile_descriptor.status;
+        value.value = tile_descriptor.value;
+        value.offset = tile_descriptor.offset;
+    }
+
+};
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief BlockRangeReduceByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key across a range of tiles
+ */
+template <
+    typename    BlockRangeReduceByKeyPolicy,    ///< Parameterized BlockRangeReduceByKeyPolicy tuning policy type
+    typename    KeyInputIterator,               ///< Random-access input iterator type for keys
+    typename    KeyOutputIterator,              ///< Random-access output iterator type for keys
+    typename    ValueInputIterator,             ///< Random-access input iterator type for values
+    typename    ValueOutputIterator,            ///< Random-access output iterator type for values
+    typename    EqualityOp,                     ///< Key equality operator type
+    typename    ReductionOp,                    ///< Value reduction operator type
+    typename    Offset>                         ///< Signed integer type for global offsets
+struct BlockRangeReduceByKey
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    // Data type of key iterator
+    typedef typename std::iterator_traits<KeyInputIterator>::value_type Key;
+
+    // Data type of value iterator
+    typedef typename std::iterator_traits<ValueInputIterator>::value_type Value;
+
+    // Tile status descriptor interface type
+    typedef ReduceByKeyScanTileState<Value, Offset> ScanTileState;
+
+    // Constants
+    enum
+    {
+        BLOCK_THREADS       = BlockRangeReduceByKeyPolicy::BLOCK_THREADS,
+        WARPS               = BLOCK_THREADS / CUB_PTX_WARP_THREADS,
+        ITEMS_PER_THREAD    = BlockRangeReduceByKeyPolicy::ITEMS_PER_THREAD,
+        TWO_PHASE_SCATTER   = (BlockRangeReduceByKeyPolicy::TWO_PHASE_SCATTER) && (ITEMS_PER_THREAD > 1),
+        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
+
+        // Whether or not the scan operation has a zero-valued identity value (true if we're performing addition on a primitive type)
+        HAS_IDENTITY_ZERO   = (Equals<ReductionOp, cub::Sum>::VALUE) && (Traits<Value>::PRIMITIVE),
+
+        // Whether or not to sync after loading data
+        SYNC_AFTER_LOAD     = (BlockRangeReduceByKeyPolicy::LOAD_ALGORITHM != BLOCK_LOAD_DIRECT),
+
+        // Whether or not this is run-length-encoding with a constant iterator as values
+        IS_RUN_LENGTH_ENCODE = (Equals<ValueInputIterator, ConstantInputIterator<Value, size_t> >::VALUE) || (Equals<ValueInputIterator, ConstantInputIterator<Value, int> >::VALUE) || (Equals<ValueInputIterator, ConstantInputIterator<Value, unsigned int> >::VALUE),
+
+    };
+
+    // Cache-modified input iterator wrapper type for keys
+    typedef typename If<IsPointer<KeyInputIterator>::VALUE,
+            CacheModifiedInputIterator<BlockRangeReduceByKeyPolicy::LOAD_MODIFIER, Key, Offset>,    // Wrap the native input pointer with CacheModifiedInputIterator
+            KeyInputIterator>::Type                                                                 // Directly use the supplied input iterator type
+        WrappedKeyInputIterator;
+
+    // Cache-modified input iterator wrapper type for values
+    typedef typename If<IsPointer<ValueInputIterator>::VALUE,
+            CacheModifiedInputIterator<BlockRangeReduceByKeyPolicy::LOAD_MODIFIER, Value, Offset>,  // Wrap the native input pointer with CacheModifiedInputIterator
+            ValueInputIterator>::Type                                                               // Directly use the supplied input iterator type
+        WrappedValueInputIterator;
+
+    // Value-offset tuple type for scanning (maps accumulated values to segment index)
+    typedef ItemOffsetPair<Value, Offset> ValueOffsetPair;
+
+    // Reduce-value-by-segment scan operator
+    struct ReduceByKeyOp
+    {
+        ReductionOp op;     ///< Wrapped reduction operator
+
+        /// Constructor
+        __device__ __forceinline__ ReduceByKeyOp(ReductionOp op) : op(op) {}
+
+        /// Scan operator (specialized for sum on primitive types)
+        __device__ __forceinline__ ValueOffsetPair operator()(
+            const ValueOffsetPair   &first,             ///< First partial reduction
+            const ValueOffsetPair   &second,            ///< Second partial reduction
+            Int2Type<true>          has_identity_zero)  ///< Whether the operation has a zero-valued identity
+        {
+            Value select = (second.offset) ? 0 : first.value;
+
+            ValueOffsetPair retval;
+            retval.offset = first.offset + second.offset;
+            retval.value = op(select, second.value);
+            return retval;
+        }
+
+        /// Scan operator (specialized for reductions without zero-valued identity)
+        __device__ __forceinline__ ValueOffsetPair operator()(
+            const ValueOffsetPair   &first,             ///< First partial reduction
+            const ValueOffsetPair   &second,            ///< Second partial reduction
+            Int2Type<false>         has_identity_zero)  ///< Whether the operation has a zero-valued identity
+        {
+#if (__CUDA_ARCH__ > 130)
+            // This expression uses less registers and is faster when compiled with nvvm
+            ValueOffsetPair retval;
+            retval.offset = first.offset + second.offset;
+            if (second.offset)
+            {
+                retval.value = second.value;
+                return retval;
+            }
+            else
+            {
+                retval.value = op(first.value, second.value);
+                return retval;
+            }
+#else
+            // This expression uses less registers and is faster when compiled with Open64
+            ValueOffsetPair retval;
+            retval.offset = first.offset + second.offset;
+            retval.value = (second.offset) ?
+                    second.value :                          // The second partial reduction spans a segment reset, so its value aggregate becomes the running aggregate
+                    op(first.value, second.value);          // The second partial reduction does not span a reset, so accumulate both into the running aggregate
+            return retval;
+#endif
+        }
+
+        /// Scan operator
+        __device__ __forceinline__ ValueOffsetPair operator()(
+            const ValueOffsetPair &first,       ///< First partial reduction
+            const ValueOffsetPair &second)      ///< Second partial reduction
+        {
+            return (*this)(first, second, Int2Type<HAS_IDENTITY_ZERO>());
+        }
+    };
+
+    // Parameterized BlockLoad type for keys
+    typedef BlockLoad<
+            WrappedKeyInputIterator,
+            BlockRangeReduceByKeyPolicy::BLOCK_THREADS,
+            BlockRangeReduceByKeyPolicy::ITEMS_PER_THREAD,
+            BlockRangeReduceByKeyPolicy::LOAD_ALGORITHM>
+        BlockLoadKeys;
+
+    // Parameterized BlockLoad type for values
+    typedef BlockLoad<
+            WrappedValueInputIterator,
+            BlockRangeReduceByKeyPolicy::BLOCK_THREADS,
+            BlockRangeReduceByKeyPolicy::ITEMS_PER_THREAD,
+            (IS_RUN_LENGTH_ENCODE) ?
+ BLOCK_LOAD_DIRECT : + (BlockLoadAlgorithm) BlockRangeReduceByKeyPolicy::LOAD_ALGORITHM> + BlockLoadValues; + + // Parameterized BlockExchange type for locally compacting items as part of a two-phase scatter + typedef BlockExchange< + Key, + BLOCK_THREADS, + ITEMS_PER_THREAD> + BlockExchangeKeys; + + // Parameterized BlockExchange type for locally compacting items as part of a two-phase scatter + typedef BlockExchange< + Value, + BLOCK_THREADS, + ITEMS_PER_THREAD> + BlockExchangeValues; + + // Parameterized BlockDiscontinuity type for keys + typedef BlockDiscontinuity BlockDiscontinuityKeys; + + // Parameterized BlockScan type + typedef BlockScan< + ValueOffsetPair, + BlockRangeReduceByKeyPolicy::BLOCK_THREADS, + BlockRangeReduceByKeyPolicy::SCAN_ALGORITHM> + BlockScanAllocations; + + // Callback type for obtaining tile prefix during block scan + typedef BlockScanLookbackPrefixOp< + ValueOffsetPair, + ReduceByKeyOp, + ScanTileState> + LookbackPrefixCallbackOp; + + // Shared memory type for this threadblock + struct _TempStorage + { + + union + { + struct + { + typename BlockScanAllocations::TempStorage scan; // Smem needed for tile scanning + typename LookbackPrefixCallbackOp::TempStorage prefix; // Smem needed for cooperative prefix callback + typename BlockDiscontinuityKeys::TempStorage discontinuity; // Smem needed for discontinuity detection + typename BlockLoadKeys::TempStorage load_keys; // Smem needed for loading keys + + Offset tile_idx; // Shared tile index + Offset tile_num_flags_prefix; // Exclusive tile prefix + }; + + // Smem needed for loading values + typename BlockLoadValues::TempStorage load_values; + + // Smem needed for compacting values + typename BlockExchangeValues::TempStorage exchange_values; + + // Smem needed for compacting keys + typename BlockExchangeKeys::TempStorage exchange_keys; + }; + + }; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + _TempStorage &temp_storage; ///< Reference to temp_storage + + WrappedKeyInputIterator d_keys_in; ///< Input keys + KeyOutputIterator d_keys_out; ///< Output keys + + WrappedValueInputIterator d_values_in; ///< Input values + ValueOutputIterator d_values_out; ///< Output values + + InequalityWrapper inequality_op; ///< Key inequality operator + ReduceByKeyOp scan_op; ///< Reduce-value-by flag scan operator + Offset num_items; ///< Total number of input items + + + //--------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------- + + // Constructor + __device__ __forceinline__ + BlockRangeReduceByKey( + TempStorage &temp_storage, ///< Reference to temp_storage + KeyInputIterator d_keys_in, ///< Input keys + KeyOutputIterator d_keys_out, ///< Output keys + ValueInputIterator d_values_in, ///< Input values + ValueOutputIterator d_values_out, ///< Output values + EqualityOp equality_op, ///< Key equality operator + ReductionOp reduction_op, ///< Value reduction operator + Offset num_items) ///< Total number of input items + : + temp_storage(temp_storage.Alias()), + d_keys_in(d_keys_in), + d_keys_out(d_keys_out), + d_values_in(d_values_in), + d_values_out(d_values_out), + inequality_op(equality_op), + scan_op(reduction_op), + num_items(num_items) + {} + + + 
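+    //---------------------------------------------------------------------
+    // Worked example (editor's illustration; not part of the upstream CUB
+    // source). The ReduceByKeyOp scan operator above combines
+    // (value, offset) pairs in which 'offset' counts segment heads seen so
+    // far. Assuming cub::Sum as the wrapped reduction operator:
+    //
+    //   combine((3,1), (5,0)) -> (8,1)   // no new head in 'second';
+    //                                    // values accumulate across it
+    //   combine((3,1), (5,1)) -> (5,2)   // 'second' starts a new segment,
+    //                                    // so its value restarts the
+    //                                    // running aggregate
+    //
+    // Pairing each value with a head count is what keeps the operator
+    // associative, so it can be applied across tile boundaries during the
+    // decoupled look-back performed by LookbackPrefixCallbackOp.
+    //---------------------------------------------------------------------
+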
//--------------------------------------------------------------------- + // Block scan utility methods + //--------------------------------------------------------------------- + + /** + * Scan with identity (first tile) + */ + __device__ __forceinline__ + void ScanBlock( + ValueOffsetPair (&values_and_segments)[ITEMS_PER_THREAD], + ValueOffsetPair &block_aggregate, + Int2Type has_identity) + { + ValueOffsetPair identity; + identity.value = 0; + identity.offset = 0; + BlockScanAllocations(temp_storage.scan).ExclusiveScan(values_and_segments, values_and_segments, identity, scan_op, block_aggregate); + } + + /** + * Scan without identity (first tile). Without an identity, the first output item is undefined. + * + */ + __device__ __forceinline__ + void ScanBlock( + ValueOffsetPair (&values_and_segments)[ITEMS_PER_THREAD], + ValueOffsetPair &block_aggregate, + Int2Type has_identity) + { + BlockScanAllocations(temp_storage.scan).ExclusiveScan(values_and_segments, values_and_segments, scan_op, block_aggregate); + } + + /** + * Scan with identity (subsequent tile) + */ + __device__ __forceinline__ + void ScanBlock( + ValueOffsetPair (&values_and_segments)[ITEMS_PER_THREAD], + ValueOffsetPair &block_aggregate, + LookbackPrefixCallbackOp &prefix_op, + Int2Type has_identity) + { + ValueOffsetPair identity; + identity.value = 0; + identity.offset = 0; + BlockScanAllocations(temp_storage.scan).ExclusiveScan(values_and_segments, values_and_segments, identity, scan_op, block_aggregate, prefix_op); + } + + /** + * Scan without identity (subsequent tile). Without an identity, the first output item is undefined. + */ + __device__ __forceinline__ + void ScanBlock( + ValueOffsetPair (&values_and_segments)[ITEMS_PER_THREAD], + ValueOffsetPair &block_aggregate, + LookbackPrefixCallbackOp &prefix_op, + Int2Type has_identity) + { + BlockScanAllocations(temp_storage.scan).ExclusiveScan(values_and_segments, values_and_segments, scan_op, block_aggregate, prefix_op); + } + + + //--------------------------------------------------------------------- + // Zip utility methods + //--------------------------------------------------------------------- + + template + __device__ __forceinline__ void ZipValuesAndFlags( + Offset num_remaining, + Value (&values)[ITEMS_PER_THREAD], + Offset (&flags)[ITEMS_PER_THREAD], + ValueOffsetPair (&values_and_segments)[ITEMS_PER_THREAD]) + { + // Zip values and flags + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + // Unset flags for out-of-bounds keys + if ((LAST_TILE) && (Offset(threadIdx.x * ITEMS_PER_THREAD) + ITEM >= num_remaining)) + flags[ITEM] = 0; + + values_and_segments[ITEM].value = values[ITEM]; + values_and_segments[ITEM].offset = flags[ITEM]; + } + } + + //--------------------------------------------------------------------- + // Scatter utility methods + //--------------------------------------------------------------------- + + + + /** + * Scatter flagged items to output offsets (specialized for direct scattering) + * + * The exclusive scan causes each head flag to be paired with the previous + * value aggregate. 
As such: + * - The scatter offsets must be decremented for value value aggregates + * - The first tile does not scatter the first flagged value (it is undefined from the exclusive scan) + * - If the tile is partially-full, we need to scatter the first out-of-bounds value (which aggregates all valid values in the last segment) + * + */ + template + __device__ __forceinline__ void ScatterDirect( + Offset num_remaining, + Key (&keys)[ITEMS_PER_THREAD], + ValueOffsetPair (&values_and_segments)[ITEMS_PER_THREAD], + Offset (&flags)[ITEMS_PER_THREAD], + Offset tile_num_flags, + Int2Type iteration) + { + // Scatter key + if (flags[ITEM]) + { + d_keys_out[values_and_segments[ITEM].offset] = keys[ITEM]; + } + + bool is_first_flag = FIRST_TILE && (ITEM == 0) && (threadIdx.x == 0); + bool is_oob_value = (LAST_TILE) && (Offset(threadIdx.x * ITEMS_PER_THREAD) + ITEM == num_remaining); + + // Scatter value reduction + if (((flags[ITEM] || is_oob_value)) && (!is_first_flag)) + { + d_values_out[values_and_segments[ITEM].offset - 1] = values_and_segments[ITEM].value; + } + + ScatterDirect(num_remaining, keys, values_and_segments, flags, tile_num_flags, Int2Type()); + } + + template + __device__ __forceinline__ void ScatterDirect( + Offset num_remaining, + Key (&keys)[ITEMS_PER_THREAD], + ValueOffsetPair (&values_and_segments)[ITEMS_PER_THREAD], + Offset (&flags)[ITEMS_PER_THREAD], + Offset tile_num_flags, + Int2Type iteration) + {} + + /** + * Scatter flagged items to output offsets (specialized for two-phase scattering) + * + * The exclusive scan causes each head flag to be paired with the previous + * value aggregate. As such: + * - The scatter offsets must be decremented for value value aggregates + * - The first tile does not scatter the first flagged value (it is undefined from the exclusive scan) + * - If the tile is partially-full, we need to scatter the first out-of-bounds value (which aggregates all valid values in the last segment) + * + */ + template + __device__ __forceinline__ void ScatterTwoPhase( + Offset num_remaining, + Key (&keys)[ITEMS_PER_THREAD], + ValueOffsetPair (&values_and_segments)[ITEMS_PER_THREAD], + Offset (&flags)[ITEMS_PER_THREAD], + Offset tile_num_flags, + Offset tile_num_flags_prefix) + { + int local_ranks[ITEMS_PER_THREAD]; + Value values[ITEMS_PER_THREAD]; + + // Share exclusive tile prefix + if (threadIdx.x == 0) + { + temp_storage.tile_num_flags_prefix = tile_num_flags_prefix; + } + + __syncthreads(); + + // Load exclusive tile prefix in all threads + tile_num_flags_prefix = temp_storage.tile_num_flags_prefix; + + __syncthreads(); + + // Compute local scatter ranks + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + local_ranks[ITEM] = values_and_segments[ITEM].offset - tile_num_flags_prefix; + } + + // Compact keys in shared memory + BlockExchangeKeys(temp_storage.exchange_keys).ScatterToStriped(keys, local_ranks, flags); + + // Scatter keys + StoreDirectStriped(threadIdx.x, d_keys_out + tile_num_flags_prefix, keys, tile_num_flags); + + // Unzip values and set flag for first oob item in last tile + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + values[ITEM] = values_and_segments[ITEM].value; + + if (FIRST_TILE) + local_ranks[ITEM]--; + + if (LAST_TILE && (Offset(threadIdx.x * ITEMS_PER_THREAD) + ITEM == num_remaining)) + flags[ITEM] = 1; + } + + // Unset first flag in first tile + if (FIRST_TILE && (threadIdx.x == 0)) + flags[0] = 0; + + __syncthreads(); + + // Compact values in shared memory + 
BlockExchangeValues(temp_storage.exchange_values).ScatterToStriped(values, local_ranks, flags); + + // Number to output + Offset exchange_count = tile_num_flags; + + if (LAST_TILE && (num_remaining < TILE_ITEMS)) + exchange_count++; + + if (FIRST_TILE) + { + exchange_count--; + } + else + { + tile_num_flags_prefix--; + } + + // Scatter values + StoreDirectStriped(threadIdx.x, d_values_out + tile_num_flags_prefix, values, exchange_count); + + __syncthreads(); + } + + + /** + * Scatter flagged items + */ + template + __device__ __forceinline__ void Scatter( + Offset num_remaining, + Key (&keys)[ITEMS_PER_THREAD], + ValueOffsetPair (&values_and_segments)[ITEMS_PER_THREAD], + Offset (&flags)[ITEMS_PER_THREAD], + Offset tile_num_flags, + Offset tile_num_flags_prefix) + { + // Do a one-phase scatter if (a) two-phase is disabled or (b) the average number of selected items per thread is less than one + if ((TWO_PHASE_SCATTER) && ((tile_num_flags >> Log2::VALUE) > 0)) + { + ScatterTwoPhase( + num_remaining, + keys, + values_and_segments, + flags, + tile_num_flags, + tile_num_flags_prefix); + } + else + { + ScatterDirect( + num_remaining, + keys, + values_and_segments, + flags, + tile_num_flags, + Int2Type<0>()); + } + } + + + //--------------------------------------------------------------------- + // Cooperatively scan a device-wide sequence of tiles with other CTAs + //--------------------------------------------------------------------- + + /** + * Process a tile of input (dynamic domino scan) + */ + template < + bool LAST_TILE> + __device__ __forceinline__ ValueOffsetPair ConsumeTile( + Offset num_items, ///< Total number of global input items + Offset num_remaining, ///< Number of global input items remaining (including this tile) + int tile_idx, ///< Tile index + Offset block_offset, ///< Tile offset + ScanTileState &tile_status) ///< Global list of tile status + { + Key keys[ITEMS_PER_THREAD]; // Tile keys + Value values[ITEMS_PER_THREAD]; // Tile values + Offset flags[ITEMS_PER_THREAD]; // Segment head flags + ValueOffsetPair values_and_segments[ITEMS_PER_THREAD]; // Zipped values and segment flags|indices + + ValueOffsetPair running_total; // Running count of segments and current value aggregate (including this tile) + + if (tile_idx == 0) + { + // First tile + + // Load keys and values + if (LAST_TILE) + { + BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + block_offset, keys, num_remaining); + } + else + { + BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + block_offset, keys); + } + + if (SYNC_AFTER_LOAD) + __syncthreads(); + + // Load values + if (LAST_TILE) + BlockLoadValues(temp_storage.load_values).Load(d_values_in + block_offset, values, num_remaining); + else + BlockLoadValues(temp_storage.load_values).Load(d_values_in + block_offset, values); + + if (SYNC_AFTER_LOAD) + __syncthreads(); + + // Set head flags. 
First tile sets the first flag for the first item + BlockDiscontinuityKeys(temp_storage.discontinuity).FlagHeads(flags, keys, inequality_op); + + // Zip values and flags + ZipValuesAndFlags(num_remaining, values, flags, values_and_segments); + + // Exclusive scan of values and flags + ValueOffsetPair block_aggregate; + ScanBlock(values_and_segments, block_aggregate, Int2Type()); + + // Update tile status if this is not the last tile + if (!LAST_TILE && (threadIdx.x == 0)) + tile_status.SetInclusive(0, block_aggregate); + + // Set offset for first scan output + if (!HAS_IDENTITY_ZERO && (threadIdx.x == 0)) + values_and_segments[0].offset = 0; + + running_total = block_aggregate; + + // Scatter flagged items + Scatter(num_remaining, keys, values_and_segments, flags, block_aggregate.offset, 0); + } + else + { + // Not first tile + + // Load keys and values + if (LAST_TILE) + { + BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + block_offset, keys, num_remaining); + } + else + { + BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + block_offset, keys); + } + + if (SYNC_AFTER_LOAD) + __syncthreads(); + + // Load values + if (LAST_TILE) + BlockLoadValues(temp_storage.load_values).Load(d_values_in + block_offset, values, num_remaining); + else + BlockLoadValues(temp_storage.load_values).Load(d_values_in + block_offset, values); + + if (SYNC_AFTER_LOAD) + __syncthreads(); + + // Obtain the last key in the previous tile to compare with + Key tile_predecessor_key = (threadIdx.x == 0) ? + d_keys_in[block_offset - 1] : + ZeroInitialize(); + + // Set head flags + BlockDiscontinuityKeys(temp_storage.discontinuity).FlagHeads(flags, keys, inequality_op, tile_predecessor_key); + + // Zip values and flags + ZipValuesAndFlags(num_remaining, values, flags, values_and_segments); + + // Exclusive scan of values and flags + ValueOffsetPair block_aggregate; + LookbackPrefixCallbackOp prefix_op(tile_status, temp_storage.prefix, scan_op, tile_idx); + + ScanBlock(values_and_segments, block_aggregate, prefix_op, Int2Type()); + running_total = prefix_op.inclusive_prefix; + + // Scatter flagged items + Scatter(num_remaining, keys, values_and_segments, flags, block_aggregate.offset, prefix_op.exclusive_prefix.offset); + } + + return running_total; + } + + + /** + * Dequeue and scan tiles of items as part of a dynamic domino scan + */ + template ///< Output iterator type for recording number of items selected + __device__ __forceinline__ void ConsumeRange( + int num_tiles, ///< Total number of input tiles + GridQueue queue, ///< Queue descriptor for assigning tiles of work to thread blocks + ScanTileState &tile_status, ///< Global list of tile status + NumSegmentsIterator d_num_segments) ///< Output pointer for total number of segments identified + { +#if (CUB_PTX_ARCH <= 130) + // Blocks are launched in increasing order, so just assign one tile per block + + int tile_idx = (blockIdx.y * 32 * 1024) + blockIdx.x; // Current tile index + Offset block_offset = Offset(TILE_ITEMS) * tile_idx; // Global offset for the current tile + Offset num_remaining = num_items - block_offset; // Remaining items (including this tile) + + if (num_remaining > TILE_ITEMS) + { + // Full tile + ConsumeTile(num_items, num_remaining, tile_idx, block_offset, tile_status); + } + else if (num_remaining > 0) + { + // Last tile + ValueOffsetPair running_total = ConsumeTile(num_items, num_remaining, tile_idx, block_offset, tile_status); + + // Output the total number of items selected + if (threadIdx.x == 0) + { + *d_num_segments = 
running_total.offset; + + // If the last tile is a whole tile, the inclusive prefix contains accumulated value reduction for the last segment + if (num_remaining == TILE_ITEMS) + { + d_values_out[running_total.offset - 1] = running_total.value; + } + } + } +#else + // Blocks may not be launched in increasing order, so work-steal tiles + + // Get first tile index + if (threadIdx.x == 0) + temp_storage.tile_idx = queue.Drain(1); + + __syncthreads(); + + int tile_idx = temp_storage.tile_idx; + Offset block_offset = Offset(TILE_ITEMS) * tile_idx; // Global offset for the current tile + Offset num_remaining = num_items - block_offset; // Remaining items (including this tile) + + while (num_remaining > TILE_ITEMS) + { + if (SYNC_AFTER_LOAD) + __syncthreads(); + + // Consume full tile + ConsumeTile(num_items, num_remaining, tile_idx, block_offset, tile_status); + + // Get tile index + if (threadIdx.x == 0) + temp_storage.tile_idx = queue.Drain(1); + + __syncthreads(); + + tile_idx = temp_storage.tile_idx; + block_offset = Offset(TILE_ITEMS) * tile_idx; + num_remaining = num_items - block_offset; + } + + if (num_remaining > 0) + { + // Consume last tile (treat as partially-full) + ValueOffsetPair running_total = ConsumeTile(num_items, num_remaining, tile_idx, block_offset, tile_status); + + if ((threadIdx.x == 0)) + { + // Output the total number of items selected + *d_num_segments = running_total.offset; + + // If the last tile is a whole tile, the inclusive prefix contains accumulated value reduction for the last segment + if (num_remaining == TILE_ITEMS) + { + d_values_out[running_total.offset - 1] = running_total.value; + } + } + } +#endif + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/SRC/cub/block_range/block_range_scan.cuh b/SRC/cub/block_range/block_range_scan.cuh new file mode 100644 index 00000000..77d44d11 --- /dev/null +++ b/SRC/cub/block_range/block_range_scan.cuh @@ -0,0 +1,538 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockRangeScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan across a range of tiles. + */ + +#pragma once + +#include + +#include "block_scan_prefix_operators.cuh" +#include "../block/block_load.cuh" +#include "../block/block_store.cuh" +#include "../block/block_scan.cuh" +#include "../grid/grid_queue.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for BlockRangeScan + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use + bool _LOAD_WARP_TIME_SLICING, ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage) + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements + BlockStoreAlgorithm _STORE_ALGORITHM, ///< The BlockStore algorithm to use + bool _STORE_WARP_TIME_SLICING, ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage) + BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use +struct BlockRangeScanPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + LOAD_WARP_TIME_SLICING = _LOAD_WARP_TIME_SLICING, ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage) + STORE_WARP_TIME_SLICING = _STORE_WARP_TIME_SLICING, ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage) + }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements + static const BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM; ///< The BlockStore algorithm to use + static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use +}; + + + + 
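+// Example (editor's sketch, not part of the upstream header): a plausible
+// instantiation of the tuning policy above. The parameter values and the
+// alias name 'ScanPolicy128' are illustrative assumptions only.
+//
+//   typedef BlockRangeScanPolicy<
+//       128,                          // _BLOCK_THREADS
+//       12,                           // _ITEMS_PER_THREAD
+//       BLOCK_LOAD_WARP_TRANSPOSE,    // _LOAD_ALGORITHM
+//       false,                        // _LOAD_WARP_TIME_SLICING
+//       LOAD_DEFAULT,                 // _LOAD_MODIFIER
+//       BLOCK_STORE_WARP_TRANSPOSE,   // _STORE_ALGORITHM
+//       false,                        // _STORE_WARP_TIME_SLICING
+//       BLOCK_SCAN_RAKING_MEMOIZE>    // _SCAN_ALGORITHM
+//       ScanPolicy128;
+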
+/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief BlockRangeScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan across a range of tiles. + */ +template < + typename BlockRangeScanPolicy, ///< Parameterized BlockRangeScanPolicy tuning policy type + typename InputIterator, ///< Random-access input iterator type + typename OutputIterator, ///< Random-access output iterator type + typename ScanOp, ///< Scan functor type + typename Identity, ///< Identity element type (cub::NullType for inclusive scan) + typename Offset> ///< Signed integer type for global offsets +struct BlockRangeScan +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // Data type of input iterator + typedef typename std::iterator_traits::value_type T; + + // Tile status descriptor interface type + typedef ScanTileState ScanTileState; + + // Input iterator wrapper type + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedInputIterator + InputIterator>::Type // Directly use the supplied input iterator type + WrappedInputIterator; + + // Constants + enum + { + INCLUSIVE = Equals::VALUE, // Inclusive scan if no identity type is provided + BLOCK_THREADS = BlockRangeScanPolicy::BLOCK_THREADS, + ITEMS_PER_THREAD = BlockRangeScanPolicy::ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + }; + + // Parameterized BlockLoad type + typedef BlockLoad< + WrappedInputIterator, + BlockRangeScanPolicy::BLOCK_THREADS, + BlockRangeScanPolicy::ITEMS_PER_THREAD, + BlockRangeScanPolicy::LOAD_ALGORITHM, + BlockRangeScanPolicy::LOAD_WARP_TIME_SLICING> + BlockLoadT; + + // Parameterized BlockStore type + typedef BlockStore< + OutputIterator, + BlockRangeScanPolicy::BLOCK_THREADS, + BlockRangeScanPolicy::ITEMS_PER_THREAD, + BlockRangeScanPolicy::STORE_ALGORITHM, + BlockRangeScanPolicy::STORE_WARP_TIME_SLICING> + BlockStoreT; + + // Parameterized BlockScan type + typedef BlockScan< + T, + BlockRangeScanPolicy::BLOCK_THREADS, + BlockRangeScanPolicy::SCAN_ALGORITHM> + BlockScanT; + + // Callback type for obtaining tile prefix during block scan + typedef BlockScanLookbackPrefixOp< + T, + ScanOp, + ScanTileState> + LookbackPrefixCallbackOp; + + // Stateful BlockScan prefix callback type for managing a running total while scanning consecutive tiles + typedef BlockScanRunningPrefixOp< + T, + ScanOp> + RunningPrefixCallbackOp; + + // Shared memory type for this threadblock + struct _TempStorage + { + union + { + typename BlockLoadT::TempStorage load; // Smem needed for tile loading + typename BlockStoreT::TempStorage store; // Smem needed for tile storing + struct + { + typename LookbackPrefixCallbackOp::TempStorage prefix; // Smem needed for cooperative prefix callback + typename BlockScanT::TempStorage scan; // Smem needed for tile scanning + }; + }; + + Offset tile_idx; // Shared tile index + }; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + _TempStorage &temp_storage; ///< Reference to temp_storage + 
WrappedInputIterator d_in; ///< Input data + OutputIterator d_out; ///< Output data + ScanOp scan_op; ///< Binary scan operator + Identity identity; ///< Identity element + + + + //--------------------------------------------------------------------- + // Block scan utility methods (first tile) + //--------------------------------------------------------------------- + + /** + * Exclusive scan specialization + */ + template + __device__ __forceinline__ + void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, _Identity identity, T& block_aggregate) + { + BlockScanT(temp_storage.scan).ExclusiveScan(items, items, identity, scan_op, block_aggregate); + } + + /** + * Exclusive sum specialization + */ + template + __device__ __forceinline__ + void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, _Identity identity, T& block_aggregate) + { + BlockScanT(temp_storage.scan).ExclusiveSum(items, items, block_aggregate); + } + + /** + * Inclusive scan specialization + */ + template + __device__ __forceinline__ + void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, NullType identity, T& block_aggregate) + { + BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, block_aggregate); + } + + /** + * Inclusive sum specialization + */ + __device__ __forceinline__ + void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, NullType identity, T& block_aggregate) + { + BlockScanT(temp_storage.scan).InclusiveSum(items, items, block_aggregate); + } + + //--------------------------------------------------------------------- + // Block scan utility methods (subsequent tiles) + //--------------------------------------------------------------------- + + /** + * Exclusive scan specialization (with prefix from predecessors) + */ + template + __device__ __forceinline__ + void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, _Identity identity, T& block_aggregate, PrefixCallback &prefix_op) + { + BlockScanT(temp_storage.scan).ExclusiveScan(items, items, identity, scan_op, block_aggregate, prefix_op); + } + + /** + * Exclusive sum specialization (with prefix from predecessors) + */ + template + __device__ __forceinline__ + void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, _Identity identity, T& block_aggregate, PrefixCallback &prefix_op) + { + BlockScanT(temp_storage.scan).ExclusiveSum(items, items, block_aggregate, prefix_op); + } + + /** + * Inclusive scan specialization (with prefix from predecessors) + */ + template + __device__ __forceinline__ + void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, NullType identity, T& block_aggregate, PrefixCallback &prefix_op) + { + BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, block_aggregate, prefix_op); + } + + /** + * Inclusive sum specialization (with prefix from predecessors) + */ + template + __device__ __forceinline__ + void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, NullType identity, T& block_aggregate, PrefixCallback &prefix_op) + { + BlockScanT(temp_storage.scan).InclusiveSum(items, items, block_aggregate, prefix_op); + } + + + //--------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------- + + // Constructor + __device__ __forceinline__ + BlockRangeScan( + TempStorage &temp_storage, ///< Reference to temp_storage + InputIterator d_in, ///< Input data + OutputIterator d_out, ///< Output data + ScanOp scan_op, ///< Binary scan operator + Identity identity) ///< Identity element + : + 
temp_storage(temp_storage.Alias()), + d_in(d_in), + d_out(d_out), + scan_op(scan_op), + identity(identity) + {} + + + //--------------------------------------------------------------------- + // Cooperatively scan a device-wide sequence of tiles with other CTAs + //--------------------------------------------------------------------- + + /** + * Process a tile of input (dynamic domino scan) + */ + template + __device__ __forceinline__ void ConsumeTile( + Offset num_items, ///< Total number of input items + Offset num_remaining, ///< Total number of items remaining to be processed (including this tile) + int tile_idx, ///< Tile index + Offset block_offset, ///< Tile offset + ScanTileState &tile_status) ///< Global list of tile status + { + // Load items + T items[ITEMS_PER_THREAD]; + + if (LAST_TILE) + BlockLoadT(temp_storage.load).Load(d_in + block_offset, items, num_remaining); + else + BlockLoadT(temp_storage.load).Load(d_in + block_offset, items); + + __syncthreads(); + + // Perform tile scan + if (tile_idx == 0) + { + // Scan first tile + T block_aggregate; + ScanBlock(items, scan_op, identity, block_aggregate); + + // Update tile status if there may be successor tiles (i.e., this tile is full) + if (!LAST_TILE && (threadIdx.x == 0)) + tile_status.SetInclusive(0, block_aggregate); + } + else + { + // Scan non-first tile + T block_aggregate; + LookbackPrefixCallbackOp prefix_op(tile_status, temp_storage.prefix, scan_op, tile_idx); + ScanBlock(items, scan_op, identity, block_aggregate, prefix_op); + } + + __syncthreads(); + + // Store items + if (LAST_TILE) + BlockStoreT(temp_storage.store).Store(d_out + block_offset, items, num_remaining); + else + BlockStoreT(temp_storage.store).Store(d_out + block_offset, items); + } + + + /** + * Dequeue and scan tiles of items as part of a dynamic domino scan + */ + __device__ __forceinline__ void ConsumeRange( + int num_items, ///< Total number of input items + GridQueue queue, ///< Queue descriptor for assigning tiles of work to thread blocks + ScanTileState &tile_status) ///< Global list of tile status + { +#if (CUB_PTX_ARCH <= 130) + // Blocks are launched in increasing order, so just assign one tile per block + + int tile_idx = (blockIdx.y * 32 * 1024) + blockIdx.x; // Current tile index + Offset block_offset = Offset(TILE_ITEMS) * tile_idx; // Global offset for the current tile + Offset num_remaining = num_items - block_offset; // Remaining items (including this tile) + + if (block_offset + TILE_ITEMS <= num_items) + ConsumeTile(num_items, num_remaining, tile_idx, block_offset, tile_status); + else if (block_offset < num_items) + ConsumeTile(num_items, num_remaining, tile_idx, block_offset, tile_status); + +#else + // Blocks may not be launched in increasing order, so work-steal tiles + + // Get first tile index + if (threadIdx.x == 0) + temp_storage.tile_idx = queue.Drain(1); + + __syncthreads(); + + int tile_idx = temp_storage.tile_idx; + Offset block_offset = TILE_ITEMS * tile_idx; + Offset num_remaining = num_items - block_offset; + + while (num_remaining >= TILE_ITEMS) + { + // Consume full tile + ConsumeTile(num_items, num_remaining, tile_idx, block_offset, tile_status); + + // Get next tile + if (threadIdx.x == 0) + temp_storage.tile_idx = queue.Drain(1); + + __syncthreads(); + + tile_idx = temp_storage.tile_idx; + block_offset = TILE_ITEMS * tile_idx; + num_remaining = num_items - block_offset; + } + + // Consume the last (and potentially partially-full) tile + if (num_remaining > 0) + { + ConsumeTile(num_items, num_remaining, 
tile_idx, block_offset, tile_status);
+        }
+
+#endif
+    }
+
+
+    //---------------------------------------------------------------------
+    // Scan a sequence of consecutive tiles (independent of other thread blocks)
+    //---------------------------------------------------------------------
+
+    /**
+     * Process a tile of input
+     */
+    template <
+        bool FULL_TILE,
+        bool FIRST_TILE>
+    __device__ __forceinline__ void ConsumeTile(
+        Offset                      block_offset,               ///< Tile offset
+        RunningPrefixCallbackOp     &prefix_op,                 ///< Running prefix operator
+        int                         valid_items = TILE_ITEMS)   ///< Number of valid items in the tile
+    {
+        // Load items
+        T items[ITEMS_PER_THREAD];
+
+        if (FULL_TILE)
+            BlockLoadT(temp_storage.load).Load(d_in + block_offset, items);
+        else
+            BlockLoadT(temp_storage.load).Load(d_in + block_offset, items, valid_items);
+
+        __syncthreads();
+
+        // Block scan
+        if (FIRST_TILE)
+        {
+            T block_aggregate;
+            ScanBlock(items, scan_op, identity, block_aggregate);
+            prefix_op.running_total = block_aggregate;
+        }
+        else
+        {
+            T block_aggregate;
+            ScanBlock(items, scan_op, identity, block_aggregate, prefix_op);
+        }
+
+        __syncthreads();
+
+        // Store items
+        if (FULL_TILE)
+            BlockStoreT(temp_storage.store).Store(d_out + block_offset, items);
+        else
+            BlockStoreT(temp_storage.store).Store(d_out + block_offset, items, valid_items);
+    }
+
+
+    /**
+     * Scan a consecutive share of input tiles
+     */
+    __device__ __forceinline__ void ConsumeRange(
+        Offset  block_offset,   ///< [in] Threadblock begin offset (inclusive)
+        Offset  block_end)      ///< [in] Threadblock end offset (exclusive)
+    {
+        BlockScanRunningPrefixOp<T, ScanOp> prefix_op(scan_op);
+
+        if (block_offset + TILE_ITEMS <= block_end)
+        {
+            // Consume first tile of input (full)
+            ConsumeTile<true, true>(block_offset, prefix_op);
+            block_offset += TILE_ITEMS;
+
+            // Consume subsequent full tiles of input
+            while (block_offset + TILE_ITEMS <= block_end)
+            {
+                ConsumeTile<true, false>(block_offset, prefix_op);
+                block_offset += TILE_ITEMS;
+            }
+
+            // Consume a partially-full tile
+            if (block_offset < block_end)
+            {
+                int valid_items = block_end - block_offset;
+                ConsumeTile<false, false>(block_offset, prefix_op, valid_items);
+            }
+        }
+        else
+        {
+            // Consume the first tile of input (partially-full)
+            int valid_items = block_end - block_offset;
+            ConsumeTile<false, true>(block_offset, prefix_op, valid_items);
+        }
+    }
+
+
+    /**
+     * Scan a consecutive share of input tiles, seeded with the specified prefix value
+     */
+    __device__ __forceinline__ void ConsumeRange(
+        Offset  block_offset,   ///< [in] Threadblock begin offset (inclusive)
+        Offset  block_end,      ///< [in] Threadblock end offset (exclusive)
+        T       prefix)         ///< [in] The prefix to apply to the scan segment
+    {
+        BlockScanRunningPrefixOp<T, ScanOp> prefix_op(prefix, scan_op);
+
+        // Consume full tiles of input
+        while (block_offset + TILE_ITEMS <= block_end)
+        {
+            ConsumeTile<true, false>(block_offset, prefix_op);
+            block_offset += TILE_ITEMS;
+        }
+
+        // Consume a partially-full tile
+        if (block_offset < block_end)
+        {
+            int valid_items = block_end - block_offset;
+            ConsumeTile<false, false>(block_offset, prefix_op, valid_items);
+        }
+    }
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/SRC/cub/block_range/block_range_select.cuh b/SRC/cub/block_range/block_range_select.cuh
new file mode 100644
index 00000000..59fb5ce2
--- /dev/null
+++ b/SRC/cub/block_range/block_range_select.cuh
@@ -0,0 +1,735 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockRangeSelect implements a stateful abstraction of CUDA thread blocks for participating in device-wide select. 
+ */ + +#pragma once + +#include + +#include "block_scan_prefix_operators.cuh" +#include "../block/block_load.cuh" +#include "../block/block_store.cuh" +#include "../block/block_scan.cuh" +#include "../block/block_exchange.cuh" +#include "../block/block_discontinuity.cuh" +#include "../grid/grid_queue.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for BlockRangeSelect + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements + bool _TWO_PHASE_SCATTER, ///< Whether or not to coalesce output values in shared memory before scattering them to global + BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use +struct BlockRangeSelectPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + TWO_PHASE_SCATTER = _TWO_PHASE_SCATTER, ///< Whether or not to coalesce output values in shared memory before scattering them to global + }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements + static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use +}; + + + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief BlockRangeSelect implements a stateful abstraction of CUDA thread blocks for participating in device-wide selection across a range of tiles + * + * Performs functor-based selection if SelectOp functor type != NullType + * Otherwise performs flag-based selection if FlagIterator's value type != NullType + * Otherwise performs discontinuity selection (keep unique) + */ +template < + typename BlockRangeSelectPolicy, ///< Parameterized BlockRangeSelectPolicy tuning policy type + typename InputIterator, ///< Random-access input iterator type for selection items + typename FlagIterator, ///< Random-access input iterator type for selections (NullType* if a selection functor or discontinuity flagging is to be used for selection) + typename OutputIterator, ///< Random-access input iterator type for selected items + typename SelectOp, ///< Selection operator type (NullType if selections or discontinuity flagging is to be used for selection) + typename EqualityOp, ///< Equality operator type (NullType if selection functor or selections is to be used for selection) + typename Offset, ///< Signed integer type for global offsets + bool KEEP_REJECTS> ///< Whether or not we push rejected items to the back of the output +struct BlockRangeSelect +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // 
Data type of input iterator + typedef typename std::iterator_traits::value_type T; + + // Data type of flag iterator + typedef typename std::iterator_traits::value_type Flag; + + // Tile status descriptor interface type + typedef ScanTileState ScanTileState; + + // Constants + enum + { + USE_SELECT_OP, + USE_SELECT_FLAGS, + USE_DISCONTINUITY, + + BLOCK_THREADS = BlockRangeSelectPolicy::BLOCK_THREADS, + ITEMS_PER_THREAD = BlockRangeSelectPolicy::ITEMS_PER_THREAD, + TWO_PHASE_SCATTER = (BlockRangeSelectPolicy::TWO_PHASE_SCATTER) && (ITEMS_PER_THREAD > 1), + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + + // Whether or not to sync after loading data + SYNC_AFTER_LOAD = (BlockRangeSelectPolicy::LOAD_ALGORITHM != BLOCK_LOAD_DIRECT), + + SELECT_METHOD = (!Equals::VALUE) ? + USE_SELECT_OP : + (!Equals::VALUE) ? + USE_SELECT_FLAGS : + USE_DISCONTINUITY + }; + + // Input iterator wrapper type + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedInputIterator + InputIterator>::Type // Directly use the supplied input iterator type + WrappedInputIterator; + + // Flag iterator wrapper type + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedInputIterator + FlagIterator>::Type // Directly use the supplied input iterator type + WrappedFlagIterator; + + // Parameterized BlockLoad type for input items + typedef BlockLoad< + WrappedInputIterator, + BlockRangeSelectPolicy::BLOCK_THREADS, + BlockRangeSelectPolicy::ITEMS_PER_THREAD, + BlockRangeSelectPolicy::LOAD_ALGORITHM> + BlockLoadT; + + // Parameterized BlockLoad type for flags + typedef BlockLoad< + WrappedFlagIterator, + BlockRangeSelectPolicy::BLOCK_THREADS, + BlockRangeSelectPolicy::ITEMS_PER_THREAD, + BlockRangeSelectPolicy::LOAD_ALGORITHM> + BlockLoadFlags; + + // Parameterized BlockExchange type for input items + typedef BlockExchange< + T, + BLOCK_THREADS, + ITEMS_PER_THREAD> + BlockExchangeT; + + // Parameterized BlockDiscontinuity type for input items + typedef BlockDiscontinuity BlockDiscontinuityT; + + // Parameterized BlockScan type + typedef BlockScan< + Offset, + BlockRangeSelectPolicy::BLOCK_THREADS, + BlockRangeSelectPolicy::SCAN_ALGORITHM> + BlockScanAllocations; + + // Callback type for obtaining tile prefix during block scan + typedef BlockScanLookbackPrefixOp< + Offset, + Sum, + ScanTileState> + LookbackPrefixCallbackOp; + + // Shared memory type for this threadblock + struct _TempStorage + { + union + { + struct + { + typename LookbackPrefixCallbackOp::TempStorage prefix; // Smem needed for cooperative prefix callback + typename BlockScanAllocations::TempStorage scan; // Smem needed for tile scanning + typename BlockDiscontinuityT::TempStorage discontinuity; // Smem needed for discontinuity detection + }; + + // Smem needed for input loading + typename BlockLoadT::TempStorage load_items; + + // Smem needed for flag loading + typename BlockLoadFlags::TempStorage load_flags; + + // Smem needed for two-phase scatter + typename If::Type exchange; + }; + + Offset tile_idx; // Shared tile index + Offset tile_num_selected_prefix; // Exclusive tile prefix + }; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + _TempStorage &temp_storage; ///< Reference to temp_storage + WrappedInputIterator 
d_in; ///< Input data + WrappedFlagIterator d_flags; ///< Input flags + OutputIterator d_out; ///< Output data + SelectOp select_op; ///< Selection operator + InequalityWrapper inequality_op; ///< Inequality operator + Offset num_items; ///< Total number of input items + + + //--------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------- + + // Constructor + __device__ __forceinline__ + BlockRangeSelect( + TempStorage &temp_storage, ///< Reference to temp_storage + InputIterator d_in, ///< Input data + FlagIterator d_flags, ///< Input flags + OutputIterator d_out, ///< Output data + SelectOp select_op, ///< Selection operator + EqualityOp equality_op, ///< Equality operator + Offset num_items) ///< Total number of input items + : + temp_storage(temp_storage.Alias()), + d_in(d_in), + d_flags(d_flags), + d_out(d_out), + select_op(select_op), + inequality_op(equality_op), + num_items(num_items) + {} + + + //--------------------------------------------------------------------- + // Utility methods for initializing the selections + //--------------------------------------------------------------------- + + /** + * Template unrolled selection via selection operator + */ + template + __device__ __forceinline__ void ApplySelectionOp( + Offset block_offset, + Offset num_remaining, + T (&items)[ITEMS_PER_THREAD], + Offset (&selected)[ITEMS_PER_THREAD], + Int2Type iteration) + { + selected[ITERATION] = 0; + if (!LAST_TILE || (Offset(threadIdx.x * ITEMS_PER_THREAD) + ITERATION < num_remaining)) + selected[ITERATION] = select_op(items[ITERATION]); + + ApplySelectionOp(block_offset, num_remaining, items, selected, Int2Type()); + } + + /** + * Template unrolled selection via selection operator + */ + template + __device__ __forceinline__ void ApplySelectionOp( + Offset block_offset, + Offset num_remaining, + T (&items)[ITEMS_PER_THREAD], + Offset (&selected)[ITEMS_PER_THREAD], + Int2Type iteration) + {} + + /** + * Initialize selections (specialized for selection operator) + */ + template + __device__ __forceinline__ void InitializeSelections( + Offset block_offset, + Offset num_remaining, + T (&items)[ITEMS_PER_THREAD], + Offset (&selected)[ITEMS_PER_THREAD], + Int2Type select_method) + { + ApplySelectionOp(block_offset, num_remaining, items, selected, Int2Type<0>()); + } + + + /** + * Initialize selections (specialized for valid flags) + */ + template + __device__ __forceinline__ void InitializeSelections( + Offset block_offset, + Offset num_remaining, + T (&items)[ITEMS_PER_THREAD], + Offset (&selected)[ITEMS_PER_THREAD], + Int2Type select_method) + { + Flag flags[ITEMS_PER_THREAD]; + + if (LAST_TILE) + BlockLoadFlags(temp_storage.load_flags).Load(d_flags + block_offset, flags, num_remaining, 0); + else + BlockLoadFlags(temp_storage.load_flags).Load(d_flags + block_offset, flags); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + selected[ITEM] = flags[ITEM]; + } + + if (SYNC_AFTER_LOAD) + __syncthreads(); + } + + + /** + * Initialize selections (specialized for discontinuity detection) + */ + template + __device__ __forceinline__ void InitializeSelections( + Offset block_offset, + Offset num_remaining, + T (&items)[ITEMS_PER_THREAD], + Offset (&selected)[ITEMS_PER_THREAD], + Int2Type select_method) + { + if (FIRST_TILE) + { + // First tile always flags the first item + BlockDiscontinuityT(temp_storage.discontinuity).FlagHeads(selected, items, inequality_op); + } + else + { + 
// Subsequent tiles require the last item from the previous tile + T tile_predecessor_item; + if (threadIdx.x == 0) + tile_predecessor_item = d_in[block_offset - 1]; + + BlockDiscontinuityT(temp_storage.discontinuity).FlagHeads(selected, items, inequality_op, tile_predecessor_item); + } + } + + + //--------------------------------------------------------------------- + // Utility methods for scattering selections + //--------------------------------------------------------------------- + + /** + * Scatter data items to select offsets (specialized for direct scattering and for discarding rejected items) + */ + template + __device__ __forceinline__ void Scatter( + Offset block_offset, + T (&items)[ITEMS_PER_THREAD], + Offset selected[ITEMS_PER_THREAD], + Offset scatter_offsets[ITEMS_PER_THREAD], + Offset tile_num_selected_prefix, + Offset tile_num_selected, + Offset num_remaining, + Int2Type keep_rejects, + Int2Type two_phase_scatter) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + if (selected[ITEM]) + { + // Selected items are placed front-to-back + d_out[scatter_offsets[ITEM]] = items[ITEM]; + } + } + } + + + /** + * Scatter data items to select offsets (specialized for direct scattering and for partitioning rejected items after selected items) + */ + template + __device__ __forceinline__ void Scatter( + Offset block_offset, + T (&items)[ITEMS_PER_THREAD], + Offset selected[ITEMS_PER_THREAD], + Offset scatter_offsets[ITEMS_PER_THREAD], + Offset tile_num_selected_prefix, + Offset tile_num_selected, + Offset num_remaining, + Int2Type keep_rejects, + Int2Type two_phase_scatter) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + if (selected[ITEM]) + { + // Selected items are placed front-to-back + d_out[scatter_offsets[ITEM]] = items[ITEM]; + } + else if (!LAST_TILE || (Offset(threadIdx.x * ITEMS_PER_THREAD) + ITEM < num_remaining)) + { + Offset global_idx = block_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM; + Offset reject_idx = global_idx - scatter_offsets[ITEM]; + + // Rejected items are placed back-to-front + d_out[num_items - reject_idx - 1] = items[ITEM]; + } + } + } + + + /** + * Scatter data items to select offsets (specialized for two-phase scattering and for discarding rejected items) + */ + template + __device__ __forceinline__ void Scatter( + Offset block_offset, + T (&items)[ITEMS_PER_THREAD], + Offset selected[ITEMS_PER_THREAD], + Offset scatter_offsets[ITEMS_PER_THREAD], + Offset tile_num_selected_prefix, + Offset tile_num_selected, + Offset num_remaining, + Int2Type keep_rejects, + Int2Type two_phase_scatter) + { + if ((tile_num_selected >> Log2::VALUE) == 0) + { + // Average number of selected items per thread is less than one, so just do a one-phase scatter + Scatter( + block_offset, + items, + selected, + scatter_offsets, + tile_num_selected_prefix, + tile_num_selected, + num_remaining, + keep_rejects, + Int2Type()); + } + else + { + // Share exclusive tile prefix + if (threadIdx.x == 0) + { + temp_storage.tile_num_selected_prefix = tile_num_selected_prefix; + } + + __syncthreads(); + + // Load exclusive tile prefix in all threads + tile_num_selected_prefix = temp_storage.tile_num_selected_prefix; + + int local_ranks[ITEMS_PER_THREAD]; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + local_ranks[ITEM] = scatter_offsets[ITEM] - tile_num_selected_prefix; + } + + BlockExchangeT(temp_storage.exchange).ScatterToStriped(items, local_ranks, selected); + + // Selected items are placed 
front-to-back + StoreDirectStriped(threadIdx.x, d_out + tile_num_selected_prefix, items, tile_num_selected); + } + } + + + /** + * Scatter data items to select offsets (specialized for two-phase scattering and for partitioning rejected items after selected items) + */ + template + __device__ __forceinline__ void Scatter( + Offset block_offset, + T (&items)[ITEMS_PER_THREAD], + Offset selected[ITEMS_PER_THREAD], + Offset scatter_offsets[ITEMS_PER_THREAD], + Offset tile_num_selected_prefix, + Offset tile_num_selected, + Offset num_remaining, + Int2Type keep_rejects, + Int2Type two_phase_scatter) + { + // Share exclusive tile prefix + if (threadIdx.x == 0) + { + temp_storage.tile_num_selected_prefix = tile_num_selected_prefix; + } + + __syncthreads(); + + // Load the exclusive tile prefix in all threads + tile_num_selected_prefix = temp_storage.tile_num_selected_prefix; + + // Determine the exclusive prefix for rejects + Offset tile_rejected_exclusive_prefix = block_offset - tile_num_selected_prefix; + + // Determine local scatter offsets + int local_ranks[ITEMS_PER_THREAD]; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + local_ranks[ITEM] = -1; + Offset global_idx = block_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM; + Offset reject_idx = global_idx - scatter_offsets[ITEM]; + + if (selected[ITEM]) + { + // Selected items + local_ranks[ITEM] = scatter_offsets[ITEM] - tile_num_selected_prefix; + } + else if (!LAST_TILE || (Offset(threadIdx.x * ITEMS_PER_THREAD) + ITEM < num_remaining)) + { + // Rejected items + local_ranks[ITEM] = (reject_idx - tile_rejected_exclusive_prefix) + tile_num_selected; + } + } + + // Coalesce selected and rejected items in shared memory, gathering in striped arrangements + if (LAST_TILE) + BlockExchangeT(temp_storage.exchange).ScatterToStripedGuarded(items, local_ranks); + else + BlockExchangeT(temp_storage.exchange).ScatterToStriped(items, local_ranks); + + // Store in striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + Offset local_idx = (ITEM * BLOCK_THREADS) + threadIdx.x; + Offset scatter_offset = tile_num_selected_prefix + local_idx; + if (local_idx >= tile_num_selected) + scatter_offset = num_items - (tile_rejected_exclusive_prefix + (local_idx - tile_num_selected)) - 1; + + if (!LAST_TILE || (local_idx < num_remaining)) + { + d_out[scatter_offset] = items[ITEM]; + } + } + } + + + //--------------------------------------------------------------------- + // Cooperatively scan a device-wide sequence of tiles with other CTAs + //--------------------------------------------------------------------- + + /** + * Process a tile of input (dynamic domino scan) + */ + template + __device__ __forceinline__ Offset ConsumeTile( + Offset num_items, ///< Total number of input items + Offset num_remaining, ///< Total number of items remaining to be processed (including this tile) + int tile_idx, ///< Tile index + Offset block_offset, ///< Tile offset + ScanTileState &tile_status) ///< Global list of tile status + { + T items[ITEMS_PER_THREAD]; + Offset selected[ITEMS_PER_THREAD]; // Selection flags + Offset scatter_offsets[ITEMS_PER_THREAD]; // Scatter offsets + Offset tile_num_selected_prefix; // Total number of selected items prior to this tile + Offset tile_num_selected; // Total number of selected items within this tile + Offset num_selected; // + + // Load items + if (LAST_TILE) + BlockLoadT(temp_storage.load_items).Load(d_in + block_offset, items, num_remaining, d_in[num_items - 1]); // Repeat 
last item + else + BlockLoadT(temp_storage.load_items).Load(d_in + block_offset, items); + + if (SYNC_AFTER_LOAD) + __syncthreads(); + + if (tile_idx == 0) + { + // Initialize selected/rejected output flags for first tile + InitializeSelections( + block_offset, + num_remaining, + items, + selected, + Int2Type()); + + // Compute scatter offsets by scanning the flags + BlockScanAllocations(temp_storage.scan).ExclusiveSum(selected, scatter_offsets, tile_num_selected); + + // Update tile status if there may be successor tiles + if (!LAST_TILE && (threadIdx.x == 0)) + tile_status.SetInclusive(0, tile_num_selected); + + tile_num_selected_prefix = 0; + num_selected = tile_num_selected; + } + else + { + // Initialize selected/rejected output flags for non-first tile + InitializeSelections( + block_offset, + num_remaining, + items, + selected, + Int2Type()); + + // Compute scatter offsets by scanning the flags + LookbackPrefixCallbackOp prefix_op(tile_status, temp_storage.prefix, Sum(), tile_idx); + BlockScanAllocations(temp_storage.scan).ExclusiveSum(selected, scatter_offsets, tile_num_selected, prefix_op); + + tile_num_selected_prefix = prefix_op.exclusive_prefix; + num_selected = prefix_op.inclusive_prefix; + } + + // Store selected items + Scatter( + block_offset, + items, + selected, + scatter_offsets, + tile_num_selected_prefix, + tile_num_selected, + num_remaining, + Int2Type(), + Int2Type()); + + // Return total number of items selected (inclusive of this tile) + return num_selected; + } + + + /** + * Dequeue and scan tiles of items as part of a dynamic domino scan + */ + template ///< Output iterator type for recording number of items selected + __device__ __forceinline__ void ConsumeRange( + int num_tiles, ///< Total number of input tiles + GridQueue queue, ///< Queue descriptor for assigning tiles of work to thread blocks + ScanTileState &tile_status, ///< Global list of tile status + NumSelectedIterator d_num_selected) ///< Output total number selected + { +#if (CUB_PTX_ARCH <= 130) + // Blocks are launched in increasing order, so just assign one tile per block + + int tile_idx = (blockIdx.y * 32 * 1024) + blockIdx.x; // Current tile index + Offset block_offset = Offset(TILE_ITEMS) * tile_idx; // Global offset for the current tile + Offset num_remaining = num_items - block_offset; // Remaining items (including this tile) + + if (num_remaining > TILE_ITEMS) + { + ConsumeTile(num_items, num_remaining, tile_idx, block_offset, tile_status); + } + else if (num_remaining > 0) + { + Offset total_selected = ConsumeTile(num_items, num_remaining, tile_idx, block_offset, tile_status); + + // Output the total number of items selected + if (threadIdx.x == 0) + { + *d_num_selected = total_selected; + } + } + +#else + // Blocks may not be launched in increasing order, so work-steal tiles + + // Get first tile index + if (threadIdx.x == 0) + temp_storage.tile_idx = queue.Drain(1); + + __syncthreads(); + + int tile_idx = temp_storage.tile_idx; + Offset block_offset = Offset(TILE_ITEMS) * tile_idx; + Offset num_remaining = num_items - block_offset; + + while (num_remaining > TILE_ITEMS) + { + // Consume full tile + ConsumeTile(num_items, num_remaining, tile_idx, block_offset, tile_status); + + // Get next tile + if (threadIdx.x == 0) + temp_storage.tile_idx = queue.Drain(1); + + __syncthreads(); + + tile_idx = temp_storage.tile_idx; + block_offset = Offset(TILE_ITEMS) * tile_idx; + num_remaining = num_items - block_offset; + } + + // Consume the last (and potentially partially-full) tile + if 
(num_remaining > 0) + { + Offset total_selected = ConsumeTile(num_items, num_remaining, tile_idx, block_offset, tile_status); + + // Output the total number of items selected + if (threadIdx.x == 0) + { + *d_num_selected = total_selected; + } + } + +#endif + + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/SRC/cub/block_range/block_scan_prefix_operators.cuh b/SRC/cub/block_range/block_scan_prefix_operators.cuh new file mode 100644 index 00000000..ba72cc2e --- /dev/null +++ b/SRC/cub/block_range/block_scan_prefix_operators.cuh @@ -0,0 +1,566 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Callback operator types for supplying BlockScan prefixes + */ + +#pragma once + +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../warp/warp_reduce.cuh" +#include "../util_arch.cuh" +#include "../util_device.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Prefix functor type for maintaining a running prefix while scanning a region + ******************************************************************************/ + +/** + * Stateful callback operator type for supplying BlockScan prefixes. + * Maintains a running prefix that can be applied to consecutive + * BlockScan operations. 
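+ *
+ * A sketch of typical usage follows (hedged: the names BlockScanT, temp_storage,
+ * d_in, d_out, num_items, and BLOCK_THREADS are assumed for illustration, with
+ * BlockScanT a parameterized cub::BlockScan type, one item per thread, and
+ * num_items a whole multiple of BLOCK_THREADS):
+ *
+ * \code
+ * // Each thread block consumes consecutive tiles, carrying the running
+ * // total forward from one ExclusiveSum call into the next
+ * BlockScanRunningPrefixOp<int, cub::Sum> prefix_op(0, cub::Sum());
+ * for (int block_offset = 0; block_offset < num_items; block_offset += BLOCK_THREADS)
+ * {
+ *     int item = d_in[block_offset + threadIdx.x];
+ *     BlockScanT(temp_storage).ExclusiveSum(item, item, prefix_op);
+ *     d_out[block_offset + threadIdx.x] = item;
+ *     __syncthreads();    // temp_storage is reused across iterations
+ * }
+ * \endcode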
+ */ +template < + typename T, ///< BlockScan value type + typename ScanOp> ///< Wrapped scan operator type +struct BlockScanRunningPrefixOp +{ + ScanOp op; ///< Wrapped scan operator + T running_total; ///< Running block-wide prefix + + /// Constructor + __device__ __forceinline__ BlockScanRunningPrefixOp(ScanOp op) + : + op(op) + {} + + /// Constructor + __device__ __forceinline__ BlockScanRunningPrefixOp( + T starting_prefix, + ScanOp op) + : + op(op), + running_total(starting_prefix) + {} + + /** + * Prefix callback operator. Returns the block-wide running_total in thread-0. + */ + __device__ __forceinline__ T operator()( + const T &block_aggregate) ///< The aggregate sum of the BlockScan inputs + { + T retval = running_total; + running_total = op(running_total, block_aggregate); + return retval; + } +}; + + +/****************************************************************************** + * Bookkeeping and prefix functor types for single-pass device-wide scan with dynamic lookback + ******************************************************************************/ + + +/** + * Enumerations of tile status + */ +enum ScanTileStatus +{ + SCAN_TILE_OOB, // Out-of-bounds (e.g., padding) + SCAN_TILE_INVALID, // Not yet processed + SCAN_TILE_PARTIAL, // Tile aggregate is available + SCAN_TILE_INCLUSIVE, // Inclusive tile prefix is available +}; + + +/** + * Tile status interface. + */ +template < + typename T, + bool SINGLE_WORD = Traits::PRIMITIVE> +struct ScanTileState; + + +/** + * Tile status interface specialized for scan status and value types + * that can be combined into one machine word that can be + * read/written coherently in a single access. + */ +template +struct ScanTileState +{ + // Status word type + typedef typename If<(sizeof(T) == 8), + long long, + typename If<(sizeof(T) == 4), + int, + typename If<(sizeof(T) == 2), + short, + char>::Type>::Type>::Type StatusWord; + + + // Unit word type + typedef typename If<(sizeof(T) == 8), + longlong2, + typename If<(sizeof(T) == 4), + int2, + typename If<(sizeof(T) == 2), + int, + uchar2>::Type>::Type>::Type TxnWord; + + + // Device word type + struct TileDescriptor + { + StatusWord status; + T value; + }; + + + // Constants + enum + { + TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS, + }; + + + // Device storage + TileDescriptor *d_tile_status; + + + /// Constructor + __host__ __device__ __forceinline__ + ScanTileState() + : + d_tile_status(NULL) + {} + + + /// Initializer + __host__ __device__ __forceinline__ + cudaError_t Init( + int num_tiles, ///< [in] Number of tiles + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t temp_storage_bytes) ///< [in] Size in bytes of \t d_temp_storage allocation + { + d_tile_status = reinterpret_cast(d_temp_storage); + return cudaSuccess; + } + + + /** + * Compute device memory needed for tile status + */ + __host__ __device__ __forceinline__ + static cudaError_t AllocationSize( + int num_tiles, ///< [in] Number of tiles + size_t &temp_storage_bytes) ///< [out] Size in bytes of \t d_temp_storage allocation + { + temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TileDescriptor); // bytes needed for tile status descriptors + return cudaSuccess; + } + + + /** + * Initialize (from device) + */ + __device__ __forceinline__ void InitializeStatus(int num_tiles) + { + int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; + if (tile_idx < num_tiles) + { + // Not-yet-set + d_tile_status[TILE_STATUS_PADDING + tile_idx].status = StatusWord(SCAN_TILE_INVALID); + } + + if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING)) + { + // Padding + d_tile_status[threadIdx.x].status = StatusWord(SCAN_TILE_OOB); + } + } + + + /** + * Update the specified tile's inclusive value and corresponding status + */ + __device__ __forceinline__ void SetInclusive(int tile_idx, T tile_inclusive) + { + TileDescriptor tile_descriptor; + tile_descriptor.status = SCAN_TILE_INCLUSIVE; + tile_descriptor.value = tile_inclusive; + + TxnWord alias; + *reinterpret_cast(&alias) = tile_descriptor; + ThreadStore(reinterpret_cast(d_tile_status + TILE_STATUS_PADDING + tile_idx), alias); + } + + + /** + * Update the specified tile's partial value and corresponding status + */ + __device__ __forceinline__ void SetPartial(int tile_idx, T tile_partial) + { + TileDescriptor tile_descriptor; + tile_descriptor.status = SCAN_TILE_PARTIAL; + tile_descriptor.value = tile_partial; + + TxnWord alias; + *reinterpret_cast(&alias) = tile_descriptor; + ThreadStore(reinterpret_cast(d_tile_status + TILE_STATUS_PADDING + tile_idx), alias); + } + + /** + * Wait for the corresponding tile to become non-invalid + */ + __device__ __forceinline__ void WaitForValid( + int tile_idx, + StatusWord &status, + T &value) + { + // Use warp-any to determine when all threads have valid status + TxnWord alias = ThreadLoad(reinterpret_cast(d_tile_status + TILE_STATUS_PADDING + tile_idx)); + TileDescriptor tile_descriptor = reinterpret_cast(alias); + + while ((tile_descriptor.status == SCAN_TILE_INVALID)) + { + alias = ThreadLoad(reinterpret_cast(d_tile_status + TILE_STATUS_PADDING + tile_idx)); + tile_descriptor = reinterpret_cast(alias); + } + + status = tile_descriptor.status; + value = tile_descriptor.value; + } + +}; + + + +/** + * Tile status interface specialized for scan status and value types that + * cannot be combined into one machine word. + */ +template +struct ScanTileState +{ + // Status word type + typedef char StatusWord; + + // Constants + enum + { + TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS, + }; + + // Device storage + StatusWord *d_tile_status; + T *d_tile_partial; + T *d_tile_inclusive; + + /// Constructor + __host__ __device__ __forceinline__ + ScanTileState() + : + d_tile_status(NULL), + d_tile_partial(NULL), + d_tile_inclusive(NULL) + {} + + + /// Initializer + __host__ __device__ __forceinline__ + cudaError_t Init( + int num_tiles, ///< [in] Number of tiles + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t temp_storage_bytes) ///< [in] Size in bytes of \t d_temp_storage allocation + { + cudaError_t error = cudaSuccess; + do + { + void* allocations[3]; + size_t allocation_sizes[3]; + + allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord); // bytes needed for tile status descriptors + allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // bytes needed for partials + allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // bytes needed for inclusives + + // Compute allocation pointers into the single storage blob + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + + // Alias the offsets + d_tile_status = reinterpret_cast(allocations[0]); + d_tile_partial = reinterpret_cast(allocations[1]); + d_tile_inclusive = reinterpret_cast(allocations[2]); + } + while (0); + + return error; + } + + + /** + * Compute device memory needed for tile status + */ + __host__ __device__ __forceinline__ + static cudaError_t AllocationSize( + int num_tiles, ///< [in] Number of tiles + size_t &temp_storage_bytes) ///< [out] Size in bytes of \t d_temp_storage allocation + { + // Specify storage allocation requirements + size_t allocation_sizes[3]; + allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord); // bytes needed for tile status descriptors + allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // bytes needed for partials + allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // bytes needed for inclusives + + // Set the necessary size of the blob + void* allocations[3]; + return CubDebug(AliasTemporaries(NULL, temp_storage_bytes, allocations, allocation_sizes)); + } + + + /** + * Initialize (from device) + */ + __device__ __forceinline__ void InitializeStatus(int num_tiles) + { + int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; + if (tile_idx < num_tiles) + { + // Not-yet-set + d_tile_status[TILE_STATUS_PADDING + tile_idx] = StatusWord(SCAN_TILE_INVALID); + } + + if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING)) + { + // Padding + d_tile_status[threadIdx.x] = StatusWord(SCAN_TILE_OOB); + } + } + + + /** + * Update the specified tile's inclusive value and corresponding status + */ + __device__ __forceinline__ void SetInclusive(int tile_idx, T tile_inclusive) + { + // Update tile inclusive value + ThreadStore(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx, tile_inclusive); + + // Fence + __threadfence(); + + // Update tile status + ThreadStore(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_INCLUSIVE)); + } + + + /** + * Update the specified tile's partial value and corresponding status + */ + __device__ __forceinline__ void SetPartial(int tile_idx, T tile_partial) + { + // Update tile partial value + ThreadStore(d_tile_partial + TILE_STATUS_PADDING + tile_idx, tile_partial); + + // Fence + __threadfence(); + + // Update tile status + ThreadStore(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_PARTIAL)); + } + + /** + * Wait for the corresponding tile to become non-invalid + */ + __device__ __forceinline__ void WaitForValid( + int tile_idx, + StatusWord &status, + T &value) + { + status = ThreadLoad(d_tile_status + TILE_STATUS_PADDING + tile_idx); + while (status == SCAN_TILE_INVALID) + { + status = ThreadLoad(d_tile_status + TILE_STATUS_PADDING + tile_idx); + } + + T partial = ThreadLoad(d_tile_partial + 
TILE_STATUS_PADDING + tile_idx);
+        T inclusive = ThreadLoad(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx);
+
+        value = (status == StatusWord(SCAN_TILE_PARTIAL)) ?
+            partial :
+            inclusive;
+
+    }
+};
+
+
+
+/**
+ * Stateful block-scan prefix functor.  Provides the running prefix for
+ * the current tile by using the call-back warp to wait on
+ * aggregates/prefixes from predecessor tiles to become available.
+ */
+template <
+    typename T,
+    typename ScanOp,
+    typename ScanTileState>
+struct BlockScanLookbackPrefixOp
+{
+    // Parameterized warp reduce
+    typedef WarpReduce<T> WarpReduceT;
+
+    // Temporary storage type
+    typedef typename WarpReduceT::TempStorage _TempStorage;
+
+    // Alias wrapper allowing temporary storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+    // Type of status word
+    typedef typename ScanTileState::StatusWord StatusWord;
+
+    // Scan operator for switching the scan arguments
+    struct SwizzleScanOp
+    {
+        ScanOp scan_op;
+
+        // Constructor
+        __host__ __device__ __forceinline__
+        SwizzleScanOp(ScanOp scan_op) : scan_op(scan_op) {}
+
+        // Switch the scan arguments
+        __host__ __device__ __forceinline__
+        T operator()(const T &a, const T &b)
+        {
+            return scan_op(b, a);
+        }
+    };
+
+    // Fields
+    ScanTileState   &tile_status;       ///< Interface to tile status
+    _TempStorage    &temp_storage;      ///< Reference to a warp-reduction instance
+    ScanOp          scan_op;            ///< Binary scan operator
+    int             tile_idx;           ///< The current tile index
+    T               exclusive_prefix;   ///< Exclusive prefix for the tile
+    T               inclusive_prefix;   ///< Inclusive prefix for the tile
+
+    // Constructor
+    __device__ __forceinline__
+    BlockScanLookbackPrefixOp(
+        ScanTileState   &tile_status,
+        TempStorage     &temp_storage,
+        ScanOp          scan_op,
+        int             tile_idx)
+    :
+        tile_status(tile_status),
+        temp_storage(temp_storage.Alias()),
+        scan_op(scan_op),
+        tile_idx(tile_idx) {}
+
+
+    // Block until all predecessors within the warp-wide window have non-invalid status
+    __device__ __forceinline__
+    void ProcessWindow(
+        int         predecessor_idx,        ///< Preceding tile index to inspect
+        StatusWord  &predecessor_status,    ///< [out] Preceding tile status
+        T           &window_aggregate)      ///< [out] Relevant partial reduction from this window of preceding tiles
+    {
+        T value;
+        tile_status.WaitForValid(predecessor_idx, predecessor_status, value);
+
+        // Perform a segmented reduction to get the prefix for the current window.
+        // Use the swizzled scan operator because we are now scanning *down* towards thread0.
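+        //
+        // For example, with Sum as the scan operator and a warp window whose
+        // predecessor statuses are [P, P, I, P, ...] (thread0 holding the
+        // nearest predecessor, I = SCAN_TILE_INCLUSIVE), the tail-segmented
+        // reduction delivers to thread0 the sum of the window's values up to
+        // and including the closest INCLUSIVE tile.  An INCLUSIVE value already
+        // folds in all older tiles, so the caller can stop looking back;
+        // otherwise the window slides back by another warp's width.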
+ + int tail_flag = (predecessor_status == StatusWord(SCAN_TILE_INCLUSIVE)); + + window_aggregate = WarpReduceT(temp_storage).TailSegmentedReduce( + value, + tail_flag, + SwizzleScanOp(scan_op)); + } + + + // BlockScan prefix callback functor (called by the first warp) + __device__ __forceinline__ + T operator()(T block_aggregate) + { + // Update our status with our tile-aggregate + if (threadIdx.x == 0) + { + tile_status.SetPartial(tile_idx, block_aggregate); + } + + int predecessor_idx = tile_idx - threadIdx.x - 1; + StatusWord predecessor_status; + T window_aggregate; + + // Wait for the warp-wide window of predecessor tiles to become valid + ProcessWindow(predecessor_idx, predecessor_status, window_aggregate); + + // The exclusive tile prefix starts out as the current window aggregate + exclusive_prefix = window_aggregate; + + // Keep sliding the window back until we come across a tile whose inclusive prefix is known + while (WarpAll(predecessor_status != StatusWord(SCAN_TILE_INCLUSIVE))) + { + predecessor_idx -= CUB_PTX_WARP_THREADS; + + // Update exclusive tile prefix with the window prefix + ProcessWindow(predecessor_idx, predecessor_status, window_aggregate); + exclusive_prefix = scan_op(window_aggregate, exclusive_prefix); + } + + // Compute the inclusive tile prefix and update the status for this tile + if (threadIdx.x == 0) + { + inclusive_prefix = scan_op(exclusive_prefix, block_aggregate); + tile_status.SetInclusive(tile_idx, inclusive_prefix); + } + + // Return exclusive_prefix + return exclusive_prefix; + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/SRC/cub/block_range/specializations/block_range_histo_gatomic.cuh b/SRC/cub/block_range/specializations/block_range_histo_gatomic.cuh new file mode 100644 index 00000000..ccfbd643 --- /dev/null +++ b/SRC/cub/block_range/specializations/block_range_histo_gatomic.cuh @@ -0,0 +1,184 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockRangeHistogramGlobalAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram. + */ + +#pragma once + +#include + +#include "../../util_type.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + + +/** + * BlockRangeHistogramGlobalAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using global atomics + */ +template < + typename BlockRangeHistogramPolicy, ///< Tuning policy + int BINS, ///< Number of histogram bins per channel + int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed) + int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename InputIterator, ///< The input iterator type \iterator. Must have an an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1] + typename HistoCounter, ///< Integer type for counting sample occurrences per histogram bin + typename Offset> ///< Signed integer type for global offsets +struct BlockRangeHistogramGlobalAtomic +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // Sample type + typedef typename std::iterator_traits::value_type SampleT; + + // Constants + enum + { + BLOCK_THREADS = BlockRangeHistogramPolicy::BLOCK_THREADS, + ITEMS_PER_THREAD = BlockRangeHistogramPolicy::ITEMS_PER_THREAD, + TILE_CHANNEL_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + TILE_ITEMS = TILE_CHANNEL_ITEMS * CHANNELS, + }; + + // Shared memory type required by this thread block + typedef NullType TempStorage; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + /// Reference to output histograms + HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]; + + /// Input data to reduce + InputIterator d_in; + + + //--------------------------------------------------------------------- + // Interface + //--------------------------------------------------------------------- + + /** + * Constructor + */ + __device__ __forceinline__ BlockRangeHistogramGlobalAtomic( + TempStorage &temp_storage, ///< Reference to temp_storage + InputIterator d_in, ///< Input data to reduce + HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]) ///< Reference to output histograms + : + d_in(d_in), + d_out_histograms(d_out_histograms) + {} + + + /** + * Process a single tile of input + */ + template + __device__ __forceinline__ void ConsumeTile( + Offset block_offset, ///< The offset the tile to consume + int valid_items = TILE_ITEMS) ///< The number of valid items in 
the tile + { + if (FULL_TILE) + { + // Full tile of samples to read and composite + SampleT items[ITEMS_PER_THREAD][CHANNELS]; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + #pragma unroll + for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) + { + if (CHANNEL < ACTIVE_CHANNELS) + { + items[ITEM][CHANNEL] = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL]; + } + } + } + + __threadfence_block(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + #pragma unroll + for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) + { + if (CHANNEL < ACTIVE_CHANNELS) + { + atomicAdd(d_out_histograms[CHANNEL] + items[ITEM][CHANNEL], 1); + } + } + } + } + else + { + // Only a partially-full tile of samples to read and composite + int bounds = valid_items - (threadIdx.x * CHANNELS); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + #pragma unroll + for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) + { + if (((ACTIVE_CHANNELS == CHANNELS) || (CHANNEL < ACTIVE_CHANNELS)) && ((ITEM * BLOCK_THREADS * CHANNELS) + CHANNEL < bounds)) + { + SampleT item = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL]; + atomicAdd(d_out_histograms[CHANNEL] + item, 1); + } + } + } + + } + } + + + /** + * Aggregate results into output + */ + __device__ __forceinline__ void AggregateOutput() + {} +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/SRC/cub/block_range/specializations/block_range_histo_satomic.cuh b/SRC/cub/block_range/specializations/block_range_histo_satomic.cuh new file mode 100644 index 00000000..8c625695 --- /dev/null +++ b/SRC/cub/block_range/specializations/block_range_histo_satomic.cuh @@ -0,0 +1,245 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +/** + * \file + * cub::BlockRangeHistogramSharedAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using shared atomics + */ + +#pragma once + +#include + +#include "../../util_type.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * BlockRangeHistogramSharedAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using shared atomics + */ +template < + typename BlockRangeHistogramPolicy, ///< Tuning policy + int BINS, ///< Number of histogram bins + int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed) + int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename InputIterator, ///< The input iterator type \iterator. Must have an an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1] + typename HistoCounter, ///< Integer type for counting sample occurrences per histogram bin + typename Offset> ///< Signed integer type for global offsets +struct BlockRangeHistogramSharedAtomic +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // Sample type + typedef typename std::iterator_traits::value_type SampleT; + + // Constants + enum + { + BLOCK_THREADS = BlockRangeHistogramPolicy::BLOCK_THREADS, + ITEMS_PER_THREAD = BlockRangeHistogramPolicy::ITEMS_PER_THREAD, + TILE_CHANNEL_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + TILE_ITEMS = TILE_CHANNEL_ITEMS * CHANNELS, + }; + + /// Shared memory type required by this thread block + struct _TempStorage + { + HistoCounter histograms[ACTIVE_CHANNELS][BINS + 1]; // One word of padding between channel histograms to prevent warps working on different histograms from hammering on the same bank + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + /// Reference to temp_storage + _TempStorage &temp_storage; + + /// Reference to output histograms + HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]; + + /// Input data to reduce + InputIterator d_in; + + + //--------------------------------------------------------------------- + // Interface + //--------------------------------------------------------------------- + + /** + * Constructor + */ + __device__ __forceinline__ BlockRangeHistogramSharedAtomic( + TempStorage &temp_storage, ///< Reference to temp_storage + InputIterator d_in, ///< Input data to reduce + HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]) ///< Reference to output histograms + : + temp_storage(temp_storage.Alias()), + d_in(d_in), + d_out_histograms(d_out_histograms) + { + // Initialize histogram bin counts to zeros + #pragma unroll + for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) + { + int histo_offset = 0; + + #pragma unroll + for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) + { + this->temp_storage.histograms[CHANNEL][histo_offset + threadIdx.x] = 0; + } + // Finish up with 
guarded initialization if necessary + if ((BINS % BLOCK_THREADS != 0) && (histo_offset + threadIdx.x < BINS)) + { + this->temp_storage.histograms[CHANNEL][histo_offset + threadIdx.x] = 0; + } + } + + __syncthreads(); + } + + + /** + * Process a single tile of input + */ + template + __device__ __forceinline__ void ConsumeTile( + Offset block_offset, ///< The offset the tile to consume + int valid_items = TILE_ITEMS) ///< The number of valid items in the tile + { + if (FULL_TILE) + { + // Full tile of samples to read and composite + SampleT items[ITEMS_PER_THREAD][CHANNELS]; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + #pragma unroll + for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) + { + if (CHANNEL < ACTIVE_CHANNELS) + { + items[ITEM][CHANNEL] = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL]; + } + } + } + + __threadfence_block(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + #pragma unroll + for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) + { + if (CHANNEL < ACTIVE_CHANNELS) + { + atomicAdd(temp_storage.histograms[CHANNEL] + items[ITEM][CHANNEL], 1); + } + } + } + + __threadfence_block(); + } + else + { + // Only a partially-full tile of samples to read and composite + int bounds = valid_items - (threadIdx.x * CHANNELS); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + #pragma unroll + for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) + { + if (((ACTIVE_CHANNELS == CHANNELS) || (CHANNEL < ACTIVE_CHANNELS)) && ((ITEM * BLOCK_THREADS * CHANNELS) + CHANNEL < bounds)) + { + SampleT item = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL]; + atomicAdd(temp_storage.histograms[CHANNEL] + item, 1); + } + } + } + + } + } + + + /** + * Aggregate results into output + */ + __device__ __forceinline__ void AggregateOutput() + { + // Barrier to ensure shared memory histograms are coherent + __syncthreads(); + + // Copy shared memory histograms to output + int channel_offset = (blockIdx.x * BINS); + + #pragma unroll + for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) + { + int histo_offset = 0; + + #pragma unroll + for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) + { + HistoCounter count = temp_storage.histograms[CHANNEL][histo_offset + threadIdx.x]; + + d_out_histograms[CHANNEL][channel_offset + histo_offset + threadIdx.x] = count; + } + + // Finish up with guarded initialization if necessary + if ((BINS % BLOCK_THREADS != 0) && (histo_offset + threadIdx.x < BINS)) + { + HistoCounter count = temp_storage.histograms[CHANNEL][histo_offset + threadIdx.x]; + + d_out_histograms[CHANNEL][channel_offset + histo_offset + threadIdx.x] = count; + } + } + } +}; + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/SRC/cub/block_range/specializations/block_range_histo_sort.cuh b/SRC/cub/block_range/specializations/block_range_histo_sort.cuh new file mode 100644 index 00000000..c28d1a74 --- /dev/null +++ b/SRC/cub/block_range/specializations/block_range_histo_sort.cuh @@ -0,0 +1,364 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockRangeHistogramSort implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using local sorting + */ + +#pragma once + +#include + +#include "../../block/block_radix_sort.cuh" +#include "../../block/block_discontinuity.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * BlockRangeHistogramSort implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using local sorting + */ +template < + typename BlockRangeHistogramPolicy, ///< Tuning policy + int BINS, ///< Number of histogram bins per channel + int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed) + int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename InputIterator, ///< The input iterator type \iterator. 
Must have an an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1] + typename HistoCounter, ///< Integer type for counting sample occurrences per histogram bin + typename Offset> ///< Signed integer type for global offsets +struct BlockRangeHistogramSort +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // Sample type + typedef typename std::iterator_traits::value_type SampleT; + + // Constants + enum + { + BLOCK_THREADS = BlockRangeHistogramPolicy::BLOCK_THREADS, + ITEMS_PER_THREAD = BlockRangeHistogramPolicy::ITEMS_PER_THREAD, + TILE_CHANNEL_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + TILE_ITEMS = TILE_CHANNEL_ITEMS * CHANNELS, + + STRIPED_COUNTERS_PER_THREAD = (BINS + BLOCK_THREADS - 1) / BLOCK_THREADS, + }; + + // Parameterize BlockRadixSort type for our thread block + typedef BlockRadixSort BlockRadixSortT; + + // Parameterize BlockDiscontinuity type for our thread block + typedef BlockDiscontinuity BlockDiscontinuityT; + + /// Shared memory type required by this thread block + union _TempStorage + { + // Storage for sorting bin values + typename BlockRadixSortT::TempStorage sort; + + struct + { + // Storage for detecting discontinuities in the tile of sorted bin values + typename BlockDiscontinuityT::TempStorage flag; + + // Storage for noting begin/end offsets of bin runs in the tile of sorted bin values + int run_begin[BLOCK_THREADS * STRIPED_COUNTERS_PER_THREAD]; + int run_end[BLOCK_THREADS * STRIPED_COUNTERS_PER_THREAD]; + }; + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + // Discontinuity functor + struct DiscontinuityOp + { + // Reference to temp_storage + _TempStorage &temp_storage; + + // Constructor + __device__ __forceinline__ DiscontinuityOp(_TempStorage &temp_storage) : + temp_storage(temp_storage) + {} + + // Discontinuity predicate + __device__ __forceinline__ bool operator()(const SampleT &a, const SampleT &b, int b_index) + { + if (a != b) + { + // Note the begin/end offsets in shared storage + temp_storage.run_begin[b] = b_index; + temp_storage.run_end[a] = b_index; + + return true; + } + else + { + return false; + } + } + }; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + /// Reference to temp_storage + _TempStorage &temp_storage; + + /// Histogram counters striped across threads + HistoCounter thread_counters[ACTIVE_CHANNELS][STRIPED_COUNTERS_PER_THREAD]; + + /// Reference to output histograms + HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]; + + /// Input data to reduce + InputIterator d_in; + + + //--------------------------------------------------------------------- + // Interface + //--------------------------------------------------------------------- + + /** + * Constructor + */ + __device__ __forceinline__ BlockRangeHistogramSort( + TempStorage &temp_storage, ///< Reference to temp_storage + InputIterator d_in, ///< Input data to reduce + HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]) ///< Reference to output histograms + : + temp_storage(temp_storage.Alias()), + d_in(d_in), + d_out_histograms(d_out_histograms) + { + // Initialize histogram counters striped across threads + #pragma unroll + for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) + { + #pragma unroll + for (int 
COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER) + { + thread_counters[CHANNEL][COUNTER] = 0; + } + } + } + + + /** + * Composite a tile of input items + */ + __device__ __forceinline__ void Composite( + SampleT (&items)[ITEMS_PER_THREAD], ///< Tile of samples + HistoCounter thread_counters[STRIPED_COUNTERS_PER_THREAD]) ///< Histogram counters striped across threads + { + // Sort bytes in blocked arrangement + BlockRadixSortT(temp_storage.sort).Sort(items); + + __syncthreads(); + + // Initialize the shared memory's run_begin and run_end for each bin + #pragma unroll + for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER) + { + temp_storage.run_begin[(COUNTER * BLOCK_THREADS) + threadIdx.x] = TILE_CHANNEL_ITEMS; + temp_storage.run_end[(COUNTER * BLOCK_THREADS) + threadIdx.x] = TILE_CHANNEL_ITEMS; + } + + __syncthreads(); + + // Note the begin/end run offsets of bin runs in the sorted tile + int flags[ITEMS_PER_THREAD]; // unused + DiscontinuityOp flag_op(temp_storage); + BlockDiscontinuityT(temp_storage.flag).FlagHeads(flags, items, flag_op); + + // Update begin for first item + if (threadIdx.x == 0) temp_storage.run_begin[items[0]] = 0; + + __syncthreads(); + + // Composite into histogram + // Initialize the shared memory's run_begin and run_end for each bin + #pragma unroll + for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER) + { + int bin = (COUNTER * BLOCK_THREADS) + threadIdx.x; + HistoCounter run_length = temp_storage.run_end[bin] - temp_storage.run_begin[bin]; + + thread_counters[COUNTER] += run_length; + } + } + + + /** + * Process one channel within a tile. + */ + template + __device__ __forceinline__ void ConsumeTileChannel( + int channel, + Offset block_offset, + int valid_items) + { + // Load items in striped fashion + if (FULL_TILE) + { + // Full tile of samples to read and composite + SampleT items[ITEMS_PER_THREAD]; + + // Unguarded loads + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + items[ITEM] = d_in[channel + block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS)]; + } + + // Composite our histogram data + Composite(items, thread_counters[channel]); + } + else + { + // Only a partially-full tile of samples to read and composite + SampleT items[ITEMS_PER_THREAD]; + + // Assign our tid as the bin for out-of-bounds items (to give an even distribution), and keep track of how oob items to subtract out later + int bounds = (valid_items - (threadIdx.x * CHANNELS)); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + items[ITEM] = ((ITEM * BLOCK_THREADS * CHANNELS) < bounds) ? + d_in[channel + block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS)] : + 0; + } + + // Composite our histogram data + Composite(items, thread_counters[channel]); + + __syncthreads(); + + // Correct the overcounting in the zero-bin from invalid (out-of-bounds) items + if (threadIdx.x == 0) + { + int extra = (TILE_ITEMS - valid_items) / CHANNELS; + thread_counters[channel][0] -= extra; + } + } + } + + + /** + * Template iteration over channels (to silence not-unrolled warnings for SM10-13). Inductive step. + */ + template + struct IterateChannels + { + /** + * Process one channel within a tile. 
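+     * (Inductive step of the template iteration: synchronizes first, because
+     * every channel reuses the same shared memory for sorting, then consumes
+     * one channel for this tile and recurses on the next channel until the
+     * base-case specialization below terminates the unrolling.)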
+ */ + static __device__ __forceinline__ void ConsumeTileChannel( + BlockRangeHistogramSort *cta, + Offset block_offset, + int valid_items) + { + __syncthreads(); + + cta->ConsumeTileChannel(CHANNEL, block_offset, valid_items); + + IterateChannels::ConsumeTileChannel(cta, block_offset, valid_items); + } + }; + + + /** + * Template iteration over channels (to silence not-unrolled warnings for SM10-13). Base step. + */ + template + struct IterateChannels + { + static __device__ __forceinline__ void ConsumeTileChannel(BlockRangeHistogramSort *cta, Offset block_offset, int valid_items) {} + }; + + + /** + * Process a single tile of input + */ + template + __device__ __forceinline__ void ConsumeTile( + Offset block_offset, ///< The offset the tile to consume + int valid_items = TILE_ITEMS) ///< The number of valid items in the tile + { + // First channel + ConsumeTileChannel(0, block_offset, valid_items); + + // Iterate through remaining channels + IterateChannels::ConsumeTileChannel(this, block_offset, valid_items); + } + + + /** + * Aggregate results into output + */ + __device__ __forceinline__ void AggregateOutput() + { + // Copy counters striped across threads into the histogram output + #pragma unroll + for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) + { + int channel_offset = (blockIdx.x * BINS); + + #pragma unroll + for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER) + { + int bin = (COUNTER * BLOCK_THREADS) + threadIdx.x; + + if ((STRIPED_COUNTERS_PER_THREAD * BLOCK_THREADS == BINS) || (bin < BINS)) + { + d_out_histograms[CHANNEL][channel_offset + bin] = thread_counters[CHANNEL][COUNTER]; + } + } + } + } +}; + + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/SRC/cub/cub.cuh b/SRC/cub/cub.cuh new file mode 100644 index 00000000..a0902ba8 --- /dev/null +++ b/SRC/cub/cub.cuh @@ -0,0 +1,95 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * CUB umbrella include file + */ + +#pragma once + + +// Block +#include "block/block_histogram.cuh" +#include "block/block_discontinuity.cuh" +#include "block/block_exchange.cuh" +#include "block/block_load.cuh" +#include "block/block_radix_rank.cuh" +#include "block/block_radix_sort.cuh" +#include "block/block_reduce.cuh" +#include "block/block_scan.cuh" +#include "block/block_store.cuh" +#include "block/block_shift.cuh" + +// Device +#include "device/device_histogram.cuh" +#include "device/device_partition.cuh" +#include "device/device_radix_sort.cuh" +#include "device/device_reduce.cuh" +#include "device/device_scan.cuh" +#include "device/device_select.cuh" + +// Grid +//#include "grid/grid_barrier.cuh" +#include "grid/grid_even_share.cuh" +#include "grid/grid_mapping.cuh" +#include "grid/grid_queue.cuh" + +// Host +#include "host/spinlock.cuh" + +// Thread +#include "thread/thread_load.cuh" +#include "thread/thread_operators.cuh" +#include "thread/thread_reduce.cuh" +#include "thread/thread_scan.cuh" +#include "thread/thread_store.cuh" + +// Warp +#include "warp/warp_reduce.cuh" +#include "warp/warp_scan.cuh" + +// Iterator +#include "iterator/arg_index_input_iterator.cuh" +#include "iterator/cache_modified_input_iterator.cuh" +#include "iterator/cache_modified_output_iterator.cuh" +#include "iterator/constant_input_iterator.cuh" +#include "iterator/counting_input_iterator.cuh" +#include "iterator/tex_obj_input_iterator.cuh" +#include "iterator/tex_ref_input_iterator.cuh" +#include "iterator/transform_input_iterator.cuh" + +// Util +#include "util_allocator.cuh" +#include "util_arch.cuh" +#include "util_debug.cuh" +#include "util_device.cuh" +#include "util_macro.cuh" +#include "util_ptx.cuh" +#include "util_type.cuh" + diff --git a/SRC/cub/device/device_histogram.cuh b/SRC/cub/device/device_histogram.cuh new file mode 100644 index 00000000..1ce687e2 --- /dev/null +++ b/SRC/cub/device/device_histogram.cuh @@ -0,0 +1,653 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within global memory. + */ + +#pragma once + +#include +#include + +#include "dispatch/device_histogram_dispatch.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within global memory. ![](histogram_logo.png) + * \ingroup DeviceModule + * + * \par Overview + * A histogram + * counts the number of observations that fall into each of the disjoint categories (known as bins). + * + * \par Usage Considerations + * \cdp_class{DeviceHistogram} + * + * \par Performance + * + * \image html histo_perf.png + * + */ +struct DeviceHistogram +{ + /******************************************************************//** + * \name Single-channel samples + *********************************************************************/ + //@{ + + + /** + * \brief Computes a device-wide histogram using fast block-wide sorting. + * + * \par + * - The total number of samples across all channels (\p num_samples) must be a whole multiple of \p CHANNELS. + * - Delivers consistent throughput regardless of sample diversity + * - Histograms having a large number of bins (e.g., thousands) may adversely affect shared memory occupancy and performance (or even the ability to launch). + * - Performance is often improved when referencing input samples through a texture-caching iterator (e.g., cub::TexObjInputIterator). + * - \devicestorage + * - \cdp + * + * \par Snippet + * The code snippet below illustrates the computation of a 8-bin histogram of + * single-channel unsigned char samples. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device pointers for input and histogram + * int num_samples; // e.g., 12 + * unsigned char *d_samples; // e.g., [2, 6, 7, 5, 3, 0, 2, 1, 7, 0, 6, 2] + * unsigned int *d_histogram; // e.g., [ , , , , , , , ] + * ... 
+ * + * // Wrap d_samples device pointer in a random-access texture iterator + * cub::TexObjInputIterator d_samples_tex_itr; + * d_samples_tex_itr.BindTexture(d_samples, num_samples * sizeof(unsigned char)); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::SingleChannelSorting<8>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histogram, num_samples); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histogram + * cub::DeviceHistogram::SingleChannelSorting<8>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histogram, num_samples); + * + * // Unbind texture iterator + * d_samples_tex_itr.UnbindTexture(); + * + * // d_histogram <-- [2, 1, 3, 1, 0, 1, 2, 2] + * + * \endcode + * + * \tparam BINS Number of histogram bins per channel + * \tparam InputIterator [inferred] Random-access input iterator type for reading input samples. (Must have an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1]) \iterator + * \tparam HistoCounter [inferred] Integer type for counting sample occurrences per histogram bin + */ + template < + int BINS, + typename InputIterator, + typename HistoCounter> + CUB_RUNTIME_FUNCTION + static cudaError_t SingleChannelSorting( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIterator d_samples, ///< [in] Input samples + HistoCounter* d_histogram, ///< [out] Array of BINS counters of integral type \p HistoCounter. + int num_samples, ///< [in] Number of samples to process + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + // Signed integer type for global offsets + typedef int Offset; + + // Dispatch type + typedef DeviceHistogramDispatch< + DEVICE_HISTO_SORT, + BINS, + 1, + 1, + InputIterator, + HistoCounter, + Offset> + DeviceHistogramDispatch; + + return DeviceHistogramDispatch::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_samples, + &d_histogram, + num_samples, + stream, + debug_synchronous); + } + + + /** + * \brief Computes a device-wide histogram using shared-memory atomic read-modify-write operations. + * + * \par + * - Input samples having lower diversity can cause performance to be degraded due to serializations from bin-collisions. + * - Histograms having a large number of bins (e.g., thousands) may adversely affect shared memory occupancy and performance (or even the ability to launch). + * - Performance is often improved when referencing input samples through a texture-caching iterator (e.g., cub::TexObjInputIterator). + * - \devicestorage + * - \cdp + * + * \par Snippet + * The code snippet below illustrates the computation of a 8-bin histogram of + * single-channel unsigned char samples. 
+ * \par
+ * \code
+ * #include <cub/cub.cuh> // or equivalently <cub/device/device_histogram.cuh>
+ *
+ * // Declare, allocate, and initialize device pointers for input and histogram
+ * int num_samples; // e.g., 12
+ * unsigned char *d_samples; // e.g., [2, 6, 7, 5, 3, 0, 2, 1, 7, 0, 6, 2]
+ * unsigned int *d_histogram; // e.g., [ , , , , , , , ]
+ * ...
+ *
+ * // Wrap d_samples device pointer in a random-access texture iterator
+ * cub::TexObjInputIterator<unsigned char> d_samples_tex_itr;
+ * d_samples_tex_itr.BindTexture(d_samples, num_samples * sizeof(unsigned char));
+ *
+ * // Determine temporary device storage requirements
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceHistogram::SingleChannelSharedAtomic<8>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histogram, num_samples);
+ *
+ * // Allocate temporary storage
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Compute histogram
+ * cub::DeviceHistogram::SingleChannelSharedAtomic<8>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histogram, num_samples);
+ *
+ * // Unbind texture iterator
+ * d_samples_tex_itr.UnbindTexture();
+ *
+ * // d_histogram <-- [2, 1, 3, 1, 0, 1, 2, 2]
+ *
+ * \endcode
+ *
+ * \tparam BINS Number of histogram bins per channel
+ * \tparam InputIterator [inferred] Random-access input iterator type for reading input samples. (Must have an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1]) \iterator
+ * \tparam HistoCounter [inferred] Integer type for counting sample occurrences per histogram bin
+ */
+ template <
+ int BINS,
+ typename InputIterator,
+ typename HistoCounter>
+ CUB_RUNTIME_FUNCTION
+ static cudaError_t SingleChannelSharedAtomic(
+ void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+ InputIterator d_samples, ///< [in] Input samples
+ HistoCounter* d_histogram, ///< [out] Array of BINS counters of integral type \p HistoCounter.
+ int num_samples, ///< [in] Number of samples to process
+ cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0.
+ bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
+ {
+ // Signed integer type for global offsets
+ typedef int Offset;
+
+ // Dispatch type
+ typedef DeviceHistogramDispatch<
+ DEVICE_HISTO_SHARED_ATOMIC,
+ BINS,
+ 1,
+ 1,
+ InputIterator,
+ HistoCounter,
+ Offset>
+ DeviceHistogramDispatch;
+
+ return DeviceHistogramDispatch::Dispatch(
+ d_temp_storage,
+ temp_storage_bytes,
+ d_samples,
+ &d_histogram,
+ num_samples,
+ stream,
+ debug_synchronous);
+ }
+
+
+ /**
+ * \brief Computes a device-wide histogram using global-memory atomic read-modify-write operations.
+ *
+ * \par
+ * - Input samples having lower diversity can cause performance to be degraded due to serializations from bin-collisions.
+ * - Performance is not significantly impacted when computing histograms having large numbers of bins (e.g., thousands).
+ * - Performance is often improved when referencing input samples through a texture-caching iterator (e.g., cub::TexObjInputIterator).
+ * - \devicestorage
+ * - \cdp
+ *
+ * \par Snippet
+ * The code snippet below illustrates the computation of an 8-bin histogram of
+ * single-channel unsigned char samples.
+ * \par
+ * \code
+ * #include <cub/cub.cuh> // or equivalently <cub/device/device_histogram.cuh>
+ *
+ * // Declare, allocate, and initialize device pointers for input and histogram
+ * int num_samples; // e.g., 12
+ * unsigned char *d_samples; // e.g., [2, 6, 7, 5, 3, 0, 2, 1, 7, 0, 6, 2]
+ * unsigned int *d_histogram; // e.g., [ , , , , , , , ]
+ * ...
+ *
+ * // Wrap d_samples device pointer in a random-access texture iterator
+ * cub::TexObjInputIterator<unsigned char> d_samples_tex_itr;
+ * d_samples_tex_itr.BindTexture(d_samples, num_samples * sizeof(unsigned char));
+ *
+ * // Determine temporary device storage requirements
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceHistogram::SingleChannelGlobalAtomic<8>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histogram, num_samples);
+ *
+ * // Allocate temporary storage
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Compute histogram
+ * cub::DeviceHistogram::SingleChannelGlobalAtomic<8>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histogram, num_samples);
+ *
+ * // Unbind texture iterator
+ * d_samples_tex_itr.UnbindTexture();
+ *
+ * // d_histogram <-- [2, 1, 3, 1, 0, 1, 2, 2]
+ *
+ * \endcode
+ *
+ * \tparam BINS Number of histogram bins per channel
+ * \tparam InputIterator [inferred] Random-access input iterator type for reading input samples. (Must have an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1]) \iterator
+ * \tparam HistoCounter [inferred] Integer type for counting sample occurrences per histogram bin
+ */
+ template <
+ int BINS,
+ typename InputIterator,
+ typename HistoCounter>
+ CUB_RUNTIME_FUNCTION
+ static cudaError_t SingleChannelGlobalAtomic(
+ void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+ InputIterator d_samples, ///< [in] Input samples
+ HistoCounter* d_histogram, ///< [out] Array of BINS counters of integral type \p HistoCounter.
+ int num_samples, ///< [in] Number of samples to process
+ cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0.
+ bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
+ {
+ // Signed integer type for global offsets
+ typedef int Offset;
+
+ // Dispatch type
+ typedef DeviceHistogramDispatch<
+ DEVICE_HISTO_GLOBAL_ATOMIC,
+ BINS,
+ 1,
+ 1,
+ InputIterator,
+ HistoCounter,
+ Offset>
+ DeviceHistogramDispatch;
+
+ return DeviceHistogramDispatch::Dispatch(
+ d_temp_storage,
+ temp_storage_bytes,
+ d_samples,
+ &d_histogram,
+ num_samples,
+ stream,
+ debug_synchronous);
+ }
+
+
+ //@} end member group
+ /******************************************************************//**
+ * \name Interleaved multi-channel samples
+ *********************************************************************/
+ //@{
+
+
+ /**
+ * \brief Computes a device-wide histogram from multi-channel data using fast block-sorting.
+ *
+ * \par
+ * - The total number of samples across all channels (\p num_samples) must be a whole multiple of \p CHANNELS.
+ * - Delivers consistent throughput regardless of sample diversity + * - Histograms having a large number of bins (e.g., thousands) may adversely affect shared memory occupancy and performance (or even the ability to launch). + * - Performance is often improved when referencing input samples through a texture-caching iterator (e.g., cub::TexObjInputIterator). + * - \devicestorage + * - \cdp + * + * \par Snippet + * The code snippet below illustrates the computation of three 256-bin histograms from + * an input sequence of quad-channel (interleaved) unsigned char samples. + * (E.g., RGB histograms from RGBA pixel samples.) + * + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device pointers for input and histograms + * int num_samples; // e.g., 20 (five pixels with four channels each) + * unsigned char *d_samples; // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2), + * // (0, 6, 7, 5), (3, 0, 2, 6)] + * unsigned int *d_histogram[3]; // e.g., [ [ , , , , , , , ]; + * // [ , , , , , , , ]; + * // [ , , , , , , , ] ] + * ... + * + * // Wrap d_samples device pointer in a random-access texture iterator + * cub::TexObjInputIterator d_samples_tex_itr; + * d_samples_tex_itr.BindTexture(d_samples, num_samples * sizeof(unsigned char)); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::MultiChannelSorting<8, 4, 3>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histograms, num_samples); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histograms + * cub::DeviceHistogram::MultiChannelSorting<8, 4, 3>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histograms, num_samples); + * + * // Unbind texture iterator + * d_samples_tex_itr.UnbindTexture(); + * + * // d_histogram <-- [ [1, 0, 1, 2, 0, 0, 0, 1]; + * // [0, 3, 0, 0, 0, 0, 2, 0]; + * // [0, 0, 2, 0, 0, 0, 1, 2] ] + * + * \endcode + * + * \tparam BINS Number of histogram bins per channel + * \tparam CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + * \tparam ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed + * \tparam InputIterator [inferred] Random-access input iterator type for reading input samples. (Must have an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1]) \iterator + * \tparam HistoCounter [inferred] Integer type for counting sample occurrences per histogram bin + */ + template < + int BINS, + int CHANNELS, + int ACTIVE_CHANNELS, + typename InputIterator, + typename HistoCounter> + CUB_RUNTIME_FUNCTION + static cudaError_t MultiChannelSorting( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIterator d_samples, ///< [in] Pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32b pixels where each pixel consists of four RGBA 8b samples). + HistoCounter *d_histograms[ACTIVE_CHANNELS], ///< [out] Array of active channel histogram pointers, each pointing to an output array having BINS counters of integral type \p HistoCounter. 
+ int num_samples, ///< [in] Total number of samples to process in all channels, including non-active channels + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + // Signed integer type for global offsets + typedef int Offset; + + // Dispatch type + typedef DeviceHistogramDispatch< + DEVICE_HISTO_SORT, + BINS, + CHANNELS, + ACTIVE_CHANNELS, + InputIterator, + HistoCounter, + Offset> DeviceHistogramDispatch; + + return DeviceHistogramDispatch::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_histograms, + num_samples, + stream, + debug_synchronous); + } + + + /** + * \brief Computes a device-wide histogram from multi-channel data using shared-memory atomic read-modify-write operations. + * + * \par + * - The total number of samples across all channels (\p num_samples) must be a whole multiple of \p CHANNELS. + * - Input samples having lower diversity can cause performance to be degraded due to serializations from bin-collisions. + * - Histograms having a large number of bins (e.g., thousands) may adversely affect shared memory occupancy and performance (or even the ability to launch). + * - Performance is often improved when referencing input samples through a texture-caching iterator (e.g., cub::TexObjInputIterator). + * - \devicestorage + * - \cdp + * + * \par Snippet + * The code snippet below illustrates the computation of three 256-bin histograms from + * an input sequence of quad-channel (interleaved) unsigned char samples. + * (E.g., RGB histograms from RGBA pixel samples.) + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device pointers for input and histograms + * int num_samples; // e.g., 20 (five pixels with four channels each) + * unsigned char *d_samples; // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2), + * // (0, 6, 7, 5), (3, 0, 2, 6)] + * unsigned int *d_histogram[3]; // e.g., [ [ , , , , , , , ]; + * // [ , , , , , , , ]; + * // [ , , , , , , , ] ] + * ... 
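+ * // (With CHANNELS = 4 and ACTIVE_CHANNELS = 3 below, only the first three
+ * // channels of each interleaved quad (e.g., R, G, and B of an RGBA pixel)
+ * // are histogrammed; the fourth channel is read but not counted.)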
+ * + * // Wrap d_samples device pointer in a random-access texture iterator + * cub::TexObjInputIterator d_samples_tex_itr; + * d_samples_tex_itr.BindTexture(d_samples, num_samples * sizeof(unsigned char)); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::MultiChannelSharedAtomic<8, 4, 3>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histograms, num_samples); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histograms + * cub::DeviceHistogram::MultiChannelSharedAtomic<8, 4, 3>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histograms, num_samples); + * + * // Unbind texture iterator + * d_samples_tex_itr.UnbindTexture(); + * + * // d_histogram <-- [ [1, 0, 1, 2, 0, 0, 0, 1]; + * // [0, 3, 0, 0, 0, 0, 2, 0]; + * // [0, 0, 2, 0, 0, 0, 1, 2] ] + * + * \endcode + * + * \tparam BINS Number of histogram bins per channel + * \tparam CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + * \tparam ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed + * \tparam InputIterator [inferred] Random-access input iterator type for reading input samples. (Must have an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1]) \iterator + * \tparam HistoCounter [inferred] Integer type for counting sample occurrences per histogram bin + */ + template < + int BINS, + int CHANNELS, + int ACTIVE_CHANNELS, + typename InputIterator, + typename HistoCounter> + CUB_RUNTIME_FUNCTION + static cudaError_t MultiChannelSharedAtomic( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIterator d_samples, ///< [in] Pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32b pixels where each pixel consists of four RGBA 8b samples). + HistoCounter *d_histograms[ACTIVE_CHANNELS], ///< [out] Array of active channel histogram pointers, each pointing to an output array having BINS counters of integral type \p HistoCounter. + int num_samples, ///< [in] Total number of samples to process in all channels, including non-active channels + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + // Signed integer type for global offsets + typedef int Offset; + + // Dispatch type + typedef DeviceHistogramDispatch< + DEVICE_HISTO_SHARED_ATOMIC, + BINS, + CHANNELS, + ACTIVE_CHANNELS, + InputIterator, + HistoCounter, + Offset> DeviceHistogramDispatch; + + return DeviceHistogramDispatch::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_histograms, + num_samples, + stream, + debug_synchronous); + } + + + /** + * \brief Computes a device-wide histogram from multi-channel data using global-memory atomic read-modify-write operations. + * + * \par + * - The total number of samples across all channels (\p num_samples) must be a whole multiple of \p CHANNELS. 
+ * - Input samples having lower diversity can cause performance to be degraded due to serializations from bin-collisions. + * - Performance is not significantly impacted when computing histograms having large numbers of bins (e.g., thousands). + * - Performance is often improved when referencing input samples through a texture-caching iterator (e.g., cub::TexObjInputIterator). + * - \devicestorage + * - \cdp + * + * \par Snippet + * The code snippet below illustrates the computation of three 256-bin histograms from + * an input sequence of quad-channel (interleaved) unsigned char samples. + * (E.g., RGB histograms from RGBA pixel samples.) + * + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device pointers for input and histograms + * int num_samples; // e.g., 20 (five pixels with four channels each) + * unsigned char *d_samples; // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2), + * // (0, 6, 7, 5), (3, 0, 2, 6)] + * unsigned int *d_histogram[3]; // e.g., [ [ , , , , , , , ]; + * // [ , , , , , , , ]; + * // [ , , , , , , , ] ] + * ... + * + * // Wrap d_samples device pointer in a random-access texture iterator + * cub::TexObjInputIterator d_samples_tex_itr; + * d_samples_tex_itr.BindTexture(d_samples, num_samples * sizeof(unsigned char)); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::MultiChannelGlobalAtomic<8, 4, 3>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histograms, num_samples); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histograms + * cub::DeviceHistogram::MultiChannelGlobalAtomic<8, 4, 3>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histograms, num_samples); + * + * // Unbind texture iterator + * d_samples_tex_itr.UnbindTexture(); + * + * // d_histogram <-- [ [1, 0, 1, 2, 0, 0, 0, 1]; + * // [0, 3, 0, 0, 0, 0, 2, 0]; + * // [0, 0, 2, 0, 0, 0, 1, 2] ] + * + * \endcode + * + * \tparam BINS Number of histogram bins per channel + * \tparam CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + * \tparam ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed + * \tparam InputIterator [inferred] Random-access input iterator type for reading input samples. (Must have an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1]) \iterator + * \tparam HistoCounter [inferred] Integer type for counting sample occurrences per histogram bin + */ + template < + int BINS, + int CHANNELS, + int ACTIVE_CHANNELS, + typename InputIterator, + typename HistoCounter> + CUB_RUNTIME_FUNCTION + static cudaError_t MultiChannelGlobalAtomic( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIterator d_samples, ///< [in] Pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32b pixels where each pixel consists of four RGBA 8b samples). + HistoCounter *d_histograms[ACTIVE_CHANNELS], ///< [out] Array of active channel histogram pointers, each pointing to an output array having BINS counters of integral type \p HistoCounter. 
+ int num_samples, ///< [in] Total number of samples to process in all channels, including non-active channels + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + // Signed integer type for global offsets + typedef int Offset; + + // Dispatch type + typedef DeviceHistogramDispatch< + DEVICE_HISTO_GLOBAL_ATOMIC, + BINS, + CHANNELS, + ACTIVE_CHANNELS, + InputIterator, + HistoCounter, + Offset> + DeviceHistogramDispatch; + + return DeviceHistogramDispatch::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_histograms, + num_samples, + stream, + debug_synchronous); + } + + //@} end member group + +}; + +/** + * \example example_device_histogram.cu + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/SRC/cub/device/device_partition.cuh b/SRC/cub/device/device_partition.cuh new file mode 100644 index 00000000..c9418af0 --- /dev/null +++ b/SRC/cub/device/device_partition.cuh @@ -0,0 +1,275 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DevicePartition provides device-wide, parallel operations for partitioning sequences of data items residing within global memory. + */ + +#pragma once + +#include +#include + +#include "dispatch/device_select_dispatch.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DevicePartition provides device-wide, parallel operations for partitioning sequences of data items residing within global memory. 
![](partition_logo.png) + * \ingroup DeviceModule + * + * \par Overview + * These operations apply a selection criterion to construct a partitioned output sequence from items selected/unselected from + * a specified input sequence. + * + * \par Usage Considerations + * \cdp_class{DevicePartition} + * + * \par Performance + * \linear_performance{partition} + * + * \par + * The following chart illustrates DevicePartition::If + * performance across different CUDA architectures for \p int32 items, + * where 50% of the items are randomly selected for the first partition. + * \plots_below + * + * \image html partition_if_int32_50_percent.png + * + */ +struct DevicePartition +{ + /** + * \brief Uses the \p d_flags sequence to split the corresponding items from \p d_in into a partitioned sequence \p d_out. The total number of items copied into the first partition is written to \p d_num_selected. ![](partition_flags_logo.png) + * + * \par + * - The value type of \p d_flags must be castable to \p bool (e.g., \p bool, \p char, \p int, etc.). + * - Copies of the selected items are compacted into \p d_out and maintain their original + * relative ordering, however copies of the unselected items are compacted into the + * rear of \p d_out in reverse order. + * - \devicestorage + * - \cdp + * + * \par Snippet + * The code snippet below illustrates the compaction of items selected from an \p int device vector. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device pointers for input, flags, and output + * int num_items; // e.g., 8 + * int *d_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] + * char *d_flags; // e.g., [1, 0, 0, 1, 0, 1, 1, 0] + * int *d_out; // e.g., [ , , , , , , , ] + * int *d_num_selected; // e.g., [ ] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run selection + * cub::DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected, num_items); + * + * // d_out <-- [1, 4, 6, 7, 8, 5, 3, 2] + * // d_num_selected <-- [4] + * + * \endcode + * + * \tparam InputIterator [inferred] Random-access input iterator type for reading input items \iterator + * \tparam FlagIterator [inferred] Random-access input iterator type for reading selection flags \iterator + * \tparam OutputIterator [inferred] Random-access output iterator type for writing output items \iterator + * \tparam NumSelectedIterator [inferred] Output iterator type for recording the number of items selected \iterator + */ + template < + typename InputIterator, + typename FlagIterator, + typename OutputIterator, + typename NumSelectedIterator> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Flagged( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIterator d_in, ///< [in] Pointer to the input sequence of data items + FlagIterator d_flags, ///< [in] Pointer to the input sequence of selection flags + OutputIterator d_out, ///< [out] Pointer to the output sequence of partitioned data items + NumSelectedIterator d_num_selected, ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition) + int num_items, ///< [in] Total number of items to select from + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + typedef int Offset; // Signed integer type for global offsets + typedef NullType SelectOp; // Selection op (not used) + typedef NullType EqualityOp; // Equality operator (not used) + + return DeviceSelectDispatch::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_flags, + d_out, + d_num_selected, + SelectOp(), + EqualityOp(), + num_items, + stream, + debug_synchronous); + } + + + /** + * \brief Uses the \p select_op functor to split the corresponding items from \p d_in into a partitioned sequence \p d_out. The total number of items copied into the first partition is written to \p d_num_selected. ![](partition_logo.png) + * + * \par + * - Copies of the selected items are compacted into \p d_out and maintain their original + * relative ordering, however copies of the unselected items are compacted into the + * rear of \p d_out in reverse order. + * - \devicestorage + * - \cdp + * + * \par Performance + * The following charts illustrate saturated partition-if performance across different + * CUDA architectures for \p int32 and \p int64 items, respectively. Items are + * selected for the first partition with 50% probability. + * + * \image html partition_if_int32_50_percent.png + * \image html partition_if_int64_50_percent.png + * + * \par + * The following charts are similar, but 5% selection probability for the first partition: + * + * \image html partition_if_int32_5_percent.png + * \image html partition_if_int64_5_percent.png + * + * \par Snippet + * The code snippet below illustrates the compaction of items selected from an \p int device vector. + * \par + * \code + * #include // or equivalently + * + * // Functor type for selecting values less than some criteria + * struct LessThan + * { + * int compare; + * + * CUB_RUNTIME_FUNCTION __forceinline__ + * LessThan(int compare) : compare(compare) {} + * + * CUB_RUNTIME_FUNCTION __forceinline__ + * bool operator()(const int &a) const { + * return (a < compare); + * } + * }; + * + * // Declare, allocate, and initialize device pointers for input and output + * int num_items; // e.g., 8 + * int *d_in; // e.g., [0, 2, 3, 9, 5, 2, 81, 8] + * int *d_out; // e.g., [ , , , , , , , ] + * int *d_num_selected; // e.g., [ ] + * LessThan select_op(7); + * ... 
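+ * // (Items accepted by select_op keep their relative order at the front of
+ * // d_out; rejected items are compacted into the rear of d_out in reverse
+ * // order, as described above.)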
+ *
+ * // Determine temporary device storage requirements
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DevicePartition::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected, num_items, select_op);
+ *
+ * // Allocate temporary storage
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run partitioning
+ * cub::DevicePartition::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected, num_items, select_op);
+ *
+ * // d_out <-- [0, 2, 3, 5, 2, 8, 81, 9]
+ * // d_num_selected <-- [5]
+ *
+ * \endcode
+ *
+ * \tparam InputIterator [inferred] Random-access input iterator type for reading input items \iterator
+ * \tparam OutputIterator [inferred] Random-access output iterator type for writing output items \iterator
+ * \tparam NumSelectedIterator [inferred] Output iterator type for recording the number of items selected \iterator
+ * \tparam SelectOp [inferred] Selection functor type having member bool operator()(const T &a)
+ */
+ template <
+ typename InputIterator,
+ typename OutputIterator,
+ typename NumSelectedIterator,
+ typename SelectOp>
+ CUB_RUNTIME_FUNCTION __forceinline__
+ static cudaError_t If(
+ void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+ InputIterator d_in, ///< [in] Pointer to the input sequence of data items
+ OutputIterator d_out, ///< [out] Pointer to the output sequence of partitioned data items
+ NumSelectedIterator d_num_selected, ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition)
+ int num_items, ///< [in] Total number of items to select from
+ SelectOp select_op, ///< [in] Unary selection operator
+ cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0.
+ bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
+ {
+ typedef int Offset; // Signed integer type for global offsets
+ typedef NullType* FlagIterator; // Flag iterator type (not used)
+ typedef NullType EqualityOp; // Equality operator (not used)
+
+ return DeviceSelectDispatch<InputIterator, FlagIterator, OutputIterator, NumSelectedIterator, SelectOp, EqualityOp, Offset, true>::Dispatch(
+ d_temp_storage,
+ temp_storage_bytes,
+ d_in,
+ NULL,
+ d_out,
+ d_num_selected,
+ select_op,
+ EqualityOp(),
+ num_items,
+ stream,
+ debug_synchronous);
+ }
+
+};
+
+/**
+ * \example example_device_partition_flagged.cu
+ * \example example_device_partition_if.cu
+ */
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
+
+
diff --git a/SRC/cub/device/device_radix_sort.cuh b/SRC/cub/device/device_radix_sort.cuh
new file mode 100644
index 00000000..8585f088
--- /dev/null
+++ b/SRC/cub/device/device_radix_sort.cuh
@@ -0,0 +1,420 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within global memory. + */ + +#pragma once + +#include +#include + +#include "dispatch/device_radix_sort_dispatch.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within global memory. ![](sorting_logo.png) + * \ingroup DeviceModule + * + * \par Overview + * The [radix sorting method](http://en.wikipedia.org/wiki/Radix_sort) arranges + * items into ascending order. It relies upon a positional representation for + * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits, + * characters, etc.) specified from least-significant to most-significant. For a + * given input sequence of keys and a set of rules specifying a total ordering + * of the symbolic alphabet, the radix sorting method produces a lexicographic + * ordering of those keys. + * + * \par + * DeviceRadixSort can sort all of the built-in C++ numeric primitive types, e.g.: + * unsigned char, \p int, \p double, etc. Although the direct radix sorting + * method can only be applied to unsigned integral types, BlockRadixSort + * is able to sort signed and floating-point types via simple bit-wise transformations + * that ensure lexicographic key ordering. + * + * \par Usage Considerations + * \cdp_class{DeviceRadixSort} + * + * \par Performance + * \linear_performance{radix sort} The following chart illustrates DeviceRadixSort::SortKeys + * performance across different CUDA architectures for uniform-random \p uint32 keys. + * \plots_below + * + * \image html lsb_radix_sort_int32_keys.png + * + */ +struct DeviceRadixSort +{ + /** + * \brief Sorts key-value pairs into ascending order. + * + * \par + * - The sorting operation requires a pair of key buffers and a pair of value + * buffers. Each pair is wrapped in a DoubleBuffer structure whose member + * DoubleBuffer::Current() references the active buffer. The currently-active + * buffer may be changed by the sorting operation. 
+ * - \devicestorage
+ * - \cdp
+ *
+ * \par Performance
+ * The following charts illustrate saturated sorting performance across different
+ * CUDA architectures for uniform-random uint32,uint32 and
+ * uint64,uint64 pairs, respectively.
+ *
+ * \image html lsb_radix_sort_int32_pairs.png
+ * \image html lsb_radix_sort_int64_pairs.png
+ *
+ * \par Snippet
+ * The code snippet below illustrates the sorting of a device vector of \p int keys
+ * with associated vector of \p int values.
+ * \par
+ * \code
+ * #include <cub/cub.cuh> // or equivalently <cub/device/device_radix_sort.cuh>
+ *
+ * // Declare, allocate, and initialize device pointers for sorting data
+ * int num_items; // e.g., 7
+ * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * int *d_key_alt_buf; // e.g., [ ... ]
+ * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6]
+ * int *d_value_alt_buf; // e.g., [ ... ]
+ * ...
+ *
+ * // Create a set of DoubleBuffers to wrap pairs of device pointers
+ * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+ * cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
+ *
+ * // Determine temporary device storage requirements
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
+ *
+ * // Allocate temporary storage
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run sorting operation
+ * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
+ *
+ * // d_keys.Current() <-- [0, 3, 5, 6, 7, 8, 9]
+ * // d_values.Current() <-- [5, 4, 3, 1, 2, 0, 6]
+ *
+ * \endcode
+ *
+ * \tparam Key [inferred] Key type
+ * \tparam Value [inferred] Value type
+ */
+ template <
+ typename Key,
+ typename Value>
+ CUB_RUNTIME_FUNCTION
+ static cudaError_t SortPairs(
+ void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+ DoubleBuffer<Key> &d_keys, ///< [in,out] Reference to the double-buffer of keys whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+ DoubleBuffer<Value> &d_values, ///< [in,out] Double-buffer of values whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+ int num_items, ///< [in] Number of items to sort
+ int begin_bit = 0, ///< [in] [optional] The first (least-significant) bit index needed for key comparison
+ int end_bit = sizeof(Key) * 8, ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison
+ cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0.
+ bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
+ {
+ // Signed integer type for global offsets
+ typedef int Offset;
+
+ return DeviceRadixSortDispatch<false, Key, Value, Offset>::Dispatch(
+ d_temp_storage,
+ temp_storage_bytes,
+ d_keys,
+ d_values,
+ num_items,
+ begin_bit,
+ end_bit,
+ stream,
+ debug_synchronous);
+ }
+
+
+ /**
+ * \brief Sorts key-value pairs into descending order.
+ *
+ * \par
+ * - The sorting operation requires a pair of key buffers and a pair of value
+ * buffers.
Each pair is wrapped in a DoubleBuffer structure whose member + * DoubleBuffer::Current() references the active buffer. The currently-active + * buffer may be changed by the sorting operation. + * - \devicestorage + * - \cdp + * + * \par Performance + * Performance is similar to DeviceRadixSort::SortPairs. + * + * \par Snippet + * The code snippet below illustrates the sorting of a device vector of \p int keys + * with associated vector of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device pointers for sorting data + * int num_items; // e.g., 7 + * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_key_alt_buf; // e.g., [ ... ] + * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] + * int *d_value_alt_buf; // e.g., [ ... ] + * ... + * + * // Create a set of DoubleBuffers to wrap pairs of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items); + * + * // d_keys.Current() <-- [9, 8, 7, 6, 5, 3, 0] + * // d_values.Current() <-- [6, 0, 2, 1, 3, 4, 5] + * + * \endcode + * + * \tparam Key [inferred] Key type + * \tparam Value [inferred] Value type + */ + template < + typename Key, + typename Value> + CUB_RUNTIME_FUNCTION + static cudaError_t SortPairsDescending( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer &d_values, ///< [in,out] Double-buffer of values whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + int num_items, ///< [in] Number of items to reduce + int begin_bit = 0, ///< [in] [optional] The first (least-significant) bit index needed for key comparison + int end_bit = sizeof(Key) * 8, ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int Offset; + + return DeviceRadixSortDispatch::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts keys into ascending order + * + * \par + * - The sorting operation requires a pair of key buffers. 
The pair is + * wrapped in a DoubleBuffer structure whose member DoubleBuffer::Current() + * references the active buffer. The currently-active buffer may be changed + * by the sorting operation. + * - \devicestorage + * - \cdp + * + * \par Performance + * The following charts illustrate saturated sorting performance across different + * CUDA architectures for uniform-random \p uint32 and \p uint64 keys, respectively. + * + * \image html lsb_radix_sort_int32_keys.png + * \image html lsb_radix_sort_int64_keys.png + * + * \par Snippet + * The code snippet below illustrates the sorting of a device vector of \p int keys. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device pointers for sorting data + * int num_items; // e.g., 7 + * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_key_alt_buf; // e.g., [ ... ] + * ... + * + * // Create a DoubleBuffer to wrap the pair of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items); + * + * // d_keys.Current() <-- [0, 3, 5, 6, 7, 8, 9] + * + * \endcode + * + * \tparam Key [inferred] Key type + */ + template + CUB_RUNTIME_FUNCTION + static cudaError_t SortKeys( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + int num_items, ///< [in] Number of items to reduce + int begin_bit = 0, ///< [in] [optional] The first (least-significant) bit index needed for key comparison + int end_bit = sizeof(Key) * 8, ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int Offset; + + // Null value type + DoubleBuffer d_values; + + return DeviceRadixSortDispatch::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts keys into ascending order + * + * \par + * - The sorting operation requires a pair of key buffers. The pair is + * wrapped in a DoubleBuffer structure whose member DoubleBuffer::Current() + * references the active buffer. The currently-active buffer may be changed + * by the sorting operation. + * - \devicestorage + * - \cdp + * + * \par Performance + * Performance is similar to DeviceRadixSort::SortKeys. 
+ * + * \par Snippet + * The code snippet below illustrates the sorting of a device vector of \p int keys. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device pointers for sorting data + * int num_items; // e.g., 7 + * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_key_alt_buf; // e.g., [ ... ] + * ... + * + * // Create a DoubleBuffer to wrap the pair of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, num_items); + * + * // d_keys.Current() <-- [9, 8, 7, 6, 5, 3, 0] + * + * \endcode + * + * \tparam Key [inferred] Key type + */ + template + CUB_RUNTIME_FUNCTION + static cudaError_t SortKeysDescending( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + int num_items, ///< [in] Number of items to reduce + int begin_bit = 0, ///< [in] [optional] The first (least-significant) bit index needed for key comparison + int end_bit = sizeof(Key) * 8, ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int Offset; + + // Null value type + DoubleBuffer d_values; + + return DeviceRadixSortDispatch::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + stream, + debug_synchronous); + } + +}; + +/** + * \example example_device_radix_sort.cu + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/SRC/cub/device/device_reduce.cuh b/SRC/cub/device/device_reduce.cuh new file mode 100644 index 00000000..480248b0 --- /dev/null +++ b/SRC/cub/device/device_reduce.cuh @@ -0,0 +1,804 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within global memory. + */ + +#pragma once + +#include +#include + +#include "dispatch/device_reduce_dispatch.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within global memory. ![](reduce_logo.png) + * \ingroup DeviceModule + * + * \par Overview + * A reduction (or fold) + * uses a binary combining operator to compute a single aggregate from a sequence of input elements. + * + * \par Usage Considerations + * \cdp_class{DeviceReduce} + * + * \par Performance + * \linear_performance{reduction, reduce-by-key, and run-length encode} + * + * \par + * The following chart illustrates DeviceReduce::Sum + * performance across different CUDA architectures for \p int32 keys. + * + * \image html reduce_int32.png + * + * \par + * The following chart illustrates DeviceReduce::ReduceByKey (summation) + * performance across different CUDA architectures for \p fp32 + * values. Segments are identified by \p int32 keys, and have lengths uniformly sampled from [1,1000]. + * + * \image html reduce_by_key_fp32_len_500.png + * + * \par + * The following chart illustrates DeviceReduce::RunLengthEncode performance across + * different CUDA architectures for \p int32 items. + * Segments have lengths uniformly sampled from [1,1000]. + * + * \image html rle_int32_len_500.png + * + * \par + * \plots_below + * + * + */ +struct DeviceReduce +{ + /** + * \brief Computes a device-wide reduction using the specified binary \p reduction_op functor. + * + * \par + * - Does not support non-commutative reduction operators. + * - \devicestorage + * - \cdp + * + * \par Performance + * Performance is typically similar to DeviceReduce::Sum. + * + * \par Snippet + * The code snippet below illustrates a custom min reduction of a device vector of \p int items. 
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/device/device_reduce.cuh>
+ *
+ * // CustomMin functor
+ * struct CustomMin
+ * {
+ *     template <typename T>
+ *     CUB_RUNTIME_FUNCTION __forceinline__
+ *     T operator()(const T &a, const T &b) const {
+ *         return (b < a) ? b : a;
+ *     }
+ * };
+ *
+ * // Declare, allocate, and initialize device pointers for input and output
+ * int num_items; // e.g., 7
+ * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * int *d_out; // e.g., [ ]
+ * CustomMin min_op;
+ * ...
+ *
+ * // Determine temporary device storage requirements
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, min_op);
+ *
+ * // Allocate temporary storage
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run reduction
+ * cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, min_op);
+ *
+ * // d_out <-- [0]
+ *
+ * \endcode
+ *
+ * \tparam InputIterator [inferred] Random-access input iterator type for reading input items \iterator
+ * \tparam OutputIterator [inferred] Output iterator type for recording the reduced aggregate \iterator
+ * \tparam ReductionOp [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b)
+ */
+ template <
+ typename InputIterator,
+ typename OutputIterator,
+ typename ReductionOp>
+ CUB_RUNTIME_FUNCTION
+ static cudaError_t Reduce(
+ void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+ InputIterator d_in, ///< [in] Pointer to the input sequence of data items
+ OutputIterator d_out, ///< [out] Pointer to the output aggregate
+ int num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
+ ReductionOp reduction_op, ///< [in] Binary reduction functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
+ cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0.
+ bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
+ {
+ // Signed integer type for global offsets
+ typedef int Offset;
+
+ // Dispatch type
+ typedef DeviceReduceDispatch<InputIterator, OutputIterator, Offset, ReductionOp> DeviceReduceDispatch;
+
+ return DeviceReduceDispatch::Dispatch(
+ d_temp_storage,
+ temp_storage_bytes,
+ d_in,
+ d_out,
+ num_items,
+ reduction_op,
+ stream,
+ debug_synchronous);
+ }
+
+
+ /**
+ * \brief Computes a device-wide sum using the addition ('+') operator.
+ *
+ * \par
+ * - Does not support non-commutative reduction operators.
+ * - \devicestorage
+ * - \cdp
+ *
+ * \par Performance
+ * The following charts illustrate saturated reduction (sum) performance across different
+ * CUDA architectures for \p int32 and \p int64 items, respectively.
+ *
+ * \image html reduce_int32.png
+ * \image html reduce_int64.png
+ *
+ * \par Snippet
+ * The code snippet below illustrates the sum reduction of a device vector of \p int items.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/device/device_reduce.cuh>
+ *
+ * // Declare, allocate, and initialize device pointers for input and output
+ * int num_items; // e.g., 7
+ * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * int *d_out; // e.g., [ ]
+ * ...
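+ * // (The reduction produces a single aggregate, so d_out need only hold one item.)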
+ *
+ * // Determine temporary device storage requirements
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+ *
+ * // Allocate temporary storage
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run sum-reduction
+ * cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+ *
+ * // d_out <-- [38]
+ *
+ * \endcode
+ *
+ * \tparam InputIterator [inferred] Random-access input iterator type for reading input items \iterator
+ * \tparam OutputIterator [inferred] Output iterator type for recording the reduced aggregate \iterator
+ */
+ template <
+ typename InputIterator,
+ typename OutputIterator>
+ CUB_RUNTIME_FUNCTION
+ static cudaError_t Sum(
+ void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+ InputIterator d_in, ///< [in] Pointer to the input sequence of data items
+ OutputIterator d_out, ///< [out] Pointer to the output aggregate
+ int num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
+ cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0.
+ bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
+ {
+ // Signed integer type for global offsets
+ typedef int Offset;
+
+ // Dispatch type
+ typedef DeviceReduceDispatch<InputIterator, OutputIterator, Offset, cub::Sum> DeviceReduceDispatch;
+
+ return DeviceReduceDispatch::Dispatch(
+ d_temp_storage,
+ temp_storage_bytes,
+ d_in,
+ d_out,
+ num_items,
+ cub::Sum(),
+ stream,
+ debug_synchronous);
+ }
+
+
+ /**
+ * \brief Computes a device-wide minimum using the less-than ('<') operator.
+ *
+ * \par
+ * - Does not support non-commutative minimum operators.
+ * - \devicestorage
+ * - \cdp
+ *
+ * \par Performance
+ * Performance is typically similar to DeviceReduce::Sum.
+ *
+ * \par Snippet
+ * The code snippet below illustrates the min-reduction of a device vector of \p int items.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/device/device_reduce.cuh>
+ *
+ * // Declare, allocate, and initialize device pointers for input and output
+ * int num_items; // e.g., 7
+ * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * int *d_out; // e.g., [ ]
+ * ...
+ *
+ * // Determine temporary device storage requirements
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+ *
+ * // Allocate temporary storage
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run min-reduction
+ * cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+ *
+ * // d_out <-- [0]
+ *
+ * \endcode
+ *
+ * \tparam InputIterator [inferred] Random-access input iterator type for reading input items \iterator
+ * \tparam OutputIterator [inferred] Output iterator type for recording the reduced aggregate \iterator
+ */
+ template <
+ typename InputIterator,
+ typename OutputIterator>
+ CUB_RUNTIME_FUNCTION
+ static cudaError_t Min(
+ void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+ InputIterator d_in, ///< [in] Pointer to the input sequence of data items
+ OutputIterator d_out, ///< [out] Pointer to the output aggregate
+ int num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
+ cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0.
+ bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
+ {
+ // Signed integer type for global offsets
+ typedef int Offset;
+
+ // Dispatch type
+ typedef DeviceReduceDispatch<InputIterator, OutputIterator, Offset, cub::Min> DeviceReduceDispatch;
+
+ return DeviceReduceDispatch::Dispatch(
+ d_temp_storage,
+ temp_storage_bytes,
+ d_in,
+ d_out,
+ num_items,
+ cub::Min(),
+ stream,
+ debug_synchronous);
+ }
+
+
+ /**
+ * \brief Finds the first device-wide minimum using the less-than ('<') operator, also returning the index of that item.
+ *
+ * \par
+ * Assuming the input \p d_in has value type \p T, the output \p d_out must have value type
+ * ItemOffsetPair<T, int>. The minimum value is written to d_out.value and its
+ * location in the input array is written to d_out.offset.
+ *
+ * \par
+ * - Does not support non-commutative minimum operators.
+ * - \devicestorage
+ * - \cdp
+ *
+ * \par Performance
+ * Performance is typically similar to DeviceReduce::Sum.
+ *
+ * \par Snippet
+ * The code snippet below illustrates the argmin-reduction of a device vector of \p int items.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/device/device_reduce.cuh>
+ *
+ * // Declare, allocate, and initialize device pointers for input and output
+ * int num_items; // e.g., 7
+ * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * ItemOffsetPair<int, int> *d_out; // e.g., [{ , }]
+ * ...
+ *
+ * // Determine temporary device storage requirements
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+ *
+ * // Allocate temporary storage
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run argmin-reduction
+ * cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+ *
+ * // d_out <-- [{0, 5}]
+ *
+ * \endcode
+ *
+ * \tparam InputIterator [inferred] Random-access input iterator type for reading input items (of some type \p T) \iterator
+ * \tparam OutputIterator [inferred] Output iterator type for recording the reduced aggregate (having value type ItemOffsetPair<T, int>) \iterator
+ */
+ template <
+ typename InputIterator,
+ typename OutputIterator>
+ CUB_RUNTIME_FUNCTION
+ static cudaError_t ArgMin(
+ void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+ InputIterator d_in, ///< [in] Pointer to the input sequence of data items
+ OutputIterator d_out, ///< [out] Pointer to the output aggregate
+ int num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
+ cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0.
+ bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors.
Also causes launch configurations to be printed to the console. Default is \p false.
+ {
+ // Signed integer type for global offsets
+ typedef int Offset;
+
+ // Wrapped input iterator
+ typedef ArgIndexInputIterator<InputIterator, Offset> ArgIndexInputIterator;
+ ArgIndexInputIterator d_argmin_in(d_in, 0);
+
+ // Dispatch type
+ typedef DeviceReduceDispatch<ArgIndexInputIterator, OutputIterator, Offset, cub::ArgMin> DeviceReduceDispatch;
+
+ return DeviceReduceDispatch::Dispatch(
+ d_temp_storage,
+ temp_storage_bytes,
+ d_argmin_in,
+ d_out,
+ num_items,
+ cub::ArgMin(),
+ stream,
+ debug_synchronous);
+ }
+
+
+ /**
+ * \brief Computes a device-wide maximum using the greater-than ('>') operator.
+ *
+ * \par
+ * - Does not support non-commutative maximum operators.
+ * - \devicestorage
+ * - \cdp
+ *
+ * \par Performance
+ * Performance is typically similar to DeviceReduce::Sum.
+ *
+ * \par Snippet
+ * The code snippet below illustrates the max-reduction of a device vector of \p int items.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/device/device_reduce.cuh>
+ *
+ * // Declare, allocate, and initialize device pointers for input and output
+ * int num_items; // e.g., 7
+ * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * int *d_out; // e.g., [ ]
+ * ...
+ *
+ * // Determine temporary device storage requirements
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+ *
+ * // Allocate temporary storage
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run max-reduction
+ * cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+ *
+ * // d_out <-- [9]
+ *
+ * \endcode
+ *
+ * \tparam InputIterator [inferred] Random-access input iterator type for reading input items \iterator
+ * \tparam OutputIterator [inferred] Output iterator type for recording the reduced aggregate \iterator
+ */
+ template <
+ typename InputIterator,
+ typename OutputIterator>
+ CUB_RUNTIME_FUNCTION
+ static cudaError_t Max(
+ void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+ InputIterator d_in, ///< [in] Pointer to the input sequence of data items
+ OutputIterator d_out, ///< [out] Pointer to the output aggregate
+ int num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
+ cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0.
+ bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
+ {
+ // Signed integer type for global offsets
+ typedef int Offset;
+
+ // Dispatch type
+ typedef DeviceReduceDispatch<InputIterator, OutputIterator, Offset, cub::Max> DeviceReduceDispatch;
+
+ return DeviceReduceDispatch::Dispatch(
+ d_temp_storage,
+ temp_storage_bytes,
+ d_in,
+ d_out,
+ num_items,
+ cub::Max(),
+ stream,
+ debug_synchronous);
+ }
+
+
+ /**
+ * \brief Finds the first device-wide maximum using the greater-than ('>') operator, also returning the index of that item.
+ *
+ * \par
+ * Assuming the input \p d_in has value type \p T, the output \p d_out must have value type
+ * ItemOffsetPair<T, int>. The maximum value is written to d_out.value and its
+ * location in the input array is written to d_out.offset.
+ *
+ * \par
+ * - Does not support non-commutative maximum operators.
+ * - \devicestorage
+ * - \cdp
+ *
+ * \par Performance
+ * Performance is typically similar to DeviceReduce::Sum.
+ *
+ * \par Snippet
+ * The code snippet below illustrates the argmax-reduction of a device vector of \p int items.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/device/device_reduce.cuh>
+ *
+ * // Declare, allocate, and initialize device pointers for input and output
+ * int num_items; // e.g., 7
+ * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * ItemOffsetPair<int, int> *d_out; // e.g., [{ , }]
+ * ...
+ *
+ * // Determine temporary device storage requirements
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+ *
+ * // Allocate temporary storage
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run argmax-reduction
+ * cub::DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+ *
+ * // d_out <-- [{9, 6}]
+ *
+ * \endcode
+ *
+ * \tparam InputIterator [inferred] Random-access input iterator type for reading input items (of some type \p T) \iterator
+ * \tparam OutputIterator [inferred] Output iterator type for recording the reduced aggregate (having value type ItemOffsetPair<T, int>) \iterator
+ */
+ template <
+ typename InputIterator,
+ typename OutputIterator>
+ CUB_RUNTIME_FUNCTION
+ static cudaError_t ArgMax(
+ void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+ InputIterator d_in, ///< [in] Pointer to the input sequence of data items
+ OutputIterator d_out, ///< [out] Pointer to the output aggregate
+ int num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
+ cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0.
+ bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
+ {
+ // Signed integer type for global offsets
+ typedef int Offset;
+
+ // Wrapped input iterator
+ typedef ArgIndexInputIterator<InputIterator, Offset> ArgIndexInputIterator;
+ ArgIndexInputIterator d_argmax_in(d_in, 0);
+
+ // Dispatch type
+ typedef DeviceReduceDispatch<ArgIndexInputIterator, OutputIterator, Offset, cub::ArgMax> DeviceReduceDispatch;
+
+ return DeviceReduceDispatch::Dispatch(
+ d_temp_storage,
+ temp_storage_bytes,
+ d_argmax_in,
+ d_out,
+ num_items,
+ cub::ArgMax(),
+ stream,
+ debug_synchronous);
+ }
+
+
+ /**
+ * \brief Reduces segments of values, where segments are demarcated by corresponding runs of identical keys.
+ *
+ * \par
+ * This operation computes segmented reductions using the specified binary
+ * \p reduction_op functor. Each "run" of consecutive, identical keys in \p d_keys_in
+ * is used to identify a corresponding segment of values in \p d_values_in. The first key in
+ * the ith segment is copied to d_keys_out[i], and
+ * the value aggregate for that segment is written to d_values_out[i].
+ * The total number of segments discovered is written to \p d_num_segments.
+ *
+ * \par
+ * - The == equality operator is used to determine whether keys are equivalent
+ * - \devicestorage
+ * - \cdp
+ *
+ * \par Performance
+ * The following charts illustrate reduction-by-key (sum) performance across
+ * different CUDA architectures for \p fp32 and \p fp64 values, respectively.
Segments + * are identified by \p int32 keys, and have lengths uniformly sampled from [1,1000]. + * + * \image html reduce_by_key_fp32_len_500.png + * \image html reduce_by_key_fp64_len_500.png + * + * \par + * The following charts are similar, but with segment lengths uniformly sampled from [1,10]: + * + * \image html reduce_by_key_fp32_len_5.png + * \image html reduce_by_key_fp64_len_5.png + * + * \par Snippet + * The code snippet below illustrates the segmented reduction of \p int values grouped + * by runs of associated \p int keys. + * \par + * \code + * #include // or equivalently + * + * // CustomMin functor + * struct CustomMin + * { + * template + * CUB_RUNTIME_FUNCTION __forceinline__ + * T operator()(const T &a, const T &b) const { + * return (b < a) ? b : a; + * } + * }; + * + * // Declare, allocate, and initialize device pointers for input and output + * int num_items; // e.g., 8 + * int *d_keys_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] + * int *d_values_in; // e.g., [0, 7, 1, 6, 2, 5, 3, 4] + * int *d_keys_out; // e.g., [ , , , , , , , ] + * int *d_values_out; // e.g., [ , , , , , , , ] + * int *d_num_segments; // e.g., [ ] + * CustomMin reduction_op; + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_segments, reduction_op, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run reduce-by-key + * cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_segments, reduction_op, num_items); + * + * // d_keys_out <-- [0, 2, 9, 5, 8] + * // d_values_out <-- [0, 1, 6, 2, 4] + * // d_num_segments <-- [5] + * + * \endcode + * + * \tparam KeyInputIterator [inferred] Random-access input iterator type for reading input keys \iterator + * \tparam KeyOutputIterator [inferred] Random-access output iterator type for writing output keys \iterator + * \tparam ValueInputIterator [inferred] Random-access input iterator type for reading input values \iterator + * \tparam ValueOutputIterator [inferred] Random-access output iterator type for writing output values \iterator + * \tparam NumSegmentsIterator [inferred] Output iterator type for recording the number of segments encountered \iterator + * \tparam ReductionOp [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) + */ + template < + typename KeyInputIterator, + typename KeyOutputIterator, + typename ValueInputIterator, + typename ValueOutputIterator, + typename NumSegmentsIterator, + typename ReductionOp> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t ReduceByKey( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + KeyInputIterator d_keys_in, ///< [in] Pointer to consecutive runs of input keys + KeyOutputIterator d_keys_out, ///< [out] Pointer to output keys (one key per run) + ValueInputIterator d_values_in, ///< [in] Pointer to consecutive runs of input values + ValueOutputIterator d_values_out, ///< [out] Pointer to output value aggregates (one aggregate per run) + NumSegmentsIterator d_num_segments, ///< [out] Pointer to total number of segments + ReductionOp reduction_op, ///< [in] Binary reduction functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) + int num_items, ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_in_keys and \p d_in_values) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + typedef int Offset; // Signed integer type for global offsets + typedef NullType* FlagIterator; // Flag iterator type (not used) + typedef NullType SelectOp; // Selection op (not used) + typedef Equality EqualityOp; // Default == operator + + return DeviceReduceByKeyDispatch::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_keys_out, + d_values_in, + d_values_out, + d_num_segments, + EqualityOp(), + reduction_op, + num_items, + stream, + debug_synchronous); + } + + + /** + * \brief Counts the segment lengths in the sequence \p d_in, where segments are demarcated by runs of identical values. + * + * \par + * This operation computes a run-length encoding of \p d_in, where segments are identified + * by "runs" of consecutive, identical values. The length of the ith segment + * is written to d_counts_out[i]. The unique values are also compacted, + * i.e., the first value in the ith segment is copied to + * d_compacted_out[i]. The total number of segments discovered is written + * to \p d_num_segments. + * + * \par + * - The == equality operator is used to determine whether values are equivalent + * - \devicestorage + * - \cdp + * + * \par Performance + * The following charts illustrate saturated encode performance across different + * CUDA architectures for \p int32 and \p int64 items, respectively. Segments have + * lengths uniformly sampled from [1,1000]. + * + * \image html rle_int32_len_500.png + * \image html rle_int64_len_500.png + * + * \par + * The following charts are similar, but with segment lengths uniformly sampled from [1,10]: + * + * \image html rle_int32_len_5.png + * \image html rle_int64_len_5.png + * + * \par Snippet + * The code snippet below illustrates the run-length encoding of a sequence of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device pointers for input and output + * int num_items; // e.g., 8 + * int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] + * int *d_compacted_out; // e.g., [ , , , , , , , ] + * int *d_counts_out; // e.g., [ , , , , , , , ] + * int *d_num_segments; // e.g., [ ] + * ... 
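+ * // (In the worst case every run has length one, so d_compacted_out and
+ * // d_counts_out should each provide room for num_items entries, as above.)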
+ *
+ * // Determine temporary device storage requirements
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceReduce::RunLengthEncode(d_temp_storage, temp_storage_bytes, d_in, d_compacted_out, d_counts_out, d_num_segments, num_items);
+ *
+ * // Allocate temporary storage
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run encoding
+ * cub::DeviceReduce::RunLengthEncode(d_temp_storage, temp_storage_bytes, d_in, d_compacted_out, d_counts_out, d_num_segments, num_items);
+ *
+ * // d_compacted_out <-- [0, 2, 9, 5, 8]
+ * // d_counts_out <-- [1, 2, 1, 3, 1]
+ * // d_num_segments <-- [5]
+ *
+ * \endcode
+ *
+ * \tparam InputIterator [inferred] Random-access input iterator type for reading input items \iterator
+ * \tparam OutputIterator [inferred] Random-access output iterator type for writing compacted output items \iterator
+ * \tparam CountsOutputIterator [inferred] Random-access output iterator type for writing output counts \iterator
+ * \tparam NumSegmentsIterator [inferred] Output iterator type for recording the number of segments encountered \iterator
+ */
+ template <
+ typename InputIterator,
+ typename OutputIterator,
+ typename CountsOutputIterator,
+ typename NumSegmentsIterator>
+ CUB_RUNTIME_FUNCTION __forceinline__
+ static cudaError_t RunLengthEncode(
+ void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+ InputIterator d_in, ///< [in] Pointer to consecutive runs of input keys
+ OutputIterator d_compacted_out, ///< [out] Pointer to the output sequence of compacted keys (one key per run)
+ CountsOutputIterator d_counts_out, ///< [out] Pointer to the output sequence of run-lengths (one count per run)
+ NumSegmentsIterator d_num_segments, ///< [out] Pointer to total number of segments
+ int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in)
+ cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0.
+ bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
+ { + // Data type of value iterator + typedef typename std::iterator_traits::value_type Value; + + typedef int Offset; // Signed integer type for global offsets + typedef NullType* FlagIterator; // Flag iterator type (not used) + typedef NullType SelectOp; // Selection op (not used) + typedef Equality EqualityOp; // Default == operator + typedef cub::Sum ReductionOp; // Value reduction operator + + // Generator type for providing 1s values for run-length reduction + typedef ConstantInputIterator CountsInputIterator; + + Value one_val; + one_val = 1; + + return DeviceReduceByKeyDispatch::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_compacted_out, + CountsInputIterator(one_val), + d_counts_out, + d_num_segments, + EqualityOp(), + ReductionOp(), + num_items, + stream, + debug_synchronous); + } + +}; + +/** + * \example example_device_reduce.cu + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/SRC/cub/device/device_scan.cuh b/SRC/cub/device/device_scan.cuh new file mode 100644 index 00000000..511acc88 --- /dev/null +++ b/SRC/cub/device/device_scan.cuh @@ -0,0 +1,419 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within global memory. + */ + +#pragma once + +#include +#include + +#include "dispatch/device_scan_dispatch.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within global memory. 
![](device_scan.png)
+ * \ingroup DeviceModule
+ *
+ * \par Overview
+ * Given a sequence of input elements and a binary reduction operator, a [prefix scan](http://en.wikipedia.org/wiki/Prefix_sum)
+ * produces an output sequence where each element is computed to be the reduction
+ * of the elements occurring earlier in the input sequence. Prefix sum
+ * connotes a prefix scan with the addition operator. The term \em inclusive indicates
+ * that the ith output reduction incorporates the ith input.
+ * The term \em exclusive indicates the ith input is not incorporated into
+ * the ith output reduction.
+ *
+ * \par Usage Considerations
+ * \cdp_class{DeviceScan}
+ *
+ * \par Performance
+ * \linear_performance{prefix scan}
+ *
+ * \par
+ * The following chart illustrates DeviceScan::ExclusiveSum
+ * performance across different CUDA architectures for \p int32 keys.
+ * \plots_below
+ *
+ * \image html scan_int32.png
+ *
+ */
+struct DeviceScan
+{
+ /******************************************************************//**
+ * \name Exclusive scans
+ *********************************************************************/
+ //@{
+
+ /**
+ * \brief Computes a device-wide exclusive prefix sum.
+ *
+ * \par
+ * - Supports non-commutative sum operators.
+ * - \devicestorage
+ * - \cdp
+ *
+ * \par Performance
+ * The following charts illustrate saturated exclusive sum performance across different
+ * CUDA architectures for \p int32 and \p int64 items, respectively.
+ *
+ * \image html scan_int32.png
+ * \image html scan_int64.png
+ *
+ * \par Snippet
+ * The code snippet below illustrates the exclusive prefix sum of an \p int device vector.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/device/device_scan.cuh>
+ *
+ * // Declare, allocate, and initialize device pointers for input and output
+ * int num_items; // e.g., 7
+ * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * int *d_out; // e.g., [ , , , , , , ]
+ * ...
+ *
+ * // Determine temporary device storage requirements
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+ *
+ * // Allocate temporary storage
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run exclusive prefix sum
+ * cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+ *
+ * // d_out <-- [0, 8, 14, 21, 26, 29, 29]
+ *
+ * \endcode
+ *
+ * \tparam InputIterator [inferred] Random-access input iterator type for reading scan input data \iterator
+ * \tparam OutputIterator [inferred] Random-access output iterator type for writing scan output data \iterator
+ */
+ template <
+ typename InputIterator,
+ typename OutputIterator>
+ CUB_RUNTIME_FUNCTION
+ static cudaError_t ExclusiveSum(
+ void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+ InputIterator d_in, ///< [in] Pointer to the input sequence of data items
+ OutputIterator d_out, ///< [out] Pointer to the output sequence of data items
+ int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in)
+ cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0.
+ bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors.
May cause significant slowdown. Default is \p false.
+ {
+ // Signed integer type for global offsets
+ typedef int Offset;
+
+ // Scan data type
+ typedef typename std::iterator_traits<InputIterator>::value_type T;
+
+ return DeviceScanDispatch<InputIterator, OutputIterator, Sum, T, Offset>::Dispatch(
+ d_temp_storage,
+ temp_storage_bytes,
+ d_in,
+ d_out,
+ Sum(),
+ T(),
+ num_items,
+ stream,
+ debug_synchronous);
+ }
+
+
+ /**
+ * \brief Computes a device-wide exclusive prefix scan using the specified binary \p scan_op functor.
+ *
+ * \par
+ * - Supports non-commutative scan operators.
+ * - \devicestorage
+ * - \cdp
+ *
+ * \par Performance
+ * Performance is typically similar to DeviceScan::ExclusiveSum.
+ *
+ * \par Snippet
+ * The code snippet below illustrates the exclusive prefix min-scan of an \p int device vector.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/device/device_scan.cuh>
+ * #include <climits>       // for INT_MAX
+ *
+ * // CustomMin functor
+ * struct CustomMin
+ * {
+ *     template <typename T>
+ *     CUB_RUNTIME_FUNCTION __forceinline__
+ *     T operator()(const T &a, const T &b) const {
+ *         return (b < a) ? b : a;
+ *     }
+ * };
+ *
+ * // Declare, allocate, and initialize device pointers for input and output
+ * int num_items; // e.g., 7
+ * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * int *d_out; // e.g., [ , , , , , , ]
+ * CustomMin min_op;
+ * ...
+ *
+ * // Determine temporary device storage requirements for exclusive prefix scan
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, (int) INT_MAX, num_items);
+ *
+ * // Allocate temporary storage for exclusive prefix scan
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run exclusive prefix min-scan
+ * cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, (int) INT_MAX, num_items);
+ *
+ * // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0]
+ *
+ * \endcode
+ *
+ * \tparam InputIterator [inferred] Random-access input iterator type for reading scan input data \iterator
+ * \tparam OutputIterator [inferred] Random-access output iterator type for writing scan output data \iterator
+ * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b)
+ * \tparam Identity [inferred] Type of the \p identity value
+ */
+ template <
+ typename InputIterator,
+ typename OutputIterator,
+ typename ScanOp,
+ typename Identity>
+ CUB_RUNTIME_FUNCTION
+ static cudaError_t ExclusiveScan(
+ void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+ InputIterator d_in, ///< [in] Pointer to the input sequence of data items
+ OutputIterator d_out, ///< [out] Pointer to the output sequence of data items
+ ScanOp scan_op, ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
+ Identity identity, ///< [in] Identity element
+ int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in)
+ cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0.
+ bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
+ { + // Signed integer type for global offsets + typedef int Offset; + + return DeviceScanDispatch::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + scan_op, + identity, + num_items, + stream, + debug_synchronous); + } + + + //@} end member group + /******************************************************************//** + * \name Inclusive scans + *********************************************************************/ + //@{ + + + /** + * \brief Computes a device-wide inclusive prefix sum. + * + * \par + * - Supports non-commutative sum operators. + * - \devicestorage + * - \cdp + * + * \par Performance + * Performance is typically similar to DeviceScan::ExclusiveSum. + * + * \par Snippet + * The code snippet below illustrates the inclusive prefix sum of an \p int device vector. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [ , , , , , , ] + * ... + * + * // Determine temporary device storage requirements for inclusive prefix sum + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + * + * // Allocate temporary storage for inclusive prefix sum + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run inclusive prefix sum + * cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + * + * // d_out <-- [8, 14, 21, 26, 29, 29, 38] + * + * \endcode + * + * \tparam InputIterator [inferred] Random-access input iterator type for reading scan input data \iterator + * \tparam OutputIterator [inferred] Random-access output iterator type for writing scan output data \iterator + */ + template < + typename InputIterator, + typename OutputIterator> + CUB_RUNTIME_FUNCTION + static cudaError_t InclusiveSum( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIterator d_in, ///< [in] Pointer to the input sequence of data items + OutputIterator d_out, ///< [out] Pointer to the output sequence of data items + int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + // Signed integer type for global offsets + typedef int Offset; + + return DeviceScanDispatch::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + Sum(), + NullType(), + num_items, + stream, + debug_synchronous); + } + + + /** + * \brief Computes a device-wide inclusive prefix scan using the specified binary \p scan_op functor. + * + * \par + * - Supports non-commutative scan operators. + * - \devicestorage + * - \cdp + * + * \par Performance + * Performance is typically similar to DeviceScan::ExclusiveSum. + * + * \par Snippet + * The code snippet below illustrates the inclusive prefix min-scan of an \p int device vector. 
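+ * (Unlike the exclusive variant above, no identity value is supplied: the ith
+ * output incorporates the ith input, so the first output below is simply the
+ * first input, 8.)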
+ * \par + * \code + * #include // or equivalently + * + * // CustomMin functor + * struct CustomMin + * { + * template + * CUB_RUNTIME_FUNCTION __forceinline__ + * T operator()(const T &a, const T &b) const { + * return (b < a) ? b : a; + * } + * }; + * + * // Declare, allocate, and initialize device pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [ , , , , , , ] + * CustomMin min_op; + * ... + * + * // Determine temporary device storage requirements for inclusive prefix scan + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, num_items); + * + * // Allocate temporary storage for inclusive prefix scan + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run inclusive prefix min-scan + * cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, num_items); + * + * // d_out <-- [8, 6, 6, 5, 3, 0, 0] + * + * \endcode + * + * \tparam InputIterator [inferred] Random-access input iterator type for reading scan input data \iterator + * \tparam OutputIterator [inferred] Random-access output iterator type for writing scan output data \iterator + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template < + typename InputIterator, + typename OutputIterator, + typename ScanOp> + CUB_RUNTIME_FUNCTION + static cudaError_t InclusiveScan( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIterator d_in, ///< [in] Pointer to the input sequence of data items + OutputIterator d_out, ///< [out] Pointer to the output sequence of data items + ScanOp scan_op, ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) + int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + // Signed integer type for global offsets + typedef int Offset; + + return DeviceScanDispatch::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + scan_op, + NullType(), + num_items, + stream, + debug_synchronous); + } + + //@} end member group + +}; + +/** + * \example example_device_scan.cu + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/SRC/cub/device/device_select.cuh b/SRC/cub/device/device_select.cuh new file mode 100644 index 00000000..8357c59d --- /dev/null +++ b/SRC/cub/device/device_select.cuh @@ -0,0 +1,372 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceSelect provides device-wide, parallel operations for selecting items from sequences of data items residing within global memory. + */ + +#pragma once + +#include +#include + +#include "dispatch/device_select_dispatch.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceSelect provides device-wide, parallel operations for compacting selected items from sequences of data items residing within global memory. ![](select_logo.png) + * \ingroup DeviceModule + * + * \par Overview + * These operations apply a selection criterion to selectively copy + * items from a specified input sequence to a compact output sequence. + * + * \par Usage Considerations + * \cdp_class{DeviceSelect} + * + * \par Performance + * \linear_performance{select-flagged, select-if, and select-unique} + * + * \par + * The following chart illustrates DeviceSelect::If + * performance across different CUDA architectures for \p int32 items, + * where 50% of the items are randomly selected. + * + * \image html select_if_int32_50_percent.png + * + * \par + * The following chart illustrates DeviceSelect::Unique + * performance across different CUDA architectures for \p int32 items + * where segments have lengths uniformly sampled from [1,1000]. + * + * \image html select_unique_int32_len_500.png + * + * \par + * \plots_below + * + */ +struct DeviceSelect +{ + /** + * \brief Uses the \p d_flags sequence to selectively copy the corresponding items from \p d_in into \p d_out. The total number of items selected is written to \p d_num_selected. ![](select_flags_logo.png) + * + * \par + * - The value type of \p d_flags must be castable to \p bool (e.g., \p bool, \p char, \p int, etc.). + * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering. 
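+ * - In the worst case every flag is set, so \p d_out should provide capacity for \p num_items items.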
+ * - \devicestorage + * - \cdp + * + * \par Snippet + * The code snippet below illustrates the compaction of items selected from an \p int device vector. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device pointers for input, flags, and output + * int num_items; // e.g., 8 + * int *d_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] + * char *d_flags; // e.g., [1, 0, 0, 1, 0, 1, 1, 0] + * int *d_out; // e.g., [ , , , , , , , ] + * int *d_num_selected; // e.g., [ ] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run selection + * cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected, num_items); + * + * // d_out <-- [1, 4, 6, 7] + * // d_num_selected <-- [4] + * + * \endcode + * + * \tparam InputIterator [inferred] Random-access input iterator type for reading input items \iterator + * \tparam FlagIterator [inferred] Random-access input iterator type for reading selection flags \iterator + * \tparam OutputIterator [inferred] Random-access output iterator type for writing selected items \iterator + * \tparam NumSelectedIterator [inferred] Output iterator type for recording the number of items selected \iterator + */ + template < + typename InputIterator, + typename FlagIterator, + typename OutputIterator, + typename NumSelectedIterator> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Flagged( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIterator d_in, ///< [in] Pointer to the input sequence of data items + FlagIterator d_flags, ///< [in] Pointer to the input sequence of selection flags + OutputIterator d_out, ///< [out] Pointer to the output sequence of selected data items + NumSelectedIterator d_num_selected, ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out) + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + typedef int Offset; // Signed integer type for global offsets + typedef NullType SelectOp; // Selection op (not used) + typedef NullType EqualityOp; // Equality operator (not used) + + return DeviceSelectDispatch::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_flags, + d_out, + d_num_selected, + SelectOp(), + EqualityOp(), + num_items, + stream, + debug_synchronous); + } + + + /** + * \brief Uses the \p select_op functor to selectively copy items from \p d_in into \p d_out. The total number of items selected is written to \p d_num_selected. ![](select_logo.png) + * + * \par + * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering. 
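+ * - In the worst case every item satisfies \p select_op, so \p d_out should provide capacity for \p num_items items.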
+ * - \devicestorage + * - \cdp + * + * \par Performance + * The following charts illustrate saturated select-if performance across different + * CUDA architectures for \p int32 and \p int64 items, respectively. Items are + * selected with 50% probability. + * + * \image html select_if_int32_50_percent.png + * \image html select_if_int64_50_percent.png + * + * \par + * The following charts are similar, but 5% selection probability: + * + * \image html select_if_int32_5_percent.png + * \image html select_if_int64_5_percent.png + * + * \par Snippet + * The code snippet below illustrates the compaction of items selected from an \p int device vector. + * \par + * \code + * #include // or equivalently + * + * // Functor type for selecting values less than some criteria + * struct LessThan + * { + * int compare; + * + * CUB_RUNTIME_FUNCTION __forceinline__ + * LessThan(int compare) : compare(compare) {} + * + * CUB_RUNTIME_FUNCTION __forceinline__ + * bool operator()(const int &a) const { + * return (a < compare); + * } + * }; + * + * // Declare, allocate, and initialize device pointers for input and output + * int num_items; // e.g., 8 + * int *d_in; // e.g., [0, 2, 3, 9, 5, 2, 81, 8] + * int *d_out; // e.g., [ , , , , , , , ] + * int *d_num_selected; // e.g., [ ] + * LessThan select_op(7); + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected, num_items, select_op); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run selection + * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected, num_items, select_op); + * + * // d_out <-- [0, 2, 3, 5, 2] + * // d_num_selected <-- [5] + * + * \endcode + * + * \tparam InputIterator [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIterator [inferred] Random-access output iterator type for writing selected items \iterator + * \tparam NumSelectedIterator [inferred] Output iterator type for recording the number of items selected \iterator + * \tparam SelectOp [inferred] Selection operator type having member bool operator()(const T &a) + */ + template < + typename InputIterator, + typename OutputIterator, + typename NumSelectedIterator, + typename SelectOp> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t If( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIterator d_in, ///< [in] Pointer to the input sequence of data items + OutputIterator d_out, ///< [out] Pointer to the output sequence of selected data items + NumSelectedIterator d_num_selected, ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out) + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + SelectOp select_op, ///< [in] Unary selection operator + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
+ { + typedef int Offset; // Signed integer type for global offsets + typedef NullType* FlagIterator; // Flag iterator type (not used) + typedef NullType EqualityOp; // Equality operator (not used) + + return DeviceSelectDispatch::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + NULL, + d_out, + d_num_selected, + select_op, + EqualityOp(), + num_items, + stream, + debug_synchronous); + } + + + /** + * \brief Given an input sequence \p d_in having runs of consecutive equal-valued keys, only the first key from each run is selectively copied to \p d_out. The total number of items selected is written to \p d_num_selected. ![](unique_logo.png) + * + * \par + * - The == equality operator is used to determine whether keys are equivalent + * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering. + * - \devicestorage + * - \cdp + * + * \par Performance + * The following charts illustrate saturated select-unique performance across different + * CUDA architectures for \p int32 and \p int64 items, respectively. Segments have + * lengths uniformly sampled from [1,1000]. + * + * \image html select_unique_int32_len_500.png + * \image html select_unique_int64_len_500.png + * + * \par + * The following charts are similar, but with segment lengths uniformly sampled from [1,10]: + * + * \image html select_unique_int32_len_5.png + * \image html select_unique_int64_len_5.png + * + * \par Snippet + * The code snippet below illustrates the compaction of items selected from an \p int device vector. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device pointers for input and output + * int num_items; // e.g., 8 + * int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] + * int *d_out; // e.g., [ , , , , , , , ] + * int *d_num_selected; // e.g., [ ] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run selection + * cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected, num_items); + * + * // d_out <-- [0, 2, 9, 5, 8] + * // d_num_selected <-- [5] + * + * \endcode + * + * \tparam InputIterator [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIterator [inferred] Random-access output iterator type for writing selected items \iterator + * \tparam NumSelectedIterator [inferred] Output iterator type for recording the number of items selected \iterator + */ + template < + typename InputIterator, + typename OutputIterator, + typename NumSelectedIterator> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Unique( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+        size_t                      &temp_storage_bytes,          ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIterator               d_in,                         ///< [in] Pointer to the input sequence of data items
+        OutputIterator              d_out,                        ///< [out] Pointer to the output sequence of selected data items
+        NumSelectedIterator         d_num_selected,               ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out)
+        int                         num_items,                    ///< [in] Total number of input items (i.e., length of \p d_in)
+        cudaStream_t                stream             = 0,       ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                        debug_synchronous  = false)   ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        typedef int                     Offset;         // Signed integer type for global offsets
+        typedef NullType*               FlagIterator;   // Flag iterator type (not used)
+        typedef NullType                SelectOp;       // Selection op (not used)
+        typedef Equality                EqualityOp;     // Default == operator
+
+        return DeviceSelectDispatch<InputIterator, FlagIterator, OutputIterator, NumSelectedIterator, SelectOp, EqualityOp, Offset, false>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            NULL,
+            d_out,
+            d_num_selected,
+            SelectOp(),
+            EqualityOp(),
+            num_items,
+            stream,
+            debug_synchronous);
+    }
+
+};
+
+/**
+ * \example example_device_select_flagged.cu
+ * \example example_device_select_if.cu
+ * \example example_device_select_unique.cu
+ */
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/SRC/cub/device/dispatch/device_histogram_dispatch.cuh b/SRC/cub/device/dispatch/device_histogram_dispatch.cuh
new file mode 100644
index 00000000..1c2d1b36
--- /dev/null
+++ b/SRC/cub/device/dispatch/device_histogram_dispatch.cuh
@@ -0,0 +1,554 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ *   cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of sample data residing within global memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "../../block_range/block_range_histo.cuh"
+#include "../../grid/grid_even_share.cuh"
+#include "../../grid/grid_queue.cuh"
+#include "../../util_debug.cuh"
+#include "../../util_device.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/******************************************************************************
+ * Kernel entry points
+ *****************************************************************************/
+
+/**
+ * Initialization kernel entry point (multi-block).  Prepares queue descriptors and zeroes global counters.
+ */
+template <
+    int                                             BINS,               ///< Number of histogram bins per channel
+    int                                             ACTIVE_CHANNELS,    ///< Number of channels actively being histogrammed
+    typename                                        Offset,             ///< Signed integer type for global offsets
+    typename                                        HistoCounter>       ///< Integer type for counting sample occurrences per histogram bin
+__launch_bounds__ (BINS, 1)
+__global__ void HistoInitKernel(
+    GridQueue<Offset>                               grid_queue,         ///< [in] Drain queue descriptor for dynamically mapping tile data onto thread blocks
+    ArrayWrapper<HistoCounter*, ACTIVE_CHANNELS>    d_out_histograms,   ///< [out] Histogram counter data having logical dimensions HistoCounter[ACTIVE_CHANNELS][BINS]
+    Offset                                          num_samples)        ///< [in] Total number of samples \p d_samples for all channels
+{
+    d_out_histograms.array[blockIdx.x][threadIdx.x] = 0;
+    if (threadIdx.x == 0) grid_queue.FillAndResetDrain(num_samples);
+}
+
+
+/**
+ * Histogram tiles kernel entry point (multi-block).  Computes privatized histograms, one per thread block.
+ */
+template <
+    typename                                        BlockRangeHistogramPolicy,  ///< Parameterized BlockRangeHistogramPolicy tuning policy type
+    int                                             BINS,                       ///< Number of histogram bins per channel
+    int                                             CHANNELS,                   ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
+    int                                             ACTIVE_CHANNELS,            ///< Number of channels actively being histogrammed
+    typename                                        InputIterator,              ///< The input iterator type \iterator.  Must have a value type that is assignable to unsigned char
+    typename                                        HistoCounter,               ///< Integer type for counting sample occurrences per histogram bin
+    typename                                        Offset>                     ///< Signed integer type for global offsets
+__launch_bounds__ (int(BlockRangeHistogramPolicy::BLOCK_THREADS))
+__global__ void HistoRegionKernel(
+    InputIterator                                   d_samples,          ///< [in] Array of sample data.  The samples from different channels are assumed to be interleaved (e.g., an array of 32b pixels where each pixel consists of four RGBA 8b samples).
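+    // This kernel computes one privatized histogram per thread block, which
+    // HistoAggregateKernel later reduces into the final output.  The core idea,
+    // as a simplified standalone CUDA sketch (single channel, 8-bit samples;
+    // names here are illustrative, not CUB's):
+    //
+    //     __global__ void BlockHisto(const unsigned char *in, int n, unsigned int *block_histos)
+    //     {
+    //         __shared__ unsigned int histo[256];
+    //         for (int i = threadIdx.x; i < 256; i += blockDim.x) histo[i] = 0;
+    //         __syncthreads();
+    //         for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x)
+    //             atomicAdd(&histo[in[i]], 1);                    // shared-memory atomics only
+    //         __syncthreads();
+    //         for (int i = threadIdx.x; i < 256; i += blockDim.x)
+    //             block_histos[blockIdx.x * 256 + i] = histo[i];  // privatized copy; aggregated later
+    //     }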
+    ArrayWrapper<HistoCounter*, ACTIVE_CHANNELS>    d_out_histograms,   ///< [out] Histogram counter data having logical dimensions HistoCounter[ACTIVE_CHANNELS][gridDim.x][BINS]
+    Offset                                          num_samples,        ///< [in] Total number of samples \p d_samples for all channels
+    GridEvenShare<Offset>                           even_share,         ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block
+    GridQueue<Offset>                               queue)              ///< [in] Drain queue descriptor for dynamically mapping tile data onto thread blocks
+{
+    // Constants
+    enum
+    {
+        BLOCK_THREADS       = BlockRangeHistogramPolicy::BLOCK_THREADS,
+        ITEMS_PER_THREAD    = BlockRangeHistogramPolicy::ITEMS_PER_THREAD,
+        TILE_SIZE           = BLOCK_THREADS * ITEMS_PER_THREAD,
+    };
+
+    // Thread block type for compositing input tiles
+    typedef BlockRangeHistogram<BlockRangeHistogramPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIterator, HistoCounter, Offset> BlockRangeHistogramT;
+
+    // Shared memory for BlockRangeHistogram
+    __shared__ typename BlockRangeHistogramT::TempStorage temp_storage;
+
+    // Consume input tiles
+    BlockRangeHistogramT(temp_storage, d_samples, d_out_histograms.array).ConsumeRange(
+        num_samples,
+        even_share,
+        queue,
+        Int2Type<BlockRangeHistogramPolicy::GRID_MAPPING>());
+}
+
+
+/**
+ * Aggregation kernel entry point (single-block).  Aggregates privatized threadblock histograms from a previous multi-block histogram pass.
+ */
+template <
+    int                                             BINS,                   ///< Number of histogram bins per channel
+    int                                             ACTIVE_CHANNELS,        ///< Number of channels actively being histogrammed
+    typename                                        HistoCounter>           ///< Integer type for counting sample occurrences per histogram bin
+__launch_bounds__ (BINS, 1)
+__global__ void HistoAggregateKernel(
+    HistoCounter*                                   d_block_histograms,     ///< [in] Histogram counter data having logical dimensions HistoCounter[ACTIVE_CHANNELS][num_threadblocks][BINS]
+    ArrayWrapper<HistoCounter*, ACTIVE_CHANNELS>    d_out_histograms,       ///< [out] Histogram counter data having logical dimensions HistoCounter[ACTIVE_CHANNELS][BINS]
+    int                                             num_threadblocks)       ///< [in] Number of threadblock histograms per channel in \p d_block_histograms
+{
+    // Accumulate threadblock-histograms from the channel
+    HistoCounter bin_aggregate = 0;
+
+    int block_offset = blockIdx.x * (num_threadblocks * BINS);
+    int block_end = block_offset + (num_threadblocks * BINS);
+
+#if CUB_PTX_ARCH >= 200
+    #pragma unroll 32
+#endif
+    while (block_offset < block_end)
+    {
+        HistoCounter block_bin_count = d_block_histograms[block_offset + threadIdx.x];
+
+        bin_aggregate += block_bin_count;
+        block_offset += BINS;
+    }
+
+    // Output
+    d_out_histograms.array[blockIdx.x][threadIdx.x] = bin_aggregate;
+}
+
+
+
+/******************************************************************************
+ * Dispatch
+ ******************************************************************************/
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for DeviceHistogram
+ */
+template <
+    DeviceHistogramAlgorithm    HISTO_ALGORITHM,    ///< Cooperative histogram algorithm to use
+    int                         BINS,               ///< Number of histogram bins per channel
+    int                         CHANNELS,           ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
+    int                         ACTIVE_CHANNELS,    ///< Number of channels actively being histogrammed
+    typename                    InputIterator,      ///< The input iterator type \iterator.
Must have a value type that is assignable to unsigned char + typename HistoCounter, ///< Integer type for counting sample occurrences per histogram bin + typename Offset> ///< Signed integer type for global offsets +struct DeviceHistogramDispatch +{ + /****************************************************************************** + * Tuning policies + ******************************************************************************/ + + /// SM35 + struct Policy350 + { + // HistoRegionPolicy + typedef BlockRangeHistogramPolicy< + (HISTO_ALGORITHM == DEVICE_HISTO_SORT) ? 128 : 256, + (HISTO_ALGORITHM == DEVICE_HISTO_SORT) ? 12 : (30 / ACTIVE_CHANNELS), + HISTO_ALGORITHM, + (HISTO_ALGORITHM == DEVICE_HISTO_SORT) ? GRID_MAPPING_DYNAMIC : GRID_MAPPING_EVEN_SHARE> + HistoRegionPolicy; + }; + + /// SM30 + struct Policy300 + { + // HistoRegionPolicy + typedef BlockRangeHistogramPolicy< + 128, + (HISTO_ALGORITHM == DEVICE_HISTO_SORT) ? 20 : (22 / ACTIVE_CHANNELS), + HISTO_ALGORITHM, + (HISTO_ALGORITHM == DEVICE_HISTO_SORT) ? GRID_MAPPING_DYNAMIC : GRID_MAPPING_EVEN_SHARE> + HistoRegionPolicy; + }; + + /// SM20 + struct Policy200 + { + // HistoRegionPolicy + typedef BlockRangeHistogramPolicy< + 128, + (HISTO_ALGORITHM == DEVICE_HISTO_SORT) ? 21 : (23 / ACTIVE_CHANNELS), + HISTO_ALGORITHM, + GRID_MAPPING_DYNAMIC> + HistoRegionPolicy; + }; + + /// SM10 + struct Policy100 + { + // HistoRegionPolicy + typedef BlockRangeHistogramPolicy< + 128, + 7, + DEVICE_HISTO_SORT, // (use sort regardless because g-atomics are unsupported and s-atomics are perf-useless) + GRID_MAPPING_EVEN_SHARE> + HistoRegionPolicy; + }; + + + /****************************************************************************** + * Tuning policies of current PTX compiler pass + ******************************************************************************/ + +#if (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#else + typedef Policy100 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxHistoRegionPolicy : PtxPolicy::HistoRegionPolicy {}; + + + /****************************************************************************** + * Utilities + ******************************************************************************/ + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ + static void InitConfigs( + int ptx_version, + KernelConfig &histo_range_config) + { + #if (CUB_PTX_ARCH > 0) + + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + histo_range_config.template Init(); + + #else + + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 350) + { + histo_range_config.template Init(); + } + else if (ptx_version >= 300) + { + histo_range_config.template Init(); + } + else if (ptx_version >= 200) + { + histo_range_config.template Init(); + } + else + { + histo_range_config.template Init(); + } + + #endif + } + + + /** + * Kernel kernel dispatch configuration + */ + struct KernelConfig + { + int block_threads; + int items_per_thread; + DeviceHistogramAlgorithm block_algorithm; + GridMappingStrategy grid_mapping; + + template + CUB_RUNTIME_FUNCTION __forceinline__ + void Init() + { 
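+            // The Init<BlockPolicy>() idiom below mirrors CUB's two-sided tuning
+            // scheme: device code is compiled against the single PtxPolicy chosen
+            // by the CUB_PTX_ARCH preprocessor test, while host code inspects the
+            // runtime ptx_version and fills this KernelConfig from the matching
+            // Policy###.  A freestanding sketch of the same pattern (names here
+            // are illustrative, not CUB's):
+            //
+            //     struct Policy200 { enum { BLOCK_THREADS = 128, ITEMS_PER_THREAD = 23 }; };
+            //     struct Policy350 { enum { BLOCK_THREADS = 256, ITEMS_PER_THREAD = 10 }; };
+            //
+            //     struct Config
+            //     {
+            //         int block_threads, items_per_thread;
+            //         template <typename Policy>
+            //         void Init() { block_threads = Policy::BLOCK_THREADS; items_per_thread = Policy::ITEMS_PER_THREAD; }
+            //     };
+            //
+            //     Config c;
+            //     if (ptx_version >= 350) c.Init<Policy350>();
+            //     else                    c.Init<Policy200>();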
+ block_threads = BlockPolicy::BLOCK_THREADS; + items_per_thread = BlockPolicy::ITEMS_PER_THREAD; + block_algorithm = BlockPolicy::HISTO_ALGORITHM; + grid_mapping = BlockPolicy::GRID_MAPPING; + } + + CUB_RUNTIME_FUNCTION __forceinline__ + void Print() + { + printf("%d, %d, %d, %d", block_threads, items_per_thread, block_algorithm, grid_mapping); + } + + }; + + + /****************************************************************************** + * Dispatch entrypoints + ******************************************************************************/ + + + /** + * Internal dispatch routine + */ + template < + typename InitHistoKernelPtr, ///< Function type of cub::HistoInitKernel + typename HistoRegionKernelPtr, ///< Function type of cub::HistoRegionKernel + typename AggregateHistoKernelPtr> ///< Function type of cub::HistoAggregateKernel + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIterator d_samples, ///< [in] Input samples to histogram + HistoCounter *d_histograms[ACTIVE_CHANNELS], ///< [out] Array of channel histograms, each having BINS counters of integral type \p HistoCounter. + Offset num_samples, ///< [in] Number of samples to process + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. + InitHistoKernelPtr init_kernel, ///< [in] Kernel function pointer to parameterization of cub::HistoInitKernel + HistoRegionKernelPtr histo_range_kernel, ///< [in] Kernel function pointer to parameterization of cub::HistoRegionKernel + AggregateHistoKernelPtr aggregate_kernel, ///< [in] Kernel function pointer to parameterization of cub::HistoAggregateKernel + KernelConfig histo_range_config) ///< [in] Dispatch parameters that match the policy that \p histo_range_kernel was compiled for + { + #ifndef CUB_RUNTIME_ENABLED + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported); + + #else + + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get device SM version + int sm_version; + if (CubDebug(error = SmVersion(sm_version, device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Get SM occupancy for histo_range_kernel + int histo_range_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + histo_range_sm_occupancy, + sm_version, + histo_range_kernel, + histo_range_config.block_threads))) break; + + // Get device occupancy for histo_range_kernel + int histo_range_occupancy = histo_range_sm_occupancy * sm_count; + + // Get tile size for histo_range_kernel + int channel_tile_size = histo_range_config.block_threads * histo_range_config.items_per_thread; + int tile_size = channel_tile_size * CHANNELS; + + // Even-share work distribution + int subscription_factor = histo_range_sm_occupancy; // Amount of CTAs to oversubscribe the device beyond actively-resident (heuristic) + GridEvenShare even_share( + num_samples, + histo_range_occupancy * subscription_factor, + 
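+                // Worked example of the two grid-sizing strategies resolved just
+                // below (illustrative numbers): with block_threads = 256,
+                // items_per_thread = 10, and CHANNELS = 1, tile_size = 2560.  For
+                // num_samples = 1<<20 there are ceil(1048576 / 2560) = 410 tiles.
+                //   - GRID_MAPPING_EVEN_SHARE pre-partitions those 410 tiles across
+                //     at most occupancy * subscription_factor blocks, each taking a
+                //     contiguous share;
+                //   - GRID_MAPPING_DYNAMIC launches min(410, histo_range_occupancy)
+                //     blocks that repeatedly pop tiles from the GridQueue until it
+                //     drains, which tolerates variable per-tile cost.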
tile_size); + + // Get grid size for histo_range_kernel + int histo_range_grid_size; + switch (histo_range_config.grid_mapping) + { + case GRID_MAPPING_EVEN_SHARE: + + // Work is distributed evenly + histo_range_grid_size = even_share.grid_size; + break; + + case GRID_MAPPING_DYNAMIC: + + // Work is distributed dynamically + int num_tiles = (num_samples + tile_size - 1) / tile_size; + histo_range_grid_size = (num_tiles < histo_range_occupancy) ? + num_tiles : // Not enough to fill the device with threadblocks + histo_range_occupancy; // Fill the device with threadblocks + break; + }; + + // Temporary storage allocation requirements + void* allocations[2]; + size_t allocation_sizes[2] = + { + ACTIVE_CHANNELS * histo_range_grid_size * sizeof(HistoCounter) * BINS, // bytes needed for privatized histograms + GridQueue::AllocationSize() // bytes needed for grid queue descriptor + }; + + // Alias the temporary allocations from the single storage blob (or set the necessary size of the blob) + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + return cudaSuccess; + } + + // Alias the allocation for the privatized per-block reductions + HistoCounter *d_block_histograms = (HistoCounter*) allocations[0]; + + // Alias the allocation for the grid queue descriptor + GridQueue queue(allocations[1]); + + // Setup array wrapper for histogram channel output (because we can't pass static arrays as kernel parameters) + ArrayWrapper d_histo_wrapper; + for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) + d_histo_wrapper.array[CHANNEL] = d_histograms[CHANNEL]; + + // Setup array wrapper for temporary histogram channel output (because we can't pass static arrays as kernel parameters) + ArrayWrapper d_temp_histo_wrapper; + for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) + d_temp_histo_wrapper.array[CHANNEL] = d_block_histograms + (CHANNEL * histo_range_grid_size * BINS); + + // Log init_kernel configuration + if (debug_synchronous) CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", ACTIVE_CHANNELS, BINS, (long long) stream); + + // Invoke init_kernel to initialize counters and queue descriptor + init_kernel<<>>(queue, d_histo_wrapper, num_samples); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Whether we need privatized histograms (i.e., non-global atomics and multi-block) + bool privatized_temporaries = (histo_range_grid_size > 1) && (histo_range_config.block_algorithm != DEVICE_HISTO_GLOBAL_ATOMIC); + + // Log histo_range_kernel configuration + if (debug_synchronous) CubLog("Invoking histo_range_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + histo_range_grid_size, histo_range_config.block_threads, (long long) stream, histo_range_config.items_per_thread, histo_range_sm_occupancy); + + // Invoke histo_range_kernel + histo_range_kernel<<>>( + d_samples, + (privatized_temporaries) ? 
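+                // AliasTemporaries (used above) carves the caller's single storage
+                // blob into the sub-allocations sized in allocation_sizes.  A
+                // hand-rolled sketch of the same idea, assuming 256-byte alignment
+                // (CUB's actual helper also handles the size-query pass):
+                //
+                //     size_t bytes[2] = {histo_bytes, queue_bytes};
+                //     size_t offset[2], total = 0;
+                //     for (int i = 0; i < 2; ++i)
+                //     {
+                //         offset[i] = total;
+                //         total    += (bytes[i] + 255) & ~size_t(255);   // round up to 256B
+                //     }
+                //     if (d_temp_storage == NULL) { temp_storage_bytes = total; return cudaSuccess; }
+                //     void *alloc0 = (char*) d_temp_storage + offset[0];
+                //     void *alloc1 = (char*) d_temp_storage + offset[1];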
+ d_temp_histo_wrapper : + d_histo_wrapper, + num_samples, + even_share, + queue); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Aggregate privatized block histograms if necessary + if (privatized_temporaries) + { + // Log aggregate_kernel configuration + if (debug_synchronous) CubLog("Invoking aggregate_kernel<<<%d, %d, 0, %lld>>>()\n", + ACTIVE_CHANNELS, BINS, (long long) stream); + + // Invoke aggregate_kernel + aggregate_kernel<<>>( + d_block_histograms, + d_histo_wrapper, + histo_range_grid_size); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + } + while (0); + + return error; + + #endif // CUB_RUNTIME_ENABLED + } + + + /** + * Internal dispatch routine + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIterator d_samples, ///< [in] Input samples to histogram + HistoCounter *d_histograms[ACTIVE_CHANNELS], ///< [out] Array of channel histograms, each having BINS counters of integral type \p HistoCounter. + int num_samples, ///< [in] Number of samples to process + cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel kernel dispatch configurations + KernelConfig histo_range_config; + InitConfigs(ptx_version, histo_range_config); + + // Dispatch + if (CubDebug(error = Dispatch( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_histograms, + num_samples, + stream, + debug_synchronous, + HistoInitKernel, + HistoRegionKernel, + HistoAggregateKernel, + histo_range_config))) break; + } + while (0); + + return error; + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/SRC/cub/device/dispatch/device_radix_sort_dispatch.cuh b/SRC/cub/device/dispatch/device_radix_sort_dispatch.cuh new file mode 100644 index 00000000..028a5684 --- /dev/null +++ b/SRC/cub/device/dispatch/device_radix_sort_dispatch.cuh @@ -0,0 +1,939 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ *   cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within global memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "../../block_range/block_range_radix_sort_upsweep.cuh"
+#include "../../block_range/block_range_radix_sort_downsweep.cuh"
+#include "../../block_range/block_range_scan.cuh"
+#include "../../grid/grid_even_share.cuh"
+#include "../../util_debug.cuh"
+#include "../../util_device.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/******************************************************************************
+ * Kernel entry points
+ *****************************************************************************/
+
+/**
+ * Upsweep pass kernel entry point (multi-block).  Computes privatized digit histograms, one per block.
+ */
+template <
+    typename                BlockRangeRadixSortUpsweepPolicy,   ///< Parameterized BlockRangeRadixSortUpsweepPolicy tuning policy type
+    bool                    DESCENDING,                         ///< Whether or not the sorted-order is high-to-low
+    typename                Key,                                ///< Key type
+    typename                Offset>                             ///< Signed integer type for global offsets
+__launch_bounds__ (int(BlockRangeRadixSortUpsweepPolicy::BLOCK_THREADS), 1)
+__global__ void RadixSortUpsweepKernel(
+    Key                     *d_keys,                            ///< [in] Input keys buffer
+    Offset                  *d_spine,                           ///< [out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
+    Offset                  num_items,                          ///< [in] Total number of input data items
+    int                     current_bit,                        ///< [in] Bit position of current radix digit
+    int                     num_bits,                           ///< [in] Number of bits of current radix digit
+    bool                    first_pass,                         ///< [in] Whether this is the first digit pass
+    GridEvenShare<Offset>   even_share)                         ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block
+{
+    // Parameterize BlockRangeRadixSortUpsweep type for the current configuration
+    typedef BlockRangeRadixSortUpsweep<BlockRangeRadixSortUpsweepPolicy, Key, Offset> BlockRangeRadixSortUpsweepT;          // Primary
+
+    // Shared memory storage
+    __shared__ typename BlockRangeRadixSortUpsweepT::TempStorage temp_storage;
+
+    // Initialize even-share descriptor for this thread block
+    even_share.BlockInit();
+
+    Offset bin_count;
+    BlockRangeRadixSortUpsweepT(temp_storage, d_keys, current_bit, num_bits).ProcessRegion(
+        even_share.block_offset,
+        even_share.block_end,
+        bin_count);
+
+    // Write out digit counts (striped)
+    if (threadIdx.x < BlockRangeRadixSortUpsweepT::RADIX_DIGITS)
+    {
+        int bin_idx = (DESCENDING) ?
+            BlockRangeRadixSortUpsweepT::RADIX_DIGITS - threadIdx.x - 1 :
+            threadIdx.x;
+
+        d_spine[(gridDim.x * bin_idx) + blockIdx.x] = bin_count;
+    }
+}
+
+
+/**
+ * Spine scan kernel entry point (single-block).  Computes an exclusive prefix sum over the privatized digit histograms
+ */
+template <
+    typename    BlockRangeScanPolicy,       ///< Parameterizable tuning policy type for cub::BlockRangeScan abstraction
+    typename    Offset>                     ///< Signed integer type for global offsets
+__launch_bounds__ (int(BlockRangeScanPolicy::BLOCK_THREADS), 1)
+__global__ void RadixSortScanKernel(
+    Offset      *d_spine,                   ///< [in,out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
+    int         num_counts)                 ///< [in] Total number of bin-counts
+{
+    // Parameterize the BlockRangeScan type for the current configuration
+    typedef BlockRangeScan<BlockRangeScanPolicy, Offset*, Offset*, cub::Sum, Offset, Offset> BlockRangeScanT;
+
+    // Shared memory storage
+    __shared__ typename BlockRangeScanT::TempStorage temp_storage;
+
+    if (blockIdx.x > 0) return;
+
+    // Block scan instance
+    BlockRangeScanT block_scan(temp_storage, d_spine, d_spine, cub::Sum(), Offset(0)) ;
+
+    // Process full input tiles
+    int block_offset = 0;
+    BlockScanRunningPrefixOp<Offset, Sum> prefix_op(0, Sum());
+    while (block_offset + BlockRangeScanT::TILE_ITEMS <= num_counts)
+    {
+        block_scan.ConsumeTile(block_offset, prefix_op);
+        block_offset += BlockRangeScanT::TILE_ITEMS;
+    }
+}
+
+
+/**
+ * Downsweep pass kernel entry point (multi-block).  Scatters keys (and values) into corresponding bins for the current digit place.
+ */
+template <
+    typename                BlockRangeRadixSortDownsweepPolicy, ///< Parameterizable tuning policy type for cub::BlockRangeRadixSortUpsweep abstraction
+    bool                    DESCENDING,                         ///< Whether or not the sorted-order is high-to-low
+    typename                Key,                                ///< Key type
+    typename                Value,                              ///< Value type
+    typename                Offset>                             ///< Signed integer type for global offsets
+__launch_bounds__ (int(BlockRangeRadixSortDownsweepPolicy::BLOCK_THREADS), 1)
+__global__ void RadixSortDownsweepKernel(
+    Key                     *d_keys_in,                         ///< [in] Input keys ping buffer
+    Key                     *d_keys_out,                        ///< [in] Output keys pong buffer
+    Value                   *d_values_in,                       ///< [in] Input values ping buffer
+    Value                   *d_values_out,                      ///< [in] Output values pong buffer
+    Offset                  *d_spine,                           ///< [in] Scan of privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
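+    // The spine consumed here is "striped": for RADIX_DIGITS = 4 and two upsweep
+    // blocks B0, B1 it is laid out as
+    //
+    //     [c0(B0), c0(B1), c1(B0), c1(B1), c2(B0), c2(B1), c3(B0), c3(B1)]
+    //
+    // where cd(Bi) counts keys with digit d seen by block i.  An exclusive prefix
+    // sum over this array therefore yields, in one pass, the global base offset at
+    // which each (digit, block) pair scatters its keys.  E.g. counts
+    // [2,1, 3,0, 1,4, 0,2] scan to offsets [0,2, 3,6, 6,7, 11,11]:
+    // block B1 writes its digit-2 keys starting at global offset 7.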
+ Offset num_items, ///< [in] Total number of input data items + int current_bit, ///< [in] Bit position of current radix digit + int num_bits, ///< [in] Number of bits of current radix digit + bool first_pass, ///< [in] Whether this is the first digit pass + bool last_pass, ///< [in] Whether this is the last digit pass + GridEvenShare even_share) ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block +{ + // Parameterize BlockRangeRadixSortDownsweep type for the current configuration + typedef BlockRangeRadixSortDownsweep BlockRangeRadixSortDownsweepT; + + // Shared memory storage + __shared__ typename BlockRangeRadixSortDownsweepT::TempStorage temp_storage; + + // Initialize even-share descriptor for this thread block + even_share.BlockInit(); + + // Process input tiles + BlockRangeRadixSortDownsweepT(temp_storage, num_items, d_spine, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit, num_bits).ProcessRegion( + even_share.block_offset, + even_share.block_end); +} + + + +/****************************************************************************** + * Dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for DeviceRadixSort + */ +template < + bool DESCENDING, ///< Whether or not the sorted-order is high-to-low + typename Key, ///< Key type + typename Value, ///< Value type + typename Offset> ///< Signed integer type for global offsets +struct DeviceRadixSortDispatch +{ + /****************************************************************************** + * Tuning policies + ******************************************************************************/ + + /// SM35 + struct Policy350 + { + enum { + KEYS_ONLY = (Equals::VALUE), + SCALE_FACTOR = (CUB_MAX(sizeof(Key), sizeof(Value)) + 3) / 4, + RADIX_BITS = 5, + }; + + // Primary UpsweepPolicy + typedef BlockRangeRadixSortUpsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), LOAD_LDG, RADIX_BITS> UpsweepPolicyKeys; + typedef BlockRangeRadixSortUpsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), LOAD_LDG, RADIX_BITS> UpsweepPolicyPairs; + typedef typename If::Type UpsweepPolicy; + + // Alternate UpsweepPolicy for (RADIX_BITS-1)-bit passes + typedef BlockRangeRadixSortUpsweepPolicy <64, CUB_MAX(1, 22 / SCALE_FACTOR), LOAD_LDG, RADIX_BITS - 1> AltUpsweepPolicyKeys; + typedef BlockRangeRadixSortUpsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), LOAD_LDG, RADIX_BITS - 1> AltUpsweepPolicyPairs; + typedef typename If::Type AltUpsweepPolicy; + + // ScanPolicy + typedef BlockRangeScanPolicy <1024, 4, BLOCK_LOAD_VECTORIZE, false, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, false, BLOCK_SCAN_WARP_SCANS> ScanPolicy; + + // Primary DownsweepPolicy + typedef BlockRangeRadixSortDownsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), BLOCK_LOAD_DIRECT, LOAD_LDG, false, true, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS> DownsweepPolicyKeys; + typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), BLOCK_LOAD_DIRECT, LOAD_LDG, false, true, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS> DownsweepPolicyPairs; + typedef typename If::Type DownsweepPolicy; + + // Alternate DownsweepPolicy for (RADIX_BITS-1)-bit passes + typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 11 / SCALE_FACTOR), BLOCK_LOAD_DIRECT, LOAD_LDG, false, true, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, 
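+        // SCALE_FACTOR above keeps the per-thread register footprint roughly
+        // constant by shrinking ITEMS_PER_THREAD as the key/value types grow.
+        // E.g. for 32-bit keys sorted alone, SCALE_FACTOR = (4 + 3) / 4 = 1 and
+        // the primary upsweep runs 18 items per thread; for 64-bit keys paired
+        // with 64-bit values, SCALE_FACTOR = (8 + 3) / 4 = 2 and the pair policy
+        // drops to CUB_MAX(1, 15 / 2) = 7 items per thread.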
cudaSharedMemBankSizeEightByte, RADIX_BITS - 1> AltDownsweepPolicyKeys; + typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), BLOCK_LOAD_DIRECT, LOAD_LDG, false, true, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS - 1> AltDownsweepPolicyPairs; + typedef typename If::Type AltDownsweepPolicy; + }; + + + /// SM30 + struct Policy300 + { + enum { + KEYS_ONLY = (Equals::VALUE), + SCALE_FACTOR = (CUB_MAX(sizeof(Key), sizeof(Value)) + 3) / 4, + RADIX_BITS = 5, + }; + + // UpsweepPolicy + typedef BlockRangeRadixSortUpsweepPolicy <256, CUB_MAX(1, 7 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS> UpsweepPolicyKeys; + typedef BlockRangeRadixSortUpsweepPolicy <256, CUB_MAX(1, 5 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS> UpsweepPolicyPairs; + typedef typename If::Type UpsweepPolicy; + + // Alternate UpsweepPolicy for (RADIX_BITS-1)-bit passes + typedef BlockRangeRadixSortUpsweepPolicy <256, CUB_MAX(1, 7 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS - 1> AltUpsweepPolicyKeys; + typedef BlockRangeRadixSortUpsweepPolicy <256, CUB_MAX(1, 5 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS - 1> AltUpsweepPolicyPairs; + typedef typename If::Type AltUpsweepPolicy; + + // ScanPolicy + typedef BlockRangeScanPolicy <1024, 4, BLOCK_LOAD_VECTORIZE, false, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, false, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; + + // DownsweepPolicy + typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 14 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS> DownsweepPolicyKeys; + typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 10 / SCALE_FACTOR), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS> DownsweepPolicyPairs; + typedef typename If::Type DownsweepPolicy; + + // Alternate DownsweepPolicy for (RADIX_BITS-1)-bit passes + typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 14 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS - 1> AltDownsweepPolicyKeys; + typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 10 / SCALE_FACTOR), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS - 1> AltDownsweepPolicyPairs; + typedef typename If::Type AltDownsweepPolicy; + }; + + + /// SM20 + struct Policy200 + { + enum { + KEYS_ONLY = (Equals::VALUE), + SCALE_FACTOR = (CUB_MAX(sizeof(Key), sizeof(Value)) + 3) / 4, + RADIX_BITS = 5, + }; + + // UpsweepPolicy + typedef BlockRangeRadixSortUpsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS> UpsweepPolicyKeys; + typedef BlockRangeRadixSortUpsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS> UpsweepPolicyPairs; + typedef typename If::Type UpsweepPolicy; + + // Alternate UpsweepPolicy for (RADIX_BITS-1)-bit passes + typedef BlockRangeRadixSortUpsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS - 1> AltUpsweepPolicyKeys; + typedef BlockRangeRadixSortUpsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS - 1> AltUpsweepPolicyPairs; + typedef typename If::Type AltUpsweepPolicy; + + // ScanPolicy + typedef BlockRangeScanPolicy <512, 4, BLOCK_LOAD_VECTORIZE, false, LOAD_DEFAULT, 
BLOCK_STORE_VECTORIZE, false, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; + + // DownsweepPolicy + typedef BlockRangeRadixSortDownsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS> DownsweepPolicyKeys; + typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS> DownsweepPolicyPairs; + typedef typename If::Type DownsweepPolicy; + + // Alternate DownsweepPolicy for (RADIX_BITS-1)-bit passes + typedef BlockRangeRadixSortDownsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS - 1> AltDownsweepPolicyKeys; + typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS - 1> AltDownsweepPolicyPairs; + typedef typename If::Type AltDownsweepPolicy; + }; + + + /// SM13 + struct Policy130 + { + enum { + KEYS_ONLY = (Equals::VALUE), + SCALE_FACTOR = (CUB_MAX(sizeof(Key), sizeof(Value)) + 3) / 4, + RADIX_BITS = 5, + }; + + // UpsweepPolicy + typedef BlockRangeRadixSortUpsweepPolicy <128, CUB_MAX(1, 19 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS> UpsweepPolicyKeys; + typedef BlockRangeRadixSortUpsweepPolicy <128, CUB_MAX(1, 19 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS> UpsweepPolicyPairs; + typedef typename If::Type UpsweepPolicy; + + // Alternate UpsweepPolicy for (RADIX_BITS-1)-bit passes + typedef BlockRangeRadixSortUpsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS - 1> AltUpsweepPolicyKeys; + typedef BlockRangeRadixSortUpsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS - 1> AltUpsweepPolicyPairs; + typedef typename If::Type AltUpsweepPolicy; + + // ScanPolicy + typedef BlockRangeScanPolicy <256, 4, BLOCK_LOAD_VECTORIZE, false, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, false, BLOCK_SCAN_WARP_SCANS> ScanPolicy; + + // DownsweepPolicy + typedef BlockRangeRadixSortDownsweepPolicy <64, CUB_MAX(1, 19 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS> DownsweepPolicyKeys; + typedef BlockRangeRadixSortDownsweepPolicy <64, CUB_MAX(1, 19 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS> DownsweepPolicyPairs; + typedef typename If::Type DownsweepPolicy; + + // Alternate DownsweepPolicy for (RADIX_BITS-1)-bit passes + typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS - 1> AltDownsweepPolicyKeys; + typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS - 1> AltDownsweepPolicyPairs; + typedef typename If::Type AltDownsweepPolicy; + }; + + + /// SM10 + struct Policy100 + { + 
enum { + RADIX_BITS = 4, + }; + + // UpsweepPolicy + typedef BlockRangeRadixSortUpsweepPolicy <64, 9, LOAD_DEFAULT, RADIX_BITS> UpsweepPolicy; + + // Alternate UpsweepPolicy for (RADIX_BITS-1)-bit passes + typedef BlockRangeRadixSortUpsweepPolicy <64, 9, LOAD_DEFAULT, RADIX_BITS - 1> AltUpsweepPolicy; + + // ScanPolicy + typedef BlockRangeScanPolicy <256, 4, BLOCK_LOAD_VECTORIZE, false, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, false, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; + + // DownsweepPolicy + typedef BlockRangeRadixSortDownsweepPolicy <64, 9, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS> DownsweepPolicy; + + // Alternate DownsweepPolicy for (RADIX_BITS-1)-bit passes + typedef BlockRangeRadixSortDownsweepPolicy <64, 9, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS - 1> AltDownsweepPolicy; + }; + + + /****************************************************************************** + * Tuning policies of current PTX compiler pass + ******************************************************************************/ + +#if (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 130) + typedef Policy130 PtxPolicy; + +#else + typedef Policy100 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxUpsweepPolicy : PtxPolicy::UpsweepPolicy {}; + struct PtxAltUpsweepPolicy : PtxPolicy::AltUpsweepPolicy {}; + struct PtxScanPolicy : PtxPolicy::ScanPolicy {}; + struct PtxDownsweepPolicy : PtxPolicy::DownsweepPolicy {}; + struct PtxAltDownsweepPolicy : PtxPolicy::AltDownsweepPolicy {}; + + + /****************************************************************************** + * Utilities + ******************************************************************************/ + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template < + typename Policy, + typename KernelConfig, + typename UpsweepKernelPtr, ///< Function type of cub::RadixSortUpsweepKernel + typename ScanKernelPtr, ///< Function type of cub::SpineScanKernel + typename DownsweepKernelPtr> ///< Function type of cub::RadixSortUpsweepKernel + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t InitConfigs( + int sm_version, + int sm_count, + KernelConfig &upsweep_config, + KernelConfig &alt_upsweep_config, + KernelConfig &scan_config, + KernelConfig &downsweep_config, + KernelConfig &alt_downsweep_config, + UpsweepKernelPtr upsweep_kernel, + UpsweepKernelPtr alt_upsweep_kernel, + ScanKernelPtr scan_kernel, + DownsweepKernelPtr downsweep_kernel, + DownsweepKernelPtr alt_downsweep_kernel) + { + cudaError_t error; + do { + if (CubDebug(error = upsweep_config.template InitUpsweepPolicy( sm_version, sm_count, upsweep_kernel))) break; + if (CubDebug(error = alt_upsweep_config.template InitUpsweepPolicy( sm_version, sm_count, alt_upsweep_kernel))) break; + if (CubDebug(error = scan_config.template InitScanPolicy( sm_version, sm_count, scan_kernel))) break; + if (CubDebug(error = downsweep_config.template InitDownsweepPolicy( sm_version, sm_count, downsweep_kernel))) break; + if (CubDebug(error = alt_downsweep_config.template InitDownsweepPolicy( sm_version, sm_count, 
alt_downsweep_kernel))) break; + + } while (0); + + return error; + } + + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template < + typename KernelConfig, + typename UpsweepKernelPtr, ///< Function type of cub::RadixSortUpsweepKernel + typename ScanKernelPtr, ///< Function type of cub::SpineScanKernel + typename DownsweepKernelPtr> ///< Function type of cub::RadixSortUpsweepKernel + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t InitConfigs( + int ptx_version, + int sm_version, + int sm_count, + KernelConfig &upsweep_config, + KernelConfig &alt_upsweep_config, + KernelConfig &scan_config, + KernelConfig &downsweep_config, + KernelConfig &alt_downsweep_config, + UpsweepKernelPtr upsweep_kernel, + UpsweepKernelPtr alt_upsweep_kernel, + ScanKernelPtr scan_kernel, + DownsweepKernelPtr downsweep_kernel, + DownsweepKernelPtr alt_downsweep_kernel) + { + #if (CUB_PTX_ARCH > 0) + + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + cudaError_t error; + do { + + if (CubDebug(error = upsweep_config.template InitUpsweepPolicy( sm_version, sm_count, upsweep_kernel))) break; + if (CubDebug(error = alt_upsweep_config.template InitUpsweepPolicy( sm_version, sm_count, alt_upsweep_kernel))) break; + if (CubDebug(error = scan_config.template InitScanPolicy( sm_version, sm_count, scan_kernel))) break; + if (CubDebug(error = downsweep_config.template InitDownsweepPolicy( sm_version, sm_count, downsweep_kernel))) break; + if (CubDebug(error = alt_downsweep_config.template InitDownsweepPolicy( sm_version, sm_count, alt_downsweep_kernel))) break; + + } while (0); + + return error; + + #else + + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + cudaError_t error; + if (ptx_version >= 350) + { + error = InitConfigs(sm_version, sm_count, upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config, upsweep_kernel, alt_upsweep_kernel, scan_kernel, downsweep_kernel, alt_downsweep_kernel); + } + else if (ptx_version >= 300) + { + error = InitConfigs(sm_version, sm_count, upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config, upsweep_kernel, alt_upsweep_kernel, scan_kernel, downsweep_kernel, alt_downsweep_kernel); + } + else if (ptx_version >= 200) + { + error = InitConfigs(sm_version, sm_count, upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config, upsweep_kernel, alt_upsweep_kernel, scan_kernel, downsweep_kernel, alt_downsweep_kernel); + } + else if (ptx_version >= 130) + { + error = InitConfigs(sm_version, sm_count, upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config, upsweep_kernel, alt_upsweep_kernel, scan_kernel, downsweep_kernel, alt_downsweep_kernel); + } + else + { + error = InitConfigs(sm_version, sm_count, upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config, upsweep_kernel, alt_upsweep_kernel, scan_kernel, downsweep_kernel, alt_downsweep_kernel); + } + + return error; + + #endif + } + + + + /** + * Kernel kernel dispatch configurations + */ + struct KernelConfig + { + int block_threads; + int items_per_thread; + int tile_size; + cudaSharedMemConfig smem_config; + int radix_bits; + int sm_occupancy; // Amount of CTAs to oversubscribe the device beyond actively-resident (heuristic) + int max_grid_size; + int 
subscription_factor; + + template + CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t InitUpsweepPolicy( + int sm_version, int sm_count, UpsweepKernelPtr upsweep_kernel) + { + block_threads = UpsweepPolicy::BLOCK_THREADS; + items_per_thread = UpsweepPolicy::ITEMS_PER_THREAD; + radix_bits = UpsweepPolicy::RADIX_BITS; + smem_config = cudaSharedMemBankSizeFourByte; + tile_size = block_threads * items_per_thread; + cudaError_t retval = MaxSmOccupancy(sm_occupancy, sm_version, upsweep_kernel, block_threads); + subscription_factor = CUB_SUBSCRIPTION_FACTOR(sm_version); + max_grid_size = (sm_occupancy * sm_count) * subscription_factor; + + return retval; + } + + template + CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t InitScanPolicy( + int sm_version, int sm_count, ScanKernelPtr scan_kernel) + { + block_threads = ScanPolicy::BLOCK_THREADS; + items_per_thread = ScanPolicy::ITEMS_PER_THREAD; + radix_bits = 0; + smem_config = cudaSharedMemBankSizeFourByte; + tile_size = block_threads * items_per_thread; + sm_occupancy = 1; + subscription_factor = 1; + max_grid_size = 1; + + return cudaSuccess; + } + + template + CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t InitDownsweepPolicy( + int sm_version, int sm_count, DownsweepKernelPtr downsweep_kernel) + { + block_threads = DownsweepPolicy::BLOCK_THREADS; + items_per_thread = DownsweepPolicy::ITEMS_PER_THREAD; + radix_bits = DownsweepPolicy::RADIX_BITS; + smem_config = DownsweepPolicy::SMEM_CONFIG; + tile_size = block_threads * items_per_thread; + cudaError_t retval = MaxSmOccupancy(sm_occupancy, sm_version, downsweep_kernel, block_threads); + subscription_factor = CUB_SUBSCRIPTION_FACTOR(sm_version); + max_grid_size = (sm_occupancy * sm_count) * subscription_factor; + + return retval; + } + }; + + + /****************************************************************************** + * Allocation of device temporaries + ******************************************************************************/ + + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t AllocateTemporaries( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
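+        // Spine sizing sketch (illustrative numbers): with RADIX_BITS = 5 the
+        // downsweep has 1 << 5 = 32 digit bins; if max_grid_size = 1920 blocks
+        // and the scan tile is 1024 * 4 = 4096 counters, the spine below holds
+        // 1920 * 32 + 4096 = 65536 Offset counters (~256 KB for 32-bit offsets).
+        // The scan-tile padding lets RadixSortScanKernel consume the spine in
+        // whole tiles without a partial-tile epilogue.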
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + Offset* &d_spine, ///< [out] Digit count histograms per thread block + KernelConfig &scan_config, ///< [in] Dispatch parameters that match the policy that \p scan_kernel was compiled for + KernelConfig &downsweep_config) ///< [in] Dispatch parameters that match the policy that \p downsweep_kernel was compiled for + { + cudaError error = cudaSuccess; + do + { + // Get spine size (conservative) + int spine_size = (downsweep_config.max_grid_size * (1 << downsweep_config.radix_bits)) + scan_config.tile_size; + + // Temporary storage allocation requirements + void* allocations[1]; + size_t allocation_sizes[1] = + { + spine_size * sizeof(Offset), // bytes needed for privatized block digit histograms + }; + + // Alias the temporary allocations from the single storage blob (or set the necessary size of the blob) + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + + // Return if the caller is simply requesting the size of the storage allocation + if (d_temp_storage == NULL) + return cudaSuccess; + + // Alias the allocation for the privatized per-block digit histograms + d_spine = (Offset*) allocations[0]; + + } while(0); + + return error; + } + + + /****************************************************************************** + * Dispatch entrypoints + ******************************************************************************/ + + /** + * Internal dispatch routine for computing a device-wide radix sort using the + * specified kernel functions. + */ + template < + typename UpsweepKernelPtr, ///< Function type of cub::RadixSortUpsweepKernel + typename ScanKernelPtr, ///< Function type of cub::SpineScanKernel + typename DownsweepKernelPtr> ///< Function type of cub::RadixSortUpsweepKernel + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + DoubleBuffer &d_keys, ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer &d_values, ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + Offset *d_spine, ///< [in] Digit count histograms per thread block + int spine_size, ///< [in] Number of histogram counters + Offset num_items, ///< [in] Number of items to reduce + int begin_bit, ///< [in] The beginning (least-significant) bit index needed for key comparison + int end_bit, ///< [in] The past-the-end (most-significant) bit index needed for key comparison + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ KernelConfig &upsweep_config, ///< [in] Dispatch parameters that match the policy that \p upsweep_kernel was compiled for + KernelConfig &scan_config, ///< [in] Dispatch parameters that match the policy that \p scan_kernel was compiled for + KernelConfig &downsweep_config, ///< [in] Dispatch parameters that match the policy that \p downsweep_kernel was compiled for + UpsweepKernelPtr upsweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::RadixSortUpsweepKernel + ScanKernelPtr scan_kernel, ///< [in] Kernel function pointer to parameterization of cub::SpineScanKernel + DownsweepKernelPtr downsweep_kernel) ///< [in] Kernel function pointer to parameterization of cub::RadixSortUpsweepKernel + { +#ifndef CUB_RUNTIME_ENABLED + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported ); + +#else + + cudaError error = cudaSuccess; + do + { + // Get even-share work distribution descriptor + GridEvenShare even_share(num_items, downsweep_config.max_grid_size, CUB_MAX(downsweep_config.tile_size, upsweep_config.tile_size)); + +#if (CUB_PTX_ARCH == 0) + // Get current smem bank configuration + cudaSharedMemConfig original_smem_config; + if (CubDebug(error = cudaDeviceGetSharedMemConfig(&original_smem_config))) break; + cudaSharedMemConfig current_smem_config = original_smem_config; +#endif + // Iterate over digit places + int current_bit = begin_bit; + while (current_bit < end_bit) + { + int num_bits = CUB_MIN(end_bit - current_bit, downsweep_config.radix_bits); + +#if (CUB_PTX_ARCH == 0) + // Update smem config if necessary + if (current_smem_config != upsweep_config.smem_config) + { + if (CubDebug(error = cudaDeviceSetSharedMemConfig(upsweep_config.smem_config))) break; + current_smem_config = upsweep_config.smem_config; + } +#endif + + // Log upsweep_kernel configuration + if (debug_synchronous) + CubLog("Invoking upsweep_kernel<<<%d, %d, 0, %lld>>>(), %d smem config, %d items per thread, %d SM occupancy, selector %d, current bit %d, bit_grain %d\n", + even_share.grid_size, upsweep_config.block_threads, (long long) stream, upsweep_config.smem_config, upsweep_config.items_per_thread, upsweep_config.sm_occupancy, d_keys.selector, current_bit, downsweep_config.radix_bits); + + // Invoke upsweep_kernel with same grid size as downsweep_kernel + upsweep_kernel<<>>( + d_keys.d_buffers[d_keys.selector], + d_spine, + num_items, + current_bit, + num_bits, + (current_bit == begin_bit), + even_share); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Log scan_kernel configuration + if (debug_synchronous) CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n", + 1, scan_config.block_threads, (long long) stream, scan_config.items_per_thread); + + // Invoke scan_kernel + scan_kernel<<<1, scan_config.block_threads, 0, stream>>>( + d_spine, + spine_size); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + +#if (CUB_PTX_ARCH == 0) + // Update smem config if necessary + if (current_smem_config != downsweep_config.smem_config) + { + if (CubDebug(error = cudaDeviceSetSharedMemConfig(downsweep_config.smem_config))) break; + current_smem_config = downsweep_config.smem_config; + } +#endif + // Log 
downsweep_kernel configuration + if (debug_synchronous) CubLog("Invoking downsweep_kernel<<<%d, %d, 0, %lld>>>(), %d smem config, %d items per thread, %d SM occupancy\n", + even_share.grid_size, downsweep_config.block_threads, (long long) stream, downsweep_config.smem_config, downsweep_config.items_per_thread, downsweep_config.sm_occupancy); + + // Invoke downsweep_kernel + downsweep_kernel<<>>( + d_keys.d_buffers[d_keys.selector], + d_keys.d_buffers[d_keys.selector ^ 1], + d_values.d_buffers[d_values.selector], + d_values.d_buffers[d_values.selector ^ 1], + d_spine, + num_items, + current_bit, + num_bits, + (current_bit == begin_bit), + (current_bit + downsweep_config.radix_bits >= end_bit), + even_share); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Invert selectors + d_keys.selector ^= 1; + d_values.selector ^= 1; + + // Update current bit position + current_bit += downsweep_config.radix_bits; + } + +#if (CUB_PTX_ARCH == 0) + // Reset smem config if necessary + if (current_smem_config != original_smem_config) + { + if (CubDebug(error = cudaDeviceSetSharedMemConfig(original_smem_config))) break; + } +#endif + + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + /** + * Internal dispatch routine + */ + template < + typename UpsweepKernelPtr, ///< Function type of cub::RadixSortUpsweepKernel + typename ScanKernelPtr, ///< Function type of cub::SpineScanKernel + typename DownsweepKernelPtr> ///< Function type of cub::RadixSortUpsweepKernel + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer &d_values, ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + Offset num_items, ///< [in] Number of items to reduce + int begin_bit, ///< [in] The beginning (least-significant) bit index needed for key comparison + int end_bit, ///< [in] The past-the-end (most-significant) bit index needed for key comparison + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
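+        // Each upsweep/scan/downsweep triple below sorts one digit place from
+        // the double buffer's current array into its alternate; flipping the
+        // selector then makes that output the next pass's input.  Pass-loop
+        // skeleton (a sketch, with the kernel arguments abbreviated):
+        //
+        //     for (int bit = begin_bit; bit < end_bit; bit += radix_bits)
+        //     {
+        //         upsweep(keys.d_buffers[keys.selector], spine, bit);      // count digits
+        //         scan(spine);                                             // digit offsets
+        //         downsweep(keys.d_buffers[keys.selector],                 // scatter
+        //                   keys.d_buffers[keys.selector ^ 1], spine, bit);
+        //         keys.selector ^= 1;                                      // ping-pong
+        //     }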
+ UpsweepKernelPtr upsweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::RadixSortUpsweepKernel + UpsweepKernelPtr alt_upsweep_kernel, ///< [in] Alternate kernel function pointer to parameterization of cub::RadixSortUpsweepKernel + ScanKernelPtr scan_kernel, ///< [in] Kernel function pointer to parameterization of cub::SpineScanKernel + DownsweepKernelPtr downsweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::RadixSortUpsweepKernel + DownsweepKernelPtr alt_downsweep_kernel) ///< [in] Alternate kernel function pointer to parameterization of cub::RadixSortUpsweepKernel + { +#ifndef CUB_RUNTIME_ENABLED + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported ); + +#else + + cudaError error = cudaSuccess; + + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get device SM version + int sm_version; + if (CubDebug(error = SmVersion(sm_version, device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Get kernel kernel dispatch configurations + KernelConfig upsweep_config; + KernelConfig alt_upsweep_config; + KernelConfig scan_config; + KernelConfig downsweep_config; + KernelConfig alt_downsweep_config; + + if (CubDebug(error = InitConfigs(ptx_version, sm_version, sm_count, + upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config, + upsweep_kernel, alt_upsweep_kernel, scan_kernel, downsweep_kernel, alt_downsweep_kernel))) break; + + // Get spine sizes (conservative) + int spine_size = (downsweep_config.max_grid_size * (1 << downsweep_config.radix_bits)) + scan_config.tile_size; + int alt_spine_size = (alt_downsweep_config.max_grid_size * (1 << alt_downsweep_config.radix_bits)) + scan_config.tile_size; + + // Allocate temporaries + Offset *d_spine; + if (spine_size > alt_spine_size) + { + if (CubDebug(error = AllocateTemporaries(d_temp_storage, temp_storage_bytes, d_spine, scan_config, downsweep_config))) break; + } + else + { + if (CubDebug(error = AllocateTemporaries(d_temp_storage, temp_storage_bytes, d_spine, scan_config, alt_downsweep_config))) break; + } + + // Return if the caller is simply requesting the size of the storage allocation + if (d_temp_storage == NULL) + return cudaSuccess; + + // Run radix sorting passes + int num_bits = end_bit - begin_bit; + int remaining_bits = num_bits % downsweep_config.radix_bits; + + if (remaining_bits != 0) + { + // Run passes of alternate configuration + int max_alt_passes = downsweep_config.radix_bits - remaining_bits; + int alt_end_bit = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_downsweep_config.radix_bits)); + + if (CubDebug(error = Dispatch( + d_keys, + d_values, + d_spine, + alt_spine_size, + num_items, + begin_bit, + alt_end_bit, + stream, + debug_synchronous, + alt_upsweep_config, + scan_config, + alt_downsweep_config, + alt_upsweep_kernel, + scan_kernel, + alt_downsweep_kernel))) break; + + begin_bit = alt_end_bit; + } + + // Run passes of primary configuration + if (CubDebug(error = Dispatch( + d_keys, + d_values, + d_spine, + spine_size, + num_items, + begin_bit, + end_bit, + stream, + debug_synchronous, + upsweep_config, + scan_config, + downsweep_config, 
+ upsweep_kernel, + scan_kernel, + downsweep_kernel))) break; + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + /** + * Internal dispatch routine + */ + + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer &d_values, ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + Offset num_items, ///< [in] Number of items to reduce + int begin_bit, ///< [in] The beginning (least-significant) bit index needed for key comparison + int end_bit, ///< [in] The past-the-end (most-significant) bit index needed for key comparison + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + return Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + stream, + debug_synchronous, + RadixSortUpsweepKernel, + RadixSortUpsweepKernel, + RadixSortScanKernel, + RadixSortDownsweepKernel, + RadixSortDownsweepKernel); + } + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/SRC/cub/device/dispatch/device_reduce_by_key_dispatch.cuh b/SRC/cub/device/dispatch/device_reduce_by_key_dispatch.cuh new file mode 100644 index 00000000..81c028e1 --- /dev/null +++ b/SRC/cub/device/dispatch/device_reduce_by_key_dispatch.cuh @@ -0,0 +1,594 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceReduceByKey provides device-wide, parallel operations for reducing segments of values residing within global memory. + */ + +#pragma once + +#include +#include + +#include "device_scan_dispatch.cuh" +#include "../../block_range/block_range_reduce_by_key.cuh" +#include "../../thread/thread_operators.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../util_device.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +/** + * Reduce-by-key kernel entry point (multi-block) + */ +template < + typename BlockRangeReduceByKeyPolicy, ///< Parameterized BlockRangeReduceByKeyPolicy tuning policy type + typename KeyInputIterator, ///< Random-access input iterator type for keys + typename KeyOutputIterator, ///< Random-access output iterator type for keys + typename ValueInputIterator, ///< Random-access input iterator type for values + typename ValueOutputIterator, ///< Random-access output iterator type for values + typename NumSegmentsIterator, ///< Output iterator type for recording number of segments encountered + typename ScanTileState, ///< Tile status interface type + typename EqualityOp, ///< Key equality operator type + typename ReductionOp, ///< Value reduction operator type + typename Offset> ///< Signed integer type for global offsets +__launch_bounds__ (int(BlockRangeReduceByKeyPolicy::BLOCK_THREADS)) +__global__ void ReduceByKeyRegionKernel( + KeyInputIterator d_keys_in, ///< [in] Pointer to consecutive runs of input keys + KeyOutputIterator d_keys_out, ///< [in] Pointer to output keys (one key per run) + ValueInputIterator d_values_in, ///< [in] Pointer to consecutive runs of input values + ValueOutputIterator d_values_out, ///< [in] Pointer to output value aggregates (one aggregate per run) + NumSegmentsIterator d_num_segments, ///< [in] Pointer to total number of runs + ScanTileState tile_status, ///< [in] Tile status interface + EqualityOp equality_op, ///< [in] Key equality operator + ReductionOp reduction_op, ///< [in] Value reduction operator + Offset num_items, ///< [in] Total number of items to select from + int num_tiles, ///< [in] Total number of tiles for the entire problem + GridQueue queue) ///< [in] Drain queue descriptor for dynamically mapping tile data onto thread blocks +{ + // Thread block type for reducing tiles of value segments + typedef BlockRangeReduceByKey< + BlockRangeReduceByKeyPolicy, + KeyInputIterator, + KeyOutputIterator, + ValueInputIterator, + ValueOutputIterator, + EqualityOp, + ReductionOp, + Offset> BlockRangeReduceByKeyT; + + // Shared memory for BlockRangeReduceByKey + __shared__ typename BlockRangeReduceByKeyT::TempStorage temp_storage; + + // Process 
tiles + BlockRangeReduceByKeyT(temp_storage, d_keys_in, d_keys_out, d_values_in, d_values_out, equality_op, reduction_op, num_items).ConsumeRange( + num_tiles, + queue, + tile_status, + d_num_segments); +} + + + + +/****************************************************************************** + * Dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for DeviceReduceByKey + */ +template < + typename KeyInputIterator, ///< Random-access input iterator type for keys + typename KeyOutputIterator, ///< Random-access output iterator type for keys + typename ValueInputIterator, ///< Random-access input iterator type for values + typename ValueOutputIterator, ///< Random-access output iterator type for values + typename NumSegmentsIterator, ///< Output iterator type for recording number of segments encountered + typename EqualityOp, ///< Key equality operator type + typename ReductionOp, ///< Value reduction operator type + typename Offset> ///< Signed integer type for global offsets +struct DeviceReduceByKeyDispatch +{ + /****************************************************************************** + * Types and constants + ******************************************************************************/ + + // Data type of key input iterator + typedef typename std::iterator_traits::value_type Key; + + // Data type of value input iterator + typedef typename std::iterator_traits::value_type Value; + + enum + { + INIT_KERNEL_THREADS = 128, + MAX_INPUT_BYTES = CUB_MAX(sizeof(Key), sizeof(Value)), + COMBINED_INPUT_BYTES = sizeof(Key) + sizeof(Value), + }; + + // Value-offset tuple type for scanning (maps accumulated values to segment index) + typedef ItemOffsetPair ValueOffsetPair; + + // Tile status descriptor interface type + typedef ReduceByKeyScanTileState ScanTileState; + + + /****************************************************************************** + * Tuning policies + ******************************************************************************/ + + /// SM35 + struct Policy350 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 8, + ITEMS_PER_THREAD = (MAX_INPUT_BYTES <= 8) ? 
8 : CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)), + }; + + typedef BlockRangeReduceByKeyPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_DIRECT, + LOAD_LDG, + true, + BLOCK_SCAN_WARP_SCANS> + ReduceByKeyPolicy; + }; + + /// SM30 + struct Policy300 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 6, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)), + }; + + typedef BlockRangeReduceByKeyPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + true, + BLOCK_SCAN_WARP_SCANS> + ReduceByKeyPolicy; + }; + + /// SM20 + struct Policy200 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 13, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)), + }; + + typedef BlockRangeReduceByKeyPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + true, + BLOCK_SCAN_WARP_SCANS> + ReduceByKeyPolicy; + }; + + /// SM13 + struct Policy130 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 7, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)), + }; + + typedef BlockRangeReduceByKeyPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + true, + BLOCK_SCAN_WARP_SCANS> + ReduceByKeyPolicy; + }; + + /// SM10 + struct Policy100 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 5, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 8) / COMBINED_INPUT_BYTES)), + }; + + typedef BlockRangeReduceByKeyPolicy< + 64, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + true, + BLOCK_SCAN_RAKING> + ReduceByKeyPolicy; + }; + + + /****************************************************************************** + * Tuning policies of current PTX compiler pass + ******************************************************************************/ + +#if (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 130) + typedef Policy130 PtxPolicy; + +#else + typedef Policy100 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxReduceByKeyPolicy : PtxPolicy::ReduceByKeyPolicy {}; + + + /****************************************************************************** + * Utilities + ******************************************************************************/ + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ + static void InitConfigs( + int ptx_version, + KernelConfig &reduce_by_key_range_config) + { + #if (CUB_PTX_ARCH > 0) + + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + reduce_by_key_range_config.template Init(); + + #else + + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 350) + { + reduce_by_key_range_config.template Init(); + } + else if (ptx_version >= 300) + { + reduce_by_key_range_config.template Init(); + } + else if 
(ptx_version >= 200) + { + reduce_by_key_range_config.template Init(); + } + else if (ptx_version >= 130) + { + reduce_by_key_range_config.template Init(); + } + else + { + reduce_by_key_range_config.template Init(); + } + + #endif + } + + + /** + * Kernel kernel dispatch configuration. Mirrors the constants within BlockRangeReduceByKeyPolicy. + */ + struct KernelConfig + { + int block_threads; + int items_per_thread; + BlockLoadAlgorithm load_policy; + bool two_phase_scatter; + BlockScanAlgorithm scan_algorithm; + cudaSharedMemConfig smem_config; + + template + CUB_RUNTIME_FUNCTION __forceinline__ + void Init() + { + block_threads = BlockRangeReduceByKeyPolicy::BLOCK_THREADS; + items_per_thread = BlockRangeReduceByKeyPolicy::ITEMS_PER_THREAD; + load_policy = BlockRangeReduceByKeyPolicy::LOAD_ALGORITHM; + two_phase_scatter = BlockRangeReduceByKeyPolicy::TWO_PHASE_SCATTER; + scan_algorithm = BlockRangeReduceByKeyPolicy::SCAN_ALGORITHM; + smem_config = cudaSharedMemBankSizeEightByte; + } + + CUB_RUNTIME_FUNCTION __forceinline__ + void Print() + { + printf("%d, %d, %d, %d, %d", + block_threads, + items_per_thread, + load_policy, + two_phase_scatter, + scan_algorithm); + } + }; + + + /****************************************************************************** + * Dispatch entrypoints + ******************************************************************************/ + + /** + * Internal dispatch routine for computing a device-wide prefix scan using the + * specified kernel functions. + */ + template < + typename ScanInitKernelPtr, ///< Function type of cub::ScanInitKernel + typename ReduceByKeyRegionKernelPtr> ///< Function type of cub::ReduceByKeyRegionKernelPtr + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + KeyInputIterator d_keys_in, ///< [in] Pointer to consecutive runs of input keys + KeyOutputIterator d_keys_out, ///< [in] Pointer to output keys (one key per run) + ValueInputIterator d_values_in, ///< [in] Pointer to consecutive runs of input values + ValueOutputIterator d_values_out, ///< [in] Pointer to output value aggregates (one aggregate per run) + NumSegmentsIterator d_num_segments, ///< [in] Pointer to total number of runs + EqualityOp equality_op, ///< [in] Key equality operator + ReductionOp reduction_op, ///< [in] Value reduction operator + Offset num_items, ///< [in] Total number of items to select from + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
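The NOMINAL_4B_ITEMS_PER_THREAD constants in the tuning policies above are specified for 4-byte keys plus 4-byte values (8 combined bytes) and are scaled down as the combined key/value size grows, so that each thread's register footprint stays roughly constant. A small worked example of that scaling (illustrative only; the function and assertions below are ours, mirroring the CUB_MIN/CUB_MAX expression in the policies):

    #include <algorithm>

    // Mirrors: CUB_MIN(NOMINAL, CUB_MAX(1, ((NOMINAL * 8) + COMBINED - 1) / COMBINED))
    constexpr int scaled_items_per_thread(int nominal_4b_items, int key_bytes, int value_bytes)
    {
        return std::min(nominal_4b_items,
                        std::max(1, ((nominal_4b_items * 8) + (key_bytes + value_bytes) - 1)
                                        / (key_bytes + value_bytes)));
    }

    // SM30 policy (NOMINAL = 6): int keys + int values keep the full granularity...
    static_assert(scaled_items_per_thread(6, 4, 4) == 6, "8-byte pairs: unscaled");
    // ...while int keys + double values (12 combined bytes) drop to 4 items per thread:
    static_assert(scaled_items_per_thread(6, 4, 8) == 4, "12-byte pairs: scaled down");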
+ int ptx_version, ///< [in] PTX version of dispatch kernels + ScanInitKernelPtr init_kernel, ///< [in] Kernel function pointer to parameterization of cub::ScanInitKernel + ReduceByKeyRegionKernelPtr reduce_by_key_range_kernel, ///< [in] Kernel function pointer to parameterization of cub::ReduceByKeyRegionKernel + KernelConfig reduce_by_key_range_config) ///< [in] Dispatch parameters that match the policy that \p reduce_by_key_range_kernel was compiled for + { + +#ifndef CUB_RUNTIME_ENABLED + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported); + +#else + + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get device SM version + int sm_version; + if (CubDebug(error = SmVersion(sm_version, device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Number of input tiles + int tile_size = reduce_by_key_range_config.block_threads * reduce_by_key_range_config.items_per_thread; + int num_tiles = (num_items + tile_size - 1) / tile_size; + + // Specify temporary storage allocation requirements + size_t allocation_sizes[2]; + if (CubDebug(error = ScanTileState::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors + allocation_sizes[1] = GridQueue::AllocationSize(); // bytes needed for grid queue descriptor + + // Compute allocation pointers into the single storage blob (or set the necessary size of the blob) + void* allocations[2]; + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + return cudaSuccess; + } + + // Construct the tile status interface + ScanTileState tile_status; + if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; + + // Construct the grid queue descriptor + GridQueue queue(allocations[1]); + + // Log init_kernel configuration + int init_grid_size = (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS; + if (debug_synchronous) CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); + + // Invoke init_kernel to initialize tile descriptors and queue descriptors + init_kernel<<>>( + queue, + tile_status, + num_tiles); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Get SM occupancy for reduce_by_key_range_kernel + int reduce_by_key_range_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + reduce_by_key_range_sm_occupancy, // out + sm_version, + reduce_by_key_range_kernel, + reduce_by_key_range_config.block_threads))) break; + + // Get grid size for scanning tiles + dim3 reduce_by_key_grid_size; + if (ptx_version <= 130) + { + // Blocks are launched in order, so just assign one block per tile + int max_dim_x = 32 * 1024; + reduce_by_key_grid_size.z = 1; + reduce_by_key_grid_size.y = (num_tiles + max_dim_x - 1) / max_dim_x; + reduce_by_key_grid_size.x = CUB_MIN(num_tiles, max_dim_x); + } + else + { + // Blocks may not be launched in order, so use atomics + int reduce_by_key_range_occupancy = 
reduce_by_key_range_sm_occupancy * sm_count; // Whole-device occupancy for reduce_by_key_range_kernel + reduce_by_key_grid_size.z = 1; + reduce_by_key_grid_size.y = 1; + reduce_by_key_grid_size.x = (num_tiles < reduce_by_key_range_occupancy) ? + num_tiles : // Not enough to fill the device with threadblocks + reduce_by_key_range_occupancy; // Fill the device with threadblocks + } + +#if (CUB_PTX_ARCH == 0) + // Get current smem bank configuration + cudaSharedMemConfig original_smem_config; + if (CubDebug(error = cudaDeviceGetSharedMemConfig(&original_smem_config))) break; + cudaSharedMemConfig current_smem_config = original_smem_config; + + // Update smem config if necessary + if (current_smem_config != reduce_by_key_range_config.smem_config) + { + if (CubDebug(error = cudaDeviceSetSharedMemConfig(reduce_by_key_range_config.smem_config))) break; + current_smem_config = reduce_by_key_range_config.smem_config; + } +#endif + + // Log reduce_by_key_range_kernel configuration + if (debug_synchronous) CubLog("Invoking reduce_by_key_range_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + reduce_by_key_grid_size.x, reduce_by_key_grid_size.y, reduce_by_key_grid_size.z, reduce_by_key_range_config.block_threads, (long long) stream, reduce_by_key_range_config.items_per_thread, reduce_by_key_range_sm_occupancy); + + // Invoke reduce_by_key_range_kernel + reduce_by_key_range_kernel<<>>( + d_keys_in, + d_keys_out, + d_values_in, + d_values_out, + d_num_segments, + tile_status, + equality_op, + reduction_op, + num_items, + num_tiles, + queue); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + +#if (CUB_PTX_ARCH == 0) + // Reset smem config if necessary + if (current_smem_config != original_smem_config) + { + if (CubDebug(error = cudaDeviceSetSharedMemConfig(original_smem_config))) break; + } +#endif + + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + /** + * Internal dispatch routine + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + KeyInputIterator d_keys_in, ///< [in] Pointer to consecutive runs of input keys + KeyOutputIterator d_keys_out, ///< [in] Pointer to output keys (one key per run) + ValueInputIterator d_values_in, ///< [in] Pointer to consecutive runs of input values + ValueOutputIterator d_values_out, ///< [in] Pointer to output value aggregates (one aggregate per run) + NumSegmentsIterator d_num_segments, ///< [in] Pointer to total number of runs + EqualityOp equality_op, ///< [in] Key equality operator + ReductionOp reduction_op, ///< [in] Value reduction operator + Offset num_items, ///< [in] Total number of items to select from + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
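The temporary-storage handling above follows CUB's two-call idiom: the first call passes a NULL blob pointer so AliasTemporaries only reports the total size, and the second call carves the caller-allocated blob into the tile-status and grid-queue sub-allocations. A simplified stand-in for that carving logic (illustrative only; the 256-byte alignment is an assumption for the sketch, not a statement about CUB's internals):

    #include <cstddef>
    #include <cuda_runtime.h>

    inline cudaError_t alias_temporaries_sketch(
        void    *d_temp_storage,        // NULL means "size request only"
        size_t  &temp_storage_bytes,    // in/out: size of the single blob
        void    *(&allocations)[2],     // out: pointers carved from the blob
        size_t  (&allocation_sizes)[2]) // in: bytes needed per sub-allocation
    {
        const size_t ALIGN = 256;                       // assumed power-of-two alignment
        size_t offsets[2], total = 0;
        for (int i = 0; i < 2; ++i)
        {
            offsets[i] = total;
            total += (allocation_sizes[i] + ALIGN - 1) & ~(ALIGN - 1);
        }
        if (d_temp_storage == NULL)
        {
            temp_storage_bytes = total;                 // first call: report size, do no work
            return cudaSuccess;
        }
        if (temp_storage_bytes < total)
            return cudaErrorInvalidValue;               // caller's blob is too small
        for (int i = 0; i < 2; ++i)                     // second call: alias the pointers
            allocations[i] = static_cast<char*>(d_temp_storage) + offsets[i];
        return cudaSuccess;
    }

Packing every temporary into one blob keeps the public API down to a single device allocation, which the caller can reuse across invocations.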
+    {
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get PTX version
+            int ptx_version;
+    #if (CUB_PTX_ARCH == 0)
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+    #else
+            ptx_version = CUB_PTX_ARCH;
+    #endif
+
+            // Get kernel dispatch configurations
+            KernelConfig reduce_by_key_range_config;
+            InitConfigs(ptx_version, reduce_by_key_range_config);
+
+            // Dispatch
+            if (CubDebug(error = Dispatch(
+                d_temp_storage,
+                temp_storage_bytes,
+                d_keys_in,
+                d_keys_out,
+                d_values_in,
+                d_values_out,
+                d_num_segments,
+                equality_op,
+                reduction_op,
+                num_items,
+                stream,
+                debug_synchronous,
+                ptx_version,
+                ScanInitKernel<Offset, ScanTileState>,
+                ReduceByKeyRegionKernel<PtxReduceByKeyPolicy, KeyInputIterator, KeyOutputIterator, ValueInputIterator, ValueOutputIterator, NumSegmentsIterator, ScanTileState, EqualityOp, ReductionOp, Offset>,
+                reduce_by_key_range_config))) break;
+        }
+        while (0);
+
+        return error;
+    }
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/SRC/cub/device/dispatch/device_reduce_dispatch.cuh b/SRC/cub/device/dispatch/device_reduce_dispatch.cuh
new file mode 100644
index 00000000..3c0bce5b
--- /dev/null
+++ b/SRC/cub/device/dispatch/device_reduce_dispatch.cuh
@@ -0,0 +1,743 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within global memory.
+ */
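Before the implementation, a stripped-down sketch of the strategy this file realizes may help: a multi-block kernel writes one privatized partial result per thread block, and a single-block kernel then reduces those partials. The kernel names, the fixed 256-thread blocks, and the float/sum specialization below are ours for illustration; CUB's versions are policy-parameterized and assign work via GridEvenShare or a GridQueue rather than a grid-stride loop.

    #include <cuda_runtime.h>

    // Pass 1: each block reduces a strided range and writes one partial result.
    // Launch with 256 threads per block to match the shared-memory array.
    __global__ void block_partials(const float *d_in, float *d_partials, int n)
    {
        __shared__ float smem[256];
        float sum = 0.0f;
        for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x)
            sum += d_in[i];                          // grid-stride loop over the input
        smem[threadIdx.x] = sum;
        __syncthreads();
        for (int s = blockDim.x / 2; s > 0; s >>= 1) // shared-memory tree reduction
        {
            if (threadIdx.x < s) smem[threadIdx.x] += smem[threadIdx.x + s];
            __syncthreads();
        }
        if (threadIdx.x == 0) d_partials[blockIdx.x] = smem[0];  // one partial per block
    }

    // Pass 2: a single 256-thread block reduces the partials
    // (this mirrors the role SingleTileKernel plays below).
    __global__ void reduce_partials(const float *d_partials, float *d_out, int n)
    {
        __shared__ float smem[256];
        float sum = 0.0f;
        for (int i = threadIdx.x; i < n; i += blockDim.x)
            sum += d_partials[i];
        smem[threadIdx.x] = sum;
        __syncthreads();
        for (int s = blockDim.x / 2; s > 0; s >>= 1)
        {
            if (threadIdx.x < s) smem[threadIdx.x] += smem[threadIdx.x + s];
            __syncthreads();
        }
        if (threadIdx.x == 0) *d_out = smem[0];
    }

The dispatch logic below additionally short-circuits to a single-block kernel when the whole input fits in one tile, so small problems pay for only one launch.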
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "device_reduce_by_key_dispatch.cuh"
+#include "../../block_range/block_range_reduce.cuh"
+#include "../../iterator/constant_input_iterator.cuh"
+#include "../../thread/thread_operators.cuh"
+#include "../../grid/grid_even_share.cuh"
+#include "../../grid/grid_queue.cuh"
+#include "../../iterator/arg_index_input_iterator.cuh"
+#include "../../util_debug.cuh"
+#include "../../util_device.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/******************************************************************************
+ * Kernel entry points
+ *****************************************************************************/
+
+/**
+ * Reduce region kernel entry point (multi-block).  Computes privatized reductions, one per thread block.
+ */
+template <
+    typename                BlockRangeReducePolicy,     ///< Parameterized BlockRangeReducePolicy tuning policy type
+    typename                InputIterator,              ///< Random-access input iterator type for reading input items \iterator
+    typename                OutputIterator,             ///< Output iterator type for recording the reduced aggregate \iterator
+    typename                Offset,                     ///< Signed integer type for global offsets
+    typename                ReductionOp>                ///< Binary reduction functor type having member T operator()(const T &a, const T &b)
+__launch_bounds__ (int(BlockRangeReducePolicy::BLOCK_THREADS))
+__global__ void ReduceRegionKernel(
+    InputIterator           d_in,                       ///< [in] Pointer to the input sequence of data items
+    OutputIterator          d_out,                      ///< [out] Pointer to the output aggregate
+    Offset                  num_items,                  ///< [in] Total number of input data items
+    GridEvenShare<Offset>   even_share,                 ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block
+    GridQueue<Offset>       queue,                      ///< [in] Drain queue descriptor for dynamically mapping tile data onto thread blocks
+    ReductionOp             reduction_op)               ///< [in] Binary reduction functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
+{
+    // Data type
+    typedef typename std::iterator_traits<InputIterator>::value_type T;
+
+    // Thread block type for reducing input tiles
+    typedef BlockRangeReduce<BlockRangeReducePolicy, InputIterator, Offset, ReductionOp> BlockRangeReduceT;
+
+    // Block-wide aggregate
+    T block_aggregate;
+
+    // Shared memory storage
+    __shared__ typename BlockRangeReduceT::TempStorage temp_storage;
+
+    // Consume input tiles
+    BlockRangeReduceT(temp_storage, d_in, reduction_op).ConsumeRange(
+        num_items,
+        even_share,
+        queue,
+        block_aggregate,
+        Int2Type<BlockRangeReducePolicy::GRID_MAPPING>());
+
+    // Output result
+    if (threadIdx.x == 0)
+    {
+        d_out[blockIdx.x] = block_aggregate;
+    }
+}
+
+
+/**
+ * Reduce a single tile kernel entry point (single-block).  Can be used to aggregate privatized threadblock reductions from a previous multi-block reduction pass.
+ */ +template < + typename BlockRangeReducePolicy, ///< Parameterized BlockRangeReducePolicy tuning policy type + typename InputIterator, ///< Random-access input iterator type for reading input items \iterator + typename OutputIterator, ///< Output iterator type for recording the reduced aggregate \iterator + typename Offset, ///< Signed integer type for global offsets + typename ReductionOp> ///< Binary reduction functor type having member T operator()(const T &a, const T &b) +__launch_bounds__ (int(BlockRangeReducePolicy::BLOCK_THREADS), 1) +__global__ void SingleTileKernel( + InputIterator d_in, ///< [in] Pointer to the input sequence of data items + OutputIterator d_out, ///< [out] Pointer to the output aggregate + Offset num_items, ///< [in] Total number of input data items + ReductionOp reduction_op) ///< [in] Binary reduction functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) +{ + // Data type + typedef typename std::iterator_traits::value_type T; + + // Thread block type for reducing input tiles + typedef BlockRangeReduce BlockRangeReduceT; + + // Block-wide aggregate + T block_aggregate; + + // Shared memory storage + __shared__ typename BlockRangeReduceT::TempStorage temp_storage; + + // Consume input tiles + BlockRangeReduceT(temp_storage, d_in, reduction_op).ConsumeRange( + Offset(0), + Offset(num_items), + block_aggregate); + + // Output result + if (threadIdx.x == 0) + { + d_out[blockIdx.x] = block_aggregate; + } +} + + + + +/****************************************************************************** + * Dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for DeviceReduce + */ +template < + typename InputIterator, ///< Random-access input iterator type for reading input items \iterator + typename OutputIterator, ///< Output iterator type for recording the reduced aggregate \iterator + typename Offset, ///< Signed integer type for global offsets + typename ReductionOp> ///< Binary reduction functor type having member T operator()(const T &a, const T &b) +struct DeviceReduceDispatch +{ + // Data type of input iterator + typedef typename std::iterator_traits::value_type T; + + + /****************************************************************************** + * Tuning policies + ******************************************************************************/ + + /// SM35 + struct Policy350 + { + // ReduceRegionPolicy1B (GTX Titan: 228.7 GB/s @ 192M 1B items) + typedef BlockRangeReducePolicy< + 128, ///< Threads per thread block + 24, ///< Items per thread per tile of input + 4, ///< Number of items per vectorized load + BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use + LOAD_LDG, ///< Cache load modifier + GRID_MAPPING_DYNAMIC> ///< How to map tiles of input onto thread blocks + ReduceRegionPolicy1B; + + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 20, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + // ReduceRegionPolicy4B (GTX Titan: 255.1 GB/s @ 48M 4B items) + typedef BlockRangeReducePolicy< + 256, ///< Threads per thread block + ITEMS_PER_THREAD, ///< Items per thread per tile of input + 2, ///< Number of items per vectorized load + BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use + LOAD_LDG, ///< Cache load modifier + GRID_MAPPING_DYNAMIC> ///< How to map tiles of input onto thread blocks + ReduceRegionPolicy4B; + + // 
ReduceRegionPolicy + typedef typename If<(sizeof(T) >= 4), + ReduceRegionPolicy4B, + ReduceRegionPolicy1B>::Type ReduceRegionPolicy; + + // SingleTilePolicy + typedef BlockRangeReducePolicy< + 256, ///< Threads per thread block + 8, ///< Items per thread per tile of input + 1, ///< Number of items per vectorized load + BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use + LOAD_DEFAULT, ///< Cache load modifier + GRID_MAPPING_EVEN_SHARE> ///< How to map tiles of input onto thread blocks + SingleTilePolicy; + }; + + /// SM30 + struct Policy300 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 2, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + // ReduceRegionPolicy (GTX670: 154.0 @ 48M 4B items) + typedef BlockRangeReducePolicy< + 256, ///< Threads per thread block + ITEMS_PER_THREAD, ///< Items per thread per tile of input + 1, ///< Number of items per vectorized load + BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use + LOAD_DEFAULT, ///< Cache load modifier + GRID_MAPPING_EVEN_SHARE> ///< How to map tiles of input onto thread blocks + ReduceRegionPolicy; + + // SingleTilePolicy + typedef BlockRangeReducePolicy< + 256, ///< Threads per thread block + 24, ///< Items per thread per tile of input + 4, ///< Number of items per vectorized load + BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use + LOAD_DEFAULT, ///< Cache load modifier + GRID_MAPPING_EVEN_SHARE> ///< How to map tiles of input onto thread blocks + SingleTilePolicy; + }; + + /// SM20 + struct Policy200 + { + // ReduceRegionPolicy1B (GTX 580: 158.1 GB/s @ 192M 1B items) + typedef BlockRangeReducePolicy< + 192, ///< Threads per thread block + 24, ///< Items per thread per tile of input + 4, ///< Number of items per vectorized load + BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use + LOAD_DEFAULT, ///< Cache load modifier + (sizeof(T) == 1) ? 
///< How to map tiles of input onto thread blocks + GRID_MAPPING_EVEN_SHARE : + GRID_MAPPING_DYNAMIC> + ReduceRegionPolicy1B; + + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 8, + NOMINAL_4B_VEC_ITEMS = 4, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + VEC_ITEMS = CUB_MIN(NOMINAL_4B_VEC_ITEMS, CUB_MAX(1, (NOMINAL_4B_VEC_ITEMS * 4 / sizeof(T)))), + }; + + // ReduceRegionPolicy4B (GTX 580: 178.9 GB/s @ 48M 4B items) + typedef BlockRangeReducePolicy< + 128, ///< Threads per thread block + ITEMS_PER_THREAD, ///< Items per thread per tile of input + VEC_ITEMS, ///< Number of items per vectorized load + BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use + LOAD_DEFAULT, ///< Cache load modifier + GRID_MAPPING_DYNAMIC> ///< How to map tiles of input onto thread blocks + ReduceRegionPolicy4B; + + // ReduceRegionPolicy + typedef typename If<(sizeof(T) < 4), + ReduceRegionPolicy1B, + ReduceRegionPolicy4B>::Type ReduceRegionPolicy; + + // SingleTilePolicy + typedef BlockRangeReducePolicy< + 192, ///< Threads per thread block + 7, ///< Items per thread per tile of input + 1, ///< Number of items per vectorized load + BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use + LOAD_DEFAULT, ///< Cache load modifier + GRID_MAPPING_EVEN_SHARE> ///< How to map tiles of input onto thread blocks + SingleTilePolicy; + }; + + /// SM13 + struct Policy130 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 8, + NOMINAL_4B_VEC_ITEMS = 2, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + VEC_ITEMS = CUB_MIN(NOMINAL_4B_VEC_ITEMS, CUB_MAX(1, (NOMINAL_4B_VEC_ITEMS * 4 / sizeof(T)))), + }; + + // ReduceRegionPolicy + typedef BlockRangeReducePolicy< + 128, ///< Threads per thread block + ITEMS_PER_THREAD, ///< Items per thread per tile of input + VEC_ITEMS, ///< Number of items per vectorized load + BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use + LOAD_DEFAULT, ///< Cache load modifier + GRID_MAPPING_EVEN_SHARE> ///< How to map tiles of input onto thread blocks + ReduceRegionPolicy; + + // SingleTilePolicy + typedef BlockRangeReducePolicy< + 32, ///< Threads per thread block + 4, ///< Items per thread per tile of input + VEC_ITEMS, ///< Number of items per vectorized load + BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use + LOAD_DEFAULT, ///< Cache load modifier + GRID_MAPPING_EVEN_SHARE> ///< How to map tiles of input onto thread blocks + SingleTilePolicy; + }; + + /// SM10 + struct Policy100 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 8, + NOMINAL_4B_VEC_ITEMS = 2, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + VEC_ITEMS = CUB_MIN(NOMINAL_4B_VEC_ITEMS, CUB_MAX(1, (NOMINAL_4B_VEC_ITEMS * 4 / sizeof(T)))), + }; + + // ReduceRegionPolicy + typedef BlockRangeReducePolicy< + 128, ///< Threads per thread block + ITEMS_PER_THREAD, ///< Items per thread per tile of input + VEC_ITEMS, ///< Number of items per vectorized load + BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use + LOAD_DEFAULT, ///< Cache load modifier + GRID_MAPPING_EVEN_SHARE> ///< How to map tiles of input onto thread blocks + ReduceRegionPolicy; + + // SingleTilePolicy + typedef BlockRangeReducePolicy< + 32, ///< Threads per thread block + 4, ///< Items per thread per tile of input + 4, ///< Number of items per vectorized load + 
BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use + LOAD_DEFAULT, ///< Cache load modifier + GRID_MAPPING_EVEN_SHARE> ///< How to map tiles of input onto thread blocks + SingleTilePolicy; + }; + + + /****************************************************************************** + * Tuning policies of current PTX compiler pass + ******************************************************************************/ + +#if (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 130) + typedef Policy130 PtxPolicy; + +#else + typedef Policy100 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxReduceRegionPolicy : PtxPolicy::ReduceRegionPolicy {}; + struct PtxSingleTilePolicy : PtxPolicy::SingleTilePolicy {}; + + + /****************************************************************************** + * Utilities + ******************************************************************************/ + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ + static void InitConfigs( + int ptx_version, + KernelConfig &reduce_range_config, + KernelConfig &single_tile_config) + { + #if (CUB_PTX_ARCH > 0) + + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + reduce_range_config.template Init(); + single_tile_config.template Init(); + + #else + + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 350) + { + reduce_range_config.template Init(); + single_tile_config.template Init(); + } + else if (ptx_version >= 300) + { + reduce_range_config.template Init(); + single_tile_config.template Init(); + } + else if (ptx_version >= 200) + { + reduce_range_config.template Init(); + single_tile_config.template Init(); + } + else if (ptx_version >= 130) + { + reduce_range_config.template Init(); + single_tile_config.template Init(); + } + else + { + reduce_range_config.template Init(); + single_tile_config.template Init(); + } + + #endif + } + + + /** + * Kernel kernel dispatch configuration + */ + struct KernelConfig + { + int block_threads; + int items_per_thread; + int vector_load_length; + BlockReduceAlgorithm block_algorithm; + CacheLoadModifier load_modifier; + GridMappingStrategy grid_mapping; + + template + CUB_RUNTIME_FUNCTION __forceinline__ + void Init() + { + block_threads = BlockPolicy::BLOCK_THREADS; + items_per_thread = BlockPolicy::ITEMS_PER_THREAD; + vector_load_length = BlockPolicy::VECTOR_LOAD_LENGTH; + block_algorithm = BlockPolicy::BLOCK_ALGORITHM; + load_modifier = BlockPolicy::LOAD_MODIFIER; + grid_mapping = BlockPolicy::GRID_MAPPING; + } + + CUB_RUNTIME_FUNCTION __forceinline__ + void Print() + { + printf("%d threads, %d per thread, %d veclen, %d algo, %d loadmod, %d mapping", + block_threads, + items_per_thread, + vector_load_length, + block_algorithm, + load_modifier, + grid_mapping); + } + }; + + /****************************************************************************** + * Dispatch entrypoints + ******************************************************************************/ + + /** + * Internal dispatch routine for computing a device-wide reduction using the + * specified kernel 
functions. + * + * If the input is larger than a single tile, this method uses two-passes of + * kernel invocations. + */ + template < + typename ReduceRegionKernelPtr, ///< Function type of cub::ReduceRegionKernel + typename AggregateTileKernelPtr, ///< Function type of cub::SingleTileKernel for consuming partial reductions (T*) + typename SingleTileKernelPtr, ///< Function type of cub::SingleTileKernel for consuming input (InputIterator) + typename FillAndResetDrainKernelPtr> ///< Function type of cub::FillAndResetDrainKernel + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIterator d_in, ///< [in] Pointer to the input sequence of data items + OutputIterator d_out, ///< [out] Pointer to the output aggregate + Offset num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + ReductionOp reduction_op, ///< [in] Binary reduction functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + FillAndResetDrainKernelPtr prepare_drain_kernel, ///< [in] Kernel function pointer to parameterization of cub::FillAndResetDrainKernel + ReduceRegionKernelPtr reduce_range_kernel, ///< [in] Kernel function pointer to parameterization of cub::ReduceRegionKernel + AggregateTileKernelPtr aggregate_kernel, ///< [in] Kernel function pointer to parameterization of cub::SingleTileKernel for consuming partial reductions (T*) + SingleTileKernelPtr single_kernel, ///< [in] Kernel function pointer to parameterization of cub::SingleTileKernel for consuming input (InputIterator) + KernelConfig &reduce_range_config, ///< [in] Dispatch parameters that match the policy that \p reduce_range_kernel_ptr was compiled for + KernelConfig &single_tile_config) ///< [in] Dispatch parameters that match the policy that \p single_kernel was compiled for + { +#ifndef CUB_RUNTIME_ENABLED + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported ); + +#else + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get device SM version + int sm_version; + if (CubDebug(error = SmVersion(sm_version, device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Tile size of reduce_range_kernel + int tile_size = reduce_range_config.block_threads * reduce_range_config.items_per_thread; + + if ((reduce_range_kernel == NULL) || (num_items <= tile_size)) + { + // Dispatch a single-block reduction kernel + + // Return if the caller is simply requesting the size of the storage allocation + if (d_temp_storage == NULL) + { + temp_storage_bytes = 1; + return cudaSuccess; + } + + // Log single_kernel configuration + if (debug_synchronous) CubLog("Invoking ReduceSingle<<<1, %d, 0, %lld>>>(), %d items per thread\n", + single_tile_config.block_threads, (long long) stream, 
single_tile_config.items_per_thread); + + // Invoke single_kernel + single_kernel<<<1, single_tile_config.block_threads, 0, stream>>>( + d_in, + d_out, + num_items, + reduction_op); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + } + else + { + // Dispatch two kernels: (1) a multi-block kernel to compute + // privatized per-block reductions, and (2) a single-block + // to reduce those partial reductions + + // Get SM occupancy for reduce_range_kernel + int reduce_range_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + reduce_range_sm_occupancy, + sm_version, + reduce_range_kernel, + reduce_range_config.block_threads))) break; + + // Get device occupancy for reduce_range_kernel + int reduce_range_occupancy = reduce_range_sm_occupancy * sm_count; + + // Even-share work distribution + int subscription_factor = reduce_range_sm_occupancy; // Amount of CTAs to oversubscribe the device beyond actively-resident (heuristic) + GridEvenShare even_share( + num_items, + reduce_range_occupancy * subscription_factor, + tile_size); + + // Get grid size for reduce_range_kernel + int reduce_range_grid_size; + switch (reduce_range_config.grid_mapping) + { + case GRID_MAPPING_EVEN_SHARE: + + // Work is distributed evenly + reduce_range_grid_size = even_share.grid_size; + break; + + case GRID_MAPPING_DYNAMIC: + + // Work is distributed dynamically + int num_tiles = (num_items + tile_size - 1) / tile_size; + reduce_range_grid_size = (num_tiles < reduce_range_occupancy) ? + num_tiles : // Not enough to fill the device with threadblocks + reduce_range_occupancy; // Fill the device with threadblocks + break; + }; + + // Temporary storage allocation requirements + void* allocations[2]; + size_t allocation_sizes[2] = + { + reduce_range_grid_size * sizeof(T), // bytes needed for privatized block reductions + GridQueue::AllocationSize() // bytes needed for grid queue descriptor + }; + + // Alias the temporary allocations from the single storage blob (or set the necessary size of the blob) + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + return cudaSuccess; + } + + // Alias the allocation for the privatized per-block reductions + T *d_block_reductions = (T*) allocations[0]; + + // Alias the allocation for the grid queue descriptor + GridQueue queue(allocations[1]); + + // Prepare the dynamic queue descriptor if necessary + if (reduce_range_config.grid_mapping == GRID_MAPPING_DYNAMIC) + { + // Prepare queue using a kernel so we know it gets prepared once per operation + if (debug_synchronous) CubLog("Invoking prepare_drain_kernel<<<1, 1, 0, %lld>>>()\n", (long long) stream); + + // Invoke prepare_drain_kernel + prepare_drain_kernel<<<1, 1, 0, stream>>>(queue, num_items); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + + // Log reduce_range_kernel configuration + if (debug_synchronous) CubLog("Invoking reduce_range_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + reduce_range_grid_size, reduce_range_config.block_threads, (long long) stream, 
reduce_range_config.items_per_thread, reduce_range_sm_occupancy); + + // Invoke reduce_range_kernel + reduce_range_kernel<<>>( + d_in, + d_block_reductions, + num_items, + even_share, + queue, + reduction_op); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Log single_kernel configuration + if (debug_synchronous) CubLog("Invoking single_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n", + 1, single_tile_config.block_threads, (long long) stream, single_tile_config.items_per_thread); + + // Invoke single_kernel + aggregate_kernel<<<1, single_tile_config.block_threads, 0, stream>>>( + d_block_reductions, + d_out, + reduce_range_grid_size, + reduction_op); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + /** + * Internal dispatch routine for computing a device-wide reduction + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIterator d_in, ///< [in] Pointer to the input sequence of data items + OutputIterator d_out, ///< [out] Pointer to the output aggregate + Offset num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + ReductionOp reduction_op, ///< [in] Binary reduction functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) + cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel kernel dispatch configurations + KernelConfig reduce_range_config; + KernelConfig single_tile_config; + InitConfigs(ptx_version, reduce_range_config, single_tile_config); + + // Dispatch + if (CubDebug(error = Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_items, + reduction_op, + stream, + debug_synchronous, + FillAndResetDrainKernel, + ReduceRegionKernel, + SingleTileKernel, + SingleTileKernel, + reduce_range_config, + single_tile_config))) break; + } + while (0); + + return error; + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/SRC/cub/device/dispatch/device_scan_dispatch.cuh b/SRC/cub/device/dispatch/device_scan_dispatch.cuh new file mode 100644 index 00000000..6abeb29d --- /dev/null +++ b/SRC/cub/device/dispatch/device_scan_dispatch.cuh @@ -0,0 +1,565 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. 
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within global memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "../../block_range/block_range_scan.cuh"
+#include "../../thread/thread_operators.cuh"
+#include "../../grid/grid_queue.cuh"
+#include "../../util_debug.cuh"
+#include "../../util_device.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Kernel entry points
+ *****************************************************************************/
+
+/**
+ * Initialization kernel for tile status initialization (multi-block)
+ */
+template <
+    typename            Offset,             ///< Signed integer type for global offsets
+    typename            ScanTileState>      ///< Tile status interface type
+__global__ void ScanInitKernel(
+    GridQueue<int>      grid_queue,         ///< [in] Descriptor for performing dynamic mapping of input tiles to thread blocks
+    ScanTileState       tile_status,        ///< [in] Tile status interface
+    int                 num_tiles)          ///< [in] Number of tiles
+{
+    // Reset queue descriptor
+    if ((blockIdx.x == 0) && (threadIdx.x == 0))
+        grid_queue.FillAndResetDrain(num_tiles);
+
+    // Initialize tile status
+    tile_status.InitializeStatus(num_tiles);
+}
+
+
+/**
+ * Scan kernel entry point (multi-block)
+ */
+template <
+    typename            BlockRangeScanPolicy,       ///< Parameterized BlockRangeScanPolicy tuning policy type
+    typename            InputIterator,              ///< Random-access input iterator type for reading scan input data \iterator
+    typename            OutputIterator,             ///< Random-access output iterator type for writing scan output data \iterator
+    typename            ScanTileState,              ///< Tile status interface type
+    typename            ScanOp,                     ///< Binary scan functor type having member T operator()(const T &a, const T &b)
+    typename            Identity,                   ///< Identity value type 
(cub::NullType for inclusive scans) + typename Offset> ///< Signed integer type for global offsets +__launch_bounds__ (int(BlockRangeScanPolicy::BLOCK_THREADS)) +__global__ void ScanRegionKernel( + InputIterator d_in, ///< Input data + OutputIterator d_out, ///< Output data + ScanTileState tile_status, ///< [in] Tile status interface + ScanOp scan_op, ///< Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) + Identity identity, ///< Identity element + Offset num_items, ///< Total number of scan items for the entire problem + GridQueue queue) ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks +{ + // Thread block type for scanning input tiles + typedef BlockRangeScan< + BlockRangeScanPolicy, + InputIterator, + OutputIterator, + ScanOp, + Identity, + Offset> BlockRangeScanT; + + // Shared memory for BlockRangeScan + __shared__ typename BlockRangeScanT::TempStorage temp_storage; + + // Process tiles + BlockRangeScanT(temp_storage, d_in, d_out, scan_op, identity).ConsumeRange( + num_items, + queue, + tile_status); +} + + + + +/****************************************************************************** + * Dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for DeviceScan + */ +template < + typename InputIterator, ///< Random-access input iterator type for reading scan input data \iterator + typename OutputIterator, ///< Random-access output iterator type for writing scan output data \iterator + typename ScanOp, ///< Binary scan functor type having member T operator()(const T &a, const T &b) + typename Identity, ///< Identity value type (cub::NullType for inclusive scans) + typename Offset> ///< Signed integer type for global offsets +struct DeviceScanDispatch +{ + enum + { + INIT_KERNEL_THREADS = 128 + }; + + // Data type + typedef typename std::iterator_traits::value_type T; + + // Tile status descriptor interface type + typedef ScanTileState ScanTileState; + + + /****************************************************************************** + * Tuning policies + ******************************************************************************/ + + /// SM35 + struct Policy350 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 12, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + // GTX Titan: 29.5B items/s (232.4 GB/s) @ 48M 32-bit T + typedef BlockRangeScanPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_DIRECT, + false, + LOAD_LDG, + BLOCK_STORE_WARP_TRANSPOSE, + true, + BLOCK_SCAN_RAKING_MEMOIZE> + ScanRegionPolicy; + }; + + /// SM30 + struct Policy300 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 9, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef BlockRangeScanPolicy< + 256, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + false, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + false, + BLOCK_SCAN_RAKING_MEMOIZE> + ScanRegionPolicy; + }; + + /// SM20 + struct Policy200 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 15, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + // GTX 580: 20.3B items/s (162.3 GB/s) @ 48M 32-bit T + typedef BlockRangeScanPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + false, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + false, + 
BLOCK_SCAN_RAKING_MEMOIZE> + ScanRegionPolicy; + }; + + /// SM13 + struct Policy130 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 21, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef BlockRangeScanPolicy< + 96, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + false, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + false, + BLOCK_SCAN_RAKING_MEMOIZE> + ScanRegionPolicy; + }; + + /// SM10 + struct Policy100 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 9, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef BlockRangeScanPolicy< + 64, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + true, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + true, + BLOCK_SCAN_WARP_SCANS> + ScanRegionPolicy; + }; + + + /****************************************************************************** + * Tuning policies of current PTX compiler pass + ******************************************************************************/ + +#if (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 130) + typedef Policy130 PtxPolicy; + +#else + typedef Policy100 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxScanRegionPolicy : PtxPolicy::ScanRegionPolicy {}; + + + /****************************************************************************** + * Utilities + ******************************************************************************/ + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ + static void InitConfigs( + int ptx_version, + KernelConfig &scan_range_config) + { + #if (CUB_PTX_ARCH > 0) + + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + scan_range_config.template Init(); + + #else + + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 350) + { + scan_range_config.template Init(); + } + else if (ptx_version >= 300) + { + scan_range_config.template Init(); + } + else if (ptx_version >= 200) + { + scan_range_config.template Init(); + } + else if (ptx_version >= 130) + { + scan_range_config.template Init(); + } + else + { + scan_range_config.template Init(); + } + + #endif + } + + + /** + * Kernel kernel dispatch configuration. Mirrors the constants within BlockRangeScanPolicy. 
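 *
 * For reference, the InitConfigs selection logic above reduced to a
 * hypothetical two-policy sketch (constants taken from Policy350 and
 * Policy100 above; Sm35Policy, Sm10Policy, and init_configs are illustrative
 * names, not CUB types):
 * \code
 * struct Config { int block_threads; int items_per_thread; };
 *
 * struct Sm35Policy { enum { BLOCK_THREADS = 128, ITEMS_PER_THREAD = 12 }; };
 * struct Sm10Policy { enum { BLOCK_THREADS =  64, ITEMS_PER_THREAD =  9 }; };
 *
 * template <typename Policy>
 * void init_from_policy(Config &cfg)
 * {
 *     cfg.block_threads    = Policy::BLOCK_THREADS;
 *     cfg.items_per_thread = Policy::ITEMS_PER_THREAD;
 * }
 *
 * void init_configs(int ptx_version, Config &cfg)
 * {
 *     if (ptx_version >= 350) init_from_policy<Sm35Policy>(cfg);
 *     else                    init_from_policy<Sm10Policy>(cfg);
 * }
 * \endcode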
+ */ + struct KernelConfig + { + int block_threads; + int items_per_thread; + BlockLoadAlgorithm load_policy; + BlockStoreAlgorithm store_policy; + BlockScanAlgorithm scan_algorithm; + + template + CUB_RUNTIME_FUNCTION __forceinline__ + void Init() + { + block_threads = BlockRangeScanPolicy::BLOCK_THREADS; + items_per_thread = BlockRangeScanPolicy::ITEMS_PER_THREAD; + load_policy = BlockRangeScanPolicy::LOAD_ALGORITHM; + store_policy = BlockRangeScanPolicy::STORE_ALGORITHM; + scan_algorithm = BlockRangeScanPolicy::SCAN_ALGORITHM; + } + + CUB_RUNTIME_FUNCTION __forceinline__ + void Print() + { + printf("%d, %d, %d, %d, %d", + block_threads, + items_per_thread, + load_policy, + store_policy, + scan_algorithm); + } + }; + + + /****************************************************************************** + * Dispatch entrypoints + ******************************************************************************/ + + /** + * Internal dispatch routine for computing a device-wide prefix scan using the + * specified kernel functions. + */ + template < + typename ScanInitKernelPtr, ///< Function type of cub::ScanInitKernel + typename ScanRegionKernelPtr> ///< Function type of cub::ScanRegionKernelPtr + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIterator d_in, ///< [in] Pointer to the input sequence of data items + OutputIterator d_out, ///< [out] Pointer to the output sequence of data items + ScanOp scan_op, ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) + Identity identity, ///< [in] Identity element + Offset num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ int ptx_version, ///< [in] PTX version of dispatch kernels + ScanInitKernelPtr init_kernel, ///< [in] Kernel function pointer to parameterization of cub::ScanInitKernel + ScanRegionKernelPtr scan_range_kernel, ///< [in] Kernel function pointer to parameterization of cub::ScanRegionKernel + KernelConfig scan_range_config) ///< [in] Dispatch parameters that match the policy that \p scan_range_kernel was compiled for + { + +#ifndef CUB_RUNTIME_ENABLED + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported); + +#else + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get device SM version + int sm_version; + if (CubDebug(error = SmVersion(sm_version, device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Number of input tiles + int tile_size = scan_range_config.block_threads * scan_range_config.items_per_thread; + int num_tiles = (num_items + tile_size - 1) / tile_size; + + // Specify temporary storage allocation requirements + size_t allocation_sizes[2]; + if (CubDebug(error = ScanTileState::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors + allocation_sizes[1] = GridQueue::AllocationSize(); // bytes needed for grid queue descriptor + + // Compute allocation pointers into the single storage blob (or set the necessary size of the blob) + void* allocations[2]; + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + return cudaSuccess; + } + + // Construct the tile status interface + ScanTileState tile_status; + if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; + + // Construct the grid queue descriptor + GridQueue queue(allocations[1]); + + // Log init_kernel configuration + int init_grid_size = (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS; + if (debug_synchronous) CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); + + // Invoke init_kernel to initialize tile descriptors and queue descriptors + init_kernel<<>>( + queue, + tile_status, + num_tiles); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Get SM occupancy for scan_range_kernel + int scan_range_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + scan_range_sm_occupancy, // out + sm_version, + scan_range_kernel, + scan_range_config.block_threads))) break; + + // Get grid size for scanning tiles + dim3 scan_grid_size; + if (ptx_version <= 130) + { + // Blocks are launched in order, so just assign one block per tile + int max_dim_x = 32 * 1024; + scan_grid_size.z = 1; + scan_grid_size.y = (num_tiles + max_dim_x - 1) / max_dim_x; + scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x); + } + else + { + // Blocks may not be launched in order, so use atomics + int scan_range_occupancy = scan_range_sm_occupancy * sm_count; // Whole-device occupancy for scan_range_kernel + scan_grid_size.z = 1; + scan_grid_size.y = 1; + scan_grid_size.x = (num_tiles < 
scan_range_occupancy) ? + num_tiles : // Not enough to fill the device with threadblocks + scan_range_occupancy; // Fill the device with threadblocks + } + + // Log scan_range_kernel configuration + if (debug_synchronous) CubLog("Invoking scan_range_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, scan_range_config.block_threads, (long long) stream, scan_range_config.items_per_thread, scan_range_sm_occupancy); + + // Invoke scan_range_kernel + scan_range_kernel<<>>( + d_in, + d_out, + tile_status, + scan_op, + identity, + num_items, + queue); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + /** + * Internal dispatch routine + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIterator d_in, ///< [in] Pointer to the input sequence of data items + OutputIterator d_out, ///< [out] Pointer to the output sequence of data items + ScanOp scan_op, ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) + Identity identity, ///< [in] Identity element + Offset num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) + cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel kernel dispatch configurations + KernelConfig scan_range_config; + InitConfigs(ptx_version, scan_range_config); + + // Dispatch + if (CubDebug(error = Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + scan_op, + identity, + num_items, + stream, + debug_synchronous, + ptx_version, + ScanInitKernel, + ScanRegionKernel, + scan_range_config))) break; + } + while (0); + + return error; + } +}; + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/SRC/cub/device/dispatch/device_select_dispatch.cuh b/SRC/cub/device/dispatch/device_select_dispatch.cuh new file mode 100644 index 00000000..de6f38b5 --- /dev/null +++ b/SRC/cub/device/dispatch/device_select_dispatch.cuh @@ -0,0 +1,564 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
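
// The scan Dispatch above (and the select Dispatch in this file) implement
// CUB's two-call convention: a first call with d_temp_storage == NULL only
// reports the required size. A hedged caller-side sketch, using
// cub::DeviceScan::ExclusiveSum as a representative entry point (error
// checking elided):
void exclusive_sum_example(int *d_in, int *d_out, int num_items)
{
    void   *d_temp_storage     = NULL;
    size_t  temp_storage_bytes = 0;

    // Call 1: sizes the temporary allocation; no scan work is performed
    cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);

    // Call 2: runs the scan (tile status and grid queue live inside the blob)
    cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
    cudaFree(d_temp_storage);
}
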
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceSelect provides device-wide, parallel operations for selecting items from sequences of data items residing within global memory. + */ + +#pragma once + +#include +#include + +#include "device_scan_dispatch.cuh" +#include "../../block_range/block_range_select.cuh" +#include "../../thread/thread_operators.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../util_device.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +/** + * Select kernel entry point (multi-block) + * + * Performs functor-based selection if SelectOp functor type != NullType + * Otherwise performs flag-based selection if FlagIterator's value type != NullType + * Otherwise performs discontinuity selection (keep unique) + */ +template < + typename BlockRangeSelectPolicy, ///< Parameterized BlockRangeSelectPolicy tuning policy type + typename InputIterator, ///< Random-access input iterator type for reading input items + typename FlagIterator, ///< Random-access input iterator type for reading selection flags (NullType* if a selection functor or discontinuity flagging is to be used for selection) + typename OutputIterator, ///< Random-access output iterator type for writing selected items + typename NumSelectedIterator, ///< Output iterator type for recording the number of items selected + typename ScanTileState, ///< Tile status interface type + typename SelectOp, ///< Selection operator type (NullType if selection flags or discontinuity flagging is to be used for selection) + typename EqualityOp, ///< Equality operator type (NullType if selection functor or selection flags is to be used for selection) + typename Offset, ///< Signed integer type for global offsets + bool KEEP_REJECTS> ///< Whether or not we push rejected items to the back of the output +__launch_bounds__ (int(BlockRangeSelectPolicy::BLOCK_THREADS)) +__global__ void SelectRegionKernel( + InputIterator d_in, ///< [in] Pointer to input sequence of data items + FlagIterator d_flags, ///< [in] 
Pointer to the input sequence of selection flags + OutputIterator d_out, ///< [in] Pointer to output sequence of selected data items + NumSelectedIterator d_num_selected, ///< [in] Pointer to total number of items selected (i.e., length of \p d_out) + ScanTileState tile_status, ///< [in] Tile status interface + SelectOp select_op, ///< [in] Selection operator + EqualityOp equality_op, ///< [in] Equality operator + Offset num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + int num_tiles, ///< [in] Total number of tiles for the entire problem + GridQueue queue) ///< [in] Drain queue descriptor for dynamically mapping tile data onto thread blocks +{ + // Thread block type for selecting data from input tiles + typedef BlockRangeSelect< + BlockRangeSelectPolicy, + InputIterator, + FlagIterator, + OutputIterator, + SelectOp, + EqualityOp, + Offset, + KEEP_REJECTS> BlockRangeSelectT; + + // Shared memory for BlockRangeSelect + __shared__ typename BlockRangeSelectT::TempStorage temp_storage; + + // Process tiles + BlockRangeSelectT(temp_storage, d_in, d_flags, d_out, select_op, equality_op, num_items).ConsumeRange( + num_tiles, + queue, + tile_status, + d_num_selected); +} + + + + +/****************************************************************************** + * Dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for DeviceSelect + */ +template < + typename InputIterator, ///< Random-access input iterator type for reading input items + typename FlagIterator, ///< Random-access input iterator type for reading selection flags (NullType* if a selection functor or discontinuity flagging is to be used for selection) + typename OutputIterator, ///< Random-access output iterator type for writing selected items + typename NumSelectedIterator, ///< Output iterator type for recording the number of items selected + typename SelectOp, ///< Selection operator type (NullType if selection flags or discontinuity flagging is to be used for selection) + typename EqualityOp, ///< Equality operator type (NullType if selection functor or selection flags is to be used for selection) + typename Offset, ///< Signed integer type for global offsets + bool KEEP_REJECTS> ///< Whether or not we push rejected items to the back of the output +struct DeviceSelectDispatch +{ + /****************************************************************************** + * Types and constants + ******************************************************************************/ + + // Data type of input iterator + typedef typename std::iterator_traits::value_type T; + + // Data type of flag iterator + typedef typename std::iterator_traits::value_type Flag; + + enum + { + INIT_KERNEL_THREADS = 128, + }; + + // Tile status descriptor interface type + typedef ScanTileState ScanTileState; + + + /****************************************************************************** + * Tuning policies + ******************************************************************************/ + + /// SM35 + struct Policy350 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 11, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef BlockRangeSelectPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_DIRECT, + LOAD_LDG, + true, + BLOCK_SCAN_WARP_SCANS> + SelectRegionPolicy; + }; + + /// SM30 + struct Policy300 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 5, + 
ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef BlockRangeSelectPolicy< + 256, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + true, + BLOCK_SCAN_RAKING_MEMOIZE> + SelectRegionPolicy; + }; + + /// SM20 + struct Policy200 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = (KEEP_REJECTS) ? 7 : 17, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef BlockRangeSelectPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + true, + BLOCK_SCAN_WARP_SCANS> + SelectRegionPolicy; + }; + + /// SM13 + struct Policy130 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 9, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef BlockRangeSelectPolicy< + 64, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + true, + BLOCK_SCAN_RAKING_MEMOIZE> + SelectRegionPolicy; + }; + + /// SM10 + struct Policy100 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 9, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef BlockRangeSelectPolicy< + 256, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + true, + BLOCK_SCAN_RAKING_MEMOIZE> + SelectRegionPolicy; + }; + + + /****************************************************************************** + * Tuning policies of current PTX compiler pass + ******************************************************************************/ + +#if (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 130) + typedef Policy130 PtxPolicy; + +#else + typedef Policy100 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxSelectRegionPolicy : PtxPolicy::SelectRegionPolicy {}; + + + /****************************************************************************** + * Utilities + ******************************************************************************/ + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ + static void InitConfigs( + int ptx_version, + KernelConfig &select_range_config) + { + #if (CUB_PTX_ARCH > 0) + + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + select_range_config.template Init(); + + #else + + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 350) + { + select_range_config.template Init(); + } + else if (ptx_version >= 300) + { + select_range_config.template Init(); + } + else if (ptx_version >= 200) + { + select_range_config.template Init(); + } + else if (ptx_version >= 130) + { + select_range_config.template Init(); + } + else + { + select_range_config.template Init(); + } + + #endif + } + + + /** + * Kernel kernel dispatch configuration. Mirrors the constants within BlockRangeSelectPolicy. 
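 *
 * For reference, the three modes SelectRegionKernel above is instantiated
 * for, seen from the caller's side (LessThan is an illustrative functor, not
 * a CUB type):
 * \code
 * struct LessThan
 * {
 *     int compare;
 *     __host__ __device__ __forceinline__ LessThan(int compare) : compare(compare) {}
 *     __host__ __device__ __forceinline__ bool operator()(const int &a) const
 *     {
 *         return a < compare;
 *     }
 * };
 * // SelectOp = LessThan,  FlagIterator value type = NullType -> predicate select
 * // SelectOp = NullType,  FlagIterator over real flags       -> flag-based select
 * // SelectOp = NullType,  FlagIterator value type = NullType -> keep-unique
 * \endcode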
+ */ + struct KernelConfig + { + int block_threads; + int items_per_thread; + BlockLoadAlgorithm load_policy; + bool two_phase_scatter; + BlockScanAlgorithm scan_algorithm; + + template + CUB_RUNTIME_FUNCTION __forceinline__ + void Init() + { + block_threads = BlockRangeSelectPolicy::BLOCK_THREADS; + items_per_thread = BlockRangeSelectPolicy::ITEMS_PER_THREAD; + load_policy = BlockRangeSelectPolicy::LOAD_ALGORITHM; + two_phase_scatter = BlockRangeSelectPolicy::TWO_PHASE_SCATTER; + scan_algorithm = BlockRangeSelectPolicy::SCAN_ALGORITHM; + } + + CUB_RUNTIME_FUNCTION __forceinline__ + void Print() + { + printf("%d, %d, %d, %d, %d", + block_threads, + items_per_thread, + load_policy, + two_phase_scatter, + scan_algorithm); + } + }; + + + /****************************************************************************** + * Dispatch entrypoints + ******************************************************************************/ + + /** + * Internal dispatch routine for computing a device-wide prefix scan using the + * specified kernel functions. + */ + template < + typename ScanInitKernelPtr, ///< Function type of cub::ScanInitKernel + typename SelectRegionKernelPtr> ///< Function type of cub::SelectRegionKernelPtr + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIterator d_in, ///< [in] Pointer to input sequence of data items + FlagIterator d_flags, ///< [in] Pointer to the input sequence of selection flags + OutputIterator d_out, ///< [in] Pointer to output sequence of selected data items + NumSelectedIterator d_num_selected, ///< [in] Pointer to total number of items selected (i.e., length of \p d_out) + SelectOp select_op, ///< [in] Selection operator + EqualityOp equality_op, ///< [in] Equality operator + Offset num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ int ptx_version, ///< [in] PTX version of dispatch kernels + ScanInitKernelPtr init_kernel, ///< [in] Kernel function pointer to parameterization of cub::ScanInitKernel + SelectRegionKernelPtr select_range_kernel, ///< [in] Kernel function pointer to parameterization of cub::SelectRegionKernel + KernelConfig select_range_config) ///< [in] Dispatch parameters that match the policy that \p select_range_kernel was compiled for + { + +#ifndef CUB_RUNTIME_ENABLED + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported); + +#else + + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get device SM version + int sm_version; + if (CubDebug(error = SmVersion(sm_version, device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Number of input tiles + int tile_size = select_range_config.block_threads * select_range_config.items_per_thread; + int num_tiles = (num_items + tile_size - 1) / tile_size; + + // Specify temporary storage allocation requirements + size_t allocation_sizes[2]; + if (CubDebug(error = ScanTileState::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors + allocation_sizes[1] = GridQueue::AllocationSize(); // bytes needed for grid queue descriptor + + // Compute allocation pointers into the single storage blob (or set the necessary size of the blob) + void* allocations[2]; + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + return cudaSuccess; + } + + // Construct the tile status interface + ScanTileState tile_status; + if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; + + // Construct the grid queue descriptor + GridQueue queue(allocations[1]); + + // Log init_kernel configuration + int init_grid_size = (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS; + if (debug_synchronous) CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); + + // Invoke init_kernel to initialize tile descriptors and queue descriptors + init_kernel<<>>( + queue, + tile_status, + num_tiles); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Get SM occupancy for select_range_kernel + int select_range_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + select_range_sm_occupancy, // out + sm_version, + select_range_kernel, + select_range_config.block_threads))) break; + + // Get grid size for scanning tiles + dim3 select_grid_size; + if (ptx_version <= 130) + { + // Blocks are launched in order, so just assign one block per tile + int max_dim_x = 32 * 1024; + select_grid_size.z = 1; + select_grid_size.y = (num_tiles + max_dim_x - 1) / max_dim_x; + select_grid_size.x = CUB_MIN(num_tiles, max_dim_x); + } + else + { + // Blocks may not be launched in order, so use atomics + int select_range_occupancy = select_range_sm_occupancy * sm_count; // Whole-device occupancy for select_range_kernel + select_grid_size.z = 1; + select_grid_size.y 
= 1; + select_grid_size.x = (num_tiles < select_range_occupancy) ? + num_tiles : // Not enough to fill the device with threadblocks + select_range_occupancy; // Fill the device with threadblocks + } + + // Log select_range_kernel configuration + if (debug_synchronous) CubLog("Invoking select_range_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + select_grid_size.x, select_grid_size.y, select_grid_size.z, select_range_config.block_threads, (long long) stream, select_range_config.items_per_thread, select_range_sm_occupancy); + + // Invoke select_range_kernel + select_range_kernel<<>>( + d_in, + d_flags, + d_out, + d_num_selected, + tile_status, + select_op, + equality_op, + num_items, + num_tiles, + queue); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + /** + * Internal dispatch routine + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIterator d_in, ///< [in] Pointer to input sequence of data items + FlagIterator d_flags, ///< [in] Pointer to the input sequence of selection flags + OutputIterator d_out, ///< [in] Pointer to output sequence of selected data items + NumSelectedIterator d_num_selected, ///< [in] Pointer to total number of items selected (i.e., length of \p d_out) + SelectOp select_op, ///< [in] Selection operator + EqualityOp equality_op, ///< [in] Equality operator + Offset num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel kernel dispatch configurations + KernelConfig select_range_config; + InitConfigs(ptx_version, select_range_config); + + // Dispatch + if (CubDebug(error = Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_flags, + d_out, + d_num_selected, + select_op, + equality_op, + num_items, + stream, + debug_synchronous, + ptx_version, + ScanInitKernel, + SelectRegionKernel, + select_range_config))) break; + } + while (0); + + return error; + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/SRC/cub/grid/grid_barrier.cuh b/SRC/cub/grid/grid_barrier.cuh new file mode 100644 index 00000000..eab5b518 --- /dev/null +++ b/SRC/cub/grid/grid_barrier.cuh @@ -0,0 +1,211 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. 
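
// The grid-sizing rule shared by the scan and select dispatches above, in
// isolation: on newer targets the grid is capped at whole-device occupancy
// and blocks drain tiles through the queue; on sm_13 and older, one block is
// launched per tile. A minimal host-side sketch (illustrative helper name):
int drain_grid_size(int num_tiles, int sm_count, int sm_occupancy)
{
    int device_occupancy = sm_count * sm_occupancy;  // co-resident blocks
    return (num_tiles < device_occupancy)
        ? num_tiles          // too few tiles to fill the device
        : device_occupancy;  // fill the device; blocks loop via the queue
}
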
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::GridBarrier implements a software global barrier among thread blocks within a CUDA grid + */ + +#pragma once + +#include "../util_debug.cuh" +#include "../util_namespace.cuh" +#include "../thread/thread_load.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup GridModule + * @{ + */ + + +/** + * \brief GridBarrier implements a software global barrier among thread blocks within a CUDA grid + */ +class GridBarrier +{ +protected : + + typedef unsigned int SyncFlag; + + // Counters in global device memory + SyncFlag* d_sync; + +public: + + /** + * Constructor + */ + GridBarrier() : d_sync(NULL) {} + + + /** + * Synchronize + */ + __device__ __forceinline__ void Sync() const + { + volatile SyncFlag *d_vol_sync = d_sync; + + // Threadfence and syncthreads to make sure global writes are visible before + // thread-0 reports in with its sync counter + __threadfence(); + __syncthreads(); + + if (blockIdx.x == 0) + { + // Report in ourselves + if (threadIdx.x == 0) + { + d_vol_sync[blockIdx.x] = 1; + } + + __syncthreads(); + + // Wait for everyone else to report in + for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) + { + while (ThreadLoad(d_sync + peer_block) == 0) + { + __threadfence_block(); + } + } + + __syncthreads(); + + // Let everyone know it's safe to proceed + for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) + { + d_vol_sync[peer_block] = 0; + } + } + else + { + if (threadIdx.x == 0) + { + // Report in + d_vol_sync[blockIdx.x] = 1; + + // Wait for acknowledgment + while (ThreadLoad(d_sync + blockIdx.x) == 1) + { + __threadfence_block(); + } + } + + __syncthreads(); + } + } +}; + + +/** + * \brief GridBarrierLifetime extends GridBarrier to provide lifetime management of the temporary device storage needed for cooperation. 
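 *
 * A sketch of how a kernel cooperates through GridBarrier::Sync() above
 * (illustrative; a software barrier is only correct when every block of the
 * grid is co-resident on the device):
 * \code
 * __global__ void two_phase_kernel(cub::GridBarrier barrier, float *d_data)
 * {
 *     // Phase 1: each block writes its slice of d_data ...
 *     barrier.Sync();  // every block arrives before any block proceeds
 *     // Phase 2: any block may now read what the others wrote ...
 * }
 * \endcode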
+ * + * Uses RAII for lifetime, i.e., device resources are reclaimed when + * the destructor is called. + */ +class GridBarrierLifetime : public GridBarrier +{ +protected: + + // Number of bytes backed by d_sync + size_t sync_bytes; + +public: + + /** + * Constructor + */ + GridBarrierLifetime() : GridBarrier(), sync_bytes(0) {} + + + /** + * DeviceFrees and resets the progress counters + */ + cudaError_t HostReset() + { + cudaError_t retval = cudaSuccess; + if (d_sync) + { + CubDebug(retval = cudaFree(d_sync)); + d_sync = NULL; + } + sync_bytes = 0; + return retval; + } + + + /** + * Destructor + */ + virtual ~GridBarrierLifetime() + { + HostReset(); + } + + + /** + * Sets up the progress counters for the next kernel launch (lazily + * allocating and initializing them if necessary) + */ + cudaError_t Setup(int sweep_grid_size) + { + cudaError_t retval = cudaSuccess; + do { + size_t new_sync_bytes = sweep_grid_size * sizeof(SyncFlag); + if (new_sync_bytes > sync_bytes) + { + if (d_sync) + { + if (CubDebug(retval = cudaFree(d_sync))) break; + } + + sync_bytes = new_sync_bytes; + + // Allocate and initialize to zero + if (CubDebug(retval = cudaMalloc((void**) &d_sync, sync_bytes))) break; + if (CubDebug(retval = cudaMemset(d_sync, 0, new_sync_bytes))) break; + } + } while (0); + + return retval; + } +}; + + +/** @} */ // end group GridModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/SRC/cub/grid/grid_even_share.cuh b/SRC/cub/grid/grid_even_share.cuh new file mode 100644 index 00000000..a3556329 --- /dev/null +++ b/SRC/cub/grid/grid_even_share.cuh @@ -0,0 +1,185 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
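
// Host-side lifetime pattern for the barrier above, as a hedged sketch:
// Setup() lazily allocates and zeroes one flag per block, and the destructor
// frees them. two_phase_kernel is the hypothetical kernel sketched in the
// GridBarrier documentation; the grid must fit co-resident on the device.
void launch_two_phase(float *d_data)
{
    cub::GridBarrierLifetime barrier;
    int grid_size = 64;                   // assumed within occupancy limits
    barrier.Setup(grid_size);             // allocate + zero sync counters
    two_phase_kernel<<<grid_size, 128>>>(barrier, d_data);
    cudaDeviceSynchronize();              // storage freed when 'barrier' dies
}
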
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::GridEvenShare is a descriptor utility for distributing input among CUDA threadblocks in an "even-share" fashion. Each threadblock gets roughly the same number of fixed-size work units (grains).
+ */
+
+#pragma once
+
+#include "../util_namespace.cuh"
+#include "../util_macro.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup GridModule
+ * @{
+ */
+
+
+/**
+ * \brief GridEvenShare is a descriptor utility for distributing input among CUDA threadblocks in an "even-share" fashion. Each threadblock gets roughly the same number of fixed-size work units (grains).
+ *
+ * \par Overview
+ * GridEvenShare indicates which sections of input are to be mapped onto which threadblocks.
+ * Threadblocks may receive one of three different amounts of work: "big", "normal",
+ * and "last". The "big" workloads are one scheduling grain larger than "normal". The "last" work unit
+ * for the last threadblock may be partially-full if the input is not an even multiple of
+ * the scheduling grain size.
+ *
+ * \par
+ * Before invoking a child grid, a parent thread will typically construct an instance of
+ * GridEvenShare. The instance can be passed to child threadblocks which can
+ * initialize their per-threadblock offsets using \p BlockInit().
+ *
+ * \tparam Offset Signed integer type for global offsets
+ */
+template <typename Offset>
+struct GridEvenShare
+{
+    Offset total_grains;
+    int big_blocks;
+    Offset big_share;
+    Offset normal_share;
+    Offset normal_base_offset;
+
+    /// Total number of input items
+    Offset num_items;
+
+    /// Grid size in threadblocks
+    int grid_size;
+
+    /// Offset into input marking the beginning of the owning thread block's segment of input tiles
+    Offset block_offset;
+
+    /// Offset into input marking the end (one-past) of the owning thread block's segment of input tiles
+    Offset block_end;
+
+    /**
+     * \brief Default constructor. Zero-initializes block-specific fields.
+     */
+    __host__ __device__ __forceinline__ GridEvenShare() :
+        num_items(0),
+        grid_size(0),
+        block_offset(0),
+        block_end(0) {}
+
+    /**
+     * \brief Constructor. Initializes the grid-specific members \p num_items and \p grid_size. To be called prior to kernel launch.
+     */
+    __host__ __device__ __forceinline__ GridEvenShare(
+        Offset num_items,          ///< Total number of input items
+        int max_grid_size,         ///< Maximum grid size allowable (actual grid size may be less if not warranted by the number of input items)
+        int schedule_granularity)  ///< Granularity by which the input can be parcelled into and distributed among threadblocks. Usually the thread block's native tile size (or a multiple thereof).
+ { + this->num_items = num_items; + this->block_offset = num_items; + this->block_end = num_items; + this->total_grains = (num_items + schedule_granularity - 1) / schedule_granularity; + this->grid_size = CUB_MIN(total_grains, max_grid_size); + Offset grains_per_block = total_grains / grid_size; + this->big_blocks = total_grains - (grains_per_block * grid_size); // leftover grains go to big blocks + this->normal_share = grains_per_block * schedule_granularity; + this->normal_base_offset = big_blocks * schedule_granularity; + this->big_share = normal_share + schedule_granularity; + } + + + + /** + * \brief Initializes ranges for the specified partition index + */ + __device__ __forceinline__ void Init(int partition_id) + { + if (partition_id < big_blocks) + { + // This threadblock gets a big share of grains (grains_per_block + 1) + block_offset = (partition_id * big_share); + block_end = block_offset + big_share; + } + else if (partition_id < total_grains) + { + // This threadblock gets a normal share of grains (grains_per_block) + block_offset = normal_base_offset + (partition_id * normal_share); + block_end = CUB_MIN(num_items, block_offset + normal_share); + } + } + + + /** + * \brief Initializes ranges for the current thread block (e.g., to be called by each threadblock after startup) + */ + __device__ __forceinline__ void BlockInit() + { + Init(blockIdx.x); + } + + + /** + * Print to stdout + */ + __host__ __device__ __forceinline__ void Print() + { + printf( +#if (CUB_PTX_ARCH > 0) + "\tthreadblock(%d) " + "block_offset(%lu) " + "block_end(%lu) " +#endif + "num_items(%lu) " + "total_grains(%lu) " + "big_blocks(%lu) " + "big_share(%lu) " + "normal_share(%lu)\n", +#if (CUB_PTX_ARCH > 0) + blockIdx.x, + (unsigned long) block_offset, + (unsigned long) block_end, +#endif + (unsigned long) num_items, + (unsigned long) total_grains, + (unsigned long) big_blocks, + (unsigned long) big_share, + (unsigned long) normal_share); + } +}; + + + +/** @} */ // end group GridModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/grid/grid_mapping.cuh b/SRC/cub/grid/grid_mapping.cuh new file mode 100644 index 00000000..ff6679b9 --- /dev/null +++ b/SRC/cub/grid/grid_mapping.cuh @@ -0,0 +1,95 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
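
// Worked example of the even-share arithmetic above, with num_items = 1000,
// schedule_granularity = 128, max_grid_size = 3 (illustrative values):
//   total_grains       = ceil(1000 / 128) = 8
//   grid_size          = min(8, 3)        = 3
//   grains_per_block   = 8 / 3            = 2
//   big_blocks         = 8 - 2 * 3        = 2   (blocks 0 and 1 get 3 grains)
//   normal_share       = 2 * 128          = 256
//   normal_base_offset = 2 * 128          = 256
//   big_share          = 256 + 128        = 384
// After BlockInit(): block 0 -> [0, 384), block 1 -> [384, 768),
// block 2 -> [768, 1000) (the tail is clamped to num_items).
cub::GridEvenShare<int> even_share(1000, 3, 128);
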
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks. + */ + +#pragma once + +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup GridModule + * @{ + */ + + +/****************************************************************************** + * Mapping policies + *****************************************************************************/ + + +/** + * \brief cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks. + */ +enum GridMappingStrategy +{ + /** + * \brief An "even-share" strategy for assigning input tiles to thread blocks. + * + * \par Overview + * The input is evenly partitioned into \p p segments, where \p p is + * constant and corresponds loosely to the number of thread blocks that may + * actively reside on the target device. Each segment is comprised of + * consecutive tiles, where a tile is a small, constant-sized unit of input + * to be processed to completion before the thread block terminates or + * obtains more work. The kernel invokes \p p thread blocks, each + * of which iteratively consumes a segment of n/p elements + * in tile-size increments. + */ + GRID_MAPPING_EVEN_SHARE, + + /** + * \brief A dynamic "queue-based" strategy for assigning input tiles to thread blocks. + * + * \par Overview + * The input is treated as a queue to be dynamically consumed by a grid of + * thread blocks. Work is atomically dequeued in tiles, where a tile is a + * unit of input to be processed to completion before the thread block + * terminates or obtains more work. The grid size \p p is constant, + * loosely corresponding to the number of thread blocks that may actively + * reside on the target device. + */ + GRID_MAPPING_DYNAMIC, +}; + + +/** @} */ // end group GridModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/SRC/cub/grid/grid_queue.cuh b/SRC/cub/grid/grid_queue.cuh new file mode 100644 index 00000000..86566166 --- /dev/null +++ b/SRC/cub/grid/grid_queue.cuh @@ -0,0 +1,216 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
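
// The two strategies above as schematic per-block consume loops (shapes are
// illustrative, not CUB's actual ConsumeRange implementations; real kernels
// dequeue once per block in thread 0 and broadcast through shared memory):
__device__ void consume_even_share(int block_begin, int block_end)
{
    for (int tile = block_begin; tile < block_end; ++tile)
    {
        // ... process tile to completion ...
    }
}

__device__ void consume_dynamic(int *d_tile_counter, int num_tiles)
{
    for (int tile = atomicAdd(d_tile_counter, 1);
         tile < num_tiles;
         tile = atomicAdd(d_tile_counter, 1))
    {
        // ... process tile to completion ...
    }
}
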
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::GridQueue is a descriptor utility for dynamic queue management.
+ */
+
+#pragma once
+
+#include "../util_namespace.cuh"
+#include "../util_debug.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup GridModule
+ * @{
+ */
+
+
+/**
+ * \brief GridQueue is a descriptor utility for dynamic queue management.
+ *
+ * \par Overview
+ * GridQueue descriptors provide abstractions for "filling" or
+ * "draining" globally-shared vectors.
+ *
+ * \par
+ * A "filling" GridQueue works by atomically-adding to a zero-initialized counter,
+ * returning a unique offset for the calling thread to write its items.
+ * The GridQueue maintains the total "fill-size". The fill counter must be reset
+ * using GridQueue::ResetFill by the host or kernel instance prior to the kernel instance that
+ * will be filling.
+ *
+ * \par
+ * Similarly, a "draining" GridQueue works by atomically-incrementing a
+ * zero-initialized counter, returning a unique offset for the calling thread to
+ * read its items. Threads can safely drain until the array's logical fill-size is
+ * exceeded. The drain counter must be reset using GridQueue::ResetDrain or
+ * GridQueue::FillAndResetDrain by the host or kernel instance prior to the kernel instance that
+ * will be draining. (For dynamic work distribution of existing data, the corresponding fill-size
+ * is simply the number of elements in the array.)
+ *
+ * \par
+ * Iterative work management can be implemented simply with a pair of flip-flopping
+ * work buffers, each with an associated set of fill and drain GridQueue descriptors.
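+ *
+ * \par
+ * A minimal sketch of the drain protocol (drain_kernel is illustrative, not
+ * a CUB kernel): the host sets the fill-size with FillAndResetDrain(), after
+ * which each thread claims a unique offset via Drain():
+ * \code
+ * __global__ void drain_kernel(cub::GridQueue<int> queue, int *d_in, int num_items)
+ * {
+ *     int offset = queue.Drain(1);   // atomically claim one item
+ *     if (offset < num_items)        // stop at the logical fill-size
+ *     {
+ *         // ... process d_in[offset] ...
+ *     }
+ * }
+ * \endcode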
+ *
+ * \tparam Offset Signed integer type for global offsets
+ */
+template <typename Offset>
+class GridQueue
+{
+private:
+
+    /// Counter indices
+    enum
+    {
+        FILL    = 0,
+        DRAIN   = 1,
+    };
+
+    /// Pair of counters
+    Offset *d_counters;
+
+public:
+
+    /// Returns the device allocation size in bytes needed to construct a GridQueue instance
+    __host__ __device__ __forceinline__
+    static size_t AllocationSize()
+    {
+        return sizeof(Offset) * 2;
+    }
+
+
+    /// Constructs an invalid GridQueue descriptor
+    __host__ __device__ __forceinline__ GridQueue()
+    :
+        d_counters(NULL)
+    {}
+
+
+    /// Constructs a GridQueue descriptor around the device storage allocation
+    __host__ __device__ __forceinline__ GridQueue(
+        void *d_storage)    ///< Device allocation to back the GridQueue. Must be at least as big as AllocationSize().
+    :
+        d_counters((Offset*) d_storage)
+    {}
+
+
+    /// This operation sets the fill-size and resets the drain counter, preparing the GridQueue for draining in the next kernel instance. To be called by the host or by a kernel prior to that which will be draining.
+    __host__ __device__ __forceinline__ cudaError_t FillAndResetDrain(
+        Offset fill_size,
+        cudaStream_t stream = 0)
+    {
+#if (CUB_PTX_ARCH > 0)
+        d_counters[FILL] = fill_size;
+        d_counters[DRAIN] = 0;
+        return cudaSuccess;
+#else
+        Offset counters[2];
+        counters[FILL] = fill_size;
+        counters[DRAIN] = 0;
+        return CubDebug(cudaMemcpyAsync(d_counters, counters, sizeof(Offset) * 2, cudaMemcpyHostToDevice, stream));
+#endif
+    }
+
+
+    /// This operation resets the drain so that it may advance to meet the existing fill-size. To be called by the host or by a kernel prior to that which will be draining.
+    __host__ __device__ __forceinline__ cudaError_t ResetDrain(cudaStream_t stream = 0)
+    {
+#if (CUB_PTX_ARCH > 0)
+        d_counters[DRAIN] = 0;
+        return cudaSuccess;
+#else
+        return FillAndResetDrain(0, stream);
+#endif
+    }
+
+
+    /// This operation resets the fill counter. To be called by the host or by a kernel prior to that which will be filling.
+    __host__ __device__ __forceinline__ cudaError_t ResetFill()
+    {
+#if (CUB_PTX_ARCH > 0)
+        d_counters[FILL] = 0;
+        return cudaSuccess;
+#else
+        return CubDebug(cudaMemset(d_counters + FILL, 0, sizeof(Offset)));
+#endif
+    }
+
+
+    /// Returns the fill-size established by the parent or by the previous kernel.
+    __host__ __device__ __forceinline__ cudaError_t FillSize(
+        Offset &fill_size,
+        cudaStream_t stream = 0)
+    {
+#if (CUB_PTX_ARCH > 0)
+        fill_size = d_counters[FILL];
+        return cudaSuccess;
+#else
+        return CubDebug(cudaMemcpyAsync(&fill_size, d_counters + FILL, sizeof(Offset), cudaMemcpyDeviceToHost, stream));
+#endif
+    }
+
+
+    /// Drain num_items. Returns offset from which to read items.
+    __device__ __forceinline__ Offset Drain(Offset num_items)
+    {
+        return atomicAdd(d_counters + DRAIN, num_items);
+    }
+
+
+    /// Fill num_items. Returns offset from which to write items.
+ __device__ __forceinline__ Offset Fill(Offset num_items) + { + return atomicAdd(d_counters + FILL, num_items); + } +}; + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + +/** + * Reset grid queue (call with 1 block of 1 thread) + */ +template +__global__ void FillAndResetDrainKernel( + GridQueue grid_queue, + Offset num_items) +{ + grid_queue.FillAndResetDrain(num_items); +} + + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/** @} */ // end group GridModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/SRC/cub/host/spinlock.cuh b/SRC/cub/host/spinlock.cuh new file mode 100644 index 00000000..6e4b47c7 --- /dev/null +++ b/SRC/cub/host/spinlock.cuh @@ -0,0 +1,123 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Simple x86/x64 atomic spinlock, portable across MS Windows (cl.exe) & Linux (g++)
+ */
+
+
+#pragma once
+
+#if defined(_WIN32) || defined(_WIN64)
+    #include <intrin.h>
+    #include <windows.h>
+    #undef small            // Windows is terrible for polluting macro namespace
+
+    /**
+     * Compiler read/write barrier
+     */
+    #pragma intrinsic(_ReadWriteBarrier)
+
+#endif
+
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+#if defined(_MSC_VER)
+
+    // Microsoft VC++
+    typedef long Spinlock;
+
+#else
+
+    // GNU g++
+    typedef int Spinlock;
+
+    /**
+     * Compiler read/write barrier
+     */
+    __forceinline__ void _ReadWriteBarrier()
+    {
+        __sync_synchronize();
+    }
+
+    /**
+     * Atomic exchange
+     */
+    __forceinline__ long _InterlockedExchange(volatile int * const Target, const int Value)
+    {
+        // NOTE: __sync_lock_test_and_set would be an acquire barrier, so we force a full barrier
+        _ReadWriteBarrier();
+        return __sync_lock_test_and_set(Target, Value);
+    }
+
+    /**
+     * Pause instruction to prevent excess processor bus usage
+     */
+    __forceinline__ void YieldProcessor()
+    {
+#ifndef __arm__
+        asm volatile("pause\n": : :"memory");
+#endif  // __arm__
+    }
+
+#endif  // defined(_MSC_VER)
+
+/**
+ * Return when the specified spinlock has been acquired
+ */
+__forceinline__ void Lock(volatile Spinlock *lock)
+{
+    while (1)
+    {
+        if (!_InterlockedExchange(lock, 1)) return;
+        while (*lock) YieldProcessor();
+    }
+}
+
+
+/**
+ * Release the specified spinlock
+ */
+__forceinline__ void Unlock(volatile Spinlock *lock)
+{
+    _ReadWriteBarrier();
+    *lock = 0;
+}
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/SRC/cub/iterator/arg_index_input_iterator.cuh b/SRC/cub/iterator/arg_index_input_iterator.cuh
new file mode 100644
index 00000000..03b842d4
--- /dev/null
+++ b/SRC/cub/iterator/arg_index_input_iterator.cuh
@@ -0,0 +1,255 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Random-access iterator types
+ */
+
+#pragma once
+
+#include <iterator>
+#include <iostream>
+
+#include "../thread/thread_load.cuh"
+#include "../thread/thread_store.cuh"
+#include "../util_device.cuh"
+#include "../util_namespace.cuh"
+
+#include <thrust/version.h>
+
+#if (THRUST_VERSION >= 100700)
+    // This iterator is compatible with Thrust API 1.7 and newer
+    #include <thrust/iterator/iterator_facade.h>
+    #include <thrust/iterator/iterator_traits.h>
+#endif // THRUST_VERSION
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \addtogroup UtilIterator
+ * @{
+ */
+
+
+/**
+ * \brief A random-access input wrapper for pairing dereferenced values with their corresponding indices (forming \p ItemOffsetPair tuples).
+ *
+ * \par Overview
+ * - ArgIndexInputIterator wraps a random access input iterator \p itr of type \p InputIterator.
+ *   Dereferencing an ArgIndexInputIterator at offset \p i produces a \p ItemOffsetPair value whose
+ *   \p offset field is \p i and whose \p item field is itr[i].
+ * - Can be used with any data type.
+ * - Can be constructed, manipulated, and exchanged within and between host and device
+ *   functions.  Wrapped host memory can only be dereferenced on the host, and wrapped
+ *   device memory can only be dereferenced on the device.
+ * - Compatible with Thrust API v1.7 or newer.
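+ *
+ * \par
+ * For reference, the \p value_type produced on dereference is a
+ * cub::ItemOffsetPair, which behaves like the simplified sketch below
+ * (an illustrative rendering added here, not the library's verbatim
+ * definition):
+ * \code
+ * template <typename T, typename Offset>
+ * struct ItemOffsetPair
+ * {
+ *     T      value;    // the dereferenced item, itr[i]
+ *     Offset offset;   // its index, i
+ * };
+ * \endcode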
+ *
+ * \par Snippet
+ * The code snippet below illustrates the use of \p ArgIndexInputIterator to
+ * dereference an array of doubles
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/iterator/arg_index_input_iterator.cuh>
+ *
+ * // Declare, allocate, and initialize a device array
+ * double *d_in;            // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0]
+ *
+ * // Create an iterator wrapper
+ * cub::ArgIndexInputIterator<double*> itr(d_in);
+ *
+ * // Within device code:
+ * typedef typename cub::ArgIndexInputIterator<double*>::value_type Tuple;
+ * Tuple item_offset_pair = *itr;
+ * printf("%f @ %d\n",
+ *     item_offset_pair.value,
+ *     item_offset_pair.offset);   // 8.0 @ 0
+ *
+ * itr = itr + 6;
+ * item_offset_pair = *itr;
+ * printf("%f @ %d\n",
+ *     item_offset_pair.value,
+ *     item_offset_pair.offset);   // 9.0 @ 6
+ *
+ * \endcode
+ *
+ * \tparam InputIterator    The type of the wrapped input iterator
+ * \tparam Offset           The difference type of this iterator (Default: \p ptrdiff_t)
+ */
+template <
+    typename InputIterator,
+    typename Offset = ptrdiff_t>
+class ArgIndexInputIterator
+{
+private:
+
+    // Data type of input iterator
+    typedef typename std::iterator_traits<InputIterator>::value_type T;
+
+public:
+
+
+    // Required iterator traits
+    typedef ArgIndexInputIterator               self_type;          ///< My own type
+    typedef Offset                              difference_type;    ///< Type to express the result of subtracting one iterator from another
+    typedef ItemOffsetPair<T, difference_type>  value_type;         ///< The type of the element the iterator can point to
+    typedef value_type*                         pointer;            ///< The type of a pointer to an element the iterator can point to
+    typedef value_type                          reference;          ///< The type of a reference to an element the iterator can point to
+
+#if (THRUST_VERSION >= 100700)
+    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
+    typedef typename thrust::detail::iterator_facade_category<
+        thrust::any_system_tag,
+        thrust::random_access_traversal_tag,
+        value_type,
+        reference
+      >::type iterator_category;                                    ///< The iterator category
+#else
+    typedef std::random_access_iterator_tag     iterator_category;  ///< The iterator category
+#endif  // THRUST_VERSION
+
+private:
+
+    InputIterator   itr;
+    difference_type offset;
+
+public:
+
+    /// Constructor
+    __host__ __device__ __forceinline__ ArgIndexInputIterator(
+        InputIterator   itr,            ///< Input iterator to wrap
+        difference_type offset = 0)     ///< Offset (in items) from \p itr denoting the position of the iterator
+    :
+        itr(itr),
+        offset(offset)
+    {}
+
+    /// Postfix increment
+    __host__ __device__ __forceinline__ self_type operator++(int)
+    {
+        self_type retval = *this;
+        offset++;
+        return retval;
+    }
+
+    /// Prefix increment
+    __host__ __device__ __forceinline__ self_type operator++()
+    {
+        offset++;
+        return *this;
+    }
+
+    /// Indirection
+    __host__ __device__ __forceinline__ reference operator*() const
+    {
+        value_type retval;
+        retval.value = itr[offset];
+        retval.offset = offset;
+        return retval;
+    }
+
+    /// Addition
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
+    {
+        self_type retval(itr, offset + n);
+        return retval;
+    }
+
+    /// Addition assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
+    {
+        offset += n;
+        return *this;
+    }
+
+    /// Subtraction
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
+    {
+        self_type retval(itr, offset - n);
+        return retval;
+    }
+
+    /// Subtraction assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
+    {
+        offset -= n;
+        return *this;
+    }
+
+    /// Distance
+    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
+    {
+        return offset - other.offset;
+    }
+
+    /// Array subscript
+    template <typename Distance>
+    __host__ __device__ __forceinline__ reference operator[](Distance n) const
+    {
+        return *(*this + n);
+    }
+
+    /// Structure dereference
+    __host__ __device__ __forceinline__ pointer operator->()
+    {
+        return &(*(*this));
+    }
+
+    /// Equal to
+    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
+    {
+        return ((itr == rhs.itr) && (offset == rhs.offset));
+    }
+
+    /// Not equal to
+    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
+    {
+        return ((itr != rhs.itr) || (offset != rhs.offset));
+    }
+
+    /// ostream operator
+    friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
+    {
+        return os;
+    }
+};
+
+
+
+/** @} */       // end group UtilIterator
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/SRC/cub/iterator/cache_modified_input_iterator.cuh b/SRC/cub/iterator/cache_modified_input_iterator.cuh
new file mode 100644
index 00000000..16ba3a4a
--- /dev/null
+++ b/SRC/cub/iterator/cache_modified_input_iterator.cuh
@@ -0,0 +1,240 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.
+ * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../util_device.cuh" +#include "../util_namespace.cuh" + +#if (THRUST_VERSION >= 100700) + // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + + +/** + * \addtogroup UtilIterator + * @{ + */ + + +/** + * \brief A random-access input wrapper for dereferencing array values using a PTX cache load modifier. + * + * \par Overview + * - CacheModifiedInputIterator is a random-access input iterator that wraps a native + * device pointer of type ValueType*. \p ValueType references are + * made by reading \p ValueType values through loads modified by \p MODIFIER. + * - Can be used to load any data type from memory using PTX cache load modifiers (e.g., "LOAD_LDG", + * "LOAD_CG", "LOAD_CA", "LOAD_CS", "LOAD_CV", etc.). + * - Can be constructed, manipulated, and exchanged within and between host and device + * functions, but can only be dereferenced within device functions. + * - Compatible with Thrust API v1.7 or newer. + * + * \par Snippet + * The code snippet below illustrates the use of \p CacheModifiedInputIterator to + * dereference a device array of double using the "ldg" PTX load modifier + * (i.e., load values through texture cache). + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize a device array + * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0] + * + * // Create an iterator wrapper + * cub::CacheModifiedInputIterator itr(d_in); + * + * // Within device code: + * printf("%f\n", itr[0]); // 8.0 + * printf("%f\n", itr[1]); // 6.0 + * printf("%f\n", itr[6]); // 9.0 + * + * \endcode + * + * \tparam CacheLoadModifier The cub::CacheLoadModifier to use when accessing data + * \tparam ValueType The value type of this iterator + * \tparam Offset The difference type of this iterator (Default: \p ptrdiff_t) + */ +template < + CacheLoadModifier MODIFIER, + typename ValueType, + typename Offset = ptrdiff_t> +class CacheModifiedInputIterator +{ +public: + + // Required iterator traits + typedef CacheModifiedInputIterator self_type; ///< My own type + typedef Offset difference_type; ///< Type to express the result of subtracting one iterator from another + typedef ValueType value_type; ///< The type of the element the iterator can point to + typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to + typedef ValueType reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::device_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + + +private: + + ValueType* ptr; + +public: + + /// Constructor + __host__ __device__ __forceinline__ CacheModifiedInputIterator( + ValueType* ptr) ///< Native pointer to wrap + : + ptr(ptr) + {} + + /// Postfix increment + __host__ __device__ 
__forceinline__ self_type operator++(int)
+    {
+        self_type retval = *this;
+        ptr++;
+        return retval;
+    }
+
+    /// Prefix increment
+    __host__ __device__ __forceinline__ self_type operator++()
+    {
+        ptr++;
+        return *this;
+    }
+
+    /// Indirection
+    __host__ __device__ __forceinline__ reference operator*() const
+    {
+        return ThreadLoad<MODIFIER>(ptr);
+    }
+
+    /// Addition
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
+    {
+        self_type retval(ptr + n);
+        return retval;
+    }
+
+    /// Addition assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
+    {
+        ptr += n;
+        return *this;
+    }
+
+    /// Subtraction
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
+    {
+        self_type retval(ptr - n);
+        return retval;
+    }
+
+    /// Subtraction assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
+    {
+        ptr -= n;
+        return *this;
+    }
+
+    /// Distance
+    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
+    {
+        return ptr - other.ptr;
+    }
+
+    /// Array subscript
+    template <typename Distance>
+    __host__ __device__ __forceinline__ reference operator[](Distance n) const
+    {
+        return ThreadLoad<MODIFIER>(ptr + n);
+    }
+
+    /// Structure dereference
+    __host__ __device__ __forceinline__ pointer operator->()
+    {
+        return &ThreadLoad<MODIFIER>(ptr);
+    }
+
+    /// Equal to
+    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
+    {
+        return (ptr == rhs.ptr);
+    }
+
+    /// Not equal to
+    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
+    {
+        return (ptr != rhs.ptr);
+    }
+
+    /// ostream operator
+    friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
+    {
+        return os;
+    }
+};
+
+
+
+/** @} */       // end group UtilIterator
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/SRC/cub/iterator/cache_modified_output_iterator.cuh b/SRC/cub/iterator/cache_modified_output_iterator.cuh
new file mode 100644
index 00000000..179ce146
--- /dev/null
+++ b/SRC/cub/iterator/cache_modified_output_iterator.cuh
@@ -0,0 +1,253 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../util_device.cuh" +#include "../util_namespace.cuh" + +#if (THRUST_VERSION >= 100700) + // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilIterator + * @{ + */ + + +/** + * \brief A random-access output wrapper for storing array values using a PTX cache-modifier. + * + * \par Overview + * - CacheModifiedOutputIterator is a random-access output iterator that wraps a native + * device pointer of type ValueType*. \p ValueType references are + * made by writing \p ValueType values through stores modified by \p MODIFIER. + * - Can be used to store any data type to memory using PTX cache store modifiers (e.g., "STORE_WB", + * "STORE_CG", "STORE_CS", "STORE_WT", etc.). + * - Can be constructed, manipulated, and exchanged within and between host and device + * functions, but can only be dereferenced within device functions. + * - Compatible with Thrust API v1.7 or newer. + * + * \par Snippet + * The code snippet below illustrates the use of \p CacheModifiedOutputIterator to + * dereference a device array of doubles using the "wt" PTX load modifier + * (i.e., write-through to system memory). 
+ * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize a device array + * double *d_out; // e.g., [, , , , , , ] + * + * // Create an iterator wrapper + * cub::CacheModifiedOutputIterator itr(d_out); + * + * // Within device code: + * itr[0] = 8.0; + * itr[1] = 66.0; + * itr[55] = 24.0; + * + * \endcode + * + * \par Usage Considerations + * - Can only be dereferenced within device code + * + * \tparam CacheStoreModifier The cub::CacheStoreModifier to use when accessing data + * \tparam ValueType The value type of this iterator + * \tparam Offset The difference type of this iterator (Default: \p ptrdiff_t) + */ +template < + CacheStoreModifier MODIFIER, + typename ValueType, + typename Offset = ptrdiff_t> +class CacheModifiedOutputIterator +{ +private: + + // Proxy object + struct Reference + { + ValueType* ptr; + + /// Constructor + __host__ __device__ __forceinline__ Reference(ValueType* ptr) : ptr(ptr) {} + + /// Assignment + __host__ __device__ __forceinline__ ValueType operator =(ValueType val) + { + ThreadStore(ptr, val); + return val; + } + }; + +public: + + // Required iterator traits + typedef CacheModifiedOutputIterator self_type; ///< My own type + typedef Offset difference_type; ///< Type to express the result of subtracting one iterator from another + typedef ValueType value_type; ///< The type of the element the iterator can point to + typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to + typedef Reference reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::device_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + +private: + + ValueType* ptr; + +public: + + /// Constructor + __host__ __device__ __forceinline__ CacheModifiedOutputIterator( + ValueType* ptr) ///< Native pointer to wrap + : + ptr(ptr) + {} + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + ptr++; + return retval; + } + + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + ptr++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ reference operator*() const + { + return Reference(ptr); + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval(ptr + n); + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + ptr += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval(ptr - n); + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) + { + ptr -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return ptr - other.ptr; + } + + /// Array subscript + template + __host__ __device__ __forceinline__ reference operator[](Distance n) const + { + return Reference(ptr 
+ n); + } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return (ptr == rhs.ptr); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return (ptr != rhs.ptr); + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& itr) + { + return os; + } +}; + + +/** @} */ // end group UtilIterator + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/iterator/constant_input_iterator.cuh b/SRC/cub/iterator/constant_input_iterator.cuh new file mode 100644 index 00000000..4c386a6b --- /dev/null +++ b/SRC/cub/iterator/constant_input_iterator.cuh @@ -0,0 +1,235 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../util_namespace.cuh" + +#if (THRUST_VERSION >= 100700) + // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilIterator + * @{ + */ + + +/** + * \brief A random-access input generator for dereferencing a sequence of homogeneous values + * + * \par Overview + * - Read references to a ConstantInputIterator iterator always return the supplied constant + * of type \p ValueType. + * - Can be used with any data type. + * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device + * functions. + * - Compatible with Thrust API v1.7 or newer. 
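+ *
+ * \par
+ * In other words, every read position yields the same constant; a minimal
+ * host-side illustration (an addition to the original documentation):
+ * \code
+ * cub::ConstantInputIterator<double> itr(5.0);
+ * assert(itr[0] == itr[1000]);     // both yield 5.0 regardless of offset
+ * \endcode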
+ * + * \par Snippet + * The code snippet below illustrates the use of \p ConstantInputIterator to + * dereference a sequence of homogeneous doubles. + * \par + * \code + * #include // or equivalently + * + * cub::ConstantInputIterator itr(5.0); + * + * printf("%f\n", itr[0]); // 5.0 + * printf("%f\n", itr[1]); // 5.0 + * printf("%f\n", itr[2]); // 5.0 + * printf("%f\n", itr[50]); // 5.0 + * + * \endcode + * + * \tparam ValueType The value type of this iterator + * \tparam Offset The difference type of this iterator (Default: \p ptrdiff_t) + */ +template < + typename ValueType, + typename Offset = ptrdiff_t> +class ConstantInputIterator +{ +public: + + // Required iterator traits + typedef ConstantInputIterator self_type; ///< My own type + typedef Offset difference_type; ///< Type to express the result of subtracting one iterator from another + typedef ValueType value_type; ///< The type of the element the iterator can point to + typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to + typedef ValueType reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::any_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + +private: + + ValueType val; + Offset offset; +#ifdef _WIN32 + Offset pad[CUB_MAX(1, (16 / sizeof(Offset) - 1))]; // Workaround for win32 parameter-passing bug (ulonglong2 argmin DeviceReduce) +#endif + +public: + + /// Constructor + __host__ __device__ __forceinline__ ConstantInputIterator( + ValueType val, ///< Starting value for the iterator instance to report + Offset offset = 0) ///< Base offset + : + val(val), + offset(offset) + {} + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + offset++; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + offset++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ reference operator*() const + { + return val; + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval(val, offset + n); + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + offset += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval(val, offset - n); + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) + { + offset -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return offset - other.offset; + } + + /// Array subscript + template + __host__ __device__ __forceinline__ reference operator[](Distance n) const + { + return val; + } + + /// Structure dereference + __host__ __device__ __forceinline__ pointer operator->() + { + return &val; + } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) 
+ { + return (offset == rhs.offset) && ((val == rhs.val)); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return (offset != rhs.offset) || (val!= rhs.val); + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& itr) + { + os << "[" << itr.val << "," << itr.offset << "]"; + return os; + } + +}; + + +/** @} */ // end group UtilIterator + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/iterator/counting_input_iterator.cuh b/SRC/cub/iterator/counting_input_iterator.cuh new file mode 100644 index 00000000..7c6320f9 --- /dev/null +++ b/SRC/cub/iterator/counting_input_iterator.cuh @@ -0,0 +1,228 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../util_device.cuh" +#include "../util_namespace.cuh" + +#if (THRUST_VERSION >= 100700) + // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilIterator + * @{ + */ + +/** + * \brief A random-access input generator for dereferencing a sequence of incrementing integer values. + * + * \par Overview + * - After initializing a CountingInputIterator to a certain integer \p base, read references + * at \p offset will return the value \p base + \p offset. + * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device + * functions. + * - Compatible with Thrust API v1.7 or newer. 
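+ *
+ * \par
+ * Equivalently, the iterator satisfies itr[n] == base + n; a minimal
+ * host-side illustration (an addition to the original documentation):
+ * \code
+ * cub::CountingInputIterator<int> itr(5);
+ * assert(itr[10] == 15);   // base (5) + offset (10)
+ * \endcode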
+ * + * \par Snippet + * The code snippet below illustrates the use of \p CountingInputIterator to + * dereference a sequence of incrementing integers. + * \par + * \code + * #include // or equivalently + * + * cub::CountingInputIterator itr(5); + * + * printf("%d\n", itr[0]); // 5 + * printf("%d\n", itr[1]); // 6 + * printf("%d\n", itr[2]); // 7 + * printf("%d\n", itr[50]); // 55 + * + * \endcode + * + * \tparam ValueType The value type of this iterator + * \tparam Offset The difference type of this iterator (Default: \p ptrdiff_t) + */ +template < + typename ValueType, + typename Offset = ptrdiff_t> +class CountingInputIterator +{ +public: + + // Required iterator traits + typedef CountingInputIterator self_type; ///< My own type + typedef Offset difference_type; ///< Type to express the result of subtracting one iterator from another + typedef ValueType value_type; ///< The type of the element the iterator can point to + typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to + typedef ValueType reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::any_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + +private: + + ValueType val; + +public: + + /// Constructor + __host__ __device__ __forceinline__ CountingInputIterator( + const ValueType &val) ///< Starting value for the iterator instance to report + : + val(val) + {} + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + val++; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + val++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ reference operator*() const + { + return val; + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval(val + n); + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + val += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval(val - n); + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) + { + val -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return val - other.val; + } + + /// Array subscript + template + __host__ __device__ __forceinline__ reference operator[](Distance n) const + { + return val + n; + } + + /// Structure dereference + __host__ __device__ __forceinline__ pointer operator->() + { + return &val; + } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return (val == rhs.val); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return (val != rhs.val); + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const 
self_type& itr) + { + os << "[" << itr.val << "]"; + return os; + } + +}; + + + +/** @} */ // end group UtilIterator + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/iterator/tex_obj_input_iterator.cuh b/SRC/cub/iterator/tex_obj_input_iterator.cuh new file mode 100644 index 00000000..be5c79c1 --- /dev/null +++ b/SRC/cub/iterator/tex_obj_input_iterator.cuh @@ -0,0 +1,308 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../util_device.cuh" +#include "../util_debug.cuh" +#include "../util_namespace.cuh" + +#if (THRUST_VERSION >= 100700) + // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilIterator + * @{ + */ + + + +/** + * \brief A random-access input wrapper for dereferencing array values through texture cache. Uses newer Kepler-style texture objects. + * + * \par Overview + * - TexObjInputIterator wraps a native device pointer of type ValueType*. References + * to elements are to be loaded through texture cache. + * - Can be used to load any data type from memory through texture cache. + * - Can be manipulated and exchanged within and between host and device + * functions, can only be constructed within host functions, and can only be + * dereferenced within device functions. + * - With regard to nested/dynamic parallelism, TexObjInputIterator iterators may only be + * created by the host thread, but can be used by any descendant kernel. + * - Compatible with Thrust API v1.7 or newer. 
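+ *
+ * \par
+ * Since BindTexture() returns a \p cudaError_t, host code may wish to check
+ * it before use; a minimal illustrative pattern (an addition to the original
+ * documentation, with error handling elided):
+ * \code
+ * cub::TexObjInputIterator<double> itr;
+ * cudaError_t error = itr.BindTexture(d_in, sizeof(double) * num_items);
+ * if (error != cudaSuccess)
+ *     ...                          // handle bind failure
+ * \endcode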
+ *
+ * \par Snippet
+ * The code snippet below illustrates the use of \p TexObjInputIterator to
+ * dereference a device array of doubles through texture cache.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/iterator/tex_obj_input_iterator.cuh>
+ *
+ * // Declare, allocate, and initialize a device array
+ * int num_items;   // e.g., 7
+ * double *d_in;    // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0]
+ *
+ * // Create an iterator wrapper
+ * cub::TexObjInputIterator<double> itr;
+ * itr.BindTexture(d_in, sizeof(double) * num_items);
+ * ...
+ *
+ * // Within device code:
+ * printf("%f\n", itr[0]);  // 8.0
+ * printf("%f\n", itr[1]);  // 6.0
+ * printf("%f\n", itr[6]);  // 9.0
+ *
+ * ...
+ * itr.UnbindTexture();
+ *
+ * \endcode
+ *
+ * \tparam T        The value type of this iterator
+ * \tparam Offset   The difference type of this iterator (Default: \p ptrdiff_t)
+ */
+template <
+    typename T,
+    typename Offset = ptrdiff_t>
+class TexObjInputIterator
+{
+public:
+
+    // Required iterator traits
+    typedef TexObjInputIterator             self_type;          ///< My own type
+    typedef Offset                          difference_type;    ///< Type to express the result of subtracting one iterator from another
+    typedef T                               value_type;         ///< The type of the element the iterator can point to
+    typedef T*                              pointer;            ///< The type of a pointer to an element the iterator can point to
+    typedef T                               reference;          ///< The type of a reference to an element the iterator can point to
+
+#if (THRUST_VERSION >= 100700)
+    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
+    typedef typename thrust::detail::iterator_facade_category<
+        thrust::device_system_tag,
+        thrust::random_access_traversal_tag,
+        value_type,
+        reference
+      >::type iterator_category;                                ///< The iterator category
+#else
+    typedef std::random_access_iterator_tag iterator_category;  ///< The iterator category
+#endif  // THRUST_VERSION
+
+private:
+
+    // Largest texture word we can use in device
+    typedef typename UnitWord<T>::TextureWord TextureWord;
+
+    // Number of texture words per T
+    enum {
+        TEXTURE_MULTIPLE = sizeof(T) / sizeof(TextureWord)
+    };
+
+private:
+
+    T*                  ptr;
+    difference_type     tex_offset;
+    cudaTextureObject_t tex_obj;
+
+public:
+
+    /// Constructor
+    __host__ __device__ __forceinline__ TexObjInputIterator()
+    :
+        ptr(NULL),
+        tex_offset(0),
+        tex_obj(0)
+    {}
+
+    /// Use this iterator to bind \p ptr with a texture reference
+    cudaError_t BindTexture(
+        T       *ptr,               ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment
+        size_t  bytes,              ///< Number of bytes in the range
+        size_t  tex_offset = 0)     ///< Offset (in items) from \p ptr denoting the position of the iterator
+    {
+        this->ptr = ptr;
+        this->tex_offset = tex_offset;
+
+        cudaChannelFormatDesc   channel_desc = cudaCreateChannelDesc<TextureWord>();
+        cudaResourceDesc        res_desc;
+        cudaTextureDesc         tex_desc;
+        memset(&res_desc, 0, sizeof(cudaResourceDesc));
+        memset(&tex_desc, 0, sizeof(cudaTextureDesc));
+        res_desc.resType                = cudaResourceTypeLinear;
+        res_desc.res.linear.devPtr      = ptr;
+        res_desc.res.linear.desc        = channel_desc;
+        res_desc.res.linear.sizeInBytes = bytes;
+        tex_desc.readMode               = cudaReadModeElementType;
+        return cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL);
+    }
+
+    /// Unbind this iterator from its texture reference
+    cudaError_t UnbindTexture()
+    {
+        return cudaDestroyTextureObject(tex_obj);
+    }
+
+    /// Postfix increment
+    __host__ __device__ __forceinline__ self_type operator++(int)
+    {
+        self_type retval = *this;
+        tex_offset++;
+        return retval;
+    }
+
+    /// Prefix increment
+    __host__ __device__ __forceinline__ self_type operator++()
+    {
+        tex_offset++;
+        return *this;
+    }
+
+    /// Indirection
+    __host__ __device__ __forceinline__ reference operator*() const
+    {
+#if (CUB_PTX_ARCH == 0)
+        // Simply dereference the pointer on the host
+        return ptr[tex_offset];
+#else
+        // Move array of uninitialized words, then alias and assign to return value
+        TextureWord words[TEXTURE_MULTIPLE];
+
+        #pragma unroll
+        for (int i = 0; i < TEXTURE_MULTIPLE; ++i)
+        {
+            words[i] = tex1Dfetch<TextureWord>(
+                tex_obj,
+                (tex_offset * TEXTURE_MULTIPLE) + i);
+        }
+
+        // Load from words
+        return *reinterpret_cast<T*>(words);
+#endif
+    }
+
+    /// Addition
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
+    {
+        self_type retval;
+        retval.ptr = ptr;
+        retval.tex_obj = tex_obj;
+        retval.tex_offset = tex_offset + n;
+        return retval;
+    }
+
+    /// Addition assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
+    {
+        tex_offset += n;
+        return *this;
+    }
+
+    /// Subtraction
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
+    {
+        self_type retval;
+        retval.ptr = ptr;
+        retval.tex_obj = tex_obj;
+        retval.tex_offset = tex_offset - n;
+        return retval;
+    }
+
+    /// Subtraction assignment
+    template <typename Distance>
+    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
+    {
+        tex_offset -= n;
+        return *this;
+    }
+
+    /// Distance
+    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
+    {
+        return tex_offset - other.tex_offset;
+    }
+
+    /// Array subscript
+    template <typename Distance>
+    __host__ __device__ __forceinline__ reference operator[](Distance n) const
+    {
+        return *(*this + n);
+    }
+
+    /// Structure dereference
+    __host__ __device__ __forceinline__ pointer operator->()
+    {
+        return &(*(*this));
+    }
+
+    /// Equal to
+    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
+    {
+        return ((ptr == rhs.ptr) && (tex_offset == rhs.tex_offset) && (tex_obj == rhs.tex_obj));
+    }
+
+    /// Not equal to
+    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
+    {
+        return ((ptr != rhs.ptr) || (tex_offset != rhs.tex_offset) || (tex_obj != rhs.tex_obj));
+    }
+
+    /// ostream operator
+    friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
+    {
+        return os;
+    }
+
+};
+
+
+
+/** @} */       // end group UtilIterator
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/SRC/cub/iterator/tex_ref_input_iterator.cuh b/SRC/cub/iterator/tex_ref_input_iterator.cuh
new file mode 100644
index 00000000..c1102af5
--- /dev/null
+++ b/SRC/cub/iterator/tex_ref_input_iterator.cuh
@@ -0,0 +1,370 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Random-access iterator types
+ */
+
+#pragma once
+
+#include <iterator>
+#include <iostream>
+
+#include "../thread/thread_load.cuh"
+#include "../thread/thread_store.cuh"
+#include "../util_device.cuh"
+#include "../util_debug.cuh"
+#include "../util_namespace.cuh"
+
+#if (CUDA_VERSION >= 5050) || defined(DOXYGEN_ACTIVE)  // This iterator is compatible with CUDA 5.5 and newer
+
+#if (THRUST_VERSION >= 100700)    // This iterator is compatible with Thrust API 1.7 and newer
+    #include <thrust/iterator/iterator_facade.h>
+    #include <thrust/iterator/iterator_traits.h>
+#endif // THRUST_VERSION
+
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Static file-scope Tesla/Fermi-style texture references
+ *****************************************************************************/
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+// Anonymous namespace
+namespace {
+
+/// Global texture reference specialized by type
+template <typename T>
+struct IteratorTexRef
+{
+    /// And by unique ID
+    template <int UNIQUE_ID>
+    struct TexId
+    {
+        // Largest texture word we can use in device
+        typedef typename UnitWord<T>::DeviceWord DeviceWord;
+        typedef typename UnitWord<T>::TextureWord TextureWord;
+
+        // Number of texture words per T
+        enum {
+            DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord),
+            TEXTURE_MULTIPLE = sizeof(T) / sizeof(TextureWord)
+        };
+
+        // Texture reference type
+        typedef texture<TextureWord> TexRef;
+
+        // Texture reference
+        static TexRef ref;
+
+        /// Bind texture
+        static cudaError_t BindTexture(void *d_in)
+        {
+            if (d_in)
+            {
+                cudaChannelFormatDesc tex_desc = cudaCreateChannelDesc<TextureWord>();
+                ref.channelDesc = tex_desc;
+                return (CubDebug(cudaBindTexture(NULL, ref, d_in)));
+            }
+
+            return cudaSuccess;
+        }
+
+        /// Unbind texture
+        static cudaError_t UnbindTexture()
+        {
+            return CubDebug(cudaUnbindTexture(ref));
+        }
+
+        /// Fetch element
+        template <typename Distance>
+        static __device__ __forceinline__ T Fetch(Distance tex_offset)
+        {
+            DeviceWord temp[DEVICE_MULTIPLE];
+            TextureWord *words = reinterpret_cast<TextureWord*>(temp);
+
+            #pragma unroll
+            for (int i = 0; i < TEXTURE_MULTIPLE; ++i)
+            {
+                words[i] = tex1Dfetch(ref, (tex_offset * TEXTURE_MULTIPLE) + i);
+            }
+
+            return reinterpret_cast<T&>(temp);
+        }
+    };
+};
+
+// Texture reference definitions
+template <typename T>
+template <int UNIQUE_ID>
+typename IteratorTexRef<T>::template TexId<UNIQUE_ID>::TexRef IteratorTexRef<T>::template TexId<UNIQUE_ID>::ref = 0;
+
+
+} // Anonymous namespace
+
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+
+/**
+ * \addtogroup
UtilIterator + * @{ + */ + + + +/** + * \brief A random-access input wrapper for dereferencing array values through texture cache. Uses older Tesla/Fermi-style texture references. + * + * \par Overview + * - TexRefInputIterator wraps a native device pointer of type ValueType*. References + * to elements are to be loaded through texture cache. + * - Can be used to load any data type from memory through texture cache. + * - Can be manipulated and exchanged within and between host and device + * functions, can only be constructed within host functions, and can only be + * dereferenced within device functions. + * - The \p UNIQUE_ID template parameter is used to statically name the underlying texture + * reference. Only one TexRefInputIterator instance can be bound at any given time for a + * specific combination of (1) data type \p T, (2) \p UNIQUE_ID, (3) host + * thread, and (4) compilation .o unit. + * - With regard to nested/dynamic parallelism, TexRefInputIterator iterators may only be + * created by the host thread and used by a top-level kernel (i.e. the one which is launched + * from the host). + * - Compatible with Thrust API v1.7 or newer. + * - Compatible with CUDA toolkit v5.5 or newer. + * + * \par Snippet + * The code snippet below illustrates the use of \p TexRefInputIterator to + * dereference a device array of doubles through texture cache. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize a device array + * int num_items; // e.g., 7 + * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0] + * + * // Create an iterator wrapper + * cub::TexRefInputIterator itr; + * itr.BindTexture(d_in, sizeof(double) * num_items); + * ... + * + * // Within device code: + * printf("%f\n", itr[0]); // 8.0 + * printf("%f\n", itr[1]); // 6.0 + * printf("%f\n", itr[6]); // 9.0 + * + * ... 
+ * itr.UnbindTexture(); + * + * \endcode + * + * \tparam T The value type of this iterator + * \tparam UNIQUE_ID A globally-unique identifier (within the compilation unit) to name the underlying texture reference + * \tparam Offset The difference type of this iterator (Default: \p ptrdiff_t) + */ +template < + typename T, + int UNIQUE_ID, + typename Offset = ptrdiff_t> +class TexRefInputIterator +{ +public: + + // Required iterator traits + typedef TexRefInputIterator self_type; ///< My own type + typedef Offset difference_type; ///< Type to express the result of subtracting one iterator from another + typedef T value_type; ///< The type of the element the iterator can point to + typedef T* pointer; ///< The type of a pointer to an element the iterator can point to + typedef T reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::device_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + +private: + + T* ptr; + difference_type tex_offset; + + // Texture reference wrapper (old Tesla/Fermi-style textures) + typedef typename IteratorTexRef::template TexId TexId; + +public: + + /// Constructor + __host__ __device__ __forceinline__ TexRefInputIterator() + : + ptr(NULL), + tex_offset(0) + {} + + /// Use this iterator to bind \p ptr with a texture reference + cudaError_t BindTexture( + T *ptr, ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment + size_t bytes, ///< Number of bytes in the range + size_t tex_offset = 0) ///< Offset (in items) from \p ptr denoting the position of the iterator + { + this->ptr = ptr; + this->tex_offset = tex_offset; + return TexId::BindTexture(ptr); + } + + /// Unbind this iterator from its texture reference + cudaError_t UnbindTexture() + { + return TexId::UnbindTexture(); + } + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + tex_offset++; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + tex_offset++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ reference operator*() const + { +#if (CUB_PTX_ARCH == 0) + // Simply dereference the pointer on the host + return ptr[tex_offset]; +#else + // Use the texture reference + return TexId::Fetch(tex_offset); +#endif + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval; + retval.ptr = ptr; + retval.tex_offset = tex_offset + n; + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + tex_offset += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval; + retval.ptr = ptr; + retval.tex_offset = tex_offset - n; + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) + { + tex_offset -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type 
+    /// Distance
+    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
+    {
+        return tex_offset - other.tex_offset;
+    }
+
+    /// Array subscript
+    template <typename Distance>
+    __host__ __device__ __forceinline__ reference operator[](Distance n) const
+    {
+        return *(*this + n);
+    }
+
+    /// Structure dereference
+    __host__ __device__ __forceinline__ pointer operator->()
+    {
+        return &(*(*this));
+    }
+
+    /// Equal to
+    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
+    {
+        return ((ptr == rhs.ptr) && (tex_offset == rhs.tex_offset));
+    }
+
+    /// Not equal to
+    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
+    {
+        return ((ptr != rhs.ptr) || (tex_offset != rhs.tex_offset));
+    }
+
+    /// ostream operator
+    friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
+    {
+        return os;
+    }
+
+};
+
+
+
+/** @} */       // end group UtilIterator
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+#endif // CUDA_VERSION
diff --git a/SRC/cub/iterator/transform_input_iterator.cuh b/SRC/cub/iterator/transform_input_iterator.cuh
new file mode 100644
index 00000000..90ffbaad
--- /dev/null
+++ b/SRC/cub/iterator/transform_input_iterator.cuh
@@ -0,0 +1,252 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Random-access iterator types
+ */
+
+#pragma once
+
+#include <iterator>
+#include <iostream>
+
+#include "../thread/thread_load.cuh"
+#include "../thread/thread_store.cuh"
+#include "../util_device.cuh"
+#include "../util_namespace.cuh"
+
+#if (THRUST_VERSION >= 100700)
+    // This iterator is compatible with Thrust API 1.7 and newer
+    #include <thrust/iterator/iterator_facade.h>
+    #include <thrust/iterator/iterator_traits.h>
+#endif // THRUST_VERSION
+
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \addtogroup UtilIterator
+ * @{
+ */
+
+
+/**
+ * \brief A random-access input wrapper for transforming dereferenced values.
+ *
+ * \par Overview
+ * - TransformInputIterator wraps a unary conversion functor of type \p
+ *   ConversionOp and a random-access input iterator of type InputIterator,
+ *   using the former to produce references of type \p ValueType from the latter.
+ * - Can be used with any data type.
+ * - Can be constructed, manipulated, and exchanged within and between host and device
+ *   functions.  Wrapped host memory can only be dereferenced on the host, and wrapped
+ *   device memory can only be dereferenced on the device.
+ * - Compatible with Thrust API v1.7 or newer.
+ *
+ * \par Snippet
+ * The code snippet below illustrates the use of \p TransformInputIterator to
+ * dereference an array of integers, tripling the values and converting them to doubles.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/iterator/transform_input_iterator.cuh>
+ *
+ * // Functor for tripling integer values and converting to doubles
+ * struct TripleDoubler
+ * {
+ *     __host__ __device__ __forceinline__
+ *     double operator()(const int &a) const {
+ *         return double(a * 3);
+ *     }
+ * };
+ *
+ * // Declare, allocate, and initialize a device array
+ * int *d_in;                   // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * TripleDoubler conversion_op;
+ *
+ * // Create an iterator wrapper
+ * cub::TransformInputIterator<double, TripleDoubler, int*> itr(d_in, conversion_op);
+ *
+ * // Within device code:
+ * printf("%f\n", itr[0]);  // 24.0
+ * printf("%f\n", itr[1]);  // 18.0
+ * printf("%f\n", itr[6]);  // 27.0
+ *
+ * \endcode
+ *
+ * \tparam ValueType            The value type of this iterator
+ * \tparam ConversionOp         Unary functor type for mapping objects of type \p InputType to type \p ValueType.  Must have member ValueType operator()(const InputType &datum).
+ * \tparam InputIterator The type of the wrapped input iterator + * \tparam Offset The difference type of this iterator (Default: \p ptrdiff_t) + * + */ +template < + typename ValueType, + typename ConversionOp, + typename InputIterator, + typename Offset = ptrdiff_t> +class TransformInputIterator +{ +public: + + // Required iterator traits + typedef TransformInputIterator self_type; ///< My own type + typedef Offset difference_type; ///< Type to express the result of subtracting one iterator from another + typedef ValueType value_type; ///< The type of the element the iterator can point to + typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to + typedef ValueType reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::any_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + +private: + + ConversionOp conversion_op; + InputIterator input_itr; + +public: + + /// Constructor + __host__ __device__ __forceinline__ TransformInputIterator( + InputIterator input_itr, ///< Input iterator to wrap + ConversionOp conversion_op) ///< Conversion functor to wrap + : + conversion_op(conversion_op), + input_itr(input_itr) + {} + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + input_itr++; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + input_itr++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ reference operator*() const + { + return conversion_op(*input_itr); + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval(input_itr + n, conversion_op); + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + input_itr += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval(input_itr - n, conversion_op); + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) + { + input_itr -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return input_itr - other.input_itr; + } + + /// Array subscript + template + __host__ __device__ __forceinline__ reference operator[](Distance n) const + { + return conversion_op(input_itr[n]); + } + + /// Structure dereference + __host__ __device__ __forceinline__ pointer operator->() + { + return &conversion_op(*input_itr); + } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return (input_itr == rhs.input_itr); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return (input_itr != rhs.input_itr); + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& itr) + { + return os; + } +}; + + + +/** @} */ 
// end group UtilIterator + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/thread/thread_load.cuh b/SRC/cub/thread/thread_load.cuh new file mode 100644 index 00000000..8e3790f5 --- /dev/null +++ b/SRC/cub/thread/thread_load.cuh @@ -0,0 +1,444 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Thread utilities for reading memory using PTX cache modifiers. + */ + +#pragma once + +#include + +#include + +#include "../util_ptx.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilIo + * @{ + */ + +//----------------------------------------------------------------------------- +// Tags and constants +//----------------------------------------------------------------------------- + +/** + * \brief Enumeration of cache modifiers for memory load operations. + */ +enum CacheLoadModifier +{ + LOAD_DEFAULT, ///< Default (no modifier) + LOAD_CA, ///< Cache at all levels + LOAD_CG, ///< Cache at global level + LOAD_CS, ///< Cache streaming (likely to be accessed once) + LOAD_CV, ///< Cache as volatile (including cached system lines) + LOAD_LDG, ///< Cache as texture + LOAD_VOLATILE, ///< Volatile (any memory space) +}; + + +/** + * \name Thread I/O (cache modified) + * @{ + */ + +/** + * \brief Thread utility for reading memory using cub::CacheLoadModifier cache modifiers. Can be used to load any data type. 
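+ *
+ * \par
+ * [Editorial note, not part of the original header] As defined later in this
+ * file, LOAD_LDG lowers to ld.global.nc only when CUB_PTX_ARCH >= 350 and
+ * otherwise falls back to a plain global load, so for instance:
+ * \code
+ * double v = cub::ThreadLoad<cub::LOAD_LDG>(d_in + threadIdx.x);   // read-only (texture) path on sm_35+
+ * \endcode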
+ * + * \par Example + * \code + * #include // or equivalently + * + * // 32-bit load using cache-global modifier: + * int *d_in; + * int val = cub::ThreadLoad(d_in + threadIdx.x); + * + * // 16-bit load using default modifier + * short *d_in; + * short val = cub::ThreadLoad(d_in + threadIdx.x); + * + * // 256-bit load using cache-volatile modifier + * double4 *d_in; + * double4 val = cub::ThreadLoad(d_in + threadIdx.x); + * + * // 96-bit load using cache-streaming modifier + * struct TestFoo { bool a; short b; }; + * TestFoo *d_struct; + * TestFoo val = cub::ThreadLoad(d_in + threadIdx.x); + * \endcode + * + * \tparam MODIFIER [inferred] CacheLoadModifier enumeration + * \tparam InputIterator [inferred] Input iterator type \iterator + */ +template < + CacheLoadModifier MODIFIER, + typename InputIterator> +__device__ __forceinline__ typename std::iterator_traits::value_type ThreadLoad(InputIterator itr); + + +//@} end member group + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + +/// Helper structure for templated load iteration (inductive case) +template +struct IterateThreadLoad +{ + template + static __device__ __forceinline__ void Load(T *ptr, T *vals) + { + vals[COUNT] = ThreadLoad(ptr + COUNT); + IterateThreadLoad::template Load(ptr, vals); + } + + template + static __device__ __forceinline__ void Dereference(InputIterator ptr, T *vals) + { + vals[COUNT] = ptr[COUNT]; + IterateThreadLoad::Dereference(ptr, vals); + } +}; + + +/// Helper structure for templated load iteration (termination case) +template +struct IterateThreadLoad +{ + template + static __device__ __forceinline__ void Load(T *ptr, T *vals) {} + + template + static __device__ __forceinline__ void Dereference(InputIterator ptr, T *vals) {} +}; + + +/** + * Define a uint4 (16B) ThreadLoad specialization for the given Cache load modifier + */ +#define CUB_LOAD_16(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ uint4 ThreadLoad(uint4* ptr) \ + { \ + uint4 retval; \ + asm volatile ("ld."#ptx_modifier".v4.u32 {%0, %1, %2, %3}, [%4];" : \ + "=r"(retval.x), \ + "=r"(retval.y), \ + "=r"(retval.z), \ + "=r"(retval.w) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } \ + template<> \ + __device__ __forceinline__ ulonglong2 ThreadLoad(ulonglong2* ptr) \ + { \ + ulonglong2 retval; \ + asm volatile ("ld."#ptx_modifier".v2.u64 {%0, %1}, [%2];" : \ + "=l"(retval.x), \ + "=l"(retval.y) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } + +/** + * Define a uint2 (8B) ThreadLoad specialization for the given Cache load modifier + */ +#define CUB_LOAD_8(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ ushort4 ThreadLoad(ushort4* ptr) \ + { \ + ushort4 retval; \ + asm volatile ("ld."#ptx_modifier".v4.u16 {%0, %1, %2, %3}, [%4];" : \ + "=h"(retval.x), \ + "=h"(retval.y), \ + "=h"(retval.z), \ + "=h"(retval.w) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } \ + template<> \ + __device__ __forceinline__ uint2 ThreadLoad(uint2* ptr) \ + { \ + uint2 retval; \ + asm volatile ("ld."#ptx_modifier".v2.u32 {%0, %1}, [%2];" : \ + "=r"(retval.x), \ + "=r"(retval.y) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } \ + template<> \ + __device__ __forceinline__ unsigned long long ThreadLoad(unsigned long long* ptr) \ + { \ + unsigned long long retval; \ + asm volatile ("ld."#ptx_modifier".u64 %0, [%1];" : \ + "=l"(retval) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } + +/** + * Define a uint (4B) ThreadLoad specialization for the given Cache load modifier + */ +#define 
CUB_LOAD_4(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ unsigned int ThreadLoad(unsigned int* ptr) \ + { \ + unsigned int retval; \ + asm volatile ("ld."#ptx_modifier".u32 %0, [%1];" : \ + "=r"(retval) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } + + +/** + * Define a unsigned short (2B) ThreadLoad specialization for the given Cache load modifier + */ +#define CUB_LOAD_2(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ unsigned short ThreadLoad(unsigned short* ptr) \ + { \ + unsigned short retval; \ + asm volatile ("ld."#ptx_modifier".u16 %0, [%1];" : \ + "=h"(retval) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } + + +/** + * Define an unsigned char (1B) ThreadLoad specialization for the given Cache load modifier + */ +#define CUB_LOAD_1(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ unsigned char ThreadLoad(unsigned char* ptr) \ + { \ + unsigned short retval; \ + asm volatile ( \ + "{" \ + " .reg .u8 datum;" \ + " ld."#ptx_modifier".u8 datum, [%1];" \ + " cvt.u16.u8 %0, datum;" \ + "}" : \ + "=h"(retval) : \ + _CUB_ASM_PTR_(ptr)); \ + return (unsigned char) retval; \ + } + + +/** + * Define powers-of-two ThreadLoad specializations for the given Cache load modifier + */ +#define CUB_LOAD_ALL(cub_modifier, ptx_modifier) \ + CUB_LOAD_16(cub_modifier, ptx_modifier) \ + CUB_LOAD_8(cub_modifier, ptx_modifier) \ + CUB_LOAD_4(cub_modifier, ptx_modifier) \ + CUB_LOAD_2(cub_modifier, ptx_modifier) \ + CUB_LOAD_1(cub_modifier, ptx_modifier) \ + + +/** + * Define powers-of-two ThreadLoad specializations for the various Cache load modifiers + */ +#if CUB_PTX_ARCH >= 200 + CUB_LOAD_ALL(LOAD_CA, ca) + CUB_LOAD_ALL(LOAD_CG, cg) + CUB_LOAD_ALL(LOAD_CS, cs) + CUB_LOAD_ALL(LOAD_CV, cv) +#else + CUB_LOAD_ALL(LOAD_CA, global) + // Use volatile to ensure coherent reads when this PTX is JIT'd to run on newer architectures with L1 + CUB_LOAD_ALL(LOAD_CG, volatile.global) + CUB_LOAD_ALL(LOAD_CS, global) + CUB_LOAD_ALL(LOAD_CV, volatile.global) +#endif + +#if CUB_PTX_ARCH >= 350 + CUB_LOAD_ALL(LOAD_LDG, global.nc) +#else + CUB_LOAD_ALL(LOAD_LDG, global) +#endif + + +/** + * ThreadLoad definition for LOAD_DEFAULT modifier on iterator types + */ +template +__device__ __forceinline__ typename std::iterator_traits::value_type ThreadLoad( + InputIterator itr, + Int2Type modifier, + Int2Type is_pointer) +{ + return *itr; +} + + +/** + * ThreadLoad definition for LOAD_DEFAULT modifier on pointer types + */ +template +__device__ __forceinline__ T ThreadLoad( + T *ptr, + Int2Type modifier, + Int2Type is_pointer) +{ + return *ptr; +} + + +/** + * ThreadLoad definition for LOAD_VOLATILE modifier on primitive pointer types + */ +template +__device__ __forceinline__ T ThreadLoadVolatilePointer( + T *ptr, + Int2Type is_primitive) +{ + T retval = *reinterpret_cast(ptr); + +#if (CUB_PTX_ARCH <= 130) + if (sizeof(T) == 1) __threadfence_block(); +#endif + + return retval; +} + + +/** + * ThreadLoad definition for LOAD_VOLATILE modifier on non-primitive pointer types + */ +template +__device__ __forceinline__ T ThreadLoadVolatilePointer( + T *ptr, + Int2Type is_primitive) +{ + +#if CUB_PTX_ARCH <= 130 + + T retval = *ptr; + __threadfence_block(); + return retval; + +#else + + typedef typename UnitWord::VolatileWord VolatileWord; // Word type for memcopying + + const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord); +/* + VolatileWord words[VOLATILE_MULTIPLE]; + + IterateThreadLoad<0, VOLATILE_MULTIPLE>::Dereference( + 
reinterpret_cast(ptr), + words); + + return *reinterpret_cast(words); +*/ + + T retval; + VolatileWord *words = reinterpret_cast(&retval); + IterateThreadLoad<0, VOLATILE_MULTIPLE>::Dereference( + reinterpret_cast(ptr), + words); + return retval; + +#endif // CUB_PTX_ARCH <= 130 +} + + +/** + * ThreadLoad definition for LOAD_VOLATILE modifier on pointer types + */ +template +__device__ __forceinline__ T ThreadLoad( + T *ptr, + Int2Type modifier, + Int2Type is_pointer) +{ + // Apply tags for partial-specialization + return ThreadLoadVolatilePointer(ptr, Int2Type::PRIMITIVE>()); +} + + +/** + * ThreadLoad definition for generic modifiers on pointer types + */ +template +__device__ __forceinline__ T ThreadLoad( + T *ptr, + Int2Type modifier, + Int2Type is_pointer) +{ + typedef typename UnitWord::DeviceWord DeviceWord; + + const int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord); + + DeviceWord words[DEVICE_MULTIPLE]; + + IterateThreadLoad<0, DEVICE_MULTIPLE>::template Load( + reinterpret_cast(ptr), + words); + + return *reinterpret_cast(words); +} + + +/** + * ThreadLoad definition for generic modifiers + */ +template < + CacheLoadModifier MODIFIER, + typename InputIterator> +__device__ __forceinline__ typename std::iterator_traits::value_type ThreadLoad(InputIterator itr) +{ + // Apply tags for partial-specialization + return ThreadLoad( + itr, + Int2Type(), + Int2Type::VALUE>()); +} + + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/** @} */ // end group UtilIo + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/thread/thread_operators.cuh b/SRC/cub/thread/thread_operators.cuh new file mode 100644 index 00000000..75c96273 --- /dev/null +++ b/SRC/cub/thread/thread_operators.cuh @@ -0,0 +1,206 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Simple binary operator functor types
+ */
+
+/******************************************************************************
+ * Simple functor operators
+ ******************************************************************************/
+
+#pragma once
+
+#include "../util_macro.cuh"
+#include "../util_type.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup UtilModule
+ * @{
+ */
+
+/**
+ * \brief Default equality functor
+ */
+struct Equality
+{
+    /// Boolean equality operator, returns (a == b)
+    template <typename T>
+    __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const
+    {
+        return a == b;
+    }
+};
+
+
+/**
+ * \brief Default inequality functor
+ */
+struct Inequality
+{
+    /// Boolean inequality operator, returns (a != b)
+    template <typename T>
+    __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const
+    {
+        return a != b;
+    }
+};
+
+
+/**
+ * \brief Inequality functor (wraps equality functor)
+ */
+template <typename EqualityOp>
+struct InequalityWrapper
+{
+    /// Wrapped equality operator
+    EqualityOp op;
+
+    /// Constructor
+    __host__ __device__ __forceinline__
+    InequalityWrapper(EqualityOp op) : op(op) {}
+
+    /// Boolean inequality operator, returns (a != b)
+    template <typename T>
+    __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const
+    {
+        return !op(a, b);
+    }
+};
+
+
+/**
+ * \brief Default sum functor
+ */
+struct Sum
+{
+    /// Binary sum operator, returns a + b
+    template <typename T>
+    __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const
+    {
+        return a + b;
+    }
+};
+
+
+/**
+ * \brief Default max functor
+ */
+struct Max
+{
+    /// Binary max operator, returns (a > b) ? a : b
+    template <typename T>
+    __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const
+    {
+        return CUB_MAX(a, b);
+    }
+};
+
+
+/**
+ * \brief Arg max functor (keeps the value and offset of the first occurrence of the largest item)
+ */
+struct ArgMax
+{
+    /// Binary max operator, preferring the item having the smaller offset in case of ties
+    template <typename T, typename Offset>
+    __host__ __device__ __forceinline__ ItemOffsetPair<T, Offset> operator()(
+        const ItemOffsetPair<T, Offset> &a,
+        const ItemOffsetPair<T, Offset> &b) const
+    {
+        if (a.value == b.value)
+            return (b.offset < a.offset) ? b : a;
+
+        return (b.value > a.value) ? b : a;
+    }
+};
+
+
+/**
+ * \brief Default min functor
+ */
+struct Min
+{
+    /// Binary min operator, returns (a < b) ? a : b
+    template <typename T>
+    __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const
+    {
+        return CUB_MIN(a, b);
+    }
+};
+
+
+/**
+ * \brief Arg min functor (keeps the value and offset of the first occurrence of the smallest item)
+ */
+struct ArgMin
+{
+    /// Binary min operator, preferring the item having the smaller offset in case of ties
+    template <typename T, typename Offset>
+    __host__ __device__ __forceinline__ ItemOffsetPair<T, Offset> operator()(
+        const ItemOffsetPair<T, Offset> &a,
+        const ItemOffsetPair<T, Offset> &b) const
+    {
+        if (a.value == b.value)
+            return (b.offset < a.offset) ? b : a;
+
+        return (b.value < a.value) ? b : a;
+    }
+};
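+
+/* [Editorial sketch, not part of the original header] Selecting an argmax
+ * over a thread-local array with cub::ArgMax.  The field names .value and
+ * .offset follow ItemOffsetPair as used above; N and data are assumptions:
+ *
+ *     ItemOffsetPair<float, int> best;
+ *     best.value  = data[0];
+ *     best.offset = 0;
+ *     ArgMax argmax_op;
+ *     for (int i = 1; i < N; ++i)
+ *     {
+ *         ItemOffsetPair<float, int> cand;
+ *         cand.value  = data[i];
+ *         cand.offset = i;
+ *         best = argmax_op(best, cand);   // earlier offset wins ties
+ *     }
+ */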
+
+/**
+ * \brief Default cast functor
+ */
+template <typename B>
+struct Cast
+{
+    /// Cast operator, returns (B) a
+    template <typename A>
+    __host__ __device__ __forceinline__ B operator()(const A &a) const
+    {
+        return (B) a;
+    }
+};
+
+
+
+/** @} */       // end group UtilModule
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/SRC/cub/thread/thread_reduce.cuh b/SRC/cub/thread/thread_reduce.cuh
new file mode 100644
index 00000000..29bc8ce0
--- /dev/null
+++ b/SRC/cub/thread/thread_reduce.cuh
@@ -0,0 +1,169 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Thread utilities for sequential reduction over statically-sized array types
+ */
+
+#pragma once
+
+#include "../thread/thread_operators.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \addtogroup UtilModule
+ * @{
+ */
+
+/**
+ * \name Sequential reduction over statically-sized array types
+ * @{
+ */
+
+
+template <
+    int         LENGTH,
+    typename    T,
+    typename    ReductionOp>
+__device__ __forceinline__ T ThreadReduce(
+    T*                  input,          ///< [in] Input array
+    ReductionOp         reduction_op,   ///< [in] Binary reduction operator
+    T                   prefix,         ///< [in] Prefix to seed reduction with
+    Int2Type<LENGTH>    length)
+{
+    T addend = *input;
+    prefix = reduction_op(prefix, addend);
+
+    return ThreadReduce(input + 1, reduction_op, prefix, Int2Type<LENGTH - 1>());
+}
+
+template <
+    typename    T,
+    typename    ReductionOp>
+__device__ __forceinline__ T ThreadReduce(
+    T*                  input,          ///< [in] Input array
+    ReductionOp         reduction_op,   ///< [in] Binary reduction operator
+    T                   prefix,         ///< [in] Prefix to seed reduction with
+    Int2Type<0>         length)
+{
+    return prefix;
+}
+
+
+/**
+ * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array, seeded with the specified \p prefix.  The aggregate is returned.
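+ *
+ * \par
+ * [Editorial sketch, not part of the original header] In device code:
+ * \code
+ * int vals[4] = {1, 2, 3, 4};
+ * int sum = cub::ThreadReduce<4>(vals, cub::Sum(), 0);   // sum == 10
+ * \endcode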
+ * + * \tparam LENGTH Length of input array + * \tparam T [inferred] The data type to be reduced. + * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ReductionOp> +__device__ __forceinline__ T ThreadReduce( + T* input, ///< [in] Input array + ReductionOp reduction_op, ///< [in] Binary reduction operator + T prefix) ///< [in] Prefix to seed reduction with +{ + return ThreadReduce(input, reduction_op, prefix, Int2Type()); +} + + +/** + * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array. The aggregate is returned. + * + * \tparam LENGTH Length of input array + * \tparam T [inferred] The data type to be reduced. + * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ReductionOp> +__device__ __forceinline__ T ThreadReduce( + T* input, ///< [in] Input array + ReductionOp reduction_op) ///< [in] Binary reduction operator +{ + T prefix = input[0]; + return ThreadReduce(input + 1, reduction_op, prefix); +} + + +/** + * \brief Perform a sequential reduction over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. + * + * \tparam LENGTH [inferred] Length of \p input array + * \tparam T [inferred] The data type to be reduced. + * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ReductionOp> +__device__ __forceinline__ T ThreadReduce( + T (&input)[LENGTH], ///< [in] Input array + ReductionOp reduction_op, ///< [in] Binary reduction operator + T prefix) ///< [in] Prefix to seed reduction with +{ + return ThreadReduce(input, reduction_op, prefix); +} + + +/** + * \brief Serial reduction with the specified operator + * + * \tparam LENGTH [inferred] Length of \p input array + * \tparam T [inferred] The data type to be reduced. + * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ReductionOp> +__device__ __forceinline__ T ThreadReduce( + T (&input)[LENGTH], ///< [in] Input array + ReductionOp reduction_op) ///< [in] Binary reduction operator +{ + return ThreadReduce((T*) input, reduction_op); +} + + +//@} end member group + +/** @} */ // end group UtilModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/thread/thread_scan.cuh b/SRC/cub/thread/thread_scan.cuh new file mode 100644 index 00000000..6276bf83 --- /dev/null +++ b/SRC/cub/thread/thread_scan.cuh @@ -0,0 +1,283 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Thread utilities for sequential prefix scan over statically-sized array types + */ + +#pragma once + +#include "../thread/thread_operators.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilModule + * @{ + */ + +/** + * \name Sequential prefix scan over statically-sized array types + * @{ + */ + +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanExclusive( + T inclusive, + T exclusive, + T *input, ///< [in] Input array + T *output, ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + Int2Type length) +{ + T addend = *input; + inclusive = scan_op(exclusive, addend); + *output = exclusive; + exclusive = inclusive; + + return ThreadScanExclusive(inclusive, exclusive, input + 1, output + 1, scan_op, Int2Type()); +} + +template < + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanExclusive( + T inclusive, + T exclusive, + T *input, ///< [in] Input array + T *output, ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + Int2Type<0> length) +{ + return inclusive; +} + + +/** + * \brief Perform a sequential exclusive prefix scan over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. + * + * \tparam LENGTH Length of \p input and \p output arrays + * \tparam T [inferred] The data type to be scanned. + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanExclusive( + T *input, ///< [in] Input array + T *output, ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T prefix, ///< [in] Prefix to seed scan with + bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. If not, the first output element is undefined. (Handy for preventing thread-0 from applying a prefix.) 
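+/* [Editorial sketch, not part of the original header] For example, in device
+ * code, an exclusive sum over four items seeded with prefix 0:
+ *
+ *     int in[4] = {1, 2, 1, 3};
+ *     int out[4];
+ *     int agg = cub::ThreadScanExclusive<4>(in, out, cub::Sum(), 0);
+ *     // out == {0, 1, 3, 4}, agg == 7 (the inclusive aggregate)
+ */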
+{ + T inclusive = input[0]; + if (apply_prefix) + { + inclusive = scan_op(prefix, inclusive); + } + output[0] = prefix; + T exclusive = inclusive; + + return ThreadScanExclusive(inclusive, exclusive, input + 1, output + 1, scan_op, Int2Type()); +} + + +/** + * \brief Perform a sequential exclusive prefix scan over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. + * + * \tparam LENGTH [inferred] Length of \p input and \p output arrays + * \tparam T [inferred] The data type to be scanned. + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanExclusive( + T (&input)[LENGTH], ///< [in] Input array + T (&output)[LENGTH], ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T prefix, ///< [in] Prefix to seed scan with + bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. (Handy for preventing thread-0 from applying a prefix.) +{ + return ThreadScanExclusive((T*) input, (T*) output, scan_op, prefix, apply_prefix); +} + + + + + + + + + +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanInclusive( + T inclusive, + T *input, ///< [in] Input array + T *output, ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + Int2Type length) +{ + T addend = *input; + inclusive = scan_op(inclusive, addend); + output[0] = inclusive; + + return ThreadScanInclusive(inclusive, input + 1, output + 1, scan_op, Int2Type()); +} + +template < + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanInclusive( + T inclusive, + T *input, ///< [in] Input array + T *output, ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + Int2Type<0> length) +{ + return inclusive; +} + + +/** + * \brief Perform a sequential inclusive prefix scan over \p LENGTH elements of the \p input array. The aggregate is returned. + * + * \tparam LENGTH Length of \p input and \p output arrays + * \tparam T [inferred] The data type to be scanned. + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanInclusive( + T *input, ///< [in] Input array + T *output, ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator +{ + T inclusive = input[0]; + output[0] = inclusive; + + // Continue scan + return ThreadScanInclusive(inclusive, input + 1, output + 1, scan_op, Int2Type()); +} + + +/** + * \brief Perform a sequential inclusive prefix scan over the statically-sized \p input array. The aggregate is returned. + * + * \tparam LENGTH [inferred] Length of \p input and \p output arrays + * \tparam T [inferred] The data type to be scanned. 
+ * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanInclusive( + T (&input)[LENGTH], ///< [in] Input array + T (&output)[LENGTH], ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator +{ + return ThreadScanInclusive((T*) input, (T*) output, scan_op); +} + + +/** + * \brief Perform a sequential inclusive prefix scan over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. + * + * \tparam LENGTH Length of \p input and \p output arrays + * \tparam T [inferred] The data type to be scanned. + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanInclusive( + T *input, ///< [in] Input array + T *output, ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T prefix, ///< [in] Prefix to seed scan with + bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. (Handy for preventing thread-0 from applying a prefix.) +{ + T inclusive = input[0]; + if (apply_prefix) + { + inclusive = scan_op(prefix, inclusive); + } + output[0] = inclusive; + + // Continue scan + return ThreadScanInclusive(inclusive, input + 1, output + 1, scan_op, Int2Type()); +} + + +/** + * \brief Perform a sequential inclusive prefix scan over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. + * + * \tparam LENGTH [inferred] Length of \p input and \p output arrays + * \tparam T [inferred] The data type to be scanned. + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanInclusive( + T (&input)[LENGTH], ///< [in] Input array + T (&output)[LENGTH], ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T prefix, ///< [in] Prefix to seed scan with + bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. (Handy for preventing thread-0 from applying a prefix.) +{ + return ThreadScanInclusive((T*) input, (T*) output, scan_op, prefix, apply_prefix); +} + + +//@} end member group + +/** @} */ // end group UtilModule + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/thread/thread_store.cuh b/SRC/cub/thread/thread_store.cuh new file mode 100644 index 00000000..6d036d42 --- /dev/null +++ b/SRC/cub/thread/thread_store.cuh @@ -0,0 +1,414 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Thread utilities for writing memory using PTX cache modifiers. + */ + +#pragma once + +#include + +#include "../util_ptx.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilIo + * @{ + */ + + +//----------------------------------------------------------------------------- +// Tags and constants +//----------------------------------------------------------------------------- + +/** + * \brief Enumeration of cache modifiers for memory store operations. + */ +enum CacheStoreModifier +{ + STORE_DEFAULT, ///< Default (no modifier) + STORE_WB, ///< Cache write-back all coherent levels + STORE_CG, ///< Cache at global level + STORE_CS, ///< Cache streaming (likely to be accessed once) + STORE_WT, ///< Cache write-through (to system memory) + STORE_VOLATILE, ///< Volatile shared (any memory space) +}; + + +/** + * \name Thread I/O (cache modified) + * @{ + */ + +/** + * \brief Thread utility for writing memory using cub::CacheStoreModifier cache modifiers. Can be used to store any data type. 
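+ *
+ * \par
+ * [Editorial note, not part of the original header] ThreadStore pairs
+ * naturally with cub::ThreadLoad when both sides of a copy should use the
+ * same cache policy, e.g. streaming through the global cache level only:
+ * \code
+ * int v = cub::ThreadLoad<cub::LOAD_CG>(d_in + threadIdx.x);
+ * cub::ThreadStore<cub::STORE_CG>(d_out + threadIdx.x, v);
+ * \endcode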
+ * + * \par Example + * \code + * #include // or equivalently + * + * // 32-bit store using cache-global modifier: + * int *d_out; + * int val; + * cub::ThreadStore(d_out + threadIdx.x, val); + * + * // 16-bit store using default modifier + * short *d_out; + * short val; + * cub::ThreadStore(d_out + threadIdx.x, val); + * + * // 256-bit store using write-through modifier + * double4 *d_out; + * double4 val; + * cub::ThreadStore(d_out + threadIdx.x, val); + * + * // 96-bit store using cache-streaming cache modifier + * struct TestFoo { bool a; short b; }; + * TestFoo *d_struct; + * TestFoo val; + * cub::ThreadStore(d_out + threadIdx.x, val); + * \endcode + * + * \tparam MODIFIER [inferred] CacheStoreModifier enumeration + * \tparam InputIterator [inferred] Output iterator type \iterator + * \tparam T [inferred] Data type of output value + */ +template < + CacheStoreModifier MODIFIER, + typename OutputIterator, + typename T> +__device__ __forceinline__ void ThreadStore(OutputIterator itr, T val); + + +//@} end member group + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + +/// Helper structure for templated store iteration (inductive case) +template +struct IterateThreadStore +{ + template + static __device__ __forceinline__ void Store(T *ptr, T *vals) + { + ThreadStore(ptr + COUNT, vals[COUNT]); + IterateThreadStore::template Store(ptr, vals); + } + + template + static __device__ __forceinline__ void Dereference(OutputIterator ptr, T *vals) + { + ptr[COUNT] = vals[COUNT]; + IterateThreadStore::Dereference(ptr, vals); + } + +}; + +/// Helper structure for templated store iteration (termination case) +template +struct IterateThreadStore +{ + template + static __device__ __forceinline__ void Store(T *ptr, T *vals) {} + + template + static __device__ __forceinline__ void Dereference(OutputIterator ptr, T *vals) {} +}; + + +/** + * Define a uint4 (16B) ThreadStore specialization for the given Cache load modifier + */ +#define CUB_STORE_16(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ void ThreadStore(uint4* ptr, uint4 val) \ + { \ + asm volatile ("st."#ptx_modifier".v4.u32 [%0], {%1, %2, %3, %4};" : : \ + _CUB_ASM_PTR_(ptr), \ + "r"(val.x), \ + "r"(val.y), \ + "r"(val.z), \ + "r"(val.w)); \ + } \ + template<> \ + __device__ __forceinline__ void ThreadStore(ulonglong2* ptr, ulonglong2 val) \ + { \ + asm volatile ("st."#ptx_modifier".v2.u64 [%0], {%1, %2};" : : \ + _CUB_ASM_PTR_(ptr), \ + "l"(val.x), \ + "l"(val.y)); \ + } + + +/** + * Define a uint2 (8B) ThreadStore specialization for the given Cache load modifier + */ +#define CUB_STORE_8(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ void ThreadStore(ushort4* ptr, ushort4 val) \ + { \ + asm volatile ("st."#ptx_modifier".v4.u16 [%0], {%1, %2, %3, %4};" : : \ + _CUB_ASM_PTR_(ptr), \ + "h"(val.x), \ + "h"(val.y), \ + "h"(val.z), \ + "h"(val.w)); \ + } \ + template<> \ + __device__ __forceinline__ void ThreadStore(uint2* ptr, uint2 val) \ + { \ + asm volatile ("st."#ptx_modifier".v2.u32 [%0], {%1, %2};" : : \ + _CUB_ASM_PTR_(ptr), \ + "r"(val.x), \ + "r"(val.y)); \ + } \ + template<> \ + __device__ __forceinline__ void ThreadStore(unsigned long long* ptr, unsigned long long val) \ + { \ + asm volatile ("st."#ptx_modifier".u64 [%0], %1;" : : \ + _CUB_ASM_PTR_(ptr), \ + "l"(val)); \ + } + +/** + * Define a unsigned int (4B) ThreadStore specialization for the given Cache load modifier + */ +#define CUB_STORE_4(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ 
void ThreadStore(unsigned int* ptr, unsigned int val) \ + { \ + asm volatile ("st."#ptx_modifier".u32 [%0], %1;" : : \ + _CUB_ASM_PTR_(ptr), \ + "r"(val)); \ + } + + +/** + * Define a unsigned short (2B) ThreadStore specialization for the given Cache load modifier + */ +#define CUB_STORE_2(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ void ThreadStore(unsigned short* ptr, unsigned short val) \ + { \ + asm volatile ("st."#ptx_modifier".u16 [%0], %1;" : : \ + _CUB_ASM_PTR_(ptr), \ + "h"(val)); \ + } + + +/** + * Define a unsigned char (1B) ThreadStore specialization for the given Cache load modifier + */ +#define CUB_STORE_1(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ void ThreadStore(unsigned char* ptr, unsigned char val) \ + { \ + asm volatile ( \ + "{" \ + " .reg .u8 datum;" \ + " cvt.u8.u16 datum, %1;" \ + " st."#ptx_modifier".u8 [%0], datum;" \ + "}" : : \ + _CUB_ASM_PTR_(ptr), \ + "h"((unsigned short) val)); \ + } + +/** + * Define powers-of-two ThreadStore specializations for the given Cache load modifier + */ +#define CUB_STORE_ALL(cub_modifier, ptx_modifier) \ + CUB_STORE_16(cub_modifier, ptx_modifier) \ + CUB_STORE_8(cub_modifier, ptx_modifier) \ + CUB_STORE_4(cub_modifier, ptx_modifier) \ + CUB_STORE_2(cub_modifier, ptx_modifier) \ + CUB_STORE_1(cub_modifier, ptx_modifier) \ + + +/** + * Define ThreadStore specializations for the various Cache load modifiers + */ +#if CUB_PTX_ARCH >= 200 + CUB_STORE_ALL(STORE_WB, ca) + CUB_STORE_ALL(STORE_CG, cg) + CUB_STORE_ALL(STORE_CS, cs) + CUB_STORE_ALL(STORE_WT, wt) +#else + CUB_STORE_ALL(STORE_WB, global) + CUB_STORE_ALL(STORE_CG, global) + CUB_STORE_ALL(STORE_CS, global) + CUB_STORE_ALL(STORE_WT, volatile.global) +#endif + + +/** + * ThreadStore definition for STORE_DEFAULT modifier on iterator types + */ +template +__device__ __forceinline__ void ThreadStore( + OutputIterator itr, + T val, + Int2Type modifier, + Int2Type is_pointer) +{ + *itr = val; +} + + +/** + * ThreadStore definition for STORE_DEFAULT modifier on pointer types + */ +template +__device__ __forceinline__ void ThreadStore( + T *ptr, + T val, + Int2Type modifier, + Int2Type is_pointer) +{ + *ptr = val; +} + + +/** + * ThreadStore definition for STORE_VOLATILE modifier on primitive pointer types + */ +template +__device__ __forceinline__ void ThreadStoreVolatilePtr( + T *ptr, + T val, + Int2Type is_primitive) +{ + *reinterpret_cast(ptr) = val; +} + + +/** + * ThreadStore definition for STORE_VOLATILE modifier on non-primitive pointer types + */ +template +__device__ __forceinline__ void ThreadStoreVolatilePtr( + T *ptr, + T val, + Int2Type is_primitive) +{ +#if CUB_PTX_ARCH <= 130 + + *ptr = val; + __threadfence_block(); + +#else + + typedef typename UnitWord::VolatileWord VolatileWord; // Word type for memcopying + + const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord); + + VolatileWord words[VOLATILE_MULTIPLE]; + *reinterpret_cast(words) = val; + +// VolatileWord *words = reinterpret_cast(&val); + + IterateThreadStore<0, VOLATILE_MULTIPLE>::template Dereference( + reinterpret_cast(ptr), + words); + +#endif // CUB_PTX_ARCH <= 130 + +} + + +/** + * ThreadStore definition for STORE_VOLATILE modifier on pointer types + */ +template +__device__ __forceinline__ void ThreadStore( + T *ptr, + T val, + Int2Type modifier, + Int2Type is_pointer) +{ + ThreadStoreVolatilePtr(ptr, val, Int2Type::PRIMITIVE>()); +} + + +/** + * ThreadStore definition for generic modifiers on pointer types + */ +template +__device__ 
__forceinline__ void ThreadStore( + T *ptr, + T val, + Int2Type modifier, + Int2Type is_pointer) +{ + typedef typename UnitWord::DeviceWord DeviceWord; // Word type for memcopying + + const int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord); + + DeviceWord words[DEVICE_MULTIPLE]; + + *reinterpret_cast(words) = val; + + IterateThreadStore<0, DEVICE_MULTIPLE>::template Store( + reinterpret_cast(ptr), + words); +} + + +/** + * ThreadStore definition for generic modifiers + */ +template +__device__ __forceinline__ void ThreadStore(OutputIterator itr, T val) +{ + ThreadStore( + itr, + val, + Int2Type(), + Int2Type::VALUE>()); +} + + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/** @} */ // end group UtilIo + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/util_allocator.cuh b/SRC/cub/util_allocator.cuh new file mode 100644 index 00000000..9e4b1ff6 --- /dev/null +++ b/SRC/cub/util_allocator.cuh @@ -0,0 +1,664 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/****************************************************************************** + * Simple caching allocator for device memory allocations. The allocator is + * thread-safe and capable of managing device allocations on multiple devices. 
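+ *
+ * [Editorial sketch, not part of the original source] Typical host-side use,
+ * with an assumed argument order for DeviceFree:
+ *
+ *     CachingDeviceAllocator allocator;               // default bins: 512B .. 2MB
+ *     void *d_scratch;
+ *     allocator.DeviceAllocate(&d_scratch, 4096, 0);  // rounds to the 8^4 = 4KB bin on device 0
+ *     // ... use d_scratch in kernels ...
+ *     allocator.DeviceFree(d_scratch, 0);             // block returns to the bin cache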
+ ******************************************************************************/ + +#pragma once + +#if (CUB_PTX_ARCH == 0) + #include // NVCC (EDG, really) takes FOREVER to compile std::map + #include +#endif + +#include + +#include "util_namespace.cuh" +#include "util_debug.cuh" + +#include "host/spinlock.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilMgmt + * @{ + */ + + +/****************************************************************************** + * CachingDeviceAllocator (host use) + ******************************************************************************/ + +/** + * \brief A simple caching allocator for device memory allocations. + * + * \par Overview + * The allocator is thread-safe and is capable of managing cached device allocations + * on multiple devices. It behaves as follows: + * + * \par + * - Allocations categorized by bin size. + * - Bin sizes progress geometrically in accordance with the growth factor + * \p bin_growth provided during construction. Unused device allocations within + * a larger bin cache are not reused for allocation requests that categorize to + * smaller bin sizes. + * - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to + * (\p bin_growth ^ \p min_bin). + * - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest + * bin and are simply freed when they are deallocated instead of being returned + * to a bin-cache. + * - %If the total storage of cached allocations on a given device will exceed + * \p max_cached_bytes, allocations for that device are simply freed when they are + * deallocated instead of being returned to their bin-cache. + * + * \par + * For example, the default-constructed CachingDeviceAllocator is configured with: + * - \p bin_growth = 8 + * - \p min_bin = 3 + * - \p max_bin = 7 + * - \p max_cached_bytes = 6MB - 1B + * + * \par + * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB + * and sets a maximum of 6,291,455 cached bytes per device + * + */ +struct CachingDeviceAllocator +{ +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + + //--------------------------------------------------------------------- + // Type definitions and constants + //--------------------------------------------------------------------- + + enum + { + /// Invalid device ordinal + INVALID_DEVICE_ORDINAL = -1, + }; + + /** + * Integer pow function for unsigned base and exponent + */ + static unsigned int IntPow( + unsigned int base, + unsigned int exp) + { + unsigned int retval = 1; + while (exp > 0) + { + if (exp & 1) { + retval = retval * base; // multiply the result by the current base + } + base = base * base; // square the base + exp = exp >> 1; // divide the exponent in half + } + return retval; + } + + + /** + * Round up to the nearest power-of + */ + static void NearestPowerOf( + unsigned int &power, + size_t &rounded_bytes, + unsigned int base, + size_t value) + { + power = 0; + rounded_bytes = 1; + + while (rounded_bytes < value) + { + rounded_bytes *= base; + power++; + } + } + + /** + * Descriptor for device memory allocations + */ + struct BlockDescriptor + { + int device; // device ordinal + void* d_ptr; // Device pointer + size_t bytes; // Size of allocation in bytes + unsigned int bin; // Bin enumeration + + // Constructor + BlockDescriptor(void *d_ptr, int device) : + d_ptr(d_ptr), + bytes(0), + bin(0), + device(device) {} + + // Constructor + BlockDescriptor(size_t bytes, unsigned int bin, int 
device) :
+            d_ptr(NULL),
+            bytes(bytes),
+            bin(bin),
+            device(device) {}
+
+        // Comparison functor for comparing device pointers
+        static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b)
+        {
+            if (a.device < b.device) {
+                return true;
+            } else if (a.device > b.device) {
+                return false;
+            } else {
+                return (a.d_ptr < b.d_ptr);
+            }
+        }
+
+        // Comparison functor for comparing allocation sizes
+        static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b)
+        {
+            if (a.device < b.device) {
+                return true;
+            } else if (a.device > b.device) {
+                return false;
+            } else {
+                return (a.bytes < b.bytes);
+            }
+        }
+    };
+
+    /// BlockDescriptor comparator function interface
+    typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &);
+
+#if (CUB_PTX_ARCH == 0)     // Only define STL container members in host code
+
+    /// Set type for cached blocks (ordered by size)
+    typedef std::multiset<BlockDescriptor, Compare> CachedBlocks;
+
+    /// Set type for live blocks (ordered by ptr)
+    typedef std::multiset<BlockDescriptor, Compare> BusyBlocks;
+
+    /// Map type of device ordinals to the number of cached bytes cached by each device
+    typedef std::map<int, size_t> GpuCachedBytes;
+
+#endif // CUB_PTX_ARCH
+
+    //---------------------------------------------------------------------
+    // Fields
+    //---------------------------------------------------------------------
+
+    Spinlock        spin_lock;          /// Spinlock for thread-safety
+
+    unsigned int    bin_growth;         /// Geometric growth factor for bin-sizes
+    unsigned int    min_bin;            /// Minimum bin enumeration
+    unsigned int    max_bin;            /// Maximum bin enumeration
+
+    size_t          min_bin_bytes;      /// Minimum bin size
+    size_t          max_bin_bytes;      /// Maximum bin size
+    size_t          max_cached_bytes;   /// Maximum aggregate cached bytes per device
+
+    bool            debug;              /// Whether or not to print (de)allocation events to stdout
+    bool            skip_cleanup;       /// Whether or not to skip a call to FreeAllCached() when destructor is called.  (The CUDA runtime may have already shut down for statically declared allocators)
+
+#if (CUB_PTX_ARCH == 0)     // Only define STL container members in host code
+
+    GpuCachedBytes  cached_bytes;       /// Map of device ordinal to aggregate cached bytes on that device
+    CachedBlocks    cached_blocks;      /// Set of cached device allocations available for reuse
+    BusyBlocks      live_blocks;        /// Set of live device allocations currently in use
+
+#endif // CUB_PTX_ARCH
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+    //---------------------------------------------------------------------
+    // Methods
+    //---------------------------------------------------------------------
+
+    /**
+     * \brief Constructor.
+     */
+    CachingDeviceAllocator(
+        unsigned int    bin_growth,             ///< Geometric growth factor for bin-sizes
+        unsigned int    min_bin,                ///< Minimum bin
+        unsigned int    max_bin,                ///< Maximum bin
+        size_t          max_cached_bytes,       ///< Maximum aggregate cached bytes per device
+        bool            skip_cleanup = false)   ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called.  (Useful for preventing warnings when the allocator is declared at file/static/global scope: by the time the destructor is called on program exit, the CUDA runtime may have already shut down and freed all allocations.)
+ : + #if (CUB_PTX_ARCH == 0) // Only define STL container members in host code + cached_blocks(BlockDescriptor::SizeCompare), + live_blocks(BlockDescriptor::PtrCompare), + #endif + debug(false), + spin_lock(0), + bin_growth(bin_growth), + min_bin(min_bin), + max_bin(max_bin), + min_bin_bytes(IntPow(bin_growth, min_bin)), + max_bin_bytes(IntPow(bin_growth, max_bin)), + max_cached_bytes(max_cached_bytes) + {} + + + /** + * \brief Default constructor. + * + * Configured with: + * \par + * - \p bin_growth = 8 + * - \p min_bin = 3 + * - \p max_bin = 7 + * - \p max_cached_bytes = (\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes + * + * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB and + * sets a maximum of 6,291,455 cached bytes per device + */ + CachingDeviceAllocator( + bool skip_cleanup = false) ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called. (Useful for preventing warnings when the allocator is declared at file/static/global scope: by the time the destructor is called on program exit, the CUDA runtime may have already shut down and freed all allocations.) + : + #if (CUB_PTX_ARCH == 0) // Only define STL container members in host code + cached_blocks(BlockDescriptor::SizeCompare), + live_blocks(BlockDescriptor::PtrCompare), + #endif + skip_cleanup(skip_cleanup), + debug(false), + spin_lock(0), + bin_growth(8), + min_bin(3), + max_bin(7), + min_bin_bytes(IntPow(bin_growth, min_bin)), + max_bin_bytes(IntPow(bin_growth, max_bin)), + max_cached_bytes((max_bin_bytes * 3) - 1) + {} + + + /** + * \brief Sets the limit on the number bytes this allocator is allowed to cache per device. + */ + cudaError_t SetMaxCachedBytes( + size_t max_cached_bytes) + { + #if (CUB_PTX_ARCH > 0) + // Caching functionality only defined on host + return CubDebug(cudaErrorInvalidConfiguration); + #else + + // Lock + Lock(&spin_lock); + + this->max_cached_bytes = max_cached_bytes; + + if (debug) CubLog("New max_cached_bytes(%lld)\n", (long long) max_cached_bytes); + + // Unlock + Unlock(&spin_lock); + + return cudaSuccess; + + #endif // CUB_PTX_ARCH + } + + + /** + * \brief Provides a suitable allocation of device memory for the given size on the specified device + */ + cudaError_t DeviceAllocate( + void** d_ptr, + size_t bytes, + int device) + { + #if (CUB_PTX_ARCH > 0) + // Caching functionality only defined on host + return CubDebug(cudaErrorInvalidConfiguration); + #else + + bool locked = false; + int entrypoint_device = INVALID_DEVICE_ORDINAL; + cudaError_t error = cudaSuccess; + + // Round up to nearest bin size + unsigned int bin; + size_t bin_bytes; + NearestPowerOf(bin, bin_bytes, bin_growth, bytes); + if (bin < min_bin) { + bin = min_bin; + bin_bytes = min_bin_bytes; + } + + // Check if bin is greater than our maximum bin + if (bin > max_bin) + { + // Allocate the request exactly and give out-of-range bin + bin = (unsigned int) -1; + bin_bytes = bytes; + } + + BlockDescriptor search_key(bin_bytes, bin, device); + + // Lock + if (!locked) { + Lock(&spin_lock); + locked = true; + } + + do { + // Find a free block big enough within the same bin on the same device + CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key); + if ((block_itr != cached_blocks.end()) && + (block_itr->device == device) && + (block_itr->bin == search_key.bin)) + { + // Reuse existing cache block. Insert into live blocks. 
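+                // (Editorial note, not in the original source: lower_bound on the
+                // size-ordered multiset returns the first cached block no smaller
+                // than search_key; the device and bin tests above ensure we only
+                // reuse an exact bin match on the same device.)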
+                search_key = *block_itr;
+                live_blocks.insert(search_key);
+
+                // Remove from free blocks
+                cached_blocks.erase(block_itr);
+                cached_bytes[device] -= search_key.bytes;
+
+                if (debug) CubLog("\tdevice %d reused cached block (%lld bytes). %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
+                    device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());
+            }
+            else
+            {
+                // Need to allocate a new cache block. Unlock.
+                if (locked) {
+                    Unlock(&spin_lock);
+                    locked = false;
+                }
+
+                // Set to specified device
+                if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
+                if (CubDebug(error = cudaSetDevice(device))) break;
+
+                // Allocate
+                if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) break;
+
+                // Lock
+                if (!locked) {
+                    Lock(&spin_lock);
+                    locked = true;
+                }
+
+                // Insert into live blocks
+                live_blocks.insert(search_key);
+
+                if (debug) CubLog("\tdevice %d allocating new device block %lld bytes. %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
+                    device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());
+            }
+        } while(0);
+
+        // Unlock
+        if (locked) {
+            Unlock(&spin_lock);
+            locked = false;
+        }
+
+        // Copy device pointer to output parameter (NULL on error)
+        *d_ptr = search_key.d_ptr;
+
+        // Attempt to revert back to previous device if necessary
+        if (entrypoint_device != INVALID_DEVICE_ORDINAL)
+        {
+            if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
+        }
+
+        return error;
+
+        #endif // CUB_PTX_ARCH
+    }
+
+
+    /**
+     * \brief Provides a suitable allocation of device memory for the given size on the current device
+     */
+    cudaError_t DeviceAllocate(
+        void** d_ptr,
+        size_t bytes)
+    {
+        #if (CUB_PTX_ARCH > 0)
+        // Caching functionality only defined on host
+        return CubDebug(cudaErrorInvalidConfiguration);
+        #else
+        cudaError_t error = cudaSuccess;
+        do {
+            int current_device;
+            if (CubDebug(error = cudaGetDevice(&current_device))) break;
+            if (CubDebug(error = DeviceAllocate(d_ptr, bytes, current_device))) break;
+        } while(0);
+
+        return error;
+
+        #endif // CUB_PTX_ARCH
+    }
+
+
+    /**
+     * \brief Frees a live allocation of device memory on the specified device, returning it to the allocator
+     */
+    cudaError_t DeviceFree(
+        void* d_ptr,
+        int device)
+    {
+        #if (CUB_PTX_ARCH > 0)
+        // Caching functionality only defined on host
+        return CubDebug(cudaErrorInvalidConfiguration);
+        #else
+
+        bool locked = false;
+        int entrypoint_device = INVALID_DEVICE_ORDINAL;
+        cudaError_t error = cudaSuccess;
+
+        BlockDescriptor search_key(d_ptr, device);
+
+        // Lock
+        if (!locked) {
+            Lock(&spin_lock);
+            locked = true;
+        }
+
+        do {
+            // Find corresponding block descriptor
+            BusyBlocks::iterator block_itr = live_blocks.find(search_key);
+            if (block_itr == live_blocks.end())
+            {
+                // Cannot find pointer
+                if (CubDebug(error = cudaErrorUnknown)) break;
+            }
+            else
+            {
+                // Remove from live blocks
+                search_key = *block_itr;
+                live_blocks.erase(block_itr);
+
+                // Check if we should keep the returned allocation
+                if (cached_bytes[device] + search_key.bytes <= max_cached_bytes)
+                {
+                    // Insert returned allocation into free blocks
+                    cached_blocks.insert(search_key);
+                    cached_bytes[device] += search_key.bytes;
+
+                    if (debug) CubLog("\tdevice %d returned %lld bytes. %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
+                        device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());
+                }
+                else
+                {
+                    // Free the returned allocation. Unlock.
+                    if (locked) {
+                        Unlock(&spin_lock);
+                        locked = false;
+                    }
+
+                    // Set to specified device
+                    if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
+                    if (CubDebug(error = cudaSetDevice(device))) break;
+
+                    // Free device memory
+                    if (CubDebug(error = cudaFree(d_ptr))) break;
+
+                    if (debug) CubLog("\tdevice %d freed %lld bytes. %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
+                        device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());
+                }
+            }
+        } while (0);
+
+        // Unlock
+        if (locked) {
+            Unlock(&spin_lock);
+            locked = false;
+        }
+
+        // Attempt to revert back to entry-point device if necessary
+        if (entrypoint_device != INVALID_DEVICE_ORDINAL)
+        {
+            if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
+        }
+
+        return error;
+
+        #endif // CUB_PTX_ARCH
+    }
+
+
+    /**
+     * \brief Frees a live allocation of device memory on the current device, returning it to the allocator
+     */
+    cudaError_t DeviceFree(
+        void* d_ptr)
+    {
+        #if (CUB_PTX_ARCH > 0)
+        // Caching functionality only defined on host
+        return CubDebug(cudaErrorInvalidConfiguration);
+        #else
+
+        int current_device;
+        cudaError_t error = cudaSuccess;
+
+        do {
+            if (CubDebug(error = cudaGetDevice(&current_device))) break;
+            if (CubDebug(error = DeviceFree(d_ptr, current_device))) break;
+        } while(0);
+
+        return error;
+
+        #endif // CUB_PTX_ARCH
+    }
+
+
+    /**
+     * \brief Frees all cached device allocations on all devices
+     */
+    cudaError_t FreeAllCached()
+    {
+        #if (CUB_PTX_ARCH > 0)
+        // Caching functionality only defined on host
+        return CubDebug(cudaErrorInvalidConfiguration);
+        #else
+
+        cudaError_t error = cudaSuccess;
+        bool locked = false;
+        int entrypoint_device = INVALID_DEVICE_ORDINAL;
+        int current_device = INVALID_DEVICE_ORDINAL;
+
+        // Lock
+        if (!locked) {
+            Lock(&spin_lock);
+            locked = true;
+        }
+
+        while (!cached_blocks.empty())
+        {
+            // Get first block
+            CachedBlocks::iterator begin = cached_blocks.begin();
+
+            // Get entry-point device ordinal if necessary
+            if (entrypoint_device == INVALID_DEVICE_ORDINAL)
+            {
+                if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
+            }
+
+            // Set current device ordinal if necessary
+            if (begin->device != current_device)
+            {
+                if (CubDebug(error = cudaSetDevice(begin->device))) break;
+                current_device = begin->device;
+            }
+
+            // Free device memory
+            if (CubDebug(error = cudaFree(begin->d_ptr))) break;
+
+            // Reduce balance and erase entry
+            cached_bytes[current_device] -= begin->bytes;
+            cached_blocks.erase(begin);
+
+            if (debug) CubLog("\tdevice %d freed %lld bytes.
%lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n", + current_device, (long long) begin->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device], (long long) live_blocks.size()); + } + + // Unlock + if (locked) { + Unlock(&spin_lock); + locked = false; + } + + // Attempt to revert back to entry-point device if necessary + if (entrypoint_device != INVALID_DEVICE_ORDINAL) + { + if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; + } + + return error; + + #endif // CUB_PTX_ARCH + } + + + /** + * \brief Destructor + */ + virtual ~CachingDeviceAllocator() + { + if (!skip_cleanup) + FreeAllCached(); + } + +}; + + + + +/** @} */ // end group UtilMgmt + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/util_arch.cuh b/SRC/cub/util_arch.cuh new file mode 100644 index 00000000..917c3606 --- /dev/null +++ b/SRC/cub/util_arch.cuh @@ -0,0 +1,197 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Static architectural properties by SM version. + */ + +#pragma once + +#include "util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilMgmt + * @{ + */ + + +/// CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host pass). +#ifndef __CUDA_ARCH__ + #define CUB_PTX_ARCH 0 +#else + #define CUB_PTX_ARCH __CUDA_ARCH__ +#endif + + +/// Whether or not the source targeted by the active compiler pass is allowed to invoke device kernels or methods from the CUDA runtime API. 
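+/// (Editorial note: when CUB_RUNTIME_ENABLED is not defined, host-side utilities
+/// such as PtxVersion, SmVersion, and MaxSmOccupancy in util_device.cuh compile
+/// to stubs that simply return cudaErrorInvalidConfiguration.)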
+#if (CUB_PTX_ARCH == 0) || defined(CUB_CDP) + #define CUB_RUNTIME_ENABLED + #define CUB_RUNTIME_FUNCTION __host__ __device__ +#else + #define CUB_RUNTIME_FUNCTION __host__ +#endif + + + +/// Number of threads per warp (log) +#define CUB_LOG_WARP_THREADS(arch) \ + (5) + +/// Number of threads per warp +#define CUB_WARP_THREADS(arch) \ + (1 << CUB_LOG_WARP_THREADS(arch)) + +/// Number of smem banks (log) +#define CUB_LOG_SMEM_BANKS(arch) \ + ((arch >= 200) ? \ + (5) : \ + (4)) + +/// Number of smem banks +#define CUB_SMEM_BANKS(arch) \ + (1 << CUB_LOG_SMEM_BANKS(arch)) + +/// Number of bytes per smem bank +#define CUB_SMEM_BANK_BYTES(arch) \ + (4) + +/// Number of smem bytes provisioned per SM +#define CUB_SMEM_BYTES(arch) \ + ((arch >= 200) ? \ + (48 * 1024) : \ + (16 * 1024)) + +/// Smem allocation size in bytes +#define CUB_SMEM_ALLOC_UNIT(arch) \ + ((arch >= 300) ? \ + (256) : \ + ((arch >= 200) ? \ + (128) : \ + (512))) + +/// Whether or not the architecture allocates registers by block (or by warp) +#define CUB_REGS_BY_BLOCK(arch) \ + ((arch >= 200) ? \ + (false) : \ + (true)) + +/// Number of registers allocated at a time per block (or by warp) +#define CUB_REG_ALLOC_UNIT(arch) \ + ((arch >= 300) ? \ + (256) : \ + ((arch >= 200) ? \ + (64) : \ + ((arch >= 120) ? \ + (512) : \ + (256)))) + +/// Granularity of warps for which registers are allocated +#define CUB_WARP_ALLOC_UNIT(arch) \ + ((arch >= 300) ? \ + (4) : \ + (2)) + +/// Maximum number of threads per SM +#define CUB_MAX_SM_THREADS(arch) \ + ((arch >= 300) ? \ + (2048) : \ + ((arch >= 200) ? \ + (1536) : \ + ((arch >= 120) ? \ + (1024) : \ + (768)))) + +/// Maximum number of thread blocks per SM +#define CUB_MAX_SM_BLOCKS(arch) \ + ((arch >= 300) ? \ + (16) : \ + (8)) + +/// Maximum number of threads per thread block +#define CUB_MAX_BLOCK_THREADS(arch) \ + ((arch >= 200) ? \ + (1024) : \ + (512)) + +/// Maximum number of registers per SM +#define CUB_MAX_SM_REGISTERS(arch) \ + ((arch >= 300) ? \ + (64 * 1024) : \ + ((arch >= 200) ? \ + (32 * 1024) : \ + ((arch >= 120) ? \ + (16 * 1024) : \ + (8 * 1024)))) + +/// Oversubscription factor +#define CUB_SUBSCRIPTION_FACTOR(arch) \ + ((arch >= 300) ? \ + (5) : \ + ((arch >= 200) ? \ + (3) : \ + (10))) + +/// Prefer padding overhead vs X-way conflicts greater than this threshold +#define CUB_PREFER_CONFLICT_OVER_PADDING(arch) \ + ((arch >= 300) ? 
\ + (1) : \ + (4)) + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +#define CUB_PTX_LOG_WARP_THREADS CUB_LOG_WARP_THREADS(CUB_PTX_ARCH) +#define CUB_PTX_WARP_THREADS CUB_WARP_THREADS(CUB_PTX_ARCH) +#define CUB_PTX_LOG_SMEM_BANKS CUB_LOG_SMEM_BANKS(CUB_PTX_ARCH) +#define CUB_PTX_SMEM_BANKS CUB_SMEM_BANKS(CUB_PTX_ARCH) +#define CUB_PTX_SMEM_BANK_BYTES CUB_SMEM_BANK_BYTES(CUB_PTX_ARCH) +#define CUB_PTX_SMEM_BYTES CUB_SMEM_BYTES(CUB_PTX_ARCH) +#define CUB_PTX_SMEM_ALLOC_UNIT CUB_SMEM_ALLOC_UNIT(CUB_PTX_ARCH) +#define CUB_PTX_REGS_BY_BLOCK CUB_REGS_BY_BLOCK(CUB_PTX_ARCH) +#define CUB_PTX_REG_ALLOC_UNIT CUB_REG_ALLOC_UNIT(CUB_PTX_ARCH) +#define CUB_PTX_WARP_ALLOC_UNIT CUB_WARP_ALLOC_UNIT(CUB_PTX_ARCH) +#define CUB_PTX_MAX_SM_THREADS CUB_MAX_SM_THREADS(CUB_PTX_ARCH) +#define CUB_PTX_MAX_SM_BLOCKS CUB_MAX_SM_BLOCKS(CUB_PTX_ARCH) +#define CUB_PTX_MAX_BLOCK_THREADS CUB_MAX_BLOCK_THREADS(CUB_PTX_ARCH) +#define CUB_PTX_MAX_SM_REGISTERS CUB_MAX_SM_REGISTERS(CUB_PTX_ARCH) +#define CUB_PTX_PREFER_CONFLICT_OVER_PADDING CUB_PREFER_CONFLICT_OVER_PADDING(CUB_PTX_ARCH) + +#endif // Do not document + + +/** @} */ // end group UtilMgmt + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/util_debug.cuh b/SRC/cub/util_debug.cuh new file mode 100644 index 00000000..375fd5e4 --- /dev/null +++ b/SRC/cub/util_debug.cuh @@ -0,0 +1,115 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Error and event logging routines. + * + * The following macros definitions are supported: + * - \p CUB_LOG. Simple event messages are printed to \p stdout. 
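+ *
+ * Editorial usage sketch (not part of the original file; \p d_ptr and
+ * \p bytes are illustrative):
+ *
+ * \code
+ * // Wrap CUDA calls to log failures (when CUB_STDERR is defined):
+ * cudaError_t error = CubDebug(cudaMalloc(&d_ptr, bytes));
+ *
+ * // Or abort the host process on failure:
+ * CubDebugExit(cudaMemset(d_ptr, 0, bytes));
+ * \endcode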
+ */ + +#pragma once + +#include +#include "util_namespace.cuh" +#include "util_arch.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilMgmt + * @{ + */ + + +/// CUB error reporting macro (prints error messages to stderr) +#if (defined(DEBUG) || defined(_DEBUG)) + #define CUB_STDERR +#endif + + + +/** + * \brief %If \p CUB_STDERR is defined and \p error is not \p cudaSuccess, the corresponding error message is printed to \p stderr (or \p stdout in device code) along with the supplied source context. + * + * \return The CUDA error. + */ +__host__ __device__ __forceinline__ cudaError_t Debug( + cudaError_t error, + const char* filename, + int line) +{ +#ifdef CUB_STDERR + if (error) + { + #if (CUB_PTX_ARCH == 0) + fprintf(stderr, "CUDA error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error)); + fflush(stderr); + #elif (CUB_PTX_ARCH >= 200) + printf("CUDA error %d [block %d, thread %d, %s, %d]\n", error, blockIdx.x, threadIdx.x, filename, line); + #endif + } +#endif + return error; +} + + +/** + * \brief Debug macro + */ +#define CubDebug(e) cub::Debug((e), __FILE__, __LINE__) + + +/** + * \brief Debug macro with exit + */ +#define CubDebugExit(e) if (cub::Debug((e), __FILE__, __LINE__)) { exit(1); } + + +/** + * \brief Log macro for printf statements. + */ +#if (CUB_PTX_ARCH == 0) + #define CubLog(format, ...) printf(format,__VA_ARGS__); +#elif (CUB_PTX_ARCH >= 200) + #define CubLog(format, ...) printf("[block %d, thread %d]: " format, blockIdx.x, threadIdx.x, __VA_ARGS__); +#endif + + + + +/** @} */ // end group UtilMgmt + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/util_device.cuh b/SRC/cub/util_device.cuh new file mode 100644 index 00000000..f3b79078 --- /dev/null +++ b/SRC/cub/util_device.cuh @@ -0,0 +1,372 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Properties of a given CUDA device and the corresponding PTX bundle
+ */
+
+#pragma once
+
+#include "util_arch.cuh"
+#include "util_debug.cuh"
+#include "util_namespace.cuh"
+#include "util_macro.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup UtilMgmt
+ * @{
+ */
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+
+/**
+ * Empty kernel for querying PTX manifest metadata (e.g., version) for the current device
+ */
+template <typename T>
+__global__ void EmptyKernel(void) { }
+
+
+/**
+ * Alias temporaries to externally-allocated device storage (or simply return the amount of storage needed).
+ */
+template <int ALLOCATIONS>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t AliasTemporaries(
+    void    *d_temp_storage,                    ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+    size_t  &temp_storage_bytes,                ///< [in,out] Size in bytes of \p d_temp_storage allocation
+    void*   (&allocations)[ALLOCATIONS],        ///< [in,out] Pointers to device allocations needed
+    size_t  (&allocation_sizes)[ALLOCATIONS])   ///< [in] Sizes in bytes of device allocations needed
+{
+    const int ALIGN_BYTES   = 256;
+    const int ALIGN_MASK    = ~(ALIGN_BYTES - 1);
+
+    // Compute exclusive prefix sum over allocation requests
+    size_t allocation_offsets[ALLOCATIONS];
+    size_t bytes_needed = 0;
+    for (int i = 0; i < ALLOCATIONS; ++i)
+    {
+        size_t allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK;
+        allocation_offsets[i] = bytes_needed;
+        bytes_needed += allocation_bytes;
+    }
+
+    // Check if the caller is simply requesting the size of the storage allocation
+    if (!d_temp_storage)
+    {
+        temp_storage_bytes = bytes_needed;
+        return cudaSuccess;
+    }
+
+    // Check if enough storage provided
+    if (temp_storage_bytes < bytes_needed)
+    {
+        return CubDebug(cudaErrorInvalidValue);
+    }
+
+    // Alias
+    for (int i = 0; i < ALLOCATIONS; ++i)
+    {
+        allocations[i] = static_cast<char*>(d_temp_storage) + allocation_offsets[i];
+    }
+
+    return cudaSuccess;
+}
+
+
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+
+/**
+ * \brief Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10)
+ */
+CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int &ptx_version)
+{
+    struct Dummy
+    {
+        /// Type definition of the EmptyKernel kernel entry point
+        typedef void (*EmptyKernelPtr)();
+
+        /// Force EmptyKernel<void> to be generated if this class is used
+        CUB_RUNTIME_FUNCTION __forceinline__
+        EmptyKernelPtr Empty()
+        {
+            return EmptyKernel<void>;
+        }
+    };
+
+
+#ifndef CUB_RUNTIME_ENABLED
+
+    // CUDA API calls not supported from this device
+    return cudaErrorInvalidConfiguration;
+
+#elif (CUB_PTX_ARCH > 0)
+
+    ptx_version = CUB_PTX_ARCH;
+    return cudaSuccess;
+
+#else
+
+    cudaError_t error = cudaSuccess;
+    do
+    {
+        cudaFuncAttributes empty_kernel_attrs;
+        if (CubDebug(error = cudaFuncGetAttributes(&empty_kernel_attrs, EmptyKernel<void>))) break;
+        ptx_version = empty_kernel_attrs.ptxVersion * 10;
+    }
+    while (0);
+
+    return error;
+
+#endif
+}
+
+
+/**
+ * \brief Retrieves the SM version (major * 100 + minor * 10)
+ */
+CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersion(int &sm_version, int device_ordinal)
+{
+#ifndef CUB_RUNTIME_ENABLED
+
+    // CUDA API calls not supported from this device
+    return cudaErrorInvalidConfiguration;
+
+#else
+
+    cudaError_t error = cudaSuccess;
+    do
+    {
+        // Fill in SM version
+        int major, minor;
+        if (CubDebug(error = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device_ordinal))) break;
+        if (CubDebug(error = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device_ordinal))) break;
+        sm_version = major * 100 + minor * 10;
+    }
+    while (0);
+
+    return error;
+
+#endif
+}
+
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+/**
+ * Synchronize the stream if specified
+ */
+CUB_RUNTIME_FUNCTION __forceinline__
+static cudaError_t SyncStream(cudaStream_t stream)
+{
+#if (CUB_PTX_ARCH == 0)
+    return cudaStreamSynchronize(stream);
+#else
+    // Device can't yet sync on a specific stream
+    return cudaDeviceSynchronize();
+#endif
+}
+
+
+/**
+ * \brief Computes maximum SM occupancy in thread blocks for the given kernel function pointer \p kernel_ptr.
+ */
+template <typename KernelPtr>
+CUB_RUNTIME_FUNCTION __forceinline__
+cudaError_t MaxSmOccupancy(
+    int         &max_sm_occupancy,  ///< [out] maximum number of thread blocks that can reside on a single SM
+    int         sm_version,         ///< [in] The SM architecture to run on
+    KernelPtr   kernel_ptr,         ///< [in] Kernel pointer for which to compute SM occupancy
+    int         block_threads)      ///< [in] Number of threads per thread block
+{
+#ifndef CUB_RUNTIME_ENABLED
+
+    // CUDA API calls not supported from this device
+    return CubDebug(cudaErrorInvalidConfiguration);
+
+#else
+
+    cudaError_t error = cudaSuccess;
+    do
+    {
+        int warp_threads        = 1 << CUB_LOG_WARP_THREADS(sm_version);
+        int max_sm_blocks       = CUB_MAX_SM_BLOCKS(sm_version);
+        int max_sm_warps        = CUB_MAX_SM_THREADS(sm_version) / warp_threads;
+        int regs_by_block       = CUB_REGS_BY_BLOCK(sm_version);
+        int max_sm_registers    = CUB_MAX_SM_REGISTERS(sm_version);
+        int warp_alloc_unit     = CUB_WARP_ALLOC_UNIT(sm_version);
+        int smem_alloc_unit     = CUB_SMEM_ALLOC_UNIT(sm_version);
+        int reg_alloc_unit      = CUB_REG_ALLOC_UNIT(sm_version);
+        int smem_bytes          = CUB_SMEM_BYTES(sm_version);
+
+        // Get kernel attributes
+        cudaFuncAttributes kernel_attrs;
+        if (CubDebug(error = cudaFuncGetAttributes(&kernel_attrs, kernel_ptr))) break;
+
+        // Number of warps per threadblock
+        int block_warps = (block_threads + warp_threads - 1) / warp_threads;
+
+        // Max warp occupancy
+        int max_warp_occupancy = (block_warps > 0) ?
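+            // (Editorial note, not in the original source: the occupancy computed
+            //  below is the minimum of four per-SM limits -- resident blocks,
+            //  warps, registers, and shared memory.)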
+ max_sm_warps / block_warps : + max_sm_blocks; + + // Maximum register occupancy + int max_reg_occupancy; + if ((block_threads == 0) || (kernel_attrs.numRegs == 0)) + { + // Prevent divide-by-zero + max_reg_occupancy = max_sm_blocks; + } + else if (regs_by_block) + { + // Allocates registers by threadblock + int block_regs = CUB_ROUND_UP_NEAREST(kernel_attrs.numRegs * warp_threads * block_warps, reg_alloc_unit); + max_reg_occupancy = max_sm_registers / block_regs; + } + else + { + // Allocates registers by warp + int sm_sides = warp_alloc_unit; + int sm_registers_per_side = max_sm_registers / sm_sides; + int regs_per_warp = CUB_ROUND_UP_NEAREST(kernel_attrs.numRegs * warp_threads, reg_alloc_unit); + int warps_per_side = sm_registers_per_side / regs_per_warp; + int warps = warps_per_side * sm_sides; + max_reg_occupancy = warps / block_warps; + } + + // Shared memory per threadblock + int block_allocated_smem = CUB_ROUND_UP_NEAREST( + kernel_attrs.sharedSizeBytes, + smem_alloc_unit); + + // Max shared memory occupancy + int max_smem_occupancy = (block_allocated_smem > 0) ? + (smem_bytes / block_allocated_smem) : + max_sm_blocks; + + // Max occupancy + max_sm_occupancy = CUB_MIN( + CUB_MIN(max_sm_blocks, max_warp_occupancy), + CUB_MIN(max_smem_occupancy, max_reg_occupancy)); + +// printf("max_smem_occupancy(%d), max_warp_occupancy(%d), max_reg_occupancy(%d) \n", max_smem_occupancy, max_warp_occupancy, max_reg_occupancy); + + } while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED +} + +#endif // Do not document + + +/** + * \brief Computes maximum SM occupancy in thread blocks for executing the given kernel function pointer \p kernel_ptr on the current device with \p block_threads per thread block. + * + * \par Snippet + * The code snippet below illustrates the use of the MaxSmOccupancy function. + * \par + * \code + * #include // or equivalently + * + * template + * __global__ void ExampleKernel() + * { + * // Allocate shared memory for BlockScan + * __shared__ volatile T buffer[4096]; + * + * ... + * } + * + * ... 
+ * + * // Determine SM occupancy for ExampleKernel specialized for unsigned char + * int max_sm_occupancy; + * MaxSmOccupancy(max_sm_occupancy, ExampleKernel, 64); + * + * // max_sm_occupancy <-- 4 on SM10 + * // max_sm_occupancy <-- 8 on SM20 + * // max_sm_occupancy <-- 12 on SM35 + * + * \endcode + * + */ +template +CUB_RUNTIME_FUNCTION __forceinline__ +cudaError_t MaxSmOccupancy( + int &max_sm_occupancy, ///< [out] maximum number of thread blocks that can reside on a single SM + KernelPtr kernel_ptr, ///< [in] Kernel pointer for which to compute SM occupancy + int block_threads) ///< [in] Number of threads per thread block +{ +#ifndef CUB_RUNTIME_ENABLED + + // CUDA API calls not supported from this device + return CubDebug(cudaErrorInvalidConfiguration); + +#else + + cudaError_t error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get device SM version + int sm_version; + if (CubDebug(error = SmVersion(sm_version, device_ordinal))) break; + + // Get SM occupancy + if (CubDebug(error = MaxSmOccupancy(max_sm_occupancy, sm_version, kernel_ptr, block_threads))) break; + + } while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + +} + + +/** @} */ // end group UtilMgmt + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/util_macro.cuh b/SRC/cub/util_macro.cuh new file mode 100644 index 00000000..a94031a4 --- /dev/null +++ b/SRC/cub/util_macro.cuh @@ -0,0 +1,107 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +/****************************************************************************** + * Common C/C++ macro utilities + ******************************************************************************/ + +#pragma once + +#include "util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilModule + * @{ + */ + +/** + * Align struct + */ +#if defined(_WIN32) || defined(_WIN64) + #define CUB_ALIGN(bytes) __declspec(align(32)) +#else + #define CUB_ALIGN(bytes) __attribute__((aligned(bytes))) +#endif + +/** + * Select maximum(a, b) + */ +#define CUB_MAX(a, b) (((b) > (a)) ? (b) : (a)) + +/** + * Select minimum(a, b) + */ +#define CUB_MIN(a, b) (((b) < (a)) ? (b) : (a)) + +/** + * Quotient of x/y rounded down to nearest integer + */ +#define CUB_QUOTIENT_FLOOR(x, y) ((x) / (y)) + +/** + * Quotient of x/y rounded up to nearest integer + */ +#define CUB_QUOTIENT_CEILING(x, y) (((x) + (y) - 1) / (y)) + +/** + * x rounded up to the nearest multiple of y + */ +#define CUB_ROUND_UP_NEAREST(x, y) ((((x) + (y) - 1) / (y)) * y) + +/** + * x rounded down to the nearest multiple of y + */ +#define CUB_ROUND_DOWN_NEAREST(x, y) (((x) / (y)) * y) + +/** + * Return character string for given type + */ +#define CUB_TYPE_STRING(type) ""#type + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + #define CUB_CAT_(a, b) a ## b + #define CUB_CAT(a, b) CUB_CAT_(a, b) +#endif // DOXYGEN_SHOULD_SKIP_THIS + +/** + * Static assert + */ +#define CUB_STATIC_ASSERT(cond, msg) typedef int CUB_CAT(cub_static_assert, __LINE__)[(cond) ? 1 : -1] + + +/** @} */ // end group UtilModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/util_namespace.cuh b/SRC/cub/util_namespace.cuh new file mode 100644 index 00000000..39603644 --- /dev/null +++ b/SRC/cub/util_namespace.cuh @@ -0,0 +1,41 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Place-holder for prefixing the cub namespace + */ + +#pragma once + +// For example: +//#define CUB_NS_PREFIX namespace thrust{ namespace detail { +//#define CUB_NS_POSTFIX } } + +#define CUB_NS_PREFIX +#define CUB_NS_POSTFIX diff --git a/SRC/cub/util_ptx.cuh b/SRC/cub/util_ptx.cuh new file mode 100644 index 00000000..4172de2a --- /dev/null +++ b/SRC/cub/util_ptx.cuh @@ -0,0 +1,606 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +/** + * \file + * PTX intrinsics + */ + + +#pragma once + +#include "util_type.cuh" +#include "util_arch.cuh" +#include "util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilPtx + * @{ + */ + + +/****************************************************************************** + * PTX helper macros + ******************************************************************************/ + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +/** + * Register modifier for pointer-types (for inlining PTX assembly) + */ +#if defined(_WIN64) || defined(__LP64__) + #define __CUB_LP64__ 1 + // 64-bit register modifier for inlined asm + #define _CUB_ASM_PTR_ "l" + #define _CUB_ASM_PTR_SIZE_ "u64" +#else + #define __CUB_LP64__ 0 + // 32-bit register modifier for inlined asm + #define _CUB_ASM_PTR_ "r" + #define _CUB_ASM_PTR_SIZE_ "u32" +#endif + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/****************************************************************************** + * Inlined PTX intrinsics + ******************************************************************************/ + +/** + * \brief Shift-right then add. Returns (\p x >> \p shift) + \p addend. + */ +__device__ __forceinline__ unsigned int SHR_ADD( + unsigned int x, + unsigned int shift, + unsigned int addend) +{ + unsigned int ret; +#if CUB_PTX_ARCH >= 200 + asm("vshr.u32.u32.u32.clamp.add %0, %1, %2, %3;" : + "=r"(ret) : "r"(x), "r"(shift), "r"(addend)); +#else + ret = (x >> shift) + addend; +#endif + return ret; +} + + +/** + * \brief Shift-left then add. Returns (\p x << \p shift) + \p addend. + */ +__device__ __forceinline__ unsigned int SHL_ADD( + unsigned int x, + unsigned int shift, + unsigned int addend) +{ + unsigned int ret; +#if CUB_PTX_ARCH >= 200 + asm("vshl.u32.u32.u32.clamp.add %0, %1, %2, %3;" : + "=r"(ret) : "r"(x), "r"(shift), "r"(addend)); +#else + ret = (x << shift) + addend; +#endif + return ret; +} + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +/** + * Bitfield-extract. + */ +template +__device__ __forceinline__ unsigned int BFE( + UnsignedBits source, + unsigned int bit_start, + unsigned int num_bits, + Int2Type byte_len) +{ + unsigned int bits; +#if CUB_PTX_ARCH >= 200 + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(bits) : "r"((unsigned int) source), "r"(bit_start), "r"(num_bits)); +#else + const unsigned int MASK = (1 << num_bits) - 1; + bits = (source >> bit_start) & MASK; +#endif + return bits; +} + + +/** + * Bitfield-extract for 64-bit types. + */ +template +__device__ __forceinline__ unsigned int BFE( + UnsignedBits source, + unsigned int bit_start, + unsigned int num_bits, + Int2Type<8> byte_len) +{ + const unsigned long long MASK = (1ull << num_bits) - 1; + return (source >> bit_start) & MASK; +} + +#endif // DOXYGEN_SHOULD_SKIP_THIS + +/** + * \brief Bitfield-extract. Extracts \p num_bits from \p source starting at bit-offset \p bit_start. The input \p source may be an 8b, 16b, 32b, or 64b unsigned integer type. + */ +template +__device__ __forceinline__ unsigned int BFE( + UnsignedBits source, + unsigned int bit_start, + unsigned int num_bits) +{ + return BFE(source, bit_start, num_bits, Int2Type()); +} + + +/** + * \brief Bitfield insert. Inserts the \p num_bits least significant bits of \p y into \p x at bit-offset \p bit_start. 
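+ *
+ * \par Snippet
+ * (Editorial example, not in the original source.)  Inserting the low 8 bits
+ * of \p y at bit-offset 8:
+ * \code
+ * unsigned int ret;
+ * BFI(ret, 0xAABBCCDD, 0x000000FF, 8, 8);   // ret == 0xAABBFFDD
+ * \endcode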
+ */ +__device__ __forceinline__ void BFI( + unsigned int &ret, + unsigned int x, + unsigned int y, + unsigned int bit_start, + unsigned int num_bits) +{ +#if CUB_PTX_ARCH >= 200 + asm("bfi.b32 %0, %1, %2, %3, %4;" : + "=r"(ret) : "r"(y), "r"(x), "r"(bit_start), "r"(num_bits)); +#else + x <<= bit_start; + unsigned int MASK_X = ((1 << num_bits) - 1) << bit_start; + unsigned int MASK_Y = ~MASK_X; + ret = (y & MASK_Y) | (x & MASK_X); +#endif +} + + +/** + * \brief Three-operand add. Returns \p x + \p y + \p z. + */ +__device__ __forceinline__ unsigned int IADD3(unsigned int x, unsigned int y, unsigned int z) +{ +#if CUB_PTX_ARCH >= 200 + asm("vadd.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(x) : "r"(x), "r"(y), "r"(z)); +#else + x = x + y + z; +#endif + return x; +} + + +/** + * \brief Byte-permute. Pick four arbitrary bytes from two 32-bit registers, and reassemble them into a 32-bit destination register. For SM2.0 or later. + * + * \par + * The bytes in the two source registers \p a and \p b are numbered from 0 to 7: + * {\p b, \p a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}. For each of the four bytes + * {b3, b2, b1, b0} selected in the return value, a 4-bit selector is defined within + * the four lower "nibbles" of \p index: {\p index } = {n7, n6, n5, n4, n3, n2, n1, n0} + * + * \par Snippet + * The code snippet below illustrates byte-permute. + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * int a = 0x03020100; + * int b = 0x07060504; + * int index = 0x00007531; + * + * int selected = PRMT(a, b, index); // 0x07050301 + * + * \endcode + * + */ +__device__ __forceinline__ int PRMT(unsigned int a, unsigned int b, unsigned int index) +{ + int ret; + asm("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index)); + return ret; +} + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +/** + * Sync-threads barrier. + */ +__device__ __forceinline__ void BAR(int count) +{ + asm volatile("bar.sync 1, %0;" : : "r"(count)); +} + + +/** + * Floating point multiply. (Mantissa LSB rounds towards zero.) + */ +__device__ __forceinline__ float FMUL_RZ(float a, float b) +{ + float d; + asm("mul.rz.f32 %0, %1, %2;" : "=f"(d) : "f"(a), "f"(b)); + return d; +} + + +/** + * Floating point multiply-add. (Mantissa LSB rounds towards zero.) + */ +__device__ __forceinline__ float FFMA_RZ(float a, float b, float c) +{ + float d; + asm("fma.rz.f32 %0, %1, %2, %3;" : "=f"(d) : "f"(a), "f"(b), "f"(c)); + return d; +} + +#endif // DOXYGEN_SHOULD_SKIP_THIS + +/** + * \brief Terminates the calling thread + */ +__device__ __forceinline__ void ThreadExit() { + asm("exit;"); +} + + +/** + * \brief Returns the row-major linear thread identifier for a multidimensional threadblock + */ +__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z) +{ + return ((block_dim_z == 1) ? 0 : (threadIdx.z * block_dim_x * block_dim_y)) + + ((block_dim_y == 1) ? 0 : (threadIdx.y * block_dim_x)) + + threadIdx.x; +} + + +/** + * \brief Returns the warp lane ID of the calling thread + */ +__device__ __forceinline__ unsigned int LaneId() +{ + unsigned int ret; + asm("mov.u32 %0, %laneid;" : "=r"(ret) ); + return ret; +} + + +/** + * \brief Returns the warp ID of the calling thread. Warp ID is guaranteed to be unique among warps, but may not correspond to a zero-based ranking within the thread block. 
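+ *
+ * \par
+ * (Editorial note, not in the original source.)  Because \p %warpid is not a
+ * stable zero-based rank, a block-relative warp rank is conventionally derived
+ * from the linear thread id instead:
+ * \code
+ * int warp_rank = RowMajorTid(blockDim.x, blockDim.y, blockDim.z) / CUB_PTX_WARP_THREADS;
+ * \endcode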
+ */ +__device__ __forceinline__ unsigned int WarpId() +{ + unsigned int ret; + asm("mov.u32 %0, %warpid;" : "=r"(ret) ); + return ret; +} + +/** + * \brief Returns the warp lane mask of all lanes less than the calling thread + */ +__device__ __forceinline__ unsigned int LaneMaskLt() +{ + unsigned int ret; + asm("mov.u32 %0, %lanemask_lt;" : "=r"(ret) ); + return ret; +} + +/** + * \brief Returns the warp lane mask of all lanes less than or equal to the calling thread + */ +__device__ __forceinline__ unsigned int LaneMaskLe() +{ + unsigned int ret; + asm("mov.u32 %0, %lanemask_le;" : "=r"(ret) ); + return ret; +} + +/** + * \brief Returns the warp lane mask of all lanes greater than the calling thread + */ +__device__ __forceinline__ unsigned int LaneMaskGt() +{ + unsigned int ret; + asm("mov.u32 %0, %lanemask_gt;" : "=r"(ret) ); + return ret; +} + +/** + * \brief Returns the warp lane mask of all lanes greater than or equal to the calling thread + */ +__device__ __forceinline__ unsigned int LaneMaskGe() +{ + unsigned int ret; + asm("mov.u32 %0, %lanemask_ge;" : "=r"(ret) ); + return ret; +} + +/** @} */ // end group UtilPtx + + + + +/** + * \brief Shuffle-up for any data type. Each warp-lanei obtains the value \p input contributed by warp-lanei-src_offset. For thread lanes \e i < src_offset, the thread's own \p input is returned to the thread. ![](shfl_up_logo.png) + * \ingroup WarpModule + * + * \par + * - Available only for SM3.0 or newer + * + * \par Snippet + * The code snippet below illustrates each thread obtaining a \p double value from the + * predecessor of its predecessor. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Obtain one input item per thread + * double thread_data = ... + * + * // Obtain item from two ranks below + * double peer_data = ShuffleUp(thread_data, 2); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the first warp of threads is {1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}. + * The corresponding output \p peer_data will be {1.0, 2.0, 1.0, 2.0, 3.0, ..., 30.0}. + * + */ +template +__device__ __forceinline__ T ShuffleUp( + T input, ///< [in] The value to broadcast + int src_offset) ///< [in] The relative down-offset of the peer to read from +{ + enum + { + SHFL_C = 0, + }; + + typedef typename UnitWord::ShuffleWord ShuffleWord; + + const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); + T output; + ShuffleWord *output_alias = reinterpret_cast(&output); + ShuffleWord *input_alias = reinterpret_cast(&input); + + #pragma unroll + for (int WORD = 0; WORD < WORDS; ++WORD) + { + unsigned int shuffle_word = input_alias[WORD]; + asm( + " shfl.up.b32 %0, %1, %2, %3;" + : "=r"(shuffle_word) : "r"(shuffle_word), "r"(src_offset), "r"(SHFL_C)); + output_alias[WORD] = (ShuffleWord) shuffle_word; + } + + return output; +} + + +/** + * \brief Shuffle-down for any data type. Each warp-lanei obtains the value \p input contributed by warp-lanei+src_offset. For thread lanes \e i >= WARP_THREADS, the thread's own \p input is returned to the thread. ![](shfl_down_logo.png) + * \ingroup WarpModule + * + * \par + * - Available only for SM3.0 or newer + * + * \par Snippet + * The code snippet below illustrates each thread obtaining a \p double value from the + * successor of its successor. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Obtain one input item per thread + * double thread_data = ... 
+ * + * // Obtain item from two ranks below + * double peer_data = ShuffleDown(thread_data, 2); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the first warp of threads is {1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}. + * The corresponding output \p peer_data will be {3.0, 4.0, 5.0, 6.0, 7.0, ..., 32.0}. + * + */ +template +__device__ __forceinline__ T ShuffleDown( + T input, ///< [in] The value to broadcast + int src_offset) ///< [in] The relative up-offset of the peer to read from +{ + enum + { + SHFL_C = CUB_PTX_WARP_THREADS - 1, + }; + + typedef typename UnitWord::ShuffleWord ShuffleWord; + + const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); + T output; + ShuffleWord *output_alias = reinterpret_cast(&output); + ShuffleWord *input_alias = reinterpret_cast(&input); + + #pragma unroll + for (int WORD = 0; WORD < WORDS; ++WORD) + { + unsigned int shuffle_word = input_alias[WORD]; + asm( + " shfl.down.b32 %0, %1, %2, %3;" + : "=r"(shuffle_word) : "r"(shuffle_word), "r"(src_offset), "r"(SHFL_C)); + output_alias[WORD] = (ShuffleWord) shuffle_word; + } + + return output; +} + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +/** + * \brief Shuffle-broadcast for any data type. Each warp-lanei obtains the value \p input contributed by warp-lanesrc_lane. For \p src_lane < 0 or \p src_lane >= WARP_THREADS, then the thread's own \p input is returned to the thread. ![](shfl_broadcast_logo.png) + * \ingroup WarpModule + * + * \par + * - Available only for SM3.0 or newer + */ +template +__device__ __forceinline__ T ShuffleBroadcast( + T input, ///< [in] The value to broadcast + int src_lane, ///< [in] Which warp lane is to do the broadcasting + int logical_warp_threads) ///< [in] Number of threads per logical warp +{ + typedef typename UnitWord::ShuffleWord ShuffleWord; + + const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); + T output; + ShuffleWord *output_alias = reinterpret_cast(&output); + ShuffleWord *input_alias = reinterpret_cast(&input); + + #pragma unroll + for (int WORD = 0; WORD < WORDS; ++WORD) + { + unsigned int shuffle_word = input_alias[WORD]; + asm("shfl.idx.b32 %0, %1, %2, %3;" + : "=r"(shuffle_word) : "r"(shuffle_word), "r"(src_lane), "r"(logical_warp_threads - 1)); + output_alias[WORD] = (ShuffleWord) shuffle_word; + } + + return output; +} + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + /** + * \brief Shuffle-broadcast for any data type. Each warp-lanei obtains the value \p input contributed by warp-lanesrc_lane. For \p src_lane < 0 or \p src_lane >= WARP_THREADS, then the thread's own \p input is returned to the thread. ![](shfl_broadcast_logo.png) + * \ingroup WarpModule + * + * \par + * - Available only for SM3.0 or newer + * + * \par Snippet + * The code snippet below illustrates each thread obtaining a \p double value from warp-lane0. + * + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Obtain one input item per thread + * double thread_data = ... + * + * // Obtain item from thread 0 + * double peer_data = ShuffleBroadcast(thread_data, 0); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the first warp of threads is {1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}. + * The corresponding output \p peer_data will be {1.0, 1.0, 1.0, 1.0, 1.0, ..., 1.0}. 
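+ *
+ * \par
+ * (Editorial note, not in the original source.)  As in ShuffleUp and
+ * ShuffleDown above, types wider than 32 bits are exchanged word-by-word:
+ * a \p double moves as two \p shfl.idx.b32 instructions over its 4-byte
+ * ShuffleWord halves.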
+ * + */ +template +__device__ __forceinline__ T ShuffleBroadcast( + T input, ///< [in] The value to broadcast + int src_lane) ///< [in] Which warp lane is to do the broadcasting +{ + return ShuffleBroadcast(input, src_lane, CUB_PTX_WARP_THREADS); +} + + + + + +/** + * \brief Portable implementation of __all + * \ingroup WarpModule + */ +__device__ __forceinline__ int WarpAll(int cond) +{ +#if CUB_PTX_ARCH < 120 + + __shared__ volatile int warp_signals[CUB_PTX_MAX_SM_THREADS / CUB_PTX_WARP_THREADS]; + + if (LaneId() == 0) + warp_signals[WarpId()] = 1; + + if (cond == 0) + warp_signals[WarpId()] = 0; + + return warp_signals[WarpId()]; + +#else + + return __all(cond); + +#endif +} + + +/** + * \brief Portable implementation of __any + * \ingroup WarpModule + */ +__device__ __forceinline__ int WarpAny(int cond) +{ +#if CUB_PTX_ARCH < 120 + + __shared__ volatile int warp_signals[CUB_PTX_MAX_SM_THREADS / CUB_PTX_WARP_THREADS]; + + if (LaneId() == 0) + warp_signals[WarpId()] = 0; + + if (cond) + warp_signals[WarpId()] = 1; + + return warp_signals[WarpId()]; + +#else + + return __any(cond); + +#endif +} + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/util_type.cuh b/SRC/cub/util_type.cuh new file mode 100644 index 00000000..821a55db --- /dev/null +++ b/SRC/cub/util_type.cuh @@ -0,0 +1,1027 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * Common type manipulation (metaprogramming) utilities
+ */
+
+#pragma once
+
+#include <iostream>
+#include <limits>
+
+#include "util_macro.cuh"
+#include "util_arch.cuh"
+#include "util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup UtilModule
+ * @{
+ */
+
+
+
+/******************************************************************************
+ * Type equality
+ ******************************************************************************/
+
+/**
+ * \brief Type selection (<tt>IF ? ThenType : ElseType</tt>)
+ */
+template <bool IF, typename ThenType, typename ElseType>
+struct If
+{
+    /// Conditional type result
+    typedef ThenType Type;      // true
+};
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+template <typename ThenType, typename ElseType>
+struct If<false, ThenType, ElseType>
+{
+    typedef ElseType Type;      // false
+};
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+
+/******************************************************************************
+ * Conditional types
+ ******************************************************************************/
+
+/**
+ * \brief Type equality test
+ */
+template <typename A, typename B>
+struct Equals
+{
+    enum {
+        VALUE = 0,
+        NEGATE = 1
+    };
+};
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+template <typename A>
+struct Equals <A, A>
+{
+    enum {
+        VALUE = 1,
+        NEGATE = 0
+    };
+};
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+/******************************************************************************
+ * Marker types
+ ******************************************************************************/
+
+/**
+ * \brief A simple "NULL" marker type
+ */
+struct NullType
+{
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+    template <typename T>
+    __host__ __device__ __forceinline__ NullType& operator =(const T& b) { return *this; }
+
+    __host__ __device__ __forceinline__ bool operator ==(const NullType& b) { return true; }
+
+    __host__ __device__ __forceinline__ bool operator !=(const NullType& b) { return false; }
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+};
+
+
+/**
+ * \brief Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static call dispatch based on constant integral values)
+ */
+template <int A>
+struct Int2Type
+{
+    enum {VALUE = A};
+};
+
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+
+/******************************************************************************
+ * Size and alignment
+ ******************************************************************************/
+
+/// Structure alignment
+template <typename T>
+struct AlignBytes
+{
+    struct Pad
+    {
+        T       val;
+        char    byte;
+    };
+
+    enum
+    {
+        /// The alignment of T in bytes
+        ALIGN_BYTES = sizeof(Pad) - sizeof(T)
+    };
+};
+
+// Specializations where host C++ compilers (e.g., Windows) may disagree with device C++ compilers (EDG)
+
+template <> struct AlignBytes<short4>               { enum { ALIGN_BYTES = 8 }; };
+template <> struct AlignBytes<ushort4>              { enum { ALIGN_BYTES = 8 }; };
+template <> struct AlignBytes<int2>                 { enum { ALIGN_BYTES = 8 }; };
+template <> struct AlignBytes<uint2>                { enum { ALIGN_BYTES = 8 }; };
+#ifdef _WIN32
+    template <> struct AlignBytes<long2>            { enum { ALIGN_BYTES = 8 }; };
+    template <> struct AlignBytes<ulong2>           { enum { ALIGN_BYTES = 8 }; };
+#endif
+template <> struct AlignBytes<long long>            { enum { ALIGN_BYTES = 8 }; };
+template <> struct AlignBytes<unsigned long long>   { enum { ALIGN_BYTES = 8 }; };
+template <> struct AlignBytes<float2>               { enum { ALIGN_BYTES = 8 }; };
+template <> struct AlignBytes<double>               { enum { ALIGN_BYTES = 8 }; };
+
+template <> struct AlignBytes<int4>                 { enum { ALIGN_BYTES = 16 }; };
+template <> struct AlignBytes<uint4>                { enum { ALIGN_BYTES = 16 }; };
+template <> struct AlignBytes<float4>               { enum { ALIGN_BYTES = 16 }; };
+#ifndef _WIN32
+    template <> struct AlignBytes<long2>            { enum { ALIGN_BYTES = 16 }; };
+    template <> struct AlignBytes<ulong2>           { enum { ALIGN_BYTES = 16 }; };
+#endif
+template <> struct AlignBytes<long4>                { enum { ALIGN_BYTES = 16 }; };
+template <> struct AlignBytes<ulong4>               { enum { ALIGN_BYTES = 16 }; };
+template <> struct AlignBytes<longlong2>            { enum { ALIGN_BYTES = 16 }; };
+template <> struct AlignBytes<ulonglong2>           { enum { ALIGN_BYTES = 16 }; };
+template <> struct AlignBytes<double2>              { enum { ALIGN_BYTES = 16 }; };
+template <> struct AlignBytes<longlong4>            { enum { ALIGN_BYTES = 16 }; };
+template <> struct AlignBytes<ulonglong4>           { enum { ALIGN_BYTES = 16 }; };
+template <> struct AlignBytes<double4>              { enum { ALIGN_BYTES = 16 }; };
+
+
+/// Unit-words of data movement
+template <typename T>
+struct UnitWord
+{
+    enum {
+        ALIGN_BYTES = AlignBytes<T>::ALIGN_BYTES
+    };
+
+    template <typename Unit>
+    struct IsMultiple
+    {
+        enum {
+            UNIT_ALIGN_BYTES    = AlignBytes<Unit>::ALIGN_BYTES,
+            IS_MULTIPLE         = (sizeof(T) % sizeof(Unit) == 0) && (ALIGN_BYTES % UNIT_ALIGN_BYTES == 0)
+        };
+    };
+
+    /// Biggest shuffle word that T is a whole multiple of and is not larger than the alignment of T
+    typedef typename If<IsMultiple<unsigned int>::IS_MULTIPLE,
+        unsigned int,
+        typename If<IsMultiple<unsigned short>::IS_MULTIPLE,
+            unsigned short,
+            unsigned char>::Type>::Type         ShuffleWord;
+
+    /// Biggest volatile word that T is a whole multiple of and is not larger than the alignment of T
+    typedef typename If<IsMultiple<unsigned long long>::IS_MULTIPLE,
+        unsigned long long,
+        ShuffleWord>::Type                      VolatileWord;
+
+    /// Biggest memory-access word that T is a whole multiple of and is not larger than the alignment of T
+    typedef typename If<IsMultiple<ulonglong2>::IS_MULTIPLE,
+        ulonglong2,
+        VolatileWord>::Type                     DeviceWord;
+
+    /// Biggest texture reference word that T is a whole multiple of and is not larger than the alignment of T
+    typedef typename If<IsMultiple<uint4>::IS_MULTIPLE,
+        uint4,
+        typename If<IsMultiple<uint2>::IS_MULTIPLE,
+            uint2,
+            ShuffleWord>::Type>::Type           TextureWord;
+};
+
+
+// float2 specialization workaround (for SM10-SM13)
+template <>
+struct UnitWord <float2>
+{
+    typedef int                 ShuffleWord;
+#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130)
+    typedef float               VolatileWord;
+    typedef uint2               DeviceWord;
+#else
+    typedef unsigned long long  VolatileWord;
+    typedef unsigned long long  DeviceWord;
+#endif
+    typedef float2              TextureWord;
+};
+
+// float4 specialization workaround (for SM10-SM13)
+template <>
+struct UnitWord <float4>
+{
+    typedef int                 ShuffleWord;
+#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130)
+    typedef float               VolatileWord;
+    typedef uint4               DeviceWord;
+#else
+    typedef unsigned long long  VolatileWord;
+    typedef ulonglong2          DeviceWord;
+#endif
+    typedef float4              TextureWord;
+};
+
+
+// char2 specialization workaround (for SM10-SM13)
+template <>
+struct UnitWord <char2>
+{
+    typedef unsigned short      ShuffleWord;
+#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130)
+    typedef unsigned short      VolatileWord;
+    typedef short               DeviceWord;
+#else
+    typedef unsigned short      VolatileWord;
+    typedef unsigned short      DeviceWord;
+#endif
+    typedef unsigned short      TextureWord;
+};
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+
+/******************************************************************************
+ * Vector type inference utilities.
+ ******************************************************************************/
+
+/**
+ * \brief Exposes a member typedef \p Type that names the corresponding CUDA vector type if one exists.  Otherwise \p Type refers to the CubVector structure itself, which will wrap the corresponding \p x, \p y, etc. vector fields.
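+ *
+ * \par Snippet
+ * A minimal usage sketch (illustrative only; values and names below are not
+ * part of the library):
+ * \code
+ * typedef CubVector<int, 4>::Type VectorT;    // aliases the built-in int4
+ * CubVector<int, 4> a, b;
+ * a.x = 1; a.y = 2; a.z = 3; a.w = 4;
+ * b.x = 5; b.y = 6; b.z = 7; b.w = 8;
+ * CubVector<int, 4> c = a + b;                // component-wise sum {6, 8, 10, 12}
+ * \endcode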
+ */
+template <typename T, int vec_elements> struct CubVector;
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+enum
+{
+    /// The maximum number of elements in CUDA vector types
+    MAX_VEC_ELEMENTS = 4,
+};
+
+
+/**
+ * Generic vector-1 type
+ */
+template <typename T>
+struct CubVector<T, 1>
+{
+    T x;
+
+    typedef T BaseType;
+    typedef CubVector<T, 1> Type;
+};
+
+/**
+ * Generic vector-2 type
+ */
+template <typename T>
+struct CubVector<T, 2>
+{
+    T x;
+    T y;
+
+    typedef T BaseType;
+    typedef CubVector<T, 2> Type;
+};
+
+/**
+ * Generic vector-3 type
+ */
+template <typename T>
+struct CubVector<T, 3>
+{
+    T x;
+    T y;
+    T z;
+
+    typedef T BaseType;
+    typedef CubVector<T, 3> Type;
+};
+
+/**
+ * Generic vector-4 type
+ */
+template <typename T>
+struct CubVector<T, 4>
+{
+    T x;
+    T y;
+    T z;
+    T w;
+
+    typedef T BaseType;
+    typedef CubVector<T, 4> Type;
+};
+
+
+/**
+ * Macro for expanding partially-specialized built-in vector types
+ */
+#define CUB_DEFINE_VECTOR_TYPE(base_type,short_type)                                            \
+                                                                                                \
+    template<> struct CubVector<base_type, 1> : short_type##1                                   \
+    {                                                                                           \
+        typedef base_type       BaseType;                                                       \
+        typedef short_type##1   Type;                                                           \
+        __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const { \
+            CubVector retval;                                                                   \
+            retval.x = x + other.x;                                                             \
+            return retval;                                                                      \
+        }                                                                                       \
+        __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const { \
+            CubVector retval;                                                                   \
+            retval.x = x - other.x;                                                             \
+            return retval;                                                                      \
+        }                                                                                       \
+    };                                                                                          \
+                                                                                                \
+    template<> struct CubVector<base_type, 2> : short_type##2                                   \
+    {                                                                                           \
+        typedef base_type       BaseType;                                                       \
+        typedef short_type##2   Type;                                                           \
+        __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const { \
+            CubVector retval;                                                                   \
+            retval.x = x + other.x;                                                             \
+            retval.y = y + other.y;                                                             \
+            return retval;                                                                      \
+        }                                                                                       \
+        __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const { \
+            CubVector retval;                                                                   \
+            retval.x = x - other.x;                                                             \
+            retval.y = y - other.y;                                                             \
+            return retval;                                                                      \
+        }                                                                                       \
+    };                                                                                          \
+                                                                                                \
+    template<> struct CubVector<base_type, 3> : short_type##3                                   \
+    {                                                                                           \
+        typedef base_type       BaseType;                                                       \
+        typedef short_type##3   Type;                                                           \
+        __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const { \
+            CubVector retval;                                                                   \
+            retval.x = x + other.x;                                                             \
+            retval.y = y + other.y;                                                             \
+            retval.z = z + other.z;                                                             \
+            return retval;                                                                      \
+        }                                                                                       \
+        __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const { \
+            CubVector retval;                                                                   \
+            retval.x = x - other.x;                                                             \
+            retval.y = y - other.y;                                                             \
+            retval.z = z - other.z;                                                             \
+            return retval;                                                                      \
+        }                                                                                       \
+    };                                                                                          \
+                                                                                                \
+    template<> struct CubVector<base_type, 4> : short_type##4                                   \
+    {                                                                                           \
+        typedef base_type       BaseType;                                                       \
+        typedef short_type##4   Type;                                                           \
+        __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const { \
+            CubVector retval;                                                                   \
+            retval.x = x + other.x;                                                             \
+            retval.y = y + other.y;                                                             \
+            retval.z = z + other.z;                                                             \
+            retval.w = w + other.w;                                                             \
+            return retval;                                                                      \
+        }                                                                                       \
+        __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const { \
+            CubVector retval;                                                                   \
+            retval.x = x - other.x;                                                             \
+            retval.y = y - other.y;                                                             \
+            retval.z = z - other.z;                                                             \
+            retval.w = w - other.w;                                                             \
+            return retval;                                                                      \
+        }                                                                                       \
+    };
+
+
+
+// Expand CUDA vector types for built-in primitives
+CUB_DEFINE_VECTOR_TYPE(char,               char)
+CUB_DEFINE_VECTOR_TYPE(signed char,        char)
+CUB_DEFINE_VECTOR_TYPE(short,              short)
+CUB_DEFINE_VECTOR_TYPE(int,                int)
+CUB_DEFINE_VECTOR_TYPE(long,               long)
+CUB_DEFINE_VECTOR_TYPE(long long,          longlong)
+CUB_DEFINE_VECTOR_TYPE(unsigned char,      uchar)
+CUB_DEFINE_VECTOR_TYPE(unsigned short,     ushort)
+CUB_DEFINE_VECTOR_TYPE(unsigned int,       uint)
+CUB_DEFINE_VECTOR_TYPE(unsigned long,      ulong)
+CUB_DEFINE_VECTOR_TYPE(unsigned long long, ulonglong)
+CUB_DEFINE_VECTOR_TYPE(float,              float)
+CUB_DEFINE_VECTOR_TYPE(double,             double)
+CUB_DEFINE_VECTOR_TYPE(bool,               uchar)
+
+// Undefine macros
+#undef CUB_DEFINE_VECTOR_TYPE
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+
+/******************************************************************************
+ * Wrapper types
+ ******************************************************************************/
+
+/**
+ * \brief A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions
+ */
+template <typename T>
+struct Uninitialized
+{
+    /// Biggest memory-access word that T is a whole multiple of and is not larger than the alignment of T
+    typedef typename UnitWord<T>::DeviceWord DeviceWord;
+
+    enum
+    {
+        WORDS = sizeof(T) / sizeof(DeviceWord)
+    };
+
+    /// Backing storage
+    DeviceWord storage[WORDS];
+
+    /// Alias
+    __host__ __device__ __forceinline__ T& Alias()
+    {
+        return reinterpret_cast<T&>(*this);
+    }
+};
+
+
+/**
+ * \brief An item value paired with a corresponding offset
+ */
+template <typename _T, typename _Offset>
+struct ItemOffsetPair
+{
+    typedef _T      T;          ///< Item data type
+    typedef _Offset Offset;     ///< Integer offset data type
+
+#if (CUB_PTX_ARCH == 0)
+    union
+    {
+        Offset                              offset;     ///< Offset
+        typename UnitWord<T>::DeviceWord    align0;     ///< Alignment/padding (for Win32 consistency between host/device)
+    };
+#else
+    Offset offset;      ///< Offset
+#endif
+
+    T value;            ///< Item value
+
+    /// Inequality operator
+    __host__ __device__ __forceinline__ bool operator !=(const ItemOffsetPair &b)
+    {
+        return (value != b.value) || (offset != b.offset);
+    }
+};
+
+
+/**
+ * \brief A key identifier paired with a corresponding value
+ */
+template <typename _Key, typename _Value>
+struct KeyValuePair
+{
+    typedef _Key    Key;        ///< Key data type
+    typedef _Value  Value;      ///< Value data type
+
+    Value   value;              ///< Item value
+    Key     key;                ///< Item key
+
+    /// Inequality operator
+    __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b)
+    {
+        return (value != b.value) || (key != b.key);
+    }
+
+};
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+
+/**
+ * Workaround for inability for SM1.x compiler to properly zero-initialize POD structures when it's supposed to
+ */
+template <typename T>
+__host__ __device__ __forceinline__ T ZeroInitialize()
+{
+#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130)
+
+    typedef typename UnitWord<T>::ShuffleWord ShuffleWord;
+    const int MULTIPLE = sizeof(T) / sizeof(ShuffleWord);
+    ShuffleWord words[MULTIPLE];
+    #pragma unroll
+    for (int i = 0; i < MULTIPLE; ++i)
+        words[i] = 0;
+    return *reinterpret_cast<T*>(words);
+
+#else
+
+    return T();
+
+#endif
+}
+
+
+/**
+ * \brief A wrapper for passing simple static arrays as kernel parameters
+ */
+template <typename T, int COUNT>
+struct ArrayWrapper
+{
+    /// Static array of type \p T
+    T array[COUNT];
+};
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+/**
+ * \brief Double-buffer storage wrapper for multi-pass stream transformations that require more than one storage array for streaming intermediate results back and forth.
+ *
+ * Many multi-pass computations require a pair of "ping-pong" storage
+ * buffers (e.g., one for reading from and the other for writing to, and then
+ * vice-versa for the subsequent pass).  This structure wraps a set of device
+ * buffers and a "selector" member to track which is "current".
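+ *
+ * \par Snippet
+ * A minimal ping-pong sketch (the buffer pointers <tt>d_buf_a</tt>,
+ * <tt>d_buf_b</tt> and the pass count are illustrative assumptions, not part
+ * of the library):
+ * \code
+ * DoubleBuffer<int> d_data(d_buf_a, d_buf_b);   // two same-sized device arrays
+ * for (int pass = 0; pass < num_passes; ++pass)
+ * {
+ *     // read from d_data.Current(), write to d_data.d_buffers[d_data.selector ^ 1]
+ *     d_data.selector ^= 1;                     // swap buffer roles for the next pass
+ * }
+ * \endcode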
+ */
+template <typename T>
+struct DoubleBuffer
+{
+    /// Pair of device buffer pointers
+    T *d_buffers[2];
+
+    /// Selector into \p d_buffers (i.e., the active/valid buffer)
+    int selector;
+
+    /// \brief Constructor
+    __host__ __device__ __forceinline__ DoubleBuffer()
+    {
+        selector = 0;
+        d_buffers[0] = NULL;
+        d_buffers[1] = NULL;
+    }
+
+    /// \brief Constructor
+    __host__ __device__ __forceinline__ DoubleBuffer(
+        T *d_current,         ///< The currently valid buffer
+        T *d_alternate)       ///< Alternate storage buffer of the same size as \p d_current
+    {
+        selector = 0;
+        d_buffers[0] = d_current;
+        d_buffers[1] = d_alternate;
+    }
+
+    /// \brief Return pointer to the currently valid buffer
+    __host__ __device__ __forceinline__ T* Current() { return d_buffers[selector]; }
+};
+
+
+
+/******************************************************************************
+ * Static math
+ ******************************************************************************/
+
+/**
+ * \brief Statically determine log2(N), rounded up.
+ *
+ * For example:
+ *     Log2<8>::VALUE   // 3
+ *     Log2<3>::VALUE   // 2
+ */
+template <int N, int CURRENT_VAL = N, int COUNT = 0>
+struct Log2
+{
+    /// Static logarithm value
+    enum { VALUE = Log2<N, (CURRENT_VAL >> 1), COUNT + 1>::VALUE };         // Inductive case
+};
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+template <int N, int COUNT>
+struct Log2<N, 0, COUNT>
+{
+    enum {VALUE = (1 << (COUNT - 1) < N) ?                                  // Base case
+        COUNT :
+        COUNT - 1 };
+};
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+/**
+ * \brief Statically determine if N is a power-of-two
+ */
+template <int N>
+struct PowerOfTwo
+{
+    enum { VALUE = ((N & (N - 1)) == 0) };
+};
+
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+
+/******************************************************************************
+ * Pointer vs. iterator detection
+ ******************************************************************************/
+
+/**
+ * \brief Pointer vs. iterator
+ */
+template <typename Tp>
+struct IsPointer
+{
+    enum { VALUE = 0 };
+};
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+template <typename Tp>
+struct IsPointer<Tp*>
+{
+    enum { VALUE = 1 };
+};
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+
+/******************************************************************************
+ * Qualifier detection
+ ******************************************************************************/
+
+/**
+ * \brief Volatile modifier test
+ */
+template <typename Tp>
+struct IsVolatile
+{
+    enum { VALUE = 0 };
+};
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+template <typename Tp>
+struct IsVolatile<Tp volatile>
+{
+    enum { VALUE = 1 };
+};
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+/******************************************************************************
+ * Qualifier removal
+ ******************************************************************************/
+
+/**
+ * \brief Removes \p const and \p volatile qualifiers from type \p Tp.
+ *
+ * For example:
+ *     <tt>typename RemoveQualifiers<volatile int>::Type         // int;</tt>
+ */
+template <typename Tp, typename Up = Tp>
+struct RemoveQualifiers
+{
+    /// Type without \p const and \p volatile qualifiers
+    typedef Up Type;
+};
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+template <typename Tp, typename Up>
+struct RemoveQualifiers <Tp, volatile Up>
+{
+    typedef Up Type;
+};
+
+template <typename Tp, typename Up>
+struct RemoveQualifiers <Tp, const Up>
+{
+    typedef Up Type;
+};
+
+template <typename Tp, typename Up>
+struct RemoveQualifiers <Tp, const volatile Up>
+{
+    typedef Up Type;
+};
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+
+/******************************************************************************
+ * Typedef-detection
+ ******************************************************************************/
+
+
+/**
+ * \brief Defines a structure \p detector_name that is templated on type \p T.  The \p detector_name struct exposes a constant member \p VALUE indicating whether or not parameter \p T exposes a nested type \p nested_type_name
+ */
+#define CUB_DEFINE_DETECT_NESTED_TYPE(detector_name, nested_type_name)     \
+    template <typename T>                                                  \
+    struct detector_name                                                   \
+    {                                                                      \
+        template <typename C>                                              \
+        static char& test(typename C::nested_type_name*);                  \
+        template <typename>                                                \
+        static int& test(...);                                             \
+        enum                                                               \
+        {                                                                  \
+            VALUE = sizeof(test<T>(0)) < sizeof(int)                       \
+        };                                                                 \
+    };
+
+
+
+/******************************************************************************
+ * Simple enable-if (similar to Boost)
+ ******************************************************************************/
+
+/**
+ * \brief Simple enable-if (similar to Boost)
+ */
+template <bool Condition, class T = void>
+struct EnableIf
+{
+    /// Enable-if type for SFINAE dummy variables
+    typedef T Type;
+};
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+template <class T>
+struct EnableIf<false, T> {};
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+/******************************************************************************
+ * Typedef-detection
+ ******************************************************************************/
+
+/**
+ * \brief Determine whether or not BinaryOp's functor is of the form <tt>bool operator()(const T& a, const T& b)</tt> or <tt>bool operator()(const T& a, const T& b, unsigned int idx)</tt>
+ */
+template <typename T, typename BinaryOp>
+struct BinaryOpHasIdxParam
+{
+private:
+    template <typename BinaryOpT, bool (BinaryOpT::*)(const T &a, const T &b, unsigned int idx) const>  struct SFINAE1 {};
+    template <typename BinaryOpT, bool (BinaryOpT::*)(const T &a, const T &b, unsigned int idx)>        struct SFINAE2 {};
+    template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, unsigned int idx) const>                struct SFINAE3 {};
+    template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, unsigned int idx)>                      struct SFINAE4 {};
+
+    template <typename BinaryOpT, bool (BinaryOpT::*)(const T &a, const T &b, int idx) const>           struct SFINAE5 {};
+    template <typename BinaryOpT, bool (BinaryOpT::*)(const T &a, const T &b, int idx)>                 struct SFINAE6 {};
+    template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, int idx) const>                         struct SFINAE7 {};
+    template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, int idx)>                               struct SFINAE8 {};
+
+    template <typename BinaryOpT> static char Test(SFINAE1<BinaryOpT, &BinaryOpT::operator()> *);
+    template <typename BinaryOpT> static char Test(SFINAE2<BinaryOpT, &BinaryOpT::operator()> *);
+    template <typename BinaryOpT> static char Test(SFINAE3<BinaryOpT, &BinaryOpT::operator()> *);
+    template <typename BinaryOpT> static char Test(SFINAE4<BinaryOpT, &BinaryOpT::operator()> *);
+
+    template <typename BinaryOpT> static char Test(SFINAE5<BinaryOpT, &BinaryOpT::operator()> *);
+    template <typename BinaryOpT> static char Test(SFINAE6<BinaryOpT, &BinaryOpT::operator()> *);
+    template <typename BinaryOpT> static char Test(SFINAE7<BinaryOpT, &BinaryOpT::operator()> *);
+    template <typename BinaryOpT> static char Test(SFINAE8<BinaryOpT, &BinaryOpT::operator()> *);
+
+    template <typename BinaryOpT> static int Test(...);
+
+public:
+
+    /// Whether the functor BinaryOp has a third <tt>unsigned int</tt> index param
+    static const bool HAS_PARAM = sizeof(Test<BinaryOp>(NULL)) == sizeof(char);
+};
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+/******************************************************************************
+ * Simple type traits utilities.
+ *
+ * For example:
+ *     Traits<int>::CATEGORY             // SIGNED_INTEGER
+ *     Traits<NullType>::NULL_TYPE       // true
+ *     Traits<uint4>::CATEGORY           // NOT_A_NUMBER
+ *     Traits<uint4>::PRIMITIVE;         // false
+ *
+ ******************************************************************************/
+
+/**
+ * \brief Basic type traits categories
+ */
+enum Category
+{
+    NOT_A_NUMBER,
+    SIGNED_INTEGER,
+    UNSIGNED_INTEGER,
+    FLOATING_POINT
+};
+
+
+/**
+ * \brief Basic type traits
+ */
+template <Category _CATEGORY, bool _PRIMITIVE, bool _NULL_TYPE, typename _UnsignedBits>
+struct BaseTraits
+{
+    /// Category
+    static const Category CATEGORY = _CATEGORY;
+    enum
+    {
+        PRIMITIVE = _PRIMITIVE,
+        NULL_TYPE = _NULL_TYPE,
+    };
+};
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+/**
+ * Basic type traits (unsigned primitive specialization)
+ */
+template <typename _UnsignedBits>
+struct BaseTraits<UNSIGNED_INTEGER, true, false, _UnsignedBits>
+{
+    typedef _UnsignedBits       UnsignedBits;
+
+    static const Category       CATEGORY    = UNSIGNED_INTEGER;
+    static const UnsignedBits   MIN_KEY     = UnsignedBits(0);
+    static const UnsignedBits   MAX_KEY     = UnsignedBits(-1);
+
+    enum
+    {
+        PRIMITIVE = true,
+        NULL_TYPE = false,
+    };
+
+
+    static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key)
+    {
+        return key;
+    }
+
+    static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key)
+    {
+        return key;
+    }
+};
+
+
+/**
+ * Basic type traits (signed primitive specialization)
+ */
+template <typename _UnsignedBits>
+struct BaseTraits<SIGNED_INTEGER, true, false, _UnsignedBits>
+{
+    typedef _UnsignedBits       UnsignedBits;
+
+    static const Category       CATEGORY    = SIGNED_INTEGER;
+    static const UnsignedBits   HIGH_BIT    = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1);
+    static const UnsignedBits   MIN_KEY     = HIGH_BIT;
+    static const UnsignedBits   MAX_KEY     = UnsignedBits(-1) ^ HIGH_BIT;
+
+    enum
+    {
+        PRIMITIVE = true,
+        NULL_TYPE = false,
+    };
+
+    static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key)
+    {
+        return key ^ HIGH_BIT;
+    };
+
+    static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key)
+    {
+        return key ^ HIGH_BIT;
+    };
+
+};
+
+
+/**
+ * Basic type traits (fp primitive specialization)
+ */
+template <typename _UnsignedBits>
+struct BaseTraits<FLOATING_POINT, true, false, _UnsignedBits>
+{
+    typedef _UnsignedBits       UnsignedBits;
+
+    static const Category       CATEGORY    = FLOATING_POINT;
+    static const UnsignedBits   HIGH_BIT    = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1);
+    static const UnsignedBits   MIN_KEY     = UnsignedBits(-1);
+    static const UnsignedBits   MAX_KEY     = UnsignedBits(-1) ^ HIGH_BIT;
+
+    static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key)
+    {
+        UnsignedBits mask = (key & HIGH_BIT) ? UnsignedBits(-1) : HIGH_BIT;
+        return key ^ mask;
+    };
+
+    static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key)
+    {
+        UnsignedBits mask = (key & HIGH_BIT) ? HIGH_BIT : UnsignedBits(-1);
+        return key ^ mask;
+    };
+
+    enum
+    {
+        PRIMITIVE = true,
+        NULL_TYPE = false,
+    };
+};
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+/**
+ * \brief Numeric type traits
+ */
+template <typename T> struct NumericTraits :            BaseTraits<NOT_A_NUMBER, false, false, T> {};
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+template <> struct NumericTraits<NullType> :            BaseTraits<NOT_A_NUMBER, false, true, NullType> {};
+
+template <> struct NumericTraits<char> :                BaseTraits<(std::numeric_limits<char>::is_signed) ? SIGNED_INTEGER : UNSIGNED_INTEGER, true, false, unsigned char> {};
+template <> struct NumericTraits<signed char> :        BaseTraits<SIGNED_INTEGER, true, false, unsigned char> {};
+template <> struct NumericTraits<short> :              BaseTraits<SIGNED_INTEGER, true, false, unsigned short> {};
+template <> struct NumericTraits<int> :                BaseTraits<SIGNED_INTEGER, true, false, unsigned int> {};
+template <> struct NumericTraits<long> :               BaseTraits<SIGNED_INTEGER, true, false, unsigned long> {};
+template <> struct NumericTraits<long long> :          BaseTraits<SIGNED_INTEGER, true, false, unsigned long long> {};
+
+template <> struct NumericTraits<unsigned char> :      BaseTraits<UNSIGNED_INTEGER, true, false, unsigned char> {};
+template <> struct NumericTraits<unsigned short> :     BaseTraits<UNSIGNED_INTEGER, true, false, unsigned short> {};
+template <> struct NumericTraits<unsigned int> :       BaseTraits<UNSIGNED_INTEGER, true, false, unsigned int> {};
+template <> struct NumericTraits<unsigned long> :      BaseTraits<UNSIGNED_INTEGER, true, false, unsigned long> {};
+template <> struct NumericTraits<unsigned long long> : BaseTraits<UNSIGNED_INTEGER, true, false, unsigned long long> {};
+
+template <> struct NumericTraits<float> :              BaseTraits<FLOATING_POINT, true, false, unsigned int> {};
+template <> struct NumericTraits<double> :             BaseTraits<FLOATING_POINT, true, false, unsigned long long> {};
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+/**
+ * \brief Type traits
+ */
+template <typename T>
+struct Traits : NumericTraits<typename RemoveQualifiers<T>::Type> {};
+
+
+
+/** @} */       // end group UtilModule
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/SRC/cub/warp/specializations/warp_reduce_shfl.cuh b/SRC/cub/warp/specializations/warp_reduce_shfl.cuh
new file mode 100644
index 00000000..2f11eab6
--- /dev/null
+++ b/SRC/cub/warp/specializations/warp_reduce_shfl.cuh
@@ -0,0 +1,330 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp.
+ */
+
+#pragma once
+
+#include "../../thread/thread_operators.cuh"
+#include "../../util_ptx.cuh"
+#include "../../util_type.cuh"
+#include "../../util_macro.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp.
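+ *
+ * \par
+ * (Worked example of the packed SHFL control operand defined below: with
+ * LOGICAL_WARP_THREADS == 16 on a 32-thread physical warp, STEPS == 4, the
+ * segment mask is (-1 << 4) & 31 == 0x10 and the clamp is 15, so
+ * SHFL_C == (0x10 << 8) | 15 == 0x100F, which confines each
+ * <tt>shfl.down</tt> to its own 16-lane sub-segment.)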
+ */ +template < + typename T, ///< Data type being reduced + int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct WarpReduceShfl +{ + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + enum + { + /// Whether the logical warp size and the PTX warp size coincide + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + + /// The number of warp reduction steps + STEPS = Log2::VALUE, + + // The 5-bit SHFL mask for logically splitting warps into sub-segments + SHFL_MASK = (-1 << STEPS) & 31, + + // The 5-bit SFHL clamp + SHFL_CLAMP = LOGICAL_WARP_THREADS - 1, + + // The packed C argument (mask starts 8 bits up) + SHFL_C = (SHFL_MASK << 8) | SHFL_CLAMP, + }; + + + /// Shared memory storage layout type + typedef NullType TempStorage; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + int lane_id; + + + /****************************************************************************** + * Construction + ******************************************************************************/ + + /// Constructor + __device__ __forceinline__ WarpReduceShfl( + TempStorage &temp_storage) + : + lane_id(IS_ARCH_WARP ? + LaneId() : + LaneId() % LOGICAL_WARP_THREADS) + {} + + + /****************************************************************************** + * Operation + ******************************************************************************/ + + /// Summation (single-SHFL) + template < + bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items + int FOLDED_ITEMS_PER_LANE> ///< Number of items folded into each lane + __device__ __forceinline__ T Sum( + T input, ///< [in] Calling thread's input + int folded_items_per_warp, ///< [in] Total number of valid items folded into each logical warp + Int2Type single_shfl) ///< [in] Marker type indicating whether only one SHFL instruction is required + { + unsigned int output = reinterpret_cast(input); + + // Iterate reduction steps + #pragma unroll + for (int STEP = 0; STEP < STEPS; STEP++) + { + const int OFFSET = 1 << STEP; + + if (ALL_LANES_VALID) + { + // Use predicate set from SHFL to guard against invalid peers + asm( + "{" + " .reg .u32 r0;" + " .reg .pred p;" + " shfl.down.b32 r0|p, %1, %2, %3;" + " @p add.u32 r0, r0, %4;" + " mov.u32 %0, r0;" + "}" + : "=r"(output) : "r"(output), "r"(OFFSET), "r"(SHFL_C), "r"(output)); + } + else + { + // Set range predicate to guard against invalid peers + asm( + "{" + " .reg .u32 r0;" + " .reg .pred p;" + " shfl.down.b32 r0, %1, %2, %3;" + " setp.lt.u32 p, %5, %6;" + " mov.u32 %0, %1;" + " @p add.u32 %0, %1, r0;" + "}" + : "=r"(output) : "r"(output), "r"(OFFSET), "r"(SHFL_C), "r"(output), "r"((lane_id + OFFSET) * FOLDED_ITEMS_PER_LANE), "r"(folded_items_per_warp)); + } + } + + return output; + } + + + /// Summation (multi-SHFL) + template < + bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items + int FOLDED_ITEMS_PER_LANE> ///< Number of items folded into each lane + __device__ __forceinline__ T Sum( + T input, ///< [in] Calling thread's input + int folded_items_per_warp, ///< [in] Total number of valid items folded into each logical 
warp + Int2Type single_shfl) ///< [in] Marker type indicating whether only one SHFL instruction is required + { + // Delegate to generic reduce + return Reduce(input, folded_items_per_warp, cub::Sum()); + } + + + /// Summation (float) + template < + bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items + int FOLDED_ITEMS_PER_LANE> ///< Number of items folded into each lane + __device__ __forceinline__ float Sum( + float input, ///< [in] Calling thread's input + int folded_items_per_warp) ///< [in] Total number of valid items folded into each logical warp + { + T output = input; + + // Iterate reduction steps + #pragma unroll + for (int STEP = 0; STEP < STEPS; STEP++) + { + const int OFFSET = 1 << STEP; + + if (ALL_LANES_VALID) + { + // Use predicate set from SHFL to guard against invalid peers + asm( + "{" + " .reg .f32 r0;" + " .reg .pred p;" + " shfl.down.b32 r0|p, %1, %2, %3;" + " @p add.f32 r0, r0, %4;" + " mov.f32 %0, r0;" + "}" + : "=f"(output) : "f"(output), "r"(OFFSET), "r"(SHFL_C), "f"(output)); + } + else + { + // Set range predicate to guard against invalid peers + asm( + "{" + " .reg .f32 r0;" + " .reg .pred p;" + " shfl.down.b32 r0, %1, %2, %3;" + " setp.lt.u32 p, %5, %6;" + " mov.f32 %0, %1;" + " @p add.f32 %0, %0, r0;" + "}" + : "=f"(output) : "f"(output), "r"(OFFSET), "r"(SHFL_C), "f"(output), "r"((lane_id + OFFSET) * FOLDED_ITEMS_PER_LANE), "r"(folded_items_per_warp)); + } + } + + return output; + } + + /// Summation (generic) + template < + bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items + int FOLDED_ITEMS_PER_LANE, ///< Number of items folded into each lane + typename _T> + __device__ __forceinline__ _T Sum( + _T input, ///< [in] Calling thread's input + int folded_items_per_warp) ///< [in] Total number of valid items folded into each logical warp + { + // Whether sharing can be done with a single SHFL instruction (vs multiple SFHL instructions) + Int2Type<(Traits<_T>::PRIMITIVE) && (sizeof(_T) <= sizeof(unsigned int))> single_shfl; + + return Sum(input, folded_items_per_warp, single_shfl); + } + + + /// Reduction + template < + bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items + int FOLDED_ITEMS_PER_LANE, ///< Number of items folded into each lane + typename ReductionOp> + __device__ __forceinline__ T Reduce( + T input, ///< [in] Calling thread's input + int folded_items_per_warp, ///< [in] Total number of valid items folded into each logical warp + ReductionOp reduction_op) ///< [in] Binary reduction operator + { + T output = input; + + // Iterate scan steps + #pragma unroll + for (int STEP = 0; STEP < STEPS; STEP++) + { + // Grab addend from peer + const int OFFSET = 1 << STEP; + + T temp = ShuffleDown(output, OFFSET); + + // Perform reduction op if from a valid peer + if (ALL_LANES_VALID) + { + if (lane_id < LOGICAL_WARP_THREADS - OFFSET) + output = reduction_op(output, temp); + } + else + { + if (((lane_id + OFFSET) * FOLDED_ITEMS_PER_LANE) < folded_items_per_warp) + output = reduction_op(output, temp); + } + } + + return output; + } + + + /// Segmented reduction + template < + bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail + typename Flag, + typename ReductionOp> + __device__ __forceinline__ T SegmentedReduce( + T input, ///< [in] Calling thread's input + Flag flag, ///< [in] Whether or not the current lane is a segment head/tail + ReductionOp reduction_op) ///< [in] Binary reduction operator + 
{ + T output = input; + + // Get the start flags for each thread in the warp. + int warp_flags = __ballot(flag); + + if (!HEAD_SEGMENTED) + warp_flags <<= 1; + + // Keep bits above the current thread. + warp_flags &= LaneMaskGt(); + + // Accommodate packing of multiple logical warps in a single physical warp + if (!IS_ARCH_WARP) + { + warp_flags >>= (LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS; + } + + // Find next flag + int next_flag = __clz(__brev(warp_flags)); + + // Clip the next segment at the warp boundary if necessary + if (LOGICAL_WARP_THREADS != 32) + next_flag = CUB_MIN(next_flag, LOGICAL_WARP_THREADS); + + // Iterate scan steps + #pragma unroll + for (int STEP = 0; STEP < STEPS; STEP++) + { + // Grab addend from peer + const int OFFSET = 1 << STEP; + + T temp = ShuffleDown(output, OFFSET); + + // Perform reduction op if valid + if (OFFSET < next_flag - lane_id) + output = reduction_op(output, temp); + } + + return output; + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/warp/specializations/warp_reduce_smem.cuh b/SRC/cub/warp/specializations/warp_reduce_smem.cuh new file mode 100644 index 00000000..78d3ea23 --- /dev/null +++ b/SRC/cub/warp/specializations/warp_reduce_smem.cuh @@ -0,0 +1,358 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned across a CUDA thread warp. 
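+ *
+ * (Sizing note: each warp reserves 1.5 warps-worth of shared storage,
+ * WARP_SMEM_ELEMENTS = LOGICAL_WARP_THREADS + HALF_WARP_THREADS, so the
+ * unguarded read at <tt>lane_id + OFFSET</tt> in the reduction steps can
+ * never index past the end of the buffer: (LOGICAL_WARP_THREADS - 1) +
+ * HALF_WARP_THREADS == WARP_SMEM_ELEMENTS - 1.)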
+ */ + +#pragma once + +#include "../../thread/thread_operators.cuh" +#include "../../thread/thread_load.cuh" +#include "../../thread/thread_store.cuh" +#include "../../util_type.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned across a CUDA thread warp. + */ +template < + typename T, ///< Data type being reduced + int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct WarpReduceSmem +{ + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + enum + { + /// Whether the logical warp size and the PTX warp size coincide + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + + /// Whether the logical warp size is a power-of-two + IS_POW_OF_TWO = ((LOGICAL_WARP_THREADS & (LOGICAL_WARP_THREADS - 1)) == 0), + + /// The number of warp scan steps + STEPS = Log2::VALUE, + + /// The number of threads in half a warp + HALF_WARP_THREADS = 1 << (STEPS - 1), + + /// The number of shared memory elements per warp + WARP_SMEM_ELEMENTS = LOGICAL_WARP_THREADS + HALF_WARP_THREADS, + + /// Flag status (when not using ballot) + UNSET = 0x0, // Is initially unset + SET = 0x1, // Is initially set + SEEN = 0x2, // Has seen another head flag from a successor peer + }; + + /// Shared memory flag type + typedef unsigned char SmemFlag; + + /// Shared memory storage layout type (1.5 warps-worth of elements for each warp) + struct _TempStorage + { + T reduce[WARP_SMEM_ELEMENTS]; + SmemFlag flags[WARP_SMEM_ELEMENTS]; + }; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + _TempStorage &temp_storage; + int lane_id; + + + /****************************************************************************** + * Construction + ******************************************************************************/ + + /// Constructor + __device__ __forceinline__ WarpReduceSmem( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + lane_id(IS_ARCH_WARP ? 
+ LaneId() : + LaneId() % LOGICAL_WARP_THREADS) + {} + + + /****************************************************************************** + * Operation + ******************************************************************************/ + + /** + * Reduction step + */ + template < + bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items + int FOLDED_ITEMS_PER_LANE, ///< Number of items folded into each lane + typename ReductionOp, + int STEP> + __device__ __forceinline__ T ReduceStep( + T input, ///< [in] Calling thread's input + int folded_items_per_warp, ///< [in] Total number of valid items folded into each logical warp + ReductionOp reduction_op, ///< [in] Reduction operator + Int2Type step) + { + const int OFFSET = 1 << STEP; + + // Share input through buffer + ThreadStore(&temp_storage.reduce[lane_id], input); + + // Update input if peer_addend is in range + if ((ALL_LANES_VALID && IS_POW_OF_TWO) || ((lane_id + OFFSET) * FOLDED_ITEMS_PER_LANE < folded_items_per_warp)) + { + T peer_addend = ThreadLoad(&temp_storage.reduce[lane_id + OFFSET]); + input = reduction_op(input, peer_addend); + } + + return ReduceStep(input, folded_items_per_warp, reduction_op, Int2Type()); + } + + + /** + * Reduction step (terminate) + */ + template < + bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items + int FOLDED_ITEMS_PER_LANE, ///< Number of items folded into each lane + typename ReductionOp> + __device__ __forceinline__ T ReduceStep( + T input, ///< [in] Calling thread's input + int folded_items_per_warp, ///< [in] Total number of valid items folded into each logical warp + ReductionOp reduction_op, ///< [in] Reduction operator + Int2Type step) + { + return input; + } + + + /** + * Reduction + */ + template < + bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items + int FOLDED_ITEMS_PER_LANE, ///< Number of items folded into each lane + typename ReductionOp> + __device__ __forceinline__ T Reduce( + T input, ///< [in] Calling thread's input + int folded_items_per_warp, ///< [in] Total number of valid items folded into each logical warp + ReductionOp reduction_op) ///< [in] Reduction operator + { + return ReduceStep(input, folded_items_per_warp, reduction_op, Int2Type<0>()); + } + + + /** + * Ballot-based segmented reduce + */ + template < + bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail + typename Flag, + typename ReductionOp> + __device__ __forceinline__ T SegmentedReduce( + T input, ///< [in] Calling thread's input + Flag flag, ///< [in] Whether or not the current lane is a segment head/tail + ReductionOp reduction_op, ///< [in] Reduction operator + Int2Type has_ballot) ///< [in] Marker type for whether the target arch has ballot functionality + { + // Get the start flags for each thread in the warp. + int warp_flags = __ballot(flag); + + if (!HEAD_SEGMENTED) + warp_flags <<= 1; + + // Keep bits above the current thread. 
+ warp_flags &= LaneMaskGt(); + + // Accommodate packing of multiple logical warps in a single physical warp + if (!IS_ARCH_WARP) + { + warp_flags >>= (LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS; + } + + // Find next flag + int next_flag = __clz(__brev(warp_flags)); + + // Clip the next segment at the warp boundary if necessary + if (LOGICAL_WARP_THREADS != 32) + next_flag = CUB_MIN(next_flag, LOGICAL_WARP_THREADS); + + #pragma unroll + for (int STEP = 0; STEP < STEPS; STEP++) + { + const int OFFSET = 1 << STEP; + + // Share input into buffer + ThreadStore(&temp_storage.reduce[lane_id], input); + + // Update input if peer_addend is in range + if (OFFSET < next_flag - lane_id) + { + T peer_addend = ThreadLoad(&temp_storage.reduce[lane_id + OFFSET]); + input = reduction_op(input, peer_addend); + } + } + + return input; + } + + + /** + * Smem-based segmented reduce + */ + template < + bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail + typename Flag, + typename ReductionOp> + __device__ __forceinline__ T SegmentedReduce( + T input, ///< [in] Calling thread's input + Flag flag, ///< [in] Whether or not the current lane is a segment head/tail + ReductionOp reduction_op, ///< [in] Reduction operator + Int2Type has_ballot) ///< [in] Marker type for whether the target arch has ballot functionality + { + enum + { + UNSET = 0x0, // Is initially unset + SET = 0x1, // Is initially set + SEEN = 0x2, // Has seen another head flag from a successor peer + }; + + // Alias flags onto shared data storage + volatile SmemFlag *flag_storage = temp_storage.flags; + + SmemFlag flag_status = (flag) ? SET : UNSET; + + for (int STEP = 0; STEP < STEPS; STEP++) + { + const int OFFSET = 1 << STEP; + + // Share input through buffer + ThreadStore(&temp_storage.reduce[lane_id], input); + + // Get peer from buffer + T peer_addend = ThreadLoad(&temp_storage.reduce[lane_id + OFFSET]); + + // Share flag through buffer + flag_storage[lane_id] = flag_status; + + // Get peer flag from buffer + SmemFlag peer_flag_status = flag_storage[lane_id + OFFSET]; + + // Update input if peer was in range + if (lane_id < LOGICAL_WARP_THREADS - OFFSET) + { + if (HEAD_SEGMENTED) + { + // Head-segmented + if ((flag_status & SEEN) == 0) + { + // Has not seen a more distant head flag + if (peer_flag_status & SET) + { + // Has now seen a head flag + flag_status |= SEEN; + } + else + { + // Peer is not a head flag: grab its count + input = reduction_op(input, peer_addend); + } + + // Update seen status to include that of peer + flag_status |= (peer_flag_status & SEEN); + } + } + else + { + // Tail-segmented. 
Simply propagate flag status + if (!flag_status) + { + input = reduction_op(input, peer_addend); + flag_status |= peer_flag_status; + } + + } + } + } + + return input; + } + + + /** + * Segmented reduction + */ + template < + bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail + typename Flag, + typename ReductionOp> + __device__ __forceinline__ T SegmentedReduce( + T input, ///< [in] Calling thread's input + Flag flag, ///< [in] Whether or not the current lane is a segment head/tail + ReductionOp reduction_op) ///< [in] Reduction operator + { + return SegmentedReduce(input, flag, reduction_op, Int2Type<(PTX_ARCH >= 200)>()); + } + + + /** + * Summation + */ + template < + bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items + int FOLDED_ITEMS_PER_LANE> ///< Number of items folded into each lane + __device__ __forceinline__ T Sum( + T input, ///< [in] Calling thread's input + int folded_items_per_warp) ///< [in] Total number of valid items folded into each logical warp + { + return Reduce(input, folded_items_per_warp, cub::Sum()); + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/warp/specializations/warp_scan_shfl.cuh b/SRC/cub/warp/specializations/warp_scan_shfl.cuh new file mode 100644 index 00000000..43482986 --- /dev/null +++ b/SRC/cub/warp/specializations/warp_scan_shfl.cuh @@ -0,0 +1,401 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned across a CUDA thread warp. 
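+ *
+ * (Worked example of the packed SHFL control operand defined below: with
+ * LOGICAL_WARP_THREADS == 16 on a 32-thread physical warp, STEPS == 4 and
+ * SHFL_C == (((-1 << 4) & 31) << 8) == 0x1000; the low clamp byte is zero
+ * because <tt>shfl.up</tt> clamps against the segment's lower bound rather
+ * than its upper bound.)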
+ */ + +#pragma once + +#include "../../thread/thread_operators.cuh" +#include "../../util_type.cuh" +#include "../../util_ptx.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned across a CUDA thread warp. + */ +template < + typename T, ///< Data type being scanned + int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct WarpScanShfl +{ + + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + enum + { + /// Whether the logical warp size and the PTX warp size coincide + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + + /// The number of warp scan steps + STEPS = Log2::VALUE, + + // The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up + SHFL_C = ((-1 << STEPS) & 31) << 8, + + // Whether the data type is a small (32b or less) integer for which we can use a single SFHL instruction per exchange + SMALL_INTEGER = ((Traits::CATEGORY == UNSIGNED_INTEGER) || (Traits::CATEGORY == SIGNED_INTEGER)) && (sizeof(T) <= sizeof(unsigned int)) + }; + + /// Shared memory storage layout type + typedef NullType TempStorage; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + int lane_id; + + /****************************************************************************** + * Construction + ******************************************************************************/ + + /// Constructor + __device__ __forceinline__ WarpScanShfl( + TempStorage &temp_storage) + : + lane_id(IS_ARCH_WARP ? + LaneId() : + LaneId() % LOGICAL_WARP_THREADS) + {} + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Inclusive prefix scan (specialized for summation across primitive integer types 32b or smaller) + template + __device__ __forceinline__ void InclusiveScan( + _T input, ///< [in] Calling thread's input item. + _T &output, ///< [out] Calling thread's output item. May be aliased with \p input. + Sum scan_op, ///< [in] Binary scan operator + Int2Type is_small_integer) ///< [in] Marker type indicating whether T is a small integer + { + unsigned int temp = reinterpret_cast(input); + + // Iterate scan steps + #pragma unroll + for (int STEP = 0; STEP < STEPS; STEP++) + { + // Use predicate set from SHFL to guard against invalid peers + asm( + "{" + " .reg .u32 r0;" + " .reg .pred p;" + " shfl.up.b32 r0|p, %1, %2, %3;" + " @p add.u32 r0, r0, %4;" + " mov.u32 %0, r0;" + "}" + : "=r"(temp) : "r"(temp), "r"(1 << STEP), "r"(SHFL_C), "r"(temp)); + } + + output = reinterpret_cast<_T&>(temp); + } + + + /// Inclusive prefix scan (specialized for summation across float types) + __device__ __forceinline__ void InclusiveScan( + float input, ///< [in] Calling thread's input item. + float &output, ///< [out] Calling thread's output item. May be aliased with \p input. 
+ Sum scan_op, ///< [in] Binary scan operator + Int2Type is_small_integer) ///< [in] Marker type indicating whether T is a small integer + { + output = input; + + // Iterate scan steps + #pragma unroll + for (int STEP = 0; STEP < STEPS; STEP++) + { + // Use predicate set from SHFL to guard against invalid peers + asm( + "{" + " .reg .f32 r0;" + " .reg .pred p;" + " shfl.up.b32 r0|p, %1, %2, %3;" + " @p add.f32 r0, r0, %4;" + " mov.f32 %0, r0;" + "}" + : "=f"(output) : "f"(output), "r"(1 << STEP), "r"(SHFL_C), "f"(output)); + } + } + + + /// Inclusive prefix scan (specialized for summation across unsigned long long types) + __device__ __forceinline__ void InclusiveScan( + unsigned long long input, ///< [in] Calling thread's input item. + unsigned long long &output, ///< [out] Calling thread's output item. May be aliased with \p input. + Sum scan_op, ///< [in] Binary scan operator + Int2Type is_small_integer) ///< [in] Marker type indicating whether T is a small integer + { + output = input; + + // Iterate scan steps + #pragma unroll + for (int STEP = 0; STEP < STEPS; STEP++) + { + // Use predicate set from SHFL to guard against invalid peers + asm( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.up.b32 lo|p, lo, %2, %3;" + " shfl.up.b32 hi|p, hi, %2, %3;" + " mov.b64 %0, {lo, hi};" + " @p add.u64 %0, %0, %1;" + "}" + : "=l"(output) : "l"(output), "r"(1 << STEP), "r"(SHFL_C)); + } + } + + + /// Inclusive prefix scan (specialized for summation across long long types) + __device__ __forceinline__ void InclusiveScan( + long long input, ///< [in] Calling thread's input item. + long long &output, ///< [out] Calling thread's output item. May be aliased with \p input. + Sum scan_op, ///< [in] Binary scan operator + Int2Type is_small_integer) ///< [in] Marker type indicating whether T is a small integer + { + output = input; + + // Iterate scan steps + #pragma unroll + for (int STEP = 0; STEP < STEPS; STEP++) + { + // Use predicate set from SHFL to guard against invalid peers + asm( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.up.b32 lo|p, lo, %2, %3;" + " shfl.up.b32 hi|p, hi, %2, %3;" + " mov.b64 %0, {lo, hi};" + " @p add.s64 %0, %0, %1;" + "}" + : "=l"(output) : "l"(output), "r"(1 << STEP), "r"(SHFL_C)); + } + } + + + /// Inclusive prefix scan (specialized for summation across double types) + __device__ __forceinline__ void InclusiveScan( + double input, ///< [in] Calling thread's input item. + double &output, ///< [out] Calling thread's output item. May be aliased with \p input. + Sum scan_op, ///< [in] Binary scan operator + Int2Type is_small_integer) ///< [in] Marker type indicating whether T is a small integer + { + output = input; + + // Iterate scan steps + #pragma unroll + for (int STEP = 0; STEP < STEPS; STEP++) + { + // Use predicate set from SHFL to guard against invalid peers + asm( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.up.b32 lo|p, lo, %2, %3;" + " shfl.up.b32 hi|p, hi, %2, %3;" + " mov.b64 %0, {lo, hi};" + " @p add.f64 %0, %0, %1;" + "}" + : "=d"(output) : "d"(output), "r"(1 << STEP), "r"(SHFL_C)); + } + } + + + /// Inclusive prefix scan + template + __device__ __forceinline__ void InclusiveScan( + _T input, ///< [in] Calling thread's input item. + _T &output, ///< [out] Calling thread's output item. May be aliased with \p input. 
+ ScanOp scan_op, ///< [in] Binary scan operator + Int2Type is_small_integer) ///< [in] Marker type indicating whether T is a small integer + { + output = input; + + // Iterate scan steps + #pragma unroll + for (int STEP = 0; STEP < STEPS; STEP++) + { + // Grab addend from peer + const int OFFSET = 1 << STEP; + T temp = ShuffleUp(output, OFFSET); + + // Perform scan op if from a valid peer + if (lane_id >= OFFSET) + output = scan_op(temp, output); + } + } + + + + /****************************************************************************** + * Interface + ******************************************************************************/ + + + /// Broadcast + __device__ __forceinline__ T Broadcast( + T input, ///< [in] The value to broadcast + int src_lane) ///< [in] Which warp lane is to do the broadcasting + { + return ShuffleBroadcast(input, src_lane, LOGICAL_WARP_THREADS); + } + + + //--------------------------------------------------------------------- + // Inclusive operations + //--------------------------------------------------------------------- + + /// Inclusive scan + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item. + T &output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOp scan_op) ///< [in] Binary scan operator + { + InclusiveScan(input, output, scan_op, Int2Type()); + } + + + /// Inclusive scan with aggregate + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item. + T &output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOp scan_op, ///< [in] Binary scan operator + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. + { + InclusiveScan(input, output, scan_op); + + // Grab aggregate from last warp lane + warp_aggregate = Broadcast(output, LOGICAL_WARP_THREADS - 1); + } + + + //--------------------------------------------------------------------- + // Combo (inclusive & exclusive) operations + //--------------------------------------------------------------------- + + /// Combination scan without identity + template + __device__ __forceinline__ void Scan( + T input, ///< [in] Calling thread's input item. + T &inclusive_output, ///< [out] Calling thread's inclusive-scan output item. + T &exclusive_output, ///< [out] Calling thread's exclusive-scan output item. + ScanOp scan_op) ///< [in] Binary scan operator + { + // Compute inclusive scan + InclusiveScan(input, inclusive_output, scan_op); + + // Grab result from predecessor + exclusive_output = ShuffleUp(inclusive_output, 1); + } + + /// Combination scan with identity + template + __device__ __forceinline__ void Scan( + T input, ///< [in] Calling thread's input item. + T &inclusive_output, ///< [out] Calling thread's inclusive-scan output item. + T &exclusive_output, ///< [out] Calling thread's exclusive-scan output item. + T identity, ///< [in] Identity value + ScanOp scan_op) ///< [in] Binary scan operator + { + // Compute inclusive scan + InclusiveScan(input, inclusive_output, scan_op); + + // Grab result from predecessor + exclusive_output = ShuffleUp(inclusive_output, 1); + + exclusive_output = (lane_id == 0) ? 
+ identity : + exclusive_output; + } + + + //--------------------------------------------------------------------- + // Exclusive operations + //--------------------------------------------------------------------- + + /// Exclusive scan with aggregate + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item. + T &output, ///< [out] Calling thread's output item. May be aliased with \p input. + T identity, ///< [in] Identity value + ScanOp scan_op, ///< [in] Binary scan operator + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. + { + T inclusive_output; + Scan(input, inclusive_output, output, identity, scan_op); + + // Grab aggregate from last warp lane + warp_aggregate = Broadcast(inclusive_output, LOGICAL_WARP_THREADS - 1); + } + + + /// Exclusive scan with aggregate, without identity + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item. + T &output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOp scan_op, ///< [in] Binary scan operator + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. + { + T inclusive_output; + Scan(input, inclusive_output, output, scan_op); + + // Grab aggregate from last warp lane + warp_aggregate = Broadcast(inclusive_output, LOGICAL_WARP_THREADS - 1); + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/warp/specializations/warp_scan_smem.cuh b/SRC/cub/warp/specializations/warp_scan_smem.cuh new file mode 100644 index 00000000..e23ebc41 --- /dev/null +++ b/SRC/cub/warp/specializations/warp_scan_smem.cuh @@ -0,0 +1,319 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned across a CUDA thread warp.
+ */
+
+#pragma once
+
+#include "../../thread/thread_operators.cuh"
+#include "../../thread/thread_load.cuh"
+#include "../../thread/thread_store.cuh"
+#include "../../util_type.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned across a CUDA thread warp.
+ */
+template <
+    typename    T,                      ///< Data type being scanned
+    int         LOGICAL_WARP_THREADS,   ///< Number of threads per logical warp
+    int         PTX_ARCH>               ///< The PTX compute capability for which to specialize this collective
+struct WarpScanSmem
+{
+    /******************************************************************************
+     * Constants and type definitions
+     ******************************************************************************/
+
+    enum
+    {
+        /// Whether the logical warp size and the PTX warp size coincide
+        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
+
+        /// The number of warp scan steps
+        STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,
+
+        /// The number of threads in half a warp
+        HALF_WARP_THREADS = 1 << (STEPS - 1),
+
+        /// The number of shared memory elements per warp
+        WARP_SMEM_ELEMENTS = LOGICAL_WARP_THREADS + HALF_WARP_THREADS,
+    };
+
+    /// Storage cell type (workaround for SM1x compiler bugs with custom-ops like Max() on signed chars)
+    typedef typename If<((Equals<T, char>::VALUE || Equals<T, unsigned char>::VALUE) && (PTX_ARCH < 200)), int, T>::Type CellT;
+
+    /// Shared memory storage layout type (1.5 warps-worth of elements for each warp)
+    typedef CellT _TempStorage[WARP_SMEM_ELEMENTS];
+
+    // Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    _TempStorage    &temp_storage;
+    unsigned int    lane_id;
+
+
+    /******************************************************************************
+     * Construction
+     ******************************************************************************/
+
+    /// Constructor
+    __device__ __forceinline__ WarpScanSmem(
+        TempStorage     &temp_storage)
+    :
+        temp_storage(temp_storage.Alias()),
+        lane_id(IS_ARCH_WARP ?
+            LaneId() :
+            LaneId() % LOGICAL_WARP_THREADS)
+    {}
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /// Basic inclusive scan iteration (template unrolled, base-case specialization)
+    template <
+        bool        HAS_IDENTITY,
+        typename    ScanOp>
+    __device__ __forceinline__ void ScanStep(
+        T                   &partial,
+        ScanOp              scan_op,
+        Int2Type<STEPS>     step)
+    {}
+
+
+    /// Basic inclusive scan iteration (template unrolled, inductive-case specialization)
+    template <
+        bool        HAS_IDENTITY,
+        int         STEP,
+        typename    ScanOp>
+    __device__ __forceinline__ void ScanStep(
+        T                   &partial,
+        ScanOp              scan_op,
+        Int2Type<STEP>      step)
+    {
+        const int OFFSET = 1 << STEP;
+
+        // Share partial into buffer
+        ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) partial);
+
+        // Update partial if addend is in range
+        if (HAS_IDENTITY || (lane_id >= OFFSET))
+        {
+            T addend = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id - OFFSET]);
+            partial = scan_op(addend, partial);
+        }
+
+        ScanStep<HAS_IDENTITY>(partial, scan_op, Int2Type<STEP + 1>());
+    }
+
+
+    /// Inclusive prefix scan with identity
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,              ///< [in] Calling thread's input item.
+        T               &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
+        T               identity,           ///< [in] Identity value
+        ScanOp          scan_op)            ///< [in] Binary scan operator
+    {
+        ThreadStore<STORE_VOLATILE>(&temp_storage[lane_id], (CellT) identity);
+
+        // Iterate scan steps
+        output = input;
+        ScanStep<true>(output, scan_op, Int2Type<0>());
+    }
+
+
+    /// Inclusive prefix scan (specialized for summation across primitive types)
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,              ///< [in] Calling thread's input item.
+        T               &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
+        Sum             scan_op,            ///< [in] Binary scan operator
+        Int2Type<true>  is_primitive)       ///< [in] Marker type indicating whether T is primitive type
+    {
+        T identity = ZeroInitialize<T>();
+        InclusiveScan(input, output, identity, scan_op);
+    }
+
+
+    /// Inclusive prefix scan
+    template <typename ScanOp, int IS_PRIMITIVE>
+    __device__ __forceinline__ void InclusiveScan(
+        T                       input,              ///< [in] Calling thread's input item.
+        T                       &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
+        ScanOp                  scan_op,            ///< [in] Binary scan operator
+        Int2Type<IS_PRIMITIVE> is_primitive)        ///< [in] Marker type indicating whether T is primitive type
+    {
+        // Iterate scan steps
+        output = input;
+        ScanStep<false>(output, scan_op, Int2Type<0>());
+    }
+
+
+
+    /******************************************************************************
+     * Interface
+     ******************************************************************************/
+
+    /// Broadcast
+    __device__ __forceinline__ T Broadcast(
+        T               input,              ///< [in] The value to broadcast
+        unsigned int    src_lane)           ///< [in] Which warp lane is to do the broadcasting
+    {
+        if (lane_id == src_lane)
+        {
+            ThreadStore<STORE_VOLATILE>(temp_storage, (CellT) input);
+        }
+
+        return (T) ThreadLoad<LOAD_VOLATILE>(temp_storage);
+    }
+
+
+    //---------------------------------------------------------------------
+    // Inclusive operations
+    //---------------------------------------------------------------------
+
+    /// Inclusive scan
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,              ///< [in] Calling thread's input item.
+        T               &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
+        ScanOp          scan_op)            ///< [in] Binary scan operator
+    {
+        InclusiveScan(input, output, scan_op, Int2Type<Traits<T>::PRIMITIVE>());
+    }
+
+
+    /// Inclusive scan with aggregate
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T               input,              ///< [in] Calling thread's input item.
+        T               &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
+    {
+        InclusiveScan(input, output, scan_op);
+
+        // Retrieve aggregate
+        ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) output);
+        warp_aggregate = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[WARP_SMEM_ELEMENTS - 1]);
+    }
+
+
+    //---------------------------------------------------------------------
+    // Combo (inclusive & exclusive) operations
+    //---------------------------------------------------------------------
+
+    /// Combination scan without identity
+    template <typename ScanOp>
+    __device__ __forceinline__ void Scan(
+        T               input,              ///< [in] Calling thread's input item.
+        T               &inclusive_output,  ///< [out] Calling thread's inclusive-scan output item.
+        T               &exclusive_output,  ///< [out] Calling thread's exclusive-scan output item.
+        ScanOp          scan_op)            ///< [in] Binary scan operator
+    {
+        // Compute inclusive scan
+        InclusiveScan(input, inclusive_output, scan_op);
+
+        // Grab result from predecessor
+        ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive_output);
+        exclusive_output = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id - 1]);
+    }
+
+    /// Combination scan with identity
+    template <typename ScanOp>
+    __device__ __forceinline__ void Scan(
+        T               input,              ///< [in] Calling thread's input item.
+        T               &inclusive_output,  ///< [out] Calling thread's inclusive-scan output item.
+        T               &exclusive_output,  ///< [out] Calling thread's exclusive-scan output item.
+        T               identity,           ///< [in] Identity value
+        ScanOp          scan_op)            ///< [in] Binary scan operator
+    {
+        // Compute inclusive scan
+        InclusiveScan(input, inclusive_output, identity, scan_op);
+
+        // Grab result from predecessor
+        ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive_output);
+        exclusive_output = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id - 1]);
+    }
+
+
+    //---------------------------------------------------------------------
+    // Exclusive operations
+    //---------------------------------------------------------------------
+
+    /// Exclusive scan with aggregate
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Calling thread's input item.
+        T               &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
+        T               identity,           ///< [in] Identity value
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
+    {
+        T inclusive_output;
+        Scan(input, inclusive_output, output, identity, scan_op);
+
+        // Retrieve aggregate
+        warp_aggregate = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[WARP_SMEM_ELEMENTS - 1]);
+    }
+
+
+    /// Exclusive scan with aggregate, without identity
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T               input,              ///< [in] Calling thread's input item.
+        T               &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
+        ScanOp          scan_op,            ///< [in] Binary scan operator
+        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
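+    // Note: because no identity value is supplied here, the exclusive output
+    // computed for warp-lane0 below is undefined (lane0 reads an smem cell
+    // that is never written when the scan runs without an identity).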
+    {
+        T inclusive_output;
+        Scan(input, inclusive_output, output, scan_op);
+
+        // Retrieve aggregate
+        warp_aggregate = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[WARP_SMEM_ELEMENTS - 1]);
+    }
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/SRC/cub/warp/warp_reduce.cuh b/SRC/cub/warp/warp_reduce.cuh
new file mode 100644
index 00000000..1cd3fe0c
--- /dev/null
+++ b/SRC/cub/warp/warp_reduce.cuh
@@ -0,0 +1,627 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::WarpReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread warp.
+ */
+
+#pragma once
+
+#include "specializations/warp_reduce_shfl.cuh"
+#include "specializations/warp_reduce_smem.cuh"
+#include "../thread/thread_operators.cuh"
+#include "../util_arch.cuh"
+#include "../util_type.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \addtogroup WarpModule
+ * @{
+ */
+
+/**
+ * \brief The WarpReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread warp. ![](warp_reduce_logo.png)
+ *
+ * \tparam T                        The reduction input/output element type
+ * \tparam LOGICAL_WARP_THREADS     [optional] The number of threads per "logical" warp (may be less than the number of hardware warp threads).  Default is the warp size of the targeted CUDA compute-capability (e.g., 32 threads for SM20).
+ * \tparam PTX_ARCH                 [optional] \ptxversion
+ *
+ * \par Overview
+ * - A reduction (or fold)
+ *   uses a binary combining operator to compute a single aggregate from a list of input elements.
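+ *   (For example, the sum-reduction of the list {1, 2, 3, 4} under the + operator is 10.)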
+ * - Supports "logical" warps smaller than the physical warp size (e.g., logical warps of 8 threads)
+ * - The number of entrant threads must be a multiple of \p LOGICAL_WARP_THREADS
+ *
+ * \par Performance Considerations
+ * - Uses special instructions when applicable (e.g., warp \p SHFL instructions)
+ * - Uses synchronization-free communication between warp lanes when applicable
+ * - Incurs zero bank conflicts for most types
+ * - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
+ *     - Summation (vs. generic reduction)
+ *     - The architecture's warp size is a whole multiple of \p LOGICAL_WARP_THREADS
+ *
+ * \par Simple Examples
+ * \warpcollective{WarpReduce}
+ * \par
+ * The code snippet below illustrates four concurrent warp sum reductions within a block of
+ * 128 threads (one per each of the 32-thread warps).
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Specialize WarpReduce for type int
+ *     typedef cub::WarpReduce<int> WarpReduce;
+ *
+ *     // Allocate WarpReduce shared memory for 4 warps
+ *     __shared__ typename WarpReduce::TempStorage temp_storage[4];
+ *
+ *     // Obtain one input item per thread
+ *     int thread_data = ...
+ *
+ *     // Return the warp-wide sums to each lane0 (threads 0, 32, 64, and 96)
+ *     int warp_id = threadIdx.x / 32;
+ *     int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is {0, 1, 2, 3, ..., 127}.
+ * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will be \p 496, \p 1520,
+ * \p 2544, and \p 3568, respectively (and is undefined in other threads).
+ *
+ * \par
+ * The code snippet below illustrates a single warp sum reduction within a block of
+ * 128 threads.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Specialize WarpReduce for type int
+ *     typedef cub::WarpReduce<int> WarpReduce;
+ *
+ *     // Allocate WarpReduce shared memory for one warp
+ *     __shared__ typename WarpReduce::TempStorage temp_storage;
+ *     ...
+ *
+ *     // Only the first warp performs a reduction
+ *     if (threadIdx.x < 32)
+ *     {
+ *         // Obtain one input item per thread
+ *         int thread_data = ...
+ *
+ *         // Return the warp-wide sum to lane0
+ *         int aggregate = WarpReduce(temp_storage).Sum(thread_data);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the warp of threads is {0, 1, 2, 3, ..., 31}.
+ * The corresponding output \p aggregate in thread0 will be \p 496 (and is undefined in other threads).
+ *
+ */
+template <
+    typename    T,
+    int         LOGICAL_WARP_THREADS    = CUB_PTX_WARP_THREADS,
+    int         PTX_ARCH                = CUB_PTX_ARCH>
+class WarpReduce
+{
+private:
+
+    /******************************************************************************
+     * Constants and type definitions
+     ******************************************************************************/
+
+    enum
+    {
+        /// Whether the logical warp size and the PTX warp size coincide
+        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
+
+        /// Whether the logical warp size is a power-of-two
+        IS_POW_OF_TWO = PowerOfTwo<LOGICAL_WARP_THREADS>::VALUE,
+    };
+
+public:
+
+    #ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+    /// Internal specialization.
Use SHFL-based reduction if (architecture is >= SM30) and (LOGICAL_WARP_THREADS is a power-of-two)
+    typedef typename If<(PTX_ARCH >= 300) && (IS_POW_OF_TWO),
+        WarpReduceShfl<T, LOGICAL_WARP_THREADS, PTX_ARCH>,
+        WarpReduceSmem<T, LOGICAL_WARP_THREADS, PTX_ARCH> >::Type InternalWarpReduce;
+
+    #endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+private:
+
+    /// Shared memory storage layout type for WarpReduce
+    typedef typename InternalWarpReduce::TempStorage _TempStorage;
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Shared storage reference
+    _TempStorage    &temp_storage;
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+public:
+
+    /// \smemstorage{WarpReduce}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.  Logical warp and lane identifiers are constructed from threadIdx.x.
+     */
+    __device__ __forceinline__ WarpReduce(
+        TempStorage &temp_storage)          ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias())
+    {}
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Summation reductions
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes a warp-wide sum in the calling warp.  The output is valid in warp lane0.
+     *
+     * \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp sum reductions within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpReduce for type int
+     *     typedef cub::WarpReduce<int> WarpReduce;
+     *
+     *     // Allocate WarpReduce shared memory for 4 warps
+     *     __shared__ typename WarpReduce::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Return the warp-wide sums to each lane0
+     *     int warp_id = threadIdx.x / 32;
+     *     int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is {0, 1, 2, 3, ..., 127}.
+     * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will be \p 496, \p 1520,
+     * \p 2544, and \p 3568, respectively (and is undefined in other threads).
+     *
+     */
+    __device__ __forceinline__ T Sum(
+        T                   input)              ///< [in] Calling thread's input
+    {
+        return InternalWarpReduce(temp_storage).template Sum<true, 1>(input, LOGICAL_WARP_THREADS);
+    }
+
+    /**
+     * \brief Computes a partially-full warp-wide sum in the calling warp.  The output is valid in warp lane0.
+     *
+     * All threads across the calling warp must agree on the same value for \p valid_items.  Otherwise the result is undefined.
+     *
+     * \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a sum reduction within a single, partially-full
+     * block of 32 threads (one warp).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, int valid_items)
+     * {
+     *     // Specialize WarpReduce for type int
+     *     typedef cub::WarpReduce<int> WarpReduce;
+     *
+     *     // Allocate WarpReduce shared memory for one warp
+     *     __shared__ typename WarpReduce::TempStorage temp_storage;
+     *
+     *     // Obtain one input item per thread if in range
+     *     int thread_data;
+     *     if (threadIdx.x < valid_items)
+     *         thread_data = d_data[threadIdx.x];
+     *
+     *     // Return the warp-wide sums to each lane0
+     *     int aggregate = WarpReduce(temp_storage).Sum(
+     *         thread_data, valid_items);
+     *
+     * \endcode
+     * \par
+     * Suppose the input \p d_data is {0, 1, 2, 3, 4, ... and \p valid_items
+     * is \p 4.  The corresponding output \p aggregate in thread0 is \p 6 (and is
+     * undefined in other threads).
+     *
+     */
+    __device__ __forceinline__ T Sum(
+        T                   input,              ///< [in] Calling thread's input
+        int                 valid_items)        ///< [in] Total number of valid items in the calling thread's logical warp (may be less than \p LOGICAL_WARP_THREADS)
+    {
+        // Determine if we don't need bounds checking
+        if (valid_items >= LOGICAL_WARP_THREADS)
+        {
+            return InternalWarpReduce(temp_storage).template Sum<true, 1>(input, valid_items);
+        }
+        else
+        {
+            return InternalWarpReduce(temp_storage).template Sum<false, 1>(input, valid_items);
+        }
+    }
+
+
+    /**
+     * \brief Computes a segmented sum in the calling warp where segments are defined by head-flags.  The sum of each segment is returned to the first lane in that segment (which always includes lane0).
+     *
+     * \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a head-segmented warp sum
+     * reduction within a block of 32 threads (one warp).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpReduce for type int
+     *     typedef cub::WarpReduce<int> WarpReduce;
+     *
+     *     // Allocate WarpReduce shared memory for one warp
+     *     __shared__ typename WarpReduce::TempStorage temp_storage;
+     *
+     *     // Obtain one input item and flag per thread
+     *     int thread_data = ...
+     *     int head_flag = ...
+     *
+     *     // Return the warp-wide sums to each lane0
+     *     int aggregate = WarpReduce(temp_storage).HeadSegmentedSum(
+     *         thread_data, head_flag);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data and \p head_flag across the block of threads
+     * is {0, 1, 2, 3, ..., 31 and is {1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0,
+     * respectively.  The corresponding output \p aggregate in threads 0, 4, 8, etc. will be
+     * \p 6, \p 22, \p 38, etc. (and is undefined in other threads).
+     *
+     * \tparam ReductionOp     [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b)
+     *
+     */
+    template <
+        typename            Flag>
+    __device__ __forceinline__ T HeadSegmentedSum(
+        T                   input,              ///< [in] Calling thread's input
+        Flag                head_flag)          ///< [in] Head flag denoting whether or not \p input is the start of a new segment
+    {
+        return HeadSegmentedReduce(input, head_flag, cub::Sum());
+    }
+
+
+    /**
+     * \brief Computes a segmented sum in the calling warp where segments are defined by tail-flags.  The sum of each segment is returned to the first lane in that segment (which always includes lane0).
+     *
+     * \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a tail-segmented warp sum
+     * reduction within a block of 32 threads (one warp).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpReduce for type int
+     *     typedef cub::WarpReduce<int> WarpReduce;
+     *
+     *     // Allocate WarpReduce shared memory for one warp
+     *     __shared__ typename WarpReduce::TempStorage temp_storage;
+     *
+     *     // Obtain one input item and flag per thread
+     *     int thread_data = ...
+     *     int tail_flag = ...
+     *
+     *     // Return the warp-wide sums to each lane0
+     *     int aggregate = WarpReduce(temp_storage).TailSegmentedSum(
+     *         thread_data, tail_flag);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data and \p tail_flag across the block of threads
+     * is {0, 1, 2, 3, ..., 31 and is {0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1,
+     * respectively.  The corresponding output \p aggregate in threads 0, 4, 8, etc. will be
+     * \p 6, \p 22, \p 38, etc. (and is undefined in other threads).
+     *
+     * \tparam ReductionOp     [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b)
+     */
+    template <
+        typename            Flag>
+    __device__ __forceinline__ T TailSegmentedSum(
+        T                   input,              ///< [in] Calling thread's input
+        Flag                tail_flag)          ///< [in] Tail flag denoting whether or not \p input is the end of the current segment
+    {
+        return TailSegmentedReduce(input, tail_flag, cub::Sum());
+    }
+
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Generic reductions
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Computes a warp-wide reduction in the calling warp using the specified binary reduction functor.  The output is valid in warp lane0.
+     *
+     * Supports non-commutative reduction operators
+     *
+     * \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp max reductions within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpReduce for type int
+     *     typedef cub::WarpReduce<int> WarpReduce;
+     *
+     *     // Allocate WarpReduce shared memory for 4 warps
+     *     __shared__ typename WarpReduce::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Return the warp-wide reductions to each lane0
+     *     int warp_id = threadIdx.x / 32;
+     *     int aggregate = WarpReduce(temp_storage[warp_id]).Reduce(
+     *         thread_data, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is {0, 1, 2, 3, ..., 127}.
+     * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will be \p 31, \p 63,
+     * \p 95, and \p 127, respectively (and is undefined in other threads).
+     *
+     * \tparam ReductionOp     [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b)
+     */
+    template <typename ReductionOp>
+    __device__ __forceinline__ T Reduce(
+        T                   input,              ///< [in] Calling thread's input
+        ReductionOp         reduction_op)       ///< [in] Binary reduction operator
+    {
+        return InternalWarpReduce(temp_storage).template Reduce<true, 1>(input, LOGICAL_WARP_THREADS, reduction_op);
+    }
+
+    /**
+     * \brief Computes a partially-full warp-wide reduction in the calling warp using the specified binary reduction functor.  The output is valid in warp lane0.
+     *
+     * All threads across the calling warp must agree on the same value for \p valid_items.  Otherwise the result is undefined.
+     *
+     * Supports non-commutative reduction operators
+     *
+     * \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a max reduction within a single, partially-full
+     * block of 32 threads (one warp).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, int valid_items)
+     * {
+     *     // Specialize WarpReduce for type int
+     *     typedef cub::WarpReduce<int> WarpReduce;
+     *
+     *     // Allocate WarpReduce shared memory for one warp
+     *     __shared__ typename WarpReduce::TempStorage temp_storage;
+     *
+     *     // Obtain one input item per thread if in range
+     *     int thread_data;
+     *     if (threadIdx.x < valid_items)
+     *         thread_data = d_data[threadIdx.x];
+     *
+     *     // Return the warp-wide reductions to each lane0
+     *     int aggregate = WarpReduce(temp_storage).Reduce(
+     *         thread_data, cub::Max(), valid_items);
+     *
+     * \endcode
+     * \par
+     * Suppose the input \p d_data is {0, 1, 2, 3, 4, ... and \p valid_items
+     * is \p 4.  The corresponding output \p aggregate in thread0 is \p 3 (and is
+     * undefined in other threads).
+     *
+     * \tparam ReductionOp     [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b)
+     */
+    template <typename ReductionOp>
+    __device__ __forceinline__ T Reduce(
+        T                   input,              ///< [in] Calling thread's input
+        ReductionOp         reduction_op,       ///< [in] Binary reduction operator
+        int                 valid_items)        ///< [in] Total number of valid items in the calling thread's logical warp (may be less than \p LOGICAL_WARP_THREADS)
+    {
+        // Determine if we don't need bounds checking
+        if (valid_items >= LOGICAL_WARP_THREADS)
+        {
+            return InternalWarpReduce(temp_storage).template Reduce<true, 1>(input, valid_items, reduction_op);
+        }
+        else
+        {
+            return InternalWarpReduce(temp_storage).template Reduce<false, 1>(input, valid_items, reduction_op);
+        }
+    }
+
+
+    /**
+     * \brief Computes a segmented reduction in the calling warp where segments are defined by head-flags.  The reduction of each segment is returned to the first lane in that segment (which always includes lane0).
+     *
+     * Supports non-commutative reduction operators
+     *
+     * \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a head-segmented warp max
+     * reduction within a block of 32 threads (one warp).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpReduce for type int
+     *     typedef cub::WarpReduce<int> WarpReduce;
+     *
+     *     // Allocate WarpReduce shared memory for one warp
+     *     __shared__ typename WarpReduce::TempStorage temp_storage;
+     *
+     *     // Obtain one input item and flag per thread
+     *     int thread_data = ...
+     *     int head_flag = ...
+     *
+     *     // Return the warp-wide reductions to each lane0
+     *     int aggregate = WarpReduce(temp_storage).HeadSegmentedReduce(
+     *         thread_data, head_flag, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data and \p head_flag across the block of threads
+     * is {0, 1, 2, 3, ..., 31 and is {1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0,
+     * respectively.  The corresponding output \p aggregate in threads 0, 4, 8, etc. will be
+     * \p 3, \p 7, \p 11, etc. (and is undefined in other threads).
+     *
+     * \tparam ReductionOp     [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b)
+     */
+    template <
+        typename            ReductionOp,
+        typename            Flag>
+    __device__ __forceinline__ T HeadSegmentedReduce(
+        T                   input,              ///< [in] Calling thread's input
+        Flag                head_flag,          ///< [in] Head flag denoting whether or not \p input is the start of a new segment
+        ReductionOp         reduction_op)       ///< [in] Reduction operator
+    {
+        return InternalWarpReduce(temp_storage).template SegmentedReduce<true>(input, head_flag, reduction_op);
+    }
+
+
+    /**
+     * \brief Computes a segmented reduction in the calling warp where segments are defined by tail-flags.
The reduction of each segment is returned to the first lane in that segment (which always includes lane0).
+     *
+     * Supports non-commutative reduction operators
+     *
+     * \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a tail-segmented warp max
+     * reduction within a block of 32 threads (one warp).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpReduce for type int
+     *     typedef cub::WarpReduce<int> WarpReduce;
+     *
+     *     // Allocate WarpReduce shared memory for one warp
+     *     __shared__ typename WarpReduce::TempStorage temp_storage;
+     *
+     *     // Obtain one input item and flag per thread
+     *     int thread_data = ...
+     *     int tail_flag = ...
+     *
+     *     // Return the warp-wide reductions to each lane0
+     *     int aggregate = WarpReduce(temp_storage).TailSegmentedReduce(
+     *         thread_data, tail_flag, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data and \p tail_flag across the block of threads
+     * is {0, 1, 2, 3, ..., 31 and is {0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1,
+     * respectively.  The corresponding output \p aggregate in threads 0, 4, 8, etc. will be
+     * \p 3, \p 7, \p 11, etc. (and is undefined in other threads).
+     *
+     * \tparam ReductionOp     [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b)
+     */
+    template <
+        typename            ReductionOp,
+        typename            Flag>
+    __device__ __forceinline__ T TailSegmentedReduce(
+        T                   input,              ///< [in] Calling thread's input
+        Flag                tail_flag,          ///< [in] Tail flag denoting whether or not \p input is the end of the current segment
+        ReductionOp         reduction_op)       ///< [in] Reduction operator
+    {
+        return InternalWarpReduce(temp_storage).template SegmentedReduce<false>(input, tail_flag, reduction_op);
+    }
+
+
+
+    //@}  end member group
+};
+
+/** @} */       // end group WarpModule
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/SRC/cub/warp/warp_scan.cuh b/SRC/cub/warp/warp_scan.cuh
new file mode 100644
index 00000000..a065f984
--- /dev/null
+++ b/SRC/cub/warp/warp_scan.cuh
@@ -0,0 +1,1451 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED.
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::WarpScan class provides [collective](index.html#sec0) methods for computing a parallel prefix scan of items partitioned across a CUDA thread warp.
+ */
+
+#pragma once
+
+#include "specializations/warp_scan_shfl.cuh"
+#include "specializations/warp_scan_smem.cuh"
+#include "../thread/thread_operators.cuh"
+#include "../util_arch.cuh"
+#include "../util_type.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \addtogroup WarpModule
+ * @{
+ */
+
+/**
+ * \brief The WarpScan class provides [collective](index.html#sec0) methods for computing a parallel prefix scan of items partitioned across a CUDA thread warp. ![](warp_scan_logo.png)
+ *
+ * \tparam T                        The scan input/output element type
+ * \tparam LOGICAL_WARP_THREADS     [optional] The number of threads per "logical" warp (may be less than the number of hardware warp threads).  Default is the warp size associated with the CUDA Compute Capability targeted by the compiler (e.g., 32 threads for SM20).
+ * \tparam PTX_ARCH                 [optional] \ptxversion
+ *
+ * \par Overview
+ * - Given a list of input elements and a binary reduction operator, a [prefix scan](http://en.wikipedia.org/wiki/Prefix_sum)
+ *   produces an output list where each element is computed to be the reduction
+ *   of the elements occurring earlier in the input list.  Prefix sum
+ *   connotes a prefix scan with the addition operator.  The term \em inclusive indicates
+ *   that the ith output reduction incorporates the ith input.
+ *   The term \em exclusive indicates the ith input is not incorporated into
+ *   the ith output reduction.
+ * - Supports "logical" warps smaller than the physical warp size (e.g., a logical warp of 8 threads)
+ * - The number of entrant threads must be a multiple of \p LOGICAL_WARP_THREADS
+ *
+ * \par Performance Considerations
+ * - Uses special instructions when applicable (e.g., warp \p SHFL)
+ * - Uses synchronization-free communication between warp lanes when applicable
+ * - Incurs zero bank conflicts for most types
+ * - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
+ *     - Summation (vs. generic scan)
+ *     - The architecture's warp size is a whole multiple of \p LOGICAL_WARP_THREADS
+ *
+ * \par Simple Examples
+ * \warpcollective{WarpScan}
+ * \par
+ * The code snippet below illustrates four concurrent warp prefix sums within a block of
+ * 128 threads (one per each of the 32-thread warps).
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Specialize WarpScan for type int
+ *     typedef cub::WarpScan<int> WarpScan;
+ *
+ *     // Allocate WarpScan shared memory for 4 warps
+ *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+ *
+ *     // Obtain one input item per thread
+ *     int thread_data = ...
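+ *     // (e.g., every thread contributes thread_data = 1, per the example below)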
+ *
+ *     // Compute warp-wide prefix sums
+ *     int warp_id = threadIdx.x / 32;
+ *     WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}.
+ * The corresponding output \p thread_data in each of the four warps of threads will be
+ * {0, 1, 2, 3, ..., 31}.
+ *
+ * \par
+ * The code snippet below illustrates a single warp prefix sum within a block of
+ * 128 threads.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Specialize WarpScan for type int
+ *     typedef cub::WarpScan<int> WarpScan;
+ *
+ *     // Allocate WarpScan shared memory for one warp
+ *     __shared__ typename WarpScan::TempStorage temp_storage;
+ *     ...
+ *
+ *     // Only the first warp performs a prefix sum
+ *     if (threadIdx.x < 32)
+ *     {
+ *         // Obtain one input item per thread
+ *         int thread_data = ...
+ *
+ *         // Compute warp-wide prefix sums
+ *         WarpScan(temp_storage).ExclusiveSum(thread_data, thread_data);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the warp of threads is {1, 1, 1, 1, ...}.
+ * The corresponding output \p thread_data will be {0, 1, 2, 3, ..., 31}.
+ *
+ */
+template <
+    typename    T,
+    int         LOGICAL_WARP_THREADS    = CUB_PTX_WARP_THREADS,
+    int         PTX_ARCH                = CUB_PTX_ARCH>
+class WarpScan
+{
+private:
+
+    /******************************************************************************
+     * Constants and type definitions
+     ******************************************************************************/
+
+    enum
+    {
+        /// Whether the logical warp size and the PTX warp size coincide
+        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
+
+        /// Whether the logical warp size is a power-of-two
+        IS_POW_OF_TWO = ((LOGICAL_WARP_THREADS & (LOGICAL_WARP_THREADS - 1)) == 0),
+
+        /// Whether the data type is an integer (which has fully-associative addition)
+        IS_INTEGER = ((Traits<T>::CATEGORY == SIGNED_INTEGER) || (Traits<T>::CATEGORY == UNSIGNED_INTEGER))
+    };
+
+    /// Internal specialization.  Use SHFL-based scan if (architecture is >= SM30) and (LOGICAL_WARP_THREADS is a power-of-two)
+    typedef typename If<(PTX_ARCH >= 300) && (IS_POW_OF_TWO),
+        WarpScanShfl<T, LOGICAL_WARP_THREADS, PTX_ARCH>,
+        WarpScanSmem<T, LOGICAL_WARP_THREADS, PTX_ARCH> >::Type InternalWarpScan;
+
+    /// Shared memory storage layout type for WarpScan
+    typedef typename InternalWarpScan::TempStorage _TempStorage;
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Shared storage reference
+    _TempStorage    &temp_storage;
+    int             lane_id;
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+public:
+
+    /// \smemstorage{WarpScan}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.  Logical warp and lane identifiers are constructed from threadIdx.x.
+     */
+    __device__ __forceinline__ WarpScan(
+        TempStorage &temp_storage)          ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),
+        lane_id(IS_ARCH_WARP ?
+            LaneId() :
+            LaneId() % LOGICAL_WARP_THREADS)
+    {}
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Inclusive prefix sums
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes an inclusive prefix sum across the calling warp.
+     *
+     * \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide inclusive prefix sums within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute inclusive warp-wide prefix sums
+     *     int warp_id = threadIdx.x / 32;
+     *     WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}.
+     * The corresponding output \p thread_data in each of the four warps of threads will be
+     * {1, 2, 3, ..., 32}.
+     */
+    __device__ __forceinline__ void InclusiveSum(
+        T               input,              ///< [in] Calling thread's input item.
+        T               &output)            ///< [out] Calling thread's output item.  May be aliased with \p input.
+    {
+        InternalWarpScan(temp_storage).InclusiveScan(input, output, cub::Sum());
+    }
+
+
+    /**
+     * \brief Computes an inclusive prefix sum across the calling warp.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
+     *
+     * The \p warp_aggregate is undefined in threads other than warp-lane0.
+     *
+     * \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide inclusive prefix sums within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute inclusive warp-wide prefix sums
+     *     int warp_aggregate;
+     *     int warp_id = threadIdx.x / 32;
+     *     WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data, warp_aggregate);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}.
+     * The corresponding output \p thread_data in each of the four warps of threads will be
+     * {1, 2, 3, ..., 32}.  Furthermore, \p warp_aggregate for all threads in all warps will be \p 32.
+     */
+    __device__ __forceinline__ void InclusiveSum(
+        T               input,              ///< [in] Calling thread's input item.
+        T               &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
+        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
+    {
+        InternalWarpScan(temp_storage).InclusiveScan(input, output, cub::Sum(), warp_aggregate);
+    }
+
+
+    /**
+     * \brief Computes an inclusive prefix sum across the calling warp.  Instead of using 0 as the warp-wide prefix, the call-back functor \p warp_prefix_op is invoked to provide the "seed" value that logically prefixes the warp's scan inputs.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
+     *
+     * The \p warp_aggregate is undefined in threads other than warp-lane0.
+     *
+     * The \p warp_prefix_op functor must implement a member function T operator()(T warp_aggregate).
+     * The functor's input parameter \p warp_aggregate is the same value also returned by the scan operation.
+     * The functor will be invoked by the entire warp of threads, however only the return value from
+     * lane0 is applied as the threadblock-wide prefix.  Can be stateful.
+     *
+     * \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a single thread block of 32 threads (one warp) that progressively
+     * computes an inclusive prefix sum over multiple "tiles" of input using a
+     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
+     * of 32 integer items that are partitioned across the warp.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * // A stateful callback functor that maintains a running prefix to be applied
+     * // during consecutive scan operations.
+     * struct WarpPrefixCallbackOp
+     * {
+     *     // Running prefix
+     *     int running_total;
+     *
+     *     // Constructor
+     *     __device__ WarpPrefixCallbackOp(int running_total) : running_total(running_total) {}
+     *
+     *     // Callback operator to be entered by the entire warp.  Lane-0 is responsible
+     *     // for returning a value for seeding the warp-wide scan.
+     *     __device__ int operator()(int warp_aggregate)
+     *     {
+     *         int old_prefix = running_total;
+     *         running_total += warp_aggregate;
+     *         return old_prefix;
+     *     }
+     * };
+     *
+     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
+     * {
+     *     // Specialize WarpScan for int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for one warp
+     *     __shared__ typename WarpScan::TempStorage temp_storage;
+     *
+     *     // Initialize running total
+     *     WarpPrefixCallbackOp prefix_op(0);
+     *
+     *     // Have the warp iterate over segments of items
+     *     for (int block_offset = 0; block_offset < num_items; block_offset += 32)
+     *     {
+     *         // Load a segment of consecutive items
+     *         int thread_data = d_data[block_offset];
+     *
+     *         // Collectively compute the warp-wide inclusive prefix sum
+     *         int warp_aggregate;
+     *         WarpScan(temp_storage).InclusiveSum(
+     *             thread_data, thread_data, warp_aggregate, prefix_op);
+     *
+     *         // Store scanned items to output segment
+     *         d_data[block_offset] = thread_data;
+     *     }
+     * \endcode
+     * \par
+     * Suppose the input \p d_data is {1, 1, 1, 1, 1, 1, 1, 1, ...}.
+     * The corresponding output for the first segment will be {1, 2, 3, ..., 32}.
+     * The output for the second segment will be {33, 34, 35, ..., 64}.  Furthermore,
+     * the value \p 32 will be stored in \p warp_aggregate for all threads after each scan.
+     *
+     * \tparam WarpPrefixCallbackOp     [inferred] Call-back functor type having member T operator()(T warp_aggregate)
+     */
+    template <typename WarpPrefixCallbackOp>
+    __device__ __forceinline__ void InclusiveSum(
+        T                       input,              ///< [in] Calling thread's input item.
+        T                       &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
+        T                       &warp_aggregate,    ///< [out] [warp-lane0 only] Warp-wide aggregate reduction of input items, exclusive of the \p warp_prefix_op value
+        WarpPrefixCallbackOp    &warp_prefix_op)    ///< [in-out] [warp-lane0 only] Call-back functor for specifying a warp-wide prefix to be applied to all inputs.
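+    // Note: warp_prefix_op is entered by the entire warp, but only the prefix
+    // returned by lane0 is broadcast below and added to each lane's output.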
+    {
+        // Compute inclusive warp scan
+        InclusiveSum(input, output, warp_aggregate);
+
+        // Compute warp-wide prefix from aggregate, then broadcast to other lanes
+        T prefix;
+        prefix = warp_prefix_op(warp_aggregate);
+        prefix = InternalWarpScan(temp_storage).Broadcast(prefix, 0);
+
+        // Update output
+        output = prefix + output;
+    }
+
+    //@}  end member group
+
+private:
+
+    /// Combination scan with identity
+    __device__ __forceinline__ void Sum(T input, T &inclusive_output, T &exclusive_output, Int2Type<true> is_integer)
+    {
+        // Compute exclusive warp scan from inclusive warp scan
+        InclusiveSum(input, inclusive_output);
+        exclusive_output = inclusive_output - input;
+    }
+
+    /// Combination scan with identity
+    __device__ __forceinline__ void Sum(T input, T &inclusive_output, T &exclusive_output, Int2Type<false> is_integer)
+    {
+        // Delegate to regular scan for non-integer types (because we won't be able to use subtraction)
+        T identity = ZeroInitialize<T>();
+        InternalWarpScan(temp_storage).Scan(input, inclusive_output, exclusive_output, identity, cub::Sum());
+    }
+
+    /// Computes an exclusive prefix sum across the calling warp.
+    __device__ __forceinline__ void ExclusiveSum(T input, T &output, Int2Type<true> is_integer)
+    {
+        // Compute exclusive warp scan from inclusive warp scan
+        T inclusive;
+        InclusiveSum(input, inclusive);
+        output = inclusive - input;
+    }
+
+    /// Computes an exclusive prefix sum across the calling warp.  Specialized for non-integer types.
+    __device__ __forceinline__ void ExclusiveSum(T input, T &output, Int2Type<false> is_integer)
+    {
+        // Delegate to regular scan for non-integer types (because we won't be able to use subtraction)
+        T identity = ZeroInitialize<T>();
+        ExclusiveScan(input, output, identity, cub::Sum());
+    }
+
+    /// Computes an exclusive prefix sum across the calling warp.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
+    __device__ __forceinline__ void ExclusiveSum(T input, T &output, T &warp_aggregate, Int2Type<true> is_integer)
+    {
+        // Compute exclusive warp scan from inclusive warp scan
+        T inclusive;
+        InclusiveSum(input, inclusive, warp_aggregate);
+        output = inclusive - input;
+    }
+
+    /// Computes an exclusive prefix sum across the calling warp.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.  Specialized for non-integer types.
+    __device__ __forceinline__ void ExclusiveSum(T input, T &output, T &warp_aggregate, Int2Type<false> is_integer)
+    {
+        // Delegate to regular scan for non-integer types (because we won't be able to use subtraction)
+        T identity = ZeroInitialize<T>();
+        ExclusiveScan(input, output, identity, cub::Sum(), warp_aggregate);
+    }
+
+    /// Computes an exclusive prefix sum across the calling warp.  Instead of using 0 as the warp-wide prefix, the call-back functor \p warp_prefix_op is invoked to provide the "seed" value that logically prefixes the warp's scan inputs.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
+    template <typename WarpPrefixCallbackOp>
+    __device__ __forceinline__ void ExclusiveSum(T input, T &output, T &warp_aggregate, WarpPrefixCallbackOp &warp_prefix_op, Int2Type<true> is_integer)
+    {
+        // Compute exclusive warp scan from inclusive warp scan
+        T inclusive;
+        InclusiveSum(input, inclusive, warp_aggregate, warp_prefix_op);
+        output = inclusive - input;
+    }
+
+    /// Computes an exclusive prefix sum across the calling warp.  Instead of using 0 as the warp-wide prefix, the call-back functor \p warp_prefix_op is invoked to provide the "seed" value that logically prefixes the warp's scan inputs.
Also provides every thread with the warp-wide \p warp_aggregate of all inputs.  Specialized for non-integer types.
+    template <typename WarpPrefixCallbackOp>
+    __device__ __forceinline__ void ExclusiveSum(T input, T &output, T &warp_aggregate, WarpPrefixCallbackOp &warp_prefix_op, Int2Type<false> is_integer)
+    {
+        // Delegate to regular scan for non-integer types (because we won't be able to use subtraction)
+        T identity = ZeroInitialize<T>();
+        ExclusiveScan(input, output, identity, cub::Sum(), warp_aggregate, warp_prefix_op);
+    }
+
+public:
+
+
+    /******************************************************************//**
+     * \name Exclusive prefix sums
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes an exclusive prefix sum across the calling warp.
+     *
+     * This operation assumes the value obtained by the T's default
+     * constructor (or by zero-initialization if no user-defined default
+     * constructor exists) is suitable as the identity value "zero" for
+     * addition.
+     *
+     * \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide exclusive prefix sums within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute exclusive warp-wide prefix sums
+     *     int warp_id = threadIdx.x / 32;
+     *     WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}.
+     * The corresponding output \p thread_data in each of the four warps of threads will be
+     * {0, 1, 2, ..., 31}.
+     *
+     */
+    __device__ __forceinline__ void ExclusiveSum(
+        T               input,              ///< [in] Calling thread's input item.
+        T               &output)            ///< [out] Calling thread's output item.  May be aliased with \p input.
+    {
+        ExclusiveSum(input, output, Int2Type<IS_INTEGER>());
+    }
+
+
+    /**
+     * \brief Computes an exclusive prefix sum across the calling warp.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
+     *
+     * This operation assumes the value obtained by the T's default
+     * constructor (or by zero-initialization if no user-defined default
+     * constructor exists) is suitable as the identity value "zero" for
+     * addition.
+     *
+     * \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide exclusive prefix sums within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute exclusive warp-wide prefix sums
+     *     int warp_aggregate;
+     *     int warp_id = threadIdx.x / 32;
+     *     WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data, warp_aggregate);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}.
+     * The corresponding output \p thread_data in each of the four warps of threads will be
+     * {0, 1, 2, ..., 31}.
Furthermore, \p warp_aggregate for all threads in all warps will be \p 32. + */ + __device__ __forceinline__ void ExclusiveSum( + T input, ///< [in] Calling thread's input item. + T &output, ///< [out] Calling thread's output item. May be aliased with \p input. + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. + { + ExclusiveSum(input, output, warp_aggregate, Int2Type()); + } + + + /** + * \brief Computes an exclusive prefix sum across the calling warp. Instead of using 0 as the warp-wide prefix, the call-back functor \p warp_prefix_op is invoked to provide the "seed" value that logically prefixes the warp's scan inputs. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. + * + * This operation assumes the value of obtained by the T's default + * constructor (or by zero-initialization if no user-defined default + * constructor exists) is suitable as the identity value "zero" for + * addition. + * + * The \p warp_prefix_op functor must implement a member function T operator()(T warp_aggregate). + * The functor's input parameter \p warp_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the entire warp of threads, however only the return value from + * lane0 is applied as the threadblock-wide prefix. Can be stateful. + * + * \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block of 32 threads (one warp) that progressively + * computes an exclusive prefix sum over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 32 integer items that are partitioned across the warp. + * \par + * \code + * #include + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. + * struct WarpPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ WarpPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the entire warp. Lane-0 is responsible + * // for returning a value for seeding the warp-wide scan. + * __device__ int operator()(int warp_aggregate) + * { + * int old_prefix = running_total; + * running_total += warp_aggregate; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) + * { + * // Specialize WarpScan for int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for one warp + * __shared__ typename WarpScan::TempStorage temp_storage; + * + * // Initialize running total + * WarpPrefixCallbackOp prefix_op(0); + * + * // Have the warp iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 32) + * { + * // Load a segment of consecutive items + * int thread_data = d_data[block_offset]; + * + * // Collectively compute the warp-wide exclusive prefix sum + * int warp_aggregate; + * WarpScan(temp_storage).ExclusiveSum( + * thread_data, thread_data, warp_aggregate, prefix_op); + * + * // Store scanned items to output segment + * d_data[block_offset] = thread_data; + * } + * \endcode + * \par + * Suppose the input \p d_data is {1, 1, 1, 1, 1, 1, 1, 1, ...}. + * The corresponding output for the first segment will be {0, 1, 2, ..., 31}. + * The output for the second segment will be {32, 33, 34, ..., 63}. 
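+
+ /* A minimal self-contained driver for the exclusive-sum overloads above,
+ * assuming a single 32-thread block; the kernel and buffer names
+ * (WarpExclusiveSumKernel, d_out, d_agg) are illustrative only.
+ * \code
+ * #include <cub/cub.cuh>
+ * #include <cstdio>
+ *
+ * __global__ void WarpExclusiveSumKernel(int *d_out, int *d_agg)
+ * {
+ * typedef cub::WarpScan<int> WarpScan;
+ * __shared__ typename WarpScan::TempStorage temp_storage;
+ *
+ * int input = threadIdx.x; // lane id as input
+ * int output, warp_aggregate;
+ * WarpScan(temp_storage).ExclusiveSum(input, output, warp_aggregate);
+ *
+ * d_out[threadIdx.x] = output; // 0, 0, 1, 3, 6, ...
+ * if (threadIdx.x == 0) *d_agg = warp_aggregate; // 496 = 0+1+...+31
+ * }
+ *
+ * int main()
+ * {
+ * int *d_out, *d_agg;
+ * cudaMalloc(&d_out, 32 * sizeof(int));
+ * cudaMalloc(&d_agg, sizeof(int));
+ * WarpExclusiveSumKernel<<<1, 32>>>(d_out, d_agg);
+ * int h_out[32], h_agg;
+ * cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
+ * cudaMemcpy(&h_agg, d_agg, sizeof(int), cudaMemcpyDeviceToHost);
+ * printf("out[31] = %d, aggregate = %d\n", h_out[31], h_agg); // 465, 496
+ * cudaFree(d_out); cudaFree(d_agg);
+ * return 0;
+ * }
+ * \endcode
+ */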
+
+ /**
+ * \brief Computes an exclusive prefix sum across the calling warp. Instead of using 0 as the warp-wide prefix, the call-back functor \p warp_prefix_op is invoked to provide the "seed" value that logically prefixes the warp's scan inputs. Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
+ *
+ * This operation assumes the value obtained by T's default
+ * constructor (or by zero-initialization if no user-defined default
+ * constructor exists) is suitable as the identity value "zero" for
+ * addition.
+ *
+ * The \p warp_prefix_op functor must implement a member function T operator()(T warp_aggregate).
+ * The functor's input parameter \p warp_aggregate is the same value also returned by the scan operation.
+ * The functor will be invoked by the entire warp of threads; however, only the return value from
+ * lane0 is applied as the warp-wide prefix. Can be stateful.
+ *
+ * \smemreuse
+ *
+ * \par Snippet
+ * The code snippet below illustrates a single thread block of 32 threads (one warp) that progressively
+ * computes an exclusive prefix sum over multiple "tiles" of input using a
+ * prefix functor to maintain a running total between warp-wide scans. Each tile consists
+ * of 32 integer items that are partitioned across the warp.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * // A stateful callback functor that maintains a running prefix to be applied
+ * // during consecutive scan operations.
+ * struct WarpPrefixCallbackOp
+ * {
+ * // Running prefix
+ * int running_total;
+ *
+ * // Constructor
+ * __device__ WarpPrefixCallbackOp(int running_total) : running_total(running_total) {}
+ *
+ * // Callback operator to be entered by the entire warp. Lane-0 is responsible
+ * // for returning a value for seeding the warp-wide scan.
+ * __device__ int operator()(int warp_aggregate)
+ * {
+ * int old_prefix = running_total;
+ * running_total += warp_aggregate;
+ * return old_prefix;
+ * }
+ * };
+ *
+ * __global__ void ExampleKernel(int *d_data, int num_items, ...)
+ * {
+ * // Specialize WarpScan for int
+ * typedef cub::WarpScan<int> WarpScan;
+ *
+ * // Allocate WarpScan shared memory for one warp
+ * __shared__ typename WarpScan::TempStorage temp_storage;
+ *
+ * // Initialize running total
+ * WarpPrefixCallbackOp prefix_op(0);
+ *
+ * // Have the warp iterate over segments of items
+ * for (int block_offset = 0; block_offset < num_items; block_offset += 32)
+ * {
+ * // Load a segment of consecutive items (one item per lane)
+ * int thread_data = d_data[block_offset + threadIdx.x];
+ *
+ * // Collectively compute the warp-wide exclusive prefix sum
+ * int warp_aggregate;
+ * WarpScan(temp_storage).ExclusiveSum(
+ * thread_data, thread_data, warp_aggregate, prefix_op);
+ *
+ * // Store scanned items to output segment
+ * d_data[block_offset + threadIdx.x] = thread_data;
+ * }
+ * \endcode
+ * \par
+ * Suppose the input \p d_data is {1, 1, 1, 1, 1, 1, 1, 1, ...}.
+ * The corresponding output for the first segment will be {0, 1, 2, ..., 31}.
+ * The output for the second segment will be {32, 33, 34, ..., 63}. Furthermore,
+ * the value \p 32 will be stored in \p warp_aggregate for all threads after each scan.
+ *
+ * \tparam WarpPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T warp_aggregate)
+ */
+ template <typename WarpPrefixCallbackOp>
+ __device__ __forceinline__ void ExclusiveSum(
+ T input, ///< [in] Calling thread's input item.
+ T &output, ///< [out] Calling thread's output item. May be aliased with \p input.
+ T &warp_aggregate, ///< [out] [warp-lane0 only] Warp-wide aggregate reduction of input items (exclusive of the \p warp_prefix_op value).
+ WarpPrefixCallbackOp &warp_prefix_op) ///< [in-out] [warp-lane0 only] Call-back functor for specifying a warp-wide prefix to be applied to all inputs.
+ {
+ ExclusiveSum(input, output, warp_aggregate, warp_prefix_op, Int2Type<IS_INTEGER>());
+ }
+
+
+ //@} end member group
+ /******************************************************************//**
+ * \name Inclusive prefix scans
+ *********************************************************************/
+ //@{
+
+ /**
+ * \brief Computes an inclusive prefix scan using the specified binary scan functor across the calling warp.
+ *
+ * Supports non-commutative scan operators.
+ *
+ * \smemreuse
+ *
+ * \par Snippet
+ * The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans within a block of
+ * 128 threads (one per each of the 32-thread warps).
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ * // Specialize WarpScan for type int
+ * typedef cub::WarpScan<int> WarpScan;
+ *
+ * // Allocate WarpScan shared memory for 4 warps
+ * __shared__ typename WarpScan::TempStorage temp_storage[4];
+ *
+ * // Obtain one input item per thread
+ * int thread_data = ...
+ *
+ * // Compute inclusive warp-wide prefix max scans
+ * int warp_id = threadIdx.x / 32;
+ * WarpScan(temp_storage[warp_id]).InclusiveScan(thread_data, thread_data, cub::Max());
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}.
+ * The corresponding output \p thread_data in the first warp would be
+ * {0, 0, 2, 2, ..., 30, 30}, the output for the second warp would be {32, 32, 34, 34, ..., 62, 62}, etc.
+ *
+ * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b)
+ */
+ template <typename ScanOp>
+ __device__ __forceinline__ void InclusiveScan(
+ T input, ///< [in] Calling thread's input item.
+ T &output, ///< [out] Calling thread's output item. May be aliased with \p input.
+ ScanOp scan_op) ///< [in] Binary scan operator
+ {
+ InternalWarpScan(temp_storage).InclusiveScan(input, output, scan_op);
+ }
+
+
+ /**
+ * \brief Computes an inclusive prefix scan using the specified binary scan functor across the calling warp. Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
+ *
+ * Supports non-commutative scan operators.
+ *
+ * \smemreuse
+ *
+ * \par Snippet
+ * The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans within a block of
+ * 128 threads (one per each of the 32-thread warps).
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ * // Specialize WarpScan for type int
+ * typedef cub::WarpScan<int> WarpScan;
+ *
+ * // Allocate WarpScan shared memory for 4 warps
+ * __shared__ typename WarpScan::TempStorage temp_storage[4];
+ *
+ * // Obtain one input item per thread
+ * int thread_data = ...
+ *
+ * // Compute inclusive warp-wide prefix max scans
+ * int warp_aggregate;
+ * int warp_id = threadIdx.x / 32;
+ * WarpScan(temp_storage[warp_id]).InclusiveScan(
+ * thread_data, thread_data, cub::Max(), warp_aggregate);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}.
+ * The corresponding output \p thread_data in the first warp would be
+ * {0, 0, 2, 2, ..., 30, 30}, the output for the second warp would be {32, 32, 34, 34, ..., 62, 62}, etc.
+ * Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads
+ * in the second warp, etc.
+ *
+ * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b)
+ */
+ template <typename ScanOp>
+ __device__ __forceinline__ void InclusiveScan(
+ T input, ///< [in] Calling thread's input item.
+ T &output, ///< [out] Calling thread's output item. May be aliased with \p input.
+ ScanOp scan_op, ///< [in] Binary scan operator
+ T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items.
+ {
+ InternalWarpScan(temp_storage).InclusiveScan(input, output, scan_op, warp_aggregate);
+ }
+
+
+ /**
+ * \brief Computes an inclusive prefix scan using the specified binary scan functor across the calling warp. The call-back functor \p warp_prefix_op is invoked to provide the "seed" value that logically prefixes the warp's scan inputs. Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
+ *
+ * The \p warp_prefix_op functor must implement a member function T operator()(T warp_aggregate).
+ * The functor's input parameter \p warp_aggregate is the same value also returned by the scan operation.
+ * The functor will be invoked by the entire warp of threads; however, only the return value from
+ * lane0 is applied as the warp-wide prefix. Can be stateful.
+ *
+ * Supports non-commutative scan operators.
+ *
+ * \smemreuse
+ *
+ * \par Snippet
+ * The code snippet below illustrates a single thread block of 32 threads (one warp) that progressively
+ * computes an inclusive prefix max scan over multiple "tiles" of input using a
+ * prefix functor to maintain a running total between warp-wide scans. Each tile consists
+ * of 32 integer items that are partitioned across the warp.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * // A stateful callback functor that maintains a running prefix to be applied
+ * // during consecutive scan operations.
+ * struct WarpPrefixCallbackOp
+ * {
+ * // Running prefix
+ * int running_total;
+ *
+ * // Constructor
+ * __device__ WarpPrefixCallbackOp(int running_total) : running_total(running_total) {}
+ *
+ * // Callback operator to be entered by the entire warp. Lane-0 is responsible
+ * // for returning a value for seeding the warp-wide scan.
+ * __device__ int operator()(int warp_aggregate)
+ * {
+ * int old_prefix = running_total;
+ * running_total = (warp_aggregate > old_prefix) ? warp_aggregate : old_prefix;
+ * return old_prefix;
+ * }
+ * };
+ *
+ * __global__ void ExampleKernel(int *d_data, int num_items, ...)
+ * {
+ * // Specialize WarpScan for int
+ * typedef cub::WarpScan<int> WarpScan;
+ *
+ * // Allocate WarpScan shared memory for one warp
+ * __shared__ typename WarpScan::TempStorage temp_storage;
+ *
+ * // Initialize running total
+ * WarpPrefixCallbackOp prefix_op(0);
+ *
+ * // Have the warp iterate over segments of items
+ * for (int block_offset = 0; block_offset < num_items; block_offset += 32)
+ * {
+ * // Load a segment of consecutive items (one item per lane)
+ * int thread_data = d_data[block_offset + threadIdx.x];
+ *
+ * // Collectively compute the warp-wide inclusive prefix max scan
+ * int warp_aggregate;
+ * WarpScan(temp_storage).InclusiveScan(
+ * thread_data, thread_data, cub::Max(), warp_aggregate, prefix_op);
+ *
+ * // Store scanned items to output segment
+ * d_data[block_offset + threadIdx.x] = thread_data;
+ * }
+ * \endcode
+ * \par
+ * Suppose the input \p d_data is {0, -1, 2, -3, 4, -5, ...}.
+ * The corresponding output for the first segment will be {0, 0, 2, 2, ..., 30, 30}.
+ * The output for the second segment will be {32, 32, 34, 34, ..., 62, 62}. Furthermore,
+ * \p warp_aggregate will be assigned \p 30 in all threads after the first scan, assigned \p 62 after the second
+ * scan, etc.
+ *
+ * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b)
+ * \tparam WarpPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T warp_aggregate)
+ */
+ template <
+ typename ScanOp,
+ typename WarpPrefixCallbackOp>
+ __device__ __forceinline__ void InclusiveScan(
+ T input, ///< [in] Calling thread's input item.
+ T &output, ///< [out] Calling thread's output item. May be aliased with \p input.
+ ScanOp scan_op, ///< [in] Binary scan operator
+ T &warp_aggregate, ///< [out] [warp-lane0 only] Warp-wide aggregate reduction of input items (exclusive of the \p warp_prefix_op value).
+ WarpPrefixCallbackOp &warp_prefix_op) ///< [in-out] [warp-lane0 only] Call-back functor for specifying a warp-wide prefix to be applied to all inputs.
+ {
+ // Compute inclusive warp scan
+ InclusiveScan(input, output, scan_op, warp_aggregate);
+
+ // Compute warp-wide prefix from aggregate, then broadcast to other lanes
+ T prefix;
+ prefix = warp_prefix_op(warp_aggregate);
+ prefix = InternalWarpScan(temp_storage).Broadcast(prefix, 0);
+
+ // Update output
+ output = scan_op(prefix, output);
+ }
+
+
+ //@} end member group
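+
+ /* A minimal self-contained driver for the inclusive max scan overloads
+ * above (one warp of 32 threads); the kernel and buffer names
+ * (WarpInclusiveMaxKernel, d_out, d_agg) are illustrative only.
+ * \code
+ * #include <cub/cub.cuh>
+ * #include <cstdio>
+ *
+ * __global__ void WarpInclusiveMaxKernel(int *d_out, int *d_agg)
+ * {
+ * typedef cub::WarpScan<int> WarpScan;
+ * __shared__ typename WarpScan::TempStorage temp_storage;
+ *
+ * // Lanes contribute 0, -1, 2, -3, ... as in the snippets above
+ * int lane = threadIdx.x;
+ * int input = (lane % 2) ? -lane : lane;
+ * int output, warp_aggregate;
+ * WarpScan(temp_storage).InclusiveScan(input, output, cub::Max(), warp_aggregate);
+ *
+ * d_out[lane] = output; // 0, 0, 2, 2, ..., 30, 30
+ * if (lane == 0) *d_agg = warp_aggregate; // 30
+ * }
+ *
+ * int main()
+ * {
+ * int *d_out, *d_agg;
+ * cudaMalloc(&d_out, 32 * sizeof(int));
+ * cudaMalloc(&d_agg, sizeof(int));
+ * WarpInclusiveMaxKernel<<<1, 32>>>(d_out, d_agg);
+ * int h_out[32], h_agg;
+ * cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
+ * cudaMemcpy(&h_agg, d_agg, sizeof(int), cudaMemcpyDeviceToHost);
+ * printf("out[31] = %d, aggregate = %d\n", h_out[31], h_agg); // 30, 30
+ * cudaFree(d_out); cudaFree(d_agg);
+ * return 0;
+ * }
+ * \endcode
+ */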
+ /******************************************************************//**
+ * \name Exclusive prefix scans
+ *********************************************************************/
+ //@{
+
+ /**
+ * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp.
+ *
+ * Supports non-commutative scan operators.
+ *
+ * \smemreuse
+ *
+ * \par Snippet
+ * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of
+ * 128 threads (one per each of the 32-thread warps).
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ * // Specialize WarpScan for type int
+ * typedef cub::WarpScan<int> WarpScan;
+ *
+ * // Allocate WarpScan shared memory for 4 warps
+ * __shared__ typename WarpScan::TempStorage temp_storage[4];
+ *
+ * // Obtain one input item per thread
+ * int thread_data = ...
+ *
+ * // Compute exclusive warp-wide prefix max scans
+ * int warp_id = threadIdx.x / 32;
+ * WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max());
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}.
+ * The corresponding output \p thread_data in the first warp would be
+ * {INT_MIN, 0, 0, 2, ..., 28, 30}, the output for the second warp would be {INT_MIN, 32, 32, 34, ..., 60, 62}, etc.
+ *
+ * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b)
+ */
+ template <typename ScanOp>
+ __device__ __forceinline__ void ExclusiveScan(
+ T input, ///< [in] Calling thread's input item.
+ T &output, ///< [out] Calling thread's output item. May be aliased with \p input.
+ T identity, ///< [in] Identity value
+ ScanOp scan_op) ///< [in] Binary scan operator
+ {
+ T inclusive_output;
+ InternalWarpScan(temp_storage).Scan(input, inclusive_output, output, identity, scan_op);
+ }
+
+
+ /**
+ * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp. Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
+ *
+ * Supports non-commutative scan operators.
+ *
+ * \smemreuse
+ *
+ * \par Snippet
+ * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of
+ * 128 threads (one per each of the 32-thread warps).
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ * // Specialize WarpScan for type int
+ * typedef cub::WarpScan<int> WarpScan;
+ *
+ * // Allocate WarpScan shared memory for 4 warps
+ * __shared__ typename WarpScan::TempStorage temp_storage[4];
+ *
+ * // Obtain one input item per thread
+ * int thread_data = ...
+ *
+ * // Compute exclusive warp-wide prefix max scans
+ * int warp_aggregate;
+ * int warp_id = threadIdx.x / 32;
+ * WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), warp_aggregate);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}.
+ * The corresponding output \p thread_data in the first warp would be
+ * {INT_MIN, 0, 0, 2, ..., 28, 30}, the output for the second warp would be {INT_MIN, 32, 32, 34, ..., 60, 62}, etc.
+ * Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads
+ * in the second warp, etc.
+ *
+ * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b)
+ */
+ template <typename ScanOp>
+ __device__ __forceinline__ void ExclusiveScan(
+ T input, ///< [in] Calling thread's input item.
+ T &output, ///< [out] Calling thread's output item. May be aliased with \p input.
+ T identity, ///< [in] Identity value
+ ScanOp scan_op, ///< [in] Binary scan operator
+ T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items.
+ {
+ InternalWarpScan(temp_storage).ExclusiveScan(input, output, identity, scan_op, warp_aggregate);
+ }
+
+
+ /**
+ * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp. The call-back functor \p warp_prefix_op is invoked to provide the "seed" value that logically prefixes the warp's scan inputs. Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
+ *
+ * The \p warp_prefix_op functor must implement a member function T operator()(T warp_aggregate).
+ * The functor's input parameter \p warp_aggregate is the same value also returned by the scan operation.
+ * The functor will be invoked by the entire warp of threads; however, only the return value from
+ * lane0 is applied as the warp-wide prefix. Can be stateful.
+ *
+ * Supports non-commutative scan operators.
+ *
+ * \smemreuse
+ *
+ * \par Snippet
+ * The code snippet below illustrates a single thread block of 32 threads (one warp) that progressively
+ * computes an exclusive prefix max scan over multiple "tiles" of input using a
+ * prefix functor to maintain a running total between warp-wide scans. Each tile consists
+ * of 32 integer items that are partitioned across the warp.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * // A stateful callback functor that maintains a running prefix to be applied
+ * // during consecutive scan operations.
+ * struct WarpPrefixCallbackOp
+ * {
+ * // Running prefix
+ * int running_total;
+ *
+ * // Constructor
+ * __device__ WarpPrefixCallbackOp(int running_total) : running_total(running_total) {}
+ *
+ * // Callback operator to be entered by the entire warp. Lane-0 is responsible
+ * // for returning a value for seeding the warp-wide scan.
+ * __device__ int operator()(int warp_aggregate)
+ * {
+ * int old_prefix = running_total;
+ * running_total = (warp_aggregate > old_prefix) ? warp_aggregate : old_prefix;
+ * return old_prefix;
+ * }
+ * };
+ *
+ * __global__ void ExampleKernel(int *d_data, int num_items, ...)
+ * {
+ * // Specialize WarpScan for int
+ * typedef cub::WarpScan<int> WarpScan;
+ *
+ * // Allocate WarpScan shared memory for one warp
+ * __shared__ typename WarpScan::TempStorage temp_storage;
+ *
+ * // Initialize running total
+ * WarpPrefixCallbackOp prefix_op(INT_MIN);
+ *
+ * // Have the warp iterate over segments of items
+ * for (int block_offset = 0; block_offset < num_items; block_offset += 32)
+ * {
+ * // Load a segment of consecutive items (one item per lane)
+ * int thread_data = d_data[block_offset + threadIdx.x];
+ *
+ * // Collectively compute the warp-wide exclusive prefix max scan
+ * int warp_aggregate;
+ * WarpScan(temp_storage).ExclusiveScan(
+ * thread_data, thread_data, INT_MIN, cub::Max(), warp_aggregate, prefix_op);
+ *
+ * // Store scanned items to output segment
+ * d_data[block_offset + threadIdx.x] = thread_data;
+ * }
+ * \endcode
+ * \par
+ * Suppose the input \p d_data is {0, -1, 2, -3, 4, -5, ...}.
+ * The corresponding output for the first segment will be {INT_MIN, 0, 0, 2, ..., 28, 30}.
+ * The output for the second segment will be {30, 32, 32, 34, ..., 60, 62}. Furthermore,
+ * \p warp_aggregate will be assigned \p 30 in all threads after the first scan, assigned \p 62 after the second
+ * scan, etc.
+ *
+ * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b)
+ * \tparam WarpPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T warp_aggregate)
+ */
+ template <
+ typename ScanOp,
+ typename WarpPrefixCallbackOp>
+ __device__ __forceinline__ void ExclusiveScan(
+ T input, ///< [in] Calling thread's input item.
+ T &output, ///< [out] Calling thread's output item. May be aliased with \p input.
+ T identity, ///< [in] Identity value
+ ScanOp scan_op, ///< [in] Binary scan operator
+ T &warp_aggregate, ///< [out] [warp-lane0 only] Warp-wide aggregate reduction of input items (exclusive of the \p warp_prefix_op value).
+ WarpPrefixCallbackOp &warp_prefix_op) ///< [in-out] [warp-lane0 only] Call-back functor for specifying a warp-wide prefix to be applied to all inputs.
+ {
+ // Exclusive warp scan
+ ExclusiveScan(input, output, identity, scan_op, warp_aggregate);
+
+ // Compute warp-wide prefix from aggregate, then broadcast to other lanes
+ T prefix = warp_prefix_op(warp_aggregate);
+ prefix = InternalWarpScan(temp_storage).Broadcast(prefix, 0);
+
+ // Update output
+ output = (lane_id == 0) ?
+ prefix :
+ scan_op(prefix, output);
+ }
+
+
+ //@} end member group
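+
+ /* The prefix-callback overloads above are typically driven by a loop over
+ * tiles. A minimal end-to-end sketch, assuming a single 32-thread block;
+ * all names except the cub:: entities (RunningSum, TiledExclusiveSum,
+ * d_data) are illustrative.
+ * \code
+ * #include <cub/cub.cuh>
+ * #include <cstdio>
+ *
+ * struct RunningSum
+ * {
+ * int running_total;
+ * __device__ RunningSum(int total) : running_total(total) {}
+ * __device__ int operator()(int warp_aggregate)
+ * {
+ * int old = running_total;
+ * running_total += warp_aggregate;
+ * return old; // becomes the seed for the next tile
+ * }
+ * };
+ *
+ * __global__ void TiledExclusiveSum(int *d_data, int num_items)
+ * {
+ * typedef cub::WarpScan<int> WarpScan;
+ * __shared__ typename WarpScan::TempStorage temp_storage;
+ * RunningSum prefix_op(0);
+ * for (int tile = 0; tile < num_items; tile += 32)
+ * {
+ * int item = d_data[tile + threadIdx.x];
+ * int warp_aggregate;
+ * WarpScan(temp_storage).ExclusiveSum(item, item, warp_aggregate, prefix_op);
+ * d_data[tile + threadIdx.x] = item;
+ * }
+ * }
+ *
+ * int main()
+ * {
+ * int h_data[128], *d_data;
+ * for (int i = 0; i < 128; ++i) h_data[i] = 1;
+ * cudaMalloc(&d_data, sizeof(h_data));
+ * cudaMemcpy(d_data, h_data, sizeof(h_data), cudaMemcpyHostToDevice);
+ * TiledExclusiveSum<<<1, 32>>>(d_data, 128);
+ * cudaMemcpy(h_data, d_data, sizeof(h_data), cudaMemcpyDeviceToHost);
+ * printf("%d %d\n", h_data[0], h_data[127]); // prints: 0 127
+ * cudaFree(d_data);
+ * return 0;
+ * }
+ * \endcode
+ */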
+ /******************************************************************//**
+ * \name Identityless exclusive prefix scans
+ *********************************************************************/
+ //@{
+
+
+ /**
+ * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp. Because no identity value is supplied, the \p output computed for warp-lane0 is undefined.
+ *
+ * Supports non-commutative scan operators.
+ *
+ * \smemreuse
+ *
+ * \par Snippet
+ * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of
+ * 128 threads (one per each of the 32-thread warps).
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ * // Specialize WarpScan for type int
+ * typedef cub::WarpScan<int> WarpScan;
+ *
+ * // Allocate WarpScan shared memory for 4 warps
+ * __shared__ typename WarpScan::TempStorage temp_storage[4];
+ *
+ * // Obtain one input item per thread
+ * int thread_data = ...
+ *
+ * // Compute exclusive warp-wide prefix max scans
+ * int warp_id = threadIdx.x / 32;
+ * WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, cub::Max());
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}.
+ * The corresponding output \p thread_data in the first warp would be
+ * {?, 0, 0, 2, ..., 28, 30}, the output for the second warp would be {?, 32, 32, 34, ..., 60, 62}, etc.
+ * (The output \p thread_data in warp lane0 is undefined.)
+ *
+ * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b)
+ */
+ template <typename ScanOp>
+ __device__ __forceinline__ void ExclusiveScan(
+ T input, ///< [in] Calling thread's input item.
+ T &output, ///< [out] Calling thread's output item. May be aliased with \p input.
+ ScanOp scan_op) ///< [in] Binary scan operator
+ {
+ T inclusive_output;
+ InternalWarpScan(temp_storage).Scan(input, inclusive_output, output, scan_op);
+ }
+
+
+ /**
+ * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp. Because no identity value is supplied, the \p output computed for warp-lane0 is undefined. Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
+ *
+ * Supports non-commutative scan operators.
+ *
+ * \smemreuse
+ *
+ * \par Snippet
+ * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of
+ * 128 threads (one per each of the 32-thread warps).
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ * // Specialize WarpScan for type int
+ * typedef cub::WarpScan<int> WarpScan;
+ *
+ * // Allocate WarpScan shared memory for 4 warps
+ * __shared__ typename WarpScan::TempStorage temp_storage[4];
+ *
+ * // Obtain one input item per thread
+ * int thread_data = ...
+ *
+ * // Compute exclusive warp-wide prefix max scans
+ * int warp_aggregate;
+ * int warp_id = threadIdx.x / 32;
+ * WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, cub::Max(), warp_aggregate);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}.
+ * The corresponding output \p thread_data in the first warp would be
+ * {?, 0, 0, 2, ..., 28, 30}, the output for the second warp would be {?, 32, 32, 34, ..., 60, 62}, etc.
+ * (The output \p thread_data in warp lane0 is undefined.) Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads
+ * in the second warp, etc.
+ *
+ * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b)
+ */
+ template <typename ScanOp>
+ __device__ __forceinline__ void ExclusiveScan(
+ T input, ///< [in] Calling thread's input item.
+ T &output, ///< [out] Calling thread's output item. May be aliased with \p input.
+ ScanOp scan_op, ///< [in] Binary scan operator
+ T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items.
+ {
+ InternalWarpScan(temp_storage).ExclusiveScan(input, output, scan_op, warp_aggregate);
+ }
+
+
+ /**
+ * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp. The \p warp_prefix_op value from warp-lane0 is applied to all scan outputs. Also computes the warp-wide \p warp_aggregate of all inputs for warp-lane0.
+ *
+ * The \p warp_prefix_op functor must implement a member function T operator()(T warp_aggregate).
+ * The functor's input parameter \p warp_aggregate is the same value also returned by the scan operation.
+ * The functor will be invoked by the entire warp of threads; however, only the return value from
+ * lane0 is applied as the warp-wide prefix. Can be stateful.
+ *
+ * Supports non-commutative scan operators.
+ *
+ * \smemreuse
+ *
+ * \par Snippet
+ * The code snippet below illustrates a single thread block of 32 threads (one warp) that progressively
+ * computes an exclusive prefix max scan over multiple "tiles" of input using a
+ * prefix functor to maintain a running total between warp-wide scans. Each tile consists
+ * of 32 integer items that are partitioned across the warp.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * // A stateful callback functor that maintains a running prefix to be applied
+ * // during consecutive scan operations.
+ * struct WarpPrefixCallbackOp
+ * {
+ * // Running prefix
+ * int running_total;
+ *
+ * // Constructor
+ * __device__ WarpPrefixCallbackOp(int running_total) : running_total(running_total) {}
+ *
+ * // Callback operator to be entered by the entire warp. Lane-0 is responsible
+ * // for returning a value for seeding the warp-wide scan.
+ * __device__ int operator()(int warp_aggregate)
+ * {
+ * int old_prefix = running_total;
+ * running_total = (warp_aggregate > old_prefix) ? warp_aggregate : old_prefix;
+ * return old_prefix;
+ * }
+ * };
+ *
+ * __global__ void ExampleKernel(int *d_data, int num_items, ...)
+ * {
+ * // Specialize WarpScan for int
+ * typedef cub::WarpScan<int> WarpScan;
+ *
+ * // Allocate WarpScan shared memory for one warp
+ * __shared__ typename WarpScan::TempStorage temp_storage;
+ *
+ * // Initialize running total
+ * WarpPrefixCallbackOp prefix_op(INT_MIN);
+ *
+ * // Have the warp iterate over segments of items
+ * for (int block_offset = 0; block_offset < num_items; block_offset += 32)
+ * {
+ * // Load a segment of consecutive items (one item per lane)
+ * int thread_data = d_data[block_offset + threadIdx.x];
+ *
+ * // Collectively compute the warp-wide exclusive prefix max scan
+ * int warp_aggregate;
+ * WarpScan(temp_storage).ExclusiveScan(
+ * thread_data, thread_data, cub::Max(), warp_aggregate, prefix_op);
+ *
+ * // Store scanned items to output segment
+ * d_data[block_offset + threadIdx.x] = thread_data;
+ * }
+ * \endcode
+ * \par
+ * Suppose the input \p d_data is {0, -1, 2, -3, 4, -5, ...}.
+ * The corresponding output for the first segment will be {INT_MIN, 0, 0, 2, ..., 28, 30}.
+ * The output for the second segment will be {30, 32, 32, 34, ..., 60, 62}. Furthermore,
+ * \p warp_aggregate will be assigned \p 30 in all threads after the first scan, assigned \p 62 after the second
+ * scan, etc.
+ *
+ * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b)
+ * \tparam WarpPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T warp_aggregate)
+ */
+ template <
+ typename ScanOp,
+ typename WarpPrefixCallbackOp>
+ __device__ __forceinline__ void ExclusiveScan(
+ T input, ///< [in] Calling thread's input item.
+ T &output, ///< [out] Calling thread's output item. May be aliased with \p input.
+ ScanOp scan_op, ///< [in] Binary scan operator
+ T &warp_aggregate, ///< [out] [warp-lane0 only] Warp-wide aggregate reduction of input items (exclusive of the \p warp_prefix_op value).
+ WarpPrefixCallbackOp &warp_prefix_op) ///< [in-out] [warp-lane0 only] Call-back functor for specifying a warp-wide prefix to be applied to all inputs.
+ {
+ // Exclusive warp scan
+ ExclusiveScan(input, output, scan_op, warp_aggregate);
+
+ // Compute warp-wide prefix from aggregate, then broadcast to other lanes
+ T prefix = warp_prefix_op(warp_aggregate);
+ prefix = InternalWarpScan(temp_storage).Broadcast(prefix, 0);
+
+ // Update output with prefix
+ output = (lane_id == 0) ?
+ prefix :
+ scan_op(prefix, output);
+ }
+
+ //@} end member group
+ /******************************************************************//**
+ * \name Combination (inclusive & exclusive) prefix scans
+ *********************************************************************/
+ //@{
+
+ /**
+ * \brief Computes both inclusive and exclusive prefix sums across the calling warp.
+ *
+ * This operation assumes the value obtained by T's default
+ * constructor (or by zero-initialization if no user-defined default
+ * constructor exists) is suitable as the identity value "zero" for
+ * addition.
+ *
+ * \smemreuse
+ *
+ * \par Snippet
+ * The code snippet below illustrates four concurrent warp-wide prefix sums within a block of
+ * 128 threads (one per each of the 32-thread warps).
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ * // Specialize WarpScan for type int
+ * typedef cub::WarpScan<int> WarpScan;
+ *
+ * // Allocate WarpScan shared memory for 4 warps
+ * __shared__ typename WarpScan::TempStorage temp_storage[4];
+ *
+ * // Obtain one input item per thread
+ * int thread_data = ...
+ *
+ * // Compute in|exclusive warp-wide prefix sums
+ * int inclusive_partial, exclusive_partial;
+ * int warp_id = threadIdx.x / 32;
+ * WarpScan(temp_storage[warp_id]).Sum(thread_data, inclusive_partial, exclusive_partial);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}.
+ * The corresponding output \p inclusive_partial in each of the four warps of threads will be
+ * {1, 2, 3, ..., 32}.
+ * The corresponding output \p exclusive_partial in each of the four warps of threads will be
+ * {0, 1, 2, ..., 31}.
+ *
+ */
+ __device__ __forceinline__ void Sum(
+ T input, ///< [in] Calling thread's input item.
+ T &inclusive_output, ///< [out] Calling thread's inclusive-scan output item.
+ T &exclusive_output) ///< [out] Calling thread's exclusive-scan output item.
+ {
+ Sum(input, inclusive_output, exclusive_output, Int2Type<IS_INTEGER>());
+ }
+
+
+ /**
+ * \brief Computes both inclusive and exclusive prefix scans using the specified binary scan functor across the calling warp.
+ *
+ * Supports non-commutative scan operators.
+ *
+ * \smemreuse
+ *
+ * \par Snippet
+ * The code snippet below illustrates four concurrent warp-wide prefix max scans within a block of
+ * 128 threads (one per each of the 32-thread warps).
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ * // Specialize WarpScan for type int
+ * typedef cub::WarpScan<int> WarpScan;
+ *
+ * // Allocate WarpScan shared memory for 4 warps
+ * __shared__ typename WarpScan::TempStorage temp_storage[4];
+ *
+ * // Obtain one input item per thread
+ * int thread_data = ...
+ *
+ * // Compute in|exclusive warp-wide prefix max scans
+ * int warp_id = threadIdx.x / 32;
+ * int inclusive_partial, exclusive_partial;
+ * WarpScan(temp_storage[warp_id]).Scan(thread_data, inclusive_partial, exclusive_partial, INT_MIN, cub::Max());
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}.
+ * The corresponding output \p inclusive_partial in the first warp would be
+ * {0, 0, 2, 2, ..., 30, 30}, the output for the second warp would be {32, 32, 34, 34, ..., 62, 62}, etc.
+ * The corresponding output \p exclusive_partial in the first warp would be
+ * {INT_MIN, 0, 0, 2, ..., 28, 30}, the output for the second warp would be {INT_MIN, 32, 32, 34, ..., 60, 62}, etc.
+ *
+ * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b)
+ */
+ template <typename ScanOp>
+ __device__ __forceinline__ void Scan(
+ T input, ///< [in] Calling thread's input item.
+ T &inclusive_output, ///< [out] Calling thread's inclusive-scan output item.
+ T &exclusive_output, ///< [out] Calling thread's exclusive-scan output item.
+ T identity, ///< [in] Identity value
+ ScanOp scan_op) ///< [in] Binary scan operator
+ {
+ InternalWarpScan(temp_storage).Scan(input, inclusive_output, exclusive_output, identity, scan_op);
+ }
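+
+ /* A minimal driver for the combination overloads above, computing both
+ * partials in one call (one warp of 32 threads); the kernel and buffer
+ * names (WarpComboKernel, d_incl, d_excl) are illustrative only.
+ * \code
+ * #include <cub/cub.cuh>
+ * #include <cstdio>
+ *
+ * __global__ void WarpComboKernel(int *d_incl, int *d_excl)
+ * {
+ * typedef cub::WarpScan<int> WarpScan;
+ * __shared__ typename WarpScan::TempStorage temp_storage;
+ *
+ * int input = 1;
+ * int inclusive_partial, exclusive_partial;
+ * WarpScan(temp_storage).Sum(input, inclusive_partial, exclusive_partial);
+ *
+ * d_incl[threadIdx.x] = inclusive_partial; // 1, 2, ..., 32
+ * d_excl[threadIdx.x] = exclusive_partial; // 0, 1, ..., 31
+ * }
+ *
+ * int main()
+ * {
+ * int *d_incl, *d_excl, h_incl[32], h_excl[32];
+ * cudaMalloc(&d_incl, sizeof(h_incl));
+ * cudaMalloc(&d_excl, sizeof(h_excl));
+ * WarpComboKernel<<<1, 32>>>(d_incl, d_excl);
+ * cudaMemcpy(h_incl, d_incl, sizeof(h_incl), cudaMemcpyDeviceToHost);
+ * cudaMemcpy(h_excl, d_excl, sizeof(h_excl), cudaMemcpyDeviceToHost);
+ * printf("%d %d\n", h_incl[31], h_excl[31]); // prints: 32 31
+ * cudaFree(d_incl); cudaFree(d_excl);
+ * return 0;
+ * }
+ * \endcode
+ */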
+
+
+ /**
+ * \brief Computes both inclusive and exclusive prefix scans using the specified binary scan functor across the calling warp. Because no identity value is supplied, the \p exclusive_output computed for warp-lane0 is undefined.
+ *
+ * Supports non-commutative scan operators.
+ *
+ * \smemreuse
+ *
+ * \par Snippet
+ * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of
+ * 128 threads (one per each of the 32-thread warps).
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ * // Specialize WarpScan for type int
+ * typedef cub::WarpScan<int> WarpScan;
+ *
+ * // Allocate WarpScan shared memory for 4 warps
+ * __shared__ typename WarpScan::TempStorage temp_storage[4];
+ *
+ * // Obtain one input item per thread
+ * int thread_data = ...
+ *
+ * // Compute in|exclusive warp-wide prefix max scans
+ * int warp_id = threadIdx.x / 32;
+ * int inclusive_partial, exclusive_partial;
+ * WarpScan(temp_storage[warp_id]).Scan(thread_data, inclusive_partial, exclusive_partial, cub::Max());
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}.
+ * The corresponding output \p inclusive_partial in the first warp would be
+ * {0, 0, 2, 2, ..., 30, 30}, the output for the second warp would be {32, 32, 34, 34, ..., 62, 62}, etc.
+ * The corresponding output \p exclusive_partial in the first warp would be
+ * {?, 0, 0, 2, ..., 28, 30}, the output for the second warp would be {?, 32, 32, 34, ..., 60, 62}, etc.
+ * (The output \p exclusive_partial in warp lane0 is undefined.)
+ *
+ * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b)
+ */
+ template <typename ScanOp>
+ __device__ __forceinline__ void Scan(
+ T input, ///< [in] Calling thread's input item.
+ T &inclusive_output, ///< [out] Calling thread's inclusive-scan output item.
+ T &exclusive_output, ///< [out] Calling thread's exclusive-scan output item.
+ ScanOp scan_op) ///< [in] Binary scan operator
+ {
+ InternalWarpScan(temp_storage).Scan(input, inclusive_output, exclusive_output, scan_op);
+ }
+
+
+ //@} end member group
+};
+
+/** @} */ // end group WarpModule
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)

From f0abc3c49375cb50e07eee12db26f033c60f8d13 Mon Sep 17 00:00:00 2001
From: piyush sao
Date: Fri, 5 Feb 2021 12:26:09 -0500
Subject: [PATCH 061/147] Forcing everything on GPU; Fixing a memory free;
 minor fixes on superlu_gpu.cu, some configurations give incorrect results

---
 SRC/superlu_gpu.cu         | 1458 ++++++++++++++++++------------------
 SRC/treeFactorizationGPU.c |   32 +-
 SRC/util.c                 |   11 +-
 3 files changed, 764 insertions(+), 737 deletions(-)

diff --git a/SRC/superlu_gpu.cu b/SRC/superlu_gpu.cu
index e8eaa92c..73c4bc29 100644
--- a/SRC/superlu_gpu.cu
+++ b/SRC/superlu_gpu.cu
@@ -48,7 +48,7 @@ cudaError_t checkCuda(cudaError_t result)
 
 int_t getnCudaStreams()
 {
- char *ttemp;
+ char *ttemp;
 ttemp = getenv ("N_CUDA_STREAMS");
 
 if (ttemp)
@@ -111,46 +111,46 @@ void device_scatter_l_2D (int thread_id,
 int *IndirectJ3
 )
 {
- if ( thread_id < temp_nbrow * ColPerBlock )
- {
- int thread_id_x = thread_id % temp_nbrow;
- int thread_id_y = thread_id / temp_nbrow;
 #define UNROLL_ITER 8
 #pragma unroll 4
- for (int col = thread_id_y; col < nnz_cols ; col += ColPerBlock)
- {
- nzval[ldv * IndirectJ3[col] + indirect2_thread[thread_id_x]]
- -= tempv[nbrow * col + thread_id_x];
+ if ( thread_id < temp_nbrow * ColPerBlock )
+ {
+ int thread_id_x = thread_id % temp_nbrow;
+ int thread_id_y = thread_id / temp_nbrow;
 #define UNROLL_ITER 8
 #pragma unroll 4
+ for (int col = thread_id_y; col < nnz_cols ; col += ColPerBlock)
+ {
+ nzval[ldv * IndirectJ3[col] + indirect2_thread[thread_id_x]]
+ -= tempv[nbrow * col + thread_id_x];
+ }
 }
- }
 }
 
 /* Sherry: this routine is not used */
 __global__ void cub_scan_test(void)
 {
- int thread_id = threadIdx.x;
- typedef cub::BlockScan<int, MAX_SUPER_SIZE> BlockScan; /*1D int data type*/
+ int thread_id = threadIdx.x;
+ typedef cub::BlockScan<int, MAX_SUPER_SIZE> BlockScan; /*1D int data type*/
 
- __shared__ typename BlockScan::TempStorage temp_storage; /*storage temp*/
-
- __shared__ int
IndirectJ1[MAX_SUPER_SIZE]; - __shared__ int IndirectJ2[MAX_SUPER_SIZE]; + __shared__ typename BlockScan::TempStorage temp_storage; /*storage temp*/ - if (thread_id < MAX_SUPER_SIZE) - { - IndirectJ1[thread_id] = (thread_id + 1) % 2; - } + __shared__ int IndirectJ1[MAX_SUPER_SIZE]; + __shared__ int IndirectJ2[MAX_SUPER_SIZE]; - __syncthreads(); - if (thread_id < MAX_SUPER_SIZE) - BlockScan(temp_storage).InclusiveSum (IndirectJ1[thread_id], IndirectJ2[thread_id]); + if (thread_id < MAX_SUPER_SIZE) + { + IndirectJ1[thread_id] = (thread_id + 1) % 2; + } + + __syncthreads(); + if (thread_id < MAX_SUPER_SIZE) + BlockScan(temp_storage).InclusiveSum (IndirectJ1[thread_id], IndirectJ2[thread_id]); - if (thread_id < MAX_SUPER_SIZE) - printf("%d %d\n", thread_id, IndirectJ2[thread_id]); + if (thread_id < MAX_SUPER_SIZE) + printf("%d %d\n", thread_id, IndirectJ2[thread_id]); } @@ -168,19 +168,19 @@ void device_scatter_u_2D (int thread_id, int *IndirectJ3 ) { - if ( thread_id < temp_nbrow * ColPerBlock ) - { - /* 1D threads are logically arranged in 2D shape. */ - int thread_id_x = thread_id % temp_nbrow; - int thread_id_y = thread_id / temp_nbrow; + if ( thread_id < temp_nbrow * ColPerBlock ) + { + /* 1D threads are logically arranged in 2D shape. */ + int thread_id_x = thread_id % temp_nbrow; + int thread_id_y = thread_id / temp_nbrow; #pragma unroll 4 - for (int col = thread_id_y; col < nnz_cols ; col += ColPerBlock) - { - ucol[IndirectJ1[IndirectJ3[col]] + indirect[thread_id_x]] - -= tempv[nbrow * col + thread_id_x]; + for (int col = thread_id_y; col < nnz_cols ; col += ColPerBlock) + { + ucol[IndirectJ1[IndirectJ3[col]] + indirect[thread_id_x]] + -= tempv[nbrow * col + thread_id_x]; + } } - } } @@ -226,7 +226,7 @@ void device_scatter_u (int_t thread_id, __global__ void Scatter_GPU_kernel( int_t streamId, - int_t ii_st, int_t ii_end, + int_t ii_st, int_t ii_end, int_t jj_st, int_t jj_end, /* defines rectangular Schur block to be scatter */ int_t klst, int_t jj0, /* 0 on entry */ @@ -234,221 +234,224 @@ void Scatter_GPU_kernel( LUstruct_gpu * A_gpu) { - /* initializing pointers */ - int_t *xsup = A_gpu->xsup; - int_t *UrowindPtr = A_gpu->UrowindPtr; - int_t *UrowindVec = A_gpu->UrowindVec; - int_t *UnzvalPtr = A_gpu->UnzvalPtr; - double *UnzvalVec = A_gpu->UnzvalVec; - int_t *LrowindPtr = A_gpu->LrowindPtr; - int_t *LrowindVec = A_gpu->LrowindVec; - int_t *LnzvalPtr = A_gpu->LnzvalPtr; - double *LnzvalVec = A_gpu->LnzvalVec; - double *bigV = A_gpu->scubufs[streamId].bigV; - local_l_blk_info_t *local_l_blk_infoVec = A_gpu->local_l_blk_infoVec; - local_u_blk_info_t *local_u_blk_infoVec = A_gpu->local_u_blk_infoVec; - int_t *local_l_blk_infoPtr = A_gpu->local_l_blk_infoPtr; - int_t *local_u_blk_infoPtr = A_gpu->local_u_blk_infoPtr; - Remain_info_t *Remain_info = A_gpu->scubufs[streamId].Remain_info; - Ublock_info_t *Ublock_info = A_gpu->scubufs[streamId].Ublock_info; - int_t *lsub = A_gpu->scubufs[streamId].lsub; - int_t *usub = A_gpu->scubufs[streamId].usub; - - /* thread block assignment: this thread block is - assigned to block (lb, j) in 2D grid */ - int lb = blockIdx.x + ii_st; - int j = blockIdx.y + jj_st; - __shared__ int indirect_thread[MAX_SUPER_SIZE]; /* row-wise */ - __shared__ int indirect2_thread[MAX_SUPER_SIZE]; /* row-wise */ - __shared__ int IndirectJ1[THREAD_BLOCK_SIZE]; /* column-wise */ - __shared__ int IndirectJ3[THREAD_BLOCK_SIZE]; /* column-wise */ - - /* see CUB page https://nvlabs.github.io/cub/. 
Implement threads collectives */ - typedef cub::BlockScan BlockScan; /*1D int data type*/ - __shared__ typename BlockScan::TempStorage temp_storage; /*storage temp*/ - - int thread_id = threadIdx.x; - - int iukp = Ublock_info[j].iukp; - int jb = Ublock_info[j].jb; - int nsupc = SuperSize (jb); - int ljb = jb / npcol; - - double *tempv1; - if (jj_st == jj0) - { - tempv1 = (j == jj_st) ? bigV - : bigV + Ublock_info[j - 1].full_u_cols * nrows; - } - else - { - tempv1 = (j == jj_st) ? bigV - : bigV + (Ublock_info[j - 1].full_u_cols - - Ublock_info[jj_st - 1].full_u_cols) * nrows; - } - - /* # of nonzero columns in block j */ - int nnz_cols = (j == 0) ? Ublock_info[j].full_u_cols - : (Ublock_info[j].full_u_cols - Ublock_info[j - 1].full_u_cols); - int cum_ncol = (j == 0) ? 0 : Ublock_info[j - 1].full_u_cols; - - int lptr = Remain_info[lb].lptr; - int ib = Remain_info[lb].ib; - int temp_nbrow = lsub[lptr + 1]; /* number of rows in the current L block */ - lptr += LB_DESCRIPTOR; - - int_t cum_nrow; - if (ii_st == 0) - { - cum_nrow = (lb == 0 ? 0 : Remain_info[lb - 1].FullRow); - } - else - { - cum_nrow = (lb == 0 ? 0 : Remain_info[lb - 1].FullRow - Remain_info[ii_st - 1].FullRow); - } - - tempv1 += cum_nrow; - - if (ib < jb) /*scatter U code */ - { - int ilst = FstBlockC (ib + 1); - int lib = ib / nprow; /* local index of row block ib */ - int_t *index = &UrowindVec[UrowindPtr[lib]]; - - int num_u_blocks = index[0]; - - int ljb = (jb) / npcol; /* local index of column block jb */ - - /* Each thread is responsible for one block column */ - __shared__ int ljb_ind; - /*do a search ljb_ind at local row lib*/ - int blks_per_threads = CEILING(num_u_blocks, THREAD_BLOCK_SIZE); - for (int i = 0; i < blks_per_threads; ++i) - /* each thread is assigned a chunk of consecutive U blocks to search */ - { - /* only one thread finds the block index matching ljb */ - if (thread_id * blks_per_threads + i < num_u_blocks && - local_u_blk_infoVec[ local_u_blk_infoPtr[lib] + thread_id * blks_per_threads + i ].ljb == ljb) - { - ljb_ind = thread_id * blks_per_threads + i; - } + /* initializing pointers */ + int_t *xsup = A_gpu->xsup; + int_t *UrowindPtr = A_gpu->UrowindPtr; + int_t *UrowindVec = A_gpu->UrowindVec; + int_t *UnzvalPtr = A_gpu->UnzvalPtr; + double *UnzvalVec = A_gpu->UnzvalVec; + int_t *LrowindPtr = A_gpu->LrowindPtr; + int_t *LrowindVec = A_gpu->LrowindVec; + int_t *LnzvalPtr = A_gpu->LnzvalPtr; + double *LnzvalVec = A_gpu->LnzvalVec; + double *bigV = A_gpu->scubufs[streamId].bigV; + local_l_blk_info_t *local_l_blk_infoVec = A_gpu->local_l_blk_infoVec; + local_u_blk_info_t *local_u_blk_infoVec = A_gpu->local_u_blk_infoVec; + int_t *local_l_blk_infoPtr = A_gpu->local_l_blk_infoPtr; + int_t *local_u_blk_infoPtr = A_gpu->local_u_blk_infoPtr; + Remain_info_t *Remain_info = A_gpu->scubufs[streamId].Remain_info; + Ublock_info_t *Ublock_info = A_gpu->scubufs[streamId].Ublock_info; + int_t *lsub = A_gpu->scubufs[streamId].lsub; + int_t *usub = A_gpu->scubufs[streamId].usub; + + /* thread block assignment: this thread block is + assigned to block (lb, j) in 2D grid */ + int lb = blockIdx.x + ii_st; + int j = blockIdx.y + jj_st; + __shared__ int indirect_thread[MAX_SUPER_SIZE]; /* row-wise */ + __shared__ int indirect2_thread[MAX_SUPER_SIZE]; /* row-wise */ + __shared__ int IndirectJ1[THREAD_BLOCK_SIZE]; /* column-wise */ + __shared__ int IndirectJ3[THREAD_BLOCK_SIZE]; /* column-wise */ + + /* see CUB page https://nvlabs.github.io/cub/. 
Implement threads collectives */ + typedef cub::BlockScan BlockScan; /*1D int data type*/ + __shared__ typename BlockScan::TempStorage temp_storage; /*storage temp*/ + + int thread_id = threadIdx.x; + + int iukp = Ublock_info[j].iukp; + int jb = Ublock_info[j].jb; + int nsupc = SuperSize (jb); + int ljb = jb / npcol; + + double *tempv1; + if (jj_st == jj0) + { + tempv1 = (j == jj_st) ? bigV + : bigV + Ublock_info[j - 1].full_u_cols * nrows; } - __syncthreads(); + else + { + tempv1 = (j == jj_st) ? bigV + : bigV + (Ublock_info[j - 1].full_u_cols - + Ublock_info[jj_st - 1].full_u_cols) * nrows; + } + + /* # of nonzero columns in block j */ + int nnz_cols = (j == 0) ? Ublock_info[j].full_u_cols + : (Ublock_info[j].full_u_cols - Ublock_info[j - 1].full_u_cols); + int cum_ncol = (j == 0) ? 0 : Ublock_info[j - 1].full_u_cols; - int iuip_lib = local_u_blk_infoVec[ local_u_blk_infoPtr[lib] + ljb_ind].iuip; - int ruip_lib = local_u_blk_infoVec[ local_u_blk_infoPtr[lib] + ljb_ind].ruip; - iuip_lib += UB_DESCRIPTOR; - double *Unzval_lib = &UnzvalVec[UnzvalPtr[lib]]; - double *ucol = &Unzval_lib[ruip_lib]; + int lptr = Remain_info[lb].lptr; + int ib = Remain_info[lb].ib; + int temp_nbrow = lsub[lptr + 1]; /* number of rows in the current L block */ + lptr += LB_DESCRIPTOR; - if (thread_id < temp_nbrow) /* row-wise */ - { /* cyclically map each thread to a row */ - indirect_thread[thread_id] = (int) lsub[lptr + thread_id]; + int_t cum_nrow; + if (ii_st == 0) + { + cum_nrow = (lb == 0 ? 0 : Remain_info[lb - 1].FullRow); + } + else + { + cum_nrow = (lb == 0 ? 0 : Remain_info[lb - 1].FullRow - Remain_info[ii_st - 1].FullRow); } - /* column-wise: each thread is assigned one column */ - if (thread_id < nnz_cols) - IndirectJ3[thread_id] = A_gpu->scubufs[streamId].usub_IndirectJ3[cum_ncol + thread_id]; - /* indirectJ3[j] == kk means the j-th nonzero segment - points to column kk in this supernode */ + tempv1 += cum_nrow; - __syncthreads(); + if (ib < jb) /*scatter U code */ + { + int ilst = FstBlockC (ib + 1); + int lib = ib / nprow; /* local index of row block ib */ + int_t *index = &UrowindVec[UrowindPtr[lib]]; - /* threads are divided into multiple columns */ - int ColPerBlock = THREAD_BLOCK_SIZE / temp_nbrow; - - if (thread_id < THREAD_BLOCK_SIZE) - IndirectJ1[thread_id] = 0; + int num_u_blocks = index[0]; - if (thread_id < THREAD_BLOCK_SIZE) - { - if (thread_id < nsupc) - { - /* fstnz subscript of each column in the block */ - IndirectJ1[thread_id] = index[iuip_lib + thread_id]; - } - } + int ljb = (jb) / npcol; /* local index of column block jb */ - /* perform an inclusive block-wide prefix sum among all threads */ - if (thread_id < THREAD_BLOCK_SIZE) - BlockScan(temp_storage).InclusiveSum(IndirectJ1[thread_id], IndirectJ1[thread_id]); + /* Each thread is responsible for one block column */ + __shared__ int ljb_ind; + /*do a search ljb_ind at local row lib*/ + int blks_per_threads = CEILING(num_u_blocks, THREAD_BLOCK_SIZE); + for (int i = 0; i < blks_per_threads; ++i) + /* each thread is assigned a chunk of consecutive U blocks to search */ + { + /* only one thread finds the block index matching ljb */ + if (thread_id * blks_per_threads + i < num_u_blocks && + local_u_blk_infoVec[ local_u_blk_infoPtr[lib] + thread_id * blks_per_threads + i ].ljb == ljb) + { + ljb_ind = thread_id * blks_per_threads + i; + } + } + __syncthreads(); - if (thread_id < THREAD_BLOCK_SIZE) - IndirectJ1[thread_id] = -IndirectJ1[thread_id] + ilst * thread_id; + int iuip_lib = local_u_blk_infoVec[ local_u_blk_infoPtr[lib] + 
ljb_ind].iuip; + int ruip_lib = local_u_blk_infoVec[ local_u_blk_infoPtr[lib] + ljb_ind].ruip; + iuip_lib += UB_DESCRIPTOR; + double *Unzval_lib = &UnzvalVec[UnzvalPtr[lib]]; + double *ucol = &Unzval_lib[ruip_lib]; - __syncthreads(); + if (thread_id < temp_nbrow) /* row-wise */ + { + /* cyclically map each thread to a row */ + indirect_thread[thread_id] = (int) lsub[lptr + thread_id]; + } - device_scatter_u_2D ( - thread_id, - temp_nbrow, nsupc, - ucol, - usub, iukp, - ilst, klst, - index, iuip_lib, - tempv1, nrows, - indirect_thread, - nnz_cols, ColPerBlock, - IndirectJ1, - IndirectJ3 ); - - } else { /* ib >= jb, scatter L code */ - - int rel; - double *nzval; - int_t *index = &LrowindVec[LrowindPtr[ljb]]; - int num_l_blocks = index[0]; - int ldv = index[1]; - - int fnz = FstBlockC (ib); - int lib = ib / nprow; - - __shared__ int lib_ind; - /*do a search lib_ind for lib*/ - int blks_per_threads = CEILING(num_l_blocks, THREAD_BLOCK_SIZE); - for (int i = 0; i < blks_per_threads; ++i) - { - if (thread_id * blks_per_threads + i < num_l_blocks && - local_l_blk_infoVec[ local_l_blk_infoPtr[ljb] + thread_id * blks_per_threads + i ].lib == lib) - { - lib_ind = thread_id * blks_per_threads + i; - } - } - __syncthreads(); + /* column-wise: each thread is assigned one column */ + if (thread_id < nnz_cols) + IndirectJ3[thread_id] = A_gpu->scubufs[streamId].usub_IndirectJ3[cum_ncol + thread_id]; + /* indirectJ3[j] == kk means the j-th nonzero segment + points to column kk in this supernode */ - int lptrj = local_l_blk_infoVec[ local_l_blk_infoPtr[ljb] + lib_ind].lptrj; - int luptrj = local_l_blk_infoVec[ local_l_blk_infoPtr[ljb] + lib_ind].luptrj; - lptrj += LB_DESCRIPTOR; - int dest_nbrow = index[lptrj - 1]; + __syncthreads(); - if (thread_id < dest_nbrow) - { - rel = index[lptrj + thread_id] - fnz; - indirect_thread[rel] = thread_id; - } - __syncthreads(); + /* threads are divided into multiple columns */ + int ColPerBlock = THREAD_BLOCK_SIZE / temp_nbrow; + + if (thread_id < THREAD_BLOCK_SIZE) + IndirectJ1[thread_id] = 0; + + if (thread_id < THREAD_BLOCK_SIZE) + { + if (thread_id < nsupc) + { + /* fstnz subscript of each column in the block */ + IndirectJ1[thread_id] = index[iuip_lib + thread_id]; + } + } + + /* perform an inclusive block-wide prefix sum among all threads */ + if (thread_id < THREAD_BLOCK_SIZE) + BlockScan(temp_storage).InclusiveSum(IndirectJ1[thread_id], IndirectJ1[thread_id]); + + if (thread_id < THREAD_BLOCK_SIZE) + IndirectJ1[thread_id] = -IndirectJ1[thread_id] + ilst * thread_id; + + __syncthreads(); + + device_scatter_u_2D ( + thread_id, + temp_nbrow, nsupc, + ucol, + usub, iukp, + ilst, klst, + index, iuip_lib, + tempv1, nrows, + indirect_thread, + nnz_cols, ColPerBlock, + IndirectJ1, + IndirectJ3 ); - /* can be precalculated */ - if (thread_id < temp_nbrow) - { - rel = lsub[lptr + thread_id] - fnz; - indirect2_thread[thread_id] = indirect_thread[rel]; } - if (thread_id < nnz_cols) - IndirectJ3[thread_id] = (int) A_gpu->scubufs[streamId].usub_IndirectJ3[cum_ncol + thread_id]; - __syncthreads(); + else /* ib >= jb, scatter L code */ + { - int ColPerBlock = THREAD_BLOCK_SIZE / temp_nbrow; + int rel; + double *nzval; + int_t *index = &LrowindVec[LrowindPtr[ljb]]; + int num_l_blocks = index[0]; + int ldv = index[1]; + + int fnz = FstBlockC (ib); + int lib = ib / nprow; + + __shared__ int lib_ind; + /*do a search lib_ind for lib*/ + int blks_per_threads = CEILING(num_l_blocks, THREAD_BLOCK_SIZE); + for (int i = 0; i < blks_per_threads; ++i) + { + if (thread_id * blks_per_threads 
+ i < num_l_blocks && + local_l_blk_infoVec[ local_l_blk_infoPtr[ljb] + thread_id * blks_per_threads + i ].lib == lib) + { + lib_ind = thread_id * blks_per_threads + i; + } + } + __syncthreads(); + + int lptrj = local_l_blk_infoVec[ local_l_blk_infoPtr[ljb] + lib_ind].lptrj; + int luptrj = local_l_blk_infoVec[ local_l_blk_infoPtr[ljb] + lib_ind].luptrj; + lptrj += LB_DESCRIPTOR; + int dest_nbrow = index[lptrj - 1]; + + if (thread_id < dest_nbrow) + { + rel = index[lptrj + thread_id] - fnz; + indirect_thread[rel] = thread_id; + } + __syncthreads(); - nzval = &LnzvalVec[LnzvalPtr[ljb]] + luptrj; - device_scatter_l_2D( - thread_id, - nsupc, temp_nbrow, - usub, iukp, klst, - nzval, ldv, - tempv1, nrows, indirect2_thread, - nnz_cols, ColPerBlock, - IndirectJ3); - } /* end else ib >= jb */ + /* can be precalculated */ + if (thread_id < temp_nbrow) + { + rel = lsub[lptr + thread_id] - fnz; + indirect2_thread[thread_id] = indirect_thread[rel]; + } + if (thread_id < nnz_cols) + IndirectJ3[thread_id] = (int) A_gpu->scubufs[streamId].usub_IndirectJ3[cum_ncol + thread_id]; + __syncthreads(); + + int ColPerBlock = THREAD_BLOCK_SIZE / temp_nbrow; + + nzval = &LnzvalVec[LnzvalPtr[ljb]] + luptrj; + device_scatter_l_2D( + thread_id, + nsupc, temp_nbrow, + usub, iukp, klst, + nzval, ldv, + tempv1, nrows, indirect2_thread, + nnz_cols, ColPerBlock, + IndirectJ3); + } /* end else ib >= jb */ } /* end Scatter_GPU_kernel */ @@ -470,263 +473,264 @@ int_t SchurCompUpdate_GPU( ) { - LUstruct_gpu * A_gpu = sluGPU->A_gpu; - LUstruct_gpu * dA_gpu = sluGPU->dA_gpu; - int_t nprow = grid->nprow; - int_t npcol = grid->npcol; - - cudaStream_t FunCallStream = sluGPU->funCallStreams[streamId]; - cublasHandle_t cublas_handle0 = sluGPU->cublasHandles[streamId]; - int_t * lsub = A_gpu->scubufs[streamId].lsub_buf; - int_t * usub = A_gpu->scubufs[streamId].usub_buf; - Remain_info_t *Remain_info = A_gpu->scubufs[streamId].Remain_info_host; - double * Remain_L_buff = A_gpu->scubufs[streamId].Remain_L_buff_host; - Ublock_info_t *Ublock_info = A_gpu->scubufs[streamId].Ublock_info_host; - double * bigU = A_gpu->scubufs[streamId].bigU_host; - - A_gpu->isOffloaded[k0] = 1; - /* start by sending data to */ - int_t *xsup = A_gpu->xsup_host; - int_t col_back = (jj_cpu == 0) ? 0 : Ublock_info[jj_cpu - 1].full_u_cols; - // if(nub<1) return; - int_t ncols = Ublock_info[nub - 1].full_u_cols - col_back; - - /* Sherry: can get max_super_size from sp_ienv(3) */ - int_t indirectJ1[MAX_SUPER_SIZE]; // 0 indicates an empry segment - int_t indirectJ2[MAX_SUPER_SIZE]; // # of nonzero segments so far - int_t indirectJ3[MAX_SUPER_SIZE]; /* indirectJ3[j] == k means the + LUstruct_gpu * A_gpu = sluGPU->A_gpu; + LUstruct_gpu * dA_gpu = sluGPU->dA_gpu; + int_t nprow = grid->nprow; + int_t npcol = grid->npcol; + + cudaStream_t FunCallStream = sluGPU->funCallStreams[streamId]; + cublasHandle_t cublas_handle0 = sluGPU->cublasHandles[streamId]; + int_t * lsub = A_gpu->scubufs[streamId].lsub_buf; + int_t * usub = A_gpu->scubufs[streamId].usub_buf; + Remain_info_t *Remain_info = A_gpu->scubufs[streamId].Remain_info_host; + double * Remain_L_buff = A_gpu->scubufs[streamId].Remain_L_buff_host; + Ublock_info_t *Ublock_info = A_gpu->scubufs[streamId].Ublock_info_host; + double * bigU = A_gpu->scubufs[streamId].bigU_host; + + A_gpu->isOffloaded[k0] = 1; + /* start by sending data to */ + int_t *xsup = A_gpu->xsup_host; + int_t col_back = (jj_cpu == 0) ? 
0 : Ublock_info[jj_cpu - 1].full_u_cols; + // if(nub<1) return; + int_t ncols = Ublock_info[nub - 1].full_u_cols - col_back; + + /* Sherry: can get max_super_size from sp_ienv(3) */ + int_t indirectJ1[MAX_SUPER_SIZE]; // 0 indicates an empry segment + int_t indirectJ2[MAX_SUPER_SIZE]; // # of nonzero segments so far + int_t indirectJ3[MAX_SUPER_SIZE]; /* indirectJ3[j] == k means the j-th nonzero segment points to column k in this supernode */ - /* calculate usub_indirect */ - for (int jj = jj_cpu; jj < nub; ++jj) - { - int_t iukp = Ublock_info[jj].iukp; - int_t jb = Ublock_info[jj].jb; - int_t nsupc = SuperSize (jb); - int_t addr = (jj == 0) ? 0 - : Ublock_info[jj - 1].full_u_cols - col_back; - - for (int_t kk = 0; kk < MAX_SUPER_SIZE; ++kk) + /* calculate usub_indirect */ + for (int jj = jj_cpu; jj < nub; ++jj) { - indirectJ1[kk] = 0; - } + int_t iukp = Ublock_info[jj].iukp; + int_t jb = Ublock_info[jj].jb; + int_t nsupc = SuperSize (jb); + int_t addr = (jj == 0) ? 0 + : Ublock_info[jj - 1].full_u_cols - col_back; - for (int_t kk = 0; kk < nsupc; ++kk) - { - indirectJ1[kk] = ((klst - usub[iukp + kk]) == 0) ? 0 : 1; - } + for (int_t kk = 0; kk < MAX_SUPER_SIZE; ++kk) + { + indirectJ1[kk] = 0; + } - /*prefix sum - indicates # of nonzero segments up to column kk */ - indirectJ2[0] = indirectJ1[0]; - for (int_t kk = 1; kk < MAX_SUPER_SIZE; ++kk) - { - indirectJ2[kk] = indirectJ2[kk - 1] + indirectJ1[kk]; - } + for (int_t kk = 0; kk < nsupc; ++kk) + { + indirectJ1[kk] = ((klst - usub[iukp + kk]) == 0) ? 0 : 1; + } - /* total number of nonzero segments in this supernode */ - int nnz_col = indirectJ2[MAX_SUPER_SIZE - 1]; + /*prefix sum - indicates # of nonzero segments up to column kk */ + indirectJ2[0] = indirectJ1[0]; + for (int_t kk = 1; kk < MAX_SUPER_SIZE; ++kk) + { + indirectJ2[kk] = indirectJ2[kk - 1] + indirectJ1[kk]; + } - /* compactation */ - for (int_t kk = 0; kk < MAX_SUPER_SIZE; ++kk) - { - if (indirectJ1[kk]) /* kk is a nonzero segment */ - { - /* indirectJ3[j] == kk means the j-th nonzero segment - points to column kk in this supernode */ - indirectJ3[indirectJ2[kk] - 1] = kk; - } - } + /* total number of nonzero segments in this supernode */ + int nnz_col = indirectJ2[MAX_SUPER_SIZE - 1]; - for (int i = 0; i < nnz_col; ++i) - { - /* addr == total # of full columns before current block jj */ - A_gpu->scubufs[streamId].usub_IndirectJ3_host[addr + i] = indirectJ3[i]; - } - } /* end for jj ... 
calculate usub_indirect */ - - //printf("SchurCompUpdate_GPU[3]: jj_cpu %d, nub %d\n", jj_cpu, nub); fflush(stdout); - - /*sizeof RemainLbuf = Rnbuf*knsupc */ - double tTmp = SuperLU_timer_(); - cudaEventRecord(A_gpu->ePCIeH2D[k0], FunCallStream); - - checkCuda(cudaMemcpyAsync(A_gpu->scubufs[streamId].usub_IndirectJ3, - A_gpu->scubufs[streamId].usub_IndirectJ3_host, - ncols * sizeof(int_t), cudaMemcpyHostToDevice, - FunCallStream)) ; - - checkCuda(cudaMemcpyAsync(A_gpu->scubufs[streamId].Remain_L_buff, Remain_L_buff, - Remain_lbuf_send_size * sizeof(double), - cudaMemcpyHostToDevice, FunCallStream)) ; - - checkCuda(cudaMemcpyAsync(A_gpu->scubufs[streamId].bigU, bigU, - bigu_send_size * sizeof(double), - cudaMemcpyHostToDevice, FunCallStream) ); - - checkCuda(cudaMemcpyAsync(A_gpu->scubufs[streamId].Remain_info, Remain_info, - RemainBlk * sizeof(Remain_info_t), - cudaMemcpyHostToDevice, FunCallStream) ); - - checkCuda(cudaMemcpyAsync(A_gpu->scubufs[streamId].Ublock_info, Ublock_info, - mcb * sizeof(Ublock_info_t), cudaMemcpyHostToDevice, - FunCallStream) ); - - checkCuda(cudaMemcpyAsync(A_gpu->scubufs[streamId].lsub, lsub, - lsub_len * sizeof(int_t), cudaMemcpyHostToDevice, - FunCallStream) ); - - checkCuda(cudaMemcpyAsync(A_gpu->scubufs[streamId].usub, usub, - usub_len * sizeof(int_t), cudaMemcpyHostToDevice, - FunCallStream) ); - - A_gpu->tHost_PCIeH2D += SuperLU_timer_() - tTmp; - A_gpu->cPCIeH2D += Remain_lbuf_send_size * sizeof(double) - + bigu_send_size * sizeof(double) - + RemainBlk * sizeof(Remain_info_t) - + mcb * sizeof(Ublock_info_t) - + lsub_len * sizeof(int_t) - + usub_len * sizeof(int_t); - - double alpha = 1.0, beta = 0.0; - - int_t ii_st = 0; - int_t ii_end = 0; - int_t maxGemmBlockDim= (int) sqrt(buffer_size); - // int_t maxGemmBlockDim = 8000; - - /* Organize GEMM by blocks of [ii_st : ii_end, jj_st : jj_end] that - fits in the buffer_size */ - while (ii_end < RemainBlk) - { - ii_st = ii_end; - ii_end = RemainBlk; - int_t nrow_max = maxGemmBlockDim; -// nrow_max = Rnbrow; - int_t remaining_rows = (ii_st == 0) ? Rnbrow : Rnbrow - Remain_info[ii_st - 1].FullRow; - nrow_max = (remaining_rows / nrow_max) > 0 ? remaining_rows / CEILING(remaining_rows, nrow_max) : nrow_max; + /* compactation */ + for (int_t kk = 0; kk < MAX_SUPER_SIZE; ++kk) + { + if (indirectJ1[kk]) /* kk is a nonzero segment */ + { + /* indirectJ3[j] == kk means the j-th nonzero segment + points to column kk in this supernode */ + indirectJ3[indirectJ2[kk] - 1] = kk; + } + } - int_t ResRow = (ii_st == 0) ? 0 : Remain_info[ii_st - 1].FullRow; - for (int_t i = ii_st; i < RemainBlk - 1; ++i) - { - if ( Remain_info[i + 1].FullRow > ResRow + nrow_max) + for (int i = 0; i < nnz_col; ++i) { - ii_end = i; - break; /* row dimension reaches nrow_max */ + /* addr == total # of full columns before current block jj */ + A_gpu->scubufs[streamId].usub_IndirectJ3_host[addr + i] = indirectJ3[i]; } - } + } /* end for jj ... 
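
The usub_indirect computation above is a mask / prefix-sum / gather pattern: indirectJ1 flags the columns of the supernode that carry a nonzero segment, indirectJ2 takes an inclusive prefix sum of those flags, and the final pass scatters each flagged column index kk into slot indirectJ2[kk] - 1 of indirectJ3. A minimal standalone sketch of the same pattern, with a toy size N standing in for MAX_SUPER_SIZE (an illustration, not the library's code):

    #include <stdio.h>

    #define N 8 /* stand-in for MAX_SUPER_SIZE */

    int main(void)
    {
        /* J1[k] = 1 if column k holds a nonzero segment, else 0 */
        int J1[N] = { 1, 0, 1, 1, 0, 0, 1, 0 };
        int J2[N]; /* inclusive prefix sum: # of segments up to column k */
        int J3[N]; /* J3[j] = column index of the j-th nonzero segment   */

        J2[0] = J1[0];
        for (int k = 1; k < N; ++k)
            J2[k] = J2[k - 1] + J1[k];

        int nnz_col = J2[N - 1]; /* total number of nonzero segments */

        for (int k = 0; k < N; ++k)
            if (J1[k])
                J3[J2[k] - 1] = k; /* compaction: j-th segment -> column k */

        for (int j = 0; j < nnz_col; ++j)
            printf("segment %d -> column %d\n", j, J3[j]);
        return 0;
    }

The same three arrays drive the GPU scatter later: the kernel reads usub_IndirectJ3 to map a compacted output column back to its position inside the supernode.
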
calculate usub_indirect */ - int_t nrows; /* actual row dimension for GEMM */ - int_t st_row; - if (ii_st > 0) - { - nrows = Remain_info[ii_end - 1].FullRow - Remain_info[ii_st - 1].FullRow; - st_row = Remain_info[ii_st - 1].FullRow; - } - else - { - nrows = Remain_info[ii_end - 1].FullRow; - st_row = 0; - } + //printf("SchurCompUpdate_GPU[3]: jj_cpu %d, nub %d\n", jj_cpu, nub); fflush(stdout); + + /*sizeof RemainLbuf = Rnbuf*knsupc */ + double tTmp = SuperLU_timer_(); + cudaEventRecord(A_gpu->ePCIeH2D[k0], FunCallStream); + + checkCuda(cudaMemcpyAsync(A_gpu->scubufs[streamId].usub_IndirectJ3, + A_gpu->scubufs[streamId].usub_IndirectJ3_host, + ncols * sizeof(int_t), cudaMemcpyHostToDevice, + FunCallStream)) ; + + checkCuda(cudaMemcpyAsync(A_gpu->scubufs[streamId].Remain_L_buff, Remain_L_buff, + Remain_lbuf_send_size * sizeof(double), + cudaMemcpyHostToDevice, FunCallStream)) ; + + checkCuda(cudaMemcpyAsync(A_gpu->scubufs[streamId].bigU, bigU, + bigu_send_size * sizeof(double), + cudaMemcpyHostToDevice, FunCallStream) ); + + checkCuda(cudaMemcpyAsync(A_gpu->scubufs[streamId].Remain_info, Remain_info, + RemainBlk * sizeof(Remain_info_t), + cudaMemcpyHostToDevice, FunCallStream) ); + + checkCuda(cudaMemcpyAsync(A_gpu->scubufs[streamId].Ublock_info, Ublock_info, + mcb * sizeof(Ublock_info_t), cudaMemcpyHostToDevice, + FunCallStream) ); - int_t jj_st = jj_cpu; - int_t jj_end = jj_cpu; + checkCuda(cudaMemcpyAsync(A_gpu->scubufs[streamId].lsub, lsub, + lsub_len * sizeof(int_t), cudaMemcpyHostToDevice, + FunCallStream) ); - while (jj_end < nub && nrows > 0 ) + checkCuda(cudaMemcpyAsync(A_gpu->scubufs[streamId].usub, usub, + usub_len * sizeof(int_t), cudaMemcpyHostToDevice, + FunCallStream) ); + + A_gpu->tHost_PCIeH2D += SuperLU_timer_() - tTmp; + A_gpu->cPCIeH2D += Remain_lbuf_send_size * sizeof(double) + + bigu_send_size * sizeof(double) + + RemainBlk * sizeof(Remain_info_t) + + mcb * sizeof(Ublock_info_t) + + lsub_len * sizeof(int_t) + + usub_len * sizeof(int_t); + + double alpha = 1.0, beta = 0.0; + + int_t ii_st = 0; + int_t ii_end = 0; + int_t maxGemmBlockDim = (int) sqrt(buffer_size); + // int_t maxGemmBlockDim = 8000; + + /* Organize GEMM by blocks of [ii_st : ii_end, jj_st : jj_end] that + fits in the buffer_size */ + while (ii_end < RemainBlk) { - int_t remaining_cols = (jj_st == jj_cpu) ? ncols : ncols - Ublock_info[jj_st - 1].full_u_cols; - if ( remaining_cols * nrows < buffer_size) - { - jj_st = jj_end; - jj_end = nub; - } - else /* C matrix cannot fit in buffer, need to break into pieces */ - { - int_t ncol_max = buffer_size / nrows; - /** Must revisit **/ - ncol_max = SUPERLU_MIN(ncol_max, maxGemmBlockDim); - ncol_max = (remaining_cols / ncol_max) > 0 ? - remaining_cols / CEILING(remaining_cols, ncol_max) - : ncol_max; - - jj_st = jj_end; - jj_end = nub; - - int_t ResCol = (jj_st == 0) ? 0 : Ublock_info[jj_st - 1].full_u_cols; - for (int_t j = jj_st; j < nub - 1; ++j) + ii_st = ii_end; + ii_end = RemainBlk; + int_t nrow_max = maxGemmBlockDim; +// nrow_max = Rnbrow; + int_t remaining_rows = (ii_st == 0) ? Rnbrow : Rnbrow - Remain_info[ii_st - 1].FullRow; + nrow_max = (remaining_rows / nrow_max) > 0 ? remaining_rows / CEILING(remaining_rows, nrow_max) : nrow_max; + + int_t ResRow = (ii_st == 0) ? 
0 : Remain_info[ii_st - 1].FullRow; + for (int_t i = ii_st; i < RemainBlk - 1; ++i) { - if (Ublock_info[j + 1].full_u_cols > ResCol + ncol_max) - { - jj_end = j; - break; - } + if ( Remain_info[i + 1].FullRow > ResRow + nrow_max) + { + ii_end = i; + break; /* row dimension reaches nrow_max */ + } + } + + int_t nrows; /* actual row dimension for GEMM */ + int_t st_row; + if (ii_st > 0) + { + nrows = Remain_info[ii_end - 1].FullRow - Remain_info[ii_st - 1].FullRow; + st_row = Remain_info[ii_st - 1].FullRow; } - } /* end-if-else */ - - int_t ncols; - int_t st_col; - if (jj_st > 0) - { - ncols = Ublock_info[jj_end - 1].full_u_cols - Ublock_info[jj_st - 1].full_u_cols; - st_col = Ublock_info[jj_st - 1].full_u_cols; - if (ncols == 0) exit(0); - } - else - { - ncols = Ublock_info[jj_end - 1].full_u_cols; - st_col = 0; - } - - /* none of the matrix dimension is zero. */ - if (nrows > 0 && ldu > 0 && ncols > 0) - { - if (nrows * ncols > buffer_size) { - printf("!! Matrix size %lld x %lld exceeds buffer_size \n", - nrows, ncols, buffer_size); - fflush(stdout); + else + { + nrows = Remain_info[ii_end - 1].FullRow; + st_row = 0; } - assert(nrows * ncols <= buffer_size); - cublasSetStream(cublas_handle0, FunCallStream); - cudaEventRecord(A_gpu->GemmStart[k0], FunCallStream); - cublasDgemm(cublas_handle0, CUBLAS_OP_N, CUBLAS_OP_N, - nrows, ncols, ldu, &alpha, - &A_gpu->scubufs[streamId].Remain_L_buff[(knsupc - ldu) * Rnbrow + st_row], Rnbrow, - &A_gpu->scubufs[streamId].bigU[st_col * ldu], ldu, - &beta, A_gpu->scubufs[streamId].bigV, nrows); + + int_t jj_st = jj_cpu; + int_t jj_end = jj_cpu; + + while (jj_end < nub && nrows > 0 ) + { + int_t remaining_cols = (jj_st == jj_cpu) ? ncols : ncols - Ublock_info[jj_st - 1].full_u_cols; + if ( remaining_cols * nrows < buffer_size) + { + jj_st = jj_end; + jj_end = nub; + } + else /* C matrix cannot fit in buffer, need to break into pieces */ + { + int_t ncol_max = buffer_size / nrows; + /** Must revisit **/ + ncol_max = SUPERLU_MIN(ncol_max, maxGemmBlockDim); + ncol_max = (remaining_cols / ncol_max) > 0 ? + remaining_cols / CEILING(remaining_cols, ncol_max) + : ncol_max; + + jj_st = jj_end; + jj_end = nub; + + int_t ResCol = (jj_st == 0) ? 0 : Ublock_info[jj_st - 1].full_u_cols; + for (int_t j = jj_st; j < nub - 1; ++j) + { + if (Ublock_info[j + 1].full_u_cols > ResCol + ncol_max) + { + jj_end = j; + break; + } + } + } /* end-if-else */ + + int_t ncols; + int_t st_col; + if (jj_st > 0) + { + ncols = Ublock_info[jj_end - 1].full_u_cols - Ublock_info[jj_st - 1].full_u_cols; + st_col = Ublock_info[jj_st - 1].full_u_cols; + if (ncols == 0) exit(0); + } + else + { + ncols = Ublock_info[jj_end - 1].full_u_cols; + st_col = 0; + } + + /* none of the matrix dimension is zero. */ + if (nrows > 0 && ldu > 0 && ncols > 0) + { + if (nrows * ncols > buffer_size) + { + printf("!! 
Matrix size %lld x %lld exceeds buffer_size %lld \n",
+                           nrows, ncols, buffer_size);
+                    fflush(stdout);
+                }
+                assert(nrows * ncols <= buffer_size);
+                cublasSetStream(cublas_handle0, FunCallStream);
+                cudaEventRecord(A_gpu->GemmStart[k0], FunCallStream);
+                cublasDgemm(cublas_handle0, CUBLAS_OP_N, CUBLAS_OP_N,
+                            nrows, ncols, ldu, &alpha,
+                            &A_gpu->scubufs[streamId].Remain_L_buff[(knsupc - ldu) * Rnbrow + st_row], Rnbrow,
+                            &A_gpu->scubufs[streamId].bigU[st_col * ldu], ldu,
+                            &beta, A_gpu->scubufs[streamId].bigV, nrows);
 // #define SCATTER_OPT
 #ifdef SCATTER_OPT
-        cudaStreamSynchronize(FunCallStream);
+                cudaStreamSynchronize(FunCallStream);
 #warning this function is synchronous
 #endif
-        cudaEventRecord(A_gpu->GemmEnd[k0], FunCallStream);
+                cudaEventRecord(A_gpu->GemmEnd[k0], FunCallStream);
 
-        A_gpu->GemmFLOPCounter += 2.0 * (double) nrows * ncols * ldu ;
+                A_gpu->GemmFLOPCounter += 2.0 * (double) nrows * ncols * ldu ;
 
-        /*
-         * Scattering the output
-         */
-        dim3 dimBlock(THREAD_BLOCK_SIZE); // 1d thread
+                /*
+                 * Scattering the output
+                 */
+                dim3 dimBlock(THREAD_BLOCK_SIZE); // 1d thread
 
-        dim3 dimGrid(ii_end - ii_st, jj_end - jj_st);
+                dim3 dimGrid(ii_end - ii_st, jj_end - jj_st);
 
-        Scatter_GPU_kernel <<< dimGrid, dimBlock, 0, FunCallStream>>>
-        (streamId, ii_st, ii_end, jj_st, jj_end, klst,
-         0, nrows, ldt, npcol, nprow, dA_gpu);
+                Scatter_GPU_kernel <<< dimGrid, dimBlock, 0, FunCallStream>>>
+                (streamId, ii_st, ii_end, jj_st, jj_end, klst,
+                 0, nrows, ldt, npcol, nprow, dA_gpu);
 #ifdef SCATTER_OPT
-        cudaStreamSynchronize(FunCallStream);
+                cudaStreamSynchronize(FunCallStream);
 #warning this function is synchronous
 #endif
-        cudaEventRecord(A_gpu->ScatterEnd[k0], FunCallStream);
+                cudaEventRecord(A_gpu->ScatterEnd[k0], FunCallStream);
 
-        A_gpu->ScatterMOPCounter += 3.0 * (double) nrows * ncols;
-    } /* endif ... none of the matrix dimension is zero. */
+                A_gpu->ScatterMOPCounter += 3.0 * (double) nrows * ncols;
+            } /* endif ... none of the matrix dimension is zero. 
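
The blocking above keeps each GEMM tile inside the scratch buffer: a tile edge is first capped at maxGemmBlockDim (roughly sqrt(buffer_size)), then evened out so the last chunk is not disproportionately small. A sketch of that even-split rule in isolation, assuming CEILING(a, b) is the conventional (a + b - 1) / b macro:

    #include <stdio.h>

    #define CEILING(a, b) (((a) + (b) - 1) / (b))

    /* Mirrors: (extent / cap) > 0 ? extent / CEILING(extent, cap) : cap.
       Splitting 1000 rows with cap 384 yields 3 chunks of ~333 rows
       instead of 384 + 384 + 232. */
    static int even_chunk(int extent, int cap)
    {
        if (extent / cap > 0)
            return extent / CEILING(extent, cap);
        return cap;
    }

    int main(void)
    {
        printf("%d\n", even_chunk(1000, 384)); /* prints 333 */
        return 0;
    }

The loop then grows ii_end (or jj_end) until the next block would push the tile past this target, so tile boundaries always fall on block boundaries.
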
*/ - } /* end while jj_end < nub */ + } /* end while jj_end < nub */ - } /* end while (ii_end < RemainBlk) */ + } /* end while (ii_end < RemainBlk) */ - return 0; + return 0; } /* end SchurCompUpdate_GPU */ @@ -861,14 +865,16 @@ int_t free_LUstruct_gpu (LUstruct_gpu * A_gpu) -void dPrint_matrix( char *desc, int_t m, int_t n, double * dA, int_t lda ) { +void dPrint_matrix( char *desc, int_t m, int_t n, double * dA, int_t lda ) +{ double *cPtr = (double *) malloc(sizeof(double) * lda * n); checkCuda(cudaMemcpy( cPtr, dA, lda * n * sizeof(double), cudaMemcpyDeviceToHost)) ; int_t i, j; printf( "\n %s\n", desc ); - for ( i = 0; i < m; i++ ) { + for ( i = 0; i < m; i++ ) + { for ( j = 0; j < n; j++ ) printf( " %.3e", cPtr[i + j * lda] ); printf( "\n" ); } @@ -877,41 +883,41 @@ void dPrint_matrix( char *desc, int_t m, int_t n, double * dA, int_t lda ) { void printGPUStats(LUstruct_gpu * A_gpu) { - double tGemm = 0; - double tScatter = 0; - double tPCIeH2D = 0; - double tPCIeD2H = 0; + double tGemm = 0; + double tScatter = 0; + double tPCIeH2D = 0; + double tPCIeD2H = 0; - for (int_t i = 0; i < A_gpu->nsupers; ++i) - { - float milliseconds = 0; - - if (A_gpu->isOffloaded[i]) + for (int_t i = 0; i < A_gpu->nsupers; ++i) { - cudaEventElapsedTime(&milliseconds, A_gpu->ePCIeH2D[i], A_gpu->GemmStart[i]); - tPCIeH2D += 1e-3 * (double) milliseconds; - milliseconds = 0; - cudaEventElapsedTime(&milliseconds, A_gpu->GemmStart[i], A_gpu->GemmEnd[i]); - tGemm += 1e-3 * (double) milliseconds; - milliseconds = 0; - cudaEventElapsedTime(&milliseconds, A_gpu->GemmEnd[i], A_gpu->ScatterEnd[i]); - tScatter += 1e-3 * (double) milliseconds; + float milliseconds = 0; + + if (A_gpu->isOffloaded[i]) + { + cudaEventElapsedTime(&milliseconds, A_gpu->ePCIeH2D[i], A_gpu->GemmStart[i]); + tPCIeH2D += 1e-3 * (double) milliseconds; + milliseconds = 0; + cudaEventElapsedTime(&milliseconds, A_gpu->GemmStart[i], A_gpu->GemmEnd[i]); + tGemm += 1e-3 * (double) milliseconds; + milliseconds = 0; + cudaEventElapsedTime(&milliseconds, A_gpu->GemmEnd[i], A_gpu->ScatterEnd[i]); + tScatter += 1e-3 * (double) milliseconds; + } + + milliseconds = 0; + cudaEventElapsedTime(&milliseconds, A_gpu->ePCIeD2H_Start[i], A_gpu->ePCIeD2H_End[i]); + tPCIeD2H += 1e-3 * (double) milliseconds; } - milliseconds = 0; - cudaEventElapsedTime(&milliseconds, A_gpu->ePCIeD2H_Start[i], A_gpu->ePCIeD2H_End[i]); - tPCIeD2H += 1e-3 * (double) milliseconds; - } - - printf("GPU: Flops offloaded %.3e Time spent %lf Flop rate %lf GF/sec \n", - A_gpu->GemmFLOPCounter, tGemm, 1e-9 * A_gpu->GemmFLOPCounter / tGemm ); - printf("GPU: Mop offloaded %.3e Time spent %lf Bandwidth %lf GByte/sec \n", - A_gpu->ScatterMOPCounter, tScatter, 8e-9 * A_gpu->ScatterMOPCounter / tScatter ); - printf("PCIe Data Transfer H2D:\n\tData Sent %.3e(GB)\n\tTime observed from CPU %lf\n\tActual time spent %lf\n\tBandwidth %lf GByte/sec \n", - 1e-9 * A_gpu->cPCIeH2D, A_gpu->tHost_PCIeH2D, tPCIeH2D, 1e-9 * A_gpu->cPCIeH2D / tPCIeH2D ); - printf("PCIe Data Transfer D2H:\n\tData Sent %.3e(GB)\n\tTime observed from CPU %lf\n\tActual time spent %lf\n\tBandwidth %lf GByte/sec \n", - 1e-9 * A_gpu->cPCIeD2H, A_gpu->tHost_PCIeD2H, tPCIeD2H, 1e-9 * A_gpu->cPCIeD2H / tPCIeD2H ); - fflush(stdout); + printf("GPU: Flops offloaded %.3e Time spent %lf Flop rate %lf GF/sec \n", + A_gpu->GemmFLOPCounter, tGemm, 1e-9 * A_gpu->GemmFLOPCounter / tGemm ); + printf("GPU: Mop offloaded %.3e Time spent %lf Bandwidth %lf GByte/sec \n", + A_gpu->ScatterMOPCounter, tScatter, 8e-9 * A_gpu->ScatterMOPCounter / tScatter ); + 
printf("PCIe Data Transfer H2D:\n\tData Sent %.3e(GB)\n\tTime observed from CPU %lf\n\tActual time spent %lf\n\tBandwidth %lf GByte/sec \n", + 1e-9 * A_gpu->cPCIeH2D, A_gpu->tHost_PCIeH2D, tPCIeH2D, 1e-9 * A_gpu->cPCIeH2D / tPCIeH2D ); + printf("PCIe Data Transfer D2H:\n\tData Sent %.3e(GB)\n\tTime observed from CPU %lf\n\tActual time spent %lf\n\tBandwidth %lf GByte/sec \n", + 1e-9 * A_gpu->cPCIeD2H, A_gpu->tHost_PCIeD2H, tPCIeD2H, 1e-9 * A_gpu->cPCIeD2H / tPCIeD2H ); + fflush(stdout); } /* end printGPUStats */ @@ -934,12 +940,14 @@ int_t initSluGPU3D_t( int_t* isNodeInMyGrid = sluGPU->isNodeInMyGrid; sluGPU->nCudaStreams = getnCudaStreams(); - if (!grid->iam) { - printf("initSluGPU3D_t: Using hardware acceleration, with %d cuda streams \n", sluGPU->nCudaStreams); - fflush(stdout); - if ( MAX_SUPER_SIZE < ldt ) { - ABORT("MAX_SUPER_SIZE smaller than requested NSUP"); - } + if (!grid->iam) + { + printf("initSluGPU3D_t: Using hardware acceleration, with %d cuda streams \n", sluGPU->nCudaStreams); + fflush(stdout); + if ( MAX_SUPER_SIZE < ldt ) + { + ABORT("MAX_SUPER_SIZE smaller than requested NSUP"); + } } cudaStreamCreate(&(sluGPU->CopyStream)); @@ -1213,11 +1221,11 @@ int_t sendSCUdataHost2GPU( HyP_t* HyP ) { - //{printf("....[enter] sendSCUdataHost2GPU, bigu_send_size %d\n", bigu_send_size); fflush(stdout);} + //{printf("....[enter] sendSCUdataHost2GPU, bigu_send_size %d\n", bigu_send_size); fflush(stdout);} int_t usub_len = usub[2]; int_t lsub_len = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR; - //{printf("....[2] in sendSCUdataHost2GPU, lsub_len %d\n", lsub_len); fflush(stdout);} + //{printf("....[2] in sendSCUdataHost2GPU, lsub_len %d\n", lsub_len); fflush(stdout);} LUstruct_gpu *A_gpu = sluGPU->A_gpu; memcpy(A_gpu->scubufs[streamId].lsub_buf, lsub, sizeof(int_t)*lsub_len); memcpy(A_gpu->scubufs[streamId].usub_buf, usub, sizeof(int_t)*usub_len); @@ -1251,17 +1259,18 @@ void CopyLUToGPU3D ( int_t ldt ) { - gridinfo_t* grid = &(grid3d->grid2d); - LUstruct_gpu * A_gpu = sluGPU->A_gpu; - LUstruct_gpu **dA_gpu = &(sluGPU->dA_gpu); + gridinfo_t* grid = &(grid3d->grid2d); + LUstruct_gpu * A_gpu = sluGPU->A_gpu; + LUstruct_gpu **dA_gpu = &(sluGPU->dA_gpu); #ifdef GPU_DEBUG - if ( grid3d->iam==0 ) { - print_occupany(); - cudaDeviceProp devProp; - cudaGetDeviceProperties(&devProp, 0); - printDevProp(devProp); - } + if ( grid3d->iam == 0 ) + { + print_occupany(); + cudaDeviceProp devProp; + cudaGetDeviceProperties(&devProp, 0); + printDevProp(devProp); + } #endif int_t *xsup ; xsup = Glu_persist->xsup; @@ -1294,55 +1303,55 @@ void CopyLUToGPU3D ( int_t nCudaStreams = sluGPU->nCudaStreams; /*pinned memory allocations. 
-      Paged-locked memory by cudaMallocHost is accessible to the device.*/
+       Page-locked memory by cudaMallocHost is accessible to the device.*/
 
     for (int_t streamId = 0; streamId < nCudaStreams; streamId++ )
     {
-	void *tmp_ptr;
-	checkCudaErrors(cudaMallocHost( &tmp_ptr, (n) * sizeof(int_t) )) ;
-	A_gpu->scubufs[streamId].usub_IndirectJ3_host = (int_t*) tmp_ptr;
-
-	checkCudaErrors(cudaMalloc( &tmp_ptr, ( n) * sizeof(int_t) ));
-	A_gpu->scubufs[streamId].usub_IndirectJ3 = (int_t*) tmp_ptr;
-	gpu_mem_used += ( n) * sizeof(int_t);
-	checkCudaErrors(cudaMallocHost( &tmp_ptr, mrb * sizeof(Remain_info_t) )) ;
-	A_gpu->scubufs[streamId].Remain_info_host = (Remain_info_t*)tmp_ptr;
-	checkCudaErrors(cudaMallocHost( &tmp_ptr, mcb * sizeof(Ublock_info_t) )) ;
-	A_gpu->scubufs[streamId].Ublock_info_host = (Ublock_info_t*)tmp_ptr;
-	checkCudaErrors(cudaMallocHost( &tmp_ptr, remain_l_max * sizeof(double) )) ;
-	A_gpu->scubufs[streamId].Remain_L_buff_host = (double *) tmp_ptr;
-	checkCudaErrors(cudaMallocHost( &tmp_ptr, bigu_size * sizeof(double) )) ;
-	A_gpu->scubufs[streamId].bigU_host = (double *) tmp_ptr;
-
-	cudaMallocHost ( &tmp_ptr, sizeof(double) * (A_host->bufmax[1]));
-	A_gpu->acc_L_buff = (double *) tmp_ptr;
-	cudaMallocHost ( &tmp_ptr, sizeof(double) * (A_host->bufmax[3]));
-	A_gpu->acc_U_buff = (double *) tmp_ptr;
-	cudaMallocHost ( &tmp_ptr, sizeof(int_t) * (A_host->bufmax[0]));
-	A_gpu->scubufs[streamId].lsub_buf = (int_t *) tmp_ptr;
-	cudaMallocHost ( &tmp_ptr, sizeof(int_t) * (A_host->bufmax[2]));
-	A_gpu->scubufs[streamId].usub_buf = (int_t *) tmp_ptr;
-
-	checkCudaErrors(cudaMalloc( &tmp_ptr, remain_l_max * sizeof(double) )) ;
-	A_gpu->scubufs[streamId].Remain_L_buff = (double *) tmp_ptr;
-	gpu_mem_used += remain_l_max * sizeof(double);
-	checkCudaErrors(cudaMalloc( &tmp_ptr, bigu_size * sizeof(double) )) ;
-	A_gpu->scubufs[streamId].bigU = (double *) tmp_ptr;
-	gpu_mem_used += bigu_size * sizeof(double);
-	checkCudaErrors(cudaMalloc( &tmp_ptr, mcb * sizeof(Ublock_info_t) )) ;
-	A_gpu->scubufs[streamId].Ublock_info = (Ublock_info_t *) tmp_ptr;
-	gpu_mem_used += mcb * sizeof(Ublock_info_t);
-	checkCudaErrors(cudaMalloc( &tmp_ptr, mrb * sizeof(Remain_info_t) )) ;
-	A_gpu->scubufs[streamId].Remain_info = (Remain_info_t *) tmp_ptr;
-	gpu_mem_used += mrb * sizeof(Remain_info_t);
-	checkCudaErrors(cudaMalloc( &tmp_ptr, buffer_size * sizeof(double))) ;
-	A_gpu->scubufs[streamId].bigV = (double *) tmp_ptr;
-	gpu_mem_used += buffer_size * sizeof(double);
-	checkCudaErrors(cudaMalloc( &tmp_ptr, A_host->bufmax[0]*sizeof(int_t))) ;
-	A_gpu->scubufs[streamId].lsub = (int_t *) tmp_ptr;
-	gpu_mem_used += A_host->bufmax[0] * sizeof(int_t);
-	checkCudaErrors(cudaMalloc( &tmp_ptr, A_host->bufmax[2]*sizeof(int_t))) ;
-	A_gpu->scubufs[streamId].usub = (int_t *) tmp_ptr;
-	gpu_mem_used += A_host->bufmax[2] * sizeof(int_t);
+        void *tmp_ptr;
+        checkCudaErrors(cudaMallocHost( &tmp_ptr, (n) * sizeof(int_t) )) ;
+        A_gpu->scubufs[streamId].usub_IndirectJ3_host = (int_t*) tmp_ptr;
+
+        checkCudaErrors(cudaMalloc( &tmp_ptr, ( n) * sizeof(int_t) ));
+        A_gpu->scubufs[streamId].usub_IndirectJ3 = (int_t*) tmp_ptr;
+        gpu_mem_used += ( n) * sizeof(int_t);
+        checkCudaErrors(cudaMallocHost( &tmp_ptr, mrb * sizeof(Remain_info_t) )) ;
+        A_gpu->scubufs[streamId].Remain_info_host = (Remain_info_t*)tmp_ptr;
+        checkCudaErrors(cudaMallocHost( &tmp_ptr, mcb * sizeof(Ublock_info_t) )) ;
+        A_gpu->scubufs[streamId].Ublock_info_host = (Ublock_info_t*)tmp_ptr;
+        checkCudaErrors(cudaMallocHost( &tmp_ptr, remain_l_max * sizeof(double) )) ; 
+ A_gpu->scubufs[streamId].Remain_L_buff_host = (double *) tmp_ptr; + checkCudaErrors(cudaMallocHost( &tmp_ptr, bigu_size * sizeof(double) )) ; + A_gpu->scubufs[streamId].bigU_host = (double *) tmp_ptr; + + cudaMallocHost ( &tmp_ptr, sizeof(double) * (A_host->bufmax[1])); + A_gpu->acc_L_buff = (double *) tmp_ptr; + cudaMallocHost ( &tmp_ptr, sizeof(double) * (A_host->bufmax[3])); + A_gpu->acc_U_buff = (double *) tmp_ptr; + cudaMallocHost ( &tmp_ptr, sizeof(int_t) * (A_host->bufmax[0])); + A_gpu->scubufs[streamId].lsub_buf = (int_t *) tmp_ptr; + cudaMallocHost ( &tmp_ptr, sizeof(int_t) * (A_host->bufmax[2])); + A_gpu->scubufs[streamId].usub_buf = (int_t *) tmp_ptr; + + checkCudaErrors(cudaMalloc( &tmp_ptr, remain_l_max * sizeof(double) )) ; + A_gpu->scubufs[streamId].Remain_L_buff = (double *) tmp_ptr; + gpu_mem_used += remain_l_max * sizeof(double); + checkCudaErrors(cudaMalloc( &tmp_ptr, bigu_size * sizeof(double) )) ; + A_gpu->scubufs[streamId].bigU = (double *) tmp_ptr; + gpu_mem_used += bigu_size * sizeof(double); + checkCudaErrors(cudaMalloc( &tmp_ptr, mcb * sizeof(Ublock_info_t) )) ; + A_gpu->scubufs[streamId].Ublock_info = (Ublock_info_t *) tmp_ptr; + gpu_mem_used += mcb * sizeof(Ublock_info_t); + checkCudaErrors(cudaMalloc( &tmp_ptr, mrb * sizeof(Remain_info_t) )) ; + A_gpu->scubufs[streamId].Remain_info = (Remain_info_t *) tmp_ptr; + gpu_mem_used += mrb * sizeof(Remain_info_t); + checkCudaErrors(cudaMalloc( &tmp_ptr, buffer_size * sizeof(double))) ; + A_gpu->scubufs[streamId].bigV = (double *) tmp_ptr; + gpu_mem_used += buffer_size * sizeof(double); + checkCudaErrors(cudaMalloc( &tmp_ptr, A_host->bufmax[0]*sizeof(int_t))) ; + A_gpu->scubufs[streamId].lsub = (int_t *) tmp_ptr; + gpu_mem_used += A_host->bufmax[0] * sizeof(int_t); + checkCudaErrors(cudaMalloc( &tmp_ptr, A_host->bufmax[2]*sizeof(int_t))) ; + A_gpu->scubufs[streamId].usub = (int_t *) tmp_ptr; + gpu_mem_used += A_host->bufmax[2] * sizeof(int_t); } /* endfor streamID ... 
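
Each scubufs buffer above is allocated twice, once in page-locked (pinned) host memory with cudaMallocHost and once on the device with cudaMalloc, because cudaMemcpyAsync only overlaps transfers with computation when the host side is pinned. A minimal sketch of that pairing for one hypothetical buffer:

    #include <cuda_runtime.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define CHECK(call) do {                                          \
        cudaError_t e_ = (call);                                      \
        if (e_ != cudaSuccess) {                                      \
            fprintf(stderr, "CUDA: %s\n", cudaGetErrorString(e_));    \
            exit(1);                                                  \
        }                                                             \
    } while (0)

    int main(void)
    {
        const size_t n = 1 << 20;
        double *h_buf, *d_buf;
        cudaStream_t stream;

        CHECK(cudaStreamCreate(&stream));
        CHECK(cudaMallocHost((void **)&h_buf, n * sizeof(double))); /* pinned */
        CHECK(cudaMalloc((void **)&d_buf, n * sizeof(double)));
        memset(h_buf, 0, n * sizeof(double));

        /* truly asynchronous only because h_buf is page-locked */
        CHECK(cudaMemcpyAsync(d_buf, h_buf, n * sizeof(double),
                              cudaMemcpyHostToDevice, stream));
        CHECK(cudaStreamSynchronize(stream));

        CHECK(cudaFree(d_buf));
        CHECK(cudaFreeHost(h_buf)); /* pinned memory needs cudaFreeHost */
        CHECK(cudaStreamDestroy(stream));
        return 0;
    }
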
allocate paged-locked memory */ @@ -1356,14 +1365,14 @@ void CopyLUToGPU3D ( for (int_t i = 0; i < nsupers; ++i) { - A_gpu->isOffloaded[i] = 0; - checkCudaErrors(cudaEventCreate(&(A_gpu->GemmStart[i]))); - checkCudaErrors(cudaEventCreate(&(A_gpu->GemmEnd[i]))); - checkCudaErrors(cudaEventCreate(&(A_gpu->ScatterEnd[i]))); - checkCudaErrors(cudaEventCreate(&(A_gpu->ePCIeH2D[i]))); - checkCudaErrors(cudaEventCreate(&(A_gpu->ePCIeH2D[i]))); - checkCudaErrors(cudaEventCreate(&(A_gpu->ePCIeD2H_Start[i]))); - checkCudaErrors(cudaEventCreate(&(A_gpu->ePCIeD2H_End[i]))); + A_gpu->isOffloaded[i] = 0; + checkCudaErrors(cudaEventCreate(&(A_gpu->GemmStart[i]))); + checkCudaErrors(cudaEventCreate(&(A_gpu->GemmEnd[i]))); + checkCudaErrors(cudaEventCreate(&(A_gpu->ScatterEnd[i]))); + checkCudaErrors(cudaEventCreate(&(A_gpu->ePCIeH2D[i]))); + checkCudaErrors(cudaEventCreate(&(A_gpu->ePCIeH2D[i]))); + checkCudaErrors(cudaEventCreate(&(A_gpu->ePCIeD2H_Start[i]))); + checkCudaErrors(cudaEventCreate(&(A_gpu->ePCIeD2H_End[i]))); } /*---- Copy L data structure to GPU ----*/ @@ -1376,14 +1385,15 @@ void CopyLUToGPU3D ( /* First pass: count total L blocks */ int_t cum_num_l_blocks = 0; /* total number of L blocks I own */ for (int_t i = 0; i < CEILING(nsupers, Pc); ++i) - { /* going through each block column I own */ - - if (A_host->Lrowind_bc_ptr[i] != NULL && isNodeInMyGrid[i * Pc + mycol] == 1) - { - int_t *index = A_host->Lrowind_bc_ptr[i]; - int_t num_l_blocks = index[0]; - cum_num_l_blocks += num_l_blocks; - } + { + /* going through each block column I own */ + + if (A_host->Lrowind_bc_ptr[i] != NULL && isNodeInMyGrid[i * Pc + mycol] == 1) + { + int_t *index = A_host->Lrowind_bc_ptr[i]; + int_t num_l_blocks = index[0]; + cum_num_l_blocks += num_l_blocks; + } } /*allocating memory*/ @@ -1395,35 +1405,35 @@ void CopyLUToGPU3D ( /*initialzing vectors */ for (int_t i = 0; i < CEILING(nsupers, Pc); ++i) { - if (A_host->Lrowind_bc_ptr[i] != NULL && isNodeInMyGrid[i * Pc + mycol] == 1) - { - int_t *index = A_host->Lrowind_bc_ptr[i]; - int_t num_l_blocks = index[0]; /* # L blocks in this column */ - - if (num_l_blocks > 0) + if (A_host->Lrowind_bc_ptr[i] != NULL && isNodeInMyGrid[i * Pc + mycol] == 1) { + int_t *index = A_host->Lrowind_bc_ptr[i]; + int_t num_l_blocks = index[0]; /* # L blocks in this column */ + + if (num_l_blocks > 0) + { + + local_l_blk_info_t *local_l_blk_info_i = local_l_blk_infoVec + cum_num_l_blocks; + local_l_blk_infoPtr[i] = cum_num_l_blocks; - local_l_blk_info_t *local_l_blk_info_i = local_l_blk_infoVec + cum_num_l_blocks; - local_l_blk_infoPtr[i] = cum_num_l_blocks; + int_t lptrj = BC_HEADER; + int_t luptrj = 0; + + for (int_t j = 0; j < num_l_blocks ; ++j) + { - int_t lptrj = BC_HEADER; - int_t luptrj = 0; - - for (int_t j = 0; j < num_l_blocks ; ++j) - { + int_t ijb = index[lptrj]; - int_t ijb = index[lptrj]; + local_l_blk_info_i[j].lib = ijb / Pr; + local_l_blk_info_i[j].lptrj = lptrj; + local_l_blk_info_i[j].luptrj = luptrj; + luptrj += index[lptrj + 1]; + lptrj += LB_DESCRIPTOR + index[lptrj + 1]; - local_l_blk_info_i[j].lib = ijb / Pr; - local_l_blk_info_i[j].lptrj = lptrj; - local_l_blk_info_i[j].luptrj = luptrj; - luptrj += index[lptrj + 1]; - lptrj += LB_DESCRIPTOR + index[lptrj + 1]; - - } + } + } + cum_num_l_blocks += num_l_blocks; } - cum_num_l_blocks += num_l_blocks; - } } /* endfor all block columns */ @@ -1452,13 +1462,13 @@ void CopyLUToGPU3D ( for (int_t i = 0; i < CEILING(nsupers, Pr); ++i) { - if (A_host->Ufstnz_br_ptr[i] != NULL && isNodeInMyGrid[i * Pr + myrow] == 1) - 
{ - int_t *index = A_host->Ufstnz_br_ptr[i]; - int_t num_u_blocks = index[0]; - cum_num_u_blocks += num_u_blocks; + if (A_host->Ufstnz_br_ptr[i] != NULL && isNodeInMyGrid[i * Pr + myrow] == 1) + { + int_t *index = A_host->Ufstnz_br_ptr[i]; + int_t num_u_blocks = index[0]; + cum_num_u_blocks += num_u_blocks; - } + } } @@ -1469,35 +1479,35 @@ void CopyLUToGPU3D ( for (int_t i = 0; i < CEILING(nsupers, Pr); ++i) { - if (A_host->Ufstnz_br_ptr[i] != NULL && isNodeInMyGrid[i * Pr + myrow] == 1) - { - int_t *index = A_host->Ufstnz_br_ptr[i]; - int_t num_u_blocks = index[0]; - - if (num_u_blocks > 0) + if (A_host->Ufstnz_br_ptr[i] != NULL && isNodeInMyGrid[i * Pr + myrow] == 1) { - local_u_blk_info_t *local_u_blk_info_i = local_u_blk_infoVec + cum_num_u_blocks; - local_u_blk_infoPtr[i] = cum_num_u_blocks; + int_t *index = A_host->Ufstnz_br_ptr[i]; + int_t num_u_blocks = index[0]; + + if (num_u_blocks > 0) + { + local_u_blk_info_t *local_u_blk_info_i = local_u_blk_infoVec + cum_num_u_blocks; + local_u_blk_infoPtr[i] = cum_num_u_blocks; - int_t iuip_lib, ruip_lib; - iuip_lib = BR_HEADER; - ruip_lib = 0; + int_t iuip_lib, ruip_lib; + iuip_lib = BR_HEADER; + ruip_lib = 0; - for (int_t j = 0; j < num_u_blocks ; ++j) - { + for (int_t j = 0; j < num_u_blocks ; ++j) + { - int_t ijb = index[iuip_lib]; - local_u_blk_info_i[j].ljb = ijb / Pc; - local_u_blk_info_i[j].iuip = iuip_lib; - local_u_blk_info_i[j].ruip = ruip_lib; + int_t ijb = index[iuip_lib]; + local_u_blk_info_i[j].ljb = ijb / Pc; + local_u_blk_info_i[j].iuip = iuip_lib; + local_u_blk_info_i[j].ruip = ruip_lib; - ruip_lib += index[iuip_lib + 1]; - iuip_lib += UB_DESCRIPTOR + SuperSize (ijb); + ruip_lib += index[iuip_lib + 1]; + iuip_lib += UB_DESCRIPTOR + SuperSize (ijb); - } + } + } + cum_num_u_blocks += num_u_blocks; } - cum_num_u_blocks += num_u_blocks; - } } @@ -1521,35 +1531,35 @@ void CopyLUToGPU3D ( int_t l_val_len = 0; for (int_t jb = 0; jb < nsupers; ++jb) /* for each block column ... */ { - int_t pc = PCOL( jb, grid ); - if (mycol == pc && isNodeInMyGrid[jb] == 1) - { - int_t ljb = LBj( jb, grid ); /* Local block number */ - int_t *index_host; - index_host = A_host->Lrowind_bc_ptr[ljb]; - - temp_LrowindPtr[ljb] = l_ind_len; - temp_LnzvalPtr[ljb] = l_val_len; // ### - Lnzval_size[ljb] = 0; //### - if (index_host != NULL) + int_t pc = PCOL( jb, grid ); + if (mycol == pc && isNodeInMyGrid[jb] == 1) { - int_t nrbl = index_host[0]; /* number of L blocks */ - int_t len = index_host[1]; /* LDA of the nzval[] */ - int_t len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; + int_t ljb = LBj( jb, grid ); /* Local block number */ + int_t *index_host; + index_host = A_host->Lrowind_bc_ptr[ljb]; + + temp_LrowindPtr[ljb] = l_ind_len; + temp_LnzvalPtr[ljb] = l_val_len; // ### + Lnzval_size[ljb] = 0; //### + if (index_host != NULL) + { + int_t nrbl = index_host[0]; /* number of L blocks */ + int_t len = index_host[1]; /* LDA of the nzval[] */ + int_t len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; - /* Global block number is mycol + ljb*Pc */ - int_t nsupc = SuperSize(jb); + /* Global block number is mycol + ljb*Pc */ + int_t nsupc = SuperSize(jb); + + l_ind_len += len1; + l_val_len += len * nsupc; + Lnzval_size[ljb] = len * nsupc ; // ### + } + else + { + Lnzval_size[ljb] = 0 ; // ### + } - l_ind_len += len1; - l_val_len += len * nsupc; - Lnzval_size[ljb] = len * nsupc ; // ### - } - else - { - Lnzval_size[ljb] = 0 ; // ### } - - } } /* endfor jb = 0 ... 
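
The lengths accumulated above follow the packed layout of an Lrowind_bc_ptr[ljb] column: index_host[0] is the number of row blocks nrbl, index_host[1] is the LDA of the value array, the subscripts occupy len + BC_HEADER + nrbl * LB_DESCRIPTOR entries, and the values occupy len * nsupc doubles. A toy version of that arithmetic; the descriptor sizes here are illustrative stand-ins, not necessarily the library's constants:

    #include <stdio.h>

    #define BC_HEADER     2 /* illustrative header size      */
    #define LB_DESCRIPTOR 2 /* illustrative per-block fields */

    int main(void)
    {
        int nrbl  = 3;  /* index_host[0]: # of L row blocks in this column   */
        int len   = 40; /* index_host[1]: LDA, total rows in the nzval panel */
        int nsupc = 8;  /* supernode width (number of columns)               */

        int  ind_len = len + BC_HEADER + nrbl * LB_DESCRIPTOR; /* subscripts */
        long val_len = (long)len * nsupc;                      /* nzval      */

        printf("index entries: %d, nzval entries: %ld\n", ind_len, val_len);
        return 0;
    }
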
*/
 
    /* Copy the actual U indices and values */
@@ -1561,24 +1571,24 @@ void CopyLUToGPU3D (
    int_t u_val_len = 0;
    for ( int_t lb = 0; lb < u_k; ++lb)
    {
-	int_t *index_host;
-	index_host = A_host->Ufstnz_br_ptr[lb];
-	temp_UrowindPtr[lb] = u_ind_len;
-	temp_UnzvalPtr[lb] = u_val_len;
-	Unzval_size[lb] = 0;
-	if (index_host != NULL && isNodeInMyGrid[lb * Pr + myrow] == 1)
-	{
-	    int_t len = index_host[1];
-	    int_t len1 = index_host[2];
-
-	    u_ind_len += len1;
-	    u_val_len += len;
-	    Unzval_size[lb] = len;
-	}
-	else
-	{
+        int_t *index_host;
+        index_host = A_host->Ufstnz_br_ptr[lb];
+        temp_UrowindPtr[lb] = u_ind_len;
+        temp_UnzvalPtr[lb] = u_val_len;
        Unzval_size[lb] = 0;
-	}
+        if (index_host != NULL && isNodeInMyGrid[lb * Pr + myrow] == 1)
+        {
+            int_t len = index_host[1];
+            int_t len1 = index_host[2];
+
+            u_ind_len += len1;
+            u_val_len += len;
+            Unzval_size[lb] = len;
+        }
+        else
+        {
+            Unzval_size[lb] = 0;
+        }
    }
 
    gpu_mem_used += l_ind_len * sizeof(int_t);
@@ -1590,12 +1600,12 @@ void CopyLUToGPU3D (
 
    for (int_t i = 0; i < l_k; ++i)
    {
-	temp_LnzvalPtr[i] = -1;
+        temp_LnzvalPtr[i] = -1;
    }
 
    for (int_t i = 0; i < u_k; ++i)
    {
-	temp_UnzvalPtr[i] = -1;
+        temp_UnzvalPtr[i] = -1;
    }
 
    /*setting these pointers back */
@@ -1609,63 +1619,63 @@ void CopyLUToGPU3D (
    /* Find the trailing matrix size that can fit into GPU memory */
    for (int_t i = nsupers - 1; i > -1; --i)
    {
-	/* ulte se chalte hai eleimination tree */
-	/* bottom up ordering */
-	int_t i_sup = A_gpu->perm_c_supno[i];
-
-	int_t pc = PCOL( i_sup, grid );
-	if (isNodeInMyGrid[i_sup] == 1)
-	{
-	    if (mycol == pc )
-	    {
-		int_t ljb = LBj(i_sup, grid);
-		mem_l_block = sizeof(double) * Lnzval_size[ljb];
-		if (gpu_mem_used + mem_l_block > max_gpu_memory)
-		{
-		    break;
-		}
-		else
-		{
-		    gpu_mem_used += mem_l_block;
-		    temp_LnzvalPtr[ljb] = l_val_len;
-		    l_val_len += Lnzval_size[ljb];
-		    num_gpu_l_blocks++;
-		    A_gpu->first_l_block_gpu = i;
-		}
-	    }
+        /* walk the elimination tree in reverse */
+        /* bottom-up ordering */
+        int_t i_sup = A_gpu->perm_c_supno[i];
 
-	int_t pr = PROW( i_sup, grid );
-	if (myrow == pr)
+        int_t pc = PCOL( i_sup, grid );
+        if (isNodeInMyGrid[i_sup] == 1)
        {
-	    int_t lib = LBi(i_sup, grid);
-	    mem_u_block = sizeof(double) * Unzval_size[lib];
-	    if (gpu_mem_used + mem_u_block > max_gpu_memory)
-	    {
-		break;
-	    }
-	    else
-	    {
-		gpu_mem_used += mem_u_block;
-		temp_UnzvalPtr[lib] = u_val_len;
-		u_val_len += Unzval_size[lib];
-		num_gpu_u_blocks++;
-		A_gpu->first_u_block_gpu = i;
-	    }
+            if (mycol == pc )
+            {
+                int_t ljb = LBj(i_sup, grid);
+                mem_l_block = sizeof(double) * Lnzval_size[ljb];
+                if (gpu_mem_used + mem_l_block > max_gpu_memory)
+                {
+                    break;
+                }
+                else
+                {
+                    gpu_mem_used += mem_l_block;
+                    temp_LnzvalPtr[ljb] = l_val_len;
+                    l_val_len += Lnzval_size[ljb];
+                    num_gpu_l_blocks++;
+                    A_gpu->first_l_block_gpu = i;
+                }
+            }
 
-	}
-	} /* endif */
+            int_t pr = PROW( i_sup, grid );
+            if (myrow == pr)
+            {
+                int_t lib = LBi(i_sup, grid);
+                mem_u_block = sizeof(double) * Unzval_size[lib];
+                if (gpu_mem_used + mem_u_block > max_gpu_memory)
+                {
+                    break;
+                }
+                else
+                {
+                    gpu_mem_used += mem_u_block;
+                    temp_UnzvalPtr[lib] = u_val_len;
+                    u_val_len += Unzval_size[lib];
+                    num_gpu_u_blocks++;
+                    A_gpu->first_u_block_gpu = i;
+                }
+
+            }
+        } /* endif */
    } /* endfor i .... 
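
The loop just above is a greedy capacity fit: supernodes are visited in reverse elimination order, and their L and U panels are assigned to the device until the running total would cross max_gpu_memory; first_l_block_gpu and first_u_block_gpu then mark where the GPU-resident trailing matrix begins. The same policy in miniature, over a hypothetical per-block cost array:

    #include <stdio.h>

    int main(void)
    {
        long cost[] = { 50, 30, 70, 20, 40, 60 }; /* per-block bytes, say */
        int  n = 6;
        long budget = 150, used = 0;
        int  first_on_gpu = n; /* nothing resident yet */

        /* walk bottom-up (trailing matrix first) and fill the budget */
        for (int i = n - 1; i >= 0; --i)
        {
            if (used + cost[i] > budget)
                break;            /* next block does not fit: stop */
            used += cost[i];
            first_on_gpu = i;     /* blocks i .. n-1 now fit       */
        }
        printf("blocks %d..%d on GPU, %ld of %ld used\n",
               first_on_gpu, n - 1, used, budget);
        return 0;
    }
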
nsupers */ #if (PRNTlevel>=1) - printf("(%d) Number of L blocks in GPU %d, U blocks %d\n", + printf("(%d) Number of L blocks in GPU %d, U blocks %d\n", grid3d->iam, num_gpu_l_blocks, num_gpu_u_blocks ); - printf("(%d) elimination order of first block in GPU: L block %d, U block %d\n", + printf("(%d) elimination order of first block in GPU: L block %d, U block %d\n", grid3d->iam, A_gpu->first_l_block_gpu, A_gpu->first_u_block_gpu); printf("(%d) Memory of L %.1f GB, memory for U %.1f GB, Total device memory used %.1f GB, Memory allowed %.1f GB \n", grid3d->iam, - l_val_len * sizeof(double)*1e-9, - u_val_len * sizeof(double)*1e-9, - gpu_mem_used*1e-9, max_gpu_memory*1e-9); + l_val_len * sizeof(double) * 1e-9, + u_val_len * sizeof(double) * 1e-9, + gpu_mem_used * 1e-9, max_gpu_memory * 1e-9); fflush(stdout); #endif @@ -1673,22 +1683,22 @@ void CopyLUToGPU3D ( int_t *indtemp = (int_t *) malloc(sizeof(int_t) * l_ind_len); for (int_t jb = 0; jb < nsupers; ++jb) /* for each block column ... */ { - int_t pc = PCOL( jb, grid ); - if (mycol == pc && isNodeInMyGrid[jb] == 1) - { - int_t ljb = LBj( jb, grid ); /* Local block number */ - int_t *index_host; - index_host = A_host->Lrowind_bc_ptr[ljb]; - - if (index_host != NULL) + int_t pc = PCOL( jb, grid ); + if (mycol == pc && isNodeInMyGrid[jb] == 1) { - int_t nrbl = index_host[0]; /* number of L blocks */ - int_t len = index_host[1]; /* LDA of the nzval[] */ - int_t len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; + int_t ljb = LBj( jb, grid ); /* Local block number */ + int_t *index_host; + index_host = A_host->Lrowind_bc_ptr[ljb]; - memcpy(&indtemp[temp_LrowindPtr[ljb]] , index_host, len1 * sizeof(int_t)) ; + if (index_host != NULL) + { + int_t nrbl = index_host[0]; /* number of L blocks */ + int_t len = index_host[1]; /* LDA of the nzval[] */ + int_t len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; + + memcpy(&indtemp[temp_LrowindPtr[ljb]] , index_host, len1 * sizeof(int_t)) ; + } } - } } checkCudaErrors(cudaMalloc( &tmp_ptr, l_ind_len * sizeof(int_t))) ; @@ -1840,7 +1850,7 @@ int_t reduceAllAncestors3d_GPU(int_t ilvl, int_t* myNodeCount, } /*if (myGrid == sender)*/ dreduceAllAncestors3d(ilvl, myNodeCount, treePerm, - LUvsb, LUstruct, grid3d, SCT ); + LUvsb, LUstruct, grid3d, SCT ); return 0; } diff --git a/SRC/treeFactorizationGPU.c b/SRC/treeFactorizationGPU.c index 33bdd56d..24b0639a 100644 --- a/SRC/treeFactorizationGPU.c +++ b/SRC/treeFactorizationGPU.c @@ -324,12 +324,27 @@ int_t sparseTreeFactor_ASYNC_GPU( #endif int_t LU_nonempty = dSchurComplementSetupGPU(k, - msgss[offset], packLUInfo, - myIperm, gIperm_c_supno, perm_c_supno, - gEtreeInfo, fNlists, scuBufs, - LUvsbs[offset], grid, LUstruct, HyP); + msgss[offset], packLUInfo, + myIperm, gIperm_c_supno, perm_c_supno, + gEtreeInfo, fNlists, scuBufs, + LUvsbs[offset], grid, LUstruct, HyP); // initializing D2H data transfer. D2H = Device To Host. int_t jj_cpu; /* limit between CPU and GPU */ + +#if 1 + if (superlu_acc_offload) + { + jj_cpu = HyP->num_u_blks_Phi; // -1 ?? 
+ HyP->offloadCondition = 1; + } + else + { + /* code */ + HyP->offloadCondition = 0; + jj_cpu = 0; + } + +#else if (superlu_acc_offload) { jj_cpu = getAccUPartition(HyP); @@ -345,6 +360,7 @@ int_t sparseTreeFactor_ASYNC_GPU( { jj_cpu = 0; } +#endif // int_t jj_cpu = HyP->num_u_blks_Phi-1; // if (HyP->Rnbrow > 0 && jj_cpu>=0) @@ -391,7 +407,7 @@ int_t sparseTreeFactor_ASYNC_GPU( int_t j = ij / HyP->lookAheadBlk; int_t lb = ij % HyP->lookAheadBlk; dblock_gemm_scatterTopLeft(lb, j, bigV, knsupc, klst, lsub, - usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat); + usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat); } #pragma omp for @@ -400,7 +416,7 @@ int_t sparseTreeFactor_ASYNC_GPU( int_t j = ij / HyP->lookAheadBlk; int_t lb = ij % HyP->lookAheadBlk; dblock_gemm_scatterTopRight(lb, j, bigV, knsupc, klst, lsub, - usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat); + usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat); } #pragma omp for @@ -409,7 +425,7 @@ int_t sparseTreeFactor_ASYNC_GPU( int_t j = ij / HyP->RemainBlk; int_t lb = ij % HyP->RemainBlk; dblock_gemm_scatterBottomLeft(lb, j, bigV, knsupc, klst, lsub, - usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat); + usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat); } /* for int_t ij = ... */ } /* end parallel region ... end look-ahead update */ @@ -540,7 +556,7 @@ int_t sparseTreeFactor_ASYNC_GPU( int_t j = ij / HyP->RemainBlk + jj_cpu; int_t lb = ij % HyP->RemainBlk; dblock_gemm_scatterBottomRight(lb, j, bigV, knsupc, klst, lsub, - usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat); + usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat); } /* for int_t ij = ... */ } /* end omp parallel region */ diff --git a/SRC/util.c b/SRC/util.c index 76abe96f..2d4ba451 100644 --- a/SRC/util.c +++ b/SRC/util.c @@ -161,11 +161,12 @@ Destroy_LU(int_t n, gridinfo_t *grid, LUstruct_t *LUstruct) for (i = 0; i < nb; ++i) if ( Llu->Lrowind_bc_ptr[i] ) { SUPERLU_FREE (Llu->Lrowind_bc_ptr[i]); -#ifdef GPU_ACC - checkCuda(cudaFreeHost(Llu->Lnzval_bc_ptr[i])); -#else - SUPERLU_FREE (Llu->Lnzval_bc_ptr[i]); -#endif + SUPERLU_FREE (Llu->Lnzval_bc_ptr[i]); +// #ifdef GPU_ACC +// checkCuda(cudaFreeHost(Llu->Lnzval_bc_ptr[i])); +// #else +// SUPERLU_FREE (Llu->Lnzval_bc_ptr[i]); +// #endif } SUPERLU_FREE (Llu->Lrowind_bc_ptr); SUPERLU_FREE (Llu->Lnzval_bc_ptr); From 39fbf89bc501b9444d0e3c09e3f52271a4861464 Mon Sep 17 00:00:00 2001 From: piyush sao Date: Fri, 5 Feb 2021 12:27:55 -0500 Subject: [PATCH 062/147] minor edits --- SRC/util.c | 1721 ++++++++++++++++++++++++++++------------------------ 1 file changed, 914 insertions(+), 807 deletions(-) diff --git a/SRC/util.c b/SRC/util.c index 2d4ba451..4bb1b224 100644 --- a/SRC/util.c +++ b/SRC/util.c @@ -26,159 +26,151 @@ at the top-level directory. #include "superlu_ddefs.h" /*! \brief Deallocate the structure pointing to the actual storage of the matrix. 
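
The dblock_gemm_scatterTopLeft / TopRight / BottomLeft loops in the hunk above flatten a (block row, block column) pair into one index ij so that a single #pragma omp for can balance the whole two-dimensional tile set; the pair is recovered with one divide and one modulo. A minimal sketch of the decomposition:

    #include <stdio.h>

    int main(void)
    {
        int nblk = 3, ncols = 4; /* e.g. HyP->lookAheadBlk tiles x U blocks */

        /* one flat loop is easier for OpenMP to schedule than two nested
           ones; output order is nondeterministic when built with -fopenmp */
        #pragma omp parallel for
        for (int ij = 0; ij < nblk * ncols; ++ij)
        {
            int j  = ij / nblk; /* block column                   */
            int lb = ij % nblk; /* block row within the panel set */
            printf("tile (lb=%d, j=%d)\n", lb, j);
        }
        return 0;
    }
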
*/ -void -Destroy_SuperMatrix_Store_dist(SuperMatrix *A) +void Destroy_SuperMatrix_Store_dist(SuperMatrix *A) { - SUPERLU_FREE ( A->Store ); + SUPERLU_FREE(A->Store); } -void -Destroy_CompCol_Matrix_dist(SuperMatrix *A) +void Destroy_CompCol_Matrix_dist(SuperMatrix *A) { NCformat *Astore = A->Store; - SUPERLU_FREE( Astore->rowind ); - SUPERLU_FREE( Astore->colptr ); - if ( Astore->nzval ) SUPERLU_FREE( Astore->nzval ); - SUPERLU_FREE( Astore ); + SUPERLU_FREE(Astore->rowind); + SUPERLU_FREE(Astore->colptr); + if (Astore->nzval) + SUPERLU_FREE(Astore->nzval); + SUPERLU_FREE(Astore); } -void -Destroy_CompRowLoc_Matrix_dist(SuperMatrix *A) +void Destroy_CompRowLoc_Matrix_dist(SuperMatrix *A) { NRformat_loc *Astore = A->Store; - SUPERLU_FREE( Astore->rowptr ); - SUPERLU_FREE( Astore->colind ); - SUPERLU_FREE( Astore->nzval ); - SUPERLU_FREE( Astore ); + SUPERLU_FREE(Astore->rowptr); + SUPERLU_FREE(Astore->colind); + SUPERLU_FREE(Astore->nzval); + SUPERLU_FREE(Astore); } -void -Destroy_CompRow_Matrix_dist(SuperMatrix *A) +void Destroy_CompRow_Matrix_dist(SuperMatrix *A) { - SUPERLU_FREE( ((NRformat *)A->Store)->rowptr ); - SUPERLU_FREE( ((NRformat *)A->Store)->colind ); - SUPERLU_FREE( ((NRformat *)A->Store)->nzval ); - SUPERLU_FREE( A->Store ); + SUPERLU_FREE(((NRformat *)A->Store)->rowptr); + SUPERLU_FREE(((NRformat *)A->Store)->colind); + SUPERLU_FREE(((NRformat *)A->Store)->nzval); + SUPERLU_FREE(A->Store); } -void -Destroy_SuperNode_Matrix_dist(SuperMatrix *A) +void Destroy_SuperNode_Matrix_dist(SuperMatrix *A) { - SUPERLU_FREE ( ((SCformat *)A->Store)->rowind ); - SUPERLU_FREE ( ((SCformat *)A->Store)->rowind_colptr ); - SUPERLU_FREE ( ((SCformat *)A->Store)->nzval ); - SUPERLU_FREE ( ((SCformat *)A->Store)->nzval_colptr ); - SUPERLU_FREE ( ((SCformat *)A->Store)->col_to_sup ); - SUPERLU_FREE ( ((SCformat *)A->Store)->sup_to_col ); - SUPERLU_FREE ( A->Store ); + SUPERLU_FREE(((SCformat *)A->Store)->rowind); + SUPERLU_FREE(((SCformat *)A->Store)->rowind_colptr); + SUPERLU_FREE(((SCformat *)A->Store)->nzval); + SUPERLU_FREE(((SCformat *)A->Store)->nzval_colptr); + SUPERLU_FREE(((SCformat *)A->Store)->col_to_sup); + SUPERLU_FREE(((SCformat *)A->Store)->sup_to_col); + SUPERLU_FREE(A->Store); } /*! \brief A is of type Stype==NCP */ -void -Destroy_CompCol_Permuted_dist(SuperMatrix *A) +void Destroy_CompCol_Permuted_dist(SuperMatrix *A) { - SUPERLU_FREE ( ((NCPformat *)A->Store)->colbeg ); - SUPERLU_FREE ( ((NCPformat *)A->Store)->colend ); - SUPERLU_FREE ( A->Store ); + SUPERLU_FREE(((NCPformat *)A->Store)->colbeg); + SUPERLU_FREE(((NCPformat *)A->Store)->colend); + SUPERLU_FREE(A->Store); } /*! \brief A is of type Stype==DN */ -void -Destroy_Dense_Matrix_dist(SuperMatrix *A) +void Destroy_Dense_Matrix_dist(SuperMatrix *A) { - DNformat* Astore = A->Store; - SUPERLU_FREE (Astore->nzval); - SUPERLU_FREE ( A->Store ); + DNformat *Astore = A->Store; + SUPERLU_FREE(Astore->nzval); + SUPERLU_FREE(A->Store); } - - /*! \brief Destroy the binary trees associated with the panel. These are used in triangular solve. 
*/ -void -Destroy_Tree(int_t n, gridinfo_t *grid, LUstruct_t *LUstruct) +void Destroy_Tree(int_t n, gridinfo_t *grid, LUstruct_t *LUstruct) { int_t i, nb, nsupers; Glu_persist_t *Glu_persist = LUstruct->Glu_persist; LocalLU_t *Llu = LUstruct->Llu; -#if ( DEBUGlevel>=1 ) +#if (DEBUGlevel >= 1) int iam; - MPI_Comm_rank( MPI_COMM_WORLD, &iam ); + MPI_Comm_rank(MPI_COMM_WORLD, &iam); CHECK_MALLOC(iam, "Enter Destroy_Tree()"); #endif - nsupers = Glu_persist->supno[n-1] + 1; + nsupers = Glu_persist->supno[n - 1] + 1; nb = CEILING(nsupers, grid->npcol); - for (i=0;iLBtree_ptr[i]!=NULL){ - BcTree_Destroy(Llu->LBtree_ptr[i],LUstruct->dt); - } - if(Llu->UBtree_ptr[i]!=NULL){ - BcTree_Destroy(Llu->UBtree_ptr[i],LUstruct->dt); - } + for (i = 0; i < nb; ++i) + { + if (Llu->LBtree_ptr[i] != NULL) + { + BcTree_Destroy(Llu->LBtree_ptr[i], LUstruct->dt); + } + if (Llu->UBtree_ptr[i] != NULL) + { + BcTree_Destroy(Llu->UBtree_ptr[i], LUstruct->dt); + } } SUPERLU_FREE(Llu->LBtree_ptr); SUPERLU_FREE(Llu->UBtree_ptr); - + nb = CEILING(nsupers, grid->nprow); - for (i=0;iLRtree_ptr[i]!=NULL){ - RdTree_Destroy(Llu->LRtree_ptr[i],LUstruct->dt); - } - if(Llu->URtree_ptr[i]!=NULL){ - RdTree_Destroy(Llu->URtree_ptr[i],LUstruct->dt); - } + for (i = 0; i < nb; ++i) + { + if (Llu->LRtree_ptr[i] != NULL) + { + RdTree_Destroy(Llu->LRtree_ptr[i], LUstruct->dt); + } + if (Llu->URtree_ptr[i] != NULL) + { + RdTree_Destroy(Llu->URtree_ptr[i], LUstruct->dt); + } } SUPERLU_FREE(Llu->LRtree_ptr); SUPERLU_FREE(Llu->URtree_ptr); -#if ( DEBUGlevel>=1 ) +#if (DEBUGlevel >= 1) CHECK_MALLOC(iam, "Exit Destroy_Tree()"); #endif } - /*! \brief Destroy distributed L & U matrices. */ -void -Destroy_LU(int_t n, gridinfo_t *grid, LUstruct_t *LUstruct) +void Destroy_LU(int_t n, gridinfo_t *grid, LUstruct_t *LUstruct) { int_t i, nb, nsupers; Glu_persist_t *Glu_persist = LUstruct->Glu_persist; LocalLU_t *Llu = LUstruct->Llu; -#if ( DEBUGlevel>=1 ) +#if (DEBUGlevel >= 1) int iam; - MPI_Comm_rank( MPI_COMM_WORLD, &iam ); + MPI_Comm_rank(MPI_COMM_WORLD, &iam); CHECK_MALLOC(iam, "Enter Destroy_LU()"); #endif Destroy_Tree(n, grid, LUstruct); // from asynchronous triangular solve - nsupers = Glu_persist->supno[n-1] + 1; + nsupers = Glu_persist->supno[n - 1] + 1; nb = CEILING(nsupers, grid->npcol); - for (i = 0; i < nb; ++i) - if ( Llu->Lrowind_bc_ptr[i] ) { - SUPERLU_FREE (Llu->Lrowind_bc_ptr[i]); - SUPERLU_FREE (Llu->Lnzval_bc_ptr[i]); -// #ifdef GPU_ACC -// checkCuda(cudaFreeHost(Llu->Lnzval_bc_ptr[i])); -// #else -// SUPERLU_FREE (Llu->Lnzval_bc_ptr[i]); -// #endif - } - SUPERLU_FREE (Llu->Lrowind_bc_ptr); - SUPERLU_FREE (Llu->Lnzval_bc_ptr); + for (i = 0; i < nb; ++i) + if (Llu->Lrowind_bc_ptr[i]) + { + SUPERLU_FREE(Llu->Lrowind_bc_ptr[i]); + SUPERLU_FREE(Llu->Lnzval_bc_ptr[i]); + } + SUPERLU_FREE(Llu->Lrowind_bc_ptr); + SUPERLU_FREE(Llu->Lnzval_bc_ptr); nb = CEILING(nsupers, grid->nprow); for (i = 0; i < nb; ++i) - if ( Llu->Ufstnz_br_ptr[i] ) { - SUPERLU_FREE (Llu->Ufstnz_br_ptr[i]); - SUPERLU_FREE (Llu->Unzval_br_ptr[i]); - } - SUPERLU_FREE (Llu->Ufstnz_br_ptr); - SUPERLU_FREE (Llu->Unzval_br_ptr); + if (Llu->Ufstnz_br_ptr[i]) + { + SUPERLU_FREE(Llu->Ufstnz_br_ptr[i]); + SUPERLU_FREE(Llu->Unzval_br_ptr[i]); + } + SUPERLU_FREE(Llu->Ufstnz_br_ptr); + SUPERLU_FREE(Llu->Unzval_br_ptr); /* The following can be freed after factorization. 
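
nb = CEILING(nsupers, grid->npcol) above is plain ceiling division: the most block columns any process column can own when supernodes are dealt round-robin. A tiny sketch, assuming CEILING is the conventional (a + b - 1) / b macro and that PCOL / LBj encode the usual round-robin split, as the surrounding code suggests:

    #include <stdio.h>

    #define CEILING(a, b) (((a) + (b) - 1) / (b))

    int main(void)
    {
        int nsupers = 10, npcol = 4;

        /* each process column owns at most CEILING(nsupers, npcol) blocks */
        printf("local block columns: %d\n", CEILING(nsupers, npcol)); /* 3 */

        /* global block jb -> owner column jb % npcol, local slot jb / npcol */
        for (int jb = 0; jb < nsupers; ++jb)
            printf("jb=%d -> pc=%d, ljb=%d\n", jb, jb % npcol, jb / npcol);
        return 0;
    }
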
*/ SUPERLU_FREE(Llu->ToRecv); @@ -197,89 +189,97 @@ Destroy_LU(int_t n, gridinfo_t *grid, LUstruct_t *LUstruct) SUPERLU_FREE(Llu->mod_bit); nb = CEILING(nsupers, grid->npcol); - for (i = 0; i < nb; ++i) - if ( Llu->Lindval_loc_bc_ptr[i]!=NULL) { - SUPERLU_FREE (Llu->Lindval_loc_bc_ptr[i]); - } + for (i = 0; i < nb; ++i) + if (Llu->Lindval_loc_bc_ptr[i] != NULL) + { + SUPERLU_FREE(Llu->Lindval_loc_bc_ptr[i]); + } SUPERLU_FREE(Llu->Lindval_loc_bc_ptr); - + nb = CEILING(nsupers, grid->npcol); - for (i=0; iLinv_bc_ptr[i]!=NULL) { - SUPERLU_FREE(Llu->Linv_bc_ptr[i]); - } - if(Llu->Uinv_bc_ptr[i]!=NULL){ - SUPERLU_FREE(Llu->Uinv_bc_ptr[i]); - } + for (i = 0; i < nb; ++i) + { + if (Llu->Linv_bc_ptr[i] != NULL) + { + SUPERLU_FREE(Llu->Linv_bc_ptr[i]); + } + if (Llu->Uinv_bc_ptr[i] != NULL) + { + SUPERLU_FREE(Llu->Uinv_bc_ptr[i]); + } } SUPERLU_FREE(Llu->Linv_bc_ptr); SUPERLU_FREE(Llu->Uinv_bc_ptr); SUPERLU_FREE(Llu->Unnz); - + nb = CEILING(nsupers, grid->npcol); for (i = 0; i < nb; ++i) - if ( Llu->Urbs[i] ) { - SUPERLU_FREE(Llu->Ucb_indptr[i]); - SUPERLU_FREE(Llu->Ucb_valptr[i]); - } + if (Llu->Urbs[i]) + { + SUPERLU_FREE(Llu->Ucb_indptr[i]); + SUPERLU_FREE(Llu->Ucb_valptr[i]); + } SUPERLU_FREE(Llu->Ucb_indptr); - SUPERLU_FREE(Llu->Ucb_valptr); + SUPERLU_FREE(Llu->Ucb_valptr); SUPERLU_FREE(Llu->Urbs); SUPERLU_FREE(Glu_persist->xsup); SUPERLU_FREE(Glu_persist->supno); -#if ( DEBUGlevel>=1 ) +#if (DEBUGlevel >= 1) CHECK_MALLOC(iam, "Exit Destroy_LU()"); #endif } -int DeAllocLlu_3d(int_t n, LUstruct_t * LUstruct, gridinfo3d_t* grid3d) +int DeAllocLlu_3d(int_t n, LUstruct_t *LUstruct, gridinfo3d_t *grid3d) { int i, nbc, nbr, nsupers; LocalLU_t *Llu = LUstruct->Llu; - nsupers = (LUstruct->Glu_persist)->supno[n-1] + 1; + nsupers = (LUstruct->Glu_persist)->supno[n - 1] + 1; nbc = CEILING(nsupers, grid3d->npcol); - for (i = 0; i < nbc; ++i) - if ( Llu->Lrowind_bc_ptr[i] ) { - SUPERLU_FREE (Llu->Lrowind_bc_ptr[i]); + for (i = 0; i < nbc; ++i) + if (Llu->Lrowind_bc_ptr[i]) + { + SUPERLU_FREE(Llu->Lrowind_bc_ptr[i]); #ifdef GPU_ACC - checkCuda(cudaFreeHost(Llu->Lnzval_bc_ptr[i])); + checkCuda(cudaFreeHost(Llu->Lnzval_bc_ptr[i])); #else - SUPERLU_FREE (Llu->Lnzval_bc_ptr[i]); + SUPERLU_FREE(Llu->Lnzval_bc_ptr[i]); #endif - } - SUPERLU_FREE (Llu->Lrowind_bc_ptr); - SUPERLU_FREE (Llu->Lnzval_bc_ptr); + } + SUPERLU_FREE(Llu->Lrowind_bc_ptr); + SUPERLU_FREE(Llu->Lnzval_bc_ptr); nbr = CEILING(nsupers, grid3d->nprow); for (i = 0; i < nbr; ++i) - if ( Llu->Ufstnz_br_ptr[i] ) { - SUPERLU_FREE (Llu->Ufstnz_br_ptr[i]); - SUPERLU_FREE (Llu->Unzval_br_ptr[i]); - } - SUPERLU_FREE (Llu->Ufstnz_br_ptr); - SUPERLU_FREE (Llu->Unzval_br_ptr); + if (Llu->Ufstnz_br_ptr[i]) + { + SUPERLU_FREE(Llu->Ufstnz_br_ptr[i]); + SUPERLU_FREE(Llu->Unzval_br_ptr[i]); + } + SUPERLU_FREE(Llu->Ufstnz_br_ptr); + SUPERLU_FREE(Llu->Unzval_br_ptr); /* The following can be freed after factorization. */ SUPERLU_FREE(Llu->ToRecv); SUPERLU_FREE(Llu->ToSendD); - for (i = 0; i < nbc; ++i) SUPERLU_FREE(Llu->ToSendR[i]); + for (i = 0; i < nbc; ++i) + SUPERLU_FREE(Llu->ToSendR[i]); SUPERLU_FREE(Llu->ToSendR); return 0; } /*! 
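
DeAllocLlu_3d above releases Lnzval_bc_ptr[i] with cudaFreeHost when GPU_ACC is defined and with SUPERLU_FREE otherwise, because the GPU build allocates those panels as pinned host memory; a free must always match its allocator. A compact sketch of that guarded pairing, with hypothetical helper names:

    #include <stdlib.h>
    #ifdef GPU_ACC
    #include <cuda_runtime.h>
    #endif

    /* allocate a value panel with the allocator this build expects */
    static double *alloc_nzval(size_t n)
    {
    #ifdef GPU_ACC
        void *p = NULL;
        if (cudaMallocHost(&p, n * sizeof(double)) != cudaSuccess)
            return NULL;            /* pinned host memory */
        return (double *)p;
    #else
        return (double *)malloc(n * sizeof(double));
    #endif
    }

    /* ...and release it with the matching deallocator */
    static void free_nzval(double *p)
    {
    #ifdef GPU_ACC
        cudaFreeHost(p);
    #else
        free(p);
    #endif
    }

    int main(void)
    {
        double *v = alloc_nzval(1024);
        if (v) free_nzval(v);
        return 0;
    }
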
\brief Allocate storage in ScalePermstruct */ void ScalePermstructInit(const int_t m, const int_t n, - ScalePermstruct_t *ScalePermstruct) + ScalePermstruct_t *ScalePermstruct) { ScalePermstruct->DiagScale = NOEQUIL; - if ( !(ScalePermstruct->perm_r = intMalloc_dist(m)) ) - ABORT("Malloc fails for perm_r[]."); - if ( !(ScalePermstruct->perm_c = intMalloc_dist(n)) ) - ABORT("Malloc fails for perm_c[]."); + if (!(ScalePermstruct->perm_r = intMalloc_dist(m))) + ABORT("Malloc fails for perm_r[]."); + if (!(ScalePermstruct->perm_c = intMalloc_dist(n))) + ABORT("Malloc fails for perm_c[]."); } /*! \brief Deallocate ScalePermstruct */ @@ -287,41 +287,43 @@ void ScalePermstructFree(ScalePermstruct_t *ScalePermstruct) { SUPERLU_FREE(ScalePermstruct->perm_r); SUPERLU_FREE(ScalePermstruct->perm_c); - switch ( ScalePermstruct->DiagScale ) { - case ROW: - SUPERLU_FREE(ScalePermstruct->R); - break; - case COL: - SUPERLU_FREE(ScalePermstruct->C); - break; - case BOTH: - SUPERLU_FREE(ScalePermstruct->R); - SUPERLU_FREE(ScalePermstruct->C); - break; - default: break; + switch (ScalePermstruct->DiagScale) + { + case ROW: + SUPERLU_FREE(ScalePermstruct->R); + break; + case COL: + SUPERLU_FREE(ScalePermstruct->C); + break; + case BOTH: + SUPERLU_FREE(ScalePermstruct->R); + SUPERLU_FREE(ScalePermstruct->C); + break; + default: + break; } } /*! \brief Allocate storage in LUstruct */ void LUstructInit(const int_t n, LUstruct_t *LUstruct) { - if ( !(LUstruct->etree = intMalloc_dist(n)) ) - ABORT("Malloc fails for etree[]."); - if ( !(LUstruct->Glu_persist = (Glu_persist_t *) - SUPERLU_MALLOC(sizeof(Glu_persist_t))) ) - ABORT("Malloc fails for Glu_persist_t."); - if ( !(LUstruct->Llu = (LocalLU_t *) - SUPERLU_MALLOC(sizeof(LocalLU_t))) ) - ABORT("Malloc fails for LocalLU_t."); - LUstruct->Llu->inv = 0; + if (!(LUstruct->etree = intMalloc_dist(n))) + ABORT("Malloc fails for etree[]."); + if (!(LUstruct->Glu_persist = (Glu_persist_t *) + SUPERLU_MALLOC(sizeof(Glu_persist_t)))) + ABORT("Malloc fails for Glu_persist_t."); + if (!(LUstruct->Llu = (LocalLU_t *) + SUPERLU_MALLOC(sizeof(LocalLU_t)))) + ABORT("Malloc fails for LocalLU_t."); + LUstruct->Llu->inv = 0; } /*! \brief Deallocate LUstruct */ void LUstructFree(LUstruct_t *LUstruct) { -#if ( DEBUGlevel>=1 ) +#if (DEBUGlevel >= 1) int iam; - MPI_Comm_rank( MPI_COMM_WORLD, &iam ); + MPI_Comm_rank(MPI_COMM_WORLD, &iam); CHECK_MALLOC(iam, "Enter LUstructFree()"); #endif @@ -329,7 +331,7 @@ void LUstructFree(LUstruct_t *LUstruct) SUPERLU_FREE(LUstruct->Glu_persist); SUPERLU_FREE(LUstruct->Llu); -#if ( DEBUGlevel>=1 ) +#if (DEBUGlevel >= 1) CHECK_MALLOC(iam, "Exit LUstructFree()"); #endif } @@ -341,57 +343,60 @@ void LUstructFree(LUstruct_t *LUstruct) * symmetrically reduced L. *
*/ -void -countnz_dist(const int_t n, int_t *xprune, - int_t *nnzL, int_t *nnzU, - Glu_persist_t *Glu_persist, Glu_freeable_t *Glu_freeable) +void countnz_dist(const int_t n, int_t *xprune, + int_t *nnzL, int_t *nnzU, + Glu_persist_t *Glu_persist, Glu_freeable_t *Glu_freeable) { - int_t fnz, fsupc, i, j, nsuper; - int_t jlen, irep; + int_t fnz, fsupc, i, j, nsuper; + int_t jlen, irep; long long int nnzL0; - int_t *supno, *xsup, *xlsub, *xusub, *usub; - - supno = Glu_persist->supno; - xsup = Glu_persist->xsup; - xlsub = Glu_freeable->xlsub; - xusub = Glu_freeable->xusub; - usub = Glu_freeable->usub; - *nnzL = 0; - *nnzU = 0; - nnzL0 = 0; + int_t *supno, *xsup, *xlsub, *xusub, *usub; + + supno = Glu_persist->supno; + xsup = Glu_persist->xsup; + xlsub = Glu_freeable->xlsub; + xusub = Glu_freeable->xusub; + usub = Glu_freeable->usub; + *nnzL = 0; + *nnzU = 0; + nnzL0 = 0; nsuper = supno[n]; - if ( n <= 0 ) return; + if (n <= 0) + return; /* * For each supernode in L. */ - for (i = 0; i <= nsuper; i++) { - fsupc = xsup[i]; - jlen = xlsub[fsupc+1] - xlsub[fsupc]; - - for (j = fsupc; j < xsup[i+1]; j++) { - *nnzL += jlen; - *nnzU += j - fsupc + 1; - jlen--; - } - irep = xsup[i+1] - 1; - nnzL0 += xprune[irep] - xlsub[irep]; + for (i = 0; i <= nsuper; i++) + { + fsupc = xsup[i]; + jlen = xlsub[fsupc + 1] - xlsub[fsupc]; + + for (j = fsupc; j < xsup[i + 1]; j++) + { + *nnzL += jlen; + *nnzU += j - fsupc + 1; + jlen--; + } + irep = xsup[i + 1] - 1; + nnzL0 += xprune[irep] - xlsub[irep]; } - + /* printf("\tNo of nonzeros in symm-reduced L = %ld\n", nnzL0);*/ - + /* For each column in U. */ - for (j = 0; j < n; ++j) { - for (i = xusub[j]; i < xusub[j+1]; ++i) { - fnz = usub[i]; - fsupc = xsup[supno[fnz]+1]; - *nnzU += fsupc - fnz; - } + for (j = 0; j < n; ++j) + { + for (i = xusub[j]; i < xusub[j + 1]; ++i) + { + fnz = usub[i]; + fsupc = xsup[supno[fnz] + 1]; + *nnzU += fsupc - fnz; + } } } - /*! \brief * *
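
Each supernode in countnz_dist above contributes a trapezoid: across its columns the L count starts at jlen and shrinks by one per column, while the U count grows from 1 up to the supernode width. A toy check of that accumulation for a single supernode:

    #include <stdio.h>

    int main(void)
    {
        int fsupc = 0, width = 4; /* supernode spanning 4 columns         */
        int jlen = 10;            /* rows of L in the supernode's 1st col */
        long nnzL = 0, nnzU = 0;

        for (int j = fsupc; j < fsupc + width; ++j)
        {
            nnzL += jlen;          /* column j of L holds jlen entries   */
            nnzU += j - fsupc + 1; /* column j of U is dense to the diag */
            jlen--;                /* next column starts one row lower   */
        }
        /* 10 + 9 + 8 + 7 = 34 in L, 1 + 2 + 3 + 4 = 10 in U */
        printf("nnzL=%ld nnzU=%ld\n", nnzL, nnzU);
        return 0;
    }
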
@@ -401,36 +406,38 @@ countnz_dist(const int_t n, int_t *xprune,
  * 
*/ int64_t -fixupL_dist(const int_t n, const int_t *perm_r, - Glu_persist_t *Glu_persist, Glu_freeable_t *Glu_freeable) +fixupL_dist(const int_t n, const int_t *perm_r, + Glu_persist_t *Glu_persist, Glu_freeable_t *Glu_freeable) { register int_t nsuper, fsupc, nextl, i, j, k, jstrt; register long long int lsub_size; - int_t *xsup, *lsub, *xlsub; + int_t *xsup, *lsub, *xlsub; - if ( n <= 1 ) return 0; + if (n <= 1) + return 0; - xsup = Glu_persist->xsup; - lsub = Glu_freeable->lsub; - xlsub = Glu_freeable->xlsub; - nextl = 0; + xsup = Glu_persist->xsup; + lsub = Glu_freeable->lsub; + xlsub = Glu_freeable->xlsub; + nextl = 0; nsuper = (Glu_persist->supno)[n]; lsub_size = xlsub[n]; - + /* * For each supernode ... */ - for (i = 0; i <= nsuper; i++) { - fsupc = xsup[i]; - jstrt = xlsub[fsupc]; - xlsub[fsupc] = nextl; - for (j = jstrt; j < xlsub[fsupc+1]; j++) { - lsub[nextl] = perm_r[lsub[j]]; /* Now indexed into P*A */ - nextl++; - } - for (k = fsupc+1; k < xsup[i+1]; k++) - xlsub[k] = nextl; /* Other columns in supernode i */ - + for (i = 0; i <= nsuper; i++) + { + fsupc = xsup[i]; + jstrt = xlsub[fsupc]; + xlsub[fsupc] = nextl; + for (j = jstrt; j < xlsub[fsupc + 1]; j++) + { + lsub[nextl] = perm_r[lsub[j]]; /* Now indexed into P*A */ + nextl++; + } + for (k = fsupc + 1; k < xsup[i + 1]; k++) + xlsub[k] = nextl; /* Other columns in supernode i */ } xlsub[n] = nextl; @@ -441,28 +448,28 @@ fixupL_dist(const int_t n, const int_t *perm_r, */ void set_default_options_dist(superlu_dist_options_t *options) { - options->Fact = DOFACT; - options->Equil = YES; - options->ParSymbFact = NO; + options->Fact = DOFACT; + options->Equil = YES; + options->ParSymbFact = NO; #ifdef HAVE_PARMETIS - options->ColPerm = METIS_AT_PLUS_A; + options->ColPerm = METIS_AT_PLUS_A; #else - options->ColPerm = MMD_AT_PLUS_A; + options->ColPerm = MMD_AT_PLUS_A; #endif - options->RowPerm = LargeDiag_MC64; - options->ReplaceTinyPivot = NO; - options->IterRefine = SLU_DOUBLE; - options->Trans = NOTRANS; - options->SolveInitialized = NO; + options->RowPerm = LargeDiag_MC64; + options->ReplaceTinyPivot = NO; + options->IterRefine = SLU_DOUBLE; + options->Trans = NOTRANS; + options->SolveInitialized = NO; options->RefineInitialized = NO; - options->PrintStat = YES; - options->num_lookaheads = 10; - options->lookahead_etree = NO; - options->SymPattern = NO; + options->PrintStat = YES; + options->num_lookaheads = 10; + options->lookahead_etree = NO; + options->SymPattern = NO; #ifdef SLU_HAVE_LAPACK - options->DiagInv = YES; + options->DiagInv = YES; #else - options->DiagInv = NO; + options->DiagInv = NO; #endif } @@ -470,7 +477,8 @@ void set_default_options_dist(superlu_dist_options_t *options) */ void print_options_dist(superlu_dist_options_t *options) { - if ( options->PrintStat == NO ) return; + if (options->PrintStat == NO) + return; printf("**************************************************\n"); printf(".. options:\n"); @@ -493,7 +501,8 @@ void print_options_dist(superlu_dist_options_t *options) */ void print_sp_ienv_dist(superlu_dist_options_t *options) { - if ( options->PrintStat == NO ) return; + if (options->PrintStat == NO) + return; printf("**************************************************\n"); printf(".. blocking parameters from sp_ienv():\n"); @@ -504,7 +513,6 @@ void print_sp_ienv_dist(superlu_dist_options_t *options) printf("**************************************************\n"); } - /*! \brief * *
@@ -538,10 +546,9 @@ void print_sp_ienv_dist(superlu_dist_options_t *options)
  *        The 2D process mesh.
  * 
*/ -int_t -pxgstrs_init(int_t n, int_t m_loc, int_t nrhs, int_t fst_row, - int_t perm_r[], int_t perm_c[], gridinfo_t *grid, - Glu_persist_t *Glu_persist, SOLVEstruct_t *SOLVEstruct) +int_t pxgstrs_init(int_t n, int_t m_loc, int_t nrhs, int_t fst_row, + int_t perm_r[], int_t perm_c[], gridinfo_t *grid, + Glu_persist_t *Glu_persist, SOLVEstruct_t *SOLVEstruct) { int *SendCnt, *SendCnt_nrhs, *RecvCnt, *RecvCnt_nrhs; @@ -550,7 +557,7 @@ pxgstrs_init(int_t n, int_t m_loc, int_t nrhs, int_t fst_row, int_t *row_to_proc; int_t i, gbi, k, l, num_diag_procs, *diag_procs; int_t irow, q, knsupc, nsupers, *xsup, *supno; - int iam, p, pkk, procs; + int iam, p, pkk, procs; pxgstrs_comm_t *gstrs_comm; procs = grid->nprow * grid->npcol; @@ -558,44 +565,48 @@ pxgstrs_init(int_t n, int_t m_loc, int_t nrhs, int_t fst_row, gstrs_comm = SOLVEstruct->gstrs_comm; xsup = Glu_persist->xsup; supno = Glu_persist->supno; - nsupers = Glu_persist->supno[n-1] + 1; + nsupers = Glu_persist->supno[n - 1] + 1; row_to_proc = SOLVEstruct->row_to_proc; /* ------------------------------------------------------------ SET UP COMMUNICATION PATTERN FOR ReDistribute_B_to_X. ------------------------------------------------------------*/ - if ( !(itemp = SUPERLU_MALLOC(8*procs * sizeof(int))) ) + if (!(itemp = SUPERLU_MALLOC(8 * procs * sizeof(int)))) ABORT("Malloc fails for B_to_X_itemp[]."); - SendCnt = itemp; - SendCnt_nrhs = itemp + procs; - RecvCnt = itemp + 2*procs; - RecvCnt_nrhs = itemp + 3*procs; - sdispls = itemp + 4*procs; - sdispls_nrhs = itemp + 5*procs; - rdispls = itemp + 6*procs; - rdispls_nrhs = itemp + 7*procs; + SendCnt = itemp; + SendCnt_nrhs = itemp + procs; + RecvCnt = itemp + 2 * procs; + RecvCnt_nrhs = itemp + 3 * procs; + sdispls = itemp + 4 * procs; + sdispls_nrhs = itemp + 5 * procs; + rdispls = itemp + 6 * procs; + rdispls_nrhs = itemp + 7 * procs; /* Count the number of elements to be sent to each diagonal process.*/ - for (p = 0; p < procs; ++p) SendCnt[p] = 0; - for (i = 0, l = fst_row; i < m_loc; ++i, ++l) { + for (p = 0; p < procs; ++p) + SendCnt[p] = 0; + for (i = 0, l = fst_row; i < m_loc; ++i, ++l) + { irow = perm_c[perm_r[l]]; /* Row number in Pc*Pr*B */ - gbi = BlockNum( irow ); - p = PNUM( PROW(gbi,grid), PCOL(gbi,grid), grid ); /* Diagonal process */ - ++SendCnt[p]; + gbi = BlockNum(irow); + p = PNUM(PROW(gbi, grid), PCOL(gbi, grid), grid); /* Diagonal process */ + ++SendCnt[p]; } - + /* Set up the displacements for alltoall. */ MPI_Alltoall(SendCnt, 1, MPI_INT, RecvCnt, 1, MPI_INT, grid->comm); sdispls[0] = rdispls[0] = 0; - for (p = 1; p < procs; ++p) { - sdispls[p] = sdispls[p-1] + SendCnt[p-1]; - rdispls[p] = rdispls[p-1] + RecvCnt[p-1]; + for (p = 1; p < procs; ++p) + { + sdispls[p] = sdispls[p - 1] + SendCnt[p - 1]; + rdispls[p] = rdispls[p - 1] + RecvCnt[p - 1]; } - for (p = 0; p < procs; ++p) { + for (p = 0; p < procs; ++p) + { SendCnt_nrhs[p] = SendCnt[p] * nrhs; - sdispls_nrhs[p] = sdispls[p] * nrhs; + sdispls_nrhs[p] = sdispls[p] * nrhs; RecvCnt_nrhs[p] = RecvCnt[p] * nrhs; - rdispls_nrhs[p] = rdispls[p] * nrhs; + rdispls_nrhs[p] = rdispls[p] * nrhs; } /* This is saved for repeated solves, and is freed in pxgstrs_finalize().*/ @@ -605,39 +616,44 @@ pxgstrs_init(int_t n, int_t m_loc, int_t nrhs, int_t fst_row, SET UP COMMUNICATION PATTERN FOR ReDistribute_X_to_B. ------------------------------------------------------------*/ /* This is freed in pxgstrs_finalize(). 
*/ - if ( !(itemp = SUPERLU_MALLOC(8*procs * sizeof(int))) ) + if (!(itemp = SUPERLU_MALLOC(8 * procs * sizeof(int)))) ABORT("Malloc fails for X_to_B_itemp[]."); - SendCnt = itemp; - SendCnt_nrhs = itemp + procs; - RecvCnt = itemp + 2*procs; - RecvCnt_nrhs = itemp + 3*procs; - sdispls = itemp + 4*procs; - sdispls_nrhs = itemp + 5*procs; - rdispls = itemp + 6*procs; - rdispls_nrhs = itemp + 7*procs; + SendCnt = itemp; + SendCnt_nrhs = itemp + procs; + RecvCnt = itemp + 2 * procs; + RecvCnt_nrhs = itemp + 3 * procs; + sdispls = itemp + 4 * procs; + sdispls_nrhs = itemp + 5 * procs; + rdispls = itemp + 6 * procs; + rdispls_nrhs = itemp + 7 * procs; /* Count the number of X entries to be sent to each process.*/ - for (p = 0; p < procs; ++p) SendCnt[p] = 0; + for (p = 0; p < procs; ++p) + SendCnt[p] = 0; num_diag_procs = SOLVEstruct->num_diag_procs; diag_procs = SOLVEstruct->diag_procs; - for (p = 0; p < num_diag_procs; ++p) { /* for all diagonal processes */ - pkk = diag_procs[p]; - if ( iam == pkk ) { - for (k = p; k < nsupers; k += num_diag_procs) { - knsupc = SuperSize( k ); - irow = FstBlockC( k ); - for (i = 0; i < knsupc; ++i) { + for (p = 0; p < num_diag_procs; ++p) + { /* for all diagonal processes */ + pkk = diag_procs[p]; + if (iam == pkk) + { + for (k = p; k < nsupers; k += num_diag_procs) + { + knsupc = SuperSize(k); + irow = FstBlockC(k); + for (i = 0; i < knsupc; ++i) + { #if 0 q = row_to_proc[inv_perm_c[irow]]; #else - q = row_to_proc[irow]; + q = row_to_proc[irow]; #endif - ++SendCnt[q]; - ++irow; - } - } - } + ++SendCnt[q]; + ++irow; + } + } + } } MPI_Alltoall(SendCnt, 1, MPI_INT, RecvCnt, 1, MPI_INT, grid->comm); @@ -645,19 +661,20 @@ pxgstrs_init(int_t n, int_t m_loc, int_t nrhs, int_t fst_row, sdispls_nrhs[0] = rdispls_nrhs[0] = 0; SendCnt_nrhs[0] = SendCnt[0] * nrhs; RecvCnt_nrhs[0] = RecvCnt[0] * nrhs; - for (p = 1; p < procs; ++p) { - sdispls[p] = sdispls[p-1] + SendCnt[p-1]; - rdispls[p] = rdispls[p-1] + RecvCnt[p-1]; + for (p = 1; p < procs; ++p) + { + sdispls[p] = sdispls[p - 1] + SendCnt[p - 1]; + rdispls[p] = rdispls[p - 1] + RecvCnt[p - 1]; sdispls_nrhs[p] = sdispls[p] * nrhs; rdispls_nrhs[p] = rdispls[p] * nrhs; - SendCnt_nrhs[p] = SendCnt[p] * nrhs; - RecvCnt_nrhs[p] = RecvCnt[p] * nrhs; + SendCnt_nrhs[p] = SendCnt[p] * nrhs; + RecvCnt_nrhs[p] = RecvCnt[p] * nrhs; } /* This is saved for repeated solves, and is freed in pxgstrs_finalize().*/ gstrs_comm->X_to_B_SendCnt = SendCnt; - if ( !(ptr_to_ibuf = SUPERLU_MALLOC(2*procs * sizeof(int))) ) + if (!(ptr_to_ibuf = SUPERLU_MALLOC(2 * procs * sizeof(int)))) ABORT("Malloc fails for ptr_to_ibuf[]."); gstrs_comm->ptr_to_ibuf = ptr_to_ibuf; gstrs_comm->ptr_to_dbuf = ptr_to_ibuf + procs; @@ -665,7 +682,6 @@ pxgstrs_init(int_t n, int_t m_loc, int_t nrhs, int_t fst_row, return 0; } /* PXGSTRS_INIT */ - void pxgstrs_finalize(pxgstrs_comm_t *gstrs_comm) { SUPERLU_FREE(gstrs_comm->B_to_X_SendCnt); @@ -674,201 +690,213 @@ void pxgstrs_finalize(pxgstrs_comm_t *gstrs_comm) SUPERLU_FREE(gstrs_comm); } - /*! \brief Diagnostic print of segment info after panel_dfs(). 
*/ -void print_panel_seg_dist(int_t n, int_t w, int_t jcol, int_t nseg, - int_t *segrep, int_t *repfnz) +void print_panel_seg_dist(int_t n, int_t w, int_t jcol, int_t nseg, + int_t *segrep, int_t *repfnz) { int_t j, k; - - for (j = jcol; j < jcol+w; j++) { - printf("\tcol " IFMT ":\n", j); - for (k = 0; k < nseg; k++) - printf("\t\tseg " IFMT ", segrep " IFMT ", repfnz " IFMT "\n", k, - segrep[k], repfnz[(j-jcol)*n + segrep[k]]); - } + for (j = jcol; j < jcol + w; j++) + { + printf("\tcol " IFMT ":\n", j); + for (k = 0; k < nseg; k++) + printf("\t\tseg " IFMT ", segrep " IFMT ", repfnz " IFMT "\n", k, + segrep[k], repfnz[(j - jcol) * n + segrep[k]]); + } } -void -PStatInit(SuperLUStat_t *stat) +void PStatInit(SuperLUStat_t *stat) { register int_t i; - if ( !(stat->utime = SUPERLU_MALLOC(NPHASES*sizeof(double))) ) - ABORT("Malloc fails for stat->utime[]"); - if ( !(stat->ops = (flops_t *) SUPERLU_MALLOC(NPHASES * sizeof(flops_t))) ) - ABORT("SUPERLU_MALLOC fails for stat->ops[]"); - for (i = 0; i < NPHASES; ++i) { + if (!(stat->utime = SUPERLU_MALLOC(NPHASES * sizeof(double)))) + ABORT("Malloc fails for stat->utime[]"); + if (!(stat->ops = (flops_t *)SUPERLU_MALLOC(NPHASES * sizeof(flops_t)))) + ABORT("SUPERLU_MALLOC fails for stat->ops[]"); + for (i = 0; i < NPHASES; ++i) + { stat->utime[i] = 0.; stat->ops[i] = 0.; } stat->TinyPivots = stat->RefineSteps = 0; } -void -PStatPrint(superlu_dist_options_t *options, SuperLUStat_t *stat, gridinfo_t *grid) +void PStatPrint(superlu_dist_options_t *options, SuperLUStat_t *stat, gridinfo_t *grid) { - double *utime = stat->utime; + double *utime = stat->utime; flops_t *ops = stat->ops; - int_t iam = grid->iam; + int_t iam = grid->iam; flops_t flopcnt, factflop, solveflop; - if ( options->PrintStat == NO ) return; - - if ( !iam && options->Fact != FACTORED ) { - printf("**************************************************\n"); - printf("**** Time (seconds) ****\n"); + if (options->PrintStat == NO) + return; - if ( options->Equil != NO ) - printf("\tEQUIL time %8.2f\n", utime[EQUIL]); - if ( options->RowPerm != NOROWPERM ) - printf("\tROWPERM time %8.2f\n", utime[ROWPERM]); - if ( options->ColPerm != NATURAL ) - printf("\tCOLPERM time %8.2f\n", utime[COLPERM]); + if (!iam && options->Fact != FACTORED) + { + printf("**************************************************\n"); + printf("**** Time (seconds) ****\n"); + + if (options->Equil != NO) + printf("\tEQUIL time %8.2f\n", utime[EQUIL]); + if (options->RowPerm != NOROWPERM) + printf("\tROWPERM time %8.2f\n", utime[ROWPERM]); + if (options->ColPerm != NATURAL) + printf("\tCOLPERM time %8.2f\n", utime[COLPERM]); printf("\tSYMBFACT time %8.2f\n", utime[SYMBFAC]); - printf("\tDISTRIBUTE time %8.2f\n", utime[DIST]); - + printf("\tDISTRIBUTE time %8.2f\n", utime[DIST]); } MPI_Reduce(&ops[FACT], &flopcnt, 1, MPI_FLOAT, MPI_SUM, - 0, grid->comm); + 0, grid->comm); factflop = flopcnt; - if ( !iam && options->Fact != FACTORED ) { - printf("\tFACTOR time %8.2f\n", utime[FACT]); - if ( utime[FACT] != 0.0 ) - printf("\tFactor flops\t%e\tMflops \t%8.2f\n", - flopcnt, - flopcnt*1e-6/utime[FACT]); + if (!iam && options->Fact != FACTORED) + { + printf("\tFACTOR time %8.2f\n", utime[FACT]); + if (utime[FACT] != 0.0) + printf("\tFactor flops\t%e\tMflops \t%8.2f\n", + flopcnt, + flopcnt * 1e-6 / utime[FACT]); } - - MPI_Reduce(&ops[SOLVE], &flopcnt, 1, MPI_FLOAT, MPI_SUM, - 0, grid->comm); + + MPI_Reduce(&ops[SOLVE], &flopcnt, 1, MPI_FLOAT, MPI_SUM, + 0, grid->comm); solveflop = flopcnt; - if ( !iam ) { - printf("\tSOLVE time 
%8.3f\n", utime[SOLVE]); - if ( utime[SOLVE] != 0.0 ) - printf("\tSolve flops\t%e\tMflops \t%8.2f\n", - flopcnt, - flopcnt*1e-6/utime[SOLVE]); - if ( options->IterRefine != NOREFINE ) { - printf("\tREFINEMENT time %8.3f\tSteps%8d\n\n", - utime[REFINE], stat->RefineSteps); - } - printf("**************************************************\n"); + if (!iam) + { + printf("\tSOLVE time %8.3f\n", utime[SOLVE]); + if (utime[SOLVE] != 0.0) + printf("\tSolve flops\t%e\tMflops \t%8.2f\n", + flopcnt, + flopcnt * 1e-6 / utime[SOLVE]); + if (options->IterRefine != NOREFINE) + { + printf("\tREFINEMENT time %8.3f\tSteps%8d\n\n", + utime[REFINE], stat->RefineSteps); + } + printf("**************************************************\n"); } - double *utime1,*utime2,*utime3,*utime4; - flops_t *ops1; -#if ( PROFlevel>=1 ) - fflush(stdout); - MPI_Barrier( grid->comm ); + double *utime1, *utime2, *utime3, *utime4; + flops_t *ops1; +#if (PROFlevel >= 1) + fflush(stdout); + MPI_Barrier(grid->comm); { - int_t i, P = grid->nprow*grid->npcol; - flops_t b, maxflop; - - - if ( !iam )utime1=doubleMalloc_dist(P); - if ( !iam )utime2=doubleMalloc_dist(P); - if ( !iam )utime3=doubleMalloc_dist(P); - if ( !iam )utime4=doubleMalloc_dist(P); - if ( !iam )ops1=(flops_t *) SUPERLU_MALLOC(P * sizeof(flops_t)); - - - // fflush(stdout); - // if ( !iam ) printf("\n.. Tree max sizes:\tbtree\trtree\n"); - // fflush(stdout); - // sleep(2.0); - // MPI_Barrier( grid->comm ); - // for (i = 0; i < P; ++i) { - // if ( iam == i) { - // printf("\t\t%d %5d %5d\n", iam, stat->MaxActiveBTrees,stat->MaxActiveRTrees); - // fflush(stdout); - // } - // MPI_Barrier( grid->comm ); - // } - - // sleep(2.0); - - - MPI_Barrier( grid->comm ); - - if ( !iam ) printf("\n.. FACT time breakdown:\tcomm\ttotal\n"); - - MPI_Gather(&utime[COMM], 1, MPI_DOUBLE,utime1, 1 , MPI_DOUBLE, 0, grid->comm); - MPI_Gather(&utime[FACT], 1, MPI_DOUBLE,utime2, 1 , MPI_DOUBLE, 0, grid->comm); - if ( !iam ) - for (i = 0; i < P; ++i) { - printf("\t\t(%d)%8.2f%8.2f\n", i, utime1[i], utime2[i]); - } - fflush(stdout); - MPI_Barrier( grid->comm ); - - if ( !iam ) printf("\n.. FACT ops distribution:\n"); - MPI_Gather(&ops[FACT], 1, MPI_FLOAT,ops1, 1 , MPI_FLOAT, 0, grid->comm); - - if ( !iam ) - for (i = 0; i < P; ++i) { - printf("\t\t(%d)\t%e\n", i, ops1[i]); - } - fflush(stdout); - MPI_Barrier( grid->comm ); - - MPI_Reduce(&ops[FACT], &maxflop, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm); - - if ( !iam ) { - b = factflop/P/maxflop; - printf("\tFACT load balance: %.2f\n", b); - } - fflush(stdout); - MPI_Barrier( grid->comm ); - - - if ( !iam ) printf("\n.. SOLVE time breakdown:\tcommL \tgemmL\ttrsmL\ttotal\n"); - - MPI_Gather(&utime[SOL_COMM], 1, MPI_DOUBLE,utime1, 1 , MPI_DOUBLE, 0, grid->comm); - MPI_Gather(&utime[SOL_GEMM], 1, MPI_DOUBLE,utime2, 1 , MPI_DOUBLE, 0, grid->comm); - MPI_Gather(&utime[SOL_TRSM], 1, MPI_DOUBLE,utime3, 1 , MPI_DOUBLE, 0, grid->comm); - MPI_Gather(&utime[SOL_TOT], 1, MPI_DOUBLE,utime4, 1 , MPI_DOUBLE, 0, grid->comm); - if ( !iam ) - for (i = 0; i < P; ++i) { - printf("\t\t\t%d%10.5f%10.5f%10.5f%10.5f\n", i,utime1[i],utime2[i],utime3[i], utime4[i]); - } - fflush(stdout); - MPI_Barrier( grid->comm ); - - if ( !iam ) printf("\n.. 
SOLVE ops distribution:\n"); - MPI_Gather(&ops[SOLVE], 1, MPI_FLOAT,ops1, 1 , MPI_FLOAT, 0, grid->comm); - if ( !iam ) - for (i = 0; i < P; ++i) { - printf("\t\t%d\t%e\n", i, ops1[i]); - } - MPI_Reduce(&ops[SOLVE], &maxflop, 1, MPI_FLOAT, MPI_MAX, 0,grid->comm); - if ( !iam ) { - b = solveflop/P/maxflop; - printf("\tSOLVE load balance: %.2f\n", b); - fflush(stdout); - } - + int_t i, P = grid->nprow * grid->npcol; + flops_t b, maxflop; + + if (!iam) + utime1 = doubleMalloc_dist(P); + if (!iam) + utime2 = doubleMalloc_dist(P); + if (!iam) + utime3 = doubleMalloc_dist(P); + if (!iam) + utime4 = doubleMalloc_dist(P); + if (!iam) + ops1 = (flops_t *)SUPERLU_MALLOC(P * sizeof(flops_t)); + + // fflush(stdout); + // if ( !iam ) printf("\n.. Tree max sizes:\tbtree\trtree\n"); + // fflush(stdout); + // sleep(2.0); + // MPI_Barrier( grid->comm ); + // for (i = 0; i < P; ++i) { + // if ( iam == i) { + // printf("\t\t%d %5d %5d\n", iam, stat->MaxActiveBTrees,stat->MaxActiveRTrees); + // fflush(stdout); + // } + // MPI_Barrier( grid->comm ); + // } + + // sleep(2.0); + + MPI_Barrier(grid->comm); + + if (!iam) + printf("\n.. FACT time breakdown:\tcomm\ttotal\n"); + + MPI_Gather(&utime[COMM], 1, MPI_DOUBLE, utime1, 1, MPI_DOUBLE, 0, grid->comm); + MPI_Gather(&utime[FACT], 1, MPI_DOUBLE, utime2, 1, MPI_DOUBLE, 0, grid->comm); + if (!iam) + for (i = 0; i < P; ++i) + { + printf("\t\t(%d)%8.2f%8.2f\n", i, utime1[i], utime2[i]); + } + fflush(stdout); + MPI_Barrier(grid->comm); + + if (!iam) + printf("\n.. FACT ops distribution:\n"); + MPI_Gather(&ops[FACT], 1, MPI_FLOAT, ops1, 1, MPI_FLOAT, 0, grid->comm); + + if (!iam) + for (i = 0; i < P; ++i) + { + printf("\t\t(%d)\t%e\n", i, ops1[i]); + } + fflush(stdout); + MPI_Barrier(grid->comm); + + MPI_Reduce(&ops[FACT], &maxflop, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm); + + if (!iam) + { + b = factflop / P / maxflop; + printf("\tFACT load balance: %.2f\n", b); + } + fflush(stdout); + MPI_Barrier(grid->comm); + + if (!iam) + printf("\n.. SOLVE time breakdown:\tcommL \tgemmL\ttrsmL\ttotal\n"); + + MPI_Gather(&utime[SOL_COMM], 1, MPI_DOUBLE, utime1, 1, MPI_DOUBLE, 0, grid->comm); + MPI_Gather(&utime[SOL_GEMM], 1, MPI_DOUBLE, utime2, 1, MPI_DOUBLE, 0, grid->comm); + MPI_Gather(&utime[SOL_TRSM], 1, MPI_DOUBLE, utime3, 1, MPI_DOUBLE, 0, grid->comm); + MPI_Gather(&utime[SOL_TOT], 1, MPI_DOUBLE, utime4, 1, MPI_DOUBLE, 0, grid->comm); + if (!iam) + for (i = 0; i < P; ++i) + { + printf("\t\t\t%d%10.5f%10.5f%10.5f%10.5f\n", i, utime1[i], utime2[i], utime3[i], utime4[i]); + } + fflush(stdout); + MPI_Barrier(grid->comm); + + if (!iam) + printf("\n.. SOLVE ops distribution:\n"); + MPI_Gather(&ops[SOLVE], 1, MPI_FLOAT, ops1, 1, MPI_FLOAT, 0, grid->comm); + if (!iam) + for (i = 0; i < P; ++i) + { + printf("\t\t%d\t%e\n", i, ops1[i]); + } + MPI_Reduce(&ops[SOLVE], &maxflop, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm); + if (!iam) + { + b = solveflop / P / maxflop; + printf("\tSOLVE load balance: %.2f\n", b); + fflush(stdout); + } + } + + if (!iam) + { + SUPERLU_FREE(utime1); + SUPERLU_FREE(utime2); + SUPERLU_FREE(utime3); + SUPERLU_FREE(utime4); + SUPERLU_FREE(ops1); } - - if ( !iam ){ - SUPERLU_FREE(utime1); - SUPERLU_FREE(utime2); - SUPERLU_FREE(utime3); - SUPERLU_FREE(utime4); - SUPERLU_FREE(ops1); - } - + #endif -/* if ( !iam ) fflush(stdout); CRASH THE SYSTEM pierre. */ + /* if ( !iam ) fflush(stdout); CRASH THE SYSTEM pierre. 
*/ } -void -PStatFree(SuperLUStat_t *stat) +void PStatFree(SuperLUStat_t *stat) { SUPERLU_FREE(stat->utime); SUPERLU_FREE(stat->ops); @@ -879,13 +907,12 @@ PStatFree(SuperLUStat_t *stat) void ifill_dist(int_t *a, int_t alen, int_t ival) { register int_t i; - for (i = 0; i < alen; i++) a[i] = ival; + for (i = 0; i < alen; i++) + a[i] = ival; } - -void -get_diag_procs(int_t n, Glu_persist_t *Glu_persist, gridinfo_t *grid, - int_t *num_diag_procs, int_t **diag_procs, int_t **diag_len) +void get_diag_procs(int_t n, Glu_persist_t *Glu_persist, gridinfo_t *grid, + int_t *num_diag_procs, int_t **diag_procs, int_t **diag_len) { int_t i, j, k, knsupc, nprow, npcol, nsupers, pkk; int_t *xsup; @@ -893,99 +920,108 @@ get_diag_procs(int_t n, Glu_persist_t *Glu_persist, gridinfo_t *grid, i = j = *num_diag_procs = pkk = 0; nprow = grid->nprow; npcol = grid->npcol; - nsupers = Glu_persist->supno[n-1] + 1; + nsupers = Glu_persist->supno[n - 1] + 1; xsup = Glu_persist->xsup; - do { - ++(*num_diag_procs); - i = (++i) % nprow; - j = (++j) % npcol; - pkk = PNUM( i, j, grid ); - } while ( pkk != 0 ); /* Until wrap back to process 0 */ - if ( !(*diag_procs = intMalloc_dist(*num_diag_procs)) ) - ABORT("Malloc fails for diag_procs[]"); - if ( !(*diag_len = intCalloc_dist(*num_diag_procs)) ) - ABORT("Calloc fails for diag_len[]"); - for (i = j = k = 0; k < *num_diag_procs; ++k) { - pkk = PNUM( i, j, grid ); - (*diag_procs)[k] = pkk; - i = (++i) % nprow; - j = (++j) % npcol; + do + { + ++(*num_diag_procs); + i = (++i) % nprow; + j = (++j) % npcol; + pkk = PNUM(i, j, grid); + } while (pkk != 0); /* Until wrap back to process 0 */ + if (!(*diag_procs = intMalloc_dist(*num_diag_procs))) + ABORT("Malloc fails for diag_procs[]"); + if (!(*diag_len = intCalloc_dist(*num_diag_procs))) + ABORT("Calloc fails for diag_len[]"); + for (i = j = k = 0; k < *num_diag_procs; ++k) + { + pkk = PNUM(i, j, grid); + (*diag_procs)[k] = pkk; + i = (++i) % nprow; + j = (++j) % npcol; } - for (k = 0; k < nsupers; ++k) { - knsupc = SuperSize( k ); - i = k % *num_diag_procs; - (*diag_len)[i] += knsupc; + for (k = 0; k < nsupers; ++k) + { + knsupc = SuperSize(k); + i = k % *num_diag_procs; + (*diag_len)[i] += knsupc; } } - /*! 
\brief Get the statistics of the supernodes */ #define NBUCKS 10 -static int_t max_sup_size; +static int_t max_sup_size; void super_stats_dist(int_t nsuper, int_t *xsup) { register int_t nsup1 = 0; - int_t i, isize, whichb, bl, bh; - int_t bucket[NBUCKS]; + int_t i, isize, whichb, bl, bh; + int_t bucket[NBUCKS]; max_sup_size = 0; - for (i = 0; i <= nsuper; i++) { - isize = xsup[i+1] - xsup[i]; - if ( isize == 1 ) nsup1++; - if ( max_sup_size < isize ) max_sup_size = isize; + for (i = 0; i <= nsuper; i++) + { + isize = xsup[i + 1] - xsup[i]; + if (isize == 1) + nsup1++; + if (max_sup_size < isize) + max_sup_size = isize; } - printf(" Supernode statistics:\n\tno of super = " IFMT "\n", nsuper+1); + printf(" Supernode statistics:\n\tno of super = " IFMT "\n", nsuper + 1); printf("\tmax supernode size = " IFMT "\n", max_sup_size); printf("\tno of size 1 supernodes = " IFMT "\n", nsup1); /* Histogram of the supernode sizes */ - ifill_dist (bucket, NBUCKS, 0); + ifill_dist(bucket, NBUCKS, 0); - for (i = 0; i <= nsuper; i++) { - isize = xsup[i+1] - xsup[i]; - whichb = (float) isize / max_sup_size * NBUCKS; - if (whichb >= NBUCKS) whichb = NBUCKS - 1; + for (i = 0; i <= nsuper; i++) + { + isize = xsup[i + 1] - xsup[i]; + whichb = (float)isize / max_sup_size * NBUCKS; + if (whichb >= NBUCKS) + whichb = NBUCKS - 1; bucket[whichb]++; } - + printf("\tHistogram of supernode sizes:\n"); - for (i = 0; i < NBUCKS; i++) { - bl = (float) i * max_sup_size / NBUCKS; - bh = (float) (i+1) * max_sup_size / NBUCKS; - printf("\tsnode: " IFMT "-" IFMT "\t\t" IFMT "\n", bl+1, bh, bucket[i]); + for (i = 0; i < NBUCKS; i++) + { + bl = (float)i * max_sup_size / NBUCKS; + bh = (float)(i + 1) * max_sup_size / NBUCKS; + printf("\tsnode: " IFMT "-" IFMT "\t\t" IFMT "\n", bl + 1, bh, bucket[i]); } - } - /*! \brief Check whether repfnz[] == EMPTY after reset. 
*/ void check_repfnz_dist(int_t n, int_t w, int_t jcol, int_t *repfnz) { int_t jj, k; - for (jj = jcol; jj < jcol+w; jj++) - for (k = 0; k < n; k++) - if ( repfnz[(jj-jcol)*n + k] != EMPTY ) { - fprintf(stderr, "col " IFMT ", repfnz_col[" IFMT "] = " IFMT "\n", - jj, k, repfnz[(jj-jcol)*n + k]); - ABORT("check_repfnz_dist"); - } + for (jj = jcol; jj < jcol + w; jj++) + for (k = 0; k < n; k++) + if (repfnz[(jj - jcol) * n + k] != EMPTY) + { + fprintf(stderr, "col " IFMT ", repfnz_col[" IFMT "] = " IFMT "\n", + jj, k, repfnz[(jj - jcol) * n + k]); + ABORT("check_repfnz_dist"); + } } void PrintInt10(char *name, int_t len, int_t *x) { register int_t i; - + printf("%10s:", name); - for (i = 0; i < len; ++i) { - if ( i % 10 == 0 ) printf("\n\t[" IFMT "-" IFMT "]", i, i+9); - printf(IFMT, x[i]); + for (i = 0; i < len; ++i) + { + if (i % 10 == 0) + printf("\n\t[" IFMT "-" IFMT "]", i, i + 9); + printf(IFMT, x[i]); } printf("\n"); } @@ -993,11 +1029,13 @@ void PrintInt10(char *name, int_t len, int_t *x) void PrintInt32(char *name, int len, int *x) { register int i; - + printf("%10s:", name); - for (i = 0; i < len; ++i) { - if ( i % 10 == 0 ) printf("\n\t[%2d-%2d]", i, i+9); - printf("%6d", x[i]); + for (i = 0; i < len; ++i) + { + if (i % 10 == 0) + printf("\n\t[%2d-%2d]", i, i + 9); + printf("%6d", x[i]); } printf("\n"); } @@ -1005,11 +1043,13 @@ void PrintInt32(char *name, int len, int *x) int file_PrintInt10(FILE *fp, char *name, int_t len, int_t *x) { register int_t i; - + fprintf(fp, "%10s:", name); - for (i = 0; i < len; ++i) { - if ( i % 10 == 0 ) fprintf(fp, "\n\t[" IFMT "-" IFMT "]", i, i+9); - fprintf(fp, IFMT, x[i]); + for (i = 0; i < len; ++i) + { + if (i % 10 == 0) + fprintf(fp, "\n\t[" IFMT "-" IFMT "]", i, i + 9); + fprintf(fp, IFMT, x[i]); } fprintf(fp, "\n"); return 0; @@ -1018,43 +1058,50 @@ int file_PrintInt10(FILE *fp, char *name, int_t len, int_t *x) int file_PrintInt32(FILE *fp, char *name, int len, int *x) { register int i; - + fprintf(fp, "%10s:", name); - for (i = 0; i < len; ++i) { - if ( i % 10 == 0 ) fprintf(fp, "\n\t[%2d-%2d]", i, i+9); - fprintf(fp, "%6d", x[i]); + for (i = 0; i < len; ++i) + { + if (i % 10 == 0) + fprintf(fp, "\n\t[%2d-%2d]", i, i + 9); + fprintf(fp, "%6d", x[i]); } fprintf(fp, "\n"); return 0; } -int_t -CheckZeroDiagonal(int_t n, int_t *rowind, int_t *colbeg, int_t *colcnt) +int_t CheckZeroDiagonal(int_t n, int_t *rowind, int_t *colbeg, int_t *colcnt) { register int_t i, j, zd, numzd = 0; - for (j = 0; j < n; ++j) { - zd = 0; - for (i = colbeg[j]; i < colbeg[j]+colcnt[j]; ++i) { - /*if ( iperm[rowind[i]] == j ) zd = 1;*/ - if ( rowind[i] == j ) { zd = 1; break; } - } - if ( zd == 0 ) { -#if ( PRNTlevel>=2 ) - printf(".. Diagonal of column %d is zero.\n", j); + for (j = 0; j < n; ++j) + { + zd = 0; + for (i = colbeg[j]; i < colbeg[j] + colcnt[j]; ++i) + { + /*if ( iperm[rowind[i]] == j ) zd = 1;*/ + if (rowind[i] == j) + { + zd = 1; + break; + } + } + if (zd == 0) + { +#if (PRNTlevel >= 2) + printf(".. Diagonal of column %d is zero.\n", j); #endif - ++numzd; - } + ++numzd; + } } return numzd; } - /* --------------------------------------------------------------------------- */ void isort(int_t N, int_t *ARRAY1, int_t *ARRAY2) { -/* + /* * Purpose * ======= * Use quick sort algorithm to sort ARRAY1 and ARRAY2 in the increasing @@ -1073,34 +1120,39 @@ void isort(int_t N, int_t *ARRAY1, int_t *ARRAY2) * On entry, contains the array to be sorted. * On exit, contains the sorted array. 
*/ - int_t IGAP, I, J; - int_t TEMP; - IGAP = N / 2; - while (IGAP > 0) { - for (I = IGAP; I < N; I++) { - J = I - IGAP; - while (J >= 0) { - if (ARRAY1[J] > ARRAY1[J + IGAP]) { - TEMP = ARRAY1[J]; - ARRAY1[J] = ARRAY1[J + IGAP]; - ARRAY1[J + IGAP] = TEMP; - TEMP = ARRAY2[J]; - ARRAY2[J] = ARRAY2[J + IGAP]; - ARRAY2[J + IGAP] = TEMP; - J = J - IGAP; - } else { - break; - } - } - } - IGAP = IGAP / 2; - } + int_t IGAP, I, J; + int_t TEMP; + IGAP = N / 2; + while (IGAP > 0) + { + for (I = IGAP; I < N; I++) + { + J = I - IGAP; + while (J >= 0) + { + if (ARRAY1[J] > ARRAY1[J + IGAP]) + { + TEMP = ARRAY1[J]; + ARRAY1[J] = ARRAY1[J + IGAP]; + ARRAY1[J + IGAP] = TEMP; + TEMP = ARRAY2[J]; + ARRAY2[J] = ARRAY2[J + IGAP]; + ARRAY2[J + IGAP] = TEMP; + J = J - IGAP; + } + else + { + break; + } + } + } + IGAP = IGAP / 2; + } } - void isort1(int_t N, int_t *ARRAY) { -/* + /* * Purpose * ======= * Use quick sort algorithm to sort ARRAY in increasing order. @@ -1115,90 +1167,97 @@ void isort1(int_t N, int_t *ARRAY) * On exit, contains the sorted array. * */ - int_t IGAP, I, J; - int_t TEMP; - IGAP = N / 2; - while (IGAP > 0) { - for (I = IGAP; I < N; I++) { - J = I - IGAP; - while (J >= 0) { - if (ARRAY[J] > ARRAY[J + IGAP]) { - TEMP = ARRAY[J]; - ARRAY[J] = ARRAY[J + IGAP]; - ARRAY[J + IGAP] = TEMP; - J = J - IGAP; - } else { - break; - } - } - } - IGAP = IGAP / 2; - } + int_t IGAP, I, J; + int_t TEMP; + IGAP = N / 2; + while (IGAP > 0) + { + for (I = IGAP; I < N; I++) + { + J = I - IGAP; + while (J >= 0) + { + if (ARRAY[J] > ARRAY[J + IGAP]) + { + TEMP = ARRAY[J]; + ARRAY[J] = ARRAY[J + IGAP]; + ARRAY[J + IGAP] = TEMP; + J = J - IGAP; + } + else + { + break; + } + } + } + IGAP = IGAP / 2; + } } /* Only log the memory for the buffer space, excluding the LU factors */ -void log_memory(int64_t cur_bytes, SuperLUStat_t *stat) { - stat->current_buffer += (float) cur_bytes; - if (cur_bytes > 0) { - stat->peak_buffer = - SUPERLU_MAX(stat->peak_buffer, stat->current_buffer); +void log_memory(int64_t cur_bytes, SuperLUStat_t *stat) +{ + stat->current_buffer += (float)cur_bytes; + if (cur_bytes > 0) + { + stat->peak_buffer = + SUPERLU_MAX(stat->peak_buffer, stat->current_buffer); } } -void print_memorylog(SuperLUStat_t *stat, char *msg) { +void print_memorylog(SuperLUStat_t *stat, char *msg) +{ printf("__ %s (MB):\n\tcurrent_buffer : %8.2f\tpeak_buffer : %8.2f\n", - msg, stat->current_buffer, stat->peak_buffer); + msg, stat->current_buffer, stat->peak_buffer); } -int compare_pair (const void *a, const void *b) +int compare_pair(const void *a, const void *b) { - return (((struct superlu_pair *) a)->val - ((struct superlu_pair *) b)->val); + return (((struct superlu_pair *)a)->val - ((struct superlu_pair *)b)->val); } int get_thread_per_process() -{ - char* ttemp; +{ + char *ttemp; ttemp = getenv("THREAD_PER_PROCESS"); - if(ttemp) return atoi(ttemp); - else return 1; + if (ttemp) + return atoi(ttemp); + else + return 1; } -int_t -get_max_buffer_size () +int_t get_max_buffer_size() { char *ttemp; - ttemp = getenv ("MAX_BUFFER_SIZE"); + ttemp = getenv("MAX_BUFFER_SIZE"); if (ttemp) - return atoi (ttemp); + return atoi(ttemp); else return 5000000; } -int_t -get_cublas_nb () +int_t get_cublas_nb() { char *ttemp; - ttemp = getenv ("CUBLAS_NB"); + ttemp = getenv("CUBLAS_NB"); if (ttemp) - return atoi (ttemp); + return atoi(ttemp); else return 64; } -int_t -get_num_cuda_streams () +int_t get_num_cuda_streams() { char *ttemp; - ttemp = getenv ("NUM_CUDA_STREAMS"); + ttemp = getenv("NUM_CUDA_STREAMS"); if (ttemp) - return 
atoi (ttemp); + return atoi(ttemp); else return 8; } -int_t -get_min (int_t * sums, int_t nprocs) +int_t get_min(int_t *sums, int_t nprocs) { int_t min_ind, min_val; min_ind = 0; @@ -1215,9 +1274,8 @@ get_min (int_t * sums, int_t nprocs) return min_ind; } -int_t -static_partition (struct superlu_pair *work_load, int_t nwl, int_t *partition, - int_t ldp, int_t * sums, int_t * counts, int nprocs) +int_t static_partition(struct superlu_pair *work_load, int_t nwl, int_t *partition, + int_t ldp, int_t *sums, int_t *counts, int nprocs) { //initialization loop for (int i = 0; i < nprocs; ++i) @@ -1225,16 +1283,15 @@ static_partition (struct superlu_pair *work_load, int_t nwl, int_t *partition, counts[i] = 0; sums[i] = 0; } - qsort (work_load, nwl, sizeof (struct superlu_pair), compare_pair); + qsort(work_load, nwl, sizeof(struct superlu_pair), compare_pair); // for(int i=0;i= 0; i--) { - int_t ind = get_min (sums, nprocs); + int_t ind = get_min(sums, nprocs); // printf("ind %d\n",ind ); partition[ldp * ind + counts[ind]] = work_load[i].ind; counts[ind]++; sums[ind] += work_load[i].val; - } return 0; @@ -1243,19 +1300,18 @@ static_partition (struct superlu_pair *work_load, int_t nwl, int_t *partition, /* * Search for the metadata of the j-th block in a U panel. */ -void -arrive_at_ublock (int_t j, /* j-th block in a U panel */ - int_t * iukp, /* output : point to index[] of j-th block */ - int_t * rukp, /* output : point to nzval[] of j-th block */ - int_t * jb, /* Global block number of block U(k,j). */ - int_t * ljb, /* Local block number of U(k,j). */ - int_t * nsupc,/* supernode size of destination block */ - int_t iukp0, /* input : search starting point */ - int_t rukp0, - int_t * usub, /* U subscripts */ - int_t * perm_u, /* permutation vector from static schedule */ - int_t * xsup, /* for SuperSize and LBj */ - gridinfo_t * grid) +void arrive_at_ublock(int_t j, /* j-th block in a U panel */ + int_t *iukp, /* output : point to index[] of j-th block */ + int_t *rukp, /* output : point to nzval[] of j-th block */ + int_t *jb, /* Global block number of block U(k,j). */ + int_t *ljb, /* Local block number of U(k,j). */ + int_t *nsupc, /* supernode size of destination block */ + int_t iukp0, /* input : search starting point */ + int_t rukp0, + int_t *usub, /* U subscripts */ + int_t *perm_u, /* permutation vector from static schedule */ + int_t *xsup, /* for SuperSize and LBj */ + gridinfo_t *grid) { int_t jj; *iukp = iukp0; /* point to the first block in index[] */ @@ -1276,49 +1332,48 @@ arrive_at_ublock (int_t j, /* j-th block in a U panel */ * usub[] - index array for panel U(k,:) */ // printf("iukp %d \n",*iukp ); - *jb = usub[*iukp]; /* Global block number of block U(k,jj). */ + *jb = usub[*iukp]; /* Global block number of block U(k,jj). */ // printf("jb %d \n",*jb ); - *nsupc = SuperSize (*jb); + *nsupc = SuperSize(*jb); // printf("nsupc %d \n",*nsupc ); - *iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */ + *iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */ *rukp += usub[*iukp - 1]; /* Jump # of nonzeros in block U(k,jj); - Move to block U(k,jj+1) in nzval[] */ + Move to block U(k,jj+1) in nzval[] */ *iukp += *nsupc; } /* Set the pointers to the beginning of U block U(k,j) */ - *jb = usub[*iukp]; /* Global block number of block U(k,j). */ - *ljb = LBj (*jb, grid); /* Local block number of U(k,j). */ - *nsupc = SuperSize (*jb); - *iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */ + *jb = usub[*iukp]; /* Global block number of block U(k,j). 
*/ + *ljb = LBj(*jb, grid); /* Local block number of U(k,j). */ + *nsupc = SuperSize(*jb); + *iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */ } - /* * Count the maximum size of U(kk,:) that I own locally. * September 28, 2016. * Modified December 4, 2018. */ -int_t num_full_cols_U -( - int_t kk, int_t **Ufstnz_br_ptr, int_t *xsup, - gridinfo_t *grid, int_t *perm_u, - int_t *ldu /* max. segment size of nonzero columns in U(kk,:) */ +int_t num_full_cols_U( + int_t kk, int_t **Ufstnz_br_ptr, int_t *xsup, + gridinfo_t *grid, int_t *perm_u, + int_t *ldu /* max. segment size of nonzero columns in U(kk,:) */ ) { - int_t lk = LBi (kk, grid); + int_t lk = LBi(kk, grid); int_t *usub = Ufstnz_br_ptr[lk]; - if (usub == NULL) return 0; /* code */ + if (usub == NULL) + return 0; /* code */ + + int_t iukp = BR_HEADER; /* Skip header; Pointer to index[] of U(k,:) */ + int_t rukp = 0; /* Pointer to nzval[] of U(k,:) */ + int_t nub = usub[0]; /* Number of blocks in the block row U(k,:) */ - int_t iukp = BR_HEADER; /* Skip header; Pointer to index[] of U(k,:) */ - int_t rukp = 0; /* Pointer to nzval[] of U(k,:) */ - int_t nub = usub[0]; /* Number of blocks in the block row U(k,:) */ - - int_t klst = FstBlockC (kk + 1); + int_t klst = FstBlockC(kk + 1); int_t iukp0 = iukp; int_t rukp0 = rukp; - int_t jb,ljb; + int_t jb, ljb; int_t nsupc; int_t full = 1; int_t full_Phi = 1; @@ -1327,38 +1382,41 @@ int_t num_full_cols_U *ldu = 0; - for (int_t j = 0; j < nub; ++j) { - - /* Sherry -- no need to search from beginning ?? */ + for (int_t j = 0; j < nub; ++j) + { + + /* Sherry -- no need to search from beginning ?? */ arrive_at_ublock( - j, &iukp, &rukp, &jb, &ljb, &nsupc, - iukp0, rukp0, usub, perm_u, xsup, grid - ); - for (int_t jj = iukp; jj < iukp + nsupc; ++jj) { + j, &iukp, &rukp, &jb, &ljb, &nsupc, + iukp0, rukp0, usub, perm_u, xsup, grid); + for (int_t jj = iukp; jj < iukp + nsupc; ++jj) + { segsize = klst - usub[jj]; - if ( segsize ) ++temp_ncols; - if ( segsize > *ldu ) *ldu = segsize; + if (segsize) + ++temp_ncols; + if (segsize > *ldu) + *ldu = segsize; } } return temp_ncols; } int_t estimate_bigu_size( - int_t nsupers, - int_t**Ufstnz_br_ptr, /* point to U index[] array */ - Glu_persist_t *Glu_persist, - gridinfo_t* grid, int_t* perm_u, - int_t *max_ncols /* Output: Max. number of columns in among all U(k,:). + int_t nsupers, + int_t **Ufstnz_br_ptr, /* point to U index[] array */ + Glu_persist_t *Glu_persist, + gridinfo_t *grid, int_t *perm_u, + int_t *max_ncols /* Output: Max. number of columns in among all U(k,:). This is used for allocating GEMM V buffer. */ - ) +) { int_t iam = grid->iam; int_t Pc = grid->npcol; int_t Pr = grid->nprow; - int_t myrow = MYROW (iam, grid); - int_t mycol = MYCOL (iam, grid); - - int_t* xsup = Glu_persist->xsup; + int_t myrow = MYROW(iam, grid); + int_t mycol = MYCOL(iam, grid); + + int_t *xsup = Glu_persist->xsup; int_t ncols = 0; /* Count local number of nonzero columns */ int_t ldu = 0; /* Count max. 
segment size in one row U(k,:) */ @@ -1366,12 +1424,14 @@ int_t estimate_bigu_size( int_t max_ldu = 0; /* Initialize perm_u */ - for (int i = 0; i < nsupers; ++i) perm_u[i] = i; + for (int i = 0; i < nsupers; ++i) + perm_u[i] = i; - for (int lk = myrow; lk < nsupers; lk += Pr) {/* Go through my block rows */ + for (int lk = myrow; lk < nsupers; lk += Pr) + { /* Go through my block rows */ ncols = SUPERLU_MAX(ncols, num_full_cols_U(lk, Ufstnz_br_ptr, - xsup, grid, perm_u, &ldu) ); - my_max_ldu = SUPERLU_MAX(ldu, my_max_ldu); + xsup, grid, perm_u, &ldu)); + my_max_ldu = SUPERLU_MAX(ldu, my_max_ldu); } /* Need U buffer size large enough to hold all U(k,:) transferred from @@ -1379,196 +1439,243 @@ int_t estimate_bigu_size( MPI_Allreduce(&my_max_ldu, &max_ldu, 1, mpi_int_t, MPI_MAX, grid->cscp.comm); MPI_Allreduce(&ncols, max_ncols, 1, mpi_int_t, MPI_MAX, grid->cscp.comm); -#if ( PRNTlevel>=1 ) - if ( iam==0 ) { - printf("max_ncols " IFMT ", max_ldu " IFMT ", bigu_size " IFMT "\n", - *max_ncols, max_ldu, max_ldu * (*max_ncols)); - fflush(stdout); +#if (PRNTlevel >= 1) + if (iam == 0) + { + printf("max_ncols " IFMT ", max_ldu " IFMT ", bigu_size " IFMT "\n", + *max_ncols, max_ldu, max_ldu * (*max_ncols)); + fflush(stdout); } #endif - return(max_ldu * (*max_ncols)); + return (max_ldu * (*max_ncols)); } -void quickSort( int_t* a, int_t l, int_t r, int_t dir) +void quickSort(int_t *a, int_t l, int_t r, int_t dir) { - int_t j; - - if( l < r ) - { - // divide and conquer - j = partition( a, l, r, dir); - quickSort( a, l, j-1, dir); - quickSort( a, j+1, r, dir); - } - -} + int_t j; -int_t partition( int_t* a, int_t l, int_t r, int_t dir) { - int_t pivot, i, j, t; - pivot = a[l]; - i = l; j = r+1; - - if(dir==0){ - while( 1) - { - do ++i; while( a[i] <= pivot && i <= r ); - do --j; while( a[j] > pivot ); - if( i >= j ) break; - t = a[i]; a[i] = a[j]; a[j] = t; - } - t = a[l]; a[l] = a[j]; a[j] = t; - return j; - }else if(dir==1){ - while( 1) - { - do ++i; while( a[i] >= pivot && i <= r ); - do --j; while( a[j] < pivot ); - if( i >= j ) break; - t = a[i]; a[i] = a[j]; a[j] = t; - } - t = a[l]; a[l] = a[j]; a[j] = t; - return j; - } - return 0; + if (l < r) + { + // divide and conquer + j = partition(a, l, r, dir); + quickSort(a, l, j - 1, dir); + quickSort(a, j + 1, r, dir); + } } +int_t partition(int_t *a, int_t l, int_t r, int_t dir) +{ + int_t pivot, i, j, t; + pivot = a[l]; + i = l; + j = r + 1; + if (dir == 0) + { + while (1) + { + do + ++i; + while (a[i] <= pivot && i <= r); + do + --j; + while (a[j] > pivot); + if (i >= j) + break; + t = a[i]; + a[i] = a[j]; + a[j] = t; + } + t = a[l]; + a[l] = a[j]; + a[j] = t; + return j; + } + else if (dir == 1) + { + while (1) + { + do + ++i; + while (a[i] >= pivot && i <= r); + do + --j; + while (a[j] < pivot); + if (i >= j) + break; + t = a[i]; + a[i] = a[j]; + a[j] = t; + } + t = a[l]; + a[l] = a[j]; + a[j] = t; + return j; + } + return 0; +} -void quickSortM( int_t* a, int_t l, int_t r, int_t lda, int_t dir, int_t dims) +void quickSortM(int_t *a, int_t l, int_t r, int_t lda, int_t dir, int_t dims) { - int_t j; - - if( l < r ) - { - // printf("dims: %5d",dims); - // fflush(stdout); - - // divide and conquer - j = partitionM( a, l, r,lda,dir, dims); - quickSortM( a, l, j-1,lda,dir,dims); - quickSortM( a, j+1, r,lda,dir,dims); - } - -} + int_t j; + if (l < r) + { + // printf("dims: %5d",dims); + // fflush(stdout); -int_t partitionM( int_t* a, int_t l, int_t r, int_t lda, int_t dir, int_t dims) { - int_t pivot, i, j, t, dd; - pivot = a[l]; - i = l; j = r+1; 
- - if(dir==0){ - while( 1) - { - do ++i; while( a[i] <= pivot && i <= r ); - do --j; while( a[j] > pivot ); - if( i >= j ) break; - for(dd=0;dd= pivot && i <= r ); - do --j; while( a[j] < pivot ); - if( i >= j ) break; - for(dd=0;dd pivot); + if (i >= j) + break; + for (dd = 0; dd < dims; dd++) + { + t = a[i + lda * dd]; + a[i + lda * dd] = a[j + lda * dd]; + a[j + lda * dd] = t; + } + } + for (dd = 0; dd < dims; dd++) + { + t = a[l + lda * dd]; + a[l + lda * dd] = a[j + lda * dd]; + a[j + lda * dd] = t; + } + return j; + } + else if (dir == 1) + { + while (1) + { + do + ++i; + while (a[i] >= pivot && i <= r); + do + --j; + while (a[j] < pivot); + if (i >= j) + break; + for (dd = 0; dd < dims; dd++) + { + t = a[i + lda * dd]; + a[i + lda * dd] = a[j + lda * dd]; + a[j + lda * dd] = t; + } + } + for (dd = 0; dd < dims; dd++) + { + t = a[l + lda * dd]; + a[l + lda * dd] = a[j + lda * dd]; + a[j + lda * dd] = t; + } + return j; + } + + return 0; +} /* * The following are from 3D code p3dcomm.c */ -int AllocGlu_3d(int_t n, int_t nsupers, LUstruct_t * LUstruct) +int AllocGlu_3d(int_t n, int_t nsupers, LUstruct_t *LUstruct) { /*broadcasting Glu_persist*/ - LUstruct->Glu_persist->xsup = intMalloc_dist(nsupers+1); //INT_T_ALLOC(nsupers+1); - LUstruct->Glu_persist->supno = intMalloc_dist(n); //INT_T_ALLOC(n); + LUstruct->Glu_persist->xsup = intMalloc_dist(nsupers + 1); //INT_T_ALLOC(nsupers+1); + LUstruct->Glu_persist->supno = intMalloc_dist(n); //INT_T_ALLOC(n); return 0; } // Sherry added -int DeAllocGlu_3d(LUstruct_t * LUstruct) +int DeAllocGlu_3d(LUstruct_t *LUstruct) { SUPERLU_FREE(LUstruct->Glu_persist->xsup); SUPERLU_FREE(LUstruct->Glu_persist->supno); return 0; } -int_t** getTreePerm( int_t* myTreeIdxs, int_t* myZeroTrIdxs, - int_t* nodeCount, int_t** nodeList, - int_t* perm_c_supno, int_t* iperm_c_supno, - gridinfo3d_t* grid3d) +int_t **getTreePerm(int_t *myTreeIdxs, int_t *myZeroTrIdxs, + int_t *nodeCount, int_t **nodeList, + int_t *perm_c_supno, int_t *iperm_c_supno, + gridinfo3d_t *grid3d) { int_t maxLvl = log2i(grid3d->zscp.Np) + 1; - - int_t** treePerm = SUPERLU_MALLOC(sizeof(int_t*)*maxLvl); + + int_t **treePerm = SUPERLU_MALLOC(sizeof(int_t *) * maxLvl); for (int_t lvl = 0; lvl < maxLvl; lvl++) - { - // treePerm[lvl] = NULL; - int_t treeId = myTreeIdxs[lvl]; - treePerm[lvl] = getPermNodeList(nodeCount[treeId], nodeList[treeId], - perm_c_supno, iperm_c_supno); - - } + { + // treePerm[lvl] = NULL; + int_t treeId = myTreeIdxs[lvl]; + treePerm[lvl] = getPermNodeList(nodeCount[treeId], nodeList[treeId], + perm_c_supno, iperm_c_supno); + } return treePerm; } -int_t* getMyNodeCounts(int_t maxLvl, int_t* myTreeIdxs, int_t* gNodeCount) +int_t *getMyNodeCounts(int_t maxLvl, int_t *myTreeIdxs, int_t *gNodeCount) { - int_t* myNodeCount = INT_T_ALLOC(maxLvl); + int_t *myNodeCount = INT_T_ALLOC(maxLvl); for (int i = 0; i < maxLvl; ++i) { - myNodeCount[i] = gNodeCount[myTreeIdxs[i]]; + myNodeCount[i] = gNodeCount[myTreeIdxs[i]]; } return myNodeCount; } /*chekc a vector vec of len across different process grids*/ -int_t checkIntVector3d(int_t* vec, int_t len, gridinfo3d_t* grid3d) +int_t checkIntVector3d(int_t *vec, int_t len, gridinfo3d_t *grid3d) { int_t nP = grid3d->zscp.Np; int_t myGrid = grid3d->zscp.Iam; - int_t * buf = intMalloc_dist(len); - - if (!myGrid) { - for (int_t p = 1; p < nP; ++p) - { - MPI_Status status; - MPI_Recv(buf, len, mpi_int_t, p, p, grid3d->zscp.comm, &status); - - for (int_t i = 0; i < len ; ++i) { - /* code */ - if (buf[i] != vec[i]) { - /* code */ - printf("Error 
occured at (%d) Loc %d \n", (int) p, (int) i); - exit(0); - } - } - } + int_t *buf = intMalloc_dist(len); + + if (!myGrid) + { + for (int_t p = 1; p < nP; ++p) + { + MPI_Status status; + MPI_Recv(buf, len, mpi_int_t, p, p, grid3d->zscp.comm, &status); + + for (int_t i = 0; i < len; ++i) + { + /* code */ + if (buf[i] != vec[i]) + { + /* code */ + printf("Error occured at (%d) Loc %d \n", (int)p, (int)i); + exit(0); + } + } + } } else - { - MPI_Send(vec, len, mpi_int_t, 0, myGrid, grid3d->zscp.comm); - } - + { + MPI_Send(vec, len, mpi_int_t, 0, myGrid, grid3d->zscp.comm); + } + return 0; } @@ -1576,19 +1683,19 @@ int_t checkIntVector3d(int_t* vec, int_t len, gridinfo3d_t* grid3d) * reduce the states from all the two grids before prinitng it out * See the defenition of enum PhaseType in superlu_enum_const.h */ -int_t reduceStat(PhaseType PHASE, - SuperLUStat_t *stat, gridinfo3d_t * grid3d) +int_t reduceStat(PhaseType PHASE, + SuperLUStat_t *stat, gridinfo3d_t *grid3d) { flops_t *ops = stat->ops; - + flops_t flopcnt; MPI_Reduce(&ops[PHASE], &flopcnt, 1, MPI_FLOAT, MPI_SUM, 0, grid3d->zscp.comm); - + if (!grid3d->zscp.Iam) - { - ops[PHASE] = flopcnt; - } - + { + ops[PHASE] = flopcnt; + } + return 0; } From 3389c5cbb16d57b93cc5841288fb396439ae9b7e Mon Sep 17 00:00:00 2001 From: piyush sao Date: Wed, 10 Feb 2021 21:13:13 -0500 Subject: [PATCH 063/147] disabling multiple cuda stream --- SRC/superlu_gpu.cu | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/SRC/superlu_gpu.cu b/SRC/superlu_gpu.cu index 73c4bc29..d2450484 100644 --- a/SRC/superlu_gpu.cu +++ b/SRC/superlu_gpu.cu @@ -48,13 +48,18 @@ cudaError_t checkCuda(cudaError_t result) int_t getnCudaStreams() { - char *ttemp; - ttemp = getenv ("N_CUDA_STREAMS"); - - if (ttemp) - return atoi (ttemp); - else + // Disabling multiple cuda streams + #if 1 return 1; + #else + char *ttemp; + ttemp = getenv ("N_CUDA_STREAMS"); + + if (ttemp) + return atoi (ttemp); + else + return 1; + #endif } @@ -751,7 +756,9 @@ void printDevProp(cudaDeviceProp devProp) { size_t mfree, mtotal; cudaMemGetInfo (&mfree, &mtotal); - + + printf("pciBusID: %d\n", devProp.pciBusID); + printf("pciDeviceID: %d\n", devProp.pciDeviceID); printf("GPU Name: %s\n", devProp.name); printf("Total global memory: %zu\n", devProp.totalGlobalMem); printf("Total free memory: %zu\n", mfree); @@ -1264,7 +1271,7 @@ void CopyLUToGPU3D ( LUstruct_gpu **dA_gpu = &(sluGPU->dA_gpu); #ifdef GPU_DEBUG - if ( grid3d->iam == 0 ) + // if ( grid3d->iam == 0 ) { print_occupany(); cudaDeviceProp devProp; @@ -1322,13 +1329,13 @@ void CopyLUToGPU3D ( checkCudaErrors(cudaMallocHost( &tmp_ptr, bigu_size * sizeof(double) )) ; A_gpu->scubufs[streamId].bigU_host = (double *) tmp_ptr; - cudaMallocHost ( &tmp_ptr, sizeof(double) * (A_host->bufmax[1])); + checkCudaErrors(cudaMallocHost ( &tmp_ptr, sizeof(double) * (A_host->bufmax[1]))); A_gpu->acc_L_buff = (double *) tmp_ptr; - cudaMallocHost ( &tmp_ptr, sizeof(double) * (A_host->bufmax[3])); + checkCudaErrors(cudaMallocHost ( &tmp_ptr, sizeof(double) * (A_host->bufmax[3]))); A_gpu->acc_U_buff = (double *) tmp_ptr; - cudaMallocHost ( &tmp_ptr, sizeof(int_t) * (A_host->bufmax[0])); + checkCudaErrors(cudaMallocHost ( &tmp_ptr, sizeof(int_t) * (A_host->bufmax[0]))); A_gpu->scubufs[streamId].lsub_buf = (int_t *) tmp_ptr; - cudaMallocHost ( &tmp_ptr, sizeof(int_t) * (A_host->bufmax[2])); + checkCudaErrors(cudaMallocHost ( &tmp_ptr, sizeof(int_t) * (A_host->bufmax[2]))); A_gpu->scubufs[streamId].usub_buf = (int_t *) tmp_ptr; 
checkCudaErrors(cudaMalloc( &tmp_ptr, remain_l_max * sizeof(double) )) ; From 46da847a7ab9517d21681050e9c4fd575393742a Mon Sep 17 00:00:00 2001 From: Xiaoye Li Date: Mon, 22 Feb 2021 11:21:07 -0500 Subject: [PATCH 064/147] Update CMake CUDA setup in CMakeLists.txt to use the new CUDAToolkit. Update the configure files for CUDA setup: superlu_dist_config.h.in, make.inc.in --- CMakeLists.txt | 142 +++++++++++++++++++++++++++-------- SRC/CMakeLists.txt | 6 ++ SRC/superlu_dist_config.h | 3 + SRC/superlu_dist_config.h.in | 3 + SRC/superlu_gpu.cu | 6 +- make.inc.in | 24 ++++-- 6 files changed, 142 insertions(+), 42 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 80089345..76f838a9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,7 +8,7 @@ cmake_minimum_required(VERSION 2.8.12 FATAL_ERROR) # Project version numbers -project(SuperLU_DIST C CXX CUDA) +project(SuperLU_DIST C CXX) set(VERSION_MAJOR "7") set(VERSION_MINOR "0") set(VERSION_BugFix "0") @@ -19,52 +19,46 @@ list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") # Set up options option(enable_doc "Build doxygen documentation" OFF) option(enable_double "Enable double precision library" ON) -#option(enable_complex16 "Enable complex16 precision library" ON) -option(enable_complex16 "Enable complex16 precision library" OFF) +option(enable_complex16 "Enable complex16 precision library" ON) option(enable_tests "Build tests" ON) option(enable_examples "Build examples" ON) -option(TPL_ENABLE_BLASLIB "Build the CBLAS library" ${enable_blaslib_DEFAULT}) +#-- BLAS +option(TPL_ENABLE_INTERNAL_BLASLIB "Build the CBLAS library" ${enable_blaslib_DEFAULT}) option(TPL_BLAS_LIBRARIES "List of absolute paths to blas libraries [].") -option(TPL_ENABLE_PARMETISLIB "Build the ParMETIS library" ON) +#-- LAPACK option(TPL_ENABLE_LAPACKLIB "Enable LAPACK library" ON) -option(TPL_LAPACK_LIBRARIES "List of absolute paths to lapack libraries [].") +option(TPL_LAPACK_LIBRARIES "List of absolute paths to LAPACK libraries [].") +#-- ParMETIS +option(TPL_ENABLE_PARMETISLIB "Build the ParMETIS library" ON) option(TPL_PARMETIS_LIBRARIES "List of absolute paths to ParMETIS link libraries [].") option(TPL_PARMETIS_INCLUDE_DIRS "List of absolute paths to ParMETIS include directories [].") +#-- CombBLAS option(TPL_ENABLE_COMBBLASLIB "Build the CombBLAS library" OFF) option(TPL_COMBBLAS_LIBRARIES "List of absolute paths to CombBLAS link libraries [].") option(TPL_COMBBLAS_INCLUDE_DIRS "List of absolute paths to CombBLAS include directories [].") - +#-- CUDA +option(TPL_ENABLE_CUDALIB "Enable the CUDA libraries" OFF) ###################################################################### # # IDEAS: xSDK standards module -MESSAGE("\nProcess XSDK defaults ...") +#MESSAGE("\nProcess XSDK defaults ...") # SET(USE_XSDK_DEFAULTS_DEFAULT TRUE) # Set to false if desired -INCLUDE("cmake/XSDKDefaults.cmake") -INCLUDE(CTest) +#INCLUDE("cmake/XSDKDefaults.cmake") ###################################################################### +include(CTest) +include(CheckLanguage) + ###################################################################### # # Usual initialization stuff # ###################################################################### + set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) ## ???? set(CMAKE_INSTALL_NAME_DIR "${CMAKE_INSTALL_PREFIX}/lib") -# set the position independent code property on all targets, so that -# -fPIC is added in compiler flag. 
-set(CMAKE_POSITION_INDEPENDENT_CODE ON) - -find_package(CUDA REQUIRED) -if (CUDA_FOUND) - if (NOT CMAKE_CUDA_FLAGS) - cuda_select_nvcc_arch_flags(CUDA_ARCH_FLAGS Auto) - endif() - set(CUDA_NVCC_FLAGS_RELEASE "-O3 --expt-relaxed-constexpr -DNDEBUG ${CMAKE_CUDA_FLAGS} ${CUDA_ARCH_FLAGS}") - set(CUDA_NVCC_FLAGS_DEBUG "-O0 --expt-relaxed-constexpr -DDEBUG -g ${CMAKE_CUDA_FLAGS} ${CUDA_ARCH_FLAGS}") -endif() - #---- For shared library @@ -87,7 +81,14 @@ SET(BUILD_STATIC_LIBS TRUE CACHE BOOL "Include static libs when building shared" if (BUILD_SHARED_LIBS) message("-- SuperLU_DIST will be built as a shared library.") + # set the position independent code property on all targets, so that + # -fPIC is added in compiler flag. + set(CMAKE_POSITION_INDEPENDENT_CODE ON) + set(PROJECT_NAME_LIB_EXPORT libsuperlu_dist.so) + SET(CMAKE_EXE_LINKER_FLAGS + "${CMAKE_EXE_LINKER_FLAGS} -Wl,-rpath,${CMAKE_INSTALL_PREFIX}/SRC") + if (BUILD_STATIC_LIBS) message("-- SuperLU_DIST will also be built as a static library.") endif() @@ -98,6 +99,8 @@ else() endif() set(CMAKE_CXX_STANDARD 11) +#set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD_REQUIRED ON) if (XSDK_ENABLE_Fortran) enable_language (Fortran) @@ -134,31 +137,70 @@ set(INSTALL_BIN_DIR "${default_install_bin_dir}" CACHE STRING "The folder where # Set up required compiler defines and options. ## get_directory_property( DirDefs COMPILE_DEFINITIONS ) -# set(CMAKE_C_FLAGS "-g -DDEBUGlevel=0 -DPRNTlevel=0 ${CMAKE_C_FLAGS}") -set(CMAKE_CXX_FLAGS "-g -std=c++11 ${CMAKE_CXX_FLAGS}") if(XSDK_INDEX_SIZE EQUAL 64) message("-- Using 64 bit integer for index size.") endif() set(CMAKE_C_FLAGS_RELEASE "-O3 -g" CACHE STRING "") -message("cmake_c_flags_release '${CMAKE_C_FLAGS_RELEASE}'") -message("cmake_shared_library_c_flags '${CMAKE_SHARED_LIBRARY_C_FLAGS}'") set(CMAKE_CXX_FLAGS_RELEASE "-O3 -g" CACHE STRING "") +set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0") +set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O0") + ###################################################################### # # Find packages # ###################################################################### # -set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0") -set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O0") #--------------------- MPI --------------------- -find_package(MPI) +find_package(MPI REQUIRED) if(MPI_C_FOUND) set(CMAKE_C_FLAGS "${MPI_C_COMPILE_FLAGS} ${CMAKE_C_FLAGS}") set(CMAKE_CXX_FLAGS "${MPI_CXX_COMPILE_FLAGS} ${CMAKE_CXX_FLAGS}") set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${MPI_C_LINK_FLAGS}" ) endif() +if (XSDK_ENABLE_Fortran) + if(MPI_Fortran_FOUND) + add_definitions(${MPI_Fortran_COMPILE_FLAGS}) + include_directories(${MPI_Fortran_INCLUDE_PATH}) + set(CMAKE_Fortran_FLAGS "${MPI_Fortran_COMPILE_FLAGS} ${CMAKE_Fortran_FLAGS}") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${MPI_Fortran_LINK_FLAGS}") + endif() +endif() +#---- CUDA libraries +if (TPL_ENABLE_CUDALIB) ## want to use cuda + check_language(CUDA) + if(CMAKE_CUDA_COMPILER) + message("-- Enabled support for CUDA.") + enable_language(CUDA) + find_package(CUDAToolkit REQUIRED) + + #message("toolkit include: '${CUDAToolkit_INCLUDE_DIRS}'") + set(CUDA_LIBRARIES "${CUDAToolkit_LIBRARY_ROOT}/lib64/libcudart.so") + set(CUDA_CUBLAS_LIBRARIES "${CUDAToolkit_LIBRARY_ROOT}/lib64/libcublas.so") + + # The following make.inc exporting does not work + set(CUDA_LIB CUDA::cudart CUDA::cublas CUDA::cusolver) +# fix up CUDA library names + string (REPLACE ";" " " CUDA_LIB_STR "${CUDA_LIB}") + 
set(CUDA_LIB_EXPORT ${CUDA_LIB_STR}) + set(HAVE_CUDA TRUE) + # set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DHAVE_CUDA") + # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DHAVE_CUDA") + else() + message("-- CUDA libraries not found.") + endif() +endif() + +#find_package(CUDA REQUIRED) +#if (CUDA_FOUND) +# if (NOT CMAKE_CUDA_FLAGS) +# cuda_select_nvcc_arch_flags(CUDA_ARCH_FLAGS Auto) +# endif() +####### set(CUDA_NVCC_FLAGS_RELEASE "-O3 --expt-relaxed-constexpr -DNDEBUG ${CMAKE_#CUDA_FLAGS} ${CUDA_ARCH_FLAGS}") +## set(CUDA_NVCC_FLAGS_DEBUG "-O0 --expt-relaxed-constexpr -DDEBUG -g ${CMAKE_CUDA_#FLAGS} ${CUDA_ARCH_FLAGS}") +#endif() + #--------------------- OpenMP --------------------- if (NOT DEFINED enable_openmp) set(enable_openmp TRUE) @@ -177,7 +219,7 @@ if (enable_openmp) endif() endif() #--------------------- BLAS --------------------- -if(NOT TPL_ENABLE_BLASLIB) +if(NOT TPL_ENABLE_INTERNAL_BLASLIB) # set(TPL_BLAS_LIBRARIES "" CACHE FILEPATH # "Override of list of absolute path to libs for BLAS.") if(TPL_BLAS_LIBRARIES) @@ -194,6 +236,7 @@ endif() if(BLAS_FOUND) message("-- Using TPL_BLAS_LIBRARIES='${TPL_BLAS_LIBRARIES}'") set(CMAKE_C_FLAGS "-DUSE_VENDOR_BLAS ${CMAKE_C_FLAGS}") + set(CMAKE_CUDA_FLAGS "-DUSE_VENDOR_BLAS ${CMAKE_CUDA_FLAGS}") set(BLAS_LIB ${TPL_BLAS_LIBRARIES}) # fix up BLAS library name string (REPLACE ";" " " BLAS_LIB_STR "${BLAS_LIB}") @@ -203,7 +246,6 @@ else() add_subdirectory(CBLAS) set(BLAS_LIB blas) if (BUILD_SHARED_LIBS) # export to be referenced by downstream makefile -# set(BLAS_LIB_EXPORT ${CMAKE_INSTALL_PREFIX}/CBLAS/libblas.so) set(BLAS_LIB_EXPORT ${CMAKE_INSTALL_PREFIX}/${INSTALL_LIB_DIR}/libblas.so) else() set(BLAS_LIB_EXPORT ${CMAKE_INSTALL_PREFIX}/${INSTALL_LIB_DIR}/libblas.a) @@ -276,6 +318,26 @@ if(PARMETIS_FOUND) set(HAVE_PARMETIS TRUE) endif() +#---------------------- Additional C linker library --------- +SET(_c_libs ${CMAKE_C_IMPLICIT_LINK_LIBRARIES}) +FOREACH(_lib ${_c_libs}) + set(EXTRA_LIB "-l${_lib} ${EXTRA_LIB}") +ENDFOREACH() +string (REPLACE ";" " " EXTRA_LIB_STR "${EXTRA_LIB}") +set(EXTRA_LIB_EXPORT ${EXTRA_LIB_STR}) +message("-- EXTRA_LIB_EXPORT='${EXTRA_LIB_EXPORT}'") + +#---------------------- Additional Fortran linker library --------- +if (XSDK_ENABLE_Fortran) + SET(_f_libs ${CMAKE_Fortran_IMPLICIT_LINK_LIBRARIES}) + FOREACH(_lib ${_f_libs}) + set(EXTRA_FLIB "${EXTRA_FLIB} -l${_lib}") + ENDFOREACH() + string (REPLACE ";" " " EXTRA_FLIB_STR "${EXTRA_FLIB}") + set(EXTRA_FLIB_EXPORT ${EXTRA_FLIB_STR}) + message("-- EXTRA_FLIB_EXPORT='${EXTRA_FLIB_EXPORT}'") +endif() + #--------------------- CombBLAS --------------------- if (TPL_ENABLE_COMBBLASLIB) ## want to use CombBLAS if (NOT TPL_COMBBLAS_LIBRARIES) @@ -311,6 +373,19 @@ else() set(LOADER $(CC)) endif() +###################################################################### +# +# Fortran-C name mangling +# +###################################################################### +if (XSDK_ENABLE_Fortran) + include(FortranCInterface) + FortranCInterface_HEADER(${SuperLU_DIST_SOURCE_DIR}/SRC/superlu_FortranCInterface.h + MACRO_NAMESPACE "FC_") + FortranCInterface_VERIFY(CXX) + SET(MPI_Fortran_LINK_FLAGS "${CMAKE_EXE_LINKER_FLAGS}") + add_subdirectory(FORTRAN) +endif() ###################################################################### # @@ -351,6 +426,9 @@ if(enable_examples) add_subdirectory(EXAMPLE) endif() +# superlu_dist uses c++11. 
PUBLIC means that the other codes linking to it need c++11 +#target_compile_features(SuperLU_DIST PUBLIC cxx_std_11) + # configure_file(${CMAKE_SOURCE_DIR}/make.inc.in ${CMAKE_BINARY_DIR}/make.inc) configure_file(${SuperLU_DIST_SOURCE_DIR}/make.inc.in ${SuperLU_DIST_SOURCE_DIR}/make.inc) configure_file(${SuperLU_DIST_SOURCE_DIR}/SRC/superlu_dist_config.h.in ${SuperLU_DIST_BINARY_DIR}/SRC/superlu_dist_config.h) diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt index 916ce8df..393517ef 100644 --- a/SRC/CMakeLists.txt +++ b/SRC/CMakeLists.txt @@ -175,6 +175,7 @@ endif() add_library(superlu_dist ${sources} ${HEADERS}) set(targets superlu_dist) + if (BUILD_SHARED_LIBS AND BUILD_STATIC_LIBS) # build both shared and static libs add_library(superlu_dist-static STATIC ${sources} ${HEADERS}) @@ -206,6 +207,11 @@ foreach(target ${targets}) endif() endforeach(target) +# Add CUDA runtime library and CUBLAS library +if(CUDAToolkit_FOUND) # this is found in top-level CMakeLists.txt + target_link_libraries(superlu_dist CUDA::cudart CUDA::cublas) +endif() + target_compile_definitions(superlu_dist PRIVATE SUPERLU_DIST_EXPORTS) if(MSVC AND BUILD_SHARED_LIBS) set_target_properties(superlu_dist PROPERTIES diff --git a/SRC/superlu_dist_config.h b/SRC/superlu_dist_config.h index 2416fef5..8dcabe22 100644 --- a/SRC/superlu_dist_config.h +++ b/SRC/superlu_dist_config.h @@ -1,5 +1,8 @@ /* superlu_dist_config.h.in */ +/* Enable CUDA */ +#define HAVE_CUDA TRUE + /* Enable parmetis */ #define HAVE_PARMETIS TRUE diff --git a/SRC/superlu_dist_config.h.in b/SRC/superlu_dist_config.h.in index b8529d6c..7020e74d 100644 --- a/SRC/superlu_dist_config.h.in +++ b/SRC/superlu_dist_config.h.in @@ -1,5 +1,8 @@ /* superlu_dist_config.h.in */ +/* Enable CUDA */ +#cmakedefine HAVE_CUDA @HAVE_CUDA@ + /* Enable parmetis */ #cmakedefine HAVE_PARMETIS @HAVE_PARMETIS@ diff --git a/SRC/superlu_gpu.cu b/SRC/superlu_gpu.cu index d2450484..bec314e6 100644 --- a/SRC/superlu_gpu.cu +++ b/SRC/superlu_gpu.cu @@ -6,9 +6,11 @@ #include #include #include + #undef Reduce -// #include -#include "cub/cub.cuh" +//#include "cub/cub.cuh" +#include + #include "lustruct_gpu.h" // #include "p3dcomm.h" // #include "mkl_cblas.h" diff --git a/make.inc.in b/make.inc.in index 115b2396..9c5eeaa8 100644 --- a/make.inc.in +++ b/make.inc.in @@ -9,7 +9,7 @@ # Creation date: March 1, 2016 version 5.0.0 # # Modified: October 13, 2017 version 5.2.1 -# +# February 20, 2021 version 7.0.0 # ############################################################################ # @@ -20,12 +20,21 @@ SuperLUroot = ${CMAKE_INSTALL_PREFIX} DSUPERLULIB = $(SuperLUroot)/@CMAKE_INSTALL_LIBDIR@/${PROJECT_NAME_LIB_EXPORT} INCLUDEDIR = $(SuperLUroot)/@CMAKE_INSTALL_INCLUDEDIR@ -LIBS = $(DSUPERLULIB) ${BLAS_LIB_EXPORT} -BLASLIB = ${BLAS_LIB_EXPORT} +XSDK_INDEX_SIZE = @XSDK_INDEX_SIZE@ +SLU_HAVE_LAPACK = @SLU_HAVE_LAPACK@ +HAVE_PARMETIS = @HAVE_PARMETIS@ +HAVE_COMBBLAS = @HAVE_COMBBLAS@ +HAVE_CUDA = @HAVE_CUDA@ + +LIBS = $(DSUPERLULIB) ${BLAS_LIB_EXPORT} -lm LIBS += ${LAPACK_LIB_EXPORT} LIBS += ${PARMETIS_LIB_EXPORT} LIBS += ${COMBBLAS_LIB_EXPORT} +#LIBS += ${EXTRA_FLIB_EXPORT} +CUDALIBS = ${CUDA_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES} #${CUDA_LIB_EXPORT} +LIBS += ${CUDALIBS} + # # The archiver and the flag(s) to use when building archive (library) # If your system has no ranlib, set RANLIB = echo. 
@@ -41,12 +50,11 @@ CFLAGS = @CMAKE_C_FLAGS_RELEASE@ @CMAKE_C_FLAGS@ ${SHARED_C_FLAGS_EXPORT} # CFLAGS += @COMPILE_DEFINITIONS@ CXX = @CMAKE_CXX_COMPILER@ CXXFLAGS = @CMAKE_CXX_FLAGS_RELEASE@ @CMAKE_CXX_FLAGS@ -XSDK_INDEX_SIZE=@XSDK_INDEX_SIZE@ -SLU_HAVE_LAPACK=@SLU_HAVE_LAPACK@ -HAVE_PARMETIS=@HAVE_PARMETIS@ -HAVE_COMBBLAS=@HAVE_COMBBLAS@ +NVCC = @CMAKE_CUDA_COMPILER@ +NVCCFLAGS = @CMAKE_CUDA_FLAGS@ + NOOPTS = -O0 FORTRAN = @CMAKE_Fortran_COMPILER@ LOADER = @CMAKE_CXX_COMPILER@ -LOADOPTS = @CMAKE_EXE_LINKER_FLAGS@ +LOADOPTS = @CMAKE_EXE_LINKER_FLAGS@ @CMAKE_CXX_LINK_FLAGS@ From 434e03b21c4241ccfbf17bf0d608fc6e09f383e3 Mon Sep 17 00:00:00 2001 From: Piyush Sao Date: Tue, 23 Feb 2021 20:22:34 -0500 Subject: [PATCH 065/147] changing cblas_daxpy to superlu_daxpy --- SRC/superlu_gpu.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/SRC/superlu_gpu.cu b/SRC/superlu_gpu.cu index bec314e6..8801a447 100644 --- a/SRC/superlu_gpu.cu +++ b/SRC/superlu_gpu.cu @@ -13,7 +13,7 @@ #include "lustruct_gpu.h" // #include "p3dcomm.h" -// #include "mkl_cblas.h" + extern "C" { void cblas_daxpy(const int N, const double alpha, const double *X, @@ -1145,7 +1145,7 @@ int_t reduceGPUlu( int_t llen = ksup_size * len; double alpha = 1; - cblas_daxpy (llen, alpha, A_gpu->acc_L_buff, 1, nzval_host, 1); + superlu_daxpy (llen, alpha, A_gpu->acc_L_buff, 1, nzval_host, 1); } } @@ -1165,7 +1165,7 @@ int_t reduceGPUlu( nzval_host = Unzval_br_ptr[kijb]; double alpha = 1; - cblas_daxpy (len, alpha, A_gpu->acc_U_buff, 1, nzval_host, 1); + superlu_daxpy (len, alpha, A_gpu->acc_U_buff, 1, nzval_host, 1); } } From bba5a3a3c8236c16287941a7844a771c73e5eefd Mon Sep 17 00:00:00 2001 From: piyush Date: Fri, 5 Mar 2021 10:37:08 -0500 Subject: [PATCH 066/147] version-working on summit --- CMakeLists.txt | 45 ++++++++++++++++++++++++--------------------- SRC/superlu_gpu.cu | 4 ++-- 2 files changed, 26 insertions(+), 23 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 76f838a9..e8afca1c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -19,7 +19,7 @@ list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") # Set up options option(enable_doc "Build doxygen documentation" OFF) option(enable_double "Enable double precision library" ON) -option(enable_complex16 "Enable complex16 precision library" ON) +option(enable_complex16 "Enable complex16 precision library" OFF) option(enable_tests "Build tests" ON) option(enable_examples "Build examples" ON) #-- BLAS @@ -173,18 +173,27 @@ if (TPL_ENABLE_CUDALIB) ## want to use cuda if(CMAKE_CUDA_COMPILER) message("-- Enabled support for CUDA.") enable_language(CUDA) - find_package(CUDAToolkit REQUIRED) - - #message("toolkit include: '${CUDAToolkit_INCLUDE_DIRS}'") - set(CUDA_LIBRARIES "${CUDAToolkit_LIBRARY_ROOT}/lib64/libcudart.so") - set(CUDA_CUBLAS_LIBRARIES "${CUDAToolkit_LIBRARY_ROOT}/lib64/libcublas.so") - - # The following make.inc exporting does not work - set(CUDA_LIB CUDA::cudart CUDA::cublas CUDA::cusolver) -# fix up CUDA library names - string (REPLACE ";" " " CUDA_LIB_STR "${CUDA_LIB}") - set(CUDA_LIB_EXPORT ${CUDA_LIB_STR}) - set(HAVE_CUDA TRUE) + find_package(CUDA REQUIRED) + if (CUDA_FOUND) + if (NOT CMAKE_CUDA_FLAGS) + cuda_select_nvcc_arch_flags(CUDA_ARCH_FLAGS Auto) + endif() + ###### set(CUDA_NVCC_FLAGS_RELEASE "-O3 --expt-relaxed-constexpr -DNDEBUG ${CMAKE_#CUDA_FLAGS} ${CUDA_ARCH_FLAGS}") + # set(CUDA_NVCC_FLAGS_DEBUG "-O0 --expt-relaxed-constexpr -DDEBUG -g ${CMAKE_CUDA_#FLAGS} ${CUDA_ARCH_FLAGS}") + endif() +# find_package(CUDAToolkit 
REQUIRED)
+
+    message("cuda libraries at : '${CUDA_LIBRARIES}'")
+#    find_package(CUB REQUIRED)
+#    set(CUDA_LIBRARIES "${CUDAToolkit_LIBRARY_ROOT}/lib64/libcudart.so")
+#    set(CUDA_CUBLAS_LIBRARIES "${CUDAToolkit_LIBRARY_ROOT}/lib64/libcublas.so")
+
+# # The following make.inc exporting does not work
+#    set(CUDA_LIB CUDA::cudart CUDA::cublas CUDA::cusolver)
+# # fix up CUDA library names
+#    string (REPLACE ";" " " CUDA_LIB_STR "${CUDA_LIB}")
+#    set(CUDA_LIB_EXPORT ${CUDA_LIB_STR})
+#    set(HAVE_CUDA TRUE)
 #    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DHAVE_CUDA")
 #    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -DHAVE_CUDA")
   else()
@@ -192,14 +201,7 @@ if (TPL_ENABLE_CUDALIB)  ## want to use cuda
   endif()
 endif()
 
-#find_package(CUDA REQUIRED)
-#if (CUDA_FOUND)
-#  if (NOT CMAKE_CUDA_FLAGS)
-#    cuda_select_nvcc_arch_flags(CUDA_ARCH_FLAGS Auto)
-#  endif()
-####### set(CUDA_NVCC_FLAGS_RELEASE "-O3 --expt-relaxed-constexpr -DNDEBUG ${CMAKE_#CUDA_FLAGS} ${CUDA_ARCH_FLAGS}")
-## set(CUDA_NVCC_FLAGS_DEBUG "-O0 --expt-relaxed-constexpr -DDEBUG -g ${CMAKE_CUDA_#FLAGS} ${CUDA_ARCH_FLAGS}")
-#endif()
+
 
 #--------------------- OpenMP ---------------------
 if (NOT DEFINED enable_openmp)
   set(enable_openmp TRUE)
@@ -443,3 +445,4 @@ endif()
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/superlu_dist.pc.in ${CMAKE_CURRENT_BINARY_DIR}/superlu_dist.pc @ONLY)
 install(FILES ${CMAKE_CURRENT_BINARY_DIR}/superlu_dist.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
+
diff --git a/SRC/superlu_gpu.cu b/SRC/superlu_gpu.cu
index 8801a447..3ab1a39e 100644
--- a/SRC/superlu_gpu.cu
+++ b/SRC/superlu_gpu.cu
@@ -8,8 +8,8 @@
 #include 
 #undef Reduce
-//#include "cub/cub.cuh"
-#include <cub/cub.cuh>
+#include "cub/cub.cuh"
+//#include <cub/cub.cuh>
 
 #include "lustruct_gpu.h"
 // #include "p3dcomm.h"

From a90008761c0bd9f1197684e9e3af947be4aa3251 Mon Sep 17 00:00:00 2001
From: Xiaoye Li
Date: Thu, 11 Mar 2021 12:32:36 -0500
Subject: [PATCH 067/147] Start adding complex.

---
 CMakeLists.txt            | 1 -
 SRC/superlu_dist_config.h | 2 +-
 SRC/superlu_gpu.cu        | 1 +
 3 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e8afca1c..8c7b1165 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -202,7 +202,6 @@ if (TPL_ENABLE_CUDALIB)  ## want to use cuda
   endif()
 endif()
-
 
 #--------------------- OpenMP ---------------------
 if (NOT DEFINED enable_openmp)
   set(enable_openmp TRUE)
diff --git a/SRC/superlu_dist_config.h b/SRC/superlu_dist_config.h
index 8dcabe22..ffd061a2 100644
--- a/SRC/superlu_dist_config.h
+++ b/SRC/superlu_dist_config.h
@@ -1,7 +1,7 @@
 /* superlu_dist_config.h.in */
 
 /* Enable CUDA */
-#define HAVE_CUDA TRUE
+/* #undef HAVE_CUDA */
 
 /* Enable parmetis */
 #define HAVE_PARMETIS TRUE
diff --git a/SRC/superlu_gpu.cu b/SRC/superlu_gpu.cu
index 3ab1a39e..173560c8 100644
--- a/SRC/superlu_gpu.cu
+++ b/SRC/superlu_gpu.cu
@@ -14,6 +14,7 @@
 #include "lustruct_gpu.h"
 // #include "p3dcomm.h"
+#include "dcomplex.h"
 
 extern "C" {
 	void cblas_daxpy(const int N, const double alpha, const double *X,

From d0fa4796a8db17ca09f419c2d92bac086eaf42af Mon Sep 17 00:00:00 2001
From: liuyangzhuan
Date: Sun, 21 Mar 2021 01:02:03 -0700
Subject: [PATCH 068/147] minor change

---
 EXAMPLE/pddrive_spawn.c | 3 ++-
 EXAMPLE/pzdrive_spawn.c | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/EXAMPLE/pddrive_spawn.c b/EXAMPLE/pddrive_spawn.c
index 47b04729..3bcd9028 100755
--- a/EXAMPLE/pddrive_spawn.c
+++ b/EXAMPLE/pddrive_spawn.c
@@ -302,6 +302,7 @@ int main(int argc, char *argv[])
        RELEASE THE SUPERLU PROCESS GRID.
------------------------------------------------------------*/ out: +if(parent!=MPI_COMM_NULL) MPI_Reduce(result, MPI_BOTTOM, 2, MPI_FLOAT,MPI_MAX, 0, parent); superlu_gridexit(&grid); @@ -309,7 +310,7 @@ int main(int argc, char *argv[]) TERMINATES THE MPI EXECUTION ENVIRONMENT. ------------------------------------------------------------*/ - + if(parent!=MPI_COMM_NULL) MPI_Comm_disconnect(&parent); MPI_Finalize(); diff --git a/EXAMPLE/pzdrive_spawn.c b/EXAMPLE/pzdrive_spawn.c index faf725c6..0704cbad 100755 --- a/EXAMPLE/pzdrive_spawn.c +++ b/EXAMPLE/pzdrive_spawn.c @@ -305,6 +305,7 @@ int main(int argc, char *argv[]) RELEASE THE SUPERLU PROCESS GRID. ------------------------------------------------------------*/ out: + if(parent!=MPI_COMM_NULL) MPI_Reduce(result, MPI_BOTTOM, 2, MPI_FLOAT,MPI_MAX, 0, parent); superlu_gridexit(&grid); @@ -312,7 +313,7 @@ int main(int argc, char *argv[]) TERMINATES THE MPI EXECUTION ENVIRONMENT. ------------------------------------------------------------*/ - + if(parent!=MPI_COMM_NULL) MPI_Comm_disconnect(&parent); MPI_Finalize(); From 294290ef803e8489d7b0aac63673e1a87e14e0ad Mon Sep 17 00:00:00 2001 From: liuyangzhuan Date: Sun, 21 Mar 2021 22:03:32 -0700 Subject: [PATCH 069/147] minor change --- EXAMPLE/pddrive_spawn.c | 2 +- EXAMPLE/pzdrive_spawn.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/EXAMPLE/pddrive_spawn.c b/EXAMPLE/pddrive_spawn.c index 3bcd9028..990a62f9 100755 --- a/EXAMPLE/pddrive_spawn.c +++ b/EXAMPLE/pddrive_spawn.c @@ -265,7 +265,7 @@ int main(int argc, char *argv[]) result[1] = total * 1e-6; if (!iam) { printf("returning data:\n" - " Factor time : %8.2f | Total MEM : %8.2f\n", + " Factor time : %8.2f\n Total MEM : %8.2f\n", stat.utime[FACT], total * 1e-6); printf("**************************************************\n"); fflush(stdout); diff --git a/EXAMPLE/pzdrive_spawn.c b/EXAMPLE/pzdrive_spawn.c index 0704cbad..3e87b598 100755 --- a/EXAMPLE/pzdrive_spawn.c +++ b/EXAMPLE/pzdrive_spawn.c @@ -266,7 +266,7 @@ int main(int argc, char *argv[]) // result[1] = total * 1e-6; if (!iam) { printf("returning data:\n" - " Factor time : %8.2f | Total MEM : %8.2f\n", + " Factor time : %8.2f\n Total MEM : %8.2f\n", stat.utime[FACT], total * 1e-6); printf(" Solve time : %8.2f \n", stat.utime[SOLVE]); From 458350ccc373f65003d00233ec20e59eefe3bcb3 Mon Sep 17 00:00:00 2001 From: Xiaoye Li Date: Thu, 1 Apr 2021 15:20:25 -0400 Subject: [PATCH 070/147] All the complex codes are compiled. Next step is testing. Double-prec code still working. 
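Note on the porting pattern: each new complex (z) file mirrors its
double-precision (d) counterpart, with the scalar type switched from
double to the doublecomplex struct in SRC/dcomplex.h and the arithmetic
expanded accordingly. A minimal sketch of that d->z cloning convention
(the *example_axpy names below are illustrative placeholders, not
routines added by this patch):

    #include "dcomplex.h"  /* typedef struct { double r, i; } doublecomplex; */

    /* double-precision form, as in the d-prefixed files */
    void dexample_axpy(int n, double a, const double *x, double *y)
    {
        for (int i = 0; i < n; ++i)
            y[i] += a * x[i];
    }

    /* complex clone, as in the z-prefixed files: same control flow,
     * with the multiply-accumulate expanded for doublecomplex */
    void zexample_axpy(int n, doublecomplex a, const doublecomplex *x,
                       doublecomplex *y)
    {
        for (int i = 0; i < n; ++i) {
            y[i].r += a.r * x[i].r - a.i * x[i].i;
            y[i].i += a.r * x[i].i + a.i * x[i].r;
        }
    }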
---
 CMakeLists.txt             |     7 +-
 EXAMPLE/big.rua            | 11496 -----------------------------------
 EXAMPLE/pddrive_spawn.c    |     2 +-
 EXAMPLE/pzdrive_spawn.c    |     2 +-
 SRC/CMakeLists.txt         |    10 +-
 SRC/dscatter3d.c           |    84 +-
 SRC/dtreeFactorization.c   |    33 +-
 SRC/dtrfAux.c              |    11 +-
 SRC/lustruct_gpu.h         |   258 -
 SRC/memory.c               |     4 +-
 SRC/pd3dcomm.c             |  1327 ++--
 SRC/pdgssvx3d.c            |  2168 +++---
 SRC/pdgstrf.c              |    77 +-
 SRC/pdgstrf3d.c            |    73 +-
 SRC/psymbfact.c            |     4 +
 SRC/pz3dcomm.c             |    46 +-
 SRC/pzgssvx3d.c            |    84 +-
 SRC/pzgstrf.c              |    77 +-
 SRC/pzgstrf3d.c            |    51 +-
 SRC/sp_ienv.c              |     2 +-
 SRC/superlu_ddefs.h        |   529 +-
 SRC/superlu_defs.h         |     6 +
 SRC/superlu_dist_config.h  |     2 +-
 SRC/superlu_gpu.cu         |  1877 ------
 SRC/superlu_zdefs.h        |    89 +-
 SRC/treeFactorizationGPU.c |     6 +-
 SRC/util.c                 |   316 +-
 SRC/util_dist.h            |     1 +
 SRC/zscatter3d.c           |   113 +-
 SRC/ztreeFactorization.c   |    32 +-
 SRC/ztrfAux.c              |    18 +-
 31 files changed, 2379 insertions(+), 16426 deletions(-)
 delete mode 100644 EXAMPLE/big.rua
 delete mode 100644 SRC/lustruct_gpu.h
 delete mode 100644 SRC/superlu_gpu.cu

diff --git a/CMakeLists.txt b/CMakeLists.txt
index dbc88e62..eb99b3c1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -182,8 +182,11 @@ if (TPL_ENABLE_CUDALIB)  ## want to use cuda
     if (NOT CMAKE_CUDA_FLAGS)
       cuda_select_nvcc_arch_flags(CUDA_ARCH_FLAGS Auto)
     endif()
-      ###### set(CUDA_NVCC_FLAGS_RELEASE "-O3 --expt-relaxed-constexpr -DNDEBUG ${CMAKE_#CUDA_FLAGS} ${CUDA_ARCH_FLAGS}")
-      # set(CUDA_NVCC_FLAGS_DEBUG "-O0 --expt-relaxed-constexpr -DDEBUG -g ${CMAKE_CUDA_#FLAGS} ${CUDA_ARCH_FLAGS}")
+    set(HAVE_CUDA TRUE)
+    set(CMAKE_CUDA_FLAGS_RELEASE "-O3 --expt-relaxed-constexpr -DNDEBUG" CACHE STRING "")
+    set(CMAKE_CUDA_FLAGS_DDEBUG "-O0 --expt-relaxed-constexpr -DDEBUG -g" CACHE STRING "")
+#    set(CUDA_NVCC_FLAGS_RELEASE "-O3 --expt-relaxed-constexpr -DNDEBUG ${CMAKE_CUDA_FLAGS} ${CUDA_ARCH_FLAGS}")
+#    set(CUDA_NVCC_FLAGS_DEBUG "-O0 --expt-relaxed-constexpr -DDEBUG -g ${CMAKE_CUDA_FLAGS} ${CUDA_ARCH_FLAGS}")
   endif()
 #    find_package(CUDAToolkit REQUIRED)

diff --git a/EXAMPLE/big.rua b/EXAMPLE/big.rua
deleted file mode 100644
index 3a5c16f7..00000000
--- a/EXAMPLE/big.rua
+++ /dev/null
@@ -1,11496 +0,0 @@
-32-bit adder, from Steve Hamm (Motorola)  hamm@austoto.sps.mot.com    add32
-        11491     382    1493    7962    1654
-RUA     4960    4960   23884       0
-(13i6)          (16i5)          (3e26.18)           (3e26.18)
-F                1            0
[... remaining 11,491 deleted lines of Harwell-Boeing matrix data (column pointers, row indices, and values for the 4960-by-4960 add32 matrix) omitted ...]
4251 4252 4253 590 634 635 - 637 639 640 2267 2268 2270 2271 4247 4248 4249 4250 639 641 4250 4251 642 - 645 2273 2277 2278 4258 4259 4275 4276 643 644 2276 2277 2293 2294 4255 4256 - 4257 4258 643 644 646 2281 2282 2284 2293 4255 4263 4264 642 645 4258 4275 - 629 644 646 648 2279 2280 2283 2284 4261 4262 4263 4266 647 2281 2284 646 - 648 649 2283 2284 2285 4265 4266 4267 648 649 650 2285 2286 2288 2289 4267 - 4268 4269 4271 649 650 4269 4270 4271 651 2289 2290 2291 4271 4272 4273 535 - 652 653 655 686 690 691 698 701 702 2295 2371 2372 2381 2382 2393 2394 - 4279 4369 4370 4373 4374 4401 4402 652 653 655 656 2295 2296 2299 4279 4280 - 4281 4284 654 2297 2300 535 621 652 653 655 656 659 660 2297 2298 2299 - 2300 2307 4281 4282 4291 653 655 656 657 2299 2300 2301 4283 4284 4285 656 - 657 2301 2302 2304 2305 4285 4286 4287 658 4288 4289 621 655 659 660 2307 - 2308 2310 2311 4291 4292 4339 621 655 659 660 667 668 670 677 2311 2312 - 2323 2324 2327 2328 2341 4293 4294 4303 4304 4315 4316 4323 4339 4340 661 4293 - 4340 662 2313 4295 663 665 666 2313 2314 2317 4295 4296 4297 4300 664 2315 - 2318 663 665 666 668 671 2315 2316 2317 2318 2321 4297 4298 4301 663 665 - 666 671 2317 2318 2329 4299 4300 4309 621 660 667 668 669 2319 2325 2326 - 4303 4304 4305 4306 4307 621 660 665 667 668 670 671 2321 2322 2324 2325 - 4301 4302 4303 667 669 4304 4305 621 660 668 670 671 672 2327 2333 2334 - 4312 4313 4314 4315 621 665 666 668 670 671 2329 2330 2332 2333 4309 4310 - 4311 4312 670 672 4312 4313 673 674 2335 2339 2340 4321 4337 4338 673 674 - 675 676 2338 2339 2355 2356 4317 4318 4319 4337 674 675 677 2343 2344 2346 - 2355 4317 4325 4326 674 676 4319 4320 4337 660 675 677 679 2341 2342 2345 - 2346 4323 4324 4325 4328 678 2343 2346 677 679 680 2345 2346 2347 4327 4328 - 4329 679 680 681 682 2347 2348 2350 2351 4329 4330 4331 4333 680 681 4331 - 4332 4333 680 682 2351 2352 2353 4333 4334 4335 683 684 686 717 721 722 - 729 730 732 733 2357 2433 2434 2443 2444 2455 2456 4341 4431 4432 4435 4436 - 4463 4464 683 684 686 687 2357 2358 2361 4341 4342 4343 4346 685 2359 2362 - 535 652 683 684 686 687 690 691 714 715 717 2359 2360 2361 2362 2369 - 4343 4344 4353 684 686 687 2361 2362 2363 4345 4346 4347 688 2363 2364 2366 - 2367 4347 4348 4349 689 4350 4351 652 686 690 691 2369 2370 2372 2373 4353 - 4354 4401 652 686 690 691 698 699 701 702 708 2373 2374 2385 2386 2389 - 2390 2403 4355 4356 4365 4366 4377 4378 4385 4401 4402 692 4355 4402 693 2375 - 4357 694 696 697 699 702 2375 2376 2379 2380 4357 4358 4359 4362 695 2377 - 2380 694 696 699 2377 2378 2380 2383 4359 4360 4363 694 697 702 2379 2380 - 2391 4361 4362 4371 652 691 698 699 700 702 2381 2387 2388 4366 4367 4368 - 4369 691 694 696 698 699 702 2380 2383 2384 2386 2387 4363 4364 4365 4366 - 698 700 4366 4367 652 691 701 702 703 2389 2395 2396 4374 4375 4376 4377 - 652 691 694 697 698 699 701 702 2380 2391 2392 2394 2395 4371 4372 4373 - 4374 701 703 4374 4375 704 2397 2401 2402 4383 4399 4400 705 706 707 2400 - 2401 2417 2418 4379 4380 4381 4399 705 706 708 2405 2406 2408 2417 4379 4387 - 4388 705 707 4381 4382 4399 691 706 708 710 2403 2404 2407 2408 4385 4386 - 4387 4390 709 2405 2408 708 710 711 2407 2408 2409 4389 4390 4391 710 711 - 2409 2410 2412 2413 4391 4392 4393 4394 712 713 4394 4395 712 713 2413 2414 - 2415 4394 4395 4396 4397 497 535 536 686 714 715 745 748 752 753 760 - 761 763 764 776 2419 2495 2496 2505 2506 2517 2518 4403 4493 4494 4497 4498 - 4525 4526 686 714 715 717 718 2419 2420 2423 4403 4404 4405 4408 716 2421 - 2424 683 686 715 717 718 
721 722 2421 2422 2423 2424 2431 4405 4406 4415 - 715 717 718 719 2423 2424 2425 4407 4408 4409 718 719 2425 2426 2428 2429 - 4409 4410 4411 720 4412 4413 683 717 721 722 2431 2432 2434 2435 4415 4416 - 4463 4464 683 717 721 722 729 730 732 733 739 2435 2436 2447 2448 2451 - 2452 2465 4417 4418 4427 4428 4439 4440 4447 4464 723 4417 4464 724 2437 4419 - 725 727 728 2437 2438 2441 2442 4419 4420 4421 4424 726 2439 2442 725 727 - 728 730 733 2439 2440 2442 2445 4421 4422 4425 725 727 728 733 2441 2442 - 2453 4423 4424 4433 683 722 729 730 731 2443 2449 2450 4427 4428 4429 4430 - 4431 683 722 727 729 730 733 2445 2446 2448 2449 4425 4426 4427 729 731 - 4428 4429 683 722 732 733 734 2451 2457 2458 4436 4437 4438 4439 683 722 - 727 728 730 732 733 2453 2454 2456 2457 4433 4434 4435 4436 732 734 4436 - 4437 735 736 2459 2463 2464 4445 4461 4462 735 736 737 738 2462 2463 2479 - 2480 4441 4442 4443 4461 736 737 739 2467 2468 2470 2479 4441 4449 4450 736 - 738 4443 4444 4461 722 737 739 741 2465 2466 2469 2470 4447 4448 4449 4452 - 740 2467 2470 739 741 742 2469 2470 2471 4451 4452 4453 741 742 743 744 - 2471 2472 2474 2475 4453 4454 4455 4457 742 743 4455 4456 4457 742 744 2475 - 2476 2477 4457 4458 4459 714 745 746 748 753 776 783 784 791 792 795 - 2481 2557 2558 2567 2568 2579 2580 4465 4552 4555 4556 4559 4560 4587 4588 745 - 746 748 749 2481 2482 2485 4465 4466 4467 4470 747 2483 2486 714 745 746 - 748 749 752 753 2483 2484 2485 2486 2493 4467 4468 4477 746 748 749 750 - 2485 2486 2487 4469 4470 4471 749 750 2487 2488 2490 2491 4471 4472 4473 751 - 4474 4475 714 748 752 753 2493 2494 2496 2497 4477 4478 4525 714 745 748 - 752 753 760 761 763 770 2497 2498 2509 2510 2513 2514 2527 4479 4480 4489 - 4490 4501 4502 4509 4525 4526 754 4479 4526 755 2499 4481 756 758 759 2499 - 2500 2503 4481 4482 4483 4486 757 2501 2504 756 758 759 761 764 2501 2502 - 2503 2504 2507 4483 4484 4487 756 758 759 764 2503 2504 2515 4485 4486 4495 - 714 753 760 761 762 2505 2511 2512 4490 4491 4492 4493 714 753 758 760 - 761 763 764 2507 2508 2510 2511 4487 4488 4489 4490 760 762 4490 4491 714 - 753 761 763 764 2513 2519 2520 4497 4498 4499 4500 4501 714 758 759 761 - 763 764 2515 2516 2518 2519 4495 4496 4497 765 4498 4499 766 2521 2525 2526 - 4507 4523 4524 767 769 2524 2525 2541 2542 4503 4504 4505 4523 768 770 2529 - 2530 2532 2541 4503 4511 4512 767 769 4505 4506 4523 753 768 770 772 2527 - 2528 2531 2532 4509 4510 4511 4514 771 2529 2532 770 772 773 2531 2532 2533 - 4513 4514 4515 772 773 774 2533 2534 2536 2537 4515 4516 4517 4519 773 774 - 4517 4518 4519 775 2537 2538 2539 4519 4520 4521 497 536 714 745 776 777 - 783 784 807 814 815 822 823 826 869 2543 2619 2620 2629 2630 2641 2642 - 4527 4614 4617 4618 4621 4622 4649 4650 776 777 779 780 783 2543 2544 2547 - 2548 4527 4528 4529 4532 778 2545 2548 777 779 783 2545 2546 2548 2555 4529 - 4530 4539 777 780 781 2547 2548 2549 4531 4532 4533 780 781 2549 2550 2552 - 2553 4533 4534 4535 782 4536 4537 745 776 777 779 783 784 2548 2555 2556 - 2558 2559 4539 4540 4587 4588 745 776 783 784 792 794 795 801 2559 2560 - 2571 2572 2575 2576 2589 4541 4542 4551 4552 4560 4563 4564 4571 4588 785 4541 - 4588 786 2561 4543 787 789 790 2561 2562 2565 4543 4544 4545 4548 788 2563 - 2566 787 789 790 792 795 2563 2564 2565 2566 2569 4545 4546 4549 787 789 - 790 795 2565 2566 2577 4547 4548 4557 745 791 792 793 2567 2573 2574 4552 - 4553 4554 4555 745 784 789 791 792 795 2569 2570 2572 2573 4549 4550 4551 - 4552 791 793 4552 4553 784 794 795 796 2575 2581 2582 4560 4561 4562 4563 - 
745 784 789 790 792 794 795 2577 2578 2580 2581 4557 4558 4559 4560 794 - 796 4560 4561 797 798 800 2583 2587 2588 4567 4568 4569 4585 4586 797 798 - 799 2586 2587 2603 2604 4565 4566 4567 798 799 801 2591 2592 2594 2603 4565 - 4573 4574 797 800 4568 4585 784 799 801 803 2589 2590 2593 2594 4571 4572 - 4573 4576 802 2591 2594 801 803 804 2593 2594 2595 4575 4576 4577 803 804 - 805 806 2595 2596 2598 2599 4577 4578 4579 804 805 806 4579 4580 4581 804 - 805 806 2599 2600 2601 4581 4582 4583 776 807 808 814 815 838 845 846 - 852 853 854 857 869 2605 2681 2682 2691 2692 2703 2704 4589 4676 4679 4680 - 4683 4684 4711 4712 807 808 810 811 814 2605 2606 2609 2610 4589 4590 4591 - 4594 809 2607 2610 808 810 814 2607 2608 2610 2617 4591 4592 4601 808 811 - 812 2609 2610 2611 4593 4594 4595 811 812 2611 2612 2614 2615 4595 4596 4597 - 813 4598 4599 776 807 808 810 814 815 2610 2617 2618 2620 2621 4601 4602 - 4649 4650 776 807 814 815 823 825 826 832 2621 2622 2633 2634 2637 2638 - 2651 4603 4604 4613 4614 4622 4625 4626 4633 4650 816 4603 4650 817 2623 4605 - 818 820 821 2623 2624 2627 4605 4606 4607 4610 819 2625 2628 818 820 821 - 823 826 2625 2626 2627 2628 2631 4607 4608 4611 818 820 821 826 2627 2628 - 2639 4609 4610 4619 776 822 823 2629 2635 2636 4614 4615 4616 4617 776 815 - 820 822 823 826 2631 2632 2634 2635 4611 4612 4613 4614 824 4614 4615 815 - 825 826 2637 2643 2644 4622 4623 4624 4625 776 815 820 821 823 825 826 - 2639 2640 2642 2643 4619 4620 4621 4622 827 4622 4623 828 829 831 2645 2649 - 2650 4631 4647 4648 828 829 830 831 2648 2649 2665 2666 4627 4628 4629 829 - 830 832 2653 2654 2656 2665 4627 4635 4636 828 829 831 4629 4630 4647 815 - 830 832 834 2651 2652 2655 2656 4633 4634 4635 4638 833 2653 2656 832 834 - 835 2655 2656 2657 4637 4638 4639 834 835 836 837 2657 2658 2660 2661 4639 - 4640 4641 835 836 837 4641 4642 4643 835 836 837 2661 2662 2663 4643 4644 - 4645 807 838 839 845 846 869 876 877 884 885 887 888 2667 2743 2744 - 2753 2754 2765 2766 4651 4738 4741 4742 4745 4746 4773 4774 838 839 841 842 - 845 2667 2668 2671 2672 4651 4652 4653 4656 840 2669 2672 839 841 845 2669 - 2670 2672 2679 4653 4654 4663 839 842 843 2671 2672 2673 4655 4656 4657 842 - 843 2673 2674 2676 2677 4657 4658 4659 844 4660 4661 807 838 839 841 845 - 846 2672 2679 2680 2682 2683 4663 4664 4711 4712 807 838 845 846 847 852 - 854 856 857 863 869 2683 2684 2695 2696 2699 2700 2713 4665 4666 4675 4676 - 4684 4687 4688 4695 4712 846 847 4665 4712 848 849 2685 4667 848 849 851 - 852 854 2685 2686 2689 2690 4667 4668 4669 4672 850 2687 2690 849 851 854 - 2687 2688 2690 2693 4669 4670 4673 807 846 849 852 854 857 2689 2690 2701 - 4671 4672 4681 807 853 854 2691 2697 2698 4676 4677 4678 4679 807 846 849 - 851 852 853 854 2690 2693 2694 2696 2697 4673 4674 4675 4676 855 4676 4677 - 846 856 857 858 2699 2705 2706 4684 4685 4686 4687 807 846 852 856 857 - 2701 2702 2704 2705 4681 4682 4683 4684 856 858 4684 4685 859 860 862 2707 - 2711 2712 4693 4709 4710 859 860 861 862 2710 2711 2727 2728 4689 4690 4691 - 860 861 863 2715 2716 2718 2727 4689 4697 4698 859 860 862 4691 4692 4709 - 846 861 863 865 2713 2714 2717 2718 4695 4696 4697 4700 864 2715 2718 863 - 865 866 2717 2718 2719 4699 4700 4701 865 866 868 2719 2720 2722 2723 4701 - 4702 4703 867 868 4704 4705 866 867 868 2723 2724 2725 4703 4704 4705 4706 - 4707 497 536 776 807 838 846 869 870 872 876 900 907 908 915 916 - 919 2729 2805 2806 2815 2816 2827 2828 4713 4800 4803 4804 4807 4808 4835 4836 - 869 870 872 873 2729 2730 2733 2734 4713 4714 4715 4718 871 
2731 2734 869 - 870 872 876 2731 2732 2734 2741 4715 4716 4725 870 873 874 2733 2734 2735 - 4717 4718 4719 873 874 2735 2736 2738 2739 4719 4720 4721 875 4722 4723 838 - 869 872 876 877 2741 2742 2744 2745 4725 4726 4773 4774 838 876 877 885 - 887 888 894 2745 2746 2757 2758 2761 2762 2775 4727 4728 4737 4738 4749 4750 - 4757 4774 878 4727 4774 879 880 2747 4729 879 880 882 883 2747 2748 2751 - 4729 4730 4731 4734 881 2749 2752 880 882 883 885 2749 2750 2751 2752 2755 - 4731 4732 4735 880 882 883 885 888 2751 2752 2763 4733 4734 4743 838 884 - 885 2753 2759 2760 4738 4739 4740 4741 838 877 882 883 884 885 888 2755 - 2756 2758 2759 4735 4736 4737 4738 886 4738 4739 838 877 887 888 2761 2767 - 2768 4746 4747 4748 4749 838 877 883 885 887 888 2763 2764 2766 2767 4743 - 4744 4745 4746 889 4746 4747 890 891 893 2769 2773 2774 4755 4771 4772 890 - 891 892 893 2772 2773 2789 2790 4751 4752 4753 891 892 894 2777 2778 2780 - 2789 4751 4759 4760 890 891 893 4753 4754 4771 877 892 894 896 2775 2776 - 2779 2780 4757 4758 4759 4762 895 2777 2780 894 896 897 2779 2780 2781 4761 - 4762 4763 896 897 898 899 2781 2782 2784 2785 4763 4764 4765 897 898 899 - 4765 4766 4767 897 898 899 2785 2786 2787 4767 4768 4769 869 900 901 907 - 908 934 938 939 946 947 950 2791 2867 2868 2877 2878 2889 2890 4775 4862 - 4865 4866 4869 4870 4897 4898 900 901 903 904 907 2791 2792 2795 2796 4775 - 4776 4777 4780 902 2793 2796 901 903 907 2793 2794 2796 2803 4777 4778 4787 - 901 904 905 2795 2796 2797 4779 4780 4781 904 905 2797 2798 2800 2801 4781 - 4782 4783 906 4784 4785 869 900 901 903 907 908 2796 2803 2804 2806 2807 - 4787 4788 4835 869 900 907 908 909 916 918 919 925 2807 2808 2819 2820 - 2823 2824 2837 4789 4790 4799 4800 4808 4811 4812 4819 4835 4836 908 909 4789 - 4836 910 911 2809 4791 910 911 913 914 2809 2810 2813 4791 4792 4793 4796 - 912 2811 2814 911 913 914 916 2811 2812 2813 2814 2817 4793 4794 4797 911 - 913 914 916 919 2813 2814 2825 4795 4796 4805 869 915 916 2815 2821 2822 - 4800 4801 4802 4803 869 908 913 914 915 916 919 2817 2818 2820 2821 4797 - 4798 4799 4800 917 4800 4801 908 918 919 920 2823 2829 2830 4808 4809 4810 - 4811 869 908 914 916 918 919 2825 2826 2828 2829 4805 4806 4807 4808 918 - 920 4808 4809 921 922 924 2831 2835 2836 4817 4833 4834 921 922 923 924 - 2834 2835 2851 2852 4813 4814 4815 922 923 925 2839 2840 2842 2851 4813 4821 - 4822 921 922 924 4815 4816 4833 908 923 925 927 2837 2838 2841 2842 4819 - 4820 4821 4824 926 2839 2842 925 927 928 2841 2842 2843 4823 4824 4825 927 - 928 930 2843 2844 2846 2847 4825 4826 4827 929 930 4828 4829 928 929 930 - 2847 2848 2849 4827 4828 4829 4830 4831 931 932 969 970 975 977 980 981 - 2853 2929 2930 2939 2940 2951 2952 4837 4927 4928 4931 4932 4959 4960 931 932 - 934 935 2853 2854 2857 2858 4837 4838 4839 4842 933 2855 2858 900 932 934 - 938 939 2855 2856 2858 2865 4839 4840 4849 932 935 936 2857 2858 2859 4841 - 4842 4843 935 936 2859 2860 2862 2863 4843 4844 4845 937 4846 4847 900 934 - 938 939 2865 2866 2868 2869 4849 4850 4897 900 934 938 939 940 947 949 - 950 956 2869 2870 2881 2882 2885 2886 2899 4851 4852 4861 4862 4870 4873 4874 - 4881 4897 4898 939 940 4851 4898 941 942 2871 4853 941 942 944 945 2871 - 2872 2875 4853 4854 4855 4858 943 2873 2876 942 944 945 947 950 2873 2874 - 2875 2876 2879 4855 4856 4859 942 944 945 950 2875 2876 2887 4857 4858 4867 - 900 946 947 2877 2883 2884 4862 4863 4864 4865 900 939 944 946 947 950 - 2879 2880 2882 2883 4859 4860 4861 4862 948 4862 4863 939 949 950 951 2885 - 2891 2892 4870 4871 4872 4873 900 939 
944 945 947 949 950 2887 2888 2890 - 2891 4867 4868 4869 4870 949 951 4870 4871 952 953 955 2893 2897 2898 4879 - 4895 4896 952 953 954 955 2896 2897 2913 2914 4875 4876 4877 953 954 956 - 2901 2902 2904 2913 4875 4883 4884 952 953 955 4877 4878 4895 939 954 956 - 958 2899 2900 2903 2904 4881 4882 4883 4886 957 2901 2904 956 958 959 2903 - 2904 2905 4885 4886 4887 958 959 961 2905 2906 2908 2909 4887 4888 4889 960 - 961 4890 4891 959 960 961 2909 2910 2911 4889 4890 4891 4892 4893 962 2915 - 4899 963 965 966 2915 2916 2919 4899 4900 4901 4904 964 2917 2920 963 965 - 966 969 2917 2918 2919 2920 2927 4901 4902 4911 963 965 966 967 2919 2920 - 2921 4903 4904 4905 966 967 2921 2922 2924 2925 4905 4906 4907 968 4908 4909 - 931 965 969 970 2927 2928 2930 2931 4911 4912 4959 931 969 970 975 977 - 978 980 981 987 2931 2932 2943 2944 2947 2948 2961 4913 4914 4923 4924 4935 - 4936 4943 4959 4960 971 4913 4960 972 2933 4915 973 975 976 2933 2934 2937 - 4915 4916 4917 4920 974 2935 2938 931 970 973 975 976 977 978 981 2935 - 2936 2937 2938 2941 4917 4918 4921 973 975 976 981 2937 2938 2949 4919 4920 - 4929 931 970 975 977 978 979 2939 2945 2946 4923 4924 4925 4926 4927 970 - 975 977 978 2941 2942 2944 2945 4921 4922 4923 977 979 4924 4925 931 970 - 980 981 982 2947 2953 2954 4931 4932 4933 4934 4935 931 970 975 976 980 - 981 2949 2950 2952 2953 4929 4930 4931 980 982 4932 4933 983 984 986 2955 - 2959 2960 4939 4940 4941 4957 4958 983 984 985 2958 2959 2975 2976 4937 4938 - 4939 984 985 987 2963 2964 2966 2975 4937 4945 4946 983 986 4940 4957 970 - 985 987 989 2961 2962 2965 2966 4943 4944 4945 4948 988 2963 2966 987 989 - 990 2965 2966 2967 4947 4948 4949 989 990 991 992 2967 2968 2970 2971 4949 - 4950 4951 990 991 992 4951 4952 4953 990 991 992 2971 2972 2973 4953 4954 - 4955 1 2 993 994 2 993 994 3 4 995 996 998 4 995 996 - 2 5 997 998 2 3 4 5 995 997 998 5 6 999 1000 2983 - 6 999 1000 1001 1002 6 1001 1002 6 1003 1004 1003 1004 4 8 1005 - 1006 8 1005 1006 1007 1008 8 1007 1008 8 9 1009 1010 9 1009 1010 - 11 12 1011 1012 2993 12 1011 1012 13 14 1013 1014 1016 14 1013 1014 - 12 14 15 1015 1016 13 14 15 1013 1015 1016 16 1017 1018 1017 1018 - 14 17 1019 1020 17 1019 1020 9 1021 1022 9 17 1021 1022 16 17 - 1023 1024 16 1023 1024 9 19 1025 1026 9 1025 1026 15 20 1027 1028 - 20 1027 1028 1029 1030 20 1029 1030 19 20 1031 1032 19 1031 1032 22 - 1033 1034 1033 1034 1035 1036 23 1035 1036 22 23 1037 1038 22 1037 1038 - 9 26 1039 1040 26 1039 1040 24 27 1041 1042 1044 24 1041 1042 26 - 28 1043 1044 24 26 27 28 1041 1043 1044 28 29 1045 1046 29 1045 - 1046 1047 1048 29 1047 1048 29 31 1049 1050 31 1049 1050 31 1051 1052 - 1051 1052 23 24 1053 1054 23 1053 1054 32 33 1055 1056 33 1055 1056 - 34 35 1057 1058 1060 35 1057 1058 33 35 36 1059 1060 34 35 36 - 1057 1059 1060 36 37 1061 1062 37 1061 1062 1063 1064 37 1063 1064 37 - 1065 1066 1065 1066 35 39 1067 1068 39 1067 1068 1 1069 1070 1 39 - 1069 1070 39 40 1071 1072 40 1071 1072 42 43 1073 1074 43 1073 1074 - 44 45 1075 1076 1078 45 1075 1076 43 46 1077 1078 43 44 45 46 - 48 1075 1077 1078 1 47 1079 1080 1 1079 1080 45 48 1081 1082 48 - 1081 1082 40 1083 1084 40 48 1083 1084 47 48 1085 1086 47 1085 1086 - 40 50 1087 1088 40 1087 1088 46 51 1089 1090 51 1089 1090 1 1091 - 1092 1 51 1091 1092 50 51 1093 1094 50 1093 1094 53 1095 1096 1095 - 1096 1097 1098 54 1097 1098 53 54 1099 1100 53 1099 1100 40 57 1101 - 1102 57 1101 1102 55 58 1103 1104 1106 55 1103 1104 57 59 1105 1106 - 55 57 58 59 1103 1105 1106 59 60 1107 1108 60 1107 1108 1109 1110 - 60 1109 1110 60 
62 1111 1112 62 1111 1112 62 1113 1114 1113 1114 54 - 55 1115 1116 54 1115 1116 63 64 1117 1118 64 1117 1118 65 66 1119 - 1120 1122 66 1119 1120 64 67 1121 1122 64 65 66 67 70 1119 1121 - 1122 67 68 1123 1124 68 1123 1124 1125 1126 68 1125 1126 68 1127 1128 - 1127 1128 66 70 1129 1130 70 1129 1130 32 1131 1132 32 70 1131 1132 - 70 71 1133 1134 71 1133 1134 73 74 1135 1136 74 1135 1136 75 76 - 1137 1138 1140 76 1137 1138 74 77 1139 1140 74 75 76 77 1137 1139 - 1140 32 78 1141 1142 32 1141 1142 76 79 1143 1144 79 1143 1144 71 - 1145 1146 71 79 1145 1146 78 79 1147 1148 78 1147 1148 71 81 1149 - 1150 71 1149 1150 77 82 1151 1152 82 1151 1152 32 1153 1154 32 82 - 1153 1154 81 82 1155 1156 81 1155 1156 84 1157 1158 1157 1158 1159 1160 - 85 1159 1160 84 85 1161 1162 84 1161 1162 71 88 1163 1164 88 1163 - 1164 86 89 1165 1166 1168 86 1165 1166 88 90 1167 1168 86 88 89 - 90 1165 1167 1168 90 91 1169 1170 91 1169 1170 1171 1172 91 1171 1172 - 91 93 1173 1174 93 1173 1174 93 1175 1176 1175 1176 85 86 1177 1178 - 85 1177 1178 94 95 1179 1180 95 1179 1180 96 97 1181 1182 1184 97 - 1181 1182 95 98 1183 1184 95 96 97 98 1181 1183 1184 98 99 1185 - 1186 99 1185 1186 1187 1188 99 1187 1188 99 1189 1190 1189 1190 97 101 - 1191 1192 101 1191 1192 63 1193 1194 63 101 1193 1194 101 102 1195 1196 - 102 1195 1196 104 105 1197 1198 105 1197 1198 106 107 1199 1200 1202 107 - 1199 1200 105 108 1201 1202 105 106 107 108 110 113 1199 1201 1202 63 - 109 1203 1204 63 1203 1204 107 110 1205 1206 110 1205 1206 102 1207 1208 - 102 110 1207 1208 109 110 1209 1210 109 1209 1210 102 112 1211 1212 102 - 1211 1212 108 113 1213 1214 113 1213 1214 63 1215 1216 63 113 1215 1216 - 112 113 1217 1218 112 1217 1218 115 1219 1220 1219 1220 1221 1222 116 1221 - 1222 115 116 1223 1224 115 1223 1224 102 119 1225 1226 119 1225 1226 117 - 120 1227 1228 117 1227 1228 119 121 1229 1230 117 119 120 121 1229 1230 - 121 122 1231 1232 122 1231 1232 1233 1234 122 1233 1234 122 124 1235 1236 - 124 1235 1236 124 1237 1238 1237 1238 116 117 1239 1240 116 1239 1240 125 - 126 1241 1242 126 1241 1242 127 128 1243 1244 128 1243 1244 126 129 1245 - 1246 126 127 128 129 1245 1246 129 130 1247 1248 130 1247 1248 1249 1250 - 130 1249 1250 130 1251 1252 1251 1252 128 132 1253 1254 132 1253 1254 94 - 1255 1256 94 132 1255 1256 132 133 1257 1258 133 1257 1258 135 136 1259 - 1260 3241 136 1259 1260 137 138 1261 1262 1264 138 1261 1262 136 138 139 - 1263 1264 137 138 139 1261 1263 1264 94 140 1265 1266 94 1265 1266 138 - 141 1267 1268 141 1267 1268 133 1269 1270 133 141 1269 1270 140 141 1271 - 1272 140 1271 1272 133 143 1273 1274 133 1273 1274 139 144 1275 1276 144 - 1275 1276 94 1277 1278 94 144 1277 1278 143 144 1279 1280 143 1279 1280 - 146 1281 1282 1281 1282 1283 1284 147 1283 1284 146 147 1285 1286 146 1285 - 1286 133 150 1287 1288 150 1287 1288 148 150 151 1289 1290 148 1289 1290 - 150 152 1291 1292 150 151 152 1291 1292 152 153 1293 1294 153 1293 1294 - 1295 1296 153 1295 1296 153 155 1297 1298 155 1297 1298 155 1299 1300 1299 - 1300 147 148 1301 1302 147 1301 1302 156 157 1303 1304 157 1303 1304 158 - 159 1305 1306 159 1305 1306 157 159 160 1307 1308 158 159 160 1307 1308 - 160 161 1309 1310 161 1309 1310 1311 1312 161 1311 1312 161 1313 1314 1313 - 1314 159 163 1315 1316 163 1315 1316 125 1317 1318 125 163 1317 1318 163 - 164 1319 1320 164 1319 1320 166 167 1321 1322 3303 167 1321 1322 168 169 - 1323 1324 169 1323 1324 167 170 1325 1326 167 168 169 170 1325 1326 125 - 171 1327 1328 125 1327 1328 169 172 1329 1330 172 1329 1330 164 1331 1332 - 164 172 
1331 1332 171 172 1333 1334 171 1333 1334 164 174 1335 1336 164 - 1335 1336 170 175 1337 1338 175 1337 1338 125 1339 1340 125 175 1339 1340 - 174 175 1341 1342 174 1341 1342 177 1343 1344 1343 1344 1345 1346 178 1345 - 1346 177 178 1347 1348 177 1347 1348 164 181 1349 1350 181 1349 1350 179 - 182 1351 1352 1354 179 1351 1352 181 183 1353 1354 179 181 182 183 1351 - 1353 1354 183 184 1355 1356 184 1355 1356 1357 1358 184 1357 1358 184 186 - 1359 1360 186 1359 1360 186 1361 1362 1361 1362 178 179 1363 1364 178 1363 - 1364 187 188 1365 1366 188 1365 1366 189 190 1367 1368 190 1367 1368 188 - 190 191 1369 1370 189 190 191 1369 1370 191 192 1371 1372 192 1371 1372 - 1373 1374 192 1373 1374 192 1375 1376 1375 1376 190 194 1377 1378 194 1377 - 1378 156 1379 1380 156 194 1379 1380 194 195 1381 1382 195 1381 1382 197 - 198 1383 1384 3365 198 1383 1384 199 200 1385 1386 1388 200 1385 1386 198 - 201 1387 1388 198 199 200 201 203 206 1385 1387 1388 156 202 1389 1390 - 156 1389 1390 200 203 1391 1392 203 1391 1392 195 1393 1394 195 203 1393 - 1394 202 203 1395 1396 202 1395 1396 195 205 1397 1398 195 1397 1398 201 - 206 1399 1400 206 1399 1400 156 1401 1402 156 206 1401 1402 205 206 1403 - 1404 205 1403 1404 208 1405 1406 1405 1406 1407 1408 209 1407 1408 208 209 - 1409 1410 208 1409 1410 195 212 1411 1412 212 1411 1412 210 213 1413 1414 - 210 1413 1414 212 214 1415 1416 210 212 213 214 1415 1416 3395 214 215 - 1417 1418 215 1417 1418 1419 1420 215 1419 1420 215 217 1421 1422 217 1421 - 1422 217 1423 1424 1423 1424 209 210 1425 1426 209 1425 1426 218 219 1427 - 1428 219 1427 1428 220 221 1429 1430 221 1429 1430 219 222 1431 1432 219 - 220 221 222 1431 1432 222 223 1433 1434 223 1433 1434 1435 1436 223 1435 - 1436 223 1437 1438 1437 1438 221 225 1439 1440 225 1439 1440 187 1441 1442 - 187 225 1441 1442 225 226 1443 1444 226 1443 1444 228 229 1445 1446 229 - 1445 1446 230 231 1447 1448 1450 231 1447 1448 229 232 1449 1450 229 230 - 231 232 1447 1449 1450 187 233 1451 1452 187 1451 1452 231 234 1453 1454 - 234 1453 1454 226 1455 1456 226 234 1455 1456 233 234 1457 1458 233 1457 - 1458 226 236 1459 1460 226 1459 1460 232 237 1461 1462 237 1461 1462 187 - 1463 1464 187 237 1463 1464 236 237 1465 1466 236 1465 1466 239 1467 1468 - 1467 1468 1469 1470 240 1469 1470 239 240 1471 1472 239 1471 1472 226 243 - 1473 1474 243 1473 1474 241 244 1475 1476 1478 241 1475 1476 243 245 1477 - 1478 241 243 244 245 1475 1477 1478 245 246 1479 1480 246 1479 1480 1481 - 1482 246 1481 1482 246 248 1483 1484 248 1483 1484 248 1485 1486 1485 1486 - 240 241 1487 1488 240 1487 1488 249 250 1489 1490 250 1489 1490 251 252 - 1491 1492 1494 252 1491 1492 250 253 1493 1494 250 251 252 253 1491 1493 - 1494 253 254 1495 1496 254 1495 1496 1497 1498 254 1497 1498 254 1499 1500 - 1499 1500 252 256 1501 1502 256 1501 1502 218 1503 1504 218 256 1503 1504 - 256 257 1505 1506 257 1505 1506 259 260 1507 1508 260 1507 1508 261 262 - 1509 1510 1512 262 1509 1510 260 263 1511 1512 260 261 262 263 265 268 - 1509 1511 1512 218 264 1513 1514 218 1513 1514 262 265 1515 1516 265 1515 - 1516 257 1517 1518 257 265 1517 1518 264 265 1519 1520 264 1519 1520 257 - 267 1521 1522 257 1521 1522 263 268 1523 1524 268 1523 1524 218 1525 1526 - 218 268 1525 1526 267 268 1527 1528 267 1527 1528 270 1529 1530 1529 1530 - 1531 1532 271 1531 1532 270 271 1533 1534 270 1533 1534 257 274 1535 1536 - 274 1535 1536 272 275 1537 1538 1540 272 1537 1538 274 276 1539 1540 272 - 274 275 276 1537 1539 1540 276 277 1541 1542 277 1541 1542 1543 1544 277 - 1543 1544 277 279 
1545 1546 279 1545 1546 279 1547 1548 1547 1548 271 272 - 1549 1550 271 1549 1550 280 281 1551 1552 281 1551 1552 282 283 1553 1554 - 1556 283 1553 1554 281 284 1555 1556 281 282 283 284 1553 1555 1556 284 - 285 1557 1558 285 1557 1558 1559 1560 285 1559 1560 285 1561 1562 1561 1562 - 283 287 1563 1564 287 1563 1564 249 1565 1566 249 287 1565 1566 287 288 - 1567 1568 288 1567 1568 290 291 1569 1570 291 1569 1570 292 293 1571 1572 - 1574 293 1571 1572 291 294 1573 1574 291 292 293 294 1571 1573 1574 249 - 295 1575 1576 249 1575 1576 293 296 1577 1578 296 1577 1578 288 1579 1580 - 288 296 1579 1580 295 296 1581 1582 295 1581 1582 288 298 1583 1584 288 - 1583 1584 294 299 1585 1586 299 1585 1586 249 1587 1588 249 299 1587 1588 - 298 299 1589 1590 298 1589 1590 301 1591 1592 1591 1592 1593 1594 302 1593 - 1594 301 302 1595 1596 301 1595 1596 288 305 1597 1598 305 1597 1598 303 - 306 1599 1600 1602 303 1599 1600 305 307 1601 1602 303 305 306 307 1599 - 1601 1602 307 308 1603 1604 308 1603 1604 1605 1606 308 1605 1606 308 310 - 1607 1608 310 1607 1608 310 1609 1610 1609 1610 302 303 1611 1612 302 1611 - 1612 311 312 1613 1614 312 1613 1614 313 314 1615 1616 1618 314 1615 1616 - 312 315 1617 1618 312 313 314 315 1615 1617 1618 315 316 1619 1620 316 - 1619 1620 1621 1622 316 1621 1622 316 1623 1624 1623 1624 314 318 1625 1626 - 318 1625 1626 280 1627 1628 280 318 1627 1628 318 319 1629 1630 319 1629 - 1630 321 322 1631 1632 322 1631 1632 323 324 1633 1634 1636 324 1633 1634 - 322 324 325 1635 1636 323 324 325 1633 1635 1636 280 326 1637 1638 280 - 1637 1638 324 327 1639 1640 327 1639 1640 319 1641 1642 319 327 1641 1642 - 326 327 1643 1644 326 1643 1644 319 329 1645 1646 319 1645 1646 325 330 - 1647 1648 330 1647 1648 280 1649 1650 280 330 1649 1650 329 330 1651 1652 - 329 1651 1652 332 1653 1654 1653 1654 1655 1656 333 1655 1656 332 333 1657 - 1658 332 1657 1658 319 336 1659 1660 336 1659 1660 334 337 1661 1662 1664 - 334 1661 1662 336 338 1663 1664 334 336 337 338 1661 1663 1664 338 339 - 1665 1666 339 1665 1666 1667 1668 339 1667 1668 339 341 1669 1670 341 1669 - 1670 341 1671 1672 1671 1672 333 334 1673 1674 333 1673 1674 342 343 1675 - 1676 343 1675 1676 344 345 1677 1678 1680 345 1677 1678 343 346 1679 1680 - 343 344 345 346 1677 1679 1680 346 347 1681 1682 347 1681 1682 1683 1684 - 347 1683 1684 347 1685 1686 1685 1686 345 349 1687 1688 349 1687 1688 311 - 1689 1690 311 349 1689 1690 349 350 1691 1692 350 1691 1692 352 353 1693 - 1694 353 1693 1694 354 355 1695 1696 1698 355 1695 1696 353 355 356 1697 - 1698 354 355 356 1695 1697 1698 311 357 1699 1700 311 1699 1700 355 358 - 1701 1702 358 1701 1702 350 1703 1704 350 358 1703 1704 357 358 1705 1706 - 357 1705 1706 350 360 1707 1708 350 1707 1708 356 361 1709 1710 361 1709 - 1710 311 1711 1712 311 361 1711 1712 360 361 1713 1714 360 1713 1714 363 - 1715 1716 1715 1716 1717 1718 364 1717 1718 363 364 1719 1720 363 1719 1720 - 350 367 1721 1722 367 1721 1722 365 368 1723 1724 1726 365 1723 1724 367 - 369 1725 1726 365 367 368 369 1723 1725 1726 369 370 1727 1728 370 1727 - 1728 1729 1730 370 1729 1730 370 372 1731 1732 372 1731 1732 372 1733 1734 - 1733 1734 364 365 1735 1736 364 1735 1736 373 374 1737 1738 374 1737 1738 - 375 376 1739 1740 1742 376 1739 1740 374 377 1741 1742 374 375 376 377 - 1739 1741 1742 377 378 1743 1744 378 1743 1744 1745 1746 378 1745 1746 378 - 1747 1748 1747 1748 376 380 1749 1750 380 1749 1750 342 1751 1752 342 380 - 1751 1752 380 381 1753 1754 381 1753 1754 383 384 1755 1756 384 1755 1756 - 385 386 1757 1758 1760 
386 1757 1758 384 386 387 1759 1760 385 386 387 - 1757 1759 1760 342 388 1761 1762 342 1761 1762 386 389 1763 1764 389 1763 - 1764 381 1765 1766 381 389 1765 1766 388 389 1767 1768 388 1767 1768 381 - 391 1769 1770 381 1769 1770 387 392 1771 1772 392 1771 1772 342 1773 1774 - 342 392 1773 1774 391 392 1775 1776 391 1775 1776 394 1777 1778 1777 1778 - 1779 1780 395 1779 1780 394 395 1781 1782 394 1781 1782 381 398 1783 1784 - 398 1783 1784 396 399 1785 1786 1788 396 1785 1786 398 400 1787 1788 396 - 398 399 400 1785 1787 1788 400 401 1789 1790 401 1789 1790 1791 1792 401 - 1791 1792 401 403 1793 1794 403 1793 1794 403 1795 1796 1795 1796 395 396 - 1797 1798 395 1797 1798 404 405 1799 1800 405 1799 1800 406 407 1801 1802 - 1804 407 1801 1802 405 408 1803 1804 405 406 407 408 1801 1803 1804 408 - 409 1805 1806 409 1805 1806 1807 1808 409 1807 1808 409 1809 1810 1809 1810 - 407 411 1811 1812 411 1811 1812 373 1813 1814 373 411 1813 1814 411 412 - 1815 1816 412 1815 1816 414 415 1817 1818 415 1817 1818 416 417 1819 1820 - 1822 417 1819 1820 415 417 418 1821 1822 416 417 418 1819 1821 1822 373 - 419 1823 1824 373 1823 1824 417 420 1825 1826 420 1825 1826 412 1827 1828 - 412 420 1827 1828 419 420 1829 1830 419 1829 1830 412 422 1831 1832 412 - 1831 1832 418 423 1833 1834 423 1833 1834 373 1835 1836 373 423 1835 1836 - 422 423 1837 1838 422 1837 1838 425 1839 1840 1839 1840 1841 1842 426 1841 - 1842 425 426 1843 1844 425 1843 1844 412 429 1845 1846 429 1845 1846 427 - 430 1847 1848 1850 427 1847 1848 429 431 1849 1850 427 429 430 431 1847 - 1849 1850 431 432 1851 1852 432 1851 1852 1853 1854 432 1853 1854 432 434 - 1855 1856 434 1855 1856 434 1857 1858 1857 1858 426 427 1859 1860 426 1859 - 1860 435 436 1861 1862 436 1861 1862 437 438 1863 1864 1866 438 1863 1864 - 436 439 1865 1866 436 437 438 439 442 1863 1865 1866 439 440 1867 1868 - 440 1867 1868 1869 1870 440 1869 1870 440 1871 1872 1871 1872 438 442 1873 - 1874 442 1873 1874 404 1875 1876 404 442 1875 1876 442 443 1877 1878 443 - 1877 1878 445 446 1879 1880 446 1879 1880 447 448 1881 1882 1884 448 1881 - 1882 446 449 1883 1884 446 447 448 449 451 1881 1883 1884 404 450 1885 - 1886 404 1885 1886 448 451 1887 1888 451 1887 1888 443 1889 1890 443 451 - 1889 1890 450 451 1891 1892 450 1891 1892 443 453 1893 1894 443 1893 1894 - 449 454 1895 1896 454 1895 1896 404 1897 1898 404 454 1897 1898 453 454 - 1899 1900 453 1899 1900 456 1901 1902 1901 1902 1903 1904 457 1903 1904 456 - 457 1905 1906 456 1905 1906 443 460 1907 1908 460 1907 1908 458 461 1909 - 1910 1912 458 1909 1910 460 462 1911 1912 458 460 461 462 1909 1911 1912 - 462 463 1913 1914 463 1913 1914 1915 1916 463 1915 1916 463 465 1917 1918 - 465 1917 1918 465 1919 1920 1919 1920 457 458 1921 1922 457 1921 1922 466 - 467 1923 1924 467 1923 1924 468 469 1925 1926 1928 469 1925 1926 467 470 - 1927 1928 467 468 469 470 473 1925 1927 1928 470 471 1929 1930 471 1929 - 1930 1931 1932 471 1931 1932 471 1933 1934 1933 1934 469 473 1935 1936 473 - 1935 1936 435 1937 1938 435 473 1937 1938 473 474 1939 1940 474 1939 1940 - 476 477 1941 1942 477 1941 1942 478 479 1943 1944 1946 479 1943 1944 477 - 479 480 1945 1946 478 479 480 1943 1945 1946 435 481 1947 1948 435 1947 - 1948 479 482 1949 1950 482 1949 1950 474 1951 1952 474 482 1951 1952 481 - 482 1953 1954 481 1953 1954 474 484 1955 1956 474 1955 1956 480 485 1957 - 1958 485 1957 1958 435 1959 1960 435 485 1959 1960 484 485 1961 1962 484 - 1961 1962 487 1963 1964 1963 1964 1965 1966 488 1965 1966 487 488 1967 1968 - 487 1967 1968 474 491 1969 1970 
491 1969 1970 489 492 1971 1972 1974 489 - 1971 1972 491 493 1973 1974 489 491 492 493 1971 1973 1974 3953 493 494 - 1975 1976 494 1975 1976 1977 1978 494 1977 1978 494 496 1979 1980 496 1979 - 1980 496 1981 1982 1981 1982 488 489 1983 1984 488 1983 1984 497 498 1985 - 1986 498 1985 1986 499 500 1987 1988 1990 500 1987 1988 498 501 1989 1990 - 498 499 500 501 504 1987 1989 1990 501 502 1991 1992 502 1991 1992 1993 - 1994 502 1993 1994 502 1995 1996 1995 1996 500 504 1997 1998 504 1997 1998 - 466 1999 2000 466 504 1999 2000 504 505 2001 2002 505 2001 2002 507 508 - 2003 2004 508 2003 2004 509 510 2005 2006 2008 510 2005 2006 508 510 511 - 2007 2008 509 510 511 2005 2007 2008 466 512 2009 2010 466 2009 2010 510 - 513 2011 2012 513 2011 2012 505 2013 2014 505 513 2013 2014 512 513 2015 - 2016 512 2015 2016 505 515 2017 2018 505 2017 2018 511 516 2019 2020 516 - 2019 2020 466 2021 2022 466 516 2021 2022 515 516 2023 2024 515 2023 2024 - 518 2025 2026 2025 2026 2027 2028 519 2027 2028 518 519 2029 2030 518 2029 - 2030 505 522 2031 2032 522 2031 2032 520 523 2033 2034 2036 520 2033 2034 - 522 524 2035 2036 520 522 523 524 2033 2035 2036 524 525 2037 2038 525 - 2037 2038 2039 2040 525 2039 2040 525 527 2041 2042 527 2041 2042 527 2043 - 2044 2043 2044 519 520 2045 2046 519 2045 2046 528 529 2047 2048 529 2047 - 2048 530 531 2049 2050 2052 531 2049 2050 529 532 2051 2052 529 530 531 - 532 2049 2051 2052 532 533 2053 2054 533 2053 2054 2055 2056 533 2055 2056 - 533 2057 2058 2057 2058 531 535 2059 2060 535 2059 2060 497 2061 2062 497 - 535 2061 2062 535 536 2063 2064 536 2063 2064 538 539 2065 2066 539 2065 - 2066 540 541 2067 2068 2070 541 2067 2068 539 542 2069 2070 539 540 541 - 542 2067 2069 2070 497 543 2071 2072 497 2071 2072 541 544 2073 2074 544 - 2073 2074 536 2075 2076 536 544 2075 2076 543 544 2077 2078 543 2077 2078 - 536 546 2079 2080 536 2079 2080 542 547 2081 2082 547 2081 2082 497 2083 - 2084 497 547 2083 2084 546 547 2085 2086 546 2085 2086 549 2087 2088 2087 - 2088 2089 2090 550 2089 2090 549 550 2091 2092 549 2091 2092 536 553 2093 - 2094 553 2093 2094 551 554 2095 2096 2098 551 2095 2096 553 555 2097 2098 - 551 553 554 555 2095 2097 2098 555 556 2099 2100 556 2099 2100 2101 2102 - 556 2101 2102 556 558 2103 2104 558 2103 2104 558 2105 2106 2105 2106 550 - 551 2107 2108 550 2107 2108 559 560 2109 2110 560 2109 2110 561 562 2111 - 2112 2114 562 2111 2112 560 563 2113 2114 560 561 562 563 566 2111 2113 - 2114 563 564 2115 2116 564 2115 2116 2117 2118 564 2117 2118 564 2119 2120 - 2119 2120 562 566 2121 2122 566 2121 2122 528 2123 2124 528 566 2123 2124 - 566 567 2125 2126 567 2125 2126 569 570 2127 2128 570 2127 2128 571 572 - 2129 2130 2132 572 2129 2130 570 572 573 2131 2132 571 572 573 2129 2131 - 2132 528 574 2133 2134 528 2133 2134 572 575 2135 2136 575 2135 2136 567 - 2137 2138 567 575 2137 2138 574 575 2139 2140 574 2139 2140 567 577 2141 - 2142 567 2141 2142 573 578 2143 2144 578 2143 2144 528 2145 2146 528 578 - 2145 2146 577 578 2147 2148 577 2147 2148 580 2149 2150 2149 2150 2151 2152 - 581 2151 2152 580 581 2153 2154 580 2153 2154 567 584 2155 2156 584 2155 - 2156 582 585 2157 2158 2160 582 2157 2158 584 586 2159 2160 582 584 585 - 586 2157 2159 2160 586 587 2161 2162 587 2161 2162 2163 2164 587 2163 2164 - 587 589 2165 2166 589 2165 2166 589 2167 2168 2167 2168 581 582 2169 2170 - 581 2169 2170 590 591 2171 2172 591 2171 2172 592 593 2173 2174 2176 593 - 2173 2174 591 594 2175 2176 591 592 593 594 2173 2175 2176 594 595 2177 - 2178 595 2177 2178 2179 2180 595 2179 2180 
595 2181 2182 2181 2182 593 597 - 2183 2184 597 2183 2184 559 2185 2186 559 597 2185 2186 597 598 2187 2188 - 598 2187 2188 600 601 2189 2190 601 2189 2190 602 603 2191 2192 2194 603 - 2191 2192 601 603 604 2193 2194 602 603 604 2191 2193 2194 559 605 2195 - 2196 559 2195 2196 603 606 2197 2198 606 2197 2198 598 2199 2200 598 606 - 2199 2200 605 606 2201 2202 605 2201 2202 598 608 2203 2204 598 2203 2204 - 604 609 2205 2206 609 2205 2206 559 2207 2208 559 609 2207 2208 608 609 - 2209 2210 608 2209 2210 611 2211 2212 2211 2212 2213 2214 612 2213 2214 611 - 612 2215 2216 611 2215 2216 598 615 2217 2218 615 2217 2218 613 616 2219 - 2220 2222 613 2219 2220 615 617 2221 2222 613 615 616 617 2219 2221 2222 - 617 618 2223 2224 618 2223 2224 2225 2226 618 2225 2226 618 620 2227 2228 - 620 2227 2228 620 2229 2230 2229 2230 612 613 2231 2232 612 2231 2232 621 - 622 2233 2234 622 2233 2234 623 624 2235 2236 2238 624 2235 2236 622 624 - 625 2237 2238 623 624 625 2235 2237 2238 625 626 2239 2240 626 2239 2240 - 2241 2242 626 2241 2242 626 2243 2244 2243 2244 624 628 2245 2246 628 2245 - 2246 590 2247 2248 590 628 2247 2248 628 629 2249 2250 629 2249 2250 631 - 632 2251 2252 4233 632 2251 2252 633 634 2253 2254 2256 634 2253 2254 632 - 634 635 2255 2256 633 634 635 2253 2255 2256 590 636 2257 2258 590 2257 - 2258 634 637 2259 2260 637 2259 2260 629 2261 2262 629 637 2261 2262 636 - 637 2263 2264 636 2263 2264 629 639 2265 2266 629 2265 2266 635 640 2267 - 2268 640 2267 2268 590 2269 2270 590 640 2269 2270 639 640 2271 2272 639 - 2271 2272 642 2273 2274 2273 2274 2275 2276 643 2275 2276 642 643 2277 2278 - 4258 642 2277 2278 629 646 2279 2280 646 2279 2280 644 647 2281 2282 2284 - 644 2281 2282 646 648 2283 2284 644 646 647 648 2281 2283 2284 648 649 - 2285 2286 649 2285 2286 2287 2288 649 2287 2288 649 651 2289 2290 4271 651 - 2289 2290 651 2291 2292 2291 2292 643 644 2293 2294 643 2293 2294 652 653 - 2295 2296 653 2295 2296 654 655 2297 2298 2300 655 2297 2298 653 655 656 - 2299 2300 654 655 656 2297 2299 2300 656 657 2301 2302 657 2301 2302 2303 - 2304 657 2303 2304 657 2305 2306 2305 2306 655 659 2307 2308 659 2307 2308 - 621 2309 2310 621 659 2309 2310 659 660 2311 2312 660 2311 2312 662 663 - 2313 2314 4295 663 2313 2314 664 665 2315 2316 2318 665 2315 2316 663 665 - 666 2317 2318 664 665 666 2315 2317 2318 621 667 2319 2320 621 2319 2320 - 665 668 2321 2322 668 2321 2322 660 2323 2324 660 668 2323 2324 667 668 - 2325 2326 667 2325 2326 660 670 2327 2328 660 2327 2328 666 671 2329 2330 - 671 2329 2330 621 2331 2332 621 671 2331 2332 670 671 2333 2334 670 2333 - 2334 673 2335 2336 2335 2336 2337 2338 674 2337 2338 673 674 2339 2340 673 - 2339 2340 660 677 2341 2342 677 2341 2342 675 678 2343 2344 2346 675 2343 - 2344 677 679 2345 2346 675 677 678 679 2343 2345 2346 679 680 2347 2348 - 680 2347 2348 2349 2350 680 2349 2350 680 682 2351 2352 682 2351 2352 682 - 2353 2354 2353 2354 674 675 2355 2356 674 2355 2356 683 684 2357 2358 684 - 2357 2358 685 686 2359 2360 2362 686 2359 2360 684 686 687 2361 2362 685 - 686 687 2359 2361 2362 687 688 2363 2364 4347 688 2363 2364 2365 2366 688 - 2365 2366 688 2367 2368 2367 2368 686 690 2369 2370 690 2369 2370 652 2371 - 2372 652 690 2371 2372 690 691 2373 2374 691 2373 2374 693 694 2375 2376 - 4357 694 2375 2376 695 696 2377 2378 2380 696 2377 2378 694 697 2379 2380 - 694 695 696 697 699 702 2377 2379 2380 652 698 2381 2382 652 2381 2382 - 696 699 2383 2384 699 2383 2384 691 2385 2386 691 699 2385 2386 698 699 - 2387 2388 698 2387 2388 691 701 2389 2390 691 2389 
2390 697 702 2391 2392 - 702 2391 2392 652 2393 2394 652 702 2393 2394 701 702 2395 2396 701 2395 - 2396 704 2397 2398 2397 2398 2399 2400 705 2399 2400 704 705 2401 2402 4399 - 704 2401 2402 691 708 2403 2404 708 2403 2404 706 709 2405 2406 2408 706 - 2405 2406 708 710 2407 2408 706 708 709 710 2405 2407 2408 710 711 2409 - 2410 711 2409 2410 2411 2412 711 2411 2412 711 713 2413 2414 4394 713 2413 - 2414 713 2415 2416 2415 2416 705 706 2417 2418 705 2417 2418 714 715 2419 - 2420 715 2419 2420 716 717 2421 2422 2424 717 2421 2422 715 717 718 2423 - 2424 716 717 718 2421 2423 2424 718 719 2425 2426 719 2425 2426 2427 2428 - 719 2427 2428 719 2429 2430 2429 2430 717 721 2431 2432 721 2431 2432 683 - 2433 2434 683 721 2433 2434 721 722 2435 2436 722 2435 2436 724 725 2437 - 2438 4419 725 2437 2438 726 727 2439 2440 2442 727 2439 2440 725 728 2441 - 2442 725 726 727 728 2439 2441 2442 683 729 2443 2444 683 2443 2444 727 - 730 2445 2446 730 2445 2446 722 2447 2448 722 730 2447 2448 729 730 2449 - 2450 729 2449 2450 722 732 2451 2452 722 2451 2452 728 733 2453 2454 733 - 2453 2454 683 2455 2456 683 733 2455 2456 732 733 2457 2458 732 2457 2458 - 735 2459 2460 2459 2460 2461 2462 736 2461 2462 735 736 2463 2464 735 2463 - 2464 722 739 2465 2466 739 2465 2466 737 740 2467 2468 2470 737 2467 2468 - 739 741 2469 2470 737 739 740 741 2467 2469 2470 741 742 2471 2472 742 - 2471 2472 2473 2474 742 2473 2474 742 744 2475 2476 744 2475 2476 744 2477 - 2478 2477 2478 736 737 2479 2480 736 2479 2480 745 746 2481 2482 746 2481 - 2482 747 748 2483 2484 2486 748 2483 2484 746 748 749 2485 2486 747 748 - 749 2483 2485 2486 749 750 2487 2488 750 2487 2488 2489 2490 750 2489 2490 - 750 2491 2492 2491 2492 748 752 2493 2494 752 2493 2494 714 2495 2496 714 - 752 2495 2496 752 753 2497 2498 753 2497 2498 755 756 2499 2500 4481 756 - 2499 2500 757 758 2501 2502 2504 758 2501 2502 756 758 759 2503 2504 757 - 758 759 2501 2503 2504 714 760 2505 2506 714 2505 2506 758 761 2507 2508 - 761 2507 2508 753 2509 2510 753 761 2509 2510 760 761 2511 2512 760 2511 - 2512 753 763 2513 2514 753 2513 2514 759 764 2515 2516 764 2515 2516 714 - 2517 2518 714 764 2517 2518 763 764 2519 2520 763 2519 2520 766 2521 2522 - 2521 2522 2523 2524 767 2523 2524 766 767 2525 2526 4523 766 2525 2526 753 - 770 2527 2528 770 2527 2528 768 771 2529 2530 2532 768 2529 2530 770 772 - 2531 2532 768 770 771 772 2529 2531 2532 772 773 2533 2534 773 2533 2534 - 2535 2536 773 2535 2536 773 775 2537 2538 4519 775 2537 2538 775 2539 2540 - 2539 2540 767 768 2541 2542 4503 767 2541 2542 776 777 2543 2544 777 2543 - 2544 778 779 2545 2546 2548 779 2545 2546 777 780 2547 2548 777 778 779 - 780 783 2545 2547 2548 780 781 2549 2550 781 2549 2550 2551 2552 781 2551 - 2552 781 2553 2554 2553 2554 779 783 2555 2556 783 2555 2556 745 2557 2558 - 745 783 2557 2558 783 784 2559 2560 784 2559 2560 786 787 2561 2562 4543 - 787 2561 2562 788 789 2563 2564 2566 789 2563 2564 787 789 790 2565 2566 - 788 789 790 2563 2565 2566 745 791 2567 2568 745 2567 2568 789 792 2569 - 2570 792 2569 2570 784 2571 2572 784 792 2571 2572 791 792 2573 2574 791 - 2573 2574 784 794 2575 2576 784 2575 2576 790 795 2577 2578 795 2577 2578 - 745 2579 2580 745 795 2579 2580 794 795 2581 2582 794 2581 2582 797 2583 - 2584 2583 2584 2585 2586 798 2585 2586 797 798 2587 2588 797 2587 2588 784 - 801 2589 2590 801 2589 2590 799 802 2591 2592 2594 799 2591 2592 801 803 - 2593 2594 799 801 802 803 2591 2593 2594 803 804 2595 2596 804 2595 2596 - 2597 2598 804 2597 2598 804 806 2599 2600 806 2599 2600 
806 2601 2602 2601 - 2602 798 799 2603 2604 798 2603 2604 807 808 2605 2606 808 2605 2606 809 - 810 2607 2608 2610 810 2607 2608 808 811 2609 2610 808 809 810 811 814 - 2607 2609 2610 811 812 2611 2612 812 2611 2612 2613 2614 812 2613 2614 812 - 2615 2616 2615 2616 810 814 2617 2618 814 2617 2618 776 2619 2620 776 814 - 2619 2620 814 815 2621 2622 815 2621 2622 817 818 2623 2624 4605 818 2623 - 2624 819 820 2625 2626 2628 820 2625 2626 818 820 821 2627 2628 819 820 - 821 2625 2627 2628 776 822 2629 2630 776 2629 2630 820 823 2631 2632 823 - 2631 2632 815 2633 2634 815 823 2633 2634 822 823 2635 2636 822 2635 2636 - 815 825 2637 2638 815 2637 2638 821 826 2639 2640 826 2639 2640 776 2641 - 2642 776 826 2641 2642 825 826 2643 2644 825 2643 2644 828 2645 2646 2645 - 2646 2647 2648 829 2647 2648 828 829 2649 2650 828 2649 2650 815 832 2651 - 2652 832 2651 2652 830 833 2653 2654 2656 830 2653 2654 832 834 2655 2656 - 830 832 833 834 2653 2655 2656 834 835 2657 2658 835 2657 2658 2659 2660 - 835 2659 2660 835 837 2661 2662 837 2661 2662 837 2663 2664 2663 2664 829 - 830 2665 2666 829 2665 2666 838 839 2667 2668 839 2667 2668 840 841 2669 - 2670 2672 841 2669 2670 839 842 2671 2672 839 840 841 842 845 2669 2671 - 2672 842 843 2673 2674 843 2673 2674 2675 2676 843 2675 2676 843 2677 2678 - 2677 2678 841 845 2679 2680 845 2679 2680 807 2681 2682 807 845 2681 2682 - 845 846 2683 2684 846 2683 2684 848 849 2685 2686 849 2685 2686 850 851 - 2687 2688 2690 851 2687 2688 849 852 2689 2690 849 850 851 852 854 2687 - 2689 2690 807 853 2691 2692 807 2691 2692 851 854 2693 2694 854 2693 2694 - 846 2695 2696 846 854 2695 2696 853 854 2697 2698 853 2697 2698 846 856 - 2699 2700 846 2699 2700 852 857 2701 2702 857 2701 2702 807 2703 2704 807 - 857 2703 2704 856 857 2705 2706 856 2705 2706 859 2707 2708 2707 2708 2709 - 2710 860 2709 2710 859 860 2711 2712 859 2711 2712 846 863 2713 2714 863 - 2713 2714 861 864 2715 2716 2718 861 2715 2716 863 865 2717 2718 861 863 - 864 865 2715 2717 2718 865 866 2719 2720 866 2719 2720 2721 2722 866 2721 - 2722 866 868 2723 2724 868 2723 2724 868 2725 2726 2725 2726 860 861 2727 - 2728 860 2727 2728 869 870 2729 2730 870 2729 2730 871 872 2731 2732 2734 - 872 2731 2732 870 873 2733 2734 870 871 872 873 2731 2733 2734 873 874 - 2735 2736 874 2735 2736 2737 2738 874 2737 2738 874 2739 2740 2739 2740 872 - 876 2741 2742 876 2741 2742 838 2743 2744 838 876 2743 2744 876 877 2745 - 2746 877 2745 2746 879 880 2747 2748 880 2747 2748 881 882 2749 2750 2752 - 882 2749 2750 880 882 883 2751 2752 881 882 883 2749 2751 2752 838 884 - 2753 2754 838 2753 2754 882 885 2755 2756 885 2755 2756 877 2757 2758 877 - 885 2757 2758 884 885 2759 2760 884 2759 2760 877 887 2761 2762 877 2761 - 2762 883 888 2763 2764 888 2763 2764 838 2765 2766 838 888 2765 2766 887 - 888 2767 2768 887 2767 2768 890 2769 2770 2769 2770 2771 2772 891 2771 2772 - 890 891 2773 2774 890 2773 2774 877 894 2775 2776 894 2775 2776 892 895 - 2777 2778 2780 892 2777 2778 894 896 2779 2780 892 894 895 896 2777 2779 - 2780 896 897 2781 2782 897 2781 2782 2783 2784 897 2783 2784 897 899 2785 - 2786 899 2785 2786 899 2787 2788 2787 2788 891 892 2789 2790 891 2789 2790 - 900 901 2791 2792 901 2791 2792 902 903 2793 2794 2796 903 2793 2794 901 - 904 2795 2796 901 902 903 904 907 2793 2795 2796 904 905 2797 2798 905 - 2797 2798 2799 2800 905 2799 2800 905 2801 2802 2801 2802 903 907 2803 2804 - 907 2803 2804 869 2805 2806 869 907 2805 2806 907 908 2807 2808 908 2807 - 2808 910 911 2809 2810 911 2809 2810 912 913 2811 2812 2814 913 
[... bulk data hunk elided: several thousand removed lines of raw numeric data — integer index arrays followed by double-precision values in Fortran E-notation — apparently a large sparse-matrix example input file deleted by this hunk; the diff line structure was lost in extraction and the values carry no prose content ...]
-0.666666666666666970E-02 -0.666666666666666970E-02 0.000000000000000000E+00 - 0.251983574296692006E-01 0.000000000000000000E+00 -0.874932041930874000E-05 - -0.678932101969682953E-03 -0.666666666666666970E-02 -0.139012530212417987E-03 - -0.139503664702921010E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.319960855039493966E-01 0.000000000000000000E+00 -0.161423534312431000E-03 - -0.161423958063482012E-03 -0.666666666666666970E-02 -0.666666666666666970E-02 - -0.422822405468364991E-03 -0.447571581638852981E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.333333333333332982E-01 - -0.166666666666667011E-01 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.251635090367436989E-01 -0.161499426491679002E-03 -0.161348065884233007E-03 - -0.666666666666666970E-02 -0.431483288576659993E-03 -0.438741647262608995E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.312157284528267985E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.666666666666666970E-02 -0.161424179076904013E-03 - -0.161423313299007996E-03 -0.161423313309436991E-03 -0.161424179066474991E-03 - -0.161423748561558987E-03 -0.161423743814352995E-03 -0.166666666666667011E-01 - -0.435069198160920999E-03 -0.435069590334165020E-03 -0.138284743432991999E-03 - -0.140231451482346998E-03 -0.435089075923097014E-03 -0.435050142023235000E-03 - 0.000000000000000000E+00 0.420929803448919018E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.435014018174735974E-03 -0.435085875041650019E-03 - -0.666666666666666970E-02 -0.138286696532327987E-03 -0.140229498383011010E-03 - -0.166666666666667011E-01 -0.166666666666667011E-01 0.133333333333332995E-01 - 0.000000000000000000E+00 -0.666666666666666970E-02 -0.666666666666666970E-02 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.256871018283196008E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.435049188665910004E-03 - -0.435049198945112027E-03 0.000000000000000000E+00 0.000000000000000000E+00 - -0.666666666666666970E-02 -0.139258096875932990E-03 -0.139258098039406008E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.000000000000000000E+00 - 0.252598186653136998E-01 0.000000000000000000E+00 -0.161423313379014988E-03 - -0.161424178996896994E-03 -0.666666666666666970E-02 -0.435069414335974010E-03 - -0.435069370280600999E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.320929801358365019E-01 -0.435014929695106008E-03 -0.435084924514935985E-03 - -0.666666666666666970E-02 -0.666666666666666970E-02 -0.138286694969620992E-03 - -0.140229499945718005E-03 -0.166666666666667011E-01 0.333333333333332982E-01 - -0.166666666666667011E-01 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.320996579150382988E-01 0.000000000000000000E+00 - -0.161423313471394000E-03 -0.161424178904518009E-03 -0.666666666666666970E-02 - -0.666666666666666970E-02 -0.435069382051928022E-03 -0.435069402299393022E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.295485029250618994E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 
0.000000000000000000E+00 -0.435016114097582006E-03 -0.435083689427570000E-03 - -0.435055716891851981E-03 -0.435042938371768973E-03 -0.435014924444715012E-03 - -0.435084929989965974E-03 -0.666666666666666970E-02 -0.138321048329605000E-03 - -0.140195146585733998E-03 -0.138900264540687006E-03 -0.139615930374651991E-03 - -0.138286695606836994E-03 -0.140229499308502004E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.333333333333332982E-01 - -0.166666666666667011E-01 -0.166666666666667011E-01 0.238543465439772992E-01 - -0.666666666666666970E-02 -0.166666666666667011E-01 0.420929804454360026E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.435014924421165002E-03 - -0.435084930014524021E-03 -0.666666666666666970E-02 0.000000000000000000E+00 - -0.138286695611057007E-03 -0.140229499304281991E-03 -0.166666666666667011E-01 - -0.166666666666667011E-01 0.133333333333332995E-01 0.000000000000000000E+00 - -0.666666666666666970E-02 -0.666666666666666970E-02 0.000000000000000000E+00 - 0.000000000000000000E+00 0.257458287849329989E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.435049188237201015E-03 -0.435049199392181004E-03 - 0.000000000000000000E+00 -0.666666666666666970E-02 -0.139258097270754996E-03 - -0.139258097644584002E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.255775463582341989E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.161423313378621992E-03 -0.161424178997289990E-03 - -0.666666666666666970E-02 -0.435069399485626008E-03 -0.435069384803689026E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.000000000000000000E+00 - 0.257404831787951989E-01 0.000000000000000000E+00 0.000000000000000000E+00 - -0.666666666666666970E-02 -0.435042938371768973E-03 -0.435055716891853011E-03 - 0.000000000000000000E+00 -0.138910376033074995E-03 -0.139605818882264002E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.320997361628787004E-01 - 0.000000000000000000E+00 -0.161423315571431998E-03 -0.161424176804480011E-03 - -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069384827771000E-03 - -0.435069399460985995E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.256633780151170991E-01 - 0.000000000000000000E+00 -0.666666666666666970E-02 -0.161423313378029990E-03 - -0.161424178997881992E-03 0.000000000000000000E+00 0.000000000000000000E+00 - -0.435069376688627994E-03 -0.435069407783549012E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.322089281648140976E-01 -0.435014924425583993E-03 -0.435084930009956982E-03 - -0.666666666666666970E-02 -0.666666666666666970E-02 -0.138286694981505013E-03 - -0.140229499933834011E-03 -0.166666666666667011E-01 0.333333333333332982E-01 - -0.166666666666667011E-01 -0.166666666666667011E-01 0.256614590292298002E-01 - 0.000000000000000000E+00 -0.666666666666666970E-02 -0.435016112049637998E-03 - -0.435083691563155003E-03 -0.166666666666667011E-01 -0.138320998722293008E-03 - -0.140195196193045990E-03 0.000000000000000000E+00 0.319837098823795019E-01 - 0.000000000000000000E+00 -0.666666666666666970E-02 -0.666666666666666970E-02 - -0.161423313471231993E-03 -0.161424178904679989E-03 -0.435069384800439022E-03 - 
-0.435069399488948003E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.256582249675047017E-01 - 0.000000000000000000E+00 -0.434980660275586010E-03 -0.435120659873482023E-03 - 0.000000000000000000E+00 -0.666666666666666970E-02 -0.166666666666667011E-01 - -0.138286858201177004E-03 -0.140229336714161993E-03 0.333333333333332982E-01 - -0.166666666666667011E-01 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.419834073409681013E-01 0.000000000000000000E+00 - -0.161423305232394999E-03 -0.161424187143517010E-03 -0.666666666666666970E-02 - 0.000000000000000000E+00 -0.435069384650815002E-03 -0.435069399641999023E-03 - -0.166666666666667011E-01 -0.166666666666667011E-01 0.133333333333332995E-01 - -0.666666666666666970E-02 -0.666666666666666970E-02 0.000000000000000000E+00 - 0.253691078101502994E-01 0.000000000000000000E+00 -0.434980641228884999E-03 - -0.435120678183400979E-03 -0.666666666666666970E-02 -0.140229336714165002E-03 - -0.138286858201173995E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.319837098823795019E-01 0.000000000000000000E+00 -0.161423313471231993E-03 - -0.161424178904679989E-03 -0.666666666666666970E-02 -0.666666666666666970E-02 - -0.435069384800325018E-03 -0.435069399489064988E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.333333333333332982E-01 - -0.166666666666667011E-01 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.253136149492911994E-01 -0.435016112049640979E-03 -0.435083691563150992E-03 - -0.666666666666666970E-02 -0.138320998722293008E-03 -0.140195196193045990E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.312154838512909011E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - -0.666666666666666970E-02 -0.161423744465432012E-03 -0.161423747910479997E-03 - -0.161423311121030010E-03 -0.161424181254881999E-03 -0.161424181258488001E-03 - -0.161423311117424008E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - -0.435082183658245017E-03 -0.435056882388395007E-03 -0.435077576153311009E-03 - -0.435061388362830001E-03 -0.138286177937990999E-03 -0.140230016977347998E-03 - 0.000000000000000000E+00 0.420359769758308008E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.434988625092396007E-03 -0.435112371917261017E-03 - -0.666666666666666970E-02 -0.138123103593930992E-03 -0.140393091321408005E-03 - -0.166666666666667011E-01 -0.166666666666667011E-01 0.133333333333332995E-01 - 0.000000000000000000E+00 -0.666666666666666970E-02 -0.666666666666666970E-02 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.254927196052617995E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.161423391697327001E-03 - -0.161424100678585008E-03 0.000000000000000000E+00 0.000000000000000000E+00 - -0.666666666666666970E-02 -0.429493592762821984E-03 -0.440767063383173984E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.000000000000000000E+00 - 0.251692612175913009E-01 0.000000000000000000E+00 -0.161423746187955991E-03 - -0.161423746187955991E-03 -0.666666666666666970E-02 -0.440899213583507998E-03 - -0.429390539341747025E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 
0.318962392046409984E-01 -0.363404809254292002E-03 -0.507297969422810992E-03 - -0.666666666666666970E-02 -0.666666666666666970E-02 -0.135240574414478994E-03 - -0.143275620500860003E-03 -0.166666666666667011E-01 0.333333333333332982E-01 - -0.166666666666667011E-01 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.322093262366704020E-01 0.000000000000000000E+00 - -0.435187611870158021E-03 -0.434916457449104998E-03 -0.666666666666666970E-02 - -0.666666666666666970E-02 -0.138287300795975000E-03 -0.140228894119363997E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.286700106542116008E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.161423291382795011E-03 - -0.161424200993116998E-03 -0.161423748962132008E-03 -0.161423743413780001E-03 - -0.161423291392504990E-03 -0.161424200983406992E-03 -0.666666666666666970E-02 - -0.435081950459870019E-03 -0.435057110654062009E-03 -0.138300165059673992E-03 - -0.140216029855665006E-03 -0.435072720707763974E-03 -0.435066136833905980E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.000000000000000000E+00 - 0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01 - 0.238543465439772992E-01 -0.666666666666666970E-02 -0.166666666666667011E-01 - 0.419834073815130021E-01 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.161423305233497009E-03 - -0.161424187142415000E-03 -0.666666666666666970E-02 0.000000000000000000E+00 - -0.435069380325788988E-03 -0.435069404064516009E-03 -0.166666666666667011E-01 - -0.166666666666667011E-01 0.133333333333332995E-01 -0.666666666666666970E-02 - -0.666666666666666970E-02 0.000000000000000000E+00 0.256582188293747009E-01 - 0.000000000000000000E+00 -0.434980686274821977E-03 -0.435120632764312025E-03 - 0.000000000000000000E+00 -0.666666666666666970E-02 -0.138286845679548006E-03 - -0.140229349235790991E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.256868342262811002E-01 0.000000000000000000E+00 -0.434980627630137003E-03 - -0.435120692364288997E-03 -0.666666666666666970E-02 -0.140229339372730996E-03 - -0.138286855542608001E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.256614658944893000E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.666666666666666970E-02 -0.435012511657230014E-03 - -0.435087446025792977E-03 0.000000000000000000E+00 0.000000000000000000E+00 - -0.138321094746602009E-03 -0.140195100168737991E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.320996483486796005E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.161423313210778008E-03 -0.161424179165134001E-03 - -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435068950004979999E-03 - -0.435069844082024008E-03 -0.166666666666667011E-01 0.333333333333332982E-01 - -0.166666666666667011E-01 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.257759423107052008E-01 0.000000000000000000E+00 - -0.666666666666666970E-02 -0.435016296291419009E-03 -0.435083499433781991E-03 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.138321121808393998E-03 - -0.140195073106944999E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 
0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.320996585742336984E-01 0.000000000000000000E+00 -0.161423313488731992E-03 - -0.161424178887179990E-03 -0.666666666666666970E-02 -0.666666666666666970E-02 - -0.435069291046625011E-03 -0.435069495355377978E-03 -0.166666666666667011E-01 - 0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01 - 0.255844281626146988E-01 0.000000000000000000E+00 0.000000000000000000E+00 - -0.666666666666666970E-02 -0.327328574610849998E-03 -0.542038187704011951E-03 - 0.000000000000000000E+00 -0.166666666666667011E-01 -0.138213770503833004E-03 - -0.140302424411505993E-03 0.000000000000000000E+00 0.321601835856102983E-01 - 0.000000000000000000E+00 -0.666666666666666970E-02 -0.666666666666666970E-02 - -0.946023952620347994E-04 -0.593079027126956976E-03 -0.133620312350716999E-03 - -0.725519910394541970E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.257239980440517012E-01 0.000000000000000000E+00 - 0.196968326301319991E-04 -0.707378255019124025E-03 0.000000000000000000E+00 - -0.666666666666666970E-02 -0.166666666666667011E-01 -0.458132642260517973E-03 - -0.413595834475461983E-03 0.000000000000000000E+00 0.333333333333332982E-01 - -0.166666666666667011E-01 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.423184714573687989E-01 0.000000000000000000E+00 -0.434738589247653001E-03 - -0.435372752845944011E-03 -0.666666666666666970E-02 0.000000000000000000E+00 - -0.138688174968105987E-03 -0.139828019947233010E-03 -0.166666666666667011E-01 - -0.166666666666667011E-01 0.000000000000000000E+00 0.133333333333332995E-01 - -0.666666666666666970E-02 -0.666666666666666970E-02 0.000000000000000000E+00 - 0.251897402149190985E-01 0.000000000000000000E+00 -0.646688794373514043E-04 - -0.623012542951639965E-03 -0.666666666666666970E-02 -0.137976511090806987E-03 - -0.140539683824532011E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.319702861190497012E-01 0.000000000000000000E+00 -0.161422661148781000E-03 - -0.161424831227131009E-03 -0.666666666666666970E-02 -0.666666666666666970E-02 - -0.425027871300158022E-03 -0.445328608699814012E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.333333333333332982E-01 - -0.166666666666667011E-01 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.253132418654096990E-01 -0.429884568296530988E-03 -0.440422631514232989E-03 - -0.666666666666666970E-02 -0.138411741521856003E-03 -0.140104453393482994E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.310595796671125010E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - -0.666666666666666970E-02 -0.161430908664596994E-03 -0.161416583711314988E-03 - -0.161367227947133000E-03 -0.161480264428779009E-03 -0.161480354961919997E-03 - -0.161367137413992012E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - -0.438392776826464984E-03 -0.431826121382375026E-03 -0.436721410041992001E-03 - -0.433458024629497019E-03 -0.139062002018278005E-03 -0.139454192897061995E-03 - 0.000000000000000000E+00 0.421404524821471990E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.402803578099727999E-03 -0.467868848878249025E-03 - -0.666666666666666970E-02 0.000000000000000000E+00 0.227910657339421994E-03 - -0.900841911728360008E-03 -0.166666666666667011E-01 -0.166666666666667011E-01 - 
0.133333333333332995E-01 0.000000000000000000E+00 -0.666666666666666970E-02 - -0.666666666666666970E-02 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.256049262714956004E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.340826630558288979E-03 - -0.529260041764853008E-03 0.000000000000000000E+00 -0.666666666666666970E-02 - -0.138562349500047995E-03 -0.139953845415291002E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.253040215793172013E-01 0.000000000000000000E+00 - -0.246116548137240024E-03 -0.598443155917590948E-03 -0.666666666666666970E-02 - -0.140073789192540987E-03 -0.138442405722798010E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.319703071004562994E-01 -0.161422660702825987E-03 - -0.161424831673085995E-03 -0.666666666666666970E-02 -0.666666666666666970E-02 - -0.426922177757430997E-03 -0.443395080957603976E-03 -0.166666666666667011E-01 - 0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.320998407804300029E-01 - 0.000000000000000000E+00 -0.161423318608057989E-03 -0.161424173767853993E-03 - -0.666666666666666970E-02 -0.666666666666666970E-02 -0.432673706188636982E-03 - -0.437518364259369976E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.295486137458738987E-01 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - -0.435073936139989976E-03 -0.435025467270096005E-03 -0.435055941094787983E-03 - -0.435042723373920998E-03 -0.435014132966001985E-03 -0.435085755332848012E-03 - -0.666666666666666970E-02 -0.138321121970870991E-03 -0.140195072944468007E-03 - -0.138900280797982998E-03 -0.139615914117355999E-03 0.000000000000000000E+00 - -0.138286771859713999E-03 -0.140229423055624998E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.333333333333332982E-01 - -0.166666666666667011E-01 -0.166666666666667011E-01 0.238543465439772992E-01 - 0.000000000000000000E+00 -0.666666666666666970E-02 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.420929803055049012E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.435014930179625984E-03 -0.435084924009661019E-03 - -0.666666666666666970E-02 0.000000000000000000E+00 -0.138286695318050990E-03 - -0.140229499597288007E-03 -0.166666666666667011E-01 -0.166666666666667011E-01 - 0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02 - 0.000000000000000000E+00 0.257458286284305005E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.435049187797943010E-03 - -0.435049199850251001E-03 0.000000000000000000E+00 -0.666666666666666970E-02 - -0.139258097008661995E-03 -0.139258097906677002E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.255775441079788002E-01 - 0.000000000000000000E+00 -0.161423313317997011E-03 -0.161424179057914998E-03 - -0.666666666666666970E-02 -0.435069409759835009E-03 -0.435069374755894995E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.257406249620084002E-01 - 0.000000000000000000E+00 -0.666666666666666970E-02 -0.435042723384703009E-03 - -0.435055941080233978E-03 0.000000000000000000E+00 -0.138910497854425013E-03 - -0.139605697060914011E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 
0.320997355336692033E-01 0.000000000000000000E+00 -0.161423315554604991E-03 - -0.161424176821307994E-03 -0.666666666666666970E-02 -0.666666666666666970E-02 - -0.435069370488565992E-03 -0.435069414123279022E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.333333333333332982E-01 -0.166666666666667011E-01 - -0.166666666666667011E-01 0.000000000000000000E+00 0.256633754959206990E-01 - 0.000000000000000000E+00 -0.666666666666666970E-02 -0.161423313307973996E-03 - -0.161424179067938013E-03 0.000000000000000000E+00 -0.435085186467014016E-03 - -0.435053945729502005E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.322092437021506001E-01 - -0.435014144601562013E-03 -0.435085743160859985E-03 -0.666666666666666970E-02 - -0.666666666666666970E-02 -0.138287341133016000E-03 -0.140228853782322998E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.333333333333332982E-01 - -0.166666666666667011E-01 -0.166666666666667011E-01 0.254980067883627988E-01 - 0.000000000000000000E+00 -0.666666666666666970E-02 -0.161506286311511997E-03 - -0.161341206064400012E-03 -0.166666666666667011E-01 -0.431364756974581974E-03 - -0.438866309542754976E-03 0.000000000000000000E+00 0.319866973588324027E-01 - 0.000000000000000000E+00 -0.666666666666666970E-02 -0.666666666666666970E-02 - -0.161423386153582989E-03 -0.161424106222328993E-03 -0.435249329951151978E-03 - -0.434893408363733994E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.256606789218085003E-01 - 0.000000000000000000E+00 -0.434963065954236988E-03 -0.435139005528701976E-03 - 0.000000000000000000E+00 -0.666666666666666970E-02 -0.166666666666667011E-01 - -0.138291831741704991E-03 -0.140224363173634006E-03 0.333333333333332982E-01 - -0.166666666666667011E-01 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.419833492947222983E-01 0.000000000000000000E+00 - -0.161423303638426010E-03 -0.161424188737485999E-03 -0.666666666666666970E-02 - 0.000000000000000000E+00 -0.435072120349480976E-03 -0.435066723923213975E-03 - -0.166666666666667011E-01 -0.166666666666667011E-01 0.133333333333332995E-01 - -0.666666666666666970E-02 -0.666666666666666970E-02 0.000000000000000000E+00 - 0.253728413941171002E-01 0.000000000000000000E+00 -0.434979152196512009E-03 - -0.435122229395894016E-03 -0.666666666666666970E-02 -0.140221824647693988E-03 - -0.138294370267645010E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.319851023354644978E-01 0.000000000000000000E+00 -0.161423349940120991E-03 - -0.161424142435790991E-03 -0.666666666666666970E-02 -0.666666666666666970E-02 - -0.435344407954329007E-03 -0.434800430411269015E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.333333333333332982E-01 - -0.166666666666667011E-01 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.251600082854132011E-01 -0.161430149009246000E-03 -0.161417343366666009E-03 - -0.666666666666666970E-02 -0.432078300694966016E-03 -0.438133229980573974E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.320887275245489018E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 
0.000000000000000000E+00 0.000000000000000000E+00 -0.666666666666666970E-02 - -0.435111004034048989E-03 -0.434989920105145983E-03 -0.435014834120551011E-03 - -0.435085024172310027E-03 -0.435084968865059006E-03 -0.435014887158174024E-03 - -0.166666666666667011E-01 -0.138286802031117000E-03 -0.140229392884223000E-03 - -0.138288034558563993E-03 -0.140228160356775005E-03 -0.138900326717328012E-03 - -0.139615868198011988E-03 0.000000000000000000E+00 0.419834068202609018E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.161423305219030012E-03 - -0.161424187156881997E-03 -0.666666666666666970E-02 0.000000000000000000E+00 - -0.435073349340887991E-03 -0.435065522020417015E-03 -0.166666666666667011E-01 - -0.166666666666667011E-01 0.133333333333332995E-01 -0.666666666666666970E-02 - -0.666666666666666970E-02 0.000000000000000000E+00 0.000000000000000000E+00 - 0.256004806855422984E-01 0.000000000000000000E+00 -0.434981132067385973E-03 - -0.435120167645717019E-03 0.000000000000000000E+00 -0.666666666666666970E-02 - -0.138288861702206010E-03 -0.140227333213132987E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.253696619275626985E-01 0.000000000000000000E+00 - -0.434980119724182024E-03 -0.435121221827560020E-03 -0.666666666666666970E-02 - -0.140228210971901000E-03 -0.138287983943439000E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.319694542067313001E-01 -0.161422588303278990E-03 - -0.161424904072633994E-03 -0.666666666666666970E-02 -0.666666666666666970E-02 - -0.435109649626736015E-03 -0.435030036925322975E-03 -0.166666666666667011E-01 - 0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.320971379137156981E-01 0.000000000000000000E+00 -0.161423238729728995E-03 - -0.161424253646182987E-03 -0.666666666666666970E-02 -0.666666666666666970E-02 - -0.435144484966556002E-03 -0.434995958682276988E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.296734541379750985E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - -0.434149030098187011E-03 -0.435987412204495999E-03 -0.435031452984650986E-03 - -0.435067691864950016E-03 -0.434950548419927988E-03 -0.435152046333653016E-03 - -0.666666666666666970E-02 -0.138392945863299996E-03 -0.140123249052040005E-03 - -0.138921815209089002E-03 -0.139594379706249995E-03 0.000000000000000000E+00 - -0.138356769775533993E-03 -0.140159425139805004E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.333333333333332982E-01 -0.166666666666667011E-01 - -0.166666666666667011E-01 0.238543465439772992E-01 0.000000000000000000E+00 - -0.666666666666666970E-02 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.420931571567067994E-01 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.435014671747375992E-03 - -0.435085193476965012E-03 -0.666666666666666970E-02 0.000000000000000000E+00 - -0.138287057706326004E-03 -0.140229137209012993E-03 -0.166666666666667011E-01 - -0.166666666666667011E-01 0.133333333333332995E-01 -0.666666666666666970E-02 - -0.666666666666666970E-02 0.000000000000000000E+00 0.257455695212117017E-01 - 0.000000000000000000E+00 -0.435047022783618996E-03 -0.435051457582689993E-03 - 0.000000000000000000E+00 -0.666666666666666970E-02 -0.139258438339454998E-03 - -0.139257756575883999E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 
0.255770773094191987E-01 0.000000000000000000E+00 -0.161423300794449003E-03 - -0.161424191581463006E-03 -0.666666666666666970E-02 -0.435056164326672024E-03 - -0.435082918027994002E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.257823402544687004E-01 0.000000000000000000E+00 -0.666666666666666970E-02 - -0.435067703423035017E-03 -0.435031441124867973E-03 0.000000000000000000E+00 - -0.138940670377310000E-03 -0.139575524538028997E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.320998367437754992E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.161423318157696995E-03 - -0.161424174218214987E-03 -0.666666666666666970E-02 -0.666666666666666970E-02 - -0.435046243704917976E-03 -0.435093061865261994E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.333333333333332982E-01 -0.166666666666667011E-01 - -0.166666666666667011E-01 0.000000000000000000E+00 0.256631852001646017E-01 - 0.000000000000000000E+00 -0.666666666666666970E-02 -0.161423307528138997E-03 - -0.161424184847773012E-03 0.000000000000000000E+00 -0.438796404237050007E-03 - -0.431424781290872027E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.323103002924175972E-01 - 0.000000000000000000E+00 -0.434856382728898011E-03 -0.435250190151687997E-03 - -0.666666666666666970E-02 -0.666666666666666970E-02 -0.138421511330616989E-03 - -0.140094683584722008E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01 - 0.255452807930411988E-01 0.000000000000000000E+00 -0.666666666666666970E-02 - -0.161423207454343989E-03 -0.161424284921568996E-03 -0.166666666666667011E-01 - -0.404324480174985020E-03 -0.466398849792140982E-03 0.000000000000000000E+00 - 0.318949373068843009E-01 0.000000000000000000E+00 -0.666666666666666970E-02 - -0.666666666666666970E-02 -0.323838290351469984E-03 -0.544927583290811006E-03 - -0.135274543056306012E-03 -0.143241651859033012E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.255622073519536988E-01 0.000000000000000000E+00 -0.114452729937741999E-03 - -0.573228692451250047E-03 0.000000000000000000E+00 -0.666666666666666970E-02 - -0.166666666666667011E-01 0.218484783290129011E-03 -0.891416037679068000E-03 - 0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.421663414565641992E-01 - 0.000000000000000000E+00 0.687960644515914959E-05 -0.694561028834151002E-03 - -0.666666666666666970E-02 0.000000000000000000E+00 -0.438096223299876001E-03 - -0.432167718169455990E-03 -0.166666666666667011E-01 -0.166666666666667011E-01 - 0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02 - 0.000000000000000000E+00 0.255893861003781004E-01 0.000000000000000000E+00 - -0.330656818749315009E-03 -0.535638411162091960E-03 -0.666666666666666970E-02 - -0.139701772394438010E-03 -0.138814422520901990E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.319824507944573982E-01 0.000000000000000000E+00 - -0.161423276715586998E-03 -0.161424215660325011E-03 -0.666666666666666970E-02 - -0.666666666666666970E-02 -0.453058027683075014E-03 -0.417478867706813021E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.000000000000000000E+00 - 
0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.252771264600550986E-01 -0.434642415203527976E-03 - -0.435473406290417023E-03 -0.666666666666666970E-02 -0.142683907585935987E-03 - -0.135832287329404013E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.323033163369174023E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.666666666666666970E-02 - -0.435248914259523019E-03 -0.434857618105281016E-03 -0.434883815526439026E-03 - -0.435221605059342973E-03 -0.435144012463656023E-03 -0.434958245914109976E-03 - -0.166666666666667011E-01 -0.138392691801889008E-03 -0.140123503113449990E-03 - -0.138936896109547012E-03 -0.139579298805792012E-03 -0.138396653973258001E-03 - -0.140119540942080996E-03 0.000000000000000000E+00 0.419134531555254014E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.160003730040957000E-03 - -0.162843762334955009E-03 -0.666666666666666970E-02 0.000000000000000000E+00 - -0.439266035090063022E-03 -0.430980363867674026E-03 -0.166666666666667011E-01 - -0.166666666666667011E-01 0.133333333333332995E-01 -0.666666666666666970E-02 - -0.666666666666666970E-02 0.000000000000000000E+00 0.000000000000000000E+00 - 0.256392673957523999E-01 0.000000000000000000E+00 -0.434777562142257996E-03 - -0.435332391625641989E-03 0.000000000000000000E+00 -0.666666666666666970E-02 - -0.138409864927514006E-03 -0.140106329987824992E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.254101279172769014E-01 0.000000000000000000E+00 - -0.373644623806981997E-03 -0.492026351519330037E-03 -0.666666666666666970E-02 - -0.908412435934918957E-03 0.235481181545980998E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.321268372004474001E-01 -0.200001267706564000E-03 - -0.627372315602944992E-03 -0.666666666666666970E-02 -0.666666666666666970E-02 - -0.127171097759715991E-03 -0.545760156629222998E-03 -0.166666666666667011E-01 - 0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.320995602660988991E-01 0.000000000000000000E+00 -0.161423310639775001E-03 - -0.161424181736137008E-03 -0.666666666666666970E-02 -0.666666666666666970E-02 - -0.437288207604906009E-03 -0.432899576991382993E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.296377168165362984E-01 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - -0.434989920176705983E-03 -0.435111003938864983E-03 -0.435084786849818019E-03 - -0.435015061692732000E-03 -0.435015088357717974E-03 -0.435084759043768022E-03 - -0.666666666666666970E-02 -0.138910479291025010E-03 -0.139605715624313987E-03 - -0.138288220944727007E-03 -0.140227973970611991E-03 -0.138286985838064012E-03 - -0.140229209077275013E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.333333333333332982E-01 -0.166666666666667011E-01 - -0.166666666666667011E-01 0.238543465439772992E-01 0.000000000000000000E+00 - -0.666666666666666970E-02 -0.166666666666667011E-01 0.000000000000000000E+00 - 
0.419834073428042021E-01 0.000000000000000000E+00 0.000000000000000000E+00 - -0.161423305232445008E-03 -0.161424187143467001E-03 -0.666666666666666970E-02 - 0.000000000000000000E+00 -0.435069384775871976E-03 -0.435069399514123015E-03 - -0.166666666666667011E-01 -0.166666666666667011E-01 0.133333333333332995E-01 - -0.666666666666666970E-02 -0.666666666666666970E-02 0.000000000000000000E+00 - 0.256582244991315003E-01 0.000000000000000000E+00 0.000000000000000000E+00 - -0.434980660285921980E-03 -0.435120659862830007E-03 0.000000000000000000E+00 - -0.666666666666666970E-02 -0.138286857240759006E-03 -0.140229337674580994E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.256868355803533989E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - -0.434980641844436011E-03 -0.435120677541527977E-03 -0.666666666666666970E-02 - -0.140229336589824008E-03 -0.138286858325514989E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.255642317604596984E-01 - 0.000000000000000000E+00 -0.666666666666666970E-02 -0.161423743725482000E-03 - -0.161423748650431012E-03 0.000000000000000000E+00 0.000000000000000000E+00 - -0.435032785443368025E-03 -0.435106823406514024E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.320996548652853000E-01 - -0.161423313389232997E-03 -0.161424178986679012E-03 -0.666666666666666970E-02 - -0.666666666666666970E-02 -0.435069349700356026E-03 -0.435069435379967977E-03 - -0.166666666666667011E-01 0.333333333333332982E-01 -0.166666666666667011E-01 - -0.166666666666667011E-01 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.256786747395171995E-01 0.000000000000000000E+00 - -0.666666666666666970E-02 -0.161423743712989009E-03 -0.161423748662923000E-03 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.435032864937970982E-03 - -0.435106742120615006E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.320996543404327983E-01 - -0.161423313375148994E-03 -0.161424179000762988E-03 -0.666666666666666970E-02 - -0.666666666666666970E-02 -0.435069389550788983E-03 -0.435069394631554978E-03 - -0.166666666666667011E-01 0.333333333333332982E-01 -0.166666666666667011E-01 - -0.166666666666667011E-01 0.256124534411805993E-01 0.000000000000000000E+00 - -0.666666666666666970E-02 -0.422563580132133995E-03 -0.447998624216019981E-03 - -0.166666666666667011E-01 -0.138286150728846990E-03 -0.140230044186492007E-03 - 0.000000000000000000E+00 0.319552376669786989E-01 0.000000000000000000E+00 - -0.666666666666666970E-02 -0.666666666666666970E-02 -0.161419817025352993E-03 - -0.161427675350558989E-03 -0.434132277678501012E-03 -0.436028351482909977E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.256464487088938001E-01 0.000000000000000000E+00 - -0.434899896421036018E-03 -0.435204881377557976E-03 0.000000000000000000E+00 - -0.666666666666666970E-02 -0.166666666666667011E-01 -0.138261308742496992E-03 - -0.140254886172842006E-03 0.333333333333332982E-01 -0.166666666666667011E-01 - -0.166666666666667011E-01 0.000000000000000000E+00 0.000000000000000000E+00 - 0.419830852158453033E-01 0.000000000000000000E+00 -0.161423296248439004E-03 - -0.161424196127473005E-03 -0.666666666666666970E-02 0.000000000000000000E+00 - 
-0.435080307367357005E-03 -0.435058717463834973E-03 -0.166666666666667011E-01 - -0.166666666666667011E-01 0.133333333333332995E-01 -0.666666666666666970E-02 - -0.666666666666666970E-02 0.000000000000000000E+00 0.253594622355668986E-01 - 0.000000000000000000E+00 -0.434950331860284995E-03 -0.435152283722859020E-03 - -0.666666666666666970E-02 -0.140249996876446007E-03 -0.138266198038893993E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.319571786575690020E-01 - 0.000000000000000000E+00 -0.161420475113086009E-03 -0.161427017262827003E-03 - -0.666666666666666970E-02 -0.666666666666666970E-02 -0.434308830151222019E-03 - -0.435847642640102974E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.333333333333332982E-01 -0.166666666666667011E-01 - -0.166666666666667011E-01 0.000000000000000000E+00 0.252660555352819012E-01 - -0.423620518084152984E-03 -0.446908164705096009E-03 -0.666666666666666970E-02 - -0.138290733115794992E-03 -0.140225461799544005E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.320831749900574031E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.666666666666666970E-02 -0.434897829232994994E-03 - -0.435207041701621983E-03 -0.435182072257829001E-03 -0.434921773469817985E-03 - -0.434970483470178027E-03 -0.435131276171263001E-03 -0.166666666666667011E-01 - -0.138283460799556990E-03 -0.140232734115782007E-03 -0.138290545167624998E-03 - -0.140225649747715002E-03 -0.138898900526048002E-03 -0.139617294389290995E-03 - 0.000000000000000000E+00 0.421628681453787013E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.397188434436698007E-04 -0.727400265832661014E-03 - -0.666666666666666970E-02 0.000000000000000000E+00 -0.435230978617207986E-03 - -0.434916855000890001E-03 -0.166666666666667011E-01 -0.166666666666667011E-01 - 0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02 - 0.000000000000000000E+00 0.000000000000000000E+00 0.256973883757283007E-01 - 0.000000000000000000E+00 -0.435443117881719019E-03 -0.434671353469976026E-03 - 0.000000000000000000E+00 -0.666666666666666970E-02 -0.138672805722981989E-03 - -0.139843389192358011E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.253811407347032010E-01 0.000000000000000000E+00 -0.111660189923879004E-03 - -0.576021232465113043E-03 -0.666666666666666970E-02 -0.807402873679339953E-03 - 0.113200250901337992E-04 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.318951479564182974E-01 -0.334801618998244014E-03 -0.534674353093398985E-03 - -0.666666666666666970E-02 -0.666666666666666970E-02 -0.135236534922172994E-03 - -0.143279659993166003E-03 -0.166666666666667011E-01 0.333333333333332982E-01 - -0.166666666666667011E-01 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.320991543625473996E-01 - 0.000000000000000000E+00 -0.161423299320882996E-03 -0.161424193055029988E-03 - -0.666666666666666970E-02 -0.666666666666666970E-02 -0.439089523234416986E-03 - -0.431138190963970024E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.292550790147567998E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 
0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.161423735549563995E-03 -0.161423756826347987E-03 - -0.161424083758227992E-03 -0.161423408617683990E-03 -0.161423408591030993E-03 - -0.161424083784880988E-03 -0.666666666666666970E-02 -0.434939106055095008E-03 - -0.435202598868165004E-03 -0.434989974486818000E-03 -0.435150588304691020E-03 - -0.434898044278314978E-03 -0.435244580342165975E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.333333333333332982E-01 - -0.166666666666667011E-01 -0.166666666666667011E-01 0.238543465439772992E-01 - 0.000000000000000000E+00 -0.666666666666666970E-02 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.420929736779580982E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.435014942481458009E-03 -0.435084911182310021E-03 - -0.666666666666666970E-02 -0.138286681738491990E-03 -0.140229513176847007E-03 - -0.166666666666667011E-01 -0.166666666666667011E-01 0.133333333333332995E-01 - -0.666666666666666970E-02 -0.666666666666666970E-02 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.255502900122650013E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - -0.161423360190818011E-03 -0.161424132185093998E-03 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.666666666666666970E-02 -0.435068884552577983E-03 - -0.435069911008902977E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.252824734873780994E-01 0.000000000000000000E+00 -0.161423118220565989E-03 - -0.161424374155345993E-03 -0.666666666666666970E-02 -0.139258110569624000E-03 - -0.139258084345714997E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.255485447377079983E-01 - 0.000000000000000000E+00 -0.666666666666666970E-02 -0.161423302756024003E-03 - -0.161424189619888006E-03 0.000000000000000000E+00 0.000000000000000000E+00 - -0.434872688531394978E-03 -0.435270525342987020E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.322052497845927016E-01 -0.435023979094238992E-03 -0.435075488205511978E-03 - -0.666666666666666970E-02 -0.666666666666666970E-02 -0.138279010834734011E-03 - -0.140237184080605013E-03 -0.166666666666667011E-01 0.333333333333332982E-01 - -0.166666666666667011E-01 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.258530259445254006E-01 0.000000000000000000E+00 - -0.666666666666666970E-02 -0.434958189051660006E-03 -0.435144092590821993E-03 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.138909064929736007E-03 - -0.139607129985603993E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.320994819723091004E-01 -0.161423308725767013E-03 -0.161424183650144996E-03 - -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435080068461045984E-03 - -0.435058950989111021E-03 -0.166666666666667011E-01 0.333333333333332982E-01 - -0.166666666666667011E-01 -0.166666666666667011E-01 0.256614731138163008E-01 - 0.000000000000000000E+00 -0.666666666666666970E-02 -0.429527300766954008E-03 - -0.440793399874086025E-03 -0.166666666666667011E-01 -0.138412525549365991E-03 - -0.140103669365973006E-03 0.000000000000000000E+00 0.319693244854589026E-01 - 0.000000000000000000E+00 -0.666666666666666970E-02 -0.666666666666666970E-02 - 
- [... hunk continues: several hundred more removed lines of floating-point data in E-notation (three values per source line) from this deleted data file ...]
-0.666666666666666970E-02 -0.138421129190984010E-03 -0.140095065724355991E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.286728645834729984E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.161423315838776007E-03 -0.161424176537136002E-03 - -0.161423752187323010E-03 -0.161423740188588999E-03 -0.161423317510291011E-03 - -0.161424174865620998E-03 -0.666666666666666970E-02 -0.438771655527875001E-03 - -0.431448953116295001E-03 -0.138271241563716002E-03 -0.140244953351622995E-03 - -0.435165323893648012E-03 -0.434975574060252989E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.333333333333332982E-01 -0.166666666666667011E-01 - -0.166666666666667011E-01 0.238543465439772992E-01 -0.666666666666666970E-02 - -0.166666666666667011E-01 0.419833961506188985E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.161423304924141007E-03 -0.161424187451771002E-03 - -0.666666666666666970E-02 -0.435069598057073975E-03 -0.435069190609702994E-03 - -0.166666666666667011E-01 -0.166666666666667011E-01 0.133333333333332995E-01 - -0.666666666666666970E-02 -0.666666666666666970E-02 0.000000000000000000E+00 - 0.256582546102008015E-01 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.434978063949837974E-03 -0.435123367209186982E-03 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.666666666666666970E-02 - -0.138286924064156001E-03 -0.140229270851182996E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.256881329485436002E-01 - 0.000000000000000000E+00 -0.434979402202776018E-03 -0.435121969745410990E-03 - -0.666666666666666970E-02 -0.140226692592859009E-03 -0.138289502322480991E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.000000000000000000E+00 - 0.257149793095595991E-01 0.000000000000000000E+00 0.000000000000000000E+00 - -0.666666666666666970E-02 -0.434964716442319023E-03 -0.435137270613733019E-03 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.138403971130206005E-03 - -0.140112223785132992E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.320995089735353969E-01 0.000000000000000000E+00 0.000000000000000000E+00 - -0.161423309436031000E-03 -0.161424182939881009E-03 -0.666666666666666970E-02 - -0.666666666666666970E-02 -0.435071687413104014E-03 -0.435067147303640016E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.333333333333332982E-01 - -0.166666666666667011E-01 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.257775991854455010E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.666666666666666970E-02 - -0.434123548609677019E-03 -0.436014071331951010E-03 0.000000000000000000E+00 - -0.138328115136844999E-03 -0.140188079778493999E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.320970398516952013E-01 - -0.161423235617312989E-03 -0.161424256758598993E-03 -0.666666666666666970E-02 - -0.666666666666666970E-02 -0.435167788232856023E-03 -0.434973170725627982E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.000000000000000000E+00 - 0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01 - 0.255649837703663986E-01 0.000000000000000000E+00 -0.666666666666666970E-02 - 
-0.161423599069647001E-03 -0.161423893306266011E-03 0.000000000000000000E+00 - -0.166666666666667011E-01 -0.435941020818056012E-03 -0.434216891005145009E-03 - 0.321778215370647006E-01 0.000000000000000000E+00 -0.666666666666666970E-02 - -0.666666666666666970E-02 -0.133203828204913997E-03 -0.554477594184077968E-03 - -0.310288607973751023E-03 -0.560774180157768039E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.258307931039124007E-01 - 0.000000000000000000E+00 -0.200815667582686987E-03 -0.626132462157779008E-03 - 0.000000000000000000E+00 -0.666666666666666970E-02 -0.166666666666667011E-01 - -0.446235732743376025E-03 -0.410993106848831988E-03 0.000000000000000000E+00 - 0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.421918986255057984E-01 - 0.000000000000000000E+00 -0.435044026349501998E-03 -0.435054581088656020E-03 - -0.666666666666666970E-02 0.000000000000000000E+00 -0.138420116213701009E-03 - -0.140096078701637989E-03 -0.166666666666667011E-01 -0.166666666666667011E-01 - 0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02 - 0.000000000000000000E+00 0.251983478038565016E-01 0.000000000000000000E+00 - -0.874760871910627032E-05 -0.678933813669884995E-03 -0.666666666666666970E-02 - -0.139011566068860000E-03 -0.139504628846480000E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.319960854906845005E-01 0.000000000000000000E+00 - -0.161423534312251998E-03 -0.161423958063660011E-03 -0.666666666666666970E-02 - -0.666666666666666970E-02 -0.422822744160627974E-03 -0.447571236367950997E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.000000000000000000E+00 - 0.333333333333332982E-01 0.000000000000000000E+00 -0.166666666666667011E-01 - -0.166666666666667011E-01 0.251635090375166015E-01 -0.161499426349241996E-03 - -0.161348066026670989E-03 -0.666666666666666970E-02 -0.431483288490702005E-03 - -0.438741647350535027E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.312158150513081011E-01 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - -0.666666666666666970E-02 -0.161424178244698994E-03 -0.161423314131212988E-03 - -0.161423314059391994E-03 -0.161424178316519988E-03 -0.161423748120379993E-03 - -0.161423744255532992E-03 -0.166666666666667011E-01 -0.435052937198406996E-03 - -0.435086217674513994E-03 -0.138285143315771992E-03 -0.140231051599568009E-03 - -0.435084983208553009E-03 -0.435054143960659026E-03 0.000000000000000000E+00 - 0.420929687916430026E-01 0.000000000000000000E+00 0.000000000000000000E+00 - -0.435023352498181996E-03 -0.435076141242082002E-03 -0.666666666666666970E-02 - -0.138286660717270009E-03 -0.140229534198068988E-03 -0.166666666666667011E-01 - -0.166666666666667011E-01 0.133333333333332995E-01 -0.666666666666666970E-02 - -0.666666666666666970E-02 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.256871016294627011E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - -0.435049188710877018E-03 -0.435049198898220012E-03 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.666666666666666970E-02 -0.139258077330179004E-03 - -0.139258117585160996E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.252598187185143000E-01 0.000000000000000000E+00 - 
-0.161423313380516987E-03 -0.161424178995394995E-03 -0.666666666666666970E-02 - -0.435070189961645989E-03 -0.435068611747460024E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.320929801235530984E-01 -0.435014945837523977E-03 - -0.435084907681736004E-03 -0.666666666666666970E-02 -0.666666666666666970E-02 - -0.138286694923904008E-03 -0.140229499991435992E-03 -0.166666666666667011E-01 - 0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.320996579074415977E-01 - 0.000000000000000000E+00 -0.161423313471188002E-03 -0.161424178904724007E-03 - -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069377867833978E-03 - -0.435069406577771021E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.295485029270328992E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.435016111300222006E-03 -0.435083692344642020E-03 - -0.435055716669636018E-03 -0.435042938584860996E-03 -0.435014924590734005E-03 - -0.435084929837698995E-03 -0.666666666666666970E-02 -0.138321048357603003E-03 - -0.140195146557735994E-03 -0.138900264538850991E-03 -0.139615930376488006E-03 - -0.138286695606074989E-03 -0.140229499309265011E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.333333333333332982E-01 - -0.166666666666667011E-01 -0.166666666666667011E-01 0.238543465439772992E-01 - -0.666666666666666970E-02 -0.166666666666667011E-01 0.420929804437093005E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.435014924430680990E-03 - -0.435084930004601023E-03 -0.666666666666666970E-02 -0.138286695607513997E-03 - -0.140229499307826003E-03 -0.166666666666667011E-01 -0.166666666666667011E-01 - 0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02 - 0.000000000000000000E+00 0.257458287840863984E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.435049188263473999E-03 - -0.435049199364782998E-03 0.000000000000000000E+00 0.000000000000000000E+00 - -0.666666666666666970E-02 -0.139258097267473007E-03 -0.139258097647866993E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.000000000000000000E+00 - 0.255775463585166986E-01 0.000000000000000000E+00 -0.161423313378626003E-03 - -0.161424178997286006E-03 -0.666666666666666970E-02 -0.435069399615192016E-03 - -0.435069384676977993E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.257404832051559007E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.666666666666666970E-02 -0.435042938584863977E-03 - -0.435055716669632006E-03 0.000000000000000000E+00 0.000000000000000000E+00 - -0.138910376055990990E-03 -0.139605818859349010E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.320997361635547013E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.161423315571449996E-03 -0.161424176804462013E-03 - -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069384753956998E-03 - -0.435069399536463976E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.256633780155203009E-01 0.000000000000000000E+00 0.000000000000000000E+00 - -0.666666666666666970E-02 -0.161423313378038989E-03 -0.161424178997872993E-03 - 
0.000000000000000000E+00 -0.435069385238039011E-03 -0.435069399041488984E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.322089283932210022E-01 -0.435014924466399001E-03 -0.435084929967365997E-03 - -0.666666666666666970E-02 -0.666666666666666970E-02 -0.138286695450416002E-03 - -0.140229499464922995E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.333333333333332982E-01 -0.166666666666667011E-01 - -0.166666666666667011E-01 0.256614590292296996E-01 0.000000000000000000E+00 - -0.666666666666666970E-02 -0.435016112049732974E-03 -0.435083691563056016E-03 - -0.166666666666667011E-01 -0.138320998722292005E-03 -0.140195196193046992E-03 - 0.000000000000000000E+00 0.319837098823798002E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.666666666666666970E-02 -0.666666666666666970E-02 - -0.161423313471231993E-03 -0.161424178904679989E-03 -0.435069384798286989E-03 - -0.435069399491148988E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.256582249674734003E-01 0.000000000000000000E+00 - -0.434980660273001976E-03 -0.435120659876175994E-03 0.000000000000000000E+00 - -0.666666666666666970E-02 -0.166666666666667011E-01 -0.138286858201095011E-03 - -0.140229336714244013E-03 0.000000000000000000E+00 0.333333333333332982E-01 - 0.000000000000000000E+00 -0.166666666666667011E-01 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.419834073410084996E-01 - 0.000000000000000000E+00 -0.161423305232396002E-03 -0.161424187143516007E-03 - -0.666666666666666970E-02 0.000000000000000000E+00 -0.435069384645917010E-03 - -0.435069399647007983E-03 -0.166666666666667011E-01 -0.166666666666667011E-01 - 0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02 - 0.000000000000000000E+00 0.253691078100609993E-01 0.000000000000000000E+00 - -0.434980641243667987E-03 -0.435120678167986010E-03 -0.666666666666666970E-02 - -0.140229336714366013E-03 -0.138286858200973987E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.319837098823803970E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.161423313471231993E-03 -0.161424178904679989E-03 - -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069384793870004E-03 - -0.435069399495665991E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.333333333333332982E-01 0.000000000000000000E+00 - -0.166666666666667011E-01 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.253136149492908004E-01 -0.435016112049908018E-03 -0.435083691562873978E-03 - -0.666666666666666970E-02 -0.138320998722292005E-03 -0.140195196193046992E-03 - -0.166666666666667011E-01 0.312157364805190005E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.666666666666666970E-02 - -0.161423748574844990E-03 -0.161423743801066992E-03 -0.161423313378673003E-03 - -0.161424178997239006E-03 -0.161424178997232013E-03 -0.161423313378679996E-03 - -0.166666666666667011E-01 -0.435069383970378013E-03 -0.435069400337715023E-03 - -0.435069376957132018E-03 -0.435069407508996008E-03 -0.138284694213328010E-03 - -0.140231500702010987E-03 0.000000000000000000E+00 0.420929739826170005E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.435014932601145024E-03 - 
-0.435084921485382001E-03 -0.666666666666666970E-02 -0.138286682366655988E-03 - -0.140229512548683009E-03 -0.166666666666667011E-01 -0.166666666666667011E-01 - 0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.254915570191668016E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.161423360046659997E-03 - -0.161424132329252012E-03 0.000000000000000000E+00 0.000000000000000000E+00 - -0.666666666666666970E-02 -0.435068904052976024E-03 -0.435069891069177000E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.000000000000000000E+00 - 0.249645635933331000E-01 -0.161423119060364001E-03 -0.161424373315548008E-03 - -0.666666666666666970E-02 -0.139258110049798010E-03 -0.139258084865540987E-03 - -0.166666666666667011E-01 0.319795589129939009E-01 -0.161423181597269993E-03 - -0.161424310778641989E-03 -0.666666666666666970E-02 -0.666666666666666970E-02 - -0.435063340741964014E-03 -0.435075580375908022E-03 -0.166666666666667011E-01 - 0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.322057823008502012E-01 - 0.000000000000000000E+00 -0.435021995940365993E-03 -0.435077556208205004E-03 - -0.666666666666666970E-02 -0.666666666666666970E-02 -0.138280141008673011E-03 - -0.140236053906665986E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.286987467722891014E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.161423485461700003E-03 - -0.161424006914212006E-03 -0.161423743701962999E-03 -0.161423748673950013E-03 - -0.161423485478030988E-03 -0.161424006897880994E-03 -0.666666666666666970E-02 - -0.434818199915888975E-03 -0.435326186273355992E-03 -0.138107939683270998E-03 - -0.140408255232067999E-03 -0.434750487938305018E-03 -0.435395405799094998E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.000000000000000000E+00 - 0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01 - 0.238543465439772992E-01 -0.666666666666666970E-02 -0.166666666666667011E-01 - 0.419834078568204969E-01 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.161423305246576987E-03 - -0.161424187129334995E-03 -0.666666666666666970E-02 0.000000000000000000E+00 - -0.435069368742756982E-03 -0.435069415908642004E-03 -0.166666666666667011E-01 - -0.166666666666667011E-01 0.133333333333332995E-01 -0.666666666666666970E-02 - -0.666666666666666970E-02 0.000000000000000000E+00 0.256580957796514984E-01 - 0.000000000000000000E+00 -0.434980863637928004E-03 -0.435120447850464987E-03 - 0.000000000000000000E+00 -0.666666666666666970E-02 -0.138286593339167997E-03 - -0.140229601576171000E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.256868273049145994E-01 0.000000000000000000E+00 -0.434980612246944019E-03 - -0.435120708410805001E-03 -0.666666666666666970E-02 -0.140229353764139986E-03 - -0.138286841151199011E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.256611955309135990E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.666666666666666970E-02 - -0.435106719724830994E-03 -0.434994029732972990E-03 0.000000000000000000E+00 - 
-0.138320118014007004E-03 -0.140196076901331993E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.320999233193186018E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.161423320520808004E-03 -0.161424171855104005E-03 - -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435059778147594005E-03 - -0.435079222540635984E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.333333333333332982E-01 -0.166666666666667011E-01 - -0.166666666666667011E-01 0.000000000000000000E+00 0.000000000000000000E+00 - 0.257716863610135995E-01 0.000000000000000000E+00 0.000000000000000000E+00 - -0.666666666666666970E-02 -0.435022363973044002E-03 -0.435077172538276982E-03 - 0.000000000000000000E+00 -0.138312498857191992E-03 -0.140203696058148009E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.320996765524508010E-01 - 0.000000000000000000E+00 -0.161423313971483987E-03 -0.161424178404427995E-03 - -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435068773858323006E-03 - -0.435070024197505999E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.333333333333332982E-01 -0.166666666666667011E-01 - -0.166666666666667011E-01 0.255013626644550011E-01 -0.666666666666666970E-02 - -0.161778513994174000E-03 -0.161068978381738009E-03 -0.166666666666667011E-01 - -0.430721544928321987E-03 -0.439524018875910982E-03 0.319967451293014030E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.666666666666666970E-02 - -0.666666666666666970E-02 -0.161423540387061002E-03 -0.161423951988851007E-03 - -0.436779134539303025E-03 -0.433396918749490016E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.257463974004814994E-01 - 0.000000000000000000E+00 -0.357123790381600977E-03 -0.513334443915617971E-03 - 0.000000000000000000E+00 -0.666666666666666970E-02 -0.166666666666667011E-01 - -0.138849213053182001E-03 -0.139666981862157999E-03 0.000000000000000000E+00 - 0.333333333333332982E-01 0.000000000000000000E+00 -0.166666666666667011E-01 - -0.166666666666667011E-01 0.000000000000000000E+00 0.000000000000000000E+00 - 0.421600707339402012E-01 0.000000000000000000E+00 -0.438162222449141976E-03 - -0.432071491857211011E-03 -0.666666666666666970E-02 0.000000000000000000E+00 - -0.110310967390135005E-03 -0.562620286998803958E-03 -0.166666666666667011E-01 - -0.166666666666667011E-01 0.133333333333332995E-01 -0.666666666666666970E-02 - -0.666666666666666970E-02 0.000000000000000000E+00 0.250247683829411989E-01 - 0.000000000000000000E+00 0.312489112005970006E-04 -0.718930333589588031E-03 - -0.666666666666666970E-02 -0.143948366628396992E-03 -0.134567828286942005E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.321784029807069966E-01 - -0.131393333165138998E-03 -0.556288089223852967E-03 -0.666666666666666970E-02 - -0.666666666666666970E-02 -0.344952960577046006E-03 -0.526365968885401047E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.333333333333332982E-01 - 0.000000000000000000E+00 -0.166666666666667011E-01 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.252170877020880015E-01 -0.161423598610962993E-03 - -0.161423893764948989E-03 -0.666666666666666970E-02 0.000000000000000000E+00 - -0.435940683620558982E-03 -0.434217221098728986E-03 -0.166666666666667011E-01 - 
0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.312157364068764015E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - -0.666666666666666970E-02 -0.161423748574870008E-03 -0.161423743801043004E-03 - -0.161423313378020991E-03 -0.161424178997891994E-03 -0.161424178997885001E-03 - -0.161423313378028011E-03 -0.166666666666667011E-01 -0.435069384799240003E-03 - -0.435069399490174995E-03 -0.435069377535609984E-03 -0.435069406917481974E-03 - -0.138284694694889009E-03 -0.140231500220449988E-03 0.000000000000000000E+00 - 0.000000000000000000E+00 0.420929804454727996E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.435014924420943987E-03 -0.435084930014753980E-03 - -0.666666666666666970E-02 -0.138286695611132006E-03 -0.140229499304206991E-03 - -0.166666666666667011E-01 -0.166666666666667011E-01 0.133333333333332995E-01 - -0.666666666666666970E-02 -0.666666666666666970E-02 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.256871018618805014E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - -0.435049188236810973E-03 -0.435049199392586983E-03 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.666666666666666970E-02 -0.139258097270825008E-03 - -0.139258097644513989E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.252598186487732017E-01 0.000000000000000000E+00 - -0.161423313378620013E-03 -0.161424178997291996E-03 -0.666666666666666970E-02 - -0.435069399482861021E-03 -0.435069384806393027E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.320929801562968992E-01 -0.435014924399239995E-03 - -0.435084930037426003E-03 -0.666666666666666970E-02 -0.666666666666666970E-02 - -0.138286695018654008E-03 -0.140229499896684989E-03 -0.166666666666667011E-01 - 0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.320996579148839015E-01 - 0.000000000000000000E+00 -0.161423313471389989E-03 -0.161424178904521993E-03 - -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069384785220998E-03 - -0.435069399504509015E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.295485029252238011E-01 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - -0.435016114058515979E-03 -0.435083689468307975E-03 -0.435055716894270023E-03 - -0.435042938369449973E-03 -0.435014924446868020E-03 -0.435084929987721025E-03 - -0.666666666666666970E-02 -0.138321048328908996E-03 -0.140195146586430001E-03 - -0.138900264540856006E-03 -0.139615930374483994E-03 -0.138286695607003012E-03 - -0.140229499308336012E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01 - 0.238543465439772992E-01 -0.666666666666666970E-02 -0.166666666666667011E-01 - 0.420929804454603998E-01 0.000000000000000000E+00 0.000000000000000000E+00 - -0.435014924420999986E-03 -0.435084930014695975E-03 -0.666666666666666970E-02 - 0.000000000000000000E+00 -0.138286695611106988E-03 -0.140229499304232009E-03 - -0.166666666666667011E-01 -0.166666666666667011E-01 0.133333333333332995E-01 - 
-0.666666666666666970E-02 -0.666666666666666970E-02 0.000000000000000000E+00 - 0.257458287849481986E-01 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.435049188236881013E-03 -0.435049199392514017E-03 - 0.000000000000000000E+00 -0.666666666666666970E-02 -0.139258097270799990E-03 - -0.139258097644539007E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.255775463582476013E-01 0.000000000000000000E+00 - -0.161423313378621992E-03 -0.161424178997289990E-03 -0.666666666666666970E-02 - -0.435069399483869980E-03 -0.435069384805406023E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.257404831771072991E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.666666666666666970E-02 - -0.435042938369448998E-03 -0.435055716894270999E-03 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.138910376031616011E-03 -0.139605818883723013E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.320997361628699990E-01 - 0.000000000000000000E+00 -0.161423315571431998E-03 -0.161424176804481013E-03 - -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069384829067001E-03 - -0.435069399459660992E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.256633780151233996E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.666666666666666970E-02 - -0.161423313378029990E-03 -0.161424178997881992E-03 0.000000000000000000E+00 - -0.435069376411787974E-03 -0.435069408066628019E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.322089281582298992E-01 -0.435014924431676993E-03 -0.435084930003603991E-03 - -0.666666666666666970E-02 -0.666666666666666970E-02 -0.138286694967978995E-03 - -0.140229499947360002E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.333333333333332982E-01 -0.166666666666667011E-01 - -0.166666666666667011E-01 0.256614590292298002E-01 0.000000000000000000E+00 - -0.666666666666666970E-02 -0.435016112049635016E-03 -0.435083691563157009E-03 - -0.166666666666667011E-01 -0.138320998722293008E-03 -0.140195196193045990E-03 - 0.000000000000000000E+00 0.319837098823795019E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.666666666666666970E-02 -0.666666666666666970E-02 - -0.161423313471231993E-03 -0.161424178904679989E-03 -0.435069384800495997E-03 - -0.435069399488889998E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.256582249675054996E-01 0.000000000000000000E+00 - -0.434980660275541991E-03 -0.435120659873527993E-03 0.000000000000000000E+00 - -0.666666666666666970E-02 -0.166666666666667011E-01 -0.138286858201179010E-03 - -0.140229336714159988E-03 0.000000000000000000E+00 0.333333333333332982E-01 - 0.000000000000000000E+00 -0.166666666666667011E-01 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.419834073409666025E-01 - 0.000000000000000000E+00 -0.161423305232394999E-03 -0.161424187143517010E-03 - -0.666666666666666970E-02 0.000000000000000000E+00 -0.435069384656581982E-03 - -0.435069399636101993E-03 -0.166666666666667011E-01 -0.166666666666667011E-01 - 0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02 - 
0.000000000000000000E+00 0.253691078101523984E-01 0.000000000000000000E+00 - -0.434980641228473002E-03 -0.435120678183829998E-03 -0.666666666666666970E-02 - -0.140229336714159988E-03 -0.138286858201179010E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.319837098823795019E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.161423313471231993E-03 -0.161424178904679989E-03 - -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069384800471982E-03 - -0.435069399488914989E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.333333333333332982E-01 0.000000000000000000E+00 - -0.166666666666667011E-01 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.253136149492911994E-01 -0.435016112049635992E-03 -0.435083691563157009E-03 - -0.666666666666666970E-02 -0.138320998722293008E-03 -0.140195196193045990E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.320885629537309008E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - -0.666666666666666970E-02 -0.435084884244977990E-03 -0.435014968312467975E-03 - -0.435014924420903005E-03 -0.435084930014797023E-03 -0.435055716900959984E-03 - -0.435042938363035996E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - -0.138286695608023002E-03 -0.140229499307315995E-03 -0.138900264540568990E-03 - -0.139615930374770007E-03 -0.138287928689845009E-03 -0.140228266225493988E-03 - 0.000000000000000000E+00 0.419834073409644029E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.161423305232394999E-03 -0.161424187143517010E-03 - -0.666666666666666970E-02 -0.435069384693222974E-03 -0.435069399598635002E-03 - -0.166666666666667011E-01 -0.166666666666667011E-01 0.133333333333332995E-01 - -0.666666666666666970E-02 -0.666666666666666970E-02 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.255994980449185013E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - -0.434980660275051986E-03 -0.435120659874038977E-03 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.666666666666666970E-02 -0.138286858201180012E-03 - -0.140229336714159012E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.253691078105353005E-01 0.000000000000000000E+00 - -0.434980641228200975E-03 -0.435120678184114005E-03 -0.666666666666666970E-02 - -0.140229336713374998E-03 -0.138286858201963999E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.319837881362097023E-01 -0.161423315571430996E-03 - -0.161424176804481013E-03 -0.666666666666666970E-02 -0.666666666666666970E-02 - -0.435069384829208001E-03 -0.435069399459517986E-03 -0.166666666666667011E-01 - 0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.320996579140455027E-01 - 0.000000000000000000E+00 -0.161423313471367004E-03 -0.161424178904545006E-03 - -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069384800495021E-03 - -0.435069399488890974E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.295485029266231991E-01 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.435016113771493024E-03 - -0.435083689767614020E-03 -0.435055716901091010E-03 -0.435042938362910012E-03 - 
-0.435014924420824020E-03 -0.435084930014878989E-03 -0.666666666666666970E-02 - -0.138321048331202002E-03 -0.140195146584136995E-03 -0.138900264540522993E-03 - -0.139615930374817007E-03 -0.138286695607886989E-03 -0.140229499307452008E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.000000000000000000E+00 - 0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01 - 0.238543465439772992E-01 -0.666666666666666970E-02 -0.166666666666667011E-01 - 0.420929804454732021E-01 0.000000000000000000E+00 0.000000000000000000E+00 - -0.435014924420943012E-03 -0.435084930014755010E-03 -0.666666666666666970E-02 - -0.138286695611133009E-03 -0.140229499304205988E-03 -0.166666666666667011E-01 - -0.166666666666667011E-01 0.133333333333332995E-01 -0.666666666666666970E-02 - -0.666666666666666970E-02 0.000000000000000000E+00 0.257458287849504017E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - -0.435049188236650999E-03 -0.435049199392754005E-03 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.666666666666666970E-02 -0.139258097270826011E-03 - -0.139258097644513013E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.255775463581538985E-01 0.000000000000000000E+00 - -0.161423313378620013E-03 -0.161424178997291996E-03 -0.666666666666666970E-02 - -0.435069399482816027E-03 -0.435069384806436991E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.257404831864265007E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.666666666666666970E-02 - -0.435042938362910012E-03 -0.435055716901091010E-03 0.000000000000000000E+00 - -0.138910376039622004E-03 -0.139605818875716993E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.320997361628520966E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.161423315571430996E-03 -0.161424176804481013E-03 - -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069384829304983E-03 - -0.435069399459418022E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.333333333333332982E-01 -0.166666666666667011E-01 - -0.166666666666667011E-01 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.256633780150350016E-01 0.000000000000000000E+00 - -0.666666666666666970E-02 -0.161423313378028011E-03 -0.161424178997883998E-03 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.435069377535611014E-03 - -0.435069406917480998E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.322089281809833025E-01 -0.435014924399087014E-03 - -0.435084930037585977E-03 -0.666666666666666970E-02 -0.666666666666666970E-02 - -0.138286695014638991E-03 -0.140229499900700006E-03 -0.166666666666667011E-01 - 0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01 - 0.255642320220383010E-01 -0.666666666666666970E-02 -0.161423743718748996E-03 - -0.161423748657163013E-03 -0.166666666666667011E-01 -0.435032852654396000E-03 - -0.435106754681328976E-03 0.319837065485489985E-01 0.000000000000000000E+00 - -0.666666666666666970E-02 -0.666666666666666970E-02 -0.161423313381479000E-03 - -0.161424178994433009E-03 -0.435069384799287003E-03 -0.435069399490127019E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.256582249674894985E-01 - 0.000000000000000000E+00 -0.434980660275194016E-03 -0.435120659873890984E-03 - 
0.000000000000000000E+00 -0.666666666666666970E-02 -0.166666666666667011E-01 - -0.138286858201148001E-03 -0.140229336714191999E-03 0.000000000000000000E+00 - 0.333333333333332982E-01 0.000000000000000000E+00 -0.166666666666667011E-01 - -0.166666666666667011E-01 0.000000000000000000E+00 0.000000000000000000E+00 - 0.419834073409640976E-01 0.000000000000000000E+00 -0.161423305232394999E-03 - -0.161424187143517010E-03 -0.666666666666666970E-02 0.000000000000000000E+00 - -0.435069384687570974E-03 -0.435069399604414992E-03 -0.166666666666667011E-01 - -0.166666666666667011E-01 0.133333333333332995E-01 -0.666666666666666970E-02 - -0.666666666666666970E-02 0.000000000000000000E+00 0.253691078101366992E-01 - 0.000000000000000000E+00 -0.434980641228032002E-03 -0.435120678184291001E-03 - -0.666666666666666970E-02 -0.140229336714191999E-03 -0.138286858201148001E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.319837065485489985E-01 - 0.000000000000000000E+00 -0.161423313381479000E-03 -0.161424178994433009E-03 - -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069384799287003E-03 - -0.435069399490127019E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.333333333333332982E-01 0.000000000000000000E+00 - -0.166666666666667011E-01 -0.166666666666667011E-01 0.252163879420997002E-01 - -0.161423743718748996E-03 -0.161423748657163013E-03 -0.666666666666666970E-02 - -0.435032852654396000E-03 -0.435106754681328976E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.320887366348247965E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - -0.666666666666666970E-02 -0.435111019942127013E-03 -0.434989904848891011E-03 - -0.435014811994715992E-03 -0.435085047244578027E-03 -0.435084986246467005E-03 - -0.435014870489667008E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - -0.138286808290661993E-03 -0.140229386624677005E-03 -0.138288040456547998E-03 - -0.140228154458792002E-03 -0.138900328862363002E-03 -0.139615866052975995E-03 - 0.000000000000000000E+00 0.419834068919483008E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.161423305219605994E-03 - -0.161424187156306991E-03 -0.666666666666666970E-02 0.000000000000000000E+00 - -0.435073590544243023E-03 -0.435065286134219016E-03 -0.166666666666667011E-01 - -0.166666666666667011E-01 0.133333333333332995E-01 -0.666666666666666970E-02 - -0.666666666666666970E-02 0.000000000000000000E+00 0.255994981604824000E-01 - 0.000000000000000000E+00 -0.434980605991956010E-03 -0.435120716478292976E-03 - 0.000000000000000000E+00 -0.666666666666666970E-02 -0.138286858621097992E-03 - -0.140229336294241005E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.253691080400011010E-01 0.000000000000000000E+00 -0.434980551188209996E-03 - -0.435120772070955015E-03 -0.666666666666666970E-02 -0.140229336059168004E-03 - -0.138286858856170994E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.319837098814821016E-01 -0.161423313471205999E-03 -0.161424178904706010E-03 - -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069401084582977E-03 - -0.435069383239967001E-03 -0.166666666666667011E-01 0.333333333333332982E-01 - -0.166666666666667011E-01 -0.166666666666667011E-01 0.000000000000000000E+00 - 
0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.320996545746903006E-01 0.000000000000000000E+00 0.000000000000000000E+00 - -0.161423313381464986E-03 -0.161424178994447998E-03 -0.666666666666666970E-02 - -0.666666666666666970E-02 -0.435069392005489001E-03 -0.435069392121542001E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.292425277050150992E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.161423743718737991E-03 -0.161423748657174993E-03 - -0.161424178997885001E-03 -0.161423313378027008E-03 -0.161423313378019988E-03 - -0.161424178997891994E-03 -0.666666666666666970E-02 -0.435032852646152974E-03 - -0.435106754689757998E-03 -0.435069377535869000E-03 -0.435069406917216995E-03 - 0.000000000000000000E+00 -0.435069384798973994E-03 -0.435069399490447021E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.333333333333332982E-01 - -0.166666666666667011E-01 -0.166666666666667011E-01 0.238543465439772992E-01 - -0.666666666666666970E-02 -0.166666666666667011E-01 0.420929804212083966E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.435014924419121986E-03 - -0.435084930016657026E-03 -0.666666666666666970E-02 -0.138286695561419006E-03 - -0.140229499353920994E-03 -0.166666666666667011E-01 -0.166666666666667011E-01 - 0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02 - 0.000000000000000000E+00 0.255502626954251001E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.161423359539924989E-03 - -0.161424132835986993E-03 0.000000000000000000E+00 0.000000000000000000E+00 - -0.666666666666666970E-02 -0.435069382989675027E-03 -0.435069401340516977E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.000000000000000000E+00 - 0.252823579642372995E-01 0.000000000000000000E+00 -0.161423118848985994E-03 - -0.161424373526926991E-03 -0.666666666666666970E-02 -0.139258097691293003E-03 - -0.139258097224046997E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.255489358069172015E-01 0.000000000000000000E+00 0.000000000000000000E+00 - -0.666666666666666970E-02 -0.161423313378028011E-03 -0.161424178997883998E-03 - 0.000000000000000000E+00 -0.435069377535374983E-03 -0.435069406917722016E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.322089281809831013E-01 - 0.000000000000000000E+00 -0.435014924399098994E-03 -0.435084930037573996E-03 - -0.666666666666666970E-02 -0.666666666666666970E-02 -0.138286695014637988E-03 - -0.140229499900701009E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.333333333333332982E-01 -0.166666666666667011E-01 - -0.166666666666667011E-01 0.000000000000000000E+00 0.258549253945415010E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.666666666666666970E-02 - -0.435042938363035996E-03 -0.435055716900959008E-03 0.000000000000000000E+00 - -0.138910376039619998E-03 -0.139605818875718999E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.320997361628526032E-01 -0.161423315571430996E-03 -0.161424176804481013E-03 - -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069384829208001E-03 - -0.435069399459517010E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 
0.000000000000000000E+00 0.333333333333332982E-01 -0.166666666666667011E-01 - -0.166666666666667011E-01 0.255489358069375012E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.666666666666666970E-02 -0.161423313378028011E-03 - -0.161424178997883998E-03 0.000000000000000000E+00 0.000000000000000000E+00 - -0.166666666666667011E-01 -0.435069377571737986E-03 -0.435069406880540007E-03 - 0.000000000000000000E+00 0.320929801553170024E-01 0.000000000000000000E+00 - -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435014924399170009E-03 - -0.435084930037499024E-03 -0.138286695016646012E-03 -0.140229499898693013E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.255502626954253013E-01 - 0.000000000000000000E+00 -0.161423359539924989E-03 -0.161424132835986993E-03 - 0.000000000000000000E+00 -0.666666666666666970E-02 -0.166666666666667011E-01 - -0.435069382989559993E-03 -0.435069401340634017E-03 0.000000000000000000E+00 - 0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 0.420929804212068007E-01 - 0.000000000000000000E+00 -0.435014924417415994E-03 -0.435084930018437015E-03 - -0.666666666666666970E-02 0.000000000000000000E+00 -0.138286695561417000E-03 - -0.140229499353921997E-03 -0.166666666666667011E-01 -0.166666666666667011E-01 - 0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02 - 0.000000000000000000E+00 0.249646302548570016E-01 0.000000000000000000E+00 - -0.161423118848985994E-03 -0.161424373526926991E-03 -0.666666666666666970E-02 - -0.139258097691296012E-03 -0.139258097224043988E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.319837881362063023E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.161423315571430996E-03 -0.161424176804481013E-03 - -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069384829284004E-03 - -0.435069399459439977E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.333333333333332982E-01 0.000000000000000000E+00 0.000000000000000000E+00 - -0.166666666666667011E-01 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.253926391078276997E-01 -0.435042938363074973E-03 - -0.435055716900919001E-03 -0.666666666666666970E-02 -0.138910376047635993E-03 - -0.139605818867703004E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.323033163368178014E-01 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - -0.666666666666666970E-02 -0.435248914087700014E-03 -0.434857618270111026E-03 - -0.434883815525587006E-03 -0.435221605060230013E-03 -0.435144012464388997E-03 - -0.434958245913406004E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - -0.138392691801862987E-03 -0.140123503113476010E-03 -0.138936896109538013E-03 - -0.139579298805801011E-03 -0.138396653972959005E-03 -0.140119540942379992E-03 - 0.000000000000000000E+00 0.419134531554943013E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.160003730031608006E-03 - -0.162843762344304003E-03 -0.666666666666666970E-02 0.000000000000000000E+00 - -0.439266035089253990E-03 -0.430980363868474004E-03 -0.166666666666667011E-01 - -0.166666666666667011E-01 0.133333333333332995E-01 -0.666666666666666970E-02 - -0.666666666666666970E-02 0.000000000000000000E+00 0.256392674343047001E-01 - 
- [large removed-data hunk: thousands of floating-point coefficient values from an example data file, elided]
-0.412518743024167011E-11 -0.164604700854507999E-03 -0.608980634819725018E-09 - 0.822717779878246944E-02 -0.164369437450745994E-03 0.822719371500816969E-02 - -0.393313770364309007E-09 -0.164344328728707013E-03 -0.666666666666666970E-02 - -0.391103618326530975E-09 0.822706248393468946E-02 -0.435044237783256996E-03 - -0.666666666666666970E-02 0.921700053762436987E-02 -0.671567082152385998E-03 - -0.435054149489181002E-03 -0.622876302108100002E-03 0.926586962058313993E-02 - -0.162059379398541000E-03 -0.666666666666666970E-02 0.755723329728289969E-02 - -0.401928043513828996E-11 -0.164612118388109011E-03 -0.594261933530200978E-09 - 0.822719391358646933E-02 -0.666666666666666970E-02 -0.162062495727046009E-03 - 0.755721772845492981E-02 -0.413473254943709991E-11 -0.164605146051557987E-03 - -0.610342338260452991E-09 0.822717719432444067E-02 -0.164360315553491003E-03 - 0.822717700213156967E-02 -0.404232600081692006E-09 -0.164358887835959998E-03 - -0.666666666666666970E-02 -0.400997831179622018E-09 0.822696488974702066E-02 - -0.435041264130012022E-03 -0.666666666666666970E-02 0.921687154445821957E-02 - -0.671511051258173990E-03 -0.435057122950393985E-03 -0.622839088072463958E-03 - 0.926582874613315031E-02 -0.666666666666666970E-02 0.927696108771089954E-02 - -0.632627917074633003E-03 -0.671545420285009982E-03 0.921776198086651961E-02 - 0.922026584017879985E-02 -0.602408843182564049E-03 -0.666666666666666970E-02 - -0.556545745722415003E-03 0.881922185436606956E-02 -0.163017068240428012E-03 - -0.666666666666666970E-02 0.795594799175567031E-02 -0.273445423603751999E-11 - -0.164269850461446013E-03 -0.212776812668013003E-09 0.822716866999695026E-02 - -0.666666666666666970E-02 -0.435073828677985024E-03 0.926998697001005928E-02 - -0.626775693155938018E-03 -0.435024555678502995E-03 -0.674658125777386950E-03 - 0.922087193677081918E-02 -0.317388619046037024E-03 0.000000000000000000E+00 - -0.666666666666666970E-02 0.807555110066160951E-02 -0.161231565052261995E-03 - -0.506658534832258992E-03 -0.311775039000433988E-03 0.895372823636290081E-02 - -0.666666666666666970E-02 -0.163033789441224992E-03 0.757712252412545001E-02 - -0.230784432560672012E-04 0.000000000000000000E+00 -0.666666666666666970E-02 - -0.528851957602917964E-03 -0.189448307782970002E-03 0.832713693082491081E-02 - -0.666666666666666970E-02 -0.161988418356935989E-03 0.755983351837515982E-02 - -0.201738757027985980E-11 -0.164212922403823006E-03 -0.300296756336277975E-09 - 0.822710828992056012E-02 0.921859794240553060E-02 -0.671461758481382994E-03 - -0.666666666666666970E-02 -0.645287749704886960E-03 0.929336169606828980E-02 - -0.666666666666666970E-02 -0.192231218611419992E-03 0.825469369225220045E-02 - -0.211366001320515989E-06 -0.166616012215181007E-03 -0.134857476054616004E-07 - 0.822777129012250072E-02 -0.666666666666666970E-02 0.902089049785161946E-02 - -0.621512175397749989E-03 -0.611398199404824985E-03 0.924071304541716938E-02 - -0.162013975253551008E-03 -0.666666666666666970E-02 0.758489435794543979E-02 - -0.176583046930321999E-05 -0.529428370426132016E-03 -0.381593418223961990E-04 - 0.875176249380833077E-02 -0.666666666666666970E-02 -0.435040249760507993E-03 - 0.921686856770915011E-02 -0.671539094886240001E-03 -0.435058137234120004E-03 - -0.622872175240221974E-03 0.926585722647491929E-02 -0.666666666666666970E-02 - -0.435049192418392975E-03 0.921716878281565039E-02 -0.671606876372485046E-03 - -0.435049194977138020E-03 -0.622886318490127947E-03 0.926588938673533064E-02 - -0.666666666666666970E-02 0.000000000000000000E+00 
-0.162062559002504992E-03 - 0.755721765529914023E-02 -0.413713063586915014E-11 -0.666666666666666970E-02 - 0.000000000000000000E+00 -0.164605381839701988E-03 -0.610688496265456990E-09 - 0.822717709903768996E-02 -0.666666666666666970E-02 -0.435040481539034974E-03 - 0.921687633332718068E-02 -0.671540846145312980E-03 -0.435057905476102008E-03 - -0.622872537796382957E-03 0.926585804562602040E-02 0.822717712936564018E-02 - -0.606929320140830976E-09 -0.666666666666666970E-02 -0.411128314549578026E-11 - 0.755721765728442022E-02 -0.666666666666666970E-02 0.755721765728442022E-02 - -0.411128314549578026E-11 -0.606929320113726025E-09 0.822717712936564018E-02 - -0.666666666666666970E-02 -0.162062535444855996E-03 0.755721765299004027E-02 - -0.413623988345109963E-11 -0.164605264762881988E-03 -0.610558967689581019E-09 - 0.822717713298422998E-02 -0.164360411104607013E-03 0.822717692934940049E-02 - -0.404392079821727997E-09 -0.164359007307861987E-03 -0.666666666666666970E-02 - -0.400959182056933016E-09 0.822695177830758996E-02 -0.666666666666666970E-02 - -0.435040782704709025E-03 0.921688645230588945E-02 -0.671543140332089947E-03 - -0.435057604336271976E-03 -0.622873025362695042E-03 0.926585914058404916E-02 - -0.666666666666666970E-02 -0.435040480198235974E-03 0.921687629980992920E-02 - -0.671540848018962978E-03 0.000000000000000000E+00 -0.435057906816784022E-03 - -0.622872547225220984E-03 0.926585805277986042E-02 -0.666666666666666970E-02 - -0.435049192309392005E-03 0.921716878048172046E-02 -0.671606876991212012E-03 - -0.435049195086140020E-03 -0.622886319711294994E-03 0.926588938772130062E-02 - -0.666666666666666970E-02 -0.162062559108483992E-03 0.755721765240897035E-02 - -0.413713455792400007E-11 0.000000000000000000E+00 -0.666666666666666970E-02 - 0.000000000000000000E+00 -0.164605382375200988E-03 -0.610689046115110989E-09 - 0.822717709593154961E-02 -0.162062559033916009E-03 -0.666666666666666970E-02 - 0.755721765255952006E-02 -0.413713742434565006E-11 -0.164605382254266990E-03 - -0.610689502144610012E-09 0.822717713322326967E-02 -0.666666666666666970E-02 - -0.162062001821161990E-03 0.755721765238182019E-02 -0.411599099559748981E-11 - -0.164602607031998988E-03 -0.607614053906671031E-09 0.822717713004775947E-02 - -0.435050784143273998E-03 0.926588367156185033E-02 -0.622883807548416991E-03 - -0.435047603239580017E-03 -0.666666666666666970E-02 -0.671594832030014948E-03 - 0.921711541879277965E-02 -0.435047603239578987E-03 -0.666666666666666970E-02 - 0.921711541854514961E-02 -0.671594831758394982E-03 -0.435050784143274974E-03 - -0.622883807286240968E-03 0.926588367130479033E-02 -0.435040480204222993E-03 - -0.666666666666666970E-02 0.921687629993508950E-02 -0.671540847984813971E-03 - -0.435057906810797979E-03 -0.622872547158363982E-03 0.926585805272238036E-02 - -0.666666666666666970E-02 -0.435040480199119979E-03 0.921687628837800955E-02 - -0.671540836043143011E-03 -0.435057906815900994E-03 -0.622872535731676044E-03 - 0.926585804082549064E-02 -0.164605382251114998E-03 0.822717713322325926E-02 - -0.610689492032270982E-09 -0.162062559042711003E-03 -0.666666666666666970E-02 - -0.413713737102918003E-11 0.755721765729555021E-02 -0.162062559115393992E-03 - -0.666666666666666970E-02 0.755721765728750976E-02 -0.413714006582654996E-11 - -0.164605382612441987E-03 -0.610689891716875000E-09 0.822717713311152052E-02 - -0.666666666666666970E-02 0.755721765237515972E-02 -0.411128320041545960E-11 - -0.606929330558716048E-09 0.822717712936565058E-02 0.822717692676106990E-02 - -0.401916224778854018E-09 
-0.666666666666666970E-02 -0.398503993603185006E-09 - 0.822695176179710080E-02 -0.435040782183968022E-03 -0.666666666666666970E-02 - 0.921688643492006056E-02 -0.671543136482093996E-03 -0.435057604856969990E-03 - -0.622873024631876967E-03 0.926585913880641036E-02 -0.666666666666666970E-02 - -0.162064603069931001E-03 0.755721765252316026E-02 -0.421568944254191005E-11 - -0.164615562667500004E-03 -0.622111785613204007E-09 0.822717714448713971E-02 - -0.435031767249101982E-03 -0.666666666666666970E-02 0.921658397565039943E-02 - -0.671474830574549054E-03 0.000000000000000000E+00 -0.435066618624513010E-03 - -0.622858773502365029E-03 0.926582672349992968E-02 -0.666666666666666970E-02 - -0.435031759837616007E-03 0.921598447232674020E-02 -0.671353608078640034E-03 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.666666666666666970E-02 - -0.435066626034565020E-03 0.000000000000000000E+00 -0.622806883368089994E-03 - 0.926515741217365935E-02 -0.666666666666666970E-02 -0.162062535484635997E-03 - 0.755721765240871968E-02 -0.413624136259383020E-11 -0.164605264966749998E-03 - -0.610559194918970050E-09 0.822717713298093921E-02 0.822717692676106990E-02 - -0.401916224724624988E-09 -0.666666666666666970E-02 -0.398503993603535989E-09 - 0.822695176179711989E-02 -0.666666666666666970E-02 -0.435040782183968998E-03 - 0.921688643492009005E-02 -0.671543136482101043E-03 -0.435057604856969014E-03 - -0.622873024631878051E-03 0.926585913880642077E-02 -0.666666666666666970E-02 - 0.755721765237515972E-02 -0.411128320041545960E-11 -0.606929330558716048E-09 - 0.822717712936565058E-02 -0.162062535484635997E-03 -0.666666666666666970E-02 - 0.755721765240869973E-02 -0.413624144391172994E-11 -0.164605264966749998E-03 - -0.610559194919383020E-09 0.822717713298093921E-02 -0.666666666666666970E-02 - -0.435033689603804975E-03 0.921378384069738987E-02 -0.668589820131555040E-03 - -0.435064696586663006E-03 -0.620092661077489989E-03 0.926283488586463963E-02 - -0.666666666666666970E-02 -0.162162554053133000E-03 0.761060627573421043E-02 - -0.358738548769622997E-11 -0.164556192142978992E-03 -0.502148070051966022E-09 - 0.822717405269838935E-02 -0.666666666666666970E-02 0.000000000000000000E+00 - -0.163092637467958997E-03 0.755846712970139993E-02 0.813151629364127964E-19 - -0.666666666666666970E-02 0.000000000000000000E+00 -0.163377020705577990E-03 - -0.197563428050820995E-37 0.761068726943637999E-02 -0.666666666666666970E-02 - -0.415446954693720976E-03 0.879694625554016015E-02 -0.528338181427255972E-03 - -0.452858816762892013E-03 -0.571958066866902980E-03 0.919346567847542998E-02 - 0.822717710541285077E-02 -0.582878441461362035E-09 -0.666666666666666970E-02 - -0.398790879900396976E-11 0.756907608051521959E-02 -0.666666666666666970E-02 - 0.756911094403798970E-02 -0.399682676269469016E-11 -0.586791272997618962E-09 - 0.822723940155752065E-02 -0.666666666666666970E-02 -0.435082932504862979E-03 - 0.926680195170886956E-02 -0.623730586218653013E-03 -0.435015449187736995E-03 - -0.671618387476765970E-03 0.921723408637379997E-02 -0.164605497784444998E-03 - 0.822717715380158997E-02 -0.610801293217642997E-09 -0.162062563985841001E-03 - -0.666666666666666970E-02 -0.413787738555715978E-11 0.755721095967978960E-02 - -0.666666666666666970E-02 -0.162067963173172003E-03 0.755721116190670970E-02 - -0.434923998820925017E-11 -0.164635999950897013E-03 -0.641653733272013955E-09 - 0.822717987065326069E-02 -0.666666666666666970E-02 -0.162064602794511991E-03 - 0.755721765349726994E-02 -0.421567878592677010E-11 0.000000000000000000E+00 - -0.164615561292109994E-03 
-0.622110240351067001E-09 0.822717714537648040E-02 - -0.666666666666666970E-02 -0.435031773851333981E-03 0.921658402777584054E-02 - -0.671474653364973048E-03 0.000000000000000000E+00 -0.435066612023434006E-03 - -0.622858559544564984E-03 0.926582657482898954E-02 -0.666666666666666970E-02 - -0.435031756377946010E-03 0.921598454249483029E-02 -0.671353578907302054E-03 - 0.000000000000000000E+00 -0.666666666666666970E-02 0.000000000000000000E+00 - -0.435066629493629977E-03 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.622806849901958001E-03 0.926515761104865047E-02 - -0.162062576485603999E-03 -0.666666666666666970E-02 0.755721769097717974E-02 - -0.413780156463713995E-11 -0.164605493304321993E-03 -0.610786879472733959E-09 - 0.822717715378724034E-02 -0.666666666666666970E-02 -0.162062600946185996E-03 - 0.755721775647281990E-02 -0.413875277158846027E-11 -0.164605835862048996E-03 - -0.610932451422013981E-09 0.822717731789045920E-02 -0.164390239995456007E-03 - 0.822717965629857044E-02 -0.424752159962523975E-09 -0.164384780603813011E-03 - -0.666666666666666970E-02 -0.420718048853076981E-09 0.822693037781527942E-02 - -0.435039866684628978E-03 -0.666666666666666970E-02 0.921685570981070970E-02 - -0.671536106102198996E-03 -0.435058520274921989E-03 -0.622871476563668946E-03 - 0.926585585423385025E-02 -0.162067974342572992E-03 -0.666666666666666970E-02 - 0.755721712453114990E-02 -0.434916937807187029E-11 -0.164635995970298013E-03 - -0.641640314610242035E-09 0.822717987063988944E-02 -0.666666666666666970E-02 - -0.162062530993410003E-03 0.755721767391838006E-02 -0.413607275117757979E-11 - -0.164605272998311010E-03 -0.610535653474081026E-09 0.822717715805352037E-02 - -0.164360523631990990E-03 0.822717694987937058E-02 -0.404455003494009021E-09 - -0.164359098316623007E-03 -0.666666666666666970E-02 -0.401040888608453987E-09 - 0.822695309202677028E-02 -0.435040829062186994E-03 -0.666666666666666970E-02 - 0.921689067116016926E-02 -0.671545174080842035E-03 -0.435057557982693015E-03 - -0.622874572247597981E-03 0.926586218088495950E-02 -0.666666666666666970E-02 - 0.755944258225673023E-02 -0.409900186798689021E-11 -0.607550069944511006E-09 - 0.822725761878751946E-02 0.822717683107231958E-02 -0.307429333767333980E-09 - -0.666666666666666970E-02 -0.346590492659272013E-11 0.788944072959344919E-02 - -0.405969476545236976E-03 -0.666666666666666970E-02 0.868864828240666076E-02 - -0.503334320458691947E-03 -0.460337026543280981E-03 -0.585683663662835011E-03 - 0.919951403822613992E-02 -0.666666666666666970E-02 -0.434971941726958005E-03 - 0.922397396166772972E-02 -0.680859150263666982E-03 -0.435126415795954993E-03 - -0.632212694429038955E-03 0.927545373694507042E-02 -0.163258163012422003E-03 - -0.666666666666666970E-02 0.771427725269166996E-02 -0.477633450518533990E-04 - -0.532374012608134001E-03 -0.264816289609958973E-03 0.893589131240556979E-02 - -0.666666666666666970E-02 -0.163022374738535991E-03 0.756010628736956004E-02 - -0.985505903625681948E-05 0.000000000000000000E+00 0.000000000000000000E+00 - -0.666666666666666970E-02 -0.528465266513926044E-03 -0.112888812204124995E-03 - 0.818271031310010069E-02 0.000000000000000000E+00 -0.666666666666666970E-02 - -0.162166578301091007E-03 0.755933919789102040E-02 -0.104753730850025000E-10 - -0.165099058739546009E-03 -0.152246080895752994E-08 0.822717141726839012E-02 - 0.822711899108566000E-02 -0.221645141852008993E-09 -0.666666666666666970E-02 - -0.803318716673082971E-11 0.819386805739684074E-02 -0.666666666666666970E-02 - -0.433725918784746999E-03 
0.917430859668704086E-02 -0.661288978702212048E-03 - -0.436363722694927986E-03 -0.620322134736719043E-03 0.926073953711441071E-02 - -0.666666666666666970E-02 0.755741928796660033E-02 -0.410903028695346997E-11 - -0.606500685244875008E-09 0.822717712893872993E-02 -0.162225051571071008E-03 - -0.666666666666666970E-02 0.762801677615301994E-02 -0.369132365187440996E-05 - -0.529778141305529949E-03 -0.883337217346735949E-04 0.880271406258762920E-02 - -0.666666666666666970E-02 -0.415930173645343991E-03 0.874527436890885045E-02 - -0.345835963857271027E-03 -0.452435882453698006E-03 -0.346466114750530014E-03 - 0.898436350079646028E-02 -0.666666666666666970E-02 -0.410333749350875977E-03 - 0.874024429470159937E-02 -0.526324780402007012E-03 -0.456977206374832004E-03 - -0.594710214126934963E-03 0.921019309223168083E-02 -0.666666666666666970E-02 - -0.349138390596867001E-03 0.808745258250311017E-02 -0.174389744426073004E-03 - 0.000000000000000000E+00 -0.666666666666666970E-02 0.000000000000000000E+00 - -0.493489741678000975E-03 -0.295632081822985011E-03 0.859314073796063962E-02 - -0.666666666666666970E-02 -0.162165456144072008E-03 0.755894484280849981E-02 - -0.104756642479310002E-10 -0.165141511878991989E-03 -0.152686212523292010E-08 - 0.822721248571258931E-02 0.925936813944002075E-02 -0.620622043504368980E-03 - -0.666666666666666970E-02 -0.661330297689515954E-03 0.917272030438071040E-02 - -0.666666666666666970E-02 0.917272030438071040E-02 -0.661330297689515954E-03 - -0.620622043504368980E-03 0.925936813944002075E-02 -0.666666666666666970E-02 - -0.162062370509243004E-03 0.755772622350076983E-02 -0.408856756901033998E-11 - -0.164599404158889001E-03 -0.603371041254018047E-09 0.822717649469902924E-02 - -0.164363072970947994E-03 0.822717536870335060E-02 -0.406417442325602006E-09 - -0.164592544349163996E-03 -0.666666666666666970E-02 -0.411459868924217992E-09 - 0.822733545578738061E-02 -0.666666666666666970E-02 -0.435055225953795985E-03 - 0.926605052142811039E-02 -0.623035876737523049E-03 -0.435043161259378005E-03 - -0.671607266693034040E-03 0.921717853066967956E-02 -0.666666666666666970E-02 - -0.435040481662372024E-03 0.921687634507689993E-02 -0.671540853956346050E-03 - -0.435057905352776017E-03 -0.622872544450323051E-03 0.926585805410279004E-02 - -0.666666666666666970E-02 -0.435049192197708014E-03 0.921716876912447025E-02 - -0.671606868732131960E-03 0.000000000000000000E+00 -0.435049195197822981E-03 - -0.622886312493042956E-03 0.926588937934206061E-02 -0.666666666666666970E-02 - -0.162062574467009994E-03 0.755721765477525027E-02 -0.413771902340413000E-11 - 0.000000000000000000E+00 -0.666666666666666970E-02 0.000000000000000000E+00 - -0.164605458858507004E-03 0.000000000000000000E+00 -0.610774061235578027E-09 - 0.822717709793101964E-02 -0.162063129702431002E-03 -0.666666666666666970E-02 - 0.755721488286742026E-02 -0.415889664939935991E-11 -0.164608241410742993E-03 - -0.613854083280822974E-09 0.822717557368093065E-02 -0.666666666666666970E-02 - -0.162062006116684993E-03 0.755721765538701958E-02 -0.411615339006451024E-11 - -0.164602628392850010E-03 -0.607637651237361961E-09 0.822717712956365019E-02 - -0.435050838802116988E-03 0.926588470716894000E-02 -0.622885079077197044E-03 - -0.435047548579851017E-03 -0.666666666666666970E-02 -0.671595813711089052E-03 - 0.921711477741266050E-02 -0.435047548601973024E-03 -0.666666666666666970E-02 - 0.921711980798223078E-02 -0.671600947327989010E-03 -0.435050838779994981E-03 - -0.622889987523941044E-03 0.926588996280537994E-02 -0.435040278978450999E-03 - 
-0.666666666666666970E-02 0.921687073978887973E-02 -0.671540718908447947E-03 - -0.435058108018793015E-03 -0.622873586970733978E-03 0.926585856104042080E-02 - -0.666666666666666970E-02 -0.435040282159379990E-03 0.921688118588532938E-02 - -0.671551578142270001E-03 -0.435058104838147977E-03 -0.622883989557962013E-03 - 0.926586935714399963E-02 -0.164608242952759995E-03 0.822717557368590063E-02 - -0.613859067359255033E-09 -0.162063125398168002E-03 -0.666666666666666970E-02 - -0.415892283159718001E-11 0.755721256705853967E-02 -0.162062565682641010E-03 - -0.666666666666666970E-02 0.755721268194402968E-02 -0.413779732258981993E-11 - -0.164605464633444011E-03 -0.610787973952621006E-09 0.822717713109540061E-02 - -0.666666666666666970E-02 0.895231589357397932E-02 -0.601006644779317027E-03 - -0.606288347917921001E-03 0.923179444181716075E-02 0.921580902907083964E-02 - -0.671118539167890951E-03 -0.666666666666666970E-02 -0.628092210114751027E-03 - 0.927168309130125927E-02 -0.174566225284372993E-03 -0.666666666666666970E-02 - 0.823503971979330958E-02 -0.510771887614783954E-06 -0.167453071907815989E-03 - -0.287656487025054002E-06 0.822679160087593994E-02 -0.666666666666666970E-02 - -0.162064999238019001E-03 0.755721705065465991E-02 -0.423106066889392027E-11 - -0.164617538202307007E-03 -0.624346471180137971E-09 0.822717661766741992E-02 - -0.435027297636500021E-03 -0.666666666666666970E-02 0.921651161171017051E-02 - -0.671532969891762031E-03 0.000000000000000000E+00 -0.435071087356345989E-03 - -0.622941366988572030E-03 0.926589066343610004E-02 -0.666666666666666970E-02 - -0.435031386261265976E-03 0.921594142114651009E-02 -0.671460160302111021E-03 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.666666666666666970E-02 - -0.435066999544915002E-03 0.000000000000000000E+00 -0.622929217552493962E-03 - 0.926511198239901926E-02 -0.666666666666666970E-02 -0.162052976426746003E-03 - 0.755715649452497022E-02 -0.378464898874789026E-11 -0.164557847455290991E-03 - -0.559380698263932047E-09 0.822711615636790086E-02 0.921569960061689036E-02 - -0.671199549999528054E-03 -0.666666666666666970E-02 -0.624992481787264051E-03 - 0.926820615335867057E-02 -0.666666666666666970E-02 -0.170162248239514002E-03 - 0.823042259709222922E-02 -0.875293298501571007E-07 -0.166825903639511989E-03 - -0.692727623674652971E-07 0.822728043870269918E-02 -0.666666666666666970E-02 - 0.900314235021752035E-02 -0.616359389498942999E-03 -0.610140243508493002E-03 - 0.923851413586198981E-02 -0.162042399166736006E-03 -0.666666666666666970E-02 - 0.755717693384080035E-02 -0.343592701208587996E-11 -0.164505035953650008E-03 - -0.508562496748712960E-09 0.822712657547092045E-02 -0.666666666666666970E-02 - -0.162064604852803001E-03 0.755721682250679039E-02 -0.421581532487358014E-11 - -0.164615579255980999E-03 -0.622130465298954038E-09 0.822717706389368073E-02 - -0.666666666666666970E-02 -0.435031888747001994E-03 0.921662497939955937E-02 - -0.671514002290806984E-03 0.000000000000000000E+00 -0.435066497147766000E-03 - -0.622895617170409984E-03 0.926586574231100073E-02 -0.666666666666666970E-02 - -0.435031628074990023E-03 0.921600144320258025E-02 -0.671374321788007020E-03 - 0.000000000000000000E+00 -0.666666666666666970E-02 0.000000000000000000E+00 - -0.435066757774081993E-03 0.000000000000000000E+00 -0.622827422523968997E-03 - 0.926517931888306938E-02 -0.666666666666666970E-02 -0.162168008296603987E-03 - 0.755720830543264981E-02 -0.111631538525960998E-10 -0.165130579439813012E-03 - -0.162383789215979997E-08 0.822716472453222970E-02 0.822711943021576034E-02 - 
-0.222017847039514990E-09 -0.666666666666666970E-02 -0.777760925671697071E-11 - 0.819196919955125953E-02 -0.666666666666666970E-02 0.917235354741821086E-02 - -0.661205826288643948E-03 -0.620672923073943951E-03 0.926095692434543948E-02 - -0.666666666666666970E-02 -0.162079935652635995E-03 0.755720153609774035E-02 - -0.485738294134179040E-11 -0.164692040195761001E-03 -0.715311464965949002E-09 - 0.822716985670068034E-02 -0.166685978125188992E-03 0.822673928904943050E-02 - -0.488041135101126002E-07 -0.166618503841219014E-03 -0.666666666666666970E-02 - -0.388518789845462021E-07 0.822064815916740028E-02 -0.666666666666666970E-02 - -0.434820802690177985E-03 0.921114685377608967E-02 -0.671312500692984007E-03 - -0.435277323473812975E-03 -0.623870937176576037E-03 0.926677062396401990E-02 - -0.666666666666666970E-02 -0.435040416088372018E-03 0.921688078845643056E-02 - -0.671547258120006041E-03 -0.435057970921028016E-03 -0.622879055329303975E-03 - 0.926586475383493063E-02 -0.666666666666666970E-02 -0.435048641715767004E-03 - 0.921713990189115991E-02 -0.671590634431848974E-03 0.000000000000000000E+00 - -0.435049745678237000E-03 -0.622873878413765037E-03 0.926587662260439961E-02 - -0.666666666666666970E-02 -0.162065760605982007E-03 0.755721473567340981E-02 - -0.425851668132086027E-11 0.000000000000000000E+00 -0.666666666666666970E-02 - 0.000000000000000000E+00 -0.164621246760972002E-03 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.628327256337155983E-09 - 0.822716093119939956E-02 -0.162542321360015994E-03 -0.666666666666666970E-02 - 0.755650259353967020E-02 -0.562981055991819022E-09 -0.167001540638854007E-03 - -0.773289585904606956E-07 0.822676761660280989E-02 -0.666666666666666970E-02 - -0.162061328166368990E-03 0.755722295014580030E-02 -0.409106119013757001E-11 - -0.164607216616360989E-03 -0.604245617915833026E-09 0.822718306150760012E-02 - -0.435044697597018011E-03 0.921886702873469036E-02 -0.673104082698351962E-03 - -0.435053689697235003E-03 -0.666666666666666970E-02 -0.624556261940502019E-03 - 0.926763947624762059E-02 -0.435053689776730007E-03 -0.666666666666666970E-02 - 0.926765139731208082E-02 -0.624573254433248028E-03 -0.435044697517509996E-03 - -0.673283560783819986E-03 0.921873769877605033E-02 -0.435024190531306987E-03 - -0.666666666666666970E-02 0.921794328133558952E-02 -0.672853471668093963E-03 - -0.435074193732175026E-03 -0.624365177556240967E-03 0.926744434610436013E-02 - -0.666666666666666970E-02 -0.435000466897472022E-03 0.921805866981871037E-02 - -0.673973626915920021E-03 -0.435097908603312988E-03 -0.625447361535428985E-03 - 0.926834572012657024E-02 -0.167001904506701996E-03 0.822676773783739997E-02 - -0.774505155958715954E-07 -0.162540904682313003E-03 -0.666666666666666970E-02 - -0.563656255578501956E-09 0.755607546000543022E-02 -0.162061504581954011E-03 - -0.666666666666666970E-02 0.755611035298445026E-02 -0.418942890780224988E-11 - -0.164621967289905991E-03 -0.619216517642183046E-09 0.822718518765969999E-02 - -0.666666666666666970E-02 0.920452787793652957E-02 -0.668731214720833010E-03 - -0.622283241864520008E-03 0.926452697307078056E-02 0.822724025612167967E-02 - -0.586193188243385978E-09 -0.666666666666666970E-02 -0.399370806297400039E-11 - 0.756944692405215041E-02 -0.162114939468458987E-03 -0.666666666666666970E-02 - 0.756941303972038983E-02 -0.521957340938646036E-11 -0.164756390961434999E-03 - -0.760317731950115969E-09 0.822719049518791032E-02 -0.666666666666666970E-02 - -0.162348256591775996E-03 0.757138537095577036E-02 -0.154279395379191983E-04 
- -0.530720557527397988E-03 -0.221706573191253990E-03 0.891607556388816086E-02 - -0.163035589715460993E-03 -0.666666666666666970E-02 0.791125444579478081E-02 - -0.568881226622524034E-05 0.000000000000000000E+00 -0.530385305835545987E-03 - -0.644753151430826047E-04 0.878526723619102966E-02 -0.666666666666666970E-02 - -0.399599558422803001E-03 0.843930969858660944E-02 -0.399745648940732980E-03 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.666666666666666970E-02 - -0.464884917361518986E-03 0.000000000000000000E+00 -0.476898399198092998E-03 - 0.874546494059007078E-02 -0.666666666666666970E-02 -0.162063070389469993E-03 - 0.755351606433557997E-02 -0.447630187954819978E-11 -0.164644944937428005E-03 - -0.661962104712968044E-09 0.822717544598733017E-02 0.926551780113546962E-02 - -0.622722840997360001E-03 -0.666666666666666970E-02 -0.670824268645125988E-03 - 0.921370889062123923E-02 -0.666666666666666970E-02 -0.434945178559459974E-03 - 0.921154593743686925E-02 -0.668579923758682053E-03 -0.435153154613506974E-03 - -0.620572207517385986E-03 0.926326406971371939E-02 -0.666666666666666970E-02 - 0.755816621993720978E-02 -0.411326355534032026E-11 -0.610348502456840988E-09 - 0.822725905587259020E-02 -0.402874184979866987E-03 -0.666666666666666970E-02 - 0.865106066171478054E-02 -0.472609065736268020E-03 -0.462635081887802994E-03 - -0.557606013013843028E-03 0.917391259078658046E-02 -0.666666666666666970E-02 - -0.162816160491244000E-03 0.755629729934226019E-02 -0.145267309791605992E-07 - -0.168362009612906010E-03 -0.190258327748242995E-05 0.822792910350438923E-02 - -0.666666666666666970E-02 -0.434980287177094997E-03 0.921591582004819936E-02 - -0.672220539599956038E-03 0.000000000000000000E+00 -0.435118076428732989E-03 - -0.623871208911141008E-03 0.926674480491154057E-02 -0.666666666666666970E-02 - -0.391663109206836005E-03 0.857592364748300043E-02 -0.270377370496600986E-03 - 0.000000000000000000E+00 -0.666666666666666970E-02 0.000000000000000000E+00 - -0.470345934717641013E-03 0.000000000000000000E+00 -0.290023568937984979E-03 - 0.893922648721857081E-02 -0.666666666666666970E-02 -0.317994853290403008E-03 - 0.807402136257530031E-02 -0.163000017527141987E-03 -0.506466269749422031E-03 - -0.315285081405852004E-03 0.895741238458845970E-02 0.822721524635505914E-02 - -0.564092130899272960E-09 -0.666666666666666970E-02 -0.389036333446228008E-11 - 0.758050134032894028E-02 -0.666666666666666970E-02 0.758050134032894028E-02 - -0.389036333446228008E-11 -0.564092130736643047E-09 0.822721524635505914E-02 - -0.666666666666666970E-02 -0.162062220957438010E-03 0.755675930940021001E-02 - -0.416185255233357985E-11 -0.164608241679856012E-03 -0.614515052693167971E-09 - 0.822717669744772034E-02 -0.435064264203551974E-03 0.926583708706145946E-02 - -0.622864427576335982E-03 -0.435034122053574979E-03 -0.666666666666666970E-02 - -0.671494672063580045E-03 0.921666478777227044E-02 -0.666666666666666970E-02 - -0.435034122194007025E-03 0.921666834178432037E-02 -0.671498215917801949E-03 - -0.435064264063141017E-03 -0.622867805184998033E-03 0.926584080673045936E-02 - -0.666666666666666970E-02 -0.162064603057306009E-03 0.755721765249671960E-02 - -0.421568895545965981E-11 -0.164615562604910991E-03 -0.622111714812958048E-09 - 0.822717714449142969E-02 -0.666666666666666970E-02 -0.435031767250997005E-03 - 0.921658395801952003E-02 -0.671474812281343020E-03 0.000000000000000000E+00 - -0.435066618622617987E-03 -0.622858755966087044E-03 0.926582670501814067E-02 - -0.666666666666666970E-02 -0.435031759994269993E-03 0.921598448039129964E-02 - 
-0.671353611750878003E-03 0.000000000000000000E+00 -0.666666666666666970E-02 - 0.000000000000000000E+00 -0.435066625877937976E-03 0.000000000000000000E+00 - -0.622806885942570987E-03 0.926515741571650939E-02 -0.435040457277539980E-03 - -0.666666666666666970E-02 0.921687744869944051E-02 -0.671542694130693044E-03 - -0.435057929735475977E-03 -0.622874450455320979E-03 0.926585997023653997E-02 - -0.666666666666666970E-02 -0.162062556267884004E-03 0.755721766013062960E-02 - -0.413703217741733986E-11 -0.164605370008190008E-03 -0.610674234287949039E-09 - 0.822717713670985916E-02 -0.435057871804128975E-03 0.926586389852728995E-02 - -0.622877919476854986E-03 -0.435040515213944027E-03 -0.666666666666666970E-02 - -0.671546676413121014E-03 0.921688294468172957E-02 -0.164358973103396013E-03 - -0.666666666666666970E-02 0.822694462574627937E-02 -0.400863023298765998E-09 - -0.164360422527890990E-03 -0.404404639852252990E-09 0.822717693008181983E-02 - -0.435040521994138000E-03 -0.666666666666666970E-02 0.921688317220892978E-02 - -0.671546727788281000E-03 -0.435057865024523994E-03 -0.622877930192895020E-03 - 0.926586392288021934E-02 -0.666666666666666970E-02 -0.162062559830136006E-03 - 0.755721765142735018E-02 -0.413716779390587020E-11 -0.164605386233707005E-03 - -0.610693931490910976E-09 0.822717713361387042E-02 -0.435057916250486973E-03 - 0.926586001868622000E-02 -0.622874471770934967E-03 -0.435040470763708000E-03 - -0.666666666666666970E-02 -0.671542796319704962E-03 0.921687790126752074E-02 - -0.164358941319385992E-03 -0.666666666666666970E-02 0.822694343451954926E-02 - -0.400827305234235980E-09 -0.164360402068052007E-03 -0.404386932680776984E-09 - 0.822717693276291996E-02 -0.666666666666666970E-02 0.755853528834142963E-02 - -0.410907237024926035E-11 -0.609517161481542976E-09 0.822725835180459011E-02 - 0.822699835769964000E-02 -0.229769146362661995E-09 -0.666666666666666970E-02 - -0.552503881837150022E-11 0.815277410258053933E-02 -0.431800076739517009E-03 - -0.666666666666666970E-02 0.911683250971320920E-02 -0.645107775821999975E-03 - -0.438245882118603019E-03 -0.614761191522381957E-03 0.925173111512482028E-02 - -0.666666666666666970E-02 -0.162066807230739002E-03 0.755721537399490963E-02 - -0.430232877656178001E-11 -0.164626562646872987E-03 -0.634707547989976979E-09 - 0.822717705663869939E-02 -0.435011184600756027E-03 -0.666666666666666970E-02 - 0.921541718649479015E-02 -0.670832821913695961E-03 0.000000000000000000E+00 - -0.435087195554581009E-03 -0.622361407627594967E-03 0.926525297165626920E-02 - -0.666666666666666970E-02 -0.435024023596373988E-03 0.921463311000128051E-02 - -0.670757573686644971E-03 0.000000000000000000E+00 0.000000000000000000E+00 - -0.666666666666666970E-02 -0.435074360623453004E-03 0.000000000000000000E+00 - -0.622355873651302019E-03 0.926393658659479917E-02 -0.666666666666666970E-02 - -0.162274431703276004E-03 0.755738308037023996E-02 -0.318241339520051988E-10 - -0.165730950021604001E-03 -0.457847648567586989E-08 0.822724281267087944E-02 - 0.822699408685815918E-02 -0.228425571690641000E-09 -0.666666666666666970E-02 - -0.569261988097333975E-11 0.815841787771899064E-02 -0.666666666666666970E-02 - -0.432079853778089007E-03 0.912452759982078018E-02 -0.647134253482418054E-03 - -0.437974705900714988E-03 -0.615294574958598047E-03 0.925273449425248994E-02 - -0.666666666666666970E-02 0.755849436233489019E-02 -0.410955655495479020E-11 - -0.609618023537665964E-09 0.822725856237870035E-02 -0.162292777295443990E-03 - -0.666666666666666970E-02 0.755742123691062993E-02 -0.382880604592767993E-10 - 
-0.165829805992421012E-03 -0.549637808741221009E-08 0.822725133924320957E-02 - -0.666666666666666970E-02 -0.162519834506453002E-03 0.758488071339646994E-02 - -0.277296855958790984E-04 -0.531411092685664019E-03 -0.271305840096227018E-03 - 0.895289465200115042E-02 -0.666666666666666970E-02 -0.435144739153746991E-03 - 0.927049386352079077E-02 -0.627195195351904020E-03 0.000000000000000000E+00 - -0.434953602544600011E-03 -0.673646030614466983E-03 0.921927161130219920E-02 - -0.666666666666666970E-02 -0.162365832482225009E-03 0.770469472296904986E-02 - -0.341374365570676015E-05 0.000000000000000000E+00 -0.666666666666666970E-02 - 0.000000000000000000E+00 -0.530093834704794035E-03 0.000000000000000000E+00 - -0.670683663828369948E-04 0.878401698585971023E-02 -0.666666666666666970E-02 - -0.406469353648676005E-03 0.868845378029185940E-02 -0.487445867647356014E-03 - -0.459955485443886000E-03 -0.561455537392793009E-03 0.917876821487357934E-02 - 0.822724019224464927E-02 -0.586236000786575972E-09 -0.666666666666666970E-02 - -0.399393159268467017E-11 0.756942275692288041E-02 -0.666666666666666970E-02 - 0.756938783566538965E-02 -0.398490203650706992E-11 -0.582276441424578042E-09 - 0.822717710481313952E-02 -0.666666666666666970E-02 -0.162064188321971013E-03 - 0.755638508819920975E-02 -0.426955668367828011E-11 -0.164621776697154991E-03 - -0.630372643058242999E-09 0.822717559459556005E-02 -0.435097665976917993E-03 - 0.926770146147764921E-02 -0.624846565957563015E-03 -0.435000709641257001E-03 - -0.666666666666666970E-02 -0.673347123713348948E-03 0.921745137370448032E-02 - -0.666666666666666970E-02 -0.164221437092461006E-03 0.822590307325241077E-02 - -0.301540502789892983E-09 -0.164276463598209013E-03 -0.317267646894687990E-09 - 0.822720896113244074E-02 -0.666666666666666970E-02 -0.435040484785290994E-03 - 0.921687621618137026E-02 -0.671540621958410973E-03 -0.435057902230129995E-03 - -0.622872302682165040E-03 0.926585782249825934E-02 -0.666666666666666970E-02 - -0.162127341684403998E-03 0.759121275229191969E-02 -0.381774204087762988E-11 - 0.000000000000000000E+00 -0.164583013632055004E-03 -0.545969305874372025E-09 - 0.822717722380156974E-02 -0.666666666666666970E-02 0.000000000000000000E+00 - -0.166319316569640000E-03 0.755721776450155036E-02 -0.125637202975271002E-10 - -0.666666666666666970E-02 0.000000000000000000E+00 -0.166854313971363013E-03 - 0.000000000000000000E+00 -0.695766372129872957E-09 0.759121342784058965E-02 - -0.435007341079701976E-03 -0.666666666666666970E-02 0.921767423502365020E-02 - -0.673397510122010013E-03 -0.435091037539719026E-03 -0.624857046911454035E-03 - 0.926772529096031930E-02 -0.666666666666666970E-02 -0.435042780427774989E-03 - 0.921682237061306926E-02 -0.671414330503463958E-03 -0.435055606761622013E-03 - -0.622737221386204017E-03 0.926573013279845085E-02 -0.164512437178604990E-03 - 0.822720912015355993E-02 -0.478141451497207985E-09 -0.162035132712287004E-03 - -0.666666666666666970E-02 -0.321987108752259996E-11 0.755727832217654039E-02 - -0.162065357686432991E-03 -0.666666666666666970E-02 0.755727582940853970E-02 - -0.423979675954428002E-11 -0.164618725863945991E-03 -0.625584793967118990E-09 - 0.822717479197860972E-02 -0.162035093479730998E-03 -0.666666666666666970E-02 - 0.755725620859361031E-02 -0.322006451958004010E-11 -0.164512451389381995E-03 - -0.478178423500335003E-09 0.822720912019037943E-02 -0.666666666666666970E-02 - -0.162063731980656992E-03 0.755721533834744982E-02 -0.418199541718481972E-11 - -0.164611237647245009E-03 -0.617212851732219973E-09 0.822717573520458968E-02 - 
-0.435072205240009014E-03 0.926779361315624960E-02 -0.624886867601484965E-03 - -0.435026179501724010E-03 -0.666666666666666970E-02 -0.673540811814344947E-03 - 0.921830845677890955E-02 -0.435026047901024976E-03 -0.666666666666666970E-02 - 0.921631021320170989E-02 -0.671342130465355002E-03 -0.435072336809685012E-03 - -0.622763518226994005E-03 0.926572086598713082E-02 -0.666666666666666970E-02 - 0.755741246850622026E-02 -0.410910644194978984E-11 -0.606515170668247041E-09 - 0.822717712895315936E-02 0.822711877443555936E-02 -0.222069534738639993E-09 - -0.666666666666666970E-02 -0.774169806703398991E-11 0.819168186272310056E-02 - -0.433633789741900023E-03 -0.666666666666666970E-02 0.917151071300197074E-02 - -0.660635211420097053E-03 -0.436454594164531018E-03 -0.620192759941486029E-03 - 0.926043761605101039E-02 -0.666666666666666970E-02 -0.435400642028349984E-03 - 0.924569487161715914E-02 -0.602675681802749023E-03 -0.434697126892320004E-03 - -0.641341627123005026E-03 0.919049697819046031E-02 -0.417766802147069983E-03 - -0.666666666666666970E-02 0.884119022200588987E-02 -0.565196699593361013E-03 - 0.000000000000000000E+00 -0.450926780658111002E-03 -0.597865634330250052E-03 - 0.921780464690032976E-02 -0.666666666666666970E-02 -0.162927680290126010E-03 - 0.761083369281832008E-02 -0.416350633852703977E-04 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.666666666666666970E-02 -0.530230645664085047E-03 - 0.000000000000000000E+00 -0.269827347276391011E-03 0.866009136742803935E-02 - -0.666666666666666970E-02 -0.162144345304456006E-03 0.760092331739828028E-02 - -0.320930758282627016E-05 -0.529813749405098054E-03 -0.838256824281321020E-04 - 0.879776792041750931E-02 0.822717682684740922E-02 -0.303187285772573992E-09 - -0.666666666666666970E-02 -0.349301427021032004E-11 0.790044609360178972E-02 - -0.666666666666666970E-02 -0.407938480218889010E-03 0.871084858846607989E-02 - -0.512295469801531947E-03 -0.458837927071481999E-03 -0.588085060255285046E-03 - 0.920289209649226975E-02 -0.666666666666666970E-02 0.755933306797651038E-02 - -0.410021674909237023E-11 -0.607789044576712979E-09 0.822725774217296021E-02 - -0.162167689058960993E-03 -0.666666666666666970E-02 0.755666327531443957E-02 - -0.112741283803980995E-10 -0.165135297900302988E-03 -0.164049635527082004E-08 - 0.822716577111987958E-02 -0.666666666666666970E-02 -0.415926985437479015E-03 - 0.874524524800943982E-02 -0.345826974275356008E-03 -0.452438516787343978E-03 - -0.346461802416189973E-03 0.898436011623418053E-02 -0.666666666666666970E-02 - -0.410332540159175974E-03 0.874022409518679030E-02 -0.526301953402972958E-03 - 0.000000000000000000E+00 -0.456978154604879979E-03 -0.594686220254121993E-03 - 0.921017399119665979E-02 -0.666666666666666970E-02 -0.349138358539420027E-03 - 0.808745335846144074E-02 -0.174389979817568010E-03 0.000000000000000000E+00 - -0.666666666666666970E-02 0.000000000000000000E+00 -0.493489760022879976E-03 - 0.000000000000000000E+00 -0.295632770892417004E-03 0.859314451744194047E-02 - -0.666666666666666970E-02 -0.162165456200227012E-03 0.755894485257972964E-02 - -0.104756674728628994E-10 -0.165141511934384990E-03 -0.152686256191205990E-08 - 0.822721248562563977E-02 0.925936813792594023E-02 -0.620622042811125991E-03 - -0.666666666666666970E-02 -0.661330294502400018E-03 0.917272029101801056E-02 - -0.666666666666666970E-02 0.917272029101801056E-02 -0.661330294502400018E-03 - -0.620622042811125991E-03 0.925936813792594023E-02 -0.666666666666666970E-02 - -0.162059451012688989E-03 0.755773564896643039E-02 -0.398147961980706018E-11 - 
-0.164599233536076003E-03 -0.588248966932235027E-09 0.822718777261857021E-02 - -0.435010710626149014E-03 0.921784913653543939E-02 -0.671757497579554964E-03 - -0.435087669354308976E-03 -0.666666666666666970E-02 -0.624205044887739973E-03 - 0.926731518984568077E-02 -0.666666666666666970E-02 -0.435087680804149997E-03 - 0.926689880496680933E-02 -0.623818561754816976E-03 -0.435010699170602000E-03 - -0.671588999254513967E-03 0.921721221696823051E-02 -0.666666666666666970E-02 - -0.162064554135907989E-03 0.755721778431040023E-02 -0.421379742453608961E-11 - -0.164615465489623012E-03 -0.621841597575447999E-09 0.822717725634031921E-02 - -0.666666666666666970E-02 -0.435032270858094002E-03 0.921659999696208994E-02 - -0.671477624444696982E-03 0.000000000000000000E+00 -0.435066115102212007E-03 - -0.622858575415140996E-03 0.926582763556333018E-02 -0.666666666666666970E-02 - 0.000000000000000000E+00 -0.435031835174660986E-03 0.921598350730943976E-02 - -0.671332923411385946E-03 -0.666666666666666970E-02 0.000000000000000000E+00 - -0.435066550710658023E-03 0.000000000000000000E+00 -0.622784369281448010E-03 - 0.926515556236343074E-02 -0.435081584415368018E-03 -0.666666666666666970E-02 - 0.926715509609260003E-02 -0.624058431079940984E-03 -0.435016797724696992E-03 - -0.671759302322477020E-03 0.921784300407063953E-02 -0.666666666666666970E-02 - -0.162062416434052993E-03 0.755721809516292017E-02 -0.413174197731702039E-11 - -0.164605215334601997E-03 -0.609922581957833024E-09 0.822717754034535051E-02 - -0.435056252509799003E-03 0.926585458068619923E-02 -0.622864166386205044E-03 - -0.435042134635998002E-03 -0.666666666666666970E-02 -0.671542142824929958E-03 - 0.921692265814959971E-02 -0.164341185001132012E-03 -0.666666666666666970E-02 - 0.822700548023182036E-02 -0.388254230807434987E-09 -0.164366486837535988E-03 - -0.391288606812102024E-09 0.822719359654751001E-02 -0.435043745553257018E-03 - -0.666666666666666970E-02 0.921697677898357035E-02 -0.671554360684184981E-03 - -0.435054641693514987E-03 -0.622866717131421977E-03 0.926586042794482964E-02 - -0.666666666666666970E-02 -0.162059863500320009E-03 0.755722860146506008E-02 - -0.403707215866304025E-11 -0.164610171594745992E-03 -0.596715205847239044E-09 - 0.822719066601751078E-02 -0.435029181351461001E-03 0.921782667546000058E-02 - -0.671761723962046014E-03 -0.435069204038055987E-03 -0.666666666666666970E-02 - -0.623757152955008967E-03 0.926682695577680030E-02 -0.165176244427055993E-03 - -0.666666666666666970E-02 0.822774267939753966E-02 -0.421637432761214996E-09 - -0.164358900259774010E-03 -0.403642926186002997E-09 0.822717766237675979E-02 - -0.666666666666666970E-02 0.895231589389712014E-02 -0.601006644879792970E-03 - -0.606288347943506004E-03 0.923179444186011944E-02 0.921580902981316945E-02 - -0.671118538607027963E-03 -0.666666666666666970E-02 -0.628092231229955013E-03 - 0.927168311539479026E-02 -0.174566249107139995E-03 -0.666666666666666970E-02 - 0.823503974641245050E-02 -0.510772714070998049E-06 -0.167453071908487002E-03 - -0.287656386989065009E-06 0.822679160061134084E-02 -0.666666666666666970E-02 - -0.162064983519178004E-03 0.755722213005034001E-02 -0.423003049664173980E-11 - -0.164617409769547002E-03 -0.624194129888606984E-09 0.822717667703955040E-02 - -0.435027350434486978E-03 -0.666666666666666970E-02 0.921651333906866981E-02 - -0.671533317952461976E-03 0.000000000000000000E+00 -0.435071034569932023E-03 - -0.622941399756069946E-03 0.926589080870137034E-02 -0.666666666666666970E-02 - -0.435031629619492017E-03 0.921595126475713983E-02 -0.671462224228685047E-03 
- 0.000000000000000000E+00 0.000000000000000000E+00 -0.666666666666666970E-02 - -0.435066756229832998E-03 0.000000000000000000E+00 -0.622929617617789958E-03 - 0.926511475629528992E-02 -0.666666666666666970E-02 -0.162052976330494004E-03 - 0.755715655404278971E-02 -0.378464136913797002E-11 -0.164557846396870007E-03 - -0.559379553656162034E-09 0.822711615671048967E-02 0.921569960267614070E-02 - -0.671199548462878990E-03 -0.666666666666666970E-02 -0.624992541148623954E-03 - 0.926820621881743051E-02 -0.666666666666666970E-02 -0.170162317751058011E-03 - 0.823042266464544918E-02 -0.875297413337334996E-07 -0.166825903611445003E-03 - -0.692727437231052063E-07 0.822728043872133011E-02 -0.666666666666666970E-02 - 0.900314235197411002E-02 -0.616359390013905962E-03 -0.610140243636164963E-03 - 0.923851413610003031E-02 -0.162042399128667987E-03 -0.666666666666666970E-02 - 0.755717695498461014E-02 -0.343592447834058983E-11 -0.164505035561990999E-03 - -0.508562114575286970E-09 0.822712657561797990E-02 -0.666666666666666970E-02 - -0.162064626628578005E-03 0.755722018474085013E-02 -0.421639430439463977E-11 - -0.164615674182357004E-03 -0.622213619794602018E-09 0.822717716142255980E-02 - -0.666666666666666970E-02 -0.435031360634292993E-03 0.921661780771376032E-02 - -0.671522125948738010E-03 0.000000000000000000E+00 -0.435067025167480974E-03 - -0.622906555198169992E-03 0.926587475408661987E-02 -0.666666666666666970E-02 - -0.435031920197820993E-03 0.921597580576540010E-02 -0.671351984885895973E-03 - 0.000000000000000000E+00 -0.666666666666666970E-02 0.000000000000000000E+00 - -0.435066465702252001E-03 0.000000000000000000E+00 -0.622805921827540012E-03 - 0.926514231021871061E-02 -0.666666666666666970E-02 -0.162062001713370989E-03 - 0.755721764997798959E-02 -0.411598727890175012E-11 -0.164602606520337013E-03 - -0.607613494988126020E-09 0.822717713025281072E-02 0.926588367987988062E-02 - -0.622883813196616022E-03 -0.666666666666666970E-02 -0.671594845205971042E-03 - 0.921711546375830054E-02 -0.666666666666666970E-02 0.921711546375830054E-02 - -0.671594845205971042E-03 -0.622883813196616022E-03 0.926588367987988062E-02 - -0.666666666666666970E-02 -0.162080643229434011E-03 0.755719622314534028E-02 - -0.488882760495459993E-11 -0.164695578667720994E-03 -0.719871721533777035E-09 - 0.822716411325388926E-02 -0.166686373905860005E-03 0.822673885725208956E-02 - -0.487733280019655016E-07 -0.166616817170293989E-03 -0.666666666666666970E-02 - -0.385414488502229971E-07 0.822045888465093924E-02 -0.666666666666666970E-02 - -0.434813234051737991E-03 0.920937424335109035E-02 -0.669894558723525000E-03 - -0.435284874465938004E-03 -0.622580915854730019E-03 0.926510963371094017E-02 - -0.666666666666666970E-02 -0.435040408248740026E-03 0.921688082460559062E-02 - -0.671547530054927020E-03 -0.435057978759970999E-03 -0.622879363153527985E-03 - 0.926586503625058920E-02 -0.666666666666666970E-02 -0.435048354812829995E-03 - 0.921714230585230984E-02 -0.671602199049514052E-03 0.000000000000000000E+00 - -0.435050032579175011E-03 -0.622886590993160022E-03 0.926588809980974024E-02 - -0.666666666666666970E-02 0.000000000000000000E+00 -0.162065782294302011E-03 - 0.755721459868404031E-02 -0.425813299026563987E-11 -0.666666666666666970E-02 - 0.000000000000000000E+00 -0.164621303266333996E-03 0.000000000000000000E+00 - -0.628265660483508001E-09 0.822715314376032923E-02 -0.162542375670815992E-03 - -0.666666666666666970E-02 0.755649757850690006E-02 -0.563432271384059970E-09 - -0.167001893968643994E-03 -0.773903559881119996E-07 0.822676727708255076E-02 - 
-0.666666666666666970E-02 -0.162062879664528998E-03 0.755721695458686990E-02 - -0.414938028726372967E-11 -0.164606984614506006E-03 -0.612470146931394021E-09 - 0.822717691631253030E-02 -0.435061921280182988E-03 0.926591210512174059E-02 - -0.622933977614011981E-03 -0.435036465303411009E-03 -0.666666666666666970E-02 - -0.671580853268202043E-03 0.921680734331616937E-02 -0.435036519090423993E-03 - -0.666666666666666970E-02 0.921831642815977917E-02 -0.673202190294888052E-03 - -0.435061867500153025E-03 -0.624494985629193999E-03 0.926748120028762952E-02 - -0.435006921432357974E-03 -0.666666666666666970E-02 0.921581685576767021E-02 - -0.671357037929466998E-03 -0.435091457008391985E-03 -0.622887259624104036E-03 - 0.926580572699929952E-02 -0.666666666666666970E-02 -0.435000210414426987E-03 - 0.921805741531904997E-02 -0.673980656048901966E-03 -0.435098164960824000E-03 - -0.625455737994957050E-03 0.926835240170618968E-02 -0.167002256657801997E-03 - 0.822676739801933045E-02 -0.775116142162273040E-07 -0.162540963549468999E-03 - -0.666666666666666970E-02 -0.564105828717064959E-09 0.755607184337712002E-02 - -0.162063151808471010E-03 -0.666666666666666970E-02 0.755610609873923027E-02 - -0.425254382354864998E-11 -0.164619383929850006E-03 -0.628041057868921975E-09 - 0.822717599205483041E-02 -0.666666666666666970E-02 0.921687630329096064E-02 - -0.671540851769114968E-03 -0.622872550846322991E-03 0.926585805654763991E-02 - 0.926585809314046070E-02 -0.622872566927465052E-03 -0.666666666666666970E-02 - -0.671540928853904041E-03 0.921687664468394917E-02 -0.164358960620165987E-03 - -0.666666666666666970E-02 0.822694395609338026E-02 -0.400847008848852017E-09 - -0.164360413027222997E-03 -0.404398713049403977E-09 0.822717692924596067E-02 - -0.666666666666666970E-02 -0.162064640983030012E-03 0.755718903530966032E-02 - -0.421955570049609974E-11 -0.164616035944705001E-03 -0.622688578366344997E-09 - 0.822717711474709075E-02 -0.435031324368629995E-03 -0.666666666666666970E-02 - 0.921656922225446047E-02 -0.671471582389523958E-03 0.000000000000000000E+00 - -0.435067061426646024E-03 -0.622858176290813008E-03 0.926582523607523932E-02 - -0.666666666666666970E-02 -0.435031048387666001E-03 0.921594565739619967E-02 - -0.671345384731177964E-03 0.000000000000000000E+00 0.000000000000000000E+00 - -0.666666666666666970E-02 -0.435067337357646024E-03 0.000000000000000000E+00 - -0.622804669702495955E-03 0.926513810679983028E-02 -0.666666666666666970E-02 - -0.162062558200820996E-03 0.755721755342539988E-02 -0.413711391502012990E-11 - -0.164605379088546999E-03 -0.610686125296504951E-09 0.822717713332750053E-02 - 0.926585808945160939E-02 -0.622872565296268030E-03 -0.666666666666666970E-02 - -0.671540921029493000E-03 0.921687661003382959E-02 -0.666666666666666970E-02 - -0.164358960452099987E-03 0.822694392878840081E-02 -0.400846613572565987E-09 - -0.164360413029369988E-03 -0.404398732358932989E-09 0.822717692925123943E-02 - -0.666666666666666970E-02 0.921687630355970053E-02 -0.671540851829773044E-03 - -0.622872550859014987E-03 0.926585805657732971E-02 -0.162062558227438999E-03 - -0.666666666666666970E-02 0.755721760853244034E-02 -0.413711033678103961E-11 - -0.164605378673800013E-03 -0.610685584929304044E-09 0.822717713318102915E-02 - -0.666666666666666970E-02 -0.162064570152263006E-03 0.755721841422958996E-02 - -0.421433625240463005E-11 -0.164615390632404009E-03 -0.621914612647070953E-09 - 0.822717706877345960E-02 -0.666666666666666970E-02 -0.435032309673597981E-03 - 0.921659272513397979E-02 -0.671468254207514999E-03 0.000000000000000000E+00 
 [omitted: large diff hunk of removed data-file lines, each holding three double-precision values in Fortran E-format, e.g. "-0.435066076293283009E-03 -0.622849275891169040E-03 0.926581888559491022E-02"; line wrapping in this extraction does not preserve the original row boundaries]
- -0.125511383583243006E-10 0.000000000000000000E+00 0.000000000000000000E+00 - -0.666666666666666970E-02 -0.166854277196956007E-03 0.000000000000000000E+00 - -0.695070274239683027E-09 0.759121319033943982E-02 -0.666666666666666970E-02 - -0.162062001821339989E-03 0.755721765238148973E-02 -0.411599111074019016E-11 - -0.164602607032884998E-03 -0.607614054884462001E-09 0.822717713004762069E-02 - 0.926588367541610926E-02 -0.622883811249163948E-03 -0.666666666666666970E-02 - -0.671594835876070041E-03 0.921711542242228087E-02 -0.666666666666666970E-02 - -0.435047603237369979E-03 0.921711541881043914E-02 -0.671594832109593978E-03 - -0.435050784145483982E-03 -0.622883807637366021E-03 0.926588367164967938E-02 - -0.666666666666666970E-02 0.755721765237515972E-02 -0.411128320041545960E-11 - -0.606929330558716048E-09 0.822717712936565058E-02 -0.435040480192415977E-03 - -0.666666666666666970E-02 0.921687628876629965E-02 -0.671540836655632996E-03 - -0.435057906822602990E-03 -0.622872536360199970E-03 0.926585804143926009E-02 - -0.666666666666666970E-02 -0.162816160491764011E-03 0.755629729934281964E-02 - -0.145267310755072998E-07 -0.168362009615544009E-03 -0.190258328993087994E-05 - 0.822792910352069043E-02 -0.666666666666666970E-02 -0.434980287163910990E-03 - 0.921591582112040938E-02 -0.672220540750253994E-03 0.000000000000000000E+00 - -0.435118076441907997E-03 -0.623871210047872993E-03 0.926674480647901068E-02 - -0.666666666666666970E-02 -0.391663109199982980E-03 0.857592364724167958E-02 - -0.270377370428809022E-03 0.000000000000000000E+00 -0.666666666666666970E-02 - 0.000000000000000000E+00 -0.470345934721972021E-03 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.290023568862184014E-03 0.893922648671382006E-02 - -0.666666666666666970E-02 -0.317994853278870024E-03 0.807402136254166923E-02 - -0.163000017519030990E-03 -0.506466269753455046E-03 -0.315285081402617016E-03 - 0.895741238458851001E-02 0.822721524635505914E-02 -0.564092130791223009E-09 - -0.666666666666666970E-02 -0.389036333446409034E-11 0.758050134032871043E-02 - -0.666666666666666970E-02 0.758050134032871043E-02 -0.389036336156913981E-11 - -0.564092130737013004E-09 0.822721524635505914E-02 -0.666666666666666970E-02 - -0.162062221237022010E-03 0.755675930893081986E-02 -0.416186344820719000E-11 - -0.164608243083756012E-03 -0.614516634575204050E-09 0.822717669815483006E-02 - -0.435064268078357019E-03 0.926583720685853970E-02 -0.622864535863296978E-03 - -0.435034118178185007E-03 -0.666666666666666970E-02 -0.671494763562757962E-03 - 0.921666478489410919E-02 -0.666666666666666970E-02 -0.435034118314003989E-03 - 0.921666821285587953E-02 -0.671498187612958996E-03 -0.435064267942557987E-03 - -0.622867800074395988E-03 0.926584079408090955E-02 -0.666666666666666970E-02 - -0.162064603060627001E-03 0.755721765249593985E-02 -0.421568916772846982E-11 - 0.000000000000000000E+00 -0.164615562621545010E-03 -0.622111733862203048E-09 - 0.822717714450407062E-02 -0.666666666666666970E-02 -0.435031767219332013E-03 - 0.921658395702201066E-02 -0.671474812089471003E-03 0.000000000000000000E+00 - -0.435066618654277992E-03 -0.622858755959619019E-03 0.926582670497335011E-02 - -0.666666666666666970E-02 0.000000000000000000E+00 -0.435031759978188998E-03 - 0.921598448078022985E-02 -0.671353613199229967E-03 -0.666666666666666970E-02 - 0.000000000000000000E+00 -0.435066625894017019E-03 0.000000000000000000E+00 - -0.622806887499908037E-03 0.926515741657525996E-02 -0.435040451653885985E-03 - -0.666666666666666970E-02 0.921687738713831972E-02 -0.671542772385395025E-03 - 
-0.435057935358636985E-03 -0.622874555977533041E-03 0.926586008373840933E-02 - -0.666666666666666970E-02 -0.162062556280765003E-03 0.755721766011261970E-02 - -0.413703259232341988E-11 -0.164605370112559998E-03 -0.610674307777927029E-09 - 0.822717713673958018E-02 -0.435057871981406005E-03 0.926586389913170057E-02 - -0.622877920215035986E-03 -0.435040515036651005E-03 -0.666666666666666970E-02 - -0.671546676151164976E-03 0.921688293990952957E-02 -0.164358975070530991E-03 - -0.666666666666666970E-02 0.822694462180378026E-02 -0.400864471382896996E-09 - -0.164360425623554987E-03 -0.404406184767255019E-09 0.822717693084428978E-02 - -0.435040521728764014E-03 -0.666666666666666970E-02 0.921688316448704067E-02 - -0.671546726860146044E-03 -0.435057865289874995E-03 -0.622877930792390950E-03 - 0.926586392317507029E-02 -0.666666666666666970E-02 -0.162062560143134006E-03 - 0.755721765072739013E-02 -0.413717992975032974E-11 -0.164605387803321994E-03 - -0.610695685352230028E-09 0.822717713417366048E-02 -0.435057920486110002E-03 - 0.926586013716025961E-02 -0.622874579485410002E-03 -0.435040466527716017E-03 - -0.666666666666666970E-02 -0.671542885086807029E-03 0.921687788626173940E-02 - -0.164358940726689012E-03 -0.666666666666666970E-02 0.822694332169499921E-02 - -0.400825748326332007E-09 -0.164360402303031009E-03 -0.404387092386443002E-09 - 0.822717693284892929E-02 -0.666666666666666970E-02 0.755853528834142963E-02 - -0.410907237024926035E-11 -0.609517161481544010E-09 0.822725835180459011E-02 - 0.822699835769964000E-02 -0.229769146389950011E-09 -0.666666666666666970E-02 - -0.552503881834807986E-11 0.815277410257971014E-02 -0.431800076739472991E-03 - -0.666666666666666970E-02 0.911683250971202959E-02 -0.645107775821699976E-03 - -0.438245882118646008E-03 -0.614761191522313001E-03 0.925173111512469018E-02 - -0.666666666666666970E-02 -0.162066807231452001E-03 0.755721537354540028E-02 - -0.430232887169997028E-11 -0.164626562654960999E-03 -0.634707558119433995E-09 - 0.822717705664591063E-02 -0.435011184592201997E-03 -0.666666666666666970E-02 - 0.921541718621200941E-02 -0.670832821851847973E-03 0.000000000000000000E+00 - -0.435087195563132003E-03 -0.622361407616723998E-03 0.926525297162917975E-02 - -0.666666666666666970E-02 -0.435024023581949979E-03 0.921463310923579040E-02 - -0.670757573524478033E-03 0.000000000000000000E+00 0.000000000000000000E+00 - -0.666666666666666970E-02 -0.435074360637873977E-03 0.000000000000000000E+00 - -0.622355873608441013E-03 0.926393658622583042E-02 -0.666666666666666970E-02 - -0.162274431703270989E-03 0.755738308036761966E-02 -0.318241338988657979E-10 - -0.165730950021739988E-03 -0.457847648587293988E-08 0.822724281267098005E-02 - 0.822699408685815918E-02 -0.228425571392821005E-09 -0.666666666666666970E-02 - -0.569261988092565005E-11 0.815841787771750919E-02 -0.666666666666666970E-02 - -0.432079853778008993E-03 0.912452759981858923E-02 -0.647134253481845053E-03 - -0.437974705900792997E-03 -0.615294574958452005E-03 0.925273449425225922E-02 - -0.666666666666666970E-02 0.755849436233489019E-02 -0.410955652784976012E-11 - -0.609618023537668031E-09 0.822725856237870035E-02 -0.162292777295442987E-03 - -0.666666666666666970E-02 0.755742123690941042E-02 -0.382880604873281030E-10 - -0.165829805992468013E-03 -0.549637808756437989E-08 0.822725133924324080E-02 - -0.666666666666666970E-02 -0.162519832090693000E-03 0.758487998779587991E-02 - -0.277297390858832013E-04 -0.531411093209674000E-03 -0.271306402284292975E-03 - 0.895289512037131946E-02 -0.666666666666666970E-02 -0.435144737424841995E-03 
- 0.927049381409235027E-02 -0.627195150452744977E-03 0.000000000000000000E+00 - -0.434953604275157982E-03 -0.673646028178072039E-03 0.921927160555094036E-02 - -0.666666666666666970E-02 -0.162365832331587992E-03 0.770469465208874042E-02 - -0.341374299192897006E-05 0.000000000000000000E+00 -0.666666666666666970E-02 - 0.000000000000000000E+00 -0.530093823283070008E-03 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.670683422235064935E-04 0.878401693892019025E-02 - -0.666666666666666970E-02 -0.406469352273000986E-03 0.868845376516133062E-02 - -0.487445861240936023E-03 -0.459955486486156979E-03 -0.561455535102076998E-03 - 0.917876821200271954E-02 0.822724019224538999E-02 -0.586236000132447954E-09 - -0.666666666666666970E-02 -0.399393158957938003E-11 0.756942275725619972E-02 - -0.666666666666666970E-02 0.756938783599863958E-02 -0.398490214171924003E-11 - -0.582276440781839991E-09 0.822717710481313952E-02 -0.666666666666666970E-02 - -0.162064188320470992E-03 0.755638508915526964E-02 -0.426955665163783005E-11 - -0.164621776680132990E-03 -0.630372622134120969E-09 0.822717559459638058E-02 - -0.435097665935211988E-03 0.926770146162825964E-02 -0.624846566023037008E-03 - -0.435000709682981980E-03 -0.666666666666666970E-02 -0.673347124029987048E-03 - 0.921745137510697997E-02 -0.666666666666666970E-02 -0.164221437098695006E-03 - 0.822590307436798021E-02 -0.301540515380110992E-09 -0.164276463597796989E-03 - -0.317267646597123000E-09 0.822720896113241992E-02 -0.666666666666666970E-02 - -0.435040484785290994E-03 0.921687621618138067E-02 -0.671540621958418996E-03 - -0.435057902230129995E-03 -0.622872302682172954E-03 0.926585782249826975E-02 - -0.666666666666666970E-02 -0.162127341684405001E-03 0.759121275229191969E-02 - -0.381774206802719039E-11 0.000000000000000000E+00 -0.164583013632061998E-03 - -0.545969305880644018E-09 0.822717722380156974E-02 -0.666666666666666970E-02 - -0.166319316569640000E-03 0.755721776450153995E-02 -0.125637203248046992E-10 - 0.000000000000000000E+00 -0.666666666666666970E-02 0.000000000000000000E+00 - -0.166854313971363013E-03 0.000000000000000000E+00 0.000000000000000000E+00 - -0.695766372139392979E-09 0.759121342784058965E-02 -0.435007341079486002E-03 - -0.666666666666666970E-02 0.921767423501631059E-02 -0.673397510119870990E-03 - -0.435091037539934999E-03 -0.624857046910582011E-03 0.926772529095947969E-02 - -0.666666666666666970E-02 -0.435042780427758998E-03 0.921682237061272926E-02 - -0.671414330503569993E-03 -0.435055606761638005E-03 -0.622737221386400041E-03 - 0.926573013279861045E-02 -0.164512437178424009E-03 0.822720912015354085E-02 - -0.478141451603002007E-09 -0.162035132712254993E-03 -0.666666666666666970E-02 - -0.321987108658194010E-11 0.755727832217645019E-02 -0.162065357686449010E-03 - -0.666666666666666970E-02 0.755727582940845036E-02 -0.423979678728262026E-11 - -0.164618725864026005E-03 -0.625584794059249953E-09 0.822717479197863921E-02 - -0.162035093479698011E-03 -0.666666666666666970E-02 0.755725620859342036E-02 - -0.322006454574526012E-11 -0.164512451389200987E-03 -0.478178423362356003E-09 - 0.822720912019037076E-02 -0.666666666666666970E-02 -0.162063731980670002E-03 - 0.755721533834740992E-02 -0.418199544479783967E-11 -0.164611237647310007E-03 - -0.617212851806125045E-09 0.822717573520461917E-02 -0.435072205240186985E-03 - 0.926779361315555050E-02 -0.624886867600672031E-03 -0.435026179501545984E-03 - -0.666666666666666970E-02 -0.673540811812492046E-03 0.921830845677282934E-02 - -0.435026047900845974E-03 -0.666666666666666970E-02 0.921631021319617959E-02 - 
-0.671342130464456957E-03 -0.435072336809864014E-03 -0.622763518227145035E-03 - 0.926572086598697990E-02 -0.666666666666666970E-02 0.755741246850622026E-02 - -0.410910644194978984E-11 -0.606515170668247041E-09 0.822717712895315936E-02 - 0.822711877443555936E-02 -0.222069534738638002E-09 -0.666666666666666970E-02 - -0.774169806703297047E-11 0.819168186272309015E-02 -0.433633789741900023E-03 - -0.666666666666666970E-02 0.917151071300196033E-02 -0.660635211420094017E-03 - -0.436454594164531018E-03 -0.620192759941485053E-03 0.926043761605101039E-02 - -0.666666666666666970E-02 -0.435400642028797977E-03 0.924569487162926924E-02 - -0.602675681812970990E-03 -0.434697126891870982E-03 -0.641341627123363030E-03 - 0.919049697819149941E-02 -0.417766802147073019E-03 -0.666666666666666970E-02 - 0.884119022200594018E-02 -0.565196699593391045E-03 0.000000000000000000E+00 - -0.450926780658108997E-03 -0.597865634330268050E-03 0.921780464690035925E-02 - -0.666666666666666970E-02 -0.162927680290126010E-03 0.761083369281826978E-02 - -0.416350633852817005E-04 0.000000000000000000E+00 0.000000000000000000E+00 - -0.666666666666666970E-02 -0.530230645664085047E-03 0.000000000000000000E+00 - -0.269827347276449016E-03 0.866009136742811915E-02 -0.666666666666666970E-02 - -0.162144345304456006E-03 0.760092331739826033E-02 -0.320930758282650987E-05 - -0.529813749405098054E-03 -0.838256824281413042E-04 0.879776792041751972E-02 - 0.822717682684740922E-02 -0.303187285528630003E-09 -0.666666666666666970E-02 - -0.349301427021029015E-11 0.790044609360177931E-02 -0.666666666666666970E-02 - -0.407938480218887980E-03 0.871084858846606080E-02 -0.512295469801526959E-03 - -0.458837927071482975E-03 -0.588085060255285046E-03 0.920289209649226975E-02 - -0.666666666666666970E-02 0.755933306797651038E-02 -0.410021685751258990E-11 - -0.607789044576714013E-09 0.822725774217296021E-02 -0.162167689058960993E-03 - -0.666666666666666970E-02 0.755666327531443957E-02 -0.112741283804007006E-10 - -0.165135297900302988E-03 -0.164049635527119992E-08 0.822716577111987958E-02 - -0.666666666666666970E-02 -0.162816151525113997E-03 0.755629731334936036E-02 - -0.145250995281190996E-07 -0.168361965007562991E-03 -0.190237271058338006E-05 - 0.822792892080568931E-02 -0.666666666666666970E-02 -0.434980646616620992E-03 - 0.921590961843673021E-02 -0.672201796129614040E-03 0.000000000000000000E+00 - -0.435117717236683025E-03 -0.623850865689214011E-03 0.926672719210277010E-02 - -0.666666666666666970E-02 -0.391662568195668987E-03 0.857592608264677939E-02 - -0.270377852525331014E-03 0.000000000000000000E+00 -0.666666666666666970E-02 - 0.000000000000000000E+00 -0.470346296009015001E-03 0.000000000000000000E+00 - -0.290024881064321978E-03 0.893923985221735037E-02 -0.666666666666666970E-02 - -0.317995102343517999E-03 0.807402209586039962E-02 -0.163000191627275006E-03 - -0.506466182711903996E-03 -0.315285148839635984E-03 0.895741240163453953E-02 - 0.822721524635562987E-02 -0.564092115954154982E-09 -0.666666666666666970E-02 - -0.389036326356696026E-11 0.758050134875735016E-02 -0.666666666666666970E-02 - 0.758050134875735016E-02 -0.389036331777706000E-11 -0.564092115791524967E-09 - 0.822721524635562987E-02 -0.666666666666666970E-02 -0.162059230234564013E-03 - 0.755676875879613980E-02 -0.404977791069632030E-11 -0.164607827252304003E-03 - -0.598672124761886979E-09 0.822718791318045005E-02 -0.435030727375933994E-03 - 0.921782544139550068E-02 -0.671763596290095993E-03 -0.435067658311500988E-03 - -0.666666666666666970E-02 -0.623720661515962042E-03 0.926678721368117027E-02 - 
-0.666666666666666970E-02 -0.435067665337827991E-03 0.926637336566514015E-02 - -0.623334437794025985E-03 -0.435030720348006980E-03 -0.671597031953381036E-03 - 0.921719062426287934E-02 -0.666666666666666970E-02 -0.162064554103597002E-03 - 0.755721778425398963E-02 -0.421379617299167968E-11 -0.164615465305043013E-03 - -0.621841414828084976E-09 0.822717725632217053E-02 -0.666666666666666970E-02 - -0.435032270976548998E-03 0.921659997236582915E-02 -0.671477595984673039E-03 - 0.000000000000000000E+00 -0.435066114983777015E-03 -0.622858547503055051E-03 - 0.926582760612120956E-02 -0.666666666666666970E-02 0.000000000000000000E+00 - -0.435031835475485022E-03 0.921598351720689934E-02 -0.671332921939591038E-03 - -0.666666666666666970E-02 0.000000000000000000E+00 -0.435066550409885975E-03 - 0.000000000000000000E+00 -0.622784365706614980E-03 0.926515556356406061E-02 - -0.435081591675335006E-03 -0.666666666666666970E-02 0.926715723864761018E-02 - -0.624060476111682047E-03 -0.435016790462375985E-03 -0.671761223543174959E-03 - 0.921784490296891079E-02 -0.666666666666666970E-02 -0.162062411240925002E-03 - 0.755721810753046033E-02 -0.413154453738800000E-11 -0.164605196762570010E-03 - -0.609894095762123987E-09 0.822717754580637971E-02 -0.435056188182044996E-03 - 0.926585891365665042E-02 -0.622867894213649997E-03 -0.435042198968283974E-03 - -0.666666666666666970E-02 -0.671546446351377053E-03 0.921692872689685964E-02 - -0.164341192321288998E-03 -0.666666666666666970E-02 0.822700721314235071E-02 - -0.388276097998368010E-09 -0.164366485757635008E-03 -0.391285012821627015E-09 - 0.822719359814269927E-02 -0.435043836697356014E-03 -0.666666666666666970E-02 - 0.921698374849288030E-02 -0.671558867523572019E-03 -0.435054550554351985E-03 - -0.622870487342790016E-03 0.926586485739499914E-02 -0.666666666666666970E-02 - -0.162059863256517012E-03 0.755722860343106036E-02 -0.403706320675834994E-11 - -0.164610172315551000E-03 -0.596713959736372018E-09 0.822719066745875027E-02 - -0.435029178909017021E-03 0.921782856828711958E-02 -0.671763646252384965E-03 - -0.435069206480010991E-03 -0.666666666666666970E-02 -0.623759080059059956E-03 - 0.926682896934285030E-02 -0.165176323153183011E-03 -0.666666666666666970E-02 - 0.822774275548648036E-02 -0.421612183862497016E-09 -0.164358865204686000E-03 - -0.403616558608118980E-09 0.822717766773221025E-02 -0.666666666666666970E-02 - 0.755853528843129958E-02 -0.410907228794313989E-11 -0.609517161291164049E-09 - 0.822725835180460052E-02 0.822699835770025062E-02 -0.229769141627132009E-09 - -0.666666666666666970E-02 -0.552503936541298003E-11 0.815277412222070036E-02 - -0.431800077704224978E-03 -0.666666666666666970E-02 0.911683253562737932E-02 - -0.645107782251850984E-03 -0.438245881184861979E-03 -0.614761192822427965E-03 - 0.925173111806255020E-02 -0.666666666666666970E-02 -0.162066792004203990E-03 - 0.755722059760233992E-02 -0.430128633401459982E-11 -0.164626435077370987E-03 - -0.634553335781814010E-09 0.822717711494816949E-02 -0.435011239879684001E-03 - -0.666666666666666970E-02 0.921541899333368046E-02 -0.670833188949691958E-03 - 0.000000000000000000E+00 -0.435087140296697020E-03 -0.622361445147185050E-03 - 0.926525312625075051E-02 -0.666666666666666970E-02 -0.435024263236271974E-03 - 0.921464288704679918E-02 -0.670759624758249955E-03 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.666666666666666970E-02 -0.435074121043737996E-03 - 0.000000000000000000E+00 -0.622356277676372038E-03 0.926393943568542926E-02 - -0.666666666666666970E-02 -0.162274431660521006E-03 0.755738313912261012E-02 - 
-0.318240683403240976E-10 -0.165730949472054013E-03 -0.457846708968348003E-08 - 0.822724281302687072E-02 0.822699408685964931E-02 -0.228425558756176994E-09 - -0.666666666666666970E-02 -0.569262164845434979E-11 0.815841793296116953E-02 - -0.666666666666666970E-02 -0.432079856432105011E-03 0.912452767230425080E-02 - -0.647134271290311994E-03 -0.437974703324669004E-03 -0.615294578525974042E-03 - 0.925273450239525071E-02 -0.666666666666666970E-02 0.755849436258760991E-02 - -0.410955652506049982E-11 -0.609618023001794995E-09 0.822725856237871075E-02 - -0.162292777280510000E-03 -0.666666666666666970E-02 0.755742125771192031E-02 - -0.382880319923928003E-10 -0.165829805796784997E-03 -0.549637401523144008E-08 - 0.822725133937288015E-02 -0.666666666666666970E-02 -0.162519791749040997E-03 - 0.758486654085705012E-02 -0.277312982546554016E-04 -0.531411130485370960E-03 - -0.271318851749606982E-03 0.895290555306309938E-02 -0.666666666666666970E-02 - -0.435144726093062025E-03 0.927049355419245005E-02 -0.627194917416272046E-03 - 0.000000000000000000E+00 -0.434953615617772005E-03 -0.673646073339009962E-03 - 0.921927163146945942E-02 -0.666666666666666970E-02 -0.162365825870953989E-03 - 0.770469126010004009E-02 -0.341374404077266992E-05 0.000000000000000000E+00 - -0.666666666666666970E-02 0.000000000000000000E+00 -0.530093736436846978E-03 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.670688877278153049E-04 - 0.878401724365305073E-02 -0.666666666666666970E-02 -0.406469238190150994E-03 - 0.868845253580060976E-02 -0.487445397507600979E-03 -0.459955572937245996E-03 - -0.561455436984956992E-03 0.917876806761750916E-02 0.822724019224498060E-02 - -0.586236017994280044E-09 -0.666666666666666970E-02 -0.399393167827312004E-11 - 0.756942274805354979E-02 -0.666666666666666970E-02 0.756938782679772958E-02 - -0.398490217607864004E-11 -0.582276458527030000E-09 0.822717710481316034E-02 - -0.666666666666666970E-02 -0.162064188279850002E-03 0.755638507164865976E-02 - -0.426955655883516001E-11 -0.164621776652662993E-03 -0.630372617736862023E-09 - 0.822717559470338006E-02 -0.435097665884428989E-03 0.926770143129381982E-02 - -0.624846533095091048E-03 -0.435000709733791000E-03 -0.666666666666666970E-02 - -0.673347090449469992E-03 0.921745134737769929E-02 -0.666666666666666970E-02 - -0.164221437102121003E-03 0.822590307607582068E-02 -0.301540531002169025E-09 - -0.164276463596261000E-03 -0.317267642602591980E-09 0.822720896113589978E-02 - -0.666666666666666970E-02 -0.435040484785382989E-03 0.921687621618049943E-02 - -0.671540621954609001E-03 -0.435057902230038000E-03 -0.622872302677949011E-03 - 0.926585782249450020E-02 -0.666666666666666970E-02 -0.162127341684184989E-03 - 0.759121275229850991E-02 -0.381774206063630980E-11 0.000000000000000000E+00 - -0.164583013631449993E-03 -0.545969304848883959E-09 0.822717722380189066E-02 - -0.666666666666666970E-02 0.000000000000000000E+00 -0.166319316568878998E-03 - 0.755721776450365024E-02 -0.125637200248759998E-10 -0.666666666666666970E-02 - 0.000000000000000000E+00 -0.166854313970594991E-03 0.000000000000000000E+00 - -0.695766357075677050E-09 0.759121342784747997E-02 -0.435007341721630001E-03 - -0.666666666666666970E-02 0.921767422715922928E-02 -0.673397481031394029E-03 - -0.435091036898059991E-03 -0.624857014917886951E-03 0.926772526275846005E-02 - -0.666666666666666970E-02 -0.435042780430306005E-03 0.921682237057486024E-02 - -0.671414330383147007E-03 -0.435055606759090997E-03 -0.622737221255020975E-03 - 0.926573013268000914E-02 -0.164512437176590000E-03 0.822720912015702070E-02 - 
-0.478141445901109973E-09 -0.162035132711022011E-03 -0.666666666666666970E-02 - -0.321987104749231018E-11 0.755727832223314962E-02 -0.162065357632703997E-03 - -0.666666666666666970E-02 0.755727582947162985E-02 -0.423979469854183965E-11 - -0.164618725596388991E-03 -0.625584490453962995E-09 0.822717479206993944E-02 - -0.162035093478465002E-03 -0.666666666666666970E-02 0.755725620865007990E-02 - -0.322006453375858990E-11 -0.164512451387367005E-03 -0.478178417741383017E-09 - 0.822720912019385062E-02 -0.666666666666666970E-02 -0.162063731945476990E-03 - 0.755721533840931960E-02 -0.418199409455997034E-11 -0.164611237471814999E-03 - -0.617212655483738035E-09 0.822717573526372987E-02 -0.435072204818992981E-03 - 0.926779358414435948E-02 -0.624886835257608040E-03 -0.435026179922837024E-03 - -0.666666666666666970E-02 -0.673540781035909049E-03 0.921830844145678924E-02 - -0.435026048326556981E-03 -0.666666666666666970E-02 0.921631022735146070E-02 - -0.671342133550576967E-03 -0.435072336384253024E-03 -0.622763518765494998E-03 - 0.926572086739899971E-02 -0.666666666666666970E-02 0.755741246850622026E-02 - -0.410910644194978984E-11 -0.606515170668247041E-09 0.822717712895315936E-02 - 0.822711877443555936E-02 -0.222069534792841002E-09 -0.666666666666666970E-02 - -0.774169806704296934E-11 0.819168186272316995E-02 -0.433633789741903004E-03 - -0.666666666666666970E-02 0.917151071300205921E-02 -0.660635211420117002E-03 - -0.436454594164527982E-03 -0.620192759941490040E-03 0.926043761605102080E-02 - -0.666666666666666970E-02 -0.435400642045188024E-03 0.924569487205297025E-02 - -0.602675682168614998E-03 -0.434697126875424015E-03 -0.641341627119159036E-03 - 0.919049697820984064E-02 -0.417766802147158021E-03 -0.666666666666666970E-02 - 0.884119022200715969E-02 -0.565196699593685948E-03 0.000000000000000000E+00 - -0.450926780658037006E-03 -0.597865634330208961E-03 0.921780464690040956E-02 - -0.666666666666666970E-02 -0.162927680290140999E-03 0.761083369282181989E-02 - -0.416350633850501013E-04 0.000000000000000000E+00 0.000000000000000000E+00 - -0.666666666666666970E-02 -0.530230645664087974E-03 0.000000000000000000E+00 - -0.269827347274097977E-03 0.866009136742689964E-02 -0.666666666666666970E-02 - -0.162144345304457009E-03 0.760092331739868968E-02 -0.320930758282241997E-05 - -0.529813749405099029E-03 -0.838256824279560954E-04 0.879776792041734972E-02 - 0.822717682684740922E-02 -0.303187285555688993E-09 -0.666666666666666970E-02 - -0.349301427021062013E-11 0.790044609360190074E-02 -0.666666666666666970E-02 - -0.407938480218909014E-03 0.871084858846631060E-02 -0.512295469801619008E-03 - -0.458837927071466983E-03 -0.588085060255303044E-03 0.920289209649229924E-02 - -0.666666666666666970E-02 0.755933306797651038E-02 -0.410021674909242031E-11 - -0.607789044576720010E-09 0.822725774217296021E-02 -0.162167689058960993E-03 - -0.666666666666666970E-02 0.755666327531452024E-02 -0.112741284887906999E-10 - -0.165135297900302012E-03 -0.164049635526676995E-08 0.822716577111987958E-02 - -0.666666666666666970E-02 -0.162064381314455013E-03 0.755721681642354016E-02 - -0.420708647347467005E-11 -0.164614463272000995E-03 -0.620861048269696965E-09 - 0.822717666428922009E-02 -0.666666666666666970E-02 -0.435032929063819981E-03 - 0.921662843024339054E-02 -0.671487062718633038E-03 0.000000000000000000E+00 - -0.435065457005957975E-03 -0.622863607135336990E-03 0.926583682639911012E-02 - -0.666666666666666970E-02 -0.435033606033594978E-03 0.921549100307902068E-02 - -0.670764254703717011E-03 0.000000000000000000E+00 -0.666666666666666970E-02 
- 0.000000000000000000E+00 -0.435064780144060010E-03 0.000000000000000000E+00 - -0.622228101134277012E-03 0.926458636134656031E-02 -0.666666666666666970E-02 - -0.162412248298241007E-03 0.755747465195114043E-02 -0.132892288474876992E-09 - -0.166453306564733994E-03 -0.187762010463222004E-07 0.822729692880195054E-02 - 0.822701112868542972E-02 -0.238165066447310994E-09 -0.666666666666666970E-02 - -0.481872285692390998E-11 0.811743485472943976E-02 -0.666666666666666970E-02 - 0.907030660461941030E-02 -0.635289343114582030E-03 -0.614711342014916053E-03 - 0.924712095234279940E-02 -0.666666666666666970E-02 -0.162067491346243997E-03 - 0.755721660010066996E-02 -0.433042344878489997E-11 -0.164637989129536011E-03 - -0.639067222491722952E-09 0.822718313255254956E-02 -0.435123629441636984E-03 - 0.926610912903469072E-02 -0.623064810739766984E-03 -0.434974730176162019E-03 - -0.666666666666666970E-02 -0.671125339170808039E-03 0.921519637816755992E-02 - -0.666666666666666970E-02 -0.166000837408311988E-03 0.822475961254992040E-02 - -0.105963041200035007E-07 -0.166020054886163010E-03 -0.114023574905160996E-07 - 0.822673401751872028E-02 -0.666666666666666970E-02 -0.435040411541543013E-03 - 0.921687888628672063E-02 -0.671545404567263976E-03 -0.435057975467457981E-03 - -0.622877304336513001E-03 0.926586291132954973E-02 -0.666666666666666970E-02 - -0.162130707049625006E-03 0.759120585643323006E-02 -0.392634071196178009E-11 - 0.000000000000000000E+00 -0.164597967653741013E-03 -0.561253610552618950E-09 - 0.822717507317871979E-02 -0.666666666666666970E-02 0.000000000000000000E+00 - -0.166319143409792997E-03 0.755721565731708005E-02 -0.124778474586887002E-10 - -0.666666666666666970E-02 0.000000000000000000E+00 -0.166854049995611003E-03 - 0.000000000000000000E+00 -0.691000750970318969E-09 0.759120679480445974E-02 - -0.435051311183006005E-03 -0.666666666666666970E-02 0.926639102794180047E-02 - -0.623356751569072967E-03 -0.435047076190057990E-03 -0.671803857406816016E-03 - 0.921783849635480004E-02 -0.666666666666666970E-02 -0.435004887685989992E-03 - 0.921760501413688982E-02 -0.673382235862111022E-03 -0.435093489874442013E-03 - -0.624855064858449022E-03 0.926773123149998018E-02 -0.166320189393602005E-03 - 0.822674016833681992E-02 -0.176036669988993003E-07 -0.162405090688964003E-03 - -0.666666666666666970E-02 -0.125356963990057004E-09 0.755633079584900041E-02 - -0.162059610339947000E-03 -0.666666666666666970E-02 0.755636445209539962E-02 - -0.409725503580782012E-11 -0.164619621806747006E-03 -0.605978989258597974E-09 - 0.822719229488561021E-02 -0.162405940044832003E-03 -0.666666666666666970E-02 - 0.755662367990892966E-02 -0.125254866802548012E-09 -0.166319952568922996E-03 - -0.175849783175382999E-07 0.822674014969775047E-02 -0.666666666666666970E-02 - -0.162059866198233001E-03 0.755722656090779960E-02 -0.403692298108515966E-11 - -0.164606538526818004E-03 -0.596577171893270041E-09 0.822718795529438045E-02 - -0.435028073525391026E-03 0.921786254908986934E-02 -0.671799805131304017E-03 - -0.435070311635940977E-03 -0.666666666666666970E-02 -0.623819803880377045E-03 - 0.926689343860783929E-02 -0.435070247954426976E-03 -0.666666666666666970E-02 - 0.926779304085995070E-02 -0.624694668192695042E-03 -0.435028137220242991E-03 - -0.672979588001651041E-03 0.921846396898064017E-02 -0.666666666666666970E-02 - 0.894946699024212929E-02 -0.600122097862237002E-03 -0.606052261066108988E-03 - 0.923126173183646918E-02 0.822707325920852932E-02 -0.439334748407924013E-09 - -0.666666666666666970E-02 -0.340850404729477990E-11 0.767218067483432017E-02 
- -0.163197277611309987E-03 -0.666666666666666970E-02 0.767185136575801018E-02 - -0.332821151740468006E-08 -0.167963560070563987E-03 -0.390028261492079976E-06 - 0.822779264793695002E-02 -0.666666666666666970E-02 -0.424429129575981982E-03 - 0.884286864444589968E-02 -0.396957256272790994E-03 -0.445115815542256987E-03 - -0.385983334887198980E-03 0.902131707790924942E-02 -0.407763611263302025E-03 - -0.666666666666666970E-02 0.870515217458626978E-02 -0.498571990041084007E-03 - 0.000000000000000000E+00 -0.458970290280038998E-03 -0.570346192985176997E-03 - 0.918907244525967067E-02 -0.666666666666666970E-02 -0.322071566303724021E-03 - 0.801064459323599064E-02 -0.151974252254838009E-03 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.666666666666666970E-02 -0.504268248287133031E-03 - 0.000000000000000000E+00 -0.280047589106463980E-03 0.858275366599436061E-02 - -0.666666666666666970E-02 -0.162136502882231009E-03 0.755886084998397007E-02 - -0.797239446559356004E-11 -0.165003120614180988E-03 -0.116699722701056003E-08 - 0.822721467884345026E-02 0.926072857339886937E-02 -0.621240348529511033E-03 - -0.666666666666666970E-02 -0.664187000217392946E-03 0.918478778839808968E-02 - -0.666666666666666970E-02 -0.434091712912973990E-03 0.918381835623783074E-02 - -0.661965111513909037E-03 -0.436002089949920021E-03 -0.618955328845967952E-03 - 0.925997826239693955E-02 -0.666666666666666970E-02 0.755828330701808961E-02 - -0.411187050494477969E-11 -0.610057942551154034E-09 0.822725843300332015E-02 - -0.162643320272068002E-03 -0.666666666666666970E-02 0.760168784361417004E-02 - -0.350264221860245974E-04 -0.531763987837332967E-03 -0.285598197078351006E-03 - 0.896123574808329985E-02 -0.666666666666666970E-02 -0.162064603067110991E-03 - 0.755721765251989031E-02 -0.421568927958496012E-11 0.000000000000000000E+00 - -0.164615562653504011E-03 -0.622111769806188027E-09 0.822717714448987017E-02 - -0.666666666666666970E-02 -0.435031767299973019E-03 0.921658397588867064E-02 - -0.671474829222033949E-03 0.000000000000000000E+00 -0.435066618573651026E-03 - -0.622858771889590011E-03 0.926582672216797083E-02 -0.666666666666666970E-02 - 0.000000000000000000E+00 -0.435031759821466978E-03 0.921598447351964015E-02 - -0.671353608292523957E-03 -0.666666666666666970E-02 0.000000000000000000E+00 - -0.435066626050711989E-03 0.000000000000000000E+00 -0.622806883477106956E-03 - 0.926515741405328948E-02 -0.666666666666666970E-02 -0.162062535484502993E-03 - 0.755721765241602027E-02 -0.413624146533982998E-11 -0.164605264966013012E-03 - -0.610559194090089995E-09 0.822717713298093921E-02 0.822717692676106990E-02 - -0.401916224683843019E-09 -0.666666666666666970E-02 -0.398503994321098006E-09 - 0.822695176184735054E-02 -0.666666666666666970E-02 0.921688643859516939E-02 - -0.671543140262901043E-03 -0.622873028246735054E-03 0.926585914257952933E-02 - -0.666666666666666970E-02 -0.162062560104490004E-03 0.755721765317601043E-02 - -0.413717829056970014E-11 -0.164605387585608994E-03 -0.610695437742429953E-09 - 0.822717713403002017E-02 -0.435057919911320979E-03 0.926585996691042975E-02 - -0.622874416570012033E-03 -0.435040467102555022E-03 -0.666666666666666970E-02 - -0.671542718462655053E-03 0.921687774051401995E-02 -0.666666666666666970E-02 - -0.164358956864967009E-03 0.822694333429544941E-02 -0.400838058665778016E-09 - -0.164360413187303008E-03 -0.404399206698849991E-09 0.822717692928500062E-02 - -0.666666666666666970E-02 -0.435040480198734978E-03 0.921687629883806946E-02 - -0.671540846988837015E-03 0.000000000000000000E+00 -0.435057906816285018E-03 - 
-0.622872546234125974E-03 0.926585805175031071E-02 -0.666666666666666970E-02 - -0.162127546305523010E-03 0.759121249965894005E-02 -0.382423911661819962E-11 - 0.000000000000000000E+00 -0.164583726826487010E-03 -0.546878146458841976E-09 - 0.822717706956129988E-02 -0.666666666666666970E-02 0.000000000000000000E+00 - -0.166319285296618989E-03 0.755721766130180994E-02 -0.125511345494907995E-10 - -0.666666666666666970E-02 0.000000000000000000E+00 -0.166854277186167003E-03 - 0.000000000000000000E+00 -0.695070062425207962E-09 0.759121319044449988E-02 - -0.435040455888697000E-03 -0.666666666666666970E-02 0.921687736420685050E-02 - -0.671542633493042978E-03 -0.435057931124196984E-03 -0.622874398846488971E-03 - 0.926585992663164944E-02 -0.666666666666666970E-02 -0.435040480266267004E-03 - 0.921687629128858064E-02 -0.671540837058780997E-03 -0.435057906748759010E-03 - -0.622872536301050019E-03 0.926585804176803009E-02 -0.164605382599736005E-03 - 0.822717713315264040E-02 -0.610689878464067020E-09 -0.162062559112368987E-03 - -0.666666666666666970E-02 -0.413714002853130017E-11 0.755721765707715026E-02 - -0.162062560979487995E-03 -0.666666666666666970E-02 0.755721765716044040E-02 - -0.413721119467013017E-11 -0.164605391905931007E-03 -0.610700236970853951E-09 - 0.822717713430983974E-02 -0.162062559104255008E-03 -0.666666666666666970E-02 - 0.755721765270791958E-02 -0.413714010482357039E-11 -0.164605382602643998E-03 - -0.610689887899547005E-09 0.822717713315264040E-02 -0.666666666666666970E-02 - -0.162062002688690993E-03 0.755721765325861970E-02 -0.411602396034761983E-11 - -0.164602611349319008E-03 -0.607618836657478970E-09 0.822717713082904076E-02 - -0.435050795545137013E-03 0.926588559200255957E-02 -0.622885679652955006E-03 - -0.435047591837533988E-03 -0.666666666666666970E-02 -0.671596715586648962E-03 - 0.921711691727978080E-02 -0.435047591829257026E-03 -0.666666666666666970E-02 - 0.921711503527408021E-02 -0.671594744907884041E-03 -0.435050795553413975E-03 - -0.622883788869598950E-03 0.926588363014568990E-02 -0.666666666666666970E-02 - 0.921687630349790968E-02 -0.671540851815855032E-03 -0.622872550856051971E-03 - 0.926585805656933957E-02 0.822717712936564018E-02 -0.606929320764278968E-09 - -0.666666666666666970E-02 -0.411128314877385017E-11 0.755721765699138986E-02 - -0.162062559115238003E-03 -0.666666666666666970E-02 0.755721765701833012E-02 - -0.413714010896968962E-11 -0.164605382614328987E-03 -0.610689894184595994E-09 - 0.822717713311110939E-02 -0.666666666666666970E-02 -0.435040480219567002E-03 - 0.921687629951207024E-02 -0.671540847113225037E-03 -0.435057906795455001E-03 - -0.622872546234027962E-03 0.926585805179963064E-02 -0.162127546311748987E-03 - -0.666666666666666970E-02 0.759121249955777965E-02 -0.382423940240163967E-11 - 0.000000000000000000E+00 -0.164583726855365006E-03 -0.546878175305287047E-09 - 0.822717706955716950E-02 -0.666666666666666970E-02 -0.166319285308019998E-03 - 0.755721766131254007E-02 -0.125511384896241995E-10 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.666666666666666970E-02 -0.166854277196930989E-03 - 0.000000000000000000E+00 -0.695070273739865966E-09 0.759121319033985009E-02 - -0.666666666666666970E-02 -0.162062001821340992E-03 0.755721765238263031E-02 - -0.411599097514774006E-11 -0.164602607032878005E-03 -0.607614054874119969E-09 - 0.822717713004762069E-02 0.926588367541618038E-02 -0.622883811249193005E-03 - -0.666666666666666970E-02 -0.671594835876209036E-03 0.921711542242288975E-02 - -0.666666666666666970E-02 -0.435047603237387977E-03 0.921711541881104976E-02 - 
-0.671594832109726034E-03 -0.435050784145465008E-03 -0.622883807637388031E-03 - 0.926588367164974010E-02 -0.666666666666666970E-02 0.755721765237517013E-02 - -0.411128320041531016E-11 -0.606929330558691026E-09 0.822717712936565058E-02 - -0.435040480193840022E-03 -0.666666666666666970E-02 0.921687628881386924E-02 - -0.671540836666198004E-03 -0.435057906821179974E-03 -0.622872536362236969E-03 - 0.926585804144416936E-02 -0.166666666666667011E-01 -0.435069629630581997E-03 - 0.194633698669611013E-01 -0.118122796424918004E-02 -0.435069154494976983E-03 - -0.125613726459116001E-02 0.193884422615678009E-01 -0.166666666666667011E-01 - -0.139671085486954007E-03 0.172964744640674989E-01 -0.239732978890832008E-07 - -0.141641486800695994E-03 -0.121305359387927990E-05 0.178147059436198005E-01 - -0.141641814770074998E-03 0.178147059816575991E-01 -0.121308374397845994E-05 - -0.166666666666667011E-01 -0.139671142305139987E-03 -0.239739107040978989E-07 - 0.172964744651716990E-01 -0.166666666666667011E-01 -0.435068671068379023E-03 - 0.000000000000000000E+00 0.193848780140391010E-01 -0.125250859257347992E-02 - -0.435070113055705985E-03 -0.117767774816952006E-02 0.194597410164681003E-01 - -0.166666666666667011E-01 0.172969944943952014E-01 -0.779115312058558027E-07 - -0.166666666666667011E-01 -0.755324935173564974E-05 0.179004996941746000E-01 - -0.166666666666667011E-01 0.178976999748920000E-01 -0.387667548978359018E-05 - -0.233628102655817991E-06 0.178135930268625009E-01 -0.166666666666667011E-01 - -0.435068785870240020E-03 0.193888089807052987E-01 -0.125652619301678007E-02 - -0.435069998254330982E-03 -0.118160505927794005E-02 0.194637572604987007E-01 - -0.435033736890810016E-03 -0.166666666666667011E-01 0.193943160872508992E-01 - -0.126292016425918010E-02 0.000000000000000000E+00 -0.435105043203752020E-03 - -0.118838524744318999E-02 0.194704567784157005E-01 -0.166666666666667011E-01 - -0.139670810780983011E-03 0.000000000000000000E+00 0.172964393749100998E-01 - -0.239726372267789987E-07 -0.141640262301834013E-03 -0.121310025861443004E-05 - 0.178147059964809014E-01 -0.166666666666667011E-01 -0.435069265953867983E-03 - 0.193887440338862994E-01 -0.125644950186007995E-02 -0.435069518171818988E-03 - -0.118152293796727997E-02 0.194636762447145013E-01 -0.140348173477511997E-03 - 0.178144318724902002E-01 -0.933813955352833025E-06 -0.166666666666667011E-01 - -0.140348162575281002E-03 -0.933775488811803979E-06 0.178144292153939003E-01 - -0.166666666666667011E-01 -0.139703533487054989E-03 0.172964344581288988E-01 - -0.243260553268771015E-07 -0.141829183573385009E-03 -0.123048280177751006E-05 - 0.178147222203411011E-01 -0.435051982399214975E-03 -0.166666666666667011E-01 - 0.193933724724300004E-01 -0.126258834031118010E-02 -0.435086800763898984E-03 - 0.000000000000000000E+00 -0.166666666666667011E-01 -0.118819422876379008E-02 - 0.194685490108550997E-01 0.000000000000000000E+00 -0.435025946733501019E-03 - 0.000000000000000000E+00 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.193872403842459996E-01 -0.125578622091835007E-02 -0.435112831393176015E-03 - -0.118149517659288990E-02 0.194634742330271997E-01 -0.166666666666667011E-01 - 0.172964350463347995E-01 -0.239721729446109010E-07 -0.121308719330683991E-05 - 0.178147059846293990E-01 -0.166666666666667011E-01 -0.435069341371819001E-03 - 0.193887901446231983E-01 -0.125649495627712989E-02 -0.435069442753911013E-03 - -0.118156624050173007E-02 0.194637211303591989E-01 -0.166666666666667011E-01 - 0.172964137870020990E-01 -0.947281877378979960E-08 0.000000000000000000E+00 - 
-0.166666666666667011E-01 -0.426824290648908980E-06 0.177495617930084996E-01 - 0.000000000000000000E+00 -0.140229676531456008E-03 0.000000000000000000E+00 - -0.166666666666667011E-01 0.000000000000000000E+00 0.177491875365273014E-01 - -0.500657647124818999E-07 -0.140479994479306001E-03 -0.522090189834258032E-06 - 0.178140176711019983E-01 -0.435033290824524978E-03 -0.166666666666667011E-01 - 0.193943004566385011E-01 -0.126291331604961991E-02 -0.435105489168504994E-03 - -0.118838501644233006E-02 0.194704545970095996E-01 -0.435268249897141982E-03 - -0.166666666666667011E-01 0.194803294031692005E-01 -0.120221696823341996E-02 - -0.434870408625407019E-03 -0.126728096347563995E-02 0.194000952173805016E-01 - -0.166666666666667011E-01 0.172950455057520984E-01 -0.209695222919133997E-07 - 0.000000000000000000E+00 -0.166666666666667011E-01 -0.104309476531645996E-05 - 0.178040398390986003E-01 0.000000000000000000E+00 -0.166666666666667011E-01 - 0.173086281050639987E-01 -0.234055546331482985E-07 -0.116921175397781010E-05 - 0.178149613334784990E-01 -0.166666666666667011E-01 -0.141165505762467993E-03 - 0.172962556489167016E-01 -0.473446914579739040E-05 -0.544658362791339019E-03 - -0.346186312977599982E-04 0.183333438873188002E-01 -0.139754712611172996E-03 - -0.166666666666667011E-01 0.176170424650436003E-01 -0.246162966049732998E-07 - -0.140206415574070008E-03 -0.567757815085220049E-06 0.178140588186939987E-01 - -0.165095245016433009E-03 0.178223322459852007E-01 -0.681146904710235036E-05 - -0.166666666666667011E-01 -0.150451686795303991E-03 -0.294774223703715983E-06 - 0.176167018986788000E-01 -0.166666666666667011E-01 -0.421169644089434015E-03 - 0.189193940368806990E-01 -0.945437876210229950E-03 -0.448291096827642985E-03 - -0.100548436568255006E-02 0.192537240725896008E-01 -0.166666666666667011E-01 - 0.173199119819372992E-01 -0.235984471741585014E-07 0.000000000000000000E+00 - -0.166666666666667011E-01 0.000000000000000000E+00 -0.123676172594726990E-05 - 0.178172740172132006E-01 -0.166666666666667011E-01 -0.435171428458813985E-03 - 0.194709389718316986E-01 -0.119092785784032009E-02 -0.434967322589985023E-03 - -0.126074169121839991E-02 0.193933111634515994E-01 -0.166666666666667011E-01 - 0.172958818583838006E-01 -0.240054389113918998E-07 -0.121597095988231006E-05 - 0.178147088549027999E-01 -0.434490077538828977E-03 0.000000000000000000E+00 - -0.166666666666667011E-01 0.000000000000000000E+00 0.192819870996423989E-01 - -0.115810409218265005E-02 -0.435647599097352997E-03 -0.109302488191087002E-02 - 0.193702791631770004E-01 -0.166666666666667011E-01 0.172964231611969985E-01 - -0.237439154341176999E-07 0.000000000000000000E+00 0.000000000000000000E+00 - -0.166666666666667011E-01 0.000000000000000000E+00 -0.119986668002372008E-05 - 0.178139470404297014E-01 -0.166666666666667011E-01 -0.435069730380088988E-03 - 0.194633683526404012E-01 -0.118122901996727003E-02 -0.435069053745286004E-03 - -0.125613277918751993E-02 0.193884385258774015E-01 -0.166666666666667011E-01 - -0.139670526915550006E-03 0.172964750459083001E-01 -0.239672729870900994E-07 - -0.141638257586788001E-03 -0.121275616494760993E-05 0.178147056890299998E-01 - -0.141820432945145006E-03 0.178147229316946987E-01 -0.122951654344512007E-05 - -0.166666666666667011E-01 -0.139702093156581013E-03 -0.243082078767443003E-07 - 0.172964754242462999E-01 -0.166666666666667011E-01 -0.435009064920768992E-03 - 0.193710715232028007E-01 -0.123960904534558994E-02 -0.435129707573877990E-03 - -0.116591990546044996E-02 0.194474141497074995E-01 -0.166666666666667011E-01 - 
0.172985028689811998E-01 -0.223410986359397003E-07 -0.166666666666667011E-01 - -0.111564513924942993E-05 0.178095672953987004E-01 -0.166666666666667011E-01 - 0.193790139692226991E-01 -0.125224418475917997E-02 -0.118134652118899995E-02 - 0.194616264856153007E-01 -0.166666666666667011E-01 -0.435069813890499976E-03 - 0.194637397133314016E-01 -0.118159504407725001E-02 -0.435068970234672974E-03 - -0.125650151292971989E-02 0.193888007309931006E-01 -0.435063659072025002E-03 - -0.166666666666667011E-01 0.193957932538242998E-01 -0.126373923029710002E-02 - 0.000000000000000000E+00 -0.435075124949578019E-03 -0.118877268529867007E-02 - 0.194710559190932990E-01 -0.166666666666667011E-01 -0.139670655296777991E-03 - 0.172964394256112999E-01 -0.239709674501792008E-07 -0.141639364432354998E-03 - -0.121301807421782997E-05 0.178147059272172996E-01 -0.166666666666667011E-01 - -0.435069551479130975E-03 0.194636473054102983E-01 -0.118149814806462004E-02 - -0.435069232646526994E-03 -0.125641682842023996E-02 0.193887163382647013E-01 - -0.140347813456026012E-03 0.178144318351523001E-01 -0.933769427979961972E-06 - -0.166666666666667011E-01 -0.140367975523149991E-03 -0.933880353406906957E-06 - 0.178144352914434007E-01 -0.166666666666667011E-01 -0.139694549084838003E-03 - 0.172964390046605986E-01 -0.242291801426243003E-07 -0.141777300691703004E-03 - -0.122571812837964004E-05 0.178147201764872996E-01 -0.435047850309519990E-03 - -0.166666666666667011E-01 0.193943737538610009E-01 -0.126316205126679004E-02 - 0.000000000000000000E+00 -0.435090932343969020E-03 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.166666666666667011E-01 -0.118860422039771007E-02 - 0.194699012602338010E-01 0.000000000000000000E+00 -0.435045457437436027E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.193878817571609997E-01 - -0.125604652907840004E-02 -0.435093324868215015E-03 -0.118146947412013995E-02 - 0.194635297377060007E-01 -0.139670603133990989E-03 -0.166666666666667011E-01 - 0.172964396533356012E-01 -0.239704033946370987E-07 -0.141639061330424995E-03 - -0.121298992344433003E-05 0.178147059349155999E-01 -0.166666666666667011E-01 - -0.435069549163963982E-03 0.194637077506569015E-01 -0.118155678661577995E-02 - -0.435069234961695992E-03 -0.125647743137780000E-02 0.193887749967509987E-01 - -0.139700055968098002E-03 0.000000000000000000E+00 -0.166666666666667011E-01 - 0.172964161546672983E-01 -0.947619561016756952E-08 -0.141286732534576009E-03 - 0.000000000000000000E+00 -0.166666666666667011E-01 -0.426994435274359988E-06 - 0.177495882894109988E-01 0.000000000000000000E+00 -0.140193998721740011E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.177492151422632000E-01 - -0.498003721032979000E-07 -0.140435023641922004E-03 -0.519120801586516000E-06 - 0.178140167474564008E-01 -0.435057327600382997E-03 -0.166666666666667011E-01 - 0.193955726939472016E-01 -0.126364253425448000E-02 -0.435081456064144001E-03 - -0.118875145877563992E-02 0.194710320647638012E-01 -0.434101287475744974E-03 - -0.166666666666667011E-01 0.193457297954395986E-01 -0.123123427263146005E-02 - -0.436034461490277003E-03 -0.117022370416927993E-02 0.194484060865461987E-01 - -0.166666666666667011E-01 0.172978496798759998E-01 -0.927299841537387054E-08 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.166666666666667011E-01 - -0.415800227862941001E-06 0.177484703644168011E-01 -0.166666666666667011E-01 - 0.193674050743437992E-01 -0.124713171719898993E-02 -0.118115734794936003E-02 - 0.194600874520639011E-01 -0.166666666666667011E-01 -0.141885046454967004E-03 - 
- [several thousand removed data lines elided: triples of double-precision E-format values from a numeric data file in this patch; no header, comment, or other recoverable structure]
-0.141292751882909993E-03 0.000000000000000000E+00 -0.166666666666667011E-01 - 0.000000000000000000E+00 -0.429511932811560008E-06 0.177498907292438987E-01 - -0.166666666666667011E-01 -0.435115740933037984E-03 0.188852022804432002E-01 - -0.625668905066499950E-03 -0.435023034878147990E-03 -0.676916698779699965E-03 - 0.188324708841117010E-01 -0.166666666666667011E-01 -0.139446455078796010E-03 - 0.174062615201245989E-01 -0.190374494518299010E-07 -0.139967041096937007E-03 - -0.766079367871497027E-06 0.178142633180293011E-01 -0.532899100111402005E-03 - 0.187657876777795017E-01 -0.512856379296787999E-03 -0.166666666666667011E-01 - -0.274566803915548998E-03 -0.183873918065665000E-03 0.177330707817049994E-01 - -0.166666666666667011E-01 -0.142934407180007997E-03 0.173279989208002007E-01 - -0.109940092441766996E-06 -0.158611617255726990E-03 -0.505928492620163000E-05 - 0.178189142493529011E-01 -0.166666666666667011E-01 0.191837856777059002E-01 - -0.116715931604291999E-02 -0.166666666666667011E-01 -0.115261550320967993E-02 - 0.193075009050143985E-01 -0.166666666666667011E-01 0.192715504007066017E-01 - -0.120392391412785997E-02 -0.117898318183162006E-02 0.194477863374666986E-01 - -0.166666666666667011E-01 -0.435539737045843010E-03 0.194769265765691001E-01 - -0.120385500091470992E-02 -0.434598338907063995E-03 -0.125615984296152997E-02 - 0.193900185880402989E-01 -0.435045514645459013E-03 -0.166666666666667011E-01 - 0.193914099137533005E-01 -0.125968948646175004E-02 0.000000000000000000E+00 - -0.435093267670908987E-03 -0.118504666533010995E-02 0.194671248940259008E-01 - -0.166666666666667011E-01 -0.139670791769998004E-03 0.172964393766727002E-01 - -0.239724329252018989E-07 -0.141640152553009998E-03 -0.121309020944367992E-05 - 0.178147059866567009E-01 -0.166666666666667011E-01 -0.435069301629178995E-03 - 0.193887307626826010E-01 -0.125643515612083000E-02 -0.435069482496533021E-03 - -0.118150837470912999E-02 0.194636615934981015E-01 -0.140348157921430997E-03 - 0.178144318702273992E-01 -0.933811502946043969E-06 -0.166666666666667011E-01 - -0.140348150101401000E-03 -0.933783910990536035E-06 0.178144299643035985E-01 - -0.166666666666667011E-01 -0.139689488845466003E-03 0.172964362338874009E-01 - -0.241738769572596000E-07 -0.141748099793582007E-03 -0.122299932206489994E-05 - 0.178147151092019999E-01 -0.435059916514922017E-03 -0.166666666666667011E-01 - 0.193909887083489017E-01 -0.125955268129309002E-02 0.000000000000000000E+00 - -0.435078867325628023E-03 0.000000000000000000E+00 0.000000000000000000E+00 - -0.166666666666667011E-01 -0.118493749346423004E-02 0.194660289027400006E-01 - 0.000000000000000000E+00 -0.435044083341360984E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.193878554391240000E-01 -0.125604934766983993E-02 - -0.435094698749261975E-03 -0.118149272049225995E-02 0.194635444483519000E-01 - -0.139670978031983007E-03 -0.166666666666667011E-01 0.172964365263250999E-01 - -0.239746177115109000E-07 -0.141641256924305989E-03 -0.121320404495395004E-05 - 0.178147060925782016E-01 -0.166666666666667011E-01 -0.435069350149236000E-03 - 0.193887875206789004E-01 -0.125649208526834003E-02 -0.435069433976496020E-03 - -0.118156330259116009E-02 0.194637181802562002E-01 -0.139700455923195987E-03 - 0.000000000000000000E+00 -0.166666666666667011E-01 0.172964141750712001E-01 - -0.947598200812492946E-08 -0.141288502520823991E-03 0.000000000000000000E+00 - -0.166666666666667011E-01 -0.426976078434958026E-06 0.177495760715989000E-01 - 0.000000000000000000E+00 -0.140209281941629005E-03 -0.166666666666667011E-01 - 
0.000000000000000000E+00 0.177492016410450010E-01 -0.499125605911364029E-07 - -0.140454288917710998E-03 -0.520385719864523011E-06 0.178140161378314010E-01 - -0.435047474599787026E-03 -0.166666666666667011E-01 0.193914777822561009E-01 - -0.125971925119921992E-02 -0.435091308001488001E-03 -0.118504760094160990E-02 - 0.194671336437930997E-01 -0.435388198571750988E-03 -0.166666666666667011E-01 - 0.194580790627339011E-01 -0.118263492130232990E-02 -0.434750259582202979E-03 - -0.124178531317212996E-02 0.193754724084427010E-01 -0.166666666666667011E-01 - 0.173081480357068010E-01 -0.244895773575376999E-04 0.000000000000000000E+00 - -0.166666666666667011E-01 -0.133922698986129010E-03 0.178635891344391985E-01 - 0.000000000000000000E+00 -0.166666666666667011E-01 0.193849898136359000E-01 - -0.125483132562911989E-02 -0.118151827367967994E-02 0.194632328928340016E-01 - -0.166666666666667011E-01 -0.142602747236407002E-03 0.172960559757211994E-01 - -0.117227408613948994E-06 -0.158587582136584995E-03 -0.567497435835260990E-05 - 0.178189581300063002E-01 -0.139720580991034007E-03 -0.166666666666667011E-01 - 0.177036782302840007E-01 -0.306502532037200970E-07 -0.139932855252091994E-03 - -0.499376182467009957E-06 0.178139944449866999E-01 -0.546160785305606963E-03 - 0.184421025597138015E-01 -0.894167397427545998E-04 -0.166666666666667011E-01 - -0.147442821218409995E-03 -0.240244287551271008E-04 0.177137723141376005E-01 - -0.166666666666667011E-01 -0.400413696341282016E-03 0.186079233649920003E-01 - -0.750380296538879967E-03 -0.465646690569393017E-03 -0.925379786421282957E-03 - 0.191586075172780999E-01 -0.166666666666667011E-01 0.177029162002804996E-01 - -0.193968089936665005E-03 0.000000000000000000E+00 -0.166666666666667011E-01 - 0.000000000000000000E+00 -0.245143973844672025E-03 0.179475102895724006E-01 - -0.166666666666667011E-01 -0.140401212232880995E-03 0.173460971398785996E-01 - -0.290712391494978008E-07 -0.144639789846284008E-03 -0.132223121759111006E-05 - 0.178148169771744011E-01 -0.166666666666667011E-01 0.193485878396606990E-01 - -0.123880031419069990E-02 -0.118081271608876993E-02 0.194575794108615002E-01 - -0.139755721135340007E-03 0.000000000000000000E+00 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.173348680988911000E-01 -0.228988835869186992E-07 - -0.141707074022604998E-03 -0.107451126624275990E-05 0.178145682418867010E-01 - -0.139671204369828987E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.172964527346999991E-01 -0.238206247996137016E-07 -0.141637869293981998E-03 - 0.000000000000000000E+00 -0.166666666666667011E-01 0.000000000000000000E+00 - -0.120424066353106008E-05 0.178141993541914984E-01 -0.166666666666667011E-01 - -0.435293328042460988E-03 0.188969200156620006E-01 -0.638325692737096986E-03 - -0.434845262794537977E-03 -0.686401912695584044E-03 0.188415679710590006E-01 - -0.166666666666667011E-01 -0.139450260715297009E-03 0.174035477820680996E-01 - -0.190790185253572011E-07 -0.139988379398403009E-03 -0.772546665100815971E-06 - 0.178142695812380999E-01 -0.535400152422887033E-03 0.187643522383528008E-01 - -0.509589395081856949E-03 -0.166666666666667011E-01 -0.267380606913843026E-03 - -0.178487068761980011E-03 0.177166953062542001E-01 -0.166666666666667011E-01 - -0.142812108775019998E-03 0.173285847433682014E-01 -0.101845505783431005E-06 - -0.158027855199464992E-03 -0.469583501144497018E-05 0.178185786022877005E-01 - -0.166666666666667011E-01 0.191928643172403005E-01 -0.117144676995123995E-02 - -0.166666666666667011E-01 -0.115407859919587998E-02 0.193128636528600017E-01 - 
-0.166666666666667011E-01 0.192743847992538006E-01 -0.120526566976207009E-02 - -0.117896448649838000E-02 0.194473150974792015E-01 -0.166666666666667011E-01 - -0.435531752463573995E-03 0.194769704025496010E-01 -0.120375472484332991E-02 - -0.434606347505284997E-03 -0.125643108232954001E-02 0.193902401795961014E-01 - -0.435044468242076017E-03 -0.166666666666667011E-01 0.193871801282229986E-01 - -0.125535484215398997E-02 0.000000000000000000E+00 -0.435094313909530008E-03 - -0.118080938637835000E-02 0.194628399719979014E-01 -0.166666666666667011E-01 - -0.139670871572238991E-03 0.172964393487799011E-01 -0.239732892684089006E-07 - -0.141640613379916011E-03 -0.121313235466317992E-05 0.178147060195563006E-01 - -0.166666666666667011E-01 -0.435069156529857982E-03 0.193887053622597005E-01 - -0.125641208621197001E-02 -0.435069627595703981E-03 -0.118148788386283009E-02 - 0.194636401109469989E-01 -0.140357102716023998E-03 0.178144327480211015E-01 - -0.934953983487829978E-06 -0.166666666666667011E-01 -0.140357082074367998E-03 - -0.934881658679828005E-06 0.178144277579760993E-01 -0.166666666666667011E-01 - -0.139671248276079991E-03 0.172964387342726990E-01 -0.239772668643883999E-07 - -0.141642790859008988E-03 -0.121332835924946005E-05 0.178147058633731994E-01 - -0.435073048109683026E-03 -0.166666666666667011E-01 0.194629028419433005E-01 - -0.118081642190051008E-02 -0.435065735973592016E-03 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.166666666666667011E-01 -0.125566691830197993E-02 - 0.193878887757382989E-01 0.000000000000000000E+00 -0.435064995098833989E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.193890263762267999E-01 - -0.125684228342590990E-02 -0.435073788965510016E-03 -0.118197542174123009E-02 - 0.194640901826458011E-01 -0.139679109099943003E-03 -0.166666666666667011E-01 - 0.172963725697189014E-01 -0.240507205201775987E-07 -0.141688401087265996E-03 - -0.121695435575037005E-05 0.178146602328799995E-01 -0.166666666666667011E-01 - -0.435064159301769998E-03 0.193889744409617010E-01 -0.125680005533066992E-02 - -0.435074624737013995E-03 -0.118194373351253996E-02 0.194640651337368992E-01 - -0.139708967713254010E-03 -0.166666666666667011E-01 0.172963616356281991E-01 - -0.951324367715890009E-08 -0.141328092009716008E-03 0.000000000000000000E+00 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.166666666666667011E-01 - -0.428676150398848976E-06 0.177495851017135005E-01 0.000000000000000000E+00 - -0.140184006518702988E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.177492476417109996E-01 -0.497520252347835031E-07 -0.140422322794949995E-03 - -0.518268041627564046E-06 0.178140140868086015E-01 -0.435065432183330027E-03 - -0.166666666666667011E-01 0.193879055107330014E-01 -0.125567283667538999E-02 - -0.435073351892596022E-03 -0.118081969348847004E-02 0.194629358214771994E-01 - -0.371043726257224002E-03 -0.166666666666667011E-01 0.180438943525561002E-01 - -0.288534096345938022E-03 -0.486290379332660018E-03 -0.560900084076317983E-03 - 0.187957069582666003E-01 -0.166666666666667011E-01 0.177374439328937990E-01 - -0.214133585214810996E-03 0.000000000000000000E+00 0.000000000000000000E+00 - -0.166666666666667011E-01 -0.267021959071755976E-03 0.179665664489489993E-01 - -0.166666666666667011E-01 0.193465909995582004E-01 -0.123791262449076994E-02 - -0.118077342878954009E-02 0.194573127654812002E-01 -0.166666666666667011E-01 - -0.139258097457669987E-03 0.172965161025564994E-01 -0.139408267359583998E-07 - -0.139258097457669987E-03 -0.716280208726827027E-06 0.178142098987628013E-01 - 
-0.448745061591116986E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.187014625769068993E-01 -0.470417770715891002E-03 -0.420615920070051990E-03 - -0.416030208289801002E-03 0.185800388700857987E-01 -0.143647586674343006E-03 - 0.178142597734856009E-01 -0.784926914613626987E-06 -0.166666666666667011E-01 - -0.289422112454956976E-03 -0.392634159972045978E-05 0.178691591832313007E-01 - -0.166666666666667011E-01 -0.433838061791930977E-03 0.193348473865251995E-01 - -0.122492954566329995E-02 -0.436295790691586000E-03 -0.116751653970171990E-02 - 0.194447289756928994E-01 -0.166666666666667011E-01 0.173080151777424002E-01 - -0.244742517923652000E-04 0.000000000000000000E+00 -0.166666666666667011E-01 - -0.133996459548231008E-03 0.178636488295263011E-01 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.166666666666667011E-01 -0.139757971222783007E-03 - 0.000000000000000000E+00 0.173348621827651983E-01 -0.229201530844114011E-07 - -0.141718205670910005E-03 -0.107549290769848001E-05 0.178145692111379012E-01 - -0.166666666666667011E-01 0.193848567620656005E-01 -0.125477293295166007E-02 - -0.118151635482893996E-02 0.194632155619245996E-01 -0.140442473137899012E-03 - -0.166666666666667011E-01 0.173484955870056995E-01 -0.294133609017818009E-07 - -0.144782161335538988E-03 -0.133037270602234991E-05 0.178148273619183999E-01 - -0.139679362257274988E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.172964143215187001E-01 -0.238876115774515004E-07 -0.141684595140730989E-03 - 0.000000000000000000E+00 -0.166666666666667011E-01 0.000000000000000000E+00 - -0.120742002760442999E-05 0.178141299090062988E-01 -0.166666666666667011E-01 - -0.139670790012750987E-03 0.172964393796226010E-01 -0.239724130311146011E-07 - -0.141640142355812013E-03 -0.121308921736943007E-05 0.178147059830380990E-01 - -0.166666666666667011E-01 -0.140356055867897991E-03 0.178144308211048014E-01 - -0.934793117616238968E-06 -0.140356063365803011E-03 -0.934819410434324011E-06 - 0.178144326353903010E-01 -0.435069477705365992E-03 0.194636362328108985E-01 - -0.118148328063351999E-02 -0.166666666666667011E-01 -0.435069306420349006E-03 - -0.125640963064629994E-02 0.193887060480272995E-01 -0.166666666666667011E-01 - -0.139670746102450000E-03 0.172964393573767014E-01 -0.239719438513679986E-07 - -0.141639889164474000E-03 -0.121306620727653000E-05 0.178147059636829010E-01 - -0.166666666666667011E-01 0.193887976067478006E-01 -0.125650162570866992E-02 - -0.166666666666667011E-01 -0.118157207781601002E-02 0.194637272757679988E-01 - -0.166666666666667011E-01 0.193887977046092001E-01 -0.125650166419828007E-02 - -0.118157208862118990E-02 0.194637273843102990E-01 -0.166666666666667011E-01 - -0.435064765067235998E-03 0.193889522097447989E-01 -0.125676448751559002E-02 - -0.435074018990514973E-03 -0.118190001722009996E-02 0.194640239141652995E-01 - -0.140182361260136993E-03 -0.166666666666667011E-01 0.177493225039018002E-01 - -0.498169409981402992E-07 0.000000000000000000E+00 -0.140419962900806993E-03 - -0.518131012618996021E-06 0.178140141298323007E-01 -0.166666666666667011E-01 - -0.435069323598363008E-03 0.193884453764502987E-01 -0.125614284670238011E-02 - -0.435069460527360988E-03 -0.118122217360852990E-02 0.194633691143310007E-01 - -0.166666666666667011E-01 -0.139670884426293997E-03 0.172964736171049016E-01 - -0.239711924341550007E-07 -0.141640335129922002E-03 -0.121295197632909002E-05 - 0.178147058397751003E-01 -0.141656453358155989E-03 0.178147070060390988E-01 - -0.121442246005117002E-05 -0.166666666666667011E-01 -0.139673677795910993E-03 - 
-0.240011176184461992E-07 0.172964736295889016E-01 -0.166666666666667011E-01 - -0.435069164848058006E-03 0.193887179318738001E-01 -0.125642505656990006E-02 - -0.435069619277515991E-03 -0.118150056235400997E-02 0.194636526000948017E-01 - -0.139672366859797993E-03 0.000000000000000000E+00 -0.166666666666667011E-01 - 0.172964370349440004E-01 -0.239153496141802001E-07 -0.141647103173812004E-03 - 0.000000000000000000E+00 -0.166666666666667011E-01 -0.120964050265571001E-05 - 0.178144653667999010E-01 0.000000000000000000E+00 -0.435058020368536008E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.193887196651586005E-01 - -0.125666847383042006E-02 -0.435080763346489007E-03 -0.118190508477706993E-02 - 0.194639922621226008E-01 -0.139641248248434992E-03 -0.166666666666667011E-01 - 0.172963810772421012E-01 -0.236457807710709016E-07 -0.141469789611806004E-03 - -0.119702907967459002E-05 0.178146434627204017E-01 -0.166666666666667011E-01 - -0.435064079024843019E-03 0.193888050589719994E-01 -0.125663030165246007E-02 - -0.435074705011247002E-03 -0.118177958688183999E-02 0.194638936988976996E-01 - -0.139641455576729011E-03 -0.166666666666667011E-01 0.172963976004947007E-01 - -0.234988035352734987E-07 -0.141466765709830994E-03 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.166666666666667011E-01 -0.118846831550546996E-05 - 0.178141535165701990E-01 0.000000000000000000E+00 -0.435046365527533990E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.193879446262342006E-01 - -0.125609596684336995E-02 -0.435092416913644979E-03 -0.118150619229764994E-02 - 0.194635647529617997E-01 -0.139672261692974013E-03 -0.166666666666667011E-01 - 0.172964349185803015E-01 -0.239884652187998984E-07 -0.141648681442598987E-03 - -0.121388810715753001E-05 0.178147066071378986E-01 -0.435069380083255997E-03 - -0.166666666666667011E-01 0.193887114872825991E-01 -0.125641369962899998E-02 - -0.435069404042481985E-03 -0.118148620112529007E-02 0.194636395221940012E-01 - -0.166666666666667011E-01 0.172965077531164017E-01 -0.927568178720202994E-07 - 0.000000000000000000E+00 -0.166666666666667011E-01 -0.102046202213369007E-05 - 0.173051339160117008E-01 0.000000000000000000E+00 -0.166666666666667011E-01 - 0.193887977533772002E-01 -0.125650168557878009E-02 -0.118157208930007003E-02 - 0.194637273906578985E-01 -0.166666666666667011E-01 -0.435066628630517986E-03 - 0.193883490811541000E-01 -0.125609889832018992E-02 -0.435072155470964984E-03 - -0.118121788505638997E-02 0.194633537877374994E-01 -0.139670767304303994E-03 - -0.166666666666667011E-01 0.172964740023511983E-01 -0.239699122889327989E-07 - -0.141639655324462003E-03 -0.121288819251166995E-05 0.178147057860663996E-01 - -0.141639719254381006E-03 0.178147057910134007E-01 -0.121289402770353995E-05 - -0.166666666666667011E-01 -0.139670778383217008E-03 -0.239700310192222999E-07 - 0.172964740024197997E-01 -0.166666666666667011E-01 -0.435069359721298977E-03 - 0.193887107978404985E-01 -0.125641340675414995E-02 -0.435069424404436024E-03 - -0.118148620716793996E-02 0.194636394455750990E-01 -0.166666666666667011E-01 - 0.172965077510262993E-01 -0.927568056586548005E-07 0.000000000000000000E+00 - -0.166666666666667011E-01 -0.102046204410674010E-05 0.173051339160213008E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.166666666666667011E-01 - -0.139674680780886991E-03 0.000000000000000000E+00 0.173041376249668984E-01 - -0.235410521181961013E-07 -0.141584640975313996E-03 -0.117464395864609995E-05 - 0.178146677207322995E-01 -0.166666666666667011E-01 0.193887977504637009E-01 - 
-0.125650168430156001E-02 -0.118157208925927996E-02 0.194637273902763010E-01 - -0.139674680734473000E-03 0.000000000000000000E+00 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.173041376249788992E-01 -0.235410516367698016E-07 - -0.141584640716005010E-03 -0.117464393529904008E-05 0.178146677207122010E-01 - -0.139669171022264988E-03 -0.166666666666667011E-01 0.172963674074106989E-01 - -0.936361616323369989E-08 -0.141145821435217987E-03 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.166666666666667011E-01 0.000000000000000000E+00 - -0.422126237945786021E-06 0.177496556211017009E-01 -0.166666666666667011E-01 - -0.139670831173391990E-03 0.172964378151377017E-01 -0.239729563898503003E-07 - -0.141640396000908991E-03 -0.121311945483194002E-05 0.178147060102482983E-01 - -0.166666666666667011E-01 -0.140348082156047988E-03 0.178144284125189012E-01 - -0.933754616232120984E-06 -0.140348096313297011E-03 -0.933804570413684980E-06 - 0.178144318631820002E-01 -0.435069554992271979E-03 0.194636358566133011E-01 - -0.118148321892500995E-02 -0.166666666666667011E-01 -0.435069229133381978E-03 - -0.125640843271979000E-02 0.193887033474266987E-01 -0.166666666666667011E-01 - -0.139670746039409987E-03 0.172964393604615012E-01 -0.239719429806933989E-07 - -0.141639888769111005E-03 -0.121306615757996010E-05 0.178147059636586010E-01 - -0.166666666666667011E-01 0.193887976105126016E-01 -0.125650162733728990E-02 - -0.166666666666667011E-01 -0.118157207791531990E-02 0.194637272767340004E-01 - -0.166666666666667011E-01 0.193887977049908011E-01 -0.125650166436534001E-02 - -0.118157208862703007E-02 0.194637273843653001E-01 -0.166666666666667011E-01 - -0.435069387915653001E-03 0.193887117581042004E-01 -0.125641381798179000E-02 - -0.435069396210086012E-03 -0.118148620435052005E-02 0.194636395574343014E-01 - -0.139674686777477991E-03 -0.166666666666667011E-01 0.173041492300615996E-01 - -0.235404450870160989E-07 0.000000000000000000E+00 -0.141584559821053004E-03 - -0.117458831712666003E-05 0.178146676653543994E-01 -0.166666666666667011E-01 - -0.139670743890679005E-03 0.172964393844352998E-01 -0.239719183429222001E-07 - -0.141639876120318997E-03 -0.121306489215045994E-05 0.178147059624318010E-01 - -0.166666666666667011E-01 -0.140348087305803990E-03 0.178144318417805016E-01 - -0.933801043435799969E-06 -0.140348087381334013E-03 -0.933801309950123998E-06 - 0.178144318601900983E-01 -0.435069392938043002E-03 0.194636364111164008E-01 - -0.118148311443157998E-02 -0.166666666666667011E-01 -0.435069391187694980E-03 - -0.125641070647420998E-02 0.193887087798863995E-01 -0.166666666666667011E-01 - -0.435069391191658010E-03 0.193887815134855014E-01 -0.125648507888060008E-02 - -0.435069392934080027E-03 -0.118155584789411011E-02 0.194637107834846994E-01 - -0.139700198059465998E-03 -0.166666666666667011E-01 0.172964151566727987E-01 - -0.947849280924833965E-08 -0.141287467767569990E-03 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.166666666666667011E-01 -0.427106383838012019E-06 - 0.177495992952378008E-01 0.000000000000000000E+00 -0.140178145950843993E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.177492248104791014E-01 - -0.496797052902892032E-07 -0.140415043811154997E-03 -0.517779519367424970E-06 - 0.178140139541318986E-01 -0.435069390908532979E-03 -0.166666666666667011E-01 - 0.193887088082677003E-01 -0.125641074170613001E-02 -0.435069393217205004E-03 - -0.118148315311942002E-02 0.194636364485415991E-01 -0.166666666666667011E-01 - -0.139670745499349992E-03 0.172964393844174009E-01 -0.239719356219104995E-07 - 
-0.141639885405697995E-03 -0.121306574176735002E-05 0.178147059632627995E-01 - -0.435069390223737009E-03 -0.166666666666667011E-01 0.193887087097285998E-01 - -0.125641070240488004E-02 -0.435069393902000974E-03 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.166666666666667011E-01 -0.118148314340501995E-02 - 0.194636363510784990E-01 0.000000000000000000E+00 -0.435069390007495975E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.193887087390940994E-01 - -0.125641068860798000E-02 -0.435069394118242007E-03 -0.118148311389442004E-02 - 0.194636364058402012E-01 -0.139670743908197002E-03 -0.166666666666667011E-01 - 0.172964393843969000E-01 -0.239719185336623003E-07 -0.141639876221824010E-03 - -0.121306490160525007E-05 0.178147059624407002E-01 -0.435069391188885001E-03 - -0.166666666666667011E-01 0.193887118630617986E-01 -0.125641385909668990E-02 - -0.435069392936854012E-03 -0.118148619756053004E-02 0.194636395637321005E-01 - -0.166666666666667011E-01 0.172965077542917012E-01 -0.927568247378030971E-07 - 0.000000000000000000E+00 -0.166666666666667011E-01 -0.102046200973725997E-05 - 0.173051339160060005E-01 0.000000000000000000E+00 -0.166666666666667011E-01 - 0.193887977549708004E-01 -0.125650168627737991E-02 -0.118157208932236990E-02 - 0.194637273908664990E-01 -0.166666666666667011E-01 -0.435069391154679994E-03 - 0.193884418577966004E-01 -0.125613776930019998E-02 -0.435069392971057988E-03 - -0.118121619345346995E-02 0.194633634742979005E-01 -0.139670761454477999E-03 - -0.166666666666667011E-01 0.172964743993829009E-01 -0.239698237237728008E-07 - -0.141639617486802999E-03 -0.121288294400935000E-05 0.178147057813331997E-01 - -0.141639617487035994E-03 0.178147057813331997E-01 -0.121288294403079010E-05 - -0.166666666666667011E-01 -0.139670761454519009E-03 -0.239698237241789995E-07 - 0.172964743993829009E-01 -0.166666666666667011E-01 -0.435069391188809974E-03 - 0.193887118630593006E-01 -0.125641385909560006E-02 -0.435069392936928985E-03 - -0.118148619756054001E-02 0.194636395637317987E-01 -0.166666666666667011E-01 - 0.172965077542917012E-01 -0.927568247377395960E-07 0.000000000000000000E+00 - -0.166666666666667011E-01 -0.102046200973731990E-05 0.173051339160060005E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.166666666666667011E-01 - -0.139674680709099010E-03 0.000000000000000000E+00 0.173041376249854009E-01 - -0.235410513736555998E-07 -0.141584640574245006E-03 -0.117464392253488990E-05 - 0.178146677207011994E-01 -0.166666666666667011E-01 0.193887977549708004E-01 - -0.125650168627737991E-02 -0.118157208932236990E-02 0.194637273908664990E-01 - -0.139674680709099010E-03 0.000000000000000000E+00 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.173041376249854009E-01 -0.235410513736546998E-07 - -0.141584640574244004E-03 -0.117464392253485009E-05 0.178146677207011994E-01 - -0.161601720479338990E-03 -0.166666666666667011E-01 0.172965076665808001E-01 - -0.926702349005646988E-07 -0.162351552716187005E-03 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.166666666666667011E-01 0.000000000000000000E+00 - -0.102033131897240007E-05 0.173051453909592000E-01 -0.166666666666667011E-01 - -0.435069503154383993E-03 0.194633707007618000E-01 -0.118122593631263999E-02 - -0.435069280971315012E-03 -0.125614175474171000E-02 0.193884463223708003E-01 - -0.166666666666667011E-01 -0.139672289609427992E-03 0.172964739021653005E-01 - -0.239862388766482005E-07 -0.141648440824267006E-03 -0.121369084068463994E-05 - 0.178147064711290000E-01 -0.141624407193420988E-03 0.178147048759245002E-01 - 
-0.121150194851005999E-05 -0.166666666666667011E-01 -0.139668124261626994E-03 - -0.239416874711979002E-07 0.172964738935757999E-01 -0.166666666666667011E-01 - -0.435074183386577979E-03 0.194493808683343997E-01 -0.116766988693369996E-02 - -0.435064600665905016E-03 -0.124201443215284004E-02 0.193746726050244011E-01 - -0.166666666666667011E-01 0.173081303333391008E-01 -0.244868683301465988E-04 - -0.166666666666667011E-01 -0.133924491176626011E-03 0.178635877950088992E-01 - -0.166666666666667011E-01 0.173348654763634015E-01 -0.223062642005740000E-07 - -0.104752868632071993E-05 0.178145411980307998E-01 -0.166666666666667011E-01 - -0.435066580651087987E-03 0.193886968271455989E-01 -0.125646132097252000E-02 - -0.435072203449548981E-03 -0.118157534474708009E-02 0.194637086880425011E-01 - -0.139516419388416990E-03 -0.166666666666667011E-01 0.173041769139335999E-01 - -0.219137423564159007E-07 0.000000000000000000E+00 -0.140700386591277008E-03 - -0.109554334382343009E-05 0.178145628168012993E-01 -0.166666666666667011E-01 - -0.139670785511788992E-03 0.172964393739205996E-01 -0.239723655938119001E-07 - -0.141640116450570004E-03 -0.121308690248200008E-05 0.178147059824802015E-01 - -0.166666666666667011E-01 -0.140353492284304998E-03 0.178144307839981988E-01 - -0.934469196153570009E-06 -0.140353499090780005E-03 -0.934493112001855014E-06 - 0.178144324348224990E-01 -0.435069470216686990E-03 0.194636389371891996E-01 - -0.118148595888806001E-02 -0.166666666666667011E-01 -0.435069313909032019E-03 - -0.125641246449570009E-02 0.193887089321175994E-01 -0.166666666666667011E-01 - -0.435066224778556999E-03 0.193888655360086003E-01 -0.125664138810585991E-02 - -0.435072559315323980E-03 -0.118175670116711000E-02 0.194638920667745997E-01 - -0.139535529617076010E-03 0.000000000000000000E+00 -0.166666666666667011E-01 - 0.172963867645380999E-01 -0.887106097806382040E-08 -0.140533144432323988E-03 - 0.000000000000000000E+00 -0.166666666666667011E-01 -0.400546896298884985E-06 - 0.177498997117815013E-01 0.000000000000000000E+00 -0.140183011740981989E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.177495724693359017E-01 - -0.500888730287148026E-07 -0.140419805775519006E-03 -0.518188128586851950E-06 - 0.178140142381220008E-01 -0.435066535168476983E-03 -0.166666666666667011E-01 - 0.193886214195925013E-01 -0.125637957080158998E-02 -0.435072248931340003E-03 - -0.118149384028357995E-02 0.194636350622922998E-01 -0.166666666666667011E-01 - -0.139671934380400999E-03 0.172964390135027998E-01 -0.239846931501482010E-07 - -0.141646750301238991E-03 -0.121369351008339006E-05 0.178147064557130988E-01 - -0.435070730673722014E-03 0.000000000000000000E+00 -0.166666666666667011E-01 - 0.194635424129478984E-01 -0.118148467355335004E-02 -0.435068053446324991E-03 - 0.000000000000000000E+00 -0.166666666666667011E-01 -0.125637211589557001E-02 - 0.193885950416877000E-01 0.000000000000000000E+00 -0.435065887914215998E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.193887904637096015E-01 - -0.125657470314374992E-02 -0.435072896172526973E-03 -0.118169761602073009E-02 - 0.194638244685289008E-01 -0.139514756609392007E-03 -0.166666666666667011E-01 - 0.172964082847710011E-01 -0.223179439037086001E-07 -0.140739566181214998E-03 - -0.113165464823919991E-05 0.178145987602886004E-01 -0.142980894581813998E-03 - -0.166666666666667011E-01 0.173314539689373992E-01 -0.109866042397868997E-06 - -0.158534112254571006E-03 -0.501444838217852000E-05 0.178188430651418997E-01 - -0.166666666666667011E-01 0.191833678439739010E-01 -0.116695961504079994E-02 - 
0.000000000000000000E+00 -0.166666666666667011E-01 -0.115255809083636006E-02 - 0.193073016509602985E-01 0.000000000000000000E+00 -0.166666666666667011E-01 - 0.172969314939965015E-01 -0.239425929242464991E-07 -0.121051376580430009E-05 - 0.178147034232617012E-01 -0.166666666666667011E-01 -0.435670020554572023E-03 - 0.190410313364221991E-01 -0.779749213569466953E-03 -0.434467438812183976E-03 - -0.823627788704149036E-03 0.189719905685246000E-01 -0.147270534866566013E-03 - -0.166666666666667011E-01 0.174669457049126987E-01 -0.100890465185484999E-03 - -0.564704633717408051E-03 -0.477373059999760001E-03 0.187551223307545999E-01 - -0.139439401685823998E-03 0.178140971035927985E-01 -0.575958645398553022E-06 - -0.166666666666667011E-01 -0.139258097457669987E-03 -0.129864673787176997E-07 - 0.173691952385951996E-01 -0.166666666666667011E-01 -0.437167482025401009E-03 - 0.195228613586728994E-01 -0.126248570169920009E-02 -0.432956846913321015E-03 - -0.125491321979356992E-02 0.193932787767274011E-01 -0.166666666666667011E-01 - 0.172966662158898984E-01 -0.121413435136702005E-06 0.000000000000000000E+00 - -0.166666666666667011E-01 -0.972291637013934920E-05 0.179398884476381013E-01 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.166666666666667011E-01 - -0.350105792551318000E-03 0.000000000000000000E+00 0.179358412563318005E-01 - -0.564926760193745028E-05 -0.144639853967615001E-03 -0.314581136545178986E-06 - 0.178138319445972984E-01 -0.166666666666667011E-01 0.193703856781149014E-01 - -0.124844632131722007E-02 -0.118120734926823010E-02 0.194604805411918991E-01 - -0.431301129654014016E-03 0.000000000000000000E+00 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.192675040339660014E-01 -0.119991063440438991E-02 - -0.438790369133317979E-03 -0.117513383061983992E-02 0.194438480799019008E-01 - -0.161600618881618991E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.172964964024612013E-01 -0.929759484961031952E-07 -0.162354606074604997E-03 - 0.000000000000000000E+00 -0.166666666666667011E-01 0.000000000000000000E+00 - -0.102674406185603999E-05 0.173051827906571011E-01 -0.166666666666667011E-01 - -0.143972810900854002E-03 0.173505492137603010E-01 -0.481358872158661014E-04 - -0.558385319861011051E-03 -0.438374229680275993E-03 0.187218708148952986E-01 - -0.166666666666667011E-01 -0.139937822338344987E-03 0.174078896544059003E-01 - -0.227983603063990998E-07 -0.141801732328176988E-03 -0.908683871102670049E-06 - 0.178144027016026996E-01 -0.143597578982917004E-03 0.178145584095829000E-01 - -0.106316067196835009E-05 -0.166666666666667011E-01 -0.140417715032831007E-03 - -0.268130638269802001E-07 0.174078940527996016E-01 -0.166666666666667011E-01 - -0.434071259447730998E-03 0.193423072809523998E-01 -0.122826363249218997E-02 - -0.436064294450999015E-03 -0.116769061111692994E-02 0.194456918207231988E-01 - -0.166666666666667011E-01 0.172982030081412011E-01 -0.280949762784711011E-07 - -0.166666666666667011E-01 -0.189517761832509992E-05 0.178271977135750007E-01 - -0.166666666666667011E-01 0.178265008932085994E-01 -0.134657557022567991E-05 - -0.925862840583118960E-06 0.178144080134048990E-01 -0.166666666666667011E-01 - -0.434780916834727992E-03 0.193789511951828995E-01 -0.125220905631005992E-02 - -0.435357601613535017E-03 -0.118147901946294995E-02 0.194624869766942994E-01 - -0.140177592469087011E-03 -0.166666666666667011E-01 0.177492253272222983E-01 - -0.496756672234455021E-07 0.000000000000000000E+00 -0.140414345842645999E-03 - -0.517733155226927036E-06 0.178140139187708997E-01 -0.166666666666667011E-01 - 
-0.435069391154579001E-03 0.193884418550169003E-01 -0.125613776648296005E-02 - -0.435069392971158982E-03 -0.118121619070897998E-02 0.194633634714499008E-01 - -0.166666666666667011E-01 -0.139670760166673013E-03 0.172964743997286001E-01 - -0.239698098600330014E-07 -0.141639610050856989E-03 -0.121288226150826994E-05 - 0.178147057806271984E-01 -0.141639611820286996E-03 0.178147057807866993E-01 - -0.121288242338980006E-05 -0.166666666666667011E-01 -0.139670760473278990E-03 - -0.239698131526809996E-07 0.172964743997317989E-01 -0.166666666666667011E-01 - -0.435069393628180999E-03 0.194636361556436016E-01 -0.118148289887844995E-02 - -0.435069390497556984E-03 -0.125641041453345998E-02 0.193887085193421987E-01 - -0.139670572892313988E-03 -0.166666666666667011E-01 0.172964393935715992E-01 - -0.239416569730802012E-07 0.000000000000000000E+00 -0.141638062327426994E-03 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.166666666666667011E-01 - -0.121132811726249994E-05 0.178146133160351008E-01 0.000000000000000000E+00 - -0.435065005443445983E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.193885567700676983E-01 -0.125634384183789002E-02 -0.435073778621178993E-03 - -0.118148063714991003E-02 0.194636163660859007E-01 -0.139670682267266008E-03 - -0.166666666666667011E-01 0.172964394328304003E-01 -0.239712554208225998E-07 - -0.141639519995227992E-03 -0.121303220633213010E-05 0.178147059371244997E-01 - -0.166666666666667011E-01 -0.435069393052231988E-03 0.194636362649158005E-01 - -0.118148299187565992E-02 -0.435069391073505994E-03 -0.125641054065752994E-02 - 0.193887086398788003E-01 -0.139670722089359006E-03 -0.166666666666667011E-01 - 0.172964393787637984E-01 -0.239432406825611007E-07 -0.141638922827194003E-03 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.166666666666667011E-01 - -0.121140585286092006E-05 0.178146133355806986E-01 0.000000000000000000E+00 - -0.435065002826116019E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.193885565492021017E-01 -0.125634366816502010E-02 -0.435073781238435990E-03 - -0.118148050457107996E-02 0.194636162212637985E-01 -0.139670533104681001E-03 - -0.166666666666667011E-01 0.172964394486108987E-01 -0.239696530874210007E-07 - -0.141638658891901002E-03 -0.121295339218169004E-05 0.178147058622277997E-01 - -0.435090507624198979E-03 -0.166666666666667011E-01 0.194669892783514015E-01 - -0.118530036438423006E-02 -0.435048275086066013E-03 -0.125913754589597991E-02 - 0.193915247086616990E-01 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.172960984744740011E-01 -0.213609391462743998E-07 -0.166666666666667011E-01 - -0.106367312934038010E-05 0.178056546914526007E-01 -0.166666666666667011E-01 - 0.173098191165480010E-01 -0.233466429807012998E-07 -0.116342624068299999E-05 - 0.178149495945450001E-01 -0.166666666666667011E-01 -0.435069781984934981E-03 - 0.194633224227003000E-01 -0.118118559577364002E-02 -0.435069002140319990E-03 - -0.125608528610373000E-02 0.193883926903337007E-01 -0.139659157768774999E-03 - -0.166666666666667011E-01 0.172964811071458988E-01 -0.238453432888061996E-07 - -0.141572594197268990E-03 -0.120675029799499991E-05 0.178147008767080006E-01 - -0.141538198842383991E-03 0.178146978502537016E-01 -0.120361768174549009E-05 - -0.166666666666667011E-01 -0.139653197525385994E-03 -0.237816337684043011E-07 - 0.172964810481539000E-01 -0.166666666666667011E-01 -0.435101709399810996E-03 - 0.194659175728730989E-01 -0.118451613849551002E-02 -0.435037071408026995E-03 - -0.125773914873593992E-02 0.193902105130663999E-01 -0.166666666666667011E-01 - 
0.000000000000000000E+00 0.172962783353253005E-01 -0.218342668577251013E-07 - -0.166666666666667011E-01 -0.109032448304442004E-05 0.178073744716683996E-01 - 0.000000000000000000E+00 -0.166666666666667011E-01 -0.434677413069656997E-03 - 0.193010409680305015E-01 -0.117438144585429007E-02 -0.435460867464844024E-03 - -0.110662888767641991E-02 0.193848162965736988E-01 -0.166666666666667011E-01 - 0.173066829056793998E-01 -0.235026937711214014E-07 -0.117850668811869000E-05 - 0.178149746842341007E-01 -0.434571088711364992E-03 0.000000000000000000E+00 - -0.166666666666667011E-01 0.192763238295290996E-01 -0.115086473839449998E-02 - -0.435566874652637017E-03 -0.108492248963690989E-02 0.193620926343746995E-01 - -0.139700128428728001E-03 -0.166666666666667011E-01 0.172964062536986006E-01 - -0.947853442084885955E-08 -0.141287229661082013E-03 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.166666666666667011E-01 0.000000000000000000E+00 - -0.427116700035002018E-06 0.177495998236214987E-01 -0.166666666666667011E-01 - -0.139670101692640009E-03 0.172964394288557984E-01 -0.239650162888405991E-07 - -0.141636168849436004E-03 -0.121272540531506999E-05 0.178147056226493003E-01 - -0.166666666666667011E-01 -0.435070601601427007E-03 0.194632424803467992E-01 - -0.118112723413059005E-02 -0.435068182519665007E-03 -0.125598135252497989E-02 - 0.193882951902035992E-01 -0.140350019130283996E-03 0.178144321328113989E-01 - -0.934070655061730960E-06 -0.166666666666667011E-01 -0.140500665652292997E-03 - -0.934901878808751052E-06 0.178144580209842994E-01 -0.166666666666667011E-01 - -0.435068213377728019E-03 0.193838387473541998E-01 -0.125145538065708989E-02 - -0.435070570743591027E-03 -0.117665454921121997E-02 0.194586920952139995E-01 - -0.166666666666667011E-01 0.172970581688366015E-01 -0.800807256662095042E-18 - -0.166666666666667011E-01 -0.116362368362492996E-16 0.173226956581742017E-01 - -0.166666666666667011E-01 0.173227088035448999E-01 -0.227142423077076990E-07 - -0.109342148655364995E-05 0.178145868761214014E-01 -0.166666666666667011E-01 - -0.139356858353544992E-03 0.172964916979878001E-01 -0.206044330373597012E-07 - -0.139859905327302997E-03 -0.104722219521747004E-05 0.178145504281299986E-01 - -0.435504189131306018E-03 -0.166666666666667011E-01 0.194761567651129015E-01 - -0.120248342376421996E-02 0.000000000000000000E+00 -0.434633990251299008E-03 - -0.125640745877263009E-02 0.193901277969210986E-01 -0.166666666666667011E-01 - -0.435069416312690977E-03 0.000000000000000000E+00 0.194633539559233987E-01 - -0.118120745095251002E-02 -0.435069367813045000E-03 -0.125612755432325993E-02 - 0.193884319840324990E-01 -0.166666666666667011E-01 -0.139670622926057006E-03 - 0.172964757091009988E-01 -0.239682454273153985E-07 -0.141638804410660994E-03 - -0.121280234601187995E-05 0.178147056852437993E-01 -0.141603983295218001E-03 - 0.178147026131165002E-01 -0.120962066737389997E-05 -0.166666666666667011E-01 - -0.139664589011853001E-03 -0.239035291403939993E-07 0.172964756492609006E-01 - -0.166666666666667011E-01 -0.435069662231408008E-03 0.194635113006111984E-01 - -0.118136680130286010E-02 -0.435069121894098010E-03 -0.125627780048987998E-02 - 0.193885794815036004E-01 -0.161604530780638008E-03 0.000000000000000000E+00 - -0.166666666666667011E-01 0.172965258603989015E-01 -0.937092380803772050E-07 - -0.162357917896980013E-03 0.000000000000000000E+00 -0.166666666666667011E-01 - -0.103420292441458007E-05 0.173052119919553000E-01 -0.139467894615960005E-03 - -0.166666666666667011E-01 0.173042004359115001E-01 -0.214254882847721013E-07 - 
-0.140431934037818008E-03 -0.107186911344032006E-05 0.178145703665167007E-01 - -0.435495844207702974E-03 -0.166666666666667011E-01 0.194039307261216001E-01 - -0.113156930142864010E-02 -0.434642344857009977E-03 -0.118520221101701007E-02 - 0.193210744918359996E-01 -0.166666666666667011E-01 -0.435081000308743025E-03 - 0.194615626446394997E-01 -0.117973440054902992E-02 -0.435057783388624999E-03 - -0.125399790053982996E-02 0.193864078136761001E-01 -0.160852239681984991E-03 - -0.166666666666667011E-01 0.172967516936828013E-01 -0.175963431721074013E-07 - -0.161650172616747013E-03 0.000000000000000000E+00 0.000000000000000000E+00 - -0.166666666666667011E-01 -0.203401553309515001E-06 0.173054257802153014E-01 - -0.139669192621242992E-03 -0.166666666666667011E-01 0.173052370218566010E-01 - -0.234215439038825986E-07 -0.141543308308260998E-03 -0.116637645863834998E-05 - 0.178146593404366997E-01 -0.435080679695976980E-03 -0.166666666666667011E-01 - 0.194640875795392999E-01 -0.118216803173680995E-02 -0.435058104025100973E-03 - -0.125653847166064996E-02 0.193888504672653991E-01 -0.415397501230701984E-03 - -0.166666666666667011E-01 0.188279009057511007E-01 -0.898333912356110006E-03 - -0.453392657545613985E-03 -0.100006119280005999E-02 0.192410594709606017E-01 - -0.166666666666667011E-01 0.173196351930705005E-01 -0.236100996593661996E-07 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.166666666666667011E-01 - 0.000000000000000000E+00 -0.123805471953726010E-05 0.178172764971170003E-01 - -0.166666666666667011E-01 0.172958803103070990E-01 -0.240055324597324986E-07 - -0.121597905415154999E-05 0.178147088629611004E-01 -0.166666666666667011E-01 - -0.139360440123895005E-03 0.172964135806168012E-01 -0.206282988179274991E-07 - -0.139848706255163002E-03 -0.104818854429194995E-05 0.178144673884889013E-01 - -0.482623514256753016E-03 -0.166666666666667011E-01 0.184436867195339992E-01 - -0.182833591562523009E-03 -0.374919021333009015E-03 -0.210075563706943995E-03 - 0.183321751743828996E-01 -0.143208310342222002E-03 0.178148148672475012E-01 - -0.128486278788734005E-05 -0.166666666666667011E-01 -0.199920559280411991E-03 - -0.199737151953955009E-05 0.178295061512010987E-01 -0.166666666666667011E-01 - -0.433603816949246982E-03 0.193497617607279009E-01 -0.124503446125193997E-02 - -0.436528005342376002E-03 -0.119072566655100006E-02 0.194675617176627014E-01 - -0.166666666666667011E-01 0.000000000000000000E+00 0.172951192319563997E-01 - -0.214703229569797989E-07 -0.166666666666667011E-01 -0.107141837494050999E-05 - 0.178058595667948995E-01 0.000000000000000000E+00 -0.166666666666667011E-01 - -0.434603197477276975E-03 0.000000000000000000E+00 0.193058537144648011E-01 - -0.118063268602804994E-02 -0.435534875277424982E-03 -0.111369468853339993E-02 - 0.193919603590869995E-01 -0.166666666666667011E-01 0.173056689021276987E-01 - -0.235558337294687006E-07 -0.118363055424740992E-05 0.178149857852361003E-01 - -0.435171466639532013E-03 -0.166666666666667011E-01 0.194709518243956006E-01 - -0.119094077767639001E-02 -0.434967284384675014E-03 -0.126075351231549003E-02 - 0.193933220864733989E-01 -0.435279548009691016E-03 -0.166666666666667011E-01 - 0.194122108470178995E-01 -0.114336878037841995E-02 -0.434859088315804978E-03 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.166666666666667011E-01 - 0.000000000000000000E+00 -0.119657203671675997E-02 0.193454506408375984E-01 - -0.166666666666667011E-01 -0.139670792135546010E-03 0.172964364510711996E-01 - -0.239726277026338997E-07 -0.141640184749274994E-03 -0.121310638409450998E-05 - 
0.178147060027833010E-01 -0.166666666666667011E-01 -0.140348082594213009E-03 - 0.178144298829873010E-01 -0.933774300842211964E-06 -0.140348090712458993E-03 - -0.933802946513419972E-06 0.178144318617154997E-01 -0.435069485964877014E-03 - 0.194636360495127995E-01 -0.118148312545114003E-02 -0.166666666666667011E-01 - -0.435069298160832996E-03 -0.125640935270473000E-02 0.193887056176578997E-01 - -0.166666666666667011E-01 -0.139670745411772993E-03 0.172964393771041017E-01 - -0.239719351629878006E-07 -0.141639884975546990E-03 -0.121306573574195002E-05 - 0.178147059632727985E-01 -0.166666666666667011E-01 0.193887976520839000E-01 - -0.125650164472002990E-02 -0.166666666666667011E-01 -0.118157208029668996E-02 - 0.194637273004341985E-01 -0.166666666666667011E-01 0.193887977203370011E-01 - -0.125650167109275009E-02 -0.118157208884183003E-02 0.194637273863745991E-01 - -0.166666666666667011E-01 -0.435069390354664992E-03 0.193887118325760006E-01 - -0.125641384488361994E-02 -0.435069393771072991E-03 -0.118148619567291996E-02 - 0.194636395582745009E-01 -0.139674686773609991E-03 -0.166666666666667011E-01 - 0.173041492371790000E-01 -0.235404446357735999E-07 0.000000000000000000E+00 - -0.141584559729122987E-03 -0.117458827916782993E-05 0.178146676653142995E-01 - -0.166666666666667011E-01 -0.139670743898058004E-03 0.000000000000000000E+00 - 0.172964393844322988E-01 -0.239719184224240012E-07 -0.141639876162939988E-03 - -0.121306489605736002E-05 0.178147059624353017E-01 -0.166666666666667011E-01 - -0.140348087318788991E-03 0.178144318414940016E-01 -0.933801041250177032E-06 - -0.140348087395501987E-03 -0.933801311940696952E-06 0.178144318601919996E-01 - -0.435069392951675977E-03 0.194636364109944011E-01 -0.118148311441449989E-02 - -0.166666666666667011E-01 -0.435069391174062005E-03 -0.125641070624533989E-02 - 0.193887087793657986E-01 -0.166666666666667011E-01 -0.435069391183394005E-03 - 0.193887815136928009E-01 -0.125648507926827001E-02 -0.435069392942343978E-03 - -0.118155584839527996E-02 0.194637107839485991E-01 -0.139700198051380994E-03 - 0.000000000000000000E+00 -0.166666666666667011E-01 0.172964151566069001E-01 - -0.947849285071832001E-08 -0.141287467734219009E-03 0.000000000000000000E+00 - -0.166666666666667011E-01 -0.427106386139514992E-06 0.177495992957071996E-01 - -0.140178145272992012E-03 -0.166666666666667011E-01 0.177492248109030991E-01 - -0.496797001355079032E-07 -0.140415042956877005E-03 -0.517779462070214031E-06 - 0.178140139540239988E-01 -0.435069391423600026E-03 -0.166666666666667011E-01 - 0.193887083934986999E-01 -0.125641030816468009E-02 -0.435069392702138011E-03 - -0.118148272213002004E-02 0.194636360081575990E-01 -0.166666666666667011E-01 - -0.139670744267195992E-03 0.172964393844029993E-01 -0.239719223657696016E-07 - -0.141639878293219989E-03 -0.121306508981309993E-05 0.178147059625506990E-01 - -0.435069391344719005E-03 -0.166666666666667011E-01 0.193887083608434013E-01 - -0.125641029540731006E-02 -0.435069392781018977E-03 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.166666666666667011E-01 -0.118148271833985003E-02 - 0.194636359700672001E-01 -0.435069391241579015E-03 -0.166666666666667011E-01 - 0.193887087819448016E-01 -0.125641070755089994E-02 -0.435069392884159022E-03 - -0.118148311473540010E-02 0.194636364115348993E-01 -0.139670743900726009E-03 - -0.166666666666667011E-01 0.172964393843643011E-01 -0.239719184546282004E-07 - -0.141639876179015995E-03 -0.121306489779411998E-05 0.178147059624346009E-01 - -0.435069391188663010E-03 -0.166666666666667011E-01 0.193887118630544017E-01 
- -0.125641385909356002E-02 -0.435069392937075027E-03 -0.118148619756065993E-02 - 0.194636395637313997E-01 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.172965077542917012E-01 -0.927568247375718968E-07 -0.166666666666667011E-01 - -0.102046200973759010E-05 0.173051339160060005E-01 -0.166666666666667011E-01 - 0.193887977549708004E-01 -0.125650168627735996E-02 -0.118157208932236990E-02 - 0.194637273908664990E-01 -0.166666666666667011E-01 -0.435069391170066991E-03 - 0.193884418583467992E-01 -0.125613776955457008E-02 -0.435069392955670991E-03 - -0.118121619348252006E-02 0.194633634743846991E-01 -0.139670761454596990E-03 - -0.166666666666667011E-01 0.172964743993801011E-01 -0.239698237251752988E-07 - -0.141639617487515998E-03 -0.121288294408737000E-05 0.178147057813331997E-01 - -0.141639617489162006E-03 0.178147057813334009E-01 -0.121288294423791994E-05 - -0.166666666666667011E-01 -0.139670761454882000E-03 -0.239698237282632997E-07 - 0.172964743993801011E-01 -0.166666666666667011E-01 -0.435069391188130992E-03 - 0.193887118630363987E-01 -0.125641385908596996E-02 -0.435069392937606990E-03 - -0.118148619756086007E-02 0.194636395637294013E-01 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.172965077542917012E-01 -0.927568247372701942E-07 - -0.166666666666667011E-01 -0.102046200973822008E-05 0.173051339160060005E-01 - 0.000000000000000000E+00 -0.166666666666667011E-01 -0.139674680709100989E-03 - 0.173041376249854009E-01 -0.235410513736760014E-07 -0.141584640574256011E-03 - -0.117464392253588008E-05 0.178146677207011994E-01 -0.166666666666667011E-01 - 0.193887977549706998E-01 -0.125650168627732006E-02 -0.118157208932236990E-02 - 0.194637273908664990E-01 -0.139674680709099010E-03 0.000000000000000000E+00 - -0.166666666666667011E-01 0.173041376249854009E-01 -0.235410513737710014E-07 - -0.141584640574248991E-03 -0.117464392253522998E-05 0.178146677207011994E-01 - -0.161601715880123993E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.172965076669070010E-01 -0.926692831361944028E-07 -0.162351548531821993E-03 - 0.000000000000000000E+00 -0.166666666666667011E-01 0.000000000000000000E+00 - -0.102032125850803006E-05 0.173051453880029987E-01 -0.166666666666667011E-01 - -0.139670774820815007E-03 0.172964393819063991E-01 -0.239722506017439995E-07 - -0.141640054670844003E-03 -0.121308123358901007E-05 0.178147059780696011E-01 - -0.166666666666667011E-01 -0.435069333337653979E-03 0.193887254401867992E-01 - -0.125642901506816990E-02 -0.435069450788072999E-03 -0.118150189097284992E-02 - 0.194636551938420013E-01 -0.140349304427809005E-03 0.178144320166155994E-01 - -0.933957824287751043E-06 -0.166666666666667011E-01 -0.140349299344085009E-03 - -0.933939903071356043E-06 0.178144307788948991E-01 -0.166666666666667011E-01 - -0.435068660852071026E-03 0.000000000000000000E+00 0.193848776728661985E-01 - -0.125250844957367998E-02 -0.435070123271966982E-03 -0.117767775440582993E-02 - 0.194597409817202013E-01 -0.166666666666667011E-01 0.172969944929179005E-01 - -0.779115314308188993E-07 -0.166666666666667011E-01 -0.755324938795983959E-05 - 0.179004996942120007E-01 -0.166666666666667011E-01 0.178976999748934017E-01 - -0.387667548982099008E-05 -0.233628102656076998E-06 0.178135930268625009E-01 - -0.166666666666667011E-01 -0.139686009466056008E-03 0.172964370591520006E-01 - -0.241363103321880989E-07 -0.141728009792282988E-03 -0.122115128991173003E-05 - 0.178147135015222988E-01 -0.435039396203468979E-03 -0.166666666666667011E-01 - 0.193948601328561010E-01 -0.126336416897148991E-02 0.000000000000000000E+00 - 
-0.435099385069850019E-03 -0.118873902080404996E-02 0.194708368876741998E-01 - -0.166666666666667011E-01 -0.435069389255753991E-03 0.000000000000000000E+00 - 0.193884422504904987E-01 -0.125613821100519006E-02 -0.435069394869983992E-03 - -0.118121665337716998E-02 0.194633639337775007E-01 -0.166666666666667011E-01 - -0.139671380124086001E-03 0.172964743448792008E-01 -0.239764688219899014E-07 - -0.141643188367961012E-03 -0.121320975071175002E-05 0.178147060922408014E-01 - -0.141639844172828988E-03 0.178147058017989017E-01 -0.121290395199703009E-05 - -0.166666666666667011E-01 -0.139670800627344013E-03 -0.239702482699698004E-07 - 0.172964743394126014E-01 -0.166666666666667011E-01 -0.435068235431639017E-03 - 0.193889053227464005E-01 -0.125663654535589997E-02 -0.435070548689851008E-03 - -0.118172116532546008E-02 0.194638725033117015E-01 -0.161678337891334005E-03 - -0.166666666666667011E-01 0.172964935301390017E-01 -0.110214153364595997E-06 - -0.162422470311079996E-03 0.000000000000000000E+00 0.000000000000000000E+00 - -0.166666666666667011E-01 -0.120622187414538000E-05 0.173052150067064016E-01 - -0.139675704206640992E-03 -0.166666666666667011E-01 0.173040340098533016E-01 - -0.235576501556051000E-07 -0.141591382929153007E-03 -0.117568258886253002E-05 - 0.178146687262423983E-01 -0.435067411943173014E-03 -0.166666666666667011E-01 - 0.193886625537611004E-01 -0.125640355917502997E-02 -0.435071372170113015E-03 - -0.118150457751595010E-02 0.194636501950278991E-01 -0.166666666666667011E-01 - -0.435069317670600008E-03 0.193887257892246985E-01 -0.125642966435953992E-02 - -0.435069466455119977E-03 -0.118150274118994998E-02 0.194636560434848990E-01 - -0.161601801933608996E-03 -0.166666666666667011E-01 0.172965054748796011E-01 - -0.926502679423437959E-07 -0.162351306454973002E-03 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.166666666666667011E-01 -0.101988138442591992E-05 - 0.173051393629530988E-01 -0.139695639987958987E-03 -0.166666666666667011E-01 - 0.173041438368830985E-01 -0.237586532625412003E-07 -0.141701626146447995E-03 - -0.118517343233528007E-05 0.178146776002246002E-01 -0.435031309395256976E-03 - -0.166666666666667011E-01 0.193945788398414004E-01 -0.126324081468956006E-02 - -0.435107470132009989E-03 -0.118873511389969004E-02 0.194708001084177990E-01 - -0.435268203178918020E-03 -0.166666666666667011E-01 0.194803280841455984E-01 - -0.120221471825937998E-02 -0.434870455402744990E-03 -0.126728100453280001E-02 - 0.194000950994309002E-01 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.172950455023187996E-01 -0.209695226068837989E-07 -0.166666666666667011E-01 - -0.104309478839938997E-05 0.178040398395854990E-01 -0.166666666666667011E-01 - 0.173086281051332003E-01 -0.234055546297331013E-07 -0.116921175366159004E-05 - 0.178149613334783012E-01 -0.166666666666667011E-01 -0.141165531303230992E-03 - 0.172962108365398992E-01 -0.473445694663900975E-05 -0.544660594244081005E-03 - -0.346550124345780994E-04 0.183333673483142001E-01 -0.139754726222275990E-03 - -0.166666666666667011E-01 0.176170084219247000E-01 -0.246149781579885013E-07 - -0.140206540519596996E-03 -0.567788699151559999E-06 0.178140588497450007E-01 - -0.165095492820345002E-03 0.178223326710730996E-01 -0.681196695085373968E-05 - -0.166666666666667011E-01 -0.150450799069420005E-03 -0.294765827683784011E-06 - 0.176166680355617014E-01 -0.166666666666667011E-01 -0.421169440873449990E-03 - 0.189193905894529009E-01 -0.945436157372043042E-03 -0.448291280314557015E-03 - -0.100548420996317994E-02 0.192537236307864998E-01 -0.166666666666667011E-01 
- 0.173199119681975988E-01 -0.235984477489147988E-07 0.000000000000000000E+00 - -0.166666666666667011E-01 0.000000000000000000E+00 0.000000000000000000E+00 - -0.123676178966295993E-05 0.178172740173286014E-01 -0.166666666666667011E-01 - -0.435171428461128974E-03 0.194709389729267983E-01 -0.119092785894799004E-02 - -0.434967322587667975E-03 -0.126074169224678996E-02 0.193933111644412001E-01 - -0.166666666666667011E-01 0.172958818582512990E-01 -0.240054389194565998E-07 - -0.121597096057541996E-05 0.178147088549035007E-01 -0.434490077563752020E-03 - 0.000000000000000000E+00 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.192819871000558009E-01 -0.115810409202821997E-02 -0.435647599072524984E-03 - -0.109302488139546004E-02 0.193702791629623006E-01 -0.435071227206677987E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.194623470628700994E-01 - -0.118137412448114003E-02 -0.435067556908351005E-03 0.000000000000000000E+00 - -0.166666666666667011E-01 0.000000000000000000E+00 -0.125600599177529007E-02 - 0.193876331694533000E-01 -0.166666666666667011E-01 -0.139670743889624998E-03 - 0.172964393844356017E-01 -0.239719183317204011E-07 -0.141639876114236000E-03 - -0.121306489159398005E-05 0.178147059624313014E-01 -0.166666666666667011E-01 - -0.140348087313353994E-03 0.178144318418225998E-01 -0.933801044951077008E-06 - -0.140348087388714990E-03 -0.933801310870910963E-06 0.178144318601911010E-01 - -0.435069392936102985E-03 0.194636364108025997E-01 -0.118148311410958993E-02 - -0.166666666666667011E-01 -0.435069391189634998E-03 -0.125641070617519006E-02 - 0.193887087796365994E-01 -0.166666666666667011E-01 -0.139670745273937991E-03 - 0.172964393844580004E-01 -0.239719332022523000E-07 -0.141639884104326007E-03 - -0.121306562274284990E-05 0.178147059631592990E-01 -0.166666666666667011E-01 - 0.193887976615549004E-01 -0.125650164862212989E-02 -0.166666666666667011E-01 - -0.118157208096353003E-02 0.194637273070949017E-01 -0.166666666666667011E-01 - 0.193887977248977002E-01 -0.125650167309232009E-02 -0.118157208890513009E-02 - 0.194637273869662994E-01 -0.166666666666667011E-01 -0.435069391187089996E-03 - 0.193887118682170012E-01 -0.125641386440402997E-02 -0.435069392938647987E-03 - -0.118148620277655990E-02 0.194636395690588986E-01 -0.139674686768426990E-03 - -0.166666666666667011E-01 0.173041492303226997E-01 -0.235404449782952015E-07 - 0.000000000000000000E+00 -0.141584559767933007E-03 -0.117458831126650007E-05 - 0.178146676653491987E-01 -0.166666666666667011E-01 -0.139670743889684006E-03 - 0.000000000000000000E+00 0.172964393844356017E-01 -0.239719183323613007E-07 - -0.141639876114578011E-03 -0.121306489162555998E-05 0.178147059624313986E-01 - -0.166666666666667011E-01 -0.140348087304587000E-03 0.178144318418189986E-01 - -0.933801043795399019E-06 -0.140348087379959000E-03 -0.933801309750847956E-06 - 0.178144318601899006E-01 -0.435069392936221976E-03 0.194636364108877989E-01 - -0.118148311419265998E-02 -0.166666666666667011E-01 -0.435069391189516982E-03 - -0.125641070625854005E-02 0.193887087797160983E-01 -0.166666666666667011E-01 - -0.435069391192432022E-03 0.193887815134137984E-01 -0.125648507878967997E-02 - -0.435069392933306015E-03 -0.118155584779333005E-02 0.194637107833882002E-01 - -0.139700198049885988E-03 0.000000000000000000E+00 -0.166666666666667011E-01 - 0.172964151566860000E-01 -0.947849280219223922E-08 -0.141287467724896009E-03 - 0.000000000000000000E+00 -0.166666666666667011E-01 -0.427106383695399994E-06 - 0.177495992954452009E-01 -0.140178145690196993E-03 -0.166666666666667011E-01 - 
0.177492248106932010E-01 -0.496797033573775009E-07 -0.140415043482606990E-03 - -0.517779497585701998E-06 0.178140139541216984E-01 -0.435069391089031011E-03 - -0.166666666666667011E-01 0.193887087977068009E-01 -0.125641072675559008E-02 - -0.435069393036708002E-03 -0.118148313570800006E-02 0.194636364323594983E-01 - -0.166666666666667011E-01 -0.139670745392212007E-03 0.172964393844482998E-01 - -0.239719344734911991E-07 -0.141639884787109996E-03 -0.121306568527222010E-05 - 0.178147059632214992E-01 -0.435069390245067006E-03 -0.166666666666667011E-01 - 0.193887086999318010E-01 -0.125641068746266005E-02 -0.435069393880670977E-03 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.166666666666667011E-01 - -0.118148312670310991E-02 0.194636363420862997E-01 -0.435069390179441007E-03 - -0.166666666666667011E-01 0.193887087448267985E-01 -0.125641069098422005E-02 - -0.435069393946296975E-03 -0.118148311374637006E-02 0.194636364063980986E-01 - -0.139670743899172997E-03 -0.166666666666667011E-01 0.172964393844271015E-01 - -0.239719184348657009E-07 -0.141639876169436012E-03 -0.121306489668503007E-05 - 0.178147059624364015E-01 -0.435069391188929019E-03 -0.166666666666667011E-01 - 0.193887118630633008E-01 -0.125641385909733001E-02 -0.435069392936809993E-03 - -0.118148619756053004E-02 0.194636395637322983E-01 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.172965077542917983E-01 -0.927568247378536014E-07 - -0.166666666666667011E-01 -0.102046200973724006E-05 0.173051339160060005E-01 - -0.166666666666667011E-01 0.193887977549708004E-01 -0.125650168627737991E-02 - -0.118157208932236990E-02 0.194637273908664990E-01 -0.166666666666667011E-01 - -0.435069391171351012E-03 0.193884418583585988E-01 -0.125613776953759993E-02 - -0.435069392954388001E-03 -0.118121619344631009E-02 0.194633634743581994E-01 - -0.139670761454455990E-03 -0.166666666666667011E-01 0.172964743993848993E-01 - -0.239698237233977009E-07 -0.141639617486649991E-03 -0.121288294398642993E-05 - 0.178147057813330990E-01 -0.141639617486657987E-03 0.178147057813330990E-01 - -0.121288294398714991E-05 -0.166666666666667011E-01 -0.139670761454456992E-03 - -0.239698237233852998E-07 0.172964743993848993E-01 -0.166666666666667011E-01 - -0.435069391188925984E-03 0.193887118630632002E-01 -0.125641385909728990E-02 - -0.435069392936811999E-03 -0.118148619756053004E-02 0.194636395637322983E-01 - -0.166666666666667011E-01 0.000000000000000000E+00 0.172965077542917983E-01 - -0.927568247378527941E-07 -0.166666666666667011E-01 -0.102046200973724006E-05 - 0.173051339160060005E-01 0.000000000000000000E+00 -0.166666666666667011E-01 - -0.139674680709099010E-03 0.173041376249854009E-01 -0.235410513736267014E-07 - -0.141584640574244004E-03 -0.117464392253481007E-05 0.178146677207011994E-01 - -0.166666666666667011E-01 0.193887977549708004E-01 -0.125650168627737991E-02 - -0.118157208932236990E-02 0.194637273908664990E-01 -0.139674680709099010E-03 - 0.000000000000000000E+00 -0.166666666666667011E-01 0.173041376249854009E-01 - -0.235410513735998015E-07 -0.141584640574244004E-03 -0.117464392253481007E-05 - 0.178146677207011994E-01 -0.161601720324868993E-03 -0.166666666666667011E-01 - 0.172965076669642989E-01 -0.926701979568355970E-07 -0.162351552541144001E-03 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.166666666666667011E-01 - 0.000000000000000000E+00 -0.102033090018047992E-05 0.173051453908108985E-01 - -0.166666666666667011E-01 -0.435069391175761004E-03 0.193884418585090999E-01 - -0.125613776960263991E-02 -0.435069392949976978E-03 -0.118121619344674009E-02 - 
0.194633634743759006E-01 -0.166666666666667011E-01 -0.139670761454455990E-03 - 0.172964743993851006E-01 -0.239698237233875994E-07 -0.141639617486650994E-03 - -0.121288294398543001E-05 0.178147057813330990E-01 -0.141639617476079996E-03 - 0.178147057813322005E-01 -0.121288294301813998E-05 -0.166666666666667011E-01 - -0.139670761452624013E-03 -0.239698237036860001E-07 0.172964743993851006E-01 - -0.166666666666667011E-01 -0.435069391192448990E-03 0.193887815134013014E-01 - -0.125648507877692997E-02 -0.435069392933288993E-03 -0.118155584778073010E-02 - 0.194637107833748012E-01 -0.166666666666667011E-01 0.172964151566857016E-01 - -0.947849086778118929E-08 -0.166666666666667011E-01 -0.427106298729459992E-06 - 0.177495992959284983E-01 -0.166666666666667011E-01 0.177492248112494991E-01 - -0.496796927082655975E-07 -0.517779383578772961E-06 0.178140139540073003E-01 - -0.166666666666667011E-01 -0.435069391188929019E-03 0.193887118675337006E-01 - -0.125641386366846992E-02 -0.435069392936809993E-03 -0.118148620203093000E-02 - 0.194636395683033987E-01 -0.139674686766604988E-03 -0.166666666666667011E-01 - 0.173041492306688985E-01 -0.235404449394736996E-07 0.000000000000000000E+00 - -0.141584559754340987E-03 -0.117458830859994993E-05 0.178146676653464994E-01 - -0.166666666666667011E-01 -0.139670743889622992E-03 0.000000000000000000E+00 - 0.172964393844356017E-01 -0.239719183316415013E-07 -0.141639876114222989E-03 - -0.121306489159275990E-05 0.178147059624313014E-01 -0.166666666666667011E-01 - -0.140348087304557998E-03 0.178144318418215000E-01 -0.933801043825826983E-06 - -0.140348087379918993E-03 -0.933801309744065975E-06 0.178144318601899006E-01 - -0.435069392936097998E-03 0.194636364108022007E-01 -0.118148311410921003E-02 - -0.166666666666667011E-01 -0.435069391189641015E-03 -0.125641070617487000E-02 - 0.193887087796364016E-01 -0.166666666666667011E-01 -0.435069391192461025E-03 - 0.193887815133983003E-01 -0.125648507877364007E-02 -0.435069392933277012E-03 - -0.118155584777734001E-02 0.194637107833714011E-01 -0.139700198047769001E-03 - -0.166666666666667011E-01 0.172964151566880990E-01 -0.947849280983375026E-08 - -0.141287467715877995E-03 0.000000000000000000E+00 0.000000000000000000E+00 - -0.166666666666667011E-01 -0.427106384123253011E-06 0.177495992955506998E-01 - -0.140178145551705995E-03 -0.166666666666667011E-01 0.177492248107965003E-01 - -0.496797023203806995E-07 -0.140415043308048001E-03 -0.517779485956588989E-06 - 0.178140139541092986E-01 -0.435069391188775008E-03 -0.166666666666667011E-01 - 0.193887087796628006E-01 -0.125641070621944004E-02 -0.435069392936964004E-03 - -0.118148311416520994E-02 0.194636364108560014E-01 -0.166666666666667011E-01 - -0.139670745283318996E-03 0.172964393844581010E-01 -0.239719333030583992E-07 - -0.141639884158474994E-03 -0.121306562769789989E-05 0.178147059631642984E-01 - -0.435069390314679995E-03 0.000000000000000000E+00 -0.166666666666667011E-01 - 0.193887086857719992E-01 -0.125641066837268002E-02 -0.435069393811057987E-03 - 0.000000000000000000E+00 -0.166666666666667011E-01 -0.118148310576349008E-02 - 0.194636363266590985E-01 0.000000000000000000E+00 -0.435069390314679995E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.193887087494363994E-01 - -0.125641069296695000E-02 -0.435069393811057987E-03 -0.118148311374545001E-02 - 0.194636364069358005E-01 -0.139670743897207989E-03 -0.166666666666667011E-01 - 0.172964393844356988E-01 -0.239719184131230996E-07 -0.141639876158003994E-03 - -0.121306489559900995E-05 0.178147059624353017E-01 -0.435069391188779996E-03 - 
-0.166666666666667011E-01 0.000000000000000000E+00 0.193887088940556986E-01 - -0.125641082318989007E-02 -0.435069392936957987E-03 -0.118148322855787997E-02 - 0.194636365278264001E-01 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.172964393409659987E-01 -0.239438466648548014E-07 -0.166666666666667011E-01 - -0.121143526110419994E-05 0.178146131934183993E-01 -0.166666666666667011E-01 - 0.172964393844281007E-01 -0.239719133478075007E-07 -0.121306464654854992E-05 - 0.178147059621872987E-01 -0.166666666666667011E-01 -0.435069391175080993E-03 - 0.193884418584855008E-01 -0.125613776959208998E-02 -0.435069392950656990E-03 - -0.118121619344611992E-02 0.194633634743727989E-01 -0.139670761454532995E-03 - -0.166666666666667011E-01 0.172964743993852012E-01 -0.239698237241572016E-07 - -0.141639617487093999E-03 -0.121288294402584004E-05 0.178147057813331997E-01 - -0.141639617487093999E-03 0.178147057813331997E-01 -0.121288294402580997E-05 - -0.166666666666667011E-01 -0.139670761454532995E-03 -0.239698237241842008E-07 - 0.172964743993852012E-01 -0.166666666666667011E-01 -0.435069391188779996E-03 - 0.193887088940556986E-01 -0.125641082318989007E-02 -0.435069392936957987E-03 - -0.118148322855787997E-02 0.194636365278264001E-01 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.172964393409659987E-01 -0.239438466648548014E-07 - -0.166666666666667011E-01 -0.121143526110419994E-05 0.178146131934183993E-01 - 0.000000000000000000E+00 -0.166666666666667011E-01 -0.435064994898318983E-03 - 0.000000000000000000E+00 0.193885567754814997E-01 -0.125634406113793993E-02 - -0.435073789166010982E-03 -0.118148100321111005E-02 0.194636166968597014E-01 - -0.166666666666667011E-01 0.172964393844281007E-01 -0.239719133478075007E-07 - -0.121306464654854992E-05 0.178147059621872987E-01 -0.435064994898318983E-03 - 0.000000000000000000E+00 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.193885567754814997E-01 -0.125634406113793993E-02 -0.435073789166010982E-03 - -0.118148100321111005E-02 0.194636166968597014E-01 -0.161601720097545002E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.172965076670156016E-01 - -0.926701503617335961E-07 -0.162351552330414998E-03 0.000000000000000000E+00 - -0.166666666666667011E-01 0.000000000000000000E+00 -0.102033039390967000E-05 - 0.173051453906522996E-01 -0.166666666666667011E-01 -0.435069886305821003E-03 - 0.194633772872509006E-01 -0.118124110147639996E-02 -0.435068897819141016E-03 - -0.125613724075302000E-02 0.193884430666162004E-01 -0.166666666666667011E-01 - -0.139670761026265005E-03 0.172964744480312016E-01 -0.239698159440932015E-07 - -0.141639614515128010E-03 -0.121288245172214010E-05 0.178147057808154992E-01 - -0.141639611356817998E-03 0.178147057804570984E-01 -0.121288216154248992E-05 - -0.166666666666667011E-01 -0.139670760479100993E-03 -0.239698100456143004E-07 - 0.172964744480210014E-01 -0.166666666666667011E-01 -0.435069393124601020E-03 - 0.194636395974100984E-01 -0.118148625156355995E-02 -0.435069391001137993E-03 - -0.125641387127325998E-02 0.193887118958671993E-01 -0.166666666666667011E-01 - 0.172965077543725011E-01 -0.927568140336766949E-07 -0.166666666666667011E-01 - -0.102046181269095990E-05 0.173051339147857995E-01 -0.166666666666667011E-01 - 0.173041376239574003E-01 -0.235410465544014012E-07 -0.117464369127346007E-05 - 0.178146677204709010E-01 -0.166666666666667011E-01 -0.435069392055963979E-03 - 0.193887089236092006E-01 -0.125641083591904006E-02 -0.435069392069774004E-03 - -0.118148322857497003E-02 0.194636365312624987E-01 -0.435064994897326992E-03 - 
-0.166666666666667011E-01 0.193885567754197990E-01 -0.125634406109512999E-02 - 0.000000000000000000E+00 -0.435073789167003027E-03 -0.118148100318356004E-02 - 0.194636166968270991E-01 -0.166666666666667011E-01 -0.139670744005780998E-03 - 0.000000000000000000E+00 0.172964393844374995E-01 -0.239719195794844008E-07 - -0.141639876784672010E-03 -0.121306495294346998E-05 0.178147059624923984E-01 - -0.166666666666667011E-01 -0.435069390971013978E-03 0.193887087720762998E-01 - -0.125641070286031001E-02 -0.435069393154724005E-03 -0.118148311400433992E-02 - 0.194636364098218009E-01 -0.140348087384463996E-03 0.178144318601930994E-01 - -0.933801313166044989E-06 -0.166666666666667011E-01 -0.140348087290235009E-03 - -0.933800980670895958E-06 0.178144318372258983E-01 -0.166666666666667011E-01 - -0.139670745283319999E-03 0.172964393844581010E-01 -0.239719333030395990E-07 - -0.141639884158481011E-03 -0.121306562769829991E-05 0.178147059631642984E-01 - -0.435069390314711004E-03 -0.166666666666667011E-01 0.193887086857633985E-01 - -0.125641066836418010E-02 0.000000000000000000E+00 -0.435069393811026979E-03 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.166666666666667011E-01 - -0.118148310575508990E-02 0.194636363266489990E-01 -0.435069390314652023E-03 - -0.166666666666667011E-01 0.193887087494357992E-01 -0.125641069296695998E-02 - -0.435069393811086014E-03 -0.118148311374589995E-02 0.194636364069360017E-01 - -0.139670743896895007E-03 -0.166666666666667011E-01 0.172964393844356017E-01 - -0.239719184097173985E-07 -0.141639876156198012E-03 -0.121306489543125995E-05 - 0.178147059624349999E-01 -0.166666666666667011E-01 -0.435069391192448990E-03 - 0.193887815133984009E-01 -0.125648507877395990E-02 -0.435069392933288993E-03 - -0.118155584777784005E-02 0.194637107833718001E-01 -0.139700198047441003E-03 - -0.166666666666667011E-01 0.172964151566879984E-01 -0.947849280858640987E-08 - -0.141287467714373990E-03 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.166666666666667011E-01 -0.427106384068703984E-06 - 0.177495992955511994E-01 -0.140178145551758009E-03 -0.166666666666667011E-01 - 0.177492248107972012E-01 -0.496797023215807996E-07 -0.140415043308110993E-03 - -0.517779485960914998E-06 0.178140139541092986E-01 -0.435069391188743024E-03 - -0.166666666666667011E-01 0.193887087796531001E-01 -0.125641070621038001E-02 - -0.435069392936995988E-03 -0.118148311415690994E-02 0.194636364108470017E-01 - -0.139670745278628006E-03 -0.166666666666667011E-01 0.172964393844581010E-01 - -0.239719332526419989E-07 -0.141639884131401002E-03 -0.121306562522040010E-05 - 0.178147059631618004E-01 0.000000000000000000E+00 -0.166666666666667011E-01 - 0.193887976612414012E-01 -0.125650164849316990E-02 0.000000000000000000E+00 - -0.166666666666667011E-01 -0.118157208094097992E-02 0.194637273068695993E-01 - -0.166666666666667011E-01 0.172964393844281007E-01 -0.239719133478075007E-07 - -0.121306464654854992E-05 0.178147059621872987E-01 -0.166666666666667011E-01 - -0.139670744005789997E-03 0.172964393844378984E-01 -0.239719195795233015E-07 - -0.141639876784720013E-03 -0.121306495294561001E-05 0.178147059624923984E-01 - -0.435069390970999992E-03 -0.166666666666667011E-01 0.193887087720760014E-01 - -0.125641070286029006E-02 -0.435069393154737991E-03 -0.118148311400450992E-02 - 0.194636364098219015E-01 -0.140348087384448004E-03 0.178144318601930994E-01 - -0.933801313164241021E-06 -0.166666666666667011E-01 -0.140348087290217987E-03 - -0.933800980664578045E-06 0.178144318372256000E-01 -0.166666666666667011E-01 - 
-0.435069391192457989E-03 0.193887815133986993E-01 -0.125648507877402994E-02 - -0.435069392933279994E-03 -0.118155584777777001E-02 0.194637107833719007E-01 - -0.166666666666667011E-01 0.000000000000000000E+00 0.172964151566877000E-01 - -0.947849244191978996E-08 -0.166666666666667011E-01 -0.427106377284980004E-06 - 0.177495993062723005E-01 -0.166666666666667011E-01 -0.140178145569362010E-03 - 0.177492248215363989E-01 -0.496797136596620033E-07 -0.140415043288559006E-03 - -0.517779487315692952E-06 0.178140139541107002E-01 -0.166666666666667011E-01 - 0.193887977549708004E-01 -0.125650168627737991E-02 -0.118157208932236990E-02 - 0.194637273908664990E-01 -0.435069390319028026E-03 -0.166666666666667011E-01 - 0.193887087496057986E-01 -0.125641069305237993E-02 -0.435069393806710987E-03 - -0.118148311376662990E-02 0.194636364069748005E-01 -0.139670783800487997E-03 - -0.166666666666667011E-01 0.172964393410236991E-01 -0.239438517116492010E-07 - -0.141639277731674996E-03 0.000000000000000000E+00 0.000000000000000000E+00 - -0.166666666666667011E-01 0.000000000000000000E+00 -0.121143550903037006E-05 - 0.178146131936444997E-01 -0.166666666666667011E-01 -0.435630521898627018E-03 - 0.193599440195765006E-01 -0.109077905787125992E-02 -0.434507214005837999E-03 - -0.113826801288032008E-02 0.192764807435569993E-01 -0.166666666666667011E-01 - -0.139536670047989993E-03 0.173126649368985017E-01 -0.217199091649137011E-07 - -0.140761520198303991E-03 -0.106837084216960004E-05 0.178145668887513993E-01 - -0.559739137191329981E-03 0.187342109887733005E-01 -0.453812468425848997E-03 - -0.166666666666667011E-01 -0.144437867213283009E-03 -0.570571543797145032E-04 - 0.173662394826822990E-01 -0.166666666666667011E-01 -0.141554480273248993E-03 - 0.173902726296997004E-01 -0.612977187720352984E-05 -0.543269256103840045E-03 - -0.155576690918531002E-04 0.183343607413550998E-01 -0.166666666666667011E-01 - 0.190435224881807010E-01 -0.109812319463910009E-02 -0.166666666666667011E-01 - -0.111910148431286992E-02 0.192002058717063995E-01 -0.166666666666667011E-01 - 0.191810336689041995E-01 -0.116112314020530008E-02 -0.117547931931102998E-02 - 0.194335300141195988E-01 -0.166666666666667011E-01 -0.435330234208519001E-03 - 0.194710205585382004E-01 -0.119426070267823997E-02 -0.434808332877938987E-03 - -0.125626104852707998E-02 0.193894446024132014E-01 -0.140176366249097992E-03 - -0.166666666666667011E-01 0.177492266081854984E-01 -0.496668683742545015E-07 - 0.000000000000000000E+00 -0.140412798776869000E-03 -0.517630021288783052E-06 - 0.178140137898258992E-01 -0.166666666666667011E-01 -0.435069391186561989E-03 - 0.000000000000000000E+00 0.193884418600598005E-01 -0.125613777099214995E-02 - -0.435069392939175994E-03 -0.118121619465940005E-02 0.194633634756198985E-01 - -0.166666666666667011E-01 -0.139670763690922996E-03 0.172964743992523005E-01 - -0.239698477564660998E-07 -0.141639630395104001E-03 -0.121288412595031991E-05 - 0.178147057825145012E-01 -0.141639614589797013E-03 0.178147057809878995E-01 - -0.121288267825071997E-05 -0.166666666666667011E-01 -0.139670760952332988E-03 - -0.239698183155154984E-07 0.172964743992174985E-01 -0.166666666666667011E-01 - -0.435069386976256979E-03 0.193887089986198996E-01 -0.125641101907938003E-02 - -0.435069397149481980E-03 -0.118148348210879005E-02 0.194636367633517007E-01 - -0.139670105391945004E-03 -0.166666666666667011E-01 0.172964392993067012E-01 - -0.239365206872189997E-07 0.000000000000000000E+00 -0.141635362111391987E-03 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.166666666666667011E-01 - 
-0.121107470134998995E-05 0.178146126578259990E-01 0.000000000000000000E+00 - -0.435064987171317002E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.193885562197462988E-01 -0.125634365104461995E-02 -0.435073796892796990E-03 - -0.118148071382402999E-02 0.194636163662551993E-01 -0.139670481067569999E-03 - -0.166666666666667011E-01 0.172964394190999009E-01 -0.239690924292986984E-07 - -0.141638358730468997E-03 -0.121292585922250005E-05 0.178147058233791983E-01 - -0.166666666666667011E-01 -0.435069392178805006E-03 0.194636362774779012E-01 - -0.118148298166642000E-02 -0.435069391946934006E-03 -0.125641057978546000E-02 - 0.193887086723010006E-01 -0.139670520926417992E-03 -0.166666666666667011E-01 - 0.172964393709954013E-01 -0.239410464318514008E-07 -0.141637761129918003E-03 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - -0.166666666666667011E-01 -0.121129767514461000E-05 0.178146131114593001E-01 - 0.000000000000000000E+00 -0.435064997641161015E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.193885573025995017E-01 -0.125634454891261995E-02 - -0.435073786423245982E-03 -0.118148144256120997E-02 0.194636171497057989E-01 - -0.139670065452413992E-03 -0.166666666666667011E-01 0.172964393188763006E-01 - -0.239646299428580996E-07 -0.141635960674409009E-03 -0.121270661878923007E-05 - 0.178147055903274988E-01 -0.434952598751025003E-03 -0.166666666666667011E-01 - 0.193531525592223995E-01 -0.122237998047711009E-02 -0.435186141503925985E-03 - -0.114985437531923995E-02 0.194307215840422015E-01 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.173112910791177986E-01 -0.253865277378783987E-04 - -0.166666666666667011E-01 -0.138259363976029009E-03 0.178692219185750990E-01 - -0.166666666666667011E-01 0.193634106903348008E-01 -0.124536794862644003E-02 - -0.118108829657000993E-02 0.194595580860945985E-01 -0.166666666666667011E-01 - -0.435070677644360025E-03 0.194631053235192995E-01 -0.118099346864504007E-02 - -0.435068106476127980E-03 -0.125584229082854990E-02 0.193881574850998002E-01 - -0.139730287993055990E-03 -0.166666666666667011E-01 0.172965129995646012E-01 - -0.246134321794111988E-07 -0.141982715456250990E-03 -0.124443001707338994E-05 - 0.178147384368366989E-01 -0.141918001868253997E-03 0.178147321831826008E-01 - -0.123839758836049003E-05 -0.166666666666667011E-01 -0.139719073147306011E-03 - -0.244906574575426016E-07 0.172965128605519006E-01 -0.166666666666667011E-01 - -0.434974886832897021E-03 0.193565238210357989E-01 -0.122539181989999007E-02 - -0.435163868604747015E-03 -0.115249078341532994E-02 0.194335198475173998E-01 - -0.166666666666667011E-01 0.000000000000000000E+00 0.173108110594776994E-01 - -0.252486229488234996E-04 -0.166666666666667011E-01 -0.137635746082908988E-03 - 0.178684051236191986E-01 -0.166666666666667011E-01 -0.140103968298790010E-03 - 0.173357792454707990E-01 -0.263227882029069994E-07 -0.143405882709704010E-03 - -0.122767817853530995E-05 0.178147220760386006E-01 -0.166666666666667011E-01 - 0.193641798191387000E-01 -0.124570775063769998E-02 -0.118110177773261002E-02 - 0.194596602746646002E-01 -0.140117598274002998E-03 -0.166666666666667011E-01 - 0.173359378634117996E-01 -0.264605452288720992E-07 -0.143470065352011001E-03 - -0.123349701123627008E-05 0.178147283869216008E-01 -0.139699930872859987E-03 - -0.166666666666667011E-01 0.172964334628613985E-01 -0.947718670749511931E-08 - -0.141286093437001994E-03 0.000000000000000000E+00 0.000000000000000000E+00 - -0.166666666666667011E-01 0.000000000000000000E+00 -0.427033784487718019E-06 - 
0.177496010453740000E-01 -0.166666666666667011E-01 -0.435116790829975981E-03 - 0.188852131512914009E-01 -0.625688270833019968E-03 -0.435021984600298978E-03 - -0.676914763138863006E-03 0.188324695108036989E-01 -0.166666666666667011E-01 - -0.139445451109797009E-03 0.174062618540683015E-01 -0.190292153995733009E-07 - -0.139963261540874002E-03 -0.765758082281971048E-06 0.178142630176260988E-01 - -0.532898885828498952E-03 0.187657876969587016E-01 -0.512856505158025013E-03 - -0.166666666666667011E-01 -0.274567397040450026E-03 -0.183874311667027007E-03 - 0.177330722022739987E-01 -0.166666666666667011E-01 -0.142934406988999004E-03 - 0.173279989518831008E-01 -0.109940052362783001E-06 -0.158611614292560006E-03 - -0.505928282170762005E-05 0.178189142473604983E-01 -0.166666666666667011E-01 - 0.191837857135102006E-01 -0.116715933292911990E-02 -0.166666666666667011E-01 - -0.115261550950918989E-02 0.193075009282458986E-01 -0.166666666666667011E-01 - 0.192715504188021997E-01 -0.120392392247149001E-02 -0.117898318238484007E-02 - 0.194477863400541011E-01 -0.166666666666667011E-01 -0.435541715949862990E-03 - 0.194766610313947992E-01 -0.120362756621229003E-02 -0.434596353936575995E-03 - -0.125584159877691992E-02 0.193897157775975992E-01 -0.435053772892238027E-03 - -0.166666666666667011E-01 0.193916866829808993E-01 -0.125980517520797000E-02 - -0.435085010459394024E-03 -0.118504095739005010E-02 0.194671524144910991E-01 - -0.166666666666667011E-01 -0.139670776269950009E-03 0.172964393783541989E-01 - -0.239722661868781990E-07 -0.141640063065594013E-03 -0.121308200605163008E-05 - 0.178147059781510013E-01 -0.166666666666667011E-01 -0.435069330991408006E-03 - 0.193887306635153987E-01 -0.125643445841789009E-02 -0.435069453134317996E-03 - -0.118150726982896000E-02 0.194636605867372983E-01 -0.140346027430057995E-03 - 0.178144315994148983E-01 -0.933532497931826955E-06 -0.166666666666667011E-01 - -0.140345973839123002E-03 -0.933513688386790009E-06 0.178144303098709995E-01 - -0.166666666666667011E-01 -0.139688634719731995E-03 0.172964363725470992E-01 - -0.241646447296168007E-07 -0.141743168415957993E-03 -0.122254521803203994E-05 - 0.178147146809073016E-01 -0.435059870297020007E-03 -0.166666666666667011E-01 - 0.193910369733737015E-01 -0.125956508747994997E-02 0.000000000000000000E+00 - -0.435078913540760980E-03 0.000000000000000000E+00 0.000000000000000000E+00 - -0.166666666666667011E-01 -0.118493531740769997E-02 0.194660938610356007E-01 - 0.000000000000000000E+00 -0.435045710174240005E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.193875426795061985E-01 -0.125569402526346007E-02 - -0.435093072169449998E-03 -0.118112078544045998E-02 0.194631751705481998E-01 - -0.139419879842097997E-03 -0.166666666666667011E-01 0.172964832727322999E-01 - -0.213161584778305005E-07 -0.140230214929005993E-03 -0.108242332731610002E-05 - 0.178145871202936984E-01 -0.166666666666667011E-01 -0.435070648727890980E-03 - 0.194635173256722006E-01 -0.118139703309657999E-02 -0.435068135392832026E-03 - -0.125625483745070996E-02 0.193885626932298015E-01 -0.139434374869739004E-03 - -0.166666666666667011E-01 0.172964444175751006E-01 -0.849107878205794962E-08 - -0.140068522154645993E-03 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.166666666666667011E-01 -0.383811097207312005E-06 - 0.177500579653601005E-01 -0.140208513794423006E-03 -0.166666666666667011E-01 - 0.177497183126398006E-01 -0.504600381181250993E-07 -0.140451251998147000E-03 - -0.520324482149772044E-06 0.178140160927331009E-01 -0.435048849909143985E-03 - 
-0.166666666666667011E-01 0.193915158406403991E-01 -0.125973026996141989E-02 - -0.435089932877478006E-03 -0.118503856970659008E-02 0.194671300235494012E-01 - -0.435388199753579007E-03 -0.166666666666667011E-01 0.194580790950915003E-01 - -0.118263497490555007E-02 -0.434750258397953015E-03 -0.124178531210186001E-02 - 0.193754724111637015E-01 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.173081480358036992E-01 -0.244895773689184990E-04 -0.166666666666667011E-01 - -0.133922698934493990E-03 0.178635891343980995E-01 -0.166666666666667011E-01 - 0.193849898137051016E-01 -0.125483132565947994E-02 -0.118151827368068998E-02 - 0.194632328928430985E-01 -0.166666666666667011E-01 -0.142602728611135999E-03 - 0.172960572372906005E-01 -0.117224517748023995E-06 -0.158587369045146999E-03 - -0.567482454000282988E-05 0.178189579824735989E-01 -0.139720580884206998E-03 - -0.166666666666667011E-01 0.177036794314488002E-01 -0.306504126161977009E-07 - -0.139932852514181998E-03 -0.499375406900951968E-06 0.178139944442497998E-01 - -0.546160778925739950E-03 0.184421024561668007E-01 -0.894165394995071063E-04 - -0.166666666666667011E-01 -0.147442848397543992E-03 -0.240246210218879017E-04 - 0.177137736588962001E-01 -0.166666666666667011E-01 -0.400413710403862977E-03 - 0.186079235147511013E-01 -0.750380380436990973E-03 -0.465646679664696004E-03 - -0.925379800643594003E-03 0.191586075383834986E-01 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.177029162142695005E-01 -0.193968097316926012E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 -0.245143979550076002E-03 - 0.179475102930650998E-01 -0.166666666666667011E-01 -0.140401212229714989E-03 - 0.173460971401947009E-01 -0.290712390926593986E-07 -0.144639789824670004E-03 - -0.132223121416671002E-05 0.178148169771682983E-01 -0.166666666666667011E-01 - 0.193485878399233015E-01 -0.123880031430750989E-02 -0.118081271609363995E-02 - 0.194575794108941998E-01 -0.139755721134129007E-03 -0.166666666666667011E-01 - 0.173348680988957005E-01 -0.228988835753830005E-07 -0.141707074016601988E-03 - -0.107451126571115990E-05 0.178145682418863992E-01 -0.139419950330182008E-03 - -0.166666666666667011E-01 0.172964916756991986E-01 -0.212265241962166009E-07 - -0.140190964959546998E-03 0.000000000000000000E+00 0.000000000000000000E+00 - -0.166666666666667011E-01 -0.107696768439272990E-05 0.178142558899423015E-01 - -0.166666666666667011E-01 -0.435630511714110973E-03 0.193599454053489002E-01 - -0.109078026057308000E-02 -0.434507224229087005E-03 -0.113826963448308990E-02 - 0.192764822629944987E-01 -0.166666666666667011E-01 -0.139539348987166009E-03 - 0.173126647078671002E-01 -0.217461924902823005E-07 -0.140774574657894001E-03 - -0.106961747895517997E-05 0.178145678787864983E-01 -0.559739151621855982E-03 - 0.187342113122531996E-01 -0.453812905743477017E-03 -0.166666666666667011E-01 - -0.144437865847829008E-03 -0.570572508916922003E-04 0.173662393449596011E-01 - -0.166666666666667011E-01 -0.141554479709531003E-03 0.173902724631262995E-01 - -0.612976885063492026E-05 -0.543269258690614047E-03 -0.155576910986749004E-04 - 0.183343607371364015E-01 -0.166666666666667011E-01 0.190435223900498989E-01 - -0.109812314311903995E-02 -0.166666666666667011E-01 -0.111910146500585994E-02 - 0.192002058225646012E-01 -0.166666666666667011E-01 0.191810336309519991E-01 - -0.116112312176440999E-02 -0.117547931761240003E-02 0.194335300082579994E-01 - -0.166666666666667011E-01 -0.435325182482829002E-03 0.194712654108410016E-01 - -0.119439354715173991E-02 -0.434813392979988976E-03 -0.125665050313407000E-02 
- 0.193897929840027988E-01 -0.140180749732543006E-03 -0.166666666666667011E-01 - 0.177492505912984003E-01 -0.497281655052260991E-07 0.000000000000000000E+00 - -0.140418216862212009E-03 -0.517996101551560007E-06 0.178140139712071009E-01 - -0.166666666666667011E-01 -0.435069320552657990E-03 0.193884456575911003E-01 - -0.125614320010757996E-02 -0.435069463573064001E-03 -0.118122256466309992E-02 - 0.194633694941631005E-01 -0.166666666666667011E-01 -0.139670893318960006E-03 - 0.172964735659094995E-01 -0.239712911883834992E-07 -0.141640386974580993E-03 - -0.121295694635543998E-05 0.178147058443772002E-01 -0.141657679701159996E-03 - 0.178147071052390006E-01 -0.121453479340950007E-05 -0.166666666666667011E-01 - -0.139673890220687005E-03 -0.240034008201301999E-07 0.172964735798710016E-01 - -0.166666666666667011E-01 -0.435069148260108026E-03 0.193887190916836004E-01 - -0.125642659983513007E-02 -0.435069635865442011E-03 -0.118150231844085991E-02 - 0.194636542898556017E-01 -0.139671819173706988E-03 -0.166666666666667011E-01 - 0.172964367943914989E-01 -0.239065653681595988E-07 0.000000000000000000E+00 - -0.141643861658325012E-03 0.000000000000000000E+00 0.000000000000000000E+00 - -0.166666666666667011E-01 -0.120918365724227991E-05 0.178144555196989990E-01 - 0.000000000000000000E+00 -0.435057571499996014E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.193887668592458984E-01 -0.125672668568808997E-02 - -0.435081212181971026E-03 -0.118196881219207990E-02 0.194640540662835010E-01 - -0.139678894290856009E-03 -0.166666666666667011E-01 0.172963729635307005E-01 - -0.240483845199339011E-07 -0.141687157028328993E-03 -0.121683863414542000E-05 - 0.178146601266745003E-01 -0.166666666666667011E-01 -0.435063686986125991E-03 - 0.193888428213916005E-01 -0.125667757246277997E-02 -0.435075097036248006E-03 - -0.118183173664951000E-02 0.194639441484454993E-01 -0.139679133300664996E-03 - -0.166666666666667011E-01 0.172963909863181990E-01 -0.238879039025341995E-07 - -0.141683556731170989E-03 0.000000000000000000E+00 0.000000000000000000E+00 - -0.166666666666667011E-01 -0.120749743554915009E-05 0.178141338836764994E-01 - 0.000000000000000000E+00 -0.435044658394895018E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.193878886191347008E-01 -0.125607319657829999E-02 - -0.435094123787185003E-03 -0.118150841007670995E-02 0.194635600909605008E-01 - -0.139671709913231998E-03 -0.166666666666667011E-01 0.172964345130826001E-01 - -0.239825539017364991E-07 -0.141645500444457010E-03 -0.121359829368120991E-05 - 0.178147062946855013E-01 -0.434952586391279994E-03 -0.166666666666667011E-01 - 0.193531521576786013E-01 -0.122237980995592004E-02 -0.435186153854376021E-03 - -0.114985437868289999E-02 0.194307215405399997E-01 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.173112910777010014E-01 -0.253865275619669011E-04 - -0.166666666666667011E-01 -0.138259364869725991E-03 0.178692219193366010E-01 - -0.166666666666667011E-01 0.193634106886295017E-01 -0.124536794787297005E-02 - -0.118108829653989990E-02 0.194595580858667010E-01 -0.166666666666667011E-01 - -0.435067679897617021E-03 0.193880971906136984E-01 -0.125581995532966003E-02 - -0.435071104218809021E-03 -0.118092995363368010E-02 0.194630638101403984E-01 - -0.139730294231680001E-03 -0.166666666666667011E-01 0.172965125754464995E-01 - -0.246135293977140997E-07 -0.141982756437381995E-03 -0.124443577379881005E-05 - 0.178147384420490988E-01 -0.141918111126178997E-03 0.178147321937730009E-01 - -0.123840965646412996E-05 -0.166666666666667011E-01 -0.139719091240698002E-03 - 
-0.244908832673502001E-07 0.172965124365098005E-01 -0.166666666666667011E-01 - -0.434974852046012982E-03 0.193565226857273995E-01 -0.122539133752111993E-02 - -0.435163903370489979E-03 -0.115249079254224011E-02 0.194335197239201007E-01 - -0.166666666666667011E-01 0.000000000000000000E+00 0.173108110555817013E-01 - -0.252486224725561010E-04 -0.166666666666667011E-01 -0.137635748516305002E-03 - 0.178684051256846992E-01 -0.166666666666667011E-01 -0.140103968384359013E-03 - 0.173357792453362990E-01 -0.263227891208892990E-07 -0.143405883131451987E-03 - -0.122767822040640990E-05 0.178147220760749986E-01 -0.166666666666667011E-01 - 0.193641798143363991E-01 -0.124570774851647990E-02 -0.118110177764830008E-02 - 0.194596602740228983E-01 -0.140117598304397007E-03 -0.166666666666667011E-01 - 0.173359378633641988E-01 -0.264605455567292014E-07 -0.143470065501732002E-03 - -0.123349702618485005E-05 0.178147283869347015E-01 -0.139708750660181013E-03 - -0.166666666666667011E-01 0.172963799080418992E-01 -0.951229877766877080E-08 - -0.141326952140999990E-03 0.000000000000000000E+00 0.000000000000000000E+00 - -0.166666666666667011E-01 0.000000000000000000E+00 -0.428620398176468988E-06 - 0.177495879055481996E-01 -0.166666666666667011E-01 -0.435139436933837007E-03 - 0.188854634743449989E-01 -0.626119914025915028E-03 -0.434999328204409004E-03 - -0.676888990402598964E-03 0.188324542389332987E-01 -0.166666666666667011E-01 - -0.139445446199876991E-03 0.174062644632392988E-01 -0.190291660309238009E-07 - -0.139963236529218007E-03 -0.765751518464875966E-06 0.178142630110634005E-01 - -0.532897137108844952E-03 0.187657880254245013E-01 -0.512857774623407003E-03 - -0.166666666666667011E-01 -0.274572252963939997E-03 -0.183877623767562004E-03 - 0.177330837430943015E-01 -0.166666666666667011E-01 -0.142934406314033008E-03 - 0.173279990759568001E-01 -0.109939898040709003E-06 -0.158611602798506006E-03 - -0.505927466613655039E-05 0.178189142396055003E-01 -0.166666666666667011E-01 - 0.191837858462441009E-01 -0.116715939594670996E-02 -0.166666666666667011E-01 - -0.115261553028868000E-02 0.193075010029012985E-01 -0.166666666666667011E-01 - 0.192715504745154011E-01 -0.120392394815805009E-02 -0.117898318409425003E-02 - 0.194477863480717016E-01 -0.166666666666667011E-01 -0.435541725864183982E-03 - 0.194766613131175000E-01 -0.120362800929753994E-02 -0.434596343992153973E-03 - -0.125584159683252996E-02 0.193897158068569997E-01 -0.435053772712346986E-03 - -0.166666666666667011E-01 0.193916866769025011E-01 -0.125980517264371998E-02 - -0.435085010639267014E-03 -0.118504095747396995E-02 0.194671524138403003E-01 - -0.166666666666667011E-01 -0.139670776270514011E-03 0.172964393783540983E-01 - -0.239722661929319005E-07 -0.141640063068849004E-03 -0.121308200634946994E-05 - 0.178147059781512997E-01 -0.166666666666667011E-01 -0.435069330990369015E-03 - 0.193887306635063990E-01 -0.125643445843000995E-02 -0.435069453135356987E-03 - -0.118150726985585992E-02 0.194636605867599989E-01 -0.140346027478982998E-03 - 0.178144315994209004E-01 -0.933532505083072018E-06 -0.166666666666667011E-01 - -0.140345973895093990E-03 -0.933513695241501967E-06 0.178144303098549985E-01 - -0.166666666666667011E-01 -0.139688634737237009E-03 0.172964363725436991E-01 - -0.241646449186064015E-07 -0.141743168517025991E-03 -0.122254522732879002E-05 - 0.178147146809153993E-01 -0.435059870304747008E-03 -0.166666666666667011E-01 - 0.193910369723532990E-01 -0.125956508720873007E-02 0.000000000000000000E+00 - -0.435078913533033980E-03 0.000000000000000000E+00 0.000000000000000000E+00 - 
-0.166666666666667011E-01 -0.118493531739844002E-02 0.194660938594058003E-01 - 0.000000000000000000E+00 -0.435045710134425011E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.193875426850105004E-01 -0.125569403179517989E-02 - -0.435093072209260005E-03 -0.118112079243948005E-02 0.194631751773721995E-01 - -0.139419883587548995E-03 -0.166666666666667011E-01 0.172964832718376996E-01 - -0.213161991804850005E-07 -0.140230234433833000E-03 -0.108242532544913993E-05 - 0.178145871219505987E-01 -0.166666666666667011E-01 -0.435070648695012982E-03 - 0.194635173293556986E-01 -0.118139703587940995E-02 -0.435068135425710023E-03 - -0.125625484210198000E-02 0.193885626975772996E-01 -0.139434378884568000E-03 - -0.166666666666667011E-01 0.172964444169705009E-01 -0.849109393347174922E-08 - -0.140068540591421997E-03 0.000000000000000000E+00 0.000000000000000000E+00 - 0.000000000000000000E+00 -0.166666666666667011E-01 -0.383811762342743974E-06 - 0.177500579574621994E-01 -0.140208513821624988E-03 -0.166666666666667011E-01 - 0.177497183043298992E-01 -0.504600292032754029E-07 -0.140451252065581990E-03 - -0.520324484157912038E-06 0.178140160927338989E-01 -0.435048849877068026E-03 - -0.166666666666667011E-01 0.193915158396935003E-01 -0.125973026964695009E-02 - -0.435089932909550007E-03 -0.118503856986224010E-02 0.194671300235720011E-01 - -0.435388199753534012E-03 -0.166666666666667011E-01 0.194580790950902999E-01 - -0.118263497490352998E-02 -0.434750258397998009E-03 -0.124178531210191010E-02 - 0.193754724111636009E-01 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.173081480358036992E-01 -0.244895773689180992E-04 -0.166666666666667011E-01 - -0.133922698934496999E-03 0.178635891343980995E-01 -0.166666666666667011E-01 - 0.193849898137051016E-01 -0.125483132565947994E-02 -0.118151827368068998E-02 - 0.194632328928430985E-01 -0.166666666666667011E-01 -0.142602728611646007E-03 - 0.172960572372683995E-01 -0.117224517813049999E-06 -0.158587369049941992E-03 - -0.567482454330607033E-05 0.178189579824768012E-01 -0.139720580884228004E-03 - -0.166666666666667011E-01 0.177036794314212008E-01 -0.306504126126849030E-07 - -0.139932852514272990E-03 -0.499375406920812032E-06 0.178139944442497998E-01 - -0.546160778925799039E-03 0.184421024561692987E-01 -0.894165395041286943E-04 - -0.166666666666667011E-01 -0.147442848396934996E-03 -0.240246210175965007E-04 - 0.177137736588654990E-01 -0.166666666666667011E-01 -0.400413710403472989E-03 - 0.186079235147468998E-01 -0.750380380434690947E-03 -0.465646679664998009E-03 - -0.925379800643236976E-03 0.191586075383828984E-01 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.177029162142689003E-01 -0.193968097316642006E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 -0.245143979549885995E-03 - 0.179475102930649992E-01 -0.166666666666667011E-01 -0.140401212229714989E-03 - 0.173460971401947009E-01 -0.290712390925843005E-07 -0.144639789824673013E-03 - -0.132223121416698996E-05 0.178148169771682983E-01 -0.166666666666667011E-01 - 0.193485878399233015E-01 -0.123880031430749992E-02 -0.118081271609363995E-02 - 0.194575794108941998E-01 -0.139755721134129007E-03 -0.166666666666667011E-01 - 0.173348680988957005E-01 -0.228988835753840990E-07 -0.141707074016602990E-03 - -0.107451126571121009E-05 0.178145682418863992E-01 -0.139419954077844995E-03 - -0.166666666666667011E-01 0.000000000000000000E+00 0.172964916755905009E-01 - -0.212265636585413010E-07 -0.140190986545845990E-03 0.000000000000000000E+00 - -0.166666666666667011E-01 -0.107696962013746004E-05 0.178142558877735016E-01 - 
-0.166666666666667011E-01 -0.435069846225575999E-03 0.194634066063343984E-01 - -0.118126932115139994E-02 -0.435068937899507017E-03 -0.125616762178684996E-02 - 0.193884733114783013E-01 -0.166666666666667011E-01 -0.139670406363162002E-03 - 0.172964705688706984E-01 -0.239661812368390999E-07 -0.141637605266820997E-03 - -0.121271180256684004E-05 0.178147053610641995E-01 -0.142072630916558992E-03 - 0.178147467538722001E-01 -0.125303449493682997E-05 -0.166666666666667011E-01 - -0.139745783560114997E-03 -0.247866380946762004E-07 0.172964714736013984E-01 - -0.166666666666667011E-01 -0.434919264262457973E-03 0.193344334821136006E-01 - -0.120384249982520997E-02 -0.435219446910729000E-03 -0.113217480938375002E-02 - 0.194124575348414985E-01 -0.166666666666667011E-01 0.173153477755207987E-01 - -0.270609262604330985E-04 -0.166666666666667011E-01 -0.148526207309566006E-03 - 0.178819709504901003E-01 -0.166666666666667011E-01 0.173381030673696000E-01 - -0.222893784193113984E-07 -0.104900286329099006E-05 0.178148086104071984E-01 - -0.166666666666667011E-01 -0.435070197675724009E-03 0.194629909526723016E-01 - -0.118086398896642989E-02 -0.435068586447951984E-03 -0.125574648914054000E-02 - 0.193880551117544993E-01 -0.435351553302722974E-03 -0.166666666666667011E-01 - 0.194221659534600001E-01 -0.114701564680454995E-02 -0.434786972534015011E-03 - -0.120710615390273008E-02 0.193420430171151007E-01 -0.166666666666667011E-01 - -0.139670121825768003E-03 0.172964394653271000E-01 -0.239652296206953015E-07 - -0.141636284668589994E-03 -0.121273580815533010E-05 0.178147056312705013E-01 - -0.166666666666667011E-01 -0.435070564583170979E-03 0.194632895210796984E-01 - -0.118117290840524997E-02 -0.435068219538200975E-03 -0.125602935372259991E-02 - 0.193883427513572015E-01 -0.140356103029070988E-03 0.178144326340144016E-01 - -0.934844351950365017E-06 -0.166666666666667011E-01 -0.140500373954539995E-03 - -0.935645690836052991E-06 0.178144577216279992E-01 -0.166666666666667011E-01 - -0.139419788136554014E-03 0.172964863338696991E-01 -0.213147023989074007E-07 - -0.140223522691785012E-03 -0.108230807367247995E-05 0.178145859435818003E-01 - -0.435214369892329993E-03 -0.166666666666667011E-01 0.194296915933112983E-01 - -0.115819918280740002E-02 0.000000000000000000E+00 -0.434924344707075003E-03 - 0.000000000000000000E+00 0.000000000000000000E+00 -0.166666666666667011E-01 - -0.121631545945458997E-02 0.193618156735865000E-01 0.000000000000000000E+00 - -0.435418791966529978E-03 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.194737697407992016E-01 -0.119862135208015992E-02 -0.434719602197164011E-03 - -0.125646773106410990E-02 0.193899154748590000E-01 -0.139674312472573998E-03 - -0.166666666666667011E-01 0.172964142795966995E-01 -0.239964322444068015E-07 - -0.141660280806240994E-03 -0.121419135139913995E-05 0.178146576430187990E-01 - -0.166666666666667011E-01 -0.435064741241858997E-03 0.193888829829566983E-01 - -0.125669444198150005E-02 -0.435074042815188978E-03 -0.118183195684604006E-02 - 0.194639537546182004E-01 -0.139703753692154993E-03 -0.166666666666667011E-01 - 0.172963763508955995E-01 -0.954521092502657923E-08 -0.141306390167940008E-03 - 0.000000000000000000E+00 0.000000000000000000E+00 0.000000000000000000E+00 - -0.166666666666667011E-01 -0.430379165314042026E-06 0.177499308188320999E-01 - -0.139758462452603012E-03 -0.166666666666667011E-01 0.177495927166552991E-01 - -0.464544378539779993E-07 -0.139910115331338996E-03 -0.481441843847050964E-06 - 0.178139831899020995E-01 -0.435407966296089978E-03 -0.166666666666667011E-01 
- 0.194236847721564986E-01 -0.114950911636884008E-02 -0.434730445651113973E-03 - -0.120708640519090991E-02 0.193421924466846001E-01 -0.416148250027581024E-03 - -0.166666666666667011E-01 0.185549551170714998E-01 -0.503294505756778955E-03 - -0.452623601339539015E-03 -0.518655138222936054E-03 0.187784353169859995E-01 - -0.166666666666667011E-01 0.000000000000000000E+00 0.182536262402052986E-01 - -0.562462921602506017E-03 -0.166666666666667011E-01 -0.776619709920083967E-03 - 0.186715804513362012E-01 -0.166666666666667011E-01 0.173100319756059016E-01 - -0.233382192352905001E-07 -0.116274420326912994E-05 0.178149543965509007E-01 - -0.166666666666667011E-01 -0.143574443571915002E-03 0.173387404346211985E-01 - -0.394234709446026970E-04 -0.556824893407507947E-03 -0.414251929427215991E-03 - 0.187007563880755012E-01 -0.143222946652751988E-03 -0.166666666666667011E-01 - 0.174281634389273007E-01 -0.707336094321511021E-07 -0.153074744284785989E-03 - -0.259053351753128992E-05 0.178159320930474999E-01 -0.143343235945712010E-03 - 0.178139865856733991E-01 -0.995973199358266933E-06 -0.166666666666667011E-01 - -0.140432036984107007E-03 -0.263668001789773996E-07 0.174281413140941997E-01 - -0.166666666666667011E-01 -0.434113176607574020E-03 0.193474145478290012E-01 - -0.123277679909749010E-02 -0.436022648061583991E-03 -0.117159827098749008E-02 - 0.194498157758568983E-01 -0.166666666666667011E-01 0.000000000000000000E+00 - 0.172976663726301001E-01 -0.749716013957179010E-08 -0.166666666666667011E-01 - 0.000000000000000000E+00 -0.328919335683087000E-06 0.177355871461097984E-01 - -0.166666666666667011E-01 -0.140992307333851993E-03 0.177352147833274992E-01 - -0.465109349848994022E-07 -0.141543817822521994E-03 -0.593601105003849030E-06 - 0.178140918400187011E-01 -0.166666666666667011E-01 0.193681748787879991E-01 - -0.124747135029532009E-02 -0.118117041740436004E-02 0.194601894329670989E-01 - -0.402827943802412983E-03 -0.166666666666667011E-01 0.186951349558098986E-01 - -0.851875272769232967E-03 -0.463811391972161985E-03 -0.104547015467971994E-02 - 0.192724136708169007E-01 -0.139676914109847992E-03 -0.166666666666667011E-01 - 0.000000000000000000E+00 0.172965234506337016E-01 -0.258479543060504998E-07 - -0.166751637452680000E-03 0.000000000000000000E+00 -0.166666666666667011E-01 - -0.152816050945488003E-05 0.178205642139422984E-01 -0.166666666666667011E-01 - -0.435069391172627010E-03 0.000000000000000000E+00 0.193884418587671990E-01 - -0.125613776993248002E-02 -0.435069392953110973E-03 -0.118121619381530009E-02 - 0.194633634747354983E-01 -0.166666666666667011E-01 -0.139670761673773989E-03 - 0.172964743993378987E-01 -0.239698260758715003E-07 -0.141639618752670998E-03 - -0.121288305970079009E-05 0.178147057814278011E-01 -0.141639617499855004E-03 - 0.178147057813345007E-01 -0.121288294541192010E-05 -0.166666666666667011E-01 - -0.139670761456658004E-03 -0.239698237501548993E-07 0.172964743993366983E-01 - -0.166666666666667011E-01 -0.435069391184759991E-03 0.193887118629383001E-01 - -0.125641385905375007E-02 -0.435069392940977992E-03 -0.118148619757772006E-02 - 0.194636395637332003E-01 -0.166666666666667011E-01 0.172965077542778997E-01 - -0.927568134856068057E-07 -0.166666666666667011E-01 -0.102046181376419003E-05 - 0.173051339147867016E-01 -0.166666666666667011E-01 0.173041376239571991E-01 - -0.235410465543548011E-07 -0.117464369127414998E-05 0.178146677204709010E-01 - -0.166666666666667011E-01 -0.435069390787829998E-03 0.193887086342375004E-01 - -0.125641056657932004E-02 -0.435069393337907985E-03 -0.118148298373717004E-02 
- 0.194636362741715009E-01 -0.435064997112617009E-03 -0.166666666666667011E-01 - 0.193885568522766984E-01 -0.125634409504329003E-02 0.000000000000000000E+00 - -0.435073786951775026E-03 -0.118148100464545011E-02 0.194636167070038994E-01 - -0.166666666666667011E-01 -0.139670744016726994E-03 0.000000000000000000E+00 - 0.172964393844361013E-01 -0.239719196971529010E-07 -0.141639876847867010E-03 - -0.121306495872900994E-05 0.178147059624979009E-01 -0.166666666666667011E-01 - -0.435069390950542994E-03 0.193887087721348016E-01 -0.125641070333806998E-02 - -0.435069393175194989E-03 -0.118148311476701999E-02 0.194636364105120994E-01 - -0.140348088180879989E-03 0.178144318602812997E-01 -0.933801415168445956E-06 - -0.166666666666667011E-01 -0.140348088084880012E-03 -0.933801076426891949E-06 - 0.178144318368827007E-01 -0.166666666666667011E-01 -0.139670745303743007E-03 - 0.172964393843559987E-01 -0.239719335227778987E-07 -0.141639884277225002E-03 - -0.121306563867630006E-05 0.178147059631548997E-01 -0.435069390608568011E-03 - 0.000000000000000000E+00 -0.166666666666667011E-01 0.193887086845648017E-01 - -0.125641066869512999E-02 -0.435069393517170026E-03 0.000000000000000000E+00 - -0.166666666666667011E-01 -0.118148310483521006E-02 0.194636363135432984E-01 - -0.435069389993829011E-03 -0.166666666666667011E-01 0.193887084930986983E-01 - -0.125641043857266008E-02 -0.435069394131910002E-03 -0.118148287005216007E-02 - 0.194636361542637991E-01 -0.139670502006070010E-03 -0.166666666666667011E-01 - 0.172964394138492988E-01 -0.239693181051815999E-07 -0.141638479654202995E-03 - -0.121293697133386993E-05 0.178147058357341001E-01 -0.166666666666667011E-01 - -0.435069390724191016E-03 0.193887813829540998E-01 -0.125648495565761011E-02 - -0.435069393401547020E-03 -0.118155573441382005E-02 0.194637106641438011E-01 - -0.139699943939322992E-03 0.000000000000000000E+00 -0.166666666666667011E-01 - 0.172964151706326008E-01 -0.947752965014964063E-08 -0.141286304262072990E-03 - 0.000000000000000000E+00 -0.166666666666667011E-01 -0.427064219430228011E-06 - 0.177495997096570012E-01 -0.140178146812241995E-03 -0.166666666666667011E-01 - 0.177492252668765005E-01 -0.496801880864810970E-07 -0.140415043122611998E-03 - -0.517779586424547950E-06 0.178140139541973011E-01 -0.435069391018619994E-03 - -0.166666666666667011E-01 0.193887087741046009E-01 -0.125641070402467006E-02 - -0.435069393107117989E-03 -0.118148311448062994E-02 0.194636364104064999E-01 - -0.139670745278655002E-03 -0.166666666666667011E-01 0.172964393844560992E-01 - -0.239719332530519000E-07 -0.141639884131573011E-03 -0.121306562524490010E-05 - 0.178147059631618004E-01 0.000000000000000000E+00 -0.166666666666667011E-01 - 0.193887976612396006E-01 -0.125650164849242007E-02 0.000000000000000000E+00 - -0.166666666666667011E-01 -0.118157208094086998E-02 0.194637273068684995E-01 - -0.166666666666667011E-01 0.172964393844281007E-01 -0.239719133479159011E-07 - -0.121306464654854992E-05 0.178147059621872987E-01 -0.166666666666667011E-01 - -0.139670744021023010E-03 0.172964393843868004E-01 -0.239719197464163008E-07 - -0.141639876873170012E-03 -0.121306496126475003E-05 0.178147059625002983E-01 - -0.435069390942508026E-03 -0.166666666666667011E-01 0.193887087711050003E-01 - -0.125641070244318989E-02 -0.435069393183230011E-03 -0.118148311400550999E-02 - 0.194636364097085998E-01 -0.140348087386128002E-03 0.178144318601935990E-01 - -0.933801313750166004E-06 -0.166666666666667011E-01 -0.140348087289432998E-03 - -0.933800972552974967E-06 0.178144318366253995E-01 -0.166666666666667011E-01 
[... snip: several hundred deleted lines of raw double-precision numerical data (Fortran E-format values, magnitudes ranging from ~1E-01 down to roundoff level ~1E-25), collapsed by extraction into long fused lines; this span of the hunk removes a data dump only and contains no code ...]
-0.505043394800376962E-18 - -0.261627301583172017E-17 -0.559571140779871969E-18 -0.249122977699329985E-17 - -0.244248420444131008E-17 0.763575231435010022E-18 -0.498453660450472978E-18 - 0.131165285269981993E-17 -0.409226721750142997E-18 -0.166919670939219005E-18 - 0.356664398265104977E-17 0.123031535588687009E-17 0.140753053032022997E-18 - -0.174446435487023001E-17 -0.348347593514250969E-17 0.608010837450976981E-19 - -0.288229430082877002E-18 -0.270444385168497019E-17 -0.511978477056020969E-18 - -0.180010383159299994E-17 -0.519787062038522031E-18 -0.778333943020545996E-18 - 0.223510818956729003E-17 -0.461209439779967016E-18 -0.254326936369024016E-17 - -0.754514450101469011E-18 -0.729630168938883045E-18 0.215419524756206011E-17 - 0.423516473627149977E-18 0.118034041199887003E-17 0.526282745952778004E-17 - 0.189749380216831006E-17 0.362191288245938979E-17 -0.373710936328597001E-17 - 0.325260651745650993E-18 0.315943289325853988E-17 -0.894466792300540965E-18 - -0.152582397536022009E-17 -0.189735380184963011E-18 0.109436656785256008E-17 - -0.137907918816530007E-17 0.548877349820787005E-18 0.989334482393022989E-18 - 0.359374903696317968E-17 0.372694496791892029E-19 0.346055310600744018E-17 - 0.325207712186447985E-18 -0.166962105304642995E-17 -0.189715527850262003E-18 - 0.189418900882599987E-17 -0.165532075461661994E-17 0.311165494107747979E-18 - -0.215505034116195009E-17 0.118796370852416008E-17 0.601181634313739955E-18 - -0.347389387492670004E-17 -0.208708918203459989E-17 0.337288519596662010E-17 - 0.931736241979729988E-18 -0.145689666927739995E-18 0.227220724004029024E-18 - -0.162143943672644003E-17 0.174742897018562006E-17 0.323481882556416991E-17 - -0.635274710440724965E-18 0.375235595633655002E-18 -0.145080924035446995E-17 - 0.143389882520053003E-17 -0.235633978014306013E-18 -0.112496563307211995E-17 - 0.293844332080887002E-18 -0.161068608876326000E-19 -0.307600014795399014E-17 - 0.258090939028385005E-17 -0.476763082273924010E-17 -0.228127148519264015E-17 - -0.309844652105622987E-17 0.174912303608012996E-18 0.531513174402072981E-18 - -0.382435375685316988E-18 0.900819539404948025E-18 0.326277091282355990E-17 - -0.264724265796570986E-18 -0.879365783037773085E-18 -0.260547334575423015E-17 - -0.143360326322790005E-17 -0.504831636563563002E-18 0.179242759550851016E-17 - 0.298155597433513989E-17 0.101982766849417992E-17 -0.297393267780985003E-17 - 0.257498015965306995E-17 0.151788304147971006E-17 -0.326954717640159979E-18 - 0.230392961653170000E-17 0.139591029707509004E-17 -0.742847894742021012E-18 - 0.256650983018053003E-17 -0.153143556863578002E-17 0.409540429997454021E-18 - -0.142725051612349992E-18 -0.443845264361252992E-18 0.113671821521527009E-17 - 0.263935466364439986E-17 0.180841534238793007E-17 -0.220183567660795011E-17 - -0.102829799796672003E-17 0.334027442749732992E-17 -0.454009659728304982E-18 - -0.325260651745650984E-17 -0.522448571488674997E-18 -0.336345389745121001E-17 - -0.457041606071333975E-17 -0.304355831294945982E-17 0.427237952841651978E-17 - -0.186941463343081017E-17 -0.129563254076186006E-17 -0.112984180780269004E-17 - 0.463510525760183965E-18 0.543403608029869993E-17 0.628157090539567034E-17 - 0.107863850412424995E-18 -0.496782957561507984E-18 0.143475058212455998E-17 - -0.157547458689992002E-17 -0.229157071099953982E-17 -0.353871744751775981E-17 - -0.655427268083501956E-18 -0.373057479213008010E-18 -0.288389363397911984E-17 - 0.267562859761936013E-18 0.221342377338269010E-17 0.759040992641635024E-19 - -0.427051187586338014E-18 -0.246546273902076005E-17 
0.311034949497450000E-17 - -0.250275745587688010E-17 0.149384832852448007E-17 0.785170068645730031E-18 - -0.124784693198205006E-18 -0.251252152874503999E-19 0.306800057149170005E-17 - 0.512280225515675992E-18 0.352936752798798985E-18 -0.468641674547420009E-18 - 0.342326765393769015E-17 -0.232885905658585013E-17 0.440296799120129026E-17 - -0.682831107464595018E-18 -0.159760541806344992E-17 -0.278081237108601981E-17 - 0.216042711637734983E-17 0.166110979752061002E-17 0.117815888393792008E-17 - -0.169403964681784996E-18 0.249326037070838015E-17 -0.597174026640863979E-18 - -0.197246651090594008E-18 -0.770422236383328028E-18 -0.405218297824251983E-18 - -0.384955849310639988E-19 -0.747681707350008986E-18 -0.163334696595593989E-19 - 0.145763242918459997E-17 0.291859303545338004E-17 -0.294086889444445013E-17 - 0.108848259833857008E-17 0.139201626084264003E-17 0.337320625108601995E-20 - 0.120185558006418003E-17 0.379317687846262973E-17 -0.300611688448085986E-17 - -0.418480598057927000E-18 -0.261770197033990986E-18 -0.125675852095236000E-16 - 0.569849893866608996E-20 -0.467203818658997981E-18 -0.969974113361066045E-18 - -0.308892401784452019E-18 -0.128391665958031008E-18 -0.149140333571066003E-17 - 0.168559556503606008E-18 0.264274279543341995E-18 -0.103443898683430996E-18 - -0.166316242682362005E-18 0.493529040673638011E-19 0.366553507924297990E-18 - -0.965617559869902025E-19 -0.628914596258986970E-18 0.680130388614003019E-19 - 0.250558797806600002E-18 -0.222187329976643995E-18 0.573591561356962000E-19 - 0.337396632102723016E-19 -0.405891570879359990E-17 -0.890046339107058019E-19 - 0.484291087592646026E-18 0.594828887209332028E-18 -0.110802497412702995E-18 - -0.409245953699384999E-18 0.386593778060743030E-17 -0.846764863036188956E-19 - 0.417633151520367001E-19 0.269234881676822002E-19 -0.653364819565015025E-17 - -0.108858622973204005E-18 -0.264863232139479010E-19 -0.124010917433949999E-19 - 0.374812079160027996E-18 0.271262301358190021E-18 0.288414718540088996E-17 - -0.396919639083364985E-17 0.240811466904398010E-17 -0.235051642863068013E-19 - -0.148738985537855005E-17 -0.355753837846806022E-19 0.407761660808220029E-17 - 0.397766672030618977E-17 0.493311988480904980E-17 0.739565642071410984E-19 - 0.343477958748839018E-14 -0.366744259744019982E-14 0.123993155153046002E-13 - -0.668060306510329026E-14 0.135525271560687994E-19 0.152465930505773989E-18 - 0.242251422914729993E-18 -0.542312844479565985E-18 0.462585868319255004E-17 - -0.919879898349006038E-19 0.210064170919066003E-18 -0.643745039913268017E-19 - 0.125157058890702999E-16 -0.582864546829365060E-19 0.246041564481448014E-18 - 0.101296537813243998E-18 -0.238514220907215010E-17 -0.797809496952801045E-19 - -0.192534559377843010E-18 -0.187360213774079013E-17 -0.389635155736977975E-19 - -0.200746808499269012E-18 -0.245893664587923003E-17 -0.948676900924816019E-19 - 0.000000000000000000E+00 -0.108420217248549998E-18 0.261375838676955993E-18 - 0.725966792801038968E-19 -0.167646349107347008E-18 -0.805740091075652957E-19 - -0.511324047944521005E-17 -0.804386616798363005E-20 0.803374354523752020E-19 - 0.233691757936031022E-18 0.322467056021906979E-19 0.692362580416071978E-19 - 0.782403671634307027E-17 0.714617874796812030E-19 -0.205034912794744004E-18 - -0.370259277068535988E-18 -0.219593291575676978E-18 -0.226587930835426006E-18 - 0.589328467008100031E-17 -0.458175341293122029E-19 -0.555484868553859964E-19 - 0.307568912804366982E-18 0.664220076179670995E-17 -0.838628792230762059E-19 - 0.133440776417054003E-19 0.513182852027898008E-19 
0.130496013436366007E-19 - -0.467879824239593957E-18 0.114281685243550001E-16 -0.521729943861285986E-17 - 0.141818726358787999E-16 -0.217687467444354995E-18 0.338813178901719991E-18 - 0.218534500391609006E-18 -0.564123942871364026E-17 0.155854062294790998E-18 - 0.738612730005749992E-18 0.132137139771670997E-18 0.652215369385811020E-19 - 0.528548559086683026E-18 0.393023287525995020E-18 0.813151629364127964E-19 - 0.287313575708658983E-17 0.687790753170491972E-18 0.106387338175140007E-17 - -0.813151629364127964E-19 -0.669156028330897037E-17 -0.169406589450859993E-20 - -0.383705925106198005E-18 0.159877468794249008E-19 0.637946834691517017E-17 - -0.251992301808153990E-19 0.130597260343342002E-17 -0.397914845927722009E-19 - -0.771823101021564955E-19 -0.678629956492047021E-19 0.244403275117150019E-18 - 0.616306942007001003E-19 0.578090437734863976E-17 0.519520791676689028E-19 - 0.250034502269737982E-18 0.145212009554413988E-18 -0.175005708853890006E-18 - 0.394009615144800003E-18 -0.110346462401244998E-18 0.222948572015913991E-18 - 0.171997642246803001E-18 0.319375928924822986E-20 0.101563716873418997E-18 - 0.901803069674074007E-19 -0.274728422129961016E-17 -0.421283293404232023E-21 - 0.608978550920113979E-19 0.763010629372050983E-18 -0.286952614910220011E-18 - 0.730908685021870015E-19 -0.451919320602110971E-17 0.589814511873686954E-22 - 0.126157748391557989E-18 0.141953434272505006E-18 0.100040548709848005E-17 - 0.102890605326513008E-20 -0.935924522377375949E-19 0.847890824022396966E-19 - -0.879092467376866968E-17 0.476821187834545992E-19 -0.753033951901363993E-19 - 0.168555449681110991E-19 -0.100415019031744998E-16 0.329480295536328014E-20 - -0.430020650141573979E-19 -0.930238733580653041E-19 0.843287605802546953E-19 - -0.745054355771767074E-17 -0.127373076708930990E-18 0.142668167903681001E-19 - -0.212738678080100992E-18 -0.124931763730073991E-18 0.141197446866016998E-17 - -0.623935018560200991E-21 -0.208825078665083006E-19 -0.702882739988749020E-17 - 0.135013553282889001E-18 -0.273662399296958983E-18 0.979870564908593936E-17 - -0.967980574380733076E-17 -0.586955097351761970E-17 0.928385604696770927E-21 - 0.588030583085907051E-19 -0.303121635649031013E-20 -0.785213004720344977E-17 - -0.143126772192240001E-21 0.442508069709277023E-17 0.348261833625540021E-18 - 0.403209644538310013E-17 0.158323610013567989E-18 -0.736025309049684025E-19 - 0.252063439340834008E-18 0.327873715300706015E-21 0.308873893618246024E-19 - -0.782545946699665940E-19 -0.916516118708754965E-20 -0.330581739190081984E-17 - 0.194817577868489006E-18 -0.215146368602591997E-18 0.701978555037001045E-19 - -0.378514539581815021E-17 0.926442286059390996E-20 0.154159996400282999E-18 - 0.327166475876973987E-18 0.149444776103045990E-18 -0.384315608916437980E-19 - 0.541531313897068035E-18 0.866120139888960971E-19 0.110405247154699008E-18 - 0.129192718002985003E-17 -0.571019320457605944E-19 0.236706004088174008E-19 - -0.840441972133477969E-17 -0.208661272600177011E-17 -0.128778654135807999E-16 - -0.241127284462270001E-19 -0.124068820076829006E-18 -0.257368975789749001E-19 - 0.361217365792718986E-18 0.122756911625320001E-18 -0.415100531279882990E-17 - -0.172979513388722001E-17 0.192235450868344009E-17 -0.817419881324901948E-19 - -0.542101086242751976E-19 -0.247333620598255988E-18 0.251695840276614999E-17 - -0.439123055680311038E-17 0.812389299711598993E-17 0.749624158320056041E-19 - -0.872105122493027937E-17 -0.372694496791892029E-19 0.467901000063275970E-17 - 0.399121924746226012E-17 0.660008072500550978E-17 -0.527542707461818990E-19 - 
0.343843876982052995E-14 -0.367179973492087989E-14 0.124130984354223001E-13 - -0.668730478978197023E-14 0.811118750290718005E-17 0.122819777351874007E-18 - 0.342201310690737000E-18 -0.599540507978433977E-18 -0.469610947825544972E-17 - -0.214310597913247990E-19 0.372694496791892029E-19 -0.345589442479755020E-18 - 0.437836624391667992E-18 0.205643717725583008E-18 -0.312684304341145982E-18 - -0.472009937038071041E-21 0.120475133905591992E-18 -0.260656398339187997E-18 - 0.673638442522140059E-19 -0.964679226862538027E-19 -0.287342692466221023E-18 - -0.221571907600904013E-18 -0.434358495352004983E-17 -0.228529489169210000E-17 - 0.277911509994136018E-17 0.156800809279949005E-22 -0.476438579319618995E-17 - 0.793257935632228010E-19 -0.191429446079471997E-18 -0.792505201274804995E-19 - -0.761249535594635047E-19 -0.759284593898908991E-19 0.245210185927285981E-17 - 0.326322751652168995E-21 -0.219505881849384995E-18 -0.351708553712615972E-17 - -0.957527733479134017E-19 0.210332177437534002E-18 0.632850078629209974E-17 - 0.783703999557241003E-17 0.712107216201591027E-17 0.931799314501438023E-19 - -0.162733723449395010E-18 0.273630519451929002E-18 0.405238470426719007E-18 - 0.290400572395675995E-18 0.309034676849811019E-19 -0.982194259345465061E-17 - -0.459295178406396010E-17 0.378501304692013982E-19 0.660156303266319966E-19 - -0.393711501795638995E-18 0.111130722679763993E-17 -0.731836466427715011E-18 - 0.133492392487278003E-17 0.170253622398114007E-18 0.678102813836270969E-17 - 0.116382328043262001E-19 0.652850644096251986E-18 0.135101755087061000E-18 - 0.135525271560687994E-19 0.406575814682063982E-19 -0.688214269644119025E-20 - -0.338813178901719985E-20 0.291379333855478980E-18 -0.673391193067169010E-19 - 0.575558887659297003E-17 -0.529395592033938013E-20 -0.101643953670516005E-18 - 0.225310763969643986E-18 -0.860585474410368990E-17 -0.355753837846806022E-19 - 0.111384832563939989E-18 -0.619022265765282981E-18 0.659097512082252000E-20 - 0.367135843075535984E-19 -0.148258098918869009E-16 -0.403656384107634971E-20 - 0.301976499779261008E-19 0.189408714322993997E-20 0.272869291952943018E-18 - -0.557349643847283955E-19 0.426147018319669965E-17 0.254580724327268992E-21 - -0.869775563299138967E-19 -0.110887275953398993E-18 -0.998341549897226014E-19 - -0.481248056751729986E-20 -0.185349767128022002E-18 -0.145910104401558996E-18 - -0.131668031408930009E-17 -0.607341738615819030E-21 -0.624901402953516003E-17 - -0.116001184478416006E-20 -0.833449305952115017E-20 -0.220151721333313981E-21 - 0.165335936503532999E-18 -0.129960524270914003E-18 -0.396916978071411982E-18 - 0.219508923152721984E-18 -0.827237810600491973E-17 0.341451599951967009E-18 - 0.159674085227868006E-18 0.188315067856254009E-20 -0.847491822467939984E-19 - -0.455167206907106011E-19 -0.209933309261154010E-17 -0.145404646964087995E-21 - 0.168411398277699010E-18 -0.313218061614544984E-18 0.560594904015392966E-19 - 0.153058648530649989E-18 -0.122794367412262001E-17 0.111914706524383009E-20 - 0.171517425742714004E-18 -0.183906875580670001E-18 0.219862189175055999E-18 - 0.897452583439613029E-17 -0.352102288514519020E-19 0.153068958878854008E-18 - 0.146666444626402991E-18 -0.303260900504276985E-19 0.235125754644082987E-18 - 0.538382462411191964E-21 0.130959794039688006E-19 0.881466470414765984E-17 - -0.176754757224459993E-18 -0.338370818476285984E-19 0.458932469699724984E-17 - -0.582300740523848969E-17 0.582615730901109008E-17 -0.949456505418176036E-21 - 0.113959685864603008E-18 0.144076135613940989E-19 0.582626318812949995E-17 - 
-0.116571720284226000E-20 -0.141483631752557998E-16 0.100448206267148993E-17 - -0.113650645697845998E-17 -0.471162076910205015E-20 0.673391193067169010E-19 - 0.104026233834669001E-18 0.535324822664718007E-18 0.223616698075135001E-18 - 0.861263100768173047E-17 0.000000000000000000E+00 0.243945488809238016E-18 - -0.174488787134385990E-18 0.271050543121375988E-19 0.487890977618476995E-18 - -0.125527106409534999E-16 0.100472472860477996E-18 -0.955823741417275020E-19 - 0.107162902717469998E-18 -0.101127663721311999E-18 0.305449075076592991E-19 - -0.416346654315768998E-17 -0.370398036616291987E-19 0.639750339166004974E-19 - -0.124552860677330003E-16 0.295344837712059013E-19 0.438212164389766970E-18 - 0.101705744062273998E-16 -0.262887759400580013E-17 0.254481784579693981E-17 - 0.126166498411475007E-18 -0.267108613912254992E-18 -0.118145276312759989E-18 - 0.181096822855342012E-18 0.187530220069472992E-18 -0.423863827445876037E-17 - -0.640878176666766969E-17 -0.881828403118920017E-17 -0.228818836947481010E-19 - 0.146804706393460995E-18 -0.227785688362402982E-18 0.113841228110978004E-17 - 0.179570984817912009E-18 -0.121972744404618998E-17 0.237846851589007997E-17 - -0.233781093442186990E-18 -0.337119113007212016E-18 -0.477943634444158998E-17 - 0.130072496962737990E-18 0.000000000000000000E+00 -0.406575814682063982E-19 - -0.595040645446145988E-19 0.148442524006315990E-18 0.293073399749988014E-18 - 0.321449003483006979E-18 -0.327462937408513012E-17 -0.140607469244213991E-17 - 0.440457132572235984E-18 -0.105549829437158004E-18 -0.242918130488448022E-18 - -0.188834994088198992E-18 -0.239202104304613992E-17 0.119405175783255012E-18 - 0.120003227366130008E-18 -0.403498702803366996E-18 0.243945488809237997E-17 - -0.745388993583784058E-19 -0.514996031930614991E-17 -0.502876181595488006E-19 - 0.874958564734091035E-19 -0.126038502551440006E-17 0.198205709657505990E-18 - 0.262368455412020012E-18 -0.435120825004534007E-17 -0.847032947254300038E-20 - -0.933853824347866003E-19 -0.218110983917982006E-19 -0.158951026508189991E-18 - -0.123216824045899002E-19 0.105479069047138999E-16 -0.221602720078721986E-19 - -0.112094436116716006E-16 0.552668609373801021E-20 0.600580946092660953E-19 - -0.150321091163205985E-19 -0.566633402054696966E-18 0.471817359051710964E-20 - -0.309740174670613979E-18 -0.713982335130080985E-19 0.717113685500036983E-17 - -0.388401777744873018E-18 -0.159250724383875008E-18 0.102895322840944002E-18 - 0.202123237038557009E-18 0.107599654080898005E-19 -0.115560962453870006E-16 - 0.115264413034391004E-18 -0.323460706732736002E-19 0.217151454407421018E-18 - -0.162863590805566010E-19 -0.447140234933781975E-19 -0.458895903494596031E-17 - -0.264017439963143991E-19 0.132772414482112011E-18 -0.946559318556680990E-19 - -0.602663941971434999E-17 0.533630756770209021E-18 -0.107064964532944006E-17 - -0.152465930505774001E-19 0.198642461020933989E-18 0.271326821445969012E-18 - 0.764647908913610958E-17 -0.285476573004300971E-19 -0.423615735300657014E-18 - 0.405577242246589994E-17 -0.569841415265331035E-18 -0.202440874393777998E-18 - -0.138405183581353003E-17 0.741153828847513037E-19 0.440457132572235996E-19 - -0.145689666927739995E-18 -0.201593841446523018E-17 0.118584612615602002E-19 - 0.100288700954909006E-17 -0.389635155736978012E-18 0.253678923077150013E-17 - 0.463253320355523965E-17 -0.542861329849173998E-18 -0.578182497767161953E-18 - 0.378034402595259001E-17 -0.390167653256308987E-19 -0.809119123877932027E-18 - -0.637257875959320984E-17 -0.796210970419041958E-19 0.108526096366957005E-19 - 
-0.806968288849171933E-17 0.477726582251425006E-18 -0.325260651745650993E-18 - -0.114409418473740993E-21 -0.136584062744755990E-19 0.204770214998727011E-18 - -0.769571784227895035E-17 0.308108234563751989E-19 0.136366586621958005E-17 - 0.456872337957895004E-19 0.560083992759655039E-19 0.178381499097059997E-19 - 0.290512448573523984E-18 0.169088952095639994E-18 0.173985861321953990E-18 - -0.136689941863163006E-18 -0.742890246389383958E-17 0.225469582647254023E-18 - 0.414781446358590005E-18 0.579688173277162020E-20 0.298155597433514008E-18 - 0.166018457661843011E-18 -0.339766090967380983E-18 0.405517023497995986E-19 - 0.152465930505773989E-18 0.265968345437850018E-18 -0.671697127172659976E-18 - -0.315784470648243990E-18 -0.822315467062235941E-17 -0.257815653320528003E-18 - 0.697955148537542998E-18 -0.406575814682063982E-19 -0.247333620598255988E-18 - 0.582758667710959018E-18 -0.727897763222982996E-17 0.448338738054221020E-20 - -0.542101086242751976E-19 0.189735380184963011E-18 -0.758941520739853008E-18 - 0.460785923306338999E-18 0.111490711682346993E-18 -0.542101086242751986E-17 - 0.787740640946499026E-18 0.358824332280602981E-18 0.387587718284786979E-17 - 0.202017357920150992E-18 -0.887267012248879966E-19 -0.146410968421885989E-19 - -0.105032085459533001E-18 0.372694496791892029E-19 -0.189735380184963011E-18 - -0.355922582691767008E-18 -0.767073037033493925E-17 -0.682708555486965996E-17 - 0.203287907341032009E-18 0.345589442479755020E-18 0.662040951573960991E-17 - -0.326107684692905992E-19 0.393023287525995020E-18 0.105709711817336990E-17 - 0.792822838630025021E-18 -0.203287907341031991E-19 -0.542101086242751976E-19 - 0.542101086242752024E-18 0.474338450462407997E-18 -0.137219337455196992E-18 - -0.847032947254299978E-19 0.326173859141910017E-18 0.731219906678634021E-19 - -0.946472464592363042E-19 0.139237548491848992E-18 0.612680171563180001E-19 - -0.188808491239547999E-17 0.145513853081274004E-20 0.351754011649361006E-19 - 0.103319340142544008E-16 0.706026828653623967E-19 -0.227702440388566992E-18 - -0.768408726927613926E-17 -0.144477278643678006E-18 0.338346241908282006E-18 - 0.257484777844331994E-18 0.254997244763346007E-17 0.210498854330963010E-19 - -0.183381484398103994E-18 -0.755890011070466999E-19 0.802413433023315002E-17 - 0.699252607177727046E-19 0.213594500225259994E-18 -0.235617227606901007E-21 - 0.130534219874883997E-16 -0.939645122611503009E-19 0.181688567186046989E-18 - 0.550571415715294980E-19 0.367633474932048013E-17 0.264951905901144992E-17 - -0.711507675693612044E-19 -0.796210970419041958E-19 -0.555997720533642962E-19 - 0.694178241860625994E-19 -0.971685756843591012E-18 0.352246592049581004E-19 - -0.177016651086348013E-19 -0.809573907778713970E-17 -0.165171424714588999E-18 - 0.330342849429177012E-19 -0.540915240116595965E-17 -0.352026892878886990E-17 - -0.329326409892472010E-17 0.622238343986890020E-19 -0.660685698858354024E-19 - -0.982558218814988032E-19 0.528548559086683026E-17 -0.313402190484091005E-19 - -0.866032624180153034E-17 0.952700100629121048E-19 0.000000000000000000E+00 - 0.121972744404619008E-18 0.146367293285543002E-17 -0.155774652955986003E-18 - -0.406575814682063982E-19 0.127393755267047002E-17 0.142301535138722000E-18 - 0.508219768352579978E-20 -0.738612730005749992E-18 -0.115196480826584991E-18 - -0.121972744404619008E-18 0.894678550537355034E-20 -0.120066920273297001E-18 - 0.246976278573632984E-18 -0.799387343971245945E-19 -0.190053017540184000E-18 - 0.346884975340074986E-19 -0.175978087053195992E-19 -0.495753329340794005E-18 - -0.285070427323536989E-18 
-0.441968288153793989E-19 0.812601554256781032E-21 - -0.748406217586133030E-17 -0.742510405052100003E-19 0.347283508374263019E-19 - 0.168771314740419006E-18 -0.142883870289960005E-18 0.396517298433418985E-19 - 0.398084309385840033E-17 -0.227640104574593005E-19 -0.514307817660970006E-19 - 0.349851076995627978E-18 0.116316152503736994E-16 -0.578894079889111008E-19 - 0.347336447933467014E-18 0.295353109518184001E-19 -0.312025761944802984E-18 - 0.281532575843647979E-18 0.110855436971906990E-18 0.133195930955738993E-18 - -0.630616029230826988E-17 -0.385230584411256025E-17 0.125369147999762005E-18 - -0.152512769607959997E-18 0.905901737088474059E-17 -0.838562617781756975E-19 - -0.264274279543341995E-18 -0.777020380207811956E-19 0.542101086242751976E-19 - -0.542101086242751976E-19 -0.542101086242751976E-19 0.406575814682063982E-19 - 0.840256683676266033E-18 0.551587855251999996E-17 0.158225754547102995E-17 - 0.362371282747230003E-19 -0.216840434497100983E-18 -0.745388993583784058E-19 - 0.400138364282931018E-17 0.162247341249214004E-19 0.161979169294623999E-17 - -0.595570041038180046E-19 0.894466792300540965E-18 0.284603070277445010E-18 - 0.502798757490152989E-17 -0.613933450636857020E-19 -0.294972606436410006E-18 - 0.599699326656045025E-17 0.148230765769502993E-18 0.350671640163280003E-18 - 0.628498446862691006E-17 0.169406589450860008E-19 0.381164826264434972E-19 - -0.220228566286117998E-19 -0.199529198637591003E-18 0.108631975485363994E-18 - -0.129066645337873990E-18 0.144456340634422998E-18 0.530040792911729013E-19 - 0.403618230401881005E-18 0.737014865216481058E-17 0.110059844318994989E-18 - -0.454074179816084022E-18 0.352795095513764987E-17 0.327364999223985992E-19 - 0.128494236353987006E-18 -0.206400753422191982E-17 0.908972231522271051E-19 - -0.196988099795828006E-18 -0.510866746312750012E-19 -0.832093403647102925E-17 - 0.508219768352579978E-20 -0.156595216123638992E-18 0.304349525860310983E-18 - -0.762382592088074007E-17 0.127054942088145006E-19 -0.180629776001980005E-18 - -0.293741761684931004E-18 0.590206271074140996E-17 -0.421489881126395011E-19 - -0.346224717190195023E-18 -0.299214388617581980E-18 -0.100318347108062999E-16 - 0.568359107607636009E-18 0.107471540347625997E-16 -0.118584612615602002E-19 - -0.106064406863998999E-18 -0.429267033245619011E-18 -0.243921335135351988E-17 - -0.434104385467828984E-19 0.210838411972415998E-18 -0.518765659418141033E-17 - -0.190582413132218010E-18 -0.912678000666508976E-19 0.540618778585057037E-17 - 0.694567016748526037E-19 0.345589442479755020E-18 -0.406575814682063982E-19 - 0.202271467804326998E-17 -0.711507675693612044E-19 0.118923425794503999E-17 - -0.203287907341031991E-19 0.797629750605692964E-17 0.188464830764082006E-19 - -0.451061588025165991E-18 -0.189821406968669007E-18 -0.400002748021503026E-17 - 0.124839752407727995E-18 0.327464209198704022E-18 0.545653123870147032E-17 - -0.251365764192934006E-18 -0.179010689506355000E-18 -0.904977154343205932E-18 - -0.183495918530850001E-18 -0.399801975895928995E-18 0.278820123303884006E-23 - 0.562694574772872009E-18 0.202758511748997993E-19 0.854656243779589042E-18 - -0.390636044278167019E-19 -0.107188002479181997E-17 -0.245956339028960991E-20 - -0.580863803722753026E-19 -0.791746263062787051E-20 -0.574261868458814053E-19 - 0.236017789818529992E-18 0.373936922071946989E-18 0.518031784778684025E-18 - 0.245470727140724987E-17 -0.691522992094331016E-19 -0.285741270800317976E-18 - 0.316578564036294992E-19 0.110114283143058996E-18 -0.169406589450859996E-18 - -0.754330485133237969E-17 
0.942324153820408977E-20 0.281214938488428001E-18 - -0.111808349037568006E-18 0.279944389067546021E-18 -0.269065309651248993E-19 - 0.764138861964645979E-17 -0.310440883891150985E-19 0.204959664400601989E-18 - 0.515554994382785973E-19 0.579622776531413976E-17 -0.229418597622752010E-19 - 0.920281259736054051E-17 -0.399130950750337972E-23 -0.120874248551148997E-18 - -0.174419097167777988E-18 -0.868654828080977068E-18 -0.900415783177556089E-21 - 0.404465583235387984E-19 -0.681480957419390977E-17 0.149009478768038989E-19 - 0.338276207821614990E-18 -0.482449426342788030E-17 0.360144848996880021E-20 - -0.141509621152472008E-18 -0.132682285823646993E-18 -0.142110385631281993E-16 - 0.189967376932361008E-22 0.110892271134246991E-18 0.633346199603779993E-19 - -0.300611992980550998E-17 -0.269779993700494995E-18 0.391392087358040979E-18 - 0.589549975636383965E-19 0.134263796311126002E-16 -0.559058288800089044E-19 - -0.227134697220322979E-18 0.286999495230071982E-17 -0.530824960132428959E-18 - -0.419810704482912991E-18 -0.874646221334790992E-17 -0.519739416435238996E-17 - -0.217856874033806009E-17 -0.364476457406177966E-23 0.131634213959238997E-18 - -0.338019085513668988E-19 0.242463181151542991E-19 0.205405489709168013E-19 - 0.396478007354323017E-19 0.290554014399305015E-18 -0.223490056723352997E-17 - -0.582914177666118979E-20 -0.155517709978212000E-20 0.518526149271776997E-17 - 0.184335545146216991E-18 0.465497544075441045E-18 0.198713929425859014E-17 - 0.592923063078010012E-20 -0.474073752666390980E-19 -0.119365471113852009E-18 - -0.367304091612129012E-17 0.149091033606558009E-19 -0.402266203690663009E-19 - -0.295418456786575992E-18 -0.928032954377330083E-18 0.123415347392912000E-19 - -0.403637669146276001E-18 -0.193017632855574003E-18 0.642437432800943969E-17 - -0.115514118181804992E-18 0.261563774112127983E-17 -0.575982404132923990E-19 - -0.514996031930614991E-18 -0.508219768352580011E-18 -0.113336184716178005E-16 - -0.258810717670098996E-20 -0.318484388167617024E-18 0.813151629364127964E-19 - 0.115196480826585000E-17 -0.162630325872826002E-18 -0.178829830989063991E-18 - -0.260208521396520987E-17 -0.135525271560687994E-19 0.731095312598867980E-19 - -0.169050570915216994E-17 0.107679063419703003E-18 -0.465868120989865006E-19 - 0.981367078732912033E-20 0.423516473627149989E-19 -0.982558218814988032E-19 - -0.182959116606928994E-18 -0.101988060805338004E-18 -0.209992702514142018E-17 - 0.590276085117841030E-19 0.271050543121375988E-19 0.162630325872826002E-18 - -0.382181265801140005E-17 0.383811804224604982E-21 0.423516473627149977E-18 - -0.149077798716756994E-17 -0.279182059415017007E-17 0.592923063078009997E-19 - -0.338813178901719985E-20 0.308319992800564986E-18 0.150771864611265003E-18 - -0.740624433255478979E-19 -0.387094056895215014E-18 0.676501392170368033E-19 - -0.121549227930992002E-18 0.295402740354937003E-19 -0.291416832279092011E-17 - -0.460872725571868987E-19 0.758375729200866953E-19 0.521371940092172967E-20 - -0.406504324951388992E-19 -0.651424377925056947E-19 -0.108738516348261003E-17 - -0.957544277091384987E-20 -0.357024387267687978E-18 -0.120808074102144995E-18 - -0.257471546185705991E-18 0.344363560811950999E-18 0.515776973146925997E-17 - 0.290371414279082971E-19 0.268688528882230986E-18 0.191512370935880005E-18 - 0.454950577675083974E-17 0.264846688527228016E-18 -0.236094717615498001E-18 - -0.111563503576251997E-18 0.117314063194721006E-18 -0.277085652870562998E-18 - -0.285873619698326003E-19 0.390852765598656020E-18 0.429106891079029014E-17 - -0.138235776991902008E-17 
0.236143521271638010E-19 0.298686647386773004E-19 - 0.674238226014422974E-18 0.847032947254300038E-20 0.176182853028893989E-18 - -0.202096767258955992E-19 -0.745388993583784058E-19 0.325260651745650993E-18 - 0.119262238973405006E-17 0.307472959853311023E-18 -0.216840434497101002E-17 - -0.154498809579183997E-17 -0.527193306371076993E-17 0.337489689921634990E-19 - 0.389635155736978012E-18 0.267662411332359003E-18 0.125360876193636007E-17 - 0.162317651601281012E-19 -0.846415539645091067E-17 0.115143541267381001E-20 - -0.813151629364127964E-19 -0.243945488809238016E-18 -0.454009659728304982E-17 - -0.104290931630686000E-19 0.338813178901719991E-18 0.623416249179164953E-18 - 0.313063377305188982E-17 0.220228566286117998E-19 0.609863722023096003E-19 - 0.155854062294790998E-18 -0.355753837846805998E-18 0.374388562686400991E-18 - -0.416740210049116008E-18 -0.124831480601602994E-18 0.223034362923898006E-18 - -0.571879588294660973E-19 -0.533562697900371999E-17 -0.143723355214125000E-18 - 0.107285742269608005E-18 -0.225737175575414976E-18 -0.854858075849051981E-19 - 0.293756650935957019E-19 -0.710708536503824959E-17 -0.797732982746140040E-20 - 0.214140516977727985E-19 -0.228315083954436005E-18 -0.165965518102638991E-18 - -0.213240544471269990E-18 0.618450518525887022E-17 0.116043513773839002E-18 - -0.139892785194967992E-18 -0.506313944221258028E-18 0.133937084784586000E-18 - 0.204346698525099987E-19 0.536807130322413020E-19 0.124523769413732997E-19 - 0.148952067263648992E-18 -0.301517259442928983E-18 -0.127393755267047002E-17 - 0.398105485209521015E-18 0.211419423634673018E-17 0.340168431617327016E-17 - -0.430292737205185006E-18 -0.193123511973979996E-18 -0.107814853389060005E-16 - 0.631039545704453993E-19 0.230392961653169981E-18 -0.135525271560687994E-19 - 0.976205471710581020E-19 -0.194817577868488988E-19 -0.813151629364127964E-19 - -0.117102304957906998E-18 -0.428768077900126986E-17 -0.606475590234079043E-18 - 0.101643953670515996E-19 -0.106005024343306003E-18 0.744917500634628952E-19 - 0.810409525633515039E-20 0.448588648865876997E-17 0.121760986167806011E-18 - 0.375955573638820974E-17 0.914795583034644005E-19 -0.891027874206852923E-17 - -0.104166389249707997E-18 0.901362169886782956E-19 -0.633570718378866017E-19 - 0.112392829688519002E-18 0.194071254160812988E-18 0.147344671802759991E-17 - 0.697367299791314956E-19 0.211231296913984995E-19 0.479296835202547044E-18 - -0.566753257617398987E-19 0.323987061890757993E-18 0.259933235688662995E-19 - 0.986793383551260015E-19 -0.179676863936318002E-18 -0.919808297546715955E-19 - -0.606703892083144040E-19 0.386989004957421020E-18 0.314651646799352012E-17 - 0.101994729949027002E-18 0.684812902965300976E-18 -0.806073444862511971E-18 - 0.110753693756562992E-18 -0.925118797079306001E-20 0.301637366067872000E-17 - 0.452103835596982987E-19 -0.753329927464293003E-19 -0.465974000108271974E-18 - 0.212139401639839982E-17 0.984675801183124024E-19 -0.342677766723567977E-18 - -0.147701370177469013E-19 0.162418567636011994E-18 0.148230765769503011E-19 - -0.840150804557859028E-19 0.448834817816173048E-18 -0.330239617288731015E-17 - -0.107028899458235995E-19 -0.720116040930874954E-19 0.171179072713674006E-18 - 0.947754072384057926E-17 -0.104498004108727002E-16 -0.159287360851361995E-18 - -0.546449624875564009E-18 -0.169016160201734991E-18 -0.141622419855816012E-18 - 0.105248409733328002E-16 -0.382470559941918998E-19 -0.297665499382925019E-18 - 0.962848986359687018E-18 -0.178557947061475997E-18 0.440432446400829996E-19 - 0.263750093250461983E-17 0.518265773643397017E-17 
-0.455663906451196969E-17 - -0.368948450555535963E-20 -0.373638438713888985E-18 0.213542219055679009E-18 - -0.448714132358229986E-17 -0.219963878966174016E-20 0.576077695339490031E-17 - 0.158077523781333993E-18 -0.930746933967116952E-18 -0.950560029288081993E-19 - 0.423162853915283991E-19 0.142072256667234998E-18 0.287333593479481977E-19 - 0.170800388783012010E-19 -0.143679075460453001E-16 -0.306660014178995024E-19 - -0.150184396739703993E-18 -0.853679844308174942E-19 -0.189294334590933006E-18 - -0.272474283524034976E-18 0.125462270442160009E-18 -0.751674786718394965E-19 - 0.306550050694755012E-17 -0.309179488361727017E-21 -0.225680824731844995E-17 - 0.535668041239238007E-21 0.109157572264066003E-20 -0.602222147500702957E-21 - -0.399748891870335016E-18 0.651654608810402999E-19 -0.115821846718304011E-18 - 0.728333770199589983E-19 -0.105682434112270003E-16 0.785310085019301966E-19 - -0.113602115086027998E-18 -0.212083351005076997E-19 -0.204710795319530007E-18 - -0.245404952064859993E-18 -0.800663773371987988E-17 -0.216121373216464000E-19 - -0.990499766618607977E-19 0.114815399364617001E-19 -0.876268280833858963E-19 - -0.244044218853604000E-18 -0.920838732195320949E-17 -0.554290201001359983E-20 - -0.815213370858219041E-19 -0.323977921931644982E-19 0.244566550410973989E-17 - 0.922736264539544005E-17 -0.225107239567582988E-18 0.146285264029161001E-20 - -0.131179966443482995E-19 0.227861500072650001E-18 0.104859413987007006E-16 - -0.325575790958343987E-21 -0.128914520652134002E-20 0.712380445010489047E-17 - -0.689612758111431019E-19 -0.518476950651943026E-19 -0.987352703162770031E-18 - -0.651557083079744989E-17 0.618057019281626989E-17 -0.256050918324738993E-22 - -0.579315073000275023E-20 0.116825754961274992E-20 0.488482179479582027E-17 - -0.112839648754171999E-20 -0.171478222420645004E-17 0.121003483029333003E-16 - 0.281361349456850020E-19 0.196479900706991004E-18 0.446695067007620992E-17 - 0.296023125814352014E-19 -0.525838715399959994E-19 0.134402804013066001E-16 - -0.533630756770208973E-19 -0.135525271560687994E-19 -0.307388256558586017E-17 - 0.418095462764723033E-17 0.325938278103455002E-17 -0.804756263134598058E-22 - 0.212102343948397000E-18 0.553747789267498967E-19 0.178660424399612991E-17 - -0.110855436971906990E-18 -0.115760817367797994E-16 -0.210618278531899992E-19 - -0.934068891307130031E-19 0.132841484063259997E-18 0.281252988796605007E-18 - -0.193943299659809005E-18 -0.345377684242941011E-18 -0.582891016608967021E-18 - -0.584484497340988962E-17 -0.130601892554772000E-18 -0.345854140275771989E-18 - 0.152100316675026008E-18 -0.102914503091396998E-18 0.114150924532318007E-21 - -0.415920887652381993E-18 -0.147870942203041992E-18 -0.441264460850088028E-18 - -0.741908217566160965E-18 0.214934610365779012E-19 0.377141419764976985E-18 - 0.129262521706926995E-17 -0.169406589450860008E-19 0.576660030490727979E-17 - 0.296461531539004998E-19 0.508219768352580011E-18 0.572594272343907029E-18 - 0.357553782859722012E-18 -0.169662498452868990E-21 -0.162630325872826002E-18 - -0.745388993583784058E-19 -0.197866896478604983E-17 0.162630325872826002E-18 - -0.582335151237331002E-19 0.208708918203459989E-17 -0.508219768352580023E-19 - 0.100691041604855001E-18 -0.138156367653096994E-17 0.225522522206456984E-18 - 0.317637355220363012E-18 0.468614360623540998E-19 -0.230392961653169981E-18 - -0.332036915323686022E-18 -0.677626357803439970E-20 0.210729224131558998E-18 - 0.668009886874144024E-17 -0.181651509494605009E-17 -0.511607900141597020E-17 - 0.677626357803439970E-20 0.169406589450859991E-17 
-0.870194004405784926E-21 - -0.119153712877039012E-18 -0.291379333855478980E-18 0.481961746987696989E-18 - 0.290320542671411008E-18 0.758687410855676968E-17 -0.385399991000707015E-19 - -0.444692297308507979E-20 0.647980204649540000E-19 0.108234928791338999E-18 - 0.201157090083095001E-18 -0.197605409177339990E-18 0.198550747370517010E-19 - -0.101671722281198999E-18 0.310643151866217981E-18 -0.125183016637982994E-16 - -0.145837243138382015E-19 0.171651875903994001E-19 -0.615441504747079020E-17 - 0.123241691163063998E-18 -0.110059088224217001E-18 -0.109498990014289995E-17 - -0.483878427864559008E-19 0.347403720981722015E-18 -0.340587449003903991E-18 - 0.741306893449987060E-17 -0.363633250169256998E-19 0.364955640550089989E-18 - 0.328622055261124998E-20 0.494646450528802992E-17 0.588947426258927007E-19 - 0.299974477127637992E-18 0.552209517671485973E-18 0.121661835129705993E-16 - -0.100268920998511004E-18 0.115831755537026005E-18 -0.310225816931887018E-18 - 0.122421671866663997E-16 0.579370535921941047E-18 -0.727770708280895005E-17 - -0.135525271560687994E-19 0.698851812321550961E-19 0.225434841061526989E-18 - -0.346550460915418993E-17 0.144465439621161009E-18 -0.453050130217742988E-18 - 0.304348202371331014E-17 0.166441974135469993E-18 -0.925383494875323000E-19 - -0.872359232377204073E-17 -0.423516473627150019E-20 0.172794721239877004E-18 - -0.105032085459533001E-18 -0.633919457725117989E-17 0.457397791517322003E-19 - 0.704731412115577978E-18 -0.674238226014422974E-18 -0.404116110412576017E-17 - -0.362753453114926982E-18 -0.475570224499606972E-17 -0.176475368004182995E-18 - 0.178340140066433007E-19 -0.163225895913863989E-18 0.482835275563946972E-19 - 0.399079748874743998E-20 0.367605681663466019E-18 0.163914110183507994E-19 - -0.579031722743040020E-17 -0.228529489169210000E-17 0.277911509994136018E-17 - 0.156800809279949005E-22 -0.101942532784422997E-16 -0.200111533788828004E-19 - 0.681861522539712014E-19 -0.438868945796134002E-19 -0.998186762513412040E-19 - -0.723376683507983022E-19 0.104239077746036003E-16 -0.222340978775424991E-19 - -0.146386980184122005E-18 0.409358532980754034E-17 0.223196490323958014E-18 - 0.222899946074357995E-18 0.351381030256605971E-17 -0.227693044133796995E-17 - 0.248805340344110007E-17 0.166759611490690003E-20 0.826704156520197035E-18 - 0.189735380184963011E-18 -0.925912890467357058E-19 0.229545928705915007E-18 - 0.121727104849915995E-16 -0.743059652978834972E-17 -0.681480357713447030E-17 - 0.102237042169716995E-18 0.239339747158542999E-18 0.388946941467333989E-18 - 0.338813178901719991E-18 -0.921571846612678961E-18 0.655264687995926973E-17 - 0.770799982001412947E-19 0.954817889792410057E-18 0.202589700059119992E-19 - -0.249620609555842002E-17 -0.114772964352958009E-18 -0.135525271560688006E-18 - 0.338813178901720015E-19 -0.451045044412914991E-19 0.101643953670516005E-18 - 0.627651413915437033E-18 0.282061971435682013E-18 0.234119906621088999E-17 - 0.834327453045485954E-19 0.216840434497100983E-18 0.237169225231203999E-18 - -0.646794358523383970E-17 -0.677626357803440031E-19 0.243521972335611010E-18 - -0.633315946750199974E-18 0.176182853028893989E-18 0.211758236813574988E-18 - -0.549982711273340999E-17 -0.421382929258131981E-19 -0.755877126763073964E-19 - -0.927923781985524985E-20 0.544312695782081001E-18 0.417435145161237000E-19 - -0.461341442457651995E-17 0.536108318836156013E-19 0.197084421782326988E-18 - -0.190691339550880991E-18 -0.193154897758039006E-18 -0.127537878015355010E-18 - 0.298736053454965990E-18 0.127528573266872008E-18 0.609637342641941969E-17 - 
0.396854887229514001E-20 -0.817053422952606007E-17 -0.752305314909567013E-21 - -0.193634152214976995E-19 0.401511567862913983E-22 0.140692652691434998E-18 - 0.125895048470979998E-19 -0.964089261394241035E-19 0.637270598203130045E-19 - 0.357904713618655020E-17 -0.192415561086064008E-18 -0.335339833979068015E-18 - -0.144439750170145000E-20 0.293628202065289017E-18 -0.755801452661721952E-19 - 0.339411342868876015E-17 -0.400752487403498007E-20 0.763312882702686022E-19 - 0.247337834049500987E-18 0.791484376388382995E-19 0.117214281504105993E-18 - 0.710182978617857932E-17 0.342936365526693036E-20 0.555760486582942945E-19 - -0.781423606007173046E-21 0.298447611490521017E-17 -0.609211529548263984E-17 - 0.371005047806836004E-19 0.187569128895034009E-18 -0.189293009203637990E-18 - -0.886637171818198977E-19 0.294046943769591991E-17 0.259566158502791993E-21 - 0.291867570968205015E-20 0.719829617311043049E-17 0.113028268052886998E-19 - 0.773855520595252012E-19 -0.378454053841608976E-17 -0.480794137786766970E-17 - -0.101370917403036996E-16 0.115757992860620994E-21 -0.592809693679449974E-20 - 0.167889785464364989E-18 0.611388863593793994E-17 0.327757493780598990E-21 - 0.452478537105359005E-17 0.863084305453742042E-17 0.186191808557267986E-17 - 0.626132329032225955E-19 0.646801472276652056E-19 0.138210713419341001E-18 - 0.227784990428761008E-19 -0.145303063388947988E-19 0.114819286467261003E-18 - -0.105892353296588006E-18 -0.137844024253797010E-17 0.171947688292622993E-18 - 0.230392961653169981E-18 -0.409752188234268030E-19 -0.101232067356301001E-16 - 0.159844381569747008E-19 -0.165171424714588999E-18 0.324837135272023987E-18 - 0.247031279622018984E-18 0.105941867811068004E-18 0.165709810725904997E-17 - 0.250228339151884989E-19 0.121222449583768003E-18 0.220343942471932981E-17 - -0.175540960873553008E-18 -0.443011466303800022E-18 -0.485540461189846004E-17 - -0.599881968135295963E-17 -0.920317179059558028E-17 -0.319285512590875016E-19 - -0.109833041734791000E-19 -0.160489582447537996E-18 0.197370257238828010E-18 - 0.281536711746710985E-18 0.502518425980557979E-18 0.685177358743191994E-17 - 0.732594660177182060E-17 -0.814276594997199962E-19 -0.277826806699410993E-18 - -0.214934610365779000E-18 0.267620059684996000E-17 -0.444713473132189036E-17 - -0.670701863459635981E-17 0.264697796016969006E-19 0.585130359963270982E-17 - -0.372694496791892029E-19 0.801293168102568024E-18 0.410641572828885035E-17 - 0.502798757490152989E-17 0.178988649666674003E-18 0.343835745465758998E-14 - -0.367193526019243979E-14 0.124117838402881993E-13 -0.668735222362701016E-14 - 0.823316024731179930E-17 0.113502414932076005E-18 0.248604170019137005E-18 - -0.680961550033253983E-18 -0.504233419544565038E-17 -0.217204263911860001E-19 - -0.210064170919066003E-18 -0.277826806699410993E-18 0.184997289636259991E-18 - 0.124672661923991994E-18 0.142419093985962009E-17 0.318871176989522014E-22 - 0.362796009892000020E-20 0.475428702862992037E-20 0.114993378999595012E-18 - 0.296998248186471012E-18 0.137377009476437004E-17 0.107906108735239008E-20 - 0.344697057730012976E-18 0.233888414833785013E-18 -0.114499755441505997E-18 - 0.343701109734176990E-18 0.239648173231097987E-18 0.662407103109341977E-18 - -0.377078204440514013E-17 -0.211556892294861001E-20 -0.821331087183475025E-17 - 0.141328819223921000E-20 -0.122986833084708995E-19 -0.578785195122303022E-22 - 0.117628290555894005E-18 -0.482852030302985980E-19 0.152715824767208010E-18 - 0.191404425117076001E-18 -0.265067067422130010E-17 -0.664820294833819956E-19 - 0.731756791723990014E-19 
0.711274425411596942E-19 -0.103554248939230999E-18 - 0.528365257801646029E-18 0.607076619005779013E-17 -0.156159844440068998E-18 - 0.131687040427342994E-19 -0.227879910615741999E-18 0.316422725621148005E-18 - 0.120637954277358989E-18 0.119435717232415004E-16 0.515901036912107984E-20 - 0.767837756490412976E-19 -0.195591700332982007E-18 0.102080243701931009E-17 - -0.582123393000517985E-17 -0.113091510730573003E-19 -0.737094946711110960E-24 - 0.812485213421242952E-19 0.293564361805019011E-18 -0.257901768391306992E-17 - 0.755454438035129986E-21 -0.102984835529547994E-20 -0.299206409188393008E-17 - 0.118414324366844011E-18 0.106238035287570000E-18 0.865029155483104026E-17 - -0.605451096465977991E-17 -0.908673286122591033E-17 -0.245986784410216017E-21 - -0.231756677129184016E-18 -0.175645901796425989E-18 0.582748079799118032E-17 - 0.756879467374077992E-22 0.697679309046806012E-17 0.378114738427846991E-17 - 0.185184588830498988E-18 0.240027296595149010E-19 0.612515214688381968E-17 - 0.116178223514405005E-20 0.107214617423943998E-19 -0.748335452127508986E-17 - 0.186841398172223001E-18 -0.219407572781849989E-18 -0.827097614197910050E-17 - 0.295813021938763978E-18 0.167050779066308997E-18 0.000000000000000000E+00 - -0.185963535048316991E-18 -0.266565349779309023E-18 0.395342870760552980E-17 - 0.710830601573883961E-22 0.714369058868556026E-17 0.340525277702111998E-22 - -0.382282094159552979E-20 0.149830506514481011E-21 0.883994619572126023E-19 - -0.758302221231688993E-19 0.109190432258762008E-18 -0.703507417453546983E-20 - -0.569691861010580966E-17 -0.419320786566922027E-19 0.150383337888478994E-18 - 0.240575431966751987E-21 0.280064166101323972E-19 0.105607927978014003E-18 - 0.524266083943383965E-17 0.504615464422202999E-23 0.269899549811437999E-18 - -0.169841572392483002E-18 0.110246530172922989E-18 0.595056711102719968E-19 - 0.830106334405030930E-17 0.722278172990987047E-22 -0.260802680060638985E-19 - -0.210156414482063007E-19 -0.129291638464487995E-17 0.652706764278169958E-22 - 0.409368376430043030E-17 0.000000000000000000E+00 0.231210476845850008E-18 - 0.729891645869777022E-19 0.462908799630395012E-17 -0.770274702091042971E-23 - -0.714684049245816005E-21 0.462959092211639007E-17 -0.138590428094594993E-19 - -0.208516916658589005E-19 -0.129291638464487995E-17 -0.149795744801019001E-21 - -0.553218393675464984E-20 -0.532042569994107024E-20 0.409368376430043030E-17 - 0.000000000000000000E+00 -0.494984878551731986E-20 -0.489690922631392007E-20 - -0.739443137510889016E-17 0.111000775673736002E-16 0.222679088850805999E-18 - 0.106845003637134005E-18 -0.490050851211016962E-17 0.293770518732017028E-19 - -0.802142049281503955E-19 -0.297911234386516011E-17 0.270478431405522001E-18 - 0.169875903679790995E-18 -0.867533249122434064E-17 -0.585861028395123963E-17 - 0.499436149037106027E-17 -0.308114995328776013E-22 -0.130599392574966007E-14 - 0.130515766480161008E-14 0.769254928189150031E-19 -0.103944874968919999E-18 - -0.878111951636581074E-17 -0.350432956441832991E-22 0.633450637144740049E-19 - -0.820001218346395020E-19 -0.398999557912142013E-21 0.148399850494123999E-19 - 0.909624493601297957E-17 0.302645293357789021E-21 -0.219210703865852003E-18 - 0.308119015134121004E-18 -0.275701318827977019E-18 -0.156380108301380001E-18 - -0.956776784104850991E-17 -0.220370776999360005E-18 0.316081995140841987E-18 - 0.198833102388086990E-19 0.717056862337457936E-17 0.855578145811417952E-19 - -0.993211691437939944E-19 0.833220302850867983E-21 -0.217506104734504998E-19 - -0.778452855386439975E-19 0.409724525050444025E-17 
-0.504842952642287961E-21 - -0.346727880643630006E-18 -0.411653532459296998E-18 0.396489180937604980E-17 - -0.136437880709918003E-23 -0.124957018075439001E-16 0.302207630763774979E-21 - 0.274303192419421975E-19 0.735547307649483054E-19 0.845852113709716048E-22 - -0.134921032453032995E-18 -0.229587702135863011E-18 -0.161215647846060990E-19 - -0.410673872542583016E-17 0.980567686783852952E-19 -0.139877417969035991E-18 - -0.665224868865051959E-21 0.287046018190561014E-19 -0.154866630141438997E-19 - 0.223870818551690020E-18 0.124778581831986997E-18 0.153250017406549997E-17 - -0.448632604218867018E-19 0.711507675693612044E-19 -0.677626357803439970E-20 - 0.107945878798088004E-16 -0.300431998479260013E-19 0.433680868994201966E-18 - 0.118313562072480995E-16 -0.637646402693036987E-17 0.271050543121375988E-19 - -0.101643953670516005E-18 0.677626357803439970E-20 0.813151629364127964E-19 - 0.279520872593919016E-18 -0.745388993583784058E-19 -0.100849860282464999E-19 - -0.592051421507532003E-19 0.780283607724334012E-19 -0.664858447910062962E-19 - 0.437413849068639009E-19 -0.516544150489818993E-17 0.503227866641761002E-20 - -0.301136200599181013E-19 0.266177796878770016E-17 0.102904964664959001E-18 - 0.312088382102112999E-18 -0.297903979430933015E-17 0.202876384986286988E-20 - 0.543951385875457012E-19 -0.239374042842221994E-18 0.418443806615245012E-17 - -0.562255471114576966E-19 -0.295243281097027992E-18 -0.799691946769469941E-19 - -0.852375263513670959E-17 0.140981303182299992E-19 -0.492761908203678022E-18 - 0.254715677652711988E-18 -0.117430723578436001E-16 -0.309172195626648020E-19 - 0.796210970419041958E-19 0.158818677610181001E-18 0.208899500616592014E-17 - -0.443506451182351966E-17 -0.105032085459533001E-18 0.115196480826584991E-18 - 0.152282296409787000E-18 -0.133506123685445993E-18 0.772670733674762928E-17 - 0.347283508374263019E-19 -0.532836663382158006E-19 0.582929066917145019E-17 - 0.359989002583077981E-18 -0.155854062294790998E-18 -0.347495266611077003E-18 - 0.220906192643921991E-17 0.498055372985528985E-17 -0.167957369017666995E-18 - 0.338813178901720015E-19 -0.116890546721092989E-18 -0.293751026107791011E-17 - 0.733609941661028944E-19 0.100944390482868002E-16 -0.267894332096603011E-19 - 0.216840434497100983E-18 0.135525271560687994E-19 0.151788304147971006E-17 - -0.247227741479849002E-19 -0.138913403349704991E-18 0.379470760369926985E-18 - 0.643745039913267969E-18 0.203287907341031991E-19 0.304931861011548002E-19 - -0.182959116606928994E-18 0.347283508374262995E-18 0.196776341559015009E-18 - -0.427751638363421972E-19 -0.163649412387490995E-19 0.690331852012255017E-19 - -0.531407295283666964E-18 -0.422299341979263986E-18 -0.157579974642883004E-19 - 0.192321973960417003E-18 0.223317258693390997E-18 0.423311746425542994E-19 - 0.683894318895060959E-19 0.540270370111050010E-17 0.298115892764110993E-19 - -0.196617522881404009E-18 -0.811034046995993056E-19 0.129860738725924993E-18 - 0.422987078035115991E-19 -0.456513700878625969E-17 -0.224993126614424009E-19 - -0.459581548334461984E-18 -0.133004025053625993E-18 0.735806933367970026E-18 - 0.405093507024368993E-18 0.194632289411277007E-18 -0.352809074866117003E-19 - -0.232722302258119018E-18 -0.601393392550553000E-19 -0.627863172152250006E-19 - -0.931736241979730072E-20 -0.765971894202063951E-17 -0.399121924746226012E-17 - 0.190077832958559989E-18 0.179066508041831014E-19 -0.574288338238416015E-17 - 0.880914265144471992E-19 0.182959116606928994E-18 0.153140909885616995E-18 - 0.203287907341031991E-19 0.325260651745650993E-18 0.379470760369926985E-18 
- -0.207523072077303992E-18 -0.361852475067037028E-17 0.301543729222530998E-17 - 0.394039727062700980E-17 0.350989277518501004E-19 0.393023287525995020E-18 - 0.272744609015884998E-18 -0.357447903741314974E-17 0.155423101195651015E-19 - -0.123801276779504007E-16 0.770270586409379010E-19 -0.443845264361252992E-18 - -0.250721752387272997E-18 -0.393700913883799029E-17 -0.601128694754536031E-19 - 0.103676832743925996E-17 -0.806375365786094077E-17 0.270372916763573015E-17 - 0.271050543121375988E-19 -0.168051336735252995E-17 -0.514996031930614991E-18 - -0.155854062294790998E-18 0.172794721239877004E-18 -0.387517573368842020E-18 - -0.207523072077304004E-19 0.390799826039453012E-18 -0.925342135844694988E-19 - 0.106281541774795000E-18 0.967048582329619004E-19 -0.834571884916494946E-17 - -0.954318272702428028E-19 0.207248448113936002E-18 0.517443990235404036E-17 - -0.244696568805436984E-18 -0.116783013241462011E-18 -0.578651881405754976E-17 - -0.731624708190902038E-19 -0.335583865790313019E-18 -0.197623374506268996E-18 - -0.163191485200382005E-17 0.101432195433701996E-18 0.148336644887909010E-18 - -0.173324116831910990E-18 0.539199998398406007E-17 -0.169406589450860008E-19 - -0.844915364886164949E-19 -0.416303458685688009E-19 0.233338717250593997E-17 - 0.273292202581394994E-19 -0.728448334638698051E-19 -0.525160427297666030E-19 - -0.110323923797504006E-16 -0.193123511973980015E-17 -0.372694496791892029E-19 - 0.115196480826584991E-18 0.242873462735370015E-18 0.910537153843644961E-20 - -0.105840274005222006E-16 -0.469176843440076996E-20 0.193136746863780989E-18 - -0.343708102894564009E-17 -0.209217137971811991E-18 0.414199111207353010E-18 - 0.394293836946877000E-18 0.304931861011548002E-19 -0.304931861011547978E-18 - 0.611915129942227994E-19 0.237169225231204005E-19 -0.299849663328021983E-18 - 0.207692478666753981E-17 -0.319490239792480988E-19 -0.133513568310958995E-17 - -0.374812079160028020E-19 -0.406575814682063982E-19 0.406575814682063982E-19 - -0.420128341838133016E-18 0.463088794131687014E-19 -0.138913403349704991E-18 - 0.127393755267047002E-17 -0.117906986257798992E-17 0.169406589450859993E-20 - -0.332036915323686022E-18 -0.227004829864152009E-18 -0.559041745187837983E-19 - -0.709390093325477015E-19 -0.947618109740748986E-19 0.242489650931144976E-18 - 0.359989002583078005E-19 -0.180312138646758992E-18 -0.229976452627177011E-17 - -0.157796657189281015E-19 0.349872583691553995E-19 0.723435620126626944E-19 - -0.457635088955547972E-19 -0.933145550948367955E-19 -0.123819011531837992E-17 - -0.782909906169189995E-19 -0.184229666027809993E-19 -0.157336369952485999E-18 - 0.346171777630992014E-18 0.105217373916745000E-19 -0.111834818817168995E-17 - -0.155642304057977988E-19 -0.955956090315283023E-19 -0.117479499317231010E-18 - -0.362238933849222033E-17 0.115699406639016991E-18 -0.613040095575299991E-19 - 0.294757539477146002E-19 0.720825038113409974E-18 0.197676314065472005E-18 - 0.436963121664811995E-18 0.347865843525499989E-18 -0.652215369385810972E-18 - 0.381842452622238978E-17 0.131349663828520009E-18 -0.153847115333583998E-18 - 0.736833960816516019E-17 -0.262580213648833015E-19 -0.284603070277445010E-18 - -0.197663079175671012E-19 -0.101643953670516005E-18 -0.298155597433514008E-18 - 0.840256683676266033E-18 -0.957147230397359022E-19 -0.220906192643921991E-17 - 0.447233396150270964E-17 0.315435069557500984E-17 0.371635705607823973E-19 - 0.216840434497100983E-18 0.643745039913267969E-18 0.358125530099118029E-17 - 0.155665051524822986E-19 -0.103592129449201009E-17 0.619922238271740940E-19 - 
-0.238439774652085979E-18 -0.407277263841508986E-18 -0.147777239183251985E-16 - -0.557205404227970984E-19 0.336503690631471986E-18 -0.140650350287169006E-16 - 0.180418017765165996E-18 0.145689666927739995E-18 0.814210420548196008E-18 - -0.687790753170491972E-18 0.718283939271646976E-18 -0.423516473627150019E-20 - -0.565295230618739017E-18 -0.740359735459462010E-19 -0.399799551104030001E-18 - 0.152465930505773989E-18 -0.679733946795801001E-17 -0.208960686132527017E-20 - -0.487083649340625048E-18 -0.251436436436519005E-18 -0.289066536862780991E-19 - 0.571135125743363007E-19 -0.514784273693801031E-18 0.586040920381569047E-19 - -0.225310763969643986E-18 -0.897854924089557998E-19 -0.374812079160027996E-18 - -0.124990299279213007E-18 -0.103052145945326007E-17 0.125149117956822995E-18 - -0.194764638309285997E-18 0.287673564711241993E-18 -0.105034203041900993E-16 - -0.128113733272213002E-19 0.546336250979023959E-19 -0.391223342513080029E-19 - -0.282909004382935976E-18 -0.487890977618476995E-18 0.135525271560688006E-18 - 0.948676900924816019E-19 -0.650521303491303046E-18 -0.664073830647370984E-18 - -0.546505657568474964E-17 -0.508219768352579978E-20 -0.460785923306338999E-17 - -0.338813178901720015E-19 0.482469966756049011E-17 0.372694496791892029E-19 - 0.184653182501436992E-18 0.528548559086683026E-18 0.178529134292789006E-16 - -0.885043550762337067E-17 0.203626720519933994E-17 0.109267250195805009E-18 - -0.243521972335611010E-18 -0.253712837482265010E-19 -0.291379333855478980E-18 - -0.145689666927739995E-18 -0.109097843606353999E-17 0.118584612615601999E-18 - 0.419895407777638036E-17 -0.436221967835964976E-19 0.166770584104119993E-18 - 0.301564784753128000E-18 0.105479300460810000E-16 0.153246012714406002E-21 - -0.721442795365178040E-20 0.785191282370076000E-17 0.611660841639171959E-19 - -0.269408879070079001E-19 0.980216235171613067E-20 -0.642700149301778963E-17 - -0.852508928293581066E-17 0.697544054984669996E-24 0.250693589472354980E-18 - 0.114838828609231990E-18 0.137684826882562005E-19 -0.881975394890917019E-19 - 0.988901378360807009E-17 -0.584661507095537965E-20 -0.378455001863782989E-18 - 0.568235859210977989E-20 -0.215342517844311983E-20 -0.145015040856607996E-18 - -0.469036545974539019E-17 -0.514100323766396972E-20 -0.305076832491833999E-18 - -0.196493615426171004E-18 0.205358492279442992E-18 0.117772935177185998E-18 - 0.205010377842294006E-17 0.202435071204792989E-19 -0.619429451884161010E-19 - 0.515229817082411962E-18 0.798642516943490968E-17 -0.375603283878283021E-18 - -0.257011107083719992E-18 -0.135643988942254000E-20 -0.402490229080583990E-19 - -0.873766277585149009E-19 0.487430530707361009E-17 0.786805309075646029E-21 - 0.927211481210010019E-19 -0.946732143119615016E-19 -0.966803394206032053E-17 - 0.210728270163397999E-24 -0.182379220144080012E-17 0.301144665518455994E-21 - 0.598805976239300004E-20 0.413513361367576011E-19 0.379595377543990018E-21 - -0.148122463346125989E-18 -0.690407715361336988E-19 -0.320930310026529998E-19 - -0.913936660975153941E-17 0.247663640323067001E-18 0.222959731044695013E-18 - -0.869297579538561051E-22 0.634741170484694975E-20 -0.391093459440610985E-21 - -0.235888633395294979E-18 -0.133873622544535990E-18 -0.597176764414821015E-17 - -0.972390876617005020E-20 diff --git a/EXAMPLE/pddrive_spawn.c b/EXAMPLE/pddrive_spawn.c index 47b04729..49c5d645 100755 --- a/EXAMPLE/pddrive_spawn.c +++ b/EXAMPLE/pddrive_spawn.c @@ -247,7 +247,7 @@ int main(int argc, char *argv[]) /* Check the accuracy of the solution. 
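      (pdinf_norm_error now takes the MPI communicator, grid.comm,
      directly instead of a pointer to the whole gridinfo_t)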
*/ pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, - nrhs, b, ldb, xtrue, ldx, &grid); + nrhs, b, ldb, xtrue, ldx, grid.comm); PStatPrint(&options, &stat, &grid); /* Print the statistics. */ diff --git a/EXAMPLE/pzdrive_spawn.c b/EXAMPLE/pzdrive_spawn.c index faf725c6..9437b129 100755 --- a/EXAMPLE/pzdrive_spawn.c +++ b/EXAMPLE/pzdrive_spawn.c @@ -247,7 +247,7 @@ int main(int argc, char *argv[]) /* Check the accuracy of the solution. */ pzinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, - nrhs, b, ldb, xtrue, ldx, &grid); + nrhs, b, ldb, xtrue, ldx, grid.comm); PStatPrint(&options, &stat, &grid); /* Print the statistics. */ diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt index 4dc01bda..07726d7c 100644 --- a/SRC/CMakeLists.txt +++ b/SRC/CMakeLists.txt @@ -53,7 +53,6 @@ set(sources trfAux.c communication_aux.c treeFactorization.c - treeFactorizationGPU.c sec_structs.c cublas_utils.c ) @@ -65,7 +64,7 @@ endif () set_source_files_properties(superlu_timer.c PROPERTIES COMPILE_FLAGS -O0) if(enable_double) - list(APPEND headers superlu_ddefs.h) + list(APPEND headers superlu_ddefs.h dlustruct_gpu.h) list(APPEND sources dlangs_dist.c @@ -108,6 +107,7 @@ if(enable_double) dnrformat_loc3d.c pdgstrf3d.c dtreeFactorization.c + dtreeFactorizationGPU.c dgather.c dscatter3d.c pd3dcomm.c @@ -115,7 +115,7 @@ if(enable_double) dcommunication_aux.c dtrfCommWrapper.c dsuperlu_blas.c - superlu_gpu.cu + dsuperlu_gpu.cu ) if (HAVE_COMBBLAS) list(APPEND sources d_c2cpp_GetHWPM.cpp dHWPM_CombBLAS.hpp) @@ -125,7 +125,7 @@ endif() ## enable double if(enable_complex16) - list(APPEND headers superlu_zdefs.h) + list(APPEND headers superlu_zdefs.h zlustruct_gpu.h) list(APPEND sources dcomplex_dist.c @@ -169,12 +169,14 @@ if(enable_complex16) znrformat_loc3d.c pzgstrf3d.c ztreeFactorization.c + ztreeFactorizationGPU.c zscatter3d.c zgather.c pz3dcomm.c ztrfAux.c zcommunication_aux.c ztrfCommWrapper.c zsuperlu_blas.c + zsuperlu_gpu.cu ) if (HAVE_COMBBLAS) list(APPEND sources z_c2cpp_GetHWPM.cpp zHWPM_CombBLAS.hpp) diff --git a/SRC/dscatter3d.c b/SRC/dscatter3d.c index 73d4df7d..78bf5f0e 100644 --- a/SRC/dscatter3d.c +++ b/SRC/dscatter3d.c @@ -19,10 +19,7 @@ at the top-level directory. #else //#include "cblas.h" #endif - -#ifdef _OPENMP #include "omp.h" -#endif #define ISORT #define SCATTER_U_CPU scatter_u @@ -98,11 +95,11 @@ dblock_gemm_scatter( int_t lb, int_t j, ) { // return ; -#ifdef _OPENMP +#ifdef _OPENMP thread_id = omp_get_thread_num(); -#else +#else thread_id = 0; -#endif +#endif int *indirect_thread = indirect + ldt * thread_id; int *indirect2_thread = indirect2 + ldt * thread_id; double *tempv1 = bigV + thread_id * ldt * ldt; @@ -132,27 +129,18 @@ dblock_gemm_scatter( int_t lb, int_t j, int_t ib = Remain_info[lb].ib; int temp_nbrow = lsub[lptr + 1]; lptr += LB_DESCRIPTOR; - int_t cum_nrow = (lb == 0 ? 0 : Remain_info[lb - 1].FullRow); - /* Getting L block information */ - // int_t lptr = Remain_info[lb].lptr; - // int_t ib = Remain_info[lb].ib; - // int_t temp_nbrow = lsub[lptr + 1]; - // lptr += LB_DESCRIPTOR; - // int_t cum_nrow = Remain_info[lb].StRow; - double alpha = 1.0; - double beta = 0.0; + int cum_nrow = (lb == 0 ? 
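                      /* cumulative row count of the preceding remaining L blocks */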
0 : Remain_info[lb - 1].FullRow); + double alpha = 1.0, beta = 0.0; - - superlu_dgemm("N", "N", - temp_nbrow, ncols, ldu, alpha, + /* calling DGEMM */ + // printf(" m %d n %d k %d ldu %d ldl %d st_col %d \n",temp_nbrow,ncols,ldu,ldl,st_col ); + superlu_dgemm("N", "N", temp_nbrow, ncols, ldu, alpha, &L_mat[(knsupc - ldu)*ldl + cum_nrow], ldl, &U_mat[st_col * ldu], ldu, beta, tempv1, temp_nbrow); - - + // printf("SCU update: (%d, %d)\n",ib,jb ); #ifdef SCATTER_PROFILE - //unsigned long long ttx = __rdtsc(); double ttx = SuperLU_timer_(); #endif /*Now scattering the block*/ @@ -186,7 +174,6 @@ dblock_gemm_scatter( int_t lb, int_t j, // stat->ops[FACT] += 2*temp_nbrow*ncols*ldu + temp_nbrow*ncols; #ifdef SCATTER_PROFILE - //double t_s = (double) __rdtsc() - ttx; double t_s = SuperLU_timer_() - ttx; Host_TheadScatterMOP[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1) + (192 / 8) * (CEILING(ncols, 8) - 1))] += 3.0 * (double ) temp_nbrow * (double ) ncols; @@ -236,21 +223,17 @@ dblock_gemm_scatter_lock( int_t lb, int_t j, /* Getting L block information */ int_t lptr = Remain_info[lb].lptr; int_t ib = Remain_info[lb].ib; - int_t temp_nbrow = lsub[lptr + 1]; + int temp_nbrow = lsub[lptr + 1]; lptr += LB_DESCRIPTOR; - int_t cum_nrow = Remain_info[lb].StRow; + int cum_nrow = Remain_info[lb].StRow; double alpha = 1.0; double beta = 0.0; /* calling DGEMM */ - - superlu_dgemm("N", "N", - temp_nbrow, ncols, ldu, alpha, - &L_mat[(knsupc - ldu)*ldl + cum_nrow], ldl, - &U_mat[st_col * ldu], ldu, - beta, tempv1, temp_nbrow); - - + superlu_dgemm("N", "N", temp_nbrow, ncols, ldu, alpha, + &L_mat[(knsupc - ldu)*ldl + cum_nrow], ldl, + &U_mat[st_col * ldu], ldu, beta, tempv1, temp_nbrow); + /*try to get the lock for the block*/ if (lock) /*lock is not null*/ while (!omp_test_lock(lock)) @@ -258,7 +241,6 @@ dblock_gemm_scatter_lock( int_t lb, int_t j, } #ifdef SCATTER_PROFILE - //unsigned long long ttx = __rdtsc(); double ttx = SuperLU_timer_(); #endif /*Now scattering the block*/ @@ -300,7 +282,8 @@ dblock_gemm_scatter_lock( int_t lb, int_t j, += t_s; #endif } /* dblock_gemm_scatter_lock */ -#endif // only if _OPENMP is defined +#endif // Only if _OPENMP is defined + // there are following three variations of block_gemm_scatter call /* @@ -341,11 +324,11 @@ int_t dblock_gemm_scatterTopLeft( int_t lb, /* block number in L */ int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; double** Unzval_br_ptr = Llu->Unzval_br_ptr; -#ifdef _OPENMP +#ifdef _OPENMP volatile int_t thread_id = omp_get_thread_num(); -#else +#else volatile int_t thread_id = 0; -#endif +#endif // printf("Thread's ID %lld \n", thread_id); //unsigned long long t1 = _rdtsc(); @@ -369,7 +352,7 @@ int_t dblock_gemm_scatterTopLeft( int_t lb, /* block number in L */ int_t dblock_gemm_scatterTopRight( int_t lb, int_t j, double* bigV, int_t knsupc, int_t klst, int_t* lsub, - int_t * usub, int_t ldt, int* indirect, int* indirect2, + int_t* usub, int_t ldt, int* indirect, int* indirect2, HyP_t* HyP, dLUstruct_t *LUstruct, gridinfo_t* grid, @@ -383,11 +366,11 @@ int_t dblock_gemm_scatterTopRight( int_t lb, int_t j, int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; double** Unzval_br_ptr = Llu->Unzval_br_ptr; -#ifdef _OPENMP +#ifdef _OPENMP volatile int_t thread_id = omp_get_thread_num(); -#else +#else volatile int_t thread_id = 0; -#endif +#endif //unsigned long long t1 = _rdtsc(); double t1 = SuperLU_timer_(); dblock_gemm_scatter( lb, j, HyP->Ublock_info_Phi, 
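                        /* TopRight: look-ahead L rows paired with the post-look-ahead (Phi) U blocks */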
HyP->lookAhead_info, HyP->lookAhead_L_buff, HyP->Lnbrow, @@ -406,7 +389,7 @@ int_t dblock_gemm_scatterTopRight( int_t lb, int_t j, int_t dblock_gemm_scatterBottomLeft( int_t lb, int_t j, double* bigV, int_t knsupc, int_t klst, int_t* lsub, - int_t * usub, int_t ldt, int* indirect, int* indirect2, + int_t* usub, int_t ldt, int* indirect, int* indirect2, HyP_t* HyP, dLUstruct_t *LUstruct, gridinfo_t* grid, @@ -420,11 +403,11 @@ int_t dblock_gemm_scatterBottomLeft( int_t lb, int_t j, int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; double** Unzval_br_ptr = Llu->Unzval_br_ptr; -#ifdef _OPENMP +#ifdef _OPENMP volatile int_t thread_id = omp_get_thread_num(); -#else +#else volatile int_t thread_id = 0; -#endif +#endif //printf("Thread's ID %lld \n", thread_id); //unsigned long long t1 = _rdtsc(); double t1 = SuperLU_timer_(); @@ -445,7 +428,7 @@ int_t dblock_gemm_scatterBottomLeft( int_t lb, int_t j, int_t dblock_gemm_scatterBottomRight( int_t lb, int_t j, double* bigV, int_t knsupc, int_t klst, int_t* lsub, - int_t * usub, int_t ldt, int* indirect, int* indirect2, + int_t* usub, int_t ldt, int* indirect, int* indirect2, HyP_t* HyP, dLUstruct_t *LUstruct, gridinfo_t* grid, @@ -459,11 +442,11 @@ int_t dblock_gemm_scatterBottomRight( int_t lb, int_t j, int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; double** Unzval_br_ptr = Llu->Unzval_br_ptr; -#ifdef _OPENMP +#ifdef _OPENMP volatile int_t thread_id = omp_get_thread_num(); -#else +#else volatile int_t thread_id = 0; -#endif +#endif // printf("Thread's ID %lld \n", thread_id); //unsigned long long t1 = _rdtsc(); double t1 = SuperLU_timer_(); @@ -477,7 +460,7 @@ int_t dblock_gemm_scatterBottomRight( int_t lb, int_t j, ); //unsigned long long t2 = _rdtsc(); - double t2 = SuperLU_timer_(); + double t2 = SuperLU_timer_(); SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1); return 0; @@ -610,7 +593,6 @@ scatter_u (int_t ib, indirect[i] = lsub[lptr + i] ; } - iuip_lib += UB_DESCRIPTOR; ucol = &Unzval_br_ptr[lib][ruip_lib]; diff --git a/SRC/dtreeFactorization.c b/SRC/dtreeFactorization.c index 78cf5ec2..562b9b18 100644 --- a/SRC/dtreeFactorization.c +++ b/SRC/dtreeFactorization.c @@ -34,19 +34,19 @@ int_t dLluBufInit(dLUValSubBuf_t* LUvsb, dLUstruct_t *LUstruct) return 0; } -diagFactBufs_t** dinitDiagFactBufsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* grid) +ddiagFactBufs_t** dinitDiagFactBufsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* grid) { - diagFactBufs_t** dFBufs; + ddiagFactBufs_t** dFBufs; /* Sherry fix: * mxLeafNode can be 0 for the replicated layers of the processes ?? 
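     * (a process on an upper, replicated level of the 3D grid may own no
     * leaf subtree, so it then needs no diagonal-factorization buffers)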
*/ - if ( mxLeafNode ) dFBufs = (diagFactBufs_t** ) - SUPERLU_MALLOC(mxLeafNode * sizeof(diagFactBufs_t*)); + if ( mxLeafNode ) dFBufs = (ddiagFactBufs_t** ) + SUPERLU_MALLOC(mxLeafNode * sizeof(ddiagFactBufs_t*)); for (int i = 0; i < mxLeafNode; ++i) { /* code */ - dFBufs[i] = (diagFactBufs_t* ) SUPERLU_MALLOC(sizeof(diagFactBufs_t)); + dFBufs[i] = (ddiagFactBufs_t* ) SUPERLU_MALLOC(sizeof(ddiagFactBufs_t)); assert(dFBufs[i]); dinitDiagFactBufs(ldt, dFBufs[i]); @@ -56,7 +56,7 @@ diagFactBufs_t** dinitDiagFactBufsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* g } // sherry added -int dfreeDiagFactBufsArr(int_t mxLeafNode, diagFactBufs_t** dFBufs) +int dfreeDiagFactBufsArr(int_t mxLeafNode, ddiagFactBufs_t** dFBufs) { for (int i = 0; i < mxLeafNode; ++i) { SUPERLU_FREE(dFBufs[i]->BlockUFactor); @@ -95,11 +95,12 @@ int dLluBufFreeArr(int_t numLA, dLUValSubBuf_t **LUvsbs) SUPERLU_FREE(LUvsbs[i]); } SUPERLU_FREE(LUvsbs); + return 0; } int_t dinitScuBufs(int_t ldt, int_t num_threads, int_t nsupers, - scuBufs_t* scuBufs, + dscuBufs_t* scuBufs, dLUstruct_t* LUstruct, gridinfo_t * grid) { @@ -109,14 +110,14 @@ int_t dinitScuBufs(int_t ldt, int_t num_threads, int_t nsupers, } // sherry added -int dfreeScuBufs(scuBufs_t* scuBufs) +int dfreeScuBufs(dscuBufs_t* scuBufs) { SUPERLU_FREE(scuBufs->bigV); SUPERLU_FREE(scuBufs->bigU); return 0; } -int_t dinitDiagFactBufs(int_t ldt, diagFactBufs_t* dFBuf) +int_t dinitDiagFactBufs(int_t ldt, ddiagFactBufs_t* dFBuf) { dFBuf->BlockUFactor = doubleMalloc_dist(ldt * ldt); //DOUBLE_ALLOC( ldt * ldt); dFBuf->BlockLFactor = doubleMalloc_dist(ldt * ldt); //DOUBLE_ALLOC( ldt * ldt); @@ -127,11 +128,11 @@ int_t ddenseTreeFactor( int_t nnodes, // number of nodes in the tree int_t *perm_c_supno, // list of nodes in the order of factorization commRequests_t *comReqs, // lists of communication requests - scuBufs_t *scuBufs, // contains buffers for schur complement update + dscuBufs_t *scuBufs, // contains buffers for schur complement update packLUInfo_t*packLUInfo, msgs_t*msgs, dLUValSubBuf_t* LUvsb, - diagFactBufs_t *dFBuf, + ddiagFactBufs_t *dFBuf, factStat_t *factStat, factNodelists_t *fNlists, superlu_dist_options_t *options, @@ -247,11 +248,11 @@ int_t ddenseTreeFactor( int_t klst = FstBlockC (k + 1); int_t *lsub = lPanelInfo->lsub; int_t *usub = uPanelInfo->usub; -#ifdef _OPENMP +#ifdef _OPENMP int_t thread_id = omp_get_thread_num(); -#else +#else int_t thread_id = 0; -#endif +#endif dblock_gemm_scatter( lb, ub, Ublock_info, Remain_info, @@ -288,11 +289,11 @@ int_t ddenseTreeFactor( int_t dsparseTreeFactor_ASYNC( sForest_t* sforest, commRequests_t **comReqss, // lists of communication requests // size maxEtree level - scuBufs_t *scuBufs, // contains buffers for schur complement update + dscuBufs_t *scuBufs, // contains buffers for schur complement update packLUInfo_t*packLUInfo, msgs_t**msgss, // size=num Look ahead dLUValSubBuf_t** LUvsbs, // size=num Look ahead - diagFactBufs_t **dFBufs, // size maxEtree level + ddiagFactBufs_t **dFBufs, // size maxEtree level factStat_t *factStat, factNodelists_t *fNlists, gEtreeInfo_t* gEtreeInfo, // global etree info diff --git a/SRC/dtrfAux.c b/SRC/dtrfAux.c index ad08eef4..b4ce16f2 100644 --- a/SRC/dtrfAux.c +++ b/SRC/dtrfAux.c @@ -337,7 +337,7 @@ int_t dSchurComplementSetupGPU( int_t* myIperm, int_t* iperm_c_supno, int_t*perm_c_supno, gEtreeInfo_t* gEtreeInfo, factNodelists_t* fNlists, - scuBufs_t* scuBufs, dLUValSubBuf_t* LUvsb, + dscuBufs_t* scuBufs, dLUValSubBuf_t* LUvsb, gridinfo_t *grid, dLUstruct_t *LUstruct, HyP_t* HyP) 
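/* sets up the k-th panel and the HyP bookkeeping consumed by the Schur-complement update */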
{ @@ -554,11 +554,10 @@ trf3Dpartition_t* dinitTrf3Dpartition(int_t nsupers, CHECK_MALLOC (iam, "Enter dinitTrf3Dpartition()"); #endif int_t* perm_c_supno = getPerm_c_supno(nsupers, options, - LUstruct->etree, - LUstruct->Glu_persist, - LUstruct->Llu->Lrowind_bc_ptr, - LUstruct->Llu->Ufstnz_br_ptr, - grid); + LUstruct->etree, + LUstruct->Glu_persist, + LUstruct->Llu->Lrowind_bc_ptr, + LUstruct->Llu->Ufstnz_br_ptr, grid); int_t* iperm_c_supno = getFactIperm(perm_c_supno, nsupers); // calculating tree factorization diff --git a/SRC/lustruct_gpu.h b/SRC/lustruct_gpu.h deleted file mode 100644 index 8e3a7443..00000000 --- a/SRC/lustruct_gpu.h +++ /dev/null @@ -1,258 +0,0 @@ -// This file contains descriptions and declarations for structures used -// in GPU -/*also declaration used for GPUs*/ -#pragma once // Causes this source file to be included onle once - -#define DEBUG -// #ifdef DEBUG -// #include -// #endif -// #include -// #include "mkl.h" - -// #define USE_VENDOR_BLAS - - -#include -#include -#include "superlu_ddefs.h" -// #include "sec_structs.h" -// #include "supernodal_etree.h" - - -#define SLU_TARGET_GPU 0 - -#define MAX_BLOCK_SIZE 10000 - - -static -void check(cudaError_t result, char const *const func, const char *const file, int_t const line) -{ - if (result) - { - fprintf(stderr, "CUDA error at file %s: line %d code=(%s) \"%s\" \n", - file, line, cudaGetErrorString(result), func); - - // Make sure we call CUDA Device Reset before exiting - exit(EXIT_FAILURE); - } -} - -#define checkCudaErrors(val) check ( (val), #val, __FILE__, __LINE__ ) - -typedef struct SCUbuf_gpu_ -{ - /*Informations for various buffers*/ - double *bigV; - double *bigU; - double *bigU_host; /*pinned location*/ - int_t *indirect; /*for indirect address calculations*/ - int_t *indirect2; /*for indirect address calculations*/ - - double *Remain_L_buff; /* on GPU */ - double *Remain_L_buff_host; /* Sherry: this memory is page-locked, why need another copy on GPU ? 
*/ - - int_t *lsub; - int_t *usub; - - int_t *lsub_buf, *usub_buf; - - Ublock_info_t *Ublock_info; /* on GPU */ - Remain_info_t *Remain_info; - Ublock_info_t *Ublock_info_host; - Remain_info_t *Remain_info_host; - - int_t* usub_IndirectJ3; /* on GPU */ - int_t* usub_IndirectJ3_host; - -} SCUbuf_gpu_t; - - -#define MAX_NCUDA_STREAMS 32 - -typedef struct LUstruct_gpu_ -{ - - int_t *LrowindVec; /* A single vector */ - int_t *LrowindPtr; /* A single vector */ - - double *LnzvalVec; /* A single vector */ - int_t *LnzvalPtr; /* A single vector */ - int_t *LnzvalPtr_host; /* A single vector */ - - int_t *UrowindVec; /* A single vector */ - int_t *UrowindPtr; /* A single vector */ - int_t *UrowindPtr_host; /* A single vector */ - int_t *UnzvalPtr_host; - - double *UnzvalVec; /* A single vector */ - int_t *UnzvalPtr; /* A single vector */ - /*gpu pointers for easy block accesses */ - local_l_blk_info_t *local_l_blk_infoVec; - int_t *local_l_blk_infoPtr; - int_t *jib_lookupVec; - int_t *jib_lookupPtr; - local_u_blk_info_t *local_u_blk_infoVec; - - int_t *local_u_blk_infoPtr; - int_t *ijb_lookupVec; - int_t *ijb_lookupPtr; - - // GPU buffers for performing Schur Complement Update on GPU - SCUbuf_gpu_t scubufs[MAX_NCUDA_STREAMS]; - double *acc_L_buff, *acc_U_buff; - - /*Informations for various buffers*/ - int_t buffer_size; /**/ - int_t nsupers; /*should have number of supernodes*/ - int_t *xsup; - gridinfo_t *grid; - - - double ScatterMOPCounter; - double ScatterMOPTimer; - double GemmFLOPCounter; - double GemmFLOPTimer; - - double cPCIeH2D; - double cPCIeD2H; - double tHost_PCIeH2D; - double tHost_PCIeD2H; - - - /*cuda events to measure DGEMM and SCATTER timing */ - int_t *isOffloaded; /*stores if any iteration is offloaded or not*/ - cudaEvent_t *GemmStart, *GemmEnd, *ScatterEnd; /*cuda events to store gemm and scatter's begin and end*/ - cudaEvent_t *ePCIeH2D; - cudaEvent_t *ePCIeD2H_Start; - cudaEvent_t *ePCIeD2H_End; - - int_t *xsup_host; - int_t* perm_c_supno; - int_t first_l_block_gpu, first_u_block_gpu; -} LUstruct_gpu; - - -typedef struct sluGPU_t_ -{ - - int_t gpuId; // if there are multiple GPUs - LUstruct_gpu *A_gpu, *dA_gpu; - cudaStream_t funCallStreams[MAX_NCUDA_STREAMS], CopyStream; - cublasHandle_t cublasHandles[MAX_NCUDA_STREAMS]; - int_t lastOffloadStream[MAX_NCUDA_STREAMS]; - int_t nCudaStreams; - int_t* isNodeInMyGrid; - double acc_async_cost; - -} sluGPU_t; - - -#ifdef __cplusplus -extern "C" { -#endif - - - -int_t initD2Hreduce( - int_t next_k, - d2Hreduce_t* d2Hred, - int_t last_flag, - // int_t *perm_c_supno, - HyP_t* HyP, - sluGPU_t *sluGPU, - gridinfo_t *grid, - LUstruct_t *LUstruct - ,SCT_t* SCT -); - -int_t reduceGPUlu( - - int_t last_flag, - d2Hreduce_t* d2Hred, - sluGPU_t *sluGPU, - SCT_t *SCT, - gridinfo_t *grid, - LUstruct_t *LUstruct -); - -int_t waitGPUscu(int_t streamId, sluGPU_t *sluGPU, SCT_t *SCT); -int_t sendLUpanelGPU2HOST( int_t k0, d2Hreduce_t* d2Hred, sluGPU_t *sluGPU); -int_t sendSCUdataHost2GPU( - int_t streamId, - int_t* lsub, - int_t* usub, - double* bigU, - int_t bigu_send_size, - int_t Remain_lbuf_send_size, - sluGPU_t *sluGPU, - HyP_t* HyP -); - -int_t initSluGPU3D_t( - - sluGPU_t *sluGPU, - LUstruct_t *LUstruct, - gridinfo3d_t * grid3d, - int_t* perm_c_supno, - int_t n, - int_t buffer_size, - int_t bigu_size, - int_t ldt -); -int_t SchurCompUpdate_GPU( - int_t streamId, - int_t jj_cpu, int_t nub, int_t klst, int_t knsupc, - int_t Rnbrow, int_t RemainBlk, - int_t Remain_lbuf_send_size, - int_t bigu_send_size, int_t ldu, - int_t mcb, - int_t 
buffer_size, int_t lsub_len, int_t usub_len, - int_t ldt, int_t k0, - sluGPU_t *sluGPU, gridinfo_t *grid -); - - - -void CopyLUToGPU3D ( - int_t* isNodeInMyGrid, - LocalLU_t *A_host, - sluGPU_t *sluGPU, - Glu_persist_t *Glu_persist, int_t n, - gridinfo3d_t *grid3d, - int_t buffer_size, - int_t bigu_size, - int_t ldt -); - -int_t reduceAllAncestors3d_GPU( - int_t ilvl, int_t* myNodeCount, - int_t** treePerm, - dLUValSubBuf_t*LUvsb, - LUstruct_t* LUstruct, - gridinfo3d_t* grid3d, - sluGPU_t *sluGPU, - d2Hreduce_t* d2Hred, - factStat_t *factStat, - HyP_t* HyP, - SCT_t* SCT ); - - -void syncAllfunCallStreams(sluGPU_t* sluGPU, SCT_t* SCT); -int_t free_LUstruct_gpu (LUstruct_gpu *A_gpu); - -int_t freeSluGPU(sluGPU_t *sluGPU); - -cublasStatus_t checkCublas(cublasStatus_t result); -// cudaError_t checkCuda(cudaError_t result); - -void dPrint_matrix( char *desc, int_t m, int_t n, double *dA, int_t lda ); - -/*to print out various statistics*/ -void printGPUStats(LUstruct_gpu *A_gpu); - -#ifdef __cplusplus -} -#endif - -#undef DEBUG diff --git a/SRC/memory.c b/SRC/memory.c index 4695ccb6..4159206a 100644 --- a/SRC/memory.c +++ b/SRC/memory.c @@ -121,8 +121,8 @@ void superlu_free_dist(void *addr) #else /* The production mode. */ -//#if 0 -#if (__STDC_VERSION__ >= 201112L) +#if 0 +//#if (__STDC_VERSION__ >= 201112L) // cannot compile on Summit void * superlu_malloc_dist(size_t size) {void* ptr;int alignment=1<<12;if(size>1<<19){alignment=1<<21;}posix_memalign( (void**)&(ptr), alignment, size );return(ptr);} void superlu_free_dist(void * ptr) {free(ptr);} diff --git a/SRC/pd3dcomm.c b/SRC/pd3dcomm.c index 96dc1815..ee57c7e7 100644 --- a/SRC/pd3dcomm.c +++ b/SRC/pd3dcomm.c @@ -9,6 +9,7 @@ The source code is distributed under BSD license, see the file License.txt at the top-level directory. */ + /*! @file * \brief Communication routines for the 3D algorithm. * @@ -18,7 +19,7 @@ at the top-level directory. * May 10, 2019 */ #include "superlu_ddefs.h" -// #include "cblas.h" +//#include "cblas.h" #if 0 #include "p3dcomm.h" #include "sec_structs.h" @@ -32,838 +33,844 @@ at the top-level directory. 
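The hunk below is mostly an indentation reflow of SRC/pd3dcomm.c; the
MPI_INT_ALLOC and MPI_DATATYPE_ALLOC macros it keeps are thin wrappers
over MPI_Alloc_mem, which may hand back memory pre-registered with the
interconnect. A minimal sketch of the intended allocate/free pairing
(the helper names are illustrative, not SuperLU_DIST API):

#include <mpi.h>
#include <stddef.h>

/* Illustrative helper: allocate n doubles with MPI_Alloc_mem. */
static double *alloc_mpi_dbl(size_t n)
{
    double *buf = NULL;
    MPI_Alloc_mem((MPI_Aint)(n * sizeof(double)), MPI_INFO_NULL, &buf);
    return buf; /* a real caller should check for NULL */
}

/* Memory from MPI_Alloc_mem must be released with MPI_Free_mem,
   never with free() or SUPERLU_FREE. */
static void free_mpi_dbl(double *buf)
{
    MPI_Free_mem(buf);
}

Registered buffers can make the repeated z-axis sends and reductions in
this file cheaper on RDMA networks, which is presumably why the
MPI_MALLOC path exists behind its #ifdef.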
#endif // #define MPI_MALLOC -#define MPI_INT_ALLOC(a, b) (MPI_Alloc_mem((b) * sizeof(int_t), MPI_INFO_NULL, &(a))) -#define MPI_DATATYPE_ALLOC(a, b) (MPI_Alloc_mem((b) * sizeof(double), MPI_INFO_NULL, &(a))) +#define MPI_INT_ALLOC(a, b) (MPI_Alloc_mem( (b)*sizeof(int_t), MPI_INFO_NULL, &(a) )) +#define MPI_DATATYPE_ALLOC(a, b) (MPI_Alloc_mem((b)*sizeof(double), MPI_INFO_NULL, &(a))) -int_t dAllocLlu(int_t nsupers, dLUstruct_t *LUstruct, gridinfo3d_t *grid3d) +int_t dAllocLlu(int_t nsupers, dLUstruct_t * LUstruct, gridinfo3d_t* grid3d) { - int i; - int_t Pc = grid3d->npcol; - int_t Pr = grid3d->nprow; - - int_t nbc = CEILING(nsupers, Pc); - int_t nbr = CEILING(nsupers, Pr); - - dLocalLU_t *Llu = LUstruct->Llu; - int_t **Lrowind_bc_ptr = - (int_t **)SUPERLU_MALLOC(sizeof(int_t *) * nbc); /* size ceil(NSUPERS/Pc) */ - double **Lnzval_bc_ptr = - (double **)SUPERLU_MALLOC(sizeof(double *) * nbc); /* size ceil(NSUPERS/Pc) */ - - for (i = 0; i < nbc; ++i) + int i; + int_t Pc = grid3d->npcol; + int_t Pr = grid3d->nprow; + + int_t nbc = CEILING(nsupers, Pc); + int_t nbr = CEILING(nsupers, Pr); + + dLocalLU_t *Llu = LUstruct->Llu; + int_t **Lrowind_bc_ptr = + (int_t**) SUPERLU_MALLOC(sizeof(int_t*)*nbc); /* size ceil(NSUPERS/Pc) */ + double **Lnzval_bc_ptr = + (double **) SUPERLU_MALLOC(sizeof(double*)*nbc); /* size ceil(NSUPERS/Pc) */ + + for (i = 0; i < nbc ; ++i) { - /* code */ - Lrowind_bc_ptr[i] = NULL; - Lnzval_bc_ptr[i] = NULL; + /* code */ + Lrowind_bc_ptr[i] = NULL; + Lnzval_bc_ptr[i] = NULL; } - - int_t **Ufstnz_br_ptr = - (int_t **)SUPERLU_MALLOC(sizeof(int_t *) * nbr); /* size ceil(NSUPERS/Pr) */ - double **Unzval_br_ptr = - (double **)SUPERLU_MALLOC(sizeof(double *) * nbr); /* size ceil(NSUPERS/Pr) */ - - for (i = 0; i < nbr; ++i) + + int_t **Ufstnz_br_ptr = + (int_t**) SUPERLU_MALLOC(sizeof(int_t*)*nbr); /* size ceil(NSUPERS/Pr) */ + double **Unzval_br_ptr = + (double **) SUPERLU_MALLOC(sizeof(double*)*nbr); /* size ceil(NSUPERS/Pr) */ + + for (i = 0; i < nbr ; ++i) { - /* code */ - Ufstnz_br_ptr[i] = NULL; - Unzval_br_ptr[i] = NULL; + /* code */ + Ufstnz_br_ptr[i] = NULL; + Unzval_br_ptr[i] = NULL; } -#if 0 // Sherry: change to int type - int_t *ToRecv = intCalloc_dist(nsupers); /* Recv from no one (0), left (1), and up (2).*/ - int_t *ToSendD = intCalloc_dist(nbr); /* Whether need to send down block row. */ - int_t **ToSendR = (int_t **) SUPERLU_MALLOC(nbc * sizeof(int_t*)); /* List of processes to send right block col. */ -#else - /* Recv from no one (0), left (1), and up (2).*/ - int *ToRecv = SUPERLU_MALLOC(nsupers * sizeof(int)); - for (i = 0; i < nsupers; ++i) - ToRecv[i] = 0; - /* Whether need to send down block row. */ - int *ToSendD = SUPERLU_MALLOC(nbr * sizeof(int)); - for (i = 0; i < nbr; ++i) - ToSendD[i] = 0; - /* List of processes to send right block col. */ - int **ToSendR = (int **)SUPERLU_MALLOC(nbc * sizeof(int *)); -#endif - - for (i = 0; i < nbc; ++i) + // Sherry: use int type + /* Recv from no one (0), left (1), and up (2).*/ + int *ToRecv = SUPERLU_MALLOC(nsupers * sizeof(int)); + for (i = 0; i < nsupers; ++i) ToRecv[i] = 0; + /* Whether need to send down block row. */ + int *ToSendD = SUPERLU_MALLOC(nbr * sizeof(int)); + for (i = 0; i < nbr; ++i) ToSendD[i] = 0; + /* List of processes to send right block col. 
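      (one int flag per process column, kept for every local block column)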
*/ + int **ToSendR = (int **) SUPERLU_MALLOC(nbc * sizeof(int*)); + + for (int_t i = 0; i < nbc; ++i) { - /* code */ - //ToSendR[i] = INT_T_ALLOC(Pc); - ToSendR[i] = SUPERLU_MALLOC(Pc * sizeof(int)); + /* code */ + //ToSendR[i] = INT_T_ALLOC(Pc); + ToSendR[i] = SUPERLU_MALLOC(Pc * sizeof(int)); } - - /*now setup the pointers*/ - Llu->Lrowind_bc_ptr = Lrowind_bc_ptr; - Llu->Lnzval_bc_ptr = Lnzval_bc_ptr; - Llu->Ufstnz_br_ptr = Ufstnz_br_ptr; - Llu->Unzval_br_ptr = Unzval_br_ptr; - Llu->ToRecv = ToRecv; - Llu->ToSendD = ToSendD; - Llu->ToSendR = ToSendR; - - return 0; + + /*now setup the pointers*/ + Llu->Lrowind_bc_ptr = Lrowind_bc_ptr ; + Llu->Lnzval_bc_ptr = Lnzval_bc_ptr ; + Llu->Ufstnz_br_ptr = Ufstnz_br_ptr ; + Llu->Unzval_br_ptr = Unzval_br_ptr ; + Llu->ToRecv = ToRecv ; + Llu->ToSendD = ToSendD ; + Llu->ToSendR = ToSendR ; + + return 0; } /* dAllocLlu */ -int_t dmpiMallocLUStruct(int_t nsupers, dLUstruct_t *LUstruct, gridinfo3d_t *grid3d) +int_t dmpiMallocLUStruct(int_t nsupers, dLUstruct_t * LUstruct, gridinfo3d_t* grid3d) { - dLocalLU_t *Llu = LUstruct->Llu; - int_t *xsup = LUstruct->Glu_persist->xsup; - int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; - double **Unzval_br_ptr = Llu->Unzval_br_ptr; - int_t **Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; - double **Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; - gridinfo_t *grid = &(grid3d->grid2d); - - int_t k = CEILING(nsupers, grid->nprow); /* Number of local block rows */ - for (int_t lb = 0; lb < k; ++lb) + dLocalLU_t *Llu = LUstruct->Llu; + int_t* xsup = LUstruct->Glu_persist->xsup; + int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + double** Unzval_br_ptr = Llu->Unzval_br_ptr; + int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + gridinfo_t* grid = &(grid3d->grid2d); + + int_t k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ + for ( int_t lb = 0; lb < k; ++lb) { - int_t *usub, *usub_new; - usub = Ufstnz_br_ptr[lb]; - - double *uval = Unzval_br_ptr[lb]; - double *uval_new; - - /*if non empty set the flag*/ - if (usub != NULL) + int_t *usub, *usub_new; + usub = Ufstnz_br_ptr[lb]; + + double * uval = Unzval_br_ptr[lb]; + double * uval_new; + + /*if non empty set the flag*/ + if (usub != NULL) { - int_t lenv, lens; - lenv = usub[1]; - lens = usub[2]; - - MPI_INT_ALLOC(usub_new, lens); - memcpy(usub_new, usub, lens * sizeof(int_t)); - MPI_DATATYPE_ALLOC(uval_new, lenv); - memcpy(uval_new, uval, lenv * sizeof(double)); - Ufstnz_br_ptr[lb] = usub_new; - Unzval_br_ptr[lb] = uval_new; - SUPERLU_FREE(usub); - SUPERLU_FREE(uval); + int_t lenv, lens; + lenv = usub[1]; + lens = usub[2]; + + MPI_INT_ALLOC(usub_new, lens); + memcpy( usub_new, usub, lens * sizeof(int_t)); + MPI_DATATYPE_ALLOC(uval_new, lenv); + memcpy( uval_new, uval, lenv * sizeof(double)); + Ufstnz_br_ptr[lb] = usub_new; + Unzval_br_ptr[lb] = uval_new; + SUPERLU_FREE(usub); + SUPERLU_FREE(uval); } } /*for ( int_t lb = 0; lb < k; ++lb)*/ - - int_t iam = grid->iam; - int_t mycol = MYCOL(iam, grid); - - /*start broadcasting blocks*/ - for (int_t jb = 0; jb < nsupers; ++jb) /* for each block column ... */ + + int_t iam = grid->iam; + int_t mycol = MYCOL (iam, grid); + + /*start broadcasting blocks*/ + for (int_t jb = 0; jb < nsupers; ++jb) /* for each block column ... 
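       owned by this process column: re-allocate its index and value arrays
       with MPI_Alloc_mem and copy the existing data over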
*/ { - int_t pc = PCOL(jb, grid); - if (mycol == pc) + int_t pc = PCOL( jb, grid ); + if (mycol == pc) { - int_t ljb = LBj(jb, grid); /* Local block number */ - int_t *lsub, *lsub_new; - double *lnzval, *lnzval_new; - lsub = Lrowind_bc_ptr[ljb]; - lnzval = Lnzval_bc_ptr[ljb]; - - if (lsub) + int_t ljb = LBj( jb, grid ); /* Local block number */ + int_t *lsub , *lsub_new; + double *lnzval, *lnzval_new; + lsub = Lrowind_bc_ptr[ljb]; + lnzval = Lnzval_bc_ptr[ljb]; + + if (lsub) { - int_t nrbl, len, len1, len2; - - nrbl = lsub[0]; /*number of L blocks */ - len = lsub[1]; /* LDA of the nzval[] */ - len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; - len2 = SuperSize(jb) * len; - - MPI_INT_ALLOC(lsub_new, len1); - memcpy(lsub_new, lsub, len1 * sizeof(int_t)); - MPI_DATATYPE_ALLOC(lnzval_new, len2); - memcpy(lnzval_new, lnzval, len2 * sizeof(double)); - Lrowind_bc_ptr[ljb] = lsub_new; - SUPERLU_FREE(lsub); - Lnzval_bc_ptr[ljb] = lnzval_new; - SUPERLU_FREE(lnzval); + int_t nrbl, len, len1, len2; + + nrbl = lsub[0]; /*number of L blocks */ + len = lsub[1]; /* LDA of the nzval[] */ + len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; + len2 = SuperSize(jb) * len; + + MPI_INT_ALLOC(lsub_new, len1); + memcpy( lsub_new, lsub, len1 * sizeof(int_t)); + MPI_DATATYPE_ALLOC(lnzval_new, len2); + memcpy( lnzval_new, lnzval, len2 * sizeof(double)); + Lrowind_bc_ptr[ljb] = lsub_new; + SUPERLU_FREE(lsub ); + Lnzval_bc_ptr[ljb] = lnzval_new; + SUPERLU_FREE(lnzval ); } } /* if mycol == pc ... */ - } /* for jb ... */ - - return 0; + } /* for jb ... */ + + return 0; } + int_t dzSendLPanel(int_t k, int_t receiver, - dLUstruct_t *LUstruct, gridinfo3d_t *grid3d, SCT_t *SCT) + dLUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT) { - dLocalLU_t *Llu = LUstruct->Llu; - int_t *xsup = LUstruct->Glu_persist->xsup; - int_t **Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; - double **Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; - gridinfo_t *grid = &(grid3d->grid2d); - int_t iam = grid->iam; - int_t mycol = MYCOL(iam, grid); - - int_t pc = PCOL(k, grid); - if (mycol == pc) + dLocalLU_t *Llu = LUstruct->Llu; + int_t* xsup = LUstruct->Glu_persist->xsup; + int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + gridinfo_t* grid = &(grid3d->grid2d); + int_t iam = grid->iam; + int_t mycol = MYCOL (iam, grid); + + int_t pc = PCOL( k, grid ); + if (mycol == pc) { - int_t lk = LBj(k, grid); /* Local block number */ - int_t *lsub; - double *lnzval; - lsub = Lrowind_bc_ptr[lk]; - lnzval = Lnzval_bc_ptr[lk]; - - if (lsub != NULL) + int_t lk = LBj( k, grid ); /* Local block number */ + int_t *lsub; + double* lnzval; + lsub = Lrowind_bc_ptr[lk]; + lnzval = Lnzval_bc_ptr[lk]; + + if (lsub != NULL) { - int_t len = lsub[1]; /* LDA of the nzval[] */ - int_t len2 = SuperSize(k) * len; /* size of nzval of L panel */ - - MPI_Send(lnzval, len2, MPI_DOUBLE, receiver, k, grid3d->zscp.comm); - SCT->commVolRed += len2 * sizeof(double); + int_t len = lsub[1]; /* LDA of the nzval[] */ + int_t len2 = SuperSize(k) * len; /* size of nzval of L panel */ + + MPI_Send(lnzval, len2, MPI_DOUBLE, receiver, k, grid3d->zscp.comm); + SCT->commVolRed += len2 * sizeof(double); } } - return 0; + return 0; } + int_t dzRecvLPanel(int_t k, int_t sender, double alpha, double beta, - double *Lval_buf, - dLUstruct_t *LUstruct, gridinfo3d_t *grid3d, SCT_t *SCT) + double* Lval_buf, + dLUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT) { - - // A(k) = alpha*A(k) + beta* A^{sender}(k) - dLocalLU_t *Llu = LUstruct->Llu; - int_t *xsup = 
LUstruct->Glu_persist->xsup; - int_t **Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; - double **Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; - gridinfo_t *grid = &(grid3d->grid2d); - int inc = 1; - int_t iam = grid->iam; - int_t mycol = MYCOL(iam, grid); - - int_t pc = PCOL(k, grid); - if (mycol == pc) + + // A(k) = alpha*A(k) + beta* A^{sender}(k) + dLocalLU_t *Llu = LUstruct->Llu; + int_t* xsup = LUstruct->Glu_persist->xsup; + int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + gridinfo_t* grid = &(grid3d->grid2d); + int inc = 1; + int_t iam = grid->iam; + int_t mycol = MYCOL (iam, grid); + + int_t pc = PCOL( k, grid ); + if (mycol == pc) { - int_t lk = LBj(k, grid); /* Local block number */ - int_t *lsub; - double *lnzval; - lsub = Lrowind_bc_ptr[lk]; - lnzval = Lnzval_bc_ptr[lk]; - - if (lsub != NULL) + int_t lk = LBj( k, grid ); /* Local block number */ + int_t *lsub; + double* lnzval; + lsub = Lrowind_bc_ptr[lk]; + lnzval = Lnzval_bc_ptr[lk]; + + if (lsub != NULL) { - - int len = lsub[1]; /* LDA of the nzval[] */ - int len2 = SuperSize(k) * len; /*size of nzval of L panels*/ - - MPI_Status status; - MPI_Recv(Lval_buf, len2, MPI_DOUBLE, sender, k, - grid3d->zscp.comm, &status); - - /*reduce the updates*/ - superlu_dscal(len2, alpha, lnzval, 1); - superlu_daxpy(len2, beta, Lval_buf, 1, lnzval, 1); + int len = lsub[1]; /* LDA of the nzval[] */ + int len2 = SuperSize(k) * len; /* size of nzval of L panels */ + + MPI_Status status; + MPI_Recv(Lval_buf , len2, MPI_DOUBLE, sender, k, + grid3d->zscp.comm, &status); + + /*reduce the updates*/ + superlu_dscal(len2, alpha, lnzval, 1); + superlu_daxpy(len2, beta, Lval_buf, 1, lnzval, 1); } } - return 0; + return 0; } int_t dzSendUPanel(int_t k, int_t receiver, - dLUstruct_t *LUstruct, gridinfo3d_t *grid3d, SCT_t *SCT) + dLUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT) { - dLocalLU_t *Llu = LUstruct->Llu; - int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; - double **Unzval_br_ptr = Llu->Unzval_br_ptr; - gridinfo_t *grid = &(grid3d->grid2d); - int_t iam = grid->iam; - - int_t myrow = MYROW(iam, grid); - int_t pr = PROW(k, grid); - if (myrow == pr) + dLocalLU_t *Llu = LUstruct->Llu; + int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + double** Unzval_br_ptr = Llu->Unzval_br_ptr; + gridinfo_t* grid = &(grid3d->grid2d); + int_t iam = grid->iam; + + int_t myrow = MYROW (iam, grid); + int_t pr = PROW( k, grid ); + if (myrow == pr) { - int_t lk = LBi(k, grid); /* Local block number */ - int_t *usub; - double *unzval; - usub = Ufstnz_br_ptr[lk]; - unzval = Unzval_br_ptr[lk]; - - if (usub != NULL) + int_t lk = LBi( k, grid ); /* Local block number */ + int_t *usub; + double* unzval; + usub = Ufstnz_br_ptr[lk]; + unzval = Unzval_br_ptr[lk]; + + if (usub != NULL) { - int lenv = usub[1]; - - /* code */ - MPI_Send(unzval, lenv, MPI_DOUBLE, receiver, k, grid3d->zscp.comm); - SCT->commVolRed += lenv * sizeof(double); + int lenv = usub[1]; + + /* code */ + MPI_Send(unzval, lenv, MPI_DOUBLE, receiver, k, grid3d->zscp.comm); + SCT->commVolRed += lenv * sizeof(double); } } - - return 0; + + return 0; } + int_t dzRecvUPanel(int_t k, int_t sender, double alpha, double beta, - double *Uval_buf, dLUstruct_t *LUstruct, - gridinfo3d_t *grid3d, SCT_t *SCT) + double* Uval_buf, dLUstruct_t* LUstruct, + gridinfo3d_t* grid3d, SCT_t* SCT) { - dLocalLU_t *Llu = LUstruct->Llu; - int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; - double **Unzval_br_ptr = Llu->Unzval_br_ptr; - gridinfo_t *grid = &(grid3d->grid2d); - int inc = 1; - int_t iam = grid->iam; - int_t 
myrow = MYROW(iam, grid); - int_t pr = PROW(k, grid); - - if (myrow == pr) + dLocalLU_t *Llu = LUstruct->Llu; + int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + double** Unzval_br_ptr = Llu->Unzval_br_ptr; + gridinfo_t* grid = &(grid3d->grid2d); + int inc = 1; + int_t iam = grid->iam; + int_t myrow = MYROW (iam, grid); + int_t pr = PROW( k, grid ); + + if (myrow == pr) { - int_t lk = LBi(k, grid); /* Local block number */ - int_t *usub; - double *unzval; - usub = Ufstnz_br_ptr[lk]; - unzval = Unzval_br_ptr[lk]; - - if (usub != NULL) + int_t lk = LBi( k, grid ); /* Local block number */ + int_t *usub; + double* unzval; + usub = Ufstnz_br_ptr[lk]; + unzval = Unzval_br_ptr[lk]; + + if (usub != NULL) { - int lenv = usub[1]; - MPI_Status status; - MPI_Recv(Uval_buf, lenv, MPI_DOUBLE, sender, k, - grid3d->zscp.comm, &status); - - /*reduce the updates*/ - superlu_dscal(lenv, alpha, unzval, 1); - superlu_daxpy(lenv, beta, Uval_buf, 1, unzval, 1); + int lenv = usub[1]; + MPI_Status status; + MPI_Recv(Uval_buf , lenv, MPI_DOUBLE, sender, k, + grid3d->zscp.comm, &status); + + /*reduce the updates*/ + superlu_dscal(lenv, alpha, unzval, 1); + superlu_daxpy(lenv, beta, Uval_buf, 1, unzval, 1); } } - return 0; + return 0; } -int_t dp3dScatter(int_t n, dLUstruct_t *LUstruct, gridinfo3d_t *grid3d) + +int_t dp3dScatter(int_t n, dLUstruct_t * LUstruct, gridinfo3d_t* grid3d) /* Copies LU structure from layer 0 to all the layers */ { - gridinfo_t *grid = &(grid3d->grid2d); - int_t Pc = grid->npcol; - int_t Pr = grid->nprow; - - /* broadcast etree */ - int_t *etree = LUstruct->etree; - MPI_Bcast(etree, n, mpi_int_t, 0, grid3d->zscp.comm); - - int_t nsupers; - - if (!grid3d->zscp.Iam) - nsupers = getNsupers(n, LUstruct->Glu_persist); - - /* broadcast nsupers */ - MPI_Bcast(&nsupers, 1, mpi_int_t, 0, grid3d->zscp.comm); - - /* Scatter and alloc Glu_persist */ - if (grid3d->zscp.Iam) // all other process layers not equal 0 - dAllocGlu_3d(n, nsupers, LUstruct); - - /* broadcast Glu_persist */ - int_t *xsup = LUstruct->Glu_persist->xsup; - MPI_Bcast(xsup, nsupers + 1, mpi_int_t, 0, grid3d->zscp.comm); - - int_t *supno = LUstruct->Glu_persist->supno; - MPI_Bcast(supno, n, mpi_int_t, 0, grid3d->zscp.comm); - - /* now broadcast localLu */ - /* first allocating space for it */ - if (grid3d->zscp.Iam) // all other process layers not equal 0 - dAllocLlu(nsupers, LUstruct, grid3d); - - dLocalLU_t *Llu = LUstruct->Llu; - - /*scatter all the L blocks and indexes*/ - dscatter3dLPanels(nsupers, LUstruct, grid3d); - - /*scatter all the U blocks and indexes*/ - dscatter3dUPanels(nsupers, LUstruct, grid3d); - - int_t *bufmax = Llu->bufmax; - MPI_Bcast(bufmax, NBUFFERS, mpi_int_t, 0, grid3d->zscp.comm); - - /* now sending tosendR etc */ - int **ToSendR = Llu->ToSendR; - int *ToRecv = Llu->ToRecv; - int *ToSendD = Llu->ToSendD; - - int_t nbr = CEILING(nsupers, Pr); - int_t nbc = CEILING(nsupers, Pc); - //Sherry MPI_Bcast( ToRecv, nsupers, mpi_int_t, 0, grid3d->zscp.comm); - MPI_Bcast(ToRecv, nsupers, MPI_INT, 0, grid3d->zscp.comm); - - MPI_Bcast(ToSendD, nbr, MPI_INT, 0, grid3d->zscp.comm); - for (int i = 0; i < nbc; ++i) + gridinfo_t* grid = &(grid3d->grid2d); + int_t Pc = grid->npcol; + int_t Pr = grid->nprow; + + /* broadcast etree */ + int_t *etree = LUstruct->etree; + MPI_Bcast( etree, n, mpi_int_t, 0, grid3d->zscp.comm); + + int_t nsupers; + + if (!grid3d->zscp.Iam) + nsupers = getNsupers(n, LUstruct->Glu_persist); + + /* broadcast nsupers */ + MPI_Bcast( &nsupers, 1, mpi_int_t, 0, grid3d->zscp.comm); + + /* Scatter and alloc 
Glu_persist */ + if ( grid3d->zscp.Iam ) // all other process layers not equal 0 + dAllocGlu_3d(n, nsupers, LUstruct); + + /* broadcast Glu_persist */ + int_t *xsup = LUstruct->Glu_persist->xsup; + MPI_Bcast( xsup, nsupers + 1, mpi_int_t, 0, grid3d->zscp.comm); + + int_t *supno = LUstruct->Glu_persist->supno; + MPI_Bcast( supno, n, mpi_int_t, 0, grid3d->zscp.comm); + + /* now broadcast local LU structure */ + /* first allocating space for it */ + if ( grid3d->zscp.Iam ) // all other process layers not equal 0 + dAllocLlu(nsupers, LUstruct, grid3d); + + dLocalLU_t *Llu = LUstruct->Llu; + + /*scatter all the L blocks and indexes*/ + dscatter3dLPanels( nsupers, LUstruct, grid3d); + + /*scatter all the U blocks and indexes*/ + dscatter3dUPanels( nsupers, LUstruct, grid3d); + + int_t* bufmax = Llu->bufmax; + MPI_Bcast( bufmax, NBUFFERS, mpi_int_t, 0, grid3d->zscp.comm); + + /* now sending tosendR etc */ + int** ToSendR = Llu->ToSendR; + int* ToRecv = Llu->ToRecv; + int* ToSendD = Llu->ToSendD; + + int_t nbr = CEILING(nsupers, Pr); + int_t nbc = CEILING(nsupers, Pc); + // MPI_Bcast( ToRecv, nsupers, mpi_int_t, 0, grid3d->zscp.comm); + MPI_Bcast( ToRecv, nsupers, MPI_INT, 0, grid3d->zscp.comm); + + MPI_Bcast( ToSendD, nbr, MPI_INT, 0, grid3d->zscp.comm); + for (int_t i = 0; i < nbc; ++i) { - /* code */ - MPI_Bcast(ToSendR[i], Pc, MPI_INT, 0, grid3d->zscp.comm); + /* code */ + MPI_Bcast( ToSendR[i], Pc, MPI_INT, 0, grid3d->zscp.comm); } - - // + + // #ifdef MPI_MALLOC - // change MY LU struct into MPI malloc based - if (!grid3d->zscp.Iam) - mpiMallocLUStruct(nsupers, LUstruct, grid3d); + // change MY LU struct into MPI malloc based + if (!grid3d->zscp.Iam) + mpiMallocLUStruct(nsupers, LUstruct, grid3d); #endif - return 0; + return 0; } /* dp3dScatter */ + int_t dscatter3dUPanels(int_t nsupers, - dLUstruct_t *LUstruct, gridinfo3d_t *grid3d) + dLUstruct_t * LUstruct, gridinfo3d_t* grid3d) { - dLocalLU_t *Llu = LUstruct->Llu; - int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; - double **Unzval_br_ptr = Llu->Unzval_br_ptr; - gridinfo_t *grid = &(grid3d->grid2d); - - int_t k = CEILING(nsupers, grid->nprow); /* Number of local block rows */ - for (int_t lb = 0; lb < k; ++lb) - { - int_t *usub; - usub = Ufstnz_br_ptr[lb]; - - double *uval = Unzval_br_ptr[lb]; - - int_t flag = 0; - /*if non empty set the flag*/ - if (!grid3d->zscp.Iam && usub != NULL) - flag = 1; - /*bcast the flag*/ - MPI_Bcast(&flag, 1, mpi_int_t, 0, grid3d->zscp.comm); - - if (flag) + dLocalLU_t *Llu = LUstruct->Llu; + int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + double** Unzval_br_ptr = Llu->Unzval_br_ptr; + gridinfo_t* grid = &(grid3d->grid2d); + + int_t k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ + for ( int_t lb = 0; lb < k; ++lb) { + int_t *usub; + usub = Ufstnz_br_ptr[lb]; + + double * uval = Unzval_br_ptr[lb]; + + int_t flag = 0; + /*if non empty set the flag*/ + if (!grid3d->zscp.Iam && usub != NULL) + flag = 1; + /*bcast the flag*/ + MPI_Bcast( &flag, 1, mpi_int_t, 0, grid3d->zscp.comm); + + if (flag) { + int_t lenv, lens; + lenv = 0; + lens = 0; + + if (!grid3d->zscp.Iam) { - int_t lenv, lens; - lenv = 0; - lens = 0; - - if (!grid3d->zscp.Iam) - { - lenv = usub[1]; - lens = usub[2]; - } - - /*broadcast the size of sub array*/ - MPI_Bcast(&lens, 1, mpi_int_t, 0, grid3d->zscp.comm); - MPI_Bcast(&lenv, 1, mpi_int_t, 0, grid3d->zscp.comm); - - /*allocate lsub*/ - if (grid3d->zscp.Iam) + lenv = usub[1]; + lens = usub[2]; + } + + /*broadcast the size of sub array*/ + MPI_Bcast( &lens, 1, mpi_int_t, 0, 
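                /* broadcast along the z (replication) dimension */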
grid3d->zscp.comm); + MPI_Bcast( &lenv, 1, mpi_int_t, 0, grid3d->zscp.comm); + + /*allocate lsub*/ + if (grid3d->zscp.Iam) #ifdef MPI_MALLOC - MPI_INT_ALLOC(usub, lens); + MPI_INT_ALLOC(usub, lens); #else - usub = INT_T_ALLOC(lens); + usub = INT_T_ALLOC(lens); #endif - /*bcast usub*/ - MPI_Bcast(usub, lens, mpi_int_t, 0, grid3d->zscp.comm); + /*bcast usub*/ + MPI_Bcast( usub, lens, mpi_int_t, 0, grid3d->zscp.comm); - /*allocate uval*/ - if (grid3d->zscp.Iam) + /*allocate uval*/ + if (grid3d->zscp.Iam) #ifdef MPI_MALLOC - MPI_DATATYPE_ALLOC(uval, lenv); + MPI_DATATYPE_ALLOC(uval, lenv); #else - uval = doubleMalloc_dist(lenv); //DOUBLE_ALLOC(lenv); + uval = doubleMalloc_dist(lenv); //DOUBLE_ALLOC(lenv); #endif - /*broadcast uval*/ - MPI_Bcast(uval, lenv, MPI_DOUBLE, 0, grid3d->zscp.comm); - - /*setup the pointer*/ - Unzval_br_ptr[lb] = uval; - Ufstnz_br_ptr[lb] = usub; - } /* end if flag */ - - } /* end for lb ... */ - return 0; + /*broadcast uval*/ + MPI_Bcast( uval, lenv, MPI_DOUBLE, 0, grid3d->zscp.comm); + + /*setup the pointer*/ + Unzval_br_ptr[lb] = uval; + Ufstnz_br_ptr[lb] = usub; + } /* end if flag */ + + } /* end for lb ... */ + return 0; } /* end dScatter3dUPanels */ + int_t dscatter3dLPanels(int_t nsupers, - dLUstruct_t *LUstruct, gridinfo3d_t *grid3d) + dLUstruct_t * LUstruct, gridinfo3d_t* grid3d) { - dLocalLU_t *Llu = LUstruct->Llu; - int_t *xsup = LUstruct->Glu_persist->xsup; - gridinfo_t *grid = &(grid3d->grid2d); - int_t **Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; - double **Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; - int_t iam = grid->iam; - - int_t mycol = MYCOL(iam, grid); - - /*start broadcasting blocks*/ - for (int_t jb = 0; jb < nsupers; ++jb) /* for each block column ... */ - { - int_t pc = PCOL(jb, grid); - if (mycol == pc) - { - int_t ljb = LBj(jb, grid); /* Local block number */ - int_t *lsub; - double *lnzval; - lsub = Lrowind_bc_ptr[ljb]; - lnzval = Lnzval_bc_ptr[ljb]; - - int_t flag = 0; - /*if non empty set the flag*/ - if (!grid3d->zscp.Iam && lsub != NULL) - flag = 1; - /*bcast the flag*/ - MPI_Bcast(&flag, 1, mpi_int_t, 0, grid3d->zscp.comm); - - if (flag) - { - int_t nrbl, len, len1, len2; - if (!grid3d->zscp.Iam) - { - nrbl = lsub[0]; /*number of L blocks */ - len = lsub[1]; /* LDA of the nzval[] */ - len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; - len2 = SuperSize(jb) * len; - } - - /*bcast lsub len*/ - MPI_Bcast(&len1, 1, mpi_int_t, 0, grid3d->zscp.comm); - - /*allocate lsub*/ - if (grid3d->zscp.Iam) + dLocalLU_t *Llu = LUstruct->Llu; + int_t* xsup = LUstruct->Glu_persist->xsup; + gridinfo_t* grid = &(grid3d->grid2d); + int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + int_t iam = grid->iam; + + int_t mycol = MYCOL (iam, grid); + + /*start broadcasting blocks*/ + for (int_t jb = 0; jb < nsupers; ++jb) /* for each block column ... 
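       replicate each L panel (index and values) from layer 0 to the other z-layers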
*/ + { + int_t pc = PCOL( jb, grid ); + if (mycol == pc) + { + int_t ljb = LBj( jb, grid ); /* Local block number */ + int_t *lsub; + double* lnzval; + lsub = Lrowind_bc_ptr[ljb]; + lnzval = Lnzval_bc_ptr[ljb]; + + int_t flag = 0; + /*if non empty set the flag*/ + if (!grid3d->zscp.Iam && lsub != NULL) + flag = 1; + /*bcast the flag*/ + MPI_Bcast( &flag, 1, mpi_int_t, 0, grid3d->zscp.comm); + + if (flag) { + int_t nrbl, len, len1, len2; + if (!grid3d->zscp.Iam) + { + nrbl = lsub[0]; /*number of L blocks */ + len = lsub[1]; /* LDA of the nzval[] */ + len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; + len2 = SuperSize(jb) * len; + } + + /*bcast lsub len*/ + MPI_Bcast( &len1, 1, mpi_int_t, 0, grid3d->zscp.comm); + + /*allocate lsub*/ + if (grid3d->zscp.Iam) #ifdef MPI_MALLOC - MPI_INT_ALLOC(lsub, len1); + MPI_INT_ALLOC(lsub, len1); #else - - lsub = INT_T_ALLOC(len1); + + lsub = INT_T_ALLOC(len1); #endif - /*now broadcast lsub*/ - MPI_Bcast(lsub, len1, mpi_int_t, 0, grid3d->zscp.comm); - - /*set up pointer*/ - Lrowind_bc_ptr[ljb] = lsub; - - /*bcast lnzval len*/ - MPI_Bcast(&len2, 1, mpi_int_t, 0, grid3d->zscp.comm); - - /*allocate space for nzval*/ - if (grid3d->zscp.Iam) + /*now broadcast lsub*/ + MPI_Bcast( lsub, len1, mpi_int_t, 0, grid3d->zscp.comm); + + /*set up pointer*/ + Lrowind_bc_ptr[ljb] = lsub; + + /*bcast lnzval len*/ + MPI_Bcast( &len2, 1, mpi_int_t, 0, grid3d->zscp.comm); + + /*allocate space for nzval*/ + if (grid3d->zscp.Iam) #ifdef MPI_MALLOC - MPI_DATATYPE_ALLOC(lnzval, len2); + MPI_DATATYPE_ALLOC(lnzval, len2); #else - lnzval = doubleCalloc_dist(len2); + lnzval = doubleCalloc_dist(len2); #endif + + /*bcast nonzero values*/ + MPI_Bcast( lnzval, len2, MPI_DOUBLE, 0, grid3d->zscp.comm); + + /*setup the pointers*/ + Lnzval_bc_ptr[ljb] = lnzval; - /*bcast nonzero values*/ - MPI_Bcast(lnzval, len2, MPI_DOUBLE, 0, grid3d->zscp.comm); - - /*setup the pointers*/ - Lnzval_bc_ptr[ljb] = lnzval; - - } /* end if flag */ + } /* end if flag */ - } /* end if mycol == pc */ - } /* end for jb ... */ + } /* end if mycol == pc */ + } /* end for jb ... */ - return 0; + return 0; } /* dscatter3dLPanels */ -int_t dcollect3dLpanels(int_t layer, int_t nsupers, dLUstruct_t *LUstruct, - gridinfo3d_t *grid3d) +int_t dcollect3dLpanels(int_t layer, int_t nsupers, dLUstruct_t * LUstruct, + gridinfo3d_t* grid3d) { - dLocalLU_t *Llu = LUstruct->Llu; - int_t *xsup = LUstruct->Glu_persist->xsup; - int_t **Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; - double **Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; - gridinfo_t *grid = &(grid3d->grid2d); + dLocalLU_t *Llu = LUstruct->Llu; + int_t* xsup = LUstruct->Glu_persist->xsup; + int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + gridinfo_t* grid = &(grid3d->grid2d); - int_t iam = grid->iam; - int_t mycol = MYCOL(iam, grid); + int_t iam = grid->iam; + int_t mycol = MYCOL (iam, grid); - /*start broadcasting blocks*/ - for (int_t jb = 0; jb < nsupers; ++jb) /* for each block column ... */ + /*start broadcasting blocks*/ + for (int_t jb = 0; jb < nsupers; ++jb) /* for each block column ... 
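       send the factored L panel on the given layer back to layer 0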
*/ + { + int_t pc = PCOL( jb, grid ); + if (mycol == pc) { - int_t pc = PCOL(jb, grid); - if (mycol == pc) + int_t ljb = LBj( jb, grid ); /* Local block number */ + int_t *lsub; + double* lnzval; + lsub = Lrowind_bc_ptr[ljb]; + lnzval = Lnzval_bc_ptr[ljb]; + + if (lsub != NULL) + { + int_t len = lsub[1]; /* LDA of the nzval[] */ + int_t len2 = SuperSize(jb) * len; /*size of nzval of L panel */ + + if (grid3d->zscp.Iam == layer) { - int_t ljb = LBj(jb, grid); /* Local block number */ - int_t *lsub; - double *lnzval; - lsub = Lrowind_bc_ptr[ljb]; - lnzval = Lnzval_bc_ptr[ljb]; - - if (lsub != NULL) - { - int_t len = lsub[1]; /* LDA of the nzval[] */ - int_t len2 = SuperSize(jb) * len; /*size of nzval of L panel */ - - if (grid3d->zscp.Iam == layer) - { - MPI_Send(lnzval, len2, MPI_DOUBLE, 0, jb, grid3d->zscp.comm); - } - if (!grid3d->zscp.Iam) - { - MPI_Status status; - MPI_Recv(lnzval, len2, MPI_DOUBLE, layer, jb, grid3d->zscp.comm, &status); - } - } + MPI_Send(lnzval, len2, MPI_DOUBLE, 0, jb, grid3d->zscp.comm); } - } /* for jb ... */ - return 0; + if (!grid3d->zscp.Iam) + { + MPI_Status status; + MPI_Recv(lnzval, len2, MPI_DOUBLE, layer, jb, grid3d->zscp.comm, &status); + } + } + } + } /* for jb ... */ + return 0; } -int_t dcollect3dUpanels(int_t layer, int_t nsupers, dLUstruct_t *LUstruct, - gridinfo3d_t *grid3d) +int_t dcollect3dUpanels(int_t layer, int_t nsupers, dLUstruct_t * LUstruct, + gridinfo3d_t* grid3d) { - dLocalLU_t *Llu = LUstruct->Llu; - int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; - double **Unzval_br_ptr = Llu->Unzval_br_ptr; - gridinfo_t *grid = &(grid3d->grid2d); - - int_t k = CEILING(nsupers, grid->nprow); /* Number of local block rows */ - for (int_t lb = 0; lb < k; ++lb) + dLocalLU_t *Llu = LUstruct->Llu; + int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + double** Unzval_br_ptr = Llu->Unzval_br_ptr; + gridinfo_t* grid = &(grid3d->grid2d); + + int_t k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ + for ( int_t lb = 0; lb < k; ++lb) + { + int_t *usub; + usub = Ufstnz_br_ptr[lb]; + double * uval = Unzval_br_ptr[lb]; + + if (usub) { - int_t *usub; - usub = Ufstnz_br_ptr[lb]; - double *uval = Unzval_br_ptr[lb]; - - if (usub) + /* code */ + int lenv = usub[1]; + if (grid3d->zscp.Iam == layer) { - /* code */ - int lenv = usub[1]; - if (grid3d->zscp.Iam == layer) - { - MPI_Send(uval, lenv, MPI_DOUBLE, 0, lb, grid3d->zscp.comm); - } - - if (!grid3d->zscp.Iam) - { - MPI_Status status; - MPI_Recv(uval, lenv, MPI_DOUBLE, layer, lb, grid3d->zscp.comm, &status); - } + MPI_Send(uval, lenv, MPI_DOUBLE, 0, lb, grid3d->zscp.comm); } - } /* for lb ... */ - return 0; + + if (!grid3d->zscp.Iam) + { + MPI_Status status; + MPI_Recv(uval, lenv, MPI_DOUBLE, layer, lb, grid3d->zscp.comm, &status); + } + } + } /* for lb ... 
*/ + return 0; } /* Gather the LU factors on layer-0 */ -int_t dp3dCollect(int_t layer, int_t n, dLUstruct_t *LUstruct, gridinfo3d_t *grid3d) +int_t dp3dCollect(int_t layer, int_t n, dLUstruct_t * LUstruct, gridinfo3d_t* grid3d) { - int_t nsupers = getNsupers(n, LUstruct->Glu_persist); - dcollect3dLpanels(layer, nsupers, LUstruct, grid3d); - dcollect3dUpanels(layer, nsupers, LUstruct, grid3d); - return 0; + int_t nsupers = getNsupers(n, LUstruct->Glu_persist); + dcollect3dLpanels(layer, nsupers, LUstruct, grid3d); + dcollect3dUpanels(layer, nsupers, LUstruct, grid3d); + return 0; } + /* Zero out LU non zero entries */ -int_t dzeroSetLU(int_t nnodes, int_t *nodeList, dLUstruct_t *LUstruct, - gridinfo3d_t *grid3d) +int_t dzeroSetLU(int_t nnodes, int_t* nodeList, dLUstruct_t *LUstruct, + gridinfo3d_t* grid3d) { - dLocalLU_t *Llu = LUstruct->Llu; - int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; - double **Unzval_br_ptr = Llu->Unzval_br_ptr; - - int_t *xsup = LUstruct->Glu_persist->xsup; - int_t **Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; - double **Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; - gridinfo_t *grid = &(grid3d->grid2d); - - int_t iam = grid->iam; - - int_t myrow = MYROW(iam, grid); - int_t mycol = MYCOL(iam, grid); - - /*first setting the L blocks to zero*/ - for (int_t node = 0; node < nnodes; ++node) /* for each block column ... */ + dLocalLU_t *Llu = LUstruct->Llu; + int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + double** Unzval_br_ptr = Llu->Unzval_br_ptr; + + int_t* xsup = LUstruct->Glu_persist->xsup; + int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + gridinfo_t* grid = &(grid3d->grid2d); + + int_t iam = grid->iam; + + int_t myrow = MYROW (iam, grid); + int_t mycol = MYCOL (iam, grid); + + /*first setting the L blocks to zero*/ + for (int_t node = 0; node < nnodes; ++node) /* for each block column ... */ { - - int_t jb = nodeList[node]; - int_t pc = PCOL(jb, grid); - if (mycol == pc) + + int_t jb = nodeList[node]; + int_t pc = PCOL( jb, grid ); + if (mycol == pc) { - int_t ljb = LBj(jb, grid); /* Local block number */ - int_t *lsub; - double *lnzval; - lsub = Lrowind_bc_ptr[ljb]; - lnzval = Lnzval_bc_ptr[ljb]; - - if (lsub != NULL) + int_t ljb = LBj( jb, grid ); /* Local block number */ + int_t *lsub; + double* lnzval; + lsub = Lrowind_bc_ptr[ljb]; + lnzval = Lnzval_bc_ptr[ljb]; + + if (lsub != NULL) { - int_t len = lsub[1]; /* LDA of the nzval[] */ - int_t len2 = SuperSize(jb) * len; /*size of nzval of L panel */ - memset(lnzval, 0, len2 * sizeof(double)); + int_t len = lsub[1]; /* LDA of the nzval[] */ + int_t len2 = SuperSize(jb) * len; /*size of nzval of L panel */ + memset( lnzval, 0, len2 * sizeof(double) ); } } } - for (int_t node = 0; node < nnodes; ++node) /* for each block column ... */ + for (int_t node = 0; node < nnodes; ++node) /* for each block column ... 
*/ { - - int_t ib = nodeList[node]; - int_t pr = PROW(ib, grid); - if (myrow == pr) + + int_t ib = nodeList[node]; + int_t pr = PROW( ib, grid ); + if (myrow == pr) { - int_t lib = LBi(ib, grid); /* Local block number */ - int_t *usub; - double *unzval; - usub = Ufstnz_br_ptr[lib]; - unzval = Unzval_br_ptr[lib]; - - if (usub != NULL) + int_t lib = LBi( ib, grid ); /* Local block number */ + int_t *usub; + double* unzval; + usub = Ufstnz_br_ptr[lib]; + unzval = Unzval_br_ptr[lib]; + + if (usub != NULL) { - int lenv = usub[1]; - memset(unzval, 0, lenv * sizeof(double)); + int lenv = usub[1]; + memset( unzval, 0, lenv * sizeof(double) ); } } } - - return 0; + + return 0; } + int_t dreduceAncestors3d(int_t sender, int_t receiver, - int_t nnodes, int_t *nodeList, - double *Lval_buf, double *Uval_buf, - dLUstruct_t *LUstruct, gridinfo3d_t *grid3d, SCT_t *SCT) + int_t nnodes, int_t* nodeList, + double* Lval_buf, double* Uval_buf, + dLUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT) { - double alpha = 1.0, beta = 1.0; - int_t myGrid = grid3d->zscp.Iam; - - /*first setting the L blocks to zero*/ - for (int_t node = 0; node < nnodes; ++node) /* for each block column ... */ + double alpha = 1.0, beta = 1.0; + int_t myGrid = grid3d->zscp.Iam; + + /*first setting the L blocks to zero*/ + for (int_t node = 0; node < nnodes; ++node) /* for each block column ... */ { - int_t jb = nodeList[node]; - - if (myGrid == sender) - { - dzSendLPanel(jb, receiver, LUstruct, grid3d, SCT); - dzSendUPanel(jb, receiver, LUstruct, grid3d, SCT); - } - else + int_t jb = nodeList[node]; + + if (myGrid == sender) { - dzRecvLPanel(jb, sender, alpha, beta, Lval_buf, - LUstruct, grid3d, SCT); - dzRecvUPanel(jb, sender, alpha, beta, Uval_buf, - LUstruct, grid3d, SCT); + dzSendLPanel(jb, receiver, LUstruct, grid3d, SCT); + dzSendUPanel(jb, receiver, LUstruct, grid3d, SCT); } + else { + dzRecvLPanel(jb, sender, alpha, beta, Lval_buf, + LUstruct, grid3d, SCT); + dzRecvUPanel(jb, sender, alpha, beta, Uval_buf, + LUstruct, grid3d, SCT); + } + } - return 0; + return 0; + } + int_t dgatherFactoredLU(int_t sender, int_t receiver, - int_t nnodes, int_t *nodeList, - dLUValSubBuf_t *LUvsb, - dLUstruct_t *LUstruct, gridinfo3d_t *grid3d, SCT_t *SCT) + int_t nnodes, int_t *nodeList, + dLUValSubBuf_t* LUvsb, + dLUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT) { - double alpha = 0.0, beta = 1.0; - double *Lval_buf = LUvsb->Lval_buf; - double *Uval_buf = LUvsb->Uval_buf; - int_t myGrid = grid3d->zscp.Iam; - for (int_t node = 0; node < nnodes; ++node) /* for each block column ... */ + double alpha = 0.0, beta = 1.0; + double * Lval_buf = LUvsb->Lval_buf; + double * Uval_buf = LUvsb->Uval_buf; + int_t myGrid = grid3d->zscp.Iam; + for (int_t node = 0; node < nnodes; ++node) /* for each block column ... 
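       with alpha = 0 and beta = 1, the received panel simply overwrites the local copy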
*/ { - int_t jb = nodeList[node]; - if (myGrid == sender) + int_t jb = nodeList[node]; + if (myGrid == sender) { - dzSendLPanel(jb, receiver, LUstruct, grid3d, SCT); - dzSendUPanel(jb, receiver, LUstruct, grid3d, SCT); + dzSendLPanel(jb, receiver, LUstruct, grid3d, SCT); + dzSendUPanel(jb, receiver, LUstruct, grid3d, SCT); + } - else + else { - dzRecvLPanel(jb, sender, alpha, beta, Lval_buf, - LUstruct, grid3d, SCT); - dzRecvUPanel(jb, sender, alpha, beta, Uval_buf, - LUstruct, grid3d, SCT); + dzRecvLPanel(jb, sender, alpha, beta, Lval_buf, + LUstruct, grid3d, SCT); + dzRecvUPanel(jb, sender, alpha, beta, Uval_buf, + LUstruct, grid3d, SCT); } } - return 0; + return 0; + } -int_t dinit3DLUstruct(int_t *myTreeIdxs, int_t *myZeroTrIdxs, - int_t *nodeCount, int_t **nodeList, dLUstruct_t *LUstruct, - gridinfo3d_t *grid3d) -{ - int_t maxLvl = log2i(grid3d->zscp.Np) + 1; - for (int_t lvl = 0; lvl < maxLvl; lvl++) +int_t dinit3DLUstruct( int_t* myTreeIdxs, int_t* myZeroTrIdxs, + int_t* nodeCount, int_t** nodeList, dLUstruct_t* LUstruct, + gridinfo3d_t* grid3d) +{ + int_t maxLvl = log2i(grid3d->zscp.Np) + 1; + + for (int_t lvl = 0; lvl < maxLvl; lvl++) { - if (myZeroTrIdxs[lvl]) + if (myZeroTrIdxs[lvl]) { - /* code */ - int_t treeId = myTreeIdxs[lvl]; - dzeroSetLU(nodeCount[treeId], nodeList[treeId], LUstruct, grid3d); + /* code */ + int_t treeId = myTreeIdxs[lvl]; + dzeroSetLU(nodeCount[treeId], nodeList[treeId], LUstruct, grid3d); } } - - return 0; + + return 0; } -int_t dreduceAllAncestors3d(int_t ilvl, int_t *myNodeCount, int_t **treePerm, - dLUValSubBuf_t *LUvsb, dLUstruct_t *LUstruct, - gridinfo3d_t *grid3d, SCT_t *SCT) -{ - double *Lval_buf = LUvsb->Lval_buf; - double *Uval_buf = LUvsb->Uval_buf; - int_t maxLvl = log2i(grid3d->zscp.Np) + 1; - int_t myGrid = grid3d->zscp.Iam; - int_t sender, receiver; - if ((myGrid % (1 << (ilvl + 1))) == 0) +int dreduceAllAncestors3d(int_t ilvl, int_t* myNodeCount, int_t** treePerm, + dLUValSubBuf_t* LUvsb, dLUstruct_t* LUstruct, + gridinfo3d_t* grid3d, SCT_t* SCT ) +{ + double * Lval_buf = LUvsb->Lval_buf; + double * Uval_buf = LUvsb->Uval_buf; + int_t maxLvl = log2i(grid3d->zscp.Np) + 1; + int_t myGrid = grid3d->zscp.Iam; + + int_t sender, receiver; + if ((myGrid % (1 << (ilvl + 1))) == 0) { - sender = myGrid + (1 << ilvl); - receiver = myGrid; + sender = myGrid + (1 << ilvl); + receiver = myGrid; } - else + else { - sender = myGrid; - receiver = myGrid - (1 << ilvl); + sender = myGrid; + receiver = myGrid - (1 << ilvl); } - - /*Reduce all the ancestors*/ - for (int_t alvl = ilvl + 1; alvl < maxLvl; ++alvl) + + /*Reduce all the ancestors*/ + for (int_t alvl = ilvl + 1; alvl < maxLvl; ++alvl) { - /* code */ - // int_t atree = myTreeIdxs[alvl]; - int_t nsAncestor = myNodeCount[alvl]; - int_t *cAncestorList = treePerm[alvl]; - double treduce = SuperLU_timer_(); - dreduceAncestors3d(sender, receiver, nsAncestor, cAncestorList, - Lval_buf, Uval_buf, LUstruct, grid3d, SCT); - SCT->ancsReduce += SuperLU_timer_() - treduce; + /* code */ + // int_t atree = myTreeIdxs[alvl]; + int_t nsAncestor = myNodeCount[alvl]; + int_t* cAncestorList = treePerm[alvl]; + double treduce = SuperLU_timer_(); + dreduceAncestors3d(sender, receiver, nsAncestor, cAncestorList, + Lval_buf, Uval_buf, LUstruct, grid3d, SCT); + SCT->ancsReduce += SuperLU_timer_() - treduce; + } - return 0; + return 0; } -int_t dgatherAllFactoredLU(trf3Dpartition_t *trf3Dpartition, - dLUstruct_t *LUstruct, gridinfo3d_t *grid3d, SCT_t *SCT) +int_t dgatherAllFactoredLU( trf3Dpartition_t* trf3Dpartition, + 
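
At level ilvl of dreduceAllAncestors3d, the z-grids pair across a stride of 2^ilvl: the grid whose low ilvl+1 bits are zero receives, and its partner 2^ilvl above it sends, which is the usual binomial-tree reduction pattern. A standalone check of that pairing rule (np = 8 is an assumed example; like the code above, it requires zscp.Np to be a power of two):

#include <stdio.h>

int main(void)
{
    int np = 8;                                   /* number of z-grids */
    for (int ilvl = 0; (1 << ilvl) < np; ++ilvl) {
        printf("level %d:\n", ilvl);
        for (int g = 0; g < np; g += 1 << ilvl)   /* grids still active */
            if (g % (1 << (ilvl + 1)) == 0)
                printf("  grid %d receives from grid %d\n", g, g + (1 << ilvl));
    }
    /* level 0: (0,1)(2,3)(4,5)(6,7); level 1: (0,2)(4,6); level 2: (0,4) */
    return 0;
}
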
dLUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT ) { - int_t maxLvl = log2i(grid3d->zscp.Np) + 1; - int_t myGrid = grid3d->zscp.Iam; - int_t *myZeroTrIdxs = trf3Dpartition->myZeroTrIdxs; - sForest_t **sForests = trf3Dpartition->sForests; - dLUValSubBuf_t *LUvsb = trf3Dpartition->LUvsb; - int_t *gNodeCount = getNodeCountsFr(maxLvl, sForests); - int_t **gNodeLists = getNodeListFr(maxLvl, sForests); - - for (int_t ilvl = 0; ilvl < maxLvl - 1; ++ilvl) + int_t maxLvl = log2i(grid3d->zscp.Np) + 1; + int_t myGrid = grid3d->zscp.Iam; + int_t* myZeroTrIdxs = trf3Dpartition->myZeroTrIdxs; + sForest_t** sForests = trf3Dpartition->sForests; + dLUValSubBuf_t* LUvsb = trf3Dpartition->LUvsb; + int_t* gNodeCount = getNodeCountsFr(maxLvl, sForests); + int_t** gNodeLists = getNodeListFr(maxLvl, sForests); + + for (int_t ilvl = 0; ilvl < maxLvl - 1; ++ilvl) { - /* code */ - int_t sender, receiver; - if (!myZeroTrIdxs[ilvl]) + /* code */ + int_t sender, receiver; + if (!myZeroTrIdxs[ilvl]) { - if ((myGrid % (1 << (ilvl + 1))) == 0) + if ((myGrid % (1 << (ilvl + 1))) == 0) { - sender = myGrid + (1 << ilvl); - receiver = myGrid; + sender = myGrid + (1 << ilvl); + receiver = myGrid; } - else + else { - sender = myGrid; - receiver = myGrid - (1 << ilvl); + sender = myGrid; + receiver = myGrid - (1 << ilvl); } - - for (int_t alvl = 0; alvl <= ilvl; alvl++) + + for (int_t alvl = 0; alvl <= ilvl; alvl++) { - int_t diffLvl = ilvl - alvl; - int_t numTrees = 1 << diffLvl; - int_t blvl = maxLvl - alvl - 1; - int_t st = (1 << blvl) - 1 + (sender >> alvl); - - for (int_t tr = st; tr < st + numTrees; ++tr) + int_t diffLvl = ilvl - alvl; + int_t numTrees = 1 << diffLvl; + int_t blvl = maxLvl - alvl - 1; + int_t st = (1 << blvl) - 1 + (sender >> alvl); + + for (int_t tr = st; tr < st + numTrees; ++tr) { - /* code */ - dgatherFactoredLU(sender, receiver, - gNodeCount[tr], gNodeLists[tr], - LUvsb, - LUstruct, grid3d, SCT); + /* code */ + dgatherFactoredLU(sender, receiver, + gNodeCount[tr], gNodeLists[tr], + LUvsb, + LUstruct, grid3d, SCT ); } } + } } /* for ilvl ... */ + + SUPERLU_FREE(gNodeCount); // sherry added + SUPERLU_FREE(gNodeLists); - SUPERLU_FREE(gNodeCount); // sherry added - SUPERLU_FREE(gNodeLists); - - return 0; + return 0; } /* dgatherAllFactoredLU */ + diff --git a/SRC/pdgssvx3d.c b/SRC/pdgssvx3d.c index ae67abe3..0b8bb280 100644 --- a/SRC/pdgssvx3d.c +++ b/SRC/pdgssvx3d.c @@ -9,6 +9,7 @@ The source code is distributed under BSD license, see the file License.txt at the top-level directory. */ + /*! @file * \brief Solves a system of linear equations A*X=B using 3D process grid. * @@ -18,38 +19,6 @@ at the top-level directory. 
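
The forest bookkeeping in dgatherAllFactoredLU above walks a complete binary tree stored level by level in a flat array: level b occupies indices 2^b - 1 through 2^(b+1) - 2, so st = (1 << blvl) - 1 + (sender >> alvl) is the first forest the sender owns at ancestor level alvl, and the next 2^(ilvl - alvl) entries follow it. A quick sanity check of that indexing (maxLvl = 3, i.e. four z-grids and seven forests, is an assumed example):

#include <stdio.h>

int main(void)
{
    int maxLvl = 3;                               /* 2^(maxLvl-1) = 4 z-grids */
    for (int ilvl = 0; ilvl < maxLvl - 1; ++ilvl)
        for (int alvl = 0; alvl <= ilvl; ++alvl) {
            int sender = 1 << ilvl;               /* partner of grid 0 at ilvl */
            int blvl   = maxLvl - alvl - 1;
            int st     = (1 << blvl) - 1 + (sender >> alvl);
            int nTrees = 1 << (ilvl - alvl);
            printf("ilvl=%d alvl=%d: forests [%d, %d)\n",
                   ilvl, alvl, st, st + nTrees);
        }
    return 0;
}
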
* May 10, 2019
*/
#include "superlu_ddefs.h"
-#include <assert.h>
-
-static void checkNRFMT(NRformat_loc *A, NRformat_loc *B)
-{
-    /*
-    int_t nnz_loc;
-    int_t m_loc;
-    int_t fst_row;
-    void *nzval;
-    int_t *rowptr;
-    int_t *colind;
-    */
-
-    assert(A->nnz_loc == B->nnz_loc);
-    assert(A->m_loc == B->m_loc);
-    assert(A->fst_row == B->fst_row);
-
-    for (int_t i = 0; i < A->nnz_loc; i++)
-    {
-        assert(((double *)A->nzval)[i] == ((double *)B->nzval)[i]);
-        // printf("%lf \n", ((double *)A->nzval)[i]);
-        assert((A->colind)[i] == (B->colind)[i]);
-    }
-
-    for (int_t i = 0; i < A->m_loc + 1; i++)
-    {
-        // assert(((double *)A->nzval)[i] ==((double *)B->nzval)[i]);
-        assert((A->rowptr)[i] == (B->rowptr)[i]);
-    }
-
-    printf("Matrix check passed\n");
-}
 #if 0
 #include "p3dcomm.h"
 #include "pdgstrf3d.h"
@@ -490,9 +459,9 @@ static void checkNRFMT(NRformat_loc *A, NRformat_loc *B)
  * xsup[s] is the leading column of the s-th supernode,
  * supno[i] is the supernode number to which column i belongs.
  *
- * o Llu (LocalLU_t*) (local)
+ * o Llu (dLocalLU_t*) (local)
  *	 The distributed data structures to store L and U factors.
- *	 See superlu_ddefs.h for the definition of 'LocalLU_t'.
+ *	 See superlu_ddefs.h for the definition of 'dLocalLU_t'.
  *
  * SOLVEstruct (input/output) dSOLVEstruct_t*
  *	 The data structure to hold the communication pattern used
@@ -513,6 +482,7 @@ static void checkNRFMT(NRformat_loc *A, NRformat_loc *B)
  *
  * info (output) int*
  *	 = 0: successful exit
+ *	 < 0: if info = -i, the i-th argument had an illegal value
  *	 > 0: if info = i, and i is
  *	     <= A->ncol: U(i,i) is exactly zero. The factorization has
  *	        been completed, but the factor U is exactly singular,
@@ -524,1234 +494,1088 @@
 *
*/
-void pdgssvx3d(superlu_dist_options_t *options, SuperMatrix *A,
-               dScalePermstruct_t *ScalePermstruct,
-               double B[], int ldb, int nrhs, gridinfo3d_t *grid3d,
-               dLUstruct_t *LUstruct, dSOLVEstruct_t *SOLVEstruct,
-               double *berr, SuperLUStat_t *stat, int *info)
+void
+pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
+           dScalePermstruct_t * ScalePermstruct,
+           double B[], int ldb, int nrhs, gridinfo3d_t * grid3d,
+           dLUstruct_t * LUstruct, dSOLVEstruct_t * SOLVEstruct,
+           double *berr, SuperLUStat_t * stat, int *info)
 {
-    NRformat_loc *Astore;
-    SuperMatrix GA; /* Global A in NC format */
-    NCformat *GAstore;
-    double *a_GA;
-    SuperMatrix GAC; /* Global A in NCP format (add n end pointers) */
-    NCPformat *GACstore;
-    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
-    Glu_freeable_t *Glu_freeable;
-    /* The nonzero structures of L and U factors, which are
-       replicated on all processrs.
-       (lsub, xlsub) contains the compressed subscript of
-       supernodes in L.
-       (usub, xusub) contains the compressed subscript of
-       nonzero segments in U.
-       If options->Fact != SamePattern_SameRowPerm, they are
-       computed by SYMBFACT routine, and then used by PDDISTRIBUTE
-       routine. They will be freed after PDDISTRIBUTE routine.
-       If options->Fact == SamePattern_SameRowPerm, these
-       structures are not used. */
-    yes_no_t parSymbFact = options->ParSymbFact;
-    fact_t Fact;
-    double *a;
-    int_t *colptr, *rowind;
-    int_t *perm_r; /* row permutations from partial pivoting */
-    int_t *perm_c; /* column permutation vector */
-    int_t *etree; /* elimination tree */
-    int_t *rowptr, *colind; /* Local A in NR */
-    int_t colequ, Equil, factored, job, notran, rowequ, need_value;
-    int_t i, iinfo, j, irow, m, n, nnz, permc_spec;
-    int_t nnz_loc, m_loc, fst_row, icol;
-    int iam;
-    int ldx; /* LDA for matrix X (local). */
-    char equed[1], norm[1];
-    double *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd;
-    double *X, *b_col, *b_work, *x_col;
-    double t;
-    float GA_mem_use; /* memory usage by global A */
-    float dist_mem_use; /* memory usage during distribution */
-    superlu_dist_mem_usage_t num_mem_usage, symb_mem_usage;
-#if (PRNTlevel >= 2)
-    double dmin, dsum, dprod;
+    NRformat_loc *Astore;
+    SuperMatrix GA;    /* Global A in NC format */
+    NCformat *GAstore;
+    double *a_GA;
+    SuperMatrix GAC;   /* Global A in NCP format (add n end pointers) */
+    NCPformat *GACstore;
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    Glu_freeable_t *Glu_freeable;
+    /* The nonzero structures of L and U factors, which are
+       replicated on all processors.
+       (lsub, xlsub) contains the compressed subscript of
+       supernodes in L.
+       (usub, xusub) contains the compressed subscript of
+       nonzero segments in U.
+       If options->Fact != SamePattern_SameRowPerm, they are
+       computed by SYMBFACT routine, and then used by PDDISTRIBUTE
+       routine. They will be freed after PDDISTRIBUTE routine.
+       If options->Fact == SamePattern_SameRowPerm, these
+       structures are not used. */
+    yes_no_t parSymbFact = options->ParSymbFact;
+    fact_t Fact;
+    double *a;
+    int_t *colptr, *rowind;
+    int_t *perm_r;     /* row permutations from partial pivoting */
+    int_t *perm_c;     /* column permutation vector */
+    int_t *etree;      /* elimination tree */
+    int_t *rowptr, *colind;  /* Local A in NR */
+    int_t colequ, Equil, factored, job, notran, rowequ, need_value;
+    int_t i, iinfo, j, irow, m, n, nnz, permc_spec;
+    int_t nnz_loc, m_loc, fst_row, icol;
+    int iam;
+    int ldx;           /* LDA for matrix X (local).
*/ + char equed[1], norm[1]; + double *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd; + double *X, *b_col, *b_work, *x_col; + double t; + float GA_mem_use; /* memory usage by global A */ + float dist_mem_use; /* memory usage during distribution */ + superlu_dist_mem_usage_t num_mem_usage, symb_mem_usage; +#if ( PRNTlevel>= 2 ) + double dmin, dsum, dprod; #endif - // get the 2d grid - gridinfo_t *grid = &(grid3d->grid2d); - iam = grid->iam; - - /* Initialization. */ - /* Save the inputs: ldb -> ldb3d, and B -> B3d, Astore -> Astore3d - B3d and Astore3d will be restored on return */ - int ldb3d = ldb; - double *B3d = B; - NRformat_loc *Astore3d = (NRformat_loc *)A->Store; - double *B2d; - NRformat_loc3d *A3d = dGatherNRformat_loc3d((NRformat_loc *)A->Store, - B, ldb, nrhs, grid3d); - B2d = (double *) A3d->B2d; - NRformat_loc *Astore0 = A3d->A_nfmt; - NRformat_loc *A_orig = A->Store; - - /* definition of factored seen by each process layer */ - Fact = options->Fact; - factored = (Fact == FACTORED); - - /* Test the options choices. */ - *info = 0; - Fact = options->Fact; - if (Fact < 0 || Fact > FACTORED) - *info = -1; - else if (options->RowPerm < 0 || options->RowPerm > MY_PERMR) - *info = -1; - else if (options->ColPerm < 0 || options->ColPerm > MY_PERMC) - *info = -1; - else if (options->IterRefine < 0 || options->IterRefine > SLU_EXTRA) - *info = -1; - else if (options->IterRefine == SLU_EXTRA) - { - *info = -1; - fprintf(stderr, - "Extra precise iterative refinement yet to support."); - } - /* Test the other input parameters. */ - else if (A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc || A->Dtype != SLU_D || A->Mtype != SLU_GE) - *info = -2; - else if (ldb < Astore3d->m_loc) - *info = -5; - else if (nrhs < 0) { - *info = -6; - } - if (*info) - { - i = -(*info); - pxerr_dist("pdgssvx3d", grid, -*info); - return; - } - -#if (DEBUGlevel >= 1) - CHECK_MALLOC(iam, "Enter pdgssvx3d()"); + // get the 2d grid + gridinfo_t *grid = &(grid3d->grid2d); + iam = grid->iam; + + /* Initialization. */ + /* Save the inputs: ldb -> ldb3d, and B -> B3d, Astore -> Astore3d + B3d and Astore3d will be restored on return */ + int ldb3d = ldb; + double *B3d = B; + NRformat_loc *Astore3d = (NRformat_loc *)A->Store; + double *B2d; + NRformat_loc3d *A3d = dGatherNRformat_loc3d((NRformat_loc *)A->Store, + B, ldb, nrhs, grid3d); + B2d = (double *) A3d->B2d; + NRformat_loc *Astore0 = A3d->A_nfmt; // on 2D grid-0 + NRformat_loc *A_orig = A->Store; + + /* definition of factored seen by each process layer */ + Fact = options->Fact; + factored = (Fact == FACTORED); + + /* Test the options choices. 
*/ + *info = 0; + Fact = options->Fact; + if (Fact < 0 || Fact > FACTORED) + *info = -1; + else if (options->RowPerm < 0 || options->RowPerm > MY_PERMR) + *info = -1; + else if (options->ColPerm < 0 || options->ColPerm > MY_PERMC) + *info = -1; + else if (options->IterRefine < 0 || options->IterRefine > SLU_EXTRA) + *info = -1; + else if (options->IterRefine == SLU_EXTRA) { + *info = -1; + fprintf (stderr, + "Extra precise iterative refinement yet to support."); + } else if (A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc + || A->Dtype != SLU_D || A->Mtype != SLU_GE) + *info = -2; + else if (ldb < Astore3d->m_loc) + *info = -5; + else if (nrhs < 0) { + *info = -6; + } + if (*info) { + i = -(*info); + pxerr_dist ("pdgssvx3d", grid, -*info); + return; + } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter pdgssvx3d()"); #endif + + /* Perform preprocessing steps on process layer zero, including: + gather 3D matrices {A, B} onto 2D grid-0, + ordering, symbolic factorization, distribution of L & U */ - /* Perform preprocessing steps on process layer zero, including: - ordering, symbolic factorization, distribution of L & U */ #define NRFRMT - if (grid3d->zscp.Iam == 0) - { - m = A->nrow; - n = A->ncol; - // checkNRFMT(Astore0, (NRformat_loc *) A->Store); + if (grid3d->zscp.Iam == 0) + { + m = A->nrow; + n = A->ncol; + // checkNRFMT(Astore0, (NRformat_loc *) A->Store); #ifdef NRFRMT - // On input, A->Store is on 3D, now A->Store is re-assigned to 2D store - A->Store = Astore0; - ldb = Astore0->m_loc; - B = B2d; // B is now re-assigned to B2d + // On input, A->Store is on 3D, now A->Store is re-assigned to 2D store + A->Store = Astore0; + ldb = Astore0->m_loc; + B = B2d; // B is now re-assigned to B2d + //PrintDouble5("after gather B=B2d", ldb, B); #endif - //PrintDouble5("after gather B=B2d", ldb, B); - Astore = (NRformat_loc *)A->Store; // on 2D - - // #ifdef NRFRMT - // Astore = Astore0; - // #else - // Astore = Astore0; - // // Astore = (NRformat_loc *) A->Store; - // // Astore->nzval = Astore0->nzval; - // // Astore->rowptr = Astore0->rowptr; - // // Astore->colind = Astore0->colind; - // // Astore->nnz_loc = Astore0->nnz_loc; - // // Astore->m_loc = Astore0->m_loc; - // // Astore->fst_row = Astore0->fst_row; - // #endif - nnz_loc = Astore->nnz_loc; - m_loc = Astore->m_loc; - fst_row = Astore->fst_row; - a = (double *)Astore->nzval; - rowptr = Astore->rowptr; - colind = Astore->colind; - - /* Structures needed for parallel symbolic factorization */ - int_t *sizes, *fstVtxSep; - int noDomains, nprocs_num; - MPI_Comm symb_comm; /* communicator for symbolic factorization */ - int col, key; /* parameters for creating a new communicator */ - Pslu_freeable_t Pslu_freeable; - float flinfo; - - sizes = NULL; - fstVtxSep = NULL; - symb_comm = MPI_COMM_NULL; - - Equil = (!factored && options->Equil == YES); - notran = (options->Trans == NOTRANS); - - iam = grid->iam; - job = 5; - if (factored || (Fact == SamePattern_SameRowPerm && Equil)) - { - rowequ = (ScalePermstruct->DiagScale == ROW) || - (ScalePermstruct->DiagScale == BOTH); - colequ = (ScalePermstruct->DiagScale == COL) || - (ScalePermstruct->DiagScale == BOTH); - } - else - rowequ = colequ = FALSE; - - /* The following arrays are replicated on all processes. 
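
The argument checking above follows the LAPACK convention that the updated docstring records: the position of the first bad argument comes back negated in *info, and pxerr_dist reports it. A tiny standalone illustration of the same convention (the check_args helper and its argument numbering are hypothetical, mirroring the -2/-5/-6 codes above):

#include <stdio.h>

/* LAPACK-style check: return 0 if ok, -i if the i-th argument is illegal. */
static int check_args(int nrow, int ncol, int ldb, int m_loc, int nrhs)
{
    if (nrow != ncol || nrow < 0) return -2;      /* argument 2: matrix A */
    if (ldb < m_loc)              return -5;      /* argument 5: ldb too small */
    if (nrhs < 0)                 return -6;      /* argument 6: nrhs */
    return 0;
}

int main(void)
{
    int info = check_args(100, 100, 10, 50, 1);
    if (info < 0) printf("argument %d had an illegal value\n", -info);
    return 0;
}
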
*/ - perm_r = ScalePermstruct->perm_r; - perm_c = ScalePermstruct->perm_c; - etree = LUstruct->etree; - R = ScalePermstruct->R; - C = ScalePermstruct->C; - /********/ - - /* Not factored & ask for equilibration */ - if (Equil && Fact != SamePattern_SameRowPerm) - { - /* Allocate storage if not done so before. */ - switch (ScalePermstruct->DiagScale) - { - case NOEQUIL: - if (!(R = (double *)doubleMalloc_dist(m))) - ABORT("Malloc fails for R[]."); - if (!(C = (double *)doubleMalloc_dist(n))) - ABORT("Malloc fails for C[]."); - ScalePermstruct->R = R; - ScalePermstruct->C = C; - break; - case ROW: - if (!(C = (double *)doubleMalloc_dist(n))) - ABORT("Malloc fails for C[]."); - ScalePermstruct->C = C; - break; - case COL: - if (!(R = (double *)doubleMalloc_dist(m))) - ABORT("Malloc fails for R[]."); - ScalePermstruct->R = R; - break; - default: break; - } - } - /* ------------------------------------------------------------ - Diagonal scaling to equilibrate the matrix. - ------------------------------------------------------------ */ - if (Equil) - { -#if (DEBUGlevel >= 1) - CHECK_MALLOC(iam, "Enter equil"); + /* The following code now works on 2D grid-0 */ + Astore = (NRformat_loc *) A->Store; + nnz_loc = Astore->nnz_loc; + m_loc = Astore->m_loc; + fst_row = Astore->fst_row; + a = (double *) Astore->nzval; + rowptr = Astore->rowptr; + colind = Astore->colind; + + /* Structures needed for parallel symbolic factorization */ + int_t *sizes, *fstVtxSep; + int noDomains, nprocs_num; + MPI_Comm symb_comm; /* communicator for symbolic factorization */ + int col, key; /* parameters for creating a new communicator */ + Pslu_freeable_t Pslu_freeable; + float flinfo; + + sizes = NULL; + fstVtxSep = NULL; + symb_comm = MPI_COMM_NULL; + + Equil = (!factored && options->Equil == YES); + notran = (options->Trans == NOTRANS); + + iam = grid->iam; + job = 5; + if (factored || (Fact == SamePattern_SameRowPerm && Equil)) + { + rowequ = (ScalePermstruct->DiagScale == ROW) || + (ScalePermstruct->DiagScale == BOTH); + colequ = (ScalePermstruct->DiagScale == COL) || + (ScalePermstruct->DiagScale == BOTH); + } + else + rowequ = colequ = FALSE; + + /* The following arrays are replicated on all processes. */ + perm_r = ScalePermstruct->perm_r; + perm_c = ScalePermstruct->perm_c; + etree = LUstruct->etree; + R = ScalePermstruct->R; + C = ScalePermstruct->C; + /********/ + + /* Not factored & ask for equilibration */ + if (Equil && Fact != SamePattern_SameRowPerm) { + /* Allocate storage if not done so before. */ + switch (ScalePermstruct->DiagScale) { + case NOEQUIL: + if (!(R = (double *) doubleMalloc_dist (m))) + ABORT ("Malloc fails for R[]."); + if (!(C = (double *) doubleMalloc_dist (n))) + ABORT ("Malloc fails for C[]."); + ScalePermstruct->R = R; + ScalePermstruct->C = C; + break; + case ROW: + if (!(C = (double *) doubleMalloc_dist (n))) + ABORT ("Malloc fails for C[]."); + ScalePermstruct->C = C; + break; + case COL: + if (!(R = (double *) doubleMalloc_dist (m))) + ABORT ("Malloc fails for R[]."); + ScalePermstruct->R = R; + break; + default: break; + } + } + + /* ------------------------------------------------------------ + Diagonal scaling to equilibrate the matrix. + ------------------------------------------------------------ */ + if (Equil) { +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter equil"); #endif - t = SuperLU_timer_(); - - if (Fact == SamePattern_SameRowPerm) - { - /* Reuse R and C. 
*/ - switch (ScalePermstruct->DiagScale) - { - case NOEQUIL: - break; - case ROW: - irow = fst_row; - for (j = 0; j < m_loc; ++j) - { - for (i = rowptr[j]; i < rowptr[j + 1]; ++i) - { - a[i] *= R[irow]; /* Scale rows. */ - } - ++irow; - } - break; - case COL: - for (j = 0; j < m_loc; ++j) - for (i = rowptr[j]; i < rowptr[j + 1]; ++i) - { - icol = colind[i]; - a[i] *= C[icol]; /* Scale columns. */ - } - break; - case BOTH: - irow = fst_row; - for (j = 0; j < m_loc; ++j) - { - for (i = rowptr[j]; i < rowptr[j + 1]; ++i) - { - icol = colind[i]; - a[i] *= R[irow] * C[icol]; /* Scale rows and cols. */ - } - ++irow; - } - break; - } + t = SuperLU_timer_ (); + + if (Fact == SamePattern_SameRowPerm) { + /* Reuse R and C. */ + switch (ScalePermstruct->DiagScale) { + case NOEQUIL: + break; + case ROW: + irow = fst_row; + for (j = 0; j < m_loc; ++j) { + for (i = rowptr[j]; i < rowptr[j + 1]; ++i) { + a[i] *= R[irow]; /* Scale rows. */ } - else /* Compute R & C from scratch */ - { - /* Compute the row and column scalings. */ - pdgsequ(A, R, C, &rowcnd, &colcnd, &amax, &iinfo, grid); - - if (iinfo > 0) - { - if (iinfo <= m) - { -#if (PRNTlevel >= 1) - fprintf(stderr, "The " IFMT "-th row of A is exactly zero\n", iinfo); + ++irow; + } + break; + case COL: + for (j = 0; j < m_loc; ++j) + for (i = rowptr[j]; i < rowptr[j + 1]; ++i) { + icol = colind[i]; + a[i] *= C[icol]; /* Scale columns. */ + } + break; + case BOTH: + irow = fst_row; + for (j = 0; j < m_loc; ++j) + { + for (i = rowptr[j]; i < rowptr[j + 1]; ++i) + { + icol = colind[i]; + a[i] *= R[irow] * C[icol]; /* Scale rows and cols. */ + } + ++irow; + } + break; + } + } else { /* Compute R & C from scratch */ + /* Compute the row and column scalings. */ + pdgsequ (A, R, C, &rowcnd, &colcnd, &amax, &iinfo, grid); + + if ( iinfo > 0 ) { + if ( iinfo <= m ) { +#if ( PRNTlevel>=1 ) + fprintf(stderr, "The " IFMT "-th row of A is exactly zero\n", iinfo); #endif - } - else - { -#if (PRNTlevel >= 1) - fprintf(stderr, "The " IFMT "-th column of A is exactly zero\n", iinfo - n); + } else { +#if ( PRNTlevel>=1 ) + fprintf(stderr, "The " IFMT "-th column of A is exactly zero\n", iinfo-n); #endif - } - } - else if (iinfo < 0) - return; - - /* Now iinfo == 0 */ - - /* Equilibrate matrix A if it is badly-scaled. - A <-- diag(R)*A*diag(C) */ - pdlaqgs(A, R, C, rowcnd, colcnd, amax, equed); - - if (strncmp(equed, "R", 1) == 0) - { - ScalePermstruct->DiagScale = ROW; - rowequ = ROW; - } - else if (strncmp(equed, "C", 1) == 0) - { - ScalePermstruct->DiagScale = COL; - colequ = COL; - } - else if (strncmp(equed, "B", 1) == 0) - { - ScalePermstruct->DiagScale = BOTH; - rowequ = ROW; - colequ = COL; - } - else - ScalePermstruct->DiagScale = NOEQUIL; - -#if (PRNTlevel >= 1) - if (iam == 0) - { - printf(".. equilibrated? *equed = %c\n", *equed); - fflush(stdout); - } + } + } else if ( iinfo < 0 ) return; + + /* Now iinfo == 0 */ + + /* Equilibrate matrix A if it is badly-scaled. + A <-- diag(R)*A*diag(C) */ + pdlaqgs (A, R, C, rowcnd, colcnd, amax, equed); + + if ( strncmp(equed, "R", 1)==0 ) { + ScalePermstruct->DiagScale = ROW; + rowequ = ROW; + } else if ( strncmp(equed, "C", 1)==0 ) { + ScalePermstruct->DiagScale = COL; + colequ = COL; + } else if ( strncmp(equed, "B", 1)==0 ) { + ScalePermstruct->DiagScale = BOTH; + rowequ = ROW; + colequ = COL; + } else ScalePermstruct->DiagScale = NOEQUIL; + +#if ( PRNTlevel>=1 ) + if (iam==0) { + printf (".. equilibrated? *equed = %c\n", *equed); + fflush(stdout); + } #endif - } /* end if-else Fact ... 
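
Equilibration touches only the locally owned CSR rows: local row j is global row fst_row + j, so the row factor is tracked with a running irow while the column factor is looked up through colind. The BOTH case of the reuse branch above, sketched on a standalone CSR block (plain int/double arrays stand in for the NRformat_loc fields):

/* A <- diag(R) * A * diag(C) on a local CSR block of m_loc rows that
   starts at global row fst_row. */
static void scale_csr_both(double *a, const int *rowptr, const int *colind,
                           int m_loc, int fst_row,
                           const double *R, const double *C)
{
    int irow = fst_row;
    for (int j = 0; j < m_loc; ++j, ++irow)
        for (int i = rowptr[j]; i < rowptr[j + 1]; ++i)
            a[i] *= R[irow] * C[colind[i]];
}
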
*/ + } /* end if-else Fact ... */ - stat->utime[EQUIL] = SuperLU_timer_() - t; -#if (DEBUGlevel >= 1) - CHECK_MALLOC(iam, "Exit equil"); + stat->utime[EQUIL] = SuperLU_timer_ () - t; +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit equil"); #endif - } /* end if Equil ... LAPACK style, not involving MC64 */ - - if (!factored) /* Skip this if already factored. */ - { - /* - * Gather A from the distributed compressed row format to - * global A in compressed column format. - * Numerical values are gathered only when a row permutation - * for large diagonal is sought after. - */ - if (Fact != SamePattern_SameRowPerm && - (parSymbFact == NO || options->RowPerm != NO)) - { - - need_value = (options->RowPerm == LargeDiag_MC64); - - pdCompRow_loc_to_CompCol_global(need_value, A, grid, &GA); - - GAstore = (NCformat *)GA.Store; - colptr = GAstore->colptr; - rowind = GAstore->rowind; - nnz = GAstore->nnz; - GA_mem_use = (nnz + n + 1) * sizeof(int_t); + } /* end if Equil ... LAPACK style, not involving MC64 */ + + if (!factored) { /* Skip this if already factored. */ + /* + * Gather A from the distributed compressed row format to + * global A in compressed column format. + * Numerical values are gathered only when a row permutation + * for large diagonal is sought after. + */ + if (Fact != SamePattern_SameRowPerm && + (parSymbFact == NO || options->RowPerm != NO)) { + + need_value = (options->RowPerm == LargeDiag_MC64); + + pdCompRow_loc_to_CompCol_global (need_value, A, grid, &GA); + + GAstore = (NCformat *) GA.Store; + colptr = GAstore->colptr; + rowind = GAstore->rowind; + nnz = GAstore->nnz; + GA_mem_use = (nnz + n + 1) * sizeof (int_t); + + if (need_value) { + a_GA = (double *) GAstore->nzval; + GA_mem_use += nnz * sizeof (double); + } - if (need_value) - { - a_GA = (double *)GAstore->nzval; - GA_mem_use += nnz * sizeof(double); + else + assert (GAstore->nzval == NULL); + } + + /* ------------------------------------------------------------ + Find the row permutation for A. + ------------------------------------------------------------ */ + if (options->RowPerm != NO) { + t = SuperLU_timer_ (); + if (Fact != SamePattern_SameRowPerm) { + if (options->RowPerm == MY_PERMR) { + /* Use user's perm_r. */ + /* Permute the global matrix GA for symbfact() */ + for (i = 0; i < colptr[n]; ++i) { + irow = rowind[i]; + rowind[i] = perm_r[irow]; + } + } else if ( options->RowPerm == LargeDiag_MC64 ) { + /* Get a new perm_r[] */ + if (job == 5) { + /* Allocate storage for scaling factors. */ + if (!(R1 = doubleMalloc_dist (m))) + ABORT ("SUPERLU_MALLOC fails for R1[]"); + if (!(C1 = doubleMalloc_dist (n))) + ABORT ("SUPERLU_MALLOC fails for C1[]"); + } + + if ( iam==0 ) { + /* Process 0 finds a row permutation */ + iinfo = dldperm_dist (job, m, nnz, colptr, rowind, a_GA, + perm_r, R1, C1); + MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); + if ( iinfo == 0 ) { + MPI_Bcast (perm_r, m, mpi_int_t, 0, grid->comm); + if (job == 5 && Equil) { + MPI_Bcast (R1, m, MPI_DOUBLE, 0, grid->comm); + MPI_Bcast (C1, n, MPI_DOUBLE, 0, grid->comm); } - - else - assert(GAstore->nzval == NULL); + } + } else { + MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); + if ( iinfo == 0 ) { + MPI_Bcast (perm_r, m, mpi_int_t, 0, grid->comm); + if (job == 5 && Equil) { + MPI_Bcast (R1, m, MPI_DOUBLE, 0, grid->comm); + MPI_Bcast (C1, n, MPI_DOUBLE, 0, grid->comm); + } + } } - /* ------------------------------------------------------------ - Find the row permutation for A. 
- ------------------------------------------------------------ */ - if (options->RowPerm != NO) - { - t = SuperLU_timer_(); - if (Fact != SamePattern_SameRowPerm) - { - if (options->RowPerm == MY_PERMR) - { - /* Use user's perm_r. */ - /* Permute the global matrix GA for symbfact() */ - for (i = 0; i < colptr[n]; ++i) - { - irow = rowind[i]; - rowind[i] = perm_r[irow]; - } - } - else if (options->RowPerm == LargeDiag_MC64) - { - /* Get a new perm_r[] */ - if (job == 5) - { - /* Allocate storage for scaling factors. */ - if (!(R1 = doubleMalloc_dist(m))) - ABORT("SUPERLU_MALLOC fails for R1[]"); - if (!(C1 = doubleMalloc_dist(n))) - ABORT("SUPERLU_MALLOC fails for C1[]"); - } - - if (iam == 0) - { - /* Process 0 finds a row permutation */ - iinfo = dldperm_dist(job, m, nnz, colptr, rowind, a_GA, - perm_r, R1, C1); - MPI_Bcast(&iinfo, 1, mpi_int_t, 0, grid->comm); - if (iinfo == 0) - { - MPI_Bcast(perm_r, m, mpi_int_t, 0, grid->comm); - if (job == 5 && Equil) - { - MPI_Bcast(R1, m, MPI_DOUBLE, 0, grid->comm); - MPI_Bcast(C1, n, MPI_DOUBLE, 0, grid->comm); - } - } - } - else - { - MPI_Bcast(&iinfo, 1, mpi_int_t, 0, grid->comm); - if (iinfo == 0) - { - MPI_Bcast(perm_r, m, mpi_int_t, 0, grid->comm); - if (job == 5 && Equil) - { - MPI_Bcast(R1, m, MPI_DOUBLE, 0, grid->comm); - MPI_Bcast(C1, n, MPI_DOUBLE, 0, grid->comm); - } - } - } - - if (iinfo && job == 5) /* Error return */ - { - SUPERLU_FREE(R1); - SUPERLU_FREE(C1); - } -#if (PRNTlevel >= 2) - dmin = damch_dist("Overflow"); - dsum = 0.0; - dprod = 1.0; -#endif - if (iinfo == 0) - { - if (job == 5) - { - if (Equil) - { - for (i = 0; i < n; ++i) - { - R1[i] = exp(R1[i]); - C1[i] = exp(C1[i]); - } - - /* Scale the distributed matrix further. - A <-- diag(R1)*A*diag(C1) */ - irow = fst_row; - for (j = 0; j < m_loc; ++j) - { - for (i = rowptr[j]; i < rowptr[j + 1]; ++i) - { - icol = colind[i]; - a[i] *= R1[irow] * C1[icol]; -#if (PRNTlevel >= 2) - if (perm_r[irow] == icol) - { - /* New diagonal */ - if (job == 2 || job == 3) - dmin = SUPERLU_MIN(dmin, fabs(a[i])); - else if (job == 4) - dsum += fabs(a[i]); - else if (job == 5) - dprod *= fabs(a[i]); - } + if ( iinfo && job == 5) { /* Error return */ + SUPERLU_FREE(R1); + SUPERLU_FREE(C1); + } +#if ( PRNTlevel>=2 ) + dmin = damch_dist ("Overflow"); + dsum = 0.0; + dprod = 1.0; #endif - } - ++irow; - } - - /* Multiply together the scaling factors -- - R/C from simple scheme, R1/C1 from MC64. */ - if (rowequ) - for (i = 0; i < m; ++i) - R[i] *= R1[i]; - else - for (i = 0; i < m; ++i) - R[i] = R1[i]; - if (colequ) - for (i = 0; i < n; ++i) - C[i] *= C1[i]; - else - for (i = 0; i < n; ++i) - C[i] = C1[i]; - - ScalePermstruct->DiagScale = BOTH; - rowequ = colequ = 1; - - } /* end if Equil */ - - /* Now permute global A to prepare for symbfact() */ - for (j = 0; j < n; ++j) - { - for (i = colptr[j]; i < colptr[j + 1]; ++i) - { - irow = rowind[i]; - rowind[i] = perm_r[irow]; - } - } - SUPERLU_FREE(R1); - SUPERLU_FREE(C1); - } - else /* job = 2,3,4 */ - { - for (j = 0; j < n; ++j) - { - for (i = colptr[j]; i < colptr[j + 1]; ++i) - { - irow = rowind[i]; - rowind[i] = perm_r[irow]; - } /* end for i ... */ - } /* end for j ... */ - } /* end else job ... */ - } - else /* if iinfo != 0 */ - { - for (i = 0; i < m; ++i) - perm_r[i] = i; - } -#if (PRNTlevel >= 2) + if ( iinfo == 0 ) { + if (job == 5) { + if ( Equil ) { + for (i = 0; i < n; ++i) { + R1[i] = exp (R1[i]); + C1[i] = exp (C1[i]); + } + + /* Scale the distributed matrix further. 
+ A <-- diag(R1)*A*diag(C1) */ + irow = fst_row; + for (j = 0; j < m_loc; ++j) { + for (i = rowptr[j]; i < rowptr[j + 1]; ++i) { + icol = colind[i]; + a[i] *= R1[irow] * C1[icol]; +#if ( PRNTlevel>=2 ) + if (perm_r[irow] == icol) { + /* New diagonal */ if (job == 2 || job == 3) - { - if (!iam) - printf("\tsmallest diagonal %e\n", dmin); - } + dmin = SUPERLU_MIN(dmin, fabs(a[i])); else if (job == 4) - { - if (!iam) - printf("\tsum of diagonal %e\n", dsum); - } + dsum += fabs(a[i]); else if (job == 5) - { - if (!iam) - printf("\t product of diagonal %e\n", dprod); - } + dprod *= fabs(a[i]); + } #endif } - else /* use largeDiag_AWPM */ - { + ++irow; + } + + /* Multiply together the scaling factors -- + R/C from simple scheme, R1/C1 from MC64. */ + if (rowequ) + for (i = 0; i < m; ++i) R[i] *= R1[i]; + else + for (i = 0; i < m; ++i) R[i] = R1[i]; + if (colequ) + for (i = 0; i < n; ++i) C[i] *= C1[i]; + else + for (i = 0; i < n; ++i) C[i] = C1[i]; + + ScalePermstruct->DiagScale = BOTH; + rowequ = colequ = 1; + + } /* end if Equil */ + + /* Now permute global A to prepare for symbfact() */ + for (j = 0; j < n; ++j) { + for (i = colptr[j]; i < colptr[j + 1]; ++i) { + irow = rowind[i]; + rowind[i] = perm_r[irow]; + } + } + SUPERLU_FREE (R1); + SUPERLU_FREE (C1); + } else { /* job = 2,3,4 */ + for (j = 0; j < n; ++j) { + for (i = colptr[j]; i < colptr[j + 1]; ++i) + { + irow = rowind[i]; + rowind[i] = perm_r[irow]; + } /* end for i ... */ + } /* end for j ... */ + } /* end else job ... */ + } else { /* if iinfo != 0 */ + for (i = 0; i < m; ++i) perm_r[i] = i; + } +#if ( PRNTlevel>=2 ) + if (job == 2 || job == 3) { + if (!iam) + printf ("\tsmallest diagonal %e\n", dmin); + } else if (job == 4) { + if (!iam) + printf ("\tsum of diagonal %e\n", dsum); + } else if (job == 5) { + if (!iam) + printf ("\t product of diagonal %e\n", dprod); + } +#endif + } else { /* use largeDiag_AWPM */ #ifdef HAVE_COMBBLAS - c2cpp_GetAWPM(A, grid, ScalePermstruct); + c2cpp_GetAWPM(A, grid, ScalePermstruct); #else - if (iam == 0) - { - printf("CombBLAS is not available\n"); - fflush(stdout); - } -#endif - } /* end if-else options->RowPerm ... */ - - t = SuperLU_timer_() - t; - stat->utime[ROWPERM] = t; -#if (PRNTlevel >= 1) - if (!iam) - { - printf(".. LDPERM job " IFMT "\t time: %.2f\n", job, t); - fflush(stdout); - } -#endif - } /* end if Fact not SamePattern_SameRowPerm ... */ - } - else /* options->RowPerm == NOROWPERM / NATURAL */ - { - for (i = 0; i < m; ++i) - perm_r[i] = i; + if ( iam == 0 ) { + printf("CombBLAS is not available\n"); fflush(stdout); } +#endif + } /* end if-else options->RowPerm ... */ -#if (DEBUGlevel >= 2) - if (!iam) - PrintInt10("perm_r", m, perm_r); + t = SuperLU_timer_ () - t; + stat->utime[ROWPERM] = t; +#if ( PRNTlevel>=1 ) + if ( !iam ) { + printf(".. LDPERM job " IFMT "\t time: %.2f\n", job, t); + fflush(stdout); + } +#endif + } /* end if Fact not SamePattern_SameRowPerm ... */ + } else { /* options->RowPerm == NOROWPERM / NATURAL */ + for (i = 0; i < m; ++i) perm_r[i] = i; + } + +#if ( DEBUGlevel>=2 ) + if (!iam) + PrintInt10 ("perm_r", m, perm_r); +#endif + } /* end if (!factored) */ + + if (!factored || options->IterRefine) { + /* Compute norm(A), which will be used to adjust small diagonal. */ + if (notran) + *(unsigned char *) norm = '1'; + else + *(unsigned char *) norm = 'I'; + anorm = pdlangs (norm, A, grid); +#if ( PRNTlevel>=1 ) + if (!iam) { + printf (".. 
anorm %e\n", anorm); fflush(stdout); + } #endif - } /* end if (!factored) */ - - if (!factored || options->IterRefine) - { - /* Compute norm(A), which will be used to adjust small diagonal. */ - if (notran) - *(unsigned char *)norm = '1'; - else - *(unsigned char *)norm = 'I'; - anorm = pdlangs(norm, A, grid); -#if (PRNTlevel >= 1) - if (!iam) + } + + + /* ------------------------------------------------------------ + Perform the LU factorization. + ------------------------------------------------------------ */ + if (!factored) { + t = SuperLU_timer_ (); + /* + * Get column permutation vector perm_c[], according to permc_spec: + * permc_spec = NATURAL: natural ordering + * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A + * permc_spec = MMD_ATA: minimum degree on structure of A'*A + * permc_spec = METIS_AT_PLUS_A: METIS on structure of A'+A + * permc_spec = PARMETIS: parallel METIS on structure of A'+A + * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] + */ + permc_spec = options->ColPerm; + + if (parSymbFact == YES || permc_spec == PARMETIS) { + nprocs_num = grid->nprow * grid->npcol; + noDomains = (int) (pow (2, ((int) LOG2 (nprocs_num)))); + + /* create a new communicator for the first noDomains + processes in grid->comm */ + key = iam; + if (iam < noDomains) + col = 0; + else + col = MPI_UNDEFINED; + MPI_Comm_split (grid->comm, col, key, &symb_comm); + + if (permc_spec == NATURAL || permc_spec == MY_PERMC) { + if (permc_spec == NATURAL) { - printf(".. anorm %e\n", anorm); - fflush(stdout); - fflush(stdout); + for (j = 0; j < n; ++j) + perm_c[j] = j; } -#endif + if (!(sizes = intMalloc_dist (2 * noDomains))) + ABORT ("SUPERLU_MALLOC fails for sizes."); + if (!(fstVtxSep = intMalloc_dist (2 * noDomains))) + ABORT ("SUPERLU_MALLOC fails for fstVtxSep."); + for (i = 0; i < 2 * noDomains - 2; ++i) { + sizes[i] = 0; + fstVtxSep[i] = 0; + } + sizes[2 * noDomains - 2] = m; + fstVtxSep[2 * noDomains - 2] = 0; + } else if (permc_spec != PARMETIS) { + /* same as before */ + printf("{%4d,%4d}: pdgssvx3d: invalid ColPerm option when ParSymbfact is used\n", + (int) MYROW(grid->iam, grid), (int) MYCOL(grid->iam, grid)); } - - /* ------------------------------------------------------------ - Perform the LU factorization. 
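
One subtlety in the LargeDiag_MC64 path that just ended: with job = 5, dldperm_dist returns its scaling factors in logarithmic form, so R1 and C1 are exponentiated before being applied, and they compose multiplicatively with any R/C produced by the earlier LAPACK-style equilibration. That composition in isolation (plain arrays, no MPI; purely illustrative):

#include <math.h>

/* Fold MC64 (job = 5) log-scalings R1/C1 into existing factors R/C. */
static void compose_scalings(double *R, double *C, double *R1, double *C1,
                             int m, int n, int rowequ, int colequ)
{
    for (int i = 0; i < m; ++i) R1[i] = exp(R1[i]);
    for (int i = 0; i < n; ++i) C1[i] = exp(C1[i]);
    for (int i = 0; i < m; ++i) R[i] = rowequ ? R[i] * R1[i] : R1[i];
    for (int i = 0; i < n; ++i) C[i] = colequ ? C[i] * C1[i] : C1[i];
}
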
- ------------------------------------------------------------ */ - if (!factored) - { - t = SuperLU_timer_(); - /* - * Get column permutation vector perm_c[], according to permc_spec: - * permc_spec = NATURAL: natural ordering - * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A - * permc_spec = MMD_ATA: minimum degree on structure of A'*A - * permc_spec = METIS_AT_PLUS_A: METIS on structure of A'+A - * permc_spec = PARMETIS: parallel METIS on structure of A'+A - * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] - */ - permc_spec = options->ColPerm; - - if (parSymbFact == YES || permc_spec == PARMETIS) - { - nprocs_num = grid->nprow * grid->npcol; - noDomains = (int)(pow(2, ((int)LOG2(nprocs_num)))); - - /* create a new communicator for the first noDomains - processes in grid->comm */ - key = iam; - if (iam < noDomains) - col = 0; - else - col = MPI_UNDEFINED; - MPI_Comm_split(grid->comm, col, key, &symb_comm); - - if (permc_spec == NATURAL || permc_spec == MY_PERMC) - { - if (permc_spec == NATURAL) - { - for (j = 0; j < n; ++j) - perm_c[j] = j; - } - if (!(sizes = intMalloc_dist(2 * noDomains))) - ABORT("SUPERLU_MALLOC fails for sizes."); - if (!(fstVtxSep = intMalloc_dist(2 * noDomains))) - ABORT("SUPERLU_MALLOC fails for fstVtxSep."); - for (i = 0; i < 2 * noDomains - 2; ++i) - { - sizes[i] = 0; - fstVtxSep[i] = 0; - } - sizes[2 * noDomains - 2] = m; - fstVtxSep[2 * noDomains - 2] = 0; - } - else if (permc_spec != PARMETIS) - { - /* same as before */ - printf("{%4d,%4d}: pdgssvx3d: invalid ColPerm option when ParSymbfact is used\n", - (int)MYROW(grid->iam, grid), (int)MYCOL(grid->iam, grid)); - } - } /* end ... use parmetis */ - - if (permc_spec != MY_PERMC && Fact == DOFACT) - { - if (permc_spec == PARMETIS) - { - /* Get column permutation vector in perm_c. * - * This routine takes as input the distributed input matrix A * - * and does not modify it. It also allocates memory for * - * sizes[] and fstVtxSep[] arrays, that contain information * - * on the separator tree computed by ParMETIS. */ - flinfo = get_perm_c_parmetis(A, perm_r, perm_c, nprocs_num, - noDomains, &sizes, &fstVtxSep, - grid, &symb_comm); - if (flinfo > 0) - ABORT("ERROR in get perm_c parmetis."); - } - else - { - get_perm_c_dist(iam, permc_spec, &GA, perm_c); - } + } /* end ... use parmetis */ + + if (permc_spec != MY_PERMC && Fact == DOFACT) { + if (permc_spec == PARMETIS) { + /* Get column permutation vector in perm_c. * + * This routine takes as input the distributed input matrix A * + * and does not modify it. It also allocates memory for * + * sizes[] and fstVtxSep[] arrays, that contain information * + * on the separator tree computed by ParMETIS. */ + flinfo = get_perm_c_parmetis (A, perm_r, perm_c, nprocs_num, + noDomains, &sizes, &fstVtxSep, + grid, &symb_comm); + if (flinfo > 0) + ABORT ("ERROR in get perm_c parmetis."); + } else { + get_perm_c_dist (iam, permc_spec, &GA, perm_c); + } + } + + stat->utime[COLPERM] = SuperLU_timer_ () - t; + + /* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc' + (a.k.a. column etree), depending on the choice of ColPerm. + Adjust perm_c[] to be consistent with a postorder of etree. + Permute columns of A to form A*Pc'. */ + if (Fact != SamePattern_SameRowPerm) { + if (parSymbFact == NO) { + + int_t *GACcolbeg, *GACcolend, *GACrowind; + + sp_colorder (options, &GA, perm_c, etree, &GAC); + + /* Form Pc*A*Pc' to preserve the diagonal of the matrix GAC. 
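
Parallel symbolic factorization runs on a power-of-two subset of the 2D grid: noDomains = 2^floor(log2(P)) ranks take color 0 in the MPI_Comm_split above, and every other rank passes MPI_UNDEFINED and ends up with MPI_COMM_NULL. A minimal standalone version of that split (error handling omitted; make_symb_comm is a hypothetical name):

#include <mpi.h>

/* Carve the first 2^floor(log2(P)) ranks of comm into a sub-communicator. */
static MPI_Comm make_symb_comm(MPI_Comm comm)
{
    int iam, nprocs, noDomains = 1;
    MPI_Comm symb_comm;
    MPI_Comm_rank(comm, &iam);
    MPI_Comm_size(comm, &nprocs);
    while (2 * noDomains <= nprocs) noDomains *= 2;  /* 2^floor(log2(P)) */
    int color = (iam < noDomains) ? 0 : MPI_UNDEFINED;
    MPI_Comm_split(comm, color, /* key = */ iam, &symb_comm);
    return symb_comm;            /* MPI_COMM_NULL on the excluded ranks */
}
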
*/ + GACstore = (NCPformat *) GAC.Store; + GACcolbeg = GACstore->colbeg; + GACcolend = GACstore->colend; + GACrowind = GACstore->rowind; + for (j = 0; j < n; ++j) { + for (i = GACcolbeg[j]; i < GACcolend[j]; ++i) { + irow = GACrowind[i]; + GACrowind[i] = perm_c[irow]; } - - stat->utime[COLPERM] = SuperLU_timer_() - t; - - /* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc' - (a.k.a. column etree), depending on the choice of ColPerm. - Adjust perm_c[] to be consistent with a postorder of etree. - Permute columns of A to form A*Pc'. */ - if (Fact != SamePattern_SameRowPerm) - { - if (parSymbFact == NO) - { - - int_t *GACcolbeg, *GACcolend, *GACrowind; - - sp_colorder(options, &GA, perm_c, etree, &GAC); - - /* Form Pc*A*Pc' to preserve the diagonal of the matrix GAC. */ - GACstore = (NCPformat *)GAC.Store; - GACcolbeg = GACstore->colbeg; - GACcolend = GACstore->colend; - GACrowind = GACstore->rowind; - for (j = 0; j < n; ++j) - { - for (i = GACcolbeg[j]; i < GACcolend[j]; ++i) - { - irow = GACrowind[i]; - GACrowind[i] = perm_c[irow]; - } - } - - /* Perform a symbolic factorization on Pc*Pr*A*Pc' and set up - the nonzero data structures for L & U. */ -#if (PRNTlevel >= 1) - if (!iam) - { - printf(".. symbfact(): relax %4d, maxsuper %4d, fill %4d\n", - sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6)); - fflush(stdout); - } + } + + + /* Perform a symbolic factorization on Pc*Pr*A*Pc' and set up + the nonzero data structures for L & U. */ +#if ( PRNTlevel>=1 ) + if (!iam) + printf + (".. symbfact(): relax %4d, maxsuper %4d, fill %4d\n", + sp_ienv_dist (2), sp_ienv_dist (3), sp_ienv_dist (6)); #endif - t = SuperLU_timer_(); - if (!(Glu_freeable = (Glu_freeable_t *) - SUPERLU_MALLOC(sizeof(Glu_freeable_t)))) - ABORT("Malloc fails for Glu_freeable."); - - /* Every process does this. 
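
Relabeling the row indices of the already column-permuted GAC with perm_c, as the loop above does, realizes the symmetric permutation Pc*A*Pc', which keeps the (permuted) diagonal on the diagonal for symbfact(). In sketch form, on a compressed-column matrix with colbeg/colend pointers as in NCPformat:

/* In place: given A*Pc' on input, relabel rows to obtain Pc*A*Pc'. */
static void permute_rows(int n, const int *colbeg, const int *colend,
                         int *rowind, const int *perm_c)
{
    for (int j = 0; j < n; ++j)
        for (int i = colbeg[j]; i < colend[j]; ++i)
            rowind[i] = perm_c[rowind[i]];
}
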
*/ - iinfo = symbfact(options, iam, &GAC, perm_c, etree, - Glu_persist, Glu_freeable); - - stat->utime[SYMBFAC] = SuperLU_timer_() - t; - if (iinfo < 0) - { - /* Successful return */ - QuerySpace_dist(n, -iinfo, Glu_freeable, &symb_mem_usage); - -#if (PRNTlevel >= 1) - if (!iam) - { - printf("\tNo of supers %ld\n", - Glu_persist->supno[n - 1] + 1); - printf("\tSize of G(L) %ld\n", - Glu_freeable->xlsub[n]); - printf("\tSize of G(U) %ld\n", - Glu_freeable->xusub[n]); - printf("\tint %d, short %d, float %d, double %d\n", - sizeof(int_t), sizeof(short), - sizeof(float), sizeof(double)); - printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", - symb_mem_usage.for_lu * 1e-6, - symb_mem_usage.total * 1e-6, - symb_mem_usage.expansions); - fflush(stdout); - } -#endif - } - else - { - if (!iam) - { - fprintf(stderr, "symbfact() error returns %d\n", - (int)iinfo); - exit(-1); - } - } - - } /* end serial symbolic factorization */ - else /* parallel symbolic factorization */ - { - t = SuperLU_timer_(); - flinfo = - symbfact_dist(nprocs_num, noDomains, A, perm_c, perm_r, - sizes, fstVtxSep, &Pslu_freeable, - &(grid->comm), &symb_comm, - &symb_mem_usage); - stat->utime[SYMBFAC] = SuperLU_timer_() - t; - if (flinfo > 0) - ABORT("Insufficient memory for parallel symbolic factorization."); - } - - /* Destroy GA */ - if (parSymbFact == NO || options->RowPerm != NO) - Destroy_CompCol_Matrix_dist(&GA); - if (parSymbFact == NO) - Destroy_CompCol_Permuted_dist(&GAC); - - } /* end if Fact not SamePattern_SameRowPerm */ - - if (sizes) - SUPERLU_FREE(sizes); - if (fstVtxSep) - SUPERLU_FREE(fstVtxSep); - if (symb_comm != MPI_COMM_NULL) - MPI_Comm_free(&symb_comm); - - if (parSymbFact == NO || Fact == SamePattern_SameRowPerm) - { - /* Apply column permutation to the original distributed A */ - for (j = 0; j < nnz_loc; ++j) - colind[j] = perm_c[colind[j]]; - - /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. - NOTE: the row permutation Pc*Pr is applied internally in the - distribution routine. */ - t = SuperLU_timer_(); - dist_mem_use = pddistribute(Fact, n, A, ScalePermstruct, - Glu_freeable, LUstruct, grid); - stat->utime[DIST] = SuperLU_timer_() - t; - - /* Deallocate storage used in symbolic factorization. */ - if (Fact != SamePattern_SameRowPerm) - { - iinfo = symbfact_SubFree(Glu_freeable); - SUPERLU_FREE(Glu_freeable); - } + t = SuperLU_timer_ (); + if (!(Glu_freeable = (Glu_freeable_t *) + SUPERLU_MALLOC (sizeof (Glu_freeable_t)))) + ABORT ("Malloc fails for Glu_freeable."); + + /* Every process does this. */ + iinfo = symbfact (options, iam, &GAC, perm_c, etree, + Glu_persist, Glu_freeable); + + stat->utime[SYMBFAC] = SuperLU_timer_ () - t; + if (iinfo < 0) { + /* Successful return */ + QuerySpace_dist (n, -iinfo, Glu_freeable, &symb_mem_usage); + +#if ( PRNTlevel>=1 ) + if (!iam) { + printf ("\tNo of supers %ld\n", + (long) Glu_persist->supno[n - 1] + 1); + printf ("\tSize of G(L) %ld\n", (long) Glu_freeable->xlsub[n]); + printf ("\tSize of G(U) %ld\n", (long) Glu_freeable->xusub[n]); + printf ("\tint %d, short %d, float %d, double %d\n", + sizeof (int_t), sizeof (short), + sizeof (float), sizeof (double)); + printf + ("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", + symb_mem_usage.for_lu * 1e-6, + symb_mem_usage.total * 1e-6, + symb_mem_usage.expansions); } - else - { - /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. - NOTE: the row permutation Pc*Pr is applied internally in the - distribution routine. 
*/ - /* Apply column permutation to the original distributed A */ - for (j = 0; j < nnz_loc; ++j) - colind[j] = perm_c[colind[j]]; - - t = SuperLU_timer_(); - dist_mem_use = ddist_psymbtonum(Fact, n, A, ScalePermstruct, - &Pslu_freeable, LUstruct, grid); - if (dist_mem_use > 0) - ABORT("Not enough memory available for dist_psymbtonum\n"); - - stat->utime[DIST] = SuperLU_timer_() - t; +#endif + } else { + if (!iam) { + fprintf (stderr, "symbfact() error returns %d\n", + (int) iinfo); + exit (-1); } + } + + } /* end serial symbolic factorization */ + else { /* parallel symbolic factorization */ + t = SuperLU_timer_ (); + flinfo = + symbfact_dist (nprocs_num, noDomains, A, perm_c, perm_r, + sizes, fstVtxSep, &Pslu_freeable, + &(grid->comm), &symb_comm, + &symb_mem_usage); + stat->utime[SYMBFAC] = SuperLU_timer_ () - t; + if (flinfo > 0) + ABORT + ("Insufficient memory for parallel symbolic factorization."); + } - /*if (!iam) printf ("\tDISTRIBUTE time %8.2f\n", stat->utime[DIST]); */ - } /* end if not Factored */ - } /* end if process layer 0 */ - - trf3Dpartition_t *trf3Dpartition; - - /* Perform numerical factorization in parallel on all process layers.*/ - if (!factored) - { + /* Destroy GA */ + if (parSymbFact == NO || options->RowPerm != NO) + Destroy_CompCol_Matrix_dist (&GA); + if (parSymbFact == NO) + Destroy_CompCol_Permuted_dist (&GAC); + + } /* end if Fact not SamePattern_SameRowPerm */ + + if (sizes) + SUPERLU_FREE (sizes); + if (fstVtxSep) + SUPERLU_FREE (fstVtxSep); + if (symb_comm != MPI_COMM_NULL) + MPI_Comm_free (&symb_comm); + + if (parSymbFact == NO || Fact == SamePattern_SameRowPerm) { + /* Apply column permutation to the original distributed A */ + for (j = 0; j < nnz_loc; ++j) + colind[j] = perm_c[colind[j]]; + + /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. + NOTE: the row permutation Pc*Pr is applied internally in the + distribution routine. */ + t = SuperLU_timer_ (); + dist_mem_use = pddistribute (Fact, n, A, ScalePermstruct, + Glu_freeable, LUstruct, grid); + stat->utime[DIST] = SuperLU_timer_ () - t; + + /* Deallocate storage used in symbolic factorization. */ + if (Fact != SamePattern_SameRowPerm) + { + iinfo = symbfact_SubFree (Glu_freeable); + SUPERLU_FREE (Glu_freeable); + } + } else { + /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. + NOTE: the row permutation Pc*Pr is applied internally in the + distribution routine. 
*/ + /* Apply column permutation to the original distributed A */ + for (j = 0; j < nnz_loc; ++j) + colind[j] = perm_c[colind[j]]; + + t = SuperLU_timer_ (); + dist_mem_use = ddist_psymbtonum (Fact, n, A, ScalePermstruct, + &Pslu_freeable, LUstruct, grid); + if (dist_mem_use > 0) + ABORT ("Not enough memory available for dist_psymbtonum\n"); + + stat->utime[DIST] = SuperLU_timer_ () - t; + } + + /*if (!iam) printf ("\tDISTRIBUTE time %8.2f\n", stat->utime[DIST]); */ + } /* end if not Factored */ + } /* end if process layer 0 */ - /* send the data across all the layers */ - MPI_Bcast(&m, 1, mpi_int_t, 0, grid3d->zscp.comm); - MPI_Bcast(&n, 1, mpi_int_t, 0, grid3d->zscp.comm); - MPI_Bcast(&anorm, 1, MPI_DOUBLE, 0, grid3d->zscp.comm); + trf3Dpartition_t* trf3Dpartition; - /* send the LU structure to all the grids */ - dp3dScatter(n, LUstruct, grid3d); + /* Perform numerical factorization in parallel on all process layers.*/ + if ( !factored ) { - int_t nsupers = getNsupers(n, LUstruct->Glu_persist); - trf3Dpartition = dinitTrf3Dpartition(nsupers, options, LUstruct, grid3d); + /* send the data across all the layers */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid3d->zscp.comm); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid3d->zscp.comm); + MPI_Bcast( &anorm, 1, MPI_DOUBLE, 0, grid3d->zscp.comm); + + /* send the LU structure to all the grids */ + dp3dScatter(n, LUstruct, grid3d); - SCT_t *SCT = (SCT_t *)SUPERLU_MALLOC(sizeof(SCT_t)); - SCT_init(SCT); + int_t nsupers = getNsupers(n, LUstruct->Glu_persist); + trf3Dpartition = dinitTrf3Dpartition(nsupers, options, LUstruct, grid3d); -#if (PRNTlevel >= 1) - if (iam == 0) - { - printf("after 3D initialization.\n"); - fflush(stdout); - } + SCT_t *SCT = (SCT_t *) SUPERLU_MALLOC(sizeof(SCT_t)); + SCT_init(SCT); + +#if ( PRNTlevel>=1 ) + if (grid3d->iam == 0) { + printf("after 3D initialization.\n"); fflush(stdout); + } #endif - t = SuperLU_timer_(); - - /*factorize in grid 1*/ - // if(grid3d->zscp.Iam) - - pdgstrf3d(options, m, n, anorm, trf3Dpartition, SCT, LUstruct, - grid3d, stat, info); - stat->utime[FACT] = SuperLU_timer_() - t; + t = SuperLU_timer_ (); - double tgather = SuperLU_timer_(); + /*factorize in grid 1*/ + // if(grid3d->zscp.Iam) - dgatherAllFactoredLU(trf3Dpartition, LUstruct, grid3d, SCT); - - SCT->gatherLUtimer += SuperLU_timer_() - tgather; - /*print stats for bottom grid*/ - -#if (PRNTlevel >= 1) - if (!grid3d->zscp.Iam) - { - SCT_print(grid, SCT); - SCT_print3D(grid3d, SCT); - } - SCT_printComm3D(grid3d, SCT); - - /*print memory usage*/ - d3D_printMemUse(trf3Dpartition, LUstruct, grid3d); + pdgstrf3d (options, m, n, anorm, trf3Dpartition, SCT, LUstruct, + grid3d, stat, info); + stat->utime[FACT] = SuperLU_timer_ () - t; + + double tgather = SuperLU_timer_(); + + dgatherAllFactoredLU(trf3Dpartition, LUstruct, grid3d, SCT); - /*print forest weight and costs*/ - printForestWeightCost(trf3Dpartition->sForests, SCT, grid3d); - /*reduces stat from all the layers*/ + SCT->gatherLUtimer += SuperLU_timer_() - tgather; + /*print stats for bottom grid*/ + +#if ( PRNTlevel>=1 ) + if (!grid3d->zscp.Iam) + { + SCT_print(grid, SCT); + SCT_print3D(grid3d, SCT); + } + SCT_printComm3D(grid3d, SCT); + + /*print memory usage*/ + d3D_printMemUse( trf3Dpartition, LUstruct, grid3d ); + + /*print forest weight and costs*/ + printForestWeightCost(trf3Dpartition->sForests, SCT, grid3d); + /*reduces stat from all the layers*/ #endif - dDestroy_trf3Dpartition(trf3Dpartition, grid3d); - SCT_free(SCT); - - } /* end if not Factored */ - - if (grid3d->zscp.Iam == 0) // only 
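
Only z-layer 0 ran the preprocessing above, so before the 3D factorization its few scalar results are broadcast over grid3d->zscp.comm and the LU skeleton is scattered to the other layers by dp3dScatter. The scalar broadcast step in isolation (types reduced to int/double for the sketch; the real code uses mpi_int_t because int_t may be 64-bit):

#include <mpi.h>

/* Share the preprocessing results of z-layer 0 with all layers. */
static void bcast_factor_inputs(int *m, int *n, double *anorm, MPI_Comm zcomm)
{
    MPI_Bcast(m, 1, MPI_INT, 0, zcomm);
    MPI_Bcast(n, 1, MPI_INT, 0, zcomm);
    MPI_Bcast(anorm, 1, MPI_DOUBLE, 0, zcomm);
}
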
process layer 0 - { - if (!factored) - { - if (options->PrintStat) - { - int_t TinyPivots; - float for_lu, total, max, avg, temp; - - dQuerySpace_dist(n, LUstruct, grid, stat, &num_mem_usage); - - if (parSymbFact == TRUE) - { - /* The memory used in the redistribution routine - includes the memory used for storing the symbolic - structure and the memory allocated for numerical factorization */ - temp = SUPERLU_MAX(symb_mem_usage.total, -dist_mem_use); - if (options->RowPerm != NO) - temp = SUPERLU_MAX(temp, GA_mem_use); - } - else - { - temp = SUPERLU_MAX(symb_mem_usage.total + GA_mem_use, /* symbfact step */ - symb_mem_usage.for_lu + dist_mem_use + num_mem_usage.for_lu /* distribution step */ + dDestroy_trf3Dpartition(trf3Dpartition, grid3d); + SCT_free(SCT); + + } /* end if not Factored */ + + if ( grid3d->zscp.Iam == 0 ) { // only process layer 0 + if (!factored) { + if (options->PrintStat) { + int_t TinyPivots; + float for_lu, total, max, avg, temp; + + dQuerySpace_dist (n, LUstruct, grid, stat, &num_mem_usage); + + if (parSymbFact == TRUE) { + /* The memory used in the redistribution routine + includes the memory used for storing the symbolic + structure and the memory allocated for numerical factorization */ + temp = SUPERLU_MAX (symb_mem_usage.total, -dist_mem_use); + if (options->RowPerm != NO) + temp = SUPERLU_MAX (temp, GA_mem_use); + } + else { + temp = SUPERLU_MAX (symb_mem_usage.total + GA_mem_use, /* symbfact step */ + symb_mem_usage.for_lu + dist_mem_use + num_mem_usage.for_lu /* distribution step */ ); - } - - temp = SUPERLU_MAX(temp, num_mem_usage.total); - - MPI_Reduce(&temp, &max, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm); - MPI_Reduce(&temp, &avg, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); - MPI_Allreduce(&stat->TinyPivots, &TinyPivots, 1, mpi_int_t, - MPI_SUM, grid->comm); - stat->TinyPivots = TinyPivots; - - MPI_Reduce(&num_mem_usage.for_lu, &for_lu, - 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); - MPI_Reduce(&num_mem_usage.total, &total, - 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); - - if (!iam) - { - printf("\tNUMfact space (MB) sum(procs): L\\U\t%.2f\tall\t%.2f\n", - for_lu * 1e-6, total * 1e-6); - printf("\tTotal highmark (MB): " - "All\t%.2f\tAvg\t%.2f\tMax\t%.2f\n", - avg * 1e-6, - avg / grid->nprow / grid->npcol * 1e-6, max * 1e-6); - printf("**************************************************\n"); - fflush(stdout); - } - } - - } /* end if not Factored */ - - /* ------------------------------------------------------------ - Compute the solution matrix X. 
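
The statistics block above reports the per-process memory high-water mark two ways: a MAX reduction isolates the worst process, and a SUM reduction divided by the process count gives the average. The same pattern in isolation (report_highmark is a hypothetical helper; values are in bytes, printed as MB):

#include <mpi.h>
#include <stdio.h>

static void report_highmark(float temp, MPI_Comm comm)
{
    int iam, nprocs;
    float max, sum;
    MPI_Comm_rank(comm, &iam);
    MPI_Comm_size(comm, &nprocs);
    MPI_Reduce(&temp, &max, 1, MPI_FLOAT, MPI_MAX, 0, comm);
    MPI_Reduce(&temp, &sum, 1, MPI_FLOAT, MPI_SUM, 0, comm);
    if (iam == 0)
        printf("highmark (MB): avg %.2f  max %.2f\n",
               sum / nprocs * 1e-6, max * 1e-6);
}
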
- ------------------------------------------------------------ */ - if (nrhs) - { - if (!(b_work = doubleMalloc_dist(n))) - ABORT("Malloc fails for b_work[]"); - - /* ------------------------------------------------------ - Scale the right-hand side if equilibration was performed - ------------------------------------------------------*/ - if (notran) - { - if (rowequ) - { - - b_col = B; - - for (j = 0; j < nrhs; ++j) - { - irow = fst_row; - for (i = 0; i < m_loc; ++i) - { - b_col[i] *= R[irow]; - ++irow; - } - b_col += ldb; - } - } - } - else if (colequ) - { + } + temp = SUPERLU_MAX (temp, num_mem_usage.total); + + MPI_Reduce (&temp, &max, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm); + MPI_Reduce (&temp, &avg, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); + MPI_Allreduce (&stat->TinyPivots, &TinyPivots, 1, mpi_int_t, + MPI_SUM, grid->comm); + stat->TinyPivots = TinyPivots; + + MPI_Reduce (&num_mem_usage.for_lu, &for_lu, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); + MPI_Reduce (&num_mem_usage.total, &total, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); + + if (!iam) { + printf("\tNUMfact space (MB) sum(procs): L\\U\t%.2f\tall\t%.2f\n", + for_lu * 1e-6, total * 1e-6); + printf ("\tTotal highmark (MB): " + "All\t%.2f\tAvg\t%.2f\tMax\t%.2f\n", avg * 1e-6, + avg / grid->nprow / grid->npcol * 1e-6, max * 1e-6); + printf("**************************************************\n"); + fflush(stdout); + } + } + + } /* end if not Factored */ + + /* ------------------------------------------------------------ + Compute the solution matrix X. + ------------------------------------------------------------ */ + if (nrhs) { + if (!(b_work = doubleMalloc_dist (n))) + ABORT ("Malloc fails for b_work[]"); + + /* ------------------------------------------------------ + Scale the right-hand side if equilibration was performed + ------------------------------------------------------*/ + if (notran) + { + if (rowequ) + { b_col = B; - for (j = 0; j < nrhs; ++j) - { + { irow = fst_row; for (i = 0; i < m_loc; ++i) - { - b_col[i] *= C[irow]; + { + b_col[i] *= R[irow]; ++irow; - } + } b_col += ldb; - } - } - - /* Save a copy of the right-hand side. */ - ldx = ldb; - if (!(X = doubleMalloc_dist(((size_t)ldx) * nrhs))) - ABORT("Malloc fails for X[]"); - x_col = X; + } + } + } + else if (colequ) + { b_col = B; for (j = 0; j < nrhs; ++j) - { + { + irow = fst_row; for (i = 0; i < m_loc; ++i) - x_col[i] = b_col[i]; - x_col += ldx; + { + b_col[i] *= C[irow]; + ++irow; + } b_col += ldb; - } + } + } + + /* Save a copy of the right-hand side. */ + ldx = ldb; + if (!(X = doubleMalloc_dist (((size_t) ldx) * nrhs))) + ABORT ("Malloc fails for X[]"); + x_col = X; + b_col = B; + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < m_loc; ++i) x_col[i] = b_col[i]; + x_col += ldx; + b_col += ldb; + } - /* ------------------------------------------------------ - Solve the linear system. - ------------------------------------------------------*/ - if (options->SolveInitialized == NO) /* First time */ - /* Inside this routine, SolveInitialized is set to YES. - For repeated call to pdgssvx3d(), no need to re-initialilze - the Solve data & communication structures, unless a new - factorization with Fact == DOFACT or SamePattern is asked for. */ - { - dSolveInit(options, A, perm_r, perm_c, nrhs, LUstruct, - grid, SOLVEstruct); - } - stat->utime[SOLVE] = 0.0; + /* ------------------------------------------------------ + Solve the linear system. 
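
Because the factors were computed for diag(R)*A*diag(C), the right-hand side must be scaled consistently before the triangular solves: by R for the non-transposed system (the rowequ branch above), by C for the transposed one, indexing the scale vector with the global row number fst_row + i. The non-transposed case for one RHS column, in isolation:

/* b <- diag(R) * b over the local m_loc rows starting at global row fst_row. */
static void scale_rhs_rows(double *b, int m_loc, int fst_row, const double *R)
{
    int irow = fst_row;
    for (int i = 0; i < m_loc; ++i, ++irow)
        b[i] *= R[irow];
}
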
+ ------------------------------------------------------*/
+ if (options->SolveInitialized == NO) /* First time */
+ /* Inside this routine, SolveInitialized is set to YES.
+ For repeated call to pdgssvx3d(), no need to re-initialize
+ the Solve data & communication structures, unless a new
+ factorization with Fact == DOFACT or SamePattern is asked for. */
+ {
+ dSolveInit (options, A, perm_r, perm_c, nrhs, LUstruct,
+ grid, SOLVEstruct);
+ }
+ stat->utime[SOLVE] = 0.0;
#if 0 // Sherry: the following interface is needed by 3D trisolve.
- pdgstrs_vecpar (n, LUstruct, ScalePermstruct, grid, X, m_loc,
- fst_row, ldb, nrhs, SOLVEstruct, stat, info);
+ pdgstrs_vecpar (n, LUstruct, ScalePermstruct, grid, X, m_loc,
+ fst_row, ldb, nrhs, SOLVEstruct, stat, info);
#else
- pdgstrs(n, LUstruct, ScalePermstruct, grid, X, m_loc,
- fst_row, ldb, nrhs, SOLVEstruct, stat, info);
-
- //PrintDouble5("after pdgstrs X", ldb, X);
+ pdgstrs(n, LUstruct, ScalePermstruct, grid, X, m_loc,
+ fst_row, ldb, nrhs, SOLVEstruct, stat, info);
#endif
- /* ------------------------------------------------------------
- Use iterative refinement to improve the computed solution and
- compute error bounds and backward error estimates for it.
- ------------------------------------------------------------ */
- if (options->IterRefine)
- {
- /* Improve the solution by iterative refinement. */
- int_t *it, *colind_gsmv = SOLVEstruct->A_colind_gsmv;
- dSOLVEstruct_t *SOLVEstruct1; /* Used by refinement */
-
- t = SuperLU_timer_();
- if (options->RefineInitialized == NO || Fact == DOFACT)
- {
- /* All these cases need to re-initialize gsmv structure */
- if (options->RefineInitialized)
- pdgsmv_finalize(SOLVEstruct->gsmv_comm);
- pdgsmv_init(A, SOLVEstruct->row_to_proc, grid,
- SOLVEstruct->gsmv_comm);
-
- /* Save a copy of the transformed local col indices
- in colind_gsmv[]. */
- if (colind_gsmv)
- SUPERLU_FREE(colind_gsmv);
- if (!(it = intMalloc_dist(nnz_loc)))
- ABORT("Malloc fails for colind_gsmv[]");
- colind_gsmv = SOLVEstruct->A_colind_gsmv = it;
- for (i = 0; i < nnz_loc; ++i)
- colind_gsmv[i] = colind[i];
- options->RefineInitialized = YES;
- }
- else if (Fact == SamePattern || Fact == SamePattern_SameRowPerm)
- {
- double at;
- int_t k, jcol, p;
- /* Swap to beginning the part of A corresponding to the
- local part of X, as was done in pdgsmv_init() */
- for (i = 0; i < m_loc; ++i) /* Loop through each row */
- {
- k = rowptr[i];
- for (j = rowptr[i]; j < rowptr[i + 1]; ++j)
- {
- jcol = colind[j];
- p = SOLVEstruct->row_to_proc[jcol];
- if (p == iam)
- {
- /* Local */
- at = a[k];
- a[k] = a[j];
- a[j] = at;
- ++k;
- }
- }
- }
-
- /* Re-use the local col indices of A obtained from the
- previous call to pdgsmv_init() */
- for (i = 0; i < nnz_loc; ++i)
- colind[i] = colind_gsmv[i];
- }
-
- if (nrhs == 1)
- {
- /* Use the existing solve structure */
- SOLVEstruct1 = SOLVEstruct;
- }
- else
- {
- /* For nrhs > 1, since refinement is performed for RHS
- one at a time, the communication structure for pdgstrs
- is different than the solve with nrhs RHS.
- So we use SOLVEstruct1 for the refinement step.
- */ - if (!(SOLVEstruct1 = (dSOLVEstruct_t *) - SUPERLU_MALLOC(sizeof(dSOLVEstruct_t)))) - ABORT("Malloc fails for SOLVEstruct1"); - /* Copy the same stuff */ - SOLVEstruct1->row_to_proc = SOLVEstruct->row_to_proc; - SOLVEstruct1->inv_perm_c = SOLVEstruct->inv_perm_c; - SOLVEstruct1->num_diag_procs = SOLVEstruct->num_diag_procs; - SOLVEstruct1->diag_procs = SOLVEstruct->diag_procs; - SOLVEstruct1->diag_len = SOLVEstruct->diag_len; - SOLVEstruct1->gsmv_comm = SOLVEstruct->gsmv_comm; - SOLVEstruct1->A_colind_gsmv = SOLVEstruct->A_colind_gsmv; - - /* Initialize the *gstrs_comm for 1 RHS. */ - if (!(SOLVEstruct1->gstrs_comm = (pxgstrs_comm_t *) - SUPERLU_MALLOC(sizeof(pxgstrs_comm_t)))) - ABORT("Malloc fails for gstrs_comm[]"); - pdgstrs_init(n, m_loc, 1, fst_row, perm_r, perm_c, grid, - Glu_persist, SOLVEstruct1); - } - - pdgsrfs(n, A, anorm, LUstruct, ScalePermstruct, grid, - B, ldb, X, ldx, nrhs, SOLVEstruct1, berr, stat, info); - - /* Deallocate the storage associated with SOLVEstruct1 */ - if (nrhs > 1) - { - pxgstrs_finalize(SOLVEstruct1->gstrs_comm); - SUPERLU_FREE(SOLVEstruct1); - } - - stat->utime[REFINE] = SuperLU_timer_() - t; + /* ------------------------------------------------------------ + Use iterative refinement to improve the computed solution and + compute error bounds and backward error estimates for it. + ------------------------------------------------------------ */ + if (options->IterRefine) + { + /* Improve the solution by iterative refinement. */ + int_t *it, *colind_gsmv = SOLVEstruct->A_colind_gsmv; + dSOLVEstruct_t *SOLVEstruct1; /* Used by refinement */ + + t = SuperLU_timer_ (); + if (options->RefineInitialized == NO || Fact == DOFACT) { + /* All these cases need to re-initialize gsmv structure */ + if (options->RefineInitialized) + pdgsmv_finalize (SOLVEstruct->gsmv_comm); + pdgsmv_init (A, SOLVEstruct->row_to_proc, grid, + SOLVEstruct->gsmv_comm); + + /* Save a copy of the transformed local col indices + in colind_gsmv[]. */ + if (colind_gsmv) SUPERLU_FREE (colind_gsmv); + if (!(it = intMalloc_dist (nnz_loc))) + ABORT ("Malloc fails for colind_gsmv[]"); + colind_gsmv = SOLVEstruct->A_colind_gsmv = it; + for (i = 0; i < nnz_loc; ++i) colind_gsmv[i] = colind[i]; + options->RefineInitialized = YES; } - - /* Permute the solution matrix B <= Pc'*X. */ - pdPermute_Dense_Matrix(fst_row, m_loc, SOLVEstruct->row_to_proc, - SOLVEstruct->inv_perm_c, - X, ldx, B, ldb, nrhs, grid); -#if (DEBUGlevel >= 2) - printf("\n (%d) .. After pdPermute_Dense_Matrix(): b =\n", iam); - for (i = 0; i < m_loc; ++i) - printf("\t(%d)\t%4d\t%.10f\n", iam, i + fst_row, B[i]); -#endif - - /* Transform the solution matrix X to a solution of the original - system before the equilibration. 
*/ - if (notran) - { - if (colequ) - { - b_col = B; - for (j = 0; j < nrhs; ++j) - { - irow = fst_row; - for (i = 0; i < m_loc; ++i) - { - b_col[i] *= C[irow]; - ++irow; - } - b_col += ldb; - } - } + else if (Fact == SamePattern || Fact == SamePattern_SameRowPerm) { + double at; + int_t k, jcol, p; + /* Swap to beginning the part of A corresponding to the + local part of X, as was done in pdgsmv_init() */ + for (i = 0; i < m_loc; ++i) { /* Loop through each row */ + k = rowptr[i]; + for (j = rowptr[i]; j < rowptr[i + 1]; ++j) + { + jcol = colind[j]; + p = SOLVEstruct->row_to_proc[jcol]; + if (p == iam) + { /* Local */ + at = a[k]; + a[k] = a[j]; + a[j] = at; + ++k; + } + } + } + + /* Re-use the local col indices of A obtained from the + previous call to pdgsmv_init() */ + for (i = 0; i < nnz_loc; ++i) + colind[i] = colind_gsmv[i]; } - else if (rowequ) - { + + if (nrhs == 1) + { /* Use the existing solve structure */ + SOLVEstruct1 = SOLVEstruct; + } + else { + /* For nrhs > 1, since refinement is performed for RHS + one at a time, the communication structure for pdgstrs + is different than the solve with nrhs RHS. + So we use SOLVEstruct1 for the refinement step. + */ + if (!(SOLVEstruct1 = (dSOLVEstruct_t *) + SUPERLU_MALLOC(sizeof(dSOLVEstruct_t)))) + ABORT ("Malloc fails for SOLVEstruct1"); + /* Copy the same stuff */ + SOLVEstruct1->row_to_proc = SOLVEstruct->row_to_proc; + SOLVEstruct1->inv_perm_c = SOLVEstruct->inv_perm_c; + SOLVEstruct1->num_diag_procs = SOLVEstruct->num_diag_procs; + SOLVEstruct1->diag_procs = SOLVEstruct->diag_procs; + SOLVEstruct1->diag_len = SOLVEstruct->diag_len; + SOLVEstruct1->gsmv_comm = SOLVEstruct->gsmv_comm; + SOLVEstruct1->A_colind_gsmv = SOLVEstruct->A_colind_gsmv; + + /* Initialize the *gstrs_comm for 1 RHS. */ + if (!(SOLVEstruct1->gstrs_comm = (pxgstrs_comm_t *) + SUPERLU_MALLOC (sizeof (pxgstrs_comm_t)))) + ABORT ("Malloc fails for gstrs_comm[]"); + pdgstrs_init (n, m_loc, 1, fst_row, perm_r, perm_c, grid, + Glu_persist, SOLVEstruct1); + } + + pdgsrfs (n, A, anorm, LUstruct, ScalePermstruct, grid, + B, ldb, X, ldx, nrhs, SOLVEstruct1, berr, stat, info); + + /* Deallocate the storage associated with SOLVEstruct1 */ + if (nrhs > 1) + { + pxgstrs_finalize (SOLVEstruct1->gstrs_comm); + SUPERLU_FREE (SOLVEstruct1); + } + + stat->utime[REFINE] = SuperLU_timer_ () - t; + } + + /* Permute the solution matrix B <= Pc'*X. */ + pdPermute_Dense_Matrix (fst_row, m_loc, SOLVEstruct->row_to_proc, + SOLVEstruct->inv_perm_c, + X, ldx, B, ldb, nrhs, grid); +#if ( DEBUGlevel>=2 ) + printf ("\n (%d) .. After pdPermute_Dense_Matrix(): b =\n", iam); + for (i = 0; i < m_loc; ++i) + printf ("\t(%d)\t%4d\t%.10f\n", iam, i + fst_row, B[i]); +#endif + + /* Transform the solution matrix X to a solution of the original + system before the equilibration. */ + if (notran) + { + if (colequ) + { b_col = B; for (j = 0; j < nrhs; ++j) - { + { irow = fst_row; for (i = 0; i < m_loc; ++i) - { - b_col[i] *= R[irow]; + { + b_col[i] *= C[irow]; ++irow; - } + } b_col += ldb; - } - } - - SUPERLU_FREE(b_work); - SUPERLU_FREE(X); - - } /* end if nrhs != 0 */ - -#if (PRNTlevel >= 1) - if (!iam) - printf(".. DiagScale = %d\n", ScalePermstruct->DiagScale); + } + } + } + else if (rowequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + b_col[i] *= R[irow]; + ++irow; + } + b_col += ldb; + } + } + + SUPERLU_FREE (b_work); + SUPERLU_FREE (X); + + } /* end if nrhs != 0 */ + +#if ( PRNTlevel>=1 ) + if (!iam) + printf (".. 
DiagScale = %d\n", ScalePermstruct->DiagScale);
 #endif
-
- /* Deallocate R and/or C if it was not used. */
- if (Equil && Fact != SamePattern_SameRowPerm)
- {
- switch (ScalePermstruct->DiagScale)
- {
- case NOEQUIL:
- SUPERLU_FREE(R);
- SUPERLU_FREE(C);
- break;
- case ROW:
- SUPERLU_FREE(C);
- break;
- case COL:
- SUPERLU_FREE(R);
- break;
- default: break;
- }
+
+ /* Deallocate R and/or C if it was not used. */
+ if (Equil && Fact != SamePattern_SameRowPerm)
+ {
+ switch (ScalePermstruct->DiagScale) {
+ case NOEQUIL:
+ SUPERLU_FREE (R);
+ SUPERLU_FREE (C);
+ break;
+ case ROW:
+ SUPERLU_FREE (C);
+ break;
+ case COL:
+ SUPERLU_FREE (R);
+ break;
+ default: break;
 }
- } /* process layer 0 done solve */
-
-#ifdef NRFRMT
- dScatter_B3d(A3d, grid3d);
+ }
+#if 0
+ if (!factored && Fact != SamePattern_SameRowPerm && !parSymbFact)
+ Destroy_CompCol_Permuted_dist (&GAC);
 #endif
- // double *B, int ldb, int nrhs, double *B2d,
- // gridinfo3d_t *grid3d);
-
- // Sherry comment:
- // Now, B <=> B2d, and is filled with the solution X
- // B3d is the saved pointer of the B on input
- // Need the following code:
- // - scatter the solution from 2D back to 3D: {B2d,ldb} -> {B3d,ldb3d}
- // (can we reuse b_count[] and b_disp[] already computed in 'Gather' routine?)
+ } /* process layer 0 done solve */
- A->Store = Astore3d; // restore Astore to 3D
+#ifdef NRFRMT
+ /* Scatter the solution from 2D grid_0 to 3D grid */
+ dScatter_B3d(A3d, grid3d);
+ A->Store = Astore3d; // restore Astore to 3D
+
 /* free A2d and B2d, which are allocated only in 2D layer Grid_0 */
- NRformat_loc *A2d = A3d->A_nfmt;
+ NRformat_loc *A2d = A3d->A_nfmt;
 if (grid3d->zscp.Iam == 0) {
-
 SUPERLU_FREE( A2d->rowptr ); SUPERLU_FREE( A2d->colind ); SUPERLU_FREE( A2d->nzval );
-
- SUPERLU_FREE(A3d->B2d);
-
+ SUPERLU_FREE( A3d->B2d );
 }
 SUPERLU_FREE( A2d ); // free 2D structure
- SUPERLU_FREE(A3d); // free 3D structure
-#if (DEBUGlevel >= 1)
- CHECK_MALLOC(iam, "Exit pdgssvx3d()");
+ SUPERLU_FREE( A3d ); // free 3D structure
 #endif
+
+#if ( DEBUGlevel>=1 )
+ CHECK_MALLOC (iam, "Exit pdgssvx3d()");
+#endif
+
 }
diff --git a/SRC/pdgstrf.c b/SRC/pdgstrf.c
index 1d60904d..dae6bc22 100644
--- a/SRC/pdgstrf.c
+++ b/SRC/pdgstrf.c
@@ -129,7 +129,7 @@ at the top-level directory.
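
Aside (not part of the patch): the pdgssvx3d.c hunks above reduce each rank's memory high-water mark twice, once with MPI_MAX and once with MPI_SUM, so rank 0 can report the total, the per-process average, and the maximum. A minimal standalone sketch of that pattern, assuming only the 2D gridinfo_t declared in superlu_defs.h:

    #include <stdio.h>
    #include "superlu_defs.h"

    /* Reduce a per-rank high-water mark `temp` (in bytes) to rank 0
       of the 2D grid and print total / average / maximum in MB. */
    static void report_highmark(float temp, gridinfo_t *grid)
    {
        float max, sum;
        MPI_Reduce(&temp, &max, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm);
        MPI_Reduce(&temp, &sum, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm);
        if (grid->iam == 0) {
            int nprocs = grid->nprow * grid->npcol;
            printf("\tTotal highmark (MB): All\t%.2f\tAvg\t%.2f\tMax\t%.2f\n",
                   sum * 1e-6, sum / nprocs * 1e-6, max * 1e-6);
            fflush(stdout);
        }
    }

Note that the hunk above stores the MPI_SUM result in a variable named avg and divides by nprow * npcol only at print time; the sketch just makes that step explicit.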
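Aside (not part of the patch): the NRFRMT block above is the tail end of a gather/solve/scatter handoff: {A, B} are gathered from the 3D grid onto 2D grid-0, the factorization and solve run on process layer 0 only, and dScatter_B3d() returns the solution to the 3D grid. A sketch of the overall shape follows; the ldb and nrhs arguments to dGatherNRformat_loc3d() are an assumption based on its declaration (its prototype is split across the superlu_ddefs.h hunk later in this patch):

    #include "superlu_ddefs.h"

    static void gather_solve_scatter_sketch(SuperMatrix *A, double *B,
                                            int ldb, int nrhs,
                                            gridinfo3d_t *grid3d)
    {
        NRformat_loc *Astore3d = (NRformat_loc *) A->Store; /* save 3D store */

        /* Gather the 3D-distributed {A, B} onto 2D grid-0; the gathered
           right-hand side lives in A3d->B2d. */
        NRformat_loc3d *A3d = dGatherNRformat_loc3d(Astore3d, B, ldb, nrhs,
                                                    grid3d);
        if (grid3d->zscp.Iam == 0) {
            /* ... factor and solve on layer 0 only, overwriting A3d->B2d
               with the solution, as pdgssvx3d() does above ... */
        }

        dScatter_B3d(A3d, grid3d);  /* solution back to the 3D grid */
        A->Store = Astore3d;        /* restore the 3D storage pointer */
    }
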
// #define SUPERNODE_PROFILE /* - Name : BAELINE + Name : BASELINE Purpose : baseline to compare performance against Overhead : NA : this won't be used for running experiments */ @@ -767,13 +767,14 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm, SUPERLU_MAX (max_row_size * num_threads * ldt, get_max_buffer_size ()); */ -#ifdef GPU_ACC +#ifdef GPU_ACC /*-------- use GPU --------*/ int cublas_nb = get_cublas_nb(); // default 64 - int nstreams = get_num_cuda_streams (); + int nstreams = get_num_cuda_streams (); // default 8 - int_t buffer_size = SUPERLU_MAX(max_row_size*nstreams*cublas_nb,get_max_buffer_size()); + int_t buffer_size = SUPERLU_MAX(max_row_size * nstreams * cublas_nb, + get_max_buffer_size()); /* array holding last column blk for each partition, - used in SchCompUdt--CUDA.c */ + used in SchCompUdt-cuda.c */ #if 0 int *stream_end_col = (int_t *) _mm_malloc (sizeof (int_t) * nstreams,64); #else @@ -783,8 +784,9 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm, #else /* not to use GPU */ int Threads_per_process = get_thread_per_process(); - int_t buffer_size = SUPERLU_MAX(max_row_size*Threads_per_process*ldt,get_max_buffer_size()); -#endif /* end ifdef GPU_ACC */ + int_t buffer_size = SUPERLU_MAX(max_row_size * Threads_per_process * ldt, + get_max_buffer_size()); +#endif /* end ifdef GPU_ACC -----------*/ int_t max_ncols = 0; #if 0 @@ -811,8 +813,12 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm, bigV = NULL; #if ( PRNTlevel>=1 ) - if(!iam) printf("\t.. GEMM buffer size: max_row_size X max_ncols = %d x " IFMT "\n", - max_row_size, max_ncols); + if(!iam) { + printf("\t.. MAX_BUFFER_SIZE " IFMT " set for GPU\n", get_max_buffer_size()); + printf("\t.. N_GEMM: " IFMT " flops of GEMM done on CPU (1st block always on CPU)\n", sp_ienv_dist(7)); + printf("\t.. GEMM buffer size: max_row_size X max_ncols = %d x " IFMT "\n", + max_row_size, max_ncols); + } printf("[%d].. BIG U size " IFMT " (on CPU)\n", iam, bigu_size); fflush(stdout); #endif @@ -827,17 +833,20 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm, #endif #if ( PRNTlevel>=1 ) - printf("[%d].. BIG V size %d (on CPU), dC buffer_size %d (on GPU)\n", iam, bigv_size, buffer_size); + printf("[%d].. 
BIG V size " IFMT " (on CPU), dC buffer_size " IFMT " (on GPU)\n", + iam, bigv_size, buffer_size); fflush(stdout); #endif if ( checkCuda(cudaHostAlloc((void**)&bigV, bigv_size * sizeof(double) ,cudaHostAllocDefault)) ) ABORT("Malloc fails for dgemm buffer V"); - DisplayHeader(); - #if ( PRNTlevel>=1 ) - printf(" Starting with %d Cuda Streams \n",nstreams ); + if ( iam==0 ) { + DisplayHeader(); + printf(" Starting with %d Cuda Streams \n",nstreams ); + fflush(stdout); + } #endif cublasHandle_t *handle; @@ -879,10 +888,11 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm, return 1; } - stat->gpu_buffer += ( max_row_size * sp_ienv_dist(3) - + bigu_size + buffer_size ) * dword; + stat->gpu_buffer += dword * ( max_row_size * sp_ienv_dist(3) // dA + + bigu_size // dB + + buffer_size ); // dC -#else /*-- not to use GPU --*/ +#else /*-------- not to use GPU --------*/ // for GEMM padding 0 j = bigu_size / ldt; @@ -904,7 +914,8 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm, ABORT ("Malloc failed for dgemm V buffer"); //#endif -#endif /* end ifdef GPU_ACC */ +#endif +/*************** end ifdef GPU_ACC ****************/ log_memory((bigv_size + bigu_size) * dword, stat); @@ -1757,29 +1768,29 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm, MPI_Reduce(&RemainGEMM_flops, &allflops, 1, MPI_DOUBLE, MPI_SUM, 0, grid->comm); if ( iam==0 ) { - printf("\nInitialization time\t%8.2lf seconds\n" + printf("\nInitialization time\t%8.4lf seconds\n" "\t Serial: compute static schedule, allocate storage\n", InitTimer); printf("\n==== Time breakdown in factorization (rank 0) ====\n"); - printf("Panel factorization \t %8.2lf seconds\n", + printf("Panel factorization \t %8.4lf seconds\n", pdgstrf2_timer + pdgstrs2_timer); - printf(".. L-panel pxgstrf2 \t %8.2lf seconds\n", pdgstrf2_timer); - printf(".. U-panel pxgstrs2 \t %8.2lf seconds\n", pdgstrs2_timer); - printf("Time in Look-ahead update \t %8.2lf seconds\n", lookaheadupdatetimer); - printf("Time in Schur update \t\t %8.2lf seconds\n", NetSchurUpTimer); - printf(".. Time to Gather L buffer\t %8.2lf (Separate L panel by Lookahead/Remain)\n", GatherLTimer); - printf(".. Time to Gather U buffer\t %8.2lf \n", GatherUTimer); - - printf(".. Time in GEMM %8.2lf \n", + printf(".. L-panel pxgstrf2 \t %8.4lf seconds\n", pdgstrf2_timer); + printf(".. U-panel pxgstrs2 \t %8.4lf seconds\n", pdgstrs2_timer); + printf("Time in Look-ahead update \t %8.4lf seconds\n", lookaheadupdatetimer); + printf("Time in Schur update \t\t %8.4lf seconds\n", NetSchurUpTimer); + printf(".. Time to Gather L buffer\t %8.4lf (Separate L panel by Lookahead/Remain)\n", GatherLTimer); + printf(".. Time to Gather U buffer\t %8.4lf \n", GatherUTimer); + + printf(".. Time in GEMM %8.4lf \n", LookAheadGEMMTimer + RemainGEMMTimer); - printf("\t* Look-ahead\t %8.2lf \n", LookAheadGEMMTimer); - printf("\t* Remain\t %8.2lf\tFlops %8.2le\tGflops %8.2lf\n", + printf("\t* Look-ahead\t %8.4lf \n", LookAheadGEMMTimer); + printf("\t* Remain\t %8.4lf\tFlops %8.4le\tGflops %8.4lf\n", RemainGEMMTimer, allflops, allflops/RemainGEMMTimer*1e-9); - printf(".. Time to Scatter %8.2lf \n", + printf(".. 
Time to Scatter %8.4lf \n", LookAheadScatterTimer + RemainScatterTimer); - printf("\t* Look-ahead\t %8.2lf \n", LookAheadScatterTimer); - printf("\t* Remain\t %8.2lf \n", RemainScatterTimer); + printf("\t* Look-ahead\t %8.4lf \n", LookAheadScatterTimer); + printf("\t* Remain\t %8.4lf \n", RemainScatterTimer); - printf("Total factorization time \t: %8.2lf seconds, \n", pxgstrfTimer); + printf("Total factorization time \t: %8.4lf seconds, \n", pxgstrfTimer); printf("--------\n"); printf("GEMM maximum block: %d-%d-%d\n", gemm_max_m, gemm_max_k, gemm_max_n); } diff --git a/SRC/pdgstrf3d.c b/SRC/pdgstrf3d.c index 1fc78326..ec97a69d 100644 --- a/SRC/pdgstrf3d.c +++ b/SRC/pdgstrf3d.c @@ -39,8 +39,8 @@ at the top-level directory. #endif #ifdef GPU_ACC -#include "lustruct_gpu.h" -#include "acc_aux.c" +#include "dlustruct_gpu.h" +//#include "acc_aux.c" //no need anymore #endif @@ -152,7 +152,7 @@ int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm, initFactStat(nsupers, &factStat); #if 0 // sherry: not used - diagFactBufs_t dFBuf; + ddiagFactBufs_t dFBuf; dinitDiagFactBufs(ldt, &dFBuf); commRequests_t comReqs; @@ -166,7 +166,7 @@ int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm, packLUInfo_t packLUInfo; initPackLUInfo(nsupers, &packLUInfo); - scuBufs_t scuBufs; + dscuBufs_t scuBufs; dinitScuBufs(ldt, num_threads, nsupers, &scuBufs, LUstruct, grid); factNodelists_t fNlists; @@ -177,7 +177,7 @@ int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm, int_t maxLvl = log2i(grid3d->zscp.Np) + 1; #if ( PRNTlevel>=1 ) - if (!iam) { + if (grid3d->iam == 0) { printf ("MPI tag upper bound = %d\n", tag_ub); fflush(stdout); } #endif @@ -202,53 +202,47 @@ int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm, if (sForests[myTreeIdxs[ilvl]] && sForests[myTreeIdxs[ilvl]]->topoInfo.eTreeTopLims[1] > mxLeafNode ) mxLeafNode = sForests[myTreeIdxs[ilvl]]->topoInfo.eTreeTopLims[1]; } - diagFactBufs_t** dFBufs = dinitDiagFactBufsArr(mxLeafNode, ldt, grid); + ddiagFactBufs_t** dFBufs = dinitDiagFactBufsArr(mxLeafNode, ldt, grid); commRequests_t** comReqss = initCommRequestsArr(SUPERLU_MAX(mxLeafNode, numLA), ldt, grid); /* Setting up GPU related data structures */ -#define GPU_FRAMEWORK -#ifdef GPU_FRAMEWORK + int_t first_l_block_acc = 0; int_t first_u_block_acc = 0; - int_t Pc = grid->npcol; + int_t Pc = grid->npcol; int_t Pr = grid->nprow; - int_t mrb = (nsupers + Pr - 1) / Pr; // Sherry check ... 
use ceiling + int_t mrb = (nsupers + Pr - 1) / Pr; int_t mcb = (nsupers + Pc - 1) / Pc; HyP_t *HyP = (HyP_t *) SUPERLU_MALLOC(sizeof(HyP_t)); + dInit_HyP(HyP, Llu, mcb, mrb); HyP->first_l_block_acc = first_l_block_acc; HyP->first_u_block_acc = first_u_block_acc; - intt superlu_acc_offload = HyP->superlu_acc_offload; + int superlu_acc_offload = HyP->superlu_acc_offload; //int_t bigu_size = getBigUSize(nsupers, grid, LUstruct); - int_t bigu_size = getBigUSize(nsupers, grid, LUstruct->Llu->Lrowind_bc_ptr); - // int_t buffer_size = get_max_buffer_size (); - // HyP->buffer_size = buffer_size; + int_t bigu_size = getBigUSize(nsupers, grid, + LUstruct->Llu->Lrowind_bc_ptr); HyP->bigu_size = bigu_size; + int_t buffer_size = get_max_buffer_size (); + HyP->buffer_size = buffer_size; HyP->nsupers = nsupers; #ifdef GPU_ACC /*Now initialize the GPU data structure*/ - LUstruct_gpu *A_gpu, *dA_gpu; + dLUstruct_gpu_t *A_gpu, *dA_gpu; d2Hreduce_t d2HredObj; d2Hreduce_t* d2Hred = &d2HredObj; - sluGPU_t sluGPUobj; - sluGPU_t *sluGPU = &sluGPUobj; + dsluGPU_t sluGPUobj; + dsluGPU_t *sluGPU = &sluGPUobj; sluGPU->isNodeInMyGrid = getIsNodeInMyGrid(nsupers, maxLvl, myNodeCount, treePerm); - - int_t buffer_size = get_max_buffer_size (); - HyP->buffer_size = buffer_size; - HyP->bigu_size = bigu_size; - HyP->nsupers = nsupers; - if (superlu_acc_offload) { - #if 0 /* Sherry: For GPU code on titan, we do not need performance - lookup tables since due to difference in CPU-GPU performance + lookup tables since due to difference in CPU-GPU performance, it didn't make much sense to do any Schur-complement update on CPU, except for the lookahead-update on CPU. Same should hold for summit as well. (from Piyush) */ @@ -260,20 +254,25 @@ int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm, if (!iam) printf("Using MIC async cost of %lf \n", acc_async_cost); #endif #endif - int_t* perm_c_supno = getPerm_c_supno(nsupers, options, LUstruct, grid); + + //OLD: int_t* perm_c_supno = getPerm_c_supno(nsupers, options, LUstruct, grid); + int_t* perm_c_supno = getPerm_c_supno(nsupers, options, + LUstruct->etree, + LUstruct->Glu_persist, + LUstruct->Llu->Lrowind_bc_ptr, + LUstruct->Llu->Ufstnz_br_ptr, + grid); + /* Initialize GPU data structures */ - initSluGPU3D_t(sluGPU, LUstruct, grid3d, perm_c_supno, - n, buffer_size, bigu_size, ldt); - + dinitSluGPU3D_t(sluGPU, LUstruct, grid3d, perm_c_supno, + n, buffer_size, bigu_size, ldt); + HyP->first_u_block_acc = sluGPU->A_gpu->first_u_block_gpu; HyP->first_l_block_acc = sluGPU->A_gpu->first_l_block_gpu; HyP->nCudaStreams = sluGPU->nCudaStreams; + } - } /* end if superlu_acc_offload */ - -#endif - - +#endif // end GPU_ACC /*==== starting main factorization loop =====*/ MPI_Barrier( grid3d->comm); @@ -288,7 +287,7 @@ int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm, #endif SCT->pdgstrfTimer = SuperLU_timer_(); - for (int_t ilvl = 0; ilvl < maxLvl; ++ilvl) + for (int ilvl = 0; ilvl < maxLvl; ++ilvl) { /* if I participate in this level */ if (!myZeroTrIdxs[ilvl]) @@ -302,7 +301,7 @@ int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm, { double tilvl = SuperLU_timer_(); #ifdef GPU_ACC - sparseTreeFactor_ASYNC_GPU( + dsparseTreeFactor_ASYNC_GPU( sforest, comReqss, &scuBufs, &packLUInfo, msgss, LUvsbs, dFBufs, &factStat, &fNlists, @@ -325,7 +324,7 @@ int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm, if (ilvl < maxLvl - 1) /*then reduce before factorization*/ { #ifdef GPU_ACC - 
reduceAllAncestors3d_GPU( + dreduceAllAncestors3d_GPU( ilvl, myNodeCount, treePerm, LUvsb, LUstruct, grid3d, sluGPU, d2Hred, &factStat, HyP, SCT ); diff --git a/SRC/psymbfact.c b/SRC/psymbfact.c index fdade46e..fe2b722c 100644 --- a/SRC/psymbfact.c +++ b/SRC/psymbfact.c @@ -4776,7 +4776,11 @@ intraLvl_symbfact MPI_Irecv (&sz_msg, 1, mpi_int_t, MPI_ANY_SOURCE, tag_intraLvl_szMsg, (*symb_comm), &(request[0])); +#if defined (_LONGINT) if (sz_msg > LONG_MAX) +#else + if (sz_msg > INT_MAX) +#endif ABORT("ERROR in intraLvl_symbfact size to send > LONG_MAX\n"); } MPI_Waitany (2, request, index_req, status); diff --git a/SRC/pz3dcomm.c b/SRC/pz3dcomm.c index e72fad7e..a3bbe4d9 100644 --- a/SRC/pz3dcomm.c +++ b/SRC/pz3dcomm.c @@ -18,7 +18,7 @@ at the top-level directory. * May 10, 2019 */ #include "superlu_zdefs.h" -// #include "cblas.h" +//#include "cblas.h" #if 0 #include "p3dcomm.h" #include "sec_structs.h" @@ -68,12 +68,8 @@ int_t zAllocLlu(int_t nsupers, zLUstruct_t * LUstruct, gridinfo3d_t* grid3d) Ufstnz_br_ptr[i] = NULL; Unzval_br_ptr[i] = NULL; } - -#if 0 // Sherry: change to int type - int_t *ToRecv = intCalloc_dist(nsupers); /* Recv from no one (0), left (1), and up (2).*/ - int_t *ToSendD = intCalloc_dist(nbr); /* Whether need to send down block row. */ - int_t **ToSendR = (int_t **) SUPERLU_MALLOC(nbc * sizeof(int_t*)); /* List of processes to send right block col. */ -#else + + // Sherry: use int type /* Recv from no one (0), left (1), and up (2).*/ int *ToRecv = SUPERLU_MALLOC(nsupers * sizeof(int)); for (i = 0; i < nsupers; ++i) ToRecv[i] = 0; @@ -82,9 +78,8 @@ int_t zAllocLlu(int_t nsupers, zLUstruct_t * LUstruct, gridinfo3d_t* grid3d) for (i = 0; i < nbr; ++i) ToSendD[i] = 0; /* List of processes to send right block col. */ int **ToSendR = (int **) SUPERLU_MALLOC(nbc * sizeof(int*)); -#endif - for (i = 0; i < nbc; ++i) + for (int_t i = 0; i < nbc; ++i) { /* code */ //ToSendR[i] = INT_T_ALLOC(Pc); @@ -240,7 +235,7 @@ int_t zzRecvLPanel(int_t k, int_t sender, doublecomplex alpha, doublecomplex bet if (lsub != NULL) { int len = lsub[1]; /* LDA of the nzval[] */ - int len2 = SuperSize(k) * len; /*size of nzval of L panels*/ + int len2 = SuperSize(k) * len; /* size of nzval of L panels */ MPI_Status status; MPI_Recv(Lval_buf , len2, SuperLU_MPI_DOUBLE_COMPLEX, sender, k, @@ -249,15 +244,6 @@ int_t zzRecvLPanel(int_t k, int_t sender, doublecomplex alpha, doublecomplex bet /*reduce the updates*/ superlu_zscal(len2, alpha, lnzval, 1); superlu_zaxpy(len2, beta, Lval_buf, 1, lnzval, 1); -#if 0 // replaced -#if 1 - zscal_(&len2, &alpha, lnzval, &inc); - zaxpy_(&len2, &beta, Lval_buf, &inc, lnzval, &inc); -#else - cblas_zscal (len2, (void*) &alpha, lnzval, 1); - cblas_zaxpy (len2, (void*) &beta, Lval_buf, 1, lnzval, 1); -#endif -#endif } } @@ -265,7 +251,7 @@ int_t zzRecvLPanel(int_t k, int_t sender, doublecomplex alpha, doublecomplex bet } int_t zzSendUPanel(int_t k, int_t receiver, - zLUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT) + zLUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT) { zLocalLU_t *Llu = LUstruct->Llu; int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; @@ -328,15 +314,6 @@ int_t zzRecvUPanel(int_t k, int_t sender, doublecomplex alpha, doublecomplex bet /*reduce the updates*/ superlu_zscal(lenv, alpha, unzval, 1); superlu_zaxpy(lenv, beta, Uval_buf, 1, unzval, 1); -#if 0 // replaced -#if 1 - zscal_(&lenv, &alpha, unzval, &inc); - zaxpy_(&lenv, &beta, Uval_buf, &inc, unzval, &inc); -#else - cblas_zscal (lenv, (void*) &alpha, unzval, 1); - cblas_zaxpy (lenv, (void*) 
&beta, Uval_buf, 1, unzval, 1); -#endif -#endif } } return 0; @@ -373,7 +350,7 @@ int_t zp3dScatter(int_t n, zLUstruct_t * LUstruct, gridinfo3d_t* grid3d) int_t *supno = LUstruct->Glu_persist->supno; MPI_Bcast( supno, n, mpi_int_t, 0, grid3d->zscp.comm); - /* now broadcast localLu */ + /* now broadcast local LU structure */ /* first allocating space for it */ if ( grid3d->zscp.Iam ) // all other process layers not equal 0 zAllocLlu(nsupers, LUstruct, grid3d); @@ -396,6 +373,7 @@ int_t zp3dScatter(int_t n, zLUstruct_t * LUstruct, gridinfo3d_t* grid3d) int_t nbr = CEILING(nsupers, Pr); int_t nbc = CEILING(nsupers, Pc); + // MPI_Bcast( ToRecv, nsupers, mpi_int_t, 0, grid3d->zscp.comm); MPI_Bcast( ToRecv, nsupers, MPI_INT, 0, grid3d->zscp.comm); MPI_Bcast( ToSendD, nbr, MPI_INT, 0, grid3d->zscp.comm); @@ -416,7 +394,7 @@ int_t zp3dScatter(int_t n, zLUstruct_t * LUstruct, gridinfo3d_t* grid3d) int_t zscatter3dUPanels(int_t nsupers, - zLUstruct_t * LUstruct, gridinfo3d_t* grid3d) + zLUstruct_t * LUstruct, gridinfo3d_t* grid3d) { zLocalLU_t *Llu = LUstruct->Llu; @@ -485,7 +463,7 @@ int_t zscatter3dUPanels(int_t nsupers, int_t zscatter3dLPanels(int_t nsupers, - zLUstruct_t * LUstruct, gridinfo3d_t* grid3d) + zLUstruct_t * LUstruct, gridinfo3d_t* grid3d) { zLocalLU_t *Llu = LUstruct->Llu; int_t* xsup = LUstruct->Glu_persist->xsup; @@ -804,7 +782,7 @@ int_t zinit3DLUstruct( int_t* myTreeIdxs, int_t* myZeroTrIdxs, } -int_t zreduceAllAncestors3d(int_t ilvl, int_t* myNodeCount, int_t** treePerm, +int zreduceAllAncestors3d(int_t ilvl, int_t* myNodeCount, int_t** treePerm, zLUValSubBuf_t* LUvsb, zLUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT ) { @@ -842,7 +820,7 @@ int_t zreduceAllAncestors3d(int_t ilvl, int_t* myNodeCount, int_t** treePerm, } int_t zgatherAllFactoredLU( trf3Dpartition_t* trf3Dpartition, - zLUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT ) + zLUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT ) { int_t maxLvl = log2i(grid3d->zscp.Np) + 1; int_t myGrid = grid3d->zscp.Iam; diff --git a/SRC/pzgssvx3d.c b/SRC/pzgssvx3d.c index c01e3bcd..a6f9fc42 100644 --- a/SRC/pzgssvx3d.c +++ b/SRC/pzgssvx3d.c @@ -360,7 +360,7 @@ at the top-level directory. * If all the above condition are true, the LU decomposition is * performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T. * - * ScalePermstruct (input/output) dScalePermstruct_t* (global) + * ScalePermstruct (input/output) zScalePermstruct_t* (global) * The data structure to store the scaling and permutation vectors * describing the transformations performed to the matrix A. * It contains the following fields: @@ -458,9 +458,9 @@ at the top-level directory. * xsup[s] is the leading column of the s-th supernode, * supno[i] is the supernode number to which column i belongs. * - * o Llu (LocalLU_t*) (local) + * o Llu (zLocalLU_t*) (local) * The distributed data structures to store L and U factors. - * See superlu_ddefs.h for the definition of 'LocalLU_t'. + * See superlu_ddefs.h for the definition of 'zLocalLU_t'. * * SOLVEstruct (input/output) zSOLVEstruct_t* * The data structure to hold the communication pattern used @@ -481,6 +481,7 @@ at the top-level directory. * * info (output) int* * = 0: successful exit + * < 0: if info = -i, the i-th argument had an illegal value * > 0: if info = i, and i is * <= A->ncol: U(i,i) is exactly zero. 
The factorization has * been completed, but the factor U is exactly singular, @@ -558,7 +559,7 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, B2d = (doublecomplex *) A3d->B2d; NRformat_loc *Astore0 = A3d->A_nfmt; // on 2D grid-0 NRformat_loc *A_orig = A->Store; - + /* definition of factored seen by each process layer */ Fact = options->Fact; factored = (Fact == FACTORED); @@ -578,35 +579,35 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, *info = -1; fprintf (stderr, "Extra precise iterative refinement yet to support."); - } - /* Test the other input parameters. */ - else if (A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc + } else if (A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc || A->Dtype != SLU_Z || A->Mtype != SLU_GE) - *info = -2; + *info = -2; else if (ldb < Astore3d->m_loc) - *info = -5; + *info = -5; else if (nrhs < 0) { - *info = -6; + *info = -6; } if (*info) { i = -(*info); pxerr_dist ("pzgssvx3d", grid, -*info); return; } - + #if ( DEBUGlevel>=1 ) CHECK_MALLOC (iam, "Enter pzgssvx3d()"); #endif /* Perform preprocessing steps on process layer zero, including: + gather 3D matrices {A, B} onto 2D grid-0, ordering, symbolic factorization, distribution of L & U */ - + #define NRFRMT if (grid3d->zscp.Iam == 0) { m = A->nrow; n = A->ncol; + // checkNRFMT(Astore0, (NRformat_loc *) A->Store); #ifdef NRFRMT // On input, A->Store is on 3D, now A->Store is re-assigned to 2D store A->Store = Astore0; @@ -662,8 +663,7 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, /* Not factored & ask for equilibration */ if (Equil && Fact != SamePattern_SameRowPerm) { /* Allocate storage if not done so before. */ - switch (ScalePermstruct->DiagScale) - { + switch (ScalePermstruct->DiagScale) { case NOEQUIL: if (!(R = (double *) doubleMalloc_dist (m))) ABORT ("Malloc fails for R[]."); @@ -682,8 +682,8 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, ABORT ("Malloc fails for R[]."); ScalePermstruct->R = R; break; - default: break; - } + default: break; + } } /* ------------------------------------------------------------ @@ -1106,11 +1106,9 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, #if ( PRNTlevel>=1 ) if (!iam) { printf ("\tNo of supers %ld\n", - Glu_persist->supno[n - 1] + 1); - printf ("\tSize of G(L) %ld\n", - Glu_freeable->xlsub[n]); - printf ("\tSize of G(U) %ld\n", - Glu_freeable->xusub[n]); + (long) Glu_persist->supno[n - 1] + 1); + printf ("\tSize of G(L) %ld\n", (long) Glu_freeable->xlsub[n]); + printf ("\tSize of G(U) %ld\n", (long) Glu_freeable->xusub[n]); printf ("\tint %d, short %d, float %d, double %d\n", sizeof (int_t), sizeof (short), sizeof (float), sizeof (double)); @@ -1218,7 +1216,7 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, SCT_init(SCT); #if ( PRNTlevel>=1 ) - if (iam==0) { + if (grid3d->iam == 0) { printf("after 3D initialization.\n"); fflush(stdout); } #endif @@ -1391,7 +1389,7 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, { /* Improve the solution by iterative refinement. */ int_t *it, *colind_gsmv = SOLVEstruct->A_colind_gsmv; - zSOLVEstruct_t *SOLVEstruct1; /* Used by refinement. */ + zSOLVEstruct_t *SOLVEstruct1; /* Used by refinement */ t = SuperLU_timer_ (); if (options->RefineInitialized == NO || Fact == DOFACT) { @@ -1448,7 +1446,7 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, So we use SOLVEstruct1 for the refinement step. 
*/
 if (!(SOLVEstruct1 = (zSOLVEstruct_t *)
- SUPERLU_MALLOC (sizeof (zSOLVEstruct_t))))
+ SUPERLU_MALLOC(sizeof(zSOLVEstruct_t))))
 ABORT ("Malloc fails for SOLVEstruct1");
 /* Copy the same stuff */
 SOLVEstruct1->row_to_proc = SOLVEstruct->row_to_proc;
@@ -1537,8 +1535,7 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 /* Deallocate R and/or C if it was not used. */
 if (Equil && Fact != SamePattern_SameRowPerm)
 {
- switch (ScalePermstruct->DiagScale)
- {
+ switch (ScalePermstruct->DiagScale) {
 case NOEQUIL:
 SUPERLU_FREE (R);
 SUPERLU_FREE (C);
@@ -1549,8 +1546,8 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 case COL:
 SUPERLU_FREE (R);
 break;
- default: break;
- }
+ default: break;
+ }
 }

 #if 0
@@ -1561,22 +1558,23 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 } /* process layer 0 done solve */

 #ifdef NRFRMT
- zScatter_B3d(A3d, grid3d);
-
- A->Store = Astore3d; // restore Astore to 3D
-
- /* free A2d and B2d, which are allocated only in 2D layer Grid_0 */
- NRformat_loc *A2d = A3d->A_nfmt;
- if (grid3d->zscp.Iam == 0) {
- SUPERLU_FREE( A2d->rowptr );
- SUPERLU_FREE( A2d->colind );
- SUPERLU_FREE( A2d->nzval );
- SUPERLU_FREE( A3d->B2d );
- }
- SUPERLU_FREE( A2d ); // free 2D structure
- SUPERLU_FREE( A3d ); // free 3D structure
-#endif
+ /* Scatter the solution from 2D grid_0 to 3D grid */
+ zScatter_B3d(A3d, grid3d);
+
+ A->Store = Astore3d; // restore Astore to 3D
+ /* free A2d and B2d, which are allocated only in 2D layer Grid_0 */
+ NRformat_loc *A2d = A3d->A_nfmt;
+ if (grid3d->zscp.Iam == 0) {
+ SUPERLU_FREE( A2d->rowptr );
+ SUPERLU_FREE( A2d->colind );
+ SUPERLU_FREE( A2d->nzval );
+ SUPERLU_FREE( A3d->B2d );
+ }
+ SUPERLU_FREE( A2d ); // free 2D structure
+ SUPERLU_FREE( A3d ); // free 3D structure
+#endif
+
 #if ( DEBUGlevel>=1 )
 CHECK_MALLOC (iam, "Exit pzgssvx3d()");
 #endif
diff --git a/SRC/pzgstrf.c b/SRC/pzgstrf.c
index 20d32ad8..00e58a03 100644
--- a/SRC/pzgstrf.c
+++ b/SRC/pzgstrf.c
@@ -128,7 +128,7 @@ at the top-level directory.
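
Aside (not part of the patch): the GPU_ACC hunks below size the device GEMM buffer dC as one max_row_size x cublas_nb output tile per CUDA stream, floored by the tunable get_max_buffer_size(); the accounting hunk then charges dword * (max_row_size * sp_ienv_dist(3) + bigu_size + buffer_size) to stat->gpu_buffer for dA, dB and dC. A small sketch of the sizing rule, assuming the helper declarations are visible the same way they are inside pzgstrf.c:

    #include "superlu_defs.h"

    static int_t gemm_dC_buffer_size(int max_row_size)
    {
        int cublas_nb = get_cublas_nb();        /* GEMM block width, default 64 */
        int nstreams  = get_num_cuda_streams(); /* CUDA streams, default 8      */
        /* one max_row_size x cublas_nb output tile per stream ...              */
        int_t per_stream_tiles = (int_t) max_row_size * nstreams * cublas_nb;
        /* ... but never smaller than the environment-tunable floor             */
        return SUPERLU_MAX(per_stream_tiles, get_max_buffer_size());
    }
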
// #define SUPERNODE_PROFILE /* - Name : BAELINE + Name : BASELINE Purpose : baseline to compare performance against Overhead : NA : this won't be used for running experiments */ @@ -767,13 +767,14 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm, SUPERLU_MAX (max_row_size * num_threads * ldt, get_max_buffer_size ()); */ -#ifdef GPU_ACC +#ifdef GPU_ACC /*-------- use GPU --------*/ int cublas_nb = get_cublas_nb(); // default 64 - int nstreams = get_num_cuda_streams (); + int nstreams = get_num_cuda_streams (); // default 8 - int_t buffer_size = SUPERLU_MAX(max_row_size*nstreams*cublas_nb,get_max_buffer_size()); + int_t buffer_size = SUPERLU_MAX(max_row_size * nstreams * cublas_nb, + get_max_buffer_size()); /* array holding last column blk for each partition, - used in SchCompUdt--CUDA.c */ + used in SchCompUdt-cuda.c */ #if 0 int *stream_end_col = (int_t *) _mm_malloc (sizeof (int_t) * nstreams,64); #else @@ -783,8 +784,9 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm, #else /* not to use GPU */ int Threads_per_process = get_thread_per_process(); - int_t buffer_size = SUPERLU_MAX(max_row_size*Threads_per_process*ldt,get_max_buffer_size()); -#endif /* end ifdef GPU_ACC */ + int_t buffer_size = SUPERLU_MAX(max_row_size * Threads_per_process * ldt, + get_max_buffer_size()); +#endif /* end ifdef GPU_ACC -----------*/ int_t max_ncols = 0; #if 0 @@ -811,8 +813,12 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm, bigV = NULL; #if ( PRNTlevel>=1 ) - if(!iam) printf("\t.. GEMM buffer size: max_row_size X max_ncols = %d x " IFMT "\n", - max_row_size, max_ncols); + if(!iam) { + printf("\t.. MAX_BUFFER_SIZE " IFMT " set for GPU\n", get_max_buffer_size()); + printf("\t.. N_GEMM: " IFMT " flops of GEMM done on CPU (1st block always on CPU)\n", sp_ienv_dist(7)); + printf("\t.. GEMM buffer size: max_row_size X max_ncols = %d x " IFMT "\n", + max_row_size, max_ncols); + } printf("[%d].. BIG U size " IFMT " (on CPU)\n", iam, bigu_size); fflush(stdout); #endif @@ -827,17 +833,20 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm, #endif #if ( PRNTlevel>=1 ) - printf("[%d].. BIG V size %d (on CPU), dC buffer_size %d (on GPU)\n", iam, bigv_size, buffer_size); + printf("[%d].. 
BIG V size " IFMT " (on CPU), dC buffer_size " IFMT " (on GPU)\n", + iam, bigv_size, buffer_size); fflush(stdout); #endif if ( checkCuda(cudaHostAlloc((void**)&bigV, bigv_size * sizeof(doublecomplex) ,cudaHostAllocDefault)) ) ABORT("Malloc fails for zgemm buffer V"); - DisplayHeader(); - #if ( PRNTlevel>=1 ) - printf(" Starting with %d Cuda Streams \n",nstreams ); + if ( iam==0 ) { + DisplayHeader(); + printf(" Starting with %d Cuda Streams \n",nstreams ); + fflush(stdout); + } #endif cublasHandle_t *handle; @@ -879,10 +888,11 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm, return 1; } - stat->gpu_buffer += ( max_row_size * sp_ienv_dist(3) - + bigu_size + buffer_size ) * dword; + stat->gpu_buffer += dword * ( max_row_size * sp_ienv_dist(3) // dA + + bigu_size // dB + + buffer_size ); // dC -#else /*-- not to use GPU --*/ +#else /*-------- not to use GPU --------*/ // for GEMM padding 0 j = bigu_size / ldt; @@ -904,7 +914,8 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm, ABORT ("Malloc failed for zgemm V buffer"); //#endif -#endif /* end ifdef GPU_ACC */ +#endif +/*************** end ifdef GPU_ACC ****************/ log_memory((bigv_size + bigu_size) * dword, stat); @@ -1757,29 +1768,29 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm, MPI_Reduce(&RemainGEMM_flops, &allflops, 1, MPI_DOUBLE, MPI_SUM, 0, grid->comm); if ( iam==0 ) { - printf("\nInitialization time\t%8.2lf seconds\n" + printf("\nInitialization time\t%8.4lf seconds\n" "\t Serial: compute static schedule, allocate storage\n", InitTimer); printf("\n==== Time breakdown in factorization (rank 0) ====\n"); - printf("Panel factorization \t %8.2lf seconds\n", + printf("Panel factorization \t %8.4lf seconds\n", pdgstrf2_timer + pdgstrs2_timer); - printf(".. L-panel pxgstrf2 \t %8.2lf seconds\n", pdgstrf2_timer); - printf(".. U-panel pxgstrs2 \t %8.2lf seconds\n", pdgstrs2_timer); - printf("Time in Look-ahead update \t %8.2lf seconds\n", lookaheadupdatetimer); - printf("Time in Schur update \t\t %8.2lf seconds\n", NetSchurUpTimer); - printf(".. Time to Gather L buffer\t %8.2lf (Separate L panel by Lookahead/Remain)\n", GatherLTimer); - printf(".. Time to Gather U buffer\t %8.2lf \n", GatherUTimer); - - printf(".. Time in GEMM %8.2lf \n", + printf(".. L-panel pxgstrf2 \t %8.4lf seconds\n", pdgstrf2_timer); + printf(".. U-panel pxgstrs2 \t %8.4lf seconds\n", pdgstrs2_timer); + printf("Time in Look-ahead update \t %8.4lf seconds\n", lookaheadupdatetimer); + printf("Time in Schur update \t\t %8.4lf seconds\n", NetSchurUpTimer); + printf(".. Time to Gather L buffer\t %8.4lf (Separate L panel by Lookahead/Remain)\n", GatherLTimer); + printf(".. Time to Gather U buffer\t %8.4lf \n", GatherUTimer); + + printf(".. Time in GEMM %8.4lf \n", LookAheadGEMMTimer + RemainGEMMTimer); - printf("\t* Look-ahead\t %8.2lf \n", LookAheadGEMMTimer); - printf("\t* Remain\t %8.2lf\tFlops %8.2le\tGflops %8.2lf\n", + printf("\t* Look-ahead\t %8.4lf \n", LookAheadGEMMTimer); + printf("\t* Remain\t %8.4lf\tFlops %8.4le\tGflops %8.4lf\n", RemainGEMMTimer, allflops, allflops/RemainGEMMTimer*1e-9); - printf(".. Time to Scatter %8.2lf \n", + printf(".. 
Time to Scatter %8.4lf \n", LookAheadScatterTimer + RemainScatterTimer); - printf("\t* Look-ahead\t %8.2lf \n", LookAheadScatterTimer); - printf("\t* Remain\t %8.2lf \n", RemainScatterTimer); + printf("\t* Look-ahead\t %8.4lf \n", LookAheadScatterTimer); + printf("\t* Remain\t %8.4lf \n", RemainScatterTimer); - printf("Total factorization time \t: %8.2lf seconds, \n", pxgstrfTimer); + printf("Total factorization time \t: %8.4lf seconds, \n", pxgstrfTimer); printf("--------\n"); printf("GEMM maximum block: %d-%d-%d\n", gemm_max_m, gemm_max_k, gemm_max_n); } diff --git a/SRC/pzgstrf3d.c b/SRC/pzgstrf3d.c index bc40ab75..f592f50e 100644 --- a/SRC/pzgstrf3d.c +++ b/SRC/pzgstrf3d.c @@ -38,8 +38,8 @@ at the top-level directory. #endif #ifdef GPU_ACC -#include "lustruct_gpu.h" -#include "acc_aux.c" +#include "zlustruct_gpu.h" +//#include "acc_aux.c" //no need anymore #endif @@ -151,7 +151,7 @@ int_t pzgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm, initFactStat(nsupers, &factStat); #if 0 // sherry: not used - diagFactBufs_t dFBuf; + zdiagFactBufs_t dFBuf; zinitDiagFactBufs(ldt, &dFBuf); commRequests_t comReqs; @@ -165,7 +165,7 @@ int_t pzgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm, packLUInfo_t packLUInfo; initPackLUInfo(nsupers, &packLUInfo); - scuBufs_t scuBufs; + zscuBufs_t scuBufs; zinitScuBufs(ldt, num_threads, nsupers, &scuBufs, LUstruct, grid); factNodelists_t fNlists; @@ -176,7 +176,7 @@ int_t pzgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm, int_t maxLvl = log2i(grid3d->zscp.Np) + 1; #if ( PRNTlevel>=1 ) - if (!iam) { + if (grid3d->iam == 0) { printf ("MPI tag upper bound = %d\n", tag_ub); fflush(stdout); } #endif @@ -201,7 +201,7 @@ int_t pzgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm, if (sForests[myTreeIdxs[ilvl]] && sForests[myTreeIdxs[ilvl]]->topoInfo.eTreeTopLims[1] > mxLeafNode ) mxLeafNode = sForests[myTreeIdxs[ilvl]]->topoInfo.eTreeTopLims[1]; } - diagFactBufs_t** dFBufs = zinitDiagFactBufsArr(mxLeafNode, ldt, grid); + zdiagFactBufs_t** dFBufs = zinitDiagFactBufsArr(mxLeafNode, ldt, grid); commRequests_t** comReqss = initCommRequestsArr(SUPERLU_MAX(mxLeafNode, numLA), ldt, grid); /* Setting up GPU related data structures */ @@ -215,37 +215,56 @@ int_t pzgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm, HyP_t *HyP = (HyP_t *) SUPERLU_MALLOC(sizeof(HyP_t)); zInit_HyP(HyP, Llu, mcb, mrb); - HyP->first_l_block_acc = first_l_block_acc; HyP->first_u_block_acc = first_u_block_acc; + + int superlu_acc_offload = HyP->superlu_acc_offload; + //int_t bigu_size = getBigUSize(nsupers, grid, LUstruct); - int_t bigu_size = getBigUSize(nsupers, grid, LUstruct->Llu->Lrowind_bc_ptr); - // int_t buffer_size = get_max_buffer_size (); - // HyP->buffer_size = buffer_size; + int_t bigu_size = getBigUSize(nsupers, grid, + LUstruct->Llu->Lrowind_bc_ptr); HyP->bigu_size = bigu_size; + int_t buffer_size = get_max_buffer_size (); + HyP->buffer_size = buffer_size; HyP->nsupers = nsupers; #ifdef GPU_ACC /*Now initialize the GPU data structure*/ - LUstruct_gpu *A_gpu, *dA_gpu; + zLUstruct_gpu_t *A_gpu, *dA_gpu; d2Hreduce_t d2HredObj; d2Hreduce_t* d2Hred = &d2HredObj; - sluGPU_t sluGPUobj; - sluGPU_t *sluGPU = &sluGPUobj; + zsluGPU_t sluGPUobj; + zsluGPU_t *sluGPU = &sluGPUobj; sluGPU->isNodeInMyGrid = getIsNodeInMyGrid(nsupers, maxLvl, myNodeCount, treePerm); if (superlu_acc_offload) { +#if 0 /* Sherry: For GPU code on titan, we do not need performance + lookup tables since due to difference in CPU-GPU 
performance, + it didn't make much sense to do any Schur-complement update + on CPU, except for the lookahead-update on CPU. Same should + hold for summit as well. (from Piyush) */ + /*Initilize the lookup tables */ LookUpTableInit(iam); acc_async_cost = get_acc_async_cost(); #ifdef GPU_DEBUG if (!iam) printf("Using MIC async cost of %lf \n", acc_async_cost); #endif +#endif + + //OLD: int_t* perm_c_supno = getPerm_c_supno(nsupers, options, LUstruct, grid); + int_t* perm_c_supno = getPerm_c_supno(nsupers, options, + LUstruct->etree, + LUstruct->Glu_persist, + LUstruct->Llu->Lrowind_bc_ptr, + LUstruct->Llu->Ufstnz_br_ptr, + grid); - initSluGPU3D_t(sluGPU, LUstruct, grid3d, perm_c_supno, - n, buffer_size, bigu_size, ldt); + /* Initialize GPU data structures */ + zinitSluGPU3D_t(sluGPU, LUstruct, grid3d, perm_c_supno, + n, buffer_size, bigu_size, ldt); HyP->first_u_block_acc = sluGPU->A_gpu->first_u_block_gpu; HyP->first_l_block_acc = sluGPU->A_gpu->first_l_block_gpu; @@ -267,7 +286,7 @@ int_t pzgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm, #endif SCT->pdgstrfTimer = SuperLU_timer_(); - for (int_t ilvl = 0; ilvl < maxLvl; ++ilvl) + for (int ilvl = 0; ilvl < maxLvl; ++ilvl) { /* if I participate in this level */ if (!myZeroTrIdxs[ilvl]) diff --git a/SRC/sp_ienv.c b/SRC/sp_ienv.c index 3faababf..1a317f65 100644 --- a/SRC/sp_ienv.c +++ b/SRC/sp_ienv.c @@ -92,7 +92,7 @@ sp_ienv_dist(int_t ispec) return 20; case 3: - ttemp = getenv("NSUP"); + ttemp = getenv("NSUP"); // take min of MAX_SUPER_SIZE in superlu_defs.h if(ttemp) { return(atoi(ttemp)); diff --git a/SRC/superlu_ddefs.h b/SRC/superlu_ddefs.h index 32a26aea..97fe6387 100644 --- a/SRC/superlu_ddefs.h +++ b/SRC/superlu_ddefs.h @@ -9,6 +9,7 @@ The source code is distributed under BSD license, see the file License.txt at the top-level directory. */ + /*! @file * \brief Distributed SuperLU data types and function prototypes * @@ -82,8 +83,7 @@ typedef struct { #if 0 // Sherry: move to superlu_defs.h /*-- Auxiliary data type used in PxGSTRS/PxGSTRS1. */ -typedef struct -{ +typedef struct { int_t lbnum; /* Row block number (local). */ int_t indpos; /* Starting position in Uindex[]. */ } Ucb_indptr_t; @@ -94,34 +94,33 @@ typedef struct * column format, the blocks in U are stored in compressed block row format. */ #define MAX_LOOKAHEADS 50 -typedef struct -{ - int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */ - double **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */ - double **Linv_bc_ptr; /* size ceil(NSUPERS/Pc) */ - int_t **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc) pointers to locations in Lrowind_bc_ptr and Lnzval_bc_ptr */ - int_t *Unnz; /* number of nonzeros per block column in U*/ - int_t **Lrowind_bc_2_lsum; /* size ceil(NSUPERS/Pc) map indices of Lrowind_bc_ptr to indices of lsum */ - double **Uinv_bc_ptr; /* size ceil(NSUPERS/Pc) */ - int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */ - double **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */ - /*-- Data structures used for broadcast and reduction trees. 
--*/ - BcTree *LBtree_ptr; /* size ceil(NSUPERS/Pc) */ - RdTree *LRtree_ptr; /* size ceil(NSUPERS/Pr) */ - BcTree *UBtree_ptr; /* size ceil(NSUPERS/Pc) */ - RdTree *URtree_ptr; /* size ceil(NSUPERS/Pr) */ +typedef struct { + int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */ + double **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */ + double **Linv_bc_ptr; /* size ceil(NSUPERS/Pc) */ + int_t **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc) pointers to locations in Lrowind_bc_ptr and Lnzval_bc_ptr */ + int_t *Unnz; /* number of nonzeros per block column in U*/ + int_t **Lrowind_bc_2_lsum; /* size ceil(NSUPERS/Pc) map indices of Lrowind_bc_ptr to indices of lsum */ + double **Uinv_bc_ptr; /* size ceil(NSUPERS/Pc) */ + int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */ + double **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */ + /*-- Data structures used for broadcast and reduction trees. --*/ + BcTree *LBtree_ptr; /* size ceil(NSUPERS/Pc) */ + RdTree *LRtree_ptr; /* size ceil(NSUPERS/Pr) */ + BcTree *UBtree_ptr; /* size ceil(NSUPERS/Pc) */ + RdTree *URtree_ptr; /* size ceil(NSUPERS/Pr) */ #if 0 int_t *Lsub_buf; /* Buffer for the remote subscripts of L */ double *Lval_buf; /* Buffer for the remote nonzeros of L */ int_t *Usub_buf; /* Buffer for the remote subscripts of U */ double *Uval_buf; /* Buffer for the remote nonzeros of U */ #endif - int_t *Lsub_buf_2[MAX_LOOKAHEADS]; /* Buffers for the remote subscripts of L*/ - double *Lval_buf_2[MAX_LOOKAHEADS]; /* Buffers for the remote nonzeros of L */ - int_t *Usub_buf_2[MAX_LOOKAHEADS]; /* Buffer for the remote subscripts of U */ - double *Uval_buf_2[MAX_LOOKAHEADS]; /* Buffer for the remote nonzeros of U */ - double *ujrow; /* used in panel factorization. */ - int_t bufmax[NBUFFERS]; /* Maximum buffer size across all MPI ranks: + int_t *Lsub_buf_2[MAX_LOOKAHEADS]; /* Buffers for the remote subscripts of L*/ + double *Lval_buf_2[MAX_LOOKAHEADS]; /* Buffers for the remote nonzeros of L */ + int_t *Usub_buf_2[MAX_LOOKAHEADS]; /* Buffer for the remote subscripts of U */ + double *Uval_buf_2[MAX_LOOKAHEADS]; /* Buffer for the remote nonzeros of U */ + double *ujrow; /* used in panel factorization. */ + int_t bufmax[NBUFFERS]; /* Maximum buffer size across all MPI ranks: * 0 : maximum size of Lsub_buf[] * 1 : maximum size of Lval_buf[] * 2 : maximum size of Usub_buf[] @@ -130,29 +129,30 @@ typedef struct */ /*-- Record communication schedule for factorization. --*/ - int *ToRecv; /* Recv from no one (0), left (1), and up (2).*/ - int *ToSendD; /* Whether need to send down block row. */ - int **ToSendR; /* List of processes to send right block col. */ + int *ToRecv; /* Recv from no one (0), left (1), and up (2).*/ + int *ToSendD; /* Whether need to send down block row. */ + int **ToSendR; /* List of processes to send right block col. */ /*-- Record communication schedule for forward/back solves. 
--*/
- int_t *fmod; /* Modification count for L-solve */
- int_t **fsendx_plist; /* Column process list to send down Xk */
- int_t *frecv; /* Modifications to be recv'd in proc row */
- int_t nfrecvx; /* Number of Xk I will receive in L-solve */
- int_t nfsendx; /* Number of Xk I will send in L-solve */
- int_t *bmod; /* Modification count for U-solve */
- int_t **bsendx_plist; /* Column process list to send down Xk */
- int_t *brecv; /* Modifications to be recv'd in proc row */
- int_t nbrecvx; /* Number of Xk I will receive in U-solve */
- int_t nbsendx; /* Number of Xk I will send in U-solve */
- int_t *mod_bit; /* Flag contribution from each row blocks */
+ int_t *fmod; /* Modification count for L-solve */
+ int_t **fsendx_plist; /* Column process list to send down Xk */
+ int_t *frecv; /* Modifications to be recv'd in proc row */
+ int_t nfrecvx; /* Number of Xk I will receive in L-solve */
+ int_t nfsendx; /* Number of Xk I will send in L-solve */
+ int_t *bmod; /* Modification count for U-solve */
+ int_t **bsendx_plist; /* Column process list to send down Xk */
+ int_t *brecv; /* Modifications to be recv'd in proc row */
+ int_t nbrecvx; /* Number of Xk I will receive in U-solve */
+ int_t nbsendx; /* Number of Xk I will send in U-solve */
+ int_t *mod_bit; /* Flag contribution from each row blocks */

 /*-- Auxiliary arrays used for forward/back solves. --*/
- int_t *ilsum; /* Starting position of each supernode in lsum
+ int_t *ilsum; /* Starting position of each supernode in lsum
 (local) */
- int_t ldalsum; /* LDA of lsum (local) */
- int_t SolveMsgSent; /* Number of actual messages sent in LU-solve */
- int_t SolveMsgVol; /* Volume of messages sent in the solve phase */
+ int_t ldalsum; /* LDA of lsum (local) */
+ int_t SolveMsgSent; /* Number of actual messages sent in LU-solve */
+ int_t SolveMsgVol; /* Volume of messages sent in the solve phase */
+

 /*********************/
 /* The following variables are used in the hybrid solver */
@@ -161,19 +161,19 @@ typedef struct
 int_t UT_SOLVE;
 int_t L_SOLVE;
 int_t FRECV;
- int_t ut_ldalsum; /* LDA of lsum (local) */
- int_t *ut_ilsum; /* ilsum in column-wise */
- int_t *utmod; /* Modification count for Ut-solve. */
- int_t **ut_sendx_plist; /* Row process list to send down Xk */
- int_t *utrecv; /* Modifications to be recev'd in proc column. */
- int_t n_utsendx; /* Number of Xk I will receive */
- int_t n_utrecvx; /* Number of Xk I will send */
+ int_t ut_ldalsum; /* LDA of lsum (local) */
+ int_t *ut_ilsum; /* ilsum in column-wise */
+ int_t *utmod; /* Modification count for Ut-solve. */
+ int_t **ut_sendx_plist; /* Row process list to send down Xk */
+ int_t *utrecv; /* Modifications to be recv'd in proc column. */
+ int_t n_utsendx; /* Number of Xk I will receive */
+ int_t n_utrecvx; /* Number of Xk I will send */
 int_t n_utrecvmod;
 int_t nroot;
 int_t *ut_modbit;
 int_t *Urbs;
- Ucb_indptr_t **Ucb_indptr; /* Vertical linked list pointing to Uindex[] */
- int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */
+ Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */
+ int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */

 /* some additional counters for L solve */
 int_t n;
@@ -182,52 +182,53 @@ typedef struct
 int_t inv; /* whether the diagonal block is inverted*/
} dLocalLU_t;

-typedef struct
-{
+
+typedef struct {
 int_t *etree;
 Glu_persist_t *Glu_persist;
 dLocalLU_t *Llu;
 char dt;
} dLUstruct_t;

+
 /*-- Data structure for communication during matrix-vector multiplication. */
-typedef struct
-{
+typedef struct {
 int_t *extern_start;
- int_t *ind_tosend; /* X indeices to be sent to other processes */
- int_t *ind_torecv; /* X indeices to be received from other processes */
- int_t *ptr_ind_tosend; /* Printers to ind_tosend[] (Size procs)
 (also point to val_torecv) */
- int_t *ptr_ind_torecv; /* Printers to ind_torecv[] (Size procs)
 (also point to val_tosend) */
- int *SendCounts; /* Numbers of X indices to be sent
 (also numbers of X values to be received) */
- int *RecvCounts; /* Numbers of X indices to be received
 (also numbers of X values to be sent) */
- double *val_tosend; /* X values to be sent to other processes */
- double *val_torecv; /* X values to be received from other processes */
- int_t TotalIndSend; /* Total number of indices to be sent
 (also total number of values to be received) */
- int_t TotalValSend; /* Total number of values to be sent.
 (also total number of indices to be received) */
+ int_t *ind_tosend; /* X indices to be sent to other processes */
+ int_t *ind_torecv; /* X indices to be received from other processes */
+ int_t *ptr_ind_tosend;/* Pointers to ind_tosend[] (Size procs)
 (also point to val_torecv) */
+ int_t *ptr_ind_torecv;/* Pointers to ind_torecv[] (Size procs)
 (also point to val_tosend) */
+ int *SendCounts; /* Numbers of X indices to be sent
 (also numbers of X values to be received) */
+ int *RecvCounts; /* Numbers of X indices to be received
 (also numbers of X values to be sent) */
+ double *val_tosend; /* X values to be sent to other processes */
+ double *val_torecv; /* X values to be received from other processes */
+ int_t TotalIndSend; /* Total number of indices to be sent
 (also total number of values to be received) */
+ int_t TotalValSend; /* Total number of values to be sent.
 (also total number of indices to be received) */
} pdgsmv_comm_t;

 /*-- Data structure holding the information for the solution phase --*/
-typedef struct
-{
+typedef struct {
 int_t *row_to_proc;
 int_t *inv_perm_c;
 int_t num_diag_procs, *diag_procs, *diag_len;
- pdgsmv_comm_t *gsmv_comm; /* communication metadata for SpMV,
+ pdgsmv_comm_t *gsmv_comm; /* communication metadata for SpMV,
 required by IterRefine. */
- pxgstrs_comm_t *gstrs_comm; /* communication metadata for SpTRSV. */
- int_t *A_colind_gsmv; /* After pdgsmv_init(), the global column
+ pxgstrs_comm_t *gstrs_comm; /* communication metadata for SpTRSV. */
+ int_t *A_colind_gsmv; /* After pdgsmv_init(), the global column
 indices of A are translated into the relative
 positions in the gathered x-vector.
This is re-used in repeated calls to pdgsmv() */ int_t *xrow_to_proc; /* used by PDSLin */ } dSOLVEstruct_t; + + /*==== For 3D code ====*/ // new structures for pdgstrf_4_8 @@ -291,43 +292,45 @@ typedef struct int_t scuStatUpdate( int_t knsupc, - HyP_t *HyP, - SCT_t *SCT, - SuperLUStat_t *stat); + HyP_t* HyP, + SCT_t* SCT, + SuperLUStat_t *stat + ); typedef struct { gEtreeInfo_t gEtreeInfo; - int_t *iperm_c_supno; - int_t *myNodeCount; - int_t *myTreeIdxs; - int_t *myZeroTrIdxs; - int_t **treePerm; - sForest_t **sForests; - int_t *supernode2treeMap; - dLUValSubBuf_t *LUvsb; + int_t* iperm_c_supno; + int_t* myNodeCount; + int_t* myTreeIdxs; + int_t* myZeroTrIdxs; + int_t** treePerm; + sForest_t** sForests; + int_t* supernode2treeMap; + dLUValSubBuf_t *LUvsb; } trf3Dpartition_t; typedef struct { double *bigU; double *bigV; -} scuBufs_t; +} dscuBufs_t; typedef struct -{ - double *BlockLFactor; - double *BlockUFactor; -} diagFactBufs_t; +{ + double* BlockLFactor; + double* BlockUFactor; +} ddiagFactBufs_t; typedef struct { - Ublock_info_t *Ublock_info; - Remain_info_t *Remain_info; - uPanelInfo_t *uPanelInfo; - lPanelInfo_t *lPanelInfo; + Ublock_info_t* Ublock_info; + Remain_info_t* Remain_info; + uPanelInfo_t* uPanelInfo; + lPanelInfo_t* lPanelInfo; } packLUInfo_t; +//#endif /*=====================*/ /*********************************************************************** @@ -335,50 +338,48 @@ typedef struct ***********************************************************************/ #ifdef __cplusplus -extern "C" -{ +extern "C" { #endif - /* Supernodal LU factor related */ - extern void - dCreate_CompCol_Matrix_dist(SuperMatrix *, int_t, int_t, int_t, double *, - int_t *, int_t *, Stype_t, Dtype_t, Mtype_t); - extern void - dCreate_CompRowLoc_Matrix_dist(SuperMatrix *, int_t, int_t, int_t, int_t, - int_t, double *, int_t *, int_t *, - Stype_t, Dtype_t, Mtype_t); - extern void - dCompRow_to_CompCol_dist(int_t, int_t, int_t, double *, int_t *, int_t *, - double **, int_t **, int_t **); - extern int - pdCompRow_loc_to_CompCol_global(int_t, SuperMatrix *, gridinfo_t *, - SuperMatrix *); - extern void - dCopy_CompCol_Matrix_dist(SuperMatrix *, SuperMatrix *); - extern void - dCreate_Dense_Matrix_dist(SuperMatrix *, int_t, int_t, double *, int_t, - Stype_t, Dtype_t, Mtype_t); - extern void - dCreate_SuperNode_Matrix_dist(SuperMatrix *, int_t, int_t, int_t, double *, - int_t *, int_t *, int_t *, int_t *, int_t *, - Stype_t, Dtype_t, Mtype_t); - extern void - dCopy_Dense_Matrix_dist(int_t, int_t, double *, int_t, - double *, int_t); - - extern void dallocateA_dist(int_t, int_t, double **, int_t **, int_t **); - extern void dGenXtrue_dist(int_t, int_t, double *, int_t); - extern void dFillRHS_dist(char *, int_t, double *, int_t, + +/* Supernodal LU factor related */ +extern void +dCreate_CompCol_Matrix_dist(SuperMatrix *, int_t, int_t, int_t, double *, + int_t *, int_t *, Stype_t, Dtype_t, Mtype_t); +extern void +dCreate_CompRowLoc_Matrix_dist(SuperMatrix *, int_t, int_t, int_t, int_t, + int_t, double *, int_t *, int_t *, + Stype_t, Dtype_t, Mtype_t); +extern void +dCompRow_to_CompCol_dist(int_t, int_t, int_t, double *, int_t *, int_t *, + double **, int_t **, int_t **); +extern int +pdCompRow_loc_to_CompCol_global(int_t, SuperMatrix *, gridinfo_t *, + SuperMatrix *); +extern void +dCopy_CompCol_Matrix_dist(SuperMatrix *, SuperMatrix *); +extern void +dCreate_Dense_Matrix_dist(SuperMatrix *, int_t, int_t, double *, int_t, + Stype_t, Dtype_t, Mtype_t); +extern void 
+dCreate_SuperNode_Matrix_dist(SuperMatrix *, int_t, int_t, int_t, double *, + int_t *, int_t *, int_t *, int_t *, int_t *, + Stype_t, Dtype_t, Mtype_t); +extern void +dCopy_Dense_Matrix_dist(int_t, int_t, double *, int_t, + double *, int_t); + +extern void dallocateA_dist (int_t, int_t, double **, int_t **, int_t **); +extern void dGenXtrue_dist (int_t, int_t, double *, int_t); +extern void dFillRHS_dist (char *, int_t, double *, int_t, SuperMatrix *, double *, int_t); - - extern int dcreate_matrix(SuperMatrix *, int, double **, int *, double **, int *, FILE *, gridinfo_t *); extern int dcreate_matrix_rb(SuperMatrix *, int, double **, int *, double **, int *, FILE *, gridinfo_t *); extern int dcreate_matrix_dat(SuperMatrix *, int, double **, int *, double **, int *, FILE *, gridinfo_t *); -extern int dcreate_matrix_postfix(SuperMatrix *, int, double **, int *, +extern int dcreate_matrix_postfix(SuperMatrix *, int, double **, int *, double **, int *, FILE *, char *, gridinfo_t *); extern void dScalePermstructInit(const int_t, const int_t, @@ -448,7 +449,7 @@ extern void dscatter_u (int ib, int jb, int nsupc, int_t iukp, int_t * xsup, int_t* lsub, int_t* usub, double* tempv, int_t ** Ufstnz_br_ptr, double **Unzval_br_ptr, gridinfo_t * grid); -extern int_t pdgstrf(superlu_dist_options_t *, int, int, double, +extern int_t pdgstrf(superlu_dist_options_t *, int, int, double anorm, dLUstruct_t*, gridinfo_t*, SuperLUStat_t*, int*); /* #define GPU_PROF @@ -570,6 +571,7 @@ extern int file_dPrint_CompRowLoc_Matrix_dist(FILE *fp, SuperMatrix *A); extern void Printdouble5(char *, int_t, double *); extern int file_Printdouble5(FILE *, char *, int_t, double *); + /* BLAS */ #ifdef USE_VENDOR_BLAS @@ -584,6 +586,7 @@ extern void dtrsm_(const char*, const char*, const char*, const char*, extern void dgemv_(const char *, const int *, const int *, const double *, const double *a, const int *, const double *, const int *, const double *, double *, const int *, int); + #else extern int dgemm_(const char*, const char*, const int*, const int*, const int*, const double*, const double*, const int*, const double*, @@ -627,19 +630,19 @@ extern int superlu_dgemv(const char *trans, const int m, extern int superlu_dtrsv(char *uplo, char *trans, char *diag, int n, double *a, int lda, double *x, int incx); + // LAPACK routine -extern void dtrtri_(char *, char *, int *, double *, int *, int *); +extern void dtrtri_(char*, char*, int*, double*, int*, int*); - -/*==== For 3D code ====*/ +/*==== For 3D code ====*/ extern int dcreate_matrix3d(SuperMatrix *A, int nrhs, double **rhs, - int *ldb, double **x, int *ldx, - FILE *fp, gridinfo3d_t *grid3d); + int *ldb, double **x, int *ldx, + FILE *fp, gridinfo3d_t *grid3d); extern int dcreate_matrix_postfix3d(SuperMatrix *A, int nrhs, double **rhs, - int *ldb, double **x, int *ldx, - FILE *fp, char *postfix, gridinfo3d_t *grid3d); -/* 3D-to-2D redistribution */ + int *ldb, double **x, int *ldx, + FILE *fp, char * postfix, gridinfo3d_t *grid3d); + /* Matrix distributed in NRformat_loc in 3D process grid. 
It converts it to a NRformat_loc distributed in 2D grid in grid-0 */ extern NRformat_loc3d *dGatherNRformat_loc3d(NRformat_loc *A, double *B, @@ -647,137 +650,129 @@ extern NRformat_loc3d *dGatherNRformat_loc3d(NRformat_loc *A, double *B, gridinfo3d_t *grid3d); extern int dScatter_B3d(NRformat_loc3d *A3d, gridinfo3d_t *grid3d); -extern void pdgssvx3d(superlu_dist_options_t *, SuperMatrix *, - dScalePermstruct_t *, double B[], int ldb, int nrhs, - gridinfo3d_t *, dLUstruct_t *, dSOLVEstruct_t *, - double *berr, SuperLUStat_t *, int *info); +extern void pdgssvx3d (superlu_dist_options_t *, SuperMatrix *, + dScalePermstruct_t *, double B[], int ldb, int nrhs, + gridinfo3d_t *, dLUstruct_t *, dSOLVEstruct_t *, + double *berr, SuperLUStat_t *, int *info); extern int_t pdgstrf3d(superlu_dist_options_t *, int m, int n, double anorm, - trf3Dpartition_t *, SCT_t *, dLUstruct_t *, + trf3Dpartition_t*, SCT_t *, dLUstruct_t *, gridinfo3d_t *, SuperLUStat_t *, int *); -extern void dInit_HyP(HyP_t *HyP, dLocalLU_t *Llu, int_t mcb, int_t mrb); -extern void Free_HyP(HyP_t *HyP); -extern int updateDirtyBit(int_t k0, HyP_t *HyP, gridinfo_t *grid); +extern void dInit_HyP(HyP_t* HyP, dLocalLU_t *Llu, int_t mcb, int_t mrb ); +extern void Free_HyP(HyP_t* HyP); +extern int updateDirtyBit(int_t k0, HyP_t* HyP, gridinfo_t* grid); /* from scatter.h */ - extern void - dblock_gemm_scatter(int_t lb, int_t j, Ublock_info_t *Ublock_info, - Remain_info_t *Remain_info, double *L_mat, int ldl, - double *U_mat, int ldu, double *bigV, - // int_t jj0, - int_t knsupc, int_t klst, - int_t *lsub, int_t *usub, int_t ldt, - int_t thread_id, - int *indirect, int *indirect2, - int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr, - int_t **Ufstnz_br_ptr, double **Unzval_br_ptr, - int_t *xsup, gridinfo_t *, SuperLUStat_t * +extern void +dblock_gemm_scatter( int_t lb, int_t j, Ublock_info_t *Ublock_info, + Remain_info_t *Remain_info, double *L_mat, int ldl, + double *U_mat, int ldu, double *bigV, + // int_t jj0, + int_t knsupc, int_t klst, + int_t *lsub, int_t *usub, int_t ldt, + int_t thread_id, + int *indirect, int *indirect2, + int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr, + int_t **Ufstnz_br_ptr, double **Unzval_br_ptr, + int_t *xsup, gridinfo_t *, SuperLUStat_t * #ifdef SCATTER_PROFILE - , - double *Host_TheadScatterMOP, double *Host_TheadScatterTimer + , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer #endif - ); + ); #ifdef _OPENMP - /*this version uses a lock to prevent multiple thread updating the same block*/ - extern void - dblock_gemm_scatter_lock(int_t lb, int_t j, omp_lock_t *lock, - Ublock_info_t *Ublock_info, Remain_info_t *Remain_info, - double *L_mat, int_t ldl, double *U_mat, int_t ldu, - double *bigV, - // int_t jj0, - int_t knsupc, int_t klst, - int_t *lsub, int_t *usub, int_t ldt, - int_t thread_id, - int *indirect, int *indirect2, - int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr, - int_t **Ufstnz_br_ptr, double **Unzval_br_ptr, - int_t *xsup, gridinfo_t * +/*this version uses a lock to prevent multiple thread updating the same block*/ +extern void +dblock_gemm_scatter_lock( int_t lb, int_t j, omp_lock_t* lock, + Ublock_info_t *Ublock_info, Remain_info_t *Remain_info, + double *L_mat, int_t ldl, double *U_mat, int_t ldu, + double *bigV, + // int_t jj0, + int_t knsupc, int_t klst, + int_t *lsub, int_t *usub, int_t ldt, + int_t thread_id, + int *indirect, int *indirect2, + int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr, + int_t **Ufstnz_br_ptr, double **Unzval_br_ptr, + int_t *xsup, gridinfo_t * #ifdef 
SCATTER_PROFILE - , - double *Host_TheadScatterMOP, double *Host_TheadScatterTimer + , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer #endif - ); + ); #endif - extern int_t - dblock_gemm_scatterTopLeft(int_t lb, int_t j, double *bigV, - int_t knsupc, int_t klst, int_t *lsub, - int_t *usub, int_t ldt, - int *indirect, int *indirect2, - HyP_t *HyP, dLUstruct_t *, gridinfo_t *, - SCT_t *SCT, SuperLUStat_t *); - extern int_t - dblock_gemm_scatterTopRight(int_t lb, int_t j, double *bigV, - int_t knsupc, int_t klst, int_t *lsub, - int_t *usub, int_t ldt, - int *indirect, int *indirect2, - HyP_t *HyP, dLUstruct_t *, gridinfo_t *, - SCT_t *SCT, SuperLUStat_t *); - extern int_t - dblock_gemm_scatterBottomLeft(int_t lb, int_t j, double *bigV, - int_t knsupc, int_t klst, int_t *lsub, - int_t *usub, int_t ldt, - int *indirect, int *indirect2, - HyP_t *HyP, dLUstruct_t *, gridinfo_t *, - SCT_t *SCT, SuperLUStat_t *); - extern int_t - dblock_gemm_scatterBottomRight(int_t lb, int_t j, double *bigV, - int_t knsupc, int_t klst, int_t *lsub, - int_t *usub, int_t ldt, - int *indirect, int *indirect2, - HyP_t *HyP, dLUstruct_t *, gridinfo_t *, - SCT_t *SCT, SuperLUStat_t *); +extern int_t +dblock_gemm_scatterTopLeft( int_t lb, int_t j, double* bigV, + int_t knsupc, int_t klst, int_t* lsub, + int_t * usub, int_t ldt, + int* indirect, int* indirect2, + HyP_t* HyP, dLUstruct_t *, gridinfo_t*, + SCT_t*SCT, SuperLUStat_t * + ); +extern int_t +dblock_gemm_scatterTopRight( int_t lb, int_t j, double* bigV, + int_t knsupc, int_t klst, int_t* lsub, + int_t * usub, int_t ldt, + int* indirect, int* indirect2, + HyP_t* HyP, dLUstruct_t *, gridinfo_t*, + SCT_t*SCT, SuperLUStat_t * ); +extern int_t +dblock_gemm_scatterBottomLeft( int_t lb, int_t j, double* bigV, + int_t knsupc, int_t klst, int_t* lsub, + int_t * usub, int_t ldt, + int* indirect, int* indirect2, + HyP_t* HyP, dLUstruct_t *, gridinfo_t*, + SCT_t*SCT, SuperLUStat_t * ); +extern int_t +dblock_gemm_scatterBottomRight( int_t lb, int_t j, double* bigV, + int_t knsupc, int_t klst, int_t* lsub, + int_t * usub, int_t ldt, + int* indirect, int* indirect2, + HyP_t* HyP, dLUstruct_t *, gridinfo_t*, + SCT_t*SCT, SuperLUStat_t * ); /* from gather.h */ - extern void dgather_u(int_t num_u_blks, - Ublock_info_t *Ublock_info, int_t *usub, - double *uval, double *bigU, int_t ldu, - int_t *xsup, int_t klst /* for SuperSize */ - ); - - extern void dgather_l(int_t num_LBlk, int_t knsupc, - Remain_info_t *L_info, - double *lval, int_t LD_lval, - double *L_buff); - - extern void dRgather_L(int_t k, int_t *lsub, double *lusup, gEtreeInfo_t *, - Glu_persist_t *, gridinfo_t *, HyP_t *, - int_t *myIperm, int_t *iperm_c_supno); - extern void dRgather_U(int_t k, int_t jj0, int_t *usub, double *uval, - double *bigU, gEtreeInfo_t *, Glu_persist_t *, - gridinfo_t *, HyP_t *, int_t *myIperm, - int_t *iperm_c_supno, int_t *perm_u); +extern void dgather_u(int_t num_u_blks, + Ublock_info_t *Ublock_info, int_t * usub, + double *uval, double *bigU, int_t ldu, + int_t *xsup, int_t klst /* for SuperSize */ + ); + +extern void dgather_l( int_t num_LBlk, int_t knsupc, + Remain_info_t *L_info, + double * lval, int_t LD_lval, + double * L_buff ); + +extern void dRgather_L(int_t k, int_t *lsub, double *lusup, gEtreeInfo_t*, + Glu_persist_t *, gridinfo_t *, HyP_t *, + int_t *myIperm, int_t *iperm_c_supno ); +extern void dRgather_U(int_t k, int_t jj0, int_t *usub, double *uval, + double *bigU, gEtreeInfo_t*, Glu_persist_t *, + gridinfo_t *, HyP_t *, int_t *myIperm, + int_t *iperm_c_supno, 
int_t *perm_u); /* from xtrf3Dpartition.h */ - extern trf3Dpartition_t *dinitTrf3Dpartition(int_t nsupers, - superlu_dist_options_t *options, - dLUstruct_t *LUstruct, gridinfo3d_t *grid3d); - extern void dDestroy_trf3Dpartition(trf3Dpartition_t *trf3Dpartition, gridinfo3d_t *grid3d); +extern trf3Dpartition_t* dinitTrf3Dpartition(int_t nsupers, + superlu_dist_options_t *options, + dLUstruct_t *LUstruct, gridinfo3d_t * grid3d); +extern void dDestroy_trf3Dpartition(trf3Dpartition_t *trf3Dpartition, gridinfo3d_t *grid3d); - extern void d3D_printMemUse(trf3Dpartition_t *trf3Dpartition, - dLUstruct_t *LUstruct, gridinfo3d_t *grid3d); +extern void d3D_printMemUse(trf3Dpartition_t* trf3Dpartition, + dLUstruct_t *LUstruct, gridinfo3d_t * grid3d); - //extern int *getLastDep(gridinfo_t *grid, SuperLUStat_t *stat, - // superlu_dist_options_t *options, dLocalLU_t *Llu, - // int_t *xsup, int_t num_look_aheads, int_t nsupers, - // int_t *iperm_c_supno); +//extern int* getLastDep(gridinfo_t *grid, SuperLUStat_t *stat, +// superlu_dist_options_t *options, dLocalLU_t *Llu, +// int_t* xsup, int_t num_look_aheads, int_t nsupers, +// int_t * iperm_c_supno); - extern void dinit3DLUstructForest(int_t *myTreeIdxs, int_t *myZeroTrIdxs, - sForest_t **sForests, dLUstruct_t *LUstruct, - gridinfo3d_t *grid3d); +extern void dinit3DLUstructForest( int_t* myTreeIdxs, int_t* myZeroTrIdxs, + sForest_t** sForests, dLUstruct_t* LUstruct, + gridinfo3d_t* grid3d); - extern int_t dgatherAllFactoredLUFr(int_t *myZeroTrIdxs, sForest_t *sForests, - dLUstruct_t *LUstruct, gridinfo3d_t *grid3d, - SCT_t *SCT); +extern int_t dgatherAllFactoredLUFr(int_t* myZeroTrIdxs, sForest_t* sForests, + dLUstruct_t* LUstruct, gridinfo3d_t* grid3d, + SCT_t* SCT ); /* The following are from pdgstrf2.h */ -extern void pdgstrf2_trsm(superlu_dist_options_t * options, int_t k0, int_t k, - double thresh, Glu_persist_t *, gridinfo_t *, - dLocalLU_t *, MPI_Request *, int tag_ub, - SuperLUStat_t *, int *info); -extern void pdgstrs2_omp(int_t k0, int_t k, Glu_persist_t *, gridinfo_t *, - dLocalLU_t *, Ublock_info_t *, SuperLUStat_t *); - extern int_t dLpanelUpdate(int_t off0, int_t nsupc, double* ublk_ptr, int_t ld_ujrow, double* lusup, int_t nsupr, SCT_t*); extern void Local_Dgstrf2(superlu_dist_options_t *options, int_t k, @@ -792,7 +787,7 @@ extern int_t dTrs2_ScatterU(int_t iukp, int_t rukp, int_t klst, double* uval, double *tempv); extern int_t dTrs2_GatherTrsmScatter(int_t klst, int_t iukp, int_t rukp, int_t *usub, double* uval, double *tempv, - int_t knsupc, int_t nsupr, double* lusup, + int_t knsupc, int nsupr, double* lusup, Glu_persist_t *Glu_persist) ; extern void pdgstrs2 #ifdef _CRAY @@ -836,7 +831,7 @@ int_t dreduceAncestors3d(int_t sender, int_t receiver, double* Lval_buf, double* Uval_buf, dLUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT); /*reduces all nodelists required in a level*/ -int_t dreduceAllAncestors3d(int_t ilvl, int_t* myNodeCount, +extern int dreduceAllAncestors3d(int_t ilvl, int_t* myNodeCount, int_t** treePerm, dLUValSubBuf_t* LUvsb, dLUstruct_t* LUstruct, @@ -963,6 +958,7 @@ extern int_t dLPanelTrSolve(int_t k, int_t* factored_L, double* BlockUFactor, gridinfo_t *, dLUstruct_t *); /* from trfAux.h */ +extern int getNsupers(int, Glu_persist_t *); extern int_t initPackLUInfo(int_t nsupers, packLUInfo_t* packLUInfo); extern int freePackLUInfo(packLUInfo_t* packLUInfo); extern int_t dSchurComplementSetup(int_t k, int *msgcnt, Ublock_info_t*, @@ -973,28 +969,29 @@ extern int_t dSchurComplementSetup(int_t k, int *msgcnt, 
Ublock_info_t*, double* Uval_buf, gridinfo_t *, dLUstruct_t *); extern int_t dSchurComplementSetupGPU(int_t k, msgs_t* msgs, packLUInfo_t*, int_t*, int_t*, int_t*, gEtreeInfo_t*, - factNodelists_t*, scuBufs_t*, + factNodelists_t*, dscuBufs_t*, dLUValSubBuf_t* LUvsb, gridinfo_t *, dLUstruct_t *, HyP_t*); extern double* dgetBigV(int_t, int_t); extern double* dgetBigU(int_t, gridinfo_t *, dLUstruct_t *); - +// permutation from superLU default + /* from treeFactorization.h */ extern int_t dLluBufInit(dLUValSubBuf_t*, dLUstruct_t *); extern int_t dinitScuBufs(int_t ldt, int_t num_threads, int_t nsupers, - scuBufs_t*, dLUstruct_t*, gridinfo_t *); -extern int dfreeScuBufs(scuBufs_t* scuBufs); + dscuBufs_t*, dLUstruct_t*, gridinfo_t *); +extern int dfreeScuBufs(dscuBufs_t* scuBufs); // the generic tree factoring code extern int_t treeFactor( int_t nnnodes, // number of nodes in the tree int_t *perm_c_supno, // list of nodes in the order of factorization commRequests_t *comReqs, // lists of communication requests - scuBufs_t *scuBufs, // contains buffers for schur complement update + dscuBufs_t *scuBufs, // contains buffers for schur complement update packLUInfo_t*packLUInfo, msgs_t*msgs, dLUValSubBuf_t* LUvsb, - diagFactBufs_t *dFBuf, + ddiagFactBufs_t *dFBuf, factStat_t *factStat, factNodelists_t *fNlists, superlu_dist_options_t *options, @@ -1010,11 +1007,11 @@ extern int_t dsparseTreeFactor( int_t *perm_c_supno, // list of nodes in the order of factorization treeTopoInfo_t* treeTopoInfo, commRequests_t *comReqs, // lists of communication requests - scuBufs_t *scuBufs, // contains buffers for schur complement update + dscuBufs_t *scuBufs, // contains buffers for schur complement update packLUInfo_t*packLUInfo, msgs_t*msgs, dLUValSubBuf_t* LUvsb, - diagFactBufs_t *dFBuf, + ddiagFactBufs_t *dFBuf, factStat_t *factStat, factNodelists_t *fNlists, superlu_dist_options_t *options, @@ -1029,11 +1026,11 @@ extern int_t ddenseTreeFactor( int_t nnnodes, // number of nodes in the tree int_t *perm_c_supno, // list of nodes in the order of factorization commRequests_t *comReqs, // lists of communication requests - scuBufs_t *scuBufs, // contains buffers for schur complement update + dscuBufs_t *scuBufs, // contains buffers for schur complement update packLUInfo_t*packLUInfo, msgs_t*msgs, dLUValSubBuf_t* LUvsb, - diagFactBufs_t *dFBuf, + ddiagFactBufs_t *dFBuf, factStat_t *factStat, factNodelists_t *fNlists, superlu_dist_options_t *options, @@ -1047,11 +1044,11 @@ extern int_t ddenseTreeFactor( extern int_t dsparseTreeFactor_ASYNC( sForest_t* sforest, commRequests_t **comReqss, // lists of communication requests // size maxEtree level - scuBufs_t *scuBufs, // contains buffers for schur complement update + dscuBufs_t *scuBufs, // contains buffers for schur complement update packLUInfo_t*packLUInfo, msgs_t**msgss, // size=num Look ahead dLUValSubBuf_t** LUvsbs, // size=num Look ahead - diagFactBufs_t **dFBufs, // size maxEtree level + ddiagFactBufs_t **dFBufs, // size maxEtree level factStat_t *factStat, factNodelists_t *fNlists, gEtreeInfo_t* gEtreeInfo, // global etree info @@ -1065,9 +1062,9 @@ extern int_t dsparseTreeFactor_ASYNC( ); extern dLUValSubBuf_t** dLluBufInitArr(int_t numLA, dLUstruct_t *LUstruct); extern int dLluBufFreeArr(int_t numLA, dLUValSubBuf_t **LUvsbs); -extern diagFactBufs_t** dinitDiagFactBufsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* grid); -extern int dfreeDiagFactBufsArr(int_t mxLeafNode, diagFactBufs_t** dFBufs); -extern int_t dinitDiagFactBufs(int_t ldt, diagFactBufs_t* dFBuf); 
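
The buffer-array allocators here (renamed with the d prefix just below) exist so that independent subtrees and look-ahead panels never share scratch space: one dLUValSubBuf_t per look-ahead window, one diagonal-factor buffer per elimination-tree level. A minimal sketch of the allocate/free pairing follows; the ldt-by-ldt sizing and the _sketch names are assumptions for illustration, not the library's actual formula.

#include <stdlib.h>

typedef struct { double *BlockLFactor, *BlockUFactor; } diagFactBufs_sketch_t;

/* One diagonal-factor scratch buffer per elimination-tree level. */
diagFactBufs_sketch_t **allocDiagFactBufsArr_sketch(int mxLeafNode, int ldt)
{
    diagFactBufs_sketch_t **dFBufs = malloc(mxLeafNode * sizeof *dFBufs);
    for (int i = 0; i < mxLeafNode; ++i) {
        dFBufs[i] = malloc(sizeof *dFBufs[i]);
        dFBufs[i]->BlockLFactor = calloc((size_t)ldt * ldt, sizeof(double));
        dFBufs[i]->BlockUFactor = calloc((size_t)ldt * ldt, sizeof(double));
    }
    return dFBufs;
}

/* Matching teardown, mirroring the dfreeDiagFactBufsArr declaration. */
void freeDiagFactBufsArr_sketch(int mxLeafNode, diagFactBufs_sketch_t **dFBufs)
{
    for (int i = 0; i < mxLeafNode; ++i) {
        free(dFBufs[i]->BlockLFactor);
        free(dFBufs[i]->BlockUFactor);
        free(dFBufs[i]);
    }
    free(dFBufs);
}
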
+extern ddiagFactBufs_t** dinitDiagFactBufsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* grid); +extern int dfreeDiagFactBufsArr(int_t mxLeafNode, ddiagFactBufs_t** dFBufs); +extern int_t dinitDiagFactBufs(int_t ldt, ddiagFactBufs_t* dFBuf); extern int_t checkRecvUDiag(int_t k, commRequests_t *comReqs, gridinfo_t *grid, SCT_t *SCT); extern int_t checkRecvLDiag(int_t k, commRequests_t *comReqs, gridinfo_t *, SCT_t *); @@ -1077,11 +1074,11 @@ extern int_t ancestorFactor( int_t ilvl, // level of factorization sForest_t* sforest, commRequests_t **comReqss, // lists of communication requests // size maxEtree level - scuBufs_t *scuBufs, // contains buffers for schur complement update + dscuBufs_t *scuBufs, // contains buffers for schur complement update packLUInfo_t*packLUInfo, msgs_t**msgss, // size=num Look ahead dLUValSubBuf_t** LUvsbs, // size=num Look ahead - diagFactBufs_t **dFBufs, // size maxEtree level + ddiagFactBufs_t **dFBufs, // size maxEtree level factStat_t *factStat, factNodelists_t *fNlists, gEtreeInfo_t* gEtreeInfo, // global etree info @@ -1092,11 +1089,13 @@ extern int_t ancestorFactor( dLUstruct_t *LUstruct, gridinfo3d_t * grid3d, SuperLUStat_t *stat, double thresh, SCT_t *SCT, int tag_ub, int *info ); - -/*== end 3D prototypes =================*/ + +/*== end 3D prototypes ===================*/ + #ifdef __cplusplus -} + } #endif #endif /* __SUPERLU_dDEFS */ + diff --git a/SRC/superlu_defs.h b/SRC/superlu_defs.h index b31f65db..bb051921 100644 --- a/SRC/superlu_defs.h +++ b/SRC/superlu_defs.h @@ -130,6 +130,9 @@ typedef MPI_C_DOUBLE_COMPLEX SuperLU_MPI_DOUBLE_COMPLEX; #include "util_dist.h" #include "psymbfact.h" +#define MAX_SUPER_SIZE 256 /* Sherry: moved from superlu_gpu.cu */ + + #define ISORT /* NOTE: qsort() has bug on Mac */ /*********************************************************************** @@ -1270,6 +1273,9 @@ extern int_t* getMyNodeCounts(int_t maxLvl, int_t* myTreeIdxs, int_t* gNodeCount extern int_t checkIntVector3d(int_t* vec, int_t len, gridinfo3d_t* grid3d); extern int_t reduceStat(PhaseType PHASE, SuperLUStat_t *stat, gridinfo3d_t * grid3d); + extern int getnCudaStreams(); + extern int get_mpi_process_per_gpu (); + /*=====================*/ #ifdef __cplusplus diff --git a/SRC/superlu_dist_config.h b/SRC/superlu_dist_config.h index ffd061a2..8dcabe22 100644 --- a/SRC/superlu_dist_config.h +++ b/SRC/superlu_dist_config.h @@ -1,7 +1,7 @@ /* superlu_dist_config.h.in */ /* Enable CUDA */ -/* #undef HAVE_CUDA */ +#define HAVE_CUDA TRUE /* Enable parmetis */ #define HAVE_PARMETIS TRUE diff --git a/SRC/superlu_gpu.cu b/SRC/superlu_gpu.cu deleted file mode 100644 index 173560c8..00000000 --- a/SRC/superlu_gpu.cu +++ /dev/null @@ -1,1877 +0,0 @@ -#define GPU_DEBUG - -#include "mpi.h" -#include "omp.h" -// #include "sec_structs.h" -#include -#include -#include - -#undef Reduce -#include "cub/cub.cuh" -//#include - -#include "lustruct_gpu.h" -// #include "p3dcomm.h" - -#include "dcomplex.h" - -extern "C" { - void cblas_daxpy(const int N, const double alpha, const double *X, - const int incX, double *Y, const int incY); -} - -/*error reporting functions */ -static -cudaError_t checkCuda(cudaError_t result) -{ -#if defined(DEBUG) || defined(_DEBUG) - if (result != cudaSuccess) - { - fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result)); - assert(result == cudaSuccess); - } -#endif - return result; -} - - -// cublasStatus_t checkCublas(cublasStatus_t result) -// { -// #if defined(DEBUG) || defined(_DEBUG) -// if (result != CUBLAS_STATUS_SUCCESS) -// { 
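
getnCudaStreams() and get_mpi_process_per_gpu(), declared in superlu_defs.h above and defined in the superlu_gpu.cu code removed below, follow one pattern: read an integer tuning knob from the environment (N_CUDA_STREAMS, MPI_PROCESS_PER_GPU) and fall back to 1 when it is unset. A compact restatement of that pattern, with a hypothetical helper name:

#include <stdlib.h>

/* Hypothetical helper: read an integer tuning knob from the
 * environment, falling back to a default when it is unset. */
static int env_int_or(const char *name, int dflt)
{
    const char *s = getenv(name);
    return s ? atoi(s) : dflt;
}

/* e.g.  int nstreams = env_int_or("N_CUDA_STREAMS", 1); */
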
-// fprintf(stderr, "CUDA Blas Runtime Error: %s\n", cublasGetErrorString(result)); -// assert(result == CUBLAS_STATUS_SUCCESS); -// } -// #endif -// return result; -// } - - -int_t getnCudaStreams() -{ - // Disabling multiple cuda streams - #if 1 - return 1; - #else - char *ttemp; - ttemp = getenv ("N_CUDA_STREAMS"); - - if (ttemp) - return atoi (ttemp); - else - return 1; - #endif -} - - -// #define UNIT_STRIDE - -__device__ inline -void device_scatter_l (int_t thread_id, - int_t nsupc, int_t temp_nbrow, - int_t *usub, int_t iukp, int_t klst, - double *nzval, int_t ldv, - double *tempv, int_t nbrow, - // int_t *indirect2_thread - int *indirect2_thread - ) -{ - - - int_t segsize, jj; - - for (jj = 0; jj < nsupc; ++jj) - { - segsize = klst - usub[iukp + jj]; - if (segsize) - { - if (thread_id < temp_nbrow) - { - -#ifndef UNIT_STRIDE - nzval[indirect2_thread[thread_id]] -= tempv[thread_id]; -#else - nzval[thread_id] -= tempv[thread_id]; /*making access unit strided*/ -#endif - } - tempv += nbrow; - } - nzval += ldv; - } -} - -#define THREAD_BLOCK_SIZE 512 /* Sherry: was 192 on Titan */ - -#define MAX_SUPER_SIZE 512 /* Sherry: was 192. Must be <= THREAD_BLOCK_SIZE */ - - -__device__ inline -void device_scatter_l_2D (int thread_id, - int nsupc, int temp_nbrow, - int_t *usub, int iukp, int_t klst, - double *nzval, int ldv, - const double *tempv, int nbrow, - // int_t *indirect2_thread - int *indirect2_thread, - int nnz_cols, int ColPerBlock, - int *IndirectJ3 - ) -{ - if ( thread_id < temp_nbrow * ColPerBlock ) - { - int thread_id_x = thread_id % temp_nbrow; - int thread_id_y = thread_id / temp_nbrow; - -#define UNROLL_ITER 8 - -#pragma unroll 4 - for (int col = thread_id_y; col < nnz_cols ; col += ColPerBlock) - { - nzval[ldv * IndirectJ3[col] + indirect2_thread[thread_id_x]] - -= tempv[nbrow * col + thread_id_x]; - } - } -} - -/* Sherry: this routine is not used */ -__global__ -void cub_scan_test(void) -{ - int thread_id = threadIdx.x; - typedef cub::BlockScan BlockScan; /*1D int data type*/ - - __shared__ typename BlockScan::TempStorage temp_storage; /*storage temp*/ - - __shared__ int IndirectJ1[MAX_SUPER_SIZE]; - __shared__ int IndirectJ2[MAX_SUPER_SIZE]; - - if (thread_id < MAX_SUPER_SIZE) - { - IndirectJ1[thread_id] = (thread_id + 1) % 2; - } - - __syncthreads(); - if (thread_id < MAX_SUPER_SIZE) - BlockScan(temp_storage).InclusiveSum (IndirectJ1[thread_id], IndirectJ2[thread_id]); - - - if (thread_id < MAX_SUPER_SIZE) - printf("%d %d\n", thread_id, IndirectJ2[thread_id]); - -} - -__device__ inline -void device_scatter_u_2D (int thread_id, - int temp_nbrow, int nsupc, - double * ucol, - int_t * usub, int iukp, - int_t ilst, int_t klst, - int_t * index, int iuip_lib, - double * tempv, int nbrow, - int *indirect, - int nnz_cols, int ColPerBlock, - int *IndirectJ1, - int *IndirectJ3 - ) -{ - if ( thread_id < temp_nbrow * ColPerBlock ) - { - /* 1D threads are logically arranged in 2D shape. 
*/ - int thread_id_x = thread_id % temp_nbrow; - int thread_id_y = thread_id / temp_nbrow; - -#pragma unroll 4 - for (int col = thread_id_y; col < nnz_cols ; col += ColPerBlock) - { - ucol[IndirectJ1[IndirectJ3[col]] + indirect[thread_id_x]] - -= tempv[nbrow * col + thread_id_x]; - } - } -} - - -__device__ inline -void device_scatter_u (int_t thread_id, - int_t temp_nbrow, int_t nsupc, - double * ucol, - int_t * usub, int_t iukp, - int_t ilst, int_t klst, - int_t * index, int_t iuip_lib, - double * tempv, int_t nbrow, - // int_t *indirect - int *indirect - ) -{ - int_t segsize, fnz, jj; - for (jj = 0; jj < nsupc; ++jj) - { - segsize = klst - usub[iukp + jj]; - fnz = index[iuip_lib++]; - ucol -= fnz; - if (segsize) /* Nonzero segment in U(k.j). */ - { - - - if (thread_id < temp_nbrow) - { -#ifndef UNIT_STRIDE - ucol[indirect[thread_id]] -= tempv[thread_id]; -#else - /*making access unit strided; - it doesn't work; it for measurements */ - ucol[thread_id] -= tempv[thread_id]; -#endif - } - tempv += nbrow; - } - ucol += ilst ; - } -} - - -__global__ -void Scatter_GPU_kernel( - int_t streamId, - int_t ii_st, int_t ii_end, - int_t jj_st, int_t jj_end, /* defines rectangular Schur block to be scatter */ - int_t klst, - int_t jj0, /* 0 on entry */ - int_t nrows, int_t ldt, int_t npcol, int_t nprow, - LUstruct_gpu * A_gpu) -{ - - /* initializing pointers */ - int_t *xsup = A_gpu->xsup; - int_t *UrowindPtr = A_gpu->UrowindPtr; - int_t *UrowindVec = A_gpu->UrowindVec; - int_t *UnzvalPtr = A_gpu->UnzvalPtr; - double *UnzvalVec = A_gpu->UnzvalVec; - int_t *LrowindPtr = A_gpu->LrowindPtr; - int_t *LrowindVec = A_gpu->LrowindVec; - int_t *LnzvalPtr = A_gpu->LnzvalPtr; - double *LnzvalVec = A_gpu->LnzvalVec; - double *bigV = A_gpu->scubufs[streamId].bigV; - local_l_blk_info_t *local_l_blk_infoVec = A_gpu->local_l_blk_infoVec; - local_u_blk_info_t *local_u_blk_infoVec = A_gpu->local_u_blk_infoVec; - int_t *local_l_blk_infoPtr = A_gpu->local_l_blk_infoPtr; - int_t *local_u_blk_infoPtr = A_gpu->local_u_blk_infoPtr; - Remain_info_t *Remain_info = A_gpu->scubufs[streamId].Remain_info; - Ublock_info_t *Ublock_info = A_gpu->scubufs[streamId].Ublock_info; - int_t *lsub = A_gpu->scubufs[streamId].lsub; - int_t *usub = A_gpu->scubufs[streamId].usub; - - /* thread block assignment: this thread block is - assigned to block (lb, j) in 2D grid */ - int lb = blockIdx.x + ii_st; - int j = blockIdx.y + jj_st; - __shared__ int indirect_thread[MAX_SUPER_SIZE]; /* row-wise */ - __shared__ int indirect2_thread[MAX_SUPER_SIZE]; /* row-wise */ - __shared__ int IndirectJ1[THREAD_BLOCK_SIZE]; /* column-wise */ - __shared__ int IndirectJ3[THREAD_BLOCK_SIZE]; /* column-wise */ - - /* see CUB page https://nvlabs.github.io/cub/. Implement threads collectives */ - typedef cub::BlockScan BlockScan; /*1D int data type*/ - __shared__ typename BlockScan::TempStorage temp_storage; /*storage temp*/ - - int thread_id = threadIdx.x; - - int iukp = Ublock_info[j].iukp; - int jb = Ublock_info[j].jb; - int nsupc = SuperSize (jb); - int ljb = jb / npcol; - - double *tempv1; - if (jj_st == jj0) - { - tempv1 = (j == jj_st) ? bigV - : bigV + Ublock_info[j - 1].full_u_cols * nrows; - } - else - { - tempv1 = (j == jj_st) ? bigV - : bigV + (Ublock_info[j - 1].full_u_cols - - Ublock_info[jj_st - 1].full_u_cols) * nrows; - } - - /* # of nonzero columns in block j */ - int nnz_cols = (j == 0) ? Ublock_info[j].full_u_cols - : (Ublock_info[j].full_u_cols - Ublock_info[j - 1].full_u_cols); - int cum_ncol = (j == 0) ? 
0 : Ublock_info[j - 1].full_u_cols; - - int lptr = Remain_info[lb].lptr; - int ib = Remain_info[lb].ib; - int temp_nbrow = lsub[lptr + 1]; /* number of rows in the current L block */ - lptr += LB_DESCRIPTOR; - - int_t cum_nrow; - if (ii_st == 0) - { - cum_nrow = (lb == 0 ? 0 : Remain_info[lb - 1].FullRow); - } - else - { - cum_nrow = (lb == 0 ? 0 : Remain_info[lb - 1].FullRow - Remain_info[ii_st - 1].FullRow); - } - - tempv1 += cum_nrow; - - if (ib < jb) /*scatter U code */ - { - int ilst = FstBlockC (ib + 1); - int lib = ib / nprow; /* local index of row block ib */ - int_t *index = &UrowindVec[UrowindPtr[lib]]; - - int num_u_blocks = index[0]; - - int ljb = (jb) / npcol; /* local index of column block jb */ - - /* Each thread is responsible for one block column */ - __shared__ int ljb_ind; - /*do a search ljb_ind at local row lib*/ - int blks_per_threads = CEILING(num_u_blocks, THREAD_BLOCK_SIZE); - for (int i = 0; i < blks_per_threads; ++i) - /* each thread is assigned a chunk of consecutive U blocks to search */ - { - /* only one thread finds the block index matching ljb */ - if (thread_id * blks_per_threads + i < num_u_blocks && - local_u_blk_infoVec[ local_u_blk_infoPtr[lib] + thread_id * blks_per_threads + i ].ljb == ljb) - { - ljb_ind = thread_id * blks_per_threads + i; - } - } - __syncthreads(); - - int iuip_lib = local_u_blk_infoVec[ local_u_blk_infoPtr[lib] + ljb_ind].iuip; - int ruip_lib = local_u_blk_infoVec[ local_u_blk_infoPtr[lib] + ljb_ind].ruip; - iuip_lib += UB_DESCRIPTOR; - double *Unzval_lib = &UnzvalVec[UnzvalPtr[lib]]; - double *ucol = &Unzval_lib[ruip_lib]; - - if (thread_id < temp_nbrow) /* row-wise */ - { - /* cyclically map each thread to a row */ - indirect_thread[thread_id] = (int) lsub[lptr + thread_id]; - } - - /* column-wise: each thread is assigned one column */ - if (thread_id < nnz_cols) - IndirectJ3[thread_id] = A_gpu->scubufs[streamId].usub_IndirectJ3[cum_ncol + thread_id]; - /* indirectJ3[j] == kk means the j-th nonzero segment - points to column kk in this supernode */ - - __syncthreads(); - - /* threads are divided into multiple columns */ - int ColPerBlock = THREAD_BLOCK_SIZE / temp_nbrow; - - if (thread_id < THREAD_BLOCK_SIZE) - IndirectJ1[thread_id] = 0; - - if (thread_id < THREAD_BLOCK_SIZE) - { - if (thread_id < nsupc) - { - /* fstnz subscript of each column in the block */ - IndirectJ1[thread_id] = index[iuip_lib + thread_id]; - } - } - - /* perform an inclusive block-wide prefix sum among all threads */ - if (thread_id < THREAD_BLOCK_SIZE) - BlockScan(temp_storage).InclusiveSum(IndirectJ1[thread_id], IndirectJ1[thread_id]); - - if (thread_id < THREAD_BLOCK_SIZE) - IndirectJ1[thread_id] = -IndirectJ1[thread_id] + ilst * thread_id; - - __syncthreads(); - - device_scatter_u_2D ( - thread_id, - temp_nbrow, nsupc, - ucol, - usub, iukp, - ilst, klst, - index, iuip_lib, - tempv1, nrows, - indirect_thread, - nnz_cols, ColPerBlock, - IndirectJ1, - IndirectJ3 ); - - } - else /* ib >= jb, scatter L code */ - { - - int rel; - double *nzval; - int_t *index = &LrowindVec[LrowindPtr[ljb]]; - int num_l_blocks = index[0]; - int ldv = index[1]; - - int fnz = FstBlockC (ib); - int lib = ib / nprow; - - __shared__ int lib_ind; - /*do a search lib_ind for lib*/ - int blks_per_threads = CEILING(num_l_blocks, THREAD_BLOCK_SIZE); - for (int i = 0; i < blks_per_threads; ++i) - { - if (thread_id * blks_per_threads + i < num_l_blocks && - local_l_blk_infoVec[ local_l_blk_infoPtr[ljb] + thread_id * blks_per_threads + i ].lib == lib) - { - lib_ind = thread_id * 
blks_per_threads + i; - } - } - __syncthreads(); - - int lptrj = local_l_blk_infoVec[ local_l_blk_infoPtr[ljb] + lib_ind].lptrj; - int luptrj = local_l_blk_infoVec[ local_l_blk_infoPtr[ljb] + lib_ind].luptrj; - lptrj += LB_DESCRIPTOR; - int dest_nbrow = index[lptrj - 1]; - - if (thread_id < dest_nbrow) - { - rel = index[lptrj + thread_id] - fnz; - indirect_thread[rel] = thread_id; - } - __syncthreads(); - - /* can be precalculated */ - if (thread_id < temp_nbrow) - { - rel = lsub[lptr + thread_id] - fnz; - indirect2_thread[thread_id] = indirect_thread[rel]; - } - if (thread_id < nnz_cols) - IndirectJ3[thread_id] = (int) A_gpu->scubufs[streamId].usub_IndirectJ3[cum_ncol + thread_id]; - __syncthreads(); - - int ColPerBlock = THREAD_BLOCK_SIZE / temp_nbrow; - - nzval = &LnzvalVec[LnzvalPtr[ljb]] + luptrj; - device_scatter_l_2D( - thread_id, - nsupc, temp_nbrow, - usub, iukp, klst, - nzval, ldv, - tempv1, nrows, indirect2_thread, - nnz_cols, ColPerBlock, - IndirectJ3); - } /* end else ib >= jb */ - -} /* end Scatter_GPU_kernel */ - - -#define GPU_2D_SCHUDT /* Not used */ - -int_t SchurCompUpdate_GPU( - int_t streamId, - int_t jj_cpu, /* 0 on entry, pointing to the start of Phi part */ - int_t nub, /* jj_cpu on entry, pointing to the end of the Phi part */ - int_t klst, int_t knsupc, - int_t Rnbrow, int_t RemainBlk, - int_t Remain_lbuf_send_size, - int_t bigu_send_size, int_t ldu, - int_t mcb, /* num_u_blks_hi */ - int_t buffer_size, int_t lsub_len, int_t usub_len, - int_t ldt, int_t k0, - sluGPU_t *sluGPU, gridinfo_t *grid -) -{ - - LUstruct_gpu * A_gpu = sluGPU->A_gpu; - LUstruct_gpu * dA_gpu = sluGPU->dA_gpu; - int_t nprow = grid->nprow; - int_t npcol = grid->npcol; - - cudaStream_t FunCallStream = sluGPU->funCallStreams[streamId]; - cublasHandle_t cublas_handle0 = sluGPU->cublasHandles[streamId]; - int_t * lsub = A_gpu->scubufs[streamId].lsub_buf; - int_t * usub = A_gpu->scubufs[streamId].usub_buf; - Remain_info_t *Remain_info = A_gpu->scubufs[streamId].Remain_info_host; - double * Remain_L_buff = A_gpu->scubufs[streamId].Remain_L_buff_host; - Ublock_info_t *Ublock_info = A_gpu->scubufs[streamId].Ublock_info_host; - double * bigU = A_gpu->scubufs[streamId].bigU_host; - - A_gpu->isOffloaded[k0] = 1; - /* start by sending data to */ - int_t *xsup = A_gpu->xsup_host; - int_t col_back = (jj_cpu == 0) ? 0 : Ublock_info[jj_cpu - 1].full_u_cols; - // if(nub<1) return; - int_t ncols = Ublock_info[nub - 1].full_u_cols - col_back; - - /* Sherry: can get max_super_size from sp_ienv(3) */ - int_t indirectJ1[MAX_SUPER_SIZE]; // 0 indicates an empry segment - int_t indirectJ2[MAX_SUPER_SIZE]; // # of nonzero segments so far - int_t indirectJ3[MAX_SUPER_SIZE]; /* indirectJ3[j] == k means the - j-th nonzero segment points - to column k in this supernode */ - /* calculate usub_indirect */ - for (int jj = jj_cpu; jj < nub; ++jj) - { - int_t iukp = Ublock_info[jj].iukp; - int_t jb = Ublock_info[jj].jb; - int_t nsupc = SuperSize (jb); - int_t addr = (jj == 0) ? 0 - : Ublock_info[jj - 1].full_u_cols - col_back; - - for (int_t kk = 0; kk < MAX_SUPER_SIZE; ++kk) - { - indirectJ1[kk] = 0; - } - - for (int_t kk = 0; kk < nsupc; ++kk) - { - indirectJ1[kk] = ((klst - usub[iukp + kk]) == 0) ? 
0 : 1; - } - - /*prefix sum - indicates # of nonzero segments up to column kk */ - indirectJ2[0] = indirectJ1[0]; - for (int_t kk = 1; kk < MAX_SUPER_SIZE; ++kk) - { - indirectJ2[kk] = indirectJ2[kk - 1] + indirectJ1[kk]; - } - - /* total number of nonzero segments in this supernode */ - int nnz_col = indirectJ2[MAX_SUPER_SIZE - 1]; - - /* compactation */ - for (int_t kk = 0; kk < MAX_SUPER_SIZE; ++kk) - { - if (indirectJ1[kk]) /* kk is a nonzero segment */ - { - /* indirectJ3[j] == kk means the j-th nonzero segment - points to column kk in this supernode */ - indirectJ3[indirectJ2[kk] - 1] = kk; - } - } - - for (int i = 0; i < nnz_col; ++i) - { - /* addr == total # of full columns before current block jj */ - A_gpu->scubufs[streamId].usub_IndirectJ3_host[addr + i] = indirectJ3[i]; - } - } /* end for jj ... calculate usub_indirect */ - - //printf("SchurCompUpdate_GPU[3]: jj_cpu %d, nub %d\n", jj_cpu, nub); fflush(stdout); - - /*sizeof RemainLbuf = Rnbuf*knsupc */ - double tTmp = SuperLU_timer_(); - cudaEventRecord(A_gpu->ePCIeH2D[k0], FunCallStream); - - checkCuda(cudaMemcpyAsync(A_gpu->scubufs[streamId].usub_IndirectJ3, - A_gpu->scubufs[streamId].usub_IndirectJ3_host, - ncols * sizeof(int_t), cudaMemcpyHostToDevice, - FunCallStream)) ; - - checkCuda(cudaMemcpyAsync(A_gpu->scubufs[streamId].Remain_L_buff, Remain_L_buff, - Remain_lbuf_send_size * sizeof(double), - cudaMemcpyHostToDevice, FunCallStream)) ; - - checkCuda(cudaMemcpyAsync(A_gpu->scubufs[streamId].bigU, bigU, - bigu_send_size * sizeof(double), - cudaMemcpyHostToDevice, FunCallStream) ); - - checkCuda(cudaMemcpyAsync(A_gpu->scubufs[streamId].Remain_info, Remain_info, - RemainBlk * sizeof(Remain_info_t), - cudaMemcpyHostToDevice, FunCallStream) ); - - checkCuda(cudaMemcpyAsync(A_gpu->scubufs[streamId].Ublock_info, Ublock_info, - mcb * sizeof(Ublock_info_t), cudaMemcpyHostToDevice, - FunCallStream) ); - - checkCuda(cudaMemcpyAsync(A_gpu->scubufs[streamId].lsub, lsub, - lsub_len * sizeof(int_t), cudaMemcpyHostToDevice, - FunCallStream) ); - - checkCuda(cudaMemcpyAsync(A_gpu->scubufs[streamId].usub, usub, - usub_len * sizeof(int_t), cudaMemcpyHostToDevice, - FunCallStream) ); - - A_gpu->tHost_PCIeH2D += SuperLU_timer_() - tTmp; - A_gpu->cPCIeH2D += Remain_lbuf_send_size * sizeof(double) - + bigu_send_size * sizeof(double) - + RemainBlk * sizeof(Remain_info_t) - + mcb * sizeof(Ublock_info_t) - + lsub_len * sizeof(int_t) - + usub_len * sizeof(int_t); - - double alpha = 1.0, beta = 0.0; - - int_t ii_st = 0; - int_t ii_end = 0; - int_t maxGemmBlockDim = (int) sqrt(buffer_size); - // int_t maxGemmBlockDim = 8000; - - /* Organize GEMM by blocks of [ii_st : ii_end, jj_st : jj_end] that - fits in the buffer_size */ - while (ii_end < RemainBlk) - { - ii_st = ii_end; - ii_end = RemainBlk; - int_t nrow_max = maxGemmBlockDim; -// nrow_max = Rnbrow; - int_t remaining_rows = (ii_st == 0) ? Rnbrow : Rnbrow - Remain_info[ii_st - 1].FullRow; - nrow_max = (remaining_rows / nrow_max) > 0 ? remaining_rows / CEILING(remaining_rows, nrow_max) : nrow_max; - - int_t ResRow = (ii_st == 0) ? 
0 : Remain_info[ii_st - 1].FullRow; - for (int_t i = ii_st; i < RemainBlk - 1; ++i) - { - if ( Remain_info[i + 1].FullRow > ResRow + nrow_max) - { - ii_end = i; - break; /* row dimension reaches nrow_max */ - } - } - - int_t nrows; /* actual row dimension for GEMM */ - int_t st_row; - if (ii_st > 0) - { - nrows = Remain_info[ii_end - 1].FullRow - Remain_info[ii_st - 1].FullRow; - st_row = Remain_info[ii_st - 1].FullRow; - } - else - { - nrows = Remain_info[ii_end - 1].FullRow; - st_row = 0; - } - - int_t jj_st = jj_cpu; - int_t jj_end = jj_cpu; - - while (jj_end < nub && nrows > 0 ) - { - int_t remaining_cols = (jj_st == jj_cpu) ? ncols : ncols - Ublock_info[jj_st - 1].full_u_cols; - if ( remaining_cols * nrows < buffer_size) - { - jj_st = jj_end; - jj_end = nub; - } - else /* C matrix cannot fit in buffer, need to break into pieces */ - { - int_t ncol_max = buffer_size / nrows; - /** Must revisit **/ - ncol_max = SUPERLU_MIN(ncol_max, maxGemmBlockDim); - ncol_max = (remaining_cols / ncol_max) > 0 ? - remaining_cols / CEILING(remaining_cols, ncol_max) - : ncol_max; - - jj_st = jj_end; - jj_end = nub; - - int_t ResCol = (jj_st == 0) ? 0 : Ublock_info[jj_st - 1].full_u_cols; - for (int_t j = jj_st; j < nub - 1; ++j) - { - if (Ublock_info[j + 1].full_u_cols > ResCol + ncol_max) - { - jj_end = j; - break; - } - } - } /* end-if-else */ - - int_t ncols; - int_t st_col; - if (jj_st > 0) - { - ncols = Ublock_info[jj_end - 1].full_u_cols - Ublock_info[jj_st - 1].full_u_cols; - st_col = Ublock_info[jj_st - 1].full_u_cols; - if (ncols == 0) exit(0); - } - else - { - ncols = Ublock_info[jj_end - 1].full_u_cols; - st_col = 0; - } - - /* none of the matrix dimension is zero. */ - if (nrows > 0 && ldu > 0 && ncols > 0) - { - if (nrows * ncols > buffer_size) - { - printf("!! Matrix size %lld x %lld exceeds buffer_size \n", - nrows, ncols, buffer_size); - fflush(stdout); - } - assert(nrows * ncols <= buffer_size); - cublasSetStream(cublas_handle0, FunCallStream); - cudaEventRecord(A_gpu->GemmStart[k0], FunCallStream); - cublasDgemm(cublas_handle0, CUBLAS_OP_N, CUBLAS_OP_N, - nrows, ncols, ldu, &alpha, - &A_gpu->scubufs[streamId].Remain_L_buff[(knsupc - ldu) * Rnbrow + st_row], Rnbrow, - &A_gpu->scubufs[streamId].bigU[st_col * ldu], ldu, - &beta, A_gpu->scubufs[streamId].bigV, nrows); - -// #define SCATTER_OPT -#ifdef SCATTER_OPT - cudaStreamSynchronize(FunCallStream); -#warning this function is synchrnous -#endif - cudaEventRecord(A_gpu->GemmEnd[k0], FunCallStream); - - A_gpu->GemmFLOPCounter += 2.0 * (double) nrows * ncols * ldu ; - - /* - * Scattering the output - */ - dim3 dimBlock(THREAD_BLOCK_SIZE); // 1d thread - - dim3 dimGrid(ii_end - ii_st, jj_end - jj_st); - - Scatter_GPU_kernel <<< dimGrid, dimBlock, 0, FunCallStream>>> - (streamId, ii_st, ii_end, jj_st, jj_end, klst, - 0, nrows, ldt, npcol, nprow, dA_gpu); -#ifdef SCATTER_OPT - cudaStreamSynchronize(FunCallStream); -#warning this function is synchrnous -#endif - - cudaEventRecord(A_gpu->ScatterEnd[k0], FunCallStream); - - A_gpu->ScatterMOPCounter += 3.0 * (double) nrows * ncols; - } /* endif ... none of the matrix dimension is zero. 
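
The pair of while-loops above tiles the Schur-complement GEMM over [ii_st : ii_end, jj_st : jj_end] so that each C block fits in buffer_size. One detail worth isolating is the evening rule remaining / CEILING(remaining, max): rather than cutting full-size tiles and leaving a small runt at the end, it divides the remaining rows (or columns) into nearly equal tiles. A self-contained illustration with made-up numbers:

#include <stdio.h>

#define CEILING(a, b) (((a) + (b) - 1) / (b))

int main(void)
{
    int remaining = 1000, max_tile = 300;
    int ntiles = CEILING(remaining, max_tile); /* 4 */
    int tile = remaining / ntiles;             /* 250: four even tiles,
                                                  not 300+300+300+100 */
    printf("%d tiles of about %d rows each\n", ntiles, tile);
    return 0;
}
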
*/ - - } /* end while jj_end < nub */ - - } /* end while (ii_end < RemainBlk) */ - - return 0; -} /* end SchurCompUpdate_GPU */ - - -void print_occupany() -{ - int blockSize; // The launch configurator returned block size - int minGridSize; /* The minimum grid size needed to achieve the - best potential occupancy */ - - cudaOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize, - Scatter_GPU_kernel, 0, 0); -#if (PRNTlevel>=1) - printf("Occupancy: MinGridSize %d blocksize %d \n", minGridSize, blockSize); -#endif -} - -void printDevProp(cudaDeviceProp devProp) -{ - size_t mfree, mtotal; - cudaMemGetInfo (&mfree, &mtotal); - - printf("pciBusID: %d\n", devProp.pciBusID); - printf("pciDeviceID: %d\n", devProp.pciDeviceID); - printf("GPU Name: %s\n", devProp.name); - printf("Total global memory: %zu\n", devProp.totalGlobalMem); - printf("Total free memory: %zu\n", mfree); - printf("Clock rate: %d\n", devProp.clockRate); - - return; -} - - -int -get_mpi_process_per_gpu () -{ - - - char *ttemp; - ttemp = getenv ("MPI_PROCESS_PER_GPU"); - - if (ttemp) - return atol (ttemp); - else - { - printf("MPI_PROCESS_PER_GPU is not set; Using default 1 \n"); - return 1; - } -} - -size_t -get_acc_memory () -{ - - size_t mfree, mtotal; - cudaMemGetInfo (&mfree, &mtotal); -#if 0 - printf("Total memory %zu & free memory %zu\n", mtotal, mfree); -#endif - return (size_t) (0.9 * (double) mfree) / get_mpi_process_per_gpu (); - - -} - - -int_t free_LUstruct_gpu (LUstruct_gpu * A_gpu) -{ - checkCuda(cudaFree(A_gpu->LrowindVec)); - checkCuda(cudaFree(A_gpu->LrowindPtr)); - - checkCuda(cudaFree(A_gpu->LnzvalVec)); - checkCuda(cudaFree(A_gpu->LnzvalPtr)); - free(A_gpu->LnzvalPtr_host); - /*freeing the pinned memory*/ - int_t streamId = 0; - checkCuda (cudaFreeHost (A_gpu->scubufs[streamId].Remain_info_host)); - checkCuda (cudaFreeHost (A_gpu->scubufs[streamId].Ublock_info_host)); - checkCuda (cudaFreeHost (A_gpu->scubufs[streamId].Remain_L_buff_host)); - checkCuda (cudaFreeHost (A_gpu->scubufs[streamId].bigU_host)); - - checkCuda(cudaFreeHost(A_gpu->acc_L_buff)); - checkCuda(cudaFreeHost(A_gpu->acc_U_buff)); - checkCuda(cudaFreeHost(A_gpu->scubufs[streamId].lsub_buf)); - checkCuda(cudaFreeHost(A_gpu->scubufs[streamId].usub_buf)); - - - free(A_gpu->isOffloaded); - free(A_gpu->GemmStart); - free(A_gpu->GemmEnd); - free(A_gpu->ScatterEnd); - free(A_gpu->ePCIeH2D); - - free(A_gpu->ePCIeD2H_Start); - free(A_gpu->ePCIeD2H_End); - - checkCuda(cudaFree(A_gpu->UrowindVec)); - checkCuda(cudaFree(A_gpu->UrowindPtr)); - - free(A_gpu->UrowindPtr_host); - - checkCuda(cudaFree(A_gpu->UnzvalVec)); - checkCuda(cudaFree(A_gpu->UnzvalPtr)); - - checkCuda(cudaFree(A_gpu->grid)); - - - - checkCuda(cudaFree(A_gpu->scubufs[streamId].bigV)); - checkCuda(cudaFree(A_gpu->scubufs[streamId].bigU)); - - checkCuda(cudaFree(A_gpu->scubufs[streamId].Remain_L_buff)); - checkCuda(cudaFree(A_gpu->scubufs[streamId].Ublock_info)); - checkCuda(cudaFree(A_gpu->scubufs[streamId].Remain_info)); - - // checkCuda(cudaFree(A_gpu->indirect)); - // checkCuda(cudaFree(A_gpu->indirect2)); - checkCuda(cudaFree(A_gpu->xsup)); - - checkCuda(cudaFree(A_gpu->scubufs[streamId].lsub)); - checkCuda(cudaFree(A_gpu->scubufs[streamId].usub)); - - - checkCuda(cudaFree(A_gpu->local_l_blk_infoVec)); - checkCuda(cudaFree(A_gpu->local_l_blk_infoPtr)); - checkCuda(cudaFree(A_gpu->jib_lookupVec)); - checkCuda(cudaFree(A_gpu->jib_lookupPtr)); - checkCuda(cudaFree(A_gpu->local_u_blk_infoVec)); - checkCuda(cudaFree(A_gpu->local_u_blk_infoPtr)); - 
checkCuda(cudaFree(A_gpu->ijb_lookupVec)); - checkCuda(cudaFree(A_gpu->ijb_lookupPtr)); - - return 0; - -} - - - -void dPrint_matrix( char *desc, int_t m, int_t n, double * dA, int_t lda ) -{ - double *cPtr = (double *) malloc(sizeof(double) * lda * n); - checkCuda(cudaMemcpy( cPtr, dA, - lda * n * sizeof(double), cudaMemcpyDeviceToHost)) ; - - int_t i, j; - printf( "\n %s\n", desc ); - for ( i = 0; i < m; i++ ) - { - for ( j = 0; j < n; j++ ) printf( " %.3e", cPtr[i + j * lda] ); - printf( "\n" ); - } - free(cPtr); -} - -void printGPUStats(LUstruct_gpu * A_gpu) -{ - double tGemm = 0; - double tScatter = 0; - double tPCIeH2D = 0; - double tPCIeD2H = 0; - - for (int_t i = 0; i < A_gpu->nsupers; ++i) - { - float milliseconds = 0; - - if (A_gpu->isOffloaded[i]) - { - cudaEventElapsedTime(&milliseconds, A_gpu->ePCIeH2D[i], A_gpu->GemmStart[i]); - tPCIeH2D += 1e-3 * (double) milliseconds; - milliseconds = 0; - cudaEventElapsedTime(&milliseconds, A_gpu->GemmStart[i], A_gpu->GemmEnd[i]); - tGemm += 1e-3 * (double) milliseconds; - milliseconds = 0; - cudaEventElapsedTime(&milliseconds, A_gpu->GemmEnd[i], A_gpu->ScatterEnd[i]); - tScatter += 1e-3 * (double) milliseconds; - } - - milliseconds = 0; - cudaEventElapsedTime(&milliseconds, A_gpu->ePCIeD2H_Start[i], A_gpu->ePCIeD2H_End[i]); - tPCIeD2H += 1e-3 * (double) milliseconds; - } - - printf("GPU: Flops offloaded %.3e Time spent %lf Flop rate %lf GF/sec \n", - A_gpu->GemmFLOPCounter, tGemm, 1e-9 * A_gpu->GemmFLOPCounter / tGemm ); - printf("GPU: Mop offloaded %.3e Time spent %lf Bandwidth %lf GByte/sec \n", - A_gpu->ScatterMOPCounter, tScatter, 8e-9 * A_gpu->ScatterMOPCounter / tScatter ); - printf("PCIe Data Transfer H2D:\n\tData Sent %.3e(GB)\n\tTime observed from CPU %lf\n\tActual time spent %lf\n\tBandwidth %lf GByte/sec \n", - 1e-9 * A_gpu->cPCIeH2D, A_gpu->tHost_PCIeH2D, tPCIeH2D, 1e-9 * A_gpu->cPCIeH2D / tPCIeH2D ); - printf("PCIe Data Transfer D2H:\n\tData Sent %.3e(GB)\n\tTime observed from CPU %lf\n\tActual time spent %lf\n\tBandwidth %lf GByte/sec \n", - 1e-9 * A_gpu->cPCIeD2H, A_gpu->tHost_PCIeD2H, tPCIeD2H, 1e-9 * A_gpu->cPCIeD2H / tPCIeD2H ); - fflush(stdout); - -} /* end printGPUStats */ - - -int_t initSluGPU3D_t( - sluGPU_t *sluGPU, - LUstruct_t *LUstruct, - gridinfo3d_t * grid3d, - int_t* perm_c_supno, - int_t n, - int_t buffer_size, /* read from env variable MAX_BUFFER_SIZE */ - int_t bigu_size, - int_t ldt /* NSUP read from sp_ienv(3) */ -) -{ - gridinfo_t* grid = &(grid3d->grid2d); - checkCudaErrors(cudaDeviceReset ()) ; - Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - LocalLU_t *Llu = LUstruct->Llu; - int_t* isNodeInMyGrid = sluGPU->isNodeInMyGrid; - - sluGPU->nCudaStreams = getnCudaStreams(); - if (!grid->iam) - { - printf("initSluGPU3D_t: Using hardware acceleration, with %d cuda streams \n", sluGPU->nCudaStreams); - fflush(stdout); - if ( MAX_SUPER_SIZE < ldt ) - { - ABORT("MAX_SUPER_SIZE smaller than requested NSUP"); - } - } - - cudaStreamCreate(&(sluGPU->CopyStream)); - - for (int_t streamId = 0; streamId < sluGPU->nCudaStreams; streamId++) - { - cudaStreamCreate(&(sluGPU->funCallStreams[streamId])); - cublasCreate(&(sluGPU->cublasHandles[streamId])); - sluGPU->lastOffloadStream[streamId] = -1; - } - - sluGPU->A_gpu = (LUstruct_gpu *) malloc (sizeof(LUstruct_gpu)); - sluGPU->A_gpu->perm_c_supno = perm_c_supno; - CopyLUToGPU3D ( - isNodeInMyGrid, - Llu, /* referred to as A_host */ - sluGPU, - Glu_persist, n, - grid3d, - buffer_size, - bigu_size, - ldt - ); - - return 0; -} /* end initSluGPU3D_t */ - -int_t 
initD2Hreduce( - int_t next_k, - d2Hreduce_t* d2Hred, - int_t last_flag, - HyP_t* HyP, - sluGPU_t *sluGPU, - gridinfo_t *grid, - LUstruct_t *LUstruct - , SCT_t* SCT -) -{ - Glu_persist_t *Glu_persist = LUstruct->Glu_persist; - LocalLU_t *Llu = LUstruct->Llu; - int_t* xsup = Glu_persist->xsup; - int_t iam = grid->iam; - int_t myrow = MYROW (iam, grid); - int_t mycol = MYCOL (iam, grid); - int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; - int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; - - - // int_t next_col = SUPERLU_MIN (k0 + num_look_aheads + 1, nsupers - 1); - // int_t next_k = perm_c_supno[next_col]; /* global block number for next colum*/ - int_t mkcol, mkrow; - - int_t kljb = LBj( next_k, grid ); /*local block number for next block*/ - int_t kijb = LBi( next_k, grid ); /*local block number for next block*/ - - int_t *kindexL ; /*for storing index vectors*/ - int_t *kindexU ; - mkrow = PROW (next_k, grid); - mkcol = PCOL (next_k, grid); - int_t ksup_size = SuperSize(next_k); - - int_t copyL_kljb = 0; - int_t copyU_kljb = 0; - int_t l_copy_len = 0; - int_t u_copy_len = 0; - - if (mkcol == mycol && Lrowind_bc_ptr[kljb] != NULL && last_flag) - { - if (HyP->Lblock_dirty_bit[kljb] > -1) - { - copyL_kljb = 1; - int_t lastk0 = HyP->Lblock_dirty_bit[kljb]; - int_t streamIdk0Offload = lastk0 % sluGPU->nCudaStreams; - if (sluGPU->lastOffloadStream[streamIdk0Offload] == lastk0 && lastk0 != -1) - { - // printf("Waiting for Offload =%d to finish StreamId=%d\n", lastk0, streamIdk0Offload); - double ttx = SuperLU_timer_(); - cudaStreamSynchronize(sluGPU->funCallStreams[streamIdk0Offload]); - SCT->PhiWaitTimer += SuperLU_timer_() - ttx; - sluGPU->lastOffloadStream[streamIdk0Offload] = -1; - } - } - - kindexL = Lrowind_bc_ptr[kljb]; - l_copy_len = kindexL[1] * ksup_size; - } - - if ( mkrow == myrow && Ufstnz_br_ptr[kijb] != NULL && last_flag ) - { - if (HyP->Ublock_dirty_bit[kijb] > -1) - { - copyU_kljb = 1; - int_t lastk0 = HyP->Ublock_dirty_bit[kijb]; - int_t streamIdk0Offload = lastk0 % sluGPU->nCudaStreams; - if (sluGPU->lastOffloadStream[streamIdk0Offload] == lastk0 && lastk0 != -1) - { - // printf("Waiting for Offload =%d to finish StreamId=%d\n", lastk0, streamIdk0Offload); - double ttx = SuperLU_timer_(); - cudaStreamSynchronize(sluGPU->funCallStreams[streamIdk0Offload]); - SCT->PhiWaitTimer += SuperLU_timer_() - ttx; - sluGPU->lastOffloadStream[streamIdk0Offload] = -1; - } - - } - // copyU_kljb = HyP->Ublock_dirty_bit[kijb]>-1? 
1: 0; - kindexU = Ufstnz_br_ptr[kijb]; - u_copy_len = kindexU[1]; - } - - // wait for streams if they have not been finished - - // d2Hred->next_col = next_col; - d2Hred->next_k = next_k; - d2Hred->kljb = kljb; - d2Hred->kijb = kijb; - d2Hred->copyL_kljb = copyL_kljb; - d2Hred->copyU_kljb = copyU_kljb; - d2Hred->l_copy_len = l_copy_len; - d2Hred->u_copy_len = u_copy_len; - d2Hred->kindexU = kindexU; - d2Hred->kindexL = kindexL; - d2Hred->mkrow = mkrow; - d2Hred->mkcol = mkcol; - d2Hred->ksup_size = ksup_size; - return 0; -} - -int_t reduceGPUlu( - - int_t last_flag, - d2Hreduce_t* d2Hred, - sluGPU_t *sluGPU, - SCT_t *SCT, - gridinfo_t *grid, - LUstruct_t *LUstruct -) -{ - - LocalLU_t *Llu = LUstruct->Llu; - int_t iam = grid->iam; - int_t myrow = MYROW (iam, grid); - int_t mycol = MYCOL (iam, grid); - int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; - double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; - int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; - double** Unzval_br_ptr = Llu->Unzval_br_ptr; - - cudaStream_t CopyStream; - LUstruct_gpu *A_gpu; - A_gpu = sluGPU->A_gpu; - CopyStream = sluGPU->CopyStream; - - int_t kljb = d2Hred->kljb; - int_t kijb = d2Hred->kijb; - int_t copyL_kljb = d2Hred->copyL_kljb; - int_t copyU_kljb = d2Hred->copyU_kljb; - int_t mkrow = d2Hred->mkrow; - int_t mkcol = d2Hred->mkcol; - int_t ksup_size = d2Hred->ksup_size; - int_t *kindex; - if ((copyL_kljb || copyU_kljb) && last_flag ) - { - double ttx = SuperLU_timer_(); - cudaStreamSynchronize(CopyStream); - SCT->PhiWaitTimer_2 += SuperLU_timer_() - ttx; - } - - - double tt_start = SuperLU_timer_(); - - - if (last_flag) - { - - if (mkcol == mycol && Lrowind_bc_ptr[kljb] != NULL ) - { - - kindex = Lrowind_bc_ptr[kljb]; - int_t len = kindex[1]; - - if (copyL_kljb) - { - - double *nzval_host; - nzval_host = Lnzval_bc_ptr[kljb]; - int_t llen = ksup_size * len; - - double alpha = 1; - superlu_daxpy (llen, alpha, A_gpu->acc_L_buff, 1, nzval_host, 1); - } - - } - } - if (last_flag) - { - if (mkrow == myrow && Ufstnz_br_ptr[kijb] != NULL ) - { - - kindex = Ufstnz_br_ptr[kijb]; - int_t len = kindex[1]; - - if (copyU_kljb) - { - - double *nzval_host; - nzval_host = Unzval_br_ptr[kijb]; - - double alpha = 1; - superlu_daxpy (len, alpha, A_gpu->acc_U_buff, 1, nzval_host, 1); - } - - } - } - - double tt_end = SuperLU_timer_(); - SCT->AssemblyTimer += tt_end - tt_start; - return 0; -} - - -int_t waitGPUscu(int_t streamId, sluGPU_t *sluGPU, SCT_t *SCT) -{ - double ttx = SuperLU_timer_(); - cudaStreamSynchronize(sluGPU->funCallStreams[streamId]); - SCT->PhiWaitTimer += SuperLU_timer_() - ttx; - return 0; -} - -int_t sendLUpanelGPU2HOST( - int_t k0, - d2Hreduce_t* d2Hred, - sluGPU_t *sluGPU -) -{ - - int_t kljb = d2Hred->kljb; - int_t kijb = d2Hred->kijb; - int_t copyL_kljb = d2Hred->copyL_kljb; - int_t copyU_kljb = d2Hred->copyU_kljb; - int_t l_copy_len = d2Hred->l_copy_len; - int_t u_copy_len = d2Hred->u_copy_len; - cudaStream_t CopyStream = sluGPU->CopyStream;; - LUstruct_gpu *A_gpu = sluGPU->A_gpu; - double tty = SuperLU_timer_(); - cudaEventRecord(A_gpu->ePCIeD2H_Start[k0], CopyStream); - if (copyL_kljb) - checkCuda(cudaMemcpyAsync(A_gpu->acc_L_buff, &A_gpu->LnzvalVec[A_gpu->LnzvalPtr_host[kljb]], - l_copy_len * sizeof(double), cudaMemcpyDeviceToHost, CopyStream ) ); - - if (copyU_kljb) - checkCuda(cudaMemcpyAsync(A_gpu->acc_U_buff, &A_gpu->UnzvalVec[A_gpu->UnzvalPtr_host[kijb]], - u_copy_len * sizeof(double), cudaMemcpyDeviceToHost, CopyStream ) ); - cudaEventRecord(A_gpu->ePCIeD2H_End[k0], CopyStream); - A_gpu->tHost_PCIeD2H += 
SuperLU_timer_() - tty; - A_gpu->cPCIeD2H += u_copy_len * sizeof(double) + l_copy_len * sizeof(double); - - return 0; -} - -/* Copy L and U panel data structures from host to the host part of the - data structures in A_gpu. - GPU is not involved in this routine. */ -int_t sendSCUdataHost2GPU( - int_t streamId, - int_t* lsub, - int_t* usub, - double* bigU, - int_t bigu_send_size, - int_t Remain_lbuf_send_size, - sluGPU_t *sluGPU, - HyP_t* HyP -) -{ - //{printf("....[enter] sendSCUdataHost2GPU, bigu_send_size %d\n", bigu_send_size); fflush(stdout);} - - int_t usub_len = usub[2]; - int_t lsub_len = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR; - //{printf("....[2] in sendSCUdataHost2GPU, lsub_len %d\n", lsub_len); fflush(stdout);} - LUstruct_gpu *A_gpu = sluGPU->A_gpu; - memcpy(A_gpu->scubufs[streamId].lsub_buf, lsub, sizeof(int_t)*lsub_len); - memcpy(A_gpu->scubufs[streamId].usub_buf, usub, sizeof(int_t)*usub_len); - memcpy(A_gpu->scubufs[streamId].Remain_info_host, HyP->Remain_info, - sizeof(Remain_info_t)*HyP->RemainBlk); - memcpy(A_gpu->scubufs[streamId].Ublock_info_host, HyP->Ublock_info_Phi, - sizeof(Ublock_info_t)*HyP->num_u_blks_Phi); - memcpy(A_gpu->scubufs[streamId].Remain_L_buff_host, HyP->Remain_L_buff, - sizeof(double)*Remain_lbuf_send_size); - memcpy(A_gpu->scubufs[streamId].bigU_host, bigU, - sizeof(double)*bigu_send_size); - - return 0; -} - - -int_t freeSluGPU(sluGPU_t *sluGPU) -{ - return 0; -} - - -void CopyLUToGPU3D ( - int_t* isNodeInMyGrid, - LocalLU_t *A_host, /* distributed LU structure on host */ - sluGPU_t *sluGPU, - Glu_persist_t *Glu_persist, int_t n, - gridinfo3d_t *grid3d, - int_t buffer_size, /* bigV size on GPU for Schur complement update */ - int_t bigu_size, - int_t ldt -) -{ - gridinfo_t* grid = &(grid3d->grid2d); - LUstruct_gpu * A_gpu = sluGPU->A_gpu; - LUstruct_gpu **dA_gpu = &(sluGPU->dA_gpu); - -#ifdef GPU_DEBUG - // if ( grid3d->iam == 0 ) - { - print_occupany(); - cudaDeviceProp devProp; - cudaGetDeviceProperties(&devProp, 0); - printDevProp(devProp); - } -#endif - int_t *xsup ; - xsup = Glu_persist->xsup; - int_t iam = grid->iam; - int_t nsupers = Glu_persist->supno[n - 1] + 1; - int_t Pc = grid->npcol; - int_t Pr = grid->nprow; - int_t myrow = MYROW (iam, grid); - int_t mycol = MYCOL (iam, grid); - int_t mrb = (nsupers + Pr - 1) / Pr; - int_t mcb = (nsupers + Pc - 1) / Pc; - int_t remain_l_max = A_host->bufmax[1]; - - /*copies of scalars for easy access*/ - A_gpu->nsupers = nsupers; - A_gpu->ScatterMOPCounter = 0; - A_gpu->GemmFLOPCounter = 0; - A_gpu->cPCIeH2D = 0; - A_gpu->cPCIeD2H = 0; - A_gpu->tHost_PCIeH2D = 0; - A_gpu->tHost_PCIeD2H = 0; - - /*initializing memory*/ - size_t max_gpu_memory = get_acc_memory (); - size_t gpu_mem_used = 0; - - void *tmp_ptr; - - A_gpu->xsup_host = xsup; - - int_t nCudaStreams = sluGPU->nCudaStreams; - /*pinned memory allocations. 
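
The host staging buffers below go through cudaMallocHost rather than malloc for a reason the comment only hints at: cudaMemcpyAsync overlaps with kernels and other streams only when the host buffer is page-locked; from pageable memory the runtime stages the transfer internally and it behaves synchronously. A minimal, self-contained sketch of the pattern (the function name and sizes are illustrative):

#include <cuda_runtime.h>

/* Stage n doubles to the device asynchronously; illustrative only. */
int stage_pinned_sketch(cudaStream_t stream, size_t n)
{
    double *h_buf = NULL, *d_buf = NULL;
    if (cudaMallocHost((void **)&h_buf, n * sizeof(double)) != cudaSuccess)
        return -1;
    if (cudaMalloc((void **)&d_buf, n * sizeof(double)) != cudaSuccess) {
        cudaFreeHost(h_buf);
        return -1;
    }
    for (size_t i = 0; i < n; ++i) h_buf[i] = (double)i;  /* fill */

    /* Returns immediately; the copy proceeds on 'stream' while the
     * host and other streams keep working. */
    cudaMemcpyAsync(d_buf, h_buf, n * sizeof(double),
                    cudaMemcpyHostToDevice, stream);

    cudaStreamSynchronize(stream); /* must complete before h_buf is reused */
    cudaFree(d_buf);
    cudaFreeHost(h_buf);
    return 0;
}
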
- Paged-locked memory by cudaMallocHost is accessible to the device.*/ - for (int_t streamId = 0; streamId < nCudaStreams; streamId++ ) - { - void *tmp_ptr; - checkCudaErrors(cudaMallocHost( &tmp_ptr, (n) * sizeof(int_t) )) ; - A_gpu->scubufs[streamId].usub_IndirectJ3_host = (int_t*) tmp_ptr; - - checkCudaErrors(cudaMalloc( &tmp_ptr, ( n) * sizeof(int_t) )); - A_gpu->scubufs[streamId].usub_IndirectJ3 = (int_t*) tmp_ptr; - gpu_mem_used += ( n) * sizeof(int_t); - checkCudaErrors(cudaMallocHost( &tmp_ptr, mrb * sizeof(Remain_info_t) )) ; - A_gpu->scubufs[streamId].Remain_info_host = (Remain_info_t*)tmp_ptr; - checkCudaErrors(cudaMallocHost( &tmp_ptr, mcb * sizeof(Ublock_info_t) )) ; - A_gpu->scubufs[streamId].Ublock_info_host = (Ublock_info_t*)tmp_ptr; - checkCudaErrors(cudaMallocHost( &tmp_ptr, remain_l_max * sizeof(double) )) ; - A_gpu->scubufs[streamId].Remain_L_buff_host = (double *) tmp_ptr; - checkCudaErrors(cudaMallocHost( &tmp_ptr, bigu_size * sizeof(double) )) ; - A_gpu->scubufs[streamId].bigU_host = (double *) tmp_ptr; - - checkCudaErrors(cudaMallocHost ( &tmp_ptr, sizeof(double) * (A_host->bufmax[1]))); - A_gpu->acc_L_buff = (double *) tmp_ptr; - checkCudaErrors(cudaMallocHost ( &tmp_ptr, sizeof(double) * (A_host->bufmax[3]))); - A_gpu->acc_U_buff = (double *) tmp_ptr; - checkCudaErrors(cudaMallocHost ( &tmp_ptr, sizeof(int_t) * (A_host->bufmax[0]))); - A_gpu->scubufs[streamId].lsub_buf = (int_t *) tmp_ptr; - checkCudaErrors(cudaMallocHost ( &tmp_ptr, sizeof(int_t) * (A_host->bufmax[2]))); - A_gpu->scubufs[streamId].usub_buf = (int_t *) tmp_ptr; - - checkCudaErrors(cudaMalloc( &tmp_ptr, remain_l_max * sizeof(double) )) ; - A_gpu->scubufs[streamId].Remain_L_buff = (double *) tmp_ptr; - gpu_mem_used += remain_l_max * sizeof(double); - checkCudaErrors(cudaMalloc( &tmp_ptr, bigu_size * sizeof(double) )) ; - A_gpu->scubufs[streamId].bigU = (double *) tmp_ptr; - gpu_mem_used += bigu_size * sizeof(double); - checkCudaErrors(cudaMalloc( &tmp_ptr, mcb * sizeof(Ublock_info_t) )) ; - A_gpu->scubufs[streamId].Ublock_info = (Ublock_info_t *) tmp_ptr; - gpu_mem_used += mcb * sizeof(Ublock_info_t); - checkCudaErrors(cudaMalloc( &tmp_ptr, mrb * sizeof(Remain_info_t) )) ; - A_gpu->scubufs[streamId].Remain_info = (Remain_info_t *) tmp_ptr; - gpu_mem_used += mrb * sizeof(Remain_info_t); - checkCudaErrors(cudaMalloc( &tmp_ptr, buffer_size * sizeof(double))) ; - A_gpu->scubufs[streamId].bigV = (double *) tmp_ptr; - gpu_mem_used += buffer_size * sizeof(double); - checkCudaErrors(cudaMalloc( &tmp_ptr, A_host->bufmax[0]*sizeof(int_t))) ; - A_gpu->scubufs[streamId].lsub = (int_t *) tmp_ptr; - gpu_mem_used += A_host->bufmax[0] * sizeof(int_t); - checkCudaErrors(cudaMalloc( &tmp_ptr, A_host->bufmax[2]*sizeof(int_t))) ; - A_gpu->scubufs[streamId].usub = (int_t *) tmp_ptr; - gpu_mem_used += A_host->bufmax[2] * sizeof(int_t); - - } /* endfor streamID ... 
allocate page-locked memory */ - - A_gpu->isOffloaded = (int_t *) malloc (sizeof(int_t) * nsupers); - A_gpu->GemmStart = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers); - A_gpu->GemmEnd = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers); - A_gpu->ScatterEnd = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers); - A_gpu->ePCIeH2D = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers); - A_gpu->ePCIeD2H_Start = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers); - A_gpu->ePCIeD2H_End = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers); - - for (int_t i = 0; i < nsupers; ++i) - { - A_gpu->isOffloaded[i] = 0; - checkCudaErrors(cudaEventCreate(&(A_gpu->GemmStart[i]))); - checkCudaErrors(cudaEventCreate(&(A_gpu->GemmEnd[i]))); - checkCudaErrors(cudaEventCreate(&(A_gpu->ScatterEnd[i]))); - checkCudaErrors(cudaEventCreate(&(A_gpu->ePCIeH2D[i]))); - checkCudaErrors(cudaEventCreate(&(A_gpu->ePCIeD2H_Start[i]))); - checkCudaErrors(cudaEventCreate(&(A_gpu->ePCIeD2H_End[i]))); - } - - /*---- Copy L data structure to GPU ----*/ - - /*pointers and address of local blocks for easy accessibility */ - local_l_blk_info_t *local_l_blk_infoVec; - int_t * local_l_blk_infoPtr; - local_l_blk_infoPtr = (int_t *) malloc( CEILING(nsupers, Pc) * sizeof(int_t ) ); - - /* First pass: count total L blocks */ - int_t cum_num_l_blocks = 0; /* total number of L blocks I own */ - for (int_t i = 0; i < CEILING(nsupers, Pc); ++i) - { - /* going through each block column I own */ - - if (A_host->Lrowind_bc_ptr[i] != NULL && isNodeInMyGrid[i * Pc + mycol] == 1) - { - int_t *index = A_host->Lrowind_bc_ptr[i]; - int_t num_l_blocks = index[0]; - cum_num_l_blocks += num_l_blocks; - } - } - - /*allocating memory*/ - local_l_blk_infoVec = (local_l_blk_info_t *) malloc(cum_num_l_blocks * sizeof(local_l_blk_info_t)); - - /* Second pass: set up the meta-data for the L structure */ - cum_num_l_blocks = 0; - - /*initializing vectors */ - for (int_t i = 0; i < CEILING(nsupers, Pc); ++i) - { - if (A_host->Lrowind_bc_ptr[i] != NULL && isNodeInMyGrid[i * Pc + mycol] == 1) - { - int_t *index = A_host->Lrowind_bc_ptr[i]; - int_t num_l_blocks = index[0]; /* # L blocks in this column */ - - if (num_l_blocks > 0) - { - - local_l_blk_info_t *local_l_blk_info_i = local_l_blk_infoVec + cum_num_l_blocks; - local_l_blk_infoPtr[i] = cum_num_l_blocks; - - int_t lptrj = BC_HEADER; - int_t luptrj = 0; - - for (int_t j = 0; j < num_l_blocks ; ++j) - { - - int_t ijb = index[lptrj]; - - local_l_blk_info_i[j].lib = ijb / Pr; - local_l_blk_info_i[j].lptrj = lptrj; - local_l_blk_info_i[j].luptrj = luptrj; - luptrj += index[lptrj + 1]; - lptrj += LB_DESCRIPTOR + index[lptrj + 1]; - - } - } - cum_num_l_blocks += num_l_blocks; - } - - } /* endfor all block columns */ - - - /* Allocate L memory on GPU, and copy the values from CPU to GPU */ - checkCudaErrors(cudaMalloc( &tmp_ptr, cum_num_l_blocks * sizeof(local_l_blk_info_t))) ; - A_gpu->local_l_blk_infoVec = (local_l_blk_info_t *) tmp_ptr; - gpu_mem_used += cum_num_l_blocks * sizeof(local_l_blk_info_t); - checkCudaErrors(cudaMemcpy( (A_gpu->local_l_blk_infoVec), local_l_blk_infoVec, cum_num_l_blocks * sizeof(local_l_blk_info_t), cudaMemcpyHostToDevice)) ; - - checkCudaErrors(cudaMalloc( &tmp_ptr, CEILING(nsupers, Pc)*sizeof(int_t))) ; - A_gpu->local_l_blk_infoPtr = (int_t *) tmp_ptr; - gpu_mem_used += CEILING(nsupers, Pc) * sizeof(int_t); - checkCudaErrors(cudaMemcpy( (A_gpu->local_l_blk_infoPtr), local_l_blk_infoPtr, CEILING(nsupers, 
Pc)*sizeof(int_t), cudaMemcpyHostToDevice)) ; - - - /*---- Copy U data structure to GPU ----*/ - - local_u_blk_info_t *local_u_blk_infoVec; - int_t * local_u_blk_infoPtr; - local_u_blk_infoPtr = (int_t *) malloc( CEILING(nsupers, Pr) * sizeof(int_t ) ); - - /* First pass: count total U blocks */ - int_t cum_num_u_blocks = 0; - - for (int_t i = 0; i < CEILING(nsupers, Pr); ++i) - { - - if (A_host->Ufstnz_br_ptr[i] != NULL && isNodeInMyGrid[i * Pr + myrow] == 1) - { - int_t *index = A_host->Ufstnz_br_ptr[i]; - int_t num_u_blocks = index[0]; - cum_num_u_blocks += num_u_blocks; - - } - - } - - local_u_blk_infoVec = (local_u_blk_info_t *) malloc(cum_num_u_blocks * sizeof(local_u_blk_info_t)); - - /* Second pass: set up the meta-data for the U structure */ - cum_num_u_blocks = 0; - - for (int_t i = 0; i < CEILING(nsupers, Pr); ++i) - { - if (A_host->Ufstnz_br_ptr[i] != NULL && isNodeInMyGrid[i * Pr + myrow] == 1) - { - int_t *index = A_host->Ufstnz_br_ptr[i]; - int_t num_u_blocks = index[0]; - - if (num_u_blocks > 0) - { - local_u_blk_info_t *local_u_blk_info_i = local_u_blk_infoVec + cum_num_u_blocks; - local_u_blk_infoPtr[i] = cum_num_u_blocks; - - int_t iuip_lib, ruip_lib; - iuip_lib = BR_HEADER; - ruip_lib = 0; - - for (int_t j = 0; j < num_u_blocks ; ++j) - { - - int_t ijb = index[iuip_lib]; - local_u_blk_info_i[j].ljb = ijb / Pc; - local_u_blk_info_i[j].iuip = iuip_lib; - local_u_blk_info_i[j].ruip = ruip_lib; - - ruip_lib += index[iuip_lib + 1]; - iuip_lib += UB_DESCRIPTOR + SuperSize (ijb); - - } - } - cum_num_u_blocks += num_u_blocks; - } - - } - - checkCudaErrors(cudaMalloc( &tmp_ptr, cum_num_u_blocks * sizeof(local_u_blk_info_t))) ; - A_gpu->local_u_blk_infoVec = (local_u_blk_info_t *) tmp_ptr; - gpu_mem_used += cum_num_u_blocks * sizeof(local_u_blk_info_t); - checkCudaErrors(cudaMemcpy( (A_gpu->local_u_blk_infoVec), local_u_blk_infoVec, cum_num_u_blocks * sizeof(local_u_blk_info_t), cudaMemcpyHostToDevice)) ; - - checkCudaErrors(cudaMalloc( &tmp_ptr, CEILING(nsupers, Pr)*sizeof(int_t))) ; - A_gpu->local_u_blk_infoPtr = (int_t *) tmp_ptr; - gpu_mem_used += CEILING(nsupers, Pr) * sizeof(int_t); - checkCudaErrors(cudaMemcpy( (A_gpu->local_u_blk_infoPtr), local_u_blk_infoPtr, CEILING(nsupers, Pr)*sizeof(int_t), cudaMemcpyHostToDevice)) ; - - - /* Copy the actual L indices and values */ - int_t l_k = CEILING( nsupers, grid->npcol ); /* # of local block columns */ - int_t *temp_LrowindPtr = (int_t *) malloc(sizeof(int_t) * l_k); - int_t *temp_LnzvalPtr = (int_t *) malloc(sizeof(int_t) * l_k); - int_t *Lnzval_size = (int_t *) malloc(sizeof(int_t) * l_k); - int_t l_ind_len = 0; - int_t l_val_len = 0; - for (int_t jb = 0; jb < nsupers; ++jb) /* for each block column ... */ - { - int_t pc = PCOL( jb, grid ); - if (mycol == pc && isNodeInMyGrid[jb] == 1) - { - int_t ljb = LBj( jb, grid ); /* Local block number */ - int_t *index_host; - index_host = A_host->Lrowind_bc_ptr[ljb]; - - temp_LrowindPtr[ljb] = l_ind_len; - temp_LnzvalPtr[ljb] = l_val_len; // ### - Lnzval_size[ljb] = 0; //### - if (index_host != NULL) - { - int_t nrbl = index_host[0]; /* number of L blocks */ - int_t len = index_host[1]; /* LDA of the nzval[] */ - int_t len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; - - /* Global block number is mycol + ljb*Pc */ - int_t nsupc = SuperSize(jb); - - l_ind_len += len1; - l_val_len += len * nsupc; - Lnzval_size[ljb] = len * nsupc ; // ### - } - else - { - Lnzval_size[ljb] = 0 ; // ### - } - - } - } /* endfor jb = 0 ... 
*/ - - /* Copy the actual U indices and values */ - int_t u_k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ - int_t *temp_UrowindPtr = (int_t *) malloc(sizeof(int_t) * u_k); - int_t *temp_UnzvalPtr = (int_t *) malloc(sizeof(int_t) * u_k); - int_t *Unzval_size = (int_t *) malloc(sizeof(int_t) * u_k); - int_t u_ind_len = 0; - int_t u_val_len = 0; - for ( int_t lb = 0; lb < u_k; ++lb) - { - int_t *index_host; - index_host = A_host->Ufstnz_br_ptr[lb]; - temp_UrowindPtr[lb] = u_ind_len; - temp_UnzvalPtr[lb] = u_val_len; - Unzval_size[lb] = 0; - if (index_host != NULL && isNodeInMyGrid[lb * Pr + myrow] == 1) - { - int_t len = index_host[1]; - int_t len1 = index_host[2]; - - u_ind_len += len1; - u_val_len += len; - Unzval_size[lb] = len; - } - else - { - Unzval_size[lb] = 0; - } - } - - gpu_mem_used += l_ind_len * sizeof(int_t); - gpu_mem_used += 2 * l_k * sizeof(int_t); - gpu_mem_used += u_ind_len * sizeof(int_t); - gpu_mem_used += 2 * u_k * sizeof(int_t); - - /* the remaining GPU memory is divided between L and U */ - - for (int_t i = 0; i < l_k; ++i) - { - temp_LnzvalPtr[i] = -1; - } - - for (int_t i = 0; i < u_k; ++i) - { - temp_UnzvalPtr[i] = -1; - } - - /* reset the running value lengths */ - l_val_len = 0; - u_val_len = 0; - - int_t num_gpu_l_blocks = 0; - int_t num_gpu_u_blocks = 0; - size_t mem_l_block, mem_u_block; - - /* Find the trailing matrix size that can fit into GPU memory */ - for (int_t i = nsupers - 1; i > -1; --i) - { - /* walk the elimination tree in reverse, */ - /* i.e. bottom-up ordering */ - int_t i_sup = A_gpu->perm_c_supno[i]; - - int_t pc = PCOL( i_sup, grid ); - if (isNodeInMyGrid[i_sup] == 1) - { - if (mycol == pc ) - { - int_t ljb = LBj(i_sup, grid); - mem_l_block = sizeof(double) * Lnzval_size[ljb]; - if (gpu_mem_used + mem_l_block > max_gpu_memory) - { - break; - } - else - { - gpu_mem_used += mem_l_block; - temp_LnzvalPtr[ljb] = l_val_len; - l_val_len += Lnzval_size[ljb]; - num_gpu_l_blocks++; - A_gpu->first_l_block_gpu = i; - } - } - - int_t pr = PROW( i_sup, grid ); - if (myrow == pr) - { - int_t lib = LBi(i_sup, grid); - mem_u_block = sizeof(double) * Unzval_size[lib]; - if (gpu_mem_used + mem_u_block > max_gpu_memory) - { - break; - } - else - { - gpu_mem_used += mem_u_block; - temp_UnzvalPtr[lib] = u_val_len; - u_val_len += Unzval_size[lib]; - num_gpu_u_blocks++; - A_gpu->first_u_block_gpu = i; - } - - } - } /* endif */ - - } /* endfor i .... nsupers */ - -#if (PRNTlevel>=1) - printf("(%d) Number of L blocks in GPU %d, U blocks %d\n", - grid3d->iam, num_gpu_l_blocks, num_gpu_u_blocks ); - printf("(%d) elimination order of first block in GPU: L block %d, U block %d\n", - grid3d->iam, A_gpu->first_l_block_gpu, A_gpu->first_u_block_gpu); - printf("(%d) Memory of L %.1f GB, memory for U %.1f GB, Total device memory used %.1f GB, Memory allowed %.1f GB \n", grid3d->iam, - l_val_len * sizeof(double) * 1e-9, - u_val_len * sizeof(double) * 1e-9, - gpu_mem_used * 1e-9, max_gpu_memory * 1e-9); - fflush(stdout); -#endif - - /* Assemble index vector on temp */ - int_t *indtemp = (int_t *) malloc(sizeof(int_t) * l_ind_len); - for (int_t jb = 0; jb < nsupers; ++jb) /* for each block column ... 
*/ - { - int_t pc = PCOL( jb, grid ); - if (mycol == pc && isNodeInMyGrid[jb] == 1) - { - int_t ljb = LBj( jb, grid ); /* Local block number */ - int_t *index_host; - index_host = A_host->Lrowind_bc_ptr[ljb]; - - if (index_host != NULL) - { - int_t nrbl = index_host[0]; /* number of L blocks */ - int_t len = index_host[1]; /* LDA of the nzval[] */ - int_t len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; - - memcpy(&indtemp[temp_LrowindPtr[ljb]] , index_host, len1 * sizeof(int_t)) ; - } - } - } - - checkCudaErrors(cudaMalloc( &tmp_ptr, l_ind_len * sizeof(int_t))) ; - A_gpu->LrowindVec = (int_t *) tmp_ptr; - checkCudaErrors(cudaMemcpy( (A_gpu->LrowindVec), indtemp, l_ind_len * sizeof(int_t), cudaMemcpyHostToDevice)) ; - - checkCudaErrors(cudaMalloc( &tmp_ptr, l_val_len * sizeof(double))); - A_gpu->LnzvalVec = (double *) tmp_ptr; - checkCudaErrors(cudaMemset( (A_gpu->LnzvalVec), 0, l_val_len * sizeof(double))); - - checkCudaErrors(cudaMalloc( &tmp_ptr, l_k * sizeof(int_t))) ; - A_gpu->LrowindPtr = (int_t *) tmp_ptr; - checkCudaErrors(cudaMemcpy( (A_gpu->LrowindPtr), temp_LrowindPtr, l_k * sizeof(int_t), cudaMemcpyHostToDevice)) ; - - checkCudaErrors(cudaMalloc( &tmp_ptr, l_k * sizeof(int_t))) ; - A_gpu->LnzvalPtr = (int_t *) tmp_ptr; - checkCudaErrors(cudaMemcpy( (A_gpu->LnzvalPtr), temp_LnzvalPtr, l_k * sizeof(int_t), cudaMemcpyHostToDevice)) ; - - A_gpu->LnzvalPtr_host = temp_LnzvalPtr; - - int_t *indtemp1 = (int_t *) malloc(sizeof(int_t) * u_ind_len); - for ( int_t lb = 0; lb < u_k; ++lb) - { - int_t *index_host; - index_host = A_host->Ufstnz_br_ptr[lb]; - - if (index_host != NULL && isNodeInMyGrid[lb * Pr + myrow] == 1) - { - int_t len1 = index_host[2]; - memcpy(&indtemp1[temp_UrowindPtr[lb]] , index_host, sizeof(int_t)*len1); - - } - } - - checkCudaErrors(cudaMalloc( &tmp_ptr, u_ind_len * sizeof(int_t))) ; - A_gpu->UrowindVec = (int_t *) tmp_ptr; - checkCudaErrors(cudaMemcpy( (A_gpu->UrowindVec), indtemp1, u_ind_len * sizeof(int_t), cudaMemcpyHostToDevice)) ; - - checkCudaErrors(cudaMalloc( &tmp_ptr, u_val_len * sizeof(double))); - A_gpu->UnzvalVec = (double *) tmp_ptr; - checkCudaErrors(cudaMemset( (A_gpu->UnzvalVec), 0, u_val_len * sizeof(double))); - - checkCudaErrors(cudaMalloc( &tmp_ptr, u_k * sizeof(int_t))) ; - A_gpu->UrowindPtr = (int_t *) tmp_ptr; - checkCudaErrors(cudaMemcpy( (A_gpu->UrowindPtr), temp_UrowindPtr, u_k * sizeof(int_t), cudaMemcpyHostToDevice)) ; - - A_gpu->UnzvalPtr_host = temp_UnzvalPtr; - - checkCudaErrors(cudaMalloc( &tmp_ptr, u_k * sizeof(int_t))) ; - A_gpu->UnzvalPtr = (int_t *) tmp_ptr; - checkCudaErrors(cudaMemcpy( (A_gpu->UnzvalPtr), temp_UnzvalPtr, u_k * sizeof(int_t), cudaMemcpyHostToDevice)) ; - - checkCudaErrors(cudaMalloc( &tmp_ptr, (nsupers + 1)*sizeof(int_t))) ; - A_gpu->xsup = (int_t *) tmp_ptr; - checkCudaErrors(cudaMemcpy( (A_gpu->xsup), xsup, (nsupers + 1)*sizeof(int_t), cudaMemcpyHostToDevice)) ; - - checkCudaErrors(cudaMalloc( &tmp_ptr, sizeof(LUstruct_gpu))) ; - *dA_gpu = (LUstruct_gpu *) tmp_ptr; - checkCudaErrors(cudaMemcpy( *dA_gpu, A_gpu, sizeof(LUstruct_gpu), cudaMemcpyHostToDevice)) ; - - free (temp_LrowindPtr); - free (temp_UrowindPtr); - free (indtemp1); - free (indtemp); - -} /* end CopyLUToGPU3D */ - - - -int_t reduceAllAncestors3d_GPU(int_t ilvl, int_t* myNodeCount, - int_t** treePerm, - dLUValSubBuf_t*LUvsb, - LUstruct_t* LUstruct, - gridinfo3d_t* grid3d, - sluGPU_t *sluGPU, - d2Hreduce_t* d2Hred, - factStat_t *factStat, - HyP_t* HyP, - SCT_t* SCT ) -{ - - -// first synchronize all cuda streams - int_t superlu_acc_offload = 
HyP->superlu_acc_offload; - - - - - int_t maxLvl = log2i( (int_t) grid3d->zscp.Np) + 1; - int_t myGrid = grid3d->zscp.Iam; - gridinfo_t* grid = &(grid3d->grid2d); - int_t* gpuLUreduced = factStat->gpuLUreduced; - - - int_t sender; - if ((myGrid % (1 << (ilvl + 1))) == 0) - { - sender = myGrid + (1 << ilvl); - - } - else - { - sender = myGrid; - } - - /*Reduce all the ancestors from the GPU*/ - if (myGrid == sender && superlu_acc_offload) - { - for (int_t streamId = 0; streamId < sluGPU->nCudaStreams; streamId++) - { - double ttx = SuperLU_timer_(); - cudaStreamSynchronize(sluGPU->funCallStreams[streamId]); - SCT->PhiWaitTimer += SuperLU_timer_() - ttx; - sluGPU->lastOffloadStream[streamId] = -1; - } - - for (int_t alvl = ilvl + 1; alvl < maxLvl; ++alvl) - { - /* code */ - // int_t atree = myTreeIdxs[alvl]; - int_t nsAncestor = myNodeCount[alvl]; - int_t* cAncestorList = treePerm[alvl]; - - for (int_t node = 0; node < nsAncestor; node++ ) - { - int_t k = cAncestorList[node]; - if (!gpuLUreduced[k]) - { - - initD2Hreduce(k, d2Hred, 1, - HyP, sluGPU, grid, LUstruct, SCT); - int_t copyL_kljb = d2Hred->copyL_kljb; - int_t copyU_kljb = d2Hred->copyU_kljb; - - double tt_start1 = SuperLU_timer_(); - SCT->PhiMemCpyTimer += SuperLU_timer_() - tt_start1; - if (copyL_kljb || copyU_kljb) SCT->PhiMemCpyCounter++; - sendLUpanelGPU2HOST(k, d2Hred, sluGPU); - /* - Reduce the LU panels from GPU - */ - reduceGPUlu(1, d2Hred, - sluGPU, SCT, grid, LUstruct); - - gpuLUreduced[k] = 1; - } - } - } - } /*if (myGrid == sender)*/ - - dreduceAllAncestors3d(ilvl, myNodeCount, treePerm, - LUvsb, LUstruct, grid3d, SCT ); - return 0; -} - - -void syncAllfunCallStreams(sluGPU_t* sluGPU, SCT_t* SCT) -{ - for (int_t streamId = 0; streamId < sluGPU->nCudaStreams; streamId++) - { - double ttx = SuperLU_timer_(); - cudaStreamSynchronize(sluGPU->funCallStreams[streamId]); - SCT->PhiWaitTimer += SuperLU_timer_() - ttx; - sluGPU->lastOffloadStream[streamId] = -1; - } -} diff --git a/SRC/superlu_zdefs.h b/SRC/superlu_zdefs.h index fb17eacc..0b63cf6c 100644 --- a/SRC/superlu_zdefs.h +++ b/SRC/superlu_zdefs.h @@ -228,6 +228,7 @@ typedef struct { } zSOLVEstruct_t; + /*==== For 3D code ====*/ // new structures for pdgstrf_4_8 @@ -313,13 +314,13 @@ typedef struct { doublecomplex *bigU; doublecomplex *bigV; -} scuBufs_t; +} zscuBufs_t; typedef struct { doublecomplex* BlockLFactor; doublecomplex* BlockUFactor; -} diagFactBufs_t; +} zdiagFactBufs_t; typedef struct { @@ -329,7 +330,8 @@ typedef struct lPanelInfo_t* lPanelInfo; } packLUInfo_t; -/*==== End 3D structures ============*/ +//#endif +/*=====================*/ /*********************************************************************** * Function prototypes @@ -447,7 +449,7 @@ extern void zscatter_u (int ib, int jb, int nsupc, int_t iukp, int_t * xsup, int_t* lsub, int_t* usub, doublecomplex* tempv, int_t ** Ufstnz_br_ptr, doublecomplex **Unzval_br_ptr, gridinfo_t * grid); -extern int_t pzgstrf(superlu_dist_options_t *, int, int, double, +extern int_t pzgstrf(superlu_dist_options_t *, int, int, double anorm, zLUstruct_t*, gridinfo_t*, SuperLUStat_t*, int*); /* #define GPU_PROF @@ -573,7 +575,7 @@ extern int file_PrintDoublecomplex(FILE *fp, char *, int_t, doublecomplex *); /* BLAS */ - + #ifdef USE_VENDOR_BLAS extern void zgemm_(const char*, const char*, const int*, const int*, const int*, const doublecomplex*, const doublecomplex*, const int*, const doublecomplex*, @@ -586,9 +588,8 @@ extern void ztrsm_(const char*, const char*, const char*, const char*, extern void zgemv_(const char 
*, const int *, const int *, const doublecomplex *, const doublecomplex *a, const int *, const doublecomplex *, const int *, const doublecomplex *, doublecomplex *, const int *, int); - + #else - extern int zgemm_(const char*, const char*, const int*, const int*, const int*, const doublecomplex*, const doublecomplex*, const int*, const doublecomplex*, const int*, const doublecomplex*, doublecomplex*, const int*); @@ -600,7 +601,6 @@ extern int ztrsm_(const char*, const char*, const char*, const char*, extern void zgemv_(const char *, const int *, const int *, const doublecomplex *, const doublecomplex *a, const int *, const doublecomplex *, const int *, const doublecomplex *, doublecomplex *, const int *); - #endif extern void zgeru_(const int*, const int*, const doublecomplex*, @@ -611,7 +611,7 @@ extern int zscal_(const int *n, const doublecomplex *alpha, doublecomplex *dx, c extern int zaxpy_(const int *n, const doublecomplex *alpha, const doublecomplex *x, const int *incx, doublecomplex *y, const int *incy); -/* SuperLU BLAS interface: xsuperlu_blas.c.base */ +/* SuperLU BLAS interface: zsuperlu_blas.c */ extern int superlu_zgemm(const char *transa, const char *transb, int m, int n, int k, doublecomplex alpha, doublecomplex *a, int lda, doublecomplex *b, int ldb, doublecomplex beta, doublecomplex *c, int ldc); @@ -632,6 +632,7 @@ extern int superlu_zgemv(const char *trans, const int m, extern int superlu_ztrsv(char *uplo, char *trans, char *diag, int n, doublecomplex *a, int lda, doublecomplex *x, int incx); + // LAPACK routine extern void ztrtri_(char*, char*, int*, doublecomplex*, int*, int*); @@ -640,9 +641,9 @@ extern void ztrtri_(char*, char*, int*, doublecomplex*, int*, int*); extern int zcreate_matrix3d(SuperMatrix *A, int nrhs, doublecomplex **rhs, int *ldb, doublecomplex **x, int *ldx, FILE *fp, gridinfo3d_t *grid3d); -extern int zcreate_matrix_postfix3d(SuperMatrix *A, int nrhs, - doublecomplex **rhs, int *ldb, doublecomplex **x, int *ldx, - FILE *fp, char * postfix, gridinfo3d_t *grid3d); +extern int zcreate_matrix_postfix3d(SuperMatrix *A, int nrhs, doublecomplex **rhs, + int *ldb, doublecomplex **x, int *ldx, + FILE *fp, char * postfix, gridinfo3d_t *grid3d); /* Matrix distributed in NRformat_loc in 3D process grid. 
It converts it to a NRformat_loc distributed in 2D grid in grid-0 */ @@ -650,7 +651,7 @@ extern NRformat_loc3d *zGatherNRformat_loc3d(NRformat_loc *A, doublecomplex *B, int ldb, int nrhs, gridinfo3d_t *grid3d); extern int zScatter_B3d(NRformat_loc3d *A3d, gridinfo3d_t *grid3d); - + extern void pzgssvx3d (superlu_dist_options_t *, SuperMatrix *, zScalePermstruct_t *, doublecomplex B[], int ldb, int nrhs, gridinfo3d_t *, zLUstruct_t *, zSOLVEstruct_t *, @@ -680,7 +681,7 @@ zblock_gemm_scatter( int_t lb, int_t j, Ublock_info_t *Ublock_info, #endif ); -#ifdef _OPENMP +#ifdef _OPENMP /*this version uses a lock to prevent multiple thread updating the same block*/ extern void zblock_gemm_scatter_lock( int_t lb, int_t j, omp_lock_t* lock, @@ -700,7 +701,7 @@ zblock_gemm_scatter_lock( int_t lb, int_t j, omp_lock_t* lock, #endif ); #endif - + extern int_t zblock_gemm_scatterTopLeft( int_t lb, int_t j, doublecomplex* bigV, int_t knsupc, int_t klst, int_t* lsub, @@ -760,11 +761,20 @@ extern void zDestroy_trf3Dpartition(trf3Dpartition_t *trf3Dpartition, gridinfo3d extern void z3D_printMemUse(trf3Dpartition_t* trf3Dpartition, zLUstruct_t *LUstruct, gridinfo3d_t * grid3d); - //extern int* getLastDep(gridinfo_t *grid, SuperLUStat_t *stat, - // superlu_dist_options_t *options, LocalLU_t *Llu, - // int_t* xsup, int_t num_look_aheads, int_t nsupers, - // int_t * iperm_c_supno); +//extern int* getLastDep(gridinfo_t *grid, SuperLUStat_t *stat, +// superlu_dist_options_t *options, zLocalLU_t *Llu, +// int_t* xsup, int_t num_look_aheads, int_t nsupers, +// int_t * iperm_c_supno); + +extern void zinit3DLUstructForest( int_t* myTreeIdxs, int_t* myZeroTrIdxs, + sForest_t** sForests, zLUstruct_t* LUstruct, + gridinfo3d_t* grid3d); +extern int_t zgatherAllFactoredLUFr(int_t* myZeroTrIdxs, sForest_t* sForests, + zLUstruct_t* LUstruct, gridinfo3d_t* grid3d, + SCT_t* SCT ); + + /* The following are from pdgstrf2.h */ extern int_t zLpanelUpdate(int_t off0, int_t nsupc, doublecomplex* ublk_ptr, int_t ld_ujrow, doublecomplex* lusup, int_t nsupr, SCT_t*); extern void Local_Zgstrf2(superlu_dist_options_t *options, int_t k, @@ -822,9 +832,8 @@ int_t zreduceAncestors3d(int_t sender, int_t receiver, int_t nnodes, int_t* nodeList, doublecomplex* Lval_buf, doublecomplex* Uval_buf, zLUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT); - /*reduces all nodelists required in a level*/ -int_t zreduceAllAncestors3d(int_t ilvl, int_t* myNodeCount, +extern int zreduceAllAncestors3d(int_t ilvl, int_t* myNodeCount, int_t** treePerm, zLUValSubBuf_t* LUvsb, zLUstruct_t* LUstruct, @@ -951,6 +960,7 @@ extern int_t zLPanelTrSolve(int_t k, int_t* factored_L, doublecomplex* BlockUFac gridinfo_t *, zLUstruct_t *); /* from trfAux.h */ +extern int getNsupers(int, Glu_persist_t *); extern int_t initPackLUInfo(int_t nsupers, packLUInfo_t* packLUInfo); extern int freePackLUInfo(packLUInfo_t* packLUInfo); extern int_t zSchurComplementSetup(int_t k, int *msgcnt, Ublock_info_t*, @@ -961,33 +971,35 @@ extern int_t zSchurComplementSetup(int_t k, int *msgcnt, Ublock_info_t*, doublecomplex* Uval_buf, gridinfo_t *, zLUstruct_t *); extern int_t zSchurComplementSetupGPU(int_t k, msgs_t* msgs, packLUInfo_t*, int_t*, int_t*, int_t*, gEtreeInfo_t*, - factNodelists_t*, scuBufs_t*, + factNodelists_t*, zscuBufs_t*, zLUValSubBuf_t* LUvsb, gridinfo_t *, zLUstruct_t *, HyP_t*); extern doublecomplex* zgetBigV(int_t, int_t); extern doublecomplex* zgetBigU(int_t, gridinfo_t *, zLUstruct_t *); +// permutation from superLU default /* from treeFactorization.h */ extern 
int_t zLluBufInit(zLUValSubBuf_t*, zLUstruct_t *); extern int_t zinitScuBufs(int_t ldt, int_t num_threads, int_t nsupers, - scuBufs_t*, zLUstruct_t*, gridinfo_t *); -extern int zfreeScuBufs(scuBufs_t* scuBufs); + zscuBufs_t*, zLUstruct_t*, gridinfo_t *); +extern int zfreeScuBufs(zscuBufs_t* scuBufs); // the generic tree factoring code extern int_t treeFactor( int_t nnnodes, // number of nodes in the tree int_t *perm_c_supno, // list of nodes in the order of factorization commRequests_t *comReqs, // lists of communication requests - scuBufs_t *scuBufs, // contains buffers for schur complement update + zscuBufs_t *scuBufs, // contains buffers for schur complement update packLUInfo_t*packLUInfo, msgs_t*msgs, zLUValSubBuf_t* LUvsb, - diagFactBufs_t *dFBuf, + zdiagFactBufs_t *dFBuf, factStat_t *factStat, factNodelists_t *fNlists, superlu_dist_options_t *options, int_t * gIperm_c_supno, int_t ldt, + zLUstruct_t *LUstruct, gridinfo3d_t * grid3d, SuperLUStat_t *stat, double thresh, SCT_t *SCT, int *info ); @@ -997,11 +1009,11 @@ extern int_t zsparseTreeFactor( int_t *perm_c_supno, // list of nodes in the order of factorization treeTopoInfo_t* treeTopoInfo, commRequests_t *comReqs, // lists of communication requests - scuBufs_t *scuBufs, // contains buffers for schur complement update + zscuBufs_t *scuBufs, // contains buffers for schur complement update packLUInfo_t*packLUInfo, msgs_t*msgs, zLUValSubBuf_t* LUvsb, - diagFactBufs_t *dFBuf, + zdiagFactBufs_t *dFBuf, factStat_t *factStat, factNodelists_t *fNlists, superlu_dist_options_t *options, @@ -1016,11 +1028,11 @@ extern int_t zdenseTreeFactor( int_t nnnodes, // number of nodes in the tree int_t *perm_c_supno, // list of nodes in the order of factorization commRequests_t *comReqs, // lists of communication requests - scuBufs_t *scuBufs, // contains buffers for schur complement update + zscuBufs_t *scuBufs, // contains buffers for schur complement update packLUInfo_t*packLUInfo, msgs_t*msgs, zLUValSubBuf_t* LUvsb, - diagFactBufs_t *dFBuf, + zdiagFactBufs_t *dFBuf, factStat_t *factStat, factNodelists_t *fNlists, superlu_dist_options_t *options, @@ -1034,11 +1046,11 @@ extern int_t zdenseTreeFactor( extern int_t zsparseTreeFactor_ASYNC( sForest_t* sforest, commRequests_t **comReqss, // lists of communication requests // size maxEtree level - scuBufs_t *scuBufs, // contains buffers for schur complement update + zscuBufs_t *scuBufs, // contains buffers for schur complement update packLUInfo_t*packLUInfo, msgs_t**msgss, // size=num Look ahead zLUValSubBuf_t** LUvsbs, // size=num Look ahead - diagFactBufs_t **dFBufs, // size maxEtree level + zdiagFactBufs_t **dFBufs, // size maxEtree level factStat_t *factStat, factNodelists_t *fNlists, gEtreeInfo_t* gEtreeInfo, // global etree info @@ -1052,9 +1064,9 @@ extern int_t zsparseTreeFactor_ASYNC( ); extern zLUValSubBuf_t** zLluBufInitArr(int_t numLA, zLUstruct_t *LUstruct); extern int zLluBufFreeArr(int_t numLA, zLUValSubBuf_t **LUvsbs); -extern diagFactBufs_t** zinitDiagFactBufsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* grid); -extern int zfreeDiagFactBufsArr(int_t mxLeafNode, diagFactBufs_t** dFBufs); -extern int_t zinitDiagFactBufs(int_t ldt, diagFactBufs_t* dFBuf); +extern zdiagFactBufs_t** zinitDiagFactBufsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* grid); +extern int zfreeDiagFactBufsArr(int_t mxLeafNode, zdiagFactBufs_t** dFBufs); +extern int_t zinitDiagFactBufs(int_t ldt, zdiagFactBufs_t* dFBuf); extern int_t checkRecvUDiag(int_t k, commRequests_t *comReqs, gridinfo_t *grid, SCT_t *SCT); extern 
int_t checkRecvLDiag(int_t k, commRequests_t *comReqs, gridinfo_t *, SCT_t *); @@ -1064,11 +1076,11 @@ extern int_t ancestorFactor( int_t ilvl, // level of factorization sForest_t* sforest, commRequests_t **comReqss, // lists of communication requests // size maxEtree level - scuBufs_t *scuBufs, // contains buffers for schur complement update + zscuBufs_t *scuBufs, // contains buffers for schur complement update packLUInfo_t*packLUInfo, msgs_t**msgss, // size=num Look ahead zLUValSubBuf_t** LUvsbs, // size=num Look ahead - diagFactBufs_t **dFBufs, // size maxEtree level + zdiagFactBufs_t **dFBufs, // size maxEtree level factStat_t *factStat, factNodelists_t *fNlists, gEtreeInfo_t* gEtreeInfo, // global etree info @@ -1080,7 +1092,8 @@ extern int_t ancestorFactor( double thresh, SCT_t *SCT, int tag_ub, int *info ); -/*== end 3D prototypes =================*/ +/*== end 3D prototypes ===================*/ + #ifdef __cplusplus } diff --git a/SRC/treeFactorizationGPU.c b/SRC/treeFactorizationGPU.c index 24b0639a..45d4f8c0 100644 --- a/SRC/treeFactorizationGPU.c +++ b/SRC/treeFactorizationGPU.c @@ -1,6 +1,6 @@ // #include "treeFactorization.h" // #include "trfCommWrapper.h" -#include "lustruct_gpu.h" +#include "dlustruct_gpu.h" #ifdef __INTEL_COMPILER #include "mkl.h" #else @@ -46,7 +46,7 @@ static int_t getAccUPartition(HyP_t *HyP) return jj_cpu; } -int_t sparseTreeFactor_ASYNC_GPU( +int dsparseTreeFactor_ASYNC_GPU( sForest_t *sforest, commRequests_t **comReqss, // lists of communication requests, // size = maxEtree level @@ -64,7 +64,7 @@ int_t sparseTreeFactor_ASYNC_GPU( sluGPU_t *sluGPU, d2Hreduce_t *d2Hred, HyP_t *HyP, - LUstruct_t *LUstruct, gridinfo3d_t *grid3d, SuperLUStat_t *stat, + dLUstruct_t *LUstruct, gridinfo3d_t *grid3d, SuperLUStat_t *stat, double thresh, SCT_t *SCT, int tag_ub, int *info) { diff --git a/SRC/util.c b/SRC/util.c index 2759641c..f428b42b 100644 --- a/SRC/util.c +++ b/SRC/util.c @@ -263,178 +263,6 @@ void print_sp_ienv_dist(superlu_dist_options_t *options) printf("**************************************************\n"); } -<<<<<<< HEAD -/*! \brief - * - *
- * Purpose
- * =======
- *   Set up the communication pattern for redistribution between B and X
- *   in the triangular solution.
- * 
- * Arguments
- * =========
- *
- * n      (input) int (global)
- *        The dimension of the linear system.
- *
- * m_loc  (input) int (local)
- *        The local row dimension of the distributed input matrix.
- *
- * nrhs   (input) int (global)
- *        Number of right-hand sides.
- *
- * fst_row (input) int (global)
- *        The row number of matrix B's first row in the global matrix.
- *
- * perm_r (input) int* (global)
- *        The row permutation vector.
- *
- * perm_c (input) int* (global)
- *        The column permutation vector.
- *
- * grid   (input) gridinfo_t*
- *        The 2D process mesh.
- * 
- */ -int_t pxgstrs_init(int_t n, int_t m_loc, int_t nrhs, int_t fst_row, - int_t perm_r[], int_t perm_c[], gridinfo_t *grid, - Glu_persist_t *Glu_persist, SOLVEstruct_t *SOLVEstruct) -{ - - int *SendCnt, *SendCnt_nrhs, *RecvCnt, *RecvCnt_nrhs; - int *sdispls, *sdispls_nrhs, *rdispls, *rdispls_nrhs; - int *itemp, *ptr_to_ibuf, *ptr_to_dbuf; - int_t *row_to_proc; - int_t i, gbi, k, l, num_diag_procs, *diag_procs; - int_t irow, q, knsupc, nsupers, *xsup, *supno; - int iam, p, pkk, procs; - pxgstrs_comm_t *gstrs_comm; - - procs = grid->nprow * grid->npcol; - iam = grid->iam; - gstrs_comm = SOLVEstruct->gstrs_comm; - xsup = Glu_persist->xsup; - supno = Glu_persist->supno; - nsupers = Glu_persist->supno[n - 1] + 1; - row_to_proc = SOLVEstruct->row_to_proc; - - /* ------------------------------------------------------------ - SET UP COMMUNICATION PATTERN FOR ReDistribute_B_to_X. - ------------------------------------------------------------*/ - if (!(itemp = SUPERLU_MALLOC(8 * procs * sizeof(int)))) - ABORT("Malloc fails for B_to_X_itemp[]."); - SendCnt = itemp; - SendCnt_nrhs = itemp + procs; - RecvCnt = itemp + 2 * procs; - RecvCnt_nrhs = itemp + 3 * procs; - sdispls = itemp + 4 * procs; - sdispls_nrhs = itemp + 5 * procs; - rdispls = itemp + 6 * procs; - rdispls_nrhs = itemp + 7 * procs; - - /* Count the number of elements to be sent to each diagonal process.*/ - for (p = 0; p < procs; ++p) - SendCnt[p] = 0; - for (i = 0, l = fst_row; i < m_loc; ++i, ++l) - { - irow = perm_c[perm_r[l]]; /* Row number in Pc*Pr*B */ - gbi = BlockNum(irow); - p = PNUM(PROW(gbi, grid), PCOL(gbi, grid), grid); /* Diagonal process */ - ++SendCnt[p]; - } - - /* Set up the displacements for alltoall. */ - MPI_Alltoall(SendCnt, 1, MPI_INT, RecvCnt, 1, MPI_INT, grid->comm); - sdispls[0] = rdispls[0] = 0; - for (p = 1; p < procs; ++p) - { - sdispls[p] = sdispls[p - 1] + SendCnt[p - 1]; - rdispls[p] = rdispls[p - 1] + RecvCnt[p - 1]; - } - for (p = 0; p < procs; ++p) - { - SendCnt_nrhs[p] = SendCnt[p] * nrhs; - sdispls_nrhs[p] = sdispls[p] * nrhs; - RecvCnt_nrhs[p] = RecvCnt[p] * nrhs; - rdispls_nrhs[p] = rdispls[p] * nrhs; - } - - /* This is saved for repeated solves, and is freed in pxgstrs_finalize().*/ - gstrs_comm->B_to_X_SendCnt = SendCnt; - - /* ------------------------------------------------------------ - SET UP COMMUNICATION PATTERN FOR ReDistribute_X_to_B. - ------------------------------------------------------------*/ - /* This is freed in pxgstrs_finalize(). 
*/ - if (!(itemp = SUPERLU_MALLOC(8 * procs * sizeof(int)))) - ABORT("Malloc fails for X_to_B_itemp[]."); - SendCnt = itemp; - SendCnt_nrhs = itemp + procs; - RecvCnt = itemp + 2 * procs; - RecvCnt_nrhs = itemp + 3 * procs; - sdispls = itemp + 4 * procs; - sdispls_nrhs = itemp + 5 * procs; - rdispls = itemp + 6 * procs; - rdispls_nrhs = itemp + 7 * procs; - - /* Count the number of X entries to be sent to each process.*/ - for (p = 0; p < procs; ++p) - SendCnt[p] = 0; - num_diag_procs = SOLVEstruct->num_diag_procs; - diag_procs = SOLVEstruct->diag_procs; - - for (p = 0; p < num_diag_procs; ++p) - { /* for all diagonal processes */ - pkk = diag_procs[p]; - if (iam == pkk) - { - for (k = p; k < nsupers; k += num_diag_procs) - { - knsupc = SuperSize(k); - irow = FstBlockC(k); - for (i = 0; i < knsupc; ++i) - { -#if 0 - q = row_to_proc[inv_perm_c[irow]]; -#else - q = row_to_proc[irow]; -#endif - ++SendCnt[q]; - ++irow; - } - } - } - } - - MPI_Alltoall(SendCnt, 1, MPI_INT, RecvCnt, 1, MPI_INT, grid->comm); - sdispls[0] = rdispls[0] = 0; - sdispls_nrhs[0] = rdispls_nrhs[0] = 0; - SendCnt_nrhs[0] = SendCnt[0] * nrhs; - RecvCnt_nrhs[0] = RecvCnt[0] * nrhs; - for (p = 1; p < procs; ++p) - { - sdispls[p] = sdispls[p - 1] + SendCnt[p - 1]; - rdispls[p] = rdispls[p - 1] + RecvCnt[p - 1]; - sdispls_nrhs[p] = sdispls[p] * nrhs; - rdispls_nrhs[p] = rdispls[p] * nrhs; - SendCnt_nrhs[p] = SendCnt[p] * nrhs; - RecvCnt_nrhs[p] = RecvCnt[p] * nrhs; - } - - /* This is saved for repeated solves, and is freed in pxgstrs_finalize().*/ - gstrs_comm->X_to_B_SendCnt = SendCnt; - - if (!(ptr_to_ibuf = SUPERLU_MALLOC(2 * procs * sizeof(int)))) - ABORT("Malloc fails for ptr_to_ibuf[]."); - gstrs_comm->ptr_to_ibuf = ptr_to_ibuf; - gstrs_comm->ptr_to_dbuf = ptr_to_ibuf + procs; - - return 0; -} /* PXGSTRS_INIT */ - -======= ->>>>>>> Version-7 void pxgstrs_finalize(pxgstrs_comm_t *gstrs_comm) { SUPERLU_FREE(gstrs_comm->B_to_X_SendCnt); @@ -487,21 +315,10 @@ void PStatPrint(superlu_dist_options_t *options, SuperLUStat_t *stat, gridinfo_t if (options->PrintStat == NO) return; -<<<<<<< HEAD if (!iam && options->Fact != FACTORED) { printf("**************************************************\n"); printf("**** Time (seconds) ****\n"); - - if (options->Equil != NO) - printf("\tEQUIL time %8.2f\n", utime[EQUIL]); - if (options->RowPerm != NOROWPERM) - printf("\tROWPERM time %8.2f\n", utime[ROWPERM]); - if (options->ColPerm != NATURAL) - printf("\tCOLPERM time %8.2f\n", utime[COLPERM]); - printf("\tSYMBFACT time %8.2f\n", utime[SYMBFAC]); - printf("\tDISTRIBUTE time %8.2f\n", utime[DIST]); -======= if ( options->Equil != NO ) printf("\tEQUIL time %8.3f\n", utime[EQUIL]); if ( options->RowPerm != NOROWPERM ) @@ -510,29 +327,17 @@ void PStatPrint(superlu_dist_options_t *options, SuperLUStat_t *stat, gridinfo_t printf("\tCOLPERM time %8.3f\n", utime[COLPERM]); printf("\tSYMBFACT time %8.3f\n", utime[SYMBFAC]); printf("\tDISTRIBUTE time %8.3f\n", utime[DIST]); - ->>>>>>> Version-7 } MPI_Reduce(&ops[FACT], &flopcnt, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); factflop = flopcnt; -<<<<<<< HEAD - if (!iam && options->Fact != FACTORED) - { - printf("\tFACTOR time %8.2f\n", utime[FACT]); - if (utime[FACT] != 0.0) - printf("\tFactor flops\t%e\tMflops \t%8.2f\n", - flopcnt, - flopcnt * 1e-6 / utime[FACT]); -======= if ( !iam && options->Fact != FACTORED ) { printf("\tFACTOR time %8.3f\n", utime[FACT]); if ( utime[FACT] != 0.0 ) printf("\tFactor flops\t%e\tMflops \t%8.2f\n", flopcnt, flopcnt*1e-6/utime[FACT]); ->>>>>>> Version-7 } 
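
For reference, the Mflops figure printed just above is the factorization flop count summed over all processes, divided by the factorization wall time. A minimal sketch of that computation, using the same ops[]/utime[] arrays carried in SuperLUStat_t:

    /* Sketch: aggregate flop rate on the root process. */
    float flopcnt;
    MPI_Reduce(&ops[FACT], &flopcnt, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm);
    if (!iam && utime[FACT] != 0.0)
        printf("Factor flops %e\tMflops %8.2f\n",
               flopcnt, flopcnt * 1e-6 / utime[FACT]);
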
MPI_Reduce(&ops[SOLVE], &flopcnt, 1, MPI_FLOAT, MPI_SUM, @@ -702,8 +507,10 @@ void get_diag_procs(int_t n, Glu_persist_t *Glu_persist, gridinfo_t *grid, do { ++(*num_diag_procs); - i = (++i) % nprow; - j = (++j) % npcol; + ++i; + i = (i) % nprow; + ++j; + j = (j) % npcol; pkk = PNUM(i, j, grid); } while (pkk != 0); /* Until wrap back to process 0 */ if (!(*diag_procs = intMalloc_dist(*num_diag_procs))) @@ -714,8 +521,10 @@ { pkk = PNUM(i, j, grid); (*diag_procs)[k] = pkk; - i = (++i) % nprow; - j = (++j) % npcol; + ++i; + i = (i) % nprow; + ++j; + j = (j) % npcol; } for (k = 0; k < nsupers; ++k) { @@ -1178,20 +987,12 @@ int_t num_full_cols_U( } int_t estimate_bigu_size( -<<<<<<< HEAD - int_t nsupers, - int_t **Ufstnz_br_ptr, /* point to U index[] array */ - Glu_persist_t *Glu_persist, - gridinfo_t *grid, int_t *perm_u, - int_t *max_ncols /* Output: Max. number of columns in among all U(k,:). -======= int_t nsupers, - int_t**Ufstnz_br_ptr, /* point to U index[] array */ + int_t **Ufstnz_br_ptr, /* point to U index[] array */ Glu_persist_t *Glu_persist, gridinfo_t* grid, int_t* perm_u, int_t *max_ncols /* Output: Max. number of columns among all U(k,:). ->>>>>>> Version-7 - This is used for allocating GEMM V buffer. */ + This is used for allocating GEMM V buffer. */ ) { int_t iam = grid->iam; @@ -1312,11 +1113,10 @@ void quickSortM(int_t *a, int_t l, int_t r, int_t lda, int_t dir, int_t dims) // printf("dims: %5d",dims); // fflush(stdout); -<<<<<<< HEAD // divide and conquer j = partitionM(a, l, r, lda, dir, dims); - quickSortM(a, l, j - 1, lda, dir, dims); - quickSortM(a, j + 1, r, lda, dir, dims); + quickSortM(a, l, j-1, lda, dir, dims); + quickSortM(a, j+1, r, lda, dir, dims); } } @@ -1383,68 +1183,8 @@ int_t partitionM(int_t *a, int_t l, int_t r, int_t lda, int_t dir, int_t dims) } return 0; -} - -/* - * The following are from 3D code p3dcomm.c - */ - -int AllocGlu_3d(int_t n, int_t nsupers, LUstruct_t *LUstruct) -{ - /*broadcasting Glu_persist*/ - LUstruct->Glu_persist->xsup = intMalloc_dist(nsupers + 1); //INT_T_ALLOC(nsupers+1); - LUstruct->Glu_persist->supno = intMalloc_dist(n); //INT_T_ALLOC(n); - return 0; -} - -// Sherry added -int DeAllocGlu_3d(LUstruct_t *LUstruct) -{ - SUPERLU_FREE(LUstruct->Glu_persist->xsup); - SUPERLU_FREE(LUstruct->Glu_persist->supno); - return 0; -} -======= -int_t partitionM( int_t* a, int_t l, int_t r, int_t lda, int_t dir, int_t dims) { - int_t pivot, i, j, t, dd; - pivot = a[l]; - i = l; j = r+1; - - if(dir==0){ - while( 1) - { - do ++i; while( a[i] <= pivot && i <= r ); - do --j; while( a[j] > pivot ); - if( i >= j ) break; - for(dd=0;dd<dims;dd++){t=a[i+lda*dd]; a[i+lda*dd]=a[j+lda*dd]; a[j+lda*dd]=t;} - } - for(dd=0;dd<dims;dd++){t=a[l+lda*dd]; a[l+lda*dd]=a[j+lda*dd]; a[j+lda*dd]=t;} - }else if(dir==1){ - while( 1 ) - { - do ++i; while( a[i] >= pivot && i <= r ); - do --j; while( a[j] < pivot ); - if( i >= j ) break; - for(dd=0;dd<dims;dd++){t=a[i+lda*dd]; a[i+lda*dd]=a[j+lda*dd]; a[j+lda*dd]=t;} - } - for(dd=0;dd<dims;dd++){t=a[l+lda*dd]; a[l+lda*dd]=a[j+lda*dd]; a[j+lda*dd]=t;} - } - - return j; -} ->>>>>>> Version-7 - int_t **getTreePerm(int_t *myTreeIdxs, int_t *myZeroTrIdxs, int_t *nodeCount, int_t **nodeList, int_t *perm_c_supno, int_t *iperm_c_supno, @@ -1763,4 +1503,36 @@ gemm_division_new (int * num_streams_used, /*number of streams that will be us } } +/* The following are moved from superlu_gpu.cu */ + +int getnCudaStreams() +{ + // Disabling multiple cuda streams + #if 1 + return 1; + #else + char *ttemp; + ttemp = getenv ("N_CUDA_STREAMS"); + + if (ttemp) + return atoi (ttemp); + else + return 1; + #endif +} + +int get_mpi_process_per_gpu () +{ + char *ttemp; + ttemp = getenv ("MPI_PROCESS_PER_GPU"); + + if (ttemp) + return atoi (ttemp); + else + { + printf("MPI_PROCESS_PER_GPU is not set; Using default 1 \n"); + return 1; + } +} + #endif /* 
defined GPU_ACC */ diff --git a/SRC/util_dist.h b/SRC/util_dist.h index ca31e064..a52b7c1f 100644 --- a/SRC/util_dist.h +++ b/SRC/util_dist.h @@ -289,4 +289,5 @@ typedef struct } SCT_t; + #endif /* __SUPERLU_UTIL */ diff --git a/SRC/zscatter3d.c b/SRC/zscatter3d.c index 62e8603a..a4dc8cb9 100644 --- a/SRC/zscatter3d.c +++ b/SRC/zscatter3d.c @@ -18,10 +18,7 @@ at the top-level directory. #else //#include "cblas.h" #endif - -#ifdef _OPENMP #include "omp.h" -#endif #define ISORT #define SCATTER_U_CPU scatter_u @@ -97,11 +94,11 @@ zblock_gemm_scatter( int_t lb, int_t j, ) { // return ; -#ifdef _OPENMP +#ifdef _OPENMP thread_id = omp_get_thread_num(); -#else +#else thread_id = 0; -#endif +#endif int *indirect_thread = indirect + ldt * thread_id; int *indirect2_thread = indirect2 + ldt * thread_id; doublecomplex *tempv1 = bigV + thread_id * ldt * ldt; @@ -131,45 +128,16 @@ zblock_gemm_scatter( int_t lb, int_t j, int_t ib = Remain_info[lb].ib; int temp_nbrow = lsub[lptr + 1]; lptr += LB_DESCRIPTOR; - int_t cum_nrow = (lb == 0 ? 0 : Remain_info[lb - 1].FullRow); - /* Getting L block information */ - // int_t lptr = Remain_info[lb].lptr; - // int_t ib = Remain_info[lb].ib; - // int_t temp_nbrow = lsub[lptr + 1]; - // lptr += LB_DESCRIPTOR; - // int_t cum_nrow = Remain_info[lb].StRow; + int cum_nrow = (lb == 0 ? 0 : Remain_info[lb - 1].FullRow); doublecomplex alpha = {1.0, 0.0}, beta = {0.0, 0.0}; /* calling ZGEMM */ // printf(" m %d n %d k %d ldu %d ldl %d st_col %d \n",temp_nbrow,ncols,ldu,ldl,st_col ); - superlu_zgemm("N", "N", temp_nbrow, ncols, ldu, alpha, &L_mat[(knsupc - ldu)*ldl + cum_nrow], ldl, &U_mat[st_col * ldu], ldu, beta, tempv1, temp_nbrow); -#if 0 // ** replaced by superlu_zgemm -#if 1 - #if defined (USE_VENDOR_BLAS) - zgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha, - &L_mat[(knsupc - ldu)*ldl + cum_nrow], &ldl, - &U_mat[st_col * ldu], &ldu, &beta, tempv1, &temp_nbrow, 1, 1); - #else - zgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha, - &L_mat[(knsupc - ldu)*ldl + cum_nrow], &ldl, - &U_mat[st_col * ldu], &ldu, &beta, tempv1, &temp_nbrow); - #endif -#else - // printf("%d %d %d %d %d %d %d %d\n", temp_nbrow, ncols, ldu, ldl,st_col,(knsupc - ldu)*ldl + cum_nrow,cum_nrow,st_col); - - cblas_zgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, - temp_nbrow, ncols, ldu, alpha, - &L_mat[(knsupc - ldu)*ldl + cum_nrow], ldl, - &U_mat[st_col * ldu], ldu, - beta, tempv1, temp_nbrow); -#endif -#endif // ** replaced by superlu_zgemm - // printf("SCU update: (%d, %d)\n",ib,jb ); #ifdef SCATTER_PROFILE double ttx = SuperLU_timer_(); @@ -254,9 +222,9 @@ zblock_gemm_scatter_lock( int_t lb, int_t j, /* Getting L block information */ int_t lptr = Remain_info[lb].lptr; int_t ib = Remain_info[lb].ib; - int_t temp_nbrow = lsub[lptr + 1]; + int temp_nbrow = lsub[lptr + 1]; lptr += LB_DESCRIPTOR; - int_t cum_nrow = Remain_info[lb].StRow; + int cum_nrow = Remain_info[lb].StRow; doublecomplex alpha = {1.0, 0.0}, beta = {0.0, 0.0}; @@ -265,27 +233,6 @@ zblock_gemm_scatter_lock( int_t lb, int_t j, &L_mat[(knsupc - ldu)*ldl + cum_nrow], ldl, &U_mat[st_col * ldu], ldu, beta, tempv1, temp_nbrow); -#if 0 // replaced by superlu_zgemm -#if 1 - #if defined (USE_VENDOR_BLAS) - // printf(" m %d n %d k %d ldl %d st_col %d \n",temp_nbrow,ncols,ldu,ldl,st_col ); - zgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha, - &L_mat[(knsupc - ldu)*ldl + cum_nrow], &ldl, - &U_mat[st_col * ldu], &ldu, &beta, tempv1, &temp_nbrow, 1, 1); - #else - zgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha, - &L_mat[(knsupc - ldu)*ldl + 
cum_nrow], &ldl, - &U_mat[st_col * ldu], &ldu, &beta, tempv1, &temp_nbrow); - #endif -#else - cblas_zgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, - temp_nbrow, ncols, ldu, alpha, - &L_mat[(knsupc - ldu)*ldl + cum_nrow], ldl, - &U_mat[st_col * ldu], ldu, - beta, tempv1, temp_nbrow); -#endif -#endif // replaced by superlu_zgemm - /*try to get the lock for the block*/ if (lock) /*lock is not null*/ while (!omp_test_lock(lock)) @@ -326,6 +273,7 @@ zblock_gemm_scatter_lock( int_t lb, int_t j, omp_unset_lock(lock); #ifdef SCATTER_PROFILE + //double t_s = (double) __rdtsc() - ttx; double t_s = SuperLU_timer_() - ttx; Host_TheadScatterMOP[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1) + (192 / 8) * (CEILING(ncols, 8) - 1))] += 3.0 * (double ) temp_nbrow * (double ) ncols; @@ -333,7 +281,7 @@ zblock_gemm_scatter_lock( int_t lb, int_t j, += t_s; #endif } /* zblock_gemm_scatter_lock */ -#endif // only if _OPENMP is defined +#endif // Only if _OPENMP is defined // there are following three variations of block_gemm_scatter call @@ -375,13 +323,14 @@ int_t zblock_gemm_scatterTopLeft( int_t lb, /* block number in L */ int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; doublecomplex** Unzval_br_ptr = Llu->Unzval_br_ptr; -#ifdef _OPENMP +#ifdef _OPENMP volatile int_t thread_id = omp_get_thread_num(); -#else +#else volatile int_t thread_id = 0; -#endif - +#endif + // printf("Thread's ID %lld \n", thread_id); + //unsigned long long t1 = _rdtsc(); double t1 = SuperLU_timer_(); zblock_gemm_scatter( lb, j, HyP->Ublock_info, HyP->lookAhead_info, HyP->lookAhead_L_buff, HyP->Lnbrow, @@ -394,6 +343,7 @@ int_t zblock_gemm_scatterTopLeft( int_t lb, /* block number in L */ , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer #endif ); + //unsigned long long t2 = _rdtsc(); double t2 = SuperLU_timer_(); SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1); return 0; @@ -401,7 +351,7 @@ int_t zblock_gemm_scatterTopLeft( int_t lb, /* block number in L */ int_t zblock_gemm_scatterTopRight( int_t lb, int_t j, doublecomplex* bigV, int_t knsupc, int_t klst, int_t* lsub, - int_t * usub, int_t ldt, int* indirect, int* indirect2, + int_t* usub, int_t ldt, int* indirect, int* indirect2, HyP_t* HyP, zLUstruct_t *LUstruct, gridinfo_t* grid, @@ -415,11 +365,12 @@ int_t zblock_gemm_scatterTopRight( int_t lb, int_t j, int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; doublecomplex** Unzval_br_ptr = Llu->Unzval_br_ptr; -#ifdef _OPENMP +#ifdef _OPENMP volatile int_t thread_id = omp_get_thread_num(); -#else +#else volatile int_t thread_id = 0; -#endif +#endif + //unsigned long long t1 = _rdtsc(); double t1 = SuperLU_timer_(); zblock_gemm_scatter( lb, j, HyP->Ublock_info_Phi, HyP->lookAhead_info, HyP->lookAhead_L_buff, HyP->Lnbrow, HyP->bigU_Phi, HyP->ldu_Phi, @@ -429,6 +380,7 @@ int_t zblock_gemm_scatterTopRight( int_t lb, int_t j, , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer #endif ); + //unsigned long long t2 = _rdtsc(); double t2 = SuperLU_timer_(); SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1); return 0; @@ -436,7 +388,7 @@ int_t zblock_gemm_scatterTopRight( int_t lb, int_t j, int_t zblock_gemm_scatterBottomLeft( int_t lb, int_t j, doublecomplex* bigV, int_t knsupc, int_t klst, int_t* lsub, - int_t * usub, int_t ldt, int* indirect, int* indirect2, + int_t* usub, int_t ldt, int* indirect, int* indirect2, HyP_t* HyP, zLUstruct_t *LUstruct, 
gridinfo_t* grid, @@ -450,12 +402,13 @@ int_t zblock_gemm_scatterBottomLeft( int_t lb, int_t j, int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; doublecomplex** Unzval_br_ptr = Llu->Unzval_br_ptr; -#ifdef _OPENMP +#ifdef _OPENMP volatile int_t thread_id = omp_get_thread_num(); -#else +#else volatile int_t thread_id = 0; -#endif +#endif //printf("Thread's ID %lld \n", thread_id); + //unsigned long long t1 = _rdtsc(); double t1 = SuperLU_timer_(); zblock_gemm_scatter( lb, j, HyP->Ublock_info, HyP->Remain_info, HyP->Remain_L_buff, HyP->Rnbrow, HyP->bigU_host, HyP->ldu, @@ -465,6 +418,7 @@ int_t zblock_gemm_scatterBottomLeft( int_t lb, int_t j, , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer #endif ); + //unsigned long long t2 = _rdtsc(); double t2 = SuperLU_timer_(); SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1); return 0; @@ -473,7 +427,7 @@ int_t zblock_gemm_scatterBottomLeft( int_t lb, int_t j, int_t zblock_gemm_scatterBottomRight( int_t lb, int_t j, doublecomplex* bigV, int_t knsupc, int_t klst, int_t* lsub, - int_t * usub, int_t ldt, int* indirect, int* indirect2, + int_t* usub, int_t ldt, int* indirect, int* indirect2, HyP_t* HyP, zLUstruct_t *LUstruct, gridinfo_t* grid, @@ -487,12 +441,13 @@ int_t zblock_gemm_scatterBottomRight( int_t lb, int_t j, int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; doublecomplex** Unzval_br_ptr = Llu->Unzval_br_ptr; -#ifdef _OPENMP +#ifdef _OPENMP volatile int_t thread_id = omp_get_thread_num(); -#else +#else volatile int_t thread_id = 0; -#endif +#endif // printf("Thread's ID %lld \n", thread_id); + //unsigned long long t1 = _rdtsc(); double t1 = SuperLU_timer_(); zblock_gemm_scatter( lb, j, HyP->Ublock_info_Phi, HyP->Remain_info, HyP->Remain_L_buff, HyP->Rnbrow, HyP->bigU_Phi, HyP->ldu_Phi, @@ -503,6 +458,7 @@ int_t zblock_gemm_scatterBottomRight( int_t lb, int_t j, #endif ); + //unsigned long long t2 = _rdtsc(); double t2 = SuperLU_timer_(); SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1); return 0; @@ -527,8 +483,8 @@ scatter_l (int_t ib, int_t *usub, int_t *lsub, double *tempv, - int *indirect_thread, int_t *indirect2, - int **Lrowind_bc_ptr, double **Lnzval_bc_ptr, gridinfo_t *grid) + int *indirect_thread, int *indirect2, + int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr, gridinfo_t *grid) { int_t rel, i, segsize, jj; double *nzval; @@ -636,7 +592,6 @@ scatter_u (int_t ib, indirect[i] = lsub[lptr + i] ; } - iuip_lib += UB_DESCRIPTOR; ucol = &Unzval_br_ptr[lib][ruip_lib]; diff --git a/SRC/ztreeFactorization.c b/SRC/ztreeFactorization.c index f0e65f70..5f401d1c 100644 --- a/SRC/ztreeFactorization.c +++ b/SRC/ztreeFactorization.c @@ -33,19 +33,19 @@ int_t zLluBufInit(zLUValSubBuf_t* LUvsb, zLUstruct_t *LUstruct) return 0; } -diagFactBufs_t** zinitDiagFactBufsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* grid) +zdiagFactBufs_t** zinitDiagFactBufsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* grid) { - diagFactBufs_t** dFBufs; + zdiagFactBufs_t** dFBufs; /* Sherry fix: * mxLeafNode can be 0 for the replicated layers of the processes ?? 
*/ - if ( mxLeafNode ) dFBufs = (diagFactBufs_t** ) - SUPERLU_MALLOC(mxLeafNode * sizeof(diagFactBufs_t*)); + if ( mxLeafNode ) dFBufs = (zdiagFactBufs_t** ) + SUPERLU_MALLOC(mxLeafNode * sizeof(zdiagFactBufs_t*)); for (int i = 0; i < mxLeafNode; ++i) { /* code */ - dFBufs[i] = (diagFactBufs_t* ) SUPERLU_MALLOC(sizeof(diagFactBufs_t)); + dFBufs[i] = (zdiagFactBufs_t* ) SUPERLU_MALLOC(sizeof(zdiagFactBufs_t)); assert(dFBufs[i]); zinitDiagFactBufs(ldt, dFBufs[i]); @@ -55,7 +55,7 @@ diagFactBufs_t** zinitDiagFactBufsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* g } // sherry added -int zfreeDiagFactBufsArr(int_t mxLeafNode, diagFactBufs_t** dFBufs) +int zfreeDiagFactBufsArr(int_t mxLeafNode, zdiagFactBufs_t** dFBufs) { for (int i = 0; i < mxLeafNode; ++i) { SUPERLU_FREE(dFBufs[i]->BlockUFactor); @@ -99,7 +99,7 @@ int zLluBufFreeArr(int_t numLA, zLUValSubBuf_t **LUvsbs) int_t zinitScuBufs(int_t ldt, int_t num_threads, int_t nsupers, - scuBufs_t* scuBufs, + zscuBufs_t* scuBufs, zLUstruct_t* LUstruct, gridinfo_t * grid) { @@ -109,14 +109,14 @@ int_t zinitScuBufs(int_t ldt, int_t num_threads, int_t nsupers, } // sherry added -int zfreeScuBufs(scuBufs_t* scuBufs) +int zfreeScuBufs(zscuBufs_t* scuBufs) { SUPERLU_FREE(scuBufs->bigV); SUPERLU_FREE(scuBufs->bigU); return 0; } -int_t zinitDiagFactBufs(int_t ldt, diagFactBufs_t* dFBuf) +int_t zinitDiagFactBufs(int_t ldt, zdiagFactBufs_t* dFBuf) { dFBuf->BlockUFactor = doublecomplexMalloc_dist(ldt * ldt); //DOUBLE_ALLOC( ldt * ldt); dFBuf->BlockLFactor = doublecomplexMalloc_dist(ldt * ldt); //DOUBLE_ALLOC( ldt * ldt); @@ -127,11 +127,11 @@ int_t zdenseTreeFactor( int_t nnodes, // number of nodes in the tree int_t *perm_c_supno, // list of nodes in the order of factorization commRequests_t *comReqs, // lists of communication requests - scuBufs_t *scuBufs, // contains buffers for schur complement update + zscuBufs_t *scuBufs, // contains buffers for schur complement update packLUInfo_t*packLUInfo, msgs_t*msgs, zLUValSubBuf_t* LUvsb, - diagFactBufs_t *dFBuf, + zdiagFactBufs_t *dFBuf, factStat_t *factStat, factNodelists_t *fNlists, superlu_dist_options_t *options, @@ -247,11 +247,11 @@ int_t zdenseTreeFactor( int_t klst = FstBlockC (k + 1); int_t *lsub = lPanelInfo->lsub; int_t *usub = uPanelInfo->usub; -#ifdef _OPENMP +#ifdef _OPENMP int_t thread_id = omp_get_thread_num(); -#else +#else int_t thread_id = 0; -#endif +#endif zblock_gemm_scatter( lb, ub, Ublock_info, Remain_info, @@ -288,11 +288,11 @@ int_t zdenseTreeFactor( int_t zsparseTreeFactor_ASYNC( sForest_t* sforest, commRequests_t **comReqss, // lists of communication requests // size maxEtree level - scuBufs_t *scuBufs, // contains buffers for schur complement update + zscuBufs_t *scuBufs, // contains buffers for schur complement update packLUInfo_t*packLUInfo, msgs_t**msgss, // size=num Look ahead zLUValSubBuf_t** LUvsbs, // size=num Look ahead - diagFactBufs_t **dFBufs, // size maxEtree level + zdiagFactBufs_t **dFBufs, // size maxEtree level factStat_t *factStat, factNodelists_t *fNlists, gEtreeInfo_t* gEtreeInfo, // global etree info diff --git a/SRC/ztrfAux.c b/SRC/ztrfAux.c index 787d48a7..12a8a782 100644 --- a/SRC/ztrfAux.c +++ b/SRC/ztrfAux.c @@ -336,7 +336,7 @@ int_t zSchurComplementSetupGPU( int_t* myIperm, int_t* iperm_c_supno, int_t*perm_c_supno, gEtreeInfo_t* gEtreeInfo, factNodelists_t* fNlists, - scuBufs_t* scuBufs, zLUValSubBuf_t* LUvsb, + zscuBufs_t* scuBufs, zLUValSubBuf_t* LUvsb, gridinfo_t *grid, zLUstruct_t *LUstruct, HyP_t* HyP) { @@ -485,7 +485,7 @@ int_t 
zSchurComplementSetupGPU( } return LU_nonempty; -} /* dSchurComplementSetupGPU */ +} /* zSchurComplementSetupGPU */ doublecomplex* zgetBigV(int_t ldt, int_t num_threads) @@ -553,10 +553,10 @@ trf3Dpartition_t* zinitTrf3Dpartition(int_t nsupers, CHECK_MALLOC (iam, "Enter zinitTrf3Dpartition()"); #endif int_t* perm_c_supno = getPerm_c_supno(nsupers, options, - LUstruct->etree, - LUstruct->Glu_persist, - LUstruct->Llu->Lrowind_bc_ptr, - LUstruct->Llu->Ufstnz_br_ptr, grid); + LUstruct->etree, + LUstruct->Glu_persist, + LUstruct->Llu->Lrowind_bc_ptr, + LUstruct->Llu->Ufstnz_br_ptr, grid); int_t* iperm_c_supno = getFactIperm(perm_c_supno, nsupers); // calculating tree factorization @@ -564,9 +564,9 @@ trf3Dpartition_t* zinitTrf3Dpartition(int_t nsupers, treeList_t* treeList = setree2list(nsupers, setree ); /*update treelist with weight and depth*/ - getSCUweight(nsupers, treeList,LUstruct->Glu_persist->xsup, - LUstruct->Llu->Lrowind_bc_ptr, LUstruct->Llu->Ufstnz_br_ptr, - grid3d); + getSCUweight(nsupers, treeList, LUstruct->Glu_persist->xsup, + LUstruct->Llu->Lrowind_bc_ptr, LUstruct->Llu->Ufstnz_br_ptr, + grid3d); calcTreeWeight(nsupers, setree, treeList, LUstruct->Glu_persist->xsup); From a81c5f3890e2dc799e30e29911fa3b6640d4400a Mon Sep 17 00:00:00 2001 From: Xiaoye Li Date: Mon, 5 Apr 2021 18:54:50 -0400 Subject: [PATCH 071/147] Added two new files: real and complex _superlu_gpu.cu --- EXAMPLE/Makefile | 2 +- EXAMPLE/pzutil.c | 411 +++++++++- SRC/dnrformat_loc3d.c | 34 +- SRC/dsuperlu_gpu.cu | 1807 ++++++++++++++++++++++++++++++++++++++++ SRC/superlu_defs.h | 5 + SRC/znrformat_loc3d.c | 31 +- SRC/zsuperlu_gpu.cu | 1816 +++++++++++++++++++++++++++++++++++++++++ 7 files changed, 4025 insertions(+), 81 deletions(-) create mode 100644 SRC/dsuperlu_gpu.cu create mode 100644 SRC/zsuperlu_gpu.cu diff --git a/EXAMPLE/Makefile b/EXAMPLE/Makefile index 5af38147..50a786b0 100644 --- a/EXAMPLE/Makefile +++ b/EXAMPLE/Makefile @@ -52,7 +52,7 @@ ZEXM1 = pzdrive1.o zcreate_matrix.o ZEXM2 = pzdrive2.o zcreate_matrix.o zcreate_matrix_perturbed.o ZEXM3 = pzdrive3.o zcreate_matrix.o ZEXM4 = pzdrive4.o zcreate_matrix.o -ZEXM3D = pzdrive3d.o zcreate_matrix.o zcreate_matrix3d.o +ZEXM3D = pzdrive3d.o zcreate_matrix.o zcreate_matrix3d.o pzutil.o ZEXMG = pzdrive_ABglobal.o ZEXMG1 = pzdrive1_ABglobal.o ZEXMG2 = pzdrive2_ABglobal.o diff --git a/EXAMPLE/pzutil.c b/EXAMPLE/pzutil.c index 8efa3285..2bfcfd6e 100644 --- a/EXAMPLE/pzutil.c +++ b/EXAMPLE/pzutil.c @@ -1,15 +1,15 @@ /*! \file Copyright (c) 2003, The Regents of the University of California, through -Lawrence Berkeley National Laboratory (subject to receipt of any required -approvals from U.S. Dept. of Energy) +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) -All rights reserved. +All rights reserved. The source code is distributed under BSD license, see the file License.txt at the top-level directory. */ -/*! @file +/*! @file * \brief Several matrix utilities * *
@@ -38,15 +38,15 @@ int pzCompRow_loc_to_CompCol_global
     int_t *colind, *rowptr;
     int_t *colptr_loc, *rowind_loc;
     int_t m_loc, n, i, j, k, l;
-    int_t colnnz, fst_row, m_loc_max, nnz_loc, nnz_max, nnz;
+    int_t colnnz, fst_row, nnz_loc, nnz;
     doublecomplex *a_recv;  /* Buffer to receive the blocks of values. */
     doublecomplex *a_buf;   /* Buffer to merge blocks into block columns. */
-    int_t *colcnt, *itemp;
-    int_t *colptr_send; /* Buffer to redistribute the column pointers of the 
+    int_t *itemp;
+    int_t *colptr_send; /* Buffer to redistribute the column pointers of the
 			   local block rows.
 			   Use n_loc+1 pointers for each block. */
     int_t *colptr_blk;  /* The column pointers for each block, after
-			   redistribution to the local block columns. 
+			   redistribution to the local block columns.
 			   Use n_loc+1 pointers for each block. */
     int_t *rowind_recv; /* Buffer to receive the blocks of row indices. */
     int_t *rowind_buf;  /* Buffer to merge blocks into block columns. */
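/* [Editor's sketch, not part of the patch] Layout assumed by the two
 * buffers above: one (n_loc+1)-entry column-pointer segment per block,
 * so block i starts at offset i*(n_loc+1) -- exactly how the merge loop
 * below computes k = i * (n_loc + 1).  The helper name is hypothetical;
 * int_t is the SuperLU integer type from superlu_defs.h. */
static int_t blk_colptr_entry(const int_t *colptr_blk, int i, int n_loc, int j)
{
    return colptr_blk[i * (n_loc + 1) + j]; /* start of column j in block i */
}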
@@ -164,7 +164,7 @@ int pzCompRow_loc_to_CompCol_global
                       a_recv, recvcnts, rdispls, SuperLU_MPI_DOUBLE_COMPLEX,
                       grid->comm);
     }
-      
+
     /* Reset colptr_loc[] to point to the n_loc global columns. */
     colptr_loc[0] = 0;
     itemp = colptr_send;
@@ -178,7 +178,7 @@ int pzCompRow_loc_to_CompCol_global
 	itemp[j] = colptr_loc[j]; /* Save a copy of the column starts */
     }
     itemp[n_loc] = colptr_loc[n_loc];
-      
+
     /* Merge blocks of row indices into columns of row indices. */
     for (i = 0; i < procs; ++i) {
         k = i * (n_loc + 1);
@@ -219,12 +219,12 @@ int pzCompRow_loc_to_CompCol_global
     MPI_Allgather(&nnz_loc, 1, mpi_int_t, itemp, 1, mpi_int_t, grid->comm);
     for (i = 0, nnz = 0; i < procs; ++i) nnz += itemp[i];
     GAstore->nnz = nnz;
-    
+
     if ( !(GAstore->rowind = (int_t *) intMalloc_dist (nnz)) )
         ABORT ("SUPERLU_MALLOC fails for GAstore->rowind[]");
     if ( !(GAstore->colptr = (int_t *) intMalloc_dist (n+1)) )
         ABORT ("SUPERLU_MALLOC fails for GAstore->colptr[]");
-      
+
     /* Allgatherv for row indices. */
     rdispls[0] = 0;
     for (i = 0; i < procs-1; ++i) {
@@ -233,12 +233,12 @@ int pzCompRow_loc_to_CompCol_global
     }
     itemp_32[procs-1] = itemp[procs-1];
     it = nnz_loc;
-    MPI_Allgatherv(rowind_buf, it, mpi_int_t, GAstore->rowind, 
+    MPI_Allgatherv(rowind_buf, it, mpi_int_t, GAstore->rowind,
 		   itemp_32, rdispls, mpi_int_t, grid->comm);
     if ( need_value ) {
       if ( !(GAstore->nzval = (doublecomplex *) doublecomplexMalloc_dist (nnz)) )
           ABORT ("SUPERLU_MALLOC fails for GAstore->rnzval[]");
-      MPI_Allgatherv(a_buf, it, SuperLU_MPI_DOUBLE_COMPLEX, GAstore->nzval, 
+      MPI_Allgatherv(a_buf, it, SuperLU_MPI_DOUBLE_COMPLEX, GAstore->nzval,
 		     itemp_32, rdispls, SuperLU_MPI_DOUBLE_COMPLEX, grid->comm);
     } else GAstore->nzval = NULL;
 
@@ -249,7 +249,7 @@ int pzCompRow_loc_to_CompCol_global
         itemp_32[i] = n_locs[i];
     }
     itemp_32[procs-1] = n_locs[procs-1];
-    MPI_Allgatherv(colptr_loc, n_loc, mpi_int_t, GAstore->colptr, 
+    MPI_Allgatherv(colptr_loc, n_loc, mpi_int_t, GAstore->colptr,
 		   itemp_32, rdispls, mpi_int_t, grid->comm);
 
     /* Recompute column pointers. */
@@ -277,7 +277,7 @@ int pzCompRow_loc_to_CompCol_global
     SUPERLU_FREE(rowind_recv);
     if ( need_value) SUPERLU_FREE(a_recv);
 #if ( DEBUGlevel>=1 )
-    if ( !grid->iam ) printf("sizeof(NCformat) %d\n", sizeof(NCformat));
+    if ( !grid->iam ) printf("sizeof(NCformat) %lu\n", sizeof(NCformat));
     CHECK_MALLOC(grid->iam, "Exit pzCompRow_loc_to_CompCol_global");
 #endif
     return 0;
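/* [Editor's sketch, not part of the patch] The gather stage above uses one
 * idiom three times (row indices, values, column pointers): gather the
 * per-process counts, prefix-sum them into displacements, then
 * MPI_Allgatherv the payload.  A standalone rendering of that idiom,
 * with a hypothetical function name: */
#include <mpi.h>

static void allgatherv_by_counts(const int *mine, int my_cnt, int *all,
                                 int procs, MPI_Comm comm)
{
    int cnts[procs], displs[procs];
    MPI_Allgather(&my_cnt, 1, MPI_INT, cnts, 1, MPI_INT, comm);
    displs[0] = 0;
    for (int p = 1; p < procs; ++p)
        displs[p] = displs[p-1] + cnts[p-1];   /* prefix sum of counts */
    MPI_Allgatherv(mine, my_cnt, MPI_INT,
                   all, cnts, displs, MPI_INT, comm);
}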
@@ -371,7 +371,7 @@ int pzPermute_Dense_Matrix
 	++ptr_to_ibuf[p];
 	ptr_to_dbuf[p] += nrhs;
     }
-	  
+
     /* Transfer the (permuted) row indices and numerical values. */
     MPI_Alltoallv(send_ibuf, sendcnts, sdispls, mpi_int_t,
 		  recv_ibuf, recvcnts, rdispls, mpi_int_t, grid->comm);
@@ -397,12 +397,299 @@ int pzPermute_Dense_Matrix
 } /* pzPermute_Dense_Matrix */
 
 
+/*! \brief Allocate storage in LUstruct */
+void zLUstructInit(const int_t n, zLUstruct_t *LUstruct)
+{
+    if ( !(LUstruct->etree = intMalloc_dist(n)) )
+	ABORT("Malloc fails for etree[].");
+    if ( !(LUstruct->Glu_persist = (Glu_persist_t *)
+	   SUPERLU_MALLOC(sizeof(Glu_persist_t))) )
+	ABORT("Malloc fails for Glu_persist_t.");
+    if ( !(LUstruct->Llu = (zLocalLU_t *)
+	   SUPERLU_MALLOC(sizeof(zLocalLU_t))) )
+	ABORT("Malloc fails for LocalLU_t.");
+	LUstruct->Llu->inv = 0;
+}
+
+/*! \brief Deallocate LUstruct */
+void zLUstructFree(zLUstruct_t *LUstruct)
+{
+#if ( DEBUGlevel>=1 )
+    int iam;
+    MPI_Comm_rank( MPI_COMM_WORLD, &iam );
+    CHECK_MALLOC(iam, "Enter zLUstructFree()");
+#endif
+
+    SUPERLU_FREE(LUstruct->etree);
+    SUPERLU_FREE(LUstruct->Glu_persist);
+    SUPERLU_FREE(LUstruct->Llu);
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit zLUstructFree()");
+#endif
+}
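/* [Editor's sketch, not part of the patch] Intended pairing of the two
 * routines above with zDestroy_LU() below: the containers allocated in
 * zLUstructInit() outlive the factored data hanging off them, so
 * teardown runs in reverse order.  n and grid are assumed in scope. */
    zLUstruct_t LUstruct;
    zLUstructInit(n, &LUstruct);       /* etree, Glu_persist, Llu containers */
    /* ... factorization / solves populate LUstruct ... */
    zDestroy_LU(n, &grid, &LUstruct);  /* free the distributed L and U data  */
    zLUstructFree(&LUstruct);          /* then release the containers        */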
+
+/*! \brief Destroy distributed L & U matrices. */
+void
+zDestroy_LU(int_t n, gridinfo_t *grid, zLUstruct_t *LUstruct)
+{
+    int_t i, nb, nsupers;
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    zLocalLU_t *Llu = LUstruct->Llu;
+
+#if ( DEBUGlevel>=1 )
+    int iam;
+    MPI_Comm_rank( MPI_COMM_WORLD, &iam );
+    CHECK_MALLOC(iam, "Enter zDestroy_LU()");
+#endif
+
+    zDestroy_Tree(n, grid, LUstruct);
+
+    nsupers = Glu_persist->supno[n-1] + 1;
+
+    nb = CEILING(nsupers, grid->npcol);
+    for (i = 0; i < nb; ++i) 
+	if ( Llu->Lrowind_bc_ptr[i] ) {
+	    SUPERLU_FREE (Llu->Lrowind_bc_ptr[i]);
+#if 0 // Sherry: the following is not allocated with cudaHostAlloc    
+    //#ifdef GPU_ACC
+	    checkCuda(cudaFreeHost(Llu->Lnzval_bc_ptr[i]));
+#endif
+	    SUPERLU_FREE (Llu->Lnzval_bc_ptr[i]);
+	}
+    SUPERLU_FREE (Llu->Lrowind_bc_ptr);
+    SUPERLU_FREE (Llu->Lnzval_bc_ptr);
+
+    nb = CEILING(nsupers, grid->nprow);
+    for (i = 0; i < nb; ++i)
+	if ( Llu->Ufstnz_br_ptr[i] ) {
+	    SUPERLU_FREE (Llu->Ufstnz_br_ptr[i]);
+	    SUPERLU_FREE (Llu->Unzval_br_ptr[i]);
+	}
+    SUPERLU_FREE (Llu->Ufstnz_br_ptr);
+    SUPERLU_FREE (Llu->Unzval_br_ptr);
+
+    /* The following can be freed after factorization. */
+    SUPERLU_FREE(Llu->ToRecv);
+    SUPERLU_FREE(Llu->ToSendD);
+    SUPERLU_FREE(Llu->ToSendR[0]);
+    SUPERLU_FREE(Llu->ToSendR);
+
+    /* The following can be freed only after iterative refinement. */
+    SUPERLU_FREE(Llu->ilsum);
+    SUPERLU_FREE(Llu->fmod);
+    SUPERLU_FREE(Llu->fsendx_plist[0]);
+    SUPERLU_FREE(Llu->fsendx_plist);
+    SUPERLU_FREE(Llu->bmod);
+    SUPERLU_FREE(Llu->bsendx_plist[0]);
+    SUPERLU_FREE(Llu->bsendx_plist);
+    SUPERLU_FREE(Llu->mod_bit);
+
+    nb = CEILING(nsupers, grid->npcol);
+    for (i = 0; i < nb; ++i) 
+	if ( Llu->Lindval_loc_bc_ptr[i]!=NULL) {
+	    SUPERLU_FREE (Llu->Lindval_loc_bc_ptr[i]);
+	}	
+    SUPERLU_FREE(Llu->Lindval_loc_bc_ptr);
+	
+    nb = CEILING(nsupers, grid->npcol);
+    for (i=0; i<nb; ++i) {
+	if(Llu->Linv_bc_ptr[i]!=NULL) {
+	    SUPERLU_FREE(Llu->Linv_bc_ptr[i]);
+	}
+	if(Llu->Uinv_bc_ptr[i]!=NULL){
+	    SUPERLU_FREE(Llu->Uinv_bc_ptr[i]);
+	}	
+    }
+    SUPERLU_FREE(Llu->Linv_bc_ptr);
+    SUPERLU_FREE(Llu->Uinv_bc_ptr);
+    SUPERLU_FREE(Llu->Unnz);
+	
+    nb = CEILING(nsupers, grid->npcol);
+    for (i = 0; i < nb; ++i)
+	if ( Llu->Urbs[i] ) {
+	    SUPERLU_FREE(Llu->Ucb_indptr[i]);
+	    SUPERLU_FREE(Llu->Ucb_valptr[i]);
+	}
+    SUPERLU_FREE(Llu->Ucb_indptr);
+    SUPERLU_FREE(Llu->Ucb_valptr);	
+    SUPERLU_FREE(Llu->Urbs);
+
+    SUPERLU_FREE(Glu_persist->xsup);
+    SUPERLU_FREE(Glu_persist->supno);
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit zDestroy_LU()");
+#endif
+}
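/* [Editor's sketch, not part of the patch] The loop bound used throughout
 * zDestroy_LU() above: supernodal blocks are dealt out block-cyclically,
 * so this process column stores at most CEILING(nsupers, npcol) block
 * columns, and local slot i holds global block column i*npcol + mycol
 * (CEILING and MYCOL are the grid macros from superlu_defs.h). */
    nb = CEILING(nsupers, grid->npcol);
    for (i = 0; i < nb; ++i) {
        int_t gb = i * grid->npcol + MYCOL(grid->iam, grid); /* global block */
        if (gb < nsupers && Llu->Lrowind_bc_ptr[i])
            ; /* local block column i <-> global block column gb */
    }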
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *   Set up the communication pattern for redistribution between B and X
+ *   in the triangular solution.
+ * 
+ * Arguments
+ * =========
+ *
+ * n      (input) int (global)
+ *        The dimension of the linear system.
+ *
+ * m_loc  (input) int (local)
+ *        The local row dimension of the distributed input matrix.
+ *
+ * nrhs   (input) int (global)
+ *        Number of right-hand sides.
+ *
+ * fst_row (input) int (global)
+ *        The row number of matrix B's first row in the global matrix.
+ *
+ * perm_r (input) int* (global)
+ *        The row permutation vector.
+ *
+ * perm_c (input) int* (global)
+ *        The column permutation vector.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ * </pre>
+ */ +int_t +pzgstrs_init(int_t n, int_t m_loc, int_t nrhs, int_t fst_row, + int_t perm_r[], int_t perm_c[], gridinfo_t *grid, + Glu_persist_t *Glu_persist, zSOLVEstruct_t *SOLVEstruct) +{ + + int *SendCnt, *SendCnt_nrhs, *RecvCnt, *RecvCnt_nrhs; + int *sdispls, *sdispls_nrhs, *rdispls, *rdispls_nrhs; + int *itemp, *ptr_to_ibuf, *ptr_to_dbuf; + int_t *row_to_proc; + int_t i, gbi, k, l, num_diag_procs, *diag_procs; + int_t irow, q, knsupc, nsupers, *xsup, *supno; + int iam, p, pkk, procs; + pxgstrs_comm_t *gstrs_comm; + + procs = grid->nprow * grid->npcol; + iam = grid->iam; + gstrs_comm = SOLVEstruct->gstrs_comm; + xsup = Glu_persist->xsup; + supno = Glu_persist->supno; + nsupers = Glu_persist->supno[n-1] + 1; + row_to_proc = SOLVEstruct->row_to_proc; + + /* ------------------------------------------------------------ + SET UP COMMUNICATION PATTERN FOR ReDistribute_B_to_X. + ------------------------------------------------------------*/ + if ( !(itemp = SUPERLU_MALLOC(8*procs * sizeof(int))) ) + ABORT("Malloc fails for B_to_X_itemp[]."); + SendCnt = itemp; + SendCnt_nrhs = itemp + procs; + RecvCnt = itemp + 2*procs; + RecvCnt_nrhs = itemp + 3*procs; + sdispls = itemp + 4*procs; + sdispls_nrhs = itemp + 5*procs; + rdispls = itemp + 6*procs; + rdispls_nrhs = itemp + 7*procs; + + /* Count the number of elements to be sent to each diagonal process.*/ + for (p = 0; p < procs; ++p) SendCnt[p] = 0; + for (i = 0, l = fst_row; i < m_loc; ++i, ++l) { + irow = perm_c[perm_r[l]]; /* Row number in Pc*Pr*B */ + gbi = BlockNum( irow ); + p = PNUM( PROW(gbi,grid), PCOL(gbi,grid), grid ); /* Diagonal process */ + ++SendCnt[p]; + } + + /* Set up the displacements for alltoall. */ + MPI_Alltoall(SendCnt, 1, MPI_INT, RecvCnt, 1, MPI_INT, grid->comm); + sdispls[0] = rdispls[0] = 0; + for (p = 1; p < procs; ++p) { + sdispls[p] = sdispls[p-1] + SendCnt[p-1]; + rdispls[p] = rdispls[p-1] + RecvCnt[p-1]; + } + for (p = 0; p < procs; ++p) { + SendCnt_nrhs[p] = SendCnt[p] * nrhs; + sdispls_nrhs[p] = sdispls[p] * nrhs; + RecvCnt_nrhs[p] = RecvCnt[p] * nrhs; + rdispls_nrhs[p] = rdispls[p] * nrhs; + } + + /* This is saved for repeated solves, and is freed in pxgstrs_finalize().*/ + gstrs_comm->B_to_X_SendCnt = SendCnt; + + /* ------------------------------------------------------------ + SET UP COMMUNICATION PATTERN FOR ReDistribute_X_to_B. + ------------------------------------------------------------*/ + /* This is freed in pxgstrs_finalize(). 
*/ + if ( !(itemp = SUPERLU_MALLOC(8*procs * sizeof(int))) ) + ABORT("Malloc fails for X_to_B_itemp[]."); + SendCnt = itemp; + SendCnt_nrhs = itemp + procs; + RecvCnt = itemp + 2*procs; + RecvCnt_nrhs = itemp + 3*procs; + sdispls = itemp + 4*procs; + sdispls_nrhs = itemp + 5*procs; + rdispls = itemp + 6*procs; + rdispls_nrhs = itemp + 7*procs; + + /* Count the number of X entries to be sent to each process.*/ + for (p = 0; p < procs; ++p) SendCnt[p] = 0; + num_diag_procs = SOLVEstruct->num_diag_procs; + diag_procs = SOLVEstruct->diag_procs; + + for (p = 0; p < num_diag_procs; ++p) { /* for all diagonal processes */ + pkk = diag_procs[p]; + if ( iam == pkk ) { + for (k = p; k < nsupers; k += num_diag_procs) { + knsupc = SuperSize( k ); + irow = FstBlockC( k ); + for (i = 0; i < knsupc; ++i) { +#if 0 + q = row_to_proc[inv_perm_c[irow]]; +#else + q = row_to_proc[irow]; +#endif + ++SendCnt[q]; + ++irow; + } + } + } + } + + MPI_Alltoall(SendCnt, 1, MPI_INT, RecvCnt, 1, MPI_INT, grid->comm); + sdispls[0] = rdispls[0] = 0; + sdispls_nrhs[0] = rdispls_nrhs[0] = 0; + SendCnt_nrhs[0] = SendCnt[0] * nrhs; + RecvCnt_nrhs[0] = RecvCnt[0] * nrhs; + for (p = 1; p < procs; ++p) { + sdispls[p] = sdispls[p-1] + SendCnt[p-1]; + rdispls[p] = rdispls[p-1] + RecvCnt[p-1]; + sdispls_nrhs[p] = sdispls[p] * nrhs; + rdispls_nrhs[p] = rdispls[p] * nrhs; + SendCnt_nrhs[p] = SendCnt[p] * nrhs; + RecvCnt_nrhs[p] = RecvCnt[p] * nrhs; + } + + /* This is saved for repeated solves, and is freed in pxgstrs_finalize().*/ + gstrs_comm->X_to_B_SendCnt = SendCnt; + + if ( !(ptr_to_ibuf = SUPERLU_MALLOC(2*procs * sizeof(int))) ) + ABORT("Malloc fails for ptr_to_ibuf[]."); + gstrs_comm->ptr_to_ibuf = ptr_to_ibuf; + gstrs_comm->ptr_to_dbuf = ptr_to_ibuf + procs; + + return 0; +} /* PZGSTRS_INIT */ + + /*! \brief Initialize the data structure for the solution phase. */ -int zSolveInit(superlu_options_t *options, SuperMatrix *A, +int zSolveInit(superlu_dist_options_t *options, SuperMatrix *A, int_t perm_r[], int_t perm_c[], int_t nrhs, - LUstruct_t *LUstruct, gridinfo_t *grid, - SOLVEstruct_t *SOLVEstruct) + zLUstruct_t *LUstruct, gridinfo_t *grid, + zSOLVEstruct_t *SOLVEstruct) { int_t *row_to_proc, *inv_perm_c, *itemp; NRformat_loc *Astore; @@ -413,29 +700,21 @@ int zSolveInit(superlu_options_t *options, SuperMatrix *A, fst_row = Astore->fst_row; m_loc = Astore->m_loc; procs = grid->nprow * grid->npcol; - - if ( !grid->iam ) printf("@@@ enter zSolveInit, A->nrow %d\n", A->nrow); if ( !(row_to_proc = intMalloc_dist(A->nrow)) ) ABORT("Malloc fails for row_to_proc[]"); - if ( !grid->iam ) { printf("@@@ malloc(1) zSolveInit\n"); fflush(stdout); } SOLVEstruct->row_to_proc = row_to_proc; - if ( !(inv_perm_c = intMalloc_dist(A->ncol)) ) ABORT("Malloc fails for inv_perm_c[]."); - if ( !grid->iam ) { printf("@@@ malloc(2) zSolveInit\n"); fflush(stdout); } - for (i = 0; i < A->ncol; ++i) inv_perm_c[perm_c[i]] = i; SOLVEstruct->inv_perm_c = inv_perm_c; - if ( !grid->iam ) printf("@@@ after malloc zSolveInit\n"); - /* ------------------------------------------------------------ EVERY PROCESS NEEDS TO KNOW GLOBAL PARTITION. SET UP THE MAPPING BETWEEN ROWS AND PROCESSES. - + NOTE: For those processes that do not own any row, it must - must be set so that fst_row == A->nrow. + must be set so that fst_row == A->nrow. 
------------------------------------------------------------*/ if ( !(itemp = intMalloc_dist(procs+1)) ) ABORT("Malloc fails for itemp[]"); @@ -445,11 +724,6 @@ int zSolveInit(superlu_options_t *options, SuperMatrix *A, for (p = 0; p < procs; ++p) { for (i = itemp[p] ; i < itemp[p+1]; ++i) row_to_proc[i] = p; } - - if ( !grid->iam ) printf("@@@ after allgather zSolveInit\n"); - -#define DEBUGlevel 2 - #if ( DEBUGlevel>=2 ) if ( !grid->iam ) { printf("fst_row = %d\n", fst_row); @@ -475,34 +749,35 @@ int zSolveInit(superlu_options_t *options, SuperMatrix *A, for (i = j ; i < k; ++i) row_to_proc[i] = p; } } -#endif +#endif get_diag_procs(A->ncol, LUstruct->Glu_persist, grid, &SOLVEstruct->num_diag_procs, &SOLVEstruct->diag_procs, &SOLVEstruct->diag_len); + /* Setup communication pattern for redistribution of B and X. */ if ( !(SOLVEstruct->gstrs_comm = (pxgstrs_comm_t *) SUPERLU_MALLOC(sizeof(pxgstrs_comm_t))) ) ABORT("Malloc fails for gstrs_comm[]"); - pxgstrs_init(A->ncol, m_loc, nrhs, fst_row, perm_r, perm_c, grid, + pzgstrs_init(A->ncol, m_loc, nrhs, fst_row, perm_r, perm_c, grid, LUstruct->Glu_persist, SOLVEstruct); if ( !(SOLVEstruct->gsmv_comm = (pzgsmv_comm_t *) SUPERLU_MALLOC(sizeof(pzgsmv_comm_t))) ) ABORT("Malloc fails for gsmv_comm[]"); SOLVEstruct->A_colind_gsmv = NULL; - + options->SolveInitialized = YES; return 0; } /* zSolveInit */ /*! \brief Release the resources used for the solution phase. */ -void zSolveFinalize(superlu_options_t *options, SOLVEstruct_t *SOLVEstruct) +void zSolveFinalize(superlu_dist_options_t *options, zSOLVEstruct_t *SOLVEstruct) { - int_t *it; pxgstrs_finalize(SOLVEstruct->gstrs_comm); + if ( options->RefineInitialized ) { pzgsmv_finalize(SOLVEstruct->gsmv_comm); options->RefineInitialized = NO; @@ -512,14 +787,14 @@ void zSolveFinalize(superlu_options_t *options, SOLVEstruct_t *SOLVEstruct) SUPERLU_FREE(SOLVEstruct->inv_perm_c); SUPERLU_FREE(SOLVEstruct->diag_procs); SUPERLU_FREE(SOLVEstruct->diag_len); - if ( it = SOLVEstruct->A_colind_gsmv ) SUPERLU_FREE(it); + if ( SOLVEstruct->A_colind_gsmv ) SUPERLU_FREE(SOLVEstruct->A_colind_gsmv); options->SolveInitialized = NO; } /* zSolveFinalize */ -/*! \brief Check the inf-norm of the error vector +/*! \brief Check the inf-norm of the error vector */ void pzinf_norm_error(int iam, int_t n, int_t nrhs, doublecomplex x[], int_t ldx, - doublecomplex xtrue[], int_t ldxtrue, gridinfo_t *grid) + doublecomplex xtrue[], int_t ldxtrue, MPI_Comm slucomm) { double err, xnorm, temperr, tempxnorm; doublecomplex *x_work, *xtrue_work; @@ -535,15 +810,61 @@ void pzinf_norm_error(int iam, int_t n, int_t nrhs, doublecomplex x[], int_t ldx err = SUPERLU_MAX(err, slud_z_abs(&temp)); xnorm = SUPERLU_MAX(xnorm, slud_z_abs(&x_work[i])); } + printf("\t(%d) err = %e\txnorm = %e\n", iam, err, xnorm); /* get the golbal max err & xnrom */ temperr = err; tempxnorm = xnorm; - MPI_Allreduce( &temperr, &err, 1, MPI_DOUBLE, MPI_MAX, grid->comm); - MPI_Allreduce( &tempxnorm, &xnorm, 1, MPI_DOUBLE, MPI_MAX, grid->comm); + MPI_Allreduce( &temperr, &err, 1, MPI_DOUBLE, MPI_MAX, slucomm); + MPI_Allreduce( &tempxnorm, &xnorm, 1, MPI_DOUBLE, MPI_MAX, slucomm); err = err / xnorm; if ( !iam ) printf("\tSol %2d: ||X-Xtrue||/||X|| = %e\n", j, err); } } +/*! 
\brief Destroy broadcast and reduction trees used in triangular solve */
+void
+zDestroy_Tree(int_t n, gridinfo_t *grid, zLUstruct_t *LUstruct)
+{
+    int_t i, nb, nsupers;
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    zLocalLU_t *Llu = LUstruct->Llu;
+#if ( DEBUGlevel>=1 )
+    int iam;
+    MPI_Comm_rank( MPI_COMM_WORLD, &iam );
+    CHECK_MALLOC(iam, "Enter Destroy_Tree()");
+#endif
+
+    nsupers = Glu_persist->supno[n-1] + 1;
+
+    nb = CEILING(nsupers, grid->npcol);
+    for (i=0; i<nb; ++i){
+	if(Llu->LBtree_ptr[i]!=NULL){
+	    BcTree_Destroy(Llu->LBtree_ptr[i],LUstruct->dt);
+	}
+	if(Llu->UBtree_ptr[i]!=NULL){
+	    BcTree_Destroy(Llu->UBtree_ptr[i],LUstruct->dt);
+	}
+    }
+    SUPERLU_FREE(Llu->LBtree_ptr);
+    SUPERLU_FREE(Llu->UBtree_ptr);
+
+    nb = CEILING(nsupers, grid->nprow);
+    for (i=0; i<nb; ++i){
+	if(Llu->LRtree_ptr[i]!=NULL){
+	    RdTree_Destroy(Llu->LRtree_ptr[i],LUstruct->dt);
+	}
+	if(Llu->URtree_ptr[i]!=NULL){
+	    RdTree_Destroy(Llu->URtree_ptr[i],LUstruct->dt);
+	}
+    }
+    SUPERLU_FREE(Llu->LRtree_ptr);
+    SUPERLU_FREE(Llu->URtree_ptr);
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit zDestroy_Tree()");
+#endif
+}
+
+
diff --git a/SRC/dnrformat_loc3d.c b/SRC/dnrformat_loc3d.c
index 439a51a3..71249b24 100644
--- a/SRC/dnrformat_loc3d.c
+++ b/SRC/dnrformat_loc3d.c
@@ -9,6 +9,8 @@ The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
+
+
 /*! @file
  * \brief Preprocessing routines for the 3D factorization/solve codes:
  *        - Gather {A,B} from 3D grid to 2D process layer 0
@@ -41,15 +43,15 @@ static void matCopy(int n, int m, double *Dst, int lddst, double *Src, int ldsrc
 * output is in the returned A3d->{} structure.
 * see supermatrix.h for nrformat_loc3d{} structure.
 */
-NRformat_loc3d *dGatherNRformat_loc3d(NRformat_loc *A, // input 
-                                      double *B, // input 
-                                      int ldb, int nrhs, // input 
+NRformat_loc3d *dGatherNRformat_loc3d(NRformat_loc *A, // input
+                                      double *B, // input
+                                      int ldb, int nrhs, // input
                                       gridinfo3d_t *grid3d)
 {
     NRformat_loc3d *A3d = SUPERLU_MALLOC(sizeof(NRformat_loc3d));
     NRformat_loc *A2d = SUPERLU_MALLOC(sizeof(NRformat_loc));
     A3d->m_loc = A->m_loc;
-    A3d->B = (double *)B; // on 3D process grid
+    A3d->B = (double *) B; // on 3D process grid
     A3d->ldb = ldb;
     A3d->nrhs = nrhs;
 
@@ -144,7 +146,7 @@ NRformat_loc3d *dGatherNRformat_loc3d(NRformat_loc *A, // input
     if (grid3d->zscp.Iam == 0)
     {
         B1 = SUPERLU_MALLOC(A2d->m_loc * nrhs * sizeof(double));
-        A3d->B2d = (double *)SUPERLU_MALLOC(A2d->m_loc * nrhs * sizeof(double));
+        A3d->B2d = (double *) SUPERLU_MALLOC(A2d->m_loc * nrhs * sizeof(double));
     }
 
     // B1 <- gatherv(Btmp)
@@ -158,8 +160,8 @@ NRformat_loc3d *dGatherNRformat_loc3d(NRformat_loc *A, // input
     for (int i = 0; i < grid3d->npdep; ++i)
     {
         /* code */
-        matCopy(row_counts_int[i], nrhs, ((double *)A3d->B2d) + row_disp[i],
-                A2d->m_loc, B1 + nrhs * row_disp[i], row_counts_int[i]);
+        matCopy(row_counts_int[i], nrhs, ((double*)A3d->B2d) + row_disp[i],
+                A2d->m_loc, B1 + nrhs * row_disp[i], row_counts_int[i]);
     }
 
     SUPERLU_FREE(B1);
@@ -171,15 +173,12 @@ NRformat_loc3d *dGatherNRformat_loc3d(NRformat_loc *A, // input
     A3d->row_counts_int = row_counts_int;
     A3d->row_disp = row_disp;
 
-#if 1
     /* free storage */
     SUPERLU_FREE(nnz_counts);
     SUPERLU_FREE(nnz_counts_int);
     SUPERLU_FREE(row_counts);
     SUPERLU_FREE(nnz_disp);
     SUPERLU_FREE(Btmp);
-    // SUPERLU_FREE(B1);
-#endif
 
     return A3d;
 
@@ -189,14 +188,14 @@ NRformat_loc3d *dGatherNRformat_loc3d(NRformat_loc *A, // input
 * Scatter B (solution) from 2D process layer 0 to 3D grid
 * Output: X2d <- A^{-1} B2d
 */
-int dScatter_B3d(NRformat_loc3d *A3d, // 
modified - gridinfo3d_t *grid3d) +int dScatter_B3d(NRformat_loc3d *A3d, // modified + gridinfo3d_t *grid3d) { - double *B = (double *)A3d->B; + double *B = (double *) A3d->B; int ldb = A3d->ldb; int nrhs = A3d->nrhs; - double *B2d = (double *)A3d->B2d; + double *B2d = (double *) A3d->B2d; NRformat_loc A2d = *(A3d->A_nfmt); int m_loc = A3d->m_loc; int *b_counts_int = A3d->b_counts_int; @@ -225,23 +224,20 @@ int dScatter_B3d(NRformat_loc3d *A3d, // modified double *Btmp; Btmp = SUPERLU_MALLOC(A3d->m_loc * nrhs * sizeof(double)); - // Btmp <- scatterv(B1) + // Btmp <- scatterv(B1) MPI_Scatterv(B1, b_counts_int, b_disp, MPI_DOUBLE, Btmp, nrhs * A3d->m_loc, MPI_DOUBLE, 0, grid3d->zscp.comm); // B <- colMajor(Btmp) matCopy(A3d->m_loc, nrhs, B, ldb, Btmp, A3d->m_loc); -#if 1 /* free storage */ SUPERLU_FREE(A3d->b_counts_int); SUPERLU_FREE(A3d->b_disp); SUPERLU_FREE(A3d->row_counts_int); SUPERLU_FREE(A3d->row_disp); - if (grid3d->zscp.Iam == 0) - SUPERLU_FREE(B1); SUPERLU_FREE(Btmp); -#endif + if (grid3d->zscp.Iam == 0) SUPERLU_FREE(B1); return 0; } /* dScatter_B3d */ diff --git a/SRC/dsuperlu_gpu.cu b/SRC/dsuperlu_gpu.cu new file mode 100644 index 00000000..383eeebe --- /dev/null +++ b/SRC/dsuperlu_gpu.cu @@ -0,0 +1,1807 @@ + + +/*! @file + * \brief Descriptions and declarations for structures used in GPU + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley,
+ * Georgia Institute of Technology, Oak Ridge National Laboratory
+ * March 14, 2021 version 7.0.0
+ * </pre>
+ */ + +//#define GPU_DEBUG + +#include "mpi.h" +#include "omp.h" +// #include "sec_structs.h" +#include +#include +#include + +#undef Reduce +#include "cub/cub.cuh" +//#include + +#include "dlustruct_gpu.h" + + +//extern "C" { +// void cblas_daxpy(const int N, const double alpha, const double *X, +// const int incX, double *Y, const int incY); +//} + +/*error reporting functions */ +static +cudaError_t checkCuda(cudaError_t result) +{ +#if defined(DEBUG) || defined(_DEBUG) + if (result != cudaSuccess) + { + fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result)); + assert(result == cudaSuccess); + } +#endif + return result; +} + + +// cublasStatus_t checkCublas(cublasStatus_t result) +// { +// #if defined(DEBUG) || defined(_DEBUG) +// if (result != CUBLAS_STATUS_SUCCESS) +// { +// fprintf(stderr, "CUDA Blas Runtime Error: %s\n", cublasGetErrorString(result)); +// assert(result == CUBLAS_STATUS_SUCCESS); +// } +// #endif +// return result; +// } + + +// #define UNIT_STRIDE + +#if 0 ////////// this routine is not used anymore +__device__ inline +void device_scatter_l (int_t thread_id, + int_t nsupc, int_t temp_nbrow, + int_t *usub, int_t iukp, int_t klst, + double *nzval, int_t ldv, + double *tempv, int_t nbrow, + // int_t *indirect2_thread + int *indirect2_thread + ) +{ + + + int_t segsize, jj; + + for (jj = 0; jj < nsupc; ++jj) + { + segsize = klst - usub[iukp + jj]; + if (segsize) + { + if (thread_id < temp_nbrow) + { + +#ifndef UNIT_STRIDE + nzval[indirect2_thread[thread_id]] -= tempv[thread_id]; +#else + nzval[thread_id] -= tempv[thread_id]; /*making access unit strided*/ +#endif + } + tempv += nbrow; + } + nzval += ldv; + } +} +#endif ///////////// not used + +#define THREAD_BLOCK_SIZE 256 /* Sherry: was 192. should be <= MAX_SUPER_SIZE */ +// The following is moved to superlu_defs.h +//#define MAX_SUPER_SIZE 256 /* Sherry: was 192 on Titan */ + +__device__ inline +void ddevice_scatter_l_2D (int thread_id, + int nsupc, int temp_nbrow, + int_t *usub, int iukp, int_t klst, + double *nzval, int ldv, + const double *tempv, int nbrow, + int *indirect2_thread, + int nnz_cols, int ColPerBlock, + int *IndirectJ3 + ) +{ + int i; + if ( thread_id < temp_nbrow * ColPerBlock ) { + int thread_id_x = thread_id % temp_nbrow; + int thread_id_y = thread_id / temp_nbrow; + +#define UNROLL_ITER 8 + +#pragma unroll 4 + for (int col = thread_id_y; col < nnz_cols ; col += ColPerBlock) + { + i = ldv * IndirectJ3[col] + indirect2_thread[thread_id_x]; + nzval[i] -= tempv[nbrow * col + thread_id_x]; + } + } +} + +/* Sherry: this routine is not used */ +#if 0 +__global__ +void cub_scan_test(void) +{ + int thread_id = threadIdx.x; + typedef cub::BlockScan BlockScan; /*1D int data type*/ + + __shared__ typename BlockScan::TempStorage temp_storage; /*storage temp*/ + + __shared__ int IndirectJ1[MAX_SUPER_SIZE]; + __shared__ int IndirectJ2[MAX_SUPER_SIZE]; + + if (thread_id < MAX_SUPER_SIZE) + { + IndirectJ1[thread_id] = (thread_id + 1) % 2; + } + + __syncthreads(); + if (thread_id < MAX_SUPER_SIZE) + BlockScan(temp_storage).InclusiveSum (IndirectJ1[thread_id], IndirectJ2[thread_id]); + + + if (thread_id < MAX_SUPER_SIZE) + printf("%d %d\n", thread_id, IndirectJ2[thread_id]); + +} +#endif // not used + + +__device__ inline +void device_scatter_u_2D (int thread_id, + int temp_nbrow, int nsupc, + double * ucol, + int_t * usub, int iukp, + int_t ilst, int_t klst, + int_t * index, int iuip_lib, + double * tempv, int nbrow, + int *indirect, + int nnz_cols, int ColPerBlock, + int *IndirectJ1, + int 
*IndirectJ3 + ) +{ + int i; + + if ( thread_id < temp_nbrow * ColPerBlock ) + { + /* 1D threads are logically arranged in 2D shape. */ + int thread_id_x = thread_id % temp_nbrow; + int thread_id_y = thread_id / temp_nbrow; + +#pragma unroll 4 + for (int col = thread_id_y; col < nnz_cols ; col += ColPerBlock) + { + i = IndirectJ1[IndirectJ3[col]] + indirect[thread_id_x]; + ucol[i] -= tempv[nbrow * col + thread_id_x]; + } + } +} + + +__device__ inline +void device_scatter_u (int_t thread_id, + int_t temp_nbrow, int_t nsupc, + double * ucol, + int_t * usub, int_t iukp, + int_t ilst, int_t klst, + int_t * index, int_t iuip_lib, + double * tempv, int_t nbrow, + // int_t *indirect + int *indirect + ) +{ + int_t segsize, fnz, jj; + for (jj = 0; jj < nsupc; ++jj) + { + segsize = klst - usub[iukp + jj]; + fnz = index[iuip_lib++]; + ucol -= fnz; + if (segsize) { /* Nonzero segment in U(k.j). */ + if (thread_id < temp_nbrow) + { +#ifndef UNIT_STRIDE + ucol[indirect[thread_id]] -= tempv[thread_id]; +#else + /* making access unit strided; + it doesn't work; it is for measurements */ + ucol[thread_id] -= tempv[thread_id]; +#endif + } + tempv += nbrow; + } + ucol += ilst ; + } +} + + +__global__ +void Scatter_GPU_kernel( + int_t streamId, + int_t ii_st, int_t ii_end, + int_t jj_st, int_t jj_end, /* defines rectangular Schur block to be scatter */ + int_t klst, + int_t jj0, /* 0 on entry */ + int_t nrows, int_t ldt, int_t npcol, int_t nprow, + dLUstruct_gpu_t * A_gpu) +{ + + /* initializing pointers */ + int_t *xsup = A_gpu->xsup; + int_t *UrowindPtr = A_gpu->UrowindPtr; + int_t *UrowindVec = A_gpu->UrowindVec; + int_t *UnzvalPtr = A_gpu->UnzvalPtr; + double *UnzvalVec = A_gpu->UnzvalVec; + int_t *LrowindPtr = A_gpu->LrowindPtr; + int_t *LrowindVec = A_gpu->LrowindVec; + int_t *LnzvalPtr = A_gpu->LnzvalPtr; + double *LnzvalVec = A_gpu->LnzvalVec; + double *bigV = A_gpu->scubufs[streamId].bigV; + local_l_blk_info_t *local_l_blk_infoVec = A_gpu->local_l_blk_infoVec; + local_u_blk_info_t *local_u_blk_infoVec = A_gpu->local_u_blk_infoVec; + int_t *local_l_blk_infoPtr = A_gpu->local_l_blk_infoPtr; + int_t *local_u_blk_infoPtr = A_gpu->local_u_blk_infoPtr; + Remain_info_t *Remain_info = A_gpu->scubufs[streamId].Remain_info; + Ublock_info_t *Ublock_info = A_gpu->scubufs[streamId].Ublock_info; + int_t *lsub = A_gpu->scubufs[streamId].lsub; + int_t *usub = A_gpu->scubufs[streamId].usub; + + /* thread block assignment: this thread block is + assigned to block (lb, j) in 2D grid */ + int lb = blockIdx.x + ii_st; + int j = blockIdx.y + jj_st; + __shared__ int indirect_thread[MAX_SUPER_SIZE]; /* row-wise */ + __shared__ int indirect2_thread[MAX_SUPER_SIZE]; /* row-wise */ + __shared__ int IndirectJ1[THREAD_BLOCK_SIZE]; /* column-wise */ + __shared__ int IndirectJ3[THREAD_BLOCK_SIZE]; /* column-wise */ + + /* see CUB page https://nvlabs.github.io/cub/. Implement threads collectives */ + typedef cub::BlockScan BlockScan; /*1D int data type*/ + __shared__ typename BlockScan::TempStorage temp_storage; /*storage temp*/ + + int thread_id = threadIdx.x; + + int iukp = Ublock_info[j].iukp; + int jb = Ublock_info[j].jb; + int nsupc = SuperSize (jb); + int ljb = jb / npcol; + + double *tempv1; + if (jj_st == jj0) + { + tempv1 = (j == jj_st) ? bigV + : bigV + Ublock_info[j - 1].full_u_cols * nrows; + } + else + { + tempv1 = (j == jj_st) ? bigV + : bigV + (Ublock_info[j - 1].full_u_cols - + Ublock_info[jj_st - 1].full_u_cols) * nrows; + } + + /* # of nonzero columns in block j */ + int nnz_cols = (j == 0) ? 
Ublock_info[j].full_u_cols + : (Ublock_info[j].full_u_cols - Ublock_info[j - 1].full_u_cols); + int cum_ncol = (j == 0) ? 0 : Ublock_info[j - 1].full_u_cols; + + int lptr = Remain_info[lb].lptr; + int ib = Remain_info[lb].ib; + int temp_nbrow = lsub[lptr + 1]; /* number of rows in the current L block */ + lptr += LB_DESCRIPTOR; + + int_t cum_nrow; + if (ii_st == 0) + { + cum_nrow = (lb == 0 ? 0 : Remain_info[lb - 1].FullRow); + } + else + { + cum_nrow = (lb == 0 ? 0 : Remain_info[lb - 1].FullRow - Remain_info[ii_st - 1].FullRow); + } + + tempv1 += cum_nrow; + + if (ib < jb) /*scatter U code */ + { + int ilst = FstBlockC (ib + 1); + int lib = ib / nprow; /* local index of row block ib */ + int_t *index = &UrowindVec[UrowindPtr[lib]]; + + int num_u_blocks = index[0]; + + int ljb = (jb) / npcol; /* local index of column block jb */ + + /* Each thread is responsible for one block column */ + __shared__ int ljb_ind; + /*do a search ljb_ind at local row lib*/ + int blks_per_threads = CEILING(num_u_blocks, THREAD_BLOCK_SIZE); + for (int i = 0; i < blks_per_threads; ++i) + /* each thread is assigned a chunk of consecutive U blocks to search */ + { + /* only one thread finds the block index matching ljb */ + if (thread_id * blks_per_threads + i < num_u_blocks && + local_u_blk_infoVec[ local_u_blk_infoPtr[lib] + thread_id * blks_per_threads + i ].ljb == ljb) + { + ljb_ind = thread_id * blks_per_threads + i; + } + } + __syncthreads(); + + int iuip_lib = local_u_blk_infoVec[ local_u_blk_infoPtr[lib] + ljb_ind].iuip; + int ruip_lib = local_u_blk_infoVec[ local_u_blk_infoPtr[lib] + ljb_ind].ruip; + iuip_lib += UB_DESCRIPTOR; + double *Unzval_lib = &UnzvalVec[UnzvalPtr[lib]]; + double *ucol = &Unzval_lib[ruip_lib]; + + if (thread_id < temp_nbrow) /* row-wise */ + { + /* cyclically map each thread to a row */ + indirect_thread[thread_id] = (int) lsub[lptr + thread_id]; + } + + /* column-wise: each thread is assigned one column */ + if (thread_id < nnz_cols) + IndirectJ3[thread_id] = A_gpu->scubufs[streamId].usub_IndirectJ3[cum_ncol + thread_id]; + /* indirectJ3[j] == kk means the j-th nonzero segment + points to column kk in this supernode */ + + __syncthreads(); + + /* threads are divided into multiple columns */ + int ColPerBlock = THREAD_BLOCK_SIZE / temp_nbrow; + + if (thread_id < THREAD_BLOCK_SIZE) + IndirectJ1[thread_id] = 0; + + if (thread_id < THREAD_BLOCK_SIZE) + { + if (thread_id < nsupc) + { + /* fstnz subscript of each column in the block */ + IndirectJ1[thread_id] = index[iuip_lib + thread_id]; + } + } + + /* perform an inclusive block-wide prefix sum among all threads */ + if (thread_id < THREAD_BLOCK_SIZE) + BlockScan(temp_storage).InclusiveSum(IndirectJ1[thread_id], IndirectJ1[thread_id]); + + if (thread_id < THREAD_BLOCK_SIZE) + IndirectJ1[thread_id] = -IndirectJ1[thread_id] + ilst * thread_id; + + __syncthreads(); + + device_scatter_u_2D ( + thread_id, + temp_nbrow, nsupc, + ucol, + usub, iukp, + ilst, klst, + index, iuip_lib, + tempv1, nrows, + indirect_thread, + nnz_cols, ColPerBlock, + IndirectJ1, + IndirectJ3 ); + + } + else /* ib >= jb, scatter L code */ + { + + int rel; + double *nzval; + int_t *index = &LrowindVec[LrowindPtr[ljb]]; + int num_l_blocks = index[0]; + int ldv = index[1]; + + int fnz = FstBlockC (ib); + int lib = ib / nprow; + + __shared__ int lib_ind; + /*do a search lib_ind for lib*/ + int blks_per_threads = CEILING(num_l_blocks, THREAD_BLOCK_SIZE); + for (int i = 0; i < blks_per_threads; ++i) + { + if (thread_id * blks_per_threads + i < num_l_blocks && + 
local_l_blk_infoVec[ local_l_blk_infoPtr[ljb] + thread_id * blks_per_threads + i ].lib == lib) + { + lib_ind = thread_id * blks_per_threads + i; + } + } + __syncthreads(); + + int lptrj = local_l_blk_infoVec[ local_l_blk_infoPtr[ljb] + lib_ind].lptrj; + int luptrj = local_l_blk_infoVec[ local_l_blk_infoPtr[ljb] + lib_ind].luptrj; + lptrj += LB_DESCRIPTOR; + int dest_nbrow = index[lptrj - 1]; + + if (thread_id < dest_nbrow) + { + rel = index[lptrj + thread_id] - fnz; + indirect_thread[rel] = thread_id; + } + __syncthreads(); + + /* can be precalculated */ + if (thread_id < temp_nbrow) + { + rel = lsub[lptr + thread_id] - fnz; + indirect2_thread[thread_id] = indirect_thread[rel]; + } + if (thread_id < nnz_cols) + IndirectJ3[thread_id] = (int) A_gpu->scubufs[streamId].usub_IndirectJ3[cum_ncol + thread_id]; + __syncthreads(); + + int ColPerBlock = THREAD_BLOCK_SIZE / temp_nbrow; + + nzval = &LnzvalVec[LnzvalPtr[ljb]] + luptrj; + ddevice_scatter_l_2D( + thread_id, + nsupc, temp_nbrow, + usub, iukp, klst, + nzval, ldv, + tempv1, nrows, indirect2_thread, + nnz_cols, ColPerBlock, + IndirectJ3); + } /* end else ib >= jb */ + +} /* end Scatter_GPU_kernel */ + + +#define GPU_2D_SCHUDT /* Not used */ + +int dSchurCompUpdate_GPU( + int_t streamId, + int_t jj_cpu, /* 0 on entry, pointing to the start of Phi part */ + int_t nub, /* jj_cpu on entry, pointing to the end of the Phi part */ + int_t klst, int_t knsupc, + int_t Rnbrow, int_t RemainBlk, + int_t Remain_lbuf_send_size, + int_t bigu_send_size, int_t ldu, + int_t mcb, /* num_u_blks_hi */ + int_t buffer_size, int_t lsub_len, int_t usub_len, + int_t ldt, int_t k0, + dsluGPU_t *sluGPU, gridinfo_t *grid +) +{ + + dLUstruct_gpu_t * A_gpu = sluGPU->A_gpu; + dLUstruct_gpu_t * dA_gpu = sluGPU->dA_gpu; + int_t nprow = grid->nprow; + int_t npcol = grid->npcol; + + cudaStream_t FunCallStream = sluGPU->funCallStreams[streamId]; + cublasHandle_t cublas_handle0 = sluGPU->cublasHandles[streamId]; + int_t * lsub = A_gpu->scubufs[streamId].lsub_buf; + int_t * usub = A_gpu->scubufs[streamId].usub_buf; + Remain_info_t *Remain_info = A_gpu->scubufs[streamId].Remain_info_host; + double * Remain_L_buff = A_gpu->scubufs[streamId].Remain_L_buff_host; + Ublock_info_t *Ublock_info = A_gpu->scubufs[streamId].Ublock_info_host; + double * bigU = A_gpu->scubufs[streamId].bigU_host; + + A_gpu->isOffloaded[k0] = 1; + /* start by sending data to */ + int_t *xsup = A_gpu->xsup_host; + int_t col_back = (jj_cpu == 0) ? 0 : Ublock_info[jj_cpu - 1].full_u_cols; + // if(nub<1) return; + int_t ncols = Ublock_info[nub - 1].full_u_cols - col_back; + + /* Sherry: can get max_super_size from sp_ienv(3) */ + int_t indirectJ1[MAX_SUPER_SIZE]; // 0 indicates an empry segment + int_t indirectJ2[MAX_SUPER_SIZE]; // # of nonzero segments so far + int_t indirectJ3[MAX_SUPER_SIZE]; /* indirectJ3[j] == k means the + j-th nonzero segment points + to column k in this supernode */ + /* calculate usub_indirect */ + for (int jj = jj_cpu; jj < nub; ++jj) + { + int_t iukp = Ublock_info[jj].iukp; + int_t jb = Ublock_info[jj].jb; + int_t nsupc = SuperSize (jb); + int_t addr = (jj == 0) ? 0 + : Ublock_info[jj - 1].full_u_cols - col_back; + + for (int_t kk = 0; kk < nsupc; ++kk) // old: MAX_SUPER_SIZE + { + indirectJ1[kk] = 0; + } + + for (int_t kk = 0; kk < nsupc; ++kk) + { + indirectJ1[kk] = ((klst - usub[iukp + kk]) == 0) ? 
0 : 1; + } + + /*prefix sum - indicates # of nonzero segments up to column kk */ + indirectJ2[0] = indirectJ1[0]; + for (int_t kk = 1; kk < nsupc; ++kk) // old: MAX_SUPER_SIZE + { + indirectJ2[kk] = indirectJ2[kk - 1] + indirectJ1[kk]; + } + + /* total number of nonzero segments in this supernode */ + int nnz_col = indirectJ2[nsupc - 1]; // old: MAX_SUPER_SIZE + + /* compactation */ + for (int_t kk = 0; kk < nsupc; ++kk) // old: MAX_SUPER_SIZE + { + if (indirectJ1[kk]) /* kk is a nonzero segment */ + { + /* indirectJ3[j] == kk means the j-th nonzero segment + points to column kk in this supernode */ + indirectJ3[indirectJ2[kk] - 1] = kk; + } + } + + for (int i = 0; i < nnz_col; ++i) + { + /* addr == total # of full columns before current block jj */ + A_gpu->scubufs[streamId].usub_IndirectJ3_host[addr + i] = indirectJ3[i]; + } + } /* end for jj ... calculate usub_indirect */ + + //printf("dSchurCompUpdate_GPU[3]: jj_cpu %d, nub %d\n", jj_cpu, nub); fflush(stdout); + + /*sizeof RemainLbuf = Rnbuf*knsupc */ + double tTmp = SuperLU_timer_(); + cudaEventRecord(A_gpu->ePCIeH2D[k0], FunCallStream); + + checkCuda(cudaMemcpyAsync(A_gpu->scubufs[streamId].usub_IndirectJ3, + A_gpu->scubufs[streamId].usub_IndirectJ3_host, + ncols * sizeof(int_t), cudaMemcpyHostToDevice, + FunCallStream)) ; + + checkCuda(cudaMemcpyAsync(A_gpu->scubufs[streamId].Remain_L_buff, Remain_L_buff, + Remain_lbuf_send_size * sizeof(double), + cudaMemcpyHostToDevice, FunCallStream)) ; + + checkCuda(cudaMemcpyAsync(A_gpu->scubufs[streamId].bigU, bigU, + bigu_send_size * sizeof(double), + cudaMemcpyHostToDevice, FunCallStream) ); + + checkCuda(cudaMemcpyAsync(A_gpu->scubufs[streamId].Remain_info, Remain_info, + RemainBlk * sizeof(Remain_info_t), + cudaMemcpyHostToDevice, FunCallStream) ); + + checkCuda(cudaMemcpyAsync(A_gpu->scubufs[streamId].Ublock_info, Ublock_info, + mcb * sizeof(Ublock_info_t), cudaMemcpyHostToDevice, + FunCallStream) ); + + checkCuda(cudaMemcpyAsync(A_gpu->scubufs[streamId].lsub, lsub, + lsub_len * sizeof(int_t), cudaMemcpyHostToDevice, + FunCallStream) ); + + checkCuda(cudaMemcpyAsync(A_gpu->scubufs[streamId].usub, usub, + usub_len * sizeof(int_t), cudaMemcpyHostToDevice, + FunCallStream) ); + + A_gpu->tHost_PCIeH2D += SuperLU_timer_() - tTmp; + A_gpu->cPCIeH2D += Remain_lbuf_send_size * sizeof(double) + + bigu_send_size * sizeof(double) + + RemainBlk * sizeof(Remain_info_t) + + mcb * sizeof(Ublock_info_t) + + lsub_len * sizeof(int_t) + + usub_len * sizeof(int_t); + + double alpha = 1.0, beta = 0.0; + + int_t ii_st = 0; + int_t ii_end = 0; + int_t maxGemmBlockDim = (int) sqrt(buffer_size); + // int_t maxGemmBlockDim = 8000; + + /* Organize GEMM by blocks of [ii_st : ii_end, jj_st : jj_end] that + fits in the buffer_size */ + while (ii_end < RemainBlk) { + ii_st = ii_end; + ii_end = RemainBlk; + int_t nrow_max = maxGemmBlockDim; +// nrow_max = Rnbrow; + int_t remaining_rows = (ii_st == 0) ? Rnbrow : Rnbrow - Remain_info[ii_st - 1].FullRow; + nrow_max = (remaining_rows / nrow_max) > 0 ? remaining_rows / CEILING(remaining_rows, nrow_max) : nrow_max; + + int_t ResRow = (ii_st == 0) ? 
0 : Remain_info[ii_st - 1].FullRow; + for (int_t i = ii_st; i < RemainBlk - 1; ++i) + { + if ( Remain_info[i + 1].FullRow > ResRow + nrow_max) + { + ii_end = i; + break; /* row dimension reaches nrow_max */ + } + } + + int_t nrows; /* actual row dimension for GEMM */ + int_t st_row; + if (ii_st > 0) + { + nrows = Remain_info[ii_end - 1].FullRow - Remain_info[ii_st - 1].FullRow; + st_row = Remain_info[ii_st - 1].FullRow; + } + else + { + nrows = Remain_info[ii_end - 1].FullRow; + st_row = 0; + } + + int jj_st = jj_cpu; + int jj_end = jj_cpu; + + while (jj_end < nub && nrows > 0 ) + { + int_t remaining_cols = (jj_st == jj_cpu) ? ncols : ncols - Ublock_info[jj_st - 1].full_u_cols; + if ( remaining_cols * nrows < buffer_size) + { + jj_st = jj_end; + jj_end = nub; + } + else /* C matrix cannot fit in buffer, need to break into pieces */ + { + int_t ncol_max = buffer_size / nrows; + /** Must revisit **/ + ncol_max = SUPERLU_MIN(ncol_max, maxGemmBlockDim); + ncol_max = (remaining_cols / ncol_max) > 0 ? + remaining_cols / CEILING(remaining_cols, ncol_max) + : ncol_max; + + jj_st = jj_end; + jj_end = nub; + + int_t ResCol = (jj_st == 0) ? 0 : Ublock_info[jj_st - 1].full_u_cols; + for (int_t j = jj_st; j < nub - 1; ++j) + { + if (Ublock_info[j + 1].full_u_cols > ResCol + ncol_max) + { + jj_end = j; + break; + } + } + } /* end-if-else */ + + int ncols; + int st_col; + if (jj_st > 0) + { + ncols = Ublock_info[jj_end - 1].full_u_cols - Ublock_info[jj_st - 1].full_u_cols; + st_col = Ublock_info[jj_st - 1].full_u_cols; + if (ncols == 0) exit(0); + } + else + { + ncols = Ublock_info[jj_end - 1].full_u_cols; + st_col = 0; + } + + /* none of the matrix dimension is zero. */ + if (nrows > 0 && ldu > 0 && ncols > 0) + { + if (nrows * ncols > buffer_size) { + printf("!! Matrix size %lld x %lld exceeds buffer_size \n", + nrows, ncols, buffer_size); + fflush(stdout); + } + assert(nrows * ncols <= buffer_size); + cublasSetStream(cublas_handle0, FunCallStream); + cudaEventRecord(A_gpu->GemmStart[k0], FunCallStream); + cublasDgemm(cublas_handle0, CUBLAS_OP_N, CUBLAS_OP_N, + nrows, ncols, ldu, &alpha, + &A_gpu->scubufs[streamId].Remain_L_buff[(knsupc - ldu) * Rnbrow + st_row], Rnbrow, + &A_gpu->scubufs[streamId].bigU[st_col * ldu], ldu, + &beta, A_gpu->scubufs[streamId].bigV, nrows); + +// #define SCATTER_OPT +#ifdef SCATTER_OPT + cudaStreamSynchronize(FunCallStream); +#warning this function is synchronous +#endif + cudaEventRecord(A_gpu->GemmEnd[k0], FunCallStream); + + A_gpu->GemmFLOPCounter += 2.0 * (double) nrows * ncols * ldu; + + /* + * Scattering the output + */ + dim3 dimBlock(THREAD_BLOCK_SIZE); // 1d thread + + dim3 dimGrid(ii_end - ii_st, jj_end - jj_st); + + Scatter_GPU_kernel <<< dimGrid, dimBlock, 0, FunCallStream>>> + (streamId, ii_st, ii_end, jj_st, jj_end, klst, + 0, nrows, ldt, npcol, nprow, dA_gpu); +#ifdef SCATTER_OPT + cudaStreamSynchronize(FunCallStream); +#warning this function is synchrnous +#endif + + cudaEventRecord(A_gpu->ScatterEnd[k0], FunCallStream); + + A_gpu->ScatterMOPCounter += 3.0 * (double) nrows * ncols; + } /* endif ... none of the matrix dimension is zero. 
*/ + + } /* end while jj_end < nub */ + + } /* end while (ii_end < RemainBlk) */ + + return 0; +} /* end dSchurCompUpdate_GPU */ + + +static void print_occupancy() +{ + int blockSize; // The launch configurator returned block size + int minGridSize; /* The minimum grid size needed to achieve the + best potential occupancy */ + + cudaOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize, + Scatter_GPU_kernel, 0, 0); + printf("Occupancy: MinGridSize %d blocksize %d \n", minGridSize, blockSize); +} + +static void printDevProp(cudaDeviceProp devProp) +{ + size_t mfree, mtotal; + cudaMemGetInfo (&mfree, &mtotal); + + printf("pciBusID: %d\n", devProp.pciBusID); + printf("pciDeviceID: %d\n", devProp.pciDeviceID); + printf("GPU Name: %s\n", devProp.name); + printf("Total global memory: %zu\n", devProp.totalGlobalMem); + printf("Total free memory: %zu\n", mfree); + printf("Clock rate: %d\n", devProp.clockRate); + + return; +} + + +static size_t get_acc_memory () +{ + + size_t mfree, mtotal; + cudaMemGetInfo (&mfree, &mtotal); +#if 0 + printf("Total memory %zu & free memory %zu\n", mtotal, mfree); +#endif + return (size_t) (0.9 * (double) mfree) / get_mpi_process_per_gpu (); + + +} + +int dfree_LUstruct_gpu (dLUstruct_gpu_t * A_gpu) +{ + checkCuda(cudaFree(A_gpu->LrowindVec)); + checkCuda(cudaFree(A_gpu->LrowindPtr)); + + checkCuda(cudaFree(A_gpu->LnzvalVec)); + checkCuda(cudaFree(A_gpu->LnzvalPtr)); + free(A_gpu->LnzvalPtr_host); + /*freeing the pinned memory*/ + int_t streamId = 0; + checkCuda (cudaFreeHost (A_gpu->scubufs[streamId].Remain_info_host)); + checkCuda (cudaFreeHost (A_gpu->scubufs[streamId].Ublock_info_host)); + checkCuda (cudaFreeHost (A_gpu->scubufs[streamId].Remain_L_buff_host)); + checkCuda (cudaFreeHost (A_gpu->scubufs[streamId].bigU_host)); + + checkCuda(cudaFreeHost(A_gpu->acc_L_buff)); + checkCuda(cudaFreeHost(A_gpu->acc_U_buff)); + checkCuda(cudaFreeHost(A_gpu->scubufs[streamId].lsub_buf)); + checkCuda(cudaFreeHost(A_gpu->scubufs[streamId].usub_buf)); + + + free(A_gpu->isOffloaded); + free(A_gpu->GemmStart); + free(A_gpu->GemmEnd); + free(A_gpu->ScatterEnd); + free(A_gpu->ePCIeH2D); + + free(A_gpu->ePCIeD2H_Start); + free(A_gpu->ePCIeD2H_End); + + checkCuda(cudaFree(A_gpu->UrowindVec)); + checkCuda(cudaFree(A_gpu->UrowindPtr)); + + free(A_gpu->UrowindPtr_host); + + checkCuda(cudaFree(A_gpu->UnzvalVec)); + checkCuda(cudaFree(A_gpu->UnzvalPtr)); + + checkCuda(cudaFree(A_gpu->grid)); + + + + checkCuda(cudaFree(A_gpu->scubufs[streamId].bigV)); + checkCuda(cudaFree(A_gpu->scubufs[streamId].bigU)); + + checkCuda(cudaFree(A_gpu->scubufs[streamId].Remain_L_buff)); + checkCuda(cudaFree(A_gpu->scubufs[streamId].Ublock_info)); + checkCuda(cudaFree(A_gpu->scubufs[streamId].Remain_info)); + + // checkCuda(cudaFree(A_gpu->indirect)); + // checkCuda(cudaFree(A_gpu->indirect2)); + checkCuda(cudaFree(A_gpu->xsup)); + + checkCuda(cudaFree(A_gpu->scubufs[streamId].lsub)); + checkCuda(cudaFree(A_gpu->scubufs[streamId].usub)); + + + checkCuda(cudaFree(A_gpu->local_l_blk_infoVec)); + checkCuda(cudaFree(A_gpu->local_l_blk_infoPtr)); + checkCuda(cudaFree(A_gpu->jib_lookupVec)); + checkCuda(cudaFree(A_gpu->jib_lookupPtr)); + checkCuda(cudaFree(A_gpu->local_u_blk_infoVec)); + checkCuda(cudaFree(A_gpu->local_u_blk_infoPtr)); + checkCuda(cudaFree(A_gpu->ijb_lookupVec)); + checkCuda(cudaFree(A_gpu->ijb_lookupPtr)); + + return 0; +} + + + +void dPrint_matrix( char *desc, int_t m, int_t n, double * dA, int_t lda ) +{ + double *cPtr = (double *) malloc(sizeof(double) * lda * n); + checkCuda(cudaMemcpy( 
cPtr, dA, + lda * n * sizeof(double), cudaMemcpyDeviceToHost)) ; + + int_t i, j; + printf( "\n %s\n", desc ); + for ( i = 0; i < m; i++ ) + { + for ( j = 0; j < n; j++ ) printf( " %.3e", cPtr[i + j * lda] ); + printf( "\n" ); + } + free(cPtr); +} + +void dprintGPUStats(dLUstruct_gpu_t * A_gpu) +{ + double tGemm = 0; + double tScatter = 0; + double tPCIeH2D = 0; + double tPCIeD2H = 0; + + for (int_t i = 0; i < A_gpu->nsupers; ++i) + { + float milliseconds = 0; + + if (A_gpu->isOffloaded[i]) + { + cudaEventElapsedTime(&milliseconds, A_gpu->ePCIeH2D[i], A_gpu->GemmStart[i]); + tPCIeH2D += 1e-3 * (double) milliseconds; + milliseconds = 0; + cudaEventElapsedTime(&milliseconds, A_gpu->GemmStart[i], A_gpu->GemmEnd[i]); + tGemm += 1e-3 * (double) milliseconds; + milliseconds = 0; + cudaEventElapsedTime(&milliseconds, A_gpu->GemmEnd[i], A_gpu->ScatterEnd[i]); + tScatter += 1e-3 * (double) milliseconds; + } + + milliseconds = 0; + cudaEventElapsedTime(&milliseconds, A_gpu->ePCIeD2H_Start[i], A_gpu->ePCIeD2H_End[i]); + tPCIeD2H += 1e-3 * (double) milliseconds; + } + + printf("GPU: Flops offloaded %.3e Time spent %lf Flop rate %lf GF/sec \n", + A_gpu->GemmFLOPCounter, tGemm, 1e-9 * A_gpu->GemmFLOPCounter / tGemm ); + printf("GPU: Mop offloaded %.3e Time spent %lf Bandwidth %lf GByte/sec \n", + A_gpu->ScatterMOPCounter, tScatter, 8e-9 * A_gpu->ScatterMOPCounter / tScatter ); + printf("PCIe Data Transfer H2D:\n\tData Sent %.3e(GB)\n\tTime observed from CPU %lf\n\tActual time spent %lf\n\tBandwidth %lf GByte/sec \n", + 1e-9 * A_gpu->cPCIeH2D, A_gpu->tHost_PCIeH2D, tPCIeH2D, 1e-9 * A_gpu->cPCIeH2D / tPCIeH2D ); + printf("PCIe Data Transfer D2H:\n\tData Sent %.3e(GB)\n\tTime observed from CPU %lf\n\tActual time spent %lf\n\tBandwidth %lf GByte/sec \n", + 1e-9 * A_gpu->cPCIeD2H, A_gpu->tHost_PCIeD2H, tPCIeD2H, 1e-9 * A_gpu->cPCIeD2H / tPCIeD2H ); + fflush(stdout); + +} /* end printGPUStats */ + + +int dinitSluGPU3D_t( + dsluGPU_t *sluGPU, + dLUstruct_t *LUstruct, + gridinfo3d_t * grid3d, + int_t* perm_c_supno, + int_t n, + int_t buffer_size, /* read from env variable MAX_BUFFER_SIZE */ + int_t bigu_size, + int_t ldt /* NSUP read from sp_ienv(3) */ +) +{ + gridinfo_t* grid = &(grid3d->grid2d); + checkCudaErrors(cudaDeviceReset ()) ; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + dLocalLU_t *Llu = LUstruct->Llu; + int_t* isNodeInMyGrid = sluGPU->isNodeInMyGrid; + + sluGPU->nCudaStreams = getnCudaStreams(); + if (grid3d->iam == 0) + { + printf("dinitSluGPU3D_t: Using hardware acceleration, with %d cuda streams \n", sluGPU->nCudaStreams); + fflush(stdout); + if ( MAX_SUPER_SIZE < ldt ) + { + ABORT("MAX_SUPER_SIZE smaller than requested NSUP"); + } + } + + cudaStreamCreate(&(sluGPU->CopyStream)); + + for (int streamId = 0; streamId < sluGPU->nCudaStreams; streamId++) + { + cudaStreamCreate(&(sluGPU->funCallStreams[streamId])); + cublasCreate(&(sluGPU->cublasHandles[streamId])); + sluGPU->lastOffloadStream[streamId] = -1; + } + + sluGPU->A_gpu = (dLUstruct_gpu_t *) malloc (sizeof(dLUstruct_gpu_t)); + sluGPU->A_gpu->perm_c_supno = perm_c_supno; + dCopyLUToGPU3D ( isNodeInMyGrid, + Llu, /* referred to as A_host */ + sluGPU, Glu_persist, n, grid3d, buffer_size, bigu_size, ldt + ); + + return 0; +} /* end dinitSluGPU3D_t */ + +int dinitD2Hreduce( + int next_k, d2Hreduce_t* d2Hred, int last_flag, HyP_t* HyP, + dsluGPU_t *sluGPU, gridinfo_t *grid, dLUstruct_t *LUstruct, SCT_t* SCT +) +{ + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + dLocalLU_t *Llu = LUstruct->Llu; + int_t* xsup = Glu_persist->xsup; + 
int_t iam = grid->iam; + int_t myrow = MYROW (iam, grid); + int_t mycol = MYCOL (iam, grid); + int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + + + // int_t next_col = SUPERLU_MIN (k0 + num_look_aheads + 1, nsupers - 1); + // int_t next_k = perm_c_supno[next_col]; /* global block number for next colum*/ + int_t mkcol, mkrow; + + int_t kljb = LBj( next_k, grid ); /*local block number for next block*/ + int_t kijb = LBi( next_k, grid ); /*local block number for next block*/ + + int_t *kindexL ; /*for storing index vectors*/ + int_t *kindexU ; + mkrow = PROW (next_k, grid); + mkcol = PCOL (next_k, grid); + int_t ksup_size = SuperSize(next_k); + + int_t copyL_kljb = 0; + int_t copyU_kljb = 0; + int_t l_copy_len = 0; + int_t u_copy_len = 0; + + if (mkcol == mycol && Lrowind_bc_ptr[kljb] != NULL && last_flag) + { + if (HyP->Lblock_dirty_bit[kljb] > -1) + { + copyL_kljb = 1; + int_t lastk0 = HyP->Lblock_dirty_bit[kljb]; + int_t streamIdk0Offload = lastk0 % sluGPU->nCudaStreams; + if (sluGPU->lastOffloadStream[streamIdk0Offload] == lastk0 && lastk0 != -1) + { + // printf("Waiting for Offload =%d to finish StreamId=%d\n", lastk0, streamIdk0Offload); + double ttx = SuperLU_timer_(); + cudaStreamSynchronize(sluGPU->funCallStreams[streamIdk0Offload]); + SCT->PhiWaitTimer += SuperLU_timer_() - ttx; + sluGPU->lastOffloadStream[streamIdk0Offload] = -1; + } + } + + kindexL = Lrowind_bc_ptr[kljb]; + l_copy_len = kindexL[1] * ksup_size; + } + + if ( mkrow == myrow && Ufstnz_br_ptr[kijb] != NULL && last_flag ) + { + if (HyP->Ublock_dirty_bit[kijb] > -1) + { + copyU_kljb = 1; + int_t lastk0 = HyP->Ublock_dirty_bit[kijb]; + int_t streamIdk0Offload = lastk0 % sluGPU->nCudaStreams; + if (sluGPU->lastOffloadStream[streamIdk0Offload] == lastk0 && lastk0 != -1) + { + // printf("Waiting for Offload =%d to finish StreamId=%d\n", lastk0, streamIdk0Offload); + double ttx = SuperLU_timer_(); + cudaStreamSynchronize(sluGPU->funCallStreams[streamIdk0Offload]); + SCT->PhiWaitTimer += SuperLU_timer_() - ttx; + sluGPU->lastOffloadStream[streamIdk0Offload] = -1; + } + } + // copyU_kljb = HyP->Ublock_dirty_bit[kijb]>-1? 
1: 0; + kindexU = Ufstnz_br_ptr[kijb]; + u_copy_len = kindexU[1]; + } + + // wait for streams if they have not been finished + + // d2Hred->next_col = next_col; + d2Hred->next_k = next_k; + d2Hred->kljb = kljb; + d2Hred->kijb = kijb; + d2Hred->copyL_kljb = copyL_kljb; + d2Hred->copyU_kljb = copyU_kljb; + d2Hred->l_copy_len = l_copy_len; + d2Hred->u_copy_len = u_copy_len; + d2Hred->kindexU = kindexU; + d2Hred->kindexL = kindexL; + d2Hred->mkrow = mkrow; + d2Hred->mkcol = mkcol; + d2Hred->ksup_size = ksup_size; + return 0; +} /* dinitD2Hreduce */ + +int dreduceGPUlu( + int last_flag, + d2Hreduce_t* d2Hred, + dsluGPU_t *sluGPU, + SCT_t *SCT, + gridinfo_t *grid, + dLUstruct_t *LUstruct +) +{ + dLocalLU_t *Llu = LUstruct->Llu; + int iam = grid->iam; + int_t myrow = MYROW (iam, grid); + int_t mycol = MYCOL (iam, grid); + int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + double** Unzval_br_ptr = Llu->Unzval_br_ptr; + + cudaStream_t CopyStream; + dLUstruct_gpu_t *A_gpu; + A_gpu = sluGPU->A_gpu; + CopyStream = sluGPU->CopyStream; + + int_t kljb = d2Hred->kljb; + int_t kijb = d2Hred->kijb; + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + int_t mkrow = d2Hred->mkrow; + int_t mkcol = d2Hred->mkcol; + int_t ksup_size = d2Hred->ksup_size; + int_t *kindex; + if ((copyL_kljb || copyU_kljb) && last_flag ) + { + double ttx = SuperLU_timer_(); + cudaStreamSynchronize(CopyStream); + SCT->PhiWaitTimer_2 += SuperLU_timer_() - ttx; + } + + double tt_start = SuperLU_timer_(); + + if (last_flag) { + if (mkcol == mycol && Lrowind_bc_ptr[kljb] != NULL ) + { + kindex = Lrowind_bc_ptr[kljb]; + int_t len = kindex[1]; + + if (copyL_kljb) + { + double *nzval_host; + nzval_host = Lnzval_bc_ptr[kljb]; + int_t llen = ksup_size * len; + double alpha = 1; + superlu_daxpy (llen, alpha, A_gpu->acc_L_buff, 1, nzval_host, 1); + } + + } + } + if (last_flag) { + if (mkrow == myrow && Ufstnz_br_ptr[kijb] != NULL ) + { + kindex = Ufstnz_br_ptr[kijb]; + int_t len = kindex[1]; + + if (copyU_kljb) + { + double *nzval_host; + nzval_host = Unzval_br_ptr[kijb]; + + double alpha = 1; + superlu_daxpy (len, alpha, A_gpu->acc_U_buff, 1, nzval_host, 1); + } + } + } + + double tt_end = SuperLU_timer_(); + SCT->AssemblyTimer += tt_end - tt_start; + return 0; +} /* dreduceGPUlu */ + + +int dwaitGPUscu(int_t streamId, dsluGPU_t *sluGPU, SCT_t *SCT) +{ + double ttx = SuperLU_timer_(); + cudaStreamSynchronize(sluGPU->funCallStreams[streamId]); + SCT->PhiWaitTimer += SuperLU_timer_() - ttx; + return 0; +} + +int_t dsendLUpanelGPU2HOST( + int_t k0, + d2Hreduce_t* d2Hred, + dsluGPU_t *sluGPU +) +{ + int_t kljb = d2Hred->kljb; + int_t kijb = d2Hred->kijb; + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + int_t l_copy_len = d2Hred->l_copy_len; + int_t u_copy_len = d2Hred->u_copy_len; + cudaStream_t CopyStream = sluGPU->CopyStream;; + dLUstruct_gpu_t *A_gpu = sluGPU->A_gpu; + double tty = SuperLU_timer_(); + cudaEventRecord(A_gpu->ePCIeD2H_Start[k0], CopyStream); + if (copyL_kljb) + checkCuda(cudaMemcpyAsync(A_gpu->acc_L_buff, &A_gpu->LnzvalVec[A_gpu->LnzvalPtr_host[kljb]], + l_copy_len * sizeof(double), cudaMemcpyDeviceToHost, CopyStream ) ); + + if (copyU_kljb) + checkCuda(cudaMemcpyAsync(A_gpu->acc_U_buff, &A_gpu->UnzvalVec[A_gpu->UnzvalPtr_host[kijb]], + u_copy_len * sizeof(double), cudaMemcpyDeviceToHost, CopyStream ) ); + cudaEventRecord(A_gpu->ePCIeD2H_End[k0], CopyStream); + 
A_gpu->tHost_PCIeD2H += SuperLU_timer_() - tty; + A_gpu->cPCIeD2H += u_copy_len * sizeof(double) + l_copy_len * sizeof(double); + + return 0; +} + +/* Copy L and U panel data structures from host to the host part of the + data structures in A_gpu. + GPU is not involved in this routine. */ +int dsendSCUdataHost2GPU( + int_t streamId, + int_t* lsub, + int_t* usub, + double* bigU, + int_t bigu_send_size, + int_t Remain_lbuf_send_size, + dsluGPU_t *sluGPU, + HyP_t* HyP +) +{ + //{printf("....[enter] dsendSCUdataHost2GPU, bigu_send_size %d\n", bigu_send_size); fflush(stdout);} + + int_t usub_len = usub[2]; + int_t lsub_len = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR; + //{printf("....[2] in dsendSCUdataHost2GPU, lsub_len %d\n", lsub_len); fflush(stdout);} + dLUstruct_gpu_t *A_gpu = sluGPU->A_gpu; + memcpy(A_gpu->scubufs[streamId].lsub_buf, lsub, sizeof(int_t)*lsub_len); + memcpy(A_gpu->scubufs[streamId].usub_buf, usub, sizeof(int_t)*usub_len); + memcpy(A_gpu->scubufs[streamId].Remain_info_host, HyP->Remain_info, + sizeof(Remain_info_t)*HyP->RemainBlk); + memcpy(A_gpu->scubufs[streamId].Ublock_info_host, HyP->Ublock_info_Phi, + sizeof(Ublock_info_t)*HyP->num_u_blks_Phi); + memcpy(A_gpu->scubufs[streamId].Remain_L_buff_host, HyP->Remain_L_buff, + sizeof(double)*Remain_lbuf_send_size); + memcpy(A_gpu->scubufs[streamId].bigU_host, bigU, + sizeof(double)*bigu_send_size); + + return 0; +} + +/* Sherry: not used ?*/ +#if 0 +int freeSluGPU(dsluGPU_t *sluGPU) +{ + return 0; +} +#endif + +void dCopyLUToGPU3D ( + int_t* isNodeInMyGrid, + dLocalLU_t *A_host, /* distributed LU structure on host */ + dsluGPU_t *sluGPU, + Glu_persist_t *Glu_persist, int_t n, + gridinfo3d_t *grid3d, + int_t buffer_size, /* bigV size on GPU for Schur complement update */ + int_t bigu_size, + int_t ldt +) +{ + gridinfo_t* grid = &(grid3d->grid2d); + dLUstruct_gpu_t * A_gpu = sluGPU->A_gpu; + dLUstruct_gpu_t **dA_gpu = &(sluGPU->dA_gpu); + +#if ( PRNTlevel>=1 ) + if ( grid3d->iam == 0 ) print_occupancy(); +#endif + +#ifdef GPU_DEBUG + // if ( grid3d->iam == 0 ) + { + cudaDeviceProp devProp; + cudaGetDeviceProperties(&devProp, 0); + printDevProp(devProp); + } +#endif + int_t *xsup ; + xsup = Glu_persist->xsup; + int iam = grid->iam; + int nsupers = Glu_persist->supno[n - 1] + 1; + int_t Pc = grid->npcol; + int_t Pr = grid->nprow; + int_t myrow = MYROW (iam, grid); + int_t mycol = MYCOL (iam, grid); + int_t mrb = (nsupers + Pr - 1) / Pr; + int_t mcb = (nsupers + Pc - 1) / Pc; + int_t remain_l_max = A_host->bufmax[1]; + + /*copies of scalars for easy access*/ + A_gpu->nsupers = nsupers; + A_gpu->ScatterMOPCounter = 0; + A_gpu->GemmFLOPCounter = 0; + A_gpu->cPCIeH2D = 0; + A_gpu->cPCIeD2H = 0; + A_gpu->tHost_PCIeH2D = 0; + A_gpu->tHost_PCIeD2H = 0; + + /*initializing memory*/ + size_t max_gpu_memory = get_acc_memory (); + size_t gpu_mem_used = 0; + + void *tmp_ptr; + + A_gpu->xsup_host = xsup; + + int_t nCudaStreams = sluGPU->nCudaStreams; + /*pinned memory allocations. 
+ Paged-locked memory by cudaMallocHost is accessible to the device.*/ + for (int streamId = 0; streamId < nCudaStreams; streamId++ ) { + void *tmp_ptr; + checkCudaErrors(cudaMallocHost( &tmp_ptr, (n) * sizeof(int_t) )) ; + A_gpu->scubufs[streamId].usub_IndirectJ3_host = (int_t*) tmp_ptr; + + checkCudaErrors(cudaMalloc( &tmp_ptr, ( n) * sizeof(int_t) )); + A_gpu->scubufs[streamId].usub_IndirectJ3 = (int_t*) tmp_ptr; + gpu_mem_used += ( n) * sizeof(int_t); + checkCudaErrors(cudaMallocHost( &tmp_ptr, mrb * sizeof(Remain_info_t) )) ; + A_gpu->scubufs[streamId].Remain_info_host = (Remain_info_t*)tmp_ptr; + checkCudaErrors(cudaMallocHost( &tmp_ptr, mcb * sizeof(Ublock_info_t) )) ; + A_gpu->scubufs[streamId].Ublock_info_host = (Ublock_info_t*)tmp_ptr; + checkCudaErrors(cudaMallocHost( &tmp_ptr, remain_l_max * sizeof(double) )) ; + A_gpu->scubufs[streamId].Remain_L_buff_host = (double *) tmp_ptr; + checkCudaErrors(cudaMallocHost( &tmp_ptr, bigu_size * sizeof(double) )) ; + A_gpu->scubufs[streamId].bigU_host = (double *) tmp_ptr; + + checkCudaErrors(cudaMallocHost ( &tmp_ptr, sizeof(double) * (A_host->bufmax[1]))); + A_gpu->acc_L_buff = (double *) tmp_ptr; + checkCudaErrors(cudaMallocHost ( &tmp_ptr, sizeof(double) * (A_host->bufmax[3]))); + A_gpu->acc_U_buff = (double *) tmp_ptr; + checkCudaErrors(cudaMallocHost ( &tmp_ptr, sizeof(int_t) * (A_host->bufmax[0]))); + A_gpu->scubufs[streamId].lsub_buf = (int_t *) tmp_ptr; + checkCudaErrors(cudaMallocHost ( &tmp_ptr, sizeof(int_t) * (A_host->bufmax[2]))); + A_gpu->scubufs[streamId].usub_buf = (int_t *) tmp_ptr; + + checkCudaErrors(cudaMalloc( &tmp_ptr, remain_l_max * sizeof(double) )) ; + A_gpu->scubufs[streamId].Remain_L_buff = (double *) tmp_ptr; + gpu_mem_used += remain_l_max * sizeof(double); + checkCudaErrors(cudaMalloc( &tmp_ptr, bigu_size * sizeof(double) )) ; + A_gpu->scubufs[streamId].bigU = (double *) tmp_ptr; + gpu_mem_used += bigu_size * sizeof(double); + checkCudaErrors(cudaMalloc( &tmp_ptr, mcb * sizeof(Ublock_info_t) )) ; + A_gpu->scubufs[streamId].Ublock_info = (Ublock_info_t *) tmp_ptr; + gpu_mem_used += mcb * sizeof(Ublock_info_t); + checkCudaErrors(cudaMalloc( &tmp_ptr, mrb * sizeof(Remain_info_t) )) ; + A_gpu->scubufs[streamId].Remain_info = (Remain_info_t *) tmp_ptr; + gpu_mem_used += mrb * sizeof(Remain_info_t); + checkCudaErrors(cudaMalloc( &tmp_ptr, buffer_size * sizeof(double))) ; + A_gpu->scubufs[streamId].bigV = (double *) tmp_ptr; + gpu_mem_used += buffer_size * sizeof(double); + checkCudaErrors(cudaMalloc( &tmp_ptr, A_host->bufmax[0]*sizeof(int_t))) ; + A_gpu->scubufs[streamId].lsub = (int_t *) tmp_ptr; + gpu_mem_used += A_host->bufmax[0] * sizeof(int_t); + checkCudaErrors(cudaMalloc( &tmp_ptr, A_host->bufmax[2]*sizeof(int_t))) ; + A_gpu->scubufs[streamId].usub = (int_t *) tmp_ptr; + gpu_mem_used += A_host->bufmax[2] * sizeof(int_t); + + } /* endfor streamID ... 
allocate paged-locked memory */ + + A_gpu->isOffloaded = (int *) SUPERLU_MALLOC (sizeof(int) * nsupers); + A_gpu->GemmStart = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers); + A_gpu->GemmEnd = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers); + A_gpu->ScatterEnd = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers); + A_gpu->ePCIeH2D = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers); + A_gpu->ePCIeD2H_Start = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers); + A_gpu->ePCIeD2H_End = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers); + + for (int i = 0; i < nsupers; ++i) + { + A_gpu->isOffloaded[i] = 0; + checkCudaErrors(cudaEventCreate(&(A_gpu->GemmStart[i]))); + checkCudaErrors(cudaEventCreate(&(A_gpu->GemmEnd[i]))); + checkCudaErrors(cudaEventCreate(&(A_gpu->ScatterEnd[i]))); + checkCudaErrors(cudaEventCreate(&(A_gpu->ePCIeH2D[i]))); + checkCudaErrors(cudaEventCreate(&(A_gpu->ePCIeH2D[i]))); + checkCudaErrors(cudaEventCreate(&(A_gpu->ePCIeD2H_Start[i]))); + checkCudaErrors(cudaEventCreate(&(A_gpu->ePCIeD2H_End[i]))); + } + + /*---- Copy L data structure to GPU ----*/ + + /*pointers and address of local blocks for easy accessibility */ + local_l_blk_info_t *local_l_blk_infoVec; + int_t * local_l_blk_infoPtr; + local_l_blk_infoPtr = (int_t *) malloc( CEILING(nsupers, Pc) * sizeof(int_t ) ); + + /* First pass: count total L blocks */ + int_t cum_num_l_blocks = 0; /* total number of L blocks I own */ + for (int_t i = 0; i < CEILING(nsupers, Pc); ++i) + { + /* going through each block column I own */ + + if (A_host->Lrowind_bc_ptr[i] != NULL && isNodeInMyGrid[i * Pc + mycol] == 1) + { + int_t *index = A_host->Lrowind_bc_ptr[i]; + int_t num_l_blocks = index[0]; + cum_num_l_blocks += num_l_blocks; + } + } + + /*allocating memory*/ + local_l_blk_infoVec = (local_l_blk_info_t *) malloc(cum_num_l_blocks * sizeof(local_l_blk_info_t)); + + /* Second pass: set up the meta-data for the L structure */ + cum_num_l_blocks = 0; + + /*initialzing vectors */ + for (int_t i = 0; i < CEILING(nsupers, Pc); ++i) + { + if (A_host->Lrowind_bc_ptr[i] != NULL && isNodeInMyGrid[i * Pc + mycol] == 1) + { + int_t *index = A_host->Lrowind_bc_ptr[i]; + int_t num_l_blocks = index[0]; /* # L blocks in this column */ + + if (num_l_blocks > 0) + { + + local_l_blk_info_t *local_l_blk_info_i = local_l_blk_infoVec + cum_num_l_blocks; + local_l_blk_infoPtr[i] = cum_num_l_blocks; + + int_t lptrj = BC_HEADER; + int_t luptrj = 0; + + for (int_t j = 0; j < num_l_blocks ; ++j) + { + + int_t ijb = index[lptrj]; + + local_l_blk_info_i[j].lib = ijb / Pr; + local_l_blk_info_i[j].lptrj = lptrj; + local_l_blk_info_i[j].luptrj = luptrj; + luptrj += index[lptrj + 1]; + lptrj += LB_DESCRIPTOR + index[lptrj + 1]; + + } + } + cum_num_l_blocks += num_l_blocks; + } + + } /* endfor all block columns */ + + /* Allocate L memory on GPU, and copy the values from CPU to GPU */ + checkCudaErrors(cudaMalloc( &tmp_ptr, cum_num_l_blocks * sizeof(local_l_blk_info_t))) ; + A_gpu->local_l_blk_infoVec = (local_l_blk_info_t *) tmp_ptr; + gpu_mem_used += cum_num_l_blocks * sizeof(local_l_blk_info_t); + checkCudaErrors(cudaMemcpy( (A_gpu->local_l_blk_infoVec), local_l_blk_infoVec, cum_num_l_blocks * sizeof(local_l_blk_info_t), cudaMemcpyHostToDevice)) ; + + checkCudaErrors(cudaMalloc( &tmp_ptr, CEILING(nsupers, Pc)*sizeof(int_t))) ; + A_gpu->local_l_blk_infoPtr = (int_t *) tmp_ptr; + gpu_mem_used += CEILING(nsupers, Pc) * sizeof(int_t); + checkCudaErrors(cudaMemcpy( (A_gpu->local_l_blk_infoPtr), local_l_blk_infoPtr, CEILING(nsupers, 
Pc)*sizeof(int_t), cudaMemcpyHostToDevice)) ; + + /*---- Copy U data structure to GPU ----*/ + + local_u_blk_info_t *local_u_blk_infoVec; + int_t * local_u_blk_infoPtr; + local_u_blk_infoPtr = (int_t *) malloc( CEILING(nsupers, Pr) * sizeof(int_t ) ); + + /* First pass: count total U blocks */ + int_t cum_num_u_blocks = 0; + + for (int_t i = 0; i < CEILING(nsupers, Pr); ++i) + { + + if (A_host->Ufstnz_br_ptr[i] != NULL && isNodeInMyGrid[i * Pr + myrow] == 1) + { + int_t *index = A_host->Ufstnz_br_ptr[i]; + int_t num_u_blocks = index[0]; + cum_num_u_blocks += num_u_blocks; + + } + } + + local_u_blk_infoVec = (local_u_blk_info_t *) malloc(cum_num_u_blocks * sizeof(local_u_blk_info_t)); + + /* Second pass: set up the meta-data for the U structure */ + cum_num_u_blocks = 0; + + for (int_t i = 0; i < CEILING(nsupers, Pr); ++i) + { + if (A_host->Ufstnz_br_ptr[i] != NULL && isNodeInMyGrid[i * Pr + myrow] == 1) + { + int_t *index = A_host->Ufstnz_br_ptr[i]; + int_t num_u_blocks = index[0]; + + if (num_u_blocks > 0) + { + local_u_blk_info_t *local_u_blk_info_i = local_u_blk_infoVec + cum_num_u_blocks; + local_u_blk_infoPtr[i] = cum_num_u_blocks; + + int_t iuip_lib, ruip_lib; + iuip_lib = BR_HEADER; + ruip_lib = 0; + + for (int_t j = 0; j < num_u_blocks ; ++j) + { + + int_t ijb = index[iuip_lib]; + local_u_blk_info_i[j].ljb = ijb / Pc; + local_u_blk_info_i[j].iuip = iuip_lib; + local_u_blk_info_i[j].ruip = ruip_lib; + + ruip_lib += index[iuip_lib + 1]; + iuip_lib += UB_DESCRIPTOR + SuperSize (ijb); + + } + } + cum_num_u_blocks += num_u_blocks; + } + } + + checkCudaErrors(cudaMalloc( &tmp_ptr, cum_num_u_blocks * sizeof(local_u_blk_info_t))) ; + A_gpu->local_u_blk_infoVec = (local_u_blk_info_t *) tmp_ptr; + gpu_mem_used += cum_num_u_blocks * sizeof(local_u_blk_info_t); + checkCudaErrors(cudaMemcpy( (A_gpu->local_u_blk_infoVec), local_u_blk_infoVec, cum_num_u_blocks * sizeof(local_u_blk_info_t), cudaMemcpyHostToDevice)) ; + + checkCudaErrors(cudaMalloc( &tmp_ptr, CEILING(nsupers, Pr)*sizeof(int_t))) ; + A_gpu->local_u_blk_infoPtr = (int_t *) tmp_ptr; + gpu_mem_used += CEILING(nsupers, Pr) * sizeof(int_t); + checkCudaErrors(cudaMemcpy( (A_gpu->local_u_blk_infoPtr), local_u_blk_infoPtr, CEILING(nsupers, Pr)*sizeof(int_t), cudaMemcpyHostToDevice)) ; + + /* Copy the actual L indices and values */ + int_t l_k = CEILING( nsupers, grid->npcol ); /* # of local block columns */ + int_t *temp_LrowindPtr = (int_t *) malloc(sizeof(int_t) * l_k); + int_t *temp_LnzvalPtr = (int_t *) malloc(sizeof(int_t) * l_k); + int_t *Lnzval_size = (int_t *) malloc(sizeof(int_t) * l_k); + int_t l_ind_len = 0; + int_t l_val_len = 0; + for (int_t jb = 0; jb < nsupers; ++jb) /* for each block column ... */ + { + int_t pc = PCOL( jb, grid ); + if (mycol == pc && isNodeInMyGrid[jb] == 1) + { + int_t ljb = LBj( jb, grid ); /* Local block number */ + int_t *index_host; + index_host = A_host->Lrowind_bc_ptr[ljb]; + + temp_LrowindPtr[ljb] = l_ind_len; + temp_LnzvalPtr[ljb] = l_val_len; // ### + Lnzval_size[ljb] = 0; //### + if (index_host != NULL) + { + int_t nrbl = index_host[0]; /* number of L blocks */ + int_t len = index_host[1]; /* LDA of the nzval[] */ + int_t len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; + + /* Global block number is mycol + ljb*Pc */ + int_t nsupc = SuperSize(jb); + + l_ind_len += len1; + l_val_len += len * nsupc; + Lnzval_size[ljb] = len * nsupc ; // ### + } + else + { + Lnzval_size[ljb] = 0 ; // ### + } + } + } /* endfor jb = 0 ... 
*/ + + /* Copy the actual U indices and values */ + int_t u_k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ + int_t *temp_UrowindPtr = (int_t *) malloc(sizeof(int_t) * u_k); + int_t *temp_UnzvalPtr = (int_t *) malloc(sizeof(int_t) * u_k); + int_t *Unzval_size = (int_t *) malloc(sizeof(int_t) * u_k); + int_t u_ind_len = 0; + int_t u_val_len = 0; + for ( int_t lb = 0; lb < u_k; ++lb) + { + int_t *index_host; + index_host = A_host->Ufstnz_br_ptr[lb]; + temp_UrowindPtr[lb] = u_ind_len; + temp_UnzvalPtr[lb] = u_val_len; + Unzval_size[lb] = 0; + if (index_host != NULL && isNodeInMyGrid[lb * Pr + myrow] == 1) + { + int_t len = index_host[1]; + int_t len1 = index_host[2]; + + u_ind_len += len1; + u_val_len += len; + Unzval_size[lb] = len; + } + else + { + Unzval_size[lb] = 0; + } + } + + gpu_mem_used += l_ind_len * sizeof(int_t); + gpu_mem_used += 2 * l_k * sizeof(int_t); + gpu_mem_used += u_ind_len * sizeof(int_t); + gpu_mem_used += 2 * u_k * sizeof(int_t); + + /*left memory shall be divided among the two */ + + for (int_t i = 0; i < l_k; ++i) + { + temp_LnzvalPtr[i] = -1; + } + + for (int_t i = 0; i < u_k; ++i) + { + temp_UnzvalPtr[i] = -1; + } + + /*setting these pointers back */ + l_val_len = 0; + u_val_len = 0; + + int_t num_gpu_l_blocks = 0; + int_t num_gpu_u_blocks = 0; + size_t mem_l_block, mem_u_block; + + /* Find the trailing matrix size that can fit into GPU memory */ + for (int_t i = nsupers - 1; i > -1; --i) + { + /* ulte se chalte hai eleimination tree */ + /* bottom up ordering */ + int_t i_sup = A_gpu->perm_c_supno[i]; + + int_t pc = PCOL( i_sup, grid ); + if (isNodeInMyGrid[i_sup] == 1) + { + if (mycol == pc ) + { + int_t ljb = LBj(i_sup, grid); + mem_l_block = sizeof(double) * Lnzval_size[ljb]; + if (gpu_mem_used + mem_l_block > max_gpu_memory) + { + break; + } + else + { + gpu_mem_used += mem_l_block; + temp_LnzvalPtr[ljb] = l_val_len; + l_val_len += Lnzval_size[ljb]; + num_gpu_l_blocks++; + A_gpu->first_l_block_gpu = i; + } + } + + int_t pr = PROW( i_sup, grid ); + if (myrow == pr) + { + int_t lib = LBi(i_sup, grid); + mem_u_block = sizeof(double) * Unzval_size[lib]; + if (gpu_mem_used + mem_u_block > max_gpu_memory) + { + break; + } + else + { + gpu_mem_used += mem_u_block; + temp_UnzvalPtr[lib] = u_val_len; + u_val_len += Unzval_size[lib]; + num_gpu_u_blocks++; + A_gpu->first_u_block_gpu = i; + } + } + } /* endif */ + + } /* endfor i .... nsupers */ + +#if (PRNTlevel>=2) + printf("(%d) Number of L blocks in GPU %d, U blocks %d\n", + grid3d->iam, num_gpu_l_blocks, num_gpu_u_blocks ); + printf("(%d) elimination order of first block in GPU: L block %d, U block %d\n", + grid3d->iam, A_gpu->first_l_block_gpu, A_gpu->first_u_block_gpu); + printf("(%d) Memory of L %.1f GB, memory for U %.1f GB, Total device memory used %.1f GB, Memory allowed %.1f GB \n", grid3d->iam, + l_val_len * sizeof(double) * 1e-9, + u_val_len * sizeof(double) * 1e-9, + gpu_mem_used * 1e-9, max_gpu_memory * 1e-9); + fflush(stdout); +#endif + + /* Assemble index vector on temp */ + int_t *indtemp = (int_t *) malloc(sizeof(int_t) * l_ind_len); + for (int_t jb = 0; jb < nsupers; ++jb) /* for each block column ... 
*/ + { + int_t pc = PCOL( jb, grid ); + if (mycol == pc && isNodeInMyGrid[jb] == 1) + { + int_t ljb = LBj( jb, grid ); /* Local block number */ + int_t *index_host; + index_host = A_host->Lrowind_bc_ptr[ljb]; + + if (index_host != NULL) + { + int_t nrbl = index_host[0]; /* number of L blocks */ + int_t len = index_host[1]; /* LDA of the nzval[] */ + int_t len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; + + memcpy(&indtemp[temp_LrowindPtr[ljb]] , index_host, len1 * sizeof(int_t)) ; + } + } + } + + checkCudaErrors(cudaMalloc( &tmp_ptr, l_ind_len * sizeof(int_t))) ; + A_gpu->LrowindVec = (int_t *) tmp_ptr; + checkCudaErrors(cudaMemcpy( (A_gpu->LrowindVec), indtemp, l_ind_len * sizeof(int_t), cudaMemcpyHostToDevice)) ; + + checkCudaErrors(cudaMalloc( &tmp_ptr, l_val_len * sizeof(double))); + A_gpu->LnzvalVec = (double *) tmp_ptr; + checkCudaErrors(cudaMemset( (A_gpu->LnzvalVec), 0, l_val_len * sizeof(double))); + + checkCudaErrors(cudaMalloc( &tmp_ptr, l_k * sizeof(int_t))) ; + A_gpu->LrowindPtr = (int_t *) tmp_ptr; + checkCudaErrors(cudaMemcpy( (A_gpu->LrowindPtr), temp_LrowindPtr, l_k * sizeof(int_t), cudaMemcpyHostToDevice)) ; + + checkCudaErrors(cudaMalloc( &tmp_ptr, l_k * sizeof(int_t))) ; + A_gpu->LnzvalPtr = (int_t *) tmp_ptr; + checkCudaErrors(cudaMemcpy( (A_gpu->LnzvalPtr), temp_LnzvalPtr, l_k * sizeof(int_t), cudaMemcpyHostToDevice)) ; + + A_gpu->LnzvalPtr_host = temp_LnzvalPtr; + + int_t *indtemp1 = (int_t *) malloc(sizeof(int_t) * u_ind_len); + for ( int_t lb = 0; lb < u_k; ++lb) + { + int_t *index_host; + index_host = A_host->Ufstnz_br_ptr[lb]; + + if (index_host != NULL && isNodeInMyGrid[lb * Pr + myrow] == 1) + { + int_t len1 = index_host[2]; + memcpy(&indtemp1[temp_UrowindPtr[lb]] , index_host, sizeof(int_t)*len1); + } + } + + checkCudaErrors(cudaMalloc( &tmp_ptr, u_ind_len * sizeof(int_t))) ; + A_gpu->UrowindVec = (int_t *) tmp_ptr; + checkCudaErrors(cudaMemcpy( (A_gpu->UrowindVec), indtemp1, u_ind_len * sizeof(int_t), cudaMemcpyHostToDevice)) ; + + checkCudaErrors(cudaMalloc( &tmp_ptr, u_val_len * sizeof(double))); + A_gpu->UnzvalVec = (double *) tmp_ptr; + checkCudaErrors(cudaMemset( (A_gpu->UnzvalVec), 0, u_val_len * sizeof(double))); + + checkCudaErrors(cudaMalloc( &tmp_ptr, u_k * sizeof(int_t))) ; + A_gpu->UrowindPtr = (int_t *) tmp_ptr; + checkCudaErrors(cudaMemcpy( (A_gpu->UrowindPtr), temp_UrowindPtr, u_k * sizeof(int_t), cudaMemcpyHostToDevice)) ; + + A_gpu->UnzvalPtr_host = temp_UnzvalPtr; + + checkCudaErrors(cudaMalloc( &tmp_ptr, u_k * sizeof(int_t))) ; + A_gpu->UnzvalPtr = (int_t *) tmp_ptr; + checkCudaErrors(cudaMemcpy( (A_gpu->UnzvalPtr), temp_UnzvalPtr, u_k * sizeof(int_t), cudaMemcpyHostToDevice)) ; + + checkCudaErrors(cudaMalloc( &tmp_ptr, (nsupers + 1)*sizeof(int_t))) ; + A_gpu->xsup = (int_t *) tmp_ptr; + checkCudaErrors(cudaMemcpy( (A_gpu->xsup), xsup, (nsupers + 1)*sizeof(int_t), cudaMemcpyHostToDevice)) ; + + checkCudaErrors(cudaMalloc( &tmp_ptr, sizeof(dLUstruct_gpu_t))) ; + *dA_gpu = (dLUstruct_gpu_t *) tmp_ptr; + checkCudaErrors(cudaMemcpy( *dA_gpu, A_gpu, sizeof(dLUstruct_gpu_t), cudaMemcpyHostToDevice)) ; + + free (temp_LrowindPtr); + free (temp_UrowindPtr); + free (indtemp1); + free (indtemp); + +} /* end dCopyLUToGPU3D */ + + + +int dreduceAllAncestors3d_GPU(int_t ilvl, int_t* myNodeCount, + int_t** treePerm, + dLUValSubBuf_t*LUvsb, + dLUstruct_t* LUstruct, + gridinfo3d_t* grid3d, + dsluGPU_t *sluGPU, + d2Hreduce_t* d2Hred, + factStat_t *factStat, + HyP_t* HyP, SCT_t* SCT ) +{ + // first synchronize all cuda streams + int superlu_acc_offload = 
HyP->superlu_acc_offload; + + int_t maxLvl = log2i( (int_t) grid3d->zscp.Np) + 1; + int_t myGrid = grid3d->zscp.Iam; + gridinfo_t* grid = &(grid3d->grid2d); + int_t* gpuLUreduced = factStat->gpuLUreduced; + + int_t sender; + if ((myGrid % (1 << (ilvl + 1))) == 0) + { + sender = myGrid + (1 << ilvl); + + } + else + { + sender = myGrid; + } + + /*Reduce all the ancestors from the GPU*/ + if (myGrid == sender && superlu_acc_offload) + { + for (int_t streamId = 0; streamId < sluGPU->nCudaStreams; streamId++) + { + double ttx = SuperLU_timer_(); + cudaStreamSynchronize(sluGPU->funCallStreams[streamId]); + SCT->PhiWaitTimer += SuperLU_timer_() - ttx; + sluGPU->lastOffloadStream[streamId] = -1; + } + + for (int_t alvl = ilvl + 1; alvl < maxLvl; ++alvl) + { + /* code */ + // int_t atree = myTreeIdxs[alvl]; + int_t nsAncestor = myNodeCount[alvl]; + int_t* cAncestorList = treePerm[alvl]; + + for (int_t node = 0; node < nsAncestor; node++ ) + { + int_t k = cAncestorList[node]; + if (!gpuLUreduced[k]) + { + dinitD2Hreduce(k, d2Hred, 1, + HyP, sluGPU, grid, LUstruct, SCT); + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + + double tt_start1 = SuperLU_timer_(); + SCT->PhiMemCpyTimer += SuperLU_timer_() - tt_start1; + if (copyL_kljb || copyU_kljb) SCT->PhiMemCpyCounter++; + dsendLUpanelGPU2HOST(k, d2Hred, sluGPU); + /* + Reduce the LU panels from GPU + */ + dreduceGPUlu(1, d2Hred, sluGPU, SCT, grid, LUstruct); + gpuLUreduced[k] = 1; + } + } + } + } /*if (myGrid == sender)*/ + + dreduceAllAncestors3d(ilvl, myNodeCount, treePerm, + LUvsb, LUstruct, grid3d, SCT ); + return 0; +} /* dreduceAllAncestors3d_GPU */ + + +void dsyncAllfunCallStreams(dsluGPU_t* sluGPU, SCT_t* SCT) +{ + for (int streamId = 0; streamId < sluGPU->nCudaStreams; streamId++) + { + double ttx = SuperLU_timer_(); + cudaStreamSynchronize(sluGPU->funCallStreams[streamId]); + SCT->PhiWaitTimer += SuperLU_timer_() - ttx; + sluGPU->lastOffloadStream[streamId] = -1; + } +} diff --git a/SRC/superlu_defs.h b/SRC/superlu_defs.h index bb051921..ad9c0234 100644 --- a/SRC/superlu_defs.h +++ b/SRC/superlu_defs.h @@ -130,6 +130,11 @@ typedef MPI_C_DOUBLE_COMPLEX SuperLU_MPI_DOUBLE_COMPLEX; #include "util_dist.h" #include "psymbfact.h" +#ifdef GPU_ACC +#include +#endif + + #define MAX_SUPER_SIZE 256 /* Sherry: moved from superlu_gpu.cu */ diff --git a/SRC/znrformat_loc3d.c b/SRC/znrformat_loc3d.c index 7c9c0ba2..92e6e085 100644 --- a/SRC/znrformat_loc3d.c +++ b/SRC/znrformat_loc3d.c @@ -9,6 +9,7 @@ The source code is distributed under BSD license, see the file License.txt at the top-level directory. */ + /*! @file * \brief Preprocessing routines for the 3D factorization/solve codes: * - Gather {A,B} from 3D grid to 2D process layer 0 @@ -41,15 +42,15 @@ static void matCopy(int n, int m, doublecomplex *Dst, int lddst, doublecomplex * * output is in the returned A3d->{} structure. * see supermatrix.h for nrformat_loc3d{} structure. 
*/ -NRformat_loc3d *zGatherNRformat_loc3d(NRformat_loc *A, // input - doublecomplex *B, // input - int ldb, int nrhs, // input +NRformat_loc3d *zGatherNRformat_loc3d(NRformat_loc *A, // input + doublecomplex *B, // input + int ldb, int nrhs, // input gridinfo3d_t *grid3d) { NRformat_loc3d *A3d = SUPERLU_MALLOC(sizeof(NRformat_loc3d)); NRformat_loc *A2d = SUPERLU_MALLOC(sizeof(NRformat_loc)); A3d->m_loc = A->m_loc; - A3d->B = (doublecomplex *)B; // on 3D process grid + A3d->B = (doublecomplex *) B; // on 3D process grid A3d->ldb = ldb; A3d->nrhs = nrhs; @@ -144,7 +145,7 @@ NRformat_loc3d *zGatherNRformat_loc3d(NRformat_loc *A, // input if (grid3d->zscp.Iam == 0) { B1 = SUPERLU_MALLOC(A2d->m_loc * nrhs * sizeof(doublecomplex)); - A3d->B2d = (doublecomplex *)SUPERLU_MALLOC(A2d->m_loc * nrhs * sizeof(doublecomplex)); + A3d->B2d = (doublecomplex *) SUPERLU_MALLOC(A2d->m_loc * nrhs * sizeof(doublecomplex)); } // B1 <- gatherv(Btmp) @@ -158,8 +159,8 @@ NRformat_loc3d *zGatherNRformat_loc3d(NRformat_loc *A, // input for (int i = 0; i < grid3d->npdep; ++i) { /* code */ - matCopy(row_counts_int[i], nrhs, ((doublecomplex *)A3d->B2d) + row_disp[i], - A2d->m_loc, B1 + nrhs * row_disp[i], row_counts_int[i]); + matCopy(row_counts_int[i], nrhs, ((doublecomplex*)A3d->B2d) + row_disp[i], + A2d->m_loc, B1 + nrhs * row_disp[i], row_counts_int[i]); } SUPERLU_FREE(B1); @@ -177,7 +178,6 @@ NRformat_loc3d *zGatherNRformat_loc3d(NRformat_loc *A, // input SUPERLU_FREE(row_counts); SUPERLU_FREE(nnz_disp); SUPERLU_FREE(Btmp); - // SUPERLU_FREE(B1); return A3d; @@ -187,14 +187,14 @@ NRformat_loc3d *zGatherNRformat_loc3d(NRformat_loc *A, // input * Scatter B (solution) from 2D process layer 0 to 3D grid * Output: X2d <- A^{-1} B2d */ -int zScatter_B3d(NRformat_loc3d *A3d, // modified - gridinfo3d_t *grid3d) +int zScatter_B3d(NRformat_loc3d *A3d, // modified + gridinfo3d_t *grid3d) { - doublecomplex *B = (doublecomplex *)A3d->B; + doublecomplex *B = (doublecomplex *) A3d->B; int ldb = A3d->ldb; int nrhs = A3d->nrhs; - doublecomplex *B2d = (doublecomplex *)A3d->B2d; + doublecomplex *B2d = (doublecomplex *) A3d->B2d; NRformat_loc A2d = *(A3d->A_nfmt); int m_loc = A3d->m_loc; int *b_counts_int = A3d->b_counts_int; @@ -223,7 +223,7 @@ int zScatter_B3d(NRformat_loc3d *A3d, // modified doublecomplex *Btmp; Btmp = SUPERLU_MALLOC(A3d->m_loc * nrhs * sizeof(doublecomplex)); - // Bttmp <- scatterv(B1) + // Btmp <- scatterv(B1) MPI_Scatterv(B1, b_counts_int, b_disp, SuperLU_MPI_DOUBLE_COMPLEX, Btmp, nrhs * A3d->m_loc, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid3d->zscp.comm); @@ -236,8 +236,7 @@ int zScatter_B3d(NRformat_loc3d *A3d, // modified SUPERLU_FREE(A3d->row_counts_int); SUPERLU_FREE(A3d->row_disp); SUPERLU_FREE(Btmp); - if (grid3d->zscp.Iam == 0) - SUPERLU_FREE(B1); + if (grid3d->zscp.Iam == 0) SUPERLU_FREE(B1); return 0; -} /* dScatter_B3d */ +} /* zScatter_B3d */ diff --git a/SRC/zsuperlu_gpu.cu b/SRC/zsuperlu_gpu.cu new file mode 100644 index 00000000..fe4205ea --- /dev/null +++ b/SRC/zsuperlu_gpu.cu @@ -0,0 +1,1816 @@ + +/*! @file + * \brief Descriptions and declarations for structures used in GPU + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley,
+ * Georgia Institute of Technology, Oak Ridge National Laboratory
+ * March 14, 2021 version 7.0.0
+ * 
+ */ + +//#define GPU_DEBUG + +#include "mpi.h" +#include "omp.h" +// #include "sec_structs.h" +#include +#include +#include + +#undef Reduce +#include "cub/cub.cuh" +//#include + +#include "zlustruct_gpu.h" + +#include "dcomplex.h" + +//extern "C" { +// void cblas_daxpy(const int N, const double alpha, const double *X, +// const int incX, double *Y, const int incY); +//} + +/*error reporting functions */ +static +cudaError_t checkCuda(cudaError_t result) +{ +#if defined(DEBUG) || defined(_DEBUG) + if (result != cudaSuccess) + { + fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result)); + assert(result == cudaSuccess); + } +#endif + return result; +} + + +// cublasStatus_t checkCublas(cublasStatus_t result) +// { +// #if defined(DEBUG) || defined(_DEBUG) +// if (result != CUBLAS_STATUS_SUCCESS) +// { +// fprintf(stderr, "CUDA Blas Runtime Error: %s\n", cublasGetErrorString(result)); +// assert(result == CUBLAS_STATUS_SUCCESS); +// } +// #endif +// return result; +// } + + +// #define UNIT_STRIDE + +#if 0 ////////// this routine is not used anymore +__device__ inline +void device_scatter_l (int_t thread_id, + int_t nsupc, int_t temp_nbrow, + int_t *usub, int_t iukp, int_t klst, + doublecomplex *nzval, int_t ldv, + doublecomplex *tempv, int_t nbrow, + // int_t *indirect2_thread + int *indirect2_thread + ) +{ + + + int_t segsize, jj; + + for (jj = 0; jj < nsupc; ++jj) + { + segsize = klst - usub[iukp + jj]; + if (segsize) + { + if (thread_id < temp_nbrow) + { + +#ifndef UNIT_STRIDE + nzval[indirect2_thread[thread_id]] -= tempv[thread_id]; +#else + nzval[thread_id] -= tempv[thread_id]; /*making access unit strided*/ +#endif + } + tempv += nbrow; + } + nzval += ldv; + } +} +#endif ///////////// not used + +#define THREAD_BLOCK_SIZE 256 /* Sherry: was 192. 
should be <= MAX_SUPER_SIZE */ +// The following is moved to superlu_defs.h +//#define MAX_SUPER_SIZE 256 /* Sherry: was 192 on Titan */ + +__device__ inline +void zdevice_scatter_l_2D (int thread_id, + int nsupc, int temp_nbrow, + int_t *usub, int iukp, int_t klst, + doublecomplex *nzval, int ldv, + const doublecomplex *tempv, int nbrow, + int *indirect2_thread, + int nnz_cols, int ColPerBlock, + int *IndirectJ3 + ) +{ + int i; + if ( thread_id < temp_nbrow * ColPerBlock ) { + int thread_id_x = thread_id % temp_nbrow; + int thread_id_y = thread_id / temp_nbrow; + +#define UNROLL_ITER 8 + +#pragma unroll 4 + for (int col = thread_id_y; col < nnz_cols ; col += ColPerBlock) + { + i = ldv * IndirectJ3[col] + indirect2_thread[thread_id_x]; + z_sub(&nzval[i], &nzval[i], &tempv[nbrow * col + thread_id_x]); + } + } +} + +/* Sherry: this routine is not used */ +#if 0 +__global__ +void cub_scan_test(void) +{ + int thread_id = threadIdx.x; + typedef cub::BlockScan BlockScan; /*1D int data type*/ + + __shared__ typename BlockScan::TempStorage temp_storage; /*storage temp*/ + + __shared__ int IndirectJ1[MAX_SUPER_SIZE]; + __shared__ int IndirectJ2[MAX_SUPER_SIZE]; + + if (thread_id < MAX_SUPER_SIZE) + { + IndirectJ1[thread_id] = (thread_id + 1) % 2; + } + + __syncthreads(); + if (thread_id < MAX_SUPER_SIZE) + BlockScan(temp_storage).InclusiveSum (IndirectJ1[thread_id], IndirectJ2[thread_id]); + + + if (thread_id < MAX_SUPER_SIZE) + printf("%d %d\n", thread_id, IndirectJ2[thread_id]); + +} +#endif // not used + + +__device__ inline +void device_scatter_u_2D (int thread_id, + int temp_nbrow, int nsupc, + doublecomplex * ucol, + int_t * usub, int iukp, + int_t ilst, int_t klst, + int_t * index, int iuip_lib, + doublecomplex * tempv, int nbrow, + int *indirect, + int nnz_cols, int ColPerBlock, + int *IndirectJ1, + int *IndirectJ3 + ) +{ + int i; + + if ( thread_id < temp_nbrow * ColPerBlock ) + { + /* 1D threads are logically arranged in 2D shape. */ + int thread_id_x = thread_id % temp_nbrow; + int thread_id_y = thread_id / temp_nbrow; + +#pragma unroll 4 + for (int col = thread_id_y; col < nnz_cols ; col += ColPerBlock) + { + i = IndirectJ1[IndirectJ3[col]] + indirect[thread_id_x]; + z_sub(&ucol[i], &ucol[i], &tempv[nbrow * col + thread_id_x]); + } + } +} + + +__device__ inline +void device_scatter_u (int_t thread_id, + int_t temp_nbrow, int_t nsupc, + doublecomplex * ucol, + int_t * usub, int_t iukp, + int_t ilst, int_t klst, + int_t * index, int_t iuip_lib, + doublecomplex * tempv, int_t nbrow, + // int_t *indirect + int *indirect + ) +{ + int_t segsize, fnz, jj; + for (jj = 0; jj < nsupc; ++jj) + { + segsize = klst - usub[iukp + jj]; + fnz = index[iuip_lib++]; + ucol -= fnz; + if (segsize) { /* Nonzero segment in U(k.j). 
*/ + if (thread_id < temp_nbrow) + { +#ifndef UNIT_STRIDE + z_sub(&ucol[indirect[thread_id]], &ucol[indirect[thread_id]], + &tempv[thread_id]); +#else + /* making access unit strided; + it doesn't work; it is for measurements */ + z_sub(&ucol[thread_id], &ucol[thread_id], + &tempv[thread_id]); +#endif + } + tempv += nbrow; + } + ucol += ilst ; + } +} + + +__global__ +void Scatter_GPU_kernel( + int_t streamId, + int_t ii_st, int_t ii_end, + int_t jj_st, int_t jj_end, /* defines rectangular Schur block to be scatter */ + int_t klst, + int_t jj0, /* 0 on entry */ + int_t nrows, int_t ldt, int_t npcol, int_t nprow, + zLUstruct_gpu_t * A_gpu) +{ + + /* initializing pointers */ + int_t *xsup = A_gpu->xsup; + int_t *UrowindPtr = A_gpu->UrowindPtr; + int_t *UrowindVec = A_gpu->UrowindVec; + int_t *UnzvalPtr = A_gpu->UnzvalPtr; + doublecomplex *UnzvalVec = A_gpu->UnzvalVec; + int_t *LrowindPtr = A_gpu->LrowindPtr; + int_t *LrowindVec = A_gpu->LrowindVec; + int_t *LnzvalPtr = A_gpu->LnzvalPtr; + doublecomplex *LnzvalVec = A_gpu->LnzvalVec; + doublecomplex *bigV = A_gpu->scubufs[streamId].bigV; + local_l_blk_info_t *local_l_blk_infoVec = A_gpu->local_l_blk_infoVec; + local_u_blk_info_t *local_u_blk_infoVec = A_gpu->local_u_blk_infoVec; + int_t *local_l_blk_infoPtr = A_gpu->local_l_blk_infoPtr; + int_t *local_u_blk_infoPtr = A_gpu->local_u_blk_infoPtr; + Remain_info_t *Remain_info = A_gpu->scubufs[streamId].Remain_info; + Ublock_info_t *Ublock_info = A_gpu->scubufs[streamId].Ublock_info; + int_t *lsub = A_gpu->scubufs[streamId].lsub; + int_t *usub = A_gpu->scubufs[streamId].usub; + + /* thread block assignment: this thread block is + assigned to block (lb, j) in 2D grid */ + int lb = blockIdx.x + ii_st; + int j = blockIdx.y + jj_st; + __shared__ int indirect_thread[MAX_SUPER_SIZE]; /* row-wise */ + __shared__ int indirect2_thread[MAX_SUPER_SIZE]; /* row-wise */ + __shared__ int IndirectJ1[THREAD_BLOCK_SIZE]; /* column-wise */ + __shared__ int IndirectJ3[THREAD_BLOCK_SIZE]; /* column-wise */ + + /* see CUB page https://nvlabs.github.io/cub/. Implement threads collectives */ + typedef cub::BlockScan BlockScan; /*1D int data type*/ + __shared__ typename BlockScan::TempStorage temp_storage; /*storage temp*/ + + int thread_id = threadIdx.x; + + int iukp = Ublock_info[j].iukp; + int jb = Ublock_info[j].jb; + int nsupc = SuperSize (jb); + int ljb = jb / npcol; + + doublecomplex *tempv1; + if (jj_st == jj0) + { + tempv1 = (j == jj_st) ? bigV + : bigV + Ublock_info[j - 1].full_u_cols * nrows; + } + else + { + tempv1 = (j == jj_st) ? bigV + : bigV + (Ublock_info[j - 1].full_u_cols - + Ublock_info[jj_st - 1].full_u_cols) * nrows; + } + + /* # of nonzero columns in block j */ + int nnz_cols = (j == 0) ? Ublock_info[j].full_u_cols + : (Ublock_info[j].full_u_cols - Ublock_info[j - 1].full_u_cols); + int cum_ncol = (j == 0) ? 0 : Ublock_info[j - 1].full_u_cols; + + int lptr = Remain_info[lb].lptr; + int ib = Remain_info[lb].ib; + int temp_nbrow = lsub[lptr + 1]; /* number of rows in the current L block */ + lptr += LB_DESCRIPTOR; + + int_t cum_nrow; + if (ii_st == 0) + { + cum_nrow = (lb == 0 ? 0 : Remain_info[lb - 1].FullRow); + } + else + { + cum_nrow = (lb == 0 ? 
0 : Remain_info[lb - 1].FullRow - Remain_info[ii_st - 1].FullRow); + } + + tempv1 += cum_nrow; + + if (ib < jb) /*scatter U code */ + { + int ilst = FstBlockC (ib + 1); + int lib = ib / nprow; /* local index of row block ib */ + int_t *index = &UrowindVec[UrowindPtr[lib]]; + + int num_u_blocks = index[0]; + + int ljb = (jb) / npcol; /* local index of column block jb */ + + /* Each thread is responsible for one block column */ + __shared__ int ljb_ind; + /*do a search ljb_ind at local row lib*/ + int blks_per_threads = CEILING(num_u_blocks, THREAD_BLOCK_SIZE); + for (int i = 0; i < blks_per_threads; ++i) + /* each thread is assigned a chunk of consecutive U blocks to search */ + { + /* only one thread finds the block index matching ljb */ + if (thread_id * blks_per_threads + i < num_u_blocks && + local_u_blk_infoVec[ local_u_blk_infoPtr[lib] + thread_id * blks_per_threads + i ].ljb == ljb) + { + ljb_ind = thread_id * blks_per_threads + i; + } + } + __syncthreads(); + + int iuip_lib = local_u_blk_infoVec[ local_u_blk_infoPtr[lib] + ljb_ind].iuip; + int ruip_lib = local_u_blk_infoVec[ local_u_blk_infoPtr[lib] + ljb_ind].ruip; + iuip_lib += UB_DESCRIPTOR; + doublecomplex *Unzval_lib = &UnzvalVec[UnzvalPtr[lib]]; + doublecomplex *ucol = &Unzval_lib[ruip_lib]; + + if (thread_id < temp_nbrow) /* row-wise */ + { + /* cyclically map each thread to a row */ + indirect_thread[thread_id] = (int) lsub[lptr + thread_id]; + } + + /* column-wise: each thread is assigned one column */ + if (thread_id < nnz_cols) + IndirectJ3[thread_id] = A_gpu->scubufs[streamId].usub_IndirectJ3[cum_ncol + thread_id]; + /* indirectJ3[j] == kk means the j-th nonzero segment + points to column kk in this supernode */ + + __syncthreads(); + + /* threads are divided into multiple columns */ + int ColPerBlock = THREAD_BLOCK_SIZE / temp_nbrow; + + if (thread_id < THREAD_BLOCK_SIZE) + IndirectJ1[thread_id] = 0; + + if (thread_id < THREAD_BLOCK_SIZE) + { + if (thread_id < nsupc) + { + /* fstnz subscript of each column in the block */ + IndirectJ1[thread_id] = index[iuip_lib + thread_id]; + } + } + + /* perform an inclusive block-wide prefix sum among all threads */ + if (thread_id < THREAD_BLOCK_SIZE) + BlockScan(temp_storage).InclusiveSum(IndirectJ1[thread_id], IndirectJ1[thread_id]); + + if (thread_id < THREAD_BLOCK_SIZE) + IndirectJ1[thread_id] = -IndirectJ1[thread_id] + ilst * thread_id; + + __syncthreads(); + + device_scatter_u_2D ( + thread_id, + temp_nbrow, nsupc, + ucol, + usub, iukp, + ilst, klst, + index, iuip_lib, + tempv1, nrows, + indirect_thread, + nnz_cols, ColPerBlock, + IndirectJ1, + IndirectJ3 ); + + } + else /* ib >= jb, scatter L code */ + { + + int rel; + doublecomplex *nzval; + int_t *index = &LrowindVec[LrowindPtr[ljb]]; + int num_l_blocks = index[0]; + int ldv = index[1]; + + int fnz = FstBlockC (ib); + int lib = ib / nprow; + + __shared__ int lib_ind; + /*do a search lib_ind for lib*/ + int blks_per_threads = CEILING(num_l_blocks, THREAD_BLOCK_SIZE); + for (int i = 0; i < blks_per_threads; ++i) + { + if (thread_id * blks_per_threads + i < num_l_blocks && + local_l_blk_infoVec[ local_l_blk_infoPtr[ljb] + thread_id * blks_per_threads + i ].lib == lib) + { + lib_ind = thread_id * blks_per_threads + i; + } + } + __syncthreads(); + + int lptrj = local_l_blk_infoVec[ local_l_blk_infoPtr[ljb] + lib_ind].lptrj; + int luptrj = local_l_blk_infoVec[ local_l_blk_infoPtr[ljb] + lib_ind].luptrj; + lptrj += LB_DESCRIPTOR; + int dest_nbrow = index[lptrj - 1]; + + if (thread_id < dest_nbrow) + { + rel = index[lptrj + 
thread_id] - fnz; + indirect_thread[rel] = thread_id; + } + __syncthreads(); + + /* can be precalculated */ + if (thread_id < temp_nbrow) + { + rel = lsub[lptr + thread_id] - fnz; + indirect2_thread[thread_id] = indirect_thread[rel]; + } + if (thread_id < nnz_cols) + IndirectJ3[thread_id] = (int) A_gpu->scubufs[streamId].usub_IndirectJ3[cum_ncol + thread_id]; + __syncthreads(); + + int ColPerBlock = THREAD_BLOCK_SIZE / temp_nbrow; + + nzval = &LnzvalVec[LnzvalPtr[ljb]] + luptrj; + zdevice_scatter_l_2D( + thread_id, + nsupc, temp_nbrow, + usub, iukp, klst, + nzval, ldv, + tempv1, nrows, indirect2_thread, + nnz_cols, ColPerBlock, + IndirectJ3); + } /* end else ib >= jb */ + +} /* end Scatter_GPU_kernel */ + + +#define GPU_2D_SCHUDT /* Not used */ + +int zSchurCompUpdate_GPU( + int_t streamId, + int_t jj_cpu, /* 0 on entry, pointing to the start of Phi part */ + int_t nub, /* jj_cpu on entry, pointing to the end of the Phi part */ + int_t klst, int_t knsupc, + int_t Rnbrow, int_t RemainBlk, + int_t Remain_lbuf_send_size, + int_t bigu_send_size, int_t ldu, + int_t mcb, /* num_u_blks_hi */ + int_t buffer_size, int_t lsub_len, int_t usub_len, + int_t ldt, int_t k0, + zsluGPU_t *sluGPU, gridinfo_t *grid +) +{ + + zLUstruct_gpu_t * A_gpu = sluGPU->A_gpu; + zLUstruct_gpu_t * dA_gpu = sluGPU->dA_gpu; + int_t nprow = grid->nprow; + int_t npcol = grid->npcol; + + cudaStream_t FunCallStream = sluGPU->funCallStreams[streamId]; + cublasHandle_t cublas_handle0 = sluGPU->cublasHandles[streamId]; + int_t * lsub = A_gpu->scubufs[streamId].lsub_buf; + int_t * usub = A_gpu->scubufs[streamId].usub_buf; + Remain_info_t *Remain_info = A_gpu->scubufs[streamId].Remain_info_host; + doublecomplex * Remain_L_buff = A_gpu->scubufs[streamId].Remain_L_buff_host; + Ublock_info_t *Ublock_info = A_gpu->scubufs[streamId].Ublock_info_host; + doublecomplex * bigU = A_gpu->scubufs[streamId].bigU_host; + + A_gpu->isOffloaded[k0] = 1; + /* start by sending data to */ + int_t *xsup = A_gpu->xsup_host; + int_t col_back = (jj_cpu == 0) ? 0 : Ublock_info[jj_cpu - 1].full_u_cols; + // if(nub<1) return; + int_t ncols = Ublock_info[nub - 1].full_u_cols - col_back; + + /* Sherry: can get max_super_size from sp_ienv(3) */ + int_t indirectJ1[MAX_SUPER_SIZE]; // 0 indicates an empry segment + int_t indirectJ2[MAX_SUPER_SIZE]; // # of nonzero segments so far + int_t indirectJ3[MAX_SUPER_SIZE]; /* indirectJ3[j] == k means the + j-th nonzero segment points + to column k in this supernode */ + /* calculate usub_indirect */ + for (int jj = jj_cpu; jj < nub; ++jj) + { + int_t iukp = Ublock_info[jj].iukp; + int_t jb = Ublock_info[jj].jb; + int_t nsupc = SuperSize (jb); + int_t addr = (jj == 0) ? 0 + : Ublock_info[jj - 1].full_u_cols - col_back; + + for (int_t kk = 0; kk < nsupc; ++kk) // old: MAX_SUPER_SIZE + { + indirectJ1[kk] = 0; + } + + for (int_t kk = 0; kk < nsupc; ++kk) + { + indirectJ1[kk] = ((klst - usub[iukp + kk]) == 0) ? 
0 : 1;
+	}
+
+	/* prefix sum - indicates # of nonzero segments up to column kk */
+	indirectJ2[0] = indirectJ1[0];
+	for (int_t kk = 1; kk < nsupc; ++kk) // old: MAX_SUPER_SIZE
+	{
+	    indirectJ2[kk] = indirectJ2[kk - 1] + indirectJ1[kk];
+	}
+
+	/* total number of nonzero segments in this supernode */
+	int nnz_col = indirectJ2[nsupc - 1]; // old: MAX_SUPER_SIZE
+
+	/* compaction */
+	for (int_t kk = 0; kk < nsupc; ++kk) // old: MAX_SUPER_SIZE
+	{
+	    if (indirectJ1[kk]) /* kk is a nonzero segment */
+	    {
+		/* indirectJ3[j] == kk means the j-th nonzero segment
+		   points to column kk in this supernode */
+		indirectJ3[indirectJ2[kk] - 1] = kk;
+	    }
+	}
+
+	for (int i = 0; i < nnz_col; ++i)
+	{
+	    /* addr == total # of full columns before current block jj */
+	    A_gpu->scubufs[streamId].usub_IndirectJ3_host[addr + i] = indirectJ3[i];
+	}
+    } /* end for jj ... calculate usub_indirect */
+
+    //printf("zSchurCompUpdate_GPU[3]: jj_cpu %d, nub %d\n", jj_cpu, nub); fflush(stdout);
+
+    /* sizeof RemainLbuf = Rnbuf*knsupc */
+    double tTmp = SuperLU_timer_();
+    cudaEventRecord(A_gpu->ePCIeH2D[k0], FunCallStream);
+
+    checkCuda(cudaMemcpyAsync(A_gpu->scubufs[streamId].usub_IndirectJ3,
+			      A_gpu->scubufs[streamId].usub_IndirectJ3_host,
+			      ncols * sizeof(int_t), cudaMemcpyHostToDevice,
+			      FunCallStream)) ;
+
+    checkCuda(cudaMemcpyAsync(A_gpu->scubufs[streamId].Remain_L_buff, Remain_L_buff,
+			      Remain_lbuf_send_size * sizeof(doublecomplex),
+			      cudaMemcpyHostToDevice, FunCallStream)) ;
+
+    checkCuda(cudaMemcpyAsync(A_gpu->scubufs[streamId].bigU, bigU,
+			      bigu_send_size * sizeof(doublecomplex),
+			      cudaMemcpyHostToDevice, FunCallStream) );
+
+    checkCuda(cudaMemcpyAsync(A_gpu->scubufs[streamId].Remain_info, Remain_info,
+			      RemainBlk * sizeof(Remain_info_t),
+			      cudaMemcpyHostToDevice, FunCallStream) );
+
+    checkCuda(cudaMemcpyAsync(A_gpu->scubufs[streamId].Ublock_info, Ublock_info,
+			      mcb * sizeof(Ublock_info_t), cudaMemcpyHostToDevice,
+			      FunCallStream) );
+
+    checkCuda(cudaMemcpyAsync(A_gpu->scubufs[streamId].lsub, lsub,
+			      lsub_len * sizeof(int_t), cudaMemcpyHostToDevice,
+			      FunCallStream) );
+
+    checkCuda(cudaMemcpyAsync(A_gpu->scubufs[streamId].usub, usub,
+			      usub_len * sizeof(int_t), cudaMemcpyHostToDevice,
+			      FunCallStream) );
+
+    A_gpu->tHost_PCIeH2D += SuperLU_timer_() - tTmp;
+    A_gpu->cPCIeH2D += Remain_lbuf_send_size * sizeof(doublecomplex)
+	+ bigu_send_size * sizeof(doublecomplex)
+	+ RemainBlk * sizeof(Remain_info_t)
+	+ mcb * sizeof(Ublock_info_t)
+	+ lsub_len * sizeof(int_t)
+	+ usub_len * sizeof(int_t);
+
+    doublecomplex alpha = {1.0, 0.0}, beta = {0.0, 0.0};
+
+    /* The following are used in the cublasZgemm() call */
+    cuDoubleComplex *cu_alpha = (cuDoubleComplex*) &alpha;
+    cuDoubleComplex *cu_beta = (cuDoubleComplex*) &beta;
+    cuDoubleComplex *cu_A, *cu_B, *cu_C; /* C <- A*B */
+
+    int_t ii_st = 0;
+    int_t ii_end = 0;
+    int_t maxGemmBlockDim = (int) sqrt(buffer_size);
+    // int_t maxGemmBlockDim = 8000;
+
+    /* Organize GEMM by blocks of [ii_st : ii_end, jj_st : jj_end] that
+       fit in buffer_size */
+    while (ii_end < RemainBlk) {
+	ii_st = ii_end;
+	ii_end = RemainBlk;
+	int_t nrow_max = maxGemmBlockDim;
+//	nrow_max = Rnbrow;
+	int_t remaining_rows = (ii_st == 0) ? Rnbrow : Rnbrow - Remain_info[ii_st - 1].FullRow;
+	nrow_max = (remaining_rows / nrow_max) > 0 ? remaining_rows / CEILING(remaining_rows, nrow_max) : nrow_max;
+
+	int_t ResRow = (ii_st == 0) ? 0 : Remain_info[ii_st - 1].FullRow;
+	for (int_t i = ii_st; i < RemainBlk - 1; ++i)
+	{
+	    if ( Remain_info[i + 1].FullRow > ResRow + nrow_max)
+	    {
+		ii_end = i;
+		break; /* row dimension reaches nrow_max */
+	    }
+	}
+
+	int_t nrows; /* actual row dimension for GEMM */
+	int_t st_row;
+	if (ii_st > 0)
+	{
+	    nrows = Remain_info[ii_end - 1].FullRow - Remain_info[ii_st - 1].FullRow;
+	    st_row = Remain_info[ii_st - 1].FullRow;
+	}
+	else
+	{
+	    nrows = Remain_info[ii_end - 1].FullRow;
+	    st_row = 0;
+	}
+
+	int jj_st = jj_cpu;
+	int jj_end = jj_cpu;
+
+	while (jj_end < nub && nrows > 0 )
+	{
+	    int_t remaining_cols = (jj_st == jj_cpu) ? ncols : ncols - Ublock_info[jj_st - 1].full_u_cols;
+	    if ( remaining_cols * nrows < buffer_size)
+	    {
+		jj_st = jj_end;
+		jj_end = nub;
+	    }
+	    else /* C matrix cannot fit in buffer, need to break into pieces */
+	    {
+		int_t ncol_max = buffer_size / nrows;
+		/** Must revisit **/
+		ncol_max = SUPERLU_MIN(ncol_max, maxGemmBlockDim);
+		ncol_max = (remaining_cols / ncol_max) > 0 ?
+		    remaining_cols / CEILING(remaining_cols, ncol_max)
+		    : ncol_max;
+
+		jj_st = jj_end;
+		jj_end = nub;
+
+		int_t ResCol = (jj_st == 0) ? 0 : Ublock_info[jj_st - 1].full_u_cols;
+		for (int_t j = jj_st; j < nub - 1; ++j)
+		{
+		    if (Ublock_info[j + 1].full_u_cols > ResCol + ncol_max)
+		    {
+			jj_end = j;
+			break;
+		    }
+		}
+	    } /* end-if-else */
+
+	    int ncols;
+	    int st_col;
+	    if (jj_st > 0)
+	    {
+		ncols = Ublock_info[jj_end - 1].full_u_cols - Ublock_info[jj_st - 1].full_u_cols;
+		st_col = Ublock_info[jj_st - 1].full_u_cols;
+		if (ncols == 0) exit(0);
+	    }
+	    else
+	    {
+		ncols = Ublock_info[jj_end - 1].full_u_cols;
+		st_col = 0;
+	    }
+
+	    /* none of the matrix dimensions is zero. */
+	    if (nrows > 0 && ldu > 0 && ncols > 0)
+	    {
+		if (nrows * ncols > buffer_size) {
+		    printf("!! Matrix size %lld x %lld exceeds buffer_size %lld \n",
+			   (long long) nrows, (long long) ncols, (long long) buffer_size);
+		    fflush(stdout);
+		}
+		assert(nrows * ncols <= buffer_size);
+		cublasSetStream(cublas_handle0, FunCallStream);
+		cudaEventRecord(A_gpu->GemmStart[k0], FunCallStream);
+		cu_A = (cuDoubleComplex*) &A_gpu->scubufs[streamId].Remain_L_buff[(knsupc - ldu) * Rnbrow + st_row];
+		cu_B = (cuDoubleComplex*) &A_gpu->scubufs[streamId].bigU[st_col * ldu];
+		cu_C = (cuDoubleComplex*) A_gpu->scubufs[streamId].bigV;
+		cublasZgemm(cublas_handle0, CUBLAS_OP_N, CUBLAS_OP_N,
+			    nrows, ncols, ldu, cu_alpha,
+			    cu_A, Rnbrow, cu_B, ldu, cu_beta,
+			    cu_C, nrows);
+
+// #define SCATTER_OPT
+#ifdef SCATTER_OPT
+		cudaStreamSynchronize(FunCallStream);
+#warning this function is synchronous
+#endif
+		cudaEventRecord(A_gpu->GemmEnd[k0], FunCallStream);
+
+		A_gpu->GemmFLOPCounter += 8.0 * (double) nrows * ncols * ldu;
+
+		/*
+		 * Scattering the output
+		 */
+		dim3 dimBlock(THREAD_BLOCK_SIZE); // 1d thread
+
+		dim3 dimGrid(ii_end - ii_st, jj_end - jj_st);
+
+		Scatter_GPU_kernel <<< dimGrid, dimBlock, 0, FunCallStream>>>
+		    (streamId, ii_st, ii_end, jj_st, jj_end, klst,
+		     0, nrows, ldt, npcol, nprow, dA_gpu);
+#ifdef SCATTER_OPT
+		cudaStreamSynchronize(FunCallStream);
+#warning this function is synchronous
+#endif
+
+		cudaEventRecord(A_gpu->ScatterEnd[k0], FunCallStream);
+
+		A_gpu->ScatterMOPCounter += 3.0 * (double) nrows * ncols;
+	    } /* endif ... none of the matrix dimensions is zero. 
*/ + + } /* end while jj_end < nub */ + + } /* end while (ii_end < RemainBlk) */ + + return 0; +} /* end zSchurCompUpdate_GPU */ + + +static void print_occupancy() +{ + int blockSize; // The launch configurator returned block size + int minGridSize; /* The minimum grid size needed to achieve the + best potential occupancy */ + + cudaOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize, + Scatter_GPU_kernel, 0, 0); + printf("Occupancy: MinGridSize %d blocksize %d \n", minGridSize, blockSize); +} + +static void printDevProp(cudaDeviceProp devProp) +{ + size_t mfree, mtotal; + cudaMemGetInfo (&mfree, &mtotal); + + printf("pciBusID: %d\n", devProp.pciBusID); + printf("pciDeviceID: %d\n", devProp.pciDeviceID); + printf("GPU Name: %s\n", devProp.name); + printf("Total global memory: %zu\n", devProp.totalGlobalMem); + printf("Total free memory: %zu\n", mfree); + printf("Clock rate: %d\n", devProp.clockRate); + + return; +} + + +static size_t get_acc_memory () +{ + + size_t mfree, mtotal; + cudaMemGetInfo (&mfree, &mtotal); +#if 0 + printf("Total memory %zu & free memory %zu\n", mtotal, mfree); +#endif + return (size_t) (0.9 * (double) mfree) / get_mpi_process_per_gpu (); + + +} + +int zfree_LUstruct_gpu (zLUstruct_gpu_t * A_gpu) +{ + checkCuda(cudaFree(A_gpu->LrowindVec)); + checkCuda(cudaFree(A_gpu->LrowindPtr)); + + checkCuda(cudaFree(A_gpu->LnzvalVec)); + checkCuda(cudaFree(A_gpu->LnzvalPtr)); + free(A_gpu->LnzvalPtr_host); + /*freeing the pinned memory*/ + int_t streamId = 0; + checkCuda (cudaFreeHost (A_gpu->scubufs[streamId].Remain_info_host)); + checkCuda (cudaFreeHost (A_gpu->scubufs[streamId].Ublock_info_host)); + checkCuda (cudaFreeHost (A_gpu->scubufs[streamId].Remain_L_buff_host)); + checkCuda (cudaFreeHost (A_gpu->scubufs[streamId].bigU_host)); + + checkCuda(cudaFreeHost(A_gpu->acc_L_buff)); + checkCuda(cudaFreeHost(A_gpu->acc_U_buff)); + checkCuda(cudaFreeHost(A_gpu->scubufs[streamId].lsub_buf)); + checkCuda(cudaFreeHost(A_gpu->scubufs[streamId].usub_buf)); + + + free(A_gpu->isOffloaded); + free(A_gpu->GemmStart); + free(A_gpu->GemmEnd); + free(A_gpu->ScatterEnd); + free(A_gpu->ePCIeH2D); + + free(A_gpu->ePCIeD2H_Start); + free(A_gpu->ePCIeD2H_End); + + checkCuda(cudaFree(A_gpu->UrowindVec)); + checkCuda(cudaFree(A_gpu->UrowindPtr)); + + free(A_gpu->UrowindPtr_host); + + checkCuda(cudaFree(A_gpu->UnzvalVec)); + checkCuda(cudaFree(A_gpu->UnzvalPtr)); + + checkCuda(cudaFree(A_gpu->grid)); + + + + checkCuda(cudaFree(A_gpu->scubufs[streamId].bigV)); + checkCuda(cudaFree(A_gpu->scubufs[streamId].bigU)); + + checkCuda(cudaFree(A_gpu->scubufs[streamId].Remain_L_buff)); + checkCuda(cudaFree(A_gpu->scubufs[streamId].Ublock_info)); + checkCuda(cudaFree(A_gpu->scubufs[streamId].Remain_info)); + + // checkCuda(cudaFree(A_gpu->indirect)); + // checkCuda(cudaFree(A_gpu->indirect2)); + checkCuda(cudaFree(A_gpu->xsup)); + + checkCuda(cudaFree(A_gpu->scubufs[streamId].lsub)); + checkCuda(cudaFree(A_gpu->scubufs[streamId].usub)); + + + checkCuda(cudaFree(A_gpu->local_l_blk_infoVec)); + checkCuda(cudaFree(A_gpu->local_l_blk_infoPtr)); + checkCuda(cudaFree(A_gpu->jib_lookupVec)); + checkCuda(cudaFree(A_gpu->jib_lookupPtr)); + checkCuda(cudaFree(A_gpu->local_u_blk_infoVec)); + checkCuda(cudaFree(A_gpu->local_u_blk_infoPtr)); + checkCuda(cudaFree(A_gpu->ijb_lookupVec)); + checkCuda(cudaFree(A_gpu->ijb_lookupPtr)); + + return 0; +} + + + +void zPrint_matrix( char *desc, int_t m, int_t n, doublecomplex * dA, int_t lda ) +{ + doublecomplex *cPtr = (doublecomplex *) malloc(sizeof(doublecomplex) * lda * 
n); + checkCuda(cudaMemcpy( cPtr, dA, + lda * n * sizeof(doublecomplex), cudaMemcpyDeviceToHost)) ; + + int_t i, j; + printf( "\n %s\n", desc ); + for ( i = 0; i < m; i++ ) + { + for ( j = 0; j < n; j++ ) printf( " %.3e", cPtr[i + j * lda] ); + printf( "\n" ); + } + free(cPtr); +} + +void zprintGPUStats(zLUstruct_gpu_t * A_gpu) +{ + double tGemm = 0; + double tScatter = 0; + double tPCIeH2D = 0; + double tPCIeD2H = 0; + + for (int_t i = 0; i < A_gpu->nsupers; ++i) + { + float milliseconds = 0; + + if (A_gpu->isOffloaded[i]) + { + cudaEventElapsedTime(&milliseconds, A_gpu->ePCIeH2D[i], A_gpu->GemmStart[i]); + tPCIeH2D += 1e-3 * (double) milliseconds; + milliseconds = 0; + cudaEventElapsedTime(&milliseconds, A_gpu->GemmStart[i], A_gpu->GemmEnd[i]); + tGemm += 1e-3 * (double) milliseconds; + milliseconds = 0; + cudaEventElapsedTime(&milliseconds, A_gpu->GemmEnd[i], A_gpu->ScatterEnd[i]); + tScatter += 1e-3 * (double) milliseconds; + } + + milliseconds = 0; + cudaEventElapsedTime(&milliseconds, A_gpu->ePCIeD2H_Start[i], A_gpu->ePCIeD2H_End[i]); + tPCIeD2H += 1e-3 * (double) milliseconds; + } + + printf("GPU: Flops offloaded %.3e Time spent %lf Flop rate %lf GF/sec \n", + A_gpu->GemmFLOPCounter, tGemm, 1e-9 * A_gpu->GemmFLOPCounter / tGemm ); + printf("GPU: Mop offloaded %.3e Time spent %lf Bandwidth %lf GByte/sec \n", + A_gpu->ScatterMOPCounter, tScatter, 8e-9 * A_gpu->ScatterMOPCounter / tScatter ); + printf("PCIe Data Transfer H2D:\n\tData Sent %.3e(GB)\n\tTime observed from CPU %lf\n\tActual time spent %lf\n\tBandwidth %lf GByte/sec \n", + 1e-9 * A_gpu->cPCIeH2D, A_gpu->tHost_PCIeH2D, tPCIeH2D, 1e-9 * A_gpu->cPCIeH2D / tPCIeH2D ); + printf("PCIe Data Transfer D2H:\n\tData Sent %.3e(GB)\n\tTime observed from CPU %lf\n\tActual time spent %lf\n\tBandwidth %lf GByte/sec \n", + 1e-9 * A_gpu->cPCIeD2H, A_gpu->tHost_PCIeD2H, tPCIeD2H, 1e-9 * A_gpu->cPCIeD2H / tPCIeD2H ); + fflush(stdout); + +} /* end printGPUStats */ + + +int zinitSluGPU3D_t( + zsluGPU_t *sluGPU, + zLUstruct_t *LUstruct, + gridinfo3d_t * grid3d, + int_t* perm_c_supno, + int_t n, + int_t buffer_size, /* read from env variable MAX_BUFFER_SIZE */ + int_t bigu_size, + int_t ldt /* NSUP read from sp_ienv(3) */ +) +{ + gridinfo_t* grid = &(grid3d->grid2d); + checkCudaErrors(cudaDeviceReset ()) ; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + zLocalLU_t *Llu = LUstruct->Llu; + int_t* isNodeInMyGrid = sluGPU->isNodeInMyGrid; + + sluGPU->nCudaStreams = getnCudaStreams(); + if (grid3d->iam == 0) + { + printf("zinitSluGPU3D_t: Using hardware acceleration, with %d cuda streams \n", sluGPU->nCudaStreams); + fflush(stdout); + if ( MAX_SUPER_SIZE < ldt ) + { + ABORT("MAX_SUPER_SIZE smaller than requested NSUP"); + } + } + + cudaStreamCreate(&(sluGPU->CopyStream)); + + for (int streamId = 0; streamId < sluGPU->nCudaStreams; streamId++) + { + cudaStreamCreate(&(sluGPU->funCallStreams[streamId])); + cublasCreate(&(sluGPU->cublasHandles[streamId])); + sluGPU->lastOffloadStream[streamId] = -1; + } + + sluGPU->A_gpu = (zLUstruct_gpu_t *) malloc (sizeof(zLUstruct_gpu_t)); + sluGPU->A_gpu->perm_c_supno = perm_c_supno; + zCopyLUToGPU3D ( isNodeInMyGrid, + Llu, /* referred to as A_host */ + sluGPU, Glu_persist, n, grid3d, buffer_size, bigu_size, ldt + ); + + return 0; +} /* end zinitSluGPU3D_t */ + +int zinitD2Hreduce( + int next_k, d2Hreduce_t* d2Hred, int last_flag, HyP_t* HyP, + zsluGPU_t *sluGPU, gridinfo_t *grid, zLUstruct_t *LUstruct, SCT_t* SCT +) +{ + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + zLocalLU_t *Llu = LUstruct->Llu; + 
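+    /* What follows gathers everything the D2H reduction step needs for the
+       next supernode: the local block numbers (kljb, kijb) of its L and U
+       panels, whether each panel was last touched by a GPU offload (via the
+       dirty bits in HyP, synchronizing the owning CUDA stream if so), and
+       the copy lengths; the results are packed into d2Hred for use by
+       zsendLUpanelGPU2HOST() and zreduceGPUlu(). */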
int_t* xsup = Glu_persist->xsup; + int_t iam = grid->iam; + int_t myrow = MYROW (iam, grid); + int_t mycol = MYCOL (iam, grid); + int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + + + // int_t next_col = SUPERLU_MIN (k0 + num_look_aheads + 1, nsupers - 1); + // int_t next_k = perm_c_supno[next_col]; /* global block number for next colum*/ + int_t mkcol, mkrow; + + int_t kljb = LBj( next_k, grid ); /*local block number for next block*/ + int_t kijb = LBi( next_k, grid ); /*local block number for next block*/ + + int_t *kindexL ; /*for storing index vectors*/ + int_t *kindexU ; + mkrow = PROW (next_k, grid); + mkcol = PCOL (next_k, grid); + int_t ksup_size = SuperSize(next_k); + + int_t copyL_kljb = 0; + int_t copyU_kljb = 0; + int_t l_copy_len = 0; + int_t u_copy_len = 0; + + if (mkcol == mycol && Lrowind_bc_ptr[kljb] != NULL && last_flag) + { + if (HyP->Lblock_dirty_bit[kljb] > -1) + { + copyL_kljb = 1; + int_t lastk0 = HyP->Lblock_dirty_bit[kljb]; + int_t streamIdk0Offload = lastk0 % sluGPU->nCudaStreams; + if (sluGPU->lastOffloadStream[streamIdk0Offload] == lastk0 && lastk0 != -1) + { + // printf("Waiting for Offload =%d to finish StreamId=%d\n", lastk0, streamIdk0Offload); + double ttx = SuperLU_timer_(); + cudaStreamSynchronize(sluGPU->funCallStreams[streamIdk0Offload]); + SCT->PhiWaitTimer += SuperLU_timer_() - ttx; + sluGPU->lastOffloadStream[streamIdk0Offload] = -1; + } + } + + kindexL = Lrowind_bc_ptr[kljb]; + l_copy_len = kindexL[1] * ksup_size; + } + + if ( mkrow == myrow && Ufstnz_br_ptr[kijb] != NULL && last_flag ) + { + if (HyP->Ublock_dirty_bit[kijb] > -1) + { + copyU_kljb = 1; + int_t lastk0 = HyP->Ublock_dirty_bit[kijb]; + int_t streamIdk0Offload = lastk0 % sluGPU->nCudaStreams; + if (sluGPU->lastOffloadStream[streamIdk0Offload] == lastk0 && lastk0 != -1) + { + // printf("Waiting for Offload =%d to finish StreamId=%d\n", lastk0, streamIdk0Offload); + double ttx = SuperLU_timer_(); + cudaStreamSynchronize(sluGPU->funCallStreams[streamIdk0Offload]); + SCT->PhiWaitTimer += SuperLU_timer_() - ttx; + sluGPU->lastOffloadStream[streamIdk0Offload] = -1; + } + } + // copyU_kljb = HyP->Ublock_dirty_bit[kijb]>-1? 
1: 0; + kindexU = Ufstnz_br_ptr[kijb]; + u_copy_len = kindexU[1]; + } + + // wait for streams if they have not been finished + + // d2Hred->next_col = next_col; + d2Hred->next_k = next_k; + d2Hred->kljb = kljb; + d2Hred->kijb = kijb; + d2Hred->copyL_kljb = copyL_kljb; + d2Hred->copyU_kljb = copyU_kljb; + d2Hred->l_copy_len = l_copy_len; + d2Hred->u_copy_len = u_copy_len; + d2Hred->kindexU = kindexU; + d2Hred->kindexL = kindexL; + d2Hred->mkrow = mkrow; + d2Hred->mkcol = mkcol; + d2Hred->ksup_size = ksup_size; + return 0; +} /* zinitD2Hreduce */ + +int zreduceGPUlu( + int last_flag, + d2Hreduce_t* d2Hred, + zsluGPU_t *sluGPU, + SCT_t *SCT, + gridinfo_t *grid, + zLUstruct_t *LUstruct +) +{ + zLocalLU_t *Llu = LUstruct->Llu; + int iam = grid->iam; + int_t myrow = MYROW (iam, grid); + int_t mycol = MYCOL (iam, grid); + int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + doublecomplex** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + doublecomplex** Unzval_br_ptr = Llu->Unzval_br_ptr; + + cudaStream_t CopyStream; + zLUstruct_gpu_t *A_gpu; + A_gpu = sluGPU->A_gpu; + CopyStream = sluGPU->CopyStream; + + int_t kljb = d2Hred->kljb; + int_t kijb = d2Hred->kijb; + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + int_t mkrow = d2Hred->mkrow; + int_t mkcol = d2Hred->mkcol; + int_t ksup_size = d2Hred->ksup_size; + int_t *kindex; + if ((copyL_kljb || copyU_kljb) && last_flag ) + { + double ttx = SuperLU_timer_(); + cudaStreamSynchronize(CopyStream); + SCT->PhiWaitTimer_2 += SuperLU_timer_() - ttx; + } + + double tt_start = SuperLU_timer_(); + + if (last_flag) { + if (mkcol == mycol && Lrowind_bc_ptr[kljb] != NULL ) + { + kindex = Lrowind_bc_ptr[kljb]; + int_t len = kindex[1]; + + if (copyL_kljb) + { + doublecomplex *nzval_host; + nzval_host = Lnzval_bc_ptr[kljb]; + int_t llen = ksup_size * len; + doublecomplex alpha = {1.0, 0.0}; + superlu_zaxpy (llen, alpha, A_gpu->acc_L_buff, 1, nzval_host, 1); + } + + } + } + if (last_flag) { + if (mkrow == myrow && Ufstnz_br_ptr[kijb] != NULL ) + { + kindex = Ufstnz_br_ptr[kijb]; + int_t len = kindex[1]; + + if (copyU_kljb) + { + doublecomplex *nzval_host; + nzval_host = Unzval_br_ptr[kijb]; + + doublecomplex alpha = {1.0, 0.0}; + superlu_zaxpy (len, alpha, A_gpu->acc_U_buff, 1, nzval_host, 1); + } + } + } + + double tt_end = SuperLU_timer_(); + SCT->AssemblyTimer += tt_end - tt_start; + return 0; +} /* zreduceGPUlu */ + + +int zwaitGPUscu(int_t streamId, zsluGPU_t *sluGPU, SCT_t *SCT) +{ + double ttx = SuperLU_timer_(); + cudaStreamSynchronize(sluGPU->funCallStreams[streamId]); + SCT->PhiWaitTimer += SuperLU_timer_() - ttx; + return 0; +} + +int_t zsendLUpanelGPU2HOST( + int_t k0, + d2Hreduce_t* d2Hred, + zsluGPU_t *sluGPU +) +{ + int_t kljb = d2Hred->kljb; + int_t kijb = d2Hred->kijb; + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + int_t l_copy_len = d2Hred->l_copy_len; + int_t u_copy_len = d2Hred->u_copy_len; + cudaStream_t CopyStream = sluGPU->CopyStream;; + zLUstruct_gpu_t *A_gpu = sluGPU->A_gpu; + double tty = SuperLU_timer_(); + cudaEventRecord(A_gpu->ePCIeD2H_Start[k0], CopyStream); + if (copyL_kljb) + checkCuda(cudaMemcpyAsync(A_gpu->acc_L_buff, &A_gpu->LnzvalVec[A_gpu->LnzvalPtr_host[kljb]], + l_copy_len * sizeof(doublecomplex), cudaMemcpyDeviceToHost, CopyStream ) ); + + if (copyU_kljb) + checkCuda(cudaMemcpyAsync(A_gpu->acc_U_buff, &A_gpu->UnzvalVec[A_gpu->UnzvalPtr_host[kijb]], + u_copy_len * sizeof(doublecomplex), cudaMemcpyDeviceToHost, CopyStream ) ); 
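+    /* Both copies above are asynchronous on CopyStream; the End event
+       recorded next pairs with ePCIeD2H_Start so that zprintGPUStats()
+       can measure the actual transfer time with cudaEventElapsedTime(). */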
+ cudaEventRecord(A_gpu->ePCIeD2H_End[k0], CopyStream); + A_gpu->tHost_PCIeD2H += SuperLU_timer_() - tty; + A_gpu->cPCIeD2H += u_copy_len * sizeof(doublecomplex) + l_copy_len * sizeof(doublecomplex); + + return 0; +} + +/* Copy L and U panel data structures from host to the host part of the + data structures in A_gpu. + GPU is not involved in this routine. */ +int zsendSCUdataHost2GPU( + int_t streamId, + int_t* lsub, + int_t* usub, + doublecomplex* bigU, + int_t bigu_send_size, + int_t Remain_lbuf_send_size, + zsluGPU_t *sluGPU, + HyP_t* HyP +) +{ + //{printf("....[enter] zsendSCUdataHost2GPU, bigu_send_size %d\n", bigu_send_size); fflush(stdout);} + + int_t usub_len = usub[2]; + int_t lsub_len = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR; + //{printf("....[2] in zsendSCUdataHost2GPU, lsub_len %d\n", lsub_len); fflush(stdout);} + zLUstruct_gpu_t *A_gpu = sluGPU->A_gpu; + memcpy(A_gpu->scubufs[streamId].lsub_buf, lsub, sizeof(int_t)*lsub_len); + memcpy(A_gpu->scubufs[streamId].usub_buf, usub, sizeof(int_t)*usub_len); + memcpy(A_gpu->scubufs[streamId].Remain_info_host, HyP->Remain_info, + sizeof(Remain_info_t)*HyP->RemainBlk); + memcpy(A_gpu->scubufs[streamId].Ublock_info_host, HyP->Ublock_info_Phi, + sizeof(Ublock_info_t)*HyP->num_u_blks_Phi); + memcpy(A_gpu->scubufs[streamId].Remain_L_buff_host, HyP->Remain_L_buff, + sizeof(doublecomplex)*Remain_lbuf_send_size); + memcpy(A_gpu->scubufs[streamId].bigU_host, bigU, + sizeof(doublecomplex)*bigu_send_size); + + return 0; +} + +/* Sherry: not used ?*/ +#if 0 +int freeSluGPU(zsluGPU_t *sluGPU) +{ + return 0; +} +#endif + +void zCopyLUToGPU3D ( + int_t* isNodeInMyGrid, + zLocalLU_t *A_host, /* distributed LU structure on host */ + zsluGPU_t *sluGPU, + Glu_persist_t *Glu_persist, int_t n, + gridinfo3d_t *grid3d, + int_t buffer_size, /* bigV size on GPU for Schur complement update */ + int_t bigu_size, + int_t ldt +) +{ + gridinfo_t* grid = &(grid3d->grid2d); + zLUstruct_gpu_t * A_gpu = sluGPU->A_gpu; + zLUstruct_gpu_t **dA_gpu = &(sluGPU->dA_gpu); + +#if ( PRNTlevel>=1 ) + if ( grid3d->iam == 0 ) print_occupancy(); +#endif + +#ifdef GPU_DEBUG + // if ( grid3d->iam == 0 ) + { + cudaDeviceProp devProp; + cudaGetDeviceProperties(&devProp, 0); + printDevProp(devProp); + } +#endif + int_t *xsup ; + xsup = Glu_persist->xsup; + int iam = grid->iam; + int nsupers = Glu_persist->supno[n - 1] + 1; + int_t Pc = grid->npcol; + int_t Pr = grid->nprow; + int_t myrow = MYROW (iam, grid); + int_t mycol = MYCOL (iam, grid); + int_t mrb = (nsupers + Pr - 1) / Pr; + int_t mcb = (nsupers + Pc - 1) / Pc; + int_t remain_l_max = A_host->bufmax[1]; + + /*copies of scalars for easy access*/ + A_gpu->nsupers = nsupers; + A_gpu->ScatterMOPCounter = 0; + A_gpu->GemmFLOPCounter = 0; + A_gpu->cPCIeH2D = 0; + A_gpu->cPCIeD2H = 0; + A_gpu->tHost_PCIeH2D = 0; + A_gpu->tHost_PCIeD2H = 0; + + /*initializing memory*/ + size_t max_gpu_memory = get_acc_memory (); + size_t gpu_mem_used = 0; + + void *tmp_ptr; + + A_gpu->xsup_host = xsup; + + int_t nCudaStreams = sluGPU->nCudaStreams; + /*pinned memory allocations. 
+ Paged-locked memory by cudaMallocHost is accessible to the device.*/ + for (int streamId = 0; streamId < nCudaStreams; streamId++ ) { + void *tmp_ptr; + checkCudaErrors(cudaMallocHost( &tmp_ptr, (n) * sizeof(int_t) )) ; + A_gpu->scubufs[streamId].usub_IndirectJ3_host = (int_t*) tmp_ptr; + + checkCudaErrors(cudaMalloc( &tmp_ptr, ( n) * sizeof(int_t) )); + A_gpu->scubufs[streamId].usub_IndirectJ3 = (int_t*) tmp_ptr; + gpu_mem_used += ( n) * sizeof(int_t); + checkCudaErrors(cudaMallocHost( &tmp_ptr, mrb * sizeof(Remain_info_t) )) ; + A_gpu->scubufs[streamId].Remain_info_host = (Remain_info_t*)tmp_ptr; + checkCudaErrors(cudaMallocHost( &tmp_ptr, mcb * sizeof(Ublock_info_t) )) ; + A_gpu->scubufs[streamId].Ublock_info_host = (Ublock_info_t*)tmp_ptr; + checkCudaErrors(cudaMallocHost( &tmp_ptr, remain_l_max * sizeof(doublecomplex) )) ; + A_gpu->scubufs[streamId].Remain_L_buff_host = (doublecomplex *) tmp_ptr; + checkCudaErrors(cudaMallocHost( &tmp_ptr, bigu_size * sizeof(doublecomplex) )) ; + A_gpu->scubufs[streamId].bigU_host = (doublecomplex *) tmp_ptr; + + checkCudaErrors(cudaMallocHost ( &tmp_ptr, sizeof(doublecomplex) * (A_host->bufmax[1]))); + A_gpu->acc_L_buff = (doublecomplex *) tmp_ptr; + checkCudaErrors(cudaMallocHost ( &tmp_ptr, sizeof(doublecomplex) * (A_host->bufmax[3]))); + A_gpu->acc_U_buff = (doublecomplex *) tmp_ptr; + checkCudaErrors(cudaMallocHost ( &tmp_ptr, sizeof(int_t) * (A_host->bufmax[0]))); + A_gpu->scubufs[streamId].lsub_buf = (int_t *) tmp_ptr; + checkCudaErrors(cudaMallocHost ( &tmp_ptr, sizeof(int_t) * (A_host->bufmax[2]))); + A_gpu->scubufs[streamId].usub_buf = (int_t *) tmp_ptr; + + checkCudaErrors(cudaMalloc( &tmp_ptr, remain_l_max * sizeof(doublecomplex) )) ; + A_gpu->scubufs[streamId].Remain_L_buff = (doublecomplex *) tmp_ptr; + gpu_mem_used += remain_l_max * sizeof(doublecomplex); + checkCudaErrors(cudaMalloc( &tmp_ptr, bigu_size * sizeof(doublecomplex) )) ; + A_gpu->scubufs[streamId].bigU = (doublecomplex *) tmp_ptr; + gpu_mem_used += bigu_size * sizeof(doublecomplex); + checkCudaErrors(cudaMalloc( &tmp_ptr, mcb * sizeof(Ublock_info_t) )) ; + A_gpu->scubufs[streamId].Ublock_info = (Ublock_info_t *) tmp_ptr; + gpu_mem_used += mcb * sizeof(Ublock_info_t); + checkCudaErrors(cudaMalloc( &tmp_ptr, mrb * sizeof(Remain_info_t) )) ; + A_gpu->scubufs[streamId].Remain_info = (Remain_info_t *) tmp_ptr; + gpu_mem_used += mrb * sizeof(Remain_info_t); + checkCudaErrors(cudaMalloc( &tmp_ptr, buffer_size * sizeof(doublecomplex))) ; + A_gpu->scubufs[streamId].bigV = (doublecomplex *) tmp_ptr; + gpu_mem_used += buffer_size * sizeof(doublecomplex); + checkCudaErrors(cudaMalloc( &tmp_ptr, A_host->bufmax[0]*sizeof(int_t))) ; + A_gpu->scubufs[streamId].lsub = (int_t *) tmp_ptr; + gpu_mem_used += A_host->bufmax[0] * sizeof(int_t); + checkCudaErrors(cudaMalloc( &tmp_ptr, A_host->bufmax[2]*sizeof(int_t))) ; + A_gpu->scubufs[streamId].usub = (int_t *) tmp_ptr; + gpu_mem_used += A_host->bufmax[2] * sizeof(int_t); + + } /* endfor streamID ... 
allocate paged-locked memory */ + + A_gpu->isOffloaded = (int *) SUPERLU_MALLOC (sizeof(int) * nsupers); + A_gpu->GemmStart = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers); + A_gpu->GemmEnd = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers); + A_gpu->ScatterEnd = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers); + A_gpu->ePCIeH2D = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers); + A_gpu->ePCIeD2H_Start = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers); + A_gpu->ePCIeD2H_End = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers); + + for (int i = 0; i < nsupers; ++i) + { + A_gpu->isOffloaded[i] = 0; + checkCudaErrors(cudaEventCreate(&(A_gpu->GemmStart[i]))); + checkCudaErrors(cudaEventCreate(&(A_gpu->GemmEnd[i]))); + checkCudaErrors(cudaEventCreate(&(A_gpu->ScatterEnd[i]))); + checkCudaErrors(cudaEventCreate(&(A_gpu->ePCIeH2D[i]))); + checkCudaErrors(cudaEventCreate(&(A_gpu->ePCIeH2D[i]))); + checkCudaErrors(cudaEventCreate(&(A_gpu->ePCIeD2H_Start[i]))); + checkCudaErrors(cudaEventCreate(&(A_gpu->ePCIeD2H_End[i]))); + } + + /*---- Copy L data structure to GPU ----*/ + + /*pointers and address of local blocks for easy accessibility */ + local_l_blk_info_t *local_l_blk_infoVec; + int_t * local_l_blk_infoPtr; + local_l_blk_infoPtr = (int_t *) malloc( CEILING(nsupers, Pc) * sizeof(int_t ) ); + + /* First pass: count total L blocks */ + int_t cum_num_l_blocks = 0; /* total number of L blocks I own */ + for (int_t i = 0; i < CEILING(nsupers, Pc); ++i) + { + /* going through each block column I own */ + + if (A_host->Lrowind_bc_ptr[i] != NULL && isNodeInMyGrid[i * Pc + mycol] == 1) + { + int_t *index = A_host->Lrowind_bc_ptr[i]; + int_t num_l_blocks = index[0]; + cum_num_l_blocks += num_l_blocks; + } + } + + /*allocating memory*/ + local_l_blk_infoVec = (local_l_blk_info_t *) malloc(cum_num_l_blocks * sizeof(local_l_blk_info_t)); + + /* Second pass: set up the meta-data for the L structure */ + cum_num_l_blocks = 0; + + /*initialzing vectors */ + for (int_t i = 0; i < CEILING(nsupers, Pc); ++i) + { + if (A_host->Lrowind_bc_ptr[i] != NULL && isNodeInMyGrid[i * Pc + mycol] == 1) + { + int_t *index = A_host->Lrowind_bc_ptr[i]; + int_t num_l_blocks = index[0]; /* # L blocks in this column */ + + if (num_l_blocks > 0) + { + + local_l_blk_info_t *local_l_blk_info_i = local_l_blk_infoVec + cum_num_l_blocks; + local_l_blk_infoPtr[i] = cum_num_l_blocks; + + int_t lptrj = BC_HEADER; + int_t luptrj = 0; + + for (int_t j = 0; j < num_l_blocks ; ++j) + { + + int_t ijb = index[lptrj]; + + local_l_blk_info_i[j].lib = ijb / Pr; + local_l_blk_info_i[j].lptrj = lptrj; + local_l_blk_info_i[j].luptrj = luptrj; + luptrj += index[lptrj + 1]; + lptrj += LB_DESCRIPTOR + index[lptrj + 1]; + + } + } + cum_num_l_blocks += num_l_blocks; + } + + } /* endfor all block columns */ + + /* Allocate L memory on GPU, and copy the values from CPU to GPU */ + checkCudaErrors(cudaMalloc( &tmp_ptr, cum_num_l_blocks * sizeof(local_l_blk_info_t))) ; + A_gpu->local_l_blk_infoVec = (local_l_blk_info_t *) tmp_ptr; + gpu_mem_used += cum_num_l_blocks * sizeof(local_l_blk_info_t); + checkCudaErrors(cudaMemcpy( (A_gpu->local_l_blk_infoVec), local_l_blk_infoVec, cum_num_l_blocks * sizeof(local_l_blk_info_t), cudaMemcpyHostToDevice)) ; + + checkCudaErrors(cudaMalloc( &tmp_ptr, CEILING(nsupers, Pc)*sizeof(int_t))) ; + A_gpu->local_l_blk_infoPtr = (int_t *) tmp_ptr; + gpu_mem_used += CEILING(nsupers, Pc) * sizeof(int_t); + checkCudaErrors(cudaMemcpy( (A_gpu->local_l_blk_infoPtr), local_l_blk_infoPtr, CEILING(nsupers, 
Pc)*sizeof(int_t), cudaMemcpyHostToDevice)) ; + + /*---- Copy U data structure to GPU ----*/ + + local_u_blk_info_t *local_u_blk_infoVec; + int_t * local_u_blk_infoPtr; + local_u_blk_infoPtr = (int_t *) malloc( CEILING(nsupers, Pr) * sizeof(int_t ) ); + + /* First pass: count total U blocks */ + int_t cum_num_u_blocks = 0; + + for (int_t i = 0; i < CEILING(nsupers, Pr); ++i) + { + + if (A_host->Ufstnz_br_ptr[i] != NULL && isNodeInMyGrid[i * Pr + myrow] == 1) + { + int_t *index = A_host->Ufstnz_br_ptr[i]; + int_t num_u_blocks = index[0]; + cum_num_u_blocks += num_u_blocks; + + } + } + + local_u_blk_infoVec = (local_u_blk_info_t *) malloc(cum_num_u_blocks * sizeof(local_u_blk_info_t)); + + /* Second pass: set up the meta-data for the U structure */ + cum_num_u_blocks = 0; + + for (int_t i = 0; i < CEILING(nsupers, Pr); ++i) + { + if (A_host->Ufstnz_br_ptr[i] != NULL && isNodeInMyGrid[i * Pr + myrow] == 1) + { + int_t *index = A_host->Ufstnz_br_ptr[i]; + int_t num_u_blocks = index[0]; + + if (num_u_blocks > 0) + { + local_u_blk_info_t *local_u_blk_info_i = local_u_blk_infoVec + cum_num_u_blocks; + local_u_blk_infoPtr[i] = cum_num_u_blocks; + + int_t iuip_lib, ruip_lib; + iuip_lib = BR_HEADER; + ruip_lib = 0; + + for (int_t j = 0; j < num_u_blocks ; ++j) + { + + int_t ijb = index[iuip_lib]; + local_u_blk_info_i[j].ljb = ijb / Pc; + local_u_blk_info_i[j].iuip = iuip_lib; + local_u_blk_info_i[j].ruip = ruip_lib; + + ruip_lib += index[iuip_lib + 1]; + iuip_lib += UB_DESCRIPTOR + SuperSize (ijb); + + } + } + cum_num_u_blocks += num_u_blocks; + } + } + + checkCudaErrors(cudaMalloc( &tmp_ptr, cum_num_u_blocks * sizeof(local_u_blk_info_t))) ; + A_gpu->local_u_blk_infoVec = (local_u_blk_info_t *) tmp_ptr; + gpu_mem_used += cum_num_u_blocks * sizeof(local_u_blk_info_t); + checkCudaErrors(cudaMemcpy( (A_gpu->local_u_blk_infoVec), local_u_blk_infoVec, cum_num_u_blocks * sizeof(local_u_blk_info_t), cudaMemcpyHostToDevice)) ; + + checkCudaErrors(cudaMalloc( &tmp_ptr, CEILING(nsupers, Pr)*sizeof(int_t))) ; + A_gpu->local_u_blk_infoPtr = (int_t *) tmp_ptr; + gpu_mem_used += CEILING(nsupers, Pr) * sizeof(int_t); + checkCudaErrors(cudaMemcpy( (A_gpu->local_u_blk_infoPtr), local_u_blk_infoPtr, CEILING(nsupers, Pr)*sizeof(int_t), cudaMemcpyHostToDevice)) ; + + /* Copy the actual L indices and values */ + int_t l_k = CEILING( nsupers, grid->npcol ); /* # of local block columns */ + int_t *temp_LrowindPtr = (int_t *) malloc(sizeof(int_t) * l_k); + int_t *temp_LnzvalPtr = (int_t *) malloc(sizeof(int_t) * l_k); + int_t *Lnzval_size = (int_t *) malloc(sizeof(int_t) * l_k); + int_t l_ind_len = 0; + int_t l_val_len = 0; + for (int_t jb = 0; jb < nsupers; ++jb) /* for each block column ... */ + { + int_t pc = PCOL( jb, grid ); + if (mycol == pc && isNodeInMyGrid[jb] == 1) + { + int_t ljb = LBj( jb, grid ); /* Local block number */ + int_t *index_host; + index_host = A_host->Lrowind_bc_ptr[ljb]; + + temp_LrowindPtr[ljb] = l_ind_len; + temp_LnzvalPtr[ljb] = l_val_len; // ### + Lnzval_size[ljb] = 0; //### + if (index_host != NULL) + { + int_t nrbl = index_host[0]; /* number of L blocks */ + int_t len = index_host[1]; /* LDA of the nzval[] */ + int_t len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; + + /* Global block number is mycol + ljb*Pc */ + int_t nsupc = SuperSize(jb); + + l_ind_len += len1; + l_val_len += len * nsupc; + Lnzval_size[ljb] = len * nsupc ; // ### + } + else + { + Lnzval_size[ljb] = 0 ; // ### + } + } + } /* endfor jb = 0 ... 
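(at this point l_ind_len and l_val_len hold the total index and value
footprint of my local block columns; the analogous sizing loop for U
follows below)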
*/ + + /* Copy the actual U indices and values */ + int_t u_k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ + int_t *temp_UrowindPtr = (int_t *) malloc(sizeof(int_t) * u_k); + int_t *temp_UnzvalPtr = (int_t *) malloc(sizeof(int_t) * u_k); + int_t *Unzval_size = (int_t *) malloc(sizeof(int_t) * u_k); + int_t u_ind_len = 0; + int_t u_val_len = 0; + for ( int_t lb = 0; lb < u_k; ++lb) + { + int_t *index_host; + index_host = A_host->Ufstnz_br_ptr[lb]; + temp_UrowindPtr[lb] = u_ind_len; + temp_UnzvalPtr[lb] = u_val_len; + Unzval_size[lb] = 0; + if (index_host != NULL && isNodeInMyGrid[lb * Pr + myrow] == 1) + { + int_t len = index_host[1]; + int_t len1 = index_host[2]; + + u_ind_len += len1; + u_val_len += len; + Unzval_size[lb] = len; + } + else + { + Unzval_size[lb] = 0; + } + } + + gpu_mem_used += l_ind_len * sizeof(int_t); + gpu_mem_used += 2 * l_k * sizeof(int_t); + gpu_mem_used += u_ind_len * sizeof(int_t); + gpu_mem_used += 2 * u_k * sizeof(int_t); + + /*left memory shall be divided among the two */ + + for (int_t i = 0; i < l_k; ++i) + { + temp_LnzvalPtr[i] = -1; + } + + for (int_t i = 0; i < u_k; ++i) + { + temp_UnzvalPtr[i] = -1; + } + + /*setting these pointers back */ + l_val_len = 0; + u_val_len = 0; + + int_t num_gpu_l_blocks = 0; + int_t num_gpu_u_blocks = 0; + size_t mem_l_block, mem_u_block; + + /* Find the trailing matrix size that can fit into GPU memory */ + for (int_t i = nsupers - 1; i > -1; --i) + { + /* ulte se chalte hai eleimination tree */ + /* bottom up ordering */ + int_t i_sup = A_gpu->perm_c_supno[i]; + + int_t pc = PCOL( i_sup, grid ); + if (isNodeInMyGrid[i_sup] == 1) + { + if (mycol == pc ) + { + int_t ljb = LBj(i_sup, grid); + mem_l_block = sizeof(doublecomplex) * Lnzval_size[ljb]; + if (gpu_mem_used + mem_l_block > max_gpu_memory) + { + break; + } + else + { + gpu_mem_used += mem_l_block; + temp_LnzvalPtr[ljb] = l_val_len; + l_val_len += Lnzval_size[ljb]; + num_gpu_l_blocks++; + A_gpu->first_l_block_gpu = i; + } + } + + int_t pr = PROW( i_sup, grid ); + if (myrow == pr) + { + int_t lib = LBi(i_sup, grid); + mem_u_block = sizeof(doublecomplex) * Unzval_size[lib]; + if (gpu_mem_used + mem_u_block > max_gpu_memory) + { + break; + } + else + { + gpu_mem_used += mem_u_block; + temp_UnzvalPtr[lib] = u_val_len; + u_val_len += Unzval_size[lib]; + num_gpu_u_blocks++; + A_gpu->first_u_block_gpu = i; + } + } + } /* endif */ + + } /* endfor i .... nsupers */ + +#if (PRNTlevel>=2) + printf("(%d) Number of L blocks in GPU %d, U blocks %d\n", + grid3d->iam, num_gpu_l_blocks, num_gpu_u_blocks ); + printf("(%d) elimination order of first block in GPU: L block %d, U block %d\n", + grid3d->iam, A_gpu->first_l_block_gpu, A_gpu->first_u_block_gpu); + printf("(%d) Memory of L %.1f GB, memory for U %.1f GB, Total device memory used %.1f GB, Memory allowed %.1f GB \n", grid3d->iam, + l_val_len * sizeof(doublecomplex) * 1e-9, + u_val_len * sizeof(doublecomplex) * 1e-9, + gpu_mem_used * 1e-9, max_gpu_memory * 1e-9); + fflush(stdout); +#endif + + /* Assemble index vector on temp */ + int_t *indtemp = (int_t *) malloc(sizeof(int_t) * l_ind_len); + for (int_t jb = 0; jb < nsupers; ++jb) /* for each block column ... 
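(each nonempty local L index array is memcpy'd into the contiguous host
buffer indtemp at its precomputed offset temp_LrowindPtr[ljb], so the
device gets one large cudaMemcpy instead of one per block column)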
*/ + { + int_t pc = PCOL( jb, grid ); + if (mycol == pc && isNodeInMyGrid[jb] == 1) + { + int_t ljb = LBj( jb, grid ); /* Local block number */ + int_t *index_host; + index_host = A_host->Lrowind_bc_ptr[ljb]; + + if (index_host != NULL) + { + int_t nrbl = index_host[0]; /* number of L blocks */ + int_t len = index_host[1]; /* LDA of the nzval[] */ + int_t len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; + + memcpy(&indtemp[temp_LrowindPtr[ljb]] , index_host, len1 * sizeof(int_t)) ; + } + } + } + + checkCudaErrors(cudaMalloc( &tmp_ptr, l_ind_len * sizeof(int_t))) ; + A_gpu->LrowindVec = (int_t *) tmp_ptr; + checkCudaErrors(cudaMemcpy( (A_gpu->LrowindVec), indtemp, l_ind_len * sizeof(int_t), cudaMemcpyHostToDevice)) ; + + checkCudaErrors(cudaMalloc( &tmp_ptr, l_val_len * sizeof(doublecomplex))); + A_gpu->LnzvalVec = (doublecomplex *) tmp_ptr; + checkCudaErrors(cudaMemset( (A_gpu->LnzvalVec), 0, l_val_len * sizeof(doublecomplex))); + + checkCudaErrors(cudaMalloc( &tmp_ptr, l_k * sizeof(int_t))) ; + A_gpu->LrowindPtr = (int_t *) tmp_ptr; + checkCudaErrors(cudaMemcpy( (A_gpu->LrowindPtr), temp_LrowindPtr, l_k * sizeof(int_t), cudaMemcpyHostToDevice)) ; + + checkCudaErrors(cudaMalloc( &tmp_ptr, l_k * sizeof(int_t))) ; + A_gpu->LnzvalPtr = (int_t *) tmp_ptr; + checkCudaErrors(cudaMemcpy( (A_gpu->LnzvalPtr), temp_LnzvalPtr, l_k * sizeof(int_t), cudaMemcpyHostToDevice)) ; + + A_gpu->LnzvalPtr_host = temp_LnzvalPtr; + + int_t *indtemp1 = (int_t *) malloc(sizeof(int_t) * u_ind_len); + for ( int_t lb = 0; lb < u_k; ++lb) + { + int_t *index_host; + index_host = A_host->Ufstnz_br_ptr[lb]; + + if (index_host != NULL && isNodeInMyGrid[lb * Pr + myrow] == 1) + { + int_t len1 = index_host[2]; + memcpy(&indtemp1[temp_UrowindPtr[lb]] , index_host, sizeof(int_t)*len1); + } + } + + checkCudaErrors(cudaMalloc( &tmp_ptr, u_ind_len * sizeof(int_t))) ; + A_gpu->UrowindVec = (int_t *) tmp_ptr; + checkCudaErrors(cudaMemcpy( (A_gpu->UrowindVec), indtemp1, u_ind_len * sizeof(int_t), cudaMemcpyHostToDevice)) ; + + checkCudaErrors(cudaMalloc( &tmp_ptr, u_val_len * sizeof(doublecomplex))); + A_gpu->UnzvalVec = (doublecomplex *) tmp_ptr; + checkCudaErrors(cudaMemset( (A_gpu->UnzvalVec), 0, u_val_len * sizeof(doublecomplex))); + + checkCudaErrors(cudaMalloc( &tmp_ptr, u_k * sizeof(int_t))) ; + A_gpu->UrowindPtr = (int_t *) tmp_ptr; + checkCudaErrors(cudaMemcpy( (A_gpu->UrowindPtr), temp_UrowindPtr, u_k * sizeof(int_t), cudaMemcpyHostToDevice)) ; + + A_gpu->UnzvalPtr_host = temp_UnzvalPtr; + + checkCudaErrors(cudaMalloc( &tmp_ptr, u_k * sizeof(int_t))) ; + A_gpu->UnzvalPtr = (int_t *) tmp_ptr; + checkCudaErrors(cudaMemcpy( (A_gpu->UnzvalPtr), temp_UnzvalPtr, u_k * sizeof(int_t), cudaMemcpyHostToDevice)) ; + + checkCudaErrors(cudaMalloc( &tmp_ptr, (nsupers + 1)*sizeof(int_t))) ; + A_gpu->xsup = (int_t *) tmp_ptr; + checkCudaErrors(cudaMemcpy( (A_gpu->xsup), xsup, (nsupers + 1)*sizeof(int_t), cudaMemcpyHostToDevice)) ; + + checkCudaErrors(cudaMalloc( &tmp_ptr, sizeof(zLUstruct_gpu_t))) ; + *dA_gpu = (zLUstruct_gpu_t *) tmp_ptr; + checkCudaErrors(cudaMemcpy( *dA_gpu, A_gpu, sizeof(zLUstruct_gpu_t), cudaMemcpyHostToDevice)) ; + + free (temp_LrowindPtr); + free (temp_UrowindPtr); + free (indtemp1); + free (indtemp); + +} /* end zCopyLUToGPU3D */ + + + +int zreduceAllAncestors3d_GPU(int_t ilvl, int_t* myNodeCount, + int_t** treePerm, + zLUValSubBuf_t*LUvsb, + zLUstruct_t* LUstruct, + gridinfo3d_t* grid3d, + zsluGPU_t *sluGPU, + d2Hreduce_t* d2Hred, + factStat_t *factStat, + HyP_t* HyP, SCT_t* SCT ) +{ + // first synchronize all cuda 
streams
+    int superlu_acc_offload = HyP->superlu_acc_offload;
+
+    int_t maxLvl = log2i( (int_t) grid3d->zscp.Np) + 1;
+    int_t myGrid = grid3d->zscp.Iam;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    int_t* gpuLUreduced = factStat->gpuLUreduced;
+
+    int_t sender;
+    if ((myGrid % (1 << (ilvl + 1))) == 0)
+    {
+        sender = myGrid + (1 << ilvl);
+    }
+    else
+    {
+        sender = myGrid;
+    }
+
+    /*Reduce all the ancestors from the GPU*/
+    if (myGrid == sender && superlu_acc_offload)
+    {
+        for (int_t streamId = 0; streamId < sluGPU->nCudaStreams; streamId++)
+        {
+            double ttx = SuperLU_timer_();
+            cudaStreamSynchronize(sluGPU->funCallStreams[streamId]);
+            SCT->PhiWaitTimer += SuperLU_timer_() - ttx;
+            sluGPU->lastOffloadStream[streamId] = -1;
+        }
+
+        for (int_t alvl = ilvl + 1; alvl < maxLvl; ++alvl)
+        {
+            // int_t atree = myTreeIdxs[alvl];
+            int_t nsAncestor = myNodeCount[alvl];
+            int_t* cAncestorList = treePerm[alvl];
+
+            for (int_t node = 0; node < nsAncestor; node++ )
+            {
+                int_t k = cAncestorList[node];
+                if (!gpuLUreduced[k])
+                {
+                    zinitD2Hreduce(k, d2Hred, 1,
+                                   HyP, sluGPU, grid, LUstruct, SCT);
+                    int_t copyL_kljb = d2Hred->copyL_kljb;
+                    int_t copyU_kljb = d2Hred->copyU_kljb;
+
+                    double tt_start1 = SuperLU_timer_();
+                    if (copyL_kljb || copyU_kljb) SCT->PhiMemCpyCounter++;
+                    zsendLUpanelGPU2HOST(k, d2Hred, sluGPU);
+                    /*
+                       Reduce the LU panels from GPU
+                     */
+                    zreduceGPUlu(1, d2Hred, sluGPU, SCT, grid, LUstruct);
+                    gpuLUreduced[k] = 1;
+                    SCT->PhiMemCpyTimer += SuperLU_timer_() - tt_start1;
+                }
+            }
+        }
+    } /*if (myGrid == sender)*/
+
+    zreduceAllAncestors3d(ilvl, myNodeCount, treePerm,
+                          LUvsb, LUstruct, grid3d, SCT );
+    return 0;
+} /* zreduceAllAncestors3d_GPU */
+
+
+void zsyncAllfunCallStreams(zsluGPU_t* sluGPU, SCT_t* SCT)
+{
+    for (int streamId = 0; streamId < sluGPU->nCudaStreams; streamId++)
+    {
+        double ttx = SuperLU_timer_();
+        cudaStreamSynchronize(sluGPU->funCallStreams[streamId]);
+        SCT->PhiWaitTimer += SuperLU_timer_() - ttx;
+        sluGPU->lastOffloadStream[streamId] = -1;
+    }
+}

From 99267f4bd7c2d3e7a73fede66863b14b3615adc4 Mon Sep 17 00:00:00 2001
From: Xiaoye Li
Date: Mon, 5 Apr 2021 19:14:55 -0400
Subject: [PATCH 072/147] Added 4 more new precision-dependent files.
--- SRC/Makefile | 4 +- SRC/dlustruct_gpu.h | 248 ++++++++++++ SRC/dtreeFactorizationGPU.c | 734 ++++++++++++++++++++++++++++++++++++ SRC/zlustruct_gpu.h | 247 ++++++++++++ SRC/ztreeFactorizationGPU.c | 733 +++++++++++++++++++++++++++++++++++ 5 files changed, 1964 insertions(+), 2 deletions(-) create mode 100644 SRC/dlustruct_gpu.h create mode 100644 SRC/dtreeFactorizationGPU.c create mode 100644 SRC/zlustruct_gpu.h create mode 100644 SRC/ztreeFactorizationGPU.c diff --git a/SRC/Makefile b/SRC/Makefile index b2d64a1b..c5709f55 100644 --- a/SRC/Makefile +++ b/SRC/Makefile @@ -71,7 +71,7 @@ DPLUSRC = pdgssvx.o pdgssvx_ABglobal.o \ # from 3D code DPLUSRC += pdgssvx3d.o pdgstrf3d.o dtreeFactorization.o dscatter3d.o \ dgather.o pd3dcomm.o dtrfAux.o dcommunication_aux.o dtrfCommWrapper.o \ - dnrformat_loc3d.o ##$(FACT3D) + dnrformat_loc3d.o dtreeFactorizationGPU.o ##$(FACT3D) # # Routines for double complex parallel SuperLU @@ -86,7 +86,7 @@ ZPLUSRC = pzgssvx.o pzgssvx_ABglobal.o \ # from 3D code ZPLUSRC += pzgssvx3d.o pzgstrf3d.o ztreeFactorization.o zscatter3d.o \ zgather.o pz3dcomm.o ztrfAux.o zcommunication_aux.o ztrfCommWrapper.o \ - znrformat_loc3d.o ##$(FACT3D) + znrformat_loc3d.o ztreeFactorizationGPU.o ##$(FACT3D) ifeq ($(HAVE_COMBBLAS),TRUE) DPLUSRC += d_c2cpp_GetHWPM.o diff --git a/SRC/dlustruct_gpu.h b/SRC/dlustruct_gpu.h new file mode 100644 index 00000000..bfe3e527 --- /dev/null +++ b/SRC/dlustruct_gpu.h @@ -0,0 +1,248 @@ + + +/*! @file + * \brief Descriptions and declarations for structures used in GPU + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley,
+ * Georgia Institute of Technology, Oak Ridge National Laboratory
+ * March 14, 2021 version 7.0.0
+ * </pre>
+ */ + +#pragma once // so that this header file is included onle once + +// #ifdef DEBUG +// #include +// #endif +// #include +// #include "mkl.h" + +// #define USE_VENDOR_BLAS + +#include +#include +#include "superlu_ddefs.h" +// #include "sec_structs.h" +// #include "supernodal_etree.h" + +/* Constants */ +//#define SLU_TARGET_GPU 0 +//#define MAX_BLOCK_SIZE 10000 +#define MAX_NCUDA_STREAMS 32 + +static +void check(cudaError_t result, char const *const func, const char *const file, int_t const line) +{ + if (result) + { + fprintf(stderr, "CUDA error at file %s: line %d code=(%s) \"%s\" \n", + file, line, cudaGetErrorString(result), func); + + // Make sure we call CUDA Device Reset before exiting + exit(EXIT_FAILURE); + } +} + +#define checkCudaErrors(val) check ( (val), #val, __FILE__, __LINE__ ) + +typedef struct //SCUbuf_gpu_ +{ + /*Informations for various buffers*/ + double *bigV; + double *bigU; + double *bigU_host; /*pinned location*/ + int_t *indirect; /*for indirect address calculations*/ + int_t *indirect2; /*for indirect address calculations*/ + + double *Remain_L_buff; /* on GPU */ + double *Remain_L_buff_host; /* Sherry: this memory is page-locked, why need another copy on GPU ? */ + + int_t *lsub; + int_t *usub; + + int_t *lsub_buf, *usub_buf; + + Ublock_info_t *Ublock_info; /* on GPU */ + Remain_info_t *Remain_info; + Ublock_info_t *Ublock_info_host; + Remain_info_t *Remain_info_host; + + int_t* usub_IndirectJ3; /* on GPU */ + int_t* usub_IndirectJ3_host; + +} dSCUbuf_gpu_t; + + +typedef struct //LUstruct_gpu_ +{ + int_t *LrowindVec; /* A single vector */ + int_t *LrowindPtr; /* A single vector */ + + double *LnzvalVec; /* A single vector */ + int_t *LnzvalPtr; /* A single vector */ + int_t *LnzvalPtr_host; /* A single vector */ + + int_t *UrowindVec; /* A single vector */ + int_t *UrowindPtr; /* A single vector */ + int_t *UrowindPtr_host; /* A single vector */ + int_t *UnzvalPtr_host; + + double *UnzvalVec; /* A single vector */ + int_t *UnzvalPtr; /* A single vector */ + /*gpu pointers for easy block accesses */ + local_l_blk_info_t *local_l_blk_infoVec; + int_t *local_l_blk_infoPtr; + int_t *jib_lookupVec; + int_t *jib_lookupPtr; + local_u_blk_info_t *local_u_blk_infoVec; + + int_t *local_u_blk_infoPtr; + int_t *ijb_lookupVec; + int_t *ijb_lookupPtr; + + // GPU buffers for performing Schur Complement Update on GPU + dSCUbuf_gpu_t scubufs[MAX_NCUDA_STREAMS]; + double *acc_L_buff, *acc_U_buff; + + /*Informations for various buffers*/ + int_t buffer_size; /**/ + int_t nsupers; /*should have number of supernodes*/ + int_t *xsup; + gridinfo_t *grid; + + + double ScatterMOPCounter; + double ScatterMOPTimer; + double GemmFLOPCounter; + double GemmFLOPTimer; + + double cPCIeH2D; + double cPCIeD2H; + double tHost_PCIeH2D; + double tHost_PCIeD2H; + + /*cuda events to measure DGEMM and SCATTER timing */ + int *isOffloaded; /*stores if any iteration is offloaded or not*/ + cudaEvent_t *GemmStart, *GemmEnd, *ScatterEnd; /*cuda events to store gemm and scatter's begin and end*/ + cudaEvent_t *ePCIeH2D; + cudaEvent_t *ePCIeD2H_Start; + cudaEvent_t *ePCIeD2H_End; + + int_t *xsup_host; + int_t* perm_c_supno; + int_t first_l_block_gpu, first_u_block_gpu; +} dLUstruct_gpu_t; + +typedef struct //sluGPU_t_ +{ + int_t gpuId; // if there are multiple GPUs + dLUstruct_gpu_t *A_gpu, *dA_gpu; + cudaStream_t funCallStreams[MAX_NCUDA_STREAMS], CopyStream; + cublasHandle_t cublasHandles[MAX_NCUDA_STREAMS]; + int_t lastOffloadStream[MAX_NCUDA_STREAMS]; + int_t nCudaStreams; + int_t* isNodeInMyGrid; 
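+    /* cost estimate for one asynchronous offload (editor note: inferred
+       from the name; this field is set and used outside this header) */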
+ double acc_async_cost; +} dsluGPU_t; + + +#ifdef __cplusplus +extern "C" { +#endif + +extern int dsparseTreeFactor_ASYNC_GPU( + sForest_t *sforest, + commRequests_t **comReqss, // lists of communication requests, + // size = maxEtree level + dscuBufs_t *scuBufs, // contains buffers for schur complement update + packLUInfo_t *packLUInfo, + msgs_t **msgss, // size = num Look ahead + dLUValSubBuf_t **LUvsbs, // size = num Look ahead + ddiagFactBufs_t **dFBufs, // size = maxEtree level + factStat_t *factStat, + factNodelists_t *fNlists, + gEtreeInfo_t *gEtreeInfo, // global etree info + superlu_dist_options_t *options, + int_t *gIperm_c_supno, + int ldt, + dsluGPU_t *sluGPU, + d2Hreduce_t *d2Hred, + HyP_t *HyP, + dLUstruct_t *LUstruct, gridinfo3d_t *grid3d, + SuperLUStat_t *stat, + double thresh, SCT_t *SCT, int tag_ub, + int *info); + +extern double estimate_cpu_time(int m, int n , int k); + +int dinitD2Hreduce( + int next_k, + d2Hreduce_t* d2Hred, + int last_flag, + // int_t *perm_c_supno, + HyP_t* HyP, + dsluGPU_t *sluGPU, + gridinfo_t *grid, + dLUstruct_t *LUstruct, SCT_t* SCT +); + +extern int dreduceGPUlu(int last_flag, d2Hreduce_t* d2Hred, + dsluGPU_t *sluGPU, SCT_t *SCT, gridinfo_t *grid, + dLUstruct_t *LUstruct); + +extern int dwaitGPUscu(int streamId, dsluGPU_t *sluGPU, SCT_t *SCT); +extern int dsendLUpanelGPU2HOST( int_t k0, d2Hreduce_t* d2Hred, dsluGPU_t *sluGPU); +extern int dsendSCUdataHost2GPU( + int_t streamId, int_t* lsub, int_t* usub, double* bigU, int_t bigu_send_size, + int_t Remain_lbuf_send_size, dsluGPU_t *sluGPU, HyP_t* HyP +); + +extern int dinitSluGPU3D_t( + dsluGPU_t *sluGPU, + dLUstruct_t *LUstruct, + gridinfo3d_t * grid3d, + int_t* perm_c_supno, int_t n, int_t buffer_size, int_t bigu_size, int_t ldt +); +int dSchurCompUpdate_GPU( + int_t streamId, + int_t jj_cpu, int_t nub, int_t klst, int_t knsupc, + int_t Rnbrow, int_t RemainBlk, + int_t Remain_lbuf_send_size, + int_t bigu_send_size, int_t ldu, + int_t mcb, + int_t buffer_size, int_t lsub_len, int_t usub_len, + int_t ldt, int_t k0, + dsluGPU_t *sluGPU, gridinfo_t *grid +); + + +extern void dCopyLUToGPU3D (int_t* isNodeInMyGrid, dLocalLU_t *A_host, + dsluGPU_t *sluGPU, Glu_persist_t *Glu_persist, int_t n, + gridinfo3d_t *grid3d, int_t buffer_size, int_t bigu_size, int_t ldt); + +extern int dreduceAllAncestors3d_GPU(int_t ilvl, int_t* myNodeCount, + int_t** treePerm, dLUValSubBuf_t*LUvsb, + dLUstruct_t* LUstruct, gridinfo3d_t* grid3d, + dsluGPU_t *sluGPU, d2Hreduce_t* d2Hred, + factStat_t *factStat, HyP_t* HyP, SCT_t* SCT ); + +extern void dsyncAllfunCallStreams(dsluGPU_t* sluGPU, SCT_t* SCT); +extern int dfree_LUstruct_gpu (dLUstruct_gpu_t *A_gpu); + +//int freeSluGPU(dsluGPU_t *sluGPU); + +cublasStatus_t checkCublas(cublasStatus_t result); +// cudaError_t checkCuda(cudaError_t result); + +extern void dPrint_matrix( char *desc, int_t m, int_t n, double *dA, int_t lda ); + +/*to print out various statistics*/ +void dprintGPUStats(dLUstruct_gpu_t *A_gpu); + +#ifdef __cplusplus +} +#endif + +//#undef DEBUG diff --git a/SRC/dtreeFactorizationGPU.c b/SRC/dtreeFactorizationGPU.c new file mode 100644 index 00000000..9fab2fc4 --- /dev/null +++ b/SRC/dtreeFactorizationGPU.c @@ -0,0 +1,734 @@ + + +/*! @file + * \brief Factorization routines for the subtree using 2D process grid, with GPUs. + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley,
+ * Georgia Institute of Technology, Oak Ridge National Laboratory
+ * March 14, 2021 version 7.0.0
+ * </pre>
+ */ +// #include "treeFactorization.h" +// #include "trfCommWrapper.h" +#include "dlustruct_gpu.h" +#ifdef __INTEL_COMPILER +#include "mkl.h" +#else +//#include "cblas.h" +#endif + +/* +/-- num_u_blks--\ /-- num_u_blks_Phi --\ +---------------------------------------- +| host_cols || GPU | host | +---------------------------------------- + ^ ^ + 0 jj_cpu +*/ +static int_t getAccUPartition(HyP_t *HyP) +{ + /* Sherry: what if num_u_blks_phi == 0 ? Need to fix the bug */ + int_t total_cols_1 = HyP->Ublock_info_Phi[HyP->num_u_blks_Phi - 1].full_u_cols; + + int_t host_cols = HyP->Ublock_info[HyP->num_u_blks - 1].full_u_cols; + double cpu_time_0 = estimate_cpu_time(HyP->Lnbrow, total_cols_1, HyP->ldu_Phi) + + estimate_cpu_time(HyP->Rnbrow, host_cols, HyP->ldu) + estimate_cpu_time(HyP->Lnbrow, host_cols, HyP->ldu); + + int jj_cpu; + +#if 0 /* Ignoe those estimates */ + jj_cpu = tuned_partition(HyP->num_u_blks_Phi, HyP->Ublock_info_Phi, + HyP->Remain_info, HyP->RemainBlk, cpu_time_0, HyP->Rnbrow, HyP->ldu_Phi ); +#else /* Sherry: new */ + jj_cpu = HyP->num_u_blks_Phi; +#endif + + if (jj_cpu != 0 && HyP->Rnbrow > 0) // ### + { + HyP->offloadCondition = 1; + } + else + { + HyP->offloadCondition = 0; + jj_cpu = 0; // ### + } + + return jj_cpu; +} + +int dsparseTreeFactor_ASYNC_GPU( + sForest_t *sforest, + commRequests_t **comReqss, // lists of communication requests, + // size = maxEtree level + dscuBufs_t *scuBufs, // contains buffers for schur complement update + packLUInfo_t *packLUInfo, + msgs_t **msgss, // size = num Look ahead + dLUValSubBuf_t **LUvsbs, // size = num Look ahead + ddiagFactBufs_t **dFBufs, // size = maxEtree level + factStat_t *factStat, + factNodelists_t *fNlists, + gEtreeInfo_t *gEtreeInfo, // global etree info + superlu_dist_options_t *options, + int_t *gIperm_c_supno, + int_t ldt, + dsluGPU_t *sluGPU, + d2Hreduce_t *d2Hred, + HyP_t *HyP, + dLUstruct_t *LUstruct, gridinfo3d_t *grid3d, SuperLUStat_t *stat, + double thresh, SCT_t *SCT, int tag_ub, + int *info) +{ + // sforest.nNodes, sforest.nodeList, + // &sforest.topoInfo, + int_t nnodes = sforest->nNodes; // number of nodes in supernodal etree + if (nnodes < 1) + { + return 1; + } + + int_t *perm_c_supno = sforest->nodeList; // list of nodes in the order of factorization + treeTopoInfo_t *treeTopoInfo = &sforest->topoInfo; + int_t *myIperm = treeTopoInfo->myIperm; + + gridinfo_t *grid = &(grid3d->grid2d); + /*main loop over all the levels*/ + + int_t maxTopoLevel = treeTopoInfo->numLvl; + int_t *eTreeTopLims = treeTopoInfo->eTreeTopLims; + int_t *IrecvPlcd_D = factStat->IrecvPlcd_D; + int_t *factored_D = factStat->factored_D; + int_t *factored_L = factStat->factored_L; + int_t *factored_U = factStat->factored_U; + int_t *IbcastPanel_L = factStat->IbcastPanel_L; + int_t *IbcastPanel_U = factStat->IbcastPanel_U; + int_t *gpuLUreduced = factStat->gpuLUreduced; + int_t *xsup = LUstruct->Glu_persist->xsup; + + // int_t numLAMax = getNumLookAhead(); + int_t numLAMax = getNumLookAhead(options); + int_t numLA = numLAMax; // number of look-ahead panels + int_t superlu_acc_offload = HyP->superlu_acc_offload; + int_t last_flag = 1; /* for updating nsuper-1 only once */ + int_t nCudaStreams = sluGPU->nCudaStreams; // number of cuda streams + + if (superlu_acc_offload) + dsyncAllfunCallStreams(sluGPU, SCT); + + /* Go through each leaf node */ + for (int_t k0 = 0; k0 < eTreeTopLims[1]; ++k0) + { + int_t k = perm_c_supno[k0]; // direct computation no perm_c_supno + int_t offset = k0; + /* k-th diagonal factorization */ + + /* If LU panels 
from GPU are not reduced, then reduce + them before diagonal factorization */ + if (!gpuLUreduced[k] && superlu_acc_offload) + { + double tt_start1 = SuperLU_timer_(); + + dinitD2Hreduce(k, d2Hred, last_flag, + HyP, sluGPU, grid, LUstruct, SCT); + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + + if (copyL_kljb || copyU_kljb) + SCT->PhiMemCpyCounter++; + dsendLUpanelGPU2HOST(k, d2Hred, sluGPU); + + dreduceGPUlu(last_flag, d2Hred, sluGPU, SCT, grid, LUstruct); + + gpuLUreduced[k] = 1; + SCT->PhiMemCpyTimer += SuperLU_timer_() - tt_start1; + } + + double t1 = SuperLU_timer_(); + + /*Now factor and broadcast diagonal block*/ + // sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid, + // options, thresh, LUstruct, stat, info, SCT); + +#if 0 + sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid, + options, thresh, LUstruct, stat, info, SCT, tag_ub); +#else + dDiagFactIBCast(k, k, dFBufs[offset]->BlockUFactor, dFBufs[offset]->BlockLFactor, + factStat->IrecvPlcd_D, + comReqss[offset]->U_diag_blk_recv_req, + comReqss[offset]->L_diag_blk_recv_req, + comReqss[offset]->U_diag_blk_send_req, + comReqss[offset]->L_diag_blk_send_req, + grid, options, thresh, LUstruct, stat, info, SCT, tag_ub); +#endif + factored_D[k] = 1; + + SCT->pdgstrf2_timer += (SuperLU_timer_() - t1); + } /* for all leaves ... */ + + //printf(".. SparseFactor_GPU: after leaves\n"); fflush(stdout); + + /* Process supernodal etree level by level */ + for (int topoLvl = 0; topoLvl < maxTopoLevel; ++topoLvl) + // for (int_t topoLvl = 0; topoLvl < 1; ++topoLvl) + { + // printf("(%d) factor level %d, maxTopoLevel %d\n",grid3d->iam,topoLvl,maxTopoLevel); fflush(stdout); + /* code */ + int k_st = eTreeTopLims[topoLvl]; + int k_end = eTreeTopLims[topoLvl + 1]; + + /* Process all the nodes in 'topoLvl': diagonal factorization */ + for (int k0 = k_st; k0 < k_end; ++k0) + { + int k = perm_c_supno[k0]; // direct computation no perm_c_supno + int offset = k0 - k_st; + + if (!factored_D[k]) + { + /*If LU panels from GPU are not reduced then reduce + them before diagonal factorization*/ + if (!gpuLUreduced[k] && superlu_acc_offload) + { + double tt_start1 = SuperLU_timer_(); + dinitD2Hreduce(k, d2Hred, last_flag, + HyP, sluGPU, grid, LUstruct, SCT); + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + + if (copyL_kljb || copyU_kljb) + SCT->PhiMemCpyCounter++; + dsendLUpanelGPU2HOST(k, d2Hred, sluGPU); + /* + Reduce the LU panels from GPU + */ + dreduceGPUlu(last_flag, d2Hred, sluGPU, SCT, grid, + LUstruct); + + gpuLUreduced[k] = 1; + SCT->PhiMemCpyTimer += SuperLU_timer_() - tt_start1; + } + + double t1 = SuperLU_timer_(); + /* Factor diagonal block on CPU */ + // sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid, + // options, thresh, LUstruct, stat, info, SCT); +#if 0 + sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid, + options, thresh, LUstruct, stat, info, SCT, tag_ub); +#else + dDiagFactIBCast(k, k, dFBufs[offset]->BlockUFactor, dFBufs[offset]->BlockLFactor, + factStat->IrecvPlcd_D, + comReqss[offset]->U_diag_blk_recv_req, + comReqss[offset]->L_diag_blk_recv_req, + comReqss[offset]->U_diag_blk_send_req, + comReqss[offset]->L_diag_blk_send_req, + grid, options, thresh, LUstruct, stat, info, SCT, tag_ub); +#endif + SCT->pdgstrf2_timer += (SuperLU_timer_() - t1); + } + } /* for all nodes in this level */ + + //printf(".. 
SparseFactor_GPU: after diag factorization\n"); fflush(stdout); + + double t_apt = SuperLU_timer_(); /* Async Pipe Timer */ + + /* Process all the nodes in 'topoLvl': panel updates on CPU */ + for (int k0 = k_st; k0 < k_end; ++k0) + { + int k = perm_c_supno[k0]; // direct computation no perm_c_supno + int offset = k0 - k_st; + + /*L update */ + if (factored_L[k] == 0) + { +#if 0 + sLPanelUpdate(k, dFBufs[offset], factStat, comReqss[offset], + grid, LUstruct, SCT); +#else + dLPanelUpdate(k, factStat->IrecvPlcd_D, factStat->factored_L, + comReqss[offset]->U_diag_blk_recv_req, + dFBufs[offset]->BlockUFactor, grid, LUstruct, SCT); +#endif + + factored_L[k] = 1; + } + /*U update*/ + if (factored_U[k] == 0) + { +#if 0 + sUPanelUpdate(k, ldt, dFBufs[offset], factStat, comReqss[offset], + scuBufs, packLUInfo, grid, LUstruct, stat, SCT); +#else + dUPanelUpdate(k, factStat->factored_U, comReqss[offset]->L_diag_blk_recv_req, + dFBufs[offset]->BlockLFactor, scuBufs->bigV, ldt, + packLUInfo->Ublock_info, grid, LUstruct, stat, SCT); +#endif + factored_U[k] = 1; + } + } /* end panel update */ + + //printf(".. after CPU panel updates. numLA %d\n", numLA); fflush(stdout); + + /* Process all the panels in look-ahead window: + broadcast L and U panels. */ + for (int k0 = k_st; k0 < SUPERLU_MIN(k_end, k_st + numLA); ++k0) + { + int k = perm_c_supno[k0]; // direct computation no perm_c_supno + int offset = k0 % numLA; + /* diagonal factorization */ + + /*L Ibcast*/ + if (IbcastPanel_L[k] == 0) + { +#if 0 + sIBcastRecvLPanel( k, comReqss[offset], LUvsbs[offset], + msgss[offset], factStat, grid, LUstruct, SCT, tag_ub ); +#else + dIBcastRecvLPanel(k, k, msgss[offset]->msgcnt, comReqss[offset]->send_req, + comReqss[offset]->recv_req, LUvsbs[offset]->Lsub_buf, + LUvsbs[offset]->Lval_buf, factStat->factored, + grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_L[k] = 1; /*for consistancy; unused later*/ + } + + /*U Ibcast*/ + if (IbcastPanel_U[k] == 0) + { +#if 0 + sIBcastRecvUPanel( k, comReqss[offset], LUvsbs[offset], + msgss[offset], factStat, grid, LUstruct, SCT, tag_ub ); +#else + dIBcastRecvUPanel(k, k, msgss[offset]->msgcnt, comReqss[offset]->send_requ, + comReqss[offset]->recv_requ, LUvsbs[offset]->Usub_buf, + LUvsbs[offset]->Uval_buf, grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_U[k] = 1; + } + } /* end for panels in look-ahead window */ + + //printf(".. after CPU look-ahead updates\n"); fflush(stdout); + + // if (topoLvl) SCT->tAsyncPipeTail += SuperLU_timer_() - t_apt; + SCT->tAsyncPipeTail += (SuperLU_timer_() - t_apt); + + /* Process all the nodes in level 'topoLvl': Schur complement update + (no MPI communication) */ + for (int k0 = k_st; k0 < k_end; ++k0) + { + int k = perm_c_supno[k0]; // direct computation no perm_c_supno + int offset = k0 % numLA; + + double tsch = SuperLU_timer_(); + +#if 0 + sWaitL(k, comReqss[offset], msgss[offset], grid, LUstruct, SCT); + /*Wait for U panel*/ + sWaitU(k, comReqss[offset], msgss[offset], grid, LUstruct, SCT); +#else + dWaitL(k, msgss[offset]->msgcnt, msgss[offset]->msgcntU, + comReqss[offset]->send_req, comReqss[offset]->recv_req, + grid, LUstruct, SCT); + dWaitU(k, msgss[offset]->msgcnt, comReqss[offset]->send_requ, + comReqss[offset]->recv_requ, grid, LUstruct, SCT); +#endif + + int_t LU_nonempty = dSchurComplementSetupGPU(k, + msgss[offset], packLUInfo, + myIperm, gIperm_c_supno, perm_c_supno, + gEtreeInfo, fNlists, scuBufs, + LUvsbs[offset], grid, LUstruct, HyP); + // initializing D2H data transfer. D2H = Device To Host. 
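+            /* jj_cpu splits the num_u_blks_Phi U-blocks of this panel
+               between GPU and CPU (see the diagram above getAccUPartition):
+               blocks [0, jj_cpu) are offloaded, blocks [jj_cpu,
+               num_u_blks_Phi) are updated on the CPU.  The branch below
+               currently forces jj_cpu = num_u_blks_Phi, i.e. the whole
+               Phi partition goes to the GPU whenever offload is enabled. */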
+ int_t jj_cpu; /* limit between CPU and GPU */ + +#if 1 + if (superlu_acc_offload) + { + jj_cpu = HyP->num_u_blks_Phi; // -1 ?? + HyP->offloadCondition = 1; + } + else + { + /* code */ + HyP->offloadCondition = 0; + jj_cpu = 0; + } + +#else + if (superlu_acc_offload) + { + jj_cpu = getAccUPartition(HyP); + + if (jj_cpu > 0) + jj_cpu = HyP->num_u_blks_Phi; + + /* Sherry force this --> */ + jj_cpu = HyP->num_u_blks_Phi; // -1 ?? + HyP->offloadCondition = 1; + } + else + { + jj_cpu = 0; + } +#endif + + // int_t jj_cpu = HyP->num_u_blks_Phi-1; + // if (HyP->Rnbrow > 0 && jj_cpu>=0) + // HyP->offloadCondition = 1; + // else + // HyP->offloadCondition = 0; + // jj_cpu=0; +#if 0 + if ( HyP->offloadCondition ) { + printf("(%d) k=%d, nub=%d, nub_host=%d, nub_phi=%d, jj_cpu %d, offloadCondition %d\n", + grid3d->iam, k, HyP->num_u_blks+HyP->num_u_blks_Phi , + HyP->num_u_blks, HyP->num_u_blks_Phi, + jj_cpu, HyP->offloadCondition); + fflush(stdout); + } +#endif + scuStatUpdate(SuperSize(k), HyP, SCT, stat); + + int_t offload_condition = HyP->offloadCondition; + uPanelInfo_t *uPanelInfo = packLUInfo->uPanelInfo; + lPanelInfo_t *lPanelInfo = packLUInfo->lPanelInfo; + int_t *lsub = lPanelInfo->lsub; + int_t *usub = uPanelInfo->usub; + int_t *indirect = fNlists->indirect; + int_t *indirect2 = fNlists->indirect2; + + /* Schur Complement Update */ + + int_t knsupc = SuperSize(k); + int_t klst = FstBlockC(k + 1); + + double *bigV = scuBufs->bigV; + double *bigU = scuBufs->bigU; + + double t1 = SuperLU_timer_(); + +#pragma omp parallel /* Look-ahead update on CPU */ + { + int_t thread_id = omp_get_thread_num(); + +#pragma omp for + for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks; ++ij) + { + int_t j = ij / HyP->lookAheadBlk; + int_t lb = ij % HyP->lookAheadBlk; + dblock_gemm_scatterTopLeft(lb, j, bigV, knsupc, klst, lsub, + usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat); + } + +#pragma omp for + for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks_Phi; ++ij) + { + int_t j = ij / HyP->lookAheadBlk; + int_t lb = ij % HyP->lookAheadBlk; + dblock_gemm_scatterTopRight(lb, j, bigV, knsupc, klst, lsub, + usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat); + } + +#pragma omp for + for (int_t ij = 0; ij < HyP->RemainBlk * HyP->num_u_blks; ++ij) + { + int_t j = ij / HyP->RemainBlk; + int_t lb = ij % HyP->RemainBlk; + dblock_gemm_scatterBottomLeft(lb, j, bigV, knsupc, klst, lsub, + usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat); + } /* for int_t ij = ... */ + } /* end parallel region ... end look-ahead update */ + + SCT->lookaheadupdatetimer += (SuperLU_timer_() - t1); + + //printf("... after look-ahead update, topoLvl %d\t maxTopoLevel %d\n", topoLvl, maxTopoLevel); fflush(stdout); + + /* Reduce the L & U panels from GPU to CPU. 
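The parent supernode's panels must be complete on the host before its
diagonal block can be factored on the CPU below.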
*/ + if (topoLvl < maxTopoLevel - 1) + { /* Not the root */ + int_t k_parent = gEtreeInfo->setree[k]; + gEtreeInfo->numChildLeft[k_parent]--; + if (gEtreeInfo->numChildLeft[k_parent] == 0 && k_parent < nnodes) + { /* if k is the last child in this level */ + int_t k0_parent = myIperm[k_parent]; + if (k0_parent > 0) + { + /* code */ + // printf("Before assert: iam %d, k %d, k_parent %d, k0_parent %d, nnodes %d\n", grid3d->iam, k, k_parent, k0_parent, nnodes); fflush(stdout); + // exit(-1); + assert(k0_parent < nnodes); + int offset = k0_parent - k_end; + if (!gpuLUreduced[k_parent] && superlu_acc_offload) + { + double tt_start1 = SuperLU_timer_(); + + dinitD2Hreduce(k_parent, d2Hred, last_flag, + HyP, sluGPU, grid, LUstruct, SCT); + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + + if (copyL_kljb || copyU_kljb) + SCT->PhiMemCpyCounter++; + dsendLUpanelGPU2HOST(k_parent, d2Hred, sluGPU); + + /* Reduce the LU panels from GPU */ + dreduceGPUlu(last_flag, d2Hred, + sluGPU, SCT, grid, LUstruct); + + gpuLUreduced[k_parent] = 1; + SCT->PhiMemCpyTimer += SuperLU_timer_() - tt_start1; + } + + /* Factorize diagonal block on CPU */ +#if 0 + sDiagFactIBCast(k_parent, dFBufs[offset], factStat, + comReqss[offset], grid, options, thresh, + LUstruct, stat, info, SCT, tag_ub); +#else + dDiagFactIBCast(k_parent, k_parent, dFBufs[offset]->BlockUFactor, + dFBufs[offset]->BlockLFactor, factStat->IrecvPlcd_D, + comReqss[offset]->U_diag_blk_recv_req, + comReqss[offset]->L_diag_blk_recv_req, + comReqss[offset]->U_diag_blk_send_req, + comReqss[offset]->L_diag_blk_send_req, + grid, options, thresh, LUstruct, stat, info, SCT, tag_ub); +#endif + factored_D[k_parent] = 1; + } /* end if k0_parent > 0 */ + + } /* end if all children are done */ + } /* end if non-root */ + +#pragma omp parallel + { + /* Master thread performs Schur complement update on GPU. */ +#pragma omp master + { + if (superlu_acc_offload) + { + int thread_id = omp_get_thread_num(); + double t1 = SuperLU_timer_(); + + if (offload_condition) + { + SCT->datatransfer_count++; + int streamId = k0 % nCudaStreams; + + /*wait for previous offload to get finished*/ + if (sluGPU->lastOffloadStream[streamId] != -1) + { + dwaitGPUscu(streamId, sluGPU, SCT); + sluGPU->lastOffloadStream[streamId] = -1; + } + + int_t Remain_lbuf_send_size = knsupc * HyP->Rnbrow; + int_t bigu_send_size = jj_cpu < 1 ? 0 : HyP->ldu_Phi * HyP->Ublock_info_Phi[jj_cpu - 1].full_u_cols; + assert(bigu_send_size < HyP->bigu_size); + + /* !! Sherry add the test to avoid seg_fault inside + sendSCUdataHost2GPU */ + if (bigu_send_size > 0) + { + dsendSCUdataHost2GPU(streamId, lsub, usub, + bigU, bigu_send_size, + Remain_lbuf_send_size, sluGPU, HyP); + + sluGPU->lastOffloadStream[streamId] = k0; + int_t usub_len = usub[2]; + int_t lsub_len = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR; + //{printf("... 
before SchurCompUpdate_GPU, bigu_send_size %d\n", bigu_send_size); fflush(stdout);} + + dSchurCompUpdate_GPU( + streamId, 0, jj_cpu, klst, knsupc, HyP->Rnbrow, HyP->RemainBlk, + Remain_lbuf_send_size, bigu_send_size, HyP->ldu_Phi, HyP->num_u_blks_Phi, + HyP->buffer_size, lsub_len, usub_len, ldt, k0, sluGPU, grid); + } /* endif bigu_send_size > 0 */ + + // sendLUpanelGPU2HOST( k0, d2Hred, sluGPU); + + SCT->schurPhiCallCount++; + HyP->jj_cpu = jj_cpu; + updateDirtyBit(k0, HyP, grid); + } /* endif (offload_condition) */ + + double t2 = SuperLU_timer_(); + SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double)(t2 - t1); /* not used */ + SCT->CPUOffloadTimer += (double)(t2 - t1); // Sherry added + + } /* endif (superlu_acc_offload) */ + + } /* end omp master thread */ + +#pragma omp for + /* The following update is on CPU. Should not be necessary now, + because we set jj_cpu equal to num_u_blks_Phi. */ + for (int_t ij = 0; ij < HyP->RemainBlk * (HyP->num_u_blks_Phi - jj_cpu); ++ij) + { + //printf(".. WARNING: should NOT get here\n"); + int_t j = ij / HyP->RemainBlk + jj_cpu; + int_t lb = ij % HyP->RemainBlk; + dblock_gemm_scatterBottomRight(lb, j, bigV, knsupc, klst, lsub, + usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat); + } /* for int_t ij = ... */ + + } /* end omp parallel region */ + + //SCT->NetSchurUpTimer += SuperLU_timer_() - tsch; + + // finish waiting for diag block send + int_t abs_offset = k0 - k_st; +#if 0 + sWait_LUDiagSend(k, comReqss[abs_offset], grid, SCT); +#else + Wait_LUDiagSend(k, comReqss[abs_offset]->U_diag_blk_send_req, + comReqss[abs_offset]->L_diag_blk_send_req, + grid, SCT); +#endif + + /*Schedule next I bcasts within look-ahead window */ + for (int next_k0 = k0 + 1; next_k0 < SUPERLU_MIN(k0 + 1 + numLA, nnodes); ++next_k0) + { + /* code */ + int_t next_k = perm_c_supno[next_k0]; + int_t offset = next_k0 % numLA; + + /*L Ibcast*/ + if (IbcastPanel_L[next_k] == 0 && factored_L[next_k]) + { +#if 0 + sIBcastRecvLPanel( next_k, comReqss[offset], + LUvsbs[offset], msgss[offset], factStat, + grid, LUstruct, SCT, tag_ub ); +#else + dIBcastRecvLPanel(next_k, next_k, msgss[offset]->msgcnt, + comReqss[offset]->send_req, comReqss[offset]->recv_req, + LUvsbs[offset]->Lsub_buf, LUvsbs[offset]->Lval_buf, + factStat->factored, grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_L[next_k] = 1; /*will be used later*/ + } + /*U Ibcast*/ + if (IbcastPanel_U[next_k] == 0 && factored_U[next_k]) + { +#if 0 + sIBcastRecvUPanel( next_k, comReqss[offset], + LUvsbs[offset], msgss[offset], factStat, + grid, LUstruct, SCT, tag_ub ); +#else + dIBcastRecvUPanel(next_k, next_k, msgss[offset]->msgcnt, + comReqss[offset]->send_requ, comReqss[offset]->recv_requ, + LUvsbs[offset]->Usub_buf, LUvsbs[offset]->Uval_buf, + grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_U[next_k] = 1; + } + } /* end for look-ahead window */ + + if (topoLvl < maxTopoLevel - 1) /* not root */ + { + /*look-ahead LU factorization*/ + int kx_st = eTreeTopLims[topoLvl + 1]; + int kx_end = eTreeTopLims[topoLvl + 2]; + for (int k0x = kx_st; k0x < kx_end; k0x++) + { + /* code */ + int kx = perm_c_supno[k0x]; + int offset = k0x - kx_st; + if (IrecvPlcd_D[kx] && !factored_L[kx]) + { + /*check if received*/ + int_t recvUDiag = checkRecvUDiag(kx, comReqss[offset], + grid, SCT); + if (recvUDiag) + { +#if 0 + sLPanelTrSolve( kx, dFBufs[offset], + factStat, comReqss[offset], + grid, LUstruct, SCT); +#else + dLPanelTrSolve(kx, factStat->factored_L, + dFBufs[offset]->BlockUFactor, grid, LUstruct); +#endif + 
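+                        /* the received diagonal block has been applied;
+                           mark L(kx) factored and try an early Ibcast */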
+ factored_L[kx] = 1; + + /*check if an L_Ibcast is possible*/ + + if (IbcastPanel_L[kx] == 0 && + k0x - k0 < numLA + 1 && // is within look-ahead window + factored_L[kx]) + { + int_t offset1 = k0x % numLA; +#if 0 + sIBcastRecvLPanel( kx, comReqss[offset1], LUvsbs[offset1], + msgss[offset1], factStat, + grid, LUstruct, SCT, tag_ub); +#else + dIBcastRecvLPanel(kx, kx, msgss[offset1]->msgcnt, + comReqss[offset1]->send_req, + comReqss[offset1]->recv_req, + LUvsbs[offset1]->Lsub_buf, + LUvsbs[offset1]->Lval_buf, + factStat->factored, + grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_L[kx] = 1; /*will be used later*/ + } + } + } + + if (IrecvPlcd_D[kx] && !factored_U[kx]) + { + /*check if received*/ + int_t recvLDiag = checkRecvLDiag(kx, comReqss[offset], + grid, SCT); + if (recvLDiag) + { +#if 0 + sUPanelTrSolve( kx, ldt, dFBufs[offset], scuBufs, packLUInfo, + grid, LUstruct, stat, SCT); +#else + dUPanelTrSolve(kx, dFBufs[offset]->BlockLFactor, + scuBufs->bigV, + ldt, packLUInfo->Ublock_info, + grid, LUstruct, stat, SCT); +#endif + factored_U[kx] = 1; + /*check if an L_Ibcast is possible*/ + + if (IbcastPanel_U[kx] == 0 && + k0x - k0 < numLA + 1 && // is within lookahead window + factored_U[kx]) + { + int_t offset = k0x % numLA; +#if 0 + sIBcastRecvUPanel( kx, comReqss[offset], + LUvsbs[offset], + msgss[offset], factStat, + grid, LUstruct, SCT, tag_ub); +#else + dIBcastRecvUPanel(kx, kx, msgss[offset]->msgcnt, + comReqss[offset]->send_requ, + comReqss[offset]->recv_requ, + LUvsbs[offset]->Usub_buf, + LUvsbs[offset]->Uval_buf, + grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_U[kx] = 1; /*will be used later*/ + } + } + } + } /* end look-ahead */ + + } /* end if non-root level */ + + /* end Schur complement update */ + SCT->NetSchurUpTimer += SuperLU_timer_() - tsch; + + } /* end Schur update for all the nodes in level 'topoLvl' */ + + } /* end for all levels of the tree */ + + return 0; +} /* end dsparseTreeFactor_ASYNC_GPU */ diff --git a/SRC/zlustruct_gpu.h b/SRC/zlustruct_gpu.h new file mode 100644 index 00000000..c4d81fc7 --- /dev/null +++ b/SRC/zlustruct_gpu.h @@ -0,0 +1,247 @@ + +/*! @file + * \brief Descriptions and declarations for structures used in GPU + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley,
+ * Georgia Institute of Technology, Oak Ridge National Laboratory
+ * March 14, 2021 version 7.0.0
+ * </pre>
+ */ + +#pragma once // so that this header file is included onle once + +// #ifdef DEBUG +// #include +// #endif +// #include +// #include "mkl.h" + +// #define USE_VENDOR_BLAS + +#include +#include +#include "superlu_zdefs.h" +// #include "sec_structs.h" +// #include "supernodal_etree.h" + +/* Constants */ +//#define SLU_TARGET_GPU 0 +//#define MAX_BLOCK_SIZE 10000 +#define MAX_NCUDA_STREAMS 32 + +static +void check(cudaError_t result, char const *const func, const char *const file, int_t const line) +{ + if (result) + { + fprintf(stderr, "CUDA error at file %s: line %d code=(%s) \"%s\" \n", + file, line, cudaGetErrorString(result), func); + + // Make sure we call CUDA Device Reset before exiting + exit(EXIT_FAILURE); + } +} + +#define checkCudaErrors(val) check ( (val), #val, __FILE__, __LINE__ ) + +typedef struct //SCUbuf_gpu_ +{ + /*Informations for various buffers*/ + doublecomplex *bigV; + doublecomplex *bigU; + doublecomplex *bigU_host; /*pinned location*/ + int_t *indirect; /*for indirect address calculations*/ + int_t *indirect2; /*for indirect address calculations*/ + + doublecomplex *Remain_L_buff; /* on GPU */ + doublecomplex *Remain_L_buff_host; /* Sherry: this memory is page-locked, why need another copy on GPU ? */ + + int_t *lsub; + int_t *usub; + + int_t *lsub_buf, *usub_buf; + + Ublock_info_t *Ublock_info; /* on GPU */ + Remain_info_t *Remain_info; + Ublock_info_t *Ublock_info_host; + Remain_info_t *Remain_info_host; + + int_t* usub_IndirectJ3; /* on GPU */ + int_t* usub_IndirectJ3_host; + +} zSCUbuf_gpu_t; + + +typedef struct //LUstruct_gpu_ +{ + int_t *LrowindVec; /* A single vector */ + int_t *LrowindPtr; /* A single vector */ + + doublecomplex *LnzvalVec; /* A single vector */ + int_t *LnzvalPtr; /* A single vector */ + int_t *LnzvalPtr_host; /* A single vector */ + + int_t *UrowindVec; /* A single vector */ + int_t *UrowindPtr; /* A single vector */ + int_t *UrowindPtr_host; /* A single vector */ + int_t *UnzvalPtr_host; + + doublecomplex *UnzvalVec; /* A single vector */ + int_t *UnzvalPtr; /* A single vector */ + /*gpu pointers for easy block accesses */ + local_l_blk_info_t *local_l_blk_infoVec; + int_t *local_l_blk_infoPtr; + int_t *jib_lookupVec; + int_t *jib_lookupPtr; + local_u_blk_info_t *local_u_blk_infoVec; + + int_t *local_u_blk_infoPtr; + int_t *ijb_lookupVec; + int_t *ijb_lookupPtr; + + // GPU buffers for performing Schur Complement Update on GPU + zSCUbuf_gpu_t scubufs[MAX_NCUDA_STREAMS]; + doublecomplex *acc_L_buff, *acc_U_buff; + + /*Informations for various buffers*/ + int_t buffer_size; /**/ + int_t nsupers; /*should have number of supernodes*/ + int_t *xsup; + gridinfo_t *grid; + + + double ScatterMOPCounter; + double ScatterMOPTimer; + double GemmFLOPCounter; + double GemmFLOPTimer; + + double cPCIeH2D; + double cPCIeD2H; + double tHost_PCIeH2D; + double tHost_PCIeD2H; + + /*cuda events to measure DGEMM and SCATTER timing */ + int *isOffloaded; /*stores if any iteration is offloaded or not*/ + cudaEvent_t *GemmStart, *GemmEnd, *ScatterEnd; /*cuda events to store gemm and scatter's begin and end*/ + cudaEvent_t *ePCIeH2D; + cudaEvent_t *ePCIeD2H_Start; + cudaEvent_t *ePCIeD2H_End; + + int_t *xsup_host; + int_t* perm_c_supno; + int_t first_l_block_gpu, first_u_block_gpu; +} zLUstruct_gpu_t; + +typedef struct //sluGPU_t_ +{ + int_t gpuId; // if there are multiple GPUs + zLUstruct_gpu_t *A_gpu, *dA_gpu; + cudaStream_t funCallStreams[MAX_NCUDA_STREAMS], CopyStream; + cublasHandle_t cublasHandles[MAX_NCUDA_STREAMS]; + int_t 
lastOffloadStream[MAX_NCUDA_STREAMS]; + int_t nCudaStreams; + int_t* isNodeInMyGrid; + double acc_async_cost; +} zsluGPU_t; + + +#ifdef __cplusplus +extern "C" { +#endif + +extern int zsparseTreeFactor_ASYNC_GPU( + sForest_t *sforest, + commRequests_t **comReqss, // lists of communication requests, + // size = maxEtree level + zscuBufs_t *scuBufs, // contains buffers for schur complement update + packLUInfo_t *packLUInfo, + msgs_t **msgss, // size = num Look ahead + zLUValSubBuf_t **LUvsbs, // size = num Look ahead + zdiagFactBufs_t **dFBufs, // size = maxEtree level + factStat_t *factStat, + factNodelists_t *fNlists, + gEtreeInfo_t *gEtreeInfo, // global etree info + superlu_dist_options_t *options, + int_t *gIperm_c_supno, + int ldt, + zsluGPU_t *sluGPU, + d2Hreduce_t *d2Hred, + HyP_t *HyP, + zLUstruct_t *LUstruct, gridinfo3d_t *grid3d, + SuperLUStat_t *stat, + double thresh, SCT_t *SCT, int tag_ub, + int *info); + +extern double estimate_cpu_time(int m, int n , int k); + +int zinitD2Hreduce( + int next_k, + d2Hreduce_t* d2Hred, + int last_flag, + // int_t *perm_c_supno, + HyP_t* HyP, + zsluGPU_t *sluGPU, + gridinfo_t *grid, + zLUstruct_t *LUstruct, SCT_t* SCT +); + +extern int zreduceGPUlu(int last_flag, d2Hreduce_t* d2Hred, + zsluGPU_t *sluGPU, SCT_t *SCT, gridinfo_t *grid, + zLUstruct_t *LUstruct); + +extern int zwaitGPUscu(int streamId, zsluGPU_t *sluGPU, SCT_t *SCT); +extern int zsendLUpanelGPU2HOST( int_t k0, d2Hreduce_t* d2Hred, zsluGPU_t *sluGPU); +extern int zsendSCUdataHost2GPU( + int_t streamId, int_t* lsub, int_t* usub, doublecomplex* bigU, int_t bigu_send_size, + int_t Remain_lbuf_send_size, zsluGPU_t *sluGPU, HyP_t* HyP +); + +extern int zinitSluGPU3D_t( + zsluGPU_t *sluGPU, + zLUstruct_t *LUstruct, + gridinfo3d_t * grid3d, + int_t* perm_c_supno, int_t n, int_t buffer_size, int_t bigu_size, int_t ldt +); +int zSchurCompUpdate_GPU( + int_t streamId, + int_t jj_cpu, int_t nub, int_t klst, int_t knsupc, + int_t Rnbrow, int_t RemainBlk, + int_t Remain_lbuf_send_size, + int_t bigu_send_size, int_t ldu, + int_t mcb, + int_t buffer_size, int_t lsub_len, int_t usub_len, + int_t ldt, int_t k0, + zsluGPU_t *sluGPU, gridinfo_t *grid +); + + +extern void zCopyLUToGPU3D (int_t* isNodeInMyGrid, zLocalLU_t *A_host, + zsluGPU_t *sluGPU, Glu_persist_t *Glu_persist, int_t n, + gridinfo3d_t *grid3d, int_t buffer_size, int_t bigu_size, int_t ldt); + +extern int zreduceAllAncestors3d_GPU(int_t ilvl, int_t* myNodeCount, + int_t** treePerm, zLUValSubBuf_t*LUvsb, + zLUstruct_t* LUstruct, gridinfo3d_t* grid3d, + zsluGPU_t *sluGPU, d2Hreduce_t* d2Hred, + factStat_t *factStat, HyP_t* HyP, SCT_t* SCT ); + +extern void zsyncAllfunCallStreams(zsluGPU_t* sluGPU, SCT_t* SCT); +extern int zfree_LUstruct_gpu (zLUstruct_gpu_t *A_gpu); + +//int freeSluGPU(zsluGPU_t *sluGPU); + +cublasStatus_t checkCublas(cublasStatus_t result); +// cudaError_t checkCuda(cudaError_t result); + +extern void zPrint_matrix( char *desc, int_t m, int_t n, doublecomplex *dA, int_t lda ); + +/*to print out various statistics*/ +void zprintGPUStats(zLUstruct_gpu_t *A_gpu); + +#ifdef __cplusplus +} +#endif + +//#undef DEBUG diff --git a/SRC/ztreeFactorizationGPU.c b/SRC/ztreeFactorizationGPU.c new file mode 100644 index 00000000..2128f812 --- /dev/null +++ b/SRC/ztreeFactorizationGPU.c @@ -0,0 +1,733 @@ + +/*! @file + * \brief Factorization routines for the subtree using 2D process grid, with GPUs. + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley,
+ * Georgia Institute of Technology, Oak Ridge National Laboratory
+ * March 14, 2021 version 7.0.0
+ *
+ */ +// #include "treeFactorization.h" +// #include "trfCommWrapper.h" +#include "zlustruct_gpu.h" +#ifdef __INTEL_COMPILER +#include "mkl.h" +#else +//#include "cblas.h" +#endif + +/* +/-- num_u_blks--\ /-- num_u_blks_Phi --\ +---------------------------------------- +| host_cols || GPU | host | +---------------------------------------- + ^ ^ + 0 jj_cpu +*/ +static int_t getAccUPartition(HyP_t *HyP) +{ + /* Sherry: what if num_u_blks_phi == 0 ? Need to fix the bug */ + int_t total_cols_1 = HyP->Ublock_info_Phi[HyP->num_u_blks_Phi - 1].full_u_cols; + + int_t host_cols = HyP->Ublock_info[HyP->num_u_blks - 1].full_u_cols; + double cpu_time_0 = estimate_cpu_time(HyP->Lnbrow, total_cols_1, HyP->ldu_Phi) + + estimate_cpu_time(HyP->Rnbrow, host_cols, HyP->ldu) + estimate_cpu_time(HyP->Lnbrow, host_cols, HyP->ldu); + + int jj_cpu; + +#if 0 /* Ignoe those estimates */ + jj_cpu = tuned_partition(HyP->num_u_blks_Phi, HyP->Ublock_info_Phi, + HyP->Remain_info, HyP->RemainBlk, cpu_time_0, HyP->Rnbrow, HyP->ldu_Phi ); +#else /* Sherry: new */ + jj_cpu = HyP->num_u_blks_Phi; +#endif + + if (jj_cpu != 0 && HyP->Rnbrow > 0) // ### + { + HyP->offloadCondition = 1; + } + else + { + HyP->offloadCondition = 0; + jj_cpu = 0; // ### + } + + return jj_cpu; +} + +int zsparseTreeFactor_ASYNC_GPU( + sForest_t *sforest, + commRequests_t **comReqss, // lists of communication requests, + // size = maxEtree level + zscuBufs_t *scuBufs, // contains buffers for schur complement update + packLUInfo_t *packLUInfo, + msgs_t **msgss, // size = num Look ahead + zLUValSubBuf_t **LUvsbs, // size = num Look ahead + zdiagFactBufs_t **dFBufs, // size = maxEtree level + factStat_t *factStat, + factNodelists_t *fNlists, + gEtreeInfo_t *gEtreeInfo, // global etree info + superlu_dist_options_t *options, + int_t *gIperm_c_supno, + int_t ldt, + zsluGPU_t *sluGPU, + d2Hreduce_t *d2Hred, + HyP_t *HyP, + zLUstruct_t *LUstruct, gridinfo3d_t *grid3d, SuperLUStat_t *stat, + double thresh, SCT_t *SCT, int tag_ub, + int *info) +{ + // sforest.nNodes, sforest.nodeList, + // &sforest.topoInfo, + int_t nnodes = sforest->nNodes; // number of nodes in supernodal etree + if (nnodes < 1) + { + return 1; + } + + int_t *perm_c_supno = sforest->nodeList; // list of nodes in the order of factorization + treeTopoInfo_t *treeTopoInfo = &sforest->topoInfo; + int_t *myIperm = treeTopoInfo->myIperm; + + gridinfo_t *grid = &(grid3d->grid2d); + /*main loop over all the levels*/ + + int_t maxTopoLevel = treeTopoInfo->numLvl; + int_t *eTreeTopLims = treeTopoInfo->eTreeTopLims; + int_t *IrecvPlcd_D = factStat->IrecvPlcd_D; + int_t *factored_D = factStat->factored_D; + int_t *factored_L = factStat->factored_L; + int_t *factored_U = factStat->factored_U; + int_t *IbcastPanel_L = factStat->IbcastPanel_L; + int_t *IbcastPanel_U = factStat->IbcastPanel_U; + int_t *gpuLUreduced = factStat->gpuLUreduced; + int_t *xsup = LUstruct->Glu_persist->xsup; + + // int_t numLAMax = getNumLookAhead(); + int_t numLAMax = getNumLookAhead(options); + int_t numLA = numLAMax; // number of look-ahead panels + int_t superlu_acc_offload = HyP->superlu_acc_offload; + int_t last_flag = 1; /* for updating nsuper-1 only once */ + int_t nCudaStreams = sluGPU->nCudaStreams; // number of cuda streams + + if (superlu_acc_offload) + zsyncAllfunCallStreams(sluGPU, SCT); + + /* Go through each leaf node */ + for (int_t k0 = 0; k0 < eTreeTopLims[1]; ++k0) + { + int_t k = perm_c_supno[k0]; // direct computation no perm_c_supno + int_t offset = k0; + /* k-th diagonal factorization */ + + /* If LU panels 
from GPU are not reduced, then reduce + them before diagonal factorization */ + if (!gpuLUreduced[k] && superlu_acc_offload) + { + double tt_start1 = SuperLU_timer_(); + + zinitD2Hreduce(k, d2Hred, last_flag, + HyP, sluGPU, grid, LUstruct, SCT); + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + + if (copyL_kljb || copyU_kljb) + SCT->PhiMemCpyCounter++; + zsendLUpanelGPU2HOST(k, d2Hred, sluGPU); + + zreduceGPUlu(last_flag, d2Hred, sluGPU, SCT, grid, LUstruct); + + gpuLUreduced[k] = 1; + SCT->PhiMemCpyTimer += SuperLU_timer_() - tt_start1; + } + + double t1 = SuperLU_timer_(); + + /*Now factor and broadcast diagonal block*/ + // sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid, + // options, thresh, LUstruct, stat, info, SCT); + +#if 0 + sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid, + options, thresh, LUstruct, stat, info, SCT, tag_ub); +#else + zDiagFactIBCast(k, k, dFBufs[offset]->BlockUFactor, dFBufs[offset]->BlockLFactor, + factStat->IrecvPlcd_D, + comReqss[offset]->U_diag_blk_recv_req, + comReqss[offset]->L_diag_blk_recv_req, + comReqss[offset]->U_diag_blk_send_req, + comReqss[offset]->L_diag_blk_send_req, + grid, options, thresh, LUstruct, stat, info, SCT, tag_ub); +#endif + factored_D[k] = 1; + + SCT->pdgstrf2_timer += (SuperLU_timer_() - t1); + } /* for all leaves ... */ + + //printf(".. SparseFactor_GPU: after leaves\n"); fflush(stdout); + + /* Process supernodal etree level by level */ + for (int topoLvl = 0; topoLvl < maxTopoLevel; ++topoLvl) + // for (int_t topoLvl = 0; topoLvl < 1; ++topoLvl) + { + // printf("(%d) factor level %d, maxTopoLevel %d\n",grid3d->iam,topoLvl,maxTopoLevel); fflush(stdout); + /* code */ + int k_st = eTreeTopLims[topoLvl]; + int k_end = eTreeTopLims[topoLvl + 1]; + + /* Process all the nodes in 'topoLvl': diagonal factorization */ + for (int k0 = k_st; k0 < k_end; ++k0) + { + int k = perm_c_supno[k0]; // direct computation no perm_c_supno + int offset = k0 - k_st; + + if (!factored_D[k]) + { + /*If LU panels from GPU are not reduced then reduce + them before diagonal factorization*/ + if (!gpuLUreduced[k] && superlu_acc_offload) + { + double tt_start1 = SuperLU_timer_(); + zinitD2Hreduce(k, d2Hred, last_flag, + HyP, sluGPU, grid, LUstruct, SCT); + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + + if (copyL_kljb || copyU_kljb) + SCT->PhiMemCpyCounter++; + zsendLUpanelGPU2HOST(k, d2Hred, sluGPU); + /* + Reduce the LU panels from GPU + */ + zreduceGPUlu(last_flag, d2Hred, sluGPU, SCT, grid, + LUstruct); + + gpuLUreduced[k] = 1; + SCT->PhiMemCpyTimer += SuperLU_timer_() - tt_start1; + } + + double t1 = SuperLU_timer_(); + /* Factor diagonal block on CPU */ + // sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid, + // options, thresh, LUstruct, stat, info, SCT); +#if 0 + sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid, + options, thresh, LUstruct, stat, info, SCT, tag_ub); +#else + zDiagFactIBCast(k, k, dFBufs[offset]->BlockUFactor, dFBufs[offset]->BlockLFactor, + factStat->IrecvPlcd_D, + comReqss[offset]->U_diag_blk_recv_req, + comReqss[offset]->L_diag_blk_recv_req, + comReqss[offset]->U_diag_blk_send_req, + comReqss[offset]->L_diag_blk_send_req, + grid, options, thresh, LUstruct, stat, info, SCT, tag_ub); +#endif + SCT->pdgstrf2_timer += (SuperLU_timer_() - t1); + } + } /* for all nodes in this level */ + + //printf(".. 
SparseFactor_GPU: after diag factorization\n"); fflush(stdout); + + double t_apt = SuperLU_timer_(); /* Async Pipe Timer */ + + /* Process all the nodes in 'topoLvl': panel updates on CPU */ + for (int k0 = k_st; k0 < k_end; ++k0) + { + int k = perm_c_supno[k0]; // direct computation no perm_c_supno + int offset = k0 - k_st; + + /*L update */ + if (factored_L[k] == 0) + { +#if 0 + sLPanelUpdate(k, dFBufs[offset], factStat, comReqss[offset], + grid, LUstruct, SCT); +#else + zLPanelUpdate(k, factStat->IrecvPlcd_D, factStat->factored_L, + comReqss[offset]->U_diag_blk_recv_req, + dFBufs[offset]->BlockUFactor, grid, LUstruct, SCT); +#endif + + factored_L[k] = 1; + } + /*U update*/ + if (factored_U[k] == 0) + { +#if 0 + sUPanelUpdate(k, ldt, dFBufs[offset], factStat, comReqss[offset], + scuBufs, packLUInfo, grid, LUstruct, stat, SCT); +#else + zUPanelUpdate(k, factStat->factored_U, comReqss[offset]->L_diag_blk_recv_req, + dFBufs[offset]->BlockLFactor, scuBufs->bigV, ldt, + packLUInfo->Ublock_info, grid, LUstruct, stat, SCT); +#endif + factored_U[k] = 1; + } + } /* end panel update */ + + //printf(".. after CPU panel updates. numLA %d\n", numLA); fflush(stdout); + + /* Process all the panels in look-ahead window: + broadcast L and U panels. */ + for (int k0 = k_st; k0 < SUPERLU_MIN(k_end, k_st + numLA); ++k0) + { + int k = perm_c_supno[k0]; // direct computation no perm_c_supno + int offset = k0 % numLA; + /* diagonal factorization */ + + /*L Ibcast*/ + if (IbcastPanel_L[k] == 0) + { +#if 0 + sIBcastRecvLPanel( k, comReqss[offset], LUvsbs[offset], + msgss[offset], factStat, grid, LUstruct, SCT, tag_ub ); +#else + zIBcastRecvLPanel(k, k, msgss[offset]->msgcnt, comReqss[offset]->send_req, + comReqss[offset]->recv_req, LUvsbs[offset]->Lsub_buf, + LUvsbs[offset]->Lval_buf, factStat->factored, + grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_L[k] = 1; /*for consistancy; unused later*/ + } + + /*U Ibcast*/ + if (IbcastPanel_U[k] == 0) + { +#if 0 + sIBcastRecvUPanel( k, comReqss[offset], LUvsbs[offset], + msgss[offset], factStat, grid, LUstruct, SCT, tag_ub ); +#else + zIBcastRecvUPanel(k, k, msgss[offset]->msgcnt, comReqss[offset]->send_requ, + comReqss[offset]->recv_requ, LUvsbs[offset]->Usub_buf, + LUvsbs[offset]->Uval_buf, grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_U[k] = 1; + } + } /* end for panels in look-ahead window */ + + //printf(".. after CPU look-ahead updates\n"); fflush(stdout); + + // if (topoLvl) SCT->tAsyncPipeTail += SuperLU_timer_() - t_apt; + SCT->tAsyncPipeTail += (SuperLU_timer_() - t_apt); + + /* Process all the nodes in level 'topoLvl': Schur complement update + (no MPI communication) */ + for (int k0 = k_st; k0 < k_end; ++k0) + { + int k = perm_c_supno[k0]; // direct computation no perm_c_supno + int offset = k0 % numLA; + + double tsch = SuperLU_timer_(); + +#if 0 + sWaitL(k, comReqss[offset], msgss[offset], grid, LUstruct, SCT); + /*Wait for U panel*/ + sWaitU(k, comReqss[offset], msgss[offset], grid, LUstruct, SCT); +#else + zWaitL(k, msgss[offset]->msgcnt, msgss[offset]->msgcntU, + comReqss[offset]->send_req, comReqss[offset]->recv_req, + grid, LUstruct, SCT); + zWaitU(k, msgss[offset]->msgcnt, comReqss[offset]->send_requ, + comReqss[offset]->recv_requ, grid, LUstruct, SCT); +#endif + + int_t LU_nonempty = zSchurComplementSetupGPU(k, + msgss[offset], packLUInfo, + myIperm, gIperm_c_supno, perm_c_supno, + gEtreeInfo, fNlists, scuBufs, + LUvsbs[offset], grid, LUstruct, HyP); + // initializing D2H data transfer. D2H = Device To Host. 
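+            /* jj_cpu partitions the num_u_blks_Phi U-blocks between GPU and
+               CPU: blocks [0, jj_cpu) are offloaded, and the remainder (if
+               any) is updated on the host by the omp-for loop at the end of
+               this iteration. */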
+ int_t jj_cpu; /* limit between CPU and GPU */ + +#if 1 + if (superlu_acc_offload) + { + jj_cpu = HyP->num_u_blks_Phi; // -1 ?? + HyP->offloadCondition = 1; + } + else + { + /* code */ + HyP->offloadCondition = 0; + jj_cpu = 0; + } + +#else + if (superlu_acc_offload) + { + jj_cpu = getAccUPartition(HyP); + + if (jj_cpu > 0) + jj_cpu = HyP->num_u_blks_Phi; + + /* Sherry force this --> */ + jj_cpu = HyP->num_u_blks_Phi; // -1 ?? + HyP->offloadCondition = 1; + } + else + { + jj_cpu = 0; + } +#endif + + // int_t jj_cpu = HyP->num_u_blks_Phi-1; + // if (HyP->Rnbrow > 0 && jj_cpu>=0) + // HyP->offloadCondition = 1; + // else + // HyP->offloadCondition = 0; + // jj_cpu=0; +#if 0 + if ( HyP->offloadCondition ) { + printf("(%d) k=%d, nub=%d, nub_host=%d, nub_phi=%d, jj_cpu %d, offloadCondition %d\n", + grid3d->iam, k, HyP->num_u_blks+HyP->num_u_blks_Phi , + HyP->num_u_blks, HyP->num_u_blks_Phi, + jj_cpu, HyP->offloadCondition); + fflush(stdout); + } +#endif + scuStatUpdate(SuperSize(k), HyP, SCT, stat); + + int_t offload_condition = HyP->offloadCondition; + uPanelInfo_t *uPanelInfo = packLUInfo->uPanelInfo; + lPanelInfo_t *lPanelInfo = packLUInfo->lPanelInfo; + int_t *lsub = lPanelInfo->lsub; + int_t *usub = uPanelInfo->usub; + int_t *indirect = fNlists->indirect; + int_t *indirect2 = fNlists->indirect2; + + /* Schur Complement Update */ + + int_t knsupc = SuperSize(k); + int_t klst = FstBlockC(k + 1); + + doublecomplex *bigV = scuBufs->bigV; + doublecomplex *bigU = scuBufs->bigU; + + double t1 = SuperLU_timer_(); + +#pragma omp parallel /* Look-ahead update on CPU */ + { + int_t thread_id = omp_get_thread_num(); + +#pragma omp for + for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks; ++ij) + { + int_t j = ij / HyP->lookAheadBlk; + int_t lb = ij % HyP->lookAheadBlk; + zblock_gemm_scatterTopLeft(lb, j, bigV, knsupc, klst, lsub, + usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat); + } + +#pragma omp for + for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks_Phi; ++ij) + { + int_t j = ij / HyP->lookAheadBlk; + int_t lb = ij % HyP->lookAheadBlk; + zblock_gemm_scatterTopRight(lb, j, bigV, knsupc, klst, lsub, + usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat); + } + +#pragma omp for + for (int_t ij = 0; ij < HyP->RemainBlk * HyP->num_u_blks; ++ij) + { + int_t j = ij / HyP->RemainBlk; + int_t lb = ij % HyP->RemainBlk; + zblock_gemm_scatterBottomLeft(lb, j, bigV, knsupc, klst, lsub, + usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat); + } /* for int_t ij = ... */ + } /* end parallel region ... end look-ahead update */ + + SCT->lookaheadupdatetimer += (SuperLU_timer_() - t1); + + //printf("... after look-ahead update, topoLvl %d\t maxTopoLevel %d\n", topoLvl, maxTopoLevel); fflush(stdout); + + /* Reduce the L & U panels from GPU to CPU. 
*/ + if (topoLvl < maxTopoLevel - 1) + { /* Not the root */ + int_t k_parent = gEtreeInfo->setree[k]; + gEtreeInfo->numChildLeft[k_parent]--; + if (gEtreeInfo->numChildLeft[k_parent] == 0 && k_parent < nnodes) + { /* if k is the last child in this level */ + int_t k0_parent = myIperm[k_parent]; + if (k0_parent > 0) + { + /* code */ + // printf("Before assert: iam %d, k %d, k_parent %d, k0_parent %d, nnodes %d\n", grid3d->iam, k, k_parent, k0_parent, nnodes); fflush(stdout); + // exit(-1); + assert(k0_parent < nnodes); + int offset = k0_parent - k_end; + if (!gpuLUreduced[k_parent] && superlu_acc_offload) + { + double tt_start1 = SuperLU_timer_(); + + zinitD2Hreduce(k_parent, d2Hred, last_flag, + HyP, sluGPU, grid, LUstruct, SCT); + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + + if (copyL_kljb || copyU_kljb) + SCT->PhiMemCpyCounter++; + zsendLUpanelGPU2HOST(k_parent, d2Hred, sluGPU); + + /* Reduce the LU panels from GPU */ + zreduceGPUlu(last_flag, d2Hred, + sluGPU, SCT, grid, LUstruct); + + gpuLUreduced[k_parent] = 1; + SCT->PhiMemCpyTimer += SuperLU_timer_() - tt_start1; + } + + /* Factorize diagonal block on CPU */ +#if 0 + sDiagFactIBCast(k_parent, dFBufs[offset], factStat, + comReqss[offset], grid, options, thresh, + LUstruct, stat, info, SCT, tag_ub); +#else + zDiagFactIBCast(k_parent, k_parent, dFBufs[offset]->BlockUFactor, + dFBufs[offset]->BlockLFactor, factStat->IrecvPlcd_D, + comReqss[offset]->U_diag_blk_recv_req, + comReqss[offset]->L_diag_blk_recv_req, + comReqss[offset]->U_diag_blk_send_req, + comReqss[offset]->L_diag_blk_send_req, + grid, options, thresh, LUstruct, stat, info, SCT, tag_ub); +#endif + factored_D[k_parent] = 1; + } /* end if k0_parent > 0 */ + + } /* end if all children are done */ + } /* end if non-root */ + +#pragma omp parallel + { + /* Master thread performs Schur complement update on GPU. */ +#pragma omp master + { + if (superlu_acc_offload) + { + int thread_id = omp_get_thread_num(); + double t1 = SuperLU_timer_(); + + if (offload_condition) + { + SCT->datatransfer_count++; + int streamId = k0 % nCudaStreams; + + /*wait for previous offload to get finished*/ + if (sluGPU->lastOffloadStream[streamId] != -1) + { + zwaitGPUscu(streamId, sluGPU, SCT); + sluGPU->lastOffloadStream[streamId] = -1; + } + + int_t Remain_lbuf_send_size = knsupc * HyP->Rnbrow; + int_t bigu_send_size = jj_cpu < 1 ? 0 : HyP->ldu_Phi * HyP->Ublock_info_Phi[jj_cpu - 1].full_u_cols; + assert(bigu_send_size < HyP->bigu_size); + + /* !! Sherry add the test to avoid seg_fault inside + sendSCUdataHost2GPU */ + if (bigu_send_size > 0) + { + zsendSCUdataHost2GPU(streamId, lsub, usub, + bigU, bigu_send_size, + Remain_lbuf_send_size, sluGPU, HyP); + + sluGPU->lastOffloadStream[streamId] = k0; + int_t usub_len = usub[2]; + int_t lsub_len = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR; + //{printf("... 
before SchurCompUpdate_GPU, bigu_send_size %d\n", bigu_send_size); fflush(stdout);} + + zSchurCompUpdate_GPU( + streamId, 0, jj_cpu, klst, knsupc, HyP->Rnbrow, HyP->RemainBlk, + Remain_lbuf_send_size, bigu_send_size, HyP->ldu_Phi, HyP->num_u_blks_Phi, + HyP->buffer_size, lsub_len, usub_len, ldt, k0, sluGPU, grid); + } /* endif bigu_send_size > 0 */ + + // sendLUpanelGPU2HOST( k0, d2Hred, sluGPU); + + SCT->schurPhiCallCount++; + HyP->jj_cpu = jj_cpu; + updateDirtyBit(k0, HyP, grid); + } /* endif (offload_condition) */ + + double t2 = SuperLU_timer_(); + SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double)(t2 - t1); /* not used */ + SCT->CPUOffloadTimer += (double)(t2 - t1); // Sherry added + + } /* endif (superlu_acc_offload) */ + + } /* end omp master thread */ + +#pragma omp for + /* The following update is on CPU. Should not be necessary now, + because we set jj_cpu equal to num_u_blks_Phi. */ + for (int_t ij = 0; ij < HyP->RemainBlk * (HyP->num_u_blks_Phi - jj_cpu); ++ij) + { + //printf(".. WARNING: should NOT get here\n"); + int_t j = ij / HyP->RemainBlk + jj_cpu; + int_t lb = ij % HyP->RemainBlk; + zblock_gemm_scatterBottomRight(lb, j, bigV, knsupc, klst, lsub, + usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat); + } /* for int_t ij = ... */ + + } /* end omp parallel region */ + + //SCT->NetSchurUpTimer += SuperLU_timer_() - tsch; + + // finish waiting for diag block send + int_t abs_offset = k0 - k_st; +#if 0 + sWait_LUDiagSend(k, comReqss[abs_offset], grid, SCT); +#else + Wait_LUDiagSend(k, comReqss[abs_offset]->U_diag_blk_send_req, + comReqss[abs_offset]->L_diag_blk_send_req, + grid, SCT); +#endif + + /*Schedule next I bcasts within look-ahead window */ + for (int next_k0 = k0 + 1; next_k0 < SUPERLU_MIN(k0 + 1 + numLA, nnodes); ++next_k0) + { + /* code */ + int_t next_k = perm_c_supno[next_k0]; + int_t offset = next_k0 % numLA; + + /*L Ibcast*/ + if (IbcastPanel_L[next_k] == 0 && factored_L[next_k]) + { +#if 0 + sIBcastRecvLPanel( next_k, comReqss[offset], + LUvsbs[offset], msgss[offset], factStat, + grid, LUstruct, SCT, tag_ub ); +#else + zIBcastRecvLPanel(next_k, next_k, msgss[offset]->msgcnt, + comReqss[offset]->send_req, comReqss[offset]->recv_req, + LUvsbs[offset]->Lsub_buf, LUvsbs[offset]->Lval_buf, + factStat->factored, grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_L[next_k] = 1; /*will be used later*/ + } + /*U Ibcast*/ + if (IbcastPanel_U[next_k] == 0 && factored_U[next_k]) + { +#if 0 + sIBcastRecvUPanel( next_k, comReqss[offset], + LUvsbs[offset], msgss[offset], factStat, + grid, LUstruct, SCT, tag_ub ); +#else + zIBcastRecvUPanel(next_k, next_k, msgss[offset]->msgcnt, + comReqss[offset]->send_requ, comReqss[offset]->recv_requ, + LUvsbs[offset]->Usub_buf, LUvsbs[offset]->Uval_buf, + grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_U[next_k] = 1; + } + } /* end for look-ahead window */ + + if (topoLvl < maxTopoLevel - 1) /* not root */ + { + /*look-ahead LU factorization*/ + int kx_st = eTreeTopLims[topoLvl + 1]; + int kx_end = eTreeTopLims[topoLvl + 2]; + for (int k0x = kx_st; k0x < kx_end; k0x++) + { + /* code */ + int kx = perm_c_supno[k0x]; + int offset = k0x - kx_st; + if (IrecvPlcd_D[kx] && !factored_L[kx]) + { + /*check if received*/ + int_t recvUDiag = checkRecvUDiag(kx, comReqss[offset], + grid, SCT); + if (recvUDiag) + { +#if 0 + sLPanelTrSolve( kx, dFBufs[offset], + factStat, comReqss[offset], + grid, LUstruct, SCT); +#else + zLPanelTrSolve(kx, factStat->factored_L, + dFBufs[offset]->BlockUFactor, grid, LUstruct); +#endif + 
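+                            /* kx's L panel is now triangular-solved; mark it
+                               factored so the look-ahead broadcast below can
+                               pick it up. */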
+ factored_L[kx] = 1; + + /*check if an L_Ibcast is possible*/ + + if (IbcastPanel_L[kx] == 0 && + k0x - k0 < numLA + 1 && // is within look-ahead window + factored_L[kx]) + { + int_t offset1 = k0x % numLA; +#if 0 + sIBcastRecvLPanel( kx, comReqss[offset1], LUvsbs[offset1], + msgss[offset1], factStat, + grid, LUstruct, SCT, tag_ub); +#else + zIBcastRecvLPanel(kx, kx, msgss[offset1]->msgcnt, + comReqss[offset1]->send_req, + comReqss[offset1]->recv_req, + LUvsbs[offset1]->Lsub_buf, + LUvsbs[offset1]->Lval_buf, + factStat->factored, + grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_L[kx] = 1; /*will be used later*/ + } + } + } + + if (IrecvPlcd_D[kx] && !factored_U[kx]) + { + /*check if received*/ + int_t recvLDiag = checkRecvLDiag(kx, comReqss[offset], + grid, SCT); + if (recvLDiag) + { +#if 0 + sUPanelTrSolve( kx, ldt, dFBufs[offset], scuBufs, packLUInfo, + grid, LUstruct, stat, SCT); +#else + zUPanelTrSolve(kx, dFBufs[offset]->BlockLFactor, + scuBufs->bigV, + ldt, packLUInfo->Ublock_info, + grid, LUstruct, stat, SCT); +#endif + factored_U[kx] = 1; + /*check if an L_Ibcast is possible*/ + + if (IbcastPanel_U[kx] == 0 && + k0x - k0 < numLA + 1 && // is within lookahead window + factored_U[kx]) + { + int_t offset = k0x % numLA; +#if 0 + sIBcastRecvUPanel( kx, comReqss[offset], + LUvsbs[offset], + msgss[offset], factStat, + grid, LUstruct, SCT, tag_ub); +#else + zIBcastRecvUPanel(kx, kx, msgss[offset]->msgcnt, + comReqss[offset]->send_requ, + comReqss[offset]->recv_requ, + LUvsbs[offset]->Usub_buf, + LUvsbs[offset]->Uval_buf, + grid, LUstruct, SCT, tag_ub); +#endif + IbcastPanel_U[kx] = 1; /*will be used later*/ + } + } + } + } /* end look-ahead */ + + } /* end if non-root level */ + + /* end Schur complement update */ + SCT->NetSchurUpTimer += SuperLU_timer_() - tsch; + + } /* end Schur update for all the nodes in level 'topoLvl' */ + + } /* end for all levels of the tree */ + + return 0; +} /* end zsparseTreeFactor_ASYNC_GPU */ From a34e3f468a6435fc141cddd2fc3e9b5d2848aad5 Mon Sep 17 00:00:00 2001 From: Xiaoye Li Date: Sun, 11 Apr 2021 13:37:39 -0400 Subject: [PATCH 073/147] Fixed some bug in xScatter_B3d(). Still buggy on 3x4x4 grid. 
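The old path scattered the layer-0 solution back to the 3D grid with
MPI_Scatterv over zscp.comm, which delivers each block to the wrong
process whenever the 3D ranks are numbered Z-major: block p held by the
layer-0 process with 2D rank q belongs to global rank q*npdep + p, not
to z-rank p of q's own Z scope. The fix below sends each block
point-to-point over grid3d->comm instead. A minimal sketch of the
mapping the new code relies on (dest_of_block/src_of_rank are
illustrative names, not functions in this patch):

    /* layer-0 process with 2D rank q sends its p-th block here */
    static int dest_of_block(int q, int p, int npdep)
    {
        return p + q * npdep;     /* 'dest' in xScatter_B3d() */
    }

    /* every 3D-grid rank r receives its block from this layer-0 rank */
    static int src_of_rank(int r, int npdep)
    {
        return r / npdep;         /* 'src' in xScatter_B3d() */
    }

Note that the double-precision branch below still posts both the sends
and the receive with SuperLU_MPI_DOUBLE_COMPLEX rather than MPI_DOUBLE;
the follow-up patch corrects the datatype, which is likely one source of
the remaining 3x4x4 failures.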
--- EXAMPLE/Makefile | 4 ++-- EXAMPLE/pddrive3d.c | 2 +- EXAMPLE/pzdrive.c | 4 ++++ EXAMPLE/pzdrive3d.c | 2 +- EXAMPLE/pzutil.c | 22 ++++++++++++++--- SRC/dnrformat_loc3d.c | 55 ++++++++++++++++++++++++++++++++++--------- SRC/znrformat_loc3d.c | 55 ++++++++++++++++++++++++++++++++++--------- 7 files changed, 115 insertions(+), 29 deletions(-) diff --git a/EXAMPLE/Makefile b/EXAMPLE/Makefile index 50a786b0..9da5c1ab 100644 --- a/EXAMPLE/Makefile +++ b/EXAMPLE/Makefile @@ -52,7 +52,7 @@ ZEXM1 = pzdrive1.o zcreate_matrix.o ZEXM2 = pzdrive2.o zcreate_matrix.o zcreate_matrix_perturbed.o ZEXM3 = pzdrive3.o zcreate_matrix.o ZEXM4 = pzdrive4.o zcreate_matrix.o -ZEXM3D = pzdrive3d.o zcreate_matrix.o zcreate_matrix3d.o pzutil.o +ZEXM3D = pzdrive3d.o zcreate_matrix.o zcreate_matrix3d.o pzutil.o pzgssvx3d.o znrformat_loc3d.o superlu_grid3d.o ZEXMG = pzdrive_ABglobal.o ZEXMG1 = pzdrive1_ABglobal.o ZEXMG2 = pzdrive2_ABglobal.o @@ -60,7 +60,7 @@ ZEXMG3 = pzdrive3_ABglobal.o ZEXMG4 = pzdrive4_ABglobal.o -all: double complex16 +all: pzdrive3d #double complex16 double: pddrive pddrive1 pddrive2 pddrive3 pddrive4 \ pddrive3d \ diff --git a/EXAMPLE/pddrive3d.c b/EXAMPLE/pddrive3d.c index 1e4b0db5..3ac75d0f 100644 --- a/EXAMPLE/pddrive3d.c +++ b/EXAMPLE/pddrive3d.c @@ -217,7 +217,7 @@ main (int argc, char *argv[]) printf("Input matrix file:\t%s\n", *cpp); printf("3D process grid: %d X %d X %d\n", nprow, npcol, npdep); - printf("2D Process grid: %d X %d\n", (int)grid.nprow, (int)grid.npcol); + //printf("2D Process grid: %d X %d\n", (int)grid.nprow, (int)grid.npcol); fflush(stdout); } diff --git a/EXAMPLE/pzdrive.c b/EXAMPLE/pzdrive.c index 3878558d..16c3f14a 100644 --- a/EXAMPLE/pzdrive.c +++ b/EXAMPLE/pzdrive.c @@ -173,6 +173,10 @@ int main(int argc, char *argv[]) GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. 
------------------------------------------------------------*/ zcreate_matrix_postfix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, postfix, &grid); + if (iam==1) { + printf("(%d) 9341 xtrue[0] %e\n", xtrue[0]); + fflush(stdout); + } if ( !(berr = doubleMalloc_dist(nrhs)) ) ABORT("Malloc fails for berr[]."); diff --git a/EXAMPLE/pzdrive3d.c b/EXAMPLE/pzdrive3d.c index 24ca2777..2eb12796 100644 --- a/EXAMPLE/pzdrive3d.c +++ b/EXAMPLE/pzdrive3d.c @@ -217,7 +217,7 @@ main (int argc, char *argv[]) printf("Input matrix file:\t%s\n", *cpp); printf("3D process grid: %d X %d X %d\n", nprow, npcol, npdep); - printf("2D Process grid: %d X %d\n", (int)grid.nprow, (int)grid.npcol); + //printf("2D Process grid: %d X %d\n", (int)grid.nprow, (int)grid.npcol); fflush(stdout); } diff --git a/EXAMPLE/pzutil.c b/EXAMPLE/pzutil.c index 2bfcfd6e..c48233fc 100644 --- a/EXAMPLE/pzutil.c +++ b/EXAMPLE/pzutil.c @@ -799,7 +799,7 @@ void pzinf_norm_error(int iam, int_t n, int_t nrhs, doublecomplex x[], int_t ldx double err, xnorm, temperr, tempxnorm; doublecomplex *x_work, *xtrue_work; doublecomplex temp; - int i, j; + int i, j, ii; for (j = 0; j < nrhs; j++) { x_work = &x[j*ldx]; @@ -809,8 +809,21 @@ void pzinf_norm_error(int iam, int_t n, int_t nrhs, doublecomplex x[], int_t ldx z_sub(&temp, &x_work[i], &xtrue_work[i]); err = SUPERLU_MAX(err, slud_z_abs(&temp)); xnorm = SUPERLU_MAX(xnorm, slud_z_abs(&x_work[i])); + if (err > 1.e-4 && iam == 1) { + ii = i; + PrintDoublecomplex("x_work(ii)", 5, &x[ii]); + PrintDoublecomplex("x_true(ii)", 5, &xtrue_work[ii]); + break; + } + } + + printf("\t(%d) loc n %d: err = %e\txnorm = %e\n", iam, n, err, xnorm); + if (iam == 1) { + printf("ii %d\n", ii); + PrintDoublecomplex("x_work", 5, x); + PrintDoublecomplex("x_true", 5, xtrue_work); } - printf("\t(%d) err = %e\txnorm = %e\n", iam, err, xnorm); + fflush(stdout); /* get the golbal max err & xnrom */ temperr = err; @@ -819,7 +832,10 @@ void pzinf_norm_error(int iam, int_t n, int_t nrhs, doublecomplex x[], int_t ldx MPI_Allreduce( &tempxnorm, &xnorm, 1, MPI_DOUBLE, MPI_MAX, slucomm); err = err / xnorm; - if ( !iam ) printf("\tSol %2d: ||X-Xtrue||/||X|| = %e\n", j, err); + if ( !iam ) { + printf("\tSol %2d: ||X-Xtrue||/||X|| = %e\n", j, err); + fflush(stdout); + } } } diff --git a/SRC/dnrformat_loc3d.c b/SRC/dnrformat_loc3d.c index 71249b24..31d70312 100644 --- a/SRC/dnrformat_loc3d.c +++ b/SRC/dnrformat_loc3d.c @@ -25,6 +25,7 @@ at the top-level directory. #include "superlu_ddefs.h" +/* Dst <- BlockByBlock (Src), reshape the block storage. */ static void matCopy(int n, int m, double *Dst, int lddst, double *Src, int ldsrc) { for (int j = 0; j < m; j++) @@ -43,7 +44,7 @@ static void matCopy(int n, int m, double *Dst, int lddst, double *Src, int ldsrc * output is in the returned A3d->{} structure. * see supermatrix.h for nrformat_loc3d{} structure. 
*/ -NRformat_loc3d *dGatherNRformat_loc3d(NRformat_loc *A, // input +NRformat_loc3d *dGatherNRformat_loc3d(NRformat_loc *A, // input, on 3D grid double *B, // input int ldb, int nrhs, // input gridinfo3d_t *grid3d) @@ -56,9 +57,13 @@ NRformat_loc3d *dGatherNRformat_loc3d(NRformat_loc *A, // input A3d->nrhs = nrhs; // find number of nnzs - int_t *nnz_counts, *row_counts; - int *nnz_disp, *row_disp, *nnz_counts_int, *row_counts_int; - int *b_counts_int, *b_disp; + int_t *nnz_counts; // number of local nonzeros relative to all processes + int_t *row_counts; // number of local rows relative to all processes + int *nnz_counts_int, *row_counts_int; // 32-bit + int *nnz_disp, *row_disp; // displacement + int *b_counts_int; // number of local B entries relative to all processes + int *b_disp; // including 'nrhs' + nnz_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t)); row_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t)); nnz_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int)); @@ -191,25 +196,29 @@ NRformat_loc3d *dGatherNRformat_loc3d(NRformat_loc *A, // input int dScatter_B3d(NRformat_loc3d *A3d, // modified gridinfo3d_t *grid3d) { - - double *B = (double *) A3d->B; + double *B = (double *) A3d->B; // on 3D grid int ldb = A3d->ldb; int nrhs = A3d->nrhs; - double *B2d = (double *) A3d->B2d; + double *B2d = (double *) A3d->B2d; // on 2D layer 0 NRformat_loc A2d = *(A3d->A_nfmt); + + /* The following are the number of local rows relative to all processes */ int m_loc = A3d->m_loc; int *b_counts_int = A3d->b_counts_int; int *b_disp = A3d->b_disp; int *row_counts_int = A3d->row_counts_int; int *row_disp = A3d->row_disp; - double *B1; + gridinfo_t *grid2d = &(grid3d->grid2d); + int iam = grid3d->iam; + + double *B1; // on 2D layer 0 if (grid3d->zscp.Iam == 0) { B1 = SUPERLU_MALLOC(A2d.m_loc * nrhs * sizeof(double)); } - // B1 <- blockByBock(b2d) + // B1 <- blockByBlock(b2d) if (grid3d->zscp.Iam == 0) { for (int i = 0; i < grid3d->npdep; ++i) @@ -220,13 +229,37 @@ int dScatter_B3d(NRformat_loc3d *A3d, // modified } } - // - double *Btmp; + double *Btmp; // on 3D grid Btmp = SUPERLU_MALLOC(A3d->m_loc * nrhs * sizeof(double)); +#if 0 // This is a bug: the result of this scatter is a "permuted" distribution // Btmp <- scatterv(B1) MPI_Scatterv(B1, b_counts_int, b_disp, MPI_DOUBLE, Btmp, nrhs * A3d->m_loc, MPI_DOUBLE, 0, grid3d->zscp.comm); +#else + /* For example, in 1x3x4 grid, layer 0 has procs:{0,1,2}, the process + scattering pattern is: + 0 -> {0,1,2,3}, 1 -> {4,5,6,7}, 2 -> {8,9,10,11} + This is different from the scattering pattern along Z-dimension. + */ + if (grid3d->zscp.Iam == 0) // processes on layer 0 + { + MPI_Request send_req; + for (int p = 0; p < grid3d->npdep; ++p) { // send to npdep procs + int dest = p + grid2d->iam * grid3d->npdep; + int tag = dest; + + MPI_Isend(B1 + b_disp[p], b_counts_int[p], SuperLU_MPI_DOUBLE_COMPLEX, + dest, tag, grid3d->comm, &send_req); + } + } + + /* Everyone receives one block */ + MPI_Status status; + int src = grid3d->iam / grid3d->npdep; // which proc the data should come from + MPI_Recv(Btmp, nrhs * A3d->m_loc, SuperLU_MPI_DOUBLE_COMPLEX, + src, grid3d->iam, grid3d->comm, &status); +#endif // B <- colMajor(Btmp) matCopy(A3d->m_loc, nrhs, B, ldb, Btmp, A3d->m_loc); diff --git a/SRC/znrformat_loc3d.c b/SRC/znrformat_loc3d.c index 92e6e085..114b847e 100644 --- a/SRC/znrformat_loc3d.c +++ b/SRC/znrformat_loc3d.c @@ -24,6 +24,7 @@ at the top-level directory. #include "superlu_zdefs.h" +/* Dst <- BlockByBlock (Src), reshape the block storage. 
*/ static void matCopy(int n, int m, doublecomplex *Dst, int lddst, doublecomplex *Src, int ldsrc) { for (int j = 0; j < m; j++) @@ -42,7 +43,7 @@ static void matCopy(int n, int m, doublecomplex *Dst, int lddst, doublecomplex * * output is in the returned A3d->{} structure. * see supermatrix.h for nrformat_loc3d{} structure. */ -NRformat_loc3d *zGatherNRformat_loc3d(NRformat_loc *A, // input +NRformat_loc3d *zGatherNRformat_loc3d(NRformat_loc *A, // input, on 3D grid doublecomplex *B, // input int ldb, int nrhs, // input gridinfo3d_t *grid3d) @@ -55,9 +56,13 @@ NRformat_loc3d *zGatherNRformat_loc3d(NRformat_loc *A, // input A3d->nrhs = nrhs; // find number of nnzs - int_t *nnz_counts, *row_counts; - int *nnz_disp, *row_disp, *nnz_counts_int, *row_counts_int; - int *b_counts_int, *b_disp; + int_t *nnz_counts; // number of local nonzeros relative to all processes + int_t *row_counts; // number of local rows relative to all processes + int *nnz_counts_int, *row_counts_int; // 32-bit + int *nnz_disp, *row_disp; // displacement + int *b_counts_int; // number of local B entries relative to all processes + int *b_disp; // including 'nrhs' + nnz_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t)); row_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t)); nnz_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int)); @@ -190,25 +195,29 @@ NRformat_loc3d *zGatherNRformat_loc3d(NRformat_loc *A, // input int zScatter_B3d(NRformat_loc3d *A3d, // modified gridinfo3d_t *grid3d) { - - doublecomplex *B = (doublecomplex *) A3d->B; + doublecomplex *B = (doublecomplex *) A3d->B; // on 3D grid int ldb = A3d->ldb; int nrhs = A3d->nrhs; - doublecomplex *B2d = (doublecomplex *) A3d->B2d; + doublecomplex *B2d = (doublecomplex *) A3d->B2d; // on 2D layer 0 NRformat_loc A2d = *(A3d->A_nfmt); + + /* The following are the number of local rows relative to all processes */ int m_loc = A3d->m_loc; int *b_counts_int = A3d->b_counts_int; int *b_disp = A3d->b_disp; int *row_counts_int = A3d->row_counts_int; int *row_disp = A3d->row_disp; - doublecomplex *B1; + gridinfo_t *grid2d = &(grid3d->grid2d); + int iam = grid3d->iam; + + doublecomplex *B1; // on 2D layer 0 if (grid3d->zscp.Iam == 0) { B1 = SUPERLU_MALLOC(A2d.m_loc * nrhs * sizeof(doublecomplex)); } - // B1 <- blockByBock(b2d) + // B1 <- blockByBlock(b2d) if (grid3d->zscp.Iam == 0) { for (int i = 0; i < grid3d->npdep; ++i) @@ -219,13 +228,37 @@ int zScatter_B3d(NRformat_loc3d *A3d, // modified } } - // - doublecomplex *Btmp; + doublecomplex *Btmp; // on 3D grid Btmp = SUPERLU_MALLOC(A3d->m_loc * nrhs * sizeof(doublecomplex)); +#if 0 // This is a bug: the result of this scatter is a "permuted" distribution // Btmp <- scatterv(B1) MPI_Scatterv(B1, b_counts_int, b_disp, SuperLU_MPI_DOUBLE_COMPLEX, Btmp, nrhs * A3d->m_loc, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid3d->zscp.comm); +#else + /* For example, in 1x3x4 grid, layer 0 has procs:{0,1,2}, the process + scattering pattern is: + 0 -> {0,1,2,3}, 1 -> {4,5,6,7}, 2 -> {8,9,10,11} + This is different from the scattering pattern along Z-dimension. 
+ */ + if (grid3d->zscp.Iam == 0) // processes on layer 0 + { + MPI_Request send_req; + for (int p = 0; p < grid3d->npdep; ++p) { // send to npdep procs + int dest = p + grid2d->iam * grid3d->npdep; + int tag = dest; + + MPI_Isend(B1 + b_disp[p], b_counts_int[p], SuperLU_MPI_DOUBLE_COMPLEX, + dest, tag, grid3d->comm, &send_req); + } + } + + /* Everyone receives one block */ + MPI_Status status; + int src = grid3d->iam / grid3d->npdep; // which proc the data should come from + MPI_Recv(Btmp, nrhs * A3d->m_loc, SuperLU_MPI_DOUBLE_COMPLEX, + src, grid3d->iam, grid3d->comm, &status); +#endif // B <- colMajor(Btmp) matCopy(A3d->m_loc, nrhs, B, ldb, Btmp, A3d->m_loc); From 5d8cf04a1a802761c6488347ddb4b4a76985bc41 Mon Sep 17 00:00:00 2001 From: Yang Liu Date: Fri, 16 Apr 2021 12:04:25 -0700 Subject: [PATCH 074/147] guide DisplayHeader() with #if ( PRNTlevel>=1 ) --- SRC/pdgstrf.c | 3 ++- SRC/pzgstrf.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/SRC/pdgstrf.c b/SRC/pdgstrf.c index 1d60904d..17491399 100644 --- a/SRC/pdgstrf.c +++ b/SRC/pdgstrf.c @@ -834,9 +834,10 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm, if ( checkCuda(cudaHostAlloc((void**)&bigV, bigv_size * sizeof(double) ,cudaHostAllocDefault)) ) ABORT("Malloc fails for dgemm buffer V"); - DisplayHeader(); + #if ( PRNTlevel>=1 ) + DisplayHeader(); printf(" Starting with %d Cuda Streams \n",nstreams ); #endif diff --git a/SRC/pzgstrf.c b/SRC/pzgstrf.c index 20d32ad8..a355c8f6 100644 --- a/SRC/pzgstrf.c +++ b/SRC/pzgstrf.c @@ -834,9 +834,10 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm, if ( checkCuda(cudaHostAlloc((void**)&bigV, bigv_size * sizeof(doublecomplex) ,cudaHostAllocDefault)) ) ABORT("Malloc fails for zgemm buffer V"); - DisplayHeader(); + #if ( PRNTlevel>=1 ) + DisplayHeader(); printf(" Starting with %d Cuda Streams \n",nstreams ); #endif From 0f34fbe7f7150fd036a5a78eb2438fefa254189b Mon Sep 17 00:00:00 2001 From: Xiaoye Li Date: Sat, 24 Apr 2021 16:34:01 -0400 Subject: [PATCH 075/147] Fixed some problems in xScatter_B3d() routine: from X2d -> X3d. Support 3D grid both in "Z-major" and in "XY-major". 
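The 2D-to-3D scatter now honors both rank orderings of the 3D grid: a
new 'rankorder' field in gridinfo3d_t records whether
superlu_gridmap3d() numbered the processes Z-major (the default) or
XY-major (selected via the RANKORDER environment variable). XY-major
keeps the simple MPI_Scatterv along zscp.comm; Z-major uses explicit
point-to-point transfers. Below is a minimal skeleton of the Z-major
branch, with names as in xScatter_B3d() and error handling omitted --
every rank posts its receive before layer 0 starts the blocking sends,
so a layer-0 process that sends a block to itself cannot deadlock:

    MPI_Request req;
    int src = grid3d->iam / npdep;              /* Z-major owner on layer 0 */
    MPI_Irecv(Btmp, nrhs * m_loc, MPI_DOUBLE, src, /* tag = */ grid3d->iam,
              grid3d->comm, &req);
    if (grid3d->zscp.Iam == 0)                  /* layer 0 scatters blocks */
        for (int p = 0; p < npdep; ++p) {
            int dest = p + grid2d->iam * npdep; /* Z-major order */
            MPI_Send(B1 + b_disp[p], b_counts_int[p], MPI_DOUBLE,
                     dest, dest, grid3d->comm);
        }
    MPI_Wait(&req, MPI_STATUS_IGNORE);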
--- EXAMPLE/Makefile | 4 +- EXAMPLE/pzutil.c | 9 +++- SRC/dnrformat_loc3d.c | 105 ++++++++++++++++++++++++++------------ SRC/dutil_dist.c | 4 +- SRC/pdgssvx3d.c | 3 +- SRC/pzgssvx3d.c | 3 +- SRC/superlu_defs.h | 15 +++++- SRC/superlu_dist_config.h | 19 ++----- SRC/superlu_grid3d.c | 29 ++++++++--- SRC/supermatrix.h | 4 +- SRC/util.c | 3 +- SRC/znrformat_loc3d.c | 105 ++++++++++++++++++++++++++------------ SRC/zutil_dist.c | 16 ++++-- 13 files changed, 209 insertions(+), 110 deletions(-) diff --git a/EXAMPLE/Makefile b/EXAMPLE/Makefile index 9da5c1ab..5af38147 100644 --- a/EXAMPLE/Makefile +++ b/EXAMPLE/Makefile @@ -52,7 +52,7 @@ ZEXM1 = pzdrive1.o zcreate_matrix.o ZEXM2 = pzdrive2.o zcreate_matrix.o zcreate_matrix_perturbed.o ZEXM3 = pzdrive3.o zcreate_matrix.o ZEXM4 = pzdrive4.o zcreate_matrix.o -ZEXM3D = pzdrive3d.o zcreate_matrix.o zcreate_matrix3d.o pzutil.o pzgssvx3d.o znrformat_loc3d.o superlu_grid3d.o +ZEXM3D = pzdrive3d.o zcreate_matrix.o zcreate_matrix3d.o ZEXMG = pzdrive_ABglobal.o ZEXMG1 = pzdrive1_ABglobal.o ZEXMG2 = pzdrive2_ABglobal.o @@ -60,7 +60,7 @@ ZEXMG3 = pzdrive3_ABglobal.o ZEXMG4 = pzdrive4_ABglobal.o -all: pzdrive3d #double complex16 +all: double complex16 double: pddrive pddrive1 pddrive2 pddrive3 pddrive4 \ pddrive3d \ diff --git a/EXAMPLE/pzutil.c b/EXAMPLE/pzutil.c index c48233fc..153dca2c 100644 --- a/EXAMPLE/pzutil.c +++ b/EXAMPLE/pzutil.c @@ -809,22 +809,27 @@ void pzinf_norm_error(int iam, int_t n, int_t nrhs, doublecomplex x[], int_t ldx z_sub(&temp, &x_work[i], &xtrue_work[i]); err = SUPERLU_MAX(err, slud_z_abs(&temp)); xnorm = SUPERLU_MAX(xnorm, slud_z_abs(&x_work[i])); +#if 1 if (err > 1.e-4 && iam == 1) { ii = i; + printf("(wrong proc %d) wrong index ii %d\n", iam, ii); PrintDoublecomplex("x_work(ii)", 5, &x[ii]); PrintDoublecomplex("x_true(ii)", 5, &xtrue_work[ii]); + fflush(stdout); break; } +#endif } +#if 0 printf("\t(%d) loc n %d: err = %e\txnorm = %e\n", iam, n, err, xnorm); - if (iam == 1) { + if (iam == 4) { printf("ii %d\n", ii); PrintDoublecomplex("x_work", 5, x); PrintDoublecomplex("x_true", 5, xtrue_work); } fflush(stdout); - +#endif /* get the golbal max err & xnrom */ temperr = err; tempxnorm = xnorm; diff --git a/SRC/dnrformat_loc3d.c b/SRC/dnrformat_loc3d.c index 31d70312..f2bec71d 100644 --- a/SRC/dnrformat_loc3d.c +++ b/SRC/dnrformat_loc3d.c @@ -52,7 +52,7 @@ NRformat_loc3d *dGatherNRformat_loc3d(NRformat_loc *A, // input, on 3D grid NRformat_loc3d *A3d = SUPERLU_MALLOC(sizeof(NRformat_loc3d)); NRformat_loc *A2d = SUPERLU_MALLOC(sizeof(NRformat_loc)); A3d->m_loc = A->m_loc; - A3d->B = (double *) B; // on 3D process grid + A3d->B3d = (double *) B; // on 3D process grid A3d->ldb = ldb; A3d->nrhs = nrhs; @@ -191,12 +191,12 @@ NRformat_loc3d *dGatherNRformat_loc3d(NRformat_loc *A, // input, on 3D grid /* * Scatter B (solution) from 2D process layer 0 to 3D grid - * Output: X2d <- A^{-1} B2d + * Output: X3d <- A^{-1} B2d */ int dScatter_B3d(NRformat_loc3d *A3d, // modified gridinfo3d_t *grid3d) { - double *B = (double *) A3d->B; // on 3D grid + double *B = (double *) A3d->B3d; // on 3D grid int ldb = A3d->ldb; int nrhs = A3d->nrhs; double *B2d = (double *) A3d->B2d; // on 2D layer 0 @@ -208,9 +208,10 @@ int dScatter_B3d(NRformat_loc3d *A3d, // modified int *b_disp = A3d->b_disp; int *row_counts_int = A3d->row_counts_int; int *row_disp = A3d->row_disp; - - gridinfo_t *grid2d = &(grid3d->grid2d); + int i, p; int iam = grid3d->iam; + int rankorder = grid3d->rankorder; + gridinfo_t *grid2d = &(grid3d->grid2d); double *B1; // on 2D layer 0 if 
(grid3d->zscp.Iam == 0) @@ -218,10 +219,10 @@ int dScatter_B3d(NRformat_loc3d *A3d, // modified B1 = SUPERLU_MALLOC(A2d.m_loc * nrhs * sizeof(double)); } - // B1 <- blockByBlock(b2d) + // B1 <- BlockByBlock(B2d) if (grid3d->zscp.Iam == 0) { - for (int i = 0; i < grid3d->npdep; ++i) + for (i = 0; i < grid3d->npdep; ++i) { /* code */ matCopy(row_counts_int[i], nrhs, B1 + nrhs * row_disp[i], row_counts_int[i], @@ -232,34 +233,70 @@ int dScatter_B3d(NRformat_loc3d *A3d, // modified double *Btmp; // on 3D grid Btmp = SUPERLU_MALLOC(A3d->m_loc * nrhs * sizeof(double)); -#if 0 // This is a bug: the result of this scatter is a "permuted" distribution - // Btmp <- scatterv(B1) - MPI_Scatterv(B1, b_counts_int, b_disp, MPI_DOUBLE, - Btmp, nrhs * A3d->m_loc, MPI_DOUBLE, 0, grid3d->zscp.comm); -#else - /* For example, in 1x3x4 grid, layer 0 has procs:{0,1,2}, the process - scattering pattern is: - 0 -> {0,1,2,3}, 1 -> {4,5,6,7}, 2 -> {8,9,10,11} - This is different from the scattering pattern along Z-dimension. - */ - if (grid3d->zscp.Iam == 0) // processes on layer 0 - { - MPI_Request send_req; - for (int p = 0; p < grid3d->npdep; ++p) { // send to npdep procs - int dest = p + grid2d->iam * grid3d->npdep; - int tag = dest; - - MPI_Isend(B1 + b_disp[p], b_counts_int[p], SuperLU_MPI_DOUBLE_COMPLEX, - dest, tag, grid3d->comm, &send_req); - } - } + // Btmp <- scatterv(B1), block-by-block + if ( rankorder == 1 ) { /* XY-major in 3D grid */ + /* e.g. 1x3x4 grid: layer0 layer1 layer2 layer3 + * 0 1 2 4 + * 5 6 7 8 + * 9 10 11 12 + */ + MPI_Scatterv(B1, b_counts_int, b_disp, MPI_DOUBLE, + Btmp, nrhs * A3d->m_loc, MPI_DOUBLE, + 0, grid3d->zscp.comm); + + } else { /* Z-major in 3D grid */ + /* e.g. 1x3x4 grid: layer0 layer1 layer2 layer3 + 0 3 6 9 + 1 4 7 10 + 2 5 8 11 + GATHER: {A, B} in A * X = B + layer-0: + B (row space) X (column space) SCATTER + ---- ---- ---->> + P0 0 0 +(equations 3 1 Proc 0 -> Procs {0, 1, 2, 3} + reordered 6 2 + after gather) 9 3 + ---- ---- + P1 1 4 Proc 1 -> Procs {4, 5, 6, 7} + 4 5 + 7 6 + 10 7 + ---- ---- + P2 2 8 Proc 2 -> Procs {8, 9, 10, 11} + 5 9 + 8 10 + 11 11 + ---- ---- + */ + MPI_Request recv_req; + MPI_Status recv_status; + int pxy = grid2d->nprow * grid2d->npcol; + int npdep = grid3d->npdep, dest, src, tag; + int nprocs = pxy * npdep; + + /* Everyone receives one block (post non-blocking irecv) */ + src = grid3d->iam / npdep; // Z-major + tag = iam; + MPI_Irecv(Btmp, nrhs * A3d->m_loc, MPI_DOUBLE, + src, tag, grid3d->comm, &recv_req); + + /* Layer 0 sends to npdep procs */ + if (grid3d->zscp.Iam == 0) { + int dest, tag; + for (p = 0; p < npdep; ++p) { // send to npdep procs + dest = p + grid2d->iam * npdep; // Z-major order + tag = dest; + + MPI_Send(B1 + b_disp[p], b_counts_int[p], + MPI_DOUBLE, dest, tag, grid3d->comm); + } + } /* end layer 0 send */ - /* Everyone receives one block */ - MPI_Status status; - int src = grid3d->iam / grid3d->npdep; // which proc the data should come from - MPI_Recv(Btmp, nrhs * A3d->m_loc, SuperLU_MPI_DOUBLE_COMPLEX, - src, grid3d->iam, grid3d->comm, &status); -#endif + /* Wait for Irecv to complete */ + MPI_Wait(&recv_req, &recv_status); + + } /* else Z-major */ // B <- colMajor(Btmp) matCopy(A3d->m_loc, nrhs, B, ldb, Btmp, A3d->m_loc); diff --git a/SRC/dutil_dist.c b/SRC/dutil_dist.c index 8be5c73d..8291b470 100644 --- a/SRC/dutil_dist.c +++ b/SRC/dutil_dist.c @@ -489,8 +489,8 @@ dGenXtrue_dist(int_t n, int_t nrhs, double *x, int_t ldx) int i, j; for (j = 0; j < nrhs; ++j) for (i = 0; i < n; ++i) { - if ( i % 2 ) x[i + j*ldx] = 1.0;/* 
+ (double)(i+1.)/n;*/ - else x[i + j*ldx] = 1.0; + if ( i % 2 ) x[i + j*ldx] = 1.0 + (double)(i+1.)/n; + else x[i + j*ldx] = 1.0 - (double)(i+1.)/n; } } diff --git a/SRC/pdgssvx3d.c b/SRC/pdgssvx3d.c index 0b8bb280..cc064ff7 100644 --- a/SRC/pdgssvx3d.c +++ b/SRC/pdgssvx3d.c @@ -552,7 +552,7 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, /* Save the inputs: ldb -> ldb3d, and B -> B3d, Astore -> Astore3d B3d and Astore3d will be restored on return */ int ldb3d = ldb; - double *B3d = B; + // double *B3d = B; NRformat_loc *Astore3d = (NRformat_loc *)A->Store; double *B2d; NRformat_loc3d *A3d = dGatherNRformat_loc3d((NRformat_loc *)A->Store, @@ -1560,6 +1560,7 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, /* Scatter the solution from 2D grid_0 to 3D grid */ dScatter_B3d(A3d, grid3d); + B = A3d->B3d; // B is now assigned back to B3d on return A->Store = Astore3d; // restore Astore to 3D /* free A2d and B2d, which are allocated only in 2D layer Grid_0 */ diff --git a/SRC/pzgssvx3d.c b/SRC/pzgssvx3d.c index a6f9fc42..86873c9c 100644 --- a/SRC/pzgssvx3d.c +++ b/SRC/pzgssvx3d.c @@ -551,7 +551,7 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, /* Save the inputs: ldb -> ldb3d, and B -> B3d, Astore -> Astore3d B3d and Astore3d will be restored on return */ int ldb3d = ldb; - doublecomplex *B3d = B; + // doublecomplex *B3d = B; NRformat_loc *Astore3d = (NRformat_loc *)A->Store; doublecomplex *B2d; NRformat_loc3d *A3d = zGatherNRformat_loc3d((NRformat_loc *)A->Store, @@ -1561,6 +1561,7 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, /* Scatter the solution from 2D grid_0 to 3D grid */ zScatter_B3d(A3d, grid3d); + B = A3d->B3d; // B is now assigned back to B3d on return A->Store = Astore3d; // restore Astore to 3D /* free A2d and B2d, which are allocated only in 2D layer Grid_0 */ diff --git a/SRC/superlu_defs.h b/SRC/superlu_defs.h index ad9c0234..b228a68e 100644 --- a/SRC/superlu_defs.h +++ b/SRC/superlu_defs.h @@ -354,7 +354,7 @@ typedef struct { MPI_Comm comm; /* MPI communicator */ superlu_scope_t rscp; /* process scope in rowwise, horizontal directon */ superlu_scope_t cscp; /* process scope in columnwise, vertical direction */ - int iam; /* my process number in this scope */ + int iam; /* my process number in this grid */ int_t nprow; /* number of process rows */ int_t npcol; /* number of process columns */ } gridinfo_t; @@ -365,11 +365,22 @@ typedef struct { superlu_scope_t rscp; /* row scope */ superlu_scope_t cscp; /* column scope */ superlu_scope_t zscp; /* scope in third dimension */ + gridinfo_t grid2d; /* for using 2D functions */ int iam; /* my process number in this grid */ int_t nprow; /* number of process rows */ int_t npcol; /* number of process columns */ int_t npdep; /* number of replication factor in Z-dimension */ - gridinfo_t grid2d; /* for using 2D functions */ + int rankorder; /* = 0: Z-major ( default ) + * e.g. 1x3x4 grid: layer0 layer1 layer2 layer3 + * 0 3 6 9 + * 1 4 7 10 + * 2 5 8 11 + * = 1: XY-major (need set env. var.: RANKORDER=XY) + * e.g. 
1x3x4 grid: layer0 layer1 layer2 layer3 + * 0 1 2 4 + * 5 6 7 8 + * 9 10 11 12 + */ } gridinfo3d_t; diff --git a/SRC/superlu_dist_config.h b/SRC/superlu_dist_config.h index 8dcabe22..ec3d9f9a 100644 --- a/SRC/superlu_dist_config.h +++ b/SRC/superlu_dist_config.h @@ -1,20 +1,7 @@ -/* superlu_dist_config.h.in */ - -/* Enable CUDA */ -#define HAVE_CUDA TRUE - -/* Enable parmetis */ +/* #define XSDK_INDEX_SIZE 64 */ +/* #define SLU_HAVE_LAPACK TRUE */ #define HAVE_PARMETIS TRUE - -/* Enable LAPACK */ -/* #undef SLU_HAVE_LAPACK */ - -/* Enable CombBLAS */ -/* #undef HAVE_COMBBLAS */ - -/* enable 64bit index mode */ -/* #undef XSDK_INDEX_SIZE */ - +/* #define HAVE_COMBBLAS TRUE */ #if (XSDK_INDEX_SIZE == 64) #define _LONGINT 1 #endif diff --git a/SRC/superlu_grid3d.c b/SRC/superlu_grid3d.c index 8dd12462..d34c1ccc 100644 --- a/SRC/superlu_grid3d.c +++ b/SRC/superlu_grid3d.c @@ -120,10 +120,12 @@ void superlu_gridmap3d( if (getenv("RANKORDER") && strcmp(getenv("RANKORDER"), "XY" )) { + grid->rankorder = 1; // XY-major + dims[0] = nprow; dims[1] = npcol; dims[2] = npdep; - + // create the new communicator int error = MPI_Cart_create(grid->comm, ndim, dims, periodic, reorder, &superlu3d_comm); @@ -133,7 +135,6 @@ void superlu_gridmap3d( grid->iam = iam; MPI_Cart_coords(superlu3d_comm, iam, ndim, coords3d); - // printf("My coordinats are (%d %d %d)\n", coords3d[0], coords3d[1], coords3d[2] ); int rowc[3] = {1, 0, 0}; int colc[3] = {0, 1, 0}; int depc[3] = {0, 0, 1}; @@ -160,7 +161,9 @@ void superlu_gridmap3d( int xyc[3] = {1, 1, 0}; MPI_Cart_sub(superlu3d_comm, xyc, &(grid->grid2d.comm)); - } else { + } else { /* default */ + grid->rankorder = 0; // Z-major + dims[1] = nprow; dims[2] = npcol; dims[0] = npdep; @@ -174,27 +177,29 @@ void superlu_gridmap3d( grid->iam = iam; MPI_Cart_coords(superlu3d_comm, iam, ndim, coords3d); - // create row communicator; + /* printf("(%d) My coordinats are (%d %d %d)\n", + iam, coords3d[0], coords3d[1], coords3d[2] ); + fflush(stdout); */ + + // create row communicator - // printf("My coordinats are (%d %d %d)\n", coords3d[0], coords3d[1], coords3d[2] ); int rowc[3] = {0, 1, 0}; int colc[3] = {0, 0, 1}; int depc[3] = {1, 0, 0}; - MPI_Cart_sub(superlu3d_comm, colc, &(grid->rscp.comm)); MPI_Cart_sub(superlu3d_comm, rowc, &(grid->cscp.comm)); MPI_Cart_sub(superlu3d_comm, depc, &(grid->zscp.comm)); + // 2x3: 0,2,4 / 1,3,5 column-major grid->cscp.Np = nprow; grid->cscp.Iam = coords3d[1]; grid->rscp.Np = npcol; grid->rscp.Iam = coords3d[2]; + grid->zscp.Np = npdep; grid->zscp.Iam = coords3d[0]; - //printf("(Iam %d) grid->zscp.Np = %d\n", grid->iam, grid->zscp.Np); - grid->nprow = nprow; grid->npcol = npcol; grid->npdep = npdep; @@ -216,6 +221,14 @@ void superlu_gridmap3d( // grid->grid2d.cscp = grid->cscp; +#if 0 + if ( (grid->zscp).Iam == 0) { + printf("(3d grid: layer 0) iam %d, grid->grid2d.iam %d\n", + grid->iam, (grid->grid2d).iam); + } + fflush(stdout); +#endif + gridmap_out: SUPERLU_FREE(pranks); MPI_Group_free( &superlu_grp ); diff --git a/SRC/supermatrix.h b/SRC/supermatrix.h index db70575d..727e966c 100644 --- a/SRC/supermatrix.h +++ b/SRC/supermatrix.h @@ -192,11 +192,11 @@ typedef struct { typedef struct NRformat_loc3d { NRformat_loc* A_nfmt; - void* B; // distributed on 3D process grid + void* B3d; // on the entire 3D process grid int ldb; int nrhs; int m_loc; - void* B2d; // on 2D process layer Grid_0 + void* B2d; // on 2D process layer Grid_0 int* row_counts_int; // these counts are for {A, B} distributed on 2D layer 0 int* row_disp; diff --git a/SRC/util.c 
b/SRC/util.c index f428b42b..42c41725 100644 --- a/SRC/util.c +++ b/SRC/util.c @@ -1503,6 +1503,8 @@ gemm_division_new (int * num_streams_used, /*number of streams that will be us } } +#endif /* defined GPU_ACC */ + /* The following are moved from superlu_gpu.cu */ int getnCudaStreams() @@ -1535,4 +1537,3 @@ int get_mpi_process_per_gpu () } } -#endif /* defined GPU_ACC */ diff --git a/SRC/znrformat_loc3d.c b/SRC/znrformat_loc3d.c index 114b847e..c4f7d1e3 100644 --- a/SRC/znrformat_loc3d.c +++ b/SRC/znrformat_loc3d.c @@ -51,7 +51,7 @@ NRformat_loc3d *zGatherNRformat_loc3d(NRformat_loc *A, // input, on 3D grid NRformat_loc3d *A3d = SUPERLU_MALLOC(sizeof(NRformat_loc3d)); NRformat_loc *A2d = SUPERLU_MALLOC(sizeof(NRformat_loc)); A3d->m_loc = A->m_loc; - A3d->B = (doublecomplex *) B; // on 3D process grid + A3d->B3d = (doublecomplex *) B; // on 3D process grid A3d->ldb = ldb; A3d->nrhs = nrhs; @@ -190,12 +190,12 @@ NRformat_loc3d *zGatherNRformat_loc3d(NRformat_loc *A, // input, on 3D grid /* * Scatter B (solution) from 2D process layer 0 to 3D grid - * Output: X2d <- A^{-1} B2d + * Output: X3d <- A^{-1} B2d */ int zScatter_B3d(NRformat_loc3d *A3d, // modified gridinfo3d_t *grid3d) { - doublecomplex *B = (doublecomplex *) A3d->B; // on 3D grid + doublecomplex *B = (doublecomplex *) A3d->B3d; // on 3D grid int ldb = A3d->ldb; int nrhs = A3d->nrhs; doublecomplex *B2d = (doublecomplex *) A3d->B2d; // on 2D layer 0 @@ -207,9 +207,10 @@ int zScatter_B3d(NRformat_loc3d *A3d, // modified int *b_disp = A3d->b_disp; int *row_counts_int = A3d->row_counts_int; int *row_disp = A3d->row_disp; - - gridinfo_t *grid2d = &(grid3d->grid2d); + int i, p; int iam = grid3d->iam; + int rankorder = grid3d->rankorder; + gridinfo_t *grid2d = &(grid3d->grid2d); doublecomplex *B1; // on 2D layer 0 if (grid3d->zscp.Iam == 0) @@ -217,10 +218,10 @@ int zScatter_B3d(NRformat_loc3d *A3d, // modified B1 = SUPERLU_MALLOC(A2d.m_loc * nrhs * sizeof(doublecomplex)); } - // B1 <- blockByBlock(b2d) + // B1 <- BlockByBlock(B2d) if (grid3d->zscp.Iam == 0) { - for (int i = 0; i < grid3d->npdep; ++i) + for (i = 0; i < grid3d->npdep; ++i) { /* code */ matCopy(row_counts_int[i], nrhs, B1 + nrhs * row_disp[i], row_counts_int[i], @@ -231,34 +232,70 @@ int zScatter_B3d(NRformat_loc3d *A3d, // modified doublecomplex *Btmp; // on 3D grid Btmp = SUPERLU_MALLOC(A3d->m_loc * nrhs * sizeof(doublecomplex)); -#if 0 // This is a bug: the result of this scatter is a "permuted" distribution - // Btmp <- scatterv(B1) - MPI_Scatterv(B1, b_counts_int, b_disp, SuperLU_MPI_DOUBLE_COMPLEX, - Btmp, nrhs * A3d->m_loc, SuperLU_MPI_DOUBLE_COMPLEX, 0, grid3d->zscp.comm); -#else - /* For example, in 1x3x4 grid, layer 0 has procs:{0,1,2}, the process - scattering pattern is: - 0 -> {0,1,2,3}, 1 -> {4,5,6,7}, 2 -> {8,9,10,11} - This is different from the scattering pattern along Z-dimension. - */ - if (grid3d->zscp.Iam == 0) // processes on layer 0 - { - MPI_Request send_req; - for (int p = 0; p < grid3d->npdep; ++p) { // send to npdep procs - int dest = p + grid2d->iam * grid3d->npdep; - int tag = dest; - - MPI_Isend(B1 + b_disp[p], b_counts_int[p], SuperLU_MPI_DOUBLE_COMPLEX, - dest, tag, grid3d->comm, &send_req); - } - } + // Btmp <- scatterv(B1), block-by-block + if ( rankorder == 1 ) { /* XY-major in 3D grid */ + /* e.g. 
1x3x4 grid: layer0 layer1 layer2 layer3 + * 0 1 2 4 + * 5 6 7 8 + * 9 10 11 12 + */ + MPI_Scatterv(B1, b_counts_int, b_disp, SuperLU_MPI_DOUBLE_COMPLEX, + Btmp, nrhs * A3d->m_loc, SuperLU_MPI_DOUBLE_COMPLEX, + 0, grid3d->zscp.comm); + + } else { /* Z-major in 3D grid */ + /* e.g. 1x3x4 grid: layer0 layer1 layer2 layer3 + 0 3 6 9 + 1 4 7 10 + 2 5 8 11 + GATHER: {A, B} in A * X = B + layer-0: + B (row space) X (column space) SCATTER + ---- ---- ---->> + P0 0 0 +(equations 3 1 Proc 0 -> Procs {0, 1, 2, 3} + reordered 6 2 + after gather) 9 3 + ---- ---- + P1 1 4 Proc 1 -> Procs {4, 5, 6, 7} + 4 5 + 7 6 + 10 7 + ---- ---- + P2 2 8 Proc 2 -> Procs {8, 9, 10, 11} + 5 9 + 8 10 + 11 11 + ---- ---- + */ + MPI_Request recv_req; + MPI_Status recv_status; + int pxy = grid2d->nprow * grid2d->npcol; + int npdep = grid3d->npdep, dest, src, tag; + int nprocs = pxy * npdep; + + /* Everyone receives one block (post non-blocking irecv) */ + src = grid3d->iam / npdep; // Z-major + tag = iam; + MPI_Irecv(Btmp, nrhs * A3d->m_loc, SuperLU_MPI_DOUBLE_COMPLEX, + src, tag, grid3d->comm, &recv_req); + + /* Layer 0 sends to npdep procs */ + if (grid3d->zscp.Iam == 0) { + int dest, tag; + for (p = 0; p < npdep; ++p) { // send to npdep procs + dest = p + grid2d->iam * npdep; // Z-major order + tag = dest; + + MPI_Send(B1 + b_disp[p], b_counts_int[p], + SuperLU_MPI_DOUBLE_COMPLEX, dest, tag, grid3d->comm); + } + } /* end layer 0 send */ - /* Everyone receives one block */ - MPI_Status status; - int src = grid3d->iam / grid3d->npdep; // which proc the data should come from - MPI_Recv(Btmp, nrhs * A3d->m_loc, SuperLU_MPI_DOUBLE_COMPLEX, - src, grid3d->iam, grid3d->comm, &status); -#endif + /* Wait for Irecv to complete */ + MPI_Wait(&recv_req, &recv_status); + + } /* else Z-major */ // B <- colMajor(Btmp) matCopy(A3d->m_loc, nrhs, B, ldb, Btmp, A3d->m_loc); diff --git a/SRC/zutil_dist.c b/SRC/zutil_dist.c index 58b62d12..f3d699ed 100644 --- a/SRC/zutil_dist.c +++ b/SRC/zutil_dist.c @@ -421,6 +421,7 @@ void zScalePermstructFree(zScalePermstruct_t *ScalePermstruct) SUPERLU_FREE(ScalePermstruct->R); SUPERLU_FREE(ScalePermstruct->C); break; + default: break; } } @@ -489,9 +490,14 @@ zGenXtrue_dist(int_t n, int_t nrhs, doublecomplex *x, int_t ldx) int i, j; for (j = 0; j < nrhs; ++j) for (i = 0; i < n; ++i) { - if ( i % 2 ) x[i + j*ldx].r = 1.0; - else x[i + j*ldx].r = 2.0; - x[i + j*ldx].i = 0.0; + if ( i % 2 ) { + x[i + j*ldx].r = 1.0 + (double)(i+1.)/n; + x[i + j*ldx].i = 1.0; + } + else { + x[i + j*ldx].r = 2.0 + (double)(i+1.)/n; + x[i + j*ldx].i = 2.0; + } } } @@ -653,8 +659,8 @@ void zDumpLblocks(int iam, int_t nsupers, gridinfo_t *grid, Glu_persist_t *Glu_persist, zLocalLU_t *Llu) { register int c, extra, gb, j, i, lb, nsupc, nsupr, len, nb, ncb; - register int_t k, mycol, r; - int_t nnzL, n,nmax; + int k, mycol, r, n, nmax; + int_t nnzL; int_t *xsup = Glu_persist->xsup; int_t *index; doublecomplex *nzval; From 3da3e0c09904c252473bc8edc2ec4eda95f936b4 Mon Sep 17 00:00:00 2001 From: Xiaoye Li Date: Tue, 27 Apr 2021 11:42:07 -0400 Subject: [PATCH 076/147] Fixed several inconsistencies with 'int' and 'int_t', for 64bit indexing. 
--- CMakeLists.txt | 48 +++++++++++++++++++-------------- FORTRAN/CMakeLists.txt | 48 ++++++++++++++++++++++----------- FORTRAN/Makefile | 24 ++++++++--------- SRC/dsuperlu_gpu.cu | 5 ++-- SRC/dtreeFactorizationGPU.c | 6 ++--- SRC/superlu_FortranCInterface.h | 8 +++--- SRC/superlu_dist_config.h | 19 ++++++++++--- SRC/zsuperlu_gpu.cu | 5 ++-- SRC/ztreeFactorizationGPU.c | 6 ++--- make.inc.in | 32 +++++++++++++--------- 10 files changed, 122 insertions(+), 79 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index eb99b3c1..711a436c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -57,6 +57,16 @@ include(CheckLanguage) # Usual initialization stuff # ###################################################################### +set(CMAKE_CXX_STANDARD 11) +#set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +#message("!!!! top: cxx_implicit='${CMAKE_CXX_IMPLICIT_LINK_LIBRARIES}'") + +if (XSDK_ENABLE_Fortran) + enable_language (Fortran) + set(NOFORTRAN FALSE) +endif() set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) ## ???? set(CMAKE_INSTALL_NAME_DIR "${CMAKE_INSTALL_PREFIX}/lib") @@ -88,7 +98,7 @@ if (BUILD_SHARED_LIBS) set(PROJECT_NAME_LIB_EXPORT libsuperlu_dist.so) SET(CMAKE_EXE_LINKER_FLAGS - "${CMAKE_EXE_LINKER_FLAGS} -Wl,-rpath,${CMAKE_INSTALL_PREFIX}/SRC") + "${CMAKE_EXE_LINKER_FLAGS} -Wl,-rpath,${CMAKE_INSTALL_PREFIX}/SRC") if (BUILD_STATIC_LIBS) message("-- SuperLU_DIST will also be built as a static library.") @@ -99,16 +109,6 @@ else() set(PROJECT_NAME_LIB_EXPORT libsuperlu_dist.a) endif() -set(CMAKE_CXX_STANDARD 11) -#set(CMAKE_CXX_STANDARD 14) -set(CMAKE_CXX_STANDARD_REQUIRED ON) - -#message("!!!! top: cxx_implicit='${CMAKE_CXX_IMPLICIT_LINK_LIBRARIES}'") - -if (XSDK_ENABLE_Fortran) - enable_language (Fortran) - set(NOFORTRAN FALSE) -endif() set(SUPERLU_VERSION "${PROJECT_VERSION}") set(SUPERLU_REV "${PROJECT_REV}") @@ -145,6 +145,7 @@ if(XSDK_INDEX_SIZE EQUAL 64) endif() set(CMAKE_C_FLAGS_RELEASE "-O3 -g" CACHE STRING "") set(CMAKE_CXX_FLAGS_RELEASE "-O3 -g" CACHE STRING "") +set(CMAKE_Fortran_FLAGS_RELEASE "-O3 -g" CACHE STRING "") set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0") set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O0") @@ -164,10 +165,9 @@ if(MPI_C_FOUND) endif() if (XSDK_ENABLE_Fortran) if(MPI_Fortran_FOUND) - add_definitions(${MPI_Fortran_COMPILE_FLAGS}) - include_directories(${MPI_Fortran_INCLUDE_PATH}) set(CMAKE_Fortran_FLAGS "${MPI_Fortran_COMPILE_FLAGS} ${CMAKE_Fortran_FLAGS}") set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${MPI_Fortran_LINK_FLAGS}") + include_directories(${MPI_Fortran_INCLUDE_PATH}) endif() endif() @@ -185,15 +185,13 @@ if (TPL_ENABLE_CUDALIB) ## want to use cuda set(HAVE_CUDA TRUE) set(CMAKE_CUDA_FLAGS_RELEASE "-O3 --expt-relaxed-constexpr -DNDEBUG" CACHE STRING "") set(CMAKE_CUDA_FLAGS_DDEBUG "-O0 --expt-relaxed-constexpr -DDEBUG -g" CACHE STRING "") -# set(CUDA_NVCC_FLAGS_RELEASE "-O3 --expt-relaxed-constexpr -DNDEBUG ${CMAKE_CUDA_FLAGS} ${CUDA_ARCH_FLAGS}") -# set(CUDA_NVCC_FLAGS_DEBUG "-O0 --expt-relaxed-constexpr -DDEBUG -g ${CMAKE_CUDA_FLAGS} ${CUDA_ARCH_FLAGS}") endif() -# find_package(CUDAToolkit REQUIRED) - message("cuda libraries at : '${CUDA_LIBRARIES}'") # find_package(CUB REQUIRED) -# set(CUDA_LIBRARIES "${CUDAToolkit_LIBRARY_ROOT}/lib64/libcudart.so") -# set(CUDA_CUBLAS_LIBRARIES "${CUDAToolkit_LIBRARY_ROOT}/lib64/libcublas.so") + + find_package(CUDAToolkit REQUIRED) + set(CUDA_LIBRARIES "${CUDAToolkit_LIBRARY_ROOT}/lib64/libcudart.so") + set(CUDA_CUBLAS_LIBRARIES 
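Mixing 'int' and 'int_t' is harmless in the default 32-bit build but
breaks silently once XSDK_INDEX_SIZE=64 widens int_t to 64 bits: an
int_t passed through a '%d' format, an MPI_INT buffer, or a 32-bit BLAS
integer argument then reads or writes the wrong 32 bits. A minimal
illustration of the pitfall; the typedef is a simplification of what
superlu_defs.h does under _LONGINT:

    #include <stdint.h>
    #include <stdio.h>

    #if (XSDK_INDEX_SIZE == 64)
    typedef int64_t int_t;            /* 64-bit indexing build */
    #else
    typedef int int_t;                /* default build */
    #endif

    static void report(int_t nnz)
    {
        /* Wrong under 64-bit indexing: printf("%d", nnz) would consume
           only 32 bits of a 64-bit argument. Cast explicitly, or keep
           small loop counters plain 'int', as this patch does. */
        printf("nnz = %lld\n", (long long) nnz);
    }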
"${CUDAToolkit_LIBRARY_ROOT}/lib64/libcublas.so") # # The following make.inc exporting does not work # set(CUDA_LIB CUDA::cudart CUDA::cublas CUDA::cusolver) @@ -343,6 +341,16 @@ if (XSDK_ENABLE_Fortran) string (REPLACE ";" " " EXTRA_FLIB_STR "${EXTRA_FLIB}") set(EXTRA_FLIB_EXPORT ${EXTRA_FLIB_STR}) message("-- EXTRA_FLIB_EXPORT='${EXTRA_FLIB_EXPORT}'") + + if (BUILD_SHARED_LIBS) + message("-- superlu_dist_fortran will be built as a dynamic library.") + set(PROJECT_NAME_LIB_FORTRAN libsuperlu_dist_fortran.so) + SET(CMAKE_EXE_LINKER_FLAGS + "${CMAKE_EXE_LINKER_FLAGS} -Wl,-rpath,${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}") + else() + message("-- superlu_dist_fortranwill be built as a static library.") + set(PROJECT_NAME_LIB_FORTRAN libsuperlu_dist_fortran.a) + endif() endif() #--------------------- CombBLAS --------------------- @@ -451,4 +459,4 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/superlu_dist.pc.in ${CMAKE_CURRENT_BI install(FILES ${CMAKE_CURRENT_BINARY_DIR}/superlu_dist.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig) -#message("CMAKE_CXX_LINK_FLAGS '${CMAKE_CXX_LINK_FLAGS}'") +#message("MPI_Fortran_LINK_FLAGS '${MPI_Fortran_LINK_FLAGS}'") diff --git a/FORTRAN/CMakeLists.txt b/FORTRAN/CMakeLists.txt index 1f8a836d..4516170b 100644 --- a/FORTRAN/CMakeLists.txt +++ b/FORTRAN/CMakeLists.txt @@ -5,14 +5,38 @@ set(headers ${CMAKE_INSTALL_PREFIX}/FORTRAN/superlu_mod.mod ${CMAKE_INSTALL_PREFIX}/FORTRAN/superlupara_mod.mod ) + +set(sources "") # initialize an empty set +if(enable_double) + list(APPEND sources c2f_dcreate_matrix_x_b.c superlu_c2f_dwrap.c) +endif() +if(enable_complex16) + list(APPEND sources c2f_zcreate_matrix_x_b.c superlu_c2f_zwrap.c) +endif() + +add_library(superlu_dist_fortran ${sources}) +set(targets superlu_dist_fortran) + +install(TARGETS superlu_dist_fortran +# DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION "${INSTALL_BIN_DIR}" + LIBRARY DESTINATION "${INSTALL_LIB_DIR}" + ARCHIVE DESTINATION "${INSTALL_LIB_DIR}" +) +install(FILES ${headers} +# DESTINATION ${CMAKE_INSTALL_PREFIX}/include) + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} +) -# Fortran stuff +# Fortran MPI stuff add_definitions(${MPI_Fortran_COMPILE_FLAGS}) include_directories(${MPI_Fortran_INCLUDE_PATH}) link_directories(${MPI_Fortran_LIBRARIES}) -# Libs linked to all of the examples -set(all_link_libs superlu_dist ${MPI_Fortran_LIBRARIES} ${BLAS_LIB} ${CMAKE_Fortran_IMPLICIT_LINK_LIBRARIES}) +# Libs to be linked with the Fortran codes +set(fortran_link_libs superlu_dist_fortran ${MPI_Fortran_LIBRARIES} ${BLAS_LIB} ${CMAKE_Fortran_IMPLICIT_LINK_LIBRARIES}) +set(all_link_libs ${fortran_link_libs} superlu_dist) + #message("!!! in Fortran: MPI_Fortran_LINK_FLAGS='${MPI_Fortran_LINK_FLAGS}'") #message("!!! in Fortran: all_link_libs='${all_link_libs}'") #message("!!! 
in Fortran: cxx_implicit='${CMAKE_CXX_IMPLICIT_LINK_LIBRARIES}'") @@ -20,22 +44,22 @@ if (NOT MSVC) list(APPEND all_link_libs m) endif () + set(F_MOD superlupara.f90 superlu_mod.f90) if(enable_double) - set(C_DWRAP c2f_dcreate_matrix_x_b.c superlu_c2f_dwrap.c) - set(F_DEXM ${F_MOD} f_pddrive.f90 ${C_DWRAP}) + set(F_DEXM ${F_MOD} f_pddrive.f90) add_executable(f_pddrive ${F_DEXM}) target_link_libraries(f_pddrive ${all_link_libs}) # set_target_properties(f_pddrive PROPERTIES LINKER_LANGUAGE Fortran) set_target_properties(f_pddrive PROPERTIES LINKER_LANGUAGE CXX LINK_FLAGS "${MPI_Fortran_LINK_FLAGS}") - set(F_DEXM3D ${F_MOD} f_pddrive3d.f90 ${C_DWRAP}) + set(F_DEXM3D ${F_MOD} f_pddrive3d.f90) add_executable(f_pddrive3d ${F_DEXM3D}) target_link_libraries(f_pddrive3d ${all_link_libs}) set_target_properties(f_pddrive3d PROPERTIES LINKER_LANGUAGE CXX LINK_FLAGS "${MPI_Fortran_LINK_FLAGS}") - set(F_5x5 ${F_MOD} f_5x5.f90 sp_ienv.c ${C_DWRAP}) + set(F_5x5 ${F_MOD} f_5x5.f90 sp_ienv.c) add_executable(f_5x5 ${F_5x5}) target_link_libraries(f_5x5 ${all_link_libs}) set_target_properties(f_5x5 PROPERTIES LINKER_LANGUAGE CXX LINK_FLAGS "${MPI_Fortran_LINK_FLAGS}") @@ -43,22 +67,16 @@ if(enable_double) endif() if(enable_complex16) - set(C_ZWRAP c2f_zcreate_matrix_x_b.c superlu_c2f_zwrap.c) - - set(F_ZEXM ${F_MOD} f_pzdrive.f90 ${C_ZWRAP}) + set(F_ZEXM ${F_MOD} f_pzdrive.f90) add_executable(f_pzdrive ${F_ZEXM}) target_link_libraries(f_pzdrive ${all_link_libs}) # set_target_properties(f_pzdrive PROPERTIES LINKER_LANGUAGE Fortran) set_target_properties(f_pzdrive PROPERTIES LINKER_LANGUAGE CXX LINK_FLAGS "${MPI_Fortran_LINK_FLAGS}") - set(F_ZEXM3D ${F_MOD} f_pzdrive3d.f90 ${C_ZWRAP}) + set(F_ZEXM3D ${F_MOD} f_pzdrive3d.f90) add_executable(f_pzdrive3d ${F_ZEXM3D}) target_link_libraries(f_pzdrive3d ${all_link_libs}) set_target_properties(f_pzdrive3d PROPERTIES LINKER_LANGUAGE CXX LINK_FLAGS "${MPI_Fortran_LINK_FLAGS}") endif() -install(FILES ${headers} -# DESTINATION ${CMAKE_INSTALL_PREFIX}/include) - DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} -) diff --git a/FORTRAN/Makefile b/FORTRAN/Makefile index 7b9a5cb0..9a8ea6b8 100644 --- a/FORTRAN/Makefile +++ b/FORTRAN/Makefile @@ -14,8 +14,8 @@ include ../make.inc #F90FLAGS = $(FFLAGS) -qfree -qsuffix=f=f90 -qflag=w:w F_MOD = superlupara.o superlu_mod.o -C_DWRAP = c2f_dcreate_matrix_x_b.o superlu_c2f_dwrap.o #dcreate_dist_matrix.o -C_ZWRAP = c2f_zcreate_matrix_x_b.o superlu_c2f_zwrap.o +#C_DWRAP = c2f_dcreate_matrix_x_b.o superlu_c2f_dwrap.o +#C_ZWRAP = c2f_zcreate_matrix_x_b.o superlu_c2f_zwrap.o F_DEXM = $(F_MOD) f_pddrive.o F_DEXM3D= $(F_MOD) f_pddrive3d.o @@ -26,20 +26,20 @@ F_5x5 = $(F_MOD) f_5x5.o sp_ienv.o all: f_pddrive f_pddrive3d f_pzdrive f_pzdrive3d f_5x5 -f_5x5: $(F_5x5) $(C_DWRAP) $(DSUPERLULIB) - $(LOADER) $(LOADOPTS) $(F_5x5) $(C_DWRAP) $(LIBS) -o $@ +f_5x5: $(F_5x5) $(DSUPERLULIB) $(DFORTRANLIB) + $(LOADER) $(LOADOPTS) $(F_5x5) $(LIBS) -o $@ -f_pddrive: $(F_DEXM) $(C_DWRAP) $(DSUPERLULIB) - $(LOADER) $(LOADOPTS) $(F_DEXM) $(C_DWRAP) $(LIBS) -o $@ +f_pddrive: $(F_DEXM) $(DSUPERLULIB) $(DFORTRANLIB) + $(LOADER) $(LOADOPTS) $(F_DEXM) $(LIBS) -o $@ -f_pddrive3d: $(F_DEXM3D) $(C_DWRAP) $(DSUPERLULIB) - $(LOADER) $(LOADOPTS) $(F_DEXM3D) $(C_DWRAP) $(LIBS) -o $@ +f_pddrive3d: $(F_DEXM3D) $(DSUPERLULIB) $(DFORTRANLIB) + $(LOADER) $(LOADOPTS) $(F_DEXM3D) $(LIBS) -o $@ -f_pzdrive: $(F_ZEXM) $(C_ZWRAP) $(DSUPERLULIB) - $(LOADER) $(LOADOPTS) $(F_ZEXM) $(C_ZWRAP) $(LIBS) -o $@ +f_pzdrive: $(F_ZEXM) $(DSUPERLULIB) $(DFORTRANLIB) + $(LOADER) $(LOADOPTS) $(F_ZEXM) 
$(LIBS) -o $@ -f_pzdrive3d: $(F_ZEXM3D) $(C_ZWRAP) $(DSUPERLULIB) - $(LOADER) $(LOADOPTS) $(F_ZEXM3D) $(C_ZWRAP) $(LIBS) -o $@ +f_pzdrive3d: $(F_ZEXM3D) $(DSUPERLULIB) $(DFORTRANLIB) + $(LOADER) $(LOADOPTS) $(F_ZEXM3D) $(LIBS) -o $@ .c.o: $(CC) $(CFLAGS) $(CDEFS) $(BLASDEF) -I$(INCLUDEDIR) -c $< $(VERBOSE) diff --git a/SRC/dsuperlu_gpu.cu b/SRC/dsuperlu_gpu.cu index 383eeebe..6c065704 100644 --- a/SRC/dsuperlu_gpu.cu +++ b/SRC/dsuperlu_gpu.cu @@ -915,7 +915,6 @@ int dinitSluGPU3D_t( int_t ldt /* NSUP read from sp_ienv(3) */ ) { - gridinfo_t* grid = &(grid3d->grid2d); checkCudaErrors(cudaDeviceReset ()) ; Glu_persist_t *Glu_persist = LUstruct->Glu_persist; dLocalLU_t *Llu = LUstruct->Llu; @@ -1124,7 +1123,7 @@ int dreduceGPUlu( } /* dreduceGPUlu */ -int dwaitGPUscu(int_t streamId, dsluGPU_t *sluGPU, SCT_t *SCT) +int dwaitGPUscu(int streamId, dsluGPU_t *sluGPU, SCT_t *SCT) { double ttx = SuperLU_timer_(); cudaStreamSynchronize(sluGPU->funCallStreams[streamId]); @@ -1132,7 +1131,7 @@ int dwaitGPUscu(int_t streamId, dsluGPU_t *sluGPU, SCT_t *SCT) return 0; } -int_t dsendLUpanelGPU2HOST( +int dsendLUpanelGPU2HOST( int_t k0, d2Hreduce_t* d2Hred, dsluGPU_t *sluGPU diff --git a/SRC/dtreeFactorizationGPU.c b/SRC/dtreeFactorizationGPU.c index 9fab2fc4..e1983df5 100644 --- a/SRC/dtreeFactorizationGPU.c +++ b/SRC/dtreeFactorizationGPU.c @@ -72,7 +72,7 @@ int dsparseTreeFactor_ASYNC_GPU( gEtreeInfo_t *gEtreeInfo, // global etree info superlu_dist_options_t *options, int_t *gIperm_c_supno, - int_t ldt, + int ldt, dsluGPU_t *sluGPU, d2Hreduce_t *d2Hred, HyP_t *HyP, @@ -395,8 +395,8 @@ int dsparseTreeFactor_ASYNC_GPU( lPanelInfo_t *lPanelInfo = packLUInfo->lPanelInfo; int_t *lsub = lPanelInfo->lsub; int_t *usub = uPanelInfo->usub; - int_t *indirect = fNlists->indirect; - int_t *indirect2 = fNlists->indirect2; + int *indirect = fNlists->indirect; + int *indirect2 = fNlists->indirect2; /* Schur Complement Update */ diff --git a/SRC/superlu_FortranCInterface.h b/SRC/superlu_FortranCInterface.h index 467bfb65..c9fee77d 100644 --- a/SRC/superlu_FortranCInterface.h +++ b/SRC/superlu_FortranCInterface.h @@ -2,15 +2,15 @@ #define FC_HEADER_INCLUDED /* Mangling for Fortran global symbols without underscores. */ -#define FC_GLOBAL(name,NAME) name##_ +#define FC_GLOBAL(name,NAME) name /* Mangling for Fortran global symbols with underscores. */ -#define FC_GLOBAL_(name,NAME) name##_ +#define FC_GLOBAL_(name,NAME) name /* Mangling for Fortran module symbols without underscores. */ -#define FC_MODULE(mod_name,name, mod_NAME,NAME) __##mod_name##_MOD_##name +#define FC_MODULE(mod_name,name, mod_NAME,NAME) __##mod_name##_NMOD_##name /* Mangling for Fortran module symbols with underscores. 
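   (An illustrative sketch, not part of the generated header: C code can
    bind to a Fortran module procedure through these macros as, e.g.,

        #define mymod_msub FC_MODULE_(mymod,msub, MYMOD,MSUB)
        extern void mymod_msub(int *n);

    where mymod and msub are hypothetical module and subroutine names.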
*/ -#define FC_MODULE_(mod_name,name, mod_NAME,NAME) __##mod_name##_MOD_##name +#define FC_MODULE_(mod_name,name, mod_NAME,NAME) __##mod_name##_NMOD_##name #endif diff --git a/SRC/superlu_dist_config.h b/SRC/superlu_dist_config.h index ec3d9f9a..6df0c761 100644 --- a/SRC/superlu_dist_config.h +++ b/SRC/superlu_dist_config.h @@ -1,7 +1,20 @@ -/* #define XSDK_INDEX_SIZE 64 */ -/* #define SLU_HAVE_LAPACK TRUE */ +/* superlu_dist_config.h.in */ + +/* Enable CUDA */ +#define HAVE_CUDA TRUE + +/* Enable parmetis */ #define HAVE_PARMETIS TRUE -/* #define HAVE_COMBBLAS TRUE */ + +/* Enable LAPACK */ +/* #undef SLU_HAVE_LAPACK */ + +/* Enable CombBLAS */ +/* #undef HAVE_COMBBLAS */ + +/* enable 64bit index mode */ +#define XSDK_INDEX_SIZE 64 + #if (XSDK_INDEX_SIZE == 64) #define _LONGINT 1 #endif diff --git a/SRC/zsuperlu_gpu.cu b/SRC/zsuperlu_gpu.cu index fe4205ea..54c8912c 100644 --- a/SRC/zsuperlu_gpu.cu +++ b/SRC/zsuperlu_gpu.cu @@ -924,7 +924,6 @@ int zinitSluGPU3D_t( int_t ldt /* NSUP read from sp_ienv(3) */ ) { - gridinfo_t* grid = &(grid3d->grid2d); checkCudaErrors(cudaDeviceReset ()) ; Glu_persist_t *Glu_persist = LUstruct->Glu_persist; zLocalLU_t *Llu = LUstruct->Llu; @@ -1133,7 +1132,7 @@ int zreduceGPUlu( } /* zreduceGPUlu */ -int zwaitGPUscu(int_t streamId, zsluGPU_t *sluGPU, SCT_t *SCT) +int zwaitGPUscu(int streamId, zsluGPU_t *sluGPU, SCT_t *SCT) { double ttx = SuperLU_timer_(); cudaStreamSynchronize(sluGPU->funCallStreams[streamId]); @@ -1141,7 +1140,7 @@ int zwaitGPUscu(int_t streamId, zsluGPU_t *sluGPU, SCT_t *SCT) return 0; } -int_t zsendLUpanelGPU2HOST( +int zsendLUpanelGPU2HOST( int_t k0, d2Hreduce_t* d2Hred, zsluGPU_t *sluGPU diff --git a/SRC/ztreeFactorizationGPU.c b/SRC/ztreeFactorizationGPU.c index 2128f812..b081339f 100644 --- a/SRC/ztreeFactorizationGPU.c +++ b/SRC/ztreeFactorizationGPU.c @@ -71,7 +71,7 @@ int zsparseTreeFactor_ASYNC_GPU( gEtreeInfo_t *gEtreeInfo, // global etree info superlu_dist_options_t *options, int_t *gIperm_c_supno, - int_t ldt, + int ldt, zsluGPU_t *sluGPU, d2Hreduce_t *d2Hred, HyP_t *HyP, @@ -394,8 +394,8 @@ int zsparseTreeFactor_ASYNC_GPU( lPanelInfo_t *lPanelInfo = packLUInfo->lPanelInfo; int_t *lsub = lPanelInfo->lsub; int_t *usub = uPanelInfo->usub; - int_t *indirect = fNlists->indirect; - int_t *indirect2 = fNlists->indirect2; + int *indirect = fNlists->indirect; + int *indirect2 = fNlists->indirect2; /* Schur Complement Update */ diff --git a/make.inc.in b/make.inc.in index 4e55c479..31736a88 100644 --- a/make.inc.in +++ b/make.inc.in @@ -26,18 +26,23 @@ HAVE_PARMETIS = @HAVE_PARMETIS@ HAVE_COMBBLAS = @HAVE_COMBBLAS@ HAVE_CUDA = @HAVE_CUDA@ -LIBS = $(DSUPERLULIB) ${BLAS_LIB_EXPORT} -lm -LIBS += ${LAPACK_LIB_EXPORT} -LIBS += ${PARMETIS_LIB_EXPORT} -LIBS += ${COMBBLAS_LIB_EXPORT} -LIBS += ${EXTRA_LIB_EXPORT} -LIBS += ${EXTRA_FLIB_EXPORT} -CUDALIBS = ${CUDA_LIB_EXPORT} -LIBS += ${CUDA_LIB_EXPORT} +XSDK_ENABLE_Fortran = @XSDK_ENABLE_Fortran@ +ifeq ($(XSDK_ENABLE_Fortran),TRUE) + DFORTRANLIB = $(SuperLUroot)/@CMAKE_INSTALL_LIBDIR@/${PROJECT_NAME_LIB_FORTRAN} + LIBS = $(DFORTRANLIB) $(DSUPERLULIB) ${BLAS_LIB_EXPORT} -lm + LIBS += ${EXTRA_FLIB_EXPORT} +else + LIBS = $(DSUPERLULIB) ${BLAS_LIB_EXPORT} -lm +endif -#LIBS += ${EXTRA_FLIB_EXPORT} -CUDALIBS = ${CUDA_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES} #${CUDA_LIB_EXPORT} -LIBS += ${CUDALIBS} +LIBS += ${LAPACK_LIB_EXPORT} +LIBS += ${PARMETIS_LIB_EXPORT} +LIBS += ${COMBBLAS_LIB_EXPORT} +LIBS += ${EXTRA_LIB_EXPORT} +# LIBS += ${CUDA_LIB_EXPORT} + +CUDALIBS = ${CUDA_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES} +LIBS 
+= $(CUDALIBS) # # The archiver and the flag(s) to use when building archive (library) @@ -49,7 +54,7 @@ RANLIB = @CMAKE_RANLIB@ CC = @CMAKE_C_COMPILER@ CFLAGS = @CMAKE_C_FLAGS_RELEASE@ @CMAKE_C_FLAGS@ ${SHARED_C_FLAGS_EXPORT} - ##@CMAKE_SHARED_LIBRARY_C_FLAGS@ +##@CMAKE_SHARED_LIBRARY_C_FLAGS@ #CFLAGS += -D${DirDefs} # CFLAGS += @COMPILE_DEFINITIONS@ CXX = @CMAKE_CXX_COMPILER@ @@ -59,6 +64,7 @@ NVCCFLAGS = @CMAKE_CUDA_FLAGS@ NOOPTS = -O0 FORTRAN = @CMAKE_Fortran_COMPILER@ +FFLAGS = @CMAKE_Fortran_FLAGS@ @Fortrtan_INCLUDES@ LOADER = @CMAKE_CXX_COMPILER@ -LOADOPTS = @CMAKE_EXE_LINKER_FLAGS@ @CMAKE_CXX_LINK_FLAGS@ +LOADOPTS = @CMAKE_EXE_LINKER_FLAGS@ @CMAKE_CXX_LINK_FLAGS@ @CMAKE_Fortran_LINK_FLAGS@ From edbbfc3e7774277c782f4176febe71eb4d7672a4 Mon Sep 17 00:00:00 2001 From: Xiaoye Li Date: Wed, 28 Apr 2021 23:44:02 -0400 Subject: [PATCH 077/147] Cleaned up most of the compiler warnings. --- EXAMPLE/sp_ienv.c | 2 +- FORTRAN/c2f_dcreate_matrix_x_b.c | 7 ++--- FORTRAN/c2f_zcreate_matrix_x_b.c | 7 ++--- FORTRAN/sp_ienv.c | 2 +- SRC/cublas_utils.c | 11 +++----- SRC/dlustruct_gpu.h | 5 +--- SRC/dreadtriple_noheader.c | 8 +++--- SRC/dsuperlu_gpu.cu | 2 +- SRC/dutil_dist.c | 2 +- SRC/pdgssvx.c | 20 ++++++------- SRC/pdgssvx3d.c | 9 +++--- SRC/pdgssvx_ABglobal.c | 6 ++-- SRC/pdgstrf.c | 2 +- SRC/pzgssvx.c | 20 ++++++------- SRC/pzgssvx3d.c | 9 +++--- SRC/pzgssvx_ABglobal.c | 6 ++-- SRC/pzgstrf.c | 2 +- SRC/sec_structs.c | 1 + SRC/sp_colorder.c | 4 +-- SRC/sp_ienv.c | 2 +- SRC/superlu_defs.h | 11 ++++---- SRC/supernodalForest.c | 13 ++++----- SRC/supernodal_etree.c | 48 ++++++++++++++++---------------- SRC/util.c | 8 +++--- SRC/zlustruct_gpu.h | 5 +--- SRC/zsuperlu_gpu.cu | 2 +- 26 files changed, 97 insertions(+), 117 deletions(-) diff --git a/EXAMPLE/sp_ienv.c b/EXAMPLE/sp_ienv.c index 195cf8c2..c9985cf1 100644 --- a/EXAMPLE/sp_ienv.c +++ b/EXAMPLE/sp_ienv.c @@ -67,7 +67,7 @@ at the top-level directory. #include -int_t +int sp_ienv_dist(int_t ispec) { // printf(" this function called\n"); diff --git a/FORTRAN/c2f_dcreate_matrix_x_b.c b/FORTRAN/c2f_dcreate_matrix_x_b.c index e35f85c9..cd9b970f 100644 --- a/FORTRAN/c2f_dcreate_matrix_x_b.c +++ b/FORTRAN/c2f_dcreate_matrix_x_b.c @@ -39,10 +39,10 @@ at the top-level directory. * Arguments * ========= * - * FNAME (input) char* + * fname (input) char* * File name as a character string. * - * NRHS (input) int + * nrhs (input) int * Number of right-hand sides. * * nprocs (input) int* @@ -90,8 +90,7 @@ int c2f_dcreate_matrix_x_b(char *fname, int nrhs, int nprocs, double *nzval_loc; /* local */ int_t *colind, *rowptr; /* local */ int_t *marker; - int_t nnz, nnz_loc; - int m, n; + int_t nnz, nnz_loc, m, n; int m_loc, fst_row; int m_loc_fst; /* Record m_loc of the first p-1 processors, when mod(m, p) is not zero. */ diff --git a/FORTRAN/c2f_zcreate_matrix_x_b.c b/FORTRAN/c2f_zcreate_matrix_x_b.c index e23428ee..df54a458 100644 --- a/FORTRAN/c2f_zcreate_matrix_x_b.c +++ b/FORTRAN/c2f_zcreate_matrix_x_b.c @@ -38,10 +38,10 @@ at the top-level directory. * Arguments * ========= * - * FNAME (input) char* + * fname (input) char* * File name as a character string. * - * NRHS (input) int + * nrhs (input) int * Number of right-hand sides. 
* * nprocs (input) int* @@ -89,8 +89,7 @@ int c2f_zcreate_matrix_x_b(char *fname, int nrhs, int nprocs, doublecomplex *nzval_loc; /* local */ int_t *colind, *rowptr; /* local */ int_t *marker; - int_t nnz, nnz_loc; - int m, n; + int_t nnz, nnz_loc, m, n; int m_loc, fst_row; int m_loc_fst; /* Record m_loc of the first p-1 processors, when mod(m, p) is not zero. */ diff --git a/FORTRAN/sp_ienv.c b/FORTRAN/sp_ienv.c index 3366671a..99df0d25 100644 --- a/FORTRAN/sp_ienv.c +++ b/FORTRAN/sp_ienv.c @@ -67,7 +67,7 @@ at the top-level directory. #include -int_t +int sp_ienv_dist(int_t ispec) { // printf(" this function called\n"); diff --git a/SRC/cublas_utils.c b/SRC/cublas_utils.c index be1c6da2..0b40659e 100644 --- a/SRC/cublas_utils.c +++ b/SRC/cublas_utils.c @@ -12,12 +12,6 @@ at the top-level directory. #ifdef GPU_ACC // enable CUDA -#include -#include "cublas_utils.h" - - - - void DisplayHeader() { const int kb = 1024; @@ -74,11 +68,14 @@ const char* cublasGetErrorString(cublasStatus_t status) case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR"; case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED"; case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR"; + case CUBLAS_STATUS_LICENSE_ERROR: return "CUBLAS_STATUS_LICENSE_ERROR"; + case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED"; } return "unknown error"; } -inline +/*error reporting functions */ +//inline cudaError_t checkCuda(cudaError_t result) { #if defined(DEBUG) || defined(_DEBUG) diff --git a/SRC/dlustruct_gpu.h b/SRC/dlustruct_gpu.h index bfe3e527..bc85565a 100644 --- a/SRC/dlustruct_gpu.h +++ b/SRC/dlustruct_gpu.h @@ -33,7 +33,7 @@ #define MAX_NCUDA_STREAMS 32 static -void check(cudaError_t result, char const *const func, const char *const file, int_t const line) +void check(cudaError_t result, char const *const func, const char *const file, int const line) { if (result) { @@ -233,9 +233,6 @@ extern int dfree_LUstruct_gpu (dLUstruct_gpu_t *A_gpu); //int freeSluGPU(dsluGPU_t *sluGPU); -cublasStatus_t checkCublas(cublasStatus_t result); -// cudaError_t checkCuda(cudaError_t result); - extern void dPrint_matrix( char *desc, int_t m, int_t n, double *dA, int_t lda ); /*to print out various statistics*/ diff --git a/SRC/dreadtriple_noheader.c b/SRC/dreadtriple_noheader.c index 09004af4..b5982c22 100644 --- a/SRC/dreadtriple_noheader.c +++ b/SRC/dreadtriple_noheader.c @@ -49,9 +49,9 @@ dreadtriple_noheader(FILE *fp, int_t *m, int_t *n, int_t *nonz, nz = *n = 0; #ifdef _LONGINT - ret_val = fscanf(fp, "%ld%ld%lf%\n", &i, &j, &vali); + ret_val = fscanf(fp, "%ld%ld%lf\n", &i, &j, &vali); #else // int - ret_val = fscanf(fp, "%d%d%lf%\n", &i, &j, &vali); + ret_val = fscanf(fp, "%d%d%lf\n", &i, &j, &vali); #endif while (ret_val != EOF) { @@ -62,9 +62,9 @@ dreadtriple_noheader(FILE *fp, int_t *m, int_t *n, int_t *nonz, ++nz; #ifdef _LONGINT - ret_val = fscanf(fp, "%ld%ld%lf%\n", &i, &j, &vali); + ret_val = fscanf(fp, "%ld%ld%lf\n", &i, &j, &vali); #else // int - ret_val = fscanf(fp, "%d%d%lf%\n", &i, &j, &vali); + ret_val = fscanf(fp, "%d%d%lf\n", &i, &j, &vali); #endif } diff --git a/SRC/dsuperlu_gpu.cu b/SRC/dsuperlu_gpu.cu index 6c065704..e28030b9 100644 --- a/SRC/dsuperlu_gpu.cu +++ b/SRC/dsuperlu_gpu.cu @@ -33,7 +33,7 @@ //} /*error reporting functions */ -static +//static cudaError_t checkCuda(cudaError_t result) { #if defined(DEBUG) || defined(_DEBUG) diff --git a/SRC/dutil_dist.c b/SRC/dutil_dist.c index 8291b470..a8653599 100644 --- a/SRC/dutil_dist.c +++ 
b/SRC/dutil_dist.c @@ -710,7 +710,7 @@ void dDumpLblocks(int iam, int_t nsupers, gridinfo_t *grid, } if(grid->iam==0){ - fprintf(fp, "%d %d %d\n", n,n,nnzL); + fprintf(fp, "%d %d" IFMT "\n", n,n,nnzL); } ncb = nsupers / grid->npcol; diff --git a/SRC/pdgssvx.c b/SRC/pdgssvx.c index 94bf18ff..d8eb7b4d 100644 --- a/SRC/pdgssvx.c +++ b/SRC/pdgssvx.c @@ -490,7 +490,7 @@ at the top-level directory. * * info (output) int* * = 0: successful exit - * < 0: if info = -i, the i-th argument had an illegal value + * < 0: if info = -i, the i-th argument had an illegal value * > 0: if info = i, and i is * <= A->ncol: U(i,i) is exactly zero. The factorization has * been completed, but the factor U is exactly singular, @@ -1060,7 +1060,7 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A, the nonzero data structures for L & U. */ #if ( PRNTlevel>=1 ) if ( !iam ) { - printf(".. symbfact(): relax " IFMT ", maxsuper " IFMT ", fill " IFMT "\n", + printf(".. symbfact(): relax %d, maxsuper %d, fill %d\n", sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6)); fflush(stdout); } @@ -1082,10 +1082,10 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A, printf("\tNo of supers " IFMT "\n", Glu_persist->supno[n-1]+1); printf("\tSize of G(L) " IFMT "\n", Glu_freeable->xlsub[n]); printf("\tSize of G(U) " IFMT "\n", Glu_freeable->xusub[n]); - printf("\tint %d, short %d, float %d, double %d\n", - (int) sizeof(int_t), (int) sizeof(short), - (int) sizeof(float), (int) sizeof(double)); - printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions " IFMT "\n", + printf("\tint %lu, short %lu, float %lu, double %lu\n", + sizeof(int_t), sizeof(short), + sizeof(float), sizeof(double)); + printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", symb_mem_usage.for_lu*1e-6, symb_mem_usage.total*1e-6, symb_mem_usage.expansions); @@ -1225,11 +1225,6 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A, MPI_Comm_rank( MPI_COMM_WORLD, &iam_g ); - if (!iam_g) { - print_options_dist(options); - fflush(stdout); - } - printf(".. Ainfo mygid %5d mysid %5d nnz_loc " IFMT " sum_loc %e lsum_loc %e nnz " IFMT " nnzLU %ld sum %e lsum %e N " IFMT "\n", iam_g,iam,Astore->rowptr[Astore->m_loc],asum, lsum, nnz_tot,nnzLU,asum_tot,lsum_tot,A->ncol); fflush(stdout); #endif @@ -1320,7 +1315,8 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A, avg * 1e-6, avg / grid->nprow / grid->npcol * 1e-6, max * 1e-6); - printf("**************************************************\n"); + printf("**************************************************\n\n"); + printf("** number of Tiny Pivots: %8d\n\n", stat->TinyPivots); fflush(stdout); } } /* end printing stats */ diff --git a/SRC/pdgssvx3d.c b/SRC/pdgssvx3d.c index cc064ff7..04b49faf 100644 --- a/SRC/pdgssvx3d.c +++ b/SRC/pdgssvx3d.c @@ -1079,14 +1079,13 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, } } - /* Perform a symbolic factorization on Pc*Pr*A*Pc' and set up the nonzero data structures for L & U. */ #if ( PRNTlevel>=1 ) if (!iam) printf (".. 
symbfact(): relax %4d, maxsuper %4d, fill %4d\n", - sp_ienv_dist (2), sp_ienv_dist (3), sp_ienv_dist (6)); + sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6)); #endif t = SuperLU_timer_ (); if (!(Glu_freeable = (Glu_freeable_t *) @@ -1108,9 +1107,9 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, (long) Glu_persist->supno[n - 1] + 1); printf ("\tSize of G(L) %ld\n", (long) Glu_freeable->xlsub[n]); printf ("\tSize of G(U) %ld\n", (long) Glu_freeable->xusub[n]); - printf ("\tint %d, short %d, float %d, double %d\n", - sizeof (int_t), sizeof (short), - sizeof (float), sizeof (double)); + printf ("\tint %lu, short %lu, float %lu, double %lu\n", + sizeof(int_t), sizeof (short), + sizeof(float), sizeof (double)); printf ("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", symb_mem_usage.for_lu * 1e-6, diff --git a/SRC/pdgssvx_ABglobal.c b/SRC/pdgssvx_ABglobal.c index 8cdc7c66..14cf4f2d 100644 --- a/SRC/pdgssvx_ABglobal.c +++ b/SRC/pdgssvx_ABglobal.c @@ -877,7 +877,7 @@ pdgssvx_ABglobal(superlu_dist_options_t *options, SuperMatrix *A, if ( Fact != SamePattern_SameRowPerm ) { #if ( PRNTlevel>=1 ) if ( !iam ) - printf(".. symbfact(): relax " IFMT ", maxsuper " IFMT ", fill " IFMT "\n", + printf(".. symbfact(): relax %d, maxsuper %d, fill %d\n", sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6)); #endif t = SuperLU_timer_(); @@ -900,7 +900,7 @@ pdgssvx_ABglobal(superlu_dist_options_t *options, SuperMatrix *A, printf("\tint %d, short %d, float %d, double %d\n", (int) sizeof(int_t), (int) sizeof(short), (int) sizeof(float), (int) sizeof(double)); - printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions " IFMT "\n", + printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", symb_mem_usage.for_lu*1e-6, symb_mem_usage.total*1e-6, symb_mem_usage.expansions); @@ -1099,7 +1099,7 @@ pdgssvx_ABglobal(superlu_dist_options_t *options, SuperMatrix *A, case COL: SUPERLU_FREE(R); break; - default: break; + default: break; } } if ( !factored || (factored && options->IterRefine) ) diff --git a/SRC/pdgstrf.c b/SRC/pdgstrf.c index dae6bc22..d4ae6bb4 100644 --- a/SRC/pdgstrf.c +++ b/SRC/pdgstrf.c @@ -815,7 +815,7 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm, #if ( PRNTlevel>=1 ) if(!iam) { printf("\t.. MAX_BUFFER_SIZE " IFMT " set for GPU\n", get_max_buffer_size()); - printf("\t.. N_GEMM: " IFMT " flops of GEMM done on CPU (1st block always on CPU)\n", sp_ienv_dist(7)); + printf("\t.. N_GEMM: %d flops of GEMM done on CPU (1st block always on CPU)\n", sp_ienv_dist(7)); printf("\t.. GEMM buffer size: max_row_size X max_ncols = %d x " IFMT "\n", max_row_size, max_ncols); } diff --git a/SRC/pzgssvx.c b/SRC/pzgssvx.c index 5161c4b8..e50ef7d3 100644 --- a/SRC/pzgssvx.c +++ b/SRC/pzgssvx.c @@ -489,7 +489,7 @@ at the top-level directory. * * info (output) int* * = 0: successful exit - * < 0: if info = -i, the i-th argument had an illegal value + * < 0: if info = -i, the i-th argument had an illegal value * > 0: if info = i, and i is * <= A->ncol: U(i,i) is exactly zero. The factorization has * been completed, but the factor U is exactly singular, @@ -1061,7 +1061,7 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A, the nonzero data structures for L & U. */ #if ( PRNTlevel>=1 ) if ( !iam ) { - printf(".. symbfact(): relax " IFMT ", maxsuper " IFMT ", fill " IFMT "\n", + printf(".. 
symbfact(): relax %d, maxsuper %d, fill %d\n", sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6)); fflush(stdout); } @@ -1083,10 +1083,10 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A, printf("\tNo of supers " IFMT "\n", Glu_persist->supno[n-1]+1); printf("\tSize of G(L) " IFMT "\n", Glu_freeable->xlsub[n]); printf("\tSize of G(U) " IFMT "\n", Glu_freeable->xusub[n]); - printf("\tint %d, short %d, float %d, double %d\n", - (int) sizeof(int_t), (int) sizeof(short), - (int) sizeof(float), (int) sizeof(double)); - printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions " IFMT "\n", + printf("\tint %lu, short %lu, float %lu, double %lu\n", + sizeof(int_t), sizeof(short), + sizeof(float), sizeof(double)); + printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", symb_mem_usage.for_lu*1e-6, symb_mem_usage.total*1e-6, symb_mem_usage.expansions); @@ -1230,11 +1230,6 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A, MPI_Comm_rank( MPI_COMM_WORLD, &iam_g ); - if (!iam_g) { - print_options_dist(options); - fflush(stdout); - } - printf(".. Ainfo mygid %5d mysid %5d nnz_loc " IFMT " sum_loc %e lsum_loc %e nnz "IFMT " nnzLU %ld sum %e lsum %e N "IFMT "\n", iam_g,iam,Astore->rowptr[Astore->m_loc],asum.r+asum.i, lsum.r+lsum.i, nnz_tot,nnzLU,asum_tot.r+asum_tot.i,lsum_tot.r+lsum_tot.i,A->ncol); fflush(stdout); #endif @@ -1325,7 +1320,8 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A, avg * 1e-6, avg / grid->nprow / grid->npcol * 1e-6, max * 1e-6); - printf("**************************************************\n"); + printf("**************************************************\n\n"); + printf("** number of Tiny Pivots: %8d\n\n", stat->TinyPivots); fflush(stdout); } } /* end printing stats */ diff --git a/SRC/pzgssvx3d.c b/SRC/pzgssvx3d.c index 86873c9c..b4d4b2ab 100644 --- a/SRC/pzgssvx3d.c +++ b/SRC/pzgssvx3d.c @@ -1080,14 +1080,13 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, } } - /* Perform a symbolic factorization on Pc*Pr*A*Pc' and set up the nonzero data structures for L & U. */ #if ( PRNTlevel>=1 ) if (!iam) printf (".. symbfact(): relax %4d, maxsuper %4d, fill %4d\n", - sp_ienv_dist (2), sp_ienv_dist (3), sp_ienv_dist (6)); + sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6)); #endif t = SuperLU_timer_ (); if (!(Glu_freeable = (Glu_freeable_t *) @@ -1109,9 +1108,9 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, (long) Glu_persist->supno[n - 1] + 1); printf ("\tSize of G(L) %ld\n", (long) Glu_freeable->xlsub[n]); printf ("\tSize of G(U) %ld\n", (long) Glu_freeable->xusub[n]); - printf ("\tint %d, short %d, float %d, double %d\n", - sizeof (int_t), sizeof (short), - sizeof (float), sizeof (double)); + printf ("\tint %lu, short %lu, float %lu, double %lu\n", + sizeof(int_t), sizeof (short), + sizeof(float), sizeof (double)); printf ("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", symb_mem_usage.for_lu * 1e-6, diff --git a/SRC/pzgssvx_ABglobal.c b/SRC/pzgssvx_ABglobal.c index 6acf9758..29396604 100644 --- a/SRC/pzgssvx_ABglobal.c +++ b/SRC/pzgssvx_ABglobal.c @@ -876,7 +876,7 @@ pzgssvx_ABglobal(superlu_dist_options_t *options, SuperMatrix *A, if ( Fact != SamePattern_SameRowPerm ) { #if ( PRNTlevel>=1 ) if ( !iam ) - printf(".. symbfact(): relax " IFMT ", maxsuper " IFMT ", fill " IFMT "\n", + printf(".. 
symbfact(): relax %d, maxsuper %d, fill %d\n", sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6)); #endif t = SuperLU_timer_(); @@ -899,7 +899,7 @@ pzgssvx_ABglobal(superlu_dist_options_t *options, SuperMatrix *A, printf("\tint %d, short %d, float %d, double %d\n", (int) sizeof(int_t), (int) sizeof(short), (int) sizeof(float), (int) sizeof(double)); - printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions " IFMT "\n", + printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", symb_mem_usage.for_lu*1e-6, symb_mem_usage.total*1e-6, symb_mem_usage.expansions); @@ -1098,7 +1098,7 @@ pzgssvx_ABglobal(superlu_dist_options_t *options, SuperMatrix *A, case COL: SUPERLU_FREE(R); break; - default: break; + default: break; } } if ( !factored || (factored && options->IterRefine) ) diff --git a/SRC/pzgstrf.c b/SRC/pzgstrf.c index 00e58a03..d6650011 100644 --- a/SRC/pzgstrf.c +++ b/SRC/pzgstrf.c @@ -815,7 +815,7 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm, #if ( PRNTlevel>=1 ) if(!iam) { printf("\t.. MAX_BUFFER_SIZE " IFMT " set for GPU\n", get_max_buffer_size()); - printf("\t.. N_GEMM: " IFMT " flops of GEMM done on CPU (1st block always on CPU)\n", sp_ienv_dist(7)); + printf("\t.. N_GEMM: %d flops of GEMM done on CPU (1st block always on CPU)\n", sp_ienv_dist(7)); printf("\t.. GEMM buffer size: max_row_size X max_ncols = %d x " IFMT "\n", max_row_size, max_ncols); } diff --git a/SRC/sec_structs.c b/SRC/sec_structs.c index 3f874a78..05216f35 100644 --- a/SRC/sec_structs.c +++ b/SRC/sec_structs.c @@ -13,6 +13,7 @@ #endif #include /*for printfs*/ +#include /*for getline*/ double CPU_CLOCK_RATE; /*for sorting structures */ diff --git a/SRC/sp_colorder.c b/SRC/sp_colorder.c index 94db174a..ca97bc36 100644 --- a/SRC/sp_colorder.c +++ b/SRC/sp_colorder.c @@ -223,13 +223,13 @@ sp_colorder(superlu_dist_options_t *options, SuperMatrix *A, int_t *perm_c, int check_perm_dist(char *what, int_t n, int_t *perm) { - register int_t i; + register int i; int_t *marker; marker = (int_t *) intCalloc_dist(n); for (i = 0; i < n; ++i) { if ( perm[i] >= n || marker[perm[i]] == 1 ) { - printf("%s: Not a valid PERM[" IFMT "] = " IFMT "\n", + printf("%s: Not a valid PERM[%d] = " IFMT "\n", what, i, perm[i]); ABORT("check_perm_dist"); } else { diff --git a/SRC/sp_ienv.c b/SRC/sp_ienv.c index 1a317f65..872fef99 100644 --- a/SRC/sp_ienv.c +++ b/SRC/sp_ienv.c @@ -65,7 +65,7 @@ at the top-level directory. #include #include -int_t +int sp_ienv_dist(int_t ispec) { // printf(" this function called\n"); diff --git a/SRC/superlu_defs.h b/SRC/superlu_defs.h index b228a68e..d52b1bf0 100644 --- a/SRC/superlu_defs.h +++ b/SRC/superlu_defs.h @@ -86,7 +86,7 @@ at the top-level directory. 
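/* An illustrative aside, not part of the original header: under _LONGINT,
   int_t below is int64_t, which LP64 platforms define as 'long', so "%ld"
   is the exactly matching printf format; "%lld" pairs int64_t with
   'long long' and draws -Wformat warnings from gcc there.  E.g.

       int_t n = 42;
       printf("n = " IFMT "\n", n);   // IFMT expands to "%ld"
*/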
#elif defined (_LONGINT) typedef int64_t int_t; #define mpi_int_t MPI_LONG_LONG_INT - #define IFMT "%lld" + #define IFMT "%ld" #else /* Default */ typedef int int_t; #define mpi_int_t MPI_INT @@ -110,8 +110,9 @@ static __inline__ unsigned long long _rdtsc(void) } #endif -#ifdef HAVE_CUDA -#define GPU_ACC +#ifdef HAVE_CUDA +#define GPU_ACC // enable CUDA +#include "cublas_utils.h" #endif /* MPI C complex datatype */ @@ -708,7 +709,7 @@ typedef struct { typedef struct { float for_lu; float total; - int_t expansions; + int expansions; int64_t nnzL, nnzU; } superlu_dist_mem_usage_t; @@ -1009,7 +1010,7 @@ extern int_t estimate_bigu_size (int_t, int_t **, Glu_persist_t *, /* Auxiliary routines */ extern double SuperLU_timer_ (); extern void superlu_abort_and_exit_dist(char *); -extern int_t sp_ienv_dist (int_t); +extern int sp_ienv_dist (int_t); extern void ifill_dist (int_t *, int_t, int_t); extern void super_stats_dist (int_t, int_t *); extern void get_diag_procs(int_t, Glu_persist_t *, gridinfo_t *, int_t *, diff --git a/SRC/supernodalForest.c b/SRC/supernodalForest.c index e033d9b5..95f35649 100644 --- a/SRC/supernodalForest.c +++ b/SRC/supernodalForest.c @@ -340,7 +340,7 @@ double pearsonCoeff(int_t numForests, double* frCost, double* frWeight) return covarCostWeight / sqrt(stdCost * stdWeight); } -void printGantt(int_t root, int_t numForests, char* nodename, double scale, double* gFrstCostAcc, double* crPathCost); +void printGantt(int root, int numForests, char* nodename, double scale, double* gFrstCostAcc, double* crPathCost); void printForestWeightCost(sForest_t** sForests, SCT_t* SCT, gridinfo3d_t* grid3d) { @@ -434,7 +434,7 @@ void printForestWeightCost(sForest_t** sForests, SCT_t* SCT, gridinfo3d_t* grid } -void printGantt(int_t root, int_t numForests, char* nodename, double scale, double* gFrstCostAcc, double* crPathCost) +void printGantt(int root, int numForests, char* nodename, double scale, double* gFrstCostAcc, double* crPathCost) { @@ -445,12 +445,11 @@ void printGantt(int_t root, int_t numForests, char* nodename, double scale, doub } else { - printGantt(2*root+1, numForests, nodename, scale, gFrstCostAcc, crPathCost); - int_t depTree = crPathCost[2*root+1]> crPathCost[2*root+2]? 2*root+1:2*root+2; - printf("\t tree-%d %.2g \t:%s-%d, after %s-%d, %.0fd \n", root,100*scale*crPathCost[root], nodename, root, nodename, depTree, 100*scale*gFrstCostAcc[root] ); - printGantt(2*root+2, numForests, nodename, scale, gFrstCostAcc, crPathCost); + printGantt(2*root+1, numForests, nodename, scale, gFrstCostAcc, crPathCost); + int depTree =crPathCost[2*root+1]> crPathCost[2*root+2]? 
2*root+1:2*root+2; + printf("\t tree-%d %.2g \t:%s-%d, after %s-%d, %.0fd \n", root,100*scale*crPathCost[root], nodename, root, nodename, depTree, 100*scale*gFrstCostAcc[root] ); + printGantt(2*root+2, numForests, nodename, scale, gFrstCostAcc, crPathCost); } - } #define ABS(a) ((a)<0?-(a):a) diff --git a/SRC/supernodal_etree.c b/SRC/supernodal_etree.c index a282187e..90171eb8 100644 --- a/SRC/supernodal_etree.c +++ b/SRC/supernodal_etree.c @@ -237,8 +237,8 @@ int_t printFileList(char* sname, int_t nnodes, int_t*dlist, int_t*setree) fprintf(fp, "digraph elimination_tree {\n"); for (int i = 0; i < nnodes; ++i) { - /* code */ - fprintf(fp, "%lld -> %lld;\n", dlist[i], setree[dlist[i]]); + /* code */ + fprintf(fp, IFMT " -> " IFMT ";\n", dlist[i], setree[dlist[i]]); } /*end of the file */ fprintf(fp, "}\n"); @@ -360,7 +360,7 @@ int_t* getEtreeLB(int_t nnodes, int_t* perm_l, int_t* gTopOrder) printf("numLB=%d curPtr=%d \n", numLB, curPtr); for (int i = 0; i < numLB; ++i) { - printf("%d ", lEtreeLB[i]); + printf(IFMT, lEtreeLB[i]); } return lEtreeLB; @@ -429,14 +429,14 @@ int_t testListPerm(int_t nodeCount, int_t* nodeList, int_t* permList, int_t* gTo // checking monotonicity for (int i = 0; i < nodeCount - 1; ++i) { - if (!( gTopLevel[permList[i]] <= gTopLevel[permList[i + 1]])) - { - /* code */ - printf("%d : %d (%d) %d (%d)\n", i, - permList[i], gTopLevel[permList[i]], - permList[i + 1], gTopLevel[permList[i + 1]] ); - } - assert( gTopLevel[permList[i]] <= gTopLevel[permList[i + 1]]); + if (!( gTopLevel[permList[i]] <= gTopLevel[permList[i + 1]])) + { + /* code */ + printf("%d :" IFMT "(" IFMT ")" IFMT "(" IFMT ")\n", i, + permList[i], gTopLevel[permList[i]], + permList[i + 1], gTopLevel[permList[i + 1]] ); + } + assert( gTopLevel[permList[i]] <= gTopLevel[permList[i + 1]]); } #if 0 int_t* slist = (int_t* ) malloc(sizeof(int_t) * nodeCount); @@ -485,17 +485,17 @@ int_t* merg_perms(int_t nperms, int_t* nnodes, int_t** perms) int_t* gperm = intMalloc_dist(nn); // Sherry fix //now concatenat arrays - int_t ptr = 0; + int ptr = 0; for (int tr = 0; tr < nperms; ++tr) { + /* code */ + for (int nd = 0; nd < nnodes[tr]; ++nd) + { /* code */ - for (int nd = 0; nd < nnodes[tr]; ++nd) - { - /* code */ - gperm[ptr] = perms[tr][nd]; - printf("%d %d %d %d\n", tr, ptr, nd, perms[tr][nd] ); - ptr++; - } + gperm[ptr] = perms[tr][nd]; + printf("%d %d %d" IFMT "\n", tr, ptr, nd, perms[tr][nd] ); + ptr++; + } } mergPermTest( nperms, gperm, nnodes); return gperm; @@ -522,7 +522,7 @@ int_t mergPermTest(int_t nperms, int_t* gperms, int_t* nnodes) for (int i = 0; i < nn; ++i) { /* code */ - printf("%d %d \n", i, gperms[i] ); + printf("%d" IFMT "\n", i, gperms[i] ); tperm[gperms[i]]++; } for (int i = 0; i < nn; ++i) @@ -977,7 +977,7 @@ void Print_EtreeLevelBoundry(int_t *Etree_LvlBdry, int_t max_level, int_t nsuper void print_etree_leveled(int_t *setree, int_t* tsort_etree, int_t nsuper) { FILE* fp = fopen("output_sorted.dot", "w"); - int_t max_level = tsort_etree[nsuper - 1]; + int max_level = tsort_etree[nsuper - 1]; /*beginning of the file */ fprintf(fp, "//dot file generated by pdgstrf\n"); fprintf(fp, "digraph elimination_tree {\n"); @@ -988,7 +988,7 @@ void print_etree_leveled(int_t *setree, int_t* tsort_etree, int_t nsuper) { /* code */ // fprintf(fp, "%lld -> %lld;\n",iperm[i],iperm[setree[i]]); - fprintf(fp, "%lld -> %lld;\n", i, setree[i]); + fprintf(fp, "%d -> " IFMT ";\n", i, setree[i]); } /*adding rank information*/ for (int i = 0; i < max_level; ++i) @@ -997,7 +997,7 @@ void print_etree_leveled(int_t 
*setree, int_t* tsort_etree, int_t nsuper) for (int j = 0; j < nsuper; ++j) { if (tsort_etree[j] == i) - fprintf(fp, "%lld ", j); + fprintf(fp, "%d ", j); } fprintf(fp, "}\n"); } @@ -1042,7 +1042,7 @@ void print_etree(int_t *setree, int_t* iperm, int_t nsuper) for (int i = 0; i < nsuper; ++i) { /* code */ - fprintf(fp, "%lld -> %lld;\n", iperm[i], iperm[setree[i]]); + fprintf(fp, IFMT " -> " IFMT ";\n", iperm[i], iperm[setree[i]]); } /*end of the file */ fprintf(fp, "}\n"); diff --git a/SRC/util.c b/SRC/util.c index 42c41725..5d220b83 100644 --- a/SRC/util.c +++ b/SRC/util.c @@ -256,10 +256,10 @@ void print_sp_ienv_dist(superlu_dist_options_t *options) printf("**************************************************\n"); printf(".. blocking parameters from sp_ienv():\n"); - printf("** relaxation : " IFMT "\n", sp_ienv_dist(2)); - printf("** max supernode : " IFMT "\n", sp_ienv_dist(3)); - printf("** estimated fill ratio : " IFMT "\n", sp_ienv_dist(6)); - printf("** min GEMM dimension for GPU : " IFMT "\n", sp_ienv_dist(7)); + printf("** relaxation : %d\n", sp_ienv_dist(2)); + printf("** max supernode : %d\n", sp_ienv_dist(3)); + printf("** estimated fill ratio : %d\n", sp_ienv_dist(6)); + printf("** min GEMM dimension for GPU : %d\n", sp_ienv_dist(7)); printf("**************************************************\n"); } diff --git a/SRC/zlustruct_gpu.h b/SRC/zlustruct_gpu.h index c4d81fc7..0f7a329f 100644 --- a/SRC/zlustruct_gpu.h +++ b/SRC/zlustruct_gpu.h @@ -32,7 +32,7 @@ #define MAX_NCUDA_STREAMS 32 static -void check(cudaError_t result, char const *const func, const char *const file, int_t const line) +void check(cudaError_t result, char const *const func, const char *const file, int const line) { if (result) { @@ -232,9 +232,6 @@ extern int zfree_LUstruct_gpu (zLUstruct_gpu_t *A_gpu); //int freeSluGPU(zsluGPU_t *sluGPU); -cublasStatus_t checkCublas(cublasStatus_t result); -// cudaError_t checkCuda(cudaError_t result); - extern void zPrint_matrix( char *desc, int_t m, int_t n, doublecomplex *dA, int_t lda ); /*to print out various statistics*/ diff --git a/SRC/zsuperlu_gpu.cu b/SRC/zsuperlu_gpu.cu index 54c8912c..aecf1ec5 100644 --- a/SRC/zsuperlu_gpu.cu +++ b/SRC/zsuperlu_gpu.cu @@ -33,7 +33,7 @@ //} /*error reporting functions */ -static +//static cudaError_t checkCuda(cudaError_t result) { #if defined(DEBUG) || defined(_DEBUG) From 1ea763f6149bcb78311ae55f751db117bfd4d77f Mon Sep 17 00:00:00 2001 From: Xiaoye Li Date: Sun, 2 May 2021 16:04:49 -0400 Subject: [PATCH 078/147] =?UTF-8?q?1)=20opened=20up=20the=20branch=20of=20?= =?UTF-8?q?simple=20calculation=20of=20"fst=5Frow"=20on=20layer=200,=20whe?= =?UTF-8?q?n=20grid=20is=20XY-major=202)=20Fortran=20wrapper:=20=20=20=20*?= =?UTF-8?q?*=20complex=20can=20compile=20together=20with=20real;=20=20=20?= =?UTF-8?q?=20**=20tested=2064-bit=20indexing,=20and=20it=20works=20now?= =?UTF-8?q?=C3=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- EXAMPLE/pddrive4.c | 2 +- EXAMPLE/pddrive4_ABglobal.c | 4 +- EXAMPLE/pzdrive.c | 4 - EXAMPLE/pzdrive4.c | 2 +- EXAMPLE/pzdrive4_ABglobal.c | 4 +- FORTRAN/CMakeLists.txt | 3 +- FORTRAN/f_5x5.f90 | 23 ++- FORTRAN/f_pddrive.f90 | 14 +- FORTRAN/f_pddrive3d.f90 | 14 +- FORTRAN/f_pzdrive.f90 | 14 +- FORTRAN/f_pzdrive3d.f90 | 14 +- FORTRAN/superlu_c2f_dwrap.c | 289 ++++---------------------- FORTRAN/superlu_c2f_zwrap.c | 289 ++++---------------------- FORTRAN/superlupara.f90 | 2 +- SRC/CMakeLists.txt | 21 +- SRC/Cnames.h | 378 ---------------------------------- SRC/Makefile 
| 13 +- SRC/dlustruct_gpu.h | 15 +- SRC/dnrformat_loc3d.c | 47 +++-- SRC/dtreeFactorizationGPU.c | 4 + SRC/dutil_dist.c | 2 +- SRC/memory.c | 7 +- SRC/pdgsequ.c | 1 - SRC/pdgstrf2.c | 396 +++++++++++++++++------------------- SRC/pzgstrf2.c | 47 ++--- SRC/superlu_FCnames.h | 44 ++-- SRC/superlu_defs.h | 12 +- SRC/superlu_dist_config.h | 2 +- SRC/superlu_grid.c | 14 +- SRC/superlu_grid3d.c | 16 +- SRC/zlustruct_gpu.h | 15 +- SRC/znrformat_loc3d.c | 47 +++-- SRC/zsuperlu_gpu.cu | 15 -- SRC/ztreeFactorizationGPU.c | 4 + SRC/zutil_dist.c | 2 +- 35 files changed, 481 insertions(+), 1299 deletions(-) delete mode 100644 SRC/Cnames.h diff --git a/EXAMPLE/pddrive4.c b/EXAMPLE/pddrive4.c index b7c35ba0..ca984a81 100644 --- a/EXAMPLE/pddrive4.c +++ b/EXAMPLE/pddrive4.c @@ -61,7 +61,7 @@ int main(int argc, char *argv[]) int_t *asub, *xa; int_t i, j, m, n; int nprow, npcol, ldumap, p; - int_t usermap[6]; + int usermap[6]; int iam, info, ldb, ldx, nprocs; int nrhs = 1; /* Number of right-hand side. */ int ii, omp_mpi_level; diff --git a/EXAMPLE/pddrive4_ABglobal.c b/EXAMPLE/pddrive4_ABglobal.c index f870de85..9ff46dd8 100644 --- a/EXAMPLE/pddrive4_ABglobal.c +++ b/EXAMPLE/pddrive4_ABglobal.c @@ -60,7 +60,7 @@ int main(int argc, char *argv[]) int_t *asub, *xa; int_t i, j, m, n, nnz; int_t nprow, npcol, ldumap, p; - int_t usermap[6]; + int usermap[6]; int iam, info, ldb, ldx, nprocs; int nrhs = 1; /* Number of right-hand side. */ char trans[1]; @@ -126,7 +126,7 @@ int main(int argc, char *argv[]) /* Bail out if I do not belong in any of the 2 grids. */ MPI_Comm_rank( MPI_COMM_WORLD, &iam ); - if ( iam >= 10 ) goto out; + if ( iam == -1 ) goto out; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter main()"); diff --git a/EXAMPLE/pzdrive.c b/EXAMPLE/pzdrive.c index 16c3f14a..3878558d 100644 --- a/EXAMPLE/pzdrive.c +++ b/EXAMPLE/pzdrive.c @@ -173,10 +173,6 @@ int main(int argc, char *argv[]) GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. ------------------------------------------------------------*/ zcreate_matrix_postfix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, postfix, &grid); - if (iam==1) { - printf("(%d) 9341 xtrue[0] %e\n", xtrue[0]); - fflush(stdout); - } if ( !(berr = doubleMalloc_dist(nrhs)) ) ABORT("Malloc fails for berr[]."); diff --git a/EXAMPLE/pzdrive4.c b/EXAMPLE/pzdrive4.c index 53e8c06c..7453d2ac 100644 --- a/EXAMPLE/pzdrive4.c +++ b/EXAMPLE/pzdrive4.c @@ -60,7 +60,7 @@ int main(int argc, char *argv[]) int_t *asub, *xa; int_t i, j, m, n; int nprow, npcol, ldumap, p; - int_t usermap[6]; + int usermap[6]; int iam, info, ldb, ldx, nprocs; int nrhs = 1; /* Number of right-hand side. */ int ii, omp_mpi_level; diff --git a/EXAMPLE/pzdrive4_ABglobal.c b/EXAMPLE/pzdrive4_ABglobal.c index 4ec17583..5515e885 100644 --- a/EXAMPLE/pzdrive4_ABglobal.c +++ b/EXAMPLE/pzdrive4_ABglobal.c @@ -59,7 +59,7 @@ int main(int argc, char *argv[]) int_t *asub, *xa; int_t i, j, m, n, nnz; int_t nprow, npcol, ldumap, p; - int_t usermap[6]; + int usermap[6]; int iam, info, ldb, ldx, nprocs; int nrhs = 1; /* Number of right-hand side. */ char trans[1]; @@ -125,7 +125,7 @@ int main(int argc, char *argv[]) /* Bail out if I do not belong in any of the 2 grids. 
*/ MPI_Comm_rank( MPI_COMM_WORLD, &iam ); - if ( iam >= 10 ) goto out; + if ( iam == -1 ) goto out; #if ( DEBUGlevel>=1 ) CHECK_MALLOC(iam, "Enter main()"); diff --git a/FORTRAN/CMakeLists.txt b/FORTRAN/CMakeLists.txt index 4516170b..99745790 100644 --- a/FORTRAN/CMakeLists.txt +++ b/FORTRAN/CMakeLists.txt @@ -6,7 +6,8 @@ set(headers ${CMAKE_INSTALL_PREFIX}/FORTRAN/superlupara_mod.mod ) -set(sources "") # initialize an empty set +set(sources "superlu_c2f_wrap.c") # initialize precision-independent file + if(enable_double) list(APPEND sources c2f_dcreate_matrix_x_b.c superlu_c2f_dwrap.c) endif() diff --git a/FORTRAN/f_5x5.f90 b/FORTRAN/f_5x5.f90 index 48e391e8..1b750533 100644 --- a/FORTRAN/f_5x5.f90 +++ b/FORTRAN/f_5x5.f90 @@ -36,12 +36,13 @@ program f_5x5 use superlu_mod ! implicit none include 'mpif.h' +! include 'superlu_dist_config.fh' integer maxn, maxnz, maxnrhs parameter ( maxn = 10, maxnz = 100, maxnrhs = 10 ) - integer colind(maxnz), rowptr(maxn+1) + integer*8 colind(maxnz), rowptr(maxn+1) real*8 nzval(maxnz), b(maxn), berr(maxnrhs) integer n, m, nnz, nrhs, nprow, npcol, init - integer*4 iam, info, i, ierr, ldb + integer iam, info, i, ierr, ldb integer nnz_loc, m_loc, fst_row real*8 s, u, p, e, r, l @@ -62,9 +63,9 @@ program f_5x5 ! Create Fortran handles for the C structures used in SuperLU_DIST call f_create_gridinfo_handle(grid) call f_create_options_handle(options) - call f_create_ScalePerm_handle(ScalePermstruct) - call f_create_LUstruct_handle(LUstruct) - call f_create_SOLVEstruct_handle(SOLVEstruct) + call f_dcreate_ScalePerm_handle(ScalePermstruct) + call f_dcreate_LUstruct_handle(LUstruct) + call f_dcreate_SOLVEstruct_handle(SOLVEstruct) call f_create_SuperMatrix_handle(A) call f_create_SuperLUStat_handle(stat) @@ -176,8 +177,8 @@ program f_5x5 ! Initialize ScalePermstruct and LUstruct call get_SuperMatrix(A,nrow=m,ncol=n) - call f_ScalePermstructInit(m, n, ScalePermstruct) - call f_LUstructInit(m, n, LUstruct) + call f_dScalePermstructInit(m, n, ScalePermstruct) + call f_dLUstructInit(m, n, LUstruct) ! Initialize the statistics variables call f_PStatInit(stat) @@ -195,8 +196,8 @@ program f_5x5 ! Deallocate the storage allocated by SuperLU_DIST call f_PStatFree(stat) call f_Destroy_SuperMat_Store_dist(A) - call f_ScalePermstructFree(ScalePermstruct) - call f_Destroy_LU_SOLVE_struct(options, n, grid, LUstruct, SOLVEstruct) + call f_dScalePermstructFree(ScalePermstruct) + call f_dDestroy_LU_SOLVE_struct(options, n, grid, LUstruct, SOLVEstruct) ! call f_LUstructFree(LUstruct) ! call get_superlu_options(options, SolveInitialized=init) ! if (init == YES) then @@ -212,7 +213,11 @@ program f_5x5 call f_destroy_ScalePerm_handle(ScalePermstruct) call f_destroy_LUstruct_handle(LUstruct) call f_destroy_SOLVEstruct_handle(SOLVEstruct) + +! call f_Destroy_CompRowLoc_Mat_dist(A) +! need to free the supermatrix Store call f_destroy_SuperMatrix_handle(A) + call f_destroy_SuperLUStat_handle(stat) ! Check malloc diff --git a/FORTRAN/f_pddrive.f90 b/FORTRAN/f_pddrive.f90 index aa8f7e22..a67de8f0 100644 --- a/FORTRAN/f_pddrive.f90 +++ b/FORTRAN/f_pddrive.f90 @@ -57,9 +57,9 @@ program f_pddrive ! 
Create Fortran handles for the C structures used in SuperLU_DIST call f_create_gridinfo_handle(grid) call f_create_options_handle(options) - call f_create_ScalePerm_handle(ScalePermstruct) - call f_create_LUstruct_handle(LUstruct) - call f_create_SOLVEstruct_handle(SOLVEstruct) + call f_dcreate_ScalePerm_handle(ScalePermstruct) + call f_dcreate_LUstruct_handle(LUstruct) + call f_dcreate_SOLVEstruct_handle(SOLVEstruct) call f_create_SuperMatrix_handle(A) call f_create_SuperLUStat_handle(stat) @@ -96,8 +96,8 @@ program f_pddrive ! Initialize ScalePermstruct and LUstruct call get_SuperMatrix(A, nrow=m, ncol=n) - call f_ScalePermstructInit(m, n, ScalePermstruct) - call f_LUstructInit(m, n, LUstruct) + call f_dScalePermstructInit(m, n, ScalePermstruct) + call f_dLUstructInit(m, n, LUstruct) ! Initialize the statistics variables call f_PStatInit(stat) @@ -117,8 +117,8 @@ program f_pddrive ! Deallocate the storage allocated by SuperLU_DIST call f_PStatFree(stat) call f_Destroy_CompRowLoc_Mat_dist(A) - call f_ScalePermstructFree(ScalePermstruct) - call f_Destroy_LU_SOLVE_struct(options, n, grid, LUstruct, SOLVEstruct) + call f_dScalePermstructFree(ScalePermstruct) + call f_dDestroy_LU_SOLVE_struct(options, n, grid, LUstruct, SOLVEstruct) ! Release the SuperLU process grid 100 call f_superlu_gridexit(grid) diff --git a/FORTRAN/f_pddrive3d.f90 b/FORTRAN/f_pddrive3d.f90 index b124af36..ffe0353b 100644 --- a/FORTRAN/f_pddrive3d.f90 +++ b/FORTRAN/f_pddrive3d.f90 @@ -69,9 +69,9 @@ program f_pddrive3d ! Create Fortran handles for the C structures used in SuperLU_DIST call f_create_gridinfo3d_handle(grid) call f_create_options_handle(options) - call f_create_ScalePerm_handle(ScalePermstruct) - call f_create_LUstruct_handle(LUstruct) - call f_create_SOLVEstruct_handle(SOLVEstruct) + call f_dcreate_ScalePerm_handle(ScalePermstruct) + call f_dcreate_LUstruct_handle(LUstruct) + call f_dcreate_SOLVEstruct_handle(SOLVEstruct) call f_create_SuperMatrix_handle(A) call f_create_SuperLUStat_handle(stat) @@ -109,8 +109,8 @@ program f_pddrive3d ! Initialize ScalePermstruct and LUstruct call get_SuperMatrix(A, nrow=m, ncol=n) - call f_ScalePermstructInit(m, n, ScalePermstruct) - call f_LUstructInit(m, n, LUstruct) + call f_dScalePermstructInit(m, n, ScalePermstruct) + call f_dLUstructInit(m, n, LUstruct) ! Initialize the statistics variables call f_PStatInit(stat) @@ -130,8 +130,8 @@ program f_pddrive3d ! Deallocate the storage allocated by SuperLU_DIST call f_PStatFree(stat) call f_Destroy_CompRowLoc_Mat_dist(A) - call f_ScalePermstructFree(ScalePermstruct) - call f_Destroy_LU_SOLVE_struct_3d(options, n, grid, LUstruct, SOLVEstruct) + call f_dScalePermstructFree(ScalePermstruct) + call f_dDestroy_LU_SOLVE_struct_3d(options, n, grid, LUstruct, SOLVEstruct) ! call f_LUstructFree(LUstruct) ! call get_superlu_options(options, SolveInitialized=init) ! if (init == YES) then diff --git a/FORTRAN/f_pzdrive.f90 b/FORTRAN/f_pzdrive.f90 index 8609621d..7bbe9ead 100644 --- a/FORTRAN/f_pzdrive.f90 +++ b/FORTRAN/f_pzdrive.f90 @@ -57,9 +57,9 @@ program f_pzdrive ! 
Create Fortran handles for the C structures used in SuperLU_DIST call f_create_gridinfo_handle(grid) call f_create_options_handle(options) - call f_create_ScalePerm_handle(ScalePermstruct) - call f_create_LUstruct_handle(LUstruct) - call f_create_SOLVEstruct_handle(SOLVEstruct) + call f_zcreate_ScalePerm_handle(ScalePermstruct) + call f_zcreate_LUstruct_handle(LUstruct) + call f_zcreate_SOLVEstruct_handle(SOLVEstruct) call f_create_SuperMatrix_handle(A) call f_create_SuperLUStat_handle(stat) @@ -96,8 +96,8 @@ program f_pzdrive ! Initialize ScalePermstruct and LUstruct call get_SuperMatrix(A, nrow=m, ncol=n) - call f_ScalePermstructInit(m, n, ScalePermstruct) - call f_LUstructInit(m, n, LUstruct) + call f_zScalePermstructInit(m, n, ScalePermstruct) + call f_zLUstructInit(m, n, LUstruct) ! Initialize the statistics variables call f_PStatInit(stat) @@ -117,8 +117,8 @@ program f_pzdrive ! Deallocate the storage allocated by SuperLU_DIST call f_PStatFree(stat) call f_Destroy_CompRowLoc_Mat_dist(A) - call f_ScalePermstructFree(ScalePermstruct) - call f_Destroy_LU_SOLVE_struct(options, n, grid, LUstruct, SOLVEstruct) + call f_zScalePermstructFree(ScalePermstruct) + call f_zDestroy_LU_SOLVE_struct(options, n, grid, LUstruct, SOLVEstruct) ! Release the SuperLU process grid 100 call f_superlu_gridexit(grid) diff --git a/FORTRAN/f_pzdrive3d.f90 b/FORTRAN/f_pzdrive3d.f90 index 42a0a12d..f64c50d7 100644 --- a/FORTRAN/f_pzdrive3d.f90 +++ b/FORTRAN/f_pzdrive3d.f90 @@ -69,9 +69,9 @@ program f_pzdrive3d ! Create Fortran handles for the C structures used in SuperLU_DIST call f_create_gridinfo3d_handle(grid) call f_create_options_handle(options) - call f_create_ScalePerm_handle(ScalePermstruct) - call f_create_LUstruct_handle(LUstruct) - call f_create_SOLVEstruct_handle(SOLVEstruct) + call f_zcreate_ScalePerm_handle(ScalePermstruct) + call f_zcreate_LUstruct_handle(LUstruct) + call f_zcreate_SOLVEstruct_handle(SOLVEstruct) call f_create_SuperMatrix_handle(A) call f_create_SuperLUStat_handle(stat) @@ -109,8 +109,8 @@ program f_pzdrive3d ! Initialize ScalePermstruct and LUstruct call get_SuperMatrix(A, nrow=m, ncol=n) - call f_ScalePermstructInit(m, n, ScalePermstruct) - call f_LUstructInit(m, n, LUstruct) + call f_zScalePermstructInit(m, n, ScalePermstruct) + call f_zLUstructInit(m, n, LUstruct) ! Initialize the statistics variables call f_PStatInit(stat) @@ -130,8 +130,8 @@ program f_pzdrive3d ! Deallocate the storage allocated by SuperLU_DIST call f_PStatFree(stat) call f_Destroy_CompRowLoc_Mat_dist(A) - call f_ScalePermstructFree(ScalePermstruct) - call f_Destroy_LU_SOLVE_struct_3d(options, n, grid, LUstruct, SOLVEstruct) + call f_zScalePermstructFree(ScalePermstruct) + call f_zDestroy_LU_SOLVE_struct_3d(options, n, grid, LUstruct, SOLVEstruct) ! call f_LUstructFree(LUstruct) ! call get_superlu_options(options, SolveInitialized=init) ! if (init == YES) then diff --git a/FORTRAN/superlu_c2f_dwrap.c b/FORTRAN/superlu_c2f_dwrap.c index 616f8729..018d24a2 100644 --- a/FORTRAN/superlu_c2f_dwrap.c +++ b/FORTRAN/superlu_c2f_dwrap.c @@ -13,273 +13,59 @@ #include "superlu_ddefs.h" #include "superlu_FCnames.h" -/* kind of integer to hold a pointer. Use int. - This might need to be changed on systems with large memory. - If changed, be sure to change it in superlupara.f90 too */ +/* kind of integer to hold a pointer. 
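+   (The Fortran side stores this handle as a plain 64-bit integer: the
+    drivers in this series call f_create_gridinfo_handle(grid) once and
+    then hand the same integer back to every wrapper, so it merely
+    carries the address returned by SUPERLU_MALLOC below.)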
+ Be sure to be consistent with that in superlupara.f90 */ #if 0 typedef int fptr; /* 32-bit */ #else typedef long long int fptr; /* 64-bit */ #endif - -/* some MPI implementations may require conversion between a Fortran - communicator and a C communicator. This routine is used to perform the - conversion. It may need different forms for different MPI libraries. */ - -/* NO_MPI2 should be defined on the compiler command line if the MPI - library does not provide MPI_Comm_f2c */ - -MPI_Comm f2c_comm(int *f_comm) -{ -#ifndef NO_MPI2 - -/* MPI 2 provides a standard way of doing this */ - return MPI_Comm_f2c((MPI_Fint)(*f_comm)); -#else - -/* will probably need some special cases here */ -/* when in doubt, just return the input */ - return (MPI_Comm)(*f_comm); -#endif -} - - /* functions that create memory for a struct and return a handle */ -void f_create_gridinfo_handle(fptr *handle) -{ - *handle = (fptr) SUPERLU_MALLOC(sizeof(gridinfo_t)); -} - -void f_create_gridinfo3d_handle(fptr *handle) -{ - *handle = (fptr) SUPERLU_MALLOC(sizeof(gridinfo3d_t)); -} - -void f_create_options_handle(fptr *handle) -{ - *handle = (fptr) SUPERLU_MALLOC(sizeof(superlu_dist_options_t)); -} - -void f_create_ScalePerm_handle(fptr *handle) +void f_dcreate_ScalePerm_handle(fptr *handle) { *handle = (fptr) SUPERLU_MALLOC(sizeof(dScalePermstruct_t)); } -void f_create_LUstruct_handle(fptr *handle) +void f_dcreate_LUstruct_handle(fptr *handle) { *handle = (fptr) SUPERLU_MALLOC(sizeof(dLUstruct_t)); } -void f_create_SOLVEstruct_handle(fptr *handle) +void f_dcreate_SOLVEstruct_handle(fptr *handle) { *handle = (fptr) SUPERLU_MALLOC(sizeof(dSOLVEstruct_t)); } -void f_create_SuperMatrix_handle(fptr *handle) -{ - *handle = (fptr) SUPERLU_MALLOC(sizeof(SuperMatrix)); -} - -void f_create_SuperLUStat_handle(fptr *handle) -{ - *handle = (fptr) SUPERLU_MALLOC(sizeof(SuperLUStat_t)); -} - -/* functions that free the memory allocated by the above functions */ - -void f_destroy_gridinfo_handle(fptr *handle) -{ - SUPERLU_FREE((void *)*handle); -} - -void f_destroy_options_handle(fptr *handle) -{ - SUPERLU_FREE((void *)*handle); -} - -void f_destroy_ScalePerm_handle(fptr *handle) -{ - SUPERLU_FREE((void *)*handle); -} - -void f_destroy_LUstruct_handle(fptr *handle) -{ - SUPERLU_FREE((void *)*handle); -} - -void f_destroy_SOLVEstruct_handle(fptr *handle) -{ - SUPERLU_FREE((void *)*handle); -} - -void f_destroy_SuperMatrix_handle(fptr *handle) -{ - SUPERLU_FREE((void *)*handle); -} - -void f_destroy_SuperLUStat_handle(fptr *handle) -{ - SUPERLU_FREE((void *)*handle); -} - -/* functions that get or set values in a C struct. - This is not the complete set of structs for which a user might want - to get/set a component, and there may be missing components. 
*/ - -void f_get_gridinfo(fptr *grid, int *iam, int_t *nprow, int_t *npcol) -{ - *iam=((gridinfo_t *) *grid)->iam; - *npcol=((gridinfo_t *) *grid)->npcol; - *nprow=((gridinfo_t *) *grid)->nprow; -} - -void f_get_gridinfo3d(fptr *grid, int *iam, - int_t *nprow, int_t *npcol, int_t *npdep) -{ - *iam=((gridinfo3d_t *) *grid)->iam; - *npcol=((gridinfo3d_t *) *grid)->npcol; - *nprow=((gridinfo3d_t *) *grid)->nprow; - *npdep=((gridinfo3d_t *) *grid)->npdep; -} - -void f_get_SuperMatrix(fptr *A, int_t *nrow, int_t *ncol) -{ - *nrow = ((SuperMatrix *) *A)->nrow; - *ncol = ((SuperMatrix *) *A)->ncol; -} - -void f_set_SuperMatrix(fptr *A, int_t *nrow, int_t *ncol) -{ - ((SuperMatrix *) *A)->nrow = *nrow; - ((SuperMatrix *) *A)->ncol = *ncol; -} - -void f_get_CompRowLoc_Matrix(fptr *A, int_t *m, int_t *n, int_t *nnz_loc, - int_t *m_loc, int_t *fst_row) -{ - *m=((SuperMatrix *) *A)->nrow; - *n=((SuperMatrix *) *A)->ncol; - *m_loc=((NRformat_loc *) ((SuperMatrix *) *A)->Store)->m_loc; - *nnz_loc=((NRformat_loc *) ((SuperMatrix *) *A)->Store)->nnz_loc; - *fst_row=((NRformat_loc *) ((SuperMatrix *) *A)->Store)->fst_row; -} - -void f_set_CompRowLoc_Matrix(fptr *A, int_t *m, int_t *n, int_t *nnz_loc, - int_t *m_loc, int_t *fst_row) -{ - ((SuperMatrix *) *A)->nrow = *m; - ((SuperMatrix *) *A)->ncol = *n; - ((NRformat_loc *) ((SuperMatrix *) *A)->Store)->m_loc = *m_loc; - ((NRformat_loc *) ((SuperMatrix *) *A)->Store)->nnz_loc = *nnz_loc; - ((NRformat_loc *) ((SuperMatrix *) *A)->Store)->fst_row = *fst_row; -} - -void f_get_superlu_options(fptr *opt, int *Fact, int *Equil, int *ParSymbFact, - int *ColPerm, int *RowPerm, int *IterRefine, - int *Trans, int *ReplaceTinyPivot, - int *SolveInitialized, int *RefineInitialized, - int *PrintStat) -{ - *Fact = (int) ((superlu_dist_options_t *) *opt)->Fact; - *Equil = (int) ((superlu_dist_options_t *) *opt)->Equil; - *ParSymbFact = (int) ((superlu_dist_options_t *) *opt)->ParSymbFact; - *ColPerm = (int) ((superlu_dist_options_t *) *opt)->ColPerm; - *RowPerm = (int) ((superlu_dist_options_t *) *opt)->RowPerm; - *IterRefine = (int) ((superlu_dist_options_t *) *opt)->IterRefine; - *Trans = (int) ((superlu_dist_options_t *) *opt)->Trans; - *ReplaceTinyPivot = (int) ((superlu_dist_options_t *) *opt)->ReplaceTinyPivot; - *SolveInitialized = (int) ((superlu_dist_options_t *) *opt)->SolveInitialized; - *RefineInitialized = (int) ((superlu_dist_options_t *) *opt)->RefineInitialized; - *PrintStat = (int) ((superlu_dist_options_t *) *opt)->PrintStat; -} - -void f_set_superlu_options(fptr *opt, int *Fact, int *Equil, int *ParSymbFact, - int *ColPerm, int *RowPerm, int *IterRefine, - int *Trans, int *ReplaceTinyPivot, - int *SolveInitialized, int *RefineInitialized, - int *PrintStat) -{ - superlu_dist_options_t *l_options = (superlu_dist_options_t*) *opt; - l_options->Fact = (fact_t) *Fact; - ((superlu_dist_options_t *) *opt)->Equil = (yes_no_t) *Equil; - ((superlu_dist_options_t *) *opt)->ParSymbFact = (yes_no_t) *ParSymbFact; - ((superlu_dist_options_t *) *opt)->ColPerm = (colperm_t) *ColPerm; - ((superlu_dist_options_t *) *opt)->RowPerm = (rowperm_t) *RowPerm; - ((superlu_dist_options_t *) *opt)->IterRefine = (IterRefine_t) *IterRefine; - ((superlu_dist_options_t *) *opt)->Trans = (trans_t) *Trans; - ((superlu_dist_options_t *) *opt)->ReplaceTinyPivot = (yes_no_t) *ReplaceTinyPivot; - ((superlu_dist_options_t *) *opt)->SolveInitialized = (yes_no_t) *SolveInitialized; - ((superlu_dist_options_t *) *opt)->RefineInitialized = (yes_no_t) *RefineInitialized; - 
((superlu_dist_options_t *) *opt)->PrintStat = (yes_no_t) *PrintStat; -} - /* wrappers for SuperLU functions */ -void f_set_default_options(fptr *options) -{ - set_default_options_dist((superlu_dist_options_t *) *options); -} - -void f_superlu_gridinit(int *Bcomm, int_t *nprow, int_t *npcol, fptr *grid) -{ - superlu_gridinit(f2c_comm(Bcomm), *nprow, *npcol, (gridinfo_t *) *grid); -} - -void f_superlu_gridinit3d(int *Bcomm, int_t *nprow, int_t *npcol, - int_t *npdep, fptr *grid) -{ - superlu_gridinit3d(f2c_comm(Bcomm), *nprow, *npcol, *npdep, (gridinfo3d_t *) *grid); -} - -void f_superlu_gridmap(int *Bcomm, int_t *nprow, int_t *npcol, - int_t *usermap, int_t *ldumap, fptr *grid) -{ - superlu_gridmap(f2c_comm(Bcomm), *nprow, *npcol, usermap, *ldumap, (gridinfo_t *) *grid); -} - -void f_superlu_gridexit(fptr *grid) -{ - superlu_gridexit((gridinfo_t *) *grid); -} - -void f_ScalePermstructInit(int_t *m, int_t *n, fptr *ScalePermstruct) +void f_dScalePermstructInit(int *m, int *n, fptr *ScalePermstruct) { dScalePermstructInit(*m, *n, (dScalePermstruct_t *) *ScalePermstruct); } -void f_ScalePermstructFree(fptr *ScalePermstruct) +void f_dScalePermstructFree(fptr *ScalePermstruct) { dScalePermstructFree((dScalePermstruct_t *) *ScalePermstruct); } -void f_PStatInit(fptr *stat) -{ - PStatInit((SuperLUStat_t *) *stat); -} - -void f_PStatFree(fptr *stat) -{ - PStatFree((SuperLUStat_t *) *stat); -} - -void f_LUstructInit(int_t *m, int_t *n, fptr *LUstruct) +void f_dLUstructInit(int *m, int *n, fptr *LUstruct) { extern void dLUstructInit(const int_t, dLUstruct_t *); dLUstructInit(*m, (dLUstruct_t *) *LUstruct); } -void f_LUstructFree(fptr *LUstruct) +void f_dLUstructFree(fptr *LUstruct) { extern void dLUstructFree(dLUstruct_t *); dLUstructFree((dLUstruct_t *) *LUstruct); } -void f_Destroy_LU_SOLVE_struct(fptr *options, int_t *n, fptr *grid, +void f_dDestroy_LU_SOLVE_struct(fptr *options, int *n, fptr *grid, fptr *LUstruct, fptr *SOLVEstruct) { superlu_dist_options_t *opt = (superlu_dist_options_t *) *options; @@ -290,7 +76,7 @@ void f_Destroy_LU_SOLVE_struct(fptr *options, int_t *n, fptr *grid, } } -void f_Destroy_LU_SOLVE_struct_3d(fptr *options, int_t *n, fptr *grid, +void f_dDestroy_LU_SOLVE_struct_3d(fptr *options, int *n, fptr *grid, fptr *LUstruct, fptr *SOLVEstruct) { gridinfo3d_t *grid3d = (gridinfo3d_t *) *grid; @@ -310,27 +96,37 @@ void f_Destroy_LU_SOLVE_struct_3d(fptr *options, int_t *n, fptr *grid, dLUstructFree(LUstruct_ptr); } -void f_dCreate_CompRowLoc_Mat_dist(fptr *A, int_t *m, int_t *n, int_t *nnz_loc, - int_t *m_loc, int_t *fst_row, double *nzval, +void f_dCreate_CompRowLoc_Mat_dist(fptr *A, int *m, int *n, int *nnz_loc, + int *m_loc, int *fst_row, double *nzval, int_t *colind, int_t *rowptr, int *stype, int *dtype, int *mtype) { - dCreate_CompRowLoc_Matrix_dist((SuperMatrix *) *A, *m, *n, *nnz_loc, *m_loc, - *fst_row, (double *) nzval, colind, rowptr, +#if 1 + double *C_nzval = nzval; + int_t *C_colind = colind; + int_t *C_rowptr = rowptr; +#else + /* make a copy of matrix A that is internal to the C side */ + double *C_nzval = doubleMalloc_dist(*nnz_loc); + int_t *C_colind = intMalloc_dist(*nnz_loc); + int_t *C_rowptr = intMalloc_dist(*m_loc + 1); + int i; + + for (i = 0; i < *nnz_loc; ++i) { + C_nzval[i] = nzval[i]; + C_colind[i] = colind[i]; + } + for (i = 0; i <= *m_loc; ++i) { + C_rowptr[i] = rowptr[i]; + } +#endif + + dCreate_CompRowLoc_Matrix_dist((SuperMatrix *) *A, *m, *n, *nnz_loc, *m_loc, + *fst_row, C_nzval, C_colind, C_rowptr, (Stype_t) *stype, (Dtype_t) *dtype, 
(Mtype_t) *mtype); } -void f_Destroy_CompRowLoc_Mat_dist(fptr *A) -{ - Destroy_CompRowLoc_Matrix_dist((SuperMatrix *) *A); -} - -void f_Destroy_SuperMat_Store_dist(fptr *A) -{ - Destroy_SuperMatrix_Store_dist((SuperMatrix *) *A); -} - void f_dSolveFinalize(fptr *options, fptr *SOLVEstruct) { dSolveFinalize((superlu_dist_options_t *) *options, @@ -379,9 +175,10 @@ void f_dcreate_matrix_x_b(char *fname, fptr *A, int *m, int *n, int_t *nnz, MPI_Comm, SuperMatrix *A, int *m_g, int *n_g, int_t *nnz_g, double *rhs, int *ldb, double *x, int *ldx); + extern void f_get_gridinfo(fptr *grid, int *iam, int *nprow, int *npcol); int iam, nprocs; - int_t nprow, npcol; + int nprow, npcol; MPI_Comm slucomm = ((gridinfo_t *) *grid)->comm; f_get_gridinfo(grid, &iam, &nprow, &npcol); nprocs = nprow * npcol; @@ -398,9 +195,11 @@ void f_dcreate_matrix_x_b_3d(char *fname, fptr *A, int *m, int *n, int_t *nnz, MPI_Comm, SuperMatrix *A, int *m_g, int *n_g, int_t *nnz_g, double *rhs, int *ldb, double *x, int *ldx); + extern void f_get_gridinfo3d(fptr *grid, int *iam, + int *nprow, int *npcol, int *npdep); int iam, nprocs; - int_t nprow, npcol, npdep; + int nprow, npcol, npdep; MPI_Comm slucomm = ((gridinfo3d_t *) *grid)->comm; f_get_gridinfo3d(grid, &iam, &nprow, &npcol, &npdep); nprocs = nprow * npcol * npdep; @@ -409,11 +208,3 @@ void f_dcreate_matrix_x_b_3d(char *fname, fptr *A, int *m, int *n, int_t *nnz, (SuperMatrix *) *A, m, n, nnz, b, ldb, xtrue, ldx); } -/* Check malloc */ - -void f_check_malloc(int *iam) -{ -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC((int_t) *iam, "Check Malloc"); -#endif -} diff --git a/FORTRAN/superlu_c2f_zwrap.c b/FORTRAN/superlu_c2f_zwrap.c index 7f5c990a..467d4131 100644 --- a/FORTRAN/superlu_c2f_zwrap.c +++ b/FORTRAN/superlu_c2f_zwrap.c @@ -12,273 +12,59 @@ #include "superlu_zdefs.h" #include "superlu_FCnames.h" -/* kind of integer to hold a pointer. Use int. - This might need to be changed on systems with large memory. - If changed, be sure to change it in superlupara.f90 too */ +/* kind of integer to hold a pointer. + Be sure to be consistent with that in superlupara.f90 */ #if 0 typedef int fptr; /* 32-bit */ #else typedef long long int fptr; /* 64-bit */ #endif - -/* some MPI implementations may require conversion between a Fortran - communicator and a C communicator. This routine is used to perform the - conversion. It may need different forms for different MPI libraries. 
*/ - -/* NO_MPI2 should be defined on the compiler command line if the MPI - library does not provide MPI_Comm_f2c */ - -MPI_Comm f2c_comm(int *f_comm) -{ -#ifndef NO_MPI2 - -/* MPI 2 provides a standard way of doing this */ - return MPI_Comm_f2c((MPI_Fint)(*f_comm)); -#else - -/* will probably need some special cases here */ -/* when in doubt, just return the input */ - return (MPI_Comm)(*f_comm); -#endif -} - - /* functions that create memory for a struct and return a handle */ -void f_create_gridinfo_handle(fptr *handle) -{ - *handle = (fptr) SUPERLU_MALLOC(sizeof(gridinfo_t)); -} - -void f_create_gridinfo3d_handle(fptr *handle) -{ - *handle = (fptr) SUPERLU_MALLOC(sizeof(gridinfo3d_t)); -} - -void f_create_options_handle(fptr *handle) -{ - *handle = (fptr) SUPERLU_MALLOC(sizeof(superlu_dist_options_t)); -} - -void f_create_ScalePerm_handle(fptr *handle) +void f_zcreate_ScalePerm_handle(fptr *handle) { *handle = (fptr) SUPERLU_MALLOC(sizeof(zScalePermstruct_t)); } -void f_create_LUstruct_handle(fptr *handle) +void f_zcreate_LUstruct_handle(fptr *handle) { *handle = (fptr) SUPERLU_MALLOC(sizeof(zLUstruct_t)); } -void f_create_SOLVEstruct_handle(fptr *handle) +void f_zcreate_SOLVEstruct_handle(fptr *handle) { *handle = (fptr) SUPERLU_MALLOC(sizeof(zSOLVEstruct_t)); } -void f_create_SuperMatrix_handle(fptr *handle) -{ - *handle = (fptr) SUPERLU_MALLOC(sizeof(SuperMatrix)); -} - -void f_create_SuperLUStat_handle(fptr *handle) -{ - *handle = (fptr) SUPERLU_MALLOC(sizeof(SuperLUStat_t)); -} - -/* functions that free the memory allocated by the above functions */ - -void f_destroy_gridinfo_handle(fptr *handle) -{ - SUPERLU_FREE((void *)*handle); -} - -void f_destroy_options_handle(fptr *handle) -{ - SUPERLU_FREE((void *)*handle); -} - -void f_destroy_ScalePerm_handle(fptr *handle) -{ - SUPERLU_FREE((void *)*handle); -} - -void f_destroy_LUstruct_handle(fptr *handle) -{ - SUPERLU_FREE((void *)*handle); -} - -void f_destroy_SOLVEstruct_handle(fptr *handle) -{ - SUPERLU_FREE((void *)*handle); -} - -void f_destroy_SuperMatrix_handle(fptr *handle) -{ - SUPERLU_FREE((void *)*handle); -} - -void f_destroy_SuperLUStat_handle(fptr *handle) -{ - SUPERLU_FREE((void *)*handle); -} - -/* functions that get or set values in a C struct. - This is not the complete set of structs for which a user might want - to get/set a component, and there may be missing components. 
*/ - -void f_get_gridinfo(fptr *grid, int *iam, int_t *nprow, int_t *npcol) -{ - *iam=((gridinfo_t *) *grid)->iam; - *npcol=((gridinfo_t *) *grid)->npcol; - *nprow=((gridinfo_t *) *grid)->nprow; -} - -void f_get_gridinfo3d(fptr *grid, int *iam, - int_t *nprow, int_t *npcol, int_t *npdep) -{ - *iam=((gridinfo3d_t *) *grid)->iam; - *npcol=((gridinfo3d_t *) *grid)->npcol; - *nprow=((gridinfo3d_t *) *grid)->nprow; - *npdep=((gridinfo3d_t *) *grid)->npdep; -} - -void f_get_SuperMatrix(fptr *A, int_t *nrow, int_t *ncol) -{ - *nrow = ((SuperMatrix *) *A)->nrow; - *ncol = ((SuperMatrix *) *A)->ncol; -} - -void f_set_SuperMatrix(fptr *A, int_t *nrow, int_t *ncol) -{ - ((SuperMatrix *) *A)->nrow = *nrow; - ((SuperMatrix *) *A)->ncol = *ncol; -} - -void f_get_CompRowLoc_Matrix(fptr *A, int_t *m, int_t *n, int_t *nnz_loc, - int_t *m_loc, int_t *fst_row) -{ - *m=((SuperMatrix *) *A)->nrow; - *n=((SuperMatrix *) *A)->ncol; - *m_loc=((NRformat_loc *) ((SuperMatrix *) *A)->Store)->m_loc; - *nnz_loc=((NRformat_loc *) ((SuperMatrix *) *A)->Store)->nnz_loc; - *fst_row=((NRformat_loc *) ((SuperMatrix *) *A)->Store)->fst_row; -} - -void f_set_CompRowLoc_Matrix(fptr *A, int_t *m, int_t *n, int_t *nnz_loc, - int_t *m_loc, int_t *fst_row) -{ - ((SuperMatrix *) *A)->nrow = *m; - ((SuperMatrix *) *A)->ncol = *n; - ((NRformat_loc *) ((SuperMatrix *) *A)->Store)->m_loc = *m_loc; - ((NRformat_loc *) ((SuperMatrix *) *A)->Store)->nnz_loc = *nnz_loc; - ((NRformat_loc *) ((SuperMatrix *) *A)->Store)->fst_row = *fst_row; -} - -void f_get_superlu_options(fptr *opt, int *Fact, int *Equil, int *ParSymbFact, - int *ColPerm, int *RowPerm, int *IterRefine, - int *Trans, int *ReplaceTinyPivot, - int *SolveInitialized, int *RefineInitialized, - int *PrintStat) -{ - *Fact = (int) ((superlu_dist_options_t *) *opt)->Fact; - *Equil = (int) ((superlu_dist_options_t *) *opt)->Equil; - *ParSymbFact = (int) ((superlu_dist_options_t *) *opt)->ParSymbFact; - *ColPerm = (int) ((superlu_dist_options_t *) *opt)->ColPerm; - *RowPerm = (int) ((superlu_dist_options_t *) *opt)->RowPerm; - *IterRefine = (int) ((superlu_dist_options_t *) *opt)->IterRefine; - *Trans = (int) ((superlu_dist_options_t *) *opt)->Trans; - *ReplaceTinyPivot = (int) ((superlu_dist_options_t *) *opt)->ReplaceTinyPivot; - *SolveInitialized = (int) ((superlu_dist_options_t *) *opt)->SolveInitialized; - *RefineInitialized = (int) ((superlu_dist_options_t *) *opt)->RefineInitialized; - *PrintStat = (int) ((superlu_dist_options_t *) *opt)->PrintStat; -} - -void f_set_superlu_options(fptr *opt, int *Fact, int *Equil, int *ParSymbFact, - int *ColPerm, int *RowPerm, int *IterRefine, - int *Trans, int *ReplaceTinyPivot, - int *SolveInitialized, int *RefineInitialized, - int *PrintStat) -{ - superlu_dist_options_t *l_options = (superlu_dist_options_t*) *opt; - l_options->Fact = (fact_t) *Fact; - ((superlu_dist_options_t *) *opt)->Equil = (yes_no_t) *Equil; - ((superlu_dist_options_t *) *opt)->ParSymbFact = (yes_no_t) *ParSymbFact; - ((superlu_dist_options_t *) *opt)->ColPerm = (colperm_t) *ColPerm; - ((superlu_dist_options_t *) *opt)->RowPerm = (rowperm_t) *RowPerm; - ((superlu_dist_options_t *) *opt)->IterRefine = (IterRefine_t) *IterRefine; - ((superlu_dist_options_t *) *opt)->Trans = (trans_t) *Trans; - ((superlu_dist_options_t *) *opt)->ReplaceTinyPivot = (yes_no_t) *ReplaceTinyPivot; - ((superlu_dist_options_t *) *opt)->SolveInitialized = (yes_no_t) *SolveInitialized; - ((superlu_dist_options_t *) *opt)->RefineInitialized = (yes_no_t) *RefineInitialized; - 
((superlu_dist_options_t *) *opt)->PrintStat = (yes_no_t) *PrintStat; -} - /* wrappers for SuperLU functions */ -void f_set_default_options(fptr *options) -{ - set_default_options_dist((superlu_dist_options_t *) *options); -} - -void f_superlu_gridinit(int *Bcomm, int_t *nprow, int_t *npcol, fptr *grid) -{ - superlu_gridinit(f2c_comm(Bcomm), *nprow, *npcol, (gridinfo_t *) *grid); -} - -void f_superlu_gridinit3d(int *Bcomm, int_t *nprow, int_t *npcol, - int_t *npdep, fptr *grid) -{ - superlu_gridinit3d(f2c_comm(Bcomm), *nprow, *npcol, *npdep, (gridinfo3d_t *) *grid); -} - -void f_superlu_gridmap(int *Bcomm, int_t *nprow, int_t *npcol, - int_t *usermap, int_t *ldumap, fptr *grid) -{ - superlu_gridmap(f2c_comm(Bcomm), *nprow, *npcol, usermap, *ldumap, (gridinfo_t *) *grid); -} - -void f_superlu_gridexit(fptr *grid) -{ - superlu_gridexit((gridinfo_t *) *grid); -} - -void f_ScalePermstructInit(int_t *m, int_t *n, fptr *ScalePermstruct) +void f_zScalePermstructInit(int *m, int *n, fptr *ScalePermstruct) { zScalePermstructInit(*m, *n, (zScalePermstruct_t *) *ScalePermstruct); } -void f_ScalePermstructFree(fptr *ScalePermstruct) +void f_zScalePermstructFree(fptr *ScalePermstruct) { zScalePermstructFree((zScalePermstruct_t *) *ScalePermstruct); } -void f_PStatInit(fptr *stat) -{ - PStatInit((SuperLUStat_t *) *stat); -} - -void f_PStatFree(fptr *stat) -{ - PStatFree((SuperLUStat_t *) *stat); -} - -void f_LUstructInit(int_t *m, int_t *n, fptr *LUstruct) +void f_zLUstructInit(int *m, int *n, fptr *LUstruct) { extern void zLUstructInit(const int_t, zLUstruct_t *); zLUstructInit(*m, (zLUstruct_t *) *LUstruct); } -void f_LUstructFree(fptr *LUstruct) +void f_zLUstructFree(fptr *LUstruct) { extern void zLUstructFree(zLUstruct_t *); zLUstructFree((zLUstruct_t *) *LUstruct); } -void f_Destroy_LU_SOLVE_struct(fptr *options, int_t *n, fptr *grid, +void f_zDestroy_LU_SOLVE_struct(fptr *options, int *n, fptr *grid, fptr *LUstruct, fptr *SOLVEstruct) { superlu_dist_options_t *opt = (superlu_dist_options_t *) *options; @@ -289,7 +75,7 @@ void f_Destroy_LU_SOLVE_struct(fptr *options, int_t *n, fptr *grid, } } -void f_Destroy_LU_SOLVE_struct_3d(fptr *options, int_t *n, fptr *grid, +void f_zDestroy_LU_SOLVE_struct_3d(fptr *options, int *n, fptr *grid, fptr *LUstruct, fptr *SOLVEstruct) { gridinfo3d_t *grid3d = (gridinfo3d_t *) *grid; @@ -309,27 +95,37 @@ void f_Destroy_LU_SOLVE_struct_3d(fptr *options, int_t *n, fptr *grid, zLUstructFree(LUstruct_ptr); } -void f_zCreate_CompRowLoc_Mat_dist(fptr *A, int_t *m, int_t *n, int_t *nnz_loc, - int_t *m_loc, int_t *fst_row, doublecomplex *nzval, +void f_zCreate_CompRowLoc_Mat_dist(fptr *A, int *m, int *n, int *nnz_loc, + int *m_loc, int *fst_row, doublecomplex *nzval, int_t *colind, int_t *rowptr, int *stype, int *dtype, int *mtype) { - zCreate_CompRowLoc_Matrix_dist((SuperMatrix *) *A, *m, *n, *nnz_loc, *m_loc, - *fst_row, (doublecomplex *) nzval, colind, rowptr, +#if 1 + doublecomplex *C_nzval = nzval; + int_t *C_colind = colind; + int_t *C_rowptr = rowptr; +#else + /* make a copy of matrix A that is internal to the C side */ + doublecomplex *C_nzval = doublecomplexMalloc_dist(*nnz_loc); + int_t *C_colind = intMalloc_dist(*nnz_loc); + int_t *C_rowptr = intMalloc_dist(*m_loc + 1); + int i; + + for (i = 0; i < *nnz_loc; ++i) { + C_nzval[i] = nzval[i]; + C_colind[i] = colind[i]; + } + for (i = 0; i <= *m_loc; ++i) { + C_rowptr[i] = rowptr[i]; + } +#endif + + zCreate_CompRowLoc_Matrix_dist((SuperMatrix *) *A, *m, *n, *nnz_loc, *m_loc, + *fst_row, C_nzval, C_colind, C_rowptr, 
(Stype_t) *stype, (Dtype_t) *dtype, (Mtype_t) *mtype); } -void f_Destroy_CompRowLoc_Mat_dist(fptr *A) -{ - Destroy_CompRowLoc_Matrix_dist((SuperMatrix *) *A); -} - -void f_Destroy_SuperMat_Store_dist(fptr *A) -{ - Destroy_SuperMatrix_Store_dist((SuperMatrix *) *A); -} - void f_zSolveFinalize(fptr *options, fptr *SOLVEstruct) { zSolveFinalize((superlu_dist_options_t *) *options, @@ -378,9 +174,10 @@ void f_zcreate_matrix_x_b(char *fname, fptr *A, int *m, int *n, int_t *nnz, MPI_Comm, SuperMatrix *A, int *m_g, int *n_g, int_t *nnz_g, doublecomplex *rhs, int *ldb, doublecomplex *x, int *ldx); + extern void f_get_gridinfo(fptr *grid, int *iam, int *nprow, int *npcol); int iam, nprocs; - int_t nprow, npcol; + int nprow, npcol; MPI_Comm slucomm = ((gridinfo_t *) *grid)->comm; f_get_gridinfo(grid, &iam, &nprow, &npcol); nprocs = nprow * npcol; @@ -397,9 +194,11 @@ void f_zcreate_matrix_x_b_3d(char *fname, fptr *A, int *m, int *n, int_t *nnz, MPI_Comm, SuperMatrix *A, int *m_g, int *n_g, int_t *nnz_g, doublecomplex *rhs, int *ldb, doublecomplex *x, int *ldx); + extern void f_get_gridinfo3d(fptr *grid, int *iam, + int *nprow, int *npcol, int *npdep); int iam, nprocs; - int_t nprow, npcol, npdep; + int nprow, npcol, npdep; MPI_Comm slucomm = ((gridinfo3d_t *) *grid)->comm; f_get_gridinfo3d(grid, &iam, &nprow, &npcol, &npdep); nprocs = nprow * npcol * npdep; @@ -408,11 +207,3 @@ void f_zcreate_matrix_x_b_3d(char *fname, fptr *A, int *m, int *n, int_t *nnz, (SuperMatrix *) *A, m, n, nnz, b, ldb, xtrue, ldx); } -/* Check malloc */ - -void f_check_malloc(int *iam) -{ -#if ( DEBUGlevel>=1 ) - CHECK_MALLOC((int_t) *iam, "Check Malloc"); -#endif -} diff --git a/FORTRAN/superlupara.f90 b/FORTRAN/superlupara.f90 index 122265ad..7ae58251 100644 --- a/FORTRAN/superlupara.f90 +++ b/FORTRAN/superlupara.f90 @@ -19,7 +19,7 @@ module superlupara_mod public superlu_ptr !---------------------------------------------------- -! kind of integer to hold a SuperLU pointer. Use default integer. +! kind of integer to hold a SuperLU pointer. Use 64-bit integer. ! This might need to be changed on systems with large memory. ! If changed, be sure to change it in superlu_c2f_wrap.c too. ! 
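Note: the d- and z-prefixed wrappers above all share one opaque-handle convention: the C side allocates the struct and returns its address to Fortran through a 64-bit integer, which superlupara.f90 declares as integer(superlu_ptr). The sketch below is a minimal, self-contained illustration of that pattern; my_handle_t and the f_*_my_handle names are hypothetical and not part of this patch.

/* Hypothetical illustration of the Fortran-to-C opaque-handle pattern. */
#include <stdlib.h>

typedef long long int fptr;   /* pointer-sized integer; its kind must match
                                 the superlu_ptr kind in superlupara.f90 */

typedef struct { int nrow, ncol; } my_handle_t;  /* illustrative payload */

/* Fortran side: integer(superlu_ptr) :: h ; call f_create_my_handle(h) */
void f_create_my_handle(fptr *handle)
{
    *handle = (fptr) malloc(sizeof(my_handle_t));
}

/* Fortran passes the same integer back; C recovers the pointer and frees it. */
void f_destroy_my_handle(fptr *handle)
{
    free((void *) *handle);
}

The same reasoning applies to the #if 1 branch in f_dCreate_CompRowLoc_Mat_dist / f_zCreate_CompRowLoc_Mat_dist above: the active branch aliases the Fortran arrays directly (zero copy), while the disabled #else branch deep-copies them, which would only be needed if the Fortran arrays might be deallocated before the SuperMatrix is destroyed.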
diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt index 07726d7c..3589770f 100644 --- a/SRC/CMakeLists.txt +++ b/SRC/CMakeLists.txt @@ -54,13 +54,15 @@ set(sources communication_aux.c treeFactorization.c sec_structs.c - cublas_utils.c ) +if (HAVE_CUDA) + list(APPEND sources cublas_utils.c) +endif() + if (MSVC) list(APPEND sources wingetopt.c) endif () - set_source_files_properties(superlu_timer.c PROPERTIES COMPILE_FLAGS -O0) if(enable_double) @@ -115,13 +117,15 @@ if(enable_double) dcommunication_aux.c dtrfCommWrapper.c dsuperlu_blas.c - dsuperlu_gpu.cu ) +if (HAVE_CUDA) + list(APPEND sources dsuperlu_gpu.cu) +endif() if (HAVE_COMBBLAS) list(APPEND sources d_c2cpp_GetHWPM.cpp dHWPM_CombBLAS.hpp) endif() -endif() ## enable double +endif() ########## enable double if(enable_complex16) @@ -176,13 +180,16 @@ if(enable_complex16) zcommunication_aux.c ztrfCommWrapper.c zsuperlu_blas.c - zsuperlu_gpu.cu ) +if (HAVE_CUDA) + list(APPEND sources zsuperlu_gpu.cu) +endif() if (HAVE_COMBBLAS) - list(APPEND sources z_c2cpp_GetHWPM.cpp zHWPM_CombBLAS.hpp) + list(APPEND sources z_c2cpp_GetHWPM.cpp zHWPM_CombBLAS.hpp) endif() -endif() ## enable compex16 +endif() ######### enable compex16 + add_library(superlu_dist ${sources} ${HEADERS}) set(targets superlu_dist) diff --git a/SRC/Cnames.h b/SRC/Cnames.h deleted file mode 100644 index a4d1672d..00000000 --- a/SRC/Cnames.h +++ /dev/null @@ -1,378 +0,0 @@ -/*! \file -Copyright (c) 2003, The Regents of the University of California, through -Lawrence Berkeley National Laboratory (subject to receipt of any required -approvals from U.S. Dept. of Energy) - -All rights reserved. - -The source code is distributed under BSD license, see the file License.txt -at the top-level directory. -*/ -/*! @file - * \brief Macro definitions - * - *
- * -- Distributed SuperLU routine (version 1.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * September 1, 1999
- *
- */ - -#ifndef __SUPERLU_CNAMES /* allow multiple inclusions */ -#define __SUPERLU_CNAMES - -/* - * These macros define how C routines will be called. ADD_ assumes that - * they will be called by fortran, which expects C routines to have an - * underscore postfixed to the name (Suns, and the Intel expect this). - * NOCHANGE indicates that fortran will be calling, and that it expects - * the name called by fortran to be identical to that compiled by the C - * (RS6K's do this). UPCASE says it expects C routines called by fortran - * to be in all upcase (CRAY wants this). - */ - -#define ADD_ 0 -#define NOCHANGE 1 -#define UPCASE 2 -#define C_CALL 3 - -#ifdef UpCase -#define F77_CALL_C UPCASE -#endif - -#ifdef NoChange -#define F77_CALL_C NOCHANGE -#endif - -#ifdef Add_ -#define F77_CALL_C ADD_ -#endif - -#ifndef F77_CALL_C -#define F77_CALL_C ADD_ -#endif - -#if (F77_CALL_C == ADD_) -/* - * These defines set up the naming scheme required to have a fortran 77 - * routine call a C routine - * No redefinition necessary to have following Fortran to C interface: - * FORTRAN CALL C DECLARATION - * call dgemm(...) void dgemm_(...) - * - * This is the default. - */ -/* These are the functions defined in F90 wraper */ -#define f_create_gridinfo_handle f_create_gridinfo_handle_ -#define f_create_options_handle f_create_options_handle_ -#define f_create_ScalePerm_handle f_create_scaleperm_handle_ -#define f_create_LUstruct_handle f_create_lustruct_handle_ -#define f_create_SOLVEstruct_handle f_create_solvestruct_handle_ -#define f_create_SuperMatrix_handle f_create_supermatrix_handle_ -#define f_destroy_gridinfo_handle f_destroy_gridinfo_handle_ -#define f_destroy_options_handle f_destroy_options_handle_ -#define f_destroy_ScalePerm_handle f_destroy_scaleperm_handle_ -#define f_destroy_LUstruct_handle f_destroy_lustruct_handle_ -#define f_destroy_SOLVEstruct_handle f_destroy_solvestruct_handle_ -#define f_destroy_SuperMatrix_handle f_destroy_supermatrix_handle_ -#define f_create_SuperLUStat_handle f_create_superlustat_handle_ -#define f_destroy_SuperLUStat_handle f_destroy_superlustat_handle_ -#define f_get_gridinfo f_get_gridinfo_ -#define f_get_SuperMatrix f_get_supermatrix_ -#define f_set_SuperMatrix f_set_supermatrix_ -#define f_get_CompRowLoc_Matrix f_get_comprowloc_matrix_ -#define f_set_CompRowLoc_Matrix f_set_comprowloc_matrix_ -#define f_get_superlu_options f_get_superlu_options_ -#define f_set_superlu_options f_set_superlu_options_ -#define f_set_default_options f_set_default_options_ -#define f_superlu_gridinit f_superlu_gridinit_ -#define f_superlu_gridmap f_superlu_gridmap_ -#define f_superlu_gridexit f_superlu_gridexit_ -#define f_ScalePermstructInit f_scalepermstructinit_ -#define f_ScalePermstructFree f_scalepermstructfree_ -#define f_PStatInit f_pstatinit_ -#define f_PStatFree f_pstatfree_ -#define f_LUstructInit f_lustructinit_ -#define f_LUstructFree f_lustructfree_ -#define f_Destroy_LU f_destroy_lu_ -#define f_dCreate_CompRowLoc_Mat_dist f_dcreate_comprowloc_mat_dist_ -#define f_zCreate_CompRowLoc_Mat_dist f_zcreate_comprowloc_mat_dist_ -#define f_Destroy_CompRowLoc_Mat_dist f_destroy_comprowloc_mat_dist_ -#define f_Destroy_SuperMat_Store_dist f_destroy_supermat_store_dist_ -#define f_dSolveFinalize f_dsolvefinalize_ -#define f_zSolveFinalize f_zsolvefinalize_ -#define f_pdgssvx f_pdgssvx_ -#define f_pzgssvx f_pzgssvx_ -#define f_dcreate_dist_matrix f_dcreate_dist_matrix_ -#define f_zcreate_dist_matrix f_zcreate_dist_matrix_ -#define f_check_malloc f_check_malloc_ -#endif - -#if 
(F77_CALL_C == UPCASE) -/* - * These defines set up the naming scheme required to have a fortran 77 - * routine call a C routine - * following Fortran to C interface: - * FORTRAN CALL C DECLARATION - * call dgemm(...) void DGEMM(...) - */ -/* BLAS */ -#define sasum_ SASUM -#define isamax_ ISAMAX -#define scopy_ SCOPY -#define sscal_ SSCAL -#define sger_ SGER -#define snrm2_ SNRM2 -#define ssymv_ SSYMV -#define sdot_ SDOT -#define saxpy_ SAXPY -#define ssyr2_ SSYR2 -#define srot_ SROT -#define sgemv_ SGEMV -#define strsv_ STRSV -#define sgemm_ SGEMM -#define strsm_ STRSM - -#define dasum_ DASUM -#define idamax_ IDAMAX -#define dcopy_ DCOPY -#define dscal_ DSCAL -#define dger_ DGER -#define dnrm2_ DNRM2 -#define dsymv_ DSYMV -#define ddot_ DDOT -#define daxpy_ DAXPY -#define dsyr2_ DSYR2 -#define drot_ DROT -#define dgemv_ DGEMV -#define dtrsv_ DTRSV -#define dgemm_ DGEMM -#define dtrsm_ DTRSM - -#define scasum_ SCASUM -#define icamax_ ICAMAX -#define ccopy_ CCOPY -#define cscal_ CSCAL -#define scnrm2_ SCNRM2 -#define caxpy_ CAXPY -#define cgemv_ CGEMV -#define ctrsv_ CTRSV -#define cgemm_ CGEMM -#define ctrsm_ CTRSM -#define cgerc_ CGERC -#define chemv_ CHEMV -#define cher2_ CHER2 - -#define dzasum_ DZASUM -#define izamax_ IZAMAX -#define zcopy_ ZCOPY -#define zscal_ ZSCAL -#define dznrm2_ DZNRM2 -#define zaxpy_ ZAXPY -#define zgemv_ ZGEMV -#define ztrsv_ ZTRSV -#define zgemm_ ZGEMM -#define ztrsm_ ZTRSM - -#define zgerc_ ZGERC -#define zhemv_ ZHEMV -#define zher2_ ZHER2 -#define zgeru_ ZGERU - -/* LAPACK */ -#define strtri_ STRTRI -#define dtrtri_ DTRTRI -#define ctrtri_ CTRTRI -#define ztrtri_ ZTRTRI - -/* -#define mc64id_dist MC64ID_DIST -#define mc64ad_dist MC64AD_DIST -*/ -#define c_bridge_dgssv_ C_BRIDGE_DGSSV -#define c_fortran_slugrid_ C_FORTRAN_SLUGRID -#define c_fortran_pdgssvx_ C_FORTRAN_PDGSSVX -#define c_fortran_pdgssvx_ABglobal_ C_FORTRAN_PDGSSVX_ABGLOBAL -#define c_fortran_pzgssvx_ C_FORTRAN_PZGSSVX -#define c_fortran_pzgssvx_ABglobal_ C_FORTRAN_PZGSSVX_ABGLOBAL - -/* These are the functions defined in F90 wraper */ -#define f_create_gridinfo_handle F_CREATE_GRIDINFO_HANDLE -#define f_create_options_handle F_CREATE_OPTIONS_HANDLE -#define f_create_ScalePerm_handle F_CREATE_SCALEPERM_HANDLE -#define f_create_LUstruct_handle F_CREATE_LUSTRUCT_HANDLE -#define f_create_SOLVEstruct_handle F_CREATE_SOLVESTRUCT_HANDLE -#define f_create_SuperMatrix_handle F_CREATE_SUPERMATRIX_HANDLE -#define f_destroy_gridinfo_handle F_DESTROY_GRIDINFO_HANDLE -#define f_destroy_options_handle F_DESTROY_OPTIONS_HANDLE -#define f_destroy_ScalePerm_handle F_DESTROY_SCALEPERM_HANDLE -#define f_destroy_LUstruct_handle F_DESTROY_LUSTRUCT_HANDLE -#define f_destroy_SOLVEstruct_handle F_DESTROY_SOLVESTRUCT_HANDLE -#define f_destroy_SuperMatrix_handle F_DESTROY_SUPERMATRIX_HANDLE -#define f_create_SuperLUStat_handle F_CREATE_SUPERLUSTAT_HANDLE -#define f_destroy_SuperLUStat_handle F_DESTROY_SUPERLUSTAT_HANDLE -#define f_get_gridinfo F_GET_GRIDINFO -#define f_get_SuperMatrix F_GET_SUPERMATRIX -#define f_set_SuperMatrix F_SET_SUPERMATRIX -#define f_get_CompRowLoc_Matrix F_GET_COMPROWLOC_MATRIX -#define f_set_CompRowLoc_Matrix F_SET_COMPROWLOC_MATRIX -#define f_get_superlu_options F_GET_SUPERLU_OPTIONS -#define f_set_superlu_options F_SET_SUPERLU_OPTIONS -#define f_set_default_options F_SET_DEFAULT_OPTIONS -#define f_superlu_gridinit F_SUPERLU_GRIDINIT -#define f_superlu_gridmap F_SUPERLU_GRIDMAP -#define f_superlu_gridexit F_SUPERLU_GRIDEXIT -#define f_ScalePermstructInit F_SCALEPERMSTRUCTINIT -#define 
f_ScalePermstructFree F_SCALEPERMSTRUCTFREE -#define f_PStatInit F_PSTATINIT -#define f_PStatFree F_PSTATFREE -#define f_LUstructInit F_LUSTRUCTINIT -#define f_LUstructFree F_LUSTRUCTFREE -#define f_Destroy_LU F_DESTROY_LU -#define f_dCreate_CompRowLoc_Mat_dist F_DCREATE_COMPROWLOC_MAT_DIST -#define f_zCreate_CompRowLoc_Mat_dist F_ZCREATE_COMPROWLOC_MAT_DIST -#define f_Destroy_CompRowLoc_Mat_dist F_DESTROY_COMPROWLOC_MAT_DIST -#define f_Destroy_SuperMat_Store_dist F_DESTROY_SUPERMAT_STORE_DIST -#define f_dSolveFinalize F_DSOLVEFINALIZE -#define f_zSolveFinalize F_ZSOLVEFINALIZE -#define f_pdgssvx F_PDGSSVX -#define f_pzgssvx F_PZGSSVX -#define f_dcreate_dist_matrix F_DCREATE_DIST_MATRIX -#define f_zcreate_dist_matrix F_ZCREATE_DIST_MATRIX -#define f_check_malloc F_CHECK_MALLOC -#endif - -#if (F77_CALL_C == NOCHANGE) -/* - * These defines set up the naming scheme required to have a fortran 77 - * routine call a C routine - * for following Fortran to C interface: - * FORTRAN CALL C DECLARATION - * call dgemm(...) void dgemm(...) - */ -/* BLAS */ -#define sasum_ sasum -#define isamax_ isamax -#define scopy_ scopy -#define sscal_ sscal -#define sger_ sger -#define snrm2_ snrm2 -#define ssymv_ ssymv -#define sdot_ sdot -#define saxpy_ saxpy -#define ssyr2_ ssyr2 -#define srot_ srot -#define sgemv_ sgemv -#define strsv_ strsv -#define sgemm_ sgemm -#define strsm_ strsm - -#define dasum_ dasum -#define idamax_ idamax -#define dcopy_ dcopy -#define dscal_ dscal -#define dger_ dger -#define dnrm2_ dnrm2 -#define dsymv_ dsymv -#define ddot_ ddot -#define daxpy_ daxpy -#define dsyr2_ dsyr2 -#define drot_ drot -#define dgemv_ dgemv -#define dtrsv_ dtrsv -#define dgemm_ dgemm -#define dtrsm_ dtrsm - -#define scasum_ scasum -#define icamax_ icamax -#define ccopy_ ccopy -#define cscal_ cscal -#define scnrm2_ scnrm2 -#define caxpy_ caxpy -#define cgemv_ cgemv -#define ctrsv_ ctrsv -#define cgemm_ cgemm -#define ctrsm_ ctrsm -#define cgerc_ cgerc -#define chemv_ chemv -#define cher2_ cher2 - -#define dzasum_ dzasum -#define izamax_ izamax -#define zcopy_ zcopy -#define zscal_ zscal -#define dznrm2_ dznrm2 -#define zaxpy_ zaxpy -#define zgemv_ zgemv -#define ztrsv_ ztrsv -#define zgemm_ zgemm -#define ztrsm_ ztrsm -#define zgerc_ zgerc -#define zhemv_ zhemv -#define zher2_ zher2 -#define zgeru_ zgeru - -/* LAPACK */ -#define strtri_ strtri -#define dtrtri_ dtrtri -#define ctrtri_ ctrtri -#define ztrtri_ ztrtri - -/* -#define mc64id_dist mc64id_dist -#define mc64ad_dist mc64ad_dist -*/ - -#define c_bridge_dgssv_ c_bridge_dgssv -#define c_fortran_slugrid_ c_fortran_slugrid -#define c_fortran_pdgssvx_ c_fortran_pdgssvx -#define c_fortran_pdgssvx_ABglobal_ c_fortran_pdgssvx_abglobal -#define c_fortran_pzgssvx_ c_fortran_pzgssvx -#define c_fortran_pzgssvx_ABglobal_ c_fortran_pzgssvx_abglobal - -/* These are the functions defined in F90 wraper */ -#define f_create_gridinfo_handle f_create_gridinfo_handle -#define f_create_options_handle f_create_options_handle -#define f_create_ScalePerm_handle f_create_scaleperm_handle -#define f_create_LUstruct_handle f_create_lustruct_handle -#define f_create_SOLVEstruct_handle f_create_solvestruct_handle -#define f_create_SuperMatrix_handle f_create_supermatrix_handle -#define f_destroy_gridinfo_handle f_destroy_gridinfo_handle -#define f_destroy_options_handle f_destroy_options_handle -#define f_destroy_ScalePerm_handle f_destroy_scaleperm_handle -#define f_destroy_LUstruct_handle f_destroy_lustruct_handle -#define f_destroy_SOLVEstruct_handle f_destroy_solvestruct_handle 
-#define f_destroy_SuperMatrix_handle f_destroy_supermatrix_handle -#define f_create_SuperLUStat_handle f_create_superlustat_handle -#define f_destroy_SuperLUStat_handle f_destroy_superlustat_handle -#define f_get_gridinfo f_get_gridinfo -#define f_get_SuperMatrix f_get_supermatrix -#define f_set_SuperMatrix f_set_supermatrix -#define f_get_CompRowLoc_Matrix f_get_comprowloc_matrix -#define f_set_CompRowLoc_Matrix f_set_comprowloc_matrix -#define f_get_superlu_options f_get_superlu_options -#define f_set_superlu_options f_set_superlu_options -#define f_set_default_options f_set_default_options -#define f_superlu_gridinit f_superlu_gridinit -#define f_superlu_gridmap f_superlu_gridmap -#define f_superlu_gridexit f_superlu_gridexit -#define f_ScalePermstructInit f_scalepermstructinit -#define f_ScalePermstructFree f_scalepermstructfree -#define f_PStatInit f_pstatinit -#define f_PStatFree f_pstatfree -#define f_LUstructInit f_lustructinit -#define f_LUstructFree f_lustructfree -#define f_Destroy_LU f_destroy_lu -#define f_dCreate_CompRowLoc_Mat_dist f_dcreate_comprowloc_mat_dist -#define f_Destroy_CompRowLoc_Mat_dist f_destroy_comprowloc_mat_dist -#define f_Destroy_SuperMat_Store_dist f_destroy_supermat_store_dist -#define f_dSolveFinalize f_dsolvefinalize -#define f_zSolveFinalize f_zsolvefinalize -#define f_pdgssvx f_pdgssvx -#define f_pzgssvx f_pzgssvx -#define f_dcreate_dist_matrix f_dcreate_dist_matrix -#define f_zcreate_dist_matrix f_zcreate_dist_matrix -#define f_check_malloc f_check_malloc -#endif - -#endif /* __SUPERLU_CNAMES */ diff --git a/SRC/Makefile b/SRC/Makefile index c5709f55..63fb8c3f 100644 --- a/SRC/Makefile +++ b/SRC/Makefile @@ -45,10 +45,6 @@ ALLAUX = sp_ienv.o etree.o sp_colorder.o get_perm_c.o \ ALLAUX += superlu_grid3d.o supernodal_etree.o supernodalForest.o \ trfAux.o communication_aux.o treeFactorization.o sec_structs.o -ifeq ($(HAVE_CUDA),TRUE) -ALLAUX += cublas_utils.o -endif - # # Routines literally taken from SuperLU, but renamed with suffix _dist # @@ -88,6 +84,12 @@ ZPLUSRC += pzgssvx3d.o pzgstrf3d.o ztreeFactorization.o zscatter3d.o \ zgather.o pz3dcomm.o ztrfAux.o zcommunication_aux.o ztrfCommWrapper.o \ znrformat_loc3d.o ztreeFactorizationGPU.o ##$(FACT3D) +ifeq ($(HAVE_CUDA),TRUE) +ALLAUX += cublas_utils.o +DPLUSRC += dsuperlu_gpu.o +ZPLUSRC += zsuperlu_gpu.o +endif + ifeq ($(HAVE_COMBBLAS),TRUE) DPLUSRC += d_c2cpp_GetHWPM.o ZPLUSRC += z_c2cpp_GetHWPM.o @@ -142,6 +144,9 @@ pzgstrf.o: zscatter.c zlook_ahead_update.c zSchCompUdt-2Ddynamic.c pzgstrf.c .c.o: $(CC) $(CFLAGS) $(CDEFS) $(BLASDEF) -I$(INCLUDEDIR) -c $< $(VERBOSE) +.cu.o: + $(NVCC) $(NVCCFLAGS) $(CDEFS) $(BLASDEF) -I$(INCLUDEDIR) -c $< $(VERBOSE) + .cpp.o: $(CXX) $(CXXFLAGS) $(CPPFLAGS) $(CDEFS) $(BLASDEF) -I$(INCLUDEDIR) -c $< $(VERBOSE) diff --git a/SRC/dlustruct_gpu.h b/SRC/dlustruct_gpu.h index bc85565a..e0513d07 100644 --- a/SRC/dlustruct_gpu.h +++ b/SRC/dlustruct_gpu.h @@ -13,17 +13,14 @@ #pragma once // so that this header file is included onle once -// #ifdef DEBUG -// #include -// #endif -// #include -// #include "mkl.h" +#include "superlu_ddefs.h" + +#ifdef GPU_ACC // enable GPU -// #define USE_VENDOR_BLAS +// #include "mkl.h" #include #include -#include "superlu_ddefs.h" // #include "sec_structs.h" // #include "supernodal_etree.h" @@ -175,8 +172,6 @@ extern int dsparseTreeFactor_ASYNC_GPU( double thresh, SCT_t *SCT, int tag_ub, int *info); -extern double estimate_cpu_time(int m, int n , int k); - int dinitD2Hreduce( int next_k, d2Hreduce_t* d2Hred, @@ -242,4 +237,4 @@ void 
dprintGPUStats(dLUstruct_gpu_t *A_gpu); } #endif -//#undef DEBUG +#endif // matching: enable GPU diff --git a/SRC/dnrformat_loc3d.c b/SRC/dnrformat_loc3d.c index f2bec71d..08f8d04e 100644 --- a/SRC/dnrformat_loc3d.c +++ b/SRC/dnrformat_loc3d.c @@ -120,27 +120,30 @@ NRformat_loc3d *dGatherNRformat_loc3d(NRformat_loc *A, // input, on 3D grid } A2d->nnz_loc = nnz_disp[grid3d->npdep]; A2d->m_loc = row_disp[grid3d->npdep]; -#if 0 - A2d->fst_row = A->fst_row; // This is a bug -#else - gridinfo_t *grid2d = &(grid3d->grid2d); - int procs2d = grid2d->nprow * grid2d->npcol; - int m_loc_2d = A2d->m_loc; - int *m_loc_2d_counts = SUPERLU_MALLOC(procs2d * sizeof(int)); - - MPI_Allgather(&m_loc_2d, 1, MPI_INT, m_loc_2d_counts, 1, MPI_INT, grid2d->comm); - - int fst_row = 0; - for (int p = 0; p < procs2d; ++p) - { - if (grid2d->iam == p) - A2d->fst_row = fst_row; - fst_row += m_loc_2d_counts[p]; - } - SUPERLU_FREE(m_loc_2d_counts); -#endif + if (grid3d->rankorder == 1) { // XY-major + A2d->fst_row = A->fst_row; + } else { // Z-major + gridinfo_t *grid2d = &(grid3d->grid2d); + int procs2d = grid2d->nprow * grid2d->npcol; + int m_loc_2d = A2d->m_loc; + int *m_loc_2d_counts = SUPERLU_MALLOC(procs2d * sizeof(int)); + + MPI_Allgather(&m_loc_2d, 1, MPI_INT, m_loc_2d_counts, 1, + MPI_INT, grid2d->comm); + + int fst_row = 0; + for (int p = 0; p < procs2d; ++p) + { + if (grid2d->iam == p) + A2d->fst_row = fst_row; + fst_row += m_loc_2d_counts[p]; + } + + SUPERLU_FREE(m_loc_2d_counts); + } } + // Btmp <- compact(B) // compacting B double *Btmp; @@ -236,9 +239,9 @@ int dScatter_B3d(NRformat_loc3d *A3d, // modified // Btmp <- scatterv(B1), block-by-block if ( rankorder == 1 ) { /* XY-major in 3D grid */ /* e.g. 1x3x4 grid: layer0 layer1 layer2 layer3 - * 0 1 2 4 - * 5 6 7 8 - * 9 10 11 12 + * 0 1 2 3 + * 4 5 6 7 + * 8 9 10 11 */ MPI_Scatterv(B1, b_counts_int, b_disp, MPI_DOUBLE, Btmp, nrhs * A3d->m_loc, MPI_DOUBLE, diff --git a/SRC/dtreeFactorizationGPU.c b/SRC/dtreeFactorizationGPU.c index e1983df5..21a03f97 100644 --- a/SRC/dtreeFactorizationGPU.c +++ b/SRC/dtreeFactorizationGPU.c @@ -19,6 +19,8 @@ //#include "cblas.h" #endif +#ifdef GPU_ACC ///////////////// enable GPU + /* /-- num_u_blks--\ /-- num_u_blks_Phi --\ ---------------------------------------- @@ -732,3 +734,5 @@ int dsparseTreeFactor_ASYNC_GPU( return 0; } /* end dsparseTreeFactor_ASYNC_GPU */ + +#endif // matching: enable GPU diff --git a/SRC/dutil_dist.c b/SRC/dutil_dist.c index a8653599..c9ff728f 100644 --- a/SRC/dutil_dist.c +++ b/SRC/dutil_dist.c @@ -710,7 +710,7 @@ void dDumpLblocks(int iam, int_t nsupers, gridinfo_t *grid, } if(grid->iam==0){ - fprintf(fp, "%d %d" IFMT "\n", n,n,nnzL); + fprintf(fp, "%d %d " IFMT "\n", n,n,nnzL); } ncb = nsupers / grid->npcol; diff --git a/SRC/memory.c b/SRC/memory.c index 4159206a..c45b48f6 100644 --- a/SRC/memory.c +++ b/SRC/memory.c @@ -67,12 +67,12 @@ void *superlu_malloc_dist(size_t size) MPI_Comm_rank(MPI_COMM_WORLD, &iam); if ( size < 0 ) { - printf("(%d) superlu_malloc size %lld\n", iam, size); + printf("(%d) superlu_malloc size %lu\n", iam, size); ABORT("superlu_malloc: nonpositive size"); } buf = (char *) malloc(size + DWORD); if ( !buf ) { - printf("(%d) superlu_malloc fails: malloc_total %.0f MB, size %lld\n", + printf("(%d) superlu_malloc fails: malloc_total %.0f MB, size %lu\n", iam, superlu_malloc_total*1e-6, size); ABORT("superlu_malloc: out of memory"); } @@ -401,8 +401,7 @@ int_t symbfact_SubXpand #if ( DEBUGlevel>=1 ) printf("symbfact_SubXpand(): jcol " IFMT ", next " IFMT ", maxlen " IFMT - 
", MemType " IFMT "\n", - jcol, next, *maxlen, mem_type); + ", MemType %d\n", jcol, next, *maxlen, mem_type); #endif new_mem = expand(maxlen, mem_type, next, 0, Glu_freeable); diff --git a/SRC/pdgsequ.c b/SRC/pdgsequ.c index 685f294a..95adf5b6 100644 --- a/SRC/pdgsequ.c +++ b/SRC/pdgsequ.c @@ -253,7 +253,6 @@ pdgsequ(SuperMatrix *A, double *r, double *c, double *rowcnd, SUPERLU_FREE(r_sizes); SUPERLU_FREE(loc_r); - return; } /* pdgsequ */ diff --git a/SRC/pdgstrf2.c b/SRC/pdgstrf2.c index b1c311b8..70836770 100644 --- a/SRC/pdgstrf2.c +++ b/SRC/pdgstrf2.c @@ -9,6 +9,7 @@ The source code is distributed under BSD license, see the file License.txt at the top-level directory. */ + /*! @file * \brief Performs panel LU factorization. * @@ -147,13 +148,13 @@ pdgstrf2_trsm int cols_left, iam, l, pkk, pr; int incx = 1, incy = 1; - int nsupr; /* number of rows in the block (LDA) */ - int nsupc; /* number of columns in the block */ + int nsupr; /* number of rows in the block (LDA) */ + int nsupc; /* number of columns in the block */ int luptr; int_t i, myrow, krow, j, jfst, jlst, u_diag_cnt; int_t *xsup = Glu_persist->xsup; double *lusup, temp; - double *ujrow, *ublk_ptr; /* pointer to the U block */ + double *ujrow, *ublk_ptr; /* pointer to the U block */ double alpha = -1, zero = 0.0; int_t Pr; MPI_Status status; @@ -163,63 +164,57 @@ pdgstrf2_trsm /* Initialization. */ iam = grid->iam; Pr = grid->nprow; - myrow = MYROW(iam, grid); - krow = PROW(k, grid); - pkk = PNUM(PROW(k, grid), PCOL(k, grid), grid); - j = LBj(k, grid); /* Local block number */ - jfst = FstBlockC(k); - jlst = FstBlockC(k + 1); + myrow = MYROW (iam, grid); + krow = PROW (k, grid); + pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid); + j = LBj (k, grid); /* Local block number */ + jfst = FstBlockC (k); + jlst = FstBlockC (k + 1); lusup = Llu->Lnzval_bc_ptr[j]; - nsupc = SuperSize(k); + nsupc = SuperSize (k); if (Llu->Lrowind_bc_ptr[j]) nsupr = Llu->Lrowind_bc_ptr[j][1]; else nsupr = 0; #ifdef PI_DEBUG - printf("rank %d Iter %d k=%d \t dtrsm nsuper %d \n", - iam, k0, k, nsupr); + printf ("rank %d Iter %d k=%d \t dtrsm nsuper %d \n", + iam, k0, k, nsupr); #endif ublk_ptr = ujrow = Llu->ujrow; - luptr = 0; /* Point to the diagonal entries. */ - cols_left = nsupc; /* supernode size */ - int ld_ujrow = nsupc; /* leading dimension of ujrow */ + luptr = 0; /* Point to the diagonal entries. */ + cols_left = nsupc; /* supernode size */ + int ld_ujrow = nsupc; /* leading dimension of ujrow */ u_diag_cnt = 0; incy = ld_ujrow; - if (U_diag_blk_send_req && - U_diag_blk_send_req[myrow] != MPI_REQUEST_NULL) - { + if ( U_diag_blk_send_req && + U_diag_blk_send_req[myrow] != MPI_REQUEST_NULL ) { /* There are pending sends - wait for all Isend to complete */ -#if (PROFlevel >= 1) - TIC(t1); +#if ( PROFlevel>=1 ) + TIC (t1); #endif - for (pr = 0; pr < Pr; ++pr) - { - if (pr != myrow) - { - MPI_Wait(U_diag_blk_send_req + pr, &status); + for (pr = 0; pr < Pr; ++pr) { + if (pr != myrow) { + MPI_Wait (U_diag_blk_send_req + pr, &status); } - } -#if (PROFlevel >= 1) - TOC(t2, t1); - stat->utime[COMM] += t2; - stat->utime[COMM_DIAG] += t2; + } +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_DIAG] += t2; #endif - /* flag no more outstanding send request. */ - U_diag_blk_send_req[myrow] = MPI_REQUEST_NULL; + /* flag no more outstanding send request. 
*/ + U_diag_blk_send_req[myrow] = MPI_REQUEST_NULL; } - if (iam == pkk) - { /* diagonal process */ - /* ++++ First step compute diagonal block ++++++++++ */ - for (j = 0; j < jlst - jfst; ++j) - { /* for each column in panel */ + if (iam == pkk) { /* diagonal process */ + /* ++++ First step compute diagonal block ++++++++++ */ + for (j = 0; j < jlst - jfst; ++j) { /* for each column in panel */ /* Diagonal pivot */ i = luptr; /* May replace zero pivot. */ - // if (options->ReplaceTinyPivot == YES && lusup[i] != 0.0 ) { - if (options->ReplaceTinyPivot == YES) { + if (options->ReplaceTinyPivot == YES ) { if (fabs (lusup[i]) < thresh) { /* Diagonal */ #if ( PRNTlevel>=2 ) @@ -227,12 +222,10 @@ pdgstrf2_trsm iam, jfst + j, lusup[i]); #endif /* Keep the new diagonal entry with the same sign. */ - if (lusup[i] < 0) - lusup[i] = -thresh; - else - lusup[i] = thresh; -#if (PRNTlevel >= 2) - printf("replaced by %e\n", lusup[i]); + if (lusup[i] < 0) lusup[i] = -thresh; + else lusup[i] = thresh; +#if ( PRNTlevel>=2 ) + printf ("replaced by %e\n", lusup[i]); #endif ++(stat->TinyPivots); } @@ -245,70 +238,63 @@ pdgstrf2_trsm /* storing U in full form */ int st; - for (l = 0; l < cols_left; ++l, i += nsupr, ++u_diag_cnt) - { + for (l = 0; l < cols_left; ++l, i += nsupr, ++u_diag_cnt) { st = j * ld_ujrow + j; ublk_ptr[st + l * ld_ujrow] = lusup[i]; /* copy one row of U */ } - if (ujrow[0] == zero) - { /* Test for singularity. */ + if ( ujrow[0] == zero ) { /* Test for singularity. */ *info = j + jfst + 1; - } - else - { /* Scale the j-th column within diag. block. */ + } else { /* Scale the j-th column within diag. block. */ temp = 1.0 / ujrow[0]; for (i = luptr + 1; i < luptr - j + nsupc; ++i) - lusup[i] *= temp; + lusup[i] *= temp; stat->ops[FACT] += nsupc - j - 1; } /* Rank-1 update of the trailing submatrix within diag. block. */ - if (--cols_left) - { + if (--cols_left) { /* l = nsupr - j - 1; */ - l = nsupc - j - 1; /* Piyush */ - dger_(&l, &cols_left, &alpha, &lusup[luptr + 1], &incx, - &ujrow[ld_ujrow], &incy, &lusup[luptr + nsupr + 1], - &nsupr); + l = nsupc - j - 1; /* Piyush */ + dger_ (&l, &cols_left, &alpha, &lusup[luptr + 1], &incx, + &ujrow[ld_ujrow], &incy, &lusup[luptr + nsupr + 1], + &nsupr); stat->ops[FACT] += 2 * l * cols_left; } /* ujrow = ublk_ptr + u_diag_cnt; */ ujrow = ujrow + ld_ujrow + 1; /* move to next row of U */ - luptr += nsupr + 1; /* move to next column */ + luptr += nsupr + 1; /* move to next column */ - } /* for column j ... first loop */ + } /* for column j ... 
first loop */ - /* ++++ Second step compute off-diagonal block with communication ++*/ + /* ++++ Second step compute off-diagonal block with communication ++*/ ublk_ptr = ujrow = Llu->ujrow; - if (U_diag_blk_send_req && iam == pkk) - { /* Send the U block downward */ + if (U_diag_blk_send_req && iam == pkk) { /* Send the U block downward */ /** ALWAYS SEND TO ALL OTHERS - TO FIX **/ -#if (PROFlevel >= 1) - TIC(t1); +#if ( PROFlevel>=1 ) + TIC (t1); #endif - for (pr = 0; pr < Pr; ++pr) - { - if (pr != krow) - { + for (pr = 0; pr < Pr; ++pr) { + if (pr != krow) { /* tag = ((k0<<2)+2) % tag_ub; */ /* tag = (4*(nsupers+k0)+2) % tag_ub; */ - MPI_Isend(ublk_ptr, nsupc * nsupc, MPI_DOUBLE, pr, - SLU_MPI_TAG(4, k0) /* tag */, - comm, U_diag_blk_send_req + pr); + MPI_Isend (ublk_ptr, nsupc * nsupc, MPI_DOUBLE, pr, + SLU_MPI_TAG (4, k0) /* tag */ , + comm, U_diag_blk_send_req + pr); + } } -#if (PROFlevel >= 1) - TOC(t2, t1); - stat->utime[COMM] += t2; - stat->utime[COMM_DIAG] += t2; +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_DIAG] += t2; #endif - /* flag outstanding Isend */ - U_diag_blk_send_req[krow] = (MPI_Request)TRUE; /* Sherry */ + /* flag outstanding Isend */ + U_diag_blk_send_req[krow] = (MPI_Request) TRUE; /* Sherry */ } /* pragma below would be changed by an MKL call */ @@ -317,22 +303,20 @@ pdgstrf2_trsm // n = nsupc; double alpha = 1.0; #ifdef PI_DEBUG - printf("calling dtrsm\n"); - printf("dtrsm diagonal param 11: %d \n", nsupr); + printf ("calling dtrsm\n"); + printf ("dtrsm diagonal param 11: %d \n", nsupr); #endif -#if defined(USE_VENDOR_BLAS) - dtrsm_("R", "U", "N", "N", &l, &nsupc, - &alpha, ublk_ptr, &ld_ujrow, &lusup[nsupc], &nsupr, - 1, 1, 1, 1); +#if defined (USE_VENDOR_BLAS) + dtrsm_ ("R", "U", "N", "N", &l, &nsupc, + &alpha, ublk_ptr, &ld_ujrow, &lusup[nsupc], &nsupr, + 1, 1, 1, 1); #else - dtrsm_("R", "U", "N", "N", &l, &nsupc, - &alpha, ublk_ptr, &ld_ujrow, &lusup[nsupc], &nsupr); + dtrsm_ ("R", "U", "N", "N", &l, &nsupc, + &alpha, ublk_ptr, &ld_ujrow, &lusup[nsupc], &nsupr); #endif - stat->ops[FACT] += (flops_t)nsupc * (nsupc + 1) * l; - } - else - { /* non-diagonal process */ + stat->ops[FACT] += (flops_t) nsupc * (nsupc+1) * l; + } else { /* non-diagonal process */ /* ================================================================== * * Receive the diagonal block of U for panel factorization of L(:,k). 
* * Note: we block for panel factorization of L(:,k), but panel * @@ -342,72 +326,73 @@ pdgstrf2_trsm /* tag = ((k0<<2)+2) % tag_ub; */ /* tag = (4*(nsupers+k0)+2) % tag_ub; */ // printf("hello message receiving%d %d\n",(nsupc*(nsupc+1))>>1,SLU_MPI_TAG(4,k0)); -#if (PROFlevel >= 1) - TIC(t1); +#if ( PROFlevel>=1 ) + TIC (t1); #endif - MPI_Recv(ublk_ptr, (nsupc * nsupc), MPI_DOUBLE, krow, - SLU_MPI_TAG(4, k0) /* tag */, - comm, &status); -#if (PROFlevel >= 1) - TOC(t2, t1); - stat->utime[COMM] += t2; - stat->utime[COMM_DIAG] += t2; + MPI_Recv (ublk_ptr, (nsupc * nsupc), MPI_DOUBLE, krow, + SLU_MPI_TAG (4, k0) /* tag */ , + comm, &status); +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_DIAG] += t2; #endif - if (nsupr > 0) - { + if (nsupr > 0) { double alpha = 1.0; #ifdef PI_DEBUG - printf("dtrsm non diagonal param 11: %d \n", nsupr); + printf ("dtrsm non diagonal param 11: %d \n", nsupr); if (!lusup) - printf(" Rank :%d \t Empty block column occurred :\n", iam); + printf (" Rank :%d \t Empty block column occurred :\n", iam); #endif -#if defined(USE_VENDOR_BLAS) - dtrsm_("R", "U", "N", "N", &nsupr, &nsupc, - &alpha, ublk_ptr, &ld_ujrow, lusup, &nsupr, 1, 1, 1, 1); +#if defined (USE_VENDOR_BLAS) + dtrsm_ ("R", "U", "N", "N", &nsupr, &nsupc, + &alpha, ublk_ptr, &ld_ujrow, lusup, &nsupr, 1, 1, 1, 1); #else - dtrsm_("R", "U", "N", "N", &nsupr, &nsupc, - &alpha, ublk_ptr, &ld_ujrow, lusup, &nsupr); + dtrsm_ ("R", "U", "N", "N", &nsupr, &nsupc, + &alpha, ublk_ptr, &ld_ujrow, lusup, &nsupr); #endif - stat->ops[FACT] += (flops_t)nsupc * (nsupc + 1) * nsupr; + stat->ops[FACT] += (flops_t) nsupc * (nsupc+1) * nsupr; } } /* end if pkk ... */ /* printf("exiting pdgstrf2 %d \n", grid->iam); */ -} /* PDGSTRF2_trsm */ +} /* PDGSTRF2_trsm */ + /***************************************************************************** * The following functions are for the new pdgstrf2_dtrsm in the 3D code. *****************************************************************************/ -static int_t LpanelUpdate(int off0, int nsupc, double *ublk_ptr, int ld_ujrow, - double *lusup, int nsupr, SCT_t *SCT) +static +int_t LpanelUpdate(int off0, int nsupc, double* ublk_ptr, int ld_ujrow, + double* lusup, int nsupr, SCT_t* SCT) { int_t l = nsupr - off0; double alpha = 1.0; double t1 = SuperLU_timer_(); -#define GT 32 +#define GT 32 #pragma omp parallel for for (int i = 0; i < CEILING(l, GT); ++i) { int_t off = i * GT; int len = SUPERLU_MIN(GT, l - i * GT); - - superlu_dtrsm("R", "U", "N", "N", - len, nsupc, alpha, ublk_ptr, ld_ujrow, &lusup[off0 + off], nsupr); + + superlu_dtrsm("R", "U", "N", "N", len, nsupc, alpha, + ublk_ptr, ld_ujrow, &lusup[off0 + off], nsupr); } /* for i = ... */ t1 = SuperLU_timer_() - t1; - SCT->trf2_flops += (double)l * (double)nsupc * (double)nsupc; + SCT->trf2_flops += (double) l * (double) nsupc * (double)nsupc; SCT->trf2_time += t1; SCT->L_PanelUpdate_tl += t1; return 0; -} /* LpanelUpdate */ +} #pragma GCC push_options #pragma GCC optimize ("O0") @@ -435,19 +420,19 @@ void Local_Dgstrf2(superlu_dist_options_t *options, int_t k, double thresh, nsupr = 0; double *ublk_ptr = BlockUFactor; double *ujrow = BlockUFactor; - int_t luptr = 0; /* Point to the diagonal entries. */ - int cols_left = nsupc; /* supernode size */ + int_t luptr = 0; /* Point_t to the diagonal entries. 
*/ + int cols_left = nsupc; /* supernode size */ int_t u_diag_cnt = 0; - int_t ld_ujrow = nsupc; /* leading dimension of ujrow */ + int_t ld_ujrow = nsupc; /* leading dimension of ujrow */ int incx = 1; int incy = ld_ujrow; - for (int_t j = 0; j < jlst - jfst; ++j) /* for each column in panel */ + for (int_t j = 0; j < jlst - jfst; ++j) /* for each column in panel */ { /* Diagonal pivot */ int_t i = luptr; - /* May replace zero pivot. */ - if (options->ReplaceTinyPivot == YES) + /* Not to replace zero pivot. */ + if (options->ReplaceTinyPivot == YES && lusup[i] != 0.0) { if (fabs (lusup[i]) < thresh) { /* Diagonal */ @@ -471,11 +456,11 @@ void Local_Dgstrf2(superlu_dist_options_t *options, int_t k, double thresh, ublk_ptr[st + l * ld_ujrow] = lusup[i]; /* copy one row of U */ } - if (ujrow[0] == zero) /* Test for singularity. */ + if (ujrow[0] == zero) /* Test for singularity. */ { *info = j + jfst + 1; } - else /* Scale the j-th column. */ + else /* Scale the j-th column. */ { double temp; temp = 1.0 / ujrow[0]; @@ -490,23 +475,21 @@ void Local_Dgstrf2(superlu_dist_options_t *options, int_t k, double thresh, /*following must be int*/ int l = nsupc - j - 1; - /* Rank-1 update */ - + /* Rank-1 update */ superlu_dger(l, cols_left, alpha, &lusup[luptr + 1], incx, - &ujrow[ld_ujrow], incy, &lusup[luptr + nsupr + 1], - nsupr); - + &ujrow[ld_ujrow], incy, &lusup[luptr + nsupr + 1], nsupr); stat->ops[FACT] += 2 * l * cols_left; } ujrow = ujrow + ld_ujrow + 1; /* move to next row of U */ luptr += nsupr + 1; /* move to next column */ - } /* for column j ... first loop */ + } /* for column j ... first loop */ + //int_t thread_id = omp_get_thread_num(); - // SCT->Local_Dgstrf2_Thread_tl[thread_id * CACHE_LINE_SIZE] += (double) ( _rdtsc() - t1); -} /* Local_Dgstrf2 */ + // SCT->Local_Dgstrf2_Thread_tl[thread_id * CACHE_LINE_SIZE] += (double) ( SuperLU_timer_() - t1); +} #pragma GCC pop_options /************************************************************************/ @@ -580,13 +563,13 @@ void pdgstrf2_xtrsm int cols_left, iam, pkk; int incy = 1; - int nsupr; /* number of rows in the block (LDA) */ + int nsupr; /* number of rows in the block (LDA) */ int luptr; int_t myrow, krow, j, jfst, jlst, u_diag_cnt; - int nsupc; /* number of columns in the block */ + int_t nsupc; /* number of columns in the block */ int_t *xsup = Glu_persist->xsup; double *lusup; - double *ujrow, *ublk_ptr; /* pointer to the U block */ + double *ujrow, *ublk_ptr; /* pointer to the U block */ int_t Pr; /* Quick return. */ @@ -595,24 +578,23 @@ void pdgstrf2_xtrsm /* Initialization. */ iam = grid->iam; Pr = grid->nprow; - myrow = MYROW(iam, grid); - krow = PROW(k, grid); - pkk = PNUM(PROW(k, grid), PCOL(k, grid), grid); - j = LBj(k, grid); /* Local block number */ - jfst = FstBlockC(k); - jlst = FstBlockC(k + 1); + myrow = MYROW (iam, grid); + krow = PROW (k, grid); + pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid); + j = LBj (k, grid); /* Local block number */ + jfst = FstBlockC (k); + jlst = FstBlockC (k + 1); lusup = Llu->Lnzval_bc_ptr[j]; - nsupc = SuperSize(k); - + nsupc = SuperSize (k); if (Llu->Lrowind_bc_ptr[j]) nsupr = Llu->Lrowind_bc_ptr[j][1]; else nsupr = 0; ublk_ptr = ujrow = Llu->ujrow; - luptr = 0; /* Point to the diagonal entries. */ - cols_left = nsupc; /* supernode size */ - int ld_ujrow = nsupc; /* leading dimension of ujrow */ + luptr = 0; /* Point to the diagonal entries. 
*/ + cols_left = nsupc; /* supernode size */ + int ld_ujrow = nsupc; /* leading dimension of ujrow */ u_diag_cnt = 0; incy = ld_ujrow; @@ -622,23 +604,23 @@ void pdgstrf2_xtrsm Wait_UDiagBlockSend(U_diag_blk_send_req, grid, SCT); } - if (iam == pkk) /* diagonal process */ + if (iam == pkk) /* diagonal process */ { /*factorize the diagonal block*/ Local_Dgstrf2(options, k, thresh, Llu->ujrow, Glu_persist, grid, Llu, stat, info, SCT); ublk_ptr = ujrow = Llu->ujrow; - if (U_diag_blk_send_req && iam == pkk) /* Send the U block */ + if (U_diag_blk_send_req && iam == pkk) /* Send the U block */ { dISend_UDiagBlock(k0, ublk_ptr, nsupc * nsupc, U_diag_blk_send_req, - grid, tag_ub); - U_diag_blk_send_req[krow] = (MPI_Request)TRUE; /* flag outstanding Isend */ + grid, tag_ub); + U_diag_blk_send_req[krow] = (MPI_Request) TRUE; /* flag outstanding Isend */ } - LpanelUpdate(nsupc, nsupc, ublk_ptr, ld_ujrow, lusup, nsupr, SCT); + LpanelUpdate(nsupc, nsupc, ublk_ptr, ld_ujrow, lusup, nsupr, SCT); } - else /* non-diagonal process */ + else /* non-diagonal process */ { /* ================================================ * * Receive the diagonal block of U * @@ -647,11 +629,11 @@ void pdgstrf2_xtrsm * but panel factorization of U(:,k) don't * * ================================================ */ - dRecv_UDiagBlock(k0, ublk_ptr, (nsupc * nsupc), krow, grid, SCT, tag_ub); + dRecv_UDiagBlock( k0, ublk_ptr, (nsupc * nsupc), krow, grid, SCT, tag_ub); if (nsupr > 0) { - LpanelUpdate(0, nsupc, ublk_ptr, ld_ujrow, lusup, nsupr, SCT); + LpanelUpdate(0, nsupc, ublk_ptr, ld_ujrow, lusup, nsupr, SCT); } } /* end if pkk ... */ @@ -664,21 +646,19 @@ void pdgstrf2_xtrsm /* PDGSTRS2 helping kernels*/ int_t dTrs2_GatherU(int_t iukp, int_t rukp, int_t klst, - int_t nsupc, int_t ldu, - int_t *usub, - double *uval, double *tempv) + int_t nsupc, int_t ldu, + int_t *usub, + double* uval, double *tempv) { double zero = 0.0; int_t ncols = 0; for (int_t jj = iukp; jj < iukp + nsupc; ++jj) { int_t segsize = klst - usub[jj]; - - if (segsize) + if ( segsize ) { int_t lead_zero = ldu - segsize; - for (int_t i = 0; i < lead_zero; ++i) - tempv[i] = zero; + for (int_t i = 0; i < lead_zero; ++i) tempv[i] = zero; tempv += lead_zero; for (int_t i = 0; i < segsize; ++i) tempv[i] = uval[rukp + i]; @@ -691,8 +671,8 @@ int_t dTrs2_GatherU(int_t iukp, int_t rukp, int_t klst, } int_t dTrs2_ScatterU(int_t iukp, int_t rukp, int_t klst, - int_t nsupc, int_t ldu, - int_t *usub, double *uval, double *tempv) + int_t nsupc, int_t ldu, + int_t *usub, double* uval, double *tempv) { for (int_t jj = 0; jj < nsupc; ++jj) { @@ -713,16 +693,16 @@ int_t dTrs2_ScatterU(int_t iukp, int_t rukp, int_t klst, } int_t dTrs2_GatherTrsmScatter(int_t klst, int_t iukp, int_t rukp, - int_t *usub, double *uval, double *tempv, - int_t knsupc, int nsupr, double *lusup, - Glu_persist_t *Glu_persist) /*glupersist for xsup for supersize*/ + int_t *usub, double *uval, double *tempv, + int_t knsupc, int nsupr, double *lusup, + Glu_persist_t *Glu_persist) /*glupersist for xsup for supersize*/ { double alpha = 1.0; int_t *xsup = Glu_persist->xsup; // int_t iukp = Ublock_info.iukp; // int_t rukp = Ublock_info.rukp; int_t gb = usub[iukp]; - int_t nsupc = SuperSize(gb); + int_t nsupc = SuperSize (gb); iukp += UB_DESCRIPTOR; // printf("klst inside task%d\n", ); @@ -730,24 +710,25 @@ int_t dTrs2_GatherTrsmScatter(int_t klst, int_t iukp, int_t rukp, int ldu = 0; for (int_t jj = iukp; jj < iukp + nsupc; ++jj) { - ldu = SUPERLU_MAX(klst - usub[jj], ldu); + ldu = SUPERLU_MAX( klst - usub[jj], 
ldu) ; } /*pack U block into a dense Block*/ int ncols = dTrs2_GatherU(iukp, rukp, klst, nsupc, ldu, usub, - uval, tempv); + uval, tempv); /*now call dtrsm on packed dense block*/ int_t luptr = (knsupc - ldu) * (nsupr + 1); - - superlu_dtrsm("L", "L", "N", "U", - ldu, ncols, alpha, &lusup[luptr], nsupr, tempv, ldu); + // if(ldu>nsupr) printf("nsupr %d ldu %d\n",nsupr,ldu ); + + superlu_dtrsm("L", "L", "N", "U", ldu, ncols, alpha, + &lusup[luptr], nsupr, tempv, ldu); /*now scatter the output into sparse U block*/ dTrs2_ScatterU(iukp, rukp, klst, nsupc, ldu, usub, uval, tempv); return 0; -} /* dTrs2_GatherTrsmScatter */ +} /* +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */ @@ -765,9 +746,9 @@ void pdgstrs2_omp #endif int iam, pkk; int incx = 1; - int nsupr; /* number of rows in the block L(:,k) (LDA) */ + int nsupr; /* number of rows in the block L(:,k) (LDA) */ int segsize; - int nsupc; /* number of columns in the block */ + int nsupc; /* number of columns in the block */ int_t luptr, iukp, rukp; int_t b, gb, j, klst, knsupc, lk, nb; int_t *xsup = Glu_persist->xsup; @@ -781,28 +762,24 @@ void pdgstrs2_omp #endif /* Quick return. */ - lk = LBi(k, grid); /* Local block number */ - if (!Llu->Unzval_br_ptr[lk]) - return; + lk = LBi (k, grid); /* Local block number */ + if (!Llu->Unzval_br_ptr[lk]) return; /* Initialization. */ iam = grid->iam; - pkk = PNUM(PROW(k, grid), PCOL(k, grid), grid); + pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid); //int k_row_cycle = k / grid->nprow; /* for which cycle k exist (to assign rowwise thread blocking) */ //int gb_col_cycle; /* cycle through block columns */ - klst = FstBlockC(k + 1); - knsupc = SuperSize(k); - usub = Llu->Ufstnz_br_ptr[lk]; /* index[] of block row U(k,:) */ + klst = FstBlockC (k + 1); + knsupc = SuperSize (k); + usub = Llu->Ufstnz_br_ptr[lk]; /* index[] of block row U(k,:) */ uval = Llu->Unzval_br_ptr[lk]; - if (iam == pkk) - { - lk = LBj(k, grid); + if (iam == pkk) { + lk = LBj (k, grid); nsupr = Llu->Lrowind_bc_ptr[lk][1]; /* LDA of lusup[] */ lusup = Llu->Lnzval_bc_ptr[lk]; - } - else - { - nsupr = Llu->Lsub_buf_2[k0 % (1 + stat->num_look_aheads)][1]; /* LDA of lusup[] */ + } else { + nsupr = Llu->Lsub_buf_2[k0 % (1 + stat->num_look_aheads)][1]; /* LDA of lusup[] */ lusup = Llu->Lval_buf_2[k0 % (1 + stat->num_look_aheads)]; } @@ -819,26 +796,26 @@ void pdgstrs2_omp #undef USE_Ublock_info #ifdef USE_Ublock_info /** 4/19/2019 **/ /* Loop through all the row blocks. 
to get the iukp and rukp*/ - Trs2_InitUblock_info(klst, nb, Ublock_info, usub, Glu_persist, stat); + Trs2_InitUblock_info(klst, nb, Ublock_info, usub, Glu_persist, stat ); #else - int *blocks_index_pointers = SUPERLU_MALLOC(3 * nb * sizeof(int)); - int *blocks_value_pointers = blocks_index_pointers + nb; - int *nsupc_temp = blocks_value_pointers + nb; - for (b = 0; b < nb; b++) - { /* set up pointers to each block */ - blocks_index_pointers[b] = iukp + UB_DESCRIPTOR; - blocks_value_pointers[b] = rukp; - gb = usub[iukp]; - rukp += usub[iukp + 1]; - nsupc = SuperSize(gb); - nsupc_temp[b] = nsupc; - iukp += (UB_DESCRIPTOR + nsupc); /* move to the next block */ + int* blocks_index_pointers = SUPERLU_MALLOC (3 * nb * sizeof(int)); + int* blocks_value_pointers = blocks_index_pointers + nb; + int* nsupc_temp = blocks_value_pointers + nb; + for (b = 0; b < nb; b++) { /* set up pointers to each block */ + blocks_index_pointers[b] = iukp + UB_DESCRIPTOR; + blocks_value_pointers[b] = rukp; + gb = usub[iukp]; + rukp += usub[iukp+1]; + nsupc = SuperSize( gb ); + nsupc_temp[b] = nsupc; + iukp += (UB_DESCRIPTOR + nsupc); /* move to the next block */ } #endif // Sherry: this version is more NUMA friendly compared to pdgstrf2_v2.c // https://stackoverflow.com/questions/13065943/task-based-programming-pragma-omp-task-versus-pragma-omp-parallel-for -#pragma omp parallel for schedule(static) default(shared) private(b, j, iukp, rukp, segsize) +#pragma omp parallel for schedule(static) default(shared) \ + private(b,j,iukp,rukp,segsize) /* Loop through all the blocks in the row. */ for (b = 0; b < nb; ++b) { #ifdef USE_Ublock_info @@ -871,13 +848,11 @@ void pdgstrs2_omp dtrsv_ ("L", "N", "U", &segsize, &lusup[luptr], &nsupr, &uval[rukp], &incx, 1, 1, 1); #else - dtrsv_("L", "N", "U", &segsize, &lusup[luptr], &nsupr, - &uval[rukp], &incx); + dtrsv_ ("L", "N", "U", &segsize, &lusup[luptr], &nsupr, + &uval[rukp], &incx); #endif } /* end task */ - rukp += segsize; - #ifndef USE_Ublock_info stat->ops[FACT] += segsize * (segsize + 1); #endif @@ -941,7 +916,6 @@ void pdgstrs2_omp(int_t k0, int_t k, int_t* Lsub_buf, } /* for b ... */ SCT->PDGSTRS2_tl += (double) ( SuperLU_timer_() - t1); - } /* pdgstrs2_omp new version from Piyush */ #endif /* there are 2 versions of pdgstrs2_omp */ diff --git a/SRC/pzgstrf2.c b/SRC/pzgstrf2.c index b6193bad..5f2eaa21 100644 --- a/SRC/pzgstrf2.c +++ b/SRC/pzgstrf2.c @@ -361,14 +361,14 @@ pzgstrf2_trsm } /* PZGSTRF2_trsm */ - + /***************************************************************************** * The following functions are for the new pdgstrf2_ztrsm in the 3D code. 
*****************************************************************************/ static -int_t LpanelUpdate(int_t off0, int_t nsupc, doublecomplex* ublk_ptr, int_t ld_ujrow, - doublecomplex* lusup, int_t nsupr, SCT_t* SCT) +int_t LpanelUpdate(int off0, int nsupc, doublecomplex* ublk_ptr, int ld_ujrow, + doublecomplex* lusup, int nsupr, SCT_t* SCT) { int_t l = nsupr - off0; doublecomplex alpha = {1.0, 0.0}; @@ -379,30 +379,20 @@ int_t LpanelUpdate(int_t off0, int_t nsupc, doublecomplex* ublk_ptr, int_t ld_u for (int i = 0; i < CEILING(l, GT); ++i) { int_t off = i * GT; - int_t len = SUPERLU_MIN(GT, l - i * GT); -#if 1 - #if defined (USE_VENDOR_BLAS) - ztrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha, - ublk_ptr, &ld_ujrow, &lusup[off0 + off], &nsupr, - 1, 1, 1, 1); - #else - ztrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha, - ublk_ptr, &ld_ujrow, &lusup[off0 + off], &nsupr); - #endif -#else - cblas_ztrsm (CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, - len, nsupc, (void*) &alpha, ublk_ptr, ld_ujrow, &lusup[off0 + off], nsupr); -#endif + int len = SUPERLU_MIN(GT, l - i * GT); + + superlu_ztrsm("R", "U", "N", "N", len, nsupc, alpha, + ublk_ptr, ld_ujrow, &lusup[off0 + off], nsupr); } /* for i = ... */ t1 = SuperLU_timer_() - t1; - SCT->trf2_flops += (double) l * (double)nsupc * (double)nsupc; + SCT->trf2_flops += (double) l * (double) nsupc * (double)nsupc; SCT->trf2_time += t1; SCT->L_PanelUpdate_tl += t1; return 0; -} /* LpanelUpdate */ +} #pragma GCC push_options #pragma GCC optimize ("O0") @@ -422,8 +412,8 @@ void Local_Zgstrf2(superlu_dist_options_t *options, int_t k, double thresh, int_t jfst = FstBlockC (k); int_t jlst = FstBlockC (k + 1); doublecomplex *lusup = Llu->Lnzval_bc_ptr[lk]; - int_t nsupc = SuperSize (k); - int_t nsupr; + int nsupc = SuperSize (k); + int nsupr; if (Llu->Lrowind_bc_ptr[lk]) nsupr = Llu->Lrowind_bc_ptr[lk][1]; else @@ -431,11 +421,11 @@ void Local_Zgstrf2(superlu_dist_options_t *options, int_t k, double thresh, doublecomplex *ublk_ptr = BlockUFactor; doublecomplex *ujrow = BlockUFactor; int_t luptr = 0; /* Point_t to the diagonal entries. */ - int_t cols_left = nsupc; /* supernode size */ + int cols_left = nsupc; /* supernode size */ int_t u_diag_cnt = 0; int_t ld_ujrow = nsupc; /* leading dimension of ujrow */ - int_t incx = 1; - int_t incy = ld_ujrow; + int incx = 1; + int incy = ld_ujrow; for (int_t j = 0; j < jlst - jfst; ++j) /* for each column in panel */ { @@ -471,7 +461,7 @@ void Local_Zgstrf2(superlu_dist_options_t *options, int_t k, double thresh, { *info = j + jfst + 1; } - else /* Scale the j-th column within diagonal block. */ + else /* Scale the j-th column. 
*/ { doublecomplex temp; slud_z_div(&temp, &one, &ujrow[0]); @@ -489,7 +479,6 @@ void Local_Zgstrf2(superlu_dist_options_t *options, int_t k, double thresh, /* Rank-1 update */ superlu_zger(l, cols_left, alpha, &lusup[luptr + 1], incx, &ujrow[ld_ujrow], incy, &lusup[luptr + nsupr + 1], nsupr); - stat->ops[FACT] += 8 * l * cols_left; } @@ -500,8 +489,8 @@ void Local_Zgstrf2(superlu_dist_options_t *options, int_t k, double thresh, //int_t thread_id = omp_get_thread_num(); - // SCT->Local_Dgstrf2_Thread_tl[thread_id * CACHE_LINE_SIZE] += SuperLU_timer_() - t1; -} /* Local_Zgstrf2 */ + // SCT->Local_Dgstrf2_Thread_tl[thread_id * CACHE_LINE_SIZE] += (double) ( SuperLU_timer_() - t1); +} #pragma GCC pop_options /************************************************************************/ @@ -930,4 +919,4 @@ void pzgstrs2_omp(int_t k0, int_t k, int_t* Lsub_buf, SCT->PDGSTRS2_tl += (double) ( SuperLU_timer_() - t1); } /* pdgstrs2_omp new version from Piyush */ -#endif +#endif /* there are 2 versions of pzgstrs2_omp */ diff --git a/SRC/superlu_FCnames.h b/SRC/superlu_FCnames.h index 7b5d2825..a04768d1 100644 --- a/SRC/superlu_FCnames.h +++ b/SRC/superlu_FCnames.h @@ -27,9 +27,6 @@ at the top-level directory. #define f_create_gridinfo_handle FC_GLOBAL(f_create_gridinfo_handle,F_CREATE_GRIDINFO_HANDLE) #define f_create_gridinfo3d_handle FC_GLOBAL(f_create_gridinfo3d_handle,F_CREATE_GRIDINFO3D_HANDLE) #define f_create_options_handle FC_GLOBAL(f_create_options_handle,F_CREATE_OPTIONS_HANDLE) -#define f_create_ScalePerm_handle FC_GLOBAL(f_create_scaleperm_handle,F_CREATE_SCALEPERM_HANDLE) -#define f_create_LUstruct_handle FC_GLOBAL(f_create_lustruct_handle,F_CREATE_LUSTRUCT_HANDLE) -#define f_create_SOLVEstruct_handle FC_GLOBAL(f_create_solvestruct_handle,F_CREATE_SOLVESTRUCT_HANDLE) #define f_create_SuperMatrix_handle FC_GLOBAL(f_create_supermatrix_handle,F_CREATE_SUPERMATRIX_HANDLE) #define f_destroy_gridinfo_handle FC_GLOBAL(f_destroy_gridinfo_handle,F_DESTROY_GRIDINFO_HANDLE) #define f_destroy_options_handle FC_GLOBAL(f_destroy_options_handle,F_DESTROY_OPTIONS_HANDLE) @@ -52,30 +49,47 @@ at the top-level directory. 
#define f_superlu_gridinit3d FC_GLOBAL(f_superlu_gridinit3d,F_SUPERLU_GRIDINIT3D) #define f_superlu_gridmap FC_GLOBAL(f_superlu_gridmap,F_SUPERLU_GRIDMAP) #define f_superlu_gridexit FC_GLOBAL(f_superlu_gridexit,F_SUPERLU_GRIDEXIT) -#define f_ScalePermstructInit FC_GLOBAL(f_scalepermstructinit,F_SCALEPERMSTRUCTINIT) -#define f_ScalePermstructFree FC_GLOBAL(f_scalepermstructfree,F_SCALEPERMSTRUCTFREE) #define f_PStatInit FC_GLOBAL(f_pstatinit,F_PSTATINIT) #define f_PStatFree FC_GLOBAL(f_pstatfree,F_PSTATFREE) -#define f_LUstructInit FC_GLOBAL(f_lustructinit,F_LUSTRUCTINIT) -#define f_LUstructFree FC_GLOBAL(f_lustructfree,F_LUSTRUCTFREE) -#define f_Destroy_LU_SOLVE_struct FC_GLOBAL(f_destroy_lu_solve_struct,F_DESTROY_LU_SOLVE_STRUCT) -#define f_Destroy_LU_SOLVE_struct_3d FC_GLOBAL(f_destroy_lu_solve_struct_3d,F_DESTROY_LU_SOLVE_STRUCT_3D) -#define f_dCreate_CompRowLoc_Mat_dist FC_GLOBAL(f_dcreate_comprowloc_mat_dist,F_DCREATE_COMPROWLOC_MAT_DIST) -#define f_zCreate_CompRowLoc_Mat_dist FC_GLOBAL(f_zcreate_comprowloc_mat_dist,F_ZCREATE_COMPROWLOC_MAT_DIST) #define f_Destroy_CompRowLoc_Mat_dist FC_GLOBAL(f_destroy_comprowloc_mat_dist,F_DESTROY_COMPROWLOC_MAT_DIST) #define f_Destroy_SuperMat_Store_dist FC_GLOBAL(f_destroy_supermat_store_dist,F_DESTROY_SUPERMAT_STORE_DIST) +#define f_check_malloc FC_GLOBAL(f_check_malloc,F_CHECK_MALLOC) + +////// double +#define f_dcreate_ScalePerm_handle FC_GLOBAL(f_dcreate_scaleperm_handle,F_DCREATE_SCALEPERM_HANDLE) +#define f_dcreate_LUstruct_handle FC_GLOBAL(f_dcreate_lustruct_handle,F_DCREATE_LUSTRUCT_HANDLE) +#define f_dcreate_SOLVEstruct_handle FC_GLOBAL(f_dcreate_solvestruct_handle,F_DCREATE_SOLVESTRUCT_HANDLE) +#define f_dScalePermstructInit FC_GLOBAL(f_dscalepermstructinit,F_DSCALEPERMSTRUCTINIT) +#define f_dScalePermstructFree FC_GLOBAL(f_dscalepermstructfree,F_DSCALEPERMSTRUCTFREE) +#define f_dLUstructInit FC_GLOBAL(f_dlustructinit,F_DLUSTRUCTINIT) +#define f_dLUstructFree FC_GLOBAL(f_dlustructfree,F_DLUSTRUCTFREE) +#define f_dDestroy_LU_SOLVE_struct FC_GLOBAL(f_ddestroy_lu_solve_struct,F_DDESTROY_LU_SOLVE_STRUCT) +#define f_dDestroy_LU_SOLVE_struct_3d FC_GLOBAL(f_ddestroy_lu_solve_struct_3d,F_DDESTROY_LU_SOLVE_STRUCT_3D) + +#define f_dCreate_CompRowLoc_Mat_dist FC_GLOBAL(f_dcreate_comprowloc_mat_dist,F_DCREATE_COMPROWLOC_MAT_DIST) #define f_dSolveFinalize FC_GLOBAL(f_dsolvefinalize,F_DSOLVEFINALIZE) -#define f_zSolveFinalize FC_GLOBAL(f_zsolvefinalize,F_ZSOLVEFINALIZE) #define f_pdgssvx FC_GLOBAL(f_pdgssvx,F_PDGSSVX) #define f_pdgssvx3d FC_GLOBAL(f_pdgssvx3d,F_PDGSSVX3D) -#define f_pzgssvx FC_GLOBAL(f_pzgssvx,F_PZGSSVX) -#define f_pzgssvx3d FC_GLOBAL(f_pzgssvx3d,F_PZGSSVX3D) #define f_dcreate_dist_matrix FC_GLOBAL(f_dcreate_dist_matrix,F_DCREATE_DIST_MATRIX) #define f_dcreate_matrix_x_b FC_GLOBAL(f_dcreate_matrix_x_b,F_DCREATE_MATRIX_X_B) #define f_dcreate_matrix_x_b_3d FC_GLOBAL(f_dcreate_matrix_x_b_3d,F_DCREATE_MATRIX_X_B_3D) + +////// complex16 +#define f_zcreate_ScalePerm_handle FC_GLOBAL(f_zcreate_scaleperm_handle,F_ZCREATE_SCALEPERM_HANDLE) +#define f_zcreate_LUstruct_handle FC_GLOBAL(f_zcreate_lustruct_handle,F_ZCREATE_LUSTRUCT_HANDLE) +#define f_zcreate_SOLVEstruct_handle FC_GLOBAL(f_zcreate_solvestruct_handle,F_ZCREATE_SOLVESTRUCT_HANDLE) +#define f_zScalePermstructInit FC_GLOBAL(f_zscalepermstructinit,F_ZSCALEPERMSTRUCTINIT) +#define f_zScalePermstructFree FC_GLOBAL(f_zscalepermstructfree,F_ZSCALEPERMSTRUCTFREE) +#define f_zLUstructInit FC_GLOBAL(f_zlustructinit,F_ZLUSTRUCTINIT) +#define f_zLUstructFree 
FC_GLOBAL(f_zlustructfree,F_ZLUSTRUCTFREE) +#define f_zDestroy_LU_SOLVE_struct FC_GLOBAL(f_zdestroy_lu_solve_struct,F_ZDESTROY_LU_SOLVE_STRUCT) +#define f_zDestroy_LU_SOLVE_struct_3d FC_GLOBAL(f_zdestroy_lu_solve_struct_3d,F_ZDESTROY_LU_SOLVE_STRUCT_3D) +#define f_zCreate_CompRowLoc_Mat_dist FC_GLOBAL(f_zcreate_comprowloc_mat_dist,F_ZCREATE_COMPROWLOC_MAT_DIST) +#define f_zSolveFinalize FC_GLOBAL(f_zsolvefinalize,F_ZSOLVEFINALIZE) +#define f_pzgssvx FC_GLOBAL(f_pzgssvx,F_PZGSSVX) +#define f_pzgssvx3d FC_GLOBAL(f_pzgssvx3d,F_PZGSSVX3D) #define f_zcreate_matrix_x_b FC_GLOBAL(f_zcreate_matrix_x_b,F_ZCREATE_MATRIX_X_B) #define f_zcreate_matrix_x_b_3d FC_GLOBAL(f_zcreate_matrix_x_b_3d,F_ZCREATE_MATRIX_X_B_3D) -#define f_check_malloc FC_GLOBAL(f_check_malloc,F_CHECK_MALLOC) /* BLAS */ #define sasum_ FC_GLOBAL(sasum,SASUM) diff --git a/SRC/superlu_defs.h b/SRC/superlu_defs.h index d52b1bf0..f241ae66 100644 --- a/SRC/superlu_defs.h +++ b/SRC/superlu_defs.h @@ -953,12 +953,11 @@ typedef struct xtrsTimer_t extern "C" { #endif -extern void superlu_gridinit(MPI_Comm, int_t, int_t, gridinfo_t *); -extern void superlu_gridmap(MPI_Comm, int_t, int_t, int_t [], int_t, - gridinfo_t *); +extern void superlu_gridinit(MPI_Comm, int, int, gridinfo_t *); +extern void superlu_gridmap(MPI_Comm, int, int, int [], int, gridinfo_t *); extern void superlu_gridexit(gridinfo_t *); -extern void superlu_gridinit3d(MPI_Comm Bcomm, int_t nprow, - int_t npcol, int_t npdep, gridinfo3d_t *grid) ; +extern void superlu_gridinit3d(MPI_Comm Bcomm, int nprow, int npcol, int npdep, + gridinfo3d_t *grid) ; extern void superlu_gridexit3d(gridinfo3d_t *grid); extern void set_default_options_dist(superlu_dist_options_t *); @@ -1076,6 +1075,8 @@ extern int_t get_cublas_nb (); extern int_t get_num_cuda_streams (); #endif +extern double estimate_cpu_time(int m, int n , int k); + extern int get_thread_per_process(); extern int_t get_max_buffer_size (); extern int_t get_min (int_t *, int_t); @@ -1134,6 +1135,7 @@ extern yes_no_t StdList_Find(StdList lst, int_t dat); extern int_t StdList_Size(StdList lst); yes_no_t StdList_Empty(StdList lst); + /*==== For 3D code ====*/ extern void DistPrint(char* function_name, double value, char* Units, gridinfo_t* grid); diff --git a/SRC/superlu_dist_config.h b/SRC/superlu_dist_config.h index 6df0c761..4ea5a7ff 100644 --- a/SRC/superlu_dist_config.h +++ b/SRC/superlu_dist_config.h @@ -1,7 +1,7 @@ /* superlu_dist_config.h.in */ /* Enable CUDA */ -#define HAVE_CUDA TRUE +/* #undef HAVE_CUDA */ /* Enable parmetis */ #define HAVE_PARMETIS TRUE diff --git a/SRC/superlu_grid.c b/SRC/superlu_grid.c index 32303e4c..8aa61d79 100644 --- a/SRC/superlu_grid.c +++ b/SRC/superlu_grid.c @@ -35,14 +35,14 @@ MPI_Datatype SuperLU_MPI_DOUBLE_COMPLEX = MPI_DATATYPE_NULL; */ void superlu_gridinit(MPI_Comm Bcomm, /* The base communicator upon which the new grid is formed. */ - int_t nprow, int_t npcol, gridinfo_t *grid) + int nprow, int npcol, gridinfo_t *grid) { int Np = nprow * npcol; - int_t *usermap; + int *usermap; int i, j, info; /* Make a list of the processes in the new communicator. */ - usermap = (int_t *) SUPERLU_MALLOC(Np*sizeof(int_t)); + usermap = SUPERLU_MALLOC(Np*sizeof(int)); for (j = 0; j < npcol; ++j) for (i = 0; i < nprow; ++i) usermap[j*nprow+i] = i*npcol+j; @@ -73,12 +73,12 @@ void superlu_gridinit(MPI_Comm Bcomm, /* The base communicator upon which void superlu_gridmap( MPI_Comm Bcomm, /* The base communicator upon which the new grid is formed. 
*/ - int_t nprow, - int_t npcol, - int_t usermap[], /* usermap(i,j) holds the process + int nprow, + int npcol, + int usermap[], /* usermap(i,j) holds the process number to be placed in {i,j} of the process grid. */ - int_t ldumap, /* The leading dimension of the + int ldumap, /* The leading dimension of the 2D array usermap[]. */ gridinfo_t *grid) { diff --git a/SRC/superlu_grid3d.c b/SRC/superlu_grid3d.c index d34c1ccc..9fb97b84 100644 --- a/SRC/superlu_grid3d.c +++ b/SRC/superlu_grid3d.c @@ -13,9 +13,9 @@ void superlu_gridmap3d( MPI_Comm Bcomm, /* The base communicator upon which the new grid is formed. */ - int_t nprow, - int_t npcol, - int_t npdep, + int nprow, + int npcol, + int npdep, gridinfo3d_t *grid); @@ -23,11 +23,9 @@ void superlu_gridmap3d( */ void superlu_gridinit3d(MPI_Comm Bcomm, /* The base communicator upon which the new grid is formed. */ - int_t nprow, int_t npcol, int_t npdep, - gridinfo3d_t *grid) + int nprow, int npcol, int npdep, gridinfo3d_t *grid) { int Np = nprow * npcol * npdep; - int_t *usermap; int i, j, info; /* Make a list of the processes in the new communicator. */ @@ -55,9 +53,9 @@ void superlu_gridinit3d(MPI_Comm Bcomm, /* The base communicator upon which void superlu_gridmap3d( MPI_Comm Bcomm, /* The base communicator upon which the new grid is formed. */ - int_t nprow, - int_t npcol, - int_t npdep, + int nprow, + int npcol, + int npdep, gridinfo3d_t *grid) { MPI_Group mpi_base_group, superlu_grp; diff --git a/SRC/zlustruct_gpu.h b/SRC/zlustruct_gpu.h index 0f7a329f..04819221 100644 --- a/SRC/zlustruct_gpu.h +++ b/SRC/zlustruct_gpu.h @@ -12,17 +12,14 @@ #pragma once // so that this header file is included onle once -// #ifdef DEBUG -// #include -// #endif -// #include -// #include "mkl.h" +#include "superlu_zdefs.h" + +#ifdef GPU_ACC // enable GPU -// #define USE_VENDOR_BLAS +// #include "mkl.h" #include #include -#include "superlu_zdefs.h" // #include "sec_structs.h" // #include "supernodal_etree.h" @@ -174,8 +171,6 @@ extern int zsparseTreeFactor_ASYNC_GPU( double thresh, SCT_t *SCT, int tag_ub, int *info); -extern double estimate_cpu_time(int m, int n , int k); - int zinitD2Hreduce( int next_k, d2Hreduce_t* d2Hred, @@ -241,4 +236,4 @@ void zprintGPUStats(zLUstruct_gpu_t *A_gpu); } #endif -//#undef DEBUG +#endif // matching: enable GPU diff --git a/SRC/znrformat_loc3d.c b/SRC/znrformat_loc3d.c index c4f7d1e3..d3f16e05 100644 --- a/SRC/znrformat_loc3d.c +++ b/SRC/znrformat_loc3d.c @@ -119,27 +119,30 @@ NRformat_loc3d *zGatherNRformat_loc3d(NRformat_loc *A, // input, on 3D grid } A2d->nnz_loc = nnz_disp[grid3d->npdep]; A2d->m_loc = row_disp[grid3d->npdep]; -#if 0 - A2d->fst_row = A->fst_row; // This is a bug -#else - gridinfo_t *grid2d = &(grid3d->grid2d); - int procs2d = grid2d->nprow * grid2d->npcol; - int m_loc_2d = A2d->m_loc; - int *m_loc_2d_counts = SUPERLU_MALLOC(procs2d * sizeof(int)); - - MPI_Allgather(&m_loc_2d, 1, MPI_INT, m_loc_2d_counts, 1, MPI_INT, grid2d->comm); - - int fst_row = 0; - for (int p = 0; p < procs2d; ++p) - { - if (grid2d->iam == p) - A2d->fst_row = fst_row; - fst_row += m_loc_2d_counts[p]; - } - SUPERLU_FREE(m_loc_2d_counts); -#endif + if (grid3d->rankorder == 1) { // XY-major + A2d->fst_row = A->fst_row; + } else { // Z-major + gridinfo_t *grid2d = &(grid3d->grid2d); + int procs2d = grid2d->nprow * grid2d->npcol; + int m_loc_2d = A2d->m_loc; + int *m_loc_2d_counts = SUPERLU_MALLOC(procs2d * sizeof(int)); + + MPI_Allgather(&m_loc_2d, 1, MPI_INT, m_loc_2d_counts, 1, + MPI_INT, grid2d->comm); + + int fst_row = 0; + for (int p 
= 0; p < procs2d; ++p) + { + if (grid2d->iam == p) + A2d->fst_row = fst_row; + fst_row += m_loc_2d_counts[p]; + } + + SUPERLU_FREE(m_loc_2d_counts); + } } + // Btmp <- compact(B) // compacting B doublecomplex *Btmp; @@ -235,9 +238,9 @@ int zScatter_B3d(NRformat_loc3d *A3d, // modified // Btmp <- scatterv(B1), block-by-block if ( rankorder == 1 ) { /* XY-major in 3D grid */ /* e.g. 1x3x4 grid: layer0 layer1 layer2 layer3 - * 0 1 2 4 - * 5 6 7 8 - * 9 10 11 12 + * 0 1 2 3 + * 4 5 6 7 + * 8 9 10 11 */ MPI_Scatterv(B1, b_counts_int, b_disp, SuperLU_MPI_DOUBLE_COMPLEX, Btmp, nrhs * A3d->m_loc, SuperLU_MPI_DOUBLE_COMPLEX, diff --git a/SRC/zsuperlu_gpu.cu b/SRC/zsuperlu_gpu.cu index aecf1ec5..6a705725 100644 --- a/SRC/zsuperlu_gpu.cu +++ b/SRC/zsuperlu_gpu.cu @@ -32,21 +32,6 @@ // const int incX, double *Y, const int incY); //} -/*error reporting functions */ -//static -cudaError_t checkCuda(cudaError_t result) -{ -#if defined(DEBUG) || defined(_DEBUG) - if (result != cudaSuccess) - { - fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result)); - assert(result == cudaSuccess); - } -#endif - return result; -} - - // cublasStatus_t checkCublas(cublasStatus_t result) // { // #if defined(DEBUG) || defined(_DEBUG) diff --git a/SRC/ztreeFactorizationGPU.c b/SRC/ztreeFactorizationGPU.c index b081339f..0377acf1 100644 --- a/SRC/ztreeFactorizationGPU.c +++ b/SRC/ztreeFactorizationGPU.c @@ -18,6 +18,8 @@ //#include "cblas.h" #endif +#ifdef GPU_ACC ///////////////// enable GPU + /* /-- num_u_blks--\ /-- num_u_blks_Phi --\ ---------------------------------------- @@ -731,3 +733,5 @@ int zsparseTreeFactor_ASYNC_GPU( return 0; } /* end zsparseTreeFactor_ASYNC_GPU */ + +#endif // matching: enable GPU diff --git a/SRC/zutil_dist.c b/SRC/zutil_dist.c index f3d699ed..4687bf39 100644 --- a/SRC/zutil_dist.c +++ b/SRC/zutil_dist.c @@ -713,7 +713,7 @@ void zDumpLblocks(int iam, int_t nsupers, gridinfo_t *grid, } if(grid->iam==0){ - fprintf(fp, "%d %d %d\n", n,n,nnzL); + fprintf(fp, "%d %d " IFMT "\n", n,n,nnzL); } ncb = nsupers / grid->npcol; From 18da633875fe7885b958a0af90122e2a44495039 Mon Sep 17 00:00:00 2001 From: Xiaoye Li Date: Wed, 5 May 2021 19:46:00 -0400 Subject: [PATCH 079/147] FORTRAN/: set up the FPP preprrocessing to decide whether to use 64-bit indexing. 
--- CMakeLists.txt | 11 +- FORTRAN/CMakeLists.txt | 27 +++- FORTRAN/Makefile | 5 +- FORTRAN/README | 18 ++- FORTRAN/{f_5x5.f90 => f_5x5.F90} | 10 +- FORTRAN/{f_pddrive.f90 => f_pddrive.F90} | 10 +- FORTRAN/{f_pddrive3d.f90 => f_pddrive3d.F90} | 8 +- FORTRAN/f_psdrive.F90 | 146 +++++++++++++++++++ FORTRAN/{f_pzdrive.f90 => f_pzdrive.F90} | 10 +- FORTRAN/{f_pzdrive3d.f90 => f_pzdrive3d.F90} | 8 +- FORTRAN/superlu_dist_config.fh | 11 ++ 11 files changed, 244 insertions(+), 20 deletions(-) rename FORTRAN/{f_5x5.f90 => f_5x5.F90} (97%) rename FORTRAN/{f_pddrive.f90 => f_pddrive.F90} (96%) rename FORTRAN/{f_pddrive3d.f90 => f_pddrive3d.F90} (96%) create mode 100644 FORTRAN/f_psdrive.F90 rename FORTRAN/{f_pzdrive.f90 => f_pzdrive.F90} (96%) rename FORTRAN/{f_pzdrive3d.f90 => f_pzdrive3d.F90} (96%) create mode 100644 FORTRAN/superlu_dist_config.fh diff --git a/CMakeLists.txt b/CMakeLists.txt index 711a436c..03ed5c2c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -399,7 +399,6 @@ if (XSDK_ENABLE_Fortran) MACRO_NAMESPACE "FC_") FortranCInterface_VERIFY(CXX) SET(MPI_Fortran_LINK_FLAGS "${CMAKE_EXE_LINKER_FLAGS}") - add_subdirectory(FORTRAN) endif() ###################################################################### @@ -441,14 +440,24 @@ if(enable_examples) add_subdirectory(EXAMPLE) endif() +if (XSDK_ENABLE_Fortran) + add_subdirectory(FORTRAN) +endif() + # superlu_dist uses c++11. PUBLIC means that the other codes linking to it need c++11 #target_compile_features(SuperLU_DIST PUBLIC cxx_std_11) +# Generate various configure files with proper definitions # configure_file(${CMAKE_SOURCE_DIR}/make.inc.in ${CMAKE_BINARY_DIR}/make.inc) configure_file(${SuperLU_DIST_SOURCE_DIR}/make.inc.in ${SuperLU_DIST_SOURCE_DIR}/make.inc) + configure_file(${SuperLU_DIST_SOURCE_DIR}/SRC/superlu_dist_config.h.in ${SuperLU_DIST_BINARY_DIR}/SRC/superlu_dist_config.h) configure_file(${SuperLU_DIST_SOURCE_DIR}/SRC/superlu_dist_config.h.in ${SuperLU_DIST_SOURCE_DIR}/SRC/superlu_dist_config.h) +# Following is to configure a file for FORTRAN code +configure_file(${SuperLU_DIST_SOURCE_DIR}/SRC/superlu_dist_config.h.in ${SuperLU_DIST_BINARY_DIR}/FORTRAN/superlu_dist_config.h) + + # Add pkg-config support if(IS_ABSOLUTE ${CMAKE_INSTALL_LIBDIR}) set(pkgconfig_libdir ${CMAKE_INSTALL_LIBDIR}) diff --git a/FORTRAN/CMakeLists.txt b/FORTRAN/CMakeLists.txt index 99745790..5705c7fd 100644 --- a/FORTRAN/CMakeLists.txt +++ b/FORTRAN/CMakeLists.txt @@ -1,9 +1,12 @@ -# Sherry; may not need it? 
+# include the paths for header files include_directories(${SuperLU_DIST_SOURCE_DIR}/SRC) +include_directories(${SuperLU_DIST_BINARY_DIR}/FORTRAN) set(headers ${CMAKE_INSTALL_PREFIX}/FORTRAN/superlu_mod.mod ${CMAKE_INSTALL_PREFIX}/FORTRAN/superlupara_mod.mod + ${CMAKE_INSTALL_PREFIX}/FORTRAN/superlu_dist_config.fh +# ${CMAKE_CURRENT_BINARY_DIR}/fortran_config.h ) set(sources "superlu_c2f_wrap.c") # initialize precision-independent file @@ -18,6 +21,9 @@ endif() add_library(superlu_dist_fortran ${sources}) set(targets superlu_dist_fortran) +# depends on FPP defs +add_dependencies(superlu_dist_fortran config_f) + install(TARGETS superlu_dist_fortran # DESTINATION ${CMAKE_INSTALL_LIBDIR} RUNTIME DESTINATION "${INSTALL_BIN_DIR}" @@ -49,18 +55,18 @@ endif () set(F_MOD superlupara.f90 superlu_mod.f90) if(enable_double) - set(F_DEXM ${F_MOD} f_pddrive.f90) + set(F_DEXM ${F_MOD} f_pddrive.F90) add_executable(f_pddrive ${F_DEXM}) target_link_libraries(f_pddrive ${all_link_libs}) # set_target_properties(f_pddrive PROPERTIES LINKER_LANGUAGE Fortran) set_target_properties(f_pddrive PROPERTIES LINKER_LANGUAGE CXX LINK_FLAGS "${MPI_Fortran_LINK_FLAGS}") - set(F_DEXM3D ${F_MOD} f_pddrive3d.f90) + set(F_DEXM3D ${F_MOD} f_pddrive3d.F90) add_executable(f_pddrive3d ${F_DEXM3D}) target_link_libraries(f_pddrive3d ${all_link_libs}) set_target_properties(f_pddrive3d PROPERTIES LINKER_LANGUAGE CXX LINK_FLAGS "${MPI_Fortran_LINK_FLAGS}") - set(F_5x5 ${F_MOD} f_5x5.f90 sp_ienv.c) + set(F_5x5 ${F_MOD} f_5x5.F90 sp_ienv.c) add_executable(f_5x5 ${F_5x5}) target_link_libraries(f_5x5 ${all_link_libs}) set_target_properties(f_5x5 PROPERTIES LINKER_LANGUAGE CXX LINK_FLAGS "${MPI_Fortran_LINK_FLAGS}") @@ -68,16 +74,25 @@ if(enable_double) endif() if(enable_complex16) - set(F_ZEXM ${F_MOD} f_pzdrive.f90) + set(F_ZEXM ${F_MOD} f_pzdrive.F90) add_executable(f_pzdrive ${F_ZEXM}) target_link_libraries(f_pzdrive ${all_link_libs}) # set_target_properties(f_pzdrive PROPERTIES LINKER_LANGUAGE Fortran) set_target_properties(f_pzdrive PROPERTIES LINKER_LANGUAGE CXX LINK_FLAGS "${MPI_Fortran_LINK_FLAGS}") - set(F_ZEXM3D ${F_MOD} f_pzdrive3d.f90) + set(F_ZEXM3D ${F_MOD} f_pzdrive3d.F90) add_executable(f_pzdrive3d ${F_ZEXM3D}) target_link_libraries(f_pzdrive3d ${all_link_libs}) set_target_properties(f_pzdrive3d PROPERTIES LINKER_LANGUAGE CXX LINK_FLAGS "${MPI_Fortran_LINK_FLAGS}") endif() + +# Format superlu_dist_config.fh from superlu_dist_config.h in C +add_custom_command( + OUTPUT superlu_dist_config.fh + COMMAND sed;'/^\\//;d';<;superlu_dist_config.h;>;superlu_dist_config.fh + COMMAND cp;superlu_dist_config.fh;${SuperLU_DIST_SOURCE_DIR}/FORTRAN/. + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/FORTRAN +) +add_custom_target(config_f DEPENDS superlu_dist_config.fh) diff --git a/FORTRAN/Makefile b/FORTRAN/Makefile index 9a8ea6b8..008e3431 100644 --- a/FORTRAN/Makefile +++ b/FORTRAN/Makefile @@ -8,7 +8,7 @@ # ####################################################################### .SUFFIXES: -.SUFFIXES: .f90 .c .o +.SUFFIXES: .f90 .F90 .c .o include ../make.inc #F90FLAGS = $(FFLAGS) -qfree -qsuffix=f=f90 -qflag=w:w @@ -47,6 +47,9 @@ f_pzdrive3d: $(F_ZEXM3D) $(DSUPERLULIB) $(DFORTRANLIB) .f90.o: $(FORTRAN) $(F90FLAGS) -c $< $(VERBOSE) +.F90.o: + $(FORTRAN) $(F90FLAGS) -c $< $(VERBOSE) + .f.o: $(FORTRAN) $(FFLAGS) -c $< $(VERBOSE) diff --git a/FORTRAN/README b/FORTRAN/README index c26ef8c4..67b93850 100644 --- a/FORTRAN/README +++ b/FORTRAN/README @@ -19,16 +19,30 @@ There are two examples in the directory. 
mpiexec -n 2 f_5x5 (The example is set up to use 2 processors.) -2. f_pddrive.f90, f_pddrive3d.f90 +2. f_pddrive.f90 A real example Fortran driver routine that reads a matrix from a file 'g20.rua' in Harwell-Boeing format. To run the code, type: mpiexec -n 4 f_pddrive (or f_pddrive3d) (The example is set up to use 4 MPI processes) +2. f_pddrive3d.f90: use the 3D algorithms + A real example Fortran driver routine that reads a matrix from a file + 'g20.rua' in Harwell-Boeing format. + To run the code, type: + mpiexec -n 8 f_pddrive3d + (The example is set up to use 8 MPI processes) + 3. f_pzdrive.f90 A complex example Fortran driver routine that reads a matrix from a file 'cg20.cua' in Harwell-Boeing format. To run the code, type: - mpiexec -n 4 f_pzdrive (or f_pddrive3d) + mpiexec -n 4 f_pzdrive (The example is set up to use 4 MPI processes) + +3. f_pzdrive3d.f90: use the 3D algorihms + A complex example Fortran driver routine that reads a matrix from a file + 'cg20.cua' in Harwell-Boeing format. + To run the code, type: + mpiexec -n 8 f_pzdrive3d + (The example is set up to use 8 MPI processes) diff --git a/FORTRAN/f_5x5.f90 b/FORTRAN/f_5x5.F90 similarity index 97% rename from FORTRAN/f_5x5.f90 rename to FORTRAN/f_5x5.F90 index 1b750533..058cf07e 100644 --- a/FORTRAN/f_5x5.f90 +++ b/FORTRAN/f_5x5.F90 @@ -33,13 +33,16 @@ program f_5x5 ! 6. Release the process grid and terminate the MPI environment ! 7. Release all structures ! + #include "superlu_dist_config.fh" use superlu_mod -! implicit none include 'mpif.h' -! include 'superlu_dist_config.fh' integer maxn, maxnz, maxnrhs parameter ( maxn = 10, maxnz = 100, maxnrhs = 10 ) +#if (XSDK_INDEX_SIZE==64) integer*8 colind(maxnz), rowptr(maxn+1) +#else + integer colind(maxnz), rowptr(maxn+1) +#endif real*8 nzval(maxnz), b(maxn), berr(maxnrhs) integer n, m, nnz, nrhs, nprow, npcol, init integer iam, info, i, ierr, ldb @@ -82,6 +85,9 @@ program f_5x5 if ( iam == 0 ) then write(*,*) ' Process grid ', nprow, ' X ', npcol write(*,*) ' default integer size ', kind(0) +#if (XSDK_INDEX_SIZE==64) + write(*,*) ' use 64-bit integer for A matrix' +#endif endif ! !************************************************************************* diff --git a/FORTRAN/f_pddrive.f90 b/FORTRAN/f_pddrive.F90 similarity index 96% rename from FORTRAN/f_pddrive.f90 rename to FORTRAN/f_pddrive.F90 index a67de8f0..e3e2b571 100644 --- a/FORTRAN/f_pddrive.f90 +++ b/FORTRAN/f_pddrive.F90 @@ -29,14 +29,18 @@ program f_pddrive ! 7. Release all structures ! ! + #include "superlu_dist_config.fh" use superlu_mod -! implicit none include 'mpif.h' integer maxn, maxnz, maxnrhs parameter ( maxn = 10000, maxnz = 100000, maxnrhs = 10 ) - integer rowind(maxnz), colptr(maxn) real*8 values(maxnz), b(maxn), berr(maxnrhs), xtrue(maxn) - integer n, m, nnz, nprow, npcol +#if (XSDK_INDEX_SIZE==64) + integer*8 nnz +#else + integer nnz +#endif + integer n, m, nprow, npcol integer*4 iam, info, i, ierr, ldb, nrhs character*80 fname diff --git a/FORTRAN/f_pddrive3d.f90 b/FORTRAN/f_pddrive3d.F90 similarity index 96% rename from FORTRAN/f_pddrive3d.f90 rename to FORTRAN/f_pddrive3d.F90 index ffe0353b..784e1085 100644 --- a/FORTRAN/f_pddrive3d.f90 +++ b/FORTRAN/f_pddrive3d.F90 @@ -41,6 +41,7 @@ program f_pddrive3d ! The program may be run by typing ! mpiexec -np 8 f_pddrive3d ! + #include "superlu_dist_config.fh" use superlu_mod ! 
implicit none include 'mpif.h' @@ -48,7 +49,12 @@ program f_pddrive3d parameter ( maxn = 10000, maxnz = 100000, maxnrhs = 10 ) integer rowind(maxnz), colptr(maxn) real*8 values(maxnz), b(maxn), berr(maxnrhs), xtrue(maxn) - integer n, m, nnz, nprow, npcol, npdep, init +#if (XSDK_INDEX_SIZE==64) + integer*8 nnz +#else + integer nnz +#endif + integer n, m, nprow, npcol, npdep, init integer*4 iam, info, i, ierr, ldb, nrhs character*80 fname diff --git a/FORTRAN/f_psdrive.F90 b/FORTRAN/f_psdrive.F90 new file mode 100644 index 00000000..e7014834 --- /dev/null +++ b/FORTRAN/f_psdrive.F90 @@ -0,0 +1,146 @@ + + +!> @file +!! \brief The driver program to solve a linear system with default options. +!! +!!
+!! -- Distributed SuperLU routine (version 3.2) --
+!! Lawrence Berkeley National Lab, Univ. of California Berkeley.
+!! October, 2012
+!!
+!
+      program f_psdrive
+!
+! Purpose
+! =======
+!
+! The driver program F_PSDRIVE.
+!
+! This example illustrates how to use F_PSGSSVX with the full
+! (default) options to solve a linear system.
+!
+! Seven basic steps are required:
+!   1. Create C structures used in SuperLU_DIST
+!   2. Initialize the MPI environment and the SuperLU process grid
+!   3. Set up the input matrix and the right-hand side
+!   4. Set the options argument
+!   5. Call f_psgssvx
+!   6. Release the process grid and terminate the MPI environment
+!   7. Release all structures
+!
+!
+#include "superlu_dist_config.fh"
+      use superlu_mod
+      include 'mpif.h'
+      integer maxn, maxnz, maxnrhs
+      parameter ( maxn = 10000, maxnz = 100000, maxnrhs = 10 )
+      real*4 values(maxnz), b(maxn), berr(maxnrhs), xtrue(maxn)
+#if (XSDK_INDEX_SIZE==64)
+      integer*8 nnz
+#else
+      integer nnz
+#endif
+      integer n, m, nprow, npcol
+      integer*4 iam, info, i, ierr, ldb, ldx, nrhs
+      character*80 fname
+
+      integer(superlu_ptr) :: grid
+      integer(superlu_ptr) :: options
+      integer(superlu_ptr) :: ScalePermstruct
+      integer(superlu_ptr) :: LUstruct
+      integer(superlu_ptr) :: SOLVEstruct
+      integer(superlu_ptr) :: A
+      integer(superlu_ptr) :: stat
+
+! Initialize MPI environment
+      call mpi_init(ierr)
+
+! Check malloc
+!      call f_check_malloc(iam)
+
+! Create Fortran handles for the C structures used in SuperLU_DIST
+      call f_create_gridinfo_handle(grid)
+      call f_create_options_handle(options)
+      call f_screate_ScalePerm_handle(ScalePermstruct)
+      call f_screate_LUstruct_handle(LUstruct)
+      call f_screate_SOLVEstruct_handle(SOLVEstruct)
+      call f_create_SuperMatrix_handle(A)
+      call f_create_SuperLUStat_handle(stat)
+
+! Initialize the SuperLU_DIST process grid
+      nprow = 2
+      npcol = 2
+      call f_superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, grid)
+
+! Bail out if I do not belong in the grid.
+      call get_GridInfo(grid, iam=iam)
+      if ( iam >= nprow * npcol ) then
+         go to 100
+      endif
+      if ( iam == 0 ) then
+         write(*,*) ' Process grid ', nprow, ' X ', npcol
+      endif
+
+! Read and distribute the matrix to the process grid
+      nrhs = 1
+      fname = '../EXAMPLE/g20.rua'//char(0)  !! make the string null-ended
+      call f_screate_matrix_x_b(fname, A, m, n, nnz, &
+                                nrhs, b, ldb, xtrue, ldx, grid)
+
+      if ( iam == 0 ) then
+         write(*,*) ' Matrix A was set up: m ', m, ' nnz ', nnz
+      endif
+
+! Set the default input options
+      call f_set_default_options(options)
+
+! Change one or more options
+!      call set_superlu_options(options,Fact=FACTORED)
+!      call set_superlu_options(options,ParSymbFact=YES)
+
+! Initialize ScalePermstruct and LUstruct
+      call get_SuperMatrix(A, nrow=m, ncol=n)
+      call f_sScalePermstructInit(m, n, ScalePermstruct)
+      call f_sLUstructInit(m, n, LUstruct)
+
+! Initialize the statistics variables
+      call f_PStatInit(stat)
+
+! Call the linear equation solver
+      call f_psgssvx(options, A, ScalePermstruct, b, ldb, nrhs, &
+                     grid, LUstruct, SOLVEstruct, berr, stat, info)
+
+      if (info == 0) then
+         if ( iam == 0 ) then
+            write (*,*) 'Backward error: ', (berr(i), i = 1, nrhs)
+         endif
+      else
+         write(*,*) 'INFO from f_psgssvx = ', info
+      endif
+
+! Deallocate the storage allocated by SuperLU_DIST
+      call f_PStatFree(stat)
+      call f_Destroy_CompRowLoc_Mat_dist(A)
+      call f_sScalePermstructFree(ScalePermstruct)
+      call f_sDestroy_LU_SOLVE_struct(options, n, grid, LUstruct, SOLVEstruct)
+
+! Release the SuperLU process grid
+100   call f_superlu_gridexit(grid)
+
+!
Deallocate the C structures pointed to by the Fortran handles + call f_destroy_gridinfo_handle(grid) + call f_destroy_options_handle(options) + call f_destroy_ScalePerm_handle(ScalePermstruct) + call f_destroy_LUstruct_handle(LUstruct) + call f_destroy_SOLVEstruct_handle(SOLVEstruct) + call f_destroy_SuperMatrix_handle(A) + call f_destroy_SuperLUStat_handle(stat) + +! Check malloc +! call f_check_malloc(iam) + + +! Terminate the MPI execution environment + call mpi_finalize(ierr) + + stop + end diff --git a/FORTRAN/f_pzdrive.f90 b/FORTRAN/f_pzdrive.F90 similarity index 96% rename from FORTRAN/f_pzdrive.f90 rename to FORTRAN/f_pzdrive.F90 index 7bbe9ead..750e57aa 100644 --- a/FORTRAN/f_pzdrive.f90 +++ b/FORTRAN/f_pzdrive.F90 @@ -28,15 +28,19 @@ program f_pzdrive ! 7. Release all structures ! ! + #include "superlu_dist_config.fh" use superlu_mod -! implicit none include 'mpif.h' integer maxn, maxnz, maxnrhs parameter ( maxn = 10000, maxnz = 100000, maxnrhs = 10 ) - integer rowind(maxnz), colptr(maxn) double complex values(maxnz), b(maxn), xtrue(maxn) real*8 berr(maxnrhs) - integer n, m, nnz, nprow, npcol +#if (XSDK_INDEX_SIZE==64) + integer*8 nnz +#else + integer nnz +#endif + integer n, m, nprow, npcol integer*4 iam, info, i, ierr, ldb, nrhs character*80 fname diff --git a/FORTRAN/f_pzdrive3d.f90 b/FORTRAN/f_pzdrive3d.F90 similarity index 96% rename from FORTRAN/f_pzdrive3d.f90 rename to FORTRAN/f_pzdrive3d.F90 index f64c50d7..f25d8e97 100644 --- a/FORTRAN/f_pzdrive3d.f90 +++ b/FORTRAN/f_pzdrive3d.F90 @@ -40,6 +40,7 @@ program f_pzdrive3d ! The program may be run by typing ! mpiexec -np 8 f_pzdrive3d ! + #include "superlu_dist_config.fh" use superlu_mod ! implicit none include 'mpif.h' @@ -48,7 +49,12 @@ program f_pzdrive3d integer rowind(maxnz), colptr(maxn) double complex values(maxnz), b(maxn), xtrue(maxn) real*8 berr(maxnrhs) - integer n, m, nnz, nprow, npcol, npdep, init +#if (XSDK_INDEX_SIZE==64) + integer*8 nnz +#else + integer nnz +#endif + integer n, m, nprow, npcol, npdep, init integer*4 iam, info, i, ierr, ldb, nrhs character*80 fname diff --git a/FORTRAN/superlu_dist_config.fh b/FORTRAN/superlu_dist_config.fh new file mode 100644 index 00000000..caa86f6b --- /dev/null +++ b/FORTRAN/superlu_dist_config.fh @@ -0,0 +1,11 @@ + + +#define HAVE_PARMETIS TRUE + + + +#define XSDK_INDEX_SIZE 64 + +#if (XSDK_INDEX_SIZE == 64) +#define _LONGINT 1 +#endif From 2a797a326efd92210ef7d57fcf0d4997f4d33e09 Mon Sep 17 00:00:00 2001 From: piyush Date: Wed, 5 May 2021 21:10:47 -0400 Subject: [PATCH 080/147] fixing cudafreeHost error --- SRC/dutil_dist.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/SRC/dutil_dist.c b/SRC/dutil_dist.c index a8653599..c5a813fb 100644 --- a/SRC/dutil_dist.c +++ b/SRC/dutil_dist.c @@ -455,11 +455,7 @@ int dDeAllocLlu_3d(int_t n, dLUstruct_t * LUstruct, gridinfo3d_t* grid3d) for (i = 0; i < nbc; ++i) if ( Llu->Lrowind_bc_ptr[i] ) { SUPERLU_FREE (Llu->Lrowind_bc_ptr[i]); -#ifdef GPU_ACC - checkCuda(cudaFreeHost(Llu->Lnzval_bc_ptr[i])); -#else SUPERLU_FREE (Llu->Lnzval_bc_ptr[i]); -#endif } SUPERLU_FREE (Llu->Lrowind_bc_ptr); SUPERLU_FREE (Llu->Lnzval_bc_ptr); From 6b3beddc8c427afbe81dc14b0b5066830c5ec3e9 Mon Sep 17 00:00:00 2001 From: Piyush Sao Date: Thu, 6 May 2021 16:28:13 -0400 Subject: [PATCH 081/147] remove cudafreehost --- SRC/zutil_dist.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/SRC/zutil_dist.c b/SRC/zutil_dist.c index 4687bf39..ca27f360 100644 --- a/SRC/zutil_dist.c +++ b/SRC/zutil_dist.c @@ -456,11 +456,8 @@ int 
zDeAllocLlu_3d(int_t n, zLUstruct_t * LUstruct, gridinfo3d_t* grid3d) for (i = 0; i < nbc; ++i) if ( Llu->Lrowind_bc_ptr[i] ) { SUPERLU_FREE (Llu->Lrowind_bc_ptr[i]); -#ifdef GPU_ACC - checkCuda(cudaFreeHost(Llu->Lnzval_bc_ptr[i])); -#else - SUPERLU_FREE (Llu->Lnzval_bc_ptr[i]); -#endif + SUPERLU_FREE (Llu->Lnzval_bc_ptr[i]); + } SUPERLU_FREE (Llu->Lrowind_bc_ptr); SUPERLU_FREE (Llu->Lnzval_bc_ptr); From ec4f7737ae36a20185c2ce804e16377418cf1e06 Mon Sep 17 00:00:00 2001 From: Xiaoye Li Date: Sat, 8 May 2021 14:37:25 -0400 Subject: [PATCH 082/147] Add new file superlu_c2f_wrap.c (precision-independent). --- FORTRAN/superlu_c2f_wrap.c | 261 +++++++++++++++++++++++++++++++++++++ SRC/cublas_utils.c | 2 +- 2 files changed, 262 insertions(+), 1 deletion(-) create mode 100644 FORTRAN/superlu_c2f_wrap.c diff --git a/FORTRAN/superlu_c2f_wrap.c b/FORTRAN/superlu_c2f_wrap.c new file mode 100644 index 00000000..d35081ab --- /dev/null +++ b/FORTRAN/superlu_c2f_wrap.c @@ -0,0 +1,261 @@ + + +/*! @file + * \brief C interface functions for the Fortran90 wrapper. + * + *
+ * -- Distributed SuperLU routine (version 4.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 2012
+ * April 5, 2015
+ */
+
+#include "superlu_defs.h"
+#include "superlu_FCnames.h"
+
+/* kind of integer to hold a pointer.
+   Be sure to be consistent with that in superlupara.f90 */
+#if 0
+typedef int fptr;  /* 32-bit */
+#else
+typedef long long int fptr;  /* 64-bit */
+#endif
+
+
+/* some MPI implementations may require conversion between a Fortran
+   communicator and a C communicator.  This routine is used to perform the
+   conversion.  It may need different forms for different MPI libraries. */
+
+/* NO_MPI2 should be defined on the compiler command line if the MPI
+   library does not provide MPI_Comm_f2c */
+
+MPI_Comm f2c_comm(int *f_comm)
+{
+#ifndef NO_MPI2
+
+/* MPI 2 provides a standard way of doing this */
+   return MPI_Comm_f2c((MPI_Fint)(*f_comm));
+#else
+
+/* will probably need some special cases here */
+/* when in doubt, just return the input */
+   return (MPI_Comm)(*f_comm);
+#endif
+}
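f2c_comm is the linchpin of this interface: Fortran passes communicators as plain INTEGERs, and every grid routine below funnels that integer through it before touching MPI. A minimal sketch of the round trip, assuming an MPI-2 library (so MPI_Comm_c2f/MPI_Comm_f2c exist and MPI_Fint is interchangeable with int); the demo_ function name is illustrative and not part of the patch:

```
#include <mpi.h>

MPI_Comm f2c_comm(int *f_comm);    /* the wrapper defined above */

void demo_f2c_comm(void)
{
    /* A Fortran caller stores MPI_COMM_WORLD as an INTEGER handle;
       MPI_Comm_c2f produces exactly that representation from C.   */
    int f_comm = (int) MPI_Comm_c2f(MPI_COMM_WORLD);

    /* f2c_comm recovers a C-side MPI_Comm that the library can use. */
    MPI_Comm c_comm = f2c_comm(&f_comm);

    int rank;
    MPI_Comm_rank(c_comm, &rank);  /* behaves like the original comm */
}
```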
+
+
+/* functions that create memory for a struct and return a handle */
+
+void f_create_gridinfo_handle(fptr *handle)
+{
+   *handle = (fptr) SUPERLU_MALLOC(sizeof(gridinfo_t));
+}
+
+void f_create_gridinfo3d_handle(fptr *handle)
+{
+   *handle = (fptr) SUPERLU_MALLOC(sizeof(gridinfo3d_t));
+}
+
+void f_create_options_handle(fptr *handle)
+{
+   *handle = (fptr) SUPERLU_MALLOC(sizeof(superlu_dist_options_t));
+}
+
+void f_create_SuperMatrix_handle(fptr *handle)
+{
+   *handle = (fptr) SUPERLU_MALLOC(sizeof(SuperMatrix));
+}
+
+void f_create_SuperLUStat_handle(fptr *handle)
+{
+   *handle = (fptr) SUPERLU_MALLOC(sizeof(SuperLUStat_t));
+}
+
+/* functions that free the memory allocated by the above functions */
+
+void f_destroy_gridinfo_handle(fptr *handle)
+{
+   SUPERLU_FREE((void *)*handle);
+}
+
+void f_destroy_options_handle(fptr *handle)
+{
+   SUPERLU_FREE((void *)*handle);
+}
+
+void f_destroy_ScalePerm_handle(fptr *handle)
+{
+   SUPERLU_FREE((void *)*handle);
+}
+
+void f_destroy_LUstruct_handle(fptr *handle)
+{
+   SUPERLU_FREE((void *)*handle);
+}
+
+void f_destroy_SOLVEstruct_handle(fptr *handle)
+{
+   SUPERLU_FREE((void *)*handle);
+}
+
+void f_destroy_SuperMatrix_handle(fptr *handle)
+{
+   SUPERLU_FREE((void *)*handle);
+}
+
+void f_destroy_SuperLUStat_handle(fptr *handle)
+{
+   SUPERLU_FREE((void *)*handle);
+}
+
+/* functions that get or set values in a C struct.
+   This is not the complete set of structs for which a user might want
+   to get/set a component, and there may be missing components. */
+
+void f_get_gridinfo(fptr *grid, int *iam, int *nprow, int *npcol)
+{
+  *iam=((gridinfo_t *) *grid)->iam;
+  *npcol=((gridinfo_t *) *grid)->npcol;
+  *nprow=((gridinfo_t *) *grid)->nprow;
+}
+
+void f_get_gridinfo3d(fptr *grid, int *iam,
+         	      int *nprow, int *npcol, int *npdep)
+{
+  *iam=((gridinfo3d_t *) *grid)->iam;
+  *npcol=((gridinfo3d_t *) *grid)->npcol;
+  *nprow=((gridinfo3d_t *) *grid)->nprow;
+  *npdep=((gridinfo3d_t *) *grid)->npdep;
+}
+
+void f_get_SuperMatrix(fptr *A, int *nrow, int *ncol)
+{
+   *nrow = ((SuperMatrix *) *A)->nrow;
+   *ncol = ((SuperMatrix *) *A)->ncol;
+}
+
+void f_set_SuperMatrix(fptr *A, int *nrow, int *ncol)
+{
+   ((SuperMatrix *) *A)->nrow = *nrow;
+   ((SuperMatrix *) *A)->ncol = *ncol;
+}
+
+void f_get_CompRowLoc_Matrix(fptr *A, int *m, int *n, int_t *nnz_loc,
+			     int *m_loc, int *fst_row)
+{
+  *m=((SuperMatrix *) *A)->nrow;
+  *n=((SuperMatrix *) *A)->ncol;
+  *m_loc=((NRformat_loc *) ((SuperMatrix *) *A)->Store)->m_loc;
+  *nnz_loc=((NRformat_loc *) ((SuperMatrix *) *A)->Store)->nnz_loc;
+  *fst_row=((NRformat_loc *) ((SuperMatrix *) *A)->Store)->fst_row;
+}
+
+void f_set_CompRowLoc_Matrix(fptr *A, int *m, int *n, int_t *nnz_loc,
+			     int *m_loc, int *fst_row)
+{
+  NRformat_loc *Astore = ((SuperMatrix *) *A)->Store;
+
+  ((SuperMatrix *) *A)->nrow = *m;
+  ((SuperMatrix *) *A)->ncol = *n;
+  Astore->m_loc = *m_loc;
+  Astore->nnz_loc = *nnz_loc;
+  Astore->fst_row = *fst_row;
+}
+
+void f_get_superlu_options(fptr *opt, int *Fact, int *Equil, int *ParSymbFact,
+                           int *ColPerm, int *RowPerm, int *IterRefine,
+			   int *Trans, int *ReplaceTinyPivot,
+			   int *SolveInitialized, int *RefineInitialized,
+			   int *PrintStat)
+{
+   *Fact = (int) ((superlu_dist_options_t *) *opt)->Fact;
+   *Equil = (int) ((superlu_dist_options_t *) *opt)->Equil;
+   *ParSymbFact = (int) ((superlu_dist_options_t *) *opt)->ParSymbFact;
+   *ColPerm = (int) ((superlu_dist_options_t *) *opt)->ColPerm;
+   *RowPerm = (int) ((superlu_dist_options_t *) *opt)->RowPerm;
+   *IterRefine = (int) ((superlu_dist_options_t *) *opt)->IterRefine;
+   *Trans = (int) ((superlu_dist_options_t *) *opt)->Trans;
+   *ReplaceTinyPivot = (int) ((superlu_dist_options_t *) *opt)->ReplaceTinyPivot;
+   *SolveInitialized = (int) ((superlu_dist_options_t *) *opt)->SolveInitialized;
+   *RefineInitialized = (int) ((superlu_dist_options_t *) *opt)->RefineInitialized;
+   *PrintStat = (int) ((superlu_dist_options_t *) *opt)->PrintStat;
+}
+
+void f_set_superlu_options(fptr *opt, int *Fact, int *Equil, int *ParSymbFact,
+                           int *ColPerm, int *RowPerm, int *IterRefine,
+			   int *Trans, int *ReplaceTinyPivot,
+			   int *SolveInitialized, int *RefineInitialized,
+			   int *PrintStat)
+{
+    superlu_dist_options_t *l_options = (superlu_dist_options_t*) *opt;
+    l_options->Fact = (fact_t) *Fact;
+   ((superlu_dist_options_t *) *opt)->Equil = (yes_no_t) *Equil;
+   ((superlu_dist_options_t *) *opt)->ParSymbFact = (yes_no_t) *ParSymbFact;
+   ((superlu_dist_options_t *) *opt)->ColPerm = (colperm_t) *ColPerm;
+   ((superlu_dist_options_t *) *opt)->RowPerm = (rowperm_t) *RowPerm;
+   ((superlu_dist_options_t *) *opt)->IterRefine = (IterRefine_t) *IterRefine;
+   ((superlu_dist_options_t *) *opt)->Trans = (trans_t) *Trans;
+   ((superlu_dist_options_t *) *opt)->ReplaceTinyPivot = (yes_no_t) *ReplaceTinyPivot;
+   ((superlu_dist_options_t *) *opt)->SolveInitialized = (yes_no_t) *SolveInitialized;
+   ((superlu_dist_options_t *) *opt)->RefineInitialized = (yes_no_t) *RefineInitialized;
+   ((superlu_dist_options_t *) *opt)->PrintStat = (yes_no_t) *PrintStat;
+}
+
+/* wrappers for SuperLU functions */
+
+void f_set_default_options(fptr *options)
+{
+   set_default_options_dist((superlu_dist_options_t *) *options);
+}
+
+void f_superlu_gridinit(int *Bcomm, int *nprow, int *npcol, fptr *grid)
+{
+   superlu_gridinit(f2c_comm(Bcomm), *nprow, *npcol, (gridinfo_t *) *grid);
+}
+
+void f_superlu_gridinit3d(int *Bcomm, int *nprow, int *npcol,
+   			  int *npdep, fptr *grid)
+{
+    superlu_gridinit3d(f2c_comm(Bcomm), *nprow, *npcol, *npdep, (gridinfo3d_t *) *grid);
+}
+
+void f_superlu_gridmap(int *Bcomm, int *nprow, int *npcol,
+                       int *usermap, int *ldumap, fptr *grid)
+{
+   superlu_gridmap(f2c_comm(Bcomm), *nprow, *npcol, usermap, *ldumap,
+		   (gridinfo_t *) *grid);
+}
+
+void f_superlu_gridexit(fptr *grid)
+{
+   superlu_gridexit((gridinfo_t *) *grid);
+}
+
+void f_PStatInit(fptr *stat)
+{
+   PStatInit((SuperLUStat_t *) *stat);
+}
+
+void f_PStatFree(fptr *stat)
+{
+   PStatFree((SuperLUStat_t *) *stat);
+}
+
+void f_Destroy_CompRowLoc_Mat_dist(fptr *A)
+{
+   Destroy_CompRowLoc_Matrix_dist((SuperMatrix *) *A);
+}
+
+void f_Destroy_SuperMat_Store_dist(fptr *A)
+{
+   Destroy_SuperMatrix_Store_dist((SuperMatrix *) *A);
+}
+
+/* Check malloc */
+
+void f_check_malloc(int *iam)
+{
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(*iam, "Check Malloc");
+#endif
+}
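Taken together, these routines implement an opaque-handle pattern: the Fortran side only ever holds fptr integers, and every struct is created, queried, and freed on the C heap. A minimal sketch of the lifecycle, written in C against the wrappers above; the 2x2 grid shape and the demo_ name are illustrative, and a real Fortran driver such as f_psdrive performs the same sequence:

```
void demo_handle_lifecycle(void)
{
    fptr grid;                    /* opaque handle held by the caller */
    int nprow = 2, npcol = 2, iam;
    int f_comm = (int) MPI_Comm_c2f(MPI_COMM_WORLD);

    f_create_gridinfo_handle(&grid);                    /* allocate gridinfo_t */
    f_superlu_gridinit(&f_comm, &nprow, &npcol, &grid); /* populate it         */
    f_get_gridinfo(&grid, &iam, &nprow, &npcol);        /* read fields back    */

    /* ... factor and solve through the precision-specific wrappers ... */

    f_superlu_gridexit(&grid);        /* release the grid's MPI state */
    f_destroy_gridinfo_handle(&grid); /* free the struct itself       */
}
```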
diff --git a/SRC/cublas_utils.c b/SRC/cublas_utils.c
index 0b40659e..fba1a3fa 100644
--- a/SRC/cublas_utils.c
+++ b/SRC/cublas_utils.c
@@ -10,7 +10,7 @@ at the top-level directory.
 */
 #include "superlu_defs.h"
 
-#ifdef GPU_ACC  // enable CUDA
+#ifdef GPU_ACC  //////////////////////////////  enable CUDA
 
  void DisplayHeader()
 {

From f7959a9aed05c929c34e09f4a0699f0c294a84a9 Mon Sep 17 00:00:00 2001
From: Xiaoye Li 
Date: Sun, 9 May 2021 13:06:20 -0400
Subject: [PATCH 083/147] updating README.

---
 CMakeLists.txt |  6 +++---
 README.md      | 46 ++++++++++++++++++++++++++++------------------
 SRC/pdgstrf.c  |  2 +-
 SRC/pzgstrf.c  |  2 +-
 4 files changed, 33 insertions(+), 23 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 03ed5c2c..1c82490c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -27,13 +27,13 @@ option(enable_examples  "Build examples" ON)
 #-- BLAS
 option(TPL_ENABLE_INTERNAL_BLASLIB  "Build the CBLAS library" ${enable_blaslib_DEFAULT})
 option(TPL_BLAS_LIBRARIES "List of absolute paths to blas libraries [].")
-#-- LAPACK
-option(TPL_ENABLE_LAPACKLIB "Enable LAPACK library" ON)
-option(TPL_LAPACK_LIBRARIES "List of absolute paths to LAPACK libraries [].")
 #-- ParMETIS
 option(TPL_ENABLE_PARMETISLIB   "Build the ParMETIS library" ON)
 option(TPL_PARMETIS_LIBRARIES "List of absolute paths to ParMETIS link libraries [].")
 option(TPL_PARMETIS_INCLUDE_DIRS "List of absolute paths to ParMETIS include directories [].")
+#-- LAPACK
+option(TPL_ENABLE_LAPACKLIB "Enable LAPACK library" OFF)
+option(TPL_LAPACK_LIBRARIES "List of absolute paths to LAPACK libraries [].")
 #-- CombBLAS
 option(TPL_ENABLE_COMBBLASLIB   "Build the CombBLAS library" OFF)
 option(TPL_COMBBLAS_LIBRARIES "List of absolute paths to CombBLAS link libraries [].")
diff --git a/README.md b/README.md
index 2b3a7de6..1b4b2a52 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# SuperLU_DIST (version 6.4)   superlu
+SuperLU_DIST (version 7.0)   superlu
 
 [![Build Status](https://travis-ci.org/xiaoyeli/superlu_dist.svg?branch=master)](https://travis-ci.org/xiaoyeli/superlu_dist) 
 [Nightly tests](http://my.cdash.org/index.php?project=superlu_dist)
@@ -19,7 +19,7 @@ column preordering for sparsity are performed sequentially.
 This "alpha" release contains double-precision real and double-precision
 complex data types.
 
-### The distribution contains the following directory structure:
+# The distribution contains the following directory structure:
 
 ```
 SuperLU_DIST/README    instructions on installation
@@ -43,8 +43,8 @@ SuperLU_DIST/MAKE_INC/ sample machine-specific make.inc files
 
 ## INSTALLATION
 
-There are two ways to install the package. One requires users to 
-edit makefile manually, the other uses CMake automatic build system.
+There are two ways to install the package. The first method is to use the
+CMake automatic build system. The other method requires users to edit the
+makefile manually.
 The procedures are described below.
 
 ### Installation option 1: Using CMake build system.
@@ -59,9 +59,9 @@ export PARMETIS_ROOT=
 export PARMETIS_BUILD_DIR=${PARMETIS_ROOT}/build/Linux-x86_64
 ```
 
-Second, in order to use parallel weighted matching AWPM for numerical
-pre-pivoting, you need to install CombBLAS and define the environment
-variable:
+Second, in order to use parallel weighted matching HWPM (Heavy Weight
+Perfect Matching) for numerical pre-pivoting, you need to install 
+CombBLAS and define the environment variable:
 
 ```
 export COMBBLAS_ROOT=
@@ -79,7 +79,6 @@ cmake .. \
     -DTPL_PARMETIS_INCLUDE_DIRS="${PARMETIS_ROOT}/include;${PARMETIS_ROOT}/metis/include" \
     -DTPL_PARMETIS_LIBRARIES="${PARMETIS_BUILD_DIR}/libparmetis/libparmetis.a;${PARMETIS_BUILD_DIR}/libmetis/libmetis.a" \
 ```
-
 For a more sophisticated installation including third-party libraries, do:
 ```
 cmake .. \
@@ -133,20 +132,32 @@ execution may fail. You can pass the definition option "-DMPIEXEC_EXECUTABLE"
 to cmake. For example on Cori at NERSC, you will need the following:
 `-DMPIEXEC_EXECUTABLE=/usr/bin/srun`
 
-Or, you can always go to TEST/ directory to perform
-testing manually.
+Or, you can always go to the TEST/ directory to perform testing manually.
 
-**Note on the C-Fortran name mangling handled by C preprocessor definition:**  
-In the default setting, we assume that Fortran expects a C routine
-to have an underscore postfixed to the name. Depending on the
-compiler, you may need to define one of the following flags in
-during the cmake build to overwrite default setting:
+**SUMMARY of the CMake definitions:**
+The first one in the list of choices is the default setting.
 ```
-cmake .. -DCMAKE_C_FLAGS="-DNoChange" 
-cmake .. -DCMAKE_C_FLAGS="-DUpCase"
+    -DTPL_ENABLE_INTERNAL_BLASLIB=OFF | ON
+    -DTPL_ENABLE_PARMETISLIB=ON | OFF
+    -DTPL_ENABLE_LAPACKLIB=OFF | ON
+    -DTPL_ENABLE_COMBBLASLIB=OFF | ON
+    -DTPL_ENABLE_CUDALIB=OFF | ON
+    -Denable_complex16=OFF | ON
+    -DXSDK_INDEX_SIZE=32 | 64
+
+    -DXSDK_ENABLE_Fortran=OFF | ON
+    -DCMAKE_Fortran_COMPILER=<...>
+    -DBUILD_SHARED_LIBS=OFF | ON
+    -DCMAKE_INSTALL_PREFIX=<...>
+    -DCMAKE_C_COMPILER=<...>
+    -DCMAKE_C_FLAGS="..."
+    -DCMAKE_CXX_COMPILER=<...>
+    -DCMAKE_CXX_FLAGS="..."
+    -DCMAKE_CUDA_FLAGS="..."
 ```
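
As an illustration, a hypothetical 64-bit-index build with Fortran and complex16 enabled would combine the definitions above like this; the ParMETIS variables are the ones exported earlier, and the compiler name is a placeholder for your MPI Fortran wrapper:
```
cmake .. \
    -DTPL_PARMETIS_INCLUDE_DIRS="${PARMETIS_ROOT}/include;${PARMETIS_ROOT}/metis/include" \
    -DTPL_PARMETIS_LIBRARIES="${PARMETIS_BUILD_DIR}/libparmetis/libparmetis.a;${PARMETIS_BUILD_DIR}/libmetis/libmetis.a" \
    -DXSDK_INDEX_SIZE=64 \
    -Denable_complex16=ON \
    -DXSDK_ENABLE_Fortran=ON \
    -DCMAKE_Fortran_COMPILER=mpif90
```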
 
 
+
 ### Installation option 2: Manual installation with makefile.
 Before installing the package, please examine the three things dependent 
 on your system setup:
@@ -302,7 +313,6 @@ A Makefile is provided in each subdirectory. The installation can be done
 completely automatically by simply typing "make" at the top level.
 
 
-
 ## Windows Usage
 Prerequisites: CMake, Visual Studio, Microsoft HPC Pack
 This has been tested with Visual Studio 2017, without Parmetis,
diff --git a/SRC/pdgstrf.c b/SRC/pdgstrf.c
index d4ae6bb4..d481bf7c 100644
--- a/SRC/pdgstrf.c
+++ b/SRC/pdgstrf.c
@@ -900,7 +900,7 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     bigv_size += (gemm_m_pad * (j + max_row_size + gemm_n_pad));
 
 #if ( PRNTlevel>=1 )
-    printf("[%d].. BIG V size %d (on CPU)\n", iam, bigv_size);
+    printf("[%d].. BIG V size " IFMT " (on CPU)\n", iam, bigv_size);
     fflush(stdout);
 #endif
 
diff --git a/SRC/pzgstrf.c b/SRC/pzgstrf.c
index d6650011..a2d693f7 100644
--- a/SRC/pzgstrf.c
+++ b/SRC/pzgstrf.c
@@ -900,7 +900,7 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     bigv_size += (gemm_m_pad * (j + max_row_size + gemm_n_pad));
 
 #if ( PRNTlevel>=1 )
-    printf("[%d].. BIG V size %d (on CPU)\n", iam, bigv_size);
+    printf("[%d].. BIG V size " IFMT " (on CPU)\n", iam, bigv_size);
     fflush(stdout);
 #endif
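Both printf hunks in this patch fix the same portability issue: bigv_size has type int_t, whose width follows XSDK_INDEX_SIZE, so a hard-coded %d truncates it in 64-bit-index builds. A small sketch of the idiom, assuming IFMT is the int_t conversion specifier supplied by superlu_defs.h (e.g. "%8d" for 32-bit builds, "%8lld" for 64-bit builds):

```
#include <stdio.h>
#include "superlu_defs.h"   /* assumed to define int_t and IFMT */

void print_bigv_size(int iam, int_t bigv_size)
{
    /* Adjacent string literals concatenate, so IFMT splices the
       correct conversion for int_t into the format at compile time. */
    printf("[%d].. BIG V size " IFMT " (on CPU)\n", iam, bigv_size);
}
```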
 

From c68fae311a713a8e3b0c3329e0ac3f87c9dfe0ec Mon Sep 17 00:00:00 2001
From: Sherry Li 
Date: Sun, 9 May 2021 10:25:57 -0700
Subject: [PATCH 084/147] Adding TOC in README.md

---
 README.md | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/README.md b/README.md
index 1b4b2a52..0bc5ab20 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,27 @@
+
+Table of Contents
+=================
+
+* [The distribution contains the following directory structure:](#the-distribution-contains-the-following-directory-structure)
+   * [INSTALLATION](#installation)
+      * [Installation option 1: Using CMake build system.](#installation-option-1-using-cmake-build-system)
+      * [Installation option 2: Manual installation with makefile.](#installation-option-2-manual-installation-with-makefile)
+         * [1.1 Edit the make.inc include file.](#11-edit-the-makeinc-include-file)
+         * [1.2. The BLAS library.](#12-the-blas-library)
+         * [1.3. External libraries.](#13-external-libraries)
+            * [1.3.1 LAPACK.](#131-lapack)
+            * [1.3.2 Metis and ParMetis.](#132-metis-and-parmetis)
+            * [1.3.3 CombBLAS.](#133-combblas)
+         * [1.4. C preprocessor definition CDEFS. (Replaced by cmake module FortranCInterface.)](#14-c-preprocessor-definition-cdefs-replaced-by-cmake-module-fortrancinterface)
+         * [1.5. Multicore and GPU (optional).](#15-multicore-and-gpu-optional)
+   * [Windows Usage](#windows-usage)
+   * [READING SPARSE MATRIX FILES](#reading-sparse-matrix-files)
+   * [REFERENCES](#references)
+   * [RELEASE VERSIONS](#release-versions)
+
+Created by [gh-md-toc](https://github.com/ekalinin/github-markdown-toc)
+
+
 SuperLU_DIST (version 7.0)   superlu
 
 [![Build Status](https://travis-ci.org/xiaoyeli/superlu_dist.svg?branch=master)](https://travis-ci.org/xiaoyeli/superlu_dist) 

From 8a91c72ed9c43257c79eb5c86042c5887ab3d464 Mon Sep 17 00:00:00 2001
From: "X. Sherry Li" 
Date: Sun, 9 May 2021 10:27:48 -0700
Subject: [PATCH 085/147] Update README.md

---
 README.md | 41 +++++++++++++++++++++--------------------
 1 file changed, 21 insertions(+), 20 deletions(-)

diff --git a/README.md b/README.md
index 0bc5ab20..d63a0e28 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,24 @@
+# SuperLU_DIST (version 7.0)   superlu
+
+[![Build Status](https://travis-ci.org/xiaoyeli/superlu_dist.svg?branch=master)](https://travis-ci.org/xiaoyeli/superlu_dist) 
+[Nightly tests](http://my.cdash.org/index.php?project=superlu_dist)
+
+SuperLU_DIST contains a set of subroutines to solve a sparse linear system 
+A*X=B. It uses Gaussian elimination with static pivoting (GESP). 
+Static pivoting is a technique that combines the numerical stability of
+partial pivoting with the scalability of Cholesky (no pivoting),
+to run accurately and efficiently on large numbers of processors. 
+
+SuperLU_DIST is a parallel extension to the serial SuperLU library.
+It is targeted for distributed-memory parallel machines.
+SuperLU_DIST is implemented in ANSI C, and MPI for communications.
+Currently, the LU factorization and triangular solution routines,
+which are the most time-consuming part of the solution process,
+are parallelized. The other routines, such as static pivoting and 
+column preordering for sparsity are performed sequentially. 
+This "alpha" release contains double-precision real and double-precision
+complex data types.
+
 
 Table of Contents
 =================
@@ -22,26 +43,6 @@ Table of Contents
 Created by [gh-md-toc](https://github.com/ekalinin/github-markdown-toc)
 
 
-SuperLU_DIST (version 7.0)   superlu
-
-[![Build Status](https://travis-ci.org/xiaoyeli/superlu_dist.svg?branch=master)](https://travis-ci.org/xiaoyeli/superlu_dist) 
-[Nightly tests](http://my.cdash.org/index.php?project=superlu_dist)
-
-SuperLU_DIST contains a set of subroutines to solve a sparse linear system 
-A*X=B. It uses Gaussian elimination with static pivoting (GESP). 
-Static pivoting is a technique that combines the numerical stability of
-partial pivoting with the scalability of Cholesky (no pivoting),
-to run accurately and efficiently on large numbers of processors. 
-
-SuperLU_DIST is a parallel extension to the serial SuperLU library.
-It is targeted for the distributed memory parallel machines.
-SuperLU_DIST is implemented in ANSI C, and MPI for communications.
-Currently, the LU factorization and triangular solution routines,
-which are the most time-consuming part of the solution process,
-are parallelized. The other routines, such as static pivoting and 
-column preordering for sparsity are performed sequentially. 
-This "alpha" release contains double-precision real and double-precision
-complex data types.
 
 # The distribution contains the following directory structure:
 

From 19d499909c35bb75b8a4eb720f43645f6fe2c1f1 Mon Sep 17 00:00:00 2001
From: "X. Sherry Li" 
Date: Sun, 9 May 2021 10:32:46 -0700
Subject: [PATCH 086/147] Update README.md

---
 README.md | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index d63a0e28..67e27adc 100644
--- a/README.md
+++ b/README.md
@@ -44,7 +44,7 @@ Created by [gh-md-toc](https://github.com/ekalinin/github-markdown-toc)
 
 
 
-# The distribution contains the following directory structure:
+# Directory structure of the source code
 
 ```
 SuperLU_DIST/README    instructions on installation
@@ -66,13 +66,13 @@ SuperLU_DIST/make.inc  compiler, compiler flags, library definitions and C
 SuperLU_DIST/MAKE_INC/ sample machine-specific make.inc files
 ```
 
-## INSTALLATION
+# INSTALLATION
 
 There are two ways to install the package. The first method is to use
 CMake automatic build system. The other method requires users to edit
 a make.inc include file manually. The procedures are described below.
 
-### Installation option 1: Using CMake build system.
+## Installation option 1: Using CMake build system.
 You will need to create a build tree from which to invoke CMake.
 
 First, in order to use parallel symbolic factorization function, you
@@ -159,7 +159,7 @@ to cmake. For example on Cori at NERSC, you will need the following:
 
 Or, you can always go to TEST/ directory to perform testing manually.
 
-**SUMMARY of the CMake definitions:**
+### SUMMARY of the CMake definitions:
 The first one in the list of choices is the default setting.
 ```
     -DTPL_ENABLE_INTERNAL_BLASLIB=OFF | ON
@@ -183,11 +183,11 @@ The first one in the list of choices is the default setting.
 
 
 
-### Installation option 2: Manual installation with makefile.
+## Installation option 2: Manual installation with makefile.
 Before installing the package, please examine the three things dependent 
 on your system setup:
 
-#### 1.1 Edit the make.inc include file.
+### 1.1 Edit the make.inc include file.
 
 This make include file is referenced inside each of the Makefiles
 in the various subdirectories. As a result, there is no need to 
@@ -219,7 +219,7 @@ printing level to show solver's execution details. (default 0)
 diagnostic printing level for debugging purpose. (default 0)
 ```      
 
-#### 1.2. The BLAS library.
+### 1.2. The BLAS library.
 
 The parallel routines in SuperLU_DIST use some BLAS routines on each MPI
 process. Moreover, if you enable OpenMP with multiple threads, you need to
@@ -246,7 +246,7 @@ top-level SuperLU_DIST/ directory and do the following:
 to make the BLAS library from the routines in the
 ` CBLAS/ subdirectory.`
 
-#### 1.3. External libraries. 
+### 1.3. External libraries. 
 
   ##### 1.3.1 LAPACK.
   Starting Version 6.0, the triangular solve routine can perform explicit
@@ -261,7 +261,7 @@ You can disable LAPACK with the following line in SRC/superlu_dist_config.h:
 #undef SLU_HAVE_LAPACK
 ```
 
-  ##### 1.3.2 Metis and ParMetis.
+  #### 1.3.2 Metis and ParMetis.
 
 If you will use Metis or ParMetis for sparsity ordering, you will
 need to install them yourself. Since ParMetis package already
@@ -281,7 +281,7 @@ You can disable ParMetis with the following line in SRC/superlu_dist_config.h:
 #undef HAVE_PARMETIS
 ```
 
- ##### 1.3.3 CombBLAS.
+ #### 1.3.3 CombBLAS.
 
 You can use parallel approximate weight perfect matching (AWPM) algorithm
 to perform numerical pre-pivoting for stability. The default pre-pivoting
@@ -302,7 +302,7 @@ You can disable CombBLAS with the following line in SRC/superlu_dist_config.h:
 ```
 
 
-#### 1.4. C preprocessor definition CDEFS. (Replaced by cmake module FortranCInterface.)
+### 1.4. C preprocessor definition CDEFS. (Replaced by cmake module FortranCInterface.)
 
 In the header file SRC/Cnames.h, we use macros to determine how
 C routines should be named so that they are callable by Fortran.
@@ -319,7 +319,7 @@ The possible options for CDEFS are:
 -DUpCase: Fortran expects a C routine name to be all uppercase.
 ```
 
-#### 1.5. Multicore and GPU (optional).
+### 1.5. Multicore and GPU.
 
 To use OpenMP parallelism, you need to link with an OpenMP library, and
 set the number of threads you wish to use as follows (bash):
@@ -441,4 +441,5 @@ February 8, 2019    Version 6.1.1
 November 12, 2019   Version 6.2.0
 February 23, 2020   Version 6.3.0
 October 23, 2020    Version 6.4.0
+May 10, 2021        Version 7.0.0
 ```

From 41567aea20205c43656c65dd4daeb0ceef801e7d Mon Sep 17 00:00:00 2001
From: Xiaoye Li 
Date: Sun, 9 May 2021 20:37:11 -0400
Subject: [PATCH 087/147] Make MAX_BUFFER_SIZE to be the 8th query case in
 sp_ienv_dist().

---
 EXAMPLE/sp_ienv.c              |  2 +-
 FORTRAN/sp_ienv.c              |  2 +-
 FORTRAN/superlu_dist_config.fh |  1 +
 SRC/pdgstrf.c                  | 10 +++++-----
 SRC/pdgstrf3d.c                |  2 +-
 SRC/pzgstrf.c                  | 10 +++++-----
 SRC/pzgstrf3d.c                |  2 +-
 SRC/sp_ienv.c                  |  9 +++++++--
 SRC/superlu_defs.h             |  2 +-
 SRC/superlu_dist_config.h      |  2 +-
 10 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/EXAMPLE/sp_ienv.c b/EXAMPLE/sp_ienv.c
index c9985cf1..651a1be7 100644
--- a/EXAMPLE/sp_ienv.c
+++ b/EXAMPLE/sp_ienv.c
@@ -68,7 +68,7 @@ at the top-level directory.
 
 
 int
-sp_ienv_dist(int_t ispec)
+sp_ienv_dist(int ispec)
 {
     // printf(" this function called\n");
     int i;
diff --git a/FORTRAN/sp_ienv.c b/FORTRAN/sp_ienv.c
index 99df0d25..56a07726 100644
--- a/FORTRAN/sp_ienv.c
+++ b/FORTRAN/sp_ienv.c
@@ -68,7 +68,7 @@ at the top-level directory.
 
 
 int
-sp_ienv_dist(int_t ispec)
+sp_ienv_dist(int ispec)
 {
     // printf(" this function called\n");
     int i;
diff --git a/FORTRAN/superlu_dist_config.fh b/FORTRAN/superlu_dist_config.fh
index caa86f6b..878933ff 100644
--- a/FORTRAN/superlu_dist_config.fh
+++ b/FORTRAN/superlu_dist_config.fh
@@ -1,4 +1,5 @@
 
+#define HAVE_CUDA TRUE
 
 #define HAVE_PARMETIS TRUE
 
diff --git a/SRC/pdgstrf.c b/SRC/pdgstrf.c
index d481bf7c..f26c9e66 100644
--- a/SRC/pdgstrf.c
+++ b/SRC/pdgstrf.c
@@ -771,8 +771,8 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     int cublas_nb = get_cublas_nb(); // default 64
     int nstreams = get_num_cuda_streams (); // default 8
 
-    int_t buffer_size  = SUPERLU_MAX(max_row_size * nstreams * cublas_nb,
-                                     get_max_buffer_size());
+    int_t buffer_size  = SUPERLU_MAX(max_row_size * nstreams * cublas_nb, sp_ienv_dist(8));
+                                     //   get_max_buffer_size());
     /* array holding last column blk for each partition,
        used in SchCompUdt-cuda.c         */
   #if 0
@@ -784,8 +784,8 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
 #else /* not to use GPU */
 
     int Threads_per_process = get_thread_per_process();
-    int_t buffer_size  = SUPERLU_MAX(max_row_size * Threads_per_process * ldt,
-                                     get_max_buffer_size());
+    int_t buffer_size  = SUPERLU_MAX(max_row_size * Threads_per_process * ldt, sp_ienv_dist(8));
+                                     // get_max_buffer_size());
 #endif /* end ifdef GPU_ACC -----------*/
 
     int_t max_ncols = 0;
@@ -814,7 +814,7 @@ pdgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
 
 #if ( PRNTlevel>=1 )
     if(!iam) {
-        printf("\t.. MAX_BUFFER_SIZE " IFMT " set for GPU\n", get_max_buffer_size());
+        printf("\t.. MAX_BUFFER_SIZE %d set for GPU\n", sp_ienv_dist(8));
 	printf("\t.. N_GEMM: %d flops of GEMM done on CPU (1st block always on CPU)\n", sp_ienv_dist(7));
         printf("\t.. GEMM buffer size: max_row_size X max_ncols = %d x " IFMT "\n",
                 max_row_size, max_ncols);
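The sizing logic above takes the larger of a workload-derived estimate and the user-tunable cap now returned by `sp_ienv_dist(8)`. A compilable sketch of that selection; the wrapper function and its parameter names are invented for illustration, only `SUPERLU_MAX` and the role of `sp_ienv_dist(8)` come from the hunks above:

```c
/* Sketch: GEMM workspace = max(workload estimate, MAX_BUFFER_SIZE cap). */
#define SUPERLU_MAX(x, y) ((x) > (y) ? (x) : (y))

long long gemm_buffer_size(long long max_row_size, int nstreams,
                           int cublas_nb, long long max_buffer_cap)
{
    /* max_buffer_cap stands in for the value of sp_ienv_dist(8) */
    return SUPERLU_MAX(max_row_size * nstreams * cublas_nb, max_buffer_cap);
}
```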
diff --git a/SRC/pdgstrf3d.c b/SRC/pdgstrf3d.c
index ec97a69d..dbef9da9 100644
--- a/SRC/pdgstrf3d.c
+++ b/SRC/pdgstrf3d.c
@@ -225,7 +225,7 @@ int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
     int_t bigu_size = getBigUSize(nsupers, grid,
     	  	                  LUstruct->Llu->Lrowind_bc_ptr);
     HyP->bigu_size = bigu_size;
-    int_t buffer_size = get_max_buffer_size ();
+    int_t buffer_size =sp_ienv_dist(8); // get_max_buffer_size ();
     HyP->buffer_size = buffer_size;
     HyP->nsupers = nsupers;
 
diff --git a/SRC/pzgstrf.c b/SRC/pzgstrf.c
index a2d693f7..c9bbdd4b 100644
--- a/SRC/pzgstrf.c
+++ b/SRC/pzgstrf.c
@@ -771,8 +771,8 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
     int cublas_nb = get_cublas_nb(); // default 64
     int nstreams = get_num_cuda_streams (); // default 8
 
-    int_t buffer_size  = SUPERLU_MAX(max_row_size * nstreams * cublas_nb,
-                                     get_max_buffer_size());
+    int_t buffer_size  = SUPERLU_MAX(max_row_size * nstreams * cublas_nb, sp_ienv_dist(8));
+                                     //   get_max_buffer_size());
     /* array holding last column blk for each partition,
        used in SchCompUdt-cuda.c         */
   #if 0
@@ -784,8 +784,8 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
 #else /* not to use GPU */
 
     int Threads_per_process = get_thread_per_process();
-    int_t buffer_size  = SUPERLU_MAX(max_row_size * Threads_per_process * ldt,
-                                     get_max_buffer_size());
+    int_t buffer_size  = SUPERLU_MAX(max_row_size * Threads_per_process * ldt, sp_ienv_dist(8));
+                                     // get_max_buffer_size());
 #endif /* end ifdef GPU_ACC -----------*/
 
     int_t max_ncols = 0;
@@ -814,7 +814,7 @@ pzgstrf(superlu_dist_options_t * options, int m, int n, double anorm,
 
 #if ( PRNTlevel>=1 )
     if(!iam) {
-        printf("\t.. MAX_BUFFER_SIZE " IFMT " set for GPU\n", get_max_buffer_size());
+        printf("\t.. MAX_BUFFER_SIZE %d set for GPU\n", sp_ienv_dist(8));
 	printf("\t.. N_GEMM: %d flops of GEMM done on CPU (1st block always on CPU)\n", sp_ienv_dist(7));
         printf("\t.. GEMM buffer size: max_row_size X max_ncols = %d x " IFMT "\n",
                 max_row_size, max_ncols);
diff --git a/SRC/pzgstrf3d.c b/SRC/pzgstrf3d.c
index f592f50e..88eafa21 100644
--- a/SRC/pzgstrf3d.c
+++ b/SRC/pzgstrf3d.c
@@ -224,7 +224,7 @@ int_t pzgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
     int_t bigu_size = getBigUSize(nsupers, grid,
     	  	                  LUstruct->Llu->Lrowind_bc_ptr);
     HyP->bigu_size = bigu_size;
-    int_t buffer_size = get_max_buffer_size ();
+    int_t buffer_size =sp_ienv_dist(8); // get_max_buffer_size ();
     HyP->buffer_size = buffer_size;
     HyP->nsupers = nsupers;
 
diff --git a/SRC/sp_ienv.c b/SRC/sp_ienv.c
index 872fef99..d42b2353 100644
--- a/SRC/sp_ienv.c
+++ b/SRC/sp_ienv.c
@@ -53,6 +53,8 @@ at the top-level directory.
 	         of L and U, compared with A;
 	    = 7: the minimum value of the product M*N*K for a GEMM call
 	         to be off-loaded to accelerator (e.g., GPU, Xeon Phi).
+            = 8: the maximum buffer size on GPU that can hold the three
+	         matrices in the GEMM call for the Schur complement update.
 	    
    (SP_IENV_DIST) (output) int
             >= 0: the value of the parameter specified by ISPEC   
@@ -66,7 +68,7 @@ at the top-level directory.
 #include 
 
 int
-sp_ienv_dist(int_t ispec)
+sp_ienv_dist(int ispec)
 {
     // printf(" this function called\n");
     int i;
@@ -109,7 +111,10 @@ sp_ienv_dist(int_t ispec)
 	    ttemp = getenv ("N_GEMM");
 	    if (ttemp) return atoi (ttemp);
 	    else return 10000;
-
+        case 8:
+  	    ttemp = getenv ("MAX_BUFFER_SIZE");
+	    if (ttemp) return atoi (ttemp);
+	    else return 64000000; // 8000^2
     }
 
     /* Invalid value for ISPEC */
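The new `case 8` follows the same inquiry pattern as the existing tunables: consult an environment variable, fall back to a built-in default. A self-contained sketch of that pattern; only the variable name and the 64000000 default are taken from the hunk above, the wrapper itself is illustrative:

```c
#include <stdio.h>
#include <stdlib.h>

/* Minimal sketch of the sp_ienv_dist() environment-inquiry pattern. */
static int query_max_buffer_size(void)
{
    char *ttemp = getenv("MAX_BUFFER_SIZE");
    if (ttemp) return atoi(ttemp);
    return 64000000;   /* 8000^2, the built-in default */
}

int main(void)
{
    printf("MAX_BUFFER_SIZE = %d\n", query_max_buffer_size());
    return 0;
}
```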
diff --git a/SRC/superlu_defs.h b/SRC/superlu_defs.h
index f241ae66..33944a6e 100644
--- a/SRC/superlu_defs.h
+++ b/SRC/superlu_defs.h
@@ -1009,7 +1009,7 @@ extern int_t estimate_bigu_size (int_t, int_t **, Glu_persist_t *,
 /* Auxiliary routines */
 extern double SuperLU_timer_ ();
 extern void   superlu_abort_and_exit_dist(char *);
-extern int    sp_ienv_dist (int_t);
+extern int    sp_ienv_dist (int);
 extern void   ifill_dist (int_t *, int_t, int_t);
 extern void   super_stats_dist (int_t, int_t *);
 extern void  get_diag_procs(int_t, Glu_persist_t *, gridinfo_t *, int_t *,
diff --git a/SRC/superlu_dist_config.h b/SRC/superlu_dist_config.h
index 4ea5a7ff..6df0c761 100644
--- a/SRC/superlu_dist_config.h
+++ b/SRC/superlu_dist_config.h
@@ -1,7 +1,7 @@
 /* superlu_dist_config.h.in */
 
 /* Enable CUDA */
-/* #undef HAVE_CUDA */
+#define HAVE_CUDA TRUE
 
 /* Enable parmetis */
 #define HAVE_PARMETIS TRUE

From c503f8fbc2881d6fb63b35be27728bee3c1958a1 Mon Sep 17 00:00:00 2001
From: Sherry Li 
Date: Sun, 9 May 2021 17:57:00 -0700
Subject: [PATCH 088/147] Updating README.md

---
 README.md | 132 +++++++++++++++++++++++++++++-------------------------
 1 file changed, 71 insertions(+), 61 deletions(-)

diff --git a/README.md b/README.md
index 67e27adc..d7ccb003 100644
--- a/README.md
+++ b/README.md
@@ -11,39 +11,43 @@ to run accurately and efficiently on large numbers of processors.
 
 SuperLU_DIST is a parallel extension to the serial SuperLU library.
 It is targeted for distributed-memory parallel machines.
-SuperLU_DIST is implemented in ANSI C, and MPI for communications.
-Currently, the LU factorization and triangular solution routines,
-which are the most time-consuming part of the solution process,
-are parallelized. The other routines, such as static pivoting and 
-column preordering for sparsity are performed sequentially. 
-This "alpha" release contains double-precision real and double-precision
-complex data types.
+SuperLU_DIST is implemented in ANSI C, with OpenMP for on-node parallelism
+and MPI for off-node communications. We are actively developing GPU
+acceleration capabilities.
+
+
+
+
+
+
 
 
 Table of Contents
 =================
 
-* [The distribution contains the following directory structure:](#the-distribution-contains-the-following-directory-structure)
-   * [INSTALLATION](#installation)
-      * [Installation option 1: Using CMake build system.](#installation-option-1-using-cmake-build-system)
-      * [Installation option 2: Manual installation with makefile.](#installation-option-2-manual-installation-with-makefile)
-         * [1.1 Edit the make.inc include file.](#11-edit-the-makeinc-include-file)
-         * [1.2. The BLAS library.](#12-the-blas-library)
-         * [1.3. External libraries.](#13-external-libraries)
-            * [1.3.1 LAPACK.](#131-lapack)
-            * [1.3.2 Metis and ParMetis.](#132-metis-and-parmetis)
-            * [1.3.3 CombBLAS.](#133-combblas)
-         * [1.4. C preprocessor definition CDEFS. (Replaced by cmake module FortranCInterface.)](#14-c-preprocessor-definition-cdefs-replaced-by-cmake-module-fortrancinterface)
-         * [1.5. Multicore and GPU (optional).](#15-multicore-and-gpu-optional)
-   * [Windows Usage](#windows-usage)
-   * [READING SPARSE MATRIX FILES](#reading-sparse-matrix-files)
-   * [REFERENCES](#references)
-   * [RELEASE VERSIONS](#release-versions)
+* [SuperLU_DIST (version 7.0)   superlu](#superlu_dist-version-70---)
+* [Directory structure of the source code](#directory-structure-of-the-source-code)
+* [Installation](#installation)
+   * [Installation option 1: Using CMake build system.](#installation-option-1-using-cmake-build-system)
+      * [Summary of the CMake definitions.](#summary-of-the-cmake-definitions)
+   * [Installation option 2: Manual installation with makefile.](#installation-option-2-manual-installation-with-makefile)
+      * [2.1 Edit the make.inc include file.](#21-edit-the-makeinc-include-file)
+      * [2.2. The BLAS library.](#22-the-blas-library)
+      * [2.3. External libraries.](#23-external-libraries)
+         * [2.3.1 Metis and ParMetis.](#231-metis-and-parmetis)
+         * [2.3.2 LAPACK.](#232-lapack)
+         * [2.3.3 CombBLAS.](#233-combblas)
+      * [2.4. C preprocessor definition CDEFS. (Replaced by cmake module FortranCInterface.)](#24-c-preprocessor-definition-cdefs-replaced-by-cmake-module-fortrancinterface)
+      * [2.5. Multicore and GPU.](#25-multicore-and-gpu)
+* [Summary of the environment variables.](#summary-of-the-environment-variables)
+* [Windows Usage](#windows-usage)
+* [Reading sparse matrix files](#reading-sparse-matrix-files)
+* [REFERENCES](#references)
+* [RELEASE VERSIONS](#release-versions)
 
 Created by [gh-md-toc](https://github.com/ekalinin/github-markdown-toc)
 
 
-
 # Directory structure of the source code
 
 ```
@@ -66,7 +70,7 @@ SuperLU_DIST/make.inc  compiler, compiler flags, library definitions and C
 SuperLU_DIST/MAKE_INC/ sample machine-specific make.inc files
 ```
 
-# INSTALLATION
+# Installation
 
 There are two ways to install the package. The first method is to use
 CMake automatic build system. The other method requires users to edit
@@ -159,19 +163,20 @@ to cmake. For example on Cori at NERSC, you will need the following:
 
 Or, you can always go to TEST/ directory to perform testing manually.
 
-### SUMMARY of the CMake definitions:
-The first one in the list of choices is the default setting.
+### Summary of the CMake definitions.
+The following list summarizes the commonly used CMake definitions. In each
+case, the first choice is the default setting. After running 'cmake', a
+configuration header file is generated in SRC/superlu_dist_config.h, which
+contains the key CPP definitions used throughout the code.
 ```
-    -DTPL_ENABLE_INTERNAL_BLASLIB=OFF | ON
     -DTPL_ENABLE_PARMETISLIB=ON | OFF
+    -DTPL_ENABLE_INTERNAL_BLASLIB=OFF | ON
     -DTPL_ENABLE_LAPACKLIB=OFF | ON
     -DTPL_ENABLE_COMBBLASLIB=OFF
     -DTPL_ENABLE_CUDALIB=OFF | ON
     -Denable_complex16=OFF | ON
     -DXSDK_INDEX_SIZE=32 | 64
 
-    -DXSDK_ENABLE_Fortran=OFF | ON
-    -DCMAKE_Fortran_COMPILER=
     -DBUILD_SHARED_LIBS=OFF | ON
     -DCMAKE_INSTALL_PREFIX=<...>
     -DCMAKE_C_COMPILER=
@@ -179,15 +184,15 @@ The first one in the list of choices is the default setting.
     -DCMAKE_CXX_COMPILER=
     -DCMAKE_CXX_FLAGS="..."
     -DCMAKE_CUDA_FLAGS="..." 
+    -DXSDK_ENABLE_Fortran=OFF | ON
+    -DCMAKE_Fortran_COMPILER=
 ```
 
-
-
 ## Installation option 2: Manual installation with makefile.
 Before installing the package, please examine the three things dependent 
 on your system setup:
 
-### 1.1 Edit the make.inc include file.
+### 2.1 Edit the make.inc include file.
 
 This make include file is referenced inside each of the Makefiles
 in the various subdirectories. As a result, there is no need to 
@@ -219,7 +224,7 @@ printing level to show solver's execution details. (default 0)
 diagnostic printing level for debugging purpose. (default 0)
 ```      
 
-### 1.2. The BLAS library.
+### 2.2. The BLAS library.
 
 The parallel routines in SuperLU_DIST use some BLAS routines on each MPI
 process. Moreover, if you enable OpenMP with multiple threads, you need to
@@ -246,22 +251,9 @@ top-level SuperLU_DIST/ directory and do the following:
 to make the BLAS library from the routines in the
 ` CBLAS/ subdirectory.`
 
-### 1.3. External libraries. 
+### 2.3. External libraries. 
 
-  ##### 1.3.1 LAPACK.
-  Starting Version 6.0, the triangular solve routine can perform explicit
-  inversion on the diagonal blocks, using LAPACK's xTRTRI inversion routine.
-  To use this feature, you should define the following in make.inc:
-```
-SLU_HAVE_LAPACK = TRUE
-LAPACKLIB = 
-```
-You can disable LAPACK with the following line in SRC/superlu_dist_config.h:
-```
-#undef SLU_HAVE_LAPACK
-```
-
-  #### 1.3.2 Metis and ParMetis.
+  #### 2.3.1 Metis and ParMetis.
 
 If you will use Metis or ParMetis for sparsity ordering, you will
 need to install them yourself. Since ParMetis package already
@@ -279,9 +271,21 @@ I_PARMETIS = -I/include -I/metis/include
 You can disable ParMetis with the following line in SRC/superlu_dist_config.h:
 ```
 #undef HAVE_PARMETIS
+```
+  #### 2.3.2 LAPACK.
+  Starting Version 6.0, the triangular solve routine can perform explicit
+  inversion on the diagonal blocks, using LAPACK's xTRTRI inversion routine.
+  To use this feature, you should define the following in make.inc:
+```
+SLU_HAVE_LAPACK = TRUE
+LAPACKLIB = 
+```
+You can disable LAPACK with the following line in SRC/superlu_dist_config.h:
+```
+#undef SLU_HAVE_LAPACK
 ```
 
- #### 1.3.3 CombBLAS.
+ #### 2.3.3 CombBLAS.
 
 You can use parallel approximate weight perfect matching (AWPM) algorithm
 to perform numerical pre-pivoting for stability. The default pre-pivoting
@@ -301,10 +305,9 @@ You can disable CombBLAS with the following line in SRC/superlu_dist_config.h:
 #undef HAVE_COMBBLAS
 ```
 
+### 2.4. C preprocessor definition CDEFS. (Replaced by cmake module FortranCInterface.)
 
-### 1.4. C preprocessor definition CDEFS. (Replaced by cmake module FortranCInterface.)
-
-In the header file SRC/Cnames.h, we use macros to determine how
+In the header file SRC/superlu_Cnames.h, we use macros to determine how
 C routines should be named so that they are callable by Fortran.
 (Some vendor-supplied BLAS libraries do not have C interfaces. So the 
 re-naming is needed in order for the SuperLU BLAS calls (in C) to 
@@ -319,7 +322,7 @@ The possible options for CDEFS are:
 -DUpCase: Fortran expects a C routine name to be all uppercase.
 ```
 
-### 1.5. Multicore and GPU.
+### 2.5. Multicore and GPU.
 
 To use OpenMP parallelism, you need to link with an OpenMP library, and
 set the number of threads you wish to use as follows (bash):
@@ -337,8 +340,15 @@ endif
 A Makefile is provided in each subdirectory. The installation can be done
 completely automatically by simply typing "make" at the top level.
 
+# Summary of the environment variables.
+Several blocking parameters may affect performance. Most of them can be set
+by the user through environment variables. The SuperLU code uses an
+environment inquiry function to read these parameters. This function is
+provided in the file SRC/sp_ienv.c. Please consult that file for detailed
+description of the meanings.
 
-## Windows Usage
+
+# Windows Usage
 Prerequisites: CMake, Visual Studio, Microsoft HPC Pack
 This has been tested with Visual Studio 2017, without Parmetis,
 without Fortran, and with OpenMP disabled. 
@@ -372,7 +382,7 @@ for the above configuration.
 If you wish to test:
   `ctest`
 
-## READING SPARSE MATRIX FILES
+# Reading sparse matrix files
 
 The SRC/ directory contains the following routines to read different file 
 formats; they all have a similar calling sequence.
@@ -385,7 +395,7 @@ dreadtriple.c          : triplet, with header
 dreadtriple_noheader.c : triplet, no header, which is also readable in Matlab
 ```
 
-## REFERENCES
+# REFERENCES
 
 **[1]** X.S. Li and J.W. Demmel, "SuperLU_DIST: A Scalable Distributed-Memory
  Sparse Direct Solver for Unsymmetric Linear Systems", ACM Trans. on Math.
@@ -403,19 +413,19 @@ dreadtriple_noheader.c : triplet, no header, which is also readable in Matlab
  SIAM workshop on Combinatorial Scientific Computing, June 6-8, 2018,
  Bergen, Norway. 
 
-**Xiaoye S. Li**, Lawrence Berkeley National Lab, [xsli@lbl.gov](xsli@lbl.gov)  
+**Xiaoye S. Li**, Lawrence Berkeley National Lab, [xsli@lbl.gov](xsli@lbl.gov)
 **Gustavo Chavez**, Lawrence Berkeley National Lab, [gichavez@lbl.gov](gichavez@lbl.gov)   
+**Nan Ding**, Lawrence Berkeley National Lab, [nanding@lbl.gov](nanding@lbl.gov)  
 **Laura Grigori**, INRIA, France, [laura.grigori@inria.fr](laura.grigori@inria.fr)  
 **Yang Liu**, Lawrence Berkeley National Lab, [liuyangzhuan@lbl.gov](liuyangzhuan@lbl.gov)   
-**Meiyue Shao**, Lawrence Berkeley National Lab, [myshao@lbl.gov](myshao@lbl.gov)   
 **Piyush Sao**, Georgia Institute of Technology, [piyush.feynman@gmail.com](piyush.feynman@gmail.com)  
+**Meiyue Shao**, Lawrence Berkeley National Lab, [myshao@lbl.gov](myshao@lbl.gov) 
 **Ichitaro Yamazaki**, Univ. of Tennessee, [ic.yamazaki@gmail.com](ic.yamazaki@gmail.com)  
 **Jim Demmel**, UC Berkeley, [demmel@cs.berkeley.edu](demmel@cs.berkeley.edu)  
 **John Gilbert**, UC Santa Barbara, [gilbert@cs.ucsb.edu](gilbert@cs.ucsb.edu)
 
 
-
-## RELEASE VERSIONS
+# RELEASE VERSIONS
 ```
 October 15, 2003    Version 2.0  
 October 1,  2007    Version 2.1  

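The CDEFS machinery touched in section 2.4 above is, at bottom, symbol-name mangling. A minimal sketch of what the three options mean for a C-to-Fortran BLAS call; the `BLAS_DAXPY` macro name is invented for illustration, and the by-reference argument convention is the standard Fortran one:

```c
/* Map a C-side name to the symbol a Fortran BLAS library exports,
 * following the CDEFS options (-DAdd_, -DNoChange, -DUpCase). */
#if defined(UpCase)
#define BLAS_DAXPY DAXPY     /* Fortran expects all uppercase   */
#elif defined(NoChange)
#define BLAS_DAXPY daxpy     /* name used exactly as written    */
#else
#define BLAS_DAXPY daxpy_    /* -DAdd_: trailing underscore     */
#endif

/* Fortran passes every argument by reference. */
extern void BLAS_DAXPY(const int *n, const double *alpha,
                       const double *x, const int *incx,
                       double *y, const int *incy);
```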
From a5a7902a0b99437dbecc633d618a1261ca01d068 Mon Sep 17 00:00:00 2001
From: Sherry Li 
Date: Sun, 9 May 2021 20:44:42 -0700
Subject: [PATCH 089/147] Update README.md, adding list of environment
 variables.

---
 README.md | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index d7ccb003..c16fabf4 100644
--- a/README.md
+++ b/README.md
@@ -57,6 +57,7 @@ SuperLU_DIST/CBLAS/    needed BLAS routines in C, not necessarily fast
 		       library with multiple OpenMP threads, performance
 		       relies on a good multithreaded BLAS implementation.)
 SuperLU_DIST/DOC/      the Users' Guide
+SuperLU_DIST/FORTRAN/  Fortran90 wrapper functions
 SuperLU_DIST/EXAMPLE/  example programs
 SuperLU_DIST/INSTALL/  test machine dependent parameters
 SuperLU_DIST/SRC/      C source code, to be compiled into libsuperlu_dist.a
@@ -340,13 +341,24 @@ endif
 A Makefile is provided in each subdirectory. The installation can be done
 completely automatically by simply typing "make" at the top level.
 
-# Summary of the environment variables.
-Several blocking parameters may affect performance. Most of them can be set
-by the user through environment variables. The SuperLU code uses an
-environment inquiry function to read these parameters. This function is
-provided in the file SRC/sp_ienv.c. Please consult that file for detailed
-description of the meanings.
 
+# Summary of the environment variables.
+A couple of environment variables affect parallel execution.
+```
+    export OMP_NUM_THREADS=<...>
+    export SUPERLU_ACC_OFFLOAD=1  // this enables use of GPU. Default is 0.
+```
+Several integer blocking parameters may affect performance. Most of them can be
+set by the user through environment variables. Otherwise the default values
+are provided. Various SuperLU routines call an environment inquiry function
+to obtain these parameters. This function is provided in the file SRC/sp_ienv.c.
+Please consult that file for detailed description of the meanings.
+```
+    export NREL=<...>   // supernode relaxation parameter
+    export NSUP=<...>   // maximum allowable supernode size
+    export FILL=<...>   // estimated fill ratio of nonzeros(L+U)/nonzeros(A)
+    export MAX_BUFFER_SIZE=<...>   // maximum buffer size on GPU for GEMM
+```
 
 # Windows Usage
 Prerequisites: CMake, Visual Studio, Microsoft HPC Pack

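The knobs listed above are plain environment variables read through `getenv()`, so they can also be pinned from a driver program before the solver is called. A hypothetical helper, assuming POSIX `setenv(3)`; the chosen values are placeholders, not recommendations:

```c
#include <stdlib.h>

/* Hypothetical: set the tuning knobs programmatically instead of
 * exporting them in the shell (overwrite=0 keeps user settings). */
void set_superlu_tuning_defaults(void)
{
    setenv("SUPERLU_ACC_OFFLOAD", "1", 0);    /* enable the GPU path */
    setenv("NSUP", "256", 0);                 /* supernode size cap  */
    setenv("FILL", "10", 0);                  /* fill-ratio estimate */
    setenv("MAX_BUFFER_SIZE", "64000000", 0); /* GPU GEMM buffer cap */
}
```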
From 331c3a391e22658fe3017e034faf6b544de73322 Mon Sep 17 00:00:00 2001
From: Sherry Li 
Date: Sun, 9 May 2021 20:56:25 -0700
Subject: [PATCH 090/147] Update README.md

---
 README.md | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index c16fabf4..da3d92c2 100644
--- a/README.md
+++ b/README.md
@@ -420,20 +420,30 @@ dreadtriple_noheader.c : triplet, no header, which is also readable in Matlab
  Porto, Portugal.  
 **[4]** P. Sao, X.S. Li, R. Vuduc, “A Communication-Avoiding 3D Factorization
  for Sparse Matrices”, Proc. of IPDPS, May 21–25, 2018, Vancouver.   
-**[5]** Y. Liu, M. Jacquelin, P. Ghysels and X.S. Li, “Highly scalable
+**[5]** P. Sao, R. Vuduc, X. Li, "Communication-avoiding 3D algorithm for
+ sparse LU factorization on heterogeneous systems", J. Parallel and
+ Distributed Computing (JPDC), September 2019.     
+**[6]** Y. Liu, M. Jacquelin, P. Ghysels and X.S. Li, “Highly scalable
  distributed-memory sparse triangular solution algorithms”, Proc. of
  SIAM workshop on Combinatorial Scientific Computing, June 6-8, 2018,
- Bergen, Norway. 
+ Bergen, Norway.   
+**[7]** N. Ding, S. Williams, Y. Liu, X.S. Li, "Leveraging One-Sided
+ Communication for Sparse Triangular Solvers", Proc. of SIAM Conf. on
+ Parallel Processing for Scientific Computing. Feb. 12-15, 2020.   
+**[8]** A. Azad, A. Buluc, X.S. Li, X. Wang, and J. Langguth,
+"A distributed-memory algorithm for computing a heavy-weight perfect matching 
+on bipartite graphs", SIAM J. Sci. Comput., Vol. 42, No. 4, pp. C143-C168, 2020.   
 
-**Xiaoye S. Li**, Lawrence Berkeley National Lab, [xsli@lbl.gov](xsli@lbl.gov)
+
+**Xiaoye S. Li**, Lawrence Berkeley National Lab, [xsli@lbl.gov](xsli@lbl.gov)   
 **Gustavo Chavez**, Lawrence Berkeley National Lab, [gichavez@lbl.gov](gichavez@lbl.gov)   
 **Nan Ding**, Lawrence Berkeley National Lab, [nanding@lbl.gov](nanding@lbl.gov)  
 **Laura Grigori**, INRIA, France, [laura.grigori@inria.fr](laura.grigori@inria.fr)  
 **Yang Liu**, Lawrence Berkeley National Lab, [liuyangzhuan@lbl.gov](liuyangzhuan@lbl.gov)   
 **Piyush Sao**, Georgia Institute of Technology, [piyush.feynman@gmail.com](piyush.feynman@gmail.com)  
-**Meiyue Shao**, Lawrence Berkeley National Lab, [myshao@lbl.gov](myshao@lbl.gov) 
+**Meiyue Shao**, Lawrence Berkeley National Lab, [myshao@lbl.gov](myshao@lbl.gov)   
 **Ichitaro Yamazaki**, Univ. of Tennessee, [ic.yamazaki@gmail.com](ic.yamazaki@gmail.com)  
-**Jim Demmel**, UC Berkeley, [demmel@cs.berkeley.edu](demmel@cs.berkeley.edu)  
+**Jim Demmel**, UC Berkeley, [demmel@cs.berkeley.edu](demmel@cs.berkeley.edu)   
 **John Gilbert**, UC Santa Barbara, [gilbert@cs.ucsb.edu](gilbert@cs.ucsb.edu)
 
 

From cba813d044fc3f848a0a59b7992b07ff64ff9a6b Mon Sep 17 00:00:00 2001
From: Sherry Li 
Date: Wed, 12 May 2021 09:46:54 -0700
Subject: [PATCH 091/147] Small updates

---
 EXAMPLE/Makefile                |  2 +-
 FORTRAN/f_5x5.F90               |  2 +-
 FORTRAN/f_pddrive.F90           |  2 +-
 FORTRAN/f_pddrive3d.F90         |  2 +-
 FORTRAN/f_pzdrive.F90           |  2 +-
 FORTRAN/f_pzdrive3d.F90         |  2 +-
 FORTRAN/sp_ienv.c               | 23 ++++++++++++++---------
 FORTRAN/superlu_dist_config.fh  |  2 --
 SRC/dscatter3d.c                |  1 -
 SRC/dsuperlu_gpu.cu             |  5 +----
 SRC/scatter.c                   |  2 +-
 SRC/sec_structs.c               |  1 -
 SRC/sp_ienv.c                   |  8 ++++----
 SRC/superlu_FortranCInterface.h |  8 ++++----
 SRC/superlu_defs.h              |  2 +-
 SRC/superlu_dist_config.h       |  6 +++---
 SRC/zscatter3d.c                |  1 -
 SRC/zsuperlu_gpu.cu             |  3 ---
 18 files changed, 34 insertions(+), 40 deletions(-)

diff --git a/EXAMPLE/Makefile b/EXAMPLE/Makefile
index 0dbdbf4b..ddc81a86 100644
--- a/EXAMPLE/Makefile
+++ b/EXAMPLE/Makefile
@@ -30,7 +30,7 @@
 #######################################################################
 include ../make.inc
 
-DEXM	= pddrive.o dcreate_matrix.o sp_ienv.o 
+DEXM	= pddrive.o dcreate_matrix.o 
 	#pdgssvx.o pdgstrf2.o
 DEXM1	= pddrive1.o dcreate_matrix.o
 DEXM2	= pddrive2.o dcreate_matrix.o dcreate_matrix_perturbed.o
diff --git a/FORTRAN/f_5x5.F90 b/FORTRAN/f_5x5.F90
index 058cf07e..7585da3a 100644
--- a/FORTRAN/f_5x5.F90
+++ b/FORTRAN/f_5x5.F90
@@ -33,7 +33,7 @@ program f_5x5
 !   6. Release the process grid and terminate the MPI environment
 !   7. Release all structures
 !
-      #include "superlu_dist_config.fh"
+#include "superlu_dist_config.fh"
       use superlu_mod
       include 'mpif.h'
       integer maxn, maxnz, maxnrhs
diff --git a/FORTRAN/f_pddrive.F90 b/FORTRAN/f_pddrive.F90
index e3e2b571..c69792af 100644
--- a/FORTRAN/f_pddrive.F90
+++ b/FORTRAN/f_pddrive.F90
@@ -29,7 +29,7 @@ program f_pddrive
 !   7. Release all structures
 !
 !
-      #include "superlu_dist_config.fh"
+#include "superlu_dist_config.fh"
       use superlu_mod
       include 'mpif.h'
       integer maxn, maxnz, maxnrhs
diff --git a/FORTRAN/f_pddrive3d.F90 b/FORTRAN/f_pddrive3d.F90
index 784e1085..eed4f9c4 100644
--- a/FORTRAN/f_pddrive3d.F90
+++ b/FORTRAN/f_pddrive3d.F90
@@ -41,7 +41,7 @@ program f_pddrive3d
 ! The program may be run by typing
 !    mpiexec -np 8 f_pddrive3d 
 !
-      #include "superlu_dist_config.fh"
+#include "superlu_dist_config.fh"
       use superlu_mod
 !      implicit none
       include 'mpif.h'
diff --git a/FORTRAN/f_pzdrive.F90 b/FORTRAN/f_pzdrive.F90
index 750e57aa..f493f1d5 100644
--- a/FORTRAN/f_pzdrive.F90
+++ b/FORTRAN/f_pzdrive.F90
@@ -28,7 +28,7 @@ program f_pzdrive
 !   7. Release all structures
 !
 !
-      #include "superlu_dist_config.fh"
+#include "superlu_dist_config.fh"
       use superlu_mod
       include 'mpif.h'
       integer maxn, maxnz, maxnrhs
diff --git a/FORTRAN/f_pzdrive3d.F90 b/FORTRAN/f_pzdrive3d.F90
index f25d8e97..07fd9c0f 100644
--- a/FORTRAN/f_pzdrive3d.F90
+++ b/FORTRAN/f_pzdrive3d.F90
@@ -40,7 +40,7 @@ program f_pzdrive3d
 ! The program may be run by typing
 !    mpiexec -np 8 f_pzdrive3d 
 !
-      #include "superlu_dist_config.fh"
+#include "superlu_dist_config.fh"
       use superlu_mod
 !      implicit none
       include 'mpif.h'
diff --git a/FORTRAN/sp_ienv.c b/FORTRAN/sp_ienv.c
index 56a07726..e62d5c91 100644
--- a/FORTRAN/sp_ienv.c
+++ b/FORTRAN/sp_ienv.c
@@ -53,6 +53,8 @@ at the top-level directory.
 	         of L and U, compared with A;
 	    = 7: the minimum value of the product M*N*K for a GEMM call
 	         to be off-loaded to accelerator (e.g., GPU, Xeon Phi).
+            = 8: the maximum buffer size on GPU that can hold the three
+	         matrices in the GEMM call for the Schur complement update.
 	    
    (SP_IENV_DIST) (output) int
             >= 0: the value of the parameter specified by ISPEC   
@@ -62,11 +64,9 @@ at the top-level directory.
 
 */
-
 #include 
 #include 
-
 int
 sp_ienv_dist(int ispec)
 {
@@ -94,21 +94,27 @@ sp_ienv_dist(int ispec)
 	    return 1;
 
         case 3:
-	    ttemp = getenv("NSUP");
+	    ttemp = getenv("NSUP"); // take min of MAX_SUPER_SIZE in superlu_defs.h
 	    if(ttemp)
 	    {
-	        return(atoi(ttemp));
+		int k = SUPERLU_MIN( atoi(ttemp), MAX_SUPER_SIZE );
+		return (k);
 	    }
-	    else
-	        return 128;
+	    else return 128;
 #endif
 
-        case 6: return (5);
+        case 6:
+	    ttemp = getenv("FILL");
+	    if ( ttemp ) return(atoi(ttemp));
+	    else return (5);
 
         case 7:
 	    ttemp = getenv ("N_GEMM");
 	    if (ttemp) return atoi (ttemp);
 	    else return 10000;
-
+        case 8:
+	    ttemp = getenv ("MAX_BUFFER_SIZE");
+	    if (ttemp) return atoi (ttemp);
+	    else return 64000000; // 8000^2
     }
 
     /* Invalid value for ISPEC */
@@ -116,6 +122,5 @@ sp_ienv_dist(int ispec)
     xerr_dist("sp_ienv", &i);
     return 0;
 
-
 }
 /* sp_ienv_dist */
diff --git a/FORTRAN/superlu_dist_config.fh b/FORTRAN/superlu_dist_config.fh
index 878933ff..cbe990cc 100644
--- a/FORTRAN/superlu_dist_config.fh
+++ b/FORTRAN/superlu_dist_config.fh
@@ -1,11 +1,9 @@
 
-#define HAVE_CUDA TRUE
 
 #define HAVE_PARMETIS TRUE
 
 
-#define XSDK_INDEX_SIZE 64
 
 #if (XSDK_INDEX_SIZE == 64)
 #define _LONGINT 1
diff --git a/SRC/dscatter3d.c b/SRC/dscatter3d.c
index 78bf5f0e..3abb2440 100644
--- a/SRC/dscatter3d.c
+++ b/SRC/dscatter3d.c
@@ -19,7 +19,6 @@ at the top-level directory.
 #else
 //#include "cblas.h"
 #endif
-#include "omp.h"
 
 #define ISORT
 #define SCATTER_U_CPU scatter_u
diff --git a/SRC/dsuperlu_gpu.cu b/SRC/dsuperlu_gpu.cu
index e28030b9..fe91bae5 100644
--- a/SRC/dsuperlu_gpu.cu
+++ b/SRC/dsuperlu_gpu.cu
@@ -14,7 +14,6 @@
 //#define GPU_DEBUG
 
 #include "mpi.h"
-#include "omp.h"
 // #include "sec_structs.h"
 #include 
 #include 
@@ -98,9 +97,7 @@ void device_scatter_l (int_t thread_id,
 }
 #endif ///////////// not used
 
-#define THREAD_BLOCK_SIZE 256 /* Sherry: was 192. should be <= MAX_SUPER_SIZE */
-// The following is moved to superlu_defs.h
-//#define MAX_SUPER_SIZE 256 /* Sherry: was 192 on Titan */
+#define THREAD_BLOCK_SIZE 512 /* Sherry: was 192. should be <= MAX_SUPER_SIZE */
 
 __device__ inline
 void ddevice_scatter_l_2D (int thread_id,
diff --git a/SRC/scatter.c b/SRC/scatter.c
index 6612bba5..ed9e89bd 100644
--- a/SRC/scatter.c
+++ b/SRC/scatter.c
@@ -7,7 +7,7 @@
 #else
 #include "cblas.h"
 #endif
-#include "omp.h"
+
 
 #define ISORT
diff --git a/SRC/sec_structs.c b/SRC/sec_structs.c
index 05216f35..a628bf4f 100644
--- a/SRC/sec_structs.c
+++ b/SRC/sec_structs.c
@@ -4,7 +4,6 @@
 #include "sec_structs.h"
 #include  /*for Qsort */
 #include 
-#include 
 #include  /*for sqrt*/
 #include 
 #include "compiler.h"
diff --git a/SRC/sp_ienv.c b/SRC/sp_ienv.c
index d42b2353..30c9ce1f 100644
--- a/SRC/sp_ienv.c
+++ b/SRC/sp_ienv.c
@@ -94,13 +94,13 @@ sp_ienv_dist(int ispec)
 	    return 20;
 
         case 3:
-	    ttemp = getenv("NSUP"); // take min of MAX_SUPER_SIZE in superlu_defs.h
+	    ttemp = getenv("NSUP"); // take min of MAX_SUPER_SIZE in superlu_defs.h
 	    if(ttemp)
 	    {
-	        return(atoi(ttemp));
+		int k = SUPERLU_MIN( atoi(ttemp), MAX_SUPER_SIZE );
+		return (k);
 	    }
-	    else
-	        return 128;
+	    else return 128;
 #endif
 
         case 6:
diff --git a/SRC/superlu_FortranCInterface.h b/SRC/superlu_FortranCInterface.h
index c9fee77d..467bfb65 100644
--- a/SRC/superlu_FortranCInterface.h
+++ b/SRC/superlu_FortranCInterface.h
@@ -2,15 +2,15 @@
 #define FC_HEADER_INCLUDED
 
 /* Mangling for Fortran global symbols without underscores. */
-#define FC_GLOBAL(name,NAME) name
+#define FC_GLOBAL(name,NAME) name##_
 
 /* Mangling for Fortran global symbols with underscores. */
-#define FC_GLOBAL_(name,NAME) name
+#define FC_GLOBAL_(name,NAME) name##_
 
 /* Mangling for Fortran module symbols without underscores. */
-#define FC_MODULE(mod_name,name, mod_NAME,NAME) __##mod_name##_NMOD_##name
+#define FC_MODULE(mod_name,name, mod_NAME,NAME) __##mod_name##_MOD_##name
 
 /* Mangling for Fortran module symbols with underscores. */
-#define FC_MODULE_(mod_name,name, mod_NAME,NAME) __##mod_name##_NMOD_##name
+#define FC_MODULE_(mod_name,name, mod_NAME,NAME) __##mod_name##_MOD_##name
 
 #endif

From 1a6d5bad87736c9e6f53530a63dce79856ecd76c Mon Sep 17 00:00:00 2001
From: Xiaoye Li 
Date: Wed, 12 May 2021 15:29:07 -0400
Subject: [PATCH 092/147] Update version number to 7.0

---
 EXAMPLE/README                  |   4 +-
 FORTRAN/README                  |   1 +
 FORTRAN/superlu_c2f_wrap.c      |   3 +-
 FORTRAN/superlu_dist_config.fh  |   2 +
 README.md                       |   2 +-
 SRC/communication_aux.c         |  10 ++
 SRC/sec_structs.c               |  22 ++++
 SRC/superlu_FortranCInterface.h |   8 +-
 SRC/superlu_defs.h              |   7 +-
 SRC/superlu_dist_config.h       |   4 +-
 SRC/superlu_grid3d.c            |   4 +-
 SRC/supernodalForest.c          |   9 ++
 SRC/supernodal_etree.c          |  11 +-
 SRC/treeFactorization.c         | 178 +++++++++++++++++---------------
 SRC/trfAux.c                    |  10 ++
 15 files changed, 175 insertions(+), 100 deletions(-)

diff --git a/EXAMPLE/README b/EXAMPLE/README
index 2819f1cc..7cfba7de 100644
--- a/EXAMPLE/README
+++ b/EXAMPLE/README
@@ -1,5 +1,5 @@
-	SuperLU_DIST EXAMPLES 
-	======================
+	SuperLU_DIST EXAMPLES
+	=====================
 
 This directory contains sample programs to illustrate how to use
 various functions provided in SuperLU_DIST. You can modify these
diff --git a/FORTRAN/README b/FORTRAN/README
index 67b93850..1d1d384e 100644
--- a/FORTRAN/README
+++ b/FORTRAN/README
@@ -1,4 +1,5 @@
 		Fortran 90 Interface
+		====================
 
 This directory contains Fortran-90 wrapper routines for SuperLU_DIST.
 The directory contains the following files:
diff --git a/FORTRAN/superlu_c2f_wrap.c b/FORTRAN/superlu_c2f_wrap.c
index d35081ab..1c4b0e8e 100644
--- a/FORTRAN/superlu_c2f_wrap.c
+++ b/FORTRAN/superlu_c2f_wrap.c
@@ -4,10 +4,11 @@
 * \brief C interface functions for the Fortran90 wrapper.
 *
 *
- * -- Distributed SuperLU routine (version 4.1) --
+ * -- Distributed SuperLU routine (version 7.0) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * October 2012
  * April 5, 2015
+ * May 12, 2021
  */
 
 #include "superlu_defs.h"
diff --git a/FORTRAN/superlu_dist_config.fh b/FORTRAN/superlu_dist_config.fh
index cbe990cc..878933ff 100644
--- a/FORTRAN/superlu_dist_config.fh
+++ b/FORTRAN/superlu_dist_config.fh
@@ -1,9 +1,11 @@
 
+#define HAVE_CUDA TRUE
 
 #define HAVE_PARMETIS TRUE
 
 
 
+#define XSDK_INDEX_SIZE 64
 
 #if (XSDK_INDEX_SIZE == 64)
 #define _LONGINT 1
diff --git a/README.md b/README.md
index da3d92c2..0c78f131 100644
--- a/README.md
+++ b/README.md
@@ -355,7 +355,7 @@ to obtain these parameters. This function is provided in the file SRC/sp_ienv.c.
 Please consult that file for detailed description of the meanings.
 ```
     export NREL=<...>   // supernode relaxation parameter
-    export NSUP=<...>   // maximum allowable supernode size
+    export NSUP=<...>   // maximum allowable supernode size, not to exceed 512
     export FILL=<...>   // estimated fill ratio of nonzeros(L+U)/nonzeros(A)
     export MAX_BUFFER_SIZE=<...>   // maximum buffer size on GPU for GEMM
 ```
diff --git a/SRC/communication_aux.c b/SRC/communication_aux.c
index f3b3791e..ff0034fc 100644
--- a/SRC/communication_aux.c
+++ b/SRC/communication_aux.c
@@ -9,6 +9,16 @@ The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
+/*! @file
+ * \brief Auxiliary routines to support communication in 3D algorithms
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Oak Ridge National Lab
+ * May 12, 2021
+ * 
+ */ + #include "superlu_defs.h" #if 0 #include "sec_structs.h" diff --git a/SRC/sec_structs.c b/SRC/sec_structs.c index a628bf4f..db73cb4d 100644 --- a/SRC/sec_structs.c +++ b/SRC/sec_structs.c @@ -1,3 +1,25 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + +/*! @file + * \brief Auxiliary routines in 3D algorithms + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Oak Ridge National Lab
+ * May 12, 2021
+ * 
+ */ + + //#include #include "superlu_ddefs.h" #if 0 diff --git a/SRC/superlu_FortranCInterface.h b/SRC/superlu_FortranCInterface.h index 467bfb65..c9fee77d 100644 --- a/SRC/superlu_FortranCInterface.h +++ b/SRC/superlu_FortranCInterface.h @@ -2,15 +2,15 @@ #define FC_HEADER_INCLUDED /* Mangling for Fortran global symbols without underscores. */ -#define FC_GLOBAL(name,NAME) name##_ +#define FC_GLOBAL(name,NAME) name /* Mangling for Fortran global symbols with underscores. */ -#define FC_GLOBAL_(name,NAME) name##_ +#define FC_GLOBAL_(name,NAME) name /* Mangling for Fortran module symbols without underscores. */ -#define FC_MODULE(mod_name,name, mod_NAME,NAME) __##mod_name##_MOD_##name +#define FC_MODULE(mod_name,name, mod_NAME,NAME) __##mod_name##_NMOD_##name /* Mangling for Fortran module symbols with underscores. */ -#define FC_MODULE_(mod_name,name, mod_NAME,NAME) __##mod_name##_MOD_##name +#define FC_MODULE_(mod_name,name, mod_NAME,NAME) __##mod_name##_NMOD_##name #endif diff --git a/SRC/superlu_defs.h b/SRC/superlu_defs.h index 7f41043b..a12d4b7c 100644 --- a/SRC/superlu_defs.h +++ b/SRC/superlu_defs.h @@ -23,6 +23,7 @@ at the top-level directory. * February 8, 2019 version 6.1.1 * November 12, 2019 version 6.2.0 * October 23, 2020 version 6.4.0 + * May 12, 2021 version 7.0.0 *
*/ @@ -72,10 +73,10 @@ at the top-level directory. * #endif * Versions 4.x and earlier do not include a #define'd version numbers. */ -#define SUPERLU_DIST_MAJOR_VERSION 6 -#define SUPERLU_DIST_MINOR_VERSION 4 +#define SUPERLU_DIST_MAJOR_VERSION 7 +#define SUPERLU_DIST_MINOR_VERSION 0 #define SUPERLU_DIST_PATCH_VERSION 0 -#define SUPERLU_DIST_RELEASE_DATE "October 23, 2020" +#define SUPERLU_DIST_RELEASE_DATE "May 12, 2021" #include "superlu_dist_config.h" /* Define my integer size int_t */ diff --git a/SRC/superlu_dist_config.h b/SRC/superlu_dist_config.h index ffd061a2..6df0c761 100644 --- a/SRC/superlu_dist_config.h +++ b/SRC/superlu_dist_config.h @@ -1,7 +1,7 @@ /* superlu_dist_config.h.in */ /* Enable CUDA */ -/* #undef HAVE_CUDA */ +#define HAVE_CUDA TRUE /* Enable parmetis */ #define HAVE_PARMETIS TRUE @@ -13,7 +13,7 @@ /* #undef HAVE_COMBBLAS */ /* enable 64bit index mode */ -/* #undef XSDK_INDEX_SIZE */ +#define XSDK_INDEX_SIZE 64 #if (XSDK_INDEX_SIZE == 64) #define _LONGINT 1 diff --git a/SRC/superlu_grid3d.c b/SRC/superlu_grid3d.c index 9fb97b84..9dc05867 100644 --- a/SRC/superlu_grid3d.c +++ b/SRC/superlu_grid3d.c @@ -3,8 +3,8 @@ * *
  * -- Distributed SuperLU routine (version 7.0.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley.
- * March 30, 2019
+ * Lawrence Berkeley National Lab, Oak Ridge National Lab
+ * May 12, 2021
  * 
 */
diff --git a/SRC/supernodalForest.c b/SRC/supernodalForest.c
index 95f35649..a5487977 100644
--- a/SRC/supernodalForest.c
+++ b/SRC/supernodalForest.c
@@ -1,3 +1,12 @@
+/*! @file
+ * \brief SuperLU utilities
+ *
+ *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Oak Ridge National Lab
+ * May 12, 2021
+ * 
+ */
 #include 
 #include 
 #include "superlu_ddefs.h"
diff --git a/SRC/supernodal_etree.c b/SRC/supernodal_etree.c
index 90171eb8..08de5b2e 100644
--- a/SRC/supernodal_etree.c
+++ b/SRC/supernodal_etree.c
@@ -1,4 +1,13 @@
-/*function to generate supernodal etree*/
+/*! @file
+ * \brief function to generate supernodal etree
+*
+ *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Oak Ridge National Lab
+ * May 12, 2021
+ * 
+ */
+
 #include 
 #include 
 #include "superlu_ddefs.h"
diff --git a/SRC/treeFactorization.c b/SRC/treeFactorization.c
index 4806b78a..dd677995 100644
--- a/SRC/treeFactorization.c
+++ b/SRC/treeFactorization.c
@@ -9,90 +9,100 @@ The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
-#include "superlu_ddefs.h"
-#if 0
-#include "treeFactorization.h"
-#include "trfCommWrapper.h"
-#endif
-
-#if 0 /******** Sherry: Remove extra layer of function calls. *******/
-int_t sDiagFactIBCast(int_t k, diagFactBufs_t *dFBuf,
-                      factStat_t *factStat,
-                      commRequests_t *comReqs,
-                      gridinfo_t *grid,
-                      superlu_dist_options_t *options,
-                      double thresh,
-                      LUstruct_t *LUstruct,
-                      SuperLUStat_t *stat, int *info,
-                      SCT_t *SCT,
-                      int tag_ub
-                     )
-{
-    MPI_Request * U_diag_blk_recv_req = comReqs->U_diag_blk_recv_req;
-    MPI_Request * L_diag_blk_recv_req = comReqs->L_diag_blk_recv_req;
-    MPI_Request * U_diag_blk_send_req = comReqs->U_diag_blk_send_req;
-    MPI_Request * L_diag_blk_send_req = comReqs->L_diag_blk_send_req;
-    int_t * IrecvPlcd_D = factStat->IrecvPlcd_D;
-
-    double * BlockUFactor = dFBuf->BlockUFactor;
-    double * BlockLFactor = dFBuf->BlockLFactor;
-    dDiagFactIBCast(k, k, BlockUFactor, BlockLFactor,
-                    IrecvPlcd_D,
-                    U_diag_blk_recv_req, L_diag_blk_recv_req,
-                    U_diag_blk_send_req, L_diag_blk_send_req,
-                    grid, options, thresh, LUstruct, stat, info, SCT, tag_ub);
-    return 0;
-}
-int_t sLPanelUpdate( int_t k, diagFactBufs_t *dFBuf,
-                     factStat_t *factStat,
-                     commRequests_t *comReqs,
-                     gridinfo_t *grid,
-                     LUstruct_t *LUstruct, SCT_t *SCT)
-{
-    MPI_Request * U_diag_blk_recv_req = comReqs->U_diag_blk_recv_req;
-    int_t * IrecvPlcd_D = factStat->IrecvPlcd_D;
-    int_t * factored_L = factStat->factored_L;
-    double * BlockUFactor = dFBuf->BlockUFactor;
-
-    dLPanelUpdate( k, IrecvPlcd_D, factored_L,
-                   U_diag_blk_recv_req, BlockUFactor, grid, LUstruct, SCT);
-    return 0;
-}
-
-int_t sUPanelUpdate( int_t k,
-                     int_t ldt,
-                     diagFactBufs_t *dFBuf,
-                     factStat_t *factStat,
-                     commRequests_t *comReqs,
-                     scuBufs_t* scuBufs,
-                     packLUInfo_t* packLUInfo,
-                     gridinfo_t *grid,
-                     LUstruct_t *LUstruct,
-                     SuperLUStat_t *stat, SCT_t *SCT)
-{
-    double* bigV = scuBufs->bigV;
-    Ublock_info_t* Ublock_info = packLUInfo->Ublock_info;
-
-    MPI_Request * L_diag_blk_recv_req = comReqs->L_diag_blk_recv_req;
-
-    int_t * factored_U = factStat->factored_U;
-
-    double * BlockLFactor = dFBuf->BlockLFactor;
-    dUPanelUpdate(k, factored_U, L_diag_blk_recv_req, BlockLFactor, bigV, ldt,
-                  Ublock_info, grid, LUstruct, stat, SCT);
-    return 0;
-}
-int_t sIBcastRecvLPanel(
-    int_t k,
-    commRequests_t *comReqs,
-    LUValSubBuf_t* LUvsb,
-    msgs_t* msgs,
-    factStat_t *factStat,
-    gridinfo_t *grid,
-    LUstruct_t *LUstruct, SCT_t *SCT, int tag_ub)
-
-{
-    int* msgcnt = msgs->msgcnt;
+/*! @file
+ * \brief factorization routines in 3D algorithms
+ *
+ *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Oak Ridge National Lab
+ * May 12, 2021
+ * 
+ */
+
+ #include "superlu_ddefs.h"
+ #if 0
+ #include "treeFactorization.h"
+ #include "trfCommWrapper.h"
+ #endif
+
+ #if 0 /******** Sherry: Remove extra layer of function calls. *******/
+ int_t sDiagFactIBCast(int_t k, diagFactBufs_t *dFBuf,
+                       factStat_t *factStat,
+                       commRequests_t *comReqs,
+                       gridinfo_t *grid,
+                       superlu_dist_options_t *options,
+                       double thresh,
+                       LUstruct_t *LUstruct,
+                       SuperLUStat_t *stat, int *info,
+                       SCT_t *SCT,
+                       int tag_ub
+                      )
+ {
+     MPI_Request * U_diag_blk_recv_req = comReqs->U_diag_blk_recv_req;
+     MPI_Request * L_diag_blk_recv_req = comReqs->L_diag_blk_recv_req;
+     MPI_Request * U_diag_blk_send_req = comReqs->U_diag_blk_send_req;
+     MPI_Request * L_diag_blk_send_req = comReqs->L_diag_blk_send_req;
+     int_t * IrecvPlcd_D = factStat->IrecvPlcd_D;
+
+     double * BlockUFactor = dFBuf->BlockUFactor;
+     double * BlockLFactor = dFBuf->BlockLFactor;
+     dDiagFactIBCast(k, k, BlockUFactor, BlockLFactor,
+                     IrecvPlcd_D,
+                     U_diag_blk_recv_req, L_diag_blk_recv_req,
+                     U_diag_blk_send_req, L_diag_blk_send_req,
+                     grid, options, thresh, LUstruct, stat, info, SCT, tag_ub);
+     return 0;
+ }
+ int_t sLPanelUpdate( int_t k, diagFactBufs_t *dFBuf,
+                      factStat_t *factStat,
+                      commRequests_t *comReqs,
+                      gridinfo_t *grid,
+                      LUstruct_t *LUstruct, SCT_t *SCT)
+ {
+     MPI_Request * U_diag_blk_recv_req = comReqs->U_diag_blk_recv_req;
+     int_t * IrecvPlcd_D = factStat->IrecvPlcd_D;
+     int_t * factored_L = factStat->factored_L;
+     double * BlockUFactor = dFBuf->BlockUFactor;
+
+     dLPanelUpdate( k, IrecvPlcd_D, factored_L,
+                    U_diag_blk_recv_req, BlockUFactor, grid, LUstruct, SCT);
+     return 0;
+ }
+
+ int_t sUPanelUpdate( int_t k,
+                      int_t ldt,
+                      diagFactBufs_t *dFBuf,
+                      factStat_t *factStat,
+                      commRequests_t *comReqs,
+                      scuBufs_t* scuBufs,
+                      packLUInfo_t* packLUInfo,
+                      gridinfo_t *grid,
+                      LUstruct_t *LUstruct,
+                      SuperLUStat_t *stat, SCT_t *SCT)
+ {
+     double* bigV = scuBufs->bigV;
+     Ublock_info_t* Ublock_info = packLUInfo->Ublock_info;
+
+     MPI_Request * L_diag_blk_recv_req = comReqs->L_diag_blk_recv_req;
+
+     int_t * factored_U = factStat->factored_U;
+
+     double * BlockLFactor = dFBuf->BlockLFactor;
+     dUPanelUpdate(k, factored_U, L_diag_blk_recv_req, BlockLFactor, bigV, ldt,
+                   Ublock_info, grid, LUstruct, stat, SCT);
+     return 0;
+ }
+ int_t sIBcastRecvLPanel(
+     int_t k,
+     commRequests_t *comReqs,
+     LUValSubBuf_t* LUvsb,
+     msgs_t* msgs,
+     factStat_t *factStat,
+     gridinfo_t *grid,
+     LUstruct_t *LUstruct, SCT_t *SCT, int tag_ub)
+
+ {
+     int* msgcnt = msgs->msgcnt;
     MPI_Request *send_req = comReqs->send_req;
     MPI_Request *recv_req = comReqs->recv_req;
     int_t * Lsub_buf = LUvsb->Lsub_buf;
diff --git a/SRC/trfAux.c b/SRC/trfAux.c
index d0280189..ff96d1b3 100644
--- a/SRC/trfAux.c
+++ b/SRC/trfAux.c
@@ -9,6 +9,16 @@ The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
+/*! @file
+ * \brief Auxiliary routines to support 3D algorithms
+ *
+ *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Oak Ridge National Lab
+ * May 12, 2021
+ * 
+ */ + #include "superlu_ddefs.h" #if 0 From fba7e923795f00b9287b3b239a01f16da98770ea Mon Sep 17 00:00:00 2001 From: Sherry Li Date: Sat, 22 May 2021 19:23:15 -0700 Subject: [PATCH 093/147] In CMakeLists.txt, set XSDK-ENABLE_Fortran=ON. In .gitignore, add file SRC/superlu_FortranCInterface.h. FORTRAN/: link libsuperlu_dist_fortran.so to libsuperlu_dist.so. FORTRAN/: propagate VERSION number down to the superlu_dist_fortran target. --- .gitignore | 3 +++ CMakeLists.txt | 1 + FORTRAN/CMakeLists.txt | 6 +++++- FORTRAN/README | 2 +- FORTRAN/superlu_dist_config.fh | 2 -- SRC/superlu_FortranCInterface.h | 8 ++++---- SRC/superlu_dist_config.h | 4 ++-- cmake/XSDKDefaults.cmake | 6 +++--- make.inc.in | 2 +- 9 files changed, 20 insertions(+), 14 deletions(-) diff --git a/.gitignore b/.gitignore index ecf5bf52..0ec797c8 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,6 @@ # If the instructions are telling people to create this build dir under the # source tree, you had better put in an ignore for this. /build/* + +# not to commit any changes to the following file +/SRC/superlu_FortranCInterface.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 1c82490c..df9798e0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,6 +23,7 @@ option(enable_double "Enable double precision library" ON) option(enable_complex16 "Enable complex16 precision library" OFF) option(enable_tests "Build tests" ON) option(enable_examples "Build examples" ON) +option(XSDK_ENABLE_Fortran "Enable Fortran" ON) #-- BLAS option(TPL_ENABLE_INTERNAL_BLASLIB "Build the CBLAS library" ${enable_blaslib_DEFAULT}) diff --git a/FORTRAN/CMakeLists.txt b/FORTRAN/CMakeLists.txt index 5705c7fd..6d0a0fd8 100644 --- a/FORTRAN/CMakeLists.txt +++ b/FORTRAN/CMakeLists.txt @@ -6,7 +6,6 @@ set(headers ${CMAKE_INSTALL_PREFIX}/FORTRAN/superlu_mod.mod ${CMAKE_INSTALL_PREFIX}/FORTRAN/superlupara_mod.mod ${CMAKE_INSTALL_PREFIX}/FORTRAN/superlu_dist_config.fh -# ${CMAKE_CURRENT_BINARY_DIR}/fortran_config.h ) set(sources "superlu_c2f_wrap.c") # initialize precision-independent file @@ -20,6 +19,11 @@ endif() add_library(superlu_dist_fortran ${sources}) set(targets superlu_dist_fortran) +get_target_property(superlu_dist_version superlu_dist VERSION) +get_target_property(superlu_dist_soversion superlu_dist SOVERSION) +set_target_properties(superlu_dist_fortran PROPERTIES VERSION ${superlu_dist_version}) +set_target_properties(superlu_dist_fortran PROPERTIES SOVERSION ${superlu_dist_soversion}) +target_link_libraries(superlu_dist_fortran superlu_dist) # depends on FPP defs add_dependencies(superlu_dist_fortran config_f) diff --git a/FORTRAN/README b/FORTRAN/README index 1d1d384e..ce7da07d 100644 --- a/FORTRAN/README +++ b/FORTRAN/README @@ -24,7 +24,7 @@ There are two examples in the directory. A real example Fortran driver routine that reads a matrix from a file 'g20.rua' in Harwell-Boeing format. To run the code, type: - mpiexec -n 4 f_pddrive (or f_pddrive3d) + mpiexec -n 4 f_pddrive (The example is set up to use 4 MPI processes) 2. 

From fba7e923795f00b9287b3b239a01f16da98770ea Mon Sep 17 00:00:00 2001
From: Sherry Li
Date: Sat, 22 May 2021 19:23:15 -0700
Subject: [PATCH 093/147] In CMakeLists.txt, set XSDK_ENABLE_Fortran=ON. In
 .gitignore, add file SRC/superlu_FortranCInterface.h. FORTRAN/: link
 libsuperlu_dist_fortran.so to libsuperlu_dist.so. FORTRAN/: propagate
 VERSION number down to the superlu_dist_fortran target.

---
 .gitignore                      |  3 +++
 CMakeLists.txt                  |  1 +
 FORTRAN/CMakeLists.txt          |  6 +++++-
 FORTRAN/README                  |  2 +-
 FORTRAN/superlu_dist_config.fh  |  2 --
 SRC/superlu_FortranCInterface.h |  8 ++++----
 SRC/superlu_dist_config.h       |  4 ++--
 cmake/XSDKDefaults.cmake        |  6 +++---
 make.inc.in                     |  2 +-
 9 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/.gitignore b/.gitignore
index ecf5bf52..0ec797c8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,6 @@
 # If the instructions are telling people to create this build dir under the
 # source tree, you had better put in an ignore for this.
 /build/*
+
+# not to commit any changes to the following file
+/SRC/superlu_FortranCInterface.h
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1c82490c..df9798e0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -23,6 +23,7 @@ option(enable_double     "Enable double precision library" ON)
 option(enable_complex16  "Enable complex16 precision library" OFF)
 option(enable_tests      "Build tests" ON)
 option(enable_examples   "Build examples" ON)
+option(XSDK_ENABLE_Fortran "Enable Fortran" ON)
 
 #-- BLAS
 option(TPL_ENABLE_INTERNAL_BLASLIB  "Build the CBLAS library" ${enable_blaslib_DEFAULT})
diff --git a/FORTRAN/CMakeLists.txt b/FORTRAN/CMakeLists.txt
index 5705c7fd..6d0a0fd8 100644
--- a/FORTRAN/CMakeLists.txt
+++ b/FORTRAN/CMakeLists.txt
@@ -6,7 +6,6 @@ set(headers
     ${CMAKE_INSTALL_PREFIX}/FORTRAN/superlu_mod.mod
     ${CMAKE_INSTALL_PREFIX}/FORTRAN/superlupara_mod.mod
     ${CMAKE_INSTALL_PREFIX}/FORTRAN/superlu_dist_config.fh
-#   ${CMAKE_CURRENT_BINARY_DIR}/fortran_config.h
 )
 set(sources "superlu_c2f_wrap.c")  # initialize precision-independent file
@@ -20,6 +19,11 @@ endif()
 add_library(superlu_dist_fortran ${sources})
 set(targets superlu_dist_fortran)
+get_target_property(superlu_dist_version superlu_dist VERSION)
+get_target_property(superlu_dist_soversion superlu_dist SOVERSION)
+set_target_properties(superlu_dist_fortran PROPERTIES VERSION ${superlu_dist_version})
+set_target_properties(superlu_dist_fortran PROPERTIES SOVERSION ${superlu_dist_soversion})
+target_link_libraries(superlu_dist_fortran superlu_dist)
 # depends on FPP defs
 add_dependencies(superlu_dist_fortran config_f)
diff --git a/FORTRAN/README b/FORTRAN/README
index 1d1d384e..ce7da07d 100644
--- a/FORTRAN/README
+++ b/FORTRAN/README
@@ -24,7 +24,7 @@ There are two examples in the directory.
    A real example Fortran driver routine that reads a matrix from a file
    'g20.rua' in Harwell-Boeing format. To run the code, type:
-       mpiexec -n 4 f_pddrive (or f_pddrive3d)
+       mpiexec -n 4 f_pddrive
    (The example is set up to use 4 MPI processes)
 2. f_pddrive3d.f90: use the 3D algorithms
diff --git a/FORTRAN/superlu_dist_config.fh b/FORTRAN/superlu_dist_config.fh
index 878933ff..cbe990cc 100644
--- a/FORTRAN/superlu_dist_config.fh
+++ b/FORTRAN/superlu_dist_config.fh
@@ -1,11 +1,9 @@
-#define HAVE_CUDA TRUE
 #define HAVE_PARMETIS TRUE
 
-#define XSDK_INDEX_SIZE 64
 
 #if (XSDK_INDEX_SIZE == 64)
 #define _LONGINT 1
diff --git a/SRC/superlu_FortranCInterface.h b/SRC/superlu_FortranCInterface.h
index c9fee77d..467bfb65 100644
--- a/SRC/superlu_FortranCInterface.h
+++ b/SRC/superlu_FortranCInterface.h
@@ -2,15 +2,15 @@
 #define FC_HEADER_INCLUDED
 
 /* Mangling for Fortran global symbols without underscores. */
-#define FC_GLOBAL(name,NAME) name
+#define FC_GLOBAL(name,NAME) name##_
 
 /* Mangling for Fortran global symbols with underscores. */
-#define FC_GLOBAL_(name,NAME) name
+#define FC_GLOBAL_(name,NAME) name##_
 
 /* Mangling for Fortran module symbols without underscores. */
-#define FC_MODULE(mod_name,name, mod_NAME,NAME) __##mod_name##_NMOD_##name
+#define FC_MODULE(mod_name,name, mod_NAME,NAME) __##mod_name##_MOD_##name
 
 /* Mangling for Fortran module symbols with underscores. */
-#define FC_MODULE_(mod_name,name, mod_NAME,NAME) __##mod_name##_NMOD_##name
+#define FC_MODULE_(mod_name,name, mod_NAME,NAME) __##mod_name##_MOD_##name
 
 #endif
diff --git a/SRC/superlu_dist_config.h b/SRC/superlu_dist_config.h
index 6df0c761..ffd061a2 100644
--- a/SRC/superlu_dist_config.h
+++ b/SRC/superlu_dist_config.h
@@ -1,7 +1,7 @@
 /* superlu_dist_config.h.in */
 
 /* Enable CUDA */
-#define HAVE_CUDA TRUE
+/* #undef HAVE_CUDA */
 
 /* Enable parmetis */
 #define HAVE_PARMETIS TRUE
@@ -13,7 +13,7 @@
 /* #undef HAVE_COMBBLAS */
 
 /* enable 64bit index mode */
-#define XSDK_INDEX_SIZE 64
+/* #undef XSDK_INDEX_SIZE */
 
 #if (XSDK_INDEX_SIZE == 64)
 #define _LONGINT 1
diff --git a/cmake/XSDKDefaults.cmake b/cmake/XSDKDefaults.cmake
index 20eea804..d454d789 100644
--- a/cmake/XSDKDefaults.cmake
+++ b/cmake/XSDKDefaults.cmake
@@ -98,9 +98,9 @@
 SET(USE_XSDK_DEFAULTS ${USE_XSDK_DEFAULTS_DEFAULT} CACHE BOOL
   "Use XSDK defaults and behavior.")
 PRINT_VAR(USE_XSDK_DEFAULTS)
-SET(XSDK_ENABLE_C TRUE)
-SET(XSDK_ENABLE_CXX TRUE)
-SET(XSDK_ENABLE_Fortran TRUE)
+SET(XSDK_ENABLE_C ON)
+SET(XSDK_ENABLE_CXX ON)
+SET(XSDK_ENABLE_Fortran ON)
 
 # Handle the compiler and flags for a language
 MACRO(XSDK_HANDLE_LANG_DEFAULTS  CMAKE_LANG_NAME  ENV_LANG_NAME
diff --git a/make.inc.in b/make.inc.in
index 31736a88..4956ebe9 100644
--- a/make.inc.in
+++ b/make.inc.in
@@ -27,7 +27,7 @@
 HAVE_COMBBLAS = @HAVE_COMBBLAS@
 HAVE_CUDA = @HAVE_CUDA@
 XSDK_ENABLE_Fortran = @XSDK_ENABLE_Fortran@
-ifeq ($(XSDK_ENABLE_Fortran),TRUE)
+ifeq ($(XSDK_ENABLE_Fortran),ON)
   DFORTRANLIB = $(SuperLUroot)/@CMAKE_INSTALL_LIBDIR@/${PROJECT_NAME_LIB_FORTRAN}
   LIBS = $(DFORTRANLIB) $(DSUPERLULIB) ${BLAS_LIB_EXPORT} -lm
   LIBS += ${EXTRA_FLIB_EXPORT}
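
The superlu_FortranCInterface.h hunk above switches the pre-generated header
to gfortran-style name mangling: a trailing underscore for global symbols and
__name_MOD_ for module symbols. A minimal sketch of how the FC_GLOBAL macro is
consumed on the C side; the routine name dcreate is hypothetical, chosen only
to illustrate the expansion:

    #include "superlu_FortranCInterface.h"

    /* FC_GLOBAL(dcreate, DCREATE) now expands to dcreate_, which is the
     * symbol gfortran emits for a global Fortran subroutine DCREATE. */
    extern void FC_GLOBAL(dcreate, DCREATE)(int *n);  /* binds to dcreate_ */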

From 4d43154be647c32abf03c7505182000c0297fbf7 Mon Sep 17 00:00:00 2001
From: Sherry Li
Date: Sat, 22 May 2021 21:50:21 -0700
Subject: [PATCH 094/147] Change c2cpp_GetAWPM() to {d,z}_c2cpp_GetHWPM().

---
 SRC/pdgssvx.c   |  4 ++--
 SRC/pdgssvx3d.c | 13 +++++++++----
 SRC/pzgssvx.c   |  4 ++--
 SRC/pzgssvx3d.c | 13 +++++++++----
 4 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/SRC/pdgssvx.c b/SRC/pdgssvx.c
index d8eb7b4d..822d8ebf 100644
--- a/SRC/pdgssvx.c
+++ b/SRC/pdgssvx.c
@@ -324,7 +324,7 @@ at the top-level directory.
  *           = LargeDiag_MC64: use the Duff/Koster algorithm to permute rows
  *                        of the original matrix to make the diagonal large
  *                        relative to the off-diagonal.
- *           = LargeDiag_APWM: use the parallel approximate-weight perfect
+ *           = LargeDiag_HWPM: use the parallel approximate-weight perfect
  *                        matching to permute rows of the original matrix
  *                        to make the diagonal large relative to the
  *                        off-diagonal.
@@ -913,7 +913,7 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A,
 		        if ( !iam ) printf("\t product of diagonal %e\n", dprod);
 	            }
 #endif
-                } else { /* use largeDiag_AWPM */
+                } else { /* use LargeDiag_HWPM */
 #ifdef HAVE_COMBBLAS
 		    d_c2cpp_GetHWPM(A, grid, ScalePermstruct);
 #else
diff --git a/SRC/pdgssvx3d.c b/SRC/pdgssvx3d.c
index 04b49faf..74ec2be4 100644
--- a/SRC/pdgssvx3d.c
+++ b/SRC/pdgssvx3d.c
@@ -15,8 +15,9 @@ at the top-level directory.
  *
  * <pre>
  * -- Distributed SuperLU routine (version 7.0) --
- * Lawrence Berkeley National Lab, Georgia Institute of Technology.
- * May 10, 2019
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
  */
 #include "superlu_ddefs.h"
 #if 0
@@ -320,6 +321,10 @@ at the top-level directory.
  *           = LargeDiag_MC64: use the Duff/Koster algorithm to permute rows of
  *                        the original matrix to make the diagonal large
  *                        relative to the off-diagonal.
+ *           = LargeDiag_HWPM: use the parallel approximate-weight perfect
+ *                        matching to permute rows of the original matrix
+ *                        to make the diagonal large relative to the
+ *                        off-diagonal.
  *           = MY_PERMR:  use the ordering given in ScalePermstruct->perm_r
  *                        input by the user.
  *
@@ -941,9 +946,9 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 				printf ("\t product of diagonal %e\n", dprod);
 			}
 #endif
-		    } else { /* use largeDiag_AWPM */
+		    } else { /* use LargeDiag_HWPM */
 #ifdef HAVE_COMBBLAS
-			c2cpp_GetAWPM(A, grid, ScalePermstruct);
+		        d_c2cpp_GetHWPM(A, grid, ScalePermstruct);
 #else
 			if ( iam == 0 ) {
 			    printf("CombBLAS is not available\n"); fflush(stdout);
diff --git a/SRC/pzgssvx.c b/SRC/pzgssvx.c
index e50ef7d3..a1ea6bcf 100644
--- a/SRC/pzgssvx.c
+++ b/SRC/pzgssvx.c
@@ -323,7 +323,7 @@ at the top-level directory.
  *           = LargeDiag_MC64: use the Duff/Koster algorithm to permute rows
  *                        of the original matrix to make the diagonal large
  *                        relative to the off-diagonal.
- *           = LargeDiag_APWM: use the parallel approximate-weight perfect
+ *           = LargeDiag_HWPM: use the parallel approximate-weight perfect
  *                        matching to permute rows of the original matrix
  *                        to make the diagonal large relative to the
  *                        off-diagonal.
@@ -914,7 +914,7 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A,
 		        if ( !iam ) printf("\t product of diagonal %e\n", dprod);
 	            }
 #endif
-                } else { /* use largeDiag_AWPM */
+                } else { /* use LargeDiag_HWPM */
 #ifdef HAVE_COMBBLAS
 		    z_c2cpp_GetHWPM(A, grid, ScalePermstruct);
 #else
diff --git a/SRC/pzgssvx3d.c b/SRC/pzgssvx3d.c
index b4d4b2ab..479296ae 100644
--- a/SRC/pzgssvx3d.c
+++ b/SRC/pzgssvx3d.c
@@ -14,8 +14,9 @@ at the top-level directory.
  *
  * <pre>
  * -- Distributed SuperLU routine (version 7.0) --
- * Lawrence Berkeley National Lab, Georgia Institute of Technology.
- * May 10, 2019
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
  */
 #include "superlu_zdefs.h"
 #if 0
@@ -319,6 +320,10 @@ at the top-level directory.
  *           = LargeDiag_MC64: use the Duff/Koster algorithm to permute rows of
  *                        the original matrix to make the diagonal large
  *                        relative to the off-diagonal.
+ *           = LargeDiag_HWPM: use the parallel approximate-weight perfect
+ *                        matching to permute rows of the original matrix
+ *                        to make the diagonal large relative to the
+ *                        off-diagonal.
  *           = MY_PERMR:  use the ordering given in ScalePermstruct->perm_r
  *                        input by the user.
  *
@@ -942,9 +947,9 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 				printf ("\t product of diagonal %e\n", dprod);
 			}
 #endif
-		    } else { /* use largeDiag_AWPM */
+		    } else { /* use LargeDiag_HWPM */
 #ifdef HAVE_COMBBLAS
-			c2cpp_GetAWPM(A, grid, ScalePermstruct);
+		        z_c2cpp_GetHWPM(A, grid, ScalePermstruct);
 #else
 			if ( iam == 0 ) {
 			    printf("CombBLAS is not available\n"); fflush(stdout);

From bb400c86aeba0daf998417ed7404475ff8f08f51 Mon Sep 17 00:00:00 2001
From: Sherry Li 
Date: Sat, 29 May 2021 10:31:23 -0700
Subject: [PATCH 095/147] Latest patch from Satish (incorporating patches from
 @drew-parsons) to resolve the issues related to the FORTRAN/ wrapper
 (e.g. make -j). Add back the big.rua test matrix.

---
 EXAMPLE/big.rua        | 11496 +++++++++++++++++++++++++++++++++++++++
 FORTRAN/CMakeLists.txt |    26 +-
 SRC/sp_ienv.c          |     8 +-
 3 files changed, 11514 insertions(+), 16 deletions(-)
 create mode 100644 EXAMPLE/big.rua

diff --git a/EXAMPLE/big.rua b/EXAMPLE/big.rua
new file mode 100644
index 00000000..3a5c16f7
--- /dev/null
+++ b/EXAMPLE/big.rua
@@ -0,0 +1,11496 @@
+32-bit adder, from Steve Hamm (Motorola) hamm@austoto.sps.mot.com       add32   
+         11491           382          1493          7962          1654
+RUA                     4960          4960         23884             0
+(13i6)          (16i5)          (3e26.18)           (3e26.18)           
+F                          1             0
+     1    30    42    45    57    65    73    76    87   111   114   117   127
+   130   144   157   165   178   181   191   203   206   214   226   236   239
+   251   254   263   274   278   288   310   321   324   340   350   359   362
+   373   397   400   404   417   420   430   440   450   468   471   484   496
+   499   509   520   530   534   546   549   558   570   573   581   609   622
+   625   635   644   653   656   671   695   698   702   714   717   728   739
+   752   765   768   778   793   796   804   816   826   829   841   844   853
+   865   868   876   899   912   915   925   934   943   946   961   988   991
+   995  1009  1012  1022  1031  1043  1058  1061  1071  1087  1090  1098  1110
+  1121  1124  1136  1140  1149  1160  1164  1174  1202  1214  1218  1230  1239
+  1248  1251  1263  1288  1291  1294  1304  1307  1319  1330  1343  1356  1359
+  1371  1385  1388  1398  1409  1418  1422  1436  1440  1449  1461  1464  1472
+  1501  1512  1516  1532  1542  1551  1554  1565  1590  1593  1596  1607  1611
+  1623  1634  1646  1660  1663  1675  1689  1692  1700  1712  1722  1725  1737
+  1740  1749  1761  1764  1772  1801  1812  1816  1832  1842  1851  1854  1865
+  1891  1894  1897  1910  1913  1923  1932  1945  1959  1962  1974  1990  1993
+  2003  2014  2024  2028  2039  2043  2052  2064  2067  2075  2101  2113  2117
+  2131  2140  2149  2152  2163  2190  2193  2197  2209  2212  2224  2234  2244
+  2258  2261  2271  2286  2289  2297  2309  2319  2322  2334  2337  2346  2358
+  2361  2369  2399  2411  2414  2425  2434  2443  2446  2459  2483  2486  2490
+  2504  2507  2517  2526  2536  2552  2555  2565  2581  2584  2592  2604  2614
+  2617  2629  2632  2641  2653  2656  2664  2696  2708  2711  2722  2731  2740
+  2743  2755  2780  2783  2787  2799  2802  2813  2827  2839  2852  2855  2868
+  2879  2882  2890  2902  2912  2915  2927  2930  2939  2951  2954  2962  2992
+  3004  3007  3018  3027  3036  3039  3051  3078  3081  3085  3096  3099  3114
+  3126  3139  3150  3153  3165  3177  3180  3188  3200  3210  3213  3225  3228
+  3237  3248  3252  3262  3292  3304  3307  3318  3327  3336  3339  3351  3377
+  3380  3384  3395  3398  3412  3425  3437  3449  3452  3465  3476  3479  3487
+  3499  3509  3512  3524  3527  3536  3548  3551  3559  3586  3598  3601  3614
+  3623  3632  3635  3647  3673  3676  3680  3691  3694  3709  3722  3735  3746
+  3749  3761  3773  3776  3784  3796  3806  3809  3821  3824  3833  3845  3848
+  3856  3886  3898  3901  3914  3923  3932  3935  3946  3973  3976  3980  3991
+  3994  4006  4019  4031  4044  4047  4057  4070  4073  4081  4093  4103  4106
+  4118  4121  4130  4142  4145  4153  4180  4193  4196  4206  4215  4224  4227
+  4241  4267  4270  4274  4287  4290  4300  4312  4322  4338  4341  4353  4365
+  4368  4376  4388  4398  4401  4413  4416  4425  4436  4440  4450  4476  4489
+  4492  4502  4511  4520  4523  4537  4566  4569  4573  4584  4587  4602  4614
+  4626  4638  4641  4651  4664  4667  4677  4688  4697  4701  4712  4715  4724
+  4736  4739  4747  4779  4792  4795  4805  4814  4823  4826  4841  4864  4867
+  4871  4882  4885  4897  4908  4920  4934  4937  4948  4962  4965  4973  4985
+  4995  4998  5010  5013  5022  5034  5037  5045  5070  5082  5085  5096  5105
+  5114  5117  5137  5164  5167  5171  5183  5186  5200  5213  5225  5237  5240
+  5252  5264  5267  5275  5287  5297  5300  5312  5315  5324  5336  5339  5347
+  5376  5389  5392  5402  5411  5420  5423  5437  5463  5466  5470  5481  5484
+  5498  5512  5526  5537  5540  5553  5564  5567  5575  5587  5597  5600  5612
+  5615  5624  5636  5639  5647  5674  5687  5690  5700  5709  5718  5721  5735
+  5759  5762  5766  5777  5780  5792  5805  5815  5829  5832  5843  5856  5859
+  5867  5879  5889  5894  5906  5909  5918  5930  5935  5943  5970  5982  5985
+  5997  6007  6016  6019  6033  6056  6059  6062  6072  6075  6088  6098  6111
+  6125  6129  6142  6156  6160  6169  6179  6189  6193  6205  6208  6217  6228
+  6233  6240  6264  6275  6278  6294  6304  6313  6316  6327  6351  6354  6357
+  6367  6370  6383  6393  6406  6420  6424  6437  6451  6455  6463  6475  6485
+  6490  6502  6505  6514  6526  6531  6539  6563  6574  6577  6596  6605  6613
+  6616  6627  6652  6655  6658  6671  6674  6684  6693  6706  6721  6725  6737
+  6754  6758  6765  6776  6786  6791  6803  6806  6815  6825  6829  6838  6867
+  6879  6882  6897  6907  6916  6919  6931  6955  6958  6961  6972  6975  6987
+  6997  7010  7023  7027  7039  7054  7058  7066  7078  7088  7093  7105  7108
+  7117  7129  7134  7142  7168  7179  7182  7197  7207  7216  7219  7230  7255
+  7258  7261  7271  7274  7287  7297  7309  7324  7328  7341  7354  7357  7364
+  7374  7383  7388  7400  7403  7412  7423  7428  7435  7465  7478  7481  7491
+  7500  7509  7512  7527  7551  7554  7557  7567  7570  7583  7593  7604  7618
+  7622  7633  7648  7652  7663  7673  7683  7687  7699  7702  7711  7722  7728
+  7737  7765  7778  7781  7791  7800  7809  7812  7827  7851  7854  7857  7867
+  7870  7883  7893  7903  7917  7920  7930  7945  7948  7957  7968  7978  7984
+  7996  7999  8008  8019  8025  8034  8061  8074  8077  8087  8096  8105  8108
+  8123  8150  8154  8158  8171  8174  8184  8196  8206  8222  8225  8236  8249
+  8253  8262  8273  8283  8289  8301  8304  8313  8323  8327  8338  8369  8381
+  8384  8395  8404  8413  8416  8429  8451  8454  8458  8469  8472  8484  8495
+  8505  8520  8523  8534  8548  8551  8560  8571  8581  8587  8599  8602  8611
+  8622  8628  8637  8663  8676  8679  8689  8698  8707  8710  8724  8750  8754
+  8758  8769  8772  8784  8795  8805  8820  8823  8834  8848  8852  8861  8872
+  8882  8888  8900  8903  8912  8922  8926  8937  8959  8971  8974  8986  8995
+  9004  9007  9018  9044  9048  9052  9063  9066  9079  9089  9099  9113  9116
+  9127  9142  9146  9155  9166  9176  9182  9194  9197  9206  9216  9220  9231
+  9234  9244  9247  9259  9269  9278  9281  9292  9317  9320  9323  9333  9336
+  9352  9362  9376  9387  9391  9404  9417  9421  9432  9442  9452  9456  9468
+  9471  9480  9491  9497  9506  9510  9513  9518  9521  9525  9532  9537  9540
+  9542  9545  9548  9550  9554  9557  9559  9562  9566  9569  9574  9577  9582
+  9585  9590  9596  9599  9601  9605  9608  9611  9615  9619  9622  9626  9629
+  9633  9636  9638  9641  9645  9648  9651  9653  9655  9658  9662  9665  9669
+  9672  9677  9680  9684  9691  9695  9698  9700  9703  9707  9710  9713  9715
+  9719  9722  9726  9729  9734  9737  9742  9748  9752  9755  9757  9760  9763
+  9765  9769  9772  9775  9779  9783  9786  9790  9793  9798  9801  9805  9813
+  9817  9820  9824  9827  9830  9834  9838  9841  9845  9848  9852  9855  9858
+  9862  9866  9869  9872  9874  9876  9879  9883  9886  9890  9893  9898  9901
+  9905  9912  9916  9919  9921  9924  9928  9931  9934  9936  9940  9943  9947
+  9950  9955  9958  9962  9970  9974  9977  9979  9982  9985  9987  9991  9994
+  9997 10001 10005 10008 10012 10015 10020 10023 10027 10034 10038 10041 10045
+ 10048 10051 10055 10059 10062 10066 10069 10073 10076 10079 10083 10087 10090
+ 10093 10095 10097 10100 10104 10107 10111 10114 10119 10122 10126 10133 10137
+ 10140 10142 10145 10149 10152 10155 10157 10161 10164 10168 10171 10176 10179
+ 10183 10190 10194 10197 10199 10202 10205 10207 10211 10214 10217 10221 10225
+ 10228 10232 10235 10240 10243 10247 10256 10260 10263 10267 10270 10273 10277
+ 10281 10284 10288 10291 10295 10298 10301 10305 10309 10312 10315 10317 10319
+ 10322 10326 10329 10333 10336 10340 10343 10347 10353 10357 10360 10362 10365
+ 10369 10372 10375 10377 10381 10384 10388 10391 10395 10398 10402 10408 10412
+ 10415 10417 10420 10423 10425 10429 10432 10435 10439 10443 10446 10451 10454
+ 10459 10462 10467 10473 10477 10480 10484 10487 10490 10494 10498 10501 10505
+ 10508 10512 10515 10518 10522 10526 10529 10532 10534 10536 10539 10543 10546
+ 10550 10553 10558 10561 10565 10570 10574 10577 10579 10582 10586 10589 10592
+ 10594 10598 10601 10605 10608 10612 10615 10620 10625 10629 10632 10634 10637
+ 10640 10642 10646 10649 10652 10656 10660 10663 10668 10671 10675 10678 10682
+ 10688 10692 10695 10699 10702 10705 10709 10713 10716 10720 10723 10727 10730
+ 10733 10737 10741 10744 10747 10749 10751 10754 10758 10761 10765 10768 10773
+ 10776 10780 10787 10791 10794 10796 10799 10803 10806 10809 10811 10815 10818
+ 10822 10825 10829 10832 10837 10842 10846 10849 10851 10854 10857 10859 10863
+ 10866 10869 10873 10877 10880 10885 10888 10893 10896 10900 10909 10913 10916
+ 10920 10923 10926 10930 10934 10937 10941 10944 10948 10951 10954 10958 10962
+ 10965 10968 10970 10972 10975 10979 10982 10986 10989 10993 10996 11000 11007
+ 11011 11014 11016 11019 11023 11026 11029 11031 11035 11038 11042 11045 11049
+ 11052 11056 11062 11066 11069 11071 11074 11077 11079 11083 11086 11089 11093
+ 11097 11100 11104 11107 11112 11115 11119 11126 11130 11133 11137 11140 11143
+ 11147 11151 11154 11158 11161 11165 11168 11171 11175 11179 11182 11185 11187
+ 11189 11192 11196 11199 11203 11206 11211 11214 11218 11225 11229 11232 11234
+ 11237 11241 11244 11247 11249 11253 11256 11260 11263 11268 11271 11275 11282
+ 11286 11289 11291 11294 11297 11299 11303 11306 11309 11313 11317 11320 11324
+ 11327 11332 11335 11339 11348 11352 11355 11359 11362 11365 11369 11373 11376
+ 11380 11383 11387 11390 11393 11397 11401 11404 11407 11409 11411 11414 11418
+ 11421 11425 11428 11433 11436 11440 11447 11451 11454 11456 11459 11463 11466
+ 11469 11471 11475 11478 11482 11485 11490 11493 11497 11504 11508 11511 11513
+ 11516 11519 11521 11525 11528 11531 11535 11539 11542 11546 11549 11554 11557
+ 11561 11568 11572 11575 11579 11582 11585 11589 11593 11596 11600 11603 11607
+ 11610 11613 11617 11621 11624 11627 11629 11631 11634 11638 11641 11645 11648
+ 11653 11656 11660 11667 11671 11674 11676 11679 11683 11686 11689 11691 11695
+ 11698 11702 11705 11710 11713 11717 11724 11728 11731 11733 11736 11739 11741
+ 11745 11748 11751 11755 11759 11762 11766 11769 11774 11777 11782 11788 11792
+ 11795 11799 11802 11805 11809 11813 11816 11820 11823 11827 11830 11833 11837
+ 11841 11844 11847 11849 11851 11854 11858 11861 11865 11868 11873 11876 11880
+ 11887 11891 11894 11896 11899 11903 11906 11909 11911 11915 11918 11922 11925
+ 11930 11933 11937 11944 11948 11951 11953 11956 11959 11961 11965 11968 11971
+ 11975 11979 11982 11986 11989 11994 11997 12002 12008 12012 12015 12019 12022
+ 12025 12029 12033 12036 12040 12043 12047 12050 12053 12057 12061 12064 12067
+ 12069 12071 12074 12078 12081 12085 12088 12093 12096 12100 12107 12111 12114
+ 12116 12119 12123 12126 12129 12131 12135 12138 12142 12145 12150 12153 12157
+ 12164 12168 12171 12173 12176 12179 12181 12185 12188 12191 12195 12199 12202
+ 12206 12209 12214 12217 12222 12228 12232 12235 12239 12242 12245 12249 12253
+ 12256 12260 12263 12267 12270 12273 12277 12281 12284 12287 12289 12291 12294
+ 12298 12301 12305 12308 12313 12316 12320 12327 12331 12334 12336 12339 12343
+ 12346 12349 12351 12355 12358 12362 12365 12370 12373 12377 12384 12388 12391
+ 12393 12396 12399 12401 12405 12408 12411 12415 12419 12422 12426 12429 12434
+ 12437 12442 12448 12452 12455 12459 12462 12465 12469 12473 12476 12480 12483
+ 12487 12490 12493 12497 12501 12504 12507 12509 12511 12514 12518 12521 12525
+ 12528 12533 12536 12540 12547 12551 12554 12556 12559 12563 12566 12569 12571
+ 12575 12578 12582 12585 12590 12593 12597 12605 12609 12612 12614 12617 12620
+ 12622 12626 12629 12632 12636 12640 12643 12647 12650 12655 12658 12662 12670
+ 12674 12677 12681 12684 12687 12691 12695 12698 12702 12705 12709 12712 12715
+ 12719 12723 12726 12729 12731 12733 12736 12740 12743 12747 12750 12755 12758
+ 12762 12769 12773 12776 12778 12781 12785 12788 12791 12793 12797 12800 12804
+ 12807 12812 12815 12819 12827 12831 12834 12836 12839 12842 12844 12848 12851
+ 12854 12858 12862 12865 12869 12872 12877 12880 12885 12891 12895 12898 12902
+ 12905 12908 12912 12916 12919 12923 12926 12930 12933 12936 12940 12944 12947
+ 12950 12952 12954 12957 12961 12964 12968 12971 12976 12979 12983 12991 12995
+ 12998 13000 13003 13007 13010 13013 13015 13019 13022 13026 13029 13034 13037
+ 13041 13049 13053 13056 13058 13061 13064 13066 13070 13073 13076 13080 13084
+ 13087 13091 13094 13099 13102 13107 13113 13117 13120 13124 13127 13130 13134
+ 13138 13141 13145 13148 13152 13155 13158 13162 13166 13169 13172 13174 13176
+ 13179 13183 13186 13190 13193 13198 13201 13205 13212 13216 13219 13221 13224
+ 13228 13231 13234 13236 13240 13243 13247 13250 13255 13258 13262 13269 13273
+ 13276 13278 13281 13284 13286 13290 13293 13296 13300 13304 13307 13311 13314
+ 13319 13322 13326 13333 13337 13340 13344 13347 13350 13354 13358 13361 13365
+ 13368 13372 13375 13378 13382 13386 13389 13392 13394 13396 13399 13403 13406
+ 13410 13413 13418 13421 13425 13432 13436 13439 13441 13444 13448 13451 13454
+ 13456 13460 13463 13467 13470 13475 13478 13482 13490 13494 13497 13499 13502
+ 13505 13507 13511 13514 13517 13521 13525 13528 13532 13535 13540 13543 13548
+ 13554 13558 13561 13565 13568 13571 13575 13579 13582 13586 13589 13593 13596
+ 13599 13603 13607 13610 13613 13615 13617 13620 13624 13627 13631 13634 13639
+ 13642 13646 13653 13657 13660 13662 13665 13669 13672 13675 13677 13681 13684
+ 13688 13691 13696 13699 13703 13710 13714 13717 13719 13722 13725 13727 13731
+ 13734 13737 13741 13745 13748 13752 13755 13760 13763 13768 13774 13778 13781
+ 13785 13788 13791 13795 13799 13802 13806 13809 13813 13816 13819 13823 13827
+ 13830 13833 13835 13837 13840 13844 13847 13851 13854 13859 13862 13866 13873
+ 13877 13880 13882 13885 13889 13892 13895 13897 13901 13904 13908 13911 13916
+ 13919 13924 13930 13934 13937 13939 13942 13945 13947 13951 13954 13957 13961
+ 13965 13968 13973 13976 13981 13984 13989 13995 13999 14002 14006 14009 14012
+ 14016 14020 14023 14027 14030 14034 14037 14040 14044 14048 14051 14054 14056
+ 14058 14061 14066 14069 14073 14076 14081 14084 14088 14095 14099 14102 14104
+ 14107 14112 14115 14118 14120 14124 14127 14131 14134 14139 14142 14147 14153
+ 14157 14160 14162 14165 14168 14170 14174 14177 14180 14184 14188 14191 14196
+ 14199 14204 14207 14212 14218 14222 14225 14229 14232 14235 14239 14243 14246
+ 14250 14253 14257 14260 14263 14267 14271 14274 14277 14279 14281 14284 14288
+ 14291 14295 14298 14303 14306 14310 14317 14321 14324 14326 14329 14333 14336
+ 14339 14341 14345 14348 14352 14355 14360 14363 14368 14374 14379 14382 14384
+ 14387 14390 14392 14396 14399 14402 14406 14410 14413 14418 14421 14426 14429
+ 14433 14442 14446 14449 14453 14456 14459 14463 14467 14470 14474 14477 14481
+ 14484 14487 14491 14495 14498 14501 14503 14505 14508 14513 14516 14520 14523
+ 14528 14531 14535 14542 14546 14549 14551 14554 14559 14562 14565 14567 14571
+ 14574 14578 14581 14586 14589 14594 14600 14604 14607 14609 14612 14615 14617
+ 14621 14624 14627 14631 14635 14638 14643 14646 14651 14654 14658 14665 14669
+ 14672 14676 14679 14682 14686 14690 14693 14697 14700 14704 14707 14710 14714
+ 14718 14721 14724 14726 14728 14731 14735 14738 14742 14745 14750 14753 14757
+ 14764 14768 14771 14773 14776 14780 14783 14786 14788 14792 14795 14799 14802
+ 14807 14810 14815 14821 14825 14828 14830 14833 14836 14838 14842 14845 14848
+ 14852 14856 14859 14864 14867 14872 14875 14880 14886 14890 14893 14897 14900
+ 14903 14907 14911 14914 14918 14921 14925 14928 14931 14935 14939 14942 14945
+ 14947 14949 14952 14957 14960 14964 14967 14972 14975 14979 14986 14990 14993
+ 14995 14998 15003 15006 15009 15011 15016 15019 15023 15026 15031 15034 15038
+ 15046 15050 15053 15055 15058 15061 15063 15067 15070 15073 15077 15081 15084
+ 15089 15092 15097 15100 15105 15111 15115 15118 15122 15125 15128 15132 15136
+ 15139 15143 15146 15150 15153 15156 15160 15164 15167 15170 15172 15174 15177
+ 15181 15184 15188 15191 15196 15199 15203 15210 15214 15217 15219 15222 15226
+ 15229 15232 15234 15238 15241 15245 15248 15253 15256 15260 15268 15272 15275
+ 15277 15280 15283 15285 15289 15292 15295 15299 15303 15306 15311 15314 15319
+ 15322 15327 15333 15337 15340 15344 15347 15350 15354 15358 15361 15365 15368
+ 15372 15375 15378 15382 15386 15389 15392 15394 15396 15399 15403 15406 15410
+ 15413 15418 15421 15425 15432 15436 15439 15441 15444 15448 15451 15454 15456
+ 15460 15463 15467 15470 15475 15478 15482 15490 15494 15497 15499 15502 15505
+ 15507 15511 15514 15517 15521 15525 15528 15532 15535 15540 15543 15547 15555
+ 15559 15562 15566 15569 15572 15576 15580 15583 15587 15590 15594 15597 15600
+ 15604 15608 15611 15614 15616 15618 15621 15625 15628 15632 15635 15640 15643
+ 15647 15654 15658 15661 15663 15666 15670 15673 15676 15678 15682 15685 15689
+ 15692 15697 15700 15704 15711 15715 15718 15720 15723 15726 15728 15732 15735
+ 15738 15742 15746 15749 15753 15756 15761 15764 15769 15775 15779 15782 15786
+ 15789 15792 15796 15800 15803 15807 15810 15814 15817 15820 15824 15828 15831
+ 15834 15836 15838 15841 15845 15848 15852 15855 15860 15863 15867 15874 15878
+ 15881 15883 15886 15890 15893 15896 15898 15902 15905 15909 15912 15917 15920
+ 15924 15932 15936 15939 15941 15944 15947 15949 15953 15956 15959 15963 15967
+ 15970 15974 15977 15982 15985 15990 15996 16000 16003 16007 16010 16013 16017
+ 16021 16024 16028 16031 16035 16038 16041 16045 16049 16052 16055 16057 16059
+ 16062 16066 16069 16073 16076 16081 16084 16088 16095 16099 16102 16104 16107
+ 16111 16114 16117 16119 16123 16126 16130 16133 16138 16141 16145 16152 16156
+ 16159 16161 16164 16167 16169 16173 16176 16179 16183 16187 16190 16194 16197
+ 16202 16205 16210 16216 16220 16223 16227 16230 16233 16237 16241 16244 16248
+ 16251 16255 16258 16261 16265 16269 16272 16275 16277 16279 16282 16286 16289
+ 16293 16296 16301 16304 16308 16315 16319 16322 16324 16327 16331 16334 16337
+ 16339 16343 16346 16351 16354 16359 16362 16367 16373 16377 16380 16382 16385
+ 16388 16390 16394 16397 16400 16404 16408 16411 16416 16419 16424 16427 16432
+ 16438 16442 16445 16449 16452 16455 16459 16463 16466 16470 16473 16477 16480
+ 16483 16487 16491 16494 16497 16499 16501 16504 16508 16511 16515 16518 16523
+ 16526 16530 16537 16541 16544 16546 16549 16553 16556 16559 16561 16565 16568
+ 16572 16575 16579 16582 16585 16589 16594 16597 16600 16603 16606 16608 16612
+ 16615 16620 16623 16628 16631 16635 16638 16641 16645 16649 16652 16656 16662
+ 16668 16671 16674 16676 16680 16683 16686 16691 16697 16700 16704 16707 16711
+ 16714 16717 16722 16725 16727 16731 16734 16738 16741 16744 16748 16752 16755
+ 16758 16763 16767 16770 16773 16775 16781 16784 16787 16793 16797 16800 16804
+ 16807 16810 16814 16818 16821 16824 16827 16830 16832 16836 16839 16844 16847
+ 16851 16854 16858 16861 16864 16868 16872 16875 16879 16887 16892 16895 16899
+ 16902 16906 16909 16914 16920 16925 16928 16932 16935 16939 16942 16945 16950
+ 16953 16955 16959 16962 16966 16969 16972 16976 16980 16983 16986 16991 16997
+ 17000 17003 17005 17009 17012 17017 17023 17027 17030 17034 17037 17040 17044
+ 17048 17051 17054 17057 17060 17062 17066 17069 17074 17077 17081 17084 17088
+ 17091 17094 17098 17102 17105 17110 17116 17121 17124 17128 17131 17135 17138
+ 17142 17150 17155 17158 17162 17165 17169 17172 17175 17180 17183 17185 17189
+ 17192 17196 17199 17202 17206 17210 17213 17216 17221 17227 17230 17233 17235
+ 17241 17244 17248 17255 17259 17262 17266 17269 17272 17276 17280 17283 17286
+ 17289 17292 17294 17298 17301 17306 17309 17313 17316 17320 17323 17326 17330
+ 17334 17337 17342 17348 17353 17356 17360 17363 17367 17370 17374 17382 17387
+ 17390 17394 17397 17401 17404 17407 17412 17415 17417 17421 17424 17428 17431
+ 17434 17438 17442 17445 17448 17453 17457 17460 17463 17465 17471 17474 17479
+ 17485 17489 17492 17496 17499 17502 17506 17510 17513 17516 17519 17522 17524
+ 17528 17531 17536 17539 17544 17547 17551 17554 17557 17561 17565 17568 17573
+ 17579 17584 17587 17591 17594 17598 17601 17606 17612 17617 17620 17624 17627
+ 17631 17634 17637 17642 17645 17647 17651 17654 17658 17661 17664 17668 17672
+ 17675 17678 17683 17689 17692 17695 17697 17701 17704 17709 17715 17719 17722
+ 17726 17729 17732 17736 17740 17743 17746 17749 17752 17754 17758 17761 17766
+ 17769 17774 17777 17781 17784 17787 17791 17795 17798 17802 17809 17814 17817
+ 17821 17824 17828 17831 17836 17842 17847 17850 17854 17857 17861 17864 17867
+ 17872 17875 17877 17881 17884 17888 17891 17894 17898 17902 17905 17908 17913
+ 17919 17922 17925 17927 17933 17936 17941 17947 17951 17954 17958 17961 17964
+ 17968 17972 17975 17978 17981 17984 17986 17990 17993 17998 18001 18006 18009
+ 18013 18016 18019 18023 18027 18030 18035 18041 18046 18049 18053 18056 18060
+ 18063 18068 18074 18079 18082 18086 18089 18093 18096 18099 18104 18107 18109
+ 18113 18116 18121 18124 18127 18131 18135 18138 18141 18146 18152 18155 18158
+ 18160 18164 18167 18172 18178 18182 18185 18189 18192 18195 18199 18203 18206
+ 18209 18212 18215 18217 18221 18224 18229 18232 18236 18239 18243 18246 18249
+ 18253 18257 18260 18264 18272 18277 18280 18284 18287 18291 18294 18298 18306
+ 18311 18314 18318 18321 18325 18328 18331 18336 18339 18341 18345 18348 18352
+ 18355 18358 18362 18366 18369 18372 18377 18383 18386 18389 18391 18397 18400
+ 18405 18411 18415 18418 18422 18425 18428 18432 18436 18439 18442 18445 18448
+ 18450 18454 18457 18462 18465 18469 18472 18476 18479 18482 18486 18490 18493
+ 18497 18505 18510 18513 18517 18520 18524 18527 18531 18539 18544 18547 18551
+ 18554 18558 18561 18564 18569 18572 18574 18578 18581 18585 18588 18591 18595
+ 18599 18602 18605 18610 18616 18619 18622 18624 18630 18633 18637 18644 18648
+ 18651 18655 18658 18661 18665 18669 18672 18675 18678 18681 18683 18687 18690
+ 18695 18698 18702 18705 18709 18712 18715 18719 18723 18726 18731 18737 18742
+ 18745 18749 18752 18756 18759 18764 18770 18775 18778 18782 18785 18789 18792
+ 18795 18800 18803 18805 18809 18812 18816 18819 18822 18826 18830 18833 18836
+ 18841 18847 18850 18853 18855 18861 18864 18869 18875 18879 18882 18886 18889
+ 18892 18896 18900 18903 18906 18909 18912 18914 18918 18921 18926 18929 18933
+ 18936 18940 18943 18946 18950 18954 18957 18962 18968 18973 18976 18980 18983
+ 18987 18990 18995 19001 19006 19009 19013 19016 19020 19023 19026 19031 19034
+ 19036 19040 19043 19047 19050 19053 19057 19061 19064 19067 19072 19076 19079
+ 19082 19084 19090 19093 19098 19104 19108 19111 19115 19118 19121 19125 19129
+ 19132 19135 19138 19141 19143 19147 19150 19155 19158 19162 19165 19169 19172
+ 19175 19179 19183 19186 19191 19197 19202 19205 19209 19212 19216 19219 19224
+ 19230 19235 19238 19242 19245 19249 19252 19255 19260 19263 19265 19269 19272
+ 19276 19279 19282 19286 19290 19293 19296 19301 19307 19310 19313 19315 19321
+ 19324 19329 19335 19339 19342 19346 19349 19352 19356 19360 19363 19366 19369
+ 19372 19374 19378 19381 19386 19389 19393 19396 19400 19403 19406 19410 19414
+ 19417 19422 19428 19433 19436 19440 19443 19447 19450 19454 19461 19466 19469
+ 19473 19476 19480 19483 19486 19491 19494 19496 19500 19503 19507 19510 19513
+ 19517 19521 19524 19527 19532 19538 19541 19544 19546 19552 19555 19559 19566
+ 19570 19573 19577 19580 19583 19587 19591 19594 19597 19600 19603 19605 19609
+ 19612 19617 19620 19624 19627 19631 19634 19637 19641 19645 19648 19653 19659
+ 19664 19667 19671 19674 19678 19681 19685 19693 19698 19701 19705 19708 19712
+ 19715 19718 19723 19726 19728 19732 19735 19739 19742 19745 19749 19753 19756
+ 19759 19764 19770 19773 19776 19778 19784 19787 19792 19798 19802 19805 19809
+ 19812 19815 19819 19823 19826 19829 19832 19835 19837 19841 19844 19849 19852
+ 19856 19859 19863 19866 19869 19873 19877 19880 19884 19892 19897 19900 19904
+ 19907 19911 19914 19919 19925 19930 19933 19937 19940 19944 19947 19950 19955
+ 19958 19960 19964 19967 19971 19974 19977 19981 19985 19988 19991 19996 20000
+ 20003 20006 20008 20014 20017 20022 20028 20032 20035 20039 20042 20045 20049
+ 20053 20056 20059 20062 20065 20067 20071 20074 20079 20082 20086 20089 20093
+ 20096 20099 20103 20107 20110 20114 20121 20126 20129 20133 20136 20140 20143
+ 20147 20155 20160 20163 20167 20170 20174 20177 20180 20185 20188 20190 20194
+ 20197 20202 20205 20208 20212 20216 20219 20222 20227 20233 20236 20239 20241
+ 20245 20248 20253 20259 20263 20266 20270 20273 20276 20280 20284 20287 20290
+ 20293 20296 20298 20302 20305 20310 20313 20317 20320 20324 20327 20330 20334
+ 20338 20341 20346 20352 20357 20360 20364 20367 20371 20374 20378 20385 20390
+ 20393 20397 20400 20404 20407 20410 20415 20418 20420 20424 20427 20431 20434
+ 20437 20441 20445 20448 20451 20456 20462 20465 20468 20470 20476 20479 20483
+ 20490 20494 20497 20501 20504 20507 20511 20515 20518 20521 20524 20527 20529
+ 20533 20536 20541 20544 20548 20551 20555 20558 20561 20565 20569 20572 20576
+ 20583 20588 20591 20595 20598 20602 20605 20609 20616 20621 20624 20628 20631
+ 20635 20638 20641 20646 20649 20651 20655 20658 20662 20665 20668 20672 20676
+ 20679 20682 20687 20693 20696 20699 20701 20707 20710 20714 20721 20725 20728
+ 20732 20735 20738 20742 20746 20749 20752 20755 20758 20760 20764 20767 20772
+ 20775 20779 20782 20786 20789 20792 20796 20800 20803 20808 20814 20819 20822
+ 20826 20829 20833 20836 20841 20847 20852 20855 20859 20862 20866 20869 20872
+ 20877 20880 20882 20886 20889 20893 20896 20899 20903 20907 20910 20913 20918
+ 20924 20927 20930 20932 20938 20941 20946 20952 20956 20959 20963 20966 20969
+ 20973 20977 20980 20983 20986 20989 20991 20995 20998 21003 21006 21010 21013
+ 21017 21020 21023 21027 21031 21034 21038 21046 21051 21054 21058 21061 21065
+ 21068 21072 21079 21084 21087 21091 21094 21098 21101 21105 21108 21111 21113
+ 21117 21120 21124 21127 21130 21134 21138 21141 21145 21148 21153 21156 21159
+ 21161 21166 21169 21173 21180 21184 21187 21191 21194 21197 21201 21205 21208
+ 21211 21214 21217 21219 21223 21226 21231 21234 21239 21242 21246 21249 21252
+ 21256 21260 21263 21268 21273 21277 21280 21284 21287 21291 21294 21298 21304
+ 21308 21311 21315 21318 21322 21325 21328 21334 21337 21339 21343 21346 21350
+ 21353 21356 21360 21364 21367 21371 21374 21380 21383 21386 21388 21392 21395
+ 21399 21406 21410 21413 21417 21420 21423 21427 21431 21434 21437 21440 21443
+ 21445 21449 21452 21457 21460 21465 21468 21472 21475 21478 21482 21486 21489
+ 21494 21499 21503 21506 21510 21513 21517 21520 21524 21530 21534 21537 21541
+ 21544 21548 21551 21555 21558 21561 21563 21567 21570 21574 21577 21580 21584
+ 21588 21591 21595 21598 21603 21606 21609 21611 21616 21619 21624 21630 21634
+ 21637 21641 21644 21647 21651 21656 21659 21662 21665 21668 21670 21674 21677
+ 21682 21685 21690 21693 21697 21700 21703 21707 21711 21714 21718 21724 21728
+ 21731 21735 21738 21742 21745 21749 21755 21759 21762 21766 21769 21773 21776
+ 21780 21783 21786 21788 21792 21795 21799 21802 21805 21809 21813 21816 21819
+ 21825 21829 21832 21835 21837 21843 21846 21851 21857 21861 21864 21868 21871
+ 21874 21878 21882 21885 21888 21891 21894 21896 21900 21903 21908 21911 21916
+ 21919 21923 21926 21929 21933 21937 21940 21945 21950 21954 21957 21961 21964
+ 21968 21971 21975 21981 21985 21988 21992 21995 21999 22002 22006 22009 22012
+ 22014 22018 22021 22025 22028 22031 22035 22039 22042 22046 22049 22054 22057
+ 22060 22062 22067 22070 22074 22081 22085 22088 22092 22095 22098 22102 22106
+ 22109 22112 22115 22118 22120 22124 22127 22132 22135 22140 22143 22147 22150
+ 22153 22157 22161 22164 22168 22174 22178 22181 22185 22188 22192 22195 22200
+ 22206 22211 22214 22218 22221 22226 22229 22233 22236 22239 22241 22245 22248
+ 22252 22255 22258 22262 22266 22269 22273 22276 22282 22285 22288 22290 22296
+ 22299 22304 22310 22314 22317 22321 22324 22327 22331 22335 22338 22341 22344
+ 22347 22349 22353 22356 22361 22364 22369 22372 22376 22379 22382 22386 22390
+ 22393 22397 22404 22408 22411 22415 22418 22422 22425 22429 22436 22440 22443
+ 22447 22450 22454 22457 22461 22465 22468 22470 22474 22477 22481 22484 22487
+ 22491 22495 22498 22502 22505 22509 22512 22515 22517 22521 22524 22528 22535
+ 22539 22542 22546 22549 22552 22556 22560 22563 22566 22569 22572 22574 22578
+ 22581 22586 22589 22594 22597 22601 22604 22607 22611 22615 22618 22622 22630
+ 22635 22638 22642 22645 22649 22652 22656 22664 22669 22672 22676 22679 22683
+ 22686 22690 22693 22696 22698 22702 22705 22709 22712 22715 22719 22723 22726
+ 22730 22733 22737 22740 22743 22745 22749 22752 22756 22763 22767 22770 22774
+ 22777 22780 22784 22788 22791 22794 22797 22800 22802 22806 22809 22813 22816
+ 22820 22823 22827 22830 22833 22837 22841 22844 22848 22856 22861 22864 22868
+ 22871 22875 22878 22882 22889 22893 22896 22900 22903 22907 22910 22914 22917
+ 22920 22922 22926 22929 22933 22936 22939 22943 22947 22950 22954 22958 22962
+ 22965 22968 22970 22974 22977 22981 22987 22991 22994 22998 23001 23004 23008
+ 23012 23015 23018 23021 23024 23026 23030 23033 23038 23041 23045 23048 23052
+ 23055 23058 23062 23066 23069 23073 23081 23086 23089 23093 23096 23100 23103
+ 23107 23114 23119 23122 23126 23129 23133 23136 23140 23143 23146 23148 23152
+ 23155 23159 23162 23165 23169 23173 23176 23180 23183 23187 23190 23193 23195
+ 23199 23202 23206 23213 23217 23220 23224 23227 23230 23234 23238 23241 23244
+ 23247 23250 23252 23256 23259 23263 23266 23270 23273 23277 23280 23283 23287
+ 23291 23294 23298 23306 23311 23314 23318 23321 23325 23328 23332 23339 23343
+ 23346 23350 23353 23357 23360 23364 23367 23370 23372 23376 23379 23383 23386
+ 23389 23393 23397 23400 23404 23408 23412 23415 23418 23420 23424 23427 23432
+ 23437 23441 23444 23448 23451 23454 23458 23462 23465 23468 23471 23474 23476
+ 23480 23483 23487 23490 23494 23497 23501 23504 23507 23511 23515 23518 23522
+ 23530 23535 23538 23542 23545 23549 23552 23556 23563 23567 23570 23574 23577
+ 23581 23584 23588 23591 23594 23596 23600 23603 23607 23610 23613 23617 23621
+ 23624 23628 23632 23636 23639 23642 23644 23648 23651 23656 23661 23666 23669
+ 23673 23676 23679 23683 23687 23690 23693 23696 23699 23701 23705 23708 23713
+ 23716 23721 23724 23728 23731 23734 23738 23742 23745 23750 23755 23759 23762
+ 23766 23769 23773 23776 23781 23786 23790 23793 23797 23800 23804 23807 23811
+ 23815 23818 23820 23824 23827 23831 23834 23837 23841 23845 23848 23852 23855
+ 23859 23862 23865 23867 23871 23874 23879 23885
+    1    2    4    9   14   15   35   39   40   47   48   50   51   63  993 1069
+ 1070 1079 1080 1091 1092 2977 3064 3067 3068 3071 3072 3099 3100    1    2    4
+    5  993  994  997  998 2977 2978 2979 2982    3  995  998    1    2    4    8
+    9  995  996  998 1005 2979 2980 2989    2    5  997  998  999 2981 2982 2983
+    6  999 1000 1002 1003 2983 2984 2985    7 2986 2987    4    8    9 1005 1006
+ 1008 1009 2989 2990 3037 3038    1    4    8    9   14   15   17   19   26 1009
+ 1010 1021 1022 1025 1026 1039 2991 2992 3001 3002 3013 3014 3021 3038   10 2991
+ 3038   11 1011 2993   12   14   15 1011 1012 1015 2993 2994 2995 2998   13 1013
+ 1016    1    9   12   14   15   17 1013 1014 1015 1016 1019 2995 2996 2999    1
+    9   12   14   15   19   20 1015 1016 1027 2997 2998 3007   16   17 1017 1023
+ 1024 3003 3004 3005    9   14   16   17 1019 1020 1022 1023 2999 3000 3001 3002
+ 3003   18 3002 3003    9   15   19   20 1025 1031 1032 3011 3012 3013   15   19
+   20 1027 1028 1030 1031 3007 3008 3009 3010 3011   21 3010 3011   22   23 1033
+ 1037 1038 3019 3035 3036   22   23   24 1036 1037 1053 1054 3015 3016 3017 3018
+ 3035   23   24   26 1041 1042 1044 1053 3015 3023 3024   25 3018 3035    9   24
+   26   28 1039 1040 1043 1044 3021 3022 3023 3026   27 1041 1044   26   28   29
+ 1043 1044 1045 3025 3026 3027   28   29   31 1045 1046 1048 1049 3027 3028 3029
+ 3030   30   31 3030 3031   29   30   31 1049 1050 1051 3030 3031 3032 3033   32
+   33   35   63   70   71   78   82 1055 1131 1132 1141 1142 1153 1154 3039 3129
+ 3130 3133 3134 3161 3162   32   33   35   36 1055 1056 1059 3039 3040 3041 3044
+   34 1057 1060    1   32   33   35   36   39   40   63 1057 1058 1059 1060 1067
+ 3041 3042 3051   33   35   36   37 1059 1060 1061 3043 3044 3045   36   37 1061
+ 1062 1064 1065 3045 3046 3047   38 3048 3049    1   35   39   40 1067 1068 1070
+ 1071 3051 3052 3099    1   35   39   40   48   50   57   63 1071 1072 1083 1084
+ 1087 1088 1101 3053 3054 3063 3064 3075 3076 3083 3099 3100   41 3053 3100   42
+   43 1073 3055   42   43   45   46   48 1073 1074 1077 1078 3055 3056 3057 3060
+   44 1075 1078   43   45   48 1075 1076 1078 1081 3057 3058 3061   43   46   48
+   51 1077 1078 1089 3059 3060 3069    1   47   48 1079 1085 1086 3064 3065 3066
+ 3067    1   40   43   45   46   47   48   50   51 1078 1081 1082 1084 1085 3061
+ 3062 3063 3064   49 3064 3065    1   40   48   50   51 1087 1093 1094 3071 3072
+ 3073 3074 3075    1   46   48   50   51 1089 1090 1092 1093 3069 3070 3071   52
+ 3072 3073   53   54   56 1095 1099 1100 3080 3081 3097 3098   53   54   55 1098
+ 1099 1115 1116 3077 3078 3079 3080   54   55   57 1103 1104 1106 1115 3077 3085
+ 3086   53   56 3080 3097   40   55   57   59 1101 1102 1105 1106 3083 3084 3085
+ 3088   58 1103 1106   57   59   60 1105 1106 1107 3087 3088 3089   59   60   62
+ 1107 1108 1110 1111 3089 3090 3091 3092 3093   61 3092 3093   60   62 1111 1112
+ 1113 3093 3094 3095    1   32   35   40   63   64   70  101  102  109  110  113
+  156  280 1117 1193 1194 1203 1204 1215 1216 3101 3191 3192 3195 3196 3223 3224
+   63   64   66   67   70 1117 1118 1121 1122 3101 3102 3103 3106   65 1119 1122
+   64   66   70 1119 1120 1122 1129 3103 3104 3113   64   67   68 1121 1122 1123
+ 3105 3106 3107   67   68 1123 1124 1126 1127 3107 3108 3109   69 3110 3111   32
+   63   64   66   70   71 1122 1129 1130 1132 1133 3113 3114 3161 3162   32   70
+   71   78   79   81   82   88 1133 1134 1145 1146 1149 1150 1163 3115 3116 3125
+ 3126 3134 3137 3138 3145 3162   72 3115 3162   73   74 1135 3117   73   74   76
+   77 1135 1136 1139 1140 3117 3118 3119 3122   75 1137 1140   74   76   77   79
+ 1137 1138 1140 1143 3119 3120 3123   74   76   77   79   82 1139 1140 1151 3121
+ 3122 3131   32   71   78   79   82 1141 1147 1148 3125 3126 3127 3128 3129   71
+   76   77   78   79   82 1143 1144 1146 1147 3123 3124 3125   80 3126 3127   71
+   81   82 1149 1155 1156 3134 3135 3136 3137   32   71   77   78   79   81   82
+ 1151 1152 1154 1155 3131 3132 3133 3134   83 3134 3135   84   85 1157 1161 1162
+ 3143 3159 3160   84   85   86 1160 1161 1177 1178 3139 3140 3141 3142 3159   85
+   86   88 1165 1166 1168 1177 3139 3147 3148   87 3142 3159   71   86   88   90
+ 1163 1164 1167 1168 3145 3146 3147 3150   89 1165 1168   88   90   91 1167 1168
+ 1169 3149 3150 3151   90   91   93 1169 1170 1172 1173 3151 3152 3153 3154 3155
+   92 3154 3155   91   93 1173 1174 1175 3155 3156 3157   94   95  101  125  132
+  133  140  143  144 1179 1255 1256 1265 1266 1277 1278 3163 3253 3254 3257 3258
+ 3285 3286   94   95   97   98  101 1179 1180 1183 1184 3163 3164 3165 3168   96
+ 1181 1184   95   97  101 1181 1182 1184 1191 3165 3166 3175   95   98   99 1183
+ 1184 1185 3167 3168 3169   98   99 1185 1186 1188 1189 3169 3170 3171  100 3172
+ 3173   63   94   95   97  101  102  125  156 1191 1192 1194 1195 3175 3176 3223
+   63  101  102  109  110  112  113  119  156  280 1195 1196 1207 1208 1211 1212
+ 1225 3177 3178 3187 3188 3196 3199 3200 3207 3223 3224  103 3177 3224  104  105
+ 1197 3179  104  105  107  108  110  113 1197 1198 1201 1202 3179 3180 3181 3184
+  106 1199 1202  105  107  110 1199 1200 1202 1205 3181 3182 3185  105  108  113
+ 1201 1202 1213 3183 3184 3193   63  102  109  110 1203 1209 1210 3187 3188 3189
+ 3190 3191   63  102  105  107  109  110  113 1202 1205 1206 1208 1209 3185 3186
+ 3187  111 3188 3189  102  112  113 1211 1217 1218 3196 3197 3198 3199   63  102
+  105  108  110  112  113 1202 1213 1214 1216 1217 3193 3194 3195 3196  114 3196
+ 3197  115  116 1219 1223 1224 3205 3221 3222  115  116  117 1222 1223 1239 1240
+ 3201 3202 3203 3204 3221  116  117  119  120 1227 1228 1230 1239 3201 3209 3210
+  118 3204 3221  102  117  119  121 1225 1226 1229 1230 3207 3208 3209 3212  117
+  120 1227 1230  119  121  122 1229 1230 1231 3211 3212 3213  121  122  124 1231
+ 1232 1234 1235 3213 3214 3215 3216  123  124 3216 3217  122  123  124 1235 1236
+ 1237 3216 3217 3218 3219   94  101  125  126  128  132  133  156  159  163  164
+  171  174  175 1241 1317 1318 1327 1328 1339 1340 3225 3315 3316 3319 3320 3347
+ 3348  125  126  128  129 1241 1242 1245 1246 3225 3226 3227 3230  127  128 1243
+ 1246  125  126  127  128  132 1243 1244 1246 1253 3227 3228 3237  126  129  130
+ 1245 1246 1247 3229 3230 3231  129  130 1247 1248 1250 1251 3231 3232 3233  131
+ 3234 3235   94  125  128  132  133 1253 1254 1256 1257 3237 3238 3285   94  125
+  132  133  140  141  143  144  150 1257 1258 1269 1270 1273 1274 1287 3239 3240
+ 3249 3250 3261 3262 3269 3285 3286  134 3239 3286  135 1259 3241  136  138  139
+ 1259 1260 1263 3241 3242 3243 3246  137 1261 1264  136  138  139  141 1261 1262
+ 1263 1264 1267 3243 3244 3247  136  138  139  141  144 1263 1264 1275 3245 3246
+ 3255   94  133  140  141  144 1265 1271 1272 3249 3250 3251 3252 3253  133  138
+  139  140  141  144 1267 1268 1270 1271 3247 3248 3249  142 3250 3251   94  133
+  143  144 1273 1279 1280 3257 3258 3259 3260 3261   94  133  139  140  141  143
+  144 1275 1276 1278 1279 3255 3256 3257  145 3258 3259  146  147  149 1281 1285
+ 1286 3266 3267 3283 3284  146  147  148 1284 1285 1301 1302 3263 3264 3265 3266
+  147  148  150 1289 1290 1301 3263 3271 3272  146  149 3266 3283  133  148  150
+  151  152 1287 1288 1289 1291 1292 3269 3270 3271 3274  150  151 1289 1292  150
+  152  153 1291 1292 1293 3273 3274 3275  152  153  155 1293 1294 1296 1297 3275
+ 3276 3277 3278 3279  154 3278 3279  153  155 1297 1298 1299 3279 3280 3281   63
+  101  102  125  156  157  159  187  190  194  195  202  205  206  280 1303 1379
+ 1380 1389 1390 1401 1402 3287 3377 3378 3381 3382 3409 3410  156  157  159  160
+ 1303 1304 1307 3287 3288 3289 3292  158  159 1305 1308  125  156  157  158  159
+  160  163  164 1305 1306 1307 1308 1315 3289 3290 3299  157  159  160  161 1307
+ 1308 1309 3291 3292 3293  160  161 1309 1310 1312 1313 3293 3294 3295  162 3296
+ 3297  125  159  163  164 1315 1316 1318 1319 3299 3300 3347  125  159  163  164
+  171  172  174  175  181 1319 1320 1331 1332 1335 1336 1349 3301 3302 3311 3312
+ 3323 3324 3331 3347 3348  165 3301 3348  166 1321 3303  167  169  170 1321 1322
+ 1325 1326 3303 3304 3305 3308  168  169 1323 1326  167  168  169  170  172 1323
+ 1324 1326 1329 3305 3306 3309  167  169  170  172  175 1325 1326 1337 3307 3308
+ 3317  125  164  171  172  175 1327 1333 1334 3312 3313 3314 3315  164  169  170
+  171  172  175 1329 1330 1332 1333 3309 3310 3311 3312  173 3312 3313  125  164
+  174  175 1335 1341 1342 3319 3320 3321 3322 3323  125  164  170  171  172  174
+  175 1337 1338 1340 1341 3317 3318 3319  176 3320 3321  177  178 1343 1347 1348
+ 3329 3345 3346  177  178  179 1346 1347 1363 1364 3325 3326 3327 3328 3345  178
+  179  181 1351 1352 1354 1363 3325 3333 3334  180 3328 3345  164  179  181  183
+ 1349 1350 1353 1354 3331 3332 3333 3336  182 1351 1354  181  183  184 1353 1354
+ 1355 3335 3336 3337  183  184  186 1355 1356 1358 1359 3337 3338 3339 3340 3341
+  185 3340 3341  184  186 1359 1360 1361 3341 3342 3343  156  187  188  190  195
+  218  221  225  226  233  234  237  249  280 1365 1441 1442 1451 1452 1463 1464
+ 3349 3436 3439 3440 3443 3444 3471 3472  187  188  190  191 1365 1366 1369 3349
+ 3350 3351 3354  189  190 1367 1370  156  187  188  189  190  191  194  195 1367
+ 1368 1369 1370 1377 3351 3352 3361  188  190  191  192 1369 1370 1371 3353 3354
+ 3355  191  192 1371 1372 1374 1375 3355 3356 3357  193 3358 3359  156  190  194
+  195 1377 1378 1380 1381 3361 3362 3409  156  187  190  194  195  202  203  205
+  206  212 1381 1382 1393 1394 1397 1398 1411 3363 3364 3373 3374 3385 3386 3393
+ 3409 3410  196 3363 3410  197 1383 3365  198  200  201  203  206 1383 1384 1387
+ 1388 3365 3366 3367 3370  199 1385 1388  198  200  203 1385 1386 1388 1391 3367
+ 3368 3371  198  201  206 1387 1388 1399 3369 3370 3379  156  195  202  203  206
+ 1389 1395 1396 3373 3374 3375 3376 3377  195  198  200  202  203  206 1388 1391
+ 1392 1394 1395 3371 3372 3373  204 3374 3375  156  195  205  206 1397 1403 1404
+ 3381 3382 3383 3384 3385  156  195  198  201  202  203  205  206 1388 1399 1400
+ 1402 1403 3379 3380 3381  207 3382 3383  208  209  211 1405 1409 1410 3390 3391
+ 3407 3408  208  209  210 1408 1409 1425 1426 3387 3388 3389 3390  209  210  213
+ 1413 1414 1416 1425 3387 3395 3396  208  211 3390 3407  195  212  214 1411 1412
+ 1415 1416 3393 3394 3395 3398  210  213 1413 1416  212  214  215 1415 1416 1417
+ 3397 3398 3399  214  215  217 1417 1418 1420 1421 3399 3400 3401 3402 3403  216
+ 3402 3403  215  217 1421 1422 1423 3403 3404 3405  187  218  219  221  226  249
+  256  257  264  265  268 1427 1503 1504 1513 1514 1525 1526 3411 3498 3501 3502
+ 3505 3506 3533 3534  218  219  221  222 1427 1428 1431 1432 3411 3412 3413 3416
+  220  221 1429 1432  187  218  219  220  221  225  226 1429 1430 1432 1439 3413
+ 3414 3423  219  222  223 1431 1432 1433 3415 3416 3417  222  223 1433 1434 1436
+ 1437 3417 3418 3419  224 3420 3421  187  221  225  226 1439 1440 1442 1443 3423
+ 3424 3471  187  218  221  225  226  234  236  237  243  249 1443 1444 1455 1456
+ 1459 1460 1473 3425 3426 3435 3436 3444 3447 3448 3455 3471 3472  227 3425 3472
+  228  229 1445 3427  228  229  231  232 1445 1446 1449 1450 3427 3428 3429 3432
+  230 1447 1450  229  231  232  234  237 1447 1448 1450 1453 3429 3430 3433  229
+  231  232  237 1449 1450 1461 3431 3432 3441  187  233  234 1451 1457 1458 3436
+ 3437 3438 3439  187  226  231  233  234  237 1453 1454 1456 1457 3433 3434 3435
+ 3436  235 3436 3437  226  236  237 1459 1465 1466 3444 3445 3446 3447  187  226
+  231  232  234  236  237 1461 1462 1464 1465 3441 3442 3443 3444  238 3444 3445
+  239  240 1467 1471 1472 3453 3469 3470  239  240  241 1470 1471 1487 1488 3449
+ 3450 3451 3452 3469  240  241  243 1475 1476 1478 1487 3449 3457 3458  242 3452
+ 3469  226  241  243  245 1473 1474 1477 1478 3455 3456 3457 3460  244 1475 1478
+  243  245  246 1477 1478 1479 3459 3460 3461  245  246  248 1479 1480 1482 1483
+ 3461 3462 3463 3464 3465  247 3464 3465  246  248 1483 1484 1485 3465 3466 3467
+  187  218  226  249  250  252  256  257  280  287  288  294  295  296  298  299
+ 1489 1565 1566 1575 1576 1587 1588 3473 3563 3564 3567 3568 3595 3596  249  250
+  252  253 1489 1490 1493 1494 3473 3474 3475 3478  251 1491 1494  249  250  252
+  256 1491 1492 1494 1501 3475 3476 3485  250  253  254 1493 1494 1495 3477 3478
+ 3479  253  254 1495 1496 1498 1499 3479 3480 3481  255 3482 3483  218  249  252
+  256  257 1501 1502 1504 1505 3485 3486 3533 3534  218  249  256  257  265  267
+  268  274 1505 1506 1517 1518 1521 1522 1535 3487 3488 3497 3498 3506 3509 3510
+ 3517 3534  258 3487 3534  259  260 1507 3489  259  260  262  263  265  268 1507
+ 1508 1511 1512 3489 3490 3491 3494  261 1509 1512  260  262  265 1509 1510 1512
+ 1515 3491 3492 3495  260  263  268 1511 1512 1523 3493 3494 3503  218  264  265
+ 1513 1519 1520 3498 3499 3500 3501  218  257  260  262  264  265  268 1512 1515
+ 1516 1518 1519 3495 3496 3497 3498  266 3498 3499  257  267  268 1521 1527 1528
+ 3506 3507 3508 3509  218  257  260  263  265  267  268 1512 1523 1524 1526 1527
+ 3503 3504 3505 3506  269 3506 3507  270  271 1529 1533 1534 3515 3531 3532  270
+  271  272 1532 1533 1549 1550 3511 3512 3513 3514 3531  271  272  274 1537 1538
+ 1540 1549 3511 3519 3520  273 3514 3531  257  272  274  276 1535 1536 1539 1540
+ 3517 3518 3519 3522  275 1537 1540  274  276  277 1539 1540 1541 3521 3522 3523
+  276  277  279 1541 1542 1544 1545 3523 3524 3525 3526 3527  278 3526 3527  277
+  279 1545 1546 1547 3527 3528 3529   63  102  156  187  249  280  281  283  287
+  288  311  318  319  324  325  326  329  330 1551 1627 1628 1637 1638 1649 1650
+ 3535 3625 3626 3629 3630 3657 3658  280  281  283  284 1551 1552 1555 1556 3535
+ 3536 3537 3540  282 1553 1556  280  281  283  287 1553 1554 1556 1563 3537 3538
+ 3547  281  284  285 1555 1556 1557 3539 3540 3541  284  285 1557 1558 1560 1561
+ 3541 3542 3543  286 3544 3545  249  280  283  287  288 1563 1564 1566 1567 3547
+ 3548 3595  249  280  287  288  294  295  296  298  305 1567 1568 1579 1580 1583
+ 1584 1597 3549 3550 3559 3560 3571 3572 3579 3595 3596  289 3549 3596  290  291
+ 1569 3551  290  291  293  294 1569 1570 1573 1574 3551 3552 3553 3556  292 1571
+ 1574  291  293  294  296 1571 1572 1574 1577 3553 3554 3557  249  288  291  293
+  294  296  298  299 1573 1574 1585 3555 3556 3565  249  288  295  296 1575 1581
+ 1582 3559 3560 3561 3562 3563  249  288  293  294  295  296 1577 1578 1580 1581
+ 3557 3558 3559  297 3560 3561  249  288  294  298  299 1583 1589 1590 3567 3568
+ 3569 3570 3571  249  294  298  299 1585 1586 1588 1589 3565 3566 3567  300 3568
+ 3569  301  302 1591 1595 1596 3577 3593 3594  301  302  303 1594 1595 1611 1612
+ 3573 3574 3575 3576 3593  302  303  305 1599 1600 1602 1611 3573 3581 3582  304
+ 3576 3593  288  303  305  307 1597 1598 1601 1602 3579 3580 3581 3584  306 1599
+ 1602  305  307  308 1601 1602 1603 3583 3584 3585  307  308  310 1603 1604 1606
+ 1607 3585 3586 3587 3588 3589  309 3588 3589  308  310 1607 1608 1609 3589 3590
+ 3591  280  311  312  314  318  319  342  349  350  355  356  357  358  360  361
+  404 1613 1689 1690 1699 1700 1711 1712 3597 3687 3688 3691 3692 3719 3720  311
+  312  314  315 1613 1614 1617 1618 3597 3598 3599 3602  313 1615 1618  311  312
+  314  318 1615 1616 1618 1625 3599 3600 3609  312  315  316 1617 1618 1619 3601
+ 3602 3603  315  316 1619 1620 1622 1623 3603 3604 3605  317 3606 3607  280  311
+  314  318  319 1625 1626 1628 1629 3609 3610 3657  280  311  318  319  324  325
+  326  327  329  330  336 1629 1630 1641 1642 1645 1646 1659 3611 3612 3621 3622
+ 3633 3634 3641 3657 3658  320 3611 3658  321  322 1631 3613  321  322  324  325
+ 1631 1632 1635 3613 3614 3615 3618  323 1633 1636  280  319  322  324  325  326
+  327 1633 1634 1635 1636 1639 3615 3616 3619  280  319  322  324  325  330 1635
+ 1636 1647 3617 3618 3627  280  319  324  326  327 1637 1643 1644 3621 3622 3623
+ 3624 3625  319  324  326  327 1639 1640 1642 1643 3619 3620 3621  328 3622 3623
+  280  319  329  330 1645 1651 1652 3629 3630 3631 3632 3633  280  319  325  329
+  330 1647 1648 1650 1651 3627 3628 3629  331 3630 3631  332  333 1653 1657 1658
+ 3639 3655 3656  332  333  334 1656 1657 1673 1674 3635 3636 3637 3638 3655  333
+  334  336 1661 1662 1664 1673 3635 3643 3644  335 3638 3655  319  334  336  338
+ 1659 1660 1663 1664 3641 3642 3643 3646  337 1661 1664  336  338  339 1663 1664
+ 1665 3645 3646 3647  338  339  341 1665 1666 1668 1669 3647 3648 3649 3650  340
+  341 3650 3651  339  340  341 1669 1670 1671 3650 3651 3652 3653  311  342  343
+  345  349  350  373  376  380  381  386  387  388  391  392  404 1675 1751 1752
+ 1761 1762 1773 1774 3659 3749 3750 3753 3754 3781 3782  342  343  345  346 1675
+ 1676 1679 1680 3659 3660 3661 3664  344 1677 1680  342  343  345  349 1677 1678
+ 1680 1687 3661 3662 3671  343  346  347 1679 1680 1681 3663 3664 3665  346  347
+ 1681 1682 1684 1685 3665 3666 3667  348 3668 3669  311  342  345  349  350 1687
+ 1688 1690 1691 3671 3672 3719  311  342  349  350  355  356  357  358  360  367
+ 1691 1692 1703 1704 1707 1708 1721 3673 3674 3683 3684 3695 3696 3703 3719 3720
+  351 3673 3720  352  353 1693 3675  352  353  355  356 1693 1694 1697 3675 3676
+ 3677 3680  354 1695 1698  311  350  353  355  356  358 1695 1696 1697 1698 1701
+ 3677 3678 3681  311  350  353  355  356  360  361 1697 1698 1709 3679 3680 3689
+  311  350  357  358 1699 1705 1706 3683 3684 3685 3686 3687  311  350  355  357
+  358 1701 1702 1704 1705 3681 3682 3683  359 3684 3685  311  350  356  360  361
+ 1707 1713 1714 3691 3692 3693 3694 3695  311  356  360  361 1709 1710 1712 1713
+ 3689 3690 3691  362 3692 3693  363  364 1715 1719 1720 3701 3717 3718  363  364
+  365 1718 1719 1735 1736 3697 3698 3699 3700 3717  364  365  367 1723 1724 1726
+ 1735 3697 3705 3706  366 3700 3717  350  365  367  369 1721 1722 1725 1726 3703
+ 3704 3705 3708  368 1723 1726  367  369  370 1725 1726 1727 3707 3708 3709  369
+  370  372 1727 1728 1730 1731 3709 3710 3711 3712 3713  371 3712 3713  370  372
+ 1731 1732 1733 3713 3714 3715  342  373  374  376  381  404  407  411  412  418
+  419  420  423 1737 1813 1814 1823 1824 1835 1836 3721 3811 3812 3815 3816 3843
+ 3844  373  374  376  377 1737 1738 1741 1742 3721 3722 3723 3726  375 1739 1742
+  342  373  374  376  380  381 1739 1740 1742 1749 3723 3724 3733  374  377  378
+ 1741 1742 1743 3725 3726 3727  377  378 1743 1744 1746 1747 3727 3728 3729  379
+ 3730 3731  342  376  380  381 1749 1750 1752 1753 3733 3734 3781 3782  342  373
+  376  380  381  386  387  388  389  391  398 1753 1754 1765 1766 1769 1770 1783
+ 3735 3736 3745 3746 3757 3758 3765 3782  382 3735 3782  383  384 1755 3737  383
+  384  386  387 1755 1756 1759 3737 3738 3739 3742  385 1757 1760  342  381  384
+  386  387  388  389 1757 1758 1759 1760 1763 3739 3740 3743  342  381  384  386
+  387  391  392 1759 1760 1771 3741 3742 3751  342  381  386  388  389 1761 1767
+ 1768 3745 3746 3747 3748 3749  381  386  388  389 1763 1764 1766 1767 3743 3744
+ 3745  390 3746 3747  342  381  387  391  392 1769 1775 1776 3754 3755 3756 3757
+  342  387  391  392 1771 1772 1774 1775 3751 3752 3753 3754  393 3754 3755  394
+  395 1777 1781 1782 3763 3779 3780  394  395  396 1780 1781 1797 1798 3759 3760
+ 3761 3762 3779  395  396  398 1785 1786 1788 1797 3759 3767 3768  397 3762 3779
+  381  396  398  400 1783 1784 1787 1788 3765 3766 3767 3770  399 1785 1788  398
+  400  401 1787 1788 1789 3769 3770 3771  400  401  403 1789 1790 1792 1793 3771
+ 3772 3773 3774 3775  402 3774 3775  401  403 1793 1794 1795 3775 3776 3777  311
+  342  373  404  405  407  435  442  443  449  450  451  453  454  497 1799 1875
+ 1876 1885 1886 1897 1898 3783 3870 3873 3874 3877 3878 3905 3906  404  405  407
+  408 1799 1800 1803 1804 3783 3784 3785 3788  406 1801 1804  373  404  405  407
+  411  412 1801 1802 1804 1811 3785 3786 3795  405  408  409 1803 1804 1805 3787
+ 3788 3789  408  409 1805 1806 1808 1809 3789 3790 3791  410 3792 3793  373  407
+  411  412 1811 1812 1814 1815 3795 3796 3843  373  407  411  412  418  419  420
+  422  423  429 1815 1816 1827 1828 1831 1832 1845 3797 3798 3807 3808 3816 3819
+ 3820 3827 3843 3844  413 3797 3844  414  415 1817 3799  414  415  417  418 1817
+ 1818 1821 3799 3800 3801 3804  416 1819 1822  415  417  418  420 1819 1820 1821
+ 1822 1825 3801 3802 3805  373  412  415  417  418  420  423 1821 1822 1833 3803
+ 3804 3813  373  412  419  420 1823 1829 1830 3807 3808 3809 3810 3811  373  412
+  417  418  419  420 1825 1826 1828 1829 3805 3806 3807  421 3808 3809  412  422
+  423 1831 1837 1838 3816 3817 3818 3819  373  412  418  422  423 1833 1834 1836
+ 1837 3813 3814 3815 3816  424 3816 3817  425  426 1839 1843 1844 3825 3841 3842
+  425  426  427 1842 1843 1859 1860 3821 3822 3823 3824 3841  426  427  429 1847
+ 1848 1850 1859 3821 3829 3830  428 3824 3841  412  427  429  431 1845 1846 1849
+ 1850 3827 3828 3829 3832  430 1847 1850  429  431  432 1849 1850 1851 3831 3832
+ 3833  431  432  434 1851 1852 1854 1855 3833 3834 3835 3836 3837  433 3836 3837
+  432  434 1855 1856 1857 3837 3838 3839  404  435  436  442  443  466  473  474
+  479  480  481  485  497 1861 1937 1938 1947 1948 1959 1960 3845 3935 3936 3939
+ 3940 3967 3968  435  436  438  439  442 1861 1862 1865 1866 3845 3846 3847 3850
+  437 1863 1866  436  438  442 1863 1864 1866 1873 3847 3848 3857  436  439  440
+ 1865 1866 1867 3849 3850 3851  439  440 1867 1868 1870 1871 3851 3852 3853  441
+ 3854 3855  404  435  436  438  442  443 1866 1873 1874 1876 1877 3857 3858 3905
+  404  435  442  443  449  451  453  454  460  497 1877 1878 1889 1890 1893 1894
+ 1907 3859 3860 3869 3870 3881 3882 3889 3905 3906  444 3859 3906  445  446 1879
+ 3861  445  446  448  449  451 1879 1880 1883 1884 3861 3862 3863 3866  447 1881
+ 1884  446  448  451 1881 1882 1884 1887 3863 3864 3867  404  443  446  449  451
+  454 1883 1884 1895 3865 3866 3875  404  450  451 1885 1891 1892 3870 3871 3872
+ 3873  404  443  446  448  449  450  451 1884 1887 1888 1890 1891 3867 3868 3869
+ 3870  452 3870 3871  404  443  453  454 1893 1899 1900 3877 3878 3879 3880 3881
+  404  443  449  453  454 1895 1896 1898 1899 3875 3876 3877  455 3878 3879  456
+  457 1901 1905 1906 3887 3903 3904  456  457  458 1904 1905 1921 1922 3883 3884
+ 3885 3886 3903  457  458  460 1909 1910 1912 1921 3883 3891 3892  459 3886 3903
+  443  458  460  462 1907 1908 1911 1912 3889 3890 3891 3894  461 1909 1912  460
+  462  463 1911 1912 1913 3893 3894 3895  462  463  465 1913 1914 1916 1917 3895
+ 3896 3897 3898  464  465 3898 3899  463  464  465 1917 1918 1919 3898 3899 3900
+ 3901  435  466  467  473  474  497  504  505  512  513  515  516 1923 1999 2000
+ 2009 2010 2021 2022 3907 3997 3998 4001 4002 4029 4030  466  467  469  470  473
+ 1923 1924 1927 1928 3907 3908 3909 3912  468 1925 1928  467  469  473 1925 1926
+ 1928 1935 3909 3910 3919  467  470  471 1927 1928 1929 3911 3912 3913  470  471
+ 1929 1930 1932 1933 3913 3914 3915  472 3916 3917  435  466  467  469  473  474
+ 1928 1935 1936 1938 1939 3919 3920 3967  435  466  473  474  479  480  481  482
+  484  485  491  497 1939 1940 1951 1952 1955 1956 1969 3921 3922 3931 3932 3940
+ 3943 3944 3951 3967 3968  475 3921 3968  476  477 1941 3923  476  477  479  480
+ 1941 1942 1945 3923 3924 3925 3928  478 1943 1946  435  474  477  479  480  481
+  482 1943 1944 1945 1946 1949 3925 3926 3929  435  474  477  479  480  485 1945
+ 1946 1957 3927 3928 3937  435  474  479  481  482 1947 1953 1954 3932 3933 3934
+ 3935  474  479  481  482 1949 1950 1952 1953 3929 3930 3931 3932  483 3932 3933
+  474  484  485 1955 1961 1962 3940 3941 3942 3943  435  474  480  484  485 1957
+ 1958 1960 1961 3937 3938 3939 3940  486 3940 3941  487  488  490 1963 1967 1968
+ 3948 3949 3965 3966  487  488  489 1966 1967 1983 1984 3945 3946 3947 3948  488
+  489 1971 1972 1974 1983 3945 3953 3954  487  490 3948 3965  474  491  493 1969
+ 1970 1973 1974 3951 3952 3953 3956  492 1971 1974  491  493  494 1973 1974 1975
+ 3955 3956 3957  493  494  496 1975 1976 1978 1979 3957 3958 3959 3960 3961  495
+ 3960 3961  494  496 1979 1980 1981 3961 3962 3963  404  435  443  466  474  497
+  498  504  535  536  541  542  543  546  547  714  776  869 1985 2061 2062 2071
+ 2072 2083 2084 3969 4059 4060 4063 4064 4091 4092  497  498  500  501  504 1985
+ 1986 1989 1990 3969 3970 3971 3974  499 1987 1990  498  500  504 1987 1988 1990
+ 1997 3971 3972 3981  498  501  502 1989 1990 1991 3973 3974 3975  501  502 1991
+ 1992 1994 1995 3975 3976 3977  503 3978 3979  466  497  498  500  504  505 1990
+ 1997 1998 2000 2001 3981 3982 4029 4030  466  504  505  512  513  515  516  522
+ 2001 2002 2013 2014 2017 2018 2031 3983 3984 3993 3994 4005 4006 4013 4030  506
+ 3983 4030  507  508 2003 3985  507  508  510  511 2003 2004 2007 3985 3986 3987
+ 3990  509 2005 2008  508  510  511  513 2005 2006 2007 2008 2011 3987 3988 3991
+  508  510  511  513  516 2007 2008 2019 3989 3990 3999  466  505  512  513 2009
+ 2015 2016 3993 3994 3995 3996 3997  466  505  510  511  512  513  516 2011 2012
+ 2014 2015 3991 3992 3993  514 3994 3995  466  505  515  516 2017 2023 2024 4002
+ 4003 4004 4005  466  505  511  513  515  516 2019 2020 2022 2023 3999 4000 4001
+ 4002  517 4002 4003  518  519 2025 2029 2030 4011 4027 4028  518  519  520 2028
+ 2029 2045 2046 4007 4008 4009 4010 4027  519  520  522 2033 2034 2036 2045 4007
+ 4015 4016  521 4010 4027  505  520  522  524 2031 2032 2035 2036 4013 4014 4015
+ 4018  523 2033 2036  522  524  525 2035 2036 2037 4017 4018 4019  524  525  527
+ 2037 2038 2040 2041 4019 4020 4021 4022 4023  526 4022 4023  525  527 2041 2042
+ 2043 4023 4024 4025  528  529  531  535  559  566  567  573  574  577  578 2047
+ 2123 2124 2133 2134 2145 2146 4031 4121 4122 4125 4126 4153 4154  528  529  531
+  532 2047 2048 2051 2052 4031 4032 4033 4036  530 2049 2052  528  529  531  535
+ 2049 2050 2052 2059 4033 4034 4043  529  532  533 2051 2052 2053 4035 4036 4037
+  532  533 2053 2054 2056 2057 4037 4038 4039  534 4040 4041  497  528  531  535
+  536  559  590  621  652  655  686  714 2059 2060 2062 2063 4043 4044 4091 4092
+  497  535  536  541  542  543  544  546  553  714  776  869 2063 2064 2075 2076
+ 2079 2080 2093 4045 4046 4055 4056 4067 4068 4075 4092  537 4045 4092  538  539
+ 2065 4047  538  539  541  542 2065 2066 2069 2070 4047 4048 4049 4052  540 2067
+ 2070  497  536  539  541  542  543  544 2067 2068 2070 2073 4049 4050 4053  497
+  536  539  541  542  546  547 2069 2070 2081 4051 4052 4061  497  536  541  543
+  544 2071 2077 2078 4056 4057 4058 4059  536  541  543  544 2073 2074 2076 2077
+ 4053 4054 4055 4056  545 4056 4057  497  536  542  546  547 2079 2085 2086 4064
+ 4065 4066 4067  497  542  546  547 2081 2082 2084 2085 4061 4062 4063 4064  548
+ 4064 4065  549  550 2087 2091 2092 4073 4089 4090  549  550  551 2090 2091 2107
+ 2108 4069 4070 4071 4072 4089  550  551  553 2095 2096 2098 2107 4069 4077 4078
+  552 4072 4089  536  551  553  555 2093 2094 2097 2098 4075 4076 4077 4080  554
+ 2095 2098  553  555  556 2097 2098 2099 4079 4080 4081  555  556  558 2099 2100
+ 2102 2103 4081 4082 4083 4084 4085  557 4084 4085  556  558 2103 2104 2105 4085
+ 4086 4087  528  535  559  560  566  567  590  597  598  604  605  606  608  609
+ 2109 2185 2186 2195 2196 2207 2208 4093 4180 4183 4184 4187 4188 4215 4216  559
+  560  562  563  566 2109 2110 2113 2114 4093 4094 4095 4098  561 2111 2114  560
+  562  566 2111 2112 2114 2121 4095 4096 4105  560  563  564 2113 2114 2115 4097
+ 4098 4099  563  564 2115 2116 2118 2119 4099 4100 4101  565 4102 4103  528  559
+  560  562  566  567 2114 2121 2122 2124 2125 4105 4106 4153  528  559  566  567
+  572  573  574  575  577  584 2125 2126 2137 2138 2141 2142 2155 4107 4108 4117
+ 4118 4129 4130 4137 4153 4154  568 4107 4154  569  570 2127 4109  569  570  572
+  573 2127 2128 2131 4109 4110 4111 4114  571 2129 2132  567  570  572  573  574
+  575 2129 2130 2131 2132 2135 4111 4112 4115  528  567  570  572  573  574  577
+  578 2131 2132 2143 4113 4114 4123  528  567  572  573  574  575 2133 2139 2140
+ 4117 4118 4119 4120 4121  567  572  574  575 2135 2136 2138 2139 4115 4116 4117
+  576 4118 4119  528  567  573  577  578 2141 2147 2148 4125 4126 4127 4128 4129
+  528  573  577  578 2143 2144 2146 2147 4123 4124 4125  579 4126 4127  580  581
+ 2149 2153 2154 4135 4151 4152  580  581  582 2152 2153 2169 2170 4131 4132 4133
+ 4134 4151  581  582  584 2157 2158 2160 2169 4131 4139 4140  583 4134 4151  567
+  582  584  586 2155 2156 2159 2160 4137 4138 4139 4142  585 2157 2160  584  586
+  587 2159 2160 2161 4141 4142 4143  586  587  589 2161 2162 2164 2165 4143 4144
+ 4145 4146 4147  588 4146 4147  587  589 2165 2166 2167 4147 4148 4149  535  559
+  590  591  597  598  621  628  629  636  637  639  640 2171 2247 2248 2257 2258
+ 2269 2270 4155 4245 4246 4249 4250 4277 4278  590  591  593  594  597 2171 2172
+ 2175 2176 4155 4156 4157 4160  592 2173 2176  591  593  597 2173 2174 2176 2183
+ 4157 4158 4167  591  594  595 2175 2176 2177 4159 4160 4161  594  595 2177 2178
+ 2180 2181 4161 4162 4163  596 4164 4165  559  590  591  593  597  598 2183 2184
+ 2186 2187 4167 4168 4215 4216  559  590  597  598  604  606  608  609  615 2187
+ 2188 2199 2200 2203 2204 2217 4169 4170 4179 4180 4191 4192 4199 4216  599 4169
+ 4216  600  601 2189 4171  600  601  603  604 2189 2190 2193 4171 4172 4173 4176
+  602 2191 2194  601  603  604  606 2191 2192 2193 2194 2197 4173 4174 4177  559
+  598  601  603  604  606  609 2193 2194 2205 4175 4176 4185  559  605  606 2195
+ 2201 2202 4180 4181 4182 4183  559  598  603  604  605  606 2197 2198 2200 2201
+ 4177 4178 4179 4180  607 4180 4181  559  598  608  609 2203 2209 2210 4188 4189
+ 4190 4191  559  598  604  608  609 2205 2206 2208 2209 4185 4186 4187 4188  610
+ 4188 4189  611  612 2211 2215 2216 4197 4213 4214  611  612  613  614 2214 2215
+ 2231 2232 4193 4194 4195 4213  612  613  615 2219 2220 2222 2231 4193 4201 4202
+  612  614 4195 4196 4213  598  613  615  617 2217 2218 2221 2222 4199 4200 4201
+ 4204  616 2219 2222  615  617  618 2221 2222 2223 4203 4204 4205  617  618  619
+  620 2223 2224 2226 2227 4205 4206 4207 4209  618  619 4207 4208 4209  618  620
+ 2227 2228 2229 4209 4210 4211  535  590  621  622  628  629  655  659  660  667
+  668  670  671 2233 2309 2310 2319 2320 2331 2332 4217 4307 4308 4311 4312 4339
+ 4340  621  622  624  625  628 2233 2234 2237 4217 4218 4219 4222  623 2235 2238
+  622  624  625  628 2235 2236 2237 2238 2245 4219 4220 4229  622  624  625  626
+ 2237 2238 2239 4221 4222 4223  625  626 2239 2240 2242 2243 4223 4224 4225  627
+ 4226 4227  590  621  622  624  628  629 2245 2246 2248 2249 4229 4230 4277 4278
+  590  621  628  629  636  637  639  646 2249 2250 2261 2262 2265 2266 2279 4231
+ 4232 4241 4242 4253 4254 4261 4278  630 4231 4278  631 2251 4233  632  634  635
+ 2251 2252 2255 4233 4234 4235 4238  633 2253 2256  632  634  635  637  640 2253
+ 2254 2255 2256 2259 4235 4236 4239  632  634  635  640 2255 2256 2267 4237 4238
+ 4247  590  629  636  637  638 2257 2263 2264 4241 4242 4243 4244 4245  590  629
+  634  636  637  639  640 2259 2260 2262 2263 4239 4240 4241  636  638 4242 4243
+  590  629  637  639  640  641 2265 2271 2272 4250 4251 4252 4253  590  634  635
+  637  639  640 2267 2268 2270 2271 4247 4248 4249 4250  639  641 4250 4251  642
+  645 2273 2277 2278 4258 4259 4275 4276  643  644 2276 2277 2293 2294 4255 4256
+ 4257 4258  643  644  646 2281 2282 2284 2293 4255 4263 4264  642  645 4258 4275
+  629  644  646  648 2279 2280 2283 2284 4261 4262 4263 4266  647 2281 2284  646
+  648  649 2283 2284 2285 4265 4266 4267  648  649  650 2285 2286 2288 2289 4267
+ 4268 4269 4271  649  650 4269 4270 4271  651 2289 2290 2291 4271 4272 4273  535
+  652  653  655  686  690  691  698  701  702 2295 2371 2372 2381 2382 2393 2394
+ 4279 4369 4370 4373 4374 4401 4402  652  653  655  656 2295 2296 2299 4279 4280
+ 4281 4284  654 2297 2300  535  621  652  653  655  656  659  660 2297 2298 2299
+ 2300 2307 4281 4282 4291  653  655  656  657 2299 2300 2301 4283 4284 4285  656
+  657 2301 2302 2304 2305 4285 4286 4287  658 4288 4289  621  655  659  660 2307
+ 2308 2310 2311 4291 4292 4339  621  655  659  660  667  668  670  677 2311 2312
+ 2323 2324 2327 2328 2341 4293 4294 4303 4304 4315 4316 4323 4339 4340  661 4293
+ 4340  662 2313 4295  663  665  666 2313 2314 2317 4295 4296 4297 4300  664 2315
+ 2318  663  665  666  668  671 2315 2316 2317 2318 2321 4297 4298 4301  663  665
+  666  671 2317 2318 2329 4299 4300 4309  621  660  667  668  669 2319 2325 2326
+ 4303 4304 4305 4306 4307  621  660  665  667  668  670  671 2321 2322 2324 2325
+ 4301 4302 4303  667  669 4304 4305  621  660  668  670  671  672 2327 2333 2334
+ 4312 4313 4314 4315  621  665  666  668  670  671 2329 2330 2332 2333 4309 4310
+ 4311 4312  670  672 4312 4313  673  674 2335 2339 2340 4321 4337 4338  673  674
+  675  676 2338 2339 2355 2356 4317 4318 4319 4337  674  675  677 2343 2344 2346
+ 2355 4317 4325 4326  674  676 4319 4320 4337  660  675  677  679 2341 2342 2345
+ 2346 4323 4324 4325 4328  678 2343 2346  677  679  680 2345 2346 2347 4327 4328
+ 4329  679  680  681  682 2347 2348 2350 2351 4329 4330 4331 4333  680  681 4331
+ 4332 4333  680  682 2351 2352 2353 4333 4334 4335  683  684  686  717  721  722
+  729  730  732  733 2357 2433 2434 2443 2444 2455 2456 4341 4431 4432 4435 4436
+ 4463 4464  683  684  686  687 2357 2358 2361 4341 4342 4343 4346  685 2359 2362
+  535  652  683  684  686  687  690  691  714  715  717 2359 2360 2361 2362 2369
+ 4343 4344 4353  684  686  687 2361 2362 2363 4345 4346 4347  688 2363 2364 2366
+ 2367 4347 4348 4349  689 4350 4351  652  686  690  691 2369 2370 2372 2373 4353
+ 4354 4401  652  686  690  691  698  699  701  702  708 2373 2374 2385 2386 2389
+ 2390 2403 4355 4356 4365 4366 4377 4378 4385 4401 4402  692 4355 4402  693 2375
+ 4357  694  696  697  699  702 2375 2376 2379 2380 4357 4358 4359 4362  695 2377
+ 2380  694  696  699 2377 2378 2380 2383 4359 4360 4363  694  697  702 2379 2380
+ 2391 4361 4362 4371  652  691  698  699  700  702 2381 2387 2388 4366 4367 4368
+ 4369  691  694  696  698  699  702 2380 2383 2384 2386 2387 4363 4364 4365 4366
+  698  700 4366 4367  652  691  701  702  703 2389 2395 2396 4374 4375 4376 4377
+  652  691  694  697  698  699  701  702 2380 2391 2392 2394 2395 4371 4372 4373
+ 4374  701  703 4374 4375  704 2397 2401 2402 4383 4399 4400  705  706  707 2400
+ 2401 2417 2418 4379 4380 4381 4399  705  706  708 2405 2406 2408 2417 4379 4387
+ 4388  705  707 4381 4382 4399  691  706  708  710 2403 2404 2407 2408 4385 4386
+ 4387 4390  709 2405 2408  708  710  711 2407 2408 2409 4389 4390 4391  710  711
+ 2409 2410 2412 2413 4391 4392 4393 4394  712  713 4394 4395  712  713 2413 2414
+ 2415 4394 4395 4396 4397  497  535  536  686  714  715  745  748  752  753  760
+  761  763  764  776 2419 2495 2496 2505 2506 2517 2518 4403 4493 4494 4497 4498
+ 4525 4526  686  714  715  717  718 2419 2420 2423 4403 4404 4405 4408  716 2421
+ 2424  683  686  715  717  718  721  722 2421 2422 2423 2424 2431 4405 4406 4415
+  715  717  718  719 2423 2424 2425 4407 4408 4409  718  719 2425 2426 2428 2429
+ 4409 4410 4411  720 4412 4413  683  717  721  722 2431 2432 2434 2435 4415 4416
+ 4463 4464  683  717  721  722  729  730  732  733  739 2435 2436 2447 2448 2451
+ 2452 2465 4417 4418 4427 4428 4439 4440 4447 4464  723 4417 4464  724 2437 4419
+  725  727  728 2437 2438 2441 2442 4419 4420 4421 4424  726 2439 2442  725  727
+  728  730  733 2439 2440 2442 2445 4421 4422 4425  725  727  728  733 2441 2442
+ 2453 4423 4424 4433  683  722  729  730  731 2443 2449 2450 4427 4428 4429 4430
+ 4431  683  722  727  729  730  733 2445 2446 2448 2449 4425 4426 4427  729  731
+ 4428 4429  683  722  732  733  734 2451 2457 2458 4436 4437 4438 4439  683  722
+  727  728  730  732  733 2453 2454 2456 2457 4433 4434 4435 4436  732  734 4436
+ 4437  735  736 2459 2463 2464 4445 4461 4462  735  736  737  738 2462 2463 2479
+ 2480 4441 4442 4443 4461  736  737  739 2467 2468 2470 2479 4441 4449 4450  736
+  738 4443 4444 4461  722  737  739  741 2465 2466 2469 2470 4447 4448 4449 4452
+  740 2467 2470  739  741  742 2469 2470 2471 4451 4452 4453  741  742  743  744
+ 2471 2472 2474 2475 4453 4454 4455 4457  742  743 4455 4456 4457  742  744 2475
+ 2476 2477 4457 4458 4459  714  745  746  748  753  776  783  784  791  792  795
+ 2481 2557 2558 2567 2568 2579 2580 4465 4552 4555 4556 4559 4560 4587 4588  745
+  746  748  749 2481 2482 2485 4465 4466 4467 4470  747 2483 2486  714  745  746
+  748  749  752  753 2483 2484 2485 2486 2493 4467 4468 4477  746  748  749  750
+ 2485 2486 2487 4469 4470 4471  749  750 2487 2488 2490 2491 4471 4472 4473  751
+ 4474 4475  714  748  752  753 2493 2494 2496 2497 4477 4478 4525  714  745  748
+  752  753  760  761  763  770 2497 2498 2509 2510 2513 2514 2527 4479 4480 4489
+ 4490 4501 4502 4509 4525 4526  754 4479 4526  755 2499 4481  756  758  759 2499
+ 2500 2503 4481 4482 4483 4486  757 2501 2504  756  758  759  761  764 2501 2502
+ 2503 2504 2507 4483 4484 4487  756  758  759  764 2503 2504 2515 4485 4486 4495
+  714  753  760  761  762 2505 2511 2512 4490 4491 4492 4493  714  753  758  760
+  761  763  764 2507 2508 2510 2511 4487 4488 4489 4490  760  762 4490 4491  714
+  753  761  763  764 2513 2519 2520 4497 4498 4499 4500 4501  714  758  759  761
+  763  764 2515 2516 2518 2519 4495 4496 4497  765 4498 4499  766 2521 2525 2526
+ 4507 4523 4524  767  769 2524 2525 2541 2542 4503 4504 4505 4523  768  770 2529
+ 2530 2532 2541 4503 4511 4512  767  769 4505 4506 4523  753  768  770  772 2527
+ 2528 2531 2532 4509 4510 4511 4514  771 2529 2532  770  772  773 2531 2532 2533
+ 4513 4514 4515  772  773  774 2533 2534 2536 2537 4515 4516 4517 4519  773  774
+ 4517 4518 4519  775 2537 2538 2539 4519 4520 4521  497  536  714  745  776  777
+  783  784  807  814  815  822  823  826  869 2543 2619 2620 2629 2630 2641 2642
+ 4527 4614 4617 4618 4621 4622 4649 4650  776  777  779  780  783 2543 2544 2547
+ 2548 4527 4528 4529 4532  778 2545 2548  777  779  783 2545 2546 2548 2555 4529
+ 4530 4539  777  780  781 2547 2548 2549 4531 4532 4533  780  781 2549 2550 2552
+ 2553 4533 4534 4535  782 4536 4537  745  776  777  779  783  784 2548 2555 2556
+ 2558 2559 4539 4540 4587 4588  745  776  783  784  792  794  795  801 2559 2560
+ 2571 2572 2575 2576 2589 4541 4542 4551 4552 4560 4563 4564 4571 4588  785 4541
+ 4588  786 2561 4543  787  789  790 2561 2562 2565 4543 4544 4545 4548  788 2563
+ 2566  787  789  790  792  795 2563 2564 2565 2566 2569 4545 4546 4549  787  789
+  790  795 2565 2566 2577 4547 4548 4557  745  791  792  793 2567 2573 2574 4552
+ 4553 4554 4555  745  784  789  791  792  795 2569 2570 2572 2573 4549 4550 4551
+ 4552  791  793 4552 4553  784  794  795  796 2575 2581 2582 4560 4561 4562 4563
+  745  784  789  790  792  794  795 2577 2578 2580 2581 4557 4558 4559 4560  794
+  796 4560 4561  797  798  800 2583 2587 2588 4567 4568 4569 4585 4586  797  798
+  799 2586 2587 2603 2604 4565 4566 4567  798  799  801 2591 2592 2594 2603 4565
+ 4573 4574  797  800 4568 4585  784  799  801  803 2589 2590 2593 2594 4571 4572
+ 4573 4576  802 2591 2594  801  803  804 2593 2594 2595 4575 4576 4577  803  804
+  805  806 2595 2596 2598 2599 4577 4578 4579  804  805  806 4579 4580 4581  804
+  805  806 2599 2600 2601 4581 4582 4583  776  807  808  814  815  838  845  846
+  852  853  854  857  869 2605 2681 2682 2691 2692 2703 2704 4589 4676 4679 4680
+ 4683 4684 4711 4712  807  808  810  811  814 2605 2606 2609 2610 4589 4590 4591
+ 4594  809 2607 2610  808  810  814 2607 2608 2610 2617 4591 4592 4601  808  811
+  812 2609 2610 2611 4593 4594 4595  811  812 2611 2612 2614 2615 4595 4596 4597
+  813 4598 4599  776  807  808  810  814  815 2610 2617 2618 2620 2621 4601 4602
+ 4649 4650  776  807  814  815  823  825  826  832 2621 2622 2633 2634 2637 2638
+ 2651 4603 4604 4613 4614 4622 4625 4626 4633 4650  816 4603 4650  817 2623 4605
+  818  820  821 2623 2624 2627 4605 4606 4607 4610  819 2625 2628  818  820  821
+  823  826 2625 2626 2627 2628 2631 4607 4608 4611  818  820  821  826 2627 2628
+ 2639 4609 4610 4619  776  822  823 2629 2635 2636 4614 4615 4616 4617  776  815
+  820  822  823  826 2631 2632 2634 2635 4611 4612 4613 4614  824 4614 4615  815
+  825  826 2637 2643 2644 4622 4623 4624 4625  776  815  820  821  823  825  826
+ 2639 2640 2642 2643 4619 4620 4621 4622  827 4622 4623  828  829  831 2645 2649
+ 2650 4631 4647 4648  828  829  830  831 2648 2649 2665 2666 4627 4628 4629  829
+  830  832 2653 2654 2656 2665 4627 4635 4636  828  829  831 4629 4630 4647  815
+  830  832  834 2651 2652 2655 2656 4633 4634 4635 4638  833 2653 2656  832  834
+  835 2655 2656 2657 4637 4638 4639  834  835  836  837 2657 2658 2660 2661 4639
+ 4640 4641  835  836  837 4641 4642 4643  835  836  837 2661 2662 2663 4643 4644
+ 4645  807  838  839  845  846  869  876  877  884  885  887  888 2667 2743 2744
+ 2753 2754 2765 2766 4651 4738 4741 4742 4745 4746 4773 4774  838  839  841  842
+  845 2667 2668 2671 2672 4651 4652 4653 4656  840 2669 2672  839  841  845 2669
+ 2670 2672 2679 4653 4654 4663  839  842  843 2671 2672 2673 4655 4656 4657  842
+  843 2673 2674 2676 2677 4657 4658 4659  844 4660 4661  807  838  839  841  845
+  846 2672 2679 2680 2682 2683 4663 4664 4711 4712  807  838  845  846  847  852
+  854  856  857  863  869 2683 2684 2695 2696 2699 2700 2713 4665 4666 4675 4676
+ 4684 4687 4688 4695 4712  846  847 4665 4712  848  849 2685 4667  848  849  851
+  852  854 2685 2686 2689 2690 4667 4668 4669 4672  850 2687 2690  849  851  854
+ 2687 2688 2690 2693 4669 4670 4673  807  846  849  852  854  857 2689 2690 2701
+ 4671 4672 4681  807  853  854 2691 2697 2698 4676 4677 4678 4679  807  846  849
+  851  852  853  854 2690 2693 2694 2696 2697 4673 4674 4675 4676  855 4676 4677
+  846  856  857  858 2699 2705 2706 4684 4685 4686 4687  807  846  852  856  857
+ 2701 2702 2704 2705 4681 4682 4683 4684  856  858 4684 4685  859  860  862 2707
+ 2711 2712 4693 4709 4710  859  860  861  862 2710 2711 2727 2728 4689 4690 4691
+  860  861  863 2715 2716 2718 2727 4689 4697 4698  859  860  862 4691 4692 4709
+  846  861  863  865 2713 2714 2717 2718 4695 4696 4697 4700  864 2715 2718  863
+  865  866 2717 2718 2719 4699 4700 4701  865  866  868 2719 2720 2722 2723 4701
+ 4702 4703  867  868 4704 4705  866  867  868 2723 2724 2725 4703 4704 4705 4706
+ 4707  497  536  776  807  838  846  869  870  872  876  900  907  908  915  916
+  919 2729 2805 2806 2815 2816 2827 2828 4713 4800 4803 4804 4807 4808 4835 4836
+  869  870  872  873 2729 2730 2733 2734 4713 4714 4715 4718  871 2731 2734  869
+  870  872  876 2731 2732 2734 2741 4715 4716 4725  870  873  874 2733 2734 2735
+ 4717 4718 4719  873  874 2735 2736 2738 2739 4719 4720 4721  875 4722 4723  838
+  869  872  876  877 2741 2742 2744 2745 4725 4726 4773 4774  838  876  877  885
+  887  888  894 2745 2746 2757 2758 2761 2762 2775 4727 4728 4737 4738 4749 4750
+ 4757 4774  878 4727 4774  879  880 2747 4729  879  880  882  883 2747 2748 2751
+ 4729 4730 4731 4734  881 2749 2752  880  882  883  885 2749 2750 2751 2752 2755
+ 4731 4732 4735  880  882  883  885  888 2751 2752 2763 4733 4734 4743  838  884
+  885 2753 2759 2760 4738 4739 4740 4741  838  877  882  883  884  885  888 2755
+ 2756 2758 2759 4735 4736 4737 4738  886 4738 4739  838  877  887  888 2761 2767
+ 2768 4746 4747 4748 4749  838  877  883  885  887  888 2763 2764 2766 2767 4743
+ 4744 4745 4746  889 4746 4747  890  891  893 2769 2773 2774 4755 4771 4772  890
+  891  892  893 2772 2773 2789 2790 4751 4752 4753  891  892  894 2777 2778 2780
+ 2789 4751 4759 4760  890  891  893 4753 4754 4771  877  892  894  896 2775 2776
+ 2779 2780 4757 4758 4759 4762  895 2777 2780  894  896  897 2779 2780 2781 4761
+ 4762 4763  896  897  898  899 2781 2782 2784 2785 4763 4764 4765  897  898  899
+ 4765 4766 4767  897  898  899 2785 2786 2787 4767 4768 4769  869  900  901  907
+  908  934  938  939  946  947  950 2791 2867 2868 2877 2878 2889 2890 4775 4862
+ 4865 4866 4869 4870 4897 4898  900  901  903  904  907 2791 2792 2795 2796 4775
+ 4776 4777 4780  902 2793 2796  901  903  907 2793 2794 2796 2803 4777 4778 4787
+  901  904  905 2795 2796 2797 4779 4780 4781  904  905 2797 2798 2800 2801 4781
+ 4782 4783  906 4784 4785  869  900  901  903  907  908 2796 2803 2804 2806 2807
+ 4787 4788 4835  869  900  907  908  909  916  918  919  925 2807 2808 2819 2820
+ 2823 2824 2837 4789 4790 4799 4800 4808 4811 4812 4819 4835 4836  908  909 4789
+ 4836  910  911 2809 4791  910  911  913  914 2809 2810 2813 4791 4792 4793 4796
+  912 2811 2814  911  913  914  916 2811 2812 2813 2814 2817 4793 4794 4797  911
+  913  914  916  919 2813 2814 2825 4795 4796 4805  869  915  916 2815 2821 2822
+ 4800 4801 4802 4803  869  908  913  914  915  916  919 2817 2818 2820 2821 4797
+ 4798 4799 4800  917 4800 4801  908  918  919  920 2823 2829 2830 4808 4809 4810
+ 4811  869  908  914  916  918  919 2825 2826 2828 2829 4805 4806 4807 4808  918
+  920 4808 4809  921  922  924 2831 2835 2836 4817 4833 4834  921  922  923  924
+ 2834 2835 2851 2852 4813 4814 4815  922  923  925 2839 2840 2842 2851 4813 4821
+ 4822  921  922  924 4815 4816 4833  908  923  925  927 2837 2838 2841 2842 4819
+ 4820 4821 4824  926 2839 2842  925  927  928 2841 2842 2843 4823 4824 4825  927
+  928  930 2843 2844 2846 2847 4825 4826 4827  929  930 4828 4829  928  929  930
+ 2847 2848 2849 4827 4828 4829 4830 4831  931  932  969  970  975  977  980  981
+ 2853 2929 2930 2939 2940 2951 2952 4837 4927 4928 4931 4932 4959 4960  931  932
+  934  935 2853 2854 2857 2858 4837 4838 4839 4842  933 2855 2858  900  932  934
+  938  939 2855 2856 2858 2865 4839 4840 4849  932  935  936 2857 2858 2859 4841
+ 4842 4843  935  936 2859 2860 2862 2863 4843 4844 4845  937 4846 4847  900  934
+  938  939 2865 2866 2868 2869 4849 4850 4897  900  934  938  939  940  947  949
+  950  956 2869 2870 2881 2882 2885 2886 2899 4851 4852 4861 4862 4870 4873 4874
+ 4881 4897 4898  939  940 4851 4898  941  942 2871 4853  941  942  944  945 2871
+ 2872 2875 4853 4854 4855 4858  943 2873 2876  942  944  945  947  950 2873 2874
+ 2875 2876 2879 4855 4856 4859  942  944  945  950 2875 2876 2887 4857 4858 4867
+  900  946  947 2877 2883 2884 4862 4863 4864 4865  900  939  944  946  947  950
+ 2879 2880 2882 2883 4859 4860 4861 4862  948 4862 4863  939  949  950  951 2885
+ 2891 2892 4870 4871 4872 4873  900  939  944  945  947  949  950 2887 2888 2890
+ 2891 4867 4868 4869 4870  949  951 4870 4871  952  953  955 2893 2897 2898 4879
+ 4895 4896  952  953  954  955 2896 2897 2913 2914 4875 4876 4877  953  954  956
+ 2901 2902 2904 2913 4875 4883 4884  952  953  955 4877 4878 4895  939  954  956
+  958 2899 2900 2903 2904 4881 4882 4883 4886  957 2901 2904  956  958  959 2903
+ 2904 2905 4885 4886 4887  958  959  961 2905 2906 2908 2909 4887 4888 4889  960
+  961 4890 4891  959  960  961 2909 2910 2911 4889 4890 4891 4892 4893  962 2915
+ 4899  963  965  966 2915 2916 2919 4899 4900 4901 4904  964 2917 2920  963  965
+  966  969 2917 2918 2919 2920 2927 4901 4902 4911  963  965  966  967 2919 2920
+ 2921 4903 4904 4905  966  967 2921 2922 2924 2925 4905 4906 4907  968 4908 4909
+  931  965  969  970 2927 2928 2930 2931 4911 4912 4959  931  969  970  975  977
+  978  980  981  987 2931 2932 2943 2944 2947 2948 2961 4913 4914 4923 4924 4935
+ 4936 4943 4959 4960  971 4913 4960  972 2933 4915  973  975  976 2933 2934 2937
+ 4915 4916 4917 4920  974 2935 2938  931  970  973  975  976  977  978  981 2935
+ 2936 2937 2938 2941 4917 4918 4921  973  975  976  981 2937 2938 2949 4919 4920
+ 4929  931  970  975  977  978  979 2939 2945 2946 4923 4924 4925 4926 4927  970
+  975  977  978 2941 2942 2944 2945 4921 4922 4923  977  979 4924 4925  931  970
+  980  981  982 2947 2953 2954 4931 4932 4933 4934 4935  931  970  975  976  980
+  981 2949 2950 2952 2953 4929 4930 4931  980  982 4932 4933  983  984  986 2955
+ 2959 2960 4939 4940 4941 4957 4958  983  984  985 2958 2959 2975 2976 4937 4938
+ 4939  984  985  987 2963 2964 2966 2975 4937 4945 4946  983  986 4940 4957  970
+  985  987  989 2961 2962 2965 2966 4943 4944 4945 4948  988 2963 2966  987  989
+  990 2965 2966 2967 4947 4948 4949  989  990  991  992 2967 2968 2970 2971 4949
+ 4950 4951  990  991  992 4951 4952 4953  990  991  992 2971 2972 2973 4953 4954
+ 4955    1    2  993  994    2  993  994    3    4  995  996  998    4  995  996
+    2    5  997  998    2    3    4    5  995  997  998    5    6  999 1000 2983
+    6  999 1000 1001 1002    6 1001 1002    6 1003 1004 1003 1004    4    8 1005
+ 1006    8 1005 1006 1007 1008    8 1007 1008    8    9 1009 1010    9 1009 1010
+   11   12 1011 1012 2993   12 1011 1012   13   14 1013 1014 1016   14 1013 1014
+   12   14   15 1015 1016   13   14   15 1013 1015 1016   16 1017 1018 1017 1018
+   14   17 1019 1020   17 1019 1020    9 1021 1022    9   17 1021 1022   16   17
+ 1023 1024   16 1023 1024    9   19 1025 1026    9 1025 1026   15   20 1027 1028
+   20 1027 1028 1029 1030   20 1029 1030   19   20 1031 1032   19 1031 1032   22
+ 1033 1034 1033 1034 1035 1036   23 1035 1036   22   23 1037 1038   22 1037 1038
+    9   26 1039 1040   26 1039 1040   24   27 1041 1042 1044   24 1041 1042   26
+   28 1043 1044   24   26   27   28 1041 1043 1044   28   29 1045 1046   29 1045
+ 1046 1047 1048   29 1047 1048   29   31 1049 1050   31 1049 1050   31 1051 1052
+ 1051 1052   23   24 1053 1054   23 1053 1054   32   33 1055 1056   33 1055 1056
+   34   35 1057 1058 1060   35 1057 1058   33   35   36 1059 1060   34   35   36
+ 1057 1059 1060   36   37 1061 1062   37 1061 1062 1063 1064   37 1063 1064   37
+ 1065 1066 1065 1066   35   39 1067 1068   39 1067 1068    1 1069 1070    1   39
+ 1069 1070   39   40 1071 1072   40 1071 1072   42   43 1073 1074   43 1073 1074
+   44   45 1075 1076 1078   45 1075 1076   43   46 1077 1078   43   44   45   46
+   48 1075 1077 1078    1   47 1079 1080    1 1079 1080   45   48 1081 1082   48
+ 1081 1082   40 1083 1084   40   48 1083 1084   47   48 1085 1086   47 1085 1086
+   40   50 1087 1088   40 1087 1088   46   51 1089 1090   51 1089 1090    1 1091
+ 1092    1   51 1091 1092   50   51 1093 1094   50 1093 1094   53 1095 1096 1095
+ 1096 1097 1098   54 1097 1098   53   54 1099 1100   53 1099 1100   40   57 1101
+ 1102   57 1101 1102   55   58 1103 1104 1106   55 1103 1104   57   59 1105 1106
+   55   57   58   59 1103 1105 1106   59   60 1107 1108   60 1107 1108 1109 1110
+   60 1109 1110   60   62 1111 1112   62 1111 1112   62 1113 1114 1113 1114   54
+   55 1115 1116   54 1115 1116   63   64 1117 1118   64 1117 1118   65   66 1119
+ 1120 1122   66 1119 1120   64   67 1121 1122   64   65   66   67   70 1119 1121
+ 1122   67   68 1123 1124   68 1123 1124 1125 1126   68 1125 1126   68 1127 1128
+ 1127 1128   66   70 1129 1130   70 1129 1130   32 1131 1132   32   70 1131 1132
+   70   71 1133 1134   71 1133 1134   73   74 1135 1136   74 1135 1136   75   76
+ 1137 1138 1140   76 1137 1138   74   77 1139 1140   74   75   76   77 1137 1139
+ 1140   32   78 1141 1142   32 1141 1142   76   79 1143 1144   79 1143 1144   71
+ 1145 1146   71   79 1145 1146   78   79 1147 1148   78 1147 1148   71   81 1149
+ 1150   71 1149 1150   77   82 1151 1152   82 1151 1152   32 1153 1154   32   82
+ 1153 1154   81   82 1155 1156   81 1155 1156   84 1157 1158 1157 1158 1159 1160
+   85 1159 1160   84   85 1161 1162   84 1161 1162   71   88 1163 1164   88 1163
+ 1164   86   89 1165 1166 1168   86 1165 1166   88   90 1167 1168   86   88   89
+   90 1165 1167 1168   90   91 1169 1170   91 1169 1170 1171 1172   91 1171 1172
+   91   93 1173 1174   93 1173 1174   93 1175 1176 1175 1176   85   86 1177 1178
+   85 1177 1178   94   95 1179 1180   95 1179 1180   96   97 1181 1182 1184   97
+ 1181 1182   95   98 1183 1184   95   96   97   98 1181 1183 1184   98   99 1185
+ 1186   99 1185 1186 1187 1188   99 1187 1188   99 1189 1190 1189 1190   97  101
+ 1191 1192  101 1191 1192   63 1193 1194   63  101 1193 1194  101  102 1195 1196
+  102 1195 1196  104  105 1197 1198  105 1197 1198  106  107 1199 1200 1202  107
+ 1199 1200  105  108 1201 1202  105  106  107  108  110  113 1199 1201 1202   63
+  109 1203 1204   63 1203 1204  107  110 1205 1206  110 1205 1206  102 1207 1208
+  102  110 1207 1208  109  110 1209 1210  109 1209 1210  102  112 1211 1212  102
+ 1211 1212  108  113 1213 1214  113 1213 1214   63 1215 1216   63  113 1215 1216
+  112  113 1217 1218  112 1217 1218  115 1219 1220 1219 1220 1221 1222  116 1221
+ 1222  115  116 1223 1224  115 1223 1224  102  119 1225 1226  119 1225 1226  117
+  120 1227 1228  117 1227 1228  119  121 1229 1230  117  119  120  121 1229 1230
+  121  122 1231 1232  122 1231 1232 1233 1234  122 1233 1234  122  124 1235 1236
+  124 1235 1236  124 1237 1238 1237 1238  116  117 1239 1240  116 1239 1240  125
+  126 1241 1242  126 1241 1242  127  128 1243 1244  128 1243 1244  126  129 1245
+ 1246  126  127  128  129 1245 1246  129  130 1247 1248  130 1247 1248 1249 1250
+  130 1249 1250  130 1251 1252 1251 1252  128  132 1253 1254  132 1253 1254   94
+ 1255 1256   94  132 1255 1256  132  133 1257 1258  133 1257 1258  135  136 1259
+ 1260 3241  136 1259 1260  137  138 1261 1262 1264  138 1261 1262  136  138  139
+ 1263 1264  137  138  139 1261 1263 1264   94  140 1265 1266   94 1265 1266  138
+  141 1267 1268  141 1267 1268  133 1269 1270  133  141 1269 1270  140  141 1271
+ 1272  140 1271 1272  133  143 1273 1274  133 1273 1274  139  144 1275 1276  144
+ 1275 1276   94 1277 1278   94  144 1277 1278  143  144 1279 1280  143 1279 1280
+  146 1281 1282 1281 1282 1283 1284  147 1283 1284  146  147 1285 1286  146 1285
+ 1286  133  150 1287 1288  150 1287 1288  148  150  151 1289 1290  148 1289 1290
+  150  152 1291 1292  150  151  152 1291 1292  152  153 1293 1294  153 1293 1294
+ 1295 1296  153 1295 1296  153  155 1297 1298  155 1297 1298  155 1299 1300 1299
+ 1300  147  148 1301 1302  147 1301 1302  156  157 1303 1304  157 1303 1304  158
+  159 1305 1306  159 1305 1306  157  159  160 1307 1308  158  159  160 1307 1308
+  160  161 1309 1310  161 1309 1310 1311 1312  161 1311 1312  161 1313 1314 1313
+ 1314  159  163 1315 1316  163 1315 1316  125 1317 1318  125  163 1317 1318  163
+  164 1319 1320  164 1319 1320  166  167 1321 1322 3303  167 1321 1322  168  169
+ 1323 1324  169 1323 1324  167  170 1325 1326  167  168  169  170 1325 1326  125
+  171 1327 1328  125 1327 1328  169  172 1329 1330  172 1329 1330  164 1331 1332
+  164  172 1331 1332  171  172 1333 1334  171 1333 1334  164  174 1335 1336  164
+ 1335 1336  170  175 1337 1338  175 1337 1338  125 1339 1340  125  175 1339 1340
+  174  175 1341 1342  174 1341 1342  177 1343 1344 1343 1344 1345 1346  178 1345
+ 1346  177  178 1347 1348  177 1347 1348  164  181 1349 1350  181 1349 1350  179
+  182 1351 1352 1354  179 1351 1352  181  183 1353 1354  179  181  182  183 1351
+ 1353 1354  183  184 1355 1356  184 1355 1356 1357 1358  184 1357 1358  184  186
+ 1359 1360  186 1359 1360  186 1361 1362 1361 1362  178  179 1363 1364  178 1363
+ 1364  187  188 1365 1366  188 1365 1366  189  190 1367 1368  190 1367 1368  188
+  190  191 1369 1370  189  190  191 1369 1370  191  192 1371 1372  192 1371 1372
+ 1373 1374  192 1373 1374  192 1375 1376 1375 1376  190  194 1377 1378  194 1377
+ 1378  156 1379 1380  156  194 1379 1380  194  195 1381 1382  195 1381 1382  197
+  198 1383 1384 3365  198 1383 1384  199  200 1385 1386 1388  200 1385 1386  198
+  201 1387 1388  198  199  200  201  203  206 1385 1387 1388  156  202 1389 1390
+  156 1389 1390  200  203 1391 1392  203 1391 1392  195 1393 1394  195  203 1393
+ 1394  202  203 1395 1396  202 1395 1396  195  205 1397 1398  195 1397 1398  201
+  206 1399 1400  206 1399 1400  156 1401 1402  156  206 1401 1402  205  206 1403
+ 1404  205 1403 1404  208 1405 1406 1405 1406 1407 1408  209 1407 1408  208  209
+ 1409 1410  208 1409 1410  195  212 1411 1412  212 1411 1412  210  213 1413 1414
+  210 1413 1414  212  214 1415 1416  210  212  213  214 1415 1416 3395  214  215
+ 1417 1418  215 1417 1418 1419 1420  215 1419 1420  215  217 1421 1422  217 1421
+ 1422  217 1423 1424 1423 1424  209  210 1425 1426  209 1425 1426  218  219 1427
+ 1428  219 1427 1428  220  221 1429 1430  221 1429 1430  219  222 1431 1432  219
+  220  221  222 1431 1432  222  223 1433 1434  223 1433 1434 1435 1436  223 1435
+ 1436  223 1437 1438 1437 1438  221  225 1439 1440  225 1439 1440  187 1441 1442
+  187  225 1441 1442  225  226 1443 1444  226 1443 1444  228  229 1445 1446  229
+ 1445 1446  230  231 1447 1448 1450  231 1447 1448  229  232 1449 1450  229  230
+  231  232 1447 1449 1450  187  233 1451 1452  187 1451 1452  231  234 1453 1454
+  234 1453 1454  226 1455 1456  226  234 1455 1456  233  234 1457 1458  233 1457
+ 1458  226  236 1459 1460  226 1459 1460  232  237 1461 1462  237 1461 1462  187
+ 1463 1464  187  237 1463 1464  236  237 1465 1466  236 1465 1466  239 1467 1468
+ 1467 1468 1469 1470  240 1469 1470  239  240 1471 1472  239 1471 1472  226  243
+ 1473 1474  243 1473 1474  241  244 1475 1476 1478  241 1475 1476  243  245 1477
+ 1478  241  243  244  245 1475 1477 1478  245  246 1479 1480  246 1479 1480 1481
+ 1482  246 1481 1482  246  248 1483 1484  248 1483 1484  248 1485 1486 1485 1486
+  240  241 1487 1488  240 1487 1488  249  250 1489 1490  250 1489 1490  251  252
+ 1491 1492 1494  252 1491 1492  250  253 1493 1494  250  251  252  253 1491 1493
+ 1494  253  254 1495 1496  254 1495 1496 1497 1498  254 1497 1498  254 1499 1500
+ 1499 1500  252  256 1501 1502  256 1501 1502  218 1503 1504  218  256 1503 1504
+  256  257 1505 1506  257 1505 1506  259  260 1507 1508  260 1507 1508  261  262
+ 1509 1510 1512  262 1509 1510  260  263 1511 1512  260  261  262  263  265  268
+ 1509 1511 1512  218  264 1513 1514  218 1513 1514  262  265 1515 1516  265 1515
+ 1516  257 1517 1518  257  265 1517 1518  264  265 1519 1520  264 1519 1520  257
+  267 1521 1522  257 1521 1522  263  268 1523 1524  268 1523 1524  218 1525 1526
+  218  268 1525 1526  267  268 1527 1528  267 1527 1528  270 1529 1530 1529 1530
+ 1531 1532  271 1531 1532  270  271 1533 1534  270 1533 1534  257  274 1535 1536
+  274 1535 1536  272  275 1537 1538 1540  272 1537 1538  274  276 1539 1540  272
+  274  275  276 1537 1539 1540  276  277 1541 1542  277 1541 1542 1543 1544  277
+ 1543 1544  277  279 1545 1546  279 1545 1546  279 1547 1548 1547 1548  271  272
+ 1549 1550  271 1549 1550  280  281 1551 1552  281 1551 1552  282  283 1553 1554
+ 1556  283 1553 1554  281  284 1555 1556  281  282  283  284 1553 1555 1556  284
+  285 1557 1558  285 1557 1558 1559 1560  285 1559 1560  285 1561 1562 1561 1562
+  283  287 1563 1564  287 1563 1564  249 1565 1566  249  287 1565 1566  287  288
+ 1567 1568  288 1567 1568  290  291 1569 1570  291 1569 1570  292  293 1571 1572
+ 1574  293 1571 1572  291  294 1573 1574  291  292  293  294 1571 1573 1574  249
+  295 1575 1576  249 1575 1576  293  296 1577 1578  296 1577 1578  288 1579 1580
+  288  296 1579 1580  295  296 1581 1582  295 1581 1582  288  298 1583 1584  288
+ 1583 1584  294  299 1585 1586  299 1585 1586  249 1587 1588  249  299 1587 1588
+  298  299 1589 1590  298 1589 1590  301 1591 1592 1591 1592 1593 1594  302 1593
+ 1594  301  302 1595 1596  301 1595 1596  288  305 1597 1598  305 1597 1598  303
+  306 1599 1600 1602  303 1599 1600  305  307 1601 1602  303  305  306  307 1599
+ 1601 1602  307  308 1603 1604  308 1603 1604 1605 1606  308 1605 1606  308  310
+ 1607 1608  310 1607 1608  310 1609 1610 1609 1610  302  303 1611 1612  302 1611
+ 1612  311  312 1613 1614  312 1613 1614  313  314 1615 1616 1618  314 1615 1616
+  312  315 1617 1618  312  313  314  315 1615 1617 1618  315  316 1619 1620  316
+ 1619 1620 1621 1622  316 1621 1622  316 1623 1624 1623 1624  314  318 1625 1626
+  318 1625 1626  280 1627 1628  280  318 1627 1628  318  319 1629 1630  319 1629
+ 1630  321  322 1631 1632  322 1631 1632  323  324 1633 1634 1636  324 1633 1634
+  322  324  325 1635 1636  323  324  325 1633 1635 1636  280  326 1637 1638  280
+ 1637 1638  324  327 1639 1640  327 1639 1640  319 1641 1642  319  327 1641 1642
+  326  327 1643 1644  326 1643 1644  319  329 1645 1646  319 1645 1646  325  330
+ 1647 1648  330 1647 1648  280 1649 1650  280  330 1649 1650  329  330 1651 1652
+  329 1651 1652  332 1653 1654 1653 1654 1655 1656  333 1655 1656  332  333 1657
+ 1658  332 1657 1658  319  336 1659 1660  336 1659 1660  334  337 1661 1662 1664
+  334 1661 1662  336  338 1663 1664  334  336  337  338 1661 1663 1664  338  339
+ 1665 1666  339 1665 1666 1667 1668  339 1667 1668  339  341 1669 1670  341 1669
+ 1670  341 1671 1672 1671 1672  333  334 1673 1674  333 1673 1674  342  343 1675
+ 1676  343 1675 1676  344  345 1677 1678 1680  345 1677 1678  343  346 1679 1680
+  343  344  345  346 1677 1679 1680  346  347 1681 1682  347 1681 1682 1683 1684
+  347 1683 1684  347 1685 1686 1685 1686  345  349 1687 1688  349 1687 1688  311
+ 1689 1690  311  349 1689 1690  349  350 1691 1692  350 1691 1692  352  353 1693
+ 1694  353 1693 1694  354  355 1695 1696 1698  355 1695 1696  353  355  356 1697
+ 1698  354  355  356 1695 1697 1698  311  357 1699 1700  311 1699 1700  355  358
+ 1701 1702  358 1701 1702  350 1703 1704  350  358 1703 1704  357  358 1705 1706
+  357 1705 1706  350  360 1707 1708  350 1707 1708  356  361 1709 1710  361 1709
+ 1710  311 1711 1712  311  361 1711 1712  360  361 1713 1714  360 1713 1714  363
+ 1715 1716 1715 1716 1717 1718  364 1717 1718  363  364 1719 1720  363 1719 1720
+  350  367 1721 1722  367 1721 1722  365  368 1723 1724 1726  365 1723 1724  367
+  369 1725 1726  365  367  368  369 1723 1725 1726  369  370 1727 1728  370 1727
+ 1728 1729 1730  370 1729 1730  370  372 1731 1732  372 1731 1732  372 1733 1734
+ 1733 1734  364  365 1735 1736  364 1735 1736  373  374 1737 1738  374 1737 1738
+  375  376 1739 1740 1742  376 1739 1740  374  377 1741 1742  374  375  376  377
+ 1739 1741 1742  377  378 1743 1744  378 1743 1744 1745 1746  378 1745 1746  378
+ 1747 1748 1747 1748  376  380 1749 1750  380 1749 1750  342 1751 1752  342  380
+ 1751 1752  380  381 1753 1754  381 1753 1754  383  384 1755 1756  384 1755 1756
+  385  386 1757 1758 1760  386 1757 1758  384  386  387 1759 1760  385  386  387
+ 1757 1759 1760  342  388 1761 1762  342 1761 1762  386  389 1763 1764  389 1763
+ 1764  381 1765 1766  381  389 1765 1766  388  389 1767 1768  388 1767 1768  381
+  391 1769 1770  381 1769 1770  387  392 1771 1772  392 1771 1772  342 1773 1774
+  342  392 1773 1774  391  392 1775 1776  391 1775 1776  394 1777 1778 1777 1778
+ 1779 1780  395 1779 1780  394  395 1781 1782  394 1781 1782  381  398 1783 1784
+  398 1783 1784  396  399 1785 1786 1788  396 1785 1786  398  400 1787 1788  396
+  398  399  400 1785 1787 1788  400  401 1789 1790  401 1789 1790 1791 1792  401
+ 1791 1792  401  403 1793 1794  403 1793 1794  403 1795 1796 1795 1796  395  396
+ 1797 1798  395 1797 1798  404  405 1799 1800  405 1799 1800  406  407 1801 1802
+ 1804  407 1801 1802  405  408 1803 1804  405  406  407  408 1801 1803 1804  408
+  409 1805 1806  409 1805 1806 1807 1808  409 1807 1808  409 1809 1810 1809 1810
+  407  411 1811 1812  411 1811 1812  373 1813 1814  373  411 1813 1814  411  412
+ 1815 1816  412 1815 1816  414  415 1817 1818  415 1817 1818  416  417 1819 1820
+ 1822  417 1819 1820  415  417  418 1821 1822  416  417  418 1819 1821 1822  373
+  419 1823 1824  373 1823 1824  417  420 1825 1826  420 1825 1826  412 1827 1828
+  412  420 1827 1828  419  420 1829 1830  419 1829 1830  412  422 1831 1832  412
+ 1831 1832  418  423 1833 1834  423 1833 1834  373 1835 1836  373  423 1835 1836
+  422  423 1837 1838  422 1837 1838  425 1839 1840 1839 1840 1841 1842  426 1841
+ 1842  425  426 1843 1844  425 1843 1844  412  429 1845 1846  429 1845 1846  427
+  430 1847 1848 1850  427 1847 1848  429  431 1849 1850  427  429  430  431 1847
+ 1849 1850  431  432 1851 1852  432 1851 1852 1853 1854  432 1853 1854  432  434
+ 1855 1856  434 1855 1856  434 1857 1858 1857 1858  426  427 1859 1860  426 1859
+ 1860  435  436 1861 1862  436 1861 1862  437  438 1863 1864 1866  438 1863 1864
+  436  439 1865 1866  436  437  438  439  442 1863 1865 1866  439  440 1867 1868
+  440 1867 1868 1869 1870  440 1869 1870  440 1871 1872 1871 1872  438  442 1873
+ 1874  442 1873 1874  404 1875 1876  404  442 1875 1876  442  443 1877 1878  443
+ 1877 1878  445  446 1879 1880  446 1879 1880  447  448 1881 1882 1884  448 1881
+ 1882  446  449 1883 1884  446  447  448  449  451 1881 1883 1884  404  450 1885
+ 1886  404 1885 1886  448  451 1887 1888  451 1887 1888  443 1889 1890  443  451
+ 1889 1890  450  451 1891 1892  450 1891 1892  443  453 1893 1894  443 1893 1894
+  449  454 1895 1896  454 1895 1896  404 1897 1898  404  454 1897 1898  453  454
+ 1899 1900  453 1899 1900  456 1901 1902 1901 1902 1903 1904  457 1903 1904  456
+  457 1905 1906  456 1905 1906  443  460 1907 1908  460 1907 1908  458  461 1909
+ 1910 1912  458 1909 1910  460  462 1911 1912  458  460  461  462 1909 1911 1912
+  462  463 1913 1914  463 1913 1914 1915 1916  463 1915 1916  463  465 1917 1918
+  465 1917 1918  465 1919 1920 1919 1920  457  458 1921 1922  457 1921 1922  466
+  467 1923 1924  467 1923 1924  468  469 1925 1926 1928  469 1925 1926  467  470
+ 1927 1928  467  468  469  470  473 1925 1927 1928  470  471 1929 1930  471 1929
+ 1930 1931 1932  471 1931 1932  471 1933 1934 1933 1934  469  473 1935 1936  473
+ 1935 1936  435 1937 1938  435  473 1937 1938  473  474 1939 1940  474 1939 1940
+  476  477 1941 1942  477 1941 1942  478  479 1943 1944 1946  479 1943 1944  477
+  479  480 1945 1946  478  479  480 1943 1945 1946  435  481 1947 1948  435 1947
+ 1948  479  482 1949 1950  482 1949 1950  474 1951 1952  474  482 1951 1952  481
+  482 1953 1954  481 1953 1954  474  484 1955 1956  474 1955 1956  480  485 1957
+ 1958  485 1957 1958  435 1959 1960  435  485 1959 1960  484  485 1961 1962  484
+ 1961 1962  487 1963 1964 1963 1964 1965 1966  488 1965 1966  487  488 1967 1968
+  487 1967 1968  474  491 1969 1970  491 1969 1970  489  492 1971 1972 1974  489
+ 1971 1972  491  493 1973 1974  489  491  492  493 1971 1973 1974 3953  493  494
+ 1975 1976  494 1975 1976 1977 1978  494 1977 1978  494  496 1979 1980  496 1979
+ 1980  496 1981 1982 1981 1982  488  489 1983 1984  488 1983 1984  497  498 1985
+ 1986  498 1985 1986  499  500 1987 1988 1990  500 1987 1988  498  501 1989 1990
+  498  499  500  501  504 1987 1989 1990  501  502 1991 1992  502 1991 1992 1993
+ 1994  502 1993 1994  502 1995 1996 1995 1996  500  504 1997 1998  504 1997 1998
+  466 1999 2000  466  504 1999 2000  504  505 2001 2002  505 2001 2002  507  508
+ 2003 2004  508 2003 2004  509  510 2005 2006 2008  510 2005 2006  508  510  511
+ 2007 2008  509  510  511 2005 2007 2008  466  512 2009 2010  466 2009 2010  510
+  513 2011 2012  513 2011 2012  505 2013 2014  505  513 2013 2014  512  513 2015
+ 2016  512 2015 2016  505  515 2017 2018  505 2017 2018  511  516 2019 2020  516
+ 2019 2020  466 2021 2022  466  516 2021 2022  515  516 2023 2024  515 2023 2024
+  518 2025 2026 2025 2026 2027 2028  519 2027 2028  518  519 2029 2030  518 2029
+ 2030  505  522 2031 2032  522 2031 2032  520  523 2033 2034 2036  520 2033 2034
+  522  524 2035 2036  520  522  523  524 2033 2035 2036  524  525 2037 2038  525
+ 2037 2038 2039 2040  525 2039 2040  525  527 2041 2042  527 2041 2042  527 2043
+ 2044 2043 2044  519  520 2045 2046  519 2045 2046  528  529 2047 2048  529 2047
+ 2048  530  531 2049 2050 2052  531 2049 2050  529  532 2051 2052  529  530  531
+  532 2049 2051 2052  532  533 2053 2054  533 2053 2054 2055 2056  533 2055 2056
+  533 2057 2058 2057 2058  531  535 2059 2060  535 2059 2060  497 2061 2062  497
+  535 2061 2062  535  536 2063 2064  536 2063 2064  538  539 2065 2066  539 2065
+ 2066  540  541 2067 2068 2070  541 2067 2068  539  542 2069 2070  539  540  541
+  542 2067 2069 2070  497  543 2071 2072  497 2071 2072  541  544 2073 2074  544
+ 2073 2074  536 2075 2076  536  544 2075 2076  543  544 2077 2078  543 2077 2078
+  536  546 2079 2080  536 2079 2080  542  547 2081 2082  547 2081 2082  497 2083
+ 2084  497  547 2083 2084  546  547 2085 2086  546 2085 2086  549 2087 2088 2087
+ 2088 2089 2090  550 2089 2090  549  550 2091 2092  549 2091 2092  536  553 2093
+ 2094  553 2093 2094  551  554 2095 2096 2098  551 2095 2096  553  555 2097 2098
+  551  553  554  555 2095 2097 2098  555  556 2099 2100  556 2099 2100 2101 2102
+  556 2101 2102  556  558 2103 2104  558 2103 2104  558 2105 2106 2105 2106  550
+  551 2107 2108  550 2107 2108  559  560 2109 2110  560 2109 2110  561  562 2111
+ 2112 2114  562 2111 2112  560  563 2113 2114  560  561  562  563  566 2111 2113
+ 2114  563  564 2115 2116  564 2115 2116 2117 2118  564 2117 2118  564 2119 2120
+ 2119 2120  562  566 2121 2122  566 2121 2122  528 2123 2124  528  566 2123 2124
+  566  567 2125 2126  567 2125 2126  569  570 2127 2128  570 2127 2128  571  572
+ 2129 2130 2132  572 2129 2130  570  572  573 2131 2132  571  572  573 2129 2131
+ 2132  528  574 2133 2134  528 2133 2134  572  575 2135 2136  575 2135 2136  567
+ 2137 2138  567  575 2137 2138  574  575 2139 2140  574 2139 2140  567  577 2141
+ 2142  567 2141 2142  573  578 2143 2144  578 2143 2144  528 2145 2146  528  578
+ 2145 2146  577  578 2147 2148  577 2147 2148  580 2149 2150 2149 2150 2151 2152
+  581 2151 2152  580  581 2153 2154  580 2153 2154  567  584 2155 2156  584 2155
+ 2156  582  585 2157 2158 2160  582 2157 2158  584  586 2159 2160  582  584  585
+  586 2157 2159 2160  586  587 2161 2162  587 2161 2162 2163 2164  587 2163 2164
+  587  589 2165 2166  589 2165 2166  589 2167 2168 2167 2168  581  582 2169 2170
+  581 2169 2170  590  591 2171 2172  591 2171 2172  592  593 2173 2174 2176  593
+ 2173 2174  591  594 2175 2176  591  592  593  594 2173 2175 2176  594  595 2177
+ 2178  595 2177 2178 2179 2180  595 2179 2180  595 2181 2182 2181 2182  593  597
+ 2183 2184  597 2183 2184  559 2185 2186  559  597 2185 2186  597  598 2187 2188
+  598 2187 2188  600  601 2189 2190  601 2189 2190  602  603 2191 2192 2194  603
+ 2191 2192  601  603  604 2193 2194  602  603  604 2191 2193 2194  559  605 2195
+ 2196  559 2195 2196  603  606 2197 2198  606 2197 2198  598 2199 2200  598  606
+ 2199 2200  605  606 2201 2202  605 2201 2202  598  608 2203 2204  598 2203 2204
+  604  609 2205 2206  609 2205 2206  559 2207 2208  559  609 2207 2208  608  609
+ 2209 2210  608 2209 2210  611 2211 2212 2211 2212 2213 2214  612 2213 2214  611
+  612 2215 2216  611 2215 2216  598  615 2217 2218  615 2217 2218  613  616 2219
+ 2220 2222  613 2219 2220  615  617 2221 2222  613  615  616  617 2219 2221 2222
+  617  618 2223 2224  618 2223 2224 2225 2226  618 2225 2226  618  620 2227 2228
+  620 2227 2228  620 2229 2230 2229 2230  612  613 2231 2232  612 2231 2232  621
+  622 2233 2234  622 2233 2234  623  624 2235 2236 2238  624 2235 2236  622  624
+  625 2237 2238  623  624  625 2235 2237 2238  625  626 2239 2240  626 2239 2240
+ 2241 2242  626 2241 2242  626 2243 2244 2243 2244  624  628 2245 2246  628 2245
+ 2246  590 2247 2248  590  628 2247 2248  628  629 2249 2250  629 2249 2250  631
+  632 2251 2252 4233  632 2251 2252  633  634 2253 2254 2256  634 2253 2254  632
+  634  635 2255 2256  633  634  635 2253 2255 2256  590  636 2257 2258  590 2257
+ 2258  634  637 2259 2260  637 2259 2260  629 2261 2262  629  637 2261 2262  636
+  637 2263 2264  636 2263 2264  629  639 2265 2266  629 2265 2266  635  640 2267
+ 2268  640 2267 2268  590 2269 2270  590  640 2269 2270  639  640 2271 2272  639
+ 2271 2272  642 2273 2274 2273 2274 2275 2276  643 2275 2276  642  643 2277 2278
+ 4258  642 2277 2278  629  646 2279 2280  646 2279 2280  644  647 2281 2282 2284
+  644 2281 2282  646  648 2283 2284  644  646  647  648 2281 2283 2284  648  649
+ 2285 2286  649 2285 2286 2287 2288  649 2287 2288  649  651 2289 2290 4271  651
+ 2289 2290  651 2291 2292 2291 2292  643  644 2293 2294  643 2293 2294  652  653
+ 2295 2296  653 2295 2296  654  655 2297 2298 2300  655 2297 2298  653  655  656
+ 2299 2300  654  655  656 2297 2299 2300  656  657 2301 2302  657 2301 2302 2303
+ 2304  657 2303 2304  657 2305 2306 2305 2306  655  659 2307 2308  659 2307 2308
+  621 2309 2310  621  659 2309 2310  659  660 2311 2312  660 2311 2312  662  663
+ 2313 2314 4295  663 2313 2314  664  665 2315 2316 2318  665 2315 2316  663  665
+  666 2317 2318  664  665  666 2315 2317 2318  621  667 2319 2320  621 2319 2320
+  665  668 2321 2322  668 2321 2322  660 2323 2324  660  668 2323 2324  667  668
+ 2325 2326  667 2325 2326  660  670 2327 2328  660 2327 2328  666  671 2329 2330
+  671 2329 2330  621 2331 2332  621  671 2331 2332  670  671 2333 2334  670 2333
+ 2334  673 2335 2336 2335 2336 2337 2338  674 2337 2338  673  674 2339 2340  673
+ 2339 2340  660  677 2341 2342  677 2341 2342  675  678 2343 2344 2346  675 2343
+ 2344  677  679 2345 2346  675  677  678  679 2343 2345 2346  679  680 2347 2348
+  680 2347 2348 2349 2350  680 2349 2350  680  682 2351 2352  682 2351 2352  682
+ 2353 2354 2353 2354  674  675 2355 2356  674 2355 2356  683  684 2357 2358  684
+ 2357 2358  685  686 2359 2360 2362  686 2359 2360  684  686  687 2361 2362  685
+  686  687 2359 2361 2362  687  688 2363 2364 4347  688 2363 2364 2365 2366  688
+ 2365 2366  688 2367 2368 2367 2368  686  690 2369 2370  690 2369 2370  652 2371
+ 2372  652  690 2371 2372  690  691 2373 2374  691 2373 2374  693  694 2375 2376
+ 4357  694 2375 2376  695  696 2377 2378 2380  696 2377 2378  694  697 2379 2380
+  694  695  696  697  699  702 2377 2379 2380  652  698 2381 2382  652 2381 2382
+  696  699 2383 2384  699 2383 2384  691 2385 2386  691  699 2385 2386  698  699
+ 2387 2388  698 2387 2388  691  701 2389 2390  691 2389 2390  697  702 2391 2392
+  702 2391 2392  652 2393 2394  652  702 2393 2394  701  702 2395 2396  701 2395
+ 2396  704 2397 2398 2397 2398 2399 2400  705 2399 2400  704  705 2401 2402 4399
+  704 2401 2402  691  708 2403 2404  708 2403 2404  706  709 2405 2406 2408  706
+ 2405 2406  708  710 2407 2408  706  708  709  710 2405 2407 2408  710  711 2409
+ 2410  711 2409 2410 2411 2412  711 2411 2412  711  713 2413 2414 4394  713 2413
+ 2414  713 2415 2416 2415 2416  705  706 2417 2418  705 2417 2418  714  715 2419
+ 2420  715 2419 2420  716  717 2421 2422 2424  717 2421 2422  715  717  718 2423
+ 2424  716  717  718 2421 2423 2424  718  719 2425 2426  719 2425 2426 2427 2428
+  719 2427 2428  719 2429 2430 2429 2430  717  721 2431 2432  721 2431 2432  683
+ 2433 2434  683  721 2433 2434  721  722 2435 2436  722 2435 2436  724  725 2437
+ 2438 4419  725 2437 2438  726  727 2439 2440 2442  727 2439 2440  725  728 2441
+ 2442  725  726  727  728 2439 2441 2442  683  729 2443 2444  683 2443 2444  727
+  730 2445 2446  730 2445 2446  722 2447 2448  722  730 2447 2448  729  730 2449
+ 2450  729 2449 2450  722  732 2451 2452  722 2451 2452  728  733 2453 2454  733
+ 2453 2454  683 2455 2456  683  733 2455 2456  732  733 2457 2458  732 2457 2458
+  735 2459 2460 2459 2460 2461 2462  736 2461 2462  735  736 2463 2464  735 2463
+ 2464  722  739 2465 2466  739 2465 2466  737  740 2467 2468 2470  737 2467 2468
+  739  741 2469 2470  737  739  740  741 2467 2469 2470  741  742 2471 2472  742
+ 2471 2472 2473 2474  742 2473 2474  742  744 2475 2476  744 2475 2476  744 2477
+ 2478 2477 2478  736  737 2479 2480  736 2479 2480  745  746 2481 2482  746 2481
+ 2482  747  748 2483 2484 2486  748 2483 2484  746  748  749 2485 2486  747  748
+  749 2483 2485 2486  749  750 2487 2488  750 2487 2488 2489 2490  750 2489 2490
+  750 2491 2492 2491 2492  748  752 2493 2494  752 2493 2494  714 2495 2496  714
+  752 2495 2496  752  753 2497 2498  753 2497 2498  755  756 2499 2500 4481  756
+ 2499 2500  757  758 2501 2502 2504  758 2501 2502  756  758  759 2503 2504  757
+  758  759 2501 2503 2504  714  760 2505 2506  714 2505 2506  758  761 2507 2508
+  761 2507 2508  753 2509 2510  753  761 2509 2510  760  761 2511 2512  760 2511
+ 2512  753  763 2513 2514  753 2513 2514  759  764 2515 2516  764 2515 2516  714
+ 2517 2518  714  764 2517 2518  763  764 2519 2520  763 2519 2520  766 2521 2522
+ 2521 2522 2523 2524  767 2523 2524  766  767 2525 2526 4523  766 2525 2526  753
+  770 2527 2528  770 2527 2528  768  771 2529 2530 2532  768 2529 2530  770  772
+ 2531 2532  768  770  771  772 2529 2531 2532  772  773 2533 2534  773 2533 2534
+ 2535 2536  773 2535 2536  773  775 2537 2538 4519  775 2537 2538  775 2539 2540
+ 2539 2540  767  768 2541 2542 4503  767 2541 2542  776  777 2543 2544  777 2543
+ 2544  778  779 2545 2546 2548  779 2545 2546  777  780 2547 2548  777  778  779
+  780  783 2545 2547 2548  780  781 2549 2550  781 2549 2550 2551 2552  781 2551
+ 2552  781 2553 2554 2553 2554  779  783 2555 2556  783 2555 2556  745 2557 2558
+  745  783 2557 2558  783  784 2559 2560  784 2559 2560  786  787 2561 2562 4543
+  787 2561 2562  788  789 2563 2564 2566  789 2563 2564  787  789  790 2565 2566
+  788  789  790 2563 2565 2566  745  791 2567 2568  745 2567 2568  789  792 2569
+ 2570  792 2569 2570  784 2571 2572  784  792 2571 2572  791  792 2573 2574  791
+ 2573 2574  784  794 2575 2576  784 2575 2576  790  795 2577 2578  795 2577 2578
+  745 2579 2580  745  795 2579 2580  794  795 2581 2582  794 2581 2582  797 2583
+ 2584 2583 2584 2585 2586  798 2585 2586  797  798 2587 2588  797 2587 2588  784
+  801 2589 2590  801 2589 2590  799  802 2591 2592 2594  799 2591 2592  801  803
+ 2593 2594  799  801  802  803 2591 2593 2594  803  804 2595 2596  804 2595 2596
+ 2597 2598  804 2597 2598  804  806 2599 2600  806 2599 2600  806 2601 2602 2601
+ 2602  798  799 2603 2604  798 2603 2604  807  808 2605 2606  808 2605 2606  809
+  810 2607 2608 2610  810 2607 2608  808  811 2609 2610  808  809  810  811  814
+ 2607 2609 2610  811  812 2611 2612  812 2611 2612 2613 2614  812 2613 2614  812
+ 2615 2616 2615 2616  810  814 2617 2618  814 2617 2618  776 2619 2620  776  814
+ 2619 2620  814  815 2621 2622  815 2621 2622  817  818 2623 2624 4605  818 2623
+ 2624  819  820 2625 2626 2628  820 2625 2626  818  820  821 2627 2628  819  820
+  821 2625 2627 2628  776  822 2629 2630  776 2629 2630  820  823 2631 2632  823
+ 2631 2632  815 2633 2634  815  823 2633 2634  822  823 2635 2636  822 2635 2636
+  815  825 2637 2638  815 2637 2638  821  826 2639 2640  826 2639 2640  776 2641
+ 2642  776  826 2641 2642  825  826 2643 2644  825 2643 2644  828 2645 2646 2645
+ 2646 2647 2648  829 2647 2648  828  829 2649 2650  828 2649 2650  815  832 2651
+ 2652  832 2651 2652  830  833 2653 2654 2656  830 2653 2654  832  834 2655 2656
+  830  832  833  834 2653 2655 2656  834  835 2657 2658  835 2657 2658 2659 2660
+  835 2659 2660  835  837 2661 2662  837 2661 2662  837 2663 2664 2663 2664  829
+  830 2665 2666  829 2665 2666  838  839 2667 2668  839 2667 2668  840  841 2669
+ 2670 2672  841 2669 2670  839  842 2671 2672  839  840  841  842  845 2669 2671
+ 2672  842  843 2673 2674  843 2673 2674 2675 2676  843 2675 2676  843 2677 2678
+ 2677 2678  841  845 2679 2680  845 2679 2680  807 2681 2682  807  845 2681 2682
+  845  846 2683 2684  846 2683 2684  848  849 2685 2686  849 2685 2686  850  851
+ 2687 2688 2690  851 2687 2688  849  852 2689 2690  849  850  851  852  854 2687
+ 2689 2690  807  853 2691 2692  807 2691 2692  851  854 2693 2694  854 2693 2694
+  846 2695 2696  846  854 2695 2696  853  854 2697 2698  853 2697 2698  846  856
+ 2699 2700  846 2699 2700  852  857 2701 2702  857 2701 2702  807 2703 2704  807
+  857 2703 2704  856  857 2705 2706  856 2705 2706  859 2707 2708 2707 2708 2709
+ 2710  860 2709 2710  859  860 2711 2712  859 2711 2712  846  863 2713 2714  863
+ 2713 2714  861  864 2715 2716 2718  861 2715 2716  863  865 2717 2718  861  863
+  864  865 2715 2717 2718  865  866 2719 2720  866 2719 2720 2721 2722  866 2721
+ 2722  866  868 2723 2724  868 2723 2724  868 2725 2726 2725 2726  860  861 2727
+ 2728  860 2727 2728  869  870 2729 2730  870 2729 2730  871  872 2731 2732 2734
+  872 2731 2732  870  873 2733 2734  870  871  872  873 2731 2733 2734  873  874
+ 2735 2736  874 2735 2736 2737 2738  874 2737 2738  874 2739 2740 2739 2740  872
+  876 2741 2742  876 2741 2742  838 2743 2744  838  876 2743 2744  876  877 2745
+ 2746  877 2745 2746  879  880 2747 2748  880 2747 2748  881  882 2749 2750 2752
+  882 2749 2750  880  882  883 2751 2752  881  882  883 2749 2751 2752  838  884
+ 2753 2754  838 2753 2754  882  885 2755 2756  885 2755 2756  877 2757 2758  877
+  885 2757 2758  884  885 2759 2760  884 2759 2760  877  887 2761 2762  877 2761
+ 2762  883  888 2763 2764  888 2763 2764  838 2765 2766  838  888 2765 2766  887
+  888 2767 2768  887 2767 2768  890 2769 2770 2769 2770 2771 2772  891 2771 2772
+  890  891 2773 2774  890 2773 2774  877  894 2775 2776  894 2775 2776  892  895
+ 2777 2778 2780  892 2777 2778  894  896 2779 2780  892  894  895  896 2777 2779
+ 2780  896  897 2781 2782  897 2781 2782 2783 2784  897 2783 2784  897  899 2785
+ 2786  899 2785 2786  899 2787 2788 2787 2788  891  892 2789 2790  891 2789 2790
+  900  901 2791 2792  901 2791 2792  902  903 2793 2794 2796  903 2793 2794  901
+  904 2795 2796  901  902  903  904  907 2793 2795 2796  904  905 2797 2798  905
+ 2797 2798 2799 2800  905 2799 2800  905 2801 2802 2801 2802  903  907 2803 2804
+  907 2803 2804  869 2805 2806  869  907 2805 2806  907  908 2807 2808  908 2807
+ 2808  910  911 2809 2810  911 2809 2810  912  913 2811 2812 2814  913 2811 2812
+  911  913  914 2813 2814  912  913  914 2811 2813 2814  869  915 2815 2816  869
+ 2815 2816  913  916 2817 2818  916 2817 2818  908 2819 2820  908  916 2819 2820
+  915  916 2821 2822  915 2821 2822  908  918 2823 2824  908 2823 2824  914  919
+ 2825 2826  919 2825 2826  869 2827 2828  869  919 2827 2828  918  919 2829 2830
+  918 2829 2830  921 2831 2832 2831 2832 2833 2834  922 2833 2834  921  922 2835
+ 2836  921 2835 2836  908  925 2837 2838  925 2837 2838  923  926 2839 2840 2842
+  923 2839 2840  925  927 2841 2842  923  925  926  927 2839 2841 2842  927  928
+ 2843 2844  928 2843 2844 2845 2846  928 2845 2846  928  930 2847 2848  930 2847
+ 2848  930 2849 2850 2849 2850  922  923 2851 2852  922 2851 2852  931  932 2853
+ 2854  932 2853 2854  933  934 2855 2856 2858  934 2855 2856  932  935 2857 2858
+  932  933  934  935 2855 2857 2858  935  936 2859 2860  936 2859 2860 2861 2862
+  936 2861 2862  936 2863 2864 2863 2864  934  938 2865 2866  938 2865 2866  900
+ 2867 2868  900  938 2867 2868  938  939 2869 2870  939 2869 2870  941  942 2871
+ 2872  942 2871 2872  943  944 2873 2874 2876  944 2873 2874  942  944  945 2875
+ 2876  943  944  945 2873 2875 2876  900  946 2877 2878  900 2877 2878  944  947
+ 2879 2880  947 2879 2880  939 2881 2882  939  947 2881 2882  946  947 2883 2884
+  946 2883 2884  939  949 2885 2886  939 2885 2886  945  950 2887 2888  950 2887
+ 2888  900 2889 2890  900  950 2889 2890  949  950 2891 2892  949 2891 2892  952
+ 2893 2894 2893 2894 2895 2896  953 2895 2896  952  953 2897 2898  952 2897 2898
+  939  956 2899 2900  956 2899 2900  954  957 2901 2902 2904  954 2901 2902  956
+  958 2903 2904  954  956  957  958 2901 2903 2904  958  959 2905 2906  959 2905
+ 2906 2907 2908  959 2907 2908  959  961 2909 2910  961 2909 2910  961 2911 2912
+ 2911 2912  953  954 2913 2914  953 2913 2914  962  963 2915 2916 4899  963 2915
+ 2916  964  965 2917 2918 2920  965 2917 2918  963  965  966 2919 2920  964  965
+  966 2917 2919 2920  966  967 2921 2922  967 2921 2922 2923 2924  967 2923 2924
+  967 2925 2926 2925 2926  965  969 2927 2928  969 2927 2928  931 2929 2930  931
+  969 2929 2930  969  970 2931 2932  970 2931 2932  972  973 2933 2934 4915  973
+ 2933 2934  974  975 2935 2936 2938  975 2935 2936  973  975  976 2937 2938  974
+  975  976 2935 2937 2938  931  977 2939 2940  931 2939 2940  975  978 2941 2942
+  978 2941 2942  970 2943 2944  970  978 2943 2944  977  978 2945 2946  977 2945
+ 2946  970  980 2947 2948  970 2947 2948  976  981 2949 2950  981 2949 2950  931
+ 2951 2952  931  981 2951 2952  980  981 2953 2954  980 2953 2954  983 2955 2956
+ 2955 2956 2957 2958  984 2957 2958  983  984 2959 2960  983 2959 2960  970  987
+ 2961 2962  987 2961 2962  985  988 2963 2964 2966  985 2963 2964  987  989 2965
+ 2966  985  987  988  989 2963 2965 2966  989  990 2967 2968  990 2967 2968 2969
+ 2970  990 2969 2970  990  992 2971 2972  992 2971 2972  992 2973 2974 2973 2974
+  984  985 2975 2976  984 2975 2976    1    2 2977 2978    2 2977 2978    2    4
+ 2979 2980    4 2979 2980    5 2981 2982    2    5 2981 2982    5    6  999 2983
+ 2984    6 2983 2984    6 2985 2986    7 2985 2986    7 2987 2988 2987 2988    4
+    8 2989 2990    8 2989 2990    9   10 2991 2992 3038    9 2991 2992   11   12
+ 1011 2993 2994   12 2993 2994   12   14 2995 2996   14 2995 2996   15 2997 2998
+   12   15 2997 2998   14   17 2999 3000   17 2999 3000    9   17 3001 3002    9
+   17   18 3001 3002 3003   16   17   18 3002 3003 3004   16 3003 3004   16 3005
+ 3006 3005 3006   15   20 3007 3008   20 3007 3008   20 3009 3010   20   21 3009
+ 3010 3011   19   20   21 3010 3011 3012   19 3011 3012    9   19 3013 3014    9
+ 3013 3014   23   24 3015 3016   23 3015 3016   23 3017 3018   23   25 3017 3018
+ 3035   22 3019 3020 3019 3020    9   26 3021 3022   26 3021 3022   24   26 3023
+ 3024   24 3023 3024   28 3025 3026   26   28 3025 3026   28   29 3027 3028   29
+ 3027 3028   29 3029 3030   29   30   31 3029 3030   30   31 3031 3032   31 3031
+ 3032   31 3033 3034 3033 3034   22   23   25 3018 3035 3036   22 3035 3036    8
+ 3037 3038    8    9   10 2991 3037 3038   32   33 3039 3040   33 3039 3040   33
+   35 3041 3042   35 3041 3042   36 3043 3044   33   36 3043 3044   36   37 3045
+ 3046   37 3045 3046   37 3047 3048   38 3047 3048   38 3049 3050 3049 3050   35
+   39 3051 3052   39 3051 3052   40   41 3053 3054 3100   40 3053 3054   42   43
+ 3055 3056   43 3055 3056   43   45 3057 3058   45 3057 3058   46 3059 3060   43
+   46 3059 3060   45   48 3061 3062   48 3061 3062   40   48 3063 3064    1   40
+   47   48   49 3063 3064 3065   47   49 3064 3065 3066   47 3065 3066    1   47
+ 3067 3068    1 3067 3068   46   51 3069 3070   51 3069 3070    1   50   51 3071
+ 3072    1   50   52 3071 3072 3073   50   52 3072 3073 3074   50 3073 3074   40
+   50 3075 3076   40 3075 3076   54   55 3077 3078   54 3077 3078   54 3079 3080
+   53   54   56 3079 3080   53 3081 3082 3081 3082   40   57 3083 3084   57 3083
+ 3084   55   57 3085 3086   55 3085 3086   59 3087 3088   57   59 3087 3088   59
+   60 3089 3090   60 3089 3090   60 3091 3092   60   61 3091 3092 3093   60   61
+   62 3092 3093 3094   62 3093 3094   62 3095 3096 3095 3096   53   56 3097 3098
+   53 3097 3098    1   39   40 3099 3100    1   40   41 3053 3099 3100   63   64
+ 3101 3102   64 3101 3102   64   66 3103 3104   66 3103 3104   67 3105 3106   64
+   67 3105 3106   67   68 3107 3108   68 3107 3108   68 3109 3110   69 3109 3110
+   69 3111 3112 3111 3112   66   70 3113 3114   70 3113 3114   71   72 3115 3116
+ 3162   71 3115 3116   73   74 3117 3118   74 3117 3118   74   76 3119 3120   76
+ 3119 3120   77 3121 3122   74   77 3121 3122   76   79 3123 3124   79 3123 3124
+   71   78   79 3125 3126   71   78   80 3125 3126 3127   78   80 3126 3127 3128
+   78 3127 3128   32   78 3129 3130   32 3129 3130   77   82 3131 3132   82 3131
+ 3132   32   82 3133 3134   32   71   81   82   83 3133 3134 3135   81   83 3134
+ 3135 3136   81 3135 3136   71   81 3137 3138   71 3137 3138   85   86 3139 3140
+   85 3139 3140   85 3141 3142   85   87 3141 3142 3159   84 3143 3144 3143 3144
+   71   88 3145 3146   88 3145 3146   86   88 3147 3148   86 3147 3148   90 3149
+ 3150   88   90 3149 3150   90   91 3151 3152   91 3151 3152   91 3153 3154   91
+   92 3153 3154 3155   91   92   93 3154 3155 3156   93 3155 3156   93 3157 3158
+ 3157 3158   84   85   87 3142 3159 3160   84 3159 3160   32   70 3161 3162   32
+   70   71   72 3115 3161 3162   94   95 3163 3164   95 3163 3164   95   97 3165
+ 3166   97 3165 3166   98 3167 3168   95   98 3167 3168   98   99 3169 3170   99
+ 3169 3170   99 3171 3172  100 3171 3172  100 3173 3174 3173 3174   97  101 3175
+ 3176  101 3175 3176  102  103 3177 3178 3224  102 3177 3178  104  105 3179 3180
+  105 3179 3180  105  107 3181 3182  107 3181 3182  108 3183 3184  105  108 3183
+ 3184  107  110 3185 3186  110 3185 3186  102  109  110 3187 3188  102  109  111
+ 3187 3188 3189  109  111 3188 3189 3190  109 3189 3190   63  109 3191 3192   63
+ 3191 3192  108  113 3193 3194  113 3193 3194   63  113 3195 3196   63  102  112
+  113  114 3195 3196 3197  112  114 3196 3197 3198  112 3197 3198  102  112 3199
+ 3200  102 3199 3200  116  117 3201 3202  116 3201 3202  116 3203 3204  116  118
+ 3203 3204 3221  115 3205 3206 3205 3206  102  119 3207 3208  119 3207 3208  117
+  119 3209 3210  117 3209 3210  121 3211 3212  119  121 3211 3212  121  122 3213
+ 3214  122 3213 3214  122 3215 3216  122  123  124 3215 3216  123  124 3217 3218
+  124 3217 3218  124 3219 3220 3219 3220  115  116  118 3204 3221 3222  115 3221
+ 3222   63  101  102 3223 3224   63  102  103 3177 3223 3224  125  126 3225 3226
+  126 3225 3226  126  128 3227 3228  128 3227 3228  129 3229 3230  126  129 3229
+ 3230  129  130 3231 3232  130 3231 3232  130 3233 3234  131 3233 3234  131 3235
+ 3236 3235 3236  128  132 3237 3238  132 3237 3238  133  134 3239 3240 3286  133
+ 3239 3240  135  136 1259 3241 3242  136 3241 3242  136  138 3243 3244  138 3243
+ 3244  139 3245 3246  136  139 3245 3246  138  141 3247 3248  141 3247 3248  133
+  140  141 3249 3250  133  140  142 3249 3250 3251  140  142 3250 3251 3252  140
+ 3251 3252   94  140 3253 3254   94 3253 3254  139  144 3255 3256  144 3255 3256
+   94  143  144 3257 3258   94  143  145 3257 3258 3259  143  145 3258 3259 3260
+  143 3259 3260  133  143 3261 3262  133 3261 3262  147  148 3263 3264  147 3263
+ 3264  147 3265 3266  146  147  149 3265 3266  146 3267 3268 3267 3268  133  150
+ 3269 3270  150 3269 3270  148  150 3271 3272  148 3271 3272  152 3273 3274  150
+  152 3273 3274  152  153 3275 3276  153 3275 3276  153 3277 3278  153  154 3277
+ 3278 3279  153  154  155 3278 3279 3280  155 3279 3280  155 3281 3282 3281 3282
+  146  149 3283 3284  146 3283 3284   94  132  133 3285 3286   94  133  134 3239
+ 3285 3286  156  157 3287 3288  157 3287 3288  157  159 3289 3290  159 3289 3290
+  160 3291 3292  157  160 3291 3292  160  161 3293 3294  161 3293 3294  161 3295
+ 3296  162 3295 3296  162 3297 3298 3297 3298  159  163 3299 3300  163 3299 3300
+  164  165 3301 3302 3348  164 3301 3302  166  167 1321 3303 3304  167 3303 3304
+  167  169 3305 3306  169 3305 3306  170 3307 3308  167  170 3307 3308  169  172
+ 3309 3310  172 3309 3310  164  172 3311 3312  164  171  172  173 3311 3312 3313
+  171  173 3312 3313 3314  171 3313 3314  125  171 3315 3316  125 3315 3316  170
+  175 3317 3318  175 3317 3318  125  174  175 3319 3320  125  174  176 3319 3320
+ 3321  174  176 3320 3321 3322  174 3321 3322  164  174 3323 3324  164 3323 3324
+  178  179 3325 3326  178 3325 3326  178 3327 3328  178  180 3327 3328 3345  177
+ 3329 3330 3329 3330  164  181 3331 3332  181 3331 3332  179  181 3333 3334  179
+ 3333 3334  183 3335 3336  181  183 3335 3336  183  184 3337 3338  184 3337 3338
+  184 3339 3340  184  185 3339 3340 3341  184  185  186 3340 3341 3342  186 3341
+ 3342  186 3343 3344 3343 3344  177  178  180 3328 3345 3346  177 3345 3346  125
+  163  164 3347 3348  125  164  165 3301 3347 3348  187  188 3349 3350  188 3349
+ 3350  188  190 3351 3352  190 3351 3352  191 3353 3354  188  191 3353 3354  191
+  192 3355 3356  192 3355 3356  192 3357 3358  193 3357 3358  193 3359 3360 3359
+ 3360  190  194 3361 3362  194 3361 3362  195  196 3363 3364 3410  195 3363 3364
+  197  198 1383 3365 3366  198 3365 3366  198  200 3367 3368  200 3367 3368  201
+ 3369 3370  198  201 3369 3370  200  203 3371 3372  203 3371 3372  195  202  203
+ 3373 3374  195  202  204 3373 3374 3375  202  204 3374 3375 3376  202 3375 3376
+  156  202 3377 3378  156 3377 3378  201  206 3379 3380  206 3379 3380  156  205
+  206 3381 3382  156  205  207 3381 3382 3383  205  207 3382 3383 3384  205 3383
+ 3384  195  205 3385 3386  195 3385 3386  209  210 3387 3388  209 3387 3388  209
+ 3389 3390  208  209  211 3389 3390  208 3391 3392 3391 3392  195  212 3393 3394
+  212 3393 3394  210  212 1416 3395 3396  210 3395 3396  214 3397 3398  212  214
+ 3397 3398  214  215 3399 3400  215 3399 3400  215 3401 3402  215  216 3401 3402
+ 3403  215  216  217 3402 3403 3404  217 3403 3404  217 3405 3406 3405 3406  208
+  211 3407 3408  208 3407 3408  156  194  195 3409 3410  156  195  196 3363 3409
+ 3410  218  219 3411 3412  219 3411 3412  219  221 3413 3414  221 3413 3414  222
+ 3415 3416  219  222 3415 3416  222  223 3417 3418  223 3417 3418  223 3419 3420
+  224 3419 3420  224 3421 3422 3421 3422  221  225 3423 3424  225 3423 3424  226
+  227 3425 3426 3472  226 3425 3426  228  229 3427 3428  229 3427 3428  229  231
+ 3429 3430  231 3429 3430  232 3431 3432  229  232 3431 3432  231  234 3433 3434
+  234 3433 3434  226  234 3435 3436  187  226  233  234  235 3435 3436 3437  233
+  235 3436 3437 3438  233 3437 3438  187  233 3439 3440  187 3439 3440  232  237
+ 3441 3442  237 3441 3442  187  237 3443 3444  187  226  236  237  238 3443 3444
+ 3445  236  238 3444 3445 3446  236 3445 3446  226  236 3447 3448  226 3447 3448
+  240  241 3449 3450  240 3449 3450  240 3451 3452  240  242 3451 3452 3469  239
+ 3453 3454 3453 3454  226  243 3455 3456  243 3455 3456  241  243 3457 3458  241
+ 3457 3458  245 3459 3460  243  245 3459 3460  245  246 3461 3462  246 3461 3462
+  246 3463 3464  246  247 3463 3464 3465  246  247  248 3464 3465 3466  248 3465
+ 3466  248 3467 3468 3467 3468  239  240  242 3452 3469 3470  239 3469 3470  187
+  225  226 3471 3472  187  226  227 3425 3471 3472  249  250 3473 3474  250 3473
+ 3474  250  252 3475 3476  252 3475 3476  253 3477 3478  250  253 3477 3478  253
+  254 3479 3480  254 3479 3480  254 3481 3482  255 3481 3482  255 3483 3484 3483
+ 3484  252  256 3485 3486  256 3485 3486  257  258 3487 3488 3534  257 3487 3488
+  259  260 3489 3490  260 3489 3490  260  262 3491 3492  262 3491 3492  263 3493
+ 3494  260  263 3493 3494  262  265 3495 3496  265 3495 3496  257  265 3497 3498
+  218  257  264  265  266 3497 3498 3499  264  266 3498 3499 3500  264 3499 3500
+  218  264 3501 3502  218 3501 3502  263  268 3503 3504  268 3503 3504  218  268
+ 3505 3506  218  257  267  268  269 3505 3506 3507  267  269 3506 3507 3508  267
+ 3507 3508  257  267 3509 3510  257 3509 3510  271  272 3511 3512  271 3511 3512
+  271 3513 3514  271  273 3513 3514 3531  270 3515 3516 3515 3516  257  274 3517
+ 3518  274 3517 3518  272  274 3519 3520  272 3519 3520  276 3521 3522  274  276
+ 3521 3522  276  277 3523 3524  277 3523 3524  277 3525 3526  277  278 3525 3526
+ 3527  277  278  279 3526 3527 3528  279 3527 3528  279 3529 3530 3529 3530  270
+  271  273 3514 3531 3532  270 3531 3532  218  256 3533 3534  218  256  257  258
+ 3487 3533 3534  280  281 3535 3536  281 3535 3536  281  283 3537 3538  283 3537
+ 3538  284 3539 3540  281  284 3539 3540  284  285 3541 3542  285 3541 3542  285
+ 3543 3544  286 3543 3544  286 3545 3546 3545 3546  283  287 3547 3548  287 3547
+ 3548  288  289 3549 3550 3596  288 3549 3550  290  291 3551 3552  291 3551 3552
+  291  293 3553 3554  293 3553 3554  294 3555 3556  291  294 3555 3556  293  296
+ 3557 3558  296 3557 3558  288  295  296 3559 3560  288  295  297 3559 3560 3561
+  295  297 3560 3561 3562  295 3561 3562  249  295 3563 3564  249 3563 3564  294
+  299 3565 3566  299 3565 3566  249  298  299 3567 3568  249  298  300 3567 3568
+ 3569  298  300 3568 3569 3570  298 3569 3570  288  298 3571 3572  288 3571 3572
+  302  303 3573 3574  302 3573 3574  302 3575 3576  302  304 3575 3576 3593  301
+ 3577 3578 3577 3578  288  305 3579 3580  305 3579 3580  303  305 3581 3582  303
+ 3581 3582  307 3583 3584  305  307 3583 3584  307  308 3585 3586  308 3585 3586
+  308 3587 3588  308  309 3587 3588 3589  308  309  310 3588 3589 3590  310 3589
+ 3590  310 3591 3592 3591 3592  301  302  304 3576 3593 3594  301 3593 3594  249
+  287  288 3595 3596  249  288  289 3549 3595 3596  311  312 3597 3598  312 3597
+ 3598  312  314 3599 3600  314 3599 3600  315 3601 3602  312  315 3601 3602  315
+  316 3603 3604  316 3603 3604  316 3605 3606  317 3605 3606  317 3607 3608 3607
+ 3608  314  318 3609 3610  318 3609 3610  319  320 3611 3612 3658  319 3611 3612
+  321  322 3613 3614  322 3613 3614  322  324 3615 3616  324 3615 3616  325 3617
+ 3618  322  325 3617 3618  324  327 3619 3620  327 3619 3620  319  326  327 3621
+ 3622  319  326  328 3621 3622 3623  326  328 3622 3623 3624  326 3623 3624  280
+  326 3625 3626  280 3625 3626  325  330 3627 3628  330 3627 3628  280  329  330
+ 3629 3630  280  329  331 3629 3630 3631  329  331 3630 3631 3632  329 3631 3632
+  319  329 3633 3634  319 3633 3634  333  334 3635 3636  333 3635 3636  333 3637
+ 3638  333  335 3637 3638 3655  332 3639 3640 3639 3640  319  336 3641 3642  336
+ 3641 3642  334  336 3643 3644  334 3643 3644  338 3645 3646  336  338 3645 3646
+  338  339 3647 3648  339 3647 3648  339 3649 3650  339  340  341 3649 3650  340
+  341 3651 3652  341 3651 3652  341 3653 3654 3653 3654  332  333  335 3638 3655
+ 3656  332 3655 3656  280  318  319 3657 3658  280  319  320 3611 3657 3658  342
+  343 3659 3660  343 3659 3660  343  345 3661 3662  345 3661 3662  346 3663 3664
+  343  346 3663 3664  346  347 3665 3666  347 3665 3666  347 3667 3668  348 3667
+ 3668  348 3669 3670 3669 3670  345  349 3671 3672  349 3671 3672  350  351 3673
+ 3674 3720  350 3673 3674  352  353 3675 3676  353 3675 3676  353  355 3677 3678
+  355 3677 3678  356 3679 3680  353  356 3679 3680  355  358 3681 3682  358 3681
+ 3682  350  357  358 3683 3684  350  357  359 3683 3684 3685  357  359 3684 3685
+ 3686  357 3685 3686  311  357 3687 3688  311 3687 3688  356  361 3689 3690  361
+ 3689 3690  311  360  361 3691 3692  311  360  362 3691 3692 3693  360  362 3692
+ 3693 3694  360 3693 3694  350  360 3695 3696  350 3695 3696  364  365 3697 3698
+  364 3697 3698  364 3699 3700  364  366 3699 3700 3717  363 3701 3702 3701 3702
+  350  367 3703 3704  367 3703 3704  365  367 3705 3706  365 3705 3706  369 3707
+ 3708  367  369 3707 3708  369  370 3709 3710  370 3709 3710  370 3711 3712  370
+  371 3711 3712 3713  370  371  372 3712 3713 3714  372 3713 3714  372 3715 3716
+ 3715 3716  363  364  366 3700 3717 3718  363 3717 3718  311  349  350 3719 3720
+  311  350  351 3673 3719 3720  373  374 3721 3722  374 3721 3722  374  376 3723
+ 3724  376 3723 3724  377 3725 3726  374  377 3725 3726  377  378 3727 3728  378
+ 3727 3728  378 3729 3730  379 3729 3730  379 3731 3732 3731 3732  376  380 3733
+ 3734  380 3733 3734  381  382 3735 3736 3782  381 3735 3736  383  384 3737 3738
+  384 3737 3738  384  386 3739 3740  386 3739 3740  387 3741 3742  384  387 3741
+ 3742  386  389 3743 3744  389 3743 3744  381  388  389 3745 3746  381  388  390
+ 3745 3746 3747  388  390 3746 3747 3748  388 3747 3748  342  388 3749 3750  342
+ 3749 3750  387  392 3751 3752  392 3751 3752  342  392 3753 3754  342  391  392
+  393 3753 3754 3755  391  393 3754 3755 3756  391 3755 3756  381  391 3757 3758
+  381 3757 3758  395  396 3759 3760  395 3759 3760  395 3761 3762  395  397 3761
+ 3762 3779  394 3763 3764 3763 3764  381  398 3765 3766  398 3765 3766  396  398
+ 3767 3768  396 3767 3768  400 3769 3770  398  400 3769 3770  400  401 3771 3772
+  401 3771 3772  401 3773 3774  401  402 3773 3774 3775  401  402  403 3774 3775
+ 3776  403 3775 3776  403 3777 3778 3777 3778  394  395  397 3762 3779 3780  394
+ 3779 3780  342  380 3781 3782  342  380  381  382 3735 3781 3782  404  405 3783
+ 3784  405 3783 3784  405  407 3785 3786  407 3785 3786  408 3787 3788  405  408
+ 3787 3788  408  409 3789 3790  409 3789 3790  409 3791 3792  410 3791 3792  410
+ 3793 3794 3793 3794  407  411 3795 3796  411 3795 3796  412  413 3797 3798 3844
+  412 3797 3798  414  415 3799 3800  415 3799 3800  415  417 3801 3802  417 3801
+ 3802  418 3803 3804  415  418 3803 3804  417  420 3805 3806  420 3805 3806  412
+  419  420 3807 3808  412  419  421 3807 3808 3809  419  421 3808 3809 3810  419
+ 3809 3810  373  419 3811 3812  373 3811 3812  418  423 3813 3814  423 3813 3814
+  373  423 3815 3816  373  412  422  423  424 3815 3816 3817  422  424 3816 3817
+ 3818  422 3817 3818  412  422 3819 3820  412 3819 3820  426  427 3821 3822  426
+ 3821 3822  426 3823 3824  426  428 3823 3824 3841  425 3825 3826 3825 3826  412
+  429 3827 3828  429 3827 3828  427  429 3829 3830  427 3829 3830  431 3831 3832
+  429  431 3831 3832  431  432 3833 3834  432 3833 3834  432 3835 3836  432  433
+ 3835 3836 3837  432  433  434 3836 3837 3838  434 3837 3838  434 3839 3840 3839
+ 3840  425  426  428 3824 3841 3842  425 3841 3842  373  411  412 3843 3844  373
+  412  413 3797 3843 3844  435  436 3845 3846  436 3845 3846  436  438 3847 3848
+  438 3847 3848  439 3849 3850  436  439 3849 3850  439  440 3851 3852  440 3851
+ 3852  440 3853 3854  441 3853 3854  441 3855 3856 3855 3856  438  442 3857 3858
+  442 3857 3858  443  444 3859 3860 3906  443 3859 3860  445  446 3861 3862  446
+ 3861 3862  446  448 3863 3864  448 3863 3864  449 3865 3866  446  449 3865 3866
+  448  451 3867 3868  451 3867 3868  443  451 3869 3870  404  443  450  451  452
+ 3869 3870 3871  450  452 3870 3871 3872  450 3871 3872  404  450 3873 3874  404
+ 3873 3874  449  454 3875 3876  454 3875 3876  404  453  454 3877 3878  404  453
+  455 3877 3878 3879  453  455 3878 3879 3880  453 3879 3880  443  453 3881 3882
+  443 3881 3882  457  458 3883 3884  457 3883 3884  457 3885 3886  457  459 3885
+ 3886 3903  456 3887 3888 3887 3888  443  460 3889 3890  460 3889 3890  458  460
+ 3891 3892  458 3891 3892  462 3893 3894  460  462 3893 3894  462  463 3895 3896
+  463 3895 3896  463 3897 3898  463  464  465 3897 3898  464  465 3899 3900  465
+ 3899 3900  465 3901 3902 3901 3902  456  457  459 3886 3903 3904  456 3903 3904
+  404  442  443 3905 3906  404  443  444 3859 3905 3906  466  467 3907 3908  467
+ 3907 3908  467  469 3909 3910  469 3909 3910  470 3911 3912  467  470 3911 3912
+  470  471 3913 3914  471 3913 3914  471 3915 3916  472 3915 3916  472 3917 3918
+ 3917 3918  469  473 3919 3920  473 3919 3920  474  475 3921 3922 3968  474 3921
+ 3922  476  477 3923 3924  477 3923 3924  477  479 3925 3926  479 3925 3926  480
+ 3927 3928  477  480 3927 3928  479  482 3929 3930  482 3929 3930  474  482 3931
+ 3932  474  481  482  483 3931 3932 3933  481  483 3932 3933 3934  481 3933 3934
+  435  481 3935 3936  435 3935 3936  480  485 3937 3938  485 3937 3938  435  485
+ 3939 3940  435  474  484  485  486 3939 3940 3941  484  486 3940 3941 3942  484
+ 3941 3942  474  484 3943 3944  474 3943 3944  488  489 3945 3946  488 3945 3946
+  488 3947 3948  487  488  490 3947 3948  487 3949 3950 3949 3950  474  491 3951
+ 3952  491 3951 3952  489  491 1974 3953 3954  489 3953 3954  493 3955 3956  491
+  493 3955 3956  493  494 3957 3958  494 3957 3958  494 3959 3960  494  495 3959
+ 3960 3961  494  495  496 3960 3961 3962  496 3961 3962  496 3963 3964 3963 3964
+  487  490 3965 3966  487 3965 3966  435  473  474 3967 3968  435  474  475 3921
+ 3967 3968  497  498 3969 3970  498 3969 3970  498  500 3971 3972  500 3971 3972
+  501 3973 3974  498  501 3973 3974  501  502 3975 3976  502 3975 3976  502 3977
+ 3978  503 3977 3978  503 3979 3980 3979 3980  500  504 3981 3982  504 3981 3982
+  505  506 3983 3984 4030  505 3983 3984  507  508 3985 3986  508 3985 3986  508
+  510 3987 3988  510 3987 3988  511 3989 3990  508  511 3989 3990  510  513 3991
+ 3992  513 3991 3992  505  512  513 3993 3994  505  512  514 3993 3994 3995  512
+  514 3994 3995 3996  512 3995 3996  466  512 3997 3998  466 3997 3998  511  516
+ 3999 4000  516 3999 4000  466  516 4001 4002  466  515  516  517 4001 4002 4003
+  515  517 4002 4003 4004  515 4003 4004  505  515 4005 4006  505 4005 4006  519
+  520 4007 4008  519 4007 4008  519 4009 4010  519  521 4009 4010 4027  518 4011
+ 4012 4011 4012  505  522 4013 4014  522 4013 4014  520  522 4015 4016  520 4015
+ 4016  524 4017 4018  522  524 4017 4018  524  525 4019 4020  525 4019 4020  525
+ 4021 4022  525  526 4021 4022 4023  525  526  527 4022 4023 4024  527 4023 4024
+  527 4025 4026 4025 4026  518  519  521 4010 4027 4028  518 4027 4028  466  504
+ 4029 4030  466  504  505  506 3983 4029 4030  528  529 4031 4032  529 4031 4032
+  529  531 4033 4034  531 4033 4034  532 4035 4036  529  532 4035 4036  532  533
+ 4037 4038  533 4037 4038  533 4039 4040  534 4039 4040  534 4041 4042 4041 4042
+  531  535 4043 4044  535 4043 4044  536  537 4045 4046 4092  536 4045 4046  538
+  539 4047 4048  539 4047 4048  539  541 4049 4050  541 4049 4050  542 4051 4052
+  539  542 4051 4052  541  544 4053 4054  544 4053 4054  536  544 4055 4056  536
+  543  544  545 4055 4056 4057  543  545 4056 4057 4058  543 4057 4058  497  543
+ 4059 4060  497 4059 4060  542  547 4061 4062  547 4061 4062  497  547 4063 4064
+  497  546  547  548 4063 4064 4065  546  548 4064 4065 4066  546 4065 4066  536
+  546 4067 4068  536 4067 4068  550  551 4069 4070  550 4069 4070  550 4071 4072
+  550  552 4071 4072 4089  549 4073 4074 4073 4074  536  553 4075 4076  553 4075
+ 4076  551  553 4077 4078  551 4077 4078  555 4079 4080  553  555 4079 4080  555
+  556 4081 4082  556 4081 4082  556 4083 4084  556  557 4083 4084 4085  556  557
+  558 4084 4085 4086  558 4085 4086  558 4087 4088 4087 4088  549  550  552 4072
+ 4089 4090  549 4089 4090  497  535 4091 4092  497  535  536  537 4045 4091 4092
+  559  560 4093 4094  560 4093 4094  560  562 4095 4096  562 4095 4096  563 4097
+ 4098  560  563 4097 4098  563  564 4099 4100  564 4099 4100  564 4101 4102  565
+ 4101 4102  565 4103 4104 4103 4104  562  566 4105 4106  566 4105 4106  567  568
+ 4107 4108 4154  567 4107 4108  569  570 4109 4110  570 4109 4110  570  572 4111
+ 4112  572 4111 4112  573 4113 4114  570  573 4113 4114  572  575 4115 4116  575
+ 4115 4116  567  574  575 4117 4118  567  574  576 4117 4118 4119  574  576 4118
+ 4119 4120  574 4119 4120  528  574 4121 4122  528 4121 4122  573  578 4123 4124
+  578 4123 4124  528  577  578 4125 4126  528  577  579 4125 4126 4127  577  579
+ 4126 4127 4128  577 4127 4128  567  577 4129 4130  567 4129 4130  581  582 4131
+ 4132  581 4131 4132  581 4133 4134  581  583 4133 4134 4151  580 4135 4136 4135
+ 4136  567  584 4137 4138  584 4137 4138  582  584 4139 4140  582 4139 4140  586
+ 4141 4142  584  586 4141 4142  586  587 4143 4144  587 4143 4144  587 4145 4146
+  587  588 4145 4146 4147  587  588  589 4146 4147 4148  589 4147 4148  589 4149
+ 4150 4149 4150  580  581  583 4134 4151 4152  580 4151 4152  528  566  567 4153
+ 4154  528  567  568 4107 4153 4154  590  591 4155 4156  591 4155 4156  591  593
+ 4157 4158  593 4157 4158  594 4159 4160  591  594 4159 4160  594  595 4161 4162
+  595 4161 4162  595 4163 4164  596 4163 4164  596 4165 4166 4165 4166  593  597
+ 4167 4168  597 4167 4168  598  599 4169 4170 4216  598 4169 4170  600  601 4171
+ 4172  601 4171 4172  601  603 4173 4174  603 4173 4174  604 4175 4176  601  604
+ 4175 4176  603  606 4177 4178  606 4177 4178  598  606 4179 4180  559  598  605
+  606  607 4179 4180 4181  605  607 4180 4181 4182  605 4181 4182  559  605 4183
+ 4184  559 4183 4184  604  609 4185 4186  609 4185 4186  559  609 4187 4188  559
+  608  609  610 4187 4188 4189  608  610 4188 4189 4190  608 4189 4190  598  608
+ 4191 4192  598 4191 4192  612  613 4193 4194  612 4193 4194  612  614 4195 4196
+  614 4195 4196  611 4197 4198 4197 4198  598  615 4199 4200  615 4199 4200  613
+  615 4201 4202  613 4201 4202  617 4203 4204  615  617 4203 4204  617  618 4205
+ 4206  618 4205 4206  618  619 4207 4208  619 4207 4208  618  619  620 4209 4210
+  620 4209 4210  620 4211 4212 4211 4212  611  612  614 4213 4214  611 4213 4214
+  559  597 4215 4216  559  597  598  599 4169 4215 4216  621  622 4217 4218  622
+ 4217 4218  622  624 4219 4220  624 4219 4220  625 4221 4222  622  625 4221 4222
+  625  626 4223 4224  626 4223 4224  626 4225 4226  627 4225 4226  627 4227 4228
+ 4227 4228  624  628 4229 4230  628 4229 4230  629  630 4231 4232 4278  629 4231
+ 4232  631  632 2251 4233 4234  632 4233 4234  632  634 4235 4236  634 4235 4236
+  635 4237 4238  632  635 4237 4238  634  637 4239 4240  637 4239 4240  629  636
+  637 4241 4242  629  636  638 4241 4242  636  638 4243 4244  636 4243 4244  590
+  636 4245 4246  590 4245 4246  635  640 4247 4248  640 4247 4248  590  640 4249
+ 4250  590  639  640  641 4249 4250  639  641 4251 4252  639 4251 4252  629  639
+ 4253 4254  629 4253 4254  643  644 4255 4256  643 4255 4256  643 4257 4258  642
+  643  645 2277 4257 4258  642 4259 4260 4259 4260  629  646 4261 4262  646 4261
+ 4262  644  646 4263 4264  644 4263 4264  648 4265 4266  646  648 4265 4266  648
+  649 4267 4268  649 4267 4268  649  650 4269 4270  650 4269 4270  649  650  651
+ 2289 4271 4272  651 4271 4272  651 4273 4274 4273 4274  642  645 4275 4276  642
+ 4275 4276  590  628 4277 4278  590  628  629  630 4231 4277 4278  652  653 4279
+ 4280  653 4279 4280  653  655 4281 4282  655 4281 4282  656 4283 4284  653  656
+ 4283 4284  656  657 4285 4286  657 4285 4286  657 4287 4288  658 4287 4288  658
+ 4289 4290 4289 4290  655  659 4291 4292  659 4291 4292  660  661 4293 4294 4340
+  660 4293 4294  662  663 2313 4295 4296  663 4295 4296  663  665 4297 4298  665
+ 4297 4298  666 4299 4300  663  666 4299 4300  665  668 4301 4302  668 4301 4302
+  660  667  668 4303 4304  660  667  669 4303 4304  667  669 4305 4306  667 4305
+ 4306  621  667 4307 4308  621 4307 4308  666  671 4309 4310  671 4309 4310  621
+  671 4311 4312  621  670  671  672 4311 4312  670  672 4313 4314  670 4313 4314
+  660  670 4315 4316  660 4315 4316  674  675 4317 4318  674 4317 4318  674  676
+ 4319 4320  676 4319 4320  673 4321 4322 4321 4322  660  677 4323 4324  677 4323
+ 4324  675  677 4325 4326  675 4325 4326  679 4327 4328  677  679 4327 4328  679
+  680 4329 4330  680 4329 4330  680  681 4331 4332  681 4331 4332  680  681  682
+ 4333 4334  682 4333 4334  682 4335 4336 4335 4336  673  674  676 4337 4338  673
+ 4337 4338  621  659  660 4339 4340  621  660  661 4293 4339 4340  683  684 4341
+ 4342  684 4341 4342  684  686 4343 4344  686 4343 4344  687 4345 4346  684  687
+ 4345 4346  687  688 2363 4347 4348  688 4347 4348  688 4349 4350  689 4349 4350
+  689 4351 4352 4351 4352  686  690 4353 4354  690 4353 4354  691  692 4355 4356
+ 4402  691 4355 4356  693  694 2375 4357 4358  694 4357 4358  694  696 4359 4360
+  696 4359 4360  697 4361 4362  694  697 4361 4362  696  699 4363 4364  699 4363
+ 4364  691  699 4365 4366  691  698  699  700 4365 4366  698  700 4367 4368  698
+ 4367 4368  652  698 4369 4370  652 4369 4370  697  702 4371 4372  702 4371 4372
+  652  702 4373 4374  652  701  702  703 4373 4374  701  703 4375 4376  701 4375
+ 4376  691  701 4377 4378  691 4377 4378  705  706 4379 4380  705 4379 4380  705
+  707 4381 4382  707 4381 4382  704 4383 4384 4383 4384  691  708 4385 4386  708
+ 4385 4386  706  708 4387 4388  706 4387 4388  710 4389 4390  708  710 4389 4390
+  710  711 4391 4392  711 4391 4392  711 4393 4394  711  712  713 2413 4393 4394
+  712  713 4395 4396  713 4395 4396  713 4397 4398 4397 4398  704  705  707 2401
+ 4399 4400  704 4399 4400  652  690  691 4401 4402  652  691  692 4355 4401 4402
+  714  715 4403 4404  715 4403 4404  715  717 4405 4406  717 4405 4406  718 4407
+ 4408  715  718 4407 4408  718  719 4409 4410  719 4409 4410  719 4411 4412  720
+ 4411 4412  720 4413 4414 4413 4414  717  721 4415 4416  721 4415 4416  722  723
+ 4417 4418 4464  722 4417 4418  724  725 2437 4419 4420  725 4419 4420  725  727
+ 4421 4422  727 4421 4422  728 4423 4424  725  728 4423 4424  727  730 4425 4426
+  730 4425 4426  722  729  730 4427 4428  722  729  731 4427 4428  729  731 4429
+ 4430  729 4429 4430  683  729 4431 4432  683 4431 4432  728  733 4433 4434  733
+ 4433 4434  683  733 4435 4436  683  732  733  734 4435 4436  732  734 4437 4438
+  732 4437 4438  722  732 4439 4440  722 4439 4440  736  737 4441 4442  736 4441
+ 4442  736  738 4443 4444  738 4443 4444  735 4445 4446 4445 4446  722  739 4447
+ 4448  739 4447 4448  737  739 4449 4450  737 4449 4450  741 4451 4452  739  741
+ 4451 4452  741  742 4453 4454  742 4453 4454  742  743 4455 4456  743 4455 4456
+  742  743  744 4457 4458  744 4457 4458  744 4459 4460 4459 4460  735  736  738
+ 4461 4462  735 4461 4462  683  721 4463 4464  683  721  722  723 4417 4463 4464
+  745  746 4465 4466  746 4465 4466  746  748 4467 4468  748 4467 4468  749 4469
+ 4470  746  749 4469 4470  749  750 4471 4472  750 4471 4472  750 4473 4474  751
+ 4473 4474  751 4475 4476 4475 4476  748  752 4477 4478  752 4477 4478  753  754
+ 4479 4480 4526  753 4479 4480  755  756 2499 4481 4482  756 4481 4482  756  758
+ 4483 4484  758 4483 4484  759 4485 4486  756  759 4485 4486  758  761 4487 4488
+  761 4487 4488  753  761 4489 4490  753  760  761  762 4489 4490  760  762 4491
+ 4492  760 4491 4492  714  760 4493 4494  714 4493 4494  759  764 4495 4496  764
+ 4495 4496  714  763  764 4497 4498  714  763  765 4497 4498 4499  763  765 4498
+ 4499 4500  763 4499 4500  753  763 4501 4502  753 4501 4502  767  768 2541 4503
+ 4504  767 4503 4504  767  769 4505 4506  769 4505 4506  766 4507 4508 4507 4508
+  753  770 4509 4510  770 4509 4510  768  770 4511 4512  768 4511 4512  772 4513
+ 4514  770  772 4513 4514  772  773 4515 4516  773 4515 4516  773  774 4517 4518
+  774 4517 4518  773  774  775 2537 4519 4520  775 4519 4520  775 4521 4522 4521
+ 4522  766  767  769 2525 4523 4524  766 4523 4524  714  752  753 4525 4526  714
+  753  754 4479 4525 4526  776  777 4527 4528  777 4527 4528  777  779 4529 4530
+  779 4529 4530  780 4531 4532  777  780 4531 4532  780  781 4533 4534  781 4533
+ 4534  781 4535 4536  782 4535 4536  782 4537 4538 4537 4538  779  783 4539 4540
+  783 4539 4540  784  785 4541 4542 4588  784 4541 4542  786  787 2561 4543 4544
+  787 4543 4544  787  789 4545 4546  789 4545 4546  790 4547 4548  787  790 4547
+ 4548  789  792 4549 4550  792 4549 4550  784  792 4551 4552  745  784  791  792
+  793 4551 4552  791  793 4553 4554  791 4553 4554  745  791 4555 4556  745 4555
+ 4556  790  795 4557 4558  795 4557 4558  745  795 4559 4560  745  784  794  795
+  796 4559 4560  794  796 4561 4562  794 4561 4562  784  794 4563 4564  784 4563
+ 4564  798  799 4565 4566  798 4565 4566  797  798 4567 4568  797  800 4567 4568
+  797 4569 4570 4569 4570  784  801 4571 4572  801 4571 4572  799  801 4573 4574
+  799 4573 4574  803 4575 4576  801  803 4575 4576  803  804 4577 4578  804 4577
+ 4578  804  805 4579 4580  805 4579 4580  805  806 4581 4582  806 4581 4582  806
+ 4583 4584 4583 4584  797  800 4585 4586  797 4585 4586  745  783 4587 4588  745
+  783  784  785 4541 4587 4588  807  808 4589 4590  808 4589 4590  808  810 4591
+ 4592  810 4591 4592  811 4593 4594  808  811 4593 4594  811  812 4595 4596  812
+ 4595 4596  812 4597 4598  813 4597 4598  813 4599 4600 4599 4600  810  814 4601
+ 4602  814 4601 4602  815  816 4603 4604 4650  815 4603 4604  817  818 2623 4605
+ 4606  818 4605 4606  818  820 4607 4608  820 4607 4608  821 4609 4610  818  821
+ 4609 4610  820  823 4611 4612  823 4611 4612  815  823 4613 4614  776  815  822
+  823  824 4613 4614 4615  822  824 4614 4615 4616  822 4615 4616  776  822 4617
+ 4618  776 4617 4618  821  826 4619 4620  826 4619 4620  776  826 4621 4622  776
+  815  825  826  827 4621 4622 4623  825  827 4622 4623 4624  825 4623 4624  815
+  825 4625 4626  815 4625 4626  829  830 4627 4628  829 4627 4628  829  831 4629
+ 4630  831 4629 4630  828 4631 4632 4631 4632  815  832 4633 4634  832 4633 4634
+  830  832 4635 4636  830 4635 4636  834 4637 4638  832  834 4637 4638  834  835
+ 4639 4640  835 4639 4640  835  836 4641 4642  836 4641 4642  836  837 4643 4644
+  837 4643 4644  837 4645 4646 4645 4646  828  831 4647 4648  828 4647 4648  776
+  814 4649 4650  776  814  815  816 4603 4649 4650  838  839 4651 4652  839 4651
+ 4652  839  841 4653 4654  841 4653 4654  842 4655 4656  839  842 4655 4656  842
+  843 4657 4658  843 4657 4658  843 4659 4660  844 4659 4660  844 4661 4662 4661
+ 4662  841  845 4663 4664  845 4663 4664  846  847 4665 4666  846 4665 4666  848
+  849 4667 4668  849 4667 4668  849  851 4669 4670  851 4669 4670  852 4671 4672
+  849  852 4671 4672  851  854 4673 4674  854 4673 4674  846  854 4675 4676  807
+  846  853  854  855 4675 4676 4677  853  855 4676 4677 4678  853 4677 4678  807
+  853 4679 4680  807 4679 4680  852  857 4681 4682  857 4681 4682  807  857 4683
+ 4684  807  846  856  857  858 4683 4684  856  858 4685 4686  856 4685 4686  846
+  856 4687 4688  846 4687 4688  860  861 4689 4690  860 4689 4690  860  862 4691
+ 4692  862 4691 4692  859 4693 4694 4693 4694  846  863 4695 4696  863 4695 4696
+  861  863 4697 4698  861 4697 4698  865 4699 4700  863  865 4699 4700  865  866
+ 4701 4702  866 4701 4702  866  868 4703 4704  867  868 4703 4704  867  868 4705
+ 4706  868 4705 4706  868 4707 4708 4707 4708  859  862 4709 4710  859 4709 4710
+  807  845 4711 4712  807  845  846  847 4711 4712  869  870 4713 4714  870 4713
+ 4714  870  872 4715 4716  872 4715 4716  873 4717 4718  870  873 4717 4718  873
+  874 4719 4720  874 4719 4720  874 4721 4722  875 4721 4722  875 4723 4724 4723
+ 4724  872  876 4725 4726  876 4725 4726  877  878 4727 4728 4774  877 4727 4728
+  879  880 4729 4730  880 4729 4730  880  882 4731 4732  882 4731 4732  883 4733
+ 4734  880  883 4733 4734  882  885 4735 4736  885 4735 4736  877  885 4737 4738
+  838  877  884  885  886 4737 4738 4739  884  886 4738 4739 4740  884 4739 4740
+  838  884 4741 4742  838 4741 4742  883  888 4743 4744  888 4743 4744  838  888
+ 4745 4746  838  887  888  889 4745 4746 4747  887  889 4746 4747 4748  887 4747
+ 4748  877  887 4749 4750  877 4749 4750  891  892 4751 4752  891 4751 4752  891
+  893 4753 4754  893 4753 4754  890 4755 4756 4755 4756  877  894 4757 4758  894
+ 4757 4758  892  894 4759 4760  892 4759 4760  896 4761 4762  894  896 4761 4762
+  896  897 4763 4764  897 4763 4764  897  898 4765 4766  898 4765 4766  898  899
+ 4767 4768  899 4767 4768  899 4769 4770 4769 4770  890  893 4771 4772  890 4771
+ 4772  838  876 4773 4774  838  876  877  878 4727 4773 4774  900  901 4775 4776
+  901 4775 4776  901  903 4777 4778  903 4777 4778  904 4779 4780  901  904 4779
+ 4780  904  905 4781 4782  905 4781 4782  905 4783 4784  906 4783 4784  906 4785
+ 4786 4785 4786  903  907 4787 4788  907 4787 4788  908  909 4789 4790  908 4789
+ 4790  910  911 4791 4792  911 4791 4792  911  913 4793 4794  913 4793 4794  914
+ 4795 4796  911  914 4795 4796  913  916 4797 4798  916 4797 4798  908  916 4799
+ 4800  869  908  915  916  917 4799 4800 4801  915  917 4800 4801 4802  915 4801
+ 4802  869  915 4803 4804  869 4803 4804  914  919 4805 4806  919 4805 4806  869
+  919 4807 4808  869  908  918  919  920 4807 4808  918  920 4809 4810  918 4809
+ 4810  908  918 4811 4812  908 4811 4812  922  923 4813 4814  922 4813 4814  922
+  924 4815 4816  924 4815 4816  921 4817 4818 4817 4818  908  925 4819 4820  925
+ 4819 4820  923  925 4821 4822  923 4821 4822  927 4823 4824  925  927 4823 4824
+  927  928 4825 4826  928 4825 4826  928  930 4827 4828  929  930 4827 4828  929
+  930 4829 4830  930 4829 4830  930 4831 4832 4831 4832  921  924 4833 4834  921
+ 4833 4834  869  907  908 4835 4836  869  908  909 4835 4836  931  932 4837 4838
+  932 4837 4838  932  934 4839 4840  934 4839 4840  935 4841 4842  932  935 4841
+ 4842  935  936 4843 4844  936 4843 4844  936 4845 4846  937 4845 4846  937 4847
+ 4848 4847 4848  934  938 4849 4850  938 4849 4850  939  940 4851 4852  939 4851
+ 4852  941  942 4853 4854  942 4853 4854  942  944 4855 4856  944 4855 4856  945
+ 4857 4858  942  945 4857 4858  944  947 4859 4860  947 4859 4860  939  947 4861
+ 4862  900  939  946  947  948 4861 4862 4863  946  948 4862 4863 4864  946 4863
+ 4864  900  946 4865 4866  900 4865 4866  945  950 4867 4868  950 4867 4868  900
+  950 4869 4870  900  939  949  950  951 4869 4870  949  951 4871 4872  949 4871
+ 4872  939  949 4873 4874  939 4873 4874  953  954 4875 4876  953 4875 4876  953
+  955 4877 4878  955 4877 4878  952 4879 4880 4879 4880  939  956 4881 4882  956
+ 4881 4882  954  956 4883 4884  954 4883 4884  958 4885 4886  956  958 4885 4886
+  958  959 4887 4888  959 4887 4888  959  961 4889 4890  960  961 4889 4890  960
+  961 4891 4892  961 4891 4892  961 4893 4894 4893 4894  952  955 4895 4896  952
+ 4895 4896  900  938  939 4897 4898  900  939  940 4897 4898  962  963 2915 4899
+ 4900  963 4899 4900  963  965 4901 4902  965 4901 4902  966 4903 4904  963  966
+ 4903 4904  966  967 4905 4906  967 4905 4906  967 4907 4908  968 4907 4908  968
+ 4909 4910 4909 4910  965  969 4911 4912  969 4911 4912  970  971 4913 4914 4960
+  970 4913 4914  972  973 2933 4915 4916  973 4915 4916  973  975 4917 4918  975
+ 4917 4918  976 4919 4920  973  976 4919 4920  975  978 4921 4922  978 4921 4922
+  970  977  978 4923 4924  970  977  979 4923 4924  977  979 4925 4926  977 4925
+ 4926  931  977 4927 4928  931 4927 4928  976  981 4929 4930  981 4929 4930  931
+  980  981 4931 4932  931  980  982 4931 4932  980  982 4933 4934  980 4933 4934
+  970  980 4935 4936  970 4935 4936  984  985 4937 4938  984 4937 4938  983  984
+ 4939 4940  983  986 4939 4940  983 4941 4942 4941 4942  970  987 4943 4944  987
+ 4943 4944  985  987 4945 4946  985 4945 4946  989 4947 4948  987  989 4947 4948
+  989  990 4949 4950  990 4949 4950  990  991 4951 4952  991 4951 4952  991  992
+ 4953 4954  992 4953 4954  992 4955 4956 4955 4956  983  986 4957 4958  983 4957
+ 4958  931  969  970 4959 4960  931  970  971 4913 4959 4960
+  0.320886418015886993E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.435047786080177985E-03 -0.435050661596638019E-03 -0.435015088300818990E-03
+ -0.435084759117159022E-03 -0.435058330687056996E-03 -0.435040431898216988E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00 -0.138286756523262001E-03
+ -0.140229438392076996E-03 -0.138900356244381991E-03 -0.139615838670957006E-03
+ -0.138288251646271991E-03 -0.140227943269067006E-03  0.000000000000000000E+00
+  0.419834067892811968E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.161423305217256013E-03 -0.161424187158655996E-03 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.435071410144034006E-03 -0.435067418468867008E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.000000000000000000E+00  0.255994304135462002E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.434980740464878987E-03 -0.435120576273734998E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.138286719937541003E-03
+ -0.140229474977798997E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.253690283189994988E-01 -0.434980401777350985E-03 -0.435120927891995981E-03
+ -0.666666666666666970E-02 -0.140229499401465997E-03 -0.138286695513873000E-03
+ -0.166666666666667011E-01  0.319795589003930014E-01 -0.161423181596791996E-03
+ -0.161424310779120013E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435063425289485017E-03 -0.435075493914430017E-03 -0.166666666666667011E-01
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.320997920666244024E-01  0.000000000000000000E+00
+ -0.161423317051876997E-03 -0.161424175324035012E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435064354014662996E-03 -0.435074543616996011E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.292675967555632993E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423746720073988E-03
+ -0.161423745655837994E-03 -0.161424013317901000E-03 -0.161423479058011009E-03
+ -0.161423479016896003E-03 -0.161424013359016006E-03 -0.666666666666666970E-02
+ -0.434770925530010013E-03 -0.435374516178396991E-03 -0.434923678631289005E-03
+ -0.435218350961854000E-03 -0.434767192010144024E-03 -0.435378332843076009E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.238543465439772992E-01
+ -0.666666666666666970E-02 -0.166666666666667011E-01  0.420929664672890988E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.435014954239530986E-03
+ -0.435084898922045006E-03 -0.666666666666666970E-02 -0.138286666946142001E-03
+ -0.140229527969196996E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.255503077544010984E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.161423360613601998E-03 -0.161424131762310011E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435068344066269984E-03
+ -0.435070463673477021E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.252823531939237005E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.161423118685809013E-03 -0.161424373690102996E-03 -0.666666666666666970E-02
+ -0.139258124485676013E-03 -0.139258070429663011E-03 -0.166666666666667011E-01
+  0.255489774258116989E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.161423314542717992E-03 -0.161424177833194993E-03 -0.434708406678657993E-03
+ -0.435438495815256975E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.322022752306447974E-01
+ -0.435029928828667000E-03 -0.435069283946130990E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.138272576530863001E-03 -0.140243618384475996E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.258515188587684006E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435046874781375994E-03
+ -0.435051611957959993E-03 -0.138907408932722987E-03 -0.139608785982616010E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.320997478880679990E-01 -0.161423315884339007E-03 -0.161424176491573002E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435068970788089987E-03
+ -0.435069822829565989E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.255013626659516997E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161778512187199996E-03 -0.161068980188712013E-03
+ -0.166666666666667011E-01 -0.430721544738902977E-03 -0.439524019069752994E-03
+  0.000000000000000000E+00  0.319967450902398018E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.161423540386620003E-03
+ -0.161423951989293009E-03 -0.436779536615916999E-03 -0.433396525451669977E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.257464062516243007E-01  0.000000000000000000E+00
+ -0.357123043190452993E-03 -0.513335165827105049E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.166666666666667011E-01 -0.138849239149727987E-03
+ -0.139666955765612013E-03  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.421600788316065025E-01  0.000000000000000000E+00 -0.438009740604286991E-03
+ -0.432217430813423007E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.110342284848130999E-03 -0.562588969540806947E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.250247725514408000E-01
+  0.000000000000000000E+00  0.312521876302675970E-04 -0.718933610019259007E-03
+ -0.666666666666666970E-02 -0.143947962448449994E-03 -0.134568232466890006E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.321784029946046990E-01
+  0.000000000000000000E+00 -0.131393250908101988E-03 -0.556288171480889002E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.344954211373242007E-03
+ -0.526364720145119946E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.333333333333332982E-01  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.252170876990303987E-01 -0.161423598610935996E-03 -0.161423893764976013E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435940683599852022E-03
+ -0.434217221119004976E-03 -0.166666666666667011E-01  0.320886759388818998E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435055736276135005E-03
+ -0.435042919782540022E-03 -0.435014872722787977E-03 -0.435084983920160974E-03
+ -0.435084944266096979E-03 -0.435014910749576022E-03 -0.166666666666667011E-01
+ -0.138286772670705003E-03 -0.140229422244634997E-03 -0.138288005601589994E-03
+ -0.140228189313749003E-03 -0.138900291039652002E-03 -0.139615903875686995E-03
+  0.000000000000000000E+00  0.419834016282712025E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423305078950007E-03 -0.161424187296962002E-03
+ -0.666666666666666970E-02 -0.435072265974590982E-03 -0.435066581504923002E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.255995473964411996E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.434977413290588025E-03 -0.435124045681951985E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.138286961247241009E-03 -0.140229233668097989E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.253627459848576008E-01
+  0.000000000000000000E+00 -0.434982364929778985E-03 -0.435118882445508014E-03
+ -0.666666666666666970E-02 -0.140242815655179008E-03 -0.138273379260159989E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.319681677869403033E-01
+ -0.161422464571142990E-03 -0.161425027804768992E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.434576582527966993E-03 -0.435573489949336987E-03
+ -0.166666666666667011E-01  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.320997527624522980E-01  0.000000000000000000E+00 -0.161423316033646006E-03
+ -0.161424176342266003E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435072976730357019E-03 -0.435065886383583010E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.292687587508236001E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423746234547004E-03
+ -0.161423746141366008E-03 -0.161424006944943987E-03 -0.161423485430967995E-03
+ -0.161423485376057003E-03 -0.161424006999856008E-03 -0.666666666666666970E-02
+ -0.435021380261554999E-03 -0.435118476044887013E-03 -0.434889017803224000E-03
+ -0.435253786460061004E-03 -0.434968358559784986E-03 -0.435172681171430026E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.238543465439772992E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.420929989771356011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.435014809641986002E-03 -0.435085049702760990E-03 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.138286733578440991E-03 -0.140229461336898007E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.255502685417309007E-01  0.000000000000000000E+00 -0.161423359690042998E-03
+ -0.161424132685869011E-03  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.435070746654247023E-03 -0.435068067322774026E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.252823730509023001E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423118749988009E-03 -0.161424373625924000E-03
+ -0.666666666666666970E-02 -0.139258062928016002E-03 -0.139258131987322995E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.255489363703280993E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.161423313440402000E-03
+ -0.161424178935510009E-03  0.000000000000000000E+00 -0.434870514603341021E-03
+ -0.435272746258683974E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.322040616077563027E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.435018118913463021E-03 -0.435081599403137019E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.138276442783731991E-03 -0.140239752131608009E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.258538650317957017E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435040431700919026E-03
+ -0.435058330932033995E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.138909454179009012E-03 -0.139606740736330012E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.320997274074339978E-01 -0.161423315341556987E-03
+ -0.161424177034354995E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435070727065607990E-03 -0.435068086477601978E-03 -0.166666666666667011E-01
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.256925653634595998E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.430785237193071976E-03 -0.439488203700820024E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.138857649137926994E-03
+ -0.139658545777412004E-03  0.000000000000000000E+00  0.319726623957349021E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.161422835104515990E-03 -0.161424657271395992E-03 -0.427144778551284004E-03
+ -0.443167299433834994E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.256448637616590001E-01  0.000000000000000000E+00
+ -0.324936573418277007E-03 -0.544311326562666028E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.166666666666667011E-01 -0.138671855069575001E-03
+ -0.139844339845764999E-03  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.421522789439464016E-01  0.000000000000000000E+00
+ -0.438342696270233999E-03 -0.431915478441594015E-03 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.403316551935295006E-04 -0.713262909582468036E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.250271632275507992E-01  0.000000000000000000E+00  0.157694230048913999E-04
+ -0.703450845393883040E-03 -0.666666666666666970E-02 -0.143088258477573990E-03
+ -0.135427936437765008E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.321536300563562971E-01  0.000000000000000000E+00  0.469561289675323007E-04
+ -0.734637551356524029E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.337097960055970989E-03 -0.529554555471797037E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.251453403201698997E-01 -0.161349265278626000E-03 -0.161498227097286009E-03
+ -0.666666666666666970E-02 -0.239135715409318991E-03 -0.630652995284446973E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.312158150516623004E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161424178244695010E-03 -0.161423314131216999E-03
+ -0.161423314059395002E-03 -0.161424178316517007E-03 -0.161423748120379993E-03
+ -0.161423744255532992E-03 -0.166666666666667011E-01 -0.435052937193477996E-03
+ -0.435086217679552987E-03 -0.138285143313543007E-03 -0.140231051601796993E-03
+ -0.435084982345309993E-03 -0.435054144804906983E-03  0.000000000000000000E+00
+  0.420929687910095995E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.435023352535621991E-03 -0.435076141203039014E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.138286660715920991E-03
+ -0.140229534199418006E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.256871015736835004E-01  0.000000000000000000E+00
+ -0.435049188847810018E-03 -0.435049198755422021E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.139258077328959006E-03 -0.139258117586380995E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.252598187185257006E-01
+  0.000000000000000000E+00 -0.161423313380501998E-03 -0.161424178995410011E-03
+ -0.666666666666666970E-02 -0.435070190010078980E-03 -0.435068611700095025E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.320929801235649015E-01
+ -0.435014945836726004E-03 -0.435084907682568021E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.138286694923928999E-03 -0.140229499991409998E-03
+ -0.166666666666667011E-01  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320997361127142999E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423315570077993E-03
+ -0.161424176805833989E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435069372832834977E-03 -0.435069411726189985E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.296373133570986998E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.435042919779554996E-03 -0.435055736280175014E-03 -0.435084883176088981E-03
+ -0.435014969337498025E-03 -0.435014925977329975E-03 -0.435084928391770982E-03
+ -0.666666666666666970E-02 -0.138910376259480993E-03 -0.139605818655859007E-03
+ -0.138287928869166004E-03 -0.140228266046172993E-03  0.000000000000000000E+00
+ -0.138286695535614994E-03 -0.140229499379725006E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.238543465439772992E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.419834073423381998E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.161423305232433000E-03 -0.161424187143480012E-03 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.435069384652433987E-03 -0.435069399640343013E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.256582249639075999E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.434980660603881994E-03 -0.435120659531148981E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.138286858193205001E-03 -0.140229336722134999E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.256868354321434997E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.434980641366321990E-03 -0.435120678040121982E-03 -0.666666666666666970E-02
+ -0.140229336893623989E-03 -0.138286858021716011E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.255642318718434999E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.161423743718790006E-03 -0.161423748657122003E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.435032845221742983E-03 -0.435106762281529014E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320996545778364992E-01
+  0.000000000000000000E+00 -0.161423313381549988E-03 -0.161424178994361994E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069384536214995E-03
+ -0.435069399759127010E-03 -0.166666666666667011E-01  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.256786742296024988E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.161423743714627998E-03 -0.161423748661284011E-03  0.000000000000000000E+00
+ -0.435032854625325984E-03 -0.435106752665999992E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.320996544124479005E-01 -0.161423313377080012E-03 -0.161424178998831997E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069378322735991E-03
+ -0.435069406112619992E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.256614590292301992E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.435016112049723975E-03 -0.435083691563065015E-03 -0.166666666666667011E-01
+ -0.138320998722293008E-03 -0.140195196193045990E-03  0.000000000000000000E+00
+  0.319837098823798002E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.161423313471231993E-03 -0.161424178904679989E-03
+ -0.435069384789494977E-03 -0.435069399500138975E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.256582249673321001E-01  0.000000000000000000E+00 -0.434980660337917985E-03
+ -0.435120659808483994E-03  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.166666666666667011E-01 -0.138286858200591995E-03 -0.140229336714748005E-03
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.419834073415166001E-01
+  0.000000000000000000E+00 -0.161423305232409988E-03 -0.161424187143501994E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435069382174439019E-03
+ -0.435069402174196023E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.253691078098052004E-01  0.000000000000000000E+00
+ -0.434980641344910027E-03 -0.435120678062418979E-03 -0.666666666666666970E-02
+ -0.140229336715104002E-03 -0.138286858200234995E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.319837098823800986E-01  0.000000000000000000E+00
+ -0.161423313471231993E-03 -0.161424178904679989E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435069384776443026E-03 -0.435069399513485016E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.253136149492917996E-01 -0.435016112049779974E-03
+ -0.435083691563007010E-03 -0.666666666666666970E-02 -0.138320998722294011E-03
+ -0.140195196193044987E-03 -0.166666666666667011E-01  0.312157552277101992E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.161424178824619004E-03 -0.161423313551293005E-03 -0.161423313542176003E-03
+ -0.161424178833736006E-03 -0.161423748436264003E-03 -0.161423743939648006E-03
+ -0.166666666666667011E-01 -0.435065044970892999E-03 -0.435073837108244986E-03
+ -0.138284790665693988E-03 -0.140231404249645009E-03 -0.435089215417341023E-03
+ -0.435050005423793986E-03  0.000000000000000000E+00  0.420929694350214018E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.435016885622778980E-03 -0.435082884886724001E-03 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.138286670421676992E-03 -0.140229524493662005E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.254915570467122987E-01  0.000000000000000000E+00 -0.161423360047314990E-03
+ -0.161424132328596992E-03  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.435068591366232014E-03 -0.435070210801716977E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.249645634044841011E-01  0.000000000000000000E+00
+ -0.161423119061156986E-03 -0.161424373314754995E-03 -0.666666666666666970E-02
+ -0.139258118141762012E-03 -0.139258076773577012E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.319795589138204966E-01 -0.161423181597302004E-03
+ -0.161424310778610005E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435063329379546009E-03 -0.435075591995543994E-03 -0.166666666666667011E-01
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.322057822807787983E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.435022004975030990E-03
+ -0.435077546786745027E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.138280140954257988E-03 -0.140236053961082012E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.286987467725142997E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423485461702009E-03 -0.161424006914210000E-03
+ -0.161423743701962999E-03 -0.161423748673950013E-03 -0.161423485478031991E-03
+ -0.161424006897879991E-03 -0.666666666666666970E-02 -0.434818199339674985E-03
+ -0.435326186862402978E-03 -0.138107939681827004E-03 -0.140408255233511993E-03
+  0.000000000000000000E+00 -0.434750487935355013E-03 -0.435395405802110002E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.238543465439772992E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.419834078568206009E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423305246576987E-03 -0.161424187129334995E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435069368742756007E-03
+ -0.435069415908644010E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.256580957796501002E-01  0.000000000000000000E+00
+ -0.434980863637934997E-03 -0.435120447850457018E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.138286593339164988E-03 -0.140229601576174009E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.256868273049125004E-01
+  0.000000000000000000E+00 -0.434980612246946025E-03 -0.435120708410804025E-03
+ -0.666666666666666970E-02 -0.140229353764145001E-03 -0.138286841151193996E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.256611955308473985E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.435106719725676021E-03 -0.434994029732162983E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.138320118013872997E-03 -0.140196076901467003E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.320999233193210026E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.161423320520808004E-03 -0.161424171855104005E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435059778147493012E-03 -0.435079222540740013E-03
+ -0.166666666666667011E-01  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.257716863609747000E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435022363974353014E-03
+ -0.435077172536910996E-03  0.000000000000000000E+00 -0.138312498857105987E-03
+ -0.140203696058234013E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320996765524547006E-01
+  0.000000000000000000E+00 -0.161423313971483987E-03 -0.161424178404427995E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435068773858159996E-03
+ -0.435070024197671990E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.255013626644550011E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.161778513994175003E-03 -0.161068978381738009E-03 -0.166666666666667011E-01
+ -0.430721544928323017E-03 -0.439524018875910982E-03  0.000000000000000000E+00
+  0.319967451293014030E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.161423540387061002E-03 -0.161423951988851007E-03
+ -0.436779134539225992E-03 -0.433396918749564988E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.257463974004799000E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.357123790381716987E-03 -0.513334443915504997E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.166666666666667011E-01 -0.138849213053176987E-03
+ -0.139666981862162010E-03  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.421600707339388966E-01  0.000000000000000000E+00 -0.438162222473089020E-03
+ -0.432071491834290977E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.110310967384949997E-03 -0.562620287003987962E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.250247683829405015E-01  0.000000000000000000E+00  0.312489112000452975E-04
+ -0.718930333589037039E-03 -0.666666666666666970E-02 -0.143948366628467004E-03
+ -0.134567828286871993E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.321784029807069966E-01  0.000000000000000000E+00 -0.131393333165154990E-03
+ -0.556288089223836053E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.344952960576817022E-03 -0.526365968885628946E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.333333333333332982E-01  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.252170877020880015E-01 -0.161423598610962993E-03
+ -0.161423893764948989E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.435940683620558982E-03 -0.434217221098728986E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.312157364394618012E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.161423748574843987E-03 -0.161423743801067995E-03 -0.161423313378308007E-03
+ -0.161424178997604002E-03 -0.161424178997597009E-03 -0.161423313378315000E-03
+ -0.166666666666667011E-01 -0.435069384129798016E-03 -0.435069400174701974E-03
+ -0.435069377183192995E-03 -0.435069407277841015E-03 -0.138284694479560007E-03
+ -0.140231500435778991E-03  0.000000000000000000E+00  0.420929781177558013E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.435014927902171013E-03
+ -0.435084926384880994E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.138286690846829989E-03 -0.140229504068509008E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.254915410590286010E-01
+  0.000000000000000000E+00 -0.161423359665572013E-03 -0.161424132710339996E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435069213980331018E-03
+ -0.435069574158249974E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.249645180764926991E-01  0.000000000000000000E+00 -0.161423119329893002E-03
+ -0.161424373046020010E-03 -0.666666666666666970E-02 -0.139258102070886994E-03
+ -0.139258092844452003E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.319784847376137008E-01 -0.161423139773135993E-03 -0.161424352602775989E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435059186696719997E-03
+ -0.435079828747282984E-03 -0.166666666666667011E-01  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.322084822393016029E-01
+  0.000000000000000000E+00 -0.435016615284991993E-03 -0.435083166852704025E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.138285779894368007E-03
+ -0.140230415020971993E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.286738419021206996E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423325593204012E-03
+ -0.161424166782707997E-03 -0.161423747542489989E-03 -0.161423744833421993E-03
+ -0.161423325604668989E-03 -0.161424166771242993E-03 -0.666666666666666970E-02
+ -0.435017582368835013E-03 -0.435122368257235984E-03 -0.138274982048601996E-03
+ -0.140241212866737001E-03 -0.435000864585250997E-03 -0.435139462305625011E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.238543465439772992E-01 -0.666666666666666970E-02 -0.166666666666667011E-01
+  0.419834075447891031E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.161423305237978992E-03 -0.161424187137933992E-03 -0.666666666666666970E-02
+ -0.435069372007444006E-03 -0.435069412570366005E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.256581825663056012E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.434980761705325996E-03
+ -0.435120554118331997E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.138286771391029988E-03 -0.140229423524309009E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.256868309172346015E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.434980606570965994E-03 -0.435120714326099026E-03 -0.666666666666666970E-02
+ -0.140229346218551998E-03 -0.138286848696788002E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.256614027694726000E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.435029702345694975E-03 -0.435069519599524978E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.138320910572908000E-03 -0.140195284342430998E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320997009695584007E-01
+  0.000000000000000000E+00 -0.161423314617753998E-03 -0.161424177758158011E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435066266755664011E-03
+ -0.435072587790152974E-03 -0.166666666666667011E-01  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.257749746704468014E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.435018010012214000E-03 -0.435081712497056015E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.138319195561076991E-03
+ -0.140196999354262006E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320996637608070984E-01
+ -0.161423313627734002E-03 -0.161424178748178007E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435069045666087995E-03 -0.435069746265178998E-03
+ -0.166666666666667011E-01  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.255649837723719991E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.161423599069663996E-03
+ -0.161423893306248013E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.435941020866372994E-03 -0.434216890957876992E-03  0.000000000000000000E+00
+  0.321778215180335975E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.133203886290680012E-03 -0.554477536098311953E-03
+ -0.310287644885220981E-03 -0.560775130597174036E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.258308431809457992E-01
+  0.000000000000000000E+00 -0.200815543944558010E-03 -0.626132470433443992E-03
+ -0.666666666666666970E-02 -0.166666666666667011E-01 -0.446184483319574977E-03
+ -0.411111793506487000E-03  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.421914017926215967E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.435150874851761001E-03 -0.434951664732459008E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.138419555564682994E-03 -0.140096639350656004E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.251983574296692006E-01  0.000000000000000000E+00 -0.874932041930874000E-05
+ -0.678932101969682953E-03 -0.666666666666666970E-02 -0.139012530212417987E-03
+ -0.139503664702921010E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.319960855039493966E-01  0.000000000000000000E+00 -0.161423534312431000E-03
+ -0.161423958063482012E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.422822405468364991E-03 -0.447571581638852981E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.251635090367436989E-01 -0.161499426491679002E-03 -0.161348065884233007E-03
+ -0.666666666666666970E-02 -0.431483288576659993E-03 -0.438741647262608995E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.312157284528267985E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.161424179076904013E-03
+ -0.161423313299007996E-03 -0.161423313309436991E-03 -0.161424179066474991E-03
+ -0.161423748561558987E-03 -0.161423743814352995E-03 -0.166666666666667011E-01
+ -0.435069198160920999E-03 -0.435069590334165020E-03 -0.138284743432991999E-03
+ -0.140231451482346998E-03 -0.435089075923097014E-03 -0.435050142023235000E-03
+  0.000000000000000000E+00  0.420929803448919018E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.435014018174735974E-03 -0.435085875041650019E-03
+ -0.666666666666666970E-02 -0.138286696532327987E-03 -0.140229498383011010E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.256871018283196008E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.435049188665910004E-03
+ -0.435049198945112027E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.139258096875932990E-03 -0.139258098039406008E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.252598186653136998E-01  0.000000000000000000E+00 -0.161423313379014988E-03
+ -0.161424178996896994E-03 -0.666666666666666970E-02 -0.435069414335974010E-03
+ -0.435069370280600999E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.320929801358365019E-01 -0.435014929695106008E-03 -0.435084924514935985E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.138286694969620992E-03
+ -0.140229499945718005E-03 -0.166666666666667011E-01  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.320996579150382988E-01  0.000000000000000000E+00
+ -0.161423313471394000E-03 -0.161424178904518009E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435069382051928022E-03 -0.435069402299393022E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.295485029250618994E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.435016114097582006E-03 -0.435083689427570000E-03
+ -0.435055716891851981E-03 -0.435042938371768973E-03 -0.435014924444715012E-03
+ -0.435084929989965974E-03 -0.666666666666666970E-02 -0.138321048329605000E-03
+ -0.140195146585733998E-03 -0.138900264540687006E-03 -0.139615930374651991E-03
+ -0.138286695606836994E-03 -0.140229499308502004E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.238543465439772992E-01
+ -0.666666666666666970E-02 -0.166666666666667011E-01  0.420929804454360026E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.435014924421165002E-03
+ -0.435084930014524021E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.138286695611057007E-03 -0.140229499304281991E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.000000000000000000E+00  0.257458287849329989E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.435049188237201015E-03 -0.435049199392181004E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.139258097270754996E-03
+ -0.139258097644584002E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.255775463582341989E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423313378621992E-03 -0.161424178997289990E-03
+ -0.666666666666666970E-02 -0.435069399485626008E-03 -0.435069384803689026E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.257404831787951989E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.435042938371768973E-03 -0.435055716891853011E-03
+  0.000000000000000000E+00 -0.138910376033074995E-03 -0.139605818882264002E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320997361628787004E-01
+  0.000000000000000000E+00 -0.161423315571431998E-03 -0.161424176804480011E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069384827771000E-03
+ -0.435069399460985995E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.256633780151170991E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.161423313378029990E-03
+ -0.161424178997881992E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.435069376688627994E-03 -0.435069407783549012E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.322089281648140976E-01 -0.435014924425583993E-03 -0.435084930009956982E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.138286694981505013E-03
+ -0.140229499933834011E-03 -0.166666666666667011E-01  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.256614590292298002E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435016112049637998E-03
+ -0.435083691563155003E-03 -0.166666666666667011E-01 -0.138320998722293008E-03
+ -0.140195196193045990E-03  0.000000000000000000E+00  0.319837098823795019E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.161423313471231993E-03 -0.161424178904679989E-03 -0.435069384800439022E-03
+ -0.435069399488948003E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.256582249675047017E-01
+  0.000000000000000000E+00 -0.434980660275586010E-03 -0.435120659873482023E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.166666666666667011E-01
+ -0.138286858201177004E-03 -0.140229336714161993E-03  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.419834073409681013E-01  0.000000000000000000E+00
+ -0.161423305232394999E-03 -0.161424187143517010E-03 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.435069384650815002E-03 -0.435069399641999023E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.253691078101502994E-01  0.000000000000000000E+00 -0.434980641228884999E-03
+ -0.435120678183400979E-03 -0.666666666666666970E-02 -0.140229336714165002E-03
+ -0.138286858201173995E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.319837098823795019E-01  0.000000000000000000E+00 -0.161423313471231993E-03
+ -0.161424178904679989E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435069384800325018E-03 -0.435069399489064988E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.253136149492911994E-01 -0.435016112049640979E-03 -0.435083691563150992E-03
+ -0.666666666666666970E-02 -0.138320998722293008E-03 -0.140195196193045990E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.312154838512909011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161423744465432012E-03 -0.161423747910479997E-03
+ -0.161423311121030010E-03 -0.161424181254881999E-03 -0.161424181258488001E-03
+ -0.161423311117424008E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+ -0.435082183658245017E-03 -0.435056882388395007E-03 -0.435077576153311009E-03
+ -0.435061388362830001E-03 -0.138286177937990999E-03 -0.140230016977347998E-03
+  0.000000000000000000E+00  0.420359769758308008E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.434988625092396007E-03 -0.435112371917261017E-03
+ -0.666666666666666970E-02 -0.138123103593930992E-03 -0.140393091321408005E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.254927196052617995E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423391697327001E-03
+ -0.161424100678585008E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.429493592762821984E-03 -0.440767063383173984E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.251692612175913009E-01  0.000000000000000000E+00 -0.161423746187955991E-03
+ -0.161423746187955991E-03 -0.666666666666666970E-02 -0.440899213583507998E-03
+ -0.429390539341747025E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.318962392046409984E-01 -0.363404809254292002E-03 -0.507297969422810992E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.135240574414478994E-03
+ -0.143275620500860003E-03 -0.166666666666667011E-01  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.322093262366704020E-01  0.000000000000000000E+00
+ -0.435187611870158021E-03 -0.434916457449104998E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.138287300795975000E-03 -0.140228894119363997E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.286700106542116008E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423291382795011E-03
+ -0.161424200993116998E-03 -0.161423748962132008E-03 -0.161423743413780001E-03
+ -0.161423291392504990E-03 -0.161424200983406992E-03 -0.666666666666666970E-02
+ -0.435081950459870019E-03 -0.435057110654062009E-03 -0.138300165059673992E-03
+ -0.140216029855665006E-03 -0.435072720707763974E-03 -0.435066136833905980E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.238543465439772992E-01 -0.666666666666666970E-02 -0.166666666666667011E-01
+  0.419834073815130021E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423305233497009E-03
+ -0.161424187142415000E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.435069380325788988E-03 -0.435069404064516009E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.256582188293747009E-01
+  0.000000000000000000E+00 -0.434980686274821977E-03 -0.435120632764312025E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.138286845679548006E-03
+ -0.140229349235790991E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.256868342262811002E-01  0.000000000000000000E+00 -0.434980627630137003E-03
+ -0.435120692364288997E-03 -0.666666666666666970E-02 -0.140229339372730996E-03
+ -0.138286855542608001E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.256614658944893000E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435012511657230014E-03
+ -0.435087446025792977E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.138321094746602009E-03 -0.140195100168737991E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.320996483486796005E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423313210778008E-03 -0.161424179165134001E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435068950004979999E-03
+ -0.435069844082024008E-03 -0.166666666666667011E-01  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.257759423107052008E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.435016296291419009E-03 -0.435083499433781991E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.138321121808393998E-03
+ -0.140195073106944999E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.320996585742336984E-01  0.000000000000000000E+00 -0.161423313488731992E-03
+ -0.161424178887179990E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435069291046625011E-03 -0.435069495355377978E-03 -0.166666666666667011E-01
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.255844281626146988E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.327328574610849998E-03 -0.542038187704011951E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.138213770503833004E-03
+ -0.140302424411505993E-03  0.000000000000000000E+00  0.321601835856102983E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.946023952620347994E-04 -0.593079027126956976E-03 -0.133620312350716999E-03
+ -0.725519910394541970E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.257239980440517012E-01  0.000000000000000000E+00
+  0.196968326301319991E-04 -0.707378255019124025E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.166666666666667011E-01 -0.458132642260517973E-03
+ -0.413595834475461983E-03  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.423184714573687989E-01  0.000000000000000000E+00 -0.434738589247653001E-03
+ -0.435372752845944011E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.138688174968105987E-03 -0.139828019947233010E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.251897402149190985E-01  0.000000000000000000E+00 -0.646688794373514043E-04
+ -0.623012542951639965E-03 -0.666666666666666970E-02 -0.137976511090806987E-03
+ -0.140539683824532011E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.319702861190497012E-01  0.000000000000000000E+00 -0.161422661148781000E-03
+ -0.161424831227131009E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.425027871300158022E-03 -0.445328608699814012E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.253132418654096990E-01 -0.429884568296530988E-03 -0.440422631514232989E-03
+ -0.666666666666666970E-02 -0.138411741521856003E-03 -0.140104453393482994E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.310595796671125010E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161430908664596994E-03 -0.161416583711314988E-03
+ -0.161367227947133000E-03 -0.161480264428779009E-03 -0.161480354961919997E-03
+ -0.161367137413992012E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+ -0.438392776826464984E-03 -0.431826121382375026E-03 -0.436721410041992001E-03
+ -0.433458024629497019E-03 -0.139062002018278005E-03 -0.139454192897061995E-03
+  0.000000000000000000E+00  0.421404524821471990E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.402803578099727999E-03 -0.467868848878249025E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.227910657339421994E-03
+ -0.900841911728360008E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.256049262714956004E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.340826630558288979E-03
+ -0.529260041764853008E-03  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.138562349500047995E-03 -0.139953845415291002E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.253040215793172013E-01  0.000000000000000000E+00
+ -0.246116548137240024E-03 -0.598443155917590948E-03 -0.666666666666666970E-02
+ -0.140073789192540987E-03 -0.138442405722798010E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.319703071004562994E-01 -0.161422660702825987E-03
+ -0.161424831673085995E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.426922177757430997E-03 -0.443395080957603976E-03 -0.166666666666667011E-01
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320998407804300029E-01
+  0.000000000000000000E+00 -0.161423318608057989E-03 -0.161424173767853993E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.432673706188636982E-03
+ -0.437518364259369976E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.295486137458738987E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.435073936139989976E-03 -0.435025467270096005E-03 -0.435055941094787983E-03
+ -0.435042723373920998E-03 -0.435014132966001985E-03 -0.435085755332848012E-03
+ -0.666666666666666970E-02 -0.138321121970870991E-03 -0.140195072944468007E-03
+ -0.138900280797982998E-03 -0.139615914117355999E-03  0.000000000000000000E+00
+ -0.138286771859713999E-03 -0.140229423055624998E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.238543465439772992E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.420929803055049012E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.435014930179625984E-03 -0.435084924009661019E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.138286695318050990E-03
+ -0.140229499597288007E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.257458286284305005E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.435049187797943010E-03
+ -0.435049199850251001E-03  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.139258097008661995E-03 -0.139258097906677002E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.255775441079788002E-01
+  0.000000000000000000E+00 -0.161423313317997011E-03 -0.161424179057914998E-03
+ -0.666666666666666970E-02 -0.435069409759835009E-03 -0.435069374755894995E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.257406249620084002E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435042723384703009E-03
+ -0.435055941080233978E-03  0.000000000000000000E+00 -0.138910497854425013E-03
+ -0.139605697060914011E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.320997355336692033E-01  0.000000000000000000E+00 -0.161423315554604991E-03
+ -0.161424176821307994E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435069370488565992E-03 -0.435069414123279022E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.256633754959206990E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.161423313307973996E-03
+ -0.161424179067938013E-03  0.000000000000000000E+00 -0.435085186467014016E-03
+ -0.435053945729502005E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.322092437021506001E-01
+ -0.435014144601562013E-03 -0.435085743160859985E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.138287341133016000E-03 -0.140228853782322998E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.254980067883627988E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.161506286311511997E-03
+ -0.161341206064400012E-03 -0.166666666666667011E-01 -0.431364756974581974E-03
+ -0.438866309542754976E-03  0.000000000000000000E+00  0.319866973588324027E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.161423386153582989E-03 -0.161424106222328993E-03 -0.435249329951151978E-03
+ -0.434893408363733994E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.256606789218085003E-01
+  0.000000000000000000E+00 -0.434963065954236988E-03 -0.435139005528701976E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.166666666666667011E-01
+ -0.138291831741704991E-03 -0.140224363173634006E-03  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.419833492947222983E-01  0.000000000000000000E+00
+ -0.161423303638426010E-03 -0.161424188737485999E-03 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.435072120349480976E-03 -0.435066723923213975E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.253728413941171002E-01  0.000000000000000000E+00 -0.434979152196512009E-03
+ -0.435122229395894016E-03 -0.666666666666666970E-02 -0.140221824647693988E-03
+ -0.138294370267645010E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.319851023354644978E-01  0.000000000000000000E+00 -0.161423349940120991E-03
+ -0.161424142435790991E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435344407954329007E-03 -0.434800430411269015E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.251600082854132011E-01 -0.161430149009246000E-03 -0.161417343366666009E-03
+ -0.666666666666666970E-02 -0.432078300694966016E-03 -0.438133229980573974E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.320887275245489018E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.435111004034048989E-03 -0.434989920105145983E-03 -0.435014834120551011E-03
+ -0.435085024172310027E-03 -0.435084968865059006E-03 -0.435014887158174024E-03
+ -0.166666666666667011E-01 -0.138286802031117000E-03 -0.140229392884223000E-03
+ -0.138288034558563993E-03 -0.140228160356775005E-03 -0.138900326717328012E-03
+ -0.139615868198011988E-03  0.000000000000000000E+00  0.419834068202609018E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423305219030012E-03
+ -0.161424187156881997E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.435073349340887991E-03 -0.435065522020417015E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.000000000000000000E+00
+  0.256004806855422984E-01  0.000000000000000000E+00 -0.434981132067385973E-03
+ -0.435120167645717019E-03  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.138288861702206010E-03 -0.140227333213132987E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.253696619275626985E-01  0.000000000000000000E+00
+ -0.434980119724182024E-03 -0.435121221827560020E-03 -0.666666666666666970E-02
+ -0.140228210971901000E-03 -0.138287983943439000E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.319694542067313001E-01 -0.161422588303278990E-03
+ -0.161424904072633994E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435109649626736015E-03 -0.435030036925322975E-03 -0.166666666666667011E-01
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.320971379137156981E-01  0.000000000000000000E+00 -0.161423238729728995E-03
+ -0.161424253646182987E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435144484966556002E-03 -0.434995958682276988E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.296734541379750985E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.434149030098187011E-03 -0.435987412204495999E-03 -0.435031452984650986E-03
+ -0.435067691864950016E-03 -0.434950548419927988E-03 -0.435152046333653016E-03
+ -0.666666666666666970E-02 -0.138392945863299996E-03 -0.140123249052040005E-03
+ -0.138921815209089002E-03 -0.139594379706249995E-03  0.000000000000000000E+00
+ -0.138356769775533993E-03 -0.140159425139805004E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.238543465439772992E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.420931571567067994E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.435014671747375992E-03
+ -0.435085193476965012E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.138287057706326004E-03 -0.140229137209012993E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.257455695212117017E-01
+  0.000000000000000000E+00 -0.435047022783618996E-03 -0.435051457582689993E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.139258438339454998E-03
+ -0.139257756575883999E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.255770773094191987E-01  0.000000000000000000E+00 -0.161423300794449003E-03
+ -0.161424191581463006E-03 -0.666666666666666970E-02 -0.435056164326672024E-03
+ -0.435082918027994002E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.257823402544687004E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.435067703423035017E-03 -0.435031441124867973E-03  0.000000000000000000E+00
+ -0.138940670377310000E-03 -0.139575524538028997E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320998367437754992E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423318157696995E-03
+ -0.161424174218214987E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435046243704917976E-03 -0.435093061865261994E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.256631852001646017E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.161423307528138997E-03
+ -0.161424184847773012E-03  0.000000000000000000E+00 -0.438796404237050007E-03
+ -0.431424781290872027E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.323103002924175972E-01
+  0.000000000000000000E+00 -0.434856382728898011E-03 -0.435250190151687997E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.138421511330616989E-03
+ -0.140094683584722008E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.255452807930411988E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.161423207454343989E-03 -0.161424284921568996E-03 -0.166666666666667011E-01
+ -0.404324480174985020E-03 -0.466398849792140982E-03  0.000000000000000000E+00
+  0.318949373068843009E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.323838290351469984E-03 -0.544927583290811006E-03
+ -0.135274543056306012E-03 -0.143241651859033012E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.255622073519536988E-01  0.000000000000000000E+00 -0.114452729937741999E-03
+ -0.573228692451250047E-03  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.166666666666667011E-01  0.218484783290129011E-03 -0.891416037679068000E-03
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.421663414565641992E-01
+  0.000000000000000000E+00  0.687960644515914959E-05 -0.694561028834151002E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.438096223299876001E-03
+ -0.432167718169455990E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.255893861003781004E-01  0.000000000000000000E+00
+ -0.330656818749315009E-03 -0.535638411162091960E-03 -0.666666666666666970E-02
+ -0.139701772394438010E-03 -0.138814422520901990E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.319824507944573982E-01  0.000000000000000000E+00
+ -0.161423276715586998E-03 -0.161424215660325011E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.453058027683075014E-03 -0.417478867706813021E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.252771264600550986E-01 -0.434642415203527976E-03
+ -0.435473406290417023E-03 -0.666666666666666970E-02 -0.142683907585935987E-03
+ -0.135832287329404013E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.323033163369174023E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.435248914259523019E-03 -0.434857618105281016E-03 -0.434883815526439026E-03
+ -0.435221605059342973E-03 -0.435144012463656023E-03 -0.434958245914109976E-03
+ -0.166666666666667011E-01 -0.138392691801889008E-03 -0.140123503113449990E-03
+ -0.138936896109547012E-03 -0.139579298805792012E-03 -0.138396653973258001E-03
+ -0.140119540942080996E-03  0.000000000000000000E+00  0.419134531555254014E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.160003730040957000E-03
+ -0.162843762334955009E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.439266035090063022E-03 -0.430980363867674026E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.000000000000000000E+00
+  0.256392673957523999E-01  0.000000000000000000E+00 -0.434777562142257996E-03
+ -0.435332391625641989E-03  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.138409864927514006E-03 -0.140106329987824992E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.254101279172769014E-01  0.000000000000000000E+00
+ -0.373644623806981997E-03 -0.492026351519330037E-03 -0.666666666666666970E-02
+ -0.908412435934918957E-03  0.235481181545980998E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.321268372004474001E-01 -0.200001267706564000E-03
+ -0.627372315602944992E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.127171097759715991E-03 -0.545760156629222998E-03 -0.166666666666667011E-01
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.320995602660988991E-01  0.000000000000000000E+00 -0.161423310639775001E-03
+ -0.161424181736137008E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.437288207604906009E-03 -0.432899576991382993E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.296377168165362984E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.434989920176705983E-03 -0.435111003938864983E-03 -0.435084786849818019E-03
+ -0.435015061692732000E-03 -0.435015088357717974E-03 -0.435084759043768022E-03
+ -0.666666666666666970E-02 -0.138910479291025010E-03 -0.139605715624313987E-03
+ -0.138288220944727007E-03 -0.140227973970611991E-03 -0.138286985838064012E-03
+ -0.140229209077275013E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.238543465439772992E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.419834073428042021E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.161423305232445008E-03 -0.161424187143467001E-03 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.435069384775871976E-03 -0.435069399514123015E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.256582244991315003E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.434980660285921980E-03 -0.435120659862830007E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.138286857240759006E-03 -0.140229337674580994E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.256868355803533989E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.434980641844436011E-03 -0.435120677541527977E-03 -0.666666666666666970E-02
+ -0.140229336589824008E-03 -0.138286858325514989E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.255642317604596984E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.161423743725482000E-03
+ -0.161423748650431012E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.435032785443368025E-03 -0.435106823406514024E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320996548652853000E-01
+ -0.161423313389232997E-03 -0.161424178986679012E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435069349700356026E-03 -0.435069435379967977E-03
+ -0.166666666666667011E-01  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.256786747395171995E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161423743712989009E-03 -0.161423748662923000E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.435032864937970982E-03
+ -0.435106742120615006E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320996543404327983E-01
+ -0.161423313375148994E-03 -0.161424179000762988E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435069389550788983E-03 -0.435069394631554978E-03
+ -0.166666666666667011E-01  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.256124534411805993E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.422563580132133995E-03 -0.447998624216019981E-03
+ -0.166666666666667011E-01 -0.138286150728846990E-03 -0.140230044186492007E-03
+  0.000000000000000000E+00  0.319552376669786989E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.161419817025352993E-03
+ -0.161427675350558989E-03 -0.434132277678501012E-03 -0.436028351482909977E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.256464487088938001E-01  0.000000000000000000E+00
+ -0.434899896421036018E-03 -0.435204881377557976E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.166666666666667011E-01 -0.138261308742496992E-03
+ -0.140254886172842006E-03  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.419830852158453033E-01  0.000000000000000000E+00 -0.161423296248439004E-03
+ -0.161424196127473005E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.435080307367357005E-03 -0.435058717463834973E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.253594622355668986E-01
+  0.000000000000000000E+00 -0.434950331860284995E-03 -0.435152283722859020E-03
+ -0.666666666666666970E-02 -0.140249996876446007E-03 -0.138266198038893993E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.319571786575690020E-01
+  0.000000000000000000E+00 -0.161420475113086009E-03 -0.161427017262827003E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.434308830151222019E-03
+ -0.435847642640102974E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.252660555352819012E-01
+ -0.423620518084152984E-03 -0.446908164705096009E-03 -0.666666666666666970E-02
+ -0.138290733115794992E-03 -0.140225461799544005E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.320831749900574031E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.434897829232994994E-03
+ -0.435207041701621983E-03 -0.435182072257829001E-03 -0.434921773469817985E-03
+ -0.434970483470178027E-03 -0.435131276171263001E-03 -0.166666666666667011E-01
+ -0.138283460799556990E-03 -0.140232734115782007E-03 -0.138290545167624998E-03
+ -0.140225649747715002E-03 -0.138898900526048002E-03 -0.139617294389290995E-03
+  0.000000000000000000E+00  0.421628681453787013E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.397188434436698007E-04 -0.727400265832661014E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435230978617207986E-03
+ -0.434916855000890001E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.000000000000000000E+00  0.256973883757283007E-01
+  0.000000000000000000E+00 -0.435443117881719019E-03 -0.434671353469976026E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.138672805722981989E-03
+ -0.139843389192358011E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.253811407347032010E-01  0.000000000000000000E+00 -0.111660189923879004E-03
+ -0.576021232465113043E-03 -0.666666666666666970E-02 -0.807402873679339953E-03
+  0.113200250901337992E-04 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.318951479564182974E-01 -0.334801618998244014E-03 -0.534674353093398985E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.135236534922172994E-03
+ -0.143279659993166003E-03 -0.166666666666667011E-01  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320991543625473996E-01
+  0.000000000000000000E+00 -0.161423299320882996E-03 -0.161424193055029988E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.439089523234416986E-03
+ -0.431138190963970024E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.292550790147567998E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423735549563995E-03 -0.161423756826347987E-03
+ -0.161424083758227992E-03 -0.161423408617683990E-03 -0.161423408591030993E-03
+ -0.161424083784880988E-03 -0.666666666666666970E-02 -0.434939106055095008E-03
+ -0.435202598868165004E-03 -0.434989974486818000E-03 -0.435150588304691020E-03
+ -0.434898044278314978E-03 -0.435244580342165975E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.238543465439772992E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.420929736779580982E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.435014942481458009E-03 -0.435084911182310021E-03
+ -0.666666666666666970E-02 -0.138286681738491990E-03 -0.140229513176847007E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.255502900122650013E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.161423360190818011E-03 -0.161424132185093998E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435068884552577983E-03
+ -0.435069911008902977E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.252824734873780994E-01  0.000000000000000000E+00 -0.161423118220565989E-03
+ -0.161424374155345993E-03 -0.666666666666666970E-02 -0.139258110569624000E-03
+ -0.139258084345714997E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.255485447377079983E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.161423302756024003E-03
+ -0.161424189619888006E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.434872688531394978E-03 -0.435270525342987020E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.322052497845927016E-01 -0.435023979094238992E-03 -0.435075488205511978E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.138279010834734011E-03
+ -0.140237184080605013E-03 -0.166666666666667011E-01  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.258530259445254006E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.434958189051660006E-03 -0.435144092590821993E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.138909064929736007E-03
+ -0.139607129985603993E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.320994819723091004E-01 -0.161423308725767013E-03 -0.161424183650144996E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435080068461045984E-03
+ -0.435058950989111021E-03 -0.166666666666667011E-01  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.256614731138163008E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.429527300766954008E-03
+ -0.440793399874086025E-03 -0.166666666666667011E-01 -0.138412525549365991E-03
+ -0.140103669365973006E-03  0.000000000000000000E+00  0.319693244854589026E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.161422576318977012E-03 -0.161424916056934997E-03 -0.437741651795111979E-03
+ -0.432457178334932999E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.257682962742948989E-01
+  0.000000000000000000E+00 -0.372135275695876014E-03 -0.498733752802202964E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.166666666666667011E-01
+ -0.138908713201793988E-03 -0.139607481713545009E-03  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.418896828694112003E-01  0.000000000000000000E+00
+ -0.436533664253622989E-03 -0.433624016341012007E-03 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.134626819469842998E-03 -0.143889375445495999E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.252833740154079986E-01  0.000000000000000000E+00  0.378159457418904982E-04
+ -0.725497368130881976E-03 -0.666666666666666970E-02 -0.597713125202172023E-03
+ -0.752181291867666958E-04 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.321707087590768978E-01  0.000000000000000000E+00 -0.981956570690248044E-04
+ -0.589485765319966958E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.242422521139187011E-03 -0.626419769058526952E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.333333333333332982E-01  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.252392538739079006E-01 -0.333702074783931012E-03
+ -0.536013663819399954E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.138219319560300990E-03 -0.140296875355038007E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.310595586219111000E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.161431110154876994E-03
+ -0.161416382221035991E-03 -0.161367182761019008E-03 -0.161480309614893001E-03
+ -0.161480399925954988E-03 -0.161367092449956993E-03 -0.166666666666667011E-01
+ -0.438417298132810025E-03 -0.431802193377014986E-03 -0.436709554426625997E-03
+ -0.433469595476341000E-03 -0.139063297967854010E-03 -0.139452896947484987E-03
+  0.000000000000000000E+00  0.421404520292116974E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.402798477196058982E-03 -0.467873676471954026E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.227909367425477999E-03
+ -0.900840621814416988E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.000000000000000000E+00  0.256044922875989997E-01
+  0.000000000000000000E+00 -0.340827301030340014E-03 -0.529259221222554946E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.138561533653603990E-03
+ -0.139954661261736010E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.253040211801738996E-01  0.000000000000000000E+00 -0.246116211180549989E-03
+ -0.598443499755276051E-03 -0.666666666666666970E-02 -0.140073790975437003E-03
+ -0.138442403939901995E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.319703070963996000E-01 -0.161422660702506988E-03 -0.161424831673404994E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.426922132602362023E-03
+ -0.443395127058409007E-03 -0.166666666666667011E-01  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.321002728756756978E-01
+  0.000000000000000000E+00 -0.161423329712289998E-03 -0.161424162663622011E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.432632949803903998E-03
+ -0.437559988660470999E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.296364413425855011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.435207087979181024E-03 -0.434897780856480991E-03 -0.435078144333426976E-03
+ -0.435021431652866995E-03 -0.435027766934605010E-03 -0.435071537880043005E-03
+ -0.666666666666666970E-02 -0.138910179724992006E-03 -0.139606015190346991E-03
+ -0.138289402276009992E-03 -0.140226792639329005E-03 -0.138286064088546994E-03
+ -0.140230130826792004E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.238543465439772992E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.419834145316453980E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.161423305428603012E-03 -0.161424186947310000E-03 -0.666666666666666970E-02
+ -0.435068797728623978E-03 -0.435069999793927990E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.256581977277514015E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.434982640822920976E-03 -0.435118594642042004E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.138286799402304007E-03 -0.140229395513034991E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.256861835048418988E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.434980941091875014E-03 -0.435120365734433022E-03
+ -0.666666666666666970E-02 -0.140230675700951006E-03 -0.138285519214387991E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.255648151652009999E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.161423744258310013E-03 -0.161423748117601996E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.434971272637178004E-03 -0.435169719904374022E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320996755017593990E-01
+ -0.161423313937937010E-03 -0.161424178437974999E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435067422348721020E-03 -0.435071406160561979E-03
+ -0.166666666666667011E-01  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.256787278728396011E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161423752315616002E-03 -0.161423740060296007E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.434863912658681987E-03
+ -0.435279496392151021E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.321000539265607968E-01
+ -0.161423323758083008E-03 -0.161424168617829001E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435021975200768020E-03 -0.435117876636681000E-03
+ -0.166666666666667011E-01  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.254980067884808988E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161506286650737010E-03 -0.161341205725174999E-03
+ -0.166666666666667011E-01 -0.431364756994681022E-03 -0.438866309522160990E-03
+  0.000000000000000000E+00  0.319866973712260028E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.161423386153855991E-03
+ -0.161424106222056993E-03 -0.435249227300768993E-03 -0.434893508757877025E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.256606773361423013E-01  0.000000000000000000E+00
+ -0.434963273622003977E-03 -0.435138788986911998E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.166666666666667011E-01 -0.138291826977172987E-03
+ -0.140224367938166010E-03  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.419833531788693995E-01  0.000000000000000000E+00 -0.161423303747212010E-03
+ -0.161424188628699999E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.435047841600877025E-03 -0.435091428270091996E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.253728372267130013E-01
+  0.000000000000000000E+00 -0.434980109299934025E-03 -0.435121231414349987E-03
+ -0.666666666666666970E-02 -0.140221834450205004E-03 -0.138294360465133993E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.319851023684395025E-01
+  0.000000000000000000E+00 -0.161423349940937992E-03 -0.161424142434973990E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435344118977629016E-03
+ -0.434800713024421025E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.251600082863014003E-01
+ -0.161430149161919989E-03 -0.161417343213992996E-03 -0.666666666666666970E-02
+ -0.432078300658010010E-03 -0.438133230018339012E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.320880852120032020E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435006978817776007E-03
+ -0.435093216238015019E-03 -0.435023227600191012E-03 -0.435076271502717992E-03
+ -0.435080703092234026E-03 -0.435018977896097989E-03 -0.166666666666667011E-01
+ -0.138286342284362987E-03 -0.140229852630976986E-03 -0.138288877468917000E-03
+ -0.140227317446421998E-03 -0.138898309276879996E-03 -0.139617885638460004E-03
+  0.000000000000000000E+00  0.419834046918393031E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423305160284008E-03 -0.161424187215628001E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435057325047151988E-03
+ -0.435081731065996996E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.256008015072015996E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.434979053168488001E-03 -0.435122335339392023E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.138289510792666997E-03 -0.140226684122673003E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.253691080193319991E-01
+  0.000000000000000000E+00 -0.434981271968578004E-03 -0.435120020450890975E-03
+ -0.666666666666666970E-02 -0.140229335054007991E-03 -0.138286859861331006E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.319837881512948022E-01
+ -0.161423315571830009E-03 -0.161424176804082000E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435069394631856982E-03 -0.435069389550488985E-03
+ -0.166666666666667011E-01  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.320970354177166975E-01  0.000000000000000000E+00 -0.161423235472326997E-03
+ -0.161424256903585988E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435168272526173982E-03 -0.434972697148703995E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.295545368095916985E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.434122085949861976E-03 -0.436015593157589986E-03 -0.435101402844330994E-03
+ -0.434999127125352007E-03 -0.434882942033044983E-03 -0.435222550914836999E-03
+ -0.666666666666666970E-02 -0.138329236201211012E-03 -0.140186958714128013E-03
+ -0.138899088561680992E-03 -0.139617106353658006E-03 -0.138290618264299002E-03
+ -0.140225576651039995E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.238543465439772992E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.420931657846550997E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.435014640889280020E-03
+ -0.435085225654339980E-03 -0.666666666666666970E-02 -0.138287075366651002E-03
+ -0.140229119548687995E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.257458666890181993E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.435045894309902018E-03 -0.435052634377398005E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.139258454546935994E-03 -0.139257740368403004E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.255770740455284994E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423300836045991E-03 -0.161424191539865991E-03
+ -0.666666666666666970E-02 -0.435055528504503020E-03 -0.435083568186561014E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.257826760506618992E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.434999152127107014E-03 -0.435101368436342979E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.138940653027276996E-03
+ -0.139575541888062001E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320996072561336027E-01
+ -0.161423312108405990E-03 -0.161424180267505992E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435072662950887002E-03 -0.435066193259916003E-03
+ -0.166666666666667011E-01  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.256629422996789008E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161423300977684998E-03 -0.161424191398227011E-03
+  0.000000000000000000E+00 -0.438825800967892977E-03 -0.431396059158853015E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.323107970182680976E-01 -0.434855364031651011E-03
+ -0.435251251813800982E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.138421931556495010E-03 -0.140094263358843987E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.255642320215948987E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161423743718458999E-03 -0.161423748657454013E-03
+ -0.166666666666667011E-01 -0.435032852641439025E-03 -0.435106754694577981E-03
+  0.000000000000000000E+00  0.319837065352637992E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.161423313381099990E-03
+ -0.161424178994811992E-03 -0.435069595976944991E-03 -0.435069192642484977E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.256582283981556004E-01  0.000000000000000000E+00
+ -0.434978918685956012E-03 -0.435122475932554027E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.166666666666667011E-01 -0.138286871097143010E-03
+ -0.140229323818195987E-03  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.419833928488543007E-01  0.000000000000000000E+00 -0.161423304820364997E-03
+ -0.161424187555547012E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.435207587129756010E-03 -0.434934243879776989E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.253691146856406986E-01
+  0.000000000000000000E+00 -0.434977844183998008E-03 -0.435123594722203980E-03
+ -0.666666666666666970E-02 -0.140229316757458007E-03 -0.138286878157881994E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.319837065219248998E-01
+  0.000000000000000000E+00 -0.161423313380713987E-03 -0.161424178995198998E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069861087792979E-03
+ -0.435068933373936994E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.252163879413985006E-01
+ -0.161423743718170005E-03 -0.161423748657742004E-03 -0.666666666666666970E-02
+ -0.435032852613000023E-03 -0.435106754723656988E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.320883554506867022E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.435127306548502995E-03 -0.434974286158534991E-03
+ -0.435023129650981017E-03 -0.435076373631457973E-03 -0.435052776118324998E-03
+ -0.435045758390568017E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+ -0.138286581602824004E-03 -0.140229613312514993E-03 -0.138900368233470993E-03
+ -0.139615826681868004E-03 -0.138293378267740995E-03 -0.140222816647599005E-03
+  0.000000000000000000E+00  0.419834123874497020E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423305373013988E-03 -0.161424187002898997E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435065741772807020E-03
+ -0.435073124632554995E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.255992190832709000E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.434982794949810992E-03 -0.435118433992394990E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.138286284757403011E-03 -0.140229910157936013E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.253696592428569005E-01
+  0.000000000000000000E+00 -0.434980341928489984E-03 -0.435120990165656005E-03
+ -0.666666666666666970E-02 -0.140228218876105011E-03 -0.138287976039234013E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.319694542100162002E-01
+ -0.161422588303611000E-03 -0.161424904072301009E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435109462002123012E-03 -0.435030220343913024E-03
+ -0.166666666666667011E-01  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.320999013551653001E-01  0.000000000000000000E+00 -0.161423319891360013E-03
+ -0.161424172484551996E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435048884195216989E-03 -0.435090361870336983E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.297460083034186001E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.435093184138760016E-03 -0.435007004727823009E-03
+ -0.435044582892759007E-03 -0.435054001407353006E-03 -0.435082620665603974E-03
+ -0.435017135654766987E-03 -0.666666666666666970E-02 -0.138933893426106987E-03
+ -0.139582301489232010E-03 -0.138357128758214001E-03 -0.140159066157124996E-03
+  0.000000000000000000E+00 -0.138352953476254988E-03 -0.140163241439084010E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.238543465439772992E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.419834091612526009E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423305282162996E-03
+ -0.161424187093749013E-03 -0.666666666666666970E-02 -0.435069239563229014E-03
+ -0.435069548000012994E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.256578127707293990E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.434981761685145974E-03 -0.435119511480767007E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.138286013333851002E-03 -0.140230181581488998E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.256868183233514985E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.434980125799632985E-03 -0.435121215670877001E-03
+ -0.666666666666666970E-02 -0.140229372680005009E-03 -0.138286822235333988E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.255642631986222994E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.161423746696742987E-03 -0.161423745679168995E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.434938825463917979E-03 -0.435202898680430025E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.320998066618511033E-01 -0.161423317331866001E-03 -0.161424175044046008E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435039227329621022E-03
+ -0.435100236286997026E-03 -0.166666666666667011E-01  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.256789254737601995E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.161423744043055008E-03 -0.161423748332858004E-03  0.000000000000000000E+00
+ -0.434994690186026999E-03 -0.435145776107332002E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.320996673803692970E-01 -0.161423313722635004E-03
+ -0.161424178653277005E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435068125149740983E-03 -0.435070687523708016E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.255452807933189002E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161423207454355997E-03 -0.161424284921556012E-03
+ -0.166666666666667011E-01 -0.404324471515191976E-03 -0.466398858585693999E-03
+  0.000000000000000000E+00  0.318949372864687011E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.323838542624367001E-03
+ -0.544927348767803980E-03 -0.135274539864154011E-03 -0.143241655051185013E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.255622073357968001E-01  0.000000000000000000E+00
+ -0.114453213670021004E-03 -0.573228208718970988E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.166666666666667011E-01  0.218486799119613006E-03
+ -0.891418053508551047E-03  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.421663421972207977E-01  0.000000000000000000E+00  0.686273215429911012E-05
+ -0.694544154543290989E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.437941742226114026E-03 -0.432315834894540991E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.255893865594537008E-01
+  0.000000000000000000E+00 -0.330660242956903980E-03 -0.535635480866768947E-03
+ -0.666666666666666970E-02 -0.139701781018268989E-03 -0.138814413897070008E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.319824507952650994E-01
+  0.000000000000000000E+00 -0.161423276715632995E-03 -0.161424215660278987E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.453057648293402017E-03
+ -0.417479238786919984E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.252771264643981003E-01
+ -0.434642414870016003E-03 -0.435473406638131990E-03 -0.666666666666666970E-02
+ -0.142683907492472013E-03 -0.135832287422867011E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.320831993534842971E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435039282000798980E-03
+ -0.435059530140830008E-03 -0.435183177514903016E-03 -0.434920713595511989E-03
+ -0.434923578188304021E-03 -0.435180190251448023E-03 -0.166666666666667011E-01
+ -0.138283467016899007E-03 -0.140232727898440993E-03 -0.138898928565553995E-03
+ -0.139617266349785003E-03 -0.138290715294724011E-03 -0.140225479620615013E-03
+  0.000000000000000000E+00  0.421628681647200979E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.397187181120990971E-04
+ -0.727400140501091022E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.435231404167815000E-03 -0.434916453249015015E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.256970881787325009E-01
+  0.000000000000000000E+00 -0.435443459120569022E-03 -0.434671026438330013E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.138672556881986003E-03
+ -0.139843638033352994E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.253811403923139001E-01  0.000000000000000000E+00 -0.111659736914792001E-03
+ -0.576021685474199015E-03 -0.666666666666666970E-02 -0.807403253803167008E-03
+  0.113208142554363006E-04 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.318951479398702983E-01 -0.334801708387352976E-03 -0.534674268837695009E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.135236533028903003E-03
+ -0.143279661886435995E-03 -0.166666666666667011E-01  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.320995045206775992E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.161423308955499994E-03 -0.161424183420412991E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.439073955171491003E-03 -0.431153380462344989E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.292551256260864985E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423741996618989E-03
+ -0.161423750379292993E-03 -0.161424083526681006E-03 -0.161423408849231003E-03
+ -0.161423408822156008E-03 -0.161424083553757003E-03 -0.666666666666666970E-02
+ -0.434870224107475974E-03 -0.435273023345194990E-03 -0.434990359763353014E-03
+ -0.435150194228865013E-03 -0.434886571394757027E-03 -0.435256310073468009E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.238543465439772992E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.420929704392936990E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.435014948460666999E-03 -0.435084904947665005E-03 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.138286675090450010E-03 -0.140229519824888987E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.255502912848136997E-01  0.000000000000000000E+00 -0.161423360221235005E-03
+ -0.161424132154678007E-03  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.435068640548550009E-03 -0.435070160511034009E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.252823540608277005E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.161423118749632011E-03 -0.161424373626279998E-03 -0.666666666666666970E-02
+ -0.139258116844783998E-03 -0.139258078070556002E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.255489628695773005E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161423314131588989E-03 -0.161424178244322993E-03
+  0.000000000000000000E+00 -0.434859092529006977E-03 -0.435284424993224019E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.322050771166806013E-01  0.000000000000000000E+00
+ -0.435024403864370983E-03 -0.435075045265772024E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.138278642730750004E-03 -0.140237552184589996E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.258528526278573000E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.435045758287675984E-03 -0.435052776259060999E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.138908581714380013E-03
+ -0.139607613200959987E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.320997445770579981E-01 -0.161423315795968994E-03 -0.161424176579942987E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069043735089990E-03
+ -0.435069748238998001E-03 -0.166666666666667011E-01  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.256614731136071000E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.429527301021726991E-03
+ -0.440793399609727991E-03 -0.166666666666667011E-01 -0.138412525548941011E-03
+ -0.140103669366398013E-03  0.000000000000000000E+00  0.319693244861690984E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.161422576319042010E-03 -0.161424916056869999E-03 -0.437741641876694004E-03
+ -0.432457188030005977E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.257682960389588010E-01
+  0.000000000000000000E+00 -0.372135294409653991E-03 -0.498733734440172968E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.166666666666667011E-01
+ -0.138908712464486990E-03 -0.139607482450852008E-03  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.418896816202265973E-01  0.000000000000000000E+00
+ -0.436537889268050986E-03 -0.433619959227105999E-03 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.134626693225825994E-03 -0.143889501689513003E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.252833739842331998E-01  0.000000000000000000E+00  0.378158663248480029E-04
+ -0.725497288713840022E-03 -0.666666666666666970E-02 -0.597713400143132971E-03
+ -0.752178542458052050E-04 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.321707087558135013E-01  0.000000000000000000E+00 -0.981956616589569036E-04
+ -0.589485760730035007E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.242422452004102012E-03 -0.626419835077194048E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.333333333333332982E-01  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.252392538736065999E-01 -0.333702076879594987E-03
+ -0.536013661829747045E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.138219319557457995E-03 -0.140296875357881002E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.321074799924774973E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.434939601065898027E-03 -0.435163477654634996E-03
+ -0.435157861406833973E-03 -0.434944986847371983E-03 -0.434989471025225001E-03
+ -0.435111473318402012E-03 -0.166666666666667011E-01 -0.138299620238519004E-03
+ -0.140216574676820996E-03 -0.138306111153669995E-03 -0.140210083761669002E-03
+ -0.138904474579713009E-03 -0.139611720335626992E-03  0.000000000000000000E+00
+  0.421630568906255984E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.382244495288109972E-04 -0.725905871917801953E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435871010405398983E-03
+ -0.434311639967858000E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.256952418884623009E-01  0.000000000000000000E+00
+ -0.435307159704155999E-03 -0.434801765636921985E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.138667044406462996E-03 -0.139849150508876001E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.253756621166979016E-01
+  0.000000000000000000E+00 -0.110334788320004006E-03 -0.577346634068988027E-03
+ -0.666666666666666970E-02 -0.811801782227151991E-03  0.217125293756245991E-04
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.318985675848939970E-01
+ -0.333010067147420014E-03 -0.536378952719955017E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.135533186849416004E-03 -0.142983008065922994E-03
+ -0.166666666666667011E-01  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320998187031011004E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423317205601008E-03
+ -0.161424175170311001E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.439006972327073976E-03 -0.431218847172978989E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.292402861508459996E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.161423747268413004E-03 -0.161423745107499005E-03 -0.161424199494236009E-03
+ -0.161423292881676000E-03 -0.161423292890866999E-03 -0.161424199485045010E-03
+ -0.666666666666666970E-02 -0.434862443268883999E-03 -0.435281002767785006E-03
+ -0.435100436115737983E-03 -0.435039032534641993E-03  0.000000000000000000E+00
+ -0.435036509827943995E-03 -0.435103015721981983E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.238543465439772992E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.420929537555077005E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.435015025801769976E-03 -0.435084824299273990E-03
+ -0.666666666666666970E-02 -0.138286640917195994E-03 -0.140229553998143003E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.255502682711579998E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.161423359671854992E-03 -0.161424132704057993E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435067434800663979E-03
+ -0.435071393428817989E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.252818652853033995E-01  0.000000000000000000E+00 -0.161423121023111999E-03
+ -0.161424371352800010E-03 -0.666666666666666970E-02 -0.139258148276853998E-03
+ -0.139258046638484999E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.255494975093474991E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.161423327915938004E-03
+ -0.161424164459974005E-03  0.000000000000000000E+00 -0.435032834826549976E-03
+ -0.435106772375089984E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.322088227833026985E-01
+ -0.435016604539718995E-03 -0.435083178015839003E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.138286481481243002E-03 -0.140229713434095995E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.258545382043442011E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.435180234489963001E-03 -0.434923532533521007E-03  0.000000000000000000E+00
+ -0.138910062415604000E-03 -0.139606132499734998E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.321001796355565028E-01 -0.161423327064930995E-03
+ -0.161424165310980987E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435025887435454999E-03 -0.435113876200826026E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.255844281592568015E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.327328582483525979E-03
+ -0.542038180279041048E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.138213770482627013E-03 -0.140302424432712987E-03  0.000000000000000000E+00
+  0.321601835592413010E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.946024118762530064E-04 -0.593079010512739054E-03
+ -0.133620199164305990E-03 -0.725519994603599015E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.257239982411411987E-01
+  0.196968234580236002E-04 -0.707378245847014990E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.166666666666667011E-01 -0.458129631085333017E-03
+ -0.413598754070089024E-03  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.423184714605200005E-01  0.000000000000000000E+00 -0.434756036956663000E-03
+ -0.435354579774434021E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.138688118698557992E-03 -0.139828076216782008E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.251897443601684992E-01
+  0.000000000000000000E+00 -0.646690982568611981E-04 -0.623012324132130035E-03
+ -0.666666666666666970E-02 -0.137976768222350989E-03 -0.140539426692988008E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.319702861271339983E-01
+  0.000000000000000000E+00 -0.161422661149470011E-03 -0.161424831226441998E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.425027766008080989E-03
+ -0.445328716144572018E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.253132418625061015E-01
+ -0.429884571295451002E-03 -0.440422628401466976E-03 -0.666666666666666970E-02
+ -0.138411741515959000E-03 -0.140104453399379997E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.312157365405992017E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161423748574759013E-03 -0.161423743801152996E-03
+ -0.161423313379198001E-03 -0.161424178996714008E-03 -0.161424178996706988E-03
+ -0.161423313379204994E-03 -0.166666666666667011E-01 -0.435069382470465026E-03
+ -0.435069401871426007E-03 -0.435069376779887026E-03 -0.435069407690234008E-03
+ -0.138284693812444989E-03 -0.140231501102894008E-03  0.000000000000000000E+00
+  0.420929708077939987E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.435014938579204977E-03 -0.435084915251974006E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.138286675893133993E-03
+ -0.140229519022206007E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.256866648616451004E-01  0.000000000000000000E+00
+ -0.435050846412010986E-03 -0.435047608857992002E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.139258078983040012E-03 -0.139258115932299013E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.252598190481393009E-01
+  0.000000000000000000E+00 -0.161423313207681012E-03 -0.161424179168232000E-03
+ -0.666666666666666970E-02 -0.435070119779517976E-03 -0.435068680382981010E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.320929799686702974E-01
+ -0.435014999741732006E-03 -0.435084851470821988E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.138286694533200995E-03 -0.140229500382138002E-03
+ -0.166666666666667011E-01  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.321001296867379007E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423325778209994E-03
+ -0.161424166597702991E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435030925452593018E-03 -0.435108724823487006E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.296363951915997008E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.435163526863417986E-03 -0.434939554418941980E-03 -0.435078552482020999E-03
+ -0.435021040258217015E-03 -0.435027006817730023E-03 -0.435072330536212006E-03
+ -0.666666666666666970E-02 -0.138910218392951005E-03 -0.139605976522387993E-03
+ -0.138289242827388992E-03 -0.140226952087950006E-03 -0.138286026412109991E-03
+ -0.140230168503229007E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.238543465439772992E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.419834140961602986E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423305416680989E-03
+ -0.161424186959231996E-03 -0.666666666666666970E-02 -0.435068823290631025E-03
+ -0.435069973655738992E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.256581988626478005E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.434982526778275992E-03 -0.435118713563232987E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.138286801913291003E-03 -0.140229393002047994E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.256862275407307994E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.434980912593391981E-03
+ -0.435120395436144002E-03 -0.666666666666666970E-02 -0.140230584882228997E-03
+ -0.138285610033110000E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.255647395300480987E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161423744222123004E-03 -0.161423748153789005E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.434974854599588009E-03
+ -0.435166057532792997E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.320996742791537990E-01  0.000000000000000000E+00
+ -0.161423313905322989E-03 -0.161424178470588993E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435067503898425021E-03 -0.435071322773375989E-03
+ -0.166666666666667011E-01  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.256787237707600986E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.161423750872133001E-03 -0.161423741503779008E-03  0.000000000000000000E+00
+ -0.434878052016917011E-03 -0.435265039122159018E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320999946846379988E-01
+ -0.161423322217426002E-03 -0.161424170158486007E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435025227850866988E-03 -0.435114550827602994E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.256614590240232011E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435016115295252983E-03
+ -0.435083688178646997E-03 -0.166666666666667011E-01 -0.138320998711983004E-03
+ -0.140195196203355993E-03  0.000000000000000000E+00  0.319837098932621994E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.161423313471531992E-03 -0.161424178904379990E-03 -0.435069292513270988E-03
+ -0.435069493855684000E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.256582235323178989E-01
+  0.000000000000000000E+00 -0.434980856857106006E-03 -0.435120454886334974E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.166666666666667011E-01
+ -0.138286853775869010E-03 -0.140229341139469987E-03  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.419834109605791983E-01  0.000000000000000000E+00
+ -0.161423305333356989E-03 -0.161424187042554993E-03 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.435046434997871025E-03 -0.435092866540485025E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.253691040607966004E-01  0.000000000000000000E+00 -0.434981527116148989E-03
+ -0.435119754440041018E-03 -0.666666666666666970E-02 -0.140229345868751000E-03
+ -0.138286849046589000E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.319837099126562968E-01  0.000000000000000000E+00 -0.161423313472065989E-03
+ -0.161424178903845993E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435069123307102977E-03 -0.435069666874682976E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.253136149345605001E-01 -0.435016121073636018E-03 -0.435083682152984010E-03
+ -0.666666666666666970E-02 -0.138320998693124988E-03 -0.140195196222214010E-03
+ -0.166666666666667011E-01  0.312157824691738014E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161423746615296988E-03 -0.161423745760614994E-03
+ -0.161423313748345991E-03 -0.161424178627565991E-03 -0.161424178629453993E-03
+ -0.161423313746457989E-03 -0.166666666666667011E-01 -0.435045651504759002E-03
+ -0.435093667508772001E-03 -0.435080766409200008E-03 -0.435058268328494021E-03
+ -0.138278294452393003E-03 -0.140237900462945994E-03  0.000000000000000000E+00
+  0.420929613697930971E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.435018968638475976E-03 -0.435080712727501978E-03 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.138286650710325003E-03 -0.140229544205013995E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.000000000000000000E+00  0.256871012844454988E-01  0.000000000000000000E+00
+ -0.435049191356535016E-03 -0.435049196139260014E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.139258062358015006E-03 -0.139258132557323991E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.252598188674287988E-01
+  0.000000000000000000E+00 -0.161423313384250004E-03 -0.161424178991662005E-03
+ -0.666666666666666970E-02 -0.435070776497347995E-03 -0.435068038137203020E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.320929799834977977E-01
+ -0.435014991950716994E-03 -0.435084859595236974E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.138286694574832000E-03 -0.140229500340508000E-03
+ -0.166666666666667011E-01  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.320996579186692971E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423313471493991E-03 -0.161424178904418994E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069357599841020E-03
+ -0.435069427302476014E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.295485029193944988E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.435016115212415004E-03 -0.435083688265029017E-03 -0.435055716856611019E-03
+ -0.435042938405563989E-03 -0.435014924529298014E-03 -0.435084929901764013E-03
+ -0.666666666666666970E-02 -0.138321048324070988E-03 -0.140195146591268009E-03
+ -0.138900264540679010E-03 -0.139615930374659987E-03 -0.138286695603002008E-03
+ -0.140229499312336989E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.238543465439772992E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.420929804452524967E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.435014924421641997E-03
+ -0.435084930014025993E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.138286695610681005E-03 -0.140229499304657992E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.257458287848717007E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.435049188240248003E-03
+ -0.435049199389004021E-03  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.139258097270406994E-03 -0.139258097644932003E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.255775463585184992E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423313378629012E-03 -0.161424178997282997E-03
+ -0.666666666666666970E-02 -0.435069399499345990E-03 -0.435069384790270994E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.257404831595632010E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.435042938405561007E-03 -0.435055716856614001E-03
+  0.000000000000000000E+00 -0.138910376016605991E-03 -0.139605818898733006E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.320997361629841993E-01 -0.161423315571435007E-03
+ -0.161424176804478005E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435069384822633020E-03 -0.435069399466241015E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.256633780153988009E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161423313378038013E-03 -0.161424178997873996E-03
+  0.000000000000000000E+00 -0.435069374982946988E-03 -0.435069409527666015E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.322089281358309970E-01 -0.435014924516908999E-03
+ -0.435084929914729011E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.138286694922169990E-03 -0.140229499993169008E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.256614590292298002E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.435016112049646021E-03 -0.435083691563146980E-03
+ -0.166666666666667011E-01 -0.138320998722293008E-03 -0.140195196193045990E-03
+  0.000000000000000000E+00  0.319837098823795019E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.161423313471231993E-03
+ -0.161424178904679989E-03 -0.435069384800128994E-03 -0.435069399489265024E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.256582249674996987E-01  0.000000000000000000E+00
+ -0.434980660277106983E-03 -0.435120659871895998E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.166666666666667011E-01 -0.138286858201160009E-03
+ -0.140229336714178988E-03  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.419834073409849004E-01  0.000000000000000000E+00 -0.161423305232394999E-03
+ -0.161424187143517010E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.435069384518094995E-03 -0.435069399777710994E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.253691078101383992E-01
+  0.000000000000000000E+00 -0.434980641232482002E-03 -0.435120678179650019E-03
+ -0.666666666666666970E-02 -0.140229336714196010E-03 -0.138286858201142987E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.319837098823795990E-01
+  0.000000000000000000E+00 -0.161423313471231993E-03 -0.161424178904679989E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069384799506987E-03
+ -0.435069399489901993E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.253136149492911994E-01
+ -0.435016112049662988E-03 -0.435083691563128007E-03 -0.666666666666666970E-02
+ -0.138320998722293008E-03 -0.140195196193046992E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320885977646230974E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.435028541278858008E-03 -0.435070730506436016E-03 -0.435014782666575002E-03
+ -0.435085077833358993E-03 -0.435084998915685002E-03 -0.435014858345719973E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00 -0.138286722125585996E-03
+ -0.140229472789753001E-03 -0.138287952903743989E-03 -0.140228242011595009E-03
+ -0.138900254709783987E-03 -0.139615940205555010E-03  0.000000000000000000E+00
+  0.419834115616333006E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423305349811004E-03 -0.161424187026102008E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435070335762563986E-03
+ -0.435068469166236015E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.255991790107528011E-01  0.000000000000000000E+00
+ -0.434982884474892026E-03 -0.435118340650319981E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.138286203203484011E-03 -0.140229991711855013E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.253696597865739996E-01
+  0.000000000000000000E+00 -0.434979979753249990E-03 -0.435121367820745985E-03
+ -0.666666666666666970E-02 -0.140228217416730994E-03 -0.138287977498608003E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.319694542079494021E-01
+ -0.161422588303417009E-03 -0.161424904072495000E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435109502026177015E-03 -0.435030181217087981E-03
+ -0.166666666666667011E-01  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320997612467455035E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423316150031992E-03
+ -0.161424176225879990E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435046027403482012E-03 -0.435093283082004022E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.296643788713649992E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.435041466166206979E-03 -0.435057251216224993E-03
+ -0.434987713518249003E-03 -0.435113298977872021E-03 -0.435084291440460005E-03
+ -0.435015533322462983E-03 -0.666666666666666970E-02 -0.138384828446226003E-03
+ -0.140131366469113997E-03 -0.138922740970408006E-03 -0.139593453944930992E-03
+ -0.138352603070340010E-03 -0.140163591844998987E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.238543465439772992E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.420929717450221966E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.435014955543717996E-03 -0.435084897561344988E-03
+ -0.666666666666666970E-02 -0.138286677768216005E-03 -0.140229517147122992E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.000000000000000000E+00  0.257455320601093010E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.435050355484345984E-03
+ -0.435048079623292984E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.139258080653318999E-03 -0.139258114262021002E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.255775497212881002E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.161423313341254991E-03 -0.161424179034656991E-03 -0.666666666666666970E-02
+ -0.435070056149005984E-03 -0.435068742611218005E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.257402390203077015E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.435113345596286011E-03 -0.434987675900872992E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.138910391927832998E-03
+ -0.139605802987505999E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320999657878337022E-01
+ -0.161423321565590998E-03 -0.161424170810321011E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435043064748192026E-03 -0.435096312338415002E-03
+ -0.166666666666667011E-01  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.256636236446474014E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161423319781588012E-03 -0.161424172594323997E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.435040266525950976E-03
+ -0.435099173616122011E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.322086798404708011E-01
+ -0.435015968986542014E-03 -0.435083840783048986E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.138286185919272000E-03 -0.140230008996066998E-03
+ -0.166666666666667011E-01  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.255452807944169004E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161423207454397007E-03 -0.161424284921515002E-03
+ -0.166666666666667011E-01 -0.404324467010457988E-03 -0.466398863159433010E-03
+  0.000000000000000000E+00  0.318949372706568007E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.323838600968767004E-03
+ -0.544927294428741000E-03 -0.135274538150394994E-03 -0.143241656764945006E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.255622073245997013E-01  0.000000000000000000E+00
+ -0.114453247275369001E-03 -0.573228175113622992E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.166666666666667011E-01  0.218487273219374002E-03
+ -0.891418527608312016E-03  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.421663423170699977E-01  0.000000000000000000E+00  0.686068254293191984E-05
+ -0.694542104931923968E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.437928115910693992E-03 -0.432328899183500985E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.255893867272030014E-01
+  0.000000000000000000E+00 -0.330661927924627975E-03 -0.535633960374462992E-03
+ -0.666666666666666970E-02 -0.139701786050046989E-03 -0.138814408865292008E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.319824508284424977E-01
+  0.000000000000000000E+00 -0.161423276716622004E-03 -0.161424215659290005E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.453057324015285993E-03
+ -0.417479555947782987E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.252771264598301987E-01
+ -0.434642424564362999E-03 -0.435473396530201005E-03 -0.666666666666666970E-02
+ -0.142683907588298003E-03 -0.135832287327041998E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.310595659754696989E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.161480379356609011E-03
+ -0.161367113019302998E-03 -0.161367201132704989E-03 -0.161480291243207996E-03
+ -0.161431097435676003E-03 -0.161416394940237009E-03 -0.166666666666667011E-01
+ -0.438433354062566980E-03 -0.431786524598485992E-03 -0.139063303639515990E-03
+ -0.139452891275823008E-03 -0.436729888333051012E-03 -0.433449749920077998E-03
+  0.000000000000000000E+00  0.421404522783956001E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.402800893973209023E-03
+ -0.467871394310760984E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.227909597161434006E-03 -0.900840851550373050E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.256049312238144991E-01
+  0.000000000000000000E+00 -0.340826929960757021E-03 -0.529259757575146952E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.138562359099061988E-03
+ -0.139953835816277009E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.253040222377561991E-01  0.000000000000000000E+00 -0.246116167738164992E-03
+ -0.598443392983163958E-03 -0.666666666666666970E-02 -0.140073786324521011E-03
+ -0.138442408590818013E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.319703070887365007E-01 -0.161422660701845001E-03 -0.161424831674067008E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.426922283598156992E-03
+ -0.443394972905718991E-03 -0.166666666666667011E-01  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.320998325318406982E-01  0.000000000000000000E+00 -0.161423318386659000E-03
+ -0.161424173989254012E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.432674210328745981E-03 -0.437517849506738994E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.296374388638415004E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.435070730494953989E-03 -0.435028541287397997E-03
+ -0.435085047523665978E-03 -0.435014811728346980E-03 -0.435014733314390012E-03
+ -0.435085129293168984E-03 -0.666666666666666970E-02 -0.138910408020893991E-03
+ -0.139605786894445006E-03 -0.138288015962704006E-03 -0.140228178952634991E-03
+ -0.138286786003025006E-03 -0.140229408912313991E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.238543465439772992E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.419834073379218020E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.161423305232311000E-03 -0.161424187143602012E-03 -0.666666666666666970E-02
+ -0.435069384517251974E-03 -0.435069399778572989E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.256582252366890015E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.434980659785321009E-03
+ -0.435120660384639003E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.138286858754340991E-03 -0.140229336160998006E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.256868357246730003E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.434980640625951985E-03
+ -0.435120678812035981E-03 -0.666666666666666970E-02 -0.140229336292177010E-03
+ -0.138286858623161988E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.255642315719874988E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.161423743707386991E-03 -0.161423748668524991E-03  0.000000000000000000E+00
+ -0.435032940295220990E-03 -0.435106665066769985E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320996541101558966E-01
+ -0.161423313368980995E-03 -0.161424179006930987E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435069405363546012E-03 -0.435069379055302017E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.256786736305051992E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161423743710730996E-03 -0.161423748665181989E-03
+  0.000000000000000000E+00 -0.435032918551319986E-03 -0.435106687300586010E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320996542492566969E-01
+ -0.161423313372716993E-03 -0.161424179003194989E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435069400469630010E-03 -0.435069383841369989E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.254980067883639992E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.161506286329310991E-03
+ -0.161341206046600991E-03 -0.166666666666667011E-01 -0.431364756977960023E-03
+ -0.438866309539295992E-03  0.000000000000000000E+00  0.319866973594409992E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.161423386153596000E-03 -0.161424106222316009E-03
+ -0.435249325540191991E-03 -0.434893412677740991E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.256606788604577009E-01
+  0.000000000000000000E+00 -0.434963057292216018E-03 -0.435139014560906973E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.166666666666667011E-01
+ -0.138291831602560002E-03 -0.140224363312778995E-03  0.000000000000000000E+00
+  0.333333333333332982E-01  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.419833493286519976E-01  0.000000000000000000E+00 -0.161423303639283993E-03
+ -0.161424188736628991E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.435072704181455982E-03 -0.435066152962260003E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.253728412360505994E-01
+  0.000000000000000000E+00 -0.434979171689485976E-03 -0.435122209070102999E-03
+ -0.666666666666666970E-02 -0.140221824975804004E-03 -0.138294369939534993E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.319851023368492998E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423349940155008E-03
+ -0.161424142435757001E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435344396466284009E-03 -0.434800441646335982E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.333333333333332982E-01
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.251600082854456994E-01 -0.161430149015849008E-03
+ -0.161417343360063001E-03 -0.666666666666666970E-02 -0.432078300695953020E-03
+ -0.438133229979561004E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.312157350718397994E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161423748575108993E-03 -0.161423743800802989E-03
+ -0.161423313366150007E-03 -0.161424179009763005E-03 -0.161424179009759996E-03
+ -0.161423313366152013E-03 -0.166666666666667011E-01 -0.435069386750616997E-03
+ -0.435069397494826013E-03 -0.435069386095123000E-03 -0.435069398165091018E-03
+ -0.138284703375701995E-03 -0.140231491539637002E-03  0.000000000000000000E+00
+  0.420931147064136021E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.435014771024603002E-03 -0.435085089957181989E-03
+ -0.666666666666666970E-02 -0.138286970731685001E-03 -0.140229224183653997E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.254910621183637998E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.161423348017174001E-03 -0.161424144358738008E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435079665188463989E-03
+ -0.435059345445455996E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.249645250281322999E-01  0.000000000000000000E+00
+ -0.161423121269891999E-03 -0.161424371106020010E-03 -0.666666666666666970E-02
+ -0.139257838659735000E-03 -0.139258356255603997E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.319784847308889966E-01 -0.161423139772803008E-03
+ -0.161424352603109001E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435059648379540009E-03 -0.435079356600350973E-03 -0.166666666666667011E-01
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.323098477771483969E-01  0.000000000000000000E+00
+ -0.434855411650391993E-03 -0.435251202539120021E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.138421129190984010E-03 -0.140095065724355991E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.286728645834729984E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423315838776007E-03 -0.161424176537136002E-03
+ -0.161423752187323010E-03 -0.161423740188588999E-03 -0.161423317510291011E-03
+ -0.161424174865620998E-03 -0.666666666666666970E-02 -0.438771655527875001E-03
+ -0.431448953116295001E-03 -0.138271241563716002E-03 -0.140244953351622995E-03
+ -0.435165323893648012E-03 -0.434975574060252989E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.238543465439772992E-01 -0.666666666666666970E-02
+ -0.166666666666667011E-01  0.419833961506188985E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423304924141007E-03 -0.161424187451771002E-03
+ -0.666666666666666970E-02 -0.435069598057073975E-03 -0.435069190609702994E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.256582546102008015E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.434978063949837974E-03 -0.435123367209186982E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.138286924064156001E-03 -0.140229270851182996E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.256881329485436002E-01
+  0.000000000000000000E+00 -0.434979402202776018E-03 -0.435121969745410990E-03
+ -0.666666666666666970E-02 -0.140226692592859009E-03 -0.138289502322480991E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.257149793095595991E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.434964716442319023E-03 -0.435137270613733019E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.138403971130206005E-03
+ -0.140112223785132992E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.320995089735353969E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.161423309436031000E-03 -0.161424182939881009E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435071687413104014E-03 -0.435067147303640016E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.257775991854455010E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.434123548609677019E-03 -0.436014071331951010E-03  0.000000000000000000E+00
+ -0.138328115136844999E-03 -0.140188079778493999E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320970398516952013E-01
+ -0.161423235617312989E-03 -0.161424256758598993E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435167788232856023E-03 -0.434973170725627982E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.255649837703663986E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.161423599069647001E-03 -0.161423893306266011E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.435941020818056012E-03 -0.434216891005145009E-03
+  0.321778215370647006E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.133203828204913997E-03 -0.554477594184077968E-03
+ -0.310288607973751023E-03 -0.560774180157768039E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.258307931039124007E-01
+  0.000000000000000000E+00 -0.200815667582686987E-03 -0.626132462157779008E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.166666666666667011E-01
+ -0.446235732743376025E-03 -0.410993106848831988E-03  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.421918986255057984E-01
+  0.000000000000000000E+00 -0.435044026349501998E-03 -0.435054581088656020E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.138420116213701009E-03
+ -0.140096078701637989E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.251983478038565016E-01  0.000000000000000000E+00
+ -0.874760871910627032E-05 -0.678933813669884995E-03 -0.666666666666666970E-02
+ -0.139011566068860000E-03 -0.139504628846480000E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.319960854906845005E-01  0.000000000000000000E+00
+ -0.161423534312251998E-03 -0.161423958063660011E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.422822744160627974E-03 -0.447571236367950997E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.333333333333332982E-01  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.251635090375166015E-01 -0.161499426349241996E-03
+ -0.161348066026670989E-03 -0.666666666666666970E-02 -0.431483288490702005E-03
+ -0.438741647350535027E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.312158150513081011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161424178244698994E-03 -0.161423314131212988E-03
+ -0.161423314059391994E-03 -0.161424178316519988E-03 -0.161423748120379993E-03
+ -0.161423744255532992E-03 -0.166666666666667011E-01 -0.435052937198406996E-03
+ -0.435086217674513994E-03 -0.138285143315771992E-03 -0.140231051599568009E-03
+ -0.435084983208553009E-03 -0.435054143960659026E-03  0.000000000000000000E+00
+  0.420929687916430026E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.435023352498181996E-03 -0.435076141242082002E-03 -0.666666666666666970E-02
+ -0.138286660717270009E-03 -0.140229534198068988E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.256871016294627011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.435049188710877018E-03 -0.435049198898220012E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.139258077330179004E-03
+ -0.139258117585160996E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.252598187185143000E-01  0.000000000000000000E+00
+ -0.161423313380516987E-03 -0.161424178995394995E-03 -0.666666666666666970E-02
+ -0.435070189961645989E-03 -0.435068611747460024E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.320929801235530984E-01 -0.435014945837523977E-03
+ -0.435084907681736004E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.138286694923904008E-03 -0.140229499991435992E-03 -0.166666666666667011E-01
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320996579074415977E-01
+  0.000000000000000000E+00 -0.161423313471188002E-03 -0.161424178904724007E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069377867833978E-03
+ -0.435069406577771021E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.295485029270328992E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.435016111300222006E-03 -0.435083692344642020E-03
+ -0.435055716669636018E-03 -0.435042938584860996E-03 -0.435014924590734005E-03
+ -0.435084929837698995E-03 -0.666666666666666970E-02 -0.138321048357603003E-03
+ -0.140195146557735994E-03 -0.138900264538850991E-03 -0.139615930376488006E-03
+ -0.138286695606074989E-03 -0.140229499309265011E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.238543465439772992E-01
+ -0.666666666666666970E-02 -0.166666666666667011E-01  0.420929804437093005E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.435014924430680990E-03
+ -0.435084930004601023E-03 -0.666666666666666970E-02 -0.138286695607513997E-03
+ -0.140229499307826003E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.257458287840863984E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.435049188263473999E-03
+ -0.435049199364782998E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.139258097267473007E-03 -0.139258097647866993E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.255775463585166986E-01  0.000000000000000000E+00 -0.161423313378626003E-03
+ -0.161424178997286006E-03 -0.666666666666666970E-02 -0.435069399615192016E-03
+ -0.435069384676977993E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.257404832051559007E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435042938584863977E-03
+ -0.435055716669632006E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.138910376055990990E-03 -0.139605818859349010E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.320997361635547013E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423315571449996E-03 -0.161424176804462013E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069384753956998E-03
+ -0.435069399536463976E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.256633780155203009E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161423313378038989E-03 -0.161424178997872993E-03
+  0.000000000000000000E+00 -0.435069385238039011E-03 -0.435069399041488984E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.322089283932210022E-01 -0.435014924466399001E-03 -0.435084929967365997E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.138286695450416002E-03
+ -0.140229499464922995E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.256614590292296996E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.435016112049732974E-03 -0.435083691563056016E-03
+ -0.166666666666667011E-01 -0.138320998722292005E-03 -0.140195196193046992E-03
+  0.000000000000000000E+00  0.319837098823798002E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.161423313471231993E-03 -0.161424178904679989E-03 -0.435069384798286989E-03
+ -0.435069399491148988E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.256582249674734003E-01  0.000000000000000000E+00
+ -0.434980660273001976E-03 -0.435120659876175994E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.166666666666667011E-01 -0.138286858201095011E-03
+ -0.140229336714244013E-03  0.000000000000000000E+00  0.333333333333332982E-01
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.419834073410084996E-01
+  0.000000000000000000E+00 -0.161423305232396002E-03 -0.161424187143516007E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435069384645917010E-03
+ -0.435069399647007983E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.253691078100609993E-01  0.000000000000000000E+00
+ -0.434980641243667987E-03 -0.435120678167986010E-03 -0.666666666666666970E-02
+ -0.140229336714366013E-03 -0.138286858200973987E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.319837098823803970E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423313471231993E-03 -0.161424178904679989E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069384793870004E-03
+ -0.435069399495665991E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.333333333333332982E-01  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.253136149492908004E-01 -0.435016112049908018E-03 -0.435083691562873978E-03
+ -0.666666666666666970E-02 -0.138320998722292005E-03 -0.140195196193046992E-03
+ -0.166666666666667011E-01  0.312157364805190005E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.161423748574844990E-03 -0.161423743801066992E-03 -0.161423313378673003E-03
+ -0.161424178997239006E-03 -0.161424178997232013E-03 -0.161423313378679996E-03
+ -0.166666666666667011E-01 -0.435069383970378013E-03 -0.435069400337715023E-03
+ -0.435069376957132018E-03 -0.435069407508996008E-03 -0.138284694213328010E-03
+ -0.140231500702010987E-03  0.000000000000000000E+00  0.420929739826170005E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.435014932601145024E-03
+ -0.435084921485382001E-03 -0.666666666666666970E-02 -0.138286682366655988E-03
+ -0.140229512548683009E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.254915570191668016E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423360046659997E-03
+ -0.161424132329252012E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.435068904052976024E-03 -0.435069891069177000E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.249645635933331000E-01 -0.161423119060364001E-03 -0.161424373315548008E-03
+ -0.666666666666666970E-02 -0.139258110049798010E-03 -0.139258084865540987E-03
+ -0.166666666666667011E-01  0.319795589129939009E-01 -0.161423181597269993E-03
+ -0.161424310778641989E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435063340741964014E-03 -0.435075580375908022E-03 -0.166666666666667011E-01
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.322057823008502012E-01
+  0.000000000000000000E+00 -0.435021995940365993E-03 -0.435077556208205004E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.138280141008673011E-03
+ -0.140236053906665986E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.286987467722891014E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423485461700003E-03
+ -0.161424006914212006E-03 -0.161423743701962999E-03 -0.161423748673950013E-03
+ -0.161423485478030988E-03 -0.161424006897880994E-03 -0.666666666666666970E-02
+ -0.434818199915888975E-03 -0.435326186273355992E-03 -0.138107939683270998E-03
+ -0.140408255232067999E-03 -0.434750487938305018E-03 -0.435395405799094998E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.238543465439772992E-01 -0.666666666666666970E-02 -0.166666666666667011E-01
+  0.419834078568204969E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423305246576987E-03
+ -0.161424187129334995E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.435069368742756982E-03 -0.435069415908642004E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.256580957796514984E-01
+  0.000000000000000000E+00 -0.434980863637928004E-03 -0.435120447850464987E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.138286593339167997E-03
+ -0.140229601576171000E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.256868273049145994E-01  0.000000000000000000E+00 -0.434980612246944019E-03
+ -0.435120708410805001E-03 -0.666666666666666970E-02 -0.140229353764139986E-03
+ -0.138286841151199011E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.256611955309135990E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.435106719724830994E-03 -0.434994029732972990E-03  0.000000000000000000E+00
+ -0.138320118014007004E-03 -0.140196076901331993E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.320999233193186018E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423320520808004E-03 -0.161424171855104005E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435059778147594005E-03
+ -0.435079222540635984E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.257716863610135995E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.435022363973044002E-03 -0.435077172538276982E-03
+  0.000000000000000000E+00 -0.138312498857191992E-03 -0.140203696058148009E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320996765524508010E-01
+  0.000000000000000000E+00 -0.161423313971483987E-03 -0.161424178404427995E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435068773858323006E-03
+ -0.435070024197505999E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.255013626644550011E-01 -0.666666666666666970E-02
+ -0.161778513994174000E-03 -0.161068978381738009E-03 -0.166666666666667011E-01
+ -0.430721544928321987E-03 -0.439524018875910982E-03  0.319967451293014030E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.161423540387061002E-03 -0.161423951988851007E-03
+ -0.436779134539303025E-03 -0.433396918749490016E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.257463974004814994E-01
+  0.000000000000000000E+00 -0.357123790381600977E-03 -0.513334443915617971E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.166666666666667011E-01
+ -0.138849213053182001E-03 -0.139666981862157999E-03  0.000000000000000000E+00
+  0.333333333333332982E-01  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.421600707339402012E-01  0.000000000000000000E+00 -0.438162222449141976E-03
+ -0.432071491857211011E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.110310967390135005E-03 -0.562620286998803958E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.250247683829411989E-01
+  0.000000000000000000E+00  0.312489112005970006E-04 -0.718930333589588031E-03
+ -0.666666666666666970E-02 -0.143948366628396992E-03 -0.134567828286942005E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.321784029807069966E-01
+ -0.131393333165138998E-03 -0.556288089223852967E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.344952960577046006E-03 -0.526365968885401047E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.333333333333332982E-01
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.252170877020880015E-01 -0.161423598610962993E-03
+ -0.161423893764948989E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.435940683620558982E-03 -0.434217221098728986E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.312157364068764015E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161423748574870008E-03 -0.161423743801043004E-03
+ -0.161423313378020991E-03 -0.161424178997891994E-03 -0.161424178997885001E-03
+ -0.161423313378028011E-03 -0.166666666666667011E-01 -0.435069384799240003E-03
+ -0.435069399490174995E-03 -0.435069377535609984E-03 -0.435069406917481974E-03
+ -0.138284694694889009E-03 -0.140231500220449988E-03  0.000000000000000000E+00
+  0.000000000000000000E+00  0.420929804454727996E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.435014924420943987E-03 -0.435084930014753980E-03
+ -0.666666666666666970E-02 -0.138286695611132006E-03 -0.140229499304206991E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.256871018618805014E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.435049188236810973E-03 -0.435049199392586983E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.139258097270825008E-03
+ -0.139258097644513989E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.252598186487732017E-01  0.000000000000000000E+00
+ -0.161423313378620013E-03 -0.161424178997291996E-03 -0.666666666666666970E-02
+ -0.435069399482861021E-03 -0.435069384806393027E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.320929801562968992E-01 -0.435014924399239995E-03
+ -0.435084930037426003E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.138286695018654008E-03 -0.140229499896684989E-03 -0.166666666666667011E-01
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320996579148839015E-01
+  0.000000000000000000E+00 -0.161423313471389989E-03 -0.161424178904521993E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069384785220998E-03
+ -0.435069399504509015E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.295485029252238011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.435016114058515979E-03 -0.435083689468307975E-03 -0.435055716894270023E-03
+ -0.435042938369449973E-03 -0.435014924446868020E-03 -0.435084929987721025E-03
+ -0.666666666666666970E-02 -0.138321048328908996E-03 -0.140195146586430001E-03
+ -0.138900264540856006E-03 -0.139615930374483994E-03 -0.138286695607003012E-03
+ -0.140229499308336012E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.238543465439772992E-01 -0.666666666666666970E-02 -0.166666666666667011E-01
+  0.420929804454603998E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.435014924420999986E-03 -0.435084930014695975E-03 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.138286695611106988E-03 -0.140229499304232009E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.257458287849481986E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.435049188236881013E-03 -0.435049199392514017E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.139258097270799990E-03
+ -0.139258097644539007E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.255775463582476013E-01  0.000000000000000000E+00
+ -0.161423313378621992E-03 -0.161424178997289990E-03 -0.666666666666666970E-02
+ -0.435069399483869980E-03 -0.435069384805406023E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.257404831771072991E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.435042938369448998E-03 -0.435055716894270999E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.138910376031616011E-03 -0.139605818883723013E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320997361628699990E-01
+  0.000000000000000000E+00 -0.161423315571431998E-03 -0.161424176804481013E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069384829067001E-03
+ -0.435069399459660992E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.256633780151233996E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.161423313378029990E-03 -0.161424178997881992E-03  0.000000000000000000E+00
+ -0.435069376411787974E-03 -0.435069408066628019E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.322089281582298992E-01 -0.435014924431676993E-03 -0.435084930003603991E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.138286694967978995E-03
+ -0.140229499947360002E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.256614590292298002E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.435016112049635016E-03 -0.435083691563157009E-03
+ -0.166666666666667011E-01 -0.138320998722293008E-03 -0.140195196193045990E-03
+  0.000000000000000000E+00  0.319837098823795019E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.161423313471231993E-03 -0.161424178904679989E-03 -0.435069384800495997E-03
+ -0.435069399488889998E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.256582249675054996E-01  0.000000000000000000E+00
+ -0.434980660275541991E-03 -0.435120659873527993E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.166666666666667011E-01 -0.138286858201179010E-03
+ -0.140229336714159988E-03  0.000000000000000000E+00  0.333333333333332982E-01
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.419834073409666025E-01
+  0.000000000000000000E+00 -0.161423305232394999E-03 -0.161424187143517010E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435069384656581982E-03
+ -0.435069399636101993E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.253691078101523984E-01  0.000000000000000000E+00
+ -0.434980641228473002E-03 -0.435120678183829998E-03 -0.666666666666666970E-02
+ -0.140229336714159988E-03 -0.138286858201179010E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.319837098823795019E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423313471231993E-03 -0.161424178904679989E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069384800471982E-03
+ -0.435069399488914989E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.333333333333332982E-01  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.253136149492911994E-01 -0.435016112049635992E-03 -0.435083691563157009E-03
+ -0.666666666666666970E-02 -0.138320998722293008E-03 -0.140195196193045990E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.320885629537309008E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.435084884244977990E-03 -0.435014968312467975E-03
+ -0.435014924420903005E-03 -0.435084930014797023E-03 -0.435055716900959984E-03
+ -0.435042938363035996E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+ -0.138286695608023002E-03 -0.140229499307315995E-03 -0.138900264540568990E-03
+ -0.139615930374770007E-03 -0.138287928689845009E-03 -0.140228266225493988E-03
+  0.000000000000000000E+00  0.419834073409644029E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423305232394999E-03 -0.161424187143517010E-03
+ -0.666666666666666970E-02 -0.435069384693222974E-03 -0.435069399598635002E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.255994980449185013E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.434980660275051986E-03 -0.435120659874038977E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.138286858201180012E-03
+ -0.140229336714159012E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.253691078105353005E-01  0.000000000000000000E+00
+ -0.434980641228200975E-03 -0.435120678184114005E-03 -0.666666666666666970E-02
+ -0.140229336713374998E-03 -0.138286858201963999E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.319837881362097023E-01 -0.161423315571430996E-03
+ -0.161424176804481013E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435069384829208001E-03 -0.435069399459517986E-03 -0.166666666666667011E-01
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320996579140455027E-01
+  0.000000000000000000E+00 -0.161423313471367004E-03 -0.161424178904545006E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069384800495021E-03
+ -0.435069399488890974E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.295485029266231991E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.435016113771493024E-03
+ -0.435083689767614020E-03 -0.435055716901091010E-03 -0.435042938362910012E-03
+ -0.435014924420824020E-03 -0.435084930014878989E-03 -0.666666666666666970E-02
+ -0.138321048331202002E-03 -0.140195146584136995E-03 -0.138900264540522993E-03
+ -0.139615930374817007E-03 -0.138286695607886989E-03 -0.140229499307452008E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.238543465439772992E-01 -0.666666666666666970E-02 -0.166666666666667011E-01
+  0.420929804454732021E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.435014924420943012E-03 -0.435084930014755010E-03 -0.666666666666666970E-02
+ -0.138286695611133009E-03 -0.140229499304205988E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.257458287849504017E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.435049188236650999E-03 -0.435049199392754005E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.139258097270826011E-03
+ -0.139258097644513013E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.255775463581538985E-01  0.000000000000000000E+00
+ -0.161423313378620013E-03 -0.161424178997291996E-03 -0.666666666666666970E-02
+ -0.435069399482816027E-03 -0.435069384806436991E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.257404831864265007E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.435042938362910012E-03 -0.435055716901091010E-03  0.000000000000000000E+00
+ -0.138910376039622004E-03 -0.139605818875716993E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.320997361628520966E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423315571430996E-03 -0.161424176804481013E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069384829304983E-03
+ -0.435069399459418022E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.256633780150350016E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161423313378028011E-03 -0.161424178997883998E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.435069377535611014E-03
+ -0.435069406917480998E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.322089281809833025E-01 -0.435014924399087014E-03
+ -0.435084930037585977E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.138286695014638991E-03 -0.140229499900700006E-03 -0.166666666666667011E-01
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.255642320220383010E-01 -0.666666666666666970E-02 -0.161423743718748996E-03
+ -0.161423748657163013E-03 -0.166666666666667011E-01 -0.435032852654396000E-03
+ -0.435106754681328976E-03  0.319837065485489985E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.161423313381479000E-03
+ -0.161424178994433009E-03 -0.435069384799287003E-03 -0.435069399490127019E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.256582249674894985E-01
+  0.000000000000000000E+00 -0.434980660275194016E-03 -0.435120659873890984E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.166666666666667011E-01
+ -0.138286858201148001E-03 -0.140229336714191999E-03  0.000000000000000000E+00
+  0.333333333333332982E-01  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.419834073409640976E-01  0.000000000000000000E+00 -0.161423305232394999E-03
+ -0.161424187143517010E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.435069384687570974E-03 -0.435069399604414992E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.253691078101366992E-01
+  0.000000000000000000E+00 -0.434980641228032002E-03 -0.435120678184291001E-03
+ -0.666666666666666970E-02 -0.140229336714191999E-03 -0.138286858201148001E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.319837065485489985E-01
+  0.000000000000000000E+00 -0.161423313381479000E-03 -0.161424178994433009E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069384799287003E-03
+ -0.435069399490127019E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.333333333333332982E-01  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.252163879420997002E-01
+ -0.161423743718748996E-03 -0.161423748657163013E-03 -0.666666666666666970E-02
+ -0.435032852654396000E-03 -0.435106754681328976E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.320887366348247965E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.435111019942127013E-03 -0.434989904848891011E-03
+ -0.435014811994715992E-03 -0.435085047244578027E-03 -0.435084986246467005E-03
+ -0.435014870489667008E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+ -0.138286808290661993E-03 -0.140229386624677005E-03 -0.138288040456547998E-03
+ -0.140228154458792002E-03 -0.138900328862363002E-03 -0.139615866052975995E-03
+  0.000000000000000000E+00  0.419834068919483008E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423305219605994E-03
+ -0.161424187156306991E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.435073590544243023E-03 -0.435065286134219016E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.255994981604824000E-01
+  0.000000000000000000E+00 -0.434980605991956010E-03 -0.435120716478292976E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.138286858621097992E-03
+ -0.140229336294241005E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.253691080400011010E-01  0.000000000000000000E+00 -0.434980551188209996E-03
+ -0.435120772070955015E-03 -0.666666666666666970E-02 -0.140229336059168004E-03
+ -0.138286858856170994E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.319837098814821016E-01 -0.161423313471205999E-03 -0.161424178904706010E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069401084582977E-03
+ -0.435069383239967001E-03 -0.166666666666667011E-01  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.320996545746903006E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.161423313381464986E-03 -0.161424178994447998E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435069392005489001E-03 -0.435069392121542001E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.292425277050150992E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423743718737991E-03 -0.161423748657174993E-03
+ -0.161424178997885001E-03 -0.161423313378027008E-03 -0.161423313378019988E-03
+ -0.161424178997891994E-03 -0.666666666666666970E-02 -0.435032852646152974E-03
+ -0.435106754689757998E-03 -0.435069377535869000E-03 -0.435069406917216995E-03
+  0.000000000000000000E+00 -0.435069384798973994E-03 -0.435069399490447021E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.238543465439772992E-01
+ -0.666666666666666970E-02 -0.166666666666667011E-01  0.420929804212083966E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.435014924419121986E-03
+ -0.435084930016657026E-03 -0.666666666666666970E-02 -0.138286695561419006E-03
+ -0.140229499353920994E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.255502626954251001E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423359539924989E-03
+ -0.161424132835986993E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.435069382989675027E-03 -0.435069401340516977E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.252823579642372995E-01  0.000000000000000000E+00 -0.161423118848985994E-03
+ -0.161424373526926991E-03 -0.666666666666666970E-02 -0.139258097691293003E-03
+ -0.139258097224046997E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.255489358069172015E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161423313378028011E-03 -0.161424178997883998E-03
+  0.000000000000000000E+00 -0.435069377535374983E-03 -0.435069406917722016E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.322089281809831013E-01
+  0.000000000000000000E+00 -0.435014924399098994E-03 -0.435084930037573996E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.138286695014637988E-03
+ -0.140229499900701009E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.258549253945415010E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.435042938363035996E-03 -0.435055716900959008E-03  0.000000000000000000E+00
+ -0.138910376039619998E-03 -0.139605818875718999E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.320997361628526032E-01 -0.161423315571430996E-03 -0.161424176804481013E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069384829208001E-03
+ -0.435069399459517010E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.255489358069375012E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.161423313378028011E-03
+ -0.161424178997883998E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.435069377571737986E-03 -0.435069406880540007E-03
+  0.000000000000000000E+00  0.320929801553170024E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435014924399170009E-03
+ -0.435084930037499024E-03 -0.138286695016646012E-03 -0.140229499898693013E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.255502626954253013E-01
+  0.000000000000000000E+00 -0.161423359539924989E-03 -0.161424132835986993E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.166666666666667011E-01
+ -0.435069382989559993E-03 -0.435069401340634017E-03  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.420929804212068007E-01
+  0.000000000000000000E+00 -0.435014924417415994E-03 -0.435084930018437015E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.138286695561417000E-03
+ -0.140229499353921997E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.249646302548570016E-01  0.000000000000000000E+00
+ -0.161423118848985994E-03 -0.161424373526926991E-03 -0.666666666666666970E-02
+ -0.139258097691296012E-03 -0.139258097224043988E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.319837881362063023E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423315571430996E-03 -0.161424176804481013E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069384829284004E-03
+ -0.435069399459439977E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.333333333333332982E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.253926391078276997E-01 -0.435042938363074973E-03
+ -0.435055716900919001E-03 -0.666666666666666970E-02 -0.138910376047635993E-03
+ -0.139605818867703004E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.323033163368178014E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.435248914087700014E-03 -0.434857618270111026E-03
+ -0.434883815525587006E-03 -0.435221605060230013E-03 -0.435144012464388997E-03
+ -0.434958245913406004E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+ -0.138392691801862987E-03 -0.140123503113476010E-03 -0.138936896109538013E-03
+ -0.139579298805801011E-03 -0.138396653972959005E-03 -0.140119540942379992E-03
+  0.000000000000000000E+00  0.419134531554943013E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.160003730031608006E-03
+ -0.162843762344304003E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.439266035089253990E-03 -0.430980363868474004E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.256392674343047001E-01
+  0.000000000000000000E+00 -0.434777562089547990E-03 -0.435332391680561979E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.138409864981691995E-03
+ -0.140106329933648005E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.254101279172756003E-01  0.000000000000000000E+00 -0.373644623812598002E-03
+ -0.492026351508191052E-03 -0.666666666666666970E-02 -0.908412435916435045E-03
+  0.235481181527497004E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.321268372004408984E-01 -0.200001267691483995E-03 -0.627372315610022989E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.127171097760206999E-03
+ -0.545760156628731963E-03 -0.166666666666667011E-01  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.320995602249590020E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.161423310638646997E-03 -0.161424181737265012E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.437288210106853009E-03 -0.432899574546833006E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.296377168966710999E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.434989904918347983E-03 -0.435111019849799988E-03
+ -0.435084787576991007E-03 -0.435015060995395985E-03 -0.435015087313978986E-03
+ -0.435084760132168008E-03 -0.666666666666666970E-02 -0.138910479330903999E-03
+ -0.139605715584434998E-03 -0.138288220902347007E-03 -0.140227974012992993E-03
+  0.000000000000000000E+00 -0.138286985894706013E-03 -0.140229209020633011E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.238543465439772992E-01
+ -0.666666666666666970E-02 -0.166666666666667011E-01  0.419834073423189028E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423305232431997E-03
+ -0.161424187143480012E-03 -0.666666666666666970E-02 -0.435069384782949973E-03
+ -0.435069399506886020E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.256582245002681016E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.434980660161395994E-03
+ -0.435120659992680999E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.138286857243334990E-03 -0.140229337672004008E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.256868356243260994E-01  0.000000000000000000E+00 -0.434980641780946975E-03
+ -0.435120677607716026E-03 -0.666666666666666970E-02 -0.140229336498973991E-03
+ -0.138286858416365006E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.255642316967265998E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.161423743725422992E-03 -0.161423748650489993E-03  0.000000000000000000E+00
+ -0.435032788457063980E-03 -0.435106820324987996E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.320996548633918979E-01  0.000000000000000000E+00
+ -0.161423313389182012E-03 -0.161424178986729997E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435069349793661977E-03 -0.435069435284559975E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.256786747351973009E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.161423743711796008E-03 -0.161423748664116001E-03  0.000000000000000000E+00
+ -0.435032875429173019E-03 -0.435106731393101022E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.320996542943430968E-01 -0.161423313373898990E-03 -0.161424179002013995E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069393047989017E-03
+ -0.435069391099458976E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.256124534411805993E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.422563580131968979E-03 -0.447998624216189984E-03
+ -0.166666666666667011E-01 -0.138286150728846990E-03 -0.140230044186493010E-03
+  0.000000000000000000E+00  0.319552376669784977E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.161419817025352993E-03 -0.161427675350558989E-03 -0.434132277684113981E-03
+ -0.436028351477167012E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.256464487089681989E-01  0.000000000000000000E+00 -0.434899896387440008E-03
+ -0.435204881412590012E-03  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.166666666666667011E-01 -0.138261308742774005E-03 -0.140254886172565995E-03
+  0.000000000000000000E+00  0.000000000000000000E+00  0.333333333333332982E-01
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.419830852156009016E-01
+  0.000000000000000000E+00 -0.161423296248432011E-03 -0.161424196127479998E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435080309509953001E-03
+ -0.435058715368486990E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.253594622357068006E-01  0.000000000000000000E+00
+ -0.434950331803641015E-03 -0.435152283781920987E-03 -0.666666666666666970E-02
+ -0.140249996876020999E-03 -0.138266198039317998E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.319571786575686967E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161420475113085006E-03 -0.161427017262827003E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.434308830163140978E-03
+ -0.435847642627907976E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.333333333333332982E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.252660555352814988E-01 -0.423620518083848974E-03
+ -0.446908164705409994E-03 -0.666666666666666970E-02 -0.138290733115793013E-03
+ -0.140225461799546011E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.320833316936339971E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.434976564103407978E-03 -0.435124935063587002E-03 -0.435182102606195996E-03
+ -0.434921744344358996E-03 -0.434970473518194996E-03 -0.435131286534540024E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00 -0.138283560982178005E-03
+ -0.140232633933160992E-03 -0.138290646200932993E-03 -0.140225548714406005E-03
+ -0.138898978959791009E-03 -0.139617215955547988E-03  0.000000000000000000E+00
+  0.421628681367203009E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.397192681545031025E-04 -0.727400690543495007E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435235074593796000E-03
+ -0.434912988301708998E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.256973882949261996E-01  0.000000000000000000E+00
+ -0.435443110746612982E-03 -0.434671360314161997E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.138672805730211991E-03 -0.139843389185127006E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.253811408048402014E-01
+  0.000000000000000000E+00 -0.111660192932873994E-03 -0.576021229456117972E-03
+ -0.666666666666666970E-02 -0.807402803567670987E-03  0.113198743547475004E-04
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.318951479546440986E-01
+ -0.334801614937765011E-03 -0.534674356904564995E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.135236534786964987E-03 -0.143279660128374010E-03
+ -0.166666666666667011E-01  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320991543630683995E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423299320898012E-03
+ -0.161424193055014999E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.439089518603328992E-03 -0.431138195492317014E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.292550790147875009E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.161423735549572994E-03 -0.161423756826338988E-03 -0.161424083758227992E-03
+ -0.161423408617684993E-03 -0.161423408591030993E-03 -0.161424083784880988E-03
+ -0.666666666666666970E-02 -0.434939106056469018E-03 -0.435202598866759986E-03
+ -0.434989974486759020E-03 -0.435150588304750977E-03  0.000000000000000000E+00
+ -0.434898044279196977E-03 -0.435244580341264027E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.238543465439772992E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.420929736779582994E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.435014942481456979E-03
+ -0.435084911182310997E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.138286681738492993E-03 -0.140229513176846004E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.255502900122648001E-01
+  0.000000000000000000E+00 -0.161423360190818011E-03 -0.161424132185093998E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435068884552595005E-03
+ -0.435069911008886009E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.252824734873801985E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423118220565989E-03
+ -0.161424374155345993E-03 -0.666666666666666970E-02 -0.139258110569624000E-03
+ -0.139258084345714997E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.255485447377057015E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.161423302756024003E-03 -0.161424189619888006E-03  0.000000000000000000E+00
+ -0.434872688531902005E-03 -0.435270525342468988E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.322052497845985025E-01  0.000000000000000000E+00 -0.435023979094175024E-03
+ -0.435075488205578982E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.138279010834745992E-03 -0.140237184080593006E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.258530259445350005E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.434958189050955980E-03 -0.435144092591555997E-03  0.000000000000000000E+00
+ -0.138909064929744003E-03 -0.139607129985594994E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.320994819723071992E-01 -0.161423308725767013E-03
+ -0.161424183650144996E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435080068461255018E-03 -0.435058950988906974E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.256614731138163008E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.429527300766952978E-03 -0.440793399874087001E-03 -0.166666666666667011E-01
+ -0.138412525549365991E-03 -0.140103669365973006E-03  0.000000000000000000E+00
+  0.319693244854589026E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.161422576318977012E-03
+ -0.161424916056934997E-03 -0.437741651795140981E-03 -0.432457178334903997E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.257682962742953013E-01
+  0.000000000000000000E+00 -0.372135275695885989E-03 -0.498733752802193965E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.166666666666667011E-01
+ -0.138908713201794991E-03 -0.139607481713544007E-03  0.000000000000000000E+00
+  0.000000000000000000E+00  0.333333333333332982E-01  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.418896828694114015E-01  0.000000000000000000E+00
+ -0.436533664255522999E-03 -0.433624016339187999E-03 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.134626819469867013E-03 -0.143889375445472011E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.252833740154079986E-01  0.000000000000000000E+00  0.378159457419296988E-04
+ -0.725497368130921007E-03 -0.666666666666666970E-02 -0.597713125202046038E-03
+ -0.752181291868920024E-04 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.321707087590768978E-01  0.000000000000000000E+00 -0.981956570690172963E-04
+ -0.589485765319974005E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.242422521139259002E-03 -0.626419769058458972E-03 -0.166666666666667011E-01
+  0.333333333333332982E-01  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.252392538739079006E-01 -0.333702074783927001E-03 -0.536013663819403965E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.138219319560300990E-03 -0.140296875355038007E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.323033117395759989E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435248913863458978E-03
+ -0.434857618485890026E-03 -0.434883818075460025E-03 -0.435221602402711982E-03
+ -0.435144010721807983E-03 -0.434958247585231009E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00 -0.138392690175634999E-03 -0.140123504739703998E-03
+ -0.138936895546960007E-03 -0.139579299368378990E-03 -0.138396652402613996E-03
+ -0.140119542512725002E-03  0.000000000000000000E+00  0.419134536981460026E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.160003888239780996E-03
+ -0.162843604136131013E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.439265966752186974E-03 -0.430980430216653992E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.000000000000000000E+00
+  0.256385430489332010E-01  0.000000000000000000E+00 -0.434778992070864020E-03
+ -0.435330901615854019E-03  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.138408856053690005E-03 -0.140107338861648992E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.254101277388419991E-01  0.000000000000000000E+00
+ -0.373643453136775986E-03 -0.492027450061672958E-03 -0.666666666666666970E-02
+ -0.908412791146940971E-03  0.235481536758003012E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.321268373345425032E-01 -0.200001595110305000E-03
+ -0.627372161158813030E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.127171079349847012E-03 -0.545760175039092005E-03 -0.166666666666667011E-01
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.321000029443719026E-01  0.000000000000000000E+00 -0.161423322269462990E-03
+ -0.161424170106448992E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.437246082492378997E-03 -0.432940746439987984E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.296367144183955003E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.435124963867026995E-03
+ -0.434976534547966984E-03 -0.435077880745983003E-03 -0.435021684411966020E-03
+ -0.435028125198068990E-03 -0.435071164272797983E-03 -0.666666666666666970E-02
+ -0.138910250043690000E-03 -0.139605944871648997E-03 -0.138289604026781002E-03
+ -0.140226590888557995E-03 -0.138286261066744988E-03 -0.140229933848595012E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.238543465439772992E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.419834145363481015E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423305428732005E-03 -0.161424186947180004E-03
+ -0.666666666666666970E-02 -0.435068797988044014E-03 -0.435069999528659985E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.256581969780162004E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.434982641293220015E-03 -0.435118594151826009E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.138286797863401998E-03
+ -0.140229397051936999E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.256861833871666993E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.434980942275246015E-03 -0.435120364500525027E-03
+ -0.666666666666666970E-02 -0.140230675945465999E-03 -0.138285518969872998E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.255648152871906995E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.161423744276026012E-03
+ -0.161423748099886999E-03  0.000000000000000000E+00 -0.434971116348022017E-03
+ -0.435169879709118980E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.320996762669861976E-01  0.000000000000000000E+00
+ -0.161423313958423010E-03 -0.161424178417488999E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435067366048129010E-03 -0.435071463729721000E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.256787289926059988E-01  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161423752315998997E-03 -0.161423740059913012E-03
+  0.000000000000000000E+00 -0.434863867040502017E-03 -0.435279543030525011E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.321000539630773019E-01 -0.161423323759014989E-03 -0.161424168616896993E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435021966445662005E-03
+ -0.435117885588833984E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.256124534378156000E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.422563583786935012E-03 -0.447998620447711019E-03
+ -0.166666666666667011E-01 -0.138286150716901006E-03 -0.140230044198437991E-03
+  0.000000000000000000E+00  0.319552376748891975E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.161419817028325008E-03 -0.161427675347588004E-03 -0.434132178542880997E-03
+ -0.436028452917342011E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.256464472998883010E-01  0.000000000000000000E+00 -0.434900113529982019E-03
+ -0.435204654991785977E-03  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.166666666666667011E-01 -0.138261303887879009E-03 -0.140254891027459988E-03
+  0.000000000000000000E+00  0.000000000000000000E+00  0.333333333333332982E-01
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.419830890665787021E-01
+  0.000000000000000000E+00 -0.161423296358576002E-03 -0.161424196017336007E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435055172719591018E-03
+ -0.435083932013876024E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.253594585320825010E-01  0.000000000000000000E+00
+ -0.434951272976298023E-03 -0.435151302408671016E-03 -0.666666666666666970E-02
+ -0.140250006846153006E-03 -0.138266188069185992E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.319571786801332997E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161420475119938004E-03 -0.161427017255975008E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.434308550272304017E-03
+ -0.435847929000599975E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.333333333333332982E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.252660555256693994E-01 -0.423620528167419021E-03
+ -0.446908154298355977E-03 -0.666666666666666970E-02 -0.138290733082334995E-03
+ -0.140225461833005005E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.320861713072602972E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435354479890220003E-03
+ -0.434756412513020009E-03 -0.435057879269062974E-03 -0.435040865004169999E-03
+ -0.434966118570618990E-03 -0.435135827690980980E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00 -0.138285724464454012E-03 -0.140230470450885988E-03
+ -0.138897585402013996E-03 -0.139618609513325002E-03 -0.138201697331153989E-03
+ -0.140314497584185008E-03  0.000000000000000000E+00  0.421628678547669028E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.397284147640295009E-04 -0.727409837153020992E-03 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.435314260356761012E-03 -0.434838234146065004E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.256973904451089016E-01  0.000000000000000000E+00 -0.435443064032224982E-03
+ -0.434671405121811992E-03  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.138672811291356012E-03 -0.139843383623983013E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.253811442749607014E-01  0.000000000000000000E+00
+ -0.111659649866176005E-03 -0.576021772522815052E-03 -0.666666666666666970E-02
+ -0.807399256528189952E-03  0.113123198788523993E-04 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.318951480124047976E-01 -0.334801256611660003E-03
+ -0.534674694551013989E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.135236541601312007E-03 -0.143279653314027993E-03 -0.166666666666667011E-01
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.320991543634817009E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423299320899991E-03 -0.161424193055011991E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.439089603173896989E-03
+ -0.431138112796988022E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.292550790154386987E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423735549587007E-03
+ -0.161423756826326004E-03 -0.161424083758224008E-03 -0.161423408617689004E-03
+ -0.161423408591035005E-03 -0.161424083784877004E-03 -0.666666666666666970E-02
+ -0.434939104555981024E-03 -0.435202600400863017E-03 -0.434989974551208984E-03
+ -0.435150588238853982E-03  0.000000000000000000E+00 -0.434898044011626994E-03
+ -0.435244580614824986E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.238543465439772992E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.420929736778404007E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.435014942481818994E-03 -0.435084911181932989E-03
+ -0.666666666666666970E-02 -0.138286681738250999E-03 -0.140229513177087998E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.255502900122961014E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.161423360190818987E-03 -0.161424132185092995E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435068884543961991E-03
+ -0.435069911017712987E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.252824734843049986E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423118220580002E-03 -0.161424374155332007E-03
+ -0.666666666666666970E-02 -0.139258110569847997E-03 -0.139258084345492004E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.255485447455677007E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.161423302756240003E-03
+ -0.161424189619672006E-03  0.000000000000000000E+00 -0.434872688199729998E-03
+ -0.435270525682074014E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.322052497810565996E-01  0.000000000000000000E+00
+ -0.435023979104199015E-03 -0.435075488195124996E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.138279010827208998E-03 -0.140237184088130000E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.258530259407728988E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.434958190724600005E-03 -0.435144090846366989E-03
+  0.000000000000000000E+00 -0.138909064921109010E-03 -0.139607129994229987E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.320994819774753012E-01 -0.161423308725907011E-03 -0.161424183650004998E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435080068181975977E-03
+ -0.435058951262028994E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.256614731138162001E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.429527300766965989E-03
+ -0.440793399874073991E-03 -0.166666666666667011E-01 -0.138412525549365991E-03
+ -0.140103669365973006E-03  0.000000000000000000E+00  0.319693244854589997E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.161422576318977012E-03 -0.161424916056934997E-03
+ -0.437741651794766986E-03 -0.432457178335270023E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.257682962742878004E-01  0.000000000000000000E+00
+ -0.372135275696198998E-03 -0.498733752801886051E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.166666666666667011E-01 -0.138908713201775990E-03
+ -0.139607481713563007E-03  0.000000000000000000E+00  0.000000000000000000E+00
+  0.333333333333332982E-01  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.418896828693844023E-01  0.000000000000000000E+00 -0.436533664324920991E-03
+ -0.433624016272548000E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.134626819467095006E-03 -0.143889375448244994E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.252833740154073013E-01
+  0.000000000000000000E+00  0.378159457400615982E-04 -0.725497368129053035E-03
+ -0.666666666666666970E-02 -0.597713125208466033E-03 -0.752181291804723058E-04
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.321707087590768007E-01
+  0.000000000000000000E+00 -0.981956570691642057E-04 -0.589485765319826987E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.242422521137329990E-03
+ -0.626419769060301032E-03 -0.166666666666667011E-01  0.333333333333332982E-01
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.252392538739078000E-01
+ -0.333702074783996986E-03 -0.536013663819338046E-03 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.138219319560300990E-03
+ -0.140296875355038007E-03 -0.166666666666667011E-01  0.320887149633356031E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435084983875554023E-03
+ -0.435014872763852026E-03 -0.435014828662647025E-03 -0.435085029863970982E-03
+ -0.435055763684865022E-03 -0.435042893498926986E-03 -0.166666666666667011E-01
+ -0.138286799284913995E-03 -0.140229395630425002E-03 -0.138900300230182012E-03
+ -0.139615894685157012E-03 -0.138288031619809013E-03 -0.140228163295530011E-03
+  0.000000000000000000E+00  0.419834394758250989E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423306124254990E-03 -0.161424186251656992E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435073250189719975E-03
+ -0.435065618982948010E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.000000000000000000E+00  0.255995701724177002E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.434985228537088976E-03
+ -0.435115896247923015E-03  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.138286994987186010E-03 -0.140229199928153990E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.253544285340416009E-01  0.000000000000000000E+00
+ -0.434987985656363021E-03 -0.435113023678724013E-03 -0.666666666666666970E-02
+ -0.140261879841764011E-03 -0.138254315073575989E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.319435795329029021E-01 -0.161410251962658001E-03
+ -0.161437240413254008E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.433888695127137008E-03 -0.436278193993204976E-03 -0.166666666666667011E-01
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.320989341466072983E-01
+  0.000000000000000000E+00 -0.161423293334831010E-03 -0.161424199041080999E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435076232841126974E-03
+ -0.435062702151373017E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.291210504532323985E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423172358814994E-03
+ -0.161424320017096988E-03 -0.161436491869935994E-03 -0.161411000505975988E-03
+ -0.161411014301727991E-03 -0.161436478074183991E-03 -0.666666666666666970E-02
+ -0.437347517893396990E-03 -0.432844697077642020E-03 -0.436241571863408976E-03
+ -0.433925164846824982E-03  0.000000000000000000E+00 -0.437803693779869989E-03
+ -0.432399238650052009E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.238543465439772992E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.420931105150842977E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.435014654010180979E-03 -0.435085211979509994E-03
+ -0.666666666666666970E-02 -0.138286962315859997E-03 -0.140229232599479000E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.255498409722413992E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423349311523990E-03 -0.161424143064387991E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.435079351261626014E-03 -0.435059652438656008E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.252819204017892991E-01
+  0.000000000000000000E+00 -0.161423122522292011E-03 -0.161424369853619998E-03
+ -0.666666666666666970E-02 -0.139257846415800999E-03 -0.139258348499537998E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.255491014693066017E-01
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.161423317190226995E-03
+ -0.161424175185684987E-03  0.000000000000000000E+00 -0.438043455875436027E-03
+ -0.432161023931338002E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.322801706716391024E-01  0.000000000000000000E+00 -0.434874122345711006E-03
+ -0.435231709745513020E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.138392757283109009E-03 -0.140123437632229988E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.258886981903099014E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.435135825668647020E-03 -0.434966107095303013E-03  0.000000000000000000E+00
+ -0.138935446888224990E-03 -0.139580748027114007E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.321000528704651969E-01 -0.161423323773416988E-03 -0.161424168602494994E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435030730220607980E-03
+ -0.435108924522503010E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.254844804391417996E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.161142124363106994E-03
+ -0.161705368012804988E-03 -0.166666666666667011E-01 -0.225281625809598993E-03
+ -0.643927738894806969E-03  0.000000000000000000E+00  0.321599842976618006E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.464257778793151032E-04 -0.734107200268307048E-03
+ -0.388025614613707001E-03 -0.483301313684787997E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.255020432374560985E-01  0.000000000000000000E+00
+ -0.337126126111301995E-03 -0.532578756276378025E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.166666666666667011E-01 -0.137258701832956002E-03
+ -0.141257493082382995E-03  0.000000000000000000E+00  0.000000000000000000E+00
+  0.333333333333332982E-01  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.421427039886230967E-01  0.000000000000000000E+00 -0.413943151144718978E-03
+ -0.457074858319932024E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.211434099434618995E-03 -0.884365353823558038E-03 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.133333333333332995E-01 -0.666666666666666970E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00  0.252964489671319986E-01
+  0.000000000000000000E+00 -0.213567925484438011E-03 -0.614115947829842971E-03
+ -0.666666666666666970E-02 -0.140017356100088014E-03 -0.138498838815251987E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.319739847535212979E-01
+  0.000000000000000000E+00 -0.161422918075368997E-03 -0.161424574300543012E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.427230082581125004E-03
+ -0.443079920816016025E-03 -0.166666666666667011E-01  0.333333333333332982E-01
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.253449783929956991E-01
+ -0.431322912949462978E-03 -0.438929377851577025E-03 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.138850109293811998E-03
+ -0.139666085621527000E-03 -0.166666666666667011E-01  0.240847367787417992E-01
+ -0.666666666666666970E-02 -0.166666666666667011E-01  0.419834073413790990E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.161423305232406004E-03
+ -0.161424187143506005E-03 -0.666666666666666970E-02 -0.435069384667189979E-03
+ -0.435069399625256014E-03 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.133333333333332995E-01 -0.666666666666666970E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00  0.255994979990794985E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.434980660475906998E-03 -0.435120659664607981E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.138286858107371994E-03 -0.140229336807968006E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.253691078096668007E-01
+  0.000000000000000000E+00 -0.434980641165363003E-03 -0.435120678249643021E-03
+ -0.666666666666666970E-02 -0.140229336715335995E-03 -0.138286858200003003E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.319837098824014010E-01
+ -0.161423313471232996E-03 -0.161424178904679013E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435069384765853027E-03 -0.435069399524313973E-03
+ -0.166666666666667011E-01  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.320996543007720028E-01  0.000000000000000000E+00 -0.161423313374075010E-03
+ -0.161424179001836999E-03 -0.666666666666666970E-02 -0.666666666666666970E-02
+ -0.435069381467433980E-03 -0.435069402897058982E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.292425276973896017E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.161423743711891987E-03 -0.161423748664019995E-03 -0.161424178997872993E-03
+ -0.161423313378038989E-03 -0.161423313378032999E-03 -0.161424178997880013E-03
+ -0.666666666666666970E-02 -0.435032871054405002E-03 -0.435106735866860005E-03
+ -0.435069379977790014E-03 -0.435069404420270985E-03 -0.435069383385271993E-03
+ -0.435069400936004996E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.333333333333332982E-01 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.238543465439772992E-01 -0.666666666666666970E-02
+ -0.166666666666667011E-01  0.420929804189209972E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.435014924423208018E-03 -0.435084930012396979E-03
+ -0.666666666666666970E-02 -0.138286695556728991E-03 -0.140229499358610006E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.255502626963007989E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.161423359539945995E-03 -0.161424132835965987E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.435069382819562025E-03 -0.435069401514463013E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.252823579206017009E-01
+  0.000000000000000000E+00 -0.161423118849177003E-03 -0.161424373526735006E-03
+ -0.666666666666666970E-02 -0.139258097695681013E-03 -0.139258097219658011E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.255489355337556005E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.161423313370652997E-03
+ -0.161424179005259012E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.435069374869375021E-03 -0.435069409643798000E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.322089281766608018E-01 -0.435014924689618014E-03 -0.435084929734623027E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.138286695005679009E-03
+ -0.140229499909660992E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.000000000000000000E+00  0.258549254171622014E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.435042893494948018E-03 -0.435055763690273998E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.138910376350831992E-03 -0.139605818564507005E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+  0.320997360358935974E-01 -0.161423315568021993E-03 -0.161424176807890992E-03
+ -0.666666666666666970E-02 -0.666666666666666970E-02 -0.435069380937729011E-03
+ -0.435069403438677978E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.333333333333332982E-01 -0.166666666666667011E-01 -0.166666666666667011E-01
+  0.255489358069375012E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.161423313378028011E-03 -0.161424178997883998E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.435069377571565001E-03 -0.435069406880717003E-03  0.000000000000000000E+00
+  0.320929801553103966E-01  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435014924404768016E-03 -0.435084930031662005E-03
+ -0.138286695016626008E-03 -0.140229499898713992E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.255502626954418992E-01  0.000000000000000000E+00
+ -0.161423359539924989E-03 -0.161424132835986993E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.166666666666667011E-01 -0.435069382752795012E-03
+ -0.435069401582733977E-03  0.000000000000000000E+00  0.333333333333332982E-01
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00  0.420929804179970002E-01  0.000000000000000000E+00
+ -0.435014924505136027E-03 -0.435084929926964016E-03 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.138286695554652988E-03 -0.140229499360686009E-03
+ -0.166666666666667011E-01 -0.166666666666667011E-01  0.133333333333332995E-01
+ -0.666666666666666970E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+  0.249646302547502988E-01  0.000000000000000000E+00 -0.161423118848985994E-03
+ -0.161424373526925988E-03 -0.666666666666666970E-02 -0.139258097697407009E-03
+ -0.139258097217931988E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.319837881362066007E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.161423315571430996E-03 -0.161424176804481013E-03 -0.666666666666666970E-02
+ -0.666666666666666970E-02 -0.435069384823893025E-03 -0.435069399464952007E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.333333333333332982E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+  0.253926391078274985E-01 -0.435042938363147018E-03 -0.435055716900843974E-03
+ -0.666666666666666970E-02 -0.138910376047635993E-03 -0.139605818867703004E-03
+ -0.166666666666667011E-01 -0.666666666666666970E-02 -0.162064605978737010E-03
+  0.755721724764775969E-02 -0.421583401256607965E-11 -0.164615581083446994E-03
+ -0.622132993821362964E-09  0.822717713000382066E-02 -0.666666666666666970E-02
+ -0.435031787535767993E-03  0.921658221859024927E-02 -0.671472362675333990E-03
+  0.000000000000000000E+00 -0.435066598341386974E-03 -0.622856281165874953E-03
+  0.926582425818990064E-02 -0.666666666666666970E-02 -0.435031698814498024E-03
+  0.921598015968093029E-02 -0.671350185242260036E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435066687046999979E-03
+  0.000000000000000000E+00 -0.622803871309166967E-03  0.926515487581706064E-02
+ -0.666666666666666970E-02 -0.162091523387943991E-03  0.755721896647259996E-02
+ -0.540935128117279988E-11  0.000000000000000000E+00 -0.164750534006632987E-03
+ -0.795366002646085000E-09  0.822717800583096931E-02  0.926452029956657992E-02
+ -0.622281045706827999E-03 -0.666666666666666970E-02 -0.668721208315983004E-03
+  0.920448404025630065E-02 -0.666666666666666970E-02  0.920291930713102065E-02
+ -0.668399798797767049E-03 -0.622140084769083981E-03  0.926273190908652976E-02
+ -0.666666666666666970E-02 -0.162061623289820004E-03  0.755721877319791985E-02
+ -0.410171929572062999E-11 -0.164601757794207001E-03 -0.605571814693378962E-09
+  0.822717790541416943E-02  0.921717278330758984E-02 -0.671606351121000053E-03
+ -0.666666666666666970E-02 -0.622958188273658011E-03  0.926596681290394082E-02
+ -0.666666666666666970E-02 -0.164300067974464987E-03  0.822730729925378983E-02
+ -0.253113151843113024E-09 -0.164103118034321003E-03 -0.251546774270035010E-09
+  0.822722715999001965E-02 -0.666666666666666970E-02 -0.435040487769458974E-03
+  0.921687604564088055E-02 -0.671540363061452956E-03  0.000000000000000000E+00
+ -0.435057899246222006E-03 -0.622872037532872002E-03  0.926585755069535971E-02
+ -0.666666666666666970E-02 -0.162127209687436009E-03  0.759121330666192966E-02
+ -0.381352367097723024E-11  0.000000000000000000E+00 -0.164582493974596989E-03
+ -0.545377294793035977E-09  0.822717728074968957E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.166319287245753013E-03  0.755721788134880966E-02
+ -0.125543672150161994E-10 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.166854288422184003E-03  0.000000000000000000E+00 -0.695251112369320969E-09
+  0.759121397932040025E-02 -0.666666666666666970E-02  0.921698284063145017E-02
+ -0.671564905209266956E-03 -0.622877569603531964E-03  0.926586949940542970E-02
+ -0.666666666666666970E-02 -0.435044292403726006E-03  0.921676136558105021E-02
+ -0.671305578260984965E-03 -0.435054094871408024E-03 -0.622625142915718996E-03
+  0.926561885584748987E-02 -0.164408450418608012E-03  0.822722728633254984E-02
+ -0.379408966293651983E-09 -0.162009019530446013E-03 -0.666666666666666970E-02
+ -0.254284697644517994E-11  0.755732846813902005E-02 -0.162062503890727003E-03
+ -0.666666666666666970E-02  0.755732483871392972E-02 -0.412630295389444991E-11
+ -0.164604272365608011E-03 -0.609067369792561007E-09  0.822717730203972007E-02
+ -0.162008946283214003E-03 -0.666666666666666970E-02  0.755728526608244033E-02
+ -0.254314505624484981E-11 -0.164408477352361010E-03 -0.379466146432811988E-09
+  0.822722728638950081E-02 -0.666666666666666970E-02 -0.162061922095729002E-03
+  0.755721774336119961E-02 -0.411297420986845978E-11 -0.164602266437615996E-03
+ -0.607177113853009997E-09  0.822717717450133050E-02  0.926588728390210017E-02
+ -0.622885394141277989E-03 -0.666666666666666970E-02 -0.671602423890051950E-03
+  0.921714903947036014E-02 -0.435048603885989987E-03 -0.666666666666666970E-02
+  0.921700338564301024E-02 -0.671444778088914037E-03 -0.435049783507797990E-03
+ -0.622737470874805995E-03  0.926573446347610068E-02 -0.666666666666666970E-02
+  0.897119442365579972E-02 -0.606814776285979950E-03 -0.607757345476577036E-03
+  0.923427126759777082E-02  0.921865858272334984E-02 -0.671454242057297965E-03
+ -0.666666666666666970E-02 -0.646212455869751953E-03  0.929466342228658926E-02
+ -0.192798491760539997E-03 -0.666666666666666970E-02  0.825745086852652972E-02
+ -0.932628952743546979E-06 -0.167240534534016013E-03 -0.225014212014958993E-07
+  0.822685297950877942E-02 -0.666666666666666970E-02 -0.435814163537570013E-03
+  0.921887387061852068E-02 -0.575307467536130958E-03 -0.434281293408232021E-03
+ -0.604501016235682955E-03  0.915698677472209961E-02 -0.413213566066566995E-03
+ -0.666666666666666970E-02  0.877723361706403069E-02 -0.541754211244169052E-03
+  0.000000000000000000E+00 -0.454683602340957979E-03 -0.590990555173378953E-03
+  0.920776881757097952E-02 -0.666666666666666970E-02 -0.163098401897450001E-03
+  0.763192105293324010E-02 -0.471912653584295006E-04  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.530165759527253025E-03
+  0.000000000000000000E+00 -0.266005332113615023E-03  0.861802401723344921E-02
+ -0.666666666666666970E-02 -0.162004211770428006E-03  0.757730699043844026E-02
+ -0.176948923798146998E-05 -0.529438965832964951E-03 -0.405267405870187984E-04
+  0.875391888040484047E-02  0.921979764348120068E-02 -0.602143221168287973E-03
+ -0.666666666666666970E-02 -0.555336710991042974E-03  0.881585220388428001E-02
+ -0.666666666666666970E-02 -0.163010868628506010E-03  0.795402855233807993E-02
+ -0.273140602330698999E-11 -0.164270868470497012E-03 -0.213417929117084997E-09
+  0.822716826170043040E-02 -0.666666666666666970E-02  0.927693305369737056E-02
+ -0.632604553292874993E-03 -0.671545581861832000E-03  0.921776054175638994E-02
+ -0.161978982193291990E-03 -0.666666666666666970E-02  0.755685732173456970E-02
+ -0.195386460922786995E-11 -0.164191969605661007E-03 -0.291699488577104992E-09
+  0.822713227252660009E-02 -0.666666666666666970E-02 -0.162064640844275011E-03
+  0.755721703602540030E-02 -0.421716748215653003E-11 -0.164615755229752009E-03
+ -0.622326809900769984E-09  0.822717689294136939E-02 -0.666666666666666970E-02
+ -0.435030941620315019E-03  0.921655770481275005E-02 -0.671470405959126958E-03
+  0.000000000000000000E+00 -0.435067444105677011E-03 -0.622859268229516961E-03
+  0.926582520767522955E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.435032188115217974E-03  0.921571954136004949E-02 -0.671094905936150000E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435066197830839016E-03
+  0.000000000000000000E+00 -0.622560079517476997E-03  0.926486481761856079E-02
+ -0.666666666666666970E-02 -0.162178699975160996E-03  0.755732514584614001E-02
+ -0.123678884662636997E-10 -0.165233346068301005E-03 -0.180156095420544003E-08
+  0.822721902239778938E-02  0.925859295317845929E-02 -0.620261598377330964E-03
+ -0.666666666666666970E-02 -0.659675714565077006E-03  0.916581375466305956E-02
+ -0.666666666666666970E-02  0.818856060767081048E-02 -0.736710799666523000E-11
+ -0.221521744718127990E-09  0.822698054922356946E-02 -0.666666666666666970E-02
+ -0.162061886876159011E-03  0.755721683506239972E-02 -0.411153398251473010E-11
+ -0.164602034973986998E-03 -0.606965384602901960E-09  0.822717599389061020E-02
+ -0.435048835805744005E-03  0.921717024009354943E-02 -0.671607913788692021E-03
+ -0.435049551589144980E-03 -0.666666666666666970E-02 -0.622896101942852008E-03
+  0.926589988923041952E-02 -0.666666666666666970E-02 -0.164092201396150991E-03
+  0.822715599945273920E-02 -0.245104005328256975E-09 -0.164092160478588006E-03
+ -0.245036767609787999E-09  0.822714877827463061E-02 -0.666666666666666970E-02
+ -0.435040451026806000E-03  0.921687587838842040E-02 -0.671541321966330983E-03
+ -0.435057935985662001E-03 -0.622873182117880046E-03  0.926585851990125060E-02
+ -0.666666666666666970E-02 -0.162127502447745005E-03  0.759121254289110009E-02
+ -0.382274259639387968E-11  0.000000000000000000E+00 -0.164583524873151011E-03
+ -0.546666909348633031E-09  0.822717637948596967E-02 -0.666666666666666970E-02
+ -0.166319284889783996E-03  0.755721738061322038E-02 -0.125530862526258001E-10
+  0.000000000000000000E+00 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.166854283136274001E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.695179911733615969E-09  0.759121329234466961E-02 -0.435040521890532996E-03
+ -0.666666666666666970E-02  0.921687815467167083E-02 -0.671542181838102967E-03
+ -0.435057865128120000E-03 -0.622873658606718991E-03  0.926585863114167026E-02
+ -0.666666666666666970E-02 -0.435041288920689020E-03  0.921669954253361012E-02
+ -0.671349556748095989E-03 -0.435057098161677983E-03 -0.622686448593169997E-03
+  0.926564688510513046E-02 -0.164325652029393012E-03  0.822714890084452928E-02
+ -0.369068713238530013E-09 -0.162006572449318004E-03 -0.666666666666666970E-02
+ -0.248169403822924009E-11  0.755729660596458994E-02 -0.162062728183659998E-03
+ -0.666666666666666970E-02  0.755729867674227035E-02 -0.413683822802849013E-11
+ -0.164605417097992009E-03 -0.610604701856152959E-09  0.822717650154041955E-02
+ -0.162006472670696007E-03 -0.666666666666666970E-02  0.755723750063842965E-02
+ -0.248209213801367997E-11 -0.164325689607119994E-03 -0.369144828512267008E-09
+  0.822714890092034017E-02 -0.666666666666666970E-02 -0.162062060860401010E-03
+  0.755721735591260990E-02 -0.411820697467498986E-11 -0.164602902125366996E-03
+ -0.607936291979959974E-09  0.822717685023799036E-02 -0.435051421420119008E-03
+  0.926588186737323932E-02 -0.622883850263154012E-03 -0.435046965950540009E-03
+ -0.666666666666666970E-02 -0.671591030490054954E-03  0.921709452945720931E-02
+ -0.435046965633857023E-03 -0.666666666666666970E-02  0.921703017052688013E-02
+ -0.671537494669817957E-03 -0.435051421736794026E-03 -0.622834247686287991E-03
+  0.926581357500960018E-02 -0.666666666666666970E-02  0.755832397448806970E-02
+ -0.411138722941705998E-11 -0.609957135264493987E-09  0.822725821738184981E-02
+  0.926024553873549920E-02 -0.621022213891748000E-03 -0.666666666666666970E-02
+ -0.663176099841847051E-03  0.918049665304914927E-02 -0.433952552704485026E-03
+ -0.666666666666666970E-02  0.917940522365195966E-02 -0.660850447221769969E-03
+ -0.436139822938484006E-03 -0.618641376898239033E-03  0.925939891361087994E-02
+ -0.666666666666666970E-02 -0.436004520088601995E-03  0.917698869910051063E-02
+ -0.534972138649924970E-03 -0.434089279817291974E-03 -0.560727539174508006E-03
+  0.911474373371438072E-02 -0.405342384042105025E-03 -0.666666666666666970E-02
+  0.868224382189996917E-02 -0.501678846320934967E-03  0.000000000000000000E+00
+ -0.460809060945838991E-03 -0.586502661037227048E-03  0.920011158722021036E-02
+ -0.666666666666666970E-02 -0.163211684871365992E-03  0.764753570908803006E-02
+ -0.433463713660776003E-04  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.529849870720847950E-03  0.000000000000000000E+00
+ -0.242696464819676003E-03  0.855013113167163961E-02 -0.666666666666666970E-02
+ -0.162715770416370988E-03  0.761282345029158040E-02 -0.385507239315511005E-04
+ -0.531931290617636970E-03 -0.289327756212754013E-03  0.896268459992746042E-02
+  0.822708378408086929E-02 -0.453015989481950975E-09 -0.666666666666666970E-02
+ -0.344818166136396003E-11  0.765885647535816984E-02 -0.666666666666666970E-02
+ -0.162981032279518011E-03  0.765876435591864024E-02 -0.850978247264382040E-09
+ -0.167181094192769006E-03 -0.102217899652555996E-06  0.822724871883005922E-02
+ -0.666666666666666970E-02  0.899524486155203924E-02 -0.614036210565926987E-03
+ -0.609553123871605014E-03  0.923729250745260015E-02 -0.162146754878560000E-03
+ -0.666666666666666970E-02  0.755888395212829002E-02 -0.877868214680283015E-11
+ -0.165048028543372008E-03 -0.128278104037022006E-08  0.822721063460546062E-02
+ -0.666666666666666970E-02 -0.435042623242022023E-03  0.921694787133059065E-02
+ -0.671556689557351017E-03 -0.435055763937149977E-03 -0.622875549102152042E-03
+  0.926586538966900006E-02 -0.666666666666666970E-02 -0.435049192464642004E-03
+  0.921716877978374927E-02 -0.671606871920753961E-03  0.000000000000000000E+00
+ -0.435049194930888991E-03 -0.622886313955485970E-03  0.926588938212210070E-02
+ -0.666666666666666970E-02 -0.162062558982797991E-03  0.755721781611762029E-02
+ -0.413711697147502992E-11  0.000000000000000000E+00 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.164605380154927001E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.610686417053202014E-09  0.822717710018053966E-02
+ -0.666666666666666970E-02 -0.435040485643464010E-03  0.921687647065221917E-02
+ -0.671540876850887998E-03 -0.435057901372032005E-03 -0.622872543909229947E-03
+  0.926585805993978932E-02  0.822717712936564018E-02 -0.606929319768994989E-09
+ -0.666666666666666970E-02 -0.411128314368316016E-11  0.755721765744643992E-02
+ -0.666666666666666970E-02  0.755721765744643992E-02 -0.411128317078820962E-11
+ -0.606929319768994989E-09  0.822717712936564018E-02 -0.666666666666666970E-02
+ -0.162062002168695000E-03  0.755721765497851042E-02 -0.411600421982957979E-11
+ -0.164602608742393997E-03 -0.607615952871632013E-09  0.822717713082168033E-02
+ -0.435050788864285991E-03  0.926588513557140975E-02 -0.622885206966810982E-03
+ -0.435047598518492997E-03 -0.666666666666666970E-02 -0.671596264858268996E-03
+  0.921711667933667982E-02 -0.666666666666666970E-02 -0.435047598512343999E-03
+  0.921711525903850040E-02 -0.671594794936314007E-03 -0.435050788870434014E-03
+ -0.622883798838939949E-03  0.926588365351022937E-02 -0.666666666666666970E-02
+ -0.162064603060552002E-03  0.755721765252307005E-02 -0.421568902488327991E-11
+ -0.164615562620811005E-03 -0.622111732773898991E-09  0.822717714449052936E-02
+ -0.666666666666666970E-02 -0.435031767332582026E-03  0.921658397831942068E-02
+ -0.671474831069555976E-03  0.000000000000000000E+00 -0.435066618541047982E-03
+ -0.622858773502414035E-03  0.926582672366347941E-02 -0.666666666666666970E-02
+ -0.435031759872446977E-03  0.921598447326531928E-02 -0.671353605549905000E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.435066625999740012E-03  0.000000000000000000E+00 -0.622806880419023005E-03
+  0.926515741228120006E-02 -0.435040467082004024E-03 -0.666666666666666970E-02
+  0.921687727787984939E-02 -0.671542217009017000E-03 -0.435057919931870025E-03
+ -0.622873933298062990E-03  0.926585948649214994E-02 -0.666666666666666970E-02
+ -0.162062558212559002E-03  0.755721765246759013E-02 -0.413710611008680003E-11
+ -0.164605378163800012E-03 -0.610684955957268983E-09  0.822717713313388978E-02
+ -0.435057895396635016E-03  0.926585809258491030E-02 -0.622872563956516044E-03
+ -0.435040491619382013E-03 -0.666666666666666970E-02 -0.671540933200040989E-03
+  0.921687668189401059E-02 -0.164358965526126012E-03 -0.666666666666666970E-02
+  0.822694399085341019E-02 -0.400851057692599981E-09 -0.164360419602091007E-03
+ -0.404402306244331983E-09  0.822717693055982982E-02 -0.435040480593904986E-03
+ -0.666666666666666970E-02  0.921687631186799035E-02 -0.671540849649739004E-03
+ -0.435057906421149976E-03 -0.622872546527165980E-03  0.926585805293679045E-02
+ -0.666666666666666970E-02 -0.162062559343999999E-03  0.755721765385561003E-02
+ -0.413714921327754003E-11 -0.164605383792317005E-03 -0.610691224557087966E-09
+  0.822717713418924003E-02 -0.435057910263422993E-03  0.926585952122008921E-02
+ -0.622873948580168974E-03 -0.435040476751295975E-03 -0.666666666666666970E-02
+ -0.671542290274572949E-03  0.921687760235279964E-02 -0.164358958283310994E-03
+ -0.666666666666666970E-02  0.822694359327636945E-02 -0.400841676414764018E-09
+ -0.164360412981448008E-03 -0.404398890304639026E-09  0.822717692927778937E-02
+ -0.666666666666666970E-02  0.755721765237515972E-02 -0.411128322752053976E-11
+ -0.606929330558722046E-09  0.822717712936565058E-02  0.822717692676106990E-02
+ -0.401916224886799020E-09 -0.666666666666666970E-02 -0.398503993611501997E-09
+  0.822695176179768020E-02 -0.435040782183989977E-03 -0.666666666666666970E-02
+  0.921688643492079955E-02 -0.671543136482274949E-03 -0.435057604856947981E-03
+ -0.622873024631925972E-03  0.926585913880650924E-02 -0.666666666666666970E-02
+ -0.162064603067295007E-03  0.755721765304565030E-02 -0.421568935096809032E-11
+ -0.164615562649187992E-03 -0.622111764150747044E-09  0.822717714448892995E-02
+ -0.435031767264953018E-03 -0.666666666666666970E-02  0.921658397617671973E-02
+ -0.671474830688970005E-03  0.000000000000000000E+00 -0.435066618608665010E-03
+ -0.622858773521975970E-03  0.926582672355137985E-02 -0.666666666666666970E-02
+ -0.435031759867127014E-03  0.921598447384829045E-02 -0.671353608399908978E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.435066626005059000E-03  0.000000000000000000E+00 -0.622806883450133957E-03
+  0.926515741287571928E-02 -0.666666666666666970E-02 -0.162062535484642990E-03
+  0.755721765241379028E-02 -0.413624149795890984E-11 -0.164605264966733003E-03
+ -0.610559194893120966E-09  0.822717713298093054E-02  0.822717692676106990E-02
+ -0.401916224778066025E-09 -0.666666666666666970E-02 -0.398503993617073007E-09
+  0.822695176179807051E-02 -0.666666666666666970E-02 -0.435040782184004994E-03
+  0.921688643492129048E-02 -0.671543136482392043E-03 -0.435057604856933019E-03
+ -0.622873024631955029E-03  0.926585913880656996E-02 -0.666666666666666970E-02
+  0.755721765237515018E-02 -0.411128322752061973E-11 -0.606929330558734970E-09
+  0.822717712936565058E-02 -0.162062535484639006E-03 -0.666666666666666970E-02
+  0.755721765241101993E-02 -0.413624149803294008E-11 -0.164605264966739996E-03
+ -0.610559194905281038E-09  0.822717713298093054E-02 -0.666666666666666970E-02
+ -0.435040978876939991E-03  0.921689266842156026E-02 -0.671544215875962010E-03
+ -0.435057408180386984E-03 -0.622872937769437958E-03  0.926585946462473060E-02
+ -0.666666666666666970E-02 -0.162127387652292998E-03  0.759121301093484014E-02
+ -0.381917589201341967E-11  0.000000000000000000E+00 -0.164583145431396004E-03
+ -0.546168929922635014E-09  0.822717717091626943E-02 -0.666666666666666970E-02
+ -0.166319270517048003E-03  0.755721782869738998E-02 -0.125468645423563995E-10
+  0.000000000000000000E+00 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.166854266113373008E-03  0.000000000000000000E+00 -0.694835327004200008E-09
+  0.759121369277350033E-02 -0.666666666666666970E-02 -0.162091523336998009E-03
+  0.755721898664039968E-02 -0.540934653228686967E-11 -0.164750533613684997E-03
+ -0.795365292654328000E-09  0.822717800588296938E-02  0.926452030366763969E-02
+ -0.622281047524610035E-03 -0.666666666666666970E-02 -0.668721216930222967E-03
+  0.920448407782870071E-02 -0.666666666666666970E-02  0.920291934460982022E-02
+ -0.668399807408691956E-03 -0.622140086592986989E-03  0.926273191317288080E-02
+ -0.666666666666666970E-02 -0.435042278634080015E-03  0.921682356015443051E-02
+ -0.671431390315878037E-03 -0.435056108521798986E-03 -0.622756696995610017E-03
+  0.926574695980873948E-02 -0.164604656267662011E-03  0.822717725537036940E-02
+ -0.609640569558637034E-09 -0.162062479874256008E-03 -0.666666666666666970E-02
+ -0.413006082275806018E-11  0.755726759085299012E-02 -0.666666666666666970E-02
+ -0.162006162226947987E-03  0.755727139217683957E-02 -0.248123277233144982E-11
+ -0.164398193103688998E-03 -0.370429925449825014E-09  0.822722962912562916E-02
+ -0.666666666666666970E-02 -0.162064599546255997E-03  0.755721765602699984E-02
+ -0.421555271070431014E-11 -0.164615545095071005E-03 -0.622091901724106049E-09
+  0.822717714627387020E-02 -0.666666666666666970E-02 -0.435031818763031985E-03
+  0.921658104872763971E-02 -0.671470214801736043E-03  0.000000000000000000E+00
+ -0.435066567119566977E-03 -0.622854034507517988E-03  0.926582206155602932E-02
+ -0.666666666666666970E-02 -0.435031752461292008E-03  0.921598570305218016E-02
+ -0.671353593598150960E-03  0.000000000000000000E+00 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.435066633409600010E-03  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.622806719992169021E-03
+  0.926515905834468002E-02 -0.162062393641141989E-03 -0.666666666666666970E-02
+  0.755722114123831036E-02 -0.413058273432428028E-11 -0.164604687174786012E-03
+ -0.609739837378995011E-09  0.822717725546923996E-02 -0.666666666666666970E-02
+ -0.162060731985959994E-03  0.755721979203459980E-02 -0.406827649196786028E-11
+ -0.164598252833557000E-03 -0.600736958190943012E-09  0.822717859850461038E-02
+ -0.164089877343614012E-03  0.822722950591434077E-02 -0.245723982778697014E-09
+ -0.164740360816590010E-03 -0.666666666666666970E-02 -0.253032241614393982E-09
+  0.822760867484073968E-02 -0.435063218164074010E-03 -0.666666666666666970E-02
+  0.926625279535532066E-02 -0.623222898205732980E-03 -0.435035168245856982E-03
+ -0.671596115310440014E-03  0.921718087666401065E-02 -0.162006191893170997E-03
+ -0.666666666666666970E-02  0.755728897650832006E-02 -0.248111426872892018E-11
+ -0.164398182178321996E-03 -0.370407207277356981E-09  0.822722962910301010E-02
+ -0.666666666666666970E-02 -0.162062408593516993E-03  0.755721778752637980E-02
+ -0.413141814596678035E-11 -0.164604734980543005E-03 -0.609861061511916968E-09
+  0.822717721123181027E-02 -0.164359758083610009E-03  0.822717705188663086E-02
+ -0.403734886259138006E-09 -0.164358443024191010E-03 -0.666666666666666970E-02
+ -0.400953600054809976E-09  0.822699461577980004E-02 -0.435042369262490986E-03
+ -0.666666666666666970E-02  0.921678605590701971E-02 -0.671388823849048961E-03
+ -0.435056017899627977E-03 -0.622715238127973967E-03  0.926570506586106915E-02
+ -0.666666666666666970E-02  0.897119442339463016E-02 -0.606814776206859971E-03
+ -0.607757345455318975E-03  0.923427126754335081E-02  0.921865858631707932E-02
+ -0.671454241685535990E-03 -0.666666666666666970E-02 -0.646212508082324977E-03
+  0.929466349621914031E-02 -0.192798550395760997E-03 -0.666666666666666970E-02
+  0.825745095197305025E-02 -0.932632147975482013E-06 -0.167240534661016013E-03
+ -0.225014932097670997E-07  0.822685297960725967E-02 -0.666666666666666970E-02
+ -0.435853333051191975E-03  0.921982672935077983E-02 -0.576029289720921011E-03
+ -0.434241817036211018E-03 -0.604491260294178975E-03  0.915701687929672964E-02
+ -0.413213785008022022E-03 -0.666666666666666970E-02  0.877723644056978974E-02
+ -0.541755128362865035E-03 -0.454683425554907011E-03 -0.590990666065798007E-03
+  0.920776909864313986E-02 -0.666666666666666970E-02 -0.163098438357505001E-03
+  0.763192860819322978E-02 -0.471912218750692030E-04  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.530165770547247946E-03
+ -0.266001580374522018E-03  0.861802239077459967E-02 -0.666666666666666970E-02
+ -0.162004212070115012E-03  0.757730725874164977E-02 -0.176948864124370000E-05
+ -0.529438965863178955E-03 -0.405266342026540003E-04  0.875391878258710045E-02
+  0.921979766533967057E-02 -0.602143233936809023E-03 -0.666666666666666970E-02
+ -0.555336769390324045E-03  0.881585236637343062E-02 -0.666666666666666970E-02
+ -0.163010868918803012E-03  0.795402864518520922E-02 -0.273140627481144992E-11
+ -0.164270868404786012E-03 -0.213417891562764991E-09  0.822716826171294990E-02
+ -0.666666666666666970E-02  0.927693305629184034E-02 -0.632604555455441994E-03
+ -0.671545581847466972E-03  0.921776054189130979E-02 -0.161978982117798992E-03
+ -0.666666666666666970E-02  0.755685740523829967E-02 -0.195386057816982017E-11
+ -0.164191968508161013E-03 -0.291698868024367013E-09  0.822713227244221967E-02
+ -0.666666666666666970E-02 -0.435040481081572998E-03  0.921687625100787999E-02
+ -0.671540765653113052E-03 -0.435057905933523977E-03 -0.622872462287793992E-03
+  0.926585797472436953E-02 -0.666666666666666970E-02 -0.162127507324803994E-03
+  0.759121282163092038E-02 -0.382298621466253031E-11 -0.164583632044819009E-03
+ -0.546703997412867024E-09  0.822717713283360047E-02 -0.666666666666666970E-02
+ -0.166319256859904011E-03  0.755721769623335978E-02 -0.125414820597383008E-10
+  0.000000000000000000E+00 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.166854250172961013E-03 -0.694537633219151044E-09  0.759121350560284982E-02
+ -0.666666666666666970E-02 -0.162099247952961000E-03  0.755721989572244977E-02
+ -0.581368808752400979E-11 -0.164789516502383002E-03 -0.853932691235368040E-09
+  0.822717846311052034E-02  0.926237298878906934E-02 -0.621979743485002014E-03
+ -0.666666666666666970E-02 -0.667643958464268013E-03  0.919963618647812002E-02
+ -0.666666666666666970E-02  0.920119115376864978E-02 -0.667964676147511953E-03
+ -0.622121227338204037E-03  0.926416149864764914E-02 -0.666666666666666970E-02
+ -0.435040909817440986E-03  0.921687655493176952E-02 -0.671527187062299024E-03
+ -0.435057477234175008E-03 -0.622856729377207043E-03  0.926584500443678079E-02
+ -0.164605266534459990E-03  0.822717720592583933E-02 -0.610450247156119009E-09
+ -0.162062530200907004E-03 -0.666666666666666970E-02 -0.413549431266714970E-11
+  0.755722446015559979E-02 -0.666666666666666970E-02 -0.162059365323539989E-03
+  0.755722567800456990E-02 -0.401936380178025961E-11 -0.164612123412715989E-03
+ -0.594277800995966983E-09  0.822719391360227960E-02 -0.666666666666666970E-02
+ -0.162064601680787996E-03  0.755721765533357015E-02 -0.421563549128826973E-11
+  0.000000000000000000E+00 -0.164615555733905002E-03 -0.622103957379088996E-09
+  0.822717714640426936E-02 -0.666666666666666970E-02 -0.435031792976918995E-03
+  0.921658339535963086E-02 -0.671473400089229013E-03  0.000000000000000000E+00
+ -0.435066592901186004E-03 -0.622857239963683028E-03  0.926582531946548021E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435031751019518008E-03
+  0.921598490375117978E-02 -0.671353543894939037E-03 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.435066634851121988E-03  0.000000000000000000E+00
+ -0.622806775506295953E-03  0.926515820468288079E-02 -0.162062519267639990E-03
+ -0.666666666666666970E-02  0.755721857194846958E-02 -0.413556062572659997E-11
+ -0.164605270453123994E-03 -0.610462848248556989E-09  0.822717720593840046E-02
+ -0.666666666666666970E-02 -0.162062243310156996E-03  0.755721835822449010E-02
+ -0.412518743024167011E-11 -0.164604700854507999E-03 -0.608980634819725018E-09
+  0.822717779878246944E-02 -0.164369437450745994E-03  0.822719371500816969E-02
+ -0.393313770364309007E-09 -0.164344328728707013E-03 -0.666666666666666970E-02
+ -0.391103618326530975E-09  0.822706248393468946E-02 -0.435044237783256996E-03
+ -0.666666666666666970E-02  0.921700053762436987E-02 -0.671567082152385998E-03
+ -0.435054149489181002E-03 -0.622876302108100002E-03  0.926586962058313993E-02
+ -0.162059379398541000E-03 -0.666666666666666970E-02  0.755723329728289969E-02
+ -0.401928043513828996E-11 -0.164612118388109011E-03 -0.594261933530200978E-09
+  0.822719391358646933E-02 -0.666666666666666970E-02 -0.162062495727046009E-03
+  0.755721772845492981E-02 -0.413473254943709991E-11 -0.164605146051557987E-03
+ -0.610342338260452991E-09  0.822717719432444067E-02 -0.164360315553491003E-03
+  0.822717700213156967E-02 -0.404232600081692006E-09 -0.164358887835959998E-03
+ -0.666666666666666970E-02 -0.400997831179622018E-09  0.822696488974702066E-02
+ -0.435041264130012022E-03 -0.666666666666666970E-02  0.921687154445821957E-02
+ -0.671511051258173990E-03 -0.435057122950393985E-03 -0.622839088072463958E-03
+  0.926582874613315031E-02 -0.666666666666666970E-02  0.927696108771089954E-02
+ -0.632627917074633003E-03 -0.671545420285009982E-03  0.921776198086651961E-02
+  0.922026584017879985E-02 -0.602408843182564049E-03 -0.666666666666666970E-02
+ -0.556545745722415003E-03  0.881922185436606956E-02 -0.163017068240428012E-03
+ -0.666666666666666970E-02  0.795594799175567031E-02 -0.273445423603751999E-11
+ -0.164269850461446013E-03 -0.212776812668013003E-09  0.822716866999695026E-02
+ -0.666666666666666970E-02 -0.435073828677985024E-03  0.926998697001005928E-02
+ -0.626775693155938018E-03 -0.435024555678502995E-03 -0.674658125777386950E-03
+  0.922087193677081918E-02 -0.317388619046037024E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02  0.807555110066160951E-02 -0.161231565052261995E-03
+ -0.506658534832258992E-03 -0.311775039000433988E-03  0.895372823636290081E-02
+ -0.666666666666666970E-02 -0.163033789441224992E-03  0.757712252412545001E-02
+ -0.230784432560672012E-04  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.528851957602917964E-03 -0.189448307782970002E-03  0.832713693082491081E-02
+ -0.666666666666666970E-02 -0.161988418356935989E-03  0.755983351837515982E-02
+ -0.201738757027985980E-11 -0.164212922403823006E-03 -0.300296756336277975E-09
+  0.822710828992056012E-02  0.921859794240553060E-02 -0.671461758481382994E-03
+ -0.666666666666666970E-02 -0.645287749704886960E-03  0.929336169606828980E-02
+ -0.666666666666666970E-02 -0.192231218611419992E-03  0.825469369225220045E-02
+ -0.211366001320515989E-06 -0.166616012215181007E-03 -0.134857476054616004E-07
+  0.822777129012250072E-02 -0.666666666666666970E-02  0.902089049785161946E-02
+ -0.621512175397749989E-03 -0.611398199404824985E-03  0.924071304541716938E-02
+ -0.162013975253551008E-03 -0.666666666666666970E-02  0.758489435794543979E-02
+ -0.176583046930321999E-05 -0.529428370426132016E-03 -0.381593418223961990E-04
+  0.875176249380833077E-02 -0.666666666666666970E-02 -0.435040249760507993E-03
+  0.921686856770915011E-02 -0.671539094886240001E-03 -0.435058137234120004E-03
+ -0.622872175240221974E-03  0.926585722647491929E-02 -0.666666666666666970E-02
+ -0.435049192418392975E-03  0.921716878281565039E-02 -0.671606876372485046E-03
+ -0.435049194977138020E-03 -0.622886318490127947E-03  0.926588938673533064E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.162062559002504992E-03
+  0.755721765529914023E-02 -0.413713063586915014E-11 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.164605381839701988E-03 -0.610688496265456990E-09
+  0.822717709903768996E-02 -0.666666666666666970E-02 -0.435040481539034974E-03
+  0.921687633332718068E-02 -0.671540846145312980E-03 -0.435057905476102008E-03
+ -0.622872537796382957E-03  0.926585804562602040E-02  0.822717712936564018E-02
+ -0.606929320140830976E-09 -0.666666666666666970E-02 -0.411128314549578026E-11
+  0.755721765728442022E-02 -0.666666666666666970E-02  0.755721765728442022E-02
+ -0.411128314549578026E-11 -0.606929320113726025E-09  0.822717712936564018E-02
+ -0.666666666666666970E-02 -0.162062535444855996E-03  0.755721765299004027E-02
+ -0.413623988345109963E-11 -0.164605264762881988E-03 -0.610558967689581019E-09
+  0.822717713298422998E-02 -0.164360411104607013E-03  0.822717692934940049E-02
+ -0.404392079821727997E-09 -0.164359007307861987E-03 -0.666666666666666970E-02
+ -0.400959182056933016E-09  0.822695177830758996E-02 -0.666666666666666970E-02
+ -0.435040782704709025E-03  0.921688645230588945E-02 -0.671543140332089947E-03
+ -0.435057604336271976E-03 -0.622873025362695042E-03  0.926585914058404916E-02
+ -0.666666666666666970E-02 -0.435040480198235974E-03  0.921687629980992920E-02
+ -0.671540848018962978E-03  0.000000000000000000E+00 -0.435057906816784022E-03
+ -0.622872547225220984E-03  0.926585805277986042E-02 -0.666666666666666970E-02
+ -0.435049192309392005E-03  0.921716878048172046E-02 -0.671606876991212012E-03
+ -0.435049195086140020E-03 -0.622886319711294994E-03  0.926588938772130062E-02
+ -0.666666666666666970E-02 -0.162062559108483992E-03  0.755721765240897035E-02
+ -0.413713455792400007E-11  0.000000000000000000E+00 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.164605382375200988E-03 -0.610689046115110989E-09
+  0.822717709593154961E-02 -0.162062559033916009E-03 -0.666666666666666970E-02
+  0.755721765255952006E-02 -0.413713742434565006E-11 -0.164605382254266990E-03
+ -0.610689502144610012E-09  0.822717713322326967E-02 -0.666666666666666970E-02
+ -0.162062001821161990E-03  0.755721765238182019E-02 -0.411599099559748981E-11
+ -0.164602607031998988E-03 -0.607614053906671031E-09  0.822717713004775947E-02
+ -0.435050784143273998E-03  0.926588367156185033E-02 -0.622883807548416991E-03
+ -0.435047603239580017E-03 -0.666666666666666970E-02 -0.671594832030014948E-03
+  0.921711541879277965E-02 -0.435047603239578987E-03 -0.666666666666666970E-02
+  0.921711541854514961E-02 -0.671594831758394982E-03 -0.435050784143274974E-03
+ -0.622883807286240968E-03  0.926588367130479033E-02 -0.435040480204222993E-03
+ -0.666666666666666970E-02  0.921687629993508950E-02 -0.671540847984813971E-03
+ -0.435057906810797979E-03 -0.622872547158363982E-03  0.926585805272238036E-02
+ -0.666666666666666970E-02 -0.435040480199119979E-03  0.921687628837800955E-02
+ -0.671540836043143011E-03 -0.435057906815900994E-03 -0.622872535731676044E-03
+  0.926585804082549064E-02 -0.164605382251114998E-03  0.822717713322325926E-02
+ -0.610689492032270982E-09 -0.162062559042711003E-03 -0.666666666666666970E-02
+ -0.413713737102918003E-11  0.755721765729555021E-02 -0.162062559115393992E-03
+ -0.666666666666666970E-02  0.755721765728750976E-02 -0.413714006582654996E-11
+ -0.164605382612441987E-03 -0.610689891716875000E-09  0.822717713311152052E-02
+ -0.666666666666666970E-02  0.755721765237515972E-02 -0.411128320041545960E-11
+ -0.606929330558716048E-09  0.822717712936565058E-02  0.822717692676106990E-02
+ -0.401916224778854018E-09 -0.666666666666666970E-02 -0.398503993603185006E-09
+  0.822695176179710080E-02 -0.435040782183968022E-03 -0.666666666666666970E-02
+  0.921688643492006056E-02 -0.671543136482093996E-03 -0.435057604856969990E-03
+ -0.622873024631876967E-03  0.926585913880641036E-02 -0.666666666666666970E-02
+ -0.162064603069931001E-03  0.755721765252316026E-02 -0.421568944254191005E-11
+ -0.164615562667500004E-03 -0.622111785613204007E-09  0.822717714448713971E-02
+ -0.435031767249101982E-03 -0.666666666666666970E-02  0.921658397565039943E-02
+ -0.671474830574549054E-03  0.000000000000000000E+00 -0.435066618624513010E-03
+ -0.622858773502365029E-03  0.926582672349992968E-02 -0.666666666666666970E-02
+ -0.435031759837616007E-03  0.921598447232674020E-02 -0.671353608078640034E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.435066626034565020E-03  0.000000000000000000E+00 -0.622806883368089994E-03
+  0.926515741217365935E-02 -0.666666666666666970E-02 -0.162062535484635997E-03
+  0.755721765240871968E-02 -0.413624136259383020E-11 -0.164605264966749998E-03
+ -0.610559194918970050E-09  0.822717713298093921E-02  0.822717692676106990E-02
+ -0.401916224724624988E-09 -0.666666666666666970E-02 -0.398503993603535989E-09
+  0.822695176179711989E-02 -0.666666666666666970E-02 -0.435040782183968998E-03
+  0.921688643492009005E-02 -0.671543136482101043E-03 -0.435057604856969014E-03
+ -0.622873024631878051E-03  0.926585913880642077E-02 -0.666666666666666970E-02
+  0.755721765237515972E-02 -0.411128320041545960E-11 -0.606929330558716048E-09
+  0.822717712936565058E-02 -0.162062535484635997E-03 -0.666666666666666970E-02
+  0.755721765240869973E-02 -0.413624144391172994E-11 -0.164605264966749998E-03
+ -0.610559194919383020E-09  0.822717713298093921E-02 -0.666666666666666970E-02
+ -0.435033689603804975E-03  0.921378384069738987E-02 -0.668589820131555040E-03
+ -0.435064696586663006E-03 -0.620092661077489989E-03  0.926283488586463963E-02
+ -0.666666666666666970E-02 -0.162162554053133000E-03  0.761060627573421043E-02
+ -0.358738548769622997E-11 -0.164556192142978992E-03 -0.502148070051966022E-09
+  0.822717405269838935E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.163092637467958997E-03  0.755846712970139993E-02  0.813151629364127964E-19
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.163377020705577990E-03
+ -0.197563428050820995E-37  0.761068726943637999E-02 -0.666666666666666970E-02
+ -0.415446954693720976E-03  0.879694625554016015E-02 -0.528338181427255972E-03
+ -0.452858816762892013E-03 -0.571958066866902980E-03  0.919346567847542998E-02
+  0.822717710541285077E-02 -0.582878441461362035E-09 -0.666666666666666970E-02
+ -0.398790879900396976E-11  0.756907608051521959E-02 -0.666666666666666970E-02
+  0.756911094403798970E-02 -0.399682676269469016E-11 -0.586791272997618962E-09
+  0.822723940155752065E-02 -0.666666666666666970E-02 -0.435082932504862979E-03
+  0.926680195170886956E-02 -0.623730586218653013E-03 -0.435015449187736995E-03
+ -0.671618387476765970E-03  0.921723408637379997E-02 -0.164605497784444998E-03
+  0.822717715380158997E-02 -0.610801293217642997E-09 -0.162062563985841001E-03
+ -0.666666666666666970E-02 -0.413787738555715978E-11  0.755721095967978960E-02
+ -0.666666666666666970E-02 -0.162067963173172003E-03  0.755721116190670970E-02
+ -0.434923998820925017E-11 -0.164635999950897013E-03 -0.641653733272013955E-09
+  0.822717987065326069E-02 -0.666666666666666970E-02 -0.162064602794511991E-03
+  0.755721765349726994E-02 -0.421567878592677010E-11  0.000000000000000000E+00
+ -0.164615561292109994E-03 -0.622110240351067001E-09  0.822717714537648040E-02
+ -0.666666666666666970E-02 -0.435031773851333981E-03  0.921658402777584054E-02
+ -0.671474653364973048E-03  0.000000000000000000E+00 -0.435066612023434006E-03
+ -0.622858559544564984E-03  0.926582657482898954E-02 -0.666666666666666970E-02
+ -0.435031756377946010E-03  0.921598454249483029E-02 -0.671353578907302054E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.435066629493629977E-03  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.622806849901958001E-03  0.926515761104865047E-02
+ -0.162062576485603999E-03 -0.666666666666666970E-02  0.755721769097717974E-02
+ -0.413780156463713995E-11 -0.164605493304321993E-03 -0.610786879472733959E-09
+  0.822717715378724034E-02 -0.666666666666666970E-02 -0.162062600946185996E-03
+  0.755721775647281990E-02 -0.413875277158846027E-11 -0.164605835862048996E-03
+ -0.610932451422013981E-09  0.822717731789045920E-02 -0.164390239995456007E-03
+  0.822717965629857044E-02 -0.424752159962523975E-09 -0.164384780603813011E-03
+ -0.666666666666666970E-02 -0.420718048853076981E-09  0.822693037781527942E-02
+ -0.435039866684628978E-03 -0.666666666666666970E-02  0.921685570981070970E-02
+ -0.671536106102198996E-03 -0.435058520274921989E-03 -0.622871476563668946E-03
+  0.926585585423385025E-02 -0.162067974342572992E-03 -0.666666666666666970E-02
+  0.755721712453114990E-02 -0.434916937807187029E-11 -0.164635995970298013E-03
+ -0.641640314610242035E-09  0.822717987063988944E-02 -0.666666666666666970E-02
+ -0.162062530993410003E-03  0.755721767391838006E-02 -0.413607275117757979E-11
+ -0.164605272998311010E-03 -0.610535653474081026E-09  0.822717715805352037E-02
+ -0.164360523631990990E-03  0.822717694987937058E-02 -0.404455003494009021E-09
+ -0.164359098316623007E-03 -0.666666666666666970E-02 -0.401040888608453987E-09
+  0.822695309202677028E-02 -0.435040829062186994E-03 -0.666666666666666970E-02
+  0.921689067116016926E-02 -0.671545174080842035E-03 -0.435057557982693015E-03
+ -0.622874572247597981E-03  0.926586218088495950E-02 -0.666666666666666970E-02
+  0.755944258225673023E-02 -0.409900186798689021E-11 -0.607550069944511006E-09
+  0.822725761878751946E-02  0.822717683107231958E-02 -0.307429333767333980E-09
+ -0.666666666666666970E-02 -0.346590492659272013E-11  0.788944072959344919E-02
+ -0.405969476545236976E-03 -0.666666666666666970E-02  0.868864828240666076E-02
+ -0.503334320458691947E-03 -0.460337026543280981E-03 -0.585683663662835011E-03
+  0.919951403822613992E-02 -0.666666666666666970E-02 -0.434971941726958005E-03
+  0.922397396166772972E-02 -0.680859150263666982E-03 -0.435126415795954993E-03
+ -0.632212694429038955E-03  0.927545373694507042E-02 -0.163258163012422003E-03
+ -0.666666666666666970E-02  0.771427725269166996E-02 -0.477633450518533990E-04
+ -0.532374012608134001E-03 -0.264816289609958973E-03  0.893589131240556979E-02
+ -0.666666666666666970E-02 -0.163022374738535991E-03  0.756010628736956004E-02
+ -0.985505903625681948E-05  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.528465266513926044E-03 -0.112888812204124995E-03
+  0.818271031310010069E-02  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.162166578301091007E-03  0.755933919789102040E-02 -0.104753730850025000E-10
+ -0.165099058739546009E-03 -0.152246080895752994E-08  0.822717141726839012E-02
+  0.822711899108566000E-02 -0.221645141852008993E-09 -0.666666666666666970E-02
+ -0.803318716673082971E-11  0.819386805739684074E-02 -0.666666666666666970E-02
+ -0.433725918784746999E-03  0.917430859668704086E-02 -0.661288978702212048E-03
+ -0.436363722694927986E-03 -0.620322134736719043E-03  0.926073953711441071E-02
+ -0.666666666666666970E-02  0.755741928796660033E-02 -0.410903028695346997E-11
+ -0.606500685244875008E-09  0.822717712893872993E-02 -0.162225051571071008E-03
+ -0.666666666666666970E-02  0.762801677615301994E-02 -0.369132365187440996E-05
+ -0.529778141305529949E-03 -0.883337217346735949E-04  0.880271406258762920E-02
+ -0.666666666666666970E-02 -0.415930173645343991E-03  0.874527436890885045E-02
+ -0.345835963857271027E-03 -0.452435882453698006E-03 -0.346466114750530014E-03
+  0.898436350079646028E-02 -0.666666666666666970E-02 -0.410333749350875977E-03
+  0.874024429470159937E-02 -0.526324780402007012E-03 -0.456977206374832004E-03
+ -0.594710214126934963E-03  0.921019309223168083E-02 -0.666666666666666970E-02
+ -0.349138390596867001E-03  0.808745258250311017E-02 -0.174389744426073004E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.493489741678000975E-03 -0.295632081822985011E-03  0.859314073796063962E-02
+ -0.666666666666666970E-02 -0.162165456144072008E-03  0.755894484280849981E-02
+ -0.104756642479310002E-10 -0.165141511878991989E-03 -0.152686212523292010E-08
+  0.822721248571258931E-02  0.925936813944002075E-02 -0.620622043504368980E-03
+ -0.666666666666666970E-02 -0.661330297689515954E-03  0.917272030438071040E-02
+ -0.666666666666666970E-02  0.917272030438071040E-02 -0.661330297689515954E-03
+ -0.620622043504368980E-03  0.925936813944002075E-02 -0.666666666666666970E-02
+ -0.162062370509243004E-03  0.755772622350076983E-02 -0.408856756901033998E-11
+ -0.164599404158889001E-03 -0.603371041254018047E-09  0.822717649469902924E-02
+ -0.164363072970947994E-03  0.822717536870335060E-02 -0.406417442325602006E-09
+ -0.164592544349163996E-03 -0.666666666666666970E-02 -0.411459868924217992E-09
+  0.822733545578738061E-02 -0.666666666666666970E-02 -0.435055225953795985E-03
+  0.926605052142811039E-02 -0.623035876737523049E-03 -0.435043161259378005E-03
+ -0.671607266693034040E-03  0.921717853066967956E-02 -0.666666666666666970E-02
+ -0.435040481662372024E-03  0.921687634507689993E-02 -0.671540853956346050E-03
+ -0.435057905352776017E-03 -0.622872544450323051E-03  0.926585805410279004E-02
+ -0.666666666666666970E-02 -0.435049192197708014E-03  0.921716876912447025E-02
+ -0.671606868732131960E-03  0.000000000000000000E+00 -0.435049195197822981E-03
+ -0.622886312493042956E-03  0.926588937934206061E-02 -0.666666666666666970E-02
+ -0.162062574467009994E-03  0.755721765477525027E-02 -0.413771902340413000E-11
+  0.000000000000000000E+00 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.164605458858507004E-03  0.000000000000000000E+00 -0.610774061235578027E-09
+  0.822717709793101964E-02 -0.162063129702431002E-03 -0.666666666666666970E-02
+  0.755721488286742026E-02 -0.415889664939935991E-11 -0.164608241410742993E-03
+ -0.613854083280822974E-09  0.822717557368093065E-02 -0.666666666666666970E-02
+ -0.162062006116684993E-03  0.755721765538701958E-02 -0.411615339006451024E-11
+ -0.164602628392850010E-03 -0.607637651237361961E-09  0.822717712956365019E-02
+ -0.435050838802116988E-03  0.926588470716894000E-02 -0.622885079077197044E-03
+ -0.435047548579851017E-03 -0.666666666666666970E-02 -0.671595813711089052E-03
+  0.921711477741266050E-02 -0.435047548601973024E-03 -0.666666666666666970E-02
+  0.921711980798223078E-02 -0.671600947327989010E-03 -0.435050838779994981E-03
+ -0.622889987523941044E-03  0.926588996280537994E-02 -0.435040278978450999E-03
+ -0.666666666666666970E-02  0.921687073978887973E-02 -0.671540718908447947E-03
+ -0.435058108018793015E-03 -0.622873586970733978E-03  0.926585856104042080E-02
+ -0.666666666666666970E-02 -0.435040282159379990E-03  0.921688118588532938E-02
+ -0.671551578142270001E-03 -0.435058104838147977E-03 -0.622883989557962013E-03
+  0.926586935714399963E-02 -0.164608242952759995E-03  0.822717557368590063E-02
+ -0.613859067359255033E-09 -0.162063125398168002E-03 -0.666666666666666970E-02
+ -0.415892283159718001E-11  0.755721256705853967E-02 -0.162062565682641010E-03
+ -0.666666666666666970E-02  0.755721268194402968E-02 -0.413779732258981993E-11
+ -0.164605464633444011E-03 -0.610787973952621006E-09  0.822717713109540061E-02
+ -0.666666666666666970E-02  0.895231589357397932E-02 -0.601006644779317027E-03
+ -0.606288347917921001E-03  0.923179444181716075E-02  0.921580902907083964E-02
+ -0.671118539167890951E-03 -0.666666666666666970E-02 -0.628092210114751027E-03
+  0.927168309130125927E-02 -0.174566225284372993E-03 -0.666666666666666970E-02
+  0.823503971979330958E-02 -0.510771887614783954E-06 -0.167453071907815989E-03
+ -0.287656487025054002E-06  0.822679160087593994E-02 -0.666666666666666970E-02
+ -0.162064999238019001E-03  0.755721705065465991E-02 -0.423106066889392027E-11
+ -0.164617538202307007E-03 -0.624346471180137971E-09  0.822717661766741992E-02
+ -0.435027297636500021E-03 -0.666666666666666970E-02  0.921651161171017051E-02
+ -0.671532969891762031E-03  0.000000000000000000E+00 -0.435071087356345989E-03
+ -0.622941366988572030E-03  0.926589066343610004E-02 -0.666666666666666970E-02
+ -0.435031386261265976E-03  0.921594142114651009E-02 -0.671460160302111021E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.435066999544915002E-03  0.000000000000000000E+00 -0.622929217552493962E-03
+  0.926511198239901926E-02 -0.666666666666666970E-02 -0.162052976426746003E-03
+  0.755715649452497022E-02 -0.378464898874789026E-11 -0.164557847455290991E-03
+ -0.559380698263932047E-09  0.822711615636790086E-02  0.921569960061689036E-02
+ -0.671199549999528054E-03 -0.666666666666666970E-02 -0.624992481787264051E-03
+  0.926820615335867057E-02 -0.666666666666666970E-02 -0.170162248239514002E-03
+  0.823042259709222922E-02 -0.875293298501571007E-07 -0.166825903639511989E-03
+ -0.692727623674652971E-07  0.822728043870269918E-02 -0.666666666666666970E-02
+  0.900314235021752035E-02 -0.616359389498942999E-03 -0.610140243508493002E-03
+  0.923851413586198981E-02 -0.162042399166736006E-03 -0.666666666666666970E-02
+  0.755717693384080035E-02 -0.343592701208587996E-11 -0.164505035953650008E-03
+ -0.508562496748712960E-09  0.822712657547092045E-02 -0.666666666666666970E-02
+ -0.162064604852803001E-03  0.755721682250679039E-02 -0.421581532487358014E-11
+ -0.164615579255980999E-03 -0.622130465298954038E-09  0.822717706389368073E-02
+ -0.666666666666666970E-02 -0.435031888747001994E-03  0.921662497939955937E-02
+ -0.671514002290806984E-03  0.000000000000000000E+00 -0.435066497147766000E-03
+ -0.622895617170409984E-03  0.926586574231100073E-02 -0.666666666666666970E-02
+ -0.435031628074990023E-03  0.921600144320258025E-02 -0.671374321788007020E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.435066757774081993E-03  0.000000000000000000E+00 -0.622827422523968997E-03
+  0.926517931888306938E-02 -0.666666666666666970E-02 -0.162168008296603987E-03
+  0.755720830543264981E-02 -0.111631538525960998E-10 -0.165130579439813012E-03
+ -0.162383789215979997E-08  0.822716472453222970E-02  0.822711943021576034E-02
+ -0.222017847039514990E-09 -0.666666666666666970E-02 -0.777760925671697071E-11
+  0.819196919955125953E-02 -0.666666666666666970E-02  0.917235354741821086E-02
+ -0.661205826288643948E-03 -0.620672923073943951E-03  0.926095692434543948E-02
+ -0.666666666666666970E-02 -0.162079935652635995E-03  0.755720153609774035E-02
+ -0.485738294134179040E-11 -0.164692040195761001E-03 -0.715311464965949002E-09
+  0.822716985670068034E-02 -0.166685978125188992E-03  0.822673928904943050E-02
+ -0.488041135101126002E-07 -0.166618503841219014E-03 -0.666666666666666970E-02
+ -0.388518789845462021E-07  0.822064815916740028E-02 -0.666666666666666970E-02
+ -0.434820802690177985E-03  0.921114685377608967E-02 -0.671312500692984007E-03
+ -0.435277323473812975E-03 -0.623870937176576037E-03  0.926677062396401990E-02
+ -0.666666666666666970E-02 -0.435040416088372018E-03  0.921688078845643056E-02
+ -0.671547258120006041E-03 -0.435057970921028016E-03 -0.622879055329303975E-03
+  0.926586475383493063E-02 -0.666666666666666970E-02 -0.435048641715767004E-03
+  0.921713990189115991E-02 -0.671590634431848974E-03  0.000000000000000000E+00
+ -0.435049745678237000E-03 -0.622873878413765037E-03  0.926587662260439961E-02
+ -0.666666666666666970E-02 -0.162065760605982007E-03  0.755721473567340981E-02
+ -0.425851668132086027E-11  0.000000000000000000E+00 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.164621246760972002E-03  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.628327256337155983E-09
+  0.822716093119939956E-02 -0.162542321360015994E-03 -0.666666666666666970E-02
+  0.755650259353967020E-02 -0.562981055991819022E-09 -0.167001540638854007E-03
+ -0.773289585904606956E-07  0.822676761660280989E-02 -0.666666666666666970E-02
+ -0.162061328166368990E-03  0.755722295014580030E-02 -0.409106119013757001E-11
+ -0.164607216616360989E-03 -0.604245617915833026E-09  0.822718306150760012E-02
+ -0.435044697597018011E-03  0.921886702873469036E-02 -0.673104082698351962E-03
+ -0.435053689697235003E-03 -0.666666666666666970E-02 -0.624556261940502019E-03
+  0.926763947624762059E-02 -0.435053689776730007E-03 -0.666666666666666970E-02
+  0.926765139731208082E-02 -0.624573254433248028E-03 -0.435044697517509996E-03
+ -0.673283560783819986E-03  0.921873769877605033E-02 -0.435024190531306987E-03
+ -0.666666666666666970E-02  0.921794328133558952E-02 -0.672853471668093963E-03
+ -0.435074193732175026E-03 -0.624365177556240967E-03  0.926744434610436013E-02
+ -0.666666666666666970E-02 -0.435000466897472022E-03  0.921805866981871037E-02
+ -0.673973626915920021E-03 -0.435097908603312988E-03 -0.625447361535428985E-03
+  0.926834572012657024E-02 -0.167001904506701996E-03  0.822676773783739997E-02
+ -0.774505155958715954E-07 -0.162540904682313003E-03 -0.666666666666666970E-02
+ -0.563656255578501956E-09  0.755607546000543022E-02 -0.162061504581954011E-03
+ -0.666666666666666970E-02  0.755611035298445026E-02 -0.418942890780224988E-11
+ -0.164621967289905991E-03 -0.619216517642183046E-09  0.822718518765969999E-02
+ -0.666666666666666970E-02  0.920452787793652957E-02 -0.668731214720833010E-03
+ -0.622283241864520008E-03  0.926452697307078056E-02  0.822724025612167967E-02
+ -0.586193188243385978E-09 -0.666666666666666970E-02 -0.399370806297400039E-11
+  0.756944692405215041E-02 -0.162114939468458987E-03 -0.666666666666666970E-02
+  0.756941303972038983E-02 -0.521957340938646036E-11 -0.164756390961434999E-03
+ -0.760317731950115969E-09  0.822719049518791032E-02 -0.666666666666666970E-02
+ -0.162348256591775996E-03  0.757138537095577036E-02 -0.154279395379191983E-04
+ -0.530720557527397988E-03 -0.221706573191253990E-03  0.891607556388816086E-02
+ -0.163035589715460993E-03 -0.666666666666666970E-02  0.791125444579478081E-02
+ -0.568881226622524034E-05  0.000000000000000000E+00 -0.530385305835545987E-03
+ -0.644753151430826047E-04  0.878526723619102966E-02 -0.666666666666666970E-02
+ -0.399599558422803001E-03  0.843930969858660944E-02 -0.399745648940732980E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.464884917361518986E-03  0.000000000000000000E+00 -0.476898399198092998E-03
+  0.874546494059007078E-02 -0.666666666666666970E-02 -0.162063070389469993E-03
+  0.755351606433557997E-02 -0.447630187954819978E-11 -0.164644944937428005E-03
+ -0.661962104712968044E-09  0.822717544598733017E-02  0.926551780113546962E-02
+ -0.622722840997360001E-03 -0.666666666666666970E-02 -0.670824268645125988E-03
+  0.921370889062123923E-02 -0.666666666666666970E-02 -0.434945178559459974E-03
+  0.921154593743686925E-02 -0.668579923758682053E-03 -0.435153154613506974E-03
+ -0.620572207517385986E-03  0.926326406971371939E-02 -0.666666666666666970E-02
+  0.755816621993720978E-02 -0.411326355534032026E-11 -0.610348502456840988E-09
+  0.822725905587259020E-02 -0.402874184979866987E-03 -0.666666666666666970E-02
+  0.865106066171478054E-02 -0.472609065736268020E-03 -0.462635081887802994E-03
+ -0.557606013013843028E-03  0.917391259078658046E-02 -0.666666666666666970E-02
+ -0.162816160491244000E-03  0.755629729934226019E-02 -0.145267309791605992E-07
+ -0.168362009612906010E-03 -0.190258327748242995E-05  0.822792910350438923E-02
+ -0.666666666666666970E-02 -0.434980287177094997E-03  0.921591582004819936E-02
+ -0.672220539599956038E-03  0.000000000000000000E+00 -0.435118076428732989E-03
+ -0.623871208911141008E-03  0.926674480491154057E-02 -0.666666666666666970E-02
+ -0.391663109206836005E-03  0.857592364748300043E-02 -0.270377370496600986E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.470345934717641013E-03  0.000000000000000000E+00 -0.290023568937984979E-03
+  0.893922648721857081E-02 -0.666666666666666970E-02 -0.317994853290403008E-03
+  0.807402136257530031E-02 -0.163000017527141987E-03 -0.506466269749422031E-03
+ -0.315285081405852004E-03  0.895741238458845970E-02  0.822721524635505914E-02
+ -0.564092130899272960E-09 -0.666666666666666970E-02 -0.389036333446228008E-11
+  0.758050134032894028E-02 -0.666666666666666970E-02  0.758050134032894028E-02
+ -0.389036333446228008E-11 -0.564092130736643047E-09  0.822721524635505914E-02
+ -0.666666666666666970E-02 -0.162062220957438010E-03  0.755675930940021001E-02
+ -0.416185255233357985E-11 -0.164608241679856012E-03 -0.614515052693167971E-09
+  0.822717669744772034E-02 -0.435064264203551974E-03  0.926583708706145946E-02
+ -0.622864427576335982E-03 -0.435034122053574979E-03 -0.666666666666666970E-02
+ -0.671494672063580045E-03  0.921666478777227044E-02 -0.666666666666666970E-02
+ -0.435034122194007025E-03  0.921666834178432037E-02 -0.671498215917801949E-03
+ -0.435064264063141017E-03 -0.622867805184998033E-03  0.926584080673045936E-02
+ -0.666666666666666970E-02 -0.162064603057306009E-03  0.755721765249671960E-02
+ -0.421568895545965981E-11 -0.164615562604910991E-03 -0.622111714812958048E-09
+  0.822717714449142969E-02 -0.666666666666666970E-02 -0.435031767250997005E-03
+  0.921658395801952003E-02 -0.671474812281343020E-03  0.000000000000000000E+00
+ -0.435066618622617987E-03 -0.622858755966087044E-03  0.926582670501814067E-02
+ -0.666666666666666970E-02 -0.435031759994269993E-03  0.921598448039129964E-02
+ -0.671353611750878003E-03  0.000000000000000000E+00 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.435066625877937976E-03  0.000000000000000000E+00
+ -0.622806885942570987E-03  0.926515741571650939E-02 -0.435040457277539980E-03
+ -0.666666666666666970E-02  0.921687744869944051E-02 -0.671542694130693044E-03
+ -0.435057929735475977E-03 -0.622874450455320979E-03  0.926585997023653997E-02
+ -0.666666666666666970E-02 -0.162062556267884004E-03  0.755721766013062960E-02
+ -0.413703217741733986E-11 -0.164605370008190008E-03 -0.610674234287949039E-09
+  0.822717713670985916E-02 -0.435057871804128975E-03  0.926586389852728995E-02
+ -0.622877919476854986E-03 -0.435040515213944027E-03 -0.666666666666666970E-02
+ -0.671546676413121014E-03  0.921688294468172957E-02 -0.164358973103396013E-03
+ -0.666666666666666970E-02  0.822694462574627937E-02 -0.400863023298765998E-09
+ -0.164360422527890990E-03 -0.404404639852252990E-09  0.822717693008181983E-02
+ -0.435040521994138000E-03 -0.666666666666666970E-02  0.921688317220892978E-02
+ -0.671546727788281000E-03 -0.435057865024523994E-03 -0.622877930192895020E-03
+  0.926586392288021934E-02 -0.666666666666666970E-02 -0.162062559830136006E-03
+  0.755721765142735018E-02 -0.413716779390587020E-11 -0.164605386233707005E-03
+ -0.610693931490910976E-09  0.822717713361387042E-02 -0.435057916250486973E-03
+  0.926586001868622000E-02 -0.622874471770934967E-03 -0.435040470763708000E-03
+ -0.666666666666666970E-02 -0.671542796319704962E-03  0.921687790126752074E-02
+ -0.164358941319385992E-03 -0.666666666666666970E-02  0.822694343451954926E-02
+ -0.400827305234235980E-09 -0.164360402068052007E-03 -0.404386932680776984E-09
+  0.822717693276291996E-02 -0.666666666666666970E-02  0.755853528834142963E-02
+ -0.410907237024926035E-11 -0.609517161481542976E-09  0.822725835180459011E-02
+  0.822699835769964000E-02 -0.229769146362661995E-09 -0.666666666666666970E-02
+ -0.552503881837150022E-11  0.815277410258053933E-02 -0.431800076739517009E-03
+ -0.666666666666666970E-02  0.911683250971320920E-02 -0.645107775821999975E-03
+ -0.438245882118603019E-03 -0.614761191522381957E-03  0.925173111512482028E-02
+ -0.666666666666666970E-02 -0.162066807230739002E-03  0.755721537399490963E-02
+ -0.430232877656178001E-11 -0.164626562646872987E-03 -0.634707547989976979E-09
+  0.822717705663869939E-02 -0.435011184600756027E-03 -0.666666666666666970E-02
+  0.921541718649479015E-02 -0.670832821913695961E-03  0.000000000000000000E+00
+ -0.435087195554581009E-03 -0.622361407627594967E-03  0.926525297165626920E-02
+ -0.666666666666666970E-02 -0.435024023596373988E-03  0.921463311000128051E-02
+ -0.670757573686644971E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.435074360623453004E-03  0.000000000000000000E+00
+ -0.622355873651302019E-03  0.926393658659479917E-02 -0.666666666666666970E-02
+ -0.162274431703276004E-03  0.755738308037023996E-02 -0.318241339520051988E-10
+ -0.165730950021604001E-03 -0.457847648567586989E-08  0.822724281267087944E-02
+  0.822699408685815918E-02 -0.228425571690641000E-09 -0.666666666666666970E-02
+ -0.569261988097333975E-11  0.815841787771899064E-02 -0.666666666666666970E-02
+ -0.432079853778089007E-03  0.912452759982078018E-02 -0.647134253482418054E-03
+ -0.437974705900714988E-03 -0.615294574958598047E-03  0.925273449425248994E-02
+ -0.666666666666666970E-02  0.755849436233489019E-02 -0.410955655495479020E-11
+ -0.609618023537665964E-09  0.822725856237870035E-02 -0.162292777295443990E-03
+ -0.666666666666666970E-02  0.755742123691062993E-02 -0.382880604592767993E-10
+ -0.165829805992421012E-03 -0.549637808741221009E-08  0.822725133924320957E-02
+ -0.666666666666666970E-02 -0.162519834506453002E-03  0.758488071339646994E-02
+ -0.277296855958790984E-04 -0.531411092685664019E-03 -0.271305840096227018E-03
+  0.895289465200115042E-02 -0.666666666666666970E-02 -0.435144739153746991E-03
+  0.927049386352079077E-02 -0.627195195351904020E-03  0.000000000000000000E+00
+ -0.434953602544600011E-03 -0.673646030614466983E-03  0.921927161130219920E-02
+ -0.666666666666666970E-02 -0.162365832482225009E-03  0.770469472296904986E-02
+ -0.341374365570676015E-05  0.000000000000000000E+00 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.530093834704794035E-03  0.000000000000000000E+00
+ -0.670683663828369948E-04  0.878401698585971023E-02 -0.666666666666666970E-02
+ -0.406469353648676005E-03  0.868845378029185940E-02 -0.487445867647356014E-03
+ -0.459955485443886000E-03 -0.561455537392793009E-03  0.917876821487357934E-02
+  0.822724019224464927E-02 -0.586236000786575972E-09 -0.666666666666666970E-02
+ -0.399393159268467017E-11  0.756942275692288041E-02 -0.666666666666666970E-02
+  0.756938783566538965E-02 -0.398490203650706992E-11 -0.582276441424578042E-09
+  0.822717710481313952E-02 -0.666666666666666970E-02 -0.162064188321971013E-03
+  0.755638508819920975E-02 -0.426955668367828011E-11 -0.164621776697154991E-03
+ -0.630372643058242999E-09  0.822717559459556005E-02 -0.435097665976917993E-03
+  0.926770146147764921E-02 -0.624846565957563015E-03 -0.435000709641257001E-03
+ -0.666666666666666970E-02 -0.673347123713348948E-03  0.921745137370448032E-02
+ -0.666666666666666970E-02 -0.164221437092461006E-03  0.822590307325241077E-02
+ -0.301540502789892983E-09 -0.164276463598209013E-03 -0.317267646894687990E-09
+  0.822720896113244074E-02 -0.666666666666666970E-02 -0.435040484785290994E-03
+  0.921687621618137026E-02 -0.671540621958410973E-03 -0.435057902230129995E-03
+ -0.622872302682165040E-03  0.926585782249825934E-02 -0.666666666666666970E-02
+ -0.162127341684403998E-03  0.759121275229191969E-02 -0.381774204087762988E-11
+  0.000000000000000000E+00 -0.164583013632055004E-03 -0.545969305874372025E-09
+  0.822717722380156974E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.166319316569640000E-03  0.755721776450155036E-02 -0.125637202975271002E-10
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.166854313971363013E-03
+  0.000000000000000000E+00 -0.695766372129872957E-09  0.759121342784058965E-02
+ -0.435007341079701976E-03 -0.666666666666666970E-02  0.921767423502365020E-02
+ -0.673397510122010013E-03 -0.435091037539719026E-03 -0.624857046911454035E-03
+  0.926772529096031930E-02 -0.666666666666666970E-02 -0.435042780427774989E-03
+  0.921682237061306926E-02 -0.671414330503463958E-03 -0.435055606761622013E-03
+ -0.622737221386204017E-03  0.926573013279845085E-02 -0.164512437178604990E-03
+  0.822720912015355993E-02 -0.478141451497207985E-09 -0.162035132712287004E-03
+ -0.666666666666666970E-02 -0.321987108752259996E-11  0.755727832217654039E-02
+ -0.162065357686432991E-03 -0.666666666666666970E-02  0.755727582940853970E-02
+ -0.423979675954428002E-11 -0.164618725863945991E-03 -0.625584793967118990E-09
+  0.822717479197860972E-02 -0.162035093479730998E-03 -0.666666666666666970E-02
+  0.755725620859361031E-02 -0.322006451958004010E-11 -0.164512451389381995E-03
+ -0.478178423500335003E-09  0.822720912019037943E-02 -0.666666666666666970E-02
+ -0.162063731980656992E-03  0.755721533834744982E-02 -0.418199541718481972E-11
+ -0.164611237647245009E-03 -0.617212851732219973E-09  0.822717573520458968E-02
+ -0.435072205240009014E-03  0.926779361315624960E-02 -0.624886867601484965E-03
+ -0.435026179501724010E-03 -0.666666666666666970E-02 -0.673540811814344947E-03
+  0.921830845677890955E-02 -0.435026047901024976E-03 -0.666666666666666970E-02
+  0.921631021320170989E-02 -0.671342130465355002E-03 -0.435072336809685012E-03
+ -0.622763518226994005E-03  0.926572086598713082E-02 -0.666666666666666970E-02
+  0.755741246850622026E-02 -0.410910644194978984E-11 -0.606515170668247041E-09
+  0.822717712895315936E-02  0.822711877443555936E-02 -0.222069534738639993E-09
+ -0.666666666666666970E-02 -0.774169806703398991E-11  0.819168186272310056E-02
+ -0.433633789741900023E-03 -0.666666666666666970E-02  0.917151071300197074E-02
+ -0.660635211420097053E-03 -0.436454594164531018E-03 -0.620192759941486029E-03
+  0.926043761605101039E-02 -0.666666666666666970E-02 -0.435400642028349984E-03
+  0.924569487161715914E-02 -0.602675681802749023E-03 -0.434697126892320004E-03
+ -0.641341627123005026E-03  0.919049697819046031E-02 -0.417766802147069983E-03
+ -0.666666666666666970E-02  0.884119022200588987E-02 -0.565196699593361013E-03
+  0.000000000000000000E+00 -0.450926780658111002E-03 -0.597865634330250052E-03
+  0.921780464690032976E-02 -0.666666666666666970E-02 -0.162927680290126010E-03
+  0.761083369281832008E-02 -0.416350633852703977E-04  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.530230645664085047E-03
+  0.000000000000000000E+00 -0.269827347276391011E-03  0.866009136742803935E-02
+ -0.666666666666666970E-02 -0.162144345304456006E-03  0.760092331739828028E-02
+ -0.320930758282627016E-05 -0.529813749405098054E-03 -0.838256824281321020E-04
+  0.879776792041750931E-02  0.822717682684740922E-02 -0.303187285772573992E-09
+ -0.666666666666666970E-02 -0.349301427021032004E-11  0.790044609360178972E-02
+ -0.666666666666666970E-02 -0.407938480218889010E-03  0.871084858846607989E-02
+ -0.512295469801531947E-03 -0.458837927071481999E-03 -0.588085060255285046E-03
+  0.920289209649226975E-02 -0.666666666666666970E-02  0.755933306797651038E-02
+ -0.410021674909237023E-11 -0.607789044576712979E-09  0.822725774217296021E-02
+ -0.162167689058960993E-03 -0.666666666666666970E-02  0.755666327531443957E-02
+ -0.112741283803980995E-10 -0.165135297900302988E-03 -0.164049635527082004E-08
+  0.822716577111987958E-02 -0.666666666666666970E-02 -0.415926985437479015E-03
+  0.874524524800943982E-02 -0.345826974275356008E-03 -0.452438516787343978E-03
+ -0.346461802416189973E-03  0.898436011623418053E-02 -0.666666666666666970E-02
+ -0.410332540159175974E-03  0.874022409518679030E-02 -0.526301953402972958E-03
+  0.000000000000000000E+00 -0.456978154604879979E-03 -0.594686220254121993E-03
+  0.921017399119665979E-02 -0.666666666666666970E-02 -0.349138358539420027E-03
+  0.808745335846144074E-02 -0.174389979817568010E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.493489760022879976E-03
+  0.000000000000000000E+00 -0.295632770892417004E-03  0.859314451744194047E-02
+ -0.666666666666666970E-02 -0.162165456200227012E-03  0.755894485257972964E-02
+ -0.104756674728628994E-10 -0.165141511934384990E-03 -0.152686256191205990E-08
+  0.822721248562563977E-02  0.925936813792594023E-02 -0.620622042811125991E-03
+ -0.666666666666666970E-02 -0.661330294502400018E-03  0.917272029101801056E-02
+ -0.666666666666666970E-02  0.917272029101801056E-02 -0.661330294502400018E-03
+ -0.620622042811125991E-03  0.925936813792594023E-02 -0.666666666666666970E-02
+ -0.162059451012688989E-03  0.755773564896643039E-02 -0.398147961980706018E-11
+ -0.164599233536076003E-03 -0.588248966932235027E-09  0.822718777261857021E-02
+ -0.435010710626149014E-03  0.921784913653543939E-02 -0.671757497579554964E-03
+ -0.435087669354308976E-03 -0.666666666666666970E-02 -0.624205044887739973E-03
+  0.926731518984568077E-02 -0.666666666666666970E-02 -0.435087680804149997E-03
+  0.926689880496680933E-02 -0.623818561754816976E-03 -0.435010699170602000E-03
+ -0.671588999254513967E-03  0.921721221696823051E-02 -0.666666666666666970E-02
+ -0.162064554135907989E-03  0.755721778431040023E-02 -0.421379742453608961E-11
+ -0.164615465489623012E-03 -0.621841597575447999E-09  0.822717725634031921E-02
+ -0.666666666666666970E-02 -0.435032270858094002E-03  0.921659999696208994E-02
+ -0.671477624444696982E-03  0.000000000000000000E+00 -0.435066115102212007E-03
+ -0.622858575415140996E-03  0.926582763556333018E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.435031835174660986E-03  0.921598350730943976E-02
+ -0.671332923411385946E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.435066550710658023E-03  0.000000000000000000E+00 -0.622784369281448010E-03
+  0.926515556236343074E-02 -0.435081584415368018E-03 -0.666666666666666970E-02
+  0.926715509609260003E-02 -0.624058431079940984E-03 -0.435016797724696992E-03
+ -0.671759302322477020E-03  0.921784300407063953E-02 -0.666666666666666970E-02
+ -0.162062416434052993E-03  0.755721809516292017E-02 -0.413174197731702039E-11
+ -0.164605215334601997E-03 -0.609922581957833024E-09  0.822717754034535051E-02
+ -0.435056252509799003E-03  0.926585458068619923E-02 -0.622864166386205044E-03
+ -0.435042134635998002E-03 -0.666666666666666970E-02 -0.671542142824929958E-03
+  0.921692265814959971E-02 -0.164341185001132012E-03 -0.666666666666666970E-02
+  0.822700548023182036E-02 -0.388254230807434987E-09 -0.164366486837535988E-03
+ -0.391288606812102024E-09  0.822719359654751001E-02 -0.435043745553257018E-03
+ -0.666666666666666970E-02  0.921697677898357035E-02 -0.671554360684184981E-03
+ -0.435054641693514987E-03 -0.622866717131421977E-03  0.926586042794482964E-02
+ -0.666666666666666970E-02 -0.162059863500320009E-03  0.755722860146506008E-02
+ -0.403707215866304025E-11 -0.164610171594745992E-03 -0.596715205847239044E-09
+  0.822719066601751078E-02 -0.435029181351461001E-03  0.921782667546000058E-02
+ -0.671761723962046014E-03 -0.435069204038055987E-03 -0.666666666666666970E-02
+ -0.623757152955008967E-03  0.926682695577680030E-02 -0.165176244427055993E-03
+ -0.666666666666666970E-02  0.822774267939753966E-02 -0.421637432761214996E-09
+ -0.164358900259774010E-03 -0.403642926186002997E-09  0.822717766237675979E-02
+ -0.666666666666666970E-02  0.895231589389712014E-02 -0.601006644879792970E-03
+ -0.606288347943506004E-03  0.923179444186011944E-02  0.921580902981316945E-02
+ -0.671118538607027963E-03 -0.666666666666666970E-02 -0.628092231229955013E-03
+  0.927168311539479026E-02 -0.174566249107139995E-03 -0.666666666666666970E-02
+  0.823503974641245050E-02 -0.510772714070998049E-06 -0.167453071908487002E-03
+ -0.287656386989065009E-06  0.822679160061134084E-02 -0.666666666666666970E-02
+ -0.162064983519178004E-03  0.755722213005034001E-02 -0.423003049664173980E-11
+ -0.164617409769547002E-03 -0.624194129888606984E-09  0.822717667703955040E-02
+ -0.435027350434486978E-03 -0.666666666666666970E-02  0.921651333906866981E-02
+ -0.671533317952461976E-03  0.000000000000000000E+00 -0.435071034569932023E-03
+ -0.622941399756069946E-03  0.926589080870137034E-02 -0.666666666666666970E-02
+ -0.435031629619492017E-03  0.921595126475713983E-02 -0.671462224228685047E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.666666666666666970E-02
+ -0.435066756229832998E-03  0.000000000000000000E+00 -0.622929617617789958E-03
+  0.926511475629528992E-02 -0.666666666666666970E-02 -0.162052976330494004E-03
+  0.755715655404278971E-02 -0.378464136913797002E-11 -0.164557846396870007E-03
+ -0.559379553656162034E-09  0.822711615671048967E-02  0.921569960267614070E-02
+ -0.671199548462878990E-03 -0.666666666666666970E-02 -0.624992541148623954E-03
+  0.926820621881743051E-02 -0.666666666666666970E-02 -0.170162317751058011E-03
+  0.823042266464544918E-02 -0.875297413337334996E-07 -0.166825903611445003E-03
+ -0.692727437231052063E-07  0.822728043872133011E-02 -0.666666666666666970E-02
+  0.900314235197411002E-02 -0.616359390013905962E-03 -0.610140243636164963E-03
+  0.923851413610003031E-02 -0.162042399128667987E-03 -0.666666666666666970E-02
+  0.755717695498461014E-02 -0.343592447834058983E-11 -0.164505035561990999E-03
+ -0.508562114575286970E-09  0.822712657561797990E-02 -0.666666666666666970E-02
+ -0.162064626628578005E-03  0.755722018474085013E-02 -0.421639430439463977E-11
+ -0.164615674182357004E-03 -0.622213619794602018E-09  0.822717716142255980E-02
+ -0.666666666666666970E-02 -0.435031360634292993E-03  0.921661780771376032E-02
+ -0.671522125948738010E-03  0.000000000000000000E+00 -0.435067025167480974E-03
+ -0.622906555198169992E-03  0.926587475408661987E-02 -0.666666666666666970E-02
+ -0.435031920197820993E-03  0.921597580576540010E-02 -0.671351984885895973E-03
+  0.000000000000000000E+00 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.435066465702252001E-03  0.000000000000000000E+00 -0.622805921827540012E-03
+  0.926514231021871061E-02 -0.666666666666666970E-02 -0.162062001713370989E-03
+  0.755721764997798959E-02 -0.411598727890175012E-11 -0.164602606520337013E-03
+ -0.607613494988126020E-09  0.822717713025281072E-02  0.926588367987988062E-02
+ -0.622883813196616022E-03 -0.666666666666666970E-02 -0.671594845205971042E-03
+  0.921711546375830054E-02 -0.666666666666666970E-02  0.921711546375830054E-02
+ -0.671594845205971042E-03 -0.622883813196616022E-03  0.926588367987988062E-02
+ -0.666666666666666970E-02 -0.162080643229434011E-03  0.755719622314534028E-02
+ -0.488882760495459993E-11 -0.164695578667720994E-03 -0.719871721533777035E-09
+  0.822716411325388926E-02 -0.166686373905860005E-03  0.822673885725208956E-02
+ -0.487733280019655016E-07 -0.166616817170293989E-03 -0.666666666666666970E-02
+ -0.385414488502229971E-07  0.822045888465093924E-02 -0.666666666666666970E-02
+ -0.434813234051737991E-03  0.920937424335109035E-02 -0.669894558723525000E-03
+ -0.435284874465938004E-03 -0.622580915854730019E-03  0.926510963371094017E-02
+ -0.666666666666666970E-02 -0.435040408248740026E-03  0.921688082460559062E-02
+ -0.671547530054927020E-03 -0.435057978759970999E-03 -0.622879363153527985E-03
+  0.926586503625058920E-02 -0.666666666666666970E-02 -0.435048354812829995E-03
+  0.921714230585230984E-02 -0.671602199049514052E-03  0.000000000000000000E+00
+ -0.435050032579175011E-03 -0.622886590993160022E-03  0.926588809980974024E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.162065782294302011E-03
+  0.755721459868404031E-02 -0.425813299026563987E-11 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.164621303266333996E-03  0.000000000000000000E+00
+ -0.628265660483508001E-09  0.822715314376032923E-02 -0.162542375670815992E-03
+ -0.666666666666666970E-02  0.755649757850690006E-02 -0.563432271384059970E-09
+ -0.167001893968643994E-03 -0.773903559881119996E-07  0.822676727708255076E-02
+ -0.666666666666666970E-02 -0.162062879664528998E-03  0.755721695458686990E-02
+ -0.414938028726372967E-11 -0.164606984614506006E-03 -0.612470146931394021E-09
+  0.822717691631253030E-02 -0.435061921280182988E-03  0.926591210512174059E-02
+ -0.622933977614011981E-03 -0.435036465303411009E-03 -0.666666666666666970E-02
+ -0.671580853268202043E-03  0.921680734331616937E-02 -0.435036519090423993E-03
+ -0.666666666666666970E-02  0.921831642815977917E-02 -0.673202190294888052E-03
+ -0.435061867500153025E-03 -0.624494985629193999E-03  0.926748120028762952E-02
+ -0.435006921432357974E-03 -0.666666666666666970E-02  0.921581685576767021E-02
+ -0.671357037929466998E-03 -0.435091457008391985E-03 -0.622887259624104036E-03
+  0.926580572699929952E-02 -0.666666666666666970E-02 -0.435000210414426987E-03
+  0.921805741531904997E-02 -0.673980656048901966E-03 -0.435098164960824000E-03
+ -0.625455737994957050E-03  0.926835240170618968E-02 -0.167002256657801997E-03
+  0.822676739801933045E-02 -0.775116142162273040E-07 -0.162540963549468999E-03
+ -0.666666666666666970E-02 -0.564105828717064959E-09  0.755607184337712002E-02
+ -0.162063151808471010E-03 -0.666666666666666970E-02  0.755610609873923027E-02
+ -0.425254382354864998E-11 -0.164619383929850006E-03 -0.628041057868921975E-09
+  0.822717599205483041E-02 -0.666666666666666970E-02  0.921687630329096064E-02
+ -0.671540851769114968E-03 -0.622872550846322991E-03  0.926585805654763991E-02
+  0.926585809314046070E-02 -0.622872566927465052E-03 -0.666666666666666970E-02
+ -0.671540928853904041E-03  0.921687664468394917E-02 -0.164358960620165987E-03
+ -0.666666666666666970E-02  0.822694395609338026E-02 -0.400847008848852017E-09
+ -0.164360413027222997E-03 -0.404398713049403977E-09  0.822717692924596067E-02
+ -0.666666666666666970E-02 -0.162064640983030012E-03  0.755718903530966032E-02
+ -0.421955570049609974E-11 -0.164616035944705001E-03 -0.622688578366344997E-09
+  0.822717711474709075E-02 -0.435031324368629995E-03 -0.666666666666666970E-02
+  0.921656922225446047E-02 -0.671471582389523958E-03  0.000000000000000000E+00
+ -0.435067061426646024E-03 -0.622858176290813008E-03  0.926582523607523932E-02
+ -0.666666666666666970E-02 -0.435031048387666001E-03  0.921594565739619967E-02
+ -0.671345384731177964E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.435067337357646024E-03  0.000000000000000000E+00
+ -0.622804669702495955E-03  0.926513810679983028E-02 -0.666666666666666970E-02
+ -0.162062558200820996E-03  0.755721755342539988E-02 -0.413711391502012990E-11
+ -0.164605379088546999E-03 -0.610686125296504951E-09  0.822717713332750053E-02
+  0.926585808945160939E-02 -0.622872565296268030E-03 -0.666666666666666970E-02
+ -0.671540921029493000E-03  0.921687661003382959E-02 -0.666666666666666970E-02
+ -0.164358960452099987E-03  0.822694392878840081E-02 -0.400846613572565987E-09
+ -0.164360413029369988E-03 -0.404398732358932989E-09  0.822717692925123943E-02
+ -0.666666666666666970E-02  0.921687630355970053E-02 -0.671540851829773044E-03
+ -0.622872550859014987E-03  0.926585805657732971E-02 -0.162062558227438999E-03
+ -0.666666666666666970E-02  0.755721760853244034E-02 -0.413711033678103961E-11
+ -0.164605378673800013E-03 -0.610685584929304044E-09  0.822717713318102915E-02
+ -0.666666666666666970E-02 -0.162064570152263006E-03  0.755721841422958996E-02
+ -0.421433625240463005E-11 -0.164615390632404009E-03 -0.621914612647070953E-09
+  0.822717706877345960E-02 -0.666666666666666970E-02 -0.435032309673597981E-03
+  0.921659272513397979E-02 -0.671468254207514999E-03  0.000000000000000000E+00
+ -0.435066076293283009E-03 -0.622849275891169040E-03  0.926581888559491022E-02
+ -0.666666666666666970E-02 -0.435031684645659992E-03  0.921601778227170050E-02
+ -0.671377596560439956E-03  0.000000000000000000E+00 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.435066701213360013E-03  0.000000000000000000E+00
+ -0.622828682917879006E-03  0.926519565944300058E-02 -0.666666666666666970E-02
+ -0.162168008367528997E-03  0.755720834454587983E-02 -0.111631510700418996E-10
+ -0.165130579339017988E-03 -0.162383743141729992E-08  0.822716472424545042E-02
+  0.822711943021554003E-02 -0.222017845615760010E-09 -0.666666666666666970E-02
+ -0.777761013879667033E-11  0.819196920656370918E-02 -0.666666666666666970E-02
+  0.917235355722251927E-02 -0.661205828628974026E-03 -0.620672923579346962E-03
+  0.926095692540444999E-02 -0.666666666666666970E-02 -0.162060886928861006E-03
+  0.755722230561571041E-02 -0.407434722062596981E-11 -0.164603197375618002E-03
+ -0.601754536458248027E-09  0.822718170485291916E-02 -0.435038460514972026E-03
+  0.921717884367924054E-02 -0.671598994291641997E-03 -0.435059926303323985E-03
+ -0.666666666666666970E-02 -0.623144536207649959E-03  0.926616805245615051E-02
+ -0.666666666666666970E-02 -0.435059886891442024E-03  0.926772984879662039E-02
+ -0.624637266544077001E-03 -0.435038499931220991E-03 -0.673028756927541039E-03
+  0.921880428832917015E-02 -0.666666666666666970E-02 -0.162064590679631006E-03
+  0.755721768483237979E-02 -0.421520958094493002E-11 -0.164615528433844000E-03
+ -0.622042930544206962E-09  0.822717716750567991E-02 -0.666666666666666970E-02
+ -0.435032046708114988E-03  0.921657978568275055E-02 -0.671461221312615953E-03
+  0.000000000000000000E+00 -0.435066339213917980E-03 -0.622843930781788978E-03
+  0.926581370321850027E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.435031628760092985E-03  0.921598912055172914E-02 -0.671353905916793046E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435066757089098022E-03
+  0.000000000000000000E+00 -0.622806875303639953E-03  0.926516708273138027E-02
+ -0.435042591383769009E-03 -0.666666666666666970E-02  0.921694193442373043E-02
+ -0.671550566383684018E-03 -0.435055795793300019E-03 -0.622869755363486023E-03
+  0.926586026640358919E-02 -0.666666666666666970E-02 -0.162061536519942994E-03
+  0.755722454624959024E-02 -0.409902564328030991E-11 -0.164610432002776987E-03
+ -0.605475143914787990E-09  0.822718468701886044E-02 -0.435048024953900003E-03
+  0.921879226944885034E-02 -0.673030814417662025E-03 -0.435050362434787020E-03
+ -0.666666666666666970E-02 -0.624403907899169952E-03  0.926747756344729022E-02
+ -0.164429321124159010E-03 -0.666666666666666970E-02  0.822722510694912958E-02
+ -0.405099938099922984E-09 -0.164359494664249005E-03 -0.403602349458583981E-09
+  0.822717730152964025E-02 -0.435057319269020976E-03 -0.666666666666666970E-02
+  0.926766299189552001E-02 -0.624575498879247960E-03 -0.435041067795715979E-03
+ -0.673029869981998019E-03  0.921880276298027955E-02 -0.666666666666666970E-02
+ -0.162062471480618010E-03  0.755721793381790036E-02 -0.413381786270217965E-11
+ -0.164605232829300992E-03 -0.610216091166618047E-09  0.822717734939633002E-02
+ -0.435056876315470997E-03  0.926585635253173064E-02 -0.622868044974361963E-03
+ -0.435041510784235001E-03 -0.666666666666666970E-02 -0.671542372337881029E-03
+  0.921690563981085974E-02 -0.164351356240546013E-03 -0.666666666666666970E-02
+  0.822698086373702987E-02 -0.395512544131405997E-09 -0.164365831071158996E-03
+ -0.398742006637490978E-09  0.822718605874551978E-02 -0.666666666666666970E-02
+  0.920452787846717975E-02 -0.668731214842509985E-03 -0.622283241890150981E-03
+  0.926452697312771072E-02  0.822724025612022077E-02 -0.586193181292780024E-09
+ -0.666666666666666970E-02 -0.399370802774403963E-11  0.756944692768340022E-02
+ -0.162114939474388000E-03 -0.666666666666666970E-02  0.756941304335114005E-02
+ -0.521957327686064991E-11 -0.164756390953382006E-03 -0.760317714252139982E-09
+  0.822719049518821043E-02 -0.666666666666666970E-02 -0.162348325334986989E-03
+  0.757141350179527019E-02 -0.154261696442177995E-04 -0.530720487223206049E-03
+ -0.221684447280562997E-03  0.891605622167588978E-02 -0.163035610806201990E-03
+ -0.666666666666666970E-02  0.791126088236420930E-02 -0.568886536946596036E-05
+  0.000000000000000000E+00 -0.530385316755896006E-03 -0.644747260841582991E-04
+  0.878526676544224942E-02 -0.666666666666666970E-02 -0.399601245888043025E-03
+  0.843932649296406934E-02 -0.399753779534292998E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.464883727737099017E-03
+  0.000000000000000000E+00 -0.476904229282087023E-03  0.874547695183781025E-02
+ -0.666666666666666970E-02 -0.162063070557396998E-03  0.755351614246298989E-02
+ -0.447630170821152977E-11 -0.164644944992450995E-03 -0.661962042461270997E-09
+  0.822717544552603076E-02  0.926551780075880044E-02 -0.622722840857442951E-03
+ -0.666666666666666970E-02 -0.670824267990035965E-03  0.921370888772457021E-02
+ -0.666666666666666970E-02 -0.434945178474266999E-03  0.921154593499040955E-02
+ -0.668579923648182010E-03 -0.435153154698610990E-03 -0.620572207908768009E-03
+  0.926326406979680050E-02 -0.666666666666666970E-02  0.755816621971812028E-02
+ -0.411326358485724017E-11 -0.610348502915667010E-09  0.822725905587247917E-02
+ -0.402874267847195980E-03 -0.666666666666666970E-02  0.865106149194994067E-02
+ -0.472609392802310021E-03 -0.462635021140100986E-03 -0.557606090580912007E-03
+  0.917391269482082002E-02 -0.666666666666666970E-02 -0.162519833129987003E-03
+  0.758488053948729989E-02 -0.277295971224945994E-04 -0.531411087991590981E-03
+ -0.271305613723866980E-03  0.895289446740019958E-02 -0.666666666666666970E-02
+ -0.435144823516233985E-03  0.927048838466438055E-02 -0.627189812839501980E-03
+  0.000000000000000000E+00 -0.434953518101383974E-03 -0.673638210139792994E-03
+  0.921926445312559045E-02 -0.666666666666666970E-02 -0.162365832936652005E-03
+  0.770469512965636042E-02 -0.341376585662229018E-05  0.000000000000000000E+00
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.530094362034539009E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.670698858845583960E-04
+  0.878401949557726033E-02 -0.666666666666666970E-02 -0.406469382005018019E-03
+  0.868845408484301973E-02 -0.487445981816212999E-03 -0.459955463954967011E-03
+ -0.561455560363951947E-03  0.917876824764385021E-02  0.822724019224600929E-02
+ -0.586235995361394981E-09 -0.666666666666666970E-02 -0.399393156536155037E-11
+  0.756942275977688043E-02 -0.666666666666666970E-02  0.756938783851883977E-02
+ -0.398490198193408992E-11 -0.582276435921294038E-09  0.822717710481313952E-02
+ -0.666666666666666970E-02 -0.162061804589790998E-03  0.755638842160883967E-02
+ -0.417671676259941005E-11 -0.164610667790356010E-03 -0.616892111647492956E-09
+  0.822717774708076922E-02 -0.435068239353668981E-03  0.926581925277365943E-02
+ -0.622854391016698997E-03 -0.435030146223642990E-03 -0.666666666666666970E-02
+ -0.671460680594175010E-03  0.921652801901830070E-02 -0.666666666666666970E-02
+ -0.164225650703184010E-03  0.822670097400921008E-02 -0.310328437504517979E-09
+ -0.164276003216507990E-03 -0.316769467002113981E-09  0.822720900471031989E-02
+ -0.666666666666666970E-02 -0.435040486303126981E-03  0.921687614266927975E-02
+ -0.671540506442383023E-03 -0.435057900712426009E-03 -0.622872183625812047E-03
+  0.926585769785132074E-02 -0.666666666666666970E-02 -0.162127332716032000E-03
+  0.759121301448910032E-02 -0.381743951397943036E-11  0.000000000000000000E+00
+ -0.164582975768345007E-03 -0.545926699323485040E-09  0.822717722734729075E-02
+ -0.666666666666666970E-02 -0.166319286578935988E-03  0.755721781846689991E-02
+ -0.125531050075483999E-10  0.000000000000000000E+00 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.166854284080451002E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.695180386014204010E-09  0.759121369168153960E-02
+ -0.435042566545895985E-03 -0.666666666666666970E-02  0.921694463459228966E-02
+ -0.671554769008383002E-03 -0.435055820629525977E-03 -0.622874016968664036E-03
+  0.926586380297488021E-02 -0.666666666666666970E-02 -0.435042888369998996E-03
+  0.921681962552989981E-02 -0.671408260658942963E-03 -0.435055498826277974E-03
+ -0.622730755410115045E-03  0.926572389917782943E-02 -0.164512123628282998E-03
+  0.822720916387005043E-02 -0.477817077089683953E-09 -0.162035062407616011E-03
+ -0.666666666666666970E-02 -0.321765105888902011E-11  0.755728112204505018E-02
+ -0.162062504982868002E-03 -0.666666666666666970E-02  0.755727880376912999E-02
+ -0.413009842789627016E-11 -0.164604675325971005E-03 -0.609640565546088004E-09
+  0.822717725918549990E-02 -0.162035022530451994E-03 -0.666666666666666970E-02
+  0.755725864235686981E-02 -0.321784768823900019E-11 -0.164512138073065988E-03
+ -0.477854634654902035E-09  0.822720916390744933E-02 -0.666666666666666970E-02
+ -0.162061944628342011E-03  0.755721772719453004E-02 -0.411382630968498961E-11
+ -0.164602365207998006E-03 -0.607300598025874993E-09  0.822717716454718015E-02
+ -0.435050067144334015E-03  0.926588448108721027E-02 -0.622883111379359043E-03
+ -0.435048320247372999E-03 -0.666666666666666970E-02 -0.671598373882453998E-03
+  0.921713778918750032E-02 -0.435048320032475997E-03 -0.666666666666666970E-02
+  0.921705200694796986E-02 -0.671504927255521023E-03 -0.435050067359229987E-03
+ -0.622795092030723960E-03  0.926579461935501947E-02 -0.666666666666666970E-02
+  0.755741246850972007E-02 -0.410910644191072979E-11 -0.606515170660810997E-09
+  0.822717712895315936E-02  0.822711877443554028E-02 -0.222069534616361010E-09
+ -0.666666666666666970E-02 -0.774169825927849969E-11  0.819168186428156052E-02
+ -0.433633789807636014E-03 -0.666666666666666970E-02  0.917151071499614987E-02
+ -0.660635211889124972E-03 -0.436454594099722020E-03 -0.620192760036731021E-03
+  0.926043761627064026E-02 -0.666666666666666970E-02 -0.435401639669350015E-03
+  0.924572083824642954E-02 -0.602697496588707024E-03 -0.434696125740512986E-03
+ -0.641341546899128006E-03  0.919049825465466028E-02 -0.417766807487059990E-03
+ -0.666666666666666970E-02  0.884119030057015955E-02 -0.565196723638367016E-03
+  0.000000000000000000E+00 -0.450926776155331018E-03 -0.597865637200630960E-03
+  0.921780465508625045E-02 -0.666666666666666970E-02 -0.162927680984810988E-03
+  0.761083385223151974E-02 -0.416350559257179991E-04  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.530230645905115008E-03
+  0.000000000000000000E+00 -0.269827250117159992E-03  0.866009133064084974E-02
+ -0.666666666666666970E-02 -0.162144345329803000E-03  0.760092333284733031E-02
+ -0.320930747823896000E-05 -0.529813749349416032E-03 -0.838256764278110937E-04
+  0.879776791477051949E-02  0.822717682684740922E-02 -0.303187284199640013E-09
+ -0.666666666666666970E-02 -0.349301427958089990E-11  0.790044609725218047E-02
+ -0.666666666666666970E-02 -0.407938480851945009E-03  0.871084859569853931E-02
+ -0.512295472619773030E-03 -0.458837926585046020E-03 -0.588085060897069957E-03
+  0.920289209739394085E-02 -0.666666666666666970E-02  0.755933306799180978E-02
+ -0.410021688445045033E-11 -0.607789044544431015E-09  0.822725774217296021E-02
+ -0.162167689058370998E-03 -0.666666666666666970E-02  0.755666327735462964E-02
+ -0.112741278002712006E-10 -0.165135297873814004E-03 -0.164049626541977997E-08
+  0.822716577112805013E-02 -0.666666666666666970E-02 -0.162508948582128004E-03
+  0.758386230747792964E-02 -0.268846850272863999E-04 -0.531370159696699959E-03
+ -0.268798452226511010E-03  0.895121067622118953E-02 -0.666666666666666970E-02
+ -0.435111783295980990E-03  0.926957338892927044E-02 -0.626355975132891958E-03
+  0.000000000000000000E+00 -0.434986584483092024E-03 -0.673610961683688029E-03
+  0.921919033051740028E-02 -0.666666666666666970E-02 -0.162383112105014989E-03
+  0.770842507081398020E-02 -0.352838184825081003E-05  0.000000000000000000E+00
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.529946733926804961E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.685200481248618984E-04
+  0.878522933538209046E-02 -0.666666666666666970E-02 -0.406001492103280994E-03
+  0.868371047881797038E-02 -0.486532578576920989E-03 -0.460309450350815017E-03
+ -0.562408172008129988E-03  0.917930422133162068E-02  0.822717710596379027E-02
+ -0.583431679178089002E-09 -0.666666666666666970E-02 -0.399067558641347968E-11
+  0.756879025163890995E-02 -0.666666666666666970E-02  0.756882506046350958E-02
+ -0.399965820178691986E-11 -0.587373439708311021E-09  0.822723979745249023E-02
+ -0.666666666666666970E-02 -0.162059699334328987E-03  0.755640315367181025E-02
+ -0.409724983315023040E-11 -0.164617694447567010E-03 -0.605894425669846960E-09
+  0.822719082319614027E-02 -0.435046673499388009E-03  0.921780363530167943E-02
+ -0.671765264155256970E-03 -0.435051713864318020E-03 -0.666666666666666970E-02
+ -0.623330396350863002E-03  0.926636396497984918E-02 -0.666666666666666970E-02
+ -0.164528465340170010E-03  0.822727665463759972E-02 -0.426431688727269020E-09
+ -0.164384495696545988E-03 -0.423252858884490006E-09  0.822718001752816008E-02
+ -0.666666666666666970E-02 -0.435040505956391999E-03  0.921687628114545993E-02
+ -0.671540025560727998E-03 -0.435057881060873976E-03 -0.622871599674077966E-03
+  0.926585723223368081E-02 -0.666666666666666970E-02 -0.162127506812547009E-03
+  0.759121355987753960E-02 -0.382292666777456993E-11  0.000000000000000000E+00
+ -0.164583701911017989E-03 -0.546697495368834967E-09  0.822717719106172071E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.166319165150470994E-03
+  0.755721813537886013E-02 -0.125075381051564997E-10 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.166854154167308990E-03  0.000000000000000000E+00
+ -0.692662263886114992E-09  0.759121431463458986E-02 -0.435081853759711993E-03
+ -0.666666666666666970E-02  0.926716255638974050E-02 -0.624065278105386955E-03
+ -0.435016528292601001E-03 -0.671759479502790952E-03  0.921784374777362935E-02
+ -0.666666666666666970E-02 -0.435040907342144000E-03  0.921689077912838002E-02
+ -0.671540798041408006E-03 -0.435057479709268001E-03 -0.622869631331658029E-03
+  0.926586003647436943E-02 -0.164634692602426999E-03  0.822718023129201045E-02
+ -0.639569061694972014E-09 -0.162067618394035989E-03 -0.666666666666666970E-02
+ -0.433481102637178975E-11  0.755721907361002043E-02 -0.162058744590199013E-03
+ -0.666666666666666970E-02  0.755722007835428998E-02 -0.399695343689710995E-11
+ -0.164609059967339006E-03 -0.591014713275666970E-09  0.822719388300061083E-02
+ -0.162067628996523003E-03 -0.666666666666666970E-02  0.755722473679793037E-02
+ -0.433474416210013021E-11 -0.164634688823378012E-03 -0.639556359277955005E-09
+  0.822718023127935044E-02 -0.666666666666666970E-02 -0.162059007940064013E-03
+  0.755722761558702029E-02 -0.400519574062733029E-11 -0.164602739721675997E-03
+ -0.591972127525160032E-09  0.822718830313987938E-02 -0.435017256528589982E-03
+  0.921784067930350032E-02 -0.671758935795494969E-03 -0.435081125759231022E-03
+ -0.666666666666666970E-02 -0.624046237027197953E-03  0.926714180760297018E-02
+ -0.435081137126827990E-03 -0.666666666666666970E-02  0.926672070407219030E-02
+ -0.623654480728591011E-03 -0.435017245156463000E-03 -0.671584693353717023E-03
+  0.921719987714082929E-02 -0.666666666666666970E-02  0.755944258238581013E-02
+ -0.409900186657922026E-11 -0.607550069673094021E-09  0.822725761878752987E-02
+  0.822717683107230917E-02 -0.307429328465473987E-09 -0.666666666666666970E-02
+ -0.346590495801865012E-11  0.788944074290687074E-02 -0.405969478945487005E-03
+ -0.666666666666666970E-02  0.868864830866255926E-02 -0.503334330728072020E-03
+ -0.460337024732346983E-03 -0.585683665883423958E-03  0.919951404153332084E-02
+ -0.666666666666666970E-02 -0.434976282613521999E-03  0.922412114032335022E-02
+ -0.680891393674205956E-03 -0.435122078171959979E-03 -0.632218355554151046E-03
+  0.927546870226302056E-02 -0.163258162677796989E-03 -0.666666666666666970E-02
+  0.771427722386641007E-02 -0.477633227286587011E-04  0.000000000000000000E+00
+ -0.532374011844484007E-03 -0.264816258060028011E-03  0.893589130313274076E-02
+ -0.666666666666666970E-02 -0.163022377118752999E-03  0.756010678634908007E-02
+ -0.985505930025391935E-05  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.528465266297688968E-03  0.000000000000000000E+00
+ -0.112888567375700995E-03  0.818271007019684068E-02  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.162166578290559013E-03  0.755933922022227964E-02
+ -0.104753666208369006E-10 -0.165099058432877988E-03 -0.152245985024684009E-08
+  0.822717141733115068E-02  0.822711899107518921E-02 -0.221645138348997998E-09
+ -0.666666666666666970E-02 -0.803318982792332002E-11  0.819386807586616923E-02
+ -0.666666666666666970E-02 -0.433725919557807001E-03  0.917430862025363052E-02
+ -0.661288984210802052E-03 -0.436363721932067020E-03 -0.620322135837134050E-03
+  0.926073953967261956E-02 -0.666666666666666970E-02  0.755741928801698017E-02
+ -0.410903028639137974E-11 -0.606500685137881987E-09  0.822717712893872993E-02
+ -0.162225051674325002E-03 -0.666666666666666970E-02  0.762801683545900975E-02
+ -0.369132323199304980E-05 -0.529778141458443004E-03 -0.883337008316910978E-04
+  0.880271404367008979E-02 -0.666666666666666970E-02 -0.435040483790848998E-03
+  0.921687610102163955E-02 -0.671540507590834041E-03 -0.435057903224484984E-03
+ -0.622872195907222962E-03  0.926585773544527994E-02 -0.666666666666666970E-02
+ -0.435049596668294974E-03  0.926588177948937074E-02 -0.622878658890590954E-03
+  0.000000000000000000E+00 -0.435048790726422978E-03 -0.671587306492918033E-03
+  0.921715304894545062E-02 -0.666666666666666970E-02 -0.162062557266168006E-03
+  0.755721780085659000E-02 -0.413875860912190023E-11  0.000000000000000000E+00
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.164620130721840991E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.611410160549998955E-09
+  0.822718806718837972E-02 -0.666666666666666970E-02 -0.435040499350135008E-03
+  0.921687692625696060E-02 -0.671540975224692001E-03 -0.435057887666556015E-03
+ -0.622872560192916044E-03  0.926585810471101945E-02  0.822717712936562977E-02
+ -0.606929315017626020E-09 -0.666666666666666970E-02 -0.411128311855796991E-11
+  0.755721765969236993E-02 -0.666666666666666970E-02  0.755721765969236993E-02
+ -0.411128306434785967E-11 -0.606929314990520966E-09  0.822717712936562977E-02
+ -0.666666666666666970E-02 -0.162059344493046002E-03  0.755722651317146995E-02
+ -0.401759691005694024E-11 -0.164603667382743994E-03 -0.593754382165260968E-09
+  0.822718774646260045E-02 -0.435021349583778003E-03  0.921808799379093997E-02
+ -0.672054642719300036E-03 -0.435077033929117025E-03 -0.666666666666666970E-02
+ -0.624215698660223004E-03  0.926731300229175978E-02 -0.666666666666666970E-02
+ -0.435077064835482003E-03  0.926661685822192938E-02 -0.623558994505915955E-03
+ -0.435021318668023997E-03 -0.671590322333262958E-03  0.921719752068604972E-02
+ -0.666666666666666970E-02 -0.162064557104371002E-03  0.755721777860633021E-02
+ -0.421391246707965999E-11 -0.164615474626227001E-03 -0.621858123915516982E-09
+  0.822717725212621966E-02 -0.666666666666666970E-02 -0.435032241858439002E-03
+  0.921659906098254954E-02 -0.671477447178317022E-03  0.000000000000000000E+00
+ -0.435066144096944024E-03 -0.622858570800819051E-03  0.926582756926139055E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435031827990191983E-03
+  0.921598377863167006E-02 -0.671334364038543986E-03 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.435066557893876995E-03  0.000000000000000000E+00
+ -0.622785914397349987E-03  0.926515598810159942E-02 -0.435075666026484009E-03
+ -0.666666666666666970E-02  0.926727820268466972E-02 -0.624183768120280973E-03
+ -0.435022717858528992E-03 -0.672055614043404951E-03  0.921808835078249955E-02
+ -0.666666666666666970E-02 -0.162062424733906989E-03  0.755721807683177975E-02
+ -0.413205652975882023E-11 -0.164605235421495002E-03 -0.609967620239067977E-09
+  0.822717752452148052E-02 -0.435056352038405984E-03  0.926585352741150087E-02
+ -0.622863300988667017E-03 -0.435042035100298007E-03 -0.666666666666666970E-02
+ -0.671540654928718013E-03  0.921691864794477005E-02 -0.164343513344571992E-03
+ -0.666666666666666970E-02  0.822700195448362002E-02 -0.389924360631551020E-09
+ -0.164367623390296991E-03 -0.392998923185469007E-09  0.822719276781523970E-02
+ -0.435043552264714024E-03 -0.666666666666666970E-02  0.921696961930900012E-02
+ -0.671552161840081006E-03 -0.435054834971315978E-03 -0.622865703361334027E-03
+  0.926585903533608951E-02 -0.666666666666666970E-02 -0.162060264459924991E-03
+  0.755722786475403977E-02 -0.405192380789587974E-11 -0.164611230243440009E-03
+ -0.598848077831665017E-09  0.822718997281340035E-02 -0.435034019503558974E-03
+  0.921807335886390064E-02 -0.672057800833721053E-03 -0.435064366738565982E-03
+ -0.666666666666666970E-02 -0.623908223929953956E-03  0.926697860106714955E-02
+ -0.164987547210470009E-03 -0.666666666666666970E-02  0.822761164777149022E-02
+ -0.417375828718230023E-09 -0.164359000422478990E-03 -0.403612204340233986E-09
+  0.822717763383406034E-02 -0.666666666666666970E-02  0.755721765245576018E-02
+ -0.411128322661895974E-11 -0.606929330387248952E-09  0.822717712936565058E-02
+  0.822717692676105082E-02 -0.401916206499038982E-09 -0.666666666666666970E-02
+ -0.398504312842813992E-09  0.822695178414330035E-02 -0.435040783009246019E-03
+ -0.666666666666666970E-02  0.921688646246055995E-02 -0.671543142540194956E-03
+ -0.435057604031761002E-03 -0.622873025744601029E-03  0.926585914161285988E-02
+ -0.666666666666666970E-02 -0.162064588603687992E-03  0.755722246650228965E-02
+ -0.421473369469323961E-11 -0.164615514901262993E-03 -0.621972777019227005E-09
+  0.822717720033716050E-02 -0.435031817237567986E-03 -0.666666666666666970E-02
+  0.921658561270818043E-02 -0.671475161541192035E-03  0.000000000000000000E+00
+ -0.435066568644765022E-03 -0.622858805909676998E-03  0.926582686247801046E-02
+ -0.666666666666666970E-02 -0.435031985152347994E-03  0.921599362553296962E-02
+ -0.671355526523753957E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.435066400758952999E-03  0.000000000000000000E+00
+ -0.622807256895068981E-03  0.926516002397628075E-02 -0.666666666666666970E-02
+ -0.162062535395593997E-03  0.755721770745760007E-02 -0.413623360119686986E-11
+ -0.164605263979841011E-03 -0.610558026872576020E-09  0.822717713332134053E-02
+  0.822717692676101960E-02 -0.401916174204150005E-09 -0.666666666666666970E-02
+ -0.398504881043303978E-09  0.822695182391632029E-02 -0.666666666666666970E-02
+ -0.435040784478542004E-03  0.921688651148403967E-02 -0.671543153316058990E-03
+ -0.435057602562588996E-03 -0.622873027716196984E-03  0.926585914660035988E-02
+ -0.666666666666666970E-02  0.755721765260307984E-02 -0.411128317076067032E-11
+ -0.606929330073791030E-09  0.822717712936565058E-02 -0.162062535451855009E-03
+ -0.666666666666666970E-02  0.755721767183212961E-02 -0.413623865254959015E-11
+ -0.164605264611767001E-03 -0.610558775336228996E-09  0.822717713310498061E-02
+ -0.666666666666666970E-02 -0.435041508530557993E-03  0.921691025951686983E-02
+ -0.671547950168372028E-03 -0.435056878568975021E-03 -0.622873496063732976E-03
+  0.926586118314657974E-02 -0.666666666666666970E-02 -0.435049193102497007E-03
+  0.921716879431322932E-02 -0.671606868811568960E-03  0.000000000000000000E+00
+ -0.435049194293033988E-03 -0.622886307249136953E-03  0.926588937729599070E-02
+ -0.666666666666666970E-02 -0.162062558223755992E-03  0.755721793600594959E-02
+ -0.413708123041369019E-11  0.000000000000000000E+00 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.164605375314667000E-03  0.000000000000000000E+00
+ -0.610681172826885034E-09  0.822717711923300066E-02 -0.666666666666666970E-02
+ -0.435040497369072016E-03  0.921687685990914968E-02 -0.671540960664754978E-03
+ -0.435057889647446998E-03 -0.622872557534397998E-03  0.926585809770509930E-02
+  0.822717712936562977E-02 -0.606929315458585975E-09 -0.666666666666666970E-02
+ -0.411128312059151024E-11  0.755721765951058999E-02 -0.666666666666666970E-02
+  0.755721765951058999E-02 -0.411128314769655971E-11 -0.606929315377271019E-09
+  0.822717712936562977E-02 -0.666666666666666970E-02 -0.162062535431107012E-03
+  0.755721765815363031E-02 -0.413623888851339997E-11 -0.164605264643440992E-03
+ -0.610558828302725956E-09  0.822717713301761994E-02 -0.164360410638444992E-03
+  0.822717692977796913E-02 -0.404391259451850998E-09 -0.164359006270221988E-03
+ -0.666666666666666970E-02 -0.400958477955193996E-09  0.822695178639428967E-02
+ -0.666666666666666970E-02 -0.435040782988183007E-03  0.921688646176349081E-02
+ -0.671543142412070991E-03 -0.435057604052822009E-03 -0.622873025744216029E-03
+  0.926585914154554914E-02 -0.666666666666666970E-02 -0.435040480198357025E-03
+  0.921687629980787008E-02 -0.671540848012874966E-03 -0.435057906816663025E-03
+ -0.622872547218613964E-03  0.926585805277396063E-02 -0.666666666666666970E-02
+ -0.435049192310166017E-03  0.921716878050548964E-02 -0.671606876994400976E-03
+  0.000000000000000000E+00 -0.435049195085364978E-03 -0.622886319709903963E-03
+  0.926588938772177073E-02 -0.666666666666666970E-02 -0.162062559106551998E-03
+  0.755721765241205989E-02 -0.413713446085067003E-11  0.000000000000000000E+00
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.164605382365712999E-03
+  0.000000000000000000E+00 -0.610689035956021986E-09  0.822717709595601962E-02
+ -0.162062558807452007E-03 -0.666666666666666970E-02  0.755721765294025023E-02
+ -0.413712892374228960E-11 -0.164605381125477004E-03 -0.610688254193982979E-09
+  0.822717713365142024E-02 -0.666666666666666970E-02 -0.162062001820446010E-03
+  0.755721765238299981E-02 -0.411599096858730035E-11 -0.164602607028427003E-03
+ -0.607614049978665960E-09  0.822717713004911949E-02 -0.435050784134680991E-03
+  0.926588367152923927E-02 -0.622883807494963005E-03 -0.435047603248173024E-03
+ -0.666666666666666970E-02 -0.671594832025931951E-03  0.921711541901995036E-02
+ -0.435047603248167982E-03 -0.666666666666666970E-02  0.921711541808560922E-02
+ -0.671594830956834992E-03 -0.435050784134685979E-03 -0.622883806457666020E-03
+  0.926588367056311972E-02 -0.435040480225729010E-03 -0.666666666666666970E-02
+  0.921687630059584047E-02 -0.671540848078642995E-03 -0.435057906789294022E-03
+ -0.622872547125350026E-03  0.926585805273653050E-02 -0.666666666666666970E-02
+ -0.435040480222318978E-03  0.921687628821601067E-02 -0.671540835117597997E-03
+ -0.435057906792703025E-03 -0.622872534696777965E-03  0.926585803993784998E-02
+ -0.164605381122261992E-03  0.822717713365140983E-02 -0.610688243907033049E-09
+ -0.162062558816420988E-03 -0.666666666666666970E-02 -0.413712878804995986E-11
+  0.755721765777050969E-02 -0.162062559114418996E-03 -0.666666666666666970E-02
+  0.755721765773097014E-02 -0.413714007422677992E-11 -0.164605382603210006E-03
+ -0.610689880891928972E-09  0.822717713311491017E-02 -0.666666666666666970E-02
+  0.755721765237515972E-02 -0.411128320041550968E-11 -0.606929330558722976E-09
+  0.822717712936565058E-02  0.822717692676106990E-02 -0.401916224778808988E-09
+ -0.666666666666666970E-02 -0.398503993603989025E-09  0.822695176179714938E-02
+ -0.435040782183969974E-03 -0.666666666666666970E-02  0.921688643492011954E-02
+ -0.671543136482110042E-03 -0.435057604856967008E-03 -0.622873024631880002E-03
+  0.926585913880642077E-02 -0.666666666666666970E-02 -0.162064603069876005E-03
+  0.755721765255115002E-02 -0.421568943808864994E-11 -0.164615562666949012E-03
+ -0.622111784951490950E-09  0.822717714448729931E-02 -0.435031767249488988E-03
+ -0.666666666666666970E-02  0.921658397566323985E-02 -0.671474830577318974E-03
+  0.000000000000000000E+00 -0.435066618624126004E-03 -0.622858773502819960E-03
+  0.926582672350118042E-02 -0.666666666666666970E-02 -0.435031759838531019E-03
+  0.921598447237031992E-02 -0.671353608087835044E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435066626033650983E-03
+  0.000000000000000000E+00 -0.622806883370278998E-03  0.926515741219141944E-02
+ -0.666666666666666970E-02 -0.162062535484635997E-03  0.755721765240888969E-02
+ -0.413624141678407034E-11 -0.164605264966746989E-03 -0.610559194915996025E-09
+  0.822717713298093921E-02  0.822717692676106990E-02 -0.401916224778717016E-09
+ -0.666666666666666970E-02 -0.398503993605691001E-09  0.822695176179727081E-02
+ -0.666666666666666970E-02 -0.435040782183975015E-03  0.921688643492027046E-02
+ -0.671543136482143001E-03 -0.435057604856962997E-03 -0.622873024631885965E-03
+  0.926585913880642945E-02 -0.666666666666666970E-02  0.755721765237515972E-02
+ -0.411128320041550968E-11 -0.606929330558722046E-09  0.822717712936565058E-02
+ -0.162062535484635997E-03 -0.666666666666666970E-02  0.755721765240875958E-02
+ -0.413624138969446992E-11 -0.164605264966748995E-03 -0.610559194918309029E-09
+  0.822717713298093921E-02 -0.666666666666666970E-02 -0.162064573740720012E-03
+  0.755721745014789007E-02 -0.421455659253393022E-11 -0.164615418100806996E-03
+ -0.621947116183705050E-09  0.822717706997160014E-02 -0.666666666666666970E-02
+ -0.435032332380832999E-03  0.921659235812519011E-02 -0.671466933340958026E-03
+  0.000000000000000000E+00 -0.435066053589888019E-03 -0.622847839674434958E-03
+  0.926581781614685060E-02 -0.666666666666666970E-02 -0.435031592540386985E-03
+  0.921601550080636961E-02 -0.671377077483803031E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435066793302425986E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.622828624049149020E-03
+  0.926519622533869026E-02 -0.666666666666666970E-02 -0.162168008363252009E-03
+  0.755720833616017017E-02 -0.111631527935513995E-10 -0.165130579415455000E-03
+ -0.162383770179478996E-08  0.822716472435720998E-02  0.822711943027826069E-02
+ -0.222017847415367999E-09 -0.666666666666666970E-02 -0.777760943851021986E-11
+  0.819196920090728073E-02 -0.666666666666666970E-02  0.917235354987984940E-02
+ -0.661205826876668044E-03 -0.620672923200048048E-03  0.926095692459277982E-02
+ -0.666666666666666970E-02 -0.162061842522603000E-03  0.755722298561666960E-02
+ -0.411041893471449994E-11 -0.164609505054689987E-03 -0.607053573850319957E-09
+  0.822718286444302019E-02 -0.164360468540171998E-03  0.822717730472606081E-02
+ -0.403903416083619976E-09 -0.164359675113830009E-03 -0.666666666666666970E-02
+ -0.403282765837450980E-09  0.822713708288526011E-02 -0.666666666666666970E-02
+ -0.435047234874819013E-03  0.921870444385484988E-02 -0.672996776069070043E-03
+ -0.435051152501487017E-03 -0.624363760097220040E-03  0.926743532429376987E-02
+ -0.666666666666666970E-02 -0.435040488105227008E-03  0.921687626274407014E-02
+ -0.671540573261780044E-03 -0.435057898910482974E-03 -0.622872236309804958E-03
+  0.926585776729671982E-02 -0.666666666666666970E-02 -0.435049476964798998E-03
+  0.926588407933098997E-02 -0.622880979701605024E-03  0.000000000000000000E+00
+ -0.435048910430331005E-03 -0.671593403370661043E-03  0.921715753812691989E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.162062536856723012E-03
+  0.755721779084045971E-02 -0.413747795884770991E-11 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.164615649658616002E-03  0.000000000000000000E+00
+ -0.611079896253776960E-09  0.822718482294398055E-02 -0.162062474956520004E-03
+ -0.666666666666666970E-02  0.755722267180921042E-02 -0.413358698004224000E-11
+ -0.164605418170935013E-03 -0.610187187425962033E-09  0.822717750858028077E-02
+ -0.666666666666666970E-02 -0.162060451582619013E-03  0.755722364032399008E-02
+ -0.405820744876872962E-11 -0.164603101435012986E-03 -0.599472333441493042E-09
+  0.822718324339584070E-02 -0.435033610082050993E-03  0.921880734854766022E-02
+ -0.673021763224571968E-03 -0.435064776097052002E-03 -0.666666666666666970E-02
+ -0.624752427388953016E-03  0.926785482617738070E-02 -0.435064833463209995E-03
+ -0.666666666666666970E-02  0.926629223154662972E-02 -0.623259249505167968E-03
+ -0.435033552706628004E-03 -0.671592121883009046E-03  0.921718022533242996E-02
+ -0.435057725512486980E-03 -0.666666666666666970E-02  0.926766886039601979E-02
+ -0.624580707385012017E-03 -0.435040661518347027E-03 -0.673023701802679037E-03
+  0.921879973034194986E-02 -0.666666666666666970E-02 -0.435040745621057988E-03
+  0.921687745366213916E-02 -0.671533468502133989E-03 -0.435057641416789994E-03
+ -0.622863794394841984E-03  0.926585102903257005E-02 -0.164605418982737001E-03
+  0.822717750858287938E-02 -0.610189797184088988E-09 -0.162062472691372001E-03
+ -0.666666666666666970E-02 -0.413360072924052010E-11  0.755722145179233036E-02
+ -0.162060895421523010E-03 -0.666666666666666970E-02  0.755722209603739998E-02
+ -0.407539453490869988E-11 -0.164609527091146007E-03 -0.602110183541000016E-09
+  0.822718635496286037E-02 -0.666666666666666970E-02  0.920452788146493978E-02
+ -0.668731215529775008E-03 -0.622283242035163997E-03  0.926452697345470956E-02
+  0.822724025612128936E-02 -0.586193175774719989E-09 -0.666666666666666970E-02
+ -0.399370800068332996E-11  0.756944693050668957E-02 -0.162114939472544992E-03
+ -0.666666666666666970E-02  0.756941304617413969E-02 -0.521957294732459965E-11
+ -0.164756390920850005E-03 -0.760317657384023048E-09  0.822719049519187069E-02
+ -0.666666666666666970E-02 -0.162348328933429009E-03  0.757141579588467995E-02
+ -0.154258694392289985E-04 -0.530720471615547980E-03 -0.221681707782894001E-03
+  0.891605396648051038E-02 -0.163035609389389007E-03 -0.666666666666666970E-02
+  0.791126060525454075E-02 -0.568885700258806994E-05  0.000000000000000000E+00
+ -0.530385314277834956E-03 -0.644746756760560056E-04  0.878526670786249943E-02
+ -0.666666666666666970E-02 -0.399601863075062019E-03  0.843933027528507955E-02
+ -0.399755487421508981E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.464883291727197006E-03  0.000000000000000000E+00
+ -0.476904595508027981E-03  0.874547668628808915E-02 -0.666666666666666970E-02
+ -0.162063070476976004E-03  0.755351620828405030E-02 -0.447629245041502000E-11
+ -0.164644943926219012E-03 -0.661960659410746001E-09  0.822717544592907989E-02
+  0.926551780966223919E-02 -0.622722844759921010E-03 -0.666666666666666970E-02
+ -0.670824286631610035E-03  0.921370896996778939E-02 -0.666666666666666970E-02
+ -0.434945180954293003E-03  0.921154601677883975E-02 -0.668579941724334022E-03
+ -0.435153152221171024E-03 -0.620572211261187045E-03  0.926326407816084954E-02
+ -0.666666666666666970E-02  0.755816621995536019E-02 -0.411326355512401980E-11
+ -0.610348502411631949E-09  0.822725905587248958E-02 -0.402874286571233979E-03
+ -0.666666666666666970E-02  0.865106167785427947E-02 -0.472609462967472005E-03
+ -0.462635007412455013E-03 -0.557606102878732972E-03  0.917391271174717964E-02
+ -0.666666666666666970E-02 -0.415928551473747994E-03  0.874525952715160017E-02
+ -0.345831822895715981E-03 -0.452437222882241992E-03 -0.346464452179155984E-03
+  0.898436212884005936E-02 -0.666666666666666970E-02 -0.410333854430279001E-03
+  0.874024567848459034E-02 -0.526325481552128048E-03  0.000000000000000000E+00
+ -0.456977123849370008E-03 -0.594710599463255975E-03  0.921019356669105019E-02
+ -0.666666666666666970E-02 -0.349138128782108001E-03  0.808745191949900022E-02
+ -0.174389554292269988E-03  0.000000000000000000E+00 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.493489859954412968E-03  0.000000000000000000E+00
+ -0.295632059850581025E-03  0.859314138531184005E-02 -0.666666666666666970E-02
+ -0.162165456183634003E-03  0.755894482058337962E-02 -0.104756736090630003E-10
+ -0.165141512110283999E-03 -0.152686347573174001E-08  0.822721248554279978E-02
+  0.925936813557192914E-02 -0.620622041734691043E-03 -0.666666666666666970E-02
+ -0.661330289554338960E-03  0.917272027027182975E-02 -0.666666666666666970E-02
+  0.917272027027182975E-02 -0.661330289554338960E-03 -0.620622041734691043E-03
+  0.925936813557192914E-02 -0.666666666666666970E-02 -0.162062426567090008E-03
+  0.755772612433826957E-02 -0.409070240552687966E-11 -0.164599684884186013E-03
+ -0.603681595043669966E-09  0.822717664154948985E-02 -0.435043942803598025E-03
+  0.921717692320900978E-02 -0.671606591937570947E-03 -0.435054444453763980E-03
+ -0.666666666666666970E-02 -0.623016030658318006E-03  0.926602914632313965E-02
+ -0.666666666666666970E-02 -0.435054444436950010E-03  0.926603032447032016E-02
+ -0.623017161092738014E-03 -0.435043942820411995E-03 -0.671607789864231051E-03
+  0.921717803397737075E-02 -0.666666666666666970E-02 -0.162064603090850004E-03
+  0.755721765255309031E-02 -0.421569022865129025E-11 -0.164615562771554003E-03
+ -0.622111903849669951E-09  0.822717714451166003E-02 -0.666666666666666970E-02
+ -0.435031767124851004E-03  0.921658398188585082E-02 -0.671474840199063022E-03
+  0.000000000000000000E+00 -0.435066618748741979E-03 -0.622858783402943975E-03
+  0.926582673394005067E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.435031759684571979E-03  0.921598447096053965E-02 -0.671353614127903012E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435066626187581996E-03
+  0.000000000000000000E+00 -0.622806890458470035E-03  0.926515741526657936E-02
+ -0.435040444163285018E-03 -0.666666666666666970E-02  0.921687548937809992E-02
+ -0.671541060679635036E-03 -0.435057942848580980E-03 -0.622872964977058013E-03
+  0.926585833344844949E-02 -0.666666666666666970E-02 -0.162062561393839003E-03
+  0.755721764798701970E-02 -0.413722734407884995E-11 -0.164605394042161002E-03
+ -0.610702580324128951E-09  0.822717713158752952E-02 -0.435057935435147018E-03
+  0.926585952661160978E-02 -0.622874138897695035E-03 -0.435040451577369989E-03
+ -0.666666666666666970E-02 -0.671542323723484992E-03  0.921687685921447966E-02
+ -0.164358967946371006E-03 -0.666666666666666970E-02  0.822694291289794051E-02
+ -0.400842285147718976E-09 -0.164360426857548987E-03 -0.404409911340958016E-09
+  0.822717692852258965E-02 -0.435040431638602989E-03 -0.666666666666666970E-02
+  0.921687619006436934E-02 -0.671542172631628951E-03 -0.435057955372164984E-03
+ -0.622874107379275976E-03  0.926585945492748933E-02 -0.666666666666666970E-02
+ -0.162062560447055011E-03  0.755721764904289991E-02 -0.413719132263620016E-11
+ -0.164605389320235000E-03 -0.610697341609479036E-09  0.822717713218053953E-02
+ -0.435057923606845009E-03  0.926585840260774063E-02 -0.622872995394380970E-03
+ -0.435040463406708008E-03 -0.666666666666666970E-02 -0.671541206497535000E-03
+  0.921687613516897035E-02 -0.164358974674477006E-03 -0.666666666666666970E-02
+  0.822694323093815967E-02 -0.400850494105121994E-09 -0.164360431601409999E-03
+ -0.404413311485434000E-09  0.822717692781809069E-02 -0.666666666666666970E-02
+  0.895231589357087069E-02 -0.601006644778351003E-03 -0.606288347917667949E-03
+  0.923179444181664033E-02  0.921580902911038960E-02 -0.671118539139058980E-03
+ -0.666666666666666970E-02 -0.628092211209406968E-03  0.927168309255030007E-02
+ -0.174566226528932001E-03 -0.666666666666666970E-02  0.823503972118425076E-02
+ -0.510771931197153037E-06 -0.167453071908171011E-03 -0.287656482013064974E-06
+  0.822679160086088948E-02 -0.666666666666666970E-02 -0.162064998742904011E-03
+  0.755721692802432000E-02 -0.423105196131728011E-11 -0.164617536966823991E-03
+ -0.624345265608089027E-09  0.822717661914844009E-02 -0.435027295434020991E-03
+ -0.666666666666666970E-02  0.921651153587084958E-02 -0.671532950972917025E-03
+  0.000000000000000000E+00 -0.435071089558342007E-03 -0.622941361344091053E-03
+  0.926589065346502053E-02 -0.666666666666666970E-02 -0.435031391217050999E-03
+  0.921594150620621937E-02 -0.671460176695444977E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435066994590015989E-03
+  0.000000000000000000E+00 -0.622929213193418037E-03  0.926511191021490987E-02
+ -0.666666666666666970E-02 -0.162052976422443997E-03  0.755715649690166975E-02
+ -0.378464866461816970E-11 -0.164557847410658995E-03 -0.559380649971562964E-09
+  0.822711615636883935E-02  0.921569960071171035E-02 -0.671199549931674020E-03
+ -0.666666666666666970E-02 -0.624992484434852037E-03  0.926820615627817028E-02
+ -0.666666666666666970E-02 -0.170162251258758009E-03  0.823042260002626071E-02
+ -0.875293478015414056E-07 -0.166825903638244990E-03 -0.692727617230331017E-07
+  0.822728043872173083E-02 -0.666666666666666970E-02  0.900314235025987016E-02
+ -0.616359389511348982E-03 -0.610140243511595988E-03  0.923851413586814982E-02
+ -0.162042399164606009E-03 -0.666666666666666970E-02  0.755717693475327010E-02
+ -0.343592691293592996E-11 -0.164505035934295996E-03 -0.508562477958249036E-09
+  0.822712657547360927E-02 -0.666666666666666970E-02 -0.435040441298520982E-03
+  0.921688002980029959E-02 -0.671545791530904035E-03 -0.435057945713094023E-03
+ -0.622877507553345035E-03  0.926586316856804065E-02 -0.666666666666666970E-02
+ -0.162131095725141989E-03  0.759120433096500003E-02 -0.393925786733449011E-11
+  0.000000000000000000E+00 -0.164599730797555013E-03 -0.563072851826958952E-09
+  0.822717559878123963E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.166319227572208993E-03  0.755721552651014965E-02 -0.125030561412552005E-10
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.166854118797604998E-03
+  0.000000000000000000E+00 -0.692387796644269027E-09  0.759120514631322994E-02
+ -0.666666666666666970E-02 -0.162099247780077998E-03  0.755721979757678023E-02
+ -0.581369082613002011E-11 -0.164789517015928995E-03 -0.853933169653901039E-09
+  0.822717846337374034E-02  0.926237298745190979E-02 -0.621979742880892045E-03
+ -0.666666666666666970E-02 -0.667643955617555022E-03  0.919963617413947950E-02
+ -0.666666666666666970E-02  0.920119114136109043E-02 -0.667964673290031966E-03
+ -0.622121226735743030E-03  0.926416149733888007E-02 -0.666666666666666970E-02
+ -0.435000220488568974E-03  0.921804320947046917E-02 -0.673963456108928050E-03
+ -0.435098154891607001E-03 -0.625438906427754982E-03  0.926833740265235045E-02
+ -0.167002100358977013E-03  0.822676745335870936E-02 -0.774836785822800982E-07
+ -0.162540956180643993E-03 -0.666666666666666970E-02 -0.563903073588569969E-09
+  0.755607876483108988E-02 -0.666666666666666970E-02 -0.162059390613440994E-03
+  0.755611424124089017E-02 -0.410936949933791987E-11 -0.164621823131188991E-03
+ -0.607895080715946992E-09  0.822719290698344964E-02 -0.666666666666666970E-02
+ -0.162064679580902008E-03  0.755721760719274029E-02 -0.421866212845835019E-11
+  0.000000000000000000E+00 -0.164615944113866006E-03 -0.622543990942232019E-09
+  0.822717713446135049E-02 -0.666666666666666970E-02 -0.435031107052827001E-03
+  0.921656310812869056E-02 -0.671471001355917964E-03  0.000000000000000000E+00
+ -0.435067278703290996E-03 -0.622858834069494032E-03  0.926582569486444040E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435031446660309002E-03
+  0.921599853918038045E-02 -0.671396777488897042E-03 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.435066939156653987E-03  0.000000000000000000E+00
+ -0.622852608399671048E-03  0.926518020755455994E-02 -0.162542334394280010E-03
+ -0.666666666666666970E-02  0.755649428756813030E-02 -0.563245912569737047E-09
+ -0.167001746375183998E-03 -0.773653708482778048E-07  0.822676733536495917E-02
+ -0.666666666666666970E-02 -0.162063551455513005E-03  0.755721718231079035E-02
+ -0.417514955388785973E-11 -0.164610567026664999E-03 -0.616225367346680050E-09
+  0.822717731694085942E-02 -0.164377985077527987E-03  0.822719270254208997E-02
+ -0.401055733513956985E-09 -0.164351625080718007E-03 -0.666666666666666970E-02
+ -0.392176583167848991E-09  0.822661805239805963E-02 -0.435027796634762003E-03
+ -0.666666666666666970E-02  0.921803071428142946E-02 -0.673143999294995021E-03
+ -0.435070588466728980E-03 -0.624488956449440998E-03  0.926745798940319977E-02
+ -0.162061399017261011E-03 -0.666666666666666970E-02  0.755719884111734960E-02
+ -0.409719349842826014E-11 -0.164621105345924009E-03 -0.605582245755702968E-09
+  0.822719290467982013E-02 -0.666666666666666970E-02 -0.162080612318650990E-03
+  0.755719632784433975E-02 -0.488742792731292979E-11 -0.164695424088829993E-03
+ -0.719668653572314038E-09  0.822716417739921026E-02 -0.166686224033829990E-03
+  0.822673892793746031E-02 -0.487607625654436989E-07 -0.166616778531177993E-03
+ -0.666666666666666970E-02 -0.385465175249295009E-07  0.822046889213087079E-02
+ -0.434813594858741027E-03 -0.666666666666666970E-02  0.920936888606463042E-02
+ -0.669875781881485994E-03 -0.435284514510405013E-03 -0.622560397625597029E-03
+  0.926509289165874939E-02 -0.666666666666666970E-02  0.927696108602655070E-02
+ -0.632627915671010992E-03 -0.671545420293352051E-03  0.921776198077606072E-02
+  0.922026582434420060E-02 -0.602408833914311001E-03 -0.666666666666666970E-02
+ -0.556545703137562042E-03  0.881922173549517033E-02 -0.163017068024577997E-03
+ -0.666666666666666970E-02  0.795594792411035041E-02 -0.273445418458471009E-11
+  0.000000000000000000E+00 -0.164269850502767994E-03 -0.212776837501969996E-09
+  0.822716867001030069E-02 -0.666666666666666970E-02 -0.435047888244456974E-03
+  0.922079054223712974E-02 -0.674652724066845955E-03 -0.435050499142537988E-03
+ -0.626174907785970051E-03  0.926933372330603945E-02 -0.317388785409780010E-03
+ -0.666666666666666970E-02  0.807555137448344972E-02 -0.161231757480141994E-03
+  0.000000000000000000E+00 -0.506658477841221994E-03 -0.311775274478785975E-03
+  0.895372834363604084E-02 -0.666666666666666970E-02 -0.163033771455105996E-03
+  0.757711880141135989E-02 -0.230783976059587012E-04  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.528851957736768035E-03
+  0.000000000000000000E+00 -0.189450184417205001E-03  0.832713861616275926E-02
+ -0.666666666666666970E-02 -0.161988418302652995E-03  0.755983344534295974E-02
+ -0.201738904538359992E-11 -0.164212922771786994E-03 -0.300297007087386020E-09
+  0.822710829050683073E-02  0.921859794089704976E-02 -0.671461758609938038E-03
+ -0.666666666666666970E-02 -0.645287728552976008E-03  0.929336166647132082E-02
+ -0.666666666666666970E-02 -0.192231196795263987E-03  0.825469366452986041E-02
+ -0.211365722510570006E-06  0.000000000000000000E+00 -0.166616012173876997E-03
+ -0.134858265397606994E-07  0.822777128965413058E-02 -0.666666666666666970E-02
+  0.902089049924758961E-02 -0.621512175798593952E-03 -0.611398199502805962E-03
+  0.924071304560030934E-02 -0.162013975003999001E-03 -0.666666666666666970E-02
+  0.758489414663056968E-02 -0.176583073381748006E-05 -0.529428370961672052E-03
+ -0.381594181184865010E-04  0.875176256452839946E-02 -0.666666666666666970E-02
+ -0.435042623232502024E-03  0.921694787103495040E-02 -0.671556689509785008E-03
+ -0.435055763946669001E-03 -0.622875549110628985E-03  0.926586538965971929E-02
+ -0.666666666666666970E-02 -0.435049192429825995E-03  0.921716878108901072E-02
+ -0.671606874278178004E-03  0.000000000000000000E+00 -0.435049194965705000E-03
+ -0.622886316419352971E-03  0.926588938457374028E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.162062558982806990E-03  0.755721781610755976E-02
+ -0.413711682785996024E-11 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.164605380148779006E-03  0.000000000000000000E+00 -0.610686395487684964E-09
+  0.822717709924931928E-02 -0.666666666666666970E-02 -0.435040485643667027E-03
+  0.921687647065858040E-02 -0.671540876851966020E-03 -0.435057901371828989E-03
+ -0.622872543909110035E-03  0.926585805994004953E-02  0.822717712936564018E-02
+ -0.606929319904127040E-09 -0.666666666666666970E-02 -0.411128314368110998E-11
+  0.755721765744662034E-02 -0.666666666666666970E-02  0.755721765744662034E-02
+ -0.411128314368110998E-11 -0.606929319768601975E-09  0.822717712936562977E-02
+ -0.666666666666666970E-02 -0.162062535498602012E-03  0.755721765388590004E-02
+ -0.413624192489096004E-11 -0.164605265022377999E-03 -0.610559256318664035E-09
+  0.822717713308667060E-02 -0.164360427767194014E-03  0.822717693070432014E-02
+ -0.404403342469463976E-09 -0.164359021906931011E-03 -0.666666666666666970E-02
+ -0.400970042731464983E-09  0.822695176085696048E-02 -0.666666666666666970E-02
+ -0.435040781993407998E-03  0.921688642847521070E-02 -0.671543134942478989E-03
+ -0.435057605047514022E-03 -0.622873024233365990E-03  0.926585913807351051E-02
+ -0.666666666666666970E-02 -0.435040480200654992E-03  0.921687629983308949E-02
+ -0.671540847971295054E-03  0.000000000000000000E+00 -0.435057906814365980E-03
+ -0.622872547165051992E-03  0.926585805272850914E-02 -0.666666666666666970E-02
+ -0.435049192316071991E-03  0.921716878067056940E-02 -0.671606877003452980E-03
+  0.000000000000000000E+00 -0.435049195079459979E-03 -0.622886319684859977E-03
+  0.926588938770844979E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.162062559106629004E-03  0.755721765243790033E-02 -0.413713440957626983E-11
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.164605382367111999E-03
+  0.000000000000000000E+00 -0.610689040449987030E-09  0.822717709614393007E-02
+ -0.162062562108060003E-03 -0.666666666666666970E-02  0.755721765207954983E-02
+ -0.413725467961124014E-11 -0.164605397579012993E-03 -0.610706551944964963E-09
+  0.822717713458392085E-02 -0.666666666666666970E-02 -0.162062001816588988E-03
+  0.755721765239816996E-02 -0.411599093120005983E-11 -0.164602607009130996E-03
+ -0.607614028771139961E-09  0.822717713005806026E-02 -0.435050784089093976E-03
+  0.926588367178806001E-02 -0.622883807627942028E-03 -0.435047603293761014E-03
+ -0.666666666666666970E-02 -0.671594832438412981E-03  0.921711542063954024E-02
+ -0.435047603293765026E-03 -0.666666666666666970E-02  0.921711542225988993E-02
+ -0.671594833533352984E-03 -0.435050784089088989E-03 -0.622883808601936025E-03
+  0.926588367352939972E-02 -0.435040480241351984E-03 -0.666666666666666970E-02
+  0.921687630121104974E-02 -0.671540848264394041E-03 -0.435057906773672024E-03
+ -0.622872547211143052E-03  0.926585805288993036E-02 -0.666666666666666970E-02
+ -0.435040480209677994E-03  0.921687629817317076E-02 -0.671540845217228006E-03
+ -0.435057906805342979E-03 -0.622872544373526951E-03  0.926585805077998967E-02
+ -0.164605397577960991E-03  0.822717713458392085E-02 -0.610706548612634985E-09
+ -0.162062562110996998E-03 -0.666666666666666970E-02 -0.413725463469866987E-11
+  0.755721765366132014E-02 -0.162062559104681993E-03 -0.666666666666666970E-02
+  0.755721765355718989E-02 -0.413713996767741984E-11 -0.164605382596239995E-03
+ -0.610689879340446991E-09  0.822717713313662023E-02 -0.666666666666666970E-02
+  0.755721765237517013E-02 -0.411128320041531016E-11 -0.606929330558691957E-09
+  0.822717712936565058E-02  0.822717692676106990E-02 -0.401916224886744012E-09
+ -0.666666666666666970E-02 -0.398503993612473003E-09  0.822695176179774959E-02
+ -0.435040782183991983E-03 -0.666666666666666970E-02  0.921688643492086027E-02
+ -0.671543136482271046E-03 -0.435057604856944999E-03 -0.622873024631908950E-03
+  0.926585913880649016E-02 -0.666666666666666970E-02 -0.162064603069654990E-03
+  0.755721765252322011E-02 -0.421568940485412024E-11 -0.164615562666129003E-03
+ -0.622111784075274967E-09  0.822717714448810943E-02 -0.435031767248445010E-03
+ -0.666666666666666970E-02  0.921658397562742995E-02 -0.671474830568459958E-03
+  0.000000000000000000E+00 -0.435066618625169982E-03 -0.622858773500244004E-03
+  0.926582672349660075E-02 -0.666666666666666970E-02 -0.435031759841376020E-03
+  0.921598447242718069E-02 -0.671353608099204964E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435066626030805982E-03
+  0.000000000000000000E+00 -0.622806883368841996E-03  0.926515741215891940E-02
+ -0.666666666666666970E-02 -0.162062535484632989E-03  0.755721765241007971E-02
+ -0.413624141658310988E-11 -0.164605264966721999E-03 -0.610559194886172028E-09
+  0.822717713298093921E-02  0.822717692676106990E-02 -0.401916224750255024E-09
+ -0.666666666666666970E-02 -0.398503993629537016E-09  0.822695176179893961E-02
+ -0.666666666666666970E-02 -0.435040782184036978E-03  0.921688643492233999E-02
+ -0.671543136482596957E-03 -0.435057604856900980E-03 -0.622873024631969015E-03
+  0.926585913880663935E-02 -0.666666666666666970E-02  0.755721765237517013E-02
+ -0.411128320041521968E-11 -0.606929330558677998E-09  0.822717712936565058E-02
+ -0.162062535484634994E-03 -0.666666666666666970E-02  0.755721765240914990E-02
+ -0.413624147093784990E-11 -0.164605264966739996E-03 -0.610559194907676036E-09
+  0.822717713298093921E-02 -0.666666666666666970E-02 -0.435040482273178026E-03
+  0.921687613607913986E-02 -0.671540613305316998E-03 -0.435057904742022978E-03
+ -0.622872309434753979E-03  0.926585781733488961E-02 -0.666666666666666970E-02
+ -0.162127387857516993E-03  0.759121301019225012E-02 -0.381918248612579987E-11
+  0.000000000000000000E+00 -0.164583146079131992E-03 -0.546169842951880971E-09
+  0.822717717071064919E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.166319269560149001E-03  0.755721776272199035E-02 -0.125468739914664006E-10
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.166854266156419006E-03
+  0.000000000000000000E+00 -0.694836199054006039E-09  0.759121369204142967E-02
+ -0.666666666666666970E-02 -0.162091523337547997E-03  0.755721898424006021E-02
+ -0.540934677579967032E-11  0.000000000000000000E+00 -0.164750533637444990E-03
+ -0.795365337200214006E-09  0.822717800588011056E-02  0.926452030343002941E-02
+ -0.622281047419194033E-03 -0.666666666666666970E-02 -0.668721216430620004E-03
+  0.920448407564962955E-02 -0.666666666666666970E-02  0.920291934243665927E-02
+ -0.668399806909403954E-03 -0.622140086487220031E-03  0.926273191293569033E-02
+ -0.666666666666666970E-02 -0.435042276336174984E-03  0.921682348368780928E-02
+ -0.671431373639223947E-03 -0.435056110819545018E-03 -0.622756694074473051E-03
+  0.926574695221287978E-02 -0.164604656270789013E-03  0.822717725536971020E-02
+ -0.609640574535708954E-09 -0.162062479874466994E-03 -0.666666666666666970E-02
+ -0.413006085521291964E-11  0.755726759055258979E-02 -0.666666666666666970E-02
+ -0.162006162226876999E-03  0.755727139187646006E-02 -0.248123275689687997E-11
+ -0.164398193105456003E-03 -0.370429927246309000E-09  0.822722962912524058E-02
+ -0.666666666666666970E-02 -0.162064599546255997E-03  0.755721765602699984E-02
+ -0.421555276492109033E-11  0.000000000000000000E+00 -0.164615545095072007E-03
+ -0.622091901725074988E-09  0.822717714627387020E-02 -0.666666666666666970E-02
+ -0.435031818763029980E-03  0.921658104872762930E-02 -0.671470214801774966E-03
+  0.000000000000000000E+00 -0.435066567119568007E-03 -0.622854034507566018E-03
+  0.926582206155607963E-02 -0.666666666666666970E-02 -0.435031752461292008E-03
+  0.921598570305219057E-02 -0.671353593598220024E-03  0.000000000000000000E+00
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435066633409600010E-03
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.622806719992244047E-03  0.926515905834469043E-02 -0.162062393641909007E-03
+ -0.666666666666666970E-02  0.755722114123722009E-02 -0.413058279052571960E-11
+ -0.164604687177714008E-03 -0.609739841581688048E-09  0.822717725546858077E-02
+ -0.666666666666666970E-02 -0.162060731985976013E-03  0.755721979203457985E-02
+ -0.406827649258389981E-11 -0.164598252833624004E-03 -0.600736958280085992E-09
+  0.822717859850459997E-02 -0.164089877345694000E-03  0.822722950591394005E-02
+ -0.245723983761991010E-09 -0.164740360809531990E-03 -0.666666666666666970E-02
+ -0.253032242462855987E-09  0.822760867483479999E-02 -0.435063218163868012E-03
+ -0.666666666666666970E-02  0.926625279535238031E-02 -0.623222898203098044E-03
+ -0.435035168246062981E-03 -0.671596115313023018E-03  0.921718087666609925E-02
+ -0.162006191893605003E-03 -0.666666666666666970E-02  0.755728897650767041E-02
+ -0.248111433259074991E-11 -0.164398182179902004E-03 -0.370407208686471976E-09
+  0.822722962910260938E-02 -0.666666666666666970E-02 -0.162062408593543990E-03
+  0.755721778752635031E-02 -0.413141811987293010E-11 -0.164604734980651994E-03
+ -0.609861061658230019E-09  0.822717721123178945E-02 -0.164359758086323008E-03
+  0.822717705188595952E-02 -0.403734889040530024E-09 -0.164358443027807014E-03
+ -0.666666666666666970E-02 -0.400953602691749013E-09  0.822699461577017926E-02
+ -0.435042369262157974E-03 -0.666666666666666970E-02  0.921678605589727924E-02
+ -0.671388823848062012E-03 -0.435056017899960990E-03 -0.622715238128926981E-03
+  0.926570506586136926E-02 -0.666666666666666970E-02  0.897119442339463016E-02
+ -0.606814776206859971E-03 -0.607757345455318975E-03  0.923427126754335081E-02
+  0.921865858631707932E-02 -0.671454241685535990E-03 -0.666666666666666970E-02
+ -0.646212508082314027E-03  0.929466349621911950E-02 -0.192798550395747987E-03
+ -0.666666666666666970E-02  0.825745095197302943E-02 -0.932632147974648003E-06
+  0.000000000000000000E+00 -0.167240534661016013E-03 -0.225014932097518994E-07
+  0.822685297960725967E-02 -0.666666666666666970E-02 -0.435853333045042000E-03
+  0.921982672920109922E-02 -0.576029289608052963E-03 -0.434241817042410974E-03
+ -0.604491260295870981E-03  0.915701687929209966E-02 -0.413213785007987978E-03
+ -0.666666666666666970E-02  0.877723644056935953E-02 -0.541755128362727992E-03
+  0.000000000000000000E+00 -0.454683425554934008E-03 -0.590990666065788033E-03
+  0.920776909864309996E-02 -0.666666666666666970E-02 -0.163098438357499011E-03
+  0.763192860819197037E-02 -0.471912218750803974E-04  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.530165770547245995E-03
+  0.000000000000000000E+00 -0.266001580375158011E-03  0.861802239077489977E-02
+ -0.666666666666666970E-02 -0.162004212070115012E-03  0.757730725874160033E-02
+ -0.176948864124387004E-05 -0.529438965863178955E-03 -0.405266342026753998E-04
+  0.875391878258711953E-02  0.921979766533967057E-02 -0.602143233936805987E-03
+ -0.666666666666666970E-02 -0.555336769390312010E-03  0.881585236637339939E-02
+ -0.666666666666666970E-02 -0.163010868918803012E-03  0.795402864518519014E-02
+ -0.273140611218113010E-11  0.000000000000000000E+00 -0.164270868404786012E-03
+ -0.213417891562774994E-09  0.822716826171294990E-02 -0.666666666666666970E-02
+  0.927693305629184034E-02 -0.632604555455441018E-03 -0.671545581847466972E-03
+  0.921776054189130979E-02 -0.161978982117798992E-03 -0.666666666666666970E-02
+  0.755685740523827972E-02 -0.195386055106566009E-11 -0.164191968508161013E-03
+ -0.291698868024501998E-09  0.822713227244221967E-02 -0.666666666666666970E-02
+ -0.435040480198179975E-03  0.921687629980929950E-02 -0.671540848019947000E-03
+ -0.435057906816840021E-03 -0.622872547226495030E-03  0.926585805278096024E-02
+ -0.666666666666666970E-02 -0.435049192309293017E-03  0.921716878045670054E-02
+ -0.671606876968350959E-03  0.000000000000000000E+00 -0.435049195086237978E-03
+ -0.622886319690002022E-03  0.926588938769827043E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.162062559109026988E-03  0.755721765240835973E-02
+ -0.413713441548758037E-11 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.164605382377888996E-03  0.000000000000000000E+00 -0.610689049049815970E-09
+  0.822717709592826925E-02 -0.666666666666666970E-02 -0.435040480192435005E-03
+  0.921687628880354937E-02 -0.671540836694003019E-03 -0.435057906822584992E-03
+ -0.622872536396895009E-03  0.926585804147755931E-02  0.822717712936564018E-02
+ -0.606929320743386971E-09 -0.666666666666666970E-02 -0.411128314894905010E-11
+  0.755721765697572965E-02 -0.666666666666666970E-02  0.755721765697572965E-02
+ -0.411128314894905010E-11 -0.606929320770492025E-09  0.822717712936564018E-02
+ -0.666666666666666970E-02 -0.162062535444674989E-03  0.755721765241194020E-02
+ -0.413623989629973001E-11 -0.164605264767690994E-03 -0.610558973788768004E-09
+  0.822717713298089064E-02 -0.164360410611332999E-03  0.822717692931345009E-02
+ -0.404391742677632997E-09 -0.164359006864496997E-03 -0.666666666666666970E-02
+ -0.400958843987352986E-09  0.822695177799862010E-02 -0.666666666666666970E-02
+ -0.435040782694775998E-03  0.921688645197418083E-02 -0.671543140259736040E-03
+ -0.435057604346205003E-03 -0.622873025349945041E-03  0.926585914054997052E-02
+ -0.666666666666666970E-02 -0.435040480198194016E-03  0.921687629980919021E-02
+ -0.671540848019547038E-03  0.000000000000000000E+00 -0.435057906816825981E-03
+ -0.622872547226044002E-03  0.926585805278038951E-02 -0.666666666666666970E-02
+ -0.435049192309309985E-03  0.921716878047955032E-02 -0.671606876991275004E-03
+  0.000000000000000000E+00 -0.435049195086221010E-03 -0.622886319711830048E-03
+  0.926588938772157991E-02 -0.666666666666666970E-02 -0.162062559108390995E-03
+  0.755721765240854968E-02 -0.413713449972606017E-11  0.000000000000000000E+00
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.164605382374721012E-03
+  0.000000000000000000E+00 -0.610689045533098955E-09  0.822717709592844966E-02
+ -0.162062558941643005E-03 -0.666666666666666970E-02  0.755721765259115014E-02
+ -0.413713398632515962E-11 -0.164605381794141997E-03 -0.610688990283018023E-09
+  0.822717713318714926E-02 -0.666666666666666970E-02 -0.162062001821220998E-03
+  0.755721765238152009E-02 -0.411599110620314974E-11 -0.164602607032291994E-03
+ -0.607614054224395964E-09  0.822717713004737956E-02 -0.435050784143862991E-03
+  0.926588367156189023E-02 -0.622883807550479035E-03 -0.435047603238991024E-03
+ -0.666666666666666970E-02 -0.671594832028562985E-03  0.921711541877510976E-02
+ -0.435047603238989018E-03 -0.666666666666666970E-02  0.921711541842836976E-02
+ -0.671594831671507020E-03 -0.435050784143864997E-03 -0.622883807208665977E-03
+  0.926588367119991069E-02 -0.435040480204769973E-03 -0.666666666666666970E-02
+  0.921687629995552975E-02 -0.671540847991965043E-03 -0.435057906810250024E-03
+ -0.622872547162217020E-03  0.926585805272642921E-02 -0.666666666666666970E-02
+ -0.435040480200663991E-03  0.921687628816156984E-02 -0.671540835793514026E-03
+ -0.435057906814356981E-03 -0.622872535485858030E-03  0.926585804054953083E-02
+ -0.164605381790940998E-03  0.822717713318714058E-02 -0.610688980014658047E-09
+ -0.162062558950573009E-03 -0.666666666666666970E-02 -0.413713385087310001E-11
+  0.755721765740006036E-02 -0.162062559115579987E-03 -0.666666666666666970E-02
+  0.755721765739483971E-02 -0.413714011819061037E-11 -0.164605382612299008E-03
+ -0.610689891393699981E-09  0.822717713311063928E-02 -0.666666666666666970E-02
+  0.755721765237515972E-02 -0.411128320041545960E-11 -0.606929330558715014E-09
+  0.822717712936565058E-02  0.822717692676106990E-02 -0.401916224751758011E-09
+ -0.666666666666666970E-02 -0.398503993602972989E-09  0.822695176179707999E-02
+ -0.435040782183968022E-03 -0.666666666666666970E-02  0.921688643492003974E-02
+ -0.671543136482089984E-03 -0.435057604856969990E-03 -0.622873024631875991E-03
+  0.926585913880641036E-02 -0.666666666666666970E-02 -0.162064603069938997E-03
+  0.755721765252195983E-02 -0.421568947004117974E-11 -0.164615562667550012E-03
+ -0.622111785671113991E-09  0.822717714448712063E-02 -0.435031767249090978E-03
+ -0.666666666666666970E-02  0.921658397565005075E-02 -0.671474830574493001E-03
+  0.000000000000000000E+00 -0.435066618624524015E-03 -0.622858773502375004E-03
+  0.926582672349991927E-02 -0.666666666666666970E-02 -0.435031759837511002E-03
+  0.921598447232290993E-02 -0.671353608077845964E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435066626034670025E-03
+  0.000000000000000000E+00 -0.622806883367964985E-03  0.926515741217292035E-02
+ -0.666666666666666970E-02 -0.162062535484635997E-03  0.755721765240869019E-02
+ -0.413624141680835988E-11 -0.164605264966749998E-03 -0.610559194919630036E-09
+  0.822717713298093921E-02  0.822717692676106990E-02 -0.401916224778863996E-09
+ -0.666666666666666970E-02 -0.398503993603033011E-09  0.822695176179709040E-02
+ -0.666666666666666970E-02 -0.435040782183968022E-03  0.921688643492003974E-02
+ -0.671543136482092044E-03 -0.435057604856969990E-03 -0.622873024631875991E-03
+  0.926585913880641036E-02 -0.666666666666666970E-02  0.755721765237515972E-02
+ -0.411128320041545960E-11 -0.606929330558716048E-09  0.822717712936565058E-02
+ -0.162062535484635997E-03 -0.666666666666666970E-02  0.755721765240869019E-02
+ -0.413624144391367996E-11 -0.164605264966749998E-03 -0.610559194919673050E-09
+  0.822717713298093921E-02 -0.666666666666666970E-02 -0.162064603069936991E-03
+  0.755721765251413016E-02 -0.421568947063007983E-11 -0.164615562667617992E-03
+ -0.622111785760815008E-09  0.822717714448719002E-02 -0.666666666666666970E-02
+ -0.435031767248966023E-03  0.921658397564588047E-02 -0.671474830573548986E-03
+  0.000000000000000000E+00 -0.435066618624647993E-03 -0.622858773502177028E-03
+  0.926582672349948039E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.435031759837441992E-03  0.921598447233066935E-02 -0.671353608091401961E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435066626034738980E-03
+  0.000000000000000000E+00 -0.622806883381814042E-03  0.926515741218287940E-02
+ -0.666666666666666970E-02 -0.162062001821317004E-03  0.755721765238150968E-02
+ -0.411599108277345033E-11 -0.164602607032771997E-03 -0.607614054759129973E-09
+  0.822717713004762069E-02  0.926588367541716050E-02 -0.622883811249623975E-03
+ -0.666666666666666970E-02 -0.671594835878273032E-03  0.921711542243204042E-02
+ -0.666666666666666970E-02  0.921711542243204042E-02 -0.671594835878273032E-03
+ -0.622883811249622999E-03  0.926588367541716050E-02 -0.666666666666666970E-02
+ -0.162062535450388002E-03  0.755721765240867979E-02 -0.413624016808590973E-11
+ -0.164605264796174991E-03 -0.610559005432624996E-09  0.822717713298075012E-02
+ -0.164360411311726003E-03  0.822717692923711011E-02 -0.404392344956488014E-09
+ -0.164359007643746004E-03 -0.666666666666666970E-02 -0.400959412732088008E-09
+  0.822695177596773942E-02 -0.666666666666666970E-02 -0.435040782621792983E-03
+  0.921688644954029961E-02 -0.671543139722759002E-03 -0.435057604419182000E-03
+ -0.622873025249980950E-03  0.926585914030375081E-02 -0.666666666666666970E-02
+ -0.435040480198179975E-03  0.921687629980933072E-02 -0.671540848019966949E-03
+  0.000000000000000000E+00 -0.435057906816840021E-03 -0.622872547226514003E-03
+  0.926585805278100014E-02 -0.666666666666666970E-02 -0.435049192309251980E-03
+  0.921716878047779999E-02 -0.671606876990949960E-03  0.000000000000000000E+00
+ -0.435049195086279015E-03 -0.622886319711839047E-03  0.926588938772160073E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.162062559109029997E-03
+  0.755721765240835019E-02 -0.413713457806971004E-11 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.164605382377896992E-03  0.000000000000000000E+00
+ -0.610689049042042024E-09  0.822717709592721974E-02 -0.162062559106823997E-03
+ -0.666666666666666970E-02  0.755721765240988975E-02 -0.413714016632381966E-11
+ -0.164605382618112988E-03 -0.610689904846968049E-09  0.822717713311111980E-02
+ -0.666666666666666970E-02 -0.162062001821342998E-03  0.755721765238148973E-02
+ -0.411599100244394014E-11 -0.164602607032901993E-03 -0.607614054902494021E-09
+  0.822717713004762069E-02 -0.435050784145525995E-03  0.926588367157234020E-02
+ -0.622883807563274030E-03 -0.435047603237328020E-03 -0.666666666666666970E-02
+ -0.671594832032081004E-03  0.921711541873500989E-02 -0.435047603237328020E-03
+ -0.666666666666666970E-02  0.921711541879069972E-02 -0.671594832090159004E-03
+ -0.435050784145525995E-03 -0.622883807618967002E-03  0.926588367163041007E-02
+ -0.435040480198147991E-03 -0.666666666666666970E-02  0.921687629974903000E-02
+ -0.671540847957907000E-03 -0.435057906816872005E-03 -0.622872547167174968E-03
+  0.926585805271907051E-02 -0.666666666666666970E-02 -0.435040480192394998E-03
+  0.921687628872891983E-02 -0.671540836617235976E-03 -0.435057906822624999E-03
+ -0.622872536323500051E-03  0.926585804140095046E-02 -0.164605382615034992E-03
+  0.822717713311110939E-02 -0.610689894920461967E-09 -0.162062559115410987E-03
+ -0.666666666666666970E-02 -0.413714014137465006E-11  0.755721765703380993E-02
+ -0.162062559115412993E-03 -0.666666666666666970E-02  0.755721765703380993E-02
+ -0.413714014146466007E-11 -0.164605382615047000E-03 -0.610689894960657982E-09
+  0.822717713311110939E-02 -0.666666666666666970E-02  0.921687630349792009E-02
+ -0.671540851815856984E-03 -0.622872550856052947E-03  0.926585805656933957E-02
+  0.926585809685315925E-02 -0.622872568564181964E-03 -0.666666666666666970E-02
+ -0.671540936702172951E-03  0.921687667944103006E-02 -0.164358960784040002E-03
+ -0.666666666666666970E-02  0.822694398364712948E-02 -0.400847403514244019E-09
+  0.000000000000000000E+00 -0.164360413019343991E-03 -0.404398689239843015E-09
+  0.822717692924298041E-02 -0.666666666666666970E-02 -0.162064603069942006E-03
+  0.755721765251538002E-02 -0.421568947070519009E-11 -0.164615562667628997E-03
+ -0.622111785771050024E-09  0.822717714448712931E-02 -0.435031767249002019E-03
+ -0.666666666666666970E-02  0.921658397564648069E-02 -0.671474830573193042E-03
+  0.000000000000000000E+00 -0.435066618624612973E-03 -0.622858773501632000E-03
+  0.926582672349898079E-02 -0.666666666666666970E-02 -0.435031759837399004E-03
+  0.921598447231553042E-02 -0.671353608075763971E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435066626034782023E-03
+  0.000000000000000000E+00 -0.622806883366930982E-03  0.926515741216851069E-02
+ -0.666666666666666970E-02 -0.162062558230456009E-03  0.755721765240984032E-02
+ -0.413710687281216962E-11 -0.164605378253319007E-03 -0.610685055063975992E-09
+  0.822717713310629033E-02  0.926585809685315925E-02 -0.622872568564181964E-03
+ -0.666666666666666970E-02 -0.671540936702172951E-03  0.921687667944103006E-02
+ -0.666666666666666970E-02 -0.164358960784040002E-03  0.822694398364712948E-02
+ -0.400847403514244019E-09  0.000000000000000000E+00 -0.164360413019343991E-03
+ -0.404398689239843015E-09  0.822717692924298041E-02 -0.666666666666666970E-02
+  0.921687630349792009E-02 -0.671540851815856984E-03 -0.622872550856052947E-03
+  0.926585805656933957E-02 -0.162062558230456009E-03 -0.666666666666666970E-02
+  0.755721765240984032E-02 -0.413710687281216962E-11  0.000000000000000000E+00
+ -0.164605378253319007E-03 -0.610685055063975992E-09  0.822717713310629033E-02
+ -0.666666666666666970E-02 -0.162064604275187002E-03  0.755721677996594975E-02
+ -0.421580947944237963E-11 -0.164615577356503002E-03 -0.622129690587314019E-09
+  0.822717714566923060E-02 -0.666666666666666970E-02 -0.435031753444996012E-03
+  0.921658351635084964E-02 -0.671474729715682982E-03  0.000000000000000000E+00
+ -0.435066632426207010E-03 -0.622858755231564033E-03  0.926582667773451045E-02
+ -0.666666666666666970E-02 -0.435031736935309978E-03  0.921598323977216964E-02
+ -0.671353347083358007E-03  0.000000000000000000E+00 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.435066648932866005E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.622806813899395959E-03  0.926515680964767017E-02
+ -0.666666666666666970E-02 -0.162062535483415999E-03  0.755721764896765975E-02
+ -0.413624165743050033E-11 -0.164605264995012004E-03 -0.610559231671319039E-09
+  0.822717713301475070E-02  0.822717692676106990E-02 -0.401916226534076012E-09
+ -0.666666666666666970E-02 -0.398503963674476984E-09  0.822695175970213945E-02
+ -0.666666666666666970E-02  0.921688643582909035E-02 -0.671543139638232974E-03
+ -0.622873028116618000E-03  0.926585914228755976E-02 -0.666666666666666970E-02
+ -0.162062558230619994E-03  0.755721765087292996E-02 -0.413710692525393008E-11
+ -0.164605378269444997E-03 -0.610685075298003985E-09  0.822717713311651999E-02
+ -0.435057895657227020E-03  0.926585809286093083E-02 -0.622872564813008981E-03
+ -0.435040491358768000E-03 -0.666666666666666970E-02 -0.671540932535798999E-03
+  0.921687667431765958E-02 -0.666666666666666970E-02 -0.164358960778226998E-03
+  0.822694398251594058E-02 -0.400847388083247026E-09 -0.164360413020582990E-03
+ -0.404398690916013012E-09  0.822717692924319031E-02 -0.666666666666666970E-02
+ -0.435040480197697993E-03  0.921687629888584027E-02 -0.671540847069607041E-03
+  0.000000000000000000E+00 -0.435057906817322003E-03 -0.622872546317801987E-03
+  0.926585805183252967E-02 -0.666666666666666970E-02 -0.162127546311873996E-03
+  0.759121249955739975E-02 -0.382423929799540036E-11  0.000000000000000000E+00
+ -0.164583726855931989E-03 -0.546878175870651968E-09  0.822717706955712960E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.166319285307283012E-03
+  0.755721766126241958E-02 -0.125511383854575001E-10 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.166854277196956007E-03  0.000000000000000000E+00
+ -0.695070274241472012E-09  0.759121319033947018E-02 -0.435040480198167995E-03
+ -0.666666666666666970E-02  0.921687629975386988E-02 -0.671540847960958053E-03
+ -0.435057906816852002E-03 -0.622872547169800038E-03  0.926585805272362069E-02
+ -0.666666666666666970E-02 -0.435040480192397004E-03  0.921687628872910024E-02
+ -0.671540836617267960E-03 -0.435057906822622017E-03 -0.622872536323503955E-03
+  0.926585804140104934E-02 -0.164605382615351010E-03  0.822717713311131062E-02
+ -0.610689895410060039E-09 -0.162062559115474006E-03 -0.666666666666666970E-02
+ -0.413714014380830024E-11  0.755721765703380993E-02 -0.162062559115409008E-03
+ -0.666666666666666970E-02  0.755721765703380038E-02 -0.413714008712550966E-11
+ -0.164605382615029002E-03 -0.610689894941937954E-09  0.822717713311117010E-02
+ -0.162062559106887993E-03 -0.666666666666666970E-02  0.755721765240996001E-02
+ -0.413714022296702998E-11 -0.164605382618428003E-03 -0.610689905200928972E-09
+  0.822717713311131929E-02 -0.666666666666666970E-02 -0.162062001821339013E-03
+  0.755721765238150968E-02 -0.411599102940498021E-11 -0.164602607032881989E-03
+ -0.607614054881568006E-09  0.822717713004766059E-02 -0.435050784145494011E-03
+  0.926588367157693028E-02 -0.622883807565917966E-03 -0.435047603237360004E-03
+ -0.666666666666666970E-02 -0.671594832035220962E-03  0.921711541874025049E-02
+ -0.435047603237360004E-03 -0.666666666666666970E-02  0.921711541879170933E-02
+ -0.671594832090262003E-03 -0.435050784145494011E-03 -0.622883807618874953E-03
+  0.926588367163047079E-02 -0.666666666666666970E-02  0.921687630349792009E-02
+ -0.671540851815856984E-03 -0.622872550856052947E-03  0.926585805656933957E-02
+  0.822717712936564018E-02 -0.606929320872887989E-09 -0.666666666666666970E-02
+ -0.411128314877483972E-11  0.755721765699129966E-02 -0.162062559115238003E-03
+ -0.666666666666666970E-02  0.755721765701823991E-02 -0.413714005475985968E-11
+ -0.164605382614328987E-03 -0.610689894184682021E-09  0.822717713311110939E-02
+ -0.666666666666666970E-02 -0.435040480197263987E-03  0.921687629887128941E-02
+ -0.671540847066263036E-03 -0.435057906817756009E-03 -0.622872546317053020E-03
+  0.926585805183098923E-02 -0.162127546311872993E-03 -0.666666666666666970E-02
+  0.759121249955737026E-02 -0.382423935215729017E-11  0.000000000000000000E+00
+ -0.164583726855924996E-03 -0.546878175863874980E-09  0.822717706955712960E-02
+ -0.666666666666666970E-02 -0.166319285307283988E-03  0.755721766126244039E-02
+ -0.125511383583243006E-10  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.166854277196956007E-03  0.000000000000000000E+00
+ -0.695070274239683027E-09  0.759121319033943982E-02 -0.666666666666666970E-02
+ -0.162062001821339989E-03  0.755721765238148973E-02 -0.411599111074019016E-11
+ -0.164602607032884998E-03 -0.607614054884462001E-09  0.822717713004762069E-02
+  0.926588367541610926E-02 -0.622883811249163948E-03 -0.666666666666666970E-02
+ -0.671594835876070041E-03  0.921711542242228087E-02 -0.666666666666666970E-02
+ -0.435047603237369979E-03  0.921711541881043914E-02 -0.671594832109593978E-03
+ -0.435050784145483982E-03 -0.622883807637366021E-03  0.926588367164967938E-02
+ -0.666666666666666970E-02  0.755721765237515972E-02 -0.411128320041545960E-11
+ -0.606929330558716048E-09  0.822717712936565058E-02 -0.435040480192415977E-03
+ -0.666666666666666970E-02  0.921687628876629965E-02 -0.671540836655632996E-03
+ -0.435057906822602990E-03 -0.622872536360199970E-03  0.926585804143926009E-02
+ -0.666666666666666970E-02 -0.162816160491764011E-03  0.755629729934281964E-02
+ -0.145267310755072998E-07 -0.168362009615544009E-03 -0.190258328993087994E-05
+  0.822792910352069043E-02 -0.666666666666666970E-02 -0.434980287163910990E-03
+  0.921591582112040938E-02 -0.672220540750253994E-03  0.000000000000000000E+00
+ -0.435118076441907997E-03 -0.623871210047872993E-03  0.926674480647901068E-02
+ -0.666666666666666970E-02 -0.391663109199982980E-03  0.857592364724167958E-02
+ -0.270377370428809022E-03  0.000000000000000000E+00 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.470345934721972021E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.290023568862184014E-03  0.893922648671382006E-02
+ -0.666666666666666970E-02 -0.317994853278870024E-03  0.807402136254166923E-02
+ -0.163000017519030990E-03 -0.506466269753455046E-03 -0.315285081402617016E-03
+  0.895741238458851001E-02  0.822721524635505914E-02 -0.564092130791223009E-09
+ -0.666666666666666970E-02 -0.389036333446409034E-11  0.758050134032871043E-02
+ -0.666666666666666970E-02  0.758050134032871043E-02 -0.389036336156913981E-11
+ -0.564092130737013004E-09  0.822721524635505914E-02 -0.666666666666666970E-02
+ -0.162062221237022010E-03  0.755675930893081986E-02 -0.416186344820719000E-11
+ -0.164608243083756012E-03 -0.614516634575204050E-09  0.822717669815483006E-02
+ -0.435064268078357019E-03  0.926583720685853970E-02 -0.622864535863296978E-03
+ -0.435034118178185007E-03 -0.666666666666666970E-02 -0.671494763562757962E-03
+  0.921666478489410919E-02 -0.666666666666666970E-02 -0.435034118314003989E-03
+  0.921666821285587953E-02 -0.671498187612958996E-03 -0.435064267942557987E-03
+ -0.622867800074395988E-03  0.926584079408090955E-02 -0.666666666666666970E-02
+ -0.162064603060627001E-03  0.755721765249593985E-02 -0.421568916772846982E-11
+  0.000000000000000000E+00 -0.164615562621545010E-03 -0.622111733862203048E-09
+  0.822717714450407062E-02 -0.666666666666666970E-02 -0.435031767219332013E-03
+  0.921658395702201066E-02 -0.671474812089471003E-03  0.000000000000000000E+00
+ -0.435066618654277992E-03 -0.622858755959619019E-03  0.926582670497335011E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435031759978188998E-03
+  0.921598448078022985E-02 -0.671353613199229967E-03 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.435066625894017019E-03  0.000000000000000000E+00
+ -0.622806887499908037E-03  0.926515741657525996E-02 -0.435040451653885985E-03
+ -0.666666666666666970E-02  0.921687738713831972E-02 -0.671542772385395025E-03
+ -0.435057935358636985E-03 -0.622874555977533041E-03  0.926586008373840933E-02
+ -0.666666666666666970E-02 -0.162062556280765003E-03  0.755721766011261970E-02
+ -0.413703259232341988E-11 -0.164605370112559998E-03 -0.610674307777927029E-09
+  0.822717713673958018E-02 -0.435057871981406005E-03  0.926586389913170057E-02
+ -0.622877920215035986E-03 -0.435040515036651005E-03 -0.666666666666666970E-02
+ -0.671546676151164976E-03  0.921688293990952957E-02 -0.164358975070530991E-03
+ -0.666666666666666970E-02  0.822694462180378026E-02 -0.400864471382896996E-09
+ -0.164360425623554987E-03 -0.404406184767255019E-09  0.822717693084428978E-02
+ -0.435040521728764014E-03 -0.666666666666666970E-02  0.921688316448704067E-02
+ -0.671546726860146044E-03 -0.435057865289874995E-03 -0.622877930792390950E-03
+  0.926586392317507029E-02 -0.666666666666666970E-02 -0.162062560143134006E-03
+  0.755721765072739013E-02 -0.413717992975032974E-11 -0.164605387803321994E-03
+ -0.610695685352230028E-09  0.822717713417366048E-02 -0.435057920486110002E-03
+  0.926586013716025961E-02 -0.622874579485410002E-03 -0.435040466527716017E-03
+ -0.666666666666666970E-02 -0.671542885086807029E-03  0.921687788626173940E-02
+ -0.164358940726689012E-03 -0.666666666666666970E-02  0.822694332169499921E-02
+ -0.400825748326332007E-09 -0.164360402303031009E-03 -0.404387092386443002E-09
+  0.822717693284892929E-02 -0.666666666666666970E-02  0.755853528834142963E-02
+ -0.410907237024926035E-11 -0.609517161481544010E-09  0.822725835180459011E-02
+  0.822699835769964000E-02 -0.229769146389950011E-09 -0.666666666666666970E-02
+ -0.552503881834807986E-11  0.815277410257971014E-02 -0.431800076739472991E-03
+ -0.666666666666666970E-02  0.911683250971202959E-02 -0.645107775821699976E-03
+ -0.438245882118646008E-03 -0.614761191522313001E-03  0.925173111512469018E-02
+ -0.666666666666666970E-02 -0.162066807231452001E-03  0.755721537354540028E-02
+ -0.430232887169997028E-11 -0.164626562654960999E-03 -0.634707558119433995E-09
+  0.822717705664591063E-02 -0.435011184592201997E-03 -0.666666666666666970E-02
+  0.921541718621200941E-02 -0.670832821851847973E-03  0.000000000000000000E+00
+ -0.435087195563132003E-03 -0.622361407616723998E-03  0.926525297162917975E-02
+ -0.666666666666666970E-02 -0.435024023581949979E-03  0.921463310923579040E-02
+ -0.670757573524478033E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.435074360637873977E-03  0.000000000000000000E+00
+ -0.622355873608441013E-03  0.926393658622583042E-02 -0.666666666666666970E-02
+ -0.162274431703270989E-03  0.755738308036761966E-02 -0.318241338988657979E-10
+ -0.165730950021739988E-03 -0.457847648587293988E-08  0.822724281267098005E-02
+  0.822699408685815918E-02 -0.228425571392821005E-09 -0.666666666666666970E-02
+ -0.569261988092565005E-11  0.815841787771750919E-02 -0.666666666666666970E-02
+ -0.432079853778008993E-03  0.912452759981858923E-02 -0.647134253481845053E-03
+ -0.437974705900792997E-03 -0.615294574958452005E-03  0.925273449425225922E-02
+ -0.666666666666666970E-02  0.755849436233489019E-02 -0.410955652784976012E-11
+ -0.609618023537668031E-09  0.822725856237870035E-02 -0.162292777295442987E-03
+ -0.666666666666666970E-02  0.755742123690941042E-02 -0.382880604873281030E-10
+ -0.165829805992468013E-03 -0.549637808756437989E-08  0.822725133924324080E-02
+ -0.666666666666666970E-02 -0.162519832090693000E-03  0.758487998779587991E-02
+ -0.277297390858832013E-04 -0.531411093209674000E-03 -0.271306402284292975E-03
+  0.895289512037131946E-02 -0.666666666666666970E-02 -0.435144737424841995E-03
+  0.927049381409235027E-02 -0.627195150452744977E-03  0.000000000000000000E+00
+ -0.434953604275157982E-03 -0.673646028178072039E-03  0.921927160555094036E-02
+ -0.666666666666666970E-02 -0.162365832331587992E-03  0.770469465208874042E-02
+ -0.341374299192897006E-05  0.000000000000000000E+00 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.530093823283070008E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.670683422235064935E-04  0.878401693892019025E-02
+ -0.666666666666666970E-02 -0.406469352273000986E-03  0.868845376516133062E-02
+ -0.487445861240936023E-03 -0.459955486486156979E-03 -0.561455535102076998E-03
+  0.917876821200271954E-02  0.822724019224538999E-02 -0.586236000132447954E-09
+ -0.666666666666666970E-02 -0.399393158957938003E-11  0.756942275725619972E-02
+ -0.666666666666666970E-02  0.756938783599863958E-02 -0.398490214171924003E-11
+ -0.582276440781839991E-09  0.822717710481313952E-02 -0.666666666666666970E-02
+ -0.162064188320470992E-03  0.755638508915526964E-02 -0.426955665163783005E-11
+ -0.164621776680132990E-03 -0.630372622134120969E-09  0.822717559459638058E-02
+ -0.435097665935211988E-03  0.926770146162825964E-02 -0.624846566023037008E-03
+ -0.435000709682981980E-03 -0.666666666666666970E-02 -0.673347124029987048E-03
+  0.921745137510697997E-02 -0.666666666666666970E-02 -0.164221437098695006E-03
+  0.822590307436798021E-02 -0.301540515380110992E-09 -0.164276463597796989E-03
+ -0.317267646597123000E-09  0.822720896113241992E-02 -0.666666666666666970E-02
+ -0.435040484785290994E-03  0.921687621618138067E-02 -0.671540621958418996E-03
+ -0.435057902230129995E-03 -0.622872302682172954E-03  0.926585782249826975E-02
+ -0.666666666666666970E-02 -0.162127341684405001E-03  0.759121275229191969E-02
+ -0.381774206802719039E-11  0.000000000000000000E+00 -0.164583013632061998E-03
+ -0.545969305880644018E-09  0.822717722380156974E-02 -0.666666666666666970E-02
+ -0.166319316569640000E-03  0.755721776450153995E-02 -0.125637203248046992E-10
+  0.000000000000000000E+00 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.166854313971363013E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.695766372139392979E-09  0.759121342784058965E-02 -0.435007341079486002E-03
+ -0.666666666666666970E-02  0.921767423501631059E-02 -0.673397510119870990E-03
+ -0.435091037539934999E-03 -0.624857046910582011E-03  0.926772529095947969E-02
+ -0.666666666666666970E-02 -0.435042780427758998E-03  0.921682237061272926E-02
+ -0.671414330503569993E-03 -0.435055606761638005E-03 -0.622737221386400041E-03
+  0.926573013279861045E-02 -0.164512437178424009E-03  0.822720912015354085E-02
+ -0.478141451603002007E-09 -0.162035132712254993E-03 -0.666666666666666970E-02
+ -0.321987108658194010E-11  0.755727832217645019E-02 -0.162065357686449010E-03
+ -0.666666666666666970E-02  0.755727582940845036E-02 -0.423979678728262026E-11
+ -0.164618725864026005E-03 -0.625584794059249953E-09  0.822717479197863921E-02
+ -0.162035093479698011E-03 -0.666666666666666970E-02  0.755725620859342036E-02
+ -0.322006454574526012E-11 -0.164512451389200987E-03 -0.478178423362356003E-09
+  0.822720912019037076E-02 -0.666666666666666970E-02 -0.162063731980670002E-03
+  0.755721533834740992E-02 -0.418199544479783967E-11 -0.164611237647310007E-03
+ -0.617212851806125045E-09  0.822717573520461917E-02 -0.435072205240186985E-03
+  0.926779361315555050E-02 -0.624886867600672031E-03 -0.435026179501545984E-03
+ -0.666666666666666970E-02 -0.673540811812492046E-03  0.921830845677282934E-02
+ -0.435026047900845974E-03 -0.666666666666666970E-02  0.921631021319617959E-02
+ -0.671342130464456957E-03 -0.435072336809864014E-03 -0.622763518227145035E-03
+  0.926572086598697990E-02 -0.666666666666666970E-02  0.755741246850622026E-02
+ -0.410910644194978984E-11 -0.606515170668247041E-09  0.822717712895315936E-02
+  0.822711877443555936E-02 -0.222069534738638002E-09 -0.666666666666666970E-02
+ -0.774169806703297047E-11  0.819168186272309015E-02 -0.433633789741900023E-03
+ -0.666666666666666970E-02  0.917151071300196033E-02 -0.660635211420094017E-03
+ -0.436454594164531018E-03 -0.620192759941485053E-03  0.926043761605101039E-02
+ -0.666666666666666970E-02 -0.435400642028797977E-03  0.924569487162926924E-02
+ -0.602675681812970990E-03 -0.434697126891870982E-03 -0.641341627123363030E-03
+  0.919049697819149941E-02 -0.417766802147073019E-03 -0.666666666666666970E-02
+  0.884119022200594018E-02 -0.565196699593391045E-03  0.000000000000000000E+00
+ -0.450926780658108997E-03 -0.597865634330268050E-03  0.921780464690035925E-02
+ -0.666666666666666970E-02 -0.162927680290126010E-03  0.761083369281826978E-02
+ -0.416350633852817005E-04  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.530230645664085047E-03  0.000000000000000000E+00
+ -0.269827347276449016E-03  0.866009136742811915E-02 -0.666666666666666970E-02
+ -0.162144345304456006E-03  0.760092331739826033E-02 -0.320930758282650987E-05
+ -0.529813749405098054E-03 -0.838256824281413042E-04  0.879776792041751972E-02
+  0.822717682684740922E-02 -0.303187285528630003E-09 -0.666666666666666970E-02
+ -0.349301427021029015E-11  0.790044609360177931E-02 -0.666666666666666970E-02
+ -0.407938480218887980E-03  0.871084858846606080E-02 -0.512295469801526959E-03
+ -0.458837927071482975E-03 -0.588085060255285046E-03  0.920289209649226975E-02
+ -0.666666666666666970E-02  0.755933306797651038E-02 -0.410021685751258990E-11
+ -0.607789044576714013E-09  0.822725774217296021E-02 -0.162167689058960993E-03
+ -0.666666666666666970E-02  0.755666327531443957E-02 -0.112741283804007006E-10
+ -0.165135297900302988E-03 -0.164049635527119992E-08  0.822716577111987958E-02
+ -0.666666666666666970E-02 -0.162816151525113997E-03  0.755629731334936036E-02
+ -0.145250995281190996E-07 -0.168361965007562991E-03 -0.190237271058338006E-05
+  0.822792892080568931E-02 -0.666666666666666970E-02 -0.434980646616620992E-03
+  0.921590961843673021E-02 -0.672201796129614040E-03  0.000000000000000000E+00
+ -0.435117717236683025E-03 -0.623850865689214011E-03  0.926672719210277010E-02
+ -0.666666666666666970E-02 -0.391662568195668987E-03  0.857592608264677939E-02
+ -0.270377852525331014E-03  0.000000000000000000E+00 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.470346296009015001E-03  0.000000000000000000E+00
+ -0.290024881064321978E-03  0.893923985221735037E-02 -0.666666666666666970E-02
+ -0.317995102343517999E-03  0.807402209586039962E-02 -0.163000191627275006E-03
+ -0.506466182711903996E-03 -0.315285148839635984E-03  0.895741240163453953E-02
+  0.822721524635562987E-02 -0.564092115954154982E-09 -0.666666666666666970E-02
+ -0.389036326356696026E-11  0.758050134875735016E-02 -0.666666666666666970E-02
+  0.758050134875735016E-02 -0.389036331777706000E-11 -0.564092115791524967E-09
+  0.822721524635562987E-02 -0.666666666666666970E-02 -0.162059230234564013E-03
+  0.755676875879613980E-02 -0.404977791069632030E-11 -0.164607827252304003E-03
+ -0.598672124761886979E-09  0.822718791318045005E-02 -0.435030727375933994E-03
+  0.921782544139550068E-02 -0.671763596290095993E-03 -0.435067658311500988E-03
+ -0.666666666666666970E-02 -0.623720661515962042E-03  0.926678721368117027E-02
+ -0.666666666666666970E-02 -0.435067665337827991E-03  0.926637336566514015E-02
+ -0.623334437794025985E-03 -0.435030720348006980E-03 -0.671597031953381036E-03
+  0.921719062426287934E-02 -0.666666666666666970E-02 -0.162064554103597002E-03
+  0.755721778425398963E-02 -0.421379617299167968E-11 -0.164615465305043013E-03
+ -0.621841414828084976E-09  0.822717725632217053E-02 -0.666666666666666970E-02
+ -0.435032270976548998E-03  0.921659997236582915E-02 -0.671477595984673039E-03
+  0.000000000000000000E+00 -0.435066114983777015E-03 -0.622858547503055051E-03
+  0.926582760612120956E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.435031835475485022E-03  0.921598351720689934E-02 -0.671332921939591038E-03
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.435066550409885975E-03
+  0.000000000000000000E+00 -0.622784365706614980E-03  0.926515556356406061E-02
+ -0.435081591675335006E-03 -0.666666666666666970E-02  0.926715723864761018E-02
+ -0.624060476111682047E-03 -0.435016790462375985E-03 -0.671761223543174959E-03
+  0.921784490296891079E-02 -0.666666666666666970E-02 -0.162062411240925002E-03
+  0.755721810753046033E-02 -0.413154453738800000E-11 -0.164605196762570010E-03
+ -0.609894095762123987E-09  0.822717754580637971E-02 -0.435056188182044996E-03
+  0.926585891365665042E-02 -0.622867894213649997E-03 -0.435042198968283974E-03
+ -0.666666666666666970E-02 -0.671546446351377053E-03  0.921692872689685964E-02
+ -0.164341192321288998E-03 -0.666666666666666970E-02  0.822700721314235071E-02
+ -0.388276097998368010E-09 -0.164366485757635008E-03 -0.391285012821627015E-09
+  0.822719359814269927E-02 -0.435043836697356014E-03 -0.666666666666666970E-02
+  0.921698374849288030E-02 -0.671558867523572019E-03 -0.435054550554351985E-03
+ -0.622870487342790016E-03  0.926586485739499914E-02 -0.666666666666666970E-02
+ -0.162059863256517012E-03  0.755722860343106036E-02 -0.403706320675834994E-11
+ -0.164610172315551000E-03 -0.596713959736372018E-09  0.822719066745875027E-02
+ -0.435029178909017021E-03  0.921782856828711958E-02 -0.671763646252384965E-03
+ -0.435069206480010991E-03 -0.666666666666666970E-02 -0.623759080059059956E-03
+  0.926682896934285030E-02 -0.165176323153183011E-03 -0.666666666666666970E-02
+  0.822774275548648036E-02 -0.421612183862497016E-09 -0.164358865204686000E-03
+ -0.403616558608118980E-09  0.822717766773221025E-02 -0.666666666666666970E-02
+  0.755853528843129958E-02 -0.410907228794313989E-11 -0.609517161291164049E-09
+  0.822725835180460052E-02  0.822699835770025062E-02 -0.229769141627132009E-09
+ -0.666666666666666970E-02 -0.552503936541298003E-11  0.815277412222070036E-02
+ -0.431800077704224978E-03 -0.666666666666666970E-02  0.911683253562737932E-02
+ -0.645107782251850984E-03 -0.438245881184861979E-03 -0.614761192822427965E-03
+  0.925173111806255020E-02 -0.666666666666666970E-02 -0.162066792004203990E-03
+  0.755722059760233992E-02 -0.430128633401459982E-11 -0.164626435077370987E-03
+ -0.634553335781814010E-09  0.822717711494816949E-02 -0.435011239879684001E-03
+ -0.666666666666666970E-02  0.921541899333368046E-02 -0.670833188949691958E-03
+  0.000000000000000000E+00 -0.435087140296697020E-03 -0.622361445147185050E-03
+  0.926525312625075051E-02 -0.666666666666666970E-02 -0.435024263236271974E-03
+  0.921464288704679918E-02 -0.670759624758249955E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.435074121043737996E-03
+  0.000000000000000000E+00 -0.622356277676372038E-03  0.926393943568542926E-02
+ -0.666666666666666970E-02 -0.162274431660521006E-03  0.755738313912261012E-02
+ -0.318240683403240976E-10 -0.165730949472054013E-03 -0.457846708968348003E-08
+  0.822724281302687072E-02  0.822699408685964931E-02 -0.228425558756176994E-09
+ -0.666666666666666970E-02 -0.569262164845434979E-11  0.815841793296116953E-02
+ -0.666666666666666970E-02 -0.432079856432105011E-03  0.912452767230425080E-02
+ -0.647134271290311994E-03 -0.437974703324669004E-03 -0.615294578525974042E-03
+  0.925273450239525071E-02 -0.666666666666666970E-02  0.755849436258760991E-02
+ -0.410955652506049982E-11 -0.609618023001794995E-09  0.822725856237871075E-02
+ -0.162292777280510000E-03 -0.666666666666666970E-02  0.755742125771192031E-02
+ -0.382880319923928003E-10 -0.165829805796784997E-03 -0.549637401523144008E-08
+  0.822725133937288015E-02 -0.666666666666666970E-02 -0.162519791749040997E-03
+  0.758486654085705012E-02 -0.277312982546554016E-04 -0.531411130485370960E-03
+ -0.271318851749606982E-03  0.895290555306309938E-02 -0.666666666666666970E-02
+ -0.435144726093062025E-03  0.927049355419245005E-02 -0.627194917416272046E-03
+  0.000000000000000000E+00 -0.434953615617772005E-03 -0.673646073339009962E-03
+  0.921927163146945942E-02 -0.666666666666666970E-02 -0.162365825870953989E-03
+  0.770469126010004009E-02 -0.341374404077266992E-05  0.000000000000000000E+00
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.530093736436846978E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.670688877278153049E-04
+  0.878401724365305073E-02 -0.666666666666666970E-02 -0.406469238190150994E-03
+  0.868845253580060976E-02 -0.487445397507600979E-03 -0.459955572937245996E-03
+ -0.561455436984956992E-03  0.917876806761750916E-02  0.822724019224498060E-02
+ -0.586236017994280044E-09 -0.666666666666666970E-02 -0.399393167827312004E-11
+  0.756942274805354979E-02 -0.666666666666666970E-02  0.756938782679772958E-02
+ -0.398490217607864004E-11 -0.582276458527030000E-09  0.822717710481316034E-02
+ -0.666666666666666970E-02 -0.162064188279850002E-03  0.755638507164865976E-02
+ -0.426955655883516001E-11 -0.164621776652662993E-03 -0.630372617736862023E-09
+  0.822717559470338006E-02 -0.435097665884428989E-03  0.926770143129381982E-02
+ -0.624846533095091048E-03 -0.435000709733791000E-03 -0.666666666666666970E-02
+ -0.673347090449469992E-03  0.921745134737769929E-02 -0.666666666666666970E-02
+ -0.164221437102121003E-03  0.822590307607582068E-02 -0.301540531002169025E-09
+ -0.164276463596261000E-03 -0.317267642602591980E-09  0.822720896113589978E-02
+ -0.666666666666666970E-02 -0.435040484785382989E-03  0.921687621618049943E-02
+ -0.671540621954609001E-03 -0.435057902230038000E-03 -0.622872302677949011E-03
+  0.926585782249450020E-02 -0.666666666666666970E-02 -0.162127341684184989E-03
+  0.759121275229850991E-02 -0.381774206063630980E-11  0.000000000000000000E+00
+ -0.164583013631449993E-03 -0.545969304848883959E-09  0.822717722380189066E-02
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.166319316568878998E-03
+  0.755721776450365024E-02 -0.125637200248759998E-10 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.166854313970594991E-03  0.000000000000000000E+00
+ -0.695766357075677050E-09  0.759121342784747997E-02 -0.435007341721630001E-03
+ -0.666666666666666970E-02  0.921767422715922928E-02 -0.673397481031394029E-03
+ -0.435091036898059991E-03 -0.624857014917886951E-03  0.926772526275846005E-02
+ -0.666666666666666970E-02 -0.435042780430306005E-03  0.921682237057486024E-02
+ -0.671414330383147007E-03 -0.435055606759090997E-03 -0.622737221255020975E-03
+  0.926573013268000914E-02 -0.164512437176590000E-03  0.822720912015702070E-02
+ -0.478141445901109973E-09 -0.162035132711022011E-03 -0.666666666666666970E-02
+ -0.321987104749231018E-11  0.755727832223314962E-02 -0.162065357632703997E-03
+ -0.666666666666666970E-02  0.755727582947162985E-02 -0.423979469854183965E-11
+ -0.164618725596388991E-03 -0.625584490453962995E-09  0.822717479206993944E-02
+ -0.162035093478465002E-03 -0.666666666666666970E-02  0.755725620865007990E-02
+ -0.322006453375858990E-11 -0.164512451387367005E-03 -0.478178417741383017E-09
+  0.822720912019385062E-02 -0.666666666666666970E-02 -0.162063731945476990E-03
+  0.755721533840931960E-02 -0.418199409455997034E-11 -0.164611237471814999E-03
+ -0.617212655483738035E-09  0.822717573526372987E-02 -0.435072204818992981E-03
+  0.926779358414435948E-02 -0.624886835257608040E-03 -0.435026179922837024E-03
+ -0.666666666666666970E-02 -0.673540781035909049E-03  0.921830844145678924E-02
+ -0.435026048326556981E-03 -0.666666666666666970E-02  0.921631022735146070E-02
+ -0.671342133550576967E-03 -0.435072336384253024E-03 -0.622763518765494998E-03
+  0.926572086739899971E-02 -0.666666666666666970E-02  0.755741246850622026E-02
+ -0.410910644194978984E-11 -0.606515170668247041E-09  0.822717712895315936E-02
+  0.822711877443555936E-02 -0.222069534792841002E-09 -0.666666666666666970E-02
+ -0.774169806704296934E-11  0.819168186272316995E-02 -0.433633789741903004E-03
+ -0.666666666666666970E-02  0.917151071300205921E-02 -0.660635211420117002E-03
+ -0.436454594164527982E-03 -0.620192759941490040E-03  0.926043761605102080E-02
+ -0.666666666666666970E-02 -0.435400642045188024E-03  0.924569487205297025E-02
+ -0.602675682168614998E-03 -0.434697126875424015E-03 -0.641341627119159036E-03
+  0.919049697820984064E-02 -0.417766802147158021E-03 -0.666666666666666970E-02
+  0.884119022200715969E-02 -0.565196699593685948E-03  0.000000000000000000E+00
+ -0.450926780658037006E-03 -0.597865634330208961E-03  0.921780464690040956E-02
+ -0.666666666666666970E-02 -0.162927680290140999E-03  0.761083369282181989E-02
+ -0.416350633850501013E-04  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.666666666666666970E-02 -0.530230645664087974E-03  0.000000000000000000E+00
+ -0.269827347274097977E-03  0.866009136742689964E-02 -0.666666666666666970E-02
+ -0.162144345304457009E-03  0.760092331739868968E-02 -0.320930758282241997E-05
+ -0.529813749405099029E-03 -0.838256824279560954E-04  0.879776792041734972E-02
+  0.822717682684740922E-02 -0.303187285555688993E-09 -0.666666666666666970E-02
+ -0.349301427021062013E-11  0.790044609360190074E-02 -0.666666666666666970E-02
+ -0.407938480218909014E-03  0.871084858846631060E-02 -0.512295469801619008E-03
+ -0.458837927071466983E-03 -0.588085060255303044E-03  0.920289209649229924E-02
+ -0.666666666666666970E-02  0.755933306797651038E-02 -0.410021674909242031E-11
+ -0.607789044576720010E-09  0.822725774217296021E-02 -0.162167689058960993E-03
+ -0.666666666666666970E-02  0.755666327531452024E-02 -0.112741284887906999E-10
+ -0.165135297900302012E-03 -0.164049635526676995E-08  0.822716577111987958E-02
+ -0.666666666666666970E-02 -0.162064381314455013E-03  0.755721681642354016E-02
+ -0.420708647347467005E-11 -0.164614463272000995E-03 -0.620861048269696965E-09
+  0.822717666428922009E-02 -0.666666666666666970E-02 -0.435032929063819981E-03
+  0.921662843024339054E-02 -0.671487062718633038E-03  0.000000000000000000E+00
+ -0.435065457005957975E-03 -0.622863607135336990E-03  0.926583682639911012E-02
+ -0.666666666666666970E-02 -0.435033606033594978E-03  0.921549100307902068E-02
+ -0.670764254703717011E-03  0.000000000000000000E+00 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.435064780144060010E-03  0.000000000000000000E+00
+ -0.622228101134277012E-03  0.926458636134656031E-02 -0.666666666666666970E-02
+ -0.162412248298241007E-03  0.755747465195114043E-02 -0.132892288474876992E-09
+ -0.166453306564733994E-03 -0.187762010463222004E-07  0.822729692880195054E-02
+  0.822701112868542972E-02 -0.238165066447310994E-09 -0.666666666666666970E-02
+ -0.481872285692390998E-11  0.811743485472943976E-02 -0.666666666666666970E-02
+  0.907030660461941030E-02 -0.635289343114582030E-03 -0.614711342014916053E-03
+  0.924712095234279940E-02 -0.666666666666666970E-02 -0.162067491346243997E-03
+  0.755721660010066996E-02 -0.433042344878489997E-11 -0.164637989129536011E-03
+ -0.639067222491722952E-09  0.822718313255254956E-02 -0.435123629441636984E-03
+  0.926610912903469072E-02 -0.623064810739766984E-03 -0.434974730176162019E-03
+ -0.666666666666666970E-02 -0.671125339170808039E-03  0.921519637816755992E-02
+ -0.666666666666666970E-02 -0.166000837408311988E-03  0.822475961254992040E-02
+ -0.105963041200035007E-07 -0.166020054886163010E-03 -0.114023574905160996E-07
+  0.822673401751872028E-02 -0.666666666666666970E-02 -0.435040411541543013E-03
+  0.921687888628672063E-02 -0.671545404567263976E-03 -0.435057975467457981E-03
+ -0.622877304336513001E-03  0.926586291132954973E-02 -0.666666666666666970E-02
+ -0.162130707049625006E-03  0.759120585643323006E-02 -0.392634071196178009E-11
+  0.000000000000000000E+00 -0.164597967653741013E-03 -0.561253610552618950E-09
+  0.822717507317871979E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.166319143409792997E-03  0.755721565731708005E-02 -0.124778474586887002E-10
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.166854049995611003E-03
+  0.000000000000000000E+00 -0.691000750970318969E-09  0.759120679480445974E-02
+ -0.435051311183006005E-03 -0.666666666666666970E-02  0.926639102794180047E-02
+ -0.623356751569072967E-03 -0.435047076190057990E-03 -0.671803857406816016E-03
+  0.921783849635480004E-02 -0.666666666666666970E-02 -0.435004887685989992E-03
+  0.921760501413688982E-02 -0.673382235862111022E-03 -0.435093489874442013E-03
+ -0.624855064858449022E-03  0.926773123149998018E-02 -0.166320189393602005E-03
+  0.822674016833681992E-02 -0.176036669988993003E-07 -0.162405090688964003E-03
+ -0.666666666666666970E-02 -0.125356963990057004E-09  0.755633079584900041E-02
+ -0.162059610339947000E-03 -0.666666666666666970E-02  0.755636445209539962E-02
+ -0.409725503580782012E-11 -0.164619621806747006E-03 -0.605978989258597974E-09
+  0.822719229488561021E-02 -0.162405940044832003E-03 -0.666666666666666970E-02
+  0.755662367990892966E-02 -0.125254866802548012E-09 -0.166319952568922996E-03
+ -0.175849783175382999E-07  0.822674014969775047E-02 -0.666666666666666970E-02
+ -0.162059866198233001E-03  0.755722656090779960E-02 -0.403692298108515966E-11
+ -0.164606538526818004E-03 -0.596577171893270041E-09  0.822718795529438045E-02
+ -0.435028073525391026E-03  0.921786254908986934E-02 -0.671799805131304017E-03
+ -0.435070311635940977E-03 -0.666666666666666970E-02 -0.623819803880377045E-03
+  0.926689343860783929E-02 -0.435070247954426976E-03 -0.666666666666666970E-02
+  0.926779304085995070E-02 -0.624694668192695042E-03 -0.435028137220242991E-03
+ -0.672979588001651041E-03  0.921846396898064017E-02 -0.666666666666666970E-02
+  0.894946699024212929E-02 -0.600122097862237002E-03 -0.606052261066108988E-03
+  0.923126173183646918E-02  0.822707325920852932E-02 -0.439334748407924013E-09
+ -0.666666666666666970E-02 -0.340850404729477990E-11  0.767218067483432017E-02
+ -0.163197277611309987E-03 -0.666666666666666970E-02  0.767185136575801018E-02
+ -0.332821151740468006E-08 -0.167963560070563987E-03 -0.390028261492079976E-06
+  0.822779264793695002E-02 -0.666666666666666970E-02 -0.424429129575981982E-03
+  0.884286864444589968E-02 -0.396957256272790994E-03 -0.445115815542256987E-03
+ -0.385983334887198980E-03  0.902131707790924942E-02 -0.407763611263302025E-03
+ -0.666666666666666970E-02  0.870515217458626978E-02 -0.498571990041084007E-03
+  0.000000000000000000E+00 -0.458970290280038998E-03 -0.570346192985176997E-03
+  0.918907244525967067E-02 -0.666666666666666970E-02 -0.322071566303724021E-03
+  0.801064459323599064E-02 -0.151974252254838009E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.504268248287133031E-03
+  0.000000000000000000E+00 -0.280047589106463980E-03  0.858275366599436061E-02
+ -0.666666666666666970E-02 -0.162136502882231009E-03  0.755886084998397007E-02
+ -0.797239446559356004E-11 -0.165003120614180988E-03 -0.116699722701056003E-08
+  0.822721467884345026E-02  0.926072857339886937E-02 -0.621240348529511033E-03
+ -0.666666666666666970E-02 -0.664187000217392946E-03  0.918478778839808968E-02
+ -0.666666666666666970E-02 -0.434091712912973990E-03  0.918381835623783074E-02
+ -0.661965111513909037E-03 -0.436002089949920021E-03 -0.618955328845967952E-03
+  0.925997826239693955E-02 -0.666666666666666970E-02  0.755828330701808961E-02
+ -0.411187050494477969E-11 -0.610057942551154034E-09  0.822725843300332015E-02
+ -0.162643320272068002E-03 -0.666666666666666970E-02  0.760168784361417004E-02
+ -0.350264221860245974E-04 -0.531763987837332967E-03 -0.285598197078351006E-03
+  0.896123574808329985E-02 -0.666666666666666970E-02 -0.162064603067110991E-03
+  0.755721765251989031E-02 -0.421568927958496012E-11  0.000000000000000000E+00
+ -0.164615562653504011E-03 -0.622111769806188027E-09  0.822717714448987017E-02
+ -0.666666666666666970E-02 -0.435031767299973019E-03  0.921658397588867064E-02
+ -0.671474829222033949E-03  0.000000000000000000E+00 -0.435066618573651026E-03
+ -0.622858771889590011E-03  0.926582672216797083E-02 -0.666666666666666970E-02
+  0.000000000000000000E+00 -0.435031759821466978E-03  0.921598447351964015E-02
+ -0.671353608292523957E-03 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.435066626050711989E-03  0.000000000000000000E+00 -0.622806883477106956E-03
+  0.926515741405328948E-02 -0.666666666666666970E-02 -0.162062535484502993E-03
+  0.755721765241602027E-02 -0.413624146533982998E-11 -0.164605264966013012E-03
+ -0.610559194090089995E-09  0.822717713298093921E-02  0.822717692676106990E-02
+ -0.401916224683843019E-09 -0.666666666666666970E-02 -0.398503994321098006E-09
+  0.822695176184735054E-02 -0.666666666666666970E-02  0.921688643859516939E-02
+ -0.671543140262901043E-03 -0.622873028246735054E-03  0.926585914257952933E-02
+ -0.666666666666666970E-02 -0.162062560104490004E-03  0.755721765317601043E-02
+ -0.413717829056970014E-11 -0.164605387585608994E-03 -0.610695437742429953E-09
+  0.822717713403002017E-02 -0.435057919911320979E-03  0.926585996691042975E-02
+ -0.622874416570012033E-03 -0.435040467102555022E-03 -0.666666666666666970E-02
+ -0.671542718462655053E-03  0.921687774051401995E-02 -0.666666666666666970E-02
+ -0.164358956864967009E-03  0.822694333429544941E-02 -0.400838058665778016E-09
+ -0.164360413187303008E-03 -0.404399206698849991E-09  0.822717692928500062E-02
+ -0.666666666666666970E-02 -0.435040480198734978E-03  0.921687629883806946E-02
+ -0.671540846988837015E-03  0.000000000000000000E+00 -0.435057906816285018E-03
+ -0.622872546234125974E-03  0.926585805175031071E-02 -0.666666666666666970E-02
+ -0.162127546305523010E-03  0.759121249965894005E-02 -0.382423911661819962E-11
+  0.000000000000000000E+00 -0.164583726826487010E-03 -0.546878146458841976E-09
+  0.822717706956129988E-02 -0.666666666666666970E-02  0.000000000000000000E+00
+ -0.166319285296618989E-03  0.755721766130180994E-02 -0.125511345494907995E-10
+ -0.666666666666666970E-02  0.000000000000000000E+00 -0.166854277186167003E-03
+  0.000000000000000000E+00 -0.695070062425207962E-09  0.759121319044449988E-02
+ -0.435040455888697000E-03 -0.666666666666666970E-02  0.921687736420685050E-02
+ -0.671542633493042978E-03 -0.435057931124196984E-03 -0.622874398846488971E-03
+  0.926585992663164944E-02 -0.666666666666666970E-02 -0.435040480266267004E-03
+  0.921687629128858064E-02 -0.671540837058780997E-03 -0.435057906748759010E-03
+ -0.622872536301050019E-03  0.926585804176803009E-02 -0.164605382599736005E-03
+  0.822717713315264040E-02 -0.610689878464067020E-09 -0.162062559112368987E-03
+ -0.666666666666666970E-02 -0.413714002853130017E-11  0.755721765707715026E-02
+ -0.162062560979487995E-03 -0.666666666666666970E-02  0.755721765716044040E-02
+ -0.413721119467013017E-11 -0.164605391905931007E-03 -0.610700236970853951E-09
+  0.822717713430983974E-02 -0.162062559104255008E-03 -0.666666666666666970E-02
+  0.755721765270791958E-02 -0.413714010482357039E-11 -0.164605382602643998E-03
+ -0.610689887899547005E-09  0.822717713315264040E-02 -0.666666666666666970E-02
+ -0.162062002688690993E-03  0.755721765325861970E-02 -0.411602396034761983E-11
+ -0.164602611349319008E-03 -0.607618836657478970E-09  0.822717713082904076E-02
+ -0.435050795545137013E-03  0.926588559200255957E-02 -0.622885679652955006E-03
+ -0.435047591837533988E-03 -0.666666666666666970E-02 -0.671596715586648962E-03
+  0.921711691727978080E-02 -0.435047591829257026E-03 -0.666666666666666970E-02
+  0.921711503527408021E-02 -0.671594744907884041E-03 -0.435050795553413975E-03
+ -0.622883788869598950E-03  0.926588363014568990E-02 -0.666666666666666970E-02
+  0.921687630349790968E-02 -0.671540851815855032E-03 -0.622872550856051971E-03
+  0.926585805656933957E-02  0.822717712936564018E-02 -0.606929320764278968E-09
+ -0.666666666666666970E-02 -0.411128314877385017E-11  0.755721765699138986E-02
+ -0.162062559115238003E-03 -0.666666666666666970E-02  0.755721765701833012E-02
+ -0.413714010896968962E-11 -0.164605382614328987E-03 -0.610689894184595994E-09
+  0.822717713311110939E-02 -0.666666666666666970E-02 -0.435040480219567002E-03
+  0.921687629951207024E-02 -0.671540847113225037E-03 -0.435057906795455001E-03
+ -0.622872546234027962E-03  0.926585805179963064E-02 -0.162127546311748987E-03
+ -0.666666666666666970E-02  0.759121249955777965E-02 -0.382423940240163967E-11
+  0.000000000000000000E+00 -0.164583726855365006E-03 -0.546878175305287047E-09
+  0.822717706955716950E-02 -0.666666666666666970E-02 -0.166319285308019998E-03
+  0.755721766131254007E-02 -0.125511384896241995E-10  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.666666666666666970E-02 -0.166854277196930989E-03
+  0.000000000000000000E+00 -0.695070273739865966E-09  0.759121319033985009E-02
+ -0.666666666666666970E-02 -0.162062001821340992E-03  0.755721765238263031E-02
+ -0.411599097514774006E-11 -0.164602607032878005E-03 -0.607614054874119969E-09
+  0.822717713004762069E-02  0.926588367541618038E-02 -0.622883811249193005E-03
+ -0.666666666666666970E-02 -0.671594835876209036E-03  0.921711542242288975E-02
+ -0.666666666666666970E-02 -0.435047603237387977E-03  0.921711541881104976E-02
+ -0.671594832109726034E-03 -0.435050784145465008E-03 -0.622883807637388031E-03
+  0.926588367164974010E-02 -0.666666666666666970E-02  0.755721765237517013E-02
+ -0.411128320041531016E-11 -0.606929330558691026E-09  0.822717712936565058E-02
+ -0.435040480193840022E-03 -0.666666666666666970E-02  0.921687628881386924E-02
+ -0.671540836666198004E-03 -0.435057906821179974E-03 -0.622872536362236969E-03
+  0.926585804144416936E-02 -0.166666666666667011E-01 -0.435069629630581997E-03
+  0.194633698669611013E-01 -0.118122796424918004E-02 -0.435069154494976983E-03
+ -0.125613726459116001E-02  0.193884422615678009E-01 -0.166666666666667011E-01
+ -0.139671085486954007E-03  0.172964744640674989E-01 -0.239732978890832008E-07
+ -0.141641486800695994E-03 -0.121305359387927990E-05  0.178147059436198005E-01
+ -0.141641814770074998E-03  0.178147059816575991E-01 -0.121308374397845994E-05
+ -0.166666666666667011E-01 -0.139671142305139987E-03 -0.239739107040978989E-07
+  0.172964744651716990E-01 -0.166666666666667011E-01 -0.435068671068379023E-03
+  0.000000000000000000E+00  0.193848780140391010E-01 -0.125250859257347992E-02
+ -0.435070113055705985E-03 -0.117767774816952006E-02  0.194597410164681003E-01
+ -0.166666666666667011E-01  0.172969944943952014E-01 -0.779115312058558027E-07
+ -0.166666666666667011E-01 -0.755324935173564974E-05  0.179004996941746000E-01
+ -0.166666666666667011E-01  0.178976999748920000E-01 -0.387667548978359018E-05
+ -0.233628102655817991E-06  0.178135930268625009E-01 -0.166666666666667011E-01
+ -0.435068785870240020E-03  0.193888089807052987E-01 -0.125652619301678007E-02
+ -0.435069998254330982E-03 -0.118160505927794005E-02  0.194637572604987007E-01
+ -0.435033736890810016E-03 -0.166666666666667011E-01  0.193943160872508992E-01
+ -0.126292016425918010E-02  0.000000000000000000E+00 -0.435105043203752020E-03
+ -0.118838524744318999E-02  0.194704567784157005E-01 -0.166666666666667011E-01
+ -0.139670810780983011E-03  0.000000000000000000E+00  0.172964393749100998E-01
+ -0.239726372267789987E-07 -0.141640262301834013E-03 -0.121310025861443004E-05
+  0.178147059964809014E-01 -0.166666666666667011E-01 -0.435069265953867983E-03
+  0.193887440338862994E-01 -0.125644950186007995E-02 -0.435069518171818988E-03
+ -0.118152293796727997E-02  0.194636762447145013E-01 -0.140348173477511997E-03
+  0.178144318724902002E-01 -0.933813955352833025E-06 -0.166666666666667011E-01
+ -0.140348162575281002E-03 -0.933775488811803979E-06  0.178144292153939003E-01
+ -0.166666666666667011E-01 -0.139703533487054989E-03  0.172964344581288988E-01
+ -0.243260553268771015E-07 -0.141829183573385009E-03 -0.123048280177751006E-05
+  0.178147222203411011E-01 -0.435051982399214975E-03 -0.166666666666667011E-01
+  0.193933724724300004E-01 -0.126258834031118010E-02 -0.435086800763898984E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.118819422876379008E-02
+  0.194685490108550997E-01  0.000000000000000000E+00 -0.435025946733501019E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.193872403842459996E-01 -0.125578622091835007E-02 -0.435112831393176015E-03
+ -0.118149517659288990E-02  0.194634742330271997E-01 -0.166666666666667011E-01
+  0.172964350463347995E-01 -0.239721729446109010E-07 -0.121308719330683991E-05
+  0.178147059846293990E-01 -0.166666666666667011E-01 -0.435069341371819001E-03
+  0.193887901446231983E-01 -0.125649495627712989E-02 -0.435069442753911013E-03
+ -0.118156624050173007E-02  0.194637211303591989E-01 -0.166666666666667011E-01
+  0.172964137870020990E-01 -0.947281877378979960E-08  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.426824290648908980E-06  0.177495617930084996E-01
+  0.000000000000000000E+00 -0.140229676531456008E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.177491875365273014E-01
+ -0.500657647124818999E-07 -0.140479994479306001E-03 -0.522090189834258032E-06
+  0.178140176711019983E-01 -0.435033290824524978E-03 -0.166666666666667011E-01
+  0.193943004566385011E-01 -0.126291331604961991E-02 -0.435105489168504994E-03
+ -0.118838501644233006E-02  0.194704545970095996E-01 -0.435268249897141982E-03
+ -0.166666666666667011E-01  0.194803294031692005E-01 -0.120221696823341996E-02
+ -0.434870408625407019E-03 -0.126728096347563995E-02  0.194000952173805016E-01
+ -0.166666666666667011E-01  0.172950455057520984E-01 -0.209695222919133997E-07
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.104309476531645996E-05
+  0.178040398390986003E-01  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.173086281050639987E-01 -0.234055546331482985E-07 -0.116921175397781010E-05
+  0.178149613334784990E-01 -0.166666666666667011E-01 -0.141165505762467993E-03
+  0.172962556489167016E-01 -0.473446914579739040E-05 -0.544658362791339019E-03
+ -0.346186312977599982E-04  0.183333438873188002E-01 -0.139754712611172996E-03
+ -0.166666666666667011E-01  0.176170424650436003E-01 -0.246162966049732998E-07
+ -0.140206415574070008E-03 -0.567757815085220049E-06  0.178140588186939987E-01
+ -0.165095245016433009E-03  0.178223322459852007E-01 -0.681146904710235036E-05
+ -0.166666666666667011E-01 -0.150451686795303991E-03 -0.294774223703715983E-06
+  0.176167018986788000E-01 -0.166666666666667011E-01 -0.421169644089434015E-03
+  0.189193940368806990E-01 -0.945437876210229950E-03 -0.448291096827642985E-03
+ -0.100548436568255006E-02  0.192537240725896008E-01 -0.166666666666667011E-01
+  0.173199119819372992E-01 -0.235984471741585014E-07  0.000000000000000000E+00
+ -0.166666666666667011E-01  0.000000000000000000E+00 -0.123676172594726990E-05
+  0.178172740172132006E-01 -0.166666666666667011E-01 -0.435171428458813985E-03
+  0.194709389718316986E-01 -0.119092785784032009E-02 -0.434967322589985023E-03
+ -0.126074169121839991E-02  0.193933111634515994E-01 -0.166666666666667011E-01
+  0.172958818583838006E-01 -0.240054389113918998E-07 -0.121597095988231006E-05
+  0.178147088549027999E-01 -0.434490077538828977E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.192819870996423989E-01
+ -0.115810409218265005E-02 -0.435647599097352997E-03 -0.109302488191087002E-02
+  0.193702791631770004E-01 -0.166666666666667011E-01  0.172964231611969985E-01
+ -0.237439154341176999E-07  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01  0.000000000000000000E+00 -0.119986668002372008E-05
+  0.178139470404297014E-01 -0.166666666666667011E-01 -0.435069730380088988E-03
+  0.194633683526404012E-01 -0.118122901996727003E-02 -0.435069053745286004E-03
+ -0.125613277918751993E-02  0.193884385258774015E-01 -0.166666666666667011E-01
+ -0.139670526915550006E-03  0.172964750459083001E-01 -0.239672729870900994E-07
+ -0.141638257586788001E-03 -0.121275616494760993E-05  0.178147056890299998E-01
+ -0.141820432945145006E-03  0.178147229316946987E-01 -0.122951654344512007E-05
+ -0.166666666666667011E-01 -0.139702093156581013E-03 -0.243082078767443003E-07
+  0.172964754242462999E-01 -0.166666666666667011E-01 -0.435009064920768992E-03
+  0.193710715232028007E-01 -0.123960904534558994E-02 -0.435129707573877990E-03
+ -0.116591990546044996E-02  0.194474141497074995E-01 -0.166666666666667011E-01
+  0.172985028689811998E-01 -0.223410986359397003E-07 -0.166666666666667011E-01
+ -0.111564513924942993E-05  0.178095672953987004E-01 -0.166666666666667011E-01
+  0.193790139692226991E-01 -0.125224418475917997E-02 -0.118134652118899995E-02
+  0.194616264856153007E-01 -0.166666666666667011E-01 -0.435069813890499976E-03
+  0.194637397133314016E-01 -0.118159504407725001E-02 -0.435068970234672974E-03
+ -0.125650151292971989E-02  0.193888007309931006E-01 -0.435063659072025002E-03
+ -0.166666666666667011E-01  0.193957932538242998E-01 -0.126373923029710002E-02
+  0.000000000000000000E+00 -0.435075124949578019E-03 -0.118877268529867007E-02
+  0.194710559190932990E-01 -0.166666666666667011E-01 -0.139670655296777991E-03
+  0.172964394256112999E-01 -0.239709674501792008E-07 -0.141639364432354998E-03
+ -0.121301807421782997E-05  0.178147059272172996E-01 -0.166666666666667011E-01
+ -0.435069551479130975E-03  0.194636473054102983E-01 -0.118149814806462004E-02
+ -0.435069232646526994E-03 -0.125641682842023996E-02  0.193887163382647013E-01
+ -0.140347813456026012E-03  0.178144318351523001E-01 -0.933769427979961972E-06
+ -0.166666666666667011E-01 -0.140367975523149991E-03 -0.933880353406906957E-06
+  0.178144352914434007E-01 -0.166666666666667011E-01 -0.139694549084838003E-03
+  0.172964390046605986E-01 -0.242291801426243003E-07 -0.141777300691703004E-03
+ -0.122571812837964004E-05  0.178147201764872996E-01 -0.435047850309519990E-03
+ -0.166666666666667011E-01  0.193943737538610009E-01 -0.126316205126679004E-02
+  0.000000000000000000E+00 -0.435090932343969020E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.118860422039771007E-02
+  0.194699012602338010E-01  0.000000000000000000E+00 -0.435045457437436027E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.193878817571609997E-01
+ -0.125604652907840004E-02 -0.435093324868215015E-03 -0.118146947412013995E-02
+  0.194635297377060007E-01 -0.139670603133990989E-03 -0.166666666666667011E-01
+  0.172964396533356012E-01 -0.239704033946370987E-07 -0.141639061330424995E-03
+ -0.121298992344433003E-05  0.178147059349155999E-01 -0.166666666666667011E-01
+ -0.435069549163963982E-03  0.194637077506569015E-01 -0.118155678661577995E-02
+ -0.435069234961695992E-03 -0.125647743137780000E-02  0.193887749967509987E-01
+ -0.139700055968098002E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.172964161546672983E-01 -0.947619561016756952E-08 -0.141286732534576009E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.426994435274359988E-06
+  0.177495882894109988E-01  0.000000000000000000E+00 -0.140193998721740011E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.177492151422632000E-01
+ -0.498003721032979000E-07 -0.140435023641922004E-03 -0.519120801586516000E-06
+  0.178140167474564008E-01 -0.435057327600382997E-03 -0.166666666666667011E-01
+  0.193955726939472016E-01 -0.126364253425448000E-02 -0.435081456064144001E-03
+ -0.118875145877563992E-02  0.194710320647638012E-01 -0.434101287475744974E-03
+ -0.166666666666667011E-01  0.193457297954395986E-01 -0.123123427263146005E-02
+ -0.436034461490277003E-03 -0.117022370416927993E-02  0.194484060865461987E-01
+ -0.166666666666667011E-01  0.172978496798759998E-01 -0.927299841537387054E-08
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.415800227862941001E-06  0.177484703644168011E-01 -0.166666666666667011E-01
+  0.193674050743437992E-01 -0.124713171719898993E-02 -0.118115734794936003E-02
+  0.194600874520639011E-01 -0.166666666666667011E-01 -0.141885046454967004E-03
+  0.173019565602799998E-01 -0.104149302119304992E-04 -0.548756258214203995E-03
+ -0.211535543706460008E-03  0.185091585802883984E-01 -0.140739916525857997E-03
+ -0.166666666666667011E-01  0.175467171312232995E-01 -0.284331238335905014E-07
+ -0.142759739737579006E-03 -0.795488176607448023E-06  0.178133916912854989E-01
+ -0.159214075622715010E-03  0.178183546415562992E-01 -0.506772449993064000E-05
+ -0.166666666666667011E-01 -0.147688204700038996E-03 -0.193011674962318992E-06
+  0.175471635134074010E-01 -0.166666666666667011E-01 -0.390248902500896981E-03
+  0.183597810468748013E-01 -0.389827888875239994E-03 -0.472748684898519024E-03
+ -0.455376277614669004E-03  0.187227106527687003E-01 -0.166666666666667011E-01
+  0.184527954209229988E-01 -0.738253265195438005E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.918001551148692977E-03  0.188197523488804995E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.405995886601362996E-03  0.000000000000000000E+00  0.187454059199008011E-01
+ -0.889550329385700050E-03 -0.461282908632618994E-03 -0.106896425909096997E-02
+  0.192970827972911013E-01 -0.166666666666667011E-01  0.173071968997474991E-01
+ -0.234787153019127012E-07 -0.117638457460658001E-05  0.178149781133057003E-01
+ -0.141084828730741996E-03 -0.166666666666667011E-01  0.177480973766994017E-01
+ -0.558748334339287969E-07 -0.141568067840571008E-03 -0.593631384362742965E-06
+  0.178140924667235002E-01 -0.139670647362560011E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.172964277750680996E-01 -0.239358302523397992E-07
+ -0.141638396626583005E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.000000000000000000E+00 -0.121100320738714998E-05  0.178145893661829993E-01
+ -0.166666666666667011E-01 -0.139670792138544994E-03  0.172964364510580990E-01
+ -0.239726277356469989E-07 -0.141640184766718993E-03 -0.121310638575149002E-05
+  0.178147060027850010E-01 -0.166666666666667011E-01 -0.140348083609600002E-03
+  0.178144298829951003E-01 -0.933774429165531998E-06 -0.140348091728343988E-03
+ -0.933803076573281956E-06  0.178144318618429984E-01 -0.435069485970576990E-03
+  0.194636360494852001E-01 -0.118148312544618999E-02 -0.166666666666667011E-01
+ -0.435069298155133020E-03 -0.125640935261609995E-02  0.193887056174587014E-01
+ -0.166666666666667011E-01 -0.139670745411716994E-03  0.172964393771044000E-01
+ -0.239719351623946988E-07 -0.141639884975219995E-03 -0.121306573571080992E-05
+  0.178147059632727985E-01 -0.166666666666667011E-01  0.193887976520876991E-01
+ -0.125650164472155992E-02 -0.166666666666667011E-01 -0.118157208029700004E-02
+  0.194637273004373001E-01 -0.166666666666667011E-01  0.193887977203392008E-01
+ -0.125650167109372002E-02 -0.118157208884185996E-02  0.194637273863749009E-01
+ -0.166666666666667011E-01 -0.435069389748935983E-03  0.193887814175709994E-01
+ -0.125648501071033009E-02 -0.435069394376801999E-03 -0.118155580222921008E-02
+  0.194637107296697989E-01 -0.140178146762424992E-03 -0.166666666666667011E-01
+  0.177492251547946012E-01 -0.496800708557238970E-07  0.000000000000000000E+00
+ -0.140415043495099005E-03 -0.517779583567776022E-06  0.178140139542000003E-01
+ -0.166666666666667011E-01 -0.435069391170850978E-03  0.193884418595630000E-01
+ -0.125613777078238003E-02 -0.435069392954887005E-03 -0.118121619467206006E-02
+  0.194633634756040015E-01 -0.166666666666667011E-01 -0.139670761471267005E-03
+  0.172964743992243992E-01 -0.239698239143705995E-07 -0.141639617585322011E-03
+ -0.121288295374220008E-05  0.178147057813429002E-01 -0.141639619900249991E-03
+  0.178147057815410993E-01 -0.121288316535549008E-05 -0.166666666666667011E-01
+ -0.139670761872411997E-03 -0.239698282191903987E-07  0.172964743992280005E-01
+ -0.166666666666667011E-01 -0.435069391157123027E-03  0.193887088952683015E-01
+ -0.125641082509626010E-02 -0.435069392968615010E-03 -0.118148323088716994E-02
+  0.194636365300335998E-01 -0.139670783971901987E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01  0.172964393406603995E-01 -0.239438477767506995E-07
+ -0.141639278555754001E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.121143526565240011E-05  0.178146131748509011E-01  0.000000000000000000E+00
+ -0.435064994003248019E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.193885566142219003E-01 -0.125634391561759004E-02 -0.435073790061057985E-03
+ -0.118148087424704991E-02  0.194636165590847988E-01 -0.139670564073325994E-03
+ -0.166666666666667011E-01  0.172964393993381010E-01 -0.239699852667253984E-07
+ -0.141638838042723001E-03 -0.121296980432410002E-05  0.178147058666821984E-01
+ -0.166666666666667011E-01 -0.435069390409400993E-03  0.193887087203693000E-01
+ -0.125641066243104005E-02 -0.435069393716336990E-03 -0.118148308295969000E-02
+  0.194636363738771995E-01 -0.139670603963582999E-03 -0.166666666666667011E-01
+  0.172964393580567997E-01 -0.239419221487562986E-07 -0.141638239923676007E-03
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.121134061697532001E-05  0.178146131028351987E-01
+  0.000000000000000000E+00 -0.435064995135525016E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.193885567877557993E-01 -0.125634406896881997E-02
+ -0.435073788928811995E-03 -0.118148100749493009E-02  0.194636167020634000E-01
+ -0.139670744060081990E-03 -0.166666666666667011E-01  0.172964393838881993E-01
+ -0.239719201961261003E-07 -0.141639877103680014E-03 -0.121306498448360998E-05
+  0.178147059625157998E-01 -0.435069391187604992E-03 -0.166666666666667011E-01
+  0.193887118630177990E-01 -0.125641385907748998E-02 -0.435069392938132991E-03
+ -0.118148619756011002E-02  0.194636395637266015E-01 -0.166666666666667011E-01
+  0.172965077542917012E-01 -0.927568247376278936E-07  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.102046200973755008E-05  0.173051339160060005E-01
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.193887977549710017E-01
+ -0.125650168627743997E-02 -0.118157208932238010E-02  0.194637273908665996E-01
+ -0.166666666666667011E-01 -0.435069390872567011E-03  0.193884418484903988E-01
+ -0.125613776548786000E-02 -0.435069393253171026E-03 -0.118121619377422010E-02
+  0.194633634734872989E-01 -0.139670761455125999E-03 -0.166666666666667011E-01
+  0.172964743993230009E-01 -0.239698237345604007E-07 -0.141639617491158998E-03
+ -0.121288294467888995E-05  0.178147057813337999E-01 -0.141639617495969007E-03
+  0.178147057813342995E-01 -0.121288294511917005E-05 -0.166666666666667011E-01
+ -0.139670761455959995E-03 -0.239698237435694994E-07  0.172964743993230009E-01
+ -0.166666666666667011E-01 -0.435069391186033983E-03  0.193887118629635993E-01
+ -0.125641385905383009E-02 -0.435069392939704000E-03 -0.118148619755951002E-02
+  0.194636395637197007E-01 -0.166666666666667011E-01  0.172965077542917012E-01
+ -0.927568247375443947E-07  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.102046200973782007E-05  0.173051339160060005E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.139674680709096002E-03
+  0.000000000000000000E+00  0.173041376249854009E-01 -0.235410513736254011E-07
+ -0.141584640574228987E-03 -0.117464392253343004E-05  0.178146677207011994E-01
+ -0.166666666666667011E-01  0.193887977549710017E-01 -0.125650168627745992E-02
+ -0.118157208932238010E-02  0.194637273908665996E-01 -0.139674680709097004E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.173041376249854009E-01 -0.235410513736612015E-07 -0.141584640574232999E-03
+ -0.117464392253384995E-05  0.178146677207011994E-01 -0.139700009140160993E-03
+ -0.166666666666667011E-01  0.172964151618227001E-01 -0.947777749626817048E-08
+ -0.141286602860218007E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01  0.000000000000000000E+00 -0.427075077777844986E-06
+  0.177495996071506991E-01 -0.166666666666667011E-01 -0.139670794784869993E-03
+  0.172964386794358001E-01 -0.239725105714824009E-07 -0.141640177118637010E-03
+ -0.121309559766888995E-05  0.178147059913013987E-01 -0.166666666666667011E-01
+ -0.435069295710136016E-03  0.193887241621244003E-01 -0.125642846904032001E-02
+ -0.435069488415573019E-03 -0.118150189715466995E-02  0.194636550484258014E-01
+ -0.140349307522104008E-03  0.178144320174323002E-01 -0.933958710839114049E-06
+ -0.166666666666667011E-01 -0.140349299171436005E-03 -0.933929273088272012E-06
+  0.178144299843184008E-01 -0.166666666666667011E-01 -0.435068659479091984E-03
+  0.193848776262358011E-01 -0.125250842953309989E-02 -0.435070124644940006E-03
+ -0.117767775442942997E-02  0.194597409762624005E-01 -0.166666666666667011E-01
+  0.172969944928271988E-01 -0.779115314441779059E-07 -0.166666666666667011E-01
+ -0.755324939016156960E-05  0.179004996942138013E-01 -0.166666666666667011E-01
+  0.178976999748929992E-01 -0.387667548981126021E-05 -0.233628102656007991E-06
+  0.178135930268625009E-01 -0.166666666666667011E-01 -0.139686009556357012E-03
+  0.172964370559683007E-01 -0.241363115163189999E-07 -0.141728010347422987E-03
+ -0.122115135534191003E-05  0.178147135015842006E-01 -0.435039396134656023E-03
+ -0.166666666666667011E-01  0.193948601305157994E-01 -0.126336416797764991E-02
+  0.000000000000000000E+00 -0.435099385138650019E-03 -0.118873902082604994E-02
+  0.194708368874158995E-01 -0.166666666666667011E-01 -0.435069389255753991E-03
+  0.193884422504904987E-01 -0.125613821100521001E-02 -0.435069394869983992E-03
+ -0.118121665337718993E-02  0.194633639337775007E-01 -0.166666666666667011E-01
+ -0.139671380124092994E-03  0.172964743448792008E-01 -0.239764688220868006E-07
+ -0.141643188367998986E-03 -0.121320975071519003E-05  0.178147060922408014E-01
+ -0.141639844172888998E-03  0.178147058017989017E-01 -0.121290395200246995E-05
+ -0.166666666666667011E-01 -0.139670800627354991E-03 -0.239702482700813011E-07
+  0.172964743394126014E-01 -0.166666666666667011E-01 -0.435068235431626006E-03
+  0.193889053227481005E-01 -0.125663654535791008E-02 -0.435070548689864018E-03
+ -0.118172116532760008E-02  0.194638725033138005E-01 -0.161678337891911993E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.172964935301388005E-01
+ -0.110214153508716999E-06 -0.162422470311612014E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.120622187566448997E-05  0.173052150067069012E-01
+  0.000000000000000000E+00 -0.139675704206961997E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.173040340098522989E-01 -0.235576501589924013E-07
+ -0.141591382930956008E-03 -0.117568258902886992E-05  0.178146687262424989E-01
+ -0.435067411942580010E-03 -0.166666666666667011E-01  0.193886625538398985E-01
+ -0.125640355926819004E-02 -0.435071372170706019E-03 -0.118150457761581010E-02
+  0.194636501951265008E-01 -0.166666666666667011E-01 -0.435069317670580980E-03
+  0.193887257892274983E-01 -0.125642966436278992E-02 -0.435069466455139981E-03
+ -0.118150274119342008E-02  0.194636560434883997E-01 -0.161601801934664006E-03
+ -0.166666666666667011E-01  0.172965054748792993E-01 -0.926502681628198944E-07
+ -0.162351306455949001E-03  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.101988138676949999E-05
+  0.173051393629537996E-01  0.000000000000000000E+00 -0.139695639988150999E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.173041438368814991E-01
+ -0.237586532645376999E-07 -0.141701626147540003E-03 -0.118517343244098004E-05
+  0.178146776002247008E-01 -0.435031309394906995E-03 -0.166666666666667011E-01
+  0.193945788398839983E-01 -0.126324081474057004E-02 -0.435107470132360024E-03
+ -0.118873511395483993E-02  0.194708001084720993E-01 -0.435268203178909021E-03
+ -0.166666666666667011E-01  0.194803280841454006E-01 -0.120221471825894999E-02
+ -0.434870455402753989E-03 -0.126728100453280998E-02  0.194000950994309002E-01
+ -0.166666666666667011E-01  0.172950455023187996E-01 -0.209695226067755011E-07
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.104309478839939992E-05
+  0.178040398395854990E-01  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.173086281051332003E-01 -0.234055546297331013E-07 -0.116921175366159004E-05
+  0.178149613334783012E-01 -0.166666666666667011E-01 -0.141165531303238012E-03
+  0.172962108365329013E-01 -0.473445694664380988E-05 -0.544660594244451043E-03
+ -0.346550124406059992E-04  0.183333673483180998E-01 -0.139754726222278999E-03
+ -0.166666666666667011E-01  0.176170084219189997E-01 -0.246149781576928007E-07
+ -0.140206540519620008E-03 -0.567788699156871954E-06  0.178140588497450007E-01
+ -0.165095492820405989E-03  0.178223326710732002E-01 -0.681196695094317026E-05
+ -0.166666666666667011E-01 -0.150450799069272987E-03 -0.294765827682543002E-06
+  0.176166680355560011E-01 -0.166666666666667011E-01 -0.421169440873413019E-03
+  0.189193905894523007E-01 -0.945436157371731009E-03 -0.448291280314589975E-03
+ -0.100548420996315999E-02  0.192537236307863992E-01 -0.166666666666667011E-01
+  0.173199119681975988E-01 -0.235984477489150007E-07  0.000000000000000000E+00
+ -0.166666666666667011E-01  0.000000000000000000E+00 -0.123676178966298004E-05
+  0.178172740173286014E-01 -0.166666666666667011E-01 -0.435171428461128974E-03
+  0.194709389729267983E-01 -0.119092785894799004E-02 -0.434967322587667975E-03
+ -0.126074169224678996E-02  0.193933111644412001E-01 -0.166666666666667011E-01
+  0.172958818582512990E-01 -0.240054389194565998E-07 -0.121597096057541996E-05
+  0.178147088549035007E-01 -0.434490077563752020E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.192819871000558009E-01
+ -0.115810409202821997E-02 -0.435647599072524984E-03 -0.109302488139546004E-02
+  0.193702791629623006E-01 -0.435071227105071981E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.194623470555931009E-01 -0.118137412384183003E-02
+ -0.435067557009957988E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.000000000000000000E+00 -0.125600599101201998E-02  0.193876331668423017E-01
+ -0.166666666666667011E-01 -0.139670755029073005E-03  0.172964393832697010E-01
+ -0.239720378568450991E-07 -0.141639940417128004E-03 -0.121307076941768003E-05
+  0.178147059675646015E-01 -0.166666666666667011E-01 -0.435069370632784017E-03
+  0.193887126423957987E-01 -0.125641513719206011E-02 -0.435069413492952990E-03
+ -0.118148776859817008E-02  0.194636409705596994E-01 -0.140350151817307989E-03
+  0.178144321253345984E-01 -0.934063912396124954E-06 -0.166666666666667011E-01
+ -0.140350134684648006E-03 -0.934057299894995007E-06  0.178144316718926998E-01
+ -0.166666666666667011E-01 -0.435068157528115975E-03  0.193838368298109992E-01
+ -0.125145454139508993E-02 -0.435070626592773998E-03 -0.117665452507550007E-02
+  0.194586918508348992E-01 -0.166666666666667011E-01  0.172970581684627998E-01
+ -0.692387039594140975E-18 -0.166666666666667011E-01 -0.116362368857371003E-16
+  0.173226956581742017E-01 -0.166666666666667011E-01  0.173227088035448999E-01
+ -0.227142423077076990E-07 -0.109342148655364995E-05  0.178145868761214014E-01
+ -0.166666666666667011E-01 -0.139672882575824002E-03  0.172964387861730996E-01
+ -0.239948730000151010E-07 -0.141652224900244998E-03 -0.121419432935652996E-05
+  0.178147068681758017E-01 -0.435063160018721003E-03 -0.166666666666667011E-01
+  0.193888839247167016E-01 -0.125673333404555001E-02  0.000000000000000000E+00
+ -0.435075623983676026E-03 -0.118189505882440008E-02  0.194640013018383996E-01
+ -0.166666666666667011E-01 -0.435069389648734019E-03  0.000000000000000000E+00
+  0.193884419847699004E-01 -0.125613793152611996E-02 -0.435069394477004018E-03
+ -0.118121637448865007E-02  0.194633636498748007E-01 -0.166666666666667011E-01
+ -0.139670964419093003E-03  0.172964743771341009E-01 -0.239720016692284015E-07
+ -0.141640788952219994E-03 -0.121299004622935002E-05  0.178147058760876990E-01
+ -0.141639744525080005E-03  0.178147057920136007E-01 -0.121289465915329990E-05
+ -0.166666666666667011E-01 -0.139670783426588003E-03 -0.239700609664733988E-07
+  0.172964743758160996E-01 -0.166666666666667011E-01 -0.435069015984241008E-03
+  0.193887360934393016E-01 -0.125644691154631004E-02 -0.435069768141048007E-03
+ -0.118152418161517991E-02  0.194636756633727002E-01 -0.161605811424037006E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.172965033247536003E-01
+ -0.935932238155109053E-07 -0.162355789101947996E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.103050516658616990E-05  0.173051502951346997E-01
+  0.000000000000000000E+00 -0.139674971988557002E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.173041442833562001E-01 -0.235436840180659000E-07
+ -0.141586201352507011E-03 -0.117475642480550998E-05  0.178146678139698006E-01
+ -0.435068868942317015E-03 -0.166666666666667011E-01  0.193886960011524015E-01
+ -0.125640827839761994E-02 -0.435069915182551979E-03 -0.118148826831425001E-02
+  0.194636394345385003E-01 -0.166666666666667011E-01 -0.435069350378355024E-03
+  0.193887155593089992E-01 -0.125641852657909004E-02 -0.435069433747378026E-03
+ -0.118149137295133996E-02  0.194636445794403000E-01 -0.161601725677973997E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.172965069829389001E-01
+ -0.926650122837621983E-07 -0.162351498594983989E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.102023342811811993E-05  0.173051439965640000E-01
+  0.000000000000000000E+00 -0.139679244153132989E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.173041479614229991E-01 -0.235877316475454016E-07
+ -0.141610020722497009E-03 -0.117688204166686002E-05  0.178146696122764014E-01
+ -0.435061149026493986E-03 -0.166666666666667011E-01  0.193888142839695986E-01
+ -0.125670280436314006E-02 -0.435077634883449980E-03 -0.118189407884959998E-02
+  0.194639921507386016E-01 -0.415397337630780993E-03 -0.166666666666667011E-01
+  0.188278985006655984E-01 -0.898332650045155024E-03 -0.453392799083327013E-03
+ -0.100006104572014989E-02  0.192410591480840984E-01 -0.166666666666667011E-01
+  0.173196351833987995E-01 -0.236101000931807013E-07  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.123805476906965002E-05
+  0.178172764972891993E-01 -0.166666666666667011E-01  0.172958803102313992E-01
+ -0.240055324643047990E-07 -0.121597905454717007E-05  0.178147088629614994E-01
+ -0.166666666666667011E-01 -0.139361332584411003E-03  0.172963780471655990E-01
+ -0.206406041294607990E-07 -0.139853947917575004E-03 -0.104886506051046009E-05
+  0.178144680355106985E-01 -0.482519288981585995E-03 -0.166666666666667011E-01
+  0.184437042794661984E-01 -0.182960367117892994E-03 -0.375083574717954013E-03
+ -0.210197986405779994E-03  0.183325256678834002E-01 -0.143207812762676998E-03
+  0.178148150557865988E-01 -0.128505524246536990E-05 -0.166666666666667011E-01
+ -0.199761155404034002E-03 -0.199498340007143991E-05  0.178294573883331017E-01
+ -0.166666666666667011E-01 -0.433603776206709980E-03  0.193497604591337004E-01
+ -0.124503388545287009E-02 -0.436528045694049990E-03 -0.119072564830830993E-02
+  0.194675615509681994E-01 -0.166666666666667011E-01  0.172951192308572997E-01
+ -0.214703230748565997E-07  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.107141838335927994E-05  0.178058595670048010E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.434603197488181014E-03
+  0.000000000000000000E+00  0.193058537142562006E-01 -0.118063268562849000E-02
+ -0.435534875266554988E-03 -0.111369468800605006E-02  0.193919603585517991E-01
+ -0.166666666666667011E-01  0.173056689022044012E-01 -0.235558337256563013E-07
+ -0.118363055389575001E-05  0.178149857852359997E-01 -0.435171466645126008E-03
+ -0.166666666666667011E-01  0.194709518251613006E-01 -0.119094077856059005E-02
+ -0.434967284379077007E-03 -0.126075351291434000E-02  0.193933220870970008E-01
+ -0.435071725054391020E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.194634518078929016E-01 -0.118146952991411008E-02 -0.435067059054058004E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.000000000000000000E+00
+ -0.125632584199591004E-02  0.193884910636192009E-01 -0.166666666666667011E-01
+ -0.139670745136767009E-03  0.172964396823254013E-01 -0.239719121263430998E-07
+ -0.141639880244726003E-03 -0.121306391322725994E-05  0.178147059609039017E-01
+ -0.166666666666667011E-01 -0.140348087747312988E-03  0.178144318030939997E-01
+ -0.933800582768691986E-06 -0.140348087981950005E-03 -0.933801410703608978E-06
+  0.178144318602838012E-01 -0.435069394684114985E-03  0.194636364171987992E-01
+ -0.118148312778892993E-02 -0.166666666666667011E-01 -0.435069389441622998E-03
+ -0.125641069437981007E-02  0.193887087332374995E-01 -0.166666666666667011E-01
+ -0.139670745366996012E-03  0.172964393825588009E-01 -0.239719343252174001E-07
+ -0.141639884660971991E-03 -0.121306568223363996E-05  0.178147059632162985E-01
+ -0.166666666666667011E-01  0.193887976553534999E-01 -0.125650164603376005E-02
+ -0.166666666666667011E-01 -0.118157208059822003E-02  0.194637273034572005E-01
+ -0.166666666666667011E-01  0.193887977225202998E-01 -0.125650167205010994E-02
+ -0.118157208887189995E-02  0.194637273866554994E-01 -0.166666666666667011E-01
+ -0.435069390858170975E-03  0.193887118569476997E-01 -0.125641385953187998E-02
+ -0.435069393267567008E-03 -0.118148620273378011E-02  0.194636395676883005E-01
+ -0.139674686768931008E-03 -0.166666666666667011E-01  0.173041492305231991E-01
+ -0.235404449719397016E-07  0.000000000000000000E+00 -0.141584559768768005E-03
+ -0.117458831050516992E-05  0.178146676653484007E-01 -0.166666666666667011E-01
+ -0.139670743889800991E-03  0.000000000000000000E+00  0.172964393844355011E-01
+ -0.239719183336094004E-07 -0.141639876115250000E-03 -0.121306489168701010E-05
+  0.178147059624313986E-01 -0.166666666666667011E-01 -0.140348087304844010E-03
+  0.178144318418145993E-01 -0.933801043769264029E-06 -0.140348087380234008E-03
+ -0.933801309788676977E-06  0.178144318601899006E-01 -0.435069392936427975E-03
+  0.194636364108725993E-01 -0.118148311418031005E-02 -0.166666666666667011E-01
+ -0.435069391189310008E-03 -0.125641070624249993E-02  0.193887087796953995E-01
+ -0.166666666666667011E-01 -0.435069391192276006E-03  0.193887815134154012E-01
+ -0.125648507879496003E-02 -0.435069392933461977E-03 -0.118155584780088997E-02
+  0.194637107833945007E-01 -0.139700198050289988E-03 -0.166666666666667011E-01
+  0.172964151566850008E-01 -0.947849280649777055E-08 -0.141287467726875003E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.427106383900035001E-06  0.177495992954622012E-01  0.000000000000000000E+00
+ -0.140178145665099013E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.177492248107081994E-01 -0.496797031657574988E-07 -0.140415043450978997E-03
+ -0.517779495462221951E-06  0.178140139541173997E-01 -0.435069391108214992E-03
+ -0.166666666666667011E-01  0.193887087862484007E-01 -0.125641071469571007E-02
+ -0.435069393017522990E-03 -0.118148312365616006E-02  0.194636364200341007E-01
+ -0.166666666666667011E-01 -0.139670745360696987E-03  0.172964393844495003E-01
+ -0.239719341341385992E-07 -0.141639884605172008E-03 -0.121306566857870991E-05
+  0.178147059632026983E-01 -0.435069390272270994E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01  0.193887086899636012E-01 -0.125641067599288007E-02
+ -0.435069393853468019E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.118148311480717992E-02  0.194636363313261986E-01  0.000000000000000000E+00
+ -0.435069390212755017E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.193887087459734993E-01 -0.125641069148615990E-02 -0.435069393912983020E-03
+ -0.118148311376017998E-02  0.194636364065413001E-01 -0.139670743899566996E-03
+ -0.166666666666667011E-01  0.172964393844273999E-01 -0.239719184390481014E-07
+ -0.141639876171704000E-03 -0.121306489688977996E-05  0.178147059624365993E-01
+ -0.435069391188922026E-03 -0.166666666666667011E-01  0.193887118630630996E-01
+ -0.125641385909724002E-02 -0.435069392936816011E-03 -0.118148619756053004E-02
+  0.194636395637322983E-01 -0.166666666666667011E-01  0.172965077542917012E-01
+ -0.927568247378449987E-07  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.102046200973719009E-05  0.173051339160060005E-01  0.000000000000000000E+00
+ -0.166666666666667011E-01  0.193887977549708004E-01 -0.125650168627737991E-02
+ -0.118157208932236990E-02  0.194637273908664990E-01 -0.166666666666667011E-01
+ -0.435069391170655984E-03  0.193884418583357004E-01 -0.125613776952834995E-02
+ -0.435069392955081998E-03 -0.118121619344725010E-02  0.194633634743562010E-01
+ -0.139670761454458998E-03 -0.166666666666667011E-01  0.172964743993847016E-01
+ -0.239698237234471001E-07 -0.141639617486673003E-03 -0.121288294398923001E-05
+  0.178147057813330990E-01 -0.141639617486714989E-03  0.178147057813330990E-01
+ -0.121288294399298003E-05 -0.166666666666667011E-01 -0.139670761454465991E-03
+ -0.239698237234977997E-07  0.172964743993847016E-01 -0.166666666666667011E-01
+ -0.435069391188907986E-03  0.193887118630626000E-01 -0.125641385909704010E-02
+ -0.435069392936829997E-03 -0.118148619756053004E-02  0.194636395637322011E-01
+ -0.166666666666667011E-01  0.172965077542917012E-01 -0.927568247378397048E-07
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.102046200973721995E-05
+  0.173051339160060005E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.139674680709099010E-03  0.000000000000000000E+00
+  0.173041376249854009E-01 -0.235410513737083011E-07 -0.141584640574244004E-03
+ -0.117464392253482002E-05  0.178146677207011994E-01 -0.166666666666667011E-01
+  0.193887977549708004E-01 -0.125650168627737991E-02 -0.118157208932236990E-02
+  0.194637273908664990E-01 -0.139674680709099010E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.173041376249854009E-01
+ -0.235410513737084004E-07 -0.141584640574244004E-03 -0.117464392253482002E-05
+  0.178146677207011994E-01 -0.161601720191613987E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.172965076669214998E-01 -0.926701710590744007E-07
+ -0.162351552424590994E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.000000000000000000E+00 -0.102033061970779005E-05  0.173051453907301991E-01
+ -0.166666666666667011E-01 -0.140025227157220011E-03  0.172964538609525001E-01
+ -0.280836077054835003E-07 -0.143700853603787992E-03 -0.141493042418215998E-05
+  0.178149166877635996E-01 -0.166666666666667011E-01 -0.434397138667670006E-03
+  0.193663283421353014E-01 -0.124681061178207996E-02 -0.435740193435873980E-03
+ -0.118155329481477991E-02  0.194610974716315013E-01 -0.435888399868572985E-03
+  0.193018909827777987E-01 -0.102736762162752003E-02 -0.166666666666667011E-01
+ -0.434248105255362999E-03 -0.108835030296548004E-02  0.192105826235978994E-01
+ -0.166666666666667011E-01 -0.142766725250475000E-03  0.173186084279148000E-01
+ -0.107223373775717997E-06 -0.158331602080537010E-03 -0.502583390911269042E-05
+  0.178188218385040009E-01 -0.166666666666667011E-01  0.191887710237797007E-01
+ -0.116955156282324006E-02 -0.166666666666667011E-01 -0.115321249659207001E-02
+  0.193095009677901989E-01 -0.166666666666667011E-01  0.192716415760125007E-01
+ -0.120400166465171009E-02 -0.117888093553793010E-02  0.194469246141813004E-01
+ -0.166666666666667011E-01 -0.139668701380672987E-03  0.172963839829421012E-01
+ -0.239535905300788984E-07 -0.141628653565274010E-03 -0.121228849838231003E-05
+  0.178147051859353991E-01 -0.435070871110831980E-03 -0.166666666666667011E-01
+  0.194629509412433989E-01 -0.118084397985067997E-02  0.000000000000000000E+00
+ -0.435067913007957025E-03 -0.125568351559249996E-02  0.193879990665461986E-01
+ -0.166666666666667011E-01 -0.435069390650039015E-03  0.000000000000000000E+00
+  0.193884418750694017E-01 -0.125613779809562003E-02 -0.435069393475699022E-03
+ -0.118121622922623989E-02  0.194633635071832997E-01 -0.166666666666667011E-01
+ -0.139670790826866987E-03  0.172964743947118006E-01 -0.239701379014204989E-07
+ -0.141639787000618012E-03 -0.121289838766225000E-05  0.178147057914929997E-01
+ -0.141639653189466996E-03  0.178147057839792011E-01 -0.121288622158938009E-05
+ -0.166666666666667011E-01 -0.139670767633548004E-03 -0.239698902061346000E-07
+  0.172964743947353998E-01 -0.166666666666667011E-01 -0.435069338865947026E-03
+  0.193887012207536007E-01 -0.125640423249911005E-02 -0.435069445259782012E-03
+ -0.118147761261617004E-02  0.194636302225951012E-01 -0.161594071152451998E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.172965071943530987E-01
+ -0.911198389499341023E-07 -0.162344730250092007E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.100404518995791990E-05  0.173051412553598995E-01
+  0.000000000000000000E+00 -0.139674707461267010E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.173041614291120985E-01 -0.235399540925235988E-07
+ -0.141584554759037012E-03 -0.117453693858401004E-05  0.178146676069954012E-01
+ -0.435069368728934001E-03 -0.166666666666667011E-01  0.193887057094737983E-01
+ -0.125640804230569007E-02 -0.435069415396801976E-03 -0.118148083984231998E-02
+  0.194636339567480984E-01 -0.166666666666667011E-01 -0.435069379906770002E-03
+  0.193887120222560987E-01 -0.125641427645285992E-02 -0.435069404218967981E-03
+ -0.118148677817681990E-02  0.194636400648426008E-01 -0.161601688606437998E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.172965075862324992E-01
+ -0.926646811008730053E-07 -0.162351530879915987E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.102027850567167008E-05  0.173051453753680015E-01
+  0.000000000000000000E+00 -0.139674509841842008E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.173041492584409991E-01 -0.235385724457535989E-07
+ -0.141583570403258998E-03 -0.117449718296302996E-05  0.178146674600714004E-01
+ -0.435069784091323016E-03 -0.166666666666667011E-01  0.194629197189570002E-01
+ -0.118078836231266999E-02 -0.435069000033927022E-03 -0.125568468813201011E-02
+  0.193879970350075009E-01 -0.371043793435626003E-03 -0.166666666666667011E-01
+  0.180438947424557995E-01 -0.288534286733046986E-03 -0.486290336507850022E-03
+ -0.560900198964180987E-03  0.187957070663157012E-01 -0.166666666666667011E-01
+  0.177374439856716991E-01 -0.214133613745063999E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.267021979533145016E-03
+  0.179665664602573008E-01 -0.166666666666667011E-01  0.193465910019366000E-01
+ -0.123791262554833999E-02 -0.118077342883706003E-02  0.194573127658026999E-01
+ -0.166666666666667011E-01 -0.139258097457669987E-03  0.172965222480018985E-01
+ -0.139396279238899008E-07 -0.139258097457669987E-03 -0.716212299733630966E-06
+  0.178142098387169996E-01 -0.448746699171803979E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.187014712985399990E-01 -0.470424366568322023E-03
+ -0.420614096697062990E-03 -0.416028776742435001E-03  0.185800358532766004E-01
+ -0.143647797085317999E-03  0.178142596258036995E-01 -0.784777563840381003E-06
+ -0.166666666666667011E-01 -0.289432939916457006E-03 -0.392675356000534004E-05
+  0.178691669573259014E-01 -0.166666666666667011E-01 -0.433838074751410983E-03
+  0.193348477920875991E-01 -0.122492972126329001E-02 -0.436295777836767011E-03
+ -0.116751654000400999E-02  0.194447290230824014E-01 -0.166666666666667011E-01
+  0.173080151788632987E-01 -0.244742519208693003E-04  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.133996458919445003E-03  0.178636488290151996E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.139757971205888996E-03  0.000000000000000000E+00  0.173348621828142008E-01
+ -0.229201529246951995E-07 -0.141718205587288988E-03 -0.107549290031851995E-05
+  0.178145692111312988E-01 -0.166666666666667011E-01  0.193848567630524014E-01
+ -0.125477293338469996E-02 -0.118151635484324991E-02  0.194632155620537983E-01
+ -0.140442473096456008E-03 -0.166666666666667011E-01  0.173484955878829006E-01
+ -0.294133603610693987E-07 -0.144782161123405996E-03 -0.133037267977017010E-05
+  0.178148273618939992E-01 -0.435071708554150990E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.194637484731423997E-01 -0.118167963393035996E-02
+ -0.435067075554543010E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.000000000000000000E+00 -0.125644827298293993E-02  0.193888013904018990E-01
+ -0.166666666666667011E-01 -0.143972797957216006E-03  0.173505484374359004E-01
+ -0.481359947088405975E-04 -0.558385337699288022E-03 -0.438375372304428026E-03
+  0.187218716475071011E-01 -0.166666666666667011E-01 -0.139937861588156992E-03
+  0.174078894816094010E-01 -0.227986700629730998E-07 -0.141801880833368992E-03
+ -0.908696204756822957E-06  0.178144027207976993E-01 -0.143597614564550997E-03
+  0.178145584123696014E-01 -0.106316430421145007E-05 -0.166666666666667011E-01
+ -0.140417723829495007E-03 -0.268131472654641992E-07  0.174078938787147995E-01
+ -0.166666666666667011E-01 -0.434071246446273990E-03  0.193423068719833988E-01
+ -0.122826345821061989E-02 -0.436064307367708980E-03 -0.116769061492820003E-02
+  0.194456917758701990E-01 -0.166666666666667011E-01  0.172982030065722998E-01
+ -0.280949763883982991E-07 -0.166666666666667011E-01 -0.189517763026067993E-05
+  0.178271977135994013E-01 -0.166666666666667011E-01  0.178265008932220990E-01
+ -0.134657557070520009E-05 -0.925862840555429982E-06  0.178144080134048990E-01
+ -0.166666666666667011E-01 -0.434780858347314024E-03  0.193789566829400996E-01
+ -0.125221572856264000E-02 -0.435357659993625004E-03 -0.118148634697722000E-02
+  0.194624944824026008E-01 -0.139674508339166013E-03 -0.166666666666667011E-01
+  0.173041495432516984E-01 -0.235385803949071996E-07  0.000000000000000000E+00
+ -0.141583560353484997E-03 -0.117449727533117009E-05  0.178146675901048983E-01
+ -0.166666666666667011E-01 -0.139670744554657995E-03  0.172964393825064990E-01
+ -0.239719255799683993E-07 -0.141639879972026009E-03 -0.121306525215112003E-05
+  0.178147059627193002E-01 -0.166666666666667011E-01 -0.140348090038086001E-03
+  0.178144318165064999E-01 -0.933801051164943002E-06 -0.140348090679100993E-03
+ -0.933801691611406014E-06  0.178144318606493005E-01 -0.435069394145568998E-03
+  0.194636343488718000E-01 -0.118148110868380009E-02 -0.166666666666667011E-01
+ -0.435069389980168985E-03 -0.125640863633868995E-02  0.193887067279608985E-01
+ -0.166666666666667011E-01 -0.435069389466851027E-03  0.193887808952679999E-01
+ -0.125648448071444003E-02 -0.435069394658887010E-03 -0.118155528731557990E-02
+  0.194637102049162007E-01 -0.139700012758421009E-03 -0.166666666666667011E-01
+  0.172964152304735004E-01 -0.947803391050105932E-08  0.000000000000000000E+00
+ -0.141286629556396993E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.427087720755819975E-06  0.177496011881496003E-01
+  0.000000000000000000E+00 -0.140176041318299010E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.177492266992976995E-01 -0.496642740357883019E-07
+ -0.140412389918406994E-03 -0.517602927754366013E-06  0.178140137848851986E-01
+ -0.435070897570947004E-03 -0.166666666666667011E-01  0.194636050822587989E-01
+ -0.118148880805278990E-02 -0.435067886547593015E-03 -0.125633531890120007E-02
+  0.193886425635053997E-01 -0.166666666666667011E-01 -0.139669237663479995E-03
+  0.172964396325578015E-01 -0.239557312619153985E-07 -0.141631179766294000E-03
+ -0.121226845353324010E-05  0.178147051921049987E-01 -0.435070355283276994E-03
+ -0.166666666666667011E-01  0.194636498694088986E-01 -0.118156237601487000E-02
+ -0.435068428839514019E-03  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.125638673489450003E-02
+  0.193887513139221999E-01  0.000000000000000000E+00 -0.435071250841686976E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.194636876903557000E-01
+ -0.118157615129436001E-02 -0.435067533273078989E-03 -0.125640800692181000E-02
+  0.193887125978184013E-01 -0.139670568327848998E-03 -0.166666666666667011E-01
+  0.172964396537817998E-01 -0.239700195293086008E-07 -0.141638860134678002E-03
+ -0.121297096046396990E-05  0.178147058843058995E-01 -0.435090508141796988E-03
+ -0.166666666666667011E-01  0.194669892927517990E-01 -0.118530039028566991E-02
+ -0.435048274568398019E-03 -0.125913754523842994E-02  0.193915247098193008E-01
+ -0.166666666666667011E-01  0.172960984745374989E-01 -0.213609391405068995E-07
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.106367312891359006E-05
+  0.178056546914439999E-01  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.173098191165470990E-01 -0.233466429807202985E-07 -0.116342624068746999E-05
+  0.178149495945450001E-01 -0.166666666666667011E-01 -0.435069713252911978E-03
+  0.194633204071541013E-01 -0.118118203409811997E-02 -0.435069070872497979E-03
+ -0.125608526726808992E-02  0.193883924273124987E-01 -0.139659157492490009E-03
+ -0.166666666666667011E-01  0.172964811115927999E-01 -0.238453400472707005E-07
+ -0.141572592558481008E-03 -0.120675012864579991E-05  0.178147008765491000E-01
+ -0.141538194636943009E-03  0.178146978498745015E-01 -0.120361727918045010E-05
+ -0.166666666666667011E-01 -0.139653196804214987E-03 -0.237816257834915005E-07
+  0.172964810525967003E-01 -0.166666666666667011E-01 -0.435101710749684009E-03
+  0.194659176105031993E-01 -0.118451620585209995E-02 -0.435037070057877023E-03
+ -0.125773914717291005E-02  0.193902105162142985E-01 -0.166666666666667011E-01
+  0.172962783354724016E-01 -0.218342668490586010E-07  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.109032448231275997E-05  0.178073744716654991E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.434677413069518002E-03  0.000000000000000000E+00  0.193010409680071000E-01
+ -0.117438144583154004E-02 -0.435460867464983019E-03 -0.110662888765544992E-02
+  0.193848162965538015E-01 -0.166666666666667011E-01  0.173066829056823003E-01
+ -0.235026937709481004E-07 -0.117850668810495007E-05  0.178149746842341007E-01
+ -0.434571088710914019E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.192763238295202005E-01 -0.115086473839517999E-02
+ -0.435566874653087015E-03 -0.108492248964389996E-02  0.193620926343772010E-01
+ -0.161599193400744999E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.172964883975278011E-01 -0.923842913120058025E-07 -0.162350867043699006E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.000000000000000000E+00
+ -0.101863211158590009E-05  0.173051440367491989E-01 -0.166666666666667011E-01
+ -0.435069857911570023E-03  0.194633765819712985E-01 -0.118123987997616003E-02
+ -0.435068926213479003E-03 -0.125613733340555994E-02  0.193884432348929991E-01
+ -0.166666666666667011E-01 -0.139666077646513989E-03  0.172964743925189991E-01
+ -0.239195397112311000E-07 -0.141612586198800003E-03 -0.121041044884479009E-05
+  0.178147032886921998E-01 -0.141624350828127994E-03  0.178147048678200005E-01
+ -0.121149428933203998E-05 -0.166666666666667011E-01 -0.139668115474050994E-03
+ -0.239415570468690011E-07  0.172964744463235016E-01 -0.166666666666667011E-01
+ -0.435074201017785996E-03  0.194493813662804001E-01 -0.116767077284960006E-02
+ -0.435064583034155982E-03 -0.124201442563575998E-02  0.193746726576543987E-01
+ -0.166666666666667011E-01  0.173081303334150990E-01 -0.244868683853623015E-04
+ -0.166666666666667011E-01 -0.133924491696816997E-03  0.178635877956252014E-01
+ -0.166666666666667011E-01  0.173348654764515983E-01 -0.223062641979707998E-07
+ -0.104752868600600992E-05  0.178145411980304008E-01 -0.166666666666667011E-01
+ -0.435078250390669987E-03  0.194615680563809991E-01 -0.117967106973176005E-02
+ -0.435060533485638016E-03 -0.125408416347905010E-02  0.193864744856198992E-01
+ -0.139507036577244005E-03 -0.166666666666667011E-01  0.173052664826232015E-01
+ -0.217638922762432993E-07  0.000000000000000000E+00 -0.140641592938044988E-03
+ -0.108598665496889000E-05  0.178145535889315011E-01 -0.166666666666667011E-01
+ -0.139669898838359002E-03  0.172964394567505994E-01 -0.239628351519636015E-07
+ -0.141634997663802990E-03 -0.121261808844727004E-05  0.178147055141749007E-01
+ -0.166666666666667011E-01 -0.140550312154719988E-03  0.178144664524045003E-01
+ -0.935528049442035979E-06 -0.140352831077276009E-03 -0.934435514417231040E-06
+  0.178144323723362016E-01 -0.435067799539816001E-03  0.193882988824195003E-01
+ -0.125598375468558010E-02 -0.166666666666667011E-01 -0.435070984577867010E-03
+ -0.118114930153955010E-02  0.194632560036250013E-01 -0.166666666666667011E-01
+ -0.435066606879862022E-03  0.193887645192647000E-01 -0.125653020767648002E-02
+ -0.435072177221240990E-03 -0.118164241777987005E-02  0.194637770279468998E-01
+ -0.139525322491481995E-03 -0.166666666666667011E-01  0.172964018029032988E-01
+ -0.889114618565385951E-08  0.000000000000000000E+00 -0.140487892558472998E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.401772060623745995E-06  0.177503217815941985E-01  0.000000000000000000E+00
+ -0.139680840737954994E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.177499929679008003E-01 -0.461429010297917004E-07 -0.139808785077553992E-03
+ -0.474216741479885991E-06  0.178139754223665993E-01 -0.435490707175785019E-03
+ -0.166666666666667011E-01  0.194038024150119011E-01 -0.113136000392566002E-02
+ -0.434647496176783011E-03 -0.118521045340098008E-02  0.193210677155511990E-01
+ -0.166666666666667011E-01 -0.139356047055132996E-03  0.172964913696200991E-01
+ -0.205948470749521995E-07 -0.139855780445013013E-03 -0.104675229678023004E-05
+  0.178145500428797988E-01 -0.435278470720009021E-03 -0.166666666666667011E-01
+  0.194122813378182005E-01 -0.114346492577733008E-02 -0.434860167096455020E-03
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.119664955180343990E-02  0.193456144785291995E-01
+  0.000000000000000000E+00 -0.435507216634916999E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.194759659828977992E-01 -0.120235295974980010E-02
+ -0.434630954199676010E-03 -0.125613199041933006E-02  0.193898784861901013E-01
+ -0.139505276818091995E-03 -0.166666666666667011E-01  0.172964538951550992E-01
+ -0.222158902483321995E-07 -0.140684567793642991E-03 -0.112653151422048998E-05
+  0.178145939457222992E-01 -0.142980894339064004E-03 -0.166666666666667011E-01
+  0.173314540736370995E-01 -0.109865934611584997E-06 -0.158534103555776988E-03
+ -0.501444248842295042E-05  0.178188430592681017E-01 -0.166666666666667011E-01
+  0.191833679361077016E-01 -0.116695965850981001E-02  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.115255810703607004E-02  0.193073017106333011E-01
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.172969314941423015E-01
+ -0.239425929155972008E-07 -0.121051376505055003E-05  0.178147034232610003E-01
+ -0.166666666666667011E-01 -0.435705239999077025E-03  0.190415627118516000E-01
+ -0.780602612534949010E-03 -0.434432059542404973E-03 -0.823587259891796967E-03
+  0.189719852118625000E-01 -0.147270631146331993E-03 -0.166666666666667011E-01
+  0.174669495140968994E-01 -0.100890785221647000E-03 -0.564704614718581016E-03
+ -0.477370877328551027E-03  0.187551209753295985E-01 -0.139439405915424988E-03
+  0.178140970873433002E-01 -0.575941857140058983E-06 -0.166666666666667011E-01
+ -0.139258097457669987E-03 -0.129861811110761993E-07  0.173691987550076005E-01
+ -0.166666666666667011E-01 -0.437167563287824015E-03  0.195228637533087016E-01
+ -0.126248812464285006E-02 -0.432956764514017976E-03 -0.125491318191218006E-02
+  0.193932789336802996E-01 -0.166666666666667011E-01  0.172966662190472999E-01
+ -0.121413434559184010E-06  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.972291627814046991E-05  0.179398884476568016E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.350105792758167024E-03
+  0.000000000000000000E+00  0.179358412564323000E-01 -0.564926759529481039E-05
+ -0.144639853954665995E-03 -0.314581136116586985E-06  0.178138319445949010E-01
+ -0.166666666666667011E-01  0.193703856784436003E-01 -0.124844632146220002E-02
+ -0.118120734927350994E-02  0.194604805412335984E-01 -0.431301131288118977E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.192675040792685999E-01 -0.119991065467065991E-02 -0.438790367540799003E-03
+ -0.117513383093279010E-02  0.194438480854111986E-01 -0.160851119958975013E-03
+ -0.166666666666667011E-01  0.172967403401987005E-01 -0.176667226349783012E-07
+ -0.161653225232298001E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01  0.000000000000000000E+00 -0.204715316608505000E-06
+  0.173054598817215001E-01 -0.166666666666667011E-01 -0.435630521898672989E-03
+  0.193599440196800983E-01 -0.109077905797679009E-02 -0.434507214005793005E-03
+ -0.113826801298030997E-02  0.192764807436608988E-01 -0.166666666666667011E-01
+ -0.139536670190099001E-03  0.173126649368885999E-01 -0.217199105747271010E-07
+ -0.140761521185631995E-03 -0.106837091099441006E-05  0.178145668888614987E-01
+ -0.559739137192076996E-03  0.187342109887928994E-01 -0.453812468447541010E-03
+ -0.166666666666667011E-01 -0.144437867213115988E-03 -0.570571543832924991E-04
+  0.173662394826720017E-01 -0.166666666666667011E-01 -0.141554480273217009E-03
+  0.173902726296936011E-01 -0.612977187708155032E-05 -0.543269256103985979E-03
+ -0.155576690925164998E-04  0.183343607413548014E-01 -0.166666666666667011E-01
+  0.190435224881784007E-01 -0.109812319463792004E-02 -0.166666666666667011E-01
+ -0.111910148431190000E-02  0.192002058717034990E-01 -0.166666666666667011E-01
+  0.191810336689013997E-01 -0.116112314020396998E-02 -0.117547931931091007E-02
+  0.194335300141191998E-01 -0.166666666666667011E-01 -0.435330233904635021E-03
+  0.194710205889397983E-01 -0.119426072742205005E-02 -0.434808333182332976E-03
+ -0.125626108720282010E-02  0.193894446405454002E-01 -0.140176366522793013E-03
+ -0.166666666666667011E-01  0.177492265794550012E-01 -0.496668406936042969E-07
+  0.000000000000000000E+00 -0.140412799232444004E-03 -0.517630044729356947E-06
+  0.178140137898683999E-01 -0.166666666666667011E-01 -0.435069391185710023E-03
+  0.193884418604889988E-01 -0.125613777143264004E-02 -0.435069392940028014E-03
+ -0.118121619509623009E-02  0.194633634760910008E-01 -0.166666666666667011E-01
+ -0.139670763696215990E-03  0.172964743991942983E-01 -0.239698478178091986E-07
+ -0.141639630426265001E-03 -0.121288412910185994E-05  0.178147057825197991E-01
+ -0.141639615804873989E-03  0.178147057811398994E-01 -0.121288279038434004E-05
+ -0.166666666666667011E-01 -0.139670761162715996E-03 -0.239698205914474004E-07
+  0.172964743991641003E-01 -0.166666666666667011E-01 -0.435069386965028005E-03
+  0.193887089999775983E-01 -0.125641102065624996E-02 -0.435069397160709978E-03
+ -0.118148348379683993E-02  0.194636367651009994E-01 -0.139670105523366001E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.172964392990828005E-01
+ -0.239365198406219005E-07 -0.141635362805994999E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.121107464035148993E-05  0.178146126504961991E-01
+  0.000000000000000000E+00 -0.435064986808901988E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.193885562632203010E-01 -0.125634370176468005E-02
+ -0.435073797255201974E-03 -0.118148076820924009E-02  0.194636164222987997E-01
+ -0.139670495544523001E-03 -0.166666666666667011E-01  0.172964394121789995E-01
+ -0.239692490633551009E-07 -0.141638442382618007E-03 -0.121293358252935993E-05
+  0.178147058331974001E-01 -0.166666666666667011E-01 -0.435069391760568977E-03
+  0.193887087038398000E-01 -0.125641061773689995E-02 -0.435069392365169006E-03
+ -0.118148301966637995E-02  0.194636363154463005E-01 -0.139670535416434004E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.172964393658350986E-01
+ -0.239411949136597000E-07 -0.141637844578005988E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.121130492256924999E-05  0.178146130952495999E-01
+  0.000000000000000000E+00 -0.435064996378640002E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.193885572616325011E-01 -0.125634453230068005E-02
+ -0.435073787685731975E-03 -0.118148144433873992E-02  0.194636171468661988E-01
+ -0.139670065580594011E-03 -0.166666666666667011E-01  0.172964393185634988E-01
+ -0.239646313460681002E-07 -0.141635961417651987E-03 -0.121270668854361004E-05
+  0.178147055904161987E-01 -0.434952598750324988E-03 -0.166666666666667011E-01
+  0.193531525591995983E-01 -0.122237998046724992E-02 -0.435186141504625024E-03
+ -0.114985437531919007E-02  0.194307215840397000E-01 -0.166666666666667011E-01
+  0.173112910791177986E-01 -0.253865277378717003E-04  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.138259363976047007E-03  0.178692219185750990E-01
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.193634106903348008E-01
+ -0.124536794862644003E-02 -0.118108829657000993E-02  0.194595580860945985E-01
+ -0.166666666666667011E-01 -0.435070677392060000E-03  0.194631053165025998E-01
+ -0.118099345597813001E-02 -0.435068106728430011E-03 -0.125584229113298996E-02
+  0.193881574845413997E-01 -0.139730287993371005E-03 -0.166666666666667011E-01
+  0.172965129995373001E-01 -0.246134321847894014E-07 -0.141982715458390012E-03
+ -0.124443001739989997E-05  0.178147384368371013E-01 -0.141918001872325990E-03
+  0.178147321831834994E-01 -0.123839758886950008E-05 -0.166666666666667011E-01
+ -0.139719073147956993E-03 -0.244906574666029988E-07  0.172965128605244989E-01
+ -0.166666666666667011E-01 -0.434974886831416001E-03  0.193565238209871988E-01
+ -0.122539181987882991E-02 -0.435163868606227981E-03 -0.115249078341496999E-02
+  0.194335198475120013E-01 -0.166666666666667011E-01  0.173108110594775988E-01
+ -0.252486229488127016E-04  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.137635746082909991E-03  0.178684051236191015E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.140103968298785998E-03
+  0.000000000000000000E+00  0.173357792454707990E-01 -0.263227882027079996E-07
+ -0.143405882709686988E-03 -0.122767817853369000E-05  0.178147220760386006E-01
+ -0.166666666666667011E-01  0.193641798191389013E-01 -0.124570775063777002E-02
+ -0.118110177773261002E-02  0.194596602746646002E-01 -0.140117598274002998E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.173359378634117996E-01 -0.264605452289240990E-07 -0.143470065352009998E-03
+ -0.123349701123618009E-05  0.178147283869216008E-01 -0.139699946083262993E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.172964334581704009E-01
+ -0.947724450009741998E-08 -0.141286163109395011E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01  0.000000000000000000E+00 -0.427036318179888015E-06
+  0.177496010208582009E-01 -0.166666666666667011E-01 -0.435115619333969008E-03
+  0.188851999358167003E-01 -0.625665684806612967E-03 -0.435023156520779992E-03
+ -0.676915827627737983E-03  0.188324700803031986E-01 -0.166666666666667011E-01
+ -0.139445450795412993E-03  0.174062618030441998E-01 -0.190292129986386985E-07
+ -0.139963260485348001E-03 -0.765758079538392050E-06  0.178142630176348002E-01
+ -0.532898920971015019E-03  0.187657876890418990E-01 -0.512856477472066019E-03
+ -0.166666666666667011E-01 -0.274567299328894003E-03 -0.183874244209534995E-03
+  0.177330719708947010E-01 -0.166666666666667011E-01 -0.142934406904232013E-03
+  0.173279989505247985E-01 -0.109940048199017001E-06 -0.158611614219584011E-03
+ -0.505928267052957010E-05  0.178189142473271986E-01 -0.166666666666667011E-01
+  0.191837857182392997E-01 -0.116715933513682007E-02 -0.166666666666667011E-01
+ -0.115261551048108000E-02  0.193075009319374005E-01 -0.166666666666667011E-01
+  0.192715504217954997E-01 -0.120392392385177999E-02 -0.117898318247606007E-02
+  0.194477863404796010E-01 -0.166666666666667011E-01 -0.435541716493012012E-03
+  0.194766610463366005E-01 -0.120362759000766000E-02 -0.434596353391778010E-03
+ -0.125584159818315009E-02  0.193897157787361017E-01 -0.435053772892073011E-03
+ -0.166666666666667011E-01  0.193916866829711015E-01 -0.125980517520123993E-02
+  0.000000000000000000E+00 -0.435085010459558010E-03 -0.118504095738582995E-02
+  0.194671524144862003E-01 -0.166666666666667011E-01 -0.139670776269951012E-03
+  0.172964393783541989E-01 -0.239722661869433015E-07 -0.141640063065600003E-03
+ -0.121308200605217006E-05  0.178147059781510013E-01 -0.166666666666667011E-01
+ -0.435069330991406000E-03  0.193887306635154993E-01 -0.125643445841802995E-02
+ -0.435069453134320002E-03 -0.118150726982913000E-02  0.194636605867374995E-01
+ -0.140346027430110010E-03  0.178144315994148983E-01 -0.933532497936932975E-06
+ -0.166666666666667011E-01 -0.140345973839162006E-03 -0.933513688391224014E-06
+  0.178144303098709995E-01 -0.166666666666667011E-01 -0.139688634719759995E-03
+  0.172964363725470992E-01 -0.241646447299009009E-07 -0.141743168416124011E-03
+ -0.122254521804739008E-05  0.178147146809073016E-01 -0.435059870297027000E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.193910369733675987E-01
+ -0.125956508747497001E-02 -0.435078913540753987E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.118493531740329009E-02  0.194660938610287000E-01
+  0.000000000000000000E+00 -0.435045710174179019E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.193875426795061985E-01 -0.125569402526470994E-02
+ -0.435093072169510985E-03 -0.118112078544253991E-02  0.194631751705502017E-01
+ -0.139419879842034002E-03 -0.166666666666667011E-01  0.172964832727320016E-01
+ -0.213161584771942993E-07 -0.140230214929088989E-03 -0.108242332728665990E-05
+  0.178145871202937990E-01 -0.166666666666667011E-01 -0.435070648727867019E-03
+  0.194635173256733005E-01 -0.118139703309711993E-02 -0.435068135392855987E-03
+ -0.125625483745250995E-02  0.193885626932315015E-01 -0.139434374869669994E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.172964444175748994E-01
+ -0.849107878174372025E-08 -0.140068522154330002E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.383811097196054990E-06  0.177500579653602011E-01
+  0.000000000000000000E+00 -0.140208513794569996E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.177497183126398006E-01 -0.504600381193170004E-07
+ -0.140451251998329987E-03 -0.520324482162036022E-06  0.178140160927331009E-01
+ -0.435048849909038004E-03 -0.166666666666667011E-01  0.193915158406327004E-01
+ -0.125973026995557995E-02 -0.435089932877583986E-03 -0.118503856970240008E-02
+  0.194671300235447001E-01 -0.435388199753574995E-03 -0.166666666666667011E-01
+  0.194580790950913997E-01 -0.118263497490539004E-02 -0.434750258397955996E-03
+ -0.124178531210186001E-02  0.193754724111637015E-01 -0.166666666666667011E-01
+  0.173081480358036992E-01 -0.244895773689186006E-04  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.133922698934494993E-03  0.178635891343980995E-01
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.193849898137051016E-01
+ -0.125483132565947994E-02 -0.118151827368068998E-02  0.194632328928430985E-01
+ -0.166666666666667011E-01 -0.142602728611152993E-03  0.172960572372911001E-01
+ -0.117224517748718006E-06 -0.158587369045204001E-03 -0.567482454003337982E-05
+  0.178189579824735989E-01 -0.139720580884210006E-03 -0.166666666666667011E-01
+  0.177036794314483006E-01 -0.306504126162400988E-07 -0.139932852514187012E-03
+ -0.499375406901578031E-06  0.178139944442497998E-01 -0.546160778925738975E-03
+  0.184421024561669013E-01 -0.894165394996089942E-04 -0.166666666666667011E-01
+ -0.147442848397534993E-03 -0.240246210218283011E-04  0.177137736588957005E-01
+ -0.166666666666667011E-01 -0.400413710403848991E-03  0.186079235147509001E-01
+ -0.750380380436909983E-03 -0.465646679664707009E-03 -0.925379800643584029E-03
+  0.191586075383834986E-01 -0.166666666666667011E-01  0.177029162142693999E-01
+ -0.193968097316907988E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.000000000000000000E+00 -0.245143979550067003E-03  0.179475102930650998E-01
+ -0.166666666666667011E-01 -0.140401212229714989E-03  0.173460971401947009E-01
+ -0.290712390925789007E-07 -0.144639789824670004E-03 -0.132223121416674009E-05
+  0.178148169771682983E-01 -0.166666666666667011E-01  0.193485878399233015E-01
+ -0.123880031430750989E-02 -0.118081271609363995E-02  0.194575794108941998E-01
+ -0.139755721134129007E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.173348680988957005E-01 -0.228988835753830997E-07
+ -0.141707074016602990E-03 -0.107451126571115990E-05  0.178145682418863992E-01
+ -0.139419950330123000E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.172964916757554002E-01 -0.212265241915647985E-07 -0.140190964958966001E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.000000000000000000E+00
+ -0.107696768404692002E-05  0.178142558899385996E-01 -0.166666666666667011E-01
+ -0.143972832788044007E-03  0.173505501235237010E-01 -0.481361186863988992E-04
+ -0.558385355651620026E-03 -0.438374207210451001E-03  0.187218708288803999E-01
+ -0.166666666666667011E-01 -0.139941367226140996E-03  0.174078891209918998E-01
+ -0.228259192699078984E-07 -0.141815000445607008E-03 -0.909747619509022010E-06
+  0.178144035116544995E-01 -0.143597630564741997E-03  0.178145584149138995E-01
+ -0.106316648366329002E-05 -0.166666666666667011E-01 -0.140417726652687994E-03
+ -0.268131811843163990E-07  0.174078935255252004E-01 -0.166666666666667011E-01
+ -0.434071240888807990E-03  0.193423066886313985E-01 -0.122826337417718994E-02
+ -0.436064312888945010E-03 -0.116769060685687994E-02  0.194456917479709016E-01
+ -0.166666666666667011E-01  0.172982030071983997E-01 -0.280949763384092005E-07
+ -0.166666666666667011E-01 -0.189517762455052993E-05  0.178271977135736996E-01
+ -0.166666666666667011E-01  0.178265008932003005E-01 -0.134657556992853991E-05
+ -0.925862840600532052E-06  0.178144080134048990E-01 -0.166666666666667011E-01
+ -0.434776065018225015E-03  0.193791533646124996E-01 -0.125252599578476996E-02
+ -0.435362444422488025E-03 -0.118186375443277993E-02  0.194628395316310994E-01
+ -0.140181958380352012E-03 -0.166666666666667011E-01  0.177492492651218983E-01
+ -0.497367681715309972E-07  0.000000000000000000E+00 -0.140419741913999988E-03
+ -0.518097724412703002E-06  0.178140141000072000E-01 -0.166666666666667011E-01
+ -0.435069320521429986E-03  0.193884456522794985E-01 -0.125614319532225006E-02
+ -0.435069463604292005E-03 -0.118122256043727992E-02  0.194633694896899009E-01
+ -0.166666666666667011E-01 -0.139670889730207013E-03  0.172964735664216003E-01
+ -0.239712525939313013E-07 -0.141640366257597987E-03 -0.121295504750436001E-05
+  0.178147058424521984E-01 -0.141657676433903007E-03  0.178147071049579997E-01
+ -0.121453449198612004E-05 -0.166666666666667011E-01 -0.139673889655509000E-03
+ -0.240033947134963004E-07  0.172964735804191985E-01 -0.166666666666667011E-01
+ -0.435069155034934006E-03  0.193887186536986986E-01 -0.125642600972210999E-02
+ -0.435069629090626005E-03 -0.118150164198114007E-02  0.194636536348553983E-01
+ -0.139672279648728003E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.172964368900011996E-01 -0.239116221440321008E-07 -0.141646519897784998E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.120943312255959009E-05
+  0.178144561861929007E-01  0.000000000000000000E+00 -0.435057590298097978E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.193887674296497996E-01
+ -0.125672688730438004E-02 -0.435081193385279013E-03 -0.118196873889853007E-02
+  0.194640540700679009E-01 -0.139679127767890996E-03 -0.166666666666667011E-01
+  0.172963729770981983E-01 -0.240508992733560004E-07 -0.141688504708434011E-03
+ -0.121696226254563010E-05  0.178146602550901985E-01 -0.166666666666667011E-01
+ -0.435063688039146997E-03  0.193888428280263002E-01 -0.125667755640774008E-02
+ -0.435075095983264025E-03 -0.118183170521553998E-02  0.194639441233023999E-01
+ -0.139679366822631002E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.172963909933037986E-01 -0.238904374634853986E-07 -0.141684903032989010E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.120762219986995010E-05
+  0.178141341272457995E-01  0.000000000000000000E+00 -0.435044663859789993E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.193878878622800004E-01
+ -0.125607230703734000E-02 -0.435094118323148986E-03 -0.118150745952062992E-02
+  0.194635591497266010E-01 -0.139672170490270990E-03 -0.166666666666667011E-01
+  0.172964346444417008E-01 -0.239875039867713010E-07 -0.141648157868298996E-03
+ -0.121384147158235009E-05  0.178147065643662998E-01 -0.435090496096461013E-03
+ -0.166666666666667011E-01  0.194669889560541987E-01 -0.118529978595172006E-02
+ -0.435048286615349021E-03 -0.125913755896959003E-02  0.193915246813111015E-01
+ -0.166666666666667011E-01  0.172960984732630010E-01 -0.213609392236131997E-07
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.106367313562377998E-05
+  0.178056546915008017E-01  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.173098191165334987E-01 -0.233466429813017999E-07 -0.116342624074817007E-05
+  0.178149495945451007E-01 -0.166666666666667011E-01 -0.435066797805972990E-03
+  0.193883041569685999E-01 -0.125604896999190001E-02 -0.435071986298389026E-03
+ -0.118116640188501007E-02  0.194633028406514010E-01 -0.139659163853909999E-03
+ -0.166666666666667011E-01  0.172964806846983010E-01 -0.238454355521033004E-07
+ -0.141572633531096987E-03 -0.120675578133157002E-05  0.178147008816651985E-01
+ -0.141538304155846992E-03  0.178146978602927997E-01 -0.120362914934832999E-05
+ -0.166666666666667011E-01 -0.139653215055702006E-03 -0.237818478129986986E-07
+  0.172964806257772004E-01 -0.166666666666667011E-01 -0.435101676794290987E-03
+  0.194659166622981016E-01 -0.118451450992946000E-02 -0.435037104020240977E-03
+ -0.125773918484995990E-02  0.193902104354441995E-01 -0.166666666666667011E-01
+  0.172962783320088007E-01 -0.218342670137599985E-07  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.109032449721754008E-05  0.178073744715902017E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.434677413065023984E-03  0.000000000000000000E+00  0.193010409686150998E-01
+ -0.117438144655306010E-02 -0.435460867469465978E-03 -0.110662888842583997E-02
+  0.193848162973076985E-01 -0.166666666666667011E-01  0.173066829055725999E-01
+ -0.235026937765211008E-07 -0.117850668861997998E-05  0.178149746842344997E-01
+ -0.434571088713672988E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.192763238297126993E-01 -0.115086473852541002E-02
+ -0.435566874650336990E-03 -0.108492248973003007E-02  0.193620926345084016E-01
+ -0.139708981740112000E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.172963531511250990E-01 -0.951377049798812932E-08 -0.141328242724759995E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.000000000000000000E+00
+ -0.428708426390729985E-06  0.177495866410289013E-01 -0.166666666666667011E-01
+ -0.435067939506500023E-03  0.193883890526193002E-01 -0.125611318806702994E-02
+ -0.435070844612536994E-03 -0.118121301451965996E-02  0.194633542459594008E-01
+ -0.166666666666667011E-01 -0.139664557957332988E-03  0.172964746611408010E-01
+ -0.239032606349812988E-07 -0.141603814090319997E-03 -0.120960968951782991E-05
+  0.178147026032577996E-01 -0.141639613175400988E-03  0.178147057784965000E-01
+ -0.121288106723124010E-05 -0.166666666666667011E-01 -0.139670761286573002E-03
+ -0.239698004783898005E-07  0.172964747236447995E-01 -0.166666666666667011E-01
+ -0.435069392365180986E-03  0.194637108092800999E-01 -0.118155587635168993E-02
+ -0.435069391760556996E-03 -0.125648510526804006E-02  0.193887815570613006E-01
+ -0.166666666666667011E-01  0.172964151549298006E-01 -0.947849091229853943E-08
+ -0.166666666666667011E-01 -0.427106302281655022E-06  0.177495992959328004E-01
+ -0.166666666666667011E-01  0.177492248112503005E-01 -0.496796927091242970E-07
+ -0.517779383578906051E-06  0.178140139540073003E-01 -0.166666666666667011E-01
+ -0.435081057490304974E-03  0.194615601858393016E-01 -0.117973336574162002E-02
+ -0.435057726202830975E-03 -0.125399377351724994E-02  0.193864040661351003E-01
+ -0.139666404223663996E-03 -0.166666666666667011E-01  0.173052378719510992E-01
+ -0.233928603649598996E-07  0.000000000000000000E+00 -0.141527802484294987E-03
+ -0.116498797383902993E-05  0.178146582832621990E-01 -0.166666666666667011E-01
+ -0.139669857630702005E-03  0.172964394671825013E-01 -0.239623924701885009E-07
+ -0.141634759721171003E-03 -0.121259630273193999E-05  0.178147054943494990E-01
+ -0.166666666666667011E-01 -0.140555314552680993E-03  0.178144675283670000E-01
+ -0.934892070952238007E-06 -0.140347413389892010E-03 -0.933745297868121043E-06
+  0.178144318010057985E-01 -0.435067722986518012E-03  0.193882964019922999E-01
+ -0.125598089226015993E-02 -0.166666666666667011E-01 -0.435071061130372014E-03
+ -0.118115048591624990E-02  0.194632553716142000E-01 -0.166666666666667011E-01
+ -0.435069777018943008E-03  0.194636048172950014E-01 -0.118146131478344993E-02
+ -0.435069007106324973E-03 -0.125636789473562000E-02  0.193886685693907997E-01
+ -0.139690885024778004E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.172964302280185014E-01 -0.950505359387174008E-08 -0.141247379876703009E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.428634134774421003E-06
+  0.177500186390967991E-01  0.000000000000000000E+00 -0.139677196785329999E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.177496423494717011E-01
+ -0.457508561380249031E-07 -0.139805680326738989E-03 -0.473830440639680986E-06
+  0.178139751639271003E-01 -0.435493815113026025E-03 -0.166666666666667011E-01
+  0.194038726929433997E-01 -0.113147962792324000E-02 -0.434644379614391009E-03
+ -0.118519815382812010E-02  0.193210650330794993E-01 -0.166666666666667011E-01
+ -0.139355161737402005E-03  0.172964917336673983E-01 -0.205843400324393006E-07
+ -0.139851518567708007E-03 -0.104623713006894006E-05  0.178145496587001004E-01
+ -0.435276969900176027E-03 -0.166666666666667011E-01  0.194123409556366984E-01
+ -0.114355309658945004E-02 -0.434861669984602990E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.119672311455852002E-02
+  0.193457917807220992E-01  0.000000000000000000E+00 -0.435510762531134978E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.194758550382455008E-01
+ -0.120231103100156000E-02 -0.434627398233908020E-03 -0.125592048119514991E-02
+  0.193896956616587991E-01 -0.139662076715157995E-03 -0.166666666666667011E-01
+  0.172964848493273003E-01 -0.238760385660033009E-07 -0.141589392729580995E-03
+ -0.120824848732212997E-05  0.178147013421607013E-01 -0.435069416060821000E-03
+ -0.166666666666667011E-01  0.194636372101129017E-01 -0.118148444615658998E-02
+ -0.435069368064916007E-03 -0.125641081782009996E-02  0.193887089888112006E-01
+ -0.166666666666667011E-01  0.172964393423152007E-01 -0.239438465741853011E-07
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.121143525352024005E-05
+  0.178146131933784001E-01  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.172964393844348002E-01 -0.239719133473768010E-07 -0.121306464651348996E-05
+  0.178147059621872987E-01 -0.166666666666667011E-01 -0.435085659385655980E-03
+  0.194638181335962009E-01 -0.118203438802173995E-02 -0.435053123899394999E-03
+ -0.125612026912269993E-02  0.193884809592554004E-01 -0.139670749044936993E-03
+ -0.166666666666667011E-01  0.172964759607952996E-01 -0.239695886158460013E-07
+ -0.141639529816893012E-03 -0.121286786314684000E-05  0.178147057662445991E-01
+ -0.141639434640671995E-03  0.178147057572842007E-01 -0.121285914937942990E-05
+ -0.166666666666667011E-01 -0.139670732553277014E-03 -0.239694113970498999E-07
+  0.172964759605990989E-01 -0.166666666666667011E-01 -0.435069447260804027E-03
+  0.194636380955877999E-01 -0.118148603206807003E-02 -0.435069336864923981E-03
+ -0.125641079713200007E-02  0.193887090761660011E-01 -0.166666666666667011E-01
+  0.172964393436225994E-01 -0.239438464742388014E-07  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.121143524546324995E-05  0.178146131932993002E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.435064994893337995E-03  0.000000000000000000E+00  0.193885567753522003E-01
+ -0.125634406111458990E-02 -0.435073789170992999E-03 -0.118148100326300994E-02
+  0.194636166968780999E-01 -0.166666666666667011E-01  0.172964393844214012E-01
+ -0.239719133482090010E-07 -0.121306464658341994E-05  0.178147059621873993E-01
+ -0.435064994896760008E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.193885567753732009E-01 -0.125634406106166999E-02
+ -0.435073789167570986E-03 -0.118148100316007990E-02  0.194636166967958983E-01
+ -0.160852226257028993E-03 -0.166666666666667011E-01  0.172967522287055010E-01
+ -0.175965589473195009E-07 -0.161650188610159987E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.000000000000000000E+00
+ -0.203407230368285005E-06  0.173054266458900000E-01 -0.166666666666667011E-01
+ -0.435068952667249981E-03  0.193884313397972990E-01 -0.125613577038263011E-02
+ -0.435069831457874994E-03 -0.118122048876310006E-02  0.194633662905455002E-01
+ -0.166666666666667011E-01 -0.139672097032745988E-03  0.172964738095684016E-01
+ -0.239841856617656007E-07 -0.141647330681414008E-03 -0.121359018892023999E-05
+  0.178147064035918991E-01 -0.141624421727624012E-03  0.178147048779919992E-01
+ -0.121150370057904998E-05 -0.166666666666667011E-01 -0.139668126615946994E-03
+ -0.239417188530348007E-07  0.172964738011583997E-01 -0.166666666666667011E-01
+ -0.435074178605607004E-03  0.194493807353944004E-01 -0.116766964883922007E-02
+ -0.435064605447021003E-03 -0.124201443598458009E-02  0.193746725929095989E-01
+ -0.166666666666667011E-01  0.173081303330447009E-01 -0.244868682847540015E-04
+ -0.166666666666667011E-01 -0.133924491202099991E-03  0.178635877949816016E-01
+ -0.166666666666667011E-01  0.173348654763293003E-01 -0.223062642015643999E-07
+ -0.104752868644245000E-05  0.178145411980309004E-01 -0.166666666666667011E-01
+ -0.435066924736750998E-03  0.193888351972144012E-01 -0.125659460890097997E-02
+ -0.435071859369655009E-03 -0.118170026599907991E-02  0.194638400334165990E-01
+ -0.139784932051159995E-03 -0.166666666666667011E-01  0.177494904443947013E-01
+ -0.465951791816761021E-07  0.000000000000000000E+00 -0.139919920565161006E-03
+ -0.483545322941620994E-06  0.178139539234989999E-01 -0.166666666666667011E-01
+ -0.435069373706029993E-03  0.193884428434829004E-01 -0.125613915345877008E-02
+ -0.435069410419707014E-03 -0.118121780684916997E-02  0.194633650118269004E-01
+ -0.166666666666667011E-01 -0.139672737807363003E-03  0.172964741998577996E-01
+ -0.239910227158871999E-07 -0.141651023984394999E-03 -0.121392525480250999E-05
+  0.178147066608579999E-01 -0.141640091614955997E-03  0.178147058209647997E-01
+ -0.121292720982870991E-05 -0.166666666666667011E-01 -0.139670843240139006E-03
+ -0.239707145982510011E-07  0.172964741884388991E-01 -0.166666666666667011E-01
+ -0.435065762561208998E-03  0.193887058048313986E-01 -0.125648894894022007E-02
+ -0.435073021522692984E-03 -0.118161486512369002E-02  0.194637424061168997E-01
+ -0.139514021925975012E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.172964189492996988E-01 -0.222147158226326007E-07 -0.140733410708959988E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.112570816185458990E-05
+  0.178142652186864016E-01  0.000000000000000000E+00 -0.435053679420991009E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.193881873511392994E-01
+ -0.125619573968341011E-02 -0.435085103920473015E-03 -0.118149860448210999E-02
+  0.194635877202401011E-01 -0.139671537483522003E-03 -0.166666666666667011E-01
+  0.172964363686913987E-01 -0.239806171193915986E-07 -0.141644487121937989E-03
+ -0.121349920180918992E-05  0.178147063153519004E-01 -0.166666666666667011E-01
+ -0.435069239606207980E-03  0.193887146861679988E-01 -0.125642003417049007E-02
+ -0.435069544519456981E-03 -0.118149450125770001E-02  0.194636470456143011E-01
+ -0.139671620203623008E-03 -0.166666666666667011E-01  0.172964378382645008E-01
+ -0.239229803350932009E-07 -0.141643246303753000E-03  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.121015091751334009E-05  0.178145160915291999E-01  0.000000000000000000E+00
+ -0.435060404326353023E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.193885964889095007E-01 -0.125648696022081000E-02 -0.435078379542822014E-03
+ -0.118169041399521007E-02  0.194637954413787000E-01 -0.139513918934682001E-03
+ -0.166666666666667011E-01  0.172964088839709998E-01 -0.223091853795638985E-07
+ -0.140734728439723009E-03 -0.113122244037598008E-05  0.178145985581429009E-01
+ -0.142980894195082010E-03 -0.166666666666667011E-01  0.173314539890347015E-01
+ -0.109866000013417994E-06 -0.158534109366701998E-03 -0.501444632546631029E-05
+  0.178188430632978990E-01 -0.166666666666667011E-01  0.191833678841652998E-01
+ -0.116695963416674989E-02  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.115255809689225998E-02  0.193073016724906003E-01  0.000000000000000000E+00
+ -0.166666666666667011E-01  0.172969314941213009E-01 -0.239425929167627016E-07
+ -0.121051376515920001E-05  0.178147034232611010E-01 -0.166666666666667011E-01
+ -0.435672877594180986E-03  0.190410730770498998E-01 -0.779817530267083964E-03
+ -0.434464569145039006E-03 -0.823623092840541026E-03  0.189719892294968988E-01
+ -0.147270553185889990E-03 -0.166666666666667011E-01  0.174669464297294991E-01
+ -0.100890525843168994E-03 -0.564704630029780048E-03 -0.477372644101759012E-03
+  0.187551220717942987E-01 -0.139439417619302005E-03  0.178140970993464001E-01
+ -0.575952369555323042E-06 -0.166666666666667011E-01 -0.139258097457669987E-03
+ -0.129863402849587998E-07  0.173691959079540006E-01 -0.166666666666667011E-01
+ -0.437167519508436999E-03  0.195228624474010001E-01 -0.126248680382140001E-02
+ -0.432956808905804989E-03 -0.125491318727090004E-02  0.193932788356039991E-01
+ -0.166666666666667011E-01  0.172966662193905010E-01 -0.121413434668133993E-06
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.972291627722414967E-05
+  0.179398884478061994E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.350105795458797975E-03  0.000000000000000000E+00
+  0.179358412564393013E-01 -0.564926743849099038E-05 -0.144639853703507987E-03
+ -0.314581126948723974E-06  0.178138319445937006E-01 -0.166666666666667011E-01
+  0.193703856826768009E-01 -0.124844632332796993E-02 -0.118120734934456010E-02
+  0.194604805418001001E-01 -0.431301130210841007E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.192675040489116989E-01
+ -0.119991064076258990E-02 -0.438790368590655981E-03 -0.117513383015891001E-02
+  0.194438480812311985E-01 -0.139700882958903997E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.172963963463645017E-01 -0.952678627103132059E-08
+ -0.141292751882909993E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.000000000000000000E+00 -0.429511932811560008E-06  0.177498907292438987E-01
+ -0.166666666666667011E-01 -0.435115740933037984E-03  0.188852022804432002E-01
+ -0.625668905066499950E-03 -0.435023034878147990E-03 -0.676916698779699965E-03
+  0.188324708841117010E-01 -0.166666666666667011E-01 -0.139446455078796010E-03
+  0.174062615201245989E-01 -0.190374494518299010E-07 -0.139967041096937007E-03
+ -0.766079367871497027E-06  0.178142633180293011E-01 -0.532899100111402005E-03
+  0.187657876777795017E-01 -0.512856379296787999E-03 -0.166666666666667011E-01
+ -0.274566803915548998E-03 -0.183873918065665000E-03  0.177330707817049994E-01
+ -0.166666666666667011E-01 -0.142934407180007997E-03  0.173279989208002007E-01
+ -0.109940092441766996E-06 -0.158611617255726990E-03 -0.505928492620163000E-05
+  0.178189142493529011E-01 -0.166666666666667011E-01  0.191837856777059002E-01
+ -0.116715931604291999E-02 -0.166666666666667011E-01 -0.115261550320967993E-02
+  0.193075009050143985E-01 -0.166666666666667011E-01  0.192715504007066017E-01
+ -0.120392391412785997E-02 -0.117898318183162006E-02  0.194477863374666986E-01
+ -0.166666666666667011E-01 -0.435539737045843010E-03  0.194769265765691001E-01
+ -0.120385500091470992E-02 -0.434598338907063995E-03 -0.125615984296152997E-02
+  0.193900185880402989E-01 -0.435045514645459013E-03 -0.166666666666667011E-01
+  0.193914099137533005E-01 -0.125968948646175004E-02  0.000000000000000000E+00
+ -0.435093267670908987E-03 -0.118504666533010995E-02  0.194671248940259008E-01
+ -0.166666666666667011E-01 -0.139670791769998004E-03  0.172964393766727002E-01
+ -0.239724329252018989E-07 -0.141640152553009998E-03 -0.121309020944367992E-05
+  0.178147059866567009E-01 -0.166666666666667011E-01 -0.435069301629178995E-03
+  0.193887307626826010E-01 -0.125643515612083000E-02 -0.435069482496533021E-03
+ -0.118150837470912999E-02  0.194636615934981015E-01 -0.140348157921430997E-03
+  0.178144318702273992E-01 -0.933811502946043969E-06 -0.166666666666667011E-01
+ -0.140348150101401000E-03 -0.933783910990536035E-06  0.178144299643035985E-01
+ -0.166666666666667011E-01 -0.139689488845466003E-03  0.172964362338874009E-01
+ -0.241738769572596000E-07 -0.141748099793582007E-03 -0.122299932206489994E-05
+  0.178147151092019999E-01 -0.435059916514922017E-03 -0.166666666666667011E-01
+  0.193909887083489017E-01 -0.125955268129309002E-02  0.000000000000000000E+00
+ -0.435078867325628023E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.118493749346423004E-02  0.194660289027400006E-01
+  0.000000000000000000E+00 -0.435044083341360984E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.193878554391240000E-01 -0.125604934766983993E-02
+ -0.435094698749261975E-03 -0.118149272049225995E-02  0.194635444483519000E-01
+ -0.139670978031983007E-03 -0.166666666666667011E-01  0.172964365263250999E-01
+ -0.239746177115109000E-07 -0.141641256924305989E-03 -0.121320404495395004E-05
+  0.178147060925782016E-01 -0.166666666666667011E-01 -0.435069350149236000E-03
+  0.193887875206789004E-01 -0.125649208526834003E-02 -0.435069433976496020E-03
+ -0.118156330259116009E-02  0.194637181802562002E-01 -0.139700455923195987E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.172964141750712001E-01
+ -0.947598200812492946E-08 -0.141288502520823991E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.426976078434958026E-06  0.177495760715989000E-01
+  0.000000000000000000E+00 -0.140209281941629005E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.177492016410450010E-01 -0.499125605911364029E-07
+ -0.140454288917710998E-03 -0.520385719864523011E-06  0.178140161378314010E-01
+ -0.435047474599787026E-03 -0.166666666666667011E-01  0.193914777822561009E-01
+ -0.125971925119921992E-02 -0.435091308001488001E-03 -0.118504760094160990E-02
+  0.194671336437930997E-01 -0.435388198571750988E-03 -0.166666666666667011E-01
+  0.194580790627339011E-01 -0.118263492130232990E-02 -0.434750259582202979E-03
+ -0.124178531317212996E-02  0.193754724084427010E-01 -0.166666666666667011E-01
+  0.173081480357068010E-01 -0.244895773575376999E-04  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.133922698986129010E-03  0.178635891344391985E-01
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.193849898136359000E-01
+ -0.125483132562911989E-02 -0.118151827367967994E-02  0.194632328928340016E-01
+ -0.166666666666667011E-01 -0.142602747236407002E-03  0.172960559757211994E-01
+ -0.117227408613948994E-06 -0.158587582136584995E-03 -0.567497435835260990E-05
+  0.178189581300063002E-01 -0.139720580991034007E-03 -0.166666666666667011E-01
+  0.177036782302840007E-01 -0.306502532037200970E-07 -0.139932855252091994E-03
+ -0.499376182467009957E-06  0.178139944449866999E-01 -0.546160785305606963E-03
+  0.184421025597138015E-01 -0.894167397427545998E-04 -0.166666666666667011E-01
+ -0.147442821218409995E-03 -0.240244287551271008E-04  0.177137723141376005E-01
+ -0.166666666666667011E-01 -0.400413696341282016E-03  0.186079233649920003E-01
+ -0.750380296538879967E-03 -0.465646690569393017E-03 -0.925379786421282957E-03
+  0.191586075172780999E-01 -0.166666666666667011E-01  0.177029162002804996E-01
+ -0.193968089936665005E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.000000000000000000E+00 -0.245143973844672025E-03  0.179475102895724006E-01
+ -0.166666666666667011E-01 -0.140401212232880995E-03  0.173460971398785996E-01
+ -0.290712391494978008E-07 -0.144639789846284008E-03 -0.132223121759111006E-05
+  0.178148169771744011E-01 -0.166666666666667011E-01  0.193485878396606990E-01
+ -0.123880031419069990E-02 -0.118081271608876993E-02  0.194575794108615002E-01
+ -0.139755721135340007E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.173348680988911000E-01 -0.228988835869186992E-07
+ -0.141707074022604998E-03 -0.107451126624275990E-05  0.178145682418867010E-01
+ -0.139671204369828987E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.172964527346999991E-01 -0.238206247996137016E-07 -0.141637869293981998E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.000000000000000000E+00
+ -0.120424066353106008E-05  0.178141993541914984E-01 -0.166666666666667011E-01
+ -0.435293328042460988E-03  0.188969200156620006E-01 -0.638325692737096986E-03
+ -0.434845262794537977E-03 -0.686401912695584044E-03  0.188415679710590006E-01
+ -0.166666666666667011E-01 -0.139450260715297009E-03  0.174035477820680996E-01
+ -0.190790185253572011E-07 -0.139988379398403009E-03 -0.772546665100815971E-06
+  0.178142695812380999E-01 -0.535400152422887033E-03  0.187643522383528008E-01
+ -0.509589395081856949E-03 -0.166666666666667011E-01 -0.267380606913843026E-03
+ -0.178487068761980011E-03  0.177166953062542001E-01 -0.166666666666667011E-01
+ -0.142812108775019998E-03  0.173285847433682014E-01 -0.101845505783431005E-06
+ -0.158027855199464992E-03 -0.469583501144497018E-05  0.178185786022877005E-01
+ -0.166666666666667011E-01  0.191928643172403005E-01 -0.117144676995123995E-02
+ -0.166666666666667011E-01 -0.115407859919587998E-02  0.193128636528600017E-01
+ -0.166666666666667011E-01  0.192743847992538006E-01 -0.120526566976207009E-02
+ -0.117896448649838000E-02  0.194473150974792015E-01 -0.166666666666667011E-01
+ -0.435531752463573995E-03  0.194769704025496010E-01 -0.120375472484332991E-02
+ -0.434606347505284997E-03 -0.125643108232954001E-02  0.193902401795961014E-01
+ -0.435044468242076017E-03 -0.166666666666667011E-01  0.193871801282229986E-01
+ -0.125535484215398997E-02  0.000000000000000000E+00 -0.435094313909530008E-03
+ -0.118080938637835000E-02  0.194628399719979014E-01 -0.166666666666667011E-01
+ -0.139670871572238991E-03  0.172964393487799011E-01 -0.239732892684089006E-07
+ -0.141640613379916011E-03 -0.121313235466317992E-05  0.178147060195563006E-01
+ -0.166666666666667011E-01 -0.435069156529857982E-03  0.193887053622597005E-01
+ -0.125641208621197001E-02 -0.435069627595703981E-03 -0.118148788386283009E-02
+  0.194636401109469989E-01 -0.140357102716023998E-03  0.178144327480211015E-01
+ -0.934953983487829978E-06 -0.166666666666667011E-01 -0.140357082074367998E-03
+ -0.934881658679828005E-06  0.178144277579760993E-01 -0.166666666666667011E-01
+ -0.139671248276079991E-03  0.172964387342726990E-01 -0.239772668643883999E-07
+ -0.141642790859008988E-03 -0.121332835924946005E-05  0.178147058633731994E-01
+ -0.435073048109683026E-03 -0.166666666666667011E-01  0.194629028419433005E-01
+ -0.118081642190051008E-02 -0.435065735973592016E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.125566691830197993E-02
+  0.193878887757382989E-01  0.000000000000000000E+00 -0.435064995098833989E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.193890263762267999E-01
+ -0.125684228342590990E-02 -0.435073788965510016E-03 -0.118197542174123009E-02
+  0.194640901826458011E-01 -0.139679109099943003E-03 -0.166666666666667011E-01
+  0.172963725697189014E-01 -0.240507205201775987E-07 -0.141688401087265996E-03
+ -0.121695435575037005E-05  0.178146602328799995E-01 -0.166666666666667011E-01
+ -0.435064159301769998E-03  0.193889744409617010E-01 -0.125680005533066992E-02
+ -0.435074624737013995E-03 -0.118194373351253996E-02  0.194640651337368992E-01
+ -0.139708967713254010E-03 -0.166666666666667011E-01  0.172963616356281991E-01
+ -0.951324367715890009E-08 -0.141328092009716008E-03  0.000000000000000000E+00
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.428676150398848976E-06  0.177495851017135005E-01  0.000000000000000000E+00
+ -0.140184006518702988E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.177492476417109996E-01 -0.497520252347835031E-07 -0.140422322794949995E-03
+ -0.518268041627564046E-06  0.178140140868086015E-01 -0.435065432183330027E-03
+ -0.166666666666667011E-01  0.193879055107330014E-01 -0.125567283667538999E-02
+ -0.435073351892596022E-03 -0.118081969348847004E-02  0.194629358214771994E-01
+ -0.371043726257224002E-03 -0.166666666666667011E-01  0.180438943525561002E-01
+ -0.288534096345938022E-03 -0.486290379332660018E-03 -0.560900084076317983E-03
+  0.187957069582666003E-01 -0.166666666666667011E-01  0.177374439328937990E-01
+ -0.214133585214810996E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.267021959071755976E-03  0.179665664489489993E-01
+ -0.166666666666667011E-01  0.193465909995582004E-01 -0.123791262449076994E-02
+ -0.118077342878954009E-02  0.194573127654812002E-01 -0.166666666666667011E-01
+ -0.139258097457669987E-03  0.172965161025564994E-01 -0.139408267359583998E-07
+ -0.139258097457669987E-03 -0.716280208726827027E-06  0.178142098987628013E-01
+ -0.448745061591116986E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.187014625769068993E-01 -0.470417770715891002E-03 -0.420615920070051990E-03
+ -0.416030208289801002E-03  0.185800388700857987E-01 -0.143647586674343006E-03
+  0.178142597734856009E-01 -0.784926914613626987E-06 -0.166666666666667011E-01
+ -0.289422112454956976E-03 -0.392634159972045978E-05  0.178691591832313007E-01
+ -0.166666666666667011E-01 -0.433838061791930977E-03  0.193348473865251995E-01
+ -0.122492954566329995E-02 -0.436295790691586000E-03 -0.116751653970171990E-02
+  0.194447289756928994E-01 -0.166666666666667011E-01  0.173080151777424002E-01
+ -0.244742517923652000E-04  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.133996459548231008E-03  0.178636488295263011E-01  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.139757971222783007E-03
+  0.000000000000000000E+00  0.173348621827651983E-01 -0.229201530844114011E-07
+ -0.141718205670910005E-03 -0.107549290769848001E-05  0.178145692111379012E-01
+ -0.166666666666667011E-01  0.193848567620656005E-01 -0.125477293295166007E-02
+ -0.118151635482893996E-02  0.194632155619245996E-01 -0.140442473137899012E-03
+ -0.166666666666667011E-01  0.173484955870056995E-01 -0.294133609017818009E-07
+ -0.144782161335538988E-03 -0.133037270602234991E-05  0.178148273619183999E-01
+ -0.139679362257274988E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.172964143215187001E-01 -0.238876115774515004E-07 -0.141684595140730989E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.000000000000000000E+00
+ -0.120742002760442999E-05  0.178141299090062988E-01 -0.166666666666667011E-01
+ -0.139670790012750987E-03  0.172964393796226010E-01 -0.239724130311146011E-07
+ -0.141640142355812013E-03 -0.121308921736943007E-05  0.178147059830380990E-01
+ -0.166666666666667011E-01 -0.140356055867897991E-03  0.178144308211048014E-01
+ -0.934793117616238968E-06 -0.140356063365803011E-03 -0.934819410434324011E-06
+  0.178144326353903010E-01 -0.435069477705365992E-03  0.194636362328108985E-01
+ -0.118148328063351999E-02 -0.166666666666667011E-01 -0.435069306420349006E-03
+ -0.125640963064629994E-02  0.193887060480272995E-01 -0.166666666666667011E-01
+ -0.139670746102450000E-03  0.172964393573767014E-01 -0.239719438513679986E-07
+ -0.141639889164474000E-03 -0.121306620727653000E-05  0.178147059636829010E-01
+ -0.166666666666667011E-01  0.193887976067478006E-01 -0.125650162570866992E-02
+ -0.166666666666667011E-01 -0.118157207781601002E-02  0.194637272757679988E-01
+ -0.166666666666667011E-01  0.193887977046092001E-01 -0.125650166419828007E-02
+ -0.118157208862118990E-02  0.194637273843102990E-01 -0.166666666666667011E-01
+ -0.435064765067235998E-03  0.193889522097447989E-01 -0.125676448751559002E-02
+ -0.435074018990514973E-03 -0.118190001722009996E-02  0.194640239141652995E-01
+ -0.140182361260136993E-03 -0.166666666666667011E-01  0.177493225039018002E-01
+ -0.498169409981402992E-07  0.000000000000000000E+00 -0.140419962900806993E-03
+ -0.518131012618996021E-06  0.178140141298323007E-01 -0.166666666666667011E-01
+ -0.435069323598363008E-03  0.193884453764502987E-01 -0.125614284670238011E-02
+ -0.435069460527360988E-03 -0.118122217360852990E-02  0.194633691143310007E-01
+ -0.166666666666667011E-01 -0.139670884426293997E-03  0.172964736171049016E-01
+ -0.239711924341550007E-07 -0.141640335129922002E-03 -0.121295197632909002E-05
+  0.178147058397751003E-01 -0.141656453358155989E-03  0.178147070060390988E-01
+ -0.121442246005117002E-05 -0.166666666666667011E-01 -0.139673677795910993E-03
+ -0.240011176184461992E-07  0.172964736295889016E-01 -0.166666666666667011E-01
+ -0.435069164848058006E-03  0.193887179318738001E-01 -0.125642505656990006E-02
+ -0.435069619277515991E-03 -0.118150056235400997E-02  0.194636526000948017E-01
+ -0.139672366859797993E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.172964370349440004E-01 -0.239153496141802001E-07 -0.141647103173812004E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.120964050265571001E-05
+  0.178144653667999010E-01  0.000000000000000000E+00 -0.435058020368536008E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.193887196651586005E-01
+ -0.125666847383042006E-02 -0.435080763346489007E-03 -0.118190508477706993E-02
+  0.194639922621226008E-01 -0.139641248248434992E-03 -0.166666666666667011E-01
+  0.172963810772421012E-01 -0.236457807710709016E-07 -0.141469789611806004E-03
+ -0.119702907967459002E-05  0.178146434627204017E-01 -0.166666666666667011E-01
+ -0.435064079024843019E-03  0.193888050589719994E-01 -0.125663030165246007E-02
+ -0.435074705011247002E-03 -0.118177958688183999E-02  0.194638936988976996E-01
+ -0.139641455576729011E-03 -0.166666666666667011E-01  0.172963976004947007E-01
+ -0.234988035352734987E-07 -0.141466765709830994E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.118846831550546996E-05
+  0.178141535165701990E-01  0.000000000000000000E+00 -0.435046365527533990E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.193879446262342006E-01
+ -0.125609596684336995E-02 -0.435092416913644979E-03 -0.118150619229764994E-02
+  0.194635647529617997E-01 -0.139672261692974013E-03 -0.166666666666667011E-01
+  0.172964349185803015E-01 -0.239884652187998984E-07 -0.141648681442598987E-03
+ -0.121388810715753001E-05  0.178147066071378986E-01 -0.435069380083255997E-03
+ -0.166666666666667011E-01  0.193887114872825991E-01 -0.125641369962899998E-02
+ -0.435069404042481985E-03 -0.118148620112529007E-02  0.194636395221940012E-01
+ -0.166666666666667011E-01  0.172965077531164017E-01 -0.927568178720202994E-07
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.102046202213369007E-05
+  0.173051339160117008E-01  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.193887977533772002E-01 -0.125650168557878009E-02 -0.118157208930007003E-02
+  0.194637273906578985E-01 -0.166666666666667011E-01 -0.435066628630517986E-03
+  0.193883490811541000E-01 -0.125609889832018992E-02 -0.435072155470964984E-03
+ -0.118121788505638997E-02  0.194633537877374994E-01 -0.139670767304303994E-03
+ -0.166666666666667011E-01  0.172964740023511983E-01 -0.239699122889327989E-07
+ -0.141639655324462003E-03 -0.121288819251166995E-05  0.178147057860663996E-01
+ -0.141639719254381006E-03  0.178147057910134007E-01 -0.121289402770353995E-05
+ -0.166666666666667011E-01 -0.139670778383217008E-03 -0.239700310192222999E-07
+  0.172964740024197997E-01 -0.166666666666667011E-01 -0.435069359721298977E-03
+  0.193887107978404985E-01 -0.125641340675414995E-02 -0.435069424404436024E-03
+ -0.118148620716793996E-02  0.194636394455750990E-01 -0.166666666666667011E-01
+  0.172965077510262993E-01 -0.927568056586548005E-07  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.102046204410674010E-05  0.173051339160213008E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.139674680780886991E-03  0.000000000000000000E+00  0.173041376249668984E-01
+ -0.235410521181961013E-07 -0.141584640975313996E-03 -0.117464395864609995E-05
+  0.178146677207322995E-01 -0.166666666666667011E-01  0.193887977504637009E-01
+ -0.125650168430156001E-02 -0.118157208925927996E-02  0.194637273902763010E-01
+ -0.139674680734473000E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.173041376249788992E-01 -0.235410516367698016E-07
+ -0.141584640716005010E-03 -0.117464393529904008E-05  0.178146677207122010E-01
+ -0.139669171022264988E-03 -0.166666666666667011E-01  0.172963674074106989E-01
+ -0.936361616323369989E-08 -0.141145821435217987E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.000000000000000000E+00
+ -0.422126237945786021E-06  0.177496556211017009E-01 -0.166666666666667011E-01
+ -0.139670831173391990E-03  0.172964378151377017E-01 -0.239729563898503003E-07
+ -0.141640396000908991E-03 -0.121311945483194002E-05  0.178147060102482983E-01
+ -0.166666666666667011E-01 -0.140348082156047988E-03  0.178144284125189012E-01
+ -0.933754616232120984E-06 -0.140348096313297011E-03 -0.933804570413684980E-06
+  0.178144318631820002E-01 -0.435069554992271979E-03  0.194636358566133011E-01
+ -0.118148321892500995E-02 -0.166666666666667011E-01 -0.435069229133381978E-03
+ -0.125640843271979000E-02  0.193887033474266987E-01 -0.166666666666667011E-01
+ -0.139670746039409987E-03  0.172964393604615012E-01 -0.239719429806933989E-07
+ -0.141639888769111005E-03 -0.121306615757996010E-05  0.178147059636586010E-01
+ -0.166666666666667011E-01  0.193887976105126016E-01 -0.125650162733728990E-02
+ -0.166666666666667011E-01 -0.118157207791531990E-02  0.194637272767340004E-01
+ -0.166666666666667011E-01  0.193887977049908011E-01 -0.125650166436534001E-02
+ -0.118157208862703007E-02  0.194637273843653001E-01 -0.166666666666667011E-01
+ -0.435069387915653001E-03  0.193887117581042004E-01 -0.125641381798179000E-02
+ -0.435069396210086012E-03 -0.118148620435052005E-02  0.194636395574343014E-01
+ -0.139674686777477991E-03 -0.166666666666667011E-01  0.173041492300615996E-01
+ -0.235404450870160989E-07  0.000000000000000000E+00 -0.141584559821053004E-03
+ -0.117458831712666003E-05  0.178146676653543994E-01 -0.166666666666667011E-01
+ -0.139670743890679005E-03  0.172964393844352998E-01 -0.239719183429222001E-07
+ -0.141639876120318997E-03 -0.121306489215045994E-05  0.178147059624318010E-01
+ -0.166666666666667011E-01 -0.140348087305803990E-03  0.178144318417805016E-01
+ -0.933801043435799969E-06 -0.140348087381334013E-03 -0.933801309950123998E-06
+  0.178144318601900983E-01 -0.435069392938043002E-03  0.194636364111164008E-01
+ -0.118148311443157998E-02 -0.166666666666667011E-01 -0.435069391187694980E-03
+ -0.125641070647420998E-02  0.193887087798863995E-01 -0.166666666666667011E-01
+ -0.435069391191658010E-03  0.193887815134855014E-01 -0.125648507888060008E-02
+ -0.435069392934080027E-03 -0.118155584789411011E-02  0.194637107834846994E-01
+ -0.139700198059465998E-03 -0.166666666666667011E-01  0.172964151566727987E-01
+ -0.947849280924833965E-08 -0.141287467767569990E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.427106383838012019E-06
+  0.177495992952378008E-01  0.000000000000000000E+00 -0.140178145950843993E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.177492248104791014E-01
+ -0.496797052902892032E-07 -0.140415043811154997E-03 -0.517779519367424970E-06
+  0.178140139541318986E-01 -0.435069390908532979E-03 -0.166666666666667011E-01
+  0.193887088082677003E-01 -0.125641074170613001E-02 -0.435069393217205004E-03
+ -0.118148315311942002E-02  0.194636364485415991E-01 -0.166666666666667011E-01
+ -0.139670745499349992E-03  0.172964393844174009E-01 -0.239719356219104995E-07
+ -0.141639885405697995E-03 -0.121306574176735002E-05  0.178147059632627995E-01
+ -0.435069390223737009E-03 -0.166666666666667011E-01  0.193887087097285998E-01
+ -0.125641070240488004E-02 -0.435069393902000974E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.118148314340501995E-02
+  0.194636363510784990E-01  0.000000000000000000E+00 -0.435069390007495975E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.193887087390940994E-01
+ -0.125641068860798000E-02 -0.435069394118242007E-03 -0.118148311389442004E-02
+  0.194636364058402012E-01 -0.139670743908197002E-03 -0.166666666666667011E-01
+  0.172964393843969000E-01 -0.239719185336623003E-07 -0.141639876221824010E-03
+ -0.121306490160525007E-05  0.178147059624407002E-01 -0.435069391188885001E-03
+ -0.166666666666667011E-01  0.193887118630617986E-01 -0.125641385909668990E-02
+ -0.435069392936854012E-03 -0.118148619756053004E-02  0.194636395637321005E-01
+ -0.166666666666667011E-01  0.172965077542917012E-01 -0.927568247378030971E-07
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.102046200973725997E-05
+  0.173051339160060005E-01  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.193887977549708004E-01 -0.125650168627737991E-02 -0.118157208932236990E-02
+  0.194637273908664990E-01 -0.166666666666667011E-01 -0.435069391154679994E-03
+  0.193884418577966004E-01 -0.125613776930019998E-02 -0.435069392971057988E-03
+ -0.118121619345346995E-02  0.194633634742979005E-01 -0.139670761454477999E-03
+ -0.166666666666667011E-01  0.172964743993829009E-01 -0.239698237237728008E-07
+ -0.141639617486802999E-03 -0.121288294400935000E-05  0.178147057813331997E-01
+ -0.141639617487035994E-03  0.178147057813331997E-01 -0.121288294403079010E-05
+ -0.166666666666667011E-01 -0.139670761454519009E-03 -0.239698237241789995E-07
+  0.172964743993829009E-01 -0.166666666666667011E-01 -0.435069391188809974E-03
+  0.193887118630593006E-01 -0.125641385909560006E-02 -0.435069392936928985E-03
+ -0.118148619756054001E-02  0.194636395637317987E-01 -0.166666666666667011E-01
+  0.172965077542917012E-01 -0.927568247377395960E-07  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.102046200973731990E-05  0.173051339160060005E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.139674680709099010E-03  0.000000000000000000E+00  0.173041376249854009E-01
+ -0.235410513736555998E-07 -0.141584640574245006E-03 -0.117464392253488990E-05
+  0.178146677207011994E-01 -0.166666666666667011E-01  0.193887977549708004E-01
+ -0.125650168627737991E-02 -0.118157208932236990E-02  0.194637273908664990E-01
+ -0.139674680709099010E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.173041376249854009E-01 -0.235410513736546998E-07
+ -0.141584640574244004E-03 -0.117464392253485009E-05  0.178146677207011994E-01
+ -0.161601720479338990E-03 -0.166666666666667011E-01  0.172965076665808001E-01
+ -0.926702349005646988E-07 -0.162351552716187005E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.000000000000000000E+00
+ -0.102033131897240007E-05  0.173051453909592000E-01 -0.166666666666667011E-01
+ -0.435069503154383993E-03  0.194633707007618000E-01 -0.118122593631263999E-02
+ -0.435069280971315012E-03 -0.125614175474171000E-02  0.193884463223708003E-01
+ -0.166666666666667011E-01 -0.139672289609427992E-03  0.172964739021653005E-01
+ -0.239862388766482005E-07 -0.141648440824267006E-03 -0.121369084068463994E-05
+  0.178147064711290000E-01 -0.141624407193420988E-03  0.178147048759245002E-01
+ -0.121150194851005999E-05 -0.166666666666667011E-01 -0.139668124261626994E-03
+ -0.239416874711979002E-07  0.172964738935757999E-01 -0.166666666666667011E-01
+ -0.435074183386577979E-03  0.194493808683343997E-01 -0.116766988693369996E-02
+ -0.435064600665905016E-03 -0.124201443215284004E-02  0.193746726050244011E-01
+ -0.166666666666667011E-01  0.173081303333391008E-01 -0.244868683301465988E-04
+ -0.166666666666667011E-01 -0.133924491176626011E-03  0.178635877950088992E-01
+ -0.166666666666667011E-01  0.173348654763634015E-01 -0.223062642005740000E-07
+ -0.104752868632071993E-05  0.178145411980307998E-01 -0.166666666666667011E-01
+ -0.435066580651087987E-03  0.193886968271455989E-01 -0.125646132097252000E-02
+ -0.435072203449548981E-03 -0.118157534474708009E-02  0.194637086880425011E-01
+ -0.139516419388416990E-03 -0.166666666666667011E-01  0.173041769139335999E-01
+ -0.219137423564159007E-07  0.000000000000000000E+00 -0.140700386591277008E-03
+ -0.109554334382343009E-05  0.178145628168012993E-01 -0.166666666666667011E-01
+ -0.139670785511788992E-03  0.172964393739205996E-01 -0.239723655938119001E-07
+ -0.141640116450570004E-03 -0.121308690248200008E-05  0.178147059824802015E-01
+ -0.166666666666667011E-01 -0.140353492284304998E-03  0.178144307839981988E-01
+ -0.934469196153570009E-06 -0.140353499090780005E-03 -0.934493112001855014E-06
+  0.178144324348224990E-01 -0.435069470216686990E-03  0.194636389371891996E-01
+ -0.118148595888806001E-02 -0.166666666666667011E-01 -0.435069313909032019E-03
+ -0.125641246449570009E-02  0.193887089321175994E-01 -0.166666666666667011E-01
+ -0.435066224778556999E-03  0.193888655360086003E-01 -0.125664138810585991E-02
+ -0.435072559315323980E-03 -0.118175670116711000E-02  0.194638920667745997E-01
+ -0.139535529617076010E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.172963867645380999E-01 -0.887106097806382040E-08 -0.140533144432323988E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.400546896298884985E-06
+  0.177498997117815013E-01  0.000000000000000000E+00 -0.140183011740981989E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.177495724693359017E-01
+ -0.500888730287148026E-07 -0.140419805775519006E-03 -0.518188128586851950E-06
+  0.178140142381220008E-01 -0.435066535168476983E-03 -0.166666666666667011E-01
+  0.193886214195925013E-01 -0.125637957080158998E-02 -0.435072248931340003E-03
+ -0.118149384028357995E-02  0.194636350622922998E-01 -0.166666666666667011E-01
+ -0.139671934380400999E-03  0.172964390135027998E-01 -0.239846931501482010E-07
+ -0.141646750301238991E-03 -0.121369351008339006E-05  0.178147064557130988E-01
+ -0.435070730673722014E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.194635424129478984E-01 -0.118148467355335004E-02 -0.435068053446324991E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.125637211589557001E-02
+  0.193885950416877000E-01  0.000000000000000000E+00 -0.435065887914215998E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.193887904637096015E-01
+ -0.125657470314374992E-02 -0.435072896172526973E-03 -0.118169761602073009E-02
+  0.194638244685289008E-01 -0.139514756609392007E-03 -0.166666666666667011E-01
+  0.172964082847710011E-01 -0.223179439037086001E-07 -0.140739566181214998E-03
+ -0.113165464823919991E-05  0.178145987602886004E-01 -0.142980894581813998E-03
+ -0.166666666666667011E-01  0.173314539689373992E-01 -0.109866042397868997E-06
+ -0.158534112254571006E-03 -0.501444838217852000E-05  0.178188430651418997E-01
+ -0.166666666666667011E-01  0.191833678439739010E-01 -0.116695961504079994E-02
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.115255809083636006E-02
+  0.193073016509602985E-01  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.172969314939965015E-01 -0.239425929242464991E-07 -0.121051376580430009E-05
+  0.178147034232617012E-01 -0.166666666666667011E-01 -0.435670020554572023E-03
+  0.190410313364221991E-01 -0.779749213569466953E-03 -0.434467438812183976E-03
+ -0.823627788704149036E-03  0.189719905685246000E-01 -0.147270534866566013E-03
+ -0.166666666666667011E-01  0.174669457049126987E-01 -0.100890465185484999E-03
+ -0.564704633717408051E-03 -0.477373059999760001E-03  0.187551223307545999E-01
+ -0.139439401685823998E-03  0.178140971035927985E-01 -0.575958645398553022E-06
+ -0.166666666666667011E-01 -0.139258097457669987E-03 -0.129864673787176997E-07
+  0.173691952385951996E-01 -0.166666666666667011E-01 -0.437167482025401009E-03
+  0.195228613586728994E-01 -0.126248570169920009E-02 -0.432956846913321015E-03
+ -0.125491321979356992E-02  0.193932787767274011E-01 -0.166666666666667011E-01
+  0.172966662158898984E-01 -0.121413435136702005E-06  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.972291637013934920E-05  0.179398884476381013E-01
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.350105792551318000E-03  0.000000000000000000E+00  0.179358412563318005E-01
+ -0.564926760193745028E-05 -0.144639853967615001E-03 -0.314581136545178986E-06
+  0.178138319445972984E-01 -0.166666666666667011E-01  0.193703856781149014E-01
+ -0.124844632131722007E-02 -0.118120734926823010E-02  0.194604805411918991E-01
+ -0.431301129654014016E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.192675040339660014E-01 -0.119991063440438991E-02
+ -0.438790369133317979E-03 -0.117513383061983992E-02  0.194438480799019008E-01
+ -0.161600618881618991E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.172964964024612013E-01 -0.929759484961031952E-07 -0.162354606074604997E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.000000000000000000E+00
+ -0.102674406185603999E-05  0.173051827906571011E-01 -0.166666666666667011E-01
+ -0.143972810900854002E-03  0.173505492137603010E-01 -0.481358872158661014E-04
+ -0.558385319861011051E-03 -0.438374229680275993E-03  0.187218708148952986E-01
+ -0.166666666666667011E-01 -0.139937822338344987E-03  0.174078896544059003E-01
+ -0.227983603063990998E-07 -0.141801732328176988E-03 -0.908683871102670049E-06
+  0.178144027016026996E-01 -0.143597578982917004E-03  0.178145584095829000E-01
+ -0.106316067196835009E-05 -0.166666666666667011E-01 -0.140417715032831007E-03
+ -0.268130638269802001E-07  0.174078940527996016E-01 -0.166666666666667011E-01
+ -0.434071259447730998E-03  0.193423072809523998E-01 -0.122826363249218997E-02
+ -0.436064294450999015E-03 -0.116769061111692994E-02  0.194456918207231988E-01
+ -0.166666666666667011E-01  0.172982030081412011E-01 -0.280949762784711011E-07
+ -0.166666666666667011E-01 -0.189517761832509992E-05  0.178271977135750007E-01
+ -0.166666666666667011E-01  0.178265008932085994E-01 -0.134657557022567991E-05
+ -0.925862840583118960E-06  0.178144080134048990E-01 -0.166666666666667011E-01
+ -0.434780916834727992E-03  0.193789511951828995E-01 -0.125220905631005992E-02
+ -0.435357601613535017E-03 -0.118147901946294995E-02  0.194624869766942994E-01
+ -0.140177592469087011E-03 -0.166666666666667011E-01  0.177492253272222983E-01
+ -0.496756672234455021E-07  0.000000000000000000E+00 -0.140414345842645999E-03
+ -0.517733155226927036E-06  0.178140139187708997E-01 -0.166666666666667011E-01
+ -0.435069391154579001E-03  0.193884418550169003E-01 -0.125613776648296005E-02
+ -0.435069392971158982E-03 -0.118121619070897998E-02  0.194633634714499008E-01
+ -0.166666666666667011E-01 -0.139670760166673013E-03  0.172964743997286001E-01
+ -0.239698098600330014E-07 -0.141639610050856989E-03 -0.121288226150826994E-05
+  0.178147057806271984E-01 -0.141639611820286996E-03  0.178147057807866993E-01
+ -0.121288242338980006E-05 -0.166666666666667011E-01 -0.139670760473278990E-03
+ -0.239698131526809996E-07  0.172964743997317989E-01 -0.166666666666667011E-01
+ -0.435069393628180999E-03  0.194636361556436016E-01 -0.118148289887844995E-02
+ -0.435069390497556984E-03 -0.125641041453345998E-02  0.193887085193421987E-01
+ -0.139670572892313988E-03 -0.166666666666667011E-01  0.172964393935715992E-01
+ -0.239416569730802012E-07  0.000000000000000000E+00 -0.141638062327426994E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.121132811726249994E-05  0.178146133160351008E-01  0.000000000000000000E+00
+ -0.435065005443445983E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.193885567700676983E-01 -0.125634384183789002E-02 -0.435073778621178993E-03
+ -0.118148063714991003E-02  0.194636163660859007E-01 -0.139670682267266008E-03
+ -0.166666666666667011E-01  0.172964394328304003E-01 -0.239712554208225998E-07
+ -0.141639519995227992E-03 -0.121303220633213010E-05  0.178147059371244997E-01
+ -0.166666666666667011E-01 -0.435069393052231988E-03  0.194636362649158005E-01
+ -0.118148299187565992E-02 -0.435069391073505994E-03 -0.125641054065752994E-02
+  0.193887086398788003E-01 -0.139670722089359006E-03 -0.166666666666667011E-01
+  0.172964393787637984E-01 -0.239432406825611007E-07 -0.141638922827194003E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.121140585286092006E-05  0.178146133355806986E-01  0.000000000000000000E+00
+ -0.435065002826116019E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.193885565492021017E-01 -0.125634366816502010E-02 -0.435073781238435990E-03
+ -0.118148050457107996E-02  0.194636162212637985E-01 -0.139670533104681001E-03
+ -0.166666666666667011E-01  0.172964394486108987E-01 -0.239696530874210007E-07
+ -0.141638658891901002E-03 -0.121295339218169004E-05  0.178147058622277997E-01
+ -0.435090507624198979E-03 -0.166666666666667011E-01  0.194669892783514015E-01
+ -0.118530036438423006E-02 -0.435048275086066013E-03 -0.125913754589597991E-02
+  0.193915247086616990E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.172960984744740011E-01 -0.213609391462743998E-07 -0.166666666666667011E-01
+ -0.106367312934038010E-05  0.178056546914526007E-01 -0.166666666666667011E-01
+  0.173098191165480010E-01 -0.233466429807012998E-07 -0.116342624068299999E-05
+  0.178149495945450001E-01 -0.166666666666667011E-01 -0.435069781984934981E-03
+  0.194633224227003000E-01 -0.118118559577364002E-02 -0.435069002140319990E-03
+ -0.125608528610373000E-02  0.193883926903337007E-01 -0.139659157768774999E-03
+ -0.166666666666667011E-01  0.172964811071458988E-01 -0.238453432888061996E-07
+ -0.141572594197268990E-03 -0.120675029799499991E-05  0.178147008767080006E-01
+ -0.141538198842383991E-03  0.178146978502537016E-01 -0.120361768174549009E-05
+ -0.166666666666667011E-01 -0.139653197525385994E-03 -0.237816337684043011E-07
+  0.172964810481539000E-01 -0.166666666666667011E-01 -0.435101709399810996E-03
+  0.194659175728730989E-01 -0.118451613849551002E-02 -0.435037071408026995E-03
+ -0.125773914873593992E-02  0.193902105130663999E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.172962783353253005E-01 -0.218342668577251013E-07
+ -0.166666666666667011E-01 -0.109032448304442004E-05  0.178073744716683996E-01
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.434677413069656997E-03
+  0.193010409680305015E-01 -0.117438144585429007E-02 -0.435460867464844024E-03
+ -0.110662888767641991E-02  0.193848162965736988E-01 -0.166666666666667011E-01
+  0.173066829056793998E-01 -0.235026937711214014E-07 -0.117850668811869000E-05
+  0.178149746842341007E-01 -0.434571088711364992E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01  0.192763238295290996E-01 -0.115086473839449998E-02
+ -0.435566874652637017E-03 -0.108492248963690989E-02  0.193620926343746995E-01
+ -0.139700128428728001E-03 -0.166666666666667011E-01  0.172964062536986006E-01
+ -0.947853442084885955E-08 -0.141287229661082013E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.000000000000000000E+00
+ -0.427116700035002018E-06  0.177495998236214987E-01 -0.166666666666667011E-01
+ -0.139670101692640009E-03  0.172964394288557984E-01 -0.239650162888405991E-07
+ -0.141636168849436004E-03 -0.121272540531506999E-05  0.178147056226493003E-01
+ -0.166666666666667011E-01 -0.435070601601427007E-03  0.194632424803467992E-01
+ -0.118112723413059005E-02 -0.435068182519665007E-03 -0.125598135252497989E-02
+  0.193882951902035992E-01 -0.140350019130283996E-03  0.178144321328113989E-01
+ -0.934070655061730960E-06 -0.166666666666667011E-01 -0.140500665652292997E-03
+ -0.934901878808751052E-06  0.178144580209842994E-01 -0.166666666666667011E-01
+ -0.435068213377728019E-03  0.193838387473541998E-01 -0.125145538065708989E-02
+ -0.435070570743591027E-03 -0.117665454921121997E-02  0.194586920952139995E-01
+ -0.166666666666667011E-01  0.172970581688366015E-01 -0.800807256662095042E-18
+ -0.166666666666667011E-01 -0.116362368362492996E-16  0.173226956581742017E-01
+ -0.166666666666667011E-01  0.173227088035448999E-01 -0.227142423077076990E-07
+ -0.109342148655364995E-05  0.178145868761214014E-01 -0.166666666666667011E-01
+ -0.139356858353544992E-03  0.172964916979878001E-01 -0.206044330373597012E-07
+ -0.139859905327302997E-03 -0.104722219521747004E-05  0.178145504281299986E-01
+ -0.435504189131306018E-03 -0.166666666666667011E-01  0.194761567651129015E-01
+ -0.120248342376421996E-02  0.000000000000000000E+00 -0.434633990251299008E-03
+ -0.125640745877263009E-02  0.193901277969210986E-01 -0.166666666666667011E-01
+ -0.435069416312690977E-03  0.000000000000000000E+00  0.194633539559233987E-01
+ -0.118120745095251002E-02 -0.435069367813045000E-03 -0.125612755432325993E-02
+  0.193884319840324990E-01 -0.166666666666667011E-01 -0.139670622926057006E-03
+  0.172964757091009988E-01 -0.239682454273153985E-07 -0.141638804410660994E-03
+ -0.121280234601187995E-05  0.178147056852437993E-01 -0.141603983295218001E-03
+  0.178147026131165002E-01 -0.120962066737389997E-05 -0.166666666666667011E-01
+ -0.139664589011853001E-03 -0.239035291403939993E-07  0.172964756492609006E-01
+ -0.166666666666667011E-01 -0.435069662231408008E-03  0.194635113006111984E-01
+ -0.118136680130286010E-02 -0.435069121894098010E-03 -0.125627780048987998E-02
+  0.193885794815036004E-01 -0.161604530780638008E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01  0.172965258603989015E-01 -0.937092380803772050E-07
+ -0.162357917896980013E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.103420292441458007E-05  0.173052119919553000E-01 -0.139467894615960005E-03
+ -0.166666666666667011E-01  0.173042004359115001E-01 -0.214254882847721013E-07
+ -0.140431934037818008E-03 -0.107186911344032006E-05  0.178145703665167007E-01
+ -0.435495844207702974E-03 -0.166666666666667011E-01  0.194039307261216001E-01
+ -0.113156930142864010E-02 -0.434642344857009977E-03 -0.118520221101701007E-02
+  0.193210744918359996E-01 -0.166666666666667011E-01 -0.435081000308743025E-03
+  0.194615626446394997E-01 -0.117973440054902992E-02 -0.435057783388624999E-03
+ -0.125399790053982996E-02  0.193864078136761001E-01 -0.160852239681984991E-03
+ -0.166666666666667011E-01  0.172967516936828013E-01 -0.175963431721074013E-07
+ -0.161650172616747013E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.203401553309515001E-06  0.173054257802153014E-01
+ -0.139669192621242992E-03 -0.166666666666667011E-01  0.173052370218566010E-01
+ -0.234215439038825986E-07 -0.141543308308260998E-03 -0.116637645863834998E-05
+  0.178146593404366997E-01 -0.435080679695976980E-03 -0.166666666666667011E-01
+  0.194640875795392999E-01 -0.118216803173680995E-02 -0.435058104025100973E-03
+ -0.125653847166064996E-02  0.193888504672653991E-01 -0.415397501230701984E-03
+ -0.166666666666667011E-01  0.188279009057511007E-01 -0.898333912356110006E-03
+ -0.453392657545613985E-03 -0.100006119280005999E-02  0.192410594709606017E-01
+ -0.166666666666667011E-01  0.173196351930705005E-01 -0.236100996593661996E-07
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.000000000000000000E+00 -0.123805471953726010E-05  0.178172764971170003E-01
+ -0.166666666666667011E-01  0.172958803103070990E-01 -0.240055324597324986E-07
+ -0.121597905415154999E-05  0.178147088629611004E-01 -0.166666666666667011E-01
+ -0.139360440123895005E-03  0.172964135806168012E-01 -0.206282988179274991E-07
+ -0.139848706255163002E-03 -0.104818854429194995E-05  0.178144673884889013E-01
+ -0.482623514256753016E-03 -0.166666666666667011E-01  0.184436867195339992E-01
+ -0.182833591562523009E-03 -0.374919021333009015E-03 -0.210075563706943995E-03
+  0.183321751743828996E-01 -0.143208310342222002E-03  0.178148148672475012E-01
+ -0.128486278788734005E-05 -0.166666666666667011E-01 -0.199920559280411991E-03
+ -0.199737151953955009E-05  0.178295061512010987E-01 -0.166666666666667011E-01
+ -0.433603816949246982E-03  0.193497617607279009E-01 -0.124503446125193997E-02
+ -0.436528005342376002E-03 -0.119072566655100006E-02  0.194675617176627014E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.172951192319563997E-01
+ -0.214703229569797989E-07 -0.166666666666667011E-01 -0.107141837494050999E-05
+  0.178058595667948995E-01  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.434603197477276975E-03  0.000000000000000000E+00  0.193058537144648011E-01
+ -0.118063268602804994E-02 -0.435534875277424982E-03 -0.111369468853339993E-02
+  0.193919603590869995E-01 -0.166666666666667011E-01  0.173056689021276987E-01
+ -0.235558337294687006E-07 -0.118363055424740992E-05  0.178149857852361003E-01
+ -0.435171466639532013E-03 -0.166666666666667011E-01  0.194709518243956006E-01
+ -0.119094077767639001E-02 -0.434967284384675014E-03 -0.126075351231549003E-02
+  0.193933220864733989E-01 -0.435279548009691016E-03 -0.166666666666667011E-01
+  0.194122108470178995E-01 -0.114336878037841995E-02 -0.434859088315804978E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.000000000000000000E+00 -0.119657203671675997E-02  0.193454506408375984E-01
+ -0.166666666666667011E-01 -0.139670792135546010E-03  0.172964364510711996E-01
+ -0.239726277026338997E-07 -0.141640184749274994E-03 -0.121310638409450998E-05
+  0.178147060027833010E-01 -0.166666666666667011E-01 -0.140348082594213009E-03
+  0.178144298829873010E-01 -0.933774300842211964E-06 -0.140348090712458993E-03
+ -0.933802946513419972E-06  0.178144318617154997E-01 -0.435069485964877014E-03
+  0.194636360495127995E-01 -0.118148312545114003E-02 -0.166666666666667011E-01
+ -0.435069298160832996E-03 -0.125640935270473000E-02  0.193887056176578997E-01
+ -0.166666666666667011E-01 -0.139670745411772993E-03  0.172964393771041017E-01
+ -0.239719351629878006E-07 -0.141639884975546990E-03 -0.121306573574195002E-05
+  0.178147059632727985E-01 -0.166666666666667011E-01  0.193887976520839000E-01
+ -0.125650164472002990E-02 -0.166666666666667011E-01 -0.118157208029668996E-02
+  0.194637273004341985E-01 -0.166666666666667011E-01  0.193887977203370011E-01
+ -0.125650167109275009E-02 -0.118157208884183003E-02  0.194637273863745991E-01
+ -0.166666666666667011E-01 -0.435069390354664992E-03  0.193887118325760006E-01
+ -0.125641384488361994E-02 -0.435069393771072991E-03 -0.118148619567291996E-02
+  0.194636395582745009E-01 -0.139674686773609991E-03 -0.166666666666667011E-01
+  0.173041492371790000E-01 -0.235404446357735999E-07  0.000000000000000000E+00
+ -0.141584559729122987E-03 -0.117458827916782993E-05  0.178146676653142995E-01
+ -0.166666666666667011E-01 -0.139670743898058004E-03  0.000000000000000000E+00
+  0.172964393844322988E-01 -0.239719184224240012E-07 -0.141639876162939988E-03
+ -0.121306489605736002E-05  0.178147059624353017E-01 -0.166666666666667011E-01
+ -0.140348087318788991E-03  0.178144318414940016E-01 -0.933801041250177032E-06
+ -0.140348087395501987E-03 -0.933801311940696952E-06  0.178144318601919996E-01
+ -0.435069392951675977E-03  0.194636364109944011E-01 -0.118148311441449989E-02
+ -0.166666666666667011E-01 -0.435069391174062005E-03 -0.125641070624533989E-02
+  0.193887087793657986E-01 -0.166666666666667011E-01 -0.435069391183394005E-03
+  0.193887815136928009E-01 -0.125648507926827001E-02 -0.435069392942343978E-03
+ -0.118155584839527996E-02  0.194637107839485991E-01 -0.139700198051380994E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.172964151566069001E-01
+ -0.947849285071832001E-08 -0.141287467734219009E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.427106386139514992E-06  0.177495992957071996E-01
+ -0.140178145272992012E-03 -0.166666666666667011E-01  0.177492248109030991E-01
+ -0.496797001355079032E-07 -0.140415042956877005E-03 -0.517779462070214031E-06
+  0.178140139540239988E-01 -0.435069391423600026E-03 -0.166666666666667011E-01
+  0.193887083934986999E-01 -0.125641030816468009E-02 -0.435069392702138011E-03
+ -0.118148272213002004E-02  0.194636360081575990E-01 -0.166666666666667011E-01
+ -0.139670744267195992E-03  0.172964393844029993E-01 -0.239719223657696016E-07
+ -0.141639878293219989E-03 -0.121306508981309993E-05  0.178147059625506990E-01
+ -0.435069391344719005E-03 -0.166666666666667011E-01  0.193887083608434013E-01
+ -0.125641029540731006E-02 -0.435069392781018977E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.118148271833985003E-02
+  0.194636359700672001E-01 -0.435069391241579015E-03 -0.166666666666667011E-01
+  0.193887087819448016E-01 -0.125641070755089994E-02 -0.435069392884159022E-03
+ -0.118148311473540010E-02  0.194636364115348993E-01 -0.139670743900726009E-03
+ -0.166666666666667011E-01  0.172964393843643011E-01 -0.239719184546282004E-07
+ -0.141639876179015995E-03 -0.121306489779411998E-05  0.178147059624346009E-01
+ -0.435069391188663010E-03 -0.166666666666667011E-01  0.193887118630544017E-01
+ -0.125641385909356002E-02 -0.435069392937075027E-03 -0.118148619756065993E-02
+  0.194636395637313997E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.172965077542917012E-01 -0.927568247375718968E-07 -0.166666666666667011E-01
+ -0.102046200973759010E-05  0.173051339160060005E-01 -0.166666666666667011E-01
+  0.193887977549708004E-01 -0.125650168627735996E-02 -0.118157208932236990E-02
+  0.194637273908664990E-01 -0.166666666666667011E-01 -0.435069391170066991E-03
+  0.193884418583467992E-01 -0.125613776955457008E-02 -0.435069392955670991E-03
+ -0.118121619348252006E-02  0.194633634743846991E-01 -0.139670761454596990E-03
+ -0.166666666666667011E-01  0.172964743993801011E-01 -0.239698237251752988E-07
+ -0.141639617487515998E-03 -0.121288294408737000E-05  0.178147057813331997E-01
+ -0.141639617489162006E-03  0.178147057813334009E-01 -0.121288294423791994E-05
+ -0.166666666666667011E-01 -0.139670761454882000E-03 -0.239698237282632997E-07
+  0.172964743993801011E-01 -0.166666666666667011E-01 -0.435069391188130992E-03
+  0.193887118630363987E-01 -0.125641385908596996E-02 -0.435069392937606990E-03
+ -0.118148619756086007E-02  0.194636395637294013E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.172965077542917012E-01 -0.927568247372701942E-07
+ -0.166666666666667011E-01 -0.102046200973822008E-05  0.173051339160060005E-01
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.139674680709100989E-03
+  0.173041376249854009E-01 -0.235410513736760014E-07 -0.141584640574256011E-03
+ -0.117464392253588008E-05  0.178146677207011994E-01 -0.166666666666667011E-01
+  0.193887977549706998E-01 -0.125650168627732006E-02 -0.118157208932236990E-02
+  0.194637273908664990E-01 -0.139674680709099010E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01  0.173041376249854009E-01 -0.235410513737710014E-07
+ -0.141584640574248991E-03 -0.117464392253522998E-05  0.178146677207011994E-01
+ -0.161601715880123993E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.172965076669070010E-01 -0.926692831361944028E-07 -0.162351548531821993E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.000000000000000000E+00
+ -0.102032125850803006E-05  0.173051453880029987E-01 -0.166666666666667011E-01
+ -0.139670774820815007E-03  0.172964393819063991E-01 -0.239722506017439995E-07
+ -0.141640054670844003E-03 -0.121308123358901007E-05  0.178147059780696011E-01
+ -0.166666666666667011E-01 -0.435069333337653979E-03  0.193887254401867992E-01
+ -0.125642901506816990E-02 -0.435069450788072999E-03 -0.118150189097284992E-02
+  0.194636551938420013E-01 -0.140349304427809005E-03  0.178144320166155994E-01
+ -0.933957824287751043E-06 -0.166666666666667011E-01 -0.140349299344085009E-03
+ -0.933939903071356043E-06  0.178144307788948991E-01 -0.166666666666667011E-01
+ -0.435068660852071026E-03  0.000000000000000000E+00  0.193848776728661985E-01
+ -0.125250844957367998E-02 -0.435070123271966982E-03 -0.117767775440582993E-02
+  0.194597409817202013E-01 -0.166666666666667011E-01  0.172969944929179005E-01
+ -0.779115314308188993E-07 -0.166666666666667011E-01 -0.755324938795983959E-05
+  0.179004996942120007E-01 -0.166666666666667011E-01  0.178976999748934017E-01
+ -0.387667548982099008E-05 -0.233628102656076998E-06  0.178135930268625009E-01
+ -0.166666666666667011E-01 -0.139686009466056008E-03  0.172964370591520006E-01
+ -0.241363103321880989E-07 -0.141728009792282988E-03 -0.122115128991173003E-05
+  0.178147135015222988E-01 -0.435039396203468979E-03 -0.166666666666667011E-01
+  0.193948601328561010E-01 -0.126336416897148991E-02  0.000000000000000000E+00
+ -0.435099385069850019E-03 -0.118873902080404996E-02  0.194708368876741998E-01
+ -0.166666666666667011E-01 -0.435069389255753991E-03  0.000000000000000000E+00
+  0.193884422504904987E-01 -0.125613821100519006E-02 -0.435069394869983992E-03
+ -0.118121665337716998E-02  0.194633639337775007E-01 -0.166666666666667011E-01
+ -0.139671380124086001E-03  0.172964743448792008E-01 -0.239764688219899014E-07
+ -0.141643188367961012E-03 -0.121320975071175002E-05  0.178147060922408014E-01
+ -0.141639844172828988E-03  0.178147058017989017E-01 -0.121290395199703009E-05
+ -0.166666666666667011E-01 -0.139670800627344013E-03 -0.239702482699698004E-07
+  0.172964743394126014E-01 -0.166666666666667011E-01 -0.435068235431639017E-03
+  0.193889053227464005E-01 -0.125663654535589997E-02 -0.435070548689851008E-03
+ -0.118172116532546008E-02  0.194638725033117015E-01 -0.161678337891334005E-03
+ -0.166666666666667011E-01  0.172964935301390017E-01 -0.110214153364595997E-06
+ -0.162422470311079996E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.120622187414538000E-05  0.173052150067064016E-01
+ -0.139675704206640992E-03 -0.166666666666667011E-01  0.173040340098533016E-01
+ -0.235576501556051000E-07 -0.141591382929153007E-03 -0.117568258886253002E-05
+  0.178146687262423983E-01 -0.435067411943173014E-03 -0.166666666666667011E-01
+  0.193886625537611004E-01 -0.125640355917502997E-02 -0.435071372170113015E-03
+ -0.118150457751595010E-02  0.194636501950278991E-01 -0.166666666666667011E-01
+ -0.435069317670600008E-03  0.193887257892246985E-01 -0.125642966435953992E-02
+ -0.435069466455119977E-03 -0.118150274118994998E-02  0.194636560434848990E-01
+ -0.161601801933608996E-03 -0.166666666666667011E-01  0.172965054748796011E-01
+ -0.926502679423437959E-07 -0.162351306454973002E-03  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.101988138442591992E-05
+  0.173051393629530988E-01 -0.139695639987958987E-03 -0.166666666666667011E-01
+  0.173041438368830985E-01 -0.237586532625412003E-07 -0.141701626146447995E-03
+ -0.118517343233528007E-05  0.178146776002246002E-01 -0.435031309395256976E-03
+ -0.166666666666667011E-01  0.193945788398414004E-01 -0.126324081468956006E-02
+ -0.435107470132009989E-03 -0.118873511389969004E-02  0.194708001084177990E-01
+ -0.435268203178918020E-03 -0.166666666666667011E-01  0.194803280841455984E-01
+ -0.120221471825937998E-02 -0.434870455402744990E-03 -0.126728100453280001E-02
+  0.194000950994309002E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.172950455023187996E-01 -0.209695226068837989E-07 -0.166666666666667011E-01
+ -0.104309478839938997E-05  0.178040398395854990E-01 -0.166666666666667011E-01
+  0.173086281051332003E-01 -0.234055546297331013E-07 -0.116921175366159004E-05
+  0.178149613334783012E-01 -0.166666666666667011E-01 -0.141165531303230992E-03
+  0.172962108365398992E-01 -0.473445694663900975E-05 -0.544660594244081005E-03
+ -0.346550124345780994E-04  0.183333673483142001E-01 -0.139754726222275990E-03
+ -0.166666666666667011E-01  0.176170084219247000E-01 -0.246149781579885013E-07
+ -0.140206540519596996E-03 -0.567788699151559999E-06  0.178140588497450007E-01
+ -0.165095492820345002E-03  0.178223326710730996E-01 -0.681196695085373968E-05
+ -0.166666666666667011E-01 -0.150450799069420005E-03 -0.294765827683784011E-06
+  0.176166680355617014E-01 -0.166666666666667011E-01 -0.421169440873449990E-03
+  0.189193905894529009E-01 -0.945436157372043042E-03 -0.448291280314557015E-03
+ -0.100548420996317994E-02  0.192537236307864998E-01 -0.166666666666667011E-01
+  0.173199119681975988E-01 -0.235984477489147988E-07  0.000000000000000000E+00
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.123676178966295993E-05  0.178172740173286014E-01 -0.166666666666667011E-01
+ -0.435171428461128974E-03  0.194709389729267983E-01 -0.119092785894799004E-02
+ -0.434967322587667975E-03 -0.126074169224678996E-02  0.193933111644412001E-01
+ -0.166666666666667011E-01  0.172958818582512990E-01 -0.240054389194565998E-07
+ -0.121597096057541996E-05  0.178147088549035007E-01 -0.434490077563752020E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.192819871000558009E-01 -0.115810409202821997E-02 -0.435647599072524984E-03
+ -0.109302488139546004E-02  0.193702791629623006E-01 -0.435071227206677987E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.194623470628700994E-01
+ -0.118137412448114003E-02 -0.435067556908351005E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01  0.000000000000000000E+00 -0.125600599177529007E-02
+  0.193876331694533000E-01 -0.166666666666667011E-01 -0.139670743889624998E-03
+  0.172964393844356017E-01 -0.239719183317204011E-07 -0.141639876114236000E-03
+ -0.121306489159398005E-05  0.178147059624313014E-01 -0.166666666666667011E-01
+ -0.140348087313353994E-03  0.178144318418225998E-01 -0.933801044951077008E-06
+ -0.140348087388714990E-03 -0.933801310870910963E-06  0.178144318601911010E-01
+ -0.435069392936102985E-03  0.194636364108025997E-01 -0.118148311410958993E-02
+ -0.166666666666667011E-01 -0.435069391189634998E-03 -0.125641070617519006E-02
+  0.193887087796365994E-01 -0.166666666666667011E-01 -0.139670745273937991E-03
+  0.172964393844580004E-01 -0.239719332022523000E-07 -0.141639884104326007E-03
+ -0.121306562274284990E-05  0.178147059631592990E-01 -0.166666666666667011E-01
+  0.193887976615549004E-01 -0.125650164862212989E-02 -0.166666666666667011E-01
+ -0.118157208096353003E-02  0.194637273070949017E-01 -0.166666666666667011E-01
+  0.193887977248977002E-01 -0.125650167309232009E-02 -0.118157208890513009E-02
+  0.194637273869662994E-01 -0.166666666666667011E-01 -0.435069391187089996E-03
+  0.193887118682170012E-01 -0.125641386440402997E-02 -0.435069392938647987E-03
+ -0.118148620277655990E-02  0.194636395690588986E-01 -0.139674686768426990E-03
+ -0.166666666666667011E-01  0.173041492303226997E-01 -0.235404449782952015E-07
+  0.000000000000000000E+00 -0.141584559767933007E-03 -0.117458831126650007E-05
+  0.178146676653491987E-01 -0.166666666666667011E-01 -0.139670743889684006E-03
+  0.000000000000000000E+00  0.172964393844356017E-01 -0.239719183323613007E-07
+ -0.141639876114578011E-03 -0.121306489162555998E-05  0.178147059624313986E-01
+ -0.166666666666667011E-01 -0.140348087304587000E-03  0.178144318418189986E-01
+ -0.933801043795399019E-06 -0.140348087379959000E-03 -0.933801309750847956E-06
+  0.178144318601899006E-01 -0.435069392936221976E-03  0.194636364108877989E-01
+ -0.118148311419265998E-02 -0.166666666666667011E-01 -0.435069391189516982E-03
+ -0.125641070625854005E-02  0.193887087797160983E-01 -0.166666666666667011E-01
+ -0.435069391192432022E-03  0.193887815134137984E-01 -0.125648507878967997E-02
+ -0.435069392933306015E-03 -0.118155584779333005E-02  0.194637107833882002E-01
+ -0.139700198049885988E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.172964151566860000E-01 -0.947849280219223922E-08 -0.141287467724896009E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.427106383695399994E-06
+  0.177495992954452009E-01 -0.140178145690196993E-03 -0.166666666666667011E-01
+  0.177492248106932010E-01 -0.496797033573775009E-07 -0.140415043482606990E-03
+ -0.517779497585701998E-06  0.178140139541216984E-01 -0.435069391089031011E-03
+ -0.166666666666667011E-01  0.193887087977068009E-01 -0.125641072675559008E-02
+ -0.435069393036708002E-03 -0.118148313570800006E-02  0.194636364323594983E-01
+ -0.166666666666667011E-01 -0.139670745392212007E-03  0.172964393844482998E-01
+ -0.239719344734911991E-07 -0.141639884787109996E-03 -0.121306568527222010E-05
+  0.178147059632214992E-01 -0.435069390245067006E-03 -0.166666666666667011E-01
+  0.193887086999318010E-01 -0.125641068746266005E-02 -0.435069393880670977E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.118148312670310991E-02  0.194636363420862997E-01 -0.435069390179441007E-03
+ -0.166666666666667011E-01  0.193887087448267985E-01 -0.125641069098422005E-02
+ -0.435069393946296975E-03 -0.118148311374637006E-02  0.194636364063980986E-01
+ -0.139670743899172997E-03 -0.166666666666667011E-01  0.172964393844271015E-01
+ -0.239719184348657009E-07 -0.141639876169436012E-03 -0.121306489668503007E-05
+  0.178147059624364015E-01 -0.435069391188929019E-03 -0.166666666666667011E-01
+  0.193887118630633008E-01 -0.125641385909733001E-02 -0.435069392936809993E-03
+ -0.118148619756053004E-02  0.194636395637322983E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.172965077542917983E-01 -0.927568247378536014E-07
+ -0.166666666666667011E-01 -0.102046200973724006E-05  0.173051339160060005E-01
+ -0.166666666666667011E-01  0.193887977549708004E-01 -0.125650168627737991E-02
+ -0.118157208932236990E-02  0.194637273908664990E-01 -0.166666666666667011E-01
+ -0.435069391171351012E-03  0.193884418583585988E-01 -0.125613776953759993E-02
+ -0.435069392954388001E-03 -0.118121619344631009E-02  0.194633634743581994E-01
+ -0.139670761454455990E-03 -0.166666666666667011E-01  0.172964743993848993E-01
+ -0.239698237233977009E-07 -0.141639617486649991E-03 -0.121288294398642993E-05
+  0.178147057813330990E-01 -0.141639617486657987E-03  0.178147057813330990E-01
+ -0.121288294398714991E-05 -0.166666666666667011E-01 -0.139670761454456992E-03
+ -0.239698237233852998E-07  0.172964743993848993E-01 -0.166666666666667011E-01
+ -0.435069391188925984E-03  0.193887118630632002E-01 -0.125641385909728990E-02
+ -0.435069392936811999E-03 -0.118148619756053004E-02  0.194636395637322983E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.172965077542917983E-01
+ -0.927568247378527941E-07 -0.166666666666667011E-01 -0.102046200973724006E-05
+  0.173051339160060005E-01  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.139674680709099010E-03  0.173041376249854009E-01 -0.235410513736267014E-07
+ -0.141584640574244004E-03 -0.117464392253481007E-05  0.178146677207011994E-01
+ -0.166666666666667011E-01  0.193887977549708004E-01 -0.125650168627737991E-02
+ -0.118157208932236990E-02  0.194637273908664990E-01 -0.139674680709099010E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.173041376249854009E-01
+ -0.235410513735998015E-07 -0.141584640574244004E-03 -0.117464392253481007E-05
+  0.178146677207011994E-01 -0.161601720324868993E-03 -0.166666666666667011E-01
+  0.172965076669642989E-01 -0.926701979568355970E-07 -0.162351552541144001E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.000000000000000000E+00 -0.102033090018047992E-05  0.173051453908108985E-01
+ -0.166666666666667011E-01 -0.435069391175761004E-03  0.193884418585090999E-01
+ -0.125613776960263991E-02 -0.435069392949976978E-03 -0.118121619344674009E-02
+  0.194633634743759006E-01 -0.166666666666667011E-01 -0.139670761454455990E-03
+  0.172964743993851006E-01 -0.239698237233875994E-07 -0.141639617486650994E-03
+ -0.121288294398543001E-05  0.178147057813330990E-01 -0.141639617476079996E-03
+  0.178147057813322005E-01 -0.121288294301813998E-05 -0.166666666666667011E-01
+ -0.139670761452624013E-03 -0.239698237036860001E-07  0.172964743993851006E-01
+ -0.166666666666667011E-01 -0.435069391192448990E-03  0.193887815134013014E-01
+ -0.125648507877692997E-02 -0.435069392933288993E-03 -0.118155584778073010E-02
+  0.194637107833748012E-01 -0.166666666666667011E-01  0.172964151566857016E-01
+ -0.947849086778118929E-08 -0.166666666666667011E-01 -0.427106298729459992E-06
+  0.177495992959284983E-01 -0.166666666666667011E-01  0.177492248112494991E-01
+ -0.496796927082655975E-07 -0.517779383578772961E-06  0.178140139540073003E-01
+ -0.166666666666667011E-01 -0.435069391188929019E-03  0.193887118675337006E-01
+ -0.125641386366846992E-02 -0.435069392936809993E-03 -0.118148620203093000E-02
+  0.194636395683033987E-01 -0.139674686766604988E-03 -0.166666666666667011E-01
+  0.173041492306688985E-01 -0.235404449394736996E-07  0.000000000000000000E+00
+ -0.141584559754340987E-03 -0.117458830859994993E-05  0.178146676653464994E-01
+ -0.166666666666667011E-01 -0.139670743889622992E-03  0.000000000000000000E+00
+  0.172964393844356017E-01 -0.239719183316415013E-07 -0.141639876114222989E-03
+ -0.121306489159275990E-05  0.178147059624313014E-01 -0.166666666666667011E-01
+ -0.140348087304557998E-03  0.178144318418215000E-01 -0.933801043825826983E-06
+ -0.140348087379918993E-03 -0.933801309744065975E-06  0.178144318601899006E-01
+ -0.435069392936097998E-03  0.194636364108022007E-01 -0.118148311410921003E-02
+ -0.166666666666667011E-01 -0.435069391189641015E-03 -0.125641070617487000E-02
+  0.193887087796364016E-01 -0.166666666666667011E-01 -0.435069391192461025E-03
+  0.193887815133983003E-01 -0.125648507877364007E-02 -0.435069392933277012E-03
+ -0.118155584777734001E-02  0.194637107833714011E-01 -0.139700198047769001E-03
+ -0.166666666666667011E-01  0.172964151566880990E-01 -0.947849280983375026E-08
+ -0.141287467715877995E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.427106384123253011E-06  0.177495992955506998E-01
+ -0.140178145551705995E-03 -0.166666666666667011E-01  0.177492248107965003E-01
+ -0.496797023203806995E-07 -0.140415043308048001E-03 -0.517779485956588989E-06
+  0.178140139541092986E-01 -0.435069391188775008E-03 -0.166666666666667011E-01
+  0.193887087796628006E-01 -0.125641070621944004E-02 -0.435069392936964004E-03
+ -0.118148311416520994E-02  0.194636364108560014E-01 -0.166666666666667011E-01
+ -0.139670745283318996E-03  0.172964393844581010E-01 -0.239719333030583992E-07
+ -0.141639884158474994E-03 -0.121306562769789989E-05  0.178147059631642984E-01
+ -0.435069390314679995E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.193887086857719992E-01 -0.125641066837268002E-02 -0.435069393811057987E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.118148310576349008E-02
+  0.194636363266590985E-01  0.000000000000000000E+00 -0.435069390314679995E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.193887087494363994E-01
+ -0.125641069296695000E-02 -0.435069393811057987E-03 -0.118148311374545001E-02
+  0.194636364069358005E-01 -0.139670743897207989E-03 -0.166666666666667011E-01
+  0.172964393844356988E-01 -0.239719184131230996E-07 -0.141639876158003994E-03
+ -0.121306489559900995E-05  0.178147059624353017E-01 -0.435069391188779996E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.193887088940556986E-01
+ -0.125641082318989007E-02 -0.435069392936957987E-03 -0.118148322855787997E-02
+  0.194636365278264001E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.172964393409659987E-01 -0.239438466648548014E-07 -0.166666666666667011E-01
+ -0.121143526110419994E-05  0.178146131934183993E-01 -0.166666666666667011E-01
+  0.172964393844281007E-01 -0.239719133478075007E-07 -0.121306464654854992E-05
+  0.178147059621872987E-01 -0.166666666666667011E-01 -0.435069391175080993E-03
+  0.193884418584855008E-01 -0.125613776959208998E-02 -0.435069392950656990E-03
+ -0.118121619344611992E-02  0.194633634743727989E-01 -0.139670761454532995E-03
+ -0.166666666666667011E-01  0.172964743993852012E-01 -0.239698237241572016E-07
+ -0.141639617487093999E-03 -0.121288294402584004E-05  0.178147057813331997E-01
+ -0.141639617487093999E-03  0.178147057813331997E-01 -0.121288294402580997E-05
+ -0.166666666666667011E-01 -0.139670761454532995E-03 -0.239698237241842008E-07
+  0.172964743993852012E-01 -0.166666666666667011E-01 -0.435069391188779996E-03
+  0.193887088940556986E-01 -0.125641082318989007E-02 -0.435069392936957987E-03
+ -0.118148322855787997E-02  0.194636365278264001E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.172964393409659987E-01 -0.239438466648548014E-07
+ -0.166666666666667011E-01 -0.121143526110419994E-05  0.178146131934183993E-01
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.435064994898318983E-03
+  0.000000000000000000E+00  0.193885567754814997E-01 -0.125634406113793993E-02
+ -0.435073789166010982E-03 -0.118148100321111005E-02  0.194636166968597014E-01
+ -0.166666666666667011E-01  0.172964393844281007E-01 -0.239719133478075007E-07
+ -0.121306464654854992E-05  0.178147059621872987E-01 -0.435064994898318983E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.193885567754814997E-01 -0.125634406113793993E-02 -0.435073789166010982E-03
+ -0.118148100321111005E-02  0.194636166968597014E-01 -0.161601720097545002E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.172965076670156016E-01
+ -0.926701503617335961E-07 -0.162351552330414998E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01  0.000000000000000000E+00 -0.102033039390967000E-05
+  0.173051453906522996E-01 -0.166666666666667011E-01 -0.435069886305821003E-03
+  0.194633772872509006E-01 -0.118124110147639996E-02 -0.435068897819141016E-03
+ -0.125613724075302000E-02  0.193884430666162004E-01 -0.166666666666667011E-01
+ -0.139670761026265005E-03  0.172964744480312016E-01 -0.239698159440932015E-07
+ -0.141639614515128010E-03 -0.121288245172214010E-05  0.178147057808154992E-01
+ -0.141639611356817998E-03  0.178147057804570984E-01 -0.121288216154248992E-05
+ -0.166666666666667011E-01 -0.139670760479100993E-03 -0.239698100456143004E-07
+  0.172964744480210014E-01 -0.166666666666667011E-01 -0.435069393124601020E-03
+  0.194636395974100984E-01 -0.118148625156355995E-02 -0.435069391001137993E-03
+ -0.125641387127325998E-02  0.193887118958671993E-01 -0.166666666666667011E-01
+  0.172965077543725011E-01 -0.927568140336766949E-07 -0.166666666666667011E-01
+ -0.102046181269095990E-05  0.173051339147857995E-01 -0.166666666666667011E-01
+  0.173041376239574003E-01 -0.235410465544014012E-07 -0.117464369127346007E-05
+  0.178146677204709010E-01 -0.166666666666667011E-01 -0.435069392055963979E-03
+  0.193887089236092006E-01 -0.125641083591904006E-02 -0.435069392069774004E-03
+ -0.118148322857497003E-02  0.194636365312624987E-01 -0.435064994897326992E-03
+ -0.166666666666667011E-01  0.193885567754197990E-01 -0.125634406109512999E-02
+  0.000000000000000000E+00 -0.435073789167003027E-03 -0.118148100318356004E-02
+  0.194636166968270991E-01 -0.166666666666667011E-01 -0.139670744005780998E-03
+  0.000000000000000000E+00  0.172964393844374995E-01 -0.239719195794844008E-07
+ -0.141639876784672010E-03 -0.121306495294346998E-05  0.178147059624923984E-01
+ -0.166666666666667011E-01 -0.435069390971013978E-03  0.193887087720762998E-01
+ -0.125641070286031001E-02 -0.435069393154724005E-03 -0.118148311400433992E-02
+  0.194636364098218009E-01 -0.140348087384463996E-03  0.178144318601930994E-01
+ -0.933801313166044989E-06 -0.166666666666667011E-01 -0.140348087290235009E-03
+ -0.933800980670895958E-06  0.178144318372258983E-01 -0.166666666666667011E-01
+ -0.139670745283319999E-03  0.172964393844581010E-01 -0.239719333030395990E-07
+ -0.141639884158481011E-03 -0.121306562769829991E-05  0.178147059631642984E-01
+ -0.435069390314711004E-03 -0.166666666666667011E-01  0.193887086857633985E-01
+ -0.125641066836418010E-02  0.000000000000000000E+00 -0.435069393811026979E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.118148310575508990E-02  0.194636363266489990E-01 -0.435069390314652023E-03
+ -0.166666666666667011E-01  0.193887087494357992E-01 -0.125641069296695998E-02
+ -0.435069393811086014E-03 -0.118148311374589995E-02  0.194636364069360017E-01
+ -0.139670743896895007E-03 -0.166666666666667011E-01  0.172964393844356017E-01
+ -0.239719184097173985E-07 -0.141639876156198012E-03 -0.121306489543125995E-05
+  0.178147059624349999E-01 -0.166666666666667011E-01 -0.435069391192448990E-03
+  0.193887815133984009E-01 -0.125648507877395990E-02 -0.435069392933288993E-03
+ -0.118155584777784005E-02  0.194637107833718001E-01 -0.139700198047441003E-03
+ -0.166666666666667011E-01  0.172964151566879984E-01 -0.947849280858640987E-08
+ -0.141287467714373990E-03  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.427106384068703984E-06
+  0.177495992955511994E-01 -0.140178145551758009E-03 -0.166666666666667011E-01
+  0.177492248107972012E-01 -0.496797023215807996E-07 -0.140415043308110993E-03
+ -0.517779485960914998E-06  0.178140139541092986E-01 -0.435069391188743024E-03
+ -0.166666666666667011E-01  0.193887087796531001E-01 -0.125641070621038001E-02
+ -0.435069392936995988E-03 -0.118148311415690994E-02  0.194636364108470017E-01
+ -0.139670745278628006E-03 -0.166666666666667011E-01  0.172964393844581010E-01
+ -0.239719332526419989E-07 -0.141639884131401002E-03 -0.121306562522040010E-05
+  0.178147059631618004E-01  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.193887976612414012E-01 -0.125650164849316990E-02  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.118157208094097992E-02  0.194637273068695993E-01
+ -0.166666666666667011E-01  0.172964393844281007E-01 -0.239719133478075007E-07
+ -0.121306464654854992E-05  0.178147059621872987E-01 -0.166666666666667011E-01
+ -0.139670744005789997E-03  0.172964393844378984E-01 -0.239719195795233015E-07
+ -0.141639876784720013E-03 -0.121306495294561001E-05  0.178147059624923984E-01
+ -0.435069390970999992E-03 -0.166666666666667011E-01  0.193887087720760014E-01
+ -0.125641070286029006E-02 -0.435069393154737991E-03 -0.118148311400450992E-02
+  0.194636364098219015E-01 -0.140348087384448004E-03  0.178144318601930994E-01
+ -0.933801313164241021E-06 -0.166666666666667011E-01 -0.140348087290217987E-03
+ -0.933800980664578045E-06  0.178144318372256000E-01 -0.166666666666667011E-01
+ -0.435069391192457989E-03  0.193887815133986993E-01 -0.125648507877402994E-02
+ -0.435069392933279994E-03 -0.118155584777777001E-02  0.194637107833719007E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.172964151566877000E-01
+ -0.947849244191978996E-08 -0.166666666666667011E-01 -0.427106377284980004E-06
+  0.177495993062723005E-01 -0.166666666666667011E-01 -0.140178145569362010E-03
+  0.177492248215363989E-01 -0.496797136596620033E-07 -0.140415043288559006E-03
+ -0.517779487315692952E-06  0.178140139541107002E-01 -0.166666666666667011E-01
+  0.193887977549708004E-01 -0.125650168627737991E-02 -0.118157208932236990E-02
+  0.194637273908664990E-01 -0.435069390319028026E-03 -0.166666666666667011E-01
+  0.193887087496057986E-01 -0.125641069305237993E-02 -0.435069393806710987E-03
+ -0.118148311376662990E-02  0.194636364069748005E-01 -0.139670783800487997E-03
+ -0.166666666666667011E-01  0.172964393410236991E-01 -0.239438517116492010E-07
+ -0.141639277731674996E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01  0.000000000000000000E+00 -0.121143550903037006E-05
+  0.178146131936444997E-01 -0.166666666666667011E-01 -0.435630521898627018E-03
+  0.193599440195765006E-01 -0.109077905787125992E-02 -0.434507214005837999E-03
+ -0.113826801288032008E-02  0.192764807435569993E-01 -0.166666666666667011E-01
+ -0.139536670047989993E-03  0.173126649368985017E-01 -0.217199091649137011E-07
+ -0.140761520198303991E-03 -0.106837084216960004E-05  0.178145668887513993E-01
+ -0.559739137191329981E-03  0.187342109887733005E-01 -0.453812468425848997E-03
+ -0.166666666666667011E-01 -0.144437867213283009E-03 -0.570571543797145032E-04
+  0.173662394826822990E-01 -0.166666666666667011E-01 -0.141554480273248993E-03
+  0.173902726296997004E-01 -0.612977187720352984E-05 -0.543269256103840045E-03
+ -0.155576690918531002E-04  0.183343607413550998E-01 -0.166666666666667011E-01
+  0.190435224881807010E-01 -0.109812319463910009E-02 -0.166666666666667011E-01
+ -0.111910148431286992E-02  0.192002058717063995E-01 -0.166666666666667011E-01
+  0.191810336689041995E-01 -0.116112314020530008E-02 -0.117547931931102998E-02
+  0.194335300141195988E-01 -0.166666666666667011E-01 -0.435330234208519001E-03
+  0.194710205585382004E-01 -0.119426070267823997E-02 -0.434808332877938987E-03
+ -0.125626104852707998E-02  0.193894446024132014E-01 -0.140176366249097992E-03
+ -0.166666666666667011E-01  0.177492266081854984E-01 -0.496668683742545015E-07
+  0.000000000000000000E+00 -0.140412798776869000E-03 -0.517630021288783052E-06
+  0.178140137898258992E-01 -0.166666666666667011E-01 -0.435069391186561989E-03
+  0.000000000000000000E+00  0.193884418600598005E-01 -0.125613777099214995E-02
+ -0.435069392939175994E-03 -0.118121619465940005E-02  0.194633634756198985E-01
+ -0.166666666666667011E-01 -0.139670763690922996E-03  0.172964743992523005E-01
+ -0.239698477564660998E-07 -0.141639630395104001E-03 -0.121288412595031991E-05
+  0.178147057825145012E-01 -0.141639614589797013E-03  0.178147057809878995E-01
+ -0.121288267825071997E-05 -0.166666666666667011E-01 -0.139670760952332988E-03
+ -0.239698183155154984E-07  0.172964743992174985E-01 -0.166666666666667011E-01
+ -0.435069386976256979E-03  0.193887089986198996E-01 -0.125641101907938003E-02
+ -0.435069397149481980E-03 -0.118148348210879005E-02  0.194636367633517007E-01
+ -0.139670105391945004E-03 -0.166666666666667011E-01  0.172964392993067012E-01
+ -0.239365206872189997E-07  0.000000000000000000E+00 -0.141635362111391987E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.121107470134998995E-05  0.178146126578259990E-01  0.000000000000000000E+00
+ -0.435064987171317002E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.193885562197462988E-01 -0.125634365104461995E-02 -0.435073796892796990E-03
+ -0.118148071382402999E-02  0.194636163662551993E-01 -0.139670481067569999E-03
+ -0.166666666666667011E-01  0.172964394190999009E-01 -0.239690924292986984E-07
+ -0.141638358730468997E-03 -0.121292585922250005E-05  0.178147058233791983E-01
+ -0.166666666666667011E-01 -0.435069392178805006E-03  0.194636362774779012E-01
+ -0.118148298166642000E-02 -0.435069391946934006E-03 -0.125641057978546000E-02
+  0.193887086723010006E-01 -0.139670520926417992E-03 -0.166666666666667011E-01
+  0.172964393709954013E-01 -0.239410464318514008E-07 -0.141637761129918003E-03
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.121129767514461000E-05  0.178146131114593001E-01
+  0.000000000000000000E+00 -0.435064997641161015E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.193885573025995017E-01 -0.125634454891261995E-02
+ -0.435073786423245982E-03 -0.118148144256120997E-02  0.194636171497057989E-01
+ -0.139670065452413992E-03 -0.166666666666667011E-01  0.172964393188763006E-01
+ -0.239646299428580996E-07 -0.141635960674409009E-03 -0.121270661878923007E-05
+  0.178147055903274988E-01 -0.434952598751025003E-03 -0.166666666666667011E-01
+  0.193531525592223995E-01 -0.122237998047711009E-02 -0.435186141503925985E-03
+ -0.114985437531923995E-02  0.194307215840422015E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.173112910791177986E-01 -0.253865277378783987E-04
+ -0.166666666666667011E-01 -0.138259363976029009E-03  0.178692219185750990E-01
+ -0.166666666666667011E-01  0.193634106903348008E-01 -0.124536794862644003E-02
+ -0.118108829657000993E-02  0.194595580860945985E-01 -0.166666666666667011E-01
+ -0.435070677644360025E-03  0.194631053235192995E-01 -0.118099346864504007E-02
+ -0.435068106476127980E-03 -0.125584229082854990E-02  0.193881574850998002E-01
+ -0.139730287993055990E-03 -0.166666666666667011E-01  0.172965129995646012E-01
+ -0.246134321794111988E-07 -0.141982715456250990E-03 -0.124443001707338994E-05
+  0.178147384368366989E-01 -0.141918001868253997E-03  0.178147321831826008E-01
+ -0.123839758836049003E-05 -0.166666666666667011E-01 -0.139719073147306011E-03
+ -0.244906574575426016E-07  0.172965128605519006E-01 -0.166666666666667011E-01
+ -0.434974886832897021E-03  0.193565238210357989E-01 -0.122539181989999007E-02
+ -0.435163868604747015E-03 -0.115249078341532994E-02  0.194335198475173998E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.173108110594776994E-01
+ -0.252486229488234996E-04 -0.166666666666667011E-01 -0.137635746082908988E-03
+  0.178684051236191986E-01 -0.166666666666667011E-01 -0.140103968298790010E-03
+  0.173357792454707990E-01 -0.263227882029069994E-07 -0.143405882709704010E-03
+ -0.122767817853530995E-05  0.178147220760386006E-01 -0.166666666666667011E-01
+  0.193641798191387000E-01 -0.124570775063769998E-02 -0.118110177773261002E-02
+  0.194596602746646002E-01 -0.140117598274002998E-03 -0.166666666666667011E-01
+  0.173359378634117996E-01 -0.264605452288720992E-07 -0.143470065352011001E-03
+ -0.123349701123627008E-05  0.178147283869216008E-01 -0.139699930872859987E-03
+ -0.166666666666667011E-01  0.172964334628613985E-01 -0.947718670749511931E-08
+ -0.141286093437001994E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01  0.000000000000000000E+00 -0.427033784487718019E-06
+  0.177496010453740000E-01 -0.166666666666667011E-01 -0.435116790829975981E-03
+  0.188852131512914009E-01 -0.625688270833019968E-03 -0.435021984600298978E-03
+ -0.676914763138863006E-03  0.188324695108036989E-01 -0.166666666666667011E-01
+ -0.139445451109797009E-03  0.174062618540683015E-01 -0.190292153995733009E-07
+ -0.139963261540874002E-03 -0.765758082281971048E-06  0.178142630176260988E-01
+ -0.532898885828498952E-03  0.187657876969587016E-01 -0.512856505158025013E-03
+ -0.166666666666667011E-01 -0.274567397040450026E-03 -0.183874311667027007E-03
+  0.177330722022739987E-01 -0.166666666666667011E-01 -0.142934406988999004E-03
+  0.173279989518831008E-01 -0.109940052362783001E-06 -0.158611614292560006E-03
+ -0.505928282170762005E-05  0.178189142473604983E-01 -0.166666666666667011E-01
+  0.191837857135102006E-01 -0.116715933292911990E-02 -0.166666666666667011E-01
+ -0.115261550950918989E-02  0.193075009282458986E-01 -0.166666666666667011E-01
+  0.192715504188021997E-01 -0.120392392247149001E-02 -0.117898318238484007E-02
+  0.194477863400541011E-01 -0.166666666666667011E-01 -0.435541715949862990E-03
+  0.194766610313947992E-01 -0.120362756621229003E-02 -0.434596353936575995E-03
+ -0.125584159877691992E-02  0.193897157775975992E-01 -0.435053772892238027E-03
+ -0.166666666666667011E-01  0.193916866829808993E-01 -0.125980517520797000E-02
+ -0.435085010459394024E-03 -0.118504095739005010E-02  0.194671524144910991E-01
+ -0.166666666666667011E-01 -0.139670776269950009E-03  0.172964393783541989E-01
+ -0.239722661868781990E-07 -0.141640063065594013E-03 -0.121308200605163008E-05
+  0.178147059781510013E-01 -0.166666666666667011E-01 -0.435069330991408006E-03
+  0.193887306635153987E-01 -0.125643445841789009E-02 -0.435069453134317996E-03
+ -0.118150726982896000E-02  0.194636605867372983E-01 -0.140346027430057995E-03
+  0.178144315994148983E-01 -0.933532497931826955E-06 -0.166666666666667011E-01
+ -0.140345973839123002E-03 -0.933513688386790009E-06  0.178144303098709995E-01
+ -0.166666666666667011E-01 -0.139688634719731995E-03  0.172964363725470992E-01
+ -0.241646447296168007E-07 -0.141743168415957993E-03 -0.122254521803203994E-05
+  0.178147146809073016E-01 -0.435059870297020007E-03 -0.166666666666667011E-01
+  0.193910369733737015E-01 -0.125956508747994997E-02  0.000000000000000000E+00
+ -0.435078913540760980E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.118493531740769997E-02  0.194660938610356007E-01
+  0.000000000000000000E+00 -0.435045710174240005E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.193875426795061985E-01 -0.125569402526346007E-02
+ -0.435093072169449998E-03 -0.118112078544045998E-02  0.194631751705481998E-01
+ -0.139419879842097997E-03 -0.166666666666667011E-01  0.172964832727322999E-01
+ -0.213161584778305005E-07 -0.140230214929005993E-03 -0.108242332731610002E-05
+  0.178145871202936984E-01 -0.166666666666667011E-01 -0.435070648727890980E-03
+  0.194635173256722006E-01 -0.118139703309657999E-02 -0.435068135392832026E-03
+ -0.125625483745070996E-02  0.193885626932298015E-01 -0.139434374869739004E-03
+ -0.166666666666667011E-01  0.172964444175751006E-01 -0.849107878205794962E-08
+ -0.140068522154645993E-03  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.383811097207312005E-06
+  0.177500579653601005E-01 -0.140208513794423006E-03 -0.166666666666667011E-01
+  0.177497183126398006E-01 -0.504600381181250993E-07 -0.140451251998147000E-03
+ -0.520324482149772044E-06  0.178140160927331009E-01 -0.435048849909143985E-03
+ -0.166666666666667011E-01  0.193915158406403991E-01 -0.125973026996141989E-02
+ -0.435089932877478006E-03 -0.118503856970659008E-02  0.194671300235494012E-01
+ -0.435388199753579007E-03 -0.166666666666667011E-01  0.194580790950915003E-01
+ -0.118263497490555007E-02 -0.434750258397953015E-03 -0.124178531210186001E-02
+  0.193754724111637015E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.173081480358036992E-01 -0.244895773689184990E-04 -0.166666666666667011E-01
+ -0.133922698934493990E-03  0.178635891343980995E-01 -0.166666666666667011E-01
+  0.193849898137051016E-01 -0.125483132565947994E-02 -0.118151827368068998E-02
+  0.194632328928430985E-01 -0.166666666666667011E-01 -0.142602728611135999E-03
+  0.172960572372906005E-01 -0.117224517748023995E-06 -0.158587369045146999E-03
+ -0.567482454000282988E-05  0.178189579824735989E-01 -0.139720580884206998E-03
+ -0.166666666666667011E-01  0.177036794314488002E-01 -0.306504126161977009E-07
+ -0.139932852514181998E-03 -0.499375406900951968E-06  0.178139944442497998E-01
+ -0.546160778925739950E-03  0.184421024561668007E-01 -0.894165394995071063E-04
+ -0.166666666666667011E-01 -0.147442848397543992E-03 -0.240246210218879017E-04
+  0.177137736588962001E-01 -0.166666666666667011E-01 -0.400413710403862977E-03
+  0.186079235147511013E-01 -0.750380380436990973E-03 -0.465646679664696004E-03
+ -0.925379800643594003E-03  0.191586075383834986E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.177029162142695005E-01 -0.193968097316926012E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00 -0.245143979550076002E-03
+  0.179475102930650998E-01 -0.166666666666667011E-01 -0.140401212229714989E-03
+  0.173460971401947009E-01 -0.290712390926593986E-07 -0.144639789824670004E-03
+ -0.132223121416671002E-05  0.178148169771682983E-01 -0.166666666666667011E-01
+  0.193485878399233015E-01 -0.123880031430750989E-02 -0.118081271609363995E-02
+  0.194575794108941998E-01 -0.139755721134129007E-03 -0.166666666666667011E-01
+  0.173348680988957005E-01 -0.228988835753830005E-07 -0.141707074016601988E-03
+ -0.107451126571115990E-05  0.178145682418863992E-01 -0.139419950330182008E-03
+ -0.166666666666667011E-01  0.172964916756991986E-01 -0.212265241962166009E-07
+ -0.140190964959546998E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.107696768439272990E-05  0.178142558899423015E-01
+ -0.166666666666667011E-01 -0.435630511714110973E-03  0.193599454053489002E-01
+ -0.109078026057308000E-02 -0.434507224229087005E-03 -0.113826963448308990E-02
+  0.192764822629944987E-01 -0.166666666666667011E-01 -0.139539348987166009E-03
+  0.173126647078671002E-01 -0.217461924902823005E-07 -0.140774574657894001E-03
+ -0.106961747895517997E-05  0.178145678787864983E-01 -0.559739151621855982E-03
+  0.187342113122531996E-01 -0.453812905743477017E-03 -0.166666666666667011E-01
+ -0.144437865847829008E-03 -0.570572508916922003E-04  0.173662393449596011E-01
+ -0.166666666666667011E-01 -0.141554479709531003E-03  0.173902724631262995E-01
+ -0.612976885063492026E-05 -0.543269258690614047E-03 -0.155576910986749004E-04
+  0.183343607371364015E-01 -0.166666666666667011E-01  0.190435223900498989E-01
+ -0.109812314311903995E-02 -0.166666666666667011E-01 -0.111910146500585994E-02
+  0.192002058225646012E-01 -0.166666666666667011E-01  0.191810336309519991E-01
+ -0.116112312176440999E-02 -0.117547931761240003E-02  0.194335300082579994E-01
+ -0.166666666666667011E-01 -0.435325182482829002E-03  0.194712654108410016E-01
+ -0.119439354715173991E-02 -0.434813392979988976E-03 -0.125665050313407000E-02
+  0.193897929840027988E-01 -0.140180749732543006E-03 -0.166666666666667011E-01
+  0.177492505912984003E-01 -0.497281655052260991E-07  0.000000000000000000E+00
+ -0.140418216862212009E-03 -0.517996101551560007E-06  0.178140139712071009E-01
+ -0.166666666666667011E-01 -0.435069320552657990E-03  0.193884456575911003E-01
+ -0.125614320010757996E-02 -0.435069463573064001E-03 -0.118122256466309992E-02
+  0.194633694941631005E-01 -0.166666666666667011E-01 -0.139670893318960006E-03
+  0.172964735659094995E-01 -0.239712911883834992E-07 -0.141640386974580993E-03
+ -0.121295694635543998E-05  0.178147058443772002E-01 -0.141657679701159996E-03
+  0.178147071052390006E-01 -0.121453479340950007E-05 -0.166666666666667011E-01
+ -0.139673890220687005E-03 -0.240034008201301999E-07  0.172964735798710016E-01
+ -0.166666666666667011E-01 -0.435069148260108026E-03  0.193887190916836004E-01
+ -0.125642659983513007E-02 -0.435069635865442011E-03 -0.118150231844085991E-02
+  0.194636542898556017E-01 -0.139671819173706988E-03 -0.166666666666667011E-01
+  0.172964367943914989E-01 -0.239065653681595988E-07  0.000000000000000000E+00
+ -0.141643861658325012E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.120918365724227991E-05  0.178144555196989990E-01
+  0.000000000000000000E+00 -0.435057571499996014E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.193887668592458984E-01 -0.125672668568808997E-02
+ -0.435081212181971026E-03 -0.118196881219207990E-02  0.194640540662835010E-01
+ -0.139678894290856009E-03 -0.166666666666667011E-01  0.172963729635307005E-01
+ -0.240483845199339011E-07 -0.141687157028328993E-03 -0.121683863414542000E-05
+  0.178146601266745003E-01 -0.166666666666667011E-01 -0.435063686986125991E-03
+  0.193888428213916005E-01 -0.125667757246277997E-02 -0.435075097036248006E-03
+ -0.118183173664951000E-02  0.194639441484454993E-01 -0.139679133300664996E-03
+ -0.166666666666667011E-01  0.172963909863181990E-01 -0.238879039025341995E-07
+ -0.141683556731170989E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.120749743554915009E-05  0.178141338836764994E-01
+  0.000000000000000000E+00 -0.435044658394895018E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.193878886191347008E-01 -0.125607319657829999E-02
+ -0.435094123787185003E-03 -0.118150841007670995E-02  0.194635600909605008E-01
+ -0.139671709913231998E-03 -0.166666666666667011E-01  0.172964345130826001E-01
+ -0.239825539017364991E-07 -0.141645500444457010E-03 -0.121359829368120991E-05
+  0.178147062946855013E-01 -0.434952586391279994E-03 -0.166666666666667011E-01
+  0.193531521576786013E-01 -0.122237980995592004E-02 -0.435186153854376021E-03
+ -0.114985437868289999E-02  0.194307215405399997E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.173112910777010014E-01 -0.253865275619669011E-04
+ -0.166666666666667011E-01 -0.138259364869725991E-03  0.178692219193366010E-01
+ -0.166666666666667011E-01  0.193634106886295017E-01 -0.124536794787297005E-02
+ -0.118108829653989990E-02  0.194595580858667010E-01 -0.166666666666667011E-01
+ -0.435067679897617021E-03  0.193880971906136984E-01 -0.125581995532966003E-02
+ -0.435071104218809021E-03 -0.118092995363368010E-02  0.194630638101403984E-01
+ -0.139730294231680001E-03 -0.166666666666667011E-01  0.172965125754464995E-01
+ -0.246135293977140997E-07 -0.141982756437381995E-03 -0.124443577379881005E-05
+  0.178147384420490988E-01 -0.141918111126178997E-03  0.178147321937730009E-01
+ -0.123840965646412996E-05 -0.166666666666667011E-01 -0.139719091240698002E-03
+ -0.244908832673502001E-07  0.172965124365098005E-01 -0.166666666666667011E-01
+ -0.434974852046012982E-03  0.193565226857273995E-01 -0.122539133752111993E-02
+ -0.435163903370489979E-03 -0.115249079254224011E-02  0.194335197239201007E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.173108110555817013E-01
+ -0.252486224725561010E-04 -0.166666666666667011E-01 -0.137635748516305002E-03
+  0.178684051256846992E-01 -0.166666666666667011E-01 -0.140103968384359013E-03
+  0.173357792453362990E-01 -0.263227891208892990E-07 -0.143405883131451987E-03
+ -0.122767822040640990E-05  0.178147220760749986E-01 -0.166666666666667011E-01
+  0.193641798143363991E-01 -0.124570774851647990E-02 -0.118110177764830008E-02
+  0.194596602740228983E-01 -0.140117598304397007E-03 -0.166666666666667011E-01
+  0.173359378633641988E-01 -0.264605455567292014E-07 -0.143470065501732002E-03
+ -0.123349702618485005E-05  0.178147283869347015E-01 -0.139708750660181013E-03
+ -0.166666666666667011E-01  0.172963799080418992E-01 -0.951229877766877080E-08
+ -0.141326952140999990E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01  0.000000000000000000E+00 -0.428620398176468988E-06
+  0.177495879055481996E-01 -0.166666666666667011E-01 -0.435139436933837007E-03
+  0.188854634743449989E-01 -0.626119914025915028E-03 -0.434999328204409004E-03
+ -0.676888990402598964E-03  0.188324542389332987E-01 -0.166666666666667011E-01
+ -0.139445446199876991E-03  0.174062644632392988E-01 -0.190291660309238009E-07
+ -0.139963236529218007E-03 -0.765751518464875966E-06  0.178142630110634005E-01
+ -0.532897137108844952E-03  0.187657880254245013E-01 -0.512857774623407003E-03
+ -0.166666666666667011E-01 -0.274572252963939997E-03 -0.183877623767562004E-03
+  0.177330837430943015E-01 -0.166666666666667011E-01 -0.142934406314033008E-03
+  0.173279990759568001E-01 -0.109939898040709003E-06 -0.158611602798506006E-03
+ -0.505927466613655039E-05  0.178189142396055003E-01 -0.166666666666667011E-01
+  0.191837858462441009E-01 -0.116715939594670996E-02 -0.166666666666667011E-01
+ -0.115261553028868000E-02  0.193075010029012985E-01 -0.166666666666667011E-01
+  0.192715504745154011E-01 -0.120392394815805009E-02 -0.117898318409425003E-02
+  0.194477863480717016E-01 -0.166666666666667011E-01 -0.435541725864183982E-03
+  0.194766613131175000E-01 -0.120362800929753994E-02 -0.434596343992153973E-03
+ -0.125584159683252996E-02  0.193897158068569997E-01 -0.435053772712346986E-03
+ -0.166666666666667011E-01  0.193916866769025011E-01 -0.125980517264371998E-02
+ -0.435085010639267014E-03 -0.118504095747396995E-02  0.194671524138403003E-01
+ -0.166666666666667011E-01 -0.139670776270514011E-03  0.172964393783540983E-01
+ -0.239722661929319005E-07 -0.141640063068849004E-03 -0.121308200634946994E-05
+  0.178147059781512997E-01 -0.166666666666667011E-01 -0.435069330990369015E-03
+  0.193887306635063990E-01 -0.125643445843000995E-02 -0.435069453135356987E-03
+ -0.118150726985585992E-02  0.194636605867599989E-01 -0.140346027478982998E-03
+  0.178144315994209004E-01 -0.933532505083072018E-06 -0.166666666666667011E-01
+ -0.140345973895093990E-03 -0.933513695241501967E-06  0.178144303098549985E-01
+ -0.166666666666667011E-01 -0.139688634737237009E-03  0.172964363725436991E-01
+ -0.241646449186064015E-07 -0.141743168517025991E-03 -0.122254522732879002E-05
+  0.178147146809153993E-01 -0.435059870304747008E-03 -0.166666666666667011E-01
+  0.193910369723532990E-01 -0.125956508720873007E-02  0.000000000000000000E+00
+ -0.435078913533033980E-03  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.118493531739844002E-02  0.194660938594058003E-01
+  0.000000000000000000E+00 -0.435045710134425011E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.193875426850105004E-01 -0.125569403179517989E-02
+ -0.435093072209260005E-03 -0.118112079243948005E-02  0.194631751773721995E-01
+ -0.139419883587548995E-03 -0.166666666666667011E-01  0.172964832718376996E-01
+ -0.213161991804850005E-07 -0.140230234433833000E-03 -0.108242532544913993E-05
+  0.178145871219505987E-01 -0.166666666666667011E-01 -0.435070648695012982E-03
+  0.194635173293556986E-01 -0.118139703587940995E-02 -0.435068135425710023E-03
+ -0.125625484210198000E-02  0.193885626975772996E-01 -0.139434378884568000E-03
+ -0.166666666666667011E-01  0.172964444169705009E-01 -0.849109393347174922E-08
+ -0.140068540591421997E-03  0.000000000000000000E+00  0.000000000000000000E+00
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.383811762342743974E-06
+  0.177500579574621994E-01 -0.140208513821624988E-03 -0.166666666666667011E-01
+  0.177497183043298992E-01 -0.504600292032754029E-07 -0.140451252065581990E-03
+ -0.520324484157912038E-06  0.178140160927338989E-01 -0.435048849877068026E-03
+ -0.166666666666667011E-01  0.193915158396935003E-01 -0.125973026964695009E-02
+ -0.435089932909550007E-03 -0.118503856986224010E-02  0.194671300235720011E-01
+ -0.435388199753534012E-03 -0.166666666666667011E-01  0.194580790950902999E-01
+ -0.118263497490352998E-02 -0.434750258397998009E-03 -0.124178531210191010E-02
+  0.193754724111636009E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.173081480358036992E-01 -0.244895773689180992E-04 -0.166666666666667011E-01
+ -0.133922698934496999E-03  0.178635891343980995E-01 -0.166666666666667011E-01
+  0.193849898137051016E-01 -0.125483132565947994E-02 -0.118151827368068998E-02
+  0.194632328928430985E-01 -0.166666666666667011E-01 -0.142602728611646007E-03
+  0.172960572372683995E-01 -0.117224517813049999E-06 -0.158587369049941992E-03
+ -0.567482454330607033E-05  0.178189579824768012E-01 -0.139720580884228004E-03
+ -0.166666666666667011E-01  0.177036794314212008E-01 -0.306504126126849030E-07
+ -0.139932852514272990E-03 -0.499375406920812032E-06  0.178139944442497998E-01
+ -0.546160778925799039E-03  0.184421024561692987E-01 -0.894165395041286943E-04
+ -0.166666666666667011E-01 -0.147442848396934996E-03 -0.240246210175965007E-04
+  0.177137736588654990E-01 -0.166666666666667011E-01 -0.400413710403472989E-03
+  0.186079235147468998E-01 -0.750380380434690947E-03 -0.465646679664998009E-03
+ -0.925379800643236976E-03  0.191586075383828984E-01 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.177029162142689003E-01 -0.193968097316642006E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00 -0.245143979549885995E-03
+  0.179475102930649992E-01 -0.166666666666667011E-01 -0.140401212229714989E-03
+  0.173460971401947009E-01 -0.290712390925843005E-07 -0.144639789824673013E-03
+ -0.132223121416698996E-05  0.178148169771682983E-01 -0.166666666666667011E-01
+  0.193485878399233015E-01 -0.123880031430749992E-02 -0.118081271609363995E-02
+  0.194575794108941998E-01 -0.139755721134129007E-03 -0.166666666666667011E-01
+  0.173348680988957005E-01 -0.228988835753840990E-07 -0.141707074016602990E-03
+ -0.107451126571121009E-05  0.178145682418863992E-01 -0.139419954077844995E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.172964916755905009E-01
+ -0.212265636585413010E-07 -0.140190986545845990E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.107696962013746004E-05  0.178142558877735016E-01
+ -0.166666666666667011E-01 -0.435069846225575999E-03  0.194634066063343984E-01
+ -0.118126932115139994E-02 -0.435068937899507017E-03 -0.125616762178684996E-02
+  0.193884733114783013E-01 -0.166666666666667011E-01 -0.139670406363162002E-03
+  0.172964705688706984E-01 -0.239661812368390999E-07 -0.141637605266820997E-03
+ -0.121271180256684004E-05  0.178147053610641995E-01 -0.142072630916558992E-03
+  0.178147467538722001E-01 -0.125303449493682997E-05 -0.166666666666667011E-01
+ -0.139745783560114997E-03 -0.247866380946762004E-07  0.172964714736013984E-01
+ -0.166666666666667011E-01 -0.434919264262457973E-03  0.193344334821136006E-01
+ -0.120384249982520997E-02 -0.435219446910729000E-03 -0.113217480938375002E-02
+  0.194124575348414985E-01 -0.166666666666667011E-01  0.173153477755207987E-01
+ -0.270609262604330985E-04 -0.166666666666667011E-01 -0.148526207309566006E-03
+  0.178819709504901003E-01 -0.166666666666667011E-01  0.173381030673696000E-01
+ -0.222893784193113984E-07 -0.104900286329099006E-05  0.178148086104071984E-01
+ -0.166666666666667011E-01 -0.435070197675724009E-03  0.194629909526723016E-01
+ -0.118086398896642989E-02 -0.435068586447951984E-03 -0.125574648914054000E-02
+  0.193880551117544993E-01 -0.435351553302722974E-03 -0.166666666666667011E-01
+  0.194221659534600001E-01 -0.114701564680454995E-02 -0.434786972534015011E-03
+ -0.120710615390273008E-02  0.193420430171151007E-01 -0.166666666666667011E-01
+ -0.139670121825768003E-03  0.172964394653271000E-01 -0.239652296206953015E-07
+ -0.141636284668589994E-03 -0.121273580815533010E-05  0.178147056312705013E-01
+ -0.166666666666667011E-01 -0.435070564583170979E-03  0.194632895210796984E-01
+ -0.118117290840524997E-02 -0.435068219538200975E-03 -0.125602935372259991E-02
+  0.193883427513572015E-01 -0.140356103029070988E-03  0.178144326340144016E-01
+ -0.934844351950365017E-06 -0.166666666666667011E-01 -0.140500373954539995E-03
+ -0.935645690836052991E-06  0.178144577216279992E-01 -0.166666666666667011E-01
+ -0.139419788136554014E-03  0.172964863338696991E-01 -0.213147023989074007E-07
+ -0.140223522691785012E-03 -0.108230807367247995E-05  0.178145859435818003E-01
+ -0.435214369892329993E-03 -0.166666666666667011E-01  0.194296915933112983E-01
+ -0.115819918280740002E-02  0.000000000000000000E+00 -0.434924344707075003E-03
+  0.000000000000000000E+00  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.121631545945458997E-02  0.193618156735865000E-01  0.000000000000000000E+00
+ -0.435418791966529978E-03 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.194737697407992016E-01 -0.119862135208015992E-02 -0.434719602197164011E-03
+ -0.125646773106410990E-02  0.193899154748590000E-01 -0.139674312472573998E-03
+ -0.166666666666667011E-01  0.172964142795966995E-01 -0.239964322444068015E-07
+ -0.141660280806240994E-03 -0.121419135139913995E-05  0.178146576430187990E-01
+ -0.166666666666667011E-01 -0.435064741241858997E-03  0.193888829829566983E-01
+ -0.125669444198150005E-02 -0.435074042815188978E-03 -0.118183195684604006E-02
+  0.194639537546182004E-01 -0.139703753692154993E-03 -0.166666666666667011E-01
+  0.172963763508955995E-01 -0.954521092502657923E-08 -0.141306390167940008E-03
+  0.000000000000000000E+00  0.000000000000000000E+00  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.430379165314042026E-06  0.177499308188320999E-01
+ -0.139758462452603012E-03 -0.166666666666667011E-01  0.177495927166552991E-01
+ -0.464544378539779993E-07 -0.139910115331338996E-03 -0.481441843847050964E-06
+  0.178139831899020995E-01 -0.435407966296089978E-03 -0.166666666666667011E-01
+  0.194236847721564986E-01 -0.114950911636884008E-02 -0.434730445651113973E-03
+ -0.120708640519090991E-02  0.193421924466846001E-01 -0.416148250027581024E-03
+ -0.166666666666667011E-01  0.185549551170714998E-01 -0.503294505756778955E-03
+ -0.452623601339539015E-03 -0.518655138222936054E-03  0.187784353169859995E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.182536262402052986E-01
+ -0.562462921602506017E-03 -0.166666666666667011E-01 -0.776619709920083967E-03
+  0.186715804513362012E-01 -0.166666666666667011E-01  0.173100319756059016E-01
+ -0.233382192352905001E-07 -0.116274420326912994E-05  0.178149543965509007E-01
+ -0.166666666666667011E-01 -0.143574443571915002E-03  0.173387404346211985E-01
+ -0.394234709446026970E-04 -0.556824893407507947E-03 -0.414251929427215991E-03
+  0.187007563880755012E-01 -0.143222946652751988E-03 -0.166666666666667011E-01
+  0.174281634389273007E-01 -0.707336094321511021E-07 -0.153074744284785989E-03
+ -0.259053351753128992E-05  0.178159320930474999E-01 -0.143343235945712010E-03
+  0.178139865856733991E-01 -0.995973199358266933E-06 -0.166666666666667011E-01
+ -0.140432036984107007E-03 -0.263668001789773996E-07  0.174281413140941997E-01
+ -0.166666666666667011E-01 -0.434113176607574020E-03  0.193474145478290012E-01
+ -0.123277679909749010E-02 -0.436022648061583991E-03 -0.117159827098749008E-02
+  0.194498157758568983E-01 -0.166666666666667011E-01  0.000000000000000000E+00
+  0.172976663726301001E-01 -0.749716013957179010E-08 -0.166666666666667011E-01
+  0.000000000000000000E+00 -0.328919335683087000E-06  0.177355871461097984E-01
+ -0.166666666666667011E-01 -0.140992307333851993E-03  0.177352147833274992E-01
+ -0.465109349848994022E-07 -0.141543817822521994E-03 -0.593601105003849030E-06
+  0.178140918400187011E-01 -0.166666666666667011E-01  0.193681748787879991E-01
+ -0.124747135029532009E-02 -0.118117041740436004E-02  0.194601894329670989E-01
+ -0.402827943802412983E-03 -0.166666666666667011E-01  0.186951349558098986E-01
+ -0.851875272769232967E-03 -0.463811391972161985E-03 -0.104547015467971994E-02
+  0.192724136708169007E-01 -0.139676914109847992E-03 -0.166666666666667011E-01
+  0.000000000000000000E+00  0.172965234506337016E-01 -0.258479543060504998E-07
+ -0.166751637452680000E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+ -0.152816050945488003E-05  0.178205642139422984E-01 -0.166666666666667011E-01
+ -0.435069391172627010E-03  0.000000000000000000E+00  0.193884418587671990E-01
+ -0.125613776993248002E-02 -0.435069392953110973E-03 -0.118121619381530009E-02
+  0.194633634747354983E-01 -0.166666666666667011E-01 -0.139670761673773989E-03
+  0.172964743993378987E-01 -0.239698260758715003E-07 -0.141639618752670998E-03
+ -0.121288305970079009E-05  0.178147057814278011E-01 -0.141639617499855004E-03
+  0.178147057813345007E-01 -0.121288294541192010E-05 -0.166666666666667011E-01
+ -0.139670761456658004E-03 -0.239698237501548993E-07  0.172964743993366983E-01
+ -0.166666666666667011E-01 -0.435069391184759991E-03  0.193887118629383001E-01
+ -0.125641385905375007E-02 -0.435069392940977992E-03 -0.118148619757772006E-02
+  0.194636395637332003E-01 -0.166666666666667011E-01  0.172965077542778997E-01
+ -0.927568134856068057E-07 -0.166666666666667011E-01 -0.102046181376419003E-05
+  0.173051339147867016E-01 -0.166666666666667011E-01  0.173041376239571991E-01
+ -0.235410465543548011E-07 -0.117464369127414998E-05  0.178146677204709010E-01
+ -0.166666666666667011E-01 -0.435069390787829998E-03  0.193887086342375004E-01
+ -0.125641056657932004E-02 -0.435069393337907985E-03 -0.118148298373717004E-02
+  0.194636362741715009E-01 -0.435064997112617009E-03 -0.166666666666667011E-01
+  0.193885568522766984E-01 -0.125634409504329003E-02  0.000000000000000000E+00
+ -0.435073786951775026E-03 -0.118148100464545011E-02  0.194636167070038994E-01
+ -0.166666666666667011E-01 -0.139670744016726994E-03  0.000000000000000000E+00
+  0.172964393844361013E-01 -0.239719196971529010E-07 -0.141639876847867010E-03
+ -0.121306495872900994E-05  0.178147059624979009E-01 -0.166666666666667011E-01
+ -0.435069390950542994E-03  0.193887087721348016E-01 -0.125641070333806998E-02
+ -0.435069393175194989E-03 -0.118148311476701999E-02  0.194636364105120994E-01
+ -0.140348088180879989E-03  0.178144318602812997E-01 -0.933801415168445956E-06
+ -0.166666666666667011E-01 -0.140348088084880012E-03 -0.933801076426891949E-06
+  0.178144318368827007E-01 -0.166666666666667011E-01 -0.139670745303743007E-03
+  0.172964393843559987E-01 -0.239719335227778987E-07 -0.141639884277225002E-03
+ -0.121306563867630006E-05  0.178147059631548997E-01 -0.435069390608568011E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01  0.193887086845648017E-01
+ -0.125641066869512999E-02 -0.435069393517170026E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.118148310483521006E-02  0.194636363135432984E-01
+ -0.435069389993829011E-03 -0.166666666666667011E-01  0.193887084930986983E-01
+ -0.125641043857266008E-02 -0.435069394131910002E-03 -0.118148287005216007E-02
+  0.194636361542637991E-01 -0.139670502006070010E-03 -0.166666666666667011E-01
+  0.172964394138492988E-01 -0.239693181051815999E-07 -0.141638479654202995E-03
+ -0.121293697133386993E-05  0.178147058357341001E-01 -0.166666666666667011E-01
+ -0.435069390724191016E-03  0.193887813829540998E-01 -0.125648495565761011E-02
+ -0.435069393401547020E-03 -0.118155573441382005E-02  0.194637106641438011E-01
+ -0.139699943939322992E-03  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.172964151706326008E-01 -0.947752965014964063E-08 -0.141286304262072990E-03
+  0.000000000000000000E+00 -0.166666666666667011E-01 -0.427064219430228011E-06
+  0.177495997096570012E-01 -0.140178146812241995E-03 -0.166666666666667011E-01
+  0.177492252668765005E-01 -0.496801880864810970E-07 -0.140415043122611998E-03
+ -0.517779586424547950E-06  0.178140139541973011E-01 -0.435069391018619994E-03
+ -0.166666666666667011E-01  0.193887087741046009E-01 -0.125641070402467006E-02
+ -0.435069393107117989E-03 -0.118148311448062994E-02  0.194636364104064999E-01
+ -0.139670745278655002E-03 -0.166666666666667011E-01  0.172964393844560992E-01
+ -0.239719332530519000E-07 -0.141639884131573011E-03 -0.121306562524490010E-05
+  0.178147059631618004E-01  0.000000000000000000E+00 -0.166666666666667011E-01
+  0.193887976612396006E-01 -0.125650164849242007E-02  0.000000000000000000E+00
+ -0.166666666666667011E-01 -0.118157208094086998E-02  0.194637273068684995E-01
+ -0.166666666666667011E-01  0.172964393844281007E-01 -0.239719133479159011E-07
+ -0.121306464654854992E-05  0.178147059621872987E-01 -0.166666666666667011E-01
+ -0.139670744021023010E-03  0.172964393843868004E-01 -0.239719197464163008E-07
+ -0.141639876873170012E-03 -0.121306496126475003E-05  0.178147059625002983E-01
+ -0.435069390942508026E-03 -0.166666666666667011E-01  0.193887087711050003E-01
+ -0.125641070244318989E-02 -0.435069393183230011E-03 -0.118148311400550999E-02
+  0.194636364097085998E-01 -0.140348087386128002E-03  0.178144318601935990E-01
+ -0.933801313750166004E-06 -0.166666666666667011E-01 -0.140348087289432998E-03
+ -0.933800972552974967E-06  0.178144318366253995E-01 -0.166666666666667011E-01
+ -0.435069391191810016E-03  0.193887815133764012E-01 -0.125648507876439009E-02
+ -0.435069392933928997E-03 -0.118155584777763990E-02  0.194637107833691009E-01
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.172964151566877000E-01
+ -0.947849244181166918E-08 -0.166666666666667011E-01 -0.427106377285012985E-06
+  0.177495993062723005E-01 -0.166666666666667011E-01 -0.140178145569364992E-03
+  0.177492248215362983E-01 -0.496797136596528977E-07 -0.140415043288561988E-03
+ -0.517779487315898039E-06  0.178140139541107002E-01 -0.166666666666667011E-01
+  0.193887977549708004E-01 -0.125650168627734998E-02 -0.118157208932236990E-02
+  0.194637273908664990E-01 -0.435069390319006992E-03 -0.166666666666667011E-01
+  0.193887087496051012E-01 -0.125641069305206009E-02 -0.435069393806730991E-03
+ -0.118148311376660995E-02  0.194636364069746999E-01 -0.139670541873138993E-03
+ -0.166666666666667011E-01  0.000000000000000000E+00  0.172964393705303011E-01
+ -0.239412684749743987E-07 -0.141637881918781994E-03  0.000000000000000000E+00
+ -0.166666666666667011E-01  0.000000000000000000E+00 -0.121130856871430998E-05
+  0.178146131135742992E-01
+  0.186264861206935986E-17  0.188469719401502014E-16  0.000000000000000000E+00
+  0.552421818745575958E-18  0.917870006576351024E-18  0.914518973837806942E-17
+ -0.462479989200847985E-18 -0.203714864886006990E-16 -0.195088628411609992E-16
+  0.361312491563162037E-20  0.221189708273807991E-17 -0.223400286947376007E-18
+ -0.740148597608108019E-18  0.715802397433987990E-18  0.210593204619582009E-19
+ -0.247092745603879988E-17  0.234522247271034021E-18  0.359989002583078020E-20
+  0.130125436521942010E-18  0.523865972357081977E-17  0.000000000000000000E+00
+  0.502798757490152989E-17 -0.774865740148233950E-17  0.305533254404098983E-16
+  0.000000000000000000E+00  0.162630325872825993E-17 -0.590890184004599994E-17
+  0.202691596146165005E-15 -0.691124674850884999E-15 -0.677626357803439970E-20
+ -0.638832248819193008E-17 -0.354921197842210034E-17  0.180212876973253014E-17
+  0.592118381778119037E-17  0.987885261959830012E-19 -0.448662764248762018E-18
+  0.394717353420504035E-17  0.000000000000000000E+00 -0.120292707493299994E-16
+  0.852284551527277001E-17  0.000000000000000000E+00  0.303397988982417995E-17
+  0.101048383629478005E-18  0.740148804403260964E-18 -0.135433289076572003E-16
+  0.223533152833267012E-19 -0.447899111107253025E-17  0.212817027997642990E-19
+  0.000000000000000000E+00 -0.403081803774639981E-18 -0.672555409776245036E-17
+  0.000000000000000000E+00  0.491279109407494004E-18 -0.205371608391277999E-16
+  0.173167415736669007E-16  0.462479989200847985E-18 -0.704731412115577978E-18
+  0.000000000000000000E+00 -0.474338450462407997E-18 -0.612574227454310005E-17
+ -0.184314369322535985E-17  0.765717784317887969E-17  0.648342840630083007E-17
+ -0.304990590835039002E-18 -0.592118946587380969E-17  0.339088701131534005E-19
+ -0.112557736932424994E-16 -0.381335904017217022E-18  0.000000000000000000E+00
+ -0.669901779474492986E-17  0.138243891116723001E-17 -0.462599103209056006E-18
+  0.620975527371875958E-25  0.182632162877623988E-17  0.000000000000000000E+00
+  0.430176265969362029E-19 -0.395986452044138980E-18 -0.425221003356408028E-17
+  0.763810107027342937E-17  0.449986253228846961E-21  0.149920187933444999E-16
+  0.933333695763632040E-17 -0.449986253228846961E-21 -0.102263638350942002E-18
+  0.716941713442839936E-17 -0.185604230015463005E-18  0.148029860142325988E-16
+  0.765722441914230953E-17  0.000000000000000000E+00  0.395087089926695984E-18
+ -0.112484787433542003E-16  0.148029860142325988E-16  0.186249855764197994E-18
+ -0.123980477187407996E-17 -0.177475529531162003E-18  0.000000000000000000E+00
+  0.468189851533179025E-17  0.846067343136115030E-19  0.251873187799946985E-17
+  0.000000000000000000E+00  0.427222242771388022E-18 -0.126919416816583993E-16
+  0.000000000000000000E+00 -0.639166659846272950E-25 -0.577339967784366969E-17
+  0.000000000000000000E+00 -0.299518791083001013E-18  0.885930391776569904E-18
+ -0.415350546620027017E-18  0.777912411780388954E-17  0.148029595444530002E-16
+ -0.870855748895828018E-19  0.370461440010243996E-17  0.000000000000000000E+00
+  0.925637604759498948E-17 -0.157717534778751000E-16  0.313749473992465019E-16
+  0.575982404132923990E-19  0.219550939928314995E-17 -0.590890184004599994E-17
+  0.200929767615876011E-15 -0.698361724352226037E-15 -0.762329652528870004E-20
+  0.762499059118321011E-17  0.420595399031252993E-17 -0.953539688950861913E-18
+  0.740148701005684010E-18  0.731844738233841023E-17  0.151625515003419992E-18
+  0.117147568281026001E-16  0.000000000000000000E+00  0.912148605074475063E-18
+  0.159581007262709991E-17  0.449986253228846961E-21 -0.130862557845303000E-24
+  0.171921983655087999E-17  0.000000000000000000E+00  0.158448927876369990E-18
+  0.945956303890129087E-18  0.984593083121869034E-18 -0.108044346378205993E-16
+ -0.148029595444530002E-16  0.326160624252108976E-18 -0.372472316079360021E-17
+  0.148029595444530002E-16 -0.106167109608854004E-16  0.550232602536393962E-17
+  0.123056946577105000E-16  0.677626357803439970E-20  0.603087458445062034E-18
+ -0.292734586571086005E-17  0.670850094225405965E-18 -0.124971241037898994E-16
+  0.000000000000000000E+00  0.128342432167972003E-16  0.179743617411750996E-16
+ -0.477944389246467971E-18  0.592118946587380969E-17 -0.141023146889066998E-19
+ -0.776697175539328012E-17 -0.238961064843469015E-19  0.000000000000000000E+00
+  0.212532690638741003E-16 -0.147509485406731005E-17  0.000000000000000000E+00
+  0.635022970263750003E-17 -0.216767304995182992E-18  0.592118946264263003E-17
+  0.350644635857248989E-19 -0.606692755904139014E-18 -0.229492316153697017E-18
+  0.119207909163859998E-16 -0.462599103209056006E-18  0.119335121574955004E-17
+ -0.283978943926022000E-18  0.000000000000000000E+00  0.526661030130147969E-18
+  0.834631058858251966E-18  0.572340905439509016E-18 -0.148029860142325988E-16
+ -0.952426877165385946E-17  0.592118381778119037E-17 -0.152714247567623012E-19
+ -0.136949449939554005E-17  0.000000000000000000E+00 -0.453732450462848006E-18
+ -0.649938968340065021E-17 -0.212605269760829000E-18  0.148061359180052007E-17
+  0.173938215718671006E-16 -0.149755425074559992E-16 -0.176182853028893989E-18
+  0.000000000000000000E+00  0.314858028362183973E-19  0.222314384918731995E-17
+ -0.224993126614423998E-21  0.577572400366616003E-25  0.196242353951357996E-16
+  0.592118381778119037E-17  0.846496107036753039E-18 -0.256018086451873006E-19
+  0.809438002004071989E-18 -0.177860077526130999E-16  0.000000000000000000E+00
+ -0.222683638344174993E-18  0.123060118814754002E-16  0.148029595444530002E-16
+  0.169406589450859996E-18 -0.813151629364127964E-19 -0.921571846612678961E-18
+  0.000000000000000000E+00  0.831362837730095975E-18  0.000000000000000000E+00
+ -0.542101086242751976E-19 -0.159835117146885992E-17  0.000000000000000000E+00
+  0.726754268744189980E-18 -0.753520509877426044E-17  0.777915058758348969E-17
+  0.000000000000000000E+00 -0.513301966036106005E-18 -0.420128341838133016E-18
+  0.608169656128588029E-17  0.578099986501059981E-19 -0.201217970576179000E-17
+ -0.369654608040760013E-19 -0.148029595444530002E-16  0.144305243943295996E-16
+ -0.705346343352828992E-19  0.000000000000000000E+00 -0.147612593664461005E-18
+  0.183565822566964011E-16  0.278007338868100017E-18 -0.662135360669951994E-17
+  0.462585868319254965E-18  0.382512344841314970E-17 -0.174270411452671991E-18
+  0.000000000000000000E+00 -0.413352078260099028E-17  0.111698234754425002E-16
+  0.240451477901814990E-18  0.000000000000000000E+00 -0.117367664498414002E-16
+  0.000000000000000000E+00 -0.428175154837049014E-18 -0.174319380544935010E-17
+  0.575982404132923990E-19 -0.271728169479180011E-17  0.735145188877927989E-18
+  0.530165449030040969E-17  0.000000000000000000E+00 -0.422661168873770002E-18
+  0.147489611940655008E-17  0.542185789537478032E-17  0.000000000000000000E+00
+  0.160646151193882004E-16 -0.132814766129474004E-17  0.148027477862161989E-16
+  0.314846080772212007E-17 -0.793361300897053979E-19  0.000000000000000000E+00
+ -0.798196203889169946E-19 -0.312772209729571009E-17  0.473491417515153986E-18
+  0.994130806456849964E-17 -0.462585868319254965E-18  0.140167012111641990E-16
+  0.389635155736977975E-19 -0.575982404132923990E-19  0.106920968931909993E-16
+ -0.542101086242751976E-19 -0.921571846612678961E-18 -0.460785923306338999E-18
+  0.693889390390722992E-17  0.590890184004599994E-17 -0.176182853028893989E-18
+ -0.708119543904594929E-17  0.000000000000000000E+00 -0.790175860669855044E-18
+ -0.677626357803440031E-19 -0.899887803162968988E-17  0.000000000000000000E+00
+  0.282909004382935976E-18 -0.319839640883224001E-17 -0.189735380184963011E-18
+ -0.921571846612678961E-18  0.940111280245706959E-17 -0.136473551414918999E-17
+ -0.462585868319254965E-18  0.947011739499720023E-25 -0.774842994750098042E-17
+ -0.592118381778119037E-17  0.380929522460731986E-18 -0.412849204146454994E-18
+ -0.790655128793651072E-17  0.574799928767763978E-17  0.449986253228846961E-21
+ -0.184669492435164995E-16 -0.668551742520389010E-17  0.000000000000000000E+00
+  0.757247454845344985E-18 -0.758433300971500986E-17 -0.956088439213291989E-19
+ -0.739967982721356930E-17  0.674366604445490992E-17  0.000000000000000000E+00
+ -0.129066645337873990E-18  0.567512074660380996E-17  0.740306795900258034E-17
+  0.289685267960971005E-18 -0.853809210832335031E-18 -0.346944695195360995E-17
+  0.000000000000000000E+00 -0.440457132572235996E-19  0.140946282423116000E-17
+  0.126038502551440006E-17 -0.460785923306338999E-18 -0.334096264176697982E-17
+ -0.121481465295212001E-16 -0.179994501291539010E-20  0.348945878064791996E-18
+  0.402866943610530006E-18  0.000000000000000000E+00  0.102462200732453003E-16
+  0.324395420824920984E-19 -0.824904211507282051E-17  0.260886147754324986E-18
+ -0.169406589450859993E-20 -0.300220240242445980E-18 -0.155398517387845997E-16
+ -0.462612338098855988E-18 -0.113502414932076005E-18  0.853279815240300958E-17
+  0.118584612615602002E-19  0.000000000000000000E+00 -0.420128341838133016E-18
+ -0.590890184004599994E-17 -0.325260651745650993E-18 -0.414707330975704984E-17
+ -0.739967982721356930E-17 -0.609863722023095955E-18 -0.775204553327135978E-17
+ -0.110046520507278993E-16 -0.596311194867026976E-17  0.506525702458071988E-18
+  0.406575814682063982E-19 -0.585807986321074037E-17  0.000000000000000000E+00
+ -0.957507219399941999E-17 -0.993410828451683924E-18  0.000000000000000000E+00
+ -0.258493941422821001E-23 -0.140905965640059988E-16  0.000000000000000000E+00
+ -0.455999856282107988E-19  0.130766005188302993E-17  0.851098705401120981E-17
+  0.412593057330019984E-17  0.913207396258542929E-21 -0.199420672541224012E-17
+  0.107126374421619007E-16  0.169406589450859993E-20 -0.169813165265542009E-16
+  0.111740586401786993E-16 -0.220546203641338017E-18 -0.580217568869195973E-19
+  0.745639133001020026E-17  0.000000000000000000E+00  0.633051248954183005E-18
+ -0.106607566741425994E-16  0.580217568869195973E-19 -0.175505226671090992E-17
+ -0.127986678330124993E-17 -0.686052019522905026E-18  0.000000000000000000E+00
+  0.469395219121791008E-18 -0.219520603079348977E-18  0.373882580183111030E-17
+  0.462599103209056006E-18  0.597412337698458029E-17 -0.221075599233372003E-18
+  0.000000000000000000E+00  0.234092940533119000E-17 -0.456210270350426032E-18
+  0.000000000000000000E+00  0.247588805817228019E-18  0.115207598134018006E-16
+  0.293496916223615019E-18 -0.516984574123192008E-17  0.462585868319254965E-18
+ -0.979837125471933953E-17  0.542101086242752024E-18  0.000000000000000000E+00
+ -0.111998538839297005E-16 -0.886573158126109934E-17  0.107314690359872997E-18
+  0.000000000000000000E+00  0.132318325413044006E-16  0.000000000000000000E+00
+ -0.490891988880818995E-18  0.300838917864701994E-17 -0.449986253228846961E-21
+  0.153224066374812011E-16  0.752300914782277012E-18 -0.202894665677824002E-16
+ -0.592118381778119037E-17 -0.607071160275117030E-18  0.706332833781481025E-18
+  0.183276753962148989E-16  0.740306795900258034E-17 -0.673708830422389036E-18
+  0.996110745971057006E-18  0.000000000000000000E+00  0.562224322594635998E-24
+ -0.143283150371639000E-17 -0.592118381778119037E-17  0.695175821679365018E-18
+ -0.377743814046069017E-18  0.851295905259153931E-17 -0.617021150427395005E-17
+  0.000000000000000000E+00 -0.699522159489963935E-17 -0.230593470233653008E-17
+  0.899972506457693922E-21  0.136092783635348004E-16 -0.203287907341032009E-18
+ -0.235813972515596983E-17  0.000000000000000000E+00  0.151788304147971006E-17
+  0.000000000000000000E+00  0.718283939271646976E-18  0.339279047022710001E-17
+  0.000000000000000000E+00  0.740095037663445042E-19 -0.151788304147971006E-17
+  0.498732999343331963E-17 -0.592076030130756014E-17  0.124344436656931001E-17
+  0.311708124589582987E-17 -0.189735380184963011E-18 -0.460785923306338999E-18
+  0.145657373796625989E-16  0.661871544984509977E-17 -0.179994501291539010E-20
+  0.232356033872243014E-17  0.178769240009195011E-19  0.740148597608108019E-18
+ -0.485518623621675008E-18  0.132951964373823990E-18 -0.604535355389274969E-17
+ -0.332301613119702991E-18 -0.169406589450859993E-20  0.379629579047537022E-18
+ -0.785816701559762074E-17  0.000000000000000000E+00 -0.108081404069648993E-17
+ -0.867086452280546054E-17 -0.128749007982653989E-18  0.000000000000000000E+00
+  0.406575814682063982E-19 -0.596311194867026976E-17 -0.157209315010398008E-17
+  0.542101086242752024E-18  0.739967982721356930E-17  0.348977574268771980E-18
+ -0.242590236093632002E-17  0.357786716920216001E-17  0.000000000000000000E+00
+  0.508219768352580023E-19 -0.327971157176865014E-17  0.108420217248549998E-18
+  0.460785923306338999E-18  0.269695290405768997E-17  0.105729828849834003E-16
+  0.000000000000000000E+00  0.605520075134427981E-17  0.295792342423450008E-18
+  0.740148804403260964E-18  0.887040033888795054E-17 -0.196630757771205002E-18
+ -0.695752862874681990E-17 -0.545383338913363016E-18  0.000000000000000000E+00
+  0.539586457180591013E-18 -0.226178972740580010E-17  0.000000000000000000E+00
+  0.101643953670515996E-19 -0.219550939928314995E-17  0.157209315010398008E-17
+ -0.740306795900258034E-17  0.156701095242045996E-18  0.000000000000000000E+00
+  0.182959116606928994E-18  0.556924162819702011E-17  0.000000000000000000E+00
+  0.492126142354748016E-18  0.129542647455343004E-16 -0.267003768769613992E-18
+  0.000000000000000000E+00  0.511303497676177987E-18 -0.249632582995209015E-17
+ -0.104854969288279003E-17  0.219719850209398022E-24 -0.877420254237048056E-17
+ -0.186868703054100004E-17  0.000000000000000000E+00  0.310192729707385011E-23
+ -0.646582600286570010E-17  0.000000000000000000E+00 -0.831029897533542997E-18
+  0.106143816202805005E-18 -0.664497347120999001E-18 -0.396502740054638028E-17
+  0.913207396258542929E-21  0.321382829034003006E-17  0.242124367972641983E-17
+ -0.190582413132217990E-20 -0.144015989742860999E-18 -0.112475097000963994E-16
+  0.329502123734093982E-18  0.148029595444530002E-16 -0.150205510589462992E-16
+  0.000000000000000000E+00  0.901573514533289918E-18  0.229584989725403987E-17
+  0.000000000000000000E+00 -0.629603722527569038E-18  0.415692006776889032E-17
+ -0.259039880627106002E-18  0.000000000000000000E+00  0.526221725872965983E-18
+  0.153314307621524007E-16 -0.121059189334008992E-17  0.219719850209398022E-24
+  0.518485626953725989E-17  0.124986814033672995E-17 -0.148029595444530002E-16
+  0.200429726146822988E-17  0.922342050742921017E-19  0.000000000000000000E+00
+  0.356920501310487004E-18  0.132035399740042995E-16 -0.379419114896017021E-18
+  0.180196383238973012E-16  0.000000000000000000E+00  0.117761188624817996E-16
+  0.427189116772794014E-18  0.000000000000000000E+00 -0.945056279636684015E-18
+ -0.306948887770498012E-17 -0.627647260344651004E-18 -0.148029860142325988E-16
+ -0.211552721983768995E-16  0.000000000000000000E+00 -0.363690567209131027E-19
+  0.235090141659442018E-17  0.000000000000000000E+00 -0.154382013937734992E-18
+  0.104986094217475005E-17 -0.202012618175241010E-16  0.000000000000000000E+00
+ -0.201818834573138001E-18 -0.277787102030008014E-18  0.922418879559933069E-18
+ -0.740306795900258034E-17 -0.161027316020146994E-16  0.159919820441612000E-17
+  0.000000000000000000E+00  0.155950028687531998E-16 -0.217486462555504989E-20
+  0.592118919445516966E-17 -0.163450889040478004E-18  0.138365478911949995E-17
+ -0.688214269644119025E-20  0.137618501731589997E-16  0.462638807878457974E-18
+ -0.300866102864727018E-17 -0.425819344452497991E-18  0.000000000000000000E+00
+ -0.534138976538562025E-17 -0.230392961653169981E-18 -0.693889390390722992E-17
+  0.000000000000000000E+00 -0.124683249835832991E-17 -0.593600689435813986E-17
+  0.203287907341032009E-18 -0.146240238343454992E-17 -0.462585868319254965E-18
+  0.804681299891585021E-20 -0.921571846612678961E-18  0.758941520739853008E-18
+ -0.596311194867026976E-17  0.103338019565025002E-18 -0.121972744404619008E-18
+ -0.641119237776780004E-17  0.578099986501059981E-19  0.904419429430779033E-17
+ -0.317770699320046013E-17  0.462599103209056006E-18  0.723480113396294003E-25
+  0.177058483496122985E-16 -0.592123675734038991E-17  0.353544910182120027E-20
+ -0.227547188927349024E-18 -0.648947871549377031E-17  0.181164310453569009E-17
+ -0.449986253228846961E-21 -0.153640401864531989E-16 -0.669340050167796020E-17
+ -0.449986253228846961E-21  0.501443504774546032E-17 -0.146028480106641009E-16
+ -0.454644934438745996E-18  0.580217568869195973E-19  0.102203789509092007E-17
+  0.000000000000000000E+00 -0.394823232538910986E-18  0.152398167869993994E-16
+  0.000000000000000000E+00  0.485858098545066983E-17  0.339523416855172998E-18
+ -0.196963000034115994E-18  0.000000000000000000E+00  0.163869376255981004E-16
+ -0.375984607678321967E-20  0.602139840335321016E-17  0.000000000000000000E+00
+  0.374388562686400991E-18 -0.187956610995729014E-17  0.000000000000000000E+00
+  0.117614743347384008E-23 -0.156049686373756991E-16  0.592118381778119037E-17
+ -0.111927793918019994E-17 -0.125932623433033007E-17  0.107700239243383999E-17
+ -0.680955925205089026E-17  0.000000000000000000E+00 -0.474338450462408009E-19
+ -0.198459819541683001E-16  0.148031713026898014E-16 -0.589365524699542022E-17
+ -0.100559751498030998E-16  0.119262238973404995E-16  0.000000000000000000E+00
+ -0.406575814682063982E-19 -0.298155597433513989E-17  0.291379333855478980E-18
+ -0.143173979074393986E-16 -0.580217568869195973E-19  0.793500464987829020E-17
+  0.509551198266546019E-17 -0.207274917893538006E-19  0.592118946587380969E-17
+ -0.249908860027332019E-18  0.726475167665756951E-17  0.143720452004045010E-18
+  0.219719850209398022E-24 -0.713783873983173068E-17  0.273122128532314001E-17
+ -0.148029860142325988E-16  0.616308195352442029E-17 -0.103685693916237999E-18
+  0.000000000000000000E+00  0.172732113104808003E-18 -0.119086404205086997E-17
+  0.148812751953918994E-18 -0.973629116658004042E-17  0.462585868319254965E-18
+ -0.128415437610183997E-16  0.364164300122516006E-18  0.581611368201348043E-25
+  0.965757993893369943E-18 -0.318378260621606992E-17  0.523869464800819009E-18
+  0.000000000000000000E+00 -0.125934272098304005E-16 -0.592118381778119037E-17
+  0.170794585972679995E-18 -0.638088177192808014E-18  0.148029860142325988E-16
+  0.636134305731703948E-19 -0.115664869328127008E-16  0.103770143886900998E-17
+ -0.740148701005684010E-18  0.109459983278530000E-16  0.860412658285831038E-19
+  0.453890545720096961E-18 -0.462479989200847985E-18  0.574764794271245981E-18
+ -0.417079023228017025E-17 -0.359989002583078020E-20 -0.123844851232456994E-24
+ -0.997743062832839024E-17 -0.592118381778119037E-17 -0.350289482720280986E-18
+  0.671115955244159048E-18 -0.462718217217262968E-18  0.366619682373303022E-17
+  0.148029595444530002E-16 -0.269938812378104984E-18 -0.623165778889683993E-17
+ -0.148029595444530002E-16 -0.428259858131774020E-17  0.948338087745914968E-17
+  0.300036010576418020E-16  0.000000000000000000E+00  0.409286320113278001E-17
+ -0.590890184004599994E-17  0.201607393973679993E-15 -0.699147770927278027E-15
+  0.762329652528870004E-20  0.813490442543030046E-17 -0.681799066366914999E-17
+ -0.607060631336584989E-19 -0.592118946264263003E-17  0.138582841111243012E-18
+ -0.269407654562161003E-17  0.361013453120626983E-18  0.232644547280538988E-24
+ -0.191654297740858989E-16 -0.185276296712765011E-18  0.000000000000000000E+00
+  0.658091044686770012E-17  0.383052929495380021E-18  0.000000000000000000E+00
+ -0.177559921692520007E-18 -0.413370558966372027E-17 -0.446111077151846004E-19
+ -0.468735550811414966E-17  0.000000000000000000E+00 -0.165755308426161985E-16
+  0.248649031945594005E-18 -0.219719850209398022E-24 -0.274940472223534988E-18
+ -0.658549193252238029E-19  0.554334983005379011E-18  0.000000000000000000E+00
+  0.614026536820119009E-17 -0.592118381778119037E-17  0.286291090607452007E-18
+ -0.122457478756202001E-16  0.148029860142325988E-16 -0.552535673821147045E-18
+  0.768758534050744985E-19  0.492023165039092978E-18  0.592118381778119037E-17
+  0.874500688343341996E-18  0.104618365722086995E-19  0.111289438122280006E-16
+ -0.462585868319254965E-18  0.884185636357436999E-17  0.155484871948446003E-18
+ -0.148029595444530002E-16 -0.592115734800158020E-17 -0.959692361744608029E-20
+  0.592118946587380969E-17 -0.191465157017479006E-18 -0.592118381778119037E-17
+  0.220854337068372988E-18  0.888178896156158063E-17  0.000000000000000000E+00
+ -0.806069459784106008E-17  0.127742485722792993E-18  0.219719850209398022E-24
+ -0.296056543911099002E-17  0.296059190889059018E-17  0.719732306674833017E-19
+  0.000000000000000000E+00 -0.120712129288665994E-16  0.000000000000000000E+00
+  0.154282173561696008E-19  0.296059190889059018E-17  0.449986253228846961E-21
+ -0.296056543911099002E-17 -0.166744391367418999E-17  0.868868912763262986E-17
+  0.000000000000000000E+00 -0.440416500555818969E-18 -0.236804406269326013E-18
+  0.139891281099502011E-16  0.000000000000000000E+00  0.569938098259822986E-17
+  0.131207118545926006E-16 -0.449986253228846961E-21  0.720490237761527062E-17
+  0.609070955769230024E-18  0.000000000000000000E+00 -0.856757678897806949E-17
+  0.148001282823888015E-19  0.148369108142005008E-16 -0.160937199231215010E-18
+  0.000000000000000000E+00  0.250599984904598978E-18 -0.593031173923045011E-17
+  0.000000000000000000E+00 -0.747366643183191062E-17 -0.539419458477777017E-18
+ -0.728712515589093926E-17  0.000000000000000000E+00 -0.489944721671871044E-18
+ -0.740148688080987035E-18  0.126086815672215012E-18  0.800808902944986061E-17
+  0.000000000000000000E+00 -0.401836344027630001E-18 -0.161952699515021994E-17
+ -0.221719344273285996E-16  0.591906623541305000E-17 -0.113502414932076005E-18
+  0.769783542464707994E-17 -0.243945488809238016E-18  0.000000000000000000E+00
+ -0.169496586701505991E-16  0.197941011861488998E-18  0.000000000000000000E+00
+ -0.302670152115197982E-25  0.196007899393148981E-17  0.000000000000000000E+00
+  0.268184722574811015E-18 -0.507787081806380982E-18 -0.568046702169790025E-17
+  0.335786576739177007E-17  0.000000000000000000E+00  0.116899459592193998E-16
+  0.128688313346714001E-16  0.000000000000000000E+00 -0.259192081859816000E-18
+ -0.276302147394353002E-17  0.527066251428988001E-18  0.740306795900258034E-17
+ -0.150450521486901014E-16 -0.592118381778119037E-17  0.143254447204384002E-18
+ -0.742678488152571000E-17 -0.740306795900258034E-17  0.813151629364127964E-19
+ -0.108420217248549998E-18 -0.135525271560687996E-17 -0.592076030130756014E-17
+ -0.118245799436700000E-17 -0.325260651745650993E-18  0.962229428080884993E-18
+  0.000000000000000000E+00  0.177156676220440991E-16  0.103507426154475994E-16
+ -0.179994501291539010E-20  0.233949856058605991E-17  0.612251172066077035E-18
+ -0.740148597608108019E-18 -0.667834855456528034E-17 -0.108665062709865995E-18
+  0.525176309165426997E-17 -0.475608999883290026E-18  0.190582413132217990E-20
+ -0.340083728322602019E-18  0.438821300192851001E-17 -0.462585868319254965E-18
+ -0.132984172718925009E-18  0.124611252035315994E-16 -0.315096256378600015E-18
+  0.740306795900258034E-17 -0.191090632900570002E-17  0.590890184004599994E-17
+ -0.216840434497100983E-18  0.314418630020796016E-17 -0.739967982721356930E-17
+ -0.860585474410368952E-18 -0.244623115167042015E-17  0.140133130793750986E-16
+ -0.591906623541305000E-17 -0.271050543121376012E-18 -0.395733792957208964E-17
+  0.256142763249699999E-17  0.000000000000000000E+00  0.588772601636464012E-17
+  0.292279306361937011E-18 -0.462585868319254965E-18  0.305022850878928992E-23
+  0.125181543436834992E-17  0.592123675734038991E-17 -0.840797659796876002E-18
+ -0.119751929896036996E-17 -0.803410750470703968E-18  0.990969653027917929E-17
+  0.000000000000000000E+00 -0.615012094155625980E-17 -0.804469541654772036E-18
+ -0.190582413132217990E-20 -0.342201310690737000E-18  0.120498907076396997E-16
+  0.361683068477586004E-18  0.000000000000000000E+00  0.190463629996254992E-16
+  0.000000000000000000E+00 -0.736389268519206972E-18 -0.564293349460814963E-17
+  0.000000000000000000E+00  0.467562186884373980E-18 -0.188380127469356010E-17
+  0.650521303491303046E-18  0.000000000000000000E+00  0.298155597433514008E-18
+  0.254787510534094004E-17  0.474338450462407997E-18  0.000000000000000000E+00
+  0.253944448053778995E-17 -0.988826262624670062E-17  0.179994501291539010E-20
+  0.443980138421951982E-17  0.118181258669405998E-18  0.740148597608108019E-18
+  0.269978517047506999E-17  0.191455088678460996E-18 -0.363737123374678000E-17
+ -0.190370654895404001E-18  0.000000000000000000E+00  0.452156775156186007E-18
+  0.744150207898425038E-17  0.000000000000000000E+00 -0.131290106824417010E-18
+ -0.699437456195238967E-17 -0.154159996400282999E-18 -0.740306795900258034E-17
+  0.120617491689012002E-17  0.000000000000000000E+00 -0.704731412115577978E-18
+  0.135525271560687996E-17 -0.739967982721356930E-17 -0.304931861011547978E-18
+ -0.133796794952696997E-17  0.309348476086989009E-16  0.000000000000000000E+00
+ -0.444083492377669035E-18  0.396729056670233018E-18 -0.410980386007786987E-17
+  0.000000000000000000E+00 -0.829202903714597009E-17  0.612574227454310005E-17
+  0.000000000000000000E+00  0.876900950248488035E-17 -0.586046090260397985E-19
+  0.000000000000000000E+00  0.543265756545227034E-17  0.562482816536058969E-20
+  0.418476627590987005E-17 -0.160089227031063005E-18  0.304931861011548002E-19
+ -0.397258452262267003E-18  0.144321708717923994E-16  0.000000000000000000E+00
+  0.203287907341032004E-17 -0.487890977618476995E-17 -0.101643953670516005E-18
+  0.184314369322535985E-17 -0.609863722023096013E-17  0.000000000000000000E+00
+  0.298968749062877971E-16 -0.674661742488049989E-17  0.000000000000000000E+00
+ -0.346859991900635989E-18 -0.512412489595360027E-25 -0.229649779221929994E-16
+  0.592118381778119037E-17  0.968136764274826967E-18  0.248448055279030997E-19
+ -0.107229265609408996E-16  0.148029595444530002E-16  0.106183896722397993E-16
+  0.555835236088826021E-17  0.000000000000000000E+00 -0.102162426050442001E-16
+  0.105082980687594002E-17  0.000000000000000000E+00  0.760580847189921965E-17
+  0.559763864169073973E-19 -0.733749295744850035E-18  0.839903522791146024E-19
+  0.219719850209398022E-24  0.117210647846692996E-18 -0.125995396489508994E-16
+  0.462585868319254965E-18  0.128018813561245007E-16  0.318308925156784023E-18
+ -0.818753745432010944E-17  0.000000000000000000E+00 -0.817699696977523956E-19
+  0.740148681618638981E-18 -0.599551467812316950E-19  0.108906402847091006E-16
+ -0.462599103209056006E-18 -0.494820159361191037E-18 -0.232935714856157988E-18
+ -0.685910313144217059E-19  0.143819535639848992E-17 -0.400925499014078976E-17
+ -0.410781759263196998E-17 -0.735211518423296988E-18 -0.686166170447438018E-18
+  0.184201211014737999E-17  0.565444743514457976E-18 -0.496970112021859027E-17
+ -0.300019069917472988E-17  0.677626357803439982E-18 -0.639762992444437982E-18
+  0.112363387228590996E-17 -0.391203242024195008E-17  0.452220302627229964E-17
+  0.251521139731243992E-17 -0.321237245246193018E-17 -0.204498708472292984E-17
+  0.573610877316734989E-18  0.449273430335979002E-18 -0.296543215624495000E-17
+ -0.273389396303369975E-19  0.644925013057074959E-18  0.106550458191936003E-17
+  0.459232167923234995E-18  0.257365667067299011E-17 -0.726396926719566976E-18
+ -0.219646231134880988E-17  0.282697246146122979E-18 -0.721963238636283036E-18
+ -0.174012786050891999E-17  0.322031338634244021E-18 -0.812092838180059969E-19
+ -0.176127328530276993E-18  0.192190328165929010E-17  0.695940524306275957E-18
+  0.238706623117694998E-17 -0.975821659906356921E-18 -0.341196120810363013E-17
+ -0.296800344717906993E-17 -0.310014058695074020E-18  0.106340354316346995E-17
+  0.127478458561772008E-17  0.185161402269790016E-17  0.246253653590506004E-17
+  0.468239813242176996E-17  0.147044919643347001E-17 -0.303020954682541977E-15
+  0.120558517020060009E-15  0.514724981387492984E-16 -0.395303500220003983E-15
+ -0.937401198330966936E-15 -0.695346287059999988E-16  0.360751332235607007E-16
+ -0.841069835305629941E-16 -0.465122731996280971E-16  0.138223865591080994E-17
+  0.316557388212613009E-17  0.153530511954130000E-17 -0.114158865466198001E-17
+  0.251918186425269985E-17 -0.826679341101821022E-18 -0.242299109877044003E-17
+ -0.276451039904612007E-17 -0.219837847523779012E-17  0.189542150793871004E-17
+ -0.349685640873116977E-17 -0.322931311140702004E-19  0.296090954624581016E-17
+  0.262580213648832990E-17 -0.138913403349705005E-17  0.200852687617675992E-18
+  0.140967458246796992E-17 -0.665692623106138030E-18  0.296959494267761993E-18
+ -0.141501031100923995E-17  0.514096059424157008E-18  0.474857258142600987E-17
+  0.234265490408897985E-17  0.238700305525767010E-17  0.123478461273650007E-17
+ -0.777840529785158018E-18 -0.393563436465992003E-18  0.400450542246108993E-19
+ -0.231993556138459993E-18 -0.105675301103853995E-17  0.725377840204900971E-18
+  0.508905335644263999E-17 -0.168930133418030005E-17  0.151288025313498993E-17
+ -0.167222832633720003E-18  0.128336079420867007E-17  0.258071334847867999E-17
+  0.101915268911433000E-18  0.208333047333116009E-17  0.371393755278652995E-18
+ -0.927165655505069004E-18 -0.352746209139563026E-17  0.397578736595447006E-17
+ -0.321124748682885999E-17  0.311549305911972006E-18 -0.439610099624981972E-18
+ -0.860585474410368952E-18  0.169745402629762000E-17  0.229715335295365982E-17
+  0.248350060134961013E-17 -0.577305893113008979E-18  0.141623908780918997E-17
+ -0.112485975395371008E-17 -0.291379333855478980E-18  0.351052804989545007E-17
+ -0.548877349820787005E-18  0.332036915323686022E-18  0.230392961653169981E-18
+ -0.317806761809812986E-17 -0.157886941368202007E-17  0.170761842166467006E-17
+  0.134170018845081000E-17 -0.340422541501502998E-17 -0.157209315010398008E-17
+ -0.237169225231204018E-17  0.648827237596793963E-18 -0.152211820621598002E-17
+ -0.254906293670056003E-17 -0.265743104157052001E-17 -0.576081633090791036E-18
+  0.295951293498425985E-17  0.491017591086957017E-18  0.570295000582212027E-17
+  0.425298666891756003E-17  0.322490243264190984E-17  0.170575850282616993E-17
+  0.316149982129022996E-18  0.316149982129022996E-18  0.170575850282616993E-17
+ -0.104501833606318004E-18  0.278429414238003988E-19  0.201993100848688014E-17
+  0.689958412277429955E-18  0.253745521456217996E-17  0.317289218488110982E-17
+  0.416205158728287997E-19 -0.223087532941556984E-17  0.148681372554603001E-17
+  0.224561764616170982E-18 -0.161663087507475004E-17  0.252552248152889988E-19
+  0.181779337333578008E-17  0.362974215988189973E-17  0.105511449044334006E-18
+  0.811011020335495983E-18  0.212430842451632013E-17  0.291664890900878997E-18
+ -0.253106873399842011E-17  0.740117494324606029E-18 -0.573664210352185016E-17
+ -0.213444055721414999E-17  0.191567131261783994E-18 -0.154648946414654008E-17
+  0.911779475726123009E-19  0.386130223460529005E-18 -0.437674703786760990E-19
+  0.153868974389064991E-17  0.341944482530724993E-19  0.120185332147335992E-17
+  0.429533419052512003E-18 -0.521260203663573975E-17 -0.244979246689270001E-17
+  0.353401443498422013E-17  0.108347743827260008E-17 -0.297276516321888987E-17
+ -0.502982861212076016E-18  0.717817057257475013E-18 -0.404118796699791005E-18
+ -0.824222669082385976E-18 -0.496792572966965000E-18 -0.145300107449145002E-17
+  0.429727421330391006E-18  0.674705684382426013E-18  0.516237685549988988E-18
+  0.152970157584615000E-17  0.114320869044779997E-17  0.120185320191991991E-17
+ -0.132446040544864991E-17  0.212862970992680982E-17 -0.506722571443859031E-18
+ -0.328233673284015989E-17  0.174357265416989999E-19 -0.250028140764131988E-17
+  0.304621564884264023E-18  0.287174051018836004E-18 -0.360316566105648980E-18
+  0.246878009516860991E-17  0.307227700801689012E-17 -0.398859873928170002E-18
+  0.135186458381786007E-17  0.163646765409530999E-17 -0.134905216973517997E-17
+ -0.277534977379301996E-17  0.107718664691529003E-17 -0.868460233841873968E-18
+ -0.314460981668159001E-18 -0.295265097501007996E-17  0.587343884942705016E-18
+ -0.103889092135869005E-17  0.220358991989202002E-17  0.172080450780937999E-17
+  0.203458402189975985E-17 -0.188128398314380995E-17  0.435593310570423997E-20
+  0.150997560841400997E-17  0.338780091677218015E-19 -0.474444329580814977E-18
+  0.134286485875329006E-17  0.260568510399103997E-18  0.163964402764750996E-17
+  0.391340595364909024E-18 -0.555124217806786988E-18 -0.134424128729257002E-17
+  0.120085428439502995E-18 -0.232039433643183985E-17  0.311023860107363000E-17
+  0.161539357362928995E-17 -0.227030637899263991E-17 -0.123412700414952002E-17
+  0.460785923306338999E-18 -0.167034897198548008E-17 -0.772738893357237991E-18
+  0.655603501174828000E-17  0.355923244436257002E-17  0.966041076343529922E-18
+  0.461463549664142979E-17 -0.813151629364127964E-19 -0.300879655391883993E-15
+  0.116570897662623010E-15  0.517977587904949990E-16 -0.397920832027020018E-15
+ -0.938173692378863056E-15 -0.685385179600290032E-16  0.332324906525752010E-16
+ -0.811796376648520952E-16 -0.500833641052523013E-16 -0.327264414061500016E-17
+ -0.249197093082215006E-17  0.355081340008800993E-17  0.577464711790618967E-18
+  0.309632893868808988E-17  0.237164602067062017E-17 -0.236675814580756013E-17
+  0.129894498034675006E-18 -0.177843283691745018E-17  0.905876559778580015E-18
+  0.334760614285670004E-18  0.702903342961835980E-18  0.194244010831745001E-17
+ -0.311708124589582987E-17 -0.384044738285100005E-17 -0.259964999424184991E-17
+ -0.230437174456910998E-17 -0.203085578963201996E-17 -0.186536838192342997E-17
+ -0.292541894847392008E-17  0.444950377659624030E-18  0.473120840600730037E-18
+  0.144694403214716009E-17 -0.387457071246265019E-18 -0.268733332021511007E-17
+  0.480630192996640042E-18 -0.968667995173845007E-18 -0.225453915329464011E-17
+ -0.172670305520930993E-17  0.397513637481238987E-18 -0.223802472500957004E-17
+ -0.103900998689928001E-17  0.125363275017413008E-17  0.283438399974970010E-18
+  0.289612476067066011E-17  0.115254714341709003E-17 -0.905774764864446938E-18
+  0.102320256539338992E-17  0.130652185136016004E-17 -0.227171558456370018E-18
+ -0.315659132105927017E-17 -0.140885526007124000E-17  0.456135927492873002E-17
+ -0.136525167485142000E-17 -0.181712389987688988E-17  0.477260714130435988E-17
+ -0.161968912255027995E-17 -0.847032947254299978E-19 -0.283247817561837995E-17
+ -0.347622321553165013E-17  0.308735568340312003E-17  0.609016689075842040E-18
+ -0.308828212568918019E-17 -0.265629532258948987E-16  0.869394617061814022E-17
+  0.707441917546791951E-17 -0.104625509644850993E-16 -0.644592072860523040E-18
+  0.182366193543851017E-17  0.220253050832250004E-17  0.181095644122968989E-17
+  0.465952824284591002E-17 -0.179909797996813007E-17 -0.443167638003450014E-17
+ -0.110114283143058996E-18 -0.271050543121376012E-18 -0.184653182501437012E-17
+ -0.338094772539718024E-18  0.535365328665338963E-18 -0.193755392891210995E-18
+  0.238648297443423004E-17  0.185387667186595989E-18 -0.583221945298563973E-17
+  0.392947092559352014E-17  0.206476279333813002E-17  0.168586566858662003E-17
+  0.805104553428405950E-19  0.805104553428405950E-19  0.168586566858662003E-17
+  0.710517439930086985E-19 -0.850266110884137972E-18  0.661505608924907992E-18
+ -0.318501041740767019E-17 -0.399340393980533022E-17  0.329464601164237984E-17
+  0.105759985244622995E-17 -0.263066876232583997E-17 -0.462818734786794995E-17
+  0.282065931092569004E-17  0.273173264807079987E-18 -0.191766477016721988E-17
+ -0.325548795749949012E-18 -0.339916505461695014E-17  0.329796223641775006E-19
+  0.184871977884447985E-17  0.476420507970624038E-18 -0.416450631384511008E-17
+  0.953001031626930087E-18  0.133898346897670997E-17 -0.128899552551295991E-17
+ -0.313369156609463019E-18  0.327541076635822000E-17  0.831022609569495005E-18
+ -0.292570682126152987E-17 -0.143599266409235012E-18  0.746109032496585016E-18
+  0.369159449897113977E-18  0.213209126763412006E-18  0.120185446207787991E-17
+  0.429056143033479976E-18  0.100975456627066995E-17 -0.214856782078001993E-17
+  0.104065643190709998E-17 -0.985474136165822018E-19 -0.219472349794965002E-17
+ -0.121077973535169001E-17 -0.306481117342981015E-17 -0.729646695292633014E-18
+ -0.417870234395862012E-17 -0.982270248064390015E-19 -0.222740183391410013E-17
+  0.429074284915790001E-18 -0.757393922655926042E-18 -0.222277203054574005E-17
+ -0.366790309232547969E-17 -0.513331296435003975E-18  0.120185407756813993E-17
+ -0.768679095514467055E-19 -0.849541118088088029E-18  0.468620978068442048E-18
+ -0.329866393396347012E-17 -0.130866590350788998E-17 -0.196412382089491007E-17
+  0.201593841446522999E-18 -0.493820208249257001E-18  0.525838053655470036E-17
+ -0.339024937138534000E-18 -0.388951895548529013E-17 -0.206676039130048994E-18
+  0.863973606199385961E-18 -0.153736479926655993E-18 -0.531777872198089969E-17
+ -0.115969894699322004E-17  0.228376918290164009E-17  0.239277618139722024E-18
+ -0.675919057019131015E-18  0.872979948708863904E-18  0.700124022566006054E-19
+  0.160436471636099012E-18 -0.430442219081631023E-17  0.141636781779202008E-17
+ -0.264584619614596010E-17 -0.214590875462231992E-17  0.136936648480856995E-18
+ -0.261616426743056012E-17 -0.140853348681295003E-18  0.109188833473735006E-17
+  0.822558327290080971E-18  0.590700594208202994E-17  0.211440599458354992E-17
+  0.103192880386794999E-17  0.259586853807156981E-18  0.285297240247498991E-17
+  0.138834510998783011E-19  0.738126141010415962E-18 -0.284059369630692984E-17
+ -0.500243348423792998E-17 -0.177634224131680013E-17 -0.282456371151746988E-17
+ -0.224463731022390023E-18 -0.116551733542191996E-17  0.295162620673421996E-17
+  0.304931861011547978E-18  0.155854062294790993E-17  0.683820286230236957E-18
+  0.203340846900235013E-17 -0.972340883888733010E-18  0.813151629364127964E-19
+  0.219550939928314995E-17  0.636214387616585012E-18 -0.135525271560688006E-18
+ -0.958841296291867984E-18 -0.337270652495431019E-17  0.881761298091727038E-18
+ -0.346436475427008983E-18  0.455327854752469024E-17 -0.188188717875612010E-17
+  0.140212904092025990E-18  0.250969594776295999E-17 -0.298155597433514008E-18
+ -0.317806761809812986E-17  0.271050543121375993E-17  0.335425047112703011E-17
+ -0.105879118406788005E-17  0.387941089842470018E-17 -0.111130722679763993E-17
+  0.620875150337401956E-18  0.674555863369643047E-18  0.227798923252203012E-17
+  0.182620303428026999E-17 -0.209725357740164995E-17 -0.209725357740164995E-17
+  0.182620303428026999E-17 -0.414781446358590017E-19  0.219658225253763007E-17
+ -0.306111089692803988E-17  0.206912281912994004E-17 -0.255938945946767010E-17
+  0.304542693212857005E-17 -0.388794043270335027E-17 -0.294961296034002983E-17
+ -0.640841332232825959E-18 -0.176859620984863995E-17 -0.351940186256116005E-18
+ -0.669305072706347014E-18  0.115719589845962996E-17  0.249115036765450015E-17
+ -0.267263072927983979E-18 -0.327855516196318992E-17 -0.658166933893129994E-20
+  0.288553457408329992E-17 -0.288969575785353004E-17  0.633241671747305985E-19
+  0.180772237182975992E-17  0.283017902710578996E-17 -0.565930133097904035E-17
+ -0.921226912297244036E-19  0.248537333825643019E-17  0.337764313885002993E-18
+  0.278072479341339001E-18  0.197859530774522996E-17  0.566495635123675990E-17
+  0.179570984817912009E-18 -0.119601052152306996E-17 -0.127224348677596007E-17
+ -0.247841840366608010E-17 -0.225030184305866019E-17  0.475991984506901981E-18
+  0.205724781425613012E-18 -0.942906488971646074E-18 -0.319654352426012007E-17
+ -0.119902807639766995E-17 -0.333492753201779013E-18  0.201170324972895992E-20
+ -0.324868899007545985E-17  0.537018888559226030E-18  0.262834323533009010E-17
+  0.974087889342445029E-19 -0.954182615081968947E-18  0.422838847269347037E-17
+  0.232256434137128999E-17  0.579741112836364996E-18 -0.817757371014823969E-18
+  0.861889628043636043E-19  0.258000547574478990E-17 -0.228948704303652010E-18
+ -0.986816337813258049E-18 -0.235678976639628984E-17 -0.976768781707729945E-18
+ -0.542140790912154955E-18  0.304632752502048986E-17  0.286085377935140017E-17
+  0.137515798986736007E-17  0.341100167859307016E-17 -0.366794382918673990E-17
+  0.388735183230519980E-18 -0.792068449911377045E-18  0.321449003483007017E-17
+ -0.494074318133433003E-17 -0.477853637193513999E-17 -0.292649883276361008E-18
+ -0.408307537973959009E-17 -0.194192967971836990E-17 -0.100750598608959000E-18
+  0.430052854827544002E-18  0.212506008087322998E-18 -0.402770783864319995E-18
+ -0.447233396150270964E-18 -0.198459819541682993E-17 -0.236970701884191007E-19
+ -0.220994204661097016E-17  0.296419179891642018E-17 -0.212986434587094013E-17
+ -0.359565486109450012E-18 -0.322052514457926014E-17 -0.491003823699637011E-17
+  0.156785798536770993E-17  0.182249726513603012E-17  0.101220437196889006E-17
+  0.126123205846164993E-17 -0.939359538505019076E-18  0.159242194083808006E-18
+ -0.338384368472172993E-17 -0.302051948990884002E-17 -0.450578514547435023E-18
+ -0.156574040299956994E-17  0.863973606199385985E-19  0.408269880576573004E-18
+ -0.350142244571246017E-18  0.166018457661843011E-18  0.184314369322535985E-17
+ -0.124683249835832991E-17  0.245300741524844993E-17 -0.271050543121375988E-19
+ -0.525838053655470036E-17  0.500596471827291982E-18 -0.289852854753073998E-17
+ -0.358234790318279013E-17 -0.745680161159402964E-18  0.395130447418413017E-17
+ -0.782221691899545996E-18 -0.116890546721092989E-18  0.241870258088465000E-17
+  0.352365706057788989E-18  0.645439105807776955E-18  0.299849663328021983E-18
+  0.216671027907649988E-17 -0.233781093442186990E-18 -0.794834541879753984E-18
+ -0.102999206386122998E-17 -0.467562186884373980E-18  0.241234983378025006E-17
+ -0.226581313390524994E-17  0.247037159066717002E-17  0.853809210832335031E-18
+  0.853809210832335031E-18  0.250594697445184986E-17  0.696049324406221015E-18
+  0.310955224796036990E-17  0.556959317995735977E-19 -0.495739267270380026E-18
+ -0.152187997819956003E-18 -0.313020110589274015E-17  0.360892352575412997E-19
+  0.274798235917439008E-17  0.219805416308431992E-17  0.128987869551562998E-17
+ -0.181271491937153990E-17  0.479946121031822013E-17  0.521859986993336979E-17
+ -0.162858689760435992E-17 -0.410148944122608997E-18 -0.129790139506878007E-17
+  0.180246481185638019E-17  0.252833916722396005E-17  0.184847156355081005E-17
+  0.234299073457090997E-17  0.381342421941949979E-17 -0.210785886003519001E-17
+ -0.186922460423510001E-18  0.283411363932078009E-17  0.245599891393374988E-17
+  0.115727546289480009E-17 -0.267567233540009983E-17 -0.230406371026381001E-18
+  0.847032947254299978E-19 -0.199476259078387999E-17 -0.149691897603515996E-17
+  0.169237182861408996E-17  0.202949094162130015E-17  0.258247110728035017E-17
+ -0.483304261121869975E-18 -0.267057163278154011E-17 -0.105757357420620006E-17
+  0.236854234853944018E-17  0.219443737320928007E-17 -0.769317674343717994E-18
+  0.107361426064483001E-18 -0.208888912704750988E-17 -0.317531476101955994E-17
+ -0.239075049362525982E-18  0.460785923306338999E-18  0.297573262282276003E-18
+ -0.931736241979730072E-20  0.330131091192363981E-17 -0.233781093442186990E-18
+  0.243924312985556986E-17  0.508219768352580011E-18  0.596311194867026957E-18
+ -0.164154985177883001E-17 -0.290601122335189004E-17  0.116551733542191996E-17
+  0.240557357020220988E-17 -0.636968776335234048E-18 -0.350774872303727035E-17
+ -0.278843246236116018E-17 -0.640356908124250960E-18  0.105032085459533001E-18
+ -0.916951328718438058E-18  0.397946666531910978E-18 -0.123324688397775994E-17
+  0.324032453972133018E-17 -0.997275416273532054E-18  0.182323841896488004E-18
+ -0.127499634385454002E-17 -0.769560981766082995E-18 -0.242439037817415004E-17
+  0.214918066753528012E-19 -0.279227223476463018E-17 -0.537361858320706021E-18
+ -0.899122578378295995E-19  0.277170356165288005E-17  0.464597571568984025E-18
+ -0.136671413017441992E-17  0.274447939333253981E-17 -0.310972264716655005E-17
+ -0.120358087848916004E-18 -0.688214269644119025E-21  0.293847640803337985E-18
+ -0.641415699308319028E-18 -0.881761298091727038E-18  0.296628622022741023E-18
+ -0.105601516593214997E-17 -0.284306608745905986E-17 -0.338283783309686005E-18
+  0.266548033611127006E-17  0.116345931005788996E-17 -0.166613820907728004E-18
+  0.215419523463735996E-17  0.626804380968182010E-19  0.639509875176997045E-18
+ -0.297890899637496982E-17  0.119222803137702003E-17  0.431986803099692980E-18
+  0.235475159336694994E-17  0.261563774112127983E-17  0.307684718090124983E-17
+  0.745388993583784009E-18  0.363483013490502018E-17  0.542101086242752024E-18
+  0.819927892942163017E-18 -0.137908119795569001E-17  0.153143556863578002E-17
+  0.243945488809237997E-17 -0.169967748778416003E-17  0.284603070277445010E-18
+  0.346055310600744018E-17 -0.934065582584680084E-18 -0.214364186615361989E-17
+  0.121972744404618998E-17 -0.187702501111552994E-17  0.129087821161555006E-17
+ -0.375923809903299015E-17  0.745388993583784009E-18  0.496530713680471013E-17
+ -0.156912853478858993E-18  0.322888959493338981E-17 -0.216332214728747998E-17
+ -0.179570984817912009E-18 -0.179570984817912009E-18 -0.216332214728747998E-17
+  0.159764972230942001E-18 -0.124635604232549994E-17  0.583605700658213030E-18
+  0.123751513593852991E-17 -0.106874382119810999E-17  0.503334853245148010E-18
+ -0.570046969647549980E-18  0.186124902247292013E-17 -0.224104734636542000E-17
+  0.346772455512312988E-17  0.205875328297097990E-17  0.145346221537408007E-17
+  0.586146799499976027E-18  0.264697796016969010E-17 -0.372201497146810978E-18
+ -0.208443972253258984E-17  0.831015421872823993E-18 -0.985304458448664060E-18
+ -0.122467729283171000E-17 -0.128923708528025005E-17 -0.334881093141887986E-17
+ -0.263824293290113013E-17  0.237010406553594010E-18 -0.229704747383526007E-18
+  0.129257227751006001E-17  0.914372066561017023E-18 -0.293793377755153996E-17
+ -0.146488971553649996E-17  0.586824425857778967E-17 -0.383028298748394999E-17
+  0.325260651745650984E-17  0.116043513773838992E-17 -0.711507675693612044E-19
+  0.270785845325358986E-17  0.158620154263168996E-18  0.269861719145015013E-17
+  0.383335348191774008E-18 -0.514254878101766997E-18  0.233966381899399013E-18
+ -0.790076598996348969E-18  0.667991358028422990E-18 -0.269652938758407014E-17
+  0.390482188684231994E-17  0.140692172538938997E-17  0.380910716380259017E-17
+ -0.247757137071882994E-18 -0.199222149194211016E-17 -0.181773270480773007E-17
+ -0.490537955578646973E-18  0.115196480826585000E-17 -0.847198383376810945E-19
+ -0.315405916610788012E-17  0.144814344403536006E-17 -0.268858018189743015E-17
+ -0.495129841954081968E-18 -0.158074907822647006E-17  0.403686653748175000E-18
+ -0.171910438467481003E-17  0.160230685720048001E-18 -0.580566047782802009E-18
+ -0.580566047782802009E-18  0.160230685720048001E-18 -0.945765225168629963E-19
+  0.838986134255384004E-18  0.121803337815168004E-17  0.381164826264434972E-19
+ -0.338474365722817992E-17  0.347055868269689008E-17 -0.280497979692496991E-18
+ -0.317614840398065008E-17 -0.181985442307893013E-17  0.225148543511765008E-17
+  0.429147919237410989E-18  0.940467960525839941E-18 -0.454856692675558993E-18
+  0.142979161496525994E-17  0.279498538717379988E-18  0.440419909444671001E-18
+  0.234553680134310998E-17  0.186512684518457011E-19  0.991875581234786082E-18
+  0.226687192508932013E-17  0.186688708552808003E-17  0.291392237873034990E-17
+ -0.296599174392933981E-17 -0.949735692108884063E-18 -0.128833711277379005E-17
+  0.571747239396652970E-19  0.453162626781050970E-18  0.663129190387836005E-18
+  0.217093571646361010E-17 -0.296568412519281004E-17  0.396069170862700993E-18
+ -0.465832944811199039E-18  0.173788617681510005E-17  0.292773924322348985E-17
+  0.747797743527539004E-18  0.310340345257573986E-17 -0.667401578251672007E-18
+  0.168471584229874004E-17 -0.578829807955515980E-17  0.479071164337129953E-18
+ -0.370114100408684010E-18 -0.957904246464584045E-18  0.356807490403558990E-18
+ -0.988544004688977926E-18  0.260777962546842991E-17 -0.169343680949112000E-18
+ -0.331726748645320978E-18  0.152314043111593995E-17  0.754985966476637967E-18
+ -0.297055098494334995E-17 -0.629786780666742000E-18 -0.471802637821747000E-18
+  0.474004104058814025E-17 -0.183018673611032999E-17  0.454297022273105981E-17
+  0.143070440877121008E-17  0.922260060882323081E-18  0.793934569373296001E-18
+ -0.330004036250274988E-17 -0.365155903561329030E-17 -0.558321767182671991E-17
+  0.340233613449596002E-17 -0.638672768397092996E-18 -0.904571630663488944E-18
+  0.737624249173749028E-18  0.273475174932892005E-17 -0.610287238496723009E-18
+ -0.285280696635248008E-17  0.452885521275844996E-19  0.143734418754817006E-18
+  0.210147054416443989E-17  0.147160394056858999E-17 -0.253326999085539019E-17
+  0.491174926009342989E-17 -0.494642756650379982E-17  0.321644052671446983E-17
+  0.534345440819455030E-18  0.892521263499816052E-18 -0.104905030517445001E-17
+ -0.843221298991655988E-18  0.269229422284779016E-17 -0.344928028861956983E-17
+  0.700708005616119955E-18 -0.147680194353786994E-17 -0.533011198491406996E-18
+ -0.702080298252345043E-18 -0.236921071047437994E-17  0.328175636224288007E-18
+  0.405514376520035964E-17  0.228538753592071018E-17 -0.130887766174470991E-17
+  0.848968549887674039E-18  0.757247454845344985E-18  0.107234371122393999E-17
+  0.672544160119913988E-18 -0.233955793987558016E-17 -0.857197342621352039E-18
+ -0.433680868994201966E-18 -0.257498015965307014E-18 -0.151110677790167007E-17
+  0.113841228110978004E-17 -0.813151629364128037E-18  0.508219768352580023E-19
+  0.102167724434385003E-17  0.247117416265649998E-17 -0.302836777956074006E-17
+  0.121851355649726996E-17 -0.366020141865323971E-17  0.252415818281781983E-18
+  0.178808655165382994E-17 -0.542101086242752024E-18 -0.133068876013651007E-17
+ -0.149077798716756994E-17 -0.214129929065887010E-17  0.226581313390524994E-17
+ -0.329051124184614016E-17  0.433680868994201966E-18  0.429276297668479027E-17
+  0.542101086242751976E-19  0.131970380160180008E-17 -0.479420648145933992E-18
+  0.660685698858354024E-19 -0.711507675693611996E-18 -0.669083120031099994E-18
+ -0.599011112386401003E-18 -0.343234624711939003E-17 -0.195054689191077018E-17
+ -0.204929033676336999E-18 -0.454904338278842028E-17  0.175727572819745014E-17
+  0.357017653500514010E-18 -0.114005842222755001E-17 -0.367261574528644008E-18
+ -0.318721147938444995E-17 -0.278055444590598997E-18 -0.108154216643068992E-17
+  0.286835796186848008E-17  0.144473814824863007E-17  0.175732205031176005E-17
+ -0.276777279938202999E-17  0.315270956923971003E-17  0.331322231274440013E-18
+ -0.406575814682063982E-19 -0.190650738250813994E-17 -0.344001255703653014E-18
+  0.401387737880132007E-18 -0.550238940807836995E-18  0.166319913296329992E-18
+ -0.231558231661587994E-17  0.179282091988977989E-17 -0.379113418345304005E-19
+ -0.283504574423973992E-17 -0.850505037875492034E-19  0.215451811295723997E-17
+ -0.600969876076925994E-18  0.965194043396275044E-18  0.422912962652231018E-17
+  0.270344316993893994E-17 -0.545658624621219970E-17 -0.227004829864152009E-18
+  0.253432257818487008E-17  0.476667791067357997E-18 -0.109775469964156997E-17
+  0.393478567735145006E-17 -0.542101086242752024E-18 -0.286974762529756993E-17
+ -0.131324333068139999E-17 -0.230392961653169981E-18  0.265629532258949010E-17
+  0.313508069602497990E-17 -0.453162626781050970E-18 -0.273549290315775986E-17
+  0.514307817660970006E-18 -0.307181792277692011E-17 -0.460785923306338999E-18
+ -0.569206140554890020E-18 -0.328648783534668994E-17  0.174804770128381011E-17
+  0.894466792300540965E-18  0.303576608295941010E-17 -0.894466792300540965E-18
+  0.361767771772311983E-17 -0.211059806315422997E-17 -0.100288700954909006E-17
+ -0.830092288309213971E-19 -0.346436475427008993E-17  0.565288613173838978E-18
+  0.395511446808555009E-18  0.201424434857073006E-17 -0.695414049695780964E-18
+ -0.493777856601893997E-17 -0.332585501505931016E-17  0.490182939999496966E-18
+  0.203239589653500981E-17 -0.274086295969446009E-18  0.384873697335969983E-18
+ -0.622858729446303948E-19 -0.186535018394995997E-18  0.517960647246005043E-18
+  0.502290537721800005E-18 -0.168068542091994001E-17  0.219370283682532990E-17
+ -0.834135547143374013E-18  0.702852057763857018E-18  0.742186150251978975E-18
+  0.299452616633997011E-18 -0.852618070750257997E-18 -0.208606347807502990E-17
+ -0.405305265261183001E-18 -0.237848175077988005E-17 -0.266900081679829998E-17
+  0.347918783084704018E-17 -0.211758236813574988E-18 -0.154132203131700996E-17
+  0.376929661528163988E-18  0.262664916943558997E-17  0.316693613899383005E-17
+ -0.806375365786094019E-18 -0.826704156520197035E-18 -0.734112867473461040E-18
+ -0.262071993880480987E-17 -0.917283742317204066E-18 -0.243945488809238016E-18
+  0.172117094882074002E-17 -0.476614851508154026E-18 -0.271050543121375988E-19
+  0.149924831664010991E-18 -0.293611728892637004E-17 -0.138701645112892008E-17
+  0.199306852488936985E-17 -0.197300443195128015E-17 -0.137409919868329007E-17
+  0.401330662417864981E-18  0.251442911077089996E-17 -0.623416249179164953E-18
+ -0.179232171639009990E-17 -0.450060497858702980E-17  0.117250396136948002E-17
+  0.139756465830018993E-17 -0.331705050461928996E-17 -0.113723582348357996E-18
+ -0.464527261216917018E-18  0.124683311874379009E-17  0.389591589814325992E-17
+  0.189226867995338999E-17 -0.708799730321770986E-18 -0.708799730321770986E-18
+  0.189226867995338999E-17  0.730089460974003017E-18 -0.241616148204289018E-17
+ -0.642898006966013957E-18  0.193208215268705984E-17  0.114116513818835998E-17
+  0.940812894841274938E-18 -0.213640693092592999E-18  0.191601065377061008E-17
+  0.197291178772267999E-17  0.162956090277563995E-17  0.467328921951633987E-17
+ -0.169970726628621002E-17 -0.373541529739146016E-18  0.309167025747820009E-18
+ -0.140413908980875996E-19 -0.890122439723413042E-18  0.444510317573746012E-18
+  0.159521450258606000E-17  0.588714368121340047E-18  0.247807429653125996E-17
+ -0.150405258163781997E-17  0.211586183246164007E-18  0.825751244454536043E-18
+  0.100479283368041005E-18 -0.744541960636529998E-18 -0.301924894048795009E-17
+  0.654121193517132955E-18 -0.872008011489114096E-18  0.347595984938333010E-18
+  0.120877279715728994E-17  0.108739371704337996E-17  0.222961127874442003E-17
+ -0.224159359253126005E-18 -0.703638131074452959E-19  0.364852824584889013E-18
+ -0.251407909045142986E-17 -0.133273603215256995E-17  0.305124807351774998E-17
+  0.349674804807092995E-17 -0.668332363235947976E-18 -0.168391328646527988E-18
+  0.109031825667353006E-17 -0.648693735224327031E-18 -0.364916181449932026E-17
+ -0.650813065602986984E-18  0.385502589719925017E-17 -0.120598315164507008E-17
+  0.122143819895578996E-17 -0.100551719832776992E-17 -0.315327560109728015E-17
+ -0.315810940427846024E-18 -0.218116670784693986E-17  0.142069090570837004E-18
+ -0.386443752054550038E-17  0.128694000471918996E-18  0.282494506116690973E-19
+ -0.308648228407384022E-18  0.201310440079036986E-17  0.188112236626928001E-17
+  0.514747278364022024E-18  0.514747278364022024E-18  0.188112236626928001E-17
+  0.364092852397105992E-19  0.183392682345548997E-17  0.100813833416394001E-17
+ -0.180547891482710983E-17 -0.148124239527669002E-17 -0.121626326668438009E-17
+ -0.819908795219576046E-18  0.141548838806682992E-17 -0.102046261316098992E-17
+ -0.176117589589950993E-17  0.674229731042702993E-18  0.876369010356629040E-20
+  0.392250486364678998E-18  0.305615136027426994E-17  0.253348435073654018E-18
+  0.509936297130785021E-18  0.346726570183350994E-17 -0.498687128533197966E-17
+ -0.382091095733158007E-17 -0.130344191058839003E-18 -0.132234278800973008E-17
+ -0.290958106810184998E-17 -0.547838083491907977E-17 -0.148880064826396994E-17
+ -0.257873617436497015E-17 -0.166142330785849998E-18 -0.315692570397513015E-18
+ -0.292263116495110999E-17 -0.485469728861570009E-18  0.120185407756813993E-17
+  0.429089928317842021E-18  0.125501397022973005E-17  0.805596610804541017E-19
+ -0.244340208613324994E-17  0.286470517184008993E-18 -0.183891871488211999E-17
+  0.127540845256485994E-17  0.319791558987090013E-18  0.369842425732445988E-17
+ -0.145110647130905996E-17 -0.142820325539914005E-17  0.130250556227022007E-17
+  0.429164057413021993E-18  0.923727205858856957E-18 -0.206702571605096997E-17
+  0.375403241563379036E-17 -0.302615707496255018E-20  0.120185410341754004E-17
+  0.833068366285379940E-19  0.230376963355947018E-17 -0.555518782958974975E-18
+ -0.887232412834819968E-18 -0.103930280883612994E-18 -0.255683181701366009E-18
+  0.252700699284744995E-17  0.441434446465967964E-18  0.708476885929217991E-18
+ -0.800512309604317966E-18  0.294386300818232006E-17  0.223320236543596015E-17
+  0.855079760253216000E-18 -0.338161691451272986E-17 -0.735145188877927989E-18
+ -0.866514705041148958E-18  0.259976331798577006E-17  0.576172986546055986E-17
+  0.350862222576413020E-17  0.354864453252188973E-17 -0.407577010831044005E-17
+  0.449193684955051025E-18 -0.545627853502432998E-17 -0.652112798989854975E-18
+ -0.316399912411029001E-18 -0.733328286662454975E-18 -0.102626644238229001E-18
+  0.179973821776224996E-18  0.426600202950748001E-18  0.133225047713300999E-17
+ -0.149374260248296000E-17 -0.580937546874361965E-17 -0.151407139321705993E-18
+ -0.290677388387667010E-17  0.242759642683082015E-17 -0.291633443739655992E-17
+ -0.190359074366827988E-19  0.564585509653169000E-18  0.270940528099907018E-17
+  0.271364871754146018E-18  0.104165200177577996E-17  0.286929763904433993E-17
+  0.548919701468149027E-17  0.101931481651438992E-17  0.179570984817912019E-17
+ -0.677626357803439970E-20 -0.691178884959508980E-18 -0.161023610251002992E-17
+  0.125360876193635993E-18  0.285280696635248008E-17  0.387602276663567989E-17
+  0.254787510534094004E-17  0.243945488809238016E-18  0.224971950790742016E-17
+ -0.589534931288993036E-18 -0.846030404351885973E-18  0.133284687435465996E-17
+  0.132183461885973996E-17  0.130259894730511999E-17 -0.676196989704949025E-18
+  0.109267250195805009E-18 -0.239032697715164018E-17  0.211419423634673018E-17
+ -0.355330321373178992E-17  0.124683249835832991E-17 -0.117229359899994994E-17
+  0.204812566646090015E-17  0.154366460681175994E-17 -0.102999206386122998E-17
+  0.473067901041526970E-17 -0.445751088492576035E-18  0.175706396996064004E-17
+  0.234119906621088999E-17 -0.149416611895659004E-17 -0.149416611895659004E-17
+  0.234119906621088999E-17 -0.545767150717587043E-18  0.293270268735776000E-17
+  0.335791322687940991E-17 -0.198208356635465987E-17 -0.400158216617632984E-17
+  0.149475408927575000E-17 -0.266451620510562989E-18  0.215783878432394017E-17
+  0.296575751287548994E-17 -0.132635612379727001E-17  0.125805661614397001E-17
+  0.329964597375880009E-17 -0.725804251810673020E-18 -0.202273142845066994E-17
+ -0.361750742191450980E-18  0.238477798464633989E-17 -0.615240396004691044E-19
+ -0.294892886904875015E-18 -0.136428694961263993E-17 -0.460427427752502973E-18
+  0.408633405776274967E-17  0.241370248087692008E-17  0.212034179096043994E-18
+  0.920172551053097029E-18 -0.155108110818391000E-17  0.871671969365264044E-18
+ -0.135194748282488005E-18  0.579682825683748995E-18  0.307642366442761999E-17
+ -0.397258452262267013E-17  0.241912609735827984E-17  0.178893358460108000E-17
+ -0.498055372985528985E-18  0.296678583731738995E-17 -0.628909555627129947E-19
+ -0.257821443584816018E-18 -0.909587653898011082E-18  0.284076321663370992E-17
+ -0.257331256353816998E-17  0.642289202035175025E-18  0.575876525014516962E-18
+ -0.132401837567687995E-17  0.105540305227885996E-17 -0.410048649765807025E-17
+ -0.330766365902804013E-17  0.314333926726071010E-17  0.249366499671665981E-17
+ -0.134678238613434004E-17 -0.326266503370516004E-18 -0.803834266944331022E-18
+  0.584275019020739044E-18 -0.521181830722954026E-18 -0.246360856197893001E-18
+ -0.890764331878753999E-18  0.548568871229028054E-19  0.885333063976731012E-18
+ -0.171898057455870004E-18  0.173926304317850004E-17 -0.285280696635248008E-17
+  0.132475952950572997E-17 -0.372813610800100016E-17  0.343791462020794002E-17
+ -0.608794342927187998E-17  0.192763522971397014E-17 -0.638662842229741974E-18
+  0.836445035413621970E-18 -0.484502845829459987E-18 -0.647503748616709023E-18
+ -0.270095252911453979E-18  0.150473208466164007E-17 -0.401671791702481975E-17
+  0.289321267132416987E-17  0.217652064114137995E-17 -0.225518220867271997E-17
+ -0.839409650729012021E-18 -0.231409401189875007E-17 -0.246802570644996992E-18
+  0.105003309913973996E-17  0.491384988525900984E-18  0.217168659764162005E-17
+ -0.437069000783218975E-18  0.285079526310274996E-17  0.445195223120940015E-18
+  0.207681890754913995E-17  0.875990886138556925E-18 -0.940788906603511093E-18
+ -0.431139770152439017E-18  0.545235108147592974E-17  0.266137752027301012E-17
+ -0.111469866730911004E-17  0.519358251608974022E-17 -0.341298856642441984E-17
+  0.329834629660825014E-17  0.162630325872826002E-18  0.349655200626575026E-17
+ -0.140792757701426004E-17  0.634427677493470954E-18 -0.722519104007917997E-18
+ -0.278911008871896016E-16  0.584113920426565976E-17  0.682708555486965996E-17
+ -0.992044987824237019E-17  0.376929661528164012E-19 -0.102173349262550002E-18
+ -0.187074505590503005E-17  0.622484512937185011E-17  0.489839153397161963E-17
+ -0.178723951870656986E-17 -0.215485181781494006E-17 -0.132137139771670997E-18
+  0.650521303491303046E-18  0.110114283143059006E-17 -0.937384231202242061E-18
+  0.769355052567648023E-18 -0.277877270876202001E-17 -0.341527911112772982E-17
+  0.188585358734149006E-17 -0.307333598216620014E-17 -0.275261078554908998E-17
+  0.577606530023473009E-18  0.170577906601920992E-17 -0.163191393435032013E-19
+ -0.163191393435032013E-19  0.170577906601920992E-17  0.448670475468240001E-18
+ -0.291729493222043008E-17  0.269717696660611989E-17 -0.178990335531848991E-18
+  0.256607323391346977E-18  0.134101628714247995E-17  0.230979308011910995E-17
+ -0.403170793523692000E-17  0.147556225905089993E-18 -0.337953223511012990E-17
+  0.117050351108804007E-18 -0.178627074576569006E-18  0.405489664315588988E-18
+ -0.296848046543621999E-17  0.780119362087592001E-20  0.311241529209519983E-17
+  0.188868776166629986E-17  0.299491679632580009E-17  0.416857321052395034E-17
+ -0.175461764953809003E-17  0.476829251835777018E-17 -0.883650515905821941E-18
+ -0.237576416358402012E-17 -0.452084617380231971E-18 -0.281131514415595014E-17
+ -0.148642275037992002E-17  0.311120674489479000E-18  0.238552529446801001E-17
+ -0.793446617904934958E-19  0.120185555098361008E-17  0.429438776165916018E-18
+ -0.198933798085102997E-17  0.251437329702588004E-17  0.193038858216918014E-17
+ -0.381279230735173007E-18 -0.241638795747386001E-17  0.338070251827785015E-17
+ -0.883954289874912006E-18  0.230546801479983986E-17  0.280509763384942007E-17
+  0.255304094930993989E-19 -0.243909807437984001E-19  0.430172629206993001E-18
+  0.270270744496287993E-17  0.151309846511602999E-17 -0.110864972489187992E-17
+ -0.322256764930377990E-20  0.120185576747228005E-17 -0.704689046369850962E-19
+ -0.210033616755281988E-17  0.380112200160870019E-18 -0.169167342968315994E-17
+ -0.103844874485367003E-18  0.251310187995528003E-17 -0.687636018697156054E-19
+  0.690260094094115994E-18  0.443239768152865043E-18  0.344151637139014981E-17
+  0.106920187246231007E-17 -0.219024191314240976E-18 -0.110791909500862003E-17
+  0.171270061934819991E-17 -0.592029708016452967E-18 -0.107287972426087994E-17
+ -0.206796869538028005E-17  0.276609196837732003E-20 -0.759576795450294022E-18
+  0.875726188342539956E-18  0.519314816225762026E-18 -0.109032911543856009E-17
+  0.142830372383842998E-17  0.100573850791572003E-17  0.604142927981981971E-17
+  0.479671630253418989E-17  0.110844683623943006E-18 -0.235422778124405997E-17
+ -0.161887517682752998E-18  0.235435785539538009E-17 -0.177241644212961999E-17
+  0.210064170919065983E-17  0.814342769446204974E-18 -0.282994638255850981E-17
+  0.217052192733914010E-19  0.234776357155211005E-17 -0.308228175752572013E-20
+ -0.145160674586253994E-17 -0.227329022625726990E-17 -0.495544052645817993E-18
+  0.544329179940724975E-17  0.237253928525929987E-17 -0.201932654625425008E-17
+ -0.155176435936987995E-17  0.160208341039270009E-19  0.892772726406032943E-18
+ -0.179570984817912019E-17  0.110643678735093001E-17  0.243945488809238016E-18
+ -0.147722546001149998E-17 -0.300791563965369003E-15  0.123197871683703990E-15
+  0.519874941706799006E-16 -0.398570506297564017E-15 -0.938756451046574014E-15
+ -0.674170463378642972E-16  0.300205417165869003E-16 -0.819385791855919976E-16
+ -0.474473975733969001E-16 -0.327433820650950991E-17 -0.162460919283374998E-17
+  0.252287936159080996E-17  0.695202291458967003E-18  0.296090954624581016E-17
+  0.361391397802216010E-17  0.408203287321918007E-18  0.495238006025435965E-17
+  0.275835450342189021E-18  0.364880639030068002E-19  0.194083312525632001E-17
+  0.278119002058286018E-18 -0.641258404061236032E-18  0.165955267525392004E-17
+  0.475587792606270030E-18  0.475587792606270030E-18  0.165955267525392004E-17
+  0.499313877686834030E-18  0.118594492762701009E-17 -0.532072040726810014E-18
+  0.597479878085993991E-17  0.616095653587975001E-17  0.267973686332527998E-17
+  0.237255357212338008E-17  0.415645509636498002E-18  0.406581717756872996E-17
+ -0.216855405062163982E-17  0.128431581959682994E-18 -0.333001210673627003E-17
+ -0.529555996006423025E-18  0.158490256251231992E-17  0.779339004809818954E-19
+ -0.184932006891523006E-17 -0.121805361876716993E-17  0.305020810095135983E-17
+  0.451000953589631032E-17 -0.143328195155579993E-17  0.261449190125046997E-17
+  0.213311232586893987E-17  0.308627464263071995E-18 -0.142468127993853007E-17
+  0.138447030256762007E-17 -0.416491634298082012E-18 -0.634896151970501991E-18
+  0.766861496138588998E-18 -0.678597100070076063E-20  0.120185447177140002E-17
+  0.429046681675597000E-18 -0.176084645720124014E-17  0.144825178137232999E-17
+  0.314218888807008010E-17  0.734785045431920036E-18 -0.117162594263929007E-17
+  0.608863462349075997E-17 -0.662627775282060006E-19 -0.127825823877199996E-17
+ -0.171021320481935997E-18  0.496852428684837022E-19 -0.322174981279925987E-17
+  0.429050616589522023E-18  0.104417858731019005E-17  0.163454775014506994E-17
+  0.178823488869088012E-17  0.581478820662879978E-19  0.120185445884671008E-17
+ -0.129816506770870004E-18 -0.127996804138450008E-17 -0.154357189784606990E-19
+  0.214940359022386997E-17 -0.331603456810303990E-17  0.189241510509073018E-17
+  0.197721030105515002E-18 -0.416669355256307981E-17 -0.851010143081933984E-19
+ -0.125601149543838993E-17 -0.238461833303634009E-17 -0.169618840330709007E-17
+ -0.169618840330709007E-17 -0.238461833303634009E-17  0.400549807943398004E-18
+ -0.271531580761243988E-17  0.199875144619239998E-17  0.743049289256009004E-18
+ -0.307195681303235978E-18 -0.282936086004326019E-17 -0.136888465210175001E-17
+  0.725986645135740000E-18 -0.163159410949012006E-17  0.172113641403015996E-17
+  0.328205462052660005E-19 -0.427613578091477998E-17 -0.808777579770756965E-20
+  0.234837217968203999E-17 -0.253264027376469004E-19  0.574118313848444962E-18
+ -0.393557272547890009E-18 -0.326626289571594991E-17 -0.775608933612533044E-18
+  0.105058856049080991E-17 -0.972396842945079028E-18  0.134638939547039006E-17
+ -0.102121372777366003E-18 -0.136083563029536003E-17  0.281128595079716003E-17
+  0.405831033256048975E-19  0.242827227208932005E-19 -0.170833281491397008E-17
+ -0.127819918718633995E-17  0.635195301101919971E-18  0.626566152951766979E-18
+ -0.199179797546848994E-17  0.132218081797795004E-18 -0.149746601790291009E-17
+  0.212336552384022990E-19 -0.292432710560827011E-17  0.729454186263562966E-18
+ -0.214066401594843015E-17  0.303724839061711014E-17 -0.113084192414368992E-17
+ -0.340000958562557999E-20 -0.173530847038446989E-17  0.626566152951766979E-18
+ -0.199179797546848994E-17  0.132218081797795004E-18 -0.149746601790291009E-17
+ -0.127819918718633995E-17  0.635195301101919971E-18 -0.108475496177924000E-19
+ -0.173530847038446989E-17  0.407397618620635981E-18  0.303835815115187008E-17
+ -0.182128580743015985E-17  0.358929916256484033E-17 -0.155546948389484994E-17
+  0.165743789754505008E-17 -0.733902645660011989E-18 -0.390963850189295982E-18
+ -0.130426075704324999E-17 -0.336457008402797007E-17 -0.323961040779898006E-17
+  0.134737673440338004E-17  0.209549852390063008E-18  0.950982283458043069E-18
+  0.311317695677142983E-17 -0.147193675656687989E-18 -0.592093288710995990E-17
+  0.271635327503675007E-17  0.162966204193836999E-17 -0.257298989513354013E-17
+  0.145506049662061992E-17 -0.187935630575551008E-17 -0.341302337267962986E-19
+ -0.249774864015655006E-18 -0.557204531224242037E-17 -0.140921796185665985E-19
+ -0.191936809076605013E-17  0.109847716108033996E-17  0.307745161082448993E-17
+ -0.606673286482521039E-18 -0.725513167447356014E-18  0.223354010499117018E-17
+  0.660402286256383978E-18  0.261454292557767008E-17 -0.573978120970277956E-18
+  0.106320334455071992E-17 -0.381807789389963005E-18 -0.201346560792103982E-17
+ -0.501719392383432000E-18  0.170682457038868001E-18  0.276999327163805993E-17
+  0.143519550603408003E-17  0.166110181328899991E-17  0.857665614334221004E-19
+  0.132134526597416992E-18 -0.163342014702957008E-17 -0.159162656391022990E-18
+  0.333481192424111018E-18  0.110073055806264995E-17 -0.165744961785494992E-17
+ -0.723664504377990964E-18 -0.373996400265941977E-18 -0.369427622167534017E-19
+  0.137017753030935008E-17  0.219171077410990982E-17 -0.120449150112210007E-17
+ -0.380275891434780971E-17  0.355095637040657016E-17 -0.545627094050522035E-18
+  0.120185446207787991E-17  0.297192801651785003E-17  0.587392766169290955E-18
+  0.135525271560688006E-18 -0.279859685772820986E-17 -0.256481576428601989E-17
+  0.100246349307546002E-17 -0.813151629364128037E-18 -0.449943901581484032E-17
+ -0.105709711817336990E-17  0.673391193067168962E-18  0.214299335655337986E-17
+ -0.399799551104030001E-18 -0.399799551104030001E-18  0.214299335655337986E-17
+  0.815057453495450983E-18  0.164519606391896997E-18 -0.132860922807655002E-17
+  0.338956115711568991E-17  0.276652871974074985E-17  0.209062470706900983E-17
+  0.640476975732763991E-19  0.210311470833367019E-17  0.292498933315886017E-18
+ -0.266081026581987999E-18 -0.339530842093640994E-18  0.891473360161341033E-18
+  0.127055950214517004E-17  0.864474929349381958E-18 -0.173876150030865007E-18
+ -0.268677531257492988E-17  0.262439820419368005E-17 -0.394640487662082006E-17
+  0.239815370840882017E-17 -0.692232213035683983E-18  0.895380620082258923E-18
+  0.271301013410857011E-17 -0.270990856870302012E-18 -0.462501914737739018E-18
+ -0.282121724895181004E-17 -0.224758925933616008E-17 -0.213500811600352005E-17
+ -0.540097564326268985E-19 -0.571747239396653018E-18 -0.201000918383445008E-17
+ -0.203393786459438984E-17  0.320178454062126010E-18 -0.899548989984066960E-18
+ -0.643639160794860989E-18 -0.373628383703464999E-18  0.290045029488884995E-17
+  0.443262929210015969E-17 -0.124590605607226994E-17  0.105385457017215992E-17
+  0.250840866395480999E-17 -0.133195930955738998E-17  0.189142457121884996E-17
+ -0.166018457661843011E-18  0.441388868814215984E-17  0.256481576428601989E-17
+ -0.336229728412594982E-17  0.516690097825123026E-19  0.324032453972133018E-17
+  0.344954167769314006E-18 -0.224103742019807013E-17 -0.501443504774545993E-18
+ -0.975781955236953991E-18  0.473322010925703029E-17 -0.283733538017529017E-17
+ -0.116551733542191996E-17  0.253093444639585018E-17  0.203287907341032009E-18
+ -0.279870273684662012E-17 -0.303322498411764990E-17 -0.289685267960971005E-18
+  0.198205709657505990E-18 -0.884990779140575957E-18 -0.239498565836152988E-18
+ -0.148604651406377009E-17 -0.211101786279453002E-17 -0.360401931144864038E-17
+ -0.530348504099598992E-18  0.888749319906575016E-18 -0.337449898656761984E-17
+  0.172225982869958989E-19  0.542094468797851985E-18  0.101984131697427992E-17
+  0.152297909443849011E-18  0.134546096510577998E-18  0.167670171908989002E-17
+ -0.190413006542766986E-17  0.281780730027413984E-17 -0.127285229170680006E-17
+ -0.171465938303872008E-17  0.128071381624849999E-17 -0.709098925749857965E-18
+  0.231568881611974999E-17  0.245639554703747008E-19 -0.577358832672211987E-18
+  0.380971266001097985E-18  0.689428570783346966E-18 -0.300781399570002012E-17
+ -0.515313669285834969E-18 -0.334866534763107014E-17 -0.885606033578872933E-18
+ -0.755712621218751975E-18  0.215419523463735996E-17  0.672544160119913988E-18
+ -0.336399135002044994E-17 -0.127245524501276999E-17  0.482123254002297968E-18
+  0.163985578588433008E-17 -0.132475952950572997E-17 -0.288668828424266018E-17
+ -0.171100655345368996E-17  0.948676900924816019E-19 -0.311083437790982016E-17
+ -0.542101086242751976E-19 -0.274438674910393021E-18 -0.137908147745226995E-17
+ -0.582758667710959018E-18 -0.948676900924816019E-19  0.260526158751741003E-17
+  0.896160858195049951E-18  0.346055310600744018E-17  0.406628754241267990E-18
+ -0.158687652201152991E-17  0.481114714040442978E-18 -0.347283508374262985E-17
+  0.670850094225405965E-18 -0.124582664673347005E-17 -0.243945488809237997E-17
+  0.605120337518472008E-17  0.677626357803439982E-18  0.306668278553419014E-17
+  0.286720652645581002E-18 -0.311708124589583006E-18 -0.779270311473956023E-18
+ -0.112231865511195007E-17  0.422245924206269008E-18  0.785411300341549989E-18
+ -0.177876918923402994E-17 -0.271008191474013008E-17 -0.191577676845240994E-17
+  0.192117660349115996E-17 -0.172907527995913996E-18 -0.196332952071170988E-17
+ -0.295853553788778990E-17  0.297413657783084012E-17  0.238766345557921019E-17
+ -0.258331152278270982E-17  0.434993770062446016E-17 -0.391244518336761018E-17
+  0.249172443099960983E-18  0.295345913046855004E-17  0.180478236513759987E-17
+ -0.333194968181260020E-18 -0.398801640413046019E-17  0.127766979159431005E-17
+  0.406976831843030025E-17  0.125052834133521992E-17  0.584942424528099012E-18
+  0.105310018145351002E-17 -0.248180653545509999E-18 -0.766564817265142048E-19
+ -0.351391618168446033E-17  0.116087933372732994E-17 -0.435374934888709989E-18
+ -0.120702194983737991E-17 -0.184250841851491989E-17  0.169237182861408996E-17
+ -0.340337838206777992E-17 -0.248543289526053016E-17  0.456352235223054964E-18
+  0.241855596312107998E-17 -0.178644542531852011E-17 -0.843300708330460982E-18
+ -0.535150122119347000E-17  0.259308548890062985E-17 -0.842797782518028958E-19
+  0.840044925439451976E-18  0.223701401369860989E-17  0.350290475337015982E-17
+  0.140776875833665005E-17 -0.137781820271733003E-17  0.193970544921234994E-18
+ -0.692872950854017966E-18  0.230816478126796993E-19  0.914583824797830984E-18
+ -0.332036915323686022E-18 -0.124683249835832991E-17  0.221583819001725007E-17
+ -0.271333769763113996E-17 -0.102999206386122998E-17 -0.175844039849992982E-17
+ -0.120617491689012002E-17 -0.683979104907848005E-18 -0.119558700504943992E-17
+  0.169406589450860008E-19 -0.143995601033231009E-18 -0.298933170139291000E-17
+ -0.111914228155973999E-18  0.259477955479513999E-17 -0.505043394800376962E-18
+ -0.261627301583172017E-17 -0.559571140779871969E-18 -0.249122977699329985E-17
+ -0.244248420444131008E-17  0.763575231435010022E-18 -0.498453660450472978E-18
+  0.131165285269981993E-17 -0.409226721750142997E-18 -0.166919670939219005E-18
+  0.356664398265104977E-17  0.123031535588687009E-17  0.140753053032022997E-18
+ -0.174446435487023001E-17 -0.348347593514250969E-17  0.608010837450976981E-19
+ -0.288229430082877002E-18 -0.270444385168497019E-17 -0.511978477056020969E-18
+ -0.180010383159299994E-17 -0.519787062038522031E-18 -0.778333943020545996E-18
+  0.223510818956729003E-17 -0.461209439779967016E-18 -0.254326936369024016E-17
+ -0.754514450101469011E-18 -0.729630168938883045E-18  0.215419524756206011E-17
+  0.423516473627149977E-18  0.118034041199887003E-17  0.526282745952778004E-17
+  0.189749380216831006E-17  0.362191288245938979E-17 -0.373710936328597001E-17
+  0.325260651745650993E-18  0.315943289325853988E-17 -0.894466792300540965E-18
+ -0.152582397536022009E-17 -0.189735380184963011E-18  0.109436656785256008E-17
+ -0.137907918816530007E-17  0.548877349820787005E-18  0.989334482393022989E-18
+  0.359374903696317968E-17  0.372694496791892029E-19  0.346055310600744018E-17
+  0.325207712186447985E-18 -0.166962105304642995E-17 -0.189715527850262003E-18
+  0.189418900882599987E-17 -0.165532075461661994E-17  0.311165494107747979E-18
+ -0.215505034116195009E-17  0.118796370852416008E-17  0.601181634313739955E-18
+ -0.347389387492670004E-17 -0.208708918203459989E-17  0.337288519596662010E-17
+  0.931736241979729988E-18 -0.145689666927739995E-18  0.227220724004029024E-18
+ -0.162143943672644003E-17  0.174742897018562006E-17  0.323481882556416991E-17
+ -0.635274710440724965E-18  0.375235595633655002E-18 -0.145080924035446995E-17
+  0.143389882520053003E-17 -0.235633978014306013E-18 -0.112496563307211995E-17
+  0.293844332080887002E-18 -0.161068608876326000E-19 -0.307600014795399014E-17
+  0.258090939028385005E-17 -0.476763082273924010E-17 -0.228127148519264015E-17
+ -0.309844652105622987E-17  0.174912303608012996E-18  0.531513174402072981E-18
+ -0.382435375685316988E-18  0.900819539404948025E-18  0.326277091282355990E-17
+ -0.264724265796570986E-18 -0.879365783037773085E-18 -0.260547334575423015E-17
+ -0.143360326322790005E-17 -0.504831636563563002E-18  0.179242759550851016E-17
+  0.298155597433513989E-17  0.101982766849417992E-17 -0.297393267780985003E-17
+  0.257498015965306995E-17  0.151788304147971006E-17 -0.326954717640159979E-18
+  0.230392961653170000E-17  0.139591029707509004E-17 -0.742847894742021012E-18
+  0.256650983018053003E-17 -0.153143556863578002E-17  0.409540429997454021E-18
+ -0.142725051612349992E-18 -0.443845264361252992E-18  0.113671821521527009E-17
+  0.263935466364439986E-17  0.180841534238793007E-17 -0.220183567660795011E-17
+ -0.102829799796672003E-17  0.334027442749732992E-17 -0.454009659728304982E-18
+ -0.325260651745650984E-17 -0.522448571488674997E-18 -0.336345389745121001E-17
+ -0.457041606071333975E-17 -0.304355831294945982E-17  0.427237952841651978E-17
+ -0.186941463343081017E-17 -0.129563254076186006E-17 -0.112984180780269004E-17
+  0.463510525760183965E-18  0.543403608029869993E-17  0.628157090539567034E-17
+  0.107863850412424995E-18 -0.496782957561507984E-18  0.143475058212455998E-17
+ -0.157547458689992002E-17 -0.229157071099953982E-17 -0.353871744751775981E-17
+ -0.655427268083501956E-18 -0.373057479213008010E-18 -0.288389363397911984E-17
+  0.267562859761936013E-18  0.221342377338269010E-17  0.759040992641635024E-19
+ -0.427051187586338014E-18 -0.246546273902076005E-17  0.311034949497450000E-17
+ -0.250275745587688010E-17  0.149384832852448007E-17  0.785170068645730031E-18
+ -0.124784693198205006E-18 -0.251252152874503999E-19  0.306800057149170005E-17
+  0.512280225515675992E-18  0.352936752798798985E-18 -0.468641674547420009E-18
+  0.342326765393769015E-17 -0.232885905658585013E-17  0.440296799120129026E-17
+ -0.682831107464595018E-18 -0.159760541806344992E-17 -0.278081237108601981E-17
+  0.216042711637734983E-17  0.166110979752061002E-17  0.117815888393792008E-17
+ -0.169403964681784996E-18  0.249326037070838015E-17 -0.597174026640863979E-18
+ -0.197246651090594008E-18 -0.770422236383328028E-18 -0.405218297824251983E-18
+ -0.384955849310639988E-19 -0.747681707350008986E-18 -0.163334696595593989E-19
+  0.145763242918459997E-17  0.291859303545338004E-17 -0.294086889444445013E-17
+  0.108848259833857008E-17  0.139201626084264003E-17  0.337320625108601995E-20
+  0.120185558006418003E-17  0.379317687846262973E-17 -0.300611688448085986E-17
+ -0.418480598057927000E-18 -0.261770197033990986E-18 -0.125675852095236000E-16
+  0.569849893866608996E-20 -0.467203818658997981E-18 -0.969974113361066045E-18
+ -0.308892401784452019E-18 -0.128391665958031008E-18 -0.149140333571066003E-17
+  0.168559556503606008E-18  0.264274279543341995E-18 -0.103443898683430996E-18
+ -0.166316242682362005E-18  0.493529040673638011E-19  0.366553507924297990E-18
+ -0.965617559869902025E-19 -0.628914596258986970E-18  0.680130388614003019E-19
+  0.250558797806600002E-18 -0.222187329976643995E-18  0.573591561356962000E-19
+  0.337396632102723016E-19 -0.405891570879359990E-17 -0.890046339107058019E-19
+  0.484291087592646026E-18  0.594828887209332028E-18 -0.110802497412702995E-18
+ -0.409245953699384999E-18  0.386593778060743030E-17 -0.846764863036188956E-19
+  0.417633151520367001E-19  0.269234881676822002E-19 -0.653364819565015025E-17
+ -0.108858622973204005E-18 -0.264863232139479010E-19 -0.124010917433949999E-19
+  0.374812079160027996E-18  0.271262301358190021E-18  0.288414718540088996E-17
+ -0.396919639083364985E-17  0.240811466904398010E-17 -0.235051642863068013E-19
+ -0.148738985537855005E-17 -0.355753837846806022E-19  0.407761660808220029E-17
+  0.397766672030618977E-17  0.493311988480904980E-17  0.739565642071410984E-19
+  0.343477958748839018E-14 -0.366744259744019982E-14  0.123993155153046002E-13
+ -0.668060306510329026E-14  0.135525271560687994E-19  0.152465930505773989E-18
+  0.242251422914729993E-18 -0.542312844479565985E-18  0.462585868319255004E-17
+ -0.919879898349006038E-19  0.210064170919066003E-18 -0.643745039913268017E-19
+  0.125157058890702999E-16 -0.582864546829365060E-19  0.246041564481448014E-18
+  0.101296537813243998E-18 -0.238514220907215010E-17 -0.797809496952801045E-19
+ -0.192534559377843010E-18 -0.187360213774079013E-17 -0.389635155736977975E-19
+ -0.200746808499269012E-18 -0.245893664587923003E-17 -0.948676900924816019E-19
+  0.000000000000000000E+00 -0.108420217248549998E-18  0.261375838676955993E-18
+  0.725966792801038968E-19 -0.167646349107347008E-18 -0.805740091075652957E-19
+ -0.511324047944521005E-17 -0.804386616798363005E-20  0.803374354523752020E-19
+  0.233691757936031022E-18  0.322467056021906979E-19  0.692362580416071978E-19
+  0.782403671634307027E-17  0.714617874796812030E-19 -0.205034912794744004E-18
+ -0.370259277068535988E-18 -0.219593291575676978E-18 -0.226587930835426006E-18
+  0.589328467008100031E-17 -0.458175341293122029E-19 -0.555484868553859964E-19
+  0.307568912804366982E-18  0.664220076179670995E-17 -0.838628792230762059E-19
+  0.133440776417054003E-19  0.513182852027898008E-19  0.130496013436366007E-19
+ -0.467879824239593957E-18  0.114281685243550001E-16 -0.521729943861285986E-17
+  0.141818726358787999E-16 -0.217687467444354995E-18  0.338813178901719991E-18
+  0.218534500391609006E-18 -0.564123942871364026E-17  0.155854062294790998E-18
+  0.738612730005749992E-18  0.132137139771670997E-18  0.652215369385811020E-19
+  0.528548559086683026E-18  0.393023287525995020E-18  0.813151629364127964E-19
+  0.287313575708658983E-17  0.687790753170491972E-18  0.106387338175140007E-17
+ -0.813151629364127964E-19 -0.669156028330897037E-17 -0.169406589450859993E-20
+ -0.383705925106198005E-18  0.159877468794249008E-19  0.637946834691517017E-17
+ -0.251992301808153990E-19  0.130597260343342002E-17 -0.397914845927722009E-19
+ -0.771823101021564955E-19 -0.678629956492047021E-19  0.244403275117150019E-18
+  0.616306942007001003E-19  0.578090437734863976E-17  0.519520791676689028E-19
+  0.250034502269737982E-18  0.145212009554413988E-18 -0.175005708853890006E-18
+  0.394009615144800003E-18 -0.110346462401244998E-18  0.222948572015913991E-18
+  0.171997642246803001E-18  0.319375928924822986E-20  0.101563716873418997E-18
+  0.901803069674074007E-19 -0.274728422129961016E-17 -0.421283293404232023E-21
+  0.608978550920113979E-19  0.763010629372050983E-18 -0.286952614910220011E-18
+  0.730908685021870015E-19 -0.451919320602110971E-17  0.589814511873686954E-22
+  0.126157748391557989E-18  0.141953434272505006E-18  0.100040548709848005E-17
+  0.102890605326513008E-20 -0.935924522377375949E-19  0.847890824022396966E-19
+ -0.879092467376866968E-17  0.476821187834545992E-19 -0.753033951901363993E-19
+  0.168555449681110991E-19 -0.100415019031744998E-16  0.329480295536328014E-20
+ -0.430020650141573979E-19 -0.930238733580653041E-19  0.843287605802546953E-19
+ -0.745054355771767074E-17 -0.127373076708930990E-18  0.142668167903681001E-19
+ -0.212738678080100992E-18 -0.124931763730073991E-18  0.141197446866016998E-17
+ -0.623935018560200991E-21 -0.208825078665083006E-19 -0.702882739988749020E-17
+  0.135013553282889001E-18 -0.273662399296958983E-18  0.979870564908593936E-17
+ -0.967980574380733076E-17 -0.586955097351761970E-17  0.928385604696770927E-21
+  0.588030583085907051E-19 -0.303121635649031013E-20 -0.785213004720344977E-17
+ -0.143126772192240001E-21  0.442508069709277023E-17  0.348261833625540021E-18
+  0.403209644538310013E-17  0.158323610013567989E-18 -0.736025309049684025E-19
+  0.252063439340834008E-18  0.327873715300706015E-21  0.308873893618246024E-19
+ -0.782545946699665940E-19 -0.916516118708754965E-20 -0.330581739190081984E-17
+  0.194817577868489006E-18 -0.215146368602591997E-18  0.701978555037001045E-19
+ -0.378514539581815021E-17  0.926442286059390996E-20  0.154159996400282999E-18
+  0.327166475876973987E-18  0.149444776103045990E-18 -0.384315608916437980E-19
+  0.541531313897068035E-18  0.866120139888960971E-19  0.110405247154699008E-18
+  0.129192718002985003E-17 -0.571019320457605944E-19  0.236706004088174008E-19
+ -0.840441972133477969E-17 -0.208661272600177011E-17 -0.128778654135807999E-16
+ -0.241127284462270001E-19 -0.124068820076829006E-18 -0.257368975789749001E-19
+  0.361217365792718986E-18  0.122756911625320001E-18 -0.415100531279882990E-17
+ -0.172979513388722001E-17  0.192235450868344009E-17 -0.817419881324901948E-19
+ -0.542101086242751976E-19 -0.247333620598255988E-18  0.251695840276614999E-17
+ -0.439123055680311038E-17  0.812389299711598993E-17  0.749624158320056041E-19
+ -0.872105122493027937E-17 -0.372694496791892029E-19  0.467901000063275970E-17
+  0.399121924746226012E-17  0.660008072500550978E-17 -0.527542707461818990E-19
+  0.343843876982052995E-14 -0.367179973492087989E-14  0.124130984354223001E-13
+ -0.668730478978197023E-14  0.811118750290718005E-17  0.122819777351874007E-18
+  0.342201310690737000E-18 -0.599540507978433977E-18 -0.469610947825544972E-17
+ -0.214310597913247990E-19  0.372694496791892029E-19 -0.345589442479755020E-18
+  0.437836624391667992E-18  0.205643717725583008E-18 -0.312684304341145982E-18
+ -0.472009937038071041E-21  0.120475133905591992E-18 -0.260656398339187997E-18
+  0.673638442522140059E-19 -0.964679226862538027E-19 -0.287342692466221023E-18
+ -0.221571907600904013E-18 -0.434358495352004983E-17 -0.228529489169210000E-17
+  0.277911509994136018E-17  0.156800809279949005E-22 -0.476438579319618995E-17
+  0.793257935632228010E-19 -0.191429446079471997E-18 -0.792505201274804995E-19
+ -0.761249535594635047E-19 -0.759284593898908991E-19  0.245210185927285981E-17
+  0.326322751652168995E-21 -0.219505881849384995E-18 -0.351708553712615972E-17
+ -0.957527733479134017E-19  0.210332177437534002E-18  0.632850078629209974E-17
+  0.783703999557241003E-17  0.712107216201591027E-17  0.931799314501438023E-19
+ -0.162733723449395010E-18  0.273630519451929002E-18  0.405238470426719007E-18
+  0.290400572395675995E-18  0.309034676849811019E-19 -0.982194259345465061E-17
+ -0.459295178406396010E-17  0.378501304692013982E-19  0.660156303266319966E-19
+ -0.393711501795638995E-18  0.111130722679763993E-17 -0.731836466427715011E-18
+  0.133492392487278003E-17  0.170253622398114007E-18  0.678102813836270969E-17
+  0.116382328043262001E-19  0.652850644096251986E-18  0.135101755087061000E-18
+  0.135525271560687994E-19  0.406575814682063982E-19 -0.688214269644119025E-20
+ -0.338813178901719985E-20  0.291379333855478980E-18 -0.673391193067169010E-19
+  0.575558887659297003E-17 -0.529395592033938013E-20 -0.101643953670516005E-18
+  0.225310763969643986E-18 -0.860585474410368990E-17 -0.355753837846806022E-19
+  0.111384832563939989E-18 -0.619022265765282981E-18  0.659097512082252000E-20
+  0.367135843075535984E-19 -0.148258098918869009E-16 -0.403656384107634971E-20
+  0.301976499779261008E-19  0.189408714322993997E-20  0.272869291952943018E-18
+ -0.557349643847283955E-19  0.426147018319669965E-17  0.254580724327268992E-21
+ -0.869775563299138967E-19 -0.110887275953398993E-18 -0.998341549897226014E-19
+ -0.481248056751729986E-20 -0.185349767128022002E-18 -0.145910104401558996E-18
+ -0.131668031408930009E-17 -0.607341738615819030E-21 -0.624901402953516003E-17
+ -0.116001184478416006E-20 -0.833449305952115017E-20 -0.220151721333313981E-21
+  0.165335936503532999E-18 -0.129960524270914003E-18 -0.396916978071411982E-18
+  0.219508923152721984E-18 -0.827237810600491973E-17  0.341451599951967009E-18
+  0.159674085227868006E-18  0.188315067856254009E-20 -0.847491822467939984E-19
+ -0.455167206907106011E-19 -0.209933309261154010E-17 -0.145404646964087995E-21
+  0.168411398277699010E-18 -0.313218061614544984E-18  0.560594904015392966E-19
+  0.153058648530649989E-18 -0.122794367412262001E-17  0.111914706524383009E-20
+  0.171517425742714004E-18 -0.183906875580670001E-18  0.219862189175055999E-18
+  0.897452583439613029E-17 -0.352102288514519020E-19  0.153068958878854008E-18
+  0.146666444626402991E-18 -0.303260900504276985E-19  0.235125754644082987E-18
+  0.538382462411191964E-21  0.130959794039688006E-19  0.881466470414765984E-17
+ -0.176754757224459993E-18 -0.338370818476285984E-19  0.458932469699724984E-17
+ -0.582300740523848969E-17  0.582615730901109008E-17 -0.949456505418176036E-21
+  0.113959685864603008E-18  0.144076135613940989E-19  0.582626318812949995E-17
+ -0.116571720284226000E-20 -0.141483631752557998E-16  0.100448206267148993E-17
+ -0.113650645697845998E-17 -0.471162076910205015E-20  0.673391193067169010E-19
+  0.104026233834669001E-18  0.535324822664718007E-18  0.223616698075135001E-18
+  0.861263100768173047E-17  0.000000000000000000E+00  0.243945488809238016E-18
+ -0.174488787134385990E-18  0.271050543121375988E-19  0.487890977618476995E-18
+ -0.125527106409534999E-16  0.100472472860477996E-18 -0.955823741417275020E-19
+  0.107162902717469998E-18 -0.101127663721311999E-18  0.305449075076592991E-19
+ -0.416346654315768998E-17 -0.370398036616291987E-19  0.639750339166004974E-19
+ -0.124552860677330003E-16  0.295344837712059013E-19  0.438212164389766970E-18
+  0.101705744062273998E-16 -0.262887759400580013E-17  0.254481784579693981E-17
+  0.126166498411475007E-18 -0.267108613912254992E-18 -0.118145276312759989E-18
+  0.181096822855342012E-18  0.187530220069472992E-18 -0.423863827445876037E-17
+ -0.640878176666766969E-17 -0.881828403118920017E-17 -0.228818836947481010E-19
+  0.146804706393460995E-18 -0.227785688362402982E-18  0.113841228110978004E-17
+  0.179570984817912009E-18 -0.121972744404618998E-17  0.237846851589007997E-17
+ -0.233781093442186990E-18 -0.337119113007212016E-18 -0.477943634444158998E-17
+  0.130072496962737990E-18  0.000000000000000000E+00 -0.406575814682063982E-19
+ -0.595040645446145988E-19  0.148442524006315990E-18  0.293073399749988014E-18
+  0.321449003483006979E-18 -0.327462937408513012E-17 -0.140607469244213991E-17
+  0.440457132572235984E-18 -0.105549829437158004E-18 -0.242918130488448022E-18
+ -0.188834994088198992E-18 -0.239202104304613992E-17  0.119405175783255012E-18
+  0.120003227366130008E-18 -0.403498702803366996E-18  0.243945488809237997E-17
+ -0.745388993583784058E-19 -0.514996031930614991E-17 -0.502876181595488006E-19
+  0.874958564734091035E-19 -0.126038502551440006E-17  0.198205709657505990E-18
+  0.262368455412020012E-18 -0.435120825004534007E-17 -0.847032947254300038E-20
+ -0.933853824347866003E-19 -0.218110983917982006E-19 -0.158951026508189991E-18
+ -0.123216824045899002E-19  0.105479069047138999E-16 -0.221602720078721986E-19
+ -0.112094436116716006E-16  0.552668609373801021E-20  0.600580946092660953E-19
+ -0.150321091163205985E-19 -0.566633402054696966E-18  0.471817359051710964E-20
+ -0.309740174670613979E-18 -0.713982335130080985E-19  0.717113685500036983E-17
+ -0.388401777744873018E-18 -0.159250724383875008E-18  0.102895322840944002E-18
+  0.202123237038557009E-18  0.107599654080898005E-19 -0.115560962453870006E-16
+  0.115264413034391004E-18 -0.323460706732736002E-19  0.217151454407421018E-18
+ -0.162863590805566010E-19 -0.447140234933781975E-19 -0.458895903494596031E-17
+ -0.264017439963143991E-19  0.132772414482112011E-18 -0.946559318556680990E-19
+ -0.602663941971434999E-17  0.533630756770209021E-18 -0.107064964532944006E-17
+ -0.152465930505774001E-19  0.198642461020933989E-18  0.271326821445969012E-18
+  0.764647908913610958E-17 -0.285476573004300971E-19 -0.423615735300657014E-18
+  0.405577242246589994E-17 -0.569841415265331035E-18 -0.202440874393777998E-18
+ -0.138405183581353003E-17  0.741153828847513037E-19  0.440457132572235996E-19
+ -0.145689666927739995E-18 -0.201593841446523018E-17  0.118584612615602002E-19
+  0.100288700954909006E-17 -0.389635155736978012E-18  0.253678923077150013E-17
+  0.463253320355523965E-17 -0.542861329849173998E-18 -0.578182497767161953E-18
+  0.378034402595259001E-17 -0.390167653256308987E-19 -0.809119123877932027E-18
+ -0.637257875959320984E-17 -0.796210970419041958E-19  0.108526096366957005E-19
+ -0.806968288849171933E-17  0.477726582251425006E-18 -0.325260651745650993E-18
+ -0.114409418473740993E-21 -0.136584062744755990E-19  0.204770214998727011E-18
+ -0.769571784227895035E-17  0.308108234563751989E-19  0.136366586621958005E-17
+  0.456872337957895004E-19  0.560083992759655039E-19  0.178381499097059997E-19
+  0.290512448573523984E-18  0.169088952095639994E-18  0.173985861321953990E-18
+ -0.136689941863163006E-18 -0.742890246389383958E-17  0.225469582647254023E-18
+  0.414781446358590005E-18  0.579688173277162020E-20  0.298155597433514008E-18
+  0.166018457661843011E-18 -0.339766090967380983E-18  0.405517023497995986E-19
+  0.152465930505773989E-18  0.265968345437850018E-18 -0.671697127172659976E-18
+ -0.315784470648243990E-18 -0.822315467062235941E-17 -0.257815653320528003E-18
+  0.697955148537542998E-18 -0.406575814682063982E-19 -0.247333620598255988E-18
+  0.582758667710959018E-18 -0.727897763222982996E-17  0.448338738054221020E-20
+ -0.542101086242751976E-19  0.189735380184963011E-18 -0.758941520739853008E-18
+  0.460785923306338999E-18  0.111490711682346993E-18 -0.542101086242751986E-17
+  0.787740640946499026E-18  0.358824332280602981E-18  0.387587718284786979E-17
+  0.202017357920150992E-18 -0.887267012248879966E-19 -0.146410968421885989E-19
+ -0.105032085459533001E-18  0.372694496791892029E-19 -0.189735380184963011E-18
+ -0.355922582691767008E-18 -0.767073037033493925E-17 -0.682708555486965996E-17
+  0.203287907341032009E-18  0.345589442479755020E-18  0.662040951573960991E-17
+ -0.326107684692905992E-19  0.393023287525995020E-18  0.105709711817336990E-17
+  0.792822838630025021E-18 -0.203287907341031991E-19 -0.542101086242751976E-19
+  0.542101086242752024E-18  0.474338450462407997E-18 -0.137219337455196992E-18
+ -0.847032947254299978E-19  0.326173859141910017E-18  0.731219906678634021E-19
+ -0.946472464592363042E-19  0.139237548491848992E-18  0.612680171563180001E-19
+ -0.188808491239547999E-17  0.145513853081274004E-20  0.351754011649361006E-19
+  0.103319340142544008E-16  0.706026828653623967E-19 -0.227702440388566992E-18
+ -0.768408726927613926E-17 -0.144477278643678006E-18  0.338346241908282006E-18
+  0.257484777844331994E-18  0.254997244763346007E-17  0.210498854330963010E-19
+ -0.183381484398103994E-18 -0.755890011070466999E-19  0.802413433023315002E-17
+  0.699252607177727046E-19  0.213594500225259994E-18 -0.235617227606901007E-21
+  0.130534219874883997E-16 -0.939645122611503009E-19  0.181688567186046989E-18
+  0.550571415715294980E-19  0.367633474932048013E-17  0.264951905901144992E-17
+ -0.711507675693612044E-19 -0.796210970419041958E-19 -0.555997720533642962E-19
+  0.694178241860625994E-19 -0.971685756843591012E-18  0.352246592049581004E-19
+ -0.177016651086348013E-19 -0.809573907778713970E-17 -0.165171424714588999E-18
+  0.330342849429177012E-19 -0.540915240116595965E-17 -0.352026892878886990E-17
+ -0.329326409892472010E-17  0.622238343986890020E-19 -0.660685698858354024E-19
+ -0.982558218814988032E-19  0.528548559086683026E-17 -0.313402190484091005E-19
+ -0.866032624180153034E-17  0.952700100629121048E-19  0.000000000000000000E+00
+  0.121972744404619008E-18  0.146367293285543002E-17 -0.155774652955986003E-18
+ -0.406575814682063982E-19  0.127393755267047002E-17  0.142301535138722000E-18
+  0.508219768352579978E-20 -0.738612730005749992E-18 -0.115196480826584991E-18
+ -0.121972744404619008E-18  0.894678550537355034E-20 -0.120066920273297001E-18
+  0.246976278573632984E-18 -0.799387343971245945E-19 -0.190053017540184000E-18
+  0.346884975340074986E-19 -0.175978087053195992E-19 -0.495753329340794005E-18
+ -0.285070427323536989E-18 -0.441968288153793989E-19  0.812601554256781032E-21
+ -0.748406217586133030E-17 -0.742510405052100003E-19  0.347283508374263019E-19
+  0.168771314740419006E-18 -0.142883870289960005E-18  0.396517298433418985E-19
+  0.398084309385840033E-17 -0.227640104574593005E-19 -0.514307817660970006E-19
+  0.349851076995627978E-18  0.116316152503736994E-16 -0.578894079889111008E-19
+  0.347336447933467014E-18  0.295353109518184001E-19 -0.312025761944802984E-18
+  0.281532575843647979E-18  0.110855436971906990E-18  0.133195930955738993E-18
+ -0.630616029230826988E-17 -0.385230584411256025E-17  0.125369147999762005E-18
+ -0.152512769607959997E-18  0.905901737088474059E-17 -0.838562617781756975E-19
+ -0.264274279543341995E-18 -0.777020380207811956E-19  0.542101086242751976E-19
+ -0.542101086242751976E-19 -0.542101086242751976E-19  0.406575814682063982E-19
+  0.840256683676266033E-18  0.551587855251999996E-17  0.158225754547102995E-17
+  0.362371282747230003E-19 -0.216840434497100983E-18 -0.745388993583784058E-19
+  0.400138364282931018E-17  0.162247341249214004E-19  0.161979169294623999E-17
+ -0.595570041038180046E-19  0.894466792300540965E-18  0.284603070277445010E-18
+  0.502798757490152989E-17 -0.613933450636857020E-19 -0.294972606436410006E-18
+  0.599699326656045025E-17  0.148230765769502993E-18  0.350671640163280003E-18
+  0.628498446862691006E-17  0.169406589450860008E-19  0.381164826264434972E-19
+ -0.220228566286117998E-19 -0.199529198637591003E-18  0.108631975485363994E-18
+ -0.129066645337873990E-18  0.144456340634422998E-18  0.530040792911729013E-19
+  0.403618230401881005E-18  0.737014865216481058E-17  0.110059844318994989E-18
+ -0.454074179816084022E-18  0.352795095513764987E-17  0.327364999223985992E-19
+  0.128494236353987006E-18 -0.206400753422191982E-17  0.908972231522271051E-19
+ -0.196988099795828006E-18 -0.510866746312750012E-19 -0.832093403647102925E-17
+  0.508219768352579978E-20 -0.156595216123638992E-18  0.304349525860310983E-18
+ -0.762382592088074007E-17  0.127054942088145006E-19 -0.180629776001980005E-18
+ -0.293741761684931004E-18  0.590206271074140996E-17 -0.421489881126395011E-19
+ -0.346224717190195023E-18 -0.299214388617581980E-18 -0.100318347108062999E-16
+  0.568359107607636009E-18  0.107471540347625997E-16 -0.118584612615602002E-19
+ -0.106064406863998999E-18 -0.429267033245619011E-18 -0.243921335135351988E-17
+ -0.434104385467828984E-19  0.210838411972415998E-18 -0.518765659418141033E-17
+ -0.190582413132218010E-18 -0.912678000666508976E-19  0.540618778585057037E-17
+  0.694567016748526037E-19  0.345589442479755020E-18 -0.406575814682063982E-19
+  0.202271467804326998E-17 -0.711507675693612044E-19  0.118923425794503999E-17
+ -0.203287907341031991E-19  0.797629750605692964E-17  0.188464830764082006E-19
+ -0.451061588025165991E-18 -0.189821406968669007E-18 -0.400002748021503026E-17
+  0.124839752407727995E-18  0.327464209198704022E-18  0.545653123870147032E-17
+ -0.251365764192934006E-18 -0.179010689506355000E-18 -0.904977154343205932E-18
+ -0.183495918530850001E-18 -0.399801975895928995E-18  0.278820123303884006E-23
+  0.562694574772872009E-18  0.202758511748997993E-19  0.854656243779589042E-18
+ -0.390636044278167019E-19 -0.107188002479181997E-17 -0.245956339028960991E-20
+ -0.580863803722753026E-19 -0.791746263062787051E-20 -0.574261868458814053E-19
+  0.236017789818529992E-18  0.373936922071946989E-18  0.518031784778684025E-18
+  0.245470727140724987E-17 -0.691522992094331016E-19 -0.285741270800317976E-18
+  0.316578564036294992E-19  0.110114283143058996E-18 -0.169406589450859996E-18
+ -0.754330485133237969E-17  0.942324153820408977E-20  0.281214938488428001E-18
+ -0.111808349037568006E-18  0.279944389067546021E-18 -0.269065309651248993E-19
+  0.764138861964645979E-17 -0.310440883891150985E-19  0.204959664400601989E-18
+  0.515554994382785973E-19  0.579622776531413976E-17 -0.229418597622752010E-19
+  0.920281259736054051E-17 -0.399130950750337972E-23 -0.120874248551148997E-18
+ -0.174419097167777988E-18 -0.868654828080977068E-18 -0.900415783177556089E-21
+  0.404465583235387984E-19 -0.681480957419390977E-17  0.149009478768038989E-19
+  0.338276207821614990E-18 -0.482449426342788030E-17  0.360144848996880021E-20
+ -0.141509621152472008E-18 -0.132682285823646993E-18 -0.142110385631281993E-16
+  0.189967376932361008E-22  0.110892271134246991E-18  0.633346199603779993E-19
+ -0.300611992980550998E-17 -0.269779993700494995E-18  0.391392087358040979E-18
+  0.589549975636383965E-19  0.134263796311126002E-16 -0.559058288800089044E-19
+ -0.227134697220322979E-18  0.286999495230071982E-17 -0.530824960132428959E-18
+ -0.419810704482912991E-18 -0.874646221334790992E-17 -0.519739416435238996E-17
+ -0.217856874033806009E-17 -0.364476457406177966E-23  0.131634213959238997E-18
+ -0.338019085513668988E-19  0.242463181151542991E-19  0.205405489709168013E-19
+  0.396478007354323017E-19  0.290554014399305015E-18 -0.223490056723352997E-17
+ -0.582914177666118979E-20 -0.155517709978212000E-20  0.518526149271776997E-17
+  0.184335545146216991E-18  0.465497544075441045E-18  0.198713929425859014E-17
+  0.592923063078010012E-20 -0.474073752666390980E-19 -0.119365471113852009E-18
+ -0.367304091612129012E-17  0.149091033606558009E-19 -0.402266203690663009E-19
+ -0.295418456786575992E-18 -0.928032954377330083E-18  0.123415347392912000E-19
+ -0.403637669146276001E-18 -0.193017632855574003E-18  0.642437432800943969E-17
+ -0.115514118181804992E-18  0.261563774112127983E-17 -0.575982404132923990E-19
+ -0.514996031930614991E-18 -0.508219768352580011E-18 -0.113336184716178005E-16
+ -0.258810717670098996E-20 -0.318484388167617024E-18  0.813151629364127964E-19
+  0.115196480826585000E-17 -0.162630325872826002E-18 -0.178829830989063991E-18
+ -0.260208521396520987E-17 -0.135525271560687994E-19  0.731095312598867980E-19
+ -0.169050570915216994E-17  0.107679063419703003E-18 -0.465868120989865006E-19
+  0.981367078732912033E-20  0.423516473627149989E-19 -0.982558218814988032E-19
+ -0.182959116606928994E-18 -0.101988060805338004E-18 -0.209992702514142018E-17
+  0.590276085117841030E-19  0.271050543121375988E-19  0.162630325872826002E-18
+ -0.382181265801140005E-17  0.383811804224604982E-21  0.423516473627149977E-18
+ -0.149077798716756994E-17 -0.279182059415017007E-17  0.592923063078009997E-19
+ -0.338813178901719985E-20  0.308319992800564986E-18  0.150771864611265003E-18
+ -0.740624433255478979E-19 -0.387094056895215014E-18  0.676501392170368033E-19
+ -0.121549227930992002E-18  0.295402740354937003E-19 -0.291416832279092011E-17
+ -0.460872725571868987E-19  0.758375729200866953E-19  0.521371940092172967E-20
+ -0.406504324951388992E-19 -0.651424377925056947E-19 -0.108738516348261003E-17
+ -0.957544277091384987E-20 -0.357024387267687978E-18 -0.120808074102144995E-18
+ -0.257471546185705991E-18  0.344363560811950999E-18  0.515776973146925997E-17
+  0.290371414279082971E-19  0.268688528882230986E-18  0.191512370935880005E-18
+  0.454950577675083974E-17  0.264846688527228016E-18 -0.236094717615498001E-18
+ -0.111563503576251997E-18  0.117314063194721006E-18 -0.277085652870562998E-18
+ -0.285873619698326003E-19  0.390852765598656020E-18  0.429106891079029014E-17
+ -0.138235776991902008E-17  0.236143521271638010E-19  0.298686647386773004E-19
+  0.674238226014422974E-18  0.847032947254300038E-20  0.176182853028893989E-18
+ -0.202096767258955992E-19 -0.745388993583784058E-19  0.325260651745650993E-18
+  0.119262238973405006E-17  0.307472959853311023E-18 -0.216840434497101002E-17
+ -0.154498809579183997E-17 -0.527193306371076993E-17  0.337489689921634990E-19
+  0.389635155736978012E-18  0.267662411332359003E-18  0.125360876193636007E-17
+  0.162317651601281012E-19 -0.846415539645091067E-17  0.115143541267381001E-20
+ -0.813151629364127964E-19 -0.243945488809238016E-18 -0.454009659728304982E-17
+ -0.104290931630686000E-19  0.338813178901719991E-18  0.623416249179164953E-18
+  0.313063377305188982E-17  0.220228566286117998E-19  0.609863722023096003E-19
+  0.155854062294790998E-18 -0.355753837846805998E-18  0.374388562686400991E-18
+ -0.416740210049116008E-18 -0.124831480601602994E-18  0.223034362923898006E-18
+ -0.571879588294660973E-19 -0.533562697900371999E-17 -0.143723355214125000E-18
+  0.107285742269608005E-18 -0.225737175575414976E-18 -0.854858075849051981E-19
+  0.293756650935957019E-19 -0.710708536503824959E-17 -0.797732982746140040E-20
+  0.214140516977727985E-19 -0.228315083954436005E-18 -0.165965518102638991E-18
+ -0.213240544471269990E-18  0.618450518525887022E-17  0.116043513773839002E-18
+ -0.139892785194967992E-18 -0.506313944221258028E-18  0.133937084784586000E-18
+  0.204346698525099987E-19  0.536807130322413020E-19  0.124523769413732997E-19
+  0.148952067263648992E-18 -0.301517259442928983E-18 -0.127393755267047002E-17
+  0.398105485209521015E-18  0.211419423634673018E-17  0.340168431617327016E-17
+ -0.430292737205185006E-18 -0.193123511973979996E-18 -0.107814853389060005E-16
+  0.631039545704453993E-19  0.230392961653169981E-18 -0.135525271560687994E-19
+  0.976205471710581020E-19 -0.194817577868488988E-19 -0.813151629364127964E-19
+ -0.117102304957906998E-18 -0.428768077900126986E-17 -0.606475590234079043E-18
+  0.101643953670515996E-19 -0.106005024343306003E-18  0.744917500634628952E-19
+  0.810409525633515039E-20  0.448588648865876997E-17  0.121760986167806011E-18
+  0.375955573638820974E-17  0.914795583034644005E-19 -0.891027874206852923E-17
+ -0.104166389249707997E-18  0.901362169886782956E-19 -0.633570718378866017E-19
+  0.112392829688519002E-18  0.194071254160812988E-18  0.147344671802759991E-17
+  0.697367299791314956E-19  0.211231296913984995E-19  0.479296835202547044E-18
+ -0.566753257617398987E-19  0.323987061890757993E-18  0.259933235688662995E-19
+  0.986793383551260015E-19 -0.179676863936318002E-18 -0.919808297546715955E-19
+ -0.606703892083144040E-19  0.386989004957421020E-18  0.314651646799352012E-17
+  0.101994729949027002E-18  0.684812902965300976E-18 -0.806073444862511971E-18
+  0.110753693756562992E-18 -0.925118797079306001E-20  0.301637366067872000E-17
+  0.452103835596982987E-19 -0.753329927464293003E-19 -0.465974000108271974E-18
+  0.212139401639839982E-17  0.984675801183124024E-19 -0.342677766723567977E-18
+ -0.147701370177469013E-19  0.162418567636011994E-18  0.148230765769503011E-19
+ -0.840150804557859028E-19  0.448834817816173048E-18 -0.330239617288731015E-17
+ -0.107028899458235995E-19 -0.720116040930874954E-19  0.171179072713674006E-18
+  0.947754072384057926E-17 -0.104498004108727002E-16 -0.159287360851361995E-18
+ -0.546449624875564009E-18 -0.169016160201734991E-18 -0.141622419855816012E-18
+  0.105248409733328002E-16 -0.382470559941918998E-19 -0.297665499382925019E-18
+  0.962848986359687018E-18 -0.178557947061475997E-18  0.440432446400829996E-19
+  0.263750093250461983E-17  0.518265773643397017E-17 -0.455663906451196969E-17
+ -0.368948450555535963E-20 -0.373638438713888985E-18  0.213542219055679009E-18
+ -0.448714132358229986E-17 -0.219963878966174016E-20  0.576077695339490031E-17
+  0.158077523781333993E-18 -0.930746933967116952E-18 -0.950560029288081993E-19
+  0.423162853915283991E-19  0.142072256667234998E-18  0.287333593479481977E-19
+  0.170800388783012010E-19 -0.143679075460453001E-16 -0.306660014178995024E-19
+ -0.150184396739703993E-18 -0.853679844308174942E-19 -0.189294334590933006E-18
+ -0.272474283524034976E-18  0.125462270442160009E-18 -0.751674786718394965E-19
+  0.306550050694755012E-17 -0.309179488361727017E-21 -0.225680824731844995E-17
+  0.535668041239238007E-21  0.109157572264066003E-20 -0.602222147500702957E-21
+ -0.399748891870335016E-18  0.651654608810402999E-19 -0.115821846718304011E-18
+  0.728333770199589983E-19 -0.105682434112270003E-16  0.785310085019301966E-19
+ -0.113602115086027998E-18 -0.212083351005076997E-19 -0.204710795319530007E-18
+ -0.245404952064859993E-18 -0.800663773371987988E-17 -0.216121373216464000E-19
+ -0.990499766618607977E-19  0.114815399364617001E-19 -0.876268280833858963E-19
+ -0.244044218853604000E-18 -0.920838732195320949E-17 -0.554290201001359983E-20
+ -0.815213370858219041E-19 -0.323977921931644982E-19  0.244566550410973989E-17
+  0.922736264539544005E-17 -0.225107239567582988E-18  0.146285264029161001E-20
+ -0.131179966443482995E-19  0.227861500072650001E-18  0.104859413987007006E-16
+ -0.325575790958343987E-21 -0.128914520652134002E-20  0.712380445010489047E-17
+ -0.689612758111431019E-19 -0.518476950651943026E-19 -0.987352703162770031E-18
+ -0.651557083079744989E-17  0.618057019281626989E-17 -0.256050918324738993E-22
+ -0.579315073000275023E-20  0.116825754961274992E-20  0.488482179479582027E-17
+ -0.112839648754171999E-20 -0.171478222420645004E-17  0.121003483029333003E-16
+  0.281361349456850020E-19  0.196479900706991004E-18  0.446695067007620992E-17
+  0.296023125814352014E-19 -0.525838715399959994E-19  0.134402804013066001E-16
+ -0.533630756770208973E-19 -0.135525271560687994E-19 -0.307388256558586017E-17
+  0.418095462764723033E-17  0.325938278103455002E-17 -0.804756263134598058E-22
+  0.212102343948397000E-18  0.553747789267498967E-19  0.178660424399612991E-17
+ -0.110855436971906990E-18 -0.115760817367797994E-16 -0.210618278531899992E-19
+ -0.934068891307130031E-19  0.132841484063259997E-18  0.281252988796605007E-18
+ -0.193943299659809005E-18 -0.345377684242941011E-18 -0.582891016608967021E-18
+ -0.584484497340988962E-17 -0.130601892554772000E-18 -0.345854140275771989E-18
+  0.152100316675026008E-18 -0.102914503091396998E-18  0.114150924532318007E-21
+ -0.415920887652381993E-18 -0.147870942203041992E-18 -0.441264460850088028E-18
+ -0.741908217566160965E-18  0.214934610365779012E-19  0.377141419764976985E-18
+  0.129262521706926995E-17 -0.169406589450860008E-19  0.576660030490727979E-17
+  0.296461531539004998E-19  0.508219768352580011E-18  0.572594272343907029E-18
+  0.357553782859722012E-18 -0.169662498452868990E-21 -0.162630325872826002E-18
+ -0.745388993583784058E-19 -0.197866896478604983E-17  0.162630325872826002E-18
+ -0.582335151237331002E-19  0.208708918203459989E-17 -0.508219768352580023E-19
+  0.100691041604855001E-18 -0.138156367653096994E-17  0.225522522206456984E-18
+  0.317637355220363012E-18  0.468614360623540998E-19 -0.230392961653169981E-18
+ -0.332036915323686022E-18 -0.677626357803439970E-20  0.210729224131558998E-18
+  0.668009886874144024E-17 -0.181651509494605009E-17 -0.511607900141597020E-17
+  0.677626357803439970E-20  0.169406589450859991E-17 -0.870194004405784926E-21
+ -0.119153712877039012E-18 -0.291379333855478980E-18  0.481961746987696989E-18
+  0.290320542671411008E-18  0.758687410855676968E-17 -0.385399991000707015E-19
+ -0.444692297308507979E-20  0.647980204649540000E-19  0.108234928791338999E-18
+  0.201157090083095001E-18 -0.197605409177339990E-18  0.198550747370517010E-19
+ -0.101671722281198999E-18  0.310643151866217981E-18 -0.125183016637982994E-16
+ -0.145837243138382015E-19  0.171651875903994001E-19 -0.615441504747079020E-17
+  0.123241691163063998E-18 -0.110059088224217001E-18 -0.109498990014289995E-17
+ -0.483878427864559008E-19  0.347403720981722015E-18 -0.340587449003903991E-18
+  0.741306893449987060E-17 -0.363633250169256998E-19  0.364955640550089989E-18
+  0.328622055261124998E-20  0.494646450528802992E-17  0.588947426258927007E-19
+  0.299974477127637992E-18  0.552209517671485973E-18  0.121661835129705993E-16
+ -0.100268920998511004E-18  0.115831755537026005E-18 -0.310225816931887018E-18
+  0.122421671866663997E-16  0.579370535921941047E-18 -0.727770708280895005E-17
+ -0.135525271560687994E-19  0.698851812321550961E-19  0.225434841061526989E-18
+ -0.346550460915418993E-17  0.144465439621161009E-18 -0.453050130217742988E-18
+  0.304348202371331014E-17  0.166441974135469993E-18 -0.925383494875323000E-19
+ -0.872359232377204073E-17 -0.423516473627150019E-20  0.172794721239877004E-18
+ -0.105032085459533001E-18 -0.633919457725117989E-17  0.457397791517322003E-19
+  0.704731412115577978E-18 -0.674238226014422974E-18 -0.404116110412576017E-17
+ -0.362753453114926982E-18 -0.475570224499606972E-17 -0.176475368004182995E-18
+  0.178340140066433007E-19 -0.163225895913863989E-18  0.482835275563946972E-19
+  0.399079748874743998E-20  0.367605681663466019E-18  0.163914110183507994E-19
+ -0.579031722743040020E-17 -0.228529489169210000E-17  0.277911509994136018E-17
+  0.156800809279949005E-22 -0.101942532784422997E-16 -0.200111533788828004E-19
+  0.681861522539712014E-19 -0.438868945796134002E-19 -0.998186762513412040E-19
+ -0.723376683507983022E-19  0.104239077746036003E-16 -0.222340978775424991E-19
+ -0.146386980184122005E-18  0.409358532980754034E-17  0.223196490323958014E-18
+  0.222899946074357995E-18  0.351381030256605971E-17 -0.227693044133796995E-17
+  0.248805340344110007E-17  0.166759611490690003E-20  0.826704156520197035E-18
+  0.189735380184963011E-18 -0.925912890467357058E-19  0.229545928705915007E-18
+  0.121727104849915995E-16 -0.743059652978834972E-17 -0.681480357713447030E-17
+  0.102237042169716995E-18  0.239339747158542999E-18  0.388946941467333989E-18
+  0.338813178901719991E-18 -0.921571846612678961E-18  0.655264687995926973E-17
+  0.770799982001412947E-19  0.954817889792410057E-18  0.202589700059119992E-19
+ -0.249620609555842002E-17 -0.114772964352958009E-18 -0.135525271560688006E-18
+  0.338813178901720015E-19 -0.451045044412914991E-19  0.101643953670516005E-18
+  0.627651413915437033E-18  0.282061971435682013E-18  0.234119906621088999E-17
+  0.834327453045485954E-19  0.216840434497100983E-18  0.237169225231203999E-18
+ -0.646794358523383970E-17 -0.677626357803440031E-19  0.243521972335611010E-18
+ -0.633315946750199974E-18  0.176182853028893989E-18  0.211758236813574988E-18
+ -0.549982711273340999E-17 -0.421382929258131981E-19 -0.755877126763073964E-19
+ -0.927923781985524985E-20  0.544312695782081001E-18  0.417435145161237000E-19
+ -0.461341442457651995E-17  0.536108318836156013E-19  0.197084421782326988E-18
+ -0.190691339550880991E-18 -0.193154897758039006E-18 -0.127537878015355010E-18
+  0.298736053454965990E-18  0.127528573266872008E-18  0.609637342641941969E-17
+  0.396854887229514001E-20 -0.817053422952606007E-17 -0.752305314909567013E-21
+ -0.193634152214976995E-19  0.401511567862913983E-22  0.140692652691434998E-18
+  0.125895048470979998E-19 -0.964089261394241035E-19  0.637270598203130045E-19
+  0.357904713618655020E-17 -0.192415561086064008E-18 -0.335339833979068015E-18
+ -0.144439750170145000E-20  0.293628202065289017E-18 -0.755801452661721952E-19
+  0.339411342868876015E-17 -0.400752487403498007E-20  0.763312882702686022E-19
+  0.247337834049500987E-18  0.791484376388382995E-19  0.117214281504105993E-18
+  0.710182978617857932E-17  0.342936365526693036E-20  0.555760486582942945E-19
+ -0.781423606007173046E-21  0.298447611490521017E-17 -0.609211529548263984E-17
+  0.371005047806836004E-19  0.187569128895034009E-18 -0.189293009203637990E-18
+ -0.886637171818198977E-19  0.294046943769591991E-17  0.259566158502791993E-21
+  0.291867570968205015E-20  0.719829617311043049E-17  0.113028268052886998E-19
+  0.773855520595252012E-19 -0.378454053841608976E-17 -0.480794137786766970E-17
+ -0.101370917403036996E-16  0.115757992860620994E-21 -0.592809693679449974E-20
+  0.167889785464364989E-18  0.611388863593793994E-17  0.327757493780598990E-21
+  0.452478537105359005E-17  0.863084305453742042E-17  0.186191808557267986E-17
+  0.626132329032225955E-19  0.646801472276652056E-19  0.138210713419341001E-18
+  0.227784990428761008E-19 -0.145303063388947988E-19  0.114819286467261003E-18
+ -0.105892353296588006E-18 -0.137844024253797010E-17  0.171947688292622993E-18
+  0.230392961653169981E-18 -0.409752188234268030E-19 -0.101232067356301001E-16
+  0.159844381569747008E-19 -0.165171424714588999E-18  0.324837135272023987E-18
+  0.247031279622018984E-18  0.105941867811068004E-18  0.165709810725904997E-17
+  0.250228339151884989E-19  0.121222449583768003E-18  0.220343942471932981E-17
+ -0.175540960873553008E-18 -0.443011466303800022E-18 -0.485540461189846004E-17
+ -0.599881968135295963E-17 -0.920317179059558028E-17 -0.319285512590875016E-19
+ -0.109833041734791000E-19 -0.160489582447537996E-18  0.197370257238828010E-18
+  0.281536711746710985E-18  0.502518425980557979E-18  0.685177358743191994E-17
+  0.732594660177182060E-17 -0.814276594997199962E-19 -0.277826806699410993E-18
+ -0.214934610365779000E-18  0.267620059684996000E-17 -0.444713473132189036E-17
+ -0.670701863459635981E-17  0.264697796016969006E-19  0.585130359963270982E-17
+ -0.372694496791892029E-19  0.801293168102568024E-18  0.410641572828885035E-17
+  0.502798757490152989E-17  0.178988649666674003E-18  0.343835745465758998E-14
+ -0.367193526019243979E-14  0.124117838402881993E-13 -0.668735222362701016E-14
+  0.823316024731179930E-17  0.113502414932076005E-18  0.248604170019137005E-18
+ -0.680961550033253983E-18 -0.504233419544565038E-17 -0.217204263911860001E-19
+ -0.210064170919066003E-18 -0.277826806699410993E-18  0.184997289636259991E-18
+  0.124672661923991994E-18  0.142419093985962009E-17  0.318871176989522014E-22
+  0.362796009892000020E-20  0.475428702862992037E-20  0.114993378999595012E-18
+  0.296998248186471012E-18  0.137377009476437004E-17  0.107906108735239008E-20
+  0.344697057730012976E-18  0.233888414833785013E-18 -0.114499755441505997E-18
+  0.343701109734176990E-18  0.239648173231097987E-18  0.662407103109341977E-18
+ -0.377078204440514013E-17 -0.211556892294861001E-20 -0.821331087183475025E-17
+  0.141328819223921000E-20 -0.122986833084708995E-19 -0.578785195122303022E-22
+  0.117628290555894005E-18 -0.482852030302985980E-19  0.152715824767208010E-18
+  0.191404425117076001E-18 -0.265067067422130010E-17 -0.664820294833819956E-19
+  0.731756791723990014E-19  0.711274425411596942E-19 -0.103554248939230999E-18
+  0.528365257801646029E-18  0.607076619005779013E-17 -0.156159844440068998E-18
+  0.131687040427342994E-19 -0.227879910615741999E-18  0.316422725621148005E-18
+  0.120637954277358989E-18  0.119435717232415004E-16  0.515901036912107984E-20
+  0.767837756490412976E-19 -0.195591700332982007E-18  0.102080243701931009E-17
+ -0.582123393000517985E-17 -0.113091510730573003E-19 -0.737094946711110960E-24
+  0.812485213421242952E-19  0.293564361805019011E-18 -0.257901768391306992E-17
+  0.755454438035129986E-21 -0.102984835529547994E-20 -0.299206409188393008E-17
+  0.118414324366844011E-18  0.106238035287570000E-18  0.865029155483104026E-17
+ -0.605451096465977991E-17 -0.908673286122591033E-17 -0.245986784410216017E-21
+ -0.231756677129184016E-18 -0.175645901796425989E-18  0.582748079799118032E-17
+  0.756879467374077992E-22  0.697679309046806012E-17  0.378114738427846991E-17
+  0.185184588830498988E-18  0.240027296595149010E-19  0.612515214688381968E-17
+  0.116178223514405005E-20  0.107214617423943998E-19 -0.748335452127508986E-17
+  0.186841398172223001E-18 -0.219407572781849989E-18 -0.827097614197910050E-17
+  0.295813021938763978E-18  0.167050779066308997E-18  0.000000000000000000E+00
+ -0.185963535048316991E-18 -0.266565349779309023E-18  0.395342870760552980E-17
+  0.710830601573883961E-22  0.714369058868556026E-17  0.340525277702111998E-22
+ -0.382282094159552979E-20  0.149830506514481011E-21  0.883994619572126023E-19
+ -0.758302221231688993E-19  0.109190432258762008E-18 -0.703507417453546983E-20
+ -0.569691861010580966E-17 -0.419320786566922027E-19  0.150383337888478994E-18
+  0.240575431966751987E-21  0.280064166101323972E-19  0.105607927978014003E-18
+  0.524266083943383965E-17  0.504615464422202999E-23  0.269899549811437999E-18
+ -0.169841572392483002E-18  0.110246530172922989E-18  0.595056711102719968E-19
+  0.830106334405030930E-17  0.722278172990987047E-22 -0.260802680060638985E-19
+ -0.210156414482063007E-19 -0.129291638464487995E-17  0.652706764278169958E-22
+  0.409368376430043030E-17  0.000000000000000000E+00  0.231210476845850008E-18
+  0.729891645869777022E-19  0.462908799630395012E-17 -0.770274702091042971E-23
+ -0.714684049245816005E-21  0.462959092211639007E-17 -0.138590428094594993E-19
+ -0.208516916658589005E-19 -0.129291638464487995E-17 -0.149795744801019001E-21
+ -0.553218393675464984E-20 -0.532042569994107024E-20  0.409368376430043030E-17
+  0.000000000000000000E+00 -0.494984878551731986E-20 -0.489690922631392007E-20
+ -0.739443137510889016E-17  0.111000775673736002E-16  0.222679088850805999E-18
+  0.106845003637134005E-18 -0.490050851211016962E-17  0.293770518732017028E-19
+ -0.802142049281503955E-19 -0.297911234386516011E-17  0.270478431405522001E-18
+  0.169875903679790995E-18 -0.867533249122434064E-17 -0.585861028395123963E-17
+  0.499436149037106027E-17 -0.308114995328776013E-22 -0.130599392574966007E-14
+  0.130515766480161008E-14  0.769254928189150031E-19 -0.103944874968919999E-18
+ -0.878111951636581074E-17 -0.350432956441832991E-22  0.633450637144740049E-19
+ -0.820001218346395020E-19 -0.398999557912142013E-21  0.148399850494123999E-19
+  0.909624493601297957E-17  0.302645293357789021E-21 -0.219210703865852003E-18
+  0.308119015134121004E-18 -0.275701318827977019E-18 -0.156380108301380001E-18
+ -0.956776784104850991E-17 -0.220370776999360005E-18  0.316081995140841987E-18
+  0.198833102388086990E-19  0.717056862337457936E-17  0.855578145811417952E-19
+ -0.993211691437939944E-19  0.833220302850867983E-21 -0.217506104734504998E-19
+ -0.778452855386439975E-19  0.409724525050444025E-17 -0.504842952642287961E-21
+ -0.346727880643630006E-18 -0.411653532459296998E-18  0.396489180937604980E-17
+ -0.136437880709918003E-23 -0.124957018075439001E-16  0.302207630763774979E-21
+  0.274303192419421975E-19  0.735547307649483054E-19  0.845852113709716048E-22
+ -0.134921032453032995E-18 -0.229587702135863011E-18 -0.161215647846060990E-19
+ -0.410673872542583016E-17  0.980567686783852952E-19 -0.139877417969035991E-18
+ -0.665224868865051959E-21  0.287046018190561014E-19 -0.154866630141438997E-19
+  0.223870818551690020E-18  0.124778581831986997E-18  0.153250017406549997E-17
+ -0.448632604218867018E-19  0.711507675693612044E-19 -0.677626357803439970E-20
+  0.107945878798088004E-16 -0.300431998479260013E-19  0.433680868994201966E-18
+  0.118313562072480995E-16 -0.637646402693036987E-17  0.271050543121375988E-19
+ -0.101643953670516005E-18  0.677626357803439970E-20  0.813151629364127964E-19
+  0.279520872593919016E-18 -0.745388993583784058E-19 -0.100849860282464999E-19
+ -0.592051421507532003E-19  0.780283607724334012E-19 -0.664858447910062962E-19
+  0.437413849068639009E-19 -0.516544150489818993E-17  0.503227866641761002E-20
+ -0.301136200599181013E-19  0.266177796878770016E-17  0.102904964664959001E-18
+  0.312088382102112999E-18 -0.297903979430933015E-17  0.202876384986286988E-20
+  0.543951385875457012E-19 -0.239374042842221994E-18  0.418443806615245012E-17
+ -0.562255471114576966E-19 -0.295243281097027992E-18 -0.799691946769469941E-19
+ -0.852375263513670959E-17  0.140981303182299992E-19 -0.492761908203678022E-18
+  0.254715677652711988E-18 -0.117430723578436001E-16 -0.309172195626648020E-19
+  0.796210970419041958E-19  0.158818677610181001E-18  0.208899500616592014E-17
+ -0.443506451182351966E-17 -0.105032085459533001E-18  0.115196480826584991E-18
+  0.152282296409787000E-18 -0.133506123685445993E-18  0.772670733674762928E-17
+  0.347283508374263019E-19 -0.532836663382158006E-19  0.582929066917145019E-17
+  0.359989002583077981E-18 -0.155854062294790998E-18 -0.347495266611077003E-18
+  0.220906192643921991E-17  0.498055372985528985E-17 -0.167957369017666995E-18
+  0.338813178901720015E-19 -0.116890546721092989E-18 -0.293751026107791011E-17
+  0.733609941661028944E-19  0.100944390482868002E-16 -0.267894332096603011E-19
+  0.216840434497100983E-18  0.135525271560687994E-19  0.151788304147971006E-17
+ -0.247227741479849002E-19 -0.138913403349704991E-18  0.379470760369926985E-18
+  0.643745039913267969E-18  0.203287907341031991E-19  0.304931861011548002E-19
+ -0.182959116606928994E-18  0.347283508374262995E-18  0.196776341559015009E-18
+ -0.427751638363421972E-19 -0.163649412387490995E-19  0.690331852012255017E-19
+ -0.531407295283666964E-18 -0.422299341979263986E-18 -0.157579974642883004E-19
+  0.192321973960417003E-18  0.223317258693390997E-18  0.423311746425542994E-19
+  0.683894318895060959E-19  0.540270370111050010E-17  0.298115892764110993E-19
+ -0.196617522881404009E-18 -0.811034046995993056E-19  0.129860738725924993E-18
+  0.422987078035115991E-19 -0.456513700878625969E-17 -0.224993126614424009E-19
+ -0.459581548334461984E-18 -0.133004025053625993E-18  0.735806933367970026E-18
+  0.405093507024368993E-18  0.194632289411277007E-18 -0.352809074866117003E-19
+ -0.232722302258119018E-18 -0.601393392550553000E-19 -0.627863172152250006E-19
+ -0.931736241979730072E-20 -0.765971894202063951E-17 -0.399121924746226012E-17
+  0.190077832958559989E-18  0.179066508041831014E-19 -0.574288338238416015E-17
+  0.880914265144471992E-19  0.182959116606928994E-18  0.153140909885616995E-18
+  0.203287907341031991E-19  0.325260651745650993E-18  0.379470760369926985E-18
+ -0.207523072077303992E-18 -0.361852475067037028E-17  0.301543729222530998E-17
+  0.394039727062700980E-17  0.350989277518501004E-19  0.393023287525995020E-18
+  0.272744609015884998E-18 -0.357447903741314974E-17  0.155423101195651015E-19
+ -0.123801276779504007E-16  0.770270586409379010E-19 -0.443845264361252992E-18
+ -0.250721752387272997E-18 -0.393700913883799029E-17 -0.601128694754536031E-19
+  0.103676832743925996E-17 -0.806375365786094077E-17  0.270372916763573015E-17
+  0.271050543121375988E-19 -0.168051336735252995E-17 -0.514996031930614991E-18
+ -0.155854062294790998E-18  0.172794721239877004E-18 -0.387517573368842020E-18
+ -0.207523072077304004E-19  0.390799826039453012E-18 -0.925342135844694988E-19
+  0.106281541774795000E-18  0.967048582329619004E-19 -0.834571884916494946E-17
+ -0.954318272702428028E-19  0.207248448113936002E-18  0.517443990235404036E-17
+ -0.244696568805436984E-18 -0.116783013241462011E-18 -0.578651881405754976E-17
+ -0.731624708190902038E-19 -0.335583865790313019E-18 -0.197623374506268996E-18
+ -0.163191485200382005E-17  0.101432195433701996E-18  0.148336644887909010E-18
+ -0.173324116831910990E-18  0.539199998398406007E-17 -0.169406589450860008E-19
+ -0.844915364886164949E-19 -0.416303458685688009E-19  0.233338717250593997E-17
+  0.273292202581394994E-19 -0.728448334638698051E-19 -0.525160427297666030E-19
+ -0.110323923797504006E-16 -0.193123511973980015E-17 -0.372694496791892029E-19
+  0.115196480826584991E-18  0.242873462735370015E-18  0.910537153843644961E-20
+ -0.105840274005222006E-16 -0.469176843440076996E-20  0.193136746863780989E-18
+ -0.343708102894564009E-17 -0.209217137971811991E-18  0.414199111207353010E-18
+  0.394293836946877000E-18  0.304931861011548002E-19 -0.304931861011547978E-18
+  0.611915129942227994E-19  0.237169225231204005E-19 -0.299849663328021983E-18
+  0.207692478666753981E-17 -0.319490239792480988E-19 -0.133513568310958995E-17
+ -0.374812079160028020E-19 -0.406575814682063982E-19  0.406575814682063982E-19
+ -0.420128341838133016E-18  0.463088794131687014E-19 -0.138913403349704991E-18
+  0.127393755267047002E-17 -0.117906986257798992E-17  0.169406589450859993E-20
+ -0.332036915323686022E-18 -0.227004829864152009E-18 -0.559041745187837983E-19
+ -0.709390093325477015E-19 -0.947618109740748986E-19  0.242489650931144976E-18
+  0.359989002583078005E-19 -0.180312138646758992E-18 -0.229976452627177011E-17
+ -0.157796657189281015E-19  0.349872583691553995E-19  0.723435620126626944E-19
+ -0.457635088955547972E-19 -0.933145550948367955E-19 -0.123819011531837992E-17
+ -0.782909906169189995E-19 -0.184229666027809993E-19 -0.157336369952485999E-18
+  0.346171777630992014E-18  0.105217373916745000E-19 -0.111834818817168995E-17
+ -0.155642304057977988E-19 -0.955956090315283023E-19 -0.117479499317231010E-18
+ -0.362238933849222033E-17  0.115699406639016991E-18 -0.613040095575299991E-19
+  0.294757539477146002E-19  0.720825038113409974E-18  0.197676314065472005E-18
+  0.436963121664811995E-18  0.347865843525499989E-18 -0.652215369385810972E-18
+  0.381842452622238978E-17  0.131349663828520009E-18 -0.153847115333583998E-18
+  0.736833960816516019E-17 -0.262580213648833015E-19 -0.284603070277445010E-18
+ -0.197663079175671012E-19 -0.101643953670516005E-18 -0.298155597433514008E-18
+  0.840256683676266033E-18 -0.957147230397359022E-19 -0.220906192643921991E-17
+  0.447233396150270964E-17  0.315435069557500984E-17  0.371635705607823973E-19
+  0.216840434497100983E-18  0.643745039913267969E-18  0.358125530099118029E-17
+  0.155665051524822986E-19 -0.103592129449201009E-17  0.619922238271740940E-19
+ -0.238439774652085979E-18 -0.407277263841508986E-18 -0.147777239183251985E-16
+ -0.557205404227970984E-19  0.336503690631471986E-18 -0.140650350287169006E-16
+  0.180418017765165996E-18  0.145689666927739995E-18  0.814210420548196008E-18
+ -0.687790753170491972E-18  0.718283939271646976E-18 -0.423516473627150019E-20
+ -0.565295230618739017E-18 -0.740359735459462010E-19 -0.399799551104030001E-18
+  0.152465930505773989E-18 -0.679733946795801001E-17 -0.208960686132527017E-20
+ -0.487083649340625048E-18 -0.251436436436519005E-18 -0.289066536862780991E-19
+  0.571135125743363007E-19 -0.514784273693801031E-18  0.586040920381569047E-19
+ -0.225310763969643986E-18 -0.897854924089557998E-19 -0.374812079160027996E-18
+ -0.124990299279213007E-18 -0.103052145945326007E-17  0.125149117956822995E-18
+ -0.194764638309285997E-18  0.287673564711241993E-18 -0.105034203041900993E-16
+ -0.128113733272213002E-19  0.546336250979023959E-19 -0.391223342513080029E-19
+ -0.282909004382935976E-18 -0.487890977618476995E-18  0.135525271560688006E-18
+  0.948676900924816019E-19 -0.650521303491303046E-18 -0.664073830647370984E-18
+ -0.546505657568474964E-17 -0.508219768352579978E-20 -0.460785923306338999E-17
+ -0.338813178901720015E-19  0.482469966756049011E-17  0.372694496791892029E-19
+  0.184653182501436992E-18  0.528548559086683026E-18  0.178529134292789006E-16
+ -0.885043550762337067E-17  0.203626720519933994E-17  0.109267250195805009E-18
+ -0.243521972335611010E-18 -0.253712837482265010E-19 -0.291379333855478980E-18
+ -0.145689666927739995E-18 -0.109097843606353999E-17  0.118584612615601999E-18
+  0.419895407777638036E-17 -0.436221967835964976E-19  0.166770584104119993E-18
+  0.301564784753128000E-18  0.105479300460810000E-16  0.153246012714406002E-21
+ -0.721442795365178040E-20  0.785191282370076000E-17  0.611660841639171959E-19
+ -0.269408879070079001E-19  0.980216235171613067E-20 -0.642700149301778963E-17
+ -0.852508928293581066E-17  0.697544054984669996E-24  0.250693589472354980E-18
+  0.114838828609231990E-18  0.137684826882562005E-19 -0.881975394890917019E-19
+  0.988901378360807009E-17 -0.584661507095537965E-20 -0.378455001863782989E-18
+  0.568235859210977989E-20 -0.215342517844311983E-20 -0.145015040856607996E-18
+ -0.469036545974539019E-17 -0.514100323766396972E-20 -0.305076832491833999E-18
+ -0.196493615426171004E-18  0.205358492279442992E-18  0.117772935177185998E-18
+  0.205010377842294006E-17  0.202435071204792989E-19 -0.619429451884161010E-19
+  0.515229817082411962E-18  0.798642516943490968E-17 -0.375603283878283021E-18
+ -0.257011107083719992E-18 -0.135643988942254000E-20 -0.402490229080583990E-19
+ -0.873766277585149009E-19  0.487430530707361009E-17  0.786805309075646029E-21
+  0.927211481210010019E-19 -0.946732143119615016E-19 -0.966803394206032053E-17
+  0.210728270163397999E-24 -0.182379220144080012E-17  0.301144665518455994E-21
+  0.598805976239300004E-20  0.413513361367576011E-19  0.379595377543990018E-21
+ -0.148122463346125989E-18 -0.690407715361336988E-19 -0.320930310026529998E-19
+ -0.913936660975153941E-17  0.247663640323067001E-18  0.222959731044695013E-18
+ -0.869297579538561051E-22  0.634741170484694975E-20 -0.391093459440610985E-21
+ -0.235888633395294979E-18 -0.133873622544535990E-18 -0.597176764414821015E-17
+ -0.972390876617005020E-20
diff --git a/FORTRAN/CMakeLists.txt b/FORTRAN/CMakeLists.txt
index 6d0a0fd8..95f37002 100644
--- a/FORTRAN/CMakeLists.txt
+++ b/FORTRAN/CMakeLists.txt
@@ -3,9 +3,9 @@ include_directories(${SuperLU_DIST_SOURCE_DIR}/SRC)
 include_directories(${SuperLU_DIST_BINARY_DIR}/FORTRAN)
 
 set(headers
-    ${CMAKE_INSTALL_PREFIX}/FORTRAN/superlu_mod.mod
-    ${CMAKE_INSTALL_PREFIX}/FORTRAN/superlupara_mod.mod
-    ${CMAKE_INSTALL_PREFIX}/FORTRAN/superlu_dist_config.fh
+    ${CMAKE_BINARY_DIR}/FORTRAN/superlu_mod.mod
+    ${CMAKE_BINARY_DIR}/FORTRAN/superlupara_mod.mod
+    ${CMAKE_BINARY_DIR}/FORTRAN/superlu_dist_config.fh
     )
 
 set(sources "superlu_c2f_wrap.c")  # initialize precision-independent file
@@ -56,37 +56,37 @@ if (NOT MSVC)
 endif ()
 
 
-set(F_MOD superlupara.f90 superlu_mod.f90)
+add_library(ftestmod STATIC superlupara.f90 superlu_mod.f90)
 
 if(enable_double)
   set(F_DEXM ${F_MOD} f_pddrive.F90)
   add_executable(f_pddrive ${F_DEXM})
-  target_link_libraries(f_pddrive ${all_link_libs})
+  target_link_libraries(f_pddrive ftestmod ${all_link_libs})
 #  set_target_properties(f_pddrive PROPERTIES LINKER_LANGUAGE Fortran)
   set_target_properties(f_pddrive PROPERTIES LINKER_LANGUAGE CXX LINK_FLAGS "${MPI_Fortran_LINK_FLAGS}")
   
-  set(F_DEXM3D ${F_MOD} f_pddrive3d.F90)
+  set(F_DEXM3D f_pddrive3d.F90)
   add_executable(f_pddrive3d ${F_DEXM3D})
-  target_link_libraries(f_pddrive3d ${all_link_libs})
+  target_link_libraries(f_pddrive3d ftestmod ${all_link_libs})
   set_target_properties(f_pddrive3d PROPERTIES LINKER_LANGUAGE CXX LINK_FLAGS "${MPI_Fortran_LINK_FLAGS}")
   
-  set(F_5x5 ${F_MOD} f_5x5.F90 sp_ienv.c)
+  set(F_5x5 f_5x5.F90 sp_ienv.c)
   add_executable(f_5x5 ${F_5x5})
-  target_link_libraries(f_5x5 ${all_link_libs})
+  target_link_libraries(f_5x5 ftestmod ${all_link_libs})
   set_target_properties(f_5x5 PROPERTIES LINKER_LANGUAGE CXX LINK_FLAGS "${MPI_Fortran_LINK_FLAGS}")
   
 endif()
 
 if(enable_complex16)
-  set(F_ZEXM ${F_MOD} f_pzdrive.F90)
+  set(F_ZEXM f_pzdrive.F90)
   add_executable(f_pzdrive ${F_ZEXM})
-  target_link_libraries(f_pzdrive ${all_link_libs})
+  target_link_libraries(f_pzdrive ftestmod ${all_link_libs})
 #  set_target_properties(f_pzdrive PROPERTIES LINKER_LANGUAGE Fortran)
   set_target_properties(f_pzdrive PROPERTIES LINKER_LANGUAGE CXX LINK_FLAGS "${MPI_Fortran_LINK_FLAGS}")
 
-  set(F_ZEXM3D ${F_MOD} f_pzdrive3d.F90)
+  set(F_ZEXM3D f_pzdrive3d.F90)
   add_executable(f_pzdrive3d ${F_ZEXM3D})
-  target_link_libraries(f_pzdrive3d ${all_link_libs})
+  target_link_libraries(f_pzdrive3d ftestmod ${all_link_libs})
   set_target_properties(f_pzdrive3d PROPERTIES LINKER_LANGUAGE CXX LINK_FLAGS "${MPI_Fortran_LINK_FLAGS}")
   
 endif()
diff --git a/SRC/sp_ienv.c b/SRC/sp_ienv.c
index 30c9ce1f..f4014dd2 100644
--- a/SRC/sp_ienv.c
+++ b/SRC/sp_ienv.c
@@ -53,8 +53,10 @@ at the top-level directory.
 	         of L and U, compared with A;
 	    = 7: the minimum value of the product M*N*K for a GEMM call
 	         to be off-loaded to accelerator (e.g., GPU, Xeon Phi).
-            = 8: the maximum buffer size on GPU that can hold the three
-	         matrices in the GEMM call for the Schur complement update.
+            = 8: the maximum buffer size on GPU that can hold the "dC"
+	         matrix in the GEMM call for the Schur complement update.
+		 If this is too small, the Schur complement update will be
+		 done in multiple partitions, which may be slower.
 	    
    (SP_IENV_DIST) (output) int
             >= 0: the value of the parameter specified by ISPEC   
@@ -114,7 +116,7 @@ sp_ienv_dist(int ispec)
         case 8:
   	    ttemp = getenv ("MAX_BUFFER_SIZE");
 	    if (ttemp) return atoi (ttemp);
-	    else return 64000000; // 8000^2
+	    else return 256000000; // 16000^2
     }
 
     /* Invalid value for ISPEC */

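To make the ISPEC=8 semantics above concrete, here is a minimal C sketch of how a
caller might partition the Schur-complement GEMM so that the "dC" output never
exceeds the cap returned by sp_ienv_dist(8). The helper name
schur_update_partitioned and the column-chunking scheme are illustrative
assumptions, not SuperLU_DIST's actual implementation; only the sp_ienv_dist(8)
interface and the MAX_BUFFER_SIZE environment variable come from the hunk above.

    /* Sketch only: partition dC (m x n) into column chunks whose element
     * count fits the GPU buffer cap returned by sp_ienv_dist(8), which
     * honors the MAX_BUFFER_SIZE environment variable. */
    extern int sp_ienv_dist(int ispec);

    void schur_update_partitioned(int m, int n, int k)
    {
        int max_buf = sp_ienv_dist(8);   /* default 256000000 = 16000^2 */
        int ncols = max_buf / m;         /* columns of dC per GEMM call */
        if (ncols < 1) ncols = 1;
        for (int j0 = 0; j0 < n; j0 += ncols) {
            int nb = (n - j0 < ncols) ? (n - j0) : ncols;
            /* ... launch one GEMM producing columns [j0, j0+nb) of dC ... */
            (void)nb; (void)k;
        }
    }

When MAX_BUFFER_SIZE is small, the loop above runs more iterations, which is the
"multiple partitions, which may be slower" case noted in the doc comment.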
From 1b330fa9dee1730d50a368367527b59752b58b95 Mon Sep 17 00:00:00 2001
From: Yang Liu 
Date: Wed, 2 Jun 2021 02:44:05 -0400
Subject: [PATCH 096/147] bump minimum required cmake version to 3.18.1

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index df9798e0..5d25ac95 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,7 +5,7 @@
 ######################################################################
 
 # Required version
-cmake_minimum_required(VERSION 2.8.12 FATAL_ERROR)
+cmake_minimum_required(VERSION 3.18.1 FATAL_ERROR)
 
 # Project version numbers
 #project(SuperLU_DIST C CXX CUDA)

From eef5a954ae502407361656b4c4079906532bccdc Mon Sep 17 00:00:00 2001
From: Yang Liu 
Date: Wed, 2 Jun 2021 03:09:59 -0400
Subject: [PATCH 097/147] minor change in CMakeLists.txt

---
 CMakeLists.txt | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5d25ac95..d5b49650 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -191,8 +191,14 @@ if (TPL_ENABLE_CUDALIB)   ## want to use cuda
 #     find_package(CUB REQUIRED)
 
     find_package(CUDAToolkit REQUIRED)
+    message("-- CUDAToolkit_LIBRARY_ROOT='${CUDAToolkit_LIBRARY_ROOT}'")
+    if (NOT "${CUDAToolkit_LIBRARY_ROOT}" STREQUAL "")
     set(CUDA_LIBRARIES "${CUDAToolkit_LIBRARY_ROOT}/lib64/libcudart.so")
     set(CUDA_CUBLAS_LIBRARIES "${CUDAToolkit_LIBRARY_ROOT}/lib64/libcublas.so")
+    else()
+    message("-- CUDAToolkit_LIBRARY_ROOT empty, not setting CUDA_LIBRARIES")
+    endif()
+
 
 #     # The following make.inc exporting does not work
 #     set(CUDA_LIB CUDA::cudart CUDA::cublas CUDA::cusolver)

From 8cb8c0a737f7e4ceef7921ca4d8a72f2e1a2cb04 Mon Sep 17 00:00:00 2001
From: Yang Liu 
Date: Wed, 2 Jun 2021 09:33:44 -0400
Subject: [PATCH 098/147] added some traverse scripts

---
 .../batch_script_mpi_runit_traverse_sml.sh    | 18 ++++++
 ...ke_build_traverse_pgi20_openmp_06_01_21.sh | 64 +++++++++++++++++++
 2 files changed, 82 insertions(+)
 create mode 100644 example_scripts/batch_script_mpi_runit_traverse_sml.sh
 create mode 100644 example_scripts/run_cmake_build_traverse_pgi20_openmp_06_01_21.sh

diff --git a/example_scripts/batch_script_mpi_runit_traverse_sml.sh b/example_scripts/batch_script_mpi_runit_traverse_sml.sh
new file mode 100644
index 00000000..6bae66bf
--- /dev/null
+++ b/example_scripts/batch_script_mpi_runit_traverse_sml.sh
@@ -0,0 +1,18 @@
+#!/bin/sh
+
+#SBATCH --qos=test
+#SBATCH -N 1
+#SBATCH -t 00:30:00
+#SBATCH -J superlu_test
+#SBATCH --mail-user=liuyangzhuan@lbl.gov
+#SBATCH --gpus=1
+
+
+module purge
+export ALLINEA_FORCE_CUDA_VERSION=20.0.1
+module load cudatoolkit/11.2 pgi/20.4 openmpi/pgi-20.4/4.0.4/64
+module load hdf5/pgi-20.4/openmpi-4.0.4/1.10.6 fftw/gcc/openmpi-4.0.4/3.3.8 anaconda ddt
+module load cmake
+
+#srun -n 1 ./EXAMPLE/pddrive -r 1 -c 1 ../EXAMPLE/big.rua
+srun -n 1 ./EXAMPLE/pddrive -r 1 -c 1 ../../matrix/HTS/copter2.mtx
diff --git a/example_scripts/run_cmake_build_traverse_pgi20_openmp_06_01_21.sh b/example_scripts/run_cmake_build_traverse_pgi20_openmp_06_01_21.sh
new file mode 100644
index 00000000..65a1d46f
--- /dev/null
+++ b/example_scripts/run_cmake_build_traverse_pgi20_openmp_06_01_21.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+# Bash script to submit many jobs to the Cori/Edison queue
+
+
+module purge
+export ALLINEA_FORCE_CUDA_VERSION=20.0.1
+module load cudatoolkit/11.2 pgi/20.4 openmpi/pgi-20.4/4.0.4/64 
+module load hdf5/pgi-20.4/openmpi-4.0.4/1.10.6 fftw/gcc/openmpi-4.0.4/3.3.8 anaconda ddt 
+module load cmake
+
+
+
+export PARMETIS_ROOT=~/petsc_master/traverse-pgi-openmpi-199-gpucuda-branch/
+
+export CUDA_ROOT=/usr/local/cuda-11.2
+export CUDA_PATH=${CUDA_ROOT}
+rm -rf CMakeCache.txt
+rm -rf CMakeFiles
+rm -rf CTestTestfile.cmake
+rm -rf cmake_install.cmake
+rm -rf DartConfiguration.tcl 
+
+cmake .. \
+	-DTPL_PARMETIS_INCLUDE_DIRS="${PARMETIS_ROOT}/include" \
+	-DTPL_PARMETIS_LIBRARIES="${PARMETIS_ROOT}/lib/libparmetis.a;${PARMETIS_ROOT}/lib/libmetis.a" \
+	-DCUDAToolkit_LIBRARY_ROOT="${CUDA_ROOT}" \
+	-DBUILD_SHARED_LIBS=OFF \
+	-DCMAKE_C_COMPILER=mpicc \
+	-DTPL_ENABLE_CUDALIB=TRUE \
+	-DTPL_ENABLE_LAPACKLIB=TRUE \
+	-Denable_openmp:BOOL=TRUE \
+	-DCMAKE_CXX_COMPILER=mpiCC \
+	-DCMAKE_INSTALL_PREFIX=. \
+	-DCMAKE_BUILD_TYPE=Release \
+	-DCMAKE_VERBOSE_MAKEFILE:BOOL=ON \
+        -DTPL_BLAS_LIBRARIES="${PARMETIS_ROOT}/lib/libflapack.a;${PARMETIS_ROOT}/lib/libfblas.a" \
+        -DTPL_LAPACK_LIBRARIES="${PARMETIS_ROOT}/lib/libflapack.a;${PARMETIS_ROOT}/lib/libfblas.a" \
+        -DCMAKE_CXX_FLAGS="-DRELEASE -pgf90libs" \
+        -DCMAKE_C_FLAGS="-DPRNTlevel=1 -DPROFlevel=0 -DDEBUGlevel=0 -pgf90libs"
+
+make pddrive
+
+
+
+#salloc -N 1 --qos=test -t 0:30:00 --gpus=2
+
+
+
+
+#        -DCMAKE_CUDA_FLAGS="-DPRNTlevel=1 -DPROFlevel=0 -DDEBUGlevel=0 -DGPU_ACC -gencode arch=compute_70,code=sm_70"
+
+#	-DTPL_BLAS_LIBRARIES="/opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_intel_lp64.so;/opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_sequential.so;/opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_core.so" \
+#        -DCMAKE_CXX_FLAGS="-g -trace -Ofast -std=c++11 -DAdd_ -DRELEASE -tcollect -L$VT_LIB_DIR -lVT $VT_ADD_LIBS" \
+
+
+#	-DTPL_BLAS_LIBRARIES="/opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_lapack95_lp64.a;/opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_blas95_lp64.a"
+
+#	-DTPL_BLAS_LIBRARIES="/opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_intel_lp64.a;/opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_sequential.a;/opt/intel/compilers_and_libraries_2017.2.174/linux/mkl/lib/intel64/libmkl_core.a"  
+
+
+# -DCMAKE_BUILD_TYPE=Release or Debug: compiler options are set in CMakeLists.txt
+
+#        -DCMAKE_C_FLAGS="-g -O0 -std=c99 -DPRNTlevel=2 -DPROFlevel=1 -DDEBUGlevel=0" \
+        #-DCMAKE_C_FLAGS="-std=c11 -DPRNTlevel=1 -DPROFlevel=1 -DDEBUGlevel=0 ${INC_VTUNE}" \

From 6fc64290d17bf70b9448012eb5c54c2121f1d60b Mon Sep 17 00:00:00 2001
From: Yang Liu 
Date: Fri, 4 Jun 2021 18:07:02 -0400
Subject: [PATCH 099/147] temporarily remove Fortran drivers in cmake; this is
 to be restored later

---
 FORTRAN/CMakeLists.txt                           |  8 +++++---
 ...cmake_build_traverse_pgi20_openmp_06_01_21.sh | 16 ++++++++++++----
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/FORTRAN/CMakeLists.txt b/FORTRAN/CMakeLists.txt
index 95f37002..bf402a9c 100644
--- a/FORTRAN/CMakeLists.txt
+++ b/FORTRAN/CMakeLists.txt
@@ -58,11 +58,12 @@ endif ()
 
 add_library(ftestmod STATIC superlupara.f90 superlu_mod.f90)
 
-if(enable_double)
+#if(enable_double)
+if(FALSE)
   set(F_DEXM ${F_MOD} f_pddrive.F90)
   add_executable(f_pddrive ${F_DEXM})
   target_link_libraries(f_pddrive ftestmod ${all_link_libs})
-#  set_target_properties(f_pddrive PROPERTIES LINKER_LANGUAGE Fortran)
+  # set_target_properties(f_pddrive PROPERTIES LINKER_LANGUAGE Fortran CUDA_RESOLVE_DEVICE_SYMBOLS ON)
   set_target_properties(f_pddrive PROPERTIES LINKER_LANGUAGE CXX LINK_FLAGS "${MPI_Fortran_LINK_FLAGS}")
   
   set(F_DEXM3D f_pddrive3d.F90)
@@ -77,7 +78,8 @@ if(enable_double)
   
 endif()
 
-if(enable_complex16)
+# if(enable_complex16)
+if(FALSE)
   set(F_ZEXM f_pzdrive.F90)
   add_executable(f_pzdrive ${F_ZEXM})
   target_link_libraries(f_pzdrive ftestmod ${all_link_libs})
diff --git a/example_scripts/run_cmake_build_traverse_pgi20_openmp_06_01_21.sh b/example_scripts/run_cmake_build_traverse_pgi20_openmp_06_01_21.sh
index 65a1d46f..1189d97b 100644
--- a/example_scripts/run_cmake_build_traverse_pgi20_openmp_06_01_21.sh
+++ b/example_scripts/run_cmake_build_traverse_pgi20_openmp_06_01_21.sh
@@ -6,14 +6,15 @@ module purge
 export ALLINEA_FORCE_CUDA_VERSION=20.0.1
 module load cudatoolkit/11.2 pgi/20.4 openmpi/pgi-20.4/4.0.4/64 
 module load hdf5/pgi-20.4/openmpi-4.0.4/1.10.6 fftw/gcc/openmpi-4.0.4/3.3.8 anaconda ddt 
-module load cmake
-
+#module load cmake
 
 
+export PATH=/home/yl33/cmake-3.20.3/bin/:$PATH
+cmake --version
 export PARMETIS_ROOT=~/petsc_master/traverse-pgi-openmpi-199-gpucuda-branch/
 
 export CUDA_ROOT=/usr/local/cuda-11.2
-export CUDA_PATH=${CUDA_ROOT}
+#export CUDA_PATH=${CUDA_ROOT}
 rm -rf CMakeCache.txt
 rm -rf CMakeFiles
 rm -rf CTestTestfile.cmake
@@ -23,12 +24,15 @@ rm -rf DartConfiguration.tcl
 cmake .. \
 	-DTPL_PARMETIS_INCLUDE_DIRS="${PARMETIS_ROOT}/include" \
 	-DTPL_PARMETIS_LIBRARIES="${PARMETIS_ROOT}/lib/libparmetis.a;${PARMETIS_ROOT}/lib/libmetis.a" \
-	-DCUDAToolkit_LIBRARY_ROOT="${CUDA_ROOT}" \
 	-DBUILD_SHARED_LIBS=OFF \
 	-DCMAKE_C_COMPILER=mpicc \
 	-DTPL_ENABLE_CUDALIB=TRUE \
 	-DTPL_ENABLE_LAPACKLIB=TRUE \
 	-Denable_openmp:BOOL=TRUE \
+	-DCMAKE_CUDA_FLAGS="-ccbin pgc++ -D_PGIC_PRINCETON_OVERRIDE_" \
+	-DCMAKE_CUDA_HOST_COMPILER=mpicc \
+	-DCMAKE_CXX_IMPLICIT_INCLUDE_DIRECTORIES="/usr/local/cuda-11.2/include" \
+	-DCMAKE_INCLUDE_SYSTEM_FLAG_C="-I" \
 	-DCMAKE_CXX_COMPILER=mpiCC \
 	-DCMAKE_INSTALL_PREFIX=. \
 	-DCMAKE_BUILD_TYPE=Release \
@@ -39,8 +43,12 @@ cmake .. \
         -DCMAKE_C_FLAGS="-DPRNTlevel=1 -DPROFlevel=0 -DDEBUGlevel=0 -pgf90libs"
 
 make pddrive
+make install
+
+#	-DXSDK_ENABLE_Fortran=FALSE \ 
 
 
+#	-DCUDAToolkit_LIBRARY_ROOT="${CUDA_ROOT}" \
 
 #salloc -N 1 --qos=test -t 0:30:00 --gpus=2
 

From 23ba27f6a4045f5706e3e2c1934a20e234300de0 Mon Sep 17 00:00:00 2001
From: Yang Liu 
Date: Thu, 10 Jun 2021 17:17:40 -0400
Subject: [PATCH 100/147] minor change

---
 SRC/symbfact.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/SRC/symbfact.c b/SRC/symbfact.c
index 7cfa0bd8..6da8a99a 100644
--- a/SRC/symbfact.c
+++ b/SRC/symbfact.c
@@ -185,6 +185,7 @@ int_t symbfact
 
     if ( !pnum && (options->PrintStat == YES)) {
 	nnzLU = nnzL + nnzU - min_mn;				   
+	printf("\tMatrix size min_mn  " IFMT "\n", min_mn);
 	printf("\tNonzeros in L       " IFMT "\n", nnzL);
 	printf("\tNonzeros in U       " IFMT "\n", nnzU);
 	printf("\tnonzeros in L+U     " IFMT "\n", nnzLU);

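For readers of the added printout: nnzLU is computed as nnzL + nnzU - min_mn
because the min(m,n) diagonal entries are counted in both factors; subtracting
them once gives the number of distinct entries in L+U. A minimal C restatement
(our naming, not the library's):

    /* Illustration only: distinct nonzeros in L+U; the min(m,n) diagonal
     * entries appear in both nnzL and nnzU, so subtract them once. */
    long nnz_lu(long nnzL, long nnzU, long min_mn)
    {
        return nnzL + nnzU - min_mn;
    }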
From 45807bdfbe3a1ffee8791e513350d10133b813af Mon Sep 17 00:00:00 2001
From: Piyush Sao 
Date: Wed, 23 Jun 2021 14:15:18 -0400
Subject: [PATCH 101/147] Adding CUB version 1.8

---
 SRC/cub/agent/agent_histogram.cuh             |  787 +++
 SRC/cub/agent/agent_radix_sort_downsweep.cuh  |  789 +++
 .../agent_radix_sort_upsweep.cuh}             |  250 +-
 SRC/cub/agent/agent_reduce.cuh                |  385 ++
 SRC/cub/agent/agent_reduce_by_key.cuh         |  547 ++
 SRC/cub/agent/agent_rle.cuh                   |  837 ++++
 SRC/cub/agent/agent_scan.cuh                  |  471 ++
 SRC/cub/agent/agent_segment_fixup.cuh         |  375 ++
 SRC/cub/agent/agent_select_if.cuh             |  703 +++
 SRC/cub/agent/agent_spmv_orig.cuh             |  670 +++
 .../single_pass_scan_operators.cuh}           |  397 +-
 SRC/cub/block/block_adjacent_difference.cuh   |  596 +++
 SRC/cub/block/block_discontinuity.cuh         |  683 ++-
 SRC/cub/block/block_exchange.cuh              |  528 +-
 SRC/cub/block/block_histogram.cuh             |   30 +-
 SRC/cub/block/block_load.cuh                  |  529 +-
 SRC/cub/block/block_radix_rank.cuh            |  499 +-
 SRC/cub/block/block_radix_sort.cuh            |  134 +-
 SRC/cub/block/block_raking_layout.cuh         |  301 +-
 SRC/cub/block/block_reduce.cuh                | 1214 ++---
 SRC/cub/block/block_scan.cuh                  | 4444 ++++++++---------
 SRC/cub/block/block_shift.cuh                 |  325 --
 SRC/cub/block/block_shuffle.cuh               |  305 ++
 SRC/cub/block/block_store.cuh                 |  258 +-
 .../block_histogram_atomic.cuh                |    6 +-
 .../specializations/block_histogram_sort.cuh  |   22 +-
 .../specializations/block_reduce_raking.cuh   |   99 +-
 .../block_reduce_raking_commutative_only.cuh  |   29 +-
 .../block_reduce_warp_reductions.cuh          |   52 +-
 .../specializations/block_scan_raking.cuh     |  560 +--
 .../specializations/block_scan_warp_scans.cuh |  375 +-
 .../block_scan_warp_scans2.cuh                |  436 ++
 .../block_scan_warp_scans3.cuh                |  418 ++
 SRC/cub/block_range/block_range_histo.cuh     |  319 --
 .../block_range_radix_sort_downsweep.cuh      |  744 ---
 SRC/cub/block_range/block_range_reduce.cuh    |  430 --
 .../block_range/block_range_reduce_by_key.cuh | 1034 ----
 SRC/cub/block_range/block_range_scan.cuh      |  538 --
 SRC/cub/block_range/block_range_select.cuh    |  735 ---
 .../block_range_histo_gatomic.cuh             |  184 -
 .../block_range_histo_satomic.cuh             |  245 -
 .../block_range_histo_sort.cuh                |  364 --
 SRC/cub/cub.cuh                               |   12 +-
 SRC/cub/device/device_histogram.cuh           |  959 ++--
 SRC/cub/device/device_partition.cuh           |  548 +-
 SRC/cub/device/device_radix_sort.cuh          | 1217 +++--
 SRC/cub/device/device_reduce.cuh              |  560 +--
 SRC/cub/device/device_run_length_encode.cuh   |  278 ++
 SRC/cub/device/device_scan.cuh                |  862 ++--
 .../device/device_segmented_radix_sort.cuh    |  876 ++++
 SRC/cub/device/device_segmented_reduce.cuh    |  619 +++
 SRC/cub/device/device_select.cuh              |  741 ++-
 SRC/cub/device/device_spmv.cuh                |  174 +
 .../dispatch/device_histogram_dispatch.cuh    |  554 --
 .../dispatch/device_radix_sort_dispatch.cuh   |  939 ----
 .../device_reduce_by_key_dispatch.cuh         |  594 ---
 .../dispatch/device_reduce_dispatch.cuh       |  743 ---
 .../device/dispatch/device_scan_dispatch.cuh  |  565 ---
 .../device/dispatch/dispatch_histogram.cuh    | 1096 ++++
 .../device/dispatch/dispatch_radix_sort.cuh   | 1619 ++++++
 SRC/cub/device/dispatch/dispatch_reduce.cuh   |  882 ++++
 .../dispatch/dispatch_reduce_by_key.cuh       |  554 ++
 SRC/cub/device/dispatch/dispatch_rle.cuh      |  538 ++
 SRC/cub/device/dispatch/dispatch_scan.cuh     |  563 +++
 ...ct_dispatch.cuh => dispatch_select_if.cuh} | 1106 ++--
 .../device/dispatch/dispatch_spmv_orig.cuh    |  834 ++++
 SRC/cub/grid/grid_barrier.cuh                 |   10 +-
 SRC/cub/grid/grid_even_share.cuh              |  189 +-
 SRC/cub/grid/grid_mapping.cuh                 |   24 +-
 SRC/cub/grid/grid_queue.cuh                   |   46 +-
 SRC/cub/host/mutex.cuh                        |  171 +
 SRC/cub/host/spinlock.cuh                     |  123 -
 SRC/cub/iterator/arg_index_input_iterator.cuh |   70 +-
 .../cache_modified_input_iterator.cuh         |   30 +-
 .../cache_modified_output_iterator.cuh        |   19 +-
 SRC/cub/iterator/constant_input_iterator.cuh  |   20 +-
 SRC/cub/iterator/counting_input_iterator.cuh  |   22 +-
 SRC/cub/iterator/discard_output_iterator.cuh  |  220 +
 SRC/cub/iterator/tex_obj_input_iterator.cuh   |   28 +-
 SRC/cub/iterator/tex_ref_input_iterator.cuh   |   42 +-
 SRC/cub/iterator/transform_input_iterator.cuh |   26 +-
 SRC/cub/thread/thread_load.cuh                |  146 +-
 SRC/cub/thread/thread_operators.cuh           |  149 +-
 SRC/cub/thread/thread_reduce.cuh              |   49 +-
 SRC/cub/thread/thread_scan.cuh                |   71 +-
 SRC/cub/thread/thread_search.cuh              |  154 +
 SRC/cub/thread/thread_store.cuh               |  130 +-
 SRC/cub/util_allocator.cuh                    |  698 +--
 SRC/cub/util_arch.cuh                         |  212 +-
 SRC/cub/util_debug.cuh                        |   48 +-
 SRC/cub/util_device.cuh                       |  217 +-
 SRC/cub/util_macro.cuh                        |   86 +-
 SRC/cub/util_namespace.cuh                    |    7 +-
 SRC/cub/util_ptx.cuh                          |  386 +-
 SRC/cub/util_type.cuh                         |  654 ++-
 .../warp/specializations/warp_reduce_shfl.cuh |  871 ++--
 .../warp/specializations/warp_reduce_smem.cuh |  730 +--
 .../warp/specializations/warp_scan_shfl.cuh   | 1033 ++--
 .../warp/specializations/warp_scan_smem.cuh   |  716 +--
 SRC/cub/warp/warp_reduce.cuh                  | 1239 +++--
 SRC/cub/warp/warp_scan.cuh                    | 2387 ++++-----
 101 files changed, 30389 insertions(+), 20714 deletions(-)
 create mode 100644 SRC/cub/agent/agent_histogram.cuh
 create mode 100644 SRC/cub/agent/agent_radix_sort_downsweep.cuh
 rename SRC/cub/{block_range/block_range_radix_sort_upsweep.cuh => agent/agent_radix_sort_upsweep.cuh} (66%)
 create mode 100644 SRC/cub/agent/agent_reduce.cuh
 create mode 100644 SRC/cub/agent/agent_reduce_by_key.cuh
 create mode 100644 SRC/cub/agent/agent_rle.cuh
 create mode 100644 SRC/cub/agent/agent_scan.cuh
 create mode 100644 SRC/cub/agent/agent_segment_fixup.cuh
 create mode 100644 SRC/cub/agent/agent_select_if.cuh
 create mode 100644 SRC/cub/agent/agent_spmv_orig.cuh
 rename SRC/cub/{block_range/block_scan_prefix_operators.cuh => agent/single_pass_scan_operators.cuh} (57%)
 create mode 100644 SRC/cub/block/block_adjacent_difference.cuh
 delete mode 100644 SRC/cub/block/block_shift.cuh
 create mode 100644 SRC/cub/block/block_shuffle.cuh
 create mode 100644 SRC/cub/block/specializations/block_scan_warp_scans2.cuh
 create mode 100644 SRC/cub/block/specializations/block_scan_warp_scans3.cuh
 delete mode 100644 SRC/cub/block_range/block_range_histo.cuh
 delete mode 100644 SRC/cub/block_range/block_range_radix_sort_downsweep.cuh
 delete mode 100644 SRC/cub/block_range/block_range_reduce.cuh
 delete mode 100644 SRC/cub/block_range/block_range_reduce_by_key.cuh
 delete mode 100644 SRC/cub/block_range/block_range_scan.cuh
 delete mode 100644 SRC/cub/block_range/block_range_select.cuh
 delete mode 100644 SRC/cub/block_range/specializations/block_range_histo_gatomic.cuh
 delete mode 100644 SRC/cub/block_range/specializations/block_range_histo_satomic.cuh
 delete mode 100644 SRC/cub/block_range/specializations/block_range_histo_sort.cuh
 create mode 100644 SRC/cub/device/device_run_length_encode.cuh
 create mode 100644 SRC/cub/device/device_segmented_radix_sort.cuh
 create mode 100644 SRC/cub/device/device_segmented_reduce.cuh
 create mode 100644 SRC/cub/device/device_spmv.cuh
 delete mode 100644 SRC/cub/device/dispatch/device_histogram_dispatch.cuh
 delete mode 100644 SRC/cub/device/dispatch/device_radix_sort_dispatch.cuh
 delete mode 100644 SRC/cub/device/dispatch/device_reduce_by_key_dispatch.cuh
 delete mode 100644 SRC/cub/device/dispatch/device_reduce_dispatch.cuh
 delete mode 100644 SRC/cub/device/dispatch/device_scan_dispatch.cuh
 create mode 100644 SRC/cub/device/dispatch/dispatch_histogram.cuh
 create mode 100644 SRC/cub/device/dispatch/dispatch_radix_sort.cuh
 create mode 100644 SRC/cub/device/dispatch/dispatch_reduce.cuh
 create mode 100644 SRC/cub/device/dispatch/dispatch_reduce_by_key.cuh
 create mode 100644 SRC/cub/device/dispatch/dispatch_rle.cuh
 create mode 100644 SRC/cub/device/dispatch/dispatch_scan.cuh
 rename SRC/cub/device/dispatch/{device_select_dispatch.cuh => dispatch_select_if.cuh} (51%)
 create mode 100644 SRC/cub/device/dispatch/dispatch_spmv_orig.cuh
 create mode 100644 SRC/cub/host/mutex.cuh
 delete mode 100644 SRC/cub/host/spinlock.cuh
 create mode 100644 SRC/cub/iterator/discard_output_iterator.cuh
 create mode 100644 SRC/cub/thread/thread_search.cuh

diff --git a/SRC/cub/agent/agent_histogram.cuh b/SRC/cub/agent/agent_histogram.cuh
new file mode 100644
index 00000000..37b1ec97
--- /dev/null
+++ b/SRC/cub/agent/agent_histogram.cuh
@@ -0,0 +1,787 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "../util_type.cuh"
+#include "../block/block_load.cuh"
+#include "../grid/grid_queue.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy
+ ******************************************************************************/
+
+/**
+ *
+ */
+enum BlockHistogramMemoryPreference
+{
+    GMEM,
+    SMEM,
+    BLEND
+};
+
+
+/**
+ * Parameterizable tuning policy type for AgentHistogram
+ */
+template <
+    int                             _BLOCK_THREADS,                 ///< Threads per thread block
+    int                             _PIXELS_PER_THREAD,             ///< Pixels per thread (per tile of input)
+    BlockLoadAlgorithm              _LOAD_ALGORITHM,                ///< The BlockLoad algorithm to use
+    CacheLoadModifier               _LOAD_MODIFIER,                 ///< Cache load modifier for reading input elements
+    bool                            _RLE_COMPRESS,                  ///< Whether to perform localized RLE to compress samples before histogramming
+    BlockHistogramMemoryPreference  _MEM_PREFERENCE,                ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins)
+    bool                            _WORK_STEALING>                 ///< Whether to dequeue tiles from a global work queue
+struct AgentHistogramPolicy
+{
+    enum
+    {
+        BLOCK_THREADS           = _BLOCK_THREADS,                   ///< Threads per thread block
+        PIXELS_PER_THREAD       = _PIXELS_PER_THREAD,               ///< Pixels per thread (per tile of input)
+        IS_RLE_COMPRESS         = _RLE_COMPRESS,                    ///< Whether to perform localized RLE to compress samples before histogramming
+        MEM_PREFERENCE          = _MEM_PREFERENCE,                  ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins)
+        IS_WORK_STEALING        = _WORK_STEALING,                   ///< Whether to dequeue tiles from a global work queue
+    };
+
+    static const BlockLoadAlgorithm     LOAD_ALGORITHM          = _LOAD_ALGORITHM;          ///< The BlockLoad algorithm to use
+    static const CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;           ///< Cache load modifier for reading input elements
+};
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram.
+ */
+template <
+    typename    AgentHistogramPolicyT,     ///< Parameterized AgentHistogramPolicy tuning policy type
+    int         PRIVATIZED_SMEM_BINS,           ///< Number of privatized shared-memory histogram bins of any channel.  Zero indicates privatized counters to be maintained in device-accessible memory.
+    int         NUM_CHANNELS,                   ///< Number of channels interleaved in the input data.  Supports up to four channels.
+    int         NUM_ACTIVE_CHANNELS,            ///< Number of channels actively being histogrammed
+    typename    SampleIteratorT,                ///< Random-access input iterator type for reading samples
+    typename    CounterT,                       ///< Integer type for counting sample occurrences per histogram bin
+    typename    PrivatizedDecodeOpT,            ///< The transform operator type for determining privatized counter indices from samples, one for each channel
+    typename    OutputDecodeOpT,                ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel
+    typename    OffsetT,                        ///< Signed integer type for global offsets
+    int         PTX_ARCH = CUB_PTX_ARCH>        ///< PTX compute capability
+struct AgentHistogram
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    /// The sample type of the input iterator
+    typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
+
+    /// The pixel type of SampleT
+    typedef typename CubVector<SampleT, NUM_CHANNELS>::Type PixelT;
+
+    /// The quad type of SampleT
+    typedef typename CubVector<SampleT, 4>::Type QuadT;
+
+    /// Constants
+    enum
+    {
+        BLOCK_THREADS           = AgentHistogramPolicyT::BLOCK_THREADS,
+
+        PIXELS_PER_THREAD       = AgentHistogramPolicyT::PIXELS_PER_THREAD,
+        SAMPLES_PER_THREAD      = PIXELS_PER_THREAD * NUM_CHANNELS,
+        QUADS_PER_THREAD        = SAMPLES_PER_THREAD / 4,
+
+        TILE_PIXELS             = PIXELS_PER_THREAD * BLOCK_THREADS,
+        TILE_SAMPLES            = SAMPLES_PER_THREAD * BLOCK_THREADS,
+
+        IS_RLE_COMPRESS            = AgentHistogramPolicyT::IS_RLE_COMPRESS,
+
+        MEM_PREFERENCE          = (PRIVATIZED_SMEM_BINS > 0) ?
+                                        AgentHistogramPolicyT::MEM_PREFERENCE :
+                                        GMEM,
+
+        IS_WORK_STEALING           = AgentHistogramPolicyT::IS_WORK_STEALING,
+    };
+
+    /// Cache load modifier for reading input elements
+    static const CacheLoadModifier LOAD_MODIFIER = AgentHistogramPolicyT::LOAD_MODIFIER;
+
+
+    /// Input iterator wrapper type (for applying cache modifier)
+    typedef typename If<IsPointer<SampleIteratorT>::VALUE,
+            CacheModifiedInputIterator<LOAD_MODIFIER, SampleT, OffsetT>,    // Wrap the native input pointer with CacheModifiedInputIterator
+            SampleIteratorT>::Type                                          // Directly use the supplied input iterator type
+        WrappedSampleIteratorT;
+
+    /// Pixel input iterator type (for applying cache modifier)
+    typedef CacheModifiedInputIterator<LOAD_MODIFIER, PixelT, OffsetT>
+        WrappedPixelIteratorT;
+
+    /// Quad input iterator type (for applying cache modifier)
+    typedef CacheModifiedInputIterator<LOAD_MODIFIER, QuadT, OffsetT>
+        WrappedQuadIteratorT;
+
+    /// Parameterized BlockLoad type for samples
+    typedef BlockLoad<
+            SampleT,
+            BLOCK_THREADS,
+            SAMPLES_PER_THREAD,
+            AgentHistogramPolicyT::LOAD_ALGORITHM>
+        BlockLoadSampleT;
+
+    /// Parameterized BlockLoad type for pixels
+    typedef BlockLoad<
+            PixelT,
+            BLOCK_THREADS,
+            PIXELS_PER_THREAD,
+            AgentHistogramPolicyT::LOAD_ALGORITHM>
+        BlockLoadPixelT;
+
+    /// Parameterized BlockLoad type for quads
+    typedef BlockLoad<
+            QuadT,
+            BLOCK_THREADS,
+            QUADS_PER_THREAD,
+            AgentHistogramPolicyT::LOAD_ALGORITHM>
+        BlockLoadQuadT;
+
+    /// Shared memory type required by this thread block
+    struct _TempStorage
+    {
+        CounterT histograms[NUM_ACTIVE_CHANNELS][PRIVATIZED_SMEM_BINS + 1];     // Smem needed for block-privatized smem histogram (with 1 word of padding)
+
+        int tile_idx;
+
+        // Aliasable storage layout
+        union Aliasable
+        {
+            typename BlockLoadSampleT::TempStorage sample_load;     // Smem needed for loading a tile of samples
+            typename BlockLoadPixelT::TempStorage pixel_load;       // Smem needed for loading a tile of pixels
+            typename BlockLoadQuadT::TempStorage quad_load;         // Smem needed for loading a tile of quads
+
+        } aliasable;
+    };
+
+
+    /// Temporary storage type (unionable)
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    /// Reference to temp_storage
+    _TempStorage &temp_storage;
+
+    /// Sample input iterator (with cache modifier applied, if possible)
+    WrappedSampleIteratorT d_wrapped_samples;
+
+    /// Native pointer for input samples (possibly NULL if unavailable)
+    SampleT* d_native_samples;
+
+    /// The number of output bins for each channel
+    int (&num_output_bins)[NUM_ACTIVE_CHANNELS];
+
+    /// The number of privatized bins for each channel
+    int (&num_privatized_bins)[NUM_ACTIVE_CHANNELS];
+
+    /// Reference to gmem privatized histograms for each channel
+    CounterT* d_privatized_histograms[NUM_ACTIVE_CHANNELS];
+
+    /// Reference to final output histograms (gmem)
+    CounterT* (&d_output_histograms)[NUM_ACTIVE_CHANNELS];
+
+    /// The transform operator for determining output bin-ids from privatized counter indices, one for each channel
+    OutputDecodeOpT (&output_decode_op)[NUM_ACTIVE_CHANNELS];
+
+    /// The transform operator for determining privatized counter indices from samples, one for each channel
+    PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS];
+
+    /// Whether to prefer privatized smem counters vs privatized global counters
+    bool prefer_smem;
+
+
+    //---------------------------------------------------------------------
+    // Initialize privatized bin counters
+    //---------------------------------------------------------------------
+
+    // Initialize privatized bin counters
+    __device__ __forceinline__ void InitBinCounters(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS])
+    {
+        // Initialize histogram bin counts to zeros
+        #pragma unroll
+        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+        {
+            for (int privatized_bin = threadIdx.x; privatized_bin < num_privatized_bins[CHANNEL]; privatized_bin += BLOCK_THREADS)
+            {
+                privatized_histograms[CHANNEL][privatized_bin] = 0;
+            }
+        }
+
+        // Barrier to make sure all threads are done updating counters
+        CTA_SYNC();
+    }
+
+
+    // Initialize privatized bin counters.  Specialized for privatized shared-memory counters
+    __device__ __forceinline__ void InitSmemBinCounters()
+    {
+        CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];
+
+        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+            privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];
+
+        InitBinCounters(privatized_histograms);
+    }
+
+
+    // Initialize privatized bin counters.  Specialized for privatized global-memory counters
+    __device__ __forceinline__ void InitGmemBinCounters()
+    {
+        InitBinCounters(d_privatized_histograms);
+    }
+
+
+    //---------------------------------------------------------------------
+    // Update final output histograms
+    //---------------------------------------------------------------------
+
+    // Update final output histograms from privatized histograms
+    __device__ __forceinline__ void StoreOutput(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS])
+    {
+        // Barrier to make sure all threads are done updating counters
+        CTA_SYNC();
+
+        // Apply privatized bin counts to output bin counts
+        #pragma unroll
+        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+        {
+            int channel_bins = num_privatized_bins[CHANNEL];
+            for (int privatized_bin = threadIdx.x; 
+                    privatized_bin < channel_bins;  
+                    privatized_bin += BLOCK_THREADS)
+            {
+                int         output_bin  = -1;
+                CounterT    count       = privatized_histograms[CHANNEL][privatized_bin];
+                bool        is_valid    = count > 0;
+
+                output_decode_op[CHANNEL].template BinSelect<LOAD_MODIFIER>((SampleT) privatized_bin, output_bin, is_valid);
+
+                if (output_bin >= 0)
+                {
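+                    // Multiple blocks flush their privatized copies concurrently,
+                    // hence the atomic update of the shared output bin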
+                    atomicAdd(&d_output_histograms[CHANNEL][output_bin], count);
+                }
+
+            }
+        }
+    }
+
+
+    // Update final output histograms from privatized histograms.  Specialized for privatized shared-memory counters
+    __device__ __forceinline__ void StoreSmemOutput()
+    {
+        CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];
+        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+            privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];
+
+        StoreOutput(privatized_histograms);
+    }
+
+
+    // Update final output histograms from privatized histograms.  Specialized for privatized global-memory counters
+    __device__ __forceinline__ void StoreGmemOutput()
+    {
+        StoreOutput(d_privatized_histograms);
+    }
+
+
+    //---------------------------------------------------------------------
+    // Tile accumulation
+    //---------------------------------------------------------------------
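+
+    // An Int2Type<bool> tag argument statically selects between the two
+    // AccumulatePixels overloads below, so the RLE choice costs no runtime branch.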
+
+    // Accumulate pixels.  Specialized for RLE compression.
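+    // E.g., with four pixels per thread whose bins come out as {2, 2, 2, 5}, the
+    // run-length loop issues just two atomics, atomicAdd(bin 2, 3) and
+    // atomicAdd(bin 5, 1), instead of four.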
+    __device__ __forceinline__ void AccumulatePixels(
+        SampleT             samples[PIXELS_PER_THREAD][NUM_CHANNELS],
+        bool                is_valid[PIXELS_PER_THREAD],
+        CounterT*           privatized_histograms[NUM_ACTIVE_CHANNELS],
+        Int2Type<true>      is_rle_compress)
+    {
+        #pragma unroll
+        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+        {
+            // Bin pixels
+            int bins[PIXELS_PER_THREAD];
+
+            #pragma unroll
+            for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
+            {
+                bins[PIXEL] = -1;
+                privatized_decode_op[CHANNEL].template BinSelect<LOAD_MODIFIER>(samples[PIXEL][CHANNEL], bins[PIXEL], is_valid[PIXEL]);
+            }
+
+            CounterT accumulator = 1;
+
+            #pragma unroll
+            for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD - 1; ++PIXEL)
+            {
+                if (bins[PIXEL] != bins[PIXEL + 1])
+                {
+                    if (bins[PIXEL] >= 0)
+                        atomicAdd(privatized_histograms[CHANNEL] + bins[PIXEL], accumulator);
+
+                     accumulator = 0;
+                }
+                accumulator++;
+            }
+
+            // Last pixel
+            if (bins[PIXELS_PER_THREAD - 1] >= 0)
+                atomicAdd(privatized_histograms[CHANNEL] + bins[PIXELS_PER_THREAD - 1], accumulator);
+        }
+    }
+
+
+    // Accumulate pixels.  Specialized for individual accumulation of each pixel.
+    __device__ __forceinline__ void AccumulatePixels(
+        SampleT             samples[PIXELS_PER_THREAD][NUM_CHANNELS],
+        bool                is_valid[PIXELS_PER_THREAD],
+        CounterT*           privatized_histograms[NUM_ACTIVE_CHANNELS],
+        Int2Type<false>     is_rle_compress)
+    {
+        #pragma unroll
+        for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
+        {
+            #pragma unroll
+            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+            {
+                int bin = -1;
+                privatized_decode_op[CHANNEL].template BinSelect<LOAD_MODIFIER>(samples[PIXEL][CHANNEL], bin, is_valid[PIXEL]);
+                if (bin >= 0)
+                    atomicAdd(privatized_histograms[CHANNEL] + bin, 1);
+            }
+        }
+    }
+
+
+    /**
+     * Accumulate pixel, specialized for smem privatized histogram
+     */
+    __device__ __forceinline__ void AccumulateSmemPixels(
+        SampleT             samples[PIXELS_PER_THREAD][NUM_CHANNELS],
+        bool                is_valid[PIXELS_PER_THREAD])
+    {
+        CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS];
+
+        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+            privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL];
+
+        AccumulatePixels(samples, is_valid, privatized_histograms, Int2Type<IS_RLE_COMPRESS>());
+    }
+
+
+    /**
+     * Accumulate pixel, specialized for gmem privatized histogram
+     */
+    __device__ __forceinline__ void AccumulateGmemPixels(
+        SampleT             samples[PIXELS_PER_THREAD][NUM_CHANNELS],
+        bool                is_valid[PIXELS_PER_THREAD])
+    {
+        AccumulatePixels(samples, is_valid, d_privatized_histograms, Int2Type<IS_RLE_COMPRESS>());
+    }
+
+
+
+    //---------------------------------------------------------------------
+    // Tile loading
+    //---------------------------------------------------------------------
+
+    // Load full, aligned tile using pixel iterator (multi-channel)
+    template <int _NUM_ACTIVE_CHANNELS>
+    __device__ __forceinline__ void LoadFullAlignedTile(
+        OffsetT                         block_offset,
+        int                             valid_samples,
+        SampleT                         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
+        Int2Type<_NUM_ACTIVE_CHANNELS>  num_active_channels)
+    {
+        typedef PixelT AliasedPixels[PIXELS_PER_THREAD];
+
+        WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset));
+
+        // Load using a wrapped pixel iterator
+        BlockLoadPixelT(temp_storage.aliasable.pixel_load).Load(
+            d_wrapped_pixels,
+            reinterpret_cast<AliasedPixels&>(samples));
+    }
+
+    // Load full, aligned tile using quad iterator (single-channel)
+    __device__ __forceinline__ void LoadFullAlignedTile(
+        OffsetT                         block_offset,
+        int                             valid_samples,
+        SampleT                         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
+        Int2Type<1>                     num_active_channels)
+    {
+        typedef QuadT AliasedQuads[QUADS_PER_THREAD];
+
+        WrappedQuadIteratorT d_wrapped_quads((QuadT*) (d_native_samples + block_offset));
+
+        // Load using a wrapped quad iterator
+        BlockLoadQuadT(temp_storage.aliasable.quad_load).Load(
+            d_wrapped_quads,
+            reinterpret_cast<AliasedQuads&>(samples));
+    }
+
+    // Load full, aligned tile
+    __device__ __forceinline__ void LoadTile(
+        OffsetT         block_offset,
+        int             valid_samples,
+        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
+        Int2Type<true>  is_full_tile,
+        Int2Type<true>  is_aligned)
+    {
+        LoadFullAlignedTile(block_offset, valid_samples, samples, Int2Type<NUM_ACTIVE_CHANNELS>());
+    }
+
+    // Load full, mis-aligned tile using sample iterator
+    __device__ __forceinline__ void LoadTile(
+        OffsetT         block_offset,
+        int             valid_samples,
+        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
+        Int2Type<true>  is_full_tile,
+        Int2Type<false> is_aligned)
+    {
+        typedef SampleT AliasedSamples[SAMPLES_PER_THREAD];
+
+        // Load using sample iterator
+        BlockLoadSampleT(temp_storage.aliasable.sample_load).Load(
+            d_wrapped_samples + block_offset,
+            reinterpret_cast<AliasedSamples&>(samples));
+    }
+
+    // Load partially-full, aligned tile using the pixel iterator
+    __device__ __forceinline__ void LoadTile(
+        OffsetT         block_offset,
+        int             valid_samples,
+        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
+        Int2Type<false> is_full_tile,
+        Int2Type<true>  is_aligned)
+    {
+        typedef PixelT AliasedPixels[PIXELS_PER_THREAD];
+
+        WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset));
+
+        int valid_pixels = valid_samples / NUM_CHANNELS;
+
+        // Load using a wrapped pixel iterator
+        BlockLoadPixelT(temp_storage.aliasable.pixel_load).Load(
+            d_wrapped_pixels,
+            reinterpret_cast<AliasedPixels&>(samples),
+            valid_pixels);
+    }
+
+    // Load partially-full, mis-aligned tile using sample iterator
+    __device__ __forceinline__ void LoadTile(
+        OffsetT         block_offset,
+        int             valid_samples,
+        SampleT         (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS],
+        Int2Type<false> is_full_tile,
+        Int2Type<false> is_aligned)
+    {
+        typedef SampleT AliasedSamples[SAMPLES_PER_THREAD];
+
+        BlockLoadSampleT(temp_storage.aliasable.sample_load).Load(
+            d_wrapped_samples + block_offset,
+            reinterpret_cast<AliasedSamples&>(samples),
+            valid_samples);
+    }
+
+
+    //---------------------------------------------------------------------
+    // Tile processing
+    //---------------------------------------------------------------------
+
+    // Consume a tile of data samples
+    template <
+        bool IS_ALIGNED,        // Whether the tile offset is aligned (quad-aligned for single-channel, pixel-aligned for multi-channel)
+        bool IS_FULL_TILE>      // Whether the tile is full
+    __device__ __forceinline__ void ConsumeTile(OffsetT block_offset, int valid_samples)
+    {
+        SampleT     samples[PIXELS_PER_THREAD][NUM_CHANNELS];
+        bool        is_valid[PIXELS_PER_THREAD];
+
+        // Load tile
+        LoadTile(
+            block_offset,
+            valid_samples,
+            samples,
+            Int2Type<IS_FULL_TILE>(),
+            Int2Type<IS_ALIGNED>());
+
+        // Set valid flags
+        #pragma unroll
+        for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL)
+            is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x * PIXELS_PER_THREAD + PIXEL) * NUM_CHANNELS) < valid_samples);
+
+        // Accumulate samples
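+        // (smem privatization relies on shared-memory atomics, available from SM 1.2)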
+#if CUB_PTX_ARCH >= 120
+        if (prefer_smem)
+            AccumulateSmemPixels(samples, is_valid);
+        else
+            AccumulateGmemPixels(samples, is_valid);
+#else
+        AccumulateGmemPixels(samples, is_valid);
+#endif
+
+    }
+
+
+    // Consume row tiles.  Specialized for work-stealing from queue
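+    // Each block first consumes its statically-assigned (even-share) tile, then
+    // keeps draining one tile index at a time from the shared queue until all
+    // num_tiles tiles are taken.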
+    template <bool IS_ALIGNED>
+    __device__ __forceinline__ void ConsumeTiles(
+        OffsetT             num_row_pixels,             ///< The number of multi-channel pixels per row in the region of interest
+        OffsetT             num_rows,                   ///< The number of rows in the region of interest
+        OffsetT             row_stride_samples,         ///< The number of samples between starts of consecutive rows in the region of interest
+        int                 tiles_per_row,              ///< Number of image tiles per row
+        GridQueue<int>      tile_queue,
+        Int2Type<true>      is_work_stealing)
+    {
+
+        int         num_tiles                   = num_rows * tiles_per_row;
+        int         tile_idx                    = (blockIdx.y  * gridDim.x) + blockIdx.x;
+        OffsetT     num_even_share_tiles        = gridDim.x * gridDim.y;
+
+        while (tile_idx < num_tiles)
+        {
+            int     row             = tile_idx / tiles_per_row;
+            int     col             = tile_idx - (row * tiles_per_row);
+            OffsetT row_offset      = row * row_stride_samples;
+            OffsetT col_offset      = (col * TILE_SAMPLES);
+            OffsetT tile_offset     = row_offset + col_offset;
+
+            if (col == tiles_per_row - 1)
+            {
+                // Consume a partially-full tile at the end of the row
+                OffsetT num_remaining = (num_row_pixels * NUM_CHANNELS) - col_offset;
+                ConsumeTile<IS_ALIGNED, false>(tile_offset, num_remaining);
+            } 
+            else
+            {
+                // Consume full tile
+                ConsumeTile<IS_ALIGNED, true>(tile_offset, TILE_SAMPLES);
+            }
+
+            CTA_SYNC();
+
+            // Get next tile
+            if (threadIdx.x == 0)
+                temp_storage.tile_idx = tile_queue.Drain(1) + num_even_share_tiles;
+
+            CTA_SYNC();
+
+            tile_idx = temp_storage.tile_idx;
+        }
+    }
+
+
+    // Consume row tiles.  Specialized for even-share (striped across thread blocks)
+    template <bool IS_ALIGNED>
+    __device__ __forceinline__ void ConsumeTiles(
+        OffsetT             num_row_pixels,             ///< The number of multi-channel pixels per row in the region of interest
+        OffsetT             num_rows,                   ///< The number of rows in the region of interest
+        OffsetT             row_stride_samples,         ///< The number of samples between starts of consecutive rows in the region of interest
+        int                 tiles_per_row,              ///< Number of image tiles per row
+        GridQueue<int>      tile_queue,
+        Int2Type<false>     is_work_stealing)
+    {
+        for (int row = blockIdx.y; row < num_rows; row += gridDim.y)
+        {
+            OffsetT row_begin   = row * row_stride_samples;
+            OffsetT row_end     = row_begin + (num_row_pixels * NUM_CHANNELS);
+            OffsetT tile_offset = row_begin + (blockIdx.x * TILE_SAMPLES);
+
+            while (tile_offset < row_end)
+            {
+                OffsetT num_remaining = row_end - tile_offset;
+
+                if (num_remaining < TILE_SAMPLES)
+                {
+                    // Consume partial tile
+                    ConsumeTile<IS_ALIGNED, false>(tile_offset, num_remaining);
+                    break;
+                }
+
+                // Consume full tile
+                ConsumeTile<IS_ALIGNED, true>(tile_offset, TILE_SAMPLES);
+                tile_offset += gridDim.x * TILE_SAMPLES;
+            }
+        }
+    }
+
+
+    //---------------------------------------------------------------------
+    // Parameter extraction
+    //---------------------------------------------------------------------
+
+    // Return a native pixel pointer (specialized for CacheModifiedInputIterator types)
+    template <
+        CacheLoadModifier   _MODIFIER,
+        typename            _ValueT,
+        typename            _OffsetT>
+    __device__ __forceinline__ SampleT* NativePointer(CacheModifiedInputIterator<_MODIFIER, _ValueT, _OffsetT> itr)
+    {
+        return itr.ptr;
+    }
+
+    // Return a native pixel pointer (specialized for other types)
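+    // (returning NULL disables the vectorized load path for non-pointer iterator types)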
+    template <typename IteratorT>
+    __device__ __forceinline__ SampleT* NativePointer(IteratorT itr)
+    {
+        return NULL;
+    }
+
+
+
+    //---------------------------------------------------------------------
+    // Interface
+    //---------------------------------------------------------------------
+
+
+    /**
+     * Constructor
+     */
+    __device__ __forceinline__ AgentHistogram(
+        TempStorage         &temp_storage,                                      ///< Reference to temp_storage
+        SampleIteratorT     d_samples,                                          ///< Input data to reduce
+        int                 (&num_output_bins)[NUM_ACTIVE_CHANNELS],            ///< The number bins per final output histogram
+        int                 (&num_privatized_bins)[NUM_ACTIVE_CHANNELS],        ///< The number bins per privatized histogram
+        CounterT*           (&d_output_histograms)[NUM_ACTIVE_CHANNELS],        ///< Reference to final output histograms
+        CounterT*           (&d_privatized_histograms)[NUM_ACTIVE_CHANNELS],    ///< Reference to privatized histograms
+        OutputDecodeOpT     (&output_decode_op)[NUM_ACTIVE_CHANNELS],           ///< The transform operator for determining output bin-ids from privatized counter indices, one for each channel
+        PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS])       ///< The transform operator for determining privatized counter indices from samples, one for each channel
+    :
+        temp_storage(temp_storage.Alias()),
+        d_wrapped_samples(d_samples),
+        num_output_bins(num_output_bins),
+        num_privatized_bins(num_privatized_bins),
+        d_output_histograms(d_output_histograms),
+        privatized_decode_op(privatized_decode_op),
+        output_decode_op(output_decode_op),
+        d_native_samples(NativePointer(d_wrapped_samples)),
+        prefer_smem((MEM_PREFERENCE == SMEM) ?
+            true :                              // prefer smem privatized histograms
+            (MEM_PREFERENCE == GMEM) ?
+                false :                         // prefer gmem privatized histograms
+                blockIdx.x & 1)                 // prefer blended privatized histograms
+    {
+        int blockId = (blockIdx.y * gridDim.x) + blockIdx.x;
+
+        // Initialize the locations of this block's privatized histograms
+        for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
+            this->d_privatized_histograms[CHANNEL] = d_privatized_histograms[CHANNEL] + (blockId * num_privatized_bins[CHANNEL]);
+    }
+
+
+    /**
+     * Consume image
+     */
+    __device__ __forceinline__ void ConsumeTiles(
+        OffsetT             num_row_pixels,             ///< The number of multi-channel pixels per row in the region of interest
+        OffsetT             num_rows,                   ///< The number of rows in the region of interest
+        OffsetT             row_stride_samples,         ///< The number of samples between starts of consecutive rows in the region of interest
+        int                 tiles_per_row,              ///< Number of image tiles per row
+        GridQueue<int>      tile_queue)                 ///< Queue descriptor for assigning tiles of work to thread blocks
+    {
+        // Check whether all row starting offsets are quad-aligned (in single-channel) or pixel-aligned (in multi-channel)
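+        // E.g., with four 1-byte channels the pixel type is presumably 4 bytes wide,
+        // so pixel_mask == 3 and rows must begin on 4-byte boundaries to vectorize.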
+        int     quad_mask           = AlignBytes<QuadT>::ALIGN_BYTES - 1;
+        int     pixel_mask          = AlignBytes<PixelT>::ALIGN_BYTES - 1;
+        size_t  row_bytes           = sizeof(SampleT) * row_stride_samples;
+
+        bool quad_aligned_rows      = (NUM_CHANNELS == 1) && (SAMPLES_PER_THREAD % 4 == 0) &&     // Single channel
+                                        ((size_t(d_native_samples) & quad_mask) == 0) &&        // ptr is quad-aligned
+                                        ((num_rows == 1) || ((row_bytes & quad_mask) == 0));    // number of row-samples is a multiple of the alignment of the quad
+
+        bool pixel_aligned_rows     = (NUM_CHANNELS > 1) &&                                     // Multi channel
+                                        ((size_t(d_native_samples) & pixel_mask) == 0) &&       // ptr is pixel-aligned
+                                        ((row_bytes & pixel_mask) == 0);                        // number of row-samples is a multiple of the alignment of the pixel
+
+        // Whether rows are aligned and can be vectorized
+        if ((d_native_samples != NULL) && (quad_aligned_rows || pixel_aligned_rows))
+            ConsumeTiles<true>(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type<IS_WORK_STEALING>());
+        else
+            ConsumeTiles<false>(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type<IS_WORK_STEALING>());
+    }
+
+
+    /**
+     * Initialize privatized bin counters (dispatching to the smem or gmem specialization)
+     */
+    __device__ __forceinline__ void InitBinCounters()
+    {
+        if (prefer_smem)
+            InitSmemBinCounters();
+        else
+            InitGmemBinCounters();
+    }
+
+
+    /**
+     * Store privatized histogram to device-accessible memory (dispatching to the smem or gmem specialization)
+     */
+    __device__ __forceinline__ void StoreOutput()
+    {
+        if (prefer_smem)
+            StoreSmemOutput();
+        else
+            StoreGmemOutput();
+    }
+
+
+};
+
+
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/SRC/cub/agent/agent_radix_sort_downsweep.cuh b/SRC/cub/agent/agent_radix_sort_downsweep.cuh
new file mode 100644
index 00000000..faea8813
--- /dev/null
+++ b/SRC/cub/agent/agent_radix_sort_downsweep.cuh
@@ -0,0 +1,789 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * AgentRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep.
+ */
+
+
+#pragma once
+
+#include <stdint.h>
+
+#include "../thread/thread_load.cuh"
+#include "../block/block_load.cuh"
+#include "../block/block_store.cuh"
+#include "../block/block_radix_rank.cuh"
+#include "../block/block_exchange.cuh"
+#include "../util_type.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Radix ranking algorithm
+ */
+enum RadixRankAlgorithm
+{
+    RADIX_RANK_BASIC,
+    RADIX_RANK_MEMOIZE,
+    RADIX_RANK_MATCH
+};
+
+/**
+ * Parameterizable tuning policy type for AgentRadixSortDownsweep
+ */
+template <
+    int                         _BLOCK_THREADS,         ///< Threads per thread block
+    int                         _ITEMS_PER_THREAD,      ///< Items per thread (per tile of input)
+    BlockLoadAlgorithm          _LOAD_ALGORITHM,        ///< The BlockLoad algorithm to use
+    CacheLoadModifier           _LOAD_MODIFIER,         ///< Cache load modifier for reading keys (and values)
+    RadixRankAlgorithm          _RANK_ALGORITHM,        ///< The radix ranking algorithm to use
+    BlockScanAlgorithm          _SCAN_ALGORITHM,        ///< The block scan algorithm to use
+    int                         _RADIX_BITS>            ///< The number of radix bits, i.e., log2(bins)
+struct AgentRadixSortDownsweepPolicy
+{
+    enum
+    {
+        BLOCK_THREADS           = _BLOCK_THREADS,           ///< Threads per thread block
+        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,        ///< Items per thread (per tile of input)
+        RADIX_BITS              = _RADIX_BITS,              ///< The number of radix bits, i.e., log2(bins)
+    };
+
+    static const BlockLoadAlgorithm  LOAD_ALGORITHM     = _LOAD_ALGORITHM;    ///< The BlockLoad algorithm to use
+    static const CacheLoadModifier   LOAD_MODIFIER      = _LOAD_MODIFIER;     ///< Cache load modifier for reading keys (and values)
+    static const RadixRankAlgorithm  RANK_ALGORITHM     = _RANK_ALGORITHM;    ///< The radix ranking algorithm to use
+    static const BlockScanAlgorithm  SCAN_ALGORITHM     = _SCAN_ALGORITHM;    ///< The BlockScan algorithm to use
+};
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+
+
+
+
+/**
+ * \brief AgentRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep.
+ */
+template <
+    typename AgentRadixSortDownsweepPolicy,     ///< Parameterized AgentRadixSortDownsweepPolicy tuning policy type
+    bool     IS_DESCENDING,                     ///< Whether or not the sorted-order is high-to-low
+    typename KeyT,                              ///< KeyT type
+    typename ValueT,                            ///< ValueT type
+    typename OffsetT>                           ///< Signed integer type for global offsets
+struct AgentRadixSortDownsweep
+{
+    //---------------------------------------------------------------------
+    // Type definitions and constants
+    //---------------------------------------------------------------------
+
+    // Appropriate unsigned-bits representation of KeyT
+    typedef typename Traits<KeyT>::UnsignedBits UnsignedBits;
+
+    static const UnsignedBits           LOWEST_KEY  = Traits<KeyT>::LOWEST_KEY;
+    static const UnsignedBits           MAX_KEY     = Traits<KeyT>::MAX_KEY;
+
+    static const BlockLoadAlgorithm     LOAD_ALGORITHM  = AgentRadixSortDownsweepPolicy::LOAD_ALGORITHM;
+    static const CacheLoadModifier      LOAD_MODIFIER   = AgentRadixSortDownsweepPolicy::LOAD_MODIFIER;
+    static const RadixRankAlgorithm     RANK_ALGORITHM  = AgentRadixSortDownsweepPolicy::RANK_ALGORITHM;
+    static const BlockScanAlgorithm     SCAN_ALGORITHM  = AgentRadixSortDownsweepPolicy::SCAN_ALGORITHM;
+
+    enum
+    {
+        BLOCK_THREADS           = AgentRadixSortDownsweepPolicy::BLOCK_THREADS,
+        ITEMS_PER_THREAD        = AgentRadixSortDownsweepPolicy::ITEMS_PER_THREAD,
+        RADIX_BITS              = AgentRadixSortDownsweepPolicy::RADIX_BITS,
+        TILE_ITEMS              = BLOCK_THREADS * ITEMS_PER_THREAD,
+
+        RADIX_DIGITS            = 1 << RADIX_BITS,
+        KEYS_ONLY               = Equals<ValueT, NullType>::VALUE,
+    };
+
+    // Input iterator wrapper types (for applying cache modifiers)
+    typedef CacheModifiedInputIterator<LOAD_MODIFIER, UnsignedBits, OffsetT>    KeysItr;
+    typedef CacheModifiedInputIterator<LOAD_MODIFIER, ValueT, OffsetT>          ValuesItr;
+
+    // Radix ranking type to use
+    typedef typename If<(RANK_ALGORITHM == RADIX_RANK_BASIC),
+            BlockRadixRank<BLOCK_THREADS, RADIX_BITS, IS_DESCENDING, false, SCAN_ALGORITHM>,
+            typename If<(RANK_ALGORITHM == RADIX_RANK_MEMOIZE),
+                BlockRadixRank<BLOCK_THREADS, RADIX_BITS, IS_DESCENDING, true, SCAN_ALGORITHM>,
+                BlockRadixRankMatch<BLOCK_THREADS, RADIX_BITS, IS_DESCENDING, SCAN_ALGORITHM>
+            >::Type
+        >::Type BlockRadixRankT;
+
+    enum
+    {
+        /// Number of bin-starting offsets tracked per thread
+        BINS_TRACKED_PER_THREAD = BlockRadixRankT::BINS_TRACKED_PER_THREAD
+    };
+
+    // BlockLoad type (keys)
+    typedef BlockLoad<
+        UnsignedBits,
+        BLOCK_THREADS,
+        ITEMS_PER_THREAD,
+        LOAD_ALGORITHM> BlockLoadKeysT;
+
+    // BlockLoad type (values)
+    typedef BlockLoad<
+        ValueT,
+        BLOCK_THREADS,
+        ITEMS_PER_THREAD,
+        LOAD_ALGORITHM> BlockLoadValuesT;
+
+    // Value exchange array type
+    typedef ValueT ValueExchangeT[TILE_ITEMS];
+
+    /**
+     * Shared memory storage layout
+     */
+    union __align__(16) _TempStorage
+    {
+        typename BlockLoadKeysT::TempStorage    load_keys;
+        typename BlockLoadValuesT::TempStorage  load_values;
+        typename BlockRadixRankT::TempStorage   radix_rank;
+
+        struct
+        {
+            UnsignedBits                        exchange_keys[TILE_ITEMS];
+            OffsetT                             relative_bin_offsets[RADIX_DIGITS];
+        };
+
+        Uninitialized<ValueExchangeT>           exchange_values;
+
+        OffsetT                                 exclusive_digit_prefix[RADIX_DIGITS];
+    };
+
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Thread fields
+    //---------------------------------------------------------------------
+
+    // Shared storage for this CTA
+    _TempStorage    &temp_storage;
+
+    // Input and output device pointers
+    KeysItr         d_keys_in;
+    ValuesItr       d_values_in;
+    UnsignedBits    *d_keys_out;
+    ValueT          *d_values_out;
+
+    // The global scatter base offset for each digit (valid in the first RADIX_DIGITS threads)
+    OffsetT         bin_offset[BINS_TRACKED_PER_THREAD];
+
+    // The least-significant bit position of the current digit to extract
+    int             current_bit;
+
+    // Number of bits in current digit
+    int             num_bits;
+
+    // Whether to short-circuit
+    int             short_circuit;
+
+    //---------------------------------------------------------------------
+    // Utility methods
+    //---------------------------------------------------------------------
+
+
+    /**
+     * Scatter ranked keys through shared memory, then to device-accessible memory
+     */
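+    // Keys are staged in digit order in shared memory first, so the global writes
+    // within each digit segment then land on consecutive addresses.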
+    template <bool FULL_TILE>
+    __device__ __forceinline__ void ScatterKeys(
+        UnsignedBits    (&twiddled_keys)[ITEMS_PER_THREAD],
+        OffsetT         (&relative_bin_offsets)[ITEMS_PER_THREAD],
+        int             (&ranks)[ITEMS_PER_THREAD],
+        OffsetT         valid_items)
+    {
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            temp_storage.exchange_keys[ranks[ITEM]] = twiddled_keys[ITEM];
+        }
+
+        CTA_SYNC();
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            UnsignedBits key            = temp_storage.exchange_keys[threadIdx.x + (ITEM * BLOCK_THREADS)];
+            UnsignedBits digit          = BFE(key, current_bit, num_bits);
+            relative_bin_offsets[ITEM]  = temp_storage.relative_bin_offsets[digit];
+
+            // Un-twiddle
+            key = Traits<KeyT>::TwiddleOut(key);
+
+            if (FULL_TILE || 
+                (static_cast<OffsetT>(threadIdx.x + (ITEM * BLOCK_THREADS)) < valid_items))
+            {
+                d_keys_out[relative_bin_offsets[ITEM] + threadIdx.x + (ITEM * BLOCK_THREADS)] = key;
+            }
+        }
+    }
+
+
+    /**
+     * Scatter ranked values through shared memory, then to device-accessible memory
+     */
+    template <bool FULL_TILE>
+    __device__ __forceinline__ void ScatterValues(
+        ValueT      (&values)[ITEMS_PER_THREAD],
+        OffsetT     (&relative_bin_offsets)[ITEMS_PER_THREAD],
+        int         (&ranks)[ITEMS_PER_THREAD],
+        OffsetT     valid_items)
+    {
+        CTA_SYNC();
+
+        ValueExchangeT &exchange_values = temp_storage.exchange_values.Alias();
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            exchange_values[ranks[ITEM]] = values[ITEM];
+        }
+
+        CTA_SYNC();
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            ValueT value = exchange_values[threadIdx.x + (ITEM * BLOCK_THREADS)];
+
+            if (FULL_TILE ||
+                (static_cast<OffsetT>(threadIdx.x + (ITEM * BLOCK_THREADS)) < valid_items))
+            {
+                d_values_out[relative_bin_offsets[ITEM] + threadIdx.x + (ITEM * BLOCK_THREADS)] = value;
+            }
+        }
+    }
+
+    /**
+     * Load a tile of keys (specialized for full tile, any ranking algorithm)
+     */
+    template <int _RANK_ALGORITHM>
+    __device__ __forceinline__ void LoadKeys(
+        UnsignedBits                (&keys)[ITEMS_PER_THREAD],
+        OffsetT                     block_offset,
+        OffsetT                     valid_items,
+        UnsignedBits                oob_item,
+        Int2Type<true>              is_full_tile,
+        Int2Type<_RANK_ALGORITHM>   rank_algorithm)
+    {
+        BlockLoadKeysT(temp_storage.load_keys).Load(
+            d_keys_in + block_offset, keys);
+
+        CTA_SYNC();
+    }
+
+
+    /**
+     * Load a tile of keys (specialized for partial tile, any ranking algorithm)
+     */
+    template <int _RANK_ALGORITHM>
+    __device__ __forceinline__ void LoadKeys(
+        UnsignedBits                (&keys)[ITEMS_PER_THREAD],
+        OffsetT                     block_offset,
+        OffsetT                     valid_items,
+        UnsignedBits                oob_item,
+        Int2Type<false>             is_full_tile,
+        Int2Type<_RANK_ALGORITHM>   rank_algorithm)
+    {
+        // Register pressure work-around: moving valid_items through shfl prevents compiler
+        // from reusing guards/addressing from prior guarded loads
+        valid_items = ShuffleIndex<CUB_PTX_WARP_THREADS>(valid_items, 0, 0xffffffff);
+
+        BlockLoadKeysT(temp_storage.load_keys).Load(
+            d_keys_in + block_offset, keys, valid_items, oob_item);
+
+        CTA_SYNC();
+    }
+
+
+    /**
+     * Load a tile of keys (specialized for full tile, match ranking algorithm)
+     */
+    __device__ __forceinline__ void LoadKeys(
+        UnsignedBits                (&keys)[ITEMS_PER_THREAD],
+        OffsetT                     block_offset,
+        OffsetT                     valid_items,
+        UnsignedBits                oob_item,
+        Int2Type<true>              is_full_tile,
+        Int2Type<RADIX_RANK_MATCH>  rank_algorithm)
+    {
+        LoadDirectWarpStriped(threadIdx.x, d_keys_in + block_offset, keys);
+    }
+
+
+    /**
+     * Load a tile of keys (specialized for partial tile, match ranking algorithm)
+     */
+    __device__ __forceinline__ void LoadKeys(
+        UnsignedBits                (&keys)[ITEMS_PER_THREAD],
+        OffsetT                     block_offset,
+        OffsetT                     valid_items,
+        UnsignedBits                oob_item,
+        Int2Type<false>             is_full_tile,
+        Int2Type<RADIX_RANK_MATCH>  rank_algorithm)
+    {
+        // Register pressure work-around: moving valid_items through shfl prevents compiler
+        // from reusing guards/addressing from prior guarded loads
+        valid_items = ShuffleIndex<CUB_PTX_WARP_THREADS>(valid_items, 0, 0xffffffff);
+
+        LoadDirectWarpStriped(threadIdx.x, d_keys_in + block_offset, keys, valid_items, oob_item);
+    }
+
+
+    /**
+     * Load a tile of values (specialized for full tile, any ranking algorithm)
+     */
+    template <int _RANK_ALGORITHM>
+    __device__ __forceinline__ void LoadValues(
+        ValueT                      (&values)[ITEMS_PER_THREAD],
+        OffsetT                     block_offset,
+        OffsetT                     valid_items,
+        Int2Type<true>              is_full_tile,
+        Int2Type<_RANK_ALGORITHM>   rank_algorithm)
+    {
+        BlockLoadValuesT(temp_storage.load_values).Load(
+            d_values_in + block_offset, values);
+
+        CTA_SYNC();
+    }
+
+
+    /**
+     * Load a tile of values (specialized for partial tile, any ranking algorithm)
+     */
+    template <int _RANK_ALGORITHM>
+    __device__ __forceinline__ void LoadValues(
+        ValueT                      (&values)[ITEMS_PER_THREAD],
+        OffsetT                     block_offset,
+        OffsetT                     valid_items,
+        Int2Type<false>             is_full_tile,
+        Int2Type<_RANK_ALGORITHM>   rank_algorithm)
+    {
+        // Register pressure work-around: moving valid_items through shfl prevents compiler
+        // from reusing guards/addressing from prior guarded loads
+        valid_items = ShuffleIndex<CUB_PTX_WARP_THREADS>(valid_items, 0, 0xffffffff);
+
+        BlockLoadValuesT(temp_storage.load_values).Load(
+            d_values_in + block_offset, values, valid_items);
+
+        CTA_SYNC();
+    }
+
+
+    /**
+     * Load a tile of items (specialized for full tile, match ranking algorithm)
+     */
+    __device__ __forceinline__ void LoadValues(
+        ValueT                      (&values)[ITEMS_PER_THREAD],
+        OffsetT                     block_offset,
+        OffsetT                     valid_items,
+        Int2Type<true>              is_full_tile,
+        Int2Type<RADIX_RANK_MATCH>  rank_algorithm)
+    {
+        LoadDirectWarpStriped(threadIdx.x, d_values_in + block_offset, values);
+    }
+
+
+    /**
+     * Load a tile of items (specialized for partial tile, match ranking algorithm)
+     */
+    __device__ __forceinline__ void LoadValues(
+        ValueT                      (&values)[ITEMS_PER_THREAD],
+        OffsetT                     block_offset,
+        OffsetT                     valid_items,
+        Int2Type<false>             is_full_tile,
+        Int2Type<RADIX_RANK_MATCH>  rank_algorithm)
+    {
+        // Register pressure work-around: moving valid_items through shfl prevents compiler
+        // from reusing guards/addressing from prior guarded loads
+        valid_items = ShuffleIndex<CUB_PTX_WARP_THREADS>(valid_items, 0, 0xffffffff);
+
+        LoadDirectWarpStriped(threadIdx.x, d_values_in + block_offset, values, valid_items);
+    }
+
+
+    /**
+     * Truck along associated values
+     */
+    template <bool FULL_TILE>
+    __device__ __forceinline__ void GatherScatterValues(
+        OffsetT         (&relative_bin_offsets)[ITEMS_PER_THREAD],
+        int             (&ranks)[ITEMS_PER_THREAD],
+        OffsetT         block_offset,
+        OffsetT         valid_items,
+        Int2Type<false> /*is_keys_only*/)
+    {
+        ValueT values[ITEMS_PER_THREAD];
+
+        CTA_SYNC();
+
+        LoadValues(
+            values,
+            block_offset,
+            valid_items,
+            Int2Type<FULL_TILE>(),
+            Int2Type<RANK_ALGORITHM>());
+
+        ScatterValues<FULL_TILE>(
+            values,
+            relative_bin_offsets,
+            ranks,
+            valid_items);
+    }
+
+
+    /**
+     * Truck along associated values (specialized for key-only sorting)
+     */
+    template <bool FULL_TILE>
+    __device__ __forceinline__ void GatherScatterValues(
+        OffsetT         (&/*relative_bin_offsets*/)[ITEMS_PER_THREAD],
+        int             (&/*ranks*/)[ITEMS_PER_THREAD],
+        OffsetT         /*block_offset*/,
+        OffsetT         /*valid_items*/,
+        Int2Type<true>  /*is_keys_only*/)
+    {}
+
+
+    /**
+     * Process tile
+     */
+    template <bool FULL_TILE>
+    __device__ __forceinline__ void ProcessTile(
+        OffsetT block_offset,
+        const OffsetT &valid_items = TILE_ITEMS)
+    {
+        UnsignedBits    keys[ITEMS_PER_THREAD];
+        int             ranks[ITEMS_PER_THREAD];
+        OffsetT         relative_bin_offsets[ITEMS_PER_THREAD];
+
+        // Assign default (min/max) value to all keys
+        UnsignedBits default_key = (IS_DESCENDING) ? LOWEST_KEY : MAX_KEY;
+
+        // Load tile of keys
+        LoadKeys(
+            keys,
+            block_offset,
+            valid_items, 
+            default_key,
+            Int2Type<FULL_TILE>(),
+            Int2Type<RANK_ALGORITHM>());
+
+        // Twiddle key bits if necessary
+        #pragma unroll
+        for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
+        {
+            keys[KEY] = Traits<KeyT>::TwiddleIn(keys[KEY]);
+        }
+
+        // Rank the twiddled keys
+        int exclusive_digit_prefix[BINS_TRACKED_PER_THREAD];
+        BlockRadixRankT(temp_storage.radix_rank).RankKeys(
+            keys,
+            ranks,
+            current_bit,
+            num_bits,
+            exclusive_digit_prefix);
+
+        CTA_SYNC();
+
+        // Share exclusive digit prefix
+        #pragma unroll
+        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+        {
+            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
+            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
+            {
+                // Store exclusive prefix
+                temp_storage.exclusive_digit_prefix[bin_idx] =
+                    exclusive_digit_prefix[track];
+            }
+        }
+
+        CTA_SYNC();
+
+        // Get inclusive digit prefix
+        int inclusive_digit_prefix[BINS_TRACKED_PER_THREAD];
+
+        #pragma unroll
+        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+        {
+            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
+            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
+            {
+                if (IS_DESCENDING)
+                {
+                    // Get inclusive digit prefix from exclusive prefix (higher bins come first)
+                    inclusive_digit_prefix[track] = (bin_idx == 0) ?
+                        (BLOCK_THREADS * ITEMS_PER_THREAD) :
+                        temp_storage.exclusive_digit_prefix[bin_idx - 1];
+                }
+                else
+                {
+                    // Get inclusive digit prefix from exclusive prefix (lower bins come first)
+                    inclusive_digit_prefix[track] = (bin_idx == RADIX_DIGITS - 1) ?
+                        (BLOCK_THREADS * ITEMS_PER_THREAD) :
+                        temp_storage.exclusive_digit_prefix[bin_idx + 1];
+                }
+            }
+        }
+
+        CTA_SYNC();
+
+        // Update global scatter base offsets for each digit
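+        // E.g., if digit d has exclusive prefix 10 and inclusive prefix 25 in this
+        // tile, an item ranked r in [10, 25) scatters to base + (r - 10), and the
+        // running base then advances by the 15 instances of d consumed here.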
+        #pragma unroll
+        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+        {
+            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
+            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
+            {
+                bin_offset[track] -= exclusive_digit_prefix[track];
+                temp_storage.relative_bin_offsets[bin_idx] = bin_offset[track];
+                bin_offset[track] += inclusive_digit_prefix[track];
+            }
+        }
+
+        CTA_SYNC();
+
+        // Scatter keys
+        ScatterKeys<FULL_TILE>(keys, relative_bin_offsets, ranks, valid_items);
+
+        // Gather/scatter values
+        GatherScatterValues<FULL_TILE>(relative_bin_offsets, ranks, block_offset, valid_items, Int2Type<KEYS_ONLY>());
+    }
+
+    //---------------------------------------------------------------------
+    // Copy shortcut
+    //---------------------------------------------------------------------
+
+    /**
+     * Copy tiles within the range of input
+     */
+    template <
+        typename InputIteratorT,
+        typename T>
+    __device__ __forceinline__ void Copy(
+        InputIteratorT  d_in,
+        T               *d_out,
+        OffsetT         block_offset,
+        OffsetT         block_end)
+    {
+        // Simply copy the input
+        while (block_offset + TILE_ITEMS <= block_end)
+        {
+            T items[ITEMS_PER_THREAD];
+
+            LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_in + block_offset, items);
+            CTA_SYNC();
+            StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_out + block_offset, items);
+
+            block_offset += TILE_ITEMS;
+        }
+
+        // Clean up last partial tile with guarded-I/O
+        if (block_offset < block_end)
+        {
+            OffsetT valid_items = block_end - block_offset;
+
+            T items[ITEMS_PER_THREAD];
+
+            LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_in + block_offset, items, valid_items);
+            CTA_SYNC();
+            StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_out + block_offset, items, valid_items);
+        }
+    }
+
+
+    /**
+     * Copy tiles within the range of input (specialized for NullType)
+     */
+    template <typename InputIteratorT>
+    __device__ __forceinline__ void Copy(
+        InputIteratorT  /*d_in*/,
+        NullType        * /*d_out*/,
+        OffsetT         /*block_offset*/,
+        OffsetT         /*block_end*/)
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Interface
+    //---------------------------------------------------------------------
+
+    /**
+     * Constructor
+     */
+    __device__ __forceinline__ AgentRadixSortDownsweep(
+        TempStorage     &temp_storage,
+        OffsetT         (&bin_offset)[BINS_TRACKED_PER_THREAD],
+        OffsetT         num_items,
+        const KeyT      *d_keys_in,
+        KeyT            *d_keys_out,
+        const ValueT    *d_values_in,
+        ValueT          *d_values_out,
+        int             current_bit,
+        int             num_bits)
+    :
+        temp_storage(temp_storage.Alias()),
+        d_keys_in(reinterpret_cast<const UnsignedBits*>(d_keys_in)),
+        d_values_in(d_values_in),
+        d_keys_out(reinterpret_cast<UnsignedBits*>(d_keys_out)),
+        d_values_out(d_values_out),
+        current_bit(current_bit),
+        num_bits(num_bits),
+        short_circuit(1)
+    {
+        #pragma unroll
+        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+        {
+            this->bin_offset[track] = bin_offset[track];
+
+            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
+            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
+            {
+                // Short-circuit if every bin count is either zero or the problem size
+                short_circuit = short_circuit && ((bin_offset[track] == 0) || (bin_offset[track] == num_items));
+            }
+        }
+
+        short_circuit = CTA_SYNC_AND(short_circuit);
+    }
+
+
+    /**
+     * Constructor
+     */
+    __device__ __forceinline__ AgentRadixSortDownsweep(
+        TempStorage     &temp_storage,
+        OffsetT         num_items,
+        OffsetT         *d_spine,
+        const KeyT      *d_keys_in,
+        KeyT            *d_keys_out,
+        const ValueT    *d_values_in,
+        ValueT          *d_values_out,
+        int             current_bit,
+        int             num_bits)
+    :
+        temp_storage(temp_storage.Alias()),
+        d_keys_in(reinterpret_cast<const UnsignedBits*>(d_keys_in)),
+        d_values_in(d_values_in),
+        d_keys_out(reinterpret_cast<UnsignedBits*>(d_keys_out)),
+        d_values_out(d_values_out),
+        current_bit(current_bit),
+        num_bits(num_bits),
+        short_circuit(1)
+    {
+        #pragma unroll
+        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+        {
+            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
+
+            // Load digit bin offsets (each of the first RADIX_DIGITS threads will load an offset for that digit)
+            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
+            {
+                if (IS_DESCENDING)
+                    bin_idx = RADIX_DIGITS - bin_idx - 1;
+
+                // Short-circuit if the first block's bin counts are all either zero or the problem size
+                OffsetT first_block_bin_offset = d_spine[gridDim.x * bin_idx];
+                short_circuit = short_circuit && ((first_block_bin_offset == 0) || (first_block_bin_offset == num_items));
+
+                // Load my block's bin offset for my bin
+                bin_offset[track] = d_spine[(gridDim.x * bin_idx) + blockIdx.x];
+            }
+        }
+
+        short_circuit = CTA_SYNC_AND(short_circuit);
+    }
+
+
+    /**
+     * Distribute keys from a segment of input tiles.
+     */
+    __device__ __forceinline__ void ProcessRegion(
+        OffsetT   block_offset,
+        OffsetT   block_end)
+    {
+        if (short_circuit)
+        {
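+            // Every key shares the same digit value for these bits, so this pass
+            // cannot reorder anything: pass keys and values straight through.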
+            // Copy keys
+            Copy(d_keys_in, d_keys_out, block_offset, block_end);
+
+            // Copy values
+            Copy(d_values_in, d_values_out, block_offset, block_end);
+        }
+        else
+        {
+            // Process full tiles of tile_items
+            #pragma unroll 1
+            while (block_offset + TILE_ITEMS <= block_end)
+            {
+                ProcessTile<true>(block_offset);
+                block_offset += TILE_ITEMS;
+
+                CTA_SYNC();
+            }
+
+            // Clean up last partial tile with guarded-I/O
+            if (block_offset < block_end)
+            {
+                ProcessTile<false>(block_offset, block_end - block_offset);
+            }
+
+        }
+    }
+
+};
+
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/SRC/cub/block_range/block_range_radix_sort_upsweep.cuh b/SRC/cub/agent/agent_radix_sort_upsweep.cuh
similarity index 66%
rename from SRC/cub/block_range/block_range_radix_sort_upsweep.cuh
rename to SRC/cub/agent/agent_radix_sort_upsweep.cuh
index faadbd3f..2081cefb 100644
--- a/SRC/cub/block_range/block_range_radix_sort_upsweep.cuh
+++ b/SRC/cub/agent/agent_radix_sort_upsweep.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -28,13 +28,14 @@
 
 /**
  * \file
- * BlockRangeRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep across a range of tiles.
+ * AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep.
  */
 
 #pragma once
 
 #include "../thread/thread_reduce.cuh"
 #include "../thread/thread_load.cuh"
+#include "../warp/warp_reduce.cuh"
 #include "../block/block_load.cuh"
 #include "../util_type.cuh"
 #include "../iterator/cache_modified_input_iterator.cuh"
@@ -51,14 +52,14 @@ namespace cub {
  ******************************************************************************/
 
 /**
- * Parameterizable tuning policy type for BlockRangeRadixSortUpsweep
+ * Parameterizable tuning policy type for AgentRadixSortUpsweep
  */
 template <
     int                 _BLOCK_THREADS,     ///< Threads per thread block
     int                 _ITEMS_PER_THREAD,  ///< Items per thread (per tile of input)
     CacheLoadModifier   _LOAD_MODIFIER,     ///< Cache load modifier for reading keys
     int                 _RADIX_BITS>        ///< The number of radix bits, i.e., log2(bins)
-struct BlockRangeRadixSortUpsweepPolicy
+struct AgentRadixSortUpsweepPolicy
 {
     enum
     {
@@ -76,20 +77,20 @@ struct BlockRangeRadixSortUpsweepPolicy
  ******************************************************************************/
 
 /**
- * \brief BlockRangeRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep across a range of tiles.
+ * \brief AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep.
  */
 template <
-    typename BlockRangeRadixSortUpsweepPolicy,      ///< Parameterized BlockRangeRadixSortUpsweepPolicy tuning policy type
-    typename Key,                                   ///< Key type
-    typename Offset>                                ///< Signed integer type for global offsets
-struct BlockRangeRadixSortUpsweep
+    typename AgentRadixSortUpsweepPolicy,   ///< Parameterized AgentRadixSortUpsweepPolicy tuning policy type
+    typename KeyT,                          ///< KeyT type
+    typename OffsetT>                       ///< Signed integer type for global offsets
+struct AgentRadixSortUpsweep
 {
 
     //---------------------------------------------------------------------
     // Type definitions and constants
     //---------------------------------------------------------------------
 
-    typedef typename Traits<Key>::UnsignedBits UnsignedBits;
+    typedef typename Traits<KeyT>::UnsignedBits UnsignedBits;
 
     // Integer type for digit counters (to be packed into words of PackedCounters)
     typedef unsigned char DigitCounter;
@@ -97,13 +98,13 @@ struct BlockRangeRadixSortUpsweep
     // Integer type for packing DigitCounters into columns of shared memory banks
     typedef unsigned int PackedCounter;
 
-    static const CacheLoadModifier LOAD_MODIFIER = BlockRangeRadixSortUpsweepPolicy::LOAD_MODIFIER;
+    static const CacheLoadModifier LOAD_MODIFIER = AgentRadixSortUpsweepPolicy::LOAD_MODIFIER;
 
     enum
     {
-        RADIX_BITS              = BlockRangeRadixSortUpsweepPolicy::RADIX_BITS,
-        BLOCK_THREADS           = BlockRangeRadixSortUpsweepPolicy::BLOCK_THREADS,
-        KEYS_PER_THREAD         = BlockRangeRadixSortUpsweepPolicy::ITEMS_PER_THREAD,
+        RADIX_BITS              = AgentRadixSortUpsweepPolicy::RADIX_BITS,
+        BLOCK_THREADS           = AgentRadixSortUpsweepPolicy::BLOCK_THREADS,
+        KEYS_PER_THREAD         = AgentRadixSortUpsweepPolicy::ITEMS_PER_THREAD,
 
         RADIX_DIGITS            = 1 << RADIX_BITS,
 
@@ -134,20 +135,17 @@ struct BlockRangeRadixSortUpsweep
     };
 
 
-    // Input iterator wrapper types
-    typedef CacheModifiedInputIterator<LOAD_MODIFIER, UnsignedBits, Offset>  KeysItr;
+    // Input iterator wrapper types (for applying cache modifiers)
+    typedef CacheModifiedInputIterator<LOAD_MODIFIER, UnsignedBits, OffsetT> KeysItr;
 
     /**
      * Shared memory storage layout
      */
-    struct _TempStorage
+    union __align__(16) _TempStorage
     {
-        union
-        {
-            DigitCounter    digit_counters[COUNTER_LANES][BLOCK_THREADS][PACKING_RATIO];
-            PackedCounter   packed_counters[COUNTER_LANES][BLOCK_THREADS];
-            Offset          digit_partials[RADIX_DIGITS][WARP_THREADS + 1];
-        };
+        DigitCounter    thread_counters[COUNTER_LANES][BLOCK_THREADS][PACKING_RATIO];
+        PackedCounter   packed_thread_counters[COUNTER_LANES][BLOCK_THREADS];
+        OffsetT         block_counters[WARP_THREADS][RADIX_DIGITS];
     };
 
 
@@ -163,7 +161,7 @@ struct BlockRangeRadixSortUpsweep
     _TempStorage    &temp_storage;
 
     // Thread-local counters for periodically aggregating composite-counter lanes
-    Offset          local_counts[LANES_PER_WARP][PACKING_RATIO];
+    OffsetT         local_counts[LANES_PER_WARP][PACKING_RATIO];
 
     // Input and output device pointers
     KeysItr         d_keys_in;
@@ -186,8 +184,8 @@ struct BlockRangeRadixSortUpsweep
     {
         // BucketKeys
         static __device__ __forceinline__ void BucketKeys(
-            BlockRangeRadixSortUpsweep     &cta,
-            UnsignedBits                    keys[KEYS_PER_THREAD])
+            AgentRadixSortUpsweep       &cta,
+            UnsignedBits                keys[KEYS_PER_THREAD])
         {
             cta.Bucket(keys[COUNT]);
 
@@ -201,7 +199,7 @@ struct BlockRangeRadixSortUpsweep
     struct Iterate
     {
         // BucketKeys
-        static __device__ __forceinline__ void BucketKeys(BlockRangeRadixSortUpsweep &cta, UnsignedBits keys[KEYS_PER_THREAD]) {}
+        static __device__ __forceinline__ void BucketKeys(AgentRadixSortUpsweep &/*cta*/, UnsignedBits /*keys*/[KEYS_PER_THREAD]) {}
     };
 
 
@@ -215,7 +213,7 @@ struct BlockRangeRadixSortUpsweep
     __device__ __forceinline__ void Bucket(UnsignedBits key)
     {
         // Perform transform op
-        UnsignedBits converted_key = Traits<Key>::TwiddleIn(key);
+        UnsignedBits converted_key = Traits<KeyT>::TwiddleIn(key);
 
         // Extract current digit bits
         UnsignedBits digit = BFE(converted_key, current_bit, num_bits);
@@ -227,7 +225,7 @@ struct BlockRangeRadixSortUpsweep
         UnsignedBits row_offset = digit >> LOG_PACKING_RATIO;
 
         // Increment counter
-        temp_storage.digit_counters[row_offset][threadIdx.x][sub_counter]++;
+        temp_storage.thread_counters[row_offset][threadIdx.x][sub_counter]++;
     }
 
 
@@ -239,7 +237,7 @@ struct BlockRangeRadixSortUpsweep
         #pragma unroll
         for (int LANE = 0; LANE < COUNTER_LANES; LANE++)
         {
-            temp_storage.packed_counters[LANE][threadIdx.x] = 0;
+            temp_storage.packed_thread_counters[LANE][threadIdx.x] = 0;
         }
     }
 
@@ -268,7 +266,7 @@ struct BlockRangeRadixSortUpsweep
     __device__ __forceinline__ void UnpackDigitCounts()
     {
         unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS;
-        unsigned int warp_tid = threadIdx.x & (WARP_THREADS - 1);
+        unsigned int warp_tid = LaneId();
 
         #pragma unroll
         for (int LANE = 0; LANE < LANES_PER_WARP; LANE++)
@@ -282,7 +280,7 @@ struct BlockRangeRadixSortUpsweep
                     #pragma unroll
                     for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++)
                     {
-                        Offset counter = temp_storage.digit_counters[counter_lane][warp_tid + PACKED_COUNTER][UNPACKED_COUNTER];
+                        OffsetT counter = temp_storage.thread_counters[counter_lane][warp_tid + PACKED_COUNTER][UNPACKED_COUNTER];
                         local_counts[LANE][UNPACKED_COUNTER] += counter;
                     }
                 }
@@ -291,48 +289,10 @@ struct BlockRangeRadixSortUpsweep
     }
 
 
-    /**
-     * Places unpacked counters into smem for final digit reduction
-     */
-    __device__ __forceinline__ void ReduceUnpackedCounts(Offset &bin_count)
-    {
-        unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS;
-        unsigned int warp_tid = threadIdx.x & (WARP_THREADS - 1);
-
-        // Place unpacked digit counters in shared memory
-        #pragma unroll
-        for (int LANE = 0; LANE < LANES_PER_WARP; LANE++)
-        {
-            int counter_lane = (LANE * WARPS) + warp_id;
-            if (counter_lane < COUNTER_LANES)
-            {
-                int digit_row = counter_lane << LOG_PACKING_RATIO;
-
-                #pragma unroll
-                for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++)
-                {
-                    temp_storage.digit_partials[digit_row + UNPACKED_COUNTER][warp_tid] =
-                        local_counts[LANE][UNPACKED_COUNTER];
-                }
-            }
-        }
-
-        __syncthreads();
-
-        // Rake-reduce bin_count reductions
-        if (threadIdx.x < RADIX_DIGITS)
-        {
-            bin_count = ThreadReduce<WARP_THREADS>(
-                temp_storage.digit_partials[threadIdx.x],
-                Sum());
-        }
-    }
-
-
     /**
      * Processes a single, full tile
      */
-    __device__ __forceinline__ void ProcessFullTile(Offset block_offset)
+    __device__ __forceinline__ void ProcessFullTile(OffsetT block_offset)
     {
         // Tile of keys
         UnsignedBits keys[KEYS_PER_THREAD];
@@ -340,8 +300,7 @@ struct BlockRangeRadixSortUpsweep
         LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_keys_in + block_offset, keys);
 
         // Prevent hoisting
-//        __threadfence_block();
-//        __syncthreads();
+        CTA_SYNC();
 
         // Bucket tile of keys
         Iterate<0, KEYS_PER_THREAD>::BucketKeys(*this, keys);
@@ -352,8 +311,8 @@ struct BlockRangeRadixSortUpsweep
      * Processes a single load (may have some threads masked off)
      */
     __device__ __forceinline__ void ProcessPartialTile(
-        Offset block_offset,
-        const Offset &block_end)
+        OffsetT block_offset,
+        const OffsetT &block_end)
     {
         // Process partial tile if necessary using single loads
         block_offset += threadIdx.x;
@@ -374,14 +333,14 @@ struct BlockRangeRadixSortUpsweep
     /**
      * Constructor
      */
-    __device__ __forceinline__ BlockRangeRadixSortUpsweep(
+    __device__ __forceinline__ AgentRadixSortUpsweep(
         TempStorage &temp_storage,
-        Key         *d_keys_in,
+        const KeyT  *d_keys_in,
         int         current_bit,
         int         num_bits)
     :
         temp_storage(temp_storage.Alias()),
-        d_keys_in(reinterpret_cast<UnsignedBits*>(d_keys_in)),
+        d_keys_in(reinterpret_cast<const UnsignedBits*>(d_keys_in)),
         current_bit(current_bit),
         num_bits(num_bits)
     {}
@@ -391,9 +350,8 @@ struct BlockRangeRadixSortUpsweep
      * Compute radix digit histograms from a segment of input tiles.
      */
     __device__ __forceinline__ void ProcessRegion(
-        Offset           block_offset,
-        const Offset     &block_end,
-        Offset           &bin_count)                ///< [out] The digit count for tid'th bin (output param, valid in the first RADIX_DIGITS threads)
+        OffsetT          block_offset,
+        const OffsetT    &block_end)
     {
         // Reset digit counters in smem and unpacked counters in registers
         ResetDigitCounters();
@@ -408,12 +366,12 @@ struct BlockRangeRadixSortUpsweep
                 block_offset += TILE_ITEMS;
             }
 
-            __syncthreads();
+            CTA_SYNC();
 
             // Aggregate back into local_count registers to prevent overflow
             UnpackDigitCounts();
 
-            __syncthreads();
+            CTA_SYNC();
 
             // Reset composite counters in lanes
             ResetDigitCounters();
@@ -431,15 +389,133 @@ struct BlockRangeRadixSortUpsweep
             block_offset,
             block_end);
 
-        __syncthreads();
+        CTA_SYNC();
 
         // Aggregate back into local_count registers
         UnpackDigitCounts();
+    }
+
+
+    /**
+     * Extract counts (saving them to the external array)
+     */
+    template <bool IS_DESCENDING>
+    __device__ __forceinline__ void ExtractCounts(
+        OffsetT     *counters,
+        int         bin_stride = 1,
+        int         bin_offset = 0)
+    {
+        unsigned int warp_id    = threadIdx.x >> LOG_WARP_THREADS;
+        unsigned int warp_tid   = LaneId();
+
+        // Place unpacked digit counters in shared memory
+        #pragma unroll
+        for (int LANE = 0; LANE < LANES_PER_WARP; LANE++)
+        {
+            int counter_lane = (LANE * WARPS) + warp_id;
+            if (counter_lane < COUNTER_LANES)
+            {
+                int digit_row = counter_lane << LOG_PACKING_RATIO;
 
-        __syncthreads();
+                #pragma unroll
+                for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++)
+                {
+                    int bin_idx = digit_row + UNPACKED_COUNTER;
+
+                    temp_storage.block_counters[warp_tid][bin_idx] =
+                        local_counts[LANE][UNPACKED_COUNTER];
+                }
+            }
+        }
 
-        // Final raking reduction of counts by bin
-        ReduceUnpackedCounts(bin_count);
+        CTA_SYNC();
+
+        // Rake-reduce bin_count reductions
+
+        // Whole blocks
+        #pragma unroll
+        for (int BIN_BASE   = RADIX_DIGITS % BLOCK_THREADS;
+            (BIN_BASE + BLOCK_THREADS) <= RADIX_DIGITS;
+            BIN_BASE += BLOCK_THREADS)
+        {
+            int bin_idx = BIN_BASE + threadIdx.x;
+
+            OffsetT bin_count = 0;
+            #pragma unroll
+            for (int i = 0; i < WARP_THREADS; ++i)
+                bin_count += temp_storage.block_counters[i][bin_idx];
+
+            if (IS_DESCENDING)
+                bin_idx = RADIX_DIGITS - bin_idx - 1;
+
+            counters[(bin_stride * bin_idx) + bin_offset] = bin_count;
+        }
+
+        // Remainder
+        if ((RADIX_DIGITS % BLOCK_THREADS != 0) && (threadIdx.x < RADIX_DIGITS))
+        {
+            int bin_idx = threadIdx.x;
+
+            OffsetT bin_count = 0;
+            #pragma unroll
+            for (int i = 0; i < WARP_THREADS; ++i)
+                bin_count += temp_storage.block_counters[i][bin_idx];
+
+            if (IS_DESCENDING)
+                bin_idx = RADIX_DIGITS - bin_idx - 1;
+
+            counters[(bin_stride * bin_idx) + bin_offset] = bin_count;
+        }
+    }
+
+
+    /**
+     * Extract counts
+     */
+    template <int BINS_TRACKED_PER_THREAD>
+    __device__ __forceinline__ void ExtractCounts(
+        OffsetT (&bin_count)[BINS_TRACKED_PER_THREAD])  ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1]
+    {
+        unsigned int warp_id    = threadIdx.x >> LOG_WARP_THREADS;
+        unsigned int warp_tid   = LaneId();
+
+        // Place unpacked digit counters in shared memory
+        #pragma unroll
+        for (int LANE = 0; LANE < LANES_PER_WARP; LANE++)
+        {
+            int counter_lane = (LANE * WARPS) + warp_id;
+            if (counter_lane < COUNTER_LANES)
+            {
+                int digit_row = counter_lane << LOG_PACKING_RATIO;
+
+                #pragma unroll
+                for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++)
+                {
+                    int bin_idx = digit_row + UNPACKED_COUNTER;
+
+                    temp_storage.block_counters[warp_tid][bin_idx] =
+                        local_counts[LANE][UNPACKED_COUNTER];
+                }
+            }
+        }
+
+        CTA_SYNC();
+
+        // Rake-reduce bin_count reductions
+        #pragma unroll
+        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+        {
+            int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track;
+
+            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
+            {
+                bin_count[track] = 0;
+
+                #pragma unroll
+                for (int i = 0; i < WARP_THREADS; ++i)
+                    bin_count[track] += temp_storage.block_counters[i][bin_idx];
+            }
+        }
     }
 
 };
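
The upsweep agent above packs several narrow digit counters into one machine word (PACKING_RATIO sub-counters per PackedCounter) and periodically unpacks them into per-warp registers before they can overflow; ExtractCounts then rakes the per-warp rows into one count per digit. A minimal host-side sketch of the packed-counter idea, assuming 8-bit sub-counters in a 32-bit word (names and layout are ours, for illustration only):

#include <cstdint>
#include <cstdio>

// Illustrative sketch of CUB's packed digit counters; not part of the patch.
int main()
{
    // One PackedCounter holding PACKING_RATIO = 4 DigitCounter sub-counters
    uint32_t packed = 0;
    uint8_t *digit_counters = reinterpret_cast<uint8_t*>(&packed);

    // Bucket a few keys by their 2-bit digit
    int sample_digits[] = {2, 0, 2, 3, 1, 2};
    for (int i = 0; i < 6; ++i)
        digit_counters[sample_digits[i]]++;   // increment that digit's sub-counter

    // Unpack: each 8-bit lane is one digit's count (digit 2 appears three times)
    for (int d = 0; d < 4; ++d)
        printf("digit %d: %u\n", d, digit_counters[d]);
    return 0;
}
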
diff --git a/SRC/cub/agent/agent_reduce.cuh b/SRC/cub/agent/agent_reduce.cuh
new file mode 100644
index 00000000..000a905c
--- /dev/null
+++ b/SRC/cub/agent/agent_reduce.cuh
@@ -0,0 +1,385 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::AgentReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "../block/block_load.cuh"
+#include "../block/block_reduce.cuh"
+#include "../grid/grid_mapping.cuh"
+#include "../grid/grid_even_share.cuh"
+#include "../util_type.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+#include "../util_namespace.cuh"
+
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for AgentReduce
+ */
+template <
+    int                     _BLOCK_THREADS,         ///< Threads per thread block
+    int                     _ITEMS_PER_THREAD,      ///< Items per thread (per tile of input)
+    int                     _VECTOR_LOAD_LENGTH,    ///< Number of items per vectorized load
+    BlockReduceAlgorithm    _BLOCK_ALGORITHM,       ///< Cooperative block-wide reduction algorithm to use
+    CacheLoadModifier       _LOAD_MODIFIER>         ///< Cache load modifier for reading input elements
+struct AgentReducePolicy
+{
+    enum
+    {
+        BLOCK_THREADS       = _BLOCK_THREADS,       ///< Threads per thread block
+        ITEMS_PER_THREAD    = _ITEMS_PER_THREAD,    ///< Items per thread (per tile of input)
+        VECTOR_LOAD_LENGTH  = _VECTOR_LOAD_LENGTH,  ///< Number of items per vectorized load
+    };
+
+    static const BlockReduceAlgorithm  BLOCK_ALGORITHM      = _BLOCK_ALGORITHM;     ///< Cooperative block-wide reduction algorithm to use
+    static const CacheLoadModifier     LOAD_MODIFIER        = _LOAD_MODIFIER;       ///< Cache load modifier for reading input elements
+};
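
A policy such as the one above is only a compile-time bundle of tuning constants on which the agent is specialized. A hypothetical instantiation (the numbers are illustrative, not a tuned configuration for any particular GPU):

// Illustrative sketch; values are assumptions, not from this patch.
// 256 threads/block, 16 items/thread, 4-item vector loads,
// warp-reduction block algorithm, LDG cache-modified loads.
typedef AgentReducePolicy<256, 16, 4,
                          BLOCK_REDUCE_WARP_REDUCTIONS,
                          LOAD_LDG>
    ExampleReducePolicy;
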
+
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief AgentReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction.
+ *
+ * Each thread reduces only the values it loads. If \p IS_FIRST_TILE, this
+ * partial reduction is stored into \p thread_aggregate.  Otherwise it is
+ * accumulated into \p thread_aggregate.
+ */
+template <
+    typename AgentReducePolicy,        ///< Parameterized AgentReducePolicy tuning policy type
+    typename InputIteratorT,           ///< Random-access iterator type for input
+    typename OutputIteratorT,          ///< Random-access iterator type for output
+    typename OffsetT,                  ///< Signed integer type for global offsets
+    typename ReductionOp>              ///< Binary reduction operator type having member T operator()(const T &a, const T &b)
+struct AgentReduce
+{
+
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    /// The input value type
+    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+    /// The output value type
+    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
+        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+    /// Vector type of InputT for data movement
+    typedef typename CubVector<InputT, AgentReducePolicy::VECTOR_LOAD_LENGTH>::Type VectorT;
+
+    /// Input iterator wrapper type (for applying cache modifier)
+    typedef typename If<IsPointer<InputIteratorT>::VALUE,
+            CacheModifiedInputIterator<AgentReducePolicy::LOAD_MODIFIER, InputT, OffsetT>,  // Wrap the native input pointer with CacheModifiedInputIterator
+            InputIteratorT>::Type                                                           // Directly use the supplied input iterator type
+        WrappedInputIteratorT;
+
+    /// Constants
+    enum
+    {
+        BLOCK_THREADS       = AgentReducePolicy::BLOCK_THREADS,
+        ITEMS_PER_THREAD    = AgentReducePolicy::ITEMS_PER_THREAD,
+        VECTOR_LOAD_LENGTH  = CUB_MIN(ITEMS_PER_THREAD, AgentReducePolicy::VECTOR_LOAD_LENGTH),
+        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
+
+        // Can vectorize according to the policy if the input iterator is a native pointer to a primitive type
+        ATTEMPT_VECTORIZATION   = (VECTOR_LOAD_LENGTH > 1) &&
+                                    (ITEMS_PER_THREAD % VECTOR_LOAD_LENGTH == 0) &&
+                                    (IsPointer<InputIteratorT>::VALUE) && Traits<InputT>::PRIMITIVE,
+
+    };
+
+    static const CacheLoadModifier    LOAD_MODIFIER   = AgentReducePolicy::LOAD_MODIFIER;
+    static const BlockReduceAlgorithm BLOCK_ALGORITHM = AgentReducePolicy::BLOCK_ALGORITHM;
+
+    /// Parameterized BlockReduce primitive
+    typedef BlockReduce<OutputT, BLOCK_THREADS, AgentReducePolicy::BLOCK_ALGORITHM> BlockReduceT;
+
+    /// Shared memory type required by this thread block
+    struct _TempStorage
+    {
+        typename BlockReduceT::TempStorage  reduce;
+    };
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    _TempStorage&           temp_storage;       ///< Reference to temp_storage
+    InputIteratorT          d_in;               ///< Input data to reduce
+    WrappedInputIteratorT   d_wrapped_in;       ///< Wrapped input data to reduce
+    ReductionOp             reduction_op;       ///< Binary reduction operator
+
+
+    //---------------------------------------------------------------------
+    // Utility
+    //---------------------------------------------------------------------
+
+
+    // Whether or not the input is aligned with the vector type (specialized for types we can vectorize)
+    template <typename Iterator>
+    static __device__ __forceinline__ bool IsAligned(
+        Iterator        d_in,
+        Int2Type<true>  /*can_vectorize*/)
+    {
+        return (size_t(d_in) & (sizeof(VectorT) - 1)) == 0;
+    }
+
+    // Whether or not the input is aligned with the vector type (specialized for types we cannot vectorize)
+    template <typename Iterator>
+    static __device__ __forceinline__ bool IsAligned(
+        Iterator        /*d_in*/,
+        Int2Type<false> /*can_vectorize*/)
+    {
+        return false;
+    }
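
The two Int2Type overloads let the compiler discard the alignment test entirely when vectorization was never attempted; when it is attempted, the check is a single mask against the vector width. The same test in isolation (16 stands in for sizeof(VectorT); an assumed constant, not taken from CUB):

#include <cstdint>

// Illustrative sketch; 16-byte width is an assumption for this example.
// True when ptr can service 16-byte vector loads.
static inline bool is_vec_aligned(const void *ptr)
{
    return (reinterpret_cast<uintptr_t>(ptr) & (16 - 1)) == 0;
}
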
+
+
+    //---------------------------------------------------------------------
+    // Constructor
+    //---------------------------------------------------------------------
+
+    /**
+     * Constructor
+     */
+    __device__ __forceinline__ AgentReduce(
+        TempStorage&            temp_storage,       ///< Reference to temp_storage
+        InputIteratorT          d_in,               ///< Input data to reduce
+        ReductionOp             reduction_op)       ///< Binary reduction operator
+    :
+        temp_storage(temp_storage.Alias()),
+        d_in(d_in),
+        d_wrapped_in(d_in),
+        reduction_op(reduction_op)
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Tile consumption
+    //---------------------------------------------------------------------
+
+    /**
+     * Consume a full tile of input (non-vectorized)
+     */
+    template <int IS_FIRST_TILE>
+    __device__ __forceinline__ void ConsumeTile(
+        OutputT                 &thread_aggregate,
+        OffsetT                 block_offset,       ///< The offset the tile to consume
+        int                     /*valid_items*/,    ///< The number of valid items in the tile
+        Int2Type<true>          /*is_full_tile*/,   ///< Whether or not this is a full tile
+        Int2Type<false>         /*can_vectorize*/)  ///< Whether or not we can vectorize loads
+    {
+        OutputT items[ITEMS_PER_THREAD];
+
+        // Load items in striped fashion
+        LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_wrapped_in + block_offset, items);
+
+        // Reduce items within each thread stripe
+        thread_aggregate = (IS_FIRST_TILE) ?
+            internal::ThreadReduce(items, reduction_op) :
+            internal::ThreadReduce(items, reduction_op, thread_aggregate);
+    }
+
+
+    /**
+     * Consume a full tile of input (vectorized)
+     */
+    template <int IS_FIRST_TILE>
+    __device__ __forceinline__ void ConsumeTile(
+        OutputT                 &thread_aggregate,
+        OffsetT                 block_offset,       ///< The offset the tile to consume
+        int                     /*valid_items*/,    ///< The number of valid items in the tile
+        Int2Type<true>          /*is_full_tile*/,   ///< Whether or not this is a full tile
+        Int2Type<true>          /*can_vectorize*/)  ///< Whether or not we can vectorize loads
+    {
+        // Alias items as an array of VectorT and load it in striped fashion
+        enum { WORDS =  ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH };
+
+        // Fabricate a vectorized input iterator
+        InputT *d_in_unqualified = const_cast<InputT*>(d_in) + block_offset + (threadIdx.x * VECTOR_LOAD_LENGTH);
+        CacheModifiedInputIterator<AgentReducePolicy::LOAD_MODIFIER, VectorT, OffsetT> d_vec_in(
+            reinterpret_cast<VectorT*>(d_in_unqualified));
+
+        // Load items as vector items
+        InputT input_items[ITEMS_PER_THREAD];
+        VectorT *vec_items = reinterpret_cast<VectorT*>(input_items);
+        #pragma unroll
+        for (int i = 0; i < WORDS; ++i)
+            vec_items[i] = d_vec_in[BLOCK_THREADS * i];
+
+        // Convert from input type to output type
+        OutputT items[ITEMS_PER_THREAD];
+        #pragma unroll
+        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+            items[i] = input_items[i];
+
+        // Reduce items within each thread stripe
+        thread_aggregate = (IS_FIRST_TILE) ?
+            internal::ThreadReduce(items, reduction_op) :
+            internal::ThreadReduce(items, reduction_op, thread_aggregate);
+    }
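
The vectorized path above works by aliasing a thread's scalar item array as an array of wider VectorT words, so each load moves VECTOR_LOAD_LENGTH items at once. A host-side sketch of the aliasing trick (int4_t is a stand-in for CUDA's built-in int4):

#include <cstdio>

// Illustrative sketch of the vector-aliasing idea; not part of the patch.
struct int4_t { int x, y, z, w; };    // stand-in for CUDA's int4

int main()
{
    int items[8];                                            // ITEMS_PER_THREAD = 8
    int4_t *vec_items = reinterpret_cast<int4_t*>(items);    // WORDS = 8 / 4 = 2

    int4_t src[2] = { {0, 1, 2, 3}, {4, 5, 6, 7} };
    for (int i = 0; i < 2; ++i)
        vec_items[i] = src[i];        // one wide copy moves four scalar items

    for (int i = 0; i < 8; ++i)
        printf("%d ", items[i]);      // prints 0 1 2 3 4 5 6 7
    printf("\n");
    return 0;
}
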
+
+
+    /**
+     * Consume a partial tile of input
+     */
+    template <int IS_FIRST_TILE, int CAN_VECTORIZE>
+    __device__ __forceinline__ void ConsumeTile(
+        OutputT                 &thread_aggregate,
+        OffsetT                 block_offset,       ///< The offset the tile to consume
+        int                     valid_items,        ///< The number of valid items in the tile
+        Int2Type<false>         /*is_full_tile*/,   ///< Whether or not this is a full tile
+        Int2Type<CAN_VECTORIZE> /*can_vectorize*/)  ///< Whether or not we can vectorize loads
+    {
+        // Partial tile
+        int thread_offset = threadIdx.x;
+
+        // Read first item
+        if ((IS_FIRST_TILE) && (thread_offset < valid_items))
+        {
+            thread_aggregate = d_wrapped_in[block_offset + thread_offset];
+            thread_offset += BLOCK_THREADS;
+        }
+
+        // Continue reading items (block-striped)
+        while (thread_offset < valid_items)
+        {
+            OutputT item        = d_wrapped_in[block_offset + thread_offset];
+            thread_aggregate    = reduction_op(thread_aggregate, item);
+            thread_offset       += BLOCK_THREADS;
+        }
+    }
+
+
+    //---------------------------------------------------------------------
+    // Consume a contiguous segment of tiles
+    //---------------------------------------------------------------------
+
+    /**
+     * \brief Reduce a contiguous segment of input tiles
+     */
+    template <int CAN_VECTORIZE>
+    __device__ __forceinline__ OutputT ConsumeRange(
+        GridEvenShare<OffsetT>  &even_share,        ///< GridEvenShare descriptor
+        Int2Type<CAN_VECTORIZE> can_vectorize)      ///< Whether or not we can vectorize loads
+    {
+        OutputT thread_aggregate;
+
+        if (even_share.block_offset + TILE_ITEMS > even_share.block_end)
+        {
+            // First tile isn't full (not all threads have valid items)
+            int valid_items = even_share.block_end - even_share.block_offset;
+            ConsumeTile<true>(thread_aggregate, even_share.block_offset, valid_items, Int2Type<false>(), can_vectorize);
+            return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op, valid_items);
+        }
+
+        // At least one full block
+        ConsumeTile<true>(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
+        even_share.block_offset += even_share.block_stride;
+
+        // Consume subsequent full tiles of input
+        while (even_share.block_offset + TILE_ITEMS <= even_share.block_end)
+        {
+            ConsumeTile<false>(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
+            even_share.block_offset += even_share.block_stride;
+        }
+
+        // Consume a partially-full tile
+        if (even_share.block_offset < even_share.block_end)
+        {
+            int valid_items = even_share.block_end - even_share.block_offset;
+            ConsumeTile<false>(thread_aggregate, even_share.block_offset, valid_items, Int2Type<false>(), can_vectorize);
+        }
+
+        // Compute block-wide reduction (all threads have valid items)
+        return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op);
+    }
+
+
+    /**
+     * \brief Reduce a contiguous segment of input tiles
+     */
+    __device__ __forceinline__ OutputT ConsumeRange(
+        OffsetT block_offset,                       ///< [in] Threadblock begin offset (inclusive)
+        OffsetT block_end)                          ///< [in] Threadblock end offset (exclusive)
+    {
+        GridEvenShare<OffsetT> even_share;
+        even_share.template BlockInit<TILE_ITEMS>(block_offset, block_end);
+
+        return (IsAligned(d_in + block_offset, Int2Type<ATTEMPT_VECTORIZATION>())) ?
+            ConsumeRange(even_share, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
+            ConsumeRange(even_share, Int2Type<false && ATTEMPT_VECTORIZATION>());
+    }
+
+
+    /**
+     * Reduce a contiguous segment of input tiles
+     */
+    __device__ __forceinline__ OutputT ConsumeTiles(
+        GridEvenShare<OffsetT> &even_share)        ///< [in] GridEvenShare descriptor
+    {
+        // Initialize GRID_MAPPING_STRIP_MINE even-share descriptor for this thread block
+        even_share.template BlockInit<TILE_ITEMS, GRID_MAPPING_STRIP_MINE>();
+
+        return (IsAligned(d_in, Int2Type<ATTEMPT_VECTORIZATION>())) ?
+            ConsumeRange(even_share, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
+            ConsumeRange(even_share, Int2Type<false && ATTEMPT_VECTORIZATION>());
+
+    }
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
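
Callers never instantiate AgentReduce directly; it sits beneath CUB's device-wide entry points, which follow the usual two-call pattern (first call sizes the temporary storage, second call runs). A sketch against the public cub::DeviceReduce API, assuming d_in, d_out, and num_items describe existing device allocations:

#include <cub/cub.cuh>

// Illustrative usage sketch; function name and arguments are assumptions.
void sum_example(const int *d_in, int *d_out, int num_items)
{
    void   *d_temp_storage     = NULL;
    size_t  temp_storage_bytes = 0;

    // First call only computes temp_storage_bytes
    cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes,
                           d_in, d_out, num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);

    // Second call performs the reduction
    cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes,
                           d_in, d_out, num_items);
    cudaFree(d_temp_storage);
}
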
diff --git a/SRC/cub/agent/agent_reduce_by_key.cuh b/SRC/cub/agent/agent_reduce_by_key.cuh
new file mode 100644
index 00000000..51964d3e
--- /dev/null
+++ b/SRC/cub/agent/agent_reduce_by_key.cuh
@@ -0,0 +1,547 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::AgentReduceByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "single_pass_scan_operators.cuh"
+#include "../block/block_load.cuh"
+#include "../block/block_store.cuh"
+#include "../block/block_scan.cuh"
+#include "../block/block_discontinuity.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+#include "../iterator/constant_input_iterator.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for AgentReduceByKey
+ */
+template <
+    int                         _BLOCK_THREADS,                 ///< Threads per thread block
+    int                         _ITEMS_PER_THREAD,              ///< Items per thread (per tile of input)
+    BlockLoadAlgorithm          _LOAD_ALGORITHM,                ///< The BlockLoad algorithm to use
+    CacheLoadModifier           _LOAD_MODIFIER,                 ///< Cache load modifier for reading input elements
+    BlockScanAlgorithm          _SCAN_ALGORITHM>                ///< The BlockScan algorithm to use
+struct AgentReduceByKeyPolicy
+{
+    enum
+    {
+        BLOCK_THREADS           = _BLOCK_THREADS,               ///< Threads per thread block
+        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,            ///< Items per thread (per tile of input)
+    };
+
+    static const BlockLoadAlgorithm     LOAD_ALGORITHM          = _LOAD_ALGORITHM;      ///< The BlockLoad algorithm to use
+    static const CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;       ///< Cache load modifier for reading input elements
+    static const BlockScanAlgorithm     SCAN_ALGORITHM          = _SCAN_ALGORITHM;      ///< The BlockScan algorithm to use
+};
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief AgentReduceByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key
+ */
+template <
+    typename    AgentReduceByKeyPolicyT,        ///< Parameterized AgentReduceByKeyPolicy tuning policy type
+    typename    KeysInputIteratorT,             ///< Random-access input iterator type for keys
+    typename    UniqueOutputIteratorT,          ///< Random-access output iterator type for keys
+    typename    ValuesInputIteratorT,           ///< Random-access input iterator type for values
+    typename    AggregatesOutputIteratorT,      ///< Random-access output iterator type for values
+    typename    NumRunsOutputIteratorT,         ///< Output iterator type for recording number of items selected
+    typename    EqualityOpT,                    ///< KeyT equality operator type
+    typename    ReductionOpT,                   ///< ValueT reduction operator type
+    typename    OffsetT>                        ///< Signed integer type for global offsets
+struct AgentReduceByKey
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    // The input keys type
+    typedef typename std::iterator_traits<KeysInputIteratorT>::value_type KeyInputT;
+
+    // The output keys type
+    typedef typename If<(Equals<typename std::iterator_traits<UniqueOutputIteratorT>::value_type, void>::VALUE),  // KeyOutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<KeysInputIteratorT>::value_type,                                            // ... then the input iterator's value type,
+        typename std::iterator_traits<UniqueOutputIteratorT>::value_type>::Type KeyOutputT;                       // ... else the output iterator's value type
+
+    // The input values type
+    typedef typename std::iterator_traits<ValuesInputIteratorT>::value_type ValueInputT;
+
+    // The output values type
+    typedef typename If<(Equals<typename std::iterator_traits<AggregatesOutputIteratorT>::value_type, void>::VALUE),  // ValueOutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<ValuesInputIteratorT>::value_type,                                              // ... then the input iterator's value type,
+        typename std::iterator_traits<AggregatesOutputIteratorT>::value_type>::Type ValueOutputT;                     // ... else the output iterator's value type
+
+    // Tuple type for scanning (pairs accumulated segment-value with segment-index)
+    typedef KeyValuePair<OffsetT, ValueOutputT> OffsetValuePairT;
+
+    // Tuple type for pairing keys and values
+    typedef KeyValuePair<KeyOutputT, ValueOutputT> KeyValuePairT;
+
+    // Tile status descriptor interface type
+    typedef ReduceByKeyScanTileState<ValueOutputT, OffsetT> ScanTileStateT;
+
+    // Guarded inequality functor
+    template <typename _EqualityOpT>
+    struct GuardedInequalityWrapper
+    {
+        _EqualityOpT     op;             ///< Wrapped equality operator
+        int             num_remaining;  ///< Items remaining
+
+        /// Constructor
+        __host__ __device__ __forceinline__
+        GuardedInequalityWrapper(_EqualityOpT op, int num_remaining) : op(op), num_remaining(num_remaining) {}
+
+        /// Boolean inequality operator, returns (a != b)
+        template <typename T>
+        __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b, int idx) const
+        {
+            if (idx < num_remaining)
+                return !op(a, b);   // In bounds
+
+            // Return true if first out-of-bounds item, false otherwise
+            return (idx == num_remaining);
+        }
+    };
+
+
+    // Constants
+    enum
+    {
+        BLOCK_THREADS       = AgentReduceByKeyPolicyT::BLOCK_THREADS,
+        ITEMS_PER_THREAD    = AgentReduceByKeyPolicyT::ITEMS_PER_THREAD,
+        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
+        TWO_PHASE_SCATTER   = (ITEMS_PER_THREAD > 1),
+
+        // Whether or not the scan operation has a zero-valued identity value (true if we're performing addition on a primitive type)
+        HAS_IDENTITY_ZERO   = (Equals<ReductionOpT, cub::Sum>::VALUE) && (Traits<ValueOutputT>::PRIMITIVE),
+    };
+
+    // Cache-modified Input iterator wrapper type (for applying cache modifier) for keys
+    typedef typename If<IsPointer<KeysInputIteratorT>::VALUE,
+            CacheModifiedInputIterator<AgentReduceByKeyPolicyT::LOAD_MODIFIER, KeyInputT, OffsetT>,    // Wrap the native input pointer with CacheModifiedValuesInputIterator
+            KeysInputIteratorT>::Type                                                                  // Directly use the supplied input iterator type
+        WrappedKeysInputIteratorT;
+
+    // Cache-modified Input iterator wrapper type (for applying cache modifier) for values
+    typedef typename If<IsPointer<ValuesInputIteratorT>::VALUE,
+            CacheModifiedInputIterator<AgentReduceByKeyPolicyT::LOAD_MODIFIER, ValueInputT, OffsetT>,  // Wrap the native input pointer with CacheModifiedValuesInputIterator
+            ValuesInputIteratorT>::Type                                                                // Directly use the supplied input iterator type
+        WrappedValuesInputIteratorT;
+
+    // Cache-modified Input iterator wrapper type (for applying cache modifier) for fixup values
+    typedef typename If<IsPointer<AggregatesOutputIteratorT>::VALUE,
+            CacheModifiedInputIterator<AgentReduceByKeyPolicyT::LOAD_MODIFIER, ValueInputT, OffsetT>,  // Wrap the native input pointer with CacheModifiedValuesInputIterator
+            AggregatesOutputIteratorT>::Type                                                           // Directly use the supplied input iterator type
+        WrappedFixupInputIteratorT;
+
+    // Reduce-value-by-segment scan operator
+    typedef ReduceBySegmentOp<ReductionOpT> ReduceBySegmentOpT;
+
+    // Parameterized BlockLoad type for keys
+    typedef BlockLoad<
+            KeyOutputT,
+            BLOCK_THREADS,
+            ITEMS_PER_THREAD,
+            AgentReduceByKeyPolicyT::LOAD_ALGORITHM>
+        BlockLoadKeysT;
+
+    // Parameterized BlockLoad type for values
+    typedef BlockLoad<
+            ValueOutputT,
+            BLOCK_THREADS,
+            ITEMS_PER_THREAD,
+            AgentReduceByKeyPolicyT::LOAD_ALGORITHM>
+        BlockLoadValuesT;
+
+    // Parameterized BlockDiscontinuity type for keys
+    typedef BlockDiscontinuity<
+            KeyOutputT,
+            BLOCK_THREADS>
+        BlockDiscontinuityKeys;
+
+    // Parameterized BlockScan type
+    typedef BlockScan<
+            OffsetValuePairT,
+            BLOCK_THREADS,
+            AgentReduceByKeyPolicyT::SCAN_ALGORITHM>
+        BlockScanT;
+
+    // Callback type for obtaining tile prefix during block scan
+    typedef TilePrefixCallbackOp<
+            OffsetValuePairT,
+            ReduceBySegmentOpT,
+            ScanTileStateT>
+        TilePrefixCallbackOpT;
+
+    // Key and value exchange types
+    typedef KeyOutputT    KeyExchangeT[TILE_ITEMS + 1];
+    typedef ValueOutputT  ValueExchangeT[TILE_ITEMS + 1];
+
+    // Shared memory type for this thread block
+    union _TempStorage
+    {
+        struct
+        {
+            typename BlockScanT::TempStorage                scan;           // Smem needed for tile scanning
+            typename TilePrefixCallbackOpT::TempStorage     prefix;         // Smem needed for cooperative prefix callback
+            typename BlockDiscontinuityKeys::TempStorage    discontinuity;  // Smem needed for discontinuity detection
+        };
+
+        // Smem needed for loading keys
+        typename BlockLoadKeysT::TempStorage load_keys;
+
+        // Smem needed for loading values
+        typename BlockLoadValuesT::TempStorage load_values;
+
+        // Smem needed for compacting key-value pairs (allows non-POD items in this union)
+        Uninitialized<KeyValuePairT[TILE_ITEMS + 1]> raw_exchange;
+    };
+
+    // Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    _TempStorage&                   temp_storage;       ///< Reference to temp_storage
+    WrappedKeysInputIteratorT       d_keys_in;          ///< Input keys
+    UniqueOutputIteratorT           d_unique_out;       ///< Unique output keys
+    WrappedValuesInputIteratorT     d_values_in;        ///< Input values
+    AggregatesOutputIteratorT       d_aggregates_out;   ///< Output value aggregates
+    NumRunsOutputIteratorT          d_num_runs_out;     ///< Output pointer for total number of segments identified
+    EqualityOpT                     equality_op;        ///< KeyT equality operator
+    ReductionOpT                    reduction_op;       ///< Reduction operator
+    ReduceBySegmentOpT              scan_op;            ///< Reduce-by-segment scan operator
+
+
+    //---------------------------------------------------------------------
+    // Constructor
+    //---------------------------------------------------------------------
+
+    // Constructor
+    __device__ __forceinline__
+    AgentReduceByKey(
+        TempStorage&                temp_storage,       ///< Reference to temp_storage
+        KeysInputIteratorT          d_keys_in,          ///< Input keys
+        UniqueOutputIteratorT       d_unique_out,       ///< Unique output keys
+        ValuesInputIteratorT        d_values_in,        ///< Input values
+        AggregatesOutputIteratorT   d_aggregates_out,   ///< Output value aggregates
+        NumRunsOutputIteratorT      d_num_runs_out,     ///< Output pointer for total number of segments identified
+        EqualityOpT                 equality_op,        ///< KeyT equality operator
+        ReductionOpT                reduction_op)       ///< ValueT reduction operator
+    :
+        temp_storage(temp_storage.Alias()),
+        d_keys_in(d_keys_in),
+        d_unique_out(d_unique_out),
+        d_values_in(d_values_in),
+        d_aggregates_out(d_aggregates_out),
+        d_num_runs_out(d_num_runs_out),
+        equality_op(equality_op),
+        reduction_op(reduction_op),
+        scan_op(reduction_op)
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Scatter utility methods
+    //---------------------------------------------------------------------
+
+    /**
+     * Directly scatter flagged items to output offsets
+     */
+    __device__ __forceinline__ void ScatterDirect(
+        KeyValuePairT   (&scatter_items)[ITEMS_PER_THREAD],
+        OffsetT         (&segment_flags)[ITEMS_PER_THREAD],
+        OffsetT         (&segment_indices)[ITEMS_PER_THREAD])
+    {
+        // Scatter flagged keys and values
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            if (segment_flags[ITEM])
+            {
+                d_unique_out[segment_indices[ITEM]]     = scatter_items[ITEM].key;
+                d_aggregates_out[segment_indices[ITEM]] = scatter_items[ITEM].value;
+            }
+        }
+    }
+
+
+    /**
+     * 2-phase scatter flagged items to output offsets
+     *
+     * The exclusive scan causes each head flag to be paired with the previous
+     * value aggregate: the scatter offsets must be decremented for value aggregates
+     */
+    __device__ __forceinline__ void ScatterTwoPhase(
+        KeyValuePairT   (&scatter_items)[ITEMS_PER_THREAD],
+        OffsetT         (&segment_flags)[ITEMS_PER_THREAD],
+        OffsetT         (&segment_indices)[ITEMS_PER_THREAD],
+        OffsetT         num_tile_segments,
+        OffsetT         num_tile_segments_prefix)
+    {
+        CTA_SYNC();
+
+        // Compact and scatter pairs
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            if (segment_flags[ITEM])
+            {
+                temp_storage.raw_exchange.Alias()[segment_indices[ITEM] - num_tile_segments_prefix] = scatter_items[ITEM];
+            }
+        }
+
+        CTA_SYNC();
+
+        for (int item = threadIdx.x; item < num_tile_segments; item += BLOCK_THREADS)
+        {
+            KeyValuePairT pair                                  = temp_storage.raw_exchange.Alias()[item];
+            d_unique_out[num_tile_segments_prefix + item]       = pair.key;
+            d_aggregates_out[num_tile_segments_prefix + item]   = pair.value;
+        }
+    }
+
+
+    /**
+     * Scatter flagged items
+     */
+    __device__ __forceinline__ void Scatter(
+        KeyValuePairT   (&scatter_items)[ITEMS_PER_THREAD],
+        OffsetT         (&segment_flags)[ITEMS_PER_THREAD],
+        OffsetT         (&segment_indices)[ITEMS_PER_THREAD],
+        OffsetT         num_tile_segments,
+        OffsetT         num_tile_segments_prefix)
+    {
+        // Do a one-phase scatter if (a) two-phase is disabled or (b) the average number of selected items per thread is less than one
+        if (TWO_PHASE_SCATTER && (num_tile_segments > BLOCK_THREADS))
+        {
+            ScatterTwoPhase(
+                scatter_items,
+                segment_flags,
+                segment_indices,
+                num_tile_segments,
+                num_tile_segments_prefix);
+        }
+        else
+        {
+            ScatterDirect(
+                scatter_items,
+                segment_flags,
+                segment_indices);
+        }
+    }
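
The dispatch above trades scatter strategies: with many segments, the two-phase path first compacts flagged pairs into shared memory so the subsequent global writes are contiguous, while with few segments the direct path avoids the extra staging round-trip. A host-side sketch of the two-phase idea (sequential, illustration only):

#include <vector>

// Illustrative sketch; sequential stand-in for the shared-memory staging.
// Phase 1 compacts flagged items into a staging buffer (the role played by
// raw_exchange in smem); phase 2 would then stream it out contiguously.
template <typename T>
std::vector<T> two_phase_compact(const std::vector<T>   &items,
                                 const std::vector<int> &flags)
{
    std::vector<T> staged;
    for (size_t i = 0; i < items.size(); ++i)
        if (flags[i])
            staged.push_back(items[i]);
    return staged;
}
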
+
+
+    //---------------------------------------------------------------------
+    // Cooperatively scan a device-wide sequence of tiles with other CTAs
+    //---------------------------------------------------------------------
+
+    /**
+     * Process a tile of input (dynamic chained scan)
+     */
+    template <bool IS_LAST_TILE>                ///< Whether the current tile is the last tile
+    __device__ __forceinline__ void ConsumeTile(
+        OffsetT             num_remaining,      ///< Number of global input items remaining (including this tile)
+        int                 tile_idx,           ///< Tile index
+        OffsetT             tile_offset,        ///< Tile offset
+        ScanTileStateT&     tile_state)         ///< Global tile state descriptor
+    {
+        KeyOutputT          keys[ITEMS_PER_THREAD];             // Tile keys
+        KeyOutputT          prev_keys[ITEMS_PER_THREAD];        // Tile keys shuffled up
+        ValueOutputT        values[ITEMS_PER_THREAD];           // Tile values
+        OffsetT             head_flags[ITEMS_PER_THREAD];       // Segment head flags
+        OffsetT             segment_indices[ITEMS_PER_THREAD];  // Segment indices
+        OffsetValuePairT    scan_items[ITEMS_PER_THREAD];       // Zipped values and segment flags|indices
+        KeyValuePairT       scatter_items[ITEMS_PER_THREAD];    // Zipped key value pairs for scattering
+
+        // Load keys
+        if (IS_LAST_TILE)
+            BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys, num_remaining);
+        else
+            BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys);
+
+        // Load tile predecessor key in first thread
+        KeyOutputT tile_predecessor;
+        if (threadIdx.x == 0)
+        {
+            tile_predecessor = (tile_idx == 0) ?
+                keys[0] :                       // First tile gets repeat of first item (thus first item will not be flagged as a head)
+                d_keys_in[tile_offset - 1];     // Subsequent tiles get last key from previous tile
+        }
+
+        CTA_SYNC();
+
+        // Load values
+        if (IS_LAST_TILE)
+            BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + tile_offset, values, num_remaining);
+        else
+            BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + tile_offset, values);
+
+        CTA_SYNC();
+
+        // Initialize head-flags and shuffle up the previous keys
+        if (IS_LAST_TILE)
+        {
+            // Use custom flag operator to additionally flag the first out-of-bounds item
+            GuardedInequalityWrapper<EqualityOpT> flag_op(equality_op, num_remaining);
+            BlockDiscontinuityKeys(temp_storage.discontinuity).FlagHeads(
+                head_flags, keys, prev_keys, flag_op, tile_predecessor);
+        }
+        else
+        {
+            InequalityWrapper<EqualityOpT> flag_op(equality_op);
+            BlockDiscontinuityKeys(temp_storage.discontinuity).FlagHeads(
+                head_flags, keys, prev_keys, flag_op, tile_predecessor);
+        }
+
+        // Zip values and head flags
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            scan_items[ITEM].value  = values[ITEM];
+            scan_items[ITEM].key    = head_flags[ITEM];
+        }
+
+        // Perform exclusive tile scan
+        OffsetValuePairT    block_aggregate;        // Inclusive block-wide scan aggregate
+        OffsetT             num_segments_prefix;    // Number of segments prior to this tile
+        OffsetValuePairT    total_aggregate;        // The tile prefix folded with block_aggregate
+        if (tile_idx == 0)
+        {
+            // Scan first tile
+            BlockScanT(temp_storage.scan).ExclusiveScan(scan_items, scan_items, scan_op, block_aggregate);
+            num_segments_prefix     = 0;
+            total_aggregate         = block_aggregate;
+
+            // Update tile status if there are successor tiles
+            if ((!IS_LAST_TILE) && (threadIdx.x == 0))
+                tile_state.SetInclusive(0, block_aggregate);
+        }
+        else
+        {
+            // Scan non-first tile
+            TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, scan_op, tile_idx);
+            BlockScanT(temp_storage.scan).ExclusiveScan(scan_items, scan_items, scan_op, prefix_op);
+
+            block_aggregate         = prefix_op.GetBlockAggregate();
+            num_segments_prefix     = prefix_op.GetExclusivePrefix().key;
+            total_aggregate         = prefix_op.GetInclusivePrefix();
+        }
+
+        // Rezip scatter items and segment indices
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            scatter_items[ITEM].key     = prev_keys[ITEM];
+            scatter_items[ITEM].value   = scan_items[ITEM].value;
+            segment_indices[ITEM]       = scan_items[ITEM].key;
+        }
+
+        // At this point, each flagged segment head has:
+        //  - The key for the previous segment
+        //  - The reduced value from the previous segment
+        //  - The segment index for the reduced value
+
+        // Scatter flagged keys and values
+        OffsetT num_tile_segments = block_aggregate.key;
+        Scatter(scatter_items, head_flags, segment_indices, num_tile_segments, num_segments_prefix);
+
+        // Last thread in last tile will output final count (and last pair, if necessary)
+        if ((IS_LAST_TILE) && (threadIdx.x == BLOCK_THREADS - 1))
+        {
+            OffsetT num_segments = num_segments_prefix + num_tile_segments;
+
+            // If the last tile is a whole tile, output the final_value
+            if (num_remaining == TILE_ITEMS)
+            {
+                d_unique_out[num_segments]      = keys[ITEMS_PER_THREAD - 1];
+                d_aggregates_out[num_segments]  = total_aggregate.value;
+                num_segments++;
+            }
+
+            // Output the total number of items selected
+            *d_num_runs_out = num_segments;
+        }
+    }
+
+
+    /**
+     * Scan tiles of items as part of a dynamic chained scan
+     */
+    __device__ __forceinline__ void ConsumeRange(
+        int                 num_items,          ///< Total number of input items
+        ScanTileStateT&     tile_state,         ///< Global tile state descriptor
+        int                 start_tile)         ///< The starting tile for the current grid
+    {
+        // Blocks are launched in increasing order, so just assign one tile per block
+        int     tile_idx        = start_tile + blockIdx.x;          // Current tile index
+        OffsetT tile_offset     = OffsetT(TILE_ITEMS) * tile_idx;   // Global offset for the current tile
+        OffsetT num_remaining   = num_items - tile_offset;          // Remaining items (including this tile)
+
+        if (num_remaining > TILE_ITEMS)
+        {
+            // Not last tile
+            ConsumeTile<false>(num_remaining, tile_idx, tile_offset, tile_state);
+        }
+        else if (num_remaining > 0)
+        {
+            // Last tile
+            ConsumeTile<true>(num_remaining, tile_idx, tile_offset, tile_state);
+        }
+    }
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
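
As with AgentReduce, this agent is driven through a public entry point, cub::DeviceReduce::ReduceByKey, using the same two-call sizing pattern. A sketch assuming all d_* arguments are existing device allocations:

#include <cub/cub.cuh>

// Illustrative usage sketch; function name and arguments are assumptions.
void reduce_by_key_example(const int *d_keys_in, const float *d_values_in,
                           int *d_unique_out, float *d_aggregates_out,
                           int *d_num_runs_out, int num_items)
{
    void   *d_temp_storage     = NULL;
    size_t  temp_storage_bytes = 0;

    cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes,
        d_keys_in, d_unique_out, d_values_in, d_aggregates_out,
        d_num_runs_out, cub::Sum(), num_items);   // sizing call
    cudaMalloc(&d_temp_storage, temp_storage_bytes);

    cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes,
        d_keys_in, d_unique_out, d_values_in, d_aggregates_out,
        d_num_runs_out, cub::Sum(), num_items);   // working call
    cudaFree(d_temp_storage);
}
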
diff --git a/SRC/cub/agent/agent_rle.cuh b/SRC/cub/agent/agent_rle.cuh
new file mode 100644
index 00000000..cb7a4a65
--- /dev/null
+++ b/SRC/cub/agent/agent_rle.cuh
@@ -0,0 +1,837 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::AgentRle implements a stateful abstraction of CUDA thread blocks for participating in device-wide run-length-encode.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "single_pass_scan_operators.cuh"
+#include "../block/block_load.cuh"
+#include "../block/block_store.cuh"
+#include "../block/block_scan.cuh"
+#include "../block/block_exchange.cuh"
+#include "../block/block_discontinuity.cuh"
+#include "../grid/grid_queue.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+#include "../iterator/constant_input_iterator.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for AgentRle
+ */
+template <
+    int                         _BLOCK_THREADS,                 ///< Threads per thread block
+    int                         _ITEMS_PER_THREAD,              ///< Items per thread (per tile of input)
+    BlockLoadAlgorithm          _LOAD_ALGORITHM,                ///< The BlockLoad algorithm to use
+    CacheLoadModifier           _LOAD_MODIFIER,                 ///< Cache load modifier for reading input elements
+    bool                        _STORE_WARP_TIME_SLICING,       ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage)
+    BlockScanAlgorithm          _SCAN_ALGORITHM>                ///< The BlockScan algorithm to use
+struct AgentRlePolicy
+{
+    enum
+    {
+        BLOCK_THREADS           = _BLOCK_THREADS,               ///< Threads per thread block
+        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,            ///< Items per thread (per tile of input)
+        STORE_WARP_TIME_SLICING = _STORE_WARP_TIME_SLICING,     ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage)
+    };
+
+    static const BlockLoadAlgorithm     LOAD_ALGORITHM          = _LOAD_ALGORITHM;      ///< The BlockLoad algorithm to use
+    static const CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;       ///< Cache load modifier for reading input elements
+    static const BlockScanAlgorithm     SCAN_ALGORITHM          = _SCAN_ALGORITHM;      ///< The BlockScan algorithm to use
+};
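
AgentRle is the block-level engine behind cub::DeviceRunLengthEncode. A usage sketch of that public interface, again with the two-call sizing pattern and assumed device allocations:

#include <cub/cub.cuh>

// Illustrative usage sketch; function name and arguments are assumptions.
void rle_example(const int *d_in, int *d_unique_out, int *d_counts_out,
                 int *d_num_runs_out, int num_items)
{
    void   *d_temp_storage     = NULL;
    size_t  temp_storage_bytes = 0;

    cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes,
        d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);

    cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes,
        d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items);
    cudaFree(d_temp_storage);
}
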
+
+
+
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief AgentRle implements a stateful abstraction of CUDA thread blocks for participating in device-wide run-length-encode.
+ */
+template <
+    typename    AgentRlePolicyT,        ///< Parameterized AgentRlePolicyT tuning policy type
+    typename    InputIteratorT,         ///< Random-access input iterator type for data
+    typename    OffsetsOutputIteratorT, ///< Random-access output iterator type for offset values
+    typename    LengthsOutputIteratorT, ///< Random-access output iterator type for length values
+    typename    EqualityOpT,            ///< T equality operator type
+    typename    OffsetT>                ///< Signed integer type for global offsets
+struct AgentRle
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    /// The input value type
+    typedef typename std::iterator_traits<InputIteratorT>::value_type T;
+
+    /// The lengths output value type
+    typedef typename If<(Equals<typename std::iterator_traits<LengthsOutputIteratorT>::value_type, void>::VALUE),   // LengthT =  (if output iterator's value type is void) ?
+        OffsetT,                                                                                                    // ... then the OffsetT type,
+        typename std::iterator_traits<LengthsOutputIteratorT>::value_type>::Type LengthT;                           // ... else the output iterator's value type
+
+    /// Tuple type for scanning (pairs run-length and run-index)
+    typedef KeyValuePair<OffsetT, LengthT> LengthOffsetPair;
+
+    /// Tile status descriptor interface type
+    typedef ReduceByKeyScanTileState<LengthT, OffsetT> ScanTileStateT;
+
+    // Constants
+    enum
+    {
+        WARP_THREADS            = CUB_WARP_THREADS(PTX_ARCH),
+        BLOCK_THREADS           = AgentRlePolicyT::BLOCK_THREADS,
+        ITEMS_PER_THREAD        = AgentRlePolicyT::ITEMS_PER_THREAD,
+        WARP_ITEMS              = WARP_THREADS * ITEMS_PER_THREAD,
+        TILE_ITEMS              = BLOCK_THREADS * ITEMS_PER_THREAD,
+        WARPS                   = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
+
+        /// Whether or not to sync after loading data
+        SYNC_AFTER_LOAD         = (AgentRlePolicyT::LOAD_ALGORITHM != BLOCK_LOAD_DIRECT),
+
+        /// Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage)
+        STORE_WARP_TIME_SLICING = AgentRlePolicyT::STORE_WARP_TIME_SLICING,
+        ACTIVE_EXCHANGE_WARPS   = (STORE_WARP_TIME_SLICING) ? 1 : WARPS,
+    };
+
+
+    /**
+     * Special operator that signals all out-of-bounds items are not equal to everything else,
+     * forcing both (1) the last item to be tail-flagged and (2) all oob items to be marked
+     * trivial.
+     */
+    template <bool LAST_TILE>
+    struct OobInequalityOp
+    {
+        OffsetT         num_remaining;
+        EqualityOpT      equality_op;
+
+        __device__ __forceinline__ OobInequalityOp(
+            OffsetT     num_remaining,
+            EqualityOpT  equality_op)
+        :
+            num_remaining(num_remaining),
+            equality_op(equality_op)
+        {}
+
+        template <typename Index>
+        __host__ __device__ __forceinline__ bool operator()(T first, T second, Index idx)
+        {
+            if (!LAST_TILE || (idx < num_remaining))
+                return !equality_op(first, second);
+            else
+                return true;
+        }
+    };
+
+
+    // Cache-modified Input iterator wrapper type (for applying cache modifier) for data
+    typedef typename If<IsPointer<InputIteratorT>::VALUE,
+            CacheModifiedInputIterator<AgentRlePolicyT::LOAD_MODIFIER, T, OffsetT>,     // Wrap the native input pointer with CacheModifiedInputIterator
+            InputIteratorT>::Type                                                       // Directly use the supplied input iterator type
+        WrappedInputIteratorT;
+
+    // Parameterized BlockLoad type for data
+    typedef BlockLoad<
+            T,
+            AgentRlePolicyT::BLOCK_THREADS,
+            AgentRlePolicyT::ITEMS_PER_THREAD,
+            AgentRlePolicyT::LOAD_ALGORITHM>
+        BlockLoadT;
+
+    // Parameterized BlockDiscontinuity type for data
+    typedef BlockDiscontinuity<T, BLOCK_THREADS> BlockDiscontinuityT;
+
+    // Parameterized WarpScan type
+    typedef WarpScan<LengthOffsetPair> WarpScanPairs;
+
+    // Reduce-length-by-run scan operator
+    typedef ReduceBySegmentOp<cub::Sum> ReduceBySegmentOpT;
+
+    // Callback type for obtaining tile prefix during block scan
+    typedef TilePrefixCallbackOp<
+            LengthOffsetPair,
+            ReduceBySegmentOpT,
+            ScanTileStateT>
+        TilePrefixCallbackOpT;
+
+    // Warp exchange types
+    typedef WarpExchange<LengthOffsetPair, ITEMS_PER_THREAD>        WarpExchangePairs;
+
+    typedef typename If<STORE_WARP_TIME_SLICING, typename WarpExchangePairs::TempStorage, NullType>::Type WarpExchangePairsStorage;
+
+    typedef WarpExchange<OffsetT, ITEMS_PER_THREAD>                 WarpExchangeOffsets;
+    typedef WarpExchange<LengthT, ITEMS_PER_THREAD>                 WarpExchangeLengths;
+
+    typedef LengthOffsetPair WarpAggregates[WARPS];
+
+    // Shared memory type for this thread block
+    struct _TempStorage
+    {
+        // Aliasable storage layout
+        union Aliasable
+        {
+            struct
+            {
+                typename BlockDiscontinuityT::TempStorage       discontinuity;              // Smem needed for discontinuity detection
+                typename WarpScanPairs::TempStorage             warp_scan[WARPS];           // Smem needed for warp-synchronous scans
+                Uninitialized<WarpAggregates>                   warp_aggregates;            // Smem needed for sharing warp-wide aggregates
+                typename TilePrefixCallbackOpT::TempStorage     prefix;                     // Smem needed for cooperative prefix callback
+            };
+
+            // Smem needed for input loading
+            typename BlockLoadT::TempStorage                    load;
+
+            // Aliasable layout needed for two-phase scatter
+            union ScatterAliasable
+            {
+                unsigned long long                              align;
+                WarpExchangePairsStorage                        exchange_pairs[ACTIVE_EXCHANGE_WARPS];
+                typename WarpExchangeOffsets::TempStorage       exchange_offsets[ACTIVE_EXCHANGE_WARPS];
+                typename WarpExchangeLengths::TempStorage       exchange_lengths[ACTIVE_EXCHANGE_WARPS];
+
+            } scatter_aliasable;
+
+        } aliasable;
+
+        OffsetT             tile_idx;                   // Shared tile index
+        LengthOffsetPair    tile_inclusive;             // Inclusive tile prefix
+        LengthOffsetPair    tile_exclusive;             // Exclusive tile prefix
+    };
+
+    // Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    _TempStorage&                   temp_storage;       ///< Reference to temp_storage
+
+    WrappedInputIteratorT           d_in;               ///< Pointer to input sequence of data items
+    OffsetsOutputIteratorT          d_offsets_out;      ///< Output run offsets
+    LengthsOutputIteratorT          d_lengths_out;      ///< Output run lengths
+
+    EqualityOpT                     equality_op;        ///< T equality operator
+    ReduceBySegmentOpT              scan_op;            ///< Reduce-length-by-flag scan operator
+    OffsetT                         num_items;          ///< Total number of input items
+
+
+    //---------------------------------------------------------------------
+    // Constructor
+    //---------------------------------------------------------------------
+
+    // Constructor
+    __device__ __forceinline__
+    AgentRle(
+        TempStorage                 &temp_storage,      ///< [in] Reference to temp_storage
+        InputIteratorT              d_in,               ///< [in] Pointer to input sequence of data items
+        OffsetsOutputIteratorT      d_offsets_out,      ///< [out] Pointer to output sequence of run offsets
+        LengthsOutputIteratorT      d_lengths_out,      ///< [out] Pointer to output sequence of run lengths
+        EqualityOpT                 equality_op,        ///< [in] T equality operator
+        OffsetT                     num_items)          ///< [in] Total number of input items
+    :
+        temp_storage(temp_storage.Alias()),
+        d_in(d_in),
+        d_offsets_out(d_offsets_out),
+        d_lengths_out(d_lengths_out),
+        equality_op(equality_op),
+        scan_op(cub::Sum()),
+        num_items(num_items)
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Utility methods for initializing the selections
+    //---------------------------------------------------------------------
+
+    template <bool FIRST_TILE, bool LAST_TILE>
+    __device__ __forceinline__ void InitializeSelections(
+        OffsetT             tile_offset,
+        OffsetT             num_remaining,
+        T                   (&items)[ITEMS_PER_THREAD],
+        LengthOffsetPair    (&lengths_and_num_runs)[ITEMS_PER_THREAD])
+    {
+        bool                head_flags[ITEMS_PER_THREAD];
+        bool                tail_flags[ITEMS_PER_THREAD];
+
+        OobInequalityOp<LAST_TILE> inequality_op(num_remaining, equality_op);
+
+        if (FIRST_TILE && LAST_TILE)
+        {
+            // First-and-last-tile always head-flags the first item and tail-flags the last item
+
+            BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails(
+                head_flags, tail_flags, items, inequality_op);
+        }
+        else if (FIRST_TILE)
+        {
+            // First-tile always head-flags the first item
+
+            // Get the first item from the next tile
+            T tile_successor_item;
+            if (threadIdx.x == BLOCK_THREADS - 1)
+                tile_successor_item = d_in[tile_offset + TILE_ITEMS];
+
+            BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails(
+                head_flags, tail_flags, tile_successor_item, items, inequality_op);
+        }
+        else if (LAST_TILE)
+        {
+            // Last-tile always flags the last item
+
+            // Get the last item from the previous tile
+            T tile_predecessor_item;
+            if (threadIdx.x == 0)
+                tile_predecessor_item = d_in[tile_offset - 1];
+
+            BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails(
+                head_flags, tile_predecessor_item, tail_flags, items, inequality_op);
+        }
+        else
+        {
+            // Get the first item from the next tile
+            T tile_successor_item;
+            if (threadIdx.x == BLOCK_THREADS - 1)
+                tile_successor_item = d_in[tile_offset + TILE_ITEMS];
+
+            // Get the last item from the previous tile
+            T tile_predecessor_item;
+            if (threadIdx.x == 0)
+                tile_predecessor_item = d_in[tile_offset - 1];
+
+            BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails(
+                head_flags, tile_predecessor_item, tail_flags, tile_successor_item, items, inequality_op);
+        }
+
+        // Zip counts and runs
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            lengths_and_num_runs[ITEM].key      = head_flags[ITEM] && (!tail_flags[ITEM]);
+            lengths_and_num_runs[ITEM].value    = ((!head_flags[ITEM]) || (!tail_flags[ITEM]));
+        }
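+
+        // Editor's note, worked example: for the per-thread sequence (A, A, B),
+        // head flags are (1, 0, 1) and tail flags are (0, 1, 1), so the zip
+        // above yields key = (1, 0, 0) -- one non-trivial run starting at the
+        // first A -- and value = (1, 1, 0) -- both A's count toward that run's
+        // length, while the trivial single-item run B contributes nothing.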
+    }
+
+    //---------------------------------------------------------------------
+    // Scan utility methods
+    //---------------------------------------------------------------------
+
+    /**
+     * Scan of allocations
+     */
+    __device__ __forceinline__ void WarpScanAllocations(
+        LengthOffsetPair    &tile_aggregate,
+        LengthOffsetPair    &warp_aggregate,
+        LengthOffsetPair    &warp_exclusive_in_tile,
+        LengthOffsetPair    &thread_exclusive_in_warp,
+        LengthOffsetPair    (&lengths_and_num_runs)[ITEMS_PER_THREAD])
+    {
+        // Perform warpscans
+        unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
+        int lane_id = LaneId();
+
+        LengthOffsetPair identity;
+        identity.key = 0;
+        identity.value = 0;
+
+        LengthOffsetPair thread_inclusive;
+        LengthOffsetPair thread_aggregate = internal::ThreadReduce(lengths_and_num_runs, scan_op);
+        WarpScanPairs(temp_storage.aliasable.warp_scan[warp_id]).Scan(
+            thread_aggregate,
+            thread_inclusive,
+            thread_exclusive_in_warp,
+            identity,
+            scan_op);
+
+        // Last lane in each warp shares its warp-aggregate
+        if (lane_id == WARP_THREADS - 1)
+            temp_storage.aliasable.warp_aggregates.Alias()[warp_id] = thread_inclusive;
+
+        CTA_SYNC();
+
+        // Accumulate total selected and the warp-wide prefix
+        warp_exclusive_in_tile          = identity;
+        warp_aggregate                  = temp_storage.aliasable.warp_aggregates.Alias()[warp_id];
+        tile_aggregate                  = temp_storage.aliasable.warp_aggregates.Alias()[0];
+
+        #pragma unroll
+        for (int WARP = 1; WARP < WARPS; ++WARP)
+        {
+            if (warp_id == WARP)
+                warp_exclusive_in_tile = tile_aggregate;
+
+            tile_aggregate = scan_op(tile_aggregate, temp_storage.aliasable.warp_aggregates.Alias()[WARP]);
+        }
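+
+        // Editor's note: the loop above leaves each warp with the exclusive
+        // prefix of the warp aggregates.  E.g., with four warps whose
+        // aggregates are a0..a3, warp 2 ends with
+        // warp_exclusive_in_tile = a0 (+) a1, and every thread ends with
+        // tile_aggregate = a0 (+) a1 (+) a2 (+) a3, where (+) denotes scan_op.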
+    }
+
+
+    //---------------------------------------------------------------------
+    // Utility methods for scattering selections
+    //---------------------------------------------------------------------
+
+    /**
+     * Two-phase scatter, specialized for warp time-slicing
+     */
+    template <bool FIRST_TILE>
+    __device__ __forceinline__ void ScatterTwoPhase(
+        OffsetT             tile_num_runs_exclusive_in_global,
+        OffsetT             warp_num_runs_aggregate,
+        OffsetT             warp_num_runs_exclusive_in_tile,
+        OffsetT             (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD],
+        LengthOffsetPair    (&lengths_and_offsets)[ITEMS_PER_THREAD],
+        Int2Type<true>      is_warp_time_slice)
+    {
+        unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
+        int lane_id = LaneId();
+
+        // Locally compact items within the warp (first warp)
+        if (warp_id == 0)
+        {
+            WarpExchangePairs(temp_storage.aliasable.scatter_aliasable.exchange_pairs[0]).ScatterToStriped(
+                lengths_and_offsets, thread_num_runs_exclusive_in_warp);
+        }
+
+        // Locally compact items within the warp (remaining warps)
+        #pragma unroll
+        for (int SLICE = 1; SLICE < WARPS; ++SLICE)
+        {
+            CTA_SYNC();
+
+            if (warp_id == SLICE)
+            {
+                WarpExchangePairs(temp_storage.aliasable.scatter_aliasable.exchange_pairs[0]).ScatterToStriped(
+                    lengths_and_offsets, thread_num_runs_exclusive_in_warp);
+            }
+        }
+
+        // Global scatter
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            if ((ITEM * WARP_THREADS) < warp_num_runs_aggregate - lane_id)
+            {
+                OffsetT item_offset =
+                    tile_num_runs_exclusive_in_global +
+                    warp_num_runs_exclusive_in_tile +
+                    (ITEM * WARP_THREADS) + lane_id;
+
+                // Scatter offset
+                d_offsets_out[item_offset] = lengths_and_offsets[ITEM].key;
+
+                // Scatter length if not the first (global) length
+                if ((!FIRST_TILE) || (ITEM != 0) || (threadIdx.x > 0))
+                {
+                    d_lengths_out[item_offset - 1] = lengths_and_offsets[ITEM].value;
+                }
+            }
+        }
+    }
+
+
+    /**
+     * Two-phase scatter
+     */
+    template <bool FIRST_TILE>
+    __device__ __forceinline__ void ScatterTwoPhase(
+        OffsetT             tile_num_runs_exclusive_in_global,
+        OffsetT             warp_num_runs_aggregate,
+        OffsetT             warp_num_runs_exclusive_in_tile,
+        OffsetT             (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD],
+        LengthOffsetPair    (&lengths_and_offsets)[ITEMS_PER_THREAD],
+        Int2Type<false>     is_warp_time_slice)
+    {
+        unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
+        int lane_id = LaneId();
+
+        // Unzip
+        OffsetT run_offsets[ITEMS_PER_THREAD];
+        LengthT run_lengths[ITEMS_PER_THREAD];
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            run_offsets[ITEM] = lengths_and_offsets[ITEM].key;
+            run_lengths[ITEM] = lengths_and_offsets[ITEM].value;
+        }
+
+        WarpExchangeOffsets(temp_storage.aliasable.scatter_aliasable.exchange_offsets[warp_id]).ScatterToStriped(
+            run_offsets, thread_num_runs_exclusive_in_warp);
+
+        WARP_SYNC(0xffffffff);
+
+        WarpExchangeLengths(temp_storage.aliasable.scatter_aliasable.exchange_lengths[warp_id]).ScatterToStriped(
+            run_lengths, thread_num_runs_exclusive_in_warp);
+
+        // Global scatter
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            if ((ITEM * WARP_THREADS) + lane_id < warp_num_runs_aggregate)
+            {
+                OffsetT item_offset =
+                    tile_num_runs_exclusive_in_global +
+                    warp_num_runs_exclusive_in_tile +
+                    (ITEM * WARP_THREADS) + lane_id;
+
+                // Scatter offset
+                d_offsets_out[item_offset] = run_offsets[ITEM];
+
+                // Scatter length if not the first (global) length
+                if ((!FIRST_TILE) || (ITEM != 0) || (threadIdx.x > 0))
+                {
+                    d_lengths_out[item_offset - 1] = run_lengths[ITEM];
+                }
+            }
+        }
+    }
+
+
+    /**
+     * Direct scatter
+     */
+    template <bool FIRST_TILE>
+    __device__ __forceinline__ void ScatterDirect(
+        OffsetT             tile_num_runs_exclusive_in_global,
+        OffsetT             warp_num_runs_aggregate,
+        OffsetT             warp_num_runs_exclusive_in_tile,
+        OffsetT             (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD],
+        LengthOffsetPair    (&lengths_and_offsets)[ITEMS_PER_THREAD])
+    {
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            if (thread_num_runs_exclusive_in_warp[ITEM] < warp_num_runs_aggregate)
+            {
+                OffsetT item_offset =
+                    tile_num_runs_exclusive_in_global +
+                    warp_num_runs_exclusive_in_tile +
+                    thread_num_runs_exclusive_in_warp[ITEM];
+
+                // Scatter offset
+                d_offsets_out[item_offset] = lengths_and_offsets[ITEM].key;
+
+                // Scatter length if not the first (global) length
+                if (item_offset >= 1)
+                {
+                    d_lengths_out[item_offset - 1] = lengths_and_offsets[ITEM].value;
+                }
+            }
+        }
+    }
+
+
+    /**
+     * Scatter
+     */
+    template <bool FIRST_TILE, bool LAST_TILE>
+    __device__ __forceinline__ void Scatter(
+        OffsetT             tile_num_runs_aggregate,
+        OffsetT             tile_num_runs_exclusive_in_global,
+        OffsetT             warp_num_runs_aggregate,
+        OffsetT             warp_num_runs_exclusive_in_tile,
+        OffsetT             (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD],
+        LengthOffsetPair    (&lengths_and_offsets)[ITEMS_PER_THREAD])
+    {
+        if ((ITEMS_PER_THREAD == 1) || (tile_num_runs_aggregate < BLOCK_THREADS))
+        {
+            // Direct scatter if the warp has any items
+            if (warp_num_runs_aggregate)
+            {
+                ScatterDirect<FIRST_TILE>(
+                    tile_num_runs_exclusive_in_global,
+                    warp_num_runs_aggregate,
+                    warp_num_runs_exclusive_in_tile,
+                    thread_num_runs_exclusive_in_warp,
+                    lengths_and_offsets);
+            }
+        }
+        else
+        {
+            // Scatter two phase
+            ScatterTwoPhase<FIRST_TILE>(
+                tile_num_runs_exclusive_in_global,
+                warp_num_runs_aggregate,
+                warp_num_runs_exclusive_in_tile,
+                thread_num_runs_exclusive_in_warp,
+                lengths_and_offsets,
+                Int2Type<STORE_WARP_TIME_SLICING>());
+        }
+    }
+
+
+
+    //---------------------------------------------------------------------
+    // Cooperatively scan a device-wide sequence of tiles with other CTAs
+    //---------------------------------------------------------------------
+
+    /**
+     * Process a tile of input (dynamic chained scan)
+     */
+    template <
+        bool                LAST_TILE>
+    __device__ __forceinline__ LengthOffsetPair ConsumeTile(
+        OffsetT             num_items,          ///< Total number of global input items
+        OffsetT             num_remaining,      ///< Number of global input items remaining (including this tile)
+        int                 tile_idx,           ///< Tile index
+        OffsetT             tile_offset,        ///< Tile offset
+        ScanTileStateT      &tile_status)       ///< Global list of tile status
+    {
+        if (tile_idx == 0)
+        {
+            // First tile
+
+            // Load items
+            T items[ITEMS_PER_THREAD];
+            if (LAST_TILE)
+                BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items, num_remaining, T());
+            else
+                BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items);
+
+            if (SYNC_AFTER_LOAD)
+                CTA_SYNC();
+
+            // Set flags
+            LengthOffsetPair    lengths_and_num_runs[ITEMS_PER_THREAD];
+
+            InitializeSelections<true, LAST_TILE>(
+                tile_offset,
+                num_remaining,
+                items,
+                lengths_and_num_runs);
+
+            // Exclusive scan of lengths and runs
+            LengthOffsetPair tile_aggregate;
+            LengthOffsetPair warp_aggregate;
+            LengthOffsetPair warp_exclusive_in_tile;
+            LengthOffsetPair thread_exclusive_in_warp;
+
+            WarpScanAllocations(
+                tile_aggregate,
+                warp_aggregate,
+                warp_exclusive_in_tile,
+                thread_exclusive_in_warp,
+                lengths_and_num_runs);
+
+            // Update tile status if this is not the last tile
+            if (!LAST_TILE && (threadIdx.x == 0))
+                tile_status.SetInclusive(0, tile_aggregate);
+
+            // Update thread_exclusive_in_warp to fold in warp run-length
+            if (thread_exclusive_in_warp.key == 0)
+                thread_exclusive_in_warp.value += warp_exclusive_in_tile.value;
+
+            LengthOffsetPair    lengths_and_offsets[ITEMS_PER_THREAD];
+            OffsetT             thread_num_runs_exclusive_in_warp[ITEMS_PER_THREAD];
+            LengthOffsetPair    lengths_and_num_runs2[ITEMS_PER_THREAD];
+
+            // Downsweep scan through lengths_and_num_runs
+            internal::ThreadScanExclusive(lengths_and_num_runs, lengths_and_num_runs2, scan_op, thread_exclusive_in_warp);
+
+            // Zip
+
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+            {
+                lengths_and_offsets[ITEM].value         = lengths_and_num_runs2[ITEM].value;
+                lengths_and_offsets[ITEM].key        = tile_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM;
+                thread_num_runs_exclusive_in_warp[ITEM] = (lengths_and_num_runs[ITEM].key) ?
+                                                                lengths_and_num_runs2[ITEM].key :         // keep
+                                                                WARP_THREADS * ITEMS_PER_THREAD;            // discard
+            }
+
+            OffsetT tile_num_runs_aggregate              = tile_aggregate.key;
+            OffsetT tile_num_runs_exclusive_in_global    = 0;
+            OffsetT warp_num_runs_aggregate              = warp_aggregate.key;
+            OffsetT warp_num_runs_exclusive_in_tile      = warp_exclusive_in_tile.key;
+
+            // Scatter
+            Scatter<true, LAST_TILE>(
+                tile_num_runs_aggregate,
+                tile_num_runs_exclusive_in_global,
+                warp_num_runs_aggregate,
+                warp_num_runs_exclusive_in_tile,
+                thread_num_runs_exclusive_in_warp,
+                lengths_and_offsets);
+
+            // Return running total (inclusive of this tile)
+            return tile_aggregate;
+        }
+        else
+        {
+            // Not first tile
+
+            // Load items
+            T items[ITEMS_PER_THREAD];
+            if (LAST_TILE)
+                BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items, num_remaining, T());
+            else
+                BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items);
+
+            if (SYNC_AFTER_LOAD)
+                CTA_SYNC();
+
+            // Set flags
+            LengthOffsetPair    lengths_and_num_runs[ITEMS_PER_THREAD];
+
+            InitializeSelections<false, LAST_TILE>(
+                tile_offset,
+                num_remaining,
+                items,
+                lengths_and_num_runs);
+
+            // Exclusive scan of lengths and runs
+            LengthOffsetPair tile_aggregate;
+            LengthOffsetPair warp_aggregate;
+            LengthOffsetPair warp_exclusive_in_tile;
+            LengthOffsetPair thread_exclusive_in_warp;
+
+            WarpScanAllocations(
+                tile_aggregate,
+                warp_aggregate,
+                warp_exclusive_in_tile,
+                thread_exclusive_in_warp,
+                lengths_and_num_runs);
+
+            // First warp computes tile prefix in lane 0
+            TilePrefixCallbackOpT prefix_op(tile_status, temp_storage.aliasable.prefix, Sum(), tile_idx);
+            unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
+            if (warp_id == 0)
+            {
+                prefix_op(tile_aggregate);
+                if (threadIdx.x == 0)
+                    temp_storage.tile_exclusive = prefix_op.exclusive_prefix;
+            }
+
+            CTA_SYNC();
+
+            LengthOffsetPair tile_exclusive_in_global = temp_storage.tile_exclusive;
+
+            // Update thread_exclusive_in_warp to fold in warp and tile run-lengths
+            LengthOffsetPair thread_exclusive = scan_op(tile_exclusive_in_global, warp_exclusive_in_tile);
+            if (thread_exclusive_in_warp.key == 0)
+                thread_exclusive_in_warp.value += thread_exclusive.value;
+
+            // Downsweep scan through lengths_and_num_runs
+            LengthOffsetPair    lengths_and_num_runs2[ITEMS_PER_THREAD];
+            LengthOffsetPair    lengths_and_offsets[ITEMS_PER_THREAD];
+            OffsetT             thread_num_runs_exclusive_in_warp[ITEMS_PER_THREAD];
+
+            internal::ThreadScanExclusive(lengths_and_num_runs, lengths_and_num_runs2, scan_op, thread_exclusive_in_warp);
+
+            // Zip
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+            {
+                lengths_and_offsets[ITEM].value         = lengths_and_num_runs2[ITEM].value;
+                lengths_and_offsets[ITEM].key        = tile_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM;
+                thread_num_runs_exclusive_in_warp[ITEM] = (lengths_and_num_runs[ITEM].key) ?
+                                                                lengths_and_num_runs2[ITEM].key :         // keep
+                                                                WARP_THREADS * ITEMS_PER_THREAD;            // discard
+            }
+
+            OffsetT tile_num_runs_aggregate              = tile_aggregate.key;
+            OffsetT tile_num_runs_exclusive_in_global    = tile_exclusive_in_global.key;
+            OffsetT warp_num_runs_aggregate              = warp_aggregate.key;
+            OffsetT warp_num_runs_exclusive_in_tile      = warp_exclusive_in_tile.key;
+
+            // Scatter
+            Scatter<false, LAST_TILE>(
+                tile_num_runs_aggregate,
+                tile_num_runs_exclusive_in_global,
+                warp_num_runs_aggregate,
+                warp_num_runs_exclusive_in_tile,
+                thread_num_runs_exclusive_in_warp,
+                lengths_and_offsets);
+
+            // Return running total (inclusive of this tile)
+            return prefix_op.inclusive_prefix;
+        }
+    }
+
+
+    /**
+     * Scan tiles of items as part of a dynamic chained scan
+     */
+    template <typename NumRunsIteratorT>        ///< Output iterator type for recording number of items selected
+    __device__ __forceinline__ void ConsumeRange(
+        int                 num_tiles,              ///< Total number of input tiles
+        ScanTileStateT&     tile_status,            ///< Global list of tile status
+        NumRunsIteratorT    d_num_runs_out)         ///< Output pointer for total number of runs identified
+    {
+        // Blocks are launched in increasing order, so just assign one tile per block
+        int     tile_idx        = (blockIdx.x * gridDim.y) + blockIdx.y;    // Current tile index
+        OffsetT tile_offset     = tile_idx * TILE_ITEMS;                  // Global offset for the current tile
+        OffsetT num_remaining   = num_items - tile_offset;                  // Remaining items (including this tile)
+
+        if (tile_idx < num_tiles - 1)
+        {
+            // Not the last tile (full)
+            ConsumeTile<false>(num_items, num_remaining, tile_idx, tile_offset, tile_status);
+        }
+        else if (num_remaining > 0)
+        {
+            // The last tile (possibly partially-full)
+            LengthOffsetPair running_total = ConsumeTile<true>(num_items, num_remaining, tile_idx, tile_offset, tile_status);
+
+            if (threadIdx.x == 0)
+            {
+                // Output the total number of items selected
+                *d_num_runs_out = running_total.key;
+
+                // The inclusive prefix contains accumulated length reduction for the last run
+                if (running_total.key > 0)
+                    d_lengths_out[running_total.key - 1] = running_total.value;
+            }
+        }
+    }
+};
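+
+/**
+ * Editor's sketch: one plausible way a dynamic chained-scan kernel drives
+ * AgentRle (one tile per thread block).  The kernel name and its parameter
+ * list are illustrative assumptions, not part of this header; the actual
+ * driver lives in the DeviceRunLengthEncode dispatch layer.
+ *
+ * \code
+ * template <typename PolicyT, typename T, typename OffsetT,
+ *           typename ScanTileStateT, typename NumRunsItT>
+ * __global__ void ExampleRleSweepKernel(
+ *     const T *d_in, OffsetT *d_offsets_out, OffsetT *d_lengths_out,
+ *     NumRunsItT d_num_runs_out, ScanTileStateT tile_status,
+ *     OffsetT num_items, int num_tiles)
+ * {
+ *     // ScanTileStateT is assumed to match AgentRleT::ScanTileStateT
+ *     typedef AgentRle<PolicyT, const T*, OffsetT*, OffsetT*, Equality, OffsetT> AgentRleT;
+ *     __shared__ typename AgentRleT::TempStorage temp_storage;
+ *     AgentRleT(temp_storage, d_in, d_offsets_out, d_lengths_out, Equality(), num_items)
+ *         .ConsumeRange(num_tiles, tile_status, d_num_runs_out);
+ * }
+ * \endcode
+ */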
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/SRC/cub/agent/agent_scan.cuh b/SRC/cub/agent/agent_scan.cuh
new file mode 100644
index 00000000..9368615e
--- /dev/null
+++ b/SRC/cub/agent/agent_scan.cuh
@@ -0,0 +1,471 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::AgentScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "single_pass_scan_operators.cuh"
+#include "../block/block_load.cuh"
+#include "../block/block_store.cuh"
+#include "../block/block_scan.cuh"
+#include "../grid/grid_queue.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for AgentScan
+ */
+template <
+    int                         _BLOCK_THREADS,                 ///< Threads per thread block
+    int                         _ITEMS_PER_THREAD,              ///< Items per thread (per tile of input)
+    BlockLoadAlgorithm          _LOAD_ALGORITHM,                ///< The BlockLoad algorithm to use
+    CacheLoadModifier           _LOAD_MODIFIER,                 ///< Cache load modifier for reading input elements
+    BlockStoreAlgorithm         _STORE_ALGORITHM,               ///< The BlockStore algorithm to use
+    BlockScanAlgorithm          _SCAN_ALGORITHM>                ///< The BlockScan algorithm to use
+struct AgentScanPolicy
+{
+    enum
+    {
+        BLOCK_THREADS           = _BLOCK_THREADS,               ///< Threads per thread block
+        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,            ///< Items per thread (per tile of input)
+    };
+
+    static const BlockLoadAlgorithm     LOAD_ALGORITHM          = _LOAD_ALGORITHM;          ///< The BlockLoad algorithm to use
+    static const CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;           ///< Cache load modifier for reading input elements
+    static const BlockStoreAlgorithm    STORE_ALGORITHM         = _STORE_ALGORITHM;         ///< The BlockStore algorithm to use
+    static const BlockScanAlgorithm     SCAN_ALGORITHM          = _SCAN_ALGORITHM;          ///< The BlockScan algorithm to use
+};
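+
+// Editor's sketch: a plausible concrete policy instantiation (the constants
+// are illustrative assumptions, not the tunings shipped with this library):
+//
+//   typedef AgentScanPolicy<
+//       128,                          // _BLOCK_THREADS
+//       12,                           // _ITEMS_PER_THREAD
+//       BLOCK_LOAD_WARP_TRANSPOSE,    // _LOAD_ALGORITHM
+//       LOAD_DEFAULT,                 // _LOAD_MODIFIER
+//       BLOCK_STORE_WARP_TRANSPOSE,   // _STORE_ALGORITHM
+//       BLOCK_SCAN_RAKING>            // _SCAN_ALGORITHM
+//       ExampleScanPolicy;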
+
+
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief AgentScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan.
+ */
+template <
+    typename AgentScanPolicyT,      ///< Parameterized AgentScanPolicyT tuning policy type
+    typename InputIteratorT,        ///< Random-access input iterator type
+    typename OutputIteratorT,       ///< Random-access output iterator type
+    typename ScanOpT,               ///< Scan functor type
+    typename InitValueT,            ///< The init_value element for ScanOpT type (cub::NullType for inclusive scan)
+    typename OffsetT>               ///< Signed integer type for global offsets
+struct AgentScan
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    // The input value type
+    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+    // The output value type
+    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
+        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+    // Tile status descriptor interface type
+    typedef ScanTileState<OutputT> ScanTileStateT;
+
+    // Input iterator wrapper type (for applying cache modifier)
+    typedef typename If<IsPointer<InputIteratorT>::VALUE,
+            CacheModifiedInputIterator<AgentScanPolicyT::LOAD_MODIFIER, InputT, OffsetT>,   // Wrap the native input pointer with CacheModifiedInputIterator
+            InputIteratorT>::Type                                                           // Directly use the supplied input iterator type
+        WrappedInputIteratorT;
+
+    // Constants
+    enum
+    {
+        IS_INCLUSIVE        = Equals<InitValueT, NullType>::VALUE,            // Inclusive scan if no init_value type is provided
+        BLOCK_THREADS       = AgentScanPolicyT::BLOCK_THREADS,
+        ITEMS_PER_THREAD    = AgentScanPolicyT::ITEMS_PER_THREAD,
+        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
+    };
+
+    // Parameterized BlockLoad type
+    typedef BlockLoad<
+            OutputT,
+            AgentScanPolicyT::BLOCK_THREADS,
+            AgentScanPolicyT::ITEMS_PER_THREAD,
+            AgentScanPolicyT::LOAD_ALGORITHM>
+        BlockLoadT;
+
+    // Parameterized BlockStore type
+    typedef BlockStore<
+            OutputT,
+            AgentScanPolicyT::BLOCK_THREADS,
+            AgentScanPolicyT::ITEMS_PER_THREAD,
+            AgentScanPolicyT::STORE_ALGORITHM>
+        BlockStoreT;
+
+    // Parameterized BlockScan type
+    typedef BlockScan<
+            OutputT,
+            AgentScanPolicyT::BLOCK_THREADS,
+            AgentScanPolicyT::SCAN_ALGORITHM>
+        BlockScanT;
+
+    // Callback type for obtaining tile prefix during block scan
+    typedef TilePrefixCallbackOp<
+            OutputT,
+            ScanOpT,
+            ScanTileStateT>
+        TilePrefixCallbackOpT;
+
+    // Stateful BlockScan prefix callback type for managing a running total while scanning consecutive tiles
+    typedef BlockScanRunningPrefixOp<
+            OutputT,
+            ScanOpT>
+        RunningPrefixCallbackOp;
+
+    // Shared memory type for this thread block
+    union _TempStorage
+    {
+        typename BlockLoadT::TempStorage    load;       // Smem needed for tile loading
+        typename BlockStoreT::TempStorage   store;      // Smem needed for tile storing
+
+        struct
+        {
+            typename TilePrefixCallbackOpT::TempStorage  prefix;     // Smem needed for cooperative prefix callback
+            typename BlockScanT::TempStorage             scan;       // Smem needed for tile scanning
+        };
+    };
+
+    // Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    _TempStorage&               temp_storage;       ///< Reference to temp_storage
+    WrappedInputIteratorT       d_in;               ///< Input data
+    OutputIteratorT             d_out;              ///< Output data
+    ScanOpT                     scan_op;            ///< Binary scan operator
+    InitValueT                  init_value;         ///< The init_value element for ScanOpT
+
+
+    //---------------------------------------------------------------------
+    // Block scan utility methods
+    //---------------------------------------------------------------------
+
+    /**
+     * Exclusive scan specialization (first tile)
+     */
+    __device__ __forceinline__
+    void ScanTile(
+        OutputT             (&items)[ITEMS_PER_THREAD],
+        OutputT             init_value,
+        ScanOpT             scan_op,
+        OutputT             &block_aggregate,
+        Int2Type<false>     /*is_inclusive*/)
+    {
+        BlockScanT(temp_storage.scan).ExclusiveScan(items, items, init_value, scan_op, block_aggregate);
+        block_aggregate = scan_op(init_value, block_aggregate);
+    }
+
+
+    /**
+     * Inclusive scan specialization (first tile)
+     */
+    __device__ __forceinline__
+    void ScanTile(
+        OutputT             (&items)[ITEMS_PER_THREAD],
+        InitValueT          /*init_value*/,
+        ScanOpT             scan_op,
+        OutputT             &block_aggregate,
+        Int2Type<true>      /*is_inclusive*/)
+    {
+        BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, block_aggregate);
+    }
+
+
+    /**
+     * Exclusive scan specialization (subsequent tiles)
+     */
+    template <typename PrefixCallback>
+    __device__ __forceinline__
+    void ScanTile(
+        OutputT             (&items)[ITEMS_PER_THREAD],
+        ScanOpT             scan_op,
+        PrefixCallback      &prefix_op,
+        Int2Type<false>     /*is_inclusive*/)
+    {
+        BlockScanT(temp_storage.scan).ExclusiveScan(items, items, scan_op, prefix_op);
+    }
+
+
+    /**
+     * Inclusive scan specialization (subsequent tiles)
+     */
+    template <typename PrefixCallback>
+    __device__ __forceinline__
+    void ScanTile(
+        OutputT             (&items)[ITEMS_PER_THREAD],
+        ScanOpT             scan_op,
+        PrefixCallback      &prefix_op,
+        Int2Type<true>      /*is_inclusive*/)
+    {
+        BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, prefix_op);
+    }
+
+
+    //---------------------------------------------------------------------
+    // Constructor
+    //---------------------------------------------------------------------
+
+    // Constructor
+    __device__ __forceinline__
+    AgentScan(
+        TempStorage&    temp_storage,       ///< Reference to temp_storage
+        InputIteratorT  d_in,               ///< Input data
+        OutputIteratorT d_out,              ///< Output data
+        ScanOpT         scan_op,            ///< Binary scan operator
+        InitValueT      init_value)         ///< Initial value to seed the exclusive scan
+    :
+        temp_storage(temp_storage.Alias()),
+        d_in(d_in),
+        d_out(d_out),
+        scan_op(scan_op),
+        init_value(init_value)
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Cooperatively scan a device-wide sequence of tiles with other CTAs
+    //---------------------------------------------------------------------
+
+    /**
+     * Process a tile of input (dynamic chained scan)
+     */
+    template <bool IS_LAST_TILE>            ///< Whether the current tile is the last tile
+    __device__ __forceinline__ void ConsumeTile(
+        OffsetT             num_remaining,      ///< Number of global input items remaining (including this tile)
+        int                 tile_idx,           ///< Tile index
+        OffsetT             tile_offset,        ///< Tile offset
+        ScanTileStateT&     tile_state)         ///< Global tile state descriptor
+    {
+        // Load items
+        OutputT items[ITEMS_PER_THREAD];
+
+        if (IS_LAST_TILE)
+            BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items, num_remaining);
+        else
+            BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items);
+
+        CTA_SYNC();
+
+        // Perform tile scan
+        if (tile_idx == 0)
+        {
+            // Scan first tile
+            OutputT block_aggregate;
+            ScanTile(items, init_value, scan_op, block_aggregate, Int2Type<IS_INCLUSIVE>());
+            if ((!IS_LAST_TILE) && (threadIdx.x == 0))
+                tile_state.SetInclusive(0, block_aggregate);
+        }
+        else
+        {
+            // Scan non-first tile
+            TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, scan_op, tile_idx);
+            ScanTile(items, scan_op, prefix_op, Int2Type<IS_INCLUSIVE>());
+        }
+
+        CTA_SYNC();
+
+        // Store items
+        if (IS_LAST_TILE)
+            BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items, num_remaining);
+        else
+            BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items);
+    }
+
+
+    /**
+     * Scan tiles of items as part of a dynamic chained scan
+     */
+    __device__ __forceinline__ void ConsumeRange(
+        int                 num_items,          ///< Total number of input items
+        ScanTileStateT&     tile_state,         ///< Global tile state descriptor
+        int                 start_tile)         ///< The starting tile for the current grid
+    {
+        // Blocks are launched in increasing order, so just assign one tile per block
+        int     tile_idx        = start_tile + blockIdx.x;          // Current tile index
+        OffsetT tile_offset     = OffsetT(TILE_ITEMS) * tile_idx;   // Global offset for the current tile
+        OffsetT num_remaining   = num_items - tile_offset;          // Remaining items (including this tile)
+
+        if (num_remaining > TILE_ITEMS)
+        {
+            // Not last tile
+            ConsumeTile<false>(num_remaining, tile_idx, tile_offset, tile_state);
+        }
+        else if (num_remaining > 0)
+        {
+            // Last tile
+            ConsumeTile<true>(num_remaining, tile_idx, tile_offset, tile_state);
+        }
+    }
+
+
+    //---------------------------------------------------------------------
+    // Scan a sequence of consecutive tiles (independent of other thread blocks)
+    //---------------------------------------------------------------------
+
+    /**
+     * Process a tile of input
+     */
+    template <
+        bool                        IS_FIRST_TILE,
+        bool                        IS_LAST_TILE>
+    __device__ __forceinline__ void ConsumeTile(
+        OffsetT                     tile_offset,                ///< Tile offset
+        RunningPrefixCallbackOp&    prefix_op,                  ///< Running prefix operator
+        int                         valid_items = TILE_ITEMS)   ///< Number of valid items in the tile
+    {
+        // Load items
+        OutputT items[ITEMS_PER_THREAD];
+
+        if (IS_LAST_TILE)
+            BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items, valid_items);
+        else
+            BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items);
+
+        CTA_SYNC();
+
+        // Block scan
+        if (IS_FIRST_TILE)
+        {
+            OutputT block_aggregate;
+            ScanTile(items, init_value, scan_op, block_aggregate, Int2Type<IS_INCLUSIVE>());
+            prefix_op.running_total = block_aggregate;
+        }
+        else
+        {
+            ScanTile(items, scan_op, prefix_op, Int2Type<IS_INCLUSIVE>());
+        }
+
+        CTA_SYNC();
+
+        // Store items
+        if (IS_LAST_TILE)
+            BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items, valid_items);
+        else
+            BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items);
+    }
+
+
+    /**
+     * Scan a consecutive share of input tiles
+     */
+    __device__ __forceinline__ void ConsumeRange(
+        OffsetT  range_offset,      ///< [in] Threadblock begin offset (inclusive)
+        OffsetT  range_end)         ///< [in] Threadblock end offset (exclusive)
+    {
+        BlockScanRunningPrefixOp<OutputT, ScanOpT> prefix_op(scan_op);
+
+        if (range_offset + TILE_ITEMS <= range_end)
+        {
+            // Consume first tile of input (full)
+            ConsumeTile<true, false>(range_offset, prefix_op);
+            range_offset += TILE_ITEMS;
+
+            // Consume subsequent full tiles of input
+            while (range_offset + TILE_ITEMS <= range_end)
+            {
+                ConsumeTile<false, false>(range_offset, prefix_op);
+                range_offset += TILE_ITEMS;
+            }
+
+            // Consume a partially-full tile
+            if (range_offset < range_end)
+            {
+                int valid_items = range_end - range_offset;
+                ConsumeTile<false, true>(range_offset, prefix_op, valid_items);
+            }
+        }
+        else
+        {
+            // Consume the first tile of input (partially-full)
+            int valid_items = range_end - range_offset;
+            ConsumeTile<true, true>(range_offset, prefix_op, valid_items);
+        }
+    }
+
+
+    /**
+     * Scan a consecutive share of input tiles, seeded with the specified prefix value
+     */
+    __device__ __forceinline__ void ConsumeRange(
+        OffsetT range_offset,                       ///< [in] Threadblock begin offset (inclusive)
+        OffsetT range_end,                          ///< [in] Threadblock end offset (exclusive)
+        OutputT prefix)                             ///< [in] The prefix to apply to the scan segment
+    {
+        BlockScanRunningPrefixOp<OutputT, ScanOpT> prefix_op(prefix, scan_op);
+
+        // Consume full tiles of input
+        while (range_offset + TILE_ITEMS <= range_end)
+        {
+            ConsumeTile<false, false>(range_offset, prefix_op);
+            range_offset += TILE_ITEMS;
+        }
+
+        // Consume a partially-full tile
+        if (range_offset < range_end)
+        {
+            int valid_items = range_end - range_offset;
+            ConsumeTile<false, true>(range_offset, prefix_op, valid_items);
+        }
+    }
+
+};
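+
+/**
+ * Editor's sketch: a hypothetical single-pass chained-scan kernel built on
+ * AgentScan.  Names and launch bookkeeping are assumptions for illustration;
+ * the real driver is the DeviceScan dispatch layer.
+ *
+ * \code
+ * template <typename PolicyT, typename T, typename ScanTileStateT>
+ * __global__ void ExampleScanSweepKernel(
+ *     const T *d_in, T *d_out, ScanTileStateT tile_state,
+ *     int start_tile, int num_items)
+ * {
+ *     // Exclusive sum: ScanOpT = Sum, InitValueT = T, OffsetT = int;
+ *     // tile_state is assumed to be AgentScanT::ScanTileStateT
+ *     typedef AgentScan<PolicyT, const T*, T*, Sum, T, int> AgentScanT;
+ *     __shared__ typename AgentScanT::TempStorage temp_storage;
+ *     AgentScanT(temp_storage, d_in, d_out, Sum(), T())
+ *         .ConsumeRange(num_items, tile_state, start_tile);
+ * }
+ * \endcode
+ */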
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/SRC/cub/agent/agent_segment_fixup.cuh b/SRC/cub/agent/agent_segment_fixup.cuh
new file mode 100644
index 00000000..e2de58ed
--- /dev/null
+++ b/SRC/cub/agent/agent_segment_fixup.cuh
@@ -0,0 +1,375 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::AgentSegmentFixup implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "single_pass_scan_operators.cuh"
+#include "../block/block_load.cuh"
+#include "../block/block_store.cuh"
+#include "../block/block_scan.cuh"
+#include "../block/block_discontinuity.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+#include "../iterator/constant_input_iterator.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for AgentSegmentFixup
+ */
+template <
+    int                         _BLOCK_THREADS,                 ///< Threads per thread block
+    int                         _ITEMS_PER_THREAD,              ///< Items per thread (per tile of input)
+    BlockLoadAlgorithm          _LOAD_ALGORITHM,                ///< The BlockLoad algorithm to use
+    CacheLoadModifier           _LOAD_MODIFIER,                 ///< Cache load modifier for reading input elements
+    BlockScanAlgorithm          _SCAN_ALGORITHM>                ///< The BlockScan algorithm to use
+struct AgentSegmentFixupPolicy
+{
+    enum
+    {
+        BLOCK_THREADS           = _BLOCK_THREADS,               ///< Threads per thread block
+        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,            ///< Items per thread (per tile of input)
+    };
+
+    static const BlockLoadAlgorithm     LOAD_ALGORITHM          = _LOAD_ALGORITHM;      ///< The BlockLoad algorithm to use
+    static const CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;       ///< Cache load modifier for reading input elements
+    static const BlockScanAlgorithm     SCAN_ALGORITHM          = _SCAN_ALGORITHM;      ///< The BlockScan algorithm to use
+};
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+/**
+ * \brief AgentSegmentFixup implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key
+ */
+template <
+    typename    AgentSegmentFixupPolicyT,       ///< Parameterized AgentSegmentFixupPolicy tuning policy type
+    typename    PairsInputIteratorT,            ///< Random-access input iterator type for keys
+    typename    AggregatesOutputIteratorT,      ///< Random-access output iterator type for values
+    typename    EqualityOpT,                    ///< KeyT equality operator type
+    typename    ReductionOpT,                   ///< ValueT reduction operator type
+    typename    OffsetT>                        ///< Signed integer type for global offsets
+struct AgentSegmentFixup
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    // Data type of key-value input iterator
+    typedef typename std::iterator_traits<PairsInputIteratorT>::value_type KeyValuePairT;
+
+    // Value type
+    typedef typename KeyValuePairT::Value ValueT;
+
+    // Tile status descriptor interface type
+    typedef ReduceByKeyScanTileState<ValueT, OffsetT> ScanTileStateT;
+
+    // Constants
+    enum
+    {
+        BLOCK_THREADS       = AgentSegmentFixupPolicyT::BLOCK_THREADS,
+        ITEMS_PER_THREAD    = AgentSegmentFixupPolicyT::ITEMS_PER_THREAD,
+        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
+
+        // Whether or not to do fixup using RLE + global atomics
+        USE_ATOMIC_FIXUP    = (CUB_PTX_ARCH >= 350) &&
+                                (Equals<ValueT, float>::VALUE ||
+                                 Equals<ValueT, int>::VALUE ||
+                                 Equals<ValueT, unsigned int>::VALUE ||
+                                 Equals<ValueT, unsigned long long>::VALUE),
+
+        // Whether or not the scan operation has a zero-valued identity value (true if we're performing addition on a primitive type)
+        HAS_IDENTITY_ZERO   = (Equals<ReductionOpT, cub::Sum>::VALUE) && (Traits<ValueT>::PRIMITIVE),
+    };
+
+    // Cache-modified Input iterator wrapper type (for applying cache modifier) for keys
+    typedef typename If<IsPointer<PairsInputIteratorT>::VALUE,
+            CacheModifiedInputIterator<AgentSegmentFixupPolicyT::LOAD_MODIFIER, KeyValuePairT, OffsetT>,    // Wrap the native input pointer with CacheModifiedInputIterator
+            PairsInputIteratorT>::Type                                                                      // Directly use the supplied input iterator type
+        WrappedPairsInputIteratorT;
+
+    // Cache-modified Input iterator wrapper type (for applying cache modifier) for fixup values
+    typedef typename If<IsPointer<AggregatesOutputIteratorT>::VALUE,
+            CacheModifiedInputIterator<AgentSegmentFixupPolicyT::LOAD_MODIFIER, ValueT, OffsetT>,   // Wrap the native input pointer with CacheModifiedInputIterator
+            AggregatesOutputIteratorT>::Type                                                        // Directly use the supplied input iterator type
+        WrappedFixupInputIteratorT;
+
+    // Reduce-value-by-segment scan operator
+    typedef ReduceByKeyOp<ReductionOpT> ReduceBySegmentOpT;
+
+    // Parameterized BlockLoad type for pairs
+    typedef BlockLoad<
+            KeyValuePairT,
+            BLOCK_THREADS,
+            ITEMS_PER_THREAD,
+            AgentSegmentFixupPolicyT::LOAD_ALGORITHM>
+        BlockLoadPairs;
+
+    // Parameterized BlockScan type
+    typedef BlockScan<
+            KeyValuePairT,
+            BLOCK_THREADS,
+            AgentSegmentFixupPolicyT::SCAN_ALGORITHM>
+        BlockScanT;
+
+    // Callback type for obtaining tile prefix during block scan
+    typedef TilePrefixCallbackOp<
+            KeyValuePairT,
+            ReduceBySegmentOpT,
+            ScanTileStateT>
+        TilePrefixCallbackOpT;
+
+    // Shared memory type for this thread block
+    union _TempStorage
+    {
+        struct
+        {
+            typename BlockScanT::TempStorage                scan;           // Smem needed for tile scanning
+            typename TilePrefixCallbackOpT::TempStorage     prefix;         // Smem needed for cooperative prefix callback
+        };
+
+        // Smem needed for loading keys
+        typename BlockLoadPairs::TempStorage load_pairs;
+    };
+
+    // Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    _TempStorage&                   temp_storage;       ///< Reference to temp_storage
+    WrappedPairsInputIteratorT      d_pairs_in;          ///< Input keys
+    AggregatesOutputIteratorT       d_aggregates_out;   ///< Output value aggregates
+    WrappedFixupInputIteratorT      d_fixup_in;         ///< Fixup input values
+    InequalityWrapper<EqualityOpT>  inequality_op;      ///< KeyT inequality operator
+    ReductionOpT                    reduction_op;       ///< Reduction operator
+    ReduceBySegmentOpT              scan_op;            ///< Reduce-by-segment scan operator
+
+
+    //---------------------------------------------------------------------
+    // Constructor
+    //---------------------------------------------------------------------
+
+    // Constructor
+    __device__ __forceinline__
+    AgentSegmentFixup(
+        TempStorage&                temp_storage,       ///< Reference to temp_storage
+        PairsInputIteratorT         d_pairs_in,          ///< Input keys
+        AggregatesOutputIteratorT   d_aggregates_out,   ///< Output value aggregates
+        EqualityOpT                 equality_op,        ///< KeyT equality operator
+        ReductionOpT                reduction_op)       ///< ValueT reduction operator
+    :
+        temp_storage(temp_storage.Alias()),
+        d_pairs_in(d_pairs_in),
+        d_aggregates_out(d_aggregates_out),
+        d_fixup_in(d_aggregates_out),
+        inequality_op(equality_op),
+        reduction_op(reduction_op),
+        scan_op(reduction_op)
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Cooperatively scan a device-wide sequence of tiles with other CTAs
+    //---------------------------------------------------------------------
+
+
+    /**
+     * Process input tile.  Specialized for atomic-fixup
+     */
+    template <bool IS_LAST_TILE>
+    __device__ __forceinline__ void ConsumeTile(
+        OffsetT             num_remaining,      ///< Number of global input items remaining (including this tile)
+        int                 tile_idx,           ///< Tile index
+        OffsetT             tile_offset,        ///< Tile offset
+        ScanTileStateT&     tile_state,         ///< Global tile state descriptor
+        Int2Type<true>      use_atomic_fixup)   ///< Marker whether to use atomicAdd (instead of reduce-by-key)
+    {
+        KeyValuePairT   pairs[ITEMS_PER_THREAD];
+
+        // Load pairs
+        KeyValuePairT oob_pair;
+        oob_pair.key = -1;
+
+        if (IS_LAST_TILE)
+            BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs, num_remaining, oob_pair);
+        else
+            BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs);
+
+        // RLE 
+        #pragma unroll
+        for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            ValueT* d_scatter = d_aggregates_out + pairs[ITEM - 1].key;
+            if (pairs[ITEM].key != pairs[ITEM - 1].key)
+                atomicAdd(d_scatter, pairs[ITEM - 1].value);
+            else
+                pairs[ITEM].value = reduction_op(pairs[ITEM - 1].value, pairs[ITEM].value);
+        }
+
+        // Flush last item if valid
+        ValueT* d_scatter = d_aggregates_out + pairs[ITEMS_PER_THREAD - 1].key;
+        if ((!IS_LAST_TILE) || (pairs[ITEMS_PER_THREAD - 1].key >= 0))
+            atomicAdd(d_scatter, pairs[ITEMS_PER_THREAD - 1].value);
+    }
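+
+    // Worked example for the atomic path above (illustrative, with cub::Sum):
+    // if one thread holds pairs {(3,a), (3,b), (7,c)}, the RLE loop folds
+    // (3,a) into (3,a+b) since the keys match, atomically adds a+b into
+    // d_aggregates_out[3] at the key change, and the trailing (7,c) is
+    // flushed into d_aggregates_out[7] by the final atomicAdd.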
+
+
+    /**
+     * Process input tile.  Specialized for reduce-by-key fixup
+     */
+    template <bool IS_LAST_TILE>
+    __device__ __forceinline__ void ConsumeTile(
+        OffsetT             num_remaining,      ///< Number of global input items remaining (including this tile)
+        int                 tile_idx,           ///< Tile index
+        OffsetT             tile_offset,        ///< Tile offset
+        ScanTileStateT&     tile_state,         ///< Global tile state descriptor
+        Int2Type<false>     use_atomic_fixup)   ///< Marker whether to use atomicAdd (instead of reduce-by-key)
+    {
+        KeyValuePairT   pairs[ITEMS_PER_THREAD];
+        KeyValuePairT   scatter_pairs[ITEMS_PER_THREAD];
+
+        // Load pairs
+        KeyValuePairT oob_pair;
+        oob_pair.key = -1;
+
+        if (IS_LAST_TILE)
+            BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs, num_remaining, oob_pair);
+        else
+            BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs);
+
+        CTA_SYNC();
+
+        KeyValuePairT tile_aggregate;
+        if (tile_idx == 0)
+        {
+            // Exclusive scan of values and segment_flags
+            BlockScanT(temp_storage.scan).ExclusiveScan(pairs, scatter_pairs, scan_op, tile_aggregate);
+
+            // Update tile status if this is not the last tile
+            if (threadIdx.x == 0)
+            {
+                // Set first segment id to not trigger a flush (invalid from exclusive scan)
+                scatter_pairs[0].key = pairs[0].key;
+
+                if (!IS_LAST_TILE)
+                    tile_state.SetInclusive(0, tile_aggregate);
+
+            }
+        }
+        else
+        {
+            // Exclusive scan of values and segment_flags
+            TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, scan_op, tile_idx);
+            BlockScanT(temp_storage.scan).ExclusiveScan(pairs, scatter_pairs, scan_op, prefix_op);
+            tile_aggregate = prefix_op.GetBlockAggregate();
+        }
+
+        // Scatter updated values
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            if (scatter_pairs[ITEM].key != pairs[ITEM].key)
+            {
+                // Update the value at the key location
+                ValueT value    = d_fixup_in[scatter_pairs[ITEM].key];
+                value           = reduction_op(value, scatter_pairs[ITEM].value);
+
+                d_aggregates_out[scatter_pairs[ITEM].key] = value;
+            }
+        }
+
+        // Finalize the last item
+        if (IS_LAST_TILE)
+        {
+            // Last thread will output final count and last item, if necessary
+            if (threadIdx.x == BLOCK_THREADS - 1)
+            {
+                // If the last tile is a whole tile, the inclusive prefix contains accumulated value reduction for the last segment
+                if (num_remaining == TILE_ITEMS)
+                {
+                    // Update the value at the key location
+                    OffsetT last_key = pairs[ITEMS_PER_THREAD - 1].key;
+                    d_aggregates_out[last_key] = reduction_op(tile_aggregate.value, d_fixup_in[last_key]);
+                }
+            }
+        }
+    }
+
+
+    /**
+     * Scan tiles of items as part of a dynamic chained scan
+     */
+    __device__ __forceinline__ void ConsumeRange(
+        int                 num_items,          ///< Total number of input items
+        int                 num_tiles,          ///< Total number of input tiles
+        ScanTileStateT&     tile_state)         ///< Global tile state descriptor
+    {
+        // Blocks are launched in increasing order, so just assign one tile per block
+        int     tile_idx        = (blockIdx.x * gridDim.y) + blockIdx.y;    // Current tile index
+        OffsetT tile_offset     = tile_idx * TILE_ITEMS;                    // Global offset for the current tile
+        OffsetT num_remaining   = num_items - tile_offset;                  // Remaining items (including this tile)
+
+        if (num_remaining > TILE_ITEMS)
+        {
+            // Not the last tile (full)
+            ConsumeTile<false>(num_remaining, tile_idx, tile_offset, tile_state, Int2Type<USE_ATOMIC_FIXUP>());
+        }
+        else if (num_remaining > 0)
+        {
+            // The last tile (possibly partially-full)
+            ConsumeTile<true>(num_remaining, tile_idx, tile_offset, tile_state, Int2Type<USE_ATOMIC_FIXUP>());
+        }
+    }
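+
+    // Note (illustrative): the 2D tile index above mirrors how the dispatch
+    // layer may launch the fixup kernel on a two-dimensional grid; with a
+    // plain 1D launch, gridDim.y == 1 and tile_idx reduces to blockIdx.x.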
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/SRC/cub/agent/agent_select_if.cuh b/SRC/cub/agent/agent_select_if.cuh
new file mode 100644
index 00000000..52ca9fc2
--- /dev/null
+++ b/SRC/cub/agent/agent_select_if.cuh
@@ -0,0 +1,703 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::AgentSelectIf implements a stateful abstraction of CUDA thread blocks for participating in device-wide select.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "single_pass_scan_operators.cuh"
+#include "../block/block_load.cuh"
+#include "../block/block_store.cuh"
+#include "../block/block_scan.cuh"
+#include "../block/block_exchange.cuh"
+#include "../block/block_discontinuity.cuh"
+#include "../grid/grid_queue.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy types
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for AgentSelectIf
+ */
+template <
+    int                         _BLOCK_THREADS,                 ///< Threads per thread block
+    int                         _ITEMS_PER_THREAD,              ///< Items per thread (per tile of input)
+    BlockLoadAlgorithm          _LOAD_ALGORITHM,                ///< The BlockLoad algorithm to use
+    CacheLoadModifier           _LOAD_MODIFIER,                 ///< Cache load modifier for reading input elements
+    BlockScanAlgorithm          _SCAN_ALGORITHM>                ///< The BlockScan algorithm to use
+struct AgentSelectIfPolicy
+{
+    enum
+    {
+        BLOCK_THREADS           = _BLOCK_THREADS,               ///< Threads per thread block
+        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,            ///< Items per thread (per tile of input)
+    };
+
+    static const BlockLoadAlgorithm     LOAD_ALGORITHM          = _LOAD_ALGORITHM;      ///< The BlockLoad algorithm to use
+    static const CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;       ///< Cache load modifier for reading input elements
+    static const BlockScanAlgorithm     SCAN_ALGORITHM          = _SCAN_ALGORITHM;      ///< The BlockScan algorithm to use
+};
+
+
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+
+/**
+ * \brief AgentSelectIf implements a stateful abstraction of CUDA thread blocks for participating in device-wide selection
+ *
+ * Performs functor-based selection if SelectOpT functor type != NullType
+ * Otherwise performs flag-based selection if FlagsInputIterator's value type != NullType
+ * Otherwise performs discontinuity selection (keep unique)
+ */
+template <
+    typename    AgentSelectIfPolicyT,           ///< Parameterized AgentSelectIfPolicy tuning policy type
+    typename    InputIteratorT,                 ///< Random-access input iterator type for selection items
+    typename    FlagsInputIteratorT,            ///< Random-access input iterator type for selections (NullType* if a selection functor or discontinuity flagging is to be used for selection)
+    typename    SelectedOutputIteratorT,        ///< Random-access input iterator type for selection_flags items
+    typename    SelectOpT,                      ///< Selection operator type (NullType if selections or discontinuity flagging is to be used for selection)
+    typename    EqualityOpT,                    ///< Equality operator type (NullType if selection functor or selections is to be used for selection)
+    typename    OffsetT,                        ///< Signed integer type for global offsets
+    bool        KEEP_REJECTS>                   ///< Whether or not we push rejected items to the back of the output
+struct AgentSelectIf
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    // The input value type
+    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+    // The output value type
+    typedef typename If<(Equals<typename std::iterator_traits<SelectedOutputIteratorT>::value_type, void>::VALUE),  // OutputT =  (if output iterator's value type is void) ?
+        typename std::iterator_traits<InputIteratorT>::value_type,                                                  // ... then the input iterator's value type,
+        typename std::iterator_traits<SelectedOutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+    // The flag value type
+    typedef typename std::iterator_traits<FlagsInputIteratorT>::value_type FlagT;
+
+    // Tile status descriptor interface type
+    typedef ScanTileState<OffsetT> ScanTileStateT;
+
+    // Constants
+    enum
+    {
+        USE_SELECT_OP,
+        USE_SELECT_FLAGS,
+        USE_DISCONTINUITY,
+
+        BLOCK_THREADS           = AgentSelectIfPolicyT::BLOCK_THREADS,
+        ITEMS_PER_THREAD        = AgentSelectIfPolicyT::ITEMS_PER_THREAD,
+        TILE_ITEMS              = BLOCK_THREADS * ITEMS_PER_THREAD,
+        TWO_PHASE_SCATTER       = (ITEMS_PER_THREAD > 1),
+
+        SELECT_METHOD           = (!Equals<SelectOpT, NullType>::VALUE) ?
+                                    USE_SELECT_OP :
+                                    (!Equals<FlagT, NullType>::VALUE) ?
+                                        USE_SELECT_FLAGS :
+                                        USE_DISCONTINUITY
+    };
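+
+    // Illustrative dispatch examples (ours): cub::DeviceSelect::If
+    // instantiates this agent with a real SelectOpT and NullType flags
+    // (SELECT_METHOD == USE_SELECT_OP); DeviceSelect::Flagged passes
+    // NullType for SelectOpT and a real flag iterator (USE_SELECT_FLAGS);
+    // DeviceSelect::Unique passes NullType for both, falling through to
+    // USE_DISCONTINUITY.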
+
+    // Cache-modified Input iterator wrapper type (for applying cache modifier) for items
+    typedef typename If<IsPointer<InputIteratorT>::VALUE,
+            CacheModifiedInputIterator<AgentSelectIfPolicyT::LOAD_MODIFIER, InputT, OffsetT>,   // Wrap the native input pointer with CacheModifiedValuesInputIterator
+            InputIteratorT>::Type                                                               // Directly use the supplied input iterator type
+        WrappedInputIteratorT;
+
+    // Cache-modified Input iterator wrapper type (for applying cache modifier) for values
+    typedef typename If<IsPointer<FlagsInputIteratorT>::VALUE,
+            CacheModifiedInputIterator<AgentSelectIfPolicyT::LOAD_MODIFIER, FlagT, OffsetT>,    // Wrap the native input pointer with CacheModifiedValuesInputIterator
+            FlagsInputIteratorT>::Type                                                          // Directly use the supplied input iterator type
+        WrappedFlagsInputIteratorT;
+
+    // Parameterized BlockLoad type for input data
+    typedef BlockLoad<
+            OutputT,
+            BLOCK_THREADS,
+            ITEMS_PER_THREAD,
+            AgentSelectIfPolicyT::LOAD_ALGORITHM>
+        BlockLoadT;
+
+    // Parameterized BlockLoad type for flags
+    typedef BlockLoad<
+            FlagT,
+            BLOCK_THREADS,
+            ITEMS_PER_THREAD,
+            AgentSelectIfPolicyT::LOAD_ALGORITHM>
+        BlockLoadFlags;
+
+    // Parameterized BlockDiscontinuity type for items
+    typedef BlockDiscontinuity<
+            OutputT,
+            BLOCK_THREADS>
+        BlockDiscontinuityT;
+
+    // Parameterized BlockScan type
+    typedef BlockScan<
+            OffsetT,
+            BLOCK_THREADS,
+            AgentSelectIfPolicyT::SCAN_ALGORITHM>
+        BlockScanT;
+
+    // Callback type for obtaining tile prefix during block scan
+    typedef TilePrefixCallbackOp<
+            OffsetT,
+            cub::Sum,
+            ScanTileStateT>
+        TilePrefixCallbackOpT;
+
+    // Item exchange type
+    typedef OutputT ItemExchangeT[TILE_ITEMS];
+
+    // Shared memory type for this thread block
+    union _TempStorage
+    {
+        struct
+        {
+            typename BlockScanT::TempStorage                scan;           // Smem needed for tile scanning
+            typename TilePrefixCallbackOpT::TempStorage     prefix;         // Smem needed for cooperative prefix callback
+            typename BlockDiscontinuityT::TempStorage       discontinuity;  // Smem needed for discontinuity detection
+        };
+
+        // Smem needed for loading items
+        typename BlockLoadT::TempStorage load_items;
+
+        // Smem needed for loading values
+        typename BlockLoadFlags::TempStorage load_flags;
+
+        // Smem needed for compacting items (allows non POD items in this union)
+        Uninitialized<ItemExchangeT> raw_exchange;
+    };
+
+    // Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+    _TempStorage&                   temp_storage;       ///< Reference to temp_storage
+    WrappedInputIteratorT           d_in;               ///< Input items
+    SelectedOutputIteratorT         d_selected_out;     ///< Unique output items
+    WrappedFlagsInputIteratorT      d_flags_in;         ///< Input selection flags (if applicable)
+    InequalityWrapper<EqualityOpT>  inequality_op;      ///< T inequality operator
+    SelectOpT                       select_op;          ///< Selection operator
+    OffsetT                         num_items;          ///< Total number of input items
+
+
+    //---------------------------------------------------------------------
+    // Constructor
+    //---------------------------------------------------------------------
+
+    // Constructor
+    __device__ __forceinline__
+    AgentSelectIf(
+        TempStorage                 &temp_storage,      ///< Reference to temp_storage
+        InputIteratorT              d_in,               ///< Input data
+        FlagsInputIteratorT         d_flags_in,         ///< Input selection flags (if applicable)
+        SelectedOutputIteratorT     d_selected_out,     ///< Output data
+        SelectOpT                   select_op,          ///< Selection operator
+        EqualityOpT                 equality_op,        ///< Equality operator
+        OffsetT                     num_items)          ///< Total number of input items
+    :
+        temp_storage(temp_storage.Alias()),
+        d_in(d_in),
+        d_flags_in(d_flags_in),
+        d_selected_out(d_selected_out),
+        select_op(select_op),
+        inequality_op(equality_op),
+        num_items(num_items)
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Utility methods for initializing the selections
+    //---------------------------------------------------------------------
+
+    /**
+     * Initialize selections (specialized for selection operator)
+     */
+    template <bool IS_FIRST_TILE, bool IS_LAST_TILE>
+    __device__ __forceinline__ void InitializeSelections(
+        OffsetT                     /*tile_offset*/,
+        OffsetT                     num_tile_items,
+        OutputT                     (&items)[ITEMS_PER_THREAD],
+        OffsetT                     (&selection_flags)[ITEMS_PER_THREAD],
+        Int2Type<USE_SELECT_OP>     /*select_method*/)
+    {
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            // Out-of-bounds items are selection_flags
+            selection_flags[ITEM] = 1;
+
+            if (!IS_LAST_TILE || (OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM < num_tile_items))
+                selection_flags[ITEM] = select_op(items[ITEM]);
+        }
+    }
+
+
+    /**
+     * Initialize selections (specialized for valid flags)
+     */
+    template <bool IS_FIRST_TILE, bool IS_LAST_TILE>
+    __device__ __forceinline__ void InitializeSelections(
+        OffsetT                     tile_offset,
+        OffsetT                     num_tile_items,
+        OutputT                     (&/*items*/)[ITEMS_PER_THREAD],
+        OffsetT                     (&selection_flags)[ITEMS_PER_THREAD],
+        Int2Type<USE_SELECT_FLAGS>  /*select_method*/)
+    {
+        CTA_SYNC();
+
+        FlagT flags[ITEMS_PER_THREAD];
+
+        if (IS_LAST_TILE)
+        {
+            // Out-of-bounds items are selection_flags
+            BlockLoadFlags(temp_storage.load_flags).Load(d_flags_in + tile_offset, flags, num_tile_items, 1);
+        }
+        else
+        {
+            BlockLoadFlags(temp_storage.load_flags).Load(d_flags_in + tile_offset, flags);
+        }
+
+        // Convert flag type to selection_flags type
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            selection_flags[ITEM] = flags[ITEM];
+        }
+    }
+
+
+    /**
+     * Initialize selections (specialized for discontinuity detection)
+     */
+    template <bool IS_FIRST_TILE, bool IS_LAST_TILE>
+    __device__ __forceinline__ void InitializeSelections(
+        OffsetT                     tile_offset,
+        OffsetT                     num_tile_items,
+        OutputT                     (&items)[ITEMS_PER_THREAD],
+        OffsetT                     (&selection_flags)[ITEMS_PER_THREAD],
+        Int2Type<USE_DISCONTINUITY> /*select_method*/)
+    {
+        if (IS_FIRST_TILE)
+        {
+            CTA_SYNC();
+
+            // Set head selection_flags.  First tile sets the first flag for the first item
+            BlockDiscontinuityT(temp_storage.discontinuity).FlagHeads(selection_flags, items, inequality_op);
+        }
+        else
+        {
+            OutputT tile_predecessor;
+            if (threadIdx.x == 0)
+                tile_predecessor = d_in[tile_offset - 1];
+
+            CTA_SYNC();
+
+            BlockDiscontinuityT(temp_storage.discontinuity).FlagHeads(selection_flags, items, inequality_op, tile_predecessor);
+        }
+
+        // Set selection flags for out-of-bounds items
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            // Set selection_flags for out-of-bounds items
+            if ((IS_LAST_TILE) && (OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM >= num_tile_items))
+                selection_flags[ITEM] = 1;
+        }
+    }
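+
+    // Example (illustrative): for tile items [5, 5, 7, 7, 7, 9] the
+    // discontinuity path yields selection_flags [1, 0, 1, 0, 0, 1] -- one
+    // "keep" per run of equal values, which is exactly the selection that
+    // DeviceSelect::Unique needs.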
+
+
+    //---------------------------------------------------------------------
+    // Scatter utility methods
+    //---------------------------------------------------------------------
+
+    /**
+     * Scatter flagged items to output offsets (specialized for direct scattering)
+     */
+    template <bool IS_LAST_TILE>
+    __device__ __forceinline__ void ScatterDirect(
+        OutputT (&items)[ITEMS_PER_THREAD],
+        OffsetT (&selection_flags)[ITEMS_PER_THREAD],
+        OffsetT (&selection_indices)[ITEMS_PER_THREAD],
+        OffsetT num_selections)
+    {
+        // Scatter flagged items
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            if (selection_flags[ITEM])
+            {
+                if ((!IS_LAST_TILE) || selection_indices[ITEM] < num_selections)
+                {
+                    d_selected_out[selection_indices[ITEM]] = items[ITEM];
+                }
+            }
+        }
+    }
+
+
+    /**
+     * Scatter flagged items to output offsets (specialized for two-phase scattering)
+     */
+    template <bool IS_LAST_TILE>
+    __device__ __forceinline__ void ScatterTwoPhase(
+        OutputT         (&items)[ITEMS_PER_THREAD],
+        OffsetT         (&selection_flags)[ITEMS_PER_THREAD],
+        OffsetT         (&selection_indices)[ITEMS_PER_THREAD],
+        int             /*num_tile_items*/,                         ///< Number of valid items in this tile
+        int             num_tile_selections,                        ///< Number of selections in this tile
+        OffsetT         num_selections_prefix,                      ///< Total number of selections prior to this tile
+        OffsetT         /*num_rejected_prefix*/,                    ///< Total number of rejections prior to this tile
+        Int2Type<false> /*is_keep_rejects*/)                        ///< Marker type indicating whether to keep rejected items in the second partition
+    {
+        CTA_SYNC();
+
+        // Compact and scatter items
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            int local_scatter_offset = selection_indices[ITEM] - num_selections_prefix;
+            if (selection_flags[ITEM])
+            {
+                temp_storage.raw_exchange.Alias()[local_scatter_offset] = items[ITEM];
+            }
+        }
+
+        CTA_SYNC();
+
+        for (int item = threadIdx.x; item < num_tile_selections; item += BLOCK_THREADS)
+        {
+            d_selected_out[num_selections_prefix + item] = temp_storage.raw_exchange.Alias()[item];
+        }
+    }
+
+
+    /**
+     * Scatter flagged items to output offsets (specialized for two-phase scattering)
+     */
+    template <bool IS_LAST_TILE>
+    __device__ __forceinline__ void ScatterTwoPhase(
+        OutputT         (&items)[ITEMS_PER_THREAD],
+        OffsetT         (&selection_flags)[ITEMS_PER_THREAD],
+        OffsetT         (&selection_indices)[ITEMS_PER_THREAD],
+        int             num_tile_items,                             ///< Number of valid items in this tile
+        int             num_tile_selections,                        ///< Number of selections in this tile
+        OffsetT         num_selections_prefix,                      ///< Total number of selections prior to this tile
+        OffsetT         num_rejected_prefix,                        ///< Total number of rejections prior to this tile
+        Int2Type<true>  /*is_keep_rejects*/)                        ///< Marker type indicating whether to keep rejected items in the second partition
+    {
+        CTA_SYNC();
+
+        int tile_num_rejections = num_tile_items - num_tile_selections;
+
+        // Scatter items to shared memory (rejections first)
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            int item_idx                = (threadIdx.x * ITEMS_PER_THREAD) + ITEM;
+            int local_selection_idx     = selection_indices[ITEM] - num_selections_prefix;
+            int local_rejection_idx     = item_idx - local_selection_idx;
+            int local_scatter_offset    = (selection_flags[ITEM]) ?
+                                            tile_num_rejections + local_selection_idx :
+                                            local_rejection_idx;
+
+            temp_storage.raw_exchange.Alias()[local_scatter_offset] = items[ITEM];
+        }
+
+        CTA_SYNC();
+
+        // Gather items from shared memory and scatter to global
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            int item_idx            = (ITEM * BLOCK_THREADS) + threadIdx.x;
+            int rejection_idx       = item_idx;
+            int selection_idx       = item_idx - tile_num_rejections;
+            OffsetT scatter_offset  = (item_idx < tile_num_rejections) ?
+                                        num_items - num_rejected_prefix - rejection_idx - 1 :
+                                        num_selections_prefix + selection_idx;
+
+            OutputT item = temp_storage.raw_exchange.Alias()[item_idx];
+
+            if (!IS_LAST_TILE || (item_idx < num_tile_items))
+            {
+                d_selected_out[scatter_offset] = item;
+            }
+        }
+    }
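+
+    // Worked example for the partitioning variant above (illustrative):
+    // with 4 items [a, b, c, d] and flags [1, 0, 0, 1] the tile has 2
+    // selections and 2 rejections. Shared memory is filled rejections-first
+    // as [b, c, a, d]; on the gather pass, a and d stream out to
+    // d_selected_out[num_selections_prefix + 0..1] while b and c are written
+    // backwards from d_selected_out[num_items - num_rejected_prefix - 1],
+    // so the rejected partition accumulates at the tail in reverse order.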
+
+
+    /**
+     * Scatter flagged items
+     */
+    template <bool IS_LAST_TILE>
+    __device__ __forceinline__ void Scatter(
+        OutputT         (&items)[ITEMS_PER_THREAD],
+        OffsetT         (&selection_flags)[ITEMS_PER_THREAD],
+        OffsetT         (&selection_indices)[ITEMS_PER_THREAD],
+        int             num_tile_items,                             ///< Number of valid items in this tile
+        int             num_tile_selections,                        ///< Number of selections in this tile
+        OffsetT         num_selections_prefix,                      ///< Total number of selections prior to this tile
+        OffsetT         num_rejected_prefix,                        ///< Total number of rejections prior to this tile
+        OffsetT         num_selections)                             ///< Total number of selections including this tile
+    {
+        // Do a two-phase scatter if (a) keeping both partitions or (b) two-phase is enabled and the average number of selection_flags items per thread is greater than one
+        if (KEEP_REJECTS || (TWO_PHASE_SCATTER && (num_tile_selections > BLOCK_THREADS)))
+        {
+            ScatterTwoPhase<IS_LAST_TILE>(
+                items,
+                selection_flags,
+                selection_indices,
+                num_tile_items,
+                num_tile_selections,
+                num_selections_prefix,
+                num_rejected_prefix,
+                Int2Type<KEEP_REJECTS>());
+        }
+        else
+        {
+            ScatterDirect<IS_LAST_TILE>(
+                items,
+                selection_flags,
+                selection_indices,
+                num_selections);
+        }
+    }
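+
+    // Design note (illustrative): direct scatter is preferred when selections
+    // are sparse (at most about one survivor per thread on average), since
+    // its scattered writes are cheap relative to an extra shared-memory round
+    // trip; the two-phase path compacts survivors through shared memory first
+    // so that the global writes become contiguous.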
+
+    //---------------------------------------------------------------------
+    // Cooperatively scan a device-wide sequence of tiles with other CTAs
+    //---------------------------------------------------------------------
+
+
+    /**
+     * Process first tile of input (dynamic chained scan).  Returns the running count of selections (including this tile)
+     */
+    template <bool IS_LAST_TILE>
+    __device__ __forceinline__ OffsetT ConsumeFirstTile(
+        int                 num_tile_items,      ///< Number of input items comprising this tile
+        OffsetT             tile_offset,        ///< Tile offset
+        ScanTileStateT&     tile_state)         ///< Global tile state descriptor
+    {
+        OutputT     items[ITEMS_PER_THREAD];
+        OffsetT     selection_flags[ITEMS_PER_THREAD];
+        OffsetT     selection_indices[ITEMS_PER_THREAD];
+
+        // Load items
+        if (IS_LAST_TILE)
+            BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items, num_tile_items);
+        else
+            BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items);
+
+        // Initialize selection_flags
+        InitializeSelections<true, IS_LAST_TILE>(
+            tile_offset,
+            num_tile_items,
+            items,
+            selection_flags,
+            Int2Type());
+
+        CTA_SYNC();
+
+        // Exclusive scan of selection_flags
+        OffsetT num_tile_selections;
+        BlockScanT(temp_storage.scan).ExclusiveSum(selection_flags, selection_indices, num_tile_selections);
+
+        if (threadIdx.x == 0)
+        {
+            // Update tile status if this is not the last tile
+            if (!IS_LAST_TILE)
+                tile_state.SetInclusive(0, num_tile_selections);
+        }
+
+        // Discount any out-of-bounds selections
+        if (IS_LAST_TILE)
+            num_tile_selections -= (TILE_ITEMS - num_tile_items);
+
+        // Scatter flagged items
+        Scatter<IS_LAST_TILE>(
+            items,
+            selection_flags,
+            selection_indices,
+            num_tile_items,
+            num_tile_selections,
+            0,
+            0,
+            num_tile_selections);
+
+        return num_tile_selections;
+    }
+
+
+    /**
+     * Process subsequent tile of input (dynamic chained scan).  Returns the running count of selections (including this tile)
+     */
+    template <bool IS_LAST_TILE>
+    __device__ __forceinline__ OffsetT ConsumeSubsequentTile(
+        int                 num_tile_items,      ///< Number of input items comprising this tile
+        int                 tile_idx,           ///< Tile index
+        OffsetT             tile_offset,        ///< Tile offset
+        ScanTileStateT&     tile_state)         ///< Global tile state descriptor
+    {
+        OutputT     items[ITEMS_PER_THREAD];
+        OffsetT     selection_flags[ITEMS_PER_THREAD];
+        OffsetT     selection_indices[ITEMS_PER_THREAD];
+
+        // Load items
+        if (IS_LAST_TILE)
+            BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items, num_tile_items);
+        else
+            BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items);
+
+        // Initialize selection_flags
+        InitializeSelections<false, IS_LAST_TILE>(
+            tile_offset,
+            num_tile_items,
+            items,
+            selection_flags,
+            Int2Type());
+
+        CTA_SYNC();
+
+        // Exclusive scan of values and selection_flags
+        TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, cub::Sum(), tile_idx);
+        BlockScanT(temp_storage.scan).ExclusiveSum(selection_flags, selection_indices, prefix_op);
+
+        OffsetT num_tile_selections     = prefix_op.GetBlockAggregate();
+        OffsetT num_selections          = prefix_op.GetInclusivePrefix();
+        OffsetT num_selections_prefix   = prefix_op.GetExclusivePrefix();
+        OffsetT num_rejected_prefix     = (tile_idx * TILE_ITEMS) - num_selections_prefix;
+
+        // Discount any out-of-bounds selections
+        if (IS_LAST_TILE)
+        {
+            int num_discount    = TILE_ITEMS - num_tile_items;
+            num_selections      -= num_discount;
+            num_tile_selections -= num_discount;
+        }
+
+        // Scatter flagged items
+        Scatter<IS_LAST_TILE>(
+            items,
+            selection_flags,
+            selection_indices,
+            num_tile_items,
+            num_tile_selections,
+            num_selections_prefix,
+            num_rejected_prefix,
+            num_selections);
+
+        return num_selections;
+    }
+
+
+    /**
+     * Process a tile of input
+     */
+    template <bool IS_LAST_TILE>
+    __device__ __forceinline__ OffsetT ConsumeTile(
+        int                 num_tile_items,         ///< Number of input items comprising this tile
+        int                 tile_idx,           ///< Tile index
+        OffsetT             tile_offset,        ///< Tile offset
+        ScanTileStateT&     tile_state)         ///< Global tile state descriptor
+    {
+        OffsetT num_selections;
+        if (tile_idx == 0)
+        {
+            num_selections = ConsumeFirstTile<IS_LAST_TILE>(num_tile_items, tile_offset, tile_state);
+        }
+        else
+        {
+            num_selections = ConsumeSubsequentTile<IS_LAST_TILE>(num_tile_items, tile_idx, tile_offset, tile_state);
+        }
+
+        return num_selections;
+    }
+
+
+    /**
+     * Scan tiles of items as part of a dynamic chained scan
+     */
+    template <typename NumSelectedIteratorT>        ///< Output iterator type for recording number of items selection_flags
+    __device__ __forceinline__ void ConsumeRange(
+        int                     num_tiles,          ///< Total number of input tiles
+        ScanTileStateT&         tile_state,         ///< Global tile state descriptor
+        NumSelectedIteratorT    d_num_selected_out) ///< Output total number selection_flags
+    {
+        // Blocks are launched in increasing order, so just assign one tile per block
+        int     tile_idx        = (blockIdx.x * gridDim.y) + blockIdx.y;    // Current tile index
+        OffsetT tile_offset     = tile_idx * TILE_ITEMS;                    // Global offset for the current tile
+
+        if (tile_idx < num_tiles - 1)
+        {
+            // Not the last tile (full)
+            ConsumeTile<false>(TILE_ITEMS, tile_idx, tile_offset, tile_state);
+        }
+        else
+        {
+            // The last tile (possibly partially-full)
+            OffsetT num_remaining   = num_items - tile_offset;
+            OffsetT num_selections  = ConsumeTile<true>(num_remaining, tile_idx, tile_offset, tile_state);
+
+            if (threadIdx.x == 0)
+            {
+                // Output the total number of items selection_flags
+                *d_num_selected_out = num_selections;
+            }
+        }
+    }
+
+};
+
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/SRC/cub/agent/agent_spmv_orig.cuh b/SRC/cub/agent/agent_spmv_orig.cuh
new file mode 100644
index 00000000..54e2a139
--- /dev/null
+++ b/SRC/cub/agent/agent_spmv_orig.cuh
@@ -0,0 +1,670 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::AgentSpmv implements a stateful abstraction of CUDA thread blocks for participating in device-wide SpMV.
+ */
+
+#pragma once
+
+#include <iterator>
+
+#include "../util_type.cuh"
+#include "../block/block_reduce.cuh"
+#include "../block/block_scan.cuh"
+#include "../block/block_exchange.cuh"
+#include "../thread/thread_search.cuh"
+#include "../thread/thread_operators.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
+#include "../iterator/counting_input_iterator.cuh"
+#include "../iterator/tex_ref_input_iterator.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/******************************************************************************
+ * Tuning policy
+ ******************************************************************************/
+
+/**
+ * Parameterizable tuning policy type for AgentSpmv
+ */
+template <
+    int                             _BLOCK_THREADS,                         ///< Threads per thread block
+    int                             _ITEMS_PER_THREAD,                      ///< Items per thread (per tile of input)
+    CacheLoadModifier               _ROW_OFFSETS_SEARCH_LOAD_MODIFIER,      ///< Cache load modifier for reading CSR row-offsets during search
+    CacheLoadModifier               _ROW_OFFSETS_LOAD_MODIFIER,             ///< Cache load modifier for reading CSR row-offsets
+    CacheLoadModifier               _COLUMN_INDICES_LOAD_MODIFIER,          ///< Cache load modifier for reading CSR column-indices
+    CacheLoadModifier               _VALUES_LOAD_MODIFIER,                  ///< Cache load modifier for reading CSR values
+    CacheLoadModifier               _VECTOR_VALUES_LOAD_MODIFIER,           ///< Cache load modifier for reading vector values
+    bool                            _DIRECT_LOAD_NONZEROS,                  ///< Whether to load nonzeros directly from global during sequential merging (vs. pre-staged through shared memory)
+    BlockScanAlgorithm              _SCAN_ALGORITHM>                        ///< The BlockScan algorithm to use
+struct AgentSpmvPolicy
+{
+    enum
+    {
+        BLOCK_THREADS                                                   = _BLOCK_THREADS,                       ///< Threads per thread block
+        ITEMS_PER_THREAD                                                = _ITEMS_PER_THREAD,                    ///< Items per thread (per tile of input)
+        DIRECT_LOAD_NONZEROS                                            = _DIRECT_LOAD_NONZEROS,                ///< Whether to load nonzeros directly from global during sequential merging (pre-staged through shared memory)
+    };
+
+    static const CacheLoadModifier  ROW_OFFSETS_SEARCH_LOAD_MODIFIER    = _ROW_OFFSETS_SEARCH_LOAD_MODIFIER;    ///< Cache load modifier for reading CSR row-offsets
+    static const CacheLoadModifier  ROW_OFFSETS_LOAD_MODIFIER           = _ROW_OFFSETS_LOAD_MODIFIER;           ///< Cache load modifier for reading CSR row-offsets
+    static const CacheLoadModifier  COLUMN_INDICES_LOAD_MODIFIER        = _COLUMN_INDICES_LOAD_MODIFIER;        ///< Cache load modifier for reading CSR column-indices
+    static const CacheLoadModifier  VALUES_LOAD_MODIFIER                = _VALUES_LOAD_MODIFIER;                ///< Cache load modifier for reading CSR values
+    static const CacheLoadModifier  VECTOR_VALUES_LOAD_MODIFIER         = _VECTOR_VALUES_LOAD_MODIFIER;         ///< Cache load modifier for reading vector values
+    static const BlockScanAlgorithm SCAN_ALGORITHM                      = _SCAN_ALGORITHM;                      ///< The BlockScan algorithm to use
+
+};
+
+
+/******************************************************************************
+ * Thread block abstractions
+ ******************************************************************************/
+
+template <
+    typename        ValueT,              ///< Matrix and vector value type
+    typename        OffsetT>             ///< Signed integer type for sequence offsets
+struct SpmvParams
+{
+    ValueT*         d_values;            ///< Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix A.
+    OffsetT*        d_row_end_offsets;   ///< Pointer to the array of \p m offsets demarcating the end of every row in \p d_column_indices and \p d_values
+    OffsetT*        d_column_indices;    ///< Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix A.  (Indices are zero-valued.)
+    ValueT*         d_vector_x;          ///< Pointer to the array of \p num_cols values corresponding to the dense input vector x
+    ValueT*         d_vector_y;          ///< Pointer to the array of \p num_rows values corresponding to the dense output vector y
+    int             num_rows;            ///< Number of rows of matrix A.
+    int             num_cols;            ///< Number of columns of matrix A.
+    int             num_nonzeros;        ///< Number of nonzero elements of matrix A.
+    ValueT          alpha;               ///< Alpha multiplicand
+    ValueT          beta;                ///< Beta addend-multiplicand
+
+    TexRefInputIterator<ValueT, 66778899, OffsetT>  t_vector_x;
+};
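+
+// Illustrative parameter setup (ours; assumes the device arrays are already
+// populated): the 2x3 CSR matrix {{1,0,2},{0,3,0}} has num_nonzeros = 3,
+// d_values = {1,2,3}, d_column_indices = {0,2,1}, and d_row_end_offsets =
+// {2,3}, i.e. the usual CSR row_offsets array {0,2,3} without its leading
+// entry. With alpha = 1 and beta = 0 the agent computes y = A*x.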
+
+
+/**
+ * \brief AgentSpmv implements a stateful abstraction of CUDA thread blocks for participating in device-wide SpMV.
+ */
+template <
+    typename    AgentSpmvPolicyT,           ///< Parameterized AgentSpmvPolicy tuning policy type
+    typename    ValueT,                     ///< Matrix and vector value type
+    typename    OffsetT,                    ///< Signed integer type for sequence offsets
+    bool        HAS_ALPHA,                  ///< Whether the input parameter \p alpha is 1
+    bool        HAS_BETA,                   ///< Whether the input parameter \p beta is 0
+    int         PTX_ARCH = CUB_PTX_ARCH>    ///< PTX compute capability
+struct AgentSpmv
+{
+    //---------------------------------------------------------------------
+    // Types and constants
+    //---------------------------------------------------------------------
+
+    /// Constants
+    enum
+    {
+        BLOCK_THREADS           = AgentSpmvPolicyT::BLOCK_THREADS,
+        ITEMS_PER_THREAD        = AgentSpmvPolicyT::ITEMS_PER_THREAD,
+        TILE_ITEMS              = BLOCK_THREADS * ITEMS_PER_THREAD,
+    };
+
+    /// 2D merge path coordinate type
+    typedef typename CubVector<OffsetT, 2>::Type CoordinateT;
+
+    /// Input iterator wrapper types (for applying cache modifiers)
+
+    typedef CacheModifiedInputIterator<
+            AgentSpmvPolicyT::ROW_OFFSETS_SEARCH_LOAD_MODIFIER,
+            OffsetT,
+            OffsetT>
+        RowOffsetsSearchIteratorT;
+
+    typedef CacheModifiedInputIterator<
+            AgentSpmvPolicyT::ROW_OFFSETS_LOAD_MODIFIER,
+            OffsetT,
+            OffsetT>
+        RowOffsetsIteratorT;
+
+    typedef CacheModifiedInputIterator<
+            AgentSpmvPolicyT::COLUMN_INDICES_LOAD_MODIFIER,
+            OffsetT,
+            OffsetT>
+        ColumnIndicesIteratorT;
+
+    typedef CacheModifiedInputIterator<
+            AgentSpmvPolicyT::VALUES_LOAD_MODIFIER,
+            ValueT,
+            OffsetT>
+        ValueIteratorT;
+
+    typedef CacheModifiedInputIterator<
+            AgentSpmvPolicyT::VECTOR_VALUES_LOAD_MODIFIER,
+            ValueT,
+            OffsetT>
+        VectorValueIteratorT;
+
+    // Tuple type for scanning (pairs accumulated segment-value with segment-index)
+    typedef KeyValuePair<OffsetT, ValueT> KeyValuePairT;
+
+    // Reduce-value-by-segment scan operator
+    typedef ReduceByKeyOp<cub::Sum> ReduceBySegmentOpT;
+
+    // BlockReduce specialization
+    typedef BlockReduce<
+            ValueT,
+            BLOCK_THREADS,
+            BLOCK_REDUCE_WARP_REDUCTIONS>
+        BlockReduceT;
+
+    // BlockScan specialization
+    typedef BlockScan<
+            KeyValuePairT,
+            BLOCK_THREADS,
+            AgentSpmvPolicyT::SCAN_ALGORITHM>
+        BlockScanT;
+
+    // BlockScan specialization
+    typedef BlockScan<
+            ValueT,
+            BLOCK_THREADS,
+            AgentSpmvPolicyT::SCAN_ALGORITHM>
+        BlockPrefixSumT;
+
+    // BlockExchange specialization
+    typedef BlockExchange<
+            ValueT,
+            BLOCK_THREADS,
+            ITEMS_PER_THREAD>
+        BlockExchangeT;
+
+    /// Merge item type (either a non-zero value or a row-end offset)
+    union MergeItem
+    {
+        // Value type to pair with index type OffsetT (NullType if loading values directly during merge)
+        typedef typename If<AgentSpmvPolicyT::DIRECT_LOAD_NONZEROS, NullType, ValueT>::Type MergeValueT;
+
+        OffsetT     row_end_offset;
+        MergeValueT nonzero;
+    };
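+
+    // Note (illustrative): the union lets a single shared-memory allocation
+    // hold both the row-end offsets and the staged nonzeros; when
+    // DIRECT_LOAD_NONZEROS is true, MergeValueT collapses to NullType and no
+    // shared memory is spent staging values at all.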
+
+    /// Shared memory type required by this thread block
+    struct _TempStorage
+    {
+        CoordinateT tile_coords[2];
+
+        union Aliasable
+        {
+            // Smem needed for tile of merge items
+            MergeItem merge_items[ITEMS_PER_THREAD + TILE_ITEMS + 1];
+
+            // Smem needed for block exchange
+            typename BlockExchangeT::TempStorage exchange;
+
+            // Smem needed for block-wide reduction
+            typename BlockReduceT::TempStorage reduce;
+
+            // Smem needed for tile scanning
+            typename BlockScanT::TempStorage scan;
+
+            // Smem needed for tile prefix sum
+            typename BlockPrefixSumT::TempStorage prefix_sum;
+
+        } aliasable;
+    };
+
+    /// Temporary storage type (unionable)
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    //---------------------------------------------------------------------
+    // Per-thread fields
+    //---------------------------------------------------------------------
+
+
+    _TempStorage&                   temp_storage;         /// Reference to temp_storage
+
+    SpmvParams<ValueT, OffsetT>&    spmv_params;
+
+    ValueIteratorT                  wd_values;            ///< Wrapped pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix A.
+    RowOffsetsIteratorT             wd_row_end_offsets;   ///< Wrapped Pointer to the array of \p m offsets demarcating the end of every row in \p d_column_indices and \p d_values
+    ColumnIndicesIteratorT          wd_column_indices;    ///< Wrapped Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix A.  (Indices are zero-valued.)
+    VectorValueIteratorT            wd_vector_x;          ///< Wrapped Pointer to the array of \p num_cols values corresponding to the dense input vector x
+    VectorValueIteratorT            wd_vector_y;          ///< Wrapped pointer to the array of \p num_rows values corresponding to the dense output vector y
+
+
+    //---------------------------------------------------------------------
+    // Interface
+    //---------------------------------------------------------------------
+
+    /**
+     * Constructor
+     */
+    __device__ __forceinline__ AgentSpmv(
+        TempStorage&                    temp_storage,           ///< Reference to temp_storage
+        SpmvParams<ValueT, OffsetT>&    spmv_params)            ///< SpMV input parameter bundle
+    :
+        temp_storage(temp_storage.Alias()),
+        spmv_params(spmv_params),
+        wd_values(spmv_params.d_values),
+        wd_row_end_offsets(spmv_params.d_row_end_offsets),
+        wd_column_indices(spmv_params.d_column_indices),
+        wd_vector_x(spmv_params.d_vector_x),
+        wd_vector_y(spmv_params.d_vector_y)
+    {}
+
+
+
+
+    /**
+     * Consume a merge tile, specialized for direct-load of nonzeros
+     */
+    __device__ __forceinline__ KeyValuePairT ConsumeTile(
+        int             tile_idx,
+        CoordinateT     tile_start_coord,
+        CoordinateT     tile_end_coord,
+        Int2Type<true>  is_direct_load)     ///< Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch
+    {
+        int         tile_num_rows           = tile_end_coord.x - tile_start_coord.x;
+        int         tile_num_nonzeros       = tile_end_coord.y - tile_start_coord.y;
+        OffsetT*    s_tile_row_end_offsets  = &temp_storage.aliasable.merge_items[0].row_end_offset;
+
+        // Gather the row end-offsets for the merge tile into shared memory
+        for (int item = threadIdx.x; item <= tile_num_rows; item += BLOCK_THREADS)
+        {
+            s_tile_row_end_offsets[item] = wd_row_end_offsets[tile_start_coord.x + item];
+        }
+
+        CTA_SYNC();
+
+        // Search for the thread's starting coordinate within the merge tile
+        CountingInputIterator<OffsetT>  tile_nonzero_indices(tile_start_coord.y);
+        CoordinateT                     thread_start_coord;
+
+        MergePathSearch(
+            OffsetT(threadIdx.x * ITEMS_PER_THREAD),    // Diagonal
+            s_tile_row_end_offsets,                     // List A
+            tile_nonzero_indices,                       // List B
+            tile_num_rows,
+            tile_num_nonzeros,
+            thread_start_coord);
+
+        CTA_SYNC();            // Perf-sync
+
+        // Compute the thread's merge path segment
+        CoordinateT     thread_current_coord = thread_start_coord;
+        KeyValuePairT   scan_segment[ITEMS_PER_THREAD];
+
+        ValueT          running_total = 0.0;
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            OffsetT nonzero_idx         = CUB_MIN(tile_nonzero_indices[thread_current_coord.y], spmv_params.num_nonzeros - 1);
+            OffsetT column_idx          = wd_column_indices[nonzero_idx];
+            ValueT  value               = wd_values[nonzero_idx];
+
+            ValueT  vector_value        = spmv_params.t_vector_x[column_idx];
+#if (CUB_PTX_ARCH >= 350)
+            vector_value                = wd_vector_x[column_idx];
+#endif
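+            // Note (illustrative): on sm_35+ the wrapped-iterator load above
+            // supersedes the texture fetch, which becomes dead and can be
+            // optimized away; the texture path remains for older devices
+            // without __ldg-style loads.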
+            ValueT  nonzero             = value * vector_value;
+
+            OffsetT row_end_offset      = s_tile_row_end_offsets[thread_current_coord.x];
+
+            if (tile_nonzero_indices[thread_current_coord.y] < row_end_offset)
+            {
+                // Move down (accumulate)
+                running_total += nonzero;
+                scan_segment[ITEM].value    = running_total;
+                scan_segment[ITEM].key      = tile_num_rows;
+                ++thread_current_coord.y;
+            }
+            else
+            {
+                // Move right (reset)
+                scan_segment[ITEM].value    = running_total;
+                scan_segment[ITEM].key      = thread_current_coord.x;
+                running_total               = 0.0;
+                ++thread_current_coord.x;
+            }
+        }
+
+        CTA_SYNC();
+
+        // Block-wide reduce-value-by-segment
+        KeyValuePairT       tile_carry;
+        ReduceBySegmentOpT  scan_op;
+        KeyValuePairT       scan_item;
+
+        scan_item.value = running_total;
+        scan_item.key   = thread_current_coord.x;
+
+        BlockScanT(temp_storage.aliasable.scan).ExclusiveScan(scan_item, scan_item, scan_op, tile_carry);
+
+        if (tile_num_rows > 0)
+        {
+            if (threadIdx.x == 0)
+                scan_item.key = -1;
+
+            // Direct scatter
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+            {
+                if (scan_segment[ITEM].key < tile_num_rows)
+                {
+                    if (scan_item.key == scan_segment[ITEM].key)
+                        scan_segment[ITEM].value = scan_item.value + scan_segment[ITEM].value;
+
+                    if (HAS_ALPHA)
+                    {
+                        scan_segment[ITEM].value *= spmv_params.alpha;
+                    }
+
+                    if (HAS_BETA)
+                    {
+                        // Update the output vector element
+                        ValueT addend = spmv_params.beta * wd_vector_y[tile_start_coord.x + scan_segment[ITEM].key];
+                        scan_segment[ITEM].value += addend;
+                    }
+
+                    // Set the output vector element
+                    spmv_params.d_vector_y[tile_start_coord.x + scan_segment[ITEM].key] = scan_segment[ITEM].value;
+                }
+            }
+        }
+
+        // Return the tile's running carry-out
+        return tile_carry;
+    }
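+
+    // Worked example (illustrative): suppose the tile covers rows with end
+    // offsets {2, 3} and nonzeros 0..2. A thread whose diagonal starts at
+    // (row 0, nonzero 0) moves down twice, accumulating nonzeros 0 and 1
+    // (their indices are < 2) into running_total, then moves right at the
+    // row boundary, recording (key = row 0, value = total) for the scatter
+    // step and resetting the total before row 1 begins.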
+
+
+
+    /**
+     * Consume a merge tile, specialized for indirect load of nonzeros
+     */
+    __device__ __forceinline__ KeyValuePairT ConsumeTile(
+        int             tile_idx,
+        CoordinateT     tile_start_coord,
+        CoordinateT     tile_end_coord,
+        Int2Type<false> is_direct_load)     ///< Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch
+    {
+        int         tile_num_rows           = tile_end_coord.x - tile_start_coord.x;
+        int         tile_num_nonzeros       = tile_end_coord.y - tile_start_coord.y;
+
+#if (CUB_PTX_ARCH >= 520)
+
+        OffsetT*    s_tile_row_end_offsets  = &temp_storage.aliasable.merge_items[0].row_end_offset;
+        ValueT*     s_tile_nonzeros         = &temp_storage.aliasable.merge_items[tile_num_rows + ITEMS_PER_THREAD].nonzero;
+
+        // Gather the nonzeros for the merge tile into shared memory
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            int nonzero_idx = threadIdx.x + (ITEM * BLOCK_THREADS);
+
+            ValueIteratorT a                = wd_values + tile_start_coord.y + nonzero_idx;
+            ColumnIndicesIteratorT ci       = wd_column_indices + tile_start_coord.y + nonzero_idx;
+            ValueT* s                       = s_tile_nonzeros + nonzero_idx;
+
+            if (nonzero_idx < tile_num_nonzeros)
+            {
+
+                OffsetT column_idx              = *ci;
+                ValueT  value                   = *a;
+
+                ValueT  vector_value            = spmv_params.t_vector_x[column_idx];
+                vector_value                    = wd_vector_x[column_idx];
+
+                ValueT  nonzero                 = value * vector_value;
+
+                *s    = nonzero;
+            }
+        }
+
+
+#else
+
+        OffsetT*    s_tile_row_end_offsets  = &temp_storage.aliasable.merge_items[0].row_end_offset;
+        ValueT*     s_tile_nonzeros         = &temp_storage.aliasable.merge_items[tile_num_rows + ITEMS_PER_THREAD].nonzero;
+
+        // Gather the nonzeros for the merge tile into shared memory
+        if (tile_num_nonzeros > 0)
+        {
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+            {
+                int     nonzero_idx             = threadIdx.x + (ITEM * BLOCK_THREADS);
+                nonzero_idx                     = CUB_MIN(nonzero_idx, tile_num_nonzeros - 1);
+
+                OffsetT column_idx              = wd_column_indices[tile_start_coord.y + nonzero_idx];
+                ValueT  value                   = wd_values[tile_start_coord.y + nonzero_idx];
+
+                ValueT  vector_value            = spmv_params.t_vector_x[column_idx];
+#if (CUB_PTX_ARCH >= 350)
+                vector_value                    = wd_vector_x[column_idx];
+#endif
+                ValueT  nonzero                 = value * vector_value;
+
+                s_tile_nonzeros[nonzero_idx]    = nonzero;
+            }
+        }
+
+#endif
+
+        // Gather the row end-offsets for the merge tile into shared memory
+        #pragma unroll 1
+        for (int item = threadIdx.x; item <= tile_num_rows; item += BLOCK_THREADS)
+        {
+            s_tile_row_end_offsets[item] = wd_row_end_offsets[tile_start_coord.x + item];
+        }
+
+        CTA_SYNC();
+
+        // Search for the thread's starting coordinate within the merge tile
+        CountingInputIterator<OffsetT>  tile_nonzero_indices(tile_start_coord.y);
+        CoordinateT                     thread_start_coord;
+
+        MergePathSearch(
+            OffsetT(threadIdx.x * ITEMS_PER_THREAD),    // Diagonal
+            s_tile_row_end_offsets,                     // List A
+            tile_nonzero_indices,                       // List B
+            tile_num_rows,
+            tile_num_nonzeros,
+            thread_start_coord);
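+        // Each thread finds where its diagonal crosses the merge path of row
+        // end-offsets and nonzero indices; from there it consumes exactly
+        // ITEMS_PER_THREAD merge steps.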
+
+        CTA_SYNC();            // Perf-sync
+
+        // Compute the thread's merge path segment
+        CoordinateT     thread_current_coord = thread_start_coord;
+        KeyValuePairT   scan_segment[ITEMS_PER_THREAD];
+        ValueT          running_total = 0.0;
+
+        OffsetT row_end_offset  = s_tile_row_end_offsets[thread_current_coord.x];
+        ValueT  nonzero         = s_tile_nonzeros[thread_current_coord.y];
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+            if (tile_nonzero_indices[thread_current_coord.y] < row_end_offset)
+            {
+                // Move down (accumulate)
+                scan_segment[ITEM].value    = nonzero;
+                running_total               += nonzero;
+                ++thread_current_coord.y;
+                nonzero                     = s_tile_nonzeros[thread_current_coord.y];
+            }
+            else
+            {
+                // Move right (reset)
+                scan_segment[ITEM].value    = 0.0;
+                running_total               = 0.0;
+                ++thread_current_coord.x;
+                row_end_offset              = s_tile_row_end_offsets[thread_current_coord.x];
+            }
+
+            scan_segment[ITEM].key = thread_current_coord.x;
+        }
+
+        CTA_SYNC();
+
+        // Block-wide reduce-value-by-segment
+        KeyValuePairT       tile_carry;
+        ReduceBySegmentOpT  scan_op;
+        KeyValuePairT       scan_item;
+
+        scan_item.value = running_total;
+        scan_item.key = thread_current_coord.x;
+
+        BlockScanT(temp_storage.aliasable.scan).ExclusiveScan(scan_item, scan_item, scan_op, tile_carry);
+
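+        // Thread0's exclusive output from the scan is undefined, so seed it
+        // with an empty partial at this thread's starting row.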
+        if (threadIdx.x == 0)
+        {
+            scan_item.key = thread_start_coord.x;
+            scan_item.value = 0.0;
+        }
+
+        if (tile_num_rows > 0)
+        {
+
+            CTA_SYNC();
+
+            // Scan downsweep and scatter
+            ValueT* s_partials = &temp_storage.aliasable.merge_items[0].nonzero;
+
+            if (scan_item.key != scan_segment[0].key)
+            {
+                s_partials[scan_item.key] = scan_item.value;
+            }
+            else
+            {
+                scan_segment[0].value += scan_item.value;
+            }
+
+            #pragma unroll
+            for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM)
+            {
+                if (scan_segment[ITEM - 1].key != scan_segment[ITEM].key)
+                {
+                    s_partials[scan_segment[ITEM - 1].key] = scan_segment[ITEM - 1].value;
+                }
+                else
+                {
+                    scan_segment[ITEM].value += scan_segment[ITEM - 1].value;
+                }
+            }
+
+            CTA_SYNC();
+
+            #pragma unroll 1
+            for (int item = threadIdx.x; item < tile_num_rows; item += BLOCK_THREADS)
+            {
+                spmv_params.d_vector_y[tile_start_coord.x + item] = s_partials[item];
+            }
+        }
+
+        // Return the tile's running carry-out
+        return tile_carry;
+    }
+
+
+    /**
+     * Consume input tile
+     */
+    __device__ __forceinline__ void ConsumeTile(
+        CoordinateT*    d_tile_coordinates,     ///< [in] Pointer to the temporary array of tile starting coordinates
+        KeyValuePairT*  d_tile_carry_pairs,     ///< [out] Pointer to the temporary array carry-out dot product row-ids, one per block
+        int             num_merge_tiles)        ///< [in] Number of merge tiles
+    {
+        int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y;    // Current tile index
+
+        if (tile_idx >= num_merge_tiles)
+            return;
+
+        // Read our starting coordinates
+        if (threadIdx.x < 2)
+        {
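+            // Thread0 obtains this tile's starting coordinate; thread1 obtains
+            // the next tile's start, which doubles as this tile's end.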
+            if (d_tile_coordinates == NULL)
+            {
+                // Search our starting coordinates
+                OffsetT                         diagonal = (tile_idx + threadIdx.x) * TILE_ITEMS;
+                CoordinateT                     tile_coord;
+                CountingInputIterator<OffsetT>  nonzero_indices(0);
+
+                // Search the merge path
+                MergePathSearch(
+                    diagonal,
+                    RowOffsetsSearchIteratorT(spmv_params.d_row_end_offsets),
+                    nonzero_indices,
+                    spmv_params.num_rows,
+                    spmv_params.num_nonzeros,
+                    tile_coord);
+
+                temp_storage.tile_coords[threadIdx.x] = tile_coord;
+            }
+            else
+            {
+                temp_storage.tile_coords[threadIdx.x] = d_tile_coordinates[tile_idx + threadIdx.x];
+            }
+        }
+
+        CTA_SYNC();
+
+        CoordinateT tile_start_coord     = temp_storage.tile_coords[0];
+        CoordinateT tile_end_coord       = temp_storage.tile_coords[1];
+
+        // Consume multi-segment tile
+        KeyValuePairT tile_carry = ConsumeTile(
+            tile_idx,
+            tile_start_coord,
+            tile_end_coord,
+            Int2Type<AgentSpmvPolicyT::DIRECT_LOAD_NONZEROS>());
+
+        // Output the tile's carry-out
+        if (threadIdx.x == 0)
+        {
+            if (HAS_ALPHA)
+                tile_carry.value *= spmv_params.alpha;
+
+            tile_carry.key += tile_start_coord.x;
+            d_tile_carry_pairs[tile_idx]    = tile_carry;
+        }
+    }
+
+
+};
+
+
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/SRC/cub/block_range/block_scan_prefix_operators.cuh b/SRC/cub/agent/single_pass_scan_operators.cuh
similarity index 57%
rename from SRC/cub/block_range/block_scan_prefix_operators.cuh
rename to SRC/cub/agent/single_pass_scan_operators.cuh
index ba72cc2e..53409bde 100644
--- a/SRC/cub/block_range/block_scan_prefix_operators.cuh
+++ b/SRC/cub/agent/single_pass_scan_operators.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -50,7 +50,8 @@ namespace cub {
 
 
 /******************************************************************************
- * Prefix functor type for maintaining a running prefix while scanning a region
+ * Prefix functor type for maintaining a running prefix while scanning a
+ * region independent of other thread blocks
  ******************************************************************************/
 
 /**
@@ -60,14 +61,14 @@ namespace cub {
  */
 template <
     typename T,                 ///< BlockScan value type
-    typename ScanOp>            ///< Wrapped scan operator type
+    typename ScanOpT>            ///< Wrapped scan operator type
 struct BlockScanRunningPrefixOp
 {
-    ScanOp  op;                 ///< Wrapped scan operator
-    T       running_total;      ///< Running block-wide prefix
+    ScanOpT     op;                 ///< Wrapped scan operator
+    T           running_total;      ///< Running block-wide prefix
 
     /// Constructor
-    __device__ __forceinline__ BlockScanRunningPrefixOp(ScanOp op)
+    __device__ __forceinline__ BlockScanRunningPrefixOp(ScanOpT op)
     :
         op(op)
     {}
@@ -75,7 +76,7 @@ struct BlockScanRunningPrefixOp
     /// Constructor
     __device__ __forceinline__ BlockScanRunningPrefixOp(
         T starting_prefix,
-        ScanOp op)
+        ScanOpT op)
     :
         op(op),
         running_total(starting_prefix)
@@ -95,17 +96,16 @@ struct BlockScanRunningPrefixOp
 
 
 /******************************************************************************
- * Bookkeeping and prefix functor types for single-pass device-wide scan with dynamic lookback
+ * Generic tile status interface types for block-cooperative scans
  ******************************************************************************/
 
-
 /**
  * Enumerations of tile status
  */
 enum ScanTileStatus
 {
     SCAN_TILE_OOB,          // Out-of-bounds (e.g., padding)
-    SCAN_TILE_INVALID,      // Not yet processed
+    SCAN_TILE_INVALID = 99, // Not yet processed
     SCAN_TILE_PARTIAL,      // Tile aggregate is available
     SCAN_TILE_INCLUSIVE,    // Inclusive tile prefix is available
 };
@@ -164,25 +164,24 @@ struct ScanTileState
 
 
     // Device storage
-    TileDescriptor *d_tile_status;
-
+    TxnWord *d_tile_descriptors;
 
     /// Constructor
     __host__ __device__ __forceinline__
     ScanTileState()
     :
-        d_tile_status(NULL)
+        d_tile_descriptors(NULL)
     {}
 
 
     /// Initializer
     __host__ __device__ __forceinline__
     cudaError_t Init(
-        int     num_tiles,                          ///< [in] Number of tiles
-        void    *d_temp_storage,                    ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t  temp_storage_bytes)                 ///< [in] Size in bytes of \t d_temp_storage allocation
+        int     /*num_tiles*/,                      ///< [in] Number of tiles
+        void    *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t  /*temp_storage_bytes*/)             ///< [in] Size in bytes of \t d_temp_storage allocation
     {
-        d_tile_status = reinterpret_cast<TileDescriptor*>(d_temp_storage);
+        d_tile_descriptors = reinterpret_cast<TxnWord*>(d_temp_storage);
         return cudaSuccess;
     }
 
@@ -206,16 +205,22 @@ struct ScanTileState
     __device__ __forceinline__ void InitializeStatus(int num_tiles)
     {
         int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
+
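+        // Compose the descriptor in a register-sized transaction word so the
+        // stores below are single coherent writes.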
+        TxnWord val = TxnWord();
+        TileDescriptor *descriptor = reinterpret_cast<TileDescriptor*>(&val);
+
         if (tile_idx < num_tiles)
         {
             // Not-yet-set
-            d_tile_status[TILE_STATUS_PADDING + tile_idx].status = StatusWord(SCAN_TILE_INVALID);
+            descriptor->status = StatusWord(SCAN_TILE_INVALID);
+            d_tile_descriptors[TILE_STATUS_PADDING + tile_idx] = val;
         }
 
         if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING))
         {
             // Padding
-            d_tile_status[threadIdx.x].status = StatusWord(SCAN_TILE_OOB);
+            descriptor->status = StatusWord(SCAN_TILE_OOB);
+            d_tile_descriptors[threadIdx.x] = val;
         }
     }
 
@@ -231,7 +236,7 @@ struct ScanTileState
 
         TxnWord alias;
         *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
-        ThreadStore<STORE_CG>(reinterpret_cast<TxnWord*>(d_tile_status + TILE_STATUS_PADDING + tile_idx), alias);
+        ThreadStore<STORE_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
     }
 
 
@@ -246,7 +251,7 @@ struct ScanTileState
 
         TxnWord alias;
         *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
-        ThreadStore<STORE_CG>(reinterpret_cast<TxnWord*>(d_tile_status + TILE_STATUS_PADDING + tile_idx), alias);
+        ThreadStore<STORE_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
     }
 
     /**
@@ -257,15 +262,14 @@ struct ScanTileState
         StatusWord      &status,
         T               &value)
     {
-        // Use warp-any to determine when all threads have valid status
-        TxnWord alias = ThreadLoad<LOAD_CG>(reinterpret_cast<TxnWord*>(d_tile_status + TILE_STATUS_PADDING + tile_idx));
-        TileDescriptor tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);
-
-        while ((tile_descriptor.status == SCAN_TILE_INVALID))
+        TileDescriptor tile_descriptor;
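+        // Poll until every lane in the warp sees a valid (non-INVALID) status
+        // for its predecessor tile; the fence keeps the load inside the loop.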
+        do
         {
-            alias = ThreadLoad<LOAD_CG>(reinterpret_cast<TxnWord*>(d_tile_status + TILE_STATUS_PADDING + tile_idx));
+            __threadfence_block(); // prevent hoisting loads from loop
+            TxnWord alias = ThreadLoad<LOAD_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx);
             tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);
-        }
+
+        } while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff));
 
         status = tile_descriptor.status;
         value = tile_descriptor.value;
@@ -310,7 +314,7 @@ struct ScanTileState
     __host__ __device__ __forceinline__
     cudaError_t Init(
         int     num_tiles,                          ///< [in] Number of tiles
-        void    *d_temp_storage,                    ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        void    *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
         size_t  temp_storage_bytes)                 ///< [in] Size in bytes of \t d_temp_storage allocation
     {
         cudaError_t error = cudaSuccess;
@@ -416,23 +420,252 @@ struct ScanTileState
         StatusWord      &status,
         T               &value)
     {
-        status = ThreadLoad<LOAD_CG>(d_tile_status + TILE_STATUS_PADDING + tile_idx);
-        while (status == SCAN_TILE_INVALID)
-        {
+        do {
             status = ThreadLoad<LOAD_CG>(d_tile_status + TILE_STATUS_PADDING + tile_idx);
+
+            __threadfence();    // prevent hoisting loads from loop or loads below above this one
+
+        } while (status == SCAN_TILE_INVALID);
+
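+        // SCAN_TILE_PARTIAL exposes only the tile's local aggregate; otherwise
+        // the predecessor's full inclusive prefix is already available.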
+        if (status == StatusWord(SCAN_TILE_PARTIAL))
+            value = ThreadLoad<LOAD_CG>(d_tile_partial + TILE_STATUS_PADDING + tile_idx);
+        else
+            value = ThreadLoad<LOAD_CG>(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx);
+    }
+};
+
+
+/******************************************************************************
+ * ReduceByKey tile status interface types for block-cooperative scans
+ ******************************************************************************/
+
+/**
+ * Tile status interface for reduction by key.
+ *
+ */
+template <
+    typename    ValueT,
+    typename    KeyT,
+    bool        SINGLE_WORD = (Traits<ValueT>::PRIMITIVE) && (sizeof(ValueT) + sizeof(KeyT) < 16)>
+struct ReduceByKeyScanTileState;
+
+
+/**
+ * Tile status interface for reduction by key, specialized for scan status and value types that
+ * cannot be combined into one machine word.
+ */
+template <
+    typename    ValueT,
+    typename    KeyT>
+struct ReduceByKeyScanTileState<ValueT, KeyT, false> :
+    ScanTileState<KeyValuePair<KeyT, ValueT> >
+{
+    typedef ScanTileState<KeyValuePair<KeyT, ValueT> > SuperClass;
+
+    /// Constructor
+    __host__ __device__ __forceinline__
+    ReduceByKeyScanTileState() : SuperClass() {}
+};
+
+
+/**
+ * Tile status interface for reduction by key, specialized for scan status and value types that
+ * can be combined into one machine word that can be read/written coherently in a single access.
+ */
+template <
+    typename ValueT,
+    typename KeyT>
+struct ReduceByKeyScanTileState<ValueT, KeyT, true>
+{
+    typedef KeyValuePair<KeyT, ValueT> KeyValuePairT;
+
+    // Constants
+    enum
+    {
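+        // Round the {key, value} payload up to a power-of-two transaction word;
+        // the leftover bytes carry the status so all three move in one access.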
+        PAIR_SIZE           = sizeof(ValueT) + sizeof(KeyT),
+        TXN_WORD_SIZE       = 1 << Log2<PAIR_SIZE + 1>::VALUE,
+        STATUS_WORD_SIZE    = TXN_WORD_SIZE - PAIR_SIZE,
+
+        TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS,
+    };
+
+    // Status word type
+    typedef typename If<(STATUS_WORD_SIZE == 8),
+        long long,
+        typename If<(STATUS_WORD_SIZE == 4),
+            int,
+            typename If<(STATUS_WORD_SIZE == 2),
+                short,
+                char>::Type>::Type>::Type StatusWord;
+
+    // Transaction word type
+    typedef typename If<(TXN_WORD_SIZE == 16),
+        longlong2,
+        typename If<(TXN_WORD_SIZE == 8),
+            long long,
+            int>::Type>::Type TxnWord;
+
+    // Device word type (for when sizeof(ValueT) == sizeof(KeyT))
+    struct TileDescriptorBigStatus
+    {
+        KeyT        key;
+        ValueT      value;
+        StatusWord  status;
+    };
+
+    // Device word type (for when sizeof(ValueT) != sizeof(KeyT))
+    struct TileDescriptorLittleStatus
+    {
+        ValueT      value;
+        StatusWord  status;
+        KeyT        key;
+    };
+
+    // Device word type
+    typedef typename If<
+            (sizeof(ValueT) == sizeof(KeyT)),
+            TileDescriptorBigStatus,
+            TileDescriptorLittleStatus>::Type
+        TileDescriptor;
+
+
+    // Device storage
+    TxnWord *d_tile_descriptors;
+
+
+    /// Constructor
+    __host__ __device__ __forceinline__
+    ReduceByKeyScanTileState()
+    :
+        d_tile_descriptors(NULL)
+    {}
+
+
+    /// Initializer
+    __host__ __device__ __forceinline__
+    cudaError_t Init(
+        int     /*num_tiles*/,                      ///< [in] Number of tiles
+        void    *d_temp_storage,                    ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t  /*temp_storage_bytes*/)             ///< [in] Size in bytes of \t d_temp_storage allocation
+    {
+        d_tile_descriptors = reinterpret_cast<TxnWord*>(d_temp_storage);
+        return cudaSuccess;
+    }
+
+
+    /**
+     * Compute device memory needed for tile status
+     */
+    __host__ __device__ __forceinline__
+    static cudaError_t AllocationSize(
+        int     num_tiles,                          ///< [in] Number of tiles
+        size_t  &temp_storage_bytes)                ///< [out] Size in bytes of \t d_temp_storage allocation
+    {
+        temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TileDescriptor);       // bytes needed for tile status descriptors
+        return cudaSuccess;
+    }
+
+
+    /**
+     * Initialize (from device)
+     */
+    __device__ __forceinline__ void InitializeStatus(int num_tiles)
+    {
+        int             tile_idx    = (blockIdx.x * blockDim.x) + threadIdx.x;
+        TxnWord         val         = TxnWord();
+        TileDescriptor  *descriptor = reinterpret_cast<TileDescriptor*>(&val);
+
+        if (tile_idx < num_tiles)
+        {
+            // Not-yet-set
+            descriptor->status = StatusWord(SCAN_TILE_INVALID);
+            d_tile_descriptors[TILE_STATUS_PADDING + tile_idx] = val;
         }
 
-        T partial = ThreadLoad<LOAD_CG>(d_tile_partial + TILE_STATUS_PADDING + tile_idx);
-        T inclusive = ThreadLoad<LOAD_CG>(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx);
+        if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING))
+        {
+            // Padding
+            descriptor->status = StatusWord(SCAN_TILE_OOB);
+            d_tile_descriptors[threadIdx.x] = val;
+        }
+    }
+
+
+    /**
+     * Update the specified tile's inclusive value and corresponding status
+     */
+    __device__ __forceinline__ void SetInclusive(int tile_idx, KeyValuePairT tile_inclusive)
+    {
+        TileDescriptor tile_descriptor;
+        tile_descriptor.status  = SCAN_TILE_INCLUSIVE;
+        tile_descriptor.value   = tile_inclusive.value;
+        tile_descriptor.key     = tile_inclusive.key;
+
+        TxnWord alias;
+        *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
+        ThreadStore<STORE_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
+    }
 
-        value = (status == StatusWord(SCAN_TILE_PARTIAL)) ?
-            partial :
-            inclusive;
 
+    /**
+     * Update the specified tile's partial value and corresponding status
+     */
+    __device__ __forceinline__ void SetPartial(int tile_idx, KeyValuePairT tile_partial)
+    {
+        TileDescriptor tile_descriptor;
+        tile_descriptor.status  = SCAN_TILE_PARTIAL;
+        tile_descriptor.value   = tile_partial.value;
+        tile_descriptor.key     = tile_partial.key;
+
+        TxnWord alias;
+        *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
+        ThreadStore<STORE_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
     }
+
+    /**
+     * Wait for the corresponding tile to become non-invalid
+     */
+    __device__ __forceinline__ void WaitForValid(
+        int                     tile_idx,
+        StatusWord              &status,
+        KeyValuePairT           &value)
+    {
+        TileDescriptor tile_descriptor;
+        do
+        {
+            __threadfence_block(); // prevent hoisting loads from loop
+            TxnWord alias = ThreadLoad<LOAD_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx);
+            tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);
+
+        } while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff));
+
+        status      = tile_descriptor.status;
+        value.value = tile_descriptor.value;
+        value.key   = tile_descriptor.key;
+    }
+
 };
 
 
+/******************************************************************************
+ * Prefix call-back operator for coupling local block scan within a
+ * block-cooperative scan
+ ******************************************************************************/
 
 /**
  * Stateful block-scan prefix functor.  Provides the the running prefix for
@@ -440,58 +673,48 @@ struct ScanTileState
  * aggregates/prefixes from predecessor tiles to become available.
  */
 template <
-    typename T,
-    typename ScanOp,
-    typename ScanTileState>
-struct BlockScanLookbackPrefixOp
+    typename    T,
+    typename    ScanOpT,
+    typename    ScanTileStateT,
+    int         PTX_ARCH = CUB_PTX_ARCH>
+struct TilePrefixCallbackOp
 {
     // Parameterized warp reduce
-    typedef WarpReduce<T> WarpReduceT;
+    typedef WarpReduce<T, CUB_PTX_WARP_THREADS, PTX_ARCH> WarpReduceT;
 
     // Temporary storage type
-    typedef typename WarpReduceT::TempStorage _TempStorage;
+    struct _TempStorage
+    {
+        typename WarpReduceT::TempStorage   warp_reduce;
+        T                                   exclusive_prefix;
+        T                                   inclusive_prefix;
+        T                                   block_aggregate;
+    };
 
     // Alias wrapper allowing temporary storage to be unioned
     struct TempStorage : Uninitialized<_TempStorage> {};
 
     // Type of status word
-    typedef typename ScanTileState::StatusWord StatusWord;
-
-    // Scan operator for switching the scan arguments
-    struct SwizzleScanOp
-    {
-        ScanOp scan_op;
-
-        // Constructor
-        __host__ __device__ __forceinline__
-        SwizzleScanOp(ScanOp scan_op) : scan_op(scan_op) {}
-
-        // Switch the scan arguments
-        __host__ __device__ __forceinline__
-        T operator()(const T &a, const T &b)
-        {
-            return scan_op(b, a);
-        }
-    };
+    typedef typename ScanTileStateT::StatusWord StatusWord;
 
     // Fields
-    ScanTileState               &tile_status;       ///< Interface to tile status
-    _TempStorage                &temp_storage;      ///< Reference to a warp-reduction instance
-    ScanOp                      scan_op;            ///< Binary scan operator
+    _TempStorage&               temp_storage;       ///< Reference to a warp-reduction instance
+    ScanTileStateT&             tile_status;        ///< Interface to tile status
+    ScanOpT                     scan_op;            ///< Binary scan operator
     int                         tile_idx;           ///< The current tile index
     T                           exclusive_prefix;   ///< Exclusive prefix for the tile
     T                           inclusive_prefix;   ///< Inclusive prefix for the tile
 
     // Constructor
     __device__ __forceinline__
-    BlockScanLookbackPrefixOp(
-        ScanTileState      &tile_status,
-        TempStorage             &temp_storage,
-        ScanOp                  scan_op,
-        int                     tile_idx)
+    TilePrefixCallbackOp(
+        ScanTileStateT       &tile_status,
+        TempStorage         &temp_storage,
+        ScanOpT              scan_op,
+        int                 tile_idx)
     :
-        tile_status(tile_status),
         temp_storage(temp_storage.Alias()),
+        tile_status(tile_status),
         scan_op(scan_op),
         tile_idx(tile_idx) {}
 
@@ -510,11 +733,10 @@ struct BlockScanLookbackPrefixOp
         // Use the swizzled scan operator because we are now scanning *down* towards thread0.
 
         int tail_flag = (predecessor_status == StatusWord(SCAN_TILE_INCLUSIVE));
-
-        window_aggregate = WarpReduceT(temp_storage).TailSegmentedReduce(
+        window_aggregate = WarpReduceT(temp_storage.warp_reduce).TailSegmentedReduce(
             value,
             tail_flag,
-            SwizzleScanOp(scan_op));
+            SwizzleScanOp<ScanOpT>(scan_op));
     }
 
 
@@ -522,9 +744,11 @@ struct BlockScanLookbackPrefixOp
     __device__ __forceinline__
     T operator()(T block_aggregate)
     {
+
         // Update our status with our tile-aggregate
         if (threadIdx.x == 0)
         {
+            temp_storage.block_aggregate = block_aggregate;
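+            // Stash the aggregate so it can be read back later via GetBlockAggregate().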
             tile_status.SetPartial(tile_idx, block_aggregate);
         }
 
@@ -539,7 +763,7 @@ struct BlockScanLookbackPrefixOp
         exclusive_prefix = window_aggregate;
 
         // Keep sliding the window back until we come across a tile whose inclusive prefix is known
-        while (WarpAll(predecessor_status != StatusWord(SCAN_TILE_INCLUSIVE)))
+        while (WARP_ALL((predecessor_status != StatusWord(SCAN_TILE_INCLUSIVE)), 0xffffffff))
         {
             predecessor_idx -= CUB_PTX_WARP_THREADS;
 
@@ -553,11 +777,36 @@ struct BlockScanLookbackPrefixOp
         {
             inclusive_prefix = scan_op(exclusive_prefix, block_aggregate);
             tile_status.SetInclusive(tile_idx, inclusive_prefix);
+
+            temp_storage.exclusive_prefix = exclusive_prefix;
+            temp_storage.inclusive_prefix = inclusive_prefix;
         }
 
         // Return exclusive_prefix
         return exclusive_prefix;
     }
+
+    // Get the exclusive prefix stored in temporary storage
+    __device__ __forceinline__
+    T GetExclusivePrefix()
+    {
+        return temp_storage.exclusive_prefix;
+    }
+
+    // Get the inclusive prefix stored in temporary storage
+    __device__ __forceinline__
+    T GetInclusivePrefix()
+    {
+        return temp_storage.inclusive_prefix;
+    }
+
+    // Get the block aggregate stored in temporary storage
+    __device__ __forceinline__
+    T GetBlockAggregate()
+    {
+        return temp_storage.block_aggregate;
+    }
+
 };
 
 
diff --git a/SRC/cub/block/block_adjacent_difference.cuh b/SRC/cub/block/block_adjacent_difference.cuh
new file mode 100644
index 00000000..acef9f05
--- /dev/null
+++ b/SRC/cub/block/block_adjacent_difference.cuh
@@ -0,0 +1,596 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::BlockAdjacentDifference class provides [collective](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "../util_type.cuh"
+#include "../util_ptx.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+template <
+    typename    T,
+    int         BLOCK_DIM_X,
+    int         BLOCK_DIM_Y     = 1,
+    int         BLOCK_DIM_Z     = 1,
+    int         PTX_ARCH        = CUB_PTX_ARCH>
+class BlockAdjacentDifference
+{
+private:
+
+    /******************************************************************************
+     * Constants and type definitions
+     ******************************************************************************/
+
+    /// Constants
+    enum
+    {
+        /// The thread block size in threads
+        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+    };
+
+
+    /// Shared memory storage layout type (last element from each thread's input)
+    struct _TempStorage
+    {
+        T first_items[BLOCK_THREADS];
+        T last_items[BLOCK_THREADS];
+    };
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+    /// Internal storage allocator
+    __device__ __forceinline__ _TempStorage& PrivateStorage()
+    {
+        __shared__ _TempStorage private_storage;
+        return private_storage;
+    }
+
+
+    /// Specialization for when FlagOp has third index param
+    template <typename FlagOp, bool HAS_PARAM = BinaryOpHasIdxParam<T, FlagOp>::HAS_PARAM>
+    struct ApplyOp
+    {
+        // Apply flag operator
+        static __device__ __forceinline__ T FlagT(FlagOp flag_op, const T &a, const T &b, int idx)
+        {
+            return flag_op(b, a, idx);
+        }
+    };
+
+    /// Specialization for when FlagOp does not have a third index param
+    template <typename FlagOp>
+    struct ApplyOp<FlagOp, false>
+    {
+        // Apply flag operator
+        static __device__ __forceinline__ T FlagT(FlagOp flag_op, const T &a, const T &b, int /*idx*/)
+        {
+            return flag_op(b, a);
+        }
+    };
+
+    /// Templated unrolling of item comparison (inductive case)
+    template <int ITERATION, int MAX_ITERATIONS>
+    struct Iterate
+    {
+        // Head flags
+        template <
+            int             ITEMS_PER_THREAD,
+            typename        FlagT,
+            typename        FlagOp>
+        static __device__ __forceinline__ void FlagHeads(
+            int                     linear_tid,
+            FlagT                   (&flags)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags
+            T                       (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+            T                       (&preds)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
+            FlagOp                  flag_op)                            ///< [in] Binary boolean flag predicate
+        {
+            preds[ITERATION] = input[ITERATION - 1];
+
+            flags[ITERATION] = ApplyOp<FlagOp>::FlagT(
+                flag_op,
+                preds[ITERATION],
+                input[ITERATION],
+                (linear_tid * ITEMS_PER_THREAD) + ITERATION);
+
+            Iterate<ITERATION + 1, MAX_ITERATIONS>::FlagHeads(linear_tid, flags, input, preds, flag_op);
+        }
+
+        // Tail flags
+        template <
+            int             ITEMS_PER_THREAD,
+            typename        FlagT,
+            typename        FlagOp>
+        static __device__ __forceinline__ void FlagTails(
+            int                     linear_tid,
+            FlagT                   (&flags)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags
+            T                       (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+            FlagOp                  flag_op)                            ///< [in] Binary boolean flag predicate
+        {
+            flags[ITERATION] = ApplyOp<FlagOp>::FlagT(
+                flag_op,
+                input[ITERATION],
+                input[ITERATION + 1],
+                (linear_tid * ITEMS_PER_THREAD) + ITERATION + 1);
+
+            Iterate<ITERATION + 1, MAX_ITERATIONS>::FlagTails(linear_tid, flags, input, flag_op);
+        }
+
+    };
+
+    /// Templated unrolling of item comparison (termination case)
+    template <int MAX_ITERATIONS>
+    struct Iterate<MAX_ITERATIONS, MAX_ITERATIONS>
+    {
+        // Head flags
+        template <
+            int             ITEMS_PER_THREAD,
+            typename        FlagT,
+            typename        FlagOp>
+        static __device__ __forceinline__ void FlagHeads(
+            int                     /*linear_tid*/,
+            FlagT                   (&/*flags*/)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags
+            T                       (&/*input*/)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+            T                       (&/*preds*/)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
+            FlagOp                  /*flag_op*/)                            ///< [in] Binary boolean flag predicate
+        {}
+
+        // Tail flags
+        template <
+            int             ITEMS_PER_THREAD,
+            typename        FlagT,
+            typename        FlagOp>
+        static __device__ __forceinline__ void FlagTails(
+            int                     /*linear_tid*/,
+            FlagT                   (&/*flags*/)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags
+            T                       (&/*input*/)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+            FlagOp                  /*flag_op*/)                            ///< [in] Binary boolean flag predicate
+        {}
+    };
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Shared storage reference
+    _TempStorage &temp_storage;
+
+    /// Linear thread-id
+    unsigned int linear_tid;
+
+
+public:
+
+    /// \smemstorage{BlockAdjacentDifference}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
+     */
+    __device__ __forceinline__ BlockAdjacentDifference()
+    :
+        temp_storage(PrivateStorage()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.
+     */
+    __device__ __forceinline__ BlockAdjacentDifference(
+        TempStorage &temp_storage)  ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Head flag operations
+     *********************************************************************/
+    //@{
+
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeads(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        T               (&preds)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
+        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
+    {
+        // Share last item
+        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        if (linear_tid == 0)
+        {
+            // Set flag for first thread-item (preds[0] is undefined)
+            head_flags[0] = 1;
+        }
+        else
+        {
+            preds[0] = temp_storage.last_items[linear_tid - 1];
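+            // Compare this thread's first item against the previous thread's
+            // last item, staged in shared memory above.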
+            head_flags[0] = ApplyOp<FlagOp>::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD);
+        }
+
+        // Set head_flags for remaining items
+        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
+    }
+
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeads(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        T               (&preds)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
+        FlagOp          flag_op,                            ///< [in] Binary boolean flag predicate
+        T               tile_predecessor_item)              ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0).
+    {
+        // Share last item
+        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        // Set flag for first thread-item
+        preds[0] = (linear_tid == 0) ?
+            tile_predecessor_item :              // First thread
+            temp_storage.last_items[linear_tid - 1];
+
+        head_flags[0] = ApplyOp<FlagOp>::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD);
+
+        // Set head_flags for remaining items
+        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
+    }
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeads(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
+    {
+        T preds[ITEMS_PER_THREAD];
+        FlagHeads(head_flags, input, preds, flag_op);
+    }
+
+
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeads(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op,                            ///< [in] Binary boolean flag predicate
+        T               tile_predecessor_item)              ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0).
+    {
+        T preds[ITEMS_PER_THREAD];
+        FlagHeads(head_flags, input, preds, flag_op, tile_predecessor_item);
+    }
+
+
+
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagTails(
+        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
+    {
+        // Share first item
+        temp_storage.first_items[linear_tid] = input[0];
+
+        CTA_SYNC();
+
+        // Set flag for last thread-item
+        tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
+            1 :                             // Last thread
+            ApplyOp<FlagOp>::FlagT(
+                flag_op,
+                input[ITEMS_PER_THREAD - 1],
+                temp_storage.first_items[linear_tid + 1],
+                (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
+
+        // Set tail_flags for remaining items
+        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
+    }
+
+
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagTails(
+        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op,                            ///< [in] Binary boolean flag predicate
+        T               tile_successor_item)                ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1).
+    {
+        // Share first item
+        temp_storage.first_items[linear_tid] = input[0];
+
+        CTA_SYNC();
+
+        // Set flag for last thread-item
+        T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
+            tile_successor_item :              // Last thread
+            temp_storage.first_items[linear_tid + 1];
+
+        tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
+            flag_op,
+            input[ITEMS_PER_THREAD - 1],
+            successor_item,
+            (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
+
+        // Set tail_flags for remaining items
+        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
+    }
+
+
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeadsAndTails(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
+    {
+        // Share first and last items
+        temp_storage.first_items[linear_tid] = input[0];
+        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        T preds[ITEMS_PER_THREAD];
+
+        // Set flag for first thread-item
+        preds[0] = temp_storage.last_items[linear_tid - 1];
+        if (linear_tid == 0)
+        {
+            head_flags[0] = 1;
+        }
+        else
+        {
+            head_flags[0] = ApplyOp<FlagOp>::FlagT(
+                flag_op,
+                preds[0],
+                input[0],
+                linear_tid * ITEMS_PER_THREAD);
+        }
+
+
+        // Set flag for last thread-item
+        tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
+            1 :                             // Last thread
+            ApplyOp<FlagOp>::FlagT(
+                flag_op,
+                input[ITEMS_PER_THREAD - 1],
+                temp_storage.first_items[linear_tid + 1],
+                (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
+
+        // Set head_flags for remaining items
+        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
+
+        // Set tail_flags for remaining items
+        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
+    }
+
+
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeadsAndTails(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
+        T               tile_successor_item,                ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1).
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
+    {
+        // Share first and last items
+        temp_storage.first_items[linear_tid] = input[0];
+        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        T preds[ITEMS_PER_THREAD];
+
+        // Set flag for first thread-item
+        if (linear_tid == 0)
+        {
+            head_flags[0] = 1;
+        }
+        else
+        {
+            preds[0] = temp_storage.last_items[linear_tid - 1];
+            head_flags[0] = ApplyOp<FlagOp>::FlagT(
+                flag_op,
+                preds[0],
+                input[0],
+                linear_tid * ITEMS_PER_THREAD);
+        }
+
+        // Set flag for last thread-item
+        T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
+            tile_successor_item :              // Last thread
+            temp_storage.first_items[linear_tid + 1];
+
+        tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
+            flag_op,
+            input[ITEMS_PER_THREAD - 1],
+            successor_item,
+            (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
+
+        // Set head_flags for remaining items
+        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
+
+        // Set tail_flags for remaining items
+        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
+    }
+
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeadsAndTails(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        T               tile_predecessor_item,              ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0).
+        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
+    {
+        // Share first and last items
+        temp_storage.first_items[linear_tid] = input[0];
+        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        T preds[ITEMS_PER_THREAD];
+
+        // Set flag for first thread-item
+        preds[0] = (linear_tid == 0) ?
+            tile_predecessor_item :              // First thread
+            temp_storage.last_items[linear_tid - 1];
+
+        head_flags[0] = ApplyOp<FlagOp>::FlagT(
+            flag_op,
+            preds[0],
+            input[0],
+            linear_tid * ITEMS_PER_THREAD);
+
+        // Set flag for last thread-item
+        tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
+            1 :                             // Last thread
+            ApplyOp<FlagOp>::FlagT(
+                flag_op,
+                input[ITEMS_PER_THREAD - 1],
+                temp_storage.first_items[linear_tid + 1],
+                (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
+
+        // Set head_flags for remaining items
+        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
+
+        // Set tail_flags for remaining items
+        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
+    }
+
+
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeadsAndTails(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        T               tile_predecessor_item,              ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0).
+        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
+        T               tile_successor_item,                ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1).
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
+    {
+        // Share first and last items
+        temp_storage.first_items[linear_tid] = input[0];
+        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        T preds[ITEMS_PER_THREAD];
+
+        // Set flag for first thread-item
+        preds[0] = (linear_tid == 0) ?
+            tile_predecessor_item :              // First thread
+            temp_storage.last_items[linear_tid - 1];
+
+        head_flags[0] = ApplyOp<FlagOp>::FlagT(
+            flag_op,
+            preds[0],
+            input[0],
+            linear_tid * ITEMS_PER_THREAD);
+
+        // Set flag for last thread-item
+        T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
+            tile_successor_item :              // Last thread
+            temp_storage.first_items[linear_tid + 1];
+
+        tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
+            flag_op,
+            input[ITEMS_PER_THREAD - 1],
+            successor_item,
+            (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
+
+        // Set head_flags for remaining items
+        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
+
+        // Set tail_flags for remaining items
+        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
+    }
+
+
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/SRC/cub/block/block_discontinuity.cuh b/SRC/cub/block/block_discontinuity.cuh
index 6b2f8c78..503e3e0b 100644
--- a/SRC/cub/block/block_discontinuity.cuh
+++ b/SRC/cub/block/block_discontinuity.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -122,7 +122,11 @@ private:
 
 
     /// Shared memory storage layout type (last element from each thread's input)
-    typedef T _TempStorage[BLOCK_THREADS];
+    struct _TempStorage
+    {
+        T first_items[BLOCK_THREADS];
+        T last_items[BLOCK_THREADS];
+    };
 
 
     /******************************************************************************
@@ -142,7 +146,7 @@ private:
     struct ApplyOp
     {
         // Apply flag operator
-        static __device__ __forceinline__ bool Flag(FlagOp flag_op, const T &a, const T &b, int idx)
+        static __device__ __forceinline__ bool FlagT(FlagOp flag_op, const T &a, const T &b, int idx)
         {
             return flag_op(a, b, idx);
         }
@@ -153,7 +157,7 @@ private:
     struct ApplyOp<FlagOp, false>
     {
         // Apply flag operator
-        static __device__ __forceinline__ bool Flag(FlagOp flag_op, const T &a, const T &b, int idx)
+        static __device__ __forceinline__ bool FlagT(FlagOp flag_op, const T &a, const T &b, int /*idx*/)
         {
             return flag_op(a, b);
         }
@@ -163,39 +167,78 @@ private:
     template <int ITERATION, int MAX_ITERATIONS>
     struct Iterate
     {
+        // Head flags
         template <
             int             ITEMS_PER_THREAD,
             typename        FlagT,
             typename        FlagOp>
-        static __device__ __forceinline__ void FlagItems(
+        static __device__ __forceinline__ void FlagHeads(
             int                     linear_tid,
             FlagT                   (&flags)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags
             T                       (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+            T                       (&preds)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
             FlagOp                  flag_op)                            ///< [in] Binary boolean flag predicate
         {
-            flags[ITERATION] = ApplyOp<FlagOp>::Flag(
+            preds[ITERATION] = input[ITERATION - 1];
+
+            flags[ITERATION] = ApplyOp<FlagOp>::FlagT(
                 flag_op,
-                input[ITERATION - 1],
+                preds[ITERATION],
                 input[ITERATION],
                 (linear_tid * ITEMS_PER_THREAD) + ITERATION);
 
-            Iterate<ITERATION + 1, MAX_ITERATIONS>::FlagItems(linear_tid, flags, input, flag_op);
+            Iterate<ITERATION + 1, MAX_ITERATIONS>::FlagHeads(linear_tid, flags, input, preds, flag_op);
         }
+
+        // Tail flags
+        template <
+            int             ITEMS_PER_THREAD,
+            typename        FlagT,
+            typename        FlagOp>
+        static __device__ __forceinline__ void FlagTails(
+            int                     linear_tid,
+            FlagT                   (&flags)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity tail_flags
+            T                       (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+            FlagOp                  flag_op)                            ///< [in] Binary boolean flag predicate
+        {
+            flags[ITERATION] = ApplyOp<FlagOp>::FlagT(
+                flag_op,
+                input[ITERATION],
+                input[ITERATION + 1],
+                (linear_tid * ITEMS_PER_THREAD) + ITERATION + 1);
+
+            Iterate<ITERATION + 1, MAX_ITERATIONS>::FlagTails(linear_tid, flags, input, flag_op);
+        }
+
     };
 
     /// Templated unrolling of item comparison (termination case)
     template <int MAX_ITERATIONS>
     struct Iterate<MAX_ITERATIONS, MAX_ITERATIONS>
     {
+        // Head flags
         template <
             int             ITEMS_PER_THREAD,
             typename        FlagT,
             typename        FlagOp>
-        static __device__ __forceinline__ void FlagItems(
-            int                     linear_tid,
-            FlagT                   (&flags)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags
-            T                       (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-            FlagOp                  flag_op)                            ///< [in] Binary boolean flag predicate
+        static __device__ __forceinline__ void FlagHeads(
+            int                     /*linear_tid*/,
+            FlagT                   (&/*flags*/)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags
+            T                       (&/*input*/)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+            T                       (&/*preds*/)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
+            FlagOp                  /*flag_op*/)                            ///< [in] Binary boolean flag predicate
+        {}
+
+        // Tail flags
+        template <
+            int             ITEMS_PER_THREAD,
+            typename        FlagT,
+            typename        FlagOp>
+        static __device__ __forceinline__ void FlagTails(
+            int                     /*linear_tid*/,
+            FlagT                   (&/*flags*/)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity tail_flags
+            T                       (&/*input*/)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+            FlagOp                  /*flag_op*/)                            ///< [in] Binary boolean flag predicate
         {}
     };
 
@@ -208,7 +251,7 @@ private:
     _TempStorage &temp_storage;
 
     /// Linear thread-id
-    int linear_tid;
+    unsigned int linear_tid;
 
 
 public:
@@ -250,6 +293,68 @@ public:
     //@{
 
 
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeads(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        T               (&preds)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
+        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
+    {
+        // Share last item
+        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        if (linear_tid == 0)
+        {
+            // Set flag for first thread-item (preds[0] is undefined)
+            head_flags[0] = 1;
+        }
+        else
+        {
+            preds[0] = temp_storage.last_items[linear_tid - 1];
+            head_flags[0] = ApplyOp<FlagOp>::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD);
+        }
+
+        // Set head_flags for remaining items
+        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
+    }
+
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeads(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        T               (&preds)[ITEMS_PER_THREAD],         ///< [out] Calling thread's predecessor items
+        FlagOp          flag_op,                            ///< [in] Binary boolean flag predicate
+        T               tile_predecessor_item)              ///< [in] [thread 0 only] Item with which to compare the first tile item (input[0] from thread 0).
+    {
+        // Share last item
+        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        // Set flag for first thread-item
+        preds[0] = (linear_tid == 0) ?
+            tile_predecessor_item :              // First thread
+            temp_storage.last_items[linear_tid - 1];
+
+        head_flags[0] = ApplyOp<FlagOp>::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD);
+
+        // Set head_flags for remaining items
+        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
+    }
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
     /**
      * \brief Sets head flags indicating discontinuities between items partitioned across the thread block, for which the first item has no reference and is always flagged.
      *
@@ -308,22 +413,8 @@ public:
         T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
         FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
     {
-        // Share last item
-        temp_storage[linear_tid] = input[ITEMS_PER_THREAD - 1];
-
-        __syncthreads();
-
-        // Set flag for first item
-        head_flags[0] = (linear_tid == 0) ?
-            1 :                                 // First thread
-            ApplyOp<FlagOp>::Flag(
-                flag_op,
-                temp_storage[linear_tid - 1],
-                input[0],
-                linear_tid * ITEMS_PER_THREAD);
-
-        // Set head_flags for remaining items
-        Iterate<1, ITEMS_PER_THREAD>::FlagItems(linear_tid, head_flags, input, flag_op);
+        T preds[ITEMS_PER_THREAD];
+        FlagHeads(head_flags, input, preds, flag_op);
     }
 
 
@@ -390,29 +481,14 @@ public:
         FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
         T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
         FlagOp          flag_op,                            ///< [in] Binary boolean flag predicate
-        T               tile_predecessor_item)                   ///< [in] [thread 0 only] Item with which to compare the first tile item (input[0] from thread 0).
+        T               tile_predecessor_item)              ///< [in] [thread 0 only] Item with which to compare the first tile item (input[0] from thread 0).
     {
-        // Share last item
-        temp_storage[linear_tid] = input[ITEMS_PER_THREAD - 1];
-
-        __syncthreads();
-
-        // Set flag for first item
-        T predecessor_item = (linear_tid == 0) ?
-            tile_predecessor_item :              // First thread
-            temp_storage[linear_tid - 1];
-
-        head_flags[0] = ApplyOp::Flag(
-            flag_op,
-            predecessor_item,
-            input[0],
-            linear_tid * ITEMS_PER_THREAD);
-
-        // Set head_flags for remaining items
-        Iterate<1, ITEMS_PER_THREAD>::FlagItems(linear_tid, head_flags, input, flag_op);
+        T preds[ITEMS_PER_THREAD];
+        FlagHeads(head_flags, input, preds, flag_op, tile_predecessor_item);
     }
 
 
+
     //@}  end member group
     /******************************************************************//**
      * \name Tail flag operations
@@ -480,21 +556,21 @@ public:
         FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
     {
         // Share first item
-        temp_storage[linear_tid] = input[0];
+        temp_storage.first_items[linear_tid] = input[0];
 
-        __syncthreads();
+        CTA_SYNC();
 
-        // Set flag for last item
+        // Set flag for last thread-item
         tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
             1 :                             // Last thread
-            ApplyOp<FlagOp>::Flag(
+            ApplyOp<FlagOp>::FlagT(
                 flag_op,
                 input[ITEMS_PER_THREAD - 1],
-                temp_storage[linear_tid + 1],
-                (linear_tid * ITEMS_PER_THREAD) + (ITEMS_PER_THREAD - 1));
+                temp_storage.first_items[linear_tid + 1],
+                (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
 
         // Set tail_flags for remaining items
-        Iterate<0, ITEMS_PER_THREAD - 1>::FlagItems(linear_tid, tail_flags, input, flag_op);
+        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
     }
 
 
@@ -509,7 +585,7 @@ public:
      *   in the same thread or the first item in the next thread).
      * - For thread BLOCK_THREADS-1, item
      *   input[ITEMS_PER_THREAD-1] is compared
-     *   against \p tile_predecessor_item.
+     *   against \p tile_successor_item.
      * - \blocked
      * - \granularity
      * - \smemreuse
@@ -565,25 +641,504 @@ public:
         T               tile_successor_item)                ///< [in] [thread BLOCK_THREADS-1 only] Item with which to compare the last tile item (input[ITEMS_PER_THREAD-1] from thread BLOCK_THREADS-1).
     {
         // Share first item
-        temp_storage[linear_tid] = input[0];
+        temp_storage.first_items[linear_tid] = input[0];
+
+        CTA_SYNC();
+
+        // Set flag for last thread-item
+        T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
+            tile_successor_item :              // Last thread
+            temp_storage.first_items[linear_tid + 1];
+
+        tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
+            flag_op,
+            input[ITEMS_PER_THREAD - 1],
+            successor_item,
+            (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
+
+        // Set tail_flags for remaining items
+        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Head & tail flag operations
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
+     *
+     * \par
+     * - The flag head_flags[i] is set for item input[i] when
+     *   flag_op(previous-item, input[i]) returns \p true (where previous-item
+     *   is either the preceding item in the same thread or the last item in
+     *   the previous thread).
+     * - For thread 0, item input[0] is always flagged.
+     * - The flag tail_flags[i] is set for item input[i] when
+     *   flag_op(input[i], next-item) returns \p true (where next-item is
+     *   either the next item in the same thread or the first item in the
+     *   next thread).
+     * - For thread BLOCK_THREADS-1, item input[ITEMS_PER_THREAD-1] is always
+     *   flagged.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the head- and tail-flagging of 512 integer items that
+     * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
+     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
+     *
+     *     // Allocate shared memory for BlockDiscontinuity
+     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Collectively compute head and tail flags for discontinuities in the segment
+     *     int head_flags[4];
+     *     int tail_flags[4];
+     *     BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
+     *         head_flags, tail_flags, thread_data, cub::Inequality());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is
+     * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }.
+     * The corresponding output \p head_flags
+     * in those threads will be { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }
+     * and the corresponding output \p tail_flags in those threads will be
+     * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }.
+     *
+     * \tparam ITEMS_PER_THREAD     [inferred] The number of consecutive items partitioned onto each thread.
+     * \tparam FlagT                [inferred] The flag type (must be an integer type)
+     * \tparam FlagOp               [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeadsAndTails(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
+    {
+        // Share first and last items
+        temp_storage.first_items[linear_tid] = input[0];
+        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        T preds[ITEMS_PER_THREAD];
+
+        // Set flag for first thread-item (preds[0] is undefined for thread 0)
+        if (linear_tid == 0)
+        {
+            head_flags[0] = 1;
+        }
+        else
+        {
+            preds[0] = temp_storage.last_items[linear_tid - 1];
+            head_flags[0] = ApplyOp<FlagOp>::FlagT(
+                flag_op,
+                preds[0],
+                input[0],
+                linear_tid * ITEMS_PER_THREAD);
+        }
+
+
+        // Set flag for last thread-item
+        tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
+            1 :                             // Last thread
+            ApplyOp<FlagOp>::FlagT(
+                flag_op,
+                input[ITEMS_PER_THREAD - 1],
+                temp_storage.first_items[linear_tid + 1],
+                (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
+
+        // Set head_flags for remaining items
+        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
+
+        // Set tail_flags for remaining items
+        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
+    }
+
+
+    /**
+     * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
+     *
+     * \par
+     * - The flag head_flags[i] is set for item input[i] when
+     *   flag_op(previous-item, input[i]) returns \p true (where previous-item
+     *   is either the preceding item in the same thread or the last item in
+     *   the previous thread).
+     * - For thread 0, item input[0] is always flagged.
+     * - The flag tail_flags[i] is set for item input[i] when
+     *   flag_op(input[i], next-item) returns \p true (where next-item is
+     *   either the next item in the same thread or the first item in the
+     *   next thread).
+     * - For thread BLOCK_THREADS-1, item input[ITEMS_PER_THREAD-1] is compared
+     *   against \p tile_successor_item.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the head- and tail-flagging of 512 integer items that
+     * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
+     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
+     *
+     *     // Allocate shared memory for BlockDiscontinuity
+     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Have thread 127 obtain the successor item for the entire tile
+     *     int tile_successor_item;
+     *     if (threadIdx.x == 127) tile_successor_item = ...
+     *
+     *     // Collectively compute head and tail flags for discontinuities in the segment
+     *     int head_flags[4];
+     *     int tail_flags[4];
+     *     BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
+     *         head_flags, tail_flags, tile_successor_item, thread_data, cub::Inequality());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is
+     * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }
+     * and that the \p tile_successor_item is \p 125.  The corresponding output \p head_flags
+     * in those threads will be { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }
+     * and the corresponding output \p tail_flags in those threads will be
+     * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }.
+     *
+     * \tparam ITEMS_PER_THREAD     [inferred] The number of consecutive items partitioned onto each thread.
+     * \tparam FlagT                [inferred] The flag type (must be an integer type)
+     * \tparam FlagOp               [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeadsAndTails(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
+        T               tile_successor_item,                ///< [in] [thread BLOCK_THREADS-1 only] Item with which to compare the last tile item (input[ITEMS_PER_THREAD-1] from thread BLOCK_THREADS-1).
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
+    {
+        // Share first and last items
+        temp_storage.first_items[linear_tid] = input[0];
+        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        T preds[ITEMS_PER_THREAD];
+
+        // Set flag for first thread-item
+        if (linear_tid == 0)
+        {
+            head_flags[0] = 1;
+        }
+        else
+        {
+            preds[0] = temp_storage.last_items[linear_tid - 1];
+            head_flags[0] = ApplyOp<FlagOp>::FlagT(
+                flag_op,
+                preds[0],
+                input[0],
+                linear_tid * ITEMS_PER_THREAD);
+        }
+
+        // Set flag for last thread-item
+        T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
+            tile_successor_item :              // Last thread
+            temp_storage.first_items[linear_tid + 1];
+
+        tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
+            flag_op,
+            input[ITEMS_PER_THREAD - 1],
+            successor_item,
+            (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
+
+        // Set head_flags for remaining items
+        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
+
+        // Set tail_flags for remaining items
+        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
+    }
+
+
+    /**
+     * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
+     *
+     * \par
+     * - The flag head_flags[i] is set for item input[i] when
+     *   flag_op(previous-item, input[i]) returns \p true (where previous-item
+     *   is either the preceding item in the same thread or the last item in
+     *   the previous thread).
+     * - For thread 0, item input[0] is compared
+     *   against \p tile_predecessor_item.
+     * - The flag tail_flags[i] is set for item input[i] when
+     *   flag_op(input[i], next-item) returns \p true (where next-item is
+     *   either the next item in the same thread or the first item in the
+     *   next thread).
+     * - For thread BLOCK_THREADS-1, item input[ITEMS_PER_THREAD-1] is always
+     *   flagged.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the head- and tail-flagging of 512 integer items that
+     * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
+     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
+     *
+     *     // Allocate shared memory for BlockDiscontinuity
+     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Have thread 0 obtain the predecessor item for the entire tile
+     *     int tile_predecessor_item;
+     *     if (threadIdx.x == 0) tile_predecessor_item = ...
+     *
+     *     // Collectively compute head and tail flags for discontinuities in the segment
+     *     int head_flags[4];
+     *     int tail_flags[4];
+     *     BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
+     *         head_flags, tile_predecessor_item, tail_flags,
+     *         thread_data, cub::Inequality());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is
+     * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }
+     * and that the \p tile_predecessor_item is \p 0.  The corresponding output \p head_flags
+     * in those threads will be { [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }
+     * and the corresponding output \p tail_flags in those threads will be
+     * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }.
+     *
+     * \tparam ITEMS_PER_THREAD     [inferred] The number of consecutive items partitioned onto each thread.
+     * \tparam FlagT                [inferred] The flag type (must be an integer type)
+     * \tparam FlagOp               [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeadsAndTails(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        T               tile_predecessor_item,              ///< [in] [thread 0 only] Item with which to compare the first tile item (input[0] from thread 0).
+        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
+    {
+        // Share first and last items
+        temp_storage.first_items[linear_tid] = input[0];
+        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        T preds[ITEMS_PER_THREAD];
+
+        // Set flag for first thread-item
+        preds[0] = (linear_tid == 0) ?
+            tile_predecessor_item :              // First thread
+            temp_storage.last_items[linear_tid - 1];
+
+        head_flags[0] = ApplyOp<FlagOp>::FlagT(
+            flag_op,
+            preds[0],
+            input[0],
+            linear_tid * ITEMS_PER_THREAD);
+
+        // Set flag for last thread-item
+        tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
+            1 :                             // Last thread
+            ApplyOp<FlagOp>::FlagT(
+                flag_op,
+                input[ITEMS_PER_THREAD - 1],
+                temp_storage.first_items[linear_tid + 1],
+                (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
+
+        // Set head_flags for remaining items
+        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
+
+        // Set tail_flags for remaining items
+        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
+    }
+
+
+    /**
+     * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
+     *
+     * \par
+     * - The flag head_flags[i] is set for item input[i] when
+     *   flag_op(previous-item, input[i]) returns \p true (where previous-item
+     *   is either the preceding item in the same thread or the last item in
+     *   the previous thread).
+     * - For thread 0, item input[0] is compared
+     *   against \p tile_predecessor_item.
+     * - The flag tail_flags[i] is set for item input[i] when
+     *   flag_op(input[i], next-item) returns \p true (where next-item is
+     *   either the next item in the same thread or the first item in the
+     *   next thread).
+     * - For thread BLOCK_THREADS-1, item input[ITEMS_PER_THREAD-1] is compared
+     *   against \p tile_successor_item.
+     * - \blocked
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the head- and tail-flagging of 512 integer items that
+     * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
+     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
+     *
+     *     // Allocate shared memory for BlockDiscontinuity
+     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Have thread 0 obtain the predecessor item for the entire tile
+     *     int tile_predecessor_item;
+     *     if (threadIdx.x == 0) tile_predecessor_item = ...
+     *
+     *     // Have thread 127 obtain the successor item for the entire tile
+     *     int tile_successor_item;
+     *     if (threadIdx.x == 127) tile_successor_item = ...
+     *
+     *     // Collectively compute head and tail flags for discontinuities in the segment
+     *     int head_flags[4];
+     *     int tail_flags[4];
+     *     BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
+     *         head_flags, tile_predecessor_item, tail_flags, tile_successor_item,
+     *         thread_data, cub::Inequality());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is
+     * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] },
+     * that the \p tile_predecessor_item is \p 0, and that the
+     * \p tile_successor_item is \p 125.  The corresponding output \p head_flags
+     * in those threads will be { [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }
+     * and the corresponding output \p tail_flags in those threads will be
+     * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }.
+     *
+     * \tparam ITEMS_PER_THREAD     [inferred] The number of consecutive items partitioned onto each thread.
+     * \tparam FlagT                [inferred] The flag type (must be an integer type)
+     * \tparam FlagOp               [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
+     */
+    template <
+        int             ITEMS_PER_THREAD,
+        typename        FlagT,
+        typename        FlagOp>
+    __device__ __forceinline__ void FlagHeadsAndTails(
+        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
+        T               tile_predecessor_item,              ///< [in] [thread 0 only] Item with which to compare the first tile item (input[0] from thread 0).
+        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
+        T               tile_successor_item,                ///< [in] [thread BLOCK_THREADS-1 only] Item with which to compare the last tile item (input[ITEMS_PER_THREAD-1] from thread BLOCK_THREADS-1).
+        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
+        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
+    {
+        // Share first and last items
+        temp_storage.first_items[linear_tid] = input[0];
+        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+        CTA_SYNC();
+
+        T preds[ITEMS_PER_THREAD];
 
-        __syncthreads();
+        // Set flag for first thread-item
+        preds[0] = (linear_tid == 0) ?
+            tile_predecessor_item :              // First thread
+            temp_storage.last_items[linear_tid - 1];
+
+        head_flags[0] = ApplyOp<FlagOp>::FlagT(
+            flag_op,
+            preds[0],
+            input[0],
+            linear_tid * ITEMS_PER_THREAD);
 
-        // Set flag for last item
+        // Set flag for last thread-item
         T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
             tile_successor_item :              // Last thread
-            temp_storage[linear_tid + 1];
+            temp_storage.first_items[linear_tid + 1];
 
-        tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::Flag(
+        tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::FlagT(
             flag_op,
             input[ITEMS_PER_THREAD - 1],
             successor_item,
-            (linear_tid * ITEMS_PER_THREAD) + (ITEMS_PER_THREAD - 1));
+            (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
+
+        // Set head_flags for remaining items
+        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
 
         // Set tail_flags for remaining items
-        Iterate<0, ITEMS_PER_THREAD - 1>::FlagItems(linear_tid, tail_flags, input, flag_op);
+        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
     }
 
+
+
+
     //@}  end member group
 
 };
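Taken together, the hunks above give cub::BlockDiscontinuity a first_items/last_items temp-storage layout, the Flag-to-FlagT rename, and four new FlagHeadsAndTails overloads. A minimal usage sketch, assuming the vendored SRC/cub headers are on the include path; the kernel, its 128-thread/4-item tile shape, and the segment-count output are illustrative, not part of the patch:

#include <cub/cub.cuh>

// Count the segments of equal consecutive values in each 512-item tile
// (illustrative use of the FlagHeadsAndTails overload added above;
// assumes blockDim.x == 128 and block_segment_counts zero-initialized).
__global__ void CountSegmentsSketch(const int *in, int *block_segment_counts)
{
    typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
    __shared__ typename BlockDiscontinuity::TempStorage temp_storage;

    int thread_data[4];
    for (int i = 0; i < 4; ++i)
        thread_data[i] = in[blockIdx.x * 512 + threadIdx.x * 4 + i];

    int head_flags[4];
    int tail_flags[4];
    BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
        head_flags, tail_flags, thread_data, cub::Inequality());

    // Every segment contributes exactly one head flag; sum them block-wide.
    int heads = head_flags[0] + head_flags[1] + head_flags[2] + head_flags[3];
    atomicAdd(&block_segment_counts[blockIdx.x], heads);
}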
diff --git a/SRC/cub/block/block_exchange.cuh b/SRC/cub/block/block_exchange.cuh
index 1eb4c5f4..3ae99343 100644
--- a/SRC/cub/block/block_exchange.cuh
+++ b/SRC/cub/block/block_exchange.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -59,7 +59,7 @@ namespace cub {
  *
  * \par Overview
  * - It is commonplace for blocks of threads to rearrange data items between
- *   threads.  For example, the global memory subsystem prefers access patterns
+ *   threads.  For example, the device-accessible memory subsystem prefers access patterns
  *   where data items are "striped" across threads (where consecutive threads access consecutive items),
  *   yet most block-wide operations prefer a "blocked" partitioning of items across threads
  *   (where consecutive items belong to a single thread).
@@ -68,7 +68,7 @@ namespace cub {
  *   - Transposing between [blocked](index.html#sec5sec3) and [warp-striped](index.html#sec5sec3) arrangements
  *   - Scattering ranked items to a [blocked arrangement](index.html#sec5sec3)
  *   - Scattering ranked items to a [striped arrangement](index.html#sec5sec3)
- * - \blocked
+ * - \rowmajor
  *
  * \par A Simple Example
  * \blockcollective{BlockExchange}
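The blocked/striped distinction this overview keeps referring to is just two index mappings over the same tile. A sketch with illustrative helper names (not part of the patch):

// In a blocked arrangement each thread owns consecutive items; in a striped
// arrangement consecutive threads own consecutive items.
template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ int BlockedIndex(int tid, int item)
{
    return tid * ITEMS_PER_THREAD + item;   // thread 1 owns items 4..7 when ITEMS_PER_THREAD == 4
}

template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
__device__ __forceinline__ int StripedIndex(int tid, int item)
{
    return item * BLOCK_THREADS + tid;      // thread 1 owns items 1, 129, 257, ... when BLOCK_THREADS == 128
}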
@@ -106,7 +106,7 @@ namespace cub {
  *
  */
 template <
-    typename    T,
+    typename    InputT,
     int         BLOCK_DIM_X,
     int         ITEMS_PER_THREAD,
     bool        WARP_TIME_SLICING   = false,
@@ -144,8 +144,8 @@ private:
         WARP_TIME_SLICED_THREADS    = CUB_MIN(BLOCK_THREADS, WARP_THREADS),
         WARP_TIME_SLICED_ITEMS      = WARP_TIME_SLICED_THREADS * ITEMS_PER_THREAD,
 
-        // Insert padding if the number of items per thread is a power of two
-        INSERT_PADDING              = 0, // Mooch PowerOfTwo<ITEMS_PER_THREAD>::VALUE,
+        // Insert padding to avoid bank conflicts during raking when items per thread is a power of two and > 4 (otherwise we can typically use 128b loads)
+        INSERT_PADDING              = (ITEMS_PER_THREAD > 4) && (PowerOfTwo<ITEMS_PER_THREAD>::VALUE),
         PADDING_ITEMS               = (INSERT_PADDING) ? (TIME_SLICED_ITEMS >> LOG_SMEM_BANKS) : 0,
     };
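The new INSERT_PADDING/PADDING_ITEMS values above trade a little shared memory for conflict-free access: one pad slot is inserted per 32 logical items, which is exactly what the recurring item_offset += item_offset >> LOG_SMEM_BANKS lines later in this file compute. A sketch of that arithmetic, assuming 32 shared-memory banks (illustrative helper, not part of the patch):

// Padded-offset arithmetic used throughout this file (assumes 32 banks,
// i.e. LOG_SMEM_BANKS == 5, as on current NVIDIA GPUs).
__device__ __forceinline__ int PaddedOffset(int item_offset)
{
    const int LOG_SMEM_BANKS = 5;
    return item_offset + (item_offset >> LOG_SMEM_BANKS);  // one extra slot per 32 items
}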
 
@@ -154,7 +154,10 @@ private:
      ******************************************************************************/
 
     /// Shared memory storage layout type
-    typedef T _TempStorage[TIME_SLICED_ITEMS + PADDING_ITEMS];
+    struct __align__(16) _TempStorage
+    {
+        InputT buff[TIME_SLICED_ITEMS + PADDING_ITEMS];
+    };
 
 public:
 
@@ -172,10 +175,10 @@ private:
     _TempStorage &temp_storage;
 
     /// Linear thread-id
-    int linear_tid;
-    int lane_id;
-    int warp_id;
-    int warp_offset;
+    unsigned int linear_tid;
+    unsigned int lane_id;
+    unsigned int warp_id;
+    unsigned int warp_offset;
 
 
     /******************************************************************************
@@ -193,26 +196,28 @@ private:
     /**
      * Transposes data items from blocked arrangement to striped arrangement.  Specialized for no timeslicing.
      */
+    template <typename OutputT>
     __device__ __forceinline__ void BlockedToStriped(
-        T               items[ITEMS_PER_THREAD],    ///< [in-out] Items to exchange, converting between blocked and striped arrangements.
-        Int2Type<false> time_slicing)
+        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between blocked and striped arrangements.
+        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between blocked and striped arrangements.
+        Int2Type<false> /*time_slicing*/)
     {
         #pragma unroll
         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
         {
             int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM;
             if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            temp_storage[item_offset] = items[ITEM];
+            temp_storage.buff[item_offset] = input_items[ITEM];
         }
 
-        __syncthreads();
+        CTA_SYNC();
 
         #pragma unroll
         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
         {
             int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
             if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            items[ITEM] = temp_storage[item_offset];
+            output_items[ITEM] = temp_storage.buff[item_offset];
         }
     }
 
@@ -220,11 +225,13 @@ private:
     /**
      * Transposes data items from blocked arrangement to striped arrangement.  Specialized for warp-timeslicing.
      */
+    template <typename OutputT>
     __device__ __forceinline__ void BlockedToStriped(
-        T               items[ITEMS_PER_THREAD],    ///< [in-out] Items to exchange, converting between blocked and striped arrangements.
-        Int2Type<true>  time_slicing)
+        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between blocked and striped arrangements.
+        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between blocked and striped arrangements.
+        Int2Type<true>  /*time_slicing*/)
     {
-        T temp_items[ITEMS_PER_THREAD];
+        InputT temp_items[ITEMS_PER_THREAD];
 
         #pragma unroll
         for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
@@ -232,7 +239,7 @@ private:
             const int SLICE_OFFSET  = SLICE * TIME_SLICED_ITEMS;
             const int SLICE_OOB     = SLICE_OFFSET + TIME_SLICED_ITEMS;
 
-            __syncthreads();
+            CTA_SYNC();
 
             if (warp_id == SLICE)
             {
@@ -241,11 +248,11 @@ private:
                 {
                     int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM;
                     if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-                    temp_storage[item_offset] = items[ITEM];
+                    temp_storage.buff[item_offset] = input_items[ITEM];
                 }
             }
 
-            __syncthreads();
+            CTA_SYNC();
 
             #pragma unroll
             for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
@@ -260,7 +267,7 @@ private:
                     if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
                     {
                         if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-                        temp_items[ITEM] = temp_storage[item_offset];
+                        temp_items[ITEM] = temp_storage.buff[item_offset];
                     }
                 }
             }
@@ -270,7 +277,7 @@ private:
         #pragma unroll
         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
         {
-            items[ITEM] = temp_items[ITEM];
+            output_items[ITEM] = temp_items[ITEM];
         }
     }
 
@@ -278,38 +285,65 @@ private:
     /**
      * Transposes data items from blocked arrangement to warp-striped arrangement. Specialized for no timeslicing
      */
+    template <typename OutputT>
     __device__ __forceinline__ void BlockedToWarpStriped(
-        T               items[ITEMS_PER_THREAD],   ///< [in-out] Items to exchange, converting between blocked and warp-striped arrangements.
-        Int2Type<false> time_slicing)
+        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between blocked and striped arrangements.
+        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between blocked and striped arrangements.
+        Int2Type<false> /*time_slicing*/)
     {
         #pragma unroll
         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
         {
             int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD);
             if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            temp_storage[item_offset] = items[ITEM];
+            temp_storage.buff[item_offset] = input_items[ITEM];
         }
 
+        WARP_SYNC(0xffffffff);
+
         #pragma unroll
         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
         {
             int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
             if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            items[ITEM] = temp_storage[item_offset];
+            output_items[ITEM] = temp_storage.buff[item_offset];
         }
     }
 
     /**
      * Transposes data items from blocked arrangement to warp-striped arrangement. Specialized for warp-timeslicing
      */
+    template <typename OutputT>
     __device__ __forceinline__ void BlockedToWarpStriped(
-        T               items[ITEMS_PER_THREAD],   ///< [in-out] Items to exchange, converting between blocked and warp-striped arrangements.
-        Int2Type<true>  time_slicing)
+        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between blocked and striped arrangements.
+        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between blocked and striped arrangements.
+        Int2Type<true>  /*time_slicing*/)
     {
+        if (warp_id == 0)
+        {
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+            {
+                int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);
+                if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+                temp_storage.buff[item_offset] = input_items[ITEM];
+            }
+
+            WARP_SYNC(0xffffffff);
+
+            #pragma unroll
+            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+            {
+                int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
+                if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
+                output_items[ITEM] = temp_storage.buff[item_offset];
+            }
+        }
+
         #pragma unroll
-        for (int SLICE = 0; SLICE < TIME_SLICES; ++SLICE)
+        for (unsigned int SLICE = 1; SLICE < TIME_SLICES; ++SLICE)
         {
-            __syncthreads();
+            CTA_SYNC();
 
             if (warp_id == SLICE)
             {
@@ -318,15 +352,17 @@ private:
                 {
                     int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);
                     if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-                    temp_storage[item_offset] = items[ITEM];
+                    temp_storage.buff[item_offset] = input_items[ITEM];
                 }
 
+                WARP_SYNC(0xffffffff);
+
                 #pragma unroll
                 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
                 {
                     int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
                     if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-                    items[ITEM] = temp_storage[item_offset];
+                    output_items[ITEM] = temp_storage.buff[item_offset];
                 }
             }
         }
@@ -336,19 +372,21 @@ private:
     /**
      * Transposes data items from striped arrangement to blocked arrangement.  Specialized for no timeslicing.
      */
+    template <typename OutputT>
     __device__ __forceinline__ void StripedToBlocked(
-        T               items[ITEMS_PER_THREAD],   ///< [in-out] Items to exchange, converting between striped and blocked arrangements.
-        Int2Type<false> time_slicing)
+        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between blocked and striped arrangements.
+        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between blocked and striped arrangements.
+        Int2Type<false> /*time_slicing*/)
     {
         #pragma unroll
         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
         {
             int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
             if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            temp_storage[item_offset] = items[ITEM];
+            temp_storage.buff[item_offset] = input_items[ITEM];
         }
 
-        __syncthreads();
+        CTA_SYNC();
 
         // No timeslicing
         #pragma unroll
@@ -356,7 +394,7 @@ private:
         {
             int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM;
             if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            items[ITEM] = temp_storage[item_offset];
+            output_items[ITEM] = temp_storage.buff[item_offset];
         }
     }
 
@@ -364,12 +402,14 @@ private:
     /**
      * Transposes data items from striped arrangement to blocked arrangement.  Specialized for warp-timeslicing.
      */
+    template <typename OutputT>
     __device__ __forceinline__ void StripedToBlocked(
-        T               items[ITEMS_PER_THREAD],   ///< [in-out] Items to exchange, converting between striped and blocked arrangements.
-        Int2Type<true>  time_slicing)
+        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between blocked and striped arrangements.
+        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between blocked and striped arrangements.
+        Int2Type<true>  /*time_slicing*/)
     {
         // Warp time-slicing
-        T temp_items[ITEMS_PER_THREAD];
+        InputT temp_items[ITEMS_PER_THREAD];
 
         #pragma unroll
         for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
@@ -377,7 +417,7 @@ private:
             const int SLICE_OFFSET  = SLICE * TIME_SLICED_ITEMS;
             const int SLICE_OOB     = SLICE_OFFSET + TIME_SLICED_ITEMS;
 
-            __syncthreads();
+            CTA_SYNC();
 
             #pragma unroll
             for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
@@ -392,12 +432,12 @@ private:
                     if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
                     {
                         if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-                        temp_storage[item_offset] = items[ITEM];
+                        temp_storage.buff[item_offset] = input_items[ITEM];
                     }
                 }
             }
 
-            __syncthreads();
+            CTA_SYNC();
 
             if (warp_id == SLICE)
             {
@@ -406,7 +446,7 @@ private:
                 {
                     int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM;
                     if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-                    temp_items[ITEM] = temp_storage[item_offset];
+                    temp_items[ITEM] = temp_storage.buff[item_offset];
                 }
             }
         }
@@ -415,7 +455,7 @@ private:
         #pragma unroll
         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
         {
-            items[ITEM] = temp_items[ITEM];
+            output_items[ITEM] = temp_items[ITEM];
         }
     }
 
@@ -423,24 +463,28 @@ private:
     /**
      * Transposes data items from warp-striped arrangement to blocked arrangement.  Specialized for no timeslicing
      */
+    template <typename OutputT>
     __device__ __forceinline__ void WarpStripedToBlocked(
-        T               items[ITEMS_PER_THREAD],   ///< [in-out] Items to exchange, converting between warp-striped and blocked arrangements.
-        Int2Type<false> time_slicing)
+        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between blocked and striped arrangements.
+        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between blocked and striped arrangements.
+        Int2Type<false> /*time_slicing*/)
     {
         #pragma unroll
         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
         {
             int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
             if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            temp_storage[item_offset] = items[ITEM];
+            temp_storage.buff[item_offset] = input_items[ITEM];
         }
 
+        WARP_SYNC(0xffffffff);
+
         #pragma unroll
         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
         {
             int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD);
             if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            items[ITEM] = temp_storage[item_offset];
+            output_items[ITEM] = temp_storage.buff[item_offset];
         }
     }
 
@@ -448,14 +492,16 @@ private:
     /**
      * Transposes data items from warp-striped arrangement to blocked arrangement.  Specialized for warp-timeslicing
      */
+    template <typename OutputT>
     __device__ __forceinline__ void WarpStripedToBlocked(
-        T               items[ITEMS_PER_THREAD],   ///< [in-out] Items to exchange, converting between warp-striped and blocked arrangements.
-        Int2Type<true>  time_slicing)
+        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between blocked and striped arrangements.
+        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between blocked and striped arrangements.
+        Int2Type<true>  /*time_slicing*/)
     {
         #pragma unroll
-        for (int SLICE = 0; SLICE < TIME_SLICES; ++SLICE)
+        for (unsigned int SLICE = 0; SLICE < TIME_SLICES; ++SLICE)
         {
-            __syncthreads();
+            CTA_SYNC();
 
             if (warp_id == SLICE)
             {
@@ -464,15 +510,17 @@ private:
                 {
                     int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
                     if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-                    temp_storage[item_offset] = items[ITEM];
+                    temp_storage.buff[item_offset] = input_items[ITEM];
                 }
 
+                WARP_SYNC(0xffffffff);
+
                 #pragma unroll
                 for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
                 {
                     int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);
                     if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-                    items[ITEM] = temp_storage[item_offset];
+                    output_items[ITEM] = temp_storage.buff[item_offset];
                 }
             }
         }
@@ -482,46 +530,48 @@ private:
     /**
      * Exchanges data items annotated by rank into blocked arrangement.  Specialized for no timeslicing.
      */
-    template <typename Offset>
+    template <typename OutputT, typename OffsetT>
     __device__ __forceinline__ void ScatterToBlocked(
-        T               items[ITEMS_PER_THREAD],    ///< [in-out] Items to exchange
-        Offset          ranks[ITEMS_PER_THREAD],    ///< [in] Corresponding scatter ranks
-        Int2Type<false> time_slicing)
+        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between blocked and striped arrangements.
+        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between blocked and striped arrangements.
+        OffsetT         ranks[ITEMS_PER_THREAD],    ///< [in] Corresponding scatter ranks
+        Int2Type<false> /*time_slicing*/)
     {
         #pragma unroll
         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
         {
             int item_offset = ranks[ITEM];
             if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
-            temp_storage[item_offset] = items[ITEM];
+            temp_storage.buff[item_offset] = input_items[ITEM];
         }
 
-        __syncthreads();
+        CTA_SYNC();
 
         #pragma unroll
         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
         {
             int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM;
             if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
-            items[ITEM] = temp_storage[item_offset];
+            output_items[ITEM] = temp_storage.buff[item_offset];
         }
     }
 
     /**
      * Exchanges data items annotated by rank into blocked arrangement.  Specialized for warp-timeslicing.
      */
-    template <typename Offset>
+    template <typename OutputT, typename OffsetT>
     __device__ __forceinline__ void ScatterToBlocked(
-        T               items[ITEMS_PER_THREAD],    ///< [in-out] Items to exchange
-        Offset          ranks[ITEMS_PER_THREAD],    ///< [in] Corresponding scatter ranks
-        Int2Type<true>  time_slicing)
+        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between blocked and striped arrangements.
+        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between blocked and striped arrangements.
+        OffsetT         ranks[ITEMS_PER_THREAD],    ///< [in] Corresponding scatter ranks
+        Int2Type<true>  /*time_slicing*/)
     {
-        T temp_items[ITEMS_PER_THREAD];
+        InputT temp_items[ITEMS_PER_THREAD];
 
         #pragma unroll
         for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
         {
-            __syncthreads();
+            CTA_SYNC();
 
             const int SLICE_OFFSET = TIME_SLICED_ITEMS * SLICE;
 
@@ -532,11 +582,11 @@ private:
                 if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS))
                 {
                     if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
-                    temp_storage[item_offset] = items[ITEM];
+                    temp_storage.buff[item_offset] = input_items[ITEM];
                 }
             }
 
-            __syncthreads();
+            CTA_SYNC();
 
             if (warp_id == SLICE)
             {
@@ -545,7 +595,7 @@ private:
                 {
                     int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM;
                     if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
-                    temp_items[ITEM] = temp_storage[item_offset];
+                    temp_items[ITEM] = temp_storage.buff[item_offset];
                 }
             }
         }
@@ -554,7 +604,7 @@ private:
         #pragma unroll
         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
         {
-            items[ITEM] = temp_items[ITEM];
+            output_items[ITEM] = temp_items[ITEM];
         }
     }
 
@@ -562,28 +612,29 @@ private:
     /**
      * Exchanges data items annotated by rank into striped arrangement.  Specialized for no timeslicing.
      */
-    template <typename Offset>
+    template <typename OutputT, typename OffsetT>
     __device__ __forceinline__ void ScatterToStriped(
-        T               items[ITEMS_PER_THREAD],    ///< [in-out] Items to exchange
-        Offset          ranks[ITEMS_PER_THREAD],    ///< [in] Corresponding scatter ranks
-        Int2Type<false> time_slicing)
+        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between blocked and striped arrangements.
+        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between blocked and striped arrangements.
+        OffsetT         ranks[ITEMS_PER_THREAD],    ///< [in] Corresponding scatter ranks
+        Int2Type<false> /*time_slicing*/)
     {
         #pragma unroll
         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
         {
             int item_offset = ranks[ITEM];
             if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
-            temp_storage[item_offset] = items[ITEM];
+            temp_storage.buff[item_offset] = input_items[ITEM];
         }
 
-        __syncthreads();
+        CTA_SYNC();
 
         #pragma unroll
         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
         {
             int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
             if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
-            items[ITEM] = temp_storage[item_offset];
+            output_items[ITEM] = temp_storage.buff[item_offset];
         }
     }
 
@@ -591,13 +642,14 @@ private:
     /**
      * Exchanges data items annotated by rank into striped arrangement.  Specialized for warp-timeslicing.
      */
-    template <typename Offset>
+    template <typename OutputT, typename OffsetT>
     __device__ __forceinline__ void ScatterToStriped(
-        T               items[ITEMS_PER_THREAD],    ///< [in-out] Items to exchange
-        Offset          ranks[ITEMS_PER_THREAD],    ///< [in] Corresponding scatter ranks
-        Int2Type<true>  time_slicing)
+        InputT          input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between blocked and striped arrangements.
+        OutputT         output_items[ITEMS_PER_THREAD],     ///< [out] Items to exchange, converting between blocked and striped arrangements.
+        OffsetT         ranks[ITEMS_PER_THREAD],    ///< [in] Corresponding scatter ranks
+        Int2Type<true>  /*time_slicing*/)
     {
-        T temp_items[ITEMS_PER_THREAD];
+        InputT temp_items[ITEMS_PER_THREAD];
 
         #pragma unroll
         for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
@@ -605,7 +657,7 @@ private:
             const int SLICE_OFFSET  = SLICE * TIME_SLICED_ITEMS;
             const int SLICE_OOB     = SLICE_OFFSET + TIME_SLICED_ITEMS;
 
-            __syncthreads();
+            CTA_SYNC();
 
             #pragma unroll
             for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
@@ -614,11 +666,11 @@ private:
                 if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS))
                 {
                     if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
-                    temp_storage[item_offset] = items[ITEM];
+                    temp_storage.buff[item_offset] = input_items[ITEM];
                 }
             }
 
-            __syncthreads();
+            CTA_SYNC();
 
             #pragma unroll
             for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
@@ -633,7 +685,7 @@ private:
                     if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
                     {
                         if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-                        temp_items[ITEM] = temp_storage[item_offset];
+                        temp_items[ITEM] = temp_storage.buff[item_offset];
                     }
                 }
             }
@@ -643,7 +695,7 @@ private:
         #pragma unroll
         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
         {
-            items[ITEM] = temp_items[ITEM];
+            output_items[ITEM] = temp_items[ITEM];
         }
     }
 
@@ -676,8 +728,8 @@ public:
     :
         temp_storage(temp_storage.Alias()),
         linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
-        warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
         lane_id(LaneId()),
+        warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
         warp_offset(warp_id * WARP_TIME_SLICED_ITEMS)
     {}
 
@@ -714,22 +766,25 @@ public:
      *     cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data);
      *
      *     // Collectively exchange data into a blocked arrangement across threads
-     *     BlockExchange(temp_storage).StripedToBlocked(thread_data);
+     *     BlockExchange(temp_storage).StripedToBlocked(thread_data, thread_data);
      *
      * \endcode
      * \par
      * Suppose the set of striped input \p thread_data across the block of threads is
-     * { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] } after loading from global memory.
+     * { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] } after loading from device-accessible memory.
      * The corresponding output \p thread_data in those threads will be
      * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }.
      *
      */
+    template <typename OutputT>
     __device__ __forceinline__ void StripedToBlocked(
-        T                items[ITEMS_PER_THREAD])   ///< [in-out] Items to exchange, converting between striped and blocked arrangements.
+        InputT      input_items[ITEMS_PER_THREAD],    ///< [in] Items to exchange, converting between striped and blocked arrangements.
+        OutputT     output_items[ITEMS_PER_THREAD])   ///< [out] Items from exchange, converting between striped and blocked arrangements.
     {
-        StripedToBlocked(items, Int2Type<WARP_TIME_SLICING>());
+        StripedToBlocked(input_items, output_items, Int2Type<WARP_TIME_SLICING>());
     }
 
+
     /**
      * \brief Transposes data items from blocked arrangement to striped arrangement.
      *
@@ -756,7 +811,7 @@ public:
      *     ...
      *
      *     // Collectively exchange data into a striped arrangement across threads
-     *     BlockExchange(temp_storage).BlockedToStriped(thread_data);
+     *     BlockExchange(temp_storage).BlockedToStriped(thread_data, thread_data);
      *
      *     // Store data striped across block threads into an ordered tile
      *     cub::StoreDirectStriped<128>(threadIdx.x, d_data, thread_data);
@@ -767,16 +822,19 @@ public:
      * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }.
      * The corresponding output \p thread_data in those threads will be
      * { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] } in
-     * preparation for storing to global memory.
+     * preparation for storing to device-accessible memory.
      *
      */
+    template <typename OutputT>
     __device__ __forceinline__ void BlockedToStriped(
-        T               items[ITEMS_PER_THREAD])    ///< [in-out] Items to exchange, converting between blocked and striped arrangements.
+        InputT      input_items[ITEMS_PER_THREAD],    ///< [in] Items to exchange, converting between striped and blocked arrangements.
+        OutputT     output_items[ITEMS_PER_THREAD])   ///< [out] Items from exchange, converting between striped and blocked arrangements.
     {
-        BlockedToStriped(items, Int2Type<WARP_TIME_SLICING>());
+        BlockedToStriped(input_items, output_items, Int2Type<WARP_TIME_SLICING>());
     }
 
 
+
     /**
      * \brief Transposes data items from warp-striped arrangement to blocked arrangement.
      *
@@ -809,18 +867,22 @@ public:
      * \par
      * Suppose the set of warp-striped input \p thread_data across the block of threads is
      * { [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }
-     * after loading from global memory.  (The first 128 items are striped across
+     * after loading from device-accessible memory.  (The first 128 items are striped across
      * the first warp of 32 threads, the second 128 items are striped across the second warp, etc.)
      * The corresponding output \p thread_data in those threads will be
      * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }.
      *
      */
+    template <typename OutputT>
     __device__ __forceinline__ void WarpStripedToBlocked(
-        T                items[ITEMS_PER_THREAD])   ///< [in-out] Items to exchange, converting between warp-striped and blocked arrangements.
+        InputT      input_items[ITEMS_PER_THREAD],    ///< [in] Items to exchange, converting between striped and blocked arrangements.
+        OutputT     output_items[ITEMS_PER_THREAD])   ///< [out] Items from exchange, converting between striped and blocked arrangements.
     {
-        WarpStripedToBlocked(items, Int2Type<WARP_TIME_SLICING>());
+        WarpStripedToBlocked(input_items, output_items, Int2Type<WARP_TIME_SLICING>());
     }
 
+
+
     /**
      * \brief Transposes data items from blocked arrangement to warp-striped arrangement.
      *
@@ -847,7 +909,7 @@ public:
      *     ...
      *
      *     // Collectively exchange data into a warp-striped arrangement across threads
-     *     BlockExchange(temp_storage).BlockedToWarpStriped(thread_data);
+     *     BlockExchange(temp_storage).BlockedToWarpStriped(thread_data, thread_data);
      *
      *     // Store data striped across warp threads into an ordered tile
      *     cub::StoreDirectStriped(threadIdx.x, d_data, thread_data);
@@ -858,17 +920,20 @@ public:
      * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }.
      * The corresponding output \p thread_data in those threads will be
      * { [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }
-     * in preparation for storing to global memory. (The first 128 items are striped across
+     * in preparation for storing to device-accessible memory. (The first 128 items are striped across
      * the first warp of 32 threads, the second 128 items are striped across the second warp, etc.)
      *
      */
+    template <typename OutputT>
     __device__ __forceinline__ void BlockedToWarpStriped(
-        T                items[ITEMS_PER_THREAD])   ///< [in-out] Items to exchange, converting between blocked and warp-striped arrangements.
+        InputT      input_items[ITEMS_PER_THREAD],    ///< [in] Items to exchange, converting between striped and blocked arrangements.
+        OutputT     output_items[ITEMS_PER_THREAD])   ///< [out] Items from exchange, converting between striped and blocked arrangements.
     {
-        BlockedToWarpStriped(items, Int2Type<WARP_TIME_SLICING>());
+        BlockedToWarpStriped(input_items, output_items, Int2Type<WARP_TIME_SLICING>());
     }
 
 
+
     //@}  end member group
     /******************************************************************//**
      * \name Scatter exchanges
@@ -882,46 +947,51 @@ public:
      * \par
      * - \smemreuse
      *
-     * \tparam Offset                               [inferred] Signed integer type for local offsets
+     * \tparam OffsetT                              [inferred] Signed integer type for local offsets
      */
-    template <typename Offset>
+    template <typename OutputT, typename OffsetT>
     __device__ __forceinline__ void ScatterToBlocked(
-        T               items[ITEMS_PER_THREAD],    ///< [in-out] Items to exchange
-        Offset          ranks[ITEMS_PER_THREAD])    ///< [in] Corresponding scatter ranks
+        InputT      input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between striped and blocked arrangements.
+        OutputT     output_items[ITEMS_PER_THREAD],     ///< [out] Items from exchange, converting between striped and blocked arrangements.
+        OffsetT     ranks[ITEMS_PER_THREAD])            ///< [in] Corresponding scatter ranks
     {
-        ScatterToBlocked(items, ranks, Int2Type<WARP_TIME_SLICING>());
+        ScatterToBlocked(input_items, output_items, ranks, Int2Type<WARP_TIME_SLICING>());
     }
 
 
+
     /**
      * \brief Exchanges data items annotated by rank into striped arrangement.
      *
      * \par
      * - \smemreuse
      *
-     * \tparam Offset                               [inferred] Signed integer type for local offsets
+     * \tparam OffsetT                              [inferred] Signed integer type for local offsets
      */
-    template <typename Offset>
+    template <typename OutputT, typename OffsetT>
     __device__ __forceinline__ void ScatterToStriped(
-        T               items[ITEMS_PER_THREAD],    ///< [in-out] Items to exchange
-        Offset          ranks[ITEMS_PER_THREAD])    ///< [in] Corresponding scatter ranks
+        InputT      input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between striped and blocked arrangements.
+        OutputT     output_items[ITEMS_PER_THREAD],     ///< [out] Items from exchange, converting between striped and blocked arrangements.
+        OffsetT     ranks[ITEMS_PER_THREAD])            ///< [in] Corresponding scatter ranks
     {
-        ScatterToStriped(items, ranks, Int2Type<WARP_TIME_SLICING>());
+        ScatterToStriped(input_items, output_items, ranks, Int2Type<WARP_TIME_SLICING>());
     }
 
 
+
     /**
      * \brief Exchanges data items annotated by rank into striped arrangement.  Items with rank -1 are not exchanged.
      *
      * \par
      * - \smemreuse
      *
-     * \tparam Offset                               [inferred] Signed integer type for local offsets
+     * \tparam OffsetT                              [inferred] Signed integer type for local offsets
      */
-    template <typename Offset>
+    template <typename OutputT, typename OffsetT>
     __device__ __forceinline__ void ScatterToStripedGuarded(
-        T               items[ITEMS_PER_THREAD],        ///< [in-out] Items to exchange
-        Offset          ranks[ITEMS_PER_THREAD])        ///< [in] Corresponding scatter ranks
+        InputT      input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between striped and blocked arrangements.
+        OutputT     output_items[ITEMS_PER_THREAD],     ///< [out] Items from exchange, converting between striped and blocked arrangements.
+        OffsetT     ranks[ITEMS_PER_THREAD])            ///< [in] Corresponding scatter ranks
     {
         #pragma unroll
         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
@@ -929,34 +999,38 @@ public:
             int item_offset = ranks[ITEM];
             if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
             if (ranks[ITEM] >= 0)
-                temp_storage[item_offset] = items[ITEM];
+                temp_storage.buff[item_offset] = input_items[ITEM];
         }
 
-        __syncthreads();
+        CTA_SYNC();
 
         #pragma unroll
         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
         {
             int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
             if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
-            items[ITEM] = temp_storage[item_offset];
+            output_items[ITEM] = temp_storage.buff[item_offset];
         }
     }
 
+
+
+
     /**
      * \brief Exchanges valid data items annotated by rank into striped arrangement.
      *
      * \par
      * - \smemreuse
      *
-     * \tparam Offset                               [inferred] Signed integer type for local offsets
-     * \tparam ValidFlag                            [inferred] Flag type denoting which items are valid
+     * \tparam OffsetT                              [inferred] Signed integer type for local offsets
+     * \tparam ValidFlag                            [inferred] FlagT type denoting which items are valid
      */
-    template <typename Offset, typename ValidFlag>
-    __device__ __forceinline__ void ScatterToStriped(
-        T               items[ITEMS_PER_THREAD],        ///< [in-out] Items to exchange
-        Offset          ranks[ITEMS_PER_THREAD],        ///< [in] Corresponding scatter ranks
-        ValidFlag       is_valid[ITEMS_PER_THREAD])     ///< [in] Corresponding flag denoting item validity
+    template <typename OutputT, typename OffsetT, typename ValidFlag>
+    __device__ __forceinline__ void ScatterToStripedFlagged(
+        InputT      input_items[ITEMS_PER_THREAD],      ///< [in] Items to exchange, converting between striped and blocked arrangements.
+        OutputT     output_items[ITEMS_PER_THREAD],     ///< [out] Items from exchange, converting between striped and blocked arrangements.
+        OffsetT     ranks[ITEMS_PER_THREAD],            ///< [in] Corresponding scatter ranks
+        ValidFlag   is_valid[ITEMS_PER_THREAD])         ///< [in] Corresponding flag denoting item validity
     {
         #pragma unroll
         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
@@ -964,25 +1038,211 @@ public:
             int item_offset = ranks[ITEM];
             if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
             if (is_valid[ITEM])
-                temp_storage[item_offset] = items[ITEM];
+                temp_storage.buff[item_offset] = input_items[ITEM];
         }
 
-        __syncthreads();
+        CTA_SYNC();
 
         #pragma unroll
         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
         {
             int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
             if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
-            items[ITEM] = temp_storage[item_offset];
+            output_items[ITEM] = temp_storage.buff[item_offset];
         }
     }
 
+
     //@}  end member group
 
 
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+
+    __device__ __forceinline__ void StripedToBlocked(
+        InputT      items[ITEMS_PER_THREAD])   ///< [in-out] Items to exchange, converting between striped and blocked arrangements.
+    {
+        StripedToBlocked(items, items);
+    }
+
+    __device__ __forceinline__ void BlockedToStriped(
+        InputT      items[ITEMS_PER_THREAD])   ///< [in-out] Items to exchange, converting between striped and blocked arrangements.
+    {
+        BlockedToStriped(items, items);
+    }
+
+    __device__ __forceinline__ void WarpStripedToBlocked(
+        InputT      items[ITEMS_PER_THREAD])    ///< [in-out] Items to exchange, converting between striped and blocked arrangements.
+    {
+        WarpStripedToBlocked(items, items);
+    }
+
+    __device__ __forceinline__ void BlockedToWarpStriped(
+        InputT      items[ITEMS_PER_THREAD])    ///< [in-out] Items to exchange, converting between striped and blocked arrangements.
+    {
+        BlockedToWarpStriped(items, items);
+    }
+
+    template <typename OffsetT>
+    __device__ __forceinline__ void ScatterToBlocked(
+        InputT      items[ITEMS_PER_THREAD],    ///< [in-out] Items to exchange, converting between striped and blocked arrangements.
+        OffsetT     ranks[ITEMS_PER_THREAD])    ///< [in] Corresponding scatter ranks
+    {
+        ScatterToBlocked(items, items, ranks);
+    }
+
+    template <typename OffsetT>
+    __device__ __forceinline__ void ScatterToStriped(
+        InputT      items[ITEMS_PER_THREAD],    ///< [in-out] Items to exchange, converting between striped and blocked arrangements.
+        OffsetT     ranks[ITEMS_PER_THREAD])    ///< [in] Corresponding scatter ranks
+    {
+        ScatterToStriped(items, items, ranks);
+    }
+
+    template <typename OffsetT>
+    __device__ __forceinline__ void ScatterToStripedGuarded(
+        InputT      items[ITEMS_PER_THREAD],    ///< [in-out] Items to exchange, converting between striped and blocked arrangements.
+        OffsetT     ranks[ITEMS_PER_THREAD])    ///< [in] Corresponding scatter ranks
+    {
+        ScatterToStripedGuarded(items, items, ranks);
+    }
+
+    template <typename OffsetT, typename ValidFlag>
+    __device__ __forceinline__ void ScatterToStripedFlagged(
+        InputT      items[ITEMS_PER_THREAD],        ///< [in-out] Items to exchange, converting between striped and blocked arrangements.
+        OffsetT     ranks[ITEMS_PER_THREAD],        ///< [in] Corresponding scatter ranks
+        ValidFlag   is_valid[ITEMS_PER_THREAD])     ///< [in] Corresponding flag denoting item validity
+    {
+        ScatterToStripedFlagged(items, items, ranks, is_valid);
+    }
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
 };
 
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
+
+template <
+    typename    T,
+    int         ITEMS_PER_THREAD,
+    int         LOGICAL_WARP_THREADS    = CUB_PTX_WARP_THREADS,
+    int         PTX_ARCH                = CUB_PTX_ARCH>
+class WarpExchange
+{
+private:
+
+    /******************************************************************************
+     * Constants
+     ******************************************************************************/
+
+    /// Constants
+    enum
+    {
+        // Whether the logical warp size and the PTX warp size coincide
+        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
+
+        WARP_ITEMS                  = (ITEMS_PER_THREAD * LOGICAL_WARP_THREADS) + 1,
+
+        LOG_SMEM_BANKS              = CUB_LOG_SMEM_BANKS(PTX_ARCH),
+        SMEM_BANKS                  = 1 << LOG_SMEM_BANKS,
+
+        // Insert padding if the number of items per thread is a power of two and > 4 (otherwise we can typically use 128b loads)
+        INSERT_PADDING              = (ITEMS_PER_THREAD > 4) && (PowerOfTwo<ITEMS_PER_THREAD>::VALUE),
+        PADDING_ITEMS               = (INSERT_PADDING) ? (WARP_ITEMS >> LOG_SMEM_BANKS) : 0,
+    };
+
+    /******************************************************************************
+     * Type definitions
+     ******************************************************************************/
+
+    /// Shared memory storage layout type
+    struct _TempStorage
+    {
+        T buff[WARP_ITEMS + PADDING_ITEMS];
+    };
+
+public:
+
+    /// \smemstorage{WarpExchange}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+private:
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    _TempStorage    &temp_storage;
+    int             lane_id;
+
+public:
+
+    /******************************************************************************
+     * Construction
+     ******************************************************************************/
+
+    /// Constructor
+    __device__ __forceinline__ WarpExchange(
+        TempStorage &temp_storage)
+    :
+        temp_storage(temp_storage.Alias()),
+        lane_id(IS_ARCH_WARP ?
+            LaneId() :
+            LaneId() % LOGICAL_WARP_THREADS)
+    {}
+
+
+    /******************************************************************************
+     * Interface
+     ******************************************************************************/
+
+    /**
+     * \brief Exchanges valid data items annotated by rank into striped arrangement.
+     *
+     * \par
+     * - \smemreuse
+     *
+     * \tparam OffsetT                              [inferred] Signed integer type for local offsets
+     */
+    template <typename OffsetT>
+    __device__ __forceinline__ void ScatterToStriped(
+        T               items[ITEMS_PER_THREAD],        ///< [in-out] Items to exchange
+        OffsetT         ranks[ITEMS_PER_THREAD])        ///< [in] Corresponding scatter ranks
+    {
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            if (INSERT_PADDING) ranks[ITEM] = SHR_ADD(ranks[ITEM], LOG_SMEM_BANKS, ranks[ITEM]);
+            temp_storage.buff[ranks[ITEM]] = items[ITEM];
+        }
+
+        WARP_SYNC(0xffffffff);
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
+        {
+            int item_offset = (ITEM * LOGICAL_WARP_THREADS) + lane_id;
+            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
+            items[ITEM] = temp_storage.buff[item_offset];
+        }
+    }
+
+};
+
+
+
+
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+
+
+
 }               // CUB namespace
 CUB_NS_POSTFIX  // Optional outer namespace(s)
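
The exchange methods above now take separate input and output arrays (the two may
alias, which reproduces the old in-place behavior), and the single-array overloads
are retained for backward compatibility under DOXYGEN_SHOULD_SKIP_THIS. A minimal
kernel sketch of the two-argument form against these vendored headers; the kernel
name and tile shape are illustrative only:

    #include <cub/block/block_exchange.cuh>
    #include <cub/block/block_load.cuh>

    // Illustrative kernel: 128 threads, 4 ints per thread.
    __global__ void ExampleStripedToBlocked(int *d_data)
    {
        // Specialize BlockExchange for a 1D block of 128 threads owning 4 ints each
        typedef cub::BlockExchange<int, 128, 4> BlockExchange;
        __shared__ typename BlockExchange::TempStorage temp_storage;

        // Load a tile of data striped across threads
        int thread_data[4];
        cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data);

        // Two-argument exchange; passing the same array twice gives the
        // old in-place behavior
        BlockExchange(temp_storage).StripedToBlocked(thread_data, thread_data);
    }

The new WarpExchange class stays undocumented for now (it is wrapped in
DOXYGEN_SHOULD_SKIP_THIS) and synchronizes with WARP_SYNC rather than a
block-wide barrier.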
 
diff --git a/SRC/cub/block/block_histogram.cuh b/SRC/cub/block/block_histogram.cuh
index 1ec78388..b7cb9700 100644
--- a/SRC/cub/block/block_histogram.cuh
+++ b/SRC/cub/block/block_histogram.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -141,7 +141,7 @@ enum BlockHistogramAlgorithm
  * \endcode
  *
  * \par Performance and Usage Considerations
- * - The histogram output can be constructed in shared or global memory
+ * - The histogram output can be constructed in shared or device-accessible memory
  * - See cub::BlockHistogramAlgorithm for performance details regarding algorithmic alternatives
  *
  */
@@ -197,7 +197,7 @@ private:
     _TempStorage &temp_storage;
 
     /// Linear thread-id
-    int linear_tid;
+    unsigned int linear_tid;
 
 
     /******************************************************************************
@@ -285,10 +285,10 @@ public:
      *
      * \endcode
      *
-     * \tparam HistoCounter         [inferred] Histogram counter type
+     * \tparam CounterT              [inferred] Histogram counter type
      */
-    template <typename HistoCounter>
-    __device__ __forceinline__ void InitHistogram(HistoCounter histogram[BINS])
+    template <typename CounterT>
+    __device__ __forceinline__ void InitHistogram(CounterT      histogram[BINS])
     {
         // Initialize histogram bin counts to zeros
         int histo_offset = 0;
@@ -307,7 +307,7 @@ public:
 
 
     /**
-     * \brief Constructs a block-wide histogram in shared/global memory.  Each thread contributes an array of input elements.
+     * \brief Constructs a block-wide histogram in shared/device-accessible memory.  Each thread contributes an array of input elements.
      *
      * \par
      * - \granularity
@@ -340,18 +340,18 @@ public:
      *
      * \endcode
      *
-     * \tparam HistoCounter         [inferred] Histogram counter type
+     * \tparam CounterT              [inferred] Histogram counter type
      */
     template <
-        typename            HistoCounter>
+        typename            CounterT     >
     __device__ __forceinline__ void Histogram(
         T                   (&items)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input values to histogram
-        HistoCounter        histogram[BINS])                ///< [out] Reference to shared/global memory histogram
+        CounterT             histogram[BINS])                ///< [out] Reference to shared/device-accessible memory histogram
     {
         // Initialize histogram bin counts to zeros
         InitHistogram(histogram);
 
-        __syncthreads();
+        CTA_SYNC();
 
         // Composite the histogram
         InternalBlockHistogram(temp_storage).Composite(items, histogram);
@@ -360,7 +360,7 @@ public:
 
 
     /**
-     * \brief Updates an existing block-wide histogram in shared/global memory.  Each thread composites an array of input elements.
+     * \brief Updates an existing block-wide histogram in shared/device-accessible memory.  Each thread composites an array of input elements.
      *
      * \par
      * - \granularity
@@ -397,13 +397,13 @@ public:
      *
      * \endcode
      *
-     * \tparam HistoCounter         [inferred] Histogram counter type
+     * \tparam CounterT              [inferred] Histogram counter type
      */
     template <
-        typename            HistoCounter>
+        typename            CounterT     >
     __device__ __forceinline__ void Composite(
         T                   (&items)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input values to histogram
-        HistoCounter        histogram[BINS])                 ///< [out] Reference to shared/global memory histogram
+        CounterT             histogram[BINS])                 ///< [out] Reference to shared/device-accessible memory histogram
     {
         InternalBlockHistogram(temp_storage).Composite(items, histogram);
     }
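
Apart from the HistoCounter -> CounterT rename and the switch to CTA_SYNC(), the
histogram interface is unchanged. A minimal sketch of the composite path, assuming
256 bins and unsigned char samples (both illustrative):

    #include <cub/block/block_histogram.cuh>

    // Illustrative kernel: 128 threads, 4 samples per thread, 256 bins.
    __global__ void ExampleHistogram(unsigned char *d_samples)
    {
        typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
        __shared__ typename BlockHistogram::TempStorage temp_storage;
        __shared__ unsigned int smem_histogram[256];

        // Each thread contributes 4 consecutive samples
        unsigned char data[4];
        for (int i = 0; i < 4; ++i)
            data[i] = d_samples[(blockIdx.x * 128 + threadIdx.x) * 4 + i];

        // Histogram() zeroes the bins via InitHistogram(), synchronizes with
        // CTA_SYNC(), then composites; CounterT is inferred from the
        // histogram argument
        BlockHistogram(temp_storage).Histogram(data, smem_histogram);
    }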
diff --git a/SRC/cub/block/block_load.cuh b/SRC/cub/block/block_load.cuh
index afa8ff7c..217f5212 100644
--- a/SRC/cub/block/block_load.cuh
+++ b/SRC/cub/block/block_load.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2016, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -36,6 +36,7 @@
 #include <iterator>
 
 #include "block_exchange.cuh"
+#include "../iterator/cache_modified_input_iterator.cuh"
 #include "../util_ptx.cuh"
 #include "../util_macro.cuh"
 #include "../util_type.cuh"
@@ -66,22 +67,24 @@ namespace cub {
  *
  * \tparam T                    [inferred] The data type to load.
  * \tparam ITEMS_PER_THREAD     [inferred] The number of consecutive items partitioned onto each thread.
- * \tparam InputIterator        [inferred] The random-access iterator type for input \iterator.
+ * \tparam InputIteratorT       [inferred] The random-access iterator type for input \iterator.
  */
 template <
-    typename        T,
+    typename        InputT,
     int             ITEMS_PER_THREAD,
-    typename        InputIterator>
+    typename        InputIteratorT>
 __device__ __forceinline__ void LoadDirectBlocked(
     int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks)
-    InputIterator   block_itr,                  ///< [in] The thread block's base input iterator for loading from
-    T               (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+    InputT          (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
 {
+    InputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD);
+
     // Load directly in thread-blocked order
     #pragma unroll
     for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
     {
-        items[ITEM] = block_itr[(linear_tid * ITEMS_PER_THREAD) + ITEM];
+        items[ITEM] = thread_itr[ITEM];
     }
 }
 
@@ -93,26 +96,26 @@ __device__ __forceinline__ void LoadDirectBlocked(
  *
  * \tparam T                    [inferred] The data type to load.
  * \tparam ITEMS_PER_THREAD     [inferred] The number of consecutive items partitioned onto each thread.
- * \tparam InputIterator        [inferred] The random-access iterator type for input \iterator.
+ * \tparam InputIteratorT       [inferred] The random-access iterator type for input \iterator.
  */
 template <
-    typename        T,
+    typename        InputT,
     int             ITEMS_PER_THREAD,
-    typename        InputIterator>
+    typename        InputIteratorT>
 __device__ __forceinline__ void LoadDirectBlocked(
     int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks)
-    InputIterator   block_itr,                  ///< [in] The thread block's base input iterator for loading from
-    T               (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+    InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
     int             valid_items)                ///< [in] Number of valid items to load
 {
-    int bounds = valid_items - (linear_tid * ITEMS_PER_THREAD);
+    InputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD);
 
     #pragma unroll
     for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
     {
-        if (ITEM < bounds)
+        if ((linear_tid * ITEMS_PER_THREAD) + ITEM < valid_items)
         {
-            items[ITEM] = block_itr[(linear_tid * ITEMS_PER_THREAD) + ITEM];
+            items[ITEM] = thread_itr[ITEM];
         }
     }
 }
@@ -125,88 +128,109 @@ __device__ __forceinline__ void LoadDirectBlocked(
  *
  * \tparam T                    [inferred] The data type to load.
  * \tparam ITEMS_PER_THREAD     [inferred] The number of consecutive items partitioned onto each thread.
- * \tparam InputIterator        [inferred] The random-access iterator type for input \iterator.
+ * \tparam InputIteratorT       [inferred] The random-access iterator type for input \iterator.
  */
 template <
-    typename        T,
+    typename        InputT,
+    typename        DefaultT,
     int             ITEMS_PER_THREAD,
-    typename        InputIterator>
+    typename        InputIteratorT>
 __device__ __forceinline__ void LoadDirectBlocked(
     int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks)
-    InputIterator   block_itr,                  ///< [in] The thread block's base input iterator for loading from
-    T               (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+    InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
     int             valid_items,                ///< [in] Number of valid items to load
-    T               oob_default)                ///< [in] Default value to assign out-of-bound items
+    DefaultT        oob_default)                ///< [in] Default value to assign out-of-bound items
 {
     #pragma unroll
     for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-    {
         items[ITEM] = oob_default;
-    }
 
     LoadDirectBlocked(linear_tid, block_itr, items, valid_items);
 }
 
 
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
 /**
- * \brief Load a linear segment of items into a blocked arrangement across the thread block.
- *
- * \blocked
- *
- * The input offset (\p block_ptr + \p block_offset) must be quad-item aligned
- *
- * The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT:
- *   - \p ITEMS_PER_THREAD is odd
- *   - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.)
- *
- * \tparam T                    [inferred] The data type to load.
- * \tparam ITEMS_PER_THREAD     [inferred] The number of consecutive items partitioned onto each thread.
+ * Internal implementation for load vectorization
  */
 template <
-    typename        T,
-    int             ITEMS_PER_THREAD>
-__device__ __forceinline__ void LoadDirectBlockedVectorized(
-    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks)
-    T               *block_ptr,                 ///< [in] Input pointer for loading from
-    T               (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+    CacheLoadModifier   MODIFIER,
+    typename            T,
+    int                 ITEMS_PER_THREAD>
+__device__ __forceinline__ void InternalLoadDirectBlockedVectorized(
+    int    linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks)
+    T      *block_ptr,                 ///< [in] Input pointer for loading from
+    T      (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
 {
+    // Biggest memory access word that T is a whole multiple of
+    typedef typename UnitWord<T>::DeviceWord DeviceWord;
+
     enum
     {
-        // Maximum CUDA vector size is 4 elements
-        MAX_VEC_SIZE = CUB_MIN(4, ITEMS_PER_THREAD),
+        TOTAL_WORDS = sizeof(items) / sizeof(DeviceWord),
 
-        // Vector size must be a power of two and an even divisor of the items per thread
-        VEC_SIZE = ((((MAX_VEC_SIZE - 1) & MAX_VEC_SIZE) == 0) && ((ITEMS_PER_THREAD % MAX_VEC_SIZE) == 0)) ?
-            MAX_VEC_SIZE :
-            1,
+        VECTOR_SIZE = (TOTAL_WORDS % 4 == 0) ?
+            4 :
+            (TOTAL_WORDS % 2 == 0) ?
+                2 :
+                1,
 
-        VECTORS_PER_THREAD = ITEMS_PER_THREAD / VEC_SIZE,
+        VECTORS_PER_THREAD = TOTAL_WORDS / VECTOR_SIZE,
     };
 
     // Vector type
-    typedef typename CubVector<T, VEC_SIZE>::Type Vector;
+    typedef typename CubVector<DeviceWord, VECTOR_SIZE>::Type Vector;
 
     // Vector items
     Vector vec_items[VECTORS_PER_THREAD];
 
     // Aliased input ptr
-    Vector *ptr = reinterpret_cast<Vector*>(block_ptr + (linear_tid * VEC_SIZE * VECTORS_PER_THREAD));
+    Vector* vec_ptr = reinterpret_cast<Vector*>(block_ptr) + (linear_tid * VECTORS_PER_THREAD);
 
     // Load directly in thread-blocked order
     #pragma unroll
     for (int ITEM = 0; ITEM < VECTORS_PER_THREAD; ITEM++)
     {
-        vec_items[ITEM] = ptr[ITEM];
+        vec_items[ITEM] = ThreadLoad<MODIFIER>(vec_ptr + ITEM);
     }
 
     // Copy
     #pragma unroll
     for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
     {
-        items[ITEM] = reinterpret_cast<T*>(vec_items)[ITEM];
+        items[ITEM] = *(reinterpret_cast<T*>(vec_items) + ITEM);
     }
 }
 
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+
+/**
+ * \brief Load a linear segment of items into a blocked arrangement across the thread block.
+ *
+ * \blocked
+ *
+ * The input offset (\p block_ptr + \p block_offset) must be quad-item aligned
+ *
+ * The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT:
+ *   - \p ITEMS_PER_THREAD is odd
+ *   - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.)
+ *
+ * \tparam T                    [inferred] The data type to load.
+ * \tparam ITEMS_PER_THREAD     [inferred] The number of consecutive items partitioned onto each thread.
+ */
+template <
+    typename        T,
+    int             ITEMS_PER_THREAD>
+__device__ __forceinline__ void LoadDirectBlockedVectorized(
+    int linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks)
+    T   *block_ptr,                 ///< [in] Input pointer for loading from
+    T   (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+{
+    InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(linear_tid, block_ptr, items);
+}
 
 
 //@}  end member group
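
The guarded and vectorized loaders rewritten above compose as follows; a short
sketch, assuming int data and an illustrative out-of-bounds default of -1:

    #include <cub/block/block_load.cuh>

    __global__ void ExampleDirectLoads(int *d_in, int num_valid)
    {
        int items[4];

        // Guarded variant: the rewritten bounds test compares each item's
        // global index against valid_items; out-of-range items keep -1
        cub::LoadDirectBlocked(threadIdx.x, d_in, items, num_valid, -1);

        // Vectorized variant: pointer input with a quad-aligned offset;
        // dispatches through InternalLoadDirectBlockedVectorized with
        // LOAD_DEFAULT, degrading the vector width to 2 or 1 words when
        // the per-thread footprint is not a multiple of 4 words
        cub::LoadDirectBlockedVectorized(threadIdx.x, d_in, items);
    }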
@@ -224,22 +248,24 @@ __device__ __forceinline__ void LoadDirectBlockedVectorized(
  * \tparam BLOCK_THREADS        The thread block size in threads
  * \tparam T                    [inferred] The data type to load.
  * \tparam ITEMS_PER_THREAD     [inferred] The number of consecutive items partitioned onto each thread.
- * \tparam InputIterator        [inferred] The random-access iterator type for input \iterator.
+ * \tparam InputIteratorT       [inferred] The random-access iterator type for input \iterator.
  */
 template <
     int             BLOCK_THREADS,
-    typename        T,
+    typename        InputT,
     int             ITEMS_PER_THREAD,
-    typename        InputIterator>
+    typename        InputIteratorT>
 __device__ __forceinline__ void LoadDirectStriped(
     int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks)
-    InputIterator   block_itr,                  ///< [in] The thread block's base input iterator for loading from
-    T               (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+    InputT          (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
 {
+    InputIteratorT thread_itr = block_itr + linear_tid;
+
     #pragma unroll
     for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
     {
-        items[ITEM] = block_itr[(ITEM * BLOCK_THREADS) + linear_tid];
+        items[ITEM] = thread_itr[ITEM * BLOCK_THREADS];
     }
 }
 
@@ -252,27 +278,27 @@ __device__ __forceinline__ void LoadDirectStriped(
  * \tparam BLOCK_THREADS        The thread block size in threads
  * \tparam T                    [inferred] The data type to load.
  * \tparam ITEMS_PER_THREAD     [inferred] The number of consecutive items partitioned onto each thread.
- * \tparam InputIterator        [inferred] The random-access iterator type for input \iterator.
+ * \tparam InputIteratorT       [inferred] The random-access iterator type for input \iterator.
  */
 template <
     int             BLOCK_THREADS,
-    typename        T,
+    typename        InputT,
     int             ITEMS_PER_THREAD,
-    typename        InputIterator>
+    typename        InputIteratorT>
 __device__ __forceinline__ void LoadDirectStriped(
     int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks)
-    InputIterator   block_itr,                  ///< [in] The thread block's base input iterator for loading from
-    T               (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+    InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
     int             valid_items)                ///< [in] Number of valid items to load
 {
-    int bounds = valid_items - linear_tid;
+    InputIteratorT thread_itr = block_itr + linear_tid;
 
     #pragma unroll
     for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
     {
-        if (ITEM * BLOCK_THREADS < bounds)
+        if (linear_tid + (ITEM * BLOCK_THREADS) < valid_items)
         {
-            items[ITEM] = block_itr[linear_tid + (ITEM * BLOCK_THREADS)];
+            items[ITEM] = thread_itr[ITEM * BLOCK_THREADS];
         }
     }
 }
@@ -286,25 +312,24 @@ __device__ __forceinline__ void LoadDirectStriped(
  * \tparam BLOCK_THREADS        The thread block size in threads
  * \tparam T                    [inferred] The data type to load.
  * \tparam ITEMS_PER_THREAD     [inferred] The number of consecutive items partitioned onto each thread.
- * \tparam InputIterator        [inferred] The random-access iterator type for input \iterator.
+ * \tparam InputIteratorT       [inferred] The random-access iterator type for input \iterator.
  */
 template <
     int             BLOCK_THREADS,
-    typename        T,
+    typename        InputT,
+    typename        DefaultT,
     int             ITEMS_PER_THREAD,
-    typename        InputIterator>
+    typename        InputIteratorT>
 __device__ __forceinline__ void LoadDirectStriped(
     int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks)
-    InputIterator   block_itr,                  ///< [in] The thread block's base input iterator for loading from
-    T               (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+    InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
     int             valid_items,                ///< [in] Number of valid items to load
-    T               oob_default)                ///< [in] Default value to assign out-of-bound items
+    DefaultT        oob_default)                ///< [in] Default value to assign out-of-bound items
 {
     #pragma unroll
     for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-    {
         items[ITEM] = oob_default;
-    }
 
     LoadDirectStriped(linear_tid, block_itr, items, valid_items);
 }
@@ -328,26 +353,28 @@ __device__ __forceinline__ void LoadDirectStriped(
  *
  * \tparam T                    [inferred] The data type to load.
  * \tparam ITEMS_PER_THREAD     [inferred] The number of consecutive items partitioned onto each thread.
- * \tparam InputIterator        [inferred] The random-access iterator type for input \iterator.
+ * \tparam InputIteratorT       [inferred] The random-access iterator type for input \iterator.
  */
 template <
-    typename        T,
+    typename        InputT,
     int             ITEMS_PER_THREAD,
-    typename        InputIterator>
+    typename        InputIteratorT>
 __device__ __forceinline__ void LoadDirectWarpStriped(
     int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks)
-    InputIterator   block_itr,                  ///< [in] The thread block's base input iterator for loading from
-    T               (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+    InputT          (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
 {
-    int tid         = linear_tid & (CUB_PTX_WARP_THREADS - 1);
-    int wid         = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
-    int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;
+    int tid                = linear_tid & (CUB_PTX_WARP_THREADS - 1);
+    int wid                = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
+    int warp_offset        = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;
+
+    InputIteratorT thread_itr = block_itr + warp_offset + tid ;
 
     // Load directly in warp-striped order
     #pragma unroll
     for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
     {
-        items[ITEM] = block_itr[warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)];
+        items[ITEM] = thread_itr[(ITEM * CUB_PTX_WARP_THREADS)];
     }
 }
 
@@ -362,30 +389,31 @@ __device__ __forceinline__ void LoadDirectWarpStriped(
  *
  * \tparam T                    [inferred] The data type to load.
  * \tparam ITEMS_PER_THREAD     [inferred] The number of consecutive items partitioned onto each thread.
- * \tparam InputIterator        [inferred] The random-access iterator type for input \iterator.
+ * \tparam InputIteratorT        [inferred] The random-access iterator type for input \iterator.
  */
 template <
-    typename        T,
+    typename        InputT,
     int             ITEMS_PER_THREAD,
-    typename        InputIterator>
+    typename        InputIteratorT>
 __device__ __forceinline__ void LoadDirectWarpStriped(
     int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks)
-    InputIterator   block_itr,                  ///< [in] The thread block's base input iterator for loading from
-    T               (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+    InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
     int             valid_items)                ///< [in] Number of valid items to load
 {
-    int tid                 = linear_tid & (CUB_PTX_WARP_THREADS - 1);
-    int wid                 = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
-    int warp_offset         = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;
-    int bounds              = valid_items - warp_offset - tid;
+    int tid                = linear_tid & (CUB_PTX_WARP_THREADS - 1);
+    int wid                = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
+    int warp_offset        = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;
+
+    InputIteratorT thread_itr = block_itr + warp_offset + tid ;
 
     // Load directly in warp-striped order
     #pragma unroll
     for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
     {
-        if ((ITEM * CUB_PTX_WARP_THREADS) < bounds)
+        if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items)
         {
-            items[ITEM] = block_itr[warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)];
+            items[ITEM] = thread_itr[(ITEM * CUB_PTX_WARP_THREADS)];
         }
     }
 }
@@ -401,29 +429,30 @@ __device__ __forceinline__ void LoadDirectWarpStriped(
  *
  * \tparam T                    [inferred] The data type to load.
  * \tparam ITEMS_PER_THREAD     [inferred] The number of consecutive items partitioned onto each thread.
- * \tparam InputIterator        [inferred] The random-access iterator type for input \iterator.
+ * \tparam InputIteratorT        [inferred] The random-access iterator type for input \iterator.
  */
 template <
-    typename        T,
+    typename        InputT,
+    typename        DefaultT,
     int             ITEMS_PER_THREAD,
-    typename        InputIterator>
+    typename        InputIteratorT>
 __device__ __forceinline__ void LoadDirectWarpStriped(
     int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks)
-    InputIterator   block_itr,                  ///< [in] The thread block's base input iterator for loading from
-    T               (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
-    int             valid_items,               ///< [in] Number of valid items to load
-    T               oob_default)                ///< [in] Default value to assign out-of-bound items
+    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+    InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+    int             valid_items,                ///< [in] Number of valid items to load
+    DefaultT        oob_default)                ///< [in] Default value to assign out-of-bound items
 {
+    // Load directly in warp-striped order
     #pragma unroll
     for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-    {
         items[ITEM] = oob_default;
-    }
 
     LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items);
 }
 
 
+
 //@}  end member group
 
 /** @} */       // end group UtilIo
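
The warp-striped loaders follow the same pattern, each warp reading its own
contiguous segment; a sketch of the guarded form under the same illustrative
assumptions as above:

    __global__ void ExampleWarpStriped(int *d_in, int num_valid)
    {
        int items[4];

        // Lanes read items strided by CUB_PTX_WARP_THREADS within their
        // warp's segment; out-of-bounds items receive the default (0 here)
        cub::LoadDirectWarpStriped(threadIdx.x, d_in, items, num_valid, 0);
    }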
@@ -434,6 +463,10 @@ __device__ __forceinline__ void LoadDirectWarpStriped(
 // Generic BlockLoad abstraction
 //-----------------------------------------------------------------------------
 
+/**
+ * \brief cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data from memory into a blocked arrangement across a CUDA thread block.
+ */
+
 /**
  * \brief cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data from memory into a blocked arrangement across a CUDA thread block.
  */
@@ -443,8 +476,7 @@ enum BlockLoadAlgorithm
      * \par Overview
      *
      * A [blocked arrangement](index.html#sec5sec3) of data is read
-     * directly from memory.  The thread block reads items in a parallel "raking" fashion: thread i
-     * reads the i-th segment of consecutive elements.
+     * directly from memory.
      *
      * \par Performance Considerations
      * - The utilization of memory transactions (coalescing) decreases as the
@@ -455,12 +487,10 @@ enum BlockLoadAlgorithm
     /**
      * \par Overview
      *
-     * A [blocked arrangement](index.html#sec5sec3) of data is read directly
+     * A [blocked arrangement](index.html#sec5sec3) of data is read
      * from memory using CUDA's built-in vectorized loads as a coalescing optimization.
-     * The thread block reads items in a parallel "raking" fashion: thread i uses vector loads to
-     * read the i-th segment of consecutive elements.
-     *
-     * For example, ld.global.v4.s32 instructions will be generated when \p T = \p int and \p ITEMS_PER_THREAD > 4.
+     * For example, ld.global.v4.s32 instructions will be generated
+     * when \p T = \p int and \p ITEMS_PER_THREAD % 4 == 0.
      *
      * \par Performance Considerations
+     * - The utilization of memory transactions (coalescing) remains high until the
@@ -468,7 +498,7 @@ enum BlockLoadAlgorithm
      *   maximum vector load width (typically 4 items or 64B, whichever is lower).
      * - The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT:
      *   - \p ITEMS_PER_THREAD is odd
-     *   - The \p InputIterator is not a simple pointer type
+     *   - The \p InputIteratorT is not a simple pointer type
      *   - The block input offset is not quadword-aligned
      *   - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.)
      */
@@ -478,12 +508,8 @@ enum BlockLoadAlgorithm
      * \par Overview
      *
      * A [striped arrangement](index.html#sec5sec3) of data is read
-     * directly from memory and then is locally transposed into a
-     * [blocked arrangement](index.html#sec5sec3). The thread block
-     * reads items in a parallel "strip-mining" fashion:
-     * thread i reads items having stride \p BLOCK_THREADS
-     * between them. cub::BlockExchange is then used to locally reorder the items
-     * into a [blocked arrangement](index.html#sec5sec3).
+     * efficiently from memory and then locally transposed into a
+     * [blocked arrangement](index.html#sec5sec3).
      *
      * \par Performance Considerations
      * - The utilization of memory transactions (coalescing) remains high regardless
@@ -497,12 +523,8 @@ enum BlockLoadAlgorithm
     /**
      * \par Overview
      *
-     * A [warp-striped arrangement](index.html#sec5sec3) of data is read
-     * directly from memory and then is locally transposed into a
-     * [blocked arrangement](index.html#sec5sec3). Each warp reads its own
-     * contiguous segment in a parallel "strip-mining" fashion: lane i
-     * reads items having stride \p WARP_THREADS between them. cub::BlockExchange
-     * is then used to locally reorder the items into a
+     * A [warp-striped arrangement](index.html#sec5sec3) of data is
+     * read efficiently from memory and then locally transposed into a
      * [blocked arrangement](index.html#sec5sec3).
      *
      * \par Usage Considerations
@@ -511,10 +533,33 @@ enum BlockLoadAlgorithm
      * \par Performance Considerations
      * - The utilization of memory transactions (coalescing) remains high regardless
      *   of items loaded per thread.
-     * - The local reordering incurs slightly longer latencies and throughput than the
+     * - The local reordering incurs slightly larger latencies than the
      *   direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives.
+     * - Provisions more shared storage, but incurs smaller latencies than the
+     *   BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED alternative.
      */
     BLOCK_LOAD_WARP_TRANSPOSE,
+
+
+    /**
+     * \par Overview
+     *
+     * Like \p BLOCK_LOAD_WARP_TRANSPOSE, a [warp-striped arrangement](index.html#sec5sec3)
+     * of data is read directly from memory and then is locally transposed into a
+     * [blocked arrangement](index.html#sec5sec3). To reduce the shared memory
+     * requirement, only one warp's worth of shared memory is provisioned and is
+     * subsequently time-sliced among warps.
+     *
+     * \par Usage Considerations
+     * - BLOCK_THREADS must be a multiple of WARP_THREADS
+     *
+     * \par Performance Considerations
+     * - The utilization of memory transactions (coalescing) remains high regardless
+     *   of items loaded per thread.
+     * - Provisions less shared memory temporary storage, but incurs larger
+     *   latencies than the BLOCK_LOAD_WARP_TRANSPOSE alternative.
+     */
+    BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED,
 };
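For perspective on the trade-off between the two warp-transpose variants, a
hedged sketch (treat the storage comparison as qualitative; exact sizes depend
on the BlockExchange configuration):

    // Both variants yield identical blocked arrangements; they differ in how
    // much shared memory the internal BlockExchange provisions.
    typedef cub::BlockLoad<int, 256, 8,
            cub::BLOCK_LOAD_WARP_TRANSPOSE>            BlockLoadFull;   // whole-block exchange
    typedef cub::BlockLoad<int, 256, 8,
            cub::BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED> BlockLoadSliced; // one warp's worth, time-sliced

    // sizeof(typename BlockLoadSliced::TempStorage) is smaller than the
    // BlockLoadFull equivalent, at the cost of extra transpose rounds.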
 
 
@@ -523,7 +568,7 @@ enum BlockLoadAlgorithm
  * \ingroup BlockModule
  * \ingroup UtilIo
  *
- * \tparam InputIterator        The input iterator type \iterator.
+ * \tparam InputT               The data type to read into (which must be convertible from the input iterator's value type).
  * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
  * \tparam ITEMS_PER_THREAD     The number of consecutive items partitioned onto each thread.
  * \tparam ALGORITHM            [optional] cub::BlockLoadAlgorithm tuning policy.  default: cub::BLOCK_LOAD_DIRECT.
@@ -548,6 +593,9 @@ enum BlockLoadAlgorithm
  *   -# cub::BLOCK_LOAD_WARP_TRANSPOSE.  A [warp-striped arrangement](index.html#sec5sec3)
  *      of data is read directly from memory and is then locally transposed into a
  *      [blocked arrangement](index.html#sec5sec3).  [More...](\ref cub::BlockLoadAlgorithm)
+ *   -# cub::BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED.  A [warp-striped arrangement](index.html#sec5sec3)
+ *      of data is read directly from memory and is then locally transposed into a
+ *      [blocked arrangement](index.html#sec5sec3) one warp at a time.  [More...](\ref cub::BlockLoadAlgorithm)
  * - \rowmajor
  *
  * \par A Simple Example
@@ -565,7 +613,7 @@ enum BlockLoadAlgorithm
  * __global__ void ExampleKernel(int *d_data, ...)
  * {
  *     // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
- *     typedef cub::BlockLoad<int*, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
+ *     typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
  *
  *     // Allocate shared memory for BlockLoad
  *     __shared__ typename BlockLoad::TempStorage temp_storage;
@@ -582,11 +630,10 @@ enum BlockLoadAlgorithm
  *
  */
 template <
-    typename            InputIterator,
+    typename            InputT,
     int                 BLOCK_DIM_X,
     int                 ITEMS_PER_THREAD,
     BlockLoadAlgorithm  ALGORITHM           = BLOCK_LOAD_DIRECT,
-    bool                WARP_TIME_SLICING   = false,
     int                 BLOCK_DIM_Y         = 1,
     int                 BLOCK_DIM_Z         = 1,
     int                 PTX_ARCH            = CUB_PTX_ARCH>
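The practical effect of this signature change: the leading template parameter
is now the element type rather than the iterator type, the iterator type moves
onto the Load() methods themselves, and warp time-slicing moves from a bool
flag into the algorithm enum. A before/after sketch (the "before" assumes the
pre-patch 1.7-style interface):

    // Before: iterator type up front, plus a WARP_TIME_SLICING flag
    //   typedef cub::BlockLoad<int*, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE, true> BlockLoadT;

    // After: element type up front; time-slicing folded into the algorithm
    typedef cub::BlockLoad<int, 128, 4,
            cub::BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED> BlockLoadT;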
@@ -605,9 +652,6 @@ private:
         BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
     };
 
-    // Data type of input iterator
-    typedef typename std::iterator_traits<InputIterator>::value_type T;
-
 
     /******************************************************************************
      * Algorithmic variants
@@ -632,35 +676,38 @@ private:
 
         /// Constructor
         __device__ __forceinline__ LoadInternal(
-            TempStorage &temp_storage,
+            TempStorage &/*temp_storage*/,
             int linear_tid)
         :
             linear_tid(linear_tid)
         {}
 
         /// Load a linear segment of items from memory
+        template <typename InputIteratorT>
         __device__ __forceinline__ void Load(
-            InputIterator   block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            T               (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load
         {
             LoadDirectBlocked(linear_tid, block_itr, items);
         }
 
         /// Load a linear segment of items from memory, guarded by range
+        template <typename InputIteratorT>
         __device__ __forceinline__ void Load(
-            InputIterator   block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            T               (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
             int             valid_items)                    ///< [in] Number of valid items to load
         {
             LoadDirectBlocked(linear_tid, block_itr, items, valid_items);
         }
 
         /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
+        template <typename InputIteratorT, typename DefaultT>
         __device__ __forceinline__ void Load(
-            InputIterator   block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            T               (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
             int             valid_items,                    ///< [in] Number of valid items to load
-            T               oob_default)                    ///< [in] Default value to assign out-of-bound items
+            DefaultT        oob_default)                    ///< [in] Default value to assign out-of-bound items
         {
             LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default);
         }
@@ -682,46 +729,68 @@ private:
 
         /// Constructor
         __device__ __forceinline__ LoadInternal(
-            TempStorage &temp_storage,
+            TempStorage &/*temp_storage*/,
             int linear_tid)
         :
             linear_tid(linear_tid)
         {}
 
         /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization)
+        template <typename InputIteratorT>
         __device__ __forceinline__ void Load(
-            T               *block_ptr,                     ///< [in] The thread block's base input iterator for loading from
-            T               (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load
+            InputT               *block_ptr,                     ///< [in] The thread block's base input iterator for loading from
+            InputT               (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load
         {
-            LoadDirectBlockedVectorized(linear_tid, block_ptr, items);
+            InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(linear_tid, block_ptr, items);
         }
 
-        /// Load a linear segment of items from memory, specialized for opaque input iterators (skips vectorization)
+        /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization)
+        template <typename InputIteratorT>
+        __device__ __forceinline__ void Load(
+            const InputT         *block_ptr,                     ///< [in] The thread block's base input iterator for loading from
+            InputT               (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load
+        {
+            InternalLoadDirectBlockedVectorized<LOAD_DEFAULT>(linear_tid, block_ptr, items);
+        }
+
+        /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization)
         template <
-            typename T,
-            typename _InputIterator>
+            CacheLoadModifier   MODIFIER,
+            typename            ValueType,
+            typename            OffsetT>
         __device__ __forceinline__ void Load(
-            _InputIterator    block_itr,                  ///< [in] The thread block's base input iterator for loading from
-            T                   (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+            CacheModifiedInputIterator<MODIFIER, ValueType, OffsetT>    block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT                                                     (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load
+        {
+            InternalLoadDirectBlockedVectorized<MODIFIER>(linear_tid, block_itr.ptr, items);
+        }
+
+        /// Load a linear segment of items from memory, specialized for opaque input iterators (skips vectorization)
+        template <typename _InputIteratorT>
+        __device__ __forceinline__ void Load(
+            _InputIteratorT   block_itr,                    ///< [in] The thread block's base input iterator for loading from
+            InputT           (&items)[ITEMS_PER_THREAD])   ///< [out] Data to load
         {
             LoadDirectBlocked(linear_tid, block_itr, items);
         }
 
         /// Load a linear segment of items from memory, guarded by range (skips vectorization)
+        template <typename InputIteratorT>
         __device__ __forceinline__ void Load(
-            InputIterator   block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            T               (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
             int             valid_items)                    ///< [in] Number of valid items to load
         {
             LoadDirectBlocked(linear_tid, block_itr, items, valid_items);
         }
 
         /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements (skips vectorization)
+        template <typename InputIteratorT, typename DefaultT>
         __device__ __forceinline__ void Load(
-            InputIterator   block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            T               (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
             int             valid_items,                    ///< [in] Number of valid items to load
-            T               oob_default)                    ///< [in] Default value to assign out-of-bound items
+            DefaultT          oob_default)                    ///< [in] Default value to assign out-of-bound items
         {
             LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default);
         }
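One nuance worth calling out: the new CacheModifiedInputIterator overload above
unwraps the iterator and vectorizes through its raw .ptr member while
preserving the cache modifier, so wrapped pointers no longer fall off the fast
path. A hedged usage sketch (kernel name is illustrative):

    __global__ void LdgVectorizedLoad(int *d_in)
    {
        typedef cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_VECTORIZE> BlockLoadT;
        __shared__ typename BlockLoadT::TempStorage temp_storage;
        int items[4];

        // The wrapper carries a raw pointer plus a PTX cache modifier; the
        // overload forwards block_itr.ptr, keeping the vectorized path.
        cub::CacheModifiedInputIterator<cub::LOAD_LDG, int> ldg_itr(d_in);
        BlockLoadT(temp_storage).Load(ldg_itr, items);
    }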
@@ -736,10 +805,11 @@ private:
     struct LoadInternal<BLOCK_LOAD_TRANSPOSE, DUMMY>
     {
         // BlockExchange utility type for keys
-        typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, WARP_TIME_SLICING, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
+        typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
 
         /// Shared memory storage layout type
-        typedef typename BlockExchange::TempStorage _TempStorage;
+        struct _TempStorage : BlockExchange::TempStorage
+        {};
 
         /// Alias wrapper allowing storage to be unioned
         struct TempStorage : Uninitialized<_TempStorage> {};
@@ -760,33 +830,36 @@ private:
         {}
 
         /// Load a linear segment of items from memory
+        template <typename InputIteratorT>
         __device__ __forceinline__ void Load(
-            InputIterator   block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            T               (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load
         {
             LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items);
-            BlockExchange(temp_storage).StripedToBlocked(items);
+            BlockExchange(temp_storage).StripedToBlocked(items, items);
         }
 
         /// Load a linear segment of items from memory, guarded by range
+        template <typename InputIteratorT>
         __device__ __forceinline__ void Load(
-            InputIterator   block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            T               (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
             int             valid_items)                    ///< [in] Number of valid items to load
         {
             LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items);
-            BlockExchange(temp_storage).StripedToBlocked(items);
+            BlockExchange(temp_storage).StripedToBlocked(items, items);
         }
 
         /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
+        template <typename InputIteratorT, typename DefaultT>
         __device__ __forceinline__ void Load(
-            InputIterator   block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            T               (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
             int             valid_items,                    ///< [in] Number of valid items to load
-            T               oob_default)                    ///< [in] Default value to assign out-of-bound items
+            DefaultT        oob_default)                    ///< [in] Default value to assign out-of-bound items
         {
             LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items, oob_default);
-            BlockExchange(temp_storage).StripedToBlocked(items);
+            BlockExchange(temp_storage).StripedToBlocked(items, items);
         }
 
     };
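To make the data movement concrete, a small worked layout (4 threads, 2 items
per thread; numbers are element positions in memory):

    // Striped read (stride BLOCK_THREADS = 4):   Blocked result after exchange:
    //   thread 0: {0, 4}                           thread 0: {0, 1}
    //   thread 1: {1, 5}                           thread 1: {2, 3}
    //   thread 2: {2, 6}                           thread 2: {4, 5}
    //   thread 3: {3, 7}                           thread 3: {6, 7}
    // StripedToBlocked(items, items) performs this reordering in place through
    // shared memory; the repeated argument reflects the new in/out signature.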
@@ -807,10 +880,86 @@ private:
         CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");
 
         // BlockExchange utility type for keys
-        typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, WARP_TIME_SLICING, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
+        typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
+
+        /// Shared memory storage layout type
+        struct _TempStorage : BlockExchange::TempStorage
+        {};
+
+        /// Alias wrapper allowing storage to be unioned
+        struct TempStorage : Uninitialized<_TempStorage> {};
+
+        /// Thread reference to shared storage
+        _TempStorage &temp_storage;
+
+        /// Linear thread-id
+        int linear_tid;
+
+        /// Constructor
+        __device__ __forceinline__ LoadInternal(
+            TempStorage &temp_storage,
+            int linear_tid)
+        :
+            temp_storage(temp_storage.Alias()),
+            linear_tid(linear_tid)
+        {}
+
+        /// Load a linear segment of items from memory
+        template <typename InputIteratorT>
+        __device__ __forceinline__ void Load(
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load
+        {
+            LoadDirectWarpStriped(linear_tid, block_itr, items);
+            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
+        }
+
+        /// Load a linear segment of items from memory, guarded by range
+        template <typename InputIteratorT>
+        __device__ __forceinline__ void Load(
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
+            int             valid_items)                    ///< [in] Number of valid items to load
+        {
+            LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items);
+            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
+        }
+
+
+        /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
+        template <typename InputIteratorT, typename DefaultT>
+        __device__ __forceinline__ void Load(
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
+            int             valid_items,                    ///< [in] Number of valid items to load
+            DefaultT        oob_default)                    ///< [in] Default value to assign out-of-bound items
+        {
+            LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items, oob_default);
+            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
+        }
+    };
+
+
+    /**
+     * BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED specialization of load helper
+     */
+    template <int DUMMY>
+    struct LoadInternal<BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, DUMMY>
+    {
+        enum
+        {
+            WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
+        };
+
+        // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
+        CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");
+
+        // BlockExchange utility type for keys
+        typedef BlockExchange<InputT, BLOCK_DIM_X, ITEMS_PER_THREAD, true, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
 
         /// Shared memory storage layout type
-        typedef typename BlockExchange::TempStorage _TempStorage;
+        struct _TempStorage : BlockExchange::TempStorage
+        {};
 
         /// Alias wrapper allowing storage to be unioned
         struct TempStorage : Uninitialized<_TempStorage> {};
@@ -831,34 +980,37 @@ private:
         {}
 
         /// Load a linear segment of items from memory
+        template <typename InputIteratorT>
         __device__ __forceinline__ void Load(
-            InputIterator   block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            T               (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load
         {
             LoadDirectWarpStriped(linear_tid, block_itr, items);
-            BlockExchange(temp_storage).WarpStripedToBlocked(items);
+            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
         }
 
         /// Load a linear segment of items from memory, guarded by range
+        template <typename InputIteratorT>
         __device__ __forceinline__ void Load(
-            InputIterator   block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            T               (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
             int             valid_items)                    ///< [in] Number of valid items to load
         {
             LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items);
-            BlockExchange(temp_storage).WarpStripedToBlocked(items);
+            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
         }
 
 
         /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
+        template <typename InputIteratorT, typename DefaultT>
         __device__ __forceinline__ void Load(
-            InputIterator   block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            T               (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
+            InputIteratorT  block_itr,                      ///< [in] The thread block's base input iterator for loading from
+            InputT          (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
             int             valid_items,                    ///< [in] Number of valid items to load
-            T               oob_default)                    ///< [in] Default value to assign out-of-bound items
+            DefaultT        oob_default)                    ///< [in] Default value to assign out-of-bound items
         {
             LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items, oob_default);
-            BlockExchange(temp_storage).WarpStripedToBlocked(items);
+            BlockExchange(temp_storage).WarpStripedToBlocked(items, items);
         }
     };
 
@@ -958,7 +1110,7 @@ public:
      * __global__ void ExampleKernel(int *d_data, ...)
      * {
      *     // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
-     *     typedef cub::BlockLoad<int*, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
+     *     typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
      *
      *     // Allocate shared memory for BlockLoad
      *     __shared__ typename BlockLoad::TempStorage temp_storage;
@@ -974,9 +1126,10 @@ public:
      * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }.
      *
      */
+    template <typename InputIteratorT>
     __device__ __forceinline__ void Load(
-        InputIterator   block_itr,                  ///< [in] The thread block's base input iterator for loading from
-        T               (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
+        InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+        InputT          (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
     {
         InternalLoad(temp_storage, linear_tid).Load(block_itr, items);
     }
@@ -1002,7 +1155,7 @@ public:
      * __global__ void ExampleKernel(int *d_data, int valid_items, ...)
      * {
      *     // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
-     *     typedef cub::BlockLoad<int*, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
+     *     typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
      *
      *     // Allocate shared memory for BlockLoad
      *     __shared__ typename BlockLoad::TempStorage temp_storage;
@@ -1019,9 +1172,10 @@ public:
      * being unmasked to load portions of valid data (and other items remaining unassigned).
      *
      */
+    template <typename InputIteratorT>
     __device__ __forceinline__ void Load(
-        InputIterator   block_itr,                  ///< [in] The thread block's base input iterator for loading from
-        T               (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+        InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+        InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
         int             valid_items)                ///< [in] Number of valid items to load
     {
         InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items);
@@ -1048,7 +1202,7 @@ public:
      * __global__ void ExampleKernel(int *d_data, int valid_items, ...)
      * {
      *     // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
-     *     typedef cub::BlockLoad<int*, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
+     *     typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
      *
      *     // Allocate shared memory for BlockLoad
      *     __shared__ typename BlockLoad::TempStorage temp_storage;
@@ -1066,11 +1220,12 @@ public:
      * being unmasked to load portions of valid data (and other items are assigned \p -1)
      *
      */
+    template <typename InputIteratorT, typename DefaultT>
     __device__ __forceinline__ void Load(
-        InputIterator   block_itr,                  ///< [in] The thread block's base input iterator for loading from
-        T               (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
+        InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
+        InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
         int             valid_items,                ///< [in] Number of valid items to load
-        T               oob_default)                ///< [in] Default value to assign out-of-bound items
+        DefaultT        oob_default)                ///< [in] Default value to assign out-of-bound items
     {
         InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items, oob_default);
     }
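A minimal sketch of the guarded overload in use, rounding out the examples
above (illustrative only):

    __global__ void GuardedLoad(int *d_in, int valid_items)
    {
        typedef cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoadT;
        __shared__ typename BlockLoadT::TempStorage temp_storage;

        int items[4];
        // Positions at or beyond valid_items receive the out-of-bounds
        // default (-1 here) instead of touching memory.
        BlockLoadT(temp_storage).Load(d_in, items, valid_items, -1);
    }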
diff --git a/SRC/cub/block/block_radix_rank.cuh b/SRC/cub/block/block_radix_rank.cuh
index 4b5a6a76..c26451c6 100644
--- a/SRC/cub/block/block_radix_rank.cuh
+++ b/SRC/cub/block/block_radix_rank.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -28,11 +28,13 @@
 
 /**
  * \file
- * cub::BlockRadixRank provides operations for ranking unsigned integer types within a CUDA threadblock
+ * cub::BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block
  */
 
 #pragma once
 
+#include <stdint.h>
+
 #include "../thread/thread_reduce.cuh"
 #include "../thread/thread_scan.cuh"
 #include "../block/block_scan.cuh"
@@ -49,12 +51,12 @@ CUB_NS_PREFIX
 namespace cub {
 
 /**
- * \brief BlockRadixRank provides operations for ranking unsigned integer types within a CUDA threadblock.
+ * \brief BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
  * \ingroup BlockModule
  *
  * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
  * \tparam RADIX_BITS           The number of radix bits per digit place
- * \tparam DESCENDING           Whether or not the sorted-order is high-to-low
+ * \tparam IS_DESCENDING           Whether or not the sorted-order is high-to-low
  * \tparam MEMOIZE_OUTER_SCAN   [optional] Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise).  See BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE for more details.
  * \tparam INNER_SCAN_ALGORITHM [optional] The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS)
  * \tparam SMEM_CONFIG          [optional] Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte)
@@ -85,7 +87,7 @@ namespace cub {
 template <
     int                     BLOCK_DIM_X,
     int                     RADIX_BITS,
-    bool                    DESCENDING,
+    bool                    IS_DESCENDING,
     bool                    MEMOIZE_OUTER_SCAN      = (CUB_PTX_ARCH >= 350) ? true : false,
     BlockScanAlgorithm      INNER_SCAN_ALGORITHM    = BLOCK_SCAN_WARP_SCANS,
     cudaSharedMemConfig     SMEM_CONFIG             = cudaSharedMemBankSizeFourByte,
@@ -129,12 +131,20 @@ private:
         COUNTER_LANES               = 1 << LOG_COUNTER_LANES,
 
         // The number of packed counters per thread (plus one for padding)
-        RAKING_SEGMENT              = COUNTER_LANES + 1,
+        PADDED_COUNTER_LANES        = COUNTER_LANES + 1,
+        RAKING_SEGMENT              = PADDED_COUNTER_LANES,
+    };
 
-        LOG_SMEM_BANKS              = CUB_LOG_SMEM_BANKS(PTX_ARCH),
-        SMEM_BANKS                  = 1 << LOG_SMEM_BANKS,
+public:
+
+    enum
+    {
+        /// Number of bin-starting offsets tracked per thread
+        BINS_TRACKED_PER_THREAD = CUB_MAX(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS),
     };
 
+private:
+
 
     /// BlockScan type
     typedef BlockScan<
@@ -148,16 +158,17 @@ private:
 
 
     /// Shared memory storage layout type for BlockRadixRank
-    struct _TempStorage
+    struct __align__(16) _TempStorage
     {
-        // Storage for scanning local ranks
-        typename BlockScan::TempStorage block_scan;
-
-        union
+        union Aliasable
         {
-            DigitCounter            digit_counters[COUNTER_LANES + 1][BLOCK_THREADS][PACKING_RATIO];
+            DigitCounter            digit_counters[PADDED_COUNTER_LANES][BLOCK_THREADS][PACKING_RATIO];
             PackedCounter           raking_grid[BLOCK_THREADS][RAKING_SEGMENT];
-        };
+
+        } aliasable;
+
+        // Storage for scanning local ranks
+        typename BlockScan::TempStorage block_scan;
     };
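Worked sizes for the aliased layout, assuming RADIX_BITS = 4 with 16-bit
DigitCounters packed two per 32-bit PackedCounter (PACKING_RATIO = 2):

    //   RADIX_DIGITS         = 1 << 4      = 16
    //   LOG_COUNTER_LANES    = 4 - log2(2) = 3,  so COUNTER_LANES = 8
    //   PADDED_COUNTER_LANES = 8 + 1       = 9   (padding lane avoids bank conflicts)
    // digit_counters[9][BLOCK_THREADS][2] and raking_grid[BLOCK_THREADS][9]
    // alias the same bytes: each thread's raking segment of 9 PackedCounters
    // covers exactly its own column of 9 x 2 DigitCounters.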
 
 
@@ -169,109 +180,12 @@ private:
     _TempStorage &temp_storage;
 
     /// Linear thread-id
-    int linear_tid;
+    unsigned int linear_tid;
 
     /// Copy of raking segment, promoted to registers
     PackedCounter cached_segment[RAKING_SEGMENT];
 
 
-    /******************************************************************************
-     * Templated iteration
-     ******************************************************************************/
-
-    // General template iteration
-    template <int COUNT, int MAX>
-    struct Iterate
-    {
-        /**
-         * Decode keys.  Decodes the radix digit from the current digit place
-         * and increments the thread's corresponding counter in shared
-         * memory for that digit.
-         *
-         * Saves both (1) the prior value of that counter (the key's
-         * thread-local exclusive prefix sum for that digit), and (2) the shared
-         * memory offset of the counter (for later use).
-         */
-        template <typename UnsignedBits, int KEYS_PER_THREAD>
-        static __device__ __forceinline__ void DecodeKeys(
-            BlockRadixRank  &cta,                                   // BlockRadixRank instance
-            UnsignedBits    (&keys)[KEYS_PER_THREAD],               // Key to decode
-            DigitCounter    (&thread_prefixes)[KEYS_PER_THREAD],    // Prefix counter value (out parameter)
-            DigitCounter*   (&digit_counters)[KEYS_PER_THREAD],     // Counter smem offset (out parameter)
-            int             current_bit,                            // The least-significant bit position of the current digit to extract
-            int             num_bits)                               // The number of bits in the current digit
-        {
-            // Get digit
-            UnsignedBits digit = BFE(keys[COUNT], current_bit, num_bits);
-
-            // Get sub-counter
-            UnsignedBits sub_counter = digit >> LOG_COUNTER_LANES;
-
-            // Get counter lane
-            UnsignedBits counter_lane = digit & (COUNTER_LANES - 1);
-
-            if (DESCENDING)
-            {
-                sub_counter = PACKING_RATIO - 1 - sub_counter;
-                counter_lane = COUNTER_LANES - 1 - counter_lane;
-            }
-
-            // Pointer to smem digit counter
-            digit_counters[COUNT] = &cta.temp_storage.digit_counters[counter_lane][cta.linear_tid][sub_counter];
-
-            // Load thread-exclusive prefix
-            thread_prefixes[COUNT] = *digit_counters[COUNT];
-
-            // Store inclusive prefix
-            *digit_counters[COUNT] = thread_prefixes[COUNT] + 1;
-
-            // Iterate next key
-            Iterate<COUNT + 1, MAX>::DecodeKeys(cta, keys, thread_prefixes, digit_counters, current_bit, num_bits);
-        }
-
-
-        // Termination
-        template <int KEYS_PER_THREAD>
-        static __device__ __forceinline__ void UpdateRanks(
-            int             (&ranks)[KEYS_PER_THREAD],              // Local ranks (out parameter)
-            DigitCounter    (&thread_prefixes)[KEYS_PER_THREAD],    // Prefix counter value
-            DigitCounter*   (&digit_counters)[KEYS_PER_THREAD])     // Counter smem offset
-        {
-            // Add in threadblock exclusive prefix
-            ranks[COUNT] = thread_prefixes[COUNT] + *digit_counters[COUNT];
-
-            // Iterate next key
-            Iterate<COUNT + 1, MAX>::UpdateRanks(ranks, thread_prefixes, digit_counters);
-        }
-    };
-
-
-    // Termination
-    template <int MAX>
-    struct Iterate<MAX, MAX>
-    {
-        // DecodeKeys
-        template <typename UnsignedBits, int KEYS_PER_THREAD>
-        static __device__ __forceinline__ void DecodeKeys(
-            BlockRadixRank  &cta,
-            UnsignedBits    (&keys)[KEYS_PER_THREAD],
-            DigitCounter    (&thread_prefixes)[KEYS_PER_THREAD],
-            DigitCounter*   (&digit_counters)[KEYS_PER_THREAD],
-            int             current_bit,                            // The least-significant bit position of the current digit to extract
-            int             num_bits)                               // The number of bits in the current digit
-        {}
-
-
-        // UpdateRanks
-        template <int KEYS_PER_THREAD>
-        static __device__ __forceinline__ void UpdateRanks(
-            int             (&ranks)[KEYS_PER_THREAD],
-            DigitCounter    (&thread_prefixes)[KEYS_PER_THREAD],
-            DigitCounter    *(&digit_counters)[KEYS_PER_THREAD])
-        {}
-    };
-
-
     /******************************************************************************
      * Utility methods
      ******************************************************************************/
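The Iterate<COUNT, MAX> machinery deleted above implemented compile-time
recursion over a thread's keys; the hunks below replace it with plain
#pragma unroll loops. A sketch of the equivalence:

    // Old: Iterate<0, KEYS_PER_THREAD>::DecodeKeys(...) expands, by template
    // recursion, into KEYS_PER_THREAD inlined bodies, one per key.
    //
    // New:
    //     #pragma unroll
    //     for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
    //     { /* decode keys[ITEM] */ }
    //
    // Both compile to the same fully-unrolled code; the loop form is simply
    // easier to read and maintain.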
@@ -291,7 +205,7 @@ private:
      */
     __device__ __forceinline__ PackedCounter Upsweep()
     {
-        PackedCounter *smem_raking_ptr = temp_storage.raking_grid[linear_tid];
+        PackedCounter *smem_raking_ptr = temp_storage.aliasable.raking_grid[linear_tid];
         PackedCounter *raking_ptr;
 
         if (MEMOIZE_OUTER_SCAN)
@@ -309,7 +223,7 @@ private:
             raking_ptr = smem_raking_ptr;
         }
 
-        return ThreadReduce<RAKING_SEGMENT>(raking_ptr, Sum());
+        return internal::ThreadReduce<RAKING_SEGMENT>(raking_ptr, Sum());
     }
 
 
@@ -317,14 +231,14 @@ private:
     __device__ __forceinline__ void ExclusiveDownsweep(
         PackedCounter raking_partial)
     {
-        PackedCounter *smem_raking_ptr = temp_storage.raking_grid[linear_tid];
+        PackedCounter *smem_raking_ptr = temp_storage.aliasable.raking_grid[linear_tid];
 
         PackedCounter *raking_ptr = (MEMOIZE_OUTER_SCAN) ?
             cached_segment :
             smem_raking_ptr;
 
         // Exclusive raking downsweep scan
-        ThreadScanExclusive<RAKING_SEGMENT>(raking_ptr, raking_ptr, Sum(), raking_partial);
+        internal::ThreadScanExclusive<RAKING_SEGMENT>(raking_ptr, raking_ptr, Sum(), raking_partial);
 
         if (MEMOIZE_OUTER_SCAN)
         {
@@ -345,13 +259,34 @@ private:
     {
         // Reset shared memory digit counters
         #pragma unroll
-        for (int LANE = 0; LANE < COUNTER_LANES + 1; LANE++)
+        for (int LANE = 0; LANE < PADDED_COUNTER_LANES; LANE++)
         {
-            *((PackedCounter*) temp_storage.digit_counters[LANE][linear_tid]) = 0;
+            *((PackedCounter*) temp_storage.aliasable.digit_counters[LANE][linear_tid]) = 0;
         }
     }
 
 
+    /**
+     * Block-scan prefix callback
+     */
+    struct PrefixCallBack
+    {
+        __device__ __forceinline__ PackedCounter operator()(PackedCounter block_aggregate)
+        {
+            PackedCounter block_prefix = 0;
+
+            // Propagate totals in packed fields
+            #pragma unroll
+            for (int PACKED = 1; PACKED < PACKING_RATIO; PACKED++)
+            {
+                block_prefix += block_aggregate << (sizeof(DigitCounter) * 8 * PACKED);
+            }
+
+            return block_prefix;
+        }
+    };
+
+
     /**
      * Scan shared memory digit counters.
      */
@@ -362,15 +297,8 @@ private:
 
         // Compute exclusive sum
         PackedCounter exclusive_partial;
-        PackedCounter packed_aggregate;
-        BlockScan(temp_storage.block_scan).ExclusiveSum(raking_partial, exclusive_partial, packed_aggregate);
-
-        // Propagate totals in packed fields
-        #pragma unroll
-        for (int PACKED = 1; PACKED < PACKING_RATIO; PACKED++)
-        {
-            exclusive_partial += packed_aggregate << (sizeof(DigitCounter) * 8 * PACKED);
-        }
+        PrefixCallBack prefix_call_back;
+        BlockScan(temp_storage.block_scan).ExclusiveSum(raking_partial, exclusive_partial, prefix_call_back);
 
         // Downsweep scan with exclusive partial
         ExclusiveDownsweep(exclusive_partial);
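A worked example of what the callback computes, assuming 16-bit DigitCounters
packed two per 32-bit word:

    //   block_aggregate = (total_hi << 16) | total_lo
    //   block_prefix    = block_aggregate << 16  =  (total_lo << 16)
    // Seeding the block-wide exclusive scan with this prefix makes every
    // upper-field counter start after all of the lower field's counts, so one
    // scan over packed words replaces two independent per-field scans.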
@@ -432,18 +360,47 @@ public:
         // Reset shared memory digit counters
         ResetCounters();
 
-        // Decode keys and update digit counters
-        Iterate<0, KEYS_PER_THREAD>::DecodeKeys(*this, keys, thread_prefixes, digit_counters, current_bit, num_bits);
+        #pragma unroll
+        for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
+        {
+            // Get digit
+            unsigned int digit = BFE(keys[ITEM], current_bit, num_bits);
 
-        __syncthreads();
+            // Get sub-counter
+            unsigned int sub_counter = digit >> LOG_COUNTER_LANES;
+
+            // Get counter lane
+            unsigned int counter_lane = digit & (COUNTER_LANES - 1);
+
+            if (IS_DESCENDING)
+            {
+                sub_counter = PACKING_RATIO - 1 - sub_counter;
+                counter_lane = COUNTER_LANES - 1 - counter_lane;
+            }
+
+            // Pointer to smem digit counter
+            digit_counters[ITEM] = &temp_storage.aliasable.digit_counters[counter_lane][linear_tid][sub_counter];
+
+            // Load thread-exclusive prefix
+            thread_prefixes[ITEM] = *digit_counters[ITEM];
+
+            // Store inclusive prefix
+            *digit_counters[ITEM] = thread_prefixes[ITEM] + 1;
+        }
+
+        CTA_SYNC();
 
         // Scan shared memory counters
         ScanCounters();
 
-        __syncthreads();
+        CTA_SYNC();
 
         // Extract the local ranks of each key
-        Iterate<0, KEYS_PER_THREAD>::UpdateRanks(ranks, thread_prefixes, digit_counters);
+        #pragma unroll
+        for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
+        {
+            // Add in thread block exclusive prefix
+            ranks[ITEM] = thread_prefixes[ITEM] + *digit_counters[ITEM];
+        }
     }
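A worked decode for one key, assuming RADIX_BITS = 4 (16 digits),
COUNTER_LANES = 8, PACKING_RATIO = 2, and an extracted digit of 13 (0b1101):

    //   sub_counter  = 13 >> 3      = 1   (field within the packed word)
    //   counter_lane = 13 & (8 - 1) = 5   (lane of shared counters)
    // The thread bumps digit_counters[5][linear_tid][1]; a descending sort
    // mirrors both indices, mapping digit 13 to lane 2, field 0.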
 
 
@@ -458,27 +415,281 @@ public:
         int             (&ranks)[KEYS_PER_THREAD],          ///< [out] For each key, the local rank within the tile (out parameter)
         int             current_bit,                        ///< [in] The least-significant bit position of the current digit to extract
         int             num_bits,                           ///< [in] The number of bits in the current digit
-        int             &inclusive_digit_prefix)            ///< [out] The inclusive prefix sum for the digit threadIdx.x
+        int             (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD])            ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1]
     {
         // Rank keys
         RankKeys(keys, ranks, current_bit, num_bits);
 
         // Get the inclusive and exclusive digit totals corresponding to the calling thread.
-        if ((BLOCK_THREADS == RADIX_DIGITS) || (linear_tid < RADIX_DIGITS))
+        #pragma unroll
+        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
+        {
+            int bin_idx = (linear_tid * BINS_TRACKED_PER_THREAD) + track;
+
+            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
+            {
+                if (IS_DESCENDING)
+                    bin_idx = RADIX_DIGITS - bin_idx - 1;
+
+                // Obtain ex/inclusive digit counts.  (Unfortunately these all reside in the
+                // first counter column, resulting in unavoidable bank conflicts.)
+                unsigned int counter_lane   = (bin_idx & (COUNTER_LANES - 1));
+                unsigned int sub_counter    = bin_idx >> (LOG_COUNTER_LANES);
+
+                exclusive_digit_prefix[track] = temp_storage.aliasable.digit_counters[counter_lane][0][sub_counter];
+            }
+        }
+    }
+};
+
+
+
+
+
+/**
+ * Radix-rank using match.any
+ */
+template <
+    int                     BLOCK_DIM_X,
+    int                     RADIX_BITS,
+    bool                    IS_DESCENDING,
+    BlockScanAlgorithm      INNER_SCAN_ALGORITHM    = BLOCK_SCAN_WARP_SCANS,
+    int                     BLOCK_DIM_Y             = 1,
+    int                     BLOCK_DIM_Z             = 1,
+    int                     PTX_ARCH                = CUB_PTX_ARCH>
+class BlockRadixRankMatch
+{
+private:
+
+    /******************************************************************************
+     * Type definitions and constants
+     ******************************************************************************/
+
+    typedef int32_t    RankT;
+    typedef int32_t    DigitCounterT;
+
+    enum
+    {
+        // The thread block size in threads
+        BLOCK_THREADS               = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+
+        RADIX_DIGITS                = 1 << RADIX_BITS,
+
+        LOG_WARP_THREADS            = CUB_LOG_WARP_THREADS(PTX_ARCH),
+        WARP_THREADS                = 1 << LOG_WARP_THREADS,
+        WARPS                       = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
+
+        PADDED_WARPS            = ((WARPS & 0x1) == 0) ?
+                                    WARPS + 1 :
+                                    WARPS,
+
+        COUNTERS                = PADDED_WARPS * RADIX_DIGITS,
+        RAKING_SEGMENT          = (COUNTERS + BLOCK_THREADS - 1) / BLOCK_THREADS,
+        PADDED_RAKING_SEGMENT   = ((RAKING_SEGMENT & 0x1) == 0) ?
+                                    RAKING_SEGMENT + 1 :
+                                    RAKING_SEGMENT,
+    };
+
+public:
+
+    enum
+    {
+        /// Number of bin-starting offsets tracked per thread
+        BINS_TRACKED_PER_THREAD = CUB_MAX(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS),
+    };
+
+private:
+
+    /// BlockScan type
+    typedef BlockScan<
+            DigitCounterT,
+            BLOCK_THREADS,
+            INNER_SCAN_ALGORITHM,
+            BLOCK_DIM_Y,
+            BLOCK_DIM_Z,
+            PTX_ARCH>
+        BlockScanT;
+
+
+    /// Shared memory storage layout type for BlockRadixRank
+    struct __align__(16) _TempStorage
+    {
+        typename BlockScanT::TempStorage            block_scan;
+
+        union __align__(16) Aliasable
+        {
+            volatile DigitCounterT                  warp_digit_counters[RADIX_DIGITS][PADDED_WARPS];
+            DigitCounterT                           raking_grid[BLOCK_THREADS][PADDED_RAKING_SEGMENT];
+
+        } aliasable;
+    };
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Shared storage reference
+    _TempStorage &temp_storage;
+
+    /// Linear thread-id
+    unsigned int linear_tid;
+
+
+
+public:
+
+    /// \smemstorage{BlockScan}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.
+     */
+    __device__ __forceinline__ BlockRadixRankMatch(
+        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Raking
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Rank keys.
+     */
+    template <
+        typename        UnsignedBits,
+        int             KEYS_PER_THREAD>
+    __device__ __forceinline__ void RankKeys(
+        UnsignedBits    (&keys)[KEYS_PER_THREAD],           ///< [in] Keys for this tile
+        int             (&ranks)[KEYS_PER_THREAD],          ///< [out] For each key, the local rank within the tile
+        int             current_bit,                        ///< [in] The least-significant bit position of the current digit to extract
+        int             num_bits)                           ///< [in] The number of bits in the current digit
+    {
+        // Initialize shared digit counters
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM)
+            temp_storage.aliasable.raking_grid[linear_tid][ITEM] = 0;
+
+        CTA_SYNC();
+
+        // Each warp will strip-mine its section of input, one strip at a time
+
+        volatile DigitCounterT  *digit_counters[KEYS_PER_THREAD];
+        uint32_t                warp_id         = linear_tid >> LOG_WARP_THREADS;
+        uint32_t                lane_mask_lt    = LaneMaskLt();
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
+        {
+            // My digit
+            uint32_t digit = BFE(keys[ITEM], current_bit, num_bits);
+
+            if (IS_DESCENDING)
+                digit = RADIX_DIGITS - digit - 1;
+
+            // Mask of peers who have same digit as me
+            uint32_t peer_mask = MatchAny<RADIX_BITS>(digit);
+
+            // Pointer to smem digit counter for this key
+            digit_counters[ITEM] = &temp_storage.aliasable.warp_digit_counters[digit][warp_id];
+
+            // Number of occurrences in previous strips
+            DigitCounterT warp_digit_prefix = *digit_counters[ITEM];
+
+            // Warp-sync
+            WARP_SYNC(0xFFFFFFFF);
+
+            // Number of peers having same digit as me
+            int32_t digit_count = __popc(peer_mask);
+
+            // Number of lower-ranked peers having same digit seen so far
+            int32_t peer_digit_prefix = __popc(peer_mask & lane_mask_lt);
+
+            if (peer_digit_prefix == 0)
+            {
+                // First thread for each digit updates the shared warp counter
+                *digit_counters[ITEM] = DigitCounterT(warp_digit_prefix + digit_count);
+            }
+
+            // Warp-sync
+            WARP_SYNC(0xFFFFFFFF);
+
+            // Number of prior keys having same digit
+            ranks[ITEM] = warp_digit_prefix + DigitCounterT(peer_digit_prefix);
+        }
+
+        CTA_SYNC();
+
+        // Scan warp counters
+
+        DigitCounterT scan_counters[PADDED_RAKING_SEGMENT];
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM)
+            scan_counters[ITEM] = temp_storage.aliasable.raking_grid[linear_tid][ITEM];
+
+        BlockScanT(temp_storage.block_scan).ExclusiveSum(scan_counters, scan_counters);
+
+        #pragma unroll
+        for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM)
+            temp_storage.aliasable.raking_grid[linear_tid][ITEM] = scan_counters[ITEM];
+
+        CTA_SYNC();
+
+        // Seed ranks with counter values from previous warps
+        #pragma unroll
+        for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
+            ranks[ITEM] += *digit_counters[ITEM];
+    }
+
+
+    /**
+     * \brief Rank keys.  For the lower \p RADIX_DIGITS threads, digit counts for each digit are provided for the corresponding thread.
+     */
+    template <
+        typename        UnsignedBits,
+        int             KEYS_PER_THREAD>
+    __device__ __forceinline__ void RankKeys(
+        UnsignedBits    (&keys)[KEYS_PER_THREAD],           ///< [in] Keys for this tile
+        int             (&ranks)[KEYS_PER_THREAD],          ///< [out] For each key, the local rank within the tile (out parameter)
+        int             current_bit,                        ///< [in] The least-significant bit position of the current digit to extract
+        int             num_bits,                           ///< [in] The number of bits in the current digit
+        int             (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD])            ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1]
+    {
+        RankKeys(keys, ranks, current_bit, num_bits);
+
+        // Get exclusive count for each digit
+        #pragma unroll
+        for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track)
         {
-            int bin_idx = (DESCENDING) ?
-                RADIX_DIGITS - linear_tid - 1 :
-                linear_tid;
-
-            // Obtain ex/inclusive digit counts.  (Unfortunately these all reside in the
-            // first counter column, resulting in unavoidable bank conflicts.)
-            int counter_lane = (bin_idx & (COUNTER_LANES - 1));
-            int sub_counter = bin_idx >> (LOG_COUNTER_LANES);
-            inclusive_digit_prefix = temp_storage.digit_counters[counter_lane + 1][0][sub_counter];
+            int bin_idx = (linear_tid * BINS_TRACKED_PER_THREAD) + track;
+
+            if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS))
+            {
+                if (IS_DESCENDING)
+                    bin_idx = RADIX_DIGITS - bin_idx - 1;
+
+                exclusive_digit_prefix[track] = temp_storage.aliasable.warp_digit_counters[bin_idx][0];
+            }
         }
     }
 };
 
+
 }               // CUB namespace
 CUB_NS_POSTFIX  // Optional outer namespace(s)
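The match-based ranking above replaces per-key shared-memory increments with
warp-synchronous voting. A hedged sketch of the per-key arithmetic (lane
numbers are illustrative; MatchAny and LaneMaskLt are the CUB utilities used
above):

    // Suppose lanes 3, 9, and 20 of a warp hold the same digit.
    //   peer_mask = MatchAny<RADIX_BITS>(digit)              // 0x00100208
    // For lane 9:
    //   digit_count       = __popc(peer_mask)                // 3 peers total
    //   peer_digit_prefix = __popc(peer_mask & LaneMaskLt()) // 1 (only lane 3 is lower)
    // Lane 3 (prefix == 0) publishes warp_digit_prefix + 3 to the shared warp
    // counter; each peer's rank is warp_digit_prefix + its own peer prefix.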
 
diff --git a/SRC/cub/block/block_radix_sort.cuh b/SRC/cub/block/block_radix_sort.cuh
index 032f3678..ac0c9f85 100644
--- a/SRC/cub/block/block_radix_sort.cuh
+++ b/SRC/cub/block/block_radix_sort.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -51,10 +51,10 @@ namespace cub {
  * \brief The BlockRadixSort class provides [collective](index.html#sec0) methods for sorting items partitioned across a CUDA thread block using a radix sorting method.  ![](sorting_logo.png)
  * \ingroup BlockModule
  *
- * \tparam Key                  Key type
+ * \tparam KeyT                 Key type
  * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
  * \tparam ITEMS_PER_THREAD     The number of items per thread
- * \tparam Value                [optional] Value type (default: cub::NullType, which indicates a keys-only sort)
+ * \tparam ValueT               [optional] Value type (default: cub::NullType, which indicates a keys-only sort)
  * \tparam RADIX_BITS           [optional] The number of radix bits per digit place (default: 4 bits)
  * \tparam MEMOIZE_OUTER_SCAN   [optional] Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise).
  * \tparam INNER_SCAN_ALGORITHM [optional] The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS)
@@ -71,8 +71,9 @@ namespace cub {
  *   given input sequence of keys and a set of rules specifying a total ordering
  *   of the symbolic alphabet, the radix sorting method produces a lexicographic
  *   ordering of those keys.
- * - BlockRadixSort can sort all of the built-in C++ numeric primitive types, e.g.:
- *   unsigned char, \p int, \p double, etc.  Within each key, the implementation treats fixed-length
+ * - BlockRadixSort can sort all of the built-in C++ numeric primitive types
+ *   (unsigned char, \p int, \p double, etc.) as well as CUDA's \p __half
+ *   half-precision floating-point type. Within each key, the implementation treats fixed-length
  *   bit-sequences of \p RADIX_BITS as radix digit places.  Although the direct radix sorting
  *   method can only be applied to unsigned integral types, BlockRadixSort
  *   is able to sort signed and floating-point types via simple bit-wise transformations
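The "simple bit-wise transformations" referred to here are the standard
order-preserving key twiddles; a hedged sketch of the idea (this is the
classic radix-sort trick, not code lifted from this patch):

    // Map keys to unsigned integers whose ordering matches the source type.
    __device__ unsigned int TwiddleIn(int key)    // signed int: flip sign bit
    {
        return (unsigned int)key ^ 0x80000000u;
    }

    __device__ unsigned int TwiddleIn(float key)  // float: flip sign bit of
    {                                             // positives, all bits of negatives
        unsigned int bits = __float_as_uint(key);
        unsigned int mask = (bits & 0x80000000u) ? 0xFFFFFFFFu : 0x80000000u;
        return bits ^ mask;
    }
    // Radix-sorting the twiddled bits as unsigned integers, then inverting
    // the transform, yields a correctly ordered signed/floating-point sort.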
@@ -117,10 +118,10 @@ namespace cub {
  *
  */
 template <
-    typename                Key,
+    typename                KeyT,
     int                     BLOCK_DIM_X,
     int                     ITEMS_PER_THREAD,
-    typename                Value                   = NullType,
+    typename                ValueT                   = NullType,
     int                     RADIX_BITS              = 4,
     bool                    MEMOIZE_OUTER_SCAN      = (CUB_PTX_ARCH >= 350) ? true : false,
     BlockScanAlgorithm      INNER_SCAN_ALGORITHM    = BLOCK_SCAN_WARP_SCANS,
@@ -142,11 +143,11 @@ private:
         BLOCK_THREADS               = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
 
         // Whether or not there are values to be trucked along with keys
-        KEYS_ONLY                   = Equals<Value, NullType>::VALUE,
+        KEYS_ONLY                   = Equals<ValueT, NullType>::VALUE,
     };
 
-    // Key traits and unsigned bits type
-    typedef NumericTraits                  KeyTraits;
+    // KeyT traits and unsigned bits type
+    typedef Traits                        KeyTraits;
     typedef typename KeyTraits::UnsignedBits    UnsignedBits;
 
     /// Ascending BlockRadixRank utility type
@@ -176,21 +177,18 @@ private:
         DescendingBlockRadixRank;
 
     /// BlockExchange utility type for keys
-    typedef BlockExchange<Key, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchangeKeys;
+    typedef BlockExchange<KeyT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchangeKeys;
 
     /// BlockExchange utility type for values
-    typedef BlockExchange<Value, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchangeValues;
+    typedef BlockExchange<ValueT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchangeValues;
 
     /// Shared memory storage layout type
-    struct _TempStorage
+    union _TempStorage
     {
-        union
-        {
-            typename AscendingBlockRadixRank::TempStorage  asending_ranking_storage;
-            typename DescendingBlockRadixRank::TempStorage descending_ranking_storage;
-            typename BlockExchangeKeys::TempStorage        exchange_keys;
-            typename BlockExchangeValues::TempStorage      exchange_values;
-        };
+        typename AscendingBlockRadixRank::TempStorage  asending_ranking_storage;
+        typename DescendingBlockRadixRank::TempStorage descending_ranking_storage;
+        typename BlockExchangeKeys::TempStorage        exchange_keys;
+        typename BlockExchangeValues::TempStorage      exchange_values;
     };
 
 
@@ -202,7 +200,7 @@ private:
     _TempStorage &temp_storage;
 
     /// Linear thread-id
-    int linear_tid;
+    unsigned int linear_tid;
 
     /******************************************************************************
      * Utility methods
@@ -221,7 +219,7 @@ private:
         int             (&ranks)[ITEMS_PER_THREAD],
         int             begin_bit,
         int             pass_bits,
-        Int2Type<false> is_descending)
+        Int2Type<false> /*is_descending*/)
     {
         AscendingBlockRadixRank(temp_storage.asending_ranking_storage).RankKeys(
             unsigned_keys,
@@ -236,7 +234,7 @@ private:
         int             (&ranks)[ITEMS_PER_THREAD],
         int             begin_bit,
         int             pass_bits,
-        Int2Type<true>  is_descending)
+        Int2Type<true>  /*is_descending*/)
     {
         DescendingBlockRadixRank(temp_storage.descending_ranking_storage).RankKeys(
             unsigned_keys,
@@ -247,12 +245,12 @@ private:
 
     /// ExchangeValues (specialized for key-value sort, to-blocked arrangement)
     __device__ __forceinline__ void ExchangeValues(
-        Value           (&values)[ITEMS_PER_THREAD],
+        ValueT          (&values)[ITEMS_PER_THREAD],
         int             (&ranks)[ITEMS_PER_THREAD],
-        Int2Type<false> is_keys_only,
-        Int2Type<true>  is_blocked)
+        Int2Type<false> /*is_keys_only*/,
+        Int2Type<true>  /*is_blocked*/)
     {
-        __syncthreads();
+        CTA_SYNC();
 
         // Exchange values through shared memory in blocked arrangement
         BlockExchangeValues(temp_storage.exchange_values).ScatterToBlocked(values, ranks);
@@ -260,12 +258,12 @@ private:
 
     /// ExchangeValues (specialized for key-value sort, to-striped arrangement)
     __device__ __forceinline__ void ExchangeValues(
-        Value           (&values)[ITEMS_PER_THREAD],
+        ValueT          (&values)[ITEMS_PER_THREAD],
         int             (&ranks)[ITEMS_PER_THREAD],
-        Int2Type<false> is_keys_only,
-        Int2Type<false> is_blocked)
+        Int2Type<false> /*is_keys_only*/,
+        Int2Type<false> /*is_blocked*/)
     {
-        __syncthreads();
+        CTA_SYNC();
 
         // Exchange values through shared memory in blocked arrangement
         BlockExchangeValues(temp_storage.exchange_values).ScatterToStriped(values, ranks);
@@ -274,17 +272,17 @@ private:
     /// ExchangeValues (specialized for keys-only sort)
    template <int IS_BLOCKED>
     __device__ __forceinline__ void ExchangeValues(
-        Value                   (&values)[ITEMS_PER_THREAD],
-        int                     (&ranks)[ITEMS_PER_THREAD],
-        Int2Type<true>          is_keys_only,
-        Int2Type<IS_BLOCKED>    is_blocked)
+        ValueT                  (&/*values*/)[ITEMS_PER_THREAD],
+        int                     (&/*ranks*/)[ITEMS_PER_THREAD],
+        Int2Type<true>          /*is_keys_only*/,
+        Int2Type<IS_BLOCKED>    /*is_blocked*/)
     {}
 
     /// Sort blocked arrangement
    template <int DESCENDING, int KEYS_ONLY>
     __device__ __forceinline__ void SortBlocked(
-        Key                     (&keys)[ITEMS_PER_THREAD],          ///< Keys to sort
-        Value                   (&values)[ITEMS_PER_THREAD],        ///< Values to sort
+        KeyT                    (&keys)[ITEMS_PER_THREAD],          ///< Keys to sort
+        ValueT                  (&values)[ITEMS_PER_THREAD],        ///< Values to sort
         int                     begin_bit,                          ///< The beginning (least-significant) bit index needed for key comparison
         int                     end_bit,                            ///< The past-the-end (most-significant) bit index needed for key comparison
        Int2Type<DESCENDING>    is_descending,                      ///< Tag whether is a descending-order sort
@@ -310,7 +308,7 @@ private:
             RankKeys(unsigned_keys, ranks, begin_bit, pass_bits, is_descending);
             begin_bit += RADIX_BITS;
 
-            __syncthreads();
+            CTA_SYNC();
 
             // Exchange keys through shared memory in blocked arrangement
             BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks);
@@ -321,7 +319,7 @@ private:
             // Quit if done
             if (begin_bit >= end_bit) break;
 
-            __syncthreads();
+            CTA_SYNC();
         }
 
         // Untwiddle bits if necessary
@@ -332,11 +330,15 @@ private:
         }
     }
 
+public:
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
+
     /// Sort blocked -> striped arrangement
    template <int DESCENDING, int KEYS_ONLY>
     __device__ __forceinline__ void SortBlockedToStriped(
-        Key                     (&keys)[ITEMS_PER_THREAD],          ///< Keys to sort
-        Value                   (&values)[ITEMS_PER_THREAD],        ///< Values to sort
+        KeyT                    (&keys)[ITEMS_PER_THREAD],          ///< Keys to sort
+        ValueT                  (&values)[ITEMS_PER_THREAD],        ///< Values to sort
         int                     begin_bit,                          ///< The beginning (least-significant) bit index needed for key comparison
         int                     end_bit,                            ///< The past-the-end (most-significant) bit index needed for key comparison
        Int2Type<DESCENDING>    is_descending,                      ///< Tag whether is a descending-order sort
@@ -362,7 +364,7 @@ private:
             RankKeys(unsigned_keys, ranks, begin_bit, pass_bits, is_descending);
             begin_bit += RADIX_BITS;
 
-            __syncthreads();
+            CTA_SYNC();
 
             // Check if this is the last pass
             if (begin_bit >= end_bit)
@@ -383,7 +385,7 @@ private:
             // Exchange values through shared memory in blocked arrangement
            ExchangeValues(values, ranks, is_keys_only, Int2Type<true>());
 
-            __syncthreads();
+            CTA_SYNC();
         }
 
         // Untwiddle bits if necessary
@@ -394,11 +396,9 @@ private:
         }
     }
 
+#endif // DOXYGEN_SHOULD_SKIP_THIS
 
-
-public:
-
-    /// \smemstorage{BlockScan}
+    /// \smemstorage{BlockRadixSort}
     struct TempStorage : Uninitialized<_TempStorage> {};
 
 
@@ -472,9 +472,9 @@ public:
      * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }.
      */
     __device__ __forceinline__ void Sort(
-        Key     (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
+        KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
         int     begin_bit   = 0,                    ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison
-        int     end_bit     = sizeof(Key) * 8)      ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison
+        int     end_bit     = sizeof(KeyT) * 8)      ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison
     {
         NullType values[ITEMS_PER_THREAD];
 
@@ -527,10 +527,10 @@ public:
      *
      */
     __device__ __forceinline__ void Sort(
-        Key     (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
-        Value   (&values)[ITEMS_PER_THREAD],        ///< [in-out] Values to sort
+        KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
+        ValueT  (&values)[ITEMS_PER_THREAD],        ///< [in-out] Values to sort
         int     begin_bit   = 0,                    ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison
-        int     end_bit     = sizeof(Key) * 8)      ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison
+        int     end_bit     = sizeof(KeyT) * 8)      ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison
     {
        SortBlocked(keys, values, begin_bit, end_bit, Int2Type<false>(), Int2Type<false>());
     }
@@ -573,9 +573,9 @@ public:
      * { [511,510,509,508], [11,10,9,8], [7,6,5,4], ..., [3,2,1,0] }.
      */
     __device__ __forceinline__ void SortDescending(
-        Key     (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
+        KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
         int     begin_bit   = 0,                    ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison
-        int     end_bit     = sizeof(Key) * 8)      ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison
+        int     end_bit     = sizeof(KeyT) * 8)      ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison
     {
         NullType values[ITEMS_PER_THREAD];
 
@@ -628,10 +628,10 @@ public:
      *
      */
     __device__ __forceinline__ void SortDescending(
-        Key     (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
-        Value   (&values)[ITEMS_PER_THREAD],        ///< [in-out] Values to sort
+        KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
+        ValueT  (&values)[ITEMS_PER_THREAD],        ///< [in-out] Values to sort
         int     begin_bit   = 0,                    ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison
-        int     end_bit     = sizeof(Key) * 8)      ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison
+        int     end_bit     = sizeof(KeyT) * 8)      ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison
     {
        SortBlocked(keys, values, begin_bit, end_bit, Int2Type<true>(), Int2Type<false>());
     }
@@ -683,9 +683,9 @@ public:
      *
      */
     __device__ __forceinline__ void SortBlockedToStriped(
-        Key     (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
+        KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
         int     begin_bit   = 0,                    ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison
-        int     end_bit     = sizeof(Key) * 8)      ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison
+        int     end_bit     = sizeof(KeyT) * 8)      ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison
     {
         NullType values[ITEMS_PER_THREAD];
 
@@ -738,10 +738,10 @@ public:
      *
      */
     __device__ __forceinline__ void SortBlockedToStriped(
-        Key     (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
-        Value   (&values)[ITEMS_PER_THREAD],        ///< [in-out] Values to sort
+        KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
+        ValueT  (&values)[ITEMS_PER_THREAD],        ///< [in-out] Values to sort
         int     begin_bit   = 0,                    ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison
-        int     end_bit     = sizeof(Key) * 8)      ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison
+        int     end_bit     = sizeof(KeyT) * 8)      ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison
     {
        SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type<false>(), Int2Type<false>());
     }
@@ -786,9 +786,9 @@ public:
      *
      */
     __device__ __forceinline__ void SortDescendingBlockedToStriped(
-        Key     (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
+        KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
         int     begin_bit   = 0,                    ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison
-        int     end_bit     = sizeof(Key) * 8)      ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison
+        int     end_bit     = sizeof(KeyT) * 8)      ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison
     {
         NullType values[ITEMS_PER_THREAD];
 
@@ -841,10 +841,10 @@ public:
      *
      */
     __device__ __forceinline__ void SortDescendingBlockedToStriped(
-        Key     (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
-        Value   (&values)[ITEMS_PER_THREAD],        ///< [in-out] Values to sort
+        KeyT    (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
+        ValueT  (&values)[ITEMS_PER_THREAD],        ///< [in-out] Values to sort
         int     begin_bit   = 0,                    ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison
-        int     end_bit     = sizeof(Key) * 8)      ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison
+        int     end_bit     = sizeof(KeyT) * 8)      ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison
     {
        SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type<true>(), Int2Type<false>());
     }
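
The per-pass structure above (rank the keys, then scatter them through shared memory) is easiest to see from the caller's side. The following sketch is not part of the patch; it shows how the BlockRadixSort interface in this hunk is typically specialized and invoked. The kernel name and the 128-thread / 4-item tile shape are illustrative only.

```cuda
#include <cub/cub.cuh>

// Sort one tile of 512 integer keys per thread block: 128 threads, 4 keys each.
__global__ void SortKernel(int *d_keys)
{
    // Specialize BlockRadixSort for a 1D block of 128 threads, 4 items per thread
    typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;

    // Shared memory for the collective
    __shared__ typename BlockRadixSort::TempStorage temp_storage;

    // Load a blocked arrangement of consecutive keys per thread
    int thread_keys[4];
    cub::LoadDirectBlocked(threadIdx.x, d_keys + blockIdx.x * 512, thread_keys);

    // Collectively sort the tile (ascending, over all 32 key bits by default)
    BlockRadixSort(temp_storage).Sort(thread_keys);

    // Write the sorted tile back out in blocked arrangement
    cub::StoreDirectBlocked(threadIdx.x, d_keys + blockIdx.x * 512, thread_keys);
}
```
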
diff --git a/SRC/cub/block/block_raking_layout.cuh b/SRC/cub/block/block_raking_layout.cuh
index 9c01f255..35006168 100644
--- a/SRC/cub/block/block_raking_layout.cuh
+++ b/SRC/cub/block/block_raking_layout.cuh
@@ -1,149 +1,152 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::BlockRakingLayout provides a conflict-free shared memory layout abstraction for warp-raking across thread block data.
- */
-
-
-#pragma once
-
-#include "../util_macro.cuh"
-#include "../util_arch.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \brief BlockRakingLayout provides a conflict-free shared memory layout abstraction for 1D raking across thread block data.    ![](raking.png)
- * \ingroup BlockModule
- *
- * \par Overview
- * This type facilitates a shared memory usage pattern where a block of CUDA
- * threads places elements into shared memory and then reduces the active
- * parallelism to one "raking" warp of threads for serially aggregating consecutive
- * sequences of shared items.  Padding is inserted to eliminate bank conflicts
- * (for most data types).
- *
- * \tparam T                        The data type to be exchanged.
- * \tparam BLOCK_THREADS            The thread block size in threads.
- * \tparam PTX_ARCH                 [optional] \ptxversion
- */
-template <
-    typename    T,
-    int         BLOCK_THREADS,
-    int         PTX_ARCH = CUB_PTX_ARCH>
-struct BlockRakingLayout
-{
-    //---------------------------------------------------------------------
-    // Constants and type definitions
-    //---------------------------------------------------------------------
-
-    enum
-    {
-        /// The total number of elements that need to be cooperatively reduced
-        SHARED_ELEMENTS = BLOCK_THREADS,
-
-        /// Maximum number of warp-synchronous raking threads
-        MAX_RAKING_THREADS = CUB_MIN(BLOCK_THREADS, CUB_WARP_THREADS(PTX_ARCH)),
-
-        /// Number of raking elements per warp-synchronous raking thread (rounded up)
-        SEGMENT_LENGTH = (SHARED_ELEMENTS + MAX_RAKING_THREADS - 1) / MAX_RAKING_THREADS,
-
-        /// Never use a raking thread that will have no valid data (e.g., when BLOCK_THREADS is 62 and SEGMENT_LENGTH is 2, we should only use 31 raking threads)
-        RAKING_THREADS = (SHARED_ELEMENTS + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH,
-
-        /// Whether we will have bank conflicts (technically we should find out if the GCD is > 1)
-        HAS_CONFLICTS = (CUB_SMEM_BANKS(PTX_ARCH) % SEGMENT_LENGTH == 0),
-
-        /// Degree of bank conflicts (e.g., 4-way)
-        CONFLICT_DEGREE = (HAS_CONFLICTS) ?
-            (MAX_RAKING_THREADS * SEGMENT_LENGTH) / CUB_SMEM_BANKS(PTX_ARCH) :
-            1,
-
-        /// Pad each segment length with one element if degree of bank conflicts is greater than 4-way (heuristic)
-        SEGMENT_PADDING = (CONFLICT_DEGREE > CUB_PREFER_CONFLICT_OVER_PADDING(PTX_ARCH)) ? 1 : 0,
-//        SEGMENT_PADDING = (HAS_CONFLICTS) ? 1 : 0,
-
-        /// Total number of elements in the raking grid
-        GRID_ELEMENTS = RAKING_THREADS * (SEGMENT_LENGTH + SEGMENT_PADDING),
-
-        /// Whether or not we need bounds checking during raking (the number of reduction elements is not a multiple of the number of raking threads)
-        UNGUARDED = (SHARED_ELEMENTS % RAKING_THREADS == 0),
-    };
-
-
-    /**
-     * \brief Shared memory storage type
-     */
-    typedef T _TempStorage[BlockRakingLayout::GRID_ELEMENTS];
-
-    /// Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    /**
-     * \brief Returns the location for the calling thread to place data into the grid
-     */
-    static __device__ __forceinline__ T* PlacementPtr(
-        TempStorage &temp_storage,
-        int linear_tid)
-    {
-        // Offset for partial
-        unsigned int offset = linear_tid;
-
-        // Add in one padding element for every segment
-        if (SEGMENT_PADDING > 0)
-        {
-            offset += offset / SEGMENT_LENGTH;
-        }
-
-        // Incorporating a block of padding partials every shared memory segment
-        return temp_storage.Alias() + offset;
-    }
-
-
-    /**
-     * \brief Returns the location for the calling thread to begin sequential raking
-     */
-    static __device__ __forceinline__ T* RakingPtr(
-        TempStorage &temp_storage,
-        int linear_tid)
-    {
-        return temp_storage.Alias() + (linear_tid * (SEGMENT_LENGTH + SEGMENT_PADDING));
-    }
-};
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockRakingLayout provides a conflict-free shared memory layout abstraction for warp-raking across thread block data.
+ */
+
+
+#pragma once
+
+#include "../util_macro.cuh"
+#include "../util_arch.cuh"
+#include "../util_type.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief BlockRakingLayout provides a conflict-free shared memory layout abstraction for 1D raking across thread block data.    ![](raking.png)
+ * \ingroup BlockModule
+ *
+ * \par Overview
+ * This type facilitates a shared memory usage pattern where a block of CUDA
+ * threads places elements into shared memory and then reduces the active
+ * parallelism to one "raking" warp of threads for serially aggregating consecutive
+ * sequences of shared items.  Padding is inserted to eliminate bank conflicts
+ * (for most data types).
+ *
+ * \tparam T                        The data type to be exchanged.
+ * \tparam BLOCK_THREADS            The thread block size in threads.
+ * \tparam PTX_ARCH                 [optional] \ptxversion
+ */
+template <
+    typename    T,
+    int         BLOCK_THREADS,
+    int         PTX_ARCH = CUB_PTX_ARCH>
+struct BlockRakingLayout
+{
+    //---------------------------------------------------------------------
+    // Constants and type definitions
+    //---------------------------------------------------------------------
+
+    enum
+    {
+        /// The total number of elements that need to be cooperatively reduced
+        SHARED_ELEMENTS = BLOCK_THREADS,
+
+        /// Maximum number of warp-synchronous raking threads
+        MAX_RAKING_THREADS = CUB_MIN(BLOCK_THREADS, CUB_WARP_THREADS(PTX_ARCH)),
+
+        /// Number of raking elements per warp-synchronous raking thread (rounded up)
+        SEGMENT_LENGTH = (SHARED_ELEMENTS + MAX_RAKING_THREADS - 1) / MAX_RAKING_THREADS,
+
+        /// Never use a raking thread that will have no valid data (e.g., when BLOCK_THREADS is 62 and SEGMENT_LENGTH is 2, we should only use 31 raking threads)
+        RAKING_THREADS = (SHARED_ELEMENTS + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH,
+
+        /// Whether we will have bank conflicts (technically we should find out if the GCD is > 1)
+        HAS_CONFLICTS = (CUB_SMEM_BANKS(PTX_ARCH) % SEGMENT_LENGTH == 0),
+
+        /// Degree of bank conflicts (e.g., 4-way)
+        CONFLICT_DEGREE = (HAS_CONFLICTS) ?
+            (MAX_RAKING_THREADS * SEGMENT_LENGTH) / CUB_SMEM_BANKS(PTX_ARCH) :
+            1,
+
+        /// Pad each segment length with one element if segment length is not relatively prime to warp size and can't be optimized as a vector load
+        USE_SEGMENT_PADDING = ((SEGMENT_LENGTH & 1) == 0) && (SEGMENT_LENGTH > 2),
+
+        /// Total number of elements in the raking grid
+        GRID_ELEMENTS = RAKING_THREADS * (SEGMENT_LENGTH + USE_SEGMENT_PADDING),
+
+        /// Whether or not we need bounds checking during raking (the number of reduction elements is not a multiple of the number of raking threads)
+        UNGUARDED = (SHARED_ELEMENTS % RAKING_THREADS == 0),
+    };
+
+
+    /**
+     * \brief Shared memory storage type
+     */
+    struct __align__(16) _TempStorage
+    {
+        T buff[BlockRakingLayout::GRID_ELEMENTS];
+    };
+
+    /// Alias wrapper allowing storage to be unioned
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /**
+     * \brief Returns the location for the calling thread to place data into the grid
+     */
+    static __device__ __forceinline__ T* PlacementPtr(
+        TempStorage &temp_storage,
+        unsigned int linear_tid)
+    {
+        // Offset for partial
+        unsigned int offset = linear_tid;
+
+        // Add in one padding element for every segment
+        if (USE_SEGMENT_PADDING > 0)
+        {
+            offset += offset / SEGMENT_LENGTH;
+        }
+
+        // Incorporating a block of padding partials every shared memory segment
+        return temp_storage.Alias().buff + offset;
+    }
+
+
+    /**
+     * \brief Returns the location for the calling thread to begin sequential raking
+     */
+    static __device__ __forceinline__ T* RakingPtr(
+        TempStorage &temp_storage,
+        unsigned int linear_tid)
+    {
+        return temp_storage.Alias().buff + (linear_tid * (SEGMENT_LENGTH + USE_SEGMENT_PADDING));
+    }
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
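
To make the constants in the new header concrete: for a 128-thread block on a 32-thread-warp, 32-bank target, SEGMENT_LENGTH = ceil(128/32) = 4; since 4 is even and greater than 2, USE_SEGMENT_PADDING = 1, so each raking segment gets one padding slot and the grid holds 32 * (4 + 1) = 160 elements. A compile-time-only sketch (not part of the patch; built with nvcc) checking that arithmetic:

```cuda
#include <cub/block/block_raking_layout.cuh>

// BlockRakingLayout for a 128-thread block; PTX_ARCH left at its default.
typedef cub::BlockRakingLayout<int, 128> Layout128;

static_assert(Layout128::SEGMENT_LENGTH == 4,      "4 partials per raking thread");
static_assert(Layout128::RAKING_THREADS == 32,     "one full raking warp");
static_assert(Layout128::USE_SEGMENT_PADDING == 1, "even segments > 2 get one pad slot");
static_assert(Layout128::GRID_ELEMENTS == 160,     "32 * (4 + 1) grid slots");
static_assert(Layout128::UNGUARDED == 1,           "128 is a multiple of 32, no bounds checks");
```
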
diff --git a/SRC/cub/block/block_reduce.cuh b/SRC/cub/block/block_reduce.cuh
index 8e3124c9..261f2ea6 100644
--- a/SRC/cub/block/block_reduce.cuh
+++ b/SRC/cub/block/block_reduce.cuh
@@ -1,607 +1,607 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * The cub::BlockReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block.
- */
-
-#pragma once
-
-#include "specializations/block_reduce_raking.cuh"
-#include "specializations/block_reduce_raking_commutative_only.cuh"
-#include "specializations/block_reduce_warp_reductions.cuh"
-#include "../util_ptx.cuh"
-#include "../util_type.cuh"
-#include "../thread/thread_operators.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-
-/******************************************************************************
- * Algorithmic variants
- ******************************************************************************/
-
-/**
- * BlockReduceAlgorithm enumerates alternative algorithms for parallel
- * reduction across a CUDA threadblock.
- */
-enum BlockReduceAlgorithm
-{
-
-    /**
-     * \par Overview
-     * An efficient "raking" reduction algorithm that only supports commutative
-     * reduction operators (true for most operations, e.g., addition).
-     *
-     * \par
-     * Execution is comprised of three phases:
-     * -# Upsweep sequential reduction in registers (if threads contribute more
-     *    than one input each).  Threads in warps other than the first warp place
-     *    their partial reductions into shared memory.
-     * -# Upsweep sequential reduction in shared memory.  Threads within the first
-     *    warp continue to accumulate by raking across segments of shared partial reductions
-     * -# A warp-synchronous Kogge-Stone style reduction within the raking warp.
-     *
-     * \par
-     * \image html block_reduce.png
-     * 
\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.
-     *
-     * \par Performance Considerations
-     * - This variant performs less communication than BLOCK_REDUCE_RAKING_NON_COMMUTATIVE
-     *   and is preferable when the reduction operator is commutative.  This variant
-     *   applies fewer reduction operators than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall
-     *   throughput across the GPU when suitably occupied.  However, turn-around latency may be
-     *   higher than to BLOCK_REDUCE_WARP_REDUCTIONS and thus less-desirable
-     *   when the GPU is under-occupied.
-     */
-    BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY,
-
-
-    /**
-     * \par Overview
-     * An efficient "raking" reduction algorithm that supports commutative
-     * (e.g., addition) and non-commutative (e.g., string concatenation) reduction
-     * operators. \blocked.
-     *
-     * \par
-     * Execution is comprised of three phases:
-     * -# Upsweep sequential reduction in registers (if threads contribute more
-     *    than one input each).  Each thread then places the partial reduction
-     *    of its item(s) into shared memory.
-     * -# Upsweep sequential reduction in shared memory.  Threads within a
-     *    single warp rake across segments of shared partial reductions.
-     * -# A warp-synchronous Kogge-Stone style reduction within the raking warp.
-     *
-     * \par
-     * \image html block_reduce.png
-     * <center><b>\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.</b></center>
-     *
-     * \par Performance Considerations
-     * - This variant performs more communication than BLOCK_REDUCE_RAKING
-     *   and is only preferable when the reduction operator is non-commutative.  This variant
-     *   applies fewer reduction operators than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall
-     *   throughput across the GPU when suitably occupied.  However, turn-around latency may be
-     *   higher than to BLOCK_REDUCE_WARP_REDUCTIONS and thus less-desirable
-     *   when the GPU is under-occupied.
-     */
-    BLOCK_REDUCE_RAKING,
-
-
-    /**
-     * \par Overview
-     * A quick "tiled warp-reductions" reduction algorithm that supports commutative
-     * (e.g., addition) and non-commutative (e.g., string concatenation) reduction
-     * operators.
-     *
-     * \par
-     * Execution is comprised of four phases:
-     * -# Upsweep sequential reduction in registers (if threads contribute more
-     *    than one input each).  Each thread then places the partial reduction
-     *    of its item(s) into shared memory.
-     * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style
-     *    reduction within each warp.
-     * -# A propagation phase where the warp reduction outputs in each warp are
-     *    updated with the aggregate from each preceding warp.
-     *
-     * \par
-     * \image html block_scan_warpscans.png
-     * <center><b>\p BLOCK_REDUCE_WARP_REDUCTIONS data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.</b></center>
-     *
-     * \par Performance Considerations
-     * - This variant applies more reduction operators than BLOCK_REDUCE_RAKING
-     *   or BLOCK_REDUCE_RAKING_NON_COMMUTATIVE, which may result in lower overall
-     *   throughput across the GPU.  However turn-around latency may be lower and
-     *   thus useful when the GPU is under-occupied.
-     */
-    BLOCK_REDUCE_WARP_REDUCTIONS,
-};
-
-
-/******************************************************************************
- * Block reduce
- ******************************************************************************/
-
-/**
- * \brief The BlockReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block. ![](reduce_logo.png)
- * \ingroup BlockModule
- *
- * \tparam T                Data type being reduced
- * \tparam BLOCK_DIM_X      The thread block length in threads along the X dimension
- * \tparam ALGORITHM        [optional] cub::BlockReduceAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_REDUCE_WARP_REDUCTIONS)
- * \tparam BLOCK_DIM_Y      [optional] The thread block length in threads along the Y dimension (default: 1)
- * \tparam BLOCK_DIM_Z      [optional] The thread block length in threads along the Z dimension (default: 1)
- * \tparam PTX_ARCH         [optional] \ptxversion
- *
- * \par Overview
- * - A reduction (or fold)
- *   uses a binary combining operator to compute a single aggregate from a list of input elements.
- * - \rowmajor
- * - BlockReduce can be optionally specialized by algorithm to accommodate different latency/throughput workload profiles:
- *   -# cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY.  An efficient "raking" reduction algorithm that only supports commutative reduction operators.  [More...](\ref cub::BlockReduceAlgorithm)
- *   -# cub::BLOCK_REDUCE_RAKING.  An efficient "raking" reduction algorithm that supports commutative and non-commutative reduction operators.  [More...](\ref cub::BlockReduceAlgorithm)
- *   -# cub::BLOCK_REDUCE_WARP_REDUCTIONS.  A quick "tiled warp-reductions" reduction algorithm that supports commutative and non-commutative reduction operators.  [More...](\ref cub::BlockReduceAlgorithm)
- *
- * \par Performance Considerations
- * - \granularity
- * - Very efficient (only one synchronization barrier).
- * - Incurs zero bank conflicts for most types
- * - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
- *   - Summation (vs. generic reduction)
- *   - \p BLOCK_THREADS is a multiple of the architecture's warp size
- *   - Every thread has a valid input (i.e., full vs. partial-tiles)
- * - See cub::BlockReduceAlgorithm for performance details regarding algorithmic alternatives
- *
- * \par A Simple Example
- * \blockcollective{BlockReduce}
- * \par
- * The code snippet below illustrates a sum reduction of 512 integer items that
- * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads
- * where each thread owns 4 consecutive items.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
- *
- * __global__ void ExampleKernel(...)
- * {
- *     // Specialize BlockReduce for a 1D block of 128 threads on type int
- *     typedef cub::BlockReduce<int, 128> BlockReduce;
- *
- *     // Allocate shared memory for BlockReduce
- *     __shared__ typename BlockReduce::TempStorage temp_storage;
- *
- *     // Obtain a segment of consecutive items that are blocked across threads
- *     int thread_data[4];
- *     ...
- *
- *     // Compute the block-wide sum for thread0
- *     int aggregate = BlockReduce(temp_storage).Sum(thread_data);
- *
- * \endcode
- *
- */
-template <
-    typename                T,
-    int                     BLOCK_DIM_X,
-    BlockReduceAlgorithm    ALGORITHM       = BLOCK_REDUCE_WARP_REDUCTIONS,
-    int                     BLOCK_DIM_Y     = 1,
-    int                     BLOCK_DIM_Z     = 1,
-    int                     PTX_ARCH        = CUB_PTX_ARCH>
-class BlockReduce
-{
-private:
-
-    /******************************************************************************
-     * Constants and type definitions
-     ******************************************************************************/
-
-    /// Constants
-    enum
-    {
-        /// The thread block size in threads
-        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-    };
-
-    typedef BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH>           WarpReductions;
-    typedef BlockReduceRakingCommutativeOnly<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH>    RakingCommutativeOnly;
-    typedef BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH>                   Raking;
-
-    /// Internal specialization type
-    typedef typename If<(ALGORITHM == BLOCK_REDUCE_WARP_REDUCTIONS),
-        WarpReductions,
-        typename If<(ALGORITHM == BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY),
-            RakingCommutativeOnly,
-            Raking>::Type>::Type InternalBlockReduce;           // BlockReduceRaking
-
-    /// Shared memory storage layout type for BlockReduce
-    typedef typename InternalBlockReduce::TempStorage _TempStorage;
-
-
-    /******************************************************************************
-     * Utility methods
-     ******************************************************************************/
-
-    /// Internal storage allocator
-    __device__ __forceinline__ _TempStorage& PrivateStorage()
-    {
-        __shared__ _TempStorage private_storage;
-        return private_storage;
-    }
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    /// Shared storage reference
-    _TempStorage &temp_storage;
-
-    /// Linear thread-id
-    int linear_tid;
-
-
-public:
-
-    /// \smemstorage{BlockReduce}
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    /******************************************************************//**
-     * \name Collective constructors
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
-     */
-    __device__ __forceinline__ BlockReduce()
-    :
-        temp_storage(PrivateStorage()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    /**
-     * \brief Collective constructor using the specified memory allocation as temporary storage.
-     */
-    __device__ __forceinline__ BlockReduce(
-        TempStorage &temp_storage)          ///< [in] Reference to memory allocation having layout type TempStorage
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Generic reductions
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes a block-wide reduction for thread0 using the specified binary reduction functor.  Each thread contributes one input element.
-     *
-     * \par
-     * - The return value is undefined in threads other than thread0.
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a max reduction of 128 integer items that
-     * are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
-     *     typedef cub::BlockReduce<int, 128> BlockReduce;
-     *
-     *     // Allocate shared memory for BlockReduce
-     *     __shared__ typename BlockReduce::TempStorage temp_storage;
-     *
-     *     // Each thread obtains an input item
-     *     int thread_data;
-     *     ...
-     *
-     *     // Compute the block-wide max for thread0
-     *     int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max());
-     *
-     * \endcode
-     *
-     * \tparam ReductionOp          [inferred] Binary reduction functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member T operator()(const T &a, const T &b)
-     */
-    template <typename ReductionOp>
-    __device__ __forceinline__ T Reduce(
-        T               input,                      ///< [in] Calling thread's input
-        ReductionOp     reduction_op)               ///< [in] Binary reduction functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
-    {
-        return InternalBlockReduce(temp_storage).template Reduce<true>(input, BLOCK_THREADS, reduction_op);
-    }
-
-
-    /**
-     * \brief Computes a block-wide reduction for thread0 using the specified binary reduction functor.  Each thread contributes an array of consecutive input elements.
-     *
-     * \par
-     * - The return value is undefined in threads other than thread0.
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a max reduction of 512 integer items that
-     * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
-     *     typedef cub::BlockReduce<int, 128> BlockReduce;
-     *
-     *     // Allocate shared memory for BlockReduce
-     *     __shared__ typename BlockReduce::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Compute the block-wide max for thread0
-     *     int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max());
-     *
-     * \endcode
-     *
-     * \tparam ITEMS_PER_THREAD     [inferred] The number of consecutive items partitioned onto each thread.
-     * \tparam ReductionOp          [inferred] Binary reduction functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member T operator()(const T &a, const T &b)
-     */
-    template <
-        int ITEMS_PER_THREAD,
-        typename ReductionOp>
-    __device__ __forceinline__ T Reduce(
-        T               (&inputs)[ITEMS_PER_THREAD],    ///< [in] Calling thread's input segment
-        ReductionOp     reduction_op)                   ///< [in] Binary reduction functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
-    {
-        // Reduce partials
-        T partial = ThreadReduce(inputs, reduction_op);
-        return Reduce(partial, reduction_op);
-    }
-
-
-    /**
-     * \brief Computes a block-wide reduction for thread0 using the specified binary reduction functor.  The first \p num_valid threads each contribute one input element.
-     *
-     * \par
-     * - The return value is undefined in threads other than thread0.
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a max reduction of a partially-full tile of integer items that
-     * are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
-     *
-     * __global__ void ExampleKernel(int num_valid, ...)
-     * {
-     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
-     *     typedef cub::BlockReduce<int, 128> BlockReduce;
-     *
-     *     // Allocate shared memory for BlockReduce
-     *     __shared__ typename BlockReduce::TempStorage temp_storage;
-     *
-     *     // Each thread obtains an input item
-     *     int thread_data;
-     *     if (threadIdx.x < num_valid) thread_data = ...
-     *
-     *     // Compute the block-wide max for thread0
-     *     int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max(), num_valid);
-     *
-     * \endcode
-     *
-     * \tparam ReductionOp          [inferred] Binary reduction functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member T operator()(const T &a, const T &b)
-     */
-    template <typename ReductionOp>
-    __device__ __forceinline__ T Reduce(
-        T               input,                  ///< [in] Calling thread's input
-        ReductionOp     reduction_op,           ///< [in] Binary reduction functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
-        int             num_valid)              ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS)
-    {
-        // Determine if we can skip bounds checking
-        if (num_valid >= BLOCK_THREADS)
-        {
-            return InternalBlockReduce(temp_storage).template Reduce<true>(input, num_valid, reduction_op);
-        }
-        else
-        {
-            return InternalBlockReduce(temp_storage).template Reduce<false>(input, num_valid, reduction_op);
-        }
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Summation reductions
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes a block-wide reduction for thread0 using addition (+) as the reduction operator.  Each thread contributes one input element.
-     *
-     * \par
-     * - The return value is undefined in threads other than thread0.
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a sum reduction of 128 integer items that
-     * are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
-     *     typedef cub::BlockReduce<int, 128> BlockReduce;
-     *
-     *     // Allocate shared memory for BlockReduce
-     *     __shared__ typename BlockReduce::TempStorage temp_storage;
-     *
-     *     // Each thread obtains an input item
-     *     int thread_data;
-     *     ...
-     *
-     *     // Compute the block-wide sum for thread0
-     *     int aggregate = BlockReduce(temp_storage).Sum(thread_data);
-     *
-     * \endcode
-     *
-     */
-    __device__ __forceinline__ T Sum(
-        T   input)                      ///< [in] Calling thread's input
-    {
-        return InternalBlockReduce(temp_storage).template Sum<true>(input, BLOCK_THREADS);
-    }
-
-    /**
-     * \brief Computes a block-wide reduction for thread0 using addition (+) as the reduction operator.  Each thread contributes an array of consecutive input elements.
-     *
-     * \par
-     * - The return value is undefined in threads other than thread0.
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a sum reduction of 512 integer items that
-     * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
-     *     typedef cub::BlockReduce<int, 128> BlockReduce;
-     *
-     *     // Allocate shared memory for BlockReduce
-     *     __shared__ typename BlockReduce::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Compute the block-wide sum for thread0
-     *     int aggregate = BlockReduce(temp_storage).Sum(thread_data);
-     *
-     * \endcode
-     *
-     * \tparam ITEMS_PER_THREAD     [inferred] The number of consecutive items partitioned onto each thread.
-     */
-    template <int ITEMS_PER_THREAD>
-    __device__ __forceinline__ T Sum(
-        T   (&inputs)[ITEMS_PER_THREAD])    ///< [in] Calling thread's input segment
-    {
-        // Reduce partials
-        T partial = ThreadReduce(inputs, cub::Sum());
-        return Sum(partial);
-    }
-
-
-    /**
-     * \brief Computes a block-wide reduction for thread0 using addition (+) as the reduction operator.  The first \p num_valid threads each contribute one input element.
-     *
-     * \par
-     * - The return value is undefined in threads other than thread0.
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a sum reduction of a partially-full tile of integer items that
-     * are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
-     *
-     * __global__ void ExampleKernel(int num_valid, ...)
-     * {
-     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
-     *     typedef cub::BlockReduce<int, 128> BlockReduce;
-     *
-     *     // Allocate shared memory for BlockReduce
-     *     __shared__ typename BlockReduce::TempStorage temp_storage;
-     *
-     *     // Each thread obtains an input item (up to num_items)
-     *     int thread_data;
-     *     if (threadIdx.x < num_valid)
-     *         thread_data = ...
-     *
-     *     // Compute the block-wide sum for thread0
-     *     int aggregate = BlockReduce(temp_storage).Sum(thread_data, num_valid);
-     *
-     * \endcode
-     *
-     */
-    __device__ __forceinline__ T Sum(
-        T   input,                  ///< [in] Calling thread's input
-        int num_valid)              ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS)
-    {
-        // Determine if we can skip bounds checking
-        if (num_valid >= BLOCK_THREADS)
-        {
-            return InternalBlockReduce(temp_storage).template Sum<true>(input, num_valid);
-        }
-        else
-        {
-            return InternalBlockReduce(temp_storage).template Sum<false>(input, num_valid);
-        }
-    }
-
-
-    //@}  end member group
-};
-
-/**
- * \example example_block_reduce.cu
- */
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::BlockReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "specializations/block_reduce_raking.cuh"
+#include "specializations/block_reduce_raking_commutative_only.cuh"
+#include "specializations/block_reduce_warp_reductions.cuh"
+#include "../util_ptx.cuh"
+#include "../util_type.cuh"
+#include "../thread/thread_operators.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+
+/******************************************************************************
+ * Algorithmic variants
+ ******************************************************************************/
+
+/**
+ * BlockReduceAlgorithm enumerates alternative algorithms for parallel
+ * reduction across a CUDA thread block.
+ */
+enum BlockReduceAlgorithm
+{
+
+    /**
+     * \par Overview
+     * An efficient "raking" reduction algorithm that only supports commutative
+     * reduction operators (true for most operations, e.g., addition).
+     *
+     * \par
+     * Execution is comprised of three phases:
+     * -# Upsweep sequential reduction in registers (if threads contribute more
+     *    than one input each).  Threads in warps other than the first warp place
+     *    their partial reductions into shared memory.
+     * -# Upsweep sequential reduction in shared memory.  Threads within the first
+     *    warp continue to accumulate by raking across segments of shared partial reductions
+     * -# A warp-synchronous Kogge-Stone style reduction within the raking warp.
+     *
+     * \par
+     * \image html block_reduce.png
+     * <center><b>\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.</b></center>
+     *
+     * \par Performance Considerations
+     * - This variant performs less communication than BLOCK_REDUCE_RAKING_NON_COMMUTATIVE
+     *   and is preferable when the reduction operator is commutative.  This variant
+     *   applies fewer reduction operators than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall
+     *   throughput across the GPU when suitably occupied.  However, turn-around latency may be
+     *   higher than to BLOCK_REDUCE_WARP_REDUCTIONS and thus less-desirable
+     *   when the GPU is under-occupied.
+     */
+    BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY,
+
+
+    /**
+     * \par Overview
+     * An efficient "raking" reduction algorithm that supports commutative
+     * (e.g., addition) and non-commutative (e.g., string concatenation) reduction
+     * operators. \blocked.
+     *
+     * \par
+     * Execution is comprised of three phases:
+     * -# Upsweep sequential reduction in registers (if threads contribute more
+     *    than one input each).  Each thread then places the partial reduction
+     *    of its item(s) into shared memory.
+     * -# Upsweep sequential reduction in shared memory.  Threads within a
+     *    single warp rake across segments of shared partial reductions.
+     * -# A warp-synchronous Kogge-Stone style reduction within the raking warp.
+     *
+     * \par
+     * \image html block_reduce.png
+     * <center><b>\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.</b></center>
+     *
+     * \par Performance Considerations
+     * - This variant performs more communication than BLOCK_REDUCE_RAKING
+     *   and is only preferable when the reduction operator is non-commutative.  This variant
+     *   applies fewer reduction operators than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall
+     *   throughput across the GPU when suitably occupied.  However, turn-around latency may be
+     *   higher than to BLOCK_REDUCE_WARP_REDUCTIONS and thus less-desirable
+     *   when the GPU is under-occupied.
+     */
+    BLOCK_REDUCE_RAKING,
+
+
+    /**
+     * \par Overview
+     * A quick "tiled warp-reductions" reduction algorithm that supports commutative
+     * (e.g., addition) and non-commutative (e.g., string concatenation) reduction
+     * operators.
+     *
+     * \par
+     * Execution is comprised of four phases:
+     * -# Upsweep sequential reduction in registers (if threads contribute more
+     *    than one input each).  Each thread then places the partial reduction
+     *    of its item(s) into shared memory.
+     * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style
+     *    reduction within each warp.
+     * -# A propagation phase where the warp reduction outputs in each warp are
+     *    updated with the aggregate from each preceding warp.
+     *
+     * \par
+     * \image html block_scan_warpscans.png
+     * <center><b>\p BLOCK_REDUCE_WARP_REDUCTIONS data flow for a hypothetical 16-thread thread block and 4-thread raking warp.</b></center>
+ * + * \par Performance Considerations + * - This variant applies more reduction operators than BLOCK_REDUCE_RAKING + * or BLOCK_REDUCE_RAKING_NON_COMMUTATIVE, which may result in lower overall + * throughput across the GPU. However turn-around latency may be lower and + * thus useful when the GPU is under-occupied. + */ + BLOCK_REDUCE_WARP_REDUCTIONS, +}; + + +/****************************************************************************** + * Block reduce + ******************************************************************************/ + +/** + * \brief The BlockReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block. ![](reduce_logo.png) + * \ingroup BlockModule + * + * \tparam T Data type being reduced + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam ALGORITHM [optional] cub::BlockReduceAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_REDUCE_WARP_REDUCTIONS) + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - A reduction (or fold) + * uses a binary combining operator to compute a single aggregate from a list of input elements. + * - \rowmajor + * - BlockReduce can be optionally specialized by algorithm to accommodate different latency/throughput workload profiles: + * -# cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY. An efficient "raking" reduction algorithm that only supports commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm) + * -# cub::BLOCK_REDUCE_RAKING. An efficient "raking" reduction algorithm that supports commutative and non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm) + * -# cub::BLOCK_REDUCE_WARP_REDUCTIONS. A quick "tiled warp-reductions" reduction algorithm that supports commutative and non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm) + * + * \par Performance Considerations + * - \granularity + * - Very efficient (only one synchronization barrier). + * - Incurs zero bank conflicts for most types + * - Computation is slightly more efficient (i.e., having lower instruction overhead) for: + * - Summation (vs. generic reduction) + * - \p BLOCK_THREADS is a multiple of the architecture's warp size + * - Every thread has a valid input (i.e., full vs. partial-tiles) + * - See cub::BlockReduceAlgorithm for performance details regarding algorithmic alternatives + * + * \par A Simple Example + * \blockcollective{BlockReduce} + * \par + * The code snippet below illustrates a sum reduction of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockReduce for a 1D block of 128 threads on type int + * typedef cub::BlockReduce BlockReduce; + * + * // Allocate shared memory for BlockReduce + * __shared__ typename BlockReduce::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... 
+ * + * // Compute the block-wide sum for thread0 + * int aggregate = BlockReduce(temp_storage).Sum(thread_data); + * + * \endcode + * + */ +template < + typename T, + int BLOCK_DIM_X, + BlockReduceAlgorithm ALGORITHM = BLOCK_REDUCE_WARP_REDUCTIONS, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockReduce +{ +private: + + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + typedef BlockReduceWarpReductions WarpReductions; + typedef BlockReduceRakingCommutativeOnly RakingCommutativeOnly; + typedef BlockReduceRaking Raking; + + /// Internal specialization type + typedef typename If<(ALGORITHM == BLOCK_REDUCE_WARP_REDUCTIONS), + WarpReductions, + typename If<(ALGORITHM == BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY), + RakingCommutativeOnly, + Raking>::Type>::Type InternalBlockReduce; // BlockReduceRaking + + /// Shared memory storage layout type for BlockReduce + typedef typename InternalBlockReduce::TempStorage _TempStorage; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + + +public: + + /// \smemstorage{BlockReduce} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockReduce() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. + */ + __device__ __forceinline__ BlockReduce( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Generic reductions + *********************************************************************/ + //@{ + + + /** + * \brief Computes a block-wide reduction for thread0 using the specified binary reduction functor. Each thread contributes one input element. + * + * \par + * - The return value is undefined in threads other than thread0. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a max reduction of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) 
+     * {
+     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
+     *     typedef cub::BlockReduce<int, 128> BlockReduce;
+     *
+     *     // Allocate shared memory for BlockReduce
+     *     __shared__ typename BlockReduce::TempStorage temp_storage;
+     *
+     *     // Each thread obtains an input item
+     *     int thread_data;
+     *     ...
+     *
+     *     // Compute the block-wide max for thread0
+     *     int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max());
+     *
+     * \endcode
+     *
+     * \tparam ReductionOp          [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b)
+     */
+    template <typename ReductionOp>
+    __device__ __forceinline__ T Reduce(
+        T               input,                      ///< [in] Calling thread's input
+        ReductionOp     reduction_op)               ///< [in] Binary reduction functor
+    {
+        return InternalBlockReduce(temp_storage).template Reduce<true>(input, BLOCK_THREADS, reduction_op);
+    }
+
+
+    /**
+     * \brief Computes a block-wide reduction for thread0 using the specified binary reduction functor.  Each thread contributes an array of consecutive input elements.
+     *
+     * \par
+     * - The return value is undefined in threads other than thread0.
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a max reduction of 512 integer items that
+     * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
+     *     typedef cub::BlockReduce<int, 128> BlockReduce;
+     *
+     *     // Allocate shared memory for BlockReduce
+     *     __shared__ typename BlockReduce::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Compute the block-wide max for thread0
+     *     int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max());
+     *
+     * \endcode
+     *
+     * \tparam ITEMS_PER_THREAD     [inferred] The number of consecutive items partitioned onto each thread.
+     * \tparam ReductionOp          [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b)
+     */
+    template <
+        int         ITEMS_PER_THREAD,
+        typename    ReductionOp>
+    __device__ __forceinline__ T Reduce(
+        T               (&inputs)[ITEMS_PER_THREAD],    ///< [in] Calling thread's input segment
+        ReductionOp     reduction_op)                   ///< [in] Binary reduction functor
+    {
+        // Reduce partials
+        T partial = internal::ThreadReduce(inputs, reduction_op);
+        return Reduce(partial, reduction_op);
+    }
+
+
+    /**
+     * \brief Computes a block-wide reduction for thread0 using the specified binary reduction functor.  The first \p num_valid threads each contribute one input element.
+     *
+     * \par
+     * - The return value is undefined in threads other than thread0.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a max reduction of a partially-full tile of integer items that
+     * are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
+     *
+     * __global__ void ExampleKernel(int num_valid, ...)
+     * {
+     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
+     *     typedef cub::BlockReduce<int, 128> BlockReduce;
+     *
+     *     // Allocate shared memory for BlockReduce
+     *     __shared__ typename BlockReduce::TempStorage temp_storage;
+     *
+     *     // Each thread obtains an input item
+     *     int thread_data;
+     *     if (threadIdx.x < num_valid) thread_data = ...
+     *
+     *     // Compute the block-wide max for thread0
+     *     int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max(), num_valid);
+     *
+     * \endcode
+     *
+     * \tparam ReductionOp          [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b)
+     */
+    template <typename ReductionOp>
+    __device__ __forceinline__ T Reduce(
+        T               input,                  ///< [in] Calling thread's input
+        ReductionOp     reduction_op,           ///< [in] Binary reduction functor
+        int             num_valid)              ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS)
+    {
+        // Determine if we can skip bounds checking
+        if (num_valid >= BLOCK_THREADS)
+        {
+            return InternalBlockReduce(temp_storage).template Reduce<true>(input, num_valid, reduction_op);
+        }
+        else
+        {
+            return InternalBlockReduce(temp_storage).template Reduce<false>(input, num_valid, reduction_op);
+        }
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Summation reductions
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes a block-wide reduction for thread0 using addition (+) as the reduction operator.  Each thread contributes one input element.
+     *
+     * \par
+     * - The return value is undefined in threads other than thread0.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a sum reduction of 128 integer items that
+     * are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
+     *     typedef cub::BlockReduce<int, 128> BlockReduce;
+     *
+     *     // Allocate shared memory for BlockReduce
+     *     __shared__ typename BlockReduce::TempStorage temp_storage;
+     *
+     *     // Each thread obtains an input item
+     *     int thread_data;
+     *     ...
+     *
+     *     // Compute the block-wide sum for thread0
+     *     int aggregate = BlockReduce(temp_storage).Sum(thread_data);
+     *
+     * \endcode
+     *
+     */
+    __device__ __forceinline__ T Sum(
+        T   input)                      ///< [in] Calling thread's input
+    {
+        return InternalBlockReduce(temp_storage).template Sum<true>(input, BLOCK_THREADS);
+    }
+
+    /**
+     * \brief Computes a block-wide reduction for thread0 using addition (+) as the reduction operator.  Each thread contributes an array of consecutive input elements.
+     *
+     * \par
+     * - The return value is undefined in threads other than thread0.
+     * - \granularity
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a sum reduction of 512 integer items that
+     * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads
+     * where each thread owns 4 consecutive items.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
+     *     typedef cub::BlockReduce<int, 128> BlockReduce;
+     *
+     *     // Allocate shared memory for BlockReduce
+     *     __shared__ typename BlockReduce::TempStorage temp_storage;
+     *
+     *     // Obtain a segment of consecutive items that are blocked across threads
+     *     int thread_data[4];
+     *     ...
+     *
+     *     // Compute the block-wide sum for thread0
+     *     int aggregate = BlockReduce(temp_storage).Sum(thread_data);
+     *
+     * \endcode
+     *
+     * \tparam ITEMS_PER_THREAD     [inferred] The number of consecutive items partitioned onto each thread.
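+     *
+     * \par
+     * For further illustration (an editor's sketch, not part of the original CUB
+     * documentation; the kernel name, launch geometry, and device pointers below
+     * are assumptions), a complete kernel that sums 4 items per thread and has
+     * thread0 publish each block's total might look like:
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void BlockSumKernel(const int *d_in, int *d_block_sums)
+     * {
+     *     typedef cub::BlockReduce<int, 128> BlockReduce;
+     *     __shared__ typename BlockReduce::TempStorage temp_storage;
+     *
+     *     // Each of the 128 threads loads 4 consecutive items (blocked arrangement)
+     *     int items[4];
+     *     int base = (blockIdx.x * 128 + threadIdx.x) * 4;
+     *     for (int i = 0; i < 4; ++i)
+     *         items[i] = d_in[base + i];
+     *
+     *     // Block-wide sum; the result is only defined in thread0
+     *     int aggregate = BlockReduce(temp_storage).Sum(items);
+     *     if (threadIdx.x == 0)
+     *         d_block_sums[blockIdx.x] = aggregate;
+     * }
+     * \endcode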
+     */
+    template <int ITEMS_PER_THREAD>
+    __device__ __forceinline__ T Sum(
+        T   (&inputs)[ITEMS_PER_THREAD])    ///< [in] Calling thread's input segment
+    {
+        // Reduce partials
+        T partial = internal::ThreadReduce(inputs, cub::Sum());
+        return Sum(partial);
+    }
+
+
+    /**
+     * \brief Computes a block-wide reduction for thread0 using addition (+) as the reduction operator.  The first \p num_valid threads each contribute one input element.
+     *
+     * \par
+     * - The return value is undefined in threads other than thread0.
+     * - \rowmajor
+     * - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a sum reduction of a partially-full tile of integer items that
+     * are partitioned across 128 threads.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
+     *
+     * __global__ void ExampleKernel(int num_valid, ...)
+     * {
+     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
+     *     typedef cub::BlockReduce<int, 128> BlockReduce;
+     *
+     *     // Allocate shared memory for BlockReduce
+     *     __shared__ typename BlockReduce::TempStorage temp_storage;
+     *
+     *     // Each thread obtains an input item (up to num_items)
+     *     int thread_data;
+     *     if (threadIdx.x < num_valid)
+     *         thread_data = ...
+     *
+     *     // Compute the block-wide sum for thread0
+     *     int aggregate = BlockReduce(temp_storage).Sum(thread_data, num_valid);
+     *
+     * \endcode
+     *
+     */
+    __device__ __forceinline__ T Sum(
+        T   input,                  ///< [in] Calling thread's input
+        int num_valid)              ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS)
+    {
+        // Determine if we can skip bounds checking
+        if (num_valid >= BLOCK_THREADS)
+        {
+            return InternalBlockReduce(temp_storage).template Sum<true>(input, num_valid);
+        }
+        else
+        {
+            return InternalBlockReduce(temp_storage).template Sum<false>(input, num_valid);
+        }
+    }
+
+
+    //@}  end member group
+};
+
+/**
+ * \example example_block_reduce.cu
+ */
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/SRC/cub/block/block_scan.cuh b/SRC/cub/block/block_scan.cuh
index 84e58302..27ea7ed4 100644
--- a/SRC/cub/block/block_scan.cuh
+++ b/SRC/cub/block/block_scan.cuh
@@ -1,2318 +1,2126 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED.  IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * The cub::BlockScan class provides [collective](index.html#sec0) methods for computing a parallel prefix sum/scan of items partitioned across a CUDA thread block.
- */
-
-#pragma once
-
-#include "specializations/block_scan_raking.cuh"
-#include "specializations/block_scan_warp_scans.cuh"
-#include "../util_arch.cuh"
-#include "../util_type.cuh"
-#include "../util_ptx.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-
-/******************************************************************************
- * Scan utility types
- ******************************************************************************/
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-/**
- * Reduce-value-by-ID scan operator
- */
-template <typename ReductionOp>     ///< Wrapped reduction operator type
-struct ReduceByKeyOp
-{
-    ReductionOp op;                 ///< Wrapped reduction operator
-
-    /// Constructor
-    __device__ __forceinline__ ReduceByKeyOp(ReductionOp op) : op(op) {}
-
-    /// Scan operator
-    template <typename KeyValuePair>
-    __device__ __forceinline__ KeyValuePair operator()(
-        const KeyValuePair &first,
-        const KeyValuePair &second)
-    {
-        KeyValuePair retval;
-
-        retval.value = (second.key != first.key) ?
-            second.value :                          // The second value is for a different ID, return only that value
-            op(first.value, second.value);          // The values are for the same ID so reduce them
-
-        retval.key = second.key;
-        return retval;
-    }
-};
-
-
-
-/**
- * Segmented scan operator
- */
-template <typename ReductionOp>     ///< Wrapped reduction operator type
-struct SegmentedOp
-{
-    ReductionOp op;                 ///< Wrapped reduction operator
-
-    /// Constructor
-    __device__ __forceinline__ SegmentedOp(ReductionOp op) : op(op) {}
-
-    /// Scan operator
-    template <typename KeyValuePair>
-    __device__ __forceinline__ KeyValuePair operator()(
-        const KeyValuePair &first,
-        const KeyValuePair &second)
-    {
-        if (second.key) {
-            KeyValuePair retval;
-            retval.value = second.value;
-            retval.key = first.key + second.key;
-            return retval;
-        } else {
-            KeyValuePair retval;
-            retval.value = op(first.value, second.value);
-            retval.key = first.key + second.key;
-            return retval;
-        }
-    }
-};
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-
-/******************************************************************************
- * Algorithmic variants
- ******************************************************************************/
-
-/**
- * \brief BlockScanAlgorithm enumerates alternative algorithms for cub::BlockScan to compute a parallel prefix scan across a CUDA thread block.
- */
-enum BlockScanAlgorithm
-{
-
-    /**
-     * \par Overview
-     * An efficient "raking reduce-then-scan" prefix scan algorithm.  Execution is comprised of five phases:
-     * -# Upsweep sequential reduction in registers (if threads contribute more than one input each).  Each thread then places the partial reduction of its item(s) into shared memory.
-     * -# Upsweep sequential reduction in shared memory.  Threads within a single warp rake across segments of shared partial reductions.
-     * -# A warp-synchronous Kogge-Stone style exclusive scan within the raking warp.
-     * -# Downsweep sequential exclusive scan in shared memory.  Threads within a single warp rake across segments of shared partial reductions, seeded with the warp-scan output.
-     * -# Downsweep sequential scan in registers (if threads contribute more than one input), seeded with the raking scan output.
-     *
-     * \par
-     * \image html block_scan_raking.png
-     * \p BLOCK_SCAN_RAKING data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.
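-     *
-     * \par
-     * For reference (an illustrative line, not from the original header), this
-     * variant is selected through the third template parameter of cub::BlockScan:
-     * \code
-     * // 128-thread block whose scans use the raking variant
-     * typedef cub::BlockScan<int, 128, cub::BLOCK_SCAN_RAKING> BlockScan;
-     * \endcode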
-     *
-     * \par Performance Considerations
-     * - Although this variant may suffer longer turnaround latencies when the
-     *   GPU is under-occupied, it can often provide higher overall throughput
-     *   across the GPU when suitably occupied.
-     */
-    BLOCK_SCAN_RAKING,
-
-
-    /**
-     * \par Overview
-     * Similar to cub::BLOCK_SCAN_RAKING, but with fewer shared memory reads at
-     * the expense of higher register pressure.  Raking threads preserve their
-     * "upsweep" segment of values in registers while performing warp-synchronous
-     * scan, allowing the "downsweep" not to re-read them from shared memory.
-     */
-    BLOCK_SCAN_RAKING_MEMOIZE,
-
-
-    /**
-     * \par Overview
-     * A quick "tiled warpscans" prefix scan algorithm.  Execution is comprised of four phases:
-     * -# Upsweep sequential reduction in registers (if threads contribute more than one input each).  Each thread then places the partial reduction of its item(s) into shared memory.
-     * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style scan within each warp.
-     * -# A propagation phase where the warp scan outputs in each warp are updated with the aggregate from each preceding warp.
-     * -# Downsweep sequential scan in registers (if threads contribute more than one input), seeded with the raking scan output.
-     *
-     * \par
-     * \image html block_scan_warpscans.png
-     * \p BLOCK_SCAN_WARP_SCANS data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.
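-     *
-     * \par
-     * An illustrative specialization (not from the original header) selecting
-     * this variant; as the \p SAFE_ALGORITHM logic below documents, block sizes
-     * that are not a multiple of the architectural warp size fall back to
-     * BLOCK_SCAN_RAKING:
-     * \code
-     * typedef cub::BlockScan<int, 128, cub::BLOCK_SCAN_WARP_SCANS> BlockScan;
-     * \endcode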
-     *
-     * \par Performance Considerations
-     * - Although this variant may suffer lower overall throughput across the
-     *   GPU because of a heavy reliance on inefficient warpscans, it can
-     *   often provide lower turnaround latencies when the GPU is under-occupied.
-     */
-    BLOCK_SCAN_WARP_SCANS,
-};
-
-
-/******************************************************************************
- * Block scan
- ******************************************************************************/
-
-/**
- * \brief The BlockScan class provides [collective](index.html#sec0) methods for computing a parallel prefix sum/scan of items partitioned across a CUDA thread block. ![](block_scan_logo.png)
- * \ingroup BlockModule
- *
- * \tparam T                Data type being scanned
- * \tparam BLOCK_DIM_X      The thread block length in threads along the X dimension
- * \tparam ALGORITHM        [optional] cub::BlockScanAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_SCAN_RAKING)
- * \tparam BLOCK_DIM_Y      [optional] The thread block length in threads along the Y dimension (default: 1)
- * \tparam BLOCK_DIM_Z      [optional] The thread block length in threads along the Z dimension (default: 1)
- * \tparam PTX_ARCH         [optional] \ptxversion
- *
- * \par Overview
- * - Given a list of input elements and a binary reduction operator, a [prefix scan](http://en.wikipedia.org/wiki/Prefix_sum)
- *   produces an output list where each element is computed to be the reduction
- *   of the elements occurring earlier in the input list.  Prefix sum
- *   connotes a prefix scan with the addition operator.  The term \em inclusive indicates
- *   that the ith output reduction incorporates the ith input.
- *   The term \em exclusive indicates the ith input is not incorporated into
- *   the ith output reduction.
- * - \rowmajor
- * - BlockScan can be optionally specialized by algorithm to accommodate different workload profiles:
- *   -# cub::BLOCK_SCAN_RAKING.  An efficient (high throughput) "raking reduce-then-scan" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm)
- *   -# cub::BLOCK_SCAN_RAKING_MEMOIZE.  Similar to cub::BLOCK_SCAN_RAKING, but having higher throughput at the expense of additional register pressure for intermediate storage. [More...](\ref cub::BlockScanAlgorithm)
- *   -# cub::BLOCK_SCAN_WARP_SCANS.  A quick (low latency) "tiled warpscans" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm)
- *
- * \par Performance Considerations
- * - \granularity
- * - Uses special instructions when applicable (e.g., warp \p SHFL)
- * - Uses synchronization-free communication between warp lanes when applicable
- * - Invokes a minimal number of minimal block-wide synchronization barriers (only
- *   one or two depending on algorithm selection)
- * - Incurs zero bank conflicts for most types
- * - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
- *   - Prefix sum variants (vs. generic scan)
- *   - \blocksize
- * - See cub::BlockScanAlgorithm for performance details regarding algorithmic alternatives
- *
- * \par A Simple Example
- * \blockcollective{BlockScan}
- * \par
- * The code snippet below illustrates an exclusive prefix sum of 512 integer items that
- * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads
- * where each thread owns 4 consecutive items.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
- *
- * __global__ void ExampleKernel(...)
- * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Collectively compute the block-wide exclusive prefix sum - * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is - * {[1,1,1,1], [1,1,1,1], ..., [1,1,1,1]}. - * The corresponding output \p thread_data in those threads will be - * {[0,1,2,3], [4,5,6,7], ..., [508,509,510,511]}. - * - */ -template < - typename T, - int BLOCK_DIM_X, - BlockScanAlgorithm ALGORITHM = BLOCK_SCAN_RAKING, - int BLOCK_DIM_Y = 1, - int BLOCK_DIM_Z = 1, - int PTX_ARCH = CUB_PTX_ARCH> -class BlockScan -{ -private: - - /****************************************************************************** - * Constants and type definitions - ******************************************************************************/ - - /// Constants - enum - { - /// The thread block size in threads - BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, - }; - - /** - * Ensure the template parameterization meets the requirements of the - * specified algorithm. Currently, the BLOCK_SCAN_WARP_SCANS policy - * cannot be used with threadblock sizes not a multiple of the - * architectural warp size. - */ - static const BlockScanAlgorithm SAFE_ALGORITHM = - ((ALGORITHM == BLOCK_SCAN_WARP_SCANS) && (BLOCK_THREADS % CUB_WARP_THREADS(PTX_ARCH) != 0)) ? - BLOCK_SCAN_RAKING : - ALGORITHM; - - typedef BlockScanWarpScans WarpScans; - typedef BlockScanRaking Raking; - - /// Define the delegate type for the desired algorithm - typedef typename If<(SAFE_ALGORITHM == BLOCK_SCAN_WARP_SCANS), - WarpScans, - Raking>::Type InternalBlockScan; - - /// Shared memory storage layout type for BlockScan - typedef typename InternalBlockScan::TempStorage _TempStorage; - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - /// Shared storage reference - _TempStorage &temp_storage; - - /// Linear thread-id - int linear_tid; - - - /****************************************************************************** - * Utility methods - ******************************************************************************/ - - /// Internal storage allocator - __device__ __forceinline__ _TempStorage& PrivateStorage() - { - __shared__ _TempStorage private_storage; - return private_storage; - } - - -public: - - /// \smemstorage{BlockScan} - struct TempStorage : Uninitialized<_TempStorage> {}; - - - /******************************************************************//** - * \name Collective constructors - *********************************************************************/ - //@{ - - /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. - */ - __device__ __forceinline__ BlockScan() - : - temp_storage(PrivateStorage()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. 
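- *
- * \par
- * As a sketch (illustrative, not from the original header; the union layout is
- * an assumption), passing the allocation explicitly lets it live in a union
- * whose storage is reused by other collectives in the same kernel, with a
- * __syncthreads() between uses:
- * \code
- * __shared__ union {
- *     typename cub::BlockScan<int, 128>::TempStorage   scan;
- *     typename cub::BlockReduce<int, 128>::TempStorage reduce;
- * } temp_storage;
- *
- * cub::BlockScan<int, 128> block_scan(temp_storage.scan);  // uses the caller's allocation
- * \endcode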
- */ - __device__ __forceinline__ BlockScan( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - - - - - //@} end member group - /******************************************************************//** - * \name Exclusive prefix sum operations - *********************************************************************/ - //@{ - - - /** - * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. - * - * \par - * - \rowmajor - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates an exclusive prefix sum of 128 integer items that - * are partitioned across 128 threads. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain input item for each thread - * int thread_data; - * ... - * - * // Collectively compute the block-wide exclusive prefix sum - * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The - * corresponding output \p thread_data in those threads will be 0, 1, ..., 127. - * - */ - __device__ __forceinline__ void ExclusiveSum( - T input, ///< [in] Calling thread's input item - T &output) ///< [out] Calling thread's output item (may be aliased to \p input) - { - T block_aggregate; - InternalBlockScan(temp_storage).ExclusiveSum(input, output, block_aggregate); - } - - - /** - * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - \rowmajor - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates an exclusive prefix sum of 128 integer items that - * are partitioned across 128 threads. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain input item for each thread - * int thread_data; - * ... - * - * // Collectively compute the block-wide exclusive prefix sum - * int block_aggregate; - * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The - * corresponding output \p thread_data in those threads will be 0, 1, ..., 127. - * Furthermore the value \p 128 will be stored in \p block_aggregate for all threads. 
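- *
- * \par
- * Because every thread receives \p block_aggregate, a common follow-up (an
- * illustrative sketch; \p d_block_totals is an assumed output array) is to
- * have one thread publish the block total:
- * \code
- * int block_aggregate;
- * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate);
- * if (threadIdx.x == 0)
- *     d_block_totals[blockIdx.x] = block_aggregate;
- * \endcode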
- * - */ - __device__ __forceinline__ void ExclusiveSum( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items - { - InternalBlockScan(temp_storage).ExclusiveSum(input, output, block_aggregate); - } - - - /** - * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the first warp of threads in the block, however only the return value from - * lane0 is applied as the block-wide prefix. Can be stateful. - * - \rowmajor - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a single thread block that progressively - * computes an exclusive prefix sum over multiple "tiles" of input using a - * prefix functor to maintain a running total between block-wide scans. Each tile consists - * of 128 integer items that are partitioned across 128 threads. - * \par - * \code - * #include // or equivalently - * - * // A stateful callback functor that maintains a running prefix to be applied - * // during consecutive scan operations. - * struct BlockPrefixCallbackOp - * { - * // Running prefix - * int running_total; - * - * // Constructor - * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} - * - * // Callback operator to be entered by the first warp of threads in the block. - * // Thread-0 is responsible for returning a value for seeding the block-wide scan. - * __device__ int operator()(int block_aggregate) - * { - * int old_prefix = running_total; - * running_total += block_aggregate; - * return old_prefix; - * } - * }; - * - * __global__ void ExampleKernel(int *d_data, int num_items, ...) - * { - * // Specialize BlockScan for a 1D block of 128 threads - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Initialize running total - * BlockPrefixCallbackOp prefix_op(0); - * - * // Have the block iterate over segments of items - * for (int block_offset = 0; block_offset < num_items; block_offset += 128) - * { - * // Load a segment of consecutive items that are blocked across threads - * int thread_data = d_data[block_offset]; - * - * // Collectively compute the block-wide exclusive prefix sum - * int block_aggregate; - * BlockScan(temp_storage).ExclusiveSum( - * thread_data, thread_data, block_aggregate, prefix_op); - * __syncthreads(); - * - * // Store scanned items to output segment - * d_data[block_offset] = thread_data; - * } - * \endcode - * \par - * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... - * The corresponding output for the first segment will be 0, 1, ..., 127. - * The output for the second segment will be 128, 129, ..., 255. 
Furthermore, - * the value \p 128 will be stored in \p block_aggregate for all threads after each scan. - * - * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) - */ - template - __device__ __forceinline__ void ExclusiveSum( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - T &block_aggregate, ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value) - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to all inputs. - { - InternalBlockScan(temp_storage).ExclusiveSum(input, output, block_aggregate, block_prefix_callback_op); - } - - - //@} end member group - /******************************************************************//** - * \name Exclusive prefix sum operations (multiple data per thread) - *********************************************************************/ - //@{ - - - /** - * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. - * - * \par - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates an exclusive prefix sum of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Collectively compute the block-wide exclusive prefix sum - * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The - * corresponding output \p thread_data in those threads will be { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - */ - template - __device__ __forceinline__ void ExclusiveSum( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD]) ///< [out] Calling thread's output items (may be aliased to \p input) - { - // Reduce consecutive thread items in registers - Sum scan_op; - T thread_partial = ThreadReduce(input, scan_op); - - // Exclusive threadblock-scan - ExclusiveSum(thread_partial, thread_partial); - - // Exclusive scan in registers with prefix - ThreadScanExclusive(input, output, scan_op, thread_partial); - } - - - /** - * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
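- *
- * \par
- * (Editor's note, not from the original header.) With multiple items per
- * thread, the corresponding inclusive results can be recovered from the
- * exclusive outputs by adding back each thread's own inputs, e.g.:
- * \code
- * int items[4], exclusive[4], block_aggregate;
- * ...
- * BlockScan(temp_storage).ExclusiveSum(items, exclusive, block_aggregate);
- * int inclusive_last = exclusive[3] + items[3];   // inclusive prefix at this thread's last item
- * \endcode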
- * - * \par - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates an exclusive prefix sum of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Collectively compute the block-wide exclusive prefix sum - * int block_aggregate; - * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The - * corresponding output \p thread_data in those threads will be { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. - * Furthermore the value \p 512 will be stored in \p block_aggregate for all threads. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - */ - template - __device__ __forceinline__ void ExclusiveSum( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items - { - // Reduce consecutive thread items in registers - Sum scan_op; - T thread_partial = ThreadReduce(input, scan_op); - - // Exclusive threadblock-scan - ExclusiveSum(thread_partial, thread_partial, block_aggregate); - - // Exclusive scan in registers with prefix - ThreadScanExclusive(input, output, scan_op, thread_partial); - } - - - /** - * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the first warp of threads in the block, however only the return value from - * lane0 is applied as the block-wide prefix. Can be stateful. - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a single thread block that progressively - * computes an exclusive prefix sum over multiple "tiles" of input using a - * prefix functor to maintain a running total between block-wide scans. Each tile consists - * of 512 integer items that are partitioned in a [blocked arrangement](index.html#sec5sec3) - * across 128 threads where each thread owns 4 consecutive items. 
- * \par - * \code - * #include // or equivalently - * - * // A stateful callback functor that maintains a running prefix to be applied - * // during consecutive scan operations. - * struct BlockPrefixCallbackOp - * { - * // Running prefix - * int running_total; - * - * // Constructor - * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} - * - * // Callback operator to be entered by the first warp of threads in the block. - * // Thread-0 is responsible for returning a value for seeding the block-wide scan. - * __device__ int operator()(int block_aggregate) - * { - * int old_prefix = running_total; - * running_total += block_aggregate; - * return old_prefix; - * } - * }; - * - * __global__ void ExampleKernel(int *d_data, int num_items, ...) - * { - * // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread - * typedef cub::BlockLoad BlockLoad; - * typedef cub::BlockStore BlockStore; - * typedef cub::BlockScan BlockScan; - * - * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan - * __shared__ union { - * typename BlockLoad::TempStorage load; - * typename BlockScan::TempStorage scan; - * typename BlockStore::TempStorage store; - * } temp_storage; - * - * // Initialize running total - * BlockPrefixCallbackOp prefix_op(0); - * - * // Have the block iterate over segments of items - * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4) - * { - * // Load a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); - * __syncthreads(); - * - * // Collectively compute the block-wide exclusive prefix sum - * int block_aggregate; - * BlockScan(temp_storage.scan).ExclusiveSum( - * thread_data, thread_data, block_aggregate, prefix_op); - * __syncthreads(); - * - * // Store scanned items to output segment - * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); - * __syncthreads(); - * } - * \endcode - * \par - * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... - * The corresponding output for the first segment will be 0, 1, 2, 3, ..., 510, 511. - * The output for the second segment will be 512, 513, 514, 515, ..., 1022, 1023. Furthermore, - * the value \p 512 will be stored in \p block_aggregate for all threads after each scan. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) - */ - template < - int ITEMS_PER_THREAD, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void ExclusiveSum( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - T &block_aggregate, ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value) - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to all inputs. 
- { - // Reduce consecutive thread items in registers - Sum scan_op; - T thread_partial = ThreadReduce(input, scan_op); - - // Exclusive threadblock-scan - ExclusiveSum(thread_partial, thread_partial, block_aggregate, block_prefix_callback_op); - - // Exclusive scan in registers with prefix - ThreadScanExclusive(input, output, scan_op, thread_partial); - } - - - - //@} end member group // Inclusive prefix sums - /******************************************************************//** - * \name Exclusive prefix scan operations - *********************************************************************/ - //@{ - - - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. - * - * \par - * - Supports non-commutative scan operators. - * - \rowmajor - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates an exclusive prefix max scan of 128 integer items that - * are partitioned across 128 threads. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain input item for each thread - * int thread_data; - * ... - * - * // Collectively compute the block-wide exclusive prefix max scan - * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The - * corresponding output \p thread_data in those threads will be INT_MIN, 0, 0, 2, ..., 124, 126. - * - * \tparam ScanOp [inferred] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - T identity, ///< [in] Identity value - ScanOp scan_op) ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) - { - T block_aggregate; - InternalBlockScan(temp_storage).ExclusiveScan(input, output, identity, scan_op, block_aggregate); - } - - - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - Supports non-commutative scan operators. - * - \rowmajor - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates an exclusive prefix max scan of 128 integer items that - * are partitioned across 128 threads. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain input item for each thread - * int thread_data; - * ... 
- * - * // Collectively compute the block-wide exclusive prefix max scan - * int block_aggregate; - * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The - * corresponding output \p thread_data in those threads will be INT_MIN, 0, 0, 2, ..., 124, 126. - * Furthermore the value \p 126 will be stored in \p block_aggregate for all threads. - * - * \tparam ScanOp [inferred] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input items - T &output, ///< [out] Calling thread's output items (may be aliased to \p input) - T identity, ///< [in] Identity value - ScanOp scan_op, ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items - { - InternalBlockScan(temp_storage).ExclusiveScan(input, output, identity, scan_op, block_aggregate); - } - - - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the first warp of threads in the block, however only the return value from - * lane0 is applied as the block-wide prefix. Can be stateful. - * - Supports non-commutative scan operators. - * - \rowmajor - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a single thread block that progressively - * computes an exclusive prefix max scan over multiple "tiles" of input using a - * prefix functor to maintain a running total between block-wide scans. Each tile consists - * of 128 integer items that are partitioned across 128 threads. - * \par - * \code - * #include // or equivalently - * - * // A stateful callback functor that maintains a running prefix to be applied - * // during consecutive scan operations. - * struct BlockPrefixCallbackOp - * { - * // Running prefix - * int running_total; - * - * // Constructor - * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} - * - * // Callback operator to be entered by the first warp of threads in the block. - * // Thread-0 is responsible for returning a value for seeding the block-wide scan. - * __device__ int operator()(int block_aggregate) - * { - * int old_prefix = running_total; - * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; - * return old_prefix; - * } - * }; - * - * __global__ void ExampleKernel(int *d_data, int num_items, ...) 
- * { - * // Specialize BlockScan for a 1D block of 128 threads - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Initialize running total - * BlockPrefixCallbackOp prefix_op(INT_MIN); - * - * // Have the block iterate over segments of items - * for (int block_offset = 0; block_offset < num_items; block_offset += 128) - * { - * // Load a segment of consecutive items that are blocked across threads - * int thread_data = d_data[block_offset]; - * - * // Collectively compute the block-wide exclusive prefix max scan - * int block_aggregate; - * BlockScan(temp_storage).ExclusiveScan( - * thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate, prefix_op); - * __syncthreads(); - * - * // Store scanned items to output segment - * d_data[block_offset] = thread_data; - * } - * \endcode - * \par - * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... - * The corresponding output for the first segment will be INT_MIN, 0, 0, 2, ..., 124, 126. - * The output for the second segment will be 126, 128, 128, 130, ..., 252, 254. Furthermore, - * \p block_aggregate will be assigned \p 126 in all threads after the first scan, assigned \p 254 after the second - * scan, etc. - * - * \tparam ScanOp [inferred] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member T operator()(const T &a, const T &b) - * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) - */ - template < - typename ScanOp, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - T identity, ///< [in] Identity value - ScanOp scan_op, ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) - T &block_aggregate, ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value) - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to all inputs. - { - InternalBlockScan(temp_storage).ExclusiveScan(input, output, identity, scan_op, block_aggregate, block_prefix_callback_op); - } - - - //@} end member group // Inclusive prefix sums - /******************************************************************//** - * \name Exclusive prefix scan operations (multiple data per thread) - *********************************************************************/ - //@{ - - - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. - * - * \par - * - Supports non-commutative scan operators. - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates an exclusive prefix max scan of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) 
- * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Collectively compute the block-wide exclusive prefix max scan - * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is - * { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. - * The corresponding output \p thread_data in those threads will be - * { [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam ScanOp [inferred] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member T operator()(const T &a, const T &b) - */ - template < - int ITEMS_PER_THREAD, - typename ScanOp> - __device__ __forceinline__ void ExclusiveScan( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - T identity, ///< [in] Identity value - ScanOp scan_op) ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) - { - // Reduce consecutive thread items in registers - T thread_partial = ThreadReduce(input, scan_op); - - // Exclusive threadblock-scan - ExclusiveScan(thread_partial, thread_partial, identity, scan_op); - - // Exclusive scan in registers with prefix - ThreadScanExclusive(input, output, scan_op, thread_partial); - } - - - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - Supports non-commutative scan operators. - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates an exclusive prefix max scan of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Collectively compute the block-wide exclusive prefix max scan - * int block_aggregate; - * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. The - * corresponding output \p thread_data in those threads will be { [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }. - * Furthermore the value \p 510 will be stored in \p block_aggregate for all threads. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
- * \tparam ScanOp [inferred] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member T operator()(const T &a, const T &b) - */ - template < - int ITEMS_PER_THREAD, - typename ScanOp> - __device__ __forceinline__ void ExclusiveScan( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - T identity, ///< [in] Identity value - ScanOp scan_op, ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items - { - // Reduce consecutive thread items in registers - T thread_partial = ThreadReduce(input, scan_op); - - // Exclusive threadblock-scan - ExclusiveScan(thread_partial, thread_partial, identity, scan_op, block_aggregate); - - // Exclusive scan in registers with prefix - ThreadScanExclusive(input, output, scan_op, thread_partial); - } - - - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the first warp of threads in the block, however only the return value from - * lane0 is applied as the block-wide prefix. Can be stateful. - * - Supports non-commutative scan operators. - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a single thread block that progressively - * computes an exclusive prefix max scan over multiple "tiles" of input using a - * prefix functor to maintain a running total between block-wide scans. Each tile consists - * of 128 integer items that are partitioned across 128 threads. - * \par - * \code - * #include // or equivalently - * - * // A stateful callback functor that maintains a running prefix to be applied - * // during consecutive scan operations. - * struct BlockPrefixCallbackOp - * { - * // Running prefix - * int running_total; - * - * // Constructor - * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} - * - * // Callback operator to be entered by the first warp of threads in the block. - * // Thread-0 is responsible for returning a value for seeding the block-wide scan. - * __device__ int operator()(int block_aggregate) - * { - * int old_prefix = running_total; - * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; - * return old_prefix; - * } - * }; - * - * __global__ void ExampleKernel(int *d_data, int num_items, ...) 
- * { - * // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread - * typedef cub::BlockLoad BlockLoad; - * typedef cub::BlockStore BlockStore; - * typedef cub::BlockScan BlockScan; - * - * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan - * __shared__ union { - * typename BlockLoad::TempStorage load; - * typename BlockScan::TempStorage scan; - * typename BlockStore::TempStorage store; - * } temp_storage; - * - * // Initialize running total - * BlockPrefixCallbackOp prefix_op(0); - * - * // Have the block iterate over segments of items - * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4) - * { - * // Load a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); - * __syncthreads(); - * - * // Collectively compute the block-wide exclusive prefix max scan - * int block_aggregate; - * BlockScan(temp_storage.scan).ExclusiveScan( - * thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate, prefix_op); - * __syncthreads(); - * - * // Store scanned items to output segment - * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); - * __syncthreads(); - * } - * \endcode - * \par - * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... - * The corresponding output for the first segment will be INT_MIN, 0, 0, 2, 2, 4, ..., 508, 510. - * The output for the second segment will be 510, 512, 512, 514, 514, 516, ..., 1020, 1022. Furthermore, - * \p block_aggregate will be assigned \p 510 in all threads after the first scan, assigned \p 1022 after the second - * scan, etc. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam ScanOp [inferred] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member T operator()(const T &a, const T &b) - * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) - */ - template < - int ITEMS_PER_THREAD, - typename ScanOp, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void ExclusiveScan( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - T identity, ///< [in] Identity value - ScanOp scan_op, ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) - T &block_aggregate, ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value) - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to all inputs. 
- { - // Reduce consecutive thread items in registers - T thread_partial = ThreadReduce(input, scan_op); - - // Exclusive threadblock-scan - ExclusiveScan(thread_partial, thread_partial, identity, scan_op, block_aggregate, block_prefix_callback_op); - - // Exclusive scan in registers with prefix - ThreadScanExclusive(input, output, scan_op, thread_partial); - } - - - //@} end member group - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - /******************************************************************//** - * \name Exclusive prefix scan operations (identityless, single datum per thread) - *********************************************************************/ - //@{ - - - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. With no identity value, the output computed for thread0 is undefined. - * - * \par - * - Supports non-commutative scan operators. - * - \rowmajor - * - \smemreuse - * - * \tparam ScanOp [inferred] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) - { - T block_aggregate; - InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_aggregate); - } - - - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no identity value, the output computed for thread0 is undefined. - * - * \par - * - Supports non-commutative scan operators. - * - \rowmajor - * - \smemreuse - * - * \tparam ScanOp [inferred] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items - { - InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_aggregate); - } - - - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the first warp of threads in the block, however only the return value from - * lane0 is applied as the block-wide prefix. Can be stateful. - * - * \par - * - Supports non-commutative scan operators. 
- * - \rowmajor - * - \smemreuse - * - * \tparam ScanOp [inferred] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member T operator()(const T &a, const T &b) - * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) - */ - template < - typename ScanOp, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) - T &block_aggregate, ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value) - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to all inputs. - { - InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_aggregate, block_prefix_callback_op); - } - - - //@} end member group - /******************************************************************//** - * \name Exclusive prefix scan operations (identityless, multiple data per thread) - *********************************************************************/ - //@{ - - - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. With no identity value, the output computed for thread0 is undefined. - * - * \par - * - Supports non-commutative scan operators. - * - \blocked - * - \granularity - * - \smemreuse - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam ScanOp [inferred] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member T operator()(const T &a, const T &b) - */ - template < - int ITEMS_PER_THREAD, - typename ScanOp> - __device__ __forceinline__ void ExclusiveScan( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) - { - // Reduce consecutive thread items in registers - T thread_partial = ThreadReduce(input, scan_op); - - // Exclusive threadblock-scan - ExclusiveScan(thread_partial, thread_partial, scan_op); - - // Exclusive scan in registers with prefix - ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0)); - } - - - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no identity value, the output computed for thread0 is undefined. - * - * \par - * - Supports non-commutative scan operators. - * - \blocked - * - \granularity - * - \smemreuse - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam ScanOp [inferred] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) 
type having member T operator()(const T &a, const T &b) - */ - template < - int ITEMS_PER_THREAD, - typename ScanOp> - __device__ __forceinline__ void ExclusiveScan( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items - { - // Reduce consecutive thread items in registers - T thread_partial = ThreadReduce(input, scan_op); - - // Exclusive threadblock-scan - ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate); - - // Exclusive scan in registers with prefix - ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0)); - } - - - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the first warp of threads in the block, however only the return value from - * lane0 is applied as the block-wide prefix. Can be stateful. - * - * \par - * - Supports non-commutative scan operators. - * - \blocked - * - \granularity - * - \smemreuse - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam ScanOp [inferred] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member T operator()(const T &a, const T &b) - * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) - */ - template < - int ITEMS_PER_THREAD, - typename ScanOp, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void ExclusiveScan( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) - T &block_aggregate, ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value) - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to all inputs. 
- { - // Reduce consecutive thread items in registers - T thread_partial = ThreadReduce(input, scan_op); - - // Exclusive threadblock-scan - ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate, block_prefix_callback_op); - - // Exclusive scan in registers with prefix - ThreadScanExclusive(input, output, scan_op, thread_partial); - } - - - //@} end member group - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - /******************************************************************//** - * \name Inclusive prefix sum operations - *********************************************************************/ - //@{ - - - /** - * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. - * - * \par - * - \rowmajor - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates an inclusive prefix sum of 128 integer items that - * are partitioned across 128 threads. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain input item for each thread - * int thread_data; - * ... - * - * // Collectively compute the block-wide inclusive prefix sum - * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The - * corresponding output \p thread_data in those threads will be 1, 2, ..., 128. - * - */ - __device__ __forceinline__ void InclusiveSum( - T input, ///< [in] Calling thread's input item - T &output) ///< [out] Calling thread's output item (may be aliased to \p input) - { - T block_aggregate; - InternalBlockScan(temp_storage).InclusiveSum(input, output, block_aggregate); - } - - - /** - * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - \rowmajor - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates an inclusive prefix sum of 128 integer items that - * are partitioned across 128 threads. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain input item for each thread - * int thread_data; - * ... - * - * // Collectively compute the block-wide inclusive prefix sum - * int block_aggregate; - * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The - * corresponding output \p thread_data in those threads will be 1, 2, ..., 128. - * Furthermore the value \p 128 will be stored in \p block_aggregate for all threads. 
- * - */ - __device__ __forceinline__ void InclusiveSum( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items - { - InternalBlockScan(temp_storage).InclusiveSum(input, output, block_aggregate); - } - - - - /** - * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the first warp of threads in the block, however only the return value from - * lane0 is applied as the block-wide prefix. Can be stateful. - * - \rowmajor - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a single thread block that progressively - * computes an inclusive prefix sum over multiple "tiles" of input using a - * prefix functor to maintain a running total between block-wide scans. Each tile consists - * of 128 integer items that are partitioned across 128 threads. - * \par - * \code - * #include // or equivalently - * - * // A stateful callback functor that maintains a running prefix to be applied - * // during consecutive scan operations. - * struct BlockPrefixCallbackOp - * { - * // Running prefix - * int running_total; - * - * // Constructor - * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} - * - * // Callback operator to be entered by the first warp of threads in the block. - * // Thread-0 is responsible for returning a value for seeding the block-wide scan. - * __device__ int operator()(int block_aggregate) - * { - * int old_prefix = running_total; - * running_total += block_aggregate; - * return old_prefix; - * } - * }; - * - * __global__ void ExampleKernel(int *d_data, int num_items, ...) - * { - * // Specialize BlockScan for a 1D block of 128 threads - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Initialize running total - * BlockPrefixCallbackOp prefix_op(0); - * - * // Have the block iterate over segments of items - * for (int block_offset = 0; block_offset < num_items; block_offset += 128) - * { - * // Load a segment of consecutive items that are blocked across threads - * int thread_data = d_data[block_offset]; - * - * // Collectively compute the block-wide inclusive prefix sum - * int block_aggregate; - * BlockScan(temp_storage).InclusiveSum( - * thread_data, thread_data, block_aggregate, prefix_op); - * __syncthreads(); - * - * // Store scanned items to output segment - * d_data[block_offset] = thread_data; - * } - * \endcode - * \par - * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... - * The corresponding output for the first segment will be 1, 2, ..., 128. - * The output for the second segment will be 129, 130, ..., 256. 
Furthermore, - * the value \p 128 will be stored in \p block_aggregate for all threads after each scan. - * - * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) - */ - template - __device__ __forceinline__ void InclusiveSum( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - T &block_aggregate, ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value) - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to all inputs. - { - InternalBlockScan(temp_storage).InclusiveSum(input, output, block_aggregate, block_prefix_callback_op); - } - - - //@} end member group - /******************************************************************//** - * \name Inclusive prefix sum operations (multiple data per thread) - *********************************************************************/ - //@{ - - - /** - * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. - * - * \par - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates an inclusive prefix sum of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Collectively compute the block-wide inclusive prefix sum - * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The - * corresponding output \p thread_data in those threads will be { [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - */ - template - __device__ __forceinline__ void InclusiveSum( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD]) ///< [out] Calling thread's output items (may be aliased to \p input) - { - if (ITEMS_PER_THREAD == 1) - { - InclusiveSum(input[0], output[0]); - } - else - { - // Reduce consecutive thread items in registers - Sum scan_op; - T thread_partial = ThreadReduce(input, scan_op); - - // Exclusive threadblock-scan - ExclusiveSum(thread_partial, thread_partial); - - // Inclusive scan in registers with prefix - ThreadScanInclusive(input, output, scan_op, thread_partial, (linear_tid != 0)); - } - } - - - /** - * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
- * - * \par - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates an inclusive prefix sum of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Collectively compute the block-wide inclusive prefix sum - * int block_aggregate; - * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is - * { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The - * corresponding output \p thread_data in those threads will be - * { [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }. - * Furthermore the value \p 512 will be stored in \p block_aggregate for all threads. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam ScanOp [inferred] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void InclusiveSum( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items - { - if (ITEMS_PER_THREAD == 1) - { - InclusiveSum(input[0], output[0], block_aggregate); - } - else - { - // Reduce consecutive thread items in registers - Sum scan_op; - T thread_partial = ThreadReduce(input, scan_op); - - // Exclusive threadblock-scan - ExclusiveSum(thread_partial, thread_partial, block_aggregate); - - // Inclusive scan in registers with prefix - ThreadScanInclusive(input, output, scan_op, thread_partial, (linear_tid != 0)); - } - } - - - /** - * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the first warp of threads in the block, however only the return value from - * lane0 is applied as the block-wide prefix. Can be stateful. - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a single thread block that progressively - * computes an inclusive prefix sum over multiple "tiles" of input using a - * prefix functor to maintain a running total between block-wide scans. 
Each tile consists
- * of 512 integer items that are partitioned in a [blocked arrangement](index.html#sec5sec3)
- * across 128 threads where each thread owns 4 consecutive items.
- * \par
- * \code
- * #include // or equivalently
- *
- * // A stateful callback functor that maintains a running prefix to be applied
- * // during consecutive scan operations.
- * struct BlockPrefixCallbackOp
- * {
- * // Running prefix
- * int running_total;
- *
- * // Constructor
- * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
- *
- * // Callback operator to be entered by the first warp of threads in the block.
- * // Thread-0 is responsible for returning a value for seeding the block-wide scan.
- * __device__ int operator()(int block_aggregate)
- * {
- * int old_prefix = running_total;
- * running_total += block_aggregate;
- * return old_prefix;
- * }
- * };
- *
- * __global__ void ExampleKernel(int *d_data, int num_items, ...)
- * {
- * // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread
- * typedef cub::BlockLoad BlockLoad;
- * typedef cub::BlockStore BlockStore;
- * typedef cub::BlockScan BlockScan;
- *
- * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
- * __shared__ union {
- * typename BlockLoad::TempStorage load;
- * typename BlockScan::TempStorage scan;
- * typename BlockStore::TempStorage store;
- * } temp_storage;
- *
- * // Initialize running total
- * BlockPrefixCallbackOp prefix_op(0);
- *
- * // Have the block iterate over segments of items
- * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
- * {
- * // Load a segment of consecutive items that are blocked across threads
- * int thread_data[4];
- * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
- * __syncthreads();
- *
- * // Collectively compute the block-wide inclusive prefix sum
- * int block_aggregate;
- * BlockScan(temp_storage.scan).InclusiveSum(
- * thread_data, thread_data, block_aggregate, prefix_op);
- * __syncthreads();
- *
- * // Store scanned items to output segment
- * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
- * __syncthreads();
- * }
- * \endcode
- * \par
- * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, ....
- * The corresponding output for the first segment will be 1, 2, 3, 4, ..., 511, 512.
- * The output for the second segment will be 513, 514, 515, 516, ..., 1023, 1024. Furthermore,
- * the value \p 512 will be stored in \p block_aggregate for all threads after each scan.
- *
- * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
- * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate)
- */
- template <
- int ITEMS_PER_THREAD,
- typename BlockPrefixCallbackOp>
- __device__ __forceinline__ void InclusiveSum(
- T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
- T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input)
- T &block_aggregate, ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value)
- BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to all inputs.
- { - if (ITEMS_PER_THREAD == 1) - { - InclusiveSum(input[0], output[0], block_aggregate, block_prefix_callback_op); - } - else - { - // Reduce consecutive thread items in registers - Sum scan_op; - T thread_partial = ThreadReduce(input, scan_op); - - // Exclusive threadblock-scan - ExclusiveSum(thread_partial, thread_partial, block_aggregate, block_prefix_callback_op); - - // Inclusive scan in registers with prefix - ThreadScanInclusive(input, output, scan_op, thread_partial); - } - } - - - //@} end member group - /******************************************************************//** - * \name Inclusive prefix scan operations - *********************************************************************/ - //@{ - - - /** - * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. - * - * \par - * - Supports non-commutative scan operators. - * - \rowmajor - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates an inclusive prefix max scan of 128 integer items that - * are partitioned across 128 threads. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain input item for each thread - * int thread_data; - * ... - * - * // Collectively compute the block-wide inclusive prefix max scan - * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The - * corresponding output \p thread_data in those threads will be 0, 0, 2, 2, ..., 126, 126. - * - * \tparam ScanOp [inferred] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) - { - T block_aggregate; - InclusiveScan(input, output, scan_op, block_aggregate); - } - - - /** - * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - Supports non-commutative scan operators. - * - \rowmajor - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates an inclusive prefix max scan of 128 integer items that - * are partitioned across 128 threads. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain input item for each thread - * int thread_data; - * ... 
- * - * // Collectively compute the block-wide inclusive prefix max scan - * int block_aggregate; - * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The - * corresponding output \p thread_data in those threads will be 0, 0, 2, 2, ..., 126, 126. - * Furthermore the value \p 126 will be stored in \p block_aggregate for all threads. - * - * \tparam ScanOp [inferred] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items - { - InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_aggregate); - } - - - /** - * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the first warp of threads in the block, however only the return value from - * lane0 is applied as the block-wide prefix. Can be stateful. - * - Supports non-commutative scan operators. - * - \rowmajor - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a single thread block that progressively - * computes an inclusive prefix max scan over multiple "tiles" of input using a - * prefix functor to maintain a running total between block-wide scans. Each tile consists - * of 128 integer items that are partitioned across 128 threads. - * \par - * \code - * #include // or equivalently - * - * // A stateful callback functor that maintains a running prefix to be applied - * // during consecutive scan operations. - * struct BlockPrefixCallbackOp - * { - * // Running prefix - * int running_total; - * - * // Constructor - * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} - * - * // Callback operator to be entered by the first warp of threads in the block. - * // Thread-0 is responsible for returning a value for seeding the block-wide scan. - * __device__ int operator()(int block_aggregate) - * { - * int old_prefix = running_total; - * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; - * return old_prefix; - * } - * }; - * - * __global__ void ExampleKernel(int *d_data, int num_items, ...) 
- * { - * // Specialize BlockScan for a 1D block of 128 threads - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Initialize running total - * BlockPrefixCallbackOp prefix_op(INT_MIN); - * - * // Have the block iterate over segments of items - * for (int block_offset = 0; block_offset < num_items; block_offset += 128) - * { - * // Load a segment of consecutive items that are blocked across threads - * int thread_data = d_data[block_offset]; - * - * // Collectively compute the block-wide inclusive prefix max scan - * int block_aggregate; - * BlockScan(temp_storage).InclusiveScan( - * thread_data, thread_data, cub::Max(), block_aggregate, prefix_op); - * __syncthreads(); - * - * // Store scanned items to output segment - * d_data[block_offset] = thread_data; - * } - * \endcode - * \par - * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... - * The corresponding output for the first segment will be 0, 0, 2, 2, ..., 126, 126. - * The output for the second segment will be 128, 128, 130, 130, ..., 254, 254. Furthermore, - * \p block_aggregate will be assigned \p 126 in all threads after the first scan, assigned \p 254 after the second - * scan, etc. - * - * \tparam ScanOp [inferred] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member T operator()(const T &a, const T &b) - * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) - */ - template < - typename ScanOp, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) - T &block_aggregate, ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value) - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to all inputs. - { - InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_aggregate, block_prefix_callback_op); - } - - - //@} end member group - /******************************************************************//** - * \name Inclusive prefix scan operations (multiple data per thread) - *********************************************************************/ - //@{ - - - /** - * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. - * - * \par - * - Supports non-commutative scan operators. - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates an inclusive prefix max scan of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... 
- * - * // Collectively compute the block-wide inclusive prefix max scan - * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. The - * corresponding output \p thread_data in those threads will be { [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam ScanOp [inferred] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member T operator()(const T &a, const T &b) - */ - template < - int ITEMS_PER_THREAD, - typename ScanOp> - __device__ __forceinline__ void InclusiveScan( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) - { - if (ITEMS_PER_THREAD == 1) - { - InclusiveScan(input[0], output[0], scan_op); - } - else - { - // Reduce consecutive thread items in registers - T thread_partial = ThreadReduce(input, scan_op); - - // Exclusive threadblock-scan - ExclusiveScan(thread_partial, thread_partial, scan_op); - - // Inclusive scan in registers with prefix - ThreadScanInclusive(input, output, scan_op, thread_partial, (linear_tid != 0)); - } - } - - - /** - * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - Supports non-commutative scan operators. - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates an inclusive prefix max scan of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Collectively compute the block-wide inclusive prefix max scan - * int block_aggregate; - * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is - * { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. - * The corresponding output \p thread_data in those threads will be - * { [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }. - * Furthermore the value \p 510 will be stored in \p block_aggregate for all threads. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam ScanOp [inferred] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) 
type having member T operator()(const T &a, const T &b) - */ - template < - int ITEMS_PER_THREAD, - typename ScanOp> - __device__ __forceinline__ void InclusiveScan( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items - { - if (ITEMS_PER_THREAD == 1) - { - InclusiveScan(input[0], output[0], scan_op, block_aggregate); - } - else - { - // Reduce consecutive thread items in registers - T thread_partial = ThreadReduce(input, scan_op); - - // Exclusive threadblock-scan - ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate); - - // Inclusive scan in registers with prefix - ThreadScanInclusive(input, output, scan_op, thread_partial, (linear_tid != 0)); - } - } - - - /** - * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the first warp of threads in the block, however only the return value from - * lane0 is applied as the block-wide prefix. Can be stateful. - * - Supports non-commutative scan operators. - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a single thread block that progressively - * computes an inclusive prefix max scan over multiple "tiles" of input using a - * prefix functor to maintain a running total between block-wide scans. Each tile consists - * of 128 integer items that are partitioned across 128 threads. - * \par - * \code - * #include // or equivalently - * - * // A stateful callback functor that maintains a running prefix to be applied - * // during consecutive scan operations. - * struct BlockPrefixCallbackOp - * { - * // Running prefix - * int running_total; - * - * // Constructor - * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} - * - * // Callback operator to be entered by the first warp of threads in the block. - * // Thread-0 is responsible for returning a value for seeding the block-wide scan. - * __device__ int operator()(int block_aggregate) - * { - * int old_prefix = running_total; - * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; - * return old_prefix; - * } - * }; - * - * __global__ void ExampleKernel(int *d_data, int num_items, ...) 
- * { - * // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread - * typedef cub::BlockLoad BlockLoad; - * typedef cub::BlockStore BlockStore; - * typedef cub::BlockScan BlockScan; - * - * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan - * __shared__ union { - * typename BlockLoad::TempStorage load; - * typename BlockScan::TempStorage scan; - * typename BlockStore::TempStorage store; - * } temp_storage; - * - * // Initialize running total - * BlockPrefixCallbackOp prefix_op(0); - * - * // Have the block iterate over segments of items - * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4) - * { - * // Load a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); - * __syncthreads(); - * - * // Collectively compute the block-wide inclusive prefix max scan - * int block_aggregate; - * BlockScan(temp_storage.scan).InclusiveScan( - * thread_data, thread_data, cub::Max(), block_aggregate, prefix_op); - * __syncthreads(); - * - * // Store scanned items to output segment - * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); - * __syncthreads(); - * } - * \endcode - * \par - * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... - * The corresponding output for the first segment will be 0, 0, 2, 2, 4, 4, ..., 510, 510. - * The output for the second segment will be 512, 512, 514, 514, 516, 516, ..., 1022, 1022. Furthermore, - * \p block_aggregate will be assigned \p 510 in all threads after the first scan, assigned \p 1022 after the second - * scan, etc. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam ScanOp [inferred] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member T operator()(const T &a, const T &b) - * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) - */ - template < - int ITEMS_PER_THREAD, - typename ScanOp, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void InclusiveScan( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) - T &block_aggregate, ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value) - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to all inputs. 
- { - if (ITEMS_PER_THREAD == 1) - { - InclusiveScan(input[0], output[0], scan_op, block_aggregate, block_prefix_callback_op); - } - else - { - // Reduce consecutive thread items in registers - T thread_partial = ThreadReduce(input, scan_op); - - // Exclusive threadblock-scan - ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate, block_prefix_callback_op); - - // Inclusive scan in registers with prefix - ThreadScanInclusive(input, output, scan_op, thread_partial); - } - } - - //@} end member group - - -}; - -/** - * \example example_block_scan.cu - */ - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockScan class provides [collective](index.html#sec0) methods for computing a parallel prefix sum/scan of items partitioned across a CUDA thread block. + */ + +#pragma once + +#include "specializations/block_scan_raking.cuh" +#include "specializations/block_scan_warp_scans.cuh" +#include "../util_arch.cuh" +#include "../util_type.cuh" +#include "../util_ptx.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Algorithmic variants + ******************************************************************************/ + +/** + * \brief BlockScanAlgorithm enumerates alternative algorithms for cub::BlockScan to compute a parallel prefix scan across a CUDA thread block. + */ +enum BlockScanAlgorithm +{ + + /** + * \par Overview + * An efficient "raking reduce-then-scan" prefix scan algorithm. 
Execution consists of five phases:
+ * -# Upsweep sequential reduction in registers (if threads contribute more than one input each). Each thread then places the partial reduction of its item(s) into shared memory.
+ * -# Upsweep sequential reduction in shared memory. Threads within a single warp rake across segments of shared partial reductions.
+ * -# A warp-synchronous Kogge-Stone style exclusive scan within the raking warp.
+ * -# Downsweep sequential exclusive scan in shared memory. Threads within a single warp rake across segments of shared partial reductions, seeded with the warp-scan output.
+ * -# Downsweep sequential scan in registers (if threads contribute more than one input), seeded with the raking scan output.
+ *
+ * \par
+ * \image html block_scan_raking.png
+ * \p BLOCK_SCAN_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.
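The five phases above are easier to follow serially. As an editorial sketch (not code from this patch; all names are hypothetical), the host-side C++ below models reduce-then-scan for 4 raking "threads" owning 4 items each, matching the 16-thread data flow captioned above.

// Editorial sketch: serial model of "raking reduce-then-scan" (not CUB code)
#include <cstdio>
#include <vector>

int main()
{
    const int P = 4, K = 4;                    // 4 rakers, 4 items each
    std::vector<int> data(P * K, 1);           // sixteen 1s
    std::vector<int> partials(P), seeds(P);

    // Upsweep: each raker reduces its K-item segment to one partial
    for (int t = 0; t < P; ++t) {
        partials[t] = 0;
        for (int i = 0; i < K; ++i) partials[t] += data[t * K + i];
    }

    // Warp-scan stage: exclusive scan of the P partials
    for (int t = 0, run = 0; t < P; ++t) { seeds[t] = run; run += partials[t]; }

    // Downsweep: each raker rescans its segment, seeded with its prefix
    for (int t = 0; t < P; ++t) {
        int pre = seeds[t];
        for (int i = 0; i < K; ++i) {
            int v = data[t * K + i];
            data[t * K + i] = pre;             // exclusive output
            pre += v;
        }
    }

    for (int v : data) printf("%d ", v);       // prints 0 1 2 ... 15
    printf("\n");
    return 0;
}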
+ *
+ * \par Performance Considerations
+ * - Although this variant may suffer longer turnaround latencies when the
+ * GPU is under-occupied, it can often provide higher overall throughput
+ * across the GPU when suitably occupied.
+ */
+ BLOCK_SCAN_RAKING,
+
+
+ /**
+ * \par Overview
+ * Similar to cub::BLOCK_SCAN_RAKING, but with fewer shared memory reads at
+ * the expense of higher register pressure. Raking threads preserve their
+ * "upsweep" segment of values in registers while performing warp-synchronous
+ * scan, allowing the "downsweep" not to re-read them from shared memory.
+ */
+ BLOCK_SCAN_RAKING_MEMOIZE,
+
+
+ /**
+ * \par Overview
+ * A quick "tiled warpscans" prefix scan algorithm. Execution consists of four phases:
+ * -# Upsweep sequential reduction in registers (if threads contribute more than one input each). Each thread then places the partial reduction of its item(s) into shared memory.
+ * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style scan within each warp.
+ * -# A propagation phase where the warp scan outputs in each warp are updated with the aggregate from each preceding warp.
+ * -# Downsweep sequential scan in registers (if threads contribute more than one input), seeded with the raking scan output.
+ *
+ * \par
+ * \image html block_scan_warpscans.png
+ * \p BLOCK_SCAN_WARP_SCANS data flow for a hypothetical 16-thread thread block and 4-thread raking warp.
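Because the enumerators above are ordinary template arguments, trading latency for throughput is a one-token change at the instantiation site. A minimal sketch (editorial, not part of this patch; kernel names are hypothetical, the cub::BlockScan interface is as declared below):

#include <cub/cub.cuh>

__global__ void LatencySensitiveKernel(int *d_out)
{
    // Low-latency tiled warp scans (block size is a multiple of 32)
    typedef cub::BlockScan<int, 128, cub::BLOCK_SCAN_WARP_SCANS> BlockScan;
    __shared__ typename BlockScan::TempStorage temp_storage;

    int thread_data = 1;
    BlockScan(temp_storage).InclusiveSum(thread_data, thread_data);
    d_out[threadIdx.x] = thread_data;          // 1, 2, ..., 128
}

__global__ void ThroughputSensitiveKernel(int *d_out)
{
    // Raking reduce-then-scan (also the default when ALGORITHM is omitted)
    typedef cub::BlockScan<int, 128, cub::BLOCK_SCAN_RAKING> BlockScan;
    __shared__ typename BlockScan::TempStorage temp_storage;

    int thread_data = 1;
    BlockScan(temp_storage).InclusiveSum(thread_data, thread_data);
    d_out[threadIdx.x] = thread_data;
}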
+ *
+ * \par Performance Considerations
+ * - Although this variant may suffer lower overall throughput across the
+ * GPU due to a heavy reliance on inefficient warpscans, it can
+ * often provide lower turnaround latencies when the GPU is under-occupied.
+ */
+ BLOCK_SCAN_WARP_SCANS,
+};
+
+
+/******************************************************************************
+ * Block scan
+ ******************************************************************************/
+
+/**
+ * \brief The BlockScan class provides [collective](index.html#sec0) methods for computing a parallel prefix sum/scan of items partitioned across a CUDA thread block. ![](block_scan_logo.png)
+ * \ingroup BlockModule
+ *
+ * \tparam T Data type being scanned
+ * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension
+ * \tparam ALGORITHM [optional] cub::BlockScanAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_SCAN_RAKING)
+ * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1)
+ * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1)
+ * \tparam PTX_ARCH [optional] \ptxversion
+ *
+ * \par Overview
+ * - Given a list of input elements and a binary reduction operator, a [prefix scan](http://en.wikipedia.org/wiki/Prefix_sum)
+ * produces an output list where each element is computed to be the reduction
+ * of the elements occurring earlier in the input list. Prefix sum
+ * connotes a prefix scan with the addition operator. The term \em inclusive indicates
+ * that the ith output reduction incorporates the ith input.
+ * The term \em exclusive indicates the ith input is not incorporated into
+ * the ith output reduction.
+ * - \rowmajor
+ * - BlockScan can be optionally specialized by algorithm to accommodate different workload profiles:
+ * -# cub::BLOCK_SCAN_RAKING. An efficient (high throughput) "raking reduce-then-scan" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm)
+ * -# cub::BLOCK_SCAN_RAKING_MEMOIZE. Similar to cub::BLOCK_SCAN_RAKING, but having higher throughput at the expense of additional register pressure for intermediate storage. [More...](\ref cub::BlockScanAlgorithm)
+ * -# cub::BLOCK_SCAN_WARP_SCANS. A quick (low latency) "tiled warpscans" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm)
+ *
+ * \par Performance Considerations
+ * - \granularity
+ * - Uses special instructions when applicable (e.g., warp \p SHFL)
+ * - Uses synchronization-free communication between warp lanes when applicable
+ * - Invokes a minimal number of minimal block-wide synchronization barriers (only
+ * one or two depending on algorithm selection)
+ * - Incurs zero bank conflicts for most types
+ * - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
+ * - Prefix sum variants (vs. generic scan)
+ * - \blocksize
+ * - See cub::BlockScanAlgorithm for performance details regarding algorithmic alternatives
+ *
+ * \par A Simple Example
+ * \blockcollective{BlockScan}
+ * \par
+ * The code snippet below illustrates an exclusive prefix sum of 512 integer items that
+ * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads
+ * where each thread owns 4 consecutive items.
+ * \par
+ * \code
+ * #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ * // Specialize BlockScan for a 1D block of 128 threads on type int
+ * typedef cub::BlockScan<int, 128> BlockScan;
+ *
+ * // Allocate shared memory for BlockScan
+ * __shared__ typename BlockScan::TempStorage temp_storage;
+ *
+ * // Obtain a segment of consecutive items that are blocked across threads
+ * int thread_data[4];
+ * ...
+ *
+ * // Collectively compute the block-wide exclusive prefix sum
+ * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is
+ * {[1,1,1,1], [1,1,1,1], ..., [1,1,1,1]}.
+ * The corresponding output \p thread_data in those threads will be
+ * {[0,1,2,3], [4,5,6,7], ..., [508,509,510,511]}.
+ *
+ */
+template <
+ typename T,
+ int BLOCK_DIM_X,
+ BlockScanAlgorithm ALGORITHM = BLOCK_SCAN_RAKING,
+ int BLOCK_DIM_Y = 1,
+ int BLOCK_DIM_Z = 1,
+ int PTX_ARCH = CUB_PTX_ARCH>
+class BlockScan
+{
+private:
+
+ /******************************************************************************
+ * Constants and type definitions
+ ******************************************************************************/
+
+ /// Constants
+ enum
+ {
+ /// The thread block size in threads
+ BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+ };
+
+ /**
+ * Ensure the template parameterization meets the requirements of the
+ * specified algorithm. Currently, the BLOCK_SCAN_WARP_SCANS policy
+ * cannot be used with thread block sizes not a multiple of the
+ * architectural warp size.
+ */
+ static const BlockScanAlgorithm SAFE_ALGORITHM =
+ ((ALGORITHM == BLOCK_SCAN_WARP_SCANS) && (BLOCK_THREADS % CUB_WARP_THREADS(PTX_ARCH) != 0)) ?
+ BLOCK_SCAN_RAKING :
+ ALGORITHM;
+
+ typedef BlockScanWarpScans<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> WarpScans;
+ typedef BlockScanRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, (SAFE_ALGORITHM == BLOCK_SCAN_RAKING_MEMOIZE), PTX_ARCH> Raking;
+
+ /// Define the delegate type for the desired algorithm
+ typedef typename If<(SAFE_ALGORITHM == BLOCK_SCAN_WARP_SCANS),
+ WarpScans,
+ Raking>::Type InternalBlockScan;
+
+ /// Shared memory storage layout type for BlockScan
+ typedef typename InternalBlockScan::TempStorage _TempStorage;
+
+
+ /******************************************************************************
+ * Thread fields
+ ******************************************************************************/
+
+ /// Shared storage reference
+ _TempStorage &temp_storage;
+
+ /// Linear thread-id
+ unsigned int linear_tid;
+
+
+ /******************************************************************************
+ * Utility methods
+ ******************************************************************************/
+
+ /// Internal storage allocator
+ __device__ __forceinline__ _TempStorage& PrivateStorage()
+ {
+ __shared__ _TempStorage private_storage;
+ return private_storage;
+ }
+
+
+ /******************************************************************************
+ * Public types
+ ******************************************************************************/
+public:
+
+ /// \smemstorage{BlockScan}
+ struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+ /******************************************************************//**
+ * \name Collective constructors
+ *********************************************************************/
+ //@{
+
+ /**
+ * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
+ */ + __device__ __forceinline__ BlockScan() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. + */ + __device__ __forceinline__ BlockScan( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + + //@} end member group + /******************************************************************//** + * \name Exclusive prefix sum operations + *********************************************************************/ + //@{ + + + /** + * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. The value of 0 is applied as the initial value, and is assigned to \p output in thread0. + * + * \par + * - \identityzero + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an exclusive prefix sum of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... + * + * // Collectively compute the block-wide exclusive prefix sum + * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The + * corresponding output \p thread_data in those threads will be 0, 1, ..., 127. + * + */ + __device__ __forceinline__ void ExclusiveSum( + T input, ///< [in] Calling thread's input item + T &output) ///< [out] Calling thread's output item (may be aliased to \p input) + { + T initial_value = 0; + ExclusiveScan(input, output, initial_value, cub::Sum()); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. The value of 0 is applied as the initial value, and is assigned to \p output in thread0. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - \identityzero + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an exclusive prefix sum of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... + * + * // Collectively compute the block-wide exclusive prefix sum + * int block_aggregate; + * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The + * corresponding output \p thread_data in those threads will be 0, 1, ..., 127. 
+ * Furthermore the value \p 128 will be stored in \p block_aggregate for all threads. + * + */ + __device__ __forceinline__ void ExclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + T initial_value = 0; + ExclusiveScan(input, output, initial_value, cub::Sum(), block_aggregate); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - \identityzero + * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block that progressively + * computes an exclusive prefix sum over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 128 integer items that are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. + * struct BlockPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. + * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total += block_aggregate; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) + * { + * // Specialize BlockScan for a 1D block of 128 threads + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Initialize running total + * BlockPrefixCallbackOp prefix_op(0); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data = d_data[block_offset]; + * + * // Collectively compute the block-wide exclusive prefix sum + * BlockScan(temp_storage).ExclusiveSum( + * thread_data, thread_data, prefix_op); + * CTA_SYNC(); + * + * // Store scanned items to output segment + * d_data[block_offset] = thread_data; + * } + * \endcode + * \par + * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... + * The corresponding output for the first segment will be 0, 1, ..., 127. 
+ * The output for the second segment will be 128, 129, ..., 255. + * + * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + */ + template + __device__ __forceinline__ void ExclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. + { + ExclusiveScan(input, output, cub::Sum(), block_prefix_callback_op); + } + + + //@} end member group + /******************************************************************//** + * \name Exclusive prefix sum operations (multiple data per thread) + *********************************************************************/ + //@{ + + + /** + * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. The value of 0 is applied as the initial value, and is assigned to \p output[0] in thread0. + * + * \par + * - \identityzero + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an exclusive prefix sum of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide exclusive prefix sum + * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The + * corresponding output \p thread_data in those threads will be { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + */ + template + __device__ __forceinline__ void ExclusiveSum( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD]) ///< [out] Calling thread's output items (may be aliased to \p input) + { + T initial_value = 0; + ExclusiveScan(input, output, initial_value, cub::Sum()); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. The value of 0 is applied as the initial value, and is assigned to \p output[0] in thread0. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - \identityzero + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an exclusive prefix sum of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide exclusive prefix sum + * int block_aggregate; + * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The + * corresponding output \p thread_data in those threads will be { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. + * Furthermore the value \p 512 will be stored in \p block_aggregate for all threads. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + */ + template + __device__ __forceinline__ void ExclusiveSum( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + // Reduce consecutive thread items in registers + T initial_value = 0; + ExclusiveScan(input, output, initial_value, cub::Sum(), block_aggregate); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - \identityzero + * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block that progressively + * computes an exclusive prefix sum over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 512 integer items that are partitioned in a [blocked arrangement](index.html#sec5sec3) + * across 128 threads where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. + * struct BlockPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. 
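+ * // (The callback runs once per tile in the first warp; only the value
+ * // returned by lane0 is applied as the tile's running prefix.)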
+ * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total += block_aggregate; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) + * { + * // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread + * typedef cub::BlockLoad BlockLoad; + * typedef cub::BlockStore BlockStore; + * typedef cub::BlockScan BlockScan; + * + * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan + * __shared__ union { + * typename BlockLoad::TempStorage load; + * typename BlockScan::TempStorage scan; + * typename BlockStore::TempStorage store; + * } temp_storage; + * + * // Initialize running total + * BlockPrefixCallbackOp prefix_op(0); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); + * CTA_SYNC(); + * + * // Collectively compute the block-wide exclusive prefix sum + * int block_aggregate; + * BlockScan(temp_storage.scan).ExclusiveSum( + * thread_data, thread_data, prefix_op); + * CTA_SYNC(); + * + * // Store scanned items to output segment + * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); + * CTA_SYNC(); + * } + * \endcode + * \par + * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... + * The corresponding output for the first segment will be 0, 1, 2, 3, ..., 510, 511. + * The output for the second segment will be 512, 513, 514, 515, ..., 1022, 1023. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + */ + template < + int ITEMS_PER_THREAD, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveSum( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. + { + ExclusiveScan(input, output, cub::Sum(), block_prefix_callback_op); + } + + + + //@} end member group // Exclusive prefix sums + /******************************************************************//** + * \name Exclusive prefix scan operations + *********************************************************************/ + //@{ + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. + * + * \par + * - Supports non-commutative scan operators. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an exclusive prefix max scan of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... 
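+ * // (For illustration only: thread_data might be read from a hypothetical
+ * // device array d_in, e.g. thread_data = d_in[threadIdx.x].)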
+ *
+ * // Collectively compute the block-wide exclusive prefix max scan
+ * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max());
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The
+ * corresponding output \p thread_data in those threads will be INT_MIN, 0, 0, 2, ..., 124, 126.
+ *
+ * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b)
+ */
+ template <typename ScanOp>
+ __device__ __forceinline__ void ExclusiveScan(
+ T input, ///< [in] Calling thread's input item
+ T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
+ T initial_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in thread0)
+ ScanOp scan_op) ///< [in] Binary scan functor
+ {
+ InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op);
+ }
+
+
+ /**
+ * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+ *
+ * \par
+ * - Supports non-commutative scan operators.
+ * - \rowmajor
+ * - \smemreuse
+ *
+ * \par Snippet
+ * The code snippet below illustrates an exclusive prefix max scan of 128 integer items that
+ * are partitioned across 128 threads.
+ * \par
+ * \code
+ * #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ * // Specialize BlockScan for a 1D block of 128 threads on type int
+ * typedef cub::BlockScan<int, 128> BlockScan;
+ *
+ * // Allocate shared memory for BlockScan
+ * __shared__ typename BlockScan::TempStorage temp_storage;
+ *
+ * // Obtain input item for each thread
+ * int thread_data;
+ * ...
+ *
+ * // Collectively compute the block-wide exclusive prefix max scan
+ * int block_aggregate;
+ * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The
+ * corresponding output \p thread_data in those threads will be INT_MIN, 0, 0, 2, ..., 124, 126.
+ * Furthermore the value \p 126 will be stored in \p block_aggregate for all threads.
+ *
+ * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b)
+ */
+ template <typename ScanOp>
+ __device__ __forceinline__ void ExclusiveScan(
+ T input, ///< [in] Calling thread's input item
+ T &output, ///< [out] Calling thread's output item (may be aliased to \p input)
+ T initial_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in thread0)
+ ScanOp scan_op, ///< [in] Binary scan functor
+ T &block_aggregate) ///< [out] block-wide aggregate reduction of input items
+ {
+ InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate);
+ }
+
+
+ /**
+ * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. The call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs.
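+ *
+ * \par
+ * In effect this fuses a block-wide scan with a running carry across tiles:
+ * each call scans one tile, and the callback supplies the prefix accumulated
+ * over all preceding tiles.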
+ * + * \par + * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. + * - Supports non-commutative scan operators. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block that progressively + * computes an exclusive prefix max scan over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 128 integer items that are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. + * struct BlockPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. + * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) + * { + * // Specialize BlockScan for a 1D block of 128 threads + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Initialize running total + * BlockPrefixCallbackOp prefix_op(INT_MIN); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data = d_data[block_offset]; + * + * // Collectively compute the block-wide exclusive prefix max scan + * BlockScan(temp_storage).ExclusiveScan( + * thread_data, thread_data, INT_MIN, cub::Max(), prefix_op); + * CTA_SYNC(); + * + * // Store scanned items to output segment + * d_data[block_offset] = thread_data; + * } + * \endcode + * \par + * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... + * The corresponding output for the first segment will be INT_MIN, 0, 0, 2, ..., 124, 126. + * The output for the second segment will be 126, 128, 128, 130, ..., 252, 254. + * + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + */ + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan functor + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. 
+ {
+ InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_prefix_callback_op);
+ }
+
+
+ //@} end member group // Exclusive prefix scans
+ /******************************************************************//**
+ * \name Exclusive prefix scan operations (multiple data per thread)
+ *********************************************************************/
+ //@{
+
+
+ /**
+ * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements.
+ *
+ * \par
+ * - Supports non-commutative scan operators.
+ * - \blocked
+ * - \granularity
+ * - \smemreuse
+ *
+ * \par Snippet
+ * The code snippet below illustrates an exclusive prefix max scan of 512 integer items that
+ * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads
+ * where each thread owns 4 consecutive items.
+ * \par
+ * \code
+ * #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ * // Specialize BlockScan for a 1D block of 128 threads on type int
+ * typedef cub::BlockScan<int, 128> BlockScan;
+ *
+ * // Allocate shared memory for BlockScan
+ * __shared__ typename BlockScan::TempStorage temp_storage;
+ *
+ * // Obtain a segment of consecutive items that are blocked across threads
+ * int thread_data[4];
+ * ...
+ *
+ * // Collectively compute the block-wide exclusive prefix max scan
+ * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max());
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is
+ * { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }.
+ * The corresponding output \p thread_data in those threads will be
+ * { [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }.
+ *
+ * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
+ * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b)
+ */
+ template <
+ int ITEMS_PER_THREAD,
+ typename ScanOp>
+ __device__ __forceinline__ void ExclusiveScan(
+ T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
+ T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input)
+ T initial_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in thread0)
+ ScanOp scan_op) ///< [in] Binary scan functor
+ {
+ // Reduce consecutive thread items in registers
+ T thread_prefix = internal::ThreadReduce(input, scan_op);
+
+ // Exclusive thread block-scan
+ ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op);
+
+ // Exclusive scan in registers with prefix as seed
+ internal::ThreadScanExclusive(input, output, scan_op, thread_prefix);
+ }
+
+
+ /**
+ * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+ *
+ * \par
+ * - Supports non-commutative scan operators.
+ * - \blocked
+ * - \granularity
+ * - \smemreuse
+ *
+ * \par Snippet
+ * The code snippet below illustrates an exclusive prefix max scan of 512 integer items that
+ * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads
+ * where each thread owns 4 consecutive items.
+ * \par
+ * \code
+ * #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide exclusive prefix max scan + * int block_aggregate; + * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. The + * corresponding output \p thread_data in those threads will be { [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }. + * Furthermore the value \p 510 will be stored in \p block_aggregate for all threads. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + T initial_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in thread0) + ScanOp scan_op, ///< [in] Binary scan functor + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + // Reduce consecutive thread items in registers + T thread_prefix = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op, block_aggregate); + + // Exclusive scan in registers with prefix as seed + internal::ThreadScanExclusive(input, output, scan_op, thread_prefix); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. + * - Supports non-commutative scan operators. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block that progressively + * computes an exclusive prefix max scan over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 128 integer items that are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. 
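+ * // For a max scan the running prefix is a running maximum: each tile's
+ * // aggregate is folded in with max() rather than with addition.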
+ * struct BlockPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. + * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) + * { + * // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread + * typedef cub::BlockLoad BlockLoad; + * typedef cub::BlockStore BlockStore; + * typedef cub::BlockScan BlockScan; + * + * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan + * __shared__ union { + * typename BlockLoad::TempStorage load; + * typename BlockScan::TempStorage scan; + * typename BlockStore::TempStorage store; + * } temp_storage; + * + * // Initialize running total + * BlockPrefixCallbackOp prefix_op(0); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); + * CTA_SYNC(); + * + * // Collectively compute the block-wide exclusive prefix max scan + * BlockScan(temp_storage.scan).ExclusiveScan( + * thread_data, thread_data, INT_MIN, cub::Max(), prefix_op); + * CTA_SYNC(); + * + * // Store scanned items to output segment + * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); + * CTA_SYNC(); + * } + * \endcode + * \par + * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... + * The corresponding output for the first segment will be INT_MIN, 0, 0, 2, 2, 4, ..., 508, 510. + * The output for the second segment will be 510, 512, 512, 514, 514, 516, ..., 1020, 1022. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan functor + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. 
+ { + // Reduce consecutive thread items in registers + T thread_prefix = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_prefix_callback_op); + + // Exclusive scan in registers with prefix as seed + internal::ThreadScanExclusive(input, output, scan_op, thread_prefix); + } + + + //@} end member group +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document no-initial-value scans + + /******************************************************************//** + * \name Exclusive prefix scan operations (no initial value, single datum per thread) + *********************************************************************/ + //@{ + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined. + * + * \par + * - Supports non-commutative scan operators. + * - \rowmajor + * - \smemreuse + * + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan functor + { + InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. + * + * \par + * - Supports non-commutative scan operators. + * - \rowmajor + * - \smemreuse + * + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan functor + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_aggregate); + } + + //@} end member group + /******************************************************************//** + * \name Exclusive prefix scan operations (no initial value, multiple data per thread) + *********************************************************************/ + //@{ + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. With no initial value, the output computed for thread0 is undefined. + * + * \par + * - Supports non-commutative scan operators. + * - \blocked + * - \granularity + * - \smemreuse + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
+ * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan functor + { + // Reduce consecutive thread items in registers + T thread_partial = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveScan(thread_partial, thread_partial, scan_op); + + // Exclusive scan in registers with prefix + internal::ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0)); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. + * + * \par + * - Supports non-commutative scan operators. + * - \blocked + * - \granularity + * - \smemreuse + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan functor + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + // Reduce consecutive thread items in registers + T thread_partial = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate); + + // Exclusive scan in registers with prefix + internal::ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0)); + } + + + //@} end member group +#endif // DOXYGEN_SHOULD_SKIP_THIS // Do not document no-initial-value scans + + /******************************************************************//** + * \name Inclusive prefix sum operations + *********************************************************************/ + //@{ + + + /** + * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. + * + * \par + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an inclusive prefix sum of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... + * + * // Collectively compute the block-wide inclusive prefix sum + * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. 
The + * corresponding output \p thread_data in those threads will be 1, 2, ..., 128. + * + */ + __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item + T &output) ///< [out] Calling thread's output item (may be aliased to \p input) + { + InclusiveScan(input, output, cub::Sum()); + } + + + /** + * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an inclusive prefix sum of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... + * + * // Collectively compute the block-wide inclusive prefix sum + * int block_aggregate; + * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The + * corresponding output \p thread_data in those threads will be 1, 2, ..., 128. + * Furthermore the value \p 128 will be stored in \p block_aggregate for all threads. + * + */ + __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + InclusiveScan(input, output, cub::Sum(), block_aggregate); + } + + + + /** + * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block that progressively + * computes an inclusive prefix sum over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 128 integer items that are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. 
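+ * // For a prefix sum the running prefix is a running total: each tile's
+ * // aggregate is added into the value carried to the next tile.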
+ * struct BlockPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. + * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total += block_aggregate; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) + * { + * // Specialize BlockScan for a 1D block of 128 threads + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Initialize running total + * BlockPrefixCallbackOp prefix_op(0); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data = d_data[block_offset]; + * + * // Collectively compute the block-wide inclusive prefix sum + * BlockScan(temp_storage).InclusiveSum( + * thread_data, thread_data, prefix_op); + * CTA_SYNC(); + * + * // Store scanned items to output segment + * d_data[block_offset] = thread_data; + * } + * \endcode + * \par + * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... + * The corresponding output for the first segment will be 1, 2, ..., 128. + * The output for the second segment will be 129, 130, ..., 256. + * + * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + */ + template + __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. + { + InclusiveScan(input, output, cub::Sum(), block_prefix_callback_op); + } + + + //@} end member group + /******************************************************************//** + * \name Inclusive prefix sum operations (multiple data per thread) + *********************************************************************/ + //@{ + + + /** + * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. + * + * \par + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an inclusive prefix sum of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... 
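+ * // (For illustration only: the four items might come from a hypothetical
+ * // blocked load, e.g. cub::LoadDirectBlocked(threadIdx.x, d_in, thread_data).)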
+ * + * // Collectively compute the block-wide inclusive prefix sum + * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The + * corresponding output \p thread_data in those threads will be { [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + */ + template + __device__ __forceinline__ void InclusiveSum( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD]) ///< [out] Calling thread's output items (may be aliased to \p input) + { + if (ITEMS_PER_THREAD == 1) + { + InclusiveSum(input[0], output[0]); + } + else + { + // Reduce consecutive thread items in registers + Sum scan_op; + T thread_prefix = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveSum(thread_prefix, thread_prefix); + + // Inclusive scan in registers with prefix as seed + internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0)); + } + } + + + /** + * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an inclusive prefix sum of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide inclusive prefix sum + * int block_aggregate; + * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The + * corresponding output \p thread_data in those threads will be + * { [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }. + * Furthermore the value \p 512 will be stored in \p block_aggregate for all threads. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
+ * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void InclusiveSum( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + if (ITEMS_PER_THREAD == 1) + { + InclusiveSum(input[0], output[0], block_aggregate); + } + else + { + // Reduce consecutive thread items in registers + Sum scan_op; + T thread_prefix = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveSum(thread_prefix, thread_prefix, block_aggregate); + + // Inclusive scan in registers with prefix as seed + internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0)); + } + } + + + /** + * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block that progressively + * computes an inclusive prefix sum over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 512 integer items that are partitioned in a [blocked arrangement](index.html#sec5sec3) + * across 128 threads where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. + * struct BlockPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. + * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total += block_aggregate; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) 
+ * {
+ * // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread
+ * typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_TRANSPOSE> BlockLoad;
+ * typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_TRANSPOSE> BlockStore;
+ * typedef cub::BlockScan<int, 128> BlockScan;
+ *
+ * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
+ * __shared__ union {
+ * typename BlockLoad::TempStorage load;
+ * typename BlockScan::TempStorage scan;
+ * typename BlockStore::TempStorage store;
+ * } temp_storage;
+ *
+ * // Initialize running total
+ * BlockPrefixCallbackOp prefix_op(0);
+ *
+ * // Have the block iterate over segments of items
+ * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
+ * {
+ * // Load a segment of consecutive items that are blocked across threads
+ * int thread_data[4];
+ * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
+ * CTA_SYNC();
+ *
+ * // Collectively compute the block-wide inclusive prefix sum
+ * BlockScan(temp_storage.scan).InclusiveSum(
+ * thread_data, thread_data, prefix_op);
+ * CTA_SYNC();
+ *
+ * // Store scanned items to output segment
+ * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
+ * CTA_SYNC();
+ * }
+ * \endcode
+ * \par
+ * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, ....
+ * The corresponding output for the first segment will be 1, 2, 3, 4, ..., 511, 512.
+ * The output for the second segment will be 513, 514, 515, 516, ..., 1023, 1024.
+ *
+ * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
+ * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate)
+ */
+ template <
+ int ITEMS_PER_THREAD,
+ typename BlockPrefixCallbackOp>
+ __device__ __forceinline__ void InclusiveSum(
+ T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
+ T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input)
+ BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
+ {
+ if (ITEMS_PER_THREAD == 1)
+ {
+ InclusiveSum(input[0], output[0], block_prefix_callback_op);
+ }
+ else
+ {
+ // Reduce consecutive thread items in registers
+ Sum scan_op;
+ T thread_prefix = internal::ThreadReduce(input, scan_op);
+
+ // Exclusive thread block-scan
+ ExclusiveSum(thread_prefix, thread_prefix, block_prefix_callback_op);
+
+ // Inclusive scan in registers with prefix as seed
+ internal::ThreadScanInclusive(input, output, scan_op, thread_prefix);
+ }
+ }
+
+
+ //@} end member group
+ /******************************************************************//**
+ * \name Inclusive prefix scan operations
+ *********************************************************************/
+ //@{
+
+
+ /**
+ * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element.
+ *
+ * \par
+ * - Supports non-commutative scan operators.
+ * - \rowmajor
+ * - \smemreuse
+ *
+ * \par Snippet
+ * The code snippet below illustrates an inclusive prefix max scan of 128 integer items that
+ * are partitioned across 128 threads.
+ * \par
+ * \code
+ * #include <cub/cub.cuh> // or equivalently <cub/block/block_scan.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Specialize BlockScan for a 1D block of 128 threads on type int
+ *     typedef cub::BlockScan<int, 128> BlockScan;
+ *
+ *     // Allocate shared memory for BlockScan
+ *     __shared__ typename BlockScan::TempStorage temp_storage;
+ *
+ *     // Obtain input item for each thread
+ *     int thread_data;
+ *     ...
+ *
+ *     // Collectively compute the block-wide inclusive prefix max scan
+ *     BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max());
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The
+ * corresponding output \p thread_data in those threads will be 0, 0, 2, 2, ..., 126, 126.
+ *
+ * \tparam ScanOp   [inferred] Binary scan functor type having member T operator()(const T &a, const T &b)
+ */
+ template <typename ScanOp>
+ __device__ __forceinline__ void InclusiveScan(
+     T input,        ///< [in] Calling thread's input item
+     T &output,      ///< [out] Calling thread's output item (may be aliased to \p input)
+     ScanOp scan_op) ///< [in] Binary scan functor
+ {
+     InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op);
+ }
+
+
+ /**
+ * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+ *
+ * \par
+ * - Supports non-commutative scan operators.
+ * - \rowmajor
+ * - \smemreuse
+ *
+ * \par Snippet
+ * The code snippet below illustrates an inclusive prefix max scan of 128 integer items that
+ * are partitioned across 128 threads.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Specialize BlockScan for a 1D block of 128 threads on type int
+ *     typedef cub::BlockScan<int, 128> BlockScan;
+ *
+ *     // Allocate shared memory for BlockScan
+ *     __shared__ typename BlockScan::TempStorage temp_storage;
+ *
+ *     // Obtain input item for each thread
+ *     int thread_data;
+ *     ...
+ *
+ *     // Collectively compute the block-wide inclusive prefix max scan
+ *     int block_aggregate;
+ *     BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The
+ * corresponding output \p thread_data in those threads will be 0, 0, 2, 2, ..., 126, 126.
+ * Furthermore the value \p 126 will be stored in \p block_aggregate for all threads.
+ *
+ * \tparam ScanOp   [inferred] Binary scan functor type having member T operator()(const T &a, const T &b)
+ */
+ template <typename ScanOp>
+ __device__ __forceinline__ void InclusiveScan(
+     T input,              ///< [in] Calling thread's input item
+     T &output,            ///< [out] Calling thread's output item (may be aliased to \p input)
+     ScanOp scan_op,       ///< [in] Binary scan functor
+     T &block_aggregate)   ///< [out] block-wide aggregate reduction of input items
+ {
+     InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_aggregate);
+ }
+
+
+ /**
+ * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. The call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane<sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.
+ *
+ * \par
+ * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate).
+ *   The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
+ *   The functor will be invoked by the first warp of threads in the block, however only the return value from
+ *   lane<sub>0</sub> is applied as the block-wide prefix. Can be stateful.
+ * - Supports non-commutative scan operators.
+ * - \rowmajor
+ * - \smemreuse
+ *
+ * \par Snippet
+ * The code snippet below illustrates a single thread block that progressively
+ * computes an inclusive prefix max scan over multiple "tiles" of input using a
+ * prefix functor to maintain a running total between block-wide scans. Each tile consists
+ * of 128 integer items that are partitioned across 128 threads.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+ *
+ * // A stateful callback functor that maintains a running prefix to be applied
+ * // during consecutive scan operations.
+ * struct BlockPrefixCallbackOp
+ * {
+ *     // Running prefix
+ *     int running_total;
+ *
+ *     // Constructor
+ *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
+ *
+ *     // Callback operator to be entered by the first warp of threads in the block.
+ *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
+ *     __device__ int operator()(int block_aggregate)
+ *     {
+ *         int old_prefix = running_total;
+ *         running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
+ *         return old_prefix;
+ *     }
+ * };
+ *
+ * __global__ void ExampleKernel(int *d_data, int num_items, ...)
+ * {
+ *     // Specialize BlockScan for a 1D block of 128 threads
+ *     typedef cub::BlockScan<int, 128> BlockScan;
+ *
+ *     // Allocate shared memory for BlockScan
+ *     __shared__ typename BlockScan::TempStorage temp_storage;
+ *
+ *     // Initialize running total
+ *     BlockPrefixCallbackOp prefix_op(INT_MIN);
+ *
+ *     // Have the block iterate over segments of items
+ *     for (int block_offset = 0; block_offset < num_items; block_offset += 128)
+ *     {
+ *         // Load a segment of consecutive items that are blocked across threads
+ *         int thread_data = d_data[block_offset + threadIdx.x];
+ *
+ *         // Collectively compute the block-wide inclusive prefix max scan
+ *         BlockScan(temp_storage).InclusiveScan(
+ *             thread_data, thread_data, cub::Max(), prefix_op);
+ *         CTA_SYNC();
+ *
+ *         // Store scanned items to output segment
+ *         d_data[block_offset + threadIdx.x] = thread_data;
+ *     }
+ * \endcode
+ * \par
+ * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, ....
+ * The corresponding output for the first segment will be 0, 0, 2, 2, ..., 126, 126.
+ * The output for the second segment will be 128, 128, 130, 130, ..., 254, 254.
+ *
+ * \tparam ScanOp                  [inferred] Binary scan functor type having member T operator()(const T &a, const T &b)
+ * \tparam BlockPrefixCallbackOp   [inferred] Call-back functor type having member T operator()(T block_aggregate)
+ */
+ template <
+     typename ScanOp,
+     typename BlockPrefixCallbackOp>
+ __device__ __forceinline__ void InclusiveScan(
+     T input,                                          ///< [in] Calling thread's input item
+     T &output,                                        ///< [out] Calling thread's output item (may be aliased to \p input)
+     ScanOp scan_op,                                   ///< [in] Binary scan functor
+     BlockPrefixCallbackOp &block_prefix_callback_op)  ///< [in-out] [warp<sub>0</sub> only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
+ {
+     InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_prefix_callback_op);
+ }
+
+
+ //@}  end member group
+ /******************************************************************//**
+ * \name Inclusive prefix scan operations (multiple data per thread)
+ *********************************************************************/
+ //@{
+
+
+ /**
+ * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements.
+ *
+ * \par
+ * - Supports non-commutative scan operators.
+ * - \blocked
+ * - \granularity
+ * - \smemreuse
+ *
+ * \par Snippet
+ * The code snippet below illustrates an inclusive prefix max scan of 512 integer items that
+ * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads
+ * where each thread owns 4 consecutive items.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Specialize BlockScan for a 1D block of 128 threads on type int
+ *     typedef cub::BlockScan<int, 128> BlockScan;
+ *
+ *     // Allocate shared memory for BlockScan
+ *     __shared__ typename BlockScan::TempStorage temp_storage;
+ *
+ *     // Obtain a segment of consecutive items that are blocked across threads
+ *     int thread_data[4];
+ *     ...
+ *
+ *     // Collectively compute the block-wide inclusive prefix max scan
+ *     BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max());
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. The
+ * corresponding output \p thread_data in those threads will be { [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }.
+ *
+ * \tparam ITEMS_PER_THREAD    [inferred] The number of consecutive items partitioned onto each thread.
+ * \tparam ScanOp              [inferred] Binary scan functor type having member T operator()(const T &a, const T &b)
+ */
+ template <
+     int ITEMS_PER_THREAD,
+     typename ScanOp>
+ __device__ __forceinline__ void InclusiveScan(
+     T (&input)[ITEMS_PER_THREAD],   ///< [in] Calling thread's input items
+     T (&output)[ITEMS_PER_THREAD],  ///< [out] Calling thread's output items (may be aliased to \p input)
+     ScanOp scan_op)                 ///< [in] Binary scan functor
+ {
+     if (ITEMS_PER_THREAD == 1)
+     {
+         InclusiveScan(input[0], output[0], scan_op);
+     }
+     else
+     {
+         // Reduce consecutive thread items in registers
+         T thread_prefix = internal::ThreadReduce(input, scan_op);
+
+         // Exclusive thread block-scan
+         ExclusiveScan(thread_prefix, thread_prefix, scan_op);
+
+         // Inclusive scan in registers with prefix as seed (first thread does not seed)
+         internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0));
+     }
+ }
+
+
+ /**
+ * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs.
+ *
+ * \par
+ * - Supports non-commutative scan operators.
+ * - \blocked
+ * - \granularity
+ * - \smemreuse
+ *
+ * \par Snippet
+ * The code snippet below illustrates an inclusive prefix max scan of 512 integer items that
+ * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads
+ * where each thread owns 4 consecutive items.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Specialize BlockScan for a 1D block of 128 threads on type int
+ *     typedef cub::BlockScan<int, 128> BlockScan;
+ *
+ *     // Allocate shared memory for BlockScan
+ *     __shared__ typename BlockScan::TempStorage temp_storage;
+ *
+ *     // Obtain a segment of consecutive items that are blocked across threads
+ *     int thread_data[4];
+ *     ...
+ *
+ *     // Collectively compute the block-wide inclusive prefix max scan
+ *     int block_aggregate;
+ *     BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is
+ * { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }.
+ * The corresponding output \p thread_data in those threads will be
+ * { [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }.
+ * Furthermore the value \p 510 will be stored in \p block_aggregate for all threads.
+ *
+ * \tparam ITEMS_PER_THREAD    [inferred] The number of consecutive items partitioned onto each thread.
+ * \tparam ScanOp              [inferred] Binary scan functor type having member T operator()(const T &a, const T &b)
+ */
+ template <
+     int ITEMS_PER_THREAD,
+     typename ScanOp>
+ __device__ __forceinline__ void InclusiveScan(
+     T (&input)[ITEMS_PER_THREAD],   ///< [in] Calling thread's input items
+     T (&output)[ITEMS_PER_THREAD],  ///< [out] Calling thread's output items (may be aliased to \p input)
+     ScanOp scan_op,                 ///< [in] Binary scan functor
+     T &block_aggregate)             ///< [out] block-wide aggregate reduction of input items
+ {
+     if (ITEMS_PER_THREAD == 1)
+     {
+         InclusiveScan(input[0], output[0], scan_op, block_aggregate);
+     }
+     else
+     {
+         // Reduce consecutive thread items in registers
+         T thread_prefix = internal::ThreadReduce(input, scan_op);
+
+         // Exclusive thread block-scan (with no initial value)
+         ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_aggregate);
+
+         // Inclusive scan in registers with prefix as seed (first thread does not seed)
+         internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0));
+     }
+ }
+
+
+ /**
+ * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. The call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane<sub>0</sub> in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.
+ *
+ * \par
+ * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate).
+ *   The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
+ *   The functor will be invoked by the first warp of threads in the block, however only the return value from
+ *   lane<sub>0</sub> is applied as the block-wide prefix. Can be stateful.
+ * - Supports non-commutative scan operators.
+ * - \blocked
+ * - \granularity
+ * - \smemreuse
+ *
+ * \par Snippet
+ * The code snippet below illustrates a single thread block that progressively
+ * computes an inclusive prefix max scan over multiple "tiles" of input using a
+ * prefix functor to maintain a running total between block-wide scans. Each tile consists
+ * of 128 integer items that are partitioned across 128 threads.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
+ *
+ * // A stateful callback functor that maintains a running prefix to be applied
+ * // during consecutive scan operations.
+ * struct BlockPrefixCallbackOp
+ * {
+ *     // Running prefix
+ *     int running_total;
+ *
+ *     // Constructor
+ *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
+ *
+ *     // Callback operator to be entered by the first warp of threads in the block.
+ *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
+ *     __device__ int operator()(int block_aggregate)
+ *     {
+ *         int old_prefix = running_total;
+ *         running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
+ *         return old_prefix;
+ *     }
+ * };
+ *
+ * __global__ void ExampleKernel(int *d_data, int num_items, ...)
+ * {
+ *     // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread
+ *     typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_TRANSPOSE> BlockLoad;
+ *     typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_TRANSPOSE> BlockStore;
+ *     typedef cub::BlockScan<int, 128> BlockScan;
+ *
+ *     // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
+ *     __shared__ union {
+ *         typename BlockLoad::TempStorage load;
+ *         typename BlockScan::TempStorage scan;
+ *         typename BlockStore::TempStorage store;
+ *     } temp_storage;
+ *
+ *     // Initialize running total
+ *     BlockPrefixCallbackOp prefix_op(0);
+ *
+ *     // Have the block iterate over segments of items
+ *     for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
+ *     {
+ *         // Load a segment of consecutive items that are blocked across threads
+ *         int thread_data[4];
+ *         BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
+ *         CTA_SYNC();
+ *
+ *         // Collectively compute the block-wide inclusive prefix max scan
+ *         BlockScan(temp_storage.scan).InclusiveScan(
+ *             thread_data, thread_data, cub::Max(), prefix_op);
+ *         CTA_SYNC();
+ *
+ *         // Store scanned items to output segment
+ *         BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
+ *         CTA_SYNC();
+ *     }
+ * \endcode
+ * \par
+ * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, ....
+ * The corresponding output for the first segment will be 0, 0, 2, 2, 4, 4, ..., 510, 510.
+ * The output for the second segment will be 512, 512, 514, 514, 516, 516, ..., 1022, 1022.
+ *
+ * \tparam ITEMS_PER_THREAD        [inferred] The number of consecutive items partitioned onto each thread.
+ * \tparam ScanOp                  [inferred] Binary scan functor type having member T operator()(const T &a, const T &b)
+ * \tparam BlockPrefixCallbackOp   [inferred] Call-back functor type having member T operator()(T block_aggregate)
+ */
+ template <
+     int ITEMS_PER_THREAD,
+     typename ScanOp,
+     typename BlockPrefixCallbackOp>
+ __device__ __forceinline__ void InclusiveScan(
+     T (&input)[ITEMS_PER_THREAD],                     ///< [in] Calling thread's input items
+     T (&output)[ITEMS_PER_THREAD],                    ///< [out] Calling thread's output items (may be aliased to \p input)
+     ScanOp scan_op,                                   ///< [in] Binary scan functor
+     BlockPrefixCallbackOp &block_prefix_callback_op)  ///< [in-out] [warp<sub>0</sub> only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
+ { + if (ITEMS_PER_THREAD == 1) + { + InclusiveScan(input[0], output[0], scan_op, block_prefix_callback_op); + } + else + { + // Reduce consecutive thread items in registers + T thread_prefix = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_prefix_callback_op); + + // Inclusive scan in registers with prefix as seed + internal::ThreadScanInclusive(input, output, scan_op, thread_prefix); + } + } + + //@} end member group + + +}; + +/** + * \example example_block_scan.cu + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/SRC/cub/block/block_shift.cuh b/SRC/cub/block/block_shift.cuh deleted file mode 100644 index 3cd09222..00000000 --- a/SRC/cub/block/block_shift.cuh +++ /dev/null @@ -1,325 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * The cub::BlockShift class provides [collective](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block. - */ - -#pragma once - -#include "../util_arch.cuh" -#include "../util_ptx.cuh" -#include "../util_macro.cuh" -#include "../util_type.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \brief The BlockShift class provides [collective](index.html#sec0) methods for shifting data partitioned across a CUDA thread block. ![](transpose_logo.png) - * \ingroup BlockModule - * - * \tparam T The data type to be exchanged. 
- * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension - * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) - * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam PTX_ARCH [optional] \ptxversion - * - * \par Overview - * It is commonplace for blocks of threads to rearrange data items between - * threads. The BlockShift abstraction allows threads to efficiently shift items - * either (a) up to their successor or (b) down to their predecessor. - * - */ -template < - typename T, - int BLOCK_DIM_X, - int BLOCK_DIM_Y = 1, - int BLOCK_DIM_Z = 1, - int PTX_ARCH = CUB_PTX_ARCH> -class BlockShift -{ -private: - - /****************************************************************************** - * Constants - ******************************************************************************/ - - enum - { - BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, - - LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH), - WARP_THREADS = 1 << LOG_WARP_THREADS, - WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, - }; - - /****************************************************************************** - * Type definitions - ******************************************************************************/ - - /// Shared memory storage layout type - typedef typename If<(PTX_ARCH >= 300), - T[WARPS], // Kepler+ only needs smem to share between warps - T[BLOCK_THREADS] >::Type _TempStorage; - -public: - - /// \smemstorage{BlockShift} - struct TempStorage : Uninitialized<_TempStorage> {}; - -private: - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - /// Shared storage reference - _TempStorage &temp_storage; - - /// Linear thread-id - int linear_tid; - int lane_id; - int warp_id; - - - /****************************************************************************** - * Utility methods - ******************************************************************************/ - - /// Internal storage allocator - __device__ __forceinline__ _TempStorage& PrivateStorage() - { - __shared__ _TempStorage private_storage; - return private_storage; - } - - -public: - - /******************************************************************//** - * \name Collective constructors - *********************************************************************/ - //@{ - - /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. - */ - __device__ __forceinline__ BlockShift() - : - temp_storage(PrivateStorage()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), - warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS), - lane_id(LaneId()) - {} - - - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. - */ - __device__ __forceinline__ BlockShift( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), - warp_id((WARPS == 1) ? 
0 : linear_tid / WARP_THREADS), - lane_id(LaneId()) - {} - - - //@} end member group - /******************************************************************//** - * \name Shift exchanges - *********************************************************************/ - //@{ - - - /** - * \brief Each thread obtains the \p input provided by its predecessor. The first thread receives \p block_prefix. - * - * \par - * - \smemreuse - */ - __device__ __forceinline__ void Up( - T input, ///< [in] Input item - T &output, ///< [out] Output item - T block_prefix) ///< [in] Prefix item to be provided to thread0 - { -#if CUB_PTX_ARCH >= 300 - if (lane_id == WARP_THREADS - 1) - temp_storage[warp_id] = input; - - __syncthreads(); - - output = ShuffleUp(input, 1); - if (lane_id == 0) - { - output = (linear_tid == 0) ? - block_prefix : - temp_storage[warp_id - 1]; - } -#else - temp_storage[linear_tid] = input; - - __syncthreads(); - - output = (linear_tid == 0) ? - block_prefix : - temp_storage[linear_tid - 1]; -#endif - } - - - /** - * \brief Each thread receives the \p input provided by its predecessor. The first thread receives \p block_prefix. All threads receive the \p input provided by threadBLOCK_THREADS-1. - * - * \par - * - \smemreuse - */ - __device__ __forceinline__ void Up( - T input, ///< [in] Input item - T &output, ///< [out] Output item - T block_prefix, ///< [in] Prefix item to be provided to thread0 - T &block_suffix) ///< [out] Suffix item shifted out by the threadBLOCK_THREADS-1 to be provided to all threads - { -#if CUB_PTX_ARCH >= 300 - if (lane_id == WARP_THREADS - 1) - temp_storage[warp_id] = input; - - __syncthreads(); - - output = ShuffleUp(input, 1); - if (lane_id == 0) - { - output = (linear_tid == 0) ? - block_prefix : - temp_storage[warp_id - 1]; - } - block_suffix = temp_storage[WARPS - 1]; -#else - temp_storage[linear_tid] = input; - - __syncthreads(); - - output = (linear_tid == 0) ? - block_prefix : - temp_storage[linear_tid - 1]; - - block_suffix = temp_storage[BLOCK_THREADS - 1]; -#endif - } - - - /** - * \brief Each thread obtains the \p input provided by its successor. The last thread receives \p block_suffix. - * - * \par - * - \smemreuse - */ - __device__ __forceinline__ void Down( - T input, ///< [in] Input item - T &output, ///< [out] Output item - T block_suffix) ///< [in] Suffix item to be provided to threadBLOCK_THREADS-1 - { -#if CUB_PTX_ARCH >= 300 - if (lane_id == 0) - temp_storage[warp_id] = input; - - __syncthreads(); - - output = ShuffleDown(input, 1); - if (lane_id == WARP_THREADS - 1) - { - output = (linear_tid == BLOCK_THREADS - 1) ? - block_suffix : - temp_storage[warp_id + 1]; - } -#else - temp_storage[linear_tid] = input; - - __syncthreads(); - - output = (linear_tid == BLOCK_THREADS - 1) ? - block_suffix : - temp_storage[linear_tid + 1]; -#endif - } - - - /** - * \brief Each thread obtains the \p input provided by its successor. The last thread receives \p block_suffix. All threads receive the \p input provided by thread0. 
- * - * \par - * - \smemreuse - */ - __device__ __forceinline__ void Down( - T input, ///< [in] Input item - T &output, ///< [out] Output item - T block_suffix, ///< [in] Suffix item to be provided to threadBLOCK_THREADS-1 - T &block_prefix) ///< [out] Prefix item shifted out by the thread0 to be provided to all threads - { -#if CUB_PTX_ARCH >= 300 - if (lane_id == 0) - temp_storage[warp_id] = input; - - __syncthreads(); - - output = ShuffleDown(input, 1); - if (lane_id == WARP_THREADS - 1) - { - output = (linear_tid == BLOCK_THREADS - 1) ? - block_suffix : - temp_storage[warp_id + 1]; - } -#else - temp_storage[linear_tid] = input; - - __syncthreads(); - - output = (linear_tid == BLOCK_THREADS - 1) ? - block_suffix : - temp_storage[linear_tid + 1]; -#endif - - block_prefix = temp_storage[0]; - } - - //@} end member group - - -}; - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/SRC/cub/block/block_shuffle.cuh b/SRC/cub/block/block_shuffle.cuh new file mode 100644 index 00000000..a0cc71d2 --- /dev/null +++ b/SRC/cub/block/block_shuffle.cuh @@ -0,0 +1,305 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockShuffle class provides [collective](index.html#sec0) methods for shuffling data partitioned across a CUDA thread block. + */ + +#pragma once + +#include "../util_arch.cuh" +#include "../util_ptx.cuh" +#include "../util_macro.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief The BlockShuffle class provides [collective](index.html#sec0) methods for shuffling data partitioned across a CUDA thread block. + * \ingroup BlockModule + * + * \tparam T The data type to be exchanged. 
+ * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension
+ * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1)
+ * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1)
+ * \tparam PTX_ARCH [optional] \ptxversion
+ *
+ * \par Overview
+ * It is commonplace for blocks of threads to rearrange data items between
+ * threads. The BlockShuffle abstraction allows threads to efficiently shift items
+ * either (a) up to their successor or (b) down to their predecessor.
+ *
+ */
+ template <
+     typename T,
+     int BLOCK_DIM_X,
+     int BLOCK_DIM_Y = 1,
+     int BLOCK_DIM_Z = 1,
+     int PTX_ARCH = CUB_PTX_ARCH>
+ class BlockShuffle
+ {
+ private:
+
+     /******************************************************************************
+      * Constants
+      ******************************************************************************/
+
+     enum
+     {
+         BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
+
+         LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH),
+         WARP_THREADS = 1 << LOG_WARP_THREADS,
+         WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
+     };
+
+     /******************************************************************************
+      * Type definitions
+      ******************************************************************************/
+
+     /// Shared memory storage layout type (last element from each thread's input)
+     struct _TempStorage
+     {
+         T prev[BLOCK_THREADS];
+         T next[BLOCK_THREADS];
+     };
+
+
+ public:
+
+     /// \smemstorage{BlockShuffle}
+     struct TempStorage : Uninitialized<_TempStorage> {};
+
+ private:
+
+
+     /******************************************************************************
+      * Thread fields
+      ******************************************************************************/
+
+     /// Shared storage reference
+     _TempStorage &temp_storage;
+
+     /// Linear thread-id
+     unsigned int linear_tid;
+
+
+     /******************************************************************************
+      * Utility methods
+      ******************************************************************************/
+
+     /// Internal storage allocator
+     __device__ __forceinline__ _TempStorage& PrivateStorage()
+     {
+         __shared__ _TempStorage private_storage;
+         return private_storage;
+     }
+
+
+ public:
+
+     /******************************************************************//**
+      * \name Collective constructors
+      *********************************************************************/
+     //@{
+
+     /**
+      * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
+      */
+     __device__ __forceinline__ BlockShuffle()
+     :
+         temp_storage(PrivateStorage()),
+         linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+     {}
+
+
+     /**
+      * \brief Collective constructor using the specified memory allocation as temporary storage.
+      */
+     __device__ __forceinline__ BlockShuffle(
+         TempStorage &temp_storage)  ///< [in] Reference to memory allocation having layout type TempStorage
+     :
+         temp_storage(temp_storage.Alias()),
+         linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+     {}
+
+
+     //@}  end member group
+     /******************************************************************//**
+      * \name Shuffle movement
+      *********************************************************************/
+     //@{
+
+
+     /**
+      * \brief Each thread<sub>i</sub> obtains the \p input provided by thread<sub>i+distance</sub>. The offset \p distance may be negative.
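+      *
+      * \par
+      * A minimal usage sketch (the kernel, the 128-thread block shape, and the
+      * \p d_data argument are illustrative assumptions, not part of this header):
+      * \code
+      * __global__ void ExampleKernel(int *d_data, ...)
+      * {
+      *     // Specialize BlockShuffle for a 1D block of 128 threads on type int
+      *     typedef cub::BlockShuffle<int, 128> BlockShuffle;
+      *
+      *     // Allocate shared memory for BlockShuffle
+      *     __shared__ typename BlockShuffle::TempStorage temp_storage;
+      *
+      *     // Each thread obtains one item, then receives the item of its successor
+      *     int thread_data = d_data[threadIdx.x];
+      *     BlockShuffle(temp_storage).Offset(thread_data, thread_data, 1);
+      * }
+      * \endcode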
+      *
+      * \par
+      * - \smemreuse
+      */
+     __device__ __forceinline__ void Offset(
+         T   input,          ///< [in] The input item from the calling thread (thread<sub>i</sub>)
+         T&  output,         ///< [out] The \p input item from the successor (or predecessor) thread thread<sub>i+distance</sub> (may be aliased to \p input). This value is only updated for thread<sub>i</sub> when 0 <= (i + \p distance) < BLOCK_THREADS
+         int distance = 1)   ///< [in] Offset distance (may be negative)
+     {
+         temp_storage.prev[linear_tid] = input;
+
+         CTA_SYNC();
+
+         // Use a signed offset so that negative distances behave correctly
+         int offset = (int) linear_tid + distance;
+         if ((offset >= 0) && (offset < BLOCK_THREADS))
+             output = temp_storage.prev[offset];
+     }
+
+
+     /**
+      * \brief Each thread<sub>i</sub> obtains the \p input provided by thread<sub>(i+distance)%BLOCK_THREADS</sub>.
+      *
+      * \par
+      * - \smemreuse
+      */
+     __device__ __forceinline__ void Rotate(
+         T   input,                  ///< [in] The calling thread's input item
+         T&  output,                 ///< [out] The \p input item from thread<sub>(i+distance)%BLOCK_THREADS</sub> (may be aliased to \p input)
+         unsigned int distance = 1)  ///< [in] Offset distance (0 < \p distance < BLOCK_THREADS)
+     {
+         temp_storage.prev[linear_tid] = input;
+
+         CTA_SYNC();
+
+         // Wrap the offset around the end of the thread block
+         unsigned int offset = linear_tid + distance;
+         if (offset >= BLOCK_THREADS)
+             offset -= BLOCK_THREADS;
+
+         output = temp_storage.prev[offset];
+     }
+
+
+     /**
+      * \brief The thread block rotates its [blocked arrangement](index.html#sec5sec3) of \p input items, shifting it up by one item
+      *
+      * \par
+      * - \blocked
+      * - \granularity
+      * - \smemreuse
+      */
+     template <int ITEMS_PER_THREAD>
+     __device__ __forceinline__ void Up(
+         T (&input)[ITEMS_PER_THREAD],   ///< [in] The calling thread's input items
+         T (&prev)[ITEMS_PER_THREAD])    ///< [out] The corresponding predecessor items (may be aliased to \p input). The item \p prev[0] is not updated for thread<sub>0</sub>.
+     {
+         temp_storage.prev[linear_tid] = input[ITEMS_PER_THREAD - 1];
+
+         CTA_SYNC();
+
+         #pragma unroll
+         for (int ITEM = ITEMS_PER_THREAD - 1; ITEM > 0; --ITEM)
+             prev[ITEM] = input[ITEM - 1];
+
+         if (linear_tid > 0)
+             prev[0] = temp_storage.prev[linear_tid - 1];
+     }
+
+
+     /**
+      * \brief The thread block rotates its [blocked arrangement](index.html#sec5sec3) of \p input items, shifting it up by one item. All threads receive the \p input provided by thread<sub>BLOCK_THREADS-1</sub>.
+      *
+      * \par
+      * - \blocked
+      * - \granularity
+      * - \smemreuse
+      */
+     template <int ITEMS_PER_THREAD>
+     __device__ __forceinline__ void Up(
+         T (&input)[ITEMS_PER_THREAD],   ///< [in] The calling thread's input items
+         T (&prev)[ITEMS_PER_THREAD],    ///< [out] The corresponding predecessor items (may be aliased to \p input). The item \p prev[0] is not updated for thread<sub>0</sub>.
+         T &block_suffix)                ///< [out] The item \p input[ITEMS_PER_THREAD-1] from thread<sub>BLOCK_THREADS-1</sub>, provided to all threads
+     {
+         Up(input, prev);
+         block_suffix = temp_storage.prev[BLOCK_THREADS - 1];
+     }
+
+
+     /**
+      * \brief The thread block rotates its [blocked arrangement](index.html#sec5sec3) of \p input items, shifting it down by one item
+      *
+      * \par
+      * - \blocked
+      * - \granularity
+      * - \smemreuse
+      */
+     template <int ITEMS_PER_THREAD>
+     __device__ __forceinline__ void Down(
+         T (&input)[ITEMS_PER_THREAD],   ///< [in] The calling thread's input items
+         T (&prev)[ITEMS_PER_THREAD])    ///< [out] The corresponding successor items (may be aliased to \p input). The item \p prev[ITEMS_PER_THREAD-1] is not updated for thread<sub>BLOCK_THREADS-1</sub>.
+     {
+         // Share this thread's first item so the predecessor thread can read it
+         temp_storage.prev[linear_tid] = input[0];
+
+         CTA_SYNC();
+
+         // Shift the local array down by one item
+         #pragma unroll
+         for (int ITEM = 0; ITEM < ITEMS_PER_THREAD - 1; ++ITEM)
+             prev[ITEM] = input[ITEM + 1];
+
+         // The last item is obtained from the successor thread
+         if (linear_tid < BLOCK_THREADS - 1)
+             prev[ITEMS_PER_THREAD - 1] = temp_storage.prev[linear_tid + 1];
+     }
+
+
+     /**
+      * \brief The thread block rotates its [blocked arrangement](index.html#sec5sec3) of input items, shifting it down by one item. All threads receive \p input[0] provided by thread<sub>0</sub>.
+      *
+      * \par
+      * - \blocked
+      * - \granularity
+      * - \smemreuse
+      */
+     template <int ITEMS_PER_THREAD>
+     __device__ __forceinline__ void Down(
+         T (&input)[ITEMS_PER_THREAD],   ///< [in] The calling thread's input items
+         T (&prev)[ITEMS_PER_THREAD],    ///< [out] The corresponding successor items (may be aliased to \p input). The item \p prev[ITEMS_PER_THREAD-1] is not updated for thread<sub>BLOCK_THREADS-1</sub>.
+         T &block_prefix)                ///< [out] The item \p input[0] from thread<sub>0</sub>, provided to all threads
+     {
+         Down(input, prev);
+         block_prefix = temp_storage.prev[0];
+     }
+
+     //@}  end member group
+
+
+ };
+
+ }               // CUB namespace
+ CUB_NS_POSTFIX  // Optional outer namespace(s)
+
diff --git a/SRC/cub/block/block_store.cuh b/SRC/cub/block/block_store.cuh
index 066541ad..648bf9ff 100644
--- a/SRC/cub/block/block_store.cuh
+++ b/SRC/cub/block/block_store.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill. All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -65,22 +65,24 @@
  *
  * \tparam T                    [inferred] The data type to store.
  * \tparam ITEMS_PER_THREAD     [inferred] The number of consecutive items partitioned onto each thread.
- * \tparam OutputIterator       [inferred] The random-access iterator type for output \iterator.
+ * \tparam OutputIteratorT      [inferred] The random-access iterator type for output \iterator.
  */
 template <
     typename T,
     int ITEMS_PER_THREAD,
-    typename OutputIterator>
+    typename OutputIteratorT>
 __device__ __forceinline__ void StoreDirectBlocked(
     int linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks)
-    OutputIterator block_itr,       ///< [in] The thread block's base output iterator for storing to
+    OutputIteratorT block_itr,      ///< [in] The thread block's base output iterator for storing to
     T (&items)[ITEMS_PER_THREAD])   ///< [in] Data to store
 {
+    OutputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD);
+
     // Store directly in thread-blocked order
     #pragma unroll
     for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
     {
-        block_itr[(linear_tid * ITEMS_PER_THREAD) + ITEM] = items[ITEM];
+        thread_itr[ITEM] = items[ITEM];
     }
 }
 
@@ -92,25 +94,27 @@ __device__ __forceinline__ void StoreDirectBlocked(
  *
  * \tparam T                    [inferred] The data type to store.
  * \tparam ITEMS_PER_THREAD     [inferred] The number of consecutive items partitioned onto each thread.
- * \tparam OutputIterator       [inferred] The random-access iterator type for output \iterator.
+ * \tparam OutputIteratorT      [inferred] The random-access iterator type for output \iterator.
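+ * \par
+ * A minimal guarded-store sketch (the kernel, \p d_out, and \p valid_items
+ * names are illustrative assumptions, not part of this header):
+ * \code
+ * __global__ void ExampleKernel(int *d_out, int valid_items)
+ * {
+ *     int items[4];
+ *     ...
+ *
+ *     // Write only the first valid_items items contributed by this thread block
+ *     cub::StoreDirectBlocked(threadIdx.x, d_out, items, valid_items);
+ * }
+ * \endcode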
 */
 template <
     typename T,
     int ITEMS_PER_THREAD,
-    typename OutputIterator>
+    typename OutputIteratorT>
 __device__ __forceinline__ void StoreDirectBlocked(
     int linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks)
-    OutputIterator block_itr,       ///< [in] The thread block's base output iterator for storing to
+    OutputIteratorT block_itr,      ///< [in] The thread block's base output iterator for storing to
     T (&items)[ITEMS_PER_THREAD],   ///< [in] Data to store
     int valid_items)                ///< [in] Number of valid items to write
 {
+    OutputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD);
+
     // Store directly in thread-blocked order
     #pragma unroll
     for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
     {
         if (ITEM + (linear_tid * ITEMS_PER_THREAD) < valid_items)
         {
-            block_itr[(linear_tid * ITEMS_PER_THREAD) + ITEM] = items[ITEM];
+            thread_itr[ITEM] = items[ITEM];
         }
     }
 }
 
@@ -158,7 +162,7 @@ __device__ __forceinline__ void StoreDirectBlockedVectorized(
     typedef typename CubVector<T, VEC_SIZE>::Type Vector;
 
     // Alias global pointer
-    Vector *block_ptr_vectors = reinterpret_cast<Vector*>(block_ptr);
+    Vector *block_ptr_vectors = reinterpret_cast<Vector*>(const_cast<T*>(block_ptr));
 
     // Alias pointers (use "raw" array here which should get optimized away to prevent conservative PTXAS lmem spilling)
     Vector raw_vector[VECTORS_PER_THREAD];
@@ -192,23 +196,25 @@ __device__ __forceinline__ void StoreDirectBlockedVectorized(
  * \tparam BLOCK_THREADS        The thread block size in threads
  * \tparam T                    [inferred] The data type to store.
  * \tparam ITEMS_PER_THREAD     [inferred] The number of consecutive items partitioned onto each thread.
- * \tparam OutputIterator       [inferred] The random-access iterator type for output \iterator.
+ * \tparam OutputIteratorT      [inferred] The random-access iterator type for output \iterator.
  */
 template <
     int BLOCK_THREADS,
     typename T,
     int ITEMS_PER_THREAD,
-    typename OutputIterator>
+    typename OutputIteratorT>
 __device__ __forceinline__ void StoreDirectStriped(
     int linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks)
-    OutputIterator block_itr,       ///< [in] The thread block's base output iterator for storing to
+    OutputIteratorT block_itr,      ///< [in] The thread block's base output iterator for storing to
     T (&items)[ITEMS_PER_THREAD])   ///< [in] Data to store
 {
+    OutputIteratorT thread_itr = block_itr + linear_tid;
+
     // Store directly in striped order
     #pragma unroll
     for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
     {
-        block_itr[(ITEM * BLOCK_THREADS) + linear_tid] = items[ITEM];
+        thread_itr[(ITEM * BLOCK_THREADS)] = items[ITEM];
     }
 }
 
@@ -221,26 +227,28 @@ __device__ __forceinline__ void StoreDirectStriped(
  * \tparam BLOCK_THREADS        The thread block size in threads
  * \tparam T                    [inferred] The data type to store.
  * \tparam ITEMS_PER_THREAD     [inferred] The number of consecutive items partitioned onto each thread.
- * \tparam OutputIterator       [inferred] The random-access iterator type for output \iterator.
+ * \tparam OutputIteratorT      [inferred] The random-access iterator type for output \iterator.
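+ * \par
+ * A minimal guarded sketch (the \p d_out and \p valid_items names are
+ * illustrative assumptions, not part of this header); each of 128 threads
+ * contributes four items, and writes are masked against \p valid_items:
+ * \code
+ * int items[4];
+ * ...
+ * cub::StoreDirectStriped<128>(threadIdx.x, d_out, items, valid_items);
+ * \endcode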
 */
 template <
     int BLOCK_THREADS,
     typename T,
     int ITEMS_PER_THREAD,
-    typename OutputIterator>
+    typename OutputIteratorT>
 __device__ __forceinline__ void StoreDirectStriped(
     int linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks)
-    OutputIterator block_itr,       ///< [in] The thread block's base output iterator for storing to
+    OutputIteratorT block_itr,      ///< [in] The thread block's base output iterator for storing to
     T (&items)[ITEMS_PER_THREAD],   ///< [in] Data to store
     int valid_items)                ///< [in] Number of valid items to write
 {
+    OutputIteratorT thread_itr = block_itr + linear_tid;
+
     // Store directly in striped order
     #pragma unroll
     for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
     {
         if ((ITEM * BLOCK_THREADS) + linear_tid < valid_items)
         {
-            block_itr[(ITEM * BLOCK_THREADS) + linear_tid] = items[ITEM];
+            thread_itr[(ITEM * BLOCK_THREADS)] = items[ITEM];
         }
     }
 }
 
@@ -264,26 +272,28 @@ __device__ __forceinline__ void StoreDirectStriped(
  *
  * \tparam T                    [inferred] The data type to store.
  * \tparam ITEMS_PER_THREAD     [inferred] The number of consecutive items partitioned onto each thread.
- * \tparam OutputIterator       [inferred] The random-access iterator type for output \iterator.
+ * \tparam OutputIteratorT      [inferred] The random-access iterator type for output \iterator.
 */
 template <
     typename T,
     int ITEMS_PER_THREAD,
-    typename OutputIterator>
+    typename OutputIteratorT>
 __device__ __forceinline__ void StoreDirectWarpStriped(
     int linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks)
-    OutputIterator block_itr,       ///< [in] The thread block's base output iterator for storing to
+    OutputIteratorT block_itr,      ///< [in] The thread block's base output iterator for storing to
     T (&items)[ITEMS_PER_THREAD])   ///< [in] Data to store
 {
     int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1);
     int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
     int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;
 
+    OutputIteratorT thread_itr = block_itr + warp_offset + tid;
+
     // Store directly in warp-striped order
     #pragma unroll
     for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
     {
-        block_itr[warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM];
+        thread_itr[(ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM];
     }
 }
 
@@ -298,15 +308,15 @@ __device__ __forceinline__ void StoreDirectWarpStriped(
  *
  * \tparam T                    [inferred] The data type to store.
  * \tparam ITEMS_PER_THREAD     [inferred] The number of consecutive items partitioned onto each thread.
- * \tparam OutputIterator       [inferred] The random-access iterator type for output \iterator.
+ * \tparam OutputIteratorT      [inferred] The random-access iterator type for output \iterator.
 */
 template <
     typename T,
     int ITEMS_PER_THREAD,
-    typename OutputIterator>
+    typename OutputIteratorT>
 __device__ __forceinline__ void StoreDirectWarpStriped(
     int linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks)
-    OutputIterator block_itr,       ///< [in] The thread block's base output iterator for storing to
+    OutputIteratorT block_itr,      ///< [in] The thread block's base output iterator for storing to
     T (&items)[ITEMS_PER_THREAD],   ///< [in] Data to store
     int valid_items)                ///< [in] Number of valid items to write
 {
     int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1);
     int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
     int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;
 
+    OutputIteratorT thread_itr = block_itr + warp_offset + tid;
+
     // Store directly in warp-striped order
     #pragma unroll
     for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
     {
         if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items)
         {
-            block_itr[warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM];
+            thread_itr[(ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM];
         }
     }
 }
 
@@ -345,8 +357,7 @@ enum BlockStoreAlgorithm
      * \par Overview
      *
      * A [blocked arrangement](index.html#sec5sec3) of data is written
-     * directly to memory. The thread block writes items in a parallel "raking" fashion:
-     * thread<sub>i</sub> writes the i<sup>th</sup> segment of consecutive elements.
+     * directly to memory.
      *
      * \par Performance Considerations
      * - The utilization of memory transactions (coalescing) decreases as the
@@ -359,10 +370,8 @@ enum BlockStoreAlgorithm
      *
     * A [blocked arrangement](index.html#sec5sec3) of data is written directly
     * to memory using CUDA's built-in vectorized stores as a coalescing optimization.
-    * The thread block writes items in a parallel "raking" fashion: thread<sub>i</sub> uses vector stores to
-    * write the i<sup>th</sup> segment of consecutive elements.
-    *
-    * For example, st.global.v4.s32 instructions will be generated when \p T = \p int and \p ITEMS_PER_THREAD > 4.
+    * For example, st.global.v4.s32 instructions will be generated
+    * when \p T = \p int and \p ITEMS_PER_THREAD % 4 == 0.
     *
    * \par Performance Considerations
    * - The utilization of memory transactions (coalescing) remains high until the
    *   access stride between threads (i.e., the number of items per thread) exceeds the
    *   maximum vector store width (typically 4 items or 64B, whichever is lower).
    * - The following conditions will prevent vectorization and writing will fall back to cub::BLOCK_STORE_DIRECT:
    *   - \p ITEMS_PER_THREAD is odd
-   *   - The \p OutputIterator is not a simple pointer type
+   *   - The \p OutputIteratorT is not a simple pointer type
    *   - The block output offset is not quadword-aligned
    *   - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.)
    */
@@ -379,13 +388,7 @@
    /**
     * \par Overview
     * A [blocked arrangement](index.html#sec5sec3) is locally
-    * transposed into a [striped arrangement](index.html#sec5sec3)
-    * which is then written to memory. More specifically, cub::BlockExchange
-    * used to locally reorder the items into a
-    * [striped arrangement](index.html#sec5sec3), after which the
-    * thread block writes items in a parallel "strip-mining" fashion: consecutive
-    * items owned by thread<sub>i</sub> are written to memory with
-    * stride \p BLOCK_THREADS between them.
+    * transposed and then efficiently written to memory as a [striped arrangement](index.html#sec5sec3).
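+    * \par
+    * For example, a transposing store of four ints per thread across 128
+    * threads may be requested as follows (the block shape and item type are
+    * illustrative assumptions):
+    * \code
+    * typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_TRANSPOSE> BlockStore;
+    * \endcode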
* * \par Performance Considerations * - The utilization of memory transactions (coalescing) remains high regardless @@ -398,13 +401,11 @@ enum BlockStoreAlgorithm /** * \par Overview * A [blocked arrangement](index.html#sec5sec3) is locally - * transposed into a [warp-striped arrangement](index.html#sec5sec3) - * which is then written to memory. More specifically, cub::BlockExchange used - * to locally reorder the items into a - * [warp-striped arrangement](index.html#sec5sec3), after which - * each warp writes its own contiguous segment in a parallel "strip-mining" fashion: - * consecutive items owned by lanei are written to memory - * with stride \p WARP_THREADS between them. + * transposed and then efficiently written to memory as a + * [warp-striped arrangement](index.html#sec5sec3) + * + * \par Usage Considerations + * - BLOCK_THREADS must be a multiple of WARP_THREADS * * \par Performance Considerations * - The utilization of memory transactions (coalescing) remains high regardless @@ -413,6 +414,26 @@ enum BlockStoreAlgorithm * direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives. */ BLOCK_STORE_WARP_TRANSPOSE, + + /** + * \par Overview + * A [blocked arrangement](index.html#sec5sec3) is locally + * transposed and then efficiently written to memory as a + * [warp-striped arrangement](index.html#sec5sec3) + * To reduce the shared memory requirement, only one warp's worth of shared + * memory is provisioned and is subsequently time-sliced among warps. + * + * \par Usage Considerations + * - BLOCK_THREADS must be a multiple of WARP_THREADS + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high regardless + * of items written per thread. + * - Provisions less shared memory temporary storage, but incurs larger + * latencies than the BLOCK_STORE_WARP_TRANSPOSE alternative. + */ + BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, + }; @@ -421,7 +442,7 @@ enum BlockStoreAlgorithm * \ingroup BlockModule * \ingroup UtilIo * - * \tparam OutputIterator The input iterator type \iterator. + * \tparam T The type of data to be written. * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension * \tparam ITEMS_PER_THREAD The number of consecutive items partitioned onto each thread. * \tparam ALGORITHM [optional] cub::BlockStoreAlgorithm tuning policy enumeration. default: cub::BLOCK_STORE_DIRECT. @@ -463,7 +484,7 @@ enum BlockStoreAlgorithm * __global__ void ExampleKernel(int *d_data, ...) 
* { * // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each - * typedef cub::BlockStore BlockStore; + * typedef cub::BlockStore BlockStore; * * // Allocate shared memory for BlockStore * __shared__ typename BlockStore::TempStorage temp_storage; @@ -484,11 +505,10 @@ enum BlockStoreAlgorithm * */ template < - typename OutputIterator, + typename T, int BLOCK_DIM_X, int ITEMS_PER_THREAD, BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, - bool WARP_TIME_SLICING = false, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH> @@ -506,9 +526,6 @@ private: BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, }; - // Data type of input iterator - typedef typename std::iterator_traits::value_type T; - /****************************************************************************** * Algorithmic variants @@ -533,23 +550,25 @@ private: /// Constructor __device__ __forceinline__ StoreInternal( - TempStorage &temp_storage, + TempStorage &/*temp_storage*/, int linear_tid) : linear_tid(linear_tid) {} /// Store items into a linear segment of memory + template __device__ __forceinline__ void Store( - OutputIterator block_itr, ///< [in] The thread block's base output iterator for storing to + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store { StoreDirectBlocked(linear_tid, block_itr, items); } /// Store items into a linear segment of memory, guarded by range + template __device__ __forceinline__ void Store( - OutputIterator block_itr, ///< [in] The thread block's base output iterator for storing to + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store int valid_items) ///< [in] Number of valid items to write { @@ -572,7 +591,7 @@ private: /// Constructor __device__ __forceinline__ StoreInternal( - TempStorage &temp_storage, + TempStorage &/*temp_storage*/, int linear_tid) : linear_tid(linear_tid) @@ -587,17 +606,18 @@ private: } /// Store items into a linear segment of memory, specialized for opaque input iterators (skips vectorization) - template + template __device__ __forceinline__ void Store( - _OutputIterator block_itr, ///< [in] The thread block's base output iterator for storing to + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store { StoreDirectBlocked(linear_tid, block_itr, items); } /// Store items into a linear segment of memory, guarded by range + template __device__ __forceinline__ void Store( - OutputIterator block_itr, ///< [in] The thread block's base output iterator for storing to + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store int valid_items) ///< [in] Number of valid items to write { @@ -613,10 +633,14 @@ private: struct StoreInternal { // BlockExchange utility type for keys - typedef BlockExchange BlockExchange; + typedef BlockExchange BlockExchange; /// Shared memory storage layout type - typedef typename BlockExchange::TempStorage _TempStorage; + struct _TempStorage : BlockExchange::TempStorage + { + /// Temporary storage for partially-full block guard + volatile int valid_items; + }; /// Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; @@ -637,8 +661,9 @@ private: {} /// Store items into a linear segment of 
memory + template __device__ __forceinline__ void Store( - OutputIterator block_itr, ///< [in] The thread block's base output iterator for storing to + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store { BlockExchange(temp_storage).BlockedToStriped(items); @@ -646,13 +671,17 @@ private: } /// Store items into a linear segment of memory, guarded by range + template __device__ __forceinline__ void Store( - OutputIterator block_itr, ///< [in] The thread block's base output iterator for storing to + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store int valid_items) ///< [in] Number of valid items to write { BlockExchange(temp_storage).BlockedToStriped(items); - StoreDirectStriped(linear_tid, block_itr, items, valid_items); + if (linear_tid == 0) + temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads + CTA_SYNC(); + StoreDirectStriped(linear_tid, block_itr, items, temp_storage.valid_items); } }; @@ -672,10 +701,82 @@ private: CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); // BlockExchange utility type for keys - typedef BlockExchange BlockExchange; + typedef BlockExchange BlockExchange; /// Shared memory storage layout type - typedef typename BlockExchange::TempStorage _TempStorage; + struct _TempStorage : BlockExchange::TempStorage + { + /// Temporary storage for partially-full block guard + volatile int valid_items; + }; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ StoreInternal( + TempStorage &temp_storage, + int linear_tid) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + /// Store items into a linear segment of memory + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + BlockExchange(temp_storage).BlockedToWarpStriped(items); + StoreDirectWarpStriped(linear_tid, block_itr, items); + } + + /// Store items into a linear segment of memory, guarded by range + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write + { + BlockExchange(temp_storage).BlockedToWarpStriped(items); + if (linear_tid == 0) + temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads + CTA_SYNC(); + StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items); + } + }; + + + /** + * BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED specialization of store helper + */ + template + struct StoreInternal + { + enum + { + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH) + }; + + // Assert BLOCK_THREADS must be a multiple of WARP_THREADS + CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); + + // BlockExchange utility type for keys + typedef BlockExchange BlockExchange; + + 
/// Shared memory storage layout type + struct _TempStorage : BlockExchange::TempStorage + { + /// Temporary storage for partially-full block guard + volatile int valid_items; + }; /// Alias wrapper allowing storage to be unioned struct TempStorage : Uninitialized<_TempStorage> {}; @@ -696,8 +797,9 @@ private: {} /// Store items into a linear segment of memory + template __device__ __forceinline__ void Store( - OutputIterator block_itr, ///< [in] The thread block's base output iterator for storing to + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store { BlockExchange(temp_storage).BlockedToWarpStriped(items); @@ -705,13 +807,17 @@ private: } /// Store items into a linear segment of memory, guarded by range + template __device__ __forceinline__ void Store( - OutputIterator block_itr, ///< [in] The thread block's base output iterator for storing to + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store int valid_items) ///< [in] Number of valid items to write { BlockExchange(temp_storage).BlockedToWarpStriped(items); - StoreDirectWarpStriped(linear_tid, block_itr, items, valid_items); + if (linear_tid == 0) + temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads + CTA_SYNC(); + StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items); } }; @@ -809,7 +915,7 @@ public: * __global__ void ExampleKernel(int *d_data, ...) * { * // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each - * typedef cub::BlockStore BlockStore; + * typedef cub::BlockStore BlockStore; * * // Allocate shared memory for BlockStore * __shared__ typename BlockStore::TempStorage temp_storage; @@ -829,8 +935,9 @@ public: * The output \p d_data will be 0, 1, 2, 3, 4, 5, .... * */ + template __device__ __forceinline__ void Store( - OutputIterator block_itr, ///< [in] The thread block's base output iterator for storing to + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store { InternalStore(temp_storage, linear_tid).Store(block_itr, items); @@ -856,7 +963,7 @@ public: * __global__ void ExampleKernel(int *d_data, int valid_items, ...) * { * // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each - * typedef cub::BlockStore BlockStore; + * typedef cub::BlockStore BlockStore; * * // Allocate shared memory for BlockStore * __shared__ typename BlockStore::TempStorage temp_storage; @@ -877,8 +984,9 @@ public: * only the first two threads being unmasked to store portions of valid data. 
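
// Editor's note: a compilable sketch of the guarded-store example documented
// above (buffer and kernel names are hypothetical). Only the first
// valid_items of the block's 512 items are written to d_data:

#include <cub/cub.cuh>

__global__ void GuardedStoreKernel(int *d_data, int valid_items)
{
    // Specialize BlockStore for a 1D block of 128 threads owning 4 ints each
    typedef cub::BlockStore<int, 128, 4, cub::BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
    __shared__ typename BlockStore::TempStorage temp_storage;

    // Obtain a blocked arrangement of items per thread (filled for the sketch)
    int thread_data[4];
    for (int i = 0; i < 4; ++i)
        thread_data[i] = threadIdx.x * 4 + i;

    // Store items to device memory, guarded by valid_items
    BlockStore(temp_storage).Store(d_data, thread_data, valid_items);
}
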
* */ + template __device__ __forceinline__ void Store( - OutputIterator block_itr, ///< [in] The thread block's base output iterator for storing to + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store int valid_items) ///< [in] Number of valid items to write { diff --git a/SRC/cub/block/specializations/block_histogram_atomic.cuh b/SRC/cub/block/specializations/block_histogram_atomic.cuh index ec4159ee..29db0df7 100644 --- a/SRC/cub/block/specializations/block_histogram_atomic.cuh +++ b/SRC/cub/block/specializations/block_histogram_atomic.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -61,11 +61,11 @@ struct BlockHistogramAtomic /// Composite data onto an existing histogram template < typename T, - typename HistoCounter, + typename CounterT, int ITEMS_PER_THREAD> __device__ __forceinline__ void Composite( T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram - HistoCounter histogram[BINS]) ///< [out] Reference to shared/global memory histogram + CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram { // Update histogram #pragma unroll diff --git a/SRC/cub/block/specializations/block_histogram_sort.cuh b/SRC/cub/block/specializations/block_histogram_sort.cuh index 12766ae5..9ef417ad 100644 --- a/SRC/cub/block/specializations/block_histogram_sort.cuh +++ b/SRC/cub/block/specializations/block_histogram_sort.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -75,7 +75,7 @@ struct BlockHistogramSort 4, (PTX_ARCH >= 350) ? true : false, BLOCK_SCAN_WARP_SCANS, - (PTX_ARCH >= 350) ? 
cudaSharedMemBankSizeEightByte : cudaSharedMemBankSizeFourByte, + cudaSharedMemBankSizeFourByte, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> @@ -114,7 +114,7 @@ struct BlockHistogramSort // Thread fields _TempStorage &temp_storage; - int linear_tid; + unsigned int linear_tid; /// Constructor @@ -138,7 +138,7 @@ struct BlockHistogramSort {} // Discontinuity predicate - __device__ __forceinline__ bool operator()(const T &a, const T &b, unsigned int b_index) + __device__ __forceinline__ bool operator()(const T &a, const T &b, int b_index) { if (a != b) { @@ -158,17 +158,17 @@ struct BlockHistogramSort // Composite data onto an existing histogram template < - typename HistoCounter> + typename CounterT > __device__ __forceinline__ void Composite( T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram - HistoCounter histogram[BINS]) ///< [out] Reference to shared/global memory histogram + CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram { enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD }; // Sort bytes in blocked arrangement BlockRadixSortT(temp_storage.sort).Sort(items); - __syncthreads(); + CTA_SYNC(); // Initialize the shared memory's run_begin and run_end for each bin int histo_offset = 0; @@ -186,7 +186,7 @@ struct BlockHistogramSort temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE; } - __syncthreads(); + CTA_SYNC(); int flags[ITEMS_PER_THREAD]; // unused @@ -197,7 +197,7 @@ struct BlockHistogramSort // Update begin for first item if (linear_tid == 0) temp_storage.run_begin[items[0]] = 0; - __syncthreads(); + CTA_SYNC(); // Composite into histogram histo_offset = 0; @@ -206,7 +206,7 @@ struct BlockHistogramSort for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) { int thread_offset = histo_offset + linear_tid; - HistoCounter count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset]; + CounterT count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset]; histogram[thread_offset] += count; } @@ -214,7 +214,7 @@ struct BlockHistogramSort if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS)) { int thread_offset = histo_offset + linear_tid; - HistoCounter count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset]; + CounterT count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset]; histogram[thread_offset] += count; } } diff --git a/SRC/cub/block/specializations/block_reduce_raking.cuh b/SRC/cub/block/specializations/block_reduce_raking.cuh index 3bddce65..aff97fc9 100644 --- a/SRC/cub/block/specializations/block_reduce_raking.cuh +++ b/SRC/cub/block/specializations/block_reduce_raking.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -102,10 +102,10 @@ struct BlockReduceRaking /// Shared memory storage layout type - struct _TempStorage + union _TempStorage { typename WarpReduce::TempStorage warp_storage; ///< Storage for warp-synchronous reduction - typename BlockRakingLayout::TempStorage raking_grid; ///< Padded threadblock raking grid + typename BlockRakingLayout::TempStorage raking_grid; ///< Padded thread block raking grid }; @@ -115,7 +115,7 @@ struct BlockReduceRaking // Thread fields _TempStorage &temp_storage; - int linear_tid; + unsigned int linear_tid; /// Constructor @@ -127,79 +127,39 @@ struct BlockReduceRaking {} - template + template __device__ __forceinline__ T RakingReduction( ReductionOp reduction_op, ///< [in] Binary scan operator T *raking_segment, T partial, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) - Int2Type iteration) + Int2Type /*iteration*/) { // Update partial if addend is in range - if ((FULL_TILE && RAKING_UNGUARDED) || ((linear_tid * SEGMENT_LENGTH) + ITERATION < num_valid)) + if ((IS_FULL_TILE && RAKING_UNGUARDED) || ((linear_tid * SEGMENT_LENGTH) + ITERATION < num_valid)) { T addend = raking_segment[ITERATION]; partial = reduction_op(partial, addend); } - return RakingReduction(reduction_op, raking_segment, partial, num_valid, Int2Type()); + return RakingReduction(reduction_op, raking_segment, partial, num_valid, Int2Type()); } - template + template __device__ __forceinline__ T RakingReduction( - ReductionOp reduction_op, ///< [in] Binary scan operator - T *raking_segment, + ReductionOp /*reduction_op*/, ///< [in] Binary scan operator + T * /*raking_segment*/, T partial, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items - int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) - Int2Type iteration) + int /*num_valid*/, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + Int2Type /*iteration*/) { return partial; } - /// Computes a threadblock-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. - template - __device__ __forceinline__ T Sum( - T partial, ///< [in] Calling thread's input partial reductions - int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) - { - cub::Sum reduction_op; - - if (WARP_SYNCHRONOUS) - { - // Short-circuit directly to warp synchronous reduction (unguarded if active threads is a power-of-two) - partial = WarpReduce(temp_storage.warp_storage).template Sum( - partial, - num_valid); - } - else - { - // Place partial into shared memory grid. - *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid) = partial; - - __syncthreads(); - - // Reduce parallelism to one warp - if (linear_tid < RAKING_THREADS) - { - // Raking reduction in grid - T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); - partial = raking_segment[0]; - - partial = RakingReduction(reduction_op, raking_segment, partial, num_valid, Int2Type<1>()); - - partial = WarpReduce(temp_storage.warp_storage).template Sum( - partial, - num_valid); - } - } - - return partial; - } - - /// Computes a threadblock-wide reduction using the specified reduction operator. 
The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. + /// Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. template < - bool FULL_TILE, + bool IS_FULL_TILE, typename ReductionOp> __device__ __forceinline__ T Reduce( T partial, ///< [in] Calling thread's input partial reductions @@ -209,7 +169,7 @@ struct BlockReduceRaking if (WARP_SYNCHRONOUS) { // Short-circuit directly to warp synchronous reduction (unguarded if active threads is a power-of-two) - partial = WarpReduce(temp_storage.warp_storage).template Reduce( + partial = WarpReduce(temp_storage.warp_storage).template Reduce( partial, num_valid, reduction_op); @@ -219,7 +179,7 @@ struct BlockReduceRaking // Place partial into shared memory grid. *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid) = partial; - __syncthreads(); + CTA_SYNC(); // Reduce parallelism to one warp if (linear_tid < RAKING_THREADS) @@ -228,18 +188,37 @@ struct BlockReduceRaking T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); partial = raking_segment[0]; - partial = RakingReduction(reduction_op, raking_segment, partial, num_valid, Int2Type<1>()); + partial = RakingReduction(reduction_op, raking_segment, partial, num_valid, Int2Type<1>()); + + int valid_raking_threads = (IS_FULL_TILE) ? + RAKING_THREADS : + (num_valid + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH; - partial = WarpReduce(temp_storage.warp_storage).template Reduce( + partial = WarpReduce(temp_storage.warp_storage).template Reduce( partial, - num_valid, + valid_raking_threads, reduction_op); + } } return partial; } + + /// Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. + template + __device__ __forceinline__ T Sum( + T partial, ///< [in] Calling thread's input partial reductions + int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + { + cub::Sum reduction_op; + + return Reduce(partial, num_valid, reduction_op); + } + + + }; } // CUB namespace diff --git a/SRC/cub/block/specializations/block_reduce_raking_commutative_only.cuh b/SRC/cub/block/specializations/block_reduce_raking_commutative_only.cuh index d0d73678..454fdafa 100644 --- a/SRC/cub/block/specializations/block_reduce_raking_commutative_only.cuh +++ b/SRC/cub/block/specializations/block_reduce_raking_commutative_only.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
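
// Editor's note: RakingReduction above unrolls a thread's raking segment at
// compile time by recursing on Int2Type<ITERATION> until the
// Int2Type<SEGMENT_LENGTH> base case. A minimal host-side sketch of that
// idiom (hypothetical names, SEGMENT_LENGTH fixed to 4 for illustration):

#include <cstdio>

template <int N> struct Int2TypeSketch { enum { VALUE = N }; };

enum { SEGMENT_LENGTH = 4 };  // length of one thread's raking segment

// Base case: past the end of the segment, return the folded partial.
inline int SegmentSum(const int *, int partial, Int2TypeSketch<SEGMENT_LENGTH>)
{
    return partial;
}

// Recursive case: fold in element ITERATION, then recurse at compile time.
template <int ITERATION>
inline int SegmentSum(const int *seg, int partial, Int2TypeSketch<ITERATION>)
{
    return SegmentSum(seg, partial + seg[ITERATION], Int2TypeSketch<ITERATION + 1>());
}

int main()
{
    int seg[SEGMENT_LENGTH] = {1, 2, 3, 4};
    printf("%d\n", SegmentSum(seg, 0, Int2TypeSketch<0>()));  // prints 10
    return 0;
}
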
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -93,17 +93,14 @@ struct BlockReduceRakingCommutativeOnly typedef BlockRakingLayout BlockRakingLayout; /// Shared memory storage layout type - struct _TempStorage + union _TempStorage { - union + struct { - struct - { - typename WarpReduce::TempStorage warp_storage; ///< Storage for warp-synchronous reduction - typename BlockRakingLayout::TempStorage raking_grid; ///< Padded threadblock raking grid - }; - typename FallBack::TempStorage fallback_storage; ///< Fall-back storage for non-commutative block scan + typename WarpReduce::TempStorage warp_storage; ///< Storage for warp-synchronous reduction + typename BlockRakingLayout::TempStorage raking_grid; ///< Padded thread block raking grid }; + typename FallBack::TempStorage fallback_storage; ///< Fall-back storage for non-commutative block scan }; @@ -113,7 +110,7 @@ struct BlockReduceRakingCommutativeOnly // Thread fields _TempStorage &temp_storage; - int linear_tid; + unsigned int linear_tid; /// Constructor @@ -125,7 +122,7 @@ struct BlockReduceRakingCommutativeOnly {} - /// Computes a threadblock-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. + /// Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. template __device__ __forceinline__ T Sum( T partial, ///< [in] Calling thread's input partial reductions @@ -141,14 +138,14 @@ struct BlockReduceRakingCommutativeOnly if (linear_tid >= RAKING_THREADS) *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid - RAKING_THREADS) = partial; - __syncthreads(); + CTA_SYNC(); // Reduce parallelism to one warp if (linear_tid < RAKING_THREADS) { // Raking reduction in grid T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); - partial = ThreadReduce(raking_segment, cub::Sum(), partial); + partial = internal::ThreadReduce(raking_segment, cub::Sum(), partial); // Warpscan partial = WarpReduce(temp_storage.warp_storage).Sum(partial); @@ -159,7 +156,7 @@ struct BlockReduceRakingCommutativeOnly } - /// Computes a threadblock-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. + /// Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. 
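
// Editor's note: the hunk above flattens the commutative-only storage into a
// union, since the raking path and the non-commutative fallback are never
// live at the same time and may therefore alias. Simplified standalone
// sketch with hypothetical placeholder types and sizes:

struct WarpStorageSketch     { int partials[32]; };       // warp reduction
struct RakingGridSketch      { int grid[128 + 4]; };      // padded raking grid
struct FallbackStorageSketch { int partials[128 + 32]; }; // fallback reduce

union TempStorageSketch
{
    struct
    {
        WarpStorageSketch warp_storage;
        RakingGridSketch  raking_grid;
    } raking;                                // commutative-only path
    FallbackStorageSketch fallback_storage;  // non-commutative fallback
};
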
template < bool FULL_TILE, typename ReductionOp> @@ -178,14 +175,14 @@ struct BlockReduceRakingCommutativeOnly if (linear_tid >= RAKING_THREADS) *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid - RAKING_THREADS) = partial; - __syncthreads(); + CTA_SYNC(); // Reduce parallelism to one warp if (linear_tid < RAKING_THREADS) { // Raking reduction in grid T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); - partial = ThreadReduce(raking_segment, reduction_op, partial); + partial = internal::ThreadReduce(raking_segment, reduction_op, partial); // Warpscan partial = WarpReduce(temp_storage.warp_storage).Reduce(partial, reduction_op); diff --git a/SRC/cub/block/specializations/block_reduce_warp_reductions.cuh b/SRC/cub/block/specializations/block_reduce_warp_reductions.cuh index 648650f1..10ba303b 100644 --- a/SRC/cub/block/specializations/block_reduce_warp_reductions.cuh +++ b/SRC/cub/block/specializations/block_reduce_warp_reductions.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -28,7 +28,7 @@ /** * \file - * cub::BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA threadblock. Supports non-commutative reduction operators. + * cub::BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. */ #pragma once @@ -46,7 +46,7 @@ namespace cub { /** - * \brief BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA threadblock. Supports non-commutative reduction operators. + * \brief BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. 
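
// Editor's note: a host-side sketch (hypothetical, simplified to Sum) of the
// scheme this specialization implements: every warp reduces its own
// LOGICAL_WARP_SIZE slice, guarded by num_valid, and the per-warp aggregates
// published by each lane 0 are then folded in warp order:

enum { WARP_SZ = 32, BLOCK_SZ = 128, N_WARPS = BLOCK_SZ / WARP_SZ };

int BlockSumSketch(const int (&partials)[BLOCK_SZ], int num_valid)
{
    int warp_aggregates[N_WARPS] = {0};

    for (int w = 0; w < N_WARPS; ++w)
    {
        // Guarded per-warp reduction over lanes that hold valid partials
        int lo = w * WARP_SZ;
        int hi = (lo + WARP_SZ < num_valid) ? lo + WARP_SZ : num_valid;
        for (int i = lo; i < hi; ++i)
            warp_aggregates[w] += partials[i];
    }

    // Fold the warp aggregates (done by lane 0 of warp 0 on the GPU)
    int block_aggregate = 0;
    for (int w = 0; w < N_WARPS; ++w)
        block_aggregate += warp_aggregates[w];
    return block_aggregate;
}
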
*/ template < typename T, ///< Data type being reduced @@ -71,7 +71,7 @@ struct BlockReduceWarpReductions /// The logical warp size for warp reductions LOGICAL_WARP_SIZE = CUB_MIN(BLOCK_THREADS, WARP_THREADS), - /// Whether or not the logical warp size evenly divides the threadblock size + /// Whether or not the logical warp size evenly divides the thread block size EVEN_WARP_MULTIPLE = (BLOCK_THREADS % LOGICAL_WARP_SIZE == 0) }; @@ -83,9 +83,9 @@ struct BlockReduceWarpReductions /// Shared memory storage layout type struct _TempStorage { - typename WarpReduce::TempStorage warp_reduce[WARPS]; ///< Buffer for warp-synchronous scan + typename WarpReduce::TempStorage warp_reduce[WARPS]; ///< Buffer for warp-synchronous scan T warp_aggregates[WARPS]; ///< Shared totals from each warp-synchronous scan - T block_prefix; ///< Shared prefix for the entire threadblock + T block_prefix; ///< Shared prefix for the entire thread block }; /// Alias wrapper allowing storage to be unioned @@ -115,7 +115,7 @@ struct BlockReduceWarpReductions ReductionOp reduction_op, ///< [in] Binary scan operator T warp_aggregate, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) - Int2Type successor_warp) + Int2Type /*successor_warp*/) { if (FULL_TILE || (SUCCESSOR_WARP * LOGICAL_WARP_SIZE < num_valid)) { @@ -127,10 +127,10 @@ struct BlockReduceWarpReductions template __device__ __forceinline__ T ApplyWarpAggregates( - ReductionOp reduction_op, ///< [in] Binary scan operator + ReductionOp /*reduction_op*/, ///< [in] Binary scan operator T warp_aggregate, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items - int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) - Int2Type successor_warp) + int /*num_valid*/, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + Int2Type /*successor_warp*/) { return warp_aggregate; } @@ -151,7 +151,7 @@ struct BlockReduceWarpReductions temp_storage.warp_aggregates[warp_id] = warp_aggregate; } - __syncthreads(); + CTA_SYNC(); // Update total aggregate in warp 0, lane 0 if (linear_tid == 0) @@ -163,31 +163,30 @@ struct BlockReduceWarpReductions } - /// Computes a threadblock-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. + /// Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. template __device__ __forceinline__ T Sum( T input, ///< [in] Calling thread's input partial reductions int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) { - cub::Sum reduction_op; - unsigned int warp_offset = warp_id * LOGICAL_WARP_SIZE; - unsigned int warp_num_valid = (FULL_TILE && EVEN_WARP_MULTIPLE) ? + cub::Sum reduction_op; + int warp_offset = (warp_id * LOGICAL_WARP_SIZE); + int warp_num_valid = ((FULL_TILE && EVEN_WARP_MULTIPLE) || (warp_offset + LOGICAL_WARP_SIZE <= num_valid)) ? LOGICAL_WARP_SIZE : - (warp_offset < num_valid) ? 
- num_valid - warp_offset : - 0; + num_valid - warp_offset; // Warp reduction in every warp - T warp_aggregate = WarpReduce(temp_storage.warp_reduce[warp_id]).template Sum<(FULL_TILE && EVEN_WARP_MULTIPLE), 1>( + T warp_aggregate = WarpReduce(temp_storage.warp_reduce[warp_id]).template Reduce<(FULL_TILE && EVEN_WARP_MULTIPLE)>( input, - warp_num_valid); + warp_num_valid, + cub::Sum()); // Update outputs and block_aggregate with warp-wide aggregates from lane-0s return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid); } - /// Computes a threadblock-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. + /// Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. template < bool FULL_TILE, typename ReductionOp> @@ -196,16 +195,13 @@ struct BlockReduceWarpReductions int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) ReductionOp reduction_op) ///< [in] Binary reduction operator { - unsigned int warp_id = (WARPS == 1) ? 0 : (linear_tid / LOGICAL_WARP_SIZE); - unsigned int warp_offset = warp_id * LOGICAL_WARP_SIZE; - unsigned int warp_num_valid = (FULL_TILE && EVEN_WARP_MULTIPLE) ? + int warp_offset = warp_id * LOGICAL_WARP_SIZE; + int warp_num_valid = ((FULL_TILE && EVEN_WARP_MULTIPLE) || (warp_offset + LOGICAL_WARP_SIZE <= num_valid)) ? LOGICAL_WARP_SIZE : - (warp_offset < num_valid) ? - num_valid - warp_offset : - 0; + num_valid - warp_offset; // Warp reduction in every warp - T warp_aggregate = WarpReduce(temp_storage.warp_reduce[warp_id]).template Reduce<(FULL_TILE && EVEN_WARP_MULTIPLE), 1>( + T warp_aggregate = WarpReduce(temp_storage.warp_reduce[warp_id]).template Reduce<(FULL_TILE && EVEN_WARP_MULTIPLE)>( input, warp_num_valid, reduction_op); diff --git a/SRC/cub/block/specializations/block_scan_raking.cuh b/SRC/cub/block/specializations/block_scan_raking.cuh index 8ae388da..a855cda0 100644 --- a/SRC/cub/block/specializations/block_scan_raking.cuh +++ b/SRC/cub/block/specializations/block_scan_raking.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -29,7 +29,7 @@ /** * \file - * cub::BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA threadblock. + * cub::BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA thread block. */ #pragma once @@ -50,7 +50,7 @@ namespace cub { /** - * \brief BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA threadblock. + * \brief BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA thread block. 
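
// Editor's note: a host-side sketch (hypothetical, simplified) of the raking
// pattern this file implements: thread partials are placed in a shared grid,
// RAKING_THREADS each serially upsweep one segment, the segment totals are
// exclusive-scanned (warp-synchronously on the GPU), and a downsweep rewrites
// each segment seeded by its segment prefix:

enum { RAKING_THREADS = 4, SEG_LEN = 4, GRID_N = RAKING_THREADS * SEG_LEN };

void RakingExclusiveScanSketch(int (&grid)[GRID_N])
{
    // Upsweep: each raking thread reduces its segment
    int totals[RAKING_THREADS];
    for (int r = 0; r < RAKING_THREADS; ++r)
    {
        totals[r] = 0;
        for (int i = 0; i < SEG_LEN; ++i)
            totals[r] += grid[r * SEG_LEN + i];
    }

    // Warp-synchronous step: exclusive scan of the segment totals
    int running = 0;
    for (int r = 0; r < RAKING_THREADS; ++r)
    {
        int t = totals[r];
        totals[r] = running;   // exclusive partial for segment r
        running += t;
    }

    // Downsweep: re-scan each segment, seeded by its exclusive prefix
    for (int r = 0; r < RAKING_THREADS; ++r)
    {
        int prefix = totals[r];
        for (int i = 0; i < SEG_LEN; ++i)
        {
            int v = grid[r * SEG_LEN + i];
            grid[r * SEG_LEN + i] = prefix;   // exclusive scan output
            prefix += v;
        }
    }
}
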
*/ template < typename T, ///< Data type being scanned @@ -61,6 +61,10 @@ template < int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective struct BlockScanRaking { + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + /// Constants enum { @@ -68,7 +72,7 @@ struct BlockScanRaking BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, }; - /// Layout type for padded threadblock raking grid + /// Layout type for padded thread block raking grid typedef BlockRakingLayout BlockRakingLayout; /// Constants @@ -91,7 +95,7 @@ struct BlockScanRaking struct _TempStorage { typename WarpScan::TempStorage warp_scan; ///< Buffer for warp-synchronous scan - typename BlockRakingLayout::TempStorage raking_grid; ///< Padded threadblock raking grid + typename BlockRakingLayout::TempStorage raking_grid; ///< Padded thread block raking grid T block_aggregate; ///< Block aggregate }; @@ -100,20 +104,19 @@ struct BlockScanRaking struct TempStorage : Uninitialized<_TempStorage> {}; + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + // Thread fields _TempStorage &temp_storage; - int linear_tid; + unsigned int linear_tid; T cached_segment[SEGMENT_LENGTH]; - /// Constructor - __device__ __forceinline__ BlockScanRaking( - TempStorage &temp_storage) - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - + //--------------------------------------------------------------------- + // Utility methods + //--------------------------------------------------------------------- /// Templated reduction template @@ -121,7 +124,7 @@ struct BlockScanRaking T* raking_ptr, ///< [in] Input array ScanOp scan_op, ///< [in] Binary reduction operator T raking_partial, ///< [in] Prefix to seed reduction with - Int2Type iteration) + Int2Type /*iteration*/) { if ((BlockRakingLayout::UNGUARDED) || (((linear_tid * SEGMENT_LENGTH) + ITERATION) < BLOCK_THREADS)) { @@ -136,10 +139,10 @@ struct BlockScanRaking /// Templated reduction (base case) template __device__ __forceinline__ T GuardedReduce( - T* raking_ptr, ///< [in] Input array - ScanOp scan_op, ///< [in] Binary reduction operator + T* /*raking_ptr*/, ///< [in] Input array + ScanOp /*scan_op*/, ///< [in] Binary reduction operator T raking_partial, ///< [in] Prefix to seed reduction with - Int2Type iteration) + Int2Type /*iteration*/) { return raking_partial; } @@ -150,7 +153,7 @@ struct BlockScanRaking __device__ __forceinline__ void CopySegment( T* out, ///< [out] Out array T* in, ///< [in] Input array - Int2Type iteration) + Int2Type /*iteration*/) { out[ITERATION] = in[ITERATION]; CopySegment(out, in, Int2Type()); @@ -159,9 +162,9 @@ struct BlockScanRaking /// Templated copy (base case) __device__ __forceinline__ void CopySegment( - T* out, ///< [out] Out array - T* in, ///< [in] Input array - Int2Type iteration) + T* /*out*/, ///< [out] Out array + T* /*in*/, ///< [in] Input array + Int2Type /*iteration*/) {} @@ -196,7 +199,7 @@ struct BlockScanRaking CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>()); } - ThreadScanExclusive(cached_segment, cached_segment, scan_op, raking_partial, apply_prefix); + internal::ThreadScanExclusive(cached_segment, cached_segment, scan_op, raking_partial, apply_prefix); // Write data back to smem 
CopySegment(smem_raking_ptr, cached_segment, Int2Type<0>()); @@ -218,32 +221,41 @@ struct BlockScanRaking CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>()); } - ThreadScanInclusive(cached_segment, cached_segment, scan_op, raking_partial, apply_prefix); + internal::ThreadScanInclusive(cached_segment, cached_segment, scan_op, raking_partial, apply_prefix); // Write data back to smem CopySegment(smem_raking_ptr, cached_segment, Int2Type<0>()); } - /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + //--------------------------------------------------------------------- + // Constructors + //--------------------------------------------------------------------- + + /// Constructor + __device__ __forceinline__ BlockScanRaking( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //--------------------------------------------------------------------- + // Exclusive scans + //--------------------------------------------------------------------- + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined. template __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input items - T &output, ///< [out] Calling thread's output items (may be aliased to \p input) - const T &identity, ///< [in] Identity value - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator { - if (WARP_SYNCHRONOUS) { - // Short-circuit directly to warp scan - WarpScan(temp_storage.warp_scan).ExclusiveScan( - input, - output, - identity, - scan_op, - block_aggregate); + // Short-circuit directly to warp-synchronous scan + WarpScan(temp_storage.warp_scan).ExclusiveScan(input, exclusive_output, scan_op); } else { @@ -251,59 +263,41 @@ struct BlockScanRaking T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); *placement_ptr = input; - __syncthreads(); + CTA_SYNC(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) { - // Raking upsweep reduction in grid - T raking_partial = Upsweep(scan_op); + // Raking upsweep reduction across shared partials + T upsweep_partial = Upsweep(scan_op); - // Exclusive warp synchronous scan - WarpScan(temp_storage.warp_scan).ExclusiveScan( - raking_partial, - raking_partial, - identity, - scan_op, - temp_storage.block_aggregate); + // Warp-synchronous scan + T exclusive_partial; + WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, scan_op); // Exclusive raking downsweep scan - ExclusiveDownsweep(scan_op, raking_partial); + ExclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); } - __syncthreads(); + CTA_SYNC(); // Grab thread prefix from shared memory - output = *placement_ptr; - - // Retrieve block aggregate - block_aggregate = temp_storage.block_aggregate; + exclusive_output = *placement_ptr; } } - - /// Computes an exclusive threadblock-wide prefix scan using the 
specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - template < - typename ScanOp, - typename BlockPrefixCallbackOp> + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. + template __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - T identity, ///< [in] Identity value - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value) - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs. + T input, ///< [in] Calling thread's input items + T &output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op) ///< [in] Binary scan operator { if (WARP_SYNCHRONOUS) { - // Short-circuit directly to warp scan - WarpScan(temp_storage.warp_scan).ExclusiveScan( - input, - output, - identity, - scan_op, - block_aggregate, - block_prefix_callback_op); + // Short-circuit directly to warp-synchronous scan + WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, initial_value, scan_op); } else { @@ -311,39 +305,31 @@ struct BlockScanRaking T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); *placement_ptr = input; - __syncthreads(); + CTA_SYNC(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) { - // Raking upsweep reduction in grid - T raking_partial = Upsweep(scan_op); - - // Exclusive warp synchronous scan - WarpScan(temp_storage.warp_scan).ExclusiveScan( - raking_partial, - raking_partial, - identity, - scan_op, - temp_storage.block_aggregate, - block_prefix_callback_op); + // Raking upsweep reduction across shared partials + T upsweep_partial = Upsweep(scan_op); + + // Exclusive Warp-synchronous scan + T exclusive_partial; + WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, initial_value, scan_op); // Exclusive raking downsweep scan - ExclusiveDownsweep(scan_op, raking_partial); + ExclusiveDownsweep(scan_op, exclusive_partial); } - __syncthreads(); + CTA_SYNC(); - // Grab thread prefix from shared memory + // Grab exclusive partial from shared memory output = *placement_ptr; - - // Retrieve block aggregate - block_aggregate = temp_storage.block_aggregate; } } - /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no identity value, the output computed for thread0 is undefined. + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
With no initial value, the output computed for thread0 is undefined. template __device__ __forceinline__ void ExclusiveScan( T input, ///< [in] Calling thread's input item @@ -353,12 +339,8 @@ struct BlockScanRaking { if (WARP_SYNCHRONOUS) { - // Short-circuit directly to warp scan - WarpScan(temp_storage.warp_scan).ExclusiveScan( - input, - output, - scan_op, - block_aggregate); + // Short-circuit directly to warp-synchronous scan + WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, scan_op, block_aggregate); } else { @@ -366,26 +348,28 @@ struct BlockScanRaking T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); *placement_ptr = input; - __syncthreads(); + CTA_SYNC(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) { - // Raking upsweep reduction in grid - T raking_partial = Upsweep(scan_op); + // Raking upsweep reduction across shared partials + T upsweep_partial= Upsweep(scan_op); - // Exclusive warp synchronous scan - WarpScan(temp_storage.warp_scan).ExclusiveScan( - raking_partial, - raking_partial, - scan_op, - temp_storage.block_aggregate); + // Warp-synchronous scan + T inclusive_partial; + T exclusive_partial; + WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, scan_op); // Exclusive raking downsweep scan - ExclusiveDownsweep(scan_op, raking_partial, (linear_tid != 0)); + ExclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); + + // Broadcast aggregate to all threads + if (linear_tid == RAKING_THREADS - 1) + temp_storage.block_aggregate = inclusive_partial; } - __syncthreads(); + CTA_SYNC(); // Grab thread prefix from shared memory output = *placement_ptr; @@ -396,26 +380,19 @@ struct BlockScanRaking } - /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - template < - typename ScanOp, - typename BlockPrefixCallbackOp> + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value) - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs. 
+ T input, ///< [in] Calling thread's input items + T &output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items { if (WARP_SYNCHRONOUS) { - // Short-circuit directly to warp scan - WarpScan(temp_storage.warp_scan).ExclusiveScan( - input, - output, - scan_op, - block_aggregate, - block_prefix_callback_op); + // Short-circuit directly to warp-synchronous scan + WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate); } else { @@ -423,29 +400,29 @@ struct BlockScanRaking T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); *placement_ptr = input; - __syncthreads(); + CTA_SYNC(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) { - // Raking upsweep reduction in grid - T raking_partial = Upsweep(scan_op); + // Raking upsweep reduction across shared partials + T upsweep_partial = Upsweep(scan_op); - // Exclusive warp synchronous scan - WarpScan(temp_storage.warp_scan).ExclusiveScan( - raking_partial, - raking_partial, - scan_op, - temp_storage.block_aggregate, - block_prefix_callback_op); + // Warp-synchronous scan + T exclusive_partial; + WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, initial_value, scan_op, block_aggregate); // Exclusive raking downsweep scan - ExclusiveDownsweep(scan_op, raking_partial); + ExclusiveDownsweep(scan_op, exclusive_partial); + + // Broadcast aggregate to other threads + if (linear_tid == 0) + temp_storage.block_aggregate = block_aggregate; } - __syncthreads(); + CTA_SYNC(); - // Grab thread prefix from shared memory + // Grab exclusive partial from shared memory output = *placement_ptr; // Retrieve block aggregate @@ -454,115 +431,119 @@ struct BlockScanRaking } - /// Computes an exclusive threadblock-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. - __device__ __forceinline__ void ExclusiveSum( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. 
{ if (WARP_SYNCHRONOUS) { - // Short-circuit directly to warp scan - WarpScan(temp_storage.warp_scan).ExclusiveSum( - input, - output, - block_aggregate); + // Short-circuit directly to warp-synchronous scan + T block_aggregate; + WarpScan warp_scan(temp_storage.warp_scan); + warp_scan.ExclusiveScan(input, output, scan_op, block_aggregate); + + // Obtain warp-wide prefix in lane0, then broadcast to other lanes + T block_prefix = block_prefix_callback_op(block_aggregate); + block_prefix = warp_scan.Broadcast(block_prefix, 0); + + output = scan_op(block_prefix, output); + if (linear_tid == 0) + output = block_prefix; } else { - // Raking scan - Sum scan_op; - // Place thread partial into shared memory raking grid T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); *placement_ptr = input; - __syncthreads(); + CTA_SYNC(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) { - // Raking upsweep reduction in grid - T raking_partial = Upsweep(scan_op); + WarpScan warp_scan(temp_storage.warp_scan); + + // Raking upsweep reduction across shared partials + T upsweep_partial = Upsweep(scan_op); + + // Warp-synchronous scan + T exclusive_partial, block_aggregate; + warp_scan.ExclusiveScan(upsweep_partial, exclusive_partial, scan_op, block_aggregate); - // Exclusive warp synchronous scan - WarpScan(temp_storage.warp_scan).ExclusiveSum( - raking_partial, - raking_partial, - temp_storage.block_aggregate); + // Obtain block-wide prefix in lane0, then broadcast to other lanes + T block_prefix = block_prefix_callback_op(block_aggregate); + block_prefix = warp_scan.Broadcast(block_prefix, 0); + + // Update prefix with warpscan exclusive partial + T downsweep_prefix = scan_op(block_prefix, exclusive_partial); + if (linear_tid == 0) + downsweep_prefix = block_prefix; // Exclusive raking downsweep scan - ExclusiveDownsweep(scan_op, raking_partial); + ExclusiveDownsweep(scan_op, downsweep_prefix); } - __syncthreads(); + CTA_SYNC(); // Grab thread prefix from shared memory output = *placement_ptr; - - // Retrieve block aggregate - block_aggregate = temp_storage.block_aggregate; } } - /// Computes an exclusive threadblock-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Instead of using 0 as the threadblock-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - template - __device__ __forceinline__ void ExclusiveSum( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value) - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs. + //--------------------------------------------------------------------- + // Inclusive scans + //--------------------------------------------------------------------- + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. 
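
// Editor's note: the callback overloads above expect a BlockPrefixCallbackOp
// functor that warp 0 invokes once per tile; the value returned by lane 0 is
// broadcast and seeds the whole block's scan. A sketch of the canonical
// running-total functor (hypothetical name), as used when scanning many
// consecutive tiles:

struct RunningPrefixSketch
{
    int running_total;

    __device__ RunningPrefixSketch(int initial) : running_total(initial) {}

    // Called by the first warp with the tile's block_aggregate; returns the
    // prefix for this tile and advances the total for the next tile.
    __device__ int operator()(int block_aggregate)
    {
        int old_prefix = running_total;
        running_total += block_aggregate;
        return old_prefix;
    }
};
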
+ template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator { if (WARP_SYNCHRONOUS) { - // Short-circuit directly to warp scan - WarpScan(temp_storage.warp_scan).ExclusiveSum( - input, - output, - block_aggregate, - block_prefix_callback_op); + // Short-circuit directly to warp-synchronous scan + WarpScan(temp_storage.warp_scan).InclusiveScan(input, output, scan_op); } else { - // Raking scan - Sum scan_op; - // Place thread partial into shared memory raking grid T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); *placement_ptr = input; - __syncthreads(); + CTA_SYNC(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) { - // Raking upsweep reduction in grid - T raking_partial = Upsweep(scan_op); + // Raking upsweep reduction across shared partials + T upsweep_partial = Upsweep(scan_op); - // Exclusive warp synchronous scan - WarpScan(temp_storage.warp_scan).ExclusiveSum( - raking_partial, - raking_partial, - temp_storage.block_aggregate, - block_prefix_callback_op); + // Exclusive Warp-synchronous scan + T exclusive_partial; + WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, scan_op); - // Exclusive raking downsweep scan - ExclusiveDownsweep(scan_op, raking_partial); + // Inclusive raking downsweep scan + InclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); } - __syncthreads(); + CTA_SYNC(); // Grab thread prefix from shared memory output = *placement_ptr; - - // Retrieve block aggregate - block_aggregate = temp_storage.block_aggregate; } } - /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
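
// Editor's note: a usage-level sketch (hypothetical kernel and buffers) of
// the public cub::BlockScan front end that these specializations back; the
// aggregate output corresponds to the block_aggregate plumbing shown above:

#include <cub/cub.cuh>

__global__ void InclusiveSumKernel(const int *d_in, int *d_out)
{
    typedef cub::BlockScan<int, 128> BlockScan;
    __shared__ typename BlockScan::TempStorage temp_storage;

    int idx = blockIdx.x * 128 + threadIdx.x;
    int thread_data = d_in[idx];

    // Inclusive block-wide prefix sum; every thread also gets the aggregate
    int block_aggregate;
    BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate);

    d_out[idx] = thread_data;
}
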
template __device__ __forceinline__ void InclusiveScan( T input, ///< [in] Calling thread's input item @@ -572,12 +553,8 @@ struct BlockScanRaking { if (WARP_SYNCHRONOUS) { - // Short-circuit directly to warp scan - WarpScan(temp_storage.warp_scan).InclusiveScan( - input, - output, - scan_op, - block_aggregate); + // Short-circuit directly to warp-synchronous scan + WarpScan(temp_storage.warp_scan).InclusiveScan(input, output, scan_op, block_aggregate); } else { @@ -585,26 +562,28 @@ struct BlockScanRaking T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); *placement_ptr = input; - __syncthreads(); + CTA_SYNC(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) { - // Raking upsweep reduction in grid - T raking_partial = Upsweep(scan_op); + // Raking upsweep reduction across shared partials + T upsweep_partial = Upsweep(scan_op); - // Exclusive warp synchronous scan - WarpScan(temp_storage.warp_scan).ExclusiveScan( - raking_partial, - raking_partial, - scan_op, - temp_storage.block_aggregate); + // Warp-synchronous scan + T inclusive_partial; + T exclusive_partial; + WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, scan_op); // Inclusive raking downsweep scan - InclusiveDownsweep(scan_op, raking_partial, (linear_tid != 0)); + InclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); + + // Broadcast aggregate to all threads + if (linear_tid == RAKING_THREADS - 1) + temp_storage.block_aggregate = inclusive_partial; } - __syncthreads(); + CTA_SYNC(); // Grab thread prefix from shared memory output = *placement_ptr; @@ -615,7 +594,7 @@ struct BlockScanRaking } - /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. template < typename ScanOp, typename BlockPrefixCallbackOp> @@ -623,160 +602,59 @@ struct BlockScanRaking T input, ///< [in] Calling thread's input item T &output, ///< [out] Calling thread's output item (may be aliased to \p input) ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value) - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs. + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. 
{ if (WARP_SYNCHRONOUS) { - // Short-circuit directly to warp scan - WarpScan(temp_storage.warp_scan).InclusiveScan( - input, - output, - scan_op, - block_aggregate, - block_prefix_callback_op); - } - else - { - // Place thread partial into shared memory raking grid - T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); - *placement_ptr = input; + // Short-circuit directly to warp-synchronous scan + T block_aggregate; + WarpScan warp_scan(temp_storage.warp_scan); + warp_scan.InclusiveScan(input, output, scan_op, block_aggregate); - __syncthreads(); + // Obtain warp-wide prefix in lane0, then broadcast to other lanes + T block_prefix = block_prefix_callback_op(block_aggregate); + block_prefix = warp_scan.Broadcast(block_prefix, 0); - // Reduce parallelism down to just raking threads - if (linear_tid < RAKING_THREADS) - { - // Raking upsweep reduction in grid - T raking_partial = Upsweep(scan_op); - - // Warp synchronous scan - WarpScan(temp_storage.warp_scan).ExclusiveScan( - raking_partial, - raking_partial, - scan_op, - temp_storage.block_aggregate, - block_prefix_callback_op); - - // Inclusive raking downsweep scan - InclusiveDownsweep(scan_op, raking_partial); - } - - __syncthreads(); - - // Grab thread prefix from shared memory - output = *placement_ptr; - - // Retrieve block aggregate - block_aggregate = temp_storage.block_aggregate; - } - } - - - /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. - __device__ __forceinline__ void InclusiveSum( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items - { - if (WARP_SYNCHRONOUS) - { - // Short-circuit directly to warp scan - WarpScan(temp_storage.warp_scan).InclusiveSum( - input, - output, - block_aggregate); + // Update prefix with exclusive warpscan partial + output = scan_op(block_prefix, output); } else { - // Raking scan - Sum scan_op; - // Place thread partial into shared memory raking grid T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); *placement_ptr = input; - __syncthreads(); + CTA_SYNC(); // Reduce parallelism down to just raking threads if (linear_tid < RAKING_THREADS) { - // Raking upsweep reduction in grid - T raking_partial = Upsweep(scan_op); - - // Exclusive warp synchronous scan - WarpScan(temp_storage.warp_scan).ExclusiveSum( - raking_partial, - raking_partial, - temp_storage.block_aggregate); - - // Inclusive raking downsweep scan - InclusiveDownsweep(scan_op, raking_partial, (linear_tid != 0)); - } - - __syncthreads(); - - // Grab thread prefix from shared memory - output = *placement_ptr; - - // Retrieve block aggregate - block_aggregate = temp_storage.block_aggregate; - } - } + WarpScan warp_scan(temp_storage.warp_scan); + // Raking upsweep reduction across shared partials + T upsweep_partial = Upsweep(scan_op); - /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. 
Instead of using 0 as the threadblock-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - template - __device__ __forceinline__ void InclusiveSum( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value) - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs. - { - if (WARP_SYNCHRONOUS) - { - // Short-circuit directly to warp scan - WarpScan(temp_storage.warp_scan).InclusiveSum( - input, - output, - block_aggregate, - block_prefix_callback_op); - } - else - { - // Raking scan - Sum scan_op; + // Warp-synchronous scan + T exclusive_partial, block_aggregate; + warp_scan.ExclusiveScan(upsweep_partial, exclusive_partial, scan_op, block_aggregate); - // Place thread partial into shared memory raking grid - T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); - *placement_ptr = input; - - __syncthreads(); + // Obtain block-wide prefix in lane0, then broadcast to other lanes + T block_prefix = block_prefix_callback_op(block_aggregate); + block_prefix = warp_scan.Broadcast(block_prefix, 0); - // Reduce parallelism down to just raking threads - if (linear_tid < RAKING_THREADS) - { - // Raking upsweep reduction in grid - T raking_partial = Upsweep(scan_op); - - // Warp synchronous scan - WarpScan(temp_storage.warp_scan).ExclusiveSum( - raking_partial, - raking_partial, - temp_storage.block_aggregate, - block_prefix_callback_op); + // Update prefix with warpscan exclusive partial + T downsweep_prefix = scan_op(block_prefix, exclusive_partial); + if (linear_tid == 0) + downsweep_prefix = block_prefix; // Inclusive raking downsweep scan - InclusiveDownsweep(scan_op, raking_partial); + InclusiveDownsweep(scan_op, downsweep_prefix); } - __syncthreads(); + CTA_SYNC(); // Grab thread prefix from shared memory output = *placement_ptr; - - // Retrieve block aggregate - block_aggregate = temp_storage.block_aggregate; } } diff --git a/SRC/cub/block/specializations/block_scan_warp_scans.cuh b/SRC/cub/block/specializations/block_scan_warp_scans.cuh index f2d06beb..85e4d613 100644 --- a/SRC/cub/block/specializations/block_scan_warp_scans.cuh +++ b/SRC/cub/block/specializations/block_scan_warp_scans.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -28,7 +28,7 @@ /** * \file - * cub::BlockScanWarpscans provides warpscan-based variants of parallel prefix scan across a CUDA threadblock. + * cub::BlockScanWarpscans provides warpscan-based variants of parallel prefix scan across a CUDA thread block. 
*/ #pragma once @@ -45,7 +45,7 @@ CUB_NS_PREFIX namespace cub { /** - * \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA threadblock. + * \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread block. */ template < typename T, @@ -55,6 +55,10 @@ template < int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective struct BlockScanWarpScans { + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + /// Constants enum { @@ -69,14 +73,18 @@ struct BlockScanWarpScans }; /// WarpScan utility type - typedef WarpScan WarpScan; + typedef WarpScan WarpScanT; + + /// WarpScan utility type + typedef WarpScan WarpAggregateScan; /// Shared memory storage layout type - struct _TempStorage + + struct __align__(32) _TempStorage { - typename WarpScan::TempStorage warp_scan[WARPS]; ///< Buffer for warp-synchronous scans - T warp_aggregates[WARPS]; ///< Shared totals from each warp-synchronous scan - T block_prefix; ///< Shared prefix for the entire threadblock + T warp_aggregates[WARPS]; + typename WarpScanT::TempStorage warp_scan[WARPS]; ///< Buffer for warp-synchronous scans + T block_prefix; ///< Shared prefix for the entire thread block }; @@ -84,12 +92,20 @@ struct BlockScanWarpScans struct TempStorage : Uninitialized<_TempStorage> {}; + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + // Thread fields - _TempStorage &temp_storage; - int linear_tid; - int warp_id; - int lane_id; + _TempStorage &temp_storage; + unsigned int linear_tid; + unsigned int warp_id; + unsigned int lane_id; + + //--------------------------------------------------------------------- + // Constructors + //--------------------------------------------------------------------- /// Constructor __device__ __forceinline__ BlockScanWarpScans( @@ -101,162 +117,182 @@ struct BlockScanWarpScans lane_id(LaneId()) {} + + //--------------------------------------------------------------------- + // Utility methods + //--------------------------------------------------------------------- + template __device__ __forceinline__ void ApplyWarpAggregates( - T &partial, ///< [out] The calling thread's partial reduction + T &warp_prefix, ///< [out] The calling thread's partial reduction ScanOp scan_op, ///< [in] Binary scan operator T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items - bool lane_valid, ///< [in] Whether or not the partial belonging to the current thread is valid - Int2Type addend_warp) + Int2Type /*addend_warp*/) { - T inclusive = scan_op(block_aggregate, partial); - if (warp_id == WARP) - { - partial = (lane_valid) ? 
- inclusive : - block_aggregate; - } + if (warp_id == WARP) + warp_prefix = block_aggregate; - T addend = temp_storage.warp_aggregates[WARP]; - block_aggregate = scan_op(block_aggregate, addend); + T addend = temp_storage.warp_aggregates[WARP]; + block_aggregate = scan_op(block_aggregate, addend); - ApplyWarpAggregates(partial, scan_op, block_aggregate, lane_valid, Int2Type()); + ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type()); } template __device__ __forceinline__ void ApplyWarpAggregates( - T &partial, ///< [out] The calling thread's partial reduction - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items - bool lane_valid, ///< [in] Whether or not the partial belonging to the current thread is valid - Int2Type addend_warp) + T &/*warp_prefix*/, ///< [out] The calling thread's partial reduction + ScanOp /*scan_op*/, ///< [in] Binary scan operator + T &/*block_aggregate*/, ///< [out] Threadblock-wide aggregate reduction of input items + Int2Type /*addend_warp*/) {} - /// Update the calling thread's partial reduction with the warp-wide aggregates from preceding warps. Also returns block-wide aggregate in thread0. + /// Use the warp-wide aggregates to compute the calling warp's prefix. Also returns block-wide aggregate in all threads. template - __device__ __forceinline__ void ApplyWarpAggregates( - T &partial, ///< [out] The calling thread's partial reduction + __device__ __forceinline__ T ComputeWarpPrefix( ScanOp scan_op, ///< [in] Binary scan operator T warp_aggregate, ///< [in] [laneWARP_THREADS - 1 only] Warp-wide aggregate reduction of input items - T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items - bool lane_valid = true) ///< [in] Whether or not the partial belonging to the current thread is valid + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items { // Last lane in each warp shares its warp-aggregate if (lane_id == WARP_THREADS - 1) temp_storage.warp_aggregates[warp_id] = warp_aggregate; - __syncthreads(); + CTA_SYNC(); + // Accumulate block aggregates and save the one that is our warp's prefix + T warp_prefix; block_aggregate = temp_storage.warp_aggregates[0]; -#if __CUDA_ARCH__ <= 130 - - // Use template unrolling for SM1x (since the PTX backend can't handle it) - ApplyWarpAggregates(partial, scan_op, block_aggregate, lane_valid, Int2Type<1>()); - -#else - - // Use the pragma unrolling (since it uses less registers) + // Use template unrolling (since the PTX backend can't handle unrolling it for SM1x) + ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type<1>()); +/* #pragma unroll - for (int WARP = 1; WARP < WARPS; WARP++) + for (int WARP = 1; WARP < WARPS; ++WARP) { - T inclusive = scan_op(block_aggregate, partial); if (warp_id == WARP) - { - partial = (lane_valid) ? - inclusive : - block_aggregate; - } + warp_prefix = block_aggregate; T addend = temp_storage.warp_aggregates[WARP]; block_aggregate = scan_op(block_aggregate, addend); } +*/ -#endif + return warp_prefix; } - /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + /// Use the warp-wide aggregates and initial-value to compute the calling warp's prefix. Also returns block-wide aggregate in all threads. 
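The template-recursive ApplyWarpAggregates above performs, at compile time, the same serial fold that the commented-out #pragma unroll loop spells out: every thread folds the shared per-warp totals together in order, and each warp captures the running total just before its own aggregate is folded in, which is exactly its prefix. A minimal standalone restatement of that idea (hypothetical helper, not part of the patch; the initial-value overload follows below):

// Sketch: fold per-warp totals into each warp's prefix and the block total.
template <typename T, int WARPS, typename ScanOp>
__device__ T ComputeWarpPrefixSketch(
    const T (&warp_aggregates)[WARPS],  // shared memory: one total per warp
    ScanOp   scan_op,
    int      warp_id,
    T       &block_aggregate)
{
    T warp_prefix;                          // stays undefined for warp 0
    block_aggregate = warp_aggregates[0];
    for (int WARP = 1; WARP < WARPS; ++WARP)
    {
        if (warp_id == WARP)
            warp_prefix = block_aggregate;  // total of all preceding warps
        block_aggregate = scan_op(block_aggregate, warp_aggregates[WARP]);
    }
    return warp_prefix;                     // block_aggregate now spans all warps
}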
template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input items - T &output, ///< [out] Calling thread's output items (may be aliased to \p input) - const T &identity, ///< [in] Identity value + __device__ __forceinline__ T ComputeWarpPrefix( ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + T warp_aggregate, ///< [in] [laneWARP_THREADS - 1 only] Warp-wide aggregate reduction of input items + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items + const T &initial_value) ///< [in] Initial value to seed the exclusive scan { - T inclusive_output; - WarpScan(temp_storage.warp_scan[warp_id]).Scan(input, inclusive_output, output, identity, scan_op); + T warp_prefix = ComputeWarpPrefix(scan_op, warp_aggregate, block_aggregate); + + warp_prefix = scan_op(initial_value, warp_prefix); + + if (warp_id == 0) + warp_prefix = initial_value; - // Update outputs and block_aggregate with warp-wide aggregates - ApplyWarpAggregates(output, scan_op, inclusive_output, block_aggregate); + return warp_prefix; } + //--------------------------------------------------------------------- + // Exclusive scans + //--------------------------------------------------------------------- - /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - template < - typename ScanOp, - typename BlockPrefixCallbackOp> + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined. + template __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - T identity, ///< [in] Identity value - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value) - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs. + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator { - ExclusiveScan(input, output, identity, scan_op, block_aggregate); + // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid. + T block_aggregate; + ExclusiveScan(input, exclusive_output, scan_op, block_aggregate); + } - // Use the first warp to determine the threadblock prefix, returning the result in lane0 - if (warp_id == 0) + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. 
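Before the seeded overload below, a usage sketch of the public cub::BlockScan wrapper, which dispatches into this specialization when BLOCK_SCAN_WARP_SCANS is selected (the kernel and buffer names are ours, not from the patch):

#include <limits.h>
#include <cub/block/block_scan.cuh>

template <int BLOCK_THREADS>
__global__ void ExampleScanKernel(const int *d_in, int *d_out)
{
    // Specialize BlockScan for this block size and the warp-scans algorithm
    typedef cub::BlockScan<int, BLOCK_THREADS, cub::BLOCK_SCAN_WARP_SCANS> BlockScanT;
    __shared__ typename BlockScanT::TempStorage temp_storage;

    int item = d_in[blockIdx.x * BLOCK_THREADS + threadIdx.x];

    // Seeded exclusive max-scan: thread0's output is the seed INT_MIN
    BlockScanT(temp_storage).ExclusiveScan(item, item, INT_MIN, cub::Max());

    d_out[blockIdx.x * BLOCK_THREADS + threadIdx.x] = item;
}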
+ template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op) ///< [in] Binary scan operator + { + T block_aggregate; + ExclusiveScan(input, exclusive_output, initial_value, scan_op, block_aggregate); + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. + T inclusive_output; + WarpScanT(temp_storage.warp_scan[warp_id]).Scan(input, inclusive_output, exclusive_output, scan_op); + + // Compute the warp-wide prefix and block-wide aggregate for each warp. Warp prefix for warp0 is invalid. + T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate); + + // Apply warp prefix to our lane's partial + if (warp_id != 0) { - T block_prefix = block_prefix_callback_op(block_aggregate); + exclusive_output = scan_op(warp_prefix, exclusive_output); if (lane_id == 0) - { - // Share the prefix with all threads - temp_storage.block_prefix = block_prefix; - } + exclusive_output = warp_prefix; } - - __syncthreads(); - - // Incorporate threadblock prefix into outputs - T block_prefix = temp_storage.block_prefix; - output = scan_op(block_prefix, output); } - /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no identity value, the output computed for thread0 is undefined. + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. template __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + T input, ///< [in] Calling thread's input items + T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items { + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. 
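+        // The remainder of this overload proceeds in three steps: (1) the warp
+        // scan below yields each lane's inclusive and exclusive partials
+        // (lane0's exclusive partial is garbage); (2) ComputeWarpPrefix folds
+        // the shared per-warp totals, seeded with initial_value, into this
+        // warp's prefix; (3) every lane applies its warp prefix, with lane0
+        // substituting the prefix itself for its garbage partial.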
T inclusive_output; - WarpScan(temp_storage.warp_scan[warp_id]).Scan(input, inclusive_output, output, scan_op); + WarpScanT(temp_storage.warp_scan[warp_id]).Scan(input, inclusive_output, exclusive_output, scan_op); + + // Compute the warp-wide prefix and block-wide aggregate for each warp + T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate, initial_value); - // Update outputs and block_aggregate with warp-wide aggregates - ApplyWarpAggregates(output, scan_op, inclusive_output, block_aggregate, (lane_id > 0)); + // Apply warp prefix to our lane's partial + exclusive_output = scan_op(warp_prefix, exclusive_output); + if (lane_id == 0) + exclusive_output = warp_prefix; } - /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. template < typename ScanOp, typename BlockPrefixCallbackOp> __device__ __forceinline__ void ExclusiveScan( T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value) - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs. + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. { - ExclusiveScan(input, output, scan_op, block_aggregate); + // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid. + T block_aggregate; + ExclusiveScan(input, exclusive_output, scan_op, block_aggregate); - // Use the first warp to determine the threadblock prefix, returning the result in lane0 + // Use the first warp to determine the thread block prefix, returning the result in lane0 if (warp_id == 0) { T block_prefix = block_prefix_callback_op(block_aggregate); @@ -264,95 +300,72 @@ struct BlockScanWarpScans { // Share the prefix with all threads temp_storage.block_prefix = block_prefix; + exclusive_output = block_prefix; // The block prefix is the exclusive output for tid0 } } - __syncthreads(); + CTA_SYNC(); - // Incorporate threadblock prefix into outputs + // Incorporate thread block prefix into outputs T block_prefix = temp_storage.block_prefix; - output = (linear_tid == 0) ? 
- block_prefix : - scan_op(block_prefix, output); + if (linear_tid > 0) + { + exclusive_output = scan_op(block_prefix, exclusive_output); + } } - /// Computes an exclusive threadblock-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. - __device__ __forceinline__ void ExclusiveSum( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items - { - Sum scan_op; - T inclusive_output; + //--------------------------------------------------------------------- + // Inclusive scans + //--------------------------------------------------------------------- - WarpScan(temp_storage.warp_scan[warp_id]).Sum(input, inclusive_output, output); - - // Update outputs and block_aggregate with warp-wide aggregates from lane WARP_THREADS-1 - ApplyWarpAggregates(output, scan_op, inclusive_output, block_aggregate); - } - - - /// Computes an exclusive threadblock-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Instead of using 0 as the threadblock-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - template - __device__ __forceinline__ void ExclusiveSum( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value) - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs. + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator { - ExclusiveSum(input, output, block_aggregate); - - // Use the first warp to determine the threadblock prefix, returning the result in lane0 - if (warp_id == 0) - { - T block_prefix = block_prefix_callback_op(block_aggregate); - if (lane_id == 0) - { - // Share the prefix with all threads - temp_storage.block_prefix = block_prefix; - } - } - - __syncthreads(); - - // Incorporate threadblock prefix into outputs - Sum scan_op; - T block_prefix = temp_storage.block_prefix; - output = scan_op(block_prefix, output); + T block_aggregate; + InclusiveScan(input, inclusive_output, scan_op, block_aggregate); } - /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
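All of the callback overloads in this file share one contract: warp0 invokes \p block_prefix_callback_op with the tile aggregate, and the value returned by lane0 seeds the tile. A typical stateful functor for scanning a long sequence of tiles, following CUB's documented usage (the struct name is ours):

struct RunningPrefixOp
{
    int running_total;   // carries the scan total across consecutive tiles

    __device__ RunningPrefixOp(int seed) : running_total(seed) {}

    // Invoked by warp0 with the aggregate of the current tile; the value
    // returned by lane0 becomes the exclusive prefix of the whole tile.
    __device__ int operator()(int block_aggregate)
    {
        int old_prefix = running_total;
        running_total += block_aggregate;   // fold this tile into the total
        return old_prefix;
    }
};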
template __device__ __forceinline__ void InclusiveScan( T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) ScanOp scan_op, ///< [in] Binary scan operator T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items { - WarpScan(temp_storage.warp_scan[warp_id]).InclusiveScan(input, output, scan_op); + WarpScanT(temp_storage.warp_scan[warp_id]).InclusiveScan(input, inclusive_output, scan_op); - // Update outputs and block_aggregate with warp-wide aggregates from lane WARP_THREADS-1 - ApplyWarpAggregates(output, scan_op, output, block_aggregate); + // Compute the warp-wide prefix and block-wide aggregate for each warp. Warp prefix for warp0 is invalid. + T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate); + // Apply warp prefix to our lane's partial + if (warp_id != 0) + { + inclusive_output = scan_op(warp_prefix, inclusive_output); + } } - /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. template < typename ScanOp, typename BlockPrefixCallbackOp> __device__ __forceinline__ void InclusiveScan( T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value) - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs. + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. 
{ - InclusiveScan(input, output, scan_op, block_aggregate); + T block_aggregate; + InclusiveScan(input, exclusive_output, scan_op, block_aggregate); - // Use the first warp to determine the threadblock prefix, returning the result in lane0 + // Use the first warp to determine the thread block prefix, returning the result in lane0 if (warp_id == 0) { T block_prefix = block_prefix_callback_op(block_aggregate); @@ -363,56 +376,14 @@ struct BlockScanWarpScans } } - __syncthreads(); + CTA_SYNC(); - // Incorporate threadblock prefix into outputs + // Incorporate thread block prefix into outputs T block_prefix = temp_storage.block_prefix; - output = scan_op(block_prefix, output); + exclusive_output = scan_op(block_prefix, exclusive_output); } - /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. - __device__ __forceinline__ void InclusiveSum( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items - { - WarpScan(temp_storage.warp_scan[warp_id]).InclusiveSum(input, output); - - // Update outputs and block_aggregate with warp-wide aggregates from lane WARP_THREADS-1 - ApplyWarpAggregates(output, Sum(), output, block_aggregate); - } - - - /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Instead of using 0 as the threadblock-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - template - __device__ __forceinline__ void InclusiveSum( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value) - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs. - { - InclusiveSum(input, output, block_aggregate); - - // Use the first warp to determine the threadblock prefix, returning the result in lane0 - if (warp_id == 0) - { - T block_prefix = block_prefix_callback_op(block_aggregate); - if (lane_id == 0) - { - // Share the prefix with all threads - temp_storage.block_prefix = block_prefix; - } - } - - __syncthreads(); - - // Incorporate threadblock prefix into outputs - Sum scan_op; - T block_prefix = temp_storage.block_prefix; - output = scan_op(block_prefix, output); - } - }; diff --git a/SRC/cub/block/specializations/block_scan_warp_scans2.cuh b/SRC/cub/block/specializations/block_scan_warp_scans2.cuh new file mode 100644 index 00000000..4de7c69b --- /dev/null +++ b/SRC/cub/block/specializations/block_scan_warp_scans2.cuh @@ -0,0 +1,436 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockScanWarpscans provides warpscan-based variants of parallel prefix scan across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "../../util_arch.cuh"
+#include "../../util_ptx.cuh"
+#include "../../warp/warp_scan.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread block.
+ */ +template < + typename T, + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockScanWarpScans +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// Constants + enum + { + /// Number of warp threads + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), + + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + /// Number of active warps + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + }; + + /// WarpScan utility type + typedef WarpScan WarpScanT; + + /// WarpScan utility type + typedef WarpScan WarpAggregateScanT; + + /// Shared memory storage layout type + struct _TempStorage + { + typename WarpAggregateScanT::TempStorage inner_scan[WARPS]; ///< Buffer for warp-synchronous scans + typename WarpScanT::TempStorage warp_scan[WARPS]; ///< Buffer for warp-synchronous scans + T warp_aggregates[WARPS]; + T block_prefix; ///< Shared prefix for the entire thread block + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + // Thread fields + _TempStorage &temp_storage; + unsigned int linear_tid; + unsigned int warp_id; + unsigned int lane_id; + + + //--------------------------------------------------------------------- + // Constructors + //--------------------------------------------------------------------- + + /// Constructor + __device__ __forceinline__ BlockScanWarpScans( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), + warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS), + lane_id(LaneId()) + {} + + + //--------------------------------------------------------------------- + // Utility methods + //--------------------------------------------------------------------- + + template + __device__ __forceinline__ void ApplyWarpAggregates( + T &warp_prefix, ///< [out] The calling thread's partial reduction + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items + Int2Type addend_warp) + { + if (warp_id == WARP) + warp_prefix = block_aggregate; + + T addend = temp_storage.warp_aggregates[WARP]; + block_aggregate = scan_op(block_aggregate, addend); + + ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type()); + } + + template + __device__ __forceinline__ void ApplyWarpAggregates( + T &warp_prefix, ///< [out] The calling thread's partial reduction + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items + Int2Type addend_warp) + {} + + + /// Use the warp-wide aggregates to compute the calling warp's prefix. Also returns block-wide aggregate in all threads. 
+ template + __device__ __forceinline__ T ComputeWarpPrefix( + ScanOp scan_op, ///< [in] Binary scan operator + T warp_aggregate, ///< [in] [laneWARP_THREADS - 1 only] Warp-wide aggregate reduction of input items + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + // Last lane in each warp shares its warp-aggregate + if (lane_id == WARP_THREADS - 1) + temp_storage.warp_aggregates[warp_id] = warp_aggregate; + + CTA_SYNC(); + + // Accumulate block aggregates and save the one that is our warp's prefix + T warp_prefix; + block_aggregate = temp_storage.warp_aggregates[0]; + + // Use template unrolling (since the PTX backend can't handle unrolling it for SM1x) + ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type<1>()); +/* + #pragma unroll + for (int WARP = 1; WARP < WARPS; ++WARP) + { + if (warp_id == WARP) + warp_prefix = block_aggregate; + + T addend = temp_storage.warp_aggregates[WARP]; + block_aggregate = scan_op(block_aggregate, addend); + } +*/ + + return warp_prefix; + } + + + /// Use the warp-wide aggregates and initial-value to compute the calling warp's prefix. Also returns block-wide aggregate in all threads. + template + __device__ __forceinline__ T ComputeWarpPrefix( + ScanOp scan_op, ///< [in] Binary scan operator + T warp_aggregate, ///< [in] [laneWARP_THREADS - 1 only] Warp-wide aggregate reduction of input items + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items + const T &initial_value) ///< [in] Initial value to seed the exclusive scan + { + T warp_prefix = ComputeWarpPrefix(scan_op, warp_aggregate, block_aggregate); + + warp_prefix = scan_op(initial_value, warp_prefix); + + if (warp_id == 0) + warp_prefix = initial_value; + + return warp_prefix; + } + + //--------------------------------------------------------------------- + // Exclusive scans + //--------------------------------------------------------------------- + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator + { + // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid. + T block_aggregate; + ExclusiveScan(input, exclusive_output, scan_op, block_aggregate); + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op) ///< [in] Binary scan operator + { + T block_aggregate; + ExclusiveScan(input, exclusive_output, initial_value, scan_op, block_aggregate); + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. 
+ template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + WarpScanT my_warp_scan(temp_storage.warp_scan[warp_id]); + + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. + T inclusive_output; + my_warp_scan.Scan(input, inclusive_output, exclusive_output, scan_op); + + // Compute the warp-wide prefix and block-wide aggregate for each warp. Warp prefix for warp0 is invalid. +// T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate); + +//-------------------------------------------------- + // Last lane in each warp shares its warp-aggregate + if (lane_id == WARP_THREADS - 1) + temp_storage.warp_aggregates[warp_id] = inclusive_output; + + CTA_SYNC(); + + // Get the warp scan partial + T warp_inclusive, warp_prefix; + if (lane_id < WARPS) + { + // Scan the warpscan partials + T warp_val = temp_storage.warp_aggregates[lane_id]; + WarpAggregateScanT(temp_storage.inner_scan[warp_id]).Scan(warp_val, warp_inclusive, warp_prefix, scan_op); + } + + warp_prefix = my_warp_scan.Broadcast(warp_prefix, warp_id); + block_aggregate = my_warp_scan.Broadcast(warp_inclusive, WARPS - 1); +//-------------------------------------------------- + + // Apply warp prefix to our lane's partial + if (warp_id != 0) + { + exclusive_output = scan_op(warp_prefix, exclusive_output); + if (lane_id == 0) + exclusive_output = warp_prefix; + } + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + WarpScanT my_warp_scan(temp_storage.warp_scan[warp_id]); + + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. 
+ T inclusive_output; + my_warp_scan.Scan(input, inclusive_output, exclusive_output, scan_op); + + // Compute the warp-wide prefix and block-wide aggregate for each warp +// T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate, initial_value); + +//-------------------------------------------------- + // Last lane in each warp shares its warp-aggregate + if (lane_id == WARP_THREADS - 1) + temp_storage.warp_aggregates[warp_id] = inclusive_output; + + CTA_SYNC(); + + // Get the warp scan partial + T warp_inclusive, warp_prefix; + if (lane_id < WARPS) + { + // Scan the warpscan partials + T warp_val = temp_storage.warp_aggregates[lane_id]; + WarpAggregateScanT(temp_storage.inner_scan[warp_id]).Scan(warp_val, warp_inclusive, warp_prefix, initial_value, scan_op); + } + + warp_prefix = my_warp_scan.Broadcast(warp_prefix, warp_id); + block_aggregate = my_warp_scan.Broadcast(warp_inclusive, WARPS - 1); +//-------------------------------------------------- + + // Apply warp prefix to our lane's partial + exclusive_output = scan_op(warp_prefix, exclusive_output); + if (lane_id == 0) + exclusive_output = warp_prefix; + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. + { + // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid. + T block_aggregate; + ExclusiveScan(input, exclusive_output, scan_op, block_aggregate); + + // Use the first warp to determine the thread block prefix, returning the result in lane0 + if (warp_id == 0) + { + T block_prefix = block_prefix_callback_op(block_aggregate); + if (lane_id == 0) + { + // Share the prefix with all threads + temp_storage.block_prefix = block_prefix; + exclusive_output = block_prefix; // The block prefix is the exclusive output for tid0 + } + } + + CTA_SYNC(); + + // Incorporate thread block prefix into outputs + T block_prefix = temp_storage.block_prefix; + if (linear_tid > 0) + { + exclusive_output = scan_op(block_prefix, exclusive_output); + } + } + + + //--------------------------------------------------------------------- + // Inclusive scans + //--------------------------------------------------------------------- + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. 
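Before the callback overload below, a note on what distinguishes this "warp_scans2" variant, visible in the two overloads above where the ComputeWarpPrefix call is bypassed: instead of every thread serially folding the warp totals, the first WARPS lanes of each warp cooperatively re-scan the shared totals with the nested WarpAggregateScanT, and Broadcast then spreads each warp's prefix and the block aggregate to the remaining lanes. A self-contained sketch of that inner step (hypothetical helper, assuming the same cub::WarpScan interface used above):

#include <cub/warp/warp_scan.cuh>

// Scan one total per warp with a logical warp of width NUM_WARPS. Lanes
// 0..NUM_WARPS-1 receive the inclusive/exclusive scans of the totals; the
// caller broadcasts the results to the remaining lanes afterwards.
template <typename T, int NUM_WARPS, typename ScanOp>
__device__ void ScanWarpTotals(
    T (&warp_aggregates)[NUM_WARPS],                            // shared
    typename cub::WarpScan<T, NUM_WARPS>::TempStorage &storage, // shared
    ScanOp       scan_op,
    unsigned int lane_id,
    T           &warp_inclusive,
    T           &warp_exclusive)
{
    if (lane_id < NUM_WARPS)
    {
        T my_total = warp_aggregates[lane_id];
        cub::WarpScan<T, NUM_WARPS>(storage).Scan(
            my_total, warp_inclusive, warp_exclusive, scan_op);
    }
}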
+ template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator + { + T block_aggregate; + InclusiveScan(input, inclusive_output, scan_op, block_aggregate); + } + + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + WarpScanT(temp_storage.warp_scan[warp_id]).InclusiveScan(input, inclusive_output, scan_op); + + // Compute the warp-wide prefix and block-wide aggregate for each warp. Warp prefix for warp0 is invalid. + T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate); + + // Apply warp prefix to our lane's partial + if (warp_id != 0) + { + inclusive_output = scan_op(warp_prefix, inclusive_output); + } + } + + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. + { + T block_aggregate; + InclusiveScan(input, exclusive_output, scan_op, block_aggregate); + + // Use the first warp to determine the thread block prefix, returning the result in lane0 + if (warp_id == 0) + { + T block_prefix = block_prefix_callback_op(block_aggregate); + if (lane_id == 0) + { + // Share the prefix with all threads + temp_storage.block_prefix = block_prefix; + } + } + + CTA_SYNC(); + + // Incorporate thread block prefix into outputs + T block_prefix = temp_storage.block_prefix; + exclusive_output = scan_op(block_prefix, exclusive_output); + } + + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/SRC/cub/block/specializations/block_scan_warp_scans3.cuh b/SRC/cub/block/specializations/block_scan_warp_scans3.cuh new file mode 100644 index 00000000..147ca4c5 --- /dev/null +++ b/SRC/cub/block/specializations/block_scan_warp_scans3.cuh @@ -0,0 +1,418 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::BlockScanWarpscans provides warpscan-based variants of parallel prefix scan across a CUDA thread block.
+ */
+
+#pragma once
+
+#include "../../util_arch.cuh"
+#include "../../util_ptx.cuh"
+#include "../../warp/warp_scan.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread block.
+ */ +template < + typename T, + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockScanWarpScans +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + /// Number of warp threads + INNER_WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), + OUTER_WARP_THREADS = BLOCK_THREADS / INNER_WARP_THREADS, + + /// Number of outer scan warps + OUTER_WARPS = INNER_WARP_THREADS + }; + + /// Outer WarpScan utility type + typedef WarpScan OuterWarpScanT; + + /// Inner WarpScan utility type + typedef WarpScan InnerWarpScanT; + + typedef typename OuterWarpScanT::TempStorage OuterScanArray[OUTER_WARPS]; + + + /// Shared memory storage layout type + struct _TempStorage + { + union Aliasable + { + Uninitialized outer_warp_scan; ///< Buffer for warp-synchronous outer scans + typename InnerWarpScanT::TempStorage inner_warp_scan; ///< Buffer for warp-synchronous inner scan + + } aliasable; + + T warp_aggregates[OUTER_WARPS]; + + T block_aggregate; ///< Shared prefix for the entire thread block + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + // Thread fields + _TempStorage &temp_storage; + unsigned int linear_tid; + unsigned int warp_id; + unsigned int lane_id; + + + //--------------------------------------------------------------------- + // Constructors + //--------------------------------------------------------------------- + + /// Constructor + __device__ __forceinline__ BlockScanWarpScans( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), + warp_id((OUTER_WARPS == 1) ? 0 : linear_tid / OUTER_WARP_THREADS), + lane_id((OUTER_WARPS == 1) ? linear_tid : linear_tid % OUTER_WARP_THREADS) + {} + + + //--------------------------------------------------------------------- + // Exclusive scans + //--------------------------------------------------------------------- + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator + { + // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid. + T block_aggregate; + ExclusiveScan(input, exclusive_output, scan_op, block_aggregate); + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. 
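Before the remaining overloads below, a compile-time restatement of the decomposition above for one block shape (our numbers, not from the patch): with 1024 threads and 32-thread hardware warps, the block tiles into 32 outer groups of 32 threads, and OUTER_WARPS is pinned to INNER_WARP_THREADS precisely so that a single inner warp can scan all of the outer totals in one step.

enum
{
    BLOCK_THREADS      = 1024,
    INNER_WARP_THREADS = 32,  // CUB_WARP_THREADS(PTX_ARCH) on current GPUs
    OUTER_WARP_THREADS = BLOCK_THREADS / INNER_WARP_THREADS,  // = 32
    OUTER_WARPS        = INNER_WARP_THREADS                   // = 32
};

static_assert(OUTER_WARP_THREADS * OUTER_WARPS == BLOCK_THREADS,
              "the two-level decomposition must tile the block exactly");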
+ template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op) ///< [in] Binary scan operator + { + T block_aggregate; + ExclusiveScan(input, exclusive_output, initial_value, scan_op, block_aggregate); + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. + T inclusive_output; + OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).Scan( + input, inclusive_output, exclusive_output, scan_op); + + // Share outer warp total + if (lane_id == OUTER_WARP_THREADS - 1) + temp_storage.warp_aggregates[warp_id] = inclusive_output; + + CTA_SYNC(); + + if (linear_tid < INNER_WARP_THREADS) + { + T outer_warp_input = temp_storage.warp_aggregates[linear_tid]; + T outer_warp_exclusive; + + InnerWarpScanT(temp_storage.aliasable.inner_warp_scan).ExclusiveScan( + outer_warp_input, outer_warp_exclusive, scan_op, block_aggregate); + + temp_storage.block_aggregate = block_aggregate; + temp_storage.warp_aggregates[linear_tid] = outer_warp_exclusive; + } + + CTA_SYNC(); + + if (warp_id != 0) + { + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + + // Apply warp prefix to our lane's partial + T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id]; + exclusive_output = scan_op(outer_warp_exclusive, exclusive_output); + if (lane_id == 0) + exclusive_output = outer_warp_exclusive; + } + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. 
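+        // Phase structure, as in the overload above: (1) each group of
+        // OUTER_WARP_THREADS threads runs an outer warp scan; (2) the first
+        // INNER_WARP_THREADS threads exclusive-scan the group totals, here
+        // seeded with initial_value; (3) every group, including group 0,
+        // folds its exclusive total into its lanes, with lane0 of each group
+        // taking the total itself.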
+ T inclusive_output; + OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).Scan( + input, inclusive_output, exclusive_output, scan_op); + + // Share outer warp total + if (lane_id == OUTER_WARP_THREADS - 1) + { + temp_storage.warp_aggregates[warp_id] = inclusive_output; + } + + CTA_SYNC(); + + if (linear_tid < INNER_WARP_THREADS) + { + T outer_warp_input = temp_storage.warp_aggregates[linear_tid]; + T outer_warp_exclusive; + + InnerWarpScanT(temp_storage.aliasable.inner_warp_scan).ExclusiveScan( + outer_warp_input, outer_warp_exclusive, initial_value, scan_op, block_aggregate); + + temp_storage.block_aggregate = block_aggregate; + temp_storage.warp_aggregates[linear_tid] = outer_warp_exclusive; + } + + CTA_SYNC(); + + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + + // Apply warp prefix to our lane's partial + T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id]; + exclusive_output = scan_op(outer_warp_exclusive, exclusive_output); + if (lane_id == 0) + exclusive_output = outer_warp_exclusive; + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. The call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. + { + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. 
+ T inclusive_output; + OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).Scan( + input, inclusive_output, exclusive_output, scan_op); + + // Share outer warp total + if (lane_id == OUTER_WARP_THREADS - 1) + temp_storage.warp_aggregates[warp_id] = inclusive_output; + + CTA_SYNC(); + + if (linear_tid < INNER_WARP_THREADS) + { + InnerWarpScanT inner_scan(temp_storage.aliasable.inner_warp_scan); + + T upsweep = temp_storage.warp_aggregates[linear_tid]; + T downsweep_prefix, block_aggregate; + + inner_scan.ExclusiveScan(upsweep, downsweep_prefix, scan_op, block_aggregate); + + // Use callback functor to get block prefix in lane0 and then broadcast to other lanes + T block_prefix = block_prefix_callback_op(block_aggregate); + block_prefix = inner_scan.Broadcast(block_prefix, 0); + + downsweep_prefix = scan_op(block_prefix, downsweep_prefix); + if (linear_tid == 0) + downsweep_prefix = block_prefix; + + temp_storage.warp_aggregates[linear_tid] = downsweep_prefix; + } + + CTA_SYNC(); + + // Apply warp prefix to our lane's partial (or assign it if partial is invalid) + T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id]; + exclusive_output = scan_op(outer_warp_exclusive, exclusive_output); + if (lane_id == 0) + exclusive_output = outer_warp_exclusive; + } + + + //--------------------------------------------------------------------- + // Inclusive scans + //--------------------------------------------------------------------- + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator + { + T block_aggregate; + InclusiveScan(input, inclusive_output, scan_op, block_aggregate); + } + + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. 
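+        // (The outer scan below is inclusive, so no lane's partial is invalid.)
+        // Phases: (1) outer inclusive warp scan per group; (2) the first
+        // hardware warp exclusive-scans the group totals; (3) groups other
+        // than the first fold their exclusive totals into their lanes, while
+        // group 0 needs no prefix and already holds the block aggregate from
+        // the inner scan.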
+        OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).InclusiveScan(
+            input, inclusive_output, scan_op);
+
+        // Share outer warp total
+        if (lane_id == OUTER_WARP_THREADS - 1)
+            temp_storage.warp_aggregates[warp_id] = inclusive_output;
+
+        CTA_SYNC();
+
+        if (linear_tid < INNER_WARP_THREADS)
+        {
+            T outer_warp_input = temp_storage.warp_aggregates[linear_tid];
+            T outer_warp_exclusive;
+
+            InnerWarpScanT(temp_storage.aliasable.inner_warp_scan).ExclusiveScan(
+                outer_warp_input, outer_warp_exclusive, scan_op, block_aggregate);
+
+            temp_storage.block_aggregate = block_aggregate;
+            temp_storage.warp_aggregates[linear_tid] = outer_warp_exclusive;
+        }
+
+        CTA_SYNC();
+
+        if (warp_id != 0)
+        {
+            // Retrieve block aggregate
+            block_aggregate = temp_storage.block_aggregate;
+
+            // Apply warp prefix to our lane's partial
+            T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id];
+            inclusive_output = scan_op(outer_warp_exclusive, inclusive_output);
+        }
+    }
+
+
+    /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. The call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs.
+    template <
+        typename ScanOp,
+        typename BlockPrefixCallbackOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T input, ///< [in] Calling thread's input item
+        T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input)
+        ScanOp scan_op, ///< [in] Binary scan operator
+        BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs.
+    {
+        // Compute warp scan in each warp. The exclusive output from each lane0 is invalid.
+        OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).InclusiveScan(
+            input, inclusive_output, scan_op);
+
+        // Share outer warp total
+        if (lane_id == OUTER_WARP_THREADS - 1)
+            temp_storage.warp_aggregates[warp_id] = inclusive_output;
+
+        CTA_SYNC();
+
+        if (linear_tid < INNER_WARP_THREADS)
+        {
+            InnerWarpScanT inner_scan(temp_storage.aliasable.inner_warp_scan);
+
+            T upsweep = temp_storage.warp_aggregates[linear_tid];
+            T downsweep_prefix, block_aggregate;
+            inner_scan.ExclusiveScan(upsweep, downsweep_prefix, scan_op, block_aggregate);
+
+            // Use callback functor to get block prefix in lane0 and then broadcast to other lanes
+            T block_prefix = block_prefix_callback_op(block_aggregate);
+            block_prefix = inner_scan.Broadcast(block_prefix, 0);
+
+            downsweep_prefix = scan_op(block_prefix, downsweep_prefix);
+            if (linear_tid == 0)
+                downsweep_prefix = block_prefix;
+
+            temp_storage.warp_aggregates[linear_tid] = downsweep_prefix;
+        }
+
+        CTA_SYNC();
+
+        // Apply warp prefix to our lane's partial
+        T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id];
+        inclusive_output = scan_op(outer_warp_exclusive, inclusive_output);
+    }
+
+
+};
+
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
+
diff --git a/SRC/cub/block_range/block_range_histo.cuh b/SRC/cub/block_range/block_range_histo.cuh
deleted file mode 100644
index 3ad884c1..00000000
--- a/SRC/cub/block_range/block_range_histo.cuh
+++ /dev/null
@@ -1,319 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill. All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::BlockRangeHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram across a range of tiles.
- */
-
-#pragma once
-
-#include <iterator>
-
-#include "specializations/block_range_histo_gatomic.cuh"
-#include "specializations/block_range_histo_satomic.cuh"
-#include "specializations/block_range_histo_sort.cuh"
-#include "../util_type.cuh"
-#include "../grid/grid_mapping.cuh"
-#include "../grid/grid_even_share.cuh"
-#include "../grid/grid_queue.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Algorithmic variants
- ******************************************************************************/
-
-
-/**
- * \brief DeviceHistogramAlgorithm enumerates alternative algorithms for BlockRangeHistogram.
- */
-enum DeviceHistogramAlgorithm
-{
-
-    /**
-     * \par Overview
-     * A two-kernel approach in which:
-     * -# Thread blocks in the first kernel aggregate their own privatized
-     *    histograms using block-wide sorting (see BlockHistogramAlgorithm::BLOCK_HISTO_SORT).
-     * -# A single thread block in the second kernel reduces them into the output histogram(s).
-     *
-     * \par Performance Considerations
-     * Delivers consistent throughput regardless of sample bin distribution.
-     *
-     * However, because histograms are privatized in shared memory, a large
-     * number of bins (e.g., thousands) may adversely affect occupancy and
-     * performance (or even the ability to launch).
-     */
-    DEVICE_HISTO_SORT,
-
-
-    /**
-     * \par Overview
-     * A two-kernel approach in which:
-     * -# Thread blocks in the first kernel aggregate their own privatized
-     *    histograms using shared-memory \p atomicAdd().
-     * -# A single thread block in the second kernel reduces them into the
-     *    output histogram(s).
-     *
-     * \par Performance Considerations
-     * Performance is strongly tied to the hardware implementation of atomic
-     * addition, and may be significantly degraded for non uniformly-random
-     * input distributions where many concurrent updates are likely to be
-     * made to the same bin counter.
-     *
-     * However, because histograms are privatized in shared memory, a large
-     * number of bins (e.g., thousands) may adversely affect occupancy and
-     * performance (or even the ability to launch).
-     */
-    DEVICE_HISTO_SHARED_ATOMIC,
-
-
-    /**
-     * \par Overview
-     * A single-kernel approach in which thread blocks update the output histogram(s) directly
-     * using global-memory \p atomicAdd().
-     *
-     * \par Performance Considerations
-     * Performance is strongly tied to the hardware implementation of atomic
-     * addition, and may be significantly degraded for non uniformly-random
-     * input distributions where many concurrent updates are likely to be
-     * made to the same bin counter.
-     *
-     * Performance is not significantly impacted when computing histograms having large
-     * numbers of bins (e.g., thousands).
-     */
-    DEVICE_HISTO_GLOBAL_ATOMIC,
-
-};
-
-
-/******************************************************************************
- * Tuning policy
- ******************************************************************************/
-
-/**
- * Parameterizable tuning policy type for BlockRangeHistogram
- */
-template <
-    int _BLOCK_THREADS, ///< Threads per thread block
-    int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
-    DeviceHistogramAlgorithm _HISTO_ALGORITHM, ///< Cooperative histogram algorithm to use
-    GridMappingStrategy _GRID_MAPPING> ///< How to map tiles of input onto thread blocks
-struct BlockRangeHistogramPolicy
-{
-    enum
-    {
-        BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block
-        ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
-    };
-
-    static const DeviceHistogramAlgorithm HISTO_ALGORITHM = _HISTO_ALGORITHM; ///< Cooperative histogram algorithm to use
-    static const GridMappingStrategy GRID_MAPPING = _GRID_MAPPING; ///< How to map tiles of input onto thread blocks
-};
-
-
-
-/******************************************************************************
- * Thread block abstractions
- ******************************************************************************/
-
-/**
- * \brief BlockRangeHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram across a range of tiles.
- */
-template <
-    typename BlockRangeHistogramPolicy, ///< Parameterized BlockRangeHistogramPolicy tuning policy type
-    int BINS, ///< Number of histogram bins per channel
-    int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed)
-    int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed
-    typename InputIterator, ///< Random-access input iterator type for reading samples. Must have an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1]
-    typename HistoCounter, ///< Integer type for counting sample occurrences per histogram bin
-    typename Offset> ///< Signed integer type for global offsets
-struct BlockRangeHistogram
-{
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    // Histogram grid algorithm
-    static const DeviceHistogramAlgorithm HISTO_ALGORITHM = BlockRangeHistogramPolicy::HISTO_ALGORITHM;
-
-    // Alternative internal implementation types
-    typedef BlockRangeHistogramSort< BlockRangeHistogramPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIterator, HistoCounter, Offset> BlockRangeHistogramSortT;
-    typedef BlockRangeHistogramSharedAtomic< BlockRangeHistogramPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIterator, HistoCounter, Offset> BlockRangeHistogramSharedAtomicT;
-    typedef BlockRangeHistogramGlobalAtomic< BlockRangeHistogramPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIterator, HistoCounter, Offset> BlockRangeHistogramGlobalAtomicT;
-
-    // Internal block sweep histogram type
-    typedef typename If<(HISTO_ALGORITHM == DEVICE_HISTO_SORT),
-        BlockRangeHistogramSortT,
-        typename If<(HISTO_ALGORITHM == DEVICE_HISTO_SHARED_ATOMIC),
-            BlockRangeHistogramSharedAtomicT,
-            BlockRangeHistogramGlobalAtomicT>::Type>::Type InternalBlockDelegate;
-
-    enum
-    {
-        TILE_ITEMS = InternalBlockDelegate::TILE_ITEMS,
-    };
-
-
-    // Temporary storage type
-    typedef typename InternalBlockDelegate::TempStorage TempStorage;
-
-    //---------------------------------------------------------------------
-    // Per-thread fields
-    //---------------------------------------------------------------------
-
-    // Internal block delegate
-    InternalBlockDelegate internal_delegate;
-
-
-    //---------------------------------------------------------------------
-    // Interface
-    //---------------------------------------------------------------------
-
-    /**
-     * Constructor
-     */
-    __device__ __forceinline__ BlockRangeHistogram(
-        TempStorage &temp_storage, ///< Reference to temp_storage
-        InputIterator d_in, ///< Input data to reduce
-        HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]) ///< Reference to output histograms
-    :
-        internal_delegate(temp_storage, d_in, d_out_histograms)
-    {}
-
-
-    /**
-     * \brief Reduce a consecutive segment of input tiles
-     */
-    __device__ __forceinline__ void ConsumeRange(
-        Offset block_offset, ///< [in] Threadblock begin offset (inclusive)
-        Offset block_end) ///< [in] Threadblock end offset (exclusive)
-    {
-        // Consume subsequent full tiles of input
-        while (block_offset + TILE_ITEMS <= block_end)
-        {
-            internal_delegate.ConsumeTile<true>(block_offset);
-            block_offset += TILE_ITEMS;
-        }
-
-        // Consume a partially-full tile
-        if (block_offset < block_end)
-        {
-            int valid_items = block_end - block_offset;
-            internal_delegate.ConsumeTile<false>(block_offset, valid_items);
-        }
-
-        // Aggregate output
-        internal_delegate.AggregateOutput();
-    }
-
-
-    /**
-     * Reduce a consecutive segment of input tiles
-     */
-    __device__ __forceinline__ void ConsumeRange(
-        Offset num_items, ///< [in] Total number of global input items
-        GridEvenShare<Offset> &even_share, ///< [in] GridEvenShare descriptor
-        GridQueue<Offset> &queue, ///< [in,out] GridQueue descriptor
-        Int2Type<GRID_MAPPING_EVEN_SHARE> is_even_share) ///< [in] Marker type indicating this is an even-share mapping
-    {
-        even_share.BlockInit();
-        ConsumeRange(even_share.block_offset, even_share.block_end);
-    }
-
-
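The Int2Type<GRID_MAPPING_EVEN_SHARE> marker above selects the mapping strategy at compile time. A minimal sketch of the tag-dispatch idiom (simplified from CUB's util_type.cuh; the Consumer struct and the 0/1 constants are illustrative stand-ins for the GridMappingStrategy enumerators):

    // Wrap a compile-time integer in a distinct type so that overload
    // resolution, rather than a runtime branch, picks the code path.
    template <int A>
    struct Int2Type { enum { VALUE = A }; };

    struct Consumer
    {
        void ConsumeRange(Int2Type<0>) { /* even-share path */ }
        void ConsumeRange(Int2Type<1>) { /* dynamic queue-drain path */ }
    };

    // Consumer().ConsumeRange(Int2Type<MAPPING>()) resolves statically.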
-    /**
-     * Dequeue and reduce tiles of items as part of an inter-block scan
-     */
-    __device__ __forceinline__ void ConsumeRange(
-        int num_items, ///< Total number of input items
-        GridQueue<Offset> queue) ///< Queue descriptor for assigning tiles of work to thread blocks
-    {
-        // Shared block offset
-        __shared__ Offset shared_block_offset;
-
-        // We give each thread block at least one tile of input.
-        Offset block_offset = blockIdx.x * TILE_ITEMS;
-        Offset even_share_base = gridDim.x * TILE_ITEMS;
-
-        // Process full tiles of input
-        while (block_offset + TILE_ITEMS <= num_items)
-        {
-            internal_delegate.ConsumeTile<true>(block_offset);
-
-            // Dequeue up to TILE_ITEMS
-            if (threadIdx.x == 0)
-                shared_block_offset = queue.Drain(TILE_ITEMS) + even_share_base;
-
-            __syncthreads();
-
-            block_offset = shared_block_offset;
-
-            __syncthreads();
-        }
-
-        // Consume a partially-full tile
-        if (block_offset < num_items)
-        {
-            int valid_items = num_items - block_offset;
-            internal_delegate.ConsumeTile<false>(block_offset, valid_items);
-        }
-
-        // Aggregate output
-        internal_delegate.AggregateOutput();
-    }
-
-
-    /**
-     * Dequeue and reduce tiles of items as part of an inter-block scan
-     */
-    __device__ __forceinline__ void ConsumeRange(
-        Offset num_items, ///< [in] Total number of global input items
-        GridEvenShare<Offset> &even_share, ///< [in] GridEvenShare descriptor
-        GridQueue<Offset> &queue, ///< [in,out] GridQueue descriptor
-        Int2Type<GRID_MAPPING_DYNAMIC> is_dynamic) ///< [in] Marker type indicating this is a dynamic mapping
-    {
-        ConsumeRange(num_items, queue);
-    }
-
-
-};
-
-
-
-
-} // CUB namespace
-CUB_NS_POSTFIX // Optional outer namespace(s)
-
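Between these two deleted files, a brief sketch of the shared-memory privatization idea behind DEVICE_HISTO_SHARED_ATOMIC above (an illustrative single-kernel variant that merges bins with global atomics, not the deleted two-kernel implementation; all names are hypothetical):

    __global__ void PrivatizedHistogram256(
        const unsigned char *d_samples,  // one byte per sample = bin index
        unsigned int *d_histogram,       // 256 global bin counters
        int num_samples)
    {
        __shared__ unsigned int smem_histo[256];

        // Zero this block's privatized bins
        for (int bin = threadIdx.x; bin < 256; bin += blockDim.x)
            smem_histo[bin] = 0;
        __syncthreads();

        // Grid-stride over the input, counting into shared-memory bins
        for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num_samples; i += blockDim.x * gridDim.x)
            atomicAdd(&smem_histo[d_samples[i]], 1u);
        __syncthreads();

        // Merge the privatized bins into the global histogram
        for (int bin = threadIdx.x; bin < 256; bin += blockDim.x)
            atomicAdd(&d_histogram[bin], smem_histo[bin]);
    }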
diff --git a/SRC/cub/block_range/block_range_radix_sort_downsweep.cuh b/SRC/cub/block_range/block_range_radix_sort_downsweep.cuh
deleted file mode 100644
index 4141315e..00000000
--- a/SRC/cub/block_range/block_range_radix_sort_downsweep.cuh
+++ /dev/null
@@ -1,744 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill. All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * BlockRangeRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep across a range of tiles.
- */
-
-
-#pragma once
-
-#include "../thread/thread_load.cuh"
-#include "../block/block_load.cuh"
-#include "../block/block_store.cuh"
-#include "../block/block_radix_rank.cuh"
-#include "../block/block_exchange.cuh"
-#include "../util_type.cuh"
-#include "../iterator/cache_modified_input_iterator.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Tuning policy types
- ******************************************************************************/
-
-/**
- * Types of scattering strategies
- */
-enum RadixSortScatterAlgorithm
-{
-    RADIX_SORT_SCATTER_DIRECT, ///< Scatter directly from registers to global bins
-    RADIX_SORT_SCATTER_TWO_PHASE, ///< First scatter from registers into shared memory bins, then into global bins
-};
-
-
-/**
- * Parameterizable tuning policy type for BlockRangeRadixSortDownsweep
- */
-template <
-    int _BLOCK_THREADS, ///< Threads per thread block
-    int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
-    BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use
-    CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading keys (and values)
-    bool _EXCHANGE_TIME_SLICING, ///< Whether or not to time-slice key/value exchanges through shared memory to lower shared memory pressure
-    bool _MEMOIZE_OUTER_SCAN, ///< Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure. See BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE for more details.
-    BlockScanAlgorithm _INNER_SCAN_ALGORITHM, ///< The BlockScan algorithm to use
-    RadixSortScatterAlgorithm _SCATTER_ALGORITHM, ///< The scattering strategy to use
-    cudaSharedMemConfig _SMEM_CONFIG, ///< Shared memory bank mode
-    int _RADIX_BITS> ///< The number of radix bits, i.e., log2(bins)
-struct BlockRangeRadixSortDownsweepPolicy
-{
-    enum
-    {
-        BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block
-        ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
-        EXCHANGE_TIME_SLICING = _EXCHANGE_TIME_SLICING, ///< Whether or not to time-slice key/value exchanges through shared memory to lower shared memory pressure
-        RADIX_BITS = _RADIX_BITS, ///< The number of radix bits, i.e., log2(bins)
-        MEMOIZE_OUTER_SCAN = _MEMOIZE_OUTER_SCAN, ///< Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure. See BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE for more details.
-    };
-
-    static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use
-    static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading keys (and values)
-    static const BlockScanAlgorithm INNER_SCAN_ALGORITHM = _INNER_SCAN_ALGORITHM; ///< The BlockScan algorithm to use
-    static const RadixSortScatterAlgorithm SCATTER_ALGORITHM = _SCATTER_ALGORITHM; ///< The scattering strategy to use
-    static const cudaSharedMemConfig SMEM_CONFIG = _SMEM_CONFIG; ///< Shared memory bank mode
-};
-
-
-/******************************************************************************
- * Thread block abstractions
- ******************************************************************************/
-
-/**
- * \brief BlockRangeRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep across a range of tiles.
- */
-template <
-    typename BlockRangeRadixSortDownsweepPolicy, ///< Parameterized BlockRangeRadixSortDownsweepPolicy tuning policy type
-    bool DESCENDING, ///< Whether or not the sorted-order is high-to-low
-    typename Key, ///< Key type
-    typename Value, ///< Value type
-    typename Offset> ///< Signed integer type for global offsets
-struct BlockRangeRadixSortDownsweep
-{
-    //---------------------------------------------------------------------
-    // Type definitions and constants
-    //---------------------------------------------------------------------
-
-    // Appropriate unsigned-bits representation of Key
-    typedef typename Traits<Key>::UnsignedBits UnsignedBits;
-
-    static const UnsignedBits MIN_KEY = Traits<Key>::MIN_KEY;
-    static const UnsignedBits MAX_KEY = Traits<Key>::MAX_KEY;
-
-    static const BlockLoadAlgorithm LOAD_ALGORITHM = BlockRangeRadixSortDownsweepPolicy::LOAD_ALGORITHM;
-    static const CacheLoadModifier LOAD_MODIFIER = BlockRangeRadixSortDownsweepPolicy::LOAD_MODIFIER;
-    static const BlockScanAlgorithm INNER_SCAN_ALGORITHM = BlockRangeRadixSortDownsweepPolicy::INNER_SCAN_ALGORITHM;
-    static const RadixSortScatterAlgorithm SCATTER_ALGORITHM = BlockRangeRadixSortDownsweepPolicy::SCATTER_ALGORITHM;
-    static const cudaSharedMemConfig SMEM_CONFIG = BlockRangeRadixSortDownsweepPolicy::SMEM_CONFIG;
-
-    enum
-    {
-        BLOCK_THREADS = BlockRangeRadixSortDownsweepPolicy::BLOCK_THREADS,
-        ITEMS_PER_THREAD = BlockRangeRadixSortDownsweepPolicy::ITEMS_PER_THREAD,
-        EXCHANGE_TIME_SLICING = BlockRangeRadixSortDownsweepPolicy::EXCHANGE_TIME_SLICING,
-        RADIX_BITS = BlockRangeRadixSortDownsweepPolicy::RADIX_BITS,
-        MEMOIZE_OUTER_SCAN = BlockRangeRadixSortDownsweepPolicy::MEMOIZE_OUTER_SCAN,
-        TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
-
-        RADIX_DIGITS = 1 << RADIX_BITS,
-        KEYS_ONLY = Equals<Value, NullType>::VALUE,
-
-        WARP_THREADS = CUB_PTX_LOG_WARP_THREADS,
-        WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
-
-        BYTES_PER_SIZET = sizeof(Offset),
-        LOG_BYTES_PER_SIZET = Log2<BYTES_PER_SIZET>::VALUE,
-
-        LOG_SMEM_BANKS = CUB_PTX_LOG_SMEM_BANKS,
-        SMEM_BANKS = 1 << LOG_SMEM_BANKS,
-
-        DIGITS_PER_SCATTER_PASS = BLOCK_THREADS / SMEM_BANKS,
-        SCATTER_PASSES = RADIX_DIGITS / DIGITS_PER_SCATTER_PASS,
-
-        LOG_STORE_TXN_THREADS = LOG_SMEM_BANKS,
-        STORE_TXN_THREADS = 1 << LOG_STORE_TXN_THREADS,
-    };
-
-    // Input iterator wrapper types
-    typedef CacheModifiedInputIterator<LOAD_MODIFIER, UnsignedBits, Offset> KeysItr;
-    typedef CacheModifiedInputIterator<LOAD_MODIFIER, Value, Offset> ValuesItr;
-
-    // BlockRadixRank type
-    typedef BlockRadixRank<
-        BLOCK_THREADS,
-        RADIX_BITS,
-        DESCENDING,
-        MEMOIZE_OUTER_SCAN,
-        INNER_SCAN_ALGORITHM,
-        SMEM_CONFIG> BlockRadixRank;
-
-    // BlockLoad type (keys)
-    typedef BlockLoad<
-        KeysItr,
-        BLOCK_THREADS,
-        ITEMS_PER_THREAD,
-        LOAD_ALGORITHM,
-        EXCHANGE_TIME_SLICING> BlockLoadKeys;
-
-    // BlockLoad type (values)
-    typedef BlockLoad<
-        ValuesItr,
-        BLOCK_THREADS,
-        ITEMS_PER_THREAD,
-        LOAD_ALGORITHM,
-        EXCHANGE_TIME_SLICING> BlockLoadValues;
-
-    // BlockExchange type (keys)
-    typedef BlockExchange<
-        UnsignedBits,
-        BLOCK_THREADS,
-        ITEMS_PER_THREAD,
-        EXCHANGE_TIME_SLICING> BlockExchangeKeys;
-
-    // BlockExchange type (values)
-    typedef BlockExchange<
-        Value,
-        BLOCK_THREADS,
-        ITEMS_PER_THREAD,
-        EXCHANGE_TIME_SLICING> BlockExchangeValues;
-
-
-    /**
-     * Shared memory storage layout
-     */
-    struct _TempStorage
-    {
-        Offset relative_bin_offsets[RADIX_DIGITS + 1];
-        bool short_circuit;
-
-        union
-        {
-            typename BlockRadixRank::TempStorage ranking;
-            typename BlockLoadKeys::TempStorage load_keys;
-            typename BlockLoadValues::TempStorage load_values;
-            typename BlockExchangeKeys::TempStorage exchange_keys;
-            typename BlockExchangeValues::TempStorage exchange_values;
-        };
-    };
-
-
-    /// Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Thread fields
-    //---------------------------------------------------------------------
-
-    // Shared storage for this CTA
-    _TempStorage &temp_storage;
-
-    // Input and output device pointers
-    KeysItr d_keys_in;
-    ValuesItr d_values_in;
-    UnsignedBits *d_keys_out;
-    Value *d_values_out;
-
-    // The global scatter base offset for each digit (valid in the first RADIX_DIGITS threads)
-    Offset bin_offset;
-
-    // The least-significant bit position of the current digit to extract
-    int current_bit;
-
-    // Number of bits in current digit
-    int num_bits;
-
-    // Whether to short-circuit
-    bool short_circuit;
-
-
-
-    //---------------------------------------------------------------------
-    // Utility methods
-    //---------------------------------------------------------------------
-
-    /**
-     * Decodes given keys to lookup digit offsets in shared memory
-     */
-    __device__ __forceinline__ void DecodeRelativeBinOffsets(
-        UnsignedBits (&twiddled_keys)[ITEMS_PER_THREAD],
-        Offset (&relative_bin_offsets)[ITEMS_PER_THREAD])
-    {
-        #pragma unroll
-        for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
-        {
-            UnsignedBits digit = BFE(twiddled_keys[KEY], current_bit, num_bits);
-
-            // Lookup base digit offset from shared memory
-            relative_bin_offsets[KEY] = temp_storage.relative_bin_offsets[digit];
-        }
-    }
-
-
-    /**
-     * Scatter ranked items to global memory
-     */
-    template <bool FULL_TILE, typename T>
-    __device__ __forceinline__ void ScatterItems(
-        T (&items)[ITEMS_PER_THREAD],
-        int (&local_ranks)[ITEMS_PER_THREAD],
-        Offset (&relative_bin_offsets)[ITEMS_PER_THREAD],
-        T *d_out,
-        Offset valid_items)
-    {
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            // Scatter if not out-of-bounds
-            if (FULL_TILE || (local_ranks[ITEM] < valid_items))
-            {
-                d_out[relative_bin_offsets[ITEM] + local_ranks[ITEM]] = items[ITEM];
-            }
-        }
-    }
-
-
-    /**
-     * Scatter ranked keys directly to global memory
-     */
-    template <bool FULL_TILE>
-    __device__ __forceinline__ void ScatterKeys(
-        UnsignedBits (&twiddled_keys)[ITEMS_PER_THREAD],
-        Offset (&relative_bin_offsets)[ITEMS_PER_THREAD],
-        int (&ranks)[ITEMS_PER_THREAD],
-        Offset valid_items,
-        Int2Type<RADIX_SORT_SCATTER_DIRECT> scatter_algorithm)
-    {
-        // Compute scatter offsets
-        DecodeRelativeBinOffsets(twiddled_keys, relative_bin_offsets);
-
-        // Untwiddle keys before outputting
-        UnsignedBits keys[ITEMS_PER_THREAD];
-
-        #pragma unroll
-        for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
-        {
-            keys[KEY] = Traits<Key>::TwiddleOut(twiddled_keys[KEY]);
-        }
-
-        // Scatter to global
-        ScatterItems<FULL_TILE>(keys, ranks, relative_bin_offsets, d_keys_out, valid_items);
-    }
-
-
-    /**
-     * Scatter ranked keys through shared memory, then to global memory
-     */
-    template <bool FULL_TILE>
-    __device__ __forceinline__ void ScatterKeys(
-        UnsignedBits (&twiddled_keys)[ITEMS_PER_THREAD],
-        Offset (&relative_bin_offsets)[ITEMS_PER_THREAD],
-        int (&ranks)[ITEMS_PER_THREAD],
-        Offset valid_items,
-        Int2Type<RADIX_SORT_SCATTER_TWO_PHASE> scatter_algorithm)
-    {
-        // Exchange keys through shared memory
-        BlockExchangeKeys(temp_storage.exchange_keys).ScatterToStriped(twiddled_keys, ranks);
-
-        // Compute striped local ranks
-        int local_ranks[ITEMS_PER_THREAD];
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            local_ranks[ITEM] = threadIdx.x + (ITEM * BLOCK_THREADS);
-        }
-
-        // Scatter directly
-        ScatterKeys<FULL_TILE>(
-            twiddled_keys,
-            relative_bin_offsets,
-            local_ranks,
-            valid_items,
-            Int2Type<RADIX_SORT_SCATTER_DIRECT>());
-    }
-
-
-    /**
-     * Scatter ranked values directly to global memory
-     */
-    template <bool FULL_TILE>
-    __device__ __forceinline__ void ScatterValues(
-        Value (&values)[ITEMS_PER_THREAD],
-        Offset (&relative_bin_offsets)[ITEMS_PER_THREAD],
-        int (&ranks)[ITEMS_PER_THREAD],
-        Offset valid_items,
-        Int2Type<RADIX_SORT_SCATTER_DIRECT> scatter_algorithm)
-    {
-        // Scatter to global
-        ScatterItems<FULL_TILE>(values, ranks, relative_bin_offsets, d_values_out, valid_items);
-    }
-
-
-    /**
-     * Scatter ranked values through shared memory, then to global memory
-     */
-    template <bool FULL_TILE>
-    __device__ __forceinline__ void ScatterValues(
-        Value (&values)[ITEMS_PER_THREAD],
-        Offset (&relative_bin_offsets)[ITEMS_PER_THREAD],
-        int (&ranks)[ITEMS_PER_THREAD],
-        Offset valid_items,
-        Int2Type<RADIX_SORT_SCATTER_TWO_PHASE> scatter_algorithm)
-    {
-        __syncthreads();
-
-        // Exchange values through shared memory
-        BlockExchangeValues(temp_storage.exchange_values).ScatterToStriped(values, ranks);
-
-        // Compute striped local ranks
-        int local_ranks[ITEMS_PER_THREAD];
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            local_ranks[ITEM] = threadIdx.x + (ITEM * BLOCK_THREADS);
-        }
-
-        // Scatter directly
-        ScatterValues<FULL_TILE>(
-            values,
-            relative_bin_offsets,
-            local_ranks,
-            valid_items,
-            Int2Type<RADIX_SORT_SCATTER_DIRECT>());
-    }
-
-
-    /**
-     * Load a tile of items (specialized for full tile)
-     */
-    template <typename BlockLoadT, typename T, typename InputIterator>
-    __device__ __forceinline__ void LoadItems(
-        BlockLoadT &block_loader,
-        T (&items)[ITEMS_PER_THREAD],
-        InputIterator d_in,
-        Offset valid_items,
-        Int2Type<true> is_full_tile)
-    {
-        block_loader.Load(d_in, items);
-    }
-
-
-    /**
-     * Load a tile of items (specialized for partial tile)
-     */
-    template <typename BlockLoadT, typename T, typename InputIterator>
-    __device__ __forceinline__ void LoadItems(
-        BlockLoadT &block_loader,
-        T (&items)[ITEMS_PER_THREAD],
-        InputIterator d_in,
-        Offset valid_items,
-        Int2Type<false> is_full_tile)
-    {
-        block_loader.Load(d_in, items, valid_items);
-    }
-
-
-    /**
-     * Truck along associated values
-     */
-    template <bool FULL_TILE, typename _Value>
-    __device__ __forceinline__ void GatherScatterValues(
-        _Value (&values)[ITEMS_PER_THREAD],
-        Offset (&relative_bin_offsets)[ITEMS_PER_THREAD],
-        int (&ranks)[ITEMS_PER_THREAD],
-        Offset block_offset,
-        Offset valid_items)
-    {
-        __syncthreads();
-
-        BlockLoadValues loader(temp_storage.load_values);
-        LoadItems(
-            loader,
-            values,
-            d_values_in + block_offset,
-            valid_items,
-            Int2Type<FULL_TILE>());
-
-        ScatterValues<FULL_TILE>(
-            values,
-            relative_bin_offsets,
-            ranks,
-            valid_items,
-            Int2Type<SCATTER_ALGORITHM>());
-    }
-
-
-    /**
-     * Truck along associated values (specialized for key-only sorting)
-     */
-    template <bool FULL_TILE>
-    __device__
 __forceinline__ void GatherScatterValues(
-        NullType (&values)[ITEMS_PER_THREAD],
-        Offset (&relative_bin_offsets)[ITEMS_PER_THREAD],
-        int (&ranks)[ITEMS_PER_THREAD],
-        Offset block_offset,
-        Offset valid_items)
-    {}
-
-
-    /**
-     * Process tile
-     */
-    template <bool FULL_TILE>
-    __device__ __forceinline__ void ProcessTile(
-        Offset block_offset,
-        const Offset &valid_items = TILE_ITEMS)
-    {
-        // Per-thread tile data
-        UnsignedBits keys[ITEMS_PER_THREAD]; // Keys
-        UnsignedBits twiddled_keys[ITEMS_PER_THREAD]; // Twiddled keys
-        int ranks[ITEMS_PER_THREAD]; // For each key, the local rank within the CTA
-        Offset relative_bin_offsets[ITEMS_PER_THREAD]; // For each key, the global scatter base offset of the corresponding digit
-
-        // Assign max-key to all keys
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            keys[ITEM] = (DESCENDING) ? MIN_KEY : MAX_KEY;
-        }
-
-        // Load tile of keys
-        BlockLoadKeys loader(temp_storage.load_keys);
-        LoadItems(
-            loader,
-            keys,
-            d_keys_in + block_offset,
-            valid_items,
-            Int2Type<FULL_TILE>());
-
-        __syncthreads();
-
-        // Twiddle key bits if necessary
-        #pragma unroll
-        for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
-        {
-            twiddled_keys[KEY] = Traits<Key>::TwiddleIn(keys[KEY]);
-        }
-
-        // Rank the twiddled keys
-        int inclusive_digit_prefix;
-        BlockRadixRank(temp_storage.ranking).RankKeys(
-            twiddled_keys,
-            ranks,
-            current_bit,
-            num_bits,
-            inclusive_digit_prefix);
-
-        // Update global scatter base offsets for each digit
-        if ((BLOCK_THREADS == RADIX_DIGITS) || (threadIdx.x < RADIX_DIGITS))
-        {
-            int exclusive_digit_prefix;
-
-            // Get exclusive digit prefix from inclusive prefix
-            if (DESCENDING)
-            {
-                // Get the prefix from the next thread (higher bins come first)
-#if CUB_PTX_ARCH >= 300
-                exclusive_digit_prefix = ShuffleDown(inclusive_digit_prefix, 1);
-                if (threadIdx.x == RADIX_DIGITS - 1)
-                    exclusive_digit_prefix = 0;
-#else
-                volatile int* exchange = reinterpret_cast<volatile int*>(temp_storage.relative_bin_offsets);
-                exchange[threadIdx.x + 1] = 0;
-                exchange[threadIdx.x] = inclusive_digit_prefix;
-                exclusive_digit_prefix = exchange[threadIdx.x + 1];
-#endif
-            }
-            else
-            {
-                // Get the prefix from the previous thread (lower bins come first)
-#if CUB_PTX_ARCH >= 300
-                exclusive_digit_prefix = ShuffleUp(inclusive_digit_prefix, 1);
-                if (threadIdx.x == 0)
-                    exclusive_digit_prefix = 0;
-#else
-                volatile int* exchange = reinterpret_cast<volatile int*>(temp_storage.relative_bin_offsets);
-                exchange[threadIdx.x] = 0;
-                exchange[threadIdx.x + 1] = inclusive_digit_prefix;
-                exclusive_digit_prefix = exchange[threadIdx.x];
-#endif
-            }
-
-            bin_offset -= exclusive_digit_prefix;
-            temp_storage.relative_bin_offsets[threadIdx.x] = bin_offset;
-            bin_offset += inclusive_digit_prefix;
-        }
-
-        __syncthreads();
-
-        // Scatter keys
-        ScatterKeys<FULL_TILE>(twiddled_keys, relative_bin_offsets, ranks, valid_items, Int2Type<SCATTER_ALGORITHM>());
-
-        // Gather/scatter values
-        Value values[ITEMS_PER_THREAD];
-        GatherScatterValues<FULL_TILE>(values, relative_bin_offsets, ranks, block_offset, valid_items);
-    }
-
-
-    /**
-     * Copy tiles within the range of input
-     */
-    template <
-        typename InputIterator,
-        typename T>
-    __device__ __forceinline__ void Copy(
-        InputIterator d_in,
-        T *d_out,
-        Offset block_offset,
-        Offset block_end)
-    {
-        // Simply copy the input
-        while (block_offset + TILE_ITEMS <= block_end)
-        {
-            T items[ITEMS_PER_THREAD];
-
-            LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_in + block_offset, items);
-            __syncthreads();
-            StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_out + block_offset, items);
-
-            block_offset += TILE_ITEMS;
-        }
-
-        // Clean up last partial tile with guarded-I/O
-        if (block_offset < block_end)
-        {
-            Offset valid_items = block_end - block_offset;
-
-            T items[ITEMS_PER_THREAD];
-
-            LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_in + block_offset, items, valid_items);
-            __syncthreads();
-            StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_out + block_offset, items, valid_items);
-        }
-    }
-
-
-    /**
-     * Copy tiles within the range of input (specialized for NullType)
-     */
-    template <typename InputIterator>
-    __device__ __forceinline__ void Copy(
-        InputIterator d_in,
-        NullType *d_out,
-        Offset block_offset,
-        Offset block_end)
-    {}
-
-
-    //---------------------------------------------------------------------
-    // Interface
-    //---------------------------------------------------------------------
-
-    /**
-     * Constructor
-     */
-    __device__ __forceinline__ BlockRangeRadixSortDownsweep(
-        TempStorage &temp_storage,
-        Offset bin_offset,
-        Key *d_keys_in,
-        Key *d_keys_out,
-        Value *d_values_in,
-        Value *d_values_out,
-        int current_bit,
-        int num_bits)
-    :
-        temp_storage(temp_storage.Alias()),
-        bin_offset(bin_offset),
-        d_keys_in(reinterpret_cast<UnsignedBits*>(d_keys_in)),
-        d_keys_out(reinterpret_cast<UnsignedBits*>(d_keys_out)),
-        d_values_in(d_values_in),
-        d_values_out(d_values_out),
-        current_bit(current_bit),
-        num_bits(num_bits),
-        short_circuit(false)
-    {}
-
-
-    /**
-     * Constructor
-     */
-    __device__ __forceinline__ BlockRangeRadixSortDownsweep(
-        TempStorage &temp_storage,
-        Offset num_items,
-        Offset *d_spine,
-        Key *d_keys_in,
-        Key *d_keys_out,
-        Value *d_values_in,
-        Value *d_values_out,
-        int current_bit,
-        int num_bits)
-    :
-        temp_storage(temp_storage.Alias()),
-        d_keys_in(reinterpret_cast<UnsignedBits*>(d_keys_in)),
-        d_keys_out(reinterpret_cast<UnsignedBits*>(d_keys_out)),
-        d_values_in(d_values_in),
-        d_values_out(d_values_out),
-        current_bit(current_bit),
-        num_bits(num_bits)
-    {
-        // Load digit bin offsets (each of the first RADIX_DIGITS threads will load an offset for that digit)
-        if (threadIdx.x < RADIX_DIGITS)
-        {
-            int bin_idx = (DESCENDING) ?
-                RADIX_DIGITS - threadIdx.x - 1 :
-                threadIdx.x;
-
-            // Short-circuit if the first block's histogram has bin counts of only zeros or only the problem size
-            Offset first_block_bin_offset = d_spine[gridDim.x * bin_idx];
-            int predicate = ((first_block_bin_offset == 0) || (first_block_bin_offset == num_items));
-            this->temp_storage.short_circuit = WarpAll(predicate);
-
-            // Load my block's bin offset for my bin
-            bin_offset = d_spine[(gridDim.x * bin_idx) + blockIdx.x];
-        }
-
-        __syncthreads();
-
-        short_circuit = this->temp_storage.short_circuit;
-    }
-
-
-    /**
-     * Distribute keys from a segment of input tiles.
-     */
-    __device__ __forceinline__ void ProcessRegion(
-        Offset block_offset,
-        const Offset &block_end)
-    {
-        if (short_circuit)
-        {
-            // Copy keys
-            Copy(d_keys_in, d_keys_out, block_offset, block_end);
-
-            // Copy values
-            Copy(d_values_in, d_values_out, block_offset, block_end);
-        }
-        else
-        {
-            // Process full tiles of tile_items
-            while (block_offset + TILE_ITEMS <= block_end)
-            {
-                ProcessTile<true>(block_offset);
-                block_offset += TILE_ITEMS;
-
-                __syncthreads();
-            }
-
-            // Clean up last partial tile with guarded-I/O
-            if (block_offset < block_end)
-            {
-                ProcessTile<false>(block_offset, block_end - block_offset);
-            }
-        }
-    }
-};
-
-
-
-} // CUB namespace
-CUB_NS_POSTFIX // Optional outer namespace(s)
-
diff --git a/SRC/cub/block_range/block_range_reduce.cuh b/SRC/cub/block_range/block_range_reduce.cuh
deleted file mode 100644
index 9e97f87b..00000000
--- a/SRC/cub/block_range/block_range_reduce.cuh
+++ /dev/null
@@ -1,430 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill. All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::BlockRangeReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction across a range of tiles.
- */
-
-#pragma once
-
-#include <iterator>
-
-#include "../block/block_load.cuh"
-#include "../block/block_reduce.cuh"
-#include "../grid/grid_mapping.cuh"
-#include "../grid/grid_queue.cuh"
-#include "../grid/grid_even_share.cuh"
-#include "../util_type.cuh"
-#include "../iterator/cache_modified_input_iterator.cuh"
-#include "../util_namespace.cuh"
-
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Tuning policy types
- ******************************************************************************/
-
-/**
- * Parameterizable tuning policy type for BlockRangeReduce
- */
-template <
-    int _BLOCK_THREADS, ///< Threads per thread block
-    int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
-    int _VECTOR_LOAD_LENGTH, ///< Number of items per vectorized load
-    BlockReduceAlgorithm _BLOCK_ALGORITHM, ///< Cooperative block-wide reduction algorithm to use
-    CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements
-    GridMappingStrategy _GRID_MAPPING> ///< How to map tiles of input onto thread blocks
-struct BlockRangeReducePolicy
-{
-    enum
-    {
-        BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block
-        ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
-        VECTOR_LOAD_LENGTH = _VECTOR_LOAD_LENGTH, ///< Number of items per vectorized load
-    };
-
-    static const BlockReduceAlgorithm BLOCK_ALGORITHM = _BLOCK_ALGORITHM; ///< Cooperative block-wide reduction algorithm to use
-    static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements
-    static const GridMappingStrategy GRID_MAPPING = _GRID_MAPPING; ///< How to map tiles of input onto thread blocks
-};
-
-
-
-/******************************************************************************
- * Thread block abstractions
- ******************************************************************************/
-
-/**
- * \brief BlockRangeReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction across a range of tiles.
- *
- * Each thread reduces only the values it loads. If \p FIRST_TILE, this
- * partial reduction is stored into \p thread_aggregate. Otherwise it is
- * accumulated into \p thread_aggregate.
- */
-template <
-    typename BlockRangeReducePolicy, ///< Parameterized BlockRangeReducePolicy tuning policy type
-    typename InputIterator, ///< Random-access iterator type for input
-    typename Offset, ///< Signed integer type for global offsets
-    typename ReductionOp> ///< Binary reduction operator type having member T operator()(const T &a, const T &b)
-struct BlockRangeReduce
-{
-
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    // The value type of the input iterator
-    typedef typename std::iterator_traits<InputIterator>::value_type T;
-
-    // Vector type of T for data movement
-    typedef typename CubVector<T, BlockRangeReducePolicy::VECTOR_LOAD_LENGTH>::Type VectorT;
-
-    // Input iterator wrapper type
-    typedef typename If<IsPointer<InputIterator>::VALUE,
        CacheModifiedInputIterator<BlockRangeReducePolicy::LOAD_MODIFIER, T, Offset>, // Wrap the native input pointer with CacheModifiedInputIterator
-        InputIterator>::Type // Directly use the supplied input iterator type
-        WrappedInputIterator;
-
-    // Constants
-    enum
-    {
-        BLOCK_THREADS = BlockRangeReducePolicy::BLOCK_THREADS,
-        ITEMS_PER_THREAD = BlockRangeReducePolicy::ITEMS_PER_THREAD,
-        VECTOR_LOAD_LENGTH = CUB_MIN(ITEMS_PER_THREAD, BlockRangeReducePolicy::VECTOR_LOAD_LENGTH),
-        TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
-
-        // Can vectorize according to the policy if the input iterator is a native pointer to a primitive type
-        CAN_VECTORIZE = (VECTOR_LOAD_LENGTH > 1) &&
            (IsPointer<InputIterator>::VALUE) &&
            Traits<T>::PRIMITIVE,
-
-    };
-
-    static const CacheLoadModifier LOAD_MODIFIER = BlockRangeReducePolicy::LOAD_MODIFIER;
-    static const BlockReduceAlgorithm BLOCK_ALGORITHM = BlockRangeReducePolicy::BLOCK_ALGORITHM;
-
-    // Parameterized BlockReduce primitive
-    typedef BlockReduce<T, BLOCK_THREADS, BLOCK_ALGORITHM> BlockReduceT;
-
-    /// Shared memory type required by this thread block
-    typedef typename BlockReduceT::TempStorage _TempStorage;
-
-    /// Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Per-thread fields
-    //---------------------------------------------------------------------
-
-    T thread_aggregate; ///< Each thread's partial reduction
-    _TempStorage& temp_storage; ///< Reference to temp_storage
-    InputIterator d_in; ///< Input data to reduce
-    WrappedInputIterator d_wrapped_in; ///< Wrapped input data to reduce
-    ReductionOp reduction_op; ///< Binary reduction operator
-    int first_tile_size; ///< Size of first tile consumed
-    bool is_aligned; ///< Whether or not input is vector-aligned
-
-
-    //---------------------------------------------------------------------
-    // Interface
-    //---------------------------------------------------------------------
-
-
-    // Whether or not the input is aligned with the vector type (specialized for types we can vectorize)
-    template <typename Iterator>
-    static __device__ __forceinline__ bool IsAligned(
-        Iterator d_in,
-        Int2Type<true> can_vectorize)
-    {
-        return (size_t(d_in) & (sizeof(VectorT) - 1)) == 0;
-    }
-
-    // Whether or not the input is aligned with the vector type (specialized for types we cannot vectorize)
-    template <typename Iterator>
-    static __device__ __forceinline__ bool IsAligned(
-        Iterator d_in,
-        Int2Type<false> can_vectorize)
-    {
-        return false;
-    }
-
-
-    /**
-     * Constructor
-     */
-    __device__ __forceinline__ BlockRangeReduce(
-        TempStorage& temp_storage, ///< Reference to temp_storage
-        InputIterator d_in, ///< Input data to reduce
-        ReductionOp reduction_op) ///< Binary reduction operator
-    :
-        temp_storage(temp_storage.Alias()),
-        d_in(d_in),
-        d_wrapped_in(d_in),
-        reduction_op(reduction_op),
-        first_tile_size(0),
-        is_aligned(IsAligned(d_in, Int2Type<CAN_VECTORIZE>()))
-    {}
-
-
-    /**
-     * Consume a full tile of input (specialized for cases where we cannot vectorize)
-     */
-    template <typename _Offset>
-    __device__ __forceinline__ T ConsumeFullTile(
-        _Offset block_offset, ///< The offset of the tile to consume
-        Int2Type<false> can_vectorize) ///< Whether or not we can vectorize loads
-    {
-        T items[ITEMS_PER_THREAD];
-
-        // Load items in striped fashion
-        LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_wrapped_in + block_offset, items);
-
-        // Reduce items within each thread stripe
-        return ThreadReduce(items, reduction_op);
-    }
-
-
-    /**
-     * Consume a full tile of input (specialized for cases where we can vectorize)
-     */
-    template <typename _Offset>
-    __device__ __forceinline__ T ConsumeFullTile(
-        _Offset block_offset, ///< The offset of the tile to consume
-        Int2Type<true> can_vectorize) ///< Whether or not we can vectorize loads
-    {
-        if (!is_aligned)
-        {
-            // Not aligned
-            return ConsumeFullTile(block_offset, Int2Type<false>());
-        }
-        else
-        {
-            // Alias items as an array of VectorT and load it in striped fashion
-            enum { WORDS = ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH };
-
-            T items[ITEMS_PER_THREAD];
-
-            VectorT *vec_items = reinterpret_cast<VectorT*>(items);
-
-            // Vector input iterator wrapper type
-            CacheModifiedInputIterator<LOAD_MODIFIER, VectorT, Offset> d_vec_in(
-                reinterpret_cast<VectorT*>(d_in + block_offset + (threadIdx.x * VECTOR_LOAD_LENGTH)));
-
-            #pragma unroll
-            for (int i = 0; i < WORDS; ++i)
-                vec_items[i] = d_vec_in[BLOCK_THREADS * i];
-
-            // Reduce items within each thread stripe
-            return ThreadReduce(items, reduction_op);
-        }
-    }
-
-
-
-    /**
-     * Process a single tile of input
-     */
-    template <bool FULL_TILE>
-    __device__ __forceinline__ void ConsumeTile(
-        Offset block_offset, ///< The offset of the tile to consume
-        int valid_items = TILE_ITEMS) ///< The number of valid items in the tile
-    {
-        if (FULL_TILE)
-        {
-            // Full tile
-            T partial = ConsumeFullTile(block_offset, Int2Type<CAN_VECTORIZE>());
-
-            // Update running thread aggregate
-            thread_aggregate = (first_tile_size) ?
-                reduction_op(thread_aggregate, partial) : // Update
-                partial; // Assign
-        }
-        else
-        {
-            // Partial tile
-            int thread_offset = threadIdx.x;
-
-            if (!first_tile_size && (thread_offset < valid_items))
-            {
-                // Assign thread_aggregate
-                thread_aggregate = d_wrapped_in[block_offset + thread_offset];
-                thread_offset += BLOCK_THREADS;
-            }
-
-            while (thread_offset < valid_items)
-            {
-                // Update thread aggregate
-                T item = d_wrapped_in[block_offset + thread_offset];
-                thread_aggregate = reduction_op(thread_aggregate, item);
-                thread_offset += BLOCK_THREADS;
-            }
-        }
-
-        // Set first tile size if necessary
-        if (!first_tile_size)
-            first_tile_size = valid_items;
-    }
-
-
-    //---------------------------------------------------------------------
-    // Consume a contiguous segment of tiles
-    //---------------------------------------------------------------------
-
-    /**
-     * \brief Reduce a contiguous segment of input tiles
-     */
-    __device__ __forceinline__ void ConsumeRange(
-        Offset block_offset, ///< [in] Threadblock begin offset (inclusive)
-        Offset block_end, ///< [in] Threadblock end offset (exclusive)
-        T &block_aggregate) ///< [out] Running total
-    {
-        // Consume subsequent full tiles of input
-        while (block_offset + TILE_ITEMS <= block_end)
-        {
-            ConsumeTile<true>(block_offset);
-            block_offset += TILE_ITEMS;
-        }
-
-        // Consume a partially-full tile
-        if (block_offset < block_end)
-        {
-            int valid_items = block_end - block_offset;
-            ConsumeTile<false>(block_offset, valid_items);
-        }
-
-        // Compute block-wide reduction
-        block_aggregate = (first_tile_size < TILE_ITEMS) ?
-            BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op, first_tile_size) :
-            BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op);
-    }
-
-
-    /**
-     * Reduce a contiguous segment of input tiles
-     */
-    __device__ __forceinline__ void ConsumeRange(
-        Offset num_items, ///< [in] Total number of global input items
-        GridEvenShare<Offset> &even_share, ///< [in] GridEvenShare descriptor
-        GridQueue<Offset> &queue, ///< [in,out] GridQueue descriptor
-        T &block_aggregate, ///< [out] Running total
-        Int2Type<GRID_MAPPING_EVEN_SHARE> is_even_share) ///< [in] Marker type indicating this is an even-share mapping
-    {
-        // Initialize even-share descriptor for this thread block
-        even_share.BlockInit();
-
-        // Consume input tiles
-        ConsumeRange(even_share.block_offset, even_share.block_end, block_aggregate);
-    }
-
-
-    //---------------------------------------------------------------------
-    // Dynamically consume tiles
-    //---------------------------------------------------------------------
-
-    /**
-     * Dequeue and reduce tiles of items as part of an inter-block scan
-     */
-    __device__ __forceinline__ void ConsumeRange(
-        int num_items, ///< Total number of input items
-        GridQueue<Offset> queue, ///< Queue descriptor for assigning tiles of work to thread blocks
-        T &block_aggregate) ///< [out] Running total
-    {
-        // Shared dequeue offset
-        __shared__ Offset dequeue_offset;
-
-        // We give each thread block at least one tile of input.
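A worked illustration of the hybrid schedule below, assuming gridDim.x = 4 and TILE_ITEMS = 1024 (the numbers are arbitrary): blocks 0..3 statically take offsets 0, 1024, 2048, and 3072; even_share_base = 4 * 1024 = 4096; each subsequent queue.Drain(1024) call then hands out 4096, 5120, 6144, ... until an offset reaches num_items, and the final partial tile is consumed with guarded I/O.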
-        Offset block_offset = blockIdx.x * TILE_ITEMS;
-        Offset even_share_base = gridDim.x * TILE_ITEMS;
-
-        if (block_offset + TILE_ITEMS <= num_items)
-        {
-            // Consume full tile of input
-            ConsumeTile<true>(block_offset);
-
-            // Dequeue more tiles
-            while (true)
-            {
-                // Dequeue a tile of items
-                if (threadIdx.x == 0)
-                    dequeue_offset = queue.Drain(TILE_ITEMS) + even_share_base;
-
-                __syncthreads();
-
-                // Grab tile offset and check if we're done with full tiles
-                block_offset = dequeue_offset;
-
-                __syncthreads();
-
-                if (block_offset + TILE_ITEMS > num_items)
-                    break;
-
-                // Consume a full tile
-                ConsumeTile<true>(block_offset);
-            }
-        }
-
-        if (block_offset < num_items)
-        {
-            int valid_items = num_items - block_offset;
-            ConsumeTile<false>(block_offset, valid_items);
-        }
-
-        // Compute block-wide reduction
-        block_aggregate = (first_tile_size < TILE_ITEMS) ?
-            BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op, first_tile_size) :
-            BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op);
-    }
-
-
-    /**
-     * Dequeue and reduce tiles of items as part of an inter-block scan
-     */
-    __device__ __forceinline__ void ConsumeRange(
-        Offset num_items, ///< [in] Total number of global input items
-        GridEvenShare<Offset> &even_share, ///< [in] GridEvenShare descriptor
-        GridQueue<Offset> &queue, ///< [in,out] GridQueue descriptor
-        T &block_aggregate, ///< [out] Running total
-        Int2Type<GRID_MAPPING_DYNAMIC> is_dynamic) ///< [in] Marker type indicating this is a dynamic mapping
-    {
-        ConsumeRange(num_items, queue, block_aggregate);
-    }
-
-};
-
-
-} // CUB namespace
-CUB_NS_POSTFIX // Optional outer namespace(s)
-
diff --git a/SRC/cub/block_range/block_range_reduce_by_key.cuh b/SRC/cub/block_range/block_range_reduce_by_key.cuh
deleted file mode 100644
index f56baaa0..00000000
--- a/SRC/cub/block_range/block_range_reduce_by_key.cuh
+++ /dev/null
@@ -1,1034 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill. All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::BlockRangeReduceByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key.
- */
-
-#pragma once
-
-#include <iterator>
-
-#include "block_scan_prefix_operators.cuh"
-#include "../block/block_load.cuh"
-#include "../block/block_store.cuh"
-#include "../block/block_scan.cuh"
-#include "../block/block_exchange.cuh"
-#include "../block/block_discontinuity.cuh"
-#include "../grid/grid_queue.cuh"
-#include "../iterator/cache_modified_input_iterator.cuh"
-#include "../iterator/constant_input_iterator.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Tuning policy types
- ******************************************************************************/
-
-/**
- * Parameterizable tuning policy type for BlockRangeReduceByKey
- */
-template <
-    int _BLOCK_THREADS, ///< Threads per thread block
-    int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
-    BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use
-    CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements
-    bool _TWO_PHASE_SCATTER, ///< Whether or not to coalesce output values in shared memory before scattering them to global
-    BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use
-struct BlockRangeReduceByKeyPolicy
-{
-    enum
-    {
-        BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block
-        ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input)
-        TWO_PHASE_SCATTER = _TWO_PHASE_SCATTER, ///< Whether or not to coalesce output values in shared memory before scattering them to global
-    };
-
-    static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use
-    static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements
-    static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use
-};
-
-
-/******************************************************************************
- * Tile status interface types
- ******************************************************************************/
-
-/**
- * Tile status interface for reduction by key.
- *
- */
-template <
-    typename Value,
-    typename Offset,
-    bool SINGLE_WORD = (Traits<Value>::PRIMITIVE) && (sizeof(Value) + sizeof(Offset) < 16)>
-struct ReduceByKeyScanTileState;
-
-
-/**
- * Tile status interface for reduction by key, specialized for scan status and value types that
- * cannot be combined into one machine word.
- */
-template <
-    typename Value,
-    typename Offset>
-struct ReduceByKeyScanTileState<Value, Offset, false> :
    ScanTileState<ItemOffsetPair<Value, Offset> >
-{
-    typedef ScanTileState<ItemOffsetPair<Value, Offset> > SuperClass;
-
-    /// Constructor
-    __host__ __device__ __forceinline__
-    ReduceByKeyScanTileState() : SuperClass() {}
-};
-
-
-/**
- * Tile status interface for reduction by key, specialized for scan status and value types that
- * can be combined into one machine word that can be read/written coherently in a single access.
- */ -template < - typename Value, - typename Offset> -struct ReduceByKeyScanTileState -{ - typedef ItemOffsetPair ItemOffsetPair; - - // Constants - enum - { - PAIR_SIZE = sizeof(Value) + sizeof(Offset), - TXN_WORD_SIZE = 1 << Log2::VALUE, - STATUS_WORD_SIZE = TXN_WORD_SIZE - PAIR_SIZE, - - TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS, - }; - - // Status word type - typedef typename If<(STATUS_WORD_SIZE == 8), - long long, - typename If<(STATUS_WORD_SIZE == 4), - int, - typename If<(STATUS_WORD_SIZE == 2), - short, - char>::Type>::Type>::Type StatusWord; - - // Status word type - typedef typename If<(TXN_WORD_SIZE == 16), - longlong2, - typename If<(TXN_WORD_SIZE == 8), - long long, - int>::Type>::Type TxnWord; - - // Device word type (for when sizeof(Value) == sizeof(Offset)) - struct TileDescriptorBigStatus - { - Offset offset; - Value value; - StatusWord status; - }; - - // Device word type (for when sizeof(Value) != sizeof(Offset)) - struct TileDescriptorLittleStatus - { - Value value; - StatusWord status; - Offset offset; - }; - - // Device word type - typedef typename If< - (sizeof(Value) == sizeof(Offset)), - TileDescriptorBigStatus, - TileDescriptorLittleStatus>::Type - TileDescriptor; - - - // Device storage - TileDescriptor *d_tile_status; - - - /// Constructor - __host__ __device__ __forceinline__ - ReduceByKeyScanTileState() - : - d_tile_status(NULL) - {} - - - /// Initializer - __host__ __device__ __forceinline__ - cudaError_t Init( - int num_tiles, ///< [in] Number of tiles - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t temp_storage_bytes) ///< [in] Size in bytes of \t d_temp_storage allocation - { - d_tile_status = reinterpret_cast(d_temp_storage); - return cudaSuccess; - } - - - /** - * Compute device memory needed for tile status - */ - __host__ __device__ __forceinline__ - static cudaError_t AllocationSize( - int num_tiles, ///< [in] Number of tiles - size_t &temp_storage_bytes) ///< [out] Size in bytes of \t d_temp_storage allocation - { - temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TileDescriptor); // bytes needed for tile status descriptors - return cudaSuccess; - } - - - /** - * Initialize (from device) - */ - __device__ __forceinline__ void InitializeStatus(int num_tiles) - { - int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; - if (tile_idx < num_tiles) - { - // Not-yet-set - d_tile_status[TILE_STATUS_PADDING + tile_idx].status = StatusWord(SCAN_TILE_INVALID); - } - - if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING)) - { - // Padding - d_tile_status[threadIdx.x].status = StatusWord(SCAN_TILE_OOB); - } - } - - - /** - * Update the specified tile's inclusive value and corresponding status - */ - __device__ __forceinline__ void SetInclusive(int tile_idx, ItemOffsetPair tile_inclusive) - { - TileDescriptor tile_descriptor; - tile_descriptor.status = SCAN_TILE_INCLUSIVE; - tile_descriptor.value = tile_inclusive.value; - tile_descriptor.offset = tile_inclusive.offset; - - TxnWord alias; - *reinterpret_cast(&alias) = tile_descriptor; - ThreadStore(reinterpret_cast(d_tile_status + TILE_STATUS_PADDING + tile_idx), alias); - } - - - /** - * Update the specified tile's partial value and corresponding status - */ - __device__ __forceinline__ void SetPartial(int tile_idx, ItemOffsetPair tile_partial) - { - TileDescriptor tile_descriptor; - tile_descriptor.status = SCAN_TILE_PARTIAL; - tile_descriptor.value = 
tile_partial.value; - tile_descriptor.offset = tile_partial.offset; - - TxnWord alias; - *reinterpret_cast(&alias) = tile_descriptor; - ThreadStore(reinterpret_cast(d_tile_status + TILE_STATUS_PADDING + tile_idx), alias); - } - - /** - * Wait for the corresponding tile to become non-invalid - */ - __device__ __forceinline__ void WaitForValid( - int tile_idx, - StatusWord &status, - ItemOffsetPair &value) - { - // Use warp-any to determine when all threads have valid status - TxnWord alias = ThreadLoad(reinterpret_cast(d_tile_status + TILE_STATUS_PADDING + tile_idx)); - TileDescriptor tile_descriptor = reinterpret_cast(alias); - - while ((tile_descriptor.status == SCAN_TILE_INVALID)) - { - alias = ThreadLoad(reinterpret_cast(d_tile_status + TILE_STATUS_PADDING + tile_idx)); - tile_descriptor = reinterpret_cast(alias); - } - - status = tile_descriptor.status; - value.value = tile_descriptor.value; - value.offset = tile_descriptor.offset; - } - -}; - - -/****************************************************************************** - * Thread block abstractions - ******************************************************************************/ - -/** - * \brief BlockRangeReduceByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key across a range of tiles - */ -template < - typename BlockRangeReduceByKeyPolicy, ///< Parameterized BlockRangeReduceByKeyPolicy tuning policy type - typename KeyInputIterator, ///< Random-access input iterator type for keys - typename KeyOutputIterator, ///< Random-access output iterator type for keys - typename ValueInputIterator, ///< Random-access input iterator type for values - typename ValueOutputIterator, ///< Random-access output iterator type for values - typename EqualityOp, ///< Key equality operator type - typename ReductionOp, ///< Value reduction operator type - typename Offset> ///< Signed integer type for global offsets -struct BlockRangeReduceByKey -{ - //--------------------------------------------------------------------- - // Types and constants - //--------------------------------------------------------------------- - - // Data type of key iterator - typedef typename std::iterator_traits::value_type Key; - - // Data type of value iterator - typedef typename std::iterator_traits::value_type Value; - - // Tile status descriptor interface type - typedef ReduceByKeyScanTileState ScanTileState; - - // Constants - enum - { - BLOCK_THREADS = BlockRangeReduceByKeyPolicy::BLOCK_THREADS, - WARPS = BLOCK_THREADS / CUB_PTX_WARP_THREADS, - ITEMS_PER_THREAD = BlockRangeReduceByKeyPolicy::ITEMS_PER_THREAD, - TWO_PHASE_SCATTER = (BlockRangeReduceByKeyPolicy::TWO_PHASE_SCATTER) && (ITEMS_PER_THREAD > 1), - TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - - // Whether or not the scan operation has a zero-valued identity value (true if we're performing addition on a primitive type) - HAS_IDENTITY_ZERO = (Equals::VALUE) && (Traits::PRIMITIVE), - - // Whether or not to sync after loading data - SYNC_AFTER_LOAD = (BlockRangeReduceByKeyPolicy::LOAD_ALGORITHM != BLOCK_LOAD_DIRECT), - - // Whether or not this is run-length-encoding with a constant iterator as values - IS_RUN_LENGTH_ENCODE = (Equals >::VALUE) || (Equals >::VALUE) || (Equals >::VALUE), - - }; - - // Cache-modified input iterator wrapper type for keys - typedef typename If::VALUE, - CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValueInputIterator - KeyInputIterator>::Type // Directly use the supplied input iterator 
type
-        WrappedKeyInputIterator;
-
-    // Cache-modified input iterator wrapper type for values
-    typedef typename If::VALUE,
-            CacheModifiedInputIterator,     // Wrap the native input pointer with CacheModifiedValueInputIterator
-            ValueInputIterator>::Type       // Directly use the supplied input iterator type
-        WrappedValueInputIterator;
-
-    // Value-offset tuple type for scanning (maps accumulated values to segment index)
-    typedef ItemOffsetPair ValueOffsetPair;
-
-    // Reduce-value-by-segment scan operator
-    struct ReduceByKeyOp
-    {
-        ReductionOp op;     ///< Wrapped reduction operator
-
-        /// Constructor
-        __device__ __forceinline__ ReduceByKeyOp(ReductionOp op) : op(op) {}
-
-        /// Scan operator (specialized for sum on primitive types)
-        __device__ __forceinline__ ValueOffsetPair operator()(
-            const ValueOffsetPair &first,       ///< First partial reduction
-            const ValueOffsetPair &second,      ///< Second partial reduction
-            Int2Type has_identity_zero)         ///< Whether the operation has a zero-valued identity
-        {
-            Value select = (second.offset) ? 0 : first.value;
-
-            ValueOffsetPair retval;
-            retval.offset = first.offset + second.offset;
-            retval.value = op(select, second.value);
-            return retval;
-        }
-
-        /// Scan operator (specialized for reductions without zero-valued identity)
-        __device__ __forceinline__ ValueOffsetPair operator()(
-            const ValueOffsetPair &first,       ///< First partial reduction
-            const ValueOffsetPair &second,      ///< Second partial reduction
-            Int2Type has_identity_zero)         ///< Whether the operation has a zero-valued identity
-        {
-#if (__CUDA_ARCH__ > 130)
-            // This expression uses fewer registers and is faster when compiled with nvvm
-            ValueOffsetPair retval;
-            retval.offset = first.offset + second.offset;
-            if (second.offset)
-            {
-                retval.value = second.value;
-                return retval;
-            }
-            else
-            {
-                retval.value = op(first.value, second.value);
-                return retval;
-            }
-#else
-            // This expression uses fewer registers and is faster when compiled with Open64
-            ValueOffsetPair retval;
-            retval.offset = first.offset + second.offset;
-            retval.value = (second.offset) ?
-                second.value :                  // The second partial reduction spans a segment reset, so its value aggregate becomes the running aggregate
-                op(first.value, second.value);  // The second partial reduction does not span a reset, so accumulate both into the running aggregate
-            return retval;
-#endif
-        }
-
-        /// Scan operator
-        __device__ __forceinline__ ValueOffsetPair operator()(
-            const ValueOffsetPair &first,       ///< First partial reduction
-            const ValueOffsetPair &second)      ///< Second partial reduction
-        {
-            return (*this)(first, second, Int2Type());
-        }
-    };
-
-    // Parameterized BlockLoad type for keys
-    typedef BlockLoad<
-            WrappedKeyInputIterator,
-            BlockRangeReduceByKeyPolicy::BLOCK_THREADS,
-            BlockRangeReduceByKeyPolicy::ITEMS_PER_THREAD,
-            BlockRangeReduceByKeyPolicy::LOAD_ALGORITHM>
-        BlockLoadKeys;
-
-    // Parameterized BlockLoad type for values
-    typedef BlockLoad<
-            WrappedValueInputIterator,
-            BlockRangeReduceByKeyPolicy::BLOCK_THREADS,
-            BlockRangeReduceByKeyPolicy::ITEMS_PER_THREAD,
-            (IS_RUN_LENGTH_ENCODE) ?
- BLOCK_LOAD_DIRECT : - (BlockLoadAlgorithm) BlockRangeReduceByKeyPolicy::LOAD_ALGORITHM> - BlockLoadValues; - - // Parameterized BlockExchange type for locally compacting items as part of a two-phase scatter - typedef BlockExchange< - Key, - BLOCK_THREADS, - ITEMS_PER_THREAD> - BlockExchangeKeys; - - // Parameterized BlockExchange type for locally compacting items as part of a two-phase scatter - typedef BlockExchange< - Value, - BLOCK_THREADS, - ITEMS_PER_THREAD> - BlockExchangeValues; - - // Parameterized BlockDiscontinuity type for keys - typedef BlockDiscontinuity BlockDiscontinuityKeys; - - // Parameterized BlockScan type - typedef BlockScan< - ValueOffsetPair, - BlockRangeReduceByKeyPolicy::BLOCK_THREADS, - BlockRangeReduceByKeyPolicy::SCAN_ALGORITHM> - BlockScanAllocations; - - // Callback type for obtaining tile prefix during block scan - typedef BlockScanLookbackPrefixOp< - ValueOffsetPair, - ReduceByKeyOp, - ScanTileState> - LookbackPrefixCallbackOp; - - // Shared memory type for this threadblock - struct _TempStorage - { - - union - { - struct - { - typename BlockScanAllocations::TempStorage scan; // Smem needed for tile scanning - typename LookbackPrefixCallbackOp::TempStorage prefix; // Smem needed for cooperative prefix callback - typename BlockDiscontinuityKeys::TempStorage discontinuity; // Smem needed for discontinuity detection - typename BlockLoadKeys::TempStorage load_keys; // Smem needed for loading keys - - Offset tile_idx; // Shared tile index - Offset tile_num_flags_prefix; // Exclusive tile prefix - }; - - // Smem needed for loading values - typename BlockLoadValues::TempStorage load_values; - - // Smem needed for compacting values - typename BlockExchangeValues::TempStorage exchange_values; - - // Smem needed for compacting keys - typename BlockExchangeKeys::TempStorage exchange_keys; - }; - - }; - - // Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - //--------------------------------------------------------------------- - // Per-thread fields - //--------------------------------------------------------------------- - - _TempStorage &temp_storage; ///< Reference to temp_storage - - WrappedKeyInputIterator d_keys_in; ///< Input keys - KeyOutputIterator d_keys_out; ///< Output keys - - WrappedValueInputIterator d_values_in; ///< Input values - ValueOutputIterator d_values_out; ///< Output values - - InequalityWrapper inequality_op; ///< Key inequality operator - ReduceByKeyOp scan_op; ///< Reduce-value-by flag scan operator - Offset num_items; ///< Total number of input items - - - //--------------------------------------------------------------------- - // Constructor - //--------------------------------------------------------------------- - - // Constructor - __device__ __forceinline__ - BlockRangeReduceByKey( - TempStorage &temp_storage, ///< Reference to temp_storage - KeyInputIterator d_keys_in, ///< Input keys - KeyOutputIterator d_keys_out, ///< Output keys - ValueInputIterator d_values_in, ///< Input values - ValueOutputIterator d_values_out, ///< Output values - EqualityOp equality_op, ///< Key equality operator - ReductionOp reduction_op, ///< Value reduction operator - Offset num_items) ///< Total number of input items - : - temp_storage(temp_storage.Alias()), - d_keys_in(d_keys_in), - d_keys_out(d_keys_out), - d_values_in(d_values_in), - d_values_out(d_values_out), - inequality_op(equality_op), - scan_op(reduction_op), - num_items(num_items) - {} - - - 
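
The ReduceByKeyOp removed above is what lets reduce-by-key be expressed as a single prefix scan: every partial reduction carries a (value, segment-count) pair, and the combine restarts the value aggregate whenever the right-hand operand begins a new segment. The stand-alone sketch below illustrates only that pairing idea under a sum reduction; ValueCountPair and Combine are illustrative names invented here, not CUB types.

#include <cstdio>

struct ValueCountPair { int value; int count; };    // value aggregate + number of segment heads seen

// Combine two partial reductions. If the right-hand partial begins a new
// segment (count > 0), the running value aggregate restarts from it;
// otherwise the two value aggregates are summed. Counts always add.
__host__ __device__ ValueCountPair Combine(ValueCountPair a, ValueCountPair b)
{
    ValueCountPair r;
    r.count = a.count + b.count;
    r.value = (b.count > 0) ? b.value : a.value + b.value;
    return r;
}

int main()
{
    // keys: 0 0 1 1 1 2  ->  head flags: 1 0 1 0 0 1
    int values[6] = {1, 2, 3, 4, 5, 6};
    int flags[6]  = {1, 0, 1, 0, 0, 1};

    ValueCountPair running = {0, 0};
    for (int i = 0; i < 6; ++i)
    {
        ValueCountPair item = {values[i], flags[i]};
        running = Combine(running, item);
    }

    // running.count is the number of segments; running.value is the
    // value aggregate of the last (still open) segment.
    printf("segments=%d last_segment_sum=%d\n", running.count, running.value);
    return 0;
}

Because Combine is associative over these pairs, the tiles of a device-wide scan can be combined in any order, which is what permits the look-back prefix machinery used throughout these files.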
//--------------------------------------------------------------------- - // Block scan utility methods - //--------------------------------------------------------------------- - - /** - * Scan with identity (first tile) - */ - __device__ __forceinline__ - void ScanBlock( - ValueOffsetPair (&values_and_segments)[ITEMS_PER_THREAD], - ValueOffsetPair &block_aggregate, - Int2Type has_identity) - { - ValueOffsetPair identity; - identity.value = 0; - identity.offset = 0; - BlockScanAllocations(temp_storage.scan).ExclusiveScan(values_and_segments, values_and_segments, identity, scan_op, block_aggregate); - } - - /** - * Scan without identity (first tile). Without an identity, the first output item is undefined. - * - */ - __device__ __forceinline__ - void ScanBlock( - ValueOffsetPair (&values_and_segments)[ITEMS_PER_THREAD], - ValueOffsetPair &block_aggregate, - Int2Type has_identity) - { - BlockScanAllocations(temp_storage.scan).ExclusiveScan(values_and_segments, values_and_segments, scan_op, block_aggregate); - } - - /** - * Scan with identity (subsequent tile) - */ - __device__ __forceinline__ - void ScanBlock( - ValueOffsetPair (&values_and_segments)[ITEMS_PER_THREAD], - ValueOffsetPair &block_aggregate, - LookbackPrefixCallbackOp &prefix_op, - Int2Type has_identity) - { - ValueOffsetPair identity; - identity.value = 0; - identity.offset = 0; - BlockScanAllocations(temp_storage.scan).ExclusiveScan(values_and_segments, values_and_segments, identity, scan_op, block_aggregate, prefix_op); - } - - /** - * Scan without identity (subsequent tile). Without an identity, the first output item is undefined. - */ - __device__ __forceinline__ - void ScanBlock( - ValueOffsetPair (&values_and_segments)[ITEMS_PER_THREAD], - ValueOffsetPair &block_aggregate, - LookbackPrefixCallbackOp &prefix_op, - Int2Type has_identity) - { - BlockScanAllocations(temp_storage.scan).ExclusiveScan(values_and_segments, values_and_segments, scan_op, block_aggregate, prefix_op); - } - - - //--------------------------------------------------------------------- - // Zip utility methods - //--------------------------------------------------------------------- - - template - __device__ __forceinline__ void ZipValuesAndFlags( - Offset num_remaining, - Value (&values)[ITEMS_PER_THREAD], - Offset (&flags)[ITEMS_PER_THREAD], - ValueOffsetPair (&values_and_segments)[ITEMS_PER_THREAD]) - { - // Zip values and flags - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - // Unset flags for out-of-bounds keys - if ((LAST_TILE) && (Offset(threadIdx.x * ITEMS_PER_THREAD) + ITEM >= num_remaining)) - flags[ITEM] = 0; - - values_and_segments[ITEM].value = values[ITEM]; - values_and_segments[ITEM].offset = flags[ITEM]; - } - } - - //--------------------------------------------------------------------- - // Scatter utility methods - //--------------------------------------------------------------------- - - - - /** - * Scatter flagged items to output offsets (specialized for direct scattering) - * - * The exclusive scan causes each head flag to be paired with the previous - * value aggregate. 
As such:
- *   - The scatter offsets must be decremented for value aggregates
- *   - The first tile does not scatter the first flagged value (it is undefined from the exclusive scan)
- *   - If the tile is partially-full, we need to scatter the first out-of-bounds value (which aggregates all valid values in the last segment)
- *
- */
-    template
-    __device__ __forceinline__ void ScatterDirect(
-        Offset          num_remaining,
-        Key             (&keys)[ITEMS_PER_THREAD],
-        ValueOffsetPair (&values_and_segments)[ITEMS_PER_THREAD],
-        Offset          (&flags)[ITEMS_PER_THREAD],
-        Offset          tile_num_flags,
-        Int2Type        iteration)
-    {
-        // Scatter key
-        if (flags[ITEM])
-        {
-            d_keys_out[values_and_segments[ITEM].offset] = keys[ITEM];
-        }
-
-        bool is_first_flag = FIRST_TILE && (ITEM == 0) && (threadIdx.x == 0);
-        bool is_oob_value  = (LAST_TILE) && (Offset(threadIdx.x * ITEMS_PER_THREAD) + ITEM == num_remaining);
-
-        // Scatter value reduction
-        if (((flags[ITEM] || is_oob_value)) && (!is_first_flag))
-        {
-            d_values_out[values_and_segments[ITEM].offset - 1] = values_and_segments[ITEM].value;
-        }
-
-        ScatterDirect(num_remaining, keys, values_and_segments, flags, tile_num_flags, Int2Type());
-    }
-
-    template
-    __device__ __forceinline__ void ScatterDirect(
-        Offset          num_remaining,
-        Key             (&keys)[ITEMS_PER_THREAD],
-        ValueOffsetPair (&values_and_segments)[ITEMS_PER_THREAD],
-        Offset          (&flags)[ITEMS_PER_THREAD],
-        Offset          tile_num_flags,
-        Int2Type        iteration)
-    {}
-
-    /**
-     * Scatter flagged items to output offsets (specialized for two-phase scattering)
-     *
-     * The exclusive scan causes each head flag to be paired with the previous
-     * value aggregate. As such:
-     *   - The scatter offsets must be decremented for value aggregates
-     *   - The first tile does not scatter the first flagged value (it is undefined from the exclusive scan)
-     *   - If the tile is partially-full, we need to scatter the first out-of-bounds value (which aggregates all valid values in the last segment)
-     *
-     */
-    template
-    __device__ __forceinline__ void ScatterTwoPhase(
-        Offset          num_remaining,
-        Key             (&keys)[ITEMS_PER_THREAD],
-        ValueOffsetPair (&values_and_segments)[ITEMS_PER_THREAD],
-        Offset          (&flags)[ITEMS_PER_THREAD],
-        Offset          tile_num_flags,
-        Offset          tile_num_flags_prefix)
-    {
-        int   local_ranks[ITEMS_PER_THREAD];
-        Value values[ITEMS_PER_THREAD];
-
-        // Share exclusive tile prefix
-        if (threadIdx.x == 0)
-        {
-            temp_storage.tile_num_flags_prefix = tile_num_flags_prefix;
-        }
-
-        __syncthreads();
-
-        // Load exclusive tile prefix in all threads
-        tile_num_flags_prefix = temp_storage.tile_num_flags_prefix;
-
-        __syncthreads();
-
-        // Compute local scatter ranks
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            local_ranks[ITEM] = values_and_segments[ITEM].offset - tile_num_flags_prefix;
-        }
-
-        // Compact keys in shared memory
-        BlockExchangeKeys(temp_storage.exchange_keys).ScatterToStriped(keys, local_ranks, flags);
-
-        // Scatter keys
-        StoreDirectStriped(threadIdx.x, d_keys_out + tile_num_flags_prefix, keys, tile_num_flags);
-
-        // Unzip values and set flag for first oob item in last tile
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            values[ITEM] = values_and_segments[ITEM].value;
-
-            if (FIRST_TILE)
-                local_ranks[ITEM]--;
-
-            if (LAST_TILE && (Offset(threadIdx.x * ITEMS_PER_THREAD) + ITEM == num_remaining))
-                flags[ITEM] = 1;
-        }
-
-        // Unset first flag in first tile
-        if (FIRST_TILE && (threadIdx.x == 0))
-            flags[0] = 0;
-
-        __syncthreads();
-
-        // Compact values in shared memory
-
BlockExchangeValues(temp_storage.exchange_values).ScatterToStriped(values, local_ranks, flags); - - // Number to output - Offset exchange_count = tile_num_flags; - - if (LAST_TILE && (num_remaining < TILE_ITEMS)) - exchange_count++; - - if (FIRST_TILE) - { - exchange_count--; - } - else - { - tile_num_flags_prefix--; - } - - // Scatter values - StoreDirectStriped(threadIdx.x, d_values_out + tile_num_flags_prefix, values, exchange_count); - - __syncthreads(); - } - - - /** - * Scatter flagged items - */ - template - __device__ __forceinline__ void Scatter( - Offset num_remaining, - Key (&keys)[ITEMS_PER_THREAD], - ValueOffsetPair (&values_and_segments)[ITEMS_PER_THREAD], - Offset (&flags)[ITEMS_PER_THREAD], - Offset tile_num_flags, - Offset tile_num_flags_prefix) - { - // Do a one-phase scatter if (a) two-phase is disabled or (b) the average number of selected items per thread is less than one - if ((TWO_PHASE_SCATTER) && ((tile_num_flags >> Log2::VALUE) > 0)) - { - ScatterTwoPhase( - num_remaining, - keys, - values_and_segments, - flags, - tile_num_flags, - tile_num_flags_prefix); - } - else - { - ScatterDirect( - num_remaining, - keys, - values_and_segments, - flags, - tile_num_flags, - Int2Type<0>()); - } - } - - - //--------------------------------------------------------------------- - // Cooperatively scan a device-wide sequence of tiles with other CTAs - //--------------------------------------------------------------------- - - /** - * Process a tile of input (dynamic domino scan) - */ - template < - bool LAST_TILE> - __device__ __forceinline__ ValueOffsetPair ConsumeTile( - Offset num_items, ///< Total number of global input items - Offset num_remaining, ///< Number of global input items remaining (including this tile) - int tile_idx, ///< Tile index - Offset block_offset, ///< Tile offset - ScanTileState &tile_status) ///< Global list of tile status - { - Key keys[ITEMS_PER_THREAD]; // Tile keys - Value values[ITEMS_PER_THREAD]; // Tile values - Offset flags[ITEMS_PER_THREAD]; // Segment head flags - ValueOffsetPair values_and_segments[ITEMS_PER_THREAD]; // Zipped values and segment flags|indices - - ValueOffsetPair running_total; // Running count of segments and current value aggregate (including this tile) - - if (tile_idx == 0) - { - // First tile - - // Load keys and values - if (LAST_TILE) - { - BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + block_offset, keys, num_remaining); - } - else - { - BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + block_offset, keys); - } - - if (SYNC_AFTER_LOAD) - __syncthreads(); - - // Load values - if (LAST_TILE) - BlockLoadValues(temp_storage.load_values).Load(d_values_in + block_offset, values, num_remaining); - else - BlockLoadValues(temp_storage.load_values).Load(d_values_in + block_offset, values); - - if (SYNC_AFTER_LOAD) - __syncthreads(); - - // Set head flags. 
First tile sets the first flag for the first item - BlockDiscontinuityKeys(temp_storage.discontinuity).FlagHeads(flags, keys, inequality_op); - - // Zip values and flags - ZipValuesAndFlags(num_remaining, values, flags, values_and_segments); - - // Exclusive scan of values and flags - ValueOffsetPair block_aggregate; - ScanBlock(values_and_segments, block_aggregate, Int2Type()); - - // Update tile status if this is not the last tile - if (!LAST_TILE && (threadIdx.x == 0)) - tile_status.SetInclusive(0, block_aggregate); - - // Set offset for first scan output - if (!HAS_IDENTITY_ZERO && (threadIdx.x == 0)) - values_and_segments[0].offset = 0; - - running_total = block_aggregate; - - // Scatter flagged items - Scatter(num_remaining, keys, values_and_segments, flags, block_aggregate.offset, 0); - } - else - { - // Not first tile - - // Load keys and values - if (LAST_TILE) - { - BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + block_offset, keys, num_remaining); - } - else - { - BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + block_offset, keys); - } - - if (SYNC_AFTER_LOAD) - __syncthreads(); - - // Load values - if (LAST_TILE) - BlockLoadValues(temp_storage.load_values).Load(d_values_in + block_offset, values, num_remaining); - else - BlockLoadValues(temp_storage.load_values).Load(d_values_in + block_offset, values); - - if (SYNC_AFTER_LOAD) - __syncthreads(); - - // Obtain the last key in the previous tile to compare with - Key tile_predecessor_key = (threadIdx.x == 0) ? - d_keys_in[block_offset - 1] : - ZeroInitialize(); - - // Set head flags - BlockDiscontinuityKeys(temp_storage.discontinuity).FlagHeads(flags, keys, inequality_op, tile_predecessor_key); - - // Zip values and flags - ZipValuesAndFlags(num_remaining, values, flags, values_and_segments); - - // Exclusive scan of values and flags - ValueOffsetPair block_aggregate; - LookbackPrefixCallbackOp prefix_op(tile_status, temp_storage.prefix, scan_op, tile_idx); - - ScanBlock(values_and_segments, block_aggregate, prefix_op, Int2Type()); - running_total = prefix_op.inclusive_prefix; - - // Scatter flagged items - Scatter(num_remaining, keys, values_and_segments, flags, block_aggregate.offset, prefix_op.exclusive_prefix.offset); - } - - return running_total; - } - - - /** - * Dequeue and scan tiles of items as part of a dynamic domino scan - */ - template ///< Output iterator type for recording number of items selected - __device__ __forceinline__ void ConsumeRange( - int num_tiles, ///< Total number of input tiles - GridQueue queue, ///< Queue descriptor for assigning tiles of work to thread blocks - ScanTileState &tile_status, ///< Global list of tile status - NumSegmentsIterator d_num_segments) ///< Output pointer for total number of segments identified - { -#if (CUB_PTX_ARCH <= 130) - // Blocks are launched in increasing order, so just assign one tile per block - - int tile_idx = (blockIdx.y * 32 * 1024) + blockIdx.x; // Current tile index - Offset block_offset = Offset(TILE_ITEMS) * tile_idx; // Global offset for the current tile - Offset num_remaining = num_items - block_offset; // Remaining items (including this tile) - - if (num_remaining > TILE_ITEMS) - { - // Full tile - ConsumeTile(num_items, num_remaining, tile_idx, block_offset, tile_status); - } - else if (num_remaining > 0) - { - // Last tile - ValueOffsetPair running_total = ConsumeTile(num_items, num_remaining, tile_idx, block_offset, tile_status); - - // Output the total number of items selected - if (threadIdx.x == 0) - { - *d_num_segments = 
running_total.offset; - - // If the last tile is a whole tile, the inclusive prefix contains accumulated value reduction for the last segment - if (num_remaining == TILE_ITEMS) - { - d_values_out[running_total.offset - 1] = running_total.value; - } - } - } -#else - // Blocks may not be launched in increasing order, so work-steal tiles - - // Get first tile index - if (threadIdx.x == 0) - temp_storage.tile_idx = queue.Drain(1); - - __syncthreads(); - - int tile_idx = temp_storage.tile_idx; - Offset block_offset = Offset(TILE_ITEMS) * tile_idx; // Global offset for the current tile - Offset num_remaining = num_items - block_offset; // Remaining items (including this tile) - - while (num_remaining > TILE_ITEMS) - { - if (SYNC_AFTER_LOAD) - __syncthreads(); - - // Consume full tile - ConsumeTile(num_items, num_remaining, tile_idx, block_offset, tile_status); - - // Get tile index - if (threadIdx.x == 0) - temp_storage.tile_idx = queue.Drain(1); - - __syncthreads(); - - tile_idx = temp_storage.tile_idx; - block_offset = Offset(TILE_ITEMS) * tile_idx; - num_remaining = num_items - block_offset; - } - - if (num_remaining > 0) - { - // Consume last tile (treat as partially-full) - ValueOffsetPair running_total = ConsumeTile(num_items, num_remaining, tile_idx, block_offset, tile_status); - - if ((threadIdx.x == 0)) - { - // Output the total number of items selected - *d_num_segments = running_total.offset; - - // If the last tile is a whole tile, the inclusive prefix contains accumulated value reduction for the last segment - if (num_remaining == TILE_ITEMS) - { - d_values_out[running_total.offset - 1] = running_total.value; - } - } - } -#endif - } - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/SRC/cub/block_range/block_range_scan.cuh b/SRC/cub/block_range/block_range_scan.cuh deleted file mode 100644 index 77d44d11..00000000 --- a/SRC/cub/block_range/block_range_scan.cuh +++ /dev/null @@ -1,538 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::BlockRangeScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan across a range of tiles.
- */
-
-#pragma once
-
-#include
-
-#include "block_scan_prefix_operators.cuh"
-#include "../block/block_load.cuh"
-#include "../block/block_store.cuh"
-#include "../block/block_scan.cuh"
-#include "../grid/grid_queue.cuh"
-#include "../iterator/cache_modified_input_iterator.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Tuning policy types
- ******************************************************************************/
-
-/**
- * Parameterizable tuning policy type for BlockRangeScan
- */
-template <
-    int                     _BLOCK_THREADS,             ///< Threads per thread block
-    int                     _ITEMS_PER_THREAD,          ///< Items per thread (per tile of input)
-    BlockLoadAlgorithm      _LOAD_ALGORITHM,            ///< The BlockLoad algorithm to use
-    bool                    _LOAD_WARP_TIME_SLICING,    ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage)
-    CacheLoadModifier       _LOAD_MODIFIER,             ///< Cache load modifier for reading input elements
-    BlockStoreAlgorithm     _STORE_ALGORITHM,           ///< The BlockStore algorithm to use
-    bool                    _STORE_WARP_TIME_SLICING,   ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage)
-    BlockScanAlgorithm      _SCAN_ALGORITHM>            ///< The BlockScan algorithm to use
-struct BlockRangeScanPolicy
-{
-    enum
-    {
-        BLOCK_THREADS           = _BLOCK_THREADS,           ///< Threads per thread block
-        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,        ///< Items per thread (per tile of input)
-        LOAD_WARP_TIME_SLICING  = _LOAD_WARP_TIME_SLICING,  ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage)
-        STORE_WARP_TIME_SLICING = _STORE_WARP_TIME_SLICING, ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage)
-    };
-
-    static const BlockLoadAlgorithm     LOAD_ALGORITHM  = _LOAD_ALGORITHM;      ///< The BlockLoad algorithm to use
-    static const CacheLoadModifier      LOAD_MODIFIER   = _LOAD_MODIFIER;       ///< Cache load modifier for reading input elements
-    static const BlockStoreAlgorithm    STORE_ALGORITHM = _STORE_ALGORITHM;     ///< The BlockStore algorithm to use
-    static const BlockScanAlgorithm     SCAN_ALGORITHM  = _SCAN_ALGORITHM;      ///< The BlockScan algorithm to use
-};
-
-
-
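
A tuning-policy type like BlockRangeScanPolicy above is consumed entirely at compile time: the consuming range abstraction reads its constants to size shared memory and unroll per-thread work. The stand-alone sketch below shows that pattern on a toy kernel; ToyScanPolicy and TileSumKernel are illustrative names invented here, and the per-thread partial sum merely stands in for the full tile scan.

#include <cstdio>

// A policy is a pure compile-time bundle of constants.
template <int _BLOCK_THREADS, int _ITEMS_PER_THREAD>
struct ToyScanPolicy
{
    enum
    {
        BLOCK_THREADS    = _BLOCK_THREADS,
        ITEMS_PER_THREAD = _ITEMS_PER_THREAD,
        TILE_ITEMS       = _BLOCK_THREADS * _ITEMS_PER_THREAD,
    };
};

// Each block consumes one tile of Policy::TILE_ITEMS items; each thread
// writes an inclusive partial sum over its own ITEMS_PER_THREAD items.
template <typename Policy>
__global__ void TileSumKernel(const int *d_in, int *d_out, int num_items)
{
    int base = blockIdx.x * Policy::TILE_ITEMS + threadIdx.x * Policy::ITEMS_PER_THREAD;
    int running = 0;

    #pragma unroll
    for (int i = 0; i < Policy::ITEMS_PER_THREAD; ++i)
    {
        if (base + i < num_items)
        {
            running += d_in[base + i];
            d_out[base + i] = running;  // per-thread inclusive partial
        }
    }
}

int main()
{
    typedef ToyScanPolicy<128, 4> Policy;   // 512 items per tile

    const int num_items = 1000;
    int *d_in, *d_out;
    cudaMalloc(&d_in,  num_items * sizeof(int));
    cudaMalloc(&d_out, num_items * sizeof(int));
    cudaMemset(d_in, 0, num_items * sizeof(int));

    // Grid size follows from the policy's tile size, mirroring how the
    // deleted dispatch code derives launch shape from a tuned policy.
    int grid = (num_items + Policy::TILE_ITEMS - 1) / Policy::TILE_ITEMS;
    TileSumKernel<Policy><<<grid, Policy::BLOCK_THREADS>>>(d_in, d_out, num_items);
    cudaDeviceSynchronize();
    printf("launched %d block(s) of %d threads\n", grid, (int)Policy::BLOCK_THREADS);

    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}

Keeping every knob in one template parameter pack is what lets these files ship per-architecture tunings (for example, different ITEMS_PER_THREAD per PTX version) without any runtime branching.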
-/****************************************************************************** - * Thread block abstractions - ******************************************************************************/ - -/** - * \brief BlockRangeScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan across a range of tiles. - */ -template < - typename BlockRangeScanPolicy, ///< Parameterized BlockRangeScanPolicy tuning policy type - typename InputIterator, ///< Random-access input iterator type - typename OutputIterator, ///< Random-access output iterator type - typename ScanOp, ///< Scan functor type - typename Identity, ///< Identity element type (cub::NullType for inclusive scan) - typename Offset> ///< Signed integer type for global offsets -struct BlockRangeScan -{ - //--------------------------------------------------------------------- - // Types and constants - //--------------------------------------------------------------------- - - // Data type of input iterator - typedef typename std::iterator_traits::value_type T; - - // Tile status descriptor interface type - typedef ScanTileState ScanTileState; - - // Input iterator wrapper type - typedef typename If::VALUE, - CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedInputIterator - InputIterator>::Type // Directly use the supplied input iterator type - WrappedInputIterator; - - // Constants - enum - { - INCLUSIVE = Equals::VALUE, // Inclusive scan if no identity type is provided - BLOCK_THREADS = BlockRangeScanPolicy::BLOCK_THREADS, - ITEMS_PER_THREAD = BlockRangeScanPolicy::ITEMS_PER_THREAD, - TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - }; - - // Parameterized BlockLoad type - typedef BlockLoad< - WrappedInputIterator, - BlockRangeScanPolicy::BLOCK_THREADS, - BlockRangeScanPolicy::ITEMS_PER_THREAD, - BlockRangeScanPolicy::LOAD_ALGORITHM, - BlockRangeScanPolicy::LOAD_WARP_TIME_SLICING> - BlockLoadT; - - // Parameterized BlockStore type - typedef BlockStore< - OutputIterator, - BlockRangeScanPolicy::BLOCK_THREADS, - BlockRangeScanPolicy::ITEMS_PER_THREAD, - BlockRangeScanPolicy::STORE_ALGORITHM, - BlockRangeScanPolicy::STORE_WARP_TIME_SLICING> - BlockStoreT; - - // Parameterized BlockScan type - typedef BlockScan< - T, - BlockRangeScanPolicy::BLOCK_THREADS, - BlockRangeScanPolicy::SCAN_ALGORITHM> - BlockScanT; - - // Callback type for obtaining tile prefix during block scan - typedef BlockScanLookbackPrefixOp< - T, - ScanOp, - ScanTileState> - LookbackPrefixCallbackOp; - - // Stateful BlockScan prefix callback type for managing a running total while scanning consecutive tiles - typedef BlockScanRunningPrefixOp< - T, - ScanOp> - RunningPrefixCallbackOp; - - // Shared memory type for this threadblock - struct _TempStorage - { - union - { - typename BlockLoadT::TempStorage load; // Smem needed for tile loading - typename BlockStoreT::TempStorage store; // Smem needed for tile storing - struct - { - typename LookbackPrefixCallbackOp::TempStorage prefix; // Smem needed for cooperative prefix callback - typename BlockScanT::TempStorage scan; // Smem needed for tile scanning - }; - }; - - Offset tile_idx; // Shared tile index - }; - - // Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - //--------------------------------------------------------------------- - // Per-thread fields - //--------------------------------------------------------------------- - - _TempStorage &temp_storage; ///< Reference to temp_storage - 
WrappedInputIterator d_in; ///< Input data - OutputIterator d_out; ///< Output data - ScanOp scan_op; ///< Binary scan operator - Identity identity; ///< Identity element - - - - //--------------------------------------------------------------------- - // Block scan utility methods (first tile) - //--------------------------------------------------------------------- - - /** - * Exclusive scan specialization - */ - template - __device__ __forceinline__ - void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, _Identity identity, T& block_aggregate) - { - BlockScanT(temp_storage.scan).ExclusiveScan(items, items, identity, scan_op, block_aggregate); - } - - /** - * Exclusive sum specialization - */ - template - __device__ __forceinline__ - void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, _Identity identity, T& block_aggregate) - { - BlockScanT(temp_storage.scan).ExclusiveSum(items, items, block_aggregate); - } - - /** - * Inclusive scan specialization - */ - template - __device__ __forceinline__ - void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, NullType identity, T& block_aggregate) - { - BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, block_aggregate); - } - - /** - * Inclusive sum specialization - */ - __device__ __forceinline__ - void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, NullType identity, T& block_aggregate) - { - BlockScanT(temp_storage.scan).InclusiveSum(items, items, block_aggregate); - } - - //--------------------------------------------------------------------- - // Block scan utility methods (subsequent tiles) - //--------------------------------------------------------------------- - - /** - * Exclusive scan specialization (with prefix from predecessors) - */ - template - __device__ __forceinline__ - void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, _Identity identity, T& block_aggregate, PrefixCallback &prefix_op) - { - BlockScanT(temp_storage.scan).ExclusiveScan(items, items, identity, scan_op, block_aggregate, prefix_op); - } - - /** - * Exclusive sum specialization (with prefix from predecessors) - */ - template - __device__ __forceinline__ - void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, _Identity identity, T& block_aggregate, PrefixCallback &prefix_op) - { - BlockScanT(temp_storage.scan).ExclusiveSum(items, items, block_aggregate, prefix_op); - } - - /** - * Inclusive scan specialization (with prefix from predecessors) - */ - template - __device__ __forceinline__ - void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, NullType identity, T& block_aggregate, PrefixCallback &prefix_op) - { - BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, block_aggregate, prefix_op); - } - - /** - * Inclusive sum specialization (with prefix from predecessors) - */ - template - __device__ __forceinline__ - void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, NullType identity, T& block_aggregate, PrefixCallback &prefix_op) - { - BlockScanT(temp_storage.scan).InclusiveSum(items, items, block_aggregate, prefix_op); - } - - - //--------------------------------------------------------------------- - // Constructor - //--------------------------------------------------------------------- - - // Constructor - __device__ __forceinline__ - BlockRangeScan( - TempStorage &temp_storage, ///< Reference to temp_storage - InputIterator d_in, ///< Input data - OutputIterator d_out, ///< Output data - ScanOp scan_op, ///< Binary scan operator - Identity identity) ///< Identity element - : - 
temp_storage(temp_storage.Alias()), - d_in(d_in), - d_out(d_out), - scan_op(scan_op), - identity(identity) - {} - - - //--------------------------------------------------------------------- - // Cooperatively scan a device-wide sequence of tiles with other CTAs - //--------------------------------------------------------------------- - - /** - * Process a tile of input (dynamic domino scan) - */ - template - __device__ __forceinline__ void ConsumeTile( - Offset num_items, ///< Total number of input items - Offset num_remaining, ///< Total number of items remaining to be processed (including this tile) - int tile_idx, ///< Tile index - Offset block_offset, ///< Tile offset - ScanTileState &tile_status) ///< Global list of tile status - { - // Load items - T items[ITEMS_PER_THREAD]; - - if (LAST_TILE) - BlockLoadT(temp_storage.load).Load(d_in + block_offset, items, num_remaining); - else - BlockLoadT(temp_storage.load).Load(d_in + block_offset, items); - - __syncthreads(); - - // Perform tile scan - if (tile_idx == 0) - { - // Scan first tile - T block_aggregate; - ScanBlock(items, scan_op, identity, block_aggregate); - - // Update tile status if there may be successor tiles (i.e., this tile is full) - if (!LAST_TILE && (threadIdx.x == 0)) - tile_status.SetInclusive(0, block_aggregate); - } - else - { - // Scan non-first tile - T block_aggregate; - LookbackPrefixCallbackOp prefix_op(tile_status, temp_storage.prefix, scan_op, tile_idx); - ScanBlock(items, scan_op, identity, block_aggregate, prefix_op); - } - - __syncthreads(); - - // Store items - if (LAST_TILE) - BlockStoreT(temp_storage.store).Store(d_out + block_offset, items, num_remaining); - else - BlockStoreT(temp_storage.store).Store(d_out + block_offset, items); - } - - - /** - * Dequeue and scan tiles of items as part of a dynamic domino scan - */ - __device__ __forceinline__ void ConsumeRange( - int num_items, ///< Total number of input items - GridQueue queue, ///< Queue descriptor for assigning tiles of work to thread blocks - ScanTileState &tile_status) ///< Global list of tile status - { -#if (CUB_PTX_ARCH <= 130) - // Blocks are launched in increasing order, so just assign one tile per block - - int tile_idx = (blockIdx.y * 32 * 1024) + blockIdx.x; // Current tile index - Offset block_offset = Offset(TILE_ITEMS) * tile_idx; // Global offset for the current tile - Offset num_remaining = num_items - block_offset; // Remaining items (including this tile) - - if (block_offset + TILE_ITEMS <= num_items) - ConsumeTile(num_items, num_remaining, tile_idx, block_offset, tile_status); - else if (block_offset < num_items) - ConsumeTile(num_items, num_remaining, tile_idx, block_offset, tile_status); - -#else - // Blocks may not be launched in increasing order, so work-steal tiles - - // Get first tile index - if (threadIdx.x == 0) - temp_storage.tile_idx = queue.Drain(1); - - __syncthreads(); - - int tile_idx = temp_storage.tile_idx; - Offset block_offset = TILE_ITEMS * tile_idx; - Offset num_remaining = num_items - block_offset; - - while (num_remaining >= TILE_ITEMS) - { - // Consume full tile - ConsumeTile(num_items, num_remaining, tile_idx, block_offset, tile_status); - - // Get next tile - if (threadIdx.x == 0) - temp_storage.tile_idx = queue.Drain(1); - - __syncthreads(); - - tile_idx = temp_storage.tile_idx; - block_offset = TILE_ITEMS * tile_idx; - num_remaining = num_items - block_offset; - } - - // Consume the last (and potentially partially-full) tile - if (num_remaining > 0) - { - ConsumeTile(num_items, num_remaining, 
tile_idx, block_offset, tile_status);
-        }
-
-#endif
-    }
-
-
-    //---------------------------------------------------------------------
-    // Scan a sequence of consecutive tiles (independent of other thread blocks)
-    //---------------------------------------------------------------------
-
-    /**
-     * Process a tile of input
-     */
-    template <
-        bool FULL_TILE,
-        bool FIRST_TILE>
-    __device__ __forceinline__ void ConsumeTile(
-        Offset                   block_offset,               ///< Tile offset
-        RunningPrefixCallbackOp &prefix_op,                  ///< Running prefix operator
-        int                      valid_items = TILE_ITEMS)   ///< Number of valid items in the tile
-    {
-        // Load items
-        T items[ITEMS_PER_THREAD];
-
-        if (FULL_TILE)
-            BlockLoadT(temp_storage.load).Load(d_in + block_offset, items);
-        else
-            BlockLoadT(temp_storage.load).Load(d_in + block_offset, items, valid_items);
-
-        __syncthreads();
-
-        // Block scan
-        if (FIRST_TILE)
-        {
-            T block_aggregate;
-            ScanBlock(items, scan_op, identity, block_aggregate);
-            prefix_op.running_total = block_aggregate;
-        }
-        else
-        {
-            T block_aggregate;
-            ScanBlock(items, scan_op, identity, block_aggregate, prefix_op);
-        }
-
-        __syncthreads();
-
-        // Store items
-        if (FULL_TILE)
-            BlockStoreT(temp_storage.store).Store(d_out + block_offset, items);
-        else
-            BlockStoreT(temp_storage.store).Store(d_out + block_offset, items, valid_items);
-    }
-
-
-    /**
-     * Scan a consecutive share of input tiles
-     */
-    __device__ __forceinline__ void ConsumeRange(
-        Offset  block_offset,   ///< [in] Threadblock begin offset (inclusive)
-        Offset  block_end)      ///< [in] Threadblock end offset (exclusive)
-    {
-        BlockScanRunningPrefixOp prefix_op(scan_op);
-
-        if (block_offset + TILE_ITEMS <= block_end)
-        {
-            // Consume first tile of input (full)
-            ConsumeTile(block_offset, prefix_op);
-            block_offset += TILE_ITEMS;
-
-            // Consume subsequent full tiles of input
-            while (block_offset + TILE_ITEMS <= block_end)
-            {
-                ConsumeTile(block_offset, prefix_op);
-                block_offset += TILE_ITEMS;
-            }
-
-            // Consume a partially-full tile
-            if (block_offset < block_end)
-            {
-                int valid_items = block_end - block_offset;
-                ConsumeTile(block_offset, prefix_op, valid_items);
-            }
-        }
-        else
-        {
-            // Consume the first tile of input (partially-full)
-            int valid_items = block_end - block_offset;
-            ConsumeTile(block_offset, prefix_op, valid_items);
-        }
-    }
-
-
-    /**
-     * Scan a consecutive share of input tiles, seeded with the specified prefix value
-     */
-    __device__ __forceinline__ void ConsumeRange(
-        Offset  block_offset,   ///< [in] Threadblock begin offset (inclusive)
-        Offset  block_end,      ///< [in] Threadblock end offset (exclusive)
-        T       prefix)         ///< [in] The prefix to apply to the scan segment
-    {
-        BlockScanRunningPrefixOp prefix_op(prefix, scan_op);
-
-        // Consume full tiles of input
-        while (block_offset + TILE_ITEMS <= block_end)
-        {
-            ConsumeTile(block_offset, prefix_op);
-            block_offset += TILE_ITEMS;
-        }
-
-        // Consume a partially-full tile
-        if (block_offset < block_end)
-        {
-            int valid_items = block_end - block_offset;
-            ConsumeTile(block_offset, prefix_op, valid_items);
-        }
-    }
-
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/SRC/cub/block_range/block_range_select.cuh b/SRC/cub/block_range/block_range_select.cuh
deleted file mode 100644
index 59fb5ce2..00000000
--- a/SRC/cub/block_range/block_range_select.cuh
+++ /dev/null
@@ -1,735 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill. All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::BlockRangeSelect implements a stateful abstraction of CUDA thread blocks for participating in device-wide select. 
- */ - -#pragma once - -#include - -#include "block_scan_prefix_operators.cuh" -#include "../block/block_load.cuh" -#include "../block/block_store.cuh" -#include "../block/block_scan.cuh" -#include "../block/block_exchange.cuh" -#include "../block/block_discontinuity.cuh" -#include "../grid/grid_queue.cuh" -#include "../iterator/cache_modified_input_iterator.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/****************************************************************************** - * Tuning policy types - ******************************************************************************/ - -/** - * Parameterizable tuning policy type for BlockRangeSelect - */ -template < - int _BLOCK_THREADS, ///< Threads per thread block - int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) - BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use - CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements - bool _TWO_PHASE_SCATTER, ///< Whether or not to coalesce output values in shared memory before scattering them to global - BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use -struct BlockRangeSelectPolicy -{ - enum - { - BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block - ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) - TWO_PHASE_SCATTER = _TWO_PHASE_SCATTER, ///< Whether or not to coalesce output values in shared memory before scattering them to global - }; - - static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use - static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements - static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use -}; - - - - -/****************************************************************************** - * Thread block abstractions - ******************************************************************************/ - -/** - * \brief BlockRangeSelect implements a stateful abstraction of CUDA thread blocks for participating in device-wide selection across a range of tiles - * - * Performs functor-based selection if SelectOp functor type != NullType - * Otherwise performs flag-based selection if FlagIterator's value type != NullType - * Otherwise performs discontinuity selection (keep unique) - */ -template < - typename BlockRangeSelectPolicy, ///< Parameterized BlockRangeSelectPolicy tuning policy type - typename InputIterator, ///< Random-access input iterator type for selection items - typename FlagIterator, ///< Random-access input iterator type for selections (NullType* if a selection functor or discontinuity flagging is to be used for selection) - typename OutputIterator, ///< Random-access input iterator type for selected items - typename SelectOp, ///< Selection operator type (NullType if selections or discontinuity flagging is to be used for selection) - typename EqualityOp, ///< Equality operator type (NullType if selection functor or selections is to be used for selection) - typename Offset, ///< Signed integer type for global offsets - bool KEEP_REJECTS> ///< Whether or not we push rejected items to the back of the output -struct BlockRangeSelect -{ - //--------------------------------------------------------------------- - // Types and constants - //--------------------------------------------------------------------- - - // 
Data type of input iterator - typedef typename std::iterator_traits::value_type T; - - // Data type of flag iterator - typedef typename std::iterator_traits::value_type Flag; - - // Tile status descriptor interface type - typedef ScanTileState ScanTileState; - - // Constants - enum - { - USE_SELECT_OP, - USE_SELECT_FLAGS, - USE_DISCONTINUITY, - - BLOCK_THREADS = BlockRangeSelectPolicy::BLOCK_THREADS, - ITEMS_PER_THREAD = BlockRangeSelectPolicy::ITEMS_PER_THREAD, - TWO_PHASE_SCATTER = (BlockRangeSelectPolicy::TWO_PHASE_SCATTER) && (ITEMS_PER_THREAD > 1), - TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - - // Whether or not to sync after loading data - SYNC_AFTER_LOAD = (BlockRangeSelectPolicy::LOAD_ALGORITHM != BLOCK_LOAD_DIRECT), - - SELECT_METHOD = (!Equals::VALUE) ? - USE_SELECT_OP : - (!Equals::VALUE) ? - USE_SELECT_FLAGS : - USE_DISCONTINUITY - }; - - // Input iterator wrapper type - typedef typename If::VALUE, - CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedInputIterator - InputIterator>::Type // Directly use the supplied input iterator type - WrappedInputIterator; - - // Flag iterator wrapper type - typedef typename If::VALUE, - CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedInputIterator - FlagIterator>::Type // Directly use the supplied input iterator type - WrappedFlagIterator; - - // Parameterized BlockLoad type for input items - typedef BlockLoad< - WrappedInputIterator, - BlockRangeSelectPolicy::BLOCK_THREADS, - BlockRangeSelectPolicy::ITEMS_PER_THREAD, - BlockRangeSelectPolicy::LOAD_ALGORITHM> - BlockLoadT; - - // Parameterized BlockLoad type for flags - typedef BlockLoad< - WrappedFlagIterator, - BlockRangeSelectPolicy::BLOCK_THREADS, - BlockRangeSelectPolicy::ITEMS_PER_THREAD, - BlockRangeSelectPolicy::LOAD_ALGORITHM> - BlockLoadFlags; - - // Parameterized BlockExchange type for input items - typedef BlockExchange< - T, - BLOCK_THREADS, - ITEMS_PER_THREAD> - BlockExchangeT; - - // Parameterized BlockDiscontinuity type for input items - typedef BlockDiscontinuity BlockDiscontinuityT; - - // Parameterized BlockScan type - typedef BlockScan< - Offset, - BlockRangeSelectPolicy::BLOCK_THREADS, - BlockRangeSelectPolicy::SCAN_ALGORITHM> - BlockScanAllocations; - - // Callback type for obtaining tile prefix during block scan - typedef BlockScanLookbackPrefixOp< - Offset, - Sum, - ScanTileState> - LookbackPrefixCallbackOp; - - // Shared memory type for this threadblock - struct _TempStorage - { - union - { - struct - { - typename LookbackPrefixCallbackOp::TempStorage prefix; // Smem needed for cooperative prefix callback - typename BlockScanAllocations::TempStorage scan; // Smem needed for tile scanning - typename BlockDiscontinuityT::TempStorage discontinuity; // Smem needed for discontinuity detection - }; - - // Smem needed for input loading - typename BlockLoadT::TempStorage load_items; - - // Smem needed for flag loading - typename BlockLoadFlags::TempStorage load_flags; - - // Smem needed for two-phase scatter - typename If::Type exchange; - }; - - Offset tile_idx; // Shared tile index - Offset tile_num_selected_prefix; // Exclusive tile prefix - }; - - // Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - //--------------------------------------------------------------------- - // Per-thread fields - //--------------------------------------------------------------------- - - _TempStorage &temp_storage; ///< Reference to temp_storage - WrappedInputIterator 
d_in; ///< Input data - WrappedFlagIterator d_flags; ///< Input flags - OutputIterator d_out; ///< Output data - SelectOp select_op; ///< Selection operator - InequalityWrapper inequality_op; ///< Inequality operator - Offset num_items; ///< Total number of input items - - - //--------------------------------------------------------------------- - // Constructor - //--------------------------------------------------------------------- - - // Constructor - __device__ __forceinline__ - BlockRangeSelect( - TempStorage &temp_storage, ///< Reference to temp_storage - InputIterator d_in, ///< Input data - FlagIterator d_flags, ///< Input flags - OutputIterator d_out, ///< Output data - SelectOp select_op, ///< Selection operator - EqualityOp equality_op, ///< Equality operator - Offset num_items) ///< Total number of input items - : - temp_storage(temp_storage.Alias()), - d_in(d_in), - d_flags(d_flags), - d_out(d_out), - select_op(select_op), - inequality_op(equality_op), - num_items(num_items) - {} - - - //--------------------------------------------------------------------- - // Utility methods for initializing the selections - //--------------------------------------------------------------------- - - /** - * Template unrolled selection via selection operator - */ - template - __device__ __forceinline__ void ApplySelectionOp( - Offset block_offset, - Offset num_remaining, - T (&items)[ITEMS_PER_THREAD], - Offset (&selected)[ITEMS_PER_THREAD], - Int2Type iteration) - { - selected[ITERATION] = 0; - if (!LAST_TILE || (Offset(threadIdx.x * ITEMS_PER_THREAD) + ITERATION < num_remaining)) - selected[ITERATION] = select_op(items[ITERATION]); - - ApplySelectionOp(block_offset, num_remaining, items, selected, Int2Type()); - } - - /** - * Template unrolled selection via selection operator - */ - template - __device__ __forceinline__ void ApplySelectionOp( - Offset block_offset, - Offset num_remaining, - T (&items)[ITEMS_PER_THREAD], - Offset (&selected)[ITEMS_PER_THREAD], - Int2Type iteration) - {} - - /** - * Initialize selections (specialized for selection operator) - */ - template - __device__ __forceinline__ void InitializeSelections( - Offset block_offset, - Offset num_remaining, - T (&items)[ITEMS_PER_THREAD], - Offset (&selected)[ITEMS_PER_THREAD], - Int2Type select_method) - { - ApplySelectionOp(block_offset, num_remaining, items, selected, Int2Type<0>()); - } - - - /** - * Initialize selections (specialized for valid flags) - */ - template - __device__ __forceinline__ void InitializeSelections( - Offset block_offset, - Offset num_remaining, - T (&items)[ITEMS_PER_THREAD], - Offset (&selected)[ITEMS_PER_THREAD], - Int2Type select_method) - { - Flag flags[ITEMS_PER_THREAD]; - - if (LAST_TILE) - BlockLoadFlags(temp_storage.load_flags).Load(d_flags + block_offset, flags, num_remaining, 0); - else - BlockLoadFlags(temp_storage.load_flags).Load(d_flags + block_offset, flags); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - selected[ITEM] = flags[ITEM]; - } - - if (SYNC_AFTER_LOAD) - __syncthreads(); - } - - - /** - * Initialize selections (specialized for discontinuity detection) - */ - template - __device__ __forceinline__ void InitializeSelections( - Offset block_offset, - Offset num_remaining, - T (&items)[ITEMS_PER_THREAD], - Offset (&selected)[ITEMS_PER_THREAD], - Int2Type select_method) - { - if (FIRST_TILE) - { - // First tile always flags the first item - BlockDiscontinuityT(temp_storage.discontinuity).FlagHeads(selected, items, inequality_op); - } - else - { - 
// Subsequent tiles require the last item from the previous tile - T tile_predecessor_item; - if (threadIdx.x == 0) - tile_predecessor_item = d_in[block_offset - 1]; - - BlockDiscontinuityT(temp_storage.discontinuity).FlagHeads(selected, items, inequality_op, tile_predecessor_item); - } - } - - - //--------------------------------------------------------------------- - // Utility methods for scattering selections - //--------------------------------------------------------------------- - - /** - * Scatter data items to select offsets (specialized for direct scattering and for discarding rejected items) - */ - template - __device__ __forceinline__ void Scatter( - Offset block_offset, - T (&items)[ITEMS_PER_THREAD], - Offset selected[ITEMS_PER_THREAD], - Offset scatter_offsets[ITEMS_PER_THREAD], - Offset tile_num_selected_prefix, - Offset tile_num_selected, - Offset num_remaining, - Int2Type keep_rejects, - Int2Type two_phase_scatter) - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - if (selected[ITEM]) - { - // Selected items are placed front-to-back - d_out[scatter_offsets[ITEM]] = items[ITEM]; - } - } - } - - - /** - * Scatter data items to select offsets (specialized for direct scattering and for partitioning rejected items after selected items) - */ - template - __device__ __forceinline__ void Scatter( - Offset block_offset, - T (&items)[ITEMS_PER_THREAD], - Offset selected[ITEMS_PER_THREAD], - Offset scatter_offsets[ITEMS_PER_THREAD], - Offset tile_num_selected_prefix, - Offset tile_num_selected, - Offset num_remaining, - Int2Type keep_rejects, - Int2Type two_phase_scatter) - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - if (selected[ITEM]) - { - // Selected items are placed front-to-back - d_out[scatter_offsets[ITEM]] = items[ITEM]; - } - else if (!LAST_TILE || (Offset(threadIdx.x * ITEMS_PER_THREAD) + ITEM < num_remaining)) - { - Offset global_idx = block_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM; - Offset reject_idx = global_idx - scatter_offsets[ITEM]; - - // Rejected items are placed back-to-front - d_out[num_items - reject_idx - 1] = items[ITEM]; - } - } - } - - - /** - * Scatter data items to select offsets (specialized for two-phase scattering and for discarding rejected items) - */ - template - __device__ __forceinline__ void Scatter( - Offset block_offset, - T (&items)[ITEMS_PER_THREAD], - Offset selected[ITEMS_PER_THREAD], - Offset scatter_offsets[ITEMS_PER_THREAD], - Offset tile_num_selected_prefix, - Offset tile_num_selected, - Offset num_remaining, - Int2Type keep_rejects, - Int2Type two_phase_scatter) - { - if ((tile_num_selected >> Log2::VALUE) == 0) - { - // Average number of selected items per thread is less than one, so just do a one-phase scatter - Scatter( - block_offset, - items, - selected, - scatter_offsets, - tile_num_selected_prefix, - tile_num_selected, - num_remaining, - keep_rejects, - Int2Type()); - } - else - { - // Share exclusive tile prefix - if (threadIdx.x == 0) - { - temp_storage.tile_num_selected_prefix = tile_num_selected_prefix; - } - - __syncthreads(); - - // Load exclusive tile prefix in all threads - tile_num_selected_prefix = temp_storage.tile_num_selected_prefix; - - int local_ranks[ITEMS_PER_THREAD]; - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - local_ranks[ITEM] = scatter_offsets[ITEM] - tile_num_selected_prefix; - } - - BlockExchangeT(temp_storage.exchange).ScatterToStriped(items, local_ranks, selected); - - // Selected items are placed 
front-to-back - StoreDirectStriped(threadIdx.x, d_out + tile_num_selected_prefix, items, tile_num_selected); - } - } - - - /** - * Scatter data items to select offsets (specialized for two-phase scattering and for partitioning rejected items after selected items) - */ - template - __device__ __forceinline__ void Scatter( - Offset block_offset, - T (&items)[ITEMS_PER_THREAD], - Offset selected[ITEMS_PER_THREAD], - Offset scatter_offsets[ITEMS_PER_THREAD], - Offset tile_num_selected_prefix, - Offset tile_num_selected, - Offset num_remaining, - Int2Type keep_rejects, - Int2Type two_phase_scatter) - { - // Share exclusive tile prefix - if (threadIdx.x == 0) - { - temp_storage.tile_num_selected_prefix = tile_num_selected_prefix; - } - - __syncthreads(); - - // Load the exclusive tile prefix in all threads - tile_num_selected_prefix = temp_storage.tile_num_selected_prefix; - - // Determine the exclusive prefix for rejects - Offset tile_rejected_exclusive_prefix = block_offset - tile_num_selected_prefix; - - // Determine local scatter offsets - int local_ranks[ITEMS_PER_THREAD]; - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - local_ranks[ITEM] = -1; - Offset global_idx = block_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM; - Offset reject_idx = global_idx - scatter_offsets[ITEM]; - - if (selected[ITEM]) - { - // Selected items - local_ranks[ITEM] = scatter_offsets[ITEM] - tile_num_selected_prefix; - } - else if (!LAST_TILE || (Offset(threadIdx.x * ITEMS_PER_THREAD) + ITEM < num_remaining)) - { - // Rejected items - local_ranks[ITEM] = (reject_idx - tile_rejected_exclusive_prefix) + tile_num_selected; - } - } - - // Coalesce selected and rejected items in shared memory, gathering in striped arrangements - if (LAST_TILE) - BlockExchangeT(temp_storage.exchange).ScatterToStripedGuarded(items, local_ranks); - else - BlockExchangeT(temp_storage.exchange).ScatterToStriped(items, local_ranks); - - // Store in striped order - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - Offset local_idx = (ITEM * BLOCK_THREADS) + threadIdx.x; - Offset scatter_offset = tile_num_selected_prefix + local_idx; - if (local_idx >= tile_num_selected) - scatter_offset = num_items - (tile_rejected_exclusive_prefix + (local_idx - tile_num_selected)) - 1; - - if (!LAST_TILE || (local_idx < num_remaining)) - { - d_out[scatter_offset] = items[ITEM]; - } - } - } - - - //--------------------------------------------------------------------- - // Cooperatively scan a device-wide sequence of tiles with other CTAs - //--------------------------------------------------------------------- - - /** - * Process a tile of input (dynamic domino scan) - */ - template - __device__ __forceinline__ Offset ConsumeTile( - Offset num_items, ///< Total number of input items - Offset num_remaining, ///< Total number of items remaining to be processed (including this tile) - int tile_idx, ///< Tile index - Offset block_offset, ///< Tile offset - ScanTileState &tile_status) ///< Global list of tile status - { - T items[ITEMS_PER_THREAD]; - Offset selected[ITEMS_PER_THREAD]; // Selection flags - Offset scatter_offsets[ITEMS_PER_THREAD]; // Scatter offsets - Offset tile_num_selected_prefix; // Total number of selected items prior to this tile - Offset tile_num_selected; // Total number of selected items within this tile - Offset num_selected; // - - // Load items - if (LAST_TILE) - BlockLoadT(temp_storage.load_items).Load(d_in + block_offset, items, num_remaining, d_in[num_items - 1]); // Repeat 
last item - else - BlockLoadT(temp_storage.load_items).Load(d_in + block_offset, items); - - if (SYNC_AFTER_LOAD) - __syncthreads(); - - if (tile_idx == 0) - { - // Initialize selected/rejected output flags for first tile - InitializeSelections( - block_offset, - num_remaining, - items, - selected, - Int2Type()); - - // Compute scatter offsets by scanning the flags - BlockScanAllocations(temp_storage.scan).ExclusiveSum(selected, scatter_offsets, tile_num_selected); - - // Update tile status if there may be successor tiles - if (!LAST_TILE && (threadIdx.x == 0)) - tile_status.SetInclusive(0, tile_num_selected); - - tile_num_selected_prefix = 0; - num_selected = tile_num_selected; - } - else - { - // Initialize selected/rejected output flags for non-first tile - InitializeSelections( - block_offset, - num_remaining, - items, - selected, - Int2Type()); - - // Compute scatter offsets by scanning the flags - LookbackPrefixCallbackOp prefix_op(tile_status, temp_storage.prefix, Sum(), tile_idx); - BlockScanAllocations(temp_storage.scan).ExclusiveSum(selected, scatter_offsets, tile_num_selected, prefix_op); - - tile_num_selected_prefix = prefix_op.exclusive_prefix; - num_selected = prefix_op.inclusive_prefix; - } - - // Store selected items - Scatter( - block_offset, - items, - selected, - scatter_offsets, - tile_num_selected_prefix, - tile_num_selected, - num_remaining, - Int2Type(), - Int2Type()); - - // Return total number of items selected (inclusive of this tile) - return num_selected; - } - - - /** - * Dequeue and scan tiles of items as part of a dynamic domino scan - */ - template ///< Output iterator type for recording number of items selected - __device__ __forceinline__ void ConsumeRange( - int num_tiles, ///< Total number of input tiles - GridQueue queue, ///< Queue descriptor for assigning tiles of work to thread blocks - ScanTileState &tile_status, ///< Global list of tile status - NumSelectedIterator d_num_selected) ///< Output total number selected - { -#if (CUB_PTX_ARCH <= 130) - // Blocks are launched in increasing order, so just assign one tile per block - - int tile_idx = (blockIdx.y * 32 * 1024) + blockIdx.x; // Current tile index - Offset block_offset = Offset(TILE_ITEMS) * tile_idx; // Global offset for the current tile - Offset num_remaining = num_items - block_offset; // Remaining items (including this tile) - - if (num_remaining > TILE_ITEMS) - { - ConsumeTile(num_items, num_remaining, tile_idx, block_offset, tile_status); - } - else if (num_remaining > 0) - { - Offset total_selected = ConsumeTile(num_items, num_remaining, tile_idx, block_offset, tile_status); - - // Output the total number of items selected - if (threadIdx.x == 0) - { - *d_num_selected = total_selected; - } - } - -#else - // Blocks may not be launched in increasing order, so work-steal tiles - - // Get first tile index - if (threadIdx.x == 0) - temp_storage.tile_idx = queue.Drain(1); - - __syncthreads(); - - int tile_idx = temp_storage.tile_idx; - Offset block_offset = Offset(TILE_ITEMS) * tile_idx; - Offset num_remaining = num_items - block_offset; - - while (num_remaining > TILE_ITEMS) - { - // Consume full tile - ConsumeTile(num_items, num_remaining, tile_idx, block_offset, tile_status); - - // Get next tile - if (threadIdx.x == 0) - temp_storage.tile_idx = queue.Drain(1); - - __syncthreads(); - - tile_idx = temp_storage.tile_idx; - block_offset = Offset(TILE_ITEMS) * tile_idx; - num_remaining = num_items - block_offset; - } - - // Consume the last (and potentially partially-full) tile - if 
(num_remaining > 0) - { - Offset total_selected = ConsumeTile(num_items, num_remaining, tile_idx, block_offset, tile_status); - - // Output the total number of items selected - if (threadIdx.x == 0) - { - *d_num_selected = total_selected; - } - } - -#endif - - } - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/SRC/cub/block_range/specializations/block_range_histo_gatomic.cuh b/SRC/cub/block_range/specializations/block_range_histo_gatomic.cuh deleted file mode 100644 index ccfbd643..00000000 --- a/SRC/cub/block_range/specializations/block_range_histo_gatomic.cuh +++ /dev/null @@ -1,184 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::BlockRangeHistogramGlobalAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram. - */ - -#pragma once - -#include - -#include "../../util_type.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - - -/** - * BlockRangeHistogramGlobalAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using global atomics - */ -template < - typename BlockRangeHistogramPolicy, ///< Tuning policy - int BINS, ///< Number of histogram bins per channel - int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed) - int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed - typename InputIterator, ///< The input iterator type \iterator. 
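The selection machinery deleted above reduces to a classic flag-and-scan compaction: evaluate the selection operator into 0/1 flags, exclusive-scan the flags to obtain scatter offsets, then scatter selected items front-to-back. A minimal host-side sketch of that invariant (plain C++; the predicate and values are hypothetical, not part of this patch):

    // Flag-and-scan compaction: the invariant behind BlockRangeSelect's
    // per-tile scatter offsets (illustrative host code).
    #include <cstdio>
    #include <vector>

    int main()
    {
        std::vector<int> items = {4, 1, 7, 1, 0, 9};
        std::vector<int> selected(items.size()), offsets(items.size()), out(items.size());

        // 1. Selection operator produces 0/1 flags (stand-in for select_op)
        for (size_t i = 0; i < items.size(); ++i)
            selected[i] = (items[i] > 3) ? 1 : 0;

        // 2. Exclusive prefix sum of the flags yields each item's scatter offset
        int running = 0;
        for (size_t i = 0; i < items.size(); ++i)
        {
            offsets[i] = running;
            running += selected[i];
        }

        // 3. Selected items land front-to-back at their offsets
        for (size_t i = 0; i < items.size(); ++i)
            if (selected[i]) out[offsets[i]] = items[i];

        printf("num_selected = %d\n", running);   // 3; out begins [4, 7, 9]
        return 0;
    }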
Must have an an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1] - typename HistoCounter, ///< Integer type for counting sample occurrences per histogram bin - typename Offset> ///< Signed integer type for global offsets -struct BlockRangeHistogramGlobalAtomic -{ - //--------------------------------------------------------------------- - // Types and constants - //--------------------------------------------------------------------- - - // Sample type - typedef typename std::iterator_traits::value_type SampleT; - - // Constants - enum - { - BLOCK_THREADS = BlockRangeHistogramPolicy::BLOCK_THREADS, - ITEMS_PER_THREAD = BlockRangeHistogramPolicy::ITEMS_PER_THREAD, - TILE_CHANNEL_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - TILE_ITEMS = TILE_CHANNEL_ITEMS * CHANNELS, - }; - - // Shared memory type required by this thread block - typedef NullType TempStorage; - - - //--------------------------------------------------------------------- - // Per-thread fields - //--------------------------------------------------------------------- - - /// Reference to output histograms - HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]; - - /// Input data to reduce - InputIterator d_in; - - - //--------------------------------------------------------------------- - // Interface - //--------------------------------------------------------------------- - - /** - * Constructor - */ - __device__ __forceinline__ BlockRangeHistogramGlobalAtomic( - TempStorage &temp_storage, ///< Reference to temp_storage - InputIterator d_in, ///< Input data to reduce - HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]) ///< Reference to output histograms - : - d_in(d_in), - d_out_histograms(d_out_histograms) - {} - - - /** - * Process a single tile of input - */ - template - __device__ __forceinline__ void ConsumeTile( - Offset block_offset, ///< The offset the tile to consume - int valid_items = TILE_ITEMS) ///< The number of valid items in the tile - { - if (FULL_TILE) - { - // Full tile of samples to read and composite - SampleT items[ITEMS_PER_THREAD][CHANNELS]; - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - #pragma unroll - for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) - { - if (CHANNEL < ACTIVE_CHANNELS) - { - items[ITEM][CHANNEL] = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL]; - } - } - } - - __threadfence_block(); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - #pragma unroll - for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) - { - if (CHANNEL < ACTIVE_CHANNELS) - { - atomicAdd(d_out_histograms[CHANNEL] + items[ITEM][CHANNEL], 1); - } - } - } - } - else - { - // Only a partially-full tile of samples to read and composite - int bounds = valid_items - (threadIdx.x * CHANNELS); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - #pragma unroll - for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) - { - if (((ACTIVE_CHANNELS == CHANNELS) || (CHANNEL < ACTIVE_CHANNELS)) && ((ITEM * BLOCK_THREADS * CHANNELS) + CHANNEL < bounds)) - { - SampleT item = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL]; - atomicAdd(d_out_histograms[CHANNEL] + item, 1); - } - } - } - - } - } - - - /** - * Aggregate results into output - */ - __device__ __forceinline__ void AggregateOutput() - {} -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git 
a/SRC/cub/block_range/specializations/block_range_histo_satomic.cuh b/SRC/cub/block_range/specializations/block_range_histo_satomic.cuh deleted file mode 100644 index 8c625695..00000000 --- a/SRC/cub/block_range/specializations/block_range_histo_satomic.cuh +++ /dev/null @@ -1,245 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::BlockRangeHistogramSharedAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using shared atomics - */ - -#pragma once - -#include - -#include "../../util_type.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * BlockRangeHistogramSharedAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using shared atomics - */ -template < - typename BlockRangeHistogramPolicy, ///< Tuning policy - int BINS, ///< Number of histogram bins - int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed) - int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed - typename InputIterator, ///< The input iterator type \iterator. 
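The global-atomic specialization deleted above bins every sample straight into device memory, which is why low sample diversity (bin collisions) serializes it. A stripped-down single-channel sketch of the same strategy (CUDA; the kernel name and grid-stride loop are illustrative, not part of this patch):

    // Each thread composites its samples into the global histogram directly.
    __global__ void HistoGlobalAtomicSketch(
        const unsigned char *d_samples,    // values in [0..BINS-1]
        unsigned int        *d_histogram,  // BINS counters, zeroed beforehand
        int                  num_samples)
    {
        int idx    = blockIdx.x * blockDim.x + threadIdx.x;
        int stride = gridDim.x * blockDim.x;

        // Popular bins force the hardware to serialize these atomics,
        // matching the performance caveat in the deleted documentation.
        for (int i = idx; i < num_samples; i += stride)
            atomicAdd(&d_histogram[d_samples[i]], 1u);
    }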
Must have an an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1] - typename HistoCounter, ///< Integer type for counting sample occurrences per histogram bin - typename Offset> ///< Signed integer type for global offsets -struct BlockRangeHistogramSharedAtomic -{ - //--------------------------------------------------------------------- - // Types and constants - //--------------------------------------------------------------------- - - // Sample type - typedef typename std::iterator_traits::value_type SampleT; - - // Constants - enum - { - BLOCK_THREADS = BlockRangeHistogramPolicy::BLOCK_THREADS, - ITEMS_PER_THREAD = BlockRangeHistogramPolicy::ITEMS_PER_THREAD, - TILE_CHANNEL_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - TILE_ITEMS = TILE_CHANNEL_ITEMS * CHANNELS, - }; - - /// Shared memory type required by this thread block - struct _TempStorage - { - HistoCounter histograms[ACTIVE_CHANNELS][BINS + 1]; // One word of padding between channel histograms to prevent warps working on different histograms from hammering on the same bank - }; - - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - //--------------------------------------------------------------------- - // Per-thread fields - //--------------------------------------------------------------------- - - /// Reference to temp_storage - _TempStorage &temp_storage; - - /// Reference to output histograms - HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]; - - /// Input data to reduce - InputIterator d_in; - - - //--------------------------------------------------------------------- - // Interface - //--------------------------------------------------------------------- - - /** - * Constructor - */ - __device__ __forceinline__ BlockRangeHistogramSharedAtomic( - TempStorage &temp_storage, ///< Reference to temp_storage - InputIterator d_in, ///< Input data to reduce - HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]) ///< Reference to output histograms - : - temp_storage(temp_storage.Alias()), - d_in(d_in), - d_out_histograms(d_out_histograms) - { - // Initialize histogram bin counts to zeros - #pragma unroll - for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) - { - int histo_offset = 0; - - #pragma unroll - for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) - { - this->temp_storage.histograms[CHANNEL][histo_offset + threadIdx.x] = 0; - } - // Finish up with guarded initialization if necessary - if ((BINS % BLOCK_THREADS != 0) && (histo_offset + threadIdx.x < BINS)) - { - this->temp_storage.histograms[CHANNEL][histo_offset + threadIdx.x] = 0; - } - } - - __syncthreads(); - } - - - /** - * Process a single tile of input - */ - template - __device__ __forceinline__ void ConsumeTile( - Offset block_offset, ///< The offset the tile to consume - int valid_items = TILE_ITEMS) ///< The number of valid items in the tile - { - if (FULL_TILE) - { - // Full tile of samples to read and composite - SampleT items[ITEMS_PER_THREAD][CHANNELS]; - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - #pragma unroll - for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) - { - if (CHANNEL < ACTIVE_CHANNELS) - { - items[ITEM][CHANNEL] = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL]; - } - } - } - - __threadfence_block(); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - #pragma unroll - for (int CHANNEL = 0; CHANNEL < CHANNELS; 
++CHANNEL) - { - if (CHANNEL < ACTIVE_CHANNELS) - { - atomicAdd(temp_storage.histograms[CHANNEL] + items[ITEM][CHANNEL], 1); - } - } - } - - __threadfence_block(); - } - else - { - // Only a partially-full tile of samples to read and composite - int bounds = valid_items - (threadIdx.x * CHANNELS); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - #pragma unroll - for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL) - { - if (((ACTIVE_CHANNELS == CHANNELS) || (CHANNEL < ACTIVE_CHANNELS)) && ((ITEM * BLOCK_THREADS * CHANNELS) + CHANNEL < bounds)) - { - SampleT item = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL]; - atomicAdd(temp_storage.histograms[CHANNEL] + item, 1); - } - } - } - - } - } - - - /** - * Aggregate results into output - */ - __device__ __forceinline__ void AggregateOutput() - { - // Barrier to ensure shared memory histograms are coherent - __syncthreads(); - - // Copy shared memory histograms to output - int channel_offset = (blockIdx.x * BINS); - - #pragma unroll - for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) - { - int histo_offset = 0; - - #pragma unroll - for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) - { - HistoCounter count = temp_storage.histograms[CHANNEL][histo_offset + threadIdx.x]; - - d_out_histograms[CHANNEL][channel_offset + histo_offset + threadIdx.x] = count; - } - - // Finish up with guarded initialization if necessary - if ((BINS % BLOCK_THREADS != 0) && (histo_offset + threadIdx.x < BINS)) - { - HistoCounter count = temp_storage.histograms[CHANNEL][histo_offset + threadIdx.x]; - - d_out_histograms[CHANNEL][channel_offset + histo_offset + threadIdx.x] = count; - } - } - } -}; - - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/SRC/cub/block_range/specializations/block_range_histo_sort.cuh b/SRC/cub/block_range/specializations/block_range_histo_sort.cuh deleted file mode 100644 index c28d1a74..00000000 --- a/SRC/cub/block_range/specializations/block_range_histo_sort.cuh +++ /dev/null @@ -1,364 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
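The shared-atomic specialization above privatizes the histogram per thread block: counts accumulate in shared memory (with one word of padding between channel histograms to keep warps off the same bank) and are written out once per block. A condensed single-channel sketch of the idea (CUDA; this variant flushes with global atomics rather than emitting per-block partial histograms the way the deleted AggregateOutput does):

    __global__ void HistoSharedAtomicSketch(
        const unsigned char *d_samples,
        unsigned int        *d_histogram,   // 256 global counters
        int                  num_samples)
    {
        __shared__ unsigned int smem_histo[256];

        // Cooperatively zero the block-private histogram
        for (int bin = threadIdx.x; bin < 256; bin += blockDim.x)
            smem_histo[bin] = 0;
        __syncthreads();

        // Composite samples into shared memory (cheap atomics)
        int idx    = blockIdx.x * blockDim.x + threadIdx.x;
        int stride = gridDim.x * blockDim.x;
        for (int i = idx; i < num_samples; i += stride)
            atomicAdd(&smem_histo[d_samples[i]], 1u);
        __syncthreads();

        // One flush per block into the global histogram
        for (int bin = threadIdx.x; bin < 256; bin += blockDim.x)
            atomicAdd(&d_histogram[bin], smem_histo[bin]);
    }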
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::BlockRangeHistogramSort implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using local sorting - */ - -#pragma once - -#include - -#include "../../block/block_radix_sort.cuh" -#include "../../block/block_discontinuity.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * BlockRangeHistogramSort implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using local sorting - */ -template < - typename BlockRangeHistogramPolicy, ///< Tuning policy - int BINS, ///< Number of histogram bins per channel - int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed) - int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed - typename InputIterator, ///< The input iterator type \iterator. Must have an an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1] - typename HistoCounter, ///< Integer type for counting sample occurrences per histogram bin - typename Offset> ///< Signed integer type for global offsets -struct BlockRangeHistogramSort -{ - //--------------------------------------------------------------------- - // Types and constants - //--------------------------------------------------------------------- - - // Sample type - typedef typename std::iterator_traits::value_type SampleT; - - // Constants - enum - { - BLOCK_THREADS = BlockRangeHistogramPolicy::BLOCK_THREADS, - ITEMS_PER_THREAD = BlockRangeHistogramPolicy::ITEMS_PER_THREAD, - TILE_CHANNEL_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - TILE_ITEMS = TILE_CHANNEL_ITEMS * CHANNELS, - - STRIPED_COUNTERS_PER_THREAD = (BINS + BLOCK_THREADS - 1) / BLOCK_THREADS, - }; - - // Parameterize BlockRadixSort type for our thread block - typedef BlockRadixSort BlockRadixSortT; - - // Parameterize BlockDiscontinuity type for our thread block - typedef BlockDiscontinuity BlockDiscontinuityT; - - /// Shared memory type required by this thread block - union _TempStorage - { - // Storage for sorting bin values - typename BlockRadixSortT::TempStorage sort; - - struct - { - // Storage for detecting discontinuities in the tile of sorted bin values - typename BlockDiscontinuityT::TempStorage flag; - - // Storage for noting begin/end offsets of bin runs in the tile of sorted bin values - int run_begin[BLOCK_THREADS * STRIPED_COUNTERS_PER_THREAD]; - int run_end[BLOCK_THREADS * STRIPED_COUNTERS_PER_THREAD]; - }; - }; - - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - // Discontinuity functor - struct DiscontinuityOp - { - // Reference to temp_storage - _TempStorage &temp_storage; - - // Constructor - __device__ __forceinline__ 
DiscontinuityOp(_TempStorage &temp_storage) : - temp_storage(temp_storage) - {} - - // Discontinuity predicate - __device__ __forceinline__ bool operator()(const SampleT &a, const SampleT &b, int b_index) - { - if (a != b) - { - // Note the begin/end offsets in shared storage - temp_storage.run_begin[b] = b_index; - temp_storage.run_end[a] = b_index; - - return true; - } - else - { - return false; - } - } - }; - - - //--------------------------------------------------------------------- - // Per-thread fields - //--------------------------------------------------------------------- - - /// Reference to temp_storage - _TempStorage &temp_storage; - - /// Histogram counters striped across threads - HistoCounter thread_counters[ACTIVE_CHANNELS][STRIPED_COUNTERS_PER_THREAD]; - - /// Reference to output histograms - HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]; - - /// Input data to reduce - InputIterator d_in; - - - //--------------------------------------------------------------------- - // Interface - //--------------------------------------------------------------------- - - /** - * Constructor - */ - __device__ __forceinline__ BlockRangeHistogramSort( - TempStorage &temp_storage, ///< Reference to temp_storage - InputIterator d_in, ///< Input data to reduce - HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS]) ///< Reference to output histograms - : - temp_storage(temp_storage.Alias()), - d_in(d_in), - d_out_histograms(d_out_histograms) - { - // Initialize histogram counters striped across threads - #pragma unroll - for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) - { - #pragma unroll - for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER) - { - thread_counters[CHANNEL][COUNTER] = 0; - } - } - } - - - /** - * Composite a tile of input items - */ - __device__ __forceinline__ void Composite( - SampleT (&items)[ITEMS_PER_THREAD], ///< Tile of samples - HistoCounter thread_counters[STRIPED_COUNTERS_PER_THREAD]) ///< Histogram counters striped across threads - { - // Sort bytes in blocked arrangement - BlockRadixSortT(temp_storage.sort).Sort(items); - - __syncthreads(); - - // Initialize the shared memory's run_begin and run_end for each bin - #pragma unroll - for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER) - { - temp_storage.run_begin[(COUNTER * BLOCK_THREADS) + threadIdx.x] = TILE_CHANNEL_ITEMS; - temp_storage.run_end[(COUNTER * BLOCK_THREADS) + threadIdx.x] = TILE_CHANNEL_ITEMS; - } - - __syncthreads(); - - // Note the begin/end run offsets of bin runs in the sorted tile - int flags[ITEMS_PER_THREAD]; // unused - DiscontinuityOp flag_op(temp_storage); - BlockDiscontinuityT(temp_storage.flag).FlagHeads(flags, items, flag_op); - - // Update begin for first item - if (threadIdx.x == 0) temp_storage.run_begin[items[0]] = 0; - - __syncthreads(); - - // Composite into histogram - // Initialize the shared memory's run_begin and run_end for each bin - #pragma unroll - for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER) - { - int bin = (COUNTER * BLOCK_THREADS) + threadIdx.x; - HistoCounter run_length = temp_storage.run_end[bin] - temp_storage.run_begin[bin]; - - thread_counters[COUNTER] += run_length; - } - } - - - /** - * Process one channel within a tile. 
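The sort-based specialization works on a different principle: radix-sort the tile, then each bin's count is the distance between the first and last occurrence of that value, which the DiscontinuityOp above records as run boundaries. A host-side sketch of the run-length idea (plain C++, reusing the 12-sample example from the documentation below):

    #include <algorithm>
    #include <cstdio>

    int main()
    {
        const int BINS = 8;
        unsigned char tile[] = {2, 6, 7, 5, 3, 0, 2, 1, 7, 0, 6, 2};
        const int n = sizeof(tile) / sizeof(tile[0]);

        std::sort(tile, tile + n);                 // stand-in for BlockRadixSort

        int run_begin[BINS], run_end[BINS];
        for (int b = 0; b < BINS; ++b)
            run_begin[b] = run_end[b] = n;         // empty runs by default

        run_begin[tile[0]] = 0;                    // first item opens the first run
        for (int i = 1; i < n; ++i)
            if (tile[i] != tile[i - 1])            // discontinuity = run boundary
            {
                run_end[tile[i - 1]] = i;
                run_begin[tile[i]]   = i;
            }
        run_end[tile[n - 1]] = n;

        for (int b = 0; b < BINS; ++b)             // prints 2 1 3 1 0 1 2 2
            printf("%d ", run_end[b] - run_begin[b]);
        return 0;
    }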
- */ - template - __device__ __forceinline__ void ConsumeTileChannel( - int channel, - Offset block_offset, - int valid_items) - { - // Load items in striped fashion - if (FULL_TILE) - { - // Full tile of samples to read and composite - SampleT items[ITEMS_PER_THREAD]; - - // Unguarded loads - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - items[ITEM] = d_in[channel + block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS)]; - } - - // Composite our histogram data - Composite(items, thread_counters[channel]); - } - else - { - // Only a partially-full tile of samples to read and composite - SampleT items[ITEMS_PER_THREAD]; - - // Assign our tid as the bin for out-of-bounds items (to give an even distribution), and keep track of how oob items to subtract out later - int bounds = (valid_items - (threadIdx.x * CHANNELS)); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - items[ITEM] = ((ITEM * BLOCK_THREADS * CHANNELS) < bounds) ? - d_in[channel + block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS)] : - 0; - } - - // Composite our histogram data - Composite(items, thread_counters[channel]); - - __syncthreads(); - - // Correct the overcounting in the zero-bin from invalid (out-of-bounds) items - if (threadIdx.x == 0) - { - int extra = (TILE_ITEMS - valid_items) / CHANNELS; - thread_counters[channel][0] -= extra; - } - } - } - - - /** - * Template iteration over channels (to silence not-unrolled warnings for SM10-13). Inductive step. - */ - template - struct IterateChannels - { - /** - * Process one channel within a tile. - */ - static __device__ __forceinline__ void ConsumeTileChannel( - BlockRangeHistogramSort *cta, - Offset block_offset, - int valid_items) - { - __syncthreads(); - - cta->ConsumeTileChannel(CHANNEL, block_offset, valid_items); - - IterateChannels::ConsumeTileChannel(cta, block_offset, valid_items); - } - }; - - - /** - * Template iteration over channels (to silence not-unrolled warnings for SM10-13). Base step. 
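Note the correction at the end of the partial-tile path above: out-of-bounds slots are loaded as bin 0 so every thread still composites a full complement of items, and thread 0 afterwards subtracts the known number of phantom zeros. The arithmetic, as a sketch (values hypothetical):

    // Padded (invalid) samples all land in the channel's zero bin; the
    // overcount is simply the padding divided among channels.
    int tile_items  = 1024;   // BLOCK_THREADS * ITEMS_PER_THREAD * CHANNELS
    int valid_items = 1000;   // samples actually present in the last tile
    int channels    = 2;

    int extra = (tile_items - valid_items) / channels;   // 12 phantom zeros
    // thread 0 then executes: thread_counters[channel][0] -= extra;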
- */ - template - struct IterateChannels - { - static __device__ __forceinline__ void ConsumeTileChannel(BlockRangeHistogramSort *cta, Offset block_offset, int valid_items) {} - }; - - - /** - * Process a single tile of input - */ - template - __device__ __forceinline__ void ConsumeTile( - Offset block_offset, ///< The offset the tile to consume - int valid_items = TILE_ITEMS) ///< The number of valid items in the tile - { - // First channel - ConsumeTileChannel(0, block_offset, valid_items); - - // Iterate through remaining channels - IterateChannels::ConsumeTileChannel(this, block_offset, valid_items); - } - - - /** - * Aggregate results into output - */ - __device__ __forceinline__ void AggregateOutput() - { - // Copy counters striped across threads into the histogram output - #pragma unroll - for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) - { - int channel_offset = (blockIdx.x * BINS); - - #pragma unroll - for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER) - { - int bin = (COUNTER * BLOCK_THREADS) + threadIdx.x; - - if ((STRIPED_COUNTERS_PER_THREAD * BLOCK_THREADS == BINS) || (bin < BINS)) - { - d_out_histograms[CHANNEL][channel_offset + bin] = thread_counters[CHANNEL][COUNTER]; - } - } - } - } -}; - - - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/SRC/cub/cub.cuh b/SRC/cub/cub.cuh index a0902ba8..3ece0f65 100644 --- a/SRC/cub/cub.cuh +++ b/SRC/cub/cub.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -44,15 +44,19 @@ #include "block/block_reduce.cuh" #include "block/block_scan.cuh" #include "block/block_store.cuh" -#include "block/block_shift.cuh" +//#include "block/block_shift.cuh" // Device #include "device/device_histogram.cuh" #include "device/device_partition.cuh" #include "device/device_radix_sort.cuh" #include "device/device_reduce.cuh" +#include "device/device_run_length_encode.cuh" #include "device/device_scan.cuh" +#include "device/device_segmented_radix_sort.cuh" +#include "device/device_segmented_reduce.cuh" #include "device/device_select.cuh" +#include "device/device_spmv.cuh" // Grid //#include "grid/grid_barrier.cuh" @@ -60,9 +64,6 @@ #include "grid/grid_mapping.cuh" #include "grid/grid_queue.cuh" -// Host -#include "host/spinlock.cuh" - // Thread #include "thread/thread_load.cuh" #include "thread/thread_operators.cuh" @@ -85,7 +86,6 @@ #include "iterator/transform_input_iterator.cuh" // Util -#include "util_allocator.cuh" #include "util_arch.cuh" #include "util_debug.cuh" #include "util_device.cuh" diff --git a/SRC/cub/device/device_histogram.cuh b/SRC/cub/device/device_histogram.cuh index 1ce687e2..a2556a6b 100644 --- a/SRC/cub/device/device_histogram.cuh +++ b/SRC/cub/device/device_histogram.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -29,15 +29,16 @@ /** * \file - * cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within global memory. + * cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory. */ #pragma once #include #include +#include -#include "dispatch/device_histogram_dispatch.cuh" +#include "dispatch/dispatch_histogram.cuh" #include "../util_namespace.cuh" /// Optional outer namespace(s) @@ -48,8 +49,8 @@ namespace cub { /** - * \brief DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within global memory. ![](histogram_logo.png) - * \ingroup DeviceModule + * \brief DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory. ![](histogram_logo.png) + * \ingroup SingleModule * * \par Overview * A histogram @@ -58,594 +59,806 @@ namespace cub { * \par Usage Considerations * \cdp_class{DeviceHistogram} * - * \par Performance - * - * \image html histo_perf.png - * */ struct DeviceHistogram { /******************************************************************//** - * \name Single-channel samples + * \name Evenly-segmented bin ranges *********************************************************************/ //@{ - /** - * \brief Computes a device-wide histogram using fast block-wide sorting. + * \brief Computes an intensity histogram from a sequence of data samples using equal-width bins. * * \par - * - The total number of samples across all channels (\p num_samples) must be a whole multiple of \p CHANNELS. - * - Delivers consistent throughput regardless of sample diversity - * - Histograms having a large number of bins (e.g., thousands) may adversely affect shared memory occupancy and performance (or even the ability to launch). - * - Performance is often improved when referencing input samples through a texture-caching iterator (e.g., cub::TexObjInputIterator). + * - The number of histogram bins is (\p num_levels - 1) + * - All bins comprise the same width of sample values: (\p upper_level - \p lower_level) / (\p num_levels - 1) * - \devicestorage - * - \cdp * * \par Snippet - * The code snippet below illustrates the computation of a 8-bin histogram of - * single-channel unsigned char samples. 
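The new equal-width ("even") interface replaces the compile-time BINS parameter with runtime level boundaries: num_levels boundaries define num_levels - 1 bins of identical width, and a sample maps to a bin by linear rescaling. A host-side check of that rule against the six-bin float example below (illustrative only):

    #include <cstdio>

    int main()
    {
        int   num_levels  = 7;                    // seven boundaries, six bins
        float lower_level = 0.0f, upper_level = 12.0f;
        float scale = (num_levels - 1) / (upper_level - lower_level);

        float samples[] = {2.2f, 6.0f, 7.1f, 2.9f, 3.5f, 0.3f, 2.9f, 2.0f, 6.1f, 999.5f};
        int   histogram[6] = {0};

        for (float s : samples)
        {
            int bin = (int)((s - lower_level) * scale);    // bin width = 2.0 here
            if (s >= lower_level && bin < num_levels - 1)  // out-of-range samples are dropped
                ++histogram[bin];
        }
        for (int b = 0; b < 6; ++b)
            printf("%d ", histogram[b]);                   // 1 0 5 0 3 0
        return 0;
    }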
+ * The code snippet below illustrates the computation of a six-bin histogram + * from a sequence of float samples + * * \par * \code * #include // or equivalently * - * // Declare, allocate, and initialize device pointers for input and histogram - * int num_samples; // e.g., 12 - * unsigned char *d_samples; // e.g., [2, 6, 7, 5, 3, 0, 2, 1, 7, 0, 6, 2] - * unsigned int *d_histogram; // e.g., [ , , , , , , , ] + * // Declare, allocate, and initialize device-accessible pointers for input samples and + * // output histogram + * int num_samples; // e.g., 10 + * float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, 0.3, 2.9, 2.0, 6.1, 999.5] + * int* d_histogram; // e.g., [ -, -, -, -, -, -, -, -] + * int num_levels; // e.g., 7 (seven level boundaries for six bins) + * float lower_level; // e.g., 0.0 (lower sample value boundary of lowest bin) + * float upper_level; // e.g., 12.0 (upper sample value boundary of upper bin) * ... * - * // Wrap d_samples device pointer in a random-access texture iterator - * cub::TexObjInputIterator d_samples_tex_itr; - * d_samples_tex_itr.BindTexture(d_samples, num_samples * sizeof(unsigned char)); - * * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceHistogram::SingleChannelSorting<8>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histogram, num_samples); + * void* d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * - * // Compute histogram - * cub::DeviceHistogram::SingleChannelSorting<8>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histogram, num_samples); - * - * // Unbind texture iterator - * d_samples_tex_itr.UnbindTexture(); + * // Compute histograms + * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples); * - * // d_histogram <-- [2, 1, 3, 1, 0, 1, 2, 2] + * // d_histogram <-- [1, 0, 5, 0, 3, 0, 0, 0]; * * \endcode * - * \tparam BINS Number of histogram bins per channel - * \tparam InputIterator [inferred] Random-access input iterator type for reading input samples. (Must have an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1]) \iterator - * \tparam HistoCounter [inferred] Integer type for counting sample occurrences per histogram bin + * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator + * \tparam CounterT [inferred] Integer type for histogram bin counters + * \tparam LevelT [inferred] Type for specifying boundaries (levels) + * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 */ template < - int BINS, - typename InputIterator, - typename HistoCounter> + typename SampleIteratorT, + typename CounterT, + typename LevelT, + typename OffsetT> CUB_RUNTIME_FUNCTION - static cudaError_t SingleChannelSorting( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIterator d_samples, ///< [in] Input samples - HistoCounter* d_histogram, ///< [out] Array of BINS counters of integral type \p HistoCounter. - int num_samples, ///< [in] Number of samples to process - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + static cudaError_t HistogramEven( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of data samples. + CounterT* d_histogram, ///< [out] The pointer to the histogram counter output array of length num_levels - 1. + int num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is num_levels - 1. + LevelT lower_level, ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin. + LevelT upper_level, ///< [in] The upper sample value bound (exclusive) for the highest histogram bin. + OffsetT num_samples, ///< [in] The number of input samples (i.e., the length of \p d_samples) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. { - // Signed integer type for global offsets - typedef int Offset; - - // Dispatch type - typedef DeviceHistogramDispatch< - DEVICE_HISTO_SORT, - BINS, - 1, - 1, - InputIterator, - HistoCounter, - Offset> - DeviceHistogramDispatch; - - return DeviceHistogramDispatch::Dispatch( + /// The sample value type of the input iterator + typedef typename std::iterator_traits::value_type SampleT; + + CounterT* d_histogram1[1] = {d_histogram}; + int num_levels1[1] = {num_levels}; + LevelT lower_level1[1] = {lower_level}; + LevelT upper_level1[1] = {upper_level}; + + return MultiHistogramEven<1, 1>( d_temp_storage, temp_storage_bytes, d_samples, - &d_histogram, + d_histogram1, + num_levels1, + lower_level1, + upper_level1, num_samples, + 1, + sizeof(SampleT) * num_samples, stream, debug_synchronous); } /** - * \brief Computes a device-wide histogram using shared-memory atomic read-modify-write operations. + * \brief Computes an intensity histogram from a sequence of data samples using equal-width bins. * * \par - * - Input samples having lower diversity can cause performance to be degraded due to serializations from bin-collisions. - * - Histograms having a large number of bins (e.g., thousands) may adversely affect shared memory occupancy and performance (or even the ability to launch). - * - Performance is often improved when referencing input samples through a texture-caching iterator (e.g., cub::TexObjInputIterator). + * - A two-dimensional region of interest within \p d_samples can be specified + * using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters. 
+     * - The row stride must be a whole multiple of the sample data type
+     *   size, i.e., (row_stride_bytes % sizeof(SampleT)) == 0.
+     * - The number of histogram bins is (\p num_levels - 1)
+     * - All bins comprise the same width of sample values: (\p upper_level - \p lower_level) / (\p num_levels - 1)
      * - \devicestorage
-     * - \cdp
      *
      * \par Snippet
-     * The code snippet below illustrates the computation of a 8-bin histogram of
-     * single-channel unsigned char samples.
+     * The code snippet below illustrates the computation of a six-bin histogram
+     * from a 2x5 region of interest within a flattened 2x7 array of float samples.
+     *
      * \par
      * \code
      * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
      *
-     * // Declare, allocate, and initialize device pointers for input and histogram
-     * int num_samples;            // e.g., 12
-     * unsigned char *d_samples;   // e.g., [2, 6, 7, 5, 3, 0, 2, 1, 7, 0, 6, 2]
-     * unsigned int *d_histogram;  // e.g., [ , , , , , , , ]
+     * // Declare, allocate, and initialize device-accessible pointers for input samples and
+     * // output histogram
+     * int      num_row_samples;   // e.g., 5
+     * int      num_rows;          // e.g., 2;
+     * size_t   row_stride_bytes;  // e.g., 7 * sizeof(float)
+     * float*   d_samples;         // e.g., [2.2, 6.0, 7.1, 2.9, 3.5,   -, -,
+     *                             //        0.3, 2.9, 2.0, 6.1, 999.5, -, -]
+     * int*     d_histogram;       // e.g., [ -, -, -, -, -, -, -, -]
+     * int      num_levels;        // e.g., 7 (seven level boundaries for six bins)
+     * float    lower_level;       // e.g., 0.0 (lower sample value boundary of lowest bin)
+     * float    upper_level;       // e.g., 12.0 (upper sample value boundary of upper bin)
      * ...
      *
-     * // Wrap d_samples device pointer in a random-access texture iterator
-     * cub::TexObjInputIterator<unsigned char> d_samples_tex_itr;
-     * d_samples_tex_itr.BindTexture(d_samples, num_samples * sizeof(unsigned char));
-     *
      * // Determine temporary device storage requirements
-     * void *d_temp_storage = NULL;
+     * void* d_temp_storage = NULL;
      * size_t temp_storage_bytes = 0;
-     * cub::DeviceHistogram::SingleChannelSorting<8>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histogram, num_samples);
+     * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
+     *     d_samples, d_histogram, num_levels, lower_level, upper_level,
+     *     num_row_samples, num_rows, row_stride_bytes);
      *
      * // Allocate temporary storage
      * cudaMalloc(&d_temp_storage, temp_storage_bytes);
      *
-     * // Compute histogram
-     * cub::DeviceHistogram::SingleChannelSharedAtomic<8>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histogram, num_samples);
-     *
-     * // Unbind texture iterator
-     * d_samples_tex_itr.UnbindTexture();
+     * // Compute histograms
+     * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
+     *     d_samples, d_histogram, num_levels, lower_level, upper_level,
+     *     num_row_samples, num_rows, row_stride_bytes);
      *
-     * // d_histogram <-- [2, 1, 3, 1, 0, 1, 2, 2]
+     * // d_histogram <-- [1, 0, 5, 0, 3, 0, 0, 0];
      *
      * \endcode
      *
-     * \tparam BINS            Number of histogram bins per channel
-     * \tparam InputIterator   [inferred] Random-access input iterator type for reading input samples. (Must have an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1]) \iterator
-     * \tparam HistoCounter    [inferred] Integer type for counting sample occurrences per histogram bin
+     * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples.
\iterator + * \tparam CounterT [inferred] Integer type for histogram bin counters + * \tparam LevelT [inferred] Type for specifying boundaries (levels) + * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 */ template < - int BINS, - typename InputIterator, - typename HistoCounter> + typename SampleIteratorT, + typename CounterT, + typename LevelT, + typename OffsetT> CUB_RUNTIME_FUNCTION - static cudaError_t SingleChannelSharedAtomic( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIterator d_samples, ///< [in] Input samples - HistoCounter* d_histogram, ///< [out] Array of BINS counters of integral type \p HistoCounter. - int num_samples, ///< [in] Number of samples to process - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + static cudaError_t HistogramEven( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of data samples. + CounterT* d_histogram, ///< [out] The pointer to the histogram counter output array of length num_levels - 1. + int num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is num_levels - 1. + LevelT lower_level, ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin. + LevelT upper_level, ///< [in] The upper sample value bound (exclusive) for the highest histogram bin. + OffsetT num_row_samples, ///< [in] The number of data samples per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + size_t row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
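For the two-dimensional flavor, the region of interest is addressed by row pitch: consecutive rows begin row_stride_bytes apart and only the first num_row_samples entries of each row are histogrammed. A sketch of the addressing (illustrative helper, hypothetical name):

    #include <cstddef>

    template <typename SampleT>
    const SampleT& RoiSample(
        const SampleT *d_samples,         // base of the flattened array
        int            row,               // 0 <= row < num_rows
        int            col,               // 0 <= col < num_row_samples
        size_t         row_stride_bytes)  // whole multiple of sizeof(SampleT)
    {
        const char *base = (const char *) d_samples;
        return *((const SampleT *) (base + row * row_stride_bytes) + col);
    }

    // With row_stride_bytes = 7 * sizeof(float), RoiSample(d_samples, 1, 0, ...)
    // reads element 7 of the flattened 2x7 array: the 0.3 in the snippet above.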
     {
-        // Signed integer type for global offsets
-        typedef int Offset;
-
-        // Dispatch type
-        typedef DeviceHistogramDispatch<
-            DEVICE_HISTO_SHARED_ATOMIC,
-            BINS,
-            1,
-            1,
-            InputIterator,
-            HistoCounter,
-            Offset>
-            DeviceHistogramDispatch;
-
-        return DeviceHistogramDispatch::Dispatch(
+        CounterT* d_histogram1[1] = {d_histogram};
+        int num_levels1[1]        = {num_levels};
+        LevelT lower_level1[1]    = {lower_level};
+        LevelT upper_level1[1]    = {upper_level};
+
+        return MultiHistogramEven<1, 1>(
             d_temp_storage,
             temp_storage_bytes,
             d_samples,
-            &d_histogram,
-            num_samples,
+            d_histogram1,
+            num_levels1,
+            lower_level1,
+            upper_level1,
+            num_row_samples,
+            num_rows,
+            row_stride_bytes,
             stream,
             debug_synchronous);
     }
 
     /**
-     * \brief Computes a device-wide histogram using global-memory atomic read-modify-write operations.
+     * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using equal-width bins.
      *
      * \par
-     * - Input samples having lower diversity can cause performance to be degraded due to serializations from bin-collisions.
-     * - Performance is not significantly impacted when computing histograms having large numbers of bins (e.g., thousands).
-     * - Performance is often improved when referencing input samples through a texture-caching iterator (e.g., cub::TexObjInputIterator).
+     * - The input is a sequence of pixel structures, where each pixel comprises
+     *   a record of \p NUM_CHANNELS consecutive data samples (e.g., an RGBA pixel).
+     * - Of the \p NUM_CHANNELS specified, the function will only compute histograms
+     *   for the first \p NUM_ACTIVE_CHANNELS (e.g., only RGB histograms from RGBA
+     *   pixel samples).
+     * - The number of histogram bins for channel i is num_levels[i] - 1.
+     * - For channel i, the range of values for all histogram bins
+     *   has the same width: (upper_level[i] - lower_level[i]) / (num_levels[i] - 1)
      * - \devicestorage
-     * - \cdp
      *
      * \par Snippet
-     * The code snippet below illustrates the computation of a 8-bin histogram of
-     * single-channel unsigned char samples.
+     * The code snippet below illustrates the computation of three 256-bin RGB histograms
+     * from a quad-channel sequence of RGBA pixels (8 bits per channel per pixel)
+     *
      * \par
      * \code
      * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
      *
-     * // Declare, allocate, and initialize device pointers for input and histogram
-     * int num_samples;            // e.g., 12
-     * unsigned char *d_samples;   // e.g., [2, 6, 7, 5, 3, 0, 2, 1, 7, 0, 6, 2]
-     * unsigned int *d_histogram;  // e.g., [ , , , , , , , ]
+     * // Declare, allocate, and initialize device-accessible pointers for input samples
+     * // and output histograms
+     * int num_pixels;               // e.g., 5
+     * unsigned char* d_samples;     // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2),
+     *                               //        (0, 6, 7, 5), (3, 0, 2, 6)]
+     * int* d_histogram[3];          // e.g., three device pointers to three device buffers,
+     *                               //       each allocated with 256 integer counters
+     * int num_levels[3];            // e.g., {257, 257, 257};
+     * unsigned int lower_level[3];  // e.g., {0, 0, 0};
+     * unsigned int upper_level[3];  // e.g., {256, 256, 256};
      * ...
* - * // Wrap d_samples device pointer in a random-access texture iterator - * cub::TexObjInputIterator<unsigned char> d_samples_tex_itr; - * d_samples_tex_itr.BindTexture(d_samples, num_samples * sizeof(unsigned char)); - * * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; + * void* d_temp_storage = NULL; * size_t temp_storage_bytes = 0; - * cub::DeviceHistogram::SingleChannelSorting<8>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histogram, num_samples); + * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, lower_level, upper_level, num_pixels); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * - * // Compute histogram - * cub::DeviceHistogram::SingleChannelGlobalAtomic<8>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histogram, num_samples); - * - * // Unbind texture iterator - * d_samples_tex_itr.UnbindTexture(); + * // Compute histograms + * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, lower_level, upper_level, num_pixels); * - * // d_histogram <-- [2, 1, 3, 1, 0, 1, 2, 2] + * // d_histogram <-- [ [1, 0, 1, 2, 0, 0, 0, 1, 0, 0, 0, ..., 0], + * // [3, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, ..., 0], + * // [0, 0, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ] * * \endcode * - * \tparam BINS Number of histogram bins per channel - * \tparam InputIterator [inferred] Random-access input iterator type for reading input samples. (Must have an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1]) \iterator - * \tparam HistoCounter [inferred] Integer type for counting sample occurrences per histogram bin + * \tparam NUM_CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + * \tparam NUM_ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed + * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator + * \tparam CounterT [inferred] Integer type for histogram bin counters + * \tparam LevelT [inferred] Type for specifying boundaries (levels) + * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 */ template < - int BINS, - typename InputIterator, - typename HistoCounter> + int NUM_CHANNELS, + int NUM_ACTIVE_CHANNELS, + typename SampleIteratorT, + typename CounterT, + typename LevelT, + typename OffsetT> CUB_RUNTIME_FUNCTION - static cudaError_t SingleChannelGlobalAtomic( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIterator d_samples, ///< [in] Input samples - HistoCounter* d_histogram, ///< [out] Array of BINS counters of integral type \p HistoCounter. - int num_samples, ///< [in] Number of samples to process - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
+ static cudaError_t MultiHistogramEven( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel i, the allocation length of d_histogram[i] should be num_levels[i] - 1. + int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel i is num_levels[i] - 1. + LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. + LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. + OffsetT num_pixels, ///< [in] The number of multi-channel pixels (i.e., the length of \p d_samples / NUM_CHANNELS) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. { - // Signed integer type for global offsets - typedef int Offset; - - // Dispatch type - typedef DeviceHistogramDispatch< - DEVICE_HISTO_GLOBAL_ATOMIC, - BINS, - 1, - 1, - InputIterator, - HistoCounter, - Offset> - DeviceHistogramDispatch; - - return DeviceHistogramDispatch::Dispatch( + /// The sample value type of the input iterator + typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT; + + return MultiHistogramEven<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>( d_temp_storage, temp_storage_bytes, d_samples, - &d_histogram, - num_samples, + d_histogram, + num_levels, + lower_level, + upper_level, + num_pixels, + 1, + sizeof(SampleT) * NUM_CHANNELS * num_pixels, stream, debug_synchronous); } + /** + * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using equal-width bins. + * + * \par + * - The input is a sequence of pixel structures, where each pixel comprises + * a record of \p NUM_CHANNELS consecutive data samples (e.g., an RGBA pixel). + * - Of the \p NUM_CHANNELS specified, the function will only compute histograms + * for the first \p NUM_ACTIVE_CHANNELS (e.g., only RGB histograms from RGBA + * pixel samples). + * - A two-dimensional region of interest within \p d_samples can be specified + * using the \p num_row_samples, \p num_rows, and \p row_stride_bytes parameters. + * - The row stride must be a whole multiple of the sample data type + * size, i.e., (row_stride_bytes % sizeof(SampleT)) == 0. + * - The number of histogram bins for channel i is num_levels[i] - 1.
+ * - For channel i, the range of values for all histogram bins + * has the same width: (upper_level[i] - lower_level[i]) / (num_levels[i] - 1) + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the computation of three 256-bin RGB histograms from a 2x3 region of + * interest within a flattened 2x4 array of quad-channel RGBA pixels (8 bits per channel per pixel). + * + * \par + * \code + * #include <cub/cub.cuh> // or equivalently <cub/device/device_histogram.cuh> + * + * // Declare, allocate, and initialize device-accessible pointers for input samples + * // and output histograms + * int num_row_pixels; // e.g., 3 + * int num_rows; // e.g., 2 + * size_t row_stride_bytes; // e.g., 4 * sizeof(unsigned char) * NUM_CHANNELS + * unsigned char* d_samples; // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2), (-, -, -, -), + * // (0, 6, 7, 5), (3, 0, 2, 6), (1, 1, 1, 1), (-, -, -, -)] + * int* d_histogram[3]; // e.g., three device pointers to three device buffers, + * // each allocated with 256 integer counters + * int num_levels[3]; // e.g., {257, 257, 257}; + * unsigned int lower_level[3]; // e.g., {0, 0, 0}; + * unsigned int upper_level[3]; // e.g., {256, 256, 256}; + * ... + * + * // Determine temporary device storage requirements + * void* d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, lower_level, upper_level, + * num_row_pixels, num_rows, row_stride_bytes); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histograms + * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, lower_level, upper_level, + * num_row_pixels, num_rows, row_stride_bytes); + * + * // d_histogram <-- [ [1, 1, 1, 2, 0, 0, 0, 1, 0, 0, 0, ..., 0], + * // [3, 1, 0, 0, 0, 0, 2, 0, 0, 0, 0, ..., 0], + * // [0, 1, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ] + * + * \endcode + * + * \tparam NUM_CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + * \tparam NUM_ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed + * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator + * \tparam CounterT [inferred] Integer type for histogram bin counters + * \tparam LevelT [inferred] Type for specifying boundaries (levels) + * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 + */ + template < + int NUM_CHANNELS, + int NUM_ACTIVE_CHANNELS, + typename SampleIteratorT, + typename CounterT, + typename LevelT, + typename OffsetT> + CUB_RUNTIME_FUNCTION + static cudaError_t MultiHistogramEven( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel.
For channel i, the allocation length of d_histogram[i] should be num_levels[i] - 1. + int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel i is num_levels[i] - 1. + LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. + LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. + OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + size_t row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. { + /// The sample value type of the input iterator + typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT; + Int2Type<sizeof(SampleT) == 1> is_byte_sample; + + if ((sizeof(OffsetT) > sizeof(int)) && + ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) std::numeric_limits<int>::max())) + { + // Down-convert OffsetT data type + return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, int>::DispatchEven( + d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level, + (int) num_row_pixels, (int) num_rows, (int) (row_stride_bytes / sizeof(SampleT)), + stream, debug_synchronous, is_byte_sample); + } + + return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT>::DispatchEven( + d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level, + num_row_pixels, num_rows, (OffsetT) (row_stride_bytes / sizeof(SampleT)), + stream, debug_synchronous, is_byte_sample); + } + + //@} end member group /******************************************************************//** - * \name Interleaved multi-channel samples + * \name Custom bin ranges *********************************************************************/ //@{ - /** - * \brief Computes a device-wide histogram from multi-channel data using fast block-sorting. + * \brief Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels. * * \par - * - The total number of samples across all channels (\p num_samples) must be a whole multiple of \p CHANNELS. - * - Delivers consistent throughput regardless of sample diversity - * - Histograms having a large number of bins (e.g., thousands) may adversely affect shared memory occupancy and performance (or even the ability to launch). - * - Performance is often improved when referencing input samples through a texture-caching iterator (e.g., cub::TexObjInputIterator). + * - The number of histogram bins is (\p num_levels - 1) + * - The value range for bin i is [level[i], level[i+1]) * - \devicestorage - * - \cdp * * \par Snippet - * The code snippet below illustrates the computation of three 256-bin histograms from - * an input sequence of quad-channel (interleaved) unsigned char samples. - * (E.g., RGB histograms from RGBA pixel samples.)
+ * The code snippet below illustrates the computation of a six-bin histogram + * from a sequence of float samples. * * \par * \code * #include <cub/cub.cuh> // or equivalently <cub/device/device_histogram.cuh> * - * // Declare, allocate, and initialize device pointers for input and histograms - * int num_samples; // e.g., 20 (five pixels with four channels each) - * unsigned char *d_samples; // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2), - * // (0, 6, 7, 5), (3, 0, 2, 6)] - * unsigned int *d_histogram[3]; // e.g., [ [ , , , , , , , ]; - * // [ , , , , , , , ]; - * // [ , , , , , , , ] ] + * // Declare, allocate, and initialize device-accessible pointers for input samples and + * // output histogram + * int num_samples; // e.g., 10 + * float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, 0.3, 2.9, 2.0, 6.1, 999.5] + * int* d_histogram; // e.g., [ -, -, -, -, -, -] + * int num_levels; // e.g., 7 (seven level boundaries for six bins) + * float* d_levels; // e.g., [0.0, 2.0, 4.0, 6.0, 8.0, 12.0, 16.0] * ... * - * // Wrap d_samples device pointer in a random-access texture iterator - * cub::TexObjInputIterator<unsigned char> d_samples_tex_itr; - * d_samples_tex_itr.BindTexture(d_samples, num_samples * sizeof(unsigned char)); - * * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; + * void* d_temp_storage = NULL; * size_t temp_storage_bytes = 0; - * cub::DeviceHistogram::MultiChannelSorting<8, 4, 3>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histograms, num_samples); + * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, d_levels, num_samples); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Compute histograms - * cub::DeviceHistogram::MultiChannelSorting<8, 4, 3>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histograms, num_samples); + * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, d_levels, num_samples); * - * // Unbind texture iterator - * d_samples_tex_itr.UnbindTexture(); - * - * // d_histogram <-- [ [1, 0, 1, 2, 0, 0, 0, 1]; - * // [0, 3, 0, 0, 0, 0, 2, 0]; - * // [0, 0, 2, 0, 0, 0, 1, 2] ] + * // d_histogram <-- [1, 5, 0, 3, 0, 0]; * * \endcode * - * \tparam BINS Number of histogram bins per channel - * \tparam CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) - * \tparam ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed - * \tparam InputIterator [inferred] Random-access input iterator type for reading input samples. (Must have an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1]) \iterator - * \tparam HistoCounter [inferred] Integer type for counting sample occurrences per histogram bin + * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator + * \tparam CounterT [inferred] Integer type for histogram bin counters + * \tparam LevelT [inferred] Type for specifying boundaries (levels) + * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc.
\offset_size1 */ template < - int BINS, - int CHANNELS, - int ACTIVE_CHANNELS, - typename InputIterator, - typename HistoCounter> + typename SampleIteratorT, + typename CounterT, + typename LevelT, + typename OffsetT> CUB_RUNTIME_FUNCTION - static cudaError_t MultiChannelSorting( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIterator d_samples, ///< [in] Pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32b pixels where each pixel consists of four RGBA 8b samples). - HistoCounter *d_histograms[ACTIVE_CHANNELS], ///< [out] Array of active channel histogram pointers, each pointing to an output array having BINS counters of integral type \p HistoCounter. - int num_samples, ///< [in] Total number of samples to process in all channels, including non-active channels - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + static cudaError_t HistogramRange( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of data samples. + CounterT* d_histogram, ///< [out] The pointer to the histogram counter output array of length num_levels - 1. + int num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is num_levels - 1. + LevelT* d_levels, ///< [in] The pointer to the array of boundaries (levels). Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. + OffsetT num_samples, ///< [in] The number of input data samples (i.e., the length of \p d_samples) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
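+        /* Bin lookup sketch (illustrative only): a sample v is counted in bin i exactly
+         * when d_levels[i] <= v < d_levels[i+1]; samples outside the interval
+         * [d_levels[0], d_levels[num_levels - 1]) are not counted at all.  Because the
+         * levels are arbitrary, non-uniform (e.g., log-spaced) binnings are possible;
+         * a hypothetical host-side level table could be built as:
+         *
+         *   std::vector<float> levels(num_levels);
+         *   for (int i = 0; i < num_levels; ++i)
+         *       levels[i] = std::pow(2.0f, (float) i);     // 1, 2, 4, 8, ...
+         *   cudaMemcpy(d_levels, levels.data(), num_levels * sizeof(float),
+         *              cudaMemcpyHostToDevice);
+         */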
{ - // Signed integer type for global offsets - typedef int Offset; - - // Dispatch type - typedef DeviceHistogramDispatch< - DEVICE_HISTO_SORT, - BINS, - CHANNELS, - ACTIVE_CHANNELS, - InputIterator, - HistoCounter, - Offset> DeviceHistogramDispatch; - - return DeviceHistogramDispatch::Dispatch( + /// The sample value type of the input iterator + typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT; + + CounterT* d_histogram1[1] = {d_histogram}; + int num_levels1[1] = {num_levels}; + LevelT* d_levels1[1] = {d_levels}; + + return MultiHistogramRange<1, 1>( d_temp_storage, temp_storage_bytes, d_samples, - d_histograms, + d_histogram1, + num_levels1, + d_levels1, num_samples, + 1, + sizeof(SampleT) * num_samples, stream, debug_synchronous); } /** - * \brief Computes a device-wide histogram from multi-channel data using shared-memory atomic read-modify-write operations. + * \brief Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels. * * \par - * - The total number of samples across all channels (\p num_samples) must be a whole multiple of \p CHANNELS. - * - Input samples having lower diversity can cause performance to be degraded due to serializations from bin-collisions. - * - Histograms having a large number of bins (e.g., thousands) may adversely affect shared memory occupancy and performance (or even the ability to launch). - * - Performance is often improved when referencing input samples through a texture-caching iterator (e.g., cub::TexObjInputIterator). + * - A two-dimensional region of interest within \p d_samples can be specified + * using the \p num_row_samples, \p num_rows, and \p row_stride_bytes parameters. + * - The row stride must be a whole multiple of the sample data type + * size, i.e., (row_stride_bytes % sizeof(SampleT)) == 0. + * - The number of histogram bins is (\p num_levels - 1) + * - The value range for bin i is [level[i], level[i+1]) * - \devicestorage - * - \cdp * * \par Snippet - * The code snippet below illustrates the computation of three 256-bin histograms from - * an input sequence of quad-channel (interleaved) unsigned char samples. - * (E.g., RGB histograms from RGBA pixel samples.) + * The code snippet below illustrates the computation of a six-bin histogram + * from a 2x5 region of interest within a flattened 2x7 array of float samples. + * * \par * \code * #include <cub/cub.cuh> // or equivalently <cub/device/device_histogram.cuh> * - * // Declare, allocate, and initialize device pointers for input and histograms - * int num_samples; // e.g., 20 (five pixels with four channels each) - * unsigned char *d_samples; // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2), - * // (0, 6, 7, 5), (3, 0, 2, 6)] - * unsigned int *d_histogram[3]; // e.g., [ [ , , , , , , , ]; - * // [ , , , , , , , ]; - * // [ , , , , , , , ] ] + * // Declare, allocate, and initialize device-accessible pointers for input samples and + * // output histogram + * int num_row_samples; // e.g., 5 + * int num_rows; // e.g., 2 + * size_t row_stride_bytes; // e.g., 7 * sizeof(float) + * float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, -, -, + * // 0.3, 2.9, 2.0, 6.1, 999.5, -, -] + * int* d_histogram; // e.g., [ , , , , , ] + * int num_levels; // e.g., 7 (seven level boundaries for six bins) + * float *d_levels; // e.g., [0.0, 2.0, 4.0, 6.0, 8.0, 12.0, 16.0] * ...
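+ * // (row_stride_bytes is measured in bytes; the two trailing pad samples per row fall outside num_row_samples and are skipped.)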
* - * // Wrap d_samples device pointer in a random-access texture iterator - * cub::TexObjInputIterator<unsigned char> d_samples_tex_itr; - * d_samples_tex_itr.BindTexture(d_samples, num_samples * sizeof(unsigned char)); - * * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceHistogram::MultiChannelSharedAtomic<8, 4, 3>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histograms, num_samples); + * void* d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, d_levels, + * num_row_samples, num_rows, row_stride_bytes); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Compute histograms - * cub::DeviceHistogram::MultiChannelSharedAtomic<8, 4, 3>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histograms, num_samples); - * - * // Unbind texture iterator - * d_samples_tex_itr.UnbindTexture(); + * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, d_levels, + * num_row_samples, num_rows, row_stride_bytes); * - * // d_histogram <-- [ [1, 0, 1, 2, 0, 0, 0, 1]; - * // [0, 3, 0, 0, 0, 0, 2, 0]; - * // [0, 0, 2, 0, 0, 0, 1, 2] ] + * // d_histogram <-- [1, 5, 0, 3, 0, 0]; * * \endcode * - * \tparam BINS Number of histogram bins per channel - * \tparam CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) - * \tparam ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed - * \tparam InputIterator [inferred] Random-access input iterator type for reading input samples. (Must have an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1]) \iterator - * \tparam HistoCounter [inferred] Integer type for counting sample occurrences per histogram bin + * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator + * \tparam CounterT [inferred] Integer type for histogram bin counters + * \tparam LevelT [inferred] Type for specifying boundaries (levels) + * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 */ template < - int BINS, - int CHANNELS, - int ACTIVE_CHANNELS, - typename InputIterator, - typename HistoCounter> + typename SampleIteratorT, + typename CounterT, + typename LevelT, + typename OffsetT> CUB_RUNTIME_FUNCTION - static cudaError_t MultiChannelSharedAtomic( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIterator d_samples, ///< [in] Pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32b pixels where each pixel consists of four RGBA 8b samples). - HistoCounter *d_histograms[ACTIVE_CHANNELS], ///< [out] Array of active channel histogram pointers, each pointing to an output array having BINS counters of integral type \p HistoCounter. - int num_samples, ///< [in] Total number of samples to process in all channels, including non-active channels - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within.
Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + static cudaError_t HistogramRange( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of data samples. + CounterT* d_histogram, ///< [out] The pointer to the histogram counter output array of length num_levels - 1. + int num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is num_levels - 1. + LevelT* d_levels, ///< [in] The pointer to the array of boundaries (levels). Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. + OffsetT num_row_samples, ///< [in] The number of data samples per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + size_t row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. { - // Signed integer type for global offsets - typedef int Offset; - - // Dispatch type - typedef DeviceHistogramDispatch< - DEVICE_HISTO_SHARED_ATOMIC, - BINS, - CHANNELS, - ACTIVE_CHANNELS, - InputIterator, - HistoCounter, - Offset> DeviceHistogramDispatch; - - return DeviceHistogramDispatch::Dispatch( + CounterT* d_histogram1[1] = {d_histogram}; + int num_levels1[1] = {num_levels}; + LevelT* d_levels1[1] = {d_levels}; + + return MultiHistogramRange<1, 1>( d_temp_storage, temp_storage_bytes, d_samples, - d_histograms, - num_samples, + d_histogram1, + num_levels1, + d_levels1, + num_row_samples, + num_rows, + row_stride_bytes, stream, debug_synchronous); } - /** - * \brief Computes a device-wide histogram from multi-channel data using global-memory atomic read-modify-write operations. + * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using the specified bin boundary levels. * * \par - * - The total number of samples across all channels (\p num_samples) must be a whole multiple of \p CHANNELS. - * - Input samples having lower diversity can cause performance to be degraded due to serializations from bin-collisions. - * - Performance is not significantly impacted when computing histograms having large numbers of bins (e.g., thousands). - * - Performance is often improved when referencing input samples through a texture-caching iterator (e.g., cub::TexObjInputIterator). + * - The input is a sequence of pixel structures, where each pixel comprises + * a record of \p NUM_CHANNELS consecutive data samples (e.g., an RGBA pixel). + * - Of the \p NUM_CHANNELS specified, the function will only compute histograms + * for the first \p NUM_ACTIVE_CHANNELS (e.g., RGB histograms from RGBA + * pixel samples). 
+ * - The number of histogram bins for channel i is num_levels[i] - 1. + * - For channel i, bin ranges are defined by consecutive boundary pairings + * in d_levels[i]: lower boundaries are inclusive and upper boundaries are exclusive. * - \devicestorage - * - \cdp * * \par Snippet - * The code snippet below illustrates the computation of three 256-bin histograms from - * an input sequence of quad-channel (interleaved) unsigned char samples. - * (E.g., RGB histograms from RGBA pixel samples.) + * The code snippet below illustrates the computation of three 4-bin RGB histograms + * from a quad-channel sequence of RGBA pixels (8 bits per channel per pixel). * * \par * \code * #include <cub/cub.cuh> // or equivalently <cub/device/device_histogram.cuh> * - * // Declare, allocate, and initialize device pointers for input and histograms - * int num_samples; // e.g., 20 (five pixels with four channels each) - * unsigned char *d_samples; // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2), - * // (0, 6, 7, 5), (3, 0, 2, 6)] - * unsigned int *d_histogram[3]; // e.g., [ [ , , , , , , , ]; - * // [ , , , , , , , ]; - * // [ , , , , , , , ] ] + * // Declare, allocate, and initialize device-accessible pointers for input samples + * // and output histograms + * int num_pixels; // e.g., 5 + * unsigned char *d_samples; // e.g., [(2, 6, 7, 5),(3, 0, 2, 1),(7, 0, 6, 2), + * // (0, 6, 7, 5),(3, 0, 2, 6)] + * unsigned int *d_histogram[3]; // e.g., [[ -, -, -, -],[ -, -, -, -],[ -, -, -, -]]; + * int num_levels[3]; // e.g., {5, 5, 5}; + * unsigned int *d_levels[3]; // e.g., [ [0, 2, 4, 6, 8], + * // [0, 2, 4, 6, 8], + * // [0, 2, 4, 6, 8] ]; * ... * - * // Wrap d_samples device pointer in a random-access texture iterator - * cub::TexObjInputIterator<unsigned char> d_samples_tex_itr; - * d_samples_tex_itr.BindTexture(d_samples, num_samples * sizeof(unsigned char)); - * * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; + * void* d_temp_storage = NULL; * size_t temp_storage_bytes = 0; - * cub::DeviceHistogram::MultiChannelGlobalAtomic<8, 4, 3>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histograms, num_samples); + * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, d_levels, num_pixels); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Compute histograms - * cub::DeviceHistogram::MultiChannelGlobalAtomic<8, 4, 3>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histograms, num_samples); - * - * // Unbind texture iterator - * d_samples_tex_itr.UnbindTexture(); + * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, d_levels, num_pixels); * - * // d_histogram <-- [ [1, 0, 1, 2, 0, 0, 0, 1]; - * // [0, 3, 0, 0, 0, 0, 2, 0]; - * // [0, 0, 2, 0, 0, 0, 1, 2] ] + * // d_histogram <-- [ [1, 3, 0, 1], + * // [3, 0, 0, 2], + * // [0, 2, 0, 3] ] * * \endcode * - * \tparam BINS Number of histogram bins per channel - * \tparam CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) - * \tparam ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed - * \tparam InputIterator [inferred] Random-access input iterator type for reading input samples.
(Must have an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1]) \iterator - * \tparam HistoCounter [inferred] Integer type for counting sample occurrences per histogram bin + * \tparam NUM_CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + * \tparam NUM_ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed + * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator + * \tparam CounterT [inferred] Integer type for histogram bin counters + * \tparam LevelT [inferred] Type for specifying boundaries (levels) + * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 */ template < - int BINS, - int CHANNELS, - int ACTIVE_CHANNELS, - typename InputIterator, - typename HistoCounter> + int NUM_CHANNELS, + int NUM_ACTIVE_CHANNELS, + typename SampleIteratorT, + typename CounterT, + typename LevelT, + typename OffsetT> CUB_RUNTIME_FUNCTION - static cudaError_t MultiChannelGlobalAtomic( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIterator d_samples, ///< [in] Pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32b pixels where each pixel consists of four RGBA 8b samples). - HistoCounter *d_histograms[ACTIVE_CHANNELS], ///< [out] Array of active channel histogram pointers, each pointing to an output array having BINS counters of integral type \p HistoCounter. - int num_samples, ///< [in] Total number of samples to process in all channels, including non-active channels - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + static cudaError_t MultiHistogramRange( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel i, the allocation length of d_histogram[i] should be num_levels[i] - 1. + int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel i is num_levels[i] - 1. + LevelT* d_levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
+ OffsetT num_pixels, ///< [in] The number of multi-channel pixels (i.e., the length of \p d_samples / NUM_CHANNELS) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. { - // Signed integer type for global offsets - typedef int Offset; - - // Dispatch type - typedef DeviceHistogramDispatch< - DEVICE_HISTO_GLOBAL_ATOMIC, - BINS, - CHANNELS, - ACTIVE_CHANNELS, - InputIterator, - HistoCounter, - Offset> - DeviceHistogramDispatch; - - return DeviceHistogramDispatch::Dispatch( + /// The sample value type of the input iterator + typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT; + + return MultiHistogramRange<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>( d_temp_storage, temp_storage_bytes, d_samples, - d_histograms, - num_samples, + d_histogram, + num_levels, + d_levels, + num_pixels, + 1, + sizeof(SampleT) * NUM_CHANNELS * num_pixels, stream, debug_synchronous); } - //@} end member group -}; + /** + * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using the specified bin boundary levels. + * + * \par + * - The input is a sequence of pixel structures, where each pixel comprises + * a record of \p NUM_CHANNELS consecutive data samples (e.g., an RGBA pixel). + * - Of the \p NUM_CHANNELS specified, the function will only compute histograms + * for the first \p NUM_ACTIVE_CHANNELS (e.g., RGB histograms from RGBA + * pixel samples). + * - A two-dimensional region of interest within \p d_samples can be specified + * using the \p num_row_samples, \p num_rows, and \p row_stride_bytes parameters. + * - The row stride must be a whole multiple of the sample data type + * size, i.e., (row_stride_bytes % sizeof(SampleT)) == 0. + * - The number of histogram bins for channel i is num_levels[i] - 1. + * - For channel i, bin ranges are defined by consecutive boundary pairings + * in d_levels[i]: lower boundaries are inclusive and upper boundaries are exclusive. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the computation of three 4-bin RGB histograms from a 2x3 region of + * interest within a flattened 2x4 array of quad-channel RGBA pixels (8 bits per channel per pixel). + * + * \par + * \code + * #include <cub/cub.cuh> // or equivalently <cub/device/device_histogram.cuh> + * + * // Declare, allocate, and initialize device-accessible pointers for input samples + * // and output histograms + * int num_row_pixels; // e.g., 3 + * int num_rows; // e.g., 2 + * size_t row_stride_bytes; // e.g., 4 * sizeof(unsigned char) * NUM_CHANNELS + * unsigned char* d_samples; // e.g., [(2, 6, 7, 5),(3, 0, 2, 1),(1, 1, 1, 1),(-, -, -, -), + * // (7, 0, 6, 2),(0, 6, 7, 5),(3, 0, 2, 6),(-, -, -, -)] + * int* d_histogram[3]; // e.g., [[ -, -, -, -],[ -, -, -, -],[ -, -, -, -]]; + * int num_levels[3]; // e.g., {5, 5, 5}; + * unsigned int* d_levels[3]; // e.g., [ [0, 2, 4, 6, 8], + * // [0, 2, 4, 6, 8], + * // [0, 2, 4, 6, 8] ]; + * ...
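+ * // (Each row's fourth pixel is padding: row_stride_bytes spans 4 pixels while num_row_pixels is 3, so it is never read.)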
+ * + * // Determine temporary device storage requirements + * void* d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, d_levels, num_row_pixels, num_rows, row_stride_bytes); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histograms + * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, d_levels, num_row_pixels, num_rows, row_stride_bytes); + * + * // d_histogram <-- [ [2, 3, 0, 1], + * // [4, 0, 0, 2], + * // [1, 2, 0, 3] ] + * + * \endcode + * + * \tparam NUM_CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + * \tparam NUM_ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed + * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator + * \tparam CounterT [inferred] Integer type for histogram bin counters + * \tparam LevelT [inferred] Type for specifying boundaries (levels) + * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 + */ + template < + int NUM_CHANNELS, + int NUM_ACTIVE_CHANNELS, + typename SampleIteratorT, + typename CounterT, + typename LevelT, + typename OffsetT> + CUB_RUNTIME_FUNCTION + static cudaError_t MultiHistogramRange( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel i, the allocation length of d_histogram[i] should be num_levels[i] - 1. + int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel i is num_levels[i] - 1. + LevelT* d_levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. + OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + size_t row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
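+        /* ROI parameter sketch for a pitch-allocated RGBA8 image (illustrative only;
+         * d_image, pitch, width, height, roi_w, and roi_h are hypothetical names):
+         *
+         *   unsigned char* d_image;  size_t pitch;
+         *   cudaMallocPitch((void**) &d_image, &pitch, (size_t) width * 4, height);
+         *   ...
+         *   cub::DeviceHistogram::MultiHistogramRange<4, 3>(
+         *       d_temp, temp_bytes, d_image, d_histogram, num_levels, d_levels,
+         *       roi_w,     // pixels per row in the region of interest
+         *       roi_h,     // rows in the region of interest
+         *       pitch);    // bytes between row starts; a multiple of sizeof(SampleT)
+         */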
+ { + /// The sample value type of the input iterator + typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT; + Int2Type<sizeof(SampleT) == 1> is_byte_sample; + + if ((sizeof(OffsetT) > sizeof(int)) && + ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) std::numeric_limits<int>::max())) + { + // Down-convert OffsetT data type + return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, int>::DispatchRange( + d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, + (int) num_row_pixels, (int) num_rows, (int) (row_stride_bytes / sizeof(SampleT)), + stream, debug_synchronous, is_byte_sample); + } + + return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT>::DispatchRange( + d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, + num_row_pixels, num_rows, (OffsetT) (row_stride_bytes / sizeof(SampleT)), + stream, debug_synchronous, is_byte_sample); + } -/** - * \example example_device_histogram.cu - */ + + + //@} end member group +}; } // CUB namespace CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/device/device_partition.cuh b/SRC/cub/device/device_partition.cuh index c9418af0..50535400 100644 --- a/SRC/cub/device/device_partition.cuh +++ b/SRC/cub/device/device_partition.cuh @@ -1,275 +1,273 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::DevicePartition provides device-wide, parallel operations for partitioning sequences of data items residing within global memory. - */ - -#pragma once - -#include <stdio.h> -#include <iterator> - -#include "dispatch/device_select_dispatch.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \brief DevicePartition provides device-wide, parallel operations for partitioning sequences of data items residing within global memory.
![](partition_logo.png) - * \ingroup DeviceModule - * - * \par Overview - * These operations apply a selection criterion to construct a partitioned output sequence from items selected/unselected from - * a specified input sequence. - * - * \par Usage Considerations - * \cdp_class{DevicePartition} - * - * \par Performance - * \linear_performance{partition} - * - * \par - * The following chart illustrates DevicePartition::If - * performance across different CUDA architectures for \p int32 items, - * where 50% of the items are randomly selected for the first partition. - * \plots_below - * - * \image html partition_if_int32_50_percent.png - * - */ -struct DevicePartition -{ - /** - * \brief Uses the \p d_flags sequence to split the corresponding items from \p d_in into a partitioned sequence \p d_out. The total number of items copied into the first partition is written to \p d_num_selected. ![](partition_flags_logo.png) - * - * \par - * - The value type of \p d_flags must be castable to \p bool (e.g., \p bool, \p char, \p int, etc.). - * - Copies of the selected items are compacted into \p d_out and maintain their original - * relative ordering, however copies of the unselected items are compacted into the - * rear of \p d_out in reverse order. - * - \devicestorage - * - \cdp - * - * \par Snippet - * The code snippet below illustrates the compaction of items selected from an \p int device vector. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device pointers for input, flags, and output - * int num_items; // e.g., 8 - * int *d_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] - * char *d_flags; // e.g., [1, 0, 0, 1, 0, 1, 1, 0] - * int *d_out; // e.g., [ , , , , , , , ] - * int *d_num_selected; // e.g., [ ] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run selection - * cub::DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected, num_items); - * - * // d_out <-- [1, 4, 6, 7, 8, 5, 3, 2] - * // d_num_selected <-- [4] - * - * \endcode - * - * \tparam InputIterator [inferred] Random-access input iterator type for reading input items \iterator - * \tparam FlagIterator [inferred] Random-access input iterator type for reading selection flags \iterator - * \tparam OutputIterator [inferred] Random-access output iterator type for writing output items \iterator - * \tparam NumSelectedIterator [inferred] Output iterator type for recording the number of items selected \iterator - */ - template < - typename InputIterator, - typename FlagIterator, - typename OutputIterator, - typename NumSelectedIterator> - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Flagged( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIterator d_in, ///< [in] Pointer to the input sequence of data items - FlagIterator d_flags, ///< [in] Pointer to the input sequence of selection flags - OutputIterator d_out, ///< [out] Pointer to the output sequence of partitioned data items - NumSelectedIterator d_num_selected, ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition) - int num_items, ///< [in] Total number of items to select from - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - typedef int Offset; // Signed integer type for global offsets - typedef NullType SelectOp; // Selection op (not used) - typedef NullType EqualityOp; // Equality operator (not used) - - return DeviceSelectDispatch::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_flags, - d_out, - d_num_selected, - SelectOp(), - EqualityOp(), - num_items, - stream, - debug_synchronous); - } - - - /** - * \brief Uses the \p select_op functor to split the corresponding items from \p d_in into a partitioned sequence \p d_out. The total number of items copied into the first partition is written to \p d_num_selected. ![](partition_logo.png) - * - * \par - * - Copies of the selected items are compacted into \p d_out and maintain their original - * relative ordering, however copies of the unselected items are compacted into the - * rear of \p d_out in reverse order. - * - \devicestorage - * - \cdp - * - * \par Performance - * The following charts illustrate saturated partition-if performance across different - * CUDA architectures for \p int32 and \p int64 items, respectively. Items are - * selected for the first partition with 50% probability. - * - * \image html partition_if_int32_50_percent.png - * \image html partition_if_int64_50_percent.png - * - * \par - * The following charts are similar, but 5% selection probability for the first partition: - * - * \image html partition_if_int32_5_percent.png - * \image html partition_if_int64_5_percent.png - * - * \par Snippet - * The code snippet below illustrates the compaction of items selected from an \p int device vector. - * \par - * \code - * #include // or equivalently - * - * // Functor type for selecting values less than some criteria - * struct LessThan - * { - * int compare; - * - * CUB_RUNTIME_FUNCTION __forceinline__ - * LessThan(int compare) : compare(compare) {} - * - * CUB_RUNTIME_FUNCTION __forceinline__ - * bool operator()(const int &a) const { - * return (a < compare); - * } - * }; - * - * // Declare, allocate, and initialize device pointers for input and output - * int num_items; // e.g., 8 - * int *d_in; // e.g., [0, 2, 3, 9, 5, 2, 81, 8] - * int *d_out; // e.g., [ , , , , , , , ] - * int *d_num_selected; // e.g., [ ] - * LessThan select_op(7); - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected, num_items, select_op); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run selection - * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected, num_items, select_op); - * - * // d_out <-- [0, 2, 3, 5, 2, 8, 81, 9] - * // d_num_selected <-- [5] - * - * \endcode - * - * \tparam InputIterator [inferred] Random-access input iterator type for reading input items \iterator - * \tparam OutputIterator [inferred] Random-access output iterator type for writing output items \iterator - * \tparam NumSelectedIterator [inferred] Output iterator type for recording the number of items selected \iterator - * \tparam SelectOp [inferred] Selection functor type having member bool operator()(const T &a) - */ - template < - typename InputIterator, - typename OutputIterator, - typename NumSelectedIterator, - typename SelectOp> - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t If( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIterator d_in, ///< [in] Pointer to the input sequence of data items - OutputIterator d_out, ///< [out] Pointer to the output sequence of partitioned data items - NumSelectedIterator d_num_selected, ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition) - int num_items, ///< [in] Total number of items to select from - SelectOp select_op, ///< [in] Unary selection operator - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - typedef int Offset; // Signed integer type for global offsets - typedef NullType* FlagIterator; // Flag iterator type (not used) - typedef NullType EqualityOp; // Equality operator (not used) - - return DeviceSelectDispatch::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - NULL, - d_out, - d_num_selected, - select_op, - EqualityOp(), - num_items, - stream, - debug_synchronous); - } - -}; - -/** - * \example example_device_partition_flagged.cu - * \example example_device_partition_if.cu - */ - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DevicePartition provides device-wide, parallel operations for partitioning sequences of data items residing within device-accessible memory. + */ + +#pragma once + +#include <stdio.h> +#include <iterator> + +#include "dispatch/dispatch_select_if.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DevicePartition provides device-wide, parallel operations for partitioning sequences of data items residing within device-accessible memory. ![](partition_logo.png) + * \ingroup SingleModule + * + * \par Overview + * These operations apply a selection criterion to construct a partitioned output sequence from items selected/unselected from + * a specified input sequence. + * + * \par Usage Considerations + * \cdp_class{DevicePartition} + * + * \par Performance + * \linear_performance{partition} + * + * \par + * The following chart illustrates DevicePartition::If + * performance across different CUDA architectures for \p int32 items, + * where 50% of the items are randomly selected for the first partition. + * \plots_below + * + * \image html partition_if_int32_50_percent.png + * + */ +struct DevicePartition +{ + /** + * \brief Uses the \p d_flags sequence to split the corresponding items from \p d_in into a partitioned sequence \p d_out. The total number of items copied into the first partition is written to \p d_num_selected_out. ![](partition_flags_logo.png) + * + * \par + * - The value type of \p d_flags must be castable to \p bool (e.g., \p bool, \p char, \p int, etc.). + * - Copies of the selected items are compacted into \p d_out and maintain their original + * relative ordering, however copies of the unselected items are compacted into the + * rear of \p d_out in reverse order. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the compaction of items selected from an \p int device vector. + * \par + * \code + * #include <cub/cub.cuh> // or equivalently <cub/device/device_partition.cuh> + * + * // Declare, allocate, and initialize device-accessible pointers for input, flags, and output + * int num_items; // e.g., 8 + * int *d_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] + * char *d_flags; // e.g., [1, 0, 0, 1, 0, 1, 1, 0] + * int *d_out; // e.g., [ , , , , , , , ] + * int *d_num_selected_out; // e.g., [ ] + * ...
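+ * // (Flag values need only be castable to bool; any nonzero flag selects the corresponding item.)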
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run selection + * cub::DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items); + * + * // d_out <-- [1, 4, 6, 7, 8, 5, 3, 2] + * // d_num_selected_out <-- [4] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam FlagIterator [inferred] Random-access input iterator type for reading selection flags \iterator + * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing output items \iterator + * \tparam NumSelectedIteratorT [inferred] Output iterator type for recording the number of items selected \iterator + */ + template < + typename InputIteratorT, + typename FlagIterator, + typename OutputIteratorT, + typename NumSelectedIteratorT> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Flagged( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + FlagIterator d_flags, ///< [in] Pointer to the input sequence of selection flags + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of partitioned data items + NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition) + int num_items, ///< [in] Total number of items to select from + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + typedef int OffsetT; // Signed integer type for global offsets + typedef NullType SelectOp; // Selection op (not used) + typedef NullType EqualityOp; // Equality operator (not used) + + return DispatchSelectIf::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_flags, + d_out, + d_num_selected_out, + SelectOp(), + EqualityOp(), + num_items, + stream, + debug_synchronous); + } + + + /** + * \brief Uses the \p select_op functor to split the corresponding items from \p d_in into a partitioned sequence \p d_out. The total number of items copied into the first partition is written to \p d_num_selected_out. ![](partition_logo.png) + * + * \par + * - Copies of the selected items are compacted into \p d_out and maintain their original + * relative ordering, however copies of the unselected items are compacted into the + * rear of \p d_out in reverse order. + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated partition-if performance across different + * CUDA architectures for \p int32 and \p int64 items, respectively. Items are + * selected for the first partition with 50% probability. 
+ * + * \image html partition_if_int32_50_percent.png + * \image html partition_if_int64_50_percent.png + * + * \par + * The following charts are similar, but 5% selection probability for the first partition: + * + * \image html partition_if_int32_5_percent.png + * \image html partition_if_int64_5_percent.png + * + * \par Snippet + * The code snippet below illustrates the compaction of items selected from an \p int device vector. + * \par + * \code + * #include // or equivalently + * + * // Functor type for selecting values less than some criteria + * struct LessThan + * { + * int compare; + * + * CUB_RUNTIME_FUNCTION __forceinline__ + * LessThan(int compare) : compare(compare) {} + * + * CUB_RUNTIME_FUNCTION __forceinline__ + * bool operator()(const int &a) const { + * return (a < compare); + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 8 + * int *d_in; // e.g., [0, 2, 3, 9, 5, 2, 81, 8] + * int *d_out; // e.g., [ , , , , , , , ] + * int *d_num_selected_out; // e.g., [ ] + * LessThan select_op(7); + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run selection + * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op); + * + * // d_out <-- [0, 2, 3, 5, 2, 8, 81, 9] + * // d_num_selected_out <-- [5] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing output items \iterator + * \tparam NumSelectedIteratorT [inferred] Output iterator type for recording the number of items selected \iterator + * \tparam SelectOp [inferred] Selection functor type having member bool operator()(const T &a) + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename NumSelectedIteratorT, + typename SelectOp> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t If( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of partitioned data items + NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition) + int num_items, ///< [in] Total number of items to select from + SelectOp select_op, ///< [in] Unary selection operator + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
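[Editorial aside, not part of the patch.] Every entry point in these headers follows the same two-phase protocol spelled out in the parameter docs above: call once with d_temp_storage == NULL to obtain temp_storage_bytes, allocate that many bytes, then call again with identical arguments to do the work. A minimal error-checked wrapper around DevicePartition::If, assuming only the CUDA runtime and this header (the helper name run_partition_if is illustrative):

#include <cub/cub.cuh>

// Illustrative helper (not part of the patch): wraps the size-query /
// allocate / run sequence with basic error propagation.
template <typename SelectOpT>
cudaError_t run_partition_if(const int *d_in, int *d_out,
                             int *d_num_selected_out, int num_items,
                             SelectOpT select_op)
{
    void   *d_temp_storage     = NULL;
    size_t  temp_storage_bytes = 0;

    // Pass 1: d_temp_storage is NULL, so only the required size is computed.
    cudaError_t error = cub::DevicePartition::If(d_temp_storage,
        temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op);
    if (error != cudaSuccess) return error;

    // Allocate exactly the reported number of bytes.
    error = cudaMalloc(&d_temp_storage, temp_storage_bytes);
    if (error != cudaSuccess) return error;

    // Pass 2: same arguments, non-NULL storage -- the partition actually runs.
    error = cub::DevicePartition::If(d_temp_storage, temp_storage_bytes,
        d_in, d_out, d_num_selected_out, num_items, select_op);

    cudaFree(d_temp_storage);
    return error;
}

The same wrapper shape applies unchanged to the radix-sort and reduce entry points later in this patch; only the inner call differs.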
+ { + typedef int OffsetT; // Signed integer type for global offsets + typedef NullType* FlagIterator; // FlagT iterator type (not used) + typedef NullType EqualityOp; // Equality operator (not used) + + return DispatchSelectIf::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + NULL, + d_out, + d_num_selected_out, + select_op, + EqualityOp(), + num_items, + stream, + debug_synchronous); + } + +}; + +/** + * \example example_device_partition_flagged.cu + * \example example_device_partition_if.cu + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/SRC/cub/device/device_radix_sort.cuh b/SRC/cub/device/device_radix_sort.cuh index 8585f088..1c0bdbea 100644 --- a/SRC/cub/device/device_radix_sort.cuh +++ b/SRC/cub/device/device_radix_sort.cuh @@ -1,420 +1,797 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within global memory. - */ - -#pragma once - -#include -#include - -#include "dispatch/device_radix_sort_dispatch.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \brief DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within global memory. ![](sorting_logo.png) - * \ingroup DeviceModule - * - * \par Overview - * The [radix sorting method](http://en.wikipedia.org/wiki/Radix_sort) arranges - * items into ascending order. It relies upon a positional representation for - * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits, - * characters, etc.) 
specified from least-significant to most-significant. For a - * given input sequence of keys and a set of rules specifying a total ordering - * of the symbolic alphabet, the radix sorting method produces a lexicographic - * ordering of those keys. - * - * \par - * DeviceRadixSort can sort all of the built-in C++ numeric primitive types, e.g.: - * unsigned char, \p int, \p double, etc. Although the direct radix sorting - * method can only be applied to unsigned integral types, BlockRadixSort - * is able to sort signed and floating-point types via simple bit-wise transformations - * that ensure lexicographic key ordering. - * - * \par Usage Considerations - * \cdp_class{DeviceRadixSort} - * - * \par Performance - * \linear_performance{radix sort} The following chart illustrates DeviceRadixSort::SortKeys - * performance across different CUDA architectures for uniform-random \p uint32 keys. - * \plots_below - * - * \image html lsb_radix_sort_int32_keys.png - * - */ -struct DeviceRadixSort -{ - /** - * \brief Sorts key-value pairs into ascending order. - * - * \par - * - The sorting operation requires a pair of key buffers and a pair of value - * buffers. Each pair is wrapped in a DoubleBuffer structure whose member - * DoubleBuffer::Current() references the active buffer. The currently-active - * buffer may be changed by the sorting operation. - * - \devicestorage - * - \cdp - * - * \par Performance - * The following charts illustrate saturated sorting performance across different - * CUDA architectures for uniform-random uint32,uint32 and - * uint64,uint64 pairs, respectively. - * - * \image html lsb_radix_sort_int32_pairs.png - * \image html lsb_radix_sort_int64_pairs.png - * - * \par Snippet - * The code snippet below illustrates the sorting of a device vector of \p int keys - * with associated vector of \p int values. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device pointers for sorting data - * int num_items; // e.g., 7 - * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_key_alt_buf; // e.g., [ ... ] - * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] - * int *d_value_alt_buf; // e.g., [ ... ] - * ... - * - * // Create a set of DoubleBuffers to wrap pairs of device pointers - * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); - * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items); - * - * // d_keys.Current() <-- [0, 3, 5, 6, 7, 8, 9] - * // d_values.Current() <-- [5, 4, 3, 1, 2, 0, 6] - * - * \endcode - * - * \tparam Key [inferred] Key type - * \tparam Value [inferred] Value type - */ - template < - typename Key, - typename Value> - CUB_RUNTIME_FUNCTION - static cudaError_t SortPairs( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys - DoubleBuffer &d_values, ///< [in,out] Double-buffer of values whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values - int num_items, ///< [in] Number of items to reduce - int begin_bit = 0, ///< [in] [optional] The first (least-significant) bit index needed for key comparison - int end_bit = sizeof(Key) * 8, ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - typedef int Offset; - - return DeviceRadixSortDispatch::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - begin_bit, - end_bit, - stream, - debug_synchronous); - } - - - /** - * \brief Sorts key-value pairs into descending order. - * - * \par - * - The sorting operation requires a pair of key buffers and a pair of value - * buffers. Each pair is wrapped in a DoubleBuffer structure whose member - * DoubleBuffer::Current() references the active buffer. The currently-active - * buffer may be changed by the sorting operation. - * - \devicestorage - * - \cdp - * - * \par Performance - * Performance is similar to DeviceRadixSort::SortPairs. - * - * \par Snippet - * The code snippet below illustrates the sorting of a device vector of \p int keys - * with associated vector of \p int values. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device pointers for sorting data - * int num_items; // e.g., 7 - * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_key_alt_buf; // e.g., [ ... ] - * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] - * int *d_value_alt_buf; // e.g., [ ... ] - * ... - * - * // Create a set of DoubleBuffers to wrap pairs of device pointers - * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); - * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items); - * - * // d_keys.Current() <-- [9, 8, 7, 6, 5, 3, 0] - * // d_values.Current() <-- [6, 0, 2, 1, 3, 4, 5] - * - * \endcode - * - * \tparam Key [inferred] Key type - * \tparam Value [inferred] Value type - */ - template < - typename Key, - typename Value> - CUB_RUNTIME_FUNCTION - static cudaError_t SortPairsDescending( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys - DoubleBuffer &d_values, ///< [in,out] Double-buffer of values whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values - int num_items, ///< [in] Number of items to reduce - int begin_bit = 0, ///< [in] [optional] The first (least-significant) bit index needed for key comparison - int end_bit = sizeof(Key) * 8, ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - typedef int Offset; - - return DeviceRadixSortDispatch::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - begin_bit, - end_bit, - stream, - debug_synchronous); - } - - - /** - * \brief Sorts keys into ascending order - * - * \par - * - The sorting operation requires a pair of key buffers. The pair is - * wrapped in a DoubleBuffer structure whose member DoubleBuffer::Current() - * references the active buffer. The currently-active buffer may be changed - * by the sorting operation. - * - \devicestorage - * - \cdp - * - * \par Performance - * The following charts illustrate saturated sorting performance across different - * CUDA architectures for uniform-random \p uint32 and \p uint64 keys, respectively. - * - * \image html lsb_radix_sort_int32_keys.png - * \image html lsb_radix_sort_int64_keys.png - * - * \par Snippet - * The code snippet below illustrates the sorting of a device vector of \p int keys. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device pointers for sorting data - * int num_items; // e.g., 7 - * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_key_alt_buf; // e.g., [ ... ] - * ... - * - * // Create a DoubleBuffer to wrap the pair of device pointers - * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items); - * - * // d_keys.Current() <-- [0, 3, 5, 6, 7, 8, 9] - * - * \endcode - * - * \tparam Key [inferred] Key type - */ - template - CUB_RUNTIME_FUNCTION - static cudaError_t SortKeys( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys - int num_items, ///< [in] Number of items to reduce - int begin_bit = 0, ///< [in] [optional] The first (least-significant) bit index needed for key comparison - int end_bit = sizeof(Key) * 8, ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - typedef int Offset; - - // Null value type - DoubleBuffer d_values; - - return DeviceRadixSortDispatch::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - begin_bit, - end_bit, - stream, - debug_synchronous); - } - - - /** - * \brief Sorts keys into ascending order - * - * \par - * - The sorting operation requires a pair of key buffers. The pair is - * wrapped in a DoubleBuffer structure whose member DoubleBuffer::Current() - * references the active buffer. The currently-active buffer may be changed - * by the sorting operation. - * - \devicestorage - * - \cdp - * - * \par Performance - * Performance is similar to DeviceRadixSort::SortKeys. - * - * \par Snippet - * The code snippet below illustrates the sorting of a device vector of \p int keys. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device pointers for sorting data - * int num_items; // e.g., 7 - * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_key_alt_buf; // e.g., [ ... ] - * ... - * - * // Create a DoubleBuffer to wrap the pair of device pointers - * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, num_items); - * - * // d_keys.Current() <-- [9, 8, 7, 6, 5, 3, 0] - * - * \endcode - * - * \tparam Key [inferred] Key type - */ - template - CUB_RUNTIME_FUNCTION - static cudaError_t SortKeysDescending( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys - int num_items, ///< [in] Number of items to reduce - int begin_bit = 0, ///< [in] [optional] The first (least-significant) bit index needed for key comparison - int end_bit = sizeof(Key) * 8, ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - typedef int Offset; - - // Null value type - DoubleBuffer d_values; - - return DeviceRadixSortDispatch::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - begin_bit, - end_bit, - stream, - debug_synchronous); - } - -}; - -/** - * \example example_device_radix_sort.cu - */ - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory. 
+ */ + +#pragma once + +#include +#include + +#include "dispatch/dispatch_radix_sort.cuh" +#include "../util_arch.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory. ![](sorting_logo.png) + * \ingroup SingleModule + * + * \par Overview + * The [radix sorting method](http://en.wikipedia.org/wiki/Radix_sort) arranges + * items into ascending (or descending) order. The algorithm relies upon a positional representation for + * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits, + * characters, etc.) specified from least-significant to most-significant. For a + * given input sequence of keys and a set of rules specifying a total ordering + * of the symbolic alphabet, the radix sorting method produces a lexicographic + * ordering of those keys. + * + * \par + * DeviceRadixSort can sort all of the built-in C++ numeric primitive types + * (unsigned char, \p int, \p double, etc.) as well as CUDA's \p __half + * half-precision floating-point type. Although the direct radix sorting + * method can only be applied to unsigned integral types, DeviceRadixSort + * is able to sort signed and floating-point types via simple bit-wise transformations + * that ensure lexicographic key ordering. + * + * \par Usage Considerations + * \cdp_class{DeviceRadixSort} + * + * \par Performance + * \linear_performance{radix sort} The following chart illustrates DeviceRadixSort::SortKeys + * performance across different CUDA architectures for uniform-random \p uint32 keys. + * \plots_below + * + * \image html lsb_radix_sort_int32_keys.png + * + */ +struct DeviceRadixSort +{ + + /******************************************************************//** + * \name KeyT-value pairs + *********************************************************************/ + //@{ + + /** + * \brief Sorts key-value pairs into ascending order. (~2N auxiliary storage required) + * + * \par + * - The contents of the input data are not altered by the sorting operation + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated sorting performance across different + * CUDA architectures for uniform-random uint32,uint32 and + * uint64,uint64 pairs, respectively. + * + * \image html lsb_radix_sort_int32_pairs.png + * \image html lsb_radix_sort_int64_pairs.png + * + * \par Snippet + * The code snippet below illustrates the sorting of a device vector of \p int keys + * with associated vector of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_keys_out; // e.g., [ ... ] + * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] + * int *d_values_out; // e.g., [ ... ] + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, + * d_keys_in, d_keys_out, d_values_in, d_values_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, + * d_keys_in, d_keys_out, d_values_in, d_values_out, num_items); + * + * // d_keys_out <-- [0, 3, 5, 6, 7, 8, 9] + * // d_values_out <-- [5, 4, 3, 1, 2, 0, 6] + * + * \endcode + * + * \tparam KeyT [inferred] KeyT type + * \tparam ValueT [inferred] ValueT type + */ + template < + typename KeyT, + typename ValueT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortPairs( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + const KeyT *d_keys_in, ///< [in] Pointer to the input data of key data to sort + KeyT *d_keys_out, ///< [out] Pointer to the sorted output sequence of key data + const ValueT *d_values_in, ///< [in] Pointer to the corresponding input sequence of associated value items + ValueT *d_values_out, ///< [out] Pointer to the correspondingly-reordered output sequence of associated value items + int num_items, ///< [in] Number of items to sort + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + DoubleBuffer d_values(const_cast(d_values_in), d_values_out); + + return DispatchRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + false, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts key-value pairs into ascending order. (~N auxiliary storage required) + * + * \par + * - The sorting operation is given a pair of key buffers and a corresponding + * pair of associated value buffers. Each pair is managed by a DoubleBuffer + * structure that indicates which of the two buffers is "current" (and thus + * contains the input data to be sorted). + * - The contents of both buffers within each pair may be altered by the sorting + * operation. + * - Upon completion, the sorting operation will update the "current" indicator + * within each DoubleBuffer wrapper to reference which of the two buffers + * now contains the sorted output sequence (a function of the number of key bits + * specified and the targeted device architecture). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. 
+ * - \devicestorageP + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated sorting performance across different + * CUDA architectures for uniform-random uint32,uint32 and + * uint64,uint64 pairs, respectively. + * + * \image html lsb_radix_sort_int32_pairs.png + * \image html lsb_radix_sort_int64_pairs.png + * + * \par Snippet + * The code snippet below illustrates the sorting of a device vector of \p int keys + * with associated vector of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_key_alt_buf; // e.g., [ ... ] + * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] + * int *d_value_alt_buf; // e.g., [ ... ] + * ... + * + * // Create a set of DoubleBuffers to wrap pairs of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items); + * + * // d_keys.Current() <-- [0, 3, 5, 6, 7, 8, 9] + * // d_values.Current() <-- [5, 4, 3, 1, 2, 0, 6] + * + * \endcode + * + * \tparam KeyT [inferred] KeyT type + * \tparam ValueT [inferred] ValueT type + */ + template < + typename KeyT, + typename ValueT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortPairs( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer &d_values, ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + int num_items, ///< [in] Number of items to sort + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + return DispatchRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + true, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts key-value pairs into descending order. (~2N auxiliary storage required). 
+ * + * \par + * - The contents of the input data are not altered by the sorting operation + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. + * - \devicestorage + * + * \par Performance + * Performance is similar to DeviceRadixSort::SortPairs. + * + * \par Snippet + * The code snippet below illustrates the sorting of a device vector of \p int keys + * with associated vector of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_keys_out; // e.g., [ ... ] + * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] + * int *d_values_out; // e.g., [ ... ] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, + * d_keys_in, d_keys_out, d_values_in, d_values_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, + * d_keys_in, d_keys_out, d_values_in, d_values_out, num_items); + * + * // d_keys_out <-- [9, 8, 7, 6, 5, 3, 0] + * // d_values_out <-- [6, 0, 2, 1, 3, 4, 5] + * + * \endcode + * + * \tparam KeyT [inferred] KeyT type + * \tparam ValueT [inferred] ValueT type + */ + template < + typename KeyT, + typename ValueT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortPairsDescending( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + const KeyT *d_keys_in, ///< [in] Pointer to the input data of key data to sort + KeyT *d_keys_out, ///< [out] Pointer to the sorted output sequence of key data + const ValueT *d_values_in, ///< [in] Pointer to the corresponding input sequence of associated value items + ValueT *d_values_out, ///< [out] Pointer to the correspondingly-reordered output sequence of associated value items + int num_items, ///< [in] Number of items to sort + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
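[Editorial aside, not part of the patch.] The "[begin_bit, end_bit)" bullet above deserves a concrete illustration: when only part of each key is meaningful, narrowing the bit range reduces the number of radix passes and hence the sorting cost. A sketch using the pointer-based SortKeys declared earlier in this file, comparing only the low 16 bits of 32-bit keys (the helper name sort_low_bits is illustrative; upper bits are simply ignored during comparison):

#include <cub/cub.cuh>

// Illustrative sketch (not part of the patch): sort 32-bit keys on their
// low 16 bits only. Narrowing [begin_bit, end_bit) roughly halves the
// number of radix passes in this case.
cudaError_t sort_low_bits(const unsigned int *d_keys_in,
                          unsigned int *d_keys_out, int num_items)
{
    void   *d_temp_storage     = NULL;
    size_t  temp_storage_bytes = 0;
    const int begin_bit = 0, end_bit = 16;  // compare bits [0, 16) only

    // Size query
    cudaError_t error = cub::DeviceRadixSort::SortKeys(d_temp_storage,
        temp_storage_bytes, d_keys_in, d_keys_out, num_items, begin_bit, end_bit);
    if (error != cudaSuccess) return error;

    error = cudaMalloc(&d_temp_storage, temp_storage_bytes);
    if (error != cudaSuccess) return error;

    // Actual sort
    error = cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes,
        d_keys_in, d_keys_out, num_items, begin_bit, end_bit);

    cudaFree(d_temp_storage);
    return error;
}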
+ { + // Signed integer type for global offsets + typedef int OffsetT; + + DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + DoubleBuffer d_values(const_cast(d_values_in), d_values_out); + + return DispatchRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + false, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts key-value pairs into descending order. (~N auxiliary storage required). + * + * \par + * - The sorting operation is given a pair of key buffers and a corresponding + * pair of associated value buffers. Each pair is managed by a DoubleBuffer + * structure that indicates which of the two buffers is "current" (and thus + * contains the input data to be sorted). + * - The contents of both buffers within each pair may be altered by the sorting + * operation. + * - Upon completion, the sorting operation will update the "current" indicator + * within each DoubleBuffer wrapper to reference which of the two buffers + * now contains the sorted output sequence (a function of the number of key bits + * specified and the targeted device architecture). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageP + * - \devicestorage + * + * \par Performance + * Performance is similar to DeviceRadixSort::SortPairs. + * + * \par Snippet + * The code snippet below illustrates the sorting of a device vector of \p int keys + * with associated vector of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_key_alt_buf; // e.g., [ ... ] + * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] + * int *d_value_alt_buf; // e.g., [ ... ] + * ... + * + * // Create a set of DoubleBuffers to wrap pairs of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items); + * + * // d_keys.Current() <-- [9, 8, 7, 6, 5, 3, 0] + * // d_values.Current() <-- [6, 0, 2, 1, 3, 4, 5] + * + * \endcode + * + * \tparam KeyT [inferred] KeyT type + * \tparam ValueT [inferred] ValueT type + */ + template < + typename KeyT, + typename ValueT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortPairsDescending( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer &d_values, ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + int num_items, ///< [in] Number of items to sort + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + return DispatchRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + true, + stream, + debug_synchronous); + } + + + //@} end member group + /******************************************************************//** + * \name Keys-only + *********************************************************************/ + //@{ + + + /** + * \brief Sorts keys into ascending order. (~2N auxiliary storage required) + * + * \par + * - The contents of the input data are not altered by the sorting operation + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated sorting performance across different + * CUDA architectures for uniform-random \p uint32 and \p uint64 keys, respectively. + * + * \image html lsb_radix_sort_int32_keys.png + * \image html lsb_radix_sort_int64_keys.png + * + * \par Snippet + * The code snippet below illustrates the sorting of a device vector of \p int keys. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_keys_out; // e.g., [ ... ] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items); + * + * // d_keys_out <-- [0, 3, 5, 6, 7, 8, 9] + * + * \endcode + * + * \tparam KeyT [inferred] KeyT type + */ + template + CUB_RUNTIME_FUNCTION + static cudaError_t SortKeys( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. 
When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + const KeyT *d_keys_in, ///< [in] Pointer to the input data of key data to sort + KeyT *d_keys_out, ///< [out] Pointer to the sorted output sequence of key data + int num_items, ///< [in] Number of items to sort + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // Null value type + DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + DoubleBuffer d_values; + + return DispatchRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + false, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts keys into ascending order. (~N auxiliary storage required). + * + * \par + * - The sorting operation is given a pair of key buffers managed by a + * DoubleBuffer structure that indicates which of the two buffers is + * "current" (and thus contains the input data to be sorted). + * - The contents of both buffers may be altered by the sorting operation. + * - Upon completion, the sorting operation will update the "current" indicator + * within the DoubleBuffer wrapper to reference which of the two buffers + * now contains the sorted output sequence (a function of the number of key bits + * specified and the targeted device architecture). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageP + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated sorting performance across different + * CUDA architectures for uniform-random \p uint32 and \p uint64 keys, respectively. + * + * \image html lsb_radix_sort_int32_keys.png + * \image html lsb_radix_sort_int64_keys.png + * + * \par Snippet + * The code snippet below illustrates the sorting of a device vector of \p int keys. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_key_alt_buf; // e.g., [ ... ] + * ... 
+ * + * // Create a DoubleBuffer to wrap the pair of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items); + * + * // d_keys.Current() <-- [0, 3, 5, 6, 7, 8, 9] + * + * \endcode + * + * \tparam KeyT [inferred] KeyT type + */ + template + CUB_RUNTIME_FUNCTION + static cudaError_t SortKeys( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + int num_items, ///< [in] Number of items to sort + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // Null value type + DoubleBuffer d_values; + + return DispatchRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + true, + stream, + debug_synchronous); + } + + /** + * \brief Sorts keys into descending order. (~2N auxiliary storage required). + * + * \par + * - The contents of the input data are not altered by the sorting operation + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. + * - \devicestorage + * + * \par Performance + * Performance is similar to DeviceRadixSort::SortKeys. + * + * \par Snippet + * The code snippet below illustrates the sorting of a device vector of \p int keys. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_keys_out; // e.g., [ ... ] + * ... 
+ *
+ * // Determine temporary device storage requirements
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
+ *
+ * // Allocate temporary storage
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run sorting operation
+ * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
+ *
+ * // d_keys_out <-- [9, 8, 7, 6, 5, 3, 0]
+ *
+ * \endcode
+ *
+ * \tparam KeyT [inferred] KeyT type
+ */
+ template <typename KeyT>
+ CUB_RUNTIME_FUNCTION
+ static cudaError_t SortKeysDescending(
+ void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+ const KeyT *d_keys_in, ///< [in] Pointer to the input sequence of key data to sort
+ KeyT *d_keys_out, ///< [out] Pointer to the sorted output sequence of key data
+ int num_items, ///< [in] Number of items to sort
+ int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison
+ int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+ cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0.
+ bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
+ {
+ // Signed integer type for global offsets
+ typedef int OffsetT;
+
+ DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
+ DoubleBuffer<NullType> d_values;
+
+ return DispatchRadixSort<true, KeyT, NullType, OffsetT>::Dispatch(
+ d_temp_storage,
+ temp_storage_bytes,
+ d_keys,
+ d_values,
+ num_items,
+ begin_bit,
+ end_bit,
+ false,
+ stream,
+ debug_synchronous);
+ }
+
+
+ /**
+ * \brief Sorts keys into descending order. (~N auxiliary storage required).
+ *
+ * \par
+ * - The sorting operation is given a pair of key buffers managed by a
+ * DoubleBuffer structure that indicates which of the two buffers is
+ * "current" (and thus contains the input data to be sorted).
+ * - The contents of both buffers may be altered by the sorting operation.
+ * - Upon completion, the sorting operation will update the "current" indicator
+ * within the DoubleBuffer wrapper to reference which of the two buffers
+ * now contains the sorted output sequence (a function of the number of key bits
+ * specified and the targeted device architecture).
+ * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement.
+ * - \devicestorageP
+ * - \devicestorage
+ *
+ * \par Performance
+ * Performance is similar to DeviceRadixSort::SortKeys.
+ *
+ * \par Snippet
+ * The code snippet below illustrates the sorting of a device vector of \p int keys.
+ * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_key_alt_buf; // e.g., [ ... ] + * ... + * + * // Create a DoubleBuffer to wrap the pair of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, num_items); + * + * // d_keys.Current() <-- [9, 8, 7, 6, 5, 3, 0] + * + * \endcode + * + * \tparam KeyT [inferred] KeyT type + */ + template + CUB_RUNTIME_FUNCTION + static cudaError_t SortKeysDescending( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + int num_items, ///< [in] Number of items to sort + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // Null value type + DoubleBuffer d_values; + + return DispatchRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + true, + stream, + debug_synchronous); + } + + + //@} end member group + + +}; + +/** + * \example example_device_radix_sort.cu + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/SRC/cub/device/device_reduce.cuh b/SRC/cub/device/device_reduce.cuh index 480248b0..13c7a72d 100644 --- a/SRC/cub/device/device_reduce.cuh +++ b/SRC/cub/device/device_reduce.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -29,15 +29,18 @@ /** * \file - * cub::DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within global memory. 
+ * cub::DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory.
*/

#pragma once

#include <stdio.h>
#include <iterator>
+#include <limits>

-#include "dispatch/device_reduce_dispatch.cuh"
+#include "../iterator/arg_index_input_iterator.cuh"
+#include "dispatch/dispatch_reduce.cuh"
+#include "dispatch/dispatch_reduce_by_key.cuh"
#include "../util_namespace.cuh"

/// Optional outer namespace(s)
@@ -48,8 +51,8 @@ namespace cub {

/**
- * \brief DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within global memory. ![](reduce_logo.png)
- * \ingroup DeviceModule
+ * \brief DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory. ![](reduce_logo.png)
+ * \ingroup SingleModule
*
* \par Overview
* A reduction (or fold)
* \image html reduce_by_key_fp32_len_500.png
*
* \par
- * The following chart illustrates DeviceReduce::RunLengthEncode performance across
- * different CUDA architectures for \p int32 items.
- * Segments have lengths uniformly sampled from [1,1000].
- *
- * \image html rle_int32_len_500.png
- *
- * \par
* \plots_below
*
- *
*/
struct DeviceReduce
{
/**
- * \brief Computes a device-wide reduction using the specified binary \p reduction_op functor.
+ * \brief Computes a device-wide reduction using the specified binary \p reduction_op functor and initial value \p init.
*
* \par
- * - Does not support non-commutative reduction operators.
+ * - Does not support binary reduction operators that are non-commutative.
+ * - Provides "run-to-run" determinism for pseudo-associative reduction
+ * (e.g., addition of floating point types) on the same GPU device.
+ * However, results for pseudo-associative reduction may be inconsistent
+ * from one device to another device of a different compute-capability
+ * because CUB can employ different tile-sizing for different architectures.
* - \devicestorage
- * - \cdp
- *
- * \par Performance
- * Performance is typically similar to DeviceReduce::Sum.
*
* \par Snippet
- * The code snippet below illustrates a custom min reduction of a device vector of \p int items.
+ * The code snippet below illustrates a user-defined min-reduction of a device vector of \p int data elements.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/device/device_reduce.cuh>
*
* // CustomMin functor
* struct CustomMin
* {
* template <typename T>
- * CUB_RUNTIME_FUNCTION __forceinline__
+ * __device__ __forceinline__
* T operator()(const T &a, const T &b) const {
* return (b < a) ? b : a;
* }
* };
*
- * // Declare, allocate, and initialize device pointers for input and output
+ * // Declare, allocate, and initialize device-accessible pointers for input and output
* int num_items; // e.g., 7
* int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
- * int *d_out; // e.g., [ ]
+ * int *d_out; // e.g., [-]
* CustomMin min_op;
+ * int init; // e.g., INT_MAX
* ...
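*
* // Editorial sketch (not upstream text): CUB device-wide calls follow a
* // two-phase convention. The first Reduce() call below, with
* // d_temp_storage == NULL, only writes the required size into
* // temp_storage_bytes; no reduction runs until the second call.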
*
* // Determine temporary device storage requirements
* void *d_temp_storage = NULL;
* size_t temp_storage_bytes = 0;
- * cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, min_op);
+ * cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, min_op, init);
*
* // Allocate temporary storage
* cudaMalloc(&d_temp_storage, temp_storage_bytes);
*
* // Run reduction
- * cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, min_op);
+ * cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, min_op, init);
*
* // d_out <-- [0]
*
* \endcode
*
- * \tparam InputIterator [inferred] Random-access input iterator type for reading input items \iterator
- * \tparam OutputIterator [inferred] Output iterator type for recording the reduced aggregate \iterator
- * \tparam ReductionOp [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b)
+ * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator
+ * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator
+ * \tparam ReductionOpT [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b)
+ * \tparam T [inferred] Data element type that is convertible to the \p value type of \p InputIteratorT
*/
template <
- typename InputIterator,
- typename OutputIterator,
- typename ReductionOp>
+ typename InputIteratorT,
+ typename OutputIteratorT,
+ typename ReductionOpT,
+ typename T>
CUB_RUNTIME_FUNCTION
static cudaError_t Reduce(
- void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+ void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
- InputIterator d_in, ///< [in] Pointer to the input sequence of data items
- OutputIterator d_out, ///< [out] Pointer to the output aggregate
+ InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
+ OutputIteratorT d_out, ///< [out] Pointer to the output aggregate
int num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
- ReductionOp reduction_op, ///< [in] Binary reduction functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
+ ReductionOpT reduction_op, ///< [in] Binary reduction functor
+ T init, ///< [in] Initial value of the reduction
cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0.
bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
{
// Signed integer type for global offsets
- typedef int Offset;
-
- // Dispatch type
- typedef DeviceReduceDispatch DeviceReduceDispatch;
+ typedef int OffsetT;

- return DeviceReduceDispatch::Dispatch(
+ return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
num_items,
reduction_op,
+ init,
stream,
debug_synchronous);
}


/**
- * \brief Computes a device-wide sum using the addition ('+') operator.
+ * \brief Computes a device-wide sum using the addition (\p +) operator.
*
* \par
- * - Does not support non-commutative reduction operators.
+ * - Uses \p 0 as the initial value of the reduction.
+ * - Does not support \p + operators that are non-commutative.
+ * - Provides "run-to-run" determinism for pseudo-associative reduction
+ * (e.g., addition of floating point types) on the same GPU device.
+ * However, results for pseudo-associative reduction may be inconsistent
+ * from one device to another device of a different compute-capability
+ * because CUB can employ different tile-sizing for different architectures.
* - \devicestorage
- * - \cdp
*
* \par Performance
- * The following charts illustrate saturated reduction (sum) performance across different
+ * The following charts illustrate saturated sum-reduction performance across different
* CUDA architectures for \p int32 and \p int64 items, respectively.
*
* \image html reduce_int32.png
* \image html reduce_int64.png
*
* \par Snippet
- * The code snippet below illustrates the sum reduction of a device vector of \p int items.
+ * The code snippet below illustrates the sum-reduction of a device vector of \p int data elements.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/device/device_reduce.cuh>
*
- * // Declare, allocate, and initialize device pointers for input and output
+ * // Declare, allocate, and initialize device-accessible pointers for input and output
* int num_items; // e.g., 7
* int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
- * int *d_out; // e.g., [ ]
+ * int *d_out; // e.g., [-]
* ...
*
* // Determine temporary device storage requirements
* void *d_temp_storage = NULL;
* size_t temp_storage_bytes = 0;
- * cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_sum, num_items);
+ * cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
*
* // Allocate temporary storage
* cudaMalloc(&d_temp_storage, temp_storage_bytes);
*
* // Run sum-reduction
- * cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_sum, num_items);
+ * cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
*
* // d_out <-- [38]
*
* \endcode
*
- * \tparam InputIterator [inferred] Random-access input iterator type for reading input items \iterator
- * \tparam OutputIterator [inferred] Output iterator type for recording the reduced aggregate \iterator
+ * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator
+ * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator
*/
template <
- typename InputIterator,
- typename OutputIterator>
+ typename InputIteratorT,
+ typename OutputIteratorT>
CUB_RUNTIME_FUNCTION
static cudaError_t Sum(
- void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+ void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
- InputIterator d_in, ///< [in] Pointer to the input sequence of data items
- OutputIterator d_out, ///< [out] Pointer to the output aggregate
+ InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
+ OutputIteratorT d_out, ///< [out] Pointer to the output aggregate
int num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0.
bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
{
// Signed integer type for global offsets
- typedef int Offset;
+ typedef int OffsetT;

- // Dispatch type
- typedef DeviceReduceDispatch DeviceReduceDispatch;
+ // The output value type
+ typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
+ typename std::iterator_traits<InputIteratorT>::value_type, // ... then the input iterator's value type,
+ typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT; // ... else the output iterator's value type

- return DeviceReduceDispatch::Dispatch(
+ return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Sum>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
num_items,
cub::Sum(),
+ OutputT(), // zero-initialize
stream,
debug_synchronous);
}

@@ -254,69 +260,72 @@ struct DeviceReduce
* \brief Computes a device-wide minimum using the less-than ('<') operator.
*
* \par
- * - Does not support non-commutative minimum operators.
+ * - Uses std::numeric_limits<T>::max() as the initial value of the reduction.
+ * - Does not support \p < operators that are non-commutative.
+ * - Provides "run-to-run" determinism for pseudo-associative reduction
+ * (e.g., addition of floating point types) on the same GPU device.
+ * However, results for pseudo-associative reduction may be inconsistent
+ * from one device to another device of a different compute-capability
+ * because CUB can employ different tile-sizing for different architectures.
* - \devicestorage
- * - \cdp
- *
- * \par Performance
- * Performance is typically similar to DeviceReduce::Sum.
*
* \par Snippet
- * The code snippet below illustrates the min-reduction of a device vector of \p int items.
+ * The code snippet below illustrates the min-reduction of a device vector of \p int data elements.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/device/device_reduce.cuh>
*
- * // Declare, allocate, and initialize device pointers for input and output
+ * // Declare, allocate, and initialize device-accessible pointers for input and output
* int num_items; // e.g., 7
* int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
- * int *d_out; // e.g., [ ]
+ * int *d_out; // e.g., [-]
* ...
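*
* // Editorial sketch (not upstream text): the initial value here is the
* // type's maximum (INT_MAX for int), so an empty input would be expected
* // to yield INT_MAX in d_out rather than a value read from d_in.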
*
* // Determine temporary device storage requirements
* void *d_temp_storage = NULL;
* size_t temp_storage_bytes = 0;
- * cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_min, num_items);
+ * cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
*
* // Allocate temporary storage
* cudaMalloc(&d_temp_storage, temp_storage_bytes);
*
* // Run min-reduction
- * cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_min, num_items);
+ * cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
*
* // d_out <-- [0]
*
* \endcode
*
- * \tparam InputIterator [inferred] Random-access input iterator type for reading input items \iterator
- * \tparam OutputIterator [inferred] Output iterator type for recording the reduced aggregate \iterator
+ * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator
+ * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator
*/
template <
- typename InputIterator,
- typename OutputIterator>
+ typename InputIteratorT,
+ typename OutputIteratorT>
CUB_RUNTIME_FUNCTION
static cudaError_t Min(
- void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+ void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
- InputIterator d_in, ///< [in] Pointer to the input sequence of data items
- OutputIterator d_out, ///< [out] Pointer to the output aggregate
+ InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
+ OutputIteratorT d_out, ///< [out] Pointer to the output aggregate
int num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0.
bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
{
// Signed integer type for global offsets
- typedef int Offset;
+ typedef int OffsetT;

- // Dispatch type
- typedef DeviceReduceDispatch DeviceReduceDispatch;
+ // The input value type
+ typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;

- return DeviceReduceDispatch::Dispatch(
+ return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Min>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
num_items,
cub::Min(),
+ Traits<InputT>::Max(), // replace with std::numeric_limits<T>::max() when C++11 support is more prevalent
stream,
debug_synchronous);
}

@@ -326,28 +335,27 @@ struct DeviceReduce
* \brief Finds the first device-wide minimum using the less-than ('<') operator, also returning the index of that item.
*
* \par
- * Assuming the input \p d_in has value type \p T, the output \p d_out must have value type
- * ItemOffsetPair. The minimum value is written to d_out.value and its
- * location in the input array is written to d_out.offset.
- *
- * \par
- * - Does not support non-commutative minimum operators.
+ * - The output value type of \p d_out is cub::KeyValuePair<int, T> (assuming the value type of \p d_in is \p T)
+ * - The minimum is written to d_out.value and its offset in the input array is written to d_out.key.
+ * - The {1, std::numeric_limits<T>::max()} tuple is produced for zero-length inputs
+ * - Does not support \p < operators that are non-commutative.
+ * - Provides "run-to-run" determinism for pseudo-associative reduction
+ * (e.g., addition of floating point types) on the same GPU device.
+ * However, results for pseudo-associative reduction may be inconsistent
+ * from one device to another device of a different compute-capability
+ * because CUB can employ different tile-sizing for different architectures.
* - \devicestorage
- * - \cdp
- *
- * \par Performance
- * Performance is typically similar to DeviceReduce::Sum.
*
* \par Snippet
- * The code snippet below illustrates the argmin-reduction of a device vector of \p int items.
+ * The code snippet below illustrates the argmin-reduction of a device vector of \p int data elements.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/device/device_reduce.cuh>
*
- * // Declare, allocate, and initialize device pointers for input and output
+ * // Declare, allocate, and initialize device-accessible pointers for input and output
* int num_items; // e.g., 7
* int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
- * ItemOffsetPair *d_out; // e.g., [{ , }]
+ * KeyValuePair<int, int> *d_out; // e.g., [{-,-}]
* ...
*
* // Determine temporary device storage requirements
@@ -361,43 +369,55 @@ struct DeviceReduce
* // Run argmin-reduction
* cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items);
*
- * // d_out <-- [{0, 5}]
+ * // d_out <-- [{5, 0}]
*
* \endcode
*
- * \tparam InputIterator [inferred] Random-access input iterator type for reading input items (of some type \p T) \iterator
- * \tparam OutputIterator [inferred] Output iterator type for recording the reduced aggregate (having value type ItemOffsetPair) \iterator
+ * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items (of some type \p T) \iterator
+ * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate (having value type cub::KeyValuePair<int, T>) \iterator
*/
template <
- typename InputIterator,
- typename OutputIterator>
+ typename InputIteratorT,
+ typename OutputIteratorT>
CUB_RUNTIME_FUNCTION
static cudaError_t ArgMin(
- void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+ void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
- InputIterator d_in, ///< [in] Pointer to the input sequence of data items
- OutputIterator d_out, ///< [out] Pointer to the output aggregate
+ InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
+ OutputIteratorT d_out, ///< [out] Pointer to the output aggregate
int num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0.
bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false.
{
// Signed integer type for global offsets
- typedef int Offset;
+ typedef int OffsetT;
+
+ // The input type
+ typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;
+
+ // The output tuple type
+ typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
+ KeyValuePair<OffsetT, InputValueT>, // ... then the key value pair OffsetT + InputValueT
+ typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputTupleT; // ... else the output iterator's value type
+
+ // The output value type
+ typedef typename OutputTupleT::Value OutputValueT;

- // Wrapped input iterator
- typedef ArgIndexInputIterator ArgIndexInputIterator;
- ArgIndexInputIterator d_argmin_in(d_in, 0);
+ // Wrapped input iterator to produce index-value tuples
+ typedef ArgIndexInputIterator<InputIteratorT, OffsetT, InputValueT> ArgIndexInputIteratorT;
+ ArgIndexInputIteratorT d_indexed_in(d_in);

- // Dispatch type
- typedef DeviceReduceDispatch DeviceReduceDispatch;
+ // Initial value
+ OutputTupleT initial_value(1, Traits<InputValueT>::Max()); // replace with std::numeric_limits<T>::max() when C++11 support is more prevalent

- return DeviceReduceDispatch::Dispatch(
+ return DispatchReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMin>::Dispatch(
d_temp_storage,
temp_storage_bytes,
- d_argmin_in,
+ d_indexed_in,
d_out,
num_items,
cub::ArgMin(),
+ initial_value,
stream,
debug_synchronous);
}

@@ -407,23 +427,25 @@ struct DeviceReduce
* \brief Computes a device-wide maximum using the greater-than ('>') operator.
*
* \par
- * - Does not support non-commutative maximum operators.
+ * - Uses std::numeric_limits<T>::lowest() as the initial value of the reduction.
+ * - Does not support \p > operators that are non-commutative.
+ * - Provides "run-to-run" determinism for pseudo-associative reduction
+ * (e.g., addition of floating point types) on the same GPU device.
+ * However, results for pseudo-associative reduction may be inconsistent
+ * from one device to another device of a different compute-capability
+ * because CUB can employ different tile-sizing for different architectures.
* - \devicestorage
- * - \cdp
- *
- * \par Performance
- * Performance is typically similar to DeviceReduce::Sum.
*
* \par Snippet
- * The code snippet below illustrates the max-reduction of a device vector of \p int items.
+ * The code snippet below illustrates the max-reduction of a device vector of \p int data elements.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/device/device_reduce.cuh>
*
- * // Declare, allocate, and initialize device pointers for input and output
+ * // Declare, allocate, and initialize device-accessible pointers for input and output
* int num_items; // e.g., 7
* int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
- * int *d_out; // e.g., [ ]
+ * int *d_out; // e.g., [-]
* ...
*
* // Determine temporary device storage requirements
@@ -441,35 +463,36 @@ struct DeviceReduce
*
* \endcode
*
- * \tparam InputIterator [inferred] Random-access input iterator type for reading input items \iterator
- * \tparam OutputIterator [inferred] Output iterator type for recording the reduced aggregate \iterator
+ * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator
+ * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator
*/
template <
- typename InputIterator,
- typename OutputIterator>
+ typename InputIteratorT,
+ typename OutputIteratorT>
CUB_RUNTIME_FUNCTION
static cudaError_t Max(
- void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+ void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
- InputIterator d_in, ///< [in] Pointer to the input sequence of data items
- OutputIterator d_out, ///< [out] Pointer to the output aggregate
+ InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
+ OutputIteratorT d_out, ///< [out] Pointer to the output aggregate
int num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0.
bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
{
// Signed integer type for global offsets
- typedef int Offset;
+ typedef int OffsetT;

- // Dispatch type
- typedef DeviceReduceDispatch DeviceReduceDispatch;
+ // The input value type
+ typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;

- return DeviceReduceDispatch::Dispatch(
+ return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, cub::Max>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
num_items,
cub::Max(),
+ Traits<InputT>::Lowest(), // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent
stream,
debug_synchronous);
}

@@ -479,28 +502,27 @@ struct DeviceReduce
* \brief Finds the first device-wide maximum using the greater-than ('>') operator, also returning the index of that item
*
* \par
- * Assuming the input \p d_in has value type \p T, the output \p d_out must have value type
- * ItemOffsetPair. The maximum value is written to d_out.value and its
- * location in the input array is written to d_out.offset.
- *
- * \par
- * - Does not support non-commutative maximum operators.
+ * - The output value type of \p d_out is cub::KeyValuePair<int, T> (assuming the value type of \p d_in is \p T)
+ * - The maximum is written to d_out.value and its offset in the input array is written to d_out.key.
+ * - The {1, std::numeric_limits<T>::lowest()} tuple is produced for zero-length inputs
+ * - Does not support \p > operators that are non-commutative.
+ * - Provides "run-to-run" determinism for pseudo-associative reduction
+ * (e.g., addition of floating point types) on the same GPU device.
+ * However, results for pseudo-associative reduction may be inconsistent
+ * from one device to another device of a different compute-capability
+ * because CUB can employ different tile-sizing for different architectures.
* - \devicestorage
- * - \cdp
- *
- * \par Performance
- * Performance is typically similar to DeviceReduce::Sum.
*
* \par Snippet
- * The code snippet below illustrates the argmax-reduction of a device vector of \p int items.
+ * The code snippet below illustrates the argmax-reduction of a device vector of \p int data elements.
* \par
* \code
* #include <cub/cub.cuh> // or equivalently <cub/device/device_reduce.cuh>
*
- * // Declare, allocate, and initialize device pointers for input and output
+ * // Declare, allocate, and initialize device-accessible pointers for input and output
* int num_items; // e.g., 7
* int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
- * ItemOffsetPair *d_out; // e.g., [{ , }]
+ * KeyValuePair<int, int> *d_out; // e.g., [{-,-}]
* ...
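*
* // Editorial sketch (not upstream text): d_out receives a
* // cub::KeyValuePair<int, int> whose .key is the offset of the first
* // maximum and whose .value is the maximum itself, matching the
* // {6, 9} result shown below.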
*
* // Determine temporary device storage requirements
@@ -514,43 +536,55 @@ struct DeviceReduce
* // Run argmax-reduction
* cub::DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items);
*
- * // d_out <-- [{9, 6}]
+ * // d_out <-- [{6, 9}]
*
* \endcode
*
- * \tparam InputIterator [inferred] Random-access input iterator type for reading input items (of some type \p T) \iterator
- * \tparam OutputIterator [inferred] Output iterator type for recording the reduced aggregate (having value type ItemOffsetPair) \iterator
+ * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items (of some type \p T) \iterator
+ * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate (having value type cub::KeyValuePair<int, T>) \iterator
*/
template <
- typename InputIterator,
- typename OutputIterator>
+ typename InputIteratorT,
+ typename OutputIteratorT>
CUB_RUNTIME_FUNCTION
static cudaError_t ArgMax(
- void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+ void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
- InputIterator d_in, ///< [in] Pointer to the input sequence of data items
- OutputIterator d_out, ///< [out] Pointer to the output aggregate
+ InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
+ OutputIteratorT d_out, ///< [out] Pointer to the output aggregate
int num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0.
bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false.
{
// Signed integer type for global offsets
- typedef int Offset;
+ typedef int OffsetT;
+
+ // The input type
+ typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;
+
+ // The output tuple type
+ typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
+ KeyValuePair<OffsetT, InputValueT>, // ... then the key value pair OffsetT + InputValueT
+ typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputTupleT; // ... else the output iterator's value type
- // Wrapped input iterator
- typedef ArgIndexInputIterator ArgIndexInputIterator;
- ArgIndexInputIterator d_argmax_in(d_in, 0);
+ // The output value type
+ typedef typename OutputTupleT::Value OutputValueT;

- // Dispatch type
- typedef DeviceReduceDispatch DeviceReduceDispatch;
+ // Wrapped input iterator to produce index-value tuples
+ typedef ArgIndexInputIterator<InputIteratorT, OffsetT, InputValueT> ArgIndexInputIteratorT;
+ ArgIndexInputIteratorT d_indexed_in(d_in);

- return DeviceReduceDispatch::Dispatch(
+ // Initial value
+ OutputTupleT initial_value(1, Traits<InputValueT>::Lowest()); // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent
+
+ return DispatchReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetT, cub::ArgMax>::Dispatch(
d_temp_storage,
temp_storage_bytes,
- d_argmax_in,
+ d_indexed_in,
d_out,
num_items,
cub::ArgMax(),
+ initial_value,
stream,
debug_synchronous);
}

@@ -560,17 +594,22 @@ struct DeviceReduce
* \brief Reduces segments of values, where segments are demarcated by corresponding runs of identical keys.
*
* \par
- * This operation computes segmented reductions using the specified binary
- * \p reduction_op functor. Each "run" of consecutive, identical keys in \p d_keys_in
- * is used to identify a corresponding segment of values in \p d_values_in. The first key in
- * the ith segment is copied to d_keys_out[i], and
- * the value aggregate for that segment is written to d_values_out[i].
- * The total number of segments discovered is written to \p d_num_segments.
+ * This operation computes segmented reductions within \p d_values_in using
+ * the specified binary \p reduction_op functor. The segments are identified by
+ * "runs" of corresponding keys in \p d_keys_in, where runs are maximal ranges of
+ * consecutive, identical keys. For the ith run encountered,
+ * the first key of the run and the corresponding value aggregate of that run are
+ * written to d_unique_out[i] and d_aggregates_out[i],
+ * respectively. The total number of runs encountered is written to \p d_num_runs_out.
*
* \par
* - The == equality operator is used to determine whether keys are equivalent
+ * - Provides "run-to-run" determinism for pseudo-associative reduction
+ * (e.g., addition of floating point types) on the same GPU device.
+ * However, results for pseudo-associative reduction may be inconsistent
+ * from one device to another device of a different compute-capability
+ * because CUB can employ different tile-sizing for different architectures.
* - \devicestorage
- * - \cdp
*
* \par Performance
* The following chart illustrates reduction-by-key (sum) performance across
@@ -603,190 +642,81 @@ struct DeviceReduce
* }
* };
*
- * // Declare, allocate, and initialize device pointers for input and output
+ * // Declare, allocate, and initialize device-accessible pointers for input and output
* int num_items; // e.g., 8
* int *d_keys_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
* int *d_values_in; // e.g., [0, 7, 1, 6, 2, 5, 3, 4]
- * int *d_keys_out; // e.g., [ , , , , , , , ]
- * int *d_values_out; // e.g., [ , , , , , , , ]
- * int *d_num_segments; // e.g., [ ]
+ * int *d_unique_out; // e.g., [-, -, -, -, -, -, -, -]
+ * int *d_aggregates_out; // e.g., [-, -, -, -, -, -, -, -]
+ * int *d_num_runs_out; // e.g., [-]
* CustomMin reduction_op;
* ...
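*
* // Editorial sketch (not upstream text): runs are maximal spans of
* // consecutive equal keys, so the three adjacent 5s reduce together to
* // min(2, 5, 3) = 2, while equal keys that are not adjacent would start
* // separate runs.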
*
* // Determine temporary device storage requirements
* void *d_temp_storage = NULL;
* size_t temp_storage_bytes = 0;
- * cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_segments, reduction_op, num_items);
+ * cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, reduction_op, num_items);
*
* // Allocate temporary storage
* cudaMalloc(&d_temp_storage, temp_storage_bytes);
*
* // Run reduce-by-key
- * cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_segments, reduction_op, num_items);
+ * cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, reduction_op, num_items);
*
- * // d_keys_out <-- [0, 2, 9, 5, 8]
- * // d_values_out <-- [0, 1, 6, 2, 4]
- * // d_num_segments <-- [5]
+ * // d_unique_out <-- [0, 2, 9, 5, 8]
+ * // d_aggregates_out <-- [0, 1, 6, 2, 4]
+ * // d_num_runs_out <-- [5]
*
* \endcode
*
- * \tparam KeyInputIterator [inferred] Random-access input iterator type for reading input keys \iterator
- * \tparam KeyOutputIterator [inferred] Random-access output iterator type for writing output keys \iterator
- * \tparam ValueInputIterator [inferred] Random-access input iterator type for reading input values \iterator
- * \tparam ValueOutputIterator [inferred] Random-access output iterator type for writing output values \iterator
- * \tparam NumSegmentsIterator [inferred] Output iterator type for recording the number of segments encountered \iterator
- * \tparam ReductionOp [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b)
+ * \tparam KeysInputIteratorT [inferred] Random-access input iterator type for reading input keys \iterator
+ * \tparam UniqueOutputIteratorT [inferred] Random-access output iterator type for writing unique output keys \iterator
+ * \tparam ValuesInputIteratorT [inferred] Random-access input iterator type for reading input values \iterator
+ * \tparam AggregatesOutputIteratorT [inferred] Random-access output iterator type for writing output value aggregates \iterator
+ * \tparam NumRunsOutputIteratorT [inferred] Output iterator type for recording the number of runs encountered \iterator
+ * \tparam ReductionOpT [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b)
*/
template <
- typename KeyInputIterator,
- typename KeyOutputIterator,
- typename ValueInputIterator,
- typename ValueOutputIterator,
- typename NumSegmentsIterator,
- typename ReductionOp>
+ typename KeysInputIteratorT,
+ typename UniqueOutputIteratorT,
+ typename ValuesInputIteratorT,
+ typename AggregatesOutputIteratorT,
+ typename NumRunsOutputIteratorT,
+ typename ReductionOpT>
CUB_RUNTIME_FUNCTION __forceinline__
static cudaError_t ReduceByKey(
- void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+ void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - KeyInputIterator d_keys_in, ///< [in] Pointer to consecutive runs of input keys - KeyOutputIterator d_keys_out, ///< [out] Pointer to output keys (one key per run) - ValueInputIterator d_values_in, ///< [in] Pointer to consecutive runs of input values - ValueOutputIterator d_values_out, ///< [out] Pointer to output value aggregates (one aggregate per run) - NumSegmentsIterator d_num_segments, ///< [out] Pointer to total number of segments - ReductionOp reduction_op, ///< [in] Binary reduction functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) + KeysInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys + UniqueOutputIteratorT d_unique_out, ///< [out] Pointer to the output sequence of unique keys (one key per run) + ValuesInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of corresponding values + AggregatesOutputIteratorT d_aggregates_out, ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run) + NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out) + ReductionOpT reduction_op, ///< [in] Binary reduction functor int num_items, ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_in_keys and \p d_in_values) cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. { - typedef int Offset; // Signed integer type for global offsets - typedef NullType* FlagIterator; // Flag iterator type (not used) - typedef NullType SelectOp; // Selection op (not used) - typedef Equality EqualityOp; // Default == operator - - return DeviceReduceByKeyDispatch::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_keys_out, - d_values_in, - d_values_out, - d_num_segments, - EqualityOp(), - reduction_op, - num_items, - stream, - debug_synchronous); - } - - - /** - * \brief Counts the segment lengths in the sequence \p d_in, where segments are demarcated by runs of identical values. - * - * \par - * This operation computes a run-length encoding of \p d_in, where segments are identified - * by "runs" of consecutive, identical values. The length of the ith segment - * is written to d_counts_out[i]. The unique values are also compacted, - * i.e., the first value in the ith segment is copied to - * d_compacted_out[i]. The total number of segments discovered is written - * to \p d_num_segments. - * - * \par - * - The == equality operator is used to determine whether values are equivalent - * - \devicestorage - * - \cdp - * - * \par Performance - * The following charts illustrate saturated encode performance across different - * CUDA architectures for \p int32 and \p int64 items, respectively. Segments have - * lengths uniformly sampled from [1,1000]. - * - * \image html rle_int32_len_500.png - * \image html rle_int64_len_500.png - * - * \par - * The following charts are similar, but with segment lengths uniformly sampled from [1,10]: - * - * \image html rle_int32_len_5.png - * \image html rle_int64_len_5.png - * - * \par Snippet - * The code snippet below illustrates the run-length encoding of a sequence of \p int values. 
- * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device pointers for input and output - * int num_items; // e.g., 8 - * int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] - * int *d_compacted_out; // e.g., [ , , , , , , , ] - * int *d_counts_out; // e.g., [ , , , , , , , ] - * int *d_num_segments; // e.g., [ ] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceReduce::RunLengthEncode(d_temp_storage, temp_storage_bytes, d_in, d_compacted_out, d_counts_out, d_num_segments, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run encoding - * cub::DeviceReduce::RunLengthEncode(d_temp_storage, temp_storage_bytes, d_in, d_compacted_out, d_counts_out, d_num_segments, num_items); - * - * // d_keys_out <-- [0, 2, 9, 5, 8] - * // d_values_out <-- [1, 2, 1, 3, 1] - * // d_num_segments <-- [5] - * - * \endcode - * - * \tparam InputIterator [inferred] Random-access input iterator type for reading input items \iterator - * \tparam OutputIterator [inferred] Random-access output iterator type for writing compacted output items \iterator - * \tparam CountsOutputIterator [inferred] Random-access output iterator type for writing output counts \iterator - * \tparam NumSegmentsIterator [inferred] Output iterator type for recording the number of segments encountered \iterator - */ - template < - typename InputIterator, - typename OutputIterator, - typename CountsOutputIterator, - typename NumSegmentsIterator> - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t RunLengthEncode( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIterator d_in, ///< [in] Pointer to consecutive runs of input keys - OutputIterator d_compacted_out, ///< [out] Pointer to output keys (one key per run) - CountsOutputIterator d_counts_out, ///< [out] Pointer to output value aggregates (one aggregate per run) - NumSegmentsIterator d_num_segments, ///< [out] Pointer to total number of segments - int num_items, ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_in_keys and \p d_in_values) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
- { - // Data type of value iterator - typedef typename std::iterator_traits::value_type Value; + // Signed integer type for global offsets + typedef int OffsetT; - typedef int Offset; // Signed integer type for global offsets - typedef NullType* FlagIterator; // Flag iterator type (not used) - typedef NullType SelectOp; // Selection op (not used) - typedef Equality EqualityOp; // Default == operator - typedef cub::Sum ReductionOp; // Value reduction operator + // FlagT iterator type (not used) - // Generator type for providing 1s values for run-length reduction - typedef ConstantInputIterator CountsInputIterator; + // Selection op (not used) - Value one_val; - one_val = 1; + // Default == operator + typedef Equality EqualityOp; - return DeviceReduceByKeyDispatch::Dispatch( + return DispatchReduceByKey::Dispatch( d_temp_storage, temp_storage_bytes, - d_in, - d_compacted_out, - CountsInputIterator(one_val), - d_counts_out, - d_num_segments, + d_keys_in, + d_unique_out, + d_values_in, + d_aggregates_out, + d_num_runs_out, EqualityOp(), - ReductionOp(), + reduction_op, num_items, stream, debug_synchronous); diff --git a/SRC/cub/device/device_run_length_encode.cuh b/SRC/cub/device/device_run_length_encode.cuh new file mode 100644 index 00000000..7a2e82d9 --- /dev/null +++ b/SRC/cub/device/device_run_length_encode.cuh @@ -0,0 +1,278 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceRunLengthEncode provides device-wide, parallel operations for computing a run-length encoding across a sequence of data items residing within device-accessible memory. 
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "dispatch/dispatch_rle.cuh"
+#include "dispatch/dispatch_reduce_by_key.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief DeviceRunLengthEncode provides device-wide, parallel operations for demarcating "runs" of same-valued items within a sequence residing within device-accessible memory. ![](run_length_encode_logo.png)
+ * \ingroup SingleModule
+ *
+ * \par Overview
+ * A run-length encoding
+ * computes a simple compressed representation of a sequence of input elements such that each
+ * maximal "run" of consecutive same-valued data items is encoded as a single data value along with a
+ * count of the elements in that run.
+ *
+ * \par Usage Considerations
+ * \cdp_class{DeviceRunLengthEncode}
+ *
+ * \par Performance
+ * \linear_performance{run-length encode}
+ *
+ * \par
+ * The following chart illustrates DeviceRunLengthEncode::RunLengthEncode performance across
+ * different CUDA architectures for \p int32 items.
+ * Segments have lengths uniformly sampled from [1,1000].
+ *
+ * \image html rle_int32_len_500.png
+ *
+ * \par
+ * \plots_below
+ *
+ */
+struct DeviceRunLengthEncode
+{
+
+ /**
+ * \brief Computes a run-length encoding of the sequence \p d_in.
+ *
+ * \par
+ * - For the ith run encountered, the first key of the run and its length are written to
+ * d_unique_out[i] and d_counts_out[i],
+ * respectively.
+ * - The total number of runs encountered is written to \p d_num_runs_out.
+ * - The == equality operator is used to determine whether values are equivalent
+ * - \devicestorage
+ *
+ * \par Performance
+ * The following charts illustrate saturated encode performance across different
+ * CUDA architectures for \p int32 and \p int64 items, respectively. Segments have
+ * lengths uniformly sampled from [1,1000].
+ *
+ * \image html rle_int32_len_500.png
+ * \image html rle_int64_len_500.png
+ *
+ * \par
+ * The following charts are similar, but with segment lengths uniformly sampled from [1,10]:
+ *
+ * \image html rle_int32_len_5.png
+ * \image html rle_int64_len_5.png
+ *
+ * \par Snippet
+ * The code snippet below illustrates the run-length encoding of a sequence of \p int values.
+ * \par
+ * \code
+ * #include <cub/cub.cuh> // or equivalently <cub/device/device_run_length_encode.cuh>
+ *
+ * // Declare, allocate, and initialize device-accessible pointers for input and output
+ * int num_items; // e.g., 8
+ * int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
+ * int *d_unique_out; // e.g., [ , , , , , , , ]
+ * int *d_counts_out; // e.g., [ , , , , , , , ]
+ * int *d_num_runs_out; // e.g., [ ]
+ * ...
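+ *
+ * // Editorial sketch (not upstream text): Encode() is run-length encoding
+ * // expressed as a reduce-by-key that sums a generated stream of 1s per
+ * // run, which is exactly what the dispatch in the implementation below
+ * // wires up via ConstantInputIterator and cub::Sum.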
+ *
+ * // Determine temporary device storage requirements
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items);
+ *
+ * // Allocate temporary storage
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run encoding
+ * cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items);
+ *
+ * // d_unique_out <-- [0, 2, 9, 5, 8]
+ * // d_counts_out <-- [1, 2, 1, 3, 1]
+ * // d_num_runs_out <-- [5]
+ *
+ * \endcode
+ *
+ * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator
+ * \tparam UniqueOutputIteratorT [inferred] Random-access output iterator type for writing unique output items \iterator
+ * \tparam LengthsOutputIteratorT [inferred] Random-access output iterator type for writing output counts \iterator
+ * \tparam NumRunsOutputIteratorT [inferred] Output iterator type for recording the number of runs encountered \iterator
+ */
+ template <
+ typename InputIteratorT,
+ typename UniqueOutputIteratorT,
+ typename LengthsOutputIteratorT,
+ typename NumRunsOutputIteratorT>
+ CUB_RUNTIME_FUNCTION __forceinline__
+ static cudaError_t Encode(
+ void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+ InputIteratorT d_in, ///< [in] Pointer to the input sequence of keys
+ UniqueOutputIteratorT d_unique_out, ///< [out] Pointer to the output sequence of unique keys (one key per run)
+ LengthsOutputIteratorT d_counts_out, ///< [out] Pointer to the output sequence of run-lengths (one count per run)
+ NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs
+ int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in)
+ cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0.
+ bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
+ {
+ typedef int OffsetT; // Signed integer type for global offsets
+ typedef NullType* FlagIterator; // FlagT iterator type (not used)
+ typedef NullType SelectOp; // Selection op (not used)
+ typedef Equality EqualityOp; // Default == operator
+ typedef cub::Sum ReductionOp; // Value reduction operator
+
+ // The lengths output value type
+ typedef typename If<(Equals<typename std::iterator_traits<LengthsOutputIteratorT>::value_type, void>::VALUE), // LengthT = (if output iterator's value type is void) ?
+ OffsetT, // ... then the OffsetT type,
+ typename std::iterator_traits<LengthsOutputIteratorT>::value_type>::Type LengthT; // ... else the output iterator's value type
+
+ // Generator type for providing 1s values for run-length reduction
+ typedef ConstantInputIterator<LengthT, OffsetT> LengthsInputIteratorT;
+
+ return DispatchReduceByKey<InputIteratorT, UniqueOutputIteratorT, LengthsInputIteratorT, LengthsOutputIteratorT, NumRunsOutputIteratorT, EqualityOp, ReductionOp, OffsetT>::Dispatch(
+ d_temp_storage,
+ temp_storage_bytes,
+ d_in,
+ d_unique_out,
+ LengthsInputIteratorT((LengthT) 1),
+ d_counts_out,
+ d_num_runs_out,
+ EqualityOp(),
+ ReductionOp(),
+ num_items,
+ stream,
+ debug_synchronous);
+ }
+
+
+ /**
+ * \brief Enumerates the starting offsets and lengths of all non-trivial runs (of length > 1) of same-valued keys in the sequence \p d_in.
+ *
+ * \par
+ * - For the ith non-trivial run, the run's starting offset
+ * and its length are written to d_offsets_out[i] and
+ * d_lengths_out[i], respectively.
+ * - The total number of runs encountered is written to \p d_num_runs_out.
+ * - The == equality operator is used to determine whether values are equivalent
+ * - \devicestorage
+ *
+ * \par Snippet
+ * The code snippet below illustrates the identification of non-trivial runs within a sequence of \p int values.
+ * \par
+ * \code
+ * #include <cub/cub.cuh> // or equivalently <cub/device/device_run_length_encode.cuh>
+ *
+ * // Declare, allocate, and initialize device-accessible pointers for input and output
+ * int num_items; // e.g., 8
+ * int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
+ * int *d_offsets_out; // e.g., [ , , , , , , , ]
+ * int *d_lengths_out; // e.g., [ , , , , , , , ]
+ * int *d_num_runs_out; // e.g., [ ]
+ * ...
+ *
+ * // Determine temporary device storage requirements
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceRunLengthEncode::NonTrivialRuns(d_temp_storage, temp_storage_bytes, d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items);
+ *
+ * // Allocate temporary storage
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run encoding
+ * cub::DeviceRunLengthEncode::NonTrivialRuns(d_temp_storage, temp_storage_bytes, d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items);
+ *
+ * // d_offsets_out <-- [1, 4]
+ * // d_lengths_out <-- [2, 3]
+ * // d_num_runs_out <-- [2]
+ *
+ * \endcode
+ *
+ * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator
+ * \tparam OffsetsOutputIteratorT [inferred] Random-access output iterator type for writing run-offset values \iterator
+ * \tparam LengthsOutputIteratorT [inferred] Random-access output iterator type for writing run-length values \iterator
+ * \tparam NumRunsOutputIteratorT [inferred] Output iterator type for recording the number of runs encountered \iterator
+ */
+ template <
+ typename InputIteratorT,
+ typename OffsetsOutputIteratorT,
+ typename LengthsOutputIteratorT,
+ typename NumRunsOutputIteratorT>
+ CUB_RUNTIME_FUNCTION __forceinline__
+ static cudaError_t NonTrivialRuns(
+ void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to input sequence of data items + OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to output sequence of run-offsets (one offset per non-trivial run) + LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to output sequence of run-lengths (one count per non-trivial run) + NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs (i.e., length of \p d_offsets_out) + int num_items, ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_in_keys and \p d_in_values) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + typedef int OffsetT; // Signed integer type for global offsets + typedef Equality EqualityOp; // Default == operator + + return DeviceRleDispatch::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_offsets_out, + d_lengths_out, + d_num_runs_out, + EqualityOp(), + num_items, + stream, + debug_synchronous); + } + + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/SRC/cub/device/device_scan.cuh b/SRC/cub/device/device_scan.cuh index 511acc88..e86fefe3 100644 --- a/SRC/cub/device/device_scan.cuh +++ b/SRC/cub/device/device_scan.cuh @@ -1,419 +1,443 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within global memory. 
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-
-#include "dispatch/device_scan_dispatch.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \brief DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within global memory. ![](device_scan.png)
- * \ingroup DeviceModule
- *
- * \par Overview
- * Given a sequence of input elements and a binary reduction operator, a [prefix scan](http://en.wikipedia.org/wiki/Prefix_sum)
- * produces an output sequence where each element is computed to be the reduction
- * of the elements occurring earlier in the input sequence. Prefix sum
- * connotes a prefix scan with the addition operator. The term \em inclusive indicates
- * that the ith output reduction incorporates the ith input.
- * The term \em exclusive indicates the ith input is not incorporated into
- * the ith output reduction.
- *
- * \par Usage Considerations
- * \cdp_class{DeviceScan}
- *
- * \par Performance
- * \linear_performance{prefix scan}
- *
- * \par
- * The following chart illustrates DeviceScan::ExclusiveSum
- * performance across different CUDA architectures for \p int32 keys.
- * \plots_below
- *
- * \image html scan_int32.png
- *
- */
-struct DeviceScan
-{
- /******************************************************************//**
- * \name Exclusive scans
- *********************************************************************/
- //@{
-
- /**
- * \brief Computes a device-wide exclusive prefix sum.
- *
- * \par
- * - Supports non-commutative sum operators.
- * - \devicestorage
- * - \cdp
- *
- * \par Performance
- * The following charts illustrate saturated exclusive sum performance across different
- * CUDA architectures for \p int32 and \p int64 items, respectively.
- *
- * \image html scan_int32.png
- * \image html scan_int64.png
- *
- * \par Snippet
- * The code snippet below illustrates the exclusive prefix sum of an \p int device vector.
- * \par
- * \code
- * #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
- *
- * // Declare, allocate, and initialize device pointers for input and output
- * int num_items; // e.g., 7
- * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
- * int *d_out; // e.g., [ , , , , , , ]
- * ...
- *
- * // Determine temporary device storage requirements
- * void *d_temp_storage = NULL;
- * size_t temp_storage_bytes = 0;
- * cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
- *
- * // Allocate temporary storage
- * cudaMalloc(&d_temp_storage, temp_storage_bytes);
- *
- * // Run exclusive prefix sum
- * cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
- *
- * // d_out s<-- [0, 8, 14, 21, 26, 29, 29]
- *
- * \endcode
- *
- * \tparam InputIterator [inferred] Random-access input iterator type for reading scan input data \iterator
- * \tparam OutputIterator [inferred] Random-access output iterator type for writing scan output data \iterator
- */
- template <
- typename InputIterator,
- typename OutputIterator>
- CUB_RUNTIME_FUNCTION
- static cudaError_t ExclusiveSum(
- void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
- InputIterator d_in, ///< [in] Pointer to the input sequence of data items
- OutputIterator d_out, ///< [out] Pointer to the output sequence of data items
- int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in)
- cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0.
- bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
- {
- // Signed integer type for global offsets
- typedef int Offset;
-
- // Scan data type
- typedef typename std::iterator_traits<InputIterator>::value_type T;
-
- return DeviceScanDispatch<InputIterator, OutputIterator, Sum, T, Offset>::Dispatch(
- d_temp_storage,
- temp_storage_bytes,
- d_in,
- d_out,
- Sum(),
- T(),
- num_items,
- stream,
- debug_synchronous);
- }
-
-
- /**
- * \brief Computes a device-wide exclusive prefix scan using the specified binary \p scan_op functor.
- *
- * \par
- * - Supports non-commutative scan operators.
- * - \devicestorage
- * - \cdp
- *
- * \par Performance
- * Performance is typically similar to DeviceScan::ExclusiveSum.
- *
- * \par Snippet
- * The code snippet below illustrates the exclusive prefix min-scan of an \p int device vector
- * \par
- * \code
- * #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
- *
- * // CustomMin functor
- * struct CustomMin
- * {
- * template <typename T>
- * CUB_RUNTIME_FUNCTION __forceinline__
- * T operator()(const T &a, const T &b) const {
- * return (b < a) ? b : a;
- * }
- * };
- *
- * // Declare, allocate, and initialize device pointers for input and output
- * int num_items; // e.g., 7
- * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
- * int *d_out; // e.g., [ , , , , , , ]
- * CustomMin min_op
- * ...
- *
- * // Determine temporary device storage requirements for exclusive prefix scan
- * void *d_temp_storage = NULL;
- * size_t temp_storage_bytes = 0;
- * cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, (int) MAX_INT, num_items);
- *
- * // Allocate temporary storage for exclusive prefix scan
- * cudaMalloc(&d_temp_storage, temp_storage_bytes);
- *
- * // Run exclusive prefix min-scan
- * cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, (int) MAX_INT, num_items);
- *
- * // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0]
- *
- * \endcode
- *
- * \tparam InputIterator [inferred] Random-access input iterator type for reading scan input data \iterator
- * \tparam OutputIterator [inferred] Random-access output iterator type for writing scan output data \iterator
- * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b)
- * \tparam Identity [inferred] Type of the \p identity value used Binary scan functor type having member T operator()(const T &a, const T &b)
- */
- template <
- typename InputIterator,
- typename OutputIterator,
- typename ScanOp,
- typename Identity>
- CUB_RUNTIME_FUNCTION
- static cudaError_t ExclusiveScan(
- void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
- InputIterator d_in, ///< [in] Pointer to the input sequence of data items
- OutputIterator d_out, ///< [out] Pointer to the output sequence of data items
- ScanOp scan_op, ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
- Identity identity, ///< [in] Identity element
- int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in)
- cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0.
- bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
- {
- // Signed integer type for global offsets
- typedef int Offset;
-
- return DeviceScanDispatch<InputIterator, OutputIterator, ScanOp, Identity, Offset>::Dispatch(
- d_temp_storage,
- temp_storage_bytes,
- d_in,
- d_out,
- scan_op,
- identity,
- num_items,
- stream,
- debug_synchronous);
- }
-
-
- //@} end member group
- /******************************************************************//**
- * \name Inclusive scans
- *********************************************************************/
- //@{
-
-
- /**
- * \brief Computes a device-wide inclusive prefix sum.
- *
- * \par
- * - Supports non-commutative sum operators.
- * - \devicestorage
- * - \cdp
- *
- * \par Performance
- * Performance is typically similar to DeviceScan::ExclusiveSum.
- *
- * \par Snippet
- * The code snippet below illustrates the inclusive prefix sum of an \p int device vector.
- * \par
- * \code
- * #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
- *
- * // Declare, allocate, and initialize device pointers for input and output
- * int num_items; // e.g., 7
- * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
- * int *d_out; // e.g., [ , , , , , , ]
- * ...
- *
- * // Determine temporary device storage requirements for inclusive prefix sum
- * void *d_temp_storage = NULL;
- * size_t temp_storage_bytes = 0;
- * cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
- *
- * // Allocate temporary storage for inclusive prefix sum
- * cudaMalloc(&d_temp_storage, temp_storage_bytes);
- *
- * // Run inclusive prefix sum
- * cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
- *
- * // d_out <-- [8, 14, 21, 26, 29, 29, 38]
- *
- * \endcode
- *
- * \tparam InputIterator [inferred] Random-access input iterator type for reading scan input data \iterator
- * \tparam OutputIterator [inferred] Random-access output iterator type for writing scan output data \iterator
- */
- template <
- typename InputIterator,
- typename OutputIterator>
- CUB_RUNTIME_FUNCTION
- static cudaError_t InclusiveSum(
- void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
- InputIterator d_in, ///< [in] Pointer to the input sequence of data items
- OutputIterator d_out, ///< [out] Pointer to the output sequence of data items
- int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in)
- cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0.
- bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors.
May cause significant slowdown. Default is \p false.
- {
- // Signed integer type for global offsets
- typedef int Offset;
-
- return DeviceScanDispatch<InputIterator, OutputIterator, Sum, NullType, Offset>::Dispatch(
- d_temp_storage,
- temp_storage_bytes,
- d_in,
- d_out,
- Sum(),
- NullType(),
- num_items,
- stream,
- debug_synchronous);
- }
-
-
- /**
- * \brief Computes a device-wide inclusive prefix scan using the specified binary \p scan_op functor.
- *
- * \par
- * - Supports non-commutative scan operators.
- * - \devicestorage
- * - \cdp
- *
- * \par Performance
- * Performance is typically similar to DeviceScan::ExclusiveSum.
- *
- * \par Snippet
- * The code snippet below illustrates the inclusive prefix min-scan of an \p int device vector.
- * \par
- * \code
- * #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
- *
- * // CustomMin functor
- * struct CustomMin
- * {
- * template <typename T>
- * CUB_RUNTIME_FUNCTION __forceinline__
- * T operator()(const T &a, const T &b) const {
- * return (b < a) ? b : a;
- * }
- * };
- *
- * // Declare, allocate, and initialize device pointers for input and output
- * int num_items; // e.g., 7
- * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
- * int *d_out; // e.g., [ , , , , , , ]
- * CustomMin min_op;
- * ...
- *
- * // Determine temporary device storage requirements for inclusive prefix scan
- * void *d_temp_storage = NULL;
- * size_t temp_storage_bytes = 0;
- * cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, num_items);
- *
- * // Allocate temporary storage for inclusive prefix scan
- * cudaMalloc(&d_temp_storage, temp_storage_bytes);
- *
- * // Run inclusive prefix min-scan
- * cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, num_items);
- *
- * // d_out <-- [8, 6, 6, 5, 3, 0, 0]
- *
- * \endcode
- *
- * \tparam InputIterator [inferred] Random-access input iterator type for reading scan input data \iterator
- * \tparam OutputIterator [inferred] Random-access output iterator type for writing scan output data \iterator
- * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b)
- */
- template <
- typename InputIterator,
- typename OutputIterator,
- typename ScanOp>
- CUB_RUNTIME_FUNCTION
- static cudaError_t InclusiveScan(
- void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
- InputIterator d_in, ///< [in] Pointer to the input sequence of data items
- OutputIterator d_out, ///< [out] Pointer to the output sequence of data items
- ScanOp scan_op, ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
- int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in)
- cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0.
- bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
- {
- // Signed integer type for global offsets
- typedef int Offset;
-
- return DeviceScanDispatch<InputIterator, OutputIterator, ScanOp, NullType, Offset>::Dispatch(
- d_temp_storage,
- temp_storage_bytes,
- d_in,
- d_out,
- scan_op,
- NullType(),
- num_items,
- stream,
- debug_synchronous);
- }
-
- //@} end member group
-
-};
-
-/**
- * \example example_device_scan.cu
- */
-
-} // CUB namespace
-CUB_NS_POSTFIX // Optional outer namespace(s)
-
-
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "dispatch/dispatch_scan.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within device-accessible memory. ![](device_scan.png)
+ * \ingroup SingleModule
+ *
+ * \par Overview
+ * Given a sequence of input elements and a binary reduction operator, a [prefix scan](http://en.wikipedia.org/wiki/Prefix_sum)
+ * produces an output sequence where each element is computed to be the reduction
+ * of the elements occurring earlier in the input sequence. Prefix sum
+ * connotes a prefix scan with the addition operator. The term \em inclusive indicates
+ * that the ith output reduction incorporates the ith input.
+ * The term \em exclusive indicates the ith input is not incorporated into
+ * the ith output reduction.
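+ *
+ * \par
+ * For example, given the input sequence [8, 6, 7, 5], an inclusive prefix sum
+ * produces [8, 14, 21, 26], while an exclusive prefix sum (seeded with an
+ * initial value of 0) produces [0, 8, 14, 21].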
+ *
+ * \par
+ * As of CUB 1.0.1 (2013), CUB's device-wide scan APIs have implemented our "decoupled look-back" algorithm
+ * for performing global prefix scan with only a single pass through the
+ * input data, as described in our 2016 technical report [1]. The central
+ * idea is to leverage a small, constant factor of redundant work in order to overlap the latencies
+ * of global prefix propagation with local computation. As such, our algorithm requires only
+ * ~2n data movement (n inputs are read, n outputs are written), and typically
+ * proceeds at "memcpy" speeds.
+ *
+ * \par
+ * [1] [Duane Merrill and Michael Garland. "Single-pass Parallel Prefix Scan with Decoupled Look-back", NVIDIA Technical Report NVR-2016-002, 2016.](https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back)
+ *
+ * \par Usage Considerations
+ * \cdp_class{DeviceScan}
+ *
+ * \par Performance
+ * \linear_performance{prefix scan}
+ *
+ * \par
+ * The following chart illustrates DeviceScan::ExclusiveSum
+ * performance across different CUDA architectures for \p int32 keys.
+ * \plots_below
+ *
+ * \image html scan_int32.png
+ *
+ */
+struct DeviceScan
+{
+ /******************************************************************//**
+ * \name Exclusive scans
+ *********************************************************************/
+ //@{
+
+ /**
+ * \brief Computes a device-wide exclusive prefix sum. The value of 0 is applied as the initial value, and is assigned to *d_out.
+ *
+ * \par
+ * - Supports non-commutative sum operators.
+ * - Provides "run-to-run" determinism for pseudo-associative reduction
+ * (e.g., addition of floating point types) on the same GPU device.
+ * However, results for pseudo-associative reduction may be inconsistent
+ * from one device to another device of a different compute-capability
+ * because CUB can employ different tile-sizing for different architectures.
+ * - \devicestorage
+ *
+ * \par Performance
+ * The following charts illustrate saturated exclusive sum performance across different
+ * CUDA architectures for \p int32 and \p int64 items, respectively.
+ *
+ * \image html scan_int32.png
+ * \image html scan_int64.png
+ *
+ * \par Snippet
+ * The code snippet below illustrates the exclusive prefix sum of an \p int device vector.
+ * \par
+ * \code
+ * #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
+ *
+ * // Declare, allocate, and initialize device-accessible pointers for input and output
+ * int num_items; // e.g., 7
+ * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * int *d_out; // e.g., [ , , , , , , ]
+ * ...
+ *
+ * // Determine temporary device storage requirements
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+ *
+ * // Allocate temporary storage
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run exclusive prefix sum
+ * cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+ *
+ * // d_out <-- [0, 8, 14, 21, 26, 29, 29]
+ *
+ * \endcode
+ *
+ * \tparam InputIteratorT [inferred] Random-access input iterator type for reading scan inputs \iterator
+ * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing scan outputs \iterator
+ */
+ template <
+ typename InputIteratorT,
+ typename OutputIteratorT>
+ CUB_RUNTIME_FUNCTION
+ static cudaError_t ExclusiveSum(
+ void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage.
When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+ InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
+ OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items
+ int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in)
+ cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0.
+ bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
+ {
+ // Signed integer type for global offsets
+ typedef int OffsetT;
+
+ // The output value type
+ typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ?
+ typename std::iterator_traits<InputIteratorT>::value_type, // ... then the input iterator's value type,
+ typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT; // ... else the output iterator's value type
+
+ // Initial value
+ OutputT init_value = 0;
+
+ return DispatchScan<InputIteratorT, OutputIteratorT, Sum, OutputT, OffsetT>::Dispatch(
+ d_temp_storage,
+ temp_storage_bytes,
+ d_in,
+ d_out,
+ Sum(),
+ init_value,
+ num_items,
+ stream,
+ debug_synchronous);
+ }
+
+
+ /**
+ * \brief Computes a device-wide exclusive prefix scan using the specified binary \p scan_op functor. The \p init_value is applied as the initial value, and is assigned to *d_out.
+ *
+ * \par
+ * - Supports non-commutative scan operators.
+ * - Provides "run-to-run" determinism for pseudo-associative reduction
+ * (e.g., addition of floating point types) on the same GPU device.
+ * However, results for pseudo-associative reduction may be inconsistent
+ * from one device to another device of a different compute-capability
+ * because CUB can employ different tile-sizing for different architectures.
+ * - \devicestorage
+ *
+ * \par Snippet
+ * The code snippet below illustrates the exclusive prefix min-scan of an \p int device vector.
+ * \par
+ * \code
+ * #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
+ *
+ * // CustomMin functor
+ * struct CustomMin
+ * {
+ * template <typename T>
+ * CUB_RUNTIME_FUNCTION __forceinline__
+ * T operator()(const T &a, const T &b) const {
+ * return (b < a) ? b : a;
+ * }
+ * };
+ *
+ * // Declare, allocate, and initialize device-accessible pointers for input and output
+ * int num_items; // e.g., 7
+ * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * int *d_out; // e.g., [ , , , , , , ]
+ * CustomMin min_op;
+ * ...
+ *
+ * // Determine temporary device storage requirements for exclusive prefix scan
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, INT_MAX, num_items);
+ *
+ * // Allocate temporary storage for exclusive prefix scan
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run exclusive prefix min-scan
+ * cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, INT_MAX, num_items);
+ *
+ * // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0]
+ *
+ * \endcode
+ *
+ * \tparam InputIteratorT [inferred] Random-access input iterator type for reading scan inputs \iterator
+ * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing scan outputs \iterator
+ * \tparam ScanOpT [inferred] Binary scan functor type having member T operator()(const T &a, const T &b)
+ * \tparam InitValueT [inferred] Type of the \p init_value used to seed the exclusive scan
+ */
+ template <
+ typename InputIteratorT,
+ typename OutputIteratorT,
+ typename ScanOpT,
+ typename InitValueT>
+ CUB_RUNTIME_FUNCTION
+ static cudaError_t ExclusiveScan(
+ void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+ InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
+ OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items
+ ScanOpT scan_op, ///< [in] Binary scan functor
+ InitValueT init_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to *d_out)
+ int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in)
+ cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0.
+ bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
+ {
+ // Signed integer type for global offsets
+ typedef int OffsetT;
+
+ return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, InitValueT, OffsetT>::Dispatch(
+ d_temp_storage,
+ temp_storage_bytes,
+ d_in,
+ d_out,
+ scan_op,
+ init_value,
+ num_items,
+ stream,
+ debug_synchronous);
+ }
+
+
+ //@} end member group
+ /******************************************************************//**
+ * \name Inclusive scans
+ *********************************************************************/
+ //@{
+
+
+ /**
+ * \brief Computes a device-wide inclusive prefix sum.
+ *
+ * \par
+ * - Supports non-commutative sum operators.
+ * - Provides "run-to-run" determinism for pseudo-associative reduction
+ * (e.g., addition of floating point types) on the same GPU device.
+ * However, results for pseudo-associative reduction may be inconsistent
+ * from one device to another device of a different compute-capability
+ * because CUB can employ different tile-sizing for different architectures.
+ * - \devicestorage
+ *
+ * \par Snippet
+ * The code snippet below illustrates the inclusive prefix sum of an \p int device vector.
+ * \par
+ * \code
+ * #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
+ *
+ * // Declare, allocate, and initialize device-accessible pointers for input and output
+ * int num_items; // e.g., 7
+ * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * int *d_out; // e.g., [ , , , , , , ]
+ * ...
+ *
+ * // Determine temporary device storage requirements for inclusive prefix sum
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+ *
+ * // Allocate temporary storage for inclusive prefix sum
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run inclusive prefix sum
+ * cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+ *
+ * // d_out <-- [8, 14, 21, 26, 29, 29, 38]
+ *
+ * \endcode
+ *
+ * \tparam InputIteratorT [inferred] Random-access input iterator type for reading scan inputs \iterator
+ * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing scan outputs \iterator
+ */
+ template <
+ typename InputIteratorT,
+ typename OutputIteratorT>
+ CUB_RUNTIME_FUNCTION
+ static cudaError_t InclusiveSum(
+ void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+ size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+ InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
+ OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items
+ int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in)
+ cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0.
+ bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
+ {
+ // Signed integer type for global offsets
+ typedef int OffsetT;
+
+ return DispatchScan<InputIteratorT, OutputIteratorT, Sum, NullType, OffsetT>::Dispatch(
+ d_temp_storage,
+ temp_storage_bytes,
+ d_in,
+ d_out,
+ Sum(),
+ NullType(),
+ num_items,
+ stream,
+ debug_synchronous);
+ }
+
+
+ /**
+ * \brief Computes a device-wide inclusive prefix scan using the specified binary \p scan_op functor.
+ *
+ * \par
+ * - Supports non-commutative scan operators.
+ * - Provides "run-to-run" determinism for pseudo-associative reduction
+ * (e.g., addition of floating point types) on the same GPU device.
+ * However, results for pseudo-associative reduction may be inconsistent
+ * from one device to another device of a different compute-capability
+ * because CUB can employ different tile-sizing for different architectures.
+ * - \devicestorage
+ *
+ * \par Snippet
+ * The code snippet below illustrates the inclusive prefix min-scan of an \p int device vector.
+ * \par
+ * \code
+ * #include <cub/cub.cuh> // or equivalently <cub/device/device_scan.cuh>
+ *
+ * // CustomMin functor
+ * struct CustomMin
+ * {
+ * template <typename T>
+ * CUB_RUNTIME_FUNCTION __forceinline__
+ * T operator()(const T &a, const T &b) const {
+ * return (b < a) ? b : a;
+ * }
+ * };
+ *
+ * // Declare, allocate, and initialize device-accessible pointers for input and output
+ * int num_items; // e.g., 7
+ * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * int *d_out; // e.g., [ , , , , , , ]
+ * CustomMin min_op;
+ * ...
+ *
+ * // Determine temporary device storage requirements for inclusive prefix scan
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, num_items);
+ *
+ * // Allocate temporary storage for inclusive prefix scan
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run inclusive prefix min-scan
+ * cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, num_items);
+ *
+ * // d_out <-- [8, 6, 6, 5, 3, 0, 0]
+ *
+ * \endcode
+ *
+ * \tparam InputIteratorT [inferred] Random-access input iterator type for reading scan inputs \iterator
+ * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing scan outputs \iterator
+ * \tparam ScanOpT [inferred] Binary scan functor type having member T operator()(const T &a, const T &b)
+ */
+ template <
+ typename InputIteratorT,
+ typename OutputIteratorT,
+ typename ScanOpT>
+ CUB_RUNTIME_FUNCTION
+ static cudaError_t InclusiveScan(
+ void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+ InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
+ OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items
+ ScanOpT scan_op, ///< [in] Binary scan functor
+ int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in)
+ cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0.
+ bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
+ {
+ // Signed integer type for global offsets
+ typedef int OffsetT;
+
+ return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, NullType, OffsetT>::Dispatch(
+ d_temp_storage,
+ temp_storage_bytes,
+ d_in,
+ d_out,
+ scan_op,
+ NullType(),
+ num_items,
+ stream,
+ debug_synchronous);
+ }
+
+ //@} end member group
+
+};
+
+/**
+ * \example example_device_scan.cu
+ */
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
+
+
diff --git a/SRC/cub/device/device_segmented_radix_sort.cuh b/SRC/cub/device/device_segmented_radix_sort.cuh
new file mode 100644
index 00000000..0d360762
--- /dev/null
+++ b/SRC/cub/device/device_segmented_radix_sort.cuh
@@ -0,0 +1,876 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of the NVIDIA CORPORATION nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceSegmentedRadixSort provides device-wide, parallel operations for computing a batched radix sort across multiple, non-overlapping sequences of data items residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <algorithm>
+
+#include "dispatch/dispatch_radix_sort.cuh"
+#include "../util_arch.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief DeviceSegmentedRadixSort provides device-wide, parallel operations for computing a batched radix sort across multiple, non-overlapping sequences of data items residing within device-accessible memory. ![](segmented_sorting_logo.png)
+ * \ingroup SegmentedModule
+ *
+ * \par Overview
+ * The [radix sorting method](http://en.wikipedia.org/wiki/Radix_sort) arranges
+ * items into ascending (or descending) order. The algorithm relies upon a positional representation for
+ * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits,
+ * characters, etc.) specified from least-significant to most-significant. For a
+ * given input sequence of keys and a set of rules specifying a total ordering
+ * of the symbolic alphabet, the radix sorting method produces a lexicographic
+ * ordering of those keys.
+ *
+ * \par
+ * DeviceSegmentedRadixSort can sort all of the built-in C++ numeric primitive types
+ * (unsigned char, \p int, \p double, etc.) as well as CUDA's \p __half
+ * half-precision floating-point type. Although the direct radix sorting
+ * method can only be applied to unsigned integral types, DeviceSegmentedRadixSort
+ * is able to sort signed and floating-point types via simple bit-wise transformations
+ * that ensure lexicographic key ordering.
+ *
+ * \par Usage Considerations
+ * \cdp_class{DeviceSegmentedRadixSort}
+ *
+ */
+struct DeviceSegmentedRadixSort
+{
+
+ /******************************************************************//**
+ * \name Key-value pairs
+ *********************************************************************/
+ //@{
+
+ /**
+ * \brief Sorts segments of key-value pairs into ascending order. (~2N auxiliary storage required)
+ *
+ * \par
+ * - The contents of the input data are not altered by the sorting operation.
+ * - When the input is a contiguous sequence of segments, a single sequence
+ * \p segment_offsets (of length num_segments+1) can be aliased
+ * for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+ * the latter is specified as segment_offsets+1).
+ * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified.
This can reduce overall sorting overhead and yield a corresponding performance improvement.
+ * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
+ * - \devicestorage
+ *
+ * \par Snippet
+ * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys
+ * with an associated vector of \p int values.
+ * \par
+ * \code
+ * #include <cub/cub.cuh> // or equivalently <cub/device/device_segmented_radix_sort.cuh>
+ *
+ * // Declare, allocate, and initialize device-accessible pointers for sorting data
+ * int num_items; // e.g., 7
+ * int num_segments; // e.g., 3
+ * int *d_offsets; // e.g., [0, 3, 3, 7]
+ * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
+ * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6]
+ * int *d_values_out; // e.g., [-, -, -, -, -, -, -]
+ * ...
+ *
+ * // Determine temporary device storage requirements
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
+ * d_keys_in, d_keys_out, d_values_in, d_values_out,
+ * num_items, num_segments, d_offsets, d_offsets + 1);
+ *
+ * // Allocate temporary storage
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run sorting operation
+ * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
+ * d_keys_in, d_keys_out, d_values_in, d_values_out,
+ * num_items, num_segments, d_offsets, d_offsets + 1);
+ *
+ * // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9]
+ * // d_values_out <-- [1, 2, 0, 5, 4, 3, 6]
+ *
+ * \endcode
+ *
+ * \tparam KeyT [inferred] Key type
+ * \tparam ValueT [inferred] Value type
+ * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator
+ */
+ template <
+ typename KeyT,
+ typename ValueT,
+ typename OffsetIteratorT>
+ CUB_RUNTIME_FUNCTION
+ static cudaError_t SortPairs(
+ void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+ const KeyT *d_keys_in, ///< [in] %Device-accessible pointer to the input sequence of key data to sort
+ KeyT *d_keys_out, ///< [out] %Device-accessible pointer to the sorted output sequence of key data
+ const ValueT *d_values_in, ///< [in] %Device-accessible pointer to the corresponding input sequence of associated value items
+ ValueT *d_values_out, ///< [out] %Device-accessible pointer to the correspondingly-reordered output sequence of associated value items
+ int num_items, ///< [in] The total number of items to sort (across all segments)
+ int num_segments, ///< [in] The number of segments that comprise the sorting data
+ OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_*
+ OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith segment is considered empty.
+ int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison
+ int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+ cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0.
+ bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
+ {
+ // Signed integer type for global offsets
+ typedef int OffsetT;
+
+ DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
+ DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out);
+
+ return DispatchSegmentedRadixSort<false, KeyT, ValueT, OffsetIteratorT, OffsetT>::Dispatch(
+ d_temp_storage,
+ temp_storage_bytes,
+ d_keys,
+ d_values,
+ num_items,
+ num_segments,
+ d_begin_offsets,
+ d_end_offsets,
+ begin_bit,
+ end_bit,
+ false, // is_overwrite_okay: the input buffers must not be overwritten
+ stream,
+ debug_synchronous);
+ }
+
+
+ /**
+ * \brief Sorts segments of key-value pairs into ascending order. (~N auxiliary storage required)
+ *
+ * \par
+ * - The sorting operation is given a pair of key buffers and a corresponding
+ * pair of associated value buffers. Each pair is managed by a DoubleBuffer
+ * structure that indicates which of the two buffers is "current" (and thus
+ * contains the input data to be sorted).
+ * - The contents of both buffers within each pair may be altered by the sorting
+ * operation.
+ * - Upon completion, the sorting operation will update the "current" indicator
+ * within each DoubleBuffer wrapper to reference which of the two buffers
+ * now contains the sorted output sequence (a function of the number of key bits
+ * specified and the targeted device architecture).
+ * - When the input is a contiguous sequence of segments, a single sequence
+ * \p segment_offsets (of length num_segments+1) can be aliased
+ * for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+ * the latter is specified as segment_offsets+1).
+ * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement.
+ * - \devicestorageP
+ * - \devicestorage
+ *
+ * \par Snippet
+ * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys
+ * with an associated vector of \p int values.
+ * \par
+ * \code
+ * #include <cub/cub.cuh> // or equivalently <cub/device/device_segmented_radix_sort.cuh>
+ *
+ * // Declare, allocate, and initialize device-accessible pointers for sorting data
+ * int num_items; // e.g., 7
+ * int num_segments; // e.g., 3
+ * int *d_offsets; // e.g., [0, 3, 3, 7]
+ * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -]
+ * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6]
+ * int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -]
+ * ...
+ *
+ * // Create a set of DoubleBuffers to wrap pairs of device pointers
+ * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+ * cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
+ *
+ * // Determine temporary device storage requirements
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values,
+ * num_items, num_segments, d_offsets, d_offsets + 1);
+ *
+ * // Allocate temporary storage
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run sorting operation
+ * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values,
+ * num_items, num_segments, d_offsets, d_offsets + 1);
+ *
+ * // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9]
+ * // d_values.Current() <-- [1, 2, 0, 5, 4, 3, 6]
+ *
+ * \endcode
+ *
+ * \tparam KeyT [inferred] Key type
+ * \tparam ValueT [inferred] Value type
+ * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator
+ */
+ template <
+ typename KeyT,
+ typename ValueT,
+ typename OffsetIteratorT>
+ CUB_RUNTIME_FUNCTION
+ static cudaError_t SortPairs(
+ void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+ DoubleBuffer<KeyT> &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+ DoubleBuffer<ValueT> &d_values, ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+ int num_items, ///< [in] The total number of items to sort (across all segments)
+ int num_segments, ///< [in] The number of segments that comprise the sorting data
+ OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_*
+ OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith segment is considered empty.
+ int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison
+ int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+ cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0.
+ bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
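+ // Note: unlike the preceding overload, this variant sorts in place within the
+ // caller's two buffers (hence only ~N auxiliary storage), and either buffer
+ // may end up holding the sorted result, as reported by d_keys.Current().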
+ {
+ // Signed integer type for global offsets
+ typedef int OffsetT;
+
+ return DispatchSegmentedRadixSort<false, KeyT, ValueT, OffsetIteratorT, OffsetT>::Dispatch(
+ d_temp_storage,
+ temp_storage_bytes,
+ d_keys,
+ d_values,
+ num_items,
+ num_segments,
+ d_begin_offsets,
+ d_end_offsets,
+ begin_bit,
+ end_bit,
+ true, // is_overwrite_okay: sorting may be performed within the caller's buffers
+ stream,
+ debug_synchronous);
+ }
+
+
+ /**
+ * \brief Sorts segments of key-value pairs into descending order. (~2N auxiliary storage required).
+ *
+ * \par
+ * - The contents of the input data are not altered by the sorting operation.
+ * - When the input is a contiguous sequence of segments, a single sequence
+ * \p segment_offsets (of length num_segments+1) can be aliased
+ * for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+ * the latter is specified as segment_offsets+1).
+ * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement.
+ * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
+ * - \devicestorage
+ *
+ * \par Snippet
+ * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys
+ * with an associated vector of \p int values.
+ * \par
+ * \code
+ * #include <cub/cub.cuh> // or equivalently <cub/device/device_segmented_radix_sort.cuh>
+ *
+ * // Declare, allocate, and initialize device-accessible pointers for sorting data
+ * int num_items; // e.g., 7
+ * int num_segments; // e.g., 3
+ * int *d_offsets; // e.g., [0, 3, 3, 7]
+ * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
+ * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6]
+ * int *d_values_out; // e.g., [-, -, -, -, -, -, -]
+ * ...
+ *
+ * // Determine temporary device storage requirements
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes,
+ * d_keys_in, d_keys_out, d_values_in, d_values_out,
+ * num_items, num_segments, d_offsets, d_offsets + 1);
+ *
+ * // Allocate temporary storage
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run sorting operation
+ * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes,
+ * d_keys_in, d_keys_out, d_values_in, d_values_out,
+ * num_items, num_segments, d_offsets, d_offsets + 1);
+ *
+ * // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0]
+ * // d_values_out <-- [0, 2, 1, 6, 3, 4, 5]
+ *
+ * \endcode
+ *
+ * \tparam KeyT [inferred] Key type
+ * \tparam ValueT [inferred] Value type
+ * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator
+ */
+ template <
+ typename KeyT,
+ typename ValueT,
+ typename OffsetIteratorT>
+ CUB_RUNTIME_FUNCTION
+ static cudaError_t SortPairsDescending(
+ void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+ const KeyT *d_keys_in, ///< [in] %Device-accessible pointer to the input sequence of key data to sort
+ KeyT *d_keys_out, ///< [out] %Device-accessible pointer to the sorted output sequence of key data
+ const ValueT *d_values_in, ///< [in] %Device-accessible pointer to the corresponding input sequence of associated value items
+ ValueT *d_values_out, ///< [out] %Device-accessible pointer to the correspondingly-reordered output sequence of associated value items
+ int num_items, ///< [in] The total number of items to sort (across all segments)
+ int num_segments, ///< [in] The number of segments that comprise the sorting data
+ OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_*
+ OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith segment is considered empty.
+ int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison
+ int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+ cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0.
+ bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
+ {
+ // Signed integer type for global offsets
+ typedef int OffsetT;
+
+ DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
+ DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out);
+
+ return DispatchSegmentedRadixSort<true, KeyT, ValueT, OffsetIteratorT, OffsetT>::Dispatch(
+ d_temp_storage,
+ temp_storage_bytes,
+ d_keys,
+ d_values,
+ num_items,
+ num_segments,
+ d_begin_offsets,
+ d_end_offsets,
+ begin_bit,
+ end_bit,
+ false, // is_overwrite_okay: the input buffers must not be overwritten
+ stream,
+ debug_synchronous);
+ }
+
+
+ /**
+ * \brief Sorts segments of key-value pairs into descending order. (~N auxiliary storage required).
+ *
+ * \par
+ * - The sorting operation is given a pair of key buffers and a corresponding
+ * pair of associated value buffers. Each pair is managed by a DoubleBuffer
+ * structure that indicates which of the two buffers is "current" (and thus
+ * contains the input data to be sorted).
+ * - The contents of both buffers within each pair may be altered by the sorting
+ * operation.
+ * - Upon completion, the sorting operation will update the "current" indicator
+ * within each DoubleBuffer wrapper to reference which of the two buffers
+ * now contains the sorted output sequence (a function of the number of key bits
+ * specified and the targeted device architecture).
+ * - When the input is a contiguous sequence of segments, a single sequence
+ * \p segment_offsets (of length num_segments+1) can be aliased
+ * for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+ * the latter is specified as segment_offsets+1).
+ * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement.
+ * - \devicestorageP
+ * - \devicestorage
+ *
+ * \par Snippet
+ * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys
+ * with an associated vector of \p int values.
+ * \par
+ * \code
+ * #include <cub/cub.cuh> // or equivalently <cub/device/device_segmented_radix_sort.cuh>
+ *
+ * // Declare, allocate, and initialize device-accessible pointers for sorting data
+ * int num_items; // e.g., 7
+ * int num_segments; // e.g., 3
+ * int *d_offsets; // e.g., [0, 3, 3, 7]
+ * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -]
+ * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6]
+ * int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -]
+ * ...
+ *
+ * // Create a set of DoubleBuffers to wrap pairs of device pointers
+ * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+ * cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
+ *
+ * // Determine temporary device storage requirements
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values,
+ * num_items, num_segments, d_offsets, d_offsets + 1);
+ *
+ * // Allocate temporary storage
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run sorting operation
+ * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values,
+ * num_items, num_segments, d_offsets, d_offsets + 1);
+ *
+ * // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0]
+ * // d_values.Current() <-- [0, 2, 1, 6, 3, 4, 5]
+ *
+ * \endcode
+ *
+ * \tparam KeyT [inferred] Key type
+ * \tparam ValueT [inferred] Value type
+ * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator
+ */
+ template <
+ typename KeyT,
+ typename ValueT,
+ typename OffsetIteratorT>
+ CUB_RUNTIME_FUNCTION
+ static cudaError_t SortPairsDescending(
+ void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+ DoubleBuffer<KeyT> &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+ DoubleBuffer<ValueT> &d_values, ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+ int num_items, ///< [in] The total number of items to sort (across all segments)
+ int num_segments, ///< [in] The number of segments that comprise the sorting data
+ OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_*
+ OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith segment is considered empty.
+ int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison
+ int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+ cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0.
+ bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
+ {
+ // Signed integer type for global offsets
+ typedef int OffsetT;
+
+ return DispatchSegmentedRadixSort<true, KeyT, ValueT, OffsetIteratorT, OffsetT>::Dispatch(
+ d_temp_storage,
+ temp_storage_bytes,
+ d_keys,
+ d_values,
+ num_items,
+ num_segments,
+ d_begin_offsets,
+ d_end_offsets,
+ begin_bit,
+ end_bit,
+ true, // is_overwrite_okay: sorting may be performed within the caller's buffers
+ stream,
+ debug_synchronous);
+ }
+
+
+ //@} end member group
+ /******************************************************************//**
+ * \name Keys-only
+ *********************************************************************/
+ //@{
+
+
+ /**
+ * \brief Sorts segments of keys into ascending order. (~2N auxiliary storage required)
+ *
+ * \par
+ * - The contents of the input data are not altered by the sorting operation.
+ * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement.
+ * - When the input is a contiguous sequence of segments, a single sequence
+ * \p segment_offsets (of length num_segments+1) can be aliased
+ * for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+ * the latter is specified as segment_offsets+1).
+ * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
+ * - \devicestorage
+ *
+ * \par Snippet
+ * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys.
+ * \par
+ * \code
+ * #include <cub/cub.cuh> // or equivalently <cub/device/device_segmented_radix_sort.cuh>
+ *
+ * // Declare, allocate, and initialize device-accessible pointers for sorting data
+ * int num_items; // e.g., 7
+ * int num_segments; // e.g., 3
+ * int *d_offsets; // e.g., [0, 3, 3, 7]
+ * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
+ * int *d_keys_out; // e.g., [-, -, -, -, -, -, -]
+ * ...
+ *
+ * // Determine temporary device storage requirements
+ * void *d_temp_storage = NULL;
+ * size_t temp_storage_bytes = 0;
+ * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
+ * num_items, num_segments, d_offsets, d_offsets + 1);
+ *
+ * // Allocate temporary storage
+ * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+ *
+ * // Run sorting operation
+ * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
+ * num_items, num_segments, d_offsets, d_offsets + 1);
+ *
+ * // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9]
+ *
+ * \endcode
+ *
+ * \tparam KeyT [inferred] Key type
+ * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator
+ */
+ template <
+ typename KeyT,
+ typename OffsetIteratorT>
+ CUB_RUNTIME_FUNCTION
+ static cudaError_t SortKeys(
+ void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+     size_t               &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+     const KeyT           *d_keys_in,          ///< [in] %Device-accessible pointer to the input data of key data to sort
+     KeyT                 *d_keys_out,         ///< [out] %Device-accessible pointer to the sorted output sequence of key data
+     int                  num_items,           ///< [in] The total number of items to sort (across all segments)
+     int                  num_segments,        ///< [in] The number of segments that comprise the sorting data
+     OffsetIteratorT      d_begin_offsets,     ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_*
+     OffsetIteratorT      d_end_offsets,       ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*.  If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith segment is considered empty.
+     int                  begin_bit = 0,       ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison
+     int                  end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+     cudaStream_t         stream = 0,          ///< [in] [optional] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+     bool                 debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+ {
+     // Signed integer type for global offsets
+     typedef int OffsetT;
+
+     // Null value type
+     DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
+     DoubleBuffer<NullType> d_values;
+
+     return DispatchSegmentedRadixSort<false, KeyT, NullType, OffsetIteratorT, OffsetT>::Dispatch(
+         d_temp_storage,
+         temp_storage_bytes,
+         d_keys,
+         d_values,
+         num_items,
+         num_segments,
+         d_begin_offsets,
+         d_end_offsets,
+         begin_bit,
+         end_bit,
+         false,
+         stream,
+         debug_synchronous);
+ }
+
+
+ /**
+  * \brief Sorts segments of keys into ascending order. (~N auxiliary storage required).
+  *
+  * \par
+  * - The sorting operation is given a pair of key buffers managed by a
+  *   DoubleBuffer structure that indicates which of the two buffers is
+  *   "current" (and thus contains the input data to be sorted).
+  * - The contents of both buffers may be altered by the sorting operation.
+  * - Upon completion, the sorting operation will update the "current" indicator
+  *   within the DoubleBuffer wrapper to reference which of the two buffers
+  *   now contains the sorted output sequence (a function of the number of key bits
+  *   specified and the targeted device architecture).
+  * - When the input is a contiguous sequence of segments, a single sequence
+  *   \p segment_offsets (of length num_segments+1) can be aliased
+  *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+  *   the latter is specified as segment_offsets+1).
+  * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+  * - \devicestorageP
+  * - \devicestorage
+  *
+  * \par Snippet
+  * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys.
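+  * (The zero-length segment spanned by the offset pair [3, 3) is legal; it simply
+  * contributes no items to the sort.)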
+  * \par
+  * \code
+  * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmented_radix_sort.cuh>
+  *
+  * // Declare, allocate, and initialize device-accessible pointers for sorting data
+  * int  num_items;        // e.g., 7
+  * int  num_segments;     // e.g., 3
+  * int  *d_offsets;       // e.g., [0, 3, 3, 7]
+  * int  *d_key_buf;       // e.g., [8, 6, 7, 5, 3, 0, 9]
+  * int  *d_key_alt_buf;   // e.g., [-, -, -, -, -, -, -]
+  * ...
+  *
+  * // Create a DoubleBuffer to wrap the pair of device pointers
+  * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+  *
+  * // Determine temporary device storage requirements
+  * void   *d_temp_storage = NULL;
+  * size_t temp_storage_bytes = 0;
+  * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys,
+  *     num_items, num_segments, d_offsets, d_offsets + 1);
+  *
+  * // Allocate temporary storage
+  * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+  *
+  * // Run sorting operation
+  * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys,
+  *     num_items, num_segments, d_offsets, d_offsets + 1);
+  *
+  * // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9]
+  *
+  * \endcode
+  *
+  * \tparam KeyT             [inferred] Key type
+  * \tparam OffsetIteratorT  [inferred] Random-access input iterator type for reading segment offsets \iterator
+  */
+ template <
+     typename             KeyT,
+     typename             OffsetIteratorT>
+ CUB_RUNTIME_FUNCTION
+ static cudaError_t SortKeys(
+     void                 *d_temp_storage,     ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+     size_t               &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+     DoubleBuffer<KeyT>   &d_keys,             ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+     int                  num_items,           ///< [in] The total number of items to sort (across all segments)
+     int                  num_segments,        ///< [in] The number of segments that comprise the sorting data
+     OffsetIteratorT      d_begin_offsets,     ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_*
+     OffsetIteratorT      d_end_offsets,       ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*.  If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith segment is considered empty.
+     int                  begin_bit = 0,       ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison
+     int                  end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+     cudaStream_t         stream = 0,          ///< [in] [optional] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+     bool                 debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
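+     // Note: as in the pair-sorting overloads above, the keys are paired with a
+     // null value type below and dispatched down the ascending path; the trailing
+     // 'true' argument indicates that both key buffers may be overwritten.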
+ {
+     // Signed integer type for global offsets
+     typedef int OffsetT;
+
+     // Null value type
+     DoubleBuffer<NullType> d_values;
+
+     return DispatchSegmentedRadixSort<false, KeyT, NullType, OffsetIteratorT, OffsetT>::Dispatch(
+         d_temp_storage,
+         temp_storage_bytes,
+         d_keys,
+         d_values,
+         num_items,
+         num_segments,
+         d_begin_offsets,
+         d_end_offsets,
+         begin_bit,
+         end_bit,
+         true,
+         stream,
+         debug_synchronous);
+ }
+
+ /**
+  * \brief Sorts segments of keys into descending order. (~2N auxiliary storage required).
+  *
+  * \par
+  * - The contents of the input data are not altered by the sorting operation
+  * - When the input is a contiguous sequence of segments, a single sequence
+  *   \p segment_offsets (of length num_segments+1) can be aliased
+  *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+  *   the latter is specified as segment_offsets+1).
+  * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+  * - \devicestorageNP  For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
+  * - \devicestorage
+  *
+  * \par Snippet
+  * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys.
+  * \par
+  * \code
+  * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmented_radix_sort.cuh>
+  *
+  * // Declare, allocate, and initialize device-accessible pointers for sorting data
+  * int  num_items;      // e.g., 7
+  * int  num_segments;   // e.g., 3
+  * int  *d_offsets;     // e.g., [0, 3, 3, 7]
+  * int  *d_keys_in;     // e.g., [8, 6, 7, 5, 3, 0, 9]
+  * int  *d_keys_out;    // e.g., [-, -, -, -, -, -, -]
+  * ...
+  *
+  * // Determine temporary device storage requirements
+  * void   *d_temp_storage = NULL;
+  * size_t temp_storage_bytes = 0;
+  * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
+  *     num_items, num_segments, d_offsets, d_offsets + 1);
+  *
+  * // Allocate temporary storage
+  * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+  *
+  * // Run sorting operation
+  * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
+  *     num_items, num_segments, d_offsets, d_offsets + 1);
+  *
+  * // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0]
+  *
+  * \endcode
+  *
+  * \tparam KeyT             [inferred] Key type
+  * \tparam OffsetIteratorT  [inferred] Random-access input iterator type for reading segment offsets \iterator
+  */
+ template <
+     typename             KeyT,
+     typename             OffsetIteratorT>
+ CUB_RUNTIME_FUNCTION
+ static cudaError_t SortKeysDescending(
+     void                 *d_temp_storage,     ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+     size_t               &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+     const KeyT           *d_keys_in,          ///< [in] %Device-accessible pointer to the input data of key data to sort
+     KeyT                 *d_keys_out,         ///< [out] %Device-accessible pointer to the sorted output sequence of key data
+     int                  num_items,           ///< [in] The total number of items to sort (across all segments)
+     int                  num_segments,        ///< [in] The number of segments that comprise the sorting data
+     OffsetIteratorT      d_begin_offsets,     ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_*
+     OffsetIteratorT      d_end_offsets,       ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*.  If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith segment is considered empty.
+     int                  begin_bit = 0,       ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison
+     int                  end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+     cudaStream_t         stream = 0,          ///< [in] [optional] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+     bool                 debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+ {
+     // Signed integer type for global offsets
+     typedef int OffsetT;
+
+     DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
+     DoubleBuffer<NullType> d_values;
+
+     return DispatchSegmentedRadixSort<true, KeyT, NullType, OffsetIteratorT, OffsetT>::Dispatch(
+         d_temp_storage,
+         temp_storage_bytes,
+         d_keys,
+         d_values,
+         num_items,
+         num_segments,
+         d_begin_offsets,
+         d_end_offsets,
+         begin_bit,
+         end_bit,
+         false,
+         stream,
+         debug_synchronous);
+ }
+
+
+ /**
+  * \brief Sorts segments of keys into descending order. (~N auxiliary storage required).
+  *
+  * \par
+  * - The sorting operation is given a pair of key buffers managed by a
+  *   DoubleBuffer structure that indicates which of the two buffers is
+  *   "current" (and thus contains the input data to be sorted).
+  * - The contents of both buffers may be altered by the sorting operation.
+  * - Upon completion, the sorting operation will update the "current" indicator
+  *   within the DoubleBuffer wrapper to reference which of the two buffers
+  *   now contains the sorted output sequence (a function of the number of key bits
+  *   specified and the targeted device architecture).
+  * - When the input is a contiguous sequence of segments, a single sequence
+  *   \p segment_offsets (of length num_segments+1) can be aliased
+  *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+  *   the latter is specified as segment_offsets+1).
+  * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
+  * - \devicestorageP
+  * - \devicestorage
+  *
+  * \par Snippet
+  * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys.
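+  * (After the call, only d_keys.Current() references the sorted output; the
+  * alternate buffer should be treated as scratch.)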
+  * \par
+  * \code
+  * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmented_radix_sort.cuh>
+  *
+  * // Declare, allocate, and initialize device-accessible pointers for sorting data
+  * int  num_items;        // e.g., 7
+  * int  num_segments;     // e.g., 3
+  * int  *d_offsets;       // e.g., [0, 3, 3, 7]
+  * int  *d_key_buf;       // e.g., [8, 6, 7, 5, 3, 0, 9]
+  * int  *d_key_alt_buf;   // e.g., [-, -, -, -, -, -, -]
+  * ...
+  *
+  * // Create a DoubleBuffer to wrap the pair of device pointers
+  * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
+  *
+  * // Determine temporary device storage requirements
+  * void   *d_temp_storage = NULL;
+  * size_t temp_storage_bytes = 0;
+  * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys,
+  *     num_items, num_segments, d_offsets, d_offsets + 1);
+  *
+  * // Allocate temporary storage
+  * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+  *
+  * // Run sorting operation
+  * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys,
+  *     num_items, num_segments, d_offsets, d_offsets + 1);
+  *
+  * // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0]
+  *
+  * \endcode
+  *
+  * \tparam KeyT             [inferred] Key type
+  * \tparam OffsetIteratorT  [inferred] Random-access input iterator type for reading segment offsets \iterator
+  */
+ template <
+     typename             KeyT,
+     typename             OffsetIteratorT>
+ CUB_RUNTIME_FUNCTION
+ static cudaError_t SortKeysDescending(
+     void                 *d_temp_storage,     ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+     size_t               &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+     DoubleBuffer<KeyT>   &d_keys,             ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+     int                  num_items,           ///< [in] The total number of items to sort (across all segments)
+     int                  num_segments,        ///< [in] The number of segments that comprise the sorting data
+     OffsetIteratorT      d_begin_offsets,     ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_*
+     OffsetIteratorT      d_end_offsets,       ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*.  If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith segment is considered empty.
+     int                  begin_bit = 0,       ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison
+     int                  end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
+     cudaStream_t         stream = 0,          ///< [in] [optional] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+     bool                 debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
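+     // Note: mirrors the ascending DoubleBuffer overload above, but dispatches down
+     // the descending path; the trailing 'true' again permits overwriting both buffers.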
+ {
+     // Signed integer type for global offsets
+     typedef int OffsetT;
+
+     // Null value type
+     DoubleBuffer<NullType> d_values;
+
+     return DispatchSegmentedRadixSort<true, KeyT, NullType, OffsetIteratorT, OffsetT>::Dispatch(
+         d_temp_storage,
+         temp_storage_bytes,
+         d_keys,
+         d_values,
+         num_items,
+         num_segments,
+         d_begin_offsets,
+         d_end_offsets,
+         begin_bit,
+         end_bit,
+         true,
+         stream,
+         debug_synchronous);
+ }
+
+
+ //@}  end member group
+
+
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/SRC/cub/device/device_segmented_reduce.cuh b/SRC/cub/device/device_segmented_reduce.cuh
new file mode 100644
index 00000000..6c3b54a0
--- /dev/null
+++ b/SRC/cub/device/device_segmented_reduce.cuh
@@ -0,0 +1,619 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceSegmentedReduce provides device-wide, parallel operations for computing a batched reduction across multiple sequences of data items residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "../iterator/arg_index_input_iterator.cuh"
+#include "dispatch/dispatch_reduce.cuh"
+#include "dispatch/dispatch_reduce_by_key.cuh"
+#include "../util_type.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief DeviceSegmentedReduce provides device-wide, parallel operations for computing a reduction across multiple sequences of data items residing within device-accessible memory. ![](reduce_logo.png)
+ * \ingroup SegmentedModule
+ *
+ * \par Overview
+ * A reduction (or fold)
+ * uses a binary combining operator to compute a single aggregate from a sequence of input elements.
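+ * \par
+ * For example, reducing the segment [8, 6, 7] with cub::Sum() produces the single
+ * aggregate 21, while an empty segment simply produces the reduction's initial value.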
+ *
+ * \par Usage Considerations
+ * \cdp_class{DeviceSegmentedReduce}
+ *
+ */
+struct DeviceSegmentedReduce
+{
+    /**
+     * \brief Computes a device-wide segmented reduction using the specified binary \p reduction_op functor.
+     *
+     * \par
+     * - Does not support binary reduction operators that are non-commutative.
+     * - When the input is a contiguous sequence of segments, a single sequence
+     *   \p segment_offsets (of length num_segments+1) can be aliased
+     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+     *   the latter is specified as segment_offsets+1).
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates a custom min-reduction of a device vector of \p int data elements.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmented_reduce.cuh>
+     *
+     * // CustomMin functor
+     * struct CustomMin
+     * {
+     *     template <typename T>
+     *     CUB_RUNTIME_FUNCTION __forceinline__
+     *     T operator()(const T &a, const T &b) const {
+     *         return (b < a) ? b : a;
+     *     }
+     * };
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int          num_segments;   // e.g., 3
+     * int          *d_offsets;     // e.g., [0, 3, 3, 7]
+     * int          *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int          *d_out;         // e.g., [-, -, -]
+     * CustomMin    min_op;
+     * int          initial_value;  // e.g., INT_MAX
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void   *d_temp_storage = NULL;
+     * size_t temp_storage_bytes = 0;
+     * cub::DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out,
+     *     num_segments, d_offsets, d_offsets + 1, min_op, initial_value);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run reduction
+     * cub::DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out,
+     *     num_segments, d_offsets, d_offsets + 1, min_op, initial_value);
+     *
+     * // d_out <-- [6, INT_MAX, 0]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT    [inferred] Random-access input iterator type for reading input items \iterator
+     * \tparam OutputIteratorT   [inferred] Output iterator type for recording the reduced aggregate \iterator
+     * \tparam OffsetIteratorT   [inferred] Random-access input iterator type for reading segment offsets \iterator
+     * \tparam ReductionOp       [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b)
+     * \tparam T                 [inferred] Data element type that is convertible to the \p value type of \p InputIteratorT
+     */
+    template <
+        typename            InputIteratorT,
+        typename            OutputIteratorT,
+        typename            OffsetIteratorT,
+        typename            ReductionOp,
+        typename            T>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t Reduce(
+        void                *d_temp_storage,     ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT      d_in,                ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT     d_out,               ///< [out] Pointer to the output aggregate
+        int                 num_segments,        ///< [in] The number of segments that comprise the reduction data
+        OffsetIteratorT     d_begin_offsets,     ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in \p d_in
+        OffsetIteratorT     d_end_offsets,       ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in \p d_in.  If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith segment is considered empty.
+        ReductionOp         reduction_op,        ///< [in] Binary reduction functor
+        T                   initial_value,       ///< [in] Initial value of the reduction for each segment
+        cudaStream_t        stream = 0,          ///< [in] [optional] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        return DispatchSegmentedReduce<InputIteratorT, OutputIteratorT, OffsetIteratorT, OffsetT, ReductionOp>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_out,
+            num_segments,
+            d_begin_offsets,
+            d_end_offsets,
+            reduction_op,
+            initial_value,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Computes a device-wide segmented sum using the addition ('+') operator.
+     *
+     * \par
+     * - Uses \p 0 as the initial value of the reduction for each segment.
+     * - When the input is a contiguous sequence of segments, a single sequence
+     *   \p segment_offsets (of length num_segments+1) can be aliased
+     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+     *   the latter is specified as segment_offsets+1).
+     * - Does not support \p + operators that are non-commutative.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the sum reduction of a device vector of \p int data elements.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmented_reduce.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int  num_segments;   // e.g., 3
+     * int  *d_offsets;     // e.g., [0, 3, 3, 7]
+     * int  *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_out;         // e.g., [-, -, -]
+     * ...
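+     * // Note: segment 1 spans [d_offsets[1], d_offsets[2]) = [3, 3); it is empty,
+     * // so its sum below is the initial value 0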
+     *
+     * // Determine temporary device storage requirements
+     * void   *d_temp_storage = NULL;
+     * size_t temp_storage_bytes = 0;
+     * cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out,
+     *     num_segments, d_offsets, d_offsets + 1);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run sum-reduction
+     * cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out,
+     *     num_segments, d_offsets, d_offsets + 1);
+     *
+     * // d_out <-- [21, 0, 17]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT    [inferred] Random-access input iterator type for reading input items \iterator
+     * \tparam OutputIteratorT   [inferred] Output iterator type for recording the reduced aggregate \iterator
+     * \tparam OffsetIteratorT   [inferred] Random-access input iterator type for reading segment offsets \iterator
+     */
+    template <
+        typename            InputIteratorT,
+        typename            OutputIteratorT,
+        typename            OffsetIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t Sum(
+        void                *d_temp_storage,     ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT      d_in,                ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT     d_out,               ///< [out] Pointer to the output aggregate
+        int                 num_segments,        ///< [in] The number of segments that comprise the reduction data
+        OffsetIteratorT     d_begin_offsets,     ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in \p d_in
+        OffsetIteratorT     d_end_offsets,       ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in \p d_in.  If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith segment is considered empty.
+        cudaStream_t        stream = 0,          ///< [in] [optional] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        // The output value type
+        typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT = (if output iterator's value type is void) ?
+            typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
+            typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+        return DispatchSegmentedReduce<InputIteratorT, OutputIteratorT, OffsetIteratorT, OffsetT, cub::Sum>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_out,
+            num_segments,
+            d_begin_offsets,
+            d_end_offsets,
+            cub::Sum(),
+            OutputT(),            // zero-initialize
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Computes a device-wide segmented minimum using the less-than ('<') operator.
+     *
+     * \par
+     * - Uses std::numeric_limits<T>::max() as the initial value of the reduction for each segment.
+     * - When the input is a contiguous sequence of segments, a single sequence
+     *   \p segment_offsets (of length num_segments+1) can be aliased
+     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+     *   the latter is specified as segment_offsets+1).
+     * - Does not support \p < operators that are non-commutative.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the min-reduction of a device vector of \p int data elements.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmented_reduce.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int  num_segments;   // e.g., 3
+     * int  *d_offsets;     // e.g., [0, 3, 3, 7]
+     * int  *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_out;         // e.g., [-, -, -]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void   *d_temp_storage = NULL;
+     * size_t temp_storage_bytes = 0;
+     * cub::DeviceSegmentedReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out,
+     *     num_segments, d_offsets, d_offsets + 1);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run min-reduction
+     * cub::DeviceSegmentedReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out,
+     *     num_segments, d_offsets, d_offsets + 1);
+     *
+     * // d_out <-- [6, INT_MAX, 0]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT    [inferred] Random-access input iterator type for reading input items \iterator
+     * \tparam OutputIteratorT   [inferred] Output iterator type for recording the reduced aggregate \iterator
+     * \tparam OffsetIteratorT   [inferred] Random-access input iterator type for reading segment offsets \iterator
+     */
+    template <
+        typename            InputIteratorT,
+        typename            OutputIteratorT,
+        typename            OffsetIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t Min(
+        void                *d_temp_storage,     ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT      d_in,                ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT     d_out,               ///< [out] Pointer to the output aggregate
+        int                 num_segments,        ///< [in] The number of segments that comprise the reduction data
+        OffsetIteratorT     d_begin_offsets,     ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in \p d_in
+        OffsetIteratorT     d_end_offsets,       ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in \p d_in.  If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith segment is considered empty.
+        cudaStream_t        stream = 0,          ///< [in] [optional] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        // The input value type
+        typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+        return DispatchSegmentedReduce<InputIteratorT, OutputIteratorT, OffsetIteratorT, OffsetT, cub::Min>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_out,
+            num_segments,
+            d_begin_offsets,
+            d_end_offsets,
+            cub::Min(),
+            Traits<InputT>::Max(),    // replace with std::numeric_limits<T>::max() when C++11 support is more prevalent
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Finds the first device-wide minimum in each segment using the less-than ('<') operator, also returning the in-segment index of that item.
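+     *
+     * \par
+     * For example, the segment [8, 6, 7] yields the pair {1, 6}: the minimum value 6
+     * sits at in-segment offset 1.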
+     *
+     * \par
+     * - The output value type of \p d_out is cub::KeyValuePair<int, T> (assuming the value type of \p d_in is \p T)
+     *   - The minimum of the ith segment is written to d_out[i].value and its offset in that segment is written to d_out[i].key.
+     *   - The {1, std::numeric_limits<T>::max()} tuple is produced for zero-length inputs
+     * - When the input is a contiguous sequence of segments, a single sequence
+     *   \p segment_offsets (of length num_segments+1) can be aliased
+     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+     *   the latter is specified as segment_offsets+1).
+     * - Does not support \p < operators that are non-commutative.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the argmin-reduction of a device vector of \p int data elements.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmented_reduce.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int                      num_segments;   // e.g., 3
+     * int                      *d_offsets;     // e.g., [0, 3, 3, 7]
+     * int                      *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * KeyValuePair<int, int>   *d_out;         // e.g., [{-,-}, {-,-}, {-,-}]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void   *d_temp_storage = NULL;
+     * size_t temp_storage_bytes = 0;
+     * cub::DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out,
+     *     num_segments, d_offsets, d_offsets + 1);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run argmin-reduction
+     * cub::DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out,
+     *     num_segments, d_offsets, d_offsets + 1);
+     *
+     * // d_out <-- [{1,6}, {1,INT_MAX}, {2,0}]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT    [inferred] Random-access input iterator type for reading input items (of some type \p T) \iterator
+     * \tparam OutputIteratorT   [inferred] Output iterator type for recording the reduced aggregate (having value type KeyValuePair<int, T>) \iterator
+     * \tparam OffsetIteratorT   [inferred] Random-access input iterator type for reading segment offsets \iterator
+     */
+    template <
+        typename            InputIteratorT,
+        typename            OutputIteratorT,
+        typename            OffsetIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t ArgMin(
+        void                *d_temp_storage,     ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT      d_in,                ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT     d_out,               ///< [out] Pointer to the output aggregate
+        int                 num_segments,        ///< [in] The number of segments that comprise the reduction data
+        OffsetIteratorT     d_begin_offsets,     ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in \p d_in
+        OffsetIteratorT     d_end_offsets,       ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in \p d_in.  If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith segment is considered empty.
+        cudaStream_t        stream = 0,          ///< [in] [optional] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors.
Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        // The input type
+        typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;
+
+        // The output tuple type
+        typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT = (if output iterator's value type is void) ?
+            KeyValuePair<OffsetT, InputValueT>,                                                                 // ... then the key value pair OffsetT + InputValueT
+            typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputTupleT;                     // ... else the output iterator's value type
+
+        // The output value type
+        typedef typename OutputTupleT::Value OutputValueT;
+
+        // Wrapped input iterator to produce index-value tuples
+        typedef ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT> ArgIndexInputIteratorT;
+        ArgIndexInputIteratorT d_indexed_in(d_in);
+
+        // Initial value
+        OutputTupleT initial_value(1, Traits<InputValueT>::Max());   // replace with std::numeric_limits<T>::max() when C++11 support is more prevalent
+
+        return DispatchSegmentedReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetIteratorT, OffsetT, cub::ArgMin>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_indexed_in,
+            d_out,
+            num_segments,
+            d_begin_offsets,
+            d_end_offsets,
+            cub::ArgMin(),
+            initial_value,
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Computes a device-wide segmented maximum using the greater-than ('>') operator.
+     *
+     * \par
+     * - Uses std::numeric_limits<T>::lowest() as the initial value of the reduction.
+     * - When the input is a contiguous sequence of segments, a single sequence
+     *   \p segment_offsets (of length num_segments+1) can be aliased
+     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+     *   the latter is specified as segment_offsets+1).
+     * - Does not support \p > operators that are non-commutative.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the max-reduction of a device vector of \p int data elements.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmented_reduce.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int  num_segments;   // e.g., 3
+     * int  *d_offsets;     // e.g., [0, 3, 3, 7]
+     * int  *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * int  *d_out;         // e.g., [-, -, -]
+     * ...
+     *
+     * // Determine temporary device storage requirements
+     * void   *d_temp_storage = NULL;
+     * size_t temp_storage_bytes = 0;
+     * cub::DeviceSegmentedReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out,
+     *     num_segments, d_offsets, d_offsets + 1);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run max-reduction
+     * cub::DeviceSegmentedReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out,
+     *     num_segments, d_offsets, d_offsets + 1);
+     *
+     * // d_out <-- [8, INT_MIN, 9]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT    [inferred] Random-access input iterator type for reading input items \iterator
+     * \tparam OutputIteratorT   [inferred] Output iterator type for recording the reduced aggregate \iterator
+     * \tparam OffsetIteratorT   [inferred] Random-access input iterator type for reading segment offsets \iterator
+     */
+    template <
+        typename            InputIteratorT,
+        typename            OutputIteratorT,
+        typename            OffsetIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t Max(
+        void                *d_temp_storage,     ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT      d_in,                ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT     d_out,               ///< [out] Pointer to the output aggregate
+        int                 num_segments,        ///< [in] The number of segments that comprise the reduction data
+        OffsetIteratorT     d_begin_offsets,     ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in \p d_in
+        OffsetIteratorT     d_end_offsets,       ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in \p d_in.  If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith segment is considered empty.
+        cudaStream_t        stream = 0,          ///< [in] [optional] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        // The input value type
+        typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
+
+        return DispatchSegmentedReduce<InputIteratorT, OutputIteratorT, OffsetIteratorT, OffsetT, cub::Max>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_in,
+            d_out,
+            num_segments,
+            d_begin_offsets,
+            d_end_offsets,
+            cub::Max(),
+            Traits<InputT>::Lowest(),    // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent
+            stream,
+            debug_synchronous);
+    }
+
+
+    /**
+     * \brief Finds the first device-wide maximum in each segment using the greater-than ('>') operator, also returning the in-segment index of that item
+     *
+     * \par
+     * - The output value type of \p d_out is cub::KeyValuePair<int, T> (assuming the value type of \p d_in is \p T)
+     *   - The maximum of the ith segment is written to d_out[i].value and its offset in that segment is written to d_out[i].key.
+     *   - The {1, std::numeric_limits<T>::lowest()} tuple is produced for zero-length inputs
+     * - When the input is a contiguous sequence of segments, a single sequence
+     *   \p segment_offsets (of length num_segments+1) can be aliased
+     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
+     *   the latter is specified as segment_offsets+1).
+     * - Does not support \p > operators that are non-commutative.
+     * - \devicestorage
+     *
+     * \par Snippet
+     * The code snippet below illustrates the argmax-reduction of a device vector of \p int data elements.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_segmented_reduce.cuh>
+     *
+     * // Declare, allocate, and initialize device-accessible pointers for input and output
+     * int                      num_segments;   // e.g., 3
+     * int                      *d_offsets;     // e.g., [0, 3, 3, 7]
+     * int                      *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
+     * KeyValuePair<int, int>   *d_out;         // e.g., [{-,-}, {-,-}, {-,-}]
+     * ...
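+     * // Note: each d_out[i] will receive a {key, value} pair whose key is the
+     * // in-segment offset of the maximum and whose value is the maximum itself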
+     *
+     * // Determine temporary device storage requirements
+     * void   *d_temp_storage = NULL;
+     * size_t temp_storage_bytes = 0;
+     * cub::DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out,
+     *     num_segments, d_offsets, d_offsets + 1);
+     *
+     * // Allocate temporary storage
+     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
+     *
+     * // Run argmax-reduction
+     * cub::DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out,
+     *     num_segments, d_offsets, d_offsets + 1);
+     *
+     * // d_out <-- [{0,8}, {1,INT_MIN}, {3,9}]
+     *
+     * \endcode
+     *
+     * \tparam InputIteratorT    [inferred] Random-access input iterator type for reading input items (of some type \p T) \iterator
+     * \tparam OutputIteratorT   [inferred] Output iterator type for recording the reduced aggregate (having value type KeyValuePair<int, T>) \iterator
+     * \tparam OffsetIteratorT   [inferred] Random-access input iterator type for reading segment offsets \iterator
+     */
+    template <
+        typename            InputIteratorT,
+        typename            OutputIteratorT,
+        typename            OffsetIteratorT>
+    CUB_RUNTIME_FUNCTION
+    static cudaError_t ArgMax(
+        void                *d_temp_storage,     ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t              &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT      d_in,                ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT     d_out,               ///< [out] Pointer to the output aggregate
+        int                 num_segments,        ///< [in] The number of segments that comprise the reduction data
+        OffsetIteratorT     d_begin_offsets,     ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in \p d_in
+        OffsetIteratorT     d_end_offsets,       ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in \p d_in.  If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith segment is considered empty.
+        cudaStream_t        stream = 0,          ///< [in] [optional] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
+        bool                debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    {
+        // Signed integer type for global offsets
+        typedef int OffsetT;
+
+        // The input type
+        typedef typename std::iterator_traits<InputIteratorT>::value_type InputValueT;
+
+        // The output tuple type
+        typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT = (if output iterator's value type is void) ?
+            KeyValuePair<OffsetT, InputValueT>,                                                                 // ... then the key value pair OffsetT + InputValueT
+            typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputTupleT;                     // ... else the output iterator's value type
+
+        // The output value type
+        typedef typename OutputTupleT::Value OutputValueT;
+
+        // Wrapped input iterator to produce index-value tuples
+        typedef ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT> ArgIndexInputIteratorT;
+        ArgIndexInputIteratorT d_indexed_in(d_in);
+
+        // Initial value
+        OutputTupleT initial_value(1, Traits<InputValueT>::Lowest());   // replace with std::numeric_limits<T>::lowest() when C++11 support is more prevalent
+
+        return DispatchSegmentedReduce<ArgIndexInputIteratorT, OutputIteratorT, OffsetIteratorT, OffsetT, cub::ArgMax>::Dispatch(
+            d_temp_storage,
+            temp_storage_bytes,
+            d_indexed_in,
+            d_out,
+            num_segments,
+            d_begin_offsets,
+            d_end_offsets,
+            cub::ArgMax(),
+            initial_value,
+            stream,
+            debug_synchronous);
+    }
+
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/SRC/cub/device/device_select.cuh b/SRC/cub/device/device_select.cuh
index 8357c59d..52a3e126 100644
--- a/SRC/cub/device/device_select.cuh
+++ b/SRC/cub/device/device_select.cuh
@@ -1,372 +1,369 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DeviceSelect provides device-wide, parallel operations for selecting items from sequences of data items residing within global memory.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-
-#include "dispatch/device_select_dispatch.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \brief DeviceSelect provides device-wide, parallel operations for compacting selected items from sequences of data items residing within global memory. ![](select_logo.png)
- * \ingroup DeviceModule
- *
- * \par Overview
- * These operations apply a selection criterion to selectively copy
- * items from a specified input sequence to a compact output sequence.
- *
- * \par Usage Considerations
- * \cdp_class{DeviceSelect}
- *
- * \par Performance
- * \linear_performance{select-flagged, select-if, and select-unique}
- *
- * \par
- * The following chart illustrates DeviceSelect::If
- * performance across different CUDA architectures for \p int32 items,
- * where 50% of the items are randomly selected.
- *
- * \image html select_if_int32_50_percent.png
- *
- * \par
- * The following chart illustrates DeviceSelect::Unique
- * performance across different CUDA architectures for \p int32 items
- * where segments have lengths uniformly sampled from [1,1000].
- *
- * \image html select_unique_int32_len_500.png
- *
- * \par
- * \plots_below
- *
- */
-struct DeviceSelect
-{
-    /**
-     * \brief Uses the \p d_flags sequence to selectively copy the corresponding items from \p d_in into \p d_out.  The total number of items selected is written to \p d_num_selected. ![](select_flags_logo.png)
-     *
-     * \par
-     * - The value type of \p d_flags must be castable to \p bool (e.g., \p bool, \p char, \p int, etc.).
-     * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering.
-     * - \devicestorage
-     * - \cdp
-     *
-     * \par Snippet
-     * The code snippet below illustrates the compaction of items selected from an \p int device vector.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_select.cuh>
-     *
-     * // Declare, allocate, and initialize device pointers for input, flags, and output
-     * int  num_items;        // e.g., 8
-     * int  *d_in;            // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
-     * char *d_flags;         // e.g., [1, 0, 0, 1, 0, 1, 1, 0]
-     * int  *d_out;           // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
-     * int  *d_num_selected;  // e.g., [ ]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void   *d_temp_storage = NULL;
-     * size_t temp_storage_bytes = 0;
-     * cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run selection
-     * cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected, num_items);
-     *
-     * // d_out            <-- [1, 4, 6, 7]
-     * // d_num_selected   <-- [4]
-     *
-     * \endcode
-     *
-     * \tparam InputIterator        [inferred] Random-access input iterator type for reading input items \iterator
-     * \tparam FlagIterator         [inferred] Random-access input iterator type for reading selection flags \iterator
-     * \tparam OutputIterator       [inferred] Random-access output iterator type for writing selected items \iterator
-     * \tparam NumSelectedIterator  [inferred] Output iterator type for recording the number of items selected \iterator
-     */
-    template <
-        typename            InputIterator,
-        typename            FlagIterator,
-        typename            OutputIterator,
-        typename            NumSelectedIterator>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Flagged(
-        void                *d_temp_storage,     ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIterator       d_in,                ///< [in] Pointer to the input sequence of data items
-        FlagIterator        d_flags,             ///< [in] Pointer to the input sequence of selection flags
-        OutputIterator      d_out,               ///< [out] Pointer to the output sequence of selected data items
-        NumSelectedIterator d_num_selected,      ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out)
-        int                 num_items,           ///< [in] Total number of input items (i.e., length of \p d_in)
-        cudaStream_t        stream = 0,          ///< [in] [optional] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        typedef int         Offset;         // Signed integer type for global offsets
-        typedef NullType    SelectOp;       // Selection op (not used)
-        typedef NullType    EqualityOp;     // Equality operator (not used)
-
-        return DeviceSelectDispatch<InputIterator, FlagIterator, OutputIterator, NumSelectedIterator, SelectOp, EqualityOp, Offset, false>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            d_flags,
-            d_out,
-            d_num_selected,
-            SelectOp(),
-            EqualityOp(),
-            num_items,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Uses the \p select_op functor to selectively copy items from \p d_in into \p d_out.  The total number of items selected is written to \p d_num_selected. ![](select_logo.png)
-     *
-     * \par
-     * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering.
-     * - \devicestorage
-     * - \cdp
-     *
-     * \par Performance
-     * The following charts illustrate saturated select-if performance across different
-     * CUDA architectures for \p int32 and \p int64 items, respectively.  Items are
-     * selected with 50% probability.
-     *
-     * \image html select_if_int32_50_percent.png
-     * \image html select_if_int64_50_percent.png
-     *
-     * \par
-     * The following charts are similar, but 5% selection probability:
-     *
-     * \image html select_if_int32_5_percent.png
-     * \image html select_if_int64_5_percent.png
-     *
-     * \par Snippet
-     * The code snippet below illustrates the compaction of items selected from an \p int device vector.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_select.cuh>
-     *
-     * // Functor type for selecting values less than some criteria
-     * struct LessThan
-     * {
-     *     int compare;
-     *
-     *     CUB_RUNTIME_FUNCTION __forceinline__
-     *     LessThan(int compare) : compare(compare) {}
-     *
-     *     CUB_RUNTIME_FUNCTION __forceinline__
-     *     bool operator()(const int &a) const {
-     *         return (a < compare);
-     *     }
-     * };
-     *
-     * // Declare, allocate, and initialize device pointers for input and output
-     * int      num_items;        // e.g., 8
-     * int      *d_in;            // e.g., [0, 2, 3, 9, 5, 2, 81, 8]
-     * int      *d_out;           // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
-     * int      *d_num_selected;  // e.g., [ ]
-     * LessThan select_op(7);
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void   *d_temp_storage = NULL;
-     * size_t temp_storage_bytes = 0;
-     * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected, num_items, select_op);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run selection
-     * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected, num_items, select_op);
-     *
-     * // d_out            <-- [0, 2, 3, 5, 2]
-     * // d_num_selected   <-- [5]
-     *
-     * \endcode
-     *
-     * \tparam InputIterator        [inferred] Random-access input iterator type for reading input items \iterator
-     * \tparam OutputIterator       [inferred] Random-access output iterator type for writing selected items \iterator
-     * \tparam NumSelectedIterator  [inferred] Output iterator type for recording the number of items selected \iterator
-     * \tparam SelectOp             [inferred] Selection operator type having member bool operator()(const T &a)
-     */
-    template <
-        typename            InputIterator,
-        typename            OutputIterator,
-        typename            NumSelectedIterator,
-        typename            SelectOp>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t If(
-        void                *d_temp_storage,     ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIterator       d_in,                ///< [in] Pointer to the input sequence of data items
-        OutputIterator      d_out,               ///< [out] Pointer to the output sequence of selected data items
-        NumSelectedIterator d_num_selected,      ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out)
-        int                 num_items,           ///< [in] Total number of input items (i.e., length of \p d_in)
-        SelectOp            select_op,           ///< [in] Unary selection operator
-        cudaStream_t        stream = 0,          ///< [in] [optional] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        typedef int         Offset;         // Signed integer type for global offsets
-        typedef NullType*   FlagIterator;   // Flag iterator type (not used)
-        typedef NullType    EqualityOp;     // Equality operator (not used)
-
-        return DeviceSelectDispatch<InputIterator, FlagIterator, OutputIterator, NumSelectedIterator, SelectOp, EqualityOp, Offset, false>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            NULL,
-            d_out,
-            d_num_selected,
-            select_op,
-            EqualityOp(),
-            num_items,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Given an input sequence \p d_in having runs of consecutive equal-valued keys, only the first key from each run is selectively copied to \p d_out.  The total number of items selected is written to \p d_num_selected. ![](unique_logo.png)
-     *
-     * \par
-     * - The == equality operator is used to determine whether keys are equivalent
-     * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering.
-     * - \devicestorage
-     * - \cdp
-     *
-     * \par Performance
-     * The following charts illustrate saturated select-unique performance across different
-     * CUDA architectures for \p int32 and \p int64 items, respectively.  Segments have
-     * lengths uniformly sampled from [1,1000].
- * - * \image html select_unique_int32_len_500.png - * \image html select_unique_int64_len_500.png - * - * \par - * The following charts are similar, but with segment lengths uniformly sampled from [1,10]: - * - * \image html select_unique_int32_len_5.png - * \image html select_unique_int64_len_5.png - * - * \par Snippet - * The code snippet below illustrates the compaction of items selected from an \p int device vector. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device pointers for input and output - * int num_items; // e.g., 8 - * int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] - * int *d_out; // e.g., [ , , , , , , , ] - * int *d_num_selected; // e.g., [ ] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run selection - * cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected, num_items); - * - * // d_out <-- [0, 2, 9, 5, 8] - * // d_num_selected <-- [5] - * - * \endcode - * - * \tparam InputIterator [inferred] Random-access input iterator type for reading input items \iterator - * \tparam OutputIterator [inferred] Random-access output iterator type for writing selected items \iterator - * \tparam NumSelectedIterator [inferred] Output iterator type for recording the number of items selected \iterator - */ - template < - typename InputIterator, - typename OutputIterator, - typename NumSelectedIterator> - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Unique( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIterator d_in, ///< [in] Pointer to the input sequence of data items - OutputIterator d_out, ///< [out] Pointer to the output sequence of selected data items - NumSelectedIterator d_num_selected, ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out) - int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - typedef int Offset; // Signed integer type for global offsets - typedef NullType* FlagIterator; // Flag iterator type (not used) - typedef NullType SelectOp; // Selection op (not used) - typedef Equality EqualityOp; // Default == operator - - return DeviceSelectDispatch::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - NULL, - d_out, - d_num_selected, - SelectOp(), - EqualityOp(), - num_items, - stream, - debug_synchronous); - } - -}; - -/** - * \example example_device_select_flagged.cu - * \example example_device_select_if.cu - * \example example_device_select_unique.cu - */ - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. 
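For reference while comparing the removed and re-vendored versions of this header: the Unique snippet above elides host setup and cleanup. A minimal end-to-end driver is sketched below under the old API's parameter names; it assumes CUB is on the include path, and names such as h_in are illustrative only.

#include <cub/device/device_select.cuh>
#include <cuda_runtime.h>
#include <cstdio>

int main()
{
    // Host input with runs of consecutive equal keys (from the snippet above)
    int h_in[8] = {0, 2, 2, 9, 5, 5, 5, 8};
    int num_items = 8;

    // Device allocations
    int *d_in, *d_out, *d_num_selected;
    cudaMalloc(&d_in, num_items * sizeof(int));
    cudaMalloc(&d_out, num_items * sizeof(int));
    cudaMalloc(&d_num_selected, sizeof(int));
    cudaMemcpy(d_in, h_in, num_items * sizeof(int), cudaMemcpyHostToDevice);

    // First call: query temporary storage size (d_temp_storage == NULL)
    void *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes,
                              d_in, d_out, d_num_selected, num_items);

    // Second call: run the selection
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes,
                              d_in, d_out, d_num_selected, num_items);

    // Retrieve the compacted result: expect [0, 2, 9, 5, 8]
    int h_num = 0;
    cudaMemcpy(&h_num, d_num_selected, sizeof(int), cudaMemcpyDeviceToHost);
    int h_out[8];
    cudaMemcpy(h_out, d_out, h_num * sizeof(int), cudaMemcpyDeviceToHost);
    for (int i = 0; i < h_num; ++i) printf("%d ", h_out[i]);
    printf("(%d selected)\n", h_num);

    cudaFree(d_in); cudaFree(d_out);
    cudaFree(d_num_selected); cudaFree(d_temp_storage);
    return 0;
}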
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceSelect provides device-wide, parallel operations for compacting selected items from sequences of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "dispatch/dispatch_select_if.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceSelect provides device-wide, parallel operations for compacting selected items from sequences of data items residing within device-accessible memory. ![](select_logo.png) + * \ingroup SingleModule + * + * \par Overview + * These operations apply a selection criterion to selectively copy + * items from a specified input sequence to a compact output sequence. + * + * \par Usage Considerations + * \cdp_class{DeviceSelect} + * + * \par Performance + * \linear_performance{select-flagged, select-if, and select-unique} + * + * \par + * The following chart illustrates DeviceSelect::If + * performance across different CUDA architectures for \p int32 items, + * where 50% of the items are randomly selected. + * + * \image html select_if_int32_50_percent.png + * + * \par + * The following chart illustrates DeviceSelect::Unique + * performance across different CUDA architectures for \p int32 items + * where segments have lengths uniformly sampled from [1,1000]. + * + * \image html select_unique_int32_len_500.png + * + * \par + * \plots_below + * + */ +struct DeviceSelect +{ + /** + * \brief Uses the \p d_flags sequence to selectively copy the corresponding items from \p d_in into \p d_out. The total number of items selected is written to \p d_num_selected_out. ![](select_flags_logo.png) + * + * \par + * - The value type of \p d_flags must be castable to \p bool (e.g., \p bool, \p char, \p int, etc.). 
+ * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the compaction of items selected from an \p int device vector. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input, flags, and output + * int num_items; // e.g., 8 + * int *d_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] + * char *d_flags; // e.g., [1, 0, 0, 1, 0, 1, 1, 0] + * int *d_out; // e.g., [ , , , , , , , ] + * int *d_num_selected_out; // e.g., [ ] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run selection + * cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items); + * + * // d_out <-- [1, 4, 6, 7] + * // d_num_selected_out <-- [4] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam FlagIterator [inferred] Random-access input iterator type for reading selection flags \iterator + * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing selected items \iterator + * \tparam NumSelectedIteratorT [inferred] Output iterator type for recording the number of items selected \iterator + */ + template < + typename InputIteratorT, + typename FlagIterator, + typename OutputIteratorT, + typename NumSelectedIteratorT> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Flagged( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + FlagIterator d_flags, ///< [in] Pointer to the input sequence of selection flags + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of selected data items + NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out) + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + typedef int OffsetT; // Signed integer type for global offsets + typedef NullType SelectOp; // Selection op (not used) + typedef NullType EqualityOp; // Equality operator (not used) + + return DispatchSelectIf::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_flags, + d_out, + d_num_selected_out, + SelectOp(), + EqualityOp(), + num_items, + stream, + debug_synchronous); + } + + + /** + * \brief Uses the \p select_op functor to selectively copy items from \p d_in into \p d_out. The total number of items selected is written to \p d_num_selected_out. 
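The Flagged implementation just shown follows CUB's two-phase convention: a call with a NULL d_temp_storage writes the required byte count into temp_storage_bytes and does no work, and a second call with allocated storage runs the selection. A small checked wrapper for that convention is sketched here; the CUDA_CHECK macro is illustrative and not part of CUB.

#include <cub/device/device_select.cuh>
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

// Illustrative error-checking macro (not part of CUB)
#define CUDA_CHECK(call)                                       \
    do {                                                       \
        cudaError_t err_ = (call);                             \
        if (err_ != cudaSuccess) {                             \
            fprintf(stderr, "%s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_));                 \
            exit(1);                                           \
        }                                                      \
    } while (0)

// Runs DeviceSelect::Flagged using the size-query/run call pair.
void flagged_checked(const int *d_in, const char *d_flags, int *d_out,
                     int *d_num_selected_out, int num_items)
{
    void *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;

    // Phase 1: d_temp_storage == NULL, so only the size is computed
    CUDA_CHECK(cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes,
                                          d_in, d_flags, d_out,
                                          d_num_selected_out, num_items));

    // Phase 2: allocate the blob and run the selection
    CUDA_CHECK(cudaMalloc(&d_temp_storage, temp_storage_bytes));
    CUDA_CHECK(cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes,
                                          d_in, d_flags, d_out,
                                          d_num_selected_out, num_items));
    CUDA_CHECK(cudaFree(d_temp_storage));
}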
![](select_logo.png) + * + * \par + * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering. + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated select-if performance across different + * CUDA architectures for \p int32 and \p int64 items, respectively. Items are + * selected with 50% probability. + * + * \image html select_if_int32_50_percent.png + * \image html select_if_int64_50_percent.png + * + * \par + * The following charts are similar, but 5% selection probability: + * + * \image html select_if_int32_5_percent.png + * \image html select_if_int64_5_percent.png + * + * \par Snippet + * The code snippet below illustrates the compaction of items selected from an \p int device vector. + * \par + * \code + * #include // or equivalently + * + * // Functor type for selecting values less than some criteria + * struct LessThan + * { + * int compare; + * + * CUB_RUNTIME_FUNCTION __forceinline__ + * LessThan(int compare) : compare(compare) {} + * + * CUB_RUNTIME_FUNCTION __forceinline__ + * bool operator()(const int &a) const { + * return (a < compare); + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 8 + * int *d_in; // e.g., [0, 2, 3, 9, 5, 2, 81, 8] + * int *d_out; // e.g., [ , , , , , , , ] + * int *d_num_selected_out; // e.g., [ ] + * LessThan select_op(7); + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run selection + * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op); + * + * // d_out <-- [0, 2, 3, 5, 2] + * // d_num_selected_out <-- [5] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing selected items \iterator + * \tparam NumSelectedIteratorT [inferred] Output iterator type for recording the number of items selected \iterator + * \tparam SelectOp [inferred] Selection operator type having member bool operator()(const T &a) + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename NumSelectedIteratorT, + typename SelectOp> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t If( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of selected data items + NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out) + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + SelectOp select_op, ///< [in] Unary selection operator + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. 
+ bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + typedef int OffsetT; // Signed integer type for global offsets + typedef NullType* FlagIterator; // FlagT iterator type (not used) + typedef NullType EqualityOp; // Equality operator (not used) + + return DispatchSelectIf::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + NULL, + d_out, + d_num_selected_out, + select_op, + EqualityOp(), + num_items, + stream, + debug_synchronous); + } + + + /** + * \brief Given an input sequence \p d_in having runs of consecutive equal-valued keys, only the first key from each run is selectively copied to \p d_out. The total number of items selected is written to \p d_num_selected_out. ![](unique_logo.png) + * + * \par + * - The == equality operator is used to determine whether keys are equivalent + * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering. + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated select-unique performance across different + * CUDA architectures for \p int32 and \p int64 items, respectively. Segments have + * lengths uniformly sampled from [1,1000]. + * + * \image html select_unique_int32_len_500.png + * \image html select_unique_int64_len_500.png + * + * \par + * The following charts are similar, but with segment lengths uniformly sampled from [1,10]: + * + * \image html select_unique_int32_len_5.png + * \image html select_unique_int64_len_5.png + * + * \par Snippet + * The code snippet below illustrates the compaction of items selected from an \p int device vector. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 8 + * int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] + * int *d_out; // e.g., [ , , , , , , , ] + * int *d_num_selected_out; // e.g., [ ] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run selection + * cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items); + * + * // d_out <-- [0, 2, 9, 5, 8] + * // d_num_selected_out <-- [5] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing selected items \iterator + * \tparam NumSelectedIteratorT [inferred] Output iterator type for recording the number of items selected \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename NumSelectedIteratorT> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Unique( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
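Because SelectOp in the If overload above is an inferred template parameter, a device lambda can stand in for the LessThan functor from its snippet. This is a sketch only, and assumes nvcc is invoked with --extended-lambda (CUDA 8 or later); on older toolchains the explicit functor form is required.

#include <cub/device/device_select.cuh>

// Compile with: nvcc --extended-lambda ...  (assumption, CUDA 8+)
void select_less_than_7(int *d_in, int *d_out, int *d_num_selected_out,
                        int num_items, void *d_temp_storage,
                        size_t &temp_storage_bytes)
{
    // A device lambda replacing the LessThan functor from the snippet above
    auto select_op = [] __device__ (const int &a) { return a < 7; };

    cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes,
                          d_in, d_out, d_num_selected_out,
                          num_items, select_op);
}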
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of selected data items + NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out) + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + typedef int OffsetT; // Signed integer type for global offsets + typedef NullType* FlagIterator; // FlagT iterator type (not used) + typedef NullType SelectOp; // Selection op (not used) + typedef Equality EqualityOp; // Default == operator + + return DispatchSelectIf::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + NULL, + d_out, + d_num_selected_out, + SelectOp(), + EqualityOp(), + num_items, + stream, + debug_synchronous); + } + +}; + +/** + * \example example_device_select_flagged.cu + * \example example_device_select_if.cu + * \example example_device_select_unique.cu + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/SRC/cub/device/device_spmv.cuh b/SRC/cub/device/device_spmv.cuh new file mode 100644 index 00000000..63b6a7e8 --- /dev/null +++ b/SRC/cub/device/device_spmv.cuh @@ -0,0 +1,174 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
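The Unique semantics above (keep only the first key of each run of consecutive equal keys) match std::unique on the host, which makes a CPU reference check straightforward; a minimal validation sketch:

#include <algorithm>
#include <vector>

// Host reference for DeviceSelect::Unique: compact consecutive duplicates.
std::vector<int> unique_reference(std::vector<int> v)
{
    v.erase(std::unique(v.begin(), v.end()), v.end());
    return v;   // [0,2,2,9,5,5,5,8] -> [0,2,9,5,8], as in the snippet above
}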
+ * + ******************************************************************************/ + +/** + * \file + * cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector multiplication (SpMV). + */ + +#pragma once + +#include +#include +#include + +#include "dispatch/dispatch_spmv_orig.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * dense-vector multiplication (SpMV). + * \ingroup SingleModule + * + * \par Overview + * The [SpMV computation](http://en.wikipedia.org/wiki/Sparse_matrix-vector_multiplication) + * performs the matrix-vector operation + * y = alpha*A*x + beta*y, + * where: + * - A is an mxn sparse matrix whose non-zero structure is specified in + * [compressed-storage-row (CSR) format](http://en.wikipedia.org/wiki/Sparse_matrix#Compressed_row_Storage_.28CRS_or_CSR.29) + * (i.e., three arrays: values, row_offsets, and column_indices) + * - x and y are dense vectors + * - alpha and beta are scalar multiplicands + * + * \par Usage Considerations + * \cdp_class{DeviceSpmv} + * + */ +struct DeviceSpmv +{ + /******************************************************************//** + * \name CSR matrix operations + *********************************************************************/ + //@{ + + /** + * \brief This function performs the matrix-vector operation y = A*x. + * + * \par Snippet + * The code snippet below illustrates SpMV upon a 9x9 CSR matrix A + * representing a 3x3 lattice (24 non-zeros). + * + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input matrix A, input vector x, + * // and output vector y + * int num_rows = 9; + * int num_cols = 9; + * int num_nonzeros = 24; + * + * float* d_values; // e.g., [1, 1, 1, 1, 1, 1, 1, 1, + * // 1, 1, 1, 1, 1, 1, 1, 1, + * // 1, 1, 1, 1, 1, 1, 1, 1] + * + * int* d_column_indices; // e.g., [1, 3, 0, 2, 4, 1, 5, 0, + * // 4, 6, 1, 3, 5, 7, 2, 4, + * // 8, 3, 7, 4, 6, 8, 5, 7] + * + * int* d_row_offsets; // e.g., [0, 2, 5, 7, 10, 14, 17, 19, 22, 24] + * + * float* d_vector_x; // e.g., [1, 1, 1, 1, 1, 1, 1, 1, 1] + * float* d_vector_y; // e.g., [ , , , , , , , , ] + * ... + * + * // Determine temporary device storage requirements + * void* d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values, + * d_row_offsets, d_column_indices, d_vector_x, d_vector_y, + * num_rows, num_cols, num_nonzeros, alpha, beta); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run SpMV + * cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values, + * d_row_offsets, d_column_indices, d_vector_x, d_vector_y, + * num_rows, num_cols, num_nonzeros, alpha, beta); + * + * // d_vector_y <-- [2, 3, 2, 3, 4, 3, 2, 3, 2] + * + * \endcode + * + * \tparam ValueT [inferred] Matrix and vector value type (e.g., /p float, /p double, etc.) + */ + template < + typename ValueT> + CUB_RUNTIME_FUNCTION + static cudaError_t CsrMV( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + ValueT* d_values, ///< [in] Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix A. + int* d_row_offsets, ///< [in] Pointer to the array of \p m + 1 offsets demarcating the start of every row in \p d_column_indices and \p d_values (with the final entry being equal to \p num_nonzeros) + int* d_column_indices, ///< [in] Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix A. (Indices are zero-valued.) + ValueT* d_vector_x, ///< [in] Pointer to the array of \p num_cols values corresponding to the dense input vector x + ValueT* d_vector_y, ///< [out] Pointer to the array of \p num_rows values corresponding to the dense output vector y + int num_rows, ///< [in] number of rows of matrix A. + int num_cols, ///< [in] number of columns of matrix A. + int num_nonzeros, ///< [in] number of nonzero elements of matrix A. + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + SpmvParams spmv_params; + spmv_params.d_values = d_values; + spmv_params.d_row_end_offsets = d_row_offsets + 1; + spmv_params.d_column_indices = d_column_indices; + spmv_params.d_vector_x = d_vector_x; + spmv_params.d_vector_y = d_vector_y; + spmv_params.num_rows = num_rows; + spmv_params.num_cols = num_cols; + spmv_params.num_nonzeros = num_nonzeros; + spmv_params.alpha = 1.0; + spmv_params.beta = 0.0; + + return DispatchSpmv::Dispatch( + d_temp_storage, + temp_storage_bytes, + spmv_params, + stream, + debug_synchronous); + } + + //@} end member group +}; + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/SRC/cub/device/dispatch/device_histogram_dispatch.cuh b/SRC/cub/device/dispatch/device_histogram_dispatch.cuh deleted file mode 100644 index 1c2d1b36..00000000 --- a/SRC/cub/device/dispatch/device_histogram_dispatch.cuh +++ /dev/null @@ -1,554 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
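One detail worth noting in CsrMV above: the Doxygen snippet passes alpha and beta, but this overload accepts neither and hard-codes alpha = 1.0 and beta = 0.0 in its body, so it computes y = A*x only. A self-contained driver for a small CSR matrix follows; names are illustrative and CUB is assumed to be on the include path.

#include <cub/device/device_spmv.cuh>
#include <cuda_runtime.h>
#include <cstdio>

int main()
{
    // CSR encoding of [ 1 0 2 ; 0 3 0 ; 4 0 5 ]: row r occupies
    // [row_offsets[r], row_offsets[r+1]) of values/column_indices.
    float h_values[]         = {1, 2, 3, 4, 5};
    int   h_column_indices[] = {0, 2, 1, 0, 2};
    int   h_row_offsets[]    = {0, 2, 3, 5};   // last entry == num_nonzeros
    float h_x[]              = {1, 1, 1};
    int num_rows = 3, num_cols = 3, num_nonzeros = 5;

    float *d_values, *d_x, *d_y;
    int *d_column_indices, *d_row_offsets;
    cudaMalloc(&d_values, sizeof(h_values));
    cudaMalloc(&d_column_indices, sizeof(h_column_indices));
    cudaMalloc(&d_row_offsets, sizeof(h_row_offsets));
    cudaMalloc(&d_x, sizeof(h_x));
    cudaMalloc(&d_y, num_rows * sizeof(float));
    cudaMemcpy(d_values, h_values, sizeof(h_values), cudaMemcpyHostToDevice);
    cudaMemcpy(d_column_indices, h_column_indices, sizeof(h_column_indices),
               cudaMemcpyHostToDevice);
    cudaMemcpy(d_row_offsets, h_row_offsets, sizeof(h_row_offsets),
               cudaMemcpyHostToDevice);
    cudaMemcpy(d_x, h_x, sizeof(h_x), cudaMemcpyHostToDevice);

    // Two-phase call: size query, then run (note: no alpha/beta parameters)
    void *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes,
                           d_values, d_row_offsets, d_column_indices,
                           d_x, d_y, num_rows, num_cols, num_nonzeros);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes,
                           d_values, d_row_offsets, d_column_indices,
                           d_x, d_y, num_rows, num_cols, num_nonzeros);

    float h_y[3];
    cudaMemcpy(h_y, d_y, sizeof(h_y), cudaMemcpyDeviceToHost);
    printf("y = [%g, %g, %g]\n", h_y[0], h_y[1], h_y[2]);  // expect [3, 3, 9]
    return 0;
}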
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within global memory. - */ - -#pragma once - -#include -#include - -#include "../../block_range/block_range_histo.cuh" -#include "../../grid/grid_even_share.cuh" -#include "../../grid/grid_queue.cuh" -#include "../../util_debug.cuh" -#include "../../util_device.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/****************************************************************************** - * Kernel entry points - *****************************************************************************/ - -/** - * Initialization kernel entry point (multi-block). Prepares queue descriptors and zeroes global counters. - */ -template < - int BINS, ///< Number of histogram bins per channel - int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed - typename Offset, ///< Signed integer type for global offsets - typename HistoCounter> ///< Integer type for counting sample occurrences per histogram bin -__launch_bounds__ (BINS, 1) -__global__ void HistoInitKernel( - GridQueue grid_queue, ///< [in] Drain queue descriptor for dynamically mapping tile data onto thread blocks - ArrayWrapper d_out_histograms, ///< [out] Histogram counter data having logical dimensions HistoCounter[ACTIVE_CHANNELS][BINS] - Offset num_samples) ///< [in] Total number of samples \p d_samples for all channels -{ - d_out_histograms.array[blockIdx.x][threadIdx.x] = 0; - if (threadIdx.x == 0) grid_queue.FillAndResetDrain(num_samples); -} - - -/** - * Histogram tiles kernel entry point (multi-block). Computes privatized histograms, one per thread block. - */ -template < - typename BlockRangeHistogramPolicy, ///< Parameterized BlockRangeHistogramPolicy tuning policy type - int BINS, ///< Number of histogram bins per channel - int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) - int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed - typename InputIterator, ///< The input iterator type \iterator. Must have a value type that is assignable to unsigned char - typename HistoCounter, ///< Integer type for counting sample occurrences per histogram bin - typename Offset> ///< Signed integer type for global offsets -__launch_bounds__ (int(BlockRangeHistogramPolicy::BLOCK_THREADS)) -__global__ void HistoRegionKernel( - InputIterator d_samples, ///< [in] Array of sample data. The samples from different channels are assumed to be interleaved (e.g., an array of 32b pixels where each pixel consists of four RGBA 8b samples). 
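The interleaving convention just described means sample i of channel c lives at d_samples[i * CHANNELS + c]. A host-side reference histogram using that addressing, sketched here as an aid for checking the kernels in this (now deleted) dispatch file; it assumes each sample value already falls in [0, BINS):

// CPU reference: histogram ACTIVE_CHANNELS of CHANNELS interleaved 8-bit
// samples (e.g., CHANNELS = 4, ACTIVE_CHANNELS = 3 for the RGB of RGBA data).
// Assumes every sample value is a valid bin index in [0, BINS).
template <int BINS, int CHANNELS, int ACTIVE_CHANNELS>
void histogram_reference(const unsigned char *samples, int num_pixels,
                         int histograms[ACTIVE_CHANNELS][BINS])
{
    for (int c = 0; c < ACTIVE_CHANNELS; ++c)
        for (int b = 0; b < BINS; ++b)
            histograms[c][b] = 0;

    for (int i = 0; i < num_pixels; ++i)
        for (int c = 0; c < ACTIVE_CHANNELS; ++c)
            ++histograms[c][samples[i * CHANNELS + c]];   // interleaved access
}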
- ArrayWrapper d_out_histograms, ///< [out] Histogram counter data having logical dimensions HistoCounter[ACTIVE_CHANNELS][gridDim.x][BINS] - Offset num_samples, ///< [in] Total number of samples \p d_samples for all channels - GridEvenShare even_share, ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block - GridQueue queue) ///< [in] Drain queue descriptor for dynamically mapping tile data onto thread blocks -{ - // Constants - enum - { - BLOCK_THREADS = BlockRangeHistogramPolicy::BLOCK_THREADS, - ITEMS_PER_THREAD = BlockRangeHistogramPolicy::ITEMS_PER_THREAD, - TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD, - }; - - // Thread block type for compositing input tiles - typedef BlockRangeHistogram BlockRangeHistogramT; - - // Shared memory for BlockRangeHistogram - __shared__ typename BlockRangeHistogramT::TempStorage temp_storage; - - // Consume input tiles - BlockRangeHistogramT(temp_storage, d_samples, d_out_histograms.array).ConsumeRange( - num_samples, - even_share, - queue, - Int2Type()); -} - - -/** - * Aggregation kernel entry point (single-block). Aggregates privatized threadblock histograms from a previous multi-block histogram pass. - */ -template < - int BINS, ///< Number of histogram bins per channel - int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed - typename HistoCounter> ///< Integer type for counting sample occurrences per histogram bin -__launch_bounds__ (BINS, 1) -__global__ void HistoAggregateKernel( - HistoCounter* d_block_histograms, ///< [in] Histogram counter data having logical dimensions HistoCounter[ACTIVE_CHANNELS][num_threadblocks][BINS] - ArrayWrapper d_out_histograms, ///< [out] Histogram counter data having logical dimensions HistoCounter[ACTIVE_CHANNELS][BINS] - int num_threadblocks) ///< [in] Number of threadblock histograms per channel in \p d_block_histograms -{ - // Accumulate threadblock-histograms from the channel - HistoCounter bin_aggregate = 0; - - int block_offset = blockIdx.x * (num_threadblocks * BINS); - int block_end = block_offset + (num_threadblocks * BINS); - -#if CUB_PTX_ARCH >= 200 - #pragma unroll 32 -#endif - while (block_offset < block_end) - { - HistoCounter block_bin_count = d_block_histograms[block_offset + threadIdx.x]; - - bin_aggregate += block_bin_count; - block_offset += BINS; - } - - // Output - d_out_histograms.array[blockIdx.x][threadIdx.x] = bin_aggregate; -} - - - -/****************************************************************************** - * Dispatch - ******************************************************************************/ - -/** - * Utility class for dispatching the appropriately-tuned kernels for DeviceHistogram - */ -template < - DeviceHistogramAlgorithm HISTO_ALGORITHM, ///< Cooperative histogram algorithm to use - int BINS, ///< Number of histogram bins per channel - int CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) - int ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed - typename InputIterator, ///< The input iterator type \iterator. 
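HistoAggregateKernel above walks counters laid out as HistoCounter[ACTIVE_CHANNELS][num_threadblocks][BINS], with blockIdx.x selecting the channel and threadIdx.x the bin. The equivalent sequential reduction, useful for reasoning about that indexing:

// Sequential model of HistoAggregateKernel's indexing: block_histograms is
// laid out as [active_channels][num_threadblocks][bins], flattened.
void aggregate_reference(const int *block_histograms,
                         int active_channels, int num_threadblocks, int bins,
                         int *out /* [active_channels][bins], flattened */)
{
    for (int channel = 0; channel < active_channels; ++channel) {
        for (int bin = 0; bin < bins; ++bin) {
            int sum = 0;
            // Matches the kernel: block_offset starts at
            // channel * (num_threadblocks * bins) and steps by bins,
            // reading block_offset + bin each iteration.
            for (int block = 0; block < num_threadblocks; ++block)
                sum += block_histograms[channel * num_threadblocks * bins
                                        + block * bins + bin];
            out[channel * bins + bin] = sum;
        }
    }
}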
Must have a value type that is assignable to unsigned char - typename HistoCounter, ///< Integer type for counting sample occurrences per histogram bin - typename Offset> ///< Signed integer type for global offsets -struct DeviceHistogramDispatch -{ - /****************************************************************************** - * Tuning policies - ******************************************************************************/ - - /// SM35 - struct Policy350 - { - // HistoRegionPolicy - typedef BlockRangeHistogramPolicy< - (HISTO_ALGORITHM == DEVICE_HISTO_SORT) ? 128 : 256, - (HISTO_ALGORITHM == DEVICE_HISTO_SORT) ? 12 : (30 / ACTIVE_CHANNELS), - HISTO_ALGORITHM, - (HISTO_ALGORITHM == DEVICE_HISTO_SORT) ? GRID_MAPPING_DYNAMIC : GRID_MAPPING_EVEN_SHARE> - HistoRegionPolicy; - }; - - /// SM30 - struct Policy300 - { - // HistoRegionPolicy - typedef BlockRangeHistogramPolicy< - 128, - (HISTO_ALGORITHM == DEVICE_HISTO_SORT) ? 20 : (22 / ACTIVE_CHANNELS), - HISTO_ALGORITHM, - (HISTO_ALGORITHM == DEVICE_HISTO_SORT) ? GRID_MAPPING_DYNAMIC : GRID_MAPPING_EVEN_SHARE> - HistoRegionPolicy; - }; - - /// SM20 - struct Policy200 - { - // HistoRegionPolicy - typedef BlockRangeHistogramPolicy< - 128, - (HISTO_ALGORITHM == DEVICE_HISTO_SORT) ? 21 : (23 / ACTIVE_CHANNELS), - HISTO_ALGORITHM, - GRID_MAPPING_DYNAMIC> - HistoRegionPolicy; - }; - - /// SM10 - struct Policy100 - { - // HistoRegionPolicy - typedef BlockRangeHistogramPolicy< - 128, - 7, - DEVICE_HISTO_SORT, // (use sort regardless because g-atomics are unsupported and s-atomics are perf-useless) - GRID_MAPPING_EVEN_SHARE> - HistoRegionPolicy; - }; - - - /****************************************************************************** - * Tuning policies of current PTX compiler pass - ******************************************************************************/ - -#if (CUB_PTX_ARCH >= 350) - typedef Policy350 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 300) - typedef Policy300 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 200) - typedef Policy200 PtxPolicy; - -#else - typedef Policy100 PtxPolicy; - -#endif - - // "Opaque" policies (whose parameterizations aren't reflected in the type signature) - struct PtxHistoRegionPolicy : PtxPolicy::HistoRegionPolicy {}; - - - /****************************************************************************** - * Utilities - ******************************************************************************/ - - /** - * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use - */ - template - CUB_RUNTIME_FUNCTION __forceinline__ - static void InitConfigs( - int ptx_version, - KernelConfig &histo_range_config) - { - #if (CUB_PTX_ARCH > 0) - - // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy - histo_range_config.template Init(); - - #else - - // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version - if (ptx_version >= 350) - { - histo_range_config.template Init(); - } - else if (ptx_version >= 300) - { - histo_range_config.template Init(); - } - else if (ptx_version >= 200) - { - histo_range_config.template Init(); - } - else - { - histo_range_config.template Init(); - } - - #endif - } - - - /** - * Kernel kernel dispatch configuration - */ - struct KernelConfig - { - int block_threads; - int items_per_thread; - DeviceHistogramAlgorithm block_algorithm; - GridMappingStrategy grid_mapping; - - template - CUB_RUNTIME_FUNCTION __forceinline__ - void Init() - { 
- block_threads = BlockPolicy::BLOCK_THREADS; - items_per_thread = BlockPolicy::ITEMS_PER_THREAD; - block_algorithm = BlockPolicy::HISTO_ALGORITHM; - grid_mapping = BlockPolicy::GRID_MAPPING; - } - - CUB_RUNTIME_FUNCTION __forceinline__ - void Print() - { - printf("%d, %d, %d, %d", block_threads, items_per_thread, block_algorithm, grid_mapping); - } - - }; - - - /****************************************************************************** - * Dispatch entrypoints - ******************************************************************************/ - - - /** - * Internal dispatch routine - */ - template < - typename InitHistoKernelPtr, ///< Function type of cub::HistoInitKernel - typename HistoRegionKernelPtr, ///< Function type of cub::HistoRegionKernel - typename AggregateHistoKernelPtr> ///< Function type of cub::HistoAggregateKernel - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIterator d_samples, ///< [in] Input samples to histogram - HistoCounter *d_histograms[ACTIVE_CHANNELS], ///< [out] Array of channel histograms, each having BINS counters of integral type \p HistoCounter. - Offset num_samples, ///< [in] Number of samples to process - cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Default is \p false. - InitHistoKernelPtr init_kernel, ///< [in] Kernel function pointer to parameterization of cub::HistoInitKernel - HistoRegionKernelPtr histo_range_kernel, ///< [in] Kernel function pointer to parameterization of cub::HistoRegionKernel - AggregateHistoKernelPtr aggregate_kernel, ///< [in] Kernel function pointer to parameterization of cub::HistoAggregateKernel - KernelConfig histo_range_config) ///< [in] Dispatch parameters that match the policy that \p histo_range_kernel was compiled for - { - #ifndef CUB_RUNTIME_ENABLED - - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported); - - #else - - cudaError error = cudaSuccess; - do - { - // Get device ordinal - int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; - - // Get device SM version - int sm_version; - if (CubDebug(error = SmVersion(sm_version, device_ordinal))) break; - - // Get SM count - int sm_count; - if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; - - // Get SM occupancy for histo_range_kernel - int histo_range_sm_occupancy; - if (CubDebug(error = MaxSmOccupancy( - histo_range_sm_occupancy, - sm_version, - histo_range_kernel, - histo_range_config.block_threads))) break; - - // Get device occupancy for histo_range_kernel - int histo_range_occupancy = histo_range_sm_occupancy * sm_count; - - // Get tile size for histo_range_kernel - int channel_tile_size = histo_range_config.block_threads * histo_range_config.items_per_thread; - int tile_size = channel_tile_size * CHANNELS; - - // Even-share work distribution - int subscription_factor = histo_range_sm_occupancy; // Amount of CTAs to oversubscribe the device beyond actively-resident (heuristic) - GridEvenShare even_share( - num_samples, - histo_range_occupancy * subscription_factor, - 
tile_size); - - // Get grid size for histo_range_kernel - int histo_range_grid_size; - switch (histo_range_config.grid_mapping) - { - case GRID_MAPPING_EVEN_SHARE: - - // Work is distributed evenly - histo_range_grid_size = even_share.grid_size; - break; - - case GRID_MAPPING_DYNAMIC: - - // Work is distributed dynamically - int num_tiles = (num_samples + tile_size - 1) / tile_size; - histo_range_grid_size = (num_tiles < histo_range_occupancy) ? - num_tiles : // Not enough to fill the device with threadblocks - histo_range_occupancy; // Fill the device with threadblocks - break; - }; - - // Temporary storage allocation requirements - void* allocations[2]; - size_t allocation_sizes[2] = - { - ACTIVE_CHANNELS * histo_range_grid_size * sizeof(HistoCounter) * BINS, // bytes needed for privatized histograms - GridQueue::AllocationSize() // bytes needed for grid queue descriptor - }; - - // Alias the temporary allocations from the single storage blob (or set the necessary size of the blob) - if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; - if (d_temp_storage == NULL) - { - // Return if the caller is simply requesting the size of the storage allocation - return cudaSuccess; - } - - // Alias the allocation for the privatized per-block reductions - HistoCounter *d_block_histograms = (HistoCounter*) allocations[0]; - - // Alias the allocation for the grid queue descriptor - GridQueue queue(allocations[1]); - - // Setup array wrapper for histogram channel output (because we can't pass static arrays as kernel parameters) - ArrayWrapper d_histo_wrapper; - for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) - d_histo_wrapper.array[CHANNEL] = d_histograms[CHANNEL]; - - // Setup array wrapper for temporary histogram channel output (because we can't pass static arrays as kernel parameters) - ArrayWrapper d_temp_histo_wrapper; - for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL) - d_temp_histo_wrapper.array[CHANNEL] = d_block_histograms + (CHANNEL * histo_range_grid_size * BINS); - - // Log init_kernel configuration - if (debug_synchronous) CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", ACTIVE_CHANNELS, BINS, (long long) stream); - - // Invoke init_kernel to initialize counters and queue descriptor - init_kernel<<>>(queue, d_histo_wrapper, num_samples); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - - // Whether we need privatized histograms (i.e., non-global atomics and multi-block) - bool privatized_temporaries = (histo_range_grid_size > 1) && (histo_range_config.block_algorithm != DEVICE_HISTO_GLOBAL_ATOMIC); - - // Log histo_range_kernel configuration - if (debug_synchronous) CubLog("Invoking histo_range_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", - histo_range_grid_size, histo_range_config.block_threads, (long long) stream, histo_range_config.items_per_thread, histo_range_sm_occupancy); - - // Invoke histo_range_kernel - histo_range_kernel<<>>( - d_samples, - (privatized_temporaries) ? 
- d_temp_histo_wrapper : - d_histo_wrapper, - num_samples, - even_share, - queue); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - - // Aggregate privatized block histograms if necessary - if (privatized_temporaries) - { - // Log aggregate_kernel configuration - if (debug_synchronous) CubLog("Invoking aggregate_kernel<<<%d, %d, 0, %lld>>>()\n", - ACTIVE_CHANNELS, BINS, (long long) stream); - - // Invoke aggregate_kernel - aggregate_kernel<<>>( - d_block_histograms, - d_histo_wrapper, - histo_range_grid_size); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - } - } - while (0); - - return error; - - #endif // CUB_RUNTIME_ENABLED - } - - - /** - * Internal dispatch routine - */ - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIterator d_samples, ///< [in] Input samples to histogram - HistoCounter *d_histograms[ACTIVE_CHANNELS], ///< [out] Array of channel histograms, each having BINS counters of integral type \p HistoCounter. - int num_samples, ///< [in] Number of samples to process - cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - cudaError error = cudaSuccess; - do - { - // Get PTX version - int ptx_version; - #if (CUB_PTX_ARCH == 0) - if (CubDebug(error = PtxVersion(ptx_version))) break; - #else - ptx_version = CUB_PTX_ARCH; - #endif - - // Get kernel kernel dispatch configurations - KernelConfig histo_range_config; - InitConfigs(ptx_version, histo_range_config); - - // Dispatch - if (CubDebug(error = Dispatch( - d_temp_storage, - temp_storage_bytes, - d_samples, - d_histograms, - num_samples, - stream, - debug_synchronous, - HistoInitKernel, - HistoRegionKernel, - HistoAggregateKernel, - histo_range_config))) break; - } - while (0); - - return error; - } -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/SRC/cub/device/dispatch/device_radix_sort_dispatch.cuh b/SRC/cub/device/dispatch/device_radix_sort_dispatch.cuh deleted file mode 100644 index 028a5684..00000000 --- a/SRC/cub/device/dispatch/device_radix_sort_dispatch.cuh +++ /dev/null @@ -1,939 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. 
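The AliasTemporaries pattern used in the histogram Dispatch above (carve several sub-allocations out of one caller-provided blob, or report the total size when the blob is NULL) can be modeled on the host as follows. This is a simplified sketch: CUB's real helper also pads each sub-allocation for alignment.

#include <cstddef>

// Simplified model of CUB's AliasTemporaries: partition one storage blob
// into n sub-allocations, or compute the required size if blob == NULL.
int alias_temporaries(void *blob, size_t &blob_bytes,
                      void *allocations[], const size_t sizes[], int n)
{
    size_t needed = 0;
    for (int i = 0; i < n; ++i)
        needed += sizes[i];

    if (blob == NULL) {            // size-query pass: report and return
        blob_bytes = needed;
        return 0;
    }
    if (blob_bytes < needed)       // caller's blob is too small
        return 1;

    char *p = static_cast<char *>(blob);
    for (int i = 0; i < n; ++i) {  // carve sub-allocations in order
        allocations[i] = p;
        p += sizes[i];
    }
    return 0;
}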
- * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within global memory. - */ - -#pragma once - -#include -#include - -#include "../../block_range/block_range_radix_sort_upsweep.cuh" -#include "../../block_range/block_range_radix_sort_downsweep.cuh" -#include "../../block_range/block_range_scan.cuh" -#include "../../grid/grid_even_share.cuh" -#include "../../util_debug.cuh" -#include "../../util_device.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/****************************************************************************** - * Kernel entry points - *****************************************************************************/ - -/** - * Upsweep pass kernel entry point (multi-block). Computes privatized digit histograms, one per block. - */ -template < - typename BlockRangeRadixSortUpsweepPolicy, ///< Parameterized BlockRangeRadixSortUpsweepPolicy tuning policy type - bool DESCENDING, ///< Whether or not the sorted-order is high-to-low - typename Key, ///< Key type - typename Offset> ///< Signed integer type for global offsets -__launch_bounds__ (int(BlockRangeRadixSortUpsweepPolicy::BLOCK_THREADS), 1) -__global__ void RadixSortUpsweepKernel( - Key *d_keys, ///< [in] Input keys buffer - Offset *d_spine, ///< [out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.) 
- Offset num_items, ///< [in] Total number of input data items - int current_bit, ///< [in] Bit position of current radix digit - int num_bits, ///< [in] Number of bits of current radix digit - bool first_pass, ///< [in] Whether this is the first digit pass - GridEvenShare even_share) ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block -{ - // Parameterize BlockRangeRadixSortUpsweep type for the current configuration - typedef BlockRangeRadixSortUpsweep BlockRangeRadixSortUpsweepT; // Primary - - // Shared memory storage - __shared__ typename BlockRangeRadixSortUpsweepT::TempStorage temp_storage; - - // Initialize even-share descriptor for this thread block - even_share.BlockInit(); - - Offset bin_count; - BlockRangeRadixSortUpsweepT(temp_storage, d_keys, current_bit, num_bits).ProcessRegion( - even_share.block_offset, - even_share.block_end, - bin_count); - - // Write out digit counts (striped) - if (threadIdx.x < BlockRangeRadixSortUpsweepT::RADIX_DIGITS) - { - int bin_idx = (DESCENDING) ? - BlockRangeRadixSortUpsweepT::RADIX_DIGITS - threadIdx.x - 1 : - threadIdx.x; - - d_spine[(gridDim.x * bin_idx) + blockIdx.x] = bin_count; - } -} - - -/** - * Spine scan kernel entry point (single-block). Computes an exclusive prefix sum over the privatized digit histograms - */ -template < - typename BlockRangeScanPolicy, ///< Parameterizable tuning policy type for cub::BlockRangeScan abstraction - typename Offset> ///< Signed integer type for global offsets -__launch_bounds__ (int(BlockRangeScanPolicy::BLOCK_THREADS), 1) -__global__ void RadixSortScanKernel( - Offset *d_spine, ///< [in,out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.) - int num_counts) ///< [in] Total number of bin-counts -{ - // Parameterize the BlockRangeScan type for the current configuration - typedef BlockRangeScan BlockRangeScanT; - - // Shared memory storage - __shared__ typename BlockRangeScanT::TempStorage temp_storage; - - if (blockIdx.x > 0) return; - - // Block scan instance - BlockRangeScanT block_scan(temp_storage, d_spine, d_spine, cub::Sum(), Offset(0)) ; - - // Process full input tiles - int block_offset = 0; - BlockScanRunningPrefixOp prefix_op(0, Sum()); - while (block_offset + BlockRangeScanT::TILE_ITEMS <= num_counts) - { - block_scan.ConsumeTile(block_offset, prefix_op); - block_offset += BlockRangeScanT::TILE_ITEMS; - } -} - - -/** - * Downsweep pass kernel entry point (multi-block). Scatters keys (and values) into corresponding bins for the current digit place. - */ -template < - typename BlockRangeRadixSortDownsweepPolicy, ///< Parameterizable tuning policy type for cub::BlockRangeRadixSortUpsweep abstraction - bool DESCENDING, ///< Whether or not the sorted-order is high-to-low - typename Key, ///< Key type - typename Value, ///< Value type - typename Offset> ///< Signed integer type for global offsets -__launch_bounds__ (int(BlockRangeRadixSortDownsweepPolicy::BLOCK_THREADS), 1) -__global__ void RadixSortDownsweepKernel( - Key *d_keys_in, ///< [in] Input keys ping buffer - Key *d_keys_out, ///< [in] Output keys pong buffer - Value *d_values_in, ///< [in] Input values ping buffer - Value *d_values_out, ///< [in] Output values pong buffer - Offset *d_spine, ///< [in] Scan of privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.) 
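The spine layout matters here: the upsweep writes digit counts striped as d_spine[gridDim.x * digit + block], so the single exclusive prefix sum computed by RadixSortScanKernel directly yields, for every (digit, block) pair, the global scatter offset the downsweep needs. A sequential model of that scan:

// Sequential model of the spine: upsweep counts are striped as
// spine[grid_size * digit + block]; one exclusive scan over the whole
// array gives each block its global base offset for each digit.
void scan_spine(int *spine, int radix_digits, int grid_size)
{
    int running = 0;
    for (int digit = 0; digit < radix_digits; ++digit) {
        for (int block = 0; block < grid_size; ++block) {
            int idx = grid_size * digit + block;   // striped index
            int count = spine[idx];
            spine[idx] = running;                  // exclusive prefix sum
            running += count;
        }
    }
}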
- Offset num_items, ///< [in] Total number of input data items - int current_bit, ///< [in] Bit position of current radix digit - int num_bits, ///< [in] Number of bits of current radix digit - bool first_pass, ///< [in] Whether this is the first digit pass - bool last_pass, ///< [in] Whether this is the last digit pass - GridEvenShare even_share) ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block -{ - // Parameterize BlockRangeRadixSortDownsweep type for the current configuration - typedef BlockRangeRadixSortDownsweep BlockRangeRadixSortDownsweepT; - - // Shared memory storage - __shared__ typename BlockRangeRadixSortDownsweepT::TempStorage temp_storage; - - // Initialize even-share descriptor for this thread block - even_share.BlockInit(); - - // Process input tiles - BlockRangeRadixSortDownsweepT(temp_storage, num_items, d_spine, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit, num_bits).ProcessRegion( - even_share.block_offset, - even_share.block_end); -} - - - -/****************************************************************************** - * Dispatch - ******************************************************************************/ - -/** - * Utility class for dispatching the appropriately-tuned kernels for DeviceRadixSort - */ -template < - bool DESCENDING, ///< Whether or not the sorted-order is high-to-low - typename Key, ///< Key type - typename Value, ///< Value type - typename Offset> ///< Signed integer type for global offsets -struct DeviceRadixSortDispatch -{ - /****************************************************************************** - * Tuning policies - ******************************************************************************/ - - /// SM35 - struct Policy350 - { - enum { - KEYS_ONLY = (Equals::VALUE), - SCALE_FACTOR = (CUB_MAX(sizeof(Key), sizeof(Value)) + 3) / 4, - RADIX_BITS = 5, - }; - - // Primary UpsweepPolicy - typedef BlockRangeRadixSortUpsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), LOAD_LDG, RADIX_BITS> UpsweepPolicyKeys; - typedef BlockRangeRadixSortUpsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), LOAD_LDG, RADIX_BITS> UpsweepPolicyPairs; - typedef typename If::Type UpsweepPolicy; - - // Alternate UpsweepPolicy for (RADIX_BITS-1)-bit passes - typedef BlockRangeRadixSortUpsweepPolicy <64, CUB_MAX(1, 22 / SCALE_FACTOR), LOAD_LDG, RADIX_BITS - 1> AltUpsweepPolicyKeys; - typedef BlockRangeRadixSortUpsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), LOAD_LDG, RADIX_BITS - 1> AltUpsweepPolicyPairs; - typedef typename If::Type AltUpsweepPolicy; - - // ScanPolicy - typedef BlockRangeScanPolicy <1024, 4, BLOCK_LOAD_VECTORIZE, false, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, false, BLOCK_SCAN_WARP_SCANS> ScanPolicy; - - // Primary DownsweepPolicy - typedef BlockRangeRadixSortDownsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), BLOCK_LOAD_DIRECT, LOAD_LDG, false, true, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS> DownsweepPolicyKeys; - typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), BLOCK_LOAD_DIRECT, LOAD_LDG, false, true, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS> DownsweepPolicyPairs; - typedef typename If::Type DownsweepPolicy; - - // Alternate DownsweepPolicy for (RADIX_BITS-1)-bit passes - typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 11 / SCALE_FACTOR), BLOCK_LOAD_DIRECT, LOAD_LDG, false, true, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, 
cudaSharedMemBankSizeEightByte, RADIX_BITS - 1> AltDownsweepPolicyKeys; - typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), BLOCK_LOAD_DIRECT, LOAD_LDG, false, true, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS - 1> AltDownsweepPolicyPairs; - typedef typename If::Type AltDownsweepPolicy; - }; - - - /// SM30 - struct Policy300 - { - enum { - KEYS_ONLY = (Equals::VALUE), - SCALE_FACTOR = (CUB_MAX(sizeof(Key), sizeof(Value)) + 3) / 4, - RADIX_BITS = 5, - }; - - // UpsweepPolicy - typedef BlockRangeRadixSortUpsweepPolicy <256, CUB_MAX(1, 7 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS> UpsweepPolicyKeys; - typedef BlockRangeRadixSortUpsweepPolicy <256, CUB_MAX(1, 5 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS> UpsweepPolicyPairs; - typedef typename If::Type UpsweepPolicy; - - // Alternate UpsweepPolicy for (RADIX_BITS-1)-bit passes - typedef BlockRangeRadixSortUpsweepPolicy <256, CUB_MAX(1, 7 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS - 1> AltUpsweepPolicyKeys; - typedef BlockRangeRadixSortUpsweepPolicy <256, CUB_MAX(1, 5 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS - 1> AltUpsweepPolicyPairs; - typedef typename If::Type AltUpsweepPolicy; - - // ScanPolicy - typedef BlockRangeScanPolicy <1024, 4, BLOCK_LOAD_VECTORIZE, false, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, false, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; - - // DownsweepPolicy - typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 14 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS> DownsweepPolicyKeys; - typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 10 / SCALE_FACTOR), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS> DownsweepPolicyPairs; - typedef typename If::Type DownsweepPolicy; - - // Alternate DownsweepPolicy for (RADIX_BITS-1)-bit passes - typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 14 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS - 1> AltDownsweepPolicyKeys; - typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 10 / SCALE_FACTOR), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS - 1> AltDownsweepPolicyPairs; - typedef typename If::Type AltDownsweepPolicy; - }; - - - /// SM20 - struct Policy200 - { - enum { - KEYS_ONLY = (Equals::VALUE), - SCALE_FACTOR = (CUB_MAX(sizeof(Key), sizeof(Value)) + 3) / 4, - RADIX_BITS = 5, - }; - - // UpsweepPolicy - typedef BlockRangeRadixSortUpsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS> UpsweepPolicyKeys; - typedef BlockRangeRadixSortUpsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS> UpsweepPolicyPairs; - typedef typename If::Type UpsweepPolicy; - - // Alternate UpsweepPolicy for (RADIX_BITS-1)-bit passes - typedef BlockRangeRadixSortUpsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS - 1> AltUpsweepPolicyKeys; - typedef BlockRangeRadixSortUpsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS - 1> AltUpsweepPolicyPairs; - typedef typename If::Type AltUpsweepPolicy; - - // ScanPolicy - typedef BlockRangeScanPolicy <512, 4, BLOCK_LOAD_VECTORIZE, false, LOAD_DEFAULT, 
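/*
 * Note the SCALE_FACTOR idiom running through these policies: it shrinks
 * items-per-thread as max(sizeof(Key), sizeof(Value)) grows, holding the
 * per-thread byte volume roughly constant.  Illustrative evaluations using
 * the SM35 keys-only constant 18:
 *
 *   Key = int,    keys only:      SCALE_FACTOR = (4 + 3)/4  = 1 -> 18 items/thread
 *   Key = double, Value = double: SCALE_FACTOR = (8 + 3)/4  = 2 ->  9 items/thread
 *   16-byte payload:              SCALE_FACTOR = (16 + 3)/4 = 4 ->  4 items/thread
 */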
BLOCK_STORE_VECTORIZE, false, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; - - // DownsweepPolicy - typedef BlockRangeRadixSortDownsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS> DownsweepPolicyKeys; - typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS> DownsweepPolicyPairs; - typedef typename If::Type DownsweepPolicy; - - // Alternate DownsweepPolicy for (RADIX_BITS-1)-bit passes - typedef BlockRangeRadixSortDownsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS - 1> AltDownsweepPolicyKeys; - typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS - 1> AltDownsweepPolicyPairs; - typedef typename If::Type AltDownsweepPolicy; - }; - - - /// SM13 - struct Policy130 - { - enum { - KEYS_ONLY = (Equals::VALUE), - SCALE_FACTOR = (CUB_MAX(sizeof(Key), sizeof(Value)) + 3) / 4, - RADIX_BITS = 5, - }; - - // UpsweepPolicy - typedef BlockRangeRadixSortUpsweepPolicy <128, CUB_MAX(1, 19 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS> UpsweepPolicyKeys; - typedef BlockRangeRadixSortUpsweepPolicy <128, CUB_MAX(1, 19 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS> UpsweepPolicyPairs; - typedef typename If::Type UpsweepPolicy; - - // Alternate UpsweepPolicy for (RADIX_BITS-1)-bit passes - typedef BlockRangeRadixSortUpsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS - 1> AltUpsweepPolicyKeys; - typedef BlockRangeRadixSortUpsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS - 1> AltUpsweepPolicyPairs; - typedef typename If::Type AltUpsweepPolicy; - - // ScanPolicy - typedef BlockRangeScanPolicy <256, 4, BLOCK_LOAD_VECTORIZE, false, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, false, BLOCK_SCAN_WARP_SCANS> ScanPolicy; - - // DownsweepPolicy - typedef BlockRangeRadixSortDownsweepPolicy <64, CUB_MAX(1, 19 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS> DownsweepPolicyKeys; - typedef BlockRangeRadixSortDownsweepPolicy <64, CUB_MAX(1, 19 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS> DownsweepPolicyPairs; - typedef typename If::Type DownsweepPolicy; - - // Alternate DownsweepPolicy for (RADIX_BITS-1)-bit passes - typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS - 1> AltDownsweepPolicyKeys; - typedef BlockRangeRadixSortDownsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS - 1> AltDownsweepPolicyPairs; - typedef typename If::Type AltDownsweepPolicy; - }; - - - /// SM10 - struct Policy100 - { - 
-        enum {
-            RADIX_BITS = 4,
-        };
-
-        // UpsweepPolicy
-        typedef BlockRangeRadixSortUpsweepPolicy <64, 9, LOAD_DEFAULT, RADIX_BITS> UpsweepPolicy;
-
-        // Alternate UpsweepPolicy for (RADIX_BITS-1)-bit passes
-        typedef BlockRangeRadixSortUpsweepPolicy <64, 9, LOAD_DEFAULT, RADIX_BITS - 1> AltUpsweepPolicy;
-
-        // ScanPolicy
-        typedef BlockRangeScanPolicy <256, 4, BLOCK_LOAD_VECTORIZE, false, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, false, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
-
-        // DownsweepPolicy
-        typedef BlockRangeRadixSortDownsweepPolicy <64, 9, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS> DownsweepPolicy;
-
-        // Alternate DownsweepPolicy for (RADIX_BITS-1)-bit passes
-        typedef BlockRangeRadixSortDownsweepPolicy <64, 9, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS - 1> AltDownsweepPolicy;
-    };
-
-
-    /******************************************************************************
-     * Tuning policies of current PTX compiler pass
-     ******************************************************************************/
-
-#if (CUB_PTX_ARCH >= 350)
-    typedef Policy350 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 300)
-    typedef Policy300 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 200)
-    typedef Policy200 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 130)
-    typedef Policy130 PtxPolicy;
-
-#else
-    typedef Policy100 PtxPolicy;
-
-#endif
-
-    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
-    struct PtxUpsweepPolicy      : PtxPolicy::UpsweepPolicy {};
-    struct PtxAltUpsweepPolicy   : PtxPolicy::AltUpsweepPolicy {};
-    struct PtxScanPolicy         : PtxPolicy::ScanPolicy {};
-    struct PtxDownsweepPolicy    : PtxPolicy::DownsweepPolicy {};
-    struct PtxAltDownsweepPolicy : PtxPolicy::AltDownsweepPolicy {};
-
-
-    /******************************************************************************
-     * Utilities
-     ******************************************************************************/
-
-    /**
-     * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
-     */
-    template <
-        typename Policy,
-        typename KernelConfig,
-        typename UpsweepKernelPtr,          ///< Function type of cub::RadixSortUpsweepKernel
-        typename ScanKernelPtr,             ///< Function type of cub::SpineScanKernel
-        typename DownsweepKernelPtr>        ///< Function type of cub::RadixSortDownsweepKernel
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t InitConfigs(
-        int                     sm_version,
-        int                     sm_count,
-        KernelConfig            &upsweep_config,
-        KernelConfig            &alt_upsweep_config,
-        KernelConfig            &scan_config,
-        KernelConfig            &downsweep_config,
-        KernelConfig            &alt_downsweep_config,
-        UpsweepKernelPtr        upsweep_kernel,
-        UpsweepKernelPtr        alt_upsweep_kernel,
-        ScanKernelPtr           scan_kernel,
-        DownsweepKernelPtr      downsweep_kernel,
-        DownsweepKernelPtr      alt_downsweep_kernel)
-    {
-        cudaError_t error;
-        do {
-            if (CubDebug(error = upsweep_config.template InitUpsweepPolicy<typename Policy::UpsweepPolicy>(sm_version, sm_count, upsweep_kernel))) break;
-            if (CubDebug(error = alt_upsweep_config.template InitUpsweepPolicy<typename Policy::AltUpsweepPolicy>(sm_version, sm_count, alt_upsweep_kernel))) break;
-            if (CubDebug(error = scan_config.template InitScanPolicy<typename Policy::ScanPolicy>(sm_version, sm_count, scan_kernel))) break;
-            if (CubDebug(error = downsweep_config.template InitDownsweepPolicy<typename Policy::DownsweepPolicy>(sm_version, sm_count, downsweep_kernel))) break;
-            if (CubDebug(error = alt_downsweep_config.template InitDownsweepPolicy<typename Policy::AltDownsweepPolicy>(sm_version, sm_count, alt_downsweep_kernel))) break;
-
-        } while (0);
-
-        return error;
-    }
-
-
-    /**
-     * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
-     */
-    template <
-        typename KernelConfig,
-        typename UpsweepKernelPtr,          ///< Function type of cub::RadixSortUpsweepKernel
-        typename ScanKernelPtr,             ///< Function type of cub::SpineScanKernel
-        typename DownsweepKernelPtr>        ///< Function type of cub::RadixSortDownsweepKernel
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t InitConfigs(
-        int                     ptx_version,
-        int                     sm_version,
-        int                     sm_count,
-        KernelConfig            &upsweep_config,
-        KernelConfig            &alt_upsweep_config,
-        KernelConfig            &scan_config,
-        KernelConfig            &downsweep_config,
-        KernelConfig            &alt_downsweep_config,
-        UpsweepKernelPtr        upsweep_kernel,
-        UpsweepKernelPtr        alt_upsweep_kernel,
-        ScanKernelPtr           scan_kernel,
-        DownsweepKernelPtr      downsweep_kernel,
-        DownsweepKernelPtr      alt_downsweep_kernel)
-    {
-    #if (CUB_PTX_ARCH > 0)
-
-        // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
-        cudaError_t error;
-        do {
-
-            if (CubDebug(error = upsweep_config.template InitUpsweepPolicy<PtxUpsweepPolicy>(sm_version, sm_count, upsweep_kernel))) break;
-            if (CubDebug(error = alt_upsweep_config.template InitUpsweepPolicy<PtxAltUpsweepPolicy>(sm_version, sm_count, alt_upsweep_kernel))) break;
-            if (CubDebug(error = scan_config.template InitScanPolicy<PtxScanPolicy>(sm_version, sm_count, scan_kernel))) break;
-            if (CubDebug(error = downsweep_config.template InitDownsweepPolicy<PtxDownsweepPolicy>(sm_version, sm_count, downsweep_kernel))) break;
-            if (CubDebug(error = alt_downsweep_config.template InitDownsweepPolicy<PtxAltDownsweepPolicy>(sm_version, sm_count, alt_downsweep_kernel))) break;
-
-        } while (0);
-
-        return error;
-
-    #else
-
-        // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
-        cudaError_t error;
-        if (ptx_version >= 350)
-        {
-            error = InitConfigs<Policy350>(sm_version, sm_count, upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config, upsweep_kernel, alt_upsweep_kernel, scan_kernel, downsweep_kernel, alt_downsweep_kernel);
-        }
-        else if (ptx_version >= 300)
-        {
-            error = InitConfigs<Policy300>(sm_version, sm_count, upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config, upsweep_kernel, alt_upsweep_kernel, scan_kernel, downsweep_kernel, alt_downsweep_kernel);
-        }
-        else if (ptx_version >= 200)
-        {
-            error = InitConfigs<Policy200>(sm_version, sm_count, upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config, upsweep_kernel, alt_upsweep_kernel, scan_kernel, downsweep_kernel, alt_downsweep_kernel);
-        }
-        else if (ptx_version >= 130)
-        {
-            error = InitConfigs<Policy130>(sm_version, sm_count, upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config, upsweep_kernel, alt_upsweep_kernel, scan_kernel, downsweep_kernel, alt_downsweep_kernel);
-        }
-        else
-        {
-            error = InitConfigs<Policy100>(sm_version, sm_count, upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config, upsweep_kernel, alt_upsweep_kernel, scan_kernel, downsweep_kernel, alt_downsweep_kernel);
-        }
-
-        return error;
-
-    #endif
-    }
-
-
-    /**
-     * Kernel dispatch configurations
-     */
-    struct KernelConfig
-    {
-        int                     block_threads;
-        int                     items_per_thread;
-        int                     tile_size;
-        cudaSharedMemConfig     smem_config;
-        int                     radix_bits;
-        int                     sm_occupancy;
-        int                     max_grid_size;
-        int                     subscription_factor;    // Amount of CTAs to oversubscribe the device beyond actively-resident (heuristic)
-
-        template <typename UpsweepPolicy, typename UpsweepKernelPtr>
-        CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t InitUpsweepPolicy(
-            int sm_version, int sm_count, UpsweepKernelPtr upsweep_kernel)
-        {
-            block_threads       = UpsweepPolicy::BLOCK_THREADS;
-            items_per_thread    = UpsweepPolicy::ITEMS_PER_THREAD;
-            radix_bits          = UpsweepPolicy::RADIX_BITS;
-            smem_config         = cudaSharedMemBankSizeFourByte;
-            tile_size           = block_threads * items_per_thread;
-            cudaError_t retval  = MaxSmOccupancy(sm_occupancy, sm_version, upsweep_kernel, block_threads);
-            subscription_factor = CUB_SUBSCRIPTION_FACTOR(sm_version);
-            max_grid_size       = (sm_occupancy * sm_count) * subscription_factor;
-
-            return retval;
-        }
-
-        template <typename ScanPolicy, typename ScanKernelPtr>
-        CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t InitScanPolicy(
-            int sm_version, int sm_count, ScanKernelPtr scan_kernel)
-        {
-            block_threads       = ScanPolicy::BLOCK_THREADS;
-            items_per_thread    = ScanPolicy::ITEMS_PER_THREAD;
-            radix_bits          = 0;
-            smem_config         = cudaSharedMemBankSizeFourByte;
-            tile_size           = block_threads * items_per_thread;
-            sm_occupancy        = 1;
-            subscription_factor = 1;
-            max_grid_size       = 1;
-
-            return cudaSuccess;
-        }
-
-        template <typename DownsweepPolicy, typename DownsweepKernelPtr>
-        CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t InitDownsweepPolicy(
-            int sm_version, int sm_count, DownsweepKernelPtr downsweep_kernel)
-        {
-            block_threads       = DownsweepPolicy::BLOCK_THREADS;
-            items_per_thread    = DownsweepPolicy::ITEMS_PER_THREAD;
-            radix_bits          = DownsweepPolicy::RADIX_BITS;
-            smem_config         = DownsweepPolicy::SMEM_CONFIG;
-            tile_size           = block_threads * items_per_thread;
-            cudaError_t retval  = MaxSmOccupancy(sm_occupancy, sm_version, downsweep_kernel, block_threads);
-            subscription_factor = CUB_SUBSCRIPTION_FACTOR(sm_version);
-            max_grid_size       = (sm_occupancy * sm_count) * subscription_factor;
-
-            return retval;
-        }
-    };
-
-
-    /******************************************************************************
-     * Allocation of device temporaries
-     ******************************************************************************/
-
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t AllocateTemporaries(
-        void    *d_temp_storage,            ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
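/*
 * The max_grid_size computed above is an occupancy-times-oversubscription
 * bound rather than a function of the input size.  With illustrative
 * numbers -- a 14-SM device, a measured occupancy of 4 blocks/SM, and a
 * subscription factor of 5 --
 *
 *   max_grid_size = (4 * 14) * 5 = 280 thread blocks,
 *
 * i.e. roughly 5x more blocks than can be resident at once, which gives the
 * even-share mapping slack to smooth out load imbalance and tail effects.
 */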
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - Offset* &d_spine, ///< [out] Digit count histograms per thread block - KernelConfig &scan_config, ///< [in] Dispatch parameters that match the policy that \p scan_kernel was compiled for - KernelConfig &downsweep_config) ///< [in] Dispatch parameters that match the policy that \p downsweep_kernel was compiled for - { - cudaError error = cudaSuccess; - do - { - // Get spine size (conservative) - int spine_size = (downsweep_config.max_grid_size * (1 << downsweep_config.radix_bits)) + scan_config.tile_size; - - // Temporary storage allocation requirements - void* allocations[1]; - size_t allocation_sizes[1] = - { - spine_size * sizeof(Offset), // bytes needed for privatized block digit histograms - }; - - // Alias the temporary allocations from the single storage blob (or set the necessary size of the blob) - if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; - - // Return if the caller is simply requesting the size of the storage allocation - if (d_temp_storage == NULL) - return cudaSuccess; - - // Alias the allocation for the privatized per-block digit histograms - d_spine = (Offset*) allocations[0]; - - } while(0); - - return error; - } - - - /****************************************************************************** - * Dispatch entrypoints - ******************************************************************************/ - - /** - * Internal dispatch routine for computing a device-wide radix sort using the - * specified kernel functions. - */ - template < - typename UpsweepKernelPtr, ///< Function type of cub::RadixSortUpsweepKernel - typename ScanKernelPtr, ///< Function type of cub::SpineScanKernel - typename DownsweepKernelPtr> ///< Function type of cub::RadixSortUpsweepKernel - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - DoubleBuffer &d_keys, ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys - DoubleBuffer &d_values, ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values - Offset *d_spine, ///< [in] Digit count histograms per thread block - int spine_size, ///< [in] Number of histogram counters - Offset num_items, ///< [in] Number of items to reduce - int begin_bit, ///< [in] The beginning (least-significant) bit index needed for key comparison - int end_bit, ///< [in] The past-the-end (most-significant) bit index needed for key comparison - cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
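/*
 * The NULL-query convention implemented above is the same one exposed by the
 * public cub::DeviceRadixSort entry points.  Typical caller-side usage
 * (a sketch; the buffer names are hypothetical):
 *
 *   #include <cub/cub.cuh>
 *
 *   void sort_pairs(int *d_key_buf, int *d_key_alt, int *d_val_buf,
 *                   int *d_val_alt, int num_items)
 *   {
 *       cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt);
 *       cub::DoubleBuffer<int> d_values(d_val_buf, d_val_alt);
 *
 *       // First call: d_temp_storage == NULL, so only the size is computed
 *       void  *d_temp_storage     = NULL;
 *       size_t temp_storage_bytes = 0;
 *       cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
 *                                       d_keys, d_values, num_items);
 *       cudaMalloc(&d_temp_storage, temp_storage_bytes);
 *
 *       // Second call: the sort actually runs
 *       cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
 *                                       d_keys, d_values, num_items);
 *
 *       // d_keys.Current() / d_values.Current() now reference sorted data
 *       cudaFree(d_temp_storage);
 *   }
 */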
- KernelConfig &upsweep_config, ///< [in] Dispatch parameters that match the policy that \p upsweep_kernel was compiled for - KernelConfig &scan_config, ///< [in] Dispatch parameters that match the policy that \p scan_kernel was compiled for - KernelConfig &downsweep_config, ///< [in] Dispatch parameters that match the policy that \p downsweep_kernel was compiled for - UpsweepKernelPtr upsweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::RadixSortUpsweepKernel - ScanKernelPtr scan_kernel, ///< [in] Kernel function pointer to parameterization of cub::SpineScanKernel - DownsweepKernelPtr downsweep_kernel) ///< [in] Kernel function pointer to parameterization of cub::RadixSortUpsweepKernel - { -#ifndef CUB_RUNTIME_ENABLED - - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported ); - -#else - - cudaError error = cudaSuccess; - do - { - // Get even-share work distribution descriptor - GridEvenShare even_share(num_items, downsweep_config.max_grid_size, CUB_MAX(downsweep_config.tile_size, upsweep_config.tile_size)); - -#if (CUB_PTX_ARCH == 0) - // Get current smem bank configuration - cudaSharedMemConfig original_smem_config; - if (CubDebug(error = cudaDeviceGetSharedMemConfig(&original_smem_config))) break; - cudaSharedMemConfig current_smem_config = original_smem_config; -#endif - // Iterate over digit places - int current_bit = begin_bit; - while (current_bit < end_bit) - { - int num_bits = CUB_MIN(end_bit - current_bit, downsweep_config.radix_bits); - -#if (CUB_PTX_ARCH == 0) - // Update smem config if necessary - if (current_smem_config != upsweep_config.smem_config) - { - if (CubDebug(error = cudaDeviceSetSharedMemConfig(upsweep_config.smem_config))) break; - current_smem_config = upsweep_config.smem_config; - } -#endif - - // Log upsweep_kernel configuration - if (debug_synchronous) - CubLog("Invoking upsweep_kernel<<<%d, %d, 0, %lld>>>(), %d smem config, %d items per thread, %d SM occupancy, selector %d, current bit %d, bit_grain %d\n", - even_share.grid_size, upsweep_config.block_threads, (long long) stream, upsweep_config.smem_config, upsweep_config.items_per_thread, upsweep_config.sm_occupancy, d_keys.selector, current_bit, downsweep_config.radix_bits); - - // Invoke upsweep_kernel with same grid size as downsweep_kernel - upsweep_kernel<<>>( - d_keys.d_buffers[d_keys.selector], - d_spine, - num_items, - current_bit, - num_bits, - (current_bit == begin_bit), - even_share); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - - // Log scan_kernel configuration - if (debug_synchronous) CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n", - 1, scan_config.block_threads, (long long) stream, scan_config.items_per_thread); - - // Invoke scan_kernel - scan_kernel<<<1, scan_config.block_threads, 0, stream>>>( - d_spine, - spine_size); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - - -#if (CUB_PTX_ARCH == 0) - // Update smem config if necessary - if (current_smem_config != downsweep_config.smem_config) - { - if (CubDebug(error = cudaDeviceSetSharedMemConfig(downsweep_config.smem_config))) break; - current_smem_config = downsweep_config.smem_config; - } -#endif - // Log 
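/*
 * GridEvenShare (used just above) gives every block a contiguous range of
 * whole tiles, spreading any remainder one extra tile at a time across the
 * leading blocks.  The partitioning arithmetic is roughly the following --
 * a sketch of the idea, not the actual implementation:
 *
 *   void even_share(int num_items, int grid_size, int tile_size,
 *                   int block, int &block_offset, int &block_end)
 *   {
 *       int  total_tiles = (num_items + tile_size - 1) / tile_size;
 *       int  base        = total_tiles / grid_size;  // whole tiles per block
 *       int  extra       = total_tiles % grid_size;  // leftover tiles
 *       bool leading     = (block < extra);          // leading blocks take +1
 *       int  prior_tiles = block * base + (leading ? block : extra);
 *       int  my_tiles    = base + (leading ? 1 : 0);
 *       block_offset     = prior_tiles * tile_size;
 *       block_end        = block_offset + my_tiles * tile_size;
 *       if (block_end > num_items) block_end = num_items;
 *   }
 */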
downsweep_kernel configuration - if (debug_synchronous) CubLog("Invoking downsweep_kernel<<<%d, %d, 0, %lld>>>(), %d smem config, %d items per thread, %d SM occupancy\n", - even_share.grid_size, downsweep_config.block_threads, (long long) stream, downsweep_config.smem_config, downsweep_config.items_per_thread, downsweep_config.sm_occupancy); - - // Invoke downsweep_kernel - downsweep_kernel<<>>( - d_keys.d_buffers[d_keys.selector], - d_keys.d_buffers[d_keys.selector ^ 1], - d_values.d_buffers[d_values.selector], - d_values.d_buffers[d_values.selector ^ 1], - d_spine, - num_items, - current_bit, - num_bits, - (current_bit == begin_bit), - (current_bit + downsweep_config.radix_bits >= end_bit), - even_share); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - - // Invert selectors - d_keys.selector ^= 1; - d_values.selector ^= 1; - - // Update current bit position - current_bit += downsweep_config.radix_bits; - } - -#if (CUB_PTX_ARCH == 0) - // Reset smem config if necessary - if (current_smem_config != original_smem_config) - { - if (CubDebug(error = cudaDeviceSetSharedMemConfig(original_smem_config))) break; - } -#endif - - } - while (0); - - return error; - -#endif // CUB_RUNTIME_ENABLED - } - - - /** - * Internal dispatch routine - */ - template < - typename UpsweepKernelPtr, ///< Function type of cub::RadixSortUpsweepKernel - typename ScanKernelPtr, ///< Function type of cub::SpineScanKernel - typename DownsweepKernelPtr> ///< Function type of cub::RadixSortUpsweepKernel - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - DoubleBuffer &d_keys, ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys - DoubleBuffer &d_values, ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values - Offset num_items, ///< [in] Number of items to reduce - int begin_bit, ///< [in] The beginning (least-significant) bit index needed for key comparison - int end_bit, ///< [in] The past-the-end (most-significant) bit index needed for key comparison - cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
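/*
 * The selector flips above are what make the DoubleBuffer "ping-pong":
 * after each digit pass the freshly written buffer becomes the input of the
 * next pass.  In miniature (an illustrative sketch of the mechanism):
 *
 *   struct DoubleBufferSketch {
 *       int *d_buffers[2];
 *       int  selector;                           // which buffer is current
 *       int *Current()   { return d_buffers[selector];     }
 *       int *Alternate() { return d_buffers[selector ^ 1]; }
 *   };
 *
 *   // per pass: read Current(), write Alternate(), then selector ^= 1
 *
 * This is also why callers must consult d_keys.Current() after the sort:
 * the number of passes determines which raw pointer holds the output.
 */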
- UpsweepKernelPtr upsweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::RadixSortUpsweepKernel - UpsweepKernelPtr alt_upsweep_kernel, ///< [in] Alternate kernel function pointer to parameterization of cub::RadixSortUpsweepKernel - ScanKernelPtr scan_kernel, ///< [in] Kernel function pointer to parameterization of cub::SpineScanKernel - DownsweepKernelPtr downsweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::RadixSortUpsweepKernel - DownsweepKernelPtr alt_downsweep_kernel) ///< [in] Alternate kernel function pointer to parameterization of cub::RadixSortUpsweepKernel - { -#ifndef CUB_RUNTIME_ENABLED - - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported ); - -#else - - cudaError error = cudaSuccess; - - do - { - // Get PTX version - int ptx_version; - #if (CUB_PTX_ARCH == 0) - if (CubDebug(error = PtxVersion(ptx_version))) break; - #else - ptx_version = CUB_PTX_ARCH; - #endif - - // Get device ordinal - int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; - - // Get device SM version - int sm_version; - if (CubDebug(error = SmVersion(sm_version, device_ordinal))) break; - - // Get SM count - int sm_count; - if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; - - // Get kernel kernel dispatch configurations - KernelConfig upsweep_config; - KernelConfig alt_upsweep_config; - KernelConfig scan_config; - KernelConfig downsweep_config; - KernelConfig alt_downsweep_config; - - if (CubDebug(error = InitConfigs(ptx_version, sm_version, sm_count, - upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config, - upsweep_kernel, alt_upsweep_kernel, scan_kernel, downsweep_kernel, alt_downsweep_kernel))) break; - - // Get spine sizes (conservative) - int spine_size = (downsweep_config.max_grid_size * (1 << downsweep_config.radix_bits)) + scan_config.tile_size; - int alt_spine_size = (alt_downsweep_config.max_grid_size * (1 << alt_downsweep_config.radix_bits)) + scan_config.tile_size; - - // Allocate temporaries - Offset *d_spine; - if (spine_size > alt_spine_size) - { - if (CubDebug(error = AllocateTemporaries(d_temp_storage, temp_storage_bytes, d_spine, scan_config, downsweep_config))) break; - } - else - { - if (CubDebug(error = AllocateTemporaries(d_temp_storage, temp_storage_bytes, d_spine, scan_config, alt_downsweep_config))) break; - } - - // Return if the caller is simply requesting the size of the storage allocation - if (d_temp_storage == NULL) - return cudaSuccess; - - // Run radix sorting passes - int num_bits = end_bit - begin_bit; - int remaining_bits = num_bits % downsweep_config.radix_bits; - - if (remaining_bits != 0) - { - // Run passes of alternate configuration - int max_alt_passes = downsweep_config.radix_bits - remaining_bits; - int alt_end_bit = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_downsweep_config.radix_bits)); - - if (CubDebug(error = Dispatch( - d_keys, - d_values, - d_spine, - alt_spine_size, - num_items, - begin_bit, - alt_end_bit, - stream, - debug_synchronous, - alt_upsweep_config, - scan_config, - alt_downsweep_config, - alt_upsweep_kernel, - scan_kernel, - alt_downsweep_kernel))) break; - - begin_bit = alt_end_bit; - } - - // Run passes of primary configuration - if (CubDebug(error = Dispatch( - d_keys, - d_values, - d_spine, - spine_size, - num_items, - begin_bit, - end_bit, - stream, - debug_synchronous, - upsweep_config, - scan_config, - downsweep_config, 
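/*
 * Worked example of the pass planning above: sorting full 32-bit keys with
 * RADIX_BITS = 5 (so the alternate configuration uses 4-bit digits) gives
 *
 *   num_bits       = 32
 *   remaining_bits = 32 % 5 = 2      (a trailing 2-bit pass would waste work)
 *   max_alt_passes = 5 - 2  = 3
 *   alt_end_bit    = 0 + 3 * 4 = 12
 *
 * so bits [0,12) are retired by three 4-bit passes of the alternate
 * configuration and bits [12,32) by four 5-bit passes of the primary one:
 * 3*4 + 4*5 = 32, with every pass fully utilized.
 */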
- upsweep_kernel, - scan_kernel, - downsweep_kernel))) break; - } - while (0); - - return error; - -#endif // CUB_RUNTIME_ENABLED - } - - - /** - * Internal dispatch routine - */ - - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - DoubleBuffer &d_keys, ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys - DoubleBuffer &d_values, ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values - Offset num_items, ///< [in] Number of items to reduce - int begin_bit, ///< [in] The beginning (least-significant) bit index needed for key comparison - int end_bit, ///< [in] The past-the-end (most-significant) bit index needed for key comparison - cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - return Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - begin_bit, - end_bit, - stream, - debug_synchronous, - RadixSortUpsweepKernel, - RadixSortUpsweepKernel, - RadixSortScanKernel, - RadixSortDownsweepKernel, - RadixSortDownsweepKernel); - } - -}; - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/SRC/cub/device/dispatch/device_reduce_by_key_dispatch.cuh b/SRC/cub/device/dispatch/device_reduce_by_key_dispatch.cuh deleted file mode 100644 index 81c028e1..00000000 --- a/SRC/cub/device/dispatch/device_reduce_by_key_dispatch.cuh +++ /dev/null @@ -1,594 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::DeviceReduceByKey provides device-wide, parallel operations for reducing segments of values residing within global memory. - */ - -#pragma once - -#include -#include - -#include "device_scan_dispatch.cuh" -#include "../../block_range/block_range_reduce_by_key.cuh" -#include "../../thread/thread_operators.cuh" -#include "../../grid/grid_queue.cuh" -#include "../../util_device.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/****************************************************************************** - * Kernel entry points - *****************************************************************************/ - -/** - * Reduce-by-key kernel entry point (multi-block) - */ -template < - typename BlockRangeReduceByKeyPolicy, ///< Parameterized BlockRangeReduceByKeyPolicy tuning policy type - typename KeyInputIterator, ///< Random-access input iterator type for keys - typename KeyOutputIterator, ///< Random-access output iterator type for keys - typename ValueInputIterator, ///< Random-access input iterator type for values - typename ValueOutputIterator, ///< Random-access output iterator type for values - typename NumSegmentsIterator, ///< Output iterator type for recording number of segments encountered - typename ScanTileState, ///< Tile status interface type - typename EqualityOp, ///< Key equality operator type - typename ReductionOp, ///< Value reduction operator type - typename Offset> ///< Signed integer type for global offsets -__launch_bounds__ (int(BlockRangeReduceByKeyPolicy::BLOCK_THREADS)) -__global__ void ReduceByKeyRegionKernel( - KeyInputIterator d_keys_in, ///< [in] Pointer to consecutive runs of input keys - KeyOutputIterator d_keys_out, ///< [in] Pointer to output keys (one key per run) - ValueInputIterator d_values_in, ///< [in] Pointer to consecutive runs of input values - ValueOutputIterator d_values_out, ///< [in] Pointer to output value aggregates (one aggregate per run) - NumSegmentsIterator d_num_segments, ///< [in] Pointer to total number of runs - ScanTileState tile_status, ///< [in] Tile status interface - EqualityOp equality_op, ///< [in] Key equality operator - ReductionOp reduction_op, ///< [in] Value reduction operator - Offset num_items, ///< [in] Total number of items to select from - int num_tiles, ///< [in] Total number of tiles for the entire problem - GridQueue queue) ///< [in] Drain queue descriptor for dynamically mapping tile data onto thread blocks -{ - // Thread block type for reducing tiles of value segments - typedef BlockRangeReduceByKey< - BlockRangeReduceByKeyPolicy, - KeyInputIterator, - KeyOutputIterator, - ValueInputIterator, - ValueOutputIterator, - EqualityOp, - ReductionOp, - Offset> BlockRangeReduceByKeyT; - - // Shared memory for BlockRangeReduceByKey - __shared__ typename BlockRangeReduceByKeyT::TempStorage temp_storage; - - // Process 
tiles - BlockRangeReduceByKeyT(temp_storage, d_keys_in, d_keys_out, d_values_in, d_values_out, equality_op, reduction_op, num_items).ConsumeRange( - num_tiles, - queue, - tile_status, - d_num_segments); -} - - - - -/****************************************************************************** - * Dispatch - ******************************************************************************/ - -/** - * Utility class for dispatching the appropriately-tuned kernels for DeviceReduceByKey - */ -template < - typename KeyInputIterator, ///< Random-access input iterator type for keys - typename KeyOutputIterator, ///< Random-access output iterator type for keys - typename ValueInputIterator, ///< Random-access input iterator type for values - typename ValueOutputIterator, ///< Random-access output iterator type for values - typename NumSegmentsIterator, ///< Output iterator type for recording number of segments encountered - typename EqualityOp, ///< Key equality operator type - typename ReductionOp, ///< Value reduction operator type - typename Offset> ///< Signed integer type for global offsets -struct DeviceReduceByKeyDispatch -{ - /****************************************************************************** - * Types and constants - ******************************************************************************/ - - // Data type of key input iterator - typedef typename std::iterator_traits::value_type Key; - - // Data type of value input iterator - typedef typename std::iterator_traits::value_type Value; - - enum - { - INIT_KERNEL_THREADS = 128, - MAX_INPUT_BYTES = CUB_MAX(sizeof(Key), sizeof(Value)), - COMBINED_INPUT_BYTES = sizeof(Key) + sizeof(Value), - }; - - // Value-offset tuple type for scanning (maps accumulated values to segment index) - typedef ItemOffsetPair ValueOffsetPair; - - // Tile status descriptor interface type - typedef ReduceByKeyScanTileState ScanTileState; - - - /****************************************************************************** - * Tuning policies - ******************************************************************************/ - - /// SM35 - struct Policy350 - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 8, - ITEMS_PER_THREAD = (MAX_INPUT_BYTES <= 8) ? 
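/*
 * For reference, the operation this dispatcher parallelizes is the classic
 * segmented reduction over runs of equal keys.  Sequential semantics
 * (an illustrative sketch, not the device implementation):
 *
 *   #include <vector>
 *
 *   template <typename K, typename V, typename Eq, typename Red>
 *   int reduce_by_key(const std::vector<K> &keys, const std::vector<V> &vals,
 *                     std::vector<K> &keys_out, std::vector<V> &aggs_out,
 *                     Eq equality_op, Red reduction_op)
 *   {
 *       for (size_t i = 0; i < keys.size(); ++i) {
 *           if (i == 0 || !equality_op(keys[i - 1], keys[i])) {
 *               keys_out.push_back(keys[i]);     // a new run starts
 *               aggs_out.push_back(vals[i]);
 *           } else {
 *               aggs_out.back() = reduction_op(aggs_out.back(), vals[i]);
 *           }
 *       }
 *       return (int) keys_out.size();            // == *d_num_segments
 *   }
 *
 * The GPU version reproduces this in one pass by scanning
 * (value, segment-flag) pairs -- the ItemOffsetPair/ScanTileState machinery
 * above -- rather than by walking runs serially.
 */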
8 : CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)), - }; - - typedef BlockRangeReduceByKeyPolicy< - 128, - ITEMS_PER_THREAD, - BLOCK_LOAD_DIRECT, - LOAD_LDG, - true, - BLOCK_SCAN_WARP_SCANS> - ReduceByKeyPolicy; - }; - - /// SM30 - struct Policy300 - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 6, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)), - }; - - typedef BlockRangeReduceByKeyPolicy< - 128, - ITEMS_PER_THREAD, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - true, - BLOCK_SCAN_WARP_SCANS> - ReduceByKeyPolicy; - }; - - /// SM20 - struct Policy200 - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 13, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)), - }; - - typedef BlockRangeReduceByKeyPolicy< - 128, - ITEMS_PER_THREAD, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - true, - BLOCK_SCAN_WARP_SCANS> - ReduceByKeyPolicy; - }; - - /// SM13 - struct Policy130 - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 7, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)), - }; - - typedef BlockRangeReduceByKeyPolicy< - 128, - ITEMS_PER_THREAD, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - true, - BLOCK_SCAN_WARP_SCANS> - ReduceByKeyPolicy; - }; - - /// SM10 - struct Policy100 - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 5, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 8) / COMBINED_INPUT_BYTES)), - }; - - typedef BlockRangeReduceByKeyPolicy< - 64, - ITEMS_PER_THREAD, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - true, - BLOCK_SCAN_RAKING> - ReduceByKeyPolicy; - }; - - - /****************************************************************************** - * Tuning policies of current PTX compiler pass - ******************************************************************************/ - -#if (CUB_PTX_ARCH >= 350) - typedef Policy350 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 300) - typedef Policy300 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 200) - typedef Policy200 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 130) - typedef Policy130 PtxPolicy; - -#else - typedef Policy100 PtxPolicy; - -#endif - - // "Opaque" policies (whose parameterizations aren't reflected in the type signature) - struct PtxReduceByKeyPolicy : PtxPolicy::ReduceByKeyPolicy {}; - - - /****************************************************************************** - * Utilities - ******************************************************************************/ - - /** - * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use - */ - template - CUB_RUNTIME_FUNCTION __forceinline__ - static void InitConfigs( - int ptx_version, - KernelConfig &reduce_by_key_range_config) - { - #if (CUB_PTX_ARCH > 0) - - // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy - reduce_by_key_range_config.template Init(); - - #else - - // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version - if (ptx_version >= 350) - { - reduce_by_key_range_config.template Init(); - } - else if (ptx_version >= 300) - { - reduce_by_key_range_config.template Init(); - } - else if 
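/*
 * The ITEMS_PER_THREAD expressions above budget each thread's tile by the
 * combined key+value footprint.  Illustrative evaluation for the SM20
 * policy (NOMINAL_4B_ITEMS_PER_THREAD = 13) with Key = int, Value = double:
 *
 *   COMBINED_INPUT_BYTES = 4 + 8 = 12
 *   (13 * 8 + 12 - 1) / 12 = 115 / 12 = 9
 *   ITEMS_PER_THREAD = CUB_MIN(13, CUB_MAX(1, 9)) = 9
 *
 * A thread that would take 13 four-byte items takes 9 of the fatter pairs,
 * keeping the tile's byte volume roughly constant.
 */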
(ptx_version >= 200) - { - reduce_by_key_range_config.template Init(); - } - else if (ptx_version >= 130) - { - reduce_by_key_range_config.template Init(); - } - else - { - reduce_by_key_range_config.template Init(); - } - - #endif - } - - - /** - * Kernel kernel dispatch configuration. Mirrors the constants within BlockRangeReduceByKeyPolicy. - */ - struct KernelConfig - { - int block_threads; - int items_per_thread; - BlockLoadAlgorithm load_policy; - bool two_phase_scatter; - BlockScanAlgorithm scan_algorithm; - cudaSharedMemConfig smem_config; - - template - CUB_RUNTIME_FUNCTION __forceinline__ - void Init() - { - block_threads = BlockRangeReduceByKeyPolicy::BLOCK_THREADS; - items_per_thread = BlockRangeReduceByKeyPolicy::ITEMS_PER_THREAD; - load_policy = BlockRangeReduceByKeyPolicy::LOAD_ALGORITHM; - two_phase_scatter = BlockRangeReduceByKeyPolicy::TWO_PHASE_SCATTER; - scan_algorithm = BlockRangeReduceByKeyPolicy::SCAN_ALGORITHM; - smem_config = cudaSharedMemBankSizeEightByte; - } - - CUB_RUNTIME_FUNCTION __forceinline__ - void Print() - { - printf("%d, %d, %d, %d, %d", - block_threads, - items_per_thread, - load_policy, - two_phase_scatter, - scan_algorithm); - } - }; - - - /****************************************************************************** - * Dispatch entrypoints - ******************************************************************************/ - - /** - * Internal dispatch routine for computing a device-wide prefix scan using the - * specified kernel functions. - */ - template < - typename ScanInitKernelPtr, ///< Function type of cub::ScanInitKernel - typename ReduceByKeyRegionKernelPtr> ///< Function type of cub::ReduceByKeyRegionKernelPtr - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - KeyInputIterator d_keys_in, ///< [in] Pointer to consecutive runs of input keys - KeyOutputIterator d_keys_out, ///< [in] Pointer to output keys (one key per run) - ValueInputIterator d_values_in, ///< [in] Pointer to consecutive runs of input values - ValueOutputIterator d_values_out, ///< [in] Pointer to output value aggregates (one aggregate per run) - NumSegmentsIterator d_num_segments, ///< [in] Pointer to total number of runs - EqualityOp equality_op, ///< [in] Key equality operator - ReductionOp reduction_op, ///< [in] Value reduction operator - Offset num_items, ///< [in] Total number of items to select from - cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
- int ptx_version, ///< [in] PTX version of dispatch kernels - ScanInitKernelPtr init_kernel, ///< [in] Kernel function pointer to parameterization of cub::ScanInitKernel - ReduceByKeyRegionKernelPtr reduce_by_key_range_kernel, ///< [in] Kernel function pointer to parameterization of cub::ReduceByKeyRegionKernel - KernelConfig reduce_by_key_range_config) ///< [in] Dispatch parameters that match the policy that \p reduce_by_key_range_kernel was compiled for - { - -#ifndef CUB_RUNTIME_ENABLED - - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported); - -#else - - cudaError error = cudaSuccess; - do - { - // Get device ordinal - int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; - - // Get device SM version - int sm_version; - if (CubDebug(error = SmVersion(sm_version, device_ordinal))) break; - - // Get SM count - int sm_count; - if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; - - // Number of input tiles - int tile_size = reduce_by_key_range_config.block_threads * reduce_by_key_range_config.items_per_thread; - int num_tiles = (num_items + tile_size - 1) / tile_size; - - // Specify temporary storage allocation requirements - size_t allocation_sizes[2]; - if (CubDebug(error = ScanTileState::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors - allocation_sizes[1] = GridQueue::AllocationSize(); // bytes needed for grid queue descriptor - - // Compute allocation pointers into the single storage blob (or set the necessary size of the blob) - void* allocations[2]; - if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; - if (d_temp_storage == NULL) - { - // Return if the caller is simply requesting the size of the storage allocation - return cudaSuccess; - } - - // Construct the tile status interface - ScanTileState tile_status; - if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; - - // Construct the grid queue descriptor - GridQueue queue(allocations[1]); - - // Log init_kernel configuration - int init_grid_size = (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS; - if (debug_synchronous) CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); - - // Invoke init_kernel to initialize tile descriptors and queue descriptors - init_kernel<<>>( - queue, - tile_status, - num_tiles); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - - // Get SM occupancy for reduce_by_key_range_kernel - int reduce_by_key_range_sm_occupancy; - if (CubDebug(error = MaxSmOccupancy( - reduce_by_key_range_sm_occupancy, // out - sm_version, - reduce_by_key_range_kernel, - reduce_by_key_range_config.block_threads))) break; - - // Get grid size for scanning tiles - dim3 reduce_by_key_grid_size; - if (ptx_version <= 130) - { - // Blocks are launched in order, so just assign one block per tile - int max_dim_x = 32 * 1024; - reduce_by_key_grid_size.z = 1; - reduce_by_key_grid_size.y = (num_tiles + max_dim_x - 1) / max_dim_x; - reduce_by_key_grid_size.x = CUB_MIN(num_tiles, max_dim_x); - } - else - { - // Blocks may not be launched in order, so use atomics - int reduce_by_key_range_occupancy = 
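/*
 * Two grid-sizing regimes are selected here (illustrative numbers):
 *
 *   - ptx_version <= 130: blocks launch in order, so one block per tile,
 *     folded into a 2-D grid when num_tiles exceeds the 32K x-dimension
 *     limit, e.g. 100000 tiles -> grid (32768, 4, 1).
 *
 *   - newer targets: the grid is capped at whole-device occupancy
 *     (sm_occupancy * sm_count) and blocks pull tiles from the GridQueue
 *     with atomics, e.g. occupancy 8 on 14 SMs -> at most 112 blocks,
 *     regardless of num_tiles.
 */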
reduce_by_key_range_sm_occupancy * sm_count; // Whole-device occupancy for reduce_by_key_range_kernel - reduce_by_key_grid_size.z = 1; - reduce_by_key_grid_size.y = 1; - reduce_by_key_grid_size.x = (num_tiles < reduce_by_key_range_occupancy) ? - num_tiles : // Not enough to fill the device with threadblocks - reduce_by_key_range_occupancy; // Fill the device with threadblocks - } - -#if (CUB_PTX_ARCH == 0) - // Get current smem bank configuration - cudaSharedMemConfig original_smem_config; - if (CubDebug(error = cudaDeviceGetSharedMemConfig(&original_smem_config))) break; - cudaSharedMemConfig current_smem_config = original_smem_config; - - // Update smem config if necessary - if (current_smem_config != reduce_by_key_range_config.smem_config) - { - if (CubDebug(error = cudaDeviceSetSharedMemConfig(reduce_by_key_range_config.smem_config))) break; - current_smem_config = reduce_by_key_range_config.smem_config; - } -#endif - - // Log reduce_by_key_range_kernel configuration - if (debug_synchronous) CubLog("Invoking reduce_by_key_range_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", - reduce_by_key_grid_size.x, reduce_by_key_grid_size.y, reduce_by_key_grid_size.z, reduce_by_key_range_config.block_threads, (long long) stream, reduce_by_key_range_config.items_per_thread, reduce_by_key_range_sm_occupancy); - - // Invoke reduce_by_key_range_kernel - reduce_by_key_range_kernel<<>>( - d_keys_in, - d_keys_out, - d_values_in, - d_values_out, - d_num_segments, - tile_status, - equality_op, - reduction_op, - num_items, - num_tiles, - queue); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - -#if (CUB_PTX_ARCH == 0) - // Reset smem config if necessary - if (current_smem_config != original_smem_config) - { - if (CubDebug(error = cudaDeviceSetSharedMemConfig(original_smem_config))) break; - } -#endif - - } - while (0); - - return error; - -#endif // CUB_RUNTIME_ENABLED - } - - - /** - * Internal dispatch routine - */ - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - KeyInputIterator d_keys_in, ///< [in] Pointer to consecutive runs of input keys - KeyOutputIterator d_keys_out, ///< [in] Pointer to output keys (one key per run) - ValueInputIterator d_values_in, ///< [in] Pointer to consecutive runs of input values - ValueOutputIterator d_values_out, ///< [in] Pointer to output value aggregates (one aggregate per run) - NumSegmentsIterator d_num_segments, ///< [in] Pointer to total number of runs - EqualityOp equality_op, ///< [in] Key equality operator - ReductionOp reduction_op, ///< [in] Value reduction operator - Offset num_items, ///< [in] Total number of items to select from - cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
- { - cudaError error = cudaSuccess; - do - { - // Get PTX version - int ptx_version; - #if (CUB_PTX_ARCH == 0) - if (CubDebug(error = PtxVersion(ptx_version))) break; - #else - ptx_version = CUB_PTX_ARCH; - #endif - - // Get kernel kernel dispatch configurations - KernelConfig reduce_by_key_range_config; - InitConfigs(ptx_version, reduce_by_key_range_config); - - // Dispatch - if (CubDebug(error = Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_keys_out, - d_values_in, - d_values_out, - d_num_segments, - equality_op, - reduction_op, - num_items, - stream, - debug_synchronous, - ptx_version, - ScanInitKernel, - ReduceByKeyRegionKernel, - reduce_by_key_range_config))) break; - } - while (0); - - return error; - } -}; - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/SRC/cub/device/dispatch/device_reduce_dispatch.cuh b/SRC/cub/device/dispatch/device_reduce_dispatch.cuh deleted file mode 100644 index 3c0bce5b..00000000 --- a/SRC/cub/device/dispatch/device_reduce_dispatch.cuh +++ /dev/null @@ -1,743 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within global memory. 
- */ - -#pragma once - -#include -#include - -#include "device_reduce_by_key_dispatch.cuh" -#include "../../block_range/block_range_reduce.cuh" -#include "../../iterator/constant_input_iterator.cuh" -#include "../../thread/thread_operators.cuh" -#include "../../grid/grid_even_share.cuh" -#include "../../grid/grid_queue.cuh" -#include "../../iterator/arg_index_input_iterator.cuh" -#include "../../util_debug.cuh" -#include "../../util_device.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/****************************************************************************** - * Kernel entry points - *****************************************************************************/ - -/** - * Reduce region kernel entry point (multi-block). Computes privatized reductions, one per thread block. - */ -template < - typename BlockRangeReducePolicy, ///< Parameterized BlockRangeReducePolicy tuning policy type - typename InputIterator, ///< Random-access input iterator type for reading input items \iterator - typename OutputIterator, ///< Output iterator type for recording the reduced aggregate \iterator - typename Offset, ///< Signed integer type for global offsets - typename ReductionOp> ///< Binary reduction functor type having member T operator()(const T &a, const T &b) -__launch_bounds__ (int(BlockRangeReducePolicy::BLOCK_THREADS)) -__global__ void ReduceRegionKernel( - InputIterator d_in, ///< [in] Pointer to the input sequence of data items - OutputIterator d_out, ///< [out] Pointer to the output aggregate - Offset num_items, ///< [in] Total number of input data items - GridEvenShare even_share, ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block - GridQueue queue, ///< [in] Drain queue descriptor for dynamically mapping tile data onto thread blocks - ReductionOp reduction_op) ///< [in] Binary reduction functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) -{ - // Data type - typedef typename std::iterator_traits::value_type T; - - // Thread block type for reducing input tiles - typedef BlockRangeReduce BlockRangeReduceT; - - // Block-wide aggregate - T block_aggregate; - - // Shared memory storage - __shared__ typename BlockRangeReduceT::TempStorage temp_storage; - - // Consume input tiles - BlockRangeReduceT(temp_storage, d_in, reduction_op).ConsumeRange( - num_items, - even_share, - queue, - block_aggregate, - Int2Type()); - - // Output result - if (threadIdx.x == 0) - { - d_out[blockIdx.x] = block_aggregate; - } -} - - -/** - * Reduce a single tile kernel entry point (single-block). Can be used to aggregate privatized threadblock reductions from a previous multi-block reduction pass. 
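 *
 * For orientation: ReduceRegionKernel above emits one partial aggregate per
 * thread block, and this kernel then runs as a single block to fold that
 * small array of partials into the final result.  The overall shape, as an
 * illustrative sketch:
 *
 *   pass 1: grid of N blocks; block b reduces its share of the input and
 *           writes its block_aggregate to d_partials[b]
 *   pass 2: one block reduces d_partials[0..N-1] into d_out[0]
 *
 * Two passes suffice because pass 1 already shrinks the problem to at most
 * max_grid_size values, which a single thread block consumes in a few tiles.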
- */ -template < - typename BlockRangeReducePolicy, ///< Parameterized BlockRangeReducePolicy tuning policy type - typename InputIterator, ///< Random-access input iterator type for reading input items \iterator - typename OutputIterator, ///< Output iterator type for recording the reduced aggregate \iterator - typename Offset, ///< Signed integer type for global offsets - typename ReductionOp> ///< Binary reduction functor type having member T operator()(const T &a, const T &b) -__launch_bounds__ (int(BlockRangeReducePolicy::BLOCK_THREADS), 1) -__global__ void SingleTileKernel( - InputIterator d_in, ///< [in] Pointer to the input sequence of data items - OutputIterator d_out, ///< [out] Pointer to the output aggregate - Offset num_items, ///< [in] Total number of input data items - ReductionOp reduction_op) ///< [in] Binary reduction functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) -{ - // Data type - typedef typename std::iterator_traits::value_type T; - - // Thread block type for reducing input tiles - typedef BlockRangeReduce BlockRangeReduceT; - - // Block-wide aggregate - T block_aggregate; - - // Shared memory storage - __shared__ typename BlockRangeReduceT::TempStorage temp_storage; - - // Consume input tiles - BlockRangeReduceT(temp_storage, d_in, reduction_op).ConsumeRange( - Offset(0), - Offset(num_items), - block_aggregate); - - // Output result - if (threadIdx.x == 0) - { - d_out[blockIdx.x] = block_aggregate; - } -} - - - - -/****************************************************************************** - * Dispatch - ******************************************************************************/ - -/** - * Utility class for dispatching the appropriately-tuned kernels for DeviceReduce - */ -template < - typename InputIterator, ///< Random-access input iterator type for reading input items \iterator - typename OutputIterator, ///< Output iterator type for recording the reduced aggregate \iterator - typename Offset, ///< Signed integer type for global offsets - typename ReductionOp> ///< Binary reduction functor type having member T operator()(const T &a, const T &b) -struct DeviceReduceDispatch -{ - // Data type of input iterator - typedef typename std::iterator_traits::value_type T; - - - /****************************************************************************** - * Tuning policies - ******************************************************************************/ - - /// SM35 - struct Policy350 - { - // ReduceRegionPolicy1B (GTX Titan: 228.7 GB/s @ 192M 1B items) - typedef BlockRangeReducePolicy< - 128, ///< Threads per thread block - 24, ///< Items per thread per tile of input - 4, ///< Number of items per vectorized load - BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use - LOAD_LDG, ///< Cache load modifier - GRID_MAPPING_DYNAMIC> ///< How to map tiles of input onto thread blocks - ReduceRegionPolicy1B; - - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 20, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), - }; - - // ReduceRegionPolicy4B (GTX Titan: 255.1 GB/s @ 48M 4B items) - typedef BlockRangeReducePolicy< - 256, ///< Threads per thread block - ITEMS_PER_THREAD, ///< Items per thread per tile of input - 2, ///< Number of items per vectorized load - BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use - LOAD_LDG, ///< Cache load modifier - GRID_MAPPING_DYNAMIC> ///< How to map tiles of input onto thread blocks - ReduceRegionPolicy4B; - - // 
ReduceRegionPolicy - typedef typename If<(sizeof(T) >= 4), - ReduceRegionPolicy4B, - ReduceRegionPolicy1B>::Type ReduceRegionPolicy; - - // SingleTilePolicy - typedef BlockRangeReducePolicy< - 256, ///< Threads per thread block - 8, ///< Items per thread per tile of input - 1, ///< Number of items per vectorized load - BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use - LOAD_DEFAULT, ///< Cache load modifier - GRID_MAPPING_EVEN_SHARE> ///< How to map tiles of input onto thread blocks - SingleTilePolicy; - }; - - /// SM30 - struct Policy300 - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 2, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), - }; - - // ReduceRegionPolicy (GTX670: 154.0 @ 48M 4B items) - typedef BlockRangeReducePolicy< - 256, ///< Threads per thread block - ITEMS_PER_THREAD, ///< Items per thread per tile of input - 1, ///< Number of items per vectorized load - BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use - LOAD_DEFAULT, ///< Cache load modifier - GRID_MAPPING_EVEN_SHARE> ///< How to map tiles of input onto thread blocks - ReduceRegionPolicy; - - // SingleTilePolicy - typedef BlockRangeReducePolicy< - 256, ///< Threads per thread block - 24, ///< Items per thread per tile of input - 4, ///< Number of items per vectorized load - BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use - LOAD_DEFAULT, ///< Cache load modifier - GRID_MAPPING_EVEN_SHARE> ///< How to map tiles of input onto thread blocks - SingleTilePolicy; - }; - - /// SM20 - struct Policy200 - { - // ReduceRegionPolicy1B (GTX 580: 158.1 GB/s @ 192M 1B items) - typedef BlockRangeReducePolicy< - 192, ///< Threads per thread block - 24, ///< Items per thread per tile of input - 4, ///< Number of items per vectorized load - BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use - LOAD_DEFAULT, ///< Cache load modifier - (sizeof(T) == 1) ? 
///< How to map tiles of input onto thread blocks - GRID_MAPPING_EVEN_SHARE : - GRID_MAPPING_DYNAMIC> - ReduceRegionPolicy1B; - - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 8, - NOMINAL_4B_VEC_ITEMS = 4, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), - VEC_ITEMS = CUB_MIN(NOMINAL_4B_VEC_ITEMS, CUB_MAX(1, (NOMINAL_4B_VEC_ITEMS * 4 / sizeof(T)))), - }; - - // ReduceRegionPolicy4B (GTX 580: 178.9 GB/s @ 48M 4B items) - typedef BlockRangeReducePolicy< - 128, ///< Threads per thread block - ITEMS_PER_THREAD, ///< Items per thread per tile of input - VEC_ITEMS, ///< Number of items per vectorized load - BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use - LOAD_DEFAULT, ///< Cache load modifier - GRID_MAPPING_DYNAMIC> ///< How to map tiles of input onto thread blocks - ReduceRegionPolicy4B; - - // ReduceRegionPolicy - typedef typename If<(sizeof(T) < 4), - ReduceRegionPolicy1B, - ReduceRegionPolicy4B>::Type ReduceRegionPolicy; - - // SingleTilePolicy - typedef BlockRangeReducePolicy< - 192, ///< Threads per thread block - 7, ///< Items per thread per tile of input - 1, ///< Number of items per vectorized load - BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use - LOAD_DEFAULT, ///< Cache load modifier - GRID_MAPPING_EVEN_SHARE> ///< How to map tiles of input onto thread blocks - SingleTilePolicy; - }; - - /// SM13 - struct Policy130 - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 8, - NOMINAL_4B_VEC_ITEMS = 2, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), - VEC_ITEMS = CUB_MIN(NOMINAL_4B_VEC_ITEMS, CUB_MAX(1, (NOMINAL_4B_VEC_ITEMS * 4 / sizeof(T)))), - }; - - // ReduceRegionPolicy - typedef BlockRangeReducePolicy< - 128, ///< Threads per thread block - ITEMS_PER_THREAD, ///< Items per thread per tile of input - VEC_ITEMS, ///< Number of items per vectorized load - BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use - LOAD_DEFAULT, ///< Cache load modifier - GRID_MAPPING_EVEN_SHARE> ///< How to map tiles of input onto thread blocks - ReduceRegionPolicy; - - // SingleTilePolicy - typedef BlockRangeReducePolicy< - 32, ///< Threads per thread block - 4, ///< Items per thread per tile of input - VEC_ITEMS, ///< Number of items per vectorized load - BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use - LOAD_DEFAULT, ///< Cache load modifier - GRID_MAPPING_EVEN_SHARE> ///< How to map tiles of input onto thread blocks - SingleTilePolicy; - }; - - /// SM10 - struct Policy100 - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 8, - NOMINAL_4B_VEC_ITEMS = 2, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), - VEC_ITEMS = CUB_MIN(NOMINAL_4B_VEC_ITEMS, CUB_MAX(1, (NOMINAL_4B_VEC_ITEMS * 4 / sizeof(T)))), - }; - - // ReduceRegionPolicy - typedef BlockRangeReducePolicy< - 128, ///< Threads per thread block - ITEMS_PER_THREAD, ///< Items per thread per tile of input - VEC_ITEMS, ///< Number of items per vectorized load - BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use - LOAD_DEFAULT, ///< Cache load modifier - GRID_MAPPING_EVEN_SHARE> ///< How to map tiles of input onto thread blocks - ReduceRegionPolicy; - - // SingleTilePolicy - typedef BlockRangeReducePolicy< - 32, ///< Threads per thread block - 4, ///< Items per thread per tile of input - 4, ///< Number of items per vectorized load - 
BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use - LOAD_DEFAULT, ///< Cache load modifier - GRID_MAPPING_EVEN_SHARE> ///< How to map tiles of input onto thread blocks - SingleTilePolicy; - }; - - - /****************************************************************************** - * Tuning policies of current PTX compiler pass - ******************************************************************************/ - -#if (CUB_PTX_ARCH >= 350) - typedef Policy350 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 300) - typedef Policy300 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 200) - typedef Policy200 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 130) - typedef Policy130 PtxPolicy; - -#else - typedef Policy100 PtxPolicy; - -#endif - - // "Opaque" policies (whose parameterizations aren't reflected in the type signature) - struct PtxReduceRegionPolicy : PtxPolicy::ReduceRegionPolicy {}; - struct PtxSingleTilePolicy : PtxPolicy::SingleTilePolicy {}; - - - /****************************************************************************** - * Utilities - ******************************************************************************/ - - /** - * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use - */ - template - CUB_RUNTIME_FUNCTION __forceinline__ - static void InitConfigs( - int ptx_version, - KernelConfig &reduce_range_config, - KernelConfig &single_tile_config) - { - #if (CUB_PTX_ARCH > 0) - - // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy - reduce_range_config.template Init(); - single_tile_config.template Init(); - - #else - - // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version - if (ptx_version >= 350) - { - reduce_range_config.template Init(); - single_tile_config.template Init(); - } - else if (ptx_version >= 300) - { - reduce_range_config.template Init(); - single_tile_config.template Init(); - } - else if (ptx_version >= 200) - { - reduce_range_config.template Init(); - single_tile_config.template Init(); - } - else if (ptx_version >= 130) - { - reduce_range_config.template Init(); - single_tile_config.template Init(); - } - else - { - reduce_range_config.template Init(); - single_tile_config.template Init(); - } - - #endif - } - - - /** - * Kernel kernel dispatch configuration - */ - struct KernelConfig - { - int block_threads; - int items_per_thread; - int vector_load_length; - BlockReduceAlgorithm block_algorithm; - CacheLoadModifier load_modifier; - GridMappingStrategy grid_mapping; - - template - CUB_RUNTIME_FUNCTION __forceinline__ - void Init() - { - block_threads = BlockPolicy::BLOCK_THREADS; - items_per_thread = BlockPolicy::ITEMS_PER_THREAD; - vector_load_length = BlockPolicy::VECTOR_LOAD_LENGTH; - block_algorithm = BlockPolicy::BLOCK_ALGORITHM; - load_modifier = BlockPolicy::LOAD_MODIFIER; - grid_mapping = BlockPolicy::GRID_MAPPING; - } - - CUB_RUNTIME_FUNCTION __forceinline__ - void Print() - { - printf("%d threads, %d per thread, %d veclen, %d algo, %d loadmod, %d mapping", - block_threads, - items_per_thread, - vector_load_length, - block_algorithm, - load_modifier, - grid_mapping); - } - }; - - /****************************************************************************** - * Dispatch entrypoints - ******************************************************************************/ - - /** - * Internal dispatch routine for computing a device-wide reduction using the - * specified kernel 
functions. - * - * If the input is larger than a single tile, this method uses two-passes of - * kernel invocations. - */ - template < - typename ReduceRegionKernelPtr, ///< Function type of cub::ReduceRegionKernel - typename AggregateTileKernelPtr, ///< Function type of cub::SingleTileKernel for consuming partial reductions (T*) - typename SingleTileKernelPtr, ///< Function type of cub::SingleTileKernel for consuming input (InputIterator) - typename FillAndResetDrainKernelPtr> ///< Function type of cub::FillAndResetDrainKernel - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIterator d_in, ///< [in] Pointer to the input sequence of data items - OutputIterator d_out, ///< [out] Pointer to the output aggregate - Offset num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - ReductionOp reduction_op, ///< [in] Binary reduction functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) - cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - FillAndResetDrainKernelPtr prepare_drain_kernel, ///< [in] Kernel function pointer to parameterization of cub::FillAndResetDrainKernel - ReduceRegionKernelPtr reduce_range_kernel, ///< [in] Kernel function pointer to parameterization of cub::ReduceRegionKernel - AggregateTileKernelPtr aggregate_kernel, ///< [in] Kernel function pointer to parameterization of cub::SingleTileKernel for consuming partial reductions (T*) - SingleTileKernelPtr single_kernel, ///< [in] Kernel function pointer to parameterization of cub::SingleTileKernel for consuming input (InputIterator) - KernelConfig &reduce_range_config, ///< [in] Dispatch parameters that match the policy that \p reduce_range_kernel_ptr was compiled for - KernelConfig &single_tile_config) ///< [in] Dispatch parameters that match the policy that \p single_kernel was compiled for - { -#ifndef CUB_RUNTIME_ENABLED - - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported ); - -#else - cudaError error = cudaSuccess; - do - { - // Get device ordinal - int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; - - // Get device SM version - int sm_version; - if (CubDebug(error = SmVersion(sm_version, device_ordinal))) break; - - // Get SM count - int sm_count; - if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; - - // Tile size of reduce_range_kernel - int tile_size = reduce_range_config.block_threads * reduce_range_config.items_per_thread; - - if ((reduce_range_kernel == NULL) || (num_items <= tile_size)) - { - // Dispatch a single-block reduction kernel - - // Return if the caller is simply requesting the size of the storage allocation - if (d_temp_storage == NULL) - { - temp_storage_bytes = 1; - return cudaSuccess; - } - - // Log single_kernel configuration - if (debug_synchronous) CubLog("Invoking ReduceSingle<<<1, %d, 0, %lld>>>(), %d items per thread\n", - single_tile_config.block_threads, (long long) stream, 
single_tile_config.items_per_thread); - - // Invoke single_kernel - single_kernel<<<1, single_tile_config.block_threads, 0, stream>>>( - d_in, - d_out, - num_items, - reduction_op); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - - } - else - { - // Dispatch two kernels: (1) a multi-block kernel to compute - // privatized per-block reductions, and (2) a single-block - // to reduce those partial reductions - - // Get SM occupancy for reduce_range_kernel - int reduce_range_sm_occupancy; - if (CubDebug(error = MaxSmOccupancy( - reduce_range_sm_occupancy, - sm_version, - reduce_range_kernel, - reduce_range_config.block_threads))) break; - - // Get device occupancy for reduce_range_kernel - int reduce_range_occupancy = reduce_range_sm_occupancy * sm_count; - - // Even-share work distribution - int subscription_factor = reduce_range_sm_occupancy; // Amount of CTAs to oversubscribe the device beyond actively-resident (heuristic) - GridEvenShare even_share( - num_items, - reduce_range_occupancy * subscription_factor, - tile_size); - - // Get grid size for reduce_range_kernel - int reduce_range_grid_size; - switch (reduce_range_config.grid_mapping) - { - case GRID_MAPPING_EVEN_SHARE: - - // Work is distributed evenly - reduce_range_grid_size = even_share.grid_size; - break; - - case GRID_MAPPING_DYNAMIC: - - // Work is distributed dynamically - int num_tiles = (num_items + tile_size - 1) / tile_size; - reduce_range_grid_size = (num_tiles < reduce_range_occupancy) ? - num_tiles : // Not enough to fill the device with threadblocks - reduce_range_occupancy; // Fill the device with threadblocks - break; - }; - - // Temporary storage allocation requirements - void* allocations[2]; - size_t allocation_sizes[2] = - { - reduce_range_grid_size * sizeof(T), // bytes needed for privatized block reductions - GridQueue::AllocationSize() // bytes needed for grid queue descriptor - }; - - // Alias the temporary allocations from the single storage blob (or set the necessary size of the blob) - if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; - if (d_temp_storage == NULL) - { - // Return if the caller is simply requesting the size of the storage allocation - return cudaSuccess; - } - - // Alias the allocation for the privatized per-block reductions - T *d_block_reductions = (T*) allocations[0]; - - // Alias the allocation for the grid queue descriptor - GridQueue queue(allocations[1]); - - // Prepare the dynamic queue descriptor if necessary - if (reduce_range_config.grid_mapping == GRID_MAPPING_DYNAMIC) - { - // Prepare queue using a kernel so we know it gets prepared once per operation - if (debug_synchronous) CubLog("Invoking prepare_drain_kernel<<<1, 1, 0, %lld>>>()\n", (long long) stream); - - // Invoke prepare_drain_kernel - prepare_drain_kernel<<<1, 1, 0, stream>>>(queue, num_items); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - } - - // Log reduce_range_kernel configuration - if (debug_synchronous) CubLog("Invoking reduce_range_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", - reduce_range_grid_size, reduce_range_config.block_threads, (long long) stream, 
reduce_range_config.items_per_thread, reduce_range_sm_occupancy); - - // Invoke reduce_range_kernel - reduce_range_kernel<<>>( - d_in, - d_block_reductions, - num_items, - even_share, - queue, - reduction_op); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - - // Log single_kernel configuration - if (debug_synchronous) CubLog("Invoking single_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n", - 1, single_tile_config.block_threads, (long long) stream, single_tile_config.items_per_thread); - - // Invoke single_kernel - aggregate_kernel<<<1, single_tile_config.block_threads, 0, stream>>>( - d_block_reductions, - d_out, - reduce_range_grid_size, - reduction_op); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - } - } - while (0); - - return error; - -#endif // CUB_RUNTIME_ENABLED - } - - - /** - * Internal dispatch routine for computing a device-wide reduction - */ - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIterator d_in, ///< [in] Pointer to the input sequence of data items - OutputIterator d_out, ///< [out] Pointer to the output aggregate - Offset num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - ReductionOp reduction_op, ///< [in] Binary reduction functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) - cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - cudaError error = cudaSuccess; - do - { - // Get PTX version - int ptx_version; - #if (CUB_PTX_ARCH == 0) - if (CubDebug(error = PtxVersion(ptx_version))) break; - #else - ptx_version = CUB_PTX_ARCH; - #endif - - // Get kernel kernel dispatch configurations - KernelConfig reduce_range_config; - KernelConfig single_tile_config; - InitConfigs(ptx_version, reduce_range_config, single_tile_config); - - // Dispatch - if (CubDebug(error = Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_items, - reduction_op, - stream, - debug_synchronous, - FillAndResetDrainKernel, - ReduceRegionKernel, - SingleTileKernel, - SingleTileKernel, - reduce_range_config, - single_tile_config))) break; - } - while (0); - - return error; - } -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/SRC/cub/device/dispatch/device_scan_dispatch.cuh b/SRC/cub/device/dispatch/device_scan_dispatch.cuh deleted file mode 100644 index 6abeb29d..00000000 --- a/SRC/cub/device/dispatch/device_scan_dispatch.cuh +++ /dev/null @@ -1,565 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. 
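Both branches of the Dispatch routine above honor CUB's two-call temporary-storage convention: when d_temp_storage is NULL, the routine only writes the required temp_storage_bytes and returns, so the caller sizes and allocates the blob itself before the second, real invocation. A caller-side sketch of that convention against cub::DeviceReduce::Sum, which follows the same protocol (error checking omitted for brevity):

    #include <cub/cub.cuh>
    #include <cuda_runtime.h>

    void SumWithTempStorage(const int *d_in, int *d_out, int num_items)
    {
        void   *d_temp_storage     = NULL;
        size_t  temp_storage_bytes = 0;

        // First call: d_temp_storage == NULL, so only the size is computed.
        cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes,
                               d_in, d_out, num_items);

        // Allocate exactly the requested blob, then run the real reduction.
        cudaMalloc(&d_temp_storage, temp_storage_bytes);
        cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes,
                               d_in, d_out, num_items);

        cudaFree(d_temp_storage);
    }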
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within global memory. - */ - -#pragma once - -#include -#include - -#include "../../block_range/block_range_scan.cuh" -#include "../../thread/thread_operators.cuh" -#include "../../grid/grid_queue.cuh" -#include "../../util_debug.cuh" -#include "../../util_device.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/****************************************************************************** - * Kernel entry points - *****************************************************************************/ - -/** - * Initialization kernel for tile status initialization (multi-block) - */ -template < - typename Offset, ///< Signed integer type for global offsets - typename ScanTileState> ///< Tile status interface type -__global__ void ScanInitKernel( - GridQueue grid_queue, ///< [in] Descriptor for performing dynamic mapping of input tiles to thread blocks - ScanTileState tile_status, ///< [in] Tile status interface - int num_tiles) ///< [in] Number of tiles -{ - // Reset queue descriptor - if ((blockIdx.x == 0) && (threadIdx.x == 0)) - grid_queue.FillAndResetDrain(num_tiles); - - // Initialize tile status - tile_status.InitializeStatus(num_tiles); -} - - -/** - * Scan kernel entry point (multi-block) - */ -template < - typename BlockRangeScanPolicy, ///< Parameterized BlockRangeScanPolicy tuning policy type - typename InputIterator, ///< Random-access input iterator type for reading scan input data \iterator - typename OutputIterator, ///< Random-access output iterator type for writing scan output data \iterator - typename ScanTileState, ///< Tile status interface type - typename ScanOp, ///< Binary scan functor type having member T operator()(const T &a, const T &b) - typename Identity, ///< Identity value type 
(cub::NullType for inclusive scans) - typename Offset> ///< Signed integer type for global offsets -__launch_bounds__ (int(BlockRangeScanPolicy::BLOCK_THREADS)) -__global__ void ScanRegionKernel( - InputIterator d_in, ///< Input data - OutputIterator d_out, ///< Output data - ScanTileState tile_status, ///< [in] Tile status interface - ScanOp scan_op, ///< Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) - Identity identity, ///< Identity element - Offset num_items, ///< Total number of scan items for the entire problem - GridQueue queue) ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks -{ - // Thread block type for scanning input tiles - typedef BlockRangeScan< - BlockRangeScanPolicy, - InputIterator, - OutputIterator, - ScanOp, - Identity, - Offset> BlockRangeScanT; - - // Shared memory for BlockRangeScan - __shared__ typename BlockRangeScanT::TempStorage temp_storage; - - // Process tiles - BlockRangeScanT(temp_storage, d_in, d_out, scan_op, identity).ConsumeRange( - num_items, - queue, - tile_status); -} - - - - -/****************************************************************************** - * Dispatch - ******************************************************************************/ - -/** - * Utility class for dispatching the appropriately-tuned kernels for DeviceScan - */ -template < - typename InputIterator, ///< Random-access input iterator type for reading scan input data \iterator - typename OutputIterator, ///< Random-access output iterator type for writing scan output data \iterator - typename ScanOp, ///< Binary scan functor type having member T operator()(const T &a, const T &b) - typename Identity, ///< Identity value type (cub::NullType for inclusive scans) - typename Offset> ///< Signed integer type for global offsets -struct DeviceScanDispatch -{ - enum - { - INIT_KERNEL_THREADS = 128 - }; - - // Data type - typedef typename std::iterator_traits::value_type T; - - // Tile status descriptor interface type - typedef ScanTileState ScanTileState; - - - /****************************************************************************** - * Tuning policies - ******************************************************************************/ - - /// SM35 - struct Policy350 - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 12, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), - }; - - // GTX Titan: 29.5B items/s (232.4 GB/s) @ 48M 32-bit T - typedef BlockRangeScanPolicy< - 128, - ITEMS_PER_THREAD, - BLOCK_LOAD_DIRECT, - false, - LOAD_LDG, - BLOCK_STORE_WARP_TRANSPOSE, - true, - BLOCK_SCAN_RAKING_MEMOIZE> - ScanRegionPolicy; - }; - - /// SM30 - struct Policy300 - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 9, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), - }; - - typedef BlockRangeScanPolicy< - 256, - ITEMS_PER_THREAD, - BLOCK_LOAD_WARP_TRANSPOSE, - false, - LOAD_DEFAULT, - BLOCK_STORE_WARP_TRANSPOSE, - false, - BLOCK_SCAN_RAKING_MEMOIZE> - ScanRegionPolicy; - }; - - /// SM20 - struct Policy200 - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 15, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), - }; - - // GTX 580: 20.3B items/s (162.3 GB/s) @ 48M 32-bit T - typedef BlockRangeScanPolicy< - 128, - ITEMS_PER_THREAD, - BLOCK_LOAD_WARP_TRANSPOSE, - false, - LOAD_DEFAULT, - BLOCK_STORE_WARP_TRANSPOSE, - false, - 
BLOCK_SCAN_RAKING_MEMOIZE> - ScanRegionPolicy; - }; - - /// SM13 - struct Policy130 - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 21, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), - }; - - typedef BlockRangeScanPolicy< - 96, - ITEMS_PER_THREAD, - BLOCK_LOAD_WARP_TRANSPOSE, - false, - LOAD_DEFAULT, - BLOCK_STORE_WARP_TRANSPOSE, - false, - BLOCK_SCAN_RAKING_MEMOIZE> - ScanRegionPolicy; - }; - - /// SM10 - struct Policy100 - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 9, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), - }; - - typedef BlockRangeScanPolicy< - 64, - ITEMS_PER_THREAD, - BLOCK_LOAD_WARP_TRANSPOSE, - true, - LOAD_DEFAULT, - BLOCK_STORE_WARP_TRANSPOSE, - true, - BLOCK_SCAN_WARP_SCANS> - ScanRegionPolicy; - }; - - - /****************************************************************************** - * Tuning policies of current PTX compiler pass - ******************************************************************************/ - -#if (CUB_PTX_ARCH >= 350) - typedef Policy350 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 300) - typedef Policy300 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 200) - typedef Policy200 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 130) - typedef Policy130 PtxPolicy; - -#else - typedef Policy100 PtxPolicy; - -#endif - - // "Opaque" policies (whose parameterizations aren't reflected in the type signature) - struct PtxScanRegionPolicy : PtxPolicy::ScanRegionPolicy {}; - - - /****************************************************************************** - * Utilities - ******************************************************************************/ - - /** - * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use - */ - template - CUB_RUNTIME_FUNCTION __forceinline__ - static void InitConfigs( - int ptx_version, - KernelConfig &scan_range_config) - { - #if (CUB_PTX_ARCH > 0) - - // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy - scan_range_config.template Init(); - - #else - - // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version - if (ptx_version >= 350) - { - scan_range_config.template Init(); - } - else if (ptx_version >= 300) - { - scan_range_config.template Init(); - } - else if (ptx_version >= 200) - { - scan_range_config.template Init(); - } - else if (ptx_version >= 130) - { - scan_range_config.template Init(); - } - else - { - scan_range_config.template Init(); - } - - #endif - } - - - /** - * Kernel kernel dispatch configuration. Mirrors the constants within BlockRangeScanPolicy. 
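KernelConfig exists because the tuning policies above are compile-time types only: InitConfigs copies the selected policy's enum constants into this plain runtime struct, so the host-side launch logic can compute tile counts and grid shapes without knowing which policy was chosen. A reduced sketch of that mirroring pattern (the two policies, their constants, and the version threshold are illustrative):

    // Compile-time tuning policies (illustrative values).
    struct Policy350 { enum { BLOCK_THREADS = 128, ITEMS_PER_THREAD = 12 }; };
    struct Policy200 { enum { BLOCK_THREADS = 256, ITEMS_PER_THREAD = 15 }; };

    // Runtime mirror of whichever policy was selected.
    struct KernelConfigSketch
    {
        int block_threads;
        int items_per_thread;

        template <typename Policy>
        void Init()
        {
            block_threads    = Policy::BLOCK_THREADS;
            items_per_thread = Policy::ITEMS_PER_THREAD;
        }

        int TileSize() const { return block_threads * items_per_thread; }
    };

    KernelConfigSketch InitConfig(int ptx_version)
    {
        KernelConfigSketch config;
        if (ptx_version >= 350) config.Init<Policy350>();
        else                    config.Init<Policy200>();
        return config;
    }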
- */ - struct KernelConfig - { - int block_threads; - int items_per_thread; - BlockLoadAlgorithm load_policy; - BlockStoreAlgorithm store_policy; - BlockScanAlgorithm scan_algorithm; - - template - CUB_RUNTIME_FUNCTION __forceinline__ - void Init() - { - block_threads = BlockRangeScanPolicy::BLOCK_THREADS; - items_per_thread = BlockRangeScanPolicy::ITEMS_PER_THREAD; - load_policy = BlockRangeScanPolicy::LOAD_ALGORITHM; - store_policy = BlockRangeScanPolicy::STORE_ALGORITHM; - scan_algorithm = BlockRangeScanPolicy::SCAN_ALGORITHM; - } - - CUB_RUNTIME_FUNCTION __forceinline__ - void Print() - { - printf("%d, %d, %d, %d, %d", - block_threads, - items_per_thread, - load_policy, - store_policy, - scan_algorithm); - } - }; - - - /****************************************************************************** - * Dispatch entrypoints - ******************************************************************************/ - - /** - * Internal dispatch routine for computing a device-wide prefix scan using the - * specified kernel functions. - */ - template < - typename ScanInitKernelPtr, ///< Function type of cub::ScanInitKernel - typename ScanRegionKernelPtr> ///< Function type of cub::ScanRegionKernelPtr - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIterator d_in, ///< [in] Pointer to the input sequence of data items - OutputIterator d_out, ///< [out] Pointer to the output sequence of data items - ScanOp scan_op, ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) - Identity identity, ///< [in] Identity element - Offset num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) - cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
- int ptx_version, ///< [in] PTX version of dispatch kernels - ScanInitKernelPtr init_kernel, ///< [in] Kernel function pointer to parameterization of cub::ScanInitKernel - ScanRegionKernelPtr scan_range_kernel, ///< [in] Kernel function pointer to parameterization of cub::ScanRegionKernel - KernelConfig scan_range_config) ///< [in] Dispatch parameters that match the policy that \p scan_range_kernel was compiled for - { - -#ifndef CUB_RUNTIME_ENABLED - - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported); - -#else - cudaError error = cudaSuccess; - do - { - // Get device ordinal - int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; - - // Get device SM version - int sm_version; - if (CubDebug(error = SmVersion(sm_version, device_ordinal))) break; - - // Get SM count - int sm_count; - if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; - - // Number of input tiles - int tile_size = scan_range_config.block_threads * scan_range_config.items_per_thread; - int num_tiles = (num_items + tile_size - 1) / tile_size; - - // Specify temporary storage allocation requirements - size_t allocation_sizes[2]; - if (CubDebug(error = ScanTileState::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors - allocation_sizes[1] = GridQueue::AllocationSize(); // bytes needed for grid queue descriptor - - // Compute allocation pointers into the single storage blob (or set the necessary size of the blob) - void* allocations[2]; - if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; - if (d_temp_storage == NULL) - { - // Return if the caller is simply requesting the size of the storage allocation - return cudaSuccess; - } - - // Construct the tile status interface - ScanTileState tile_status; - if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; - - // Construct the grid queue descriptor - GridQueue queue(allocations[1]); - - // Log init_kernel configuration - int init_grid_size = (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS; - if (debug_synchronous) CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); - - // Invoke init_kernel to initialize tile descriptors and queue descriptors - init_kernel<<>>( - queue, - tile_status, - num_tiles); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - - // Get SM occupancy for scan_range_kernel - int scan_range_sm_occupancy; - if (CubDebug(error = MaxSmOccupancy( - scan_range_sm_occupancy, // out - sm_version, - scan_range_kernel, - scan_range_config.block_threads))) break; - - // Get grid size for scanning tiles - dim3 scan_grid_size; - if (ptx_version <= 130) - { - // Blocks are launched in order, so just assign one block per tile - int max_dim_x = 32 * 1024; - scan_grid_size.z = 1; - scan_grid_size.y = (num_tiles + max_dim_x - 1) / max_dim_x; - scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x); - } - else - { - // Blocks may not be launched in order, so use atomics - int scan_range_occupancy = scan_range_sm_occupancy * sm_count; // Whole-device occupancy for scan_range_kernel - scan_grid_size.z = 1; - scan_grid_size.y = 1; - scan_grid_size.x = (num_tiles < 
scan_range_occupancy) ? - num_tiles : // Not enough to fill the device with threadblocks - scan_range_occupancy; // Fill the device with threadblocks - } - - // Log scan_range_kernel configuration - if (debug_synchronous) CubLog("Invoking scan_range_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", - scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, scan_range_config.block_threads, (long long) stream, scan_range_config.items_per_thread, scan_range_sm_occupancy); - - // Invoke scan_range_kernel - scan_range_kernel<<>>( - d_in, - d_out, - tile_status, - scan_op, - identity, - num_items, - queue); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - } - while (0); - - return error; - -#endif // CUB_RUNTIME_ENABLED - } - - - /** - * Internal dispatch routine - */ - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIterator d_in, ///< [in] Pointer to the input sequence of data items - OutputIterator d_out, ///< [out] Pointer to the output sequence of data items - ScanOp scan_op, ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) - Identity identity, ///< [in] Identity element - Offset num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) - cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - cudaError error = cudaSuccess; - do - { - // Get PTX version - int ptx_version; - #if (CUB_PTX_ARCH == 0) - if (CubDebug(error = PtxVersion(ptx_version))) break; - #else - ptx_version = CUB_PTX_ARCH; - #endif - - // Get kernel kernel dispatch configurations - KernelConfig scan_range_config; - InitConfigs(ptx_version, scan_range_config); - - // Dispatch - if (CubDebug(error = Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - scan_op, - identity, - num_items, - stream, - debug_synchronous, - ptx_version, - ScanInitKernel, - ScanRegionKernel, - scan_range_config))) break; - } - while (0); - - return error; - } -}; - - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/SRC/cub/device/dispatch/dispatch_histogram.cuh b/SRC/cub/device/dispatch/dispatch_histogram.cuh new file mode 100644 index 00000000..ab08e8ed --- /dev/null +++ b/SRC/cub/device/dispatch/dispatch_histogram.cuh @@ -0,0 +1,1096 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
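The atomics branch above sizes the scan grid as min(num_tiles, whole-device occupancy): once every SM holds its maximum number of resident blocks, additional blocks add only scheduling overhead, because the grid queue lets the resident blocks drain the remaining tiles. A sketch of that computation using the runtime occupancy API (cudaOccupancyMaxActiveBlocksPerMultiprocessor plays the role of CUB's MaxSmOccupancy helper; scan_tiles_kernel is a stand-in for scan_range_kernel):

    #include <cuda_runtime.h>

    __global__ void scan_tiles_kernel() {}   // stand-in for the real scan kernel

    int DynamicGridSize(int num_items, int block_threads, int items_per_thread)
    {
        int tile_size = block_threads * items_per_thread;
        int num_tiles = (num_items + tile_size - 1) / tile_size;

        int device, sm_count, blocks_per_sm;
        cudaGetDevice(&device);
        cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device);
        cudaOccupancyMaxActiveBlocksPerMultiprocessor(
            &blocks_per_sm, scan_tiles_kernel, block_threads, 0);

        int occupancy = blocks_per_sm * sm_count;    // whole-device residency
        return (num_tiles < occupancy) ? num_tiles   // not enough tiles to fill the device
                                       : occupancy;  // fill it; resident blocks drain the rest
    }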
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory. + */ + +#pragma once + +#include +#include +#include + +#include "../../agent/agent_histogram.cuh" +#include "../../util_debug.cuh" +#include "../../util_device.cuh" +#include "../../thread/thread_search.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + + +/****************************************************************************** + * Histogram kernel entry points + *****************************************************************************/ + +/** + * Histogram initialization kernel entry point + */ +template < + int NUM_ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename CounterT, ///< Integer type for counting sample occurrences per histogram bin + typename OffsetT> ///< Signed integer type for global offsets +__global__ void DeviceHistogramInitKernel( + ArrayWrapper num_output_bins_wrapper, ///< Number of output histogram bins per channel + ArrayWrapper d_output_histograms_wrapper, ///< Histogram counter data having logical dimensions CounterT[NUM_ACTIVE_CHANNELS][num_bins.array[CHANNEL]] + GridQueue tile_queue) ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks +{ + if ((threadIdx.x == 0) && (blockIdx.x == 0)) + tile_queue.ResetDrain(); + + int output_bin = (blockIdx.x * blockDim.x) + threadIdx.x; + + #pragma unroll + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + { + if (output_bin < num_output_bins_wrapper.array[CHANNEL]) + d_output_histograms_wrapper.array[CHANNEL][output_bin] = 0; + } +} + + +/** + * Histogram privatized sweep kernel entry point (multi-block). Computes privatized histograms, one per thread block. 
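DeviceHistogramSweepKernel, declared next, realizes the privatization strategy: each block accumulates into its own copy of the histogram and merges it into the global counts only once at the end, so the expensive global-memory atomics scale with (blocks x bins) rather than with the number of samples. A minimal single-channel sketch of that strategy using shared memory (256 bins over byte-valued samples and a grid-stride loop; all names are illustrative):

    #include <cuda_runtime.h>

    const int NUM_BINS = 256;

    __global__ void PrivatizedHistogram(const unsigned char *d_samples,
                                        int num_samples,
                                        unsigned int *d_histogram)
    {
        // Privatized copy, zeroed cooperatively by the block.
        __shared__ unsigned int smem_hist[NUM_BINS];
        for (int bin = threadIdx.x; bin < NUM_BINS; bin += blockDim.x)
            smem_hist[bin] = 0;
        __syncthreads();

        // Accumulate with cheap shared-memory atomics.
        for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num_samples;
             i += gridDim.x * blockDim.x)
            atomicAdd(&smem_hist[d_samples[i]], 1u);
        __syncthreads();

        // One global atomic per (block, bin) to merge into the output.
        for (int bin = threadIdx.x; bin < NUM_BINS; bin += blockDim.x)
            if (smem_hist[bin] > 0)
                atomicAdd(&d_histogram[bin], smem_hist[bin]);
    }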
+ */ +template < + typename AgentHistogramPolicyT, ///< Parameterized AgentHistogramPolicy tuning policy type + int PRIVATIZED_SMEM_BINS, ///< Maximum number of histogram bins per channel (e.g., up to 256) + int NUM_CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + int NUM_ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename SampleIteratorT, ///< The input iterator type. \iterator. + typename CounterT, ///< Integer type for counting sample occurrences per histogram bin + typename PrivatizedDecodeOpT, ///< The transform operator type for determining privatized counter indices from samples, one for each channel + typename OutputDecodeOpT, ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int(AgentHistogramPolicyT::BLOCK_THREADS)) +__global__ void DeviceHistogramSweepKernel( + SampleIteratorT d_samples, ///< Input data to reduce + ArrayWrapper num_output_bins_wrapper, ///< The number bins per final output histogram + ArrayWrapper num_privatized_bins_wrapper, ///< The number bins per privatized histogram + ArrayWrapper d_output_histograms_wrapper, ///< Reference to final output histograms + ArrayWrapper d_privatized_histograms_wrapper, ///< Reference to privatized histograms + ArrayWrapper output_decode_op_wrapper, ///< The transform operator for determining output bin-ids from privatized counter indices, one for each channel + ArrayWrapper privatized_decode_op_wrapper, ///< The transform operator for determining privatized counter indices from samples, one for each channel + OffsetT num_row_pixels, ///< The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< The number of rows in the region of interest + OffsetT row_stride_samples, ///< The number of samples between starts of consecutive rows in the region of interest + int tiles_per_row, ///< Number of image tiles per row + GridQueue tile_queue) ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks +{ + // Thread block type for compositing input tiles + typedef AgentHistogram< + AgentHistogramPolicyT, + PRIVATIZED_SMEM_BINS, + NUM_CHANNELS, + NUM_ACTIVE_CHANNELS, + SampleIteratorT, + CounterT, + PrivatizedDecodeOpT, + OutputDecodeOpT, + OffsetT> + AgentHistogramT; + + // Shared memory for AgentHistogram + __shared__ typename AgentHistogramT::TempStorage temp_storage; + + AgentHistogramT agent( + temp_storage, + d_samples, + num_output_bins_wrapper.array, + num_privatized_bins_wrapper.array, + d_output_histograms_wrapper.array, + d_privatized_histograms_wrapper.array, + output_decode_op_wrapper.array, + privatized_decode_op_wrapper.array); + + // Initialize counters + agent.InitBinCounters(); + + // Consume input tiles + agent.ConsumeTiles( + num_row_pixels, + num_rows, + row_stride_samples, + tiles_per_row, + tile_queue); + + // Store output to global (if necessary) + agent.StoreOutput(); + +} + + + + + + +/****************************************************************************** + * Dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for DeviceHistogram + */ +template < + int NUM_CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being 
actively histogrammed) + int NUM_ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename SampleIteratorT, ///< Random-access input iterator type for reading input items \iterator + typename CounterT, ///< Integer type for counting sample occurrences per histogram bin + typename LevelT, ///< Type for specifying bin level boundaries + typename OffsetT> ///< Signed integer type for global offsets +struct DipatchHistogram +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// The sample value type of the input iterator + typedef typename std::iterator_traits::value_type SampleT; + + enum + { + // Maximum number of bins per channel for which we will use a privatized smem strategy + MAX_PRIVATIZED_SMEM_BINS = 256 + }; + + + //--------------------------------------------------------------------- + // Transform functors for converting samples to bin-ids + //--------------------------------------------------------------------- + + // Searches for bin given a list of bin-boundary levels + template + struct SearchTransform + { + LevelIteratorT d_levels; // Pointer to levels array + int num_output_levels; // Number of levels in array + + // Initializer + __host__ __device__ __forceinline__ void Init( + LevelIteratorT d_levels, // Pointer to levels array + int num_output_levels) // Number of levels in array + { + this->d_levels = d_levels; + this->num_output_levels = num_output_levels; + } + + // Method for converting samples to bin-ids + template + __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid) + { + /// Level iterator wrapper type + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedInputIterator + LevelIteratorT>::Type // Directly use the supplied input iterator type + WrappedLevelIteratorT; + + WrappedLevelIteratorT wrapped_levels(d_levels); + + int num_bins = num_output_levels - 1; + if (valid) + { + bin = UpperBound(wrapped_levels, num_output_levels, (LevelT) sample) - 1; + if (bin >= num_bins) + bin = -1; + } + } + }; + + + // Scales samples to evenly-spaced bins + struct ScaleTransform + { + int num_bins; // Number of levels in array + LevelT max; // Max sample level (exclusive) + LevelT min; // Min sample level (inclusive) + LevelT scale; // Bin scaling factor + + // Initializer + template + __host__ __device__ __forceinline__ void Init( + int num_output_levels, // Number of levels in array + _LevelT max, // Max sample level (exclusive) + _LevelT min, // Min sample level (inclusive) + _LevelT scale) // Bin scaling factor + { + this->num_bins = num_output_levels - 1; + this->max = max; + this->min = min; + this->scale = scale; + } + + // Initializer (float specialization) + __host__ __device__ __forceinline__ void Init( + int num_output_levels, // Number of levels in array + float max, // Max sample level (exclusive) + float min, // Min sample level (inclusive) + float scale) // Bin scaling factor + { + this->num_bins = num_output_levels - 1; + this->max = max; + this->min = min; + this->scale = float(1.0) / scale; + } + + // Initializer (double specialization) + __host__ __device__ __forceinline__ void Init( + int num_output_levels, // Number of levels in array + double max, // Max sample level (exclusive) + double min, // Min sample level (inclusive) + double scale) // Bin scaling factor + { + this->num_bins = num_output_levels - 1; + 
this->max = max; + this->min = min; + this->scale = double(1.0) / scale; + } + + // Method for converting samples to bin-ids + template + __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid) + { + LevelT level_sample = (LevelT) sample; + + if (valid && (level_sample >= min) && (level_sample < max)) + bin = (int) ((level_sample - min) / scale); + } + + // Method for converting samples to bin-ids (float specialization) + template + __host__ __device__ __forceinline__ void BinSelect(float sample, int &bin, bool valid) + { + LevelT level_sample = (LevelT) sample; + + if (valid && (level_sample >= min) && (level_sample < max)) + bin = (int) ((level_sample - min) * scale); + } + + // Method for converting samples to bin-ids (double specialization) + template + __host__ __device__ __forceinline__ void BinSelect(double sample, int &bin, bool valid) + { + LevelT level_sample = (LevelT) sample; + + if (valid && (level_sample >= min) && (level_sample < max)) + bin = (int) ((level_sample - min) * scale); + } + }; + + + // Pass-through bin transform operator + struct PassThruTransform + { + // Method for converting samples to bin-ids + template + __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid) + { + if (valid) + bin = (int) sample; + } + }; + + + + //--------------------------------------------------------------------- + // Tuning policies + //--------------------------------------------------------------------- + + template + struct TScale + { + enum + { + V_SCALE = (sizeof(SampleT) + sizeof(int) - 1) / sizeof(int), + VALUE = CUB_MAX((NOMINAL_ITEMS_PER_THREAD / NUM_ACTIVE_CHANNELS / V_SCALE), 1) + }; + }; + + + /// SM11 + struct Policy110 + { + // HistogramSweepPolicy + typedef AgentHistogramPolicy< + 512, + (NUM_CHANNELS == 1) ? 8 : 2, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + true, + GMEM, + false> + HistogramSweepPolicy; + }; + + /// SM20 + struct Policy200 + { + // HistogramSweepPolicy + typedef AgentHistogramPolicy< + (NUM_CHANNELS == 1) ? 256 : 128, + (NUM_CHANNELS == 1) ? 8 : 3, + (NUM_CHANNELS == 1) ? BLOCK_LOAD_DIRECT : BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + true, + SMEM, + false> + HistogramSweepPolicy; + }; + + /// SM30 + struct Policy300 + { + // HistogramSweepPolicy + typedef AgentHistogramPolicy< + 512, + (NUM_CHANNELS == 1) ? 
8 : 2, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + true, + GMEM, + false> + HistogramSweepPolicy; + }; + + /// SM35 + struct Policy350 + { + // HistogramSweepPolicy + typedef AgentHistogramPolicy< + 128, + TScale<8>::VALUE, + BLOCK_LOAD_DIRECT, + LOAD_LDG, + true, + BLEND, + true> + HistogramSweepPolicy; + }; + + /// SM50 + struct Policy500 + { + // HistogramSweepPolicy + typedef AgentHistogramPolicy< + 384, + TScale<16>::VALUE, + BLOCK_LOAD_DIRECT, + LOAD_LDG, + true, + SMEM, + false> + HistogramSweepPolicy; + }; + + + + //--------------------------------------------------------------------- + // Tuning policies of current PTX compiler pass + //--------------------------------------------------------------------- + +#if (CUB_PTX_ARCH >= 500) + typedef Policy500 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#else + typedef Policy110 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxHistogramSweepPolicy : PtxPolicy::HistogramSweepPolicy {}; + + + //--------------------------------------------------------------------- + // Utilities + //--------------------------------------------------------------------- + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t InitConfigs( + int ptx_version, + KernelConfig &histogram_sweep_config) + { + #if (CUB_PTX_ARCH > 0) + + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + return histogram_sweep_config.template Init(); + + #else + + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 500) + { + return histogram_sweep_config.template Init(); + } + else if (ptx_version >= 350) + { + return histogram_sweep_config.template Init(); + } + else if (ptx_version >= 300) + { + return histogram_sweep_config.template Init(); + } + else if (ptx_version >= 200) + { + return histogram_sweep_config.template Init(); + } + else if (ptx_version >= 110) + { + return histogram_sweep_config.template Init(); + } + else + { + // No global atomic support + return cudaErrorNotSupported; + } + + #endif + } + + + /** + * Kernel kernel dispatch configuration + */ + struct KernelConfig + { + int block_threads; + int pixels_per_thread; + + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t Init() + { + block_threads = BlockPolicy::BLOCK_THREADS; + pixels_per_thread = BlockPolicy::PIXELS_PER_THREAD; + + return cudaSuccess; + } + }; + + + //--------------------------------------------------------------------- + // Dispatch entrypoints + //--------------------------------------------------------------------- + + /** + * Privatization-based dispatch routine + */ + template < + typename PrivatizedDecodeOpT, ///< The transform operator type for determining privatized counter indices from samples, one for each channel + typename OutputDecodeOpT, ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel + typename DeviceHistogramInitKernelT, ///< Function type of cub::DeviceHistogramInitKernel + typename DeviceHistogramSweepKernelT> ///< Function type of cub::DeviceHistogramSweepKernel + CUB_RUNTIME_FUNCTION 
__forceinline__
+    static cudaError_t PrivatizedDispatch(
+        void* d_temp_storage,                                       ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t& temp_storage_bytes,                                 ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        SampleIteratorT d_samples,                                  ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
+        CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS],         ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel i, the allocation length of d_histograms[i] should be num_output_levels[i] - 1.
+        int num_privatized_levels[NUM_ACTIVE_CHANNELS],             ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channel i is num_privatized_levels[i] - 1.
+        PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS], ///< [in] Transform operators for determining bin-ids from samples, one for each channel
+        int num_output_levels[NUM_ACTIVE_CHANNELS],                 ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channel i is num_output_levels[i] - 1.
+        OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS],      ///< [in] Transform operators for determining bin-ids from samples, one for each channel
+        int max_num_output_bins,                                    ///< [in] Maximum number of output bins in any channel
+        OffsetT num_row_pixels,                                     ///< [in] The number of multi-channel pixels per row in the region of interest
+        OffsetT num_rows,                                           ///< [in] The number of rows in the region of interest
+        OffsetT row_stride_samples,                                 ///< [in] The number of samples between starts of consecutive rows in the region of interest
+        DeviceHistogramInitKernelT histogram_init_kernel,           ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramInitKernel
+        DeviceHistogramSweepKernelT histogram_sweep_kernel,         ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramSweepKernel
+        KernelConfig histogram_sweep_config,                        ///< [in] Dispatch parameters that match the policy that \p histogram_sweep_kernel was compiled for
+        cudaStream_t stream,                                        ///< [in] CUDA stream to launch kernels within. Default is stream 0.
+        bool debug_synchronous)                                     ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
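+    // Overview of the dispatch below: histogram_init_kernel zeroes the final
+    // output histograms and resets the tile queue; histogram_sweep_kernel then
+    // has each thread block accumulate samples into its own privatized histogram
+    // (shared or global memory, depending on PRIVATIZED_SMEM_BINS) before folding
+    // that copy into the final output bins via the output decode operators.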
+ { + #ifndef CUB_RUNTIME_ENABLED + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported); + + #else + + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Get SM occupancy for histogram_sweep_kernel + int histogram_sweep_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + histogram_sweep_sm_occupancy, + histogram_sweep_kernel, + histogram_sweep_config.block_threads))) break; + + // Get device occupancy for histogram_sweep_kernel + int histogram_sweep_occupancy = histogram_sweep_sm_occupancy * sm_count; + + if (num_row_pixels * NUM_CHANNELS == row_stride_samples) + { + // Treat as a single linear array of samples + num_row_pixels *= num_rows; + num_rows = 1; + row_stride_samples = num_row_pixels * NUM_CHANNELS; + } + + // Get grid dimensions, trying to keep total blocks ~histogram_sweep_occupancy + int pixels_per_tile = histogram_sweep_config.block_threads * histogram_sweep_config.pixels_per_thread; + int tiles_per_row = int(num_row_pixels + pixels_per_tile - 1) / pixels_per_tile; + int blocks_per_row = CUB_MIN(histogram_sweep_occupancy, tiles_per_row); + int blocks_per_col = (blocks_per_row > 0) ? + int(CUB_MIN(histogram_sweep_occupancy / blocks_per_row, num_rows)) : + 0; + int num_thread_blocks = blocks_per_row * blocks_per_col; + + dim3 sweep_grid_dims; + sweep_grid_dims.x = (unsigned int) blocks_per_row; + sweep_grid_dims.y = (unsigned int) blocks_per_col; + sweep_grid_dims.z = 1; + + // Temporary storage allocation requirements + const int NUM_ALLOCATIONS = NUM_ACTIVE_CHANNELS + 1; + void* allocations[NUM_ALLOCATIONS]; + size_t allocation_sizes[NUM_ALLOCATIONS]; + + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + allocation_sizes[CHANNEL] = size_t(num_thread_blocks) * (num_privatized_levels[CHANNEL] - 1) * sizeof(CounterT); + + allocation_sizes[NUM_ALLOCATIONS - 1] = GridQueue::AllocationSize(); + + // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + break; + } + + // Construct the grid queue descriptor + GridQueue tile_queue(allocations[NUM_ALLOCATIONS - 1]); + + // Setup array wrapper for histogram channel output (because we can't pass static arrays as kernel parameters) + ArrayWrapper d_output_histograms_wrapper; + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + d_output_histograms_wrapper.array[CHANNEL] = d_output_histograms[CHANNEL]; + + // Setup array wrapper for privatized per-block histogram channel output (because we can't pass static arrays as kernel parameters) + ArrayWrapper d_privatized_histograms_wrapper; + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + d_privatized_histograms_wrapper.array[CHANNEL] = (CounterT*) allocations[CHANNEL]; + + // Setup array wrapper for sweep bin transforms (because we can't pass static arrays as kernel parameters) + ArrayWrapper privatized_decode_op_wrapper; + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + privatized_decode_op_wrapper.array[CHANNEL] = privatized_decode_op[CHANNEL]; + + // Setup 
array wrapper for aggregation bin transforms (because we can't pass static arrays as kernel parameters) + ArrayWrapper output_decode_op_wrapper; + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + output_decode_op_wrapper.array[CHANNEL] = output_decode_op[CHANNEL]; + + // Setup array wrapper for num privatized bins (because we can't pass static arrays as kernel parameters) + ArrayWrapper num_privatized_bins_wrapper; + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + num_privatized_bins_wrapper.array[CHANNEL] = num_privatized_levels[CHANNEL] - 1; + + // Setup array wrapper for num output bins (because we can't pass static arrays as kernel parameters) + ArrayWrapper num_output_bins_wrapper; + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + num_output_bins_wrapper.array[CHANNEL] = num_output_levels[CHANNEL] - 1; + + int histogram_init_block_threads = 256; + int histogram_init_grid_dims = (max_num_output_bins + histogram_init_block_threads - 1) / histogram_init_block_threads; + + // Log DeviceHistogramInitKernel configuration + if (debug_synchronous) _CubLog("Invoking DeviceHistogramInitKernel<<<%d, %d, 0, %lld>>>()\n", + histogram_init_grid_dims, histogram_init_block_threads, (long long) stream); + + // Invoke histogram_init_kernel + histogram_init_kernel<<>>( + num_output_bins_wrapper, + d_output_histograms_wrapper, + tile_queue); + + // Return if empty problem + if ((blocks_per_row == 0) || (blocks_per_col == 0)) + break; + + // Log histogram_sweep_kernel configuration + if (debug_synchronous) _CubLog("Invoking histogram_sweep_kernel<<<{%d, %d, %d}, %d, 0, %lld>>>(), %d pixels per thread, %d SM occupancy\n", + sweep_grid_dims.x, sweep_grid_dims.y, sweep_grid_dims.z, + histogram_sweep_config.block_threads, (long long) stream, histogram_sweep_config.pixels_per_thread, histogram_sweep_sm_occupancy); + + // Invoke histogram_sweep_kernel + histogram_sweep_kernel<<>>( + d_samples, + num_output_bins_wrapper, + num_privatized_bins_wrapper, + d_output_histograms_wrapper, + d_privatized_histograms_wrapper, + output_decode_op_wrapper, + privatized_decode_op_wrapper, + num_row_pixels, + num_rows, + row_stride_samples, + tiles_per_row, + tile_queue); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + } + while (0); + + return error; + + #endif // CUB_RUNTIME_ENABLED + } + + + + /** + * Dispatch routine for HistogramRange, specialized for sample types larger than 8bit + */ + CUB_RUNTIME_FUNCTION + static cudaError_t DispatchRange( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. 
+ int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. + LevelT *d_levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. + OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + Int2Type is_byte_sample) ///< [in] Marker type indicating whether or not SampleT is a 8b type + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel dispatch configurations + KernelConfig histogram_sweep_config; + if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config))) + break; + + // Use the search transform op for converting samples to privatized bins + typedef SearchTransform PrivatizedDecodeOpT; + + // Use the pass-thru transform op for converting privatized bins to output bins + typedef PassThruTransform OutputDecodeOpT; + + PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS]; + OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS]; + int max_levels = num_output_levels[0]; + + for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) + { + privatized_decode_op[channel].Init(d_levels[channel], num_output_levels[channel]); + if (num_output_levels[channel] > max_levels) + max_levels = num_output_levels[channel]; + } + int max_num_output_bins = max_levels - 1; + + // Dispatch + if (max_num_output_bins > MAX_PRIVATIZED_SMEM_BINS) + { + // Too many bins to keep in shared memory. 
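+                // Fall back to privatizing in global memory: PRIVATIZED_SMEM_BINS = 0
+                // directs the sweep kernel to accumulate into the per-block global
+                // allocations sized in PrivatizedDispatch rather than shared memory.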
+ const int PRIVATIZED_SMEM_BINS = 0; + + if (CubDebug(error = PrivatizedDispatch( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_output_histograms, + num_output_levels, + privatized_decode_op, + num_output_levels, + output_decode_op, + max_num_output_bins, + num_row_pixels, + num_rows, + row_stride_samples, + DeviceHistogramInitKernel, + DeviceHistogramSweepKernel, + histogram_sweep_config, + stream, + debug_synchronous))) break; + } + else + { + // Dispatch shared-privatized approach + const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS; + + if (CubDebug(error = PrivatizedDispatch( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_output_histograms, + num_output_levels, + privatized_decode_op, + num_output_levels, + output_decode_op, + max_num_output_bins, + num_row_pixels, + num_rows, + row_stride_samples, + DeviceHistogramInitKernel, + DeviceHistogramSweepKernel, + histogram_sweep_config, + stream, + debug_synchronous))) break; + } + + } while (0); + + return error; + } + + + /** + * Dispatch routine for HistogramRange, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels) + */ + CUB_RUNTIME_FUNCTION + static cudaError_t DispatchRange( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. + int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. + LevelT *d_levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. + OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
+ Int2Type is_byte_sample) ///< [in] Marker type indicating whether or not SampleT is a 8b type + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel dispatch configurations + KernelConfig histogram_sweep_config; + if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config))) + break; + + // Use the pass-thru transform op for converting samples to privatized bins + typedef PassThruTransform PrivatizedDecodeOpT; + + // Use the search transform op for converting privatized bins to output bins + typedef SearchTransform OutputDecodeOpT; + + int num_privatized_levels[NUM_ACTIVE_CHANNELS]; + PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS]; + OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS]; + int max_levels = num_output_levels[0]; // Maximum number of levels in any channel + + for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) + { + num_privatized_levels[channel] = 257; + output_decode_op[channel].Init(d_levels[channel], num_output_levels[channel]); + + if (num_output_levels[channel] > max_levels) + max_levels = num_output_levels[channel]; + } + int max_num_output_bins = max_levels - 1; + + const int PRIVATIZED_SMEM_BINS = 256; + + if (CubDebug(error = PrivatizedDispatch( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_output_histograms, + num_privatized_levels, + privatized_decode_op, + num_output_levels, + output_decode_op, + max_num_output_bins, + num_row_pixels, + num_rows, + row_stride_samples, + DeviceHistogramInitKernel, + DeviceHistogramSweepKernel, + histogram_sweep_config, + stream, + debug_synchronous))) break; + + } while (0); + + return error; + } + + + /** + * Dispatch routine for HistogramEven, specialized for sample types larger than 8-bit + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t DispatchEven( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. + int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. + LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. + LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. 
+ OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + Int2Type is_byte_sample) ///< [in] Marker type indicating whether or not SampleT is a 8b type + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel dispatch configurations + KernelConfig histogram_sweep_config; + if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config))) + break; + + // Use the scale transform op for converting samples to privatized bins + typedef ScaleTransform PrivatizedDecodeOpT; + + // Use the pass-thru transform op for converting privatized bins to output bins + typedef PassThruTransform OutputDecodeOpT; + + PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS]; + OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS]; + int max_levels = num_output_levels[0]; + + for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) + { + int bins = num_output_levels[channel] - 1; + LevelT scale = (upper_level[channel] - lower_level[channel]) / bins; + + privatized_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale); + + if (num_output_levels[channel] > max_levels) + max_levels = num_output_levels[channel]; + } + int max_num_output_bins = max_levels - 1; + + if (max_num_output_bins > MAX_PRIVATIZED_SMEM_BINS) + { + // Dispatch shared-privatized approach + const int PRIVATIZED_SMEM_BINS = 0; + + if (CubDebug(error = PrivatizedDispatch( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_output_histograms, + num_output_levels, + privatized_decode_op, + num_output_levels, + output_decode_op, + max_num_output_bins, + num_row_pixels, + num_rows, + row_stride_samples, + DeviceHistogramInitKernel, + DeviceHistogramSweepKernel, + histogram_sweep_config, + stream, + debug_synchronous))) break; + } + else + { + // Dispatch shared-privatized approach + const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS; + + if (CubDebug(error = PrivatizedDispatch( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_output_histograms, + num_output_levels, + privatized_decode_op, + num_output_levels, + output_decode_op, + max_num_output_bins, + num_row_pixels, + num_rows, + row_stride_samples, + DeviceHistogramInitKernel, + DeviceHistogramSweepKernel, + histogram_sweep_config, + stream, + debug_synchronous))) break; + } + } + while (0); + + return error; + } + + + /** + * Dispatch routine for HistogramEven, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels) + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t DispatchEven( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. + int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. + LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. + LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. + OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + Int2Type is_byte_sample) ///< [in] Marker type indicating whether or not SampleT is a 8b type + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel dispatch configurations + KernelConfig histogram_sweep_config; + if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config))) + break; + + // Use the pass-thru transform op for converting samples to privatized bins + typedef PassThruTransform PrivatizedDecodeOpT; + + // Use the scale transform op for converting privatized bins to output bins + typedef ScaleTransform OutputDecodeOpT; + + int num_privatized_levels[NUM_ACTIVE_CHANNELS]; + PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS]; + OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS]; + int max_levels = num_output_levels[0]; + + for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) + { + num_privatized_levels[channel] = 257; + + int bins = num_output_levels[channel] - 1; + LevelT scale = (upper_level[channel] - lower_level[channel]) / bins; + output_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale); + + if (num_output_levels[channel] > max_levels) + max_levels = num_output_levels[channel]; + } + int max_num_output_bins = max_levels - 1; + + const int PRIVATIZED_SMEM_BINS = 256; + + if (CubDebug(error = PrivatizedDispatch( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_output_histograms, + num_privatized_levels, + privatized_decode_op, + num_output_levels, + output_decode_op, + max_num_output_bins, + num_row_pixels, + num_rows, + row_stride_samples, + DeviceHistogramInitKernel, + DeviceHistogramSweepKernel, + histogram_sweep_config, + stream, + debug_synchronous))) break; + + 
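+            // Worked example of the even-binning math above (hypothetical values):
+            // with num_output_levels = 5, lower_level = 0, upper_level = 256,
+            // scale = (256 - 0) / 4 = 64, so the 256 pass-through privatized bins
+            // reduce to 4 output bins; privatized bin 200 maps to output bin
+            // (200 - 0) / 64 = 3.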
}
+        while (0);
+
+        return error;
+    }
+
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/SRC/cub/device/dispatch/dispatch_radix_sort.cuh b/SRC/cub/device/dispatch/dispatch_radix_sort.cuh
new file mode 100644
index 00000000..d1a992d4
--- /dev/null
+++ b/SRC/cub/device/dispatch/dispatch_radix_sort.cuh
@@ -0,0 +1,1619 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "../../agent/agent_radix_sort_upsweep.cuh"
+#include "../../agent/agent_radix_sort_downsweep.cuh"
+#include "../../agent/agent_scan.cuh"
+#include "../../block/block_radix_sort.cuh"
+#include "../../grid/grid_even_share.cuh"
+#include "../../util_type.cuh"
+#include "../../util_debug.cuh"
+#include "../../util_device.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/******************************************************************************
+ * Kernel entry points
+ *****************************************************************************/
+
+/**
+ * Upsweep digit-counting kernel entry point (multi-block). Computes privatized digit histograms, one per block.
+ */
+template <
+    typename ChainedPolicyT,                ///< Chained tuning policy
+    bool     ALT_DIGIT_BITS,                ///< Whether or not to use the alternate (lower-bits) policy
+    bool     IS_DESCENDING,                 ///< Whether or not the sorted-order is high-to-low
+    typename KeyT,                          ///< Key type
+    typename OffsetT>                       ///< Signed integer type for global offsets
+__launch_bounds__ (int((ALT_DIGIT_BITS) ?
+    ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS :
+    ChainedPolicyT::ActivePolicy::UpsweepPolicy::BLOCK_THREADS))
+__global__ void DeviceRadixSortUpsweepKernel(
+    const KeyT *d_keys,                     ///< [in] Input keys buffer
+    OffsetT *d_spine,                       ///< [out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
+    OffsetT /*num_items*/,                  ///< [in] Total number of input data items
+    int current_bit,                        ///< [in] Bit position of current radix digit
+    int num_bits,                           ///< [in] Number of bits of current radix digit
+    GridEvenShare<OffsetT> even_share)      ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block
+{
+    enum {
+        TILE_ITEMS = ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS *
+                     ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::ITEMS_PER_THREAD
+    };
+
+    // Parameterize AgentRadixSortUpsweep type for the current configuration
+    typedef AgentRadixSortUpsweep<
+        typename If<(ALT_DIGIT_BITS),
+            typename ChainedPolicyT::ActivePolicy::AltUpsweepPolicy,
+            typename ChainedPolicyT::ActivePolicy::UpsweepPolicy>::Type,
+        KeyT,
+        OffsetT>
+    AgentRadixSortUpsweepT;
+
+    // Shared memory storage
+    __shared__ typename AgentRadixSortUpsweepT::TempStorage temp_storage;
+
+    // Initialize GRID_MAPPING_RAKE even-share descriptor for this thread block
+    even_share.template BlockInit<TILE_ITEMS, GRID_MAPPING_RAKE>();
+
+    AgentRadixSortUpsweepT upsweep(temp_storage, d_keys, current_bit, num_bits);
+
+    upsweep.ProcessRegion(even_share.block_offset, even_share.block_end);
+
+    CTA_SYNC();
+
+    // Write out digit counts (striped)
+    upsweep.template ExtractCounts<IS_DESCENDING>(d_spine, gridDim.x, blockIdx.x);
+}
+
+
+/**
+ * Spine scan kernel entry point (single-block). Computes an exclusive prefix sum over the privatized digit histograms
+ */
+template <
+    typename ChainedPolicyT,                ///< Chained tuning policy
+    typename OffsetT>                       ///< Signed integer type for global offsets
+__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ScanPolicy::BLOCK_THREADS), 1)
+__global__ void RadixSortScanBinsKernel(
+    OffsetT *d_spine,                       ///< [in,out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
+    int num_counts)                         ///< [in] Total number of bin-counts
+{
+    // Parameterize the AgentScan type for the current configuration
+    typedef AgentScan<
+        typename ChainedPolicyT::ActivePolicy::ScanPolicy,
+        OffsetT*,
+        OffsetT*,
+        cub::Sum,
+        OffsetT,
+        OffsetT>
+    AgentScanT;
+
+    // Shared memory storage
+    __shared__ typename AgentScanT::TempStorage temp_storage;
+
+    // Block scan instance
+    AgentScanT block_scan(temp_storage, d_spine, d_spine, cub::Sum(), OffsetT(0)) ;
+
+    // Process full input tiles
+    int block_offset = 0;
+    BlockScanRunningPrefixOp<OffsetT, Sum> prefix_op(0, Sum());
+    while (block_offset + AgentScanT::TILE_ITEMS <= num_counts)
+    {
+        block_scan.template ConsumeTile<false, false>(block_offset, prefix_op);
+        block_offset += AgentScanT::TILE_ITEMS;
+    }
+}
+
+
+/**
+ * Downsweep pass kernel entry point (multi-block). Scatters keys (and values) into corresponding bins for the current digit place.
+ */
+template <
+    typename ChainedPolicyT,                ///< Chained tuning policy
+    bool     ALT_DIGIT_BITS,                ///< Whether or not to use the alternate (lower-bits) policy
+    bool     IS_DESCENDING,                 ///< Whether or not the sorted-order is high-to-low
+    typename KeyT,                          ///< Key type
+    typename ValueT,                        ///< Value type
+    typename OffsetT>                       ///< Signed integer type for global offsets
+__launch_bounds__ (int((ALT_DIGIT_BITS) ?
+    ChainedPolicyT::ActivePolicy::AltDownsweepPolicy::BLOCK_THREADS :
+    ChainedPolicyT::ActivePolicy::DownsweepPolicy::BLOCK_THREADS))
+__global__ void DeviceRadixSortDownsweepKernel(
+    const KeyT *d_keys_in,                  ///< [in] Input keys buffer
+    KeyT *d_keys_out,                       ///< [in] Output keys buffer
+    const ValueT *d_values_in,              ///< [in] Input values buffer
+    ValueT *d_values_out,                   ///< [in] Output values buffer
+    OffsetT *d_spine,                       ///< [in] Scan of privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
+    OffsetT num_items,                      ///< [in] Total number of input data items
+    int current_bit,                        ///< [in] Bit position of current radix digit
+    int num_bits,                           ///< [in] Number of bits of current radix digit
+    GridEvenShare<OffsetT> even_share)      ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block
+{
+    enum {
+        TILE_ITEMS = ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS *
+                     ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::ITEMS_PER_THREAD
+    };
+
+    // Parameterize AgentRadixSortDownsweep type for the current configuration
+    typedef AgentRadixSortDownsweep<
+        typename If<(ALT_DIGIT_BITS),
+            typename ChainedPolicyT::ActivePolicy::AltDownsweepPolicy,
+            typename ChainedPolicyT::ActivePolicy::DownsweepPolicy>::Type,
+        IS_DESCENDING,
+        KeyT,
+        ValueT,
+        OffsetT>
+    AgentRadixSortDownsweepT;
+
+    // Shared memory storage
+    __shared__ typename AgentRadixSortDownsweepT::TempStorage temp_storage;
+
+    // Initialize even-share descriptor for this thread block
+    even_share.template BlockInit<TILE_ITEMS, GRID_MAPPING_RAKE>();
+
+    // Process input tiles
+    AgentRadixSortDownsweepT(temp_storage, num_items, d_spine, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit, num_bits).ProcessRegion(
+        even_share.block_offset,
+        even_share.block_end);
+}
+
+
+/**
+ * Single pass kernel entry point (single-block). Fully sorts a tile of input.
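+ * Loads a single tile of keys (and values), sorts it entirely on-chip with
+ * BlockRadixSort, and writes the result back in striped order; no spine scan or
+ * multi-pass machinery is needed at this size.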
+ */ +template < + typename ChainedPolicyT, ///< Chained tuning policy + bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low + typename KeyT, ///< Key type + typename ValueT, ///< Value type + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1) +__global__ void DeviceRadixSortSingleTileKernel( + const KeyT *d_keys_in, ///< [in] Input keys buffer + KeyT *d_keys_out, ///< [in] Output keys buffer + const ValueT *d_values_in, ///< [in] Input values buffer + ValueT *d_values_out, ///< [in] Output values buffer + OffsetT num_items, ///< [in] Total number of input data items + int current_bit, ///< [in] Bit position of current radix digit + int end_bit) ///< [in] The past-the-end (most-significant) bit index needed for key comparison +{ + // Constants + enum + { + BLOCK_THREADS = ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS, + ITEMS_PER_THREAD = ChainedPolicyT::ActivePolicy::SingleTilePolicy::ITEMS_PER_THREAD, + KEYS_ONLY = Equals::VALUE, + }; + + // BlockRadixSort type + typedef BlockRadixSort< + KeyT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + ValueT, + ChainedPolicyT::ActivePolicy::SingleTilePolicy::RADIX_BITS, + (ChainedPolicyT::ActivePolicy::SingleTilePolicy::RANK_ALGORITHM == RADIX_RANK_MEMOIZE), + ChainedPolicyT::ActivePolicy::SingleTilePolicy::SCAN_ALGORITHM> + BlockRadixSortT; + + // BlockLoad type (keys) + typedef BlockLoad< + KeyT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + ChainedPolicyT::ActivePolicy::SingleTilePolicy::LOAD_ALGORITHM> BlockLoadKeys; + + // BlockLoad type (values) + typedef BlockLoad< + ValueT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + ChainedPolicyT::ActivePolicy::SingleTilePolicy::LOAD_ALGORITHM> BlockLoadValues; + + // Unsigned word for key bits + typedef typename Traits::UnsignedBits UnsignedBitsT; + + // Shared memory storage + __shared__ union TempStorage + { + typename BlockRadixSortT::TempStorage sort; + typename BlockLoadKeys::TempStorage load_keys; + typename BlockLoadValues::TempStorage load_values; + + } temp_storage; + + // Keys and values for the block + KeyT keys[ITEMS_PER_THREAD]; + ValueT values[ITEMS_PER_THREAD]; + + // Get default (min/max) value for out-of-bounds keys + UnsignedBitsT default_key_bits = (IS_DESCENDING) ? 
Traits::LOWEST_KEY : Traits::MAX_KEY; + KeyT default_key = reinterpret_cast(default_key_bits); + + // Load keys + BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in, keys, num_items, default_key); + + CTA_SYNC(); + + // Load values + if (!KEYS_ONLY) + { + // Register pressure work-around: moving num_items through shfl prevents compiler + // from reusing guards/addressing from prior guarded loads + num_items = ShuffleIndex(num_items, 0, 0xffffffff); + + BlockLoadValues(temp_storage.load_values).Load(d_values_in, values, num_items); + + CTA_SYNC(); + } + + // Sort tile + BlockRadixSortT(temp_storage.sort).SortBlockedToStriped( + keys, + values, + current_bit, + end_bit, + Int2Type(), + Int2Type()); + + // Store keys and values + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + int item_offset = ITEM * BLOCK_THREADS + threadIdx.x; + if (item_offset < num_items) + { + d_keys_out[item_offset] = keys[ITEM]; + if (!KEYS_ONLY) + d_values_out[item_offset] = values[ITEM]; + } + } +} + + +/** + * Segmented radix sorting pass (one block per segment) + */ +template < + typename ChainedPolicyT, ///< Chained tuning policy + bool ALT_DIGIT_BITS, ///< Whether or not to use the alternate (lower-bits) policy + bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low + typename KeyT, ///< Key type + typename ValueT, ///< Value type + typename OffsetIteratorT, ///< Random-access input iterator type for reading segment offsets \iterator + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int((ALT_DIGIT_BITS) ? + ChainedPolicyT::ActivePolicy::AltSegmentedPolicy::BLOCK_THREADS : + ChainedPolicyT::ActivePolicy::SegmentedPolicy::BLOCK_THREADS)) +__global__ void DeviceSegmentedRadixSortKernel( + const KeyT *d_keys_in, ///< [in] Input keys buffer + KeyT *d_keys_out, ///< [in] Output keys buffer + const ValueT *d_values_in, ///< [in] Input values buffer + ValueT *d_values_out, ///< [in] Output values buffer + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. 
+ int /*num_segments*/, ///< [in] The number of segments that comprise the sorting data + int current_bit, ///< [in] Bit position of current radix digit + int pass_bits) ///< [in] Number of bits of current radix digit +{ + // + // Constants + // + + typedef typename If<(ALT_DIGIT_BITS), + typename ChainedPolicyT::ActivePolicy::AltSegmentedPolicy, + typename ChainedPolicyT::ActivePolicy::SegmentedPolicy>::Type SegmentedPolicyT; + + enum + { + BLOCK_THREADS = SegmentedPolicyT::BLOCK_THREADS, + ITEMS_PER_THREAD = SegmentedPolicyT::ITEMS_PER_THREAD, + RADIX_BITS = SegmentedPolicyT::RADIX_BITS, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + RADIX_DIGITS = 1 << RADIX_BITS, + KEYS_ONLY = Equals::VALUE, + }; + + // Upsweep type + typedef AgentRadixSortUpsweep< + AgentRadixSortUpsweepPolicy, + KeyT, + OffsetT> + BlockUpsweepT; + + // Digit-scan type + typedef BlockScan DigitScanT; + + // Downsweep type + typedef AgentRadixSortDownsweep BlockDownsweepT; + + enum + { + /// Number of bin-starting offsets tracked per thread + BINS_TRACKED_PER_THREAD = BlockDownsweepT::BINS_TRACKED_PER_THREAD + }; + + // + // Process input tiles + // + + // Shared memory storage + __shared__ union + { + typename BlockUpsweepT::TempStorage upsweep; + typename BlockDownsweepT::TempStorage downsweep; + struct + { + volatile OffsetT reverse_counts_in[RADIX_DIGITS]; + volatile OffsetT reverse_counts_out[RADIX_DIGITS]; + typename DigitScanT::TempStorage scan; + }; + + } temp_storage; + + OffsetT segment_begin = d_begin_offsets[blockIdx.x]; + OffsetT segment_end = d_end_offsets[blockIdx.x]; + OffsetT num_items = segment_end - segment_begin; + + // Check if empty segment + if (num_items <= 0) + return; + + // Upsweep + BlockUpsweepT upsweep(temp_storage.upsweep, d_keys_in, current_bit, pass_bits); + upsweep.ProcessRegion(segment_begin, segment_end); + + CTA_SYNC(); + + // The count of each digit value in this pass (valid in the first RADIX_DIGITS threads) + OffsetT bin_count[BINS_TRACKED_PER_THREAD]; + upsweep.ExtractCounts(bin_count); + + CTA_SYNC(); + + if (IS_DESCENDING) + { + // Reverse bin counts + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + temp_storage.reverse_counts_in[bin_idx] = bin_count[track]; + } + + CTA_SYNC(); + + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + bin_count[track] = temp_storage.reverse_counts_in[RADIX_DIGITS - bin_idx - 1]; + } + } + + // Scan + OffsetT bin_offset[BINS_TRACKED_PER_THREAD]; // The global scatter base offset for each digit value in this pass (valid in the first RADIX_DIGITS threads) + DigitScanT(temp_storage.scan).ExclusiveSum(bin_count, bin_offset); + + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + bin_offset[track] += segment_begin; + } + + if (IS_DESCENDING) + { + // Reverse bin offsets + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + temp_storage.reverse_counts_out[threadIdx.x] = bin_offset[track]; + } + + CTA_SYNC(); + + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = 
(threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + bin_offset[track] = temp_storage.reverse_counts_out[RADIX_DIGITS - bin_idx - 1]; + } + } + + CTA_SYNC(); + + // Downsweep + BlockDownsweepT downsweep(temp_storage.downsweep, bin_offset, num_items, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit, pass_bits); + downsweep.ProcessRegion(segment_begin, segment_end); +} + + + +/****************************************************************************** + * Policy + ******************************************************************************/ + +/** + * Tuning policy for kernel specialization + */ +template < + typename KeyT, ///< Key type + typename ValueT, ///< Value type + typename OffsetT> ///< Signed integer type for global offsets +struct DeviceRadixSortPolicy +{ + //------------------------------------------------------------------------------ + // Constants + //------------------------------------------------------------------------------ + + enum + { + // Whether this is a keys-only (or key-value) sort + KEYS_ONLY = (Equals::VALUE), + }; + + // Dominant-sized key/value type + typedef typename If<(sizeof(ValueT) > 4) && (sizeof(KeyT) < sizeof(ValueT)), ValueT, KeyT>::Type DominantT; + + //------------------------------------------------------------------------------ + // Architecture-specific tuning policies + //------------------------------------------------------------------------------ + + /// SM20 + struct Policy200 : ChainedPolicy<200, Policy200, Policy200> + { + enum { + PRIMARY_RADIX_BITS = 5, + ALT_RADIX_BITS = PRIMARY_RADIX_BITS - 1, + + // Relative size of KeyT type to a 4-byte word + SCALE_FACTOR_4B = (CUB_MAX(sizeof(KeyT), sizeof(ValueT)) + 3) / 4, + }; + + // Keys-only upsweep policies + typedef AgentRadixSortUpsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyKeys; + typedef AgentRadixSortUpsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyKeys; + + // Key-value pairs upsweep policies + typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyPairs; + typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyPairs; + + // Upsweep policies + typedef typename If::Type UpsweepPolicy; + typedef typename If::Type AltUpsweepPolicy; + + // Scan policy + typedef AgentScanPolicy <512, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; + + // Keys-only downsweep policies + typedef AgentRadixSortDownsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyKeys; + typedef AgentRadixSortDownsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyKeys; + + // Key-value pairs downsweep policies + typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyPairs; + typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyPairs; + + // Downsweep 
policies + typedef typename If::Type DownsweepPolicy; + typedef typename If::Type AltDownsweepPolicy; + + // Single-tile policy + typedef DownsweepPolicy SingleTilePolicy; + + // Segmented policies + typedef DownsweepPolicy SegmentedPolicy; + typedef AltDownsweepPolicy AltSegmentedPolicy; + }; + + /// SM30 + struct Policy300 : ChainedPolicy<300, Policy300, Policy200> + { + enum { + PRIMARY_RADIX_BITS = 5, + ALT_RADIX_BITS = PRIMARY_RADIX_BITS - 1, + + // Relative size of KeyT type to a 4-byte word + SCALE_FACTOR_4B = (CUB_MAX(sizeof(KeyT), sizeof(ValueT)) + 3) / 4, + }; + + // Keys-only upsweep policies + typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 7 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyKeys; + typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 7 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyKeys; + + // Key-value pairs upsweep policies + typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 5 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyPairs; + typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 5 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyPairs; + + // Upsweep policies + typedef typename If::Type UpsweepPolicy; + typedef typename If::Type AltUpsweepPolicy; + + // Scan policy + typedef AgentScanPolicy <1024, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_WARP_SCANS> ScanPolicy; + + // Keys-only downsweep policies + typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 14 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyKeys; + typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 14 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyKeys; + + // Key-value pairs downsweep policies + typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 10 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyPairs; + typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 10 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyPairs; + + // Downsweep policies + typedef typename If::Type DownsweepPolicy; + typedef typename If::Type AltDownsweepPolicy; + + // Single-tile policy + typedef DownsweepPolicy SingleTilePolicy; + + // Segmented policies + typedef DownsweepPolicy SegmentedPolicy; + typedef AltDownsweepPolicy AltSegmentedPolicy; + }; + + + /// SM35 + struct Policy350 : ChainedPolicy<350, Policy350, Policy300> + { + enum { + PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 
6 : 5, // 1.72B 32b keys/s, 1.17B 32b pairs/s, 1.55B 32b segmented keys/s (K40m) + }; + + // Scan policy + typedef AgentScanPolicy <1024, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_WARP_SCANS> ScanPolicy; + + // Keys-only downsweep policies + typedef AgentRadixSortDownsweepPolicy DownsweepPolicyKeys; + typedef AgentRadixSortDownsweepPolicy AltDownsweepPolicyKeys; + + // Key-value pairs downsweep policies + typedef DownsweepPolicyKeys DownsweepPolicyPairs; + typedef AgentRadixSortDownsweepPolicy AltDownsweepPolicyPairs; + + // Downsweep policies + typedef typename If::Type DownsweepPolicy; + typedef typename If::Type AltDownsweepPolicy; + + // Upsweep policies + typedef DownsweepPolicy UpsweepPolicy; + typedef AltDownsweepPolicy AltUpsweepPolicy; + + // Single-tile policy + typedef DownsweepPolicy SingleTilePolicy; + + // Segmented policies + typedef DownsweepPolicy SegmentedPolicy; + typedef AltDownsweepPolicy AltSegmentedPolicy; + + + }; + + + /// SM50 + struct Policy500 : ChainedPolicy<500, Policy500, Policy350> + { + enum { + PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 3.5B 32b keys/s, 1.92B 32b pairs/s (TitanX) + SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, + SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, // 3.1B 32b segmented keys/s (TitanX) + }; + + // ScanPolicy + typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; + + // Downsweep policies + typedef AgentRadixSortDownsweepPolicy DownsweepPolicy; + typedef AgentRadixSortDownsweepPolicy AltDownsweepPolicy; + + // Upsweep policies + typedef DownsweepPolicy UpsweepPolicy; + typedef AltDownsweepPolicy AltUpsweepPolicy; + + // Single-tile policy + typedef AgentRadixSortDownsweepPolicy SingleTilePolicy; + + // Segmented policies + typedef AgentRadixSortDownsweepPolicy SegmentedPolicy; + typedef AgentRadixSortDownsweepPolicy AltSegmentedPolicy; + }; + + + /// SM60 (GP100) + struct Policy600 : ChainedPolicy<600, Policy600, Policy500> + { + enum { + PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 6.9B 32b keys/s (Quadro P100) + SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, + SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, // 5.9B 32b segmented keys/s (Quadro P100) + }; + + // ScanPolicy + typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; + + // Downsweep policies + typedef AgentRadixSortDownsweepPolicy DownsweepPolicy; + typedef AgentRadixSortDownsweepPolicy AltDownsweepPolicy; + + // Upsweep policies + typedef DownsweepPolicy UpsweepPolicy; + typedef AltDownsweepPolicy AltUpsweepPolicy; + + // Single-tile policy + typedef AgentRadixSortDownsweepPolicy SingleTilePolicy; + + // Segmented policies + typedef AgentRadixSortDownsweepPolicy SegmentedPolicy; + typedef AgentRadixSortDownsweepPolicy AltSegmentedPolicy; + + }; + + + /// SM61 (GP104) + struct Policy610 : ChainedPolicy<610, Policy610, Policy600> + { + enum { + PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 3.4B 32b keys/s, 1.83B 32b pairs/s (1080) + SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, + SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 
6 : 5, // 3.3B 32b segmented keys/s (1080) + }; + + // ScanPolicy + typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; + + // Downsweep policies + typedef AgentRadixSortDownsweepPolicy DownsweepPolicy; + typedef AgentRadixSortDownsweepPolicy AltDownsweepPolicy; + + // Upsweep policies + typedef AgentRadixSortUpsweepPolicy UpsweepPolicy; + typedef AgentRadixSortUpsweepPolicy AltUpsweepPolicy; + + // Single-tile policy + typedef AgentRadixSortDownsweepPolicy SingleTilePolicy; + + // Segmented policies + typedef AgentRadixSortDownsweepPolicy SegmentedPolicy; + typedef AgentRadixSortDownsweepPolicy AltSegmentedPolicy; + }; + + + /// SM62 (Tegra, less RF) + struct Policy620 : ChainedPolicy<620, Policy620, Policy610> + { + enum { + PRIMARY_RADIX_BITS = 5, + ALT_RADIX_BITS = PRIMARY_RADIX_BITS - 1, + }; + + // ScanPolicy + typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; + + // Downsweep policies + typedef AgentRadixSortDownsweepPolicy DownsweepPolicy; + typedef AgentRadixSortDownsweepPolicy AltDownsweepPolicy; + + // Upsweep policies + typedef DownsweepPolicy UpsweepPolicy; + typedef AltDownsweepPolicy AltUpsweepPolicy; + + // Single-tile policy + typedef AgentRadixSortDownsweepPolicy SingleTilePolicy; + + // Segmented policies + typedef DownsweepPolicy SegmentedPolicy; + typedef AltDownsweepPolicy AltSegmentedPolicy; + }; + + + /// SM70 (GV100) + struct Policy700 : ChainedPolicy<700, Policy700, Policy620> + { + enum { + PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 7.62B 32b keys/s (GV100) + SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, + SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 
6 : 5, // 8.7B 32b segmented keys/s (GV100) + }; + + // ScanPolicy + typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; + + // Downsweep policies + typedef AgentRadixSortDownsweepPolicy DownsweepPolicy; + typedef AgentRadixSortDownsweepPolicy AltDownsweepPolicy; + + // Upsweep policies + typedef DownsweepPolicy UpsweepPolicy; + typedef AltDownsweepPolicy AltUpsweepPolicy; + + // Single-tile policy + typedef AgentRadixSortDownsweepPolicy SingleTilePolicy; + + // Segmented policies + typedef AgentRadixSortDownsweepPolicy SegmentedPolicy; + typedef AgentRadixSortDownsweepPolicy AltSegmentedPolicy; + }; + + + /// MaxPolicy + typedef Policy700 MaxPolicy; + + +}; + + + +/****************************************************************************** + * Single-problem dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for device-wide radix sort + */ +template < + bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low + typename KeyT, ///< Key type + typename ValueT, ///< Value type + typename OffsetT> ///< Signed integer type for global offsets +struct DispatchRadixSort : + DeviceRadixSortPolicy +{ + //------------------------------------------------------------------------------ + // Constants + //------------------------------------------------------------------------------ + + enum + { + // Whether this is a keys-only (or key-value) sort + KEYS_ONLY = (Equals::VALUE), + }; + + + //------------------------------------------------------------------------------ + // Problem state + //------------------------------------------------------------------------------ + + void *d_temp_storage; ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys; ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer &d_values; ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + OffsetT num_items; ///< [in] Number of items to sort + int begin_bit; ///< [in] The beginning (least-significant) bit index needed for key comparison + int end_bit; ///< [in] The past-the-end (most-significant) bit index needed for key comparison + cudaStream_t stream; ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous; ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
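+    // Note: d_keys and d_values are ping-pong double-buffers; each sorting pass
+    // writes into the alternate buffer and flips DoubleBuffer::selector, so
+    // Current() always names the most recently sorted data.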
+    int                  ptx_version;          ///< [in] PTX version
+    bool                 is_overwrite_okay;    ///< [in] Whether it is okay to overwrite source buffers
+
+
+    //------------------------------------------------------------------------------
+    // Constructor
+    //------------------------------------------------------------------------------
+
+    /// Constructor
+    CUB_RUNTIME_FUNCTION __forceinline__
+    DispatchRadixSort(
+        void*                   d_temp_storage,
+        size_t                  &temp_storage_bytes,
+        DoubleBuffer<KeyT>      &d_keys,
+        DoubleBuffer<ValueT>    &d_values,
+        OffsetT                 num_items,
+        int                     begin_bit,
+        int                     end_bit,
+        bool                    is_overwrite_okay,
+        cudaStream_t            stream,
+        bool                    debug_synchronous,
+        int                     ptx_version)
+    :
+        d_temp_storage(d_temp_storage),
+        temp_storage_bytes(temp_storage_bytes),
+        d_keys(d_keys),
+        d_values(d_values),
+        num_items(num_items),
+        begin_bit(begin_bit),
+        end_bit(end_bit),
+        stream(stream),
+        debug_synchronous(debug_synchronous),
+        ptx_version(ptx_version),
+        is_overwrite_okay(is_overwrite_okay)
+    {}
+
+
+    //------------------------------------------------------------------------------
+    // Small-problem (single tile) invocation
+    //------------------------------------------------------------------------------
+
+    /// Invoke a single block to sort in-core
+    template <
+        typename ActivePolicyT,       ///< Umbrella policy active for the target device
+        typename SingleTileKernelT>   ///< Function type of cub::DeviceRadixSortSingleTileKernel
+    CUB_RUNTIME_FUNCTION __forceinline__
+    cudaError_t InvokeSingleTile(
+        SingleTileKernelT single_tile_kernel)   ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortSingleTileKernel
+    {
+#ifndef CUB_RUNTIME_ENABLED
+        (void)single_tile_kernel;
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported );
+#else
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Return if the caller is simply requesting the size of the storage allocation
+            if (d_temp_storage == NULL)
+            {
+                temp_storage_bytes = 1;
+                break;
+            }
+
+            // Return if empty problem
+            if (num_items == 0)
+                break;
+
+            // Log single_tile_kernel configuration
+            if (debug_synchronous)
+                _CubLog("Invoking single_tile_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",
+                    1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, (long long) stream,
+                    ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD, 1, begin_bit, ActivePolicyT::SingleTilePolicy::RADIX_BITS);
+
+            // Invoke single_tile_kernel
+            single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>(
+                d_keys.Current(),
+                d_keys.Alternate(),
+                d_values.Current(),
+                d_values.Alternate(),
+                num_items,
+                begin_bit,
+                end_bit);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+            // Update selector
+            d_keys.selector ^= 1;
+            d_values.selector ^= 1;
+        }
+        while (0);
+
+        return error;
+
+#endif // CUB_RUNTIME_ENABLED
+    }
+
+
+    //------------------------------------------------------------------------------
+    // Normal problem size invocation
+    //------------------------------------------------------------------------------
+
+    /**
+     * Invoke a three-kernel sorting pass at the current bit.
+     */
+    template <typename PassConfigT>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    cudaError_t InvokePass(
+        const KeyT      *d_keys_in,
+        KeyT            *d_keys_out,
+        const ValueT    *d_values_in,
+        ValueT          *d_values_out,
+        OffsetT         *d_spine,
+        int             spine_length,
+        int             &current_bit,
+        PassConfigT     &pass_config)
+    {
+        cudaError error = cudaSuccess;
+        do
+        {
+            int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit));
+
+            // Log upsweep_kernel configuration
+            if (debug_synchronous)
+                _CubLog("Invoking upsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",
+                    pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, (long long) stream,
+                    pass_config.upsweep_config.items_per_thread, pass_config.upsweep_config.sm_occupancy, current_bit, pass_bits);
+
+            // Invoke upsweep_kernel with same grid size as downsweep_kernel
+            pass_config.upsweep_kernel<<<pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, 0, stream>>>(
+                d_keys_in,
+                d_spine,
+                num_items,
+                current_bit,
+                pass_bits,
+                pass_config.even_share);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+            // Log scan_kernel configuration
+            if (debug_synchronous) _CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n",
+                1, pass_config.scan_config.block_threads, (long long) stream, pass_config.scan_config.items_per_thread);
+
+            // Invoke scan_kernel
+            pass_config.scan_kernel<<<1, pass_config.scan_config.block_threads, 0, stream>>>(
+                d_spine,
+                spine_length);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+            // Log downsweep_kernel configuration
+            if (debug_synchronous) _CubLog("Invoking downsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+                pass_config.even_share.grid_size, pass_config.downsweep_config.block_threads, (long long) stream,
+                pass_config.downsweep_config.items_per_thread, pass_config.downsweep_config.sm_occupancy);
+
+            // Invoke downsweep_kernel
+            pass_config.downsweep_kernel<<<pass_config.even_share.grid_size, pass_config.downsweep_config.block_threads, 0, stream>>>(
+                d_keys_in,
+                d_keys_out,
+                d_values_in,
+                d_values_out,
+                d_spine,
+                num_items,
+                current_bit,
+                pass_bits,
+                pass_config.even_share);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+            // Update current bit
+            current_bit += pass_bits;
+        }
+        while (0);
+
+        return error;
+    }
+
+
+
+    /// Pass configuration structure
+    template <
+        typename UpsweepKernelT,
+        typename ScanKernelT,
+        typename DownsweepKernelT>
+    struct PassConfig
+    {
+        UpsweepKernelT          upsweep_kernel;
+        KernelConfig            upsweep_config;
+        ScanKernelT             scan_kernel;
+        KernelConfig            scan_config;
+        DownsweepKernelT        downsweep_kernel;
+        KernelConfig            downsweep_config;
+        int                     radix_bits;
+        int                     radix_digits;
+        int                     max_downsweep_grid_size;
+        GridEvenShare<OffsetT>  even_share;
+
+        /// Initialize pass configuration
+        template <
+            typename UpsweepPolicyT,
+            typename ScanPolicyT,
+            typename DownsweepPolicyT>
+        CUB_RUNTIME_FUNCTION __forceinline__
+        cudaError_t InitPassConfig(
+            UpsweepKernelT      upsweep_kernel,
+            ScanKernelT         scan_kernel,
+            DownsweepKernelT    downsweep_kernel,
+            int                 ptx_version,
+            int                 sm_count,
+            int                 num_items)
+        {
+            cudaError error = cudaSuccess;
+            do
+            {
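+                // [Editor's note] The steps below cache each kernel's launch
+                // configuration (KernelConfig::Init), derive the digit count from
+                // the downsweep policy, size the downsweep grid by
+                // occupancy * SM count * an oversubscription factor, and then
+                // initialize the even-share work distribution over input tiles.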
+                this->upsweep_kernel   = upsweep_kernel;
+                this->scan_kernel      = scan_kernel;
+                this->downsweep_kernel = downsweep_kernel;
+                radix_bits             = DownsweepPolicyT::RADIX_BITS;
+                radix_digits           = 1 << radix_bits;
+
+                if (CubDebug(error = upsweep_config.Init<UpsweepPolicyT>(upsweep_kernel))) break;
+                if (CubDebug(error = scan_config.Init<ScanPolicyT>(scan_kernel))) break;
+                if (CubDebug(error = downsweep_config.Init<DownsweepPolicyT>(downsweep_kernel))) break;
+
+                max_downsweep_grid_size = (downsweep_config.sm_occupancy * sm_count) * CUB_SUBSCRIPTION_FACTOR(ptx_version);
+
+                even_share.DispatchInit(
+                    num_items,
+                    max_downsweep_grid_size,
+                    CUB_MAX(downsweep_config.tile_size, upsweep_config.tile_size));
+
+            }
+            while (0);
+            return error;
+        }
+
+    };
+
+
+    /// Invocation (run multiple digit passes)
+    template <
+        typename ActivePolicyT,      ///< Umbrella policy active for the target device
+        typename UpsweepKernelT,     ///< Function type of cub::DeviceRadixSortUpsweepKernel
+        typename ScanKernelT,        ///< Function type of cub::SpineScanKernel
+        typename DownsweepKernelT>   ///< Function type of cub::DeviceRadixSortDownsweepKernel
+    CUB_RUNTIME_FUNCTION __forceinline__
+    cudaError_t InvokePasses(
+        UpsweepKernelT      upsweep_kernel,         ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel
+        UpsweepKernelT      alt_upsweep_kernel,     ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel
+        ScanKernelT         scan_kernel,            ///< [in] Kernel function pointer to parameterization of cub::SpineScanKernel
+        DownsweepKernelT    downsweep_kernel,       ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortDownsweepKernel
+        DownsweepKernelT    alt_downsweep_kernel)   ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceRadixSortDownsweepKernel
+    {
+#ifndef CUB_RUNTIME_ENABLED
+        (void)upsweep_kernel;
+        (void)alt_upsweep_kernel;
+        (void)scan_kernel;
+        (void)downsweep_kernel;
+        (void)alt_downsweep_kernel;
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported );
+#else
+
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get device ordinal
+            int device_ordinal;
+            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
+            // Get SM count
+            int sm_count;
+            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
+
+            // Init regular and alternate-digit kernel configurations
+            PassConfig<UpsweepKernelT, ScanKernelT, DownsweepKernelT> pass_config, alt_pass_config;
+            if ((error = pass_config.template InitPassConfig<
+                    typename ActivePolicyT::UpsweepPolicy,
+                    typename ActivePolicyT::ScanPolicy,
+                    typename ActivePolicyT::DownsweepPolicy>(
+                upsweep_kernel, scan_kernel, downsweep_kernel, ptx_version, sm_count, num_items))) break;
+
+            if ((error = alt_pass_config.template InitPassConfig<
+                    typename ActivePolicyT::AltUpsweepPolicy,
+                    typename ActivePolicyT::ScanPolicy,
+                    typename ActivePolicyT::AltDownsweepPolicy>(
+                alt_upsweep_kernel, scan_kernel, alt_downsweep_kernel, ptx_version, sm_count, num_items))) break;
+
+            // Get maximum spine length
+            int max_grid_size = CUB_MAX(pass_config.max_downsweep_grid_size, alt_pass_config.max_downsweep_grid_size);
+            int spine_length = (max_grid_size * pass_config.radix_digits) + pass_config.scan_config.tile_size;
+
+            // Temporary storage allocation requirements
+            void* allocations[3];
+            size_t allocation_sizes[3] =
+            {
+                spine_length * sizeof(OffsetT),                                       // bytes needed for privatized block digit histograms
+                (is_overwrite_okay) ? 0 : num_items * sizeof(KeyT),                   // bytes needed for 3rd keys buffer
+                (is_overwrite_okay || (KEYS_ONLY)) ? 0 : num_items * sizeof(ValueT),  // bytes needed for 3rd values buffer
+            };
+
+            // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
+            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+
+            // Return if the caller is simply requesting the size of the storage allocation
+            if (d_temp_storage == NULL)
+                return cudaSuccess;
+
+            // Pass planning.  Run passes of the alternate digit-size configuration until we have an even multiple of our preferred digit size
+            int num_bits           = end_bit - begin_bit;
+            int num_passes         = (num_bits + pass_config.radix_bits - 1) / pass_config.radix_bits;
+            bool is_num_passes_odd = num_passes & 1;
+            int max_alt_passes     = (num_passes * pass_config.radix_bits) - num_bits;
+            int alt_end_bit        = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_pass_config.radix_bits));
+
+            // Alias the temporary storage allocations
+            OffsetT *d_spine = static_cast<OffsetT*>(allocations[0]);
+
+            DoubleBuffer<KeyT> d_keys_remaining_passes(
+                (is_overwrite_okay || is_num_passes_odd) ? d_keys.Alternate() : static_cast<KeyT*>(allocations[1]),
+                (is_overwrite_okay) ? d_keys.Current() : (is_num_passes_odd) ? static_cast<KeyT*>(allocations[1]) : d_keys.Alternate());
+
+            DoubleBuffer<ValueT> d_values_remaining_passes(
+                (is_overwrite_okay || is_num_passes_odd) ? d_values.Alternate() : static_cast<ValueT*>(allocations[2]),
+                (is_overwrite_okay) ? d_values.Current() : (is_num_passes_odd) ? static_cast<ValueT*>(allocations[2]) : d_values.Alternate());
+
+            // Run first pass, consuming from the input's current buffers
+            int current_bit = begin_bit;
+            if (CubDebug(error = InvokePass(
+                d_keys.Current(), d_keys_remaining_passes.Current(),
+                d_values.Current(), d_values_remaining_passes.Current(),
+                d_spine, spine_length, current_bit,
+                (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;
+
+            // Run remaining passes
+            while (current_bit < end_bit)
+            {
+                if (CubDebug(error = InvokePass(
+                    d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
+                    d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
+                    d_spine, spine_length, current_bit,
+                    (current_bit < alt_end_bit) ?
+                        alt_pass_config : pass_config))) break;
+
+                // Invert selectors
+                d_keys_remaining_passes.selector ^= 1;
+                d_values_remaining_passes.selector ^= 1;
+            }
+
+            // Update selector
+            if (!is_overwrite_okay) {
+                num_passes = 1; // Sorted data always ends up in the other vector
+            }
+
+            d_keys.selector = (d_keys.selector + num_passes) & 1;
+            d_values.selector = (d_values.selector + num_passes) & 1;
+        }
+        while (0);
+
+        return error;
+
+#endif // CUB_RUNTIME_ENABLED
+    }
+
+
+    //------------------------------------------------------------------------------
+    // Chained policy invocation
+    //------------------------------------------------------------------------------
+
+    /// Invocation
+    template <typename ActivePolicyT>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    cudaError_t Invoke()
+    {
+        typedef typename DispatchRadixSort::MaxPolicy       MaxPolicyT;
+        typedef typename ActivePolicyT::SingleTilePolicy    SingleTilePolicyT;
+
+        // Force kernel code-generation in all compiler passes
+        if (num_items <= (SingleTilePolicyT::BLOCK_THREADS * SingleTilePolicyT::ITEMS_PER_THREAD))
+        {
+            // Small, single tile size
+            return InvokeSingleTile<ActivePolicyT>(
+                DeviceRadixSortSingleTileKernel<MaxPolicyT, IS_DESCENDING, KeyT, ValueT, OffsetT>);
+        }
+        else
+        {
+            // Regular size
+            return InvokePasses<ActivePolicyT>(
+                DeviceRadixSortUpsweepKernel<   MaxPolicyT, false, IS_DESCENDING, KeyT, OffsetT>,
+                DeviceRadixSortUpsweepKernel<   MaxPolicyT, true,  IS_DESCENDING, KeyT, OffsetT>,
+                RadixSortScanBinsKernel<        MaxPolicyT, OffsetT>,
+                DeviceRadixSortDownsweepKernel< MaxPolicyT, false, IS_DESCENDING, KeyT, ValueT, OffsetT>,
+                DeviceRadixSortDownsweepKernel< MaxPolicyT, true,  IS_DESCENDING, KeyT, ValueT, OffsetT>);
+        }
+    }
+
+
+    //------------------------------------------------------------------------------
+    // Dispatch entrypoints
+    //------------------------------------------------------------------------------
+
+    /**
+     * Internal dispatch routine
+     */
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void*                d_temp_storage,        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t               &temp_storage_bytes,   ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        DoubleBuffer<KeyT>   &d_keys,               ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+        DoubleBuffer<ValueT> &d_values,             ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+        OffsetT              num_items,             ///< [in] Number of items to sort
+        int                  begin_bit,             ///< [in] The beginning (least-significant) bit index needed for key comparison
+        int                  end_bit,               ///< [in] The past-the-end (most-significant) bit index needed for key comparison
+        bool                 is_overwrite_okay,     ///< [in] Whether it is okay to overwrite source buffers
+        cudaStream_t         stream,                ///< [in] CUDA stream to launch kernels within.  Default is stream0.
+        bool                 debug_synchronous)     ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
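+    // [Editor's worked example of the pass planning above, assuming
+    // radix_bits = 5 and alternate radix_bits = 4] For begin_bit = 0, end_bit = 32:
+    //   num_bits       = 32
+    //   num_passes     = ceil(32 / 5)     = 7
+    //   max_alt_passes = 7 * 5 - 32       = 3
+    //   alt_end_bit    = min(32, 0 + 3*4) = 12
+    // so bits [0,12) are consumed by three 4-bit passes and bits [12,32) by
+    // four 5-bit passes: seven passes covering all 32 bits exactly.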
+    {
+        typedef typename DispatchRadixSort::MaxPolicy MaxPolicyT;
+
+        cudaError_t error;
+        do {
+            // Get PTX version
+            int ptx_version;
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+
+            // Create dispatch functor
+            DispatchRadixSort dispatch(
+                d_temp_storage, temp_storage_bytes,
+                d_keys, d_values,
+                num_items, begin_bit, end_bit, is_overwrite_okay,
+                stream, debug_synchronous, ptx_version);
+
+            // Dispatch to chained policy
+            if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break;
+
+        } while (0);
+
+        return error;
+    }
+};
+
+
+
+
+/******************************************************************************
+ * Segmented dispatch
+ ******************************************************************************/
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for segmented device-wide radix sort
+ */
+template <
+    bool     IS_DESCENDING,     ///< Whether or not the sorted-order is high-to-low
+    typename KeyT,              ///< Key type
+    typename ValueT,            ///< Value type
+    typename OffsetIteratorT,   ///< Random-access input iterator type for reading segment offsets \iterator
+    typename OffsetT>           ///< Signed integer type for global offsets
+struct DispatchSegmentedRadixSort :
+    DeviceRadixSortPolicy<KeyT, ValueT, OffsetT>
+{
+    //------------------------------------------------------------------------------
+    // Constants
+    //------------------------------------------------------------------------------
+
+    enum
+    {
+        // Whether this is a keys-only (or key-value) sort
+        KEYS_ONLY = (Equals<ValueT, NullType>::VALUE),
+    };
+
+
+    //------------------------------------------------------------------------------
+    // Parameter members
+    //------------------------------------------------------------------------------
+
+    void                 *d_temp_storage;      ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+    size_t               &temp_storage_bytes;  ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+    DoubleBuffer<KeyT>   &d_keys;              ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+    DoubleBuffer<ValueT> &d_values;            ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+    OffsetT              num_items;            ///< [in] Number of items to sort
+    OffsetT              num_segments;         ///< [in] The number of segments that comprise the sorting data
+    OffsetIteratorT      d_begin_offsets;      ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_*
+    OffsetIteratorT      d_end_offsets;        ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*.  If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith segment is considered empty.
+    int                  begin_bit;            ///< [in] The beginning (least-significant) bit index needed for key comparison
+    int                  end_bit;              ///< [in] The past-the-end (most-significant) bit index needed for key comparison
+    cudaStream_t         stream;               ///< [in] CUDA stream to launch kernels within.  Default is stream0.
+    bool                 debug_synchronous;    ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
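+
+    // [Editor's note] Example of the segment-offset convention documented above:
+    // for keys {7,3,9, 5,1, 8,2} split into segments [0,3), [3,5), [5,7), pass
+    // d_begin_offsets = {0,3,5} and d_end_offsets = {3,5,7}; a segment whose end
+    // offset does not exceed its begin offset is treated as empty.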
+    int                  ptx_version;          ///< [in] PTX version
+    bool                 is_overwrite_okay;    ///< [in] Whether it is okay to overwrite source buffers
+
+
+    //------------------------------------------------------------------------------
+    // Constructors
+    //------------------------------------------------------------------------------
+
+    /// Constructor
+    CUB_RUNTIME_FUNCTION __forceinline__
+    DispatchSegmentedRadixSort(
+        void*                   d_temp_storage,
+        size_t                  &temp_storage_bytes,
+        DoubleBuffer<KeyT>      &d_keys,
+        DoubleBuffer<ValueT>    &d_values,
+        OffsetT                 num_items,
+        OffsetT                 num_segments,
+        OffsetIteratorT         d_begin_offsets,
+        OffsetIteratorT         d_end_offsets,
+        int                     begin_bit,
+        int                     end_bit,
+        bool                    is_overwrite_okay,
+        cudaStream_t            stream,
+        bool                    debug_synchronous,
+        int                     ptx_version)
+    :
+        d_temp_storage(d_temp_storage),
+        temp_storage_bytes(temp_storage_bytes),
+        d_keys(d_keys),
+        d_values(d_values),
+        num_items(num_items),
+        num_segments(num_segments),
+        d_begin_offsets(d_begin_offsets),
+        d_end_offsets(d_end_offsets),
+        begin_bit(begin_bit),
+        end_bit(end_bit),
+        is_overwrite_okay(is_overwrite_okay),
+        stream(stream),
+        debug_synchronous(debug_synchronous),
+        ptx_version(ptx_version)
+    {}
+
+
+    //------------------------------------------------------------------------------
+    // Multi-segment invocation
+    //------------------------------------------------------------------------------
+
+    /// Invoke a three-kernel sorting pass at the current bit.
+    template <typename PassConfigT>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    cudaError_t InvokePass(
+        const KeyT      *d_keys_in,
+        KeyT            *d_keys_out,
+        const ValueT    *d_values_in,
+        ValueT          *d_values_out,
+        int             &current_bit,
+        PassConfigT     &pass_config)
+    {
+        cudaError error = cudaSuccess;
+        do
+        {
+            int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit));
+
+            // Log kernel configuration
+            if (debug_synchronous)
+                _CubLog("Invoking segmented_kernels<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",
+                    num_segments, pass_config.segmented_config.block_threads, (long long) stream,
+                    pass_config.segmented_config.items_per_thread, pass_config.segmented_config.sm_occupancy, current_bit, pass_bits);
+
+            pass_config.segmented_kernel<<<num_segments, pass_config.segmented_config.block_threads, 0, stream>>>(
+                d_keys_in, d_keys_out,
+                d_values_in, d_values_out,
+                d_begin_offsets, d_end_offsets, num_segments,
+                current_bit, pass_bits);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+            // Update current bit
+            current_bit += pass_bits;
+        }
+        while (0);
+
+        return error;
+    }
+
+
+    /// PassConfig data structure
+    template <typename SegmentedKernelT>
+    struct PassConfig
+    {
+        SegmentedKernelT    segmented_kernel;
+        KernelConfig        segmented_config;
+        int                 radix_bits;
+        int                 radix_digits;
+
+        /// Initialize pass configuration
+        template <typename SegmentedPolicyT>
+        CUB_RUNTIME_FUNCTION __forceinline__
+        cudaError_t InitPassConfig(SegmentedKernelT segmented_kernel)
+        {
+            this->segmented_kernel  = segmented_kernel;
+            this->radix_bits        = SegmentedPolicyT::RADIX_BITS;
+            this->radix_digits      = 1 << radix_bits;
+
+            return CubDebug(segmented_config.Init<SegmentedPolicyT>(segmented_kernel));
+        }
+    };
+
+
+    /// Invocation (run multiple digit passes)
+    template <
+        typename ActivePolicyT,      ///< Umbrella policy active for the target device
+        typename SegmentedKernelT>   ///< Function type of cub::DeviceSegmentedRadixSortKernel
+    CUB_RUNTIME_FUNCTION __forceinline__
+    cudaError_t InvokePasses(
+        SegmentedKernelT segmented_kernel,       ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentedRadixSortKernel
+        SegmentedKernelT alt_segmented_kernel)   ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceSegmentedRadixSortKernel
+    {
+#ifndef CUB_RUNTIME_ENABLED
+        (void)segmented_kernel;
+        (void)alt_segmented_kernel;
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported );
+#else
+
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Init regular and alternate kernel configurations
+            PassConfig<SegmentedKernelT> pass_config, alt_pass_config;
+            if ((error = pass_config.template InitPassConfig<typename ActivePolicyT::SegmentedPolicy>(segmented_kernel))) break;
+            if ((error = alt_pass_config.template InitPassConfig<typename ActivePolicyT::AltSegmentedPolicy>(alt_segmented_kernel))) break;
+
+            // Temporary storage allocation requirements
+            void* allocations[2];
+            size_t allocation_sizes[2] =
+            {
+                (is_overwrite_okay) ? 0 : num_items * sizeof(KeyT),                   // bytes needed for 3rd keys buffer
+                (is_overwrite_okay || (KEYS_ONLY)) ? 0 : num_items * sizeof(ValueT),  // bytes needed for 3rd values buffer
+            };
+
+            // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
+            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+
+            // Return if the caller is simply requesting the size of the storage allocation
+            if (d_temp_storage == NULL)
+            {
+                if (temp_storage_bytes == 0)
+                    temp_storage_bytes = 1;
+                return cudaSuccess;
+            }
+
+            // Pass planning.  Run passes of the alternate digit-size configuration until we have an even multiple of our preferred digit size
+            int radix_bits          = ActivePolicyT::SegmentedPolicy::RADIX_BITS;
+            int alt_radix_bits      = ActivePolicyT::AltSegmentedPolicy::RADIX_BITS;
+            int num_bits            = end_bit - begin_bit;
+            int num_passes          = (num_bits + radix_bits - 1) / radix_bits;
+            bool is_num_passes_odd  = num_passes & 1;
+            int max_alt_passes      = (num_passes * radix_bits) - num_bits;
+            int alt_end_bit         = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_radix_bits));
+
+            DoubleBuffer<KeyT> d_keys_remaining_passes(
+                (is_overwrite_okay || is_num_passes_odd) ? d_keys.Alternate() : static_cast<KeyT*>(allocations[0]),
+                (is_overwrite_okay) ? d_keys.Current() : (is_num_passes_odd) ? static_cast<KeyT*>(allocations[0]) : d_keys.Alternate());
+
+            DoubleBuffer<ValueT> d_values_remaining_passes(
+                (is_overwrite_okay || is_num_passes_odd) ? d_values.Alternate() : static_cast<ValueT*>(allocations[1]),
+                (is_overwrite_okay) ? d_values.Current() : (is_num_passes_odd) ? static_cast<ValueT*>(allocations[1]) : d_values.Alternate());
+
+            // Run first pass, consuming from the input's current buffers
+            int current_bit = begin_bit;
+
+            if (CubDebug(error = InvokePass(
+                d_keys.Current(), d_keys_remaining_passes.Current(),
+                d_values.Current(), d_values_remaining_passes.Current(),
+                current_bit,
+                (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;
+
+            // Run remaining passes
+            while (current_bit < end_bit)
+            {
+                if (CubDebug(error = InvokePass(
+                    d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
+                    d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
+                    current_bit,
+                    (current_bit < alt_end_bit) ?
+                        alt_pass_config : pass_config))) break;
+
+                // Invert selectors and update current bit
+                d_keys_remaining_passes.selector ^= 1;
+                d_values_remaining_passes.selector ^= 1;
+            }
+
+            // Update selector
+            if (!is_overwrite_okay) {
+                num_passes = 1; // Sorted data always ends up in the other vector
+            }
+
+            d_keys.selector = (d_keys.selector + num_passes) & 1;
+            d_values.selector = (d_values.selector + num_passes) & 1;
+        }
+        while (0);
+
+        return error;
+
+#endif // CUB_RUNTIME_ENABLED
+    }
+
+
+    //------------------------------------------------------------------------------
+    // Chained policy invocation
+    //------------------------------------------------------------------------------
+
+    /// Invocation
+    template <typename ActivePolicyT>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    cudaError_t Invoke()
+    {
+        typedef typename DispatchSegmentedRadixSort::MaxPolicy MaxPolicyT;
+
+        // Force kernel code-generation in all compiler passes
+        return InvokePasses<ActivePolicyT>(
+            DeviceSegmentedRadixSortKernel<MaxPolicyT, false, IS_DESCENDING, KeyT, ValueT, OffsetIteratorT, OffsetT>,
+            DeviceSegmentedRadixSortKernel<MaxPolicyT, true,  IS_DESCENDING, KeyT, ValueT, OffsetIteratorT, OffsetT>);
+    }
+
+
+    //------------------------------------------------------------------------------
+    // Dispatch entrypoints
+    //------------------------------------------------------------------------------
+
+
+    /// Internal dispatch routine
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void*                d_temp_storage,        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t               &temp_storage_bytes,   ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        DoubleBuffer<KeyT>   &d_keys,               ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
+        DoubleBuffer<ValueT> &d_values,             ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
+        int                  num_items,             ///< [in] Number of items to sort
+        int                  num_segments,          ///< [in] The number of segments that comprise the sorting data
+        OffsetIteratorT      d_begin_offsets,       ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_*
+        OffsetIteratorT      d_end_offsets,         ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*.  If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith segment is considered empty.
+        int                  begin_bit,             ///< [in] The beginning (least-significant) bit index needed for key comparison
+        int                  end_bit,               ///< [in] The past-the-end (most-significant) bit index needed for key comparison
+        bool                 is_overwrite_okay,     ///< [in] Whether it is okay to overwrite source buffers
+        cudaStream_t         stream,                ///< [in] CUDA stream to launch kernels within.  Default is stream0.
+        bool                 debug_synchronous)     ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
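+#if 0
+    // [Editor's sketch -- not part of the original file] This dispatcher backs
+    // the public cub::DeviceSegmentedRadixSort front-end; a typical call
+    // (buffer names are illustrative) looks like:
+    cub::DoubleBuffer<int> keys(d_keys_in, d_keys_alt);
+    void   *d_temp = NULL;
+    size_t temp_bytes = 0;
+    cub::DeviceSegmentedRadixSort::SortKeys(d_temp, temp_bytes, keys,
+        num_items, num_segments, d_offsets, d_offsets + 1);
+    cudaMalloc(&d_temp, temp_bytes);
+    cub::DeviceSegmentedRadixSort::SortKeys(d_temp, temp_bytes, keys,
+        num_items, num_segments, d_offsets, d_offsets + 1);
+#endif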
+ { + typedef typename DispatchSegmentedRadixSort::MaxPolicy MaxPolicyT; + + cudaError_t error; + do { + // Get PTX version + int ptx_version; + if (CubDebug(error = PtxVersion(ptx_version))) break; + + // Create dispatch functor + DispatchSegmentedRadixSort dispatch( + d_temp_storage, temp_storage_bytes, + d_keys, d_values, + num_items, num_segments, d_begin_offsets, d_end_offsets, + begin_bit, end_bit, is_overwrite_okay, + stream, debug_synchronous, ptx_version); + + // Dispatch to chained policy + if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break; + + } while (0); + + return error; + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/SRC/cub/device/dispatch/dispatch_reduce.cuh b/SRC/cub/device/dispatch/dispatch_reduce.cuh new file mode 100644 index 00000000..e9d1b7ac --- /dev/null +++ b/SRC/cub/device/dispatch/dispatch_reduce.cuh @@ -0,0 +1,882 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory. 
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "../../agent/agent_reduce.cuh"
+#include "../../iterator/arg_index_input_iterator.cuh"
+#include "../../thread/thread_operators.cuh"
+#include "../../grid/grid_even_share.cuh"
+#include "../../util_debug.cuh"
+#include "../../util_device.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/******************************************************************************
+ * Kernel entry points
+ *****************************************************************************/
+
+/**
+ * Reduce region kernel entry point (multi-block).  Computes privatized reductions, one per thread block.
+ */
+template <
+    typename ChainedPolicyT,    ///< Chained tuning policy
+    typename InputIteratorT,    ///< Random-access input iterator type for reading input items \iterator
+    typename OutputIteratorT,   ///< Output iterator type for recording the reduced aggregate \iterator
+    typename OffsetT,           ///< Signed integer type for global offsets
+    typename ReductionOpT>      ///< Binary reduction functor type having member T operator()(const T &a, const T &b)
+__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS))
+__global__ void DeviceReduceKernel(
+    InputIteratorT          d_in,           ///< [in] Pointer to the input sequence of data items
+    OutputIteratorT         d_out,          ///< [out] Pointer to the output aggregate
+    OffsetT                 num_items,      ///< [in] Total number of input data items
+    GridEvenShare<OffsetT>  even_share,     ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block
+    ReductionOpT            reduction_op)   ///< [in] Binary reduction functor
+{
+    // The output value type
+    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT = (if output iterator's value type is void) ?
+        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
+        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+    // Thread block type for reducing input tiles
+    typedef AgentReduce<
+            typename ChainedPolicyT::ActivePolicy::ReducePolicy,
+            InputIteratorT,
+            OutputIteratorT,
+            OffsetT,
+            ReductionOpT>
+        AgentReduceT;
+
+    // Shared memory storage
+    __shared__ typename AgentReduceT::TempStorage temp_storage;
+
+    // Consume input tiles
+    OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeTiles(even_share);
+
+    // Output result
+    if (threadIdx.x == 0)
+        d_out[blockIdx.x] = block_aggregate;
+}
+
+
+/**
+ * Reduce a single tile kernel entry point (single-block).  Can be used to aggregate privatized thread block reductions from a previous multi-block reduction pass.
+ */
+template <
+    typename ChainedPolicyT,    ///< Chained tuning policy
+    typename InputIteratorT,    ///< Random-access input iterator type for reading input items \iterator
+    typename OutputIteratorT,   ///< Output iterator type for recording the reduced aggregate \iterator
+    typename OffsetT,           ///< Signed integer type for global offsets
+    typename ReductionOpT,      ///< Binary reduction functor type having member T operator()(const T &a, const T &b)
+    typename OuputT>            ///< Data element type that is convertible to the \p value type of \p OutputIteratorT
+__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1)
+__global__ void DeviceReduceSingleTileKernel(
+    InputIteratorT  d_in,           ///< [in] Pointer to the input sequence of data items
+    OutputIteratorT d_out,          ///< [out] Pointer to the output aggregate
+    OffsetT         num_items,      ///< [in] Total number of input data items
+    ReductionOpT    reduction_op,   ///< [in] Binary reduction functor
+    OuputT          init)           ///< [in] The initial value of the reduction
+{
+    // Thread block type for reducing input tiles
+    typedef AgentReduce<
+            typename ChainedPolicyT::ActivePolicy::SingleTilePolicy,
+            InputIteratorT,
+            OutputIteratorT,
+            OffsetT,
+            ReductionOpT>
+        AgentReduceT;
+
+    // Shared memory storage
+    __shared__ typename AgentReduceT::TempStorage temp_storage;
+
+    // Check if empty problem
+    if (num_items == 0)
+    {
+        if (threadIdx.x == 0)
+            *d_out = init;
+        return;
+    }
+
+    // Consume input tiles
+    OuputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange(
+        OffsetT(0),
+        num_items);
+
+    // Output result
+    if (threadIdx.x == 0)
+        *d_out = reduction_op(init, block_aggregate);
+}
+
+
+/// Normalize input iterator to segment offset
+template <typename T, typename OffsetT, typename IteratorT>
+__device__ __forceinline__
+void NormalizeReductionOutput(
+    T &/*val*/,
+    OffsetT /*base_offset*/,
+    IteratorT /*itr*/)
+{}
+
+
+/// Normalize input iterator to segment offset (specialized for arg-index)
+template <typename KeyValuePairT, typename OffsetT, typename WrappedIteratorT, typename OutputValueT>
+__device__ __forceinline__
+void NormalizeReductionOutput(
+    KeyValuePairT &val,
+    OffsetT base_offset,
+    ArgIndexInputIterator<WrappedIteratorT, OffsetT, OutputValueT> /*itr*/)
+{
+    val.key -= base_offset;
+}
+
+
+/**
+ * Segmented reduction (one block per segment)
+ */
+template <
+    typename ChainedPolicyT,    ///< Chained tuning policy
+    typename InputIteratorT,    ///< Random-access input iterator type for reading input items \iterator
+    typename OutputIteratorT,   ///< Output iterator type for recording the reduced aggregate \iterator
+    typename OffsetIteratorT,   ///< Random-access input iterator type for reading segment offsets \iterator
+    typename OffsetT,           ///< Signed integer type for global offsets
+    typename ReductionOpT,      ///< Binary reduction functor type having member T operator()(const T &a, const T &b)
+    typename OutputT>           ///< Data element type that is convertible to the \p value type of \p OutputIteratorT
+__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS))
+__global__ void DeviceSegmentedReduceKernel(
+    InputIteratorT  d_in,               ///< [in] Pointer to the input sequence of data items
+    OutputIteratorT d_out,              ///< [out] Pointer to the output aggregate
+    OffsetIteratorT d_begin_offsets,    ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_*
+    OffsetIteratorT d_end_offsets,      ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*.
If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int /*num_segments*/, ///< [in] The number of segments that comprise the sorting data + ReductionOpT reduction_op, ///< [in] Binary reduction functor + OutputT init) ///< [in] The initial value of the reduction +{ + // Thread block type for reducing input tiles + typedef AgentReduce< + typename ChainedPolicyT::ActivePolicy::ReducePolicy, + InputIteratorT, + OutputIteratorT, + OffsetT, + ReductionOpT> + AgentReduceT; + + // Shared memory storage + __shared__ typename AgentReduceT::TempStorage temp_storage; + + OffsetT segment_begin = d_begin_offsets[blockIdx.x]; + OffsetT segment_end = d_end_offsets[blockIdx.x]; + + // Check if empty problem + if (segment_begin == segment_end) + { + if (threadIdx.x == 0) + d_out[blockIdx.x] = init; + return; + } + + // Consume input tiles + OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange( + segment_begin, + segment_end); + + // Normalize as needed + NormalizeReductionOutput(block_aggregate, segment_begin, d_in); + + if (threadIdx.x == 0) + d_out[blockIdx.x] = reduction_op(init, block_aggregate);; +} + + + + +/****************************************************************************** + * Policy + ******************************************************************************/ + +template < + typename OuputT, ///< Data type + typename OffsetT, ///< Signed integer type for global offsets + typename ReductionOpT> ///< Binary reduction functor type having member T operator()(const T &a, const T &b) +struct DeviceReducePolicy +{ + //------------------------------------------------------------------------------ + // Architecture-specific tuning policies + //------------------------------------------------------------------------------ + + /// SM13 + struct Policy130 : ChainedPolicy<130, Policy130, Policy130> + { + // ReducePolicy + typedef AgentReducePolicy< + CUB_SCALED_GRANULARITIES(128, 8, OuputT), ///< Threads per block, items per thread + 2, ///< Number of items per vectorized load + BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use + LOAD_DEFAULT> ///< Cache load modifier + ReducePolicy; + + // SingleTilePolicy + typedef ReducePolicy SingleTilePolicy; + + // SegmentedReducePolicy + typedef ReducePolicy SegmentedReducePolicy; + }; + + + /// SM20 + struct Policy200 : ChainedPolicy<200, Policy200, Policy130> + { + // ReducePolicy (GTX 580: 178.9 GB/s @ 48M 4B items, 158.1 GB/s @ 192M 1B items) + typedef AgentReducePolicy< + CUB_SCALED_GRANULARITIES(128, 8, OuputT), ///< Threads per block, items per thread + 4, ///< Number of items per vectorized load + BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use + LOAD_DEFAULT> ///< Cache load modifier + ReducePolicy; + + // SingleTilePolicy + typedef ReducePolicy SingleTilePolicy; + + // SegmentedReducePolicy + typedef ReducePolicy SegmentedReducePolicy; + }; + + + /// SM30 + struct Policy300 : ChainedPolicy<300, Policy300, Policy200> + { + // ReducePolicy (GTX670: 154.0 @ 48M 4B items) + typedef AgentReducePolicy< + CUB_SCALED_GRANULARITIES(256, 20, OuputT), ///< Threads per block, items per thread + 2, ///< Number of items per vectorized load + BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use + LOAD_DEFAULT> ///< Cache load modifier + ReducePolicy; + + // SingleTilePolicy + typedef ReducePolicy SingleTilePolicy; + + // SegmentedReducePolicy + typedef ReducePolicy SegmentedReducePolicy; + }; + + + /// SM35 + struct 
Policy350 : ChainedPolicy<350, Policy350, Policy300>
+    {
+        // ReducePolicy (GTX Titan: 255.1 GB/s @ 48M 4B items; 228.7 GB/s @ 192M 1B items)
+        typedef AgentReducePolicy<
+                CUB_SCALED_GRANULARITIES(256, 20, OuputT),  ///< Threads per block, items per thread
+                4,                                          ///< Number of items per vectorized load
+                BLOCK_REDUCE_WARP_REDUCTIONS,               ///< Cooperative block-wide reduction algorithm to use
+                LOAD_LDG>                                   ///< Cache load modifier
+            ReducePolicy;
+
+        // SingleTilePolicy
+        typedef ReducePolicy SingleTilePolicy;
+
+        // SegmentedReducePolicy
+        typedef ReducePolicy SegmentedReducePolicy;
+    };
+
+    /// SM60
+    struct Policy600 : ChainedPolicy<600, Policy600, Policy350>
+    {
+        // ReducePolicy (P100: 591 GB/s @ 64M 4B items; 583 GB/s @ 256M 1B items)
+        typedef AgentReducePolicy<
+                CUB_SCALED_GRANULARITIES(256, 16, OuputT),  ///< Threads per block, items per thread
+                4,                                          ///< Number of items per vectorized load
+                BLOCK_REDUCE_WARP_REDUCTIONS,               ///< Cooperative block-wide reduction algorithm to use
+                LOAD_LDG>                                   ///< Cache load modifier
+            ReducePolicy;
+
+        // SingleTilePolicy
+        typedef ReducePolicy SingleTilePolicy;
+
+        // SegmentedReducePolicy
+        typedef ReducePolicy SegmentedReducePolicy;
+    };
+
+
+    /// MaxPolicy
+    typedef Policy600 MaxPolicy;
+
+};
+
+
+
+/******************************************************************************
+ * Single-problem dispatch
+ ******************************************************************************/
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for device-wide reduction
+ */
+template <
+    typename InputIteratorT,    ///< Random-access input iterator type for reading input items \iterator
+    typename OutputIteratorT,   ///< Output iterator type for recording the reduced aggregate \iterator
+    typename OffsetT,           ///< Signed integer type for global offsets
+    typename ReductionOpT>      ///< Binary reduction functor type having member T operator()(const T &a, const T &b)
+struct DispatchReduce :
+    DeviceReducePolicy<
+        typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT = (if output iterator's value type is void) ?
+            typename std::iterator_traits<InputIteratorT>::value_type,                                  // ... then the input iterator's value type,
+            typename std::iterator_traits<OutputIteratorT>::value_type>::Type,                          // ... else the output iterator's value type
+        OffsetT,
+        ReductionOpT>
+{
+    //------------------------------------------------------------------------------
+    // Constants
+    //------------------------------------------------------------------------------
+
+    // Data type of output iterator
+    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT = (if output iterator's value type is void) ?
+        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
+        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+
+    //------------------------------------------------------------------------------
+    // Problem state
+    //------------------------------------------------------------------------------
+
+    void            *d_temp_storage;        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
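+
+#if 0
+    // [Editor's sketch -- not part of the original file] The NULL-query
+    // protocol documented above, exercised via the public cub::DeviceReduce
+    // front-end (buffer names are illustrative):
+    void   *d_temp_storage = NULL;
+    size_t temp_storage_bytes = 0;
+    // First call writes the required size into temp_storage_bytes
+    cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+    cudaMalloc(&d_temp_storage, temp_storage_bytes);
+    // Second call performs the reduction
+    cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
+#endif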
+    size_t          &temp_storage_bytes;    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+    InputIteratorT  d_in;                   ///< [in] Pointer to the input sequence of data items
+    OutputIteratorT d_out;                  ///< [out] Pointer to the output aggregate
+    OffsetT         num_items;              ///< [in] Total number of input items (i.e., length of \p d_in)
+    ReductionOpT    reduction_op;           ///< [in] Binary reduction functor
+    OutputT         init;                   ///< [in] The initial value of the reduction
+    cudaStream_t    stream;                 ///< [in] CUDA stream to launch kernels within.  Default is stream0.
+    bool            debug_synchronous;      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+    int             ptx_version;            ///< [in] PTX version
+
+    //------------------------------------------------------------------------------
+    // Constructor
+    //------------------------------------------------------------------------------
+
+    /// Constructor
+    CUB_RUNTIME_FUNCTION __forceinline__
+    DispatchReduce(
+        void*                   d_temp_storage,
+        size_t                  &temp_storage_bytes,
+        InputIteratorT          d_in,
+        OutputIteratorT         d_out,
+        OffsetT                 num_items,
+        ReductionOpT            reduction_op,
+        OutputT                 init,
+        cudaStream_t            stream,
+        bool                    debug_synchronous,
+        int                     ptx_version)
+    :
+        d_temp_storage(d_temp_storage),
+        temp_storage_bytes(temp_storage_bytes),
+        d_in(d_in),
+        d_out(d_out),
+        num_items(num_items),
+        reduction_op(reduction_op),
+        init(init),
+        stream(stream),
+        debug_synchronous(debug_synchronous),
+        ptx_version(ptx_version)
+    {}
+
+
+    //------------------------------------------------------------------------------
+    // Small-problem (single tile) invocation
+    //------------------------------------------------------------------------------
+
+    /// Invoke a single block to reduce in-core
+    template <
+        typename ActivePolicyT,       ///< Umbrella policy active for the target device
+        typename SingleTileKernelT>   ///< Function type of cub::DeviceReduceSingleTileKernel
+    CUB_RUNTIME_FUNCTION __forceinline__
+    cudaError_t InvokeSingleTile(
+        SingleTileKernelT single_tile_kernel)   ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceSingleTileKernel
+    {
+#ifndef CUB_RUNTIME_ENABLED
+        (void)single_tile_kernel;
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported );
+#else
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Return if the caller is simply requesting the size of the storage allocation
+            if (d_temp_storage == NULL)
+            {
+                temp_storage_bytes = 1;
+                break;
+            }
+
+            // Log single_reduce_sweep_kernel configuration
+            if (debug_synchronous) _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), %d items per thread\n",
+                ActivePolicyT::SingleTilePolicy::BLOCK_THREADS,
+                (long long) stream,
+                ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD);
+
+            // Invoke single_reduce_sweep_kernel
+            single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>(
+                d_in,
+                d_out,
+                num_items,
+                reduction_op,
+                init);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+        }
+        while (0);
+
+        return error;
+
+#endif // CUB_RUNTIME_ENABLED
+    }
+
+
+    //------------------------------------------------------------------------------
+    // Normal problem size invocation (two-pass)
+    //------------------------------------------------------------------------------
+
+    /// Invoke two-passes to reduce
+    template <
+        typename ActivePolicyT,       ///< Umbrella policy active for the target device
+        typename ReduceKernelT,       ///< Function type of cub::DeviceReduceKernel
+        typename SingleTileKernelT>   ///< Function type of cub::DeviceReduceSingleTileKernel
+    CUB_RUNTIME_FUNCTION __forceinline__
+    cudaError_t InvokePasses(
+        ReduceKernelT       reduce_kernel,        ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceKernel
+        SingleTileKernelT   single_tile_kernel)   ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceSingleTileKernel
+    {
+#ifndef CUB_RUNTIME_ENABLED
+        (void) reduce_kernel;
+        (void) single_tile_kernel;
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported );
+#else
+
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get device ordinal
+            int device_ordinal;
+            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
+            // Get SM count
+            int sm_count;
+            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
+
+            // Init regular kernel configuration
+            KernelConfig reduce_config;
+            if (CubDebug(error = reduce_config.Init<typename ActivePolicyT::ReducePolicy>(reduce_kernel))) break;
+            int reduce_device_occupancy = reduce_config.sm_occupancy * sm_count;
+
+            // Even-share work distribution
+            int max_blocks = reduce_device_occupancy * CUB_SUBSCRIPTION_FACTOR(ptx_version);
+            GridEvenShare<OffsetT> even_share;
+            even_share.DispatchInit(num_items, max_blocks, reduce_config.tile_size);
+
+            // Temporary storage allocation requirements
+            void* allocations[1];
+            size_t allocation_sizes[1] =
+            {
+                max_blocks * sizeof(OutputT)    // bytes needed for privatized block reductions
+            };
+
+            // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
+            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+            if (d_temp_storage == NULL)
+            {
+                // Return if the caller is simply requesting the size of the storage allocation
+                return cudaSuccess;
+            }
+
+            // Alias the allocation for the privatized per-block reductions
+            OutputT *d_block_reductions = (OutputT*) allocations[0];
+
+            // Get grid size for device_reduce_sweep_kernel
+            int reduce_grid_size = even_share.grid_size;
+
+            // Log device_reduce_sweep_kernel configuration
+            if (debug_synchronous) _CubLog("Invoking DeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+                reduce_grid_size,
+                ActivePolicyT::ReducePolicy::BLOCK_THREADS,
+                (long long) stream,
+                ActivePolicyT::ReducePolicy::ITEMS_PER_THREAD,
+                reduce_config.sm_occupancy);
+
+            // Invoke DeviceReduceKernel
+            reduce_kernel<<<reduce_grid_size, ActivePolicyT::ReducePolicy::BLOCK_THREADS, 0, stream>>>(
+                d_in,
+                d_block_reductions,
+                num_items,
+                even_share,
+                reduction_op);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+            // Log single_reduce_sweep_kernel configuration
+            if (debug_synchronous) _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), %d items per thread\n",
+                ActivePolicyT::SingleTilePolicy::BLOCK_THREADS,
+                (long long) stream,
+                ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD);
+
+            // Invoke DeviceReduceSingleTileKernel
+            single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>(
+                d_block_reductions,
+                d_out,
+                reduce_grid_size,
+                reduction_op,
+                init);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+        }
+        while (0);
+
+        return error;
+
+#endif // CUB_RUNTIME_ENABLED
+
+    }
+
+
+    //------------------------------------------------------------------------------
+    // Chained policy invocation
+    //------------------------------------------------------------------------------
+
+    /// Invocation
+    template <typename ActivePolicyT>
+    CUB_RUNTIME_FUNCTION __forceinline__
+    cudaError_t Invoke()
+    {
+        typedef typename ActivePolicyT::SingleTilePolicy    SingleTilePolicyT;
+        typedef typename DispatchReduce::MaxPolicy          MaxPolicyT;
+
+        // Force kernel code-generation in all compiler passes
+        if (num_items <= (SingleTilePolicyT::BLOCK_THREADS * SingleTilePolicyT::ITEMS_PER_THREAD))
+        {
+            // Small, single tile size
+            return InvokeSingleTile<ActivePolicyT>(
+                DeviceReduceSingleTileKernel<MaxPolicyT, InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT, OutputT>);
+        }
+        else
+        {
+            // Regular size
+            return InvokePasses<ActivePolicyT>(
+                DeviceReduceKernel<MaxPolicyT, InputIteratorT, OutputT*, OffsetT, ReductionOpT>,
+                DeviceReduceSingleTileKernel<MaxPolicyT, OutputT*, OutputIteratorT, OffsetT, ReductionOpT, OutputT>);
+        }
+    }
+
+
+    //------------------------------------------------------------------------------
+    // Dispatch entrypoints
+    //------------------------------------------------------------------------------
+
+    /**
+     * Internal dispatch routine for computing a device-wide reduction
+     */
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void            *d_temp_storage,        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t          &temp_storage_bytes,    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT  d_in,                   ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT d_out,                  ///< [out] Pointer to the output aggregate
+        OffsetT         num_items,              ///< [in] Total number of input items (i.e., length of \p d_in)
+        ReductionOpT    reduction_op,           ///< [in] Binary reduction functor
+        OutputT         init,                   ///< [in] The initial value of the reduction
+        cudaStream_t    stream,                 ///< [in] [optional] CUDA stream to launch kernels within.  Default is stream0.
+        bool            debug_synchronous)      ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
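+    // [Editor's note] Invoke() above takes the single-tile path whenever
+    // num_items <= BLOCK_THREADS * ITEMS_PER_THREAD of the active
+    // SingleTilePolicy; e.g., with Policy600's nominal 256 threads x 16 items
+    // (before CUB_SCALED_GRANULARITIES rescales for the data type), problems up
+    // to roughly 4096 items are reduced by a single thread block, skipping the
+    // two-pass path entirely.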
+    {
+        typedef typename DispatchReduce::MaxPolicy MaxPolicyT;
+
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get PTX version
+            int ptx_version;
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+
+            // Create dispatch functor
+            DispatchReduce dispatch(
+                d_temp_storage, temp_storage_bytes,
+                d_in, d_out, num_items, reduction_op, init,
+                stream, debug_synchronous, ptx_version);
+
+            // Dispatch to chained policy
+            if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break;
+        }
+        while (0);
+
+        return error;
+    }
+};
+
+
+
+/******************************************************************************
+ * Segmented dispatch
+ ******************************************************************************/
+
+/**
+ * Utility class for dispatching the appropriately-tuned kernels for device-wide reduction
+ */
+template <
+    typename InputIteratorT,    ///< Random-access input iterator type for reading input items \iterator
+    typename OutputIteratorT,   ///< Output iterator type for recording the reduced aggregate \iterator
+    typename OffsetIteratorT,   ///< Random-access input iterator type for reading segment offsets \iterator
+    typename OffsetT,           ///< Signed integer type for global offsets
+    typename ReductionOpT>      ///< Binary reduction functor type having member T operator()(const T &a, const T &b)
+struct DispatchSegmentedReduce :
+    DeviceReducePolicy<
+        typename std::iterator_traits<InputIteratorT>::value_type,
+        OffsetT,
+        ReductionOpT>
+{
+    //------------------------------------------------------------------------------
+    // Constants
+    //------------------------------------------------------------------------------
+
+    /// The output value type
+    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT = (if output iterator's value type is void) ?
+        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
+        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
+
+
+    //------------------------------------------------------------------------------
+    // Problem state
+    //------------------------------------------------------------------------------
+
+    void            *d_temp_storage;        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+    size_t          &temp_storage_bytes;    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+    InputIteratorT  d_in;                   ///< [in] Pointer to the input sequence of data items
+    OutputIteratorT d_out;                  ///< [out] Pointer to the output aggregate
+    OffsetT         num_segments;           ///< [in] The number of segments that comprise the sorting data
+    OffsetIteratorT d_begin_offsets;        ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_*
+    OffsetIteratorT d_end_offsets;          ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*.  If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith segment is considered empty.
+    ReductionOpT    reduction_op;           ///< [in] Binary reduction functor
+    OutputT         init;                   ///< [in] The initial value of the reduction
+    cudaStream_t    stream;                 ///< [in] CUDA stream to launch kernels within.  Default is stream0.
+    bool            debug_synchronous;      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
Also causes launch configurations to be printed to the console. Default is \p false. + int ptx_version; ///< [in] PTX version + + //------------------------------------------------------------------------------ + // Constructor + //------------------------------------------------------------------------------ + + /// Constructor + CUB_RUNTIME_FUNCTION __forceinline__ + DispatchSegmentedReduce( + void* d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + OffsetT num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + ReductionOpT reduction_op, + OutputT init, + cudaStream_t stream, + bool debug_synchronous, + int ptx_version) + : + d_temp_storage(d_temp_storage), + temp_storage_bytes(temp_storage_bytes), + d_in(d_in), + d_out(d_out), + num_segments(num_segments), + d_begin_offsets(d_begin_offsets), + d_end_offsets(d_end_offsets), + reduction_op(reduction_op), + init(init), + stream(stream), + debug_synchronous(debug_synchronous), + ptx_version(ptx_version) + {} + + + + //------------------------------------------------------------------------------ + // Chained policy invocation + //------------------------------------------------------------------------------ + + /// Invocation + template < + typename ActivePolicyT, ///< Umbrella policy active for the target device + typename DeviceSegmentedReduceKernelT> ///< Function type of cub::DeviceSegmentedReduceKernel + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InvokePasses( + DeviceSegmentedReduceKernelT segmented_reduce_kernel) ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentedReduceKernel + { +#ifndef CUB_RUNTIME_ENABLED + (void)segmented_reduce_kernel; + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported ); +#else + cudaError error = cudaSuccess; + do + { + // Return if the caller is simply requesting the size of the storage allocation + if (d_temp_storage == NULL) + { + temp_storage_bytes = 1; + return cudaSuccess; + } + + // Init kernel configuration + KernelConfig segmented_reduce_config; + if (CubDebug(error = segmented_reduce_config.Init(segmented_reduce_kernel))) break; + + // Log device_reduce_sweep_kernel configuration + if (debug_synchronous) _CubLog("Invoking SegmentedDeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + num_segments, + ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS, + (long long) stream, + ActivePolicyT::SegmentedReducePolicy::ITEMS_PER_THREAD, + segmented_reduce_config.sm_occupancy); + + // Invoke DeviceReduceKernel + segmented_reduce_kernel<<>>( + d_in, + d_out, + d_begin_offsets, + d_end_offsets, + num_segments, + reduction_op, + init); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + + } + + + /// Invocation + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t Invoke() + { + typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT; + + // Force kernel code-generation in all compiler passes + return InvokePasses( + DeviceSegmentedReduceKernel); + } + + + //------------------------------------------------------------------------------ + // Dispatch entrypoints + //------------------------------------------------------------------------------ + + /** + * Internal dispatch routine 
for computing a device-wide reduction + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + ReductionOpT reduction_op, ///< [in] Binary reduction functor + OutputT init, ///< [in] The initial value of the reduction + cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT; + + if (num_segments <= 0) + return cudaSuccess; + + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + if (CubDebug(error = PtxVersion(ptx_version))) break; + + // Create dispatch functor + DispatchSegmentedReduce dispatch( + d_temp_storage, temp_storage_bytes, + d_in, d_out, + num_segments, d_begin_offsets, d_end_offsets, + reduction_op, init, + stream, debug_synchronous, ptx_version); + + // Dispatch to chained policy + if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break; + } + while (0); + + return error; + } +}; + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/SRC/cub/device/dispatch/dispatch_reduce_by_key.cuh b/SRC/cub/device/dispatch/dispatch_reduce_by_key.cuh new file mode 100644 index 00000000..6f4837b7 --- /dev/null +++ b/SRC/cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -0,0 +1,554 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceReduceByKey provides device-wide, parallel operations for reducing segments of values residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "dispatch_scan.cuh" +#include "../../agent/agent_reduce_by_key.cuh" +#include "../../thread/thread_operators.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../util_device.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +/** + * Multi-block reduce-by-key sweep kernel entry point + */ +template < + typename AgentReduceByKeyPolicyT, ///< Parameterized AgentReduceByKeyPolicyT tuning policy type + typename KeysInputIteratorT, ///< Random-access input iterator type for keys + typename UniqueOutputIteratorT, ///< Random-access output iterator type for keys + typename ValuesInputIteratorT, ///< Random-access input iterator type for values + typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values + typename NumRunsOutputIteratorT, ///< Output iterator type for recording number of segments encountered + typename ScanTileStateT, ///< Tile status interface type + typename EqualityOpT, ///< KeyT equality operator type + typename ReductionOpT, ///< ValueT reduction operator type + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int(AgentReduceByKeyPolicyT::BLOCK_THREADS)) +__global__ void DeviceReduceByKeyKernel( + KeysInputIteratorT d_keys_in, ///< Pointer to the input sequence of keys + UniqueOutputIteratorT d_unique_out, ///< Pointer to the output sequence of unique keys (one key per run) + ValuesInputIteratorT d_values_in, ///< Pointer to the input sequence of corresponding values + AggregatesOutputIteratorT d_aggregates_out, ///< Pointer to the output sequence of value aggregates (one aggregate per run) + NumRunsOutputIteratorT d_num_runs_out, ///< Pointer to total number of runs encountered (i.e., the length of d_unique_out) + ScanTileStateT tile_state, ///< Tile status interface + int start_tile, ///< The starting tile for the current grid + EqualityOpT equality_op, ///< KeyT equality operator + ReductionOpT reduction_op, ///< ValueT reduction operator + OffsetT num_items) ///< Total number of items to select from +{ + // Thread block type for reducing tiles of value segments + typedef AgentReduceByKey< + AgentReduceByKeyPolicyT, + KeysInputIteratorT, + UniqueOutputIteratorT, + 
ValuesInputIteratorT, + AggregatesOutputIteratorT, + NumRunsOutputIteratorT, + EqualityOpT, + ReductionOpT, + OffsetT> + AgentReduceByKeyT; + + // Shared memory for AgentReduceByKey + __shared__ typename AgentReduceByKeyT::TempStorage temp_storage; + + // Process tiles + AgentReduceByKeyT(temp_storage, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, equality_op, reduction_op).ConsumeRange( + num_items, + tile_state, + start_tile); +} + + + + +/****************************************************************************** + * Dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for DeviceReduceByKey + */ +template < + typename KeysInputIteratorT, ///< Random-access input iterator type for keys + typename UniqueOutputIteratorT, ///< Random-access output iterator type for keys + typename ValuesInputIteratorT, ///< Random-access input iterator type for values + typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values + typename NumRunsOutputIteratorT, ///< Output iterator type for recording number of segments encountered + typename EqualityOpT, ///< KeyT equality operator type + typename ReductionOpT, ///< ValueT reduction operator type + typename OffsetT> ///< Signed integer type for global offsets +struct DispatchReduceByKey +{ + //------------------------------------------------------------------------- + // Types and constants + //------------------------------------------------------------------------- + + // The input keys type + typedef typename std::iterator_traits::value_type KeyInputT; + + // The output keys type + typedef typename If<(Equals::value_type, void>::VALUE), // KeyOutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type KeyOutputT; // ... else the output iterator's value type + + // The input values type + typedef typename std::iterator_traits::value_type ValueInputT; + + // The output values type + typedef typename If<(Equals::value_type, void>::VALUE), // ValueOutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type ValueOutputT; // ... else the output iterator's value type + + enum + { + INIT_KERNEL_THREADS = 128, + MAX_INPUT_BYTES = CUB_MAX(sizeof(KeyOutputT), sizeof(ValueOutputT)), + COMBINED_INPUT_BYTES = sizeof(KeyOutputT) + sizeof(ValueOutputT), + }; + + // Tile status descriptor interface type + typedef ReduceByKeyScanTileState ScanTileStateT; + + + //------------------------------------------------------------------------- + // Tuning policies + //------------------------------------------------------------------------- + + /// SM35 + struct Policy350 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 6, + ITEMS_PER_THREAD = (MAX_INPUT_BYTES <= 8) ? 
6 : CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)), + }; + + typedef AgentReduceByKeyPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_DIRECT, + LOAD_LDG, + BLOCK_SCAN_WARP_SCANS> + ReduceByKeyPolicyT; + }; + + /// SM30 + struct Policy300 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 6, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)), + }; + + typedef AgentReduceByKeyPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS> + ReduceByKeyPolicyT; + }; + + /// SM20 + struct Policy200 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 11, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)), + }; + + typedef AgentReduceByKeyPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS> + ReduceByKeyPolicyT; + }; + + /// SM13 + struct Policy130 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 7, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)), + }; + + typedef AgentReduceByKeyPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS> + ReduceByKeyPolicyT; + }; + + /// SM11 + struct Policy110 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 5, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 8) / COMBINED_INPUT_BYTES)), + }; + + typedef AgentReduceByKeyPolicy< + 64, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_RAKING> + ReduceByKeyPolicyT; + }; + + + /****************************************************************************** + * Tuning policies of current PTX compiler pass + ******************************************************************************/ + +#if (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 130) + typedef Policy130 PtxPolicy; + +#else + typedef Policy110 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxReduceByKeyPolicy : PtxPolicy::ReduceByKeyPolicyT {}; + + + /****************************************************************************** + * Utilities + ******************************************************************************/ + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ + static void InitConfigs( + int ptx_version, + KernelConfig &reduce_by_key_config) + { + #if (CUB_PTX_ARCH > 0) + (void)ptx_version; + + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + reduce_by_key_config.template Init(); + + #else + + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 350) + { + reduce_by_key_config.template Init(); + } + else if (ptx_version >= 300) + { + reduce_by_key_config.template Init(); + } + else if (ptx_version >= 200) + { + reduce_by_key_config.template Init(); 
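A worked instance of the ITEMS_PER_THREAD formula above, for Policy300 (NOMINAL_4B_ITEMS_PER_THREAD = 6): with 4-byte keys and 4-byte values, COMBINED_INPUT_BYTES = 8, so ITEMS_PER_THREAD = CUB_MIN(6, CUB_MAX(1, (6*8 + 8 - 1)/8)) = 6; with 8-byte keys and values it shrinks to (48 + 15)/16 = 3, keeping the per-thread register footprint roughly constant. The public front-end that reaches this dispatch is cub::DeviceReduce::ReduceByKey; a minimal sketch, assuming device-resident int arrays sized for the worst case of num_items runs:

    #include <cub/cub.cuh>

    // Minimal sketch: sum values over runs of equal consecutive keys
    void reduce_by_key(int *d_keys_in, int *d_values_in,
                       int *d_unique_out, int *d_aggregates_out,
                       int *d_num_runs_out, int num_items)
    {
        void   *d_temp_storage     = NULL;
        size_t  temp_storage_bytes = 0;
        cub::Sum reduction_op;

        cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes,
            d_keys_in, d_unique_out, d_values_in, d_aggregates_out,
            d_num_runs_out, reduction_op, num_items);
        cudaMalloc(&d_temp_storage, temp_storage_bytes);
        cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes,
            d_keys_in, d_unique_out, d_values_in, d_aggregates_out,
            d_num_runs_out, reduction_op, num_items);
        cudaFree(d_temp_storage);
    }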
+        }
+        else if (ptx_version >= 130)
+        {
+            reduce_by_key_config.template Init<typename Policy130::ReduceByKeyPolicyT>();
+        }
+        else
+        {
+            reduce_by_key_config.template Init<typename Policy110::ReduceByKeyPolicyT>();
+        }
+
+    #endif
+    }
+
+
+    /**
+     * Kernel dispatch configuration
+     */
+    struct KernelConfig
+    {
+        int block_threads;
+        int items_per_thread;
+        int tile_items;
+
+        template <typename PolicyT>
+        CUB_RUNTIME_FUNCTION __forceinline__
+        void Init()
+        {
+            block_threads       = PolicyT::BLOCK_THREADS;
+            items_per_thread    = PolicyT::ITEMS_PER_THREAD;
+            tile_items          = block_threads * items_per_thread;
+        }
+    };
+
+
+    //---------------------------------------------------------------------
+    // Dispatch entrypoints
+    //---------------------------------------------------------------------
+
+    /**
+     * Internal dispatch routine for computing a device-wide reduce-by-key using the
+     * specified kernel functions.
+     */
+    template <
+        typename                    ScanInitKernelT,        ///< Function type of cub::DeviceScanInitKernel
+        typename                    ReduceByKeyKernelT>     ///< Function type of cub::DeviceReduceByKeyKernelT
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void*                       d_temp_storage,         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&                     temp_storage_bytes,     ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        KeysInputIteratorT          d_keys_in,              ///< [in] Pointer to the input sequence of keys
+        UniqueOutputIteratorT       d_unique_out,           ///< [out] Pointer to the output sequence of unique keys (one key per run)
+        ValuesInputIteratorT        d_values_in,            ///< [in] Pointer to the input sequence of corresponding values
+        AggregatesOutputIteratorT   d_aggregates_out,       ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run)
+        NumRunsOutputIteratorT      d_num_runs_out,         ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out)
+        EqualityOpT                 equality_op,            ///< [in] KeyT equality operator
+        ReductionOpT                reduction_op,           ///< [in] ValueT reduction operator
+        OffsetT                     num_items,              ///< [in] Total number of items to select from
+        cudaStream_t                stream,                 ///< [in] CUDA stream to launch kernels within.  Default is stream0.
+        bool                        debug_synchronous,      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
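The KernelConfig struct above captures a policy's compile-time enum constants into plain ints so the host-side dispatch logic can compute grid sizes without naming the policy type. A standalone sketch of that mirroring idea, where MyPolicy is a hypothetical stand-in for an Agent*Policy instantiation:

    // Hypothetical policy: its constants exist only at compile time
    struct MyPolicy
    {
        enum { BLOCK_THREADS = 128, ITEMS_PER_THREAD = 6 };
    };

    // Run-time mirror of the policy constants
    struct Config
    {
        int block_threads, items_per_thread, tile_items;

        template <typename PolicyT>
        void Init()
        {
            block_threads    = PolicyT::BLOCK_THREADS;
            items_per_thread = PolicyT::ITEMS_PER_THREAD;
            tile_items       = block_threads * items_per_thread;  // items per tile
        }
    };

    // Usage: Config c; c.Init<MyPolicy>();  // c.tile_items == 768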
+ int /*ptx_version*/, ///< [in] PTX version of dispatch kernels + ScanInitKernelT init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel + ReduceByKeyKernelT reduce_by_key_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceByKeyKernel + KernelConfig reduce_by_key_config) ///< [in] Dispatch parameters that match the policy that \p reduce_by_key_kernel was compiled for + { + +#ifndef CUB_RUNTIME_ENABLED + (void)d_temp_storage; + (void)temp_storage_bytes; + (void)d_keys_in; + (void)d_unique_out; + (void)d_values_in; + (void)d_aggregates_out; + (void)d_num_runs_out; + (void)equality_op; + (void)reduction_op; + (void)num_items; + (void)stream; + (void)debug_synchronous; + (void)init_kernel; + (void)reduce_by_key_kernel; + (void)reduce_by_key_config; + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported); + +#else + + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Number of input tiles + int tile_size = reduce_by_key_config.block_threads * reduce_by_key_config.items_per_thread; + int num_tiles = (num_items + tile_size - 1) / tile_size; + + // Specify temporary storage allocation requirements + size_t allocation_sizes[1]; + if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors + + // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob) + void* allocations[1]; + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + break; + } + + // Construct the tile status interface + ScanTileStateT tile_state; + if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; + + // Log init_kernel configuration + int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS); + if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); + + // Invoke init_kernel to initialize tile descriptors + init_kernel<<>>( + tile_state, + num_tiles, + d_num_runs_out); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Return if empty problem + if (num_items == 0) + break; + + // Get SM occupancy for reduce_by_key_kernel + int reduce_by_key_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + reduce_by_key_sm_occupancy, // out + reduce_by_key_kernel, + reduce_by_key_config.block_threads))) break; + + // Get max x-dimension of grid + int max_dim_x; + if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;; + + // Run grids in epochs (in case number of tiles exceeds max x-dimension + int scan_grid_size = CUB_MIN(num_tiles, max_dim_x); + for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size) + { + // Log reduce_by_key_kernel configuration + if (debug_synchronous) 
_CubLog("Invoking %d reduce_by_key_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + start_tile, scan_grid_size, reduce_by_key_config.block_threads, (long long) stream, reduce_by_key_config.items_per_thread, reduce_by_key_sm_occupancy); + + // Invoke reduce_by_key_kernel + reduce_by_key_kernel<<>>( + d_keys_in, + d_unique_out, + d_values_in, + d_aggregates_out, + d_num_runs_out, + tile_state, + start_tile, + equality_op, + reduction_op, + num_items); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + /** + * Internal dispatch routine + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + KeysInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys + UniqueOutputIteratorT d_unique_out, ///< [out] Pointer to the output sequence of unique keys (one key per run) + ValuesInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of corresponding values + AggregatesOutputIteratorT d_aggregates_out, ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run) + NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out) + EqualityOpT equality_op, ///< [in] KeyT equality operator + ReductionOpT reduction_op, ///< [in] ValueT reduction operator + OffsetT num_items, ///< [in] Total number of items to select from + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel kernel dispatch configurations + KernelConfig reduce_by_key_config; + InitConfigs(ptx_version, reduce_by_key_config); + + // Dispatch + if (CubDebug(error = Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_unique_out, + d_values_in, + d_aggregates_out, + d_num_runs_out, + equality_op, + reduction_op, + num_items, + stream, + debug_synchronous, + ptx_version, + DeviceCompactInitKernel, + DeviceReduceByKeyKernel, + reduce_by_key_config))) break; + } + while (0); + + return error; + } +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/SRC/cub/device/dispatch/dispatch_rle.cuh b/SRC/cub/device/dispatch/dispatch_rle.cuh new file mode 100644 index 00000000..98c3681f --- /dev/null +++ b/SRC/cub/device/dispatch/dispatch_rle.cuh @@ -0,0 +1,538 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceRle provides device-wide, parallel operations for run-length-encoding sequences of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "dispatch_scan.cuh" +#include "../../agent/agent_rle.cuh" +#include "../../thread/thread_operators.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../util_device.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +/** + * Select kernel entry point (multi-block) + * + * Performs functor-based selection if SelectOp functor type != NullType + * Otherwise performs flag-based selection if FlagIterator's value type != NullType + * Otherwise performs discontinuity selection (keep unique) + */ +template < + typename AgentRlePolicyT, ///< Parameterized AgentRlePolicyT tuning policy type + typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator + typename OffsetsOutputIteratorT, ///< Random-access output iterator type for writing run-offset values \iterator + typename LengthsOutputIteratorT, ///< Random-access output iterator type for writing run-length values \iterator + typename NumRunsOutputIteratorT, ///< Output iterator type for recording the number of runs encountered \iterator + typename ScanTileStateT, ///< Tile status interface type + typename EqualityOpT, ///< T equality operator type + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int(AgentRlePolicyT::BLOCK_THREADS)) +__global__ void DeviceRleSweepKernel( + InputIteratorT d_in, ///< [in] Pointer to input sequence of data items + OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to output sequence of run-offsets + 
LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to output sequence of run-lengths + NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs (i.e., length of \p d_offsets_out) + ScanTileStateT tile_status, ///< [in] Tile status interface + EqualityOpT equality_op, ///< [in] Equality operator for input items + OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + int num_tiles) ///< [in] Total number of tiles for the entire problem +{ + // Thread block type for selecting data from input tiles + typedef AgentRle< + AgentRlePolicyT, + InputIteratorT, + OffsetsOutputIteratorT, + LengthsOutputIteratorT, + EqualityOpT, + OffsetT> AgentRleT; + + // Shared memory for AgentRle + __shared__ typename AgentRleT::TempStorage temp_storage; + + // Process tiles + AgentRleT(temp_storage, d_in, d_offsets_out, d_lengths_out, equality_op, num_items).ConsumeRange( + num_tiles, + tile_status, + d_num_runs_out); +} + + + + +/****************************************************************************** + * Dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for DeviceRle + */ +template < + typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator + typename OffsetsOutputIteratorT, ///< Random-access output iterator type for writing run-offset values \iterator + typename LengthsOutputIteratorT, ///< Random-access output iterator type for writing run-length values \iterator + typename NumRunsOutputIteratorT, ///< Output iterator type for recording the number of runs encountered \iterator + typename EqualityOpT, ///< T equality operator type + typename OffsetT> ///< Signed integer type for global offsets +struct DeviceRleDispatch +{ + /****************************************************************************** + * Types and constants + ******************************************************************************/ + + // The input value type + typedef typename std::iterator_traits::value_type T; + + // The lengths output value type + typedef typename If<(Equals::value_type, void>::VALUE), // LengthT = (if output iterator's value type is void) ? + OffsetT, // ... then the OffsetT type, + typename std::iterator_traits::value_type>::Type LengthT; // ... 
else the output iterator's value type + + enum + { + INIT_KERNEL_THREADS = 128, + }; + + // Tile status descriptor interface type + typedef ReduceByKeyScanTileState ScanTileStateT; + + + /****************************************************************************** + * Tuning policies + ******************************************************************************/ + + /// SM35 + struct Policy350 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 15, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef AgentRlePolicy< + 96, + ITEMS_PER_THREAD, + BLOCK_LOAD_DIRECT, + LOAD_LDG, + true, + BLOCK_SCAN_WARP_SCANS> + RleSweepPolicy; + }; + + /// SM30 + struct Policy300 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 5, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef AgentRlePolicy< + 256, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + true, + BLOCK_SCAN_RAKING_MEMOIZE> + RleSweepPolicy; + }; + + /// SM20 + struct Policy200 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 15, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef AgentRlePolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS> + RleSweepPolicy; + }; + + /// SM13 + struct Policy130 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 9, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef AgentRlePolicy< + 64, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + true, + BLOCK_SCAN_RAKING_MEMOIZE> + RleSweepPolicy; + }; + + /// SM10 + struct Policy100 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 9, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef AgentRlePolicy< + 256, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + true, + BLOCK_SCAN_RAKING_MEMOIZE> + RleSweepPolicy; + }; + + + /****************************************************************************** + * Tuning policies of current PTX compiler pass + ******************************************************************************/ + +#if (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 130) + typedef Policy130 PtxPolicy; + +#else + typedef Policy100 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxRleSweepPolicy : PtxPolicy::RleSweepPolicy {}; + + + /****************************************************************************** + * Utilities + ******************************************************************************/ + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ + static void InitConfigs( + int ptx_version, + KernelConfig& device_rle_config) + { + #if (CUB_PTX_ARCH > 0) + + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + device_rle_config.template Init(); + + #else + + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that 
match the device's PTX version + if (ptx_version >= 350) + { + device_rle_config.template Init(); + } + else if (ptx_version >= 300) + { + device_rle_config.template Init(); + } + else if (ptx_version >= 200) + { + device_rle_config.template Init(); + } + else if (ptx_version >= 130) + { + device_rle_config.template Init(); + } + else + { + device_rle_config.template Init(); + } + + #endif + } + + + /** + * Kernel kernel dispatch configuration. Mirrors the constants within AgentRlePolicyT. + */ + struct KernelConfig + { + int block_threads; + int items_per_thread; + BlockLoadAlgorithm load_policy; + bool store_warp_time_slicing; + BlockScanAlgorithm scan_algorithm; + + template + CUB_RUNTIME_FUNCTION __forceinline__ + void Init() + { + block_threads = AgentRlePolicyT::BLOCK_THREADS; + items_per_thread = AgentRlePolicyT::ITEMS_PER_THREAD; + load_policy = AgentRlePolicyT::LOAD_ALGORITHM; + store_warp_time_slicing = AgentRlePolicyT::STORE_WARP_TIME_SLICING; + scan_algorithm = AgentRlePolicyT::SCAN_ALGORITHM; + } + + CUB_RUNTIME_FUNCTION __forceinline__ + void Print() + { + printf("%d, %d, %d, %d, %d", + block_threads, + items_per_thread, + load_policy, + store_warp_time_slicing, + scan_algorithm); + } + }; + + + /****************************************************************************** + * Dispatch entrypoints + ******************************************************************************/ + + /** + * Internal dispatch routine for computing a device-wide run-length-encode using the + * specified kernel functions. + */ + template < + typename DeviceScanInitKernelPtr, ///< Function type of cub::DeviceScanInitKernel + typename DeviceRleSweepKernelPtr> ///< Function type of cub::DeviceRleSweepKernelPtr + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to the output sequence of run-offsets + LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to the output sequence of run-lengths + NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to the total number of runs encountered (i.e., length of \p d_offsets_out) + EqualityOpT equality_op, ///< [in] Equality operator for input items + OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
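The public front-end that ultimately reaches this dispatch is cub::DeviceRunLengthEncode::Encode, which follows the same two-phase storage protocol. A minimal sketch, assuming device-resident int arrays sized for the worst case of num_items runs:

    #include <cub/cub.cuh>

    // Minimal sketch: encode runs of equal consecutive values
    void run_length_encode(int *d_in, int *d_unique_out, int *d_counts_out,
                           int *d_num_runs_out, int num_items)
    {
        void   *d_temp_storage     = NULL;
        size_t  temp_storage_bytes = 0;

        cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes,
            d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items);
        cudaMalloc(&d_temp_storage, temp_storage_bytes);
        cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes,
            d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items);
        cudaFree(d_temp_storage);
    }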
+ int ptx_version, ///< [in] PTX version of dispatch kernels + DeviceScanInitKernelPtr device_scan_init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel + DeviceRleSweepKernelPtr device_rle_sweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceRleSweepKernel + KernelConfig device_rle_config) ///< [in] Dispatch parameters that match the policy that \p device_rle_sweep_kernel was compiled for + { + +#ifndef CUB_RUNTIME_ENABLED + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported); + +#else + + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Number of input tiles + int tile_size = device_rle_config.block_threads * device_rle_config.items_per_thread; + int num_tiles = (num_items + tile_size - 1) / tile_size; + + // Specify temporary storage allocation requirements + size_t allocation_sizes[1]; + if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors + + // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob) + void* allocations[1]; + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + break; + } + + // Construct the tile status interface + ScanTileStateT tile_status; + if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; + + // Log device_scan_init_kernel configuration + int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS); + if (debug_synchronous) _CubLog("Invoking device_scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); + + // Invoke device_scan_init_kernel to initialize tile descriptors and queue descriptors + device_scan_init_kernel<<>>( + tile_status, + num_tiles, + d_num_runs_out); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Return if empty problem + if (num_items == 0) + break; + + // Get SM occupancy for device_rle_sweep_kernel + int device_rle_kernel_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + device_rle_kernel_sm_occupancy, // out + device_rle_sweep_kernel, + device_rle_config.block_threads))) break; + + // Get max x-dimension of grid + int max_dim_x; + if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;; + + // Get grid size for scanning tiles + dim3 scan_grid_size; + scan_grid_size.z = 1; + scan_grid_size.y = ((unsigned int) num_tiles + max_dim_x - 1) / max_dim_x; + scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x); + + // Log device_rle_sweep_kernel configuration + if (debug_synchronous) _CubLog("Invoking device_rle_sweep_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, device_rle_config.block_threads, (long long) stream, 
device_rle_config.items_per_thread, device_rle_kernel_sm_occupancy); + + // Invoke device_rle_sweep_kernel + device_rle_sweep_kernel<<>>( + d_in, + d_offsets_out, + d_lengths_out, + d_num_runs_out, + tile_status, + equality_op, + num_items, + num_tiles); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + /** + * Internal dispatch routine + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to input sequence of data items + OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to output sequence of run-offsets + LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to output sequence of run-lengths + NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs (i.e., length of \p d_offsets_out) + EqualityOpT equality_op, ///< [in] Equality operator for input items + OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel kernel dispatch configurations + KernelConfig device_rle_config; + InitConfigs(ptx_version, device_rle_config); + + // Dispatch + if (CubDebug(error = Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_offsets_out, + d_lengths_out, + d_num_runs_out, + equality_op, + num_items, + stream, + debug_synchronous, + ptx_version, + DeviceCompactInitKernel, + DeviceRleSweepKernel, + device_rle_config))) break; + } + while (0); + + return error; + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/SRC/cub/device/dispatch/dispatch_scan.cuh b/SRC/cub/device/dispatch/dispatch_scan.cuh new file mode 100644 index 00000000..3ef720a4 --- /dev/null +++ b/SRC/cub/device/dispatch/dispatch_scan.cuh @@ -0,0 +1,563 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "../../agent/agent_scan.cuh" +#include "../../thread/thread_operators.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../util_arch.cuh" +#include "../../util_debug.cuh" +#include "../../util_device.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +/** + * Initialization kernel for tile status initialization (multi-block) + */ +template < + typename ScanTileStateT> ///< Tile status interface type +__global__ void DeviceScanInitKernel( + ScanTileStateT tile_state, ///< [in] Tile status interface + int num_tiles) ///< [in] Number of tiles +{ + // Initialize tile status + tile_state.InitializeStatus(num_tiles); +} + +/** + * Initialization kernel for tile status initialization (multi-block) + */ +template < + typename ScanTileStateT, ///< Tile status interface type + typename NumSelectedIteratorT> ///< Output iterator type for recording the number of items selected +__global__ void DeviceCompactInitKernel( + ScanTileStateT tile_state, ///< [in] Tile status interface + int num_tiles, ///< [in] Number of tiles + NumSelectedIteratorT d_num_selected_out) ///< [out] Pointer to the total number of items selected (i.e., length of \p d_selected_out) +{ + // Initialize tile status + tile_state.InitializeStatus(num_tiles); + + // Initialize d_num_selected_out + if ((blockIdx.x == 0) && (threadIdx.x == 0)) + *d_num_selected_out = 0; +} + + +/** + * Scan kernel entry point (multi-block) + */ +template < + typename ScanPolicyT, ///< Parameterized ScanPolicyT tuning policy type + typename InputIteratorT, ///< Random-access input iterator type for reading scan inputs \iterator + typename OutputIteratorT, ///< Random-access output iterator type for writing scan outputs \iterator + typename ScanTileStateT, ///< Tile status interface type + typename ScanOpT, ///< Binary scan functor type having member T operator()(const T &a, const T &b) + typename InitValueT, ///< Initial value to seed the exclusive scan (cub::NullType for inclusive scans) + typename 
OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int(ScanPolicyT::BLOCK_THREADS)) +__global__ void DeviceScanKernel( + InputIteratorT d_in, ///< Input data + OutputIteratorT d_out, ///< Output data + ScanTileStateT tile_state, ///< Tile status interface + int start_tile, ///< The starting tile for the current grid + ScanOpT scan_op, ///< Binary scan functor + InitValueT init_value, ///< Initial value to seed the exclusive scan + OffsetT num_items) ///< Total number of scan items for the entire problem +{ + // Thread block type for scanning input tiles + typedef AgentScan< + ScanPolicyT, + InputIteratorT, + OutputIteratorT, + ScanOpT, + InitValueT, + OffsetT> AgentScanT; + + // Shared memory for AgentScan + __shared__ typename AgentScanT::TempStorage temp_storage; + + // Process tiles + AgentScanT(temp_storage, d_in, d_out, scan_op, init_value).ConsumeRange( + num_items, + tile_state, + start_tile); +} + + + + +/****************************************************************************** + * Dispatch + ******************************************************************************/ + + +/** + * Utility class for dispatching the appropriately-tuned kernels for DeviceScan + */ +template < + typename InputIteratorT, ///< Random-access input iterator type for reading scan inputs \iterator + typename OutputIteratorT, ///< Random-access output iterator type for writing scan outputs \iterator + typename ScanOpT, ///< Binary scan functor type having member T operator()(const T &a, const T &b) + typename InitValueT, ///< The init_value element type for ScanOpT (cub::NullType for inclusive scans) + typename OffsetT> ///< Signed integer type for global offsets +struct DispatchScan +{ + //--------------------------------------------------------------------- + // Constants and Types + //--------------------------------------------------------------------- + + enum + { + INIT_KERNEL_THREADS = 128 + }; + + // The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... 
else the output iterator's value type + + // Tile status descriptor interface type + typedef ScanTileState ScanTileStateT; + + + //--------------------------------------------------------------------- + // Tuning policies + //--------------------------------------------------------------------- + + /// SM600 + struct Policy600 + { + typedef AgentScanPolicy< + CUB_SCALED_GRANULARITIES(128, 15, OutputT), ///< Threads per block, items per thread + BLOCK_LOAD_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS> + ScanPolicyT; + }; + + + /// SM520 + struct Policy520 + { + // Titan X: 32.47B items/s @ 48M 32-bit T + typedef AgentScanPolicy< + CUB_SCALED_GRANULARITIES(128, 12, OutputT), ///< Threads per block, items per thread + BLOCK_LOAD_DIRECT, + LOAD_LDG, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS> + ScanPolicyT; + }; + + + /// SM35 + struct Policy350 + { + // GTX Titan: 29.5B items/s (232.4 GB/s) @ 48M 32-bit T + typedef AgentScanPolicy< + CUB_SCALED_GRANULARITIES(128, 12, OutputT), ///< Threads per block, items per thread + BLOCK_LOAD_DIRECT, + LOAD_LDG, + BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, + BLOCK_SCAN_RAKING> + ScanPolicyT; + }; + + /// SM30 + struct Policy300 + { + typedef AgentScanPolicy< + CUB_SCALED_GRANULARITIES(256, 9, OutputT), ///< Threads per block, items per thread + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS> + ScanPolicyT; + }; + + /// SM20 + struct Policy200 + { + // GTX 580: 20.3B items/s (162.3 GB/s) @ 48M 32-bit T + typedef AgentScanPolicy< + CUB_SCALED_GRANULARITIES(128, 12, OutputT), ///< Threads per block, items per thread + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS> + ScanPolicyT; + }; + + /// SM13 + struct Policy130 + { + typedef AgentScanPolicy< + CUB_SCALED_GRANULARITIES(96, 21, OutputT), ///< Threads per block, items per thread + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_RAKING_MEMOIZE> + ScanPolicyT; + }; + + /// SM10 + struct Policy100 + { + typedef AgentScanPolicy< + CUB_SCALED_GRANULARITIES(64, 9, OutputT), ///< Threads per block, items per thread + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS> + ScanPolicyT; + }; + + + //--------------------------------------------------------------------- + // Tuning policies of current PTX compiler pass + //--------------------------------------------------------------------- + +#if (CUB_PTX_ARCH >= 600) + typedef Policy600 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 520) + typedef Policy520 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 130) + typedef Policy130 PtxPolicy; + +#else + typedef Policy100 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxAgentScanPolicy : PtxPolicy::ScanPolicyT {}; + + + //--------------------------------------------------------------------- + // Utilities + //--------------------------------------------------------------------- + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ + static void InitConfigs( + int ptx_version, + KernelConfig &scan_kernel_config) + { + #if (CUB_PTX_ARCH > 0) + 
+            (void)ptx_version;
+
+            // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
+            scan_kernel_config.template Init<PtxAgentScanPolicy>();
+
+        #else
+
+            // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
+            if (ptx_version >= 600)
+            {
+                scan_kernel_config.template Init<typename Policy600::ScanPolicyT>();
+            }
+            else if (ptx_version >= 520)
+            {
+                scan_kernel_config.template Init<typename Policy520::ScanPolicyT>();
+            }
+            else if (ptx_version >= 350)
+            {
+                scan_kernel_config.template Init<typename Policy350::ScanPolicyT>();
+            }
+            else if (ptx_version >= 300)
+            {
+                scan_kernel_config.template Init<typename Policy300::ScanPolicyT>();
+            }
+            else if (ptx_version >= 200)
+            {
+                scan_kernel_config.template Init<typename Policy200::ScanPolicyT>();
+            }
+            else if (ptx_version >= 130)
+            {
+                scan_kernel_config.template Init<typename Policy130::ScanPolicyT>();
+            }
+            else
+            {
+                scan_kernel_config.template Init<typename Policy100::ScanPolicyT>();
+            }
+
+        #endif
+    }
+
+
+    /**
+     * Kernel dispatch configuration
+     */
+    struct KernelConfig
+    {
+        int block_threads;
+        int items_per_thread;
+        int tile_items;
+
+        template <typename PolicyT>
+        CUB_RUNTIME_FUNCTION __forceinline__
+        void Init()
+        {
+            block_threads       = PolicyT::BLOCK_THREADS;
+            items_per_thread    = PolicyT::ITEMS_PER_THREAD;
+            tile_items          = block_threads * items_per_thread;
+        }
+    };
+
+
+    //---------------------------------------------------------------------
+    // Dispatch entrypoints
+    //---------------------------------------------------------------------
+
+    /**
+     * Internal dispatch routine for computing a device-wide prefix scan using the
+     * specified kernel functions.
+     */
+    template <
+        typename            ScanInitKernelPtrT,     ///< Function type of cub::DeviceScanInitKernel
+        typename            ScanSweepKernelPtrT>    ///< Function type of cub::DeviceScanKernelPtrT
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void*               d_temp_storage,         ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&             temp_storage_bytes,     ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT      d_in,                   ///< [in] Pointer to the input sequence of data items
+        OutputIteratorT     d_out,                  ///< [out] Pointer to the output sequence of data items
+        ScanOpT             scan_op,                ///< [in] Binary scan functor
+        InitValueT          init_value,             ///< [in] Initial value to seed the exclusive scan
+        OffsetT             num_items,              ///< [in] Total number of input items (i.e., the length of \p d_in)
+        cudaStream_t        stream,                 ///< [in] CUDA stream to launch kernels within.  Default is stream0.
+        bool                debug_synchronous,      ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
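A minimal sketch of the public scan front-end that reaches this dispatch, assuming d_in and d_out are device-resident int arrays of length num_items:

    #include <cub/cub.cuh>

    // Minimal sketch: exclusive prefix sum with the two-phase storage protocol
    void exclusive_prefix_sum(int *d_in, int *d_out, int num_items)
    {
        void   *d_temp_storage     = NULL;
        size_t  temp_storage_bytes = 0;

        cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
        cudaMalloc(&d_temp_storage, temp_storage_bytes);
        cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
        cudaFree(d_temp_storage);
    }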
        int                 /*ptx_version*/,        ///< [in] PTX version of dispatch kernels
        ScanInitKernelPtrT  init_kernel,            ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel
        ScanSweepKernelPtrT scan_kernel,            ///< [in] Kernel function pointer to parameterization of cub::DeviceScanKernel
        KernelConfig        scan_kernel_config)     ///< [in] Dispatch parameters that match the policy that \p scan_kernel was compiled for
    {

#ifndef CUB_RUNTIME_ENABLED
        (void)d_temp_storage;
        (void)temp_storage_bytes;
        (void)d_in;
        (void)d_out;
        (void)scan_op;
        (void)init_value;
        (void)num_items;
        (void)stream;
        (void)debug_synchronous;
        (void)init_kernel;
        (void)scan_kernel;
        (void)scan_kernel_config;

        // Kernel launch not supported from this device
        return CubDebug(cudaErrorNotSupported);

#else
        cudaError error = cudaSuccess;
        do
        {
            // Get device ordinal
            int device_ordinal;
            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;

            // Get SM count
            int sm_count;
            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;

            // Number of input tiles
            int tile_size = scan_kernel_config.block_threads * scan_kernel_config.items_per_thread;
            int num_tiles = (num_items + tile_size - 1) / tile_size;

            // Specify temporary storage allocation requirements
            size_t allocation_sizes[1];
            if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break;    // bytes needed for tile status descriptors

            // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob)
            void* allocations[1];
            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
            if (d_temp_storage == NULL)
            {
                // Return if the caller is simply requesting the size of the storage allocation
                break;
            }

            // Return if empty problem
            if (num_items == 0)
                break;

            // Construct the tile status interface
            ScanTileStateT tile_state;
            if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;

            // Log init_kernel configuration
            int init_grid_size = (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS;
            if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);

            // Invoke init_kernel to initialize tile descriptors
            init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
                tile_state,
                num_tiles);

            // Check for failure to launch
            if (CubDebug(error = cudaPeekAtLastError())) break;

            // Sync the stream if specified to flush runtime errors
            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;

            // Get SM occupancy for scan_kernel
            int scan_sm_occupancy;
            if (CubDebug(error = MaxSmOccupancy(
                scan_sm_occupancy,            // out
                scan_kernel,
                scan_kernel_config.block_threads))) break;

            // Get max x-dimension of grid
            int max_dim_x;
            if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;

            // Run grids in epochs (in case number of tiles exceeds max x-dimension)
            int scan_grid_size = CUB_MIN(num_tiles, max_dim_x);
            for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size)
            {
                // Log scan_kernel configuration
                if (debug_synchronous) _CubLog("Invoking %d scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
                    start_tile, scan_grid_size, scan_kernel_config.block_threads, (long long) stream, scan_kernel_config.items_per_thread, scan_sm_occupancy);

                // Invoke scan_kernel
                scan_kernel<<<scan_grid_size, scan_kernel_config.block_threads, 0, stream>>>(
                    d_in,
                    d_out,
                    tile_state,
                    start_tile,
                    scan_op,
                    init_value,
                    num_items);

                // Check for failure to launch
                if (CubDebug(error = cudaPeekAtLastError())) break;

                // Sync the stream if specified to flush runtime errors
                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
            }
        }
        while (0);

        return error;

#endif  // CUB_RUNTIME_ENABLED
    }


    /**
     * Internal dispatch routine
     */
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Dispatch(
        void*           d_temp_storage,         ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
        size_t&         temp_storage_bytes,     ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
        InputIteratorT  d_in,                   ///< [in] Pointer to the input sequence of data items
        OutputIteratorT d_out,                  ///< [out] Pointer to the output sequence of data items
        ScanOpT         scan_op,                ///< [in] Binary scan functor
        InitValueT      init_value,             ///< [in] Initial value to seed the exclusive scan
        OffsetT         num_items,              ///< [in] Total number of input items (i.e., the length of \p d_in)
        cudaStream_t    stream,                 ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0.
        bool            debug_synchronous)      ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
    {
        cudaError error = cudaSuccess;
        do
        {
            // Get PTX version
            int ptx_version;
            if (CubDebug(error = PtxVersion(ptx_version))) break;

            // Get kernel dispatch configurations
            KernelConfig scan_kernel_config;
            InitConfigs(ptx_version, scan_kernel_config);

            // Dispatch
            if (CubDebug(error = Dispatch(
                d_temp_storage,
                temp_storage_bytes,
                d_in,
                d_out,
                scan_op,
                init_value,
                num_items,
                stream,
                debug_synchronous,
                ptx_version,
                DeviceScanInitKernel<ScanTileStateT>,
                DeviceScanKernel<PtxAgentScanPolicy, InputIteratorT, OutputIteratorT, ScanTileStateT, ScanOpT, InitValueT, OffsetT>,
                scan_kernel_config))) break;
        }
        while (0);

        return error;
    }
};



}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)


diff --git a/SRC/cub/device/dispatch/device_select_dispatch.cuh b/SRC/cub/device/dispatch/dispatch_select_if.cuh
similarity index 51%
rename from SRC/cub/device/dispatch/device_select_dispatch.cuh
rename to SRC/cub/device/dispatch/dispatch_select_if.cuh
index de6f38b5..60b33133 100644
--- a/SRC/cub/device/dispatch/device_select_dispatch.cuh
+++ b/SRC/cub/device/dispatch/dispatch_select_if.cuh
@@ -1,564 +1,542 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill. All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::DeviceSelect provides device-wide, parallel operations for selecting items from sequences of data items residing within global memory. - */ - -#pragma once - -#include -#include - -#include "device_scan_dispatch.cuh" -#include "../../block_range/block_range_select.cuh" -#include "../../thread/thread_operators.cuh" -#include "../../grid/grid_queue.cuh" -#include "../../util_device.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/****************************************************************************** - * Kernel entry points - *****************************************************************************/ - -/** - * Select kernel entry point (multi-block) - * - * Performs functor-based selection if SelectOp functor type != NullType - * Otherwise performs flag-based selection if FlagIterator's value type != NullType - * Otherwise performs discontinuity selection (keep unique) - */ -template < - typename BlockRangeSelectPolicy, ///< Parameterized BlockRangeSelectPolicy tuning policy type - typename InputIterator, ///< Random-access input iterator type for reading input items - typename FlagIterator, ///< Random-access input iterator type for reading selection flags (NullType* if a selection functor or discontinuity flagging is to be used for selection) - typename OutputIterator, ///< Random-access output iterator type for writing selected items - typename NumSelectedIterator, ///< Output iterator type for recording the number of items selected - typename ScanTileState, ///< Tile status interface type - typename SelectOp, ///< Selection operator type (NullType if selection flags or discontinuity flagging is to be used for selection) - typename EqualityOp, ///< Equality operator type (NullType if selection functor or selection flags is to be used for selection) - typename Offset, ///< Signed integer type for global offsets - bool KEEP_REJECTS> ///< Whether or not we push rejected items to the back of the output -__launch_bounds__ (int(BlockRangeSelectPolicy::BLOCK_THREADS)) -__global__ void SelectRegionKernel( - InputIterator d_in, ///< [in] Pointer to input sequence of data items - FlagIterator d_flags, ///< [in] Pointer to the input sequence of selection flags - OutputIterator d_out, ///< [in] Pointer to output sequence of selected data items - NumSelectedIterator d_num_selected, ///< [in] Pointer to total number of items selected (i.e., length of \p d_out) - ScanTileState tile_status, ///< [in] Tile status interface - SelectOp select_op, ///< [in] Selection operator - EqualityOp equality_op, ///< [in] Equality operator - Offset 
num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - int num_tiles, ///< [in] Total number of tiles for the entire problem - GridQueue queue) ///< [in] Drain queue descriptor for dynamically mapping tile data onto thread blocks -{ - // Thread block type for selecting data from input tiles - typedef BlockRangeSelect< - BlockRangeSelectPolicy, - InputIterator, - FlagIterator, - OutputIterator, - SelectOp, - EqualityOp, - Offset, - KEEP_REJECTS> BlockRangeSelectT; - - // Shared memory for BlockRangeSelect - __shared__ typename BlockRangeSelectT::TempStorage temp_storage; - - // Process tiles - BlockRangeSelectT(temp_storage, d_in, d_flags, d_out, select_op, equality_op, num_items).ConsumeRange( - num_tiles, - queue, - tile_status, - d_num_selected); -} - - - - -/****************************************************************************** - * Dispatch - ******************************************************************************/ - -/** - * Utility class for dispatching the appropriately-tuned kernels for DeviceSelect - */ -template < - typename InputIterator, ///< Random-access input iterator type for reading input items - typename FlagIterator, ///< Random-access input iterator type for reading selection flags (NullType* if a selection functor or discontinuity flagging is to be used for selection) - typename OutputIterator, ///< Random-access output iterator type for writing selected items - typename NumSelectedIterator, ///< Output iterator type for recording the number of items selected - typename SelectOp, ///< Selection operator type (NullType if selection flags or discontinuity flagging is to be used for selection) - typename EqualityOp, ///< Equality operator type (NullType if selection functor or selection flags is to be used for selection) - typename Offset, ///< Signed integer type for global offsets - bool KEEP_REJECTS> ///< Whether or not we push rejected items to the back of the output -struct DeviceSelectDispatch -{ - /****************************************************************************** - * Types and constants - ******************************************************************************/ - - // Data type of input iterator - typedef typename std::iterator_traits::value_type T; - - // Data type of flag iterator - typedef typename std::iterator_traits::value_type Flag; - - enum - { - INIT_KERNEL_THREADS = 128, - }; - - // Tile status descriptor interface type - typedef ScanTileState ScanTileState; - - - /****************************************************************************** - * Tuning policies - ******************************************************************************/ - - /// SM35 - struct Policy350 - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 11, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), - }; - - typedef BlockRangeSelectPolicy< - 128, - ITEMS_PER_THREAD, - BLOCK_LOAD_DIRECT, - LOAD_LDG, - true, - BLOCK_SCAN_WARP_SCANS> - SelectRegionPolicy; - }; - - /// SM30 - struct Policy300 - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 5, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), - }; - - typedef BlockRangeSelectPolicy< - 256, - ITEMS_PER_THREAD, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - true, - BLOCK_SCAN_RAKING_MEMOIZE> - SelectRegionPolicy; - }; - - /// SM20 - struct Policy200 - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = (KEEP_REJECTS) ? 
7 : 17, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), - }; - - typedef BlockRangeSelectPolicy< - 128, - ITEMS_PER_THREAD, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - true, - BLOCK_SCAN_WARP_SCANS> - SelectRegionPolicy; - }; - - /// SM13 - struct Policy130 - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 9, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), - }; - - typedef BlockRangeSelectPolicy< - 64, - ITEMS_PER_THREAD, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - true, - BLOCK_SCAN_RAKING_MEMOIZE> - SelectRegionPolicy; - }; - - /// SM10 - struct Policy100 - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 9, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), - }; - - typedef BlockRangeSelectPolicy< - 256, - ITEMS_PER_THREAD, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - true, - BLOCK_SCAN_RAKING_MEMOIZE> - SelectRegionPolicy; - }; - - - /****************************************************************************** - * Tuning policies of current PTX compiler pass - ******************************************************************************/ - -#if (CUB_PTX_ARCH >= 350) - typedef Policy350 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 300) - typedef Policy300 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 200) - typedef Policy200 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 130) - typedef Policy130 PtxPolicy; - -#else - typedef Policy100 PtxPolicy; - -#endif - - // "Opaque" policies (whose parameterizations aren't reflected in the type signature) - struct PtxSelectRegionPolicy : PtxPolicy::SelectRegionPolicy {}; - - - /****************************************************************************** - * Utilities - ******************************************************************************/ - - /** - * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use - */ - template - CUB_RUNTIME_FUNCTION __forceinline__ - static void InitConfigs( - int ptx_version, - KernelConfig &select_range_config) - { - #if (CUB_PTX_ARCH > 0) - - // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy - select_range_config.template Init(); - - #else - - // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version - if (ptx_version >= 350) - { - select_range_config.template Init(); - } - else if (ptx_version >= 300) - { - select_range_config.template Init(); - } - else if (ptx_version >= 200) - { - select_range_config.template Init(); - } - else if (ptx_version >= 130) - { - select_range_config.template Init(); - } - else - { - select_range_config.template Init(); - } - - #endif - } - - - /** - * Kernel kernel dispatch configuration. Mirrors the constants within BlockRangeSelectPolicy. 
- */ - struct KernelConfig - { - int block_threads; - int items_per_thread; - BlockLoadAlgorithm load_policy; - bool two_phase_scatter; - BlockScanAlgorithm scan_algorithm; - - template - CUB_RUNTIME_FUNCTION __forceinline__ - void Init() - { - block_threads = BlockRangeSelectPolicy::BLOCK_THREADS; - items_per_thread = BlockRangeSelectPolicy::ITEMS_PER_THREAD; - load_policy = BlockRangeSelectPolicy::LOAD_ALGORITHM; - two_phase_scatter = BlockRangeSelectPolicy::TWO_PHASE_SCATTER; - scan_algorithm = BlockRangeSelectPolicy::SCAN_ALGORITHM; - } - - CUB_RUNTIME_FUNCTION __forceinline__ - void Print() - { - printf("%d, %d, %d, %d, %d", - block_threads, - items_per_thread, - load_policy, - two_phase_scatter, - scan_algorithm); - } - }; - - - /****************************************************************************** - * Dispatch entrypoints - ******************************************************************************/ - - /** - * Internal dispatch routine for computing a device-wide prefix scan using the - * specified kernel functions. - */ - template < - typename ScanInitKernelPtr, ///< Function type of cub::ScanInitKernel - typename SelectRegionKernelPtr> ///< Function type of cub::SelectRegionKernelPtr - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIterator d_in, ///< [in] Pointer to input sequence of data items - FlagIterator d_flags, ///< [in] Pointer to the input sequence of selection flags - OutputIterator d_out, ///< [in] Pointer to output sequence of selected data items - NumSelectedIterator d_num_selected, ///< [in] Pointer to total number of items selected (i.e., length of \p d_out) - SelectOp select_op, ///< [in] Selection operator - EqualityOp equality_op, ///< [in] Equality operator - Offset num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
- int ptx_version, ///< [in] PTX version of dispatch kernels - ScanInitKernelPtr init_kernel, ///< [in] Kernel function pointer to parameterization of cub::ScanInitKernel - SelectRegionKernelPtr select_range_kernel, ///< [in] Kernel function pointer to parameterization of cub::SelectRegionKernel - KernelConfig select_range_config) ///< [in] Dispatch parameters that match the policy that \p select_range_kernel was compiled for - { - -#ifndef CUB_RUNTIME_ENABLED - - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported); - -#else - - cudaError error = cudaSuccess; - do - { - // Get device ordinal - int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; - - // Get device SM version - int sm_version; - if (CubDebug(error = SmVersion(sm_version, device_ordinal))) break; - - // Get SM count - int sm_count; - if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; - - // Number of input tiles - int tile_size = select_range_config.block_threads * select_range_config.items_per_thread; - int num_tiles = (num_items + tile_size - 1) / tile_size; - - // Specify temporary storage allocation requirements - size_t allocation_sizes[2]; - if (CubDebug(error = ScanTileState::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors - allocation_sizes[1] = GridQueue::AllocationSize(); // bytes needed for grid queue descriptor - - // Compute allocation pointers into the single storage blob (or set the necessary size of the blob) - void* allocations[2]; - if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; - if (d_temp_storage == NULL) - { - // Return if the caller is simply requesting the size of the storage allocation - return cudaSuccess; - } - - // Construct the tile status interface - ScanTileState tile_status; - if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; - - // Construct the grid queue descriptor - GridQueue queue(allocations[1]); - - // Log init_kernel configuration - int init_grid_size = (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS; - if (debug_synchronous) CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); - - // Invoke init_kernel to initialize tile descriptors and queue descriptors - init_kernel<<>>( - queue, - tile_status, - num_tiles); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - - // Get SM occupancy for select_range_kernel - int select_range_sm_occupancy; - if (CubDebug(error = MaxSmOccupancy( - select_range_sm_occupancy, // out - sm_version, - select_range_kernel, - select_range_config.block_threads))) break; - - // Get grid size for scanning tiles - dim3 select_grid_size; - if (ptx_version <= 130) - { - // Blocks are launched in order, so just assign one block per tile - int max_dim_x = 32 * 1024; - select_grid_size.z = 1; - select_grid_size.y = (num_tiles + max_dim_x - 1) / max_dim_x; - select_grid_size.x = CUB_MIN(num_tiles, max_dim_x); - } - else - { - // Blocks may not be launched in order, so use atomics - int select_range_occupancy = select_range_sm_occupancy * sm_count; // Whole-device occupancy for select_range_kernel - select_grid_size.z = 1; - select_grid_size.y 
= 1; - select_grid_size.x = (num_tiles < select_range_occupancy) ? - num_tiles : // Not enough to fill the device with threadblocks - select_range_occupancy; // Fill the device with threadblocks - } - - // Log select_range_kernel configuration - if (debug_synchronous) CubLog("Invoking select_range_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", - select_grid_size.x, select_grid_size.y, select_grid_size.z, select_range_config.block_threads, (long long) stream, select_range_config.items_per_thread, select_range_sm_occupancy); - - // Invoke select_range_kernel - select_range_kernel<<>>( - d_in, - d_flags, - d_out, - d_num_selected, - tile_status, - select_op, - equality_op, - num_items, - num_tiles, - queue); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - } - while (0); - - return error; - -#endif // CUB_RUNTIME_ENABLED - } - - - /** - * Internal dispatch routine - */ - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIterator d_in, ///< [in] Pointer to input sequence of data items - FlagIterator d_flags, ///< [in] Pointer to the input sequence of selection flags - OutputIterator d_out, ///< [in] Pointer to output sequence of selected data items - NumSelectedIterator d_num_selected, ///< [in] Pointer to total number of items selected (i.e., length of \p d_out) - SelectOp select_op, ///< [in] Selection operator - EqualityOp equality_op, ///< [in] Equality operator - Offset num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - cudaError error = cudaSuccess; - do - { - // Get PTX version - int ptx_version; - #if (CUB_PTX_ARCH == 0) - if (CubDebug(error = PtxVersion(ptx_version))) break; - #else - ptx_version = CUB_PTX_ARCH; - #endif - - // Get kernel kernel dispatch configurations - KernelConfig select_range_config; - InitConfigs(ptx_version, select_range_config); - - // Dispatch - if (CubDebug(error = Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_flags, - d_out, - d_num_selected, - select_op, - equality_op, - num_items, - stream, - debug_synchronous, - ptx_version, - ScanInitKernel, - SelectRegionKernel, - select_range_config))) break; - } - while (0); - - return error; - } -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceSelect provides device-wide, parallel operations for selecting items from sequences of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "dispatch_scan.cuh" +#include "../../agent/agent_select_if.cuh" +#include "../../thread/thread_operators.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../util_device.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +/** + * Select kernel entry point (multi-block) + * + * Performs functor-based selection if SelectOpT functor type != NullType + * Otherwise performs flag-based selection if FlagsInputIterator's value type != NullType + * Otherwise performs discontinuity selection (keep unique) + */ +template < + typename AgentSelectIfPolicyT, ///< Parameterized AgentSelectIfPolicyT tuning policy type + typename InputIteratorT, ///< Random-access input iterator type for reading input items + typename FlagsInputIteratorT, ///< Random-access input iterator type for reading selection flags (NullType* if a selection functor or discontinuity flagging is to be used for selection) + typename SelectedOutputIteratorT, ///< Random-access output iterator type for writing selected items + typename NumSelectedIteratorT, ///< Output iterator type for recording the number of items selected + typename ScanTileStateT, ///< Tile status interface type + typename SelectOpT, ///< Selection operator type (NullType if selection flags or discontinuity flagging is to be used for selection) + typename EqualityOpT, ///< Equality operator type (NullType if selection functor or selection flags is to be used for selection) + typename OffsetT, ///< Signed integer type for 
global offsets + bool KEEP_REJECTS> ///< Whether or not we push rejected items to the back of the output +__launch_bounds__ (int(AgentSelectIfPolicyT::BLOCK_THREADS)) +__global__ void DeviceSelectSweepKernel( + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + FlagsInputIteratorT d_flags, ///< [in] Pointer to the input sequence of selection flags (if applicable) + SelectedOutputIteratorT d_selected_out, ///< [out] Pointer to the output sequence of selected data items + NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the total number of items selected (i.e., length of \p d_selected_out) + ScanTileStateT tile_status, ///< [in] Tile status interface + SelectOpT select_op, ///< [in] Selection operator + EqualityOpT equality_op, ///< [in] Equality operator + OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + int num_tiles) ///< [in] Total number of tiles for the entire problem +{ + // Thread block type for selecting data from input tiles + typedef AgentSelectIf< + AgentSelectIfPolicyT, + InputIteratorT, + FlagsInputIteratorT, + SelectedOutputIteratorT, + SelectOpT, + EqualityOpT, + OffsetT, + KEEP_REJECTS> AgentSelectIfT; + + // Shared memory for AgentSelectIf + __shared__ typename AgentSelectIfT::TempStorage temp_storage; + + // Process tiles + AgentSelectIfT(temp_storage, d_in, d_flags, d_selected_out, select_op, equality_op, num_items).ConsumeRange( + num_tiles, + tile_status, + d_num_selected_out); +} + + + + +/****************************************************************************** + * Dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for DeviceSelect + */ +template < + typename InputIteratorT, ///< Random-access input iterator type for reading input items + typename FlagsInputIteratorT, ///< Random-access input iterator type for reading selection flags (NullType* if a selection functor or discontinuity flagging is to be used for selection) + typename SelectedOutputIteratorT, ///< Random-access output iterator type for writing selected items + typename NumSelectedIteratorT, ///< Output iterator type for recording the number of items selected + typename SelectOpT, ///< Selection operator type (NullType if selection flags or discontinuity flagging is to be used for selection) + typename EqualityOpT, ///< Equality operator type (NullType if selection functor or selection flags is to be used for selection) + typename OffsetT, ///< Signed integer type for global offsets + bool KEEP_REJECTS> ///< Whether or not we push rejected items to the back of the output +struct DispatchSelectIf +{ + /****************************************************************************** + * Types and constants + ******************************************************************************/ + + // The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... 
else the output iterator's value type + + // The flag value type + typedef typename std::iterator_traits::value_type FlagT; + + enum + { + INIT_KERNEL_THREADS = 128, + }; + + // Tile status descriptor interface type + typedef ScanTileState ScanTileStateT; + + + /****************************************************************************** + * Tuning policies + ******************************************************************************/ + + /// SM35 + struct Policy350 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 10, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))), + }; + + typedef AgentSelectIfPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_DIRECT, + LOAD_LDG, + BLOCK_SCAN_WARP_SCANS> + SelectIfPolicyT; + }; + + /// SM30 + struct Policy300 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 7, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(3, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))), + }; + + typedef AgentSelectIfPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS> + SelectIfPolicyT; + }; + + /// SM20 + struct Policy200 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = (KEEP_REJECTS) ? 7 : 15, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))), + }; + + typedef AgentSelectIfPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS> + SelectIfPolicyT; + }; + + /// SM13 + struct Policy130 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 9, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))), + }; + + typedef AgentSelectIfPolicy< + 64, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_RAKING_MEMOIZE> + SelectIfPolicyT; + }; + + /// SM10 + struct Policy100 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 9, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))), + }; + + typedef AgentSelectIfPolicy< + 64, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_RAKING> + SelectIfPolicyT; + }; + + + /****************************************************************************** + * Tuning policies of current PTX compiler pass + ******************************************************************************/ + +#if (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 130) + typedef Policy130 PtxPolicy; + +#else + typedef Policy100 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxSelectIfPolicyT : PtxPolicy::SelectIfPolicyT {}; + + + /****************************************************************************** + * Utilities + ******************************************************************************/ + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ + static void InitConfigs( + int ptx_version, + KernelConfig &select_if_config) + { + #if (CUB_PTX_ARCH > 0) + (void)ptx_version; + + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + select_if_config.template 
Init<PtxSelectIfPolicyT>();
+
+    #else
+
+        // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
+        if (ptx_version >= 350)
+        {
+            select_if_config.template Init<typename Policy350::SelectIfPolicyT>();
+        }
+        else if (ptx_version >= 300)
+        {
+            select_if_config.template Init<typename Policy300::SelectIfPolicyT>();
+        }
+        else if (ptx_version >= 200)
+        {
+            select_if_config.template Init<typename Policy200::SelectIfPolicyT>();
+        }
+        else if (ptx_version >= 130)
+        {
+            select_if_config.template Init<typename Policy130::SelectIfPolicyT>();
+        }
+        else
+        {
+            select_if_config.template Init<typename Policy100::SelectIfPolicyT>();
+        }
+
+    #endif
+    }
+
+
+    /**
+     * Kernel dispatch configuration.
+     */
+    struct KernelConfig
+    {
+        int block_threads;
+        int items_per_thread;
+        int tile_items;
+
+        template <typename PolicyT>
+        CUB_RUNTIME_FUNCTION __forceinline__
+        void Init()
+        {
+            block_threads       = PolicyT::BLOCK_THREADS;
+            items_per_thread    = PolicyT::ITEMS_PER_THREAD;
+            tile_items          = block_threads * items_per_thread;
+        }
+    };
+
+
+    /******************************************************************************
+     * Dispatch entrypoints
+     ******************************************************************************/
+
+    /**
+     * Internal dispatch routine for computing a device-wide selection using the
+     * specified kernel functions.
+     */
+    template <
+        typename                    ScanInitKernelPtrT,         ///< Function type of cub::DeviceScanInitKernel
+        typename                    SelectIfKernelPtrT>         ///< Function type of cub::DeviceSelectSweepKernel
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void*                       d_temp_storage,             ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&                     temp_storage_bytes,         ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT              d_in,                       ///< [in] Pointer to the input sequence of data items
+        FlagsInputIteratorT         d_flags,                    ///< [in] Pointer to the input sequence of selection flags (if applicable)
+        SelectedOutputIteratorT     d_selected_out,             ///< [out] Pointer to the output sequence of selected data items
+        NumSelectedIteratorT        d_num_selected_out,         ///< [out] Pointer to the total number of items selected (i.e., length of \p d_selected_out)
+        SelectOpT                   select_op,                  ///< [in] Selection operator
+        EqualityOpT                 equality_op,                ///< [in] Equality operator
+        OffsetT                     num_items,                  ///< [in] Total number of input items (i.e., length of \p d_in)
+        cudaStream_t                stream,                     ///< [in] CUDA stream to launch kernels within. Default is stream0.
+        bool                        debug_synchronous,          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
+        int                         /*ptx_version*/,            ///< [in] PTX version of dispatch kernels
+        ScanInitKernelPtrT          scan_init_kernel,           ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel
+        SelectIfKernelPtrT          select_if_kernel,           ///< [in] Kernel function pointer to parameterization of cub::DeviceSelectSweepKernel
+        KernelConfig                select_if_config)           ///< [in] Dispatch parameters that match the policy that \p select_if_kernel was compiled for
+    {
+
+#ifndef CUB_RUNTIME_ENABLED
+        (void)d_temp_storage;
+        (void)temp_storage_bytes;
+        (void)d_in;
+        (void)d_flags;
+        (void)d_selected_out;
+        (void)d_num_selected_out;
+        (void)select_op;
+        (void)equality_op;
+        (void)num_items;
+        (void)stream;
+        (void)debug_synchronous;
+        (void)scan_init_kernel;
+        (void)select_if_kernel;
+        (void)select_if_config;
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported);
+
+#else
+
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get device ordinal
+            int device_ordinal;
+            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
+            // Get SM count
+            int sm_count;
+            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
+
+            // Number of input tiles
+            int tile_size = select_if_config.block_threads * select_if_config.items_per_thread;
+            int num_tiles = (num_items + tile_size - 1) / tile_size;
+
+            // Specify temporary storage allocation requirements
+            size_t allocation_sizes[1];
+            if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break;    // bytes needed for tile status descriptors
+
+            // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob)
+            void* allocations[1];
+            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+            if (d_temp_storage == NULL)
+            {
+                // Return if the caller is simply requesting the size of the storage allocation
+                break;
+            }
+
+            // Construct the tile status interface
+            ScanTileStateT tile_status;
+            if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;
+
+            // Log scan_init_kernel configuration
+            int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS);
+            if (debug_synchronous) _CubLog("Invoking scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
+
+            // Invoke scan_init_kernel to initialize tile descriptors
+            scan_init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
+                tile_status,
+                num_tiles,
+                d_num_selected_out);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+            // Return if empty problem
+            if (num_items == 0)
+                break;
+
+            // Get SM occupancy for select_if_kernel
+            int range_select_sm_occupancy;
+            if (CubDebug(error = MaxSmOccupancy(
+                range_select_sm_occupancy,            // out
+                select_if_kernel,
+                select_if_config.block_threads))) break;
+
+            // Get max x-dimension of grid
+            int max_dim_x;
+            if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;
+
+            // Get grid size for scanning tiles
+            dim3 scan_grid_size;
+            scan_grid_size.z = 1;
+            scan_grid_size.y = ((unsigned int) num_tiles + max_dim_x - 1) / max_dim_x;
+            scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x);
+
+            // Log select_if_kernel configuration
+            if (debug_synchronous) _CubLog("Invoking select_if_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+                scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, select_if_config.block_threads, (long long) stream, select_if_config.items_per_thread, range_select_sm_occupancy);
+
+            // Invoke select_if_kernel
+            select_if_kernel<<<scan_grid_size, select_if_config.block_threads, 0, stream>>>(
+                d_in,
+                d_flags,
+                d_selected_out,
+                d_num_selected_out,
+                tile_status,
+                select_op,
+                equality_op,
+                num_items,
+                num_tiles);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+        }
+        while (0);
+
+        return error;
+
+#endif  // CUB_RUNTIME_ENABLED
+    }
+
+
+    /**
+     * Internal dispatch routine
+     */
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void*                       d_temp_storage,             ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&                     temp_storage_bytes,         ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        InputIteratorT              d_in,                       ///< [in] Pointer to the input sequence of data items
+        FlagsInputIteratorT         d_flags,                    ///< [in] Pointer to the input sequence of selection flags (if applicable)
+        SelectedOutputIteratorT     d_selected_out,             ///< [out] Pointer to the output sequence of selected data items
+        NumSelectedIteratorT        d_num_selected_out,         ///< [out] Pointer to the total number of items selected (i.e., length of \p d_selected_out)
+        SelectOpT                   select_op,                  ///< [in] Selection operator
+        EqualityOpT                 equality_op,                ///< [in] Equality operator
+        OffsetT                     num_items,                  ///< [in] Total number of input items (i.e., length of \p d_in)
+        cudaStream_t                stream,                     ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0.
+        bool                        debug_synchronous)          ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
+    {
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get PTX version
+            int ptx_version;
+    #if (CUB_PTX_ARCH == 0)
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+    #else
+            ptx_version = CUB_PTX_ARCH;
+    #endif
+
+            // Get kernel dispatch configurations
+            KernelConfig select_if_config;
+            InitConfigs(ptx_version, select_if_config);
+
+            // Dispatch
+            if (CubDebug(error = Dispatch(
+                d_temp_storage,
+                temp_storage_bytes,
+                d_in,
+                d_flags,
+                d_selected_out,
+                d_num_selected_out,
+                select_op,
+                equality_op,
+                num_items,
+                stream,
+                debug_synchronous,
+                ptx_version,
+                DeviceCompactInitKernel<ScanTileStateT, NumSelectedIteratorT>,
+                DeviceSelectSweepKernel<PtxSelectIfPolicyT, InputIteratorT, FlagsInputIteratorT, SelectedOutputIteratorT, NumSelectedIteratorT, ScanTileStateT, SelectOpT, EqualityOpT, OffsetT, KEEP_REJECTS>,
+                select_if_config))) break;
+        }
+        while (0);
+
+        return error;
+    }
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/SRC/cub/device/dispatch/dispatch_spmv_orig.cuh b/SRC/cub/device/dispatch/dispatch_spmv_orig.cuh
new file mode 100644
index 00000000..ab9c5346
--- /dev/null
+++ b/SRC/cub/device/dispatch/dispatch_spmv_orig.cuh
@@ -0,0 +1,834 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill. All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector multiplication (SpMV). + */ + +#pragma once + +#include +#include + +#include "../../agent/single_pass_scan_operators.cuh" +#include "../../agent/agent_segment_fixup.cuh" +#include "../../agent/agent_spmv_orig.cuh" +#include "../../util_type.cuh" +#include "../../util_debug.cuh" +#include "../../util_device.cuh" +#include "../../thread/thread_search.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * SpMV kernel entry points + *****************************************************************************/ + +/** + * Spmv search kernel. Identifies merge path starting coordinates for each tile. + */ +template < + typename AgentSpmvPolicyT, ///< Parameterized SpmvPolicy tuning policy type + typename ValueT, ///< Matrix and vector value type + typename OffsetT> ///< Signed integer type for sequence offsets +__global__ void DeviceSpmv1ColKernel( + SpmvParams spmv_params) ///< [in] SpMV input parameter bundle +{ + typedef CacheModifiedInputIterator< + AgentSpmvPolicyT::VECTOR_VALUES_LOAD_MODIFIER, + ValueT, + OffsetT> + VectorValueIteratorT; + + VectorValueIteratorT wrapped_vector_x(spmv_params.d_vector_x); + + int row_idx = (blockIdx.x * blockDim.x) + threadIdx.x; + if (row_idx < spmv_params.num_rows) + { + OffsetT end_nonzero_idx = spmv_params.d_row_end_offsets[row_idx]; + OffsetT nonzero_idx = spmv_params.d_row_end_offsets[row_idx - 1]; + + ValueT value = 0.0; + if (end_nonzero_idx != nonzero_idx) + { + value = spmv_params.d_values[nonzero_idx] * wrapped_vector_x[spmv_params.d_column_indices[nonzero_idx]]; + } + + spmv_params.d_vector_y[row_idx] = value; + } +} + + +/** + * Spmv search kernel. 
Identifies merge path starting coordinates for each tile. + */ +template < + typename SpmvPolicyT, ///< Parameterized SpmvPolicy tuning policy type + typename OffsetT, ///< Signed integer type for sequence offsets + typename CoordinateT, ///< Merge path coordinate type + typename SpmvParamsT> ///< SpmvParams type +__global__ void DeviceSpmvSearchKernel( + int num_merge_tiles, ///< [in] Number of SpMV merge tiles (spmv grid size) + CoordinateT* d_tile_coordinates, ///< [out] Pointer to the temporary array of tile starting coordinates + SpmvParamsT spmv_params) ///< [in] SpMV input parameter bundle +{ + /// Constants + enum + { + BLOCK_THREADS = SpmvPolicyT::BLOCK_THREADS, + ITEMS_PER_THREAD = SpmvPolicyT::ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + }; + + typedef CacheModifiedInputIterator< + SpmvPolicyT::ROW_OFFSETS_SEARCH_LOAD_MODIFIER, + OffsetT, + OffsetT> + RowOffsetsSearchIteratorT; + + // Find the starting coordinate for all tiles (plus the end coordinate of the last one) + int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; + if (tile_idx < num_merge_tiles + 1) + { + OffsetT diagonal = (tile_idx * TILE_ITEMS); + CoordinateT tile_coordinate; + CountingInputIterator nonzero_indices(0); + + // Search the merge path + MergePathSearch( + diagonal, + RowOffsetsSearchIteratorT(spmv_params.d_row_end_offsets), + nonzero_indices, + spmv_params.num_rows, + spmv_params.num_nonzeros, + tile_coordinate); + + // Output starting offset + d_tile_coordinates[tile_idx] = tile_coordinate; + } +} + + +/** + * Spmv agent entry point + */ +template < + typename SpmvPolicyT, ///< Parameterized SpmvPolicy tuning policy type + typename ScanTileStateT, ///< Tile status interface type + typename ValueT, ///< Matrix and vector value type + typename OffsetT, ///< Signed integer type for sequence offsets + typename CoordinateT, ///< Merge path coordinate type + bool HAS_ALPHA, ///< Whether the input parameter Alpha is 1 + bool HAS_BETA> ///< Whether the input parameter Beta is 0 +__launch_bounds__ (int(SpmvPolicyT::BLOCK_THREADS)) +__global__ void DeviceSpmvKernel( + SpmvParams spmv_params, ///< [in] SpMV input parameter bundle + CoordinateT* d_tile_coordinates, ///< [in] Pointer to the temporary array of tile starting coordinates + KeyValuePair* d_tile_carry_pairs, ///< [out] Pointer to the temporary array carry-out dot product row-ids, one per block + int num_tiles, ///< [in] Number of merge tiles + ScanTileStateT tile_state, ///< [in] Tile status interface for fixup reduce-by-key kernel + int num_segment_fixup_tiles) ///< [in] Number of reduce-by-key tiles (fixup grid size) +{ + // Spmv agent type specialization + typedef AgentSpmv< + SpmvPolicyT, + ValueT, + OffsetT, + HAS_ALPHA, + HAS_BETA> + AgentSpmvT; + + // Shared memory for AgentSpmv + __shared__ typename AgentSpmvT::TempStorage temp_storage; + + AgentSpmvT(temp_storage, spmv_params).ConsumeTile( + d_tile_coordinates, + d_tile_carry_pairs, + num_tiles); + + // Initialize fixup tile status + tile_state.InitializeStatus(num_segment_fixup_tiles); + +} + + +/** + * Multi-block reduce-by-key sweep kernel entry point + */ +template < + typename AgentSegmentFixupPolicyT, ///< Parameterized AgentSegmentFixupPolicy tuning policy type + typename PairsInputIteratorT, ///< Random-access input iterator type for keys + typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values + typename OffsetT, ///< Signed integer type for global offsets + typename ScanTileStateT> ///< Tile status interface type 
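+// A sequential sketch of what the fixup pass accomplishes (intent only, not
+// the actual implementation): DeviceSpmvKernel above emits one carry-out
+// KeyValuePair per merge tile -- the row id and partial dot-product of the
+// row straddling that tile's right edge.  For the tile-ordered (and hence
+// key-sorted) pairs, the kernel below effectively completes
+//
+//     for (int i = 0; i < num_items; ++i)                // one pair per spmv tile
+//         d_aggregates_out[d_pairs_in[i].key] += d_pairs_in[i].value;
+//
+// with AgentSegmentFixup performing this accumulation as a parallel
+// reduce-by-key using cub::Equality and cub::Sum.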
+__launch_bounds__ (int(AgentSegmentFixupPolicyT::BLOCK_THREADS)) +__global__ void DeviceSegmentFixupKernel( + PairsInputIteratorT d_pairs_in, ///< [in] Pointer to the array carry-out dot product row-ids, one per spmv block + AggregatesOutputIteratorT d_aggregates_out, ///< [in,out] Output value aggregates + OffsetT num_items, ///< [in] Total number of items to select from + int num_tiles, ///< [in] Total number of tiles for the entire problem + ScanTileStateT tile_state) ///< [in] Tile status interface +{ + // Thread block type for reducing tiles of value segments + typedef AgentSegmentFixup< + AgentSegmentFixupPolicyT, + PairsInputIteratorT, + AggregatesOutputIteratorT, + cub::Equality, + cub::Sum, + OffsetT> + AgentSegmentFixupT; + + // Shared memory for AgentSegmentFixup + __shared__ typename AgentSegmentFixupT::TempStorage temp_storage; + + // Process tiles + AgentSegmentFixupT(temp_storage, d_pairs_in, d_aggregates_out, cub::Equality(), cub::Sum()).ConsumeRange( + num_items, + num_tiles, + tile_state); +} + + +/****************************************************************************** + * Dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for DeviceSpmv + */ +template < + typename ValueT, ///< Matrix and vector value type + typename OffsetT> ///< Signed integer type for global offsets +struct DispatchSpmv +{ + //--------------------------------------------------------------------- + // Constants and Types + //--------------------------------------------------------------------- + + enum + { + INIT_KERNEL_THREADS = 128 + }; + + // SpmvParams bundle type + typedef SpmvParams SpmvParamsT; + + // 2D merge path coordinate type + typedef typename CubVector::Type CoordinateT; + + // Tile status descriptor interface type + typedef ReduceByKeyScanTileState ScanTileStateT; + + // Tuple type for scanning (pairs accumulated segment-value with segment-index) + typedef KeyValuePair KeyValuePairT; + + + //--------------------------------------------------------------------- + // Tuning policies + //--------------------------------------------------------------------- + + /// SM11 + struct Policy110 + { + typedef AgentSpmvPolicy< + 128, + 1, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS> + SpmvPolicyT; + + typedef AgentSegmentFixupPolicy< + 128, + 4, + BLOCK_LOAD_VECTORIZE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS> + SegmentFixupPolicyT; + }; + + /// SM20 + struct Policy200 + { + typedef AgentSpmvPolicy< + 96, + 18, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + false, + BLOCK_SCAN_RAKING> + SpmvPolicyT; + + typedef AgentSegmentFixupPolicy< + 128, + 4, + BLOCK_LOAD_VECTORIZE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS> + SegmentFixupPolicyT; + + }; + + + + /// SM30 + struct Policy300 + { + typedef AgentSpmvPolicy< + 96, + 6, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS> + SpmvPolicyT; + + typedef AgentSegmentFixupPolicy< + 128, + 4, + BLOCK_LOAD_VECTORIZE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS> + SegmentFixupPolicyT; + + }; + + + /// SM35 + struct Policy350 + { + typedef AgentSpmvPolicy< + (sizeof(ValueT) > 4) ? 96 : 128, + (sizeof(ValueT) > 4) ? 4 : 7, + LOAD_LDG, + LOAD_CA, + LOAD_LDG, + LOAD_LDG, + LOAD_LDG, + (sizeof(ValueT) > 4) ? 
true : false, + BLOCK_SCAN_WARP_SCANS> + SpmvPolicyT; + + typedef AgentSegmentFixupPolicy< + 128, + 3, + BLOCK_LOAD_VECTORIZE, + LOAD_LDG, + BLOCK_SCAN_WARP_SCANS> + SegmentFixupPolicyT; + }; + + + /// SM37 + struct Policy370 + { + + typedef AgentSpmvPolicy< + (sizeof(ValueT) > 4) ? 128 : 128, + (sizeof(ValueT) > 4) ? 9 : 14, + LOAD_LDG, + LOAD_CA, + LOAD_LDG, + LOAD_LDG, + LOAD_LDG, + false, + BLOCK_SCAN_WARP_SCANS> + SpmvPolicyT; + + typedef AgentSegmentFixupPolicy< + 128, + 3, + BLOCK_LOAD_VECTORIZE, + LOAD_LDG, + BLOCK_SCAN_WARP_SCANS> + SegmentFixupPolicyT; + }; + + /// SM50 + struct Policy500 + { + typedef AgentSpmvPolicy< + (sizeof(ValueT) > 4) ? 64 : 128, + (sizeof(ValueT) > 4) ? 6 : 7, + LOAD_LDG, + LOAD_DEFAULT, + (sizeof(ValueT) > 4) ? LOAD_LDG : LOAD_DEFAULT, + (sizeof(ValueT) > 4) ? LOAD_LDG : LOAD_DEFAULT, + LOAD_LDG, + (sizeof(ValueT) > 4) ? true : false, + (sizeof(ValueT) > 4) ? BLOCK_SCAN_WARP_SCANS : BLOCK_SCAN_RAKING_MEMOIZE> + SpmvPolicyT; + + + typedef AgentSegmentFixupPolicy< + 128, + 3, + BLOCK_LOAD_VECTORIZE, + LOAD_LDG, + BLOCK_SCAN_RAKING_MEMOIZE> + SegmentFixupPolicyT; + }; + + + /// SM60 + struct Policy600 + { + typedef AgentSpmvPolicy< + (sizeof(ValueT) > 4) ? 64 : 128, + (sizeof(ValueT) > 4) ? 5 : 7, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS> + SpmvPolicyT; + + + typedef AgentSegmentFixupPolicy< + 128, + 3, + BLOCK_LOAD_DIRECT, + LOAD_LDG, + BLOCK_SCAN_WARP_SCANS> + SegmentFixupPolicyT; + }; + + + + //--------------------------------------------------------------------- + // Tuning policies of current PTX compiler pass + //--------------------------------------------------------------------- + +#if (CUB_PTX_ARCH >= 600) + typedef Policy600 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 500) + typedef Policy500 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 370) + typedef Policy370 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#else + typedef Policy110 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxSpmvPolicyT : PtxPolicy::SpmvPolicyT {}; + struct PtxSegmentFixupPolicy : PtxPolicy::SegmentFixupPolicyT {}; + + + //--------------------------------------------------------------------- + // Utilities + //--------------------------------------------------------------------- + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ + static void InitConfigs( + int ptx_version, + KernelConfig &spmv_config, + KernelConfig &segment_fixup_config) + { + #if (CUB_PTX_ARCH > 0) + + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + spmv_config.template Init(); + segment_fixup_config.template Init(); + + #else + + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 600) + { + spmv_config.template Init(); + segment_fixup_config.template Init(); + } + else if (ptx_version >= 500) + { + spmv_config.template Init(); + segment_fixup_config.template Init(); + } + else if (ptx_version >= 370) + { + spmv_config.template Init(); + segment_fixup_config.template Init(); + } + else if (ptx_version >= 350) + { + spmv_config.template Init(); + 
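+            // (As in every branch of this chain, both configs must come from
+            // the same PolicyXXX generation: the host uses them to size grids
+            // and count carry-out pairs in the dispatch routine below, and
+            // those counts must match the tile sizes the corresponding
+            // kernels were actually compiled with.)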
+            segment_fixup_config.template Init<typename Policy350::SegmentFixupPolicyT>();
+        }
+        else if (ptx_version >= 300)
+        {
+            spmv_config.template Init<typename Policy300::SpmvPolicyT>();
+            segment_fixup_config.template Init<typename Policy300::SegmentFixupPolicyT>();
+
+        }
+        else if (ptx_version >= 200)
+        {
+            spmv_config.template Init<typename Policy200::SpmvPolicyT>();
+            segment_fixup_config.template Init<typename Policy200::SegmentFixupPolicyT>();
+        }
+        else
+        {
+            spmv_config.template Init<typename Policy110::SpmvPolicyT>();
+            segment_fixup_config.template Init<typename Policy110::SegmentFixupPolicyT>();
+        }
+
+    #endif
+    }
+
+
+    /**
+     * Kernel dispatch configuration.
+     */
+    struct KernelConfig
+    {
+        int block_threads;
+        int items_per_thread;
+        int tile_items;
+
+        template <typename PolicyT>
+        CUB_RUNTIME_FUNCTION __forceinline__
+        void Init()
+        {
+            block_threads       = PolicyT::BLOCK_THREADS;
+            items_per_thread    = PolicyT::ITEMS_PER_THREAD;
+            tile_items          = block_threads * items_per_thread;
+        }
+    };
+
+
+    //---------------------------------------------------------------------
+    // Dispatch entrypoints
+    //---------------------------------------------------------------------
+
+    /**
+     * Internal dispatch routine for computing a device-wide SpMV using the
+     * specified kernel functions.
+     *
+     * If the input is larger than a single tile, this method uses two passes of
+     * kernel invocations.
+     */
+    template <
+        typename                Spmv1ColKernelT,            ///< Function type of cub::DeviceSpmv1ColKernel
+        typename                SpmvSearchKernelT,          ///< Function type of cub::AgentSpmvSearchKernel
+        typename                SpmvKernelT,                ///< Function type of cub::AgentSpmvKernel
+        typename                SegmentFixupKernelT>        ///< Function type of cub::DeviceSegmentFixupKernel
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void*                   d_temp_storage,             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&                 temp_storage_bytes,         ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        SpmvParamsT&            spmv_params,                ///< SpMV input parameter bundle
+        cudaStream_t            stream,                     ///< [in] CUDA stream to launch kernels within.  Default is stream0.
+        bool                    debug_synchronous,          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
+        Spmv1ColKernelT         spmv_1col_kernel,           ///< [in] Kernel function pointer to parameterization of DeviceSpmv1ColKernel
+        SpmvSearchKernelT       spmv_search_kernel,         ///< [in] Kernel function pointer to parameterization of AgentSpmvSearchKernel
+        SpmvKernelT             spmv_kernel,                ///< [in] Kernel function pointer to parameterization of AgentSpmvKernel
+        SegmentFixupKernelT     segment_fixup_kernel,       ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentFixupKernel
+        KernelConfig            spmv_config,                ///< [in] Dispatch parameters that match the policy that \p spmv_kernel was compiled for
+        KernelConfig            segment_fixup_config)       ///< [in] Dispatch parameters that match the policy that \p segment_fixup_kernel was compiled for
+    {
+#ifndef CUB_RUNTIME_ENABLED
+
+        // Kernel launch not supported from this device
+        return CubDebug(cudaErrorNotSupported);
+
+#else
+        cudaError error = cudaSuccess;
+        do
+        {
+            if (spmv_params.num_cols == 1)
+            {
+                if (d_temp_storage == NULL)
+                {
+                    // Return if the caller is simply requesting the size of the storage allocation
+                    temp_storage_bytes = 1;
+                    break;
+                }
+
+                // Get search/init grid dims
+                int degen_col_kernel_block_size = INIT_KERNEL_THREADS;
+                int degen_col_kernel_grid_size  = (spmv_params.num_rows + degen_col_kernel_block_size - 1) / degen_col_kernel_block_size;
+
+                if (debug_synchronous) _CubLog("Invoking spmv_1col_kernel<<<%d, %d, 0, %lld>>>()\n",
+                    degen_col_kernel_grid_size, degen_col_kernel_block_size, (long long) stream);
+
+                // Invoke spmv_1col_kernel
+                spmv_1col_kernel<<<degen_col_kernel_grid_size, degen_col_kernel_block_size, 0, stream>>>(
+                    spmv_params);
+
+                // Check for failure to launch
+                if (CubDebug(error = cudaPeekAtLastError())) break;
+
+                // Sync the stream if specified to flush runtime errors
+                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+                break;
+            }
+
+            // Get device ordinal
+            int device_ordinal;
+            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
+
+            // Get SM count
+            int sm_count;
+            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
+
+            // Get max x-dimension of grid
+            int max_dim_x;
+            if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;
+
+            // Total number of spmv work items
+            int num_merge_items = spmv_params.num_rows + spmv_params.num_nonzeros;
+
+            // Tile sizes of kernels
+            int merge_tile_size            = spmv_config.block_threads * spmv_config.items_per_thread;
+            int segment_fixup_tile_size    = segment_fixup_config.block_threads * segment_fixup_config.items_per_thread;
+
+            // Number of tiles for kernels
+            unsigned int num_merge_tiles            = (num_merge_items + merge_tile_size - 1) / merge_tile_size;
+            unsigned int num_segment_fixup_tiles    = (num_merge_tiles + segment_fixup_tile_size - 1) / segment_fixup_tile_size;
+
+            // Get SM occupancy for kernels
+            int spmv_sm_occupancy;
+            if (CubDebug(error = MaxSmOccupancy(
+                spmv_sm_occupancy,
+                spmv_kernel,
+                spmv_config.block_threads))) break;
+
+            int segment_fixup_sm_occupancy;
+            if (CubDebug(error = MaxSmOccupancy(
+                segment_fixup_sm_occupancy,
+                segment_fixup_kernel,
+                segment_fixup_config.block_threads))) break;
+
+            // Get grid dimensions
+            dim3 spmv_grid_size(
+                CUB_MIN(num_merge_tiles, max_dim_x),
+                (num_merge_tiles + max_dim_x - 1) / max_dim_x,
+                1);
+
+            dim3 segment_fixup_grid_size(
+                CUB_MIN(num_segment_fixup_tiles, max_dim_x),
+                (num_segment_fixup_tiles + max_dim_x - 1) / max_dim_x,
+                1);
+
+            // Get the temporary storage allocation requirements
+            size_t allocation_sizes[3];
+            if (CubDebug(error = ScanTileStateT::AllocationSize(num_segment_fixup_tiles, allocation_sizes[0]))) break;    // bytes needed for reduce-by-key tile status descriptors
+            allocation_sizes[1] = num_merge_tiles * sizeof(KeyValuePairT);       // bytes needed for block carry-out pairs
+            allocation_sizes[2] = (num_merge_tiles + 1) * sizeof(CoordinateT);   // bytes needed for tile starting coordinates
+
+            // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
+            void* allocations[3];
+            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+            if (d_temp_storage == NULL)
+            {
+                // Return if the caller is simply requesting the size of the storage allocation
+                break;
+            }
+
+            // Construct the tile status interface
+            ScanTileStateT tile_state;
+            if (CubDebug(error = tile_state.Init(num_segment_fixup_tiles, allocations[0], allocation_sizes[0]))) break;
+
+            // Alias the other allocations
+            KeyValuePairT*  d_tile_carry_pairs  = (KeyValuePairT*) allocations[1];  // Agent carry-out pairs
+            CoordinateT*    d_tile_coordinates  = (CoordinateT*) allocations[2];    // Agent starting coordinates
+
+            // Get search/init grid dims
+            int search_block_size   = INIT_KERNEL_THREADS;
+            int search_grid_size    = (num_merge_tiles + 1 + search_block_size - 1) / search_block_size;
+
+#if (CUB_PTX_ARCH == 0)
+            // Init textures
+            if (CubDebug(error = spmv_params.t_vector_x.BindTexture(spmv_params.d_vector_x))) break;
+#endif
+
+            if (search_grid_size < sm_count)
+//            if (num_merge_tiles < spmv_sm_occupancy * sm_count)
+            {
+                // Not enough spmv tiles to saturate the device: have the spmv blocks search their own starting coordinates
+                d_tile_coordinates = NULL;
+            }
+            else
+            {
+                // Use separate search kernel if we have enough spmv tiles to saturate the device
+
+                // Log spmv_search_kernel configuration
+                if (debug_synchronous) _CubLog("Invoking spmv_search_kernel<<<%d, %d, 0, %lld>>>()\n",
+                    search_grid_size, search_block_size, (long long) stream);
+
+                // Invoke spmv_search_kernel
+                spmv_search_kernel<<<search_grid_size, search_block_size, 0, stream>>>(
+                    num_merge_tiles,
+                    d_tile_coordinates,
+                    spmv_params);
+
+                // Check for failure to launch
+                if (CubDebug(error = cudaPeekAtLastError())) break;
+
+                // Sync the stream if specified to flush runtime errors
+                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+            }
+
+            // Log spmv_kernel configuration
+            if (debug_synchronous) _CubLog("Invoking spmv_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+                spmv_grid_size.x, spmv_grid_size.y, spmv_grid_size.z, spmv_config.block_threads, (long long) stream, spmv_config.items_per_thread, spmv_sm_occupancy);
+
+            // Invoke spmv_kernel
+            spmv_kernel<<<spmv_grid_size, spmv_config.block_threads, 0, stream>>>(
+                spmv_params,
+                d_tile_coordinates,
+                d_tile_carry_pairs,
+                num_merge_tiles,
+                tile_state,
+                num_segment_fixup_tiles);
+
+            // Check for failure to launch
+            if (CubDebug(error = cudaPeekAtLastError())) break;
+
+            // Sync the stream if specified to flush runtime errors
+            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+
+            // Run reduce-by-key fixup if necessary
+            if (num_merge_tiles > 1)
+            {
+                // Log segment_fixup_kernel configuration
+                if (debug_synchronous) _CubLog("Invoking segment_fixup_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
+                    segment_fixup_grid_size.x, segment_fixup_grid_size.y, segment_fixup_grid_size.z, segment_fixup_config.block_threads, (long long) stream, segment_fixup_config.items_per_thread, segment_fixup_sm_occupancy);
+
+                // Invoke segment_fixup_kernel
+                segment_fixup_kernel<<<segment_fixup_grid_size, segment_fixup_config.block_threads, 0, stream>>>(
+                    d_tile_carry_pairs,
+                    spmv_params.d_vector_y,
+                    num_merge_tiles,
+                    num_segment_fixup_tiles,
+                    tile_state);
+
+                // Check for failure to launch
+                if (CubDebug(error = cudaPeekAtLastError())) break;
+
+                // Sync the stream if specified to flush runtime errors
+                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
+            }
+
+#if (CUB_PTX_ARCH == 0)
+            // Free textures
+            if (CubDebug(error = spmv_params.t_vector_x.UnbindTexture())) break;
+#endif
+        }
+        while (0);
+
+        return error;
+
+#endif // CUB_RUNTIME_ENABLED
+    }
+
+
+    /**
+     * Internal dispatch routine for computing a device-wide SpMV
+     */
+    CUB_RUNTIME_FUNCTION __forceinline__
+    static cudaError_t Dispatch(
+        void*                   d_temp_storage,             ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
+        size_t&                 temp_storage_bytes,         ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
+        SpmvParamsT&            spmv_params,                ///< SpMV input parameter bundle
+        cudaStream_t            stream            = 0,      ///< [in] [optional] CUDA stream to launch kernels within.  Default is stream0.
+        bool                    debug_synchronous = false)  ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
+    {
+        cudaError error = cudaSuccess;
+        do
+        {
+            // Get PTX version
+            int ptx_version;
+    #if (CUB_PTX_ARCH == 0)
+            if (CubDebug(error = PtxVersion(ptx_version))) break;
+    #else
+            ptx_version = CUB_PTX_ARCH;
+    #endif
+
+            // Get kernel dispatch configurations
+            KernelConfig spmv_config, segment_fixup_config;
+            InitConfigs(ptx_version, spmv_config, segment_fixup_config);
+
+            if (CubDebug(error = Dispatch(
+                d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous,
+                DeviceSpmv1ColKernel<PtxSpmvPolicyT, ValueT, OffsetT>,
+                DeviceSpmvSearchKernel<PtxSpmvPolicyT, OffsetT, CoordinateT, SpmvParamsT>,
+                DeviceSpmvKernel<PtxSpmvPolicyT, ScanTileStateT, ValueT, OffsetT, CoordinateT, false, false>,
+                DeviceSegmentFixupKernel<PtxSegmentFixupPolicy, KeyValuePairT*, ValueT*, OffsetT, ScanTileStateT>,
+                spmv_config, segment_fixup_config))) break;
+
+        }
+        while (0);
+
+        return error;
+    }
+};
+
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/SRC/cub/grid/grid_barrier.cuh b/SRC/cub/grid/grid_barrier.cuh
index eab5b518..461fb442 100644
--- a/SRC/cub/grid/grid_barrier.cuh
+++ b/SRC/cub/grid/grid_barrier.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
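For readers tracing the dispatch path above from the SuperLU side, a minimal host-side sketch of the usual CUB calling convention follows (not part of the patch; it assumes only the public cub::DeviceSpmv::CsrMV wrapper shipped in this CUB drop, which forwards to DispatchSpmv::Dispatch). The first call passes a NULL temporary-storage pointer, so only the required blob size is reported; AliasTemporaries() later carves that blob into the tile-status descriptors, carry-out pairs, and tile coordinates sized above.

    #include <cuda_runtime.h>
    #include <cub/device/device_spmv.cuh>

    // Sketch: the CSR arrays and vectors are assumed to already live on the device.
    cudaError_t csrmv(float* d_values, int* d_row_offsets, int* d_column_indices,
                      float* d_x, float* d_y,
                      int num_rows, int num_cols, int num_nonzeros)
    {
        void*  d_temp_storage     = NULL;
        size_t temp_storage_bytes = 0;

        // Pass 1: NULL temp storage, so only temp_storage_bytes is written
        cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes,
                               d_values, d_row_offsets, d_column_indices,
                               d_x, d_y, num_rows, num_cols, num_nonzeros);
        cudaMalloc(&d_temp_storage, temp_storage_bytes);

        // Pass 2: the real launch sequence (search, spmv, segment fixup)
        cudaError_t error = cub::DeviceSpmv::CsrMV(
            d_temp_storage, temp_storage_bytes,
            d_values, d_row_offsets, d_column_indices,
            d_x, d_y, num_rows, num_cols, num_nonzeros);

        cudaFree(d_temp_storage);
        return error;
    }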
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -80,7 +80,7 @@ public: // Threadfence and syncthreads to make sure global writes are visible before // thread-0 reports in with its sync counter __threadfence(); - __syncthreads(); + CTA_SYNC(); if (blockIdx.x == 0) { @@ -90,7 +90,7 @@ public: d_vol_sync[blockIdx.x] = 1; } - __syncthreads(); + CTA_SYNC(); // Wait for everyone else to report in for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) @@ -101,7 +101,7 @@ public: } } - __syncthreads(); + CTA_SYNC(); // Let everyone know it's safe to proceed for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) @@ -123,7 +123,7 @@ public: } } - __syncthreads(); + CTA_SYNC(); } } }; diff --git a/SRC/cub/grid/grid_even_share.cuh b/SRC/cub/grid/grid_even_share.cuh index a3556329..f0b3a69a 100644 --- a/SRC/cub/grid/grid_even_share.cuh +++ b/SRC/cub/grid/grid_even_share.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -28,7 +28,7 @@ /** * \file - * cub::GridEvenShare is a descriptor utility for distributing input among CUDA threadblocks in an "even-share" fashion. Each threadblock gets roughly the same number of fixed-size work units (grains). + * cub::GridEvenShare is a descriptor utility for distributing input among CUDA thread blocks in an "even-share" fashion. Each thread block gets roughly the same number of fixed-size work units (grains). */ @@ -36,6 +36,7 @@ #include "../util_namespace.cuh" #include "../util_macro.cuh" +#include "grid_mapping.cuh" /// Optional outer namespace(s) CUB_NS_PREFIX @@ -51,134 +52,170 @@ namespace cub { /** - * \brief GridEvenShare is a descriptor utility for distributing input among CUDA threadblocks in an "even-share" fashion. Each threadblock gets roughly the same number of fixed-size work units (grains). + * \brief GridEvenShare is a descriptor utility for distributing input among + * CUDA thread blocks in an "even-share" fashion. Each thread block gets roughly + * the same number of input tiles. * * \par Overview - * GridEvenShare indicates which sections of input are to be mapped onto which threadblocks. - * Threadblocks may receive one of three different amounts of work: "big", "normal", - * and "last". The "big" workloads are one scheduling grain larger than "normal". The "last" work unit - * for the last threadblock may be partially-full if the input is not an even multiple of + * Each thread block is assigned a consecutive sequence of input tiles. To help + * preserve alignment and eliminate the overhead of guarded loads for all but the + * last thread block, to GridEvenShare assigns one of three different amounts of + * work to a given thread block: "big", "normal", or "last". The "big" workloads + * are one scheduling grain larger than "normal". The "last" work unit for the + * last thread block may be partially-full if the input is not an even multiple of * the scheduling grain size. * * \par - * Before invoking a child grid, a parent thread will typically construct an instance of - * GridEvenShare. 
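To make the "big"/"normal" share sizes concrete, here is a small host-only sketch (plain C++ with illustrative numbers, not part of the CUB sources) of the partitioning arithmetic that DispatchInit performs below: leftover tiles are absorbed one apiece by the leading blocks, so no two blocks ever differ by more than one tile.

    #include <algorithm>
    #include <cstdio>

    int main()
    {
        int num_items  = 1000003;                                      // illustrative sizes
        int tile_items = 512;
        int max_grid   = 120;

        int total_tiles = (num_items + tile_items - 1) / tile_items;   // 1954
        int grid_size   = std::min(total_tiles, max_grid);             // 120
        int avg_tiles   = total_tiles / grid_size;                     // 16
        int big_shares  = total_tiles - avg_tiles * grid_size;         // 34 leftover tiles

        printf("%d blocks take %d tiles each; %d blocks take %d\n",
               big_shares, avg_tiles + 1, grid_size - big_shares, avg_tiles);
        return 0;
    }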
The instance can be passed to child threadblocks which can
- * initialize their per-threadblock offsets using \p BlockInit().
- *
- * \tparam Offset Signed integer type for global offsets
+ * Before invoking a child grid, a parent thread will typically construct an
+ * instance of GridEvenShare.  The instance can be passed to child thread blocks
+ * which can initialize their per-thread block offsets using \p BlockInit().
  */
-template <typename Offset>
+template <typename OffsetT>
 struct GridEvenShare
 {
-    Offset  total_grains;
-    int     big_blocks;
-    Offset  big_share;
-    Offset  normal_share;
-    Offset  normal_base_offset;
+private:
+
+    OffsetT     total_tiles;
+    int         big_shares;
+    OffsetT     big_share_items;
+    OffsetT     normal_share_items;
+    OffsetT     normal_base_offset;
+
+public:
 
     /// Total number of input items
-    Offset  num_items;
+    OffsetT     num_items;
 
-    /// Grid size in threadblocks
+    /// Grid size in thread blocks
     int     grid_size;
 
-    /// Offset into input marking the beginning of the owning thread block's segment of input tiles
-    Offset  block_offset;
+    /// OffsetT into input marking the beginning of the owning thread block's segment of input tiles
+    OffsetT     block_offset;
+
+    /// OffsetT into input of marking the end (one-past) of the owning thread block's segment of input tiles
+    OffsetT     block_end;
+
+    /// Stride between input tiles
+    OffsetT     block_stride;
 
-    /// Offset into input of marking the end (one-past) of the owning thread block's segment of input tiles
-    Offset  block_end;
 
     /**
-     * \brief Default constructor.  Zero-initializes block-specific fields.
+     * \brief Constructor.
      */
     __host__ __device__ __forceinline__ GridEvenShare() :
+        total_tiles(0),
+        big_shares(0),
+        big_share_items(0),
+        normal_share_items(0),
+        normal_base_offset(0),
         num_items(0),
         grid_size(0),
         block_offset(0),
-        block_end(0) {}
+        block_end(0),
+        block_stride(0)
+    {}
+
 
     /**
-     * \brief Constructor.  Initializes the grid-specific members \p num_items and \p grid_size.  To be called prior prior to kernel launch)
+     * \brief Dispatch initializer.  To be called prior to kernel launch.
      */
-    __host__ __device__ __forceinline__ GridEvenShare(
-        Offset  num_items,              ///< Total number of input items
-        int     max_grid_size,          ///< Maximum grid size allowable (actual grid size may be less if not warranted by the the number of input items)
-        int     schedule_granularity)   ///< Granularity by which the input can be parcelled into and distributed among threablocks.  Usually the thread block's native tile size (or a multiple thereof.
+    __host__ __device__ __forceinline__ void DispatchInit(
+        OffsetT num_items,              ///< Total number of input items
+        int     max_grid_size,          ///< Maximum grid size allowable (actual grid size may be less if not warranted by the number of input items)
+        int     tile_items)             ///< Number of data items per input tile
     {
+        this->block_offset = num_items;    // Initialize past-the-end
+        this->block_end = num_items;       // Initialize past-the-end
         this->num_items = num_items;
-        this->block_offset = num_items;
-        this->block_end = num_items;
-        this->total_grains = (num_items + schedule_granularity - 1) / schedule_granularity;
-        this->grid_size = CUB_MIN(total_grains, max_grid_size);
-        Offset grains_per_block = total_grains / grid_size;
-        this->big_blocks = total_grains - (grains_per_block * grid_size);    // leftover grains go to big blocks
-        this->normal_share = grains_per_block * schedule_granularity;
-        this->normal_base_offset = big_blocks * schedule_granularity;
-        this->big_share = normal_share + schedule_granularity;
+        this->total_tiles = (num_items + tile_items - 1) / tile_items;
+        this->grid_size = CUB_MIN(total_tiles, max_grid_size);
+        OffsetT avg_tiles_per_block = total_tiles / grid_size;
+        this->big_shares = total_tiles - (avg_tiles_per_block * grid_size);     // leftover grains go to big blocks
+        this->normal_share_items = avg_tiles_per_block * tile_items;
+        this->normal_base_offset = big_shares * tile_items;
+        this->big_share_items = normal_share_items + tile_items;
     }
 
-
     /**
-     * \brief Initializes ranges for the specified partition index
+     * \brief Initializes ranges for the specified thread block index.  Specialized
+     * for a "raking" access pattern in which each thread block is assigned a
+     * consecutive sequence of input tiles.
      */
-    __device__ __forceinline__ void Init(int partition_id)
+    template <int TILE_ITEMS>
+    __device__ __forceinline__ void BlockInit(
+        int block_id,
+        Int2Type<GRID_MAPPING_RAKE> /*strategy_tag*/)
     {
-        if (partition_id < big_blocks)
+        block_stride = TILE_ITEMS;
+        if (block_id < big_shares)
         {
-            // This threadblock gets a big share of grains (grains_per_block + 1)
-            block_offset = (partition_id * big_share);
-            block_end = block_offset + big_share;
+            // This thread block gets a big share of grains (avg_tiles_per_block + 1)
+            block_offset = (block_id * big_share_items);
+            block_end = block_offset + big_share_items;
         }
-        else if (partition_id < total_grains)
+        else if (block_id < total_tiles)
         {
-            // This threadblock gets a normal share of grains (grains_per_block)
-            block_offset = normal_base_offset + (partition_id * normal_share);
-            block_end = CUB_MIN(num_items, block_offset + normal_share);
+            // This thread block gets a normal share of grains (avg_tiles_per_block)
+            block_offset = normal_base_offset + (block_id * normal_share_items);
+            block_end = CUB_MIN(num_items, block_offset + normal_share_items);
         }
+        // Else default past-the-end
     }
 
-
     /**
-     * \brief Initializes ranges for the current thread block (e.g., to be called by each threadblock after startup)
+     * \brief Block-initialization, specialized for a "strip mining" access
+     * pattern in which the input tiles assigned to each thread block are
+     * separated by a stride equal to the extent of the grid.
      */
+    template <int TILE_ITEMS>
+    __device__ __forceinline__ void BlockInit(
+        int block_id,
+        Int2Type<GRID_MAPPING_STRIP_MINE> /*strategy_tag*/)
+    {
+        block_stride = grid_size * TILE_ITEMS;
+        block_offset = (block_id * TILE_ITEMS);
+        block_end = num_items;
+    }
+
+
+    /**
+     * \brief Block-initialization for the current thread block, which
+     * dispatches to the "raking" or "strip mining" specialization above
+     * according to \p STRATEGY.
+     */
+    template <
+        int TILE_ITEMS,
+        GridMappingStrategy STRATEGY>
     __device__ __forceinline__ void BlockInit()
     {
-        Init(blockIdx.x);
+        BlockInit<TILE_ITEMS>(blockIdx.x, Int2Type<STRATEGY>());
     }
 
 
     /**
-     * Print to stdout
+     * \brief Block-initialization, specialized for a "raking" access
+     * pattern in which each thread block is assigned a consecutive sequence
+     * of input tiles.
      */
-    __host__ __device__ __forceinline__ void Print()
+    template <int TILE_ITEMS>
+    __device__ __forceinline__ void BlockInit(
+        OffsetT block_offset,               ///< [in] Threadblock begin offset (inclusive)
+        OffsetT block_end)                  ///< [in] Threadblock end offset (exclusive)
     {
-        printf(
-#if (CUB_PTX_ARCH > 0)
-            "\tthreadblock(%d) "
-            "block_offset(%lu) "
-            "block_end(%lu) "
-#endif
-            "num_items(%lu)  "
-            "total_grains(%lu)  "
-            "big_blocks(%lu)  "
-            "big_share(%lu)  "
-            "normal_share(%lu)\n",
-#if (CUB_PTX_ARCH > 0)
-            blockIdx.x,
-            (unsigned long) block_offset,
-            (unsigned long) block_end,
-#endif
-            (unsigned long) num_items,
-            (unsigned long) total_grains,
-            (unsigned long) big_blocks,
-            (unsigned long) big_share,
-            (unsigned long) normal_share);
+        this->block_offset = block_offset;
+        this->block_end = block_end;
+        this->block_stride = TILE_ITEMS;
     }
+
+
 };
+
+
 
 /** @} */        // end group GridModule
 
 }               // CUB namespace
diff --git a/SRC/cub/grid/grid_mapping.cuh b/SRC/cub/grid/grid_mapping.cuh
index ff6679b9..f0e9fded 100644
--- a/SRC/cub/grid/grid_mapping.cuh
+++ b/SRC/cub/grid/grid_mapping.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -59,7 +59,8 @@ namespace cub {
 enum GridMappingStrategy
 {
     /**
-     * \brief An "even-share" strategy for assigning input tiles to thread blocks.
+     * \brief A "raking" access pattern in which each thread block is
+     * assigned a consecutive sequence of input tiles
      *
      * \par Overview
      * The input is evenly partitioned into \p p segments, where \p p is
@@ -71,7 +72,24 @@
      * of which iteratively consumes a segment of n/p elements
     * in tile-size increments.
      */
-    GRID_MAPPING_EVEN_SHARE,
+    GRID_MAPPING_RAKE,
+
+    /**
+     * \brief A "strip mining" access pattern in which the input tiles assigned
+     * to each thread block are separated by a stride equal to the extent of
+     * the grid.
+     *
+     * \par Overview
+     * The input is evenly partitioned into \p p sets, where \p p is
+     * constant and corresponds loosely to the number of thread blocks that may
+     * actively reside on the target device.  Each set is comprised of
+     * data tiles separated by stride \p tiles, where a tile is a small,
+     * constant-sized unit of input to be processed to completion before the
+     * thread block terminates or obtains more work.
The kernel invokes \p p + * thread blocks, each of which iteratively consumes a segment of + * n/p elements in tile-size increments. + */ + GRID_MAPPING_STRIP_MINE, /** * \brief A dynamic "queue-based" strategy for assigning input tiles to thread blocks. diff --git a/SRC/cub/grid/grid_queue.cuh b/SRC/cub/grid/grid_queue.cuh index 86566166..9615b14d 100644 --- a/SRC/cub/grid/grid_queue.cuh +++ b/SRC/cub/grid/grid_queue.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -76,9 +76,9 @@ namespace cub { * Iterative work management can be implemented simply with a pair of flip-flopping * work buffers, each with an associated set of fill and drain GridQueue descriptors. * - * \tparam Offset Signed integer type for global offsets + * \tparam OffsetT Signed integer type for global offsets */ -template +template class GridQueue { private: @@ -91,7 +91,7 @@ private: }; /// Pair of counters - Offset *d_counters; + OffsetT *d_counters; public: @@ -99,7 +99,7 @@ public: __host__ __device__ __forceinline__ static size_t AllocationSize() { - return sizeof(Offset) * 2; + return sizeof(OffsetT) * 2; } @@ -114,24 +114,25 @@ public: __host__ __device__ __forceinline__ GridQueue( void *d_storage) ///< Device allocation to back the GridQueue. Must be at least as big as AllocationSize(). : - d_counters((Offset*) d_storage) + d_counters((OffsetT*) d_storage) {} /// This operation sets the fill-size and resets the drain counter, preparing the GridQueue for draining in the next kernel instance. To be called by the host or by a kernel prior to that which will be draining. __host__ __device__ __forceinline__ cudaError_t FillAndResetDrain( - Offset fill_size, + OffsetT fill_size, cudaStream_t stream = 0) { #if (CUB_PTX_ARCH > 0) + (void)stream; d_counters[FILL] = fill_size; d_counters[DRAIN] = 0; return cudaSuccess; #else - Offset counters[2]; + OffsetT counters[2]; counters[FILL] = fill_size; counters[DRAIN] = 0; - return CubDebug(cudaMemcpyAsync(d_counters, counters, sizeof(Offset) * 2, cudaMemcpyHostToDevice, stream)); + return CubDebug(cudaMemcpyAsync(d_counters, counters, sizeof(OffsetT) * 2, cudaMemcpyHostToDevice, stream)); #endif } @@ -140,49 +141,52 @@ public: __host__ __device__ __forceinline__ cudaError_t ResetDrain(cudaStream_t stream = 0) { #if (CUB_PTX_ARCH > 0) + (void)stream; d_counters[DRAIN] = 0; return cudaSuccess; #else - return FillAndResetDrain(0, stream); + return CubDebug(cudaMemsetAsync(d_counters + DRAIN, 0, sizeof(OffsetT), stream)); #endif } /// This operation resets the fill counter. To be called by the host or by a kernel prior to that which will be filling. - __host__ __device__ __forceinline__ cudaError_t ResetFill() + __host__ __device__ __forceinline__ cudaError_t ResetFill(cudaStream_t stream = 0) { #if (CUB_PTX_ARCH > 0) + (void)stream; d_counters[FILL] = 0; return cudaSuccess; #else - return CubDebug(cudaMemset(d_counters + FILL, 0, sizeof(Offset))); + return CubDebug(cudaMemsetAsync(d_counters + FILL, 0, sizeof(OffsetT), stream)); #endif } /// Returns the fill-size established by the parent or by the previous kernel. 
__host__ __device__ __forceinline__ cudaError_t FillSize( - Offset &fill_size, + OffsetT &fill_size, cudaStream_t stream = 0) { #if (CUB_PTX_ARCH > 0) + (void)stream; fill_size = d_counters[FILL]; return cudaSuccess; #else - return CubDebug(cudaMemcpyAsync(&fill_size, d_counters + FILL, sizeof(Offset), cudaMemcpyDeviceToHost, stream)); + return CubDebug(cudaMemcpyAsync(&fill_size, d_counters + FILL, sizeof(OffsetT), cudaMemcpyDeviceToHost, stream)); #endif } - /// Drain num_items. Returns offset from which to read items. - __device__ __forceinline__ Offset Drain(Offset num_items) + /// Drain \p num_items from the queue. Returns offset from which to read items. To be called from CUDA kernel. + __device__ __forceinline__ OffsetT Drain(OffsetT num_items) { return atomicAdd(d_counters + DRAIN, num_items); } - /// Fill num_items. Returns offset from which to write items. - __device__ __forceinline__ Offset Fill(Offset num_items) + /// Fill \p num_items into the queue. Returns offset from which to write items. To be called from CUDA kernel. + __device__ __forceinline__ OffsetT Fill(OffsetT num_items) { return atomicAdd(d_counters + FILL, num_items); } @@ -195,10 +199,10 @@ public: /** * Reset grid queue (call with 1 block of 1 thread) */ -template +template __global__ void FillAndResetDrainKernel( - GridQueue grid_queue, - Offset num_items) + GridQueue grid_queue, + OffsetT num_items) { grid_queue.FillAndResetDrain(num_items); } diff --git a/SRC/cub/host/mutex.cuh b/SRC/cub/host/mutex.cuh new file mode 100644 index 00000000..ff7ec90d --- /dev/null +++ b/SRC/cub/host/mutex.cuh @@ -0,0 +1,171 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
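A device-side sketch of the drain protocol these counters support (the kernel, tile size, and doubling workload are hypothetical, not from the CUB sources; the host is assumed to have called FillAndResetDrain(num_items) on the queue beforehand):

    #include <cub/grid/grid_queue.cuh>

    template <int TILE>
    __global__ void DrainKernel(cub::GridQueue<int> queue, const int* d_in, int* d_out, int num_items)
    {
        __shared__ int tile_offset;
        while (true)
        {
            // One thread reserves the next tile via the atomic drain counter
            if (threadIdx.x == 0)
                tile_offset = queue.Drain(TILE);
            __syncthreads();

            if (tile_offset >= num_items)
                break;                              // queue exhausted; all threads agree

            int i = tile_offset + threadIdx.x;
            if (threadIdx.x < TILE && i < num_items)
                d_out[i] = 2 * d_in[i];
            __syncthreads();                        // protect tile_offset before the next Drain
        }
    }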
+ * + ******************************************************************************/ + +/** + * \file + * Simple portable mutex + */ + + +#pragma once + +#if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800) + #include +#else + #if defined(_WIN32) || defined(_WIN64) + #include + + #define WIN32_LEAN_AND_MEAN + #define NOMINMAX + #include + #undef WIN32_LEAN_AND_MEAN + #undef NOMINMAX + + /** + * Compiler read/write barrier + */ + #pragma intrinsic(_ReadWriteBarrier) + + #endif +#endif + +#include "../util_namespace.cuh" + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * Simple portable mutex + * - Wraps std::mutex when compiled with C++11 or newer (supported on all platforms) + * - Uses GNU/Windows spinlock mechanisms for pre C++11 (supported on x86/x64 when compiled with cl.exe or g++) + */ +struct Mutex +{ +#if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800) + + std::mutex mtx; + + void Lock() + { + mtx.lock(); + } + + void Unlock() + { + mtx.unlock(); + } + + void TryLock() + { + mtx.try_lock(); + } + +#else //__cplusplus > 199711L + + #if defined(_MSC_VER) + + // Microsoft VC++ + typedef long Spinlock; + + #else + + // GNU g++ + typedef int Spinlock; + + /** + * Compiler read/write barrier + */ + __forceinline__ void _ReadWriteBarrier() + { + __sync_synchronize(); + } + + /** + * Atomic exchange + */ + __forceinline__ long _InterlockedExchange(volatile int * const Target, const int Value) + { + // NOTE: __sync_lock_test_and_set would be an acquire barrier, so we force a full barrier + _ReadWriteBarrier(); + return __sync_lock_test_and_set(Target, Value); + } + + /** + * Pause instruction to prevent excess processor bus usage + */ + __forceinline__ void YieldProcessor() + { + } + + #endif // defined(_MSC_VER) + + /// Lock member + volatile Spinlock lock; + + /** + * Constructor + */ + Mutex() : lock(0) {} + + /** + * Return when the specified spinlock has been acquired + */ + __forceinline__ void Lock() + { + while (1) + { + if (!_InterlockedExchange(&lock, 1)) return; + while (lock) YieldProcessor(); + } + } + + + /** + * Release the specified spinlock + */ + __forceinline__ void Unlock() + { + _ReadWriteBarrier(); + lock = 0; + } + +#endif // __cplusplus > 199711L + +}; + + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/SRC/cub/host/spinlock.cuh b/SRC/cub/host/spinlock.cuh deleted file mode 100644 index 6e4b47c7..00000000 --- a/SRC/cub/host/spinlock.cuh +++ /dev/null @@ -1,123 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. 
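Host-side usage of the new Mutex is the usual lock/unlock bracket around a critical section; a short sketch with hypothetical names:

    #include <cub/host/mutex.cuh>

    static cub::Mutex g_mutex;
    static long       g_hits = 0;

    // Safe to call concurrently from multiple host threads
    void record_hit()
    {
        g_mutex.Lock();
        ++g_hits;           // critical section
        g_mutex.Unlock();
    }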
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Simple x86/x64 atomic spinlock, portable across MS Windows (cl.exe) & Linux (g++) - */ - - -#pragma once - -#if defined(_WIN32) || defined(_WIN64) - #include - #include - #undef small // Windows is terrible for polluting macro namespace - - /** - * Compiler read/write barrier - */ - #pragma intrinsic(_ReadWriteBarrier) - -#endif - -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -#if defined(_MSC_VER) - - // Microsoft VC++ - typedef long Spinlock; - -#else - - // GNU g++ - typedef int Spinlock; - - /** - * Compiler read/write barrier - */ - __forceinline__ void _ReadWriteBarrier() - { - __sync_synchronize(); - } - - /** - * Atomic exchange - */ - __forceinline__ long _InterlockedExchange(volatile int * const Target, const int Value) - { - // NOTE: __sync_lock_test_and_set would be an acquire barrier, so we force a full barrier - _ReadWriteBarrier(); - return __sync_lock_test_and_set(Target, Value); - } - - /** - * Pause instruction to prevent excess processor bus usage - */ - __forceinline__ void YieldProcessor() - { -#ifndef __arm__ - asm volatile("pause\n": : :"memory"); -#endif // __arm__ - } - -#endif // defined(_MSC_VER) - -/** - * Return when the specified spinlock has been acquired - */ -__forceinline__ void Lock(volatile Spinlock *lock) -{ - while (1) - { - if (!_InterlockedExchange(lock, 1)) return; - while (*lock) YieldProcessor(); - } -} - - -/** - * Release the specified spinlock - */ -__forceinline__ void Unlock(volatile Spinlock *lock) -{ - _ReadWriteBarrier(); - *lock = 0; -} - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/SRC/cub/iterator/arg_index_input_iterator.cuh b/SRC/cub/iterator/arg_index_input_iterator.cuh index 03b842d4..95a84a57 100644 --- a/SRC/cub/iterator/arg_index_input_iterator.cuh +++ b/SRC/cub/iterator/arg_index_input_iterator.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -62,12 +62,12 @@ namespace cub { /** - * \brief A random-access input wrapper for pairing dereferenced values with their corresponding indices (forming \p ItemOffsetPair tuples). 
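Since the rewritten wrapper in the hunk below yields KeyValuePair tuples (key = index, value = item), a tiny host-side sketch of the new field names may help (illustrative only; compiled with nvcc):

    #include <cstdio>
    #include <cub/iterator/arg_index_input_iterator.cuh>
    #include <cub/util_type.cuh>

    int main()
    {
        double data[4] = {8.0, 6.0, 7.0, 5.0};

        // Each dereference pairs an item with its offset
        cub::ArgIndexInputIterator<double*> itr(data);
        for (int i = 0; i < 4; ++i)
        {
            cub::KeyValuePair<ptrdiff_t, double> kv = itr[i];
            printf("%f @ %d\n", kv.value, (int) kv.key);    // e.g. "5.000000 @ 3"
        }
        return 0;
    }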
+ * \brief A random-access input wrapper for pairing dereferenced values with their corresponding indices (forming \p KeyValuePair tuples). * * \par Overview - * - ArgIndexInputIterator wraps a random access input iterator \p itr of type \p InputIterator. - * Dereferencing an ArgIndexInputIterator at offset \p i produces a \p ItemOffsetPair value whose - * \p offset field is \p i and whose \p item field is itr[i]. + * - ArgIndexInputIteratorTwraps a random access input iterator \p itr of type \p InputIteratorT. + * Dereferencing an ArgIndexInputIteratorTat offset \p i produces a \p KeyValuePair value whose + * \p key field is \p i and whose \p value field is itr[i]. * - Can be used with any data type. * - Can be constructed, manipulated, and exchanged within and between host and device * functions. Wrapped host memory can only be dereferenced on the host, and wrapped @@ -75,7 +75,7 @@ namespace cub { * - Compatible with Thrust API v1.7 or newer. * * \par Snippet - * The code snippet below illustrates the use of \p ArgIndexInputIterator to + * The code snippet below illustrates the use of \p ArgIndexInputIteratorTto * dereference an array of doubles * \par * \code @@ -89,41 +89,37 @@ namespace cub { * * // Within device code: * typedef typename cub::ArgIndexInputIterator::value_type Tuple; - * Tuple item_offset_pair.offset = *itr; + * Tuple item_offset_pair.key = *itr; * printf("%f @ %d\n", - * item_offset_pair.value, - * item_offset_pair.offset); // 8.0 @ 0 + * item_offset_pair.value, + * item_offset_pair.key); // 8.0 @ 0 * * itr = itr + 6; - * item_offset_pair.offset = *itr; + * item_offset_pair.key = *itr; * printf("%f @ %d\n", - * item_offset_pair.value, - * item_offset_pair.offset); // 9.0 @ 6 + * item_offset_pair.value, + * item_offset_pair.key); // 9.0 @ 6 * * \endcode * - * \tparam InputIterator The type of the wrapped input iterator - * \tparam Offset The difference type of this iterator (Default: \p ptrdiff_t) + * \tparam InputIteratorT The value type of the wrapped input iterator + * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + * \tparam OutputValueT The paired value type of the tuple (Default: value type of input iterator) */ template < - typename InputIterator, - typename Offset = ptrdiff_t> + typename InputIteratorT, + typename OffsetT = ptrdiff_t, + typename OutputValueT = typename std::iterator_traits::value_type> class ArgIndexInputIterator { -private: - - // Data type of input iterator - typedef typename std::iterator_traits::value_type T; - public: - // Required iterator traits - typedef ArgIndexInputIterator self_type; ///< My own type - typedef Offset difference_type; ///< Type to express the result of subtracting one iterator from another - typedef ItemOffsetPair value_type; ///< The type of the element the iterator can point to - typedef value_type* pointer; ///< The type of a pointer to an element the iterator can point to - typedef value_type reference; ///< The type of a reference to an element the iterator can point to + typedef ArgIndexInputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef KeyValuePair value_type; ///< The type of the element the iterator can point to + typedef value_type* pointer; ///< The type of a pointer to an element the iterator can point to + typedef value_type reference; ///< The type of a reference to an element the iterator can point to #if (THRUST_VERSION >= 100700) // Use Thrust's iterator 
categories so we can use these iterators in Thrust 1.7 (or newer) methods @@ -139,15 +135,15 @@ public: private: - InputIterator itr; + InputIteratorT itr; difference_type offset; public: /// Constructor __host__ __device__ __forceinline__ ArgIndexInputIterator( - InputIterator itr, ///< Input iterator to wrap - difference_type offset = 0) ///< Offset (in items) from \p itr denoting the position of the iterator + InputIteratorT itr, ///< Input iterator to wrap + difference_type offset = 0) ///< OffsetT (in items) from \p itr denoting the position of the iterator : itr(itr), offset(offset) @@ -173,7 +169,7 @@ public: { value_type retval; retval.value = itr[offset]; - retval.offset = offset; + retval.key = offset; return retval; } @@ -219,7 +215,8 @@ public: template __host__ __device__ __forceinline__ reference operator[](Distance n) const { - return *(*this + n); + self_type offset = (*this) + n; + return *offset; } /// Structure dereference @@ -240,8 +237,15 @@ public: return ((itr != rhs.itr) || (offset != rhs.offset)); } + /// Normalize + __host__ __device__ __forceinline__ void normalize() + { + itr += offset; + offset = 0; + } + /// ostream operator - friend std::ostream& operator<<(std::ostream& os, const self_type& itr) + friend std::ostream& operator<<(std::ostream& os, const self_type& /*itr*/) { return os; } diff --git a/SRC/cub/iterator/cache_modified_input_iterator.cuh b/SRC/cub/iterator/cache_modified_input_iterator.cuh index 16ba3a4a..b4ad91e2 100644 --- a/SRC/cub/iterator/cache_modified_input_iterator.cuh +++ b/SRC/cub/iterator/cache_modified_input_iterator.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -66,7 +66,7 @@ namespace cub { * \brief A random-access input wrapper for dereferencing array values using a PTX cache load modifier. * * \par Overview - * - CacheModifiedInputIterator is a random-access input iterator that wraps a native + * - CacheModifiedInputIteratorTis a random-access input iterator that wraps a native * device pointer of type ValueType*. \p ValueType references are * made by reading \p ValueType values through loads modified by \p MODIFIER. * - Can be used to load any data type from memory using PTX cache load modifiers (e.g., "LOAD_LDG", @@ -76,7 +76,7 @@ namespace cub { * - Compatible with Thrust API v1.7 or newer. * * \par Snippet - * The code snippet below illustrates the use of \p CacheModifiedInputIterator to + * The code snippet below illustrates the use of \p CacheModifiedInputIteratorTto * dereference a device array of double using the "ldg" PTX load modifier * (i.e., load values through texture cache). 
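A kernel-side sketch of the wrapper (hypothetical kernel, not from the CUB sources; note that after this change a const-qualified pointer is accepted directly and dereferences are device-only):

    #include <cub/iterator/cache_modified_input_iterator.cuh>

    // Reads of d_in are issued through the read-only (LDG) cache on sm_35+
    __global__ void ScaleKernel(const double* d_in, double* d_out, int n, double alpha)
    {
        cub::CacheModifiedInputIterator<cub::LOAD_LDG, double> in(d_in);

        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
            d_out[i] = alpha * in[i];
    }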
* \par @@ -98,19 +98,19 @@ namespace cub { * * \tparam CacheLoadModifier The cub::CacheLoadModifier to use when accessing data * \tparam ValueType The value type of this iterator - * \tparam Offset The difference type of this iterator (Default: \p ptrdiff_t) + * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) */ template < CacheLoadModifier MODIFIER, typename ValueType, - typename Offset = ptrdiff_t> + typename OffsetT = ptrdiff_t> class CacheModifiedInputIterator { public: // Required iterator traits typedef CacheModifiedInputIterator self_type; ///< My own type - typedef Offset difference_type; ///< Type to express the result of subtracting one iterator from another + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another typedef ValueType value_type; ///< The type of the element the iterator can point to typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to typedef ValueType reference; ///< The type of a reference to an element the iterator can point to @@ -128,17 +128,17 @@ public: #endif // THRUST_VERSION -private: +public: + /// Wrapped native pointer ValueType* ptr; -public: - /// Constructor + template __host__ __device__ __forceinline__ CacheModifiedInputIterator( - ValueType* ptr) ///< Native pointer to wrap + QualifiedValueType* ptr) ///< Native pointer to wrap : - ptr(ptr) + ptr(const_cast::Type *>(ptr)) {} /// Postfix increment @@ -157,7 +157,7 @@ public: } /// Indirection - __host__ __device__ __forceinline__ reference operator*() const + __device__ __forceinline__ reference operator*() const { return ThreadLoad(ptr); } @@ -202,13 +202,13 @@ public: /// Array subscript template - __host__ __device__ __forceinline__ reference operator[](Distance n) const + __device__ __forceinline__ reference operator[](Distance n) const { return ThreadLoad(ptr + n); } /// Structure dereference - __host__ __device__ __forceinline__ pointer operator->() + __device__ __forceinline__ pointer operator->() { return &ThreadLoad(ptr); } @@ -226,7 +226,7 @@ public: } /// ostream operator - friend std::ostream& operator<<(std::ostream& os, const self_type& itr) + friend std::ostream& operator<<(std::ostream& os, const self_type& /*itr*/) { return os; } diff --git a/SRC/cub/iterator/cache_modified_output_iterator.cuh b/SRC/cub/iterator/cache_modified_output_iterator.cuh index 179ce146..c3e3321d 100644 --- a/SRC/cub/iterator/cache_modified_output_iterator.cuh +++ b/SRC/cub/iterator/cache_modified_output_iterator.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -100,12 +100,12 @@ namespace cub { * * \tparam CacheStoreModifier The cub::CacheStoreModifier to use when accessing data * \tparam ValueType The value type of this iterator - * \tparam Offset The difference type of this iterator (Default: \p ptrdiff_t) + * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) */ template < CacheStoreModifier MODIFIER, typename ValueType, - typename Offset = ptrdiff_t> + typename OffsetT = ptrdiff_t> class CacheModifiedOutputIterator { private: @@ -119,7 +119,7 @@ private: __host__ __device__ __forceinline__ Reference(ValueType* ptr) : ptr(ptr) {} /// Assignment - __host__ __device__ __forceinline__ ValueType operator =(ValueType val) + __device__ __forceinline__ ValueType operator =(ValueType val) { ThreadStore(ptr, val); return val; @@ -130,9 +130,9 @@ public: // Required iterator traits typedef CacheModifiedOutputIterator self_type; ///< My own type - typedef Offset difference_type; ///< Type to express the result of subtracting one iterator from another - typedef ValueType value_type; ///< The type of the element the iterator can point to - typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef void value_type; ///< The type of the element the iterator can point to + typedef void pointer; ///< The type of a pointer to an element the iterator can point to typedef Reference reference; ///< The type of a reference to an element the iterator can point to #if (THRUST_VERSION >= 100700) @@ -154,10 +154,11 @@ private: public: /// Constructor + template __host__ __device__ __forceinline__ CacheModifiedOutputIterator( - ValueType* ptr) ///< Native pointer to wrap + QualifiedValueType* ptr) ///< Native pointer to wrap : - ptr(ptr) + ptr(const_cast::Type *>(ptr)) {} /// Postfix increment diff --git a/SRC/cub/iterator/constant_input_iterator.cuh b/SRC/cub/iterator/constant_input_iterator.cuh index 4c386a6b..1e0a9104 100644 --- a/SRC/cub/iterator/constant_input_iterator.cuh +++ b/SRC/cub/iterator/constant_input_iterator.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -64,7 +64,7 @@ namespace cub { * \brief A random-access input generator for dereferencing a sequence of homogeneous values * * \par Overview - * - Read references to a ConstantInputIterator iterator always return the supplied constant + * - Read references to a ConstantInputIteratorTiterator always return the supplied constant * of type \p ValueType. * - Can be used with any data type. * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device @@ -72,7 +72,7 @@ namespace cub { * - Compatible with Thrust API v1.7 or newer. * * \par Snippet - * The code snippet below illustrates the use of \p ConstantInputIterator to + * The code snippet below illustrates the use of \p ConstantInputIteratorTto * dereference a sequence of homogeneous doubles. 
* \par * \code @@ -88,18 +88,18 @@ namespace cub { * \endcode * * \tparam ValueType The value type of this iterator - * \tparam Offset The difference type of this iterator (Default: \p ptrdiff_t) + * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) */ template < typename ValueType, - typename Offset = ptrdiff_t> + typename OffsetT = ptrdiff_t> class ConstantInputIterator { public: // Required iterator traits typedef ConstantInputIterator self_type; ///< My own type - typedef Offset difference_type; ///< Type to express the result of subtracting one iterator from another + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another typedef ValueType value_type; ///< The type of the element the iterator can point to typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to typedef ValueType reference; ///< The type of a reference to an element the iterator can point to @@ -119,9 +119,9 @@ public: private: ValueType val; - Offset offset; + OffsetT offset; #ifdef _WIN32 - Offset pad[CUB_MAX(1, (16 / sizeof(Offset) - 1))]; // Workaround for win32 parameter-passing bug (ulonglong2 argmin DeviceReduce) + OffsetT pad[CUB_MAX(1, (16 / sizeof(OffsetT) - 1))]; // Workaround for win32 parameter-passing bug (ulonglong2 argmin DeviceReduce) #endif public: @@ -129,7 +129,7 @@ public: /// Constructor __host__ __device__ __forceinline__ ConstantInputIterator( ValueType val, ///< Starting value for the iterator instance to report - Offset offset = 0) ///< Base offset + OffsetT offset = 0) ///< Base offset : val(val), offset(offset) @@ -196,7 +196,7 @@ public: /// Array subscript template - __host__ __device__ __forceinline__ reference operator[](Distance n) const + __host__ __device__ __forceinline__ reference operator[](Distance /*n*/) const { return val; } diff --git a/SRC/cub/iterator/counting_input_iterator.cuh b/SRC/cub/iterator/counting_input_iterator.cuh index 7c6320f9..7f49348d 100644 --- a/SRC/cub/iterator/counting_input_iterator.cuh +++ b/SRC/cub/iterator/counting_input_iterator.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -63,14 +63,14 @@ namespace cub { * \brief A random-access input generator for dereferencing a sequence of incrementing integer values. * * \par Overview - * - After initializing a CountingInputIterator to a certain integer \p base, read references + * - After initializing a CountingInputIteratorTto a certain integer \p base, read references * at \p offset will return the value \p base + \p offset. * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device * functions. * - Compatible with Thrust API v1.7 or newer. * * \par Snippet - * The code snippet below illustrates the use of \p CountingInputIterator to + * The code snippet below illustrates the use of \p CountingInputIteratorTto * dereference a sequence of incrementing integers. 
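A host-side sketch of this iterator feeding a device-wide reduction (hypothetical helper; it assumes only the standard two-phase CUB calling convention and cub::DeviceReduce::Sum):

    #include <cuda_runtime.h>
    #include <cub/device/device_reduce.cuh>
    #include <cub/iterator/counting_input_iterator.cuh>

    // Sums 0 + 1 + ... + (n-1) without materializing the sequence in memory
    void sum_iota(long long* d_result, int n)
    {
        cub::CountingInputIterator<long long> first(0);

        void*  d_temp = NULL;
        size_t bytes  = 0;
        cub::DeviceReduce::Sum(d_temp, bytes, first, d_result, n);   // size query
        cudaMalloc(&d_temp, bytes);
        cub::DeviceReduce::Sum(d_temp, bytes, first, d_result, n);   // reduction
        cudaFree(d_temp);
    }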
* \par * \code @@ -86,18 +86,18 @@ namespace cub { * \endcode * * \tparam ValueType The value type of this iterator - * \tparam Offset The difference type of this iterator (Default: \p ptrdiff_t) + * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) */ template < typename ValueType, - typename Offset = ptrdiff_t> + typename OffsetT = ptrdiff_t> class CountingInputIterator { public: // Required iterator traits typedef CountingInputIterator self_type; ///< My own type - typedef Offset difference_type; ///< Type to express the result of subtracting one iterator from another + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another typedef ValueType value_type; ///< The type of the element the iterator can point to typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to typedef ValueType reference; ///< The type of a reference to an element the iterator can point to @@ -152,7 +152,7 @@ public: template __host__ __device__ __forceinline__ self_type operator+(Distance n) const { - self_type retval(val + n); + self_type retval(val + (ValueType) n); return retval; } @@ -160,7 +160,7 @@ public: template __host__ __device__ __forceinline__ self_type& operator+=(Distance n) { - val += n; + val += (ValueType) n; return *this; } @@ -168,7 +168,7 @@ public: template __host__ __device__ __forceinline__ self_type operator-(Distance n) const { - self_type retval(val - n); + self_type retval(val - (ValueType) n); return retval; } @@ -183,14 +183,14 @@ public: /// Distance __host__ __device__ __forceinline__ difference_type operator-(self_type other) const { - return val - other.val; + return (difference_type) (val - other.val); } /// Array subscript template __host__ __device__ __forceinline__ reference operator[](Distance n) const { - return val + n; + return val + (ValueType) n; } /// Structure dereference diff --git a/SRC/cub/iterator/discard_output_iterator.cuh b/SRC/cub/iterator/discard_output_iterator.cuh new file mode 100644 index 00000000..28473e5f --- /dev/null +++ b/SRC/cub/iterator/discard_output_iterator.cuh @@ -0,0 +1,220 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../util_namespace.cuh" +#include "../util_macro.cuh" + +#if (THRUST_VERSION >= 100700) + // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilIterator + * @{ + */ + + +/** + * \brief A discard iterator + */ +template +class DiscardOutputIterator +{ +public: + + // Required iterator traits + typedef DiscardOutputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef void value_type; ///< The type of the element the iterator can point to + typedef void pointer; ///< The type of a pointer to an element the iterator can point to + typedef void reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::any_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + +private: + + OffsetT offset; + +#if defined(_WIN32) || !defined(_WIN64) + // Workaround for win32 parameter-passing bug (ulonglong2 argmin DeviceReduce) + OffsetT pad[CUB_MAX(1, (16 / sizeof(OffsetT) - 1))]; +#endif + +public: + + /// Constructor + __host__ __device__ __forceinline__ DiscardOutputIterator( + OffsetT offset = 0) ///< Base offset + : + offset(offset) + {} + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + offset++; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + offset++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ self_type& operator*() + { + // return self reference, which can be assigned to anything + return *this; + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval(offset + n); + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + offset += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval(offset - n); + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) + { + offset -= n; + return *this; + } + + /// Distance + __host__ 
__device__ __forceinline__ difference_type operator-(self_type other) const + { + return offset - other.offset; + } + + /// Array subscript + template + __host__ __device__ __forceinline__ self_type& operator[](Distance n) + { + // return self reference, which can be assigned to anything + return *this; + } + + /// Structure dereference + __host__ __device__ __forceinline__ pointer operator->() + { + return; + } + + /// Assignment to self (no-op) + __host__ __device__ __forceinline__ void operator=(self_type const& other) + { + offset = other.offset; + } + + /// Assignment to anything else (no-op) + template + __host__ __device__ __forceinline__ void operator=(T const&) + {} + + /// Cast to void* operator + __host__ __device__ __forceinline__ operator void*() const { return NULL; } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return (offset == rhs.offset); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return (offset != rhs.offset); + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& itr) + { + os << "[" << itr.offset << "]"; + return os; + } + +}; + + +/** @} */ // end group UtilIterator + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/iterator/tex_obj_input_iterator.cuh b/SRC/cub/iterator/tex_obj_input_iterator.cuh index be5c79c1..b99103ec 100644 --- a/SRC/cub/iterator/tex_obj_input_iterator.cuh +++ b/SRC/cub/iterator/tex_obj_input_iterator.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -66,18 +66,18 @@ namespace cub { * \brief A random-access input wrapper for dereferencing array values through texture cache. Uses newer Kepler-style texture objects. * * \par Overview - * - TexObjInputIterator wraps a native device pointer of type ValueType*. References + * - TexObjInputIteratorTwraps a native device pointer of type ValueType*. References * to elements are to be loaded through texture cache. * - Can be used to load any data type from memory through texture cache. * - Can be manipulated and exchanged within and between host and device * functions, can only be constructed within host functions, and can only be * dereferenced within device functions. - * - With regard to nested/dynamic parallelism, TexObjInputIterator iterators may only be + * - With regard to nested/dynamic parallelism, TexObjInputIteratorTiterators may only be * created by the host thread, but can be used by any descendant kernel. * - Compatible with Thrust API v1.7 or newer. * * \par Snippet - * The code snippet below illustrates the use of \p TexRefInputIterator to + * The code snippet below illustrates the use of \p TexRefInputIteratorTto * dereference a device array of doubles through texture cache. 
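// --- Editor's aside (not part of the patch): a quick host-side sketch of the
// DiscardOutputIterator introduced a few hunks up. It is a write-only sink:
// operator*() returns the iterator itself and the templated operator= swallows
// whatever is assigned, so it can stand in wherever an algorithm demands an
// output iterator whose values are not needed.
#include "cub/iterator/discard_output_iterator.cuh"

int main()
{
    cub::DiscardOutputIterator<> d_out;
    *d_out = 42;     // accepted and discarded
    d_out += 10;     // only the internal offset advances (handy for debug prints)
    return 0;
}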
* \par * \code @@ -103,18 +103,18 @@ namespace cub { * \endcode * * \tparam T The value type of this iterator - * \tparam Offset The difference type of this iterator (Default: \p ptrdiff_t) + * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) */ template < typename T, - typename Offset = ptrdiff_t> + typename OffsetT = ptrdiff_t> class TexObjInputIterator { public: // Required iterator traits typedef TexObjInputIterator self_type; ///< My own type - typedef Offset difference_type; ///< Type to express the result of subtracting one iterator from another + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another typedef T value_type; ///< The type of the element the iterator can point to typedef T* pointer; ///< The type of a pointer to an element the iterator can point to typedef T reference; ///< The type of a reference to an element the iterator can point to @@ -158,12 +158,13 @@ public: {} /// Use this iterator to bind \p ptr with a texture reference + template cudaError_t BindTexture( - T *ptr, ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment - size_t bytes, ///< Number of bytes in the range - size_t tex_offset = 0) ///< Offset (in items) from \p ptr denoting the position of the iterator + QualifiedT *ptr, ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment + size_t bytes = size_t(-1), ///< Number of bytes in the range + size_t tex_offset = 0) ///< OffsetT (in items) from \p ptr denoting the position of the iterator { - this->ptr = ptr; + this->ptr = const_cast::Type *>(ptr); this->tex_offset = tex_offset; cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc(); @@ -172,7 +173,7 @@ public: memset(&res_desc, 0, sizeof(cudaResourceDesc)); memset(&tex_desc, 0, sizeof(cudaTextureDesc)); res_desc.resType = cudaResourceTypeLinear; - res_desc.res.linear.devPtr = ptr; + res_desc.res.linear.devPtr = this->ptr; res_desc.res.linear.desc = channel_desc; res_desc.res.linear.sizeInBytes = bytes; tex_desc.readMode = cudaReadModeElementType; @@ -271,7 +272,8 @@ public: template __host__ __device__ __forceinline__ reference operator[](Distance n) const { - return *(*this + n); + self_type offset = (*this) + n; + return *offset; } /// Structure dereference diff --git a/SRC/cub/iterator/tex_ref_input_iterator.cuh b/SRC/cub/iterator/tex_ref_input_iterator.cuh index c1102af5..95d0ffbc 100644 --- a/SRC/cub/iterator/tex_ref_input_iterator.cuh +++ b/SRC/cub/iterator/tex_ref_input_iterator.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
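// --- Editor's aside (not part of the patch): BindTexture above now accepts
// const-qualified pointers (the QualifiedT template) and defaults the byte count.
// A sketch of the intended workflow; d_in and num_items are illustrative, and
// UnbindTexture is the pre-existing counterpart not shown in this hunk:
#include "cub/iterator/tex_obj_input_iterator.cuh"

cudaError_t ReadThroughTexture(const double *d_in, int num_items)
{
    cub::TexObjInputIterator<double> itr;
    cudaError_t error = itr.BindTexture(d_in, sizeof(double) * num_items);
    if (error != cudaSuccess) return error;
    // ... launch kernels that dereference itr[i] through the texture cache ...
    return itr.UnbindTexture();
}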
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -91,13 +91,13 @@ struct IteratorTexRef static TexRef ref; /// Bind texture - static cudaError_t BindTexture(void *d_in) + static cudaError_t BindTexture(void *d_in, size_t &offset) { if (d_in) { cudaChannelFormatDesc tex_desc = cudaCreateChannelDesc(); ref.channelDesc = tex_desc; - return (CubDebug(cudaBindTexture(NULL, ref, d_in))); + return (CubDebug(cudaBindTexture(&offset, ref, d_in))); } return cudaSuccess; @@ -151,24 +151,24 @@ typename IteratorTexRef::template TexId::TexRef IteratorTexRef: * \brief A random-access input wrapper for dereferencing array values through texture cache. Uses older Tesla/Fermi-style texture references. * * \par Overview - * - TexRefInputIterator wraps a native device pointer of type ValueType*. References + * - TexRefInputIteratorTwraps a native device pointer of type ValueType*. References * to elements are to be loaded through texture cache. * - Can be used to load any data type from memory through texture cache. * - Can be manipulated and exchanged within and between host and device * functions, can only be constructed within host functions, and can only be * dereferenced within device functions. * - The \p UNIQUE_ID template parameter is used to statically name the underlying texture - * reference. Only one TexRefInputIterator instance can be bound at any given time for a + * reference. Only one TexRefInputIteratorTinstance can be bound at any given time for a * specific combination of (1) data type \p T, (2) \p UNIQUE_ID, (3) host * thread, and (4) compilation .o unit. - * - With regard to nested/dynamic parallelism, TexRefInputIterator iterators may only be + * - With regard to nested/dynamic parallelism, TexRefInputIteratorTiterators may only be * created by the host thread and used by a top-level kernel (i.e. the one which is launched * from the host). * - Compatible with Thrust API v1.7 or newer. * - Compatible with CUDA toolkit v5.5 or newer. * * \par Snippet - * The code snippet below illustrates the use of \p TexRefInputIterator to + * The code snippet below illustrates the use of \p TexRefInputIteratorTto * dereference a device array of doubles through texture cache. 
* \par * \code @@ -195,19 +195,19 @@ typename IteratorTexRef::template TexId::TexRef IteratorTexRef: * * \tparam T The value type of this iterator * \tparam UNIQUE_ID A globally-unique identifier (within the compilation unit) to name the underlying texture reference - * \tparam Offset The difference type of this iterator (Default: \p ptrdiff_t) + * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) */ template < typename T, int UNIQUE_ID, - typename Offset = ptrdiff_t> + typename OffsetT = ptrdiff_t> class TexRefInputIterator { public: // Required iterator traits typedef TexRefInputIterator self_type; ///< My own type - typedef Offset difference_type; ///< Type to express the result of subtracting one iterator from another + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another typedef T value_type; ///< The type of the element the iterator can point to typedef T* pointer; ///< The type of a pointer to an element the iterator can point to typedef T reference; ///< The type of a reference to an element the iterator can point to @@ -233,23 +233,26 @@ private: typedef typename IteratorTexRef::template TexId TexId; public: - +/* /// Constructor __host__ __device__ __forceinline__ TexRefInputIterator() : ptr(NULL), tex_offset(0) {} - +*/ /// Use this iterator to bind \p ptr with a texture reference + template cudaError_t BindTexture( - T *ptr, ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment - size_t bytes, ///< Number of bytes in the range - size_t tex_offset = 0) ///< Offset (in items) from \p ptr denoting the position of the iterator + QualifiedT *ptr, ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment + size_t bytes = size_t(-1), ///< Number of bytes in the range + size_t tex_offset = 0) ///< OffsetT (in items) from \p ptr denoting the position of the iterator { - this->ptr = ptr; - this->tex_offset = tex_offset; - return TexId::BindTexture(ptr); + this->ptr = const_cast::Type *>(ptr); + size_t offset; + cudaError_t retval = TexId::BindTexture(this->ptr + tex_offset, offset); + this->tex_offset = (difference_type) (offset / sizeof(QualifiedT)); + return retval; } /// Unbind this iterator from its texture reference @@ -331,7 +334,8 @@ public: template __host__ __device__ __forceinline__ reference operator[](Distance n) const { - return *(*this + n); + self_type offset = (*this) + n; + return *offset; } /// Structure dereference diff --git a/SRC/cub/iterator/transform_input_iterator.cuh b/SRC/cub/iterator/transform_input_iterator.cuh index 90ffbaad..dad1f500 100644 --- a/SRC/cub/iterator/transform_input_iterator.cuh +++ b/SRC/cub/iterator/transform_input_iterator.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -64,8 +64,8 @@ namespace cub { * \brief A random-access input wrapper for transforming dereferenced values. 
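// --- Editor's aside (not part of the patch): the key fix above is that
// cudaBindTexture may bind at an address rounded down to the texture alignment
// and report the residual byte offset; BindTexture now folds that offset back
// into tex_offset instead of discarding it, so itr[0] reads the intended element.
// Sketch of the calling pattern (names illustrative; with the constructor
// commented out the iterator is left default-initialized until BindTexture):
#include "cub/iterator/tex_ref_input_iterator.cuh"

cudaError_t ReadThroughTexRef(const double *d_in, int num_items)
{
    cub::TexRefInputIterator<double, 0 /*UNIQUE_ID*/> itr;
    cudaError_t error = itr.BindTexture(d_in, sizeof(double) * num_items);
    if (error != cudaSuccess) return error;
    // ... kernel reads of itr[i] already account for the alignment offset ...
    return itr.UnbindTexture();
}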
* * \par Overview - * - TransformInputIterator wraps a unary conversion functor of type \p - * ConversionOp and a random-access input iterator of type InputIterator, + * - TransformInputIteratorTwraps a unary conversion functor of type \p + * ConversionOp and a random-access input iterator of type InputIteratorT, * using the former to produce references of type \p ValueType from the latter. * - Can be used with any data type. * - Can be constructed, manipulated, and exchanged within and between host and device @@ -74,7 +74,7 @@ namespace cub { * - Compatible with Thrust API v1.7 or newer. * * \par Snippet - * The code snippet below illustrates the use of \p TransformInputIterator to + * The code snippet below illustrates the use of \p TransformInputIteratorTto * dereference an array of integers, tripling the values and converting them to doubles. * \par * \code @@ -85,7 +85,7 @@ namespace cub { * { * __host__ __device__ __forceinline__ * double operator()(const int &a) const { - * return double(a * 2); + * return double(a * 3); * } * }; * @@ -105,22 +105,22 @@ namespace cub { * * \tparam ValueType The value type of this iterator * \tparam ConversionOp Unary functor type for mapping objects of type \p InputType to type \p ValueType. Must have member ValueType operator()(const InputType &datum). - * \tparam InputIterator The type of the wrapped input iterator - * \tparam Offset The difference type of this iterator (Default: \p ptrdiff_t) + * \tparam InputIteratorT The type of the wrapped input iterator + * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) * */ template < typename ValueType, typename ConversionOp, - typename InputIterator, - typename Offset = ptrdiff_t> + typename InputIteratorT, + typename OffsetT = ptrdiff_t> class TransformInputIterator { public: // Required iterator traits typedef TransformInputIterator self_type; ///< My own type - typedef Offset difference_type; ///< Type to express the result of subtracting one iterator from another + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another typedef ValueType value_type; ///< The type of the element the iterator can point to typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to typedef ValueType reference; ///< The type of a reference to an element the iterator can point to @@ -139,14 +139,14 @@ public: private: - ConversionOp conversion_op; - InputIterator input_itr; + ConversionOp conversion_op; + InputIteratorT input_itr; public: /// Constructor __host__ __device__ __forceinline__ TransformInputIterator( - InputIterator input_itr, ///< Input iterator to wrap + InputIteratorT input_itr, ///< Input iterator to wrap ConversionOp conversion_op) ///< Conversion functor to wrap : conversion_op(conversion_op), diff --git a/SRC/cub/thread/thread_load.cuh b/SRC/cub/thread/thread_load.cuh index 8e3790f5..b1ca412f 100644 --- a/SRC/cub/thread/thread_load.cuh +++ b/SRC/cub/thread/thread_load.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -102,12 +102,12 @@ enum CacheLoadModifier * \endcode * * \tparam MODIFIER [inferred] CacheLoadModifier enumeration - * \tparam InputIterator [inferred] Input iterator type \iterator + * \tparam InputIteratorT [inferred] Input iterator type \iterator */ template < CacheLoadModifier MODIFIER, - typename InputIterator> -__device__ __forceinline__ typename std::iterator_traits::value_type ThreadLoad(InputIterator itr); + typename InputIteratorT> +__device__ __forceinline__ typename std::iterator_traits::value_type ThreadLoad(InputIteratorT itr); //@} end member group @@ -121,17 +121,17 @@ template struct IterateThreadLoad { template - static __device__ __forceinline__ void Load(T *ptr, T *vals) + static __device__ __forceinline__ void Load(T const *ptr, T *vals) { vals[COUNT] = ThreadLoad(ptr + COUNT); IterateThreadLoad::template Load(ptr, vals); } - template - static __device__ __forceinline__ void Dereference(InputIterator ptr, T *vals) + template + static __device__ __forceinline__ void Dereference(InputIteratorT itr, T *vals) { - vals[COUNT] = ptr[COUNT]; - IterateThreadLoad::Dereference(ptr, vals); + vals[COUNT] = itr[COUNT]; + IterateThreadLoad::Dereference(itr, vals); } }; @@ -141,19 +141,19 @@ template struct IterateThreadLoad { template - static __device__ __forceinline__ void Load(T *ptr, T *vals) {} + static __device__ __forceinline__ void Load(T const * /*ptr*/, T * /*vals*/) {} - template - static __device__ __forceinline__ void Dereference(InputIterator ptr, T *vals) {} + template + static __device__ __forceinline__ void Dereference(InputIteratorT /*itr*/, T * /*vals*/) {} }; /** * Define a uint4 (16B) ThreadLoad specialization for the given Cache load modifier */ -#define CUB_LOAD_16(cub_modifier, ptx_modifier) \ +#define _CUB_LOAD_16(cub_modifier, ptx_modifier) \ template<> \ - __device__ __forceinline__ uint4 ThreadLoad(uint4* ptr) \ + __device__ __forceinline__ uint4 ThreadLoad(uint4 const *ptr) \ { \ uint4 retval; \ asm volatile ("ld."#ptx_modifier".v4.u32 {%0, %1, %2, %3}, [%4];" : \ @@ -165,7 +165,7 @@ struct IterateThreadLoad return retval; \ } \ template<> \ - __device__ __forceinline__ ulonglong2 ThreadLoad(ulonglong2* ptr) \ + __device__ __forceinline__ ulonglong2 ThreadLoad(ulonglong2 const *ptr) \ { \ ulonglong2 retval; \ asm volatile ("ld."#ptx_modifier".v2.u64 {%0, %1}, [%2];" : \ @@ -178,9 +178,9 @@ struct IterateThreadLoad /** * Define a uint2 (8B) ThreadLoad specialization for the given Cache load modifier */ -#define CUB_LOAD_8(cub_modifier, ptx_modifier) \ +#define _CUB_LOAD_8(cub_modifier, ptx_modifier) \ template<> \ - __device__ __forceinline__ ushort4 ThreadLoad(ushort4* ptr) \ + __device__ __forceinline__ ushort4 ThreadLoad(ushort4 const *ptr) \ { \ ushort4 retval; \ asm volatile ("ld."#ptx_modifier".v4.u16 {%0, %1, %2, %3}, [%4];" : \ @@ -192,7 +192,7 @@ struct IterateThreadLoad return retval; \ } \ template<> \ - __device__ __forceinline__ uint2 ThreadLoad(uint2* ptr) \ + __device__ __forceinline__ uint2 ThreadLoad(uint2 const *ptr) \ { \ uint2 retval; \ asm volatile ("ld."#ptx_modifier".v2.u32 {%0, %1}, [%2];" : \ @@ -202,7 +202,7 @@ struct IterateThreadLoad return retval; \ } \ template<> \ - __device__ __forceinline__ unsigned long long ThreadLoad(unsigned long long* ptr) \ + __device__ __forceinline__ unsigned long long ThreadLoad(unsigned long long const *ptr) \ { \ unsigned long long retval; \ asm 
volatile ("ld."#ptx_modifier".u64 %0, [%1];" : \ @@ -214,9 +214,9 @@ struct IterateThreadLoad /** * Define a uint (4B) ThreadLoad specialization for the given Cache load modifier */ -#define CUB_LOAD_4(cub_modifier, ptx_modifier) \ +#define _CUB_LOAD_4(cub_modifier, ptx_modifier) \ template<> \ - __device__ __forceinline__ unsigned int ThreadLoad(unsigned int* ptr) \ + __device__ __forceinline__ unsigned int ThreadLoad(unsigned int const *ptr) \ { \ unsigned int retval; \ asm volatile ("ld."#ptx_modifier".u32 %0, [%1];" : \ @@ -229,9 +229,9 @@ struct IterateThreadLoad /** * Define a unsigned short (2B) ThreadLoad specialization for the given Cache load modifier */ -#define CUB_LOAD_2(cub_modifier, ptx_modifier) \ +#define _CUB_LOAD_2(cub_modifier, ptx_modifier) \ template<> \ - __device__ __forceinline__ unsigned short ThreadLoad(unsigned short* ptr) \ + __device__ __forceinline__ unsigned short ThreadLoad(unsigned short const *ptr) \ { \ unsigned short retval; \ asm volatile ("ld."#ptx_modifier".u16 %0, [%1];" : \ @@ -244,9 +244,9 @@ struct IterateThreadLoad /** * Define an unsigned char (1B) ThreadLoad specialization for the given Cache load modifier */ -#define CUB_LOAD_1(cub_modifier, ptx_modifier) \ +#define _CUB_LOAD_1(cub_modifier, ptx_modifier) \ template<> \ - __device__ __forceinline__ unsigned char ThreadLoad(unsigned char* ptr) \ + __device__ __forceinline__ unsigned char ThreadLoad(unsigned char const *ptr) \ { \ unsigned short retval; \ asm volatile ( \ @@ -257,52 +257,62 @@ struct IterateThreadLoad "}" : \ "=h"(retval) : \ _CUB_ASM_PTR_(ptr)); \ - return (unsigned char) retval; \ + return (unsigned char) retval; \ } /** * Define powers-of-two ThreadLoad specializations for the given Cache load modifier */ -#define CUB_LOAD_ALL(cub_modifier, ptx_modifier) \ - CUB_LOAD_16(cub_modifier, ptx_modifier) \ - CUB_LOAD_8(cub_modifier, ptx_modifier) \ - CUB_LOAD_4(cub_modifier, ptx_modifier) \ - CUB_LOAD_2(cub_modifier, ptx_modifier) \ - CUB_LOAD_1(cub_modifier, ptx_modifier) \ +#define _CUB_LOAD_ALL(cub_modifier, ptx_modifier) \ + _CUB_LOAD_16(cub_modifier, ptx_modifier) \ + _CUB_LOAD_8(cub_modifier, ptx_modifier) \ + _CUB_LOAD_4(cub_modifier, ptx_modifier) \ + _CUB_LOAD_2(cub_modifier, ptx_modifier) \ + _CUB_LOAD_1(cub_modifier, ptx_modifier) \ /** * Define powers-of-two ThreadLoad specializations for the various Cache load modifiers */ #if CUB_PTX_ARCH >= 200 - CUB_LOAD_ALL(LOAD_CA, ca) - CUB_LOAD_ALL(LOAD_CG, cg) - CUB_LOAD_ALL(LOAD_CS, cs) - CUB_LOAD_ALL(LOAD_CV, cv) + _CUB_LOAD_ALL(LOAD_CA, ca) + _CUB_LOAD_ALL(LOAD_CG, cg) + _CUB_LOAD_ALL(LOAD_CS, cs) + _CUB_LOAD_ALL(LOAD_CV, cv) #else - CUB_LOAD_ALL(LOAD_CA, global) + _CUB_LOAD_ALL(LOAD_CA, global) // Use volatile to ensure coherent reads when this PTX is JIT'd to run on newer architectures with L1 - CUB_LOAD_ALL(LOAD_CG, volatile.global) - CUB_LOAD_ALL(LOAD_CS, global) - CUB_LOAD_ALL(LOAD_CV, volatile.global) + _CUB_LOAD_ALL(LOAD_CG, volatile.global) + _CUB_LOAD_ALL(LOAD_CS, global) + _CUB_LOAD_ALL(LOAD_CV, volatile.global) #endif #if CUB_PTX_ARCH >= 350 - CUB_LOAD_ALL(LOAD_LDG, global.nc) + _CUB_LOAD_ALL(LOAD_LDG, global.nc) #else - CUB_LOAD_ALL(LOAD_LDG, global) + _CUB_LOAD_ALL(LOAD_LDG, global) #endif +// Macro cleanup +#undef _CUB_LOAD_ALL +#undef _CUB_LOAD_1 +#undef _CUB_LOAD_2 +#undef _CUB_LOAD_4 +#undef _CUB_LOAD_8 +#undef _CUB_LOAD_16 + + + /** * ThreadLoad definition for LOAD_DEFAULT modifier on iterator types */ -template -__device__ __forceinline__ typename std::iterator_traits::value_type ThreadLoad( - 
InputIterator itr, - Int2Type modifier, - Int2Type is_pointer) +template +__device__ __forceinline__ typename std::iterator_traits::value_type ThreadLoad( + InputIteratorT itr, + Int2Type /*modifier*/, + Int2Type /*is_pointer*/) { return *itr; } @@ -314,8 +324,8 @@ __device__ __forceinline__ typename std::iterator_traits::value_t template __device__ __forceinline__ T ThreadLoad( T *ptr, - Int2Type modifier, - Int2Type is_pointer) + Int2Type /*modifier*/, + Int2Type /*is_pointer*/) { return *ptr; } @@ -327,14 +337,9 @@ __device__ __forceinline__ T ThreadLoad( template __device__ __forceinline__ T ThreadLoadVolatilePointer( T *ptr, - Int2Type is_primitive) + Int2Type /*is_primitive*/) { T retval = *reinterpret_cast(ptr); - -#if (CUB_PTX_ARCH <= 130) - if (sizeof(T) == 1) __threadfence_block(); -#endif - return retval; } @@ -345,17 +350,8 @@ __device__ __forceinline__ T ThreadLoadVolatilePointer( template __device__ __forceinline__ T ThreadLoadVolatilePointer( T *ptr, - Int2Type is_primitive) + Int2Type /*is_primitive*/) { - -#if CUB_PTX_ARCH <= 130 - - T retval = *ptr; - __threadfence_block(); - return retval; - -#else - typedef typename UnitWord::VolatileWord VolatileWord; // Word type for memcopying const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord); @@ -375,8 +371,6 @@ __device__ __forceinline__ T ThreadLoadVolatilePointer( reinterpret_cast(ptr), words); return retval; - -#endif // CUB_PTX_ARCH <= 130 } @@ -386,8 +380,8 @@ __device__ __forceinline__ T ThreadLoadVolatilePointer( template __device__ __forceinline__ T ThreadLoad( T *ptr, - Int2Type modifier, - Int2Type is_pointer) + Int2Type /*modifier*/, + Int2Type /*is_pointer*/) { // Apply tags for partial-specialization return ThreadLoadVolatilePointer(ptr, Int2Type::PRIMITIVE>()); @@ -399,9 +393,9 @@ __device__ __forceinline__ T ThreadLoad( */ template __device__ __forceinline__ T ThreadLoad( - T *ptr, - Int2Type modifier, - Int2Type is_pointer) + T const *ptr, + Int2Type /*modifier*/, + Int2Type /*is_pointer*/) { typedef typename UnitWord::DeviceWord DeviceWord; @@ -410,7 +404,7 @@ __device__ __forceinline__ T ThreadLoad( DeviceWord words[DEVICE_MULTIPLE]; IterateThreadLoad<0, DEVICE_MULTIPLE>::template Load( - reinterpret_cast(ptr), + reinterpret_cast(const_cast(ptr)), words); return *reinterpret_cast(words); @@ -422,14 +416,14 @@ __device__ __forceinline__ T ThreadLoad( */ template < CacheLoadModifier MODIFIER, - typename InputIterator> -__device__ __forceinline__ typename std::iterator_traits::value_type ThreadLoad(InputIterator itr) + typename InputIteratorT> +__device__ __forceinline__ typename std::iterator_traits::value_type ThreadLoad(InputIteratorT itr) { // Apply tags for partial-specialization return ThreadLoad( itr, Int2Type(), - Int2Type::VALUE>()); + Int2Type::VALUE>()); } diff --git a/SRC/cub/thread/thread_operators.cuh b/SRC/cub/thread/thread_operators.cuh index 75c96273..76cd800f 100644 --- a/SRC/cub/thread/thread_operators.cuh +++ b/SRC/cub/thread/thread_operators.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
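// --- Editor's aside (not part of the patch): a sketch of the ThreadLoad entry
// point whose specializations were const-qualified and macro-renamed above.
// LOAD_LDG routes the read through the read-only data cache on sm_35+; the
// kernel and pointer names are illustrative:
#include "cub/thread/thread_load.cuh"

__global__ void SumFour(const int *d_in, int *d_out)
{
    int sum = 0;
    #pragma unroll
    for (int i = 0; i < 4; ++i)
        sum += cub::ThreadLoad<cub::LOAD_LDG>(d_in + threadIdx.x * 4 + i);
    d_out[threadIdx.x] = sum;
}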
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -96,7 +96,7 @@ struct InequalityWrapper /// Boolean inequality operator, returns (a != b) template - __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const + __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) { return !op(a, b); } @@ -132,20 +132,22 @@ struct Max /** - * \brief Arg max functor (keeps the value and offset of the first occurrence of the l item) + * \brief Arg max functor (keeps the value and offset of the first occurrence of the larger item) */ struct ArgMax { /// Boolean max operator, preferring the item having the smaller offset in case of ties - template - __host__ __device__ __forceinline__ ItemOffsetPair operator()( - const ItemOffsetPair &a, - const ItemOffsetPair &b) const + template + __host__ __device__ __forceinline__ KeyValuePair operator()( + const KeyValuePair &a, + const KeyValuePair &b) const { - if (a.value == b.value) - return (b.offset < a.offset) ? b : a; +// Mooch BUG (device reduce argmax gk110 3.2 million random fp32) +// return ((b.value > a.value) || ((a.value == b.value) && (b.key < a.key))) ? b : a; - return (b.value > a.value) ? b : a; + if ((b.value > a.value) || ((a.value == b.value) && (b.key < a.key))) + return b; + return a; } }; @@ -170,15 +172,17 @@ struct Min struct ArgMin { /// Boolean min operator, preferring the item having the smaller offset in case of ties - template - __host__ __device__ __forceinline__ ItemOffsetPair operator()( - const ItemOffsetPair &a, - const ItemOffsetPair &b) const + template + __host__ __device__ __forceinline__ KeyValuePair operator()( + const KeyValuePair &a, + const KeyValuePair &b) const { - if (a.value == b.value) - return (b.offset < a.offset) ? b : a; +// Mooch BUG (device reduce argmax gk110 3.2 million random fp32) +// return ((b.value < a.value) || ((a.value == b.value) && (b.key < a.key))) ? b : a; - return (b.value < a.value) ? b : a; + if ((b.value < a.value) || ((a.value == b.value) && (b.key < a.key))) + return b; + return a; } }; @@ -187,9 +191,9 @@ struct ArgMin * \brief Default cast functor */ template -struct Cast +struct CastOp { - /// Boolean max operator, returns (a > b) ? a : b + /// Cast operator, returns (B) a template __host__ __device__ __forceinline__ B operator()(const A &a) const { @@ -198,6 +202,113 @@ struct Cast }; +/** + * \brief Binary operator wrapper for switching non-commutative scan arguments + */ +template +class SwizzleScanOp +{ +private: + + /// Wrapped scan operator + ScanOp scan_op; + +public: + + /// Constructor + __host__ __device__ __forceinline__ + SwizzleScanOp(ScanOp scan_op) : scan_op(scan_op) {} + + /// Switch the scan arguments + template + __host__ __device__ __forceinline__ + T operator()(const T &a, const T &b) + { + T _a(a); + T _b(b); + + return scan_op(_b, _a); + } +}; + + +/** + * \brief Reduce-by-segment functor. + * + * Given two cub::KeyValuePair inputs \p a and \p b and a + * binary associative combining operator \p f(const T &x, const T &y), + * an instance of this functor returns a cub::KeyValuePair whose \p key + * field is a.key + b.key, and whose \p value field + * is either b.value if b.key is non-zero, or f(a.value, b.value) otherwise. + * + * ReduceBySegmentOp is an associative, non-commutative binary combining operator + * for input sequences of cub::KeyValuePair pairings. 
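// --- Editor's aside (not part of the patch): ArgMax/ArgMin above now operate on
// cub::KeyValuePair (the .key field carries the offset) instead of ItemOffsetPair
// (.offset), but the tie-breaking contract is unchanged: on equal values the
// smaller offset wins. Host-side sketch:
#include <cstdio>
#include "cub/util_type.cuh"
#include "cub/thread/thread_operators.cuh"

int main()
{
    cub::KeyValuePair<int, float> a(3, 7.0f);     // offset 3, value 7.0
    cub::KeyValuePair<int, float> b(5, 7.0f);     // equal value, larger offset
    cub::KeyValuePair<int, float> m = cub::ArgMax()(a, b);
    printf("key=%d value=%f\n", m.key, m.value);  // key=3: earlier occurrence wins
    return 0;
}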
Such + * sequences are typically used to represent a segmented set of values to be reduced + * and a corresponding set of {0,1}-valued integer "head flags" demarcating the + * first value of each segment. + * + */ +template ///< Binary reduction operator to apply to values +struct ReduceBySegmentOp +{ + /// Wrapped reduction operator + ReductionOpT op; + + /// Constructor + __host__ __device__ __forceinline__ ReduceBySegmentOp() {} + + /// Constructor + __host__ __device__ __forceinline__ ReduceBySegmentOp(ReductionOpT op) : op(op) {} + + /// Scan operator + template ///< KeyValuePair pairing of T (value) and OffsetT (head flag) + __host__ __device__ __forceinline__ KeyValuePairT operator()( + const KeyValuePairT &first, ///< First partial reduction + const KeyValuePairT &second) ///< Second partial reduction + { + KeyValuePairT retval; + retval.key = first.key + second.key; + retval.value = (second.key) ? + second.value : // The second partial reduction spans a segment reset, so it's value aggregate becomes the running aggregate + op(first.value, second.value); // The second partial reduction does not span a reset, so accumulate both into the running aggregate + return retval; + } +}; + + + +template ///< Binary reduction operator to apply to values +struct ReduceByKeyOp +{ + /// Wrapped reduction operator + ReductionOpT op; + + /// Constructor + __host__ __device__ __forceinline__ ReduceByKeyOp() {} + + /// Constructor + __host__ __device__ __forceinline__ ReduceByKeyOp(ReductionOpT op) : op(op) {} + + /// Scan operator + template + __host__ __device__ __forceinline__ KeyValuePairT operator()( + const KeyValuePairT &first, ///< First partial reduction + const KeyValuePairT &second) ///< Second partial reduction + { + KeyValuePairT retval = second; + + if (first.key == second.key) + retval.value = op(first.value, retval.value); + + return retval; + } +}; + + + + + + /** @} */ // end group UtilModule diff --git a/SRC/cub/thread/thread_reduce.cuh b/SRC/cub/thread/thread_reduce.cuh index 29bc8ce0..4c13688f 100644 --- a/SRC/cub/thread/thread_reduce.cuh +++ b/SRC/cub/thread/thread_reduce.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -42,17 +42,12 @@ CUB_NS_PREFIX /// CUB namespace namespace cub { -/** - * \addtogroup UtilModule - * @{ - */ +/// Internal namespace (to prevent ADL mishaps between static functions when mixing different CUB installations) +namespace internal { /** - * \name Sequential reduction over statically-sized array types - * @{ + * Sequential reduction over statically-sized array types */ - - template < int LENGTH, typename T, @@ -61,31 +56,22 @@ __device__ __forceinline__ T ThreadReduce( T* input, ///< [in] Input array ReductionOp reduction_op, ///< [in] Binary reduction operator T prefix, ///< [in] Prefix to seed reduction with - Int2Type length) + Int2Type /*length*/) { - T addend = *input; - prefix = reduction_op(prefix, addend); + T retval = prefix; - return ThreadReduce(input + 1, reduction_op, prefix, Int2Type()); -} + #pragma unroll + for (int i = 0; i < LENGTH; ++i) + retval = reduction_op(retval, input[i]); -template < - typename T, - typename ReductionOp> -__device__ __forceinline__ T ThreadReduce( - T* input, ///< [in] Input array - ReductionOp reduction_op, ///< [in] Binary reduction operator - T prefix, ///< [in] Prefix to seed reduction with - Int2Type<0> length) -{ - return prefix; + return retval; } /** * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. * - * \tparam LENGTH Length of input array + * \tparam LENGTH LengthT of input array * \tparam T [inferred] The data type to be reduced. * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) */ @@ -105,7 +91,7 @@ __device__ __forceinline__ T ThreadReduce( /** * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array. The aggregate is returned. * - * \tparam LENGTH Length of input array + * \tparam LENGTH LengthT of input array * \tparam T [inferred] The data type to be reduced. * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) */ @@ -125,7 +111,7 @@ __device__ __forceinline__ T ThreadReduce( /** * \brief Perform a sequential reduction over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. * - * \tparam LENGTH [inferred] Length of \p input array + * \tparam LENGTH [inferred] LengthT of \p input array * \tparam T [inferred] The data type to be reduced. * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) */ @@ -138,14 +124,14 @@ __device__ __forceinline__ T ThreadReduce( ReductionOp reduction_op, ///< [in] Binary reduction operator T prefix) ///< [in] Prefix to seed reduction with { - return ThreadReduce(input, reduction_op, prefix); + return ThreadReduce(input, reduction_op, prefix, Int2Type()); } /** * \brief Serial reduction with the specified operator * - * \tparam LENGTH [inferred] Length of \p input array + * \tparam LENGTH [inferred] LengthT of \p input array * \tparam T [inferred] The data type to be reduced. 
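// --- Editor's aside (not part of the patch): the tail recursion above becomes a
// #pragma unroll loop with identical results, and ThreadReduce moves into
// cub::internal to dodge ADL clashes between mixed CUB copies. Device sketch
// (the array-reference overload deduces LENGTH = 4):
#include "cub/thread/thread_reduce.cuh"
#include "cub/thread/thread_operators.cuh"

__global__ void RowSums(const int *d_in, int *d_out)   // 4 items per thread
{
    int row[4];
    for (int i = 0; i < 4; ++i)
        row[i] = d_in[threadIdx.x * 4 + i];
    d_out[threadIdx.x] = cub::internal::ThreadReduce(row, cub::Sum());
}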
* \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) */ @@ -161,9 +147,6 @@ __device__ __forceinline__ T ThreadReduce( } -//@} end member group - -/** @} */ // end group UtilModule - +} // internal namespace } // CUB namespace CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/thread/thread_scan.cuh b/SRC/cub/thread/thread_scan.cuh index 6276bf83..8d67549a 100644 --- a/SRC/cub/thread/thread_scan.cuh +++ b/SRC/cub/thread/thread_scan.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -42,6 +42,10 @@ CUB_NS_PREFIX /// CUB namespace namespace cub { +/// Internal namespace (to prevent ADL mishaps between static functions when mixing different CUB installations) +namespace internal { + + /** * \addtogroup UtilModule * @{ @@ -62,35 +66,25 @@ __device__ __forceinline__ T ThreadScanExclusive( T *input, ///< [in] Input array T *output, ///< [out] Output array (may be aliased to \p input) ScanOp scan_op, ///< [in] Binary scan operator - Int2Type length) + Int2Type /*length*/) { - T addend = *input; - inclusive = scan_op(exclusive, addend); - *output = exclusive; - exclusive = inclusive; - - return ThreadScanExclusive(inclusive, exclusive, input + 1, output + 1, scan_op, Int2Type()); -} + #pragma unroll + for (int i = 0; i < LENGTH; ++i) + { + inclusive = scan_op(exclusive, input[i]); + output[i] = exclusive; + exclusive = inclusive; + } -template < - typename T, - typename ScanOp> -__device__ __forceinline__ T ThreadScanExclusive( - T inclusive, - T exclusive, - T *input, ///< [in] Input array - T *output, ///< [out] Output array (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - Int2Type<0> length) -{ return inclusive; } + /** * \brief Perform a sequential exclusive prefix scan over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. * - * \tparam LENGTH Length of \p input and \p output arrays + * \tparam LENGTH LengthT of \p input and \p output arrays * \tparam T [inferred] The data type to be scanned. * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) */ @@ -120,7 +114,7 @@ __device__ __forceinline__ T ThreadScanExclusive( /** * \brief Perform a sequential exclusive prefix scan over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. * - * \tparam LENGTH [inferred] Length of \p input and \p output arrays + * \tparam LENGTH [inferred] LengthT of \p input and \p output arrays * \tparam T [inferred] The data type to be scanned. 
* \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) */ @@ -155,25 +149,15 @@ __device__ __forceinline__ T ThreadScanInclusive( T *input, ///< [in] Input array T *output, ///< [out] Output array (may be aliased to \p input) ScanOp scan_op, ///< [in] Binary scan operator - Int2Type length) + Int2Type /*length*/) { - T addend = *input; - inclusive = scan_op(inclusive, addend); - output[0] = inclusive; - - return ThreadScanInclusive(inclusive, input + 1, output + 1, scan_op, Int2Type()); -} + #pragma unroll + for (int i = 0; i < LENGTH; ++i) + { + inclusive = scan_op(inclusive, input[i]); + output[i] = inclusive; + } -template < - typename T, - typename ScanOp> -__device__ __forceinline__ T ThreadScanInclusive( - T inclusive, - T *input, ///< [in] Input array - T *output, ///< [out] Output array (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - Int2Type<0> length) -{ return inclusive; } @@ -181,7 +165,7 @@ __device__ __forceinline__ T ThreadScanInclusive( /** * \brief Perform a sequential inclusive prefix scan over \p LENGTH elements of the \p input array. The aggregate is returned. * - * \tparam LENGTH Length of \p input and \p output arrays + * \tparam LENGTH LengthT of \p input and \p output arrays * \tparam T [inferred] The data type to be scanned. * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) */ @@ -205,7 +189,7 @@ __device__ __forceinline__ T ThreadScanInclusive( /** * \brief Perform a sequential inclusive prefix scan over the statically-sized \p input array. The aggregate is returned. * - * \tparam LENGTH [inferred] Length of \p input and \p output arrays + * \tparam LENGTH [inferred] LengthT of \p input and \p output arrays * \tparam T [inferred] The data type to be scanned. * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) */ @@ -225,7 +209,7 @@ __device__ __forceinline__ T ThreadScanInclusive( /** * \brief Perform a sequential inclusive prefix scan over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. * - * \tparam LENGTH Length of \p input and \p output arrays + * \tparam LENGTH LengthT of \p input and \p output arrays * \tparam T [inferred] The data type to be scanned. * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) */ @@ -255,7 +239,7 @@ __device__ __forceinline__ T ThreadScanInclusive( /** * \brief Perform a sequential inclusive prefix scan over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. * - * \tparam LENGTH [inferred] Length of \p input and \p output arrays + * \tparam LENGTH [inferred] LengthT of \p input and \p output arrays * \tparam T [inferred] The data type to be scanned. * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) */ @@ -279,5 +263,6 @@ __device__ __forceinline__ T ThreadScanInclusive( /** @} */ // end group UtilModule +} // internal namespace } // CUB namespace CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/thread/thread_search.cuh b/SRC/cub/thread/thread_search.cuh new file mode 100644 index 00000000..3099080a --- /dev/null +++ b/SRC/cub/thread/thread_search.cuh @@ -0,0 +1,154 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. 
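// --- Editor's aside (not part of the patch): the scan rewrite mirrors the
// reduction one -- recursion replaced by an unrolled loop, now in cub::internal.
// Device sketch of an in-place inclusive prefix sum over a thread-private array
// (output may alias input, as the signature notes):
#include "cub/thread/thread_scan.cuh"
#include "cub/thread/thread_operators.cuh"

__global__ void LocalPrefixSum(int *d_data)   // 4 items per thread, in place
{
    int items[4];
    for (int i = 0; i < 4; ++i)
        items[i] = d_data[threadIdx.x * 4 + i];
    cub::internal::ThreadScanInclusive(items, items, cub::Sum());
    for (int i = 0; i < 4; ++i)
        d_data[threadIdx.x * 4 + i] = items[i];
}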
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Thread utilities for sequential search + */ + +#pragma once + +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * Computes the begin offsets into A and B for the specific diagonal + */ +template < + typename AIteratorT, + typename BIteratorT, + typename OffsetT, + typename CoordinateT> +__host__ __device__ __forceinline__ void MergePathSearch( + OffsetT diagonal, + AIteratorT a, + BIteratorT b, + OffsetT a_len, + OffsetT b_len, + CoordinateT& path_coordinate) +{ + /// The value type of the input iterator + typedef typename std::iterator_traits::value_type T; + + OffsetT split_min = CUB_MAX(diagonal - b_len, 0); + OffsetT split_max = CUB_MIN(diagonal, a_len); + + while (split_min < split_max) + { + OffsetT split_pivot = (split_min + split_max) >> 1; + if (a[split_pivot] <= b[diagonal - split_pivot - 1]) + { + // Move candidate split range up A, down B + split_min = split_pivot + 1; + } + else + { + // Move candidate split range up B, down A + split_max = split_pivot; + } + } + + path_coordinate.x = CUB_MIN(split_min, a_len); + path_coordinate.y = diagonal - split_min; +} + + + +/** + * \brief Returns the offset of the first value within \p input which does not compare less than \p val + */ +template < + typename InputIteratorT, + typename OffsetT, + typename T> +__device__ __forceinline__ OffsetT LowerBound( + InputIteratorT input, ///< [in] Input sequence + OffsetT num_items, ///< [in] Input sequence length + T val) ///< [in] Search key +{ + OffsetT retval = 0; + while (num_items > 0) + { + OffsetT half = num_items >> 1; + if (input[retval + half] < val) + { + retval = retval + (half + 1); + num_items = num_items - (half + 1); + } + else + { + num_items = half; + } + } + + return retval; +} + + +/** + * \brief Returns the offset of the first value within \p input which 
compares greater than \p val + */ +template < + typename InputIteratorT, + typename OffsetT, + typename T> +__device__ __forceinline__ OffsetT UpperBound( + InputIteratorT input, ///< [in] Input sequence + OffsetT num_items, ///< [in] Input sequence length + T val) ///< [in] Search key +{ + OffsetT retval = 0; + while (num_items > 0) + { + OffsetT half = num_items >> 1; + if (val < input[retval + half]) + { + num_items = half; + } + else + { + retval = retval + (half + 1); + num_items = num_items - (half + 1); + } + } + + return retval; +} + + + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/thread/thread_store.cuh b/SRC/cub/thread/thread_store.cuh index 6d036d42..ec20b36f 100644 --- a/SRC/cub/thread/thread_store.cuh +++ b/SRC/cub/thread/thread_store.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -104,14 +104,14 @@ enum CacheStoreModifier * \endcode * * \tparam MODIFIER [inferred] CacheStoreModifier enumeration - * \tparam InputIterator [inferred] Output iterator type \iterator + * \tparam InputIteratorT [inferred] Output iterator type \iterator * \tparam T [inferred] Data type of output value */ template < CacheStoreModifier MODIFIER, - typename OutputIterator, + typename OutputIteratorT, typename T> -__device__ __forceinline__ void ThreadStore(OutputIterator itr, T val); +__device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val); //@} end member group @@ -131,8 +131,8 @@ struct IterateThreadStore IterateThreadStore::template Store(ptr, vals); } - template - static __device__ __forceinline__ void Dereference(OutputIterator ptr, T *vals) + template + static __device__ __forceinline__ void Dereference(OutputIteratorT ptr, T *vals) { ptr[COUNT] = vals[COUNT]; IterateThreadStore::Dereference(ptr, vals); @@ -145,17 +145,17 @@ template struct IterateThreadStore { template - static __device__ __forceinline__ void Store(T *ptr, T *vals) {} + static __device__ __forceinline__ void Store(T * /*ptr*/, T * /*vals*/) {} - template - static __device__ __forceinline__ void Dereference(OutputIterator ptr, T *vals) {} + template + static __device__ __forceinline__ void Dereference(OutputIteratorT /*ptr*/, T * /*vals*/) {} }; /** * Define a uint4 (16B) ThreadStore specialization for the given Cache load modifier */ -#define CUB_STORE_16(cub_modifier, ptx_modifier) \ +#define _CUB_STORE_16(cub_modifier, ptx_modifier) \ template<> \ __device__ __forceinline__ void ThreadStore(uint4* ptr, uint4 val) \ { \ @@ -179,7 +179,7 @@ struct IterateThreadStore /** * Define a uint2 (8B) ThreadStore specialization for the given Cache load modifier */ -#define CUB_STORE_8(cub_modifier, ptx_modifier) \ +#define _CUB_STORE_8(cub_modifier, ptx_modifier) \ template<> \ __device__ __forceinline__ void ThreadStore(ushort4* ptr, ushort4 val) \ { \ @@ -209,7 +209,7 @@ struct IterateThreadStore /** * Define a unsigned int (4B) ThreadStore specialization for the given Cache load modifier */ -#define CUB_STORE_4(cub_modifier, ptx_modifier) \ +#define _CUB_STORE_4(cub_modifier, ptx_modifier) \ template<> \ __device__ __forceinline__ void ThreadStore(unsigned int* ptr, unsigned int 
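// --- Editor's aside (not part of the patch): LowerBound/UpperBound above mirror
// std::lower_bound/std::upper_bound over [0, num_items), but as __device__
// routines usable from thread code. Sketch: locate the run of a key in a sorted
// array (names illustrative):
#include "cub/thread/thread_search.cuh"

__global__ void FindRun(const int *d_sorted, int num_items, int key,
                        int *d_first, int *d_count)
{
    int lo = cub::LowerBound(d_sorted, num_items, key);  // first element >= key
    int hi = cub::UpperBound(d_sorted, num_items, key);  // first element >  key
    *d_first = lo;
    *d_count = hi - lo;                                  // occurrences of key
}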
val) \ { \ @@ -222,7 +222,7 @@ struct IterateThreadStore /** * Define a unsigned short (2B) ThreadStore specialization for the given Cache load modifier */ -#define CUB_STORE_2(cub_modifier, ptx_modifier) \ +#define _CUB_STORE_2(cub_modifier, ptx_modifier) \ template<> \ __device__ __forceinline__ void ThreadStore(unsigned short* ptr, unsigned short val) \ { \ @@ -235,7 +235,7 @@ struct IterateThreadStore /** * Define a unsigned char (1B) ThreadStore specialization for the given Cache load modifier */ -#define CUB_STORE_1(cub_modifier, ptx_modifier) \ +#define _CUB_STORE_1(cub_modifier, ptx_modifier) \ template<> \ __device__ __forceinline__ void ThreadStore(unsigned char* ptr, unsigned char val) \ { \ @@ -252,39 +252,48 @@ struct IterateThreadStore /** * Define powers-of-two ThreadStore specializations for the given Cache load modifier */ -#define CUB_STORE_ALL(cub_modifier, ptx_modifier) \ - CUB_STORE_16(cub_modifier, ptx_modifier) \ - CUB_STORE_8(cub_modifier, ptx_modifier) \ - CUB_STORE_4(cub_modifier, ptx_modifier) \ - CUB_STORE_2(cub_modifier, ptx_modifier) \ - CUB_STORE_1(cub_modifier, ptx_modifier) \ +#define _CUB_STORE_ALL(cub_modifier, ptx_modifier) \ + _CUB_STORE_16(cub_modifier, ptx_modifier) \ + _CUB_STORE_8(cub_modifier, ptx_modifier) \ + _CUB_STORE_4(cub_modifier, ptx_modifier) \ + _CUB_STORE_2(cub_modifier, ptx_modifier) \ + _CUB_STORE_1(cub_modifier, ptx_modifier) \ /** * Define ThreadStore specializations for the various Cache load modifiers */ #if CUB_PTX_ARCH >= 200 - CUB_STORE_ALL(STORE_WB, ca) - CUB_STORE_ALL(STORE_CG, cg) - CUB_STORE_ALL(STORE_CS, cs) - CUB_STORE_ALL(STORE_WT, wt) + _CUB_STORE_ALL(STORE_WB, wb) + _CUB_STORE_ALL(STORE_CG, cg) + _CUB_STORE_ALL(STORE_CS, cs) + _CUB_STORE_ALL(STORE_WT, wt) #else - CUB_STORE_ALL(STORE_WB, global) - CUB_STORE_ALL(STORE_CG, global) - CUB_STORE_ALL(STORE_CS, global) - CUB_STORE_ALL(STORE_WT, volatile.global) + _CUB_STORE_ALL(STORE_WB, global) + _CUB_STORE_ALL(STORE_CG, global) + _CUB_STORE_ALL(STORE_CS, global) + _CUB_STORE_ALL(STORE_WT, volatile.global) #endif +// Macro cleanup +#undef _CUB_STORE_ALL +#undef _CUB_STORE_1 +#undef _CUB_STORE_2 +#undef _CUB_STORE_4 +#undef _CUB_STORE_8 +#undef _CUB_STORE_16 + + /** * ThreadStore definition for STORE_DEFAULT modifier on iterator types */ -template +template __device__ __forceinline__ void ThreadStore( - OutputIterator itr, + OutputIteratorT itr, T val, - Int2Type modifier, - Int2Type is_pointer) + Int2Type /*modifier*/, + Int2Type /*is_pointer*/) { *itr = val; } @@ -297,8 +306,8 @@ template __device__ __forceinline__ void ThreadStore( T *ptr, T val, - Int2Type modifier, - Int2Type is_pointer) + Int2Type /*modifier*/, + Int2Type /*is_pointer*/) { *ptr = val; } @@ -311,7 +320,7 @@ template __device__ __forceinline__ void ThreadStoreVolatilePtr( T *ptr, T val, - Int2Type is_primitive) + Int2Type /*is_primitive*/) { *reinterpret_cast(ptr) = val; } @@ -324,30 +333,24 @@ template __device__ __forceinline__ void ThreadStoreVolatilePtr( T *ptr, T val, - Int2Type is_primitive) + Int2Type /*is_primitive*/) { -#if CUB_PTX_ARCH <= 130 - - *ptr = val; - __threadfence_block(); - -#else - - typedef typename UnitWord::VolatileWord VolatileWord; // Word type for memcopying + // Create a temporary using shuffle-words, then store using volatile-words + typedef typename UnitWord::VolatileWord VolatileWord; + typedef typename UnitWord::ShuffleWord ShuffleWord; const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord); - + const int SHUFFLE_MULTIPLE = sizeof(T) / sizeof(ShuffleWord); + 
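// --- Editor's aside (not part of the patch): note the quiet bug fix above --
// STORE_WB previously emitted the *load* qualifier "ca"; it now emits the
// write-back store qualifier "wb". Sketch of the modifier-tagged load/store
// pair these macros specialize:
#include "cub/thread/thread_load.cuh"
#include "cub/thread/thread_store.cuh"

__global__ void StreamingCopy(const int *d_in, int *d_out)
{
    int v = cub::ThreadLoad<cub::LOAD_CS>(d_in + threadIdx.x);   // streaming load
    cub::ThreadStore<cub::STORE_CS>(d_out + threadIdx.x, v);     // streaming store
}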
VolatileWord words[VOLATILE_MULTIPLE]; - *reinterpret_cast(words) = val; -// VolatileWord *words = reinterpret_cast(&val); + #pragma unroll + for (int i = 0; i < SHUFFLE_MULTIPLE; ++i) + reinterpret_cast(words)[i] = reinterpret_cast(&val)[i]; IterateThreadStore<0, VOLATILE_MULTIPLE>::template Dereference( reinterpret_cast(ptr), words); - -#endif // CUB_PTX_ARCH <= 130 - } @@ -358,8 +361,8 @@ template __device__ __forceinline__ void ThreadStore( T *ptr, T val, - Int2Type modifier, - Int2Type is_pointer) + Int2Type /*modifier*/, + Int2Type /*is_pointer*/) { ThreadStoreVolatilePtr(ptr, val, Int2Type::PRIMITIVE>()); } @@ -372,16 +375,21 @@ template __device__ __forceinline__ void ThreadStore( T *ptr, T val, - Int2Type modifier, - Int2Type is_pointer) + Int2Type /*modifier*/, + Int2Type /*is_pointer*/) { - typedef typename UnitWord::DeviceWord DeviceWord; // Word type for memcopying - - const int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord); + // Create a temporary using shuffle-words, then store using device-words + typedef typename UnitWord::DeviceWord DeviceWord; + typedef typename UnitWord::ShuffleWord ShuffleWord; + const int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord); + const int SHUFFLE_MULTIPLE = sizeof(T) / sizeof(ShuffleWord); + DeviceWord words[DEVICE_MULTIPLE]; - *reinterpret_cast(words) = val; + #pragma unroll + for (int i = 0; i < SHUFFLE_MULTIPLE; ++i) + reinterpret_cast(words)[i] = reinterpret_cast(&val)[i]; IterateThreadStore<0, DEVICE_MULTIPLE>::template Store( reinterpret_cast(ptr), @@ -392,14 +400,14 @@ __device__ __forceinline__ void ThreadStore( /** * ThreadStore definition for generic modifiers */ -template -__device__ __forceinline__ void ThreadStore(OutputIterator itr, T val) +template +__device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val) { ThreadStore( itr, val, Int2Type(), - Int2Type::VALUE>()); + Int2Type::VALUE>()); } diff --git a/SRC/cub/util_allocator.cuh b/SRC/cub/util_allocator.cuh index 9e4b1ff6..0e6dd048 100644 --- a/SRC/cub/util_allocator.cuh +++ b/SRC/cub/util_allocator.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -33,17 +33,14 @@ #pragma once -#if (CUB_PTX_ARCH == 0) - #include // NVCC (EDG, really) takes FOREVER to compile std::map - #include -#endif - -#include - #include "util_namespace.cuh" #include "util_debug.cuh" -#include "host/spinlock.cuh" +#include +#include + +#include "host/mutex.cuh" +#include /// Optional outer namespace(s) CUB_NS_PREFIX @@ -66,12 +63,17 @@ namespace cub { * \brief A simple caching allocator for device memory allocations. * * \par Overview - * The allocator is thread-safe and is capable of managing cached device allocations - * on multiple devices. It behaves as follows: + * The allocator is thread-safe and stream-safe and is capable of managing cached + * device allocations on multiple devices. It behaves as follows: * * \par - * - Allocations categorized by bin size. - * - Bin sizes progress geometrically in accordance with the growth factor + * - Allocations from the allocator are associated with an \p active_stream. 
Once freed, + * the allocation becomes available immediately for reuse within the \p active_stream + * with which it was associated with during allocation, and it becomes available for + * reuse within other streams when all prior work submitted to \p active_stream has completed. + * - Allocations are categorized and cached by bin size. A new allocation request of + * a given size will only consider cached allocations within the corresponding bin. + * - Bin limits progress geometrically in accordance with the growth factor * \p bin_growth provided during construction. Unused device allocations within * a larger bin cache are not reused for allocation requests that categorize to * smaller bin sizes. @@ -86,10 +88,10 @@ namespace cub { * * \par * For example, the default-constructed CachingDeviceAllocator is configured with: - * - \p bin_growth = 8 - * - \p min_bin = 3 - * - \p max_bin = 7 - * - \p max_cached_bytes = 6MB - 1B + * - \p bin_growth = 8 + * - \p min_bin = 3 + * - \p max_bin = 7 + * - \p max_cached_bytes = 6MB - 1B * * \par * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB @@ -98,111 +100,86 @@ namespace cub { */ struct CachingDeviceAllocator { -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - //--------------------------------------------------------------------- - // Type definitions and constants + // Constants //--------------------------------------------------------------------- - enum - { - /// Invalid device ordinal - INVALID_DEVICE_ORDINAL = -1, - }; + /// Out-of-bounds bin + static const unsigned int INVALID_BIN = (unsigned int) -1; - /** - * Integer pow function for unsigned base and exponent - */ - static unsigned int IntPow( - unsigned int base, - unsigned int exp) - { - unsigned int retval = 1; - while (exp > 0) - { - if (exp & 1) { - retval = retval * base; // multiply the result by the current base - } - base = base * base; // square the base - exp = exp >> 1; // divide the exponent in half - } - return retval; - } + /// Invalid size + static const size_t INVALID_SIZE = (size_t) -1; +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - /** - * Round up to the nearest power-of - */ - static void NearestPowerOf( - unsigned int &power, - size_t &rounded_bytes, - unsigned int base, - size_t value) - { - power = 0; - rounded_bytes = 1; + /// Invalid device ordinal + static const int INVALID_DEVICE_ORDINAL = -1; - while (rounded_bytes < value) - { - rounded_bytes *= base; - power++; - } - } + //--------------------------------------------------------------------- + // Type definitions and helper types + //--------------------------------------------------------------------- /** * Descriptor for device memory allocations */ struct BlockDescriptor { - int device; // device ordinal - void* d_ptr; // Device pointer - size_t bytes; // Size of allocation in bytes - unsigned int bin; // Bin enumeration - - // Constructor + void* d_ptr; // Device pointer + size_t bytes; // Size of allocation in bytes + unsigned int bin; // Bin enumeration + int device; // device ordinal + cudaStream_t associated_stream; // Associated associated_stream + cudaEvent_t ready_event; // Signal when associated stream has run to the point at which this block was freed + + // Constructor (suitable for searching maps for a specific block, given its pointer and device) BlockDescriptor(void *d_ptr, int device) : d_ptr(d_ptr), bytes(0), - bin(0), - device(device) {} - - // Constructor - BlockDescriptor(size_t bytes, unsigned int bin, int device) : + bin(INVALID_BIN), + device(device), 
+ associated_stream(0), + ready_event(0) + {} + + // Constructor (suitable for searching maps for a range of suitable blocks, given a device) + BlockDescriptor(int device) : d_ptr(NULL), - bytes(bytes), - bin(bin), - device(device) {} + bytes(0), + bin(INVALID_BIN), + device(device), + associated_stream(0), + ready_event(0) + {} // Comparison functor for comparing device pointers static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b) { - if (a.device < b.device) { - return true; - } else if (a.device > b.device) { - return false; - } else { + if (a.device == b.device) return (a.d_ptr < b.d_ptr); - } + else + return (a.device < b.device); } // Comparison functor for comparing allocation sizes static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b) { - if (a.device < b.device) { - return true; - } else if (a.device > b.device) { - return false; - } else { + if (a.device == b.device) return (a.bytes < b.bytes); - } + else + return (a.device < b.device); } }; /// BlockDescriptor comparator function interface typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &); -#if (CUB_PTX_ARCH == 0) // Only define STL container members in host code + class TotalBytes { + public: + size_t free; + size_t live; + TotalBytes() { free = live = 0; } + }; /// Set type for cached blocks (ordered by size) typedef std::multiset CachedBlocks; @@ -211,15 +188,66 @@ struct CachingDeviceAllocator typedef std::multiset BusyBlocks; /// Map type of device ordinals to the number of cached bytes cached by each device - typedef std::map GpuCachedBytes; + typedef std::map GpuCachedBytes; + + + //--------------------------------------------------------------------- + // Utility functions + //--------------------------------------------------------------------- + + /** + * Integer pow function for unsigned base and exponent + */ + static unsigned int IntPow( + unsigned int base, + unsigned int exp) + { + unsigned int retval = 1; + while (exp > 0) + { + if (exp & 1) { + retval = retval * base; // multiply the result by the current base + } + base = base * base; // square the base + exp = exp >> 1; // divide the exponent in half + } + return retval; + } + + + /** + * Round up to the nearest power-of + */ + void NearestPowerOf( + unsigned int &power, + size_t &rounded_bytes, + unsigned int base, + size_t value) + { + power = 0; + rounded_bytes = 1; + + if (value * base < value) + { + // Overflow + power = sizeof(size_t) * 8; + rounded_bytes = size_t(0) - 1; + return; + } + + while (rounded_bytes < value) + { + rounded_bytes *= base; + power++; + } + } -#endif // CUB_PTX_ARCH //--------------------------------------------------------------------- // Fields //--------------------------------------------------------------------- - Spinlock spin_lock; /// Spinlock for thread-safety + cub::Mutex mutex; /// Mutex for thread-safety unsigned int bin_growth; /// Geometric growth factor for bin-sizes unsigned int min_bin; /// Minimum bin enumeration @@ -229,17 +257,13 @@ struct CachingDeviceAllocator size_t max_bin_bytes; /// Maximum bin size size_t max_cached_bytes; /// Maximum aggregate cached bytes per device + const bool skip_cleanup; /// Whether or not to skip a call to FreeAllCached() when destructor is called. (The CUDA runtime may have already shut down for statically declared allocators) bool debug; /// Whether or not to print (de)allocation events to stdout - bool skip_cleanup; /// Whether or not to skip a call to FreeAllCached() when destructor is called. 
(The CUDA runtime may have already shut down for statically declared allocators) - -#if (CUB_PTX_ARCH == 0) // Only define STL container members in host code GpuCachedBytes cached_bytes; /// Map of device ordinal to aggregate cached bytes on that device CachedBlocks cached_blocks; /// Set of cached device allocations available for reuse BusyBlocks live_blocks; /// Set of live device allocations currently in use -#endif // CUB_PTX_ARCH - #endif // DOXYGEN_SHOULD_SKIP_THIS //--------------------------------------------------------------------- @@ -250,24 +274,23 @@ struct CachingDeviceAllocator * \brief Constructor. */ CachingDeviceAllocator( - unsigned int bin_growth, ///< Geometric growth factor for bin-sizes - unsigned int min_bin, ///< Minimum bin - unsigned int max_bin, ///< Maximum bin - size_t max_cached_bytes, ///< Maximum aggregate cached bytes per device - bool skip_cleanup = false) ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called. (Useful for preventing warnings when the allocator is declared at file/static/global scope: by the time the destructor is called on program exit, the CUDA runtime may have already shut down and freed all allocations.) + unsigned int bin_growth, ///< Geometric growth factor for bin-sizes + unsigned int min_bin = 1, ///< Minimum bin (default is bin_growth ^ 1) + unsigned int max_bin = INVALID_BIN, ///< Maximum bin (default is no max bin) + size_t max_cached_bytes = INVALID_SIZE, ///< Maximum aggregate cached bytes per device (default is no limit) + bool skip_cleanup = false, ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called (default is to deallocate) + bool debug = false) ///< Whether or not to print (de)allocation events to stdout (default is no stderr output) : - #if (CUB_PTX_ARCH == 0) // Only define STL container members in host code - cached_blocks(BlockDescriptor::SizeCompare), - live_blocks(BlockDescriptor::PtrCompare), - #endif - debug(false), - spin_lock(0), - bin_growth(bin_growth), - min_bin(min_bin), - max_bin(max_bin), - min_bin_bytes(IntPow(bin_growth, min_bin)), - max_bin_bytes(IntPow(bin_growth, max_bin)), - max_cached_bytes(max_cached_bytes) + bin_growth(bin_growth), + min_bin(min_bin), + max_bin(max_bin), + min_bin_bytes(IntPow(bin_growth, min_bin)), + max_bin_bytes(IntPow(bin_growth, max_bin)), + max_cached_bytes(max_cached_bytes), + skip_cleanup(skip_cleanup), + debug(debug), + cached_blocks(BlockDescriptor::SizeCompare), + live_blocks(BlockDescriptor::PtrCompare) {} @@ -276,303 +299,338 @@ struct CachingDeviceAllocator * * Configured with: * \par - * - \p bin_growth = 8 - * - \p min_bin = 3 - * - \p max_bin = 7 - * - \p max_cached_bytes = (\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes + * - \p bin_growth = 8 + * - \p min_bin = 3 + * - \p max_bin = 7 + * - \p max_cached_bytes = (\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes * * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB and * sets a maximum of 6,291,455 cached bytes per device */ CachingDeviceAllocator( - bool skip_cleanup = false) ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called. (Useful for preventing warnings when the allocator is declared at file/static/global scope: by the time the destructor is called on program exit, the CUDA runtime may have already shut down and freed all allocations.) 
+ bool skip_cleanup = false, + bool debug = false) : - #if (CUB_PTX_ARCH == 0) // Only define STL container members in host code - cached_blocks(BlockDescriptor::SizeCompare), - live_blocks(BlockDescriptor::PtrCompare), - #endif - skip_cleanup(skip_cleanup), - debug(false), - spin_lock(0), bin_growth(8), min_bin(3), max_bin(7), min_bin_bytes(IntPow(bin_growth, min_bin)), max_bin_bytes(IntPow(bin_growth, max_bin)), - max_cached_bytes((max_bin_bytes * 3) - 1) + max_cached_bytes((max_bin_bytes * 3) - 1), + skip_cleanup(skip_cleanup), + debug(debug), + cached_blocks(BlockDescriptor::SizeCompare), + live_blocks(BlockDescriptor::PtrCompare) {} /** * \brief Sets the limit on the number bytes this allocator is allowed to cache per device. + * + * Changing the ceiling of cached bytes does not cause any allocations (in-use or + * cached-in-reserve) to be freed. See \p FreeAllCached(). */ cudaError_t SetMaxCachedBytes( size_t max_cached_bytes) { - #if (CUB_PTX_ARCH > 0) - // Caching functionality only defined on host - return CubDebug(cudaErrorInvalidConfiguration); - #else - // Lock - Lock(&spin_lock); + mutex.Lock(); - this->max_cached_bytes = max_cached_bytes; + if (debug) _CubLog("Changing max_cached_bytes (%lld -> %lld)\n", (long long) this->max_cached_bytes, (long long) max_cached_bytes); - if (debug) CubLog("New max_cached_bytes(%lld)\n", (long long) max_cached_bytes); + this->max_cached_bytes = max_cached_bytes; // Unlock - Unlock(&spin_lock); + mutex.Unlock(); return cudaSuccess; - - #endif // CUB_PTX_ARCH } /** - * \brief Provides a suitable allocation of device memory for the given size on the specified device + * \brief Provides a suitable allocation of device memory for the given size on the specified device. + * + * Once freed, the allocation becomes available immediately for reuse within the \p active_stream + * with which it was associated with during allocation, and it becomes available for reuse within other + * streams when all prior work submitted to \p active_stream has completed. 
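 *
 * \par Snippet
 * A minimal usage sketch (the stream, size, and variable names are
 * illustrative): with the default configuration of \p bin_growth = 8 and
 * \p max_bin = 7, the 1MB request below is rounded up to the 8^7 = 2MB bin.
 * \code
 * cub::CachingDeviceAllocator allocator;
 * cudaStream_t stream;
 * cudaStreamCreate(&stream);
 *
 * void *d_scratch = NULL;
 * allocator.DeviceAllocate(&d_scratch, 1024 * 1024, stream);
 * // ... launch kernels on stream that use d_scratch ...
 * allocator.DeviceFree(d_scratch);  // cached; immediately reusable on stream
 * \endcode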
*/ cudaError_t DeviceAllocate( - void** d_ptr, - size_t bytes, - int device) + int device, ///< [in] Device on which to place the allocation + void **d_ptr, ///< [out] Reference to pointer to the allocation + size_t bytes, ///< [in] Minimum number of bytes for the allocation + cudaStream_t active_stream = 0) ///< [in] The stream to be associated with this allocation { - #if (CUB_PTX_ARCH > 0) - // Caching functionality only defined on host - return CubDebug(cudaErrorInvalidConfiguration); - #else - - bool locked = false; + *d_ptr = NULL; int entrypoint_device = INVALID_DEVICE_ORDINAL; cudaError_t error = cudaSuccess; - // Round up to nearest bin size - unsigned int bin; - size_t bin_bytes; - NearestPowerOf(bin, bin_bytes, bin_growth, bytes); - if (bin < min_bin) { - bin = min_bin; - bin_bytes = min_bin_bytes; - } - - // Check if bin is greater than our maximum bin - if (bin > max_bin) + if (device == INVALID_DEVICE_ORDINAL) { - // Allocate the request exactly and give out-of-range bin - bin = (unsigned int) -1; - bin_bytes = bytes; + if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error; + device = entrypoint_device; } - BlockDescriptor search_key(bin_bytes, bin, device); + // Create a block descriptor for the requested allocation + bool found = false; + BlockDescriptor search_key(device); + search_key.associated_stream = active_stream; + NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes); - // Lock - if (!locked) { - Lock(&spin_lock); - locked = true; + if (search_key.bin > max_bin) + { + // Bin is greater than our maximum bin: allocate the request + // exactly and give out-of-bounds bin. It will not be cached + // for reuse when returned. + search_key.bin = INVALID_BIN; + search_key.bytes = bytes; } + else + { + // Search for a suitable cached allocation: lock + mutex.Lock(); + + if (search_key.bin < min_bin) + { + // Bin is less than minimum bin: round up + search_key.bin = min_bin; + search_key.bytes = min_bin_bytes; + } - do { - // Find a free block big enough within the same bin on the same device + // Iterate through the range of cached blocks on the same device in the same bin CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key); - if ((block_itr != cached_blocks.end()) && - (block_itr->device == device) && - (block_itr->bin == search_key.bin)) + while ((block_itr != cached_blocks.end()) + && (block_itr->device == device) + && (block_itr->bin == search_key.bin)) { - // Reuse existing cache block. Insert into live blocks. - search_key = *block_itr; - live_blocks.insert(search_key); + // To prevent races with reusing blocks returned by the host but still + // in use by the device, only consider cached blocks that are + // either (from the active stream) or (from an idle stream) + if ((active_stream == block_itr->associated_stream) || + (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady)) + { + // Reuse existing cache block. Insert into live blocks. + found = true; + search_key = *block_itr; + search_key.associated_stream = active_stream; + live_blocks.insert(search_key); - // Remove from free blocks - cached_blocks.erase(block_itr); - cached_bytes[device] -= search_key.bytes; + // Remove from free blocks + cached_bytes[device].free -= search_key.bytes; + cached_bytes[device].live += search_key.bytes; - if (debug) CubLog("\tdevice %d reused cached block (%lld bytes). 
%lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n", - device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size()); + if (debug) _CubLog("\tDevice %d reused cached block at %p (%lld bytes) for stream %lld (previously associated with stream %lld).\n", + device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) block_itr->associated_stream); + + cached_blocks.erase(block_itr); + + break; + } + block_itr++; } - else + + // Done searching: unlock + mutex.Unlock(); + } + + // Allocate the block if necessary + if (!found) + { + // Set runtime's current device to specified device (entrypoint may not be set) + if (device != entrypoint_device) { - // Need to allocate a new cache block. Unlock. - if (locked) { - Unlock(&spin_lock); - locked = false; - } + if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error; + if (CubDebug(error = cudaSetDevice(device))) return error; + } - // Set to specified device - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break; - if (CubDebug(error = cudaSetDevice(device))) break; + // Attempt to allocate + if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes)) == cudaErrorMemoryAllocation) + { + // The allocation attempt failed: free all cached blocks on device and retry + if (debug) _CubLog("\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations", + device, (long long) search_key.bytes, (long long) search_key.associated_stream); - // Allocate - if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) break; + error = cudaSuccess; // Reset the error we will return + cudaGetLastError(); // Reset CUDART's error // Lock - if (!locked) { - Lock(&spin_lock); - locked = true; + mutex.Lock(); + + // Iterate the range of free blocks on the same device + BlockDescriptor free_key(device); + CachedBlocks::iterator block_itr = cached_blocks.lower_bound(free_key); + + while ((block_itr != cached_blocks.end()) && (block_itr->device == device)) + { + // No need to worry about synchronization with the device: cudaFree is + // blocking and will synchronize across all kernels executing + // on the current device + + // Free device memory and destroy stream event. + if (CubDebug(error = cudaFree(block_itr->d_ptr))) break; + if (CubDebug(error = cudaEventDestroy(block_itr->ready_event))) break; + + // Reduce balance and erase entry + cached_bytes[device].free -= block_itr->bytes; + + if (debug) _CubLog("\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", + device, (long long) block_itr->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live); + + cached_blocks.erase(block_itr); + + block_itr++; } - // Insert into live blocks - live_blocks.insert(search_key); + // Unlock + mutex.Unlock(); + + // Return under error + if (error) return error; - if (debug) CubLog("\tdevice %d allocating new device block %lld bytes. 
%lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n", - device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size()); + // Try to allocate again + if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) return error; } - } while(0); - // Unlock - if (locked) { - Unlock(&spin_lock); - locked = false; + // Create ready event + if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming))) + return error; + + // Insert into live blocks + mutex.Lock(); + live_blocks.insert(search_key); + cached_bytes[device].live += search_key.bytes; + mutex.Unlock(); + + if (debug) _CubLog("\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld).\n", + device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream); + + // Attempt to revert back to previous device if necessary + if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) + { + if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; + } } - // Copy device pointer to output parameter (NULL on error) + // Copy device pointer to output parameter *d_ptr = search_key.d_ptr; - // Attempt to revert back to previous device if necessary - if (entrypoint_device != INVALID_DEVICE_ORDINAL) - { - if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; - } + if (debug) _CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n", + (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live); return error; - - #endif // CUB_PTX_ARCH } /** - * \brief Provides a suitable allocation of device memory for the given size on the current device + * \brief Provides a suitable allocation of device memory for the given size on the current device. + * + * Once freed, the allocation becomes available immediately for reuse within the \p active_stream + * with which it was associated with during allocation, and it becomes available for reuse within other + * streams when all prior work submitted to \p active_stream has completed. */ cudaError_t DeviceAllocate( - void** d_ptr, - size_t bytes) + void **d_ptr, ///< [out] Reference to pointer to the allocation + size_t bytes, ///< [in] Minimum number of bytes for the allocation + cudaStream_t active_stream = 0) ///< [in] The stream to be associated with this allocation { - #if (CUB_PTX_ARCH > 0) - // Caching functionality only defined on host - return CubDebug(cudaErrorInvalidConfiguration); - #else - cudaError_t error = cudaSuccess; - do { - int current_device; - if (CubDebug(error = cudaGetDevice(¤t_device))) break; - if (CubDebug(error = DeviceAllocate(d_ptr, bytes, current_device))) break; - } while(0); - - return error; - - #endif // CUB_PTX_ARCH + return DeviceAllocate(INVALID_DEVICE_ORDINAL, d_ptr, bytes, active_stream); } /** - * \brief Frees a live allocation of device memory on the specified device, returning it to the allocator + * \brief Frees a live allocation of device memory on the specified device, returning it to the allocator. + * + * Once freed, the allocation becomes available immediately for reuse within the \p active_stream + * with which it was associated with during allocation, and it becomes available for reuse within other + * streams when all prior work submitted to \p active_stream has completed. 
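 *
 * \par
 * Concretely (illustrative): a block freed on stream A is handed back to a
 * later DeviceAllocate() on stream B only once cudaEventQuery() on the
 * block's \p ready_event stops returning \p cudaErrorNotReady, i.e. once
 * stream A has drained past the point of the free; on stream A itself the
 * block is reusable immediately.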
*/ cudaError_t DeviceFree( - void* d_ptr, - int device) + int device, + void* d_ptr) { - #if (CUB_PTX_ARCH > 0) - // Caching functionality only defined on host - return CubDebug(cudaErrorInvalidConfiguration); - #else - - bool locked = false; int entrypoint_device = INVALID_DEVICE_ORDINAL; cudaError_t error = cudaSuccess; - BlockDescriptor search_key(d_ptr, device); + if (device == INVALID_DEVICE_ORDINAL) + { + if (CubDebug(error = cudaGetDevice(&entrypoint_device))) + return error; + device = entrypoint_device; + } // Lock - if (!locked) { - Lock(&spin_lock); - locked = true; - } + mutex.Lock(); - do { - // Find corresponding block descriptor - BusyBlocks::iterator block_itr = live_blocks.find(search_key); - if (block_itr == live_blocks.end()) + // Find corresponding block descriptor + bool recached = false; + BlockDescriptor search_key(d_ptr, device); + BusyBlocks::iterator block_itr = live_blocks.find(search_key); + if (block_itr != live_blocks.end()) + { + // Remove from live blocks + search_key = *block_itr; + live_blocks.erase(block_itr); + cached_bytes[device].live -= search_key.bytes; + + // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold + if ((search_key.bin != INVALID_BIN) && (cached_bytes[device].free + search_key.bytes <= max_cached_bytes)) { - // Cannot find pointer - if (CubDebug(error = cudaErrorUnknown)) break; + // Insert returned allocation into free blocks + recached = true; + cached_blocks.insert(search_key); + cached_bytes[device].free += search_key.bytes; + + if (debug) _CubLog("\tDevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n", + device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), + (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live); } - else - { - // Remove from live blocks - search_key = *block_itr; - live_blocks.erase(block_itr); - - // Check if we should keep the returned allocation - if (cached_bytes[device] + search_key.bytes <= max_cached_bytes) - { - // Insert returned allocation into free blocks - cached_blocks.insert(search_key); - cached_bytes[device] += search_key.bytes; - - if (debug) CubLog("\tdevice %d returned %lld bytes. %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n", - device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size()); - } - else - { - // Free the returned allocation. Unlock. - if (locked) { - Unlock(&spin_lock); - locked = false; - } + } - // Set to specified device - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break; - if (CubDebug(error = cudaSetDevice(device))) break; + // Unlock + mutex.Unlock(); - // Free device memory - if (CubDebug(error = cudaFree(d_ptr))) break; + // First set to specified device (entrypoint may not be set) + if (device != entrypoint_device) + { + if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error; + if (CubDebug(error = cudaSetDevice(device))) return error; + } - if (debug) CubLog("\tdevice %d freed %lld bytes. 
%lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n", - device, (long long) search_key.bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size()); - } - } - } while (0); + if (recached) + { + // Insert the ready event in the associated stream (must have current device set properly) + if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream))) return error; + } + else + { + // Free the allocation from the runtime and cleanup the event. + if (CubDebug(error = cudaFree(d_ptr))) return error; + if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) return error; - // Unlock - if (locked) { - Unlock(&spin_lock); - locked = false; + if (debug) _CubLog("\tDevice %d freed %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", + device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live); } - // Attempt to revert back to entry-point device if necessary - if (entrypoint_device != INVALID_DEVICE_ORDINAL) + // Reset device + if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) { if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; } return error; - - #endif // CUB_PTX_ARCH } /** - * \brief Frees a live allocation of device memory on the current device, returning it to the allocator + * \brief Frees a live allocation of device memory on the current device, returning it to the allocator. + * + * Once freed, the allocation becomes available immediately for reuse within the \p active_stream + * with which it was associated with during allocation, and it becomes available for reuse within other + * streams when all prior work submitted to \p active_stream has completed. */ cudaError_t DeviceFree( - void* d_ptr) + void* d_ptr) { - #if (CUB_PTX_ARCH > 0) - // Caching functionality only defined on host - return CubDebug(cudaErrorInvalidConfiguration); - #else - - int current_device; - cudaError_t error = cudaSuccess; - - do { - if (CubDebug(error = cudaGetDevice(¤t_device))) break; - if (CubDebug(error = DeviceFree(d_ptr, current_device))) break; - } while(0); - - return error; - - #endif // CUB_PTX_ARCH + return DeviceFree(INVALID_DEVICE_ORDINAL, d_ptr); } @@ -581,21 +639,11 @@ struct CachingDeviceAllocator */ cudaError_t FreeAllCached() { - #if (CUB_PTX_ARCH > 0) - // Caching functionality only defined on host - return CubDebug(cudaErrorInvalidConfiguration); - #else - cudaError_t error = cudaSuccess; - bool locked = false; int entrypoint_device = INVALID_DEVICE_ORDINAL; int current_device = INVALID_DEVICE_ORDINAL; - // Lock - if (!locked) { - Lock(&spin_lock); - locked = true; - } + mutex.Lock(); while (!cached_blocks.empty()) { @@ -617,21 +665,19 @@ struct CachingDeviceAllocator // Free device memory if (CubDebug(error = cudaFree(begin->d_ptr))) break; + if (CubDebug(error = cudaEventDestroy(begin->ready_event))) break; // Reduce balance and erase entry - cached_bytes[current_device] -= begin->bytes; - cached_blocks.erase(begin); + cached_bytes[current_device].free -= begin->bytes; - if (debug) CubLog("\tdevice %d freed %lld bytes. 
%lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n", - current_device, (long long) begin->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device], (long long) live_blocks.size()); - } + if (debug) _CubLog("\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", + current_device, (long long) begin->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device].free, (long long) live_blocks.size(), (long long) cached_bytes[current_device].live); - // Unlock - if (locked) { - Unlock(&spin_lock); - locked = false; + cached_blocks.erase(begin); } + mutex.Unlock(); + // Attempt to revert back to entry-point device if necessary if (entrypoint_device != INVALID_DEVICE_ORDINAL) { @@ -639,8 +685,6 @@ struct CachingDeviceAllocator } return error; - - #endif // CUB_PTX_ARCH } diff --git a/SRC/cub/util_arch.cuh b/SRC/cub/util_arch.cuh index 917c3606..28d81e7c 100644 --- a/SRC/cub/util_arch.cuh +++ b/SRC/cub/util_arch.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -41,157 +41,111 @@ CUB_NS_PREFIX /// CUB namespace namespace cub { +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document -/** - * \addtogroup UtilMgmt - * @{ - */ - +#if (__CUDACC_VER_MAJOR__ >= 9) && !defined(CUB_USE_COOPERATIVE_GROUPS) + #define CUB_USE_COOPERATIVE_GROUPS +#endif /// CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host pass). -#ifndef __CUDA_ARCH__ - #define CUB_PTX_ARCH 0 -#else - #define CUB_PTX_ARCH __CUDA_ARCH__ +#ifndef CUB_PTX_ARCH + #ifndef __CUDA_ARCH__ + #define CUB_PTX_ARCH 0 + #else + #define CUB_PTX_ARCH __CUDA_ARCH__ + #endif #endif /// Whether or not the source targeted by the active compiler pass is allowed to invoke device kernels or methods from the CUDA runtime API. -#if (CUB_PTX_ARCH == 0) || defined(CUB_CDP) - #define CUB_RUNTIME_ENABLED - #define CUB_RUNTIME_FUNCTION __host__ __device__ -#else - #define CUB_RUNTIME_FUNCTION __host__ +#ifndef CUB_RUNTIME_FUNCTION + #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__>= 350 && defined(__CUDACC_RDC__)) + #define CUB_RUNTIME_ENABLED + #define CUB_RUNTIME_FUNCTION __host__ __device__ + #else + #define CUB_RUNTIME_FUNCTION __host__ + #endif #endif - -/// Number of threads per warp (log) -#define CUB_LOG_WARP_THREADS(arch) \ - (5) - /// Number of threads per warp -#define CUB_WARP_THREADS(arch) \ - (1 << CUB_LOG_WARP_THREADS(arch)) +#ifndef CUB_LOG_WARP_THREADS + #define CUB_LOG_WARP_THREADS(arch) \ + (5) + #define CUB_WARP_THREADS(arch) \ + (1 << CUB_LOG_WARP_THREADS(arch)) + + #define CUB_PTX_WARP_THREADS CUB_WARP_THREADS(CUB_PTX_ARCH) + #define CUB_PTX_LOG_WARP_THREADS CUB_LOG_WARP_THREADS(CUB_PTX_ARCH) +#endif -/// Number of smem banks (log) -#define CUB_LOG_SMEM_BANKS(arch) \ - ((arch >= 200) ? \ - (5) : \ - (4)) /// Number of smem banks -#define CUB_SMEM_BANKS(arch) \ - (1 << CUB_LOG_SMEM_BANKS(arch)) - -/// Number of bytes per smem bank -#define CUB_SMEM_BANK_BYTES(arch) \ - (4) - -/// Number of smem bytes provisioned per SM -#define CUB_SMEM_BYTES(arch) \ - ((arch >= 200) ? 
\ - (48 * 1024) : \ - (16 * 1024)) - -/// Smem allocation size in bytes -#define CUB_SMEM_ALLOC_UNIT(arch) \ - ((arch >= 300) ? \ - (256) : \ - ((arch >= 200) ? \ - (128) : \ - (512))) - -/// Whether or not the architecture allocates registers by block (or by warp) -#define CUB_REGS_BY_BLOCK(arch) \ - ((arch >= 200) ? \ - (false) : \ - (true)) - -/// Number of registers allocated at a time per block (or by warp) -#define CUB_REG_ALLOC_UNIT(arch) \ - ((arch >= 300) ? \ - (256) : \ - ((arch >= 200) ? \ - (64) : \ - ((arch >= 120) ? \ - (512) : \ - (256)))) - -/// Granularity of warps for which registers are allocated -#define CUB_WARP_ALLOC_UNIT(arch) \ - ((arch >= 300) ? \ - (4) : \ - (2)) - -/// Maximum number of threads per SM -#define CUB_MAX_SM_THREADS(arch) \ - ((arch >= 300) ? \ - (2048) : \ - ((arch >= 200) ? \ - (1536) : \ - ((arch >= 120) ? \ - (1024) : \ - (768)))) - -/// Maximum number of thread blocks per SM -#define CUB_MAX_SM_BLOCKS(arch) \ - ((arch >= 300) ? \ - (16) : \ - (8)) - -/// Maximum number of threads per thread block -#define CUB_MAX_BLOCK_THREADS(arch) \ - ((arch >= 200) ? \ - (1024) : \ - (512)) - -/// Maximum number of registers per SM -#define CUB_MAX_SM_REGISTERS(arch) \ - ((arch >= 300) ? \ - (64 * 1024) : \ - ((arch >= 200) ? \ - (32 * 1024) : \ - ((arch >= 120) ? \ - (16 * 1024) : \ - (8 * 1024)))) +#ifndef CUB_LOG_SMEM_BANKS + #define CUB_LOG_SMEM_BANKS(arch) \ + ((arch >= 200) ? \ + (5) : \ + (4)) + #define CUB_SMEM_BANKS(arch) \ + (1 << CUB_LOG_SMEM_BANKS(arch)) + + #define CUB_PTX_LOG_SMEM_BANKS CUB_LOG_SMEM_BANKS(CUB_PTX_ARCH) + #define CUB_PTX_SMEM_BANKS CUB_SMEM_BANKS(CUB_PTX_ARCH) +#endif + /// Oversubscription factor -#define CUB_SUBSCRIPTION_FACTOR(arch) \ - ((arch >= 300) ? \ - (5) : \ - ((arch >= 200) ? \ - (3) : \ - (10))) +#ifndef CUB_SUBSCRIPTION_FACTOR + #define CUB_SUBSCRIPTION_FACTOR(arch) \ + ((arch >= 300) ? \ + (5) : \ + ((arch >= 200) ? \ + (3) : \ + (10))) + #define CUB_PTX_SUBSCRIPTION_FACTOR CUB_SUBSCRIPTION_FACTOR(CUB_PTX_ARCH) +#endif + /// Prefer padding overhead vs X-way conflicts greater than this threshold -#define CUB_PREFER_CONFLICT_OVER_PADDING(arch) \ - ((arch >= 300) ? \ - (1) : \ - (4)) +#ifndef CUB_PREFER_CONFLICT_OVER_PADDING + #define CUB_PREFER_CONFLICT_OVER_PADDING(arch) \ + ((arch >= 300) ? 
\ + (1) : \ + (4)) + #define CUB_PTX_PREFER_CONFLICT_OVER_PADDING CUB_PREFER_CONFLICT_OVER_PADDING(CUB_PTX_ARCH) +#endif -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document -#define CUB_PTX_LOG_WARP_THREADS CUB_LOG_WARP_THREADS(CUB_PTX_ARCH) -#define CUB_PTX_WARP_THREADS CUB_WARP_THREADS(CUB_PTX_ARCH) -#define CUB_PTX_LOG_SMEM_BANKS CUB_LOG_SMEM_BANKS(CUB_PTX_ARCH) -#define CUB_PTX_SMEM_BANKS CUB_SMEM_BANKS(CUB_PTX_ARCH) -#define CUB_PTX_SMEM_BANK_BYTES CUB_SMEM_BANK_BYTES(CUB_PTX_ARCH) -#define CUB_PTX_SMEM_BYTES CUB_SMEM_BYTES(CUB_PTX_ARCH) -#define CUB_PTX_SMEM_ALLOC_UNIT CUB_SMEM_ALLOC_UNIT(CUB_PTX_ARCH) -#define CUB_PTX_REGS_BY_BLOCK CUB_REGS_BY_BLOCK(CUB_PTX_ARCH) -#define CUB_PTX_REG_ALLOC_UNIT CUB_REG_ALLOC_UNIT(CUB_PTX_ARCH) -#define CUB_PTX_WARP_ALLOC_UNIT CUB_WARP_ALLOC_UNIT(CUB_PTX_ARCH) -#define CUB_PTX_MAX_SM_THREADS CUB_MAX_SM_THREADS(CUB_PTX_ARCH) -#define CUB_PTX_MAX_SM_BLOCKS CUB_MAX_SM_BLOCKS(CUB_PTX_ARCH) -#define CUB_PTX_MAX_BLOCK_THREADS CUB_MAX_BLOCK_THREADS(CUB_PTX_ARCH) -#define CUB_PTX_MAX_SM_REGISTERS CUB_MAX_SM_REGISTERS(CUB_PTX_ARCH) -#define CUB_PTX_PREFER_CONFLICT_OVER_PADDING CUB_PREFER_CONFLICT_OVER_PADDING(CUB_PTX_ARCH) +/// Scale down the number of threads to keep same amount of scratch storage as the nominal configuration for 4B data. Minimum of two warps. +#ifndef CUB_SCALED_BLOCK_THREADS + #define CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH) \ + (CUB_MIN( \ + NOMINAL_4B_BLOCK_THREADS, \ + CUB_WARP_THREADS(PTX_ARCH) * CUB_MAX( \ + 2, \ + (NOMINAL_4B_BLOCK_THREADS / CUB_WARP_THREADS(PTX_ARCH)) * 4 / sizeof(T)))) +#endif -#endif // Do not document +/// Scale down number of items per thread to keep the same amount of register storage as the nominal configuration for 4B data. Minimum 1 item per thread +#ifndef CUB_SCALED_ITEMS_PER_THREAD + #define CUB_SCALED_ITEMS_PER_THREAD(NOMINAL_4B_ITEMS_PER_THREAD, NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH) \ + CUB_MAX( \ + 1, \ + (sizeof(T) < 4) ? \ + ((NOMINAL_4B_ITEMS_PER_THREAD * NOMINAL_4B_BLOCK_THREADS * 4) / CUB_MAX(4, sizeof(T))) / CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH) / 2 : \ + ((NOMINAL_4B_ITEMS_PER_THREAD * NOMINAL_4B_BLOCK_THREADS * 4) / CUB_MAX(4, sizeof(T))) / CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH)) +#endif + +/// Define both nominal threads-per-block and items-per-thread +#ifndef CUB_SCALED_GRANULARITIES + #define CUB_SCALED_GRANULARITIES(NOMINAL_4B_BLOCK_THREADS, NOMINAL_4B_ITEMS_PER_THREAD, T) \ + CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, 200), \ + CUB_SCALED_ITEMS_PER_THREAD(NOMINAL_4B_ITEMS_PER_THREAD, NOMINAL_4B_BLOCK_THREADS, T, 200) +#endif -/** @} */ // end group UtilMgmt + +#endif // Do not document } // CUB namespace CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/util_debug.cuh b/SRC/cub/util_debug.cuh index 375fd5e4..3ad832e7 100644 --- a/SRC/cub/util_debug.cuh +++ b/SRC/cub/util_debug.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
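 *
 * Worked example for the CUB_SCALED_* macros above (values illustrative):
 * starting from a nominal configuration of 128 threads x 16 items tuned for
 * 4B data, instantiating for 8B doubles yields
 *
 *     CUB_SCALED_BLOCK_THREADS(128, double, 200)
 *         = CUB_MIN(128, 32 * CUB_MAX(2, (128 / 32) * 4 / 8)) = 64 threads
 *     CUB_SCALED_ITEMS_PER_THREAD(16, 128, double, 200)
 *         = ((16 * 128 * 4) / 8) / 64                         = 16 items
 *
 * so per-block scratch stays at 64 * 16 * 8B = 8KB, matching the nominal
 * 128 * 16 * 4B = 8KB.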
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
@@ -54,7 +54,7 @@ namespace cub {

 /// CUB error reporting macro (prints error messages to stderr)
-#if (defined(DEBUG) || defined(_DEBUG))
+#if (defined(DEBUG) || defined(_DEBUG)) && !defined(CUB_STDERR)
     #define CUB_STDERR
 #endif
@@ -70,6 +70,8 @@ __host__ __device__ __forceinline__ cudaError_t Debug(
     const char*     filename,
     int             line)
 {
+    (void)filename;
+    (void)line;
 #ifdef CUB_STDERR
     if (error)
     {
@@ -77,7 +79,7 @@ __host__ __device__ __forceinline__ cudaError_t Debug(
         fprintf(stderr, "CUDA error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error));
         fflush(stderr);
 #elif (CUB_PTX_ARCH >= 200)
-        printf("CUDA error %d [block %d, thread %d, %s, %d]\n", error, blockIdx.x, threadIdx.x, filename, line);
+        printf("CUDA error %d [block (%d,%d,%d) thread (%d,%d,%d), %s, %d]\n", error, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, filename, line);
 #endif
     }
 #endif
@@ -88,22 +90,50 @@ __host__ __device__ __forceinline__ cudaError_t Debug(
 /**
  * \brief Debug macro
  */
-#define CubDebug(e) cub::Debug((e), __FILE__, __LINE__)
+#ifndef CubDebug
+    #define CubDebug(e) cub::Debug((cudaError_t) (e), __FILE__, __LINE__)
+#endif

 /**
  * \brief Debug macro with exit
  */
-#define CubDebugExit(e) if (cub::Debug((e), __FILE__, __LINE__)) { exit(1); }
+#ifndef CubDebugExit
+    #define CubDebugExit(e) if (cub::Debug((cudaError_t) (e), __FILE__, __LINE__)) { exit(1); }
+#endif

 /**
  * \brief Log macro for printf statements.
  */
-#if (CUB_PTX_ARCH == 0)
-    #define CubLog(format, ...) printf(format,__VA_ARGS__);
-#elif (CUB_PTX_ARCH >= 200)
-    #define CubLog(format, ...) printf("[block %d, thread %d]: " format, blockIdx.x, threadIdx.x, __VA_ARGS__);
+#if !defined(_CubLog)
+    #if !(defined(__clang__) && defined(__CUDA__))
+        #if (CUB_PTX_ARCH == 0)
+            #define _CubLog(format, ...) printf(format,__VA_ARGS__);
+        #elif (CUB_PTX_ARCH >= 200)
+            #define _CubLog(format, ...) printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, __VA_ARGS__);
+        #endif
+    #else
+        // XXX shameless hack for clang around variadic printf...
+        //     Compiles w/o supplying -std=c++11 but shows a warning,
+        //     so we silence it :)
+        #pragma clang diagnostic ignored "-Wc++11-extensions"
+        #pragma clang diagnostic ignored "-Wunnamed-type-template-args"
+        template <class... Args>
+        inline __host__ __device__ void va_printf(char const* format, Args const&... args)
+        {
+        #ifdef __CUDA_ARCH__
+            printf(format, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, args...);
+        #else
+            printf(format, args...);
+        #endif
+        }
+        #ifndef __CUDA_ARCH__
+            #define _CubLog(format, ...) va_printf(format,__VA_ARGS__);
+        #else
+            #define _CubLog(format, ...) va_printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, __VA_ARGS__);
+        #endif
+    #endif
 #endif
diff --git a/SRC/cub/util_device.cuh b/SRC/cub/util_device.cuh
index f3b79078..a5f3b614 100644
--- a/SRC/cub/util_device.cuh
+++ b/SRC/cub/util_device.cuh
@@ -1,6 +1,6 @@
 /******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
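 *
 * Usage sketch for the override-able debug macros above (d_ptr and bytes are
 * illustrative locals):
 *
 *     void *d_ptr = NULL;
 *     size_t bytes = 1 << 20;
 *     cudaError_t error = CubDebug(cudaMalloc(&d_ptr, bytes));  // reports file/line on failure
 *     CubDebugExit(cudaFree(d_ptr));                            // exits the process on failure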
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -33,6 +33,7 @@ #pragma once +#include "util_type.cuh" #include "util_arch.cuh" #include "util_debug.cuh" #include "util_namespace.cuh" @@ -53,20 +54,13 @@ namespace cub { #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document -/** - * Empty kernel for querying PTX manifest metadata (e.g., version) for the current device - */ -template -__global__ void EmptyKernel(void) { } - - /** * Alias temporaries to externally-allocated device storage (or simply return the amount of storage needed). */ template -CUB_RUNTIME_FUNCTION __forceinline__ +__host__ __device__ __forceinline__ cudaError_t AliasTemporaries( - void *d_temp_storage, ///< [in] %Device allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \t d_temp_storage allocation void* (&allocations)[ALLOCATIONS], ///< [in,out] Pointers to device allocations needed size_t (&allocation_sizes)[ALLOCATIONS]) ///< [in] Sizes in bytes of device allocations needed @@ -83,6 +77,7 @@ cudaError_t AliasTemporaries( allocation_offsets[i] = bytes_needed; bytes_needed += allocation_bytes; } + bytes_needed += ALIGN_BYTES - 1; // Check if the caller is simply requesting the size of the storage allocation if (!d_temp_storage) @@ -98,6 +93,7 @@ cudaError_t AliasTemporaries( } // Alias + d_temp_storage = (void *) ((size_t(d_temp_storage) + ALIGN_BYTES - 1) & ALIGN_MASK); for (int i = 0; i < ALLOCATIONS; ++i) { allocations[i] = static_cast(d_temp_storage) + allocation_offsets[i]; @@ -107,10 +103,14 @@ cudaError_t AliasTemporaries( } - -#endif // DOXYGEN_SHOULD_SKIP_THIS +/** + * Empty kernel for querying PTX manifest metadata (e.g., version) for the current device + */ +template +__global__ void EmptyKernel(void) { } +#endif // DOXYGEN_SHOULD_SKIP_THIS /** * \brief Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10) @@ -132,6 +132,7 @@ CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int &ptx_version) #ifndef CUB_RUNTIME_ENABLED + (void)ptx_version; // CUDA API calls not supported from this device return cudaErrorInvalidConfiguration; @@ -164,6 +165,8 @@ CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int &ptx_version) CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersion(int &sm_version, int device_ordinal) { #ifndef CUB_RUNTIME_ENABLED + (void)sm_version; + (void)device_ordinal; // CUDA API calls not supported from this device return cudaErrorInvalidConfiguration; @@ -198,106 +201,13 @@ static cudaError_t SyncStream(cudaStream_t stream) #if (CUB_PTX_ARCH == 0) return cudaStreamSynchronize(stream); #else + (void)stream; // Device can't yet sync on a specific stream return cudaDeviceSynchronize(); #endif } -/** - * \brief Computes maximum SM occupancy in thread blocks for the given kernel function pointer \p kernel_ptr. 
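 *
 * Sketch of the two-pass AliasTemporaries() idiom used by CUB's dispatch
 * layer (num_items, d_temp_storage, and the two arrays are illustrative):
 *
 *     void   *allocations[2];
 *     size_t  allocation_sizes[2] = { num_items * sizeof(int), num_items * sizeof(double) };
 *     size_t  temp_storage_bytes  = 0;
 *
 *     AliasTemporaries(NULL, temp_storage_bytes, allocations, allocation_sizes);  // size query
 *     cudaMalloc(&d_temp_storage, temp_storage_bytes);
 *     AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes);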
- */ -template -CUB_RUNTIME_FUNCTION __forceinline__ -cudaError_t MaxSmOccupancy( - int &max_sm_occupancy, ///< [out] maximum number of thread blocks that can reside on a single SM - int sm_version, ///< [in] The SM architecture to run on - KernelPtr kernel_ptr, ///< [in] Kernel pointer for which to compute SM occupancy - int block_threads) ///< [in] Number of threads per thread block -{ -#ifndef CUB_RUNTIME_ENABLED - - // CUDA API calls not supported from this device - return CubDebug(cudaErrorInvalidConfiguration); - -#else - - cudaError_t error = cudaSuccess; - do - { - int warp_threads = 1 << CUB_LOG_WARP_THREADS(sm_version); - int max_sm_blocks = CUB_MAX_SM_BLOCKS(sm_version); - int max_sm_warps = CUB_MAX_SM_THREADS(sm_version) / warp_threads; - int regs_by_block = CUB_REGS_BY_BLOCK(sm_version); - int max_sm_registers = CUB_MAX_SM_REGISTERS(sm_version); - int warp_alloc_unit = CUB_WARP_ALLOC_UNIT(sm_version); - int smem_alloc_unit = CUB_SMEM_ALLOC_UNIT(sm_version); - int reg_alloc_unit = CUB_REG_ALLOC_UNIT(sm_version); - int smem_bytes = CUB_SMEM_BYTES(sm_version); - - // Get kernel attributes - cudaFuncAttributes kernel_attrs; - if (CubDebug(error = cudaFuncGetAttributes(&kernel_attrs, kernel_ptr))) break; - - // Number of warps per threadblock - int block_warps = (block_threads + warp_threads - 1) / warp_threads; - - // Max warp occupancy - int max_warp_occupancy = (block_warps > 0) ? - max_sm_warps / block_warps : - max_sm_blocks; - - // Maximum register occupancy - int max_reg_occupancy; - if ((block_threads == 0) || (kernel_attrs.numRegs == 0)) - { - // Prevent divide-by-zero - max_reg_occupancy = max_sm_blocks; - } - else if (regs_by_block) - { - // Allocates registers by threadblock - int block_regs = CUB_ROUND_UP_NEAREST(kernel_attrs.numRegs * warp_threads * block_warps, reg_alloc_unit); - max_reg_occupancy = max_sm_registers / block_regs; - } - else - { - // Allocates registers by warp - int sm_sides = warp_alloc_unit; - int sm_registers_per_side = max_sm_registers / sm_sides; - int regs_per_warp = CUB_ROUND_UP_NEAREST(kernel_attrs.numRegs * warp_threads, reg_alloc_unit); - int warps_per_side = sm_registers_per_side / regs_per_warp; - int warps = warps_per_side * sm_sides; - max_reg_occupancy = warps / block_warps; - } - - // Shared memory per threadblock - int block_allocated_smem = CUB_ROUND_UP_NEAREST( - kernel_attrs.sharedSizeBytes, - smem_alloc_unit); - - // Max shared memory occupancy - int max_smem_occupancy = (block_allocated_smem > 0) ? - (smem_bytes / block_allocated_smem) : - max_sm_blocks; - - // Max occupancy - max_sm_occupancy = CUB_MIN( - CUB_MIN(max_sm_blocks, max_warp_occupancy), - CUB_MIN(max_smem_occupancy, max_reg_occupancy)); - -// printf("max_smem_occupancy(%d), max_warp_occupancy(%d), max_reg_occupancy(%d) \n", max_smem_occupancy, max_warp_occupancy, max_reg_occupancy); - - } while (0); - - return error; - -#endif // CUB_RUNTIME_ENABLED -} - -#endif // Do not document - - /** * \brief Computes maximum SM occupancy in thread blocks for executing the given kernel function pointer \p kernel_ptr on the current device with \p block_threads per thread block. 
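 *
 * \par Snippet
 * An illustrative occupancy query (ExampleKernel is a hypothetical kernel):
 * \code
 * int max_sm_occupancy;
 * MaxSmOccupancy(max_sm_occupancy, ExampleKernel, 256);
 * // max_sm_occupancy now holds the number of resident 256-thread blocks per SM
 * \endcode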
* @@ -334,36 +244,101 @@ CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t MaxSmOccupancy( int &max_sm_occupancy, ///< [out] maximum number of thread blocks that can reside on a single SM KernelPtr kernel_ptr, ///< [in] Kernel pointer for which to compute SM occupancy - int block_threads) ///< [in] Number of threads per thread block + int block_threads, ///< [in] Number of threads per thread block + int dynamic_smem_bytes = 0) { #ifndef CUB_RUNTIME_ENABLED + (void)dynamic_smem_bytes; + (void)block_threads; + (void)kernel_ptr; + (void)max_sm_occupancy; // CUDA API calls not supported from this device return CubDebug(cudaErrorInvalidConfiguration); #else - cudaError_t error = cudaSuccess; - do + return cudaOccupancyMaxActiveBlocksPerMultiprocessor ( + &max_sm_occupancy, + kernel_ptr, + block_threads, + dynamic_smem_bytes); + +#endif // CUB_RUNTIME_ENABLED +} + + +/****************************************************************************** + * Policy management + ******************************************************************************/ + +/** + * Kernel dispatch configuration + */ +struct KernelConfig +{ + int block_threads; + int items_per_thread; + int tile_size; + int sm_occupancy; + + CUB_RUNTIME_FUNCTION __forceinline__ + KernelConfig() : block_threads(0), items_per_thread(0), tile_size(0), sm_occupancy(0) {} + + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t Init(KernelPtrT kernel_ptr) { - // Get device ordinal - int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + block_threads = AgentPolicyT::BLOCK_THREADS; + items_per_thread = AgentPolicyT::ITEMS_PER_THREAD; + tile_size = block_threads * items_per_thread; + cudaError_t retval = MaxSmOccupancy(sm_occupancy, kernel_ptr, block_threads); + return retval; + } +}; - // Get device SM version - int sm_version; - if (CubDebug(error = SmVersion(sm_version, device_ordinal))) break; - // Get SM occupancy - if (CubDebug(error = MaxSmOccupancy(max_sm_occupancy, sm_version, kernel_ptr, block_threads))) break; - } while (0); +/// Helper for dispatching into a policy chain +template +struct ChainedPolicy +{ + /// The policy for the active compiler pass + typedef typename If<(CUB_PTX_ARCH < PTX_VERSION), typename PrevPolicyT::ActivePolicy, PolicyT>::Type ActivePolicy; + + /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version + template + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Invoke(int ptx_version, FunctorT &op) + { + if (ptx_version < PTX_VERSION) { + return PrevPolicyT::Invoke(ptx_version, op); + } + return op.template Invoke(); + } +}; + +/// Helper for dispatching into a policy chain (end-of-chain specialization) +template +struct ChainedPolicy +{ + /// The policy for the active compiler pass + typedef PolicyT ActivePolicy; + + /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version + template + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Invoke(int /*ptx_version*/, FunctorT &op) { + return op.template Invoke(); + } +}; - return error; -#endif // CUB_RUNTIME_ENABLED -} + +#endif // Do not document + + /** @} */ // end group UtilMgmt diff --git a/SRC/cub/util_macro.cuh b/SRC/cub/util_macro.cuh index a94031a4..ff863654 100644 --- a/SRC/cub/util_macro.cuh +++ b/SRC/cub/util_macro.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. 
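 *
 * Sketch of how a dispatcher walks the ChainedPolicy helper introduced above
 * (the policy structs here are hypothetical):
 *
 *     struct Policy350 : cub::ChainedPolicy<350, Policy350, Policy350> { ... };
 *     struct Policy500 : cub::ChainedPolicy<500, Policy500, Policy350> { ... };
 *     typedef Policy500 MaxPolicy;
 *
 *     int ptx_version;
 *     cub::PtxVersion(ptx_version);
 *     MaxPolicy::Invoke(ptx_version, dispatch_op);  // runs dispatch_op.Invoke<ActivePolicy>()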
- * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -46,60 +46,56 @@ namespace cub { * @{ */ -/** - * Align struct - */ -#if defined(_WIN32) || defined(_WIN64) - #define CUB_ALIGN(bytes) __declspec(align(32)) -#else - #define CUB_ALIGN(bytes) __attribute__((aligned(bytes))) +#ifndef CUB_ALIGN + #if defined(_WIN32) || defined(_WIN64) + /// Align struct + #define CUB_ALIGN(bytes) __declspec(align(32)) + #else + /// Align struct + #define CUB_ALIGN(bytes) __attribute__((aligned(bytes))) + #endif #endif -/** - * Select maximum(a, b) - */ -#define CUB_MAX(a, b) (((b) > (a)) ? (b) : (a)) - -/** - * Select minimum(a, b) - */ -#define CUB_MIN(a, b) (((b) < (a)) ? (b) : (a)) +#ifndef CUB_MAX + /// Select maximum(a, b) + #define CUB_MAX(a, b) (((b) > (a)) ? (b) : (a)) +#endif -/** - * Quotient of x/y rounded down to nearest integer - */ -#define CUB_QUOTIENT_FLOOR(x, y) ((x) / (y)) +#ifndef CUB_MIN + /// Select minimum(a, b) + #define CUB_MIN(a, b) (((b) < (a)) ? (b) : (a)) +#endif -/** - * Quotient of x/y rounded up to nearest integer - */ -#define CUB_QUOTIENT_CEILING(x, y) (((x) + (y) - 1) / (y)) +#ifndef CUB_QUOTIENT_FLOOR + /// Quotient of x/y rounded down to nearest integer + #define CUB_QUOTIENT_FLOOR(x, y) ((x) / (y)) +#endif -/** - * x rounded up to the nearest multiple of y - */ -#define CUB_ROUND_UP_NEAREST(x, y) ((((x) + (y) - 1) / (y)) * y) +#ifndef CUB_QUOTIENT_CEILING + /// Quotient of x/y rounded up to nearest integer + #define CUB_QUOTIENT_CEILING(x, y) (((x) + (y) - 1) / (y)) +#endif -/** - * x rounded down to the nearest multiple of y - */ -#define CUB_ROUND_DOWN_NEAREST(x, y) (((x) / (y)) * y) +#ifndef CUB_ROUND_UP_NEAREST + /// x rounded up to the nearest multiple of y + #define CUB_ROUND_UP_NEAREST(x, y) ((((x) + (y) - 1) / (y)) * y) +#endif -/** - * Return character string for given type - */ -#define CUB_TYPE_STRING(type) ""#type +#ifndef CUB_ROUND_DOWN_NEAREST + /// x rounded down to the nearest multiple of y + #define CUB_ROUND_DOWN_NEAREST(x, y) (((x) / (y)) * y) +#endif -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - #define CUB_CAT_(a, b) a ## b - #define CUB_CAT(a, b) CUB_CAT_(a, b) -#endif // DOXYGEN_SHOULD_SKIP_THIS -/** - * Static assert - */ -#define CUB_STATIC_ASSERT(cond, msg) typedef int CUB_CAT(cub_static_assert, __LINE__)[(cond) ? 1 : -1] +#ifndef CUB_STATIC_ASSERT + #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + #define CUB_CAT_(a, b) a ## b + #define CUB_CAT(a, b) CUB_CAT_(a, b) + #endif // DOXYGEN_SHOULD_SKIP_THIS + /// Static assert + #define CUB_STATIC_ASSERT(cond, msg) typedef int CUB_CAT(cub_static_assert, __LINE__)[(cond) ? 1 : -1] +#endif /** @} */ // end group UtilModule diff --git a/SRC/cub/util_namespace.cuh b/SRC/cub/util_namespace.cuh index 39603644..c8991d08 100644 --- a/SRC/cub/util_namespace.cuh +++ b/SRC/cub/util_namespace.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
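 *
 * Usage sketch for the guarded utility macros above (num_items and tile_size
 * are illustrative):
 *
 *     int tiles  = CUB_QUOTIENT_CEILING(num_items, tile_size);  // round-up division
 *     int padded = CUB_ROUND_UP_NEAREST(num_items, 32);         // next multiple of 32
 *     CUB_STATIC_ASSERT(sizeof(int) == 4, "unexpected int width");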
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -37,5 +37,10 @@ //#define CUB_NS_PREFIX namespace thrust{ namespace detail { //#define CUB_NS_POSTFIX } } +#ifndef CUB_NS_PREFIX #define CUB_NS_PREFIX +#endif + +#ifndef CUB_NS_POSTFIX #define CUB_NS_POSTFIX +#endif diff --git a/SRC/cub/util_ptx.cuh b/SRC/cub/util_ptx.cuh index 4172de2a..582ca0d8 100644 --- a/SRC/cub/util_ptx.cuh +++ b/SRC/cub/util_ptx.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -37,6 +37,8 @@ #include "util_type.cuh" #include "util_arch.cuh" #include "util_namespace.cuh" +#include "util_debug.cuh" + /// Optional outer namespace(s) CUB_NS_PREFIX @@ -89,7 +91,7 @@ __device__ __forceinline__ unsigned int SHR_ADD( { unsigned int ret; #if CUB_PTX_ARCH >= 200 - asm("vshr.u32.u32.u32.clamp.add %0, %1, %2, %3;" : + asm ("vshr.u32.u32.u32.clamp.add %0, %1, %2, %3;" : "=r"(ret) : "r"(x), "r"(shift), "r"(addend)); #else ret = (x >> shift) + addend; @@ -108,7 +110,7 @@ __device__ __forceinline__ unsigned int SHL_ADD( { unsigned int ret; #if CUB_PTX_ARCH >= 200 - asm("vshl.u32.u32.u32.clamp.add %0, %1, %2, %3;" : + asm ("vshl.u32.u32.u32.clamp.add %0, %1, %2, %3;" : "=r"(ret) : "r"(x), "r"(shift), "r"(addend)); #else ret = (x << shift) + addend; @@ -126,11 +128,11 @@ __device__ __forceinline__ unsigned int BFE( UnsignedBits source, unsigned int bit_start, unsigned int num_bits, - Int2Type byte_len) + Int2Type /*byte_len*/) { unsigned int bits; #if CUB_PTX_ARCH >= 200 - asm("bfe.u32 %0, %1, %2, %3;" : "=r"(bits) : "r"((unsigned int) source), "r"(bit_start), "r"(num_bits)); + asm ("bfe.u32 %0, %1, %2, %3;" : "=r"(bits) : "r"((unsigned int) source), "r"(bit_start), "r"(num_bits)); #else const unsigned int MASK = (1 << num_bits) - 1; bits = (source >> bit_start) & MASK; @@ -147,7 +149,7 @@ __device__ __forceinline__ unsigned int BFE( UnsignedBits source, unsigned int bit_start, unsigned int num_bits, - Int2Type<8> byte_len) + Int2Type<8> /*byte_len*/) { const unsigned long long MASK = (1ull << num_bits) - 1; return (source >> bit_start) & MASK; @@ -179,7 +181,7 @@ __device__ __forceinline__ void BFI( unsigned int num_bits) { #if CUB_PTX_ARCH >= 200 - asm("bfi.b32 %0, %1, %2, %3, %4;" : + asm ("bfi.b32 %0, %1, %2, %3, %4;" : "=r"(ret) : "r"(y), "r"(x), "r"(bit_start), "r"(num_bits)); #else x <<= bit_start; @@ -196,7 +198,7 @@ __device__ __forceinline__ void BFI( __device__ __forceinline__ unsigned int IADD3(unsigned int x, unsigned int y, unsigned int z) { #if CUB_PTX_ARCH >= 200 - asm("vadd.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(x) : "r"(x), "r"(y), "r"(z)); + asm ("vadd.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(x) : "r"(x), "r"(y), "r"(z)); #else x = x + y + z; #endif @@ -233,7 +235,7 @@ __device__ __forceinline__ unsigned int IADD3(unsigned int x, unsigned int y, un __device__ __forceinline__ int PRMT(unsigned int a, unsigned int b, unsigned int index) { int ret; - asm("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index)); + asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index)); return ret; } @@ -247,6 
+249,120 @@ __device__ __forceinline__ void BAR(int count) asm volatile("bar.sync 1, %0;" : : "r"(count)); } +/** + * CTA barrier + */ +__device__ __forceinline__ void CTA_SYNC() +{ + __syncthreads(); +} + + +/** + * CTA barrier with predicate + */ +__device__ __forceinline__ int CTA_SYNC_AND(int p) +{ + return __syncthreads_and(p); +} + + +/** + * Warp barrier + */ +__device__ __forceinline__ void WARP_SYNC(unsigned int member_mask) +{ +#ifdef CUB_USE_COOPERATIVE_GROUPS + __syncwarp(member_mask); +#endif +} + + +/** + * Warp any + */ +__device__ __forceinline__ int WARP_ANY(int predicate, unsigned int member_mask) +{ +#ifdef CUB_USE_COOPERATIVE_GROUPS + return __any_sync(member_mask, predicate); +#else + return ::__any(predicate); +#endif +} + + +/** + * Warp any + */ +__device__ __forceinline__ int WARP_ALL(int predicate, unsigned int member_mask) +{ +#ifdef CUB_USE_COOPERATIVE_GROUPS + return __all_sync(member_mask, predicate); +#else + return ::__all(predicate); +#endif +} + + +/** + * Warp ballot + */ +__device__ __forceinline__ int WARP_BALLOT(int predicate, unsigned int member_mask) +{ +#ifdef CUB_USE_COOPERATIVE_GROUPS + return __ballot_sync(member_mask, predicate); +#else + return __ballot(predicate); +#endif +} + +/** + * Warp synchronous shfl_up + */ +__device__ __forceinline__ +unsigned int SHFL_UP_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask) +{ +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;" + : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags), "r"(member_mask)); +#else + asm volatile("shfl.up.b32 %0, %1, %2, %3;" + : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags)); +#endif + return word; +} + +/** + * Warp synchronous shfl_down + */ +__device__ __forceinline__ +unsigned int SHFL_DOWN_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask) +{ +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;" + : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags), "r"(member_mask)); +#else + asm volatile("shfl.down.b32 %0, %1, %2, %3;" + : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags)); +#endif + return word; +} + +/** + * Warp synchronous shfl_idx + */ +__device__ __forceinline__ +unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, int flags, unsigned int member_mask) +{ +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;" + : "=r"(word) : "r"(word), "r"(src_lane), "r"(flags), "r"(member_mask)); +#else + asm volatile("shfl.idx.b32 %0, %1, %2, %3;" + : "=r"(word) : "r"(word), "r"(src_lane), "r"(flags)); +#endif + return word; +} /** * Floating point multiply. (Mantissa LSB rounds towards zero.) 
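/*
 * Sketch (not CUB code): typical use of the *_SYNC wrappers above. This
 * hypothetical device helper sums a value across a full 32-lane warp with
 * SHFL_DOWN_SYNC; flags = 31 is the source-lane clamp for a full warp
 * (segment mask 0).
 */
__device__ __forceinline__ int WarpSumExample(int value)
{
    const unsigned int member_mask = 0xffffffffu;           // all 32 lanes participate
    #pragma unroll
    for (int offset = 16; offset > 0; offset >>= 1)
        value += (int) SHFL_DOWN_SYNC((unsigned int) value, offset, 31, member_mask);
    return value;   // lane 0 holds the warp total; upper lanes hold partial sums
}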
@@ -254,7 +370,7 @@ __device__ __forceinline__ void BAR(int count) __device__ __forceinline__ float FMUL_RZ(float a, float b) { float d; - asm("mul.rz.f32 %0, %1, %2;" : "=f"(d) : "f"(a), "f"(b)); + asm ("mul.rz.f32 %0, %1, %2;" : "=f"(d) : "f"(a), "f"(b)); return d; } @@ -265,7 +381,7 @@ __device__ __forceinline__ float FMUL_RZ(float a, float b) __device__ __forceinline__ float FFMA_RZ(float a, float b, float c) { float d; - asm("fma.rz.f32 %0, %1, %2, %3;" : "=f"(d) : "f"(a), "f"(b), "f"(c)); + asm ("fma.rz.f32 %0, %1, %2, %3;" : "=f"(d) : "f"(a), "f"(b), "f"(c)); return d; } @@ -275,12 +391,20 @@ __device__ __forceinline__ float FFMA_RZ(float a, float b, float c) * \brief Terminates the calling thread */ __device__ __forceinline__ void ThreadExit() { - asm("exit;"); + asm volatile("exit;"); } /** - * \brief Returns the row-major linear thread identifier for a multidimensional threadblock + * \brief Abort execution and generate an interrupt to the host CPU + */ +__device__ __forceinline__ void ThreadTrap() { + asm volatile("trap;"); +} + + +/** + * \brief Returns the row-major linear thread identifier for a multidimensional thread block */ __device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z) { @@ -296,7 +420,7 @@ __device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int __device__ __forceinline__ unsigned int LaneId() { unsigned int ret; - asm("mov.u32 %0, %laneid;" : "=r"(ret) ); + asm ("mov.u32 %0, %%laneid;" : "=r"(ret) ); return ret; } @@ -307,7 +431,7 @@ __device__ __forceinline__ unsigned int LaneId() __device__ __forceinline__ unsigned int WarpId() { unsigned int ret; - asm("mov.u32 %0, %warpid;" : "=r"(ret) ); + asm ("mov.u32 %0, %%warpid;" : "=r"(ret) ); return ret; } @@ -317,7 +441,7 @@ __device__ __forceinline__ unsigned int WarpId() __device__ __forceinline__ unsigned int LaneMaskLt() { unsigned int ret; - asm("mov.u32 %0, %lanemask_lt;" : "=r"(ret) ); + asm ("mov.u32 %0, %%lanemask_lt;" : "=r"(ret) ); return ret; } @@ -327,7 +451,7 @@ __device__ __forceinline__ unsigned int LaneMaskLt() __device__ __forceinline__ unsigned int LaneMaskLe() { unsigned int ret; - asm("mov.u32 %0, %lanemask_le;" : "=r"(ret) ); + asm ("mov.u32 %0, %%lanemask_le;" : "=r"(ret) ); return ret; } @@ -337,7 +461,7 @@ __device__ __forceinline__ unsigned int LaneMaskLe() __device__ __forceinline__ unsigned int LaneMaskGt() { unsigned int ret; - asm("mov.u32 %0, %lanemask_gt;" : "=r"(ret) ); + asm ("mov.u32 %0, %%lanemask_gt;" : "=r"(ret) ); return ret; } @@ -347,7 +471,7 @@ __device__ __forceinline__ unsigned int LaneMaskGt() __device__ __forceinline__ unsigned int LaneMaskGe() { unsigned int ret; - asm("mov.u32 %0, %lanemask_ge;" : "=r"(ret) ); + asm ("mov.u32 %0, %%lanemask_ge;" : "=r"(ret) ); return ret; } @@ -360,6 +484,9 @@ __device__ __forceinline__ unsigned int LaneMaskGe() * \brief Shuffle-up for any data type. Each warp-lanei obtains the value \p input contributed by warp-lanei-src_offset. For thread lanes \e i < src_offset, the thread's own \p input is returned to the thread. ![](shfl_up_logo.png) * \ingroup WarpModule * + * \tparam LOGICAL_WARP_THREADS The number of threads per "logical" warp. Must be a power-of-two <= 32. + * \tparam T [inferred] The input/output element type + * * \par * - Available only for SM3.0 or newer * @@ -376,7 +503,7 @@ __device__ __forceinline__ unsigned int LaneMaskGe() * double thread_data = ... 
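/*
 * Sketch (not CUB code): a common use of the lane-mask helpers above is
 * warp-aggregated ranking, i.e. counting how many lower-numbered lanes also
 * have their predicate set. The name WarpRankExample is hypothetical.
 */
__device__ __forceinline__ int WarpRankExample(int predicate)
{
    unsigned int ballot = (unsigned int) WARP_BALLOT(predicate, 0xffffffffu); // bit i <=> lane i's predicate
    return __popc(ballot & LaneMaskLt());                                     // active lanes below this one
}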
* * // Obtain item from two ranks below - * double peer_data = ShuffleUp(thread_data, 2); + * double peer_data = ShuffleUp<32>(thread_data, 2, 0, 0xffffffff); * * \endcode * \par @@ -384,31 +511,37 @@ __device__ __forceinline__ unsigned int LaneMaskGe() * The corresponding output \p peer_data will be {1.0, 2.0, 1.0, 2.0, 3.0, ..., 30.0}. * */ -template +template < + int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp + typename T> __device__ __forceinline__ T ShuffleUp( T input, ///< [in] The value to broadcast - int src_offset) ///< [in] The relative down-offset of the peer to read from + int src_offset, ///< [in] The relative down-offset of the peer to read from + int first_thread, ///< [in] Index of first lane in logical warp (typically 0) + unsigned int member_mask) ///< [in] 32-bit mask of participating warp lanes { - enum - { - SHFL_C = 0, + /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up + enum { + SHFL_C = (32 - LOGICAL_WARP_THREADS) << 8 }; typedef typename UnitWord::ShuffleWord ShuffleWord; const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); + T output; ShuffleWord *output_alias = reinterpret_cast(&output); ShuffleWord *input_alias = reinterpret_cast(&input); + unsigned int shuffle_word; + shuffle_word = SHFL_UP_SYNC((unsigned int)input_alias[0], src_offset, first_thread | SHFL_C, member_mask); + output_alias[0] = shuffle_word; + #pragma unroll - for (int WORD = 0; WORD < WORDS; ++WORD) + for (int WORD = 1; WORD < WORDS; ++WORD) { - unsigned int shuffle_word = input_alias[WORD]; - asm( - " shfl.up.b32 %0, %1, %2, %3;" - : "=r"(shuffle_word) : "r"(shuffle_word), "r"(src_offset), "r"(SHFL_C)); - output_alias[WORD] = (ShuffleWord) shuffle_word; + shuffle_word = SHFL_UP_SYNC((unsigned int)input_alias[WORD], src_offset, first_thread | SHFL_C, member_mask); + output_alias[WORD] = shuffle_word; } return output; @@ -419,6 +552,9 @@ __device__ __forceinline__ T ShuffleUp( * \brief Shuffle-down for any data type. Each warp-lanei obtains the value \p input contributed by warp-lanei+src_offset. For thread lanes \e i >= WARP_THREADS, the thread's own \p input is returned to the thread. ![](shfl_down_logo.png) * \ingroup WarpModule * + * \tparam LOGICAL_WARP_THREADS The number of threads per "logical" warp. Must be a power-of-two <= 32. + * \tparam T [inferred] The input/output element type + * * \par * - Available only for SM3.0 or newer * @@ -435,7 +571,7 @@ __device__ __forceinline__ T ShuffleUp( * double thread_data = ... * * // Obtain item from two ranks below - * double peer_data = ShuffleDown(thread_data, 2); + * double peer_data = ShuffleDown<32>(thread_data, 2, 31, 0xffffffff); * * \endcode * \par @@ -443,75 +579,51 @@ __device__ __forceinline__ T ShuffleUp( * The corresponding output \p peer_data will be {3.0, 4.0, 5.0, 6.0, 7.0, ..., 32.0}. 
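/*
 * Worked example of the packed SHFL control word used above. PTX shfl takes a
 * "c" operand whose bits [4:0] hold the source-lane bound and whose bits
 * [12:8] hold a segment mask that pins lane-id bits, splitting the physical
 * warp into logical sub-warps:
 *
 *   LOGICAL_WARP_THREADS = 32:  SHFL_C = (32 - 32) << 8 = 0x0000
 *       -> one 32-lane segment; first_thread | SHFL_C == first_thread
 *   LOGICAL_WARP_THREADS = 16:  SHFL_C = (32 - 16) << 8 = 0x1000
 *       -> segment mask 0b10000 pins lane-id bit 4, so lanes 0..15 and
 *          16..31 shuffle independently as two 16-lane logical warps.
 */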
* */ -template +template < + int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp + typename T> __device__ __forceinline__ T ShuffleDown( T input, ///< [in] The value to broadcast - int src_offset) ///< [in] The relative up-offset of the peer to read from + int src_offset, ///< [in] The relative up-offset of the peer to read from + int last_thread, ///< [in] Index of last thread in logical warp (typically 31 for a 32-thread warp) + unsigned int member_mask) ///< [in] 32-bit mask of participating warp lanes { - enum - { - SHFL_C = CUB_PTX_WARP_THREADS - 1, + /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up + enum { + SHFL_C = (32 - LOGICAL_WARP_THREADS) << 8 }; typedef typename UnitWord::ShuffleWord ShuffleWord; const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); + T output; ShuffleWord *output_alias = reinterpret_cast(&output); ShuffleWord *input_alias = reinterpret_cast(&input); + unsigned int shuffle_word; + shuffle_word = SHFL_DOWN_SYNC((unsigned int)input_alias[0], src_offset, last_thread | SHFL_C, member_mask); + output_alias[0] = shuffle_word; + #pragma unroll - for (int WORD = 0; WORD < WORDS; ++WORD) + for (int WORD = 1; WORD < WORDS; ++WORD) { - unsigned int shuffle_word = input_alias[WORD]; - asm( - " shfl.down.b32 %0, %1, %2, %3;" - : "=r"(shuffle_word) : "r"(shuffle_word), "r"(src_offset), "r"(SHFL_C)); - output_alias[WORD] = (ShuffleWord) shuffle_word; + shuffle_word = SHFL_DOWN_SYNC((unsigned int)input_alias[WORD], src_offset, last_thread | SHFL_C, member_mask); + output_alias[WORD] = shuffle_word; } return output; } -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document /** - * \brief Shuffle-broadcast for any data type. Each warp-lanei obtains the value \p input contributed by warp-lanesrc_lane. For \p src_lane < 0 or \p src_lane >= WARP_THREADS, then the thread's own \p input is returned to the thread. ![](shfl_broadcast_logo.png) - * \ingroup WarpModule + * \brief Shuffle-broadcast for any data type. Each warp-lanei obtains the value \p input + * contributed by warp-lanesrc_lane. For \p src_lane < 0 or \p src_lane >= WARP_THREADS, + * then the thread's own \p input is returned to the thread. ![](shfl_broadcast_logo.png) + * + * \tparam LOGICAL_WARP_THREADS The number of threads per "logical" warp. Must be a power-of-two <= 32. + * \tparam T [inferred] The input/output element type * - * \par - * - Available only for SM3.0 or newer - */ -template -__device__ __forceinline__ T ShuffleBroadcast( - T input, ///< [in] The value to broadcast - int src_lane, ///< [in] Which warp lane is to do the broadcasting - int logical_warp_threads) ///< [in] Number of threads per logical warp -{ - typedef typename UnitWord::ShuffleWord ShuffleWord; - - const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); - T output; - ShuffleWord *output_alias = reinterpret_cast(&output); - ShuffleWord *input_alias = reinterpret_cast(&input); - - #pragma unroll - for (int WORD = 0; WORD < WORDS; ++WORD) - { - unsigned int shuffle_word = input_alias[WORD]; - asm("shfl.idx.b32 %0, %1, %2, %3;" - : "=r"(shuffle_word) : "r"(shuffle_word), "r"(src_lane), "r"(logical_warp_threads - 1)); - output_alias[WORD] = (ShuffleWord) shuffle_word; - } - - return output; -} - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - - /** - * \brief Shuffle-broadcast for any data type. Each warp-lanei obtains the value \p input contributed by warp-lanesrc_lane. 
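/*
 * Sketch (not CUB code, assumes CUDA 9+): what the WORDS loop above does for a
 * type wider than 32 bits. A double moves as two 32-bit shuffles; this
 * hypothetical helper spells it out with the raw intrinsics for a full warp.
 */
__device__ __forceinline__ double ShuffleDownDoubleExample(double value, int offset)
{
    int lo = __double2loint(value);
    int hi = __double2hiint(value);
    lo = __shfl_down_sync(0xffffffffu, lo, offset);   // word 0
    hi = __shfl_down_sync(0xffffffffu, hi, offset);   // word 1
    return __hiloint2double(hi, lo);
}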
For \p src_lane < 0 or \p src_lane >= WARP_THREADS, then the thread's own \p input is returned to the thread. ![](shfl_broadcast_logo.png) * \ingroup WarpModule * * \par @@ -530,7 +642,7 @@ __device__ __forceinline__ T ShuffleBroadcast( * double thread_data = ... * * // Obtain item from thread 0 - * double peer_data = ShuffleBroadcast(thread_data, 0); + * double peer_data = ShuffleIndex<32>(thread_data, 0, 0xffffffff); * * \endcode * \par @@ -538,68 +650,108 @@ __device__ __forceinline__ T ShuffleBroadcast( * The corresponding output \p peer_data will be {1.0, 1.0, 1.0, 1.0, 1.0, ..., 1.0}. * */ -template -__device__ __forceinline__ T ShuffleBroadcast( - T input, ///< [in] The value to broadcast - int src_lane) ///< [in] Which warp lane is to do the broadcasting +template < + int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp + typename T> +__device__ __forceinline__ T ShuffleIndex( + T input, ///< [in] The value to broadcast + int src_lane, ///< [in] Which warp lane is to do the broadcasting + unsigned int member_mask) ///< [in] 32-bit mask of participating warp lanes { - return ShuffleBroadcast(input, src_lane, CUB_PTX_WARP_THREADS); -} + /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up + enum { + SHFL_C = ((32 - LOGICAL_WARP_THREADS) << 8) | (LOGICAL_WARP_THREADS - 1) + }; + + typedef typename UnitWord::ShuffleWord ShuffleWord; + + const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); + + T output; + ShuffleWord *output_alias = reinterpret_cast(&output); + ShuffleWord *input_alias = reinterpret_cast(&input); + + unsigned int shuffle_word; + shuffle_word = SHFL_IDX_SYNC((unsigned int)input_alias[0], + src_lane, + SHFL_C, + member_mask); + + output_alias[0] = shuffle_word; + + #pragma unroll + for (int WORD = 1; WORD < WORDS; ++WORD) + { + shuffle_word = SHFL_IDX_SYNC((unsigned int)input_alias[WORD], + src_lane, + SHFL_C, + member_mask); + output_alias[WORD] = shuffle_word; + } + return output; +} /** - * \brief Portable implementation of __all - * \ingroup WarpModule + * Compute a 32b mask of threads having the same least-significant + * LABEL_BITS of \p label as the calling thread. */ -__device__ __forceinline__ int WarpAll(int cond) +template +inline __device__ unsigned int MatchAny(unsigned int label) { -#if CUB_PTX_ARCH < 120 + unsigned int retval; + + // Extract masks of common threads for each bit + #pragma unroll + for (int BIT = 0; BIT < LABEL_BITS; ++BIT) + { + unsigned int mask; + unsigned int current_bit = 1 << BIT; + asm ("{\n" + " .reg .pred p;\n" + " and.b32 %0, %1, %2;" + " setp.eq.u32 p, %0, %2;\n" +#ifdef CUB_USE_COOPERATIVE_GROUPS + " vote.ballot.sync.b32 %0, p, 0xffffffff;\n" +#else + " vote.ballot.b32 %0, p;\n" +#endif + " @!p not.b32 %0, %0;\n" + "}\n" : "=r"(mask) : "r"(label), "r"(current_bit)); + + // Remove peers who differ + retval = (BIT == 0) ? 
mask : retval & mask; + } + + return retval; + +// // VOLTA match +// unsigned int retval; +// asm ("{\n" +// " match.any.sync.b32 %0, %1, 0xffffffff;\n" +// "}\n" : "=r"(retval) : "r"(label)); +// return retval; + +} - __shared__ volatile int warp_signals[CUB_PTX_MAX_SM_THREADS / CUB_PTX_WARP_THREADS]; - if (LaneId() == 0) - warp_signals[WarpId()] = 1; - if (cond == 0) - warp_signals[WarpId()] = 0; - return warp_signals[WarpId()]; -#else - return __all(cond); -#endif -} -/** - * \brief Portable implementation of __any - * \ingroup WarpModule - */ -__device__ __forceinline__ int WarpAny(int cond) -{ -#if CUB_PTX_ARCH < 120 - __shared__ volatile int warp_signals[CUB_PTX_MAX_SM_THREADS / CUB_PTX_WARP_THREADS]; - if (LaneId() == 0) - warp_signals[WarpId()] = 0; - if (cond) - warp_signals[WarpId()] = 1; - return warp_signals[WarpId()]; -#else - return __any(cond); -#endif -} } // CUB namespace diff --git a/SRC/cub/util_type.cuh b/SRC/cub/util_type.cuh index 821a55db..0ba41e1e 100644 --- a/SRC/cub/util_type.cuh +++ b/SRC/cub/util_type.cuh @@ -1,6 +1,6 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -35,11 +35,18 @@ #include #include +#include + +#if (__CUDACC_VER_MAJOR__ >= 9) + #include +#endif #include "util_macro.cuh" #include "util_arch.cuh" #include "util_namespace.cuh" + + /// Optional outer namespace(s) CUB_NS_PREFIX @@ -110,6 +117,135 @@ struct Equals #endif // DOXYGEN_SHOULD_SKIP_THIS +/****************************************************************************** + * Static math + ******************************************************************************/ + +/** + * \brief Statically determine log2(N), rounded up. + * + * For example: + * Log2<8>::VALUE // 3 + * Log2<3>::VALUE // 2 + */ +template +struct Log2 +{ + /// Static logarithm value + enum { VALUE = Log2> 1), COUNT + 1>::VALUE }; // Inductive case +}; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +template +struct Log2 +{ + enum {VALUE = (1 << (COUNT - 1) < N) ? // Base case + COUNT : + COUNT - 1 }; +}; + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/** + * \brief Statically determine if N is a power-of-two + */ +template +struct PowerOfTwo +{ + enum { VALUE = ((N & (N - 1)) == 0) }; +}; + + + +/****************************************************************************** + * Pointer vs. iterator detection + ******************************************************************************/ + +/** + * \brief Pointer vs. 
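/*
 * Sketch (not CUB code): reference semantics of MatchAny above, expressed with
 * WARP_BALLOT instead of inline PTX. Each round ballots one label bit and
 * keeps only the lanes that agree with the calling lane; after LABEL_BITS
 * rounds the mask holds exactly the lanes sharing the low LABEL_BITS of
 * \p label. The name MatchAnyRefExample is hypothetical.
 */
template <int LABEL_BITS>
__device__ __forceinline__ unsigned int MatchAnyRefExample(unsigned int label)
{
    unsigned int peers = 0xffffffffu;
    #pragma unroll
    for (int bit = 0; bit < LABEL_BITS; ++bit)
    {
        int mine = (label >> bit) & 1;
        unsigned int ballot = (unsigned int) WARP_BALLOT(mine, 0xffffffffu);
        peers &= mine ? ballot : ~ballot;    // keep lanes that agree on this bit
    }
    return peers;
}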
iterator + */ +template +struct IsPointer +{ + enum { VALUE = 0 }; +}; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +template +struct IsPointer +{ + enum { VALUE = 1 }; +}; + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + +/****************************************************************************** + * Qualifier detection + ******************************************************************************/ + +/** + * \brief Volatile modifier test + */ +template +struct IsVolatile +{ + enum { VALUE = 0 }; +}; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +template +struct IsVolatile +{ + enum { VALUE = 1 }; +}; + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/****************************************************************************** + * Qualifier removal + ******************************************************************************/ + +/** + * \brief Removes \p const and \p volatile qualifiers from type \p Tp. + * + * For example: + * typename RemoveQualifiers::Type // int; + */ +template +struct RemoveQualifiers +{ + /// Type without \p const and \p volatile qualifiers + typedef Up Type; +}; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +template +struct RemoveQualifiers +{ + typedef Up Type; +}; + +template +struct RemoveQualifiers +{ + typedef Up Type; +}; + +template +struct RemoveQualifiers +{ + typedef Up Type; +}; + + /****************************************************************************** * Marker types ******************************************************************************/ @@ -122,11 +258,11 @@ struct NullType #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template - __host__ __device__ __forceinline__ NullType& operator =(const T& b) { return *this; } + __host__ __device__ __forceinline__ NullType& operator =(const T&) { return *this; } - __host__ __device__ __forceinline__ bool operator ==(const NullType& b) { return true; } + __host__ __device__ __forceinline__ bool operator ==(const NullType&) { return true; } - __host__ __device__ __forceinline__ bool operator !=(const NullType& b) { return false; } + __host__ __device__ __forceinline__ bool operator !=(const NullType&) { return false; } #endif // DOXYGEN_SHOULD_SKIP_THIS }; @@ -161,41 +297,52 @@ struct AlignBytes enum { - /// The alignment of T in bytes + /// The "true CUDA" alignment of T in bytes ALIGN_BYTES = sizeof(Pad) - sizeof(T) }; -}; -// Specializations where host C++ compilers (e.g., Windows) may disagree with device C++ compilers (EDG) + /// The "truly aligned" type + typedef T Type; +}; -template <> struct AlignBytes { enum { ALIGN_BYTES = 8 }; }; -template <> struct AlignBytes { enum { ALIGN_BYTES = 8 }; }; -template <> struct AlignBytes { enum { ALIGN_BYTES = 8 }; }; -template <> struct AlignBytes { enum { ALIGN_BYTES = 8 }; }; +// Specializations where host C++ compilers (e.g., 32-bit Windows) may disagree +// with device C++ compilers (EDG) on types passed as template parameters through +// kernel functions + +#define __CUB_ALIGN_BYTES(t, b) \ + template <> struct AlignBytes \ + { enum { ALIGN_BYTES = b }; typedef __align__(b) t Type; }; + +__CUB_ALIGN_BYTES(short4, 8) +__CUB_ALIGN_BYTES(ushort4, 8) +__CUB_ALIGN_BYTES(int2, 8) +__CUB_ALIGN_BYTES(uint2, 8) +__CUB_ALIGN_BYTES(long long, 8) +__CUB_ALIGN_BYTES(unsigned long long, 8) +__CUB_ALIGN_BYTES(float2, 8) +__CUB_ALIGN_BYTES(double, 8) #ifdef _WIN32 - template <> struct AlignBytes { enum { ALIGN_BYTES = 8 }; }; - template <> struct AlignBytes { enum { ALIGN_BYTES = 8 }; }; -#endif -template <> struct 
AlignBytes { enum { ALIGN_BYTES = 8 }; }; -template <> struct AlignBytes { enum { ALIGN_BYTES = 8 }; }; -template <> struct AlignBytes { enum { ALIGN_BYTES = 8 }; }; -template <> struct AlignBytes { enum { ALIGN_BYTES = 8 }; }; - -template <> struct AlignBytes { enum { ALIGN_BYTES = 16 }; }; -template <> struct AlignBytes { enum { ALIGN_BYTES = 16 }; }; -template <> struct AlignBytes { enum { ALIGN_BYTES = 16 }; }; -#ifndef _WIN32 - template <> struct AlignBytes { enum { ALIGN_BYTES = 16 }; }; - template <> struct AlignBytes { enum { ALIGN_BYTES = 16 }; }; + __CUB_ALIGN_BYTES(long2, 8) + __CUB_ALIGN_BYTES(ulong2, 8) +#else + __CUB_ALIGN_BYTES(long2, 16) + __CUB_ALIGN_BYTES(ulong2, 16) #endif -template <> struct AlignBytes { enum { ALIGN_BYTES = 16 }; }; -template <> struct AlignBytes { enum { ALIGN_BYTES = 16 }; }; -template <> struct AlignBytes { enum { ALIGN_BYTES = 16 }; }; -template <> struct AlignBytes { enum { ALIGN_BYTES = 16 }; }; -template <> struct AlignBytes { enum { ALIGN_BYTES = 16 }; }; -template <> struct AlignBytes { enum { ALIGN_BYTES = 16 }; }; -template <> struct AlignBytes { enum { ALIGN_BYTES = 16 }; }; -template <> struct AlignBytes { enum { ALIGN_BYTES = 16 }; }; +__CUB_ALIGN_BYTES(int4, 16) +__CUB_ALIGN_BYTES(uint4, 16) +__CUB_ALIGN_BYTES(float4, 16) +__CUB_ALIGN_BYTES(long4, 16) +__CUB_ALIGN_BYTES(ulong4, 16) +__CUB_ALIGN_BYTES(longlong2, 16) +__CUB_ALIGN_BYTES(ulonglong2, 16) +__CUB_ALIGN_BYTES(double2, 16) +__CUB_ALIGN_BYTES(longlong4, 16) +__CUB_ALIGN_BYTES(ulonglong4, 16) +__CUB_ALIGN_BYTES(double4, 16) + +template struct AlignBytes : AlignBytes {}; +template struct AlignBytes : AlignBytes {}; +template struct AlignBytes : AlignBytes {}; /// Unit-words of data movement @@ -287,6 +434,12 @@ struct UnitWord typedef unsigned short TextureWord; }; + +template struct UnitWord : UnitWord {}; +template struct UnitWord : UnitWord {}; +template struct UnitWord : UnitWord {}; + + #endif // DOXYGEN_SHOULD_SKIP_THIS @@ -500,79 +653,114 @@ struct Uninitialized /** - * \brief An item value paired with a corresponding offset + * \brief A key identifier paired with a corresponding value */ -template -struct ItemOffsetPair +template < + typename _Key, + typename _Value +#if defined(_WIN32) && !defined(_WIN64) + , bool KeyIsLT = (AlignBytes<_Key>::ALIGN_BYTES < AlignBytes<_Value>::ALIGN_BYTES) + , bool ValIsLT = (AlignBytes<_Value>::ALIGN_BYTES < AlignBytes<_Key>::ALIGN_BYTES) +#endif // #if defined(_WIN32) && !defined(_WIN64) + > +struct KeyValuePair { - typedef _T T; ///< Item data type - typedef _Offset Offset; ///< Integer offset data type + typedef _Key Key; ///< Key data type + typedef _Value Value; ///< Value data type -#if (CUB_PTX_ARCH == 0) - union - { - Offset offset; ///< Offset - typename UnitWord::DeviceWord align0; ///< Alignment/padding (for Win32 consistency between host/device) - }; -#else - Offset offset; ///< Offset -#endif + Key key; ///< Item key + Value value; ///< Item value + + /// Constructor + __host__ __device__ __forceinline__ + KeyValuePair() {} - T value; ///< Item value + /// Constructor + __host__ __device__ __forceinline__ + KeyValuePair(Key const& key, Value const& value) : key(key), value(value) {} /// Inequality operator - __host__ __device__ __forceinline__ bool operator !=(const ItemOffsetPair &b) + __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b) { - return (value != b.value) || (offset != b.offset); + return (value != b.value) || (key != b.key); } }; +#if defined(_WIN32) && !defined(_WIN64) /** - * \brief A key 
identifier paired with a corresponding value + * Win32 won't do 16B alignment. This can present two problems for + * should-be-16B-aligned (but actually 8B aligned) built-in and intrinsics members: + * 1) If a smaller-aligned item were to be listed first, the host compiler places the + * should-be-16B item at too early an offset (and disagrees with device compiler) + * 2) Or, if a smaller-aligned item lists second, the host compiler gets the size + * of the struct wrong (and disagrees with device compiler) + * + * So we put the larger-should-be-aligned item first, and explicitly pad the + * end of the struct */ -template -struct KeyValuePair + +/// Smaller key specialization +template +struct KeyValuePair { - typedef _Key Key; ///< Key data type - typedef _Value Value; ///< Value data type + typedef K Key; + typedef V Value; + + typedef char Pad[AlignBytes::ALIGN_BYTES - AlignBytes::ALIGN_BYTES]; - Value value; ///< Item value - Key key; ///< Item key + Value value; // Value has larger would-be alignment and goes first + Key key; + Pad pad; + + /// Constructor + __host__ __device__ __forceinline__ + KeyValuePair() {} + + /// Constructor + __host__ __device__ __forceinline__ + KeyValuePair(Key const& key, Value const& value) : key(key), value(value) {} /// Inequality operator __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b) { return (value != b.value) || (key != b.key); } - }; -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - -/** - * Workaround for inability for SM1.x compiler to properly zero-initialize POD structures when it's supposed to - */ -template -__host__ __device__ __forceinline__ T ZeroInitialize() +/// Smaller value specialization +template +struct KeyValuePair { -#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130) + typedef K Key; + typedef V Value; - typedef typename UnitWord::ShuffleWord ShuffleWord; - const int MULTIPLE = sizeof(T) / sizeof(ShuffleWord); - ShuffleWord words[MULTIPLE]; - #pragma unroll - for (int i = 0; i < MULTIPLE; ++i) - words[i] = 0; - return *reinterpret_cast(words); + typedef char Pad[AlignBytes::ALIGN_BYTES - AlignBytes::ALIGN_BYTES]; -#else + Key key; // Key has larger would-be alignment and goes first + Value value; + Pad pad; - return T(); + /// Constructor + __host__ __device__ __forceinline__ + KeyValuePair() {} -#endif -} + /// Constructor + __host__ __device__ __forceinline__ + KeyValuePair(Key const& key, Value const& value) : key(key), value(value) {} + + /// Inequality operator + __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b) + { + return (value != b.value) || (key != b.key); + } +}; + +#endif // #if defined(_WIN32) && !defined(_WIN64) + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document /** @@ -581,8 +769,12 @@ __host__ __device__ __forceinline__ T ZeroInitialize() template struct ArrayWrapper { - /// Static array of type \p T + + /// Statically-sized array of type \p T T array[COUNT]; + + /// Constructor + __host__ __device__ __forceinline__ ArrayWrapper() {} }; #endif // DOXYGEN_SHOULD_SKIP_THIS @@ -624,140 +816,12 @@ struct DoubleBuffer /// \brief Return pointer to the currently valid buffer __host__ __device__ __forceinline__ T* Current() { return d_buffers[selector]; } -}; - - - -/****************************************************************************** - * Static math - ******************************************************************************/ - -/** - * \brief Statically determine log2(N), rounded up. 
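/*
 * Worked example of the Win32 workaround above, assuming Key = int (4B
 * alignment) and Value = double2 (16B "true CUDA" alignment, but only 8B under
 * 32-bit MSVC): the smaller-key specialization stores
 *     double2 value;    // larger would-be alignment goes first
 *     int     key;
 *     char    pad[12];  // AlignBytes<double2> - AlignBytes<int>
 * giving an explicit 32-byte layout that host and device compilers agree on,
 * instead of relying on implicit padding they would compute differently.
 */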
- * - * For example: - * Log2<8>::VALUE // 3 - * Log2<3>::VALUE // 2 - */ -template -struct Log2 -{ - /// Static logarithm value - enum { VALUE = Log2> 1), COUNT + 1>::VALUE }; // Inductive case -}; - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document -template -struct Log2 -{ - enum {VALUE = (1 << (COUNT - 1) < N) ? // Base case - COUNT : - COUNT - 1 }; -}; -#endif // DOXYGEN_SHOULD_SKIP_THIS - - -/** - * \brief Statically determine if N is a power-of-two - */ -template -struct PowerOfTwo -{ - enum { VALUE = ((N & (N - 1)) == 0) }; -}; - - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - -/****************************************************************************** - * Pointer vs. iterator detection - ******************************************************************************/ - -/** - * \brief Pointer vs. iterator - */ -template -struct IsPointer -{ - enum { VALUE = 0 }; -}; - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - -template -struct IsPointer -{ - enum { VALUE = 1 }; -}; - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - - -/****************************************************************************** - * Qualifier detection - ******************************************************************************/ - -/** - * \brief Volatile modifier test - */ -template -struct IsVolatile -{ - enum { VALUE = 0 }; -}; -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + /// \brief Return pointer to the currently invalid buffer + __host__ __device__ __forceinline__ T* Alternate() { return d_buffers[selector ^ 1]; } -template -struct IsVolatile -{ - enum { VALUE = 1 }; }; -#endif // DOXYGEN_SHOULD_SKIP_THIS - - -/****************************************************************************** - * Qualifier removal - ******************************************************************************/ - -/** - * \brief Removes \p const and \p volatile qualifiers from type \p Tp. 
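/*
 * Sketch (not CUB code): typical ping-pong use of DoubleBuffer with the
 * Alternate() accessor introduced above, using the struct's public selector
 * member. The name PingPongPassExample is hypothetical.
 */
template <typename T>
__host__ __device__ void PingPongPassExample(DoubleBuffer<T> &buf)
{
    T *in  = buf.Current();     // valid data for this pass
    T *out = buf.Alternate();   // scratch output for this pass
    (void) in; (void) out;      // ... pass body reads in[], writes out[] ...
    buf.selector ^= 1;          // flip: the output becomes the new Current()
}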
- * - * For example: - * typename RemoveQualifiers::Type // int; - */ -template -struct RemoveQualifiers -{ - /// Type without \p const and \p volatile qualifiers - typedef Up Type; -}; - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - -template -struct RemoveQualifiers -{ - typedef Up Type; -}; - -template -struct RemoveQualifiers -{ - typedef Up Type; -}; - -template -struct RemoveQualifiers -{ - typedef Up Type; -}; - -#endif // DOXYGEN_SHOULD_SKIP_THIS - /****************************************************************************** @@ -798,12 +862,10 @@ struct EnableIf typedef T Type; }; -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document template struct EnableIf {}; -#endif // DOXYGEN_SHOULD_SKIP_THIS /****************************************************************************** @@ -817,25 +879,26 @@ template struct BinaryOpHasIdxParam { private: +/* template struct SFINAE1 {}; template struct SFINAE2 {}; template struct SFINAE3 {}; template struct SFINAE4 {}; - +*/ template struct SFINAE5 {}; template struct SFINAE6 {}; template struct SFINAE7 {}; template struct SFINAE8 {}; - +/* template static char Test(SFINAE1 *); template static char Test(SFINAE2 *); template static char Test(SFINAE3 *); template static char Test(SFINAE4 *); - - template static char Test(SFINAE5 *); - template static char Test(SFINAE6 *); - template static char Test(SFINAE7 *); - template static char Test(SFINAE8 *); +*/ + template __host__ __device__ static char Test(SFINAE5 *); + template __host__ __device__ static char Test(SFINAE6 *); + template __host__ __device__ static char Test(SFINAE7 *); + template __host__ __device__ static char Test(SFINAE8 *); template static int Test(...); @@ -845,7 +908,8 @@ public: static const bool HAS_PARAM = sizeof(Test(NULL)) == sizeof(char); }; -#endif // DOXYGEN_SHOULD_SKIP_THIS + + /****************************************************************************** * Simple type traits utilities. 
@@ -873,7 +937,7 @@ enum Category /** * \brief Basic type traits */ -template +template struct BaseTraits { /// Category @@ -885,18 +949,17 @@ struct BaseTraits }; }; -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document /** * Basic type traits (unsigned primitive specialization) */ -template -struct BaseTraits +template +struct BaseTraits { typedef _UnsignedBits UnsignedBits; static const Category CATEGORY = UNSIGNED_INTEGER; - static const UnsignedBits MIN_KEY = UnsignedBits(0); + static const UnsignedBits LOWEST_KEY = UnsignedBits(0); static const UnsignedBits MAX_KEY = UnsignedBits(-1); enum @@ -915,20 +978,32 @@ struct BaseTraits { return key; } + + static __host__ __device__ __forceinline__ T Max() + { + UnsignedBits retval = MAX_KEY; + return reinterpret_cast(retval); + } + + static __host__ __device__ __forceinline__ T Lowest() + { + UnsignedBits retval = LOWEST_KEY; + return reinterpret_cast(retval); + } }; /** * Basic type traits (signed primitive specialization) */ -template -struct BaseTraits +template +struct BaseTraits { typedef _UnsignedBits UnsignedBits; static const Category CATEGORY = SIGNED_INTEGER; static const UnsignedBits HIGH_BIT = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1); - static const UnsignedBits MIN_KEY = HIGH_BIT; + static const UnsignedBits LOWEST_KEY = HIGH_BIT; static const UnsignedBits MAX_KEY = UnsignedBits(-1) ^ HIGH_BIT; enum @@ -947,22 +1022,83 @@ struct BaseTraits return key ^ HIGH_BIT; }; + static __host__ __device__ __forceinline__ T Max() + { + UnsignedBits retval = MAX_KEY; + return reinterpret_cast(retval); + } + + static __host__ __device__ __forceinline__ T Lowest() + { + UnsignedBits retval = LOWEST_KEY; + return reinterpret_cast(retval); + } }; +template +struct FpLimits; + +template <> +struct FpLimits +{ + static __host__ __device__ __forceinline__ float Max() { + return FLT_MAX; + } + + static __host__ __device__ __forceinline__ float Lowest() { + return FLT_MAX * float(-1); + } +}; + +template <> +struct FpLimits +{ + static __host__ __device__ __forceinline__ double Max() { + return DBL_MAX; + } + + static __host__ __device__ __forceinline__ double Lowest() { + return DBL_MAX * double(-1); + } +}; + + +#if (__CUDACC_VER_MAJOR__ >= 9) +template <> +struct FpLimits<__half> +{ + static __host__ __device__ __forceinline__ __half Max() { + unsigned short max_word = 0x7BFF; + return reinterpret_cast<__half&>(max_word); + } + + static __host__ __device__ __forceinline__ __half Lowest() { + unsigned short lowest_word = 0xFBFF; + return reinterpret_cast<__half&>(lowest_word); + } +}; +#endif + /** * Basic type traits (fp primitive specialization) */ -template -struct BaseTraits +template +struct BaseTraits { typedef _UnsignedBits UnsignedBits; static const Category CATEGORY = FLOATING_POINT; static const UnsignedBits HIGH_BIT = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1); - static const UnsignedBits MIN_KEY = UnsignedBits(-1); + static const UnsignedBits LOWEST_KEY = UnsignedBits(-1); static const UnsignedBits MAX_KEY = UnsignedBits(-1) ^ HIGH_BIT; + enum + { + PRIMITIVE = true, + NULL_TYPE = false, + }; + static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key) { UnsignedBits mask = (key & HIGH_BIT) ? 
UnsignedBits(-1) : HIGH_BIT; @@ -975,42 +1111,44 @@ struct BaseTraits return key ^ mask; }; - enum - { - PRIMITIVE = true, - NULL_TYPE = false, - }; -}; + static __host__ __device__ __forceinline__ T Max() { + return FpLimits::Max(); + } -#endif // DOXYGEN_SHOULD_SKIP_THIS + static __host__ __device__ __forceinline__ T Lowest() { + return FpLimits::Lowest(); + } +}; /** * \brief Numeric type traits */ -template struct NumericTraits : BaseTraits {}; - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - -template <> struct NumericTraits : BaseTraits {}; - -template <> struct NumericTraits : BaseTraits<(std::numeric_limits::is_signed) ? SIGNED_INTEGER : UNSIGNED_INTEGER, true, false, unsigned char> {}; -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; - -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; +template struct NumericTraits : BaseTraits {}; + +template <> struct NumericTraits : BaseTraits {}; + +template <> struct NumericTraits : BaseTraits<(std::numeric_limits::is_signed) ? SIGNED_INTEGER : UNSIGNED_INTEGER, true, false, unsigned char, char> {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; + +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; + +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; +#if (__CUDACC_VER_MAJOR__ >= 9) + template <> struct NumericTraits<__half> : BaseTraits {}; +#endif -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits::VolatileWord, bool> {}; -#endif // DOXYGEN_SHOULD_SKIP_THIS /** @@ -1020,6 +1158,8 @@ template struct Traits : NumericTraits::Type> {}; +#endif // DOXYGEN_SHOULD_SKIP_THIS + /** @} */ // end group UtilModule diff --git a/SRC/cub/warp/specializations/warp_reduce_shfl.cuh b/SRC/cub/warp/specializations/warp_reduce_shfl.cuh index 2f11eab6..bbbf37e5 100644 --- a/SRC/cub/warp/specializations/warp_reduce_shfl.cuh +++ b/SRC/cub/warp/specializations/warp_reduce_shfl.cuh @@ -1,330 +1,541 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
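/*
 * Worked example of the TwiddleIn mappings above: each converts the native
 * comparison order into an ascending unsigned order for radix sorting.
 *   signed int: key ^ HIGH_BIT flips the sign bit, so
 *       INT_MIN 0x80000000 -> 0x00000000 (smallest)
 *       -1      0xffffffff -> 0x7fffffff
 *        0      0x00000000 -> 0x80000000
 *       INT_MAX 0x7fffffff -> 0xffffffff (largest)
 *   float: negative keys (sign bit set) are fully inverted, reversing their
 *       order; non-negative keys just gain the sign bit. E.g. -0.0f
 *       (0x80000000) -> 0x7fffffff and +0.0f (0x00000000) -> 0x80000000, so
 *       -0.0f sorts immediately below +0.0f.
 */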
- * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp. - */ - -#pragma once - -#include "../../thread/thread_operators.cuh" -#include "../../util_ptx.cuh" -#include "../../util_type.cuh" -#include "../../util_macro.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \brief WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp. - */ -template < - typename T, ///< Data type being reduced - int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp - int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective -struct WarpReduceShfl -{ - /****************************************************************************** - * Constants and type definitions - ******************************************************************************/ - - enum - { - /// Whether the logical warp size and the PTX warp size coincide - IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), - - /// The number of warp reduction steps - STEPS = Log2::VALUE, - - // The 5-bit SHFL mask for logically splitting warps into sub-segments - SHFL_MASK = (-1 << STEPS) & 31, - - // The 5-bit SFHL clamp - SHFL_CLAMP = LOGICAL_WARP_THREADS - 1, - - // The packed C argument (mask starts 8 bits up) - SHFL_C = (SHFL_MASK << 8) | SHFL_CLAMP, - }; - - - /// Shared memory storage layout type - typedef NullType TempStorage; - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - int lane_id; - - - /****************************************************************************** - * Construction - ******************************************************************************/ - - /// Constructor - __device__ __forceinline__ WarpReduceShfl( - TempStorage &temp_storage) - : - lane_id(IS_ARCH_WARP ? 
- LaneId() : - LaneId() % LOGICAL_WARP_THREADS) - {} - - - /****************************************************************************** - * Operation - ******************************************************************************/ - - /// Summation (single-SHFL) - template < - bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items - int FOLDED_ITEMS_PER_LANE> ///< Number of items folded into each lane - __device__ __forceinline__ T Sum( - T input, ///< [in] Calling thread's input - int folded_items_per_warp, ///< [in] Total number of valid items folded into each logical warp - Int2Type single_shfl) ///< [in] Marker type indicating whether only one SHFL instruction is required - { - unsigned int output = reinterpret_cast(input); - - // Iterate reduction steps - #pragma unroll - for (int STEP = 0; STEP < STEPS; STEP++) - { - const int OFFSET = 1 << STEP; - - if (ALL_LANES_VALID) - { - // Use predicate set from SHFL to guard against invalid peers - asm( - "{" - " .reg .u32 r0;" - " .reg .pred p;" - " shfl.down.b32 r0|p, %1, %2, %3;" - " @p add.u32 r0, r0, %4;" - " mov.u32 %0, r0;" - "}" - : "=r"(output) : "r"(output), "r"(OFFSET), "r"(SHFL_C), "r"(output)); - } - else - { - // Set range predicate to guard against invalid peers - asm( - "{" - " .reg .u32 r0;" - " .reg .pred p;" - " shfl.down.b32 r0, %1, %2, %3;" - " setp.lt.u32 p, %5, %6;" - " mov.u32 %0, %1;" - " @p add.u32 %0, %1, r0;" - "}" - : "=r"(output) : "r"(output), "r"(OFFSET), "r"(SHFL_C), "r"(output), "r"((lane_id + OFFSET) * FOLDED_ITEMS_PER_LANE), "r"(folded_items_per_warp)); - } - } - - return output; - } - - - /// Summation (multi-SHFL) - template < - bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items - int FOLDED_ITEMS_PER_LANE> ///< Number of items folded into each lane - __device__ __forceinline__ T Sum( - T input, ///< [in] Calling thread's input - int folded_items_per_warp, ///< [in] Total number of valid items folded into each logical warp - Int2Type single_shfl) ///< [in] Marker type indicating whether only one SHFL instruction is required - { - // Delegate to generic reduce - return Reduce(input, folded_items_per_warp, cub::Sum()); - } - - - /// Summation (float) - template < - bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items - int FOLDED_ITEMS_PER_LANE> ///< Number of items folded into each lane - __device__ __forceinline__ float Sum( - float input, ///< [in] Calling thread's input - int folded_items_per_warp) ///< [in] Total number of valid items folded into each logical warp - { - T output = input; - - // Iterate reduction steps - #pragma unroll - for (int STEP = 0; STEP < STEPS; STEP++) - { - const int OFFSET = 1 << STEP; - - if (ALL_LANES_VALID) - { - // Use predicate set from SHFL to guard against invalid peers - asm( - "{" - " .reg .f32 r0;" - " .reg .pred p;" - " shfl.down.b32 r0|p, %1, %2, %3;" - " @p add.f32 r0, r0, %4;" - " mov.f32 %0, r0;" - "}" - : "=f"(output) : "f"(output), "r"(OFFSET), "r"(SHFL_C), "f"(output)); - } - else - { - // Set range predicate to guard against invalid peers - asm( - "{" - " .reg .f32 r0;" - " .reg .pred p;" - " shfl.down.b32 r0, %1, %2, %3;" - " setp.lt.u32 p, %5, %6;" - " mov.f32 %0, %1;" - " @p add.f32 %0, %0, r0;" - "}" - : "=f"(output) : "f"(output), "r"(OFFSET), "r"(SHFL_C), "f"(output), "r"((lane_id + OFFSET) * FOLDED_ITEMS_PER_LANE), "r"(folded_items_per_warp)); - } - } - - return output; - } - - /// Summation (generic) - template < - 
bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items - int FOLDED_ITEMS_PER_LANE, ///< Number of items folded into each lane - typename _T> - __device__ __forceinline__ _T Sum( - _T input, ///< [in] Calling thread's input - int folded_items_per_warp) ///< [in] Total number of valid items folded into each logical warp - { - // Whether sharing can be done with a single SHFL instruction (vs multiple SFHL instructions) - Int2Type<(Traits<_T>::PRIMITIVE) && (sizeof(_T) <= sizeof(unsigned int))> single_shfl; - - return Sum(input, folded_items_per_warp, single_shfl); - } - - - /// Reduction - template < - bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items - int FOLDED_ITEMS_PER_LANE, ///< Number of items folded into each lane - typename ReductionOp> - __device__ __forceinline__ T Reduce( - T input, ///< [in] Calling thread's input - int folded_items_per_warp, ///< [in] Total number of valid items folded into each logical warp - ReductionOp reduction_op) ///< [in] Binary reduction operator - { - T output = input; - - // Iterate scan steps - #pragma unroll - for (int STEP = 0; STEP < STEPS; STEP++) - { - // Grab addend from peer - const int OFFSET = 1 << STEP; - - T temp = ShuffleDown(output, OFFSET); - - // Perform reduction op if from a valid peer - if (ALL_LANES_VALID) - { - if (lane_id < LOGICAL_WARP_THREADS - OFFSET) - output = reduction_op(output, temp); - } - else - { - if (((lane_id + OFFSET) * FOLDED_ITEMS_PER_LANE) < folded_items_per_warp) - output = reduction_op(output, temp); - } - } - - return output; - } - - - /// Segmented reduction - template < - bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail - typename Flag, - typename ReductionOp> - __device__ __forceinline__ T SegmentedReduce( - T input, ///< [in] Calling thread's input - Flag flag, ///< [in] Whether or not the current lane is a segment head/tail - ReductionOp reduction_op) ///< [in] Binary reduction operator - { - T output = input; - - // Get the start flags for each thread in the warp. - int warp_flags = __ballot(flag); - - if (!HEAD_SEGMENTED) - warp_flags <<= 1; - - // Keep bits above the current thread. - warp_flags &= LaneMaskGt(); - - // Accommodate packing of multiple logical warps in a single physical warp - if (!IS_ARCH_WARP) - { - warp_flags >>= (LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS; - } - - // Find next flag - int next_flag = __clz(__brev(warp_flags)); - - // Clip the next segment at the warp boundary if necessary - if (LOGICAL_WARP_THREADS != 32) - next_flag = CUB_MIN(next_flag, LOGICAL_WARP_THREADS); - - // Iterate scan steps - #pragma unroll - for (int STEP = 0; STEP < STEPS; STEP++) - { - // Grab addend from peer - const int OFFSET = 1 << STEP; - - T temp = ShuffleDown(output, OFFSET); - - // Perform reduction op if valid - if (OFFSET < next_flag - lane_id) - output = reduction_op(output, temp); - } - - return output; - } -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp. + */ + +#pragma once + +#include "../../thread/thread_operators.cuh" +#include "../../util_ptx.cuh" +#include "../../util_type.cuh" +#include "../../util_macro.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp. 
+ *
+ * LOGICAL_WARP_THREADS must be a power-of-two
+ */
+template <
+    typename    T,                      ///< Data type being reduced
+    int         LOGICAL_WARP_THREADS,   ///< Number of threads per logical warp
+    int         PTX_ARCH>               ///< The PTX compute capability for which to specialize this collective
+struct WarpReduceShfl
+{
+    //---------------------------------------------------------------------
+    // Constants and type definitions
+    //---------------------------------------------------------------------
+
+    enum
+    {
+        /// Whether the logical warp size and the PTX warp size coincide
+        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
+
+        /// The number of warp reduction steps
+        STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,
+
+        /// Number of logical warps in a PTX warp
+        LOGICAL_WARPS = CUB_WARP_THREADS(PTX_ARCH) / LOGICAL_WARP_THREADS,
+
+        /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up
+        SHFL_C = (CUB_WARP_THREADS(PTX_ARCH) - LOGICAL_WARP_THREADS) << 8
+
+    };
+
+    template <typename S>
+    struct IsInteger
+    {
+        enum {
+            /// Whether the data type is a small (32b or less) integer for which we can use a single SHFL instruction per exchange
+            IS_SMALL_UNSIGNED = (Traits<S>::CATEGORY == UNSIGNED_INTEGER) && (sizeof(S) <= sizeof(unsigned int))
+        };
+    };
+
+
+    /// Shared memory storage layout type
+    typedef NullType TempStorage;
+
+
+    //---------------------------------------------------------------------
+    // Thread fields
+    //---------------------------------------------------------------------
+
+    /// Lane index in logical warp
+    unsigned int lane_id;
+
+    /// Logical warp index in 32-thread physical warp
+    unsigned int warp_id;
+
+    /// 32-thread physical warp member mask of logical warp
+    unsigned int member_mask;
+
+
+    //---------------------------------------------------------------------
+    // Construction
+    //---------------------------------------------------------------------
+
+    /// Constructor
+    __device__ __forceinline__ WarpReduceShfl(
+        TempStorage &/*temp_storage*/)
+    {
+        lane_id = LaneId();
+        warp_id = 0;
+        member_mask = 0xffffffffu >> (CUB_WARP_THREADS(PTX_ARCH) - LOGICAL_WARP_THREADS);
+
+        if (!IS_ARCH_WARP)
+        {
+            warp_id = lane_id / LOGICAL_WARP_THREADS;
+            lane_id = lane_id % LOGICAL_WARP_THREADS;
+            member_mask = member_mask << (warp_id * LOGICAL_WARP_THREADS);
+        }
+    }
+
+
+    //---------------------------------------------------------------------
+    // Reduction steps
+    //---------------------------------------------------------------------
+
+    /// Reduction (specialized for summation across uint32 types)
+    __device__ __forceinline__ unsigned int ReduceStep(
+        unsigned int    input,              ///< [in] Calling thread's input item.
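/*
 * Worked example of the constructor math above, assuming a 32-thread PTX warp
 * and LOGICAL_WARP_THREADS = 8: member_mask starts as 0xffffffffu >> 24 =
 * 0x000000ff. A thread at physical lane 19 gets warp_id = 2, lane_id = 3, and
 * member_mask = 0x000000ff << 16 = 0x00ff0000 (lanes 16..23). SHFL_C =
 * (32 - 8) << 8 = 0x1800 pins lane-id bits 3..4 so shuffles stay inside the
 * 8-lane segment.
 */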
+ cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset) ///< [in] Up-offset to pull from + { + unsigned int output; + int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .u32 r0;" + " .reg .pred p;" + " shfl.sync.down.b32 r0|p, %1, %2, %3, %5;" + " @p add.u32 r0, r0, %4;" + " mov.u32 %0, r0;" + "}" + : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .u32 r0;" + " .reg .pred p;" + " shfl.down.b32 r0|p, %1, %2, %3;" + " @p add.u32 r0, r0, %4;" + " mov.u32 %0, r0;" + "}" + : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input)); +#endif + + return output; + } + + + /// Reduction (specialized for summation across fp32 types) + __device__ __forceinline__ float ReduceStep( + float input, ///< [in] Calling thread's input item. + cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset) ///< [in] Up-offset to pull from + { + float output; + int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .f32 r0;" + " .reg .pred p;" + " shfl.sync.down.b32 r0|p, %1, %2, %3, %5;" + " @p add.f32 r0, r0, %4;" + " mov.f32 %0, r0;" + "}" + : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .f32 r0;" + " .reg .pred p;" + " shfl.down.b32 r0|p, %1, %2, %3;" + " @p add.f32 r0, r0, %4;" + " mov.f32 %0, r0;" + "}" + : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input)); +#endif + + return output; + } + + + /// Reduction (specialized for summation across unsigned long long types) + __device__ __forceinline__ unsigned long long ReduceStep( + unsigned long long input, ///< [in] Calling thread's input item. + cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset) ///< [in] Up-offset to pull from + { + unsigned long long output; + int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane) + +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.sync.down.b32 lo|p, lo, %2, %3, %4;" + " shfl.sync.down.b32 hi|p, hi, %2, %3, %4;" + " mov.b64 %0, {lo, hi};" + " @p add.u64 %0, %0, %1;" + "}" + : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.down.b32 lo|p, lo, %2, %3;" + " shfl.down.b32 hi|p, hi, %2, %3;" + " mov.b64 %0, {lo, hi};" + " @p add.u64 %0, %0, %1;" + "}" + : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c)); +#endif + + return output; + } + + + /// Reduction (specialized for summation across long long types) + __device__ __forceinline__ long long ReduceStep( + long long input, ///< [in] Calling thread's input item. 
+ cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset) ///< [in] Up-offset to pull from + { + long long output; + int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.sync.down.b32 lo|p, lo, %2, %3, %4;" + " shfl.sync.down.b32 hi|p, hi, %2, %3, %4;" + " mov.b64 %0, {lo, hi};" + " @p add.s64 %0, %0, %1;" + "}" + : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.down.b32 lo|p, lo, %2, %3;" + " shfl.down.b32 hi|p, hi, %2, %3;" + " mov.b64 %0, {lo, hi};" + " @p add.s64 %0, %0, %1;" + "}" + : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c)); +#endif + + return output; + } + + + /// Reduction (specialized for summation across double types) + __device__ __forceinline__ double ReduceStep( + double input, ///< [in] Calling thread's input item. + cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset) ///< [in] Up-offset to pull from + { + double output; + int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " .reg .f64 r0;" + " mov.b64 %0, %1;" + " mov.b64 {lo, hi}, %1;" + " shfl.sync.down.b32 lo|p, lo, %2, %3, %4;" + " shfl.sync.down.b32 hi|p, hi, %2, %3, %4;" + " mov.b64 r0, {lo, hi};" + " @p add.f64 %0, %0, r0;" + "}" + : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " .reg .f64 r0;" + " mov.b64 %0, %1;" + " mov.b64 {lo, hi}, %1;" + " shfl.down.b32 lo|p, lo, %2, %3;" + " shfl.down.b32 hi|p, hi, %2, %3;" + " mov.b64 r0, {lo, hi};" + " @p add.f64 %0, %0, r0;" + "}" + : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c)); +#endif + + return output; + } + + + /// Reduction (specialized for swizzled ReduceByKeyOp across KeyValuePair types) + template + __device__ __forceinline__ KeyValuePair ReduceStep( + KeyValuePair input, ///< [in] Calling thread's input item. + SwizzleScanOp > /*reduction_op*/, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset) ///< [in] Up-offset to pull from + { + KeyValuePair output; + + KeyT other_key = ShuffleDown(input.key, offset, last_lane, member_mask); + + output.key = input.key; + output.value = ReduceStep( + input.value, + cub::Sum(), + last_lane, + offset, + Int2Type::IS_SMALL_UNSIGNED>()); + + if (input.key != other_key) + output.value = input.value; + + return output; + } + + + + /// Reduction (specialized for swizzled ReduceBySegmentOp across KeyValuePair types) + template + __device__ __forceinline__ KeyValuePair ReduceStep( + KeyValuePair input, ///< [in] Calling thread's input item. 
+ SwizzleScanOp<ReduceBySegmentOp<cub::Sum> > /*reduction_op*/, ///< [in] Binary reduction operator
+ int last_lane, ///< [in] Index of last lane in segment
+ int offset) ///< [in] Up-offset to pull from
+ {
+ KeyValuePair<OffsetT, ValueT> output;
+
+ output.value = ReduceStep(input.value, cub::Sum(), last_lane, offset, Int2Type<IsInteger<ValueT>::IS_SMALL_UNSIGNED>());
+ output.key = ReduceStep(input.key, cub::Sum(), last_lane, offset, Int2Type<IsInteger<OffsetT>::IS_SMALL_UNSIGNED>());
+
+ if (input.key > 0)
+ output.value = input.value;
+
+ return output;
+ }
+
+
+ /// Reduction step (generic)
+ template <typename _T, typename ReductionOp>
+ __device__ __forceinline__ _T ReduceStep(
+ _T input, ///< [in] Calling thread's input item.
+ ReductionOp reduction_op, ///< [in] Binary reduction operator
+ int last_lane, ///< [in] Index of last lane in segment
+ int offset) ///< [in] Up-offset to pull from
+ {
+ _T output = input;
+
+ _T temp = ShuffleDown<LOGICAL_WARP_THREADS>(output, offset, last_lane, member_mask);
+
+ // Perform reduction op if valid
+ if (offset + lane_id <= last_lane)
+ output = reduction_op(input, temp);
+
+ return output;
+ }
+
+
+ /// Reduction step (specialized for small unsigned integers size 32b or less)
+ template <typename _T, typename ReductionOp>
+ __device__ __forceinline__ _T ReduceStep(
+ _T input, ///< [in] Calling thread's input item.
+ ReductionOp reduction_op, ///< [in] Binary reduction operator
+ int last_lane, ///< [in] Index of last lane in segment
+ int offset, ///< [in] Up-offset to pull from
+ Int2Type<true> /*is_small_unsigned*/) ///< [in] Marker type indicating whether T is a small unsigned integer
+ {
+ return ReduceStep(input, reduction_op, last_lane, offset);
+ }
+
+
+ /// Reduction step (specialized for types other than small unsigned integers size 32b or less)
+ template <typename _T, typename ReductionOp>
+ __device__ __forceinline__ _T ReduceStep(
+ _T input, ///< [in] Calling thread's input item.
+ ReductionOp reduction_op, ///< [in] Binary reduction operator
+ int last_lane, ///< [in] Index of last lane in segment
+ int offset, ///< [in] Up-offset to pull from
+ Int2Type<false> /*is_small_unsigned*/) ///< [in] Marker type indicating whether T is a small unsigned integer
+ {
+ return ReduceStep(input, reduction_op, last_lane, offset);
+ }
+
+
+ //---------------------------------------------------------------------
+ // Templated inclusive scan iteration
+ //---------------------------------------------------------------------
+
+ template <typename ReductionOp, int STEP>
+ __device__ __forceinline__ void ReduceStep(
+ T& input, ///< [in] Calling thread's input item.
+ ReductionOp reduction_op, ///< [in] Binary reduction operator
+ int last_lane, ///< [in] Index of last lane in segment
+ Int2Type<STEP> /*step*/)
+ {
+ input = ReduceStep(input, reduction_op, last_lane, 1 << STEP, Int2Type<IsInteger<T>::IS_SMALL_UNSIGNED>());
+
+ ReduceStep(input, reduction_op, last_lane, Int2Type<STEP + 1>());
+ }
+
+ template <typename ReductionOp>
+ __device__ __forceinline__ void ReduceStep(
+ T& /*input*/, ///< [in] Calling thread's input item.
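+ // Illustrative sketch (not part of the CUB sources) of the Int2Type
+ // recursion used above: each step is a distinct overload, so the "loop"
+ // unrolls entirely at compile time. A free-standing analogue for one full
+ // 32-thread warp (names here are hypothetical):
+ //
+ //     __device__ int FoldStep(int x, Int2Type<5>) { return x; } // 5 == Log2(32): stop
+ //
+ //     template <int STEP>
+ //     __device__ int FoldStep(int x, Int2Type<STEP>)
+ //     {
+ //         x += __shfl_down_sync(0xffffffffu, x, 1 << STEP); // pull from lane_id + 2^STEP
+ //         return FoldStep(x, Int2Type<STEP + 1>());
+ //     }
+ //
+ // FoldStep(x, Int2Type<0>()) then leaves the warp total on lane 0, mirroring
+ // the ReduceStep(..., Int2Type<STEP>) chain here, which terminates at Int2Type<STEPS>.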
+ ReductionOp /*reduction_op*/, ///< [in] Binary reduction operator + int /*last_lane*/, ///< [in] Index of last lane in segment + Int2Type /*step*/) + {} + + + //--------------------------------------------------------------------- + // Reduction operations + //--------------------------------------------------------------------- + + /// Reduction + template < + bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items + typename ReductionOp> + __device__ __forceinline__ T Reduce( + T input, ///< [in] Calling thread's input + int valid_items, ///< [in] Total number of valid items across the logical warp + ReductionOp reduction_op) ///< [in] Binary reduction operator + { + int last_lane = (ALL_LANES_VALID) ? + LOGICAL_WARP_THREADS - 1 : + valid_items - 1; + + T output = input; + +// // Iterate reduction steps +// #pragma unroll +// for (int STEP = 0; STEP < STEPS; STEP++) +// { +// output = ReduceStep(output, reduction_op, last_lane, 1 << STEP, Int2Type::IS_SMALL_UNSIGNED>()); +// } + + // Template-iterate reduction steps + ReduceStep(output, reduction_op, last_lane, Int2Type<0>()); + + return output; + } + + + /// Segmented reduction + template < + bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail + typename FlagT, + typename ReductionOp> + __device__ __forceinline__ T SegmentedReduce( + T input, ///< [in] Calling thread's input + FlagT flag, ///< [in] Whether or not the current lane is a segment head/tail + ReductionOp reduction_op) ///< [in] Binary reduction operator + { + // Get the start flags for each thread in the warp. + int warp_flags = WARP_BALLOT(flag, member_mask); + + // Convert to tail-segmented + if (HEAD_SEGMENTED) + warp_flags >>= 1; + + // Mask out the bits below the current thread + warp_flags &= LaneMaskGe(); + + // Mask of physical lanes outside the logical warp and convert to logical lanemask + if (!IS_ARCH_WARP) + { + warp_flags = (warp_flags & member_mask) >> (warp_id * LOGICAL_WARP_THREADS); + } + + // Mask in the last lane of logical warp + warp_flags |= 1u << (LOGICAL_WARP_THREADS - 1); + + // Find the next set flag + int last_lane = __clz(__brev(warp_flags)); + + T output = input; + +// // Iterate reduction steps +// #pragma unroll +// for (int STEP = 0; STEP < STEPS; STEP++) +// { +// output = ReduceStep(output, reduction_op, last_lane, 1 << STEP, Int2Type::IS_SMALL_UNSIGNED>()); +// } + + // Template-iterate reduction steps + ReduceStep(output, reduction_op, last_lane, Int2Type<0>()); + + return output; + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/warp/specializations/warp_reduce_smem.cuh b/SRC/cub/warp/specializations/warp_reduce_smem.cuh index 78d3ea23..7baa573b 100644 --- a/SRC/cub/warp/specializations/warp_reduce_smem.cuh +++ b/SRC/cub/warp/specializations/warp_reduce_smem.cuh @@ -1,358 +1,372 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. 
- * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned across a CUDA thread warp. - */ - -#pragma once - -#include "../../thread/thread_operators.cuh" -#include "../../thread/thread_load.cuh" -#include "../../thread/thread_store.cuh" -#include "../../util_type.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \brief WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned across a CUDA thread warp. 
- */ -template < - typename T, ///< Data type being reduced - int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp - int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective -struct WarpReduceSmem -{ - /****************************************************************************** - * Constants and type definitions - ******************************************************************************/ - - enum - { - /// Whether the logical warp size and the PTX warp size coincide - IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), - - /// Whether the logical warp size is a power-of-two - IS_POW_OF_TWO = ((LOGICAL_WARP_THREADS & (LOGICAL_WARP_THREADS - 1)) == 0), - - /// The number of warp scan steps - STEPS = Log2::VALUE, - - /// The number of threads in half a warp - HALF_WARP_THREADS = 1 << (STEPS - 1), - - /// The number of shared memory elements per warp - WARP_SMEM_ELEMENTS = LOGICAL_WARP_THREADS + HALF_WARP_THREADS, - - /// Flag status (when not using ballot) - UNSET = 0x0, // Is initially unset - SET = 0x1, // Is initially set - SEEN = 0x2, // Has seen another head flag from a successor peer - }; - - /// Shared memory flag type - typedef unsigned char SmemFlag; - - /// Shared memory storage layout type (1.5 warps-worth of elements for each warp) - struct _TempStorage - { - T reduce[WARP_SMEM_ELEMENTS]; - SmemFlag flags[WARP_SMEM_ELEMENTS]; - }; - - // Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - _TempStorage &temp_storage; - int lane_id; - - - /****************************************************************************** - * Construction - ******************************************************************************/ - - /// Constructor - __device__ __forceinline__ WarpReduceSmem( - TempStorage &temp_storage) - : - temp_storage(temp_storage.Alias()), - lane_id(IS_ARCH_WARP ? 
- LaneId() : - LaneId() % LOGICAL_WARP_THREADS) - {} - - - /****************************************************************************** - * Operation - ******************************************************************************/ - - /** - * Reduction step - */ - template < - bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items - int FOLDED_ITEMS_PER_LANE, ///< Number of items folded into each lane - typename ReductionOp, - int STEP> - __device__ __forceinline__ T ReduceStep( - T input, ///< [in] Calling thread's input - int folded_items_per_warp, ///< [in] Total number of valid items folded into each logical warp - ReductionOp reduction_op, ///< [in] Reduction operator - Int2Type step) - { - const int OFFSET = 1 << STEP; - - // Share input through buffer - ThreadStore(&temp_storage.reduce[lane_id], input); - - // Update input if peer_addend is in range - if ((ALL_LANES_VALID && IS_POW_OF_TWO) || ((lane_id + OFFSET) * FOLDED_ITEMS_PER_LANE < folded_items_per_warp)) - { - T peer_addend = ThreadLoad(&temp_storage.reduce[lane_id + OFFSET]); - input = reduction_op(input, peer_addend); - } - - return ReduceStep(input, folded_items_per_warp, reduction_op, Int2Type()); - } - - - /** - * Reduction step (terminate) - */ - template < - bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items - int FOLDED_ITEMS_PER_LANE, ///< Number of items folded into each lane - typename ReductionOp> - __device__ __forceinline__ T ReduceStep( - T input, ///< [in] Calling thread's input - int folded_items_per_warp, ///< [in] Total number of valid items folded into each logical warp - ReductionOp reduction_op, ///< [in] Reduction operator - Int2Type step) - { - return input; - } - - - /** - * Reduction - */ - template < - bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items - int FOLDED_ITEMS_PER_LANE, ///< Number of items folded into each lane - typename ReductionOp> - __device__ __forceinline__ T Reduce( - T input, ///< [in] Calling thread's input - int folded_items_per_warp, ///< [in] Total number of valid items folded into each logical warp - ReductionOp reduction_op) ///< [in] Reduction operator - { - return ReduceStep(input, folded_items_per_warp, reduction_op, Int2Type<0>()); - } - - - /** - * Ballot-based segmented reduce - */ - template < - bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail - typename Flag, - typename ReductionOp> - __device__ __forceinline__ T SegmentedReduce( - T input, ///< [in] Calling thread's input - Flag flag, ///< [in] Whether or not the current lane is a segment head/tail - ReductionOp reduction_op, ///< [in] Reduction operator - Int2Type has_ballot) ///< [in] Marker type for whether the target arch has ballot functionality - { - // Get the start flags for each thread in the warp. - int warp_flags = __ballot(flag); - - if (!HEAD_SEGMENTED) - warp_flags <<= 1; - - // Keep bits above the current thread. 
- warp_flags &= LaneMaskGt(); - - // Accommodate packing of multiple logical warps in a single physical warp - if (!IS_ARCH_WARP) - { - warp_flags >>= (LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS; - } - - // Find next flag - int next_flag = __clz(__brev(warp_flags)); - - // Clip the next segment at the warp boundary if necessary - if (LOGICAL_WARP_THREADS != 32) - next_flag = CUB_MIN(next_flag, LOGICAL_WARP_THREADS); - - #pragma unroll - for (int STEP = 0; STEP < STEPS; STEP++) - { - const int OFFSET = 1 << STEP; - - // Share input into buffer - ThreadStore(&temp_storage.reduce[lane_id], input); - - // Update input if peer_addend is in range - if (OFFSET < next_flag - lane_id) - { - T peer_addend = ThreadLoad(&temp_storage.reduce[lane_id + OFFSET]); - input = reduction_op(input, peer_addend); - } - } - - return input; - } - - - /** - * Smem-based segmented reduce - */ - template < - bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail - typename Flag, - typename ReductionOp> - __device__ __forceinline__ T SegmentedReduce( - T input, ///< [in] Calling thread's input - Flag flag, ///< [in] Whether or not the current lane is a segment head/tail - ReductionOp reduction_op, ///< [in] Reduction operator - Int2Type has_ballot) ///< [in] Marker type for whether the target arch has ballot functionality - { - enum - { - UNSET = 0x0, // Is initially unset - SET = 0x1, // Is initially set - SEEN = 0x2, // Has seen another head flag from a successor peer - }; - - // Alias flags onto shared data storage - volatile SmemFlag *flag_storage = temp_storage.flags; - - SmemFlag flag_status = (flag) ? SET : UNSET; - - for (int STEP = 0; STEP < STEPS; STEP++) - { - const int OFFSET = 1 << STEP; - - // Share input through buffer - ThreadStore(&temp_storage.reduce[lane_id], input); - - // Get peer from buffer - T peer_addend = ThreadLoad(&temp_storage.reduce[lane_id + OFFSET]); - - // Share flag through buffer - flag_storage[lane_id] = flag_status; - - // Get peer flag from buffer - SmemFlag peer_flag_status = flag_storage[lane_id + OFFSET]; - - // Update input if peer was in range - if (lane_id < LOGICAL_WARP_THREADS - OFFSET) - { - if (HEAD_SEGMENTED) - { - // Head-segmented - if ((flag_status & SEEN) == 0) - { - // Has not seen a more distant head flag - if (peer_flag_status & SET) - { - // Has now seen a head flag - flag_status |= SEEN; - } - else - { - // Peer is not a head flag: grab its count - input = reduction_op(input, peer_addend); - } - - // Update seen status to include that of peer - flag_status |= (peer_flag_status & SEEN); - } - } - else - { - // Tail-segmented. 
Simply propagate flag status - if (!flag_status) - { - input = reduction_op(input, peer_addend); - flag_status |= peer_flag_status; - } - - } - } - } - - return input; - } - - - /** - * Segmented reduction - */ - template < - bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail - typename Flag, - typename ReductionOp> - __device__ __forceinline__ T SegmentedReduce( - T input, ///< [in] Calling thread's input - Flag flag, ///< [in] Whether or not the current lane is a segment head/tail - ReductionOp reduction_op) ///< [in] Reduction operator - { - return SegmentedReduce(input, flag, reduction_op, Int2Type<(PTX_ARCH >= 200)>()); - } - - - /** - * Summation - */ - template < - bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items - int FOLDED_ITEMS_PER_LANE> ///< Number of items folded into each lane - __device__ __forceinline__ T Sum( - T input, ///< [in] Calling thread's input - int folded_items_per_warp) ///< [in] Total number of valid items folded into each logical warp - { - return Reduce(input, folded_items_per_warp, cub::Sum()); - } - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned across a CUDA thread warp. 
+ */
+
+#pragma once
+
+#include "../../thread/thread_operators.cuh"
+#include "../../thread/thread_load.cuh"
+#include "../../thread/thread_store.cuh"
+#include "../../util_type.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned across a CUDA thread warp.
+ */
+template <
+ typename T, ///< Data type being reduced
+ int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp
+ int PTX_ARCH> ///< The PTX compute capability for which to specialize this collective
+struct WarpReduceSmem
+{
+ /******************************************************************************
+ * Constants and type definitions
+ ******************************************************************************/
+
+ enum
+ {
+ /// Whether the logical warp size and the PTX warp size coincide
+ IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
+
+ /// Whether the logical warp size is a power-of-two
+ IS_POW_OF_TWO = PowerOfTwo<LOGICAL_WARP_THREADS>::VALUE,
+
+ /// The number of warp scan steps
+ STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,
+
+ /// The number of threads in half a warp
+ HALF_WARP_THREADS = 1 << (STEPS - 1),
+
+ /// The number of shared memory elements per warp
+ WARP_SMEM_ELEMENTS = LOGICAL_WARP_THREADS + HALF_WARP_THREADS,
+
+ /// FlagT status (when not using ballot)
+ UNSET = 0x0, // Is initially unset
+ SET = 0x1, // Is initially set
+ SEEN = 0x2, // Has seen another head flag from a successor peer
+ };
+
+ /// Shared memory flag type
+ typedef unsigned char SmemFlag;
+
+ /// Shared memory storage layout type (1.5 warps-worth of elements for each warp)
+ struct _TempStorage
+ {
+ T reduce[WARP_SMEM_ELEMENTS];
+ SmemFlag flags[WARP_SMEM_ELEMENTS];
+ };
+
+ // Alias wrapper allowing storage to be unioned
+ struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+ /******************************************************************************
+ * Thread fields
+ ******************************************************************************/
+
+ _TempStorage &temp_storage;
+ unsigned int lane_id;
+ unsigned int member_mask;
+
+
+ /******************************************************************************
+ * Construction
+ ******************************************************************************/
+
+ /// Constructor
+ __device__ __forceinline__ WarpReduceSmem(
+ TempStorage &temp_storage)
+ :
+ temp_storage(temp_storage.Alias()),
+
+ lane_id(IS_ARCH_WARP ?
+ LaneId() :
+ LaneId() % LOGICAL_WARP_THREADS),
+
+ member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP || !IS_POW_OF_TWO ) ?
+ 0 : // arch-width and non-power-of-two subwarps cannot be tiled with the arch-warp + ((LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS))) + {} + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + //--------------------------------------------------------------------- + // Regular reduction + //--------------------------------------------------------------------- + + /** + * Reduction step + */ + template < + bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items + typename ReductionOp, + int STEP> + __device__ __forceinline__ T ReduceStep( + T input, ///< [in] Calling thread's input + int valid_items, ///< [in] Total number of valid items across the logical warp + ReductionOp reduction_op, ///< [in] Reduction operator + Int2Type /*step*/) + { + const int OFFSET = 1 << STEP; + + // Share input through buffer + ThreadStore(&temp_storage.reduce[lane_id], input); + + WARP_SYNC(member_mask); + + // Update input if peer_addend is in range + if ((ALL_LANES_VALID && IS_POW_OF_TWO) || ((lane_id + OFFSET) < valid_items)) + { + T peer_addend = ThreadLoad(&temp_storage.reduce[lane_id + OFFSET]); + input = reduction_op(input, peer_addend); + } + + WARP_SYNC(member_mask); + + return ReduceStep(input, valid_items, reduction_op, Int2Type()); + } + + + /** + * Reduction step (terminate) + */ + template < + bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items + typename ReductionOp> + __device__ __forceinline__ T ReduceStep( + T input, ///< [in] Calling thread's input + int valid_items, ///< [in] Total number of valid items across the logical warp + ReductionOp /*reduction_op*/, ///< [in] Reduction operator + Int2Type /*step*/) + { + return input; + } + + + //--------------------------------------------------------------------- + // Segmented reduction + //--------------------------------------------------------------------- + + + /** + * Ballot-based segmented reduce + */ + template < + bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail + typename FlagT, + typename ReductionOp> + __device__ __forceinline__ T SegmentedReduce( + T input, ///< [in] Calling thread's input + FlagT flag, ///< [in] Whether or not the current lane is a segment head/tail + ReductionOp reduction_op, ///< [in] Reduction operator + Int2Type /*has_ballot*/) ///< [in] Marker type for whether the target arch has ballot functionality + { + // Get the start flags for each thread in the warp. + int warp_flags = WARP_BALLOT(flag, member_mask); + + if (!HEAD_SEGMENTED) + warp_flags <<= 1; + + // Keep bits above the current thread. 
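+ // Worked example (illustrative, not in the original source): with head
+ // flags at lanes 0, 9, and 20, warp_flags is 0x00100201. For lane 4 the
+ // LaneMaskGt() mask below keeps bits 5..31, leaving 0x00100200, and
+ // __clz(__brev(...)) then returns 9 -- the position of the least-significant
+ // surviving bit. Lane 4 therefore pulls addends only from lanes 5, 6, and 8
+ // across the log-steps (OFFSET + lane_id < next_flag), so the fold never
+ // crosses the head flag that opens the next segment at lane 9.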
+ warp_flags &= LaneMaskGt(); + + // Accommodate packing of multiple logical warps in a single physical warp + if (!IS_ARCH_WARP) + { + warp_flags >>= (LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS; + } + + // Find next flag + int next_flag = __clz(__brev(warp_flags)); + + // Clip the next segment at the warp boundary if necessary + if (LOGICAL_WARP_THREADS != 32) + next_flag = CUB_MIN(next_flag, LOGICAL_WARP_THREADS); + + #pragma unroll + for (int STEP = 0; STEP < STEPS; STEP++) + { + const int OFFSET = 1 << STEP; + + // Share input into buffer + ThreadStore(&temp_storage.reduce[lane_id], input); + + WARP_SYNC(member_mask); + + // Update input if peer_addend is in range + if (OFFSET + lane_id < next_flag) + { + T peer_addend = ThreadLoad(&temp_storage.reduce[lane_id + OFFSET]); + input = reduction_op(input, peer_addend); + } + + WARP_SYNC(member_mask); + } + + return input; + } + + + /** + * Smem-based segmented reduce + */ + template < + bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail + typename FlagT, + typename ReductionOp> + __device__ __forceinline__ T SegmentedReduce( + T input, ///< [in] Calling thread's input + FlagT flag, ///< [in] Whether or not the current lane is a segment head/tail + ReductionOp reduction_op, ///< [in] Reduction operator + Int2Type /*has_ballot*/) ///< [in] Marker type for whether the target arch has ballot functionality + { + enum + { + UNSET = 0x0, // Is initially unset + SET = 0x1, // Is initially set + SEEN = 0x2, // Has seen another head flag from a successor peer + }; + + // Alias flags onto shared data storage + volatile SmemFlag *flag_storage = temp_storage.flags; + + SmemFlag flag_status = (flag) ? SET : UNSET; + + for (int STEP = 0; STEP < STEPS; STEP++) + { + const int OFFSET = 1 << STEP; + + // Share input through buffer + ThreadStore(&temp_storage.reduce[lane_id], input); + + WARP_SYNC(member_mask); + + // Get peer from buffer + T peer_addend = ThreadLoad(&temp_storage.reduce[lane_id + OFFSET]); + + WARP_SYNC(member_mask); + + // Share flag through buffer + flag_storage[lane_id] = flag_status; + + // Get peer flag from buffer + SmemFlag peer_flag_status = flag_storage[lane_id + OFFSET]; + + // Update input if peer was in range + if (lane_id < LOGICAL_WARP_THREADS - OFFSET) + { + if (HEAD_SEGMENTED) + { + // Head-segmented + if ((flag_status & SEEN) == 0) + { + // Has not seen a more distant head flag + if (peer_flag_status & SET) + { + // Has now seen a head flag + flag_status |= SEEN; + } + else + { + // Peer is not a head flag: grab its count + input = reduction_op(input, peer_addend); + } + + // Update seen status to include that of peer + flag_status |= (peer_flag_status & SEEN); + } + } + else + { + // Tail-segmented. 
Simply propagate flag status + if (!flag_status) + { + input = reduction_op(input, peer_addend); + flag_status |= peer_flag_status; + } + + } + } + } + + return input; + } + + + /****************************************************************************** + * Interface + ******************************************************************************/ + + /** + * Reduction + */ + template < + bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items + typename ReductionOp> + __device__ __forceinline__ T Reduce( + T input, ///< [in] Calling thread's input + int valid_items, ///< [in] Total number of valid items across the logical warp + ReductionOp reduction_op) ///< [in] Reduction operator + { + return ReduceStep(input, valid_items, reduction_op, Int2Type<0>()); + } + + + /** + * Segmented reduction + */ + template < + bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail + typename FlagT, + typename ReductionOp> + __device__ __forceinline__ T SegmentedReduce( + T input, ///< [in] Calling thread's input + FlagT flag, ///< [in] Whether or not the current lane is a segment head/tail + ReductionOp reduction_op) ///< [in] Reduction operator + { + return SegmentedReduce(input, flag, reduction_op, Int2Type<(PTX_ARCH >= 200)>()); + } + + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/warp/specializations/warp_scan_shfl.cuh b/SRC/cub/warp/specializations/warp_scan_shfl.cuh index 43482986..7f4e1c94 100644 --- a/SRC/cub/warp/specializations/warp_scan_shfl.cuh +++ b/SRC/cub/warp/specializations/warp_scan_shfl.cuh @@ -1,401 +1,632 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - ******************************************************************************/ - -/** - * \file - * cub::WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned across a CUDA thread warp. - */ - -#pragma once - -#include "../../thread/thread_operators.cuh" -#include "../../util_type.cuh" -#include "../../util_ptx.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \brief WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned across a CUDA thread warp. - */ -template < - typename T, ///< Data type being scanned - int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp - int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective -struct WarpScanShfl -{ - - /****************************************************************************** - * Constants and type definitions - ******************************************************************************/ - - enum - { - /// Whether the logical warp size and the PTX warp size coincide - IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), - - /// The number of warp scan steps - STEPS = Log2::VALUE, - - // The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up - SHFL_C = ((-1 << STEPS) & 31) << 8, - - // Whether the data type is a small (32b or less) integer for which we can use a single SFHL instruction per exchange - SMALL_INTEGER = ((Traits::CATEGORY == UNSIGNED_INTEGER) || (Traits::CATEGORY == SIGNED_INTEGER)) && (sizeof(T) <= sizeof(unsigned int)) - }; - - /// Shared memory storage layout type - typedef NullType TempStorage; - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - int lane_id; - - /****************************************************************************** - * Construction - ******************************************************************************/ - - /// Constructor - __device__ __forceinline__ WarpScanShfl( - TempStorage &temp_storage) - : - lane_id(IS_ARCH_WARP ? - LaneId() : - LaneId() % LOGICAL_WARP_THREADS) - {} - - - /****************************************************************************** - * Utility methods - ******************************************************************************/ - - /// Inclusive prefix scan (specialized for summation across primitive integer types 32b or smaller) - template - __device__ __forceinline__ void InclusiveScan( - _T input, ///< [in] Calling thread's input item. - _T &output, ///< [out] Calling thread's output item. May be aliased with \p input. 
- Sum scan_op, ///< [in] Binary scan operator - Int2Type is_small_integer) ///< [in] Marker type indicating whether T is a small integer - { - unsigned int temp = reinterpret_cast(input); - - // Iterate scan steps - #pragma unroll - for (int STEP = 0; STEP < STEPS; STEP++) - { - // Use predicate set from SHFL to guard against invalid peers - asm( - "{" - " .reg .u32 r0;" - " .reg .pred p;" - " shfl.up.b32 r0|p, %1, %2, %3;" - " @p add.u32 r0, r0, %4;" - " mov.u32 %0, r0;" - "}" - : "=r"(temp) : "r"(temp), "r"(1 << STEP), "r"(SHFL_C), "r"(temp)); - } - - output = reinterpret_cast<_T&>(temp); - } - - - /// Inclusive prefix scan (specialized for summation across float types) - __device__ __forceinline__ void InclusiveScan( - float input, ///< [in] Calling thread's input item. - float &output, ///< [out] Calling thread's output item. May be aliased with \p input. - Sum scan_op, ///< [in] Binary scan operator - Int2Type is_small_integer) ///< [in] Marker type indicating whether T is a small integer - { - output = input; - - // Iterate scan steps - #pragma unroll - for (int STEP = 0; STEP < STEPS; STEP++) - { - // Use predicate set from SHFL to guard against invalid peers - asm( - "{" - " .reg .f32 r0;" - " .reg .pred p;" - " shfl.up.b32 r0|p, %1, %2, %3;" - " @p add.f32 r0, r0, %4;" - " mov.f32 %0, r0;" - "}" - : "=f"(output) : "f"(output), "r"(1 << STEP), "r"(SHFL_C), "f"(output)); - } - } - - - /// Inclusive prefix scan (specialized for summation across unsigned long long types) - __device__ __forceinline__ void InclusiveScan( - unsigned long long input, ///< [in] Calling thread's input item. - unsigned long long &output, ///< [out] Calling thread's output item. May be aliased with \p input. - Sum scan_op, ///< [in] Binary scan operator - Int2Type is_small_integer) ///< [in] Marker type indicating whether T is a small integer - { - output = input; - - // Iterate scan steps - #pragma unroll - for (int STEP = 0; STEP < STEPS; STEP++) - { - // Use predicate set from SHFL to guard against invalid peers - asm( - "{" - " .reg .u32 lo;" - " .reg .u32 hi;" - " .reg .pred p;" - " mov.b64 {lo, hi}, %1;" - " shfl.up.b32 lo|p, lo, %2, %3;" - " shfl.up.b32 hi|p, hi, %2, %3;" - " mov.b64 %0, {lo, hi};" - " @p add.u64 %0, %0, %1;" - "}" - : "=l"(output) : "l"(output), "r"(1 << STEP), "r"(SHFL_C)); - } - } - - - /// Inclusive prefix scan (specialized for summation across long long types) - __device__ __forceinline__ void InclusiveScan( - long long input, ///< [in] Calling thread's input item. - long long &output, ///< [out] Calling thread's output item. May be aliased with \p input. - Sum scan_op, ///< [in] Binary scan operator - Int2Type is_small_integer) ///< [in] Marker type indicating whether T is a small integer - { - output = input; - - // Iterate scan steps - #pragma unroll - for (int STEP = 0; STEP < STEPS; STEP++) - { - // Use predicate set from SHFL to guard against invalid peers - asm( - "{" - " .reg .u32 lo;" - " .reg .u32 hi;" - " .reg .pred p;" - " mov.b64 {lo, hi}, %1;" - " shfl.up.b32 lo|p, lo, %2, %3;" - " shfl.up.b32 hi|p, hi, %2, %3;" - " mov.b64 %0, {lo, hi};" - " @p add.s64 %0, %0, %1;" - "}" - : "=l"(output) : "l"(output), "r"(1 << STEP), "r"(SHFL_C)); - } - } - - - /// Inclusive prefix scan (specialized for summation across double types) - __device__ __forceinline__ void InclusiveScan( - double input, ///< [in] Calling thread's input item. - double &output, ///< [out] Calling thread's output item. May be aliased with \p input. 
- Sum scan_op, ///< [in] Binary scan operator - Int2Type is_small_integer) ///< [in] Marker type indicating whether T is a small integer - { - output = input; - - // Iterate scan steps - #pragma unroll - for (int STEP = 0; STEP < STEPS; STEP++) - { - // Use predicate set from SHFL to guard against invalid peers - asm( - "{" - " .reg .u32 lo;" - " .reg .u32 hi;" - " .reg .pred p;" - " mov.b64 {lo, hi}, %1;" - " shfl.up.b32 lo|p, lo, %2, %3;" - " shfl.up.b32 hi|p, hi, %2, %3;" - " mov.b64 %0, {lo, hi};" - " @p add.f64 %0, %0, %1;" - "}" - : "=d"(output) : "d"(output), "r"(1 << STEP), "r"(SHFL_C)); - } - } - - - /// Inclusive prefix scan - template - __device__ __forceinline__ void InclusiveScan( - _T input, ///< [in] Calling thread's input item. - _T &output, ///< [out] Calling thread's output item. May be aliased with \p input. - ScanOp scan_op, ///< [in] Binary scan operator - Int2Type is_small_integer) ///< [in] Marker type indicating whether T is a small integer - { - output = input; - - // Iterate scan steps - #pragma unroll - for (int STEP = 0; STEP < STEPS; STEP++) - { - // Grab addend from peer - const int OFFSET = 1 << STEP; - T temp = ShuffleUp(output, OFFSET); - - // Perform scan op if from a valid peer - if (lane_id >= OFFSET) - output = scan_op(temp, output); - } - } - - - - /****************************************************************************** - * Interface - ******************************************************************************/ - - - /// Broadcast - __device__ __forceinline__ T Broadcast( - T input, ///< [in] The value to broadcast - int src_lane) ///< [in] Which warp lane is to do the broadcasting - { - return ShuffleBroadcast(input, src_lane, LOGICAL_WARP_THREADS); - } - - - //--------------------------------------------------------------------- - // Inclusive operations - //--------------------------------------------------------------------- - - /// Inclusive scan - template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item. - T &output, ///< [out] Calling thread's output item. May be aliased with \p input. - ScanOp scan_op) ///< [in] Binary scan operator - { - InclusiveScan(input, output, scan_op, Int2Type()); - } - - - /// Inclusive scan with aggregate - template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item. - T &output, ///< [out] Calling thread's output item. May be aliased with \p input. - ScanOp scan_op, ///< [in] Binary scan operator - T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. - { - InclusiveScan(input, output, scan_op); - - // Grab aggregate from last warp lane - warp_aggregate = Broadcast(output, LOGICAL_WARP_THREADS - 1); - } - - - //--------------------------------------------------------------------- - // Combo (inclusive & exclusive) operations - //--------------------------------------------------------------------- - - /// Combination scan without identity - template - __device__ __forceinline__ void Scan( - T input, ///< [in] Calling thread's input item. - T &inclusive_output, ///< [out] Calling thread's inclusive-scan output item. - T &exclusive_output, ///< [out] Calling thread's exclusive-scan output item. 
- ScanOp scan_op) ///< [in] Binary scan operator - { - // Compute inclusive scan - InclusiveScan(input, inclusive_output, scan_op); - - // Grab result from predecessor - exclusive_output = ShuffleUp(inclusive_output, 1); - } - - /// Combination scan with identity - template - __device__ __forceinline__ void Scan( - T input, ///< [in] Calling thread's input item. - T &inclusive_output, ///< [out] Calling thread's inclusive-scan output item. - T &exclusive_output, ///< [out] Calling thread's exclusive-scan output item. - T identity, ///< [in] Identity value - ScanOp scan_op) ///< [in] Binary scan operator - { - // Compute inclusive scan - InclusiveScan(input, inclusive_output, scan_op); - - // Grab result from predecessor - exclusive_output = ShuffleUp(inclusive_output, 1); - - exclusive_output = (lane_id == 0) ? - identity : - exclusive_output; - } - - - //--------------------------------------------------------------------- - // Exclusive operations - //--------------------------------------------------------------------- - - /// Exclusive scan with aggregate - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item. - T &output, ///< [out] Calling thread's output item. May be aliased with \p input. - T identity, ///< [in] Identity value - ScanOp scan_op, ///< [in] Binary scan operator - T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. - { - T inclusive_output; - Scan(input, inclusive_output, output, identity, scan_op); - - // Grab aggregate from last warp lane - warp_aggregate = Broadcast(inclusive_output, LOGICAL_WARP_THREADS - 1); - } - - - /// Exclusive scan with aggregate, without identity - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item. - T &output, ///< [out] Calling thread's output item. May be aliased with \p input. - ScanOp scan_op, ///< [in] Binary scan operator - T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. - { - T inclusive_output; - Scan(input, inclusive_output, output, scan_op); - - // Grab aggregate from last warp lane - warp_aggregate = Broadcast(inclusive_output, LOGICAL_WARP_THREADS - 1); - } - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned across a CUDA thread warp.
+ */
+
+#pragma once
+
+#include "../../thread/thread_operators.cuh"
+#include "../../util_type.cuh"
+#include "../../util_ptx.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned across a CUDA thread warp.
+ *
+ * LOGICAL_WARP_THREADS must be a power-of-two
+ */
+template <
+ typename T, ///< Data type being scanned
+ int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp
+ int PTX_ARCH> ///< The PTX compute capability for which to specialize this collective
+struct WarpScanShfl
+{
+ //---------------------------------------------------------------------
+ // Constants and type definitions
+ //---------------------------------------------------------------------
+
+ enum
+ {
+ /// Whether the logical warp size and the PTX warp size coincide
+ IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
+
+ /// The number of warp scan steps
+ STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,
+
+ /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up
+ SHFL_C = (CUB_WARP_THREADS(PTX_ARCH) - LOGICAL_WARP_THREADS) << 8
+ };
+
+ template <typename S>
+ struct IntegerTraits
+ {
+ enum {
+ /// Whether the data type is a small (32b or less) integer for which we can use a single SHFL instruction per exchange
+ IS_SMALL_UNSIGNED = (Traits<S>::CATEGORY == UNSIGNED_INTEGER) && (sizeof(S) <= sizeof(unsigned int))
+ };
+ };
+
+ /// Shared memory storage layout type
+ struct TempStorage {};
+
+
+ //---------------------------------------------------------------------
+ // Thread fields
+ //---------------------------------------------------------------------
+
+ /// Lane index in logical warp
+ unsigned int lane_id;
+
+ /// Logical warp index in 32-thread physical warp
+ unsigned int warp_id;
+
+ /// 32-thread physical warp member mask of logical warp
+ unsigned int member_mask;
+
+ //---------------------------------------------------------------------
+ // Construction
+ //---------------------------------------------------------------------
+
+ /// Constructor
+ __device__ __forceinline__ WarpScanShfl(
+ TempStorage &/*temp_storage*/)
+ {
+ lane_id = LaneId();
+ warp_id = 0;
+ member_mask = 0xffffffffu >> (CUB_WARP_THREADS(PTX_ARCH) - LOGICAL_WARP_THREADS);
+
+ if (!IS_ARCH_WARP)
+ {
+ warp_id = lane_id / LOGICAL_WARP_THREADS;
+ lane_id = lane_id % LOGICAL_WARP_THREADS;
+ member_mask = member_mask << (warp_id * LOGICAL_WARP_THREADS);
+ }
+ }
+
+
+ //---------------------------------------------------------------------
+ // Inclusive scan steps
+ //---------------------------------------------------------------------
+
+ /// Inclusive prefix scan step (specialized for summation 
across int32 types) + __device__ __forceinline__ int InclusiveScanStep( + int input, ///< [in] Calling thread's input item. + cub::Sum /*scan_op*/, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset) ///< [in] Up-offset to pull from + { + int output; + int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .s32 r0;" + " .reg .pred p;" + " shfl.sync.up.b32 r0|p, %1, %2, %3, %5;" + " @p add.s32 r0, r0, %4;" + " mov.s32 %0, r0;" + "}" + : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .s32 r0;" + " .reg .pred p;" + " shfl.up.b32 r0|p, %1, %2, %3;" + " @p add.s32 r0, r0, %4;" + " mov.s32 %0, r0;" + "}" + : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input)); +#endif + + return output; + } + + /// Inclusive prefix scan step (specialized for summation across uint32 types) + __device__ __forceinline__ unsigned int InclusiveScanStep( + unsigned int input, ///< [in] Calling thread's input item. + cub::Sum /*scan_op*/, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset) ///< [in] Up-offset to pull from + { + unsigned int output; + int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .u32 r0;" + " .reg .pred p;" + " shfl.sync.up.b32 r0|p, %1, %2, %3, %5;" + " @p add.u32 r0, r0, %4;" + " mov.u32 %0, r0;" + "}" + : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .u32 r0;" + " .reg .pred p;" + " shfl.up.b32 r0|p, %1, %2, %3;" + " @p add.u32 r0, r0, %4;" + " mov.u32 %0, r0;" + "}" + : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input)); +#endif + + return output; + } + + + /// Inclusive prefix scan step (specialized for summation across fp32 types) + __device__ __forceinline__ float InclusiveScanStep( + float input, ///< [in] Calling thread's input item. + cub::Sum /*scan_op*/, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset) ///< [in] Up-offset to pull from + { + float output; + int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .f32 r0;" + " .reg .pred p;" + " shfl.sync.up.b32 r0|p, %1, %2, %3, %5;" + " @p add.f32 r0, r0, %4;" + " mov.f32 %0, r0;" + "}" + : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .f32 r0;" + " .reg .pred p;" + " shfl.up.b32 r0|p, %1, %2, %3;" + " @p add.f32 r0, r0, %4;" + " mov.f32 %0, r0;" + "}" + : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input)); +#endif + + return output; + } + + + /// Inclusive prefix scan step (specialized for summation across unsigned long long types) + __device__ __forceinline__ unsigned long long InclusiveScanStep( + unsigned long long input, ///< [in] Calling thread's input item. 
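+ // A note on the shfl_c control word used in these asm blocks (descriptive,
+ // based on the PTX ISA; not in the original source): bits [4:0] carry the
+ // clamp lane -- first_lane for the up-shuffles here -- and bits [12:8] carry
+ // a sub-segment mask. SHFL_C is (32 - LOGICAL_WARP_THREADS) << 8, so the
+ // shuffle is fenced at logical-warp boundaries, and the predicate written to
+ // "|p" comes back false when the source lane falls outside the segment,
+ // which is what guards the predicated add below.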
+ cub::Sum /*scan_op*/, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset) ///< [in] Up-offset to pull from + { + unsigned long long output; + int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .u64 r0;" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.sync.up.b32 lo|p, lo, %2, %3, %5;" + " shfl.sync.up.b32 hi|p, hi, %2, %3, %5;" + " mov.b64 r0, {lo, hi};" + " @p add.u64 r0, r0, %4;" + " mov.u64 %0, r0;" + "}" + : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .u64 r0;" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.up.b32 lo|p, lo, %2, %3;" + " shfl.up.b32 hi|p, hi, %2, %3;" + " mov.b64 r0, {lo, hi};" + " @p add.u64 r0, r0, %4;" + " mov.u64 %0, r0;" + "}" + : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input)); +#endif + + return output; + } + + + /// Inclusive prefix scan step (specialized for summation across long long types) + __device__ __forceinline__ long long InclusiveScanStep( + long long input, ///< [in] Calling thread's input item. + cub::Sum /*scan_op*/, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset) ///< [in] Up-offset to pull from + { + long long output; + int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .s64 r0;" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.sync.up.b32 lo|p, lo, %2, %3, %5;" + " shfl.sync.up.b32 hi|p, hi, %2, %3, %5;" + " mov.b64 r0, {lo, hi};" + " @p add.s64 r0, r0, %4;" + " mov.s64 %0, r0;" + "}" + : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .s64 r0;" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.up.b32 lo|p, lo, %2, %3;" + " shfl.up.b32 hi|p, hi, %2, %3;" + " mov.b64 r0, {lo, hi};" + " @p add.s64 r0, r0, %4;" + " mov.s64 %0, r0;" + "}" + : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input)); +#endif + + return output; + } + + + /// Inclusive prefix scan step (specialized for summation across fp64 types) + __device__ __forceinline__ double InclusiveScanStep( + double input, ///< [in] Calling thread's input item. 
+ cub::Sum /*scan_op*/, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset) ///< [in] Up-offset to pull from + { + double output; + int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " .reg .f64 r0;" + " mov.b64 %0, %1;" + " mov.b64 {lo, hi}, %1;" + " shfl.sync.up.b32 lo|p, lo, %2, %3, %4;" + " shfl.sync.up.b32 hi|p, hi, %2, %3, %4;" + " mov.b64 r0, {lo, hi};" + " @p add.f64 %0, %0, r0;" + "}" + : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " .reg .f64 r0;" + " mov.b64 %0, %1;" + " mov.b64 {lo, hi}, %1;" + " shfl.up.b32 lo|p, lo, %2, %3;" + " shfl.up.b32 hi|p, hi, %2, %3;" + " mov.b64 r0, {lo, hi};" + " @p add.f64 %0, %0, r0;" + "}" + : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c)); +#endif + + return output; + } + + +/* + /// Inclusive prefix scan (specialized for ReduceBySegmentOp across KeyValuePair types) + template + __device__ __forceinline__ KeyValuePairInclusiveScanStep( + KeyValuePair input, ///< [in] Calling thread's input item. + ReduceBySegmentOp scan_op, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset) ///< [in] Up-offset to pull from + { + KeyValuePair output; + + output.value = InclusiveScanStep(input.value, cub::Sum(), first_lane, offset, Int2Type::IS_SMALL_UNSIGNED>()); + output.key = InclusiveScanStep(input.key, cub::Sum(), first_lane, offset, Int2Type::IS_SMALL_UNSIGNED>()); + + if (input.key > 0) + output.value = input.value; + + return output; + } +*/ + + /// Inclusive prefix scan step (generic) + template + __device__ __forceinline__ _T InclusiveScanStep( + _T input, ///< [in] Calling thread's input item. + ScanOpT scan_op, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset) ///< [in] Up-offset to pull from + { + _T temp = ShuffleUp(input, offset, first_lane, member_mask); + + // Perform scan op if from a valid peer + _T output = scan_op(temp, input); + if (static_cast(lane_id) < first_lane + offset) + output = input; + + return output; + } + + + /// Inclusive prefix scan step (specialized for small integers size 32b or less) + template + __device__ __forceinline__ _T InclusiveScanStep( + _T input, ///< [in] Calling thread's input item. + ScanOpT scan_op, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset, ///< [in] Up-offset to pull from + Int2Type /*is_small_unsigned*/) ///< [in] Marker type indicating whether T is a small integer + { + return InclusiveScanStep(input, scan_op, first_lane, offset); + } + + + /// Inclusive prefix scan step (specialized for types other than small integers size 32b or less) + template + __device__ __forceinline__ _T InclusiveScanStep( + _T input, ///< [in] Calling thread's input item. 
+
+
+ /// Inclusive prefix scan step (specialized for types other than small integers size 32b or less)
+ template <typename _T, typename ScanOpT>
+ __device__ __forceinline__ _T InclusiveScanStep(
+ _T input, ///< [in] Calling thread's input item.
+ ScanOpT scan_op, ///< [in] Binary scan operator
+ int first_lane, ///< [in] Index of first lane in segment
+ int offset, ///< [in] Up-offset to pull from
+ Int2Type<false> /*is_small_unsigned*/) ///< [in] Marker type indicating whether T is a small integer
+ {
+ return InclusiveScanStep(input, scan_op, first_lane, offset);
+ }
+
+
+ /******************************************************************************
+ * Interface
+ ******************************************************************************/
+
+ //---------------------------------------------------------------------
+ // Broadcast
+ //---------------------------------------------------------------------
+
+ /// Broadcast
+ __device__ __forceinline__ T Broadcast(
+ T input, ///< [in] The value to broadcast
+ int src_lane) ///< [in] Which warp lane is to do the broadcasting
+ {
+ return ShuffleIndex<LOGICAL_WARP_THREADS>(input, src_lane, member_mask);
+ }
+
+
+ //---------------------------------------------------------------------
+ // Inclusive operations
+ //---------------------------------------------------------------------
+
+ /// Inclusive scan
+ template <typename _T, typename ScanOpT>
+ __device__ __forceinline__ void InclusiveScan(
+ _T input, ///< [in] Calling thread's input item.
+ _T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input.
+ ScanOpT scan_op) ///< [in] Binary scan operator
+ {
+ inclusive_output = input;
+
+ int segment_first_lane = 0;
+
+ // Iterate scan steps
+ #pragma unroll
+ for (int STEP = 0; STEP < STEPS; STEP++)
+ {
+ inclusive_output = InclusiveScanStep(
+ inclusive_output,
+ scan_op,
+ segment_first_lane,
+ (1 << STEP),
+ Int2Type<IntegerTraits<_T>::IS_SMALL_UNSIGNED>());
+ }
+
+ }
+
+ /// Inclusive scan, specialized for reduce-value-by-key
+ template <typename KeyT, typename ValueT>
+ __device__ __forceinline__ void InclusiveScan(
+ KeyValuePair<KeyT, ValueT> input, ///< [in] Calling thread's input item.
+ KeyValuePair<KeyT, ValueT> &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input.
+ ReduceByKeyOp<cub::Sum> scan_op) ///< [in] Binary scan operator
+ {
+ inclusive_output = input;
+
+ KeyT pred_key = ShuffleUp<LOGICAL_WARP_THREADS>(inclusive_output.key, 1, 0, member_mask);
+
+ unsigned int ballot = WARP_BALLOT((pred_key != inclusive_output.key), member_mask);
+
+ // Mask away all lanes greater than ours
+ ballot = ballot & LaneMaskLe();
+
+ // Find index of first set bit
+ int segment_first_lane = CUB_MAX(0, 31 - __clz(ballot));
+
+ // Iterate scan steps
+ #pragma unroll
+ for (int STEP = 0; STEP < STEPS; STEP++)
+ {
+ inclusive_output.value = InclusiveScanStep(
+ inclusive_output.value,
+ scan_op.op,
+ segment_first_lane,
+ (1 << STEP),
+ Int2Type<IntegerTraits<ValueT>::IS_SMALL_UNSIGNED>());
+ }
+ }
+
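The reduce-value-by-key scan above finds each lane's segment head with a ballot: lanes whose key differs from the lane below are heads, the ballot is masked down to lanes at or below the caller, and the highest surviving bit is the caller's segment head. A device-side sketch of the same computation (not part of this patch; raw CUDA intrinsics stand in for CUB's WARP_BALLOT and LaneMaskLe wrappers, assuming a full 32-lane warp):

// Sketch: locate the first lane of each lane's key segment via ballot + clz.
#include <cstdio>

__global__ void segment_heads(const int *keys, int *first_lane_out)
{
    int lane = threadIdx.x & 31;
    int key = keys[lane];
    int pred_key = __shfl_up_sync(0xffffffffu, key, 1);
    unsigned ballot = __ballot_sync(0xffffffffu, (lane > 0) && (pred_key != key));
    ballot &= 0xffffffffu >> (31 - lane);            // LaneMaskLe(): lanes <= me
    first_lane_out[lane] = (ballot == 0) ? 0 : 31 - __clz((int) ballot);
}

int main()
{
    int h[32], *d_keys, *d_out;
    for (int i = 0; i < 32; ++i) h[i] = i / 4;       // key segments of width 4
    cudaMalloc(&d_keys, sizeof(h));
    cudaMalloc(&d_out, sizeof(h));
    cudaMemcpy(d_keys, h, sizeof(h), cudaMemcpyHostToDevice);
    segment_heads<<<1, 32>>>(d_keys, d_out);
    cudaMemcpy(h, d_out, sizeof(h), cudaMemcpyDeviceToHost);
    printf("lane 6 -> head lane %d\n", h[6]);        // expect 4
    cudaFree(d_keys); cudaFree(d_out);
    return 0;
}

With segment_first_lane in hand, the per-step guard in InclusiveScanStep keeps partials from crossing segment boundaries.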
+
+ /// Inclusive scan with aggregate
+ template <typename ScanOpT>
+ __device__ __forceinline__ void InclusiveScan(
+ T input, ///< [in] Calling thread's input item.
+ T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input.
+ ScanOpT scan_op, ///< [in] Binary scan operator
+ T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items.
+ {
+ InclusiveScan(input, inclusive_output, scan_op);
+
+ // Grab aggregate from last warp lane
+ warp_aggregate = ShuffleIndex<LOGICAL_WARP_THREADS>(inclusive_output, LOGICAL_WARP_THREADS - 1, member_mask);
+ }
+
+
+ //---------------------------------------------------------------------
+ // Get exclusive from inclusive
+ //---------------------------------------------------------------------
+
+ /// Update inclusive and exclusive using input and inclusive
+ template <typename ScanOpT, typename IsIntegerT>
+ __device__ __forceinline__ void Update(
+ T /*input*/, ///< [in]
+ T &inclusive, ///< [in, out]
+ T &exclusive, ///< [out]
+ ScanOpT /*scan_op*/, ///< [in]
+ IsIntegerT /*is_integer*/) ///< [in]
+ {
+ // initial value unknown
+ exclusive = ShuffleUp<LOGICAL_WARP_THREADS>(inclusive, 1, 0, member_mask);
+ }
+
+ /// Update inclusive and exclusive using input and inclusive (specialized for summation of integer types)
+ __device__ __forceinline__ void Update(
+ T input,
+ T &inclusive,
+ T &exclusive,
+ cub::Sum /*scan_op*/,
+ Int2Type<true> /*is_integer*/)
+ {
+ // initial value presumed 0
+ exclusive = inclusive - input;
+ }
+
+ /// Update inclusive and exclusive using input, inclusive, and initial value
+ template <typename ScanOpT, typename IsIntegerT>
+ __device__ __forceinline__ void Update (
+ T /*input*/,
+ T &inclusive,
+ T &exclusive,
+ ScanOpT scan_op,
+ T initial_value,
+ IsIntegerT /*is_integer*/)
+ {
+ inclusive = scan_op(initial_value, inclusive);
+ exclusive = ShuffleUp<LOGICAL_WARP_THREADS>(inclusive, 1, 0, member_mask);
+
+ if (lane_id == 0)
+ exclusive = initial_value;
+ }
+
+ /// Update inclusive and exclusive using input, inclusive, and initial value (specialized for summation of integer types)
+ __device__ __forceinline__ void Update (
+ T input,
+ T &inclusive,
+ T &exclusive,
+ cub::Sum scan_op,
+ T initial_value,
+ Int2Type<true> /*is_integer*/)
+ {
+ inclusive = scan_op(initial_value, inclusive);
+ exclusive = inclusive - input;
+ }
+
+
+ /// Update inclusive, exclusive, and warp aggregate using input and inclusive
+ template <typename ScanOpT, typename IsIntegerT>
+ __device__ __forceinline__ void Update (
+ T input,
+ T &inclusive,
+ T &exclusive,
+ T &warp_aggregate,
+ ScanOpT scan_op,
+ IsIntegerT is_integer)
+ {
+ warp_aggregate = ShuffleIndex<LOGICAL_WARP_THREADS>(inclusive, LOGICAL_WARP_THREADS - 1, member_mask);
+ Update(input, inclusive, exclusive, scan_op, is_integer);
+ }
+
+ /// Update inclusive, exclusive, and warp aggregate using input, inclusive, and initial value
+ template <typename ScanOpT, typename IsIntegerT>
+ __device__ __forceinline__ void Update (
+ T input,
+ T &inclusive,
+ T &exclusive,
+ T &warp_aggregate,
+ ScanOpT scan_op,
+ T initial_value,
+ IsIntegerT is_integer)
+ {
+ warp_aggregate = ShuffleIndex<LOGICAL_WARP_THREADS>(inclusive, LOGICAL_WARP_THREADS - 1, member_mask);
+ Update(input, inclusive, exclusive, scan_op, initial_value, is_integer);
+ }
+
+
+
+};
+
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
diff --git a/SRC/cub/warp/specializations/warp_scan_smem.cuh b/SRC/cub/warp/specializations/warp_scan_smem.cuh
index e23ebc41..3237fcbf 100644
--- a/SRC/cub/warp/specializations/warp_scan_smem.cuh
+++ b/SRC/cub/warp/specializations/warp_scan_smem.cuh
@@ -1,319 +1,397 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill. All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned across a CUDA thread warp. - */ - -#pragma once - -#include "../../thread/thread_operators.cuh" -#include "../../thread/thread_load.cuh" -#include "../../thread/thread_store.cuh" -#include "../../util_type.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \brief WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned across a CUDA thread warp. 
- */ -template < - typename T, ///< Data type being scanned - int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp - int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective -struct WarpScanSmem -{ - /****************************************************************************** - * Constants and type definitions - ******************************************************************************/ - - enum - { - /// Whether the logical warp size and the PTX warp size coincide - IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), - - /// The number of warp scan steps - STEPS = Log2::VALUE, - - /// The number of threads in half a warp - HALF_WARP_THREADS = 1 << (STEPS - 1), - - /// The number of shared memory elements per warp - WARP_SMEM_ELEMENTS = LOGICAL_WARP_THREADS + HALF_WARP_THREADS, - }; - - /// Storage cell type (workaround for SM1x compiler bugs with custom-ops like Max() on signed chars) - typedef typename If<((Equals::VALUE || Equals::VALUE) && (PTX_ARCH < 200)), int, T>::Type CellT; - - /// Shared memory storage layout type (1.5 warps-worth of elements for each warp) - typedef CellT _TempStorage[WARP_SMEM_ELEMENTS]; - - // Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - _TempStorage &temp_storage; - unsigned int lane_id; - - - /****************************************************************************** - * Construction - ******************************************************************************/ - - /// Constructor - __device__ __forceinline__ WarpScanSmem( - TempStorage &temp_storage) - : - temp_storage(temp_storage.Alias()), - lane_id(IS_ARCH_WARP ? - LaneId() : - LaneId() % LOGICAL_WARP_THREADS) - {} - - - /****************************************************************************** - * Utility methods - ******************************************************************************/ - - /// Basic inclusive scan iteration(template unrolled, base-case specialization) - template < - bool HAS_IDENTITY, - typename ScanOp> - __device__ __forceinline__ void ScanStep( - T &partial, - ScanOp scan_op, - Int2Type step) - {} - - - /// Basic inclusive scan iteration (template unrolled, inductive-case specialization) - template < - bool HAS_IDENTITY, - int STEP, - typename ScanOp> - __device__ __forceinline__ void ScanStep( - T &partial, - ScanOp scan_op, - Int2Type step) - { - const int OFFSET = 1 << STEP; - - // Share partial into buffer - ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) partial); - - // Update partial if addend is in range - if (HAS_IDENTITY || (lane_id >= OFFSET)) - { - T addend = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - OFFSET]); - partial = scan_op(addend, partial); - } - - ScanStep(partial, scan_op, Int2Type()); - } - - - /// Inclusive prefix scan with identity - template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item. - T &output, ///< [out] Calling thread's output item. May be aliased with \p input. 
- T identity, ///< [in] Identity value - ScanOp scan_op) ///< [in] Binary scan operator - { - ThreadStore(&temp_storage[lane_id], (CellT) identity); - - // Iterate scan steps - output = input; - ScanStep(output, scan_op, Int2Type<0>()); - } - - - /// Inclusive prefix scan (specialized for summation across primitive types) - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item. - T &output, ///< [out] Calling thread's output item. May be aliased with \p input. - Sum scan_op, ///< [in] Binary scan operator - Int2Type is_primitive) ///< [in] Marker type indicating whether T is primitive type - { - T identity = ZeroInitialize(); - InclusiveScan(input, output, identity, scan_op); - } - - - /// Inclusive prefix scan - template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item. - T &output, ///< [out] Calling thread's output item. May be aliased with \p input. - ScanOp scan_op, ///< [in] Binary scan operator - Int2Type is_primitive) ///< [in] Marker type indicating whether T is primitive type - { - // Iterate scan steps - output = input; - ScanStep(output, scan_op, Int2Type<0>()); - } - - - - /****************************************************************************** - * Interface - ******************************************************************************/ - - /// Broadcast - __device__ __forceinline__ T Broadcast( - T input, ///< [in] The value to broadcast - unsigned int src_lane) ///< [in] Which warp lane is to do the broadcasting - { - if (lane_id == src_lane) - { - ThreadStore(temp_storage, (CellT) input); - } - - return (T) ThreadLoad(temp_storage); - } - - - //--------------------------------------------------------------------- - // Inclusive operations - //--------------------------------------------------------------------- - - /// Inclusive scan - template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item. - T &output, ///< [out] Calling thread's output item. May be aliased with \p input. - ScanOp scan_op) ///< [in] Binary scan operator - { - InclusiveScan(input, output, scan_op, Int2Type::PRIMITIVE>()); } - - - /// Inclusive scan with aggregate - template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item. - T &output, ///< [out] Calling thread's output item. May be aliased with \p input. - ScanOp scan_op, ///< [in] Binary scan operator - T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. - { - InclusiveScan(input, output, scan_op); - - // Retrieve aggregate - ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) output); - warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); - } - - - //--------------------------------------------------------------------- - // Combo (inclusive & exclusive) operations - //--------------------------------------------------------------------- - - /// Combination scan without identity - template - __device__ __forceinline__ void Scan( - T input, ///< [in] Calling thread's input item. - T &inclusive_output, ///< [out] Calling thread's inclusive-scan output item. - T &exclusive_output, ///< [out] Calling thread's exclusive-scan output item. 
- ScanOp scan_op) ///< [in] Binary scan operator - { - // Compute inclusive scan - InclusiveScan(input, inclusive_output, scan_op); - - // Grab result from predecessor - ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive_output); - exclusive_output = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 1]); - } - - /// Combination scan with identity - template - __device__ __forceinline__ void Scan( - T input, ///< [in] Calling thread's input item. - T &inclusive_output, ///< [out] Calling thread's inclusive-scan output item. - T &exclusive_output, ///< [out] Calling thread's exclusive-scan output item. - T identity, ///< [in] Identity value - ScanOp scan_op) ///< [in] Binary scan operator - { - // Compute inclusive scan - InclusiveScan(input, inclusive_output, identity, scan_op); - - // Grab result from predecessor - ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive_output); - exclusive_output = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 1]); - } - - - //--------------------------------------------------------------------- - // Exclusive operations - //--------------------------------------------------------------------- - - /// Exclusive scan with aggregate - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item. - T &output, ///< [out] Calling thread's output item. May be aliased with \p input. - T identity, ///< [in] Identity value - ScanOp scan_op, ///< [in] Binary scan operator - T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. - { - T inclusive_output; - Scan(input, inclusive_output, output, identity, scan_op); - - // Retrieve aggregate - warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); - } - - - /// Exclusive scan with aggregate, without identity - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item. - T &output, ///< [out] Calling thread's output item. May be aliased with \p input. - ScanOp scan_op, ///< [in] Binary scan operator - T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. - { - T inclusive_output; - Scan(input, inclusive_output, output, scan_op); - - // Retrieve aggregate - warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); - } - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned across a CUDA thread warp.
+ */
+
+#pragma once
+
+#include "../../thread/thread_operators.cuh"
+#include "../../thread/thread_load.cuh"
+#include "../../thread/thread_store.cuh"
+#include "../../util_type.cuh"
+#include "../../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \brief WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned across a CUDA thread warp.
+ */
+template <
+ typename T, ///< Data type being scanned
+ int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp
+ int PTX_ARCH> ///< The PTX compute capability for which to specialize this collective
+struct WarpScanSmem
+{
+ /******************************************************************************
+ * Constants and type definitions
+ ******************************************************************************/
+
+ enum
+ {
+ /// Whether the logical warp size and the PTX warp size coincide
+ IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
+
+ /// Whether the logical warp size is a power-of-two
+ IS_POW_OF_TWO = PowerOfTwo<LOGICAL_WARP_THREADS>::VALUE,
+
+ /// The number of warp scan steps
+ STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,
+
+ /// The number of threads in half a warp
+ HALF_WARP_THREADS = 1 << (STEPS - 1),
+
+ /// The number of shared memory elements per warp
+ WARP_SMEM_ELEMENTS = LOGICAL_WARP_THREADS + HALF_WARP_THREADS,
+ };
+
+ /// Storage cell type (workaround for SM1x compiler bugs with custom-ops like Max() on signed chars)
+ typedef typename If<((Equals<T, char>::VALUE || Equals<T, unsigned char>::VALUE) && (PTX_ARCH < 200)), int, T>::Type CellT;
+
+ /// Shared memory storage layout type (1.5 warps-worth of elements for each warp)
+ typedef CellT _TempStorage[WARP_SMEM_ELEMENTS];
+
+ // Alias wrapper allowing storage to be unioned
+ struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+ /******************************************************************************
+ * Thread fields
+ ******************************************************************************/
+
+ _TempStorage &temp_storage;
+ unsigned int lane_id;
+ unsigned int member_mask;
+
+
+ /******************************************************************************
+ * Construction
+ ******************************************************************************/
+
+ /// Constructor
+ __device__ __forceinline__ WarpScanSmem(
+ TempStorage &temp_storage)
+ :
+ temp_storage(temp_storage.Alias()),
+
+ lane_id(IS_ARCH_WARP ?
+ LaneId() :
+ LaneId() % LOGICAL_WARP_THREADS),
+
+ member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP || !IS_POW_OF_TWO ) ?
+ 0 : // arch-width and non-power-of-two subwarps cannot be tiled with the arch-warp
+ ((LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS)))
+ {}
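The member_mask initializer above builds the sub-warp's participation mask: a run of LOGICAL_WARP_THREADS one-bits, shifted to the sub-warp's offset when power-of-two sub-warps tile the 32-thread hardware warp. A small host-side check of the same arithmetic (a standalone sketch with our own names, not part of CUB):

// Host-side replica of the member_mask arithmetic used by the constructor above.
#include <cstdio>

unsigned member_mask_for(unsigned lane_id, int logical_warp_threads, bool is_arch_warp)
{
    unsigned base = 0xffffffffu >> (32 - logical_warp_threads);
    bool is_pow2 = (logical_warp_threads & (logical_warp_threads - 1)) == 0;
    int shift = (is_arch_warp || !is_pow2)
        ? 0   // arch-width and non-power-of-two sub-warps are not tiled
        : (lane_id / logical_warp_threads) * logical_warp_threads;
    return base << shift;
}

int main()
{
    // 8-thread logical warps: hardware lane 13 sits in the second 8-wide sub-warp.
    printf("%08x\n", member_mask_for(13, 8, false));  // prints 0000ff00
    return 0;
}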
+
+
+ /******************************************************************************
+ * Utility methods
+ ******************************************************************************/
+
+ /// Basic inclusive scan iteration (template unrolled, inductive-case specialization)
+ template <
+ bool HAS_IDENTITY,
+ int STEP,
+ typename ScanOp>
+ __device__ __forceinline__ void ScanStep(
+ T &partial,
+ ScanOp scan_op,
+ Int2Type<STEP> /*step*/)
+ {
+ const int OFFSET = 1 << STEP;
+
+ // Share partial into buffer
+ ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) partial);
+
+ WARP_SYNC(member_mask);
+
+ // Update partial if addend is in range
+ if (HAS_IDENTITY || (lane_id >= OFFSET))
+ {
+ T addend = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id - OFFSET]);
+ partial = scan_op(addend, partial);
+ }
+ WARP_SYNC(member_mask);
+
+ ScanStep<HAS_IDENTITY>(partial, scan_op, Int2Type<STEP + 1>());
+ }
+
+
+ /// Basic inclusive scan iteration (template unrolled, base-case specialization)
+ template <
+ bool HAS_IDENTITY,
+ typename ScanOp>
+ __device__ __forceinline__ void ScanStep(
+ T &/*partial*/,
+ ScanOp /*scan_op*/,
+ Int2Type<STEPS> /*step*/)
+ {}
+
+
+ /// Inclusive prefix scan (specialized for summation across primitive types)
+ __device__ __forceinline__ void InclusiveScan(
+ T input, ///< [in] Calling thread's input item.
+ T &output, ///< [out] Calling thread's output item. May be aliased with \p input.
+ Sum scan_op, ///< [in] Binary scan operator
+ Int2Type<true> /*is_primitive*/) ///< [in] Marker type indicating whether T is primitive type
+ {
+ T identity = 0;
+ ThreadStore<STORE_VOLATILE>(&temp_storage[lane_id], (CellT) identity);
+
+ WARP_SYNC(member_mask);
+
+ // Iterate scan steps
+ output = input;
+ ScanStep<true>(output, scan_op, Int2Type<0>());
+ }
+
+
+ /// Inclusive prefix scan
+ template <typename ScanOp>
+ __device__ __forceinline__ void InclusiveScan(
+ T input, ///< [in] Calling thread's input item.
+ T &output, ///< [out] Calling thread's output item. May be aliased with \p input.
+ ScanOp scan_op, ///< [in] Binary scan operator
+ Int2Type<false> /*is_primitive*/) ///< [in] Marker type indicating whether T is primitive type
+ {
+ // Iterate scan steps
+ output = input;
+ ScanStep<false>(output, scan_op, Int2Type<0>());
+ }
+
+
+ /******************************************************************************
+ * Interface
+ ******************************************************************************/
+
+ //---------------------------------------------------------------------
+ // Broadcast
+ //---------------------------------------------------------------------
+
+ /// Broadcast
+ __device__ __forceinline__ T Broadcast(
+ T input, ///< [in] The value to broadcast
+ unsigned int src_lane) ///< [in] Which warp lane is to do the broadcasting
+ {
+ if (lane_id == src_lane)
+ {
+ ThreadStore<STORE_VOLATILE>(temp_storage, (CellT) input);
+ }
+
+ WARP_SYNC(member_mask);
+
+ return (T) ThreadLoad<LOAD_VOLATILE>(temp_storage);
+ }
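The ScanStep chain above leans on the HALF_WARP_THREADS cells of identity padding placed in front of each warp's shared-memory region: when an identity exists, every lane can read `lane_id - OFFSET` without a bounds check. A self-contained sketch of the same padded Kogge-Stone scheme for one 32-lane warp of ints (our own kernel, not part of this patch):

// Sketch: padded shared-memory warp scan. The 16 identity cells in front of
// the warp's data make the `lane - offset` reads unconditionally in range.
#include <cuda_runtime.h>

__global__ void smem_warp_scan(const int *in, int *out)
{
    __shared__ volatile int buf[16 + 32];   // HALF_WARP_THREADS + warp width
    int lane = threadIdx.x & 31;
    if (lane < 16) buf[lane] = 0;           // identity padding
    int partial = in[lane];
    for (int offset = 1; offset < 32; offset <<= 1)
    {
        buf[16 + lane] = partial;
        __syncwarp();
        partial += buf[16 + lane - offset]; // always in range thanks to padding
        __syncwarp();
    }
    out[lane] = partial;
}

int main()
{
    int h[32], *d_in, *d_out;
    for (int i = 0; i < 32; ++i) h[i] = i;
    cudaMalloc(&d_in, sizeof(h)); cudaMalloc(&d_out, sizeof(h));
    cudaMemcpy(d_in, h, sizeof(h), cudaMemcpyHostToDevice);
    smem_warp_scan<<<1, 32>>>(d_in, d_out);
    cudaMemcpy(h, d_out, sizeof(h), cudaMemcpyDeviceToHost);
    cudaFree(d_in); cudaFree(d_out);
    return h[31] == 496 ? 0 : 1;            // inclusive sum of 0..31
}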
+
+
+ //---------------------------------------------------------------------
+ // Inclusive operations
+ //---------------------------------------------------------------------
+
+ /// Inclusive scan
+ template <typename ScanOp>
+ __device__ __forceinline__ void InclusiveScan(
+ T input, ///< [in] Calling thread's input item.
+ T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input.
+ ScanOp scan_op) ///< [in] Binary scan operator
+ {
+ InclusiveScan(input, inclusive_output, scan_op, Int2Type<Traits<T>::PRIMITIVE>());
+ }
+
+
+ /// Inclusive scan with aggregate
+ template <typename ScanOp>
+ __device__ __forceinline__ void InclusiveScan(
+ T input, ///< [in] Calling thread's input item.
+ T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input.
+ ScanOp scan_op, ///< [in] Binary scan operator
+ T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items.
+ {
+ InclusiveScan(input, inclusive_output, scan_op);
+
+ // Retrieve aggregate
+ ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive_output);
+
+ WARP_SYNC(member_mask);
+
+ warp_aggregate = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[WARP_SMEM_ELEMENTS - 1]);
+
+ WARP_SYNC(member_mask);
+ }
+
+
+ //---------------------------------------------------------------------
+ // Get exclusive from inclusive
+ //---------------------------------------------------------------------
+
+ /// Update inclusive and exclusive using input and inclusive
+ template <typename ScanOpT, typename IsIntegerT>
+ __device__ __forceinline__ void Update(
+ T /*input*/, ///< [in]
+ T &inclusive, ///< [in, out]
+ T &exclusive, ///< [out]
+ ScanOpT /*scan_op*/, ///< [in]
+ IsIntegerT /*is_integer*/) ///< [in]
+ {
+ // initial value unknown
+ ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive);
+
+ WARP_SYNC(member_mask);
+
+ exclusive = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id - 1]);
+ }
+
+ /// Update inclusive and exclusive using input and inclusive (specialized for summation of integer types)
+ __device__ __forceinline__ void Update(
+ T input,
+ T &inclusive,
+ T &exclusive,
+ cub::Sum /*scan_op*/,
+ Int2Type<true> /*is_integer*/)
+ {
+ // initial value presumed 0
+ exclusive = inclusive - input;
+ }
+
+ /// Update inclusive and exclusive using input, inclusive, and initial value
+ template <typename ScanOpT, typename IsIntegerT>
+ __device__ __forceinline__ void Update (
+ T /*input*/,
+ T &inclusive,
+ T &exclusive,
+ ScanOpT scan_op,
+ T initial_value,
+ IsIntegerT /*is_integer*/)
+ {
+ inclusive = scan_op(initial_value, inclusive);
+ ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive);
+
+ WARP_SYNC(member_mask);
+
+ exclusive = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id - 1]);
+ if (lane_id == 0)
+ exclusive = initial_value;
+ }
+
+ /// Update inclusive and exclusive using input, inclusive, and initial value (specialized for summation of integer types)
+ __device__ __forceinline__ void Update (
+ T input,
+ T &inclusive,
+ T &exclusive,
+ cub::Sum scan_op,
+ T initial_value,
+ Int2Type<true> /*is_integer*/)
+ {
+ inclusive = scan_op(initial_value, inclusive);
+ exclusive = inclusive - input;
+ }
+
+
+ /// Update inclusive, exclusive, and warp aggregate using input and inclusive
+ template <typename ScanOpT, typename IsIntegerT>
+ __device__ __forceinline__ void Update (
+ T /*input*/,
+ T &inclusive,
+ T &exclusive,
+ T &warp_aggregate,
+ ScanOpT /*scan_op*/,
+ IsIntegerT /*is_integer*/)
+ {
+ // Initial value presumed to be unknown or identity (either way our padding is correct)
+ ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive);
+
+ WARP_SYNC(member_mask);
+
+ exclusive = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id - 1]);
+ warp_aggregate = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[WARP_SMEM_ELEMENTS - 1]);
+ }
+
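The integer-sum Update specializations above avoid a shared-memory round trip entirely: for a prefix sum that started from the additive identity 0, each lane's exclusive result is algebraically recoverable as its inclusive result minus its own input. A host-side check of that identity (illustrative only, not part of CUB):

// Host-side check of the `exclusive = inclusive - input` shortcut.
#include <cassert>

int main()
{
    int input[4] = {3, 1, 4, 1};
    int running = 0;                     // exclusive prefix so far
    for (int lane = 0; lane < 4; ++lane)
    {
        int exclusive_ref = running;     // what a neighbor exchange would fetch
        running += input[lane];          // inclusive result for this lane
        assert(running - input[lane] == exclusive_ref);
    }
    return 0;
}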
+ /// Update inclusive, exclusive, and warp aggregate using input and inclusive (specialized for summation of integer types)
+ __device__ __forceinline__ void Update (
+ T input,
+ T &inclusive,
+ T &exclusive,
+ T &warp_aggregate,
+ cub::Sum /*scan_op*/,
+ Int2Type<true> /*is_integer*/)
+ {
+ // Initial value presumed to be unknown or identity (either way our padding is correct)
+ ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive);
+
+ WARP_SYNC(member_mask);
+
+ warp_aggregate = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[WARP_SMEM_ELEMENTS - 1]);
+ exclusive = inclusive - input;
+ }
+
+ /// Update inclusive, exclusive, and warp aggregate using input, inclusive, and initial value
+ template <typename ScanOpT, typename IsIntegerT>
+ __device__ __forceinline__ void Update (
+ T /*input*/,
+ T &inclusive,
+ T &exclusive,
+ T &warp_aggregate,
+ ScanOpT scan_op,
+ T initial_value,
+ IsIntegerT /*is_integer*/)
+ {
+ // Broadcast warp aggregate
+ ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive);
+
+ WARP_SYNC(member_mask);
+
+ warp_aggregate = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[WARP_SMEM_ELEMENTS - 1]);
+
+ WARP_SYNC(member_mask);
+
+ // Update inclusive with initial value
+ inclusive = scan_op(initial_value, inclusive);
+
+ // Get exclusive from inclusive
+ ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id - 1], (CellT) inclusive);
+
+ WARP_SYNC(member_mask);
+
+ exclusive = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id - 2]);
+
+ if (lane_id == 0)
+ exclusive = initial_value;
+ }
+
+
+};
+
+
+} // CUB namespace
+CUB_NS_POSTFIX // Optional outer namespace(s)
diff --git a/SRC/cub/warp/warp_reduce.cuh b/SRC/cub/warp/warp_reduce.cuh
index 1cd3fe0c..189896b0 100644
--- a/SRC/cub/warp/warp_reduce.cuh
+++ b/SRC/cub/warp/warp_reduce.cuh
@@ -1,627 +1,612 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill. All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
- * - ******************************************************************************/ - -/** - * \file - * The cub::WarpReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread warp. - */ - -#pragma once - -#include "specializations/warp_reduce_shfl.cuh" -#include "specializations/warp_reduce_smem.cuh" -#include "../thread/thread_operators.cuh" -#include "../util_arch.cuh" -#include "../util_type.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \addtogroup WarpModule - * @{ - */ - -/** - * \brief The WarpReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread warp. ![](warp_reduce_logo.png) - * - * \tparam T The reduction input/output element type - * \tparam LOGICAL_WARP_THREADS [optional] The number of threads per "logical" warp (may be less than the number of hardware warp threads). Default is the warp size of the targeted CUDA compute-capability (e.g., 32 threads for SM20). - * \tparam PTX_ARCH [optional] \ptxversion - * - * \par Overview - * - A reduction (or fold) - * uses a binary combining operator to compute a single aggregate from a list of input elements. - * - Supports "logical" warps smaller than the physical warp size (e.g., logical warps of 8 threads) - * - The number of entrant threads must be an multiple of \p LOGICAL_WARP_THREADS - * - * \par Performance Considerations - * - Uses special instructions when applicable (e.g., warp \p SHFL instructions) - * - Uses synchronization-free communication between warp lanes when applicable - * - Incurs zero bank conflicts for most types - * - Computation is slightly more efficient (i.e., having lower instruction overhead) for: - * - Summation (vs. generic reduction) - * - The architecture's warp size is a whole multiple of \p LOGICAL_WARP_THREADS - * - * \par Simple Examples - * \warpcollective{WarpReduce} - * \par - * The code snippet below illustrates four concurrent warp sum reductions within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpReduce for type int - * typedef cub::WarpReduce WarpReduce; - * - * // Allocate WarpReduce shared memory for 4 warps - * __shared__ typename WarpReduce::TempStorage temp_storage[4]; - * - * // Obtain one input item per thread - * int thread_data = ... - * - * // Return the warp-wide sums to each lane0 (threads 0, 32, 64, and 96) - * int warp_id = threadIdx.x / 32; - * int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is {0, 1, 2, 3, ..., 127}. - * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will \p 496, \p 1520, - * \p 2544, and \p 3568, respectively (and is undefined in other threads). - * - * \par - * The code snippet below illustrates a single warp sum reduction within a block of - * 128 threads. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpReduce for type int - * typedef cub::WarpReduce WarpReduce; - * - * // Allocate WarpReduce shared memory for one warp - * __shared__ typename WarpReduce::TempStorage temp_storage; - * ... 
- * - * // Only the first warp performs a reduction - * if (threadIdx.x < 32) - * { - * // Obtain one input item per thread - * int thread_data = ... - * - * // Return the warp-wide sum to lane0 - * int aggregate = WarpReduce(temp_storage).Sum(thread_data); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the warp of threads is {0, 1, 2, 3, ..., 31}. - * The corresponding output \p aggregate in thread0 will be \p 496 (and is undefined in other threads). - * - */ -template < - typename T, - int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, - int PTX_ARCH = CUB_PTX_ARCH> -class WarpReduce -{ -private: - - /****************************************************************************** - * Constants and type definitions - ******************************************************************************/ - - enum - { - /// Whether the logical warp size and the PTX warp size coincide - IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), - - /// Whether the logical warp size is a power-of-two - IS_POW_OF_TWO = PowerOfTwo::VALUE, - }; - -public: - - #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - /// Internal specialization. Use SHFL-based reduction if (architecture is >= SM30) and (LOGICAL_WARP_THREADS is a power-of-two) - typedef typename If<(PTX_ARCH >= 300) && (IS_POW_OF_TWO), - WarpReduceShfl, - WarpReduceSmem >::Type InternalWarpReduce; - - #endif // DOXYGEN_SHOULD_SKIP_THIS - - -private: - - /// Shared memory storage layout type for WarpReduce - typedef typename InternalWarpReduce::TempStorage _TempStorage; - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - /// Shared storage reference - _TempStorage &temp_storage; - - - /****************************************************************************** - * Utility methods - ******************************************************************************/ - -public: - - /// \smemstorage{WarpReduce} - struct TempStorage : Uninitialized<_TempStorage> {}; - - - /******************************************************************//** - * \name Collective constructors - *********************************************************************/ - //@{ - - - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. Logical warp and lane identifiers are constructed from threadIdx.x. - */ - __device__ __forceinline__ WarpReduce( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()) - {} - - - //@} end member group - /******************************************************************//** - * \name Summation reductions - *********************************************************************/ - //@{ - - - /** - * \brief Computes a warp-wide sum in the calling warp. The output is valid in warp lane0. - * - * \smemreuse - * - * \par Snippet - * The code snippet below illustrates four concurrent warp sum reductions within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpReduce for type int - * typedef cub::WarpReduce WarpReduce; - * - * // Allocate WarpReduce shared memory for 4 warps - * __shared__ typename WarpReduce::TempStorage temp_storage[4]; - * - * // Obtain one input item per thread - * int thread_data = ... 
- * - * // Return the warp-wide sums to each lane0 - * int warp_id = threadIdx.x / 32; - * int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is {0, 1, 2, 3, ..., 127}. - * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will \p 496, \p 1520, - * \p 2544, and \p 3568, respectively (and is undefined in other threads). - * - */ - __device__ __forceinline__ T Sum( - T input) ///< [in] Calling thread's input - { - return InternalWarpReduce(temp_storage).Sum(input, LOGICAL_WARP_THREADS); - } - - /** - * \brief Computes a partially-full warp-wide sum in the calling warp. The output is valid in warp lane0. - * - * All threads across the calling warp must agree on the same value for \p valid_items. Otherwise the result is undefined. - * - * \smemreuse - * - * \par Snippet - * The code snippet below illustrates a sum reduction within a single, partially-full - * block of 32 threads (one warp). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(int *d_data, int valid_items) - * { - * // Specialize WarpReduce for type int - * typedef cub::WarpReduce WarpReduce; - * - * // Allocate WarpReduce shared memory for one warp - * __shared__ typename WarpReduce::TempStorage temp_storage; - * - * // Obtain one input item per thread if in range - * int thread_data; - * if (threadIdx.x < valid_items) - * thread_data = d_data[threadIdx.x]; - * - * // Return the warp-wide sums to each lane0 - * int aggregate = WarpReduce(temp_storage).Sum( - * thread_data, valid_items); - * - * \endcode - * \par - * Suppose the input \p d_data is {0, 1, 2, 3, 4, ... and \p valid_items - * is \p 4. The corresponding output \p aggregate in thread0 is \p 6 (and is - * undefined in other threads). - * - */ - __device__ __forceinline__ T Sum( - T input, ///< [in] Calling thread's input - int valid_items) ///< [in] Total number of valid items in the calling thread's logical warp (may be less than \p LOGICAL_WARP_THREADS) - { - // Determine if we don't need bounds checking - if (valid_items >= LOGICAL_WARP_THREADS) - { - return InternalWarpReduce(temp_storage).Sum(input, valid_items); - } - else - { - return InternalWarpReduce(temp_storage).Sum(input, valid_items); - } - } - - - /** - * \brief Computes a segmented sum in the calling warp where segments are defined by head-flags. The sum of each segment is returned to the first lane in that segment (which always includes lane0). - * - * \smemreuse - * - * \par Snippet - * The code snippet below illustrates a head-segmented warp sum - * reduction within a block of 32 threads (one warp). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpReduce for type int - * typedef cub::WarpReduce WarpReduce; - * - * // Allocate WarpReduce shared memory for one warp - * __shared__ typename WarpReduce::TempStorage temp_storage; - * - * // Obtain one input item and flag per thread - * int thread_data = ... - * int head_flag = ... - * - * // Return the warp-wide sums to each lane0 - * int aggregate = WarpReduce(temp_storage).HeadSegmentedSum( - * thread_data, head_flag); - * - * \endcode - * \par - * Suppose the set of input \p thread_data and \p head_flag across the block of threads - * is {0, 1, 2, 3, ..., 31 and is {1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0, - * respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be - * \p 6, \p 22, \p 38, etc. 
(and is undefined in other threads). - * - * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) - * - */ - template < - typename Flag> - __device__ __forceinline__ T HeadSegmentedSum( - T input, ///< [in] Calling thread's input - Flag head_flag) ///< [in] Head flag denoting whether or not \p input is the start of a new segment - { - return HeadSegmentedReduce(input, head_flag, cub::Sum()); - } - - - /** - * \brief Computes a segmented sum in the calling warp where segments are defined by tail-flags. The sum of each segment is returned to the first lane in that segment (which always includes lane0). - * - * \smemreuse - * - * \par Snippet - * The code snippet below illustrates a tail-segmented warp sum - * reduction within a block of 32 threads (one warp). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpReduce for type int - * typedef cub::WarpReduce WarpReduce; - * - * // Allocate WarpReduce shared memory for one warp - * __shared__ typename WarpReduce::TempStorage temp_storage; - * - * // Obtain one input item and flag per thread - * int thread_data = ... - * int tail_flag = ... - * - * // Return the warp-wide sums to each lane0 - * int aggregate = WarpReduce(temp_storage).TailSegmentedSum( - * thread_data, tail_flag); - * - * \endcode - * \par - * Suppose the set of input \p thread_data and \p tail_flag across the block of threads - * is {0, 1, 2, 3, ..., 31 and is {0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1, - * respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be - * \p 6, \p 22, \p 38, etc. (and is undefined in other threads). - * - * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) - */ - template < - typename Flag> - __device__ __forceinline__ T TailSegmentedSum( - T input, ///< [in] Calling thread's input - Flag tail_flag) ///< [in] Head flag denoting whether or not \p input is the start of a new segment - { - return TailSegmentedReduce(input, tail_flag, cub::Sum()); - } - - - - //@} end member group - /******************************************************************//** - * \name Generic reductions - *********************************************************************/ - //@{ - - /** - * \brief Computes a warp-wide reduction in the calling warp using the specified binary reduction functor. The output is valid in warp lane0. - * - * Supports non-commutative reduction operators - * - * \smemreuse - * - * \par Snippet - * The code snippet below illustrates four concurrent warp max reductions within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpReduce for type int - * typedef cub::WarpReduce WarpReduce; - * - * // Allocate WarpReduce shared memory for 4 warps - * __shared__ typename WarpReduce::TempStorage temp_storage[4]; - * - * // Obtain one input item per thread - * int thread_data = ... - * - * // Return the warp-wide reductions to each lane0 - * int warp_id = threadIdx.x / 32; - * int aggregate = WarpReduce(temp_storage[warp_id]).Reduce( - * thread_data, cub::Max()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is {0, 1, 2, 3, ..., 127}. - * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will \p 31, \p 63, - * \p 95, and \p 127, respectively (and is undefined in other threads). 
- * - * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ T Reduce( - T input, ///< [in] Calling thread's input - ReductionOp reduction_op) ///< [in] Binary reduction operator - { - return InternalWarpReduce(temp_storage).Reduce(input, LOGICAL_WARP_THREADS, reduction_op); - } - - /** - * \brief Computes a partially-full warp-wide reduction in the calling warp using the specified binary reduction functor. The output is valid in warp lane0. - * - * All threads across the calling warp must agree on the same value for \p valid_items. Otherwise the result is undefined. - * - * Supports non-commutative reduction operators - * - * \smemreuse - * - * \par Snippet - * The code snippet below illustrates a max reduction within a single, partially-full - * block of 32 threads (one warp). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(int *d_data, int valid_items) - * { - * // Specialize WarpReduce for type int - * typedef cub::WarpReduce WarpReduce; - * - * // Allocate WarpReduce shared memory for one warp - * __shared__ typename WarpReduce::TempStorage temp_storage; - * - * // Obtain one input item per thread if in range - * int thread_data; - * if (threadIdx.x < valid_items) - * thread_data = d_data[threadIdx.x]; - * - * // Return the warp-wide reductions to each lane0 - * int aggregate = WarpReduce(temp_storage).Reduce( - * thread_data, cub::Max(), valid_items); - * - * \endcode - * \par - * Suppose the input \p d_data is {0, 1, 2, 3, 4, ... and \p valid_items - * is \p 4. The corresponding output \p aggregate in thread0 is \p 3 (and is - * undefined in other threads). - * - * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ T Reduce( - T input, ///< [in] Calling thread's input - ReductionOp reduction_op, ///< [in] Binary reduction operator - int valid_items) ///< [in] Total number of valid items in the calling thread's logical warp (may be less than \p LOGICAL_WARP_THREADS) - { - // Determine if we don't need bounds checking - if (valid_items >= LOGICAL_WARP_THREADS) - { - return InternalWarpReduce(temp_storage).Reduce(input, valid_items, reduction_op); - } - else - { - return InternalWarpReduce(temp_storage).Reduce(input, valid_items, reduction_op); - } - } - - - /** - * \brief Computes a segmented reduction in the calling warp where segments are defined by head-flags. The reduction of each segment is returned to the first lane in that segment (which always includes lane0). - * - * Supports non-commutative reduction operators - * - * \smemreuse - * - * \par Snippet - * The code snippet below illustrates a head-segmented warp max - * reduction within a block of 32 threads (one warp). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpReduce for type int - * typedef cub::WarpReduce WarpReduce; - * - * // Allocate WarpReduce shared memory for one warp - * __shared__ typename WarpReduce::TempStorage temp_storage; - * - * // Obtain one input item and flag per thread - * int thread_data = ... - * int head_flag = ... 
- * - * // Return the warp-wide reductions to each lane0 - * int aggregate = WarpReduce(temp_storage).HeadSegmentedReduce( - * thread_data, head_flag, cub::Max()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data and \p head_flag across the block of threads - * is {0, 1, 2, 3, ..., 31 and is {1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0, - * respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be - * \p 3, \p 7, \p 11, etc. (and is undefined in other threads). - * - * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) - */ - template < - typename ReductionOp, - typename Flag> - __device__ __forceinline__ T HeadSegmentedReduce( - T input, ///< [in] Calling thread's input - Flag head_flag, ///< [in] Head flag denoting whether or not \p input is the start of a new segment - ReductionOp reduction_op) ///< [in] Reduction operator - { - return InternalWarpReduce(temp_storage).template SegmentedReduce(input, head_flag, reduction_op); - } - - - /** - * \brief Computes a segmented reduction in the calling warp where segments are defined by tail-flags. The reduction of each segment is returned to the first lane in that segment (which always includes lane0). - * - * Supports non-commutative reduction operators - * - * \smemreuse - * - * \par Snippet - * The code snippet below illustrates a tail-segmented warp max - * reduction within a block of 32 threads (one warp). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpReduce for type int - * typedef cub::WarpReduce WarpReduce; - * - * // Allocate WarpReduce shared memory for one warp - * __shared__ typename WarpReduce::TempStorage temp_storage; - * - * // Obtain one input item and flag per thread - * int thread_data = ... - * int tail_flag = ... - * - * // Return the warp-wide reductions to each lane0 - * int aggregate = WarpReduce(temp_storage).TailSegmentedReduce( - * thread_data, tail_flag, cub::Max()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data and \p tail_flag across the block of threads - * is {0, 1, 2, 3, ..., 31 and is {0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1, - * respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be - * \p 3, \p 7, \p 11, etc. (and is undefined in other threads). - * - * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) - */ - template < - typename ReductionOp, - typename Flag> - __device__ __forceinline__ T TailSegmentedReduce( - T input, ///< [in] Calling thread's input - Flag tail_flag, ///< [in] Tail flag denoting whether or not \p input is the end of the current segment - ReductionOp reduction_op) ///< [in] Reduction operator - { - return InternalWarpReduce(temp_storage).template SegmentedReduce(input, tail_flag, reduction_op); - } - - - - //@} end member group -}; - -/** @} */ // end group WarpModule - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::WarpReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread warp. + */ + +#pragma once + +#include "specializations/warp_reduce_shfl.cuh" +#include "specializations/warp_reduce_smem.cuh" +#include "../thread/thread_operators.cuh" +#include "../util_arch.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup WarpModule + * @{ + */ + +/** + * \brief The WarpReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread warp. ![](warp_reduce_logo.png) + * + * \tparam T The reduction input/output element type + * \tparam LOGICAL_WARP_THREADS [optional] The number of threads per "logical" warp (may be less than the number of hardware warp threads). Default is the warp size of the targeted CUDA compute-capability (e.g., 32 threads for SM20). + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - A reduction (or fold) + * uses a binary combining operator to compute a single aggregate from a list of input elements. + * - Supports "logical" warps smaller than the physical warp size (e.g., logical warps of 8 threads) + * - The number of entrant threads must be an multiple of \p LOGICAL_WARP_THREADS + * + * \par Performance Considerations + * - Uses special instructions when applicable (e.g., warp \p SHFL instructions) + * - Uses synchronization-free communication between warp lanes when applicable + * - Incurs zero bank conflicts for most types + * - Computation is slightly more efficient (i.e., having lower instruction overhead) for: + * - Summation (vs. 
generic reduction) + * - The architecture's warp size is a whole multiple of \p LOGICAL_WARP_THREADS + * + * \par Simple Examples + * \warpcollective{WarpReduce} + * \par + * The code snippet below illustrates four concurrent warp sum reductions within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for 4 warps + * __shared__ typename WarpReduce::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Return the warp-wide sums to each lane0 (threads 0, 32, 64, and 96) + * int warp_id = threadIdx.x / 32; + * int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, 1, 2, 3, ..., 127}. + * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will \p 496, \p 1520, + * \p 2544, and \p 3568, respectively (and is undefined in other threads). + * + * \par + * The code snippet below illustrates a single warp sum reduction within a block of + * 128 threads. + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for one warp + * __shared__ typename WarpReduce::TempStorage temp_storage; + * ... + * + * // Only the first warp performs a reduction + * if (threadIdx.x < 32) + * { + * // Obtain one input item per thread + * int thread_data = ... + * + * // Return the warp-wide sum to lane0 + * int aggregate = WarpReduce(temp_storage).Sum(thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the warp of threads is {0, 1, 2, 3, ..., 31}. + * The corresponding output \p aggregate in thread0 will be \p 496 (and is undefined in other threads). + * + */ +template < + typename T, + int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, + int PTX_ARCH = CUB_PTX_ARCH> +class WarpReduce +{ +private: + + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + enum + { + /// Whether the logical warp size and the PTX warp size coincide + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + + /// Whether the logical warp size is a power-of-two + IS_POW_OF_TWO = PowerOfTwo::VALUE, + }; + +public: + + #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + /// Internal specialization. 
+
+private:
+
+    /// Shared memory storage layout type for WarpReduce
+    typedef typename InternalWarpReduce::TempStorage _TempStorage;
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Shared storage reference
+    _TempStorage    &temp_storage;
+
+
+    /******************************************************************************
+     * Utility methods
+     ******************************************************************************/
+
+public:
+
+    /// \smemstorage{WarpReduce}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage.  Logical warp and lane identifiers are constructed from threadIdx.x.
+     */
+    __device__ __forceinline__ WarpReduce(
+        TempStorage &temp_storage)              ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias())
+    {}
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Summation reductions
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes a warp-wide sum in the calling warp.  The output is valid in warp lane0.
+     *
+     * \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp sum reductions within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpReduce for type int
+     *     typedef cub::WarpReduce<int> WarpReduce;
+     *
+     *     // Allocate WarpReduce shared memory for 4 warps
+     *     __shared__ typename WarpReduce::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Return the warp-wide sums to each lane0
+     *     int warp_id = threadIdx.x / 32;
+     *     int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is {0, 1, 2, 3, ..., 127}.
+     * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will be \p 496, \p 1520,
+     * \p 2544, and \p 3568, respectively (and is undefined in other threads).
+     *
+     */
+    __device__ __forceinline__ T Sum(
+        T                   input)              ///< [in] Calling thread's input
+    {
+        return InternalWarpReduce(temp_storage).template Reduce<true>(input, LOGICAL_WARP_THREADS, cub::Sum());
+    }
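+
+    // A minimal calling sketch for Sum() (illustrative only, mirroring the
+    // snippet above; assumes a launch of at least one full 32-thread warp):
+    //
+    //     typedef cub::WarpReduce<int> WarpReduce;
+    //     __shared__ typename WarpReduce::TempStorage temp_storage;
+    //     int aggregate = WarpReduce(temp_storage).Sum(threadIdx.x);
+    //     // lane0 now holds 0 + 1 + ... + 31 = 496; other lanes are undefined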
+
+    /**
+     * \brief Computes a partially-full warp-wide sum in the calling warp.  The output is valid in warp lane0.
+     *
+     * All threads across the calling warp must agree on the same value for \p valid_items.  Otherwise the result is undefined.
+     *
+     * \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a sum reduction within a single, partially-full
+     * block of 32 threads (one warp).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, int valid_items)
+     * {
+     *     // Specialize WarpReduce for type int
+     *     typedef cub::WarpReduce<int> WarpReduce;
+     *
+     *     // Allocate WarpReduce shared memory for one warp
+     *     __shared__ typename WarpReduce::TempStorage temp_storage;
+     *
+     *     // Obtain one input item per thread if in range
+     *     int thread_data;
+     *     if (threadIdx.x < valid_items)
+     *         thread_data = d_data[threadIdx.x];
+     *
+     *     // Return the warp-wide sums to each lane0
+     *     int aggregate = WarpReduce(temp_storage).Sum(
+     *         thread_data, valid_items);
+     *
+     * \endcode
+     * \par
+     * Suppose the input \p d_data is {0, 1, 2, 3, 4, ...} and \p valid_items
+     * is \p 4.  The corresponding output \p aggregate in thread0 is \p 6 (and is
+     * undefined in other threads).
+     *
+     */
+    __device__ __forceinline__ T Sum(
+        T                   input,              ///< [in] Calling thread's input
+        int                 valid_items)        ///< [in] Total number of valid items in the calling thread's logical warp (may be less than \p LOGICAL_WARP_THREADS)
+    {
+        // Determine if we don't need bounds checking
+        return InternalWarpReduce(temp_storage).template Reduce<false>(input, valid_items, cub::Sum());
+    }
+
+
+    /**
+     * \brief Computes a segmented sum in the calling warp where segments are defined by head-flags.  The sum of each segment is returned to the first lane in that segment (which always includes lane0).
+     *
+     * \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a head-segmented warp sum
+     * reduction within a block of 32 threads (one warp).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpReduce for type int
+     *     typedef cub::WarpReduce<int> WarpReduce;
+     *
+     *     // Allocate WarpReduce shared memory for one warp
+     *     __shared__ typename WarpReduce::TempStorage temp_storage;
+     *
+     *     // Obtain one input item and flag per thread
+     *     int thread_data = ...
+     *     int head_flag = ...
+     *
+     *     // Return the warp-wide sums to each lane0
+     *     int aggregate = WarpReduce(temp_storage).HeadSegmentedSum(
+     *         thread_data, head_flag);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data and \p head_flag across the block of threads
+     * is {0, 1, 2, 3, ..., 31} and is {1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0},
+     * respectively.  The corresponding output \p aggregate in threads 0, 4, 8, etc. will be
+     * \p 6, \p 22, \p 38, etc. (and is undefined in other threads).
+     *
+     * \tparam FlagT    [inferred] The flag type (must be an integer type)
+     *
+     */
+    template <
+        typename            FlagT>
+    __device__ __forceinline__ T HeadSegmentedSum(
+        T                   input,              ///< [in] Calling thread's input
+        FlagT               head_flag)          ///< [in] Head flag denoting whether or not \p input is the start of a new segment
+    {
+        return HeadSegmentedReduce(input, head_flag, cub::Sum());
+    }
+
+
+    /**
+     * \brief Computes a segmented sum in the calling warp where segments are defined by tail-flags.  The sum of each segment is returned to the first lane in that segment (which always includes lane0).
+     *
+     * \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a tail-segmented warp sum
+     * reduction within a block of 32 threads (one warp).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpReduce for type int
+     *     typedef cub::WarpReduce<int> WarpReduce;
+     *
+     *     // Allocate WarpReduce shared memory for one warp
+     *     __shared__ typename WarpReduce::TempStorage temp_storage;
+     *
+     *     // Obtain one input item and flag per thread
+     *     int thread_data = ...
+     *     int tail_flag = ...
+     *
+     *     // Return the warp-wide sums to each lane0
+     *     int aggregate = WarpReduce(temp_storage).TailSegmentedSum(
+     *         thread_data, tail_flag);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data and \p tail_flag across the block of threads
+     * is {0, 1, 2, 3, ..., 31} and is {0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1},
+     * respectively.  The corresponding output \p aggregate in threads 0, 4, 8, etc. will be
+     * \p 6, \p 22, \p 38, etc. (and is undefined in other threads).
+     *
+     * \tparam FlagT    [inferred] The flag type (must be an integer type)
+     */
+    template <
+        typename            FlagT>
+    __device__ __forceinline__ T TailSegmentedSum(
+        T                   input,              ///< [in] Calling thread's input
+        FlagT               tail_flag)          ///< [in] Tail flag denoting whether or not \p input is the end of the current segment
+    {
+        return TailSegmentedReduce(input, tail_flag, cub::Sum());
+    }
+
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Generic reductions
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Computes a warp-wide reduction in the calling warp using the specified binary reduction functor.  The output is valid in warp lane0.
+     *
+     * Supports non-commutative reduction operators
+     *
+     * \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp max reductions within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpReduce for type int
+     *     typedef cub::WarpReduce<int> WarpReduce;
+     *
+     *     // Allocate WarpReduce shared memory for 4 warps
+     *     __shared__ typename WarpReduce::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Return the warp-wide reductions to each lane0
+     *     int warp_id = threadIdx.x / 32;
+     *     int aggregate = WarpReduce(temp_storage[warp_id]).Reduce(
+     *         thread_data, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is {0, 1, 2, 3, ..., 127}.
+     * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will be \p 31, \p 63,
+     * \p 95, and \p 127, respectively (and is undefined in other threads).
+     *
+     * \tparam ReductionOp     [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b)
+     */
+    template <typename ReductionOp>
+    __device__ __forceinline__ T Reduce(
+        T                   input,              ///< [in] Calling thread's input
+        ReductionOp         reduction_op)       ///< [in] Binary reduction operator
+    {
+        return InternalWarpReduce(temp_storage).template Reduce<true>(input, LOGICAL_WARP_THREADS, reduction_op);
+    }
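+
+    // A minimal sketch of a user-supplied functor for Reduce() (illustrative
+    // only; AbsMax is hypothetical -- any type with T operator()(const T&,
+    // const T&) satisfies the ReductionOp contract documented above):
+    //
+    //     struct AbsMax
+    //     {
+    //         __device__ __forceinline__ int operator()(const int &a, const int &b) const
+    //         {
+    //             return (abs(a) > abs(b)) ? a : b;
+    //         }
+    //     };
+    //     int aggregate = WarpReduce(temp_storage).Reduce(thread_data, AbsMax());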
+
+    /**
+     * \brief Computes a partially-full warp-wide reduction in the calling warp using the specified binary reduction functor.  The output is valid in warp lane0.
+     *
+     * All threads across the calling warp must agree on the same value for \p valid_items.  Otherwise the result is undefined.
+     *
+     * Supports non-commutative reduction operators
+     *
+     * \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a max reduction within a single, partially-full
+     * block of 32 threads (one warp).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(int *d_data, int valid_items)
+     * {
+     *     // Specialize WarpReduce for type int
+     *     typedef cub::WarpReduce<int> WarpReduce;
+     *
+     *     // Allocate WarpReduce shared memory for one warp
+     *     __shared__ typename WarpReduce::TempStorage temp_storage;
+     *
+     *     // Obtain one input item per thread if in range
+     *     int thread_data;
+     *     if (threadIdx.x < valid_items)
+     *         thread_data = d_data[threadIdx.x];
+     *
+     *     // Return the warp-wide reductions to each lane0
+     *     int aggregate = WarpReduce(temp_storage).Reduce(
+     *         thread_data, cub::Max(), valid_items);
+     *
+     * \endcode
+     * \par
+     * Suppose the input \p d_data is {0, 1, 2, 3, 4, ...} and \p valid_items
+     * is \p 4.  The corresponding output \p aggregate in thread0 is \p 3 (and is
+     * undefined in other threads).
+     *
+     * \tparam ReductionOp     [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b)
+     */
+    template <typename ReductionOp>
+    __device__ __forceinline__ T Reduce(
+        T                   input,              ///< [in] Calling thread's input
+        ReductionOp         reduction_op,       ///< [in] Binary reduction operator
+        int                 valid_items)        ///< [in] Total number of valid items in the calling thread's logical warp (may be less than \p LOGICAL_WARP_THREADS)
+    {
+        return InternalWarpReduce(temp_storage).template Reduce<false>(input, valid_items, reduction_op);
+    }
+
+
+    /**
+     * \brief Computes a segmented reduction in the calling warp where segments are defined by head-flags.  The reduction of each segment is returned to the first lane in that segment (which always includes lane0).
+     *
+     * Supports non-commutative reduction operators
+     *
+     * \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a head-segmented warp max
+     * reduction within a block of 32 threads (one warp).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpReduce for type int
+     *     typedef cub::WarpReduce<int> WarpReduce;
+     *
+     *     // Allocate WarpReduce shared memory for one warp
+     *     __shared__ typename WarpReduce::TempStorage temp_storage;
+     *
+     *     // Obtain one input item and flag per thread
+     *     int thread_data = ...
+     *     int head_flag = ...
+     *
+     *     // Return the warp-wide reductions to each lane0
+     *     int aggregate = WarpReduce(temp_storage).HeadSegmentedReduce(
+     *         thread_data, head_flag, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data and \p head_flag across the block of threads
+     * is {0, 1, 2, 3, ..., 31} and is {1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0},
+     * respectively.  The corresponding output \p aggregate in threads 0, 4, 8, etc. will be
+     * \p 3, \p 7, \p 11, etc. (and is undefined in other threads).
+     *
+     * \tparam ReductionOp     [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b)
+     */
+    template <
+        typename            ReductionOp,
+        typename            FlagT>
+    __device__ __forceinline__ T HeadSegmentedReduce(
+        T                   input,              ///< [in] Calling thread's input
+        FlagT               head_flag,          ///< [in] Head flag denoting whether or not \p input is the start of a new segment
+        ReductionOp         reduction_op)       ///< [in] Reduction operator
+    {
+        return InternalWarpReduce(temp_storage).template SegmentedReduce<true>(input, head_flag, reduction_op);
+    }
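+
+    // A worked flag layout for the segmented reductions (illustrative sketch,
+    // mirroring the doc examples above): head_flag = {1,0,0,0, 1,0,0,0, ...}
+    // groups lanes 0-3, 4-7, ... into segments marked by their first lane;
+    // tail_flag = {0,0,0,1, 0,0,0,1, ...} marks the same segments by their
+    // last lane.  Either way, each segment's result lands in its first lane.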
+
+    /**
+     * \brief Computes a segmented reduction in the calling warp where segments are defined by tail-flags.  The reduction of each segment is returned to the first lane in that segment (which always includes lane0).
+     *
+     * Supports non-commutative reduction operators
+     *
+     * \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates a tail-segmented warp max
+     * reduction within a block of 32 threads (one warp).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpReduce for type int
+     *     typedef cub::WarpReduce<int> WarpReduce;
+     *
+     *     // Allocate WarpReduce shared memory for one warp
+     *     __shared__ typename WarpReduce::TempStorage temp_storage;
+     *
+     *     // Obtain one input item and flag per thread
+     *     int thread_data = ...
+     *     int tail_flag = ...
+     *
+     *     // Return the warp-wide reductions to each lane0
+     *     int aggregate = WarpReduce(temp_storage).TailSegmentedReduce(
+     *         thread_data, tail_flag, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data and \p tail_flag across the block of threads
+     * is {0, 1, 2, 3, ..., 31} and is {0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1},
+     * respectively.  The corresponding output \p aggregate in threads 0, 4, 8, etc. will be
+     * \p 3, \p 7, \p 11, etc. (and is undefined in other threads).
+     *
+     * \tparam ReductionOp     [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b)
+     */
+    template <
+        typename            ReductionOp,
+        typename            FlagT>
+    __device__ __forceinline__ T TailSegmentedReduce(
+        T                   input,              ///< [in] Calling thread's input
+        FlagT               tail_flag,          ///< [in] Tail flag denoting whether or not \p input is the end of the current segment
+        ReductionOp         reduction_op)       ///< [in] Reduction operator
+    {
+        return InternalWarpReduce(temp_storage).template SegmentedReduce<false>(input, tail_flag, reduction_op);
+    }
+
+
+
+    //@}  end member group
+};
+
+/** @} */       // end group WarpModule
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/SRC/cub/warp/warp_scan.cuh b/SRC/cub/warp/warp_scan.cuh
index a065f984..c7af0d34 100644
--- a/SRC/cub/warp/warp_scan.cuh
+++ b/SRC/cub/warp/warp_scan.cuh
@@ -1,1451 +1,936 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED.  IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * The cub::WarpScan class provides [collective](index.html#sec0) methods for computing a parallel prefix scan of items partitioned across a CUDA thread warp.
- */
-
-#pragma once
-
-#include "specializations/warp_scan_shfl.cuh"
-#include "specializations/warp_scan_smem.cuh"
-#include "../thread/thread_operators.cuh"
-#include "../util_arch.cuh"
-#include "../util_type.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \addtogroup WarpModule
- * @{
- */
-
-/**
- * \brief The WarpScan class provides [collective](index.html#sec0) methods for computing a parallel prefix scan of items partitioned across a CUDA thread warp. ![](warp_scan_logo.png)
- *
- * \tparam T                        The scan input/output element type
- * \tparam LOGICAL_WARP_THREADS     [optional] The number of threads per "logical" warp (may be less than the number of hardware warp threads).  Default is the warp size associated with the CUDA Compute Capability targeted by the compiler (e.g., 32 threads for SM20).
- * \tparam PTX_ARCH                 [optional] \ptxversion
- *
- * \par Overview
- * - Given a list of input elements and a binary reduction operator, a [prefix scan](http://en.wikipedia.org/wiki/Prefix_sum)
- *   produces an output list where each element is computed to be the reduction
- *   of the elements occurring earlier in the input list.  Prefix sum
- *   connotes a prefix scan with the addition operator.  The term \em inclusive indicates
- *   that the ith output reduction incorporates the ith input.
- *   The term \em exclusive indicates the ith input is not incorporated into
- *   the ith output reduction.
- * - Supports "logical" warps smaller than the physical warp size (e.g., a logical warp of 8 threads)
- * - The number of entrant threads must be an multiple of \p LOGICAL_WARP_THREADS
- *
- * \par Performance Considerations
- * - Uses special instructions when applicable (e.g., warp \p SHFL)
- * - Uses synchronization-free communication between warp lanes when applicable
- * - Incurs zero bank conflicts for most types
- * - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
- *     - Summation (vs. generic scan)
- *     - The architecture's warp size is a whole multiple of \p LOGICAL_WARP_THREADS
- *
- * \par Simple Examples
- * \warpcollective{WarpScan}
- * \par
- * The code snippet below illustrates four concurrent warp prefix sums within a block of
- * 128 threads (one per each of the 32-thread warps).
- * \par
- * \code
- * #include <cub/cub.cuh>
- *
- * __global__ void ExampleKernel(...)
- * {
- *     // Specialize WarpScan for type int
- *     typedef cub::WarpScan<int> WarpScan;
- *
- *     // Allocate WarpScan shared memory for 4 warps
- *     __shared__ typename WarpScan::TempStorage temp_storage[4];
- *
- *     // Obtain one input item per thread
- *     int thread_data = ...
- *
- *     // Compute warp-wide prefix sums
- *     int warp_id = threadIdx.x / 32;
- *     WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data);
- *
- * \endcode
- * \par
- * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}.
- * The corresponding output \p thread_data in each of the four warps of threads will be
- * {0, 1, 2, 3, ..., 31}.
- *
- * \par
- * The code snippet below illustrates a single warp prefix sum within a block of
- * 128 threads.
- * \par
- * \code
- * #include <cub/cub.cuh>
- *
- * __global__ void ExampleKernel(...)
- * {
- *     // Specialize WarpScan for type int
- *     typedef cub::WarpScan<int> WarpScan;
- *
- *     // Allocate WarpScan shared memory for one warp
- *     __shared__ typename WarpScan::TempStorage temp_storage;
- *     ...
- *
- *     // Only the first warp performs a prefix sum
- *     if (threadIdx.x < 32)
- *     {
- *         // Obtain one input item per thread
- *         int thread_data = ...
- *
- *         // Compute warp-wide prefix sums
- *         WarpScan(temp_storage).ExclusiveSum(thread_data, thread_data);
- *
- * \endcode
- * \par
- * Suppose the set of input \p thread_data across the warp of threads is {1, 1, 1, 1, ...}.
- * The corresponding output \p thread_data will be {0, 1, 2, 3, ..., 31}.
- *
- */
-template <
-    typename    T,
-    int         LOGICAL_WARP_THREADS    = CUB_PTX_WARP_THREADS,
-    int         PTX_ARCH                = CUB_PTX_ARCH>
-class WarpScan
-{
-private:
-
-    /******************************************************************************
-     * Constants and type definitions
-     ******************************************************************************/
-
-    enum
-    {
-        /// Whether the logical warp size and the PTX warp size coincide
-        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
-
-        /// Whether the logical warp size is a power-of-two
-        IS_POW_OF_TWO = ((LOGICAL_WARP_THREADS & (LOGICAL_WARP_THREADS - 1)) == 0),
-
-        /// Whether the data type is an integer (which has fully-associative addition)
-        IS_INTEGER = ((Traits<T>::CATEGORY == SIGNED_INTEGER) || (Traits<T>::CATEGORY == UNSIGNED_INTEGER))
-    };
-
-    /// Internal specialization.  Use SHFL-based scan if (architecture is >= SM30) and (LOGICAL_WARP_THREADS is a power-of-two)
-    typedef typename If<(PTX_ARCH >= 300) && (IS_POW_OF_TWO),
-        WarpScanShfl<T, LOGICAL_WARP_THREADS, PTX_ARCH>,
-        WarpScanSmem<T, LOGICAL_WARP_THREADS, PTX_ARCH> >::Type InternalWarpScan;
-
-    /// Shared memory storage layout type for WarpScan
-    typedef typename InternalWarpScan::TempStorage _TempStorage;
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    /// Shared storage reference
-    _TempStorage    &temp_storage;
-    int             lane_id;
-
-
-    /******************************************************************************
-     * Utility methods
-     ******************************************************************************/
-
-public:
-
-    /// \smemstorage{WarpScan}
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    /******************************************************************//**
-     * \name Collective constructors
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Collective constructor using the specified memory allocation as temporary storage.  Logical warp and lane identifiers are constructed from threadIdx.x.
-     */
-    __device__ __forceinline__ WarpScan(
-        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
-    :
-        temp_storage(temp_storage.Alias()),
-        lane_id(IS_ARCH_WARP ?
-            LaneId() :
-            LaneId() % LOGICAL_WARP_THREADS)
-    {}
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Inclusive prefix sums
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes an inclusive prefix sum across the calling warp.
-     *
-     * \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp-wide inclusive prefix sums within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpScan for type int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for 4 warps
-     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Compute inclusive warp-wide prefix sums
-     *     int warp_id = threadIdx.x / 32;
-     *     WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}.
-     * The corresponding output \p thread_data in each of the four warps of threads will be
-     * {1, 2, 3, ..., 32}.
-     */
-    __device__ __forceinline__ void InclusiveSum(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &output)            ///< [out] Calling thread's output item.  May be aliased with \p input.
-    {
-        InternalWarpScan(temp_storage).InclusiveScan(input, output, cub::Sum());
-    }
-
-
-    /**
-     * \brief Computes an inclusive prefix sum across the calling warp.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
-     *
-     * The \p warp_aggregate is undefined in threads other than warp-lane0.
-     *
-     * \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp-wide inclusive prefix sums within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpScan for type int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for 4 warps
-     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Compute inclusive warp-wide prefix sums
-     *     int warp_aggregate;
-     *     int warp_id = threadIdx.x / 32;
-     *     WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data, warp_aggregate);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}.
-     * The corresponding output \p thread_data in each of the four warps of threads will be
-     * {1, 2, 3, ..., 32}.  Furthermore, \p warp_aggregate for all threads in all warps will be \p 32.
-     */
-    __device__ __forceinline__ void InclusiveSum(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
-        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
-    {
-        InternalWarpScan(temp_storage).InclusiveScan(input, output, cub::Sum(), warp_aggregate);
-    }
-
-
-    /**
-     * \brief Computes an inclusive prefix sum across the calling warp.  Instead of using 0 as the warp-wide prefix, the call-back functor \p warp_prefix_op is invoked to provide the "seed" value that logically prefixes the warp's scan inputs.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
-     *
-     * The \p warp_aggregate is undefined in threads other than warp-lane0.
-     *
-     * The \p warp_prefix_op functor must implement a member function T operator()(T warp_aggregate).
-     * The functor's input parameter \p warp_aggregate is the same value also returned by the scan operation.
-     * The functor will be invoked by the entire warp of threads, however only the return value from
-     * lane0 is applied as the threadblock-wide prefix.  Can be stateful.
-     *
-     * \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a single thread block of 32 threads (one warp) that progressively
-     * computes an inclusive prefix sum over multiple "tiles" of input using a
-     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
-     * of 32 integer items that are partitioned across the warp.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * // A stateful callback functor that maintains a running prefix to be applied
-     * // during consecutive scan operations.
-     * struct WarpPrefixCallbackOp
-     * {
-     *     // Running prefix
-     *     int running_total;
-     *
-     *     // Constructor
-     *     __device__ WarpPrefixCallbackOp(int running_total) : running_total(running_total) {}
-     *
-     *     // Callback operator to be entered by the entire warp.  Lane-0 is responsible
-     *     // for returning a value for seeding the warp-wide scan.
-     *     __device__ int operator()(int warp_aggregate)
-     *     {
-     *         int old_prefix = running_total;
-     *         running_total += warp_aggregate;
-     *         return old_prefix;
-     *     }
-     * };
-     *
-     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
-     * {
-     *     // Specialize WarpScan for int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for one warp
-     *     __shared__ typename WarpScan::TempStorage temp_storage;
-     *
-     *     // Initialize running total
-     *     WarpPrefixCallbackOp prefix_op(0);
-     *
-     *     // Have the warp iterate over segments of items
-     *     for (int block_offset = 0; block_offset < num_items; block_offset += 32)
-     *     {
-     *         // Load a segment of consecutive items
-     *         int thread_data = d_data[block_offset];
-     *
-     *         // Collectively compute the warp-wide inclusive prefix sum
-     *         int warp_aggregate;
-     *         WarpScan(temp_storage).InclusiveSum(
-     *             thread_data, thread_data, warp_aggregate, prefix_op);
-     *
-     *         // Store scanned items to output segment
-     *         d_data[block_offset] = thread_data;
-     *     }
-     * \endcode
-     * \par
-     * Suppose the input \p d_data is {1, 1, 1, 1, 1, 1, 1, 1, ...}.
-     * The corresponding output for the first segment will be {1, 2, 3, ..., 32}.
-     * The output for the second segment will be {33, 34, 35, ..., 64}.  Furthermore,
-     * the value \p 32 will be stored in \p warp_aggregate for all threads after each scan.
-     *
-     * \tparam WarpPrefixCallbackOp    [inferred] Call-back functor type having member T operator()(T warp_aggregate)
-     */
-    template <typename WarpPrefixCallbackOp>
-    __device__ __forceinline__ void InclusiveSum(
-        T                       input,              ///< [in] Calling thread's input item.
-        T                       &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
-        T                       &warp_aggregate,    ///< [out] [warp-lane0 only] Warp-wide aggregate reduction of input items, exclusive of the \p warp_prefix_op value
-        WarpPrefixCallbackOp    &warp_prefix_op)    ///< [in-out] [warp-lane0 only] Call-back functor for specifying a warp-wide prefix to be applied to all inputs.
-    {
-        // Compute inclusive warp scan
-        InclusiveSum(input, output, warp_aggregate);
-
-        // Compute warp-wide prefix from aggregate, then broadcast to other lanes
-        T prefix;
-        prefix = warp_prefix_op(warp_aggregate);
-        prefix = InternalWarpScan(temp_storage).Broadcast(prefix, 0);
-
-        // Update output
-        output = prefix + output;
-    }
-
-    //@}  end member group
-
-private:
-
-    /// Combination scan with identity
-    __device__ __forceinline__ void Sum(T input, T &inclusive_output, T &exclusive_output, Int2Type<true> is_integer)
-    {
-        // Compute exclusive warp scan from inclusive warp scan
-        InclusiveSum(input, inclusive_output);
-        exclusive_output = inclusive_output - input;
-    }
-
-    /// Combination scan with identity
-    __device__ __forceinline__ void Sum(T input, T &inclusive_output, T &exclusive_output, Int2Type<false> is_integer)
-    {
-        // Delegate to regular scan for non-integer types (because we won't be able to use subtraction)
-        T identity = ZeroInitialize<T>();
-        InternalWarpScan(temp_storage).Scan(input, inclusive_output, exclusive_output, identity, cub::Sum());
-    }
-
-    /// Computes an exclusive prefix sum across the calling warp.
-    __device__ __forceinline__ void ExclusiveSum(T input, T &output, Int2Type<true> is_integer)
-    {
-        // Compute exclusive warp scan from inclusive warp scan
-        T inclusive;
-        InclusiveSum(input, inclusive);
-        output = inclusive - input;
-    }
-
-    /// Computes an exclusive prefix sum across the calling warp.  Specialized for non-integer types.
-    __device__ __forceinline__ void ExclusiveSum(T input, T &output, Int2Type<false> is_integer)
-    {
-        // Delegate to regular scan for non-integer types (because we won't be able to use subtraction)
-        T identity = ZeroInitialize<T>();
-        ExclusiveScan(input, output, identity, cub::Sum());
-    }
-
-    /// Computes an exclusive prefix sum across the calling warp.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
-    __device__ __forceinline__ void ExclusiveSum(T input, T &output, T &warp_aggregate, Int2Type<true> is_integer)
-    {
-        // Compute exclusive warp scan from inclusive warp scan
-        T inclusive;
-        InclusiveSum(input, inclusive, warp_aggregate);
-        output = inclusive - input;
-    }
-
-    /// Computes an exclusive prefix sum across the calling warp.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.  Specialized for non-integer types.
-    __device__ __forceinline__ void ExclusiveSum(T input, T &output, T &warp_aggregate, Int2Type<false> is_integer)
-    {
-        // Delegate to regular scan for non-integer types (because we won't be able to use subtraction)
-        T identity = ZeroInitialize<T>();
-        ExclusiveScan(input, output, identity, cub::Sum(), warp_aggregate);
-    }
-
-    /// Computes an exclusive prefix sum across the calling warp.  Instead of using 0 as the warp-wide prefix, the call-back functor \p warp_prefix_op is invoked to provide the "seed" value that logically prefixes the warp's scan inputs.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
-    template <typename WarpPrefixCallbackOp>
-    __device__ __forceinline__ void ExclusiveSum(T input, T &output, T &warp_aggregate, WarpPrefixCallbackOp &warp_prefix_op, Int2Type<true> is_integer)
-    {
-        // Compute exclusive warp scan from inclusive warp scan
-        T inclusive;
-        InclusiveSum(input, inclusive, warp_aggregate, warp_prefix_op);
-        output = inclusive - input;
-    }
-
-    /// Computes an exclusive prefix sum across the calling warp.  Instead of using 0 as the warp-wide prefix, the call-back functor \p warp_prefix_op is invoked to provide the "seed" value that logically prefixes the warp's scan inputs.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.  Specialized for non-integer types.
-    template <typename WarpPrefixCallbackOp>
-    __device__ __forceinline__ void ExclusiveSum(T input, T &output, T &warp_aggregate, WarpPrefixCallbackOp &warp_prefix_op, Int2Type<false> is_integer)
-    {
-        // Delegate to regular scan for non-integer types (because we won't be able to use subtraction)
-        T identity = ZeroInitialize<T>();
-        ExclusiveScan(input, output, identity, cub::Sum(), warp_aggregate, warp_prefix_op);
-    }
-
-public:
-
-
-    /******************************************************************//**
-     * \name Exclusive prefix sums
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes an exclusive prefix sum across the calling warp.
-     *
-     * This operation assumes the value of obtained by the T's default
-     * constructor (or by zero-initialization if no user-defined default
-     * constructor exists) is suitable as the identity value "zero" for
-     * addition.
-     *
-     * \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp-wide exclusive prefix sums within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpScan for type int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for 4 warps
-     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Compute exclusive warp-wide prefix sums
-     *     int warp_id = threadIdx.x / 32;
-     *     WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}.
-     * The corresponding output \p thread_data in each of the four warps of threads will be
-     * {0, 1, 2, ..., 31}.
-     *
-     */
-    __device__ __forceinline__ void ExclusiveSum(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &output)            ///< [out] Calling thread's output item.  May be aliased with \p input.
-    {
-        ExclusiveSum(input, output, Int2Type<IS_INTEGER>());
-    }
-
-
-    /**
-     * \brief Computes an exclusive prefix sum across the calling warp.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
-     *
-     * This operation assumes the value of obtained by the T's default
-     * constructor (or by zero-initialization if no user-defined default
-     * constructor exists) is suitable as the identity value "zero" for
-     * addition.
-     *
-     * \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp-wide exclusive prefix sums within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpScan for type int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for 4 warps
-     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Compute exclusive warp-wide prefix sums
-     *     int warp_aggregate;
-     *     int warp_id = threadIdx.x / 32;
-     *     WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data, warp_aggregate);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}.
-     * The corresponding output \p thread_data in each of the four warps of threads will be
-     * {0, 1, 2, ..., 31}.  Furthermore, \p warp_aggregate for all threads in all warps will be \p 32.
-     */
-    __device__ __forceinline__ void ExclusiveSum(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
-        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
-    {
-        ExclusiveSum(input, output, warp_aggregate, Int2Type<IS_INTEGER>());
-    }
-
-
-    /**
-     * \brief Computes an exclusive prefix sum across the calling warp.  Instead of using 0 as the warp-wide prefix, the call-back functor \p warp_prefix_op is invoked to provide the "seed" value that logically prefixes the warp's scan inputs.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
-     *
-     * This operation assumes the value of obtained by the T's default
-     * constructor (or by zero-initialization if no user-defined default
-     * constructor exists) is suitable as the identity value "zero" for
-     * addition.
-     *
-     * The \p warp_prefix_op functor must implement a member function T operator()(T warp_aggregate).
-     * The functor's input parameter \p warp_aggregate is the same value also returned by the scan operation.
-     * The functor will be invoked by the entire warp of threads, however only the return value from
-     * lane0 is applied as the threadblock-wide prefix.  Can be stateful.
-     *
-     * \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a single thread block of 32 threads (one warp) that progressively
-     * computes an exclusive prefix sum over multiple "tiles" of input using a
-     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
-     * of 32 integer items that are partitioned across the warp.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * // A stateful callback functor that maintains a running prefix to be applied
-     * // during consecutive scan operations.
-     * struct WarpPrefixCallbackOp
-     * {
-     *     // Running prefix
-     *     int running_total;
-     *
-     *     // Constructor
-     *     __device__ WarpPrefixCallbackOp(int running_total) : running_total(running_total) {}
-     *
-     *     // Callback operator to be entered by the entire warp.  Lane-0 is responsible
-     *     // for returning a value for seeding the warp-wide scan.
-     *     __device__ int operator()(int warp_aggregate)
-     *     {
-     *         int old_prefix = running_total;
-     *         running_total += warp_aggregate;
-     *         return old_prefix;
-     *     }
-     * };
-     *
-     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
-     * {
-     *     // Specialize WarpScan for int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for one warp
-     *     __shared__ typename WarpScan::TempStorage temp_storage;
-     *
-     *     // Initialize running total
-     *     WarpPrefixCallbackOp prefix_op(0);
-     *
-     *     // Have the warp iterate over segments of items
-     *     for (int block_offset = 0; block_offset < num_items; block_offset += 32)
-     *     {
-     *         // Load a segment of consecutive items
-     *         int thread_data = d_data[block_offset];
-     *
-     *         // Collectively compute the warp-wide exclusive prefix sum
-     *         int warp_aggregate;
-     *         WarpScan(temp_storage).ExclusiveSum(
-     *             thread_data, thread_data, warp_aggregate, prefix_op);
-     *
-     *         // Store scanned items to output segment
-     *         d_data[block_offset] = thread_data;
-     *     }
-     * \endcode
-     * \par
-     * Suppose the input \p d_data is {1, 1, 1, 1, 1, 1, 1, 1, ...}.
-     * The corresponding output for the first segment will be {0, 1, 2, ..., 31}.
-     * The output for the second segment will be {32, 33, 34, ..., 63}.  Furthermore,
-     * the value \p 32 will be stored in \p warp_aggregate for all threads after each scan.
-     *
-     * \tparam WarpPrefixCallbackOp    [inferred] Call-back functor type having member T operator()(T warp_aggregate)
-     */
-    template <typename WarpPrefixCallbackOp>
-    __device__ __forceinline__ void ExclusiveSum(
-        T                       input,              ///< [in] Calling thread's input item.
-        T                       &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
-        T                       &warp_aggregate,    ///< [out] [warp-lane0 only] Warp-wide aggregate reduction of input items (exclusive of the \p warp_prefix_op value).
-        WarpPrefixCallbackOp    &warp_prefix_op)    ///< [in-out] [warp-lane0 only] Call-back functor for specifying a warp-wide prefix to be applied to all inputs.
-    {
-        ExclusiveSum(input, output, warp_aggregate, warp_prefix_op, Int2Type<IS_INTEGER>());
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Inclusive prefix scans
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Computes an inclusive prefix scan using the specified binary scan functor across the calling warp.
-     *
-     * Supports non-commutative scan operators.
-     *
-     * \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpScan for type int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for 4 warps
-     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Compute inclusive warp-wide prefix max scans
-     *     int warp_id = threadIdx.x / 32;
-     *     WarpScan(temp_storage[warp_id]).InclusiveScan(thread_data, thread_data, cub::Max());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}.
-     * The corresponding output \p thread_data in the first warp would be
-     * {0, 0, 2, 2, ..., 30, 30}, the output for the second warp would be {32, 32, 34, 34, ..., 62, 62}, etc.
-     *
-     * \tparam ScanOp     [inferred] Binary scan operator type having member T operator()(const T &a, const T &b)
-     */
-    template <typename ScanOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
-        ScanOp          scan_op)            ///< [in] Binary scan operator
-    {
-        InternalWarpScan(temp_storage).InclusiveScan(input, output, scan_op);
-    }
-
-
-    /**
-     * \brief Computes an inclusive prefix scan using the specified binary scan functor across the calling warp.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
-     *
-     * Supports non-commutative scan operators.
-     *
-     * \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpScan for type int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for 4 warps
-     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Compute inclusive warp-wide prefix max scans
-     *     int warp_aggregate;
-     *     int warp_id = threadIdx.x / 32;
-     *     WarpScan(temp_storage[warp_id]).InclusiveScan(
-     *         thread_data, thread_data, cub::Max(), warp_aggregate);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}.
-     * The corresponding output \p thread_data in the first warp would be
-     * {0, 0, 2, 2, ..., 30, 30}, the output for the second warp would be {32, 32, 34, 34, ..., 62, 62}, etc.
-     * Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads
-     * in the second warp, etc.
-     *
-     * \tparam ScanOp     [inferred] Binary scan operator type having member T operator()(const T &a, const T &b)
-     */
-    template <typename ScanOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
-    {
-        InternalWarpScan(temp_storage).InclusiveScan(input, output, scan_op, warp_aggregate);
-    }
-
-
-    /**
-     * \brief Computes an inclusive prefix scan using the specified binary scan functor across the calling warp.  The call-back functor \p warp_prefix_op is invoked to provide the "seed" value that logically prefixes the warp's scan inputs.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
-     *
-     * The \p warp_prefix_op functor must implement a member function T operator()(T warp_aggregate).
-     * The functor's input parameter \p warp_aggregate is the same value also returned by the scan operation.
-     * The functor will be invoked by the entire warp of threads, however only the return value from
-     * lane0 is applied as the threadblock-wide prefix.  Can be stateful.
-     *
-     * Supports non-commutative scan operators.
-     *
-     * \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a single thread block of 32 threads (one warp) that progressively
-     * computes an inclusive prefix max scan over multiple "tiles" of input using a
-     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
-     * of 32 integer items that are partitioned across the warp.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * // A stateful callback functor that maintains a running prefix to be applied
-     * // during consecutive scan operations.
-     * struct WarpPrefixCallbackOp
-     * {
-     *     // Running prefix
-     *     int running_total;
-     *
-     *     // Constructor
-     *     __device__ WarpPrefixCallbackOp(int running_total) : running_total(running_total) {}
-     *
-     *     // Callback operator to be entered by the entire warp.  Lane-0 is responsible
-     *     // for returning a value for seeding the warp-wide scan.
-     *     __device__ int operator()(int warp_aggregate)
-     *     {
-     *         int old_prefix = running_total;
-     *         running_total = (warp_aggregate > old_prefix) ? warp_aggregate : old_prefix;
-     *         return old_prefix;
-     *     }
-     * };
-     *
-     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
-     * {
-     *     // Specialize WarpScan for int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for one warp
-     *     __shared__ typename WarpScan::TempStorage temp_storage;
-     *
-     *     // Initialize running total
-     *     WarpPrefixCallbackOp prefix_op(0);
-     *
-     *     // Have the warp iterate over segments of items
-     *     for (int block_offset = 0; block_offset < num_items; block_offset += 32)
-     *     {
-     *         // Load a segment of consecutive items
-     *         int thread_data = d_data[block_offset];
-     *
-     *         // Collectively compute the warp-wide inclusive prefix max scan
-     *         int warp_aggregate;
-     *         WarpScan(temp_storage).InclusiveScan(
-     *             thread_data, thread_data, cub::Max(), warp_aggregate, prefix_op);
-     *
-     *         // Store scanned items to output segment
-     *         d_data[block_offset] = thread_data;
-     *     }
-     * \endcode
-     * \par
-     * Suppose the input \p d_data is {0, -1, 2, -3, 4, -5, ...}.
-     * The corresponding output for the first segment will be {0, 0, 2, 2, ..., 30, 30}.
-     * The output for the second segment will be {32, 32, 34, 34, ..., 62, 62}.  Furthermore,
-     * \p block_aggregate will be assigned \p 30 in all threads after the first scan, assigned \p 62 after the second
-     * scan, etc.
-     *
-     * \tparam ScanOp                  [inferred] Binary scan operator type having member T operator()(const T &a, const T &b)
-     * \tparam WarpPrefixCallbackOp    [inferred] Call-back functor type having member T operator()(T warp_aggregate)
-     */
-    template <
-        typename ScanOp,
-        typename WarpPrefixCallbackOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T                       input,              ///< [in] Calling thread's input item.
-        T                       &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
-        ScanOp                  scan_op,            ///< [in] Binary scan operator
-        T                       &warp_aggregate,    ///< [out] [warp-lane0 only] Warp-wide aggregate reduction of input items (exclusive of the \p warp_prefix_op value).
-        WarpPrefixCallbackOp    &warp_prefix_op)    ///< [in-out] [warp-lane0 only] Call-back functor for specifying a warp-wide prefix to be applied to all inputs.
-    {
-        // Compute inclusive warp scan
-        InclusiveScan(input, output, scan_op, warp_aggregate);
-
-        // Compute warp-wide prefix from aggregate, then broadcast to other lanes
-        T prefix;
-        prefix = warp_prefix_op(warp_aggregate);
-        prefix = InternalWarpScan(temp_storage).Broadcast(prefix, 0);
-
-        // Update output
-        output = scan_op(prefix, output);
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Exclusive prefix scans
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp.
-     *
-     * Supports non-commutative scan operators.
-     *
-     * \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpScan for type int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for 4 warps
-     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Compute exclusive warp-wide prefix max scans
-     *     int warp_id = threadIdx.x / 32;
-     *     WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}.
-     * The corresponding output \p thread_data in the first warp would be
-     * {INT_MIN, 0, 0, 2, ..., 28, 30}, the output for the second warp would be {30, 32, 32, 34, ..., 60, 62}, etc.
-     *
-     * \tparam ScanOp     [inferred] Binary scan operator type having member T operator()(const T &a, const T &b)
-     */
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
-        T               identity,           ///< [in] Identity value
-        ScanOp          scan_op)            ///< [in] Binary scan operator
-    {
-        T inclusive_output;
-        InternalWarpScan(temp_storage).Scan(input, inclusive_output, output, identity, scan_op);
-    }
-
-
-    /**
-     * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
-     *
-     * Supports non-commutative scan operators.
-     *
-     * \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpScan for type int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for 4 warps
-     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Compute exclusive warp-wide prefix max scans
-     *     int warp_aggregate;
-     *     int warp_id = threadIdx.x / 32;
-     *     WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), warp_aggregate);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}.
-     * The corresponding output \p thread_data in the first warp would be
-     * {INT_MIN, 0, 0, 2, ..., 28, 30}, the output for the second warp would be {30, 32, 32, 34, ..., 60, 62}, etc.
-     * Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads
-     * in the second warp, etc.
-     *
-     * \tparam ScanOp     [inferred] Binary scan operator type having member T operator()(const T &a, const T &b)
-     */
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
-        T               identity,           ///< [in] Identity value
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
-    {
-        InternalWarpScan(temp_storage).ExclusiveScan(input, output, identity, scan_op, warp_aggregate);
-    }
-
-
-    /**
-     * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp.  The call-back functor \p warp_prefix_op is invoked to provide the "seed" value that logically prefixes the warp's scan inputs.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
-     *
-     * The \p warp_prefix_op functor must implement a member function T operator()(T warp_aggregate).
-     * The functor's input parameter \p warp_aggregate is the same value also returned by the scan operation.
-     * The functor will be invoked by the entire warp of threads, however only the return value from
-     * lane0 is applied as the threadblock-wide prefix.  Can be stateful.
-     *
-     * Supports non-commutative scan operators.
-     *
-     * \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a single thread block of 32 threads (one warp) that progressively
-     * computes an exclusive prefix max scan over multiple "tiles" of input using a
-     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
-     * of 32 integer items that are partitioned across the warp.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * // A stateful callback functor that maintains a running prefix to be applied
-     * // during consecutive scan operations.
-     * struct WarpPrefixCallbackOp
-     * {
-     *     // Running prefix
-     *     int running_total;
-     *
-     *     // Constructor
-     *     __device__ WarpPrefixCallbackOp(int running_total) : running_total(running_total) {}
-     *
-     *     // Callback operator to be entered by the entire warp.  Lane-0 is responsible
-     *     // for returning a value for seeding the warp-wide scan.
-     *     __device__ int operator()(int warp_aggregate)
-     *     {
-     *         int old_prefix = running_total;
-     *         running_total = (warp_aggregate > old_prefix) ? warp_aggregate : old_prefix;
-     *         return old_prefix;
-     *     }
-     * };
-     *
-     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
-     * {
-     *     // Specialize WarpScan for int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for one warp
-     *     __shared__ typename WarpScan::TempStorage temp_storage;
-     *
-     *     // Initialize running total
-     *     WarpPrefixCallbackOp prefix_op(INT_MIN);
-     *
-     *     // Have the warp iterate over segments of items
-     *     for (int block_offset = 0; block_offset < num_items; block_offset += 32)
-     *     {
-     *         // Load a segment of consecutive items
-     *         int thread_data = d_data[block_offset];
-     *
-     *         // Collectively compute the warp-wide exclusive prefix max scan
-     *         int warp_aggregate;
-     *         WarpScan(temp_storage).ExclusiveScan(
-     *             thread_data, thread_data, INT_MIN, cub::Max(), warp_aggregate, prefix_op);
-     *
-     *         // Store scanned items to output segment
-     *         d_data[block_offset] = thread_data;
-     *     }
-     * \endcode
-     * \par
-     * Suppose the input \p d_data is {0, -1, 2, -3, 4, -5, ...}.
-     * The corresponding output for the first segment will be {INT_MIN, 0, 0, 2, ..., 28, 30}.
-     * The output for the second segment will be {30, 32, 32, 34, ..., 60, 62}.  Furthermore,
-     * \p block_aggregate will be assigned \p 30 in all threads after the first scan, assigned \p 62 after the second
-     * scan, etc.
-     *
-     * \tparam ScanOp                  [inferred] Binary scan operator type having member T operator()(const T &a, const T &b)
-     * \tparam WarpPrefixCallbackOp    [inferred] Call-back functor type having member T operator()(T warp_aggregate)
-     */
-    template <
-        typename ScanOp,
-        typename WarpPrefixCallbackOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T                       input,              ///< [in] Calling thread's input item.
-        T                       &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
-        T                       identity,           ///< [in] Identity value
-        ScanOp                  scan_op,            ///< [in] Binary scan operator
-        T                       &warp_aggregate,    ///< [out] [warp-lane0 only] Warp-wide aggregate reduction of input items (exclusive of the \p warp_prefix_op value).
-        WarpPrefixCallbackOp    &warp_prefix_op)    ///< [in-out] [warp-lane0 only] Call-back functor for specifying a warp-wide prefix to be applied to all inputs.
- { - // Exclusive warp scan - ExclusiveScan(input, output, identity, scan_op, warp_aggregate); - - // Compute warp-wide prefix from aggregate, then broadcast to other lanes - T prefix = warp_prefix_op(warp_aggregate); - prefix = InternalWarpScan(temp_storage).Broadcast(prefix, 0); - - // Update output - output = (lane_id == 0) ? - prefix : - scan_op(prefix, output); - } - - - //@} end member group - /******************************************************************//** - * \name Identityless exclusive prefix scans - *********************************************************************/ - //@{ - - - /** - * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp. Because no identity value is supplied, the \p output computed for warp-lane0 is undefined. - * - * Supports non-commutative scan operators. - * - * \smemreuse - * - * \par Snippet - * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpScan for type int - * typedef cub::WarpScan WarpScan; - * - * // Allocate WarpScan shared memory for 4 warps - * __shared__ typename WarpScan::TempStorage temp_storage[4]; - * - * // Obtain one input item per thread - * int thread_data = ... - * - * // Compute exclusive warp-wide prefix max scans - * int warp_id = threadIdx.x / 32; - * WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, cub::Max()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. - * The corresponding output \p thread_data in the first warp would be - * ?, 0, 0, 2, ..., 28, 30, the output for the second warp would be ?, 32, 32, 34, ..., 60, 62, etc. - * (The output \p thread_data in warp lane0 is undefined.) - * - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item. - T &output, ///< [out] Calling thread's output item. May be aliased with \p input. - ScanOp scan_op) ///< [in] Binary scan operator - { - T inclusive_output; - InternalWarpScan(temp_storage).Scan(input, inclusive_output, output, scan_op); - } - - - /** - * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp. Because no identity value is supplied, the \p output computed for warp-lane0 is undefined. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. - * - * Supports non-commutative scan operators. - * - * \smemreuse - * - * \par Snippet - * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpScan for type int - * typedef cub::WarpScan WarpScan; - * - * // Allocate WarpScan shared memory for 4 warps - * __shared__ typename WarpScan::TempStorage temp_storage[4]; - * - * // Obtain one input item per thread - * int thread_data = ... 
- * - * // Compute exclusive warp-wide prefix max scans - * int warp_aggregate; - * int warp_id = threadIdx.x / 32; - * WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, cub::Max(), warp_aggregate); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. - * The corresponding output \p thread_data in the first warp would be - * ?, 0, 0, 2, ..., 28, 30, the output for the second warp would be ?, 32, 32, 34, ..., 60, 62, etc. - * (The output \p thread_data in warp lane0 is undefined.) Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads - * in the second warp, etc. - * - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item. - T &output, ///< [out] Calling thread's output item. May be aliased with \p input. - ScanOp scan_op, ///< [in] Binary scan operator - T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. - { - InternalWarpScan(temp_storage).ExclusiveScan(input, output, scan_op, warp_aggregate); - } - - - /** - * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp. The \p warp_prefix_op value from warp-lane0 is applied to all scan outputs. Also computes the warp-wide \p warp_aggregate of all inputs for warp-lane0. - * - * The \p warp_prefix_op functor must implement a member function T operator()(T warp_aggregate)}. - * The functor's input parameter \p warp_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the entire warp of threads, however only the return value from - * lane0 is applied as the threadblock-wide prefix. Can be stateful. - * - * Supports non-commutative scan operators. - * - * \smemreuse - * - * \par Snippet - * The code snippet below illustrates a single thread block of 32 threads (one warp) that progressively - * computes an exclusive prefix max scan over multiple "tiles" of input using a - * prefix functor to maintain a running total between block-wide scans. Each tile consists - * of 32 integer items that are partitioned across the warp. - * \par - * \code - * #include - * - * // A stateful callback functor that maintains a running prefix to be applied - * // during consecutive scan operations. - * struct WarpPrefixCallbackOp - * { - * // Running prefix - * int running_total; - * - * // Constructor - * __device__ WarpPrefixCallbackOp(int running_total) : running_total(running_total) {} - * - * // Callback operator to be entered by the entire warp. Lane-0 is responsible - * // for returning a value for seeding the warp-wide scan. - * __device__ int operator()(int warp_aggregate) - * { - * int old_prefix = running_total; - * running_total = (warp_aggregate > old_prefix) ? warp_aggregate : old_prefix; - * return old_prefix; - * } - * }; - * - * __global__ void ExampleKernel(int *d_data, int num_items, ...) 
- * { - * // Specialize WarpScan for int - * typedef cub::WarpScan WarpScan; - * - * // Allocate WarpScan shared memory for one warp - * __shared__ typename WarpScan::TempStorage temp_storage; - * - * // Initialize running total - * WarpPrefixCallbackOp prefix_op(INT_MIN); - * - * // Have the warp iterate over segments of items - * for (int block_offset = 0; block_offset < num_items; block_offset += 32) - * { - * // Load a segment of consecutive items - * int thread_data = d_data[block_offset]; - * - * // Collectively compute the warp-wide exclusive prefix max scan - * int warp_aggregate; - * WarpScan(temp_storage).ExclusiveScan( - * thread_data, thread_data, INT_MIN, cub::Max(), warp_aggregate, prefix_op); - * - * // Store scanned items to output segment - * d_data[block_offset] = thread_data; - * } - * \endcode - * \par - * Suppose the input \p d_data is {0, -1, 2, -3, 4, -5, ...}. - * The corresponding output for the first segment will be {INT_MIN, 0, 0, 2, ..., 28, 30}. - * The output for the second segment will be {30, 32, 32, 34, ..., 60, 62}. Furthermore, - * \p block_aggregate will be assigned \p 30 in all threads after the first scan, assigned \p 62 after the second - * scan, etc. - * - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - * \tparam WarpPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T warp_aggregate) - */ - template < - typename ScanOp, - typename WarpPrefixCallbackOp> - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item. - T &output, ///< [out] Calling thread's output item. May be aliased with \p input. - ScanOp scan_op, ///< [in] Binary scan operator - T &warp_aggregate, ///< [out] [warp-lane0 only] Warp-wide aggregate reduction of input items (exclusive of the \p warp_prefix_op value). - WarpPrefixCallbackOp &warp_prefix_op) ///< [in-out] [warp-lane0 only] Call-back functor for specifying a warp-wide prefix to be applied to all inputs. - { - // Exclusive warp scan - ExclusiveScan(input, output, scan_op, warp_aggregate); - - // Compute warp-wide prefix from aggregate, then broadcast to other lanes - T prefix = warp_prefix_op(warp_aggregate); - prefix = InternalWarpScan(temp_storage).Broadcast(prefix, 0); - - // Update output with prefix - output = (lane_id == 0) ? - prefix : - scan_op(prefix, output); - } - - //@} end member group - /******************************************************************//** - * \name Combination (inclusive & exclusive) prefix scans - *********************************************************************/ - //@{ - - /** - * \brief Computes both inclusive and exclusive prefix sums across the calling warp. - * - * This operation assumes the value of obtained by the T's default - * constructor (or by zero-initialization if no user-defined default - * constructor exists) is suitable as the identity value "zero" for - * addition. - * - * \smemreuse - * - * \par Snippet - * The code snippet below illustrates four concurrent warp-wide prefix sums within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpScan for type int - * typedef cub::WarpScan WarpScan; - * - * // Allocate WarpScan shared memory for 4 warps - * __shared__ typename WarpScan::TempStorage temp_storage[4]; - * - * // Obtain one input item per thread - * int thread_data = ... 
- * - * // Compute in|exclusive warp-wide prefix sums - * int inclusive_partial, exclusive_partial; - * int warp_id = threadIdx.x / 32; - * WarpScan(temp_storage[warp_id]).Sum(thread_data, inclusive_partial, exclusive_partial); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}. - * The corresponding output \p inclusive_partial in each of the four warps of threads will be - * 1, 2, 3, ..., 32}. - * The corresponding output \p exclusive_partial in each of the four warps of threads will be - * 0, 1, 2, ..., 31}. - * - */ - __device__ __forceinline__ void Sum( - T input, ///< [in] Calling thread's input item. - T &inclusive_output, ///< [out] Calling thread's inclusive-scan output item. - T &exclusive_output) ///< [out] Calling thread's exclusive-scan output item. - { - Sum(input, inclusive_output, exclusive_output, Int2Type()); - } - - - /** - * \brief Computes both inclusive and exclusive prefix scans using the specified binary scan functor across the calling warp. - * - * Supports non-commutative scan operators. - * - * \smemreuse - * - * \par Snippet - * The code snippet below illustrates four concurrent warp-wide prefix max scans within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpScan for type int - * typedef cub::WarpScan WarpScan; - * - * // Allocate WarpScan shared memory for 4 warps - * __shared__ typename WarpScan::TempStorage temp_storage[4]; - * - * // Obtain one input item per thread - * int thread_data = ... - * - * // Compute inclusive warp-wide prefix max scans - * int warp_id = threadIdx.x / 32; - * int inclusive_partial, exclusive_partial; - * WarpScan(temp_storage[warp_id]).Scan(thread_data, inclusive_partial, exclusive_partial, INT_MIN, cub::Max()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. - * The corresponding output \p inclusive_partial in the first warp would be - * 0, 0, 2, 2, ..., 30, 30, the output for the second warp would be 32, 32, 34, 34, ..., 62, 62, etc. - * The corresponding output \p exclusive_partial in the first warp would be - * INT_MIN, 0, 0, 2, ..., 28, 30, the output for the second warp would be 30, 32, 32, 34, ..., 60, 62, etc. - * - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void Scan( - T input, ///< [in] Calling thread's input item. - T &inclusive_output, ///< [out] Calling thread's inclusive-scan output item. - T &exclusive_output, ///< [out] Calling thread's exclusive-scan output item. - T identity, ///< [in] Identity value - ScanOp scan_op) ///< [in] Binary scan operator - { - InternalWarpScan(temp_storage).Scan(input, inclusive_output, exclusive_output, identity, scan_op); - } - - - /** - * \brief Computes both inclusive and exclusive prefix scans using the specified binary scan functor across the calling warp. Because no identity value is supplied, the \p exclusive_output computed for warp-lane0 is undefined. - * - * Supports non-commutative scan operators. - * - * \smemreuse - * - * \par Snippet - * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) 
- * { - * // Specialize WarpScan for type int - * typedef cub::WarpScan WarpScan; - * - * // Allocate WarpScan shared memory for 4 warps - * __shared__ typename WarpScan::TempStorage temp_storage[4]; - * - * // Obtain one input item per thread - * int thread_data = ... - * - * // Compute exclusive warp-wide prefix max scans - * int inclusive_partial, exclusive_partial; - * WarpScan(temp_storage[warp_id]).Scan(thread_data, inclusive_partial, exclusive_partial, cub::Max()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. - * The corresponding output \p inclusive_partial in the first warp would be - * 0, 0, 2, 2, ..., 30, 30, the output for the second warp would be 32, 32, 34, 34, ..., 62, 62, etc. - * The corresponding output \p exclusive_partial in the first warp would be - * ?, 0, 0, 2, ..., 28, 30, the output for the second warp would be ?, 32, 32, 34, ..., 60, 62, etc. - * (The output \p thread_data in warp lane0 is undefined.) - * - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void Scan( - T input, ///< [in] Calling thread's input item. - T &inclusive_output, ///< [out] Calling thread's inclusive-scan output item. - T &exclusive_output, ///< [out] Calling thread's exclusive-scan output item. - ScanOp scan_op) ///< [in] Binary scan operator - { - InternalWarpScan(temp_storage).Scan(input, inclusive_output, exclusive_output, scan_op); - } - - - //@} end member group -}; - -/** @} */ // end group WarpModule - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * The cub::WarpScan class provides [collective](index.html#sec0) methods for computing a parallel prefix scan of items partitioned across a CUDA thread warp.
+ */
+
+#pragma once
+
+#include "specializations/warp_scan_shfl.cuh"
+#include "specializations/warp_scan_smem.cuh"
+#include "../thread/thread_operators.cuh"
+#include "../util_arch.cuh"
+#include "../util_type.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+/**
+ * \addtogroup WarpModule
+ * @{
+ */
+
+/**
+ * \brief The WarpScan class provides [collective](index.html#sec0) methods for computing a parallel prefix scan of items partitioned across a CUDA thread warp. ![](warp_scan_logo.png)
+ *
+ * \tparam T                    The scan input/output element type
+ * \tparam LOGICAL_WARP_THREADS [optional] The number of threads per "logical" warp (may be less than the number of hardware warp threads). Default is the warp size associated with the CUDA Compute Capability targeted by the compiler (e.g., 32 threads for SM20).
+ * \tparam PTX_ARCH             [optional] \ptxversion
+ *
+ * \par Overview
+ * - Given a list of input elements and a binary reduction operator, a [prefix scan](http://en.wikipedia.org/wiki/Prefix_sum)
+ *   produces an output list where each element is computed to be the reduction
+ *   of the elements occurring earlier in the input list. Prefix sum
+ *   connotes a prefix scan with the addition operator. The term \em inclusive indicates
+ *   that the ith output reduction incorporates the ith input.
+ *   The term \em exclusive indicates the ith input is not incorporated into
+ *   the ith output reduction.
+ * - Supports non-commutative scan operators
+ * - Supports "logical" warps smaller than the physical warp size (e.g., a logical warp of 8 threads)
+ * - The number of entrant threads must be a multiple of \p LOGICAL_WARP_THREADS
+ *
+ * \par Performance Considerations
+ * - Uses special instructions when applicable (e.g., warp \p SHFL)
+ * - Uses synchronization-free communication between warp lanes when applicable
+ * - Incurs zero bank conflicts for most types
+ * - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
+ *   - Summation (vs. generic scan)
+ *   - The architecture's warp size is a whole multiple of \p LOGICAL_WARP_THREADS
+ *
+ * \par Simple Examples
+ * \warpcollective{WarpScan}
+ * \par
+ * The code snippet below illustrates four concurrent warp prefix sums within a block of
+ * 128 threads (one per each of the 32-thread warps).
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Specialize WarpScan for type int
+ *     typedef cub::WarpScan<int> WarpScan;
+ *
+ *     // Allocate WarpScan shared memory for 4 warps
+ *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+ *
+ *     // Obtain one input item per thread
+ *     int thread_data = ...
+ *
+ *     // Compute warp-wide prefix sums
+ *     int warp_id = threadIdx.x / 32;
+ *     WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}.
+ * The corresponding output \p thread_data in each of the four warps of threads will be
+ * {0, 1, 2, 3, ..., 31}.
+ *
+ * \par
+ * The code snippet below illustrates a single warp prefix sum within a block of
+ * 128 threads.
+ * \par
+ * \code
+ * #include <cub/cub.cuh>
+ *
+ * __global__ void ExampleKernel(...)
+ * {
+ *     // Specialize WarpScan for type int
+ *     typedef cub::WarpScan<int> WarpScan;
+ *
+ *     // Allocate WarpScan shared memory for one warp
+ *     __shared__ typename WarpScan::TempStorage temp_storage;
+ *     ...
+ *
+ *     // Only the first warp performs a prefix sum
+ *     if (threadIdx.x < 32)
+ *     {
+ *         // Obtain one input item per thread
+ *         int thread_data = ...
+ *
+ *         // Compute warp-wide prefix sums
+ *         WarpScan(temp_storage).ExclusiveSum(thread_data, thread_data);
+ *
+ * \endcode
+ * \par
+ * Suppose the set of input \p thread_data across the warp of threads is {1, 1, 1, 1, ...}.
+ * The corresponding output \p thread_data will be {0, 1, 2, 3, ..., 31}.
+ *
+ */
+template <
+    typename    T,
+    int         LOGICAL_WARP_THREADS    = CUB_PTX_WARP_THREADS,
+    int         PTX_ARCH                = CUB_PTX_ARCH>
+class WarpScan
+{
+private:
+
+    /******************************************************************************
+     * Constants and type definitions
+     ******************************************************************************/
+
+    enum
+    {
+        /// Whether the logical warp size and the PTX warp size coincide
+        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
+
+        /// Whether the logical warp size is a power-of-two
+        IS_POW_OF_TWO = ((LOGICAL_WARP_THREADS & (LOGICAL_WARP_THREADS - 1)) == 0),
+
+        /// Whether the data type is an integer (which has fully-associative addition)
+        IS_INTEGER = ((Traits<T>::CATEGORY == SIGNED_INTEGER) || (Traits<T>::CATEGORY == UNSIGNED_INTEGER))
+    };
+
+    /// Internal specialization. Use SHFL-based scan if (architecture is >= SM30) and (LOGICAL_WARP_THREADS is a power-of-two)
+    typedef typename If<(PTX_ARCH >= 300) && (IS_POW_OF_TWO),
+        WarpScanShfl<T, LOGICAL_WARP_THREADS, PTX_ARCH>,
+        WarpScanSmem<T, LOGICAL_WARP_THREADS, PTX_ARCH> >::Type InternalWarpScan;
+
+    /// Shared memory storage layout type for WarpScan
+    typedef typename InternalWarpScan::TempStorage _TempStorage;
+
+
+    /******************************************************************************
+     * Thread fields
+     ******************************************************************************/
+
+    /// Shared storage reference
+    _TempStorage    &temp_storage;
+    unsigned int    lane_id;
+
+
+
+    /******************************************************************************
+     * Public types
+     ******************************************************************************/
+
+public:
+
+    /// \smemstorage{WarpScan}
+    struct TempStorage : Uninitialized<_TempStorage> {};
+
+
+    /******************************************************************//**
+     * \name Collective constructors
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Collective constructor using the specified memory allocation as temporary storage. Logical warp and lane identifiers are constructed from threadIdx.x.
+     */
+    __device__ __forceinline__ WarpScan(
+        TempStorage &temp_storage)  ///< [in] Reference to memory allocation having layout type TempStorage
+    :
+        temp_storage(temp_storage.Alias()),
+        lane_id(IS_ARCH_WARP ?
+            LaneId() :
+            LaneId() % LOGICAL_WARP_THREADS)
+    {}
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Inclusive prefix sums
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes an inclusive prefix sum across the calling warp.
+     *
+     * \par
+     *  - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide inclusive prefix sums within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute inclusive warp-wide prefix sums
+     *     int warp_id = threadIdx.x / 32;
+     *     WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}.
+     * The corresponding output \p thread_data in each of the four warps of threads will be
+     * {1, 2, 3, ..., 32}.
+     */
+    __device__ __forceinline__ void InclusiveSum(
+        T input,              ///< [in] Calling thread's input item.
+        T &inclusive_output)  ///< [out] Calling thread's output item. May be aliased with \p input.
+    {
+        InclusiveScan(input, inclusive_output, cub::Sum());
+    }
+
+
+    /**
+     * \brief Computes an inclusive prefix sum across the calling warp. Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
+     *
+     * \par
+     *  - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide inclusive prefix sums within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute inclusive warp-wide prefix sums
+     *     int warp_aggregate;
+     *     int warp_id = threadIdx.x / 32;
+     *     WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data, warp_aggregate);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}.
+     * The corresponding output \p thread_data in each of the four warps of threads will be
+     * {1, 2, 3, ..., 32}. Furthermore, \p warp_aggregate for all threads in all warps will be \p 32.
+     */
+    __device__ __forceinline__ void InclusiveSum(
+        T input,              ///< [in] Calling thread's input item.
+        T &inclusive_output,  ///< [out] Calling thread's output item. May be aliased with \p input.
+        T &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
+    {
+        InclusiveScan(input, inclusive_output, cub::Sum(), warp_aggregate);
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Exclusive prefix sums
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes an exclusive prefix sum across the calling warp. The value of 0 is applied as the initial value, and is assigned to \p exclusive_output in thread0.
+     *
+     * \par
+     *  - \identityzero
+     *  - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide exclusive prefix sums within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute exclusive warp-wide prefix sums
+     *     int warp_id = threadIdx.x / 32;
+     *     WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}.
+     * The corresponding output \p thread_data in each of the four warps of threads will be
+     * {0, 1, 2, ..., 31}.
+     *
+     */
+    __device__ __forceinline__ void ExclusiveSum(
+        T input,              ///< [in] Calling thread's input item.
+        T &exclusive_output)  ///< [out] Calling thread's output item. May be aliased with \p input.
+    {
+        T initial_value = 0;
+        ExclusiveScan(input, exclusive_output, initial_value, cub::Sum());
+    }
+
+
+    /**
+     * \brief Computes an exclusive prefix sum across the calling warp. The value of 0 is applied as the initial value, and is assigned to \p exclusive_output in thread0. Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
+     *
+     * \par
+     *  - \identityzero
+     *  - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide exclusive prefix sums within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute exclusive warp-wide prefix sums
+     *     int warp_aggregate;
+     *     int warp_id = threadIdx.x / 32;
+     *     WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data, warp_aggregate);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}.
+     * The corresponding output \p thread_data in each of the four warps of threads will be
+     * {0, 1, 2, ..., 31}. Furthermore, \p warp_aggregate for all threads in all warps will be \p 32.
+     */
+    __device__ __forceinline__ void ExclusiveSum(
+        T input,              ///< [in] Calling thread's input item.
+        T &exclusive_output,  ///< [out] Calling thread's output item. May be aliased with \p input.
+        T &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
+    {
+        T initial_value = 0;
+        ExclusiveScan(input, exclusive_output, initial_value, cub::Sum(), warp_aggregate);
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Inclusive prefix scans
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Computes an inclusive prefix scan using the specified binary scan functor across the calling warp.
+     *
+     * \par
+     *  - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute inclusive warp-wide prefix max scans
+     *     int warp_id = threadIdx.x / 32;
+     *     WarpScan(temp_storage[warp_id]).InclusiveScan(thread_data, thread_data, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}.
+     * The corresponding output \p thread_data in the first warp would be
+     * 0, 0, 2, 2, ..., 30, 30, the output for the second warp would be 32, 32, 34, 34, ..., 62, 62, etc.
+     *
+     * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b)
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T input,              ///< [in] Calling thread's input item.
+        T &inclusive_output,  ///< [out] Calling thread's output item. May be aliased with \p input.
+        ScanOp scan_op)       ///< [in] Binary scan operator
+    {
+        InternalWarpScan(temp_storage).InclusiveScan(input, inclusive_output, scan_op);
+    }
+
+
+    /**
+     * \brief Computes an inclusive prefix scan using the specified binary scan functor across the calling warp. Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
+     *
+     * \par
+     *  - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute inclusive warp-wide prefix max scans
+     *     int warp_aggregate;
+     *     int warp_id = threadIdx.x / 32;
+     *     WarpScan(temp_storage[warp_id]).InclusiveScan(
+     *         thread_data, thread_data, cub::Max(), warp_aggregate);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}.
+     * The corresponding output \p thread_data in the first warp would be
+     * 0, 0, 2, 2, ..., 30, 30, the output for the second warp would be 32, 32, 34, 34, ..., 62, 62, etc.
+     * Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads
+     * in the second warp, etc.
+     *
+     * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b)
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveScan(
+        T input,              ///< [in] Calling thread's input item.
+        T &inclusive_output,  ///< [out] Calling thread's output item. May be aliased with \p input.
+        ScanOp scan_op,       ///< [in] Binary scan operator
+        T &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
+    {
+        InternalWarpScan(temp_storage).InclusiveScan(input, inclusive_output, scan_op, warp_aggregate);
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Exclusive prefix scans
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp.
+     * Because no initial value is supplied, the \p output computed for warp-lane0 is undefined.
+     *
+     * \par
+     *  - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute exclusive warp-wide prefix max scans
+     *     int warp_id = threadIdx.x / 32;
+     *     WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}.
+     * The corresponding output \p thread_data in the first warp would be
+     * ?, 0, 0, 2, ..., 28, 30, the output for the second warp would be ?, 32, 32, 34, ..., 60, 62, etc.
+     * (The output \p thread_data in warp lane0 is undefined.)
+     *
+     * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b)
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T input,              ///< [in] Calling thread's input item.
+        T &exclusive_output,  ///< [out] Calling thread's output item. May be aliased with \p input.
+        ScanOp scan_op)       ///< [in] Binary scan operator
+    {
+        InternalWarpScan internal(temp_storage);
+
+        T inclusive_output;
+        internal.InclusiveScan(input, inclusive_output, scan_op);
+
+        internal.Update(
+            input,
+            inclusive_output,
+            exclusive_output,
+            scan_op,
+            Int2Type<IS_INTEGER>());
+    }
+
+
+    /**
+     * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp.
+     *
+     * \par
+     *  - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute exclusive warp-wide prefix max scans
+     *     int warp_id = threadIdx.x / 32;
+     *     WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}.
+     * The corresponding output \p thread_data in the first warp would be
+     * INT_MIN, 0, 0, 2, ..., 28, 30, the output for the second warp would be 30, 32, 32, 34, ..., 60, 62, etc.
+     *
+     * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b)
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T input,              ///< [in] Calling thread's input item.
+        T &exclusive_output,  ///< [out] Calling thread's output item. May be aliased with \p input.
+        T initial_value,      ///< [in] Initial value to seed the exclusive scan
+        ScanOp scan_op)       ///< [in] Binary scan operator
+    {
+        InternalWarpScan internal(temp_storage);
+
+        T inclusive_output;
+        internal.InclusiveScan(input, inclusive_output, scan_op);
+
+        internal.Update(
+            input,
+            inclusive_output,
+            exclusive_output,
+            scan_op,
+            initial_value,
+            Int2Type<IS_INTEGER>());
+    }
+
+
+    /**
+     * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp. Because no initial value is supplied, the \p output computed for warp-lane0 is undefined. Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
+     *
+     * \par
+     *  - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute exclusive warp-wide prefix max scans
+     *     int warp_aggregate;
+     *     int warp_id = threadIdx.x / 32;
+     *     WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, cub::Max(), warp_aggregate);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}.
+     * The corresponding output \p thread_data in the first warp would be
+     * ?, 0, 0, 2, ..., 28, 30, the output for the second warp would be ?, 32, 32, 34, ..., 60, 62, etc.
+     * (The output \p thread_data in warp lane0 is undefined.) Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads
+     * in the second warp, etc.
+     *
+     * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b)
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T input,              ///< [in] Calling thread's input item.
+        T &exclusive_output,  ///< [out] Calling thread's output item. May be aliased with \p input.
+        ScanOp scan_op,       ///< [in] Binary scan operator
+        T &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
+    {
+        InternalWarpScan internal(temp_storage);
+
+        T inclusive_output;
+        internal.InclusiveScan(input, inclusive_output, scan_op);
+
+        internal.Update(
+            input,
+            inclusive_output,
+            exclusive_output,
+            warp_aggregate,
+            scan_op,
+            Int2Type<IS_INTEGER>());
+    }
+
+
+    /**
+     * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp. Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
+     *
+     * \par
+     *  - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute exclusive warp-wide prefix max scans
+     *     int warp_aggregate;
+     *     int warp_id = threadIdx.x / 32;
+     *     WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), warp_aggregate);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}.
+     * The corresponding output \p thread_data in the first warp would be
+     * INT_MIN, 0, 0, 2, ..., 28, 30, the output for the second warp would be 30, 32, 32, 34, ..., 60, 62, etc.
+     * Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads
+     * in the second warp, etc.
+     *
+     * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b)
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveScan(
+        T input,              ///< [in] Calling thread's input item.
+        T &exclusive_output,  ///< [out] Calling thread's output item. May be aliased with \p input.
+        T initial_value,      ///< [in] Initial value to seed the exclusive scan
+        ScanOp scan_op,       ///< [in] Binary scan operator
+        T &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
+    {
+        InternalWarpScan internal(temp_storage);
+
+        T inclusive_output;
+        internal.InclusiveScan(input, inclusive_output, scan_op);
+
+        internal.Update(
+            input,
+            inclusive_output,
+            exclusive_output,
+            warp_aggregate,
+            scan_op,
+            initial_value,
+            Int2Type<IS_INTEGER>());
+    }
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Combination (inclusive & exclusive) prefix scans
+     *********************************************************************/
+    //@{
+
+
+    /**
+     * \brief Computes both inclusive and exclusive prefix scans using the specified binary scan functor across the calling warp. Because no initial value is supplied, the \p exclusive_output computed for warp-lane0 is undefined.
+     *
+     * \par
+     *  - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute exclusive warp-wide prefix max scans
+     *     int warp_id = threadIdx.x / 32;
+     *     int inclusive_partial, exclusive_partial;
+     *     WarpScan(temp_storage[warp_id]).Scan(thread_data, inclusive_partial, exclusive_partial, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}.
+     * The corresponding output \p inclusive_partial in the first warp would be
+     * 0, 0, 2, 2, ..., 30, 30, the output for the second warp would be 32, 32, 34, 34, ..., 62, 62, etc.
+     * The corresponding output \p exclusive_partial in the first warp would be
+     * ?, 0, 0, 2, ..., 28, 30, the output for the second warp would be ?, 32, 32, 34, ..., 60, 62, etc.
+     * (The output \p thread_data in warp lane0 is undefined.)
+     *
+     * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b)
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void Scan(
+        T input,              ///< [in] Calling thread's input item.
+        T &inclusive_output,  ///< [out] Calling thread's inclusive-scan output item.
+        T &exclusive_output,  ///< [out] Calling thread's exclusive-scan output item.
+        ScanOp scan_op)       ///< [in] Binary scan operator
+    {
+        InternalWarpScan internal(temp_storage);
+
+        internal.InclusiveScan(input, inclusive_output, scan_op);
+
+        internal.Update(
+            input,
+            inclusive_output,
+            exclusive_output,
+            scan_op,
+            Int2Type<IS_INTEGER>());
+    }
+
+
+    /**
+     * \brief Computes both inclusive and exclusive prefix scans using the specified binary scan functor across the calling warp.
+     *
+     * \par
+     *  - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates four concurrent warp-wide prefix max scans within a block of
+     * 128 threads (one per each of the 32-thread warps).
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Compute inclusive warp-wide prefix max scans
+     *     int warp_id = threadIdx.x / 32;
+     *     int inclusive_partial, exclusive_partial;
+     *     WarpScan(temp_storage[warp_id]).Scan(thread_data, inclusive_partial, exclusive_partial, INT_MIN, cub::Max());
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}.
+     * The corresponding output \p inclusive_partial in the first warp would be
+     * 0, 0, 2, 2, ..., 30, 30, the output for the second warp would be 32, 32, 34, 34, ..., 62, 62, etc.
+     * The corresponding output \p exclusive_partial in the first warp would be
+     * INT_MIN, 0, 0, 2, ..., 28, 30, the output for the second warp would be 30, 32, 32, 34, ..., 60, 62, etc.
+     *
+     * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b)
+     */
+    template <typename ScanOp>
+    __device__ __forceinline__ void Scan(
+        T input,              ///< [in] Calling thread's input item.
+        T &inclusive_output,  ///< [out] Calling thread's inclusive-scan output item.
+        T &exclusive_output,  ///< [out] Calling thread's exclusive-scan output item.
+        T initial_value,      ///< [in] Initial value to seed the exclusive scan
+        ScanOp scan_op)       ///< [in] Binary scan operator
+    {
+        InternalWarpScan internal(temp_storage);
+
+        internal.InclusiveScan(input, inclusive_output, scan_op);
+
+        internal.Update(
+            input,
+            inclusive_output,
+            exclusive_output,
+            scan_op,
+            initial_value,
+            Int2Type<IS_INTEGER>());
+    }
+
+
+
+    //@}  end member group
+    /******************************************************************//**
+     * \name Data exchange
+     *********************************************************************/
+    //@{
+
+    /**
+     * \brief Broadcast the value \p input from warp-lane src_lane to all lanes in the warp
+     *
+     * \par
+     *  - \smemreuse
+     *
+     * \par Snippet
+     * The code snippet below illustrates the warp-wide broadcasts of values from
+     * lane0 in each of four warps to all other threads in those warps.
+     * \par
+     * \code
+     * #include <cub/cub.cuh>
+     *
+     * __global__ void ExampleKernel(...)
+     * {
+     *     // Specialize WarpScan for type int
+     *     typedef cub::WarpScan<int> WarpScan;
+     *
+     *     // Allocate WarpScan shared memory for 4 warps
+     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
+     *
+     *     // Obtain one input item per thread
+     *     int thread_data = ...
+     *
+     *     // Broadcast from lane0 in each warp to all other threads in the warp
+     *     int warp_id = threadIdx.x / 32;
+     *     thread_data = WarpScan(temp_storage[warp_id]).Broadcast(thread_data, 0);
+     *
+     * \endcode
+     * \par
+     * Suppose the set of input \p thread_data across the block of threads is {0, 1, 2, 3, ..., 127}.
+     * The corresponding output \p thread_data will be
+     * {0, 0, ..., 0} in warp0,
+     * {32, 32, ..., 32} in warp1,
+     * {64, 64, ..., 64} in warp2, etc.
+     */
+    __device__ __forceinline__ T Broadcast(
+        T input,                ///< [in] The value to broadcast
+        unsigned int src_lane)  ///< [in] Which warp lane is to do the broadcasting
+    {
+        return InternalWarpScan(temp_storage).Broadcast(input, src_lane);
+    }
+
+    //@}  end member group
+
+};
+
+/** @} */       // end group WarpModule
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)

From 1626a06b7f8368b429415bd818ce5c19b5eeb34d Mon Sep 17 00:00:00 2001
From: Sherry Li
Date: Sat, 26 Jun 2021 09:48:36 -0700
Subject: [PATCH 102/147] Disable the inclusion of mkl.h.

---
 SRC/dscatter3d.c            |  15 +-
 SRC/dtreeFactorizationGPU.c |   7 +-
 SRC/dtrfCommWrapper.c       | 345 ++++++++++++++++++------------------
 SRC/pdgstrf.c               |   1 -
 SRC/pzgstrf.c               |   1 -
 SRC/scatter.c               |   7 -
 SRC/superlu_defs.h          |   6 +-
 SRC/supernodal_etree.c      |   2 +-
 SRC/zscatter3d.c            |  15 +-
 SRC/ztreeFactorizationGPU.c |   7 +-
 SRC/ztrfCommWrapper.c       |  64 ++-----
 11 files changed, 215 insertions(+), 255 deletions(-)

diff --git a/SRC/dscatter3d.c b/SRC/dscatter3d.c
index 3abb2440..53af7944 100644
--- a/SRC/dscatter3d.c
+++ b/SRC/dscatter3d.c
@@ -10,15 +10,22 @@ at the top-level directory.
 */
 
+/*! @file
+ * \brief Scatter the computed blocks into LU destination.
+ *
+ *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ */
+
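The scatter step this file implements moves a dense block computed by the Schur-complement GEMM into the sparse L/U destination through row and column index maps. A minimal sketch of that access pattern, with purely illustrative names (nrows, ncols, dest, ldd, tempv, indirect_row, indirect_col are not the routine's actual arguments):

/* Editor's sketch (not part of the patch): subtract a computed dense block
 * tempv (column-major, leading dimension nrows) from the destination LU
 * block dest (leading dimension ldd), routing each entry through the
 * row/column index maps. All names here are illustrative. */
static void scatter_sketch(int nrows, int ncols,
                           double *dest, int ldd,
                           const double *tempv,
                           const int *indirect_row,  /* block row -> dest row */
                           const int *indirect_col)  /* block col -> dest col */
{
    for (int j = 0; j < ncols; ++j) {
        double *dcol = dest + (size_t)indirect_col[j] * ldd;
        for (int i = 0; i < nrows; ++i)
            dcol[indirect_row[i]] -= tempv[i + (size_t)j * nrows];
    }
}

The indirection arrays are what make the update sparse-aware: the GEMM result is contiguous, while its target rows and columns are scattered through the supernodal storage.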
 #include "superlu_ddefs.h"
 //#include "scatter.h"
 //#include "compiler.h"
 
-#ifdef __INTEL_COMPILER
-#include "mkl.h"
-#else
 //#include "cblas.h"
-#endif
+
 
 #define ISORT
 #define SCATTER_U_CPU  scatter_u
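The mkl.h removal works because callers never invoke a vendor BLAS directly: they bind to project-local wrappers (the superlu_dtrsm calls visible in the dtrfCommWrapper.c hunks below), so the vendor library becomes a link-time choice only. A sketch of that indirection, where my_dtrsm is a hypothetical stand-in for SuperLU_DIST's wrapper and only the wrapper's translation unit touches a BLAS header:

/* Editor's sketch: one project-local wrapper isolates the BLAS header.
 * my_dtrsm is hypothetical; the real wrapper is declared in SuperLU's
 * own headers, not in mkl.h or cblas.h. */
#include <cblas.h>

void my_dtrsm(const char *side, const char *uplo, const char *transa,
              const char *diag, int m, int n, double alpha,
              const double *a, int lda, double *b, int ldb)
{
    /* map the Fortran-style character flags onto CBLAS enums */
    cblas_dtrsm(CblasColMajor,
                (*side   == 'R') ? CblasRight   : CblasLeft,
                (*uplo   == 'U') ? CblasUpper   : CblasLower,
                (*transa == 'N') ? CblasNoTrans : CblasTrans,
                (*diag   == 'N') ? CblasNonUnit : CblasUnit,
                m, n, alpha, a, lda, b, ldb);
}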
diff --git a/SRC/dtreeFactorizationGPU.c b/SRC/dtreeFactorizationGPU.c
index 21a03f97..9f5bb8ee 100644
--- a/SRC/dtreeFactorizationGPU.c
+++ b/SRC/dtreeFactorizationGPU.c
@@ -7,17 +7,14 @@
  * -- Distributed SuperLU routine (version 7.0) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley,
  * Georgia Institute of Technology, Oak Ridge National Laboratory
- * March 14, 2021 version 7.0.0
+ * May 12, 2021
  * 
 */
 
 // #include "treeFactorization.h"
 // #include "trfCommWrapper.h"
 #include "dlustruct_gpu.h"
-#ifdef __INTEL_COMPILER
-#include "mkl.h"
-#else
+
 //#include "cblas.h"
-#endif
 
 #ifdef GPU_ACC ///////////////// enable GPU
diff --git a/SRC/dtrfCommWrapper.c b/SRC/dtrfCommWrapper.c
index 69721631..1ff15bdc 100644
--- a/SRC/dtrfCommWrapper.c
+++ b/SRC/dtrfCommWrapper.c
@@ -9,13 +9,15 @@ The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
+
 /*! @file
  * \brief Communication wrapper routines for 2D factorization.
  *
  *
  * -- Distributed SuperLU routine (version 7.0) --
- * Lawrence Berkeley National Lab, Georgia Institute of Technology.
- * May 10, 2019
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
  */
 
 #include "superlu_ddefs.h"
@@ -25,60 +27,57 @@ at the top-level directory.
 #include "trfCommWrapper.h"
 #endif
 
-#ifdef __INTEL_COMPILER
-#include "mkl.h"
-#else
 //#include "cblas.h"
-#endif
 
-int_t dDiagFactIBCast(int_t k, int_t k0, // supernode to be factored
-                      double *BlockUFactor,
-                      double *BlockLFactor,
-                      int_t *IrecvPlcd_D,
-                      MPI_Request *U_diag_blk_recv_req,
-                      MPI_Request *L_diag_blk_recv_req,
-                      MPI_Request *U_diag_blk_send_req,
-                      MPI_Request *L_diag_blk_send_req,
-                      gridinfo_t *grid,
-                      superlu_dist_options_t *options,
-                      double thresh,
-                      dLUstruct_t *LUstruct,
-                      SuperLUStat_t *stat, int *info,
-                      SCT_t *SCT,
-                      int tag_ub)
+int_t dDiagFactIBCast(int_t k,  int_t k0,      // supernode to be factored
+                     double *BlockUFactor,
+                     double *BlockLFactor,
+                     int_t* IrecvPlcd_D,
+                     MPI_Request *U_diag_blk_recv_req,
+                     MPI_Request *L_diag_blk_recv_req,
+                     MPI_Request *U_diag_blk_send_req,
+                     MPI_Request *L_diag_blk_send_req,
+                     gridinfo_t *grid,
+                     superlu_dist_options_t *options,
+                     double thresh,
+                     dLUstruct_t *LUstruct,
+                     SuperLUStat_t *stat, int *info,
+                     SCT_t *SCT,
+		     int tag_ub
+                    )
 {
     // unpacking variables
     Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
     dLocalLU_t *Llu = LUstruct->Llu;
-    int_t *xsup = Glu_persist->xsup;
+    int_t* xsup = Glu_persist->xsup;
 
     int_t iam = grid->iam;
     int_t Pc = grid->npcol;
     int_t Pr = grid->nprow;
-    int_t myrow = MYROW(iam, grid);
-    int_t mycol = MYCOL(iam, grid);
-    int_t pkk = PNUM(PROW(k, grid), PCOL(k, grid), grid);
-    int_t krow = PROW(k, grid);
-    int_t kcol = PCOL(k, grid);
+    int_t myrow = MYROW (iam, grid);
+    int_t mycol = MYCOL (iam, grid);
+    int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
+    int_t krow = PROW (k, grid);
+    int_t kcol = PCOL (k, grid);
 
     //xsup for supersize
 
     /*Place Irecvs first*/
     // if (IrecvPlcd_D[k] == 0 )
     // {
-    int_t nsupc = SuperSize(k);
+    int_t nsupc = SuperSize (k);
     if (mycol == kcol && iam != pkk)
     {
-        dIRecv_UDiagBlock(k0, BlockUFactor, /*pointer for the diagonal block*/
-                          nsupc * nsupc, krow,
-                          U_diag_blk_recv_req, grid, SCT, tag_ub);
+        dIRecv_UDiagBlock(k0, BlockUFactor,  /*pointer for the diagonal block*/
+                         nsupc * nsupc, krow,
+                         U_diag_blk_recv_req, grid, SCT, tag_ub);
     }
 
     if (myrow == krow && iam != pkk)
     {
-        dIRecv_LDiagBlock(k0, BlockLFactor, /*pointer for the diagonal block*/
-                          nsupc * nsupc, kcol,
-                          L_diag_blk_recv_req, grid, SCT, tag_ub);
+        dIRecv_LDiagBlock(k0, BlockLFactor,  /*pointer for the diagonal block*/
+                         nsupc * nsupc, kcol,
+                         L_diag_blk_recv_req, grid, SCT, tag_ub);
     }
     IrecvPlcd_D[k] = 1;
     // }
@@ -107,39 +106,39 @@ int_t dDiagFactIBCast(int_t k, int_t k0, // supernode to be factored
         /*Isend U blocks to the process row*/
         int_t nsupc = SuperSize(k);
         dISend_UDiagBlock(k0, BlockLFactor,
-                          nsupc * nsupc, U_diag_blk_send_req, grid, tag_ub);
+                         nsupc * nsupc, U_diag_blk_send_req , grid, tag_ub);
 
         /*Isend L blocks to the process col*/
         dISend_LDiagBlock(k0, BlockLFactor,
-                          nsupc * nsupc, L_diag_blk_send_req, grid, tag_ub);
+                         nsupc * nsupc, L_diag_blk_send_req, grid, tag_ub);
         SCT->commVolFactor += 1.0 * nsupc * nsupc * (Pr + Pc);
     }
     // }
     return 0;
 }
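dDiagFactIBCast above follows a post-early pattern: processes in the pivot row and column post their MPI_Irecv for the factored diagonal block before the owner factors and Isends it, so the transfer overlaps computation. A self-contained sketch of the pattern; the tag, block size, and request-array bound are illustrative, not SuperLU_DIST's actual values:

/* Editor's sketch (not SuperLU_DIST code) of the Irecv-before-Isend
 * exchange of a factored diagonal block. */
#include <mpi.h>

#define NSUPC    64    /* illustrative supernode size */
#define DIAG_TAG 3001  /* illustrative message tag */

void diag_block_exchange(int owner, double *diag_blk, MPI_Comm comm)
{
    int rank, nproc;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &nproc);

    if (rank != owner) {
        /* 1. post the receive early (mirrors dIRecv_UDiagBlock/dIRecv_LDiagBlock) */
        MPI_Request rreq;
        MPI_Irecv(diag_blk, NSUPC * NSUPC, MPI_DOUBLE, owner, DIAG_TAG, comm, &rreq);
        /* 2. ... overlap independent work while the message is in flight ... */
        /* 3. wait only when the block is actually needed */
        MPI_Wait(&rreq, MPI_STATUS_IGNORE);
    } else {
        /* owner: factor the diagonal block here, then send it non-blockingly
         * (mirrors dISend_UDiagBlock/dISend_LDiagBlock) */
        MPI_Request sreq[128];  /* assumes <= 128 peers, for brevity */
        int ns = 0;
        for (int p = 0; p < nproc; ++p)
            if (p != owner)
                MPI_Isend(diag_blk, NSUPC * NSUPC, MPI_DOUBLE, p, DIAG_TAG, comm, &sreq[ns++]);
        MPI_Waitall(ns, sreq, MPI_STATUSES_IGNORE);
    }
}

Posting the receive first also lets the MPI library deliver eagerly into the final buffer instead of staging the message internally.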
 
-int_t dLPanelTrSolve(int_t k, int_t *factored_L,
-                     double *BlockUFactor,
-                     gridinfo_t *grid,
-                     dLUstruct_t *LUstruct)
+int_t dLPanelTrSolve( int_t k,   int_t* factored_L,
+		      double* BlockUFactor,
+		      gridinfo_t *grid,
+		      dLUstruct_t *LUstruct)
 {
     double alpha = 1.0;
     Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
     dLocalLU_t *Llu = LUstruct->Llu;
-    int_t *xsup = Glu_persist->xsup;
+    int_t* xsup = Glu_persist->xsup;
 
     int_t iam = grid->iam;
 
-    int_t pkk = PNUM(PROW(k, grid), PCOL(k, grid), grid);
-    int_t kcol = PCOL(k, grid);
-    int_t mycol = MYCOL(iam, grid);
+    int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
+    int_t kcol = PCOL (k, grid);
+    int_t mycol = MYCOL (iam, grid);
     int nsupc = SuperSize(k);
 
     /*factor the L panel*/
-    if (mycol == kcol && iam != pkk)
+    if (mycol == kcol  && iam != pkk)
     {
         // factored_L[k] = 1;
-        int_t lk = LBj(k, grid);
+        int_t lk = LBj (k, grid);
         double *lusup = Llu->Lnzval_bc_ptr[lk];
         int nsupr;
         if (Llu->Lrowind_bc_ptr[lk])
@@ -156,23 +155,23 @@ int_t dLPanelTrSolve(int_t k, int_t *factored_L,
         // }
 
         int_t l = nsupr;
-        double *ublk_ptr = BlockUFactor;
+        double* ublk_ptr = BlockUFactor;
         int ld_ujrow = nsupc;
 
         // unsigned long long t1 = _rdtsc();
 
         // #pragma omp for schedule(dynamic) nowait
-#define BL 32
+#define BL  32
         for (int i = 0; i < CEILING(l, BL); ++i)
         {
-#pragma omp task
+            #pragma omp task
             {
                 int_t off = i * BL;
                 // Sherry: int_t len = MY_MIN(BL, l - i * BL);
                 int len = SUPERLU_MIN(BL, l - i * BL);
-                superlu_dtrsm("R", "U", "N", "N",
-                              len, nsupc, alpha, ublk_ptr,
-                              ld_ujrow, &lusup[off], nsupr);
+
+                superlu_dtrsm("R", "U", "N", "N", len, nsupc, alpha,
+			      ublk_ptr, ld_ujrow, &lusup[off], nsupr);
             }
         }
     }
@@ -183,23 +182,21 @@ int_t dLPanelTrSolve(int_t k, int_t *factored_L,
          { */
         /* code */
         factored_L[k] = 1;
-        int_t lk = LBj(k, grid);
+        int_t lk = LBj (k, grid);
         double *lusup = Llu->Lnzval_bc_ptr[lk];
         int nsupr;
-        if (Llu->Lrowind_bc_ptr[lk])
-            nsupr = Llu->Lrowind_bc_ptr[lk][1];
-        else
-            nsupr = 0;
+        if (Llu->Lrowind_bc_ptr[lk]) nsupr = Llu->Lrowind_bc_ptr[lk][1];
+        else nsupr = 0;
 
         /*factorize A[kk]*/
 
         int_t l = nsupr - nsupc;
 
-        double *ublk_ptr = BlockUFactor;
+        double* ublk_ptr = BlockUFactor;
         int ld_ujrow = nsupc;
         // printf("%d: L update \n",k );
 
-#define BL 32
+#define BL  32
         // #pragma omp parallel for
         for (int i = 0; i < CEILING(l, BL); ++i)
         {
@@ -208,62 +205,60 @@ int_t dLPanelTrSolve(int_t k, int_t *factored_L,
             int len = SUPERLU_MIN(BL, (l - i * BL));
 #pragma omp task
             {
-
-                superlu_dtrsm("R", "U", "N", "N",
-                              len, nsupc, alpha, ublk_ptr,
-                              ld_ujrow, &lusup[nsupc + off], nsupr);
+                superlu_dtrsm("R", "U", "N", "N", len, nsupc, alpha,
+			      ublk_ptr, ld_ujrow, &lusup[nsupc + off], nsupr);
             }
         }
     }
 
     return 0;
-} /* dLPanelTrSolve */
+}  /* dLPanelTrSolve */
 
-int_t dLPanelUpdate(int_t k, int_t *IrecvPlcd_D, int_t *factored_L,
-                    MPI_Request *U_diag_blk_recv_req,
-                    double *BlockUFactor,
+int_t dLPanelUpdate( int_t k,  int_t* IrecvPlcd_D, int_t* factored_L,
+                    MPI_Request * U_diag_blk_recv_req,
+                    double* BlockUFactor,
                     gridinfo_t *grid,
                     dLUstruct_t *LUstruct, SCT_t *SCT)
 {
 
-    dUDiagBlockRecvWait(k, IrecvPlcd_D, factored_L,
-                        U_diag_blk_recv_req, grid, LUstruct, SCT);
+    dUDiagBlockRecvWait( k,  IrecvPlcd_D, factored_L,
+                         U_diag_blk_recv_req, grid, LUstruct, SCT);
 
-    dLPanelTrSolve(k, factored_L, BlockUFactor, grid, LUstruct);
+    dLPanelTrSolve( k, factored_L, BlockUFactor, grid, LUstruct );
 
     return 0;
-} /* dLPanelUpdate */
+}  /* dLPanelUpdate */
 
-#define BL 32
+#define BL  32
 
-int_t dUPanelTrSolve(int_t k,
-                     double *BlockLFactor,
-                     double *bigV,
+int_t dUPanelTrSolve( int_t k,  
+                     double* BlockLFactor,
+                     double* bigV,
                      int_t ldt,
-                     Ublock_info_t *Ublock_info,
+                     Ublock_info_t* Ublock_info,
                      gridinfo_t *grid,
                      dLUstruct_t *LUstruct,
                      SuperLUStat_t *stat, SCT_t *SCT)
 {
     Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
     dLocalLU_t *Llu = LUstruct->Llu;
-    int_t *xsup = Glu_persist->xsup;
+    int_t* xsup = Glu_persist->xsup;
     int_t iam = grid->iam;
-    int_t myrow = MYROW(iam, grid);
-    int_t pkk = PNUM(PROW(k, grid), PCOL(k, grid), grid);
-    int_t krow = PROW(k, grid);
+    int_t myrow = MYROW (iam, grid);
+    int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
+    int_t krow = PROW (k, grid);
     int_t nsupc = SuperSize(k);
 
     /*factor the U panel*/
-    if (myrow == krow && iam != pkk)
+    if (myrow == krow  && iam != pkk)
     {
-        int_t lk = LBi(k, grid); /* Local block number */
+        int_t lk = LBi (k, grid);         /* Local block number */
         if (!Llu->Unzval_br_ptr[lk])
             return 0;
         /* Initialization. */
-        int_t klst = FstBlockC(k + 1);
+        int_t klst = FstBlockC (k + 1);
 
-        int_t *usub = Llu->Ufstnz_br_ptr[lk]; /* index[] of block row U(k,:) */
+        int_t *usub = Llu->Ufstnz_br_ptr[lk];  /* index[] of block row U(k,:) */
         double *uval = Llu->Unzval_br_ptr[lk];
         int_t nb = usub[0];
 
@@ -271,22 +266,22 @@ int_t dUPanelTrSolve(int_t k,
         double *lusup = BlockLFactor;
 
         /* Loop through all the row blocks. to get the iukp and rukp*/
-        Trs2_InitUblock_info(klst, nb, Ublock_info, usub, Glu_persist, stat);
+        Trs2_InitUblock_info(klst, nb, Ublock_info, usub, Glu_persist, stat );
 
         /* Loop through all the row blocks. */
         // #pragma omp for schedule(dynamic,2) nowait
         for (int_t b = 0; b < nb; ++b)
         {
-#pragma omp task
+            #pragma omp task
             {
-#ifdef _OPENMP
+#ifdef _OPENMP	    
                 int_t thread_id = omp_get_thread_num();
-#else
+#else		
                 int_t thread_id = 0;
-#endif
-                double *tempv = bigV + thread_id * ldt * ldt;
+#endif		
+                double *tempv = bigV +  thread_id * ldt * ldt;
                 dTrs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp,
-                                        usub, uval, tempv, nsupc, nsupc, lusup, Glu_persist);
+				       usub, uval, tempv, nsupc, nsupc, lusup, Glu_persist);
             }
         }
     }
@@ -298,46 +293,48 @@ int_t dUPanelTrSolve(int_t k,
         // factored_U[k] = 1;
         int_t *Lsub_buf;
         double *Lval_buf;
-        int_t lk = LBj(k, grid);
+        int_t lk = LBj (k, grid);
         Lsub_buf = Llu->Lrowind_bc_ptr[lk];
         Lval_buf = Llu->Lnzval_bc_ptr[lk];
 
+
         /* calculate U panel */
         // PDGSTRS2 (n, k0, k, Lsub_buf, Lval_buf, Glu_persist, grid, Llu,
         //           stat, HyP->Ublock_info, bigV, ldt, SCT);
 
-        lk = LBi(k, grid); /* Local block number */
+        lk = LBi (k, grid);         /* Local block number */
         if (Llu->Unzval_br_ptr[lk])
         {
             /* Initialization. */
-            int_t klst = FstBlockC(k + 1);
+            int_t klst = FstBlockC (k + 1);
 
-            int_t *usub = Llu->Ufstnz_br_ptr[lk]; /* index[] of block row U(k,:) */
+            int_t *usub = Llu->Ufstnz_br_ptr[lk];  /* index[] of block row U(k,:) */
             double *uval = Llu->Unzval_br_ptr[lk];
             int_t nb = usub[0];
 
             // int_t nsupr = Lsub_buf[1];   /* LDA of lusup[] */
-            int_t nsupr = Lsub_buf[1]; /* LDA of lusup[] */
+            int_t nsupr = Lsub_buf[1];   /* LDA of lusup[] */
             double *lusup = Lval_buf;
 
             /* Loop through all the row blocks. to get the iukp and rukp*/
-            Trs2_InitUblock_info(klst, nb, Ublock_info, usub, Glu_persist, stat);
+            Trs2_InitUblock_info(klst, nb, Ublock_info, usub, Glu_persist, stat );
 
             /* Loop through all the row blocks. */
             // printf("%d :U update \n", k);
             for (int_t b = 0; b < nb; ++b)
             {
-#pragma omp task
+                #pragma omp task
                 {
-#ifdef _OPENMP
+#ifdef _OPENMP		
                     int_t thread_id = omp_get_thread_num();
-#else
+#else		    
                     int_t thread_id = 0;
-#endif
-                    double *tempv = bigV + thread_id * ldt * ldt;
+#endif		    
+                    double *tempv = bigV +  thread_id * ldt * ldt;
                     dTrs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp,
-                                            usub, uval, tempv, nsupc, nsupr, lusup, Glu_persist);
+					   usub, uval, tempv, nsupc, nsupr, lusup, Glu_persist);
                 }
+
             }
         }
     }
@@ -345,114 +342,116 @@ int_t dUPanelTrSolve(int_t k,
     return 0;
 } /* dUPanelTrSolve */
 
-int_t dUPanelUpdate(int_t k, int_t *factored_U,
-                    MPI_Request *L_diag_blk_recv_req,
-                    double *BlockLFactor,
-                    double *bigV,
+int_t dUPanelUpdate( int_t k,  int_t* factored_U,
+                    MPI_Request * L_diag_blk_recv_req,
+                    double* BlockLFactor,
+                    double* bigV,
                     int_t ldt,
-                    Ublock_info_t *Ublock_info,
+                    Ublock_info_t* Ublock_info,
                     gridinfo_t *grid,
                     dLUstruct_t *LUstruct,
                     SuperLUStat_t *stat, SCT_t *SCT)
 {
 
-    LDiagBlockRecvWait(k, factored_U, L_diag_blk_recv_req, grid);
+    LDiagBlockRecvWait( k, factored_U, L_diag_blk_recv_req, grid);
 
-    dUPanelTrSolve(k, BlockLFactor, bigV, ldt, Ublock_info, grid,
-                   LUstruct, stat, SCT);
+    dUPanelTrSolve( k, BlockLFactor, bigV, ldt, Ublock_info, grid,
+                       LUstruct, stat, SCT);
     return 0;
 }
 
 int_t dIBcastRecvLPanel(
     int_t k,
     int_t k0,
-    int *msgcnt,
+    int* msgcnt,
     MPI_Request *send_req,
-    MPI_Request *recv_req,
-    int_t *Lsub_buf,
-    double *Lval_buf,
-    int_t *factored,
+    MPI_Request *recv_req ,
+    int_t* Lsub_buf,
+    double* Lval_buf,
+    int_t * factored,
     gridinfo_t *grid,
     dLUstruct_t *LUstruct,
     SCT_t *SCT,
-    int tag_ub)
+    int tag_ub
+)
 {
     Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
     dLocalLU_t *Llu = LUstruct->Llu;
-    int_t *xsup = Glu_persist->xsup;
-    int **ToSendR = Llu->ToSendR;
-    int *ToRecv = Llu->ToRecv;
+    int_t* xsup = Glu_persist->xsup;
+    int** ToSendR = Llu->ToSendR;
+    int* ToRecv = Llu->ToRecv;
     int_t iam = grid->iam;
     int_t Pc = grid->npcol;
-    int_t mycol = MYCOL(iam, grid);
-    int_t kcol = PCOL(k, grid);
-    int_t **Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
-    double **Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    int_t mycol = MYCOL (iam, grid);
+    int_t kcol = PCOL (k, grid);
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    double** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
     /* code */
     if (mycol == kcol)
     {
         /*send the L panel to myrow*/
 
-        int_t lk = LBj(k, grid); /* Local block number. */
-        int_t *lsub = Lrowind_bc_ptr[lk];
-        double *lusup = Lnzval_bc_ptr[lk];
+        int_t lk = LBj (k, grid);     /* Local block number. */
+        int_t* lsub = Lrowind_bc_ptr[lk];
+        double* lusup = Lnzval_bc_ptr[lk];
 
-        dIBcast_LPanel(k, k0, lsub, lusup, grid, msgcnt, send_req,
-                       ToSendR, xsup, tag_ub);
+        dIBcast_LPanel (k, k0, lsub, lusup, grid, msgcnt, send_req,
+		       ToSendR, xsup, tag_ub);
 
         if (lsub)
         {
-            int_t nrbl = lsub[0]; /*number of L blocks */
-            int_t len = lsub[1];  /* LDA of the nzval[] */
-            int_t len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
-            int_t len2 = SuperSize(lk) * len;
+            int_t nrbl  =   lsub[0]; /*number of L blocks */
+            int_t   len   = lsub[1];       /* LDA of the nzval[] */
+            int_t len1  = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
+            int_t len2  = SuperSize(lk) * len;
             SCT->commVolFactor += 1.0 * (Pc - 1) * (len1 * sizeof(int_t) + len2 * sizeof(double));
         }
     }
     else
     {
         /*receive factored L panels*/
-        if (ToRecv[k] >= 1) /* Recv block column L(:,0). */
+        if (ToRecv[k] >= 1)     /* Recv block column L(:,0). */
         {
             /*place Irecv*/
-            dIrecv_LPanel(k, k0, Lsub_buf, Lval_buf, grid, recv_req, Llu, tag_ub);
+            dIrecv_LPanel (k, k0, Lsub_buf, Lval_buf, grid, recv_req, Llu, tag_ub);
         }
         else
         {
             msgcnt[0] = 0;
         }
+
     }
     factored[k] = 0;
 
     return 0;
 }
 
-int_t dIBcastRecvUPanel(int_t k, int_t k0, int *msgcnt,
-                        MPI_Request *send_requ,
-                        MPI_Request *recv_requ,
-                        int_t *Usub_buf, double *Uval_buf,
-                        gridinfo_t *grid, dLUstruct_t *LUstruct,
-                        SCT_t *SCT, int tag_ub)
+int_t dIBcastRecvUPanel(int_t k, int_t k0, int* msgcnt,
+    			     MPI_Request *send_requ,
+    			     MPI_Request *recv_requ,
+    			     int_t* Usub_buf, double* Uval_buf,
+    			     gridinfo_t *grid, dLUstruct_t *LUstruct,
+    			     SCT_t *SCT, int tag_ub)
 {
     dLocalLU_t *Llu = LUstruct->Llu;
 
-    int *ToSendD = Llu->ToSendD;
-    int *ToRecv = Llu->ToRecv;
+    int* ToSendD = Llu->ToSendD;
+    int* ToRecv = Llu->ToRecv;
     int_t iam = grid->iam;
     int_t Pr = grid->nprow;
-    int_t myrow = MYROW(iam, grid);
-    int_t krow = PROW(k, grid);
+    int_t myrow = MYROW (iam, grid);
+    int_t krow = PROW (k, grid);
 
-    int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
-    double **Unzval_br_ptr = Llu->Unzval_br_ptr;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    double** Unzval_br_ptr = Llu->Unzval_br_ptr;
     if (myrow == krow)
     {
         /*send U panel to myrow*/
-        int_t lk = LBi(k, grid);
-        int_t *usub = Ufstnz_br_ptr[lk];
-        double *uval = Unzval_br_ptr[lk];
+        int_t   lk = LBi (k, grid);
+        int_t*  usub = Ufstnz_br_ptr[lk];
+        double* uval = Unzval_br_ptr[lk];
         dIBcast_UPanel(k, k0, usub, uval, grid, msgcnt,
-                       send_requ, ToSendD, tag_ub);
+                        send_requ, ToSendD, tag_ub);
         if (usub)
         {
             /* code */
@@ -464,9 +463,9 @@ int_t dIBcastRecvUPanel(int_t k, int_t k0, int *msgcnt,
     else
     {
         /*receive U panels */
-        if (ToRecv[k] == 2) /* Recv block row U(k,:). */
+        if (ToRecv[k] == 2)     /* Recv block row U(k,:). */
         {
-            dIrecv_UPanel(k, k0, Usub_buf, Uval_buf, Llu, grid, recv_requ, tag_ub);
+            dIrecv_UPanel (k, k0, Usub_buf, Uval_buf, Llu, grid, recv_requ, tag_ub);
         }
         else
         {
@@ -477,58 +476,58 @@ int_t dIBcastRecvUPanel(int_t k, int_t k0, int *msgcnt,
     return 0;
 }
 
-int_t dWaitL(int_t k, int *msgcnt, int *msgcntU,
-             MPI_Request *send_req, MPI_Request *recv_req,
-             gridinfo_t *grid, dLUstruct_t *LUstruct, SCT_t *SCT)
+int_t dWaitL( int_t k, int* msgcnt, int* msgcntU,
+              MPI_Request *send_req, MPI_Request *recv_req,
+    	      gridinfo_t *grid, dLUstruct_t *LUstruct, SCT_t *SCT)
 {
     dLocalLU_t *Llu = LUstruct->Llu;
-    int **ToSendR = Llu->ToSendR;
-    int *ToRecv = Llu->ToRecv;
+    int** ToSendR = Llu->ToSendR;
+    int* ToRecv = Llu->ToRecv;
     int_t iam = grid->iam;
-    int_t mycol = MYCOL(iam, grid);
-    int_t kcol = PCOL(k, grid);
+    int_t mycol = MYCOL (iam, grid);
+    int_t kcol = PCOL (k, grid);
     if (mycol == kcol)
     {
         /*send the L panel to myrow*/
-        Wait_LSend(k, grid, ToSendR, send_req, SCT);
+        Wait_LSend (k, grid, ToSendR, send_req, SCT);
     }
     else
     {
         /*receive factored L panels*/
-        if (ToRecv[k] >= 1) /* Recv block column L(:,0). */
+        if (ToRecv[k] >= 1)     /* Recv block column L(:,0). */
         {
             /*force wait for I recv to complete*/
-            dWait_LRecv(recv_req, msgcnt, msgcntU, grid, SCT);
+            dWait_LRecv( recv_req,  msgcnt, msgcntU, grid, SCT);
         }
     }
 
     return 0;
 }
 
-int_t dWaitU(int_t k, int *msgcnt,
-             MPI_Request *send_requ, MPI_Request *recv_requ,
-             gridinfo_t *grid, dLUstruct_t *LUstruct, SCT_t *SCT)
+int_t dWaitU( int_t k, int* msgcnt,
+              MPI_Request *send_requ, MPI_Request *recv_requ,
+    	      gridinfo_t *grid, dLUstruct_t *LUstruct, SCT_t *SCT)
 {
     dLocalLU_t *Llu = LUstruct->Llu;
 
-    int *ToRecv = Llu->ToRecv;
-    int *ToSendD = Llu->ToSendD;
+    int* ToRecv = Llu->ToRecv;
+    int* ToSendD = Llu->ToSendD;
     int_t iam = grid->iam;
-    int_t myrow = MYROW(iam, grid);
-    int_t krow = PROW(k, grid);
+    int_t myrow = MYROW (iam, grid);
+    int_t krow = PROW (k, grid);
     if (myrow == krow)
     {
-        int_t lk = LBi(k, grid);
+        int_t lk = LBi (k, grid);
         if (ToSendD[lk] == YES)
             Wait_USend(send_requ, grid, SCT);
     }
     else
     {
         /*receive U panels */
-        if (ToRecv[k] == 2) /* Recv block row U(k,:). */
+        if (ToRecv[k] == 2)     /* Recv block row U(k,:). */
         {
             /*force wait*/
-            dWait_URecv(recv_requ, msgcnt, SCT);
+            dWait_URecv( recv_requ, msgcnt, SCT);
         }
     }
     return 0;
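
Taken together, these wrappers form one stage of the factorization pipeline: the diagonal-block factorization and broadcast, the two panel triangular solves, and the panel broadcasts that feed the Schur-complement update. The fragment below is a sketch, not part of the patch; it only restates, for a single supernode k, the calling order implied by the routines above, using exactly the signatures shown in this file. Every argument, and the assumption that dDiagFactIBCast() has already been posted for (k, k0), is supplied by the surrounding factorization loop.

static int_t
factorAndBcastPanels_sketch(int_t k, int_t k0, int *msgcnt, int *msgcntU,
                            MPI_Request *send_req, MPI_Request *recv_req,
                            MPI_Request *send_requ, MPI_Request *recv_requ,
                            MPI_Request *U_diag_blk_recv_req,
                            MPI_Request *L_diag_blk_recv_req,
                            int_t *IrecvPlcd_D, int_t *factored,
                            int_t *factored_L, int_t *factored_U,
                            int_t *Lsub_buf, double *Lval_buf,
                            int_t *Usub_buf, double *Uval_buf,
                            double *BlockUFactor, double *BlockLFactor,
                            double *bigV, int_t ldt, Ublock_info_t *Ublock_info,
                            gridinfo_t *grid, dLUstruct_t *LUstruct,
                            SuperLUStat_t *stat, SCT_t *SCT, int tag_ub)
{
    /* The diagonal-block Irecvs/Isends for supernode k are assumed to be
       in flight (posted earlier by dDiagFactIBCast). */

    /* L and U panel solves; each first waits on its diagonal block. */
    dLPanelUpdate(k, IrecvPlcd_D, factored_L, U_diag_blk_recv_req,
                  BlockUFactor, grid, LUstruct, SCT);
    dUPanelUpdate(k, factored_U, L_diag_blk_recv_req, BlockLFactor,
                  bigV, ldt, Ublock_info, grid, LUstruct, stat, SCT);

    /* Broadcast the factored L panel along the process row and the U
       panel along the process column; non-owners post matching Irecvs. */
    dIBcastRecvLPanel(k, k0, msgcnt, send_req, recv_req, Lsub_buf,
                      Lval_buf, factored, grid, LUstruct, SCT, tag_ub);
    dIBcastRecvUPanel(k, k0, msgcntU, send_requ, recv_requ,
                      Usub_buf, Uval_buf, grid, LUstruct, SCT, tag_ub);

    /* Complete the communication before the panels are consumed. */
    dWaitL(k, msgcnt, msgcntU, send_req, recv_req, grid, LUstruct, SCT);
    dWaitU(k, msgcnt, send_requ, recv_requ, grid, LUstruct, SCT);
    return 0;
}
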
diff --git a/SRC/pdgstrf.c b/SRC/pdgstrf.c
index f26c9e66..ddd0870b 100644
--- a/SRC/pdgstrf.c
+++ b/SRC/pdgstrf.c
@@ -109,7 +109,6 @@ at the top-level directory.
  */
 
 #include <math.h>
-/*#include "mkl.h"*/
 #include "superlu_ddefs.h"
 
 #ifdef GPU_ACC
diff --git a/SRC/pzgstrf.c b/SRC/pzgstrf.c
index c9bbdd4b..7e1feadb 100644
--- a/SRC/pzgstrf.c
+++ b/SRC/pzgstrf.c
@@ -108,7 +108,6 @@ at the top-level directory.
  */
 
 #include <math.h>
-/*#include "mkl.h"*/
 #include "superlu_zdefs.h"
 
 #ifdef GPU_ACC
diff --git a/SRC/scatter.c b/SRC/scatter.c
index ed9e89bd..46926955 100644
--- a/SRC/scatter.c
+++ b/SRC/scatter.c
@@ -2,13 +2,6 @@
 #include "scatter.h"
 //#include "compiler.h"
 
-#ifdef __INTEL_COMPILER
-#include "mkl.h"
-#else
-#include "cblas.h"
-#endif
-
-
 #define ISORT
 
 #if 0 /**** Sherry: this routine is moved to util.c ****/
diff --git a/SRC/superlu_defs.h b/SRC/superlu_defs.h
index a12d4b7c..75f91112 100644
--- a/SRC/superlu_defs.h
+++ b/SRC/superlu_defs.h
@@ -94,9 +94,9 @@ at the top-level directory.
   #define IFMT "%8d"
 #endif
 
-#ifdef __INTEL_COMPILER
-#include "mkl.h"
-#endif
+//#ifdef __INTEL_COMPILER
+//#include "mkl.h"
+//#endif
 
 #if 0 // Sherry: the following does not work with gcc on Linux.
 #define  _mm_malloc(a,b) malloc(a)
diff --git a/SRC/supernodal_etree.c b/SRC/supernodal_etree.c
index 08de5b2e..c683feaf 100644
--- a/SRC/supernodal_etree.c
+++ b/SRC/supernodal_etree.c
@@ -1,6 +1,6 @@
 /*! @file
  * \brief function to generate supernodal etree
-*
+ *
 * <pre>
  * -- Distributed SuperLU routine (version 7.0) --
  * Lawrence Berkeley National Lab, Oak Ridge National Lab
diff --git a/SRC/zscatter3d.c b/SRC/zscatter3d.c
index 43388abf..9070fdca 100644
--- a/SRC/zscatter3d.c
+++ b/SRC/zscatter3d.c
@@ -9,15 +9,22 @@ The source code is distributed under BSD license, see the file License.txt
 at the top-level directory.
 */
 
+/*! @file
+ * \brief Scatter the computed blocks into LU destination.
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ */
+
 #include "superlu_zdefs.h"
 //#include "scatter.h"
 //#include "compiler.h"
 
-#ifdef __INTEL_COMPILER
-#include "mkl.h"
-#else
 //#include "cblas.h"
-#endif
+
 
 #define ISORT
 #define SCATTER_U_CPU  scatter_u
diff --git a/SRC/ztreeFactorizationGPU.c b/SRC/ztreeFactorizationGPU.c
index 0377acf1..7875366f 100644
--- a/SRC/ztreeFactorizationGPU.c
+++ b/SRC/ztreeFactorizationGPU.c
@@ -6,17 +6,14 @@
  * -- Distributed SuperLU routine (version 7.0) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley,
  * Georgia Institute of Technology, Oak Ridge National Laboratory
- * March 14, 2021 version 7.0.0
+ * May 12, 2021
 * </pre>
 */
 // #include "treeFactorization.h"
 // #include "trfCommWrapper.h"
 #include "zlustruct_gpu.h"
-#ifdef __INTEL_COMPILER
-#include "mkl.h"
-#else
+
 //#include "cblas.h"
-#endif
 #ifdef GPU_ACC ///////////////// enable GPU
diff --git a/SRC/ztrfCommWrapper.c b/SRC/ztrfCommWrapper.c
index b37fbb4a..e180769e 100644
--- a/SRC/ztrfCommWrapper.c
+++ b/SRC/ztrfCommWrapper.c
@@ -14,8 +14,9 @@ at the top-level directory.
 *
 * <pre>
  * -- Distributed SuperLU routine (version 7.0) --
- * Lawrence Berkeley National Lab, Georgia Institute of Technology.
- * May 10, 2019
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
  */
 
 #include "superlu_zdefs.h"
@@ -25,11 +26,7 @@ at the top-level directory.
 #include "trfCommWrapper.h"
 #endif
 
-#ifdef __INTEL_COMPILER
-#include "mkl.h"
-#else
 //#include "cblas.h"
-#endif 
 
 int_t zDiagFactIBCast(int_t k,  int_t k0,      // supernode to be factored
                      doublecomplex *BlockUFactor,
@@ -174,23 +171,6 @@ int_t zLPanelTrSolve( int_t k,   int_t* factored_L,
 
                 superlu_ztrsm("R", "U", "N", "N", len, nsupc, alpha,
 			      ublk_ptr, ld_ujrow, &lusup[off], nsupr);
-		
-#if 0 // ** replaced by superlu_ztrsm 		
-#if 1
-  #if defined (USE_VENDOR_BLAS)
-		ztrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha,
-			ublk_ptr, &ld_ujrow, &lusup[off], &nsupr,
-			1, 1, 1, 1);
-  #else
-		ztrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha,
-			ublk_ptr, &ld_ujrow, &lusup[off], &nsupr);
-  #endif
-#else
-                cblas_ztrsm (CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit,
-                len, nsupc, (void*) &alpha, ublk_ptr, ld_ujrow, &lusup[off], nsupr);
-#endif
-#endif // ** replaced by superlu_ztrsm 		
-		
             }
         }
     }
@@ -204,10 +184,8 @@ int_t zLPanelTrSolve( int_t k,   int_t* factored_L,
         int_t lk = LBj (k, grid);
         doublecomplex *lusup = Llu->Lnzval_bc_ptr[lk];
         int nsupr;
-        if (Llu->Lrowind_bc_ptr[lk])
-            nsupr = Llu->Lrowind_bc_ptr[lk][1];
-        else
-            nsupr = 0;
+        if (Llu->Lrowind_bc_ptr[lk]) nsupr = Llu->Lrowind_bc_ptr[lk][1];
+        else nsupr = 0;
 
         /*factorize A[kk]*/
 
@@ -228,22 +206,6 @@ int_t zLPanelTrSolve( int_t k,   int_t* factored_L,
             {
                 superlu_ztrsm("R", "U", "N", "N", len, nsupc, alpha,
 			      ublk_ptr, ld_ujrow, &lusup[nsupc + off], nsupr);
-#if 0 // ** replaced by superlu_ztrsm
-#if 1
-  #if defined (USE_VENDOR_BLAS)
-		ztrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha,
-			ublk_ptr, &ld_ujrow, &lusup[nsupc + off], &nsupr,
-			1, 1, 1, 1);
-  #else
-		ztrsm_ ("R", "U", "N", "N", &len, &nsupc, &alpha,
-			ublk_ptr, &ld_ujrow, &lusup[nsupc + off], &nsupr);
-  #endif
-#else
-                cblas_ztrsm (CblasColMajor, CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit,
-                             len, nsupc, (void*) &alpha, ublk_ptr, ld_ujrow, &lusup[nsupc + off], nsupr);
-#endif
-#endif // ** replaced by superlu_ztrsm
-
             }
         }
     }
@@ -309,13 +271,13 @@ int_t zUPanelTrSolve( int_t k,
         // #pragma omp for schedule(dynamic,2) nowait
         for (int_t b = 0; b < nb; ++b)
         {
- #pragma omp task
+            #pragma omp task
             {
-#ifdef _OPENNP
+#ifdef _OPENMP	    
                 int_t thread_id = omp_get_thread_num();
-#else
+#else		
                 int_t thread_id = 0;
-#endif
+#endif		
                 doublecomplex *tempv = bigV +  thread_id * ldt * ldt;
                 zTrs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp,
 				       usub, uval, tempv, nsupc, nsupc, lusup, Glu_persist);
@@ -360,13 +322,13 @@ int_t zUPanelTrSolve( int_t k,
             // printf("%d :U update \n", k);
             for (int_t b = 0; b < nb; ++b)
             {
- #pragma omp task
+                #pragma omp task
                 {
-#ifdef _OPENMP
+#ifdef _OPENMP		
                     int_t thread_id = omp_get_thread_num();
-#else
+#else		    
                     int_t thread_id = 0;
-#endif
+#endif		    
                     doublecomplex *tempv = bigV +  thread_id * ldt * ldt;
                     zTrs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp,
 					   usub, uval, tempv, nsupc, nsupr, lusup, Glu_persist);

From daab26087fa02662092cadc7fce8595b7c49586c Mon Sep 17 00:00:00 2001
From: Xiaoye Li 
Date: Wed, 30 Jun 2021 13:05:47 -0400
Subject: [PATCH 103/147] Add the single-precision code, generated from
 basefiles.

---
 CMakeLists.txt                     |    1 +
 EXAMPLE/CMakeLists.txt             |   31 +-
 EXAMPLE/psdrive.c                  |  287 +++
 EXAMPLE/psdrive1.c                 |  307 +++
 EXAMPLE/psdrive1_ABglobal.c        |  281 +++
 EXAMPLE/psdrive2.c                 |  296 +++
 EXAMPLE/psdrive2_ABglobal.c        |  298 +++
 EXAMPLE/psdrive3.c                 |  304 +++
 EXAMPLE/psdrive3_ABglobal.c        |  305 +++
 EXAMPLE/psdrive3d.c                |  411 ++++
 EXAMPLE/psdrive4.c                 |  301 +++
 EXAMPLE/psdrive4_ABglobal.c        |  361 ++++
 EXAMPLE/psdrive_ABglobal.c         |  256 +++
 EXAMPLE/screate_matrix.c           |  427 +++++
 EXAMPLE/screate_matrix3d.c         |  463 +++++
 EXAMPLE/screate_matrix_perturbed.c |  419 ++++
 FORTRAN/superlu_dist_config.fh     |    2 +
 SRC/CMakeLists.txt                 |   62 +
 SRC/TreeInterface.cpp              |  328 ++--
 SRC/ps3dcomm.c                     |  876 +++++++++
 SRC/psGetDiagU.c                   |  121 ++
 SRC/psdistribute.c                 | 1987 +++++++++++++++++++
 SRC/psgsequ.c                      |  244 +++
 SRC/psgsmv.c                       |  383 ++++
 SRC/psgsmv_AXglobal.c              |  324 ++++
 SRC/psgsrfs.c                      |  260 +++
 SRC/psgsrfs_ABXglobal.c            |  465 +++++
 SRC/psgssvx.c                      | 1579 ++++++++++++++++
 SRC/psgssvx3d.c                    | 1586 ++++++++++++++++
 SRC/psgssvx_ABglobal.c             | 1112 +++++++++++
 SRC/psgstrf.c                      | 2006 ++++++++++++++++++++
 SRC/psgstrf2.c                     |  921 +++++++++
 SRC/psgstrf3d.c                    |  375 ++++
 SRC/psgstrs.c                      | 2400 +++++++++++++++++++++++
 SRC/psgstrs1.c                     |  910 +++++++++
 SRC/psgstrs_Bglobal.c              | 1040 ++++++++++
 SRC/psgstrs_lsum.c                 | 2138 +++++++++++++++++++++
 SRC/pslangs.c                      |  145 ++
 SRC/pslaqgs.c                      |  151 ++
 SRC/pssymbfact_distdata.c          | 2831 ++++++++++++++++++++++++++++
 SRC/psutil.c                       |  868 +++++++++
 SRC/sSchCompUdt-2Ddynamic.c        |  714 +++++++
 SRC/sSchCompUdt-cuda.c             |  589 ++++++
 SRC/sbinary_io.c                   |   42 +
 SRC/scommunication_aux.c           |  504 +++++
 SRC/sdistribute.c                  | 1652 ++++++++++++++++
 SRC/sgather.c                      |  394 ++++
 SRC/sgsequ_dist.c                  |  204 ++
 SRC/slangs_dist.c                  |  130 ++
 SRC/slaqgs_dist.c                  |  154 ++
 SRC/sldperm_dist.c                 |  175 ++
 SRC/slook_ahead_update.c           |  278 +++
 SRC/slustruct_gpu.h                |  240 +++
 SRC/smemory_dist.c                 |  286 +++
 SRC/smyblas2_dist.c                |  248 +++
 SRC/snrformat_loc3d.c              |  315 ++++
 SRC/sreadMM.c                      |  244 +++
 SRC/sreadhb.c                      |  389 ++++
 SRC/sreadrb.c                      |  346 ++++
 SRC/sreadtriple.c                  |  180 ++
 SRC/sreadtriple_noheader.c         |  199 ++
 SRC/sscatter.c                     |  524 +++++
 SRC/sscatter3d.c                   |  625 ++++++
 SRC/ssp_blas2_dist.c               |  501 +++++
 SRC/ssp_blas3_dist.c               |  138 ++
 SRC/sstatic_schedule.c             |  984 ++++++++++
 SRC/ssuperlu_blas.c                |  123 ++
 SRC/ssuperlu_gpu.cu                | 1788 ++++++++++++++++++
 SRC/streeFactorization.c           |  746 ++++++++
 SRC/streeFactorizationGPU.c        |  735 ++++++++
 SRC/strfAux.c                      |  758 ++++++++
 SRC/strfCommWrapper.c              |  534 ++++++
 SRC/superlu_FortranCInterface.h    |    8 +-
 SRC/superlu_dist_config.h          |   20 -
 SRC/superlu_gpu_utils.cu           |   14 +
 SRC/sutil_dist.c                   |  945 ++++++++++
 SRC/treeFactorizationGPU.c         |    5 -
 77 files changed, 43428 insertions(+), 165 deletions(-)
 create mode 100644 EXAMPLE/psdrive.c
 create mode 100644 EXAMPLE/psdrive1.c
 create mode 100644 EXAMPLE/psdrive1_ABglobal.c
 create mode 100644 EXAMPLE/psdrive2.c
 create mode 100644 EXAMPLE/psdrive2_ABglobal.c
 create mode 100644 EXAMPLE/psdrive3.c
 create mode 100644 EXAMPLE/psdrive3_ABglobal.c
 create mode 100644 EXAMPLE/psdrive3d.c
 create mode 100644 EXAMPLE/psdrive4.c
 create mode 100644 EXAMPLE/psdrive4_ABglobal.c
 create mode 100644 EXAMPLE/psdrive_ABglobal.c
 create mode 100644 EXAMPLE/screate_matrix.c
 create mode 100644 EXAMPLE/screate_matrix3d.c
 create mode 100644 EXAMPLE/screate_matrix_perturbed.c
 create mode 100644 SRC/ps3dcomm.c
 create mode 100644 SRC/psGetDiagU.c
 create mode 100644 SRC/psdistribute.c
 create mode 100644 SRC/psgsequ.c
 create mode 100644 SRC/psgsmv.c
 create mode 100644 SRC/psgsmv_AXglobal.c
 create mode 100644 SRC/psgsrfs.c
 create mode 100644 SRC/psgsrfs_ABXglobal.c
 create mode 100644 SRC/psgssvx.c
 create mode 100644 SRC/psgssvx3d.c
 create mode 100644 SRC/psgssvx_ABglobal.c
 create mode 100644 SRC/psgstrf.c
 create mode 100644 SRC/psgstrf2.c
 create mode 100644 SRC/psgstrf3d.c
 create mode 100644 SRC/psgstrs.c
 create mode 100644 SRC/psgstrs1.c
 create mode 100644 SRC/psgstrs_Bglobal.c
 create mode 100644 SRC/psgstrs_lsum.c
 create mode 100644 SRC/pslangs.c
 create mode 100644 SRC/pslaqgs.c
 create mode 100644 SRC/pssymbfact_distdata.c
 create mode 100644 SRC/psutil.c
 create mode 100644 SRC/sSchCompUdt-2Ddynamic.c
 create mode 100644 SRC/sSchCompUdt-cuda.c
 create mode 100644 SRC/sbinary_io.c
 create mode 100644 SRC/scommunication_aux.c
 create mode 100644 SRC/sdistribute.c
 create mode 100644 SRC/sgather.c
 create mode 100644 SRC/sgsequ_dist.c
 create mode 100644 SRC/slangs_dist.c
 create mode 100644 SRC/slaqgs_dist.c
 create mode 100644 SRC/sldperm_dist.c
 create mode 100644 SRC/slook_ahead_update.c
 create mode 100644 SRC/slustruct_gpu.h
 create mode 100644 SRC/smemory_dist.c
 create mode 100644 SRC/smyblas2_dist.c
 create mode 100644 SRC/snrformat_loc3d.c
 create mode 100644 SRC/sreadMM.c
 create mode 100644 SRC/sreadhb.c
 create mode 100644 SRC/sreadrb.c
 create mode 100644 SRC/sreadtriple.c
 create mode 100644 SRC/sreadtriple_noheader.c
 create mode 100644 SRC/sscatter.c
 create mode 100644 SRC/sscatter3d.c
 create mode 100644 SRC/ssp_blas2_dist.c
 create mode 100644 SRC/ssp_blas3_dist.c
 create mode 100644 SRC/sstatic_schedule.c
 create mode 100644 SRC/ssuperlu_blas.c
 create mode 100644 SRC/ssuperlu_gpu.cu
 create mode 100644 SRC/streeFactorization.c
 create mode 100644 SRC/streeFactorizationGPU.c
 create mode 100644 SRC/strfAux.c
 create mode 100644 SRC/strfCommWrapper.c
 delete mode 100644 SRC/superlu_dist_config.h
 create mode 100644 SRC/superlu_gpu_utils.cu
 create mode 100644 SRC/sutil_dist.c

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d5b49650..0f13a12a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -20,6 +20,7 @@ list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
 # Set up options
 option(enable_doc       "Build doxygen documentation" OFF)
 option(enable_double    "Enable double precision library" ON)
+option(enable_single    "Enable single precision library" OFF)
 option(enable_complex16 "Enable complex16 precision library" OFF)
 option(enable_tests  "Build tests" ON)
 option(enable_examples  "Build examples" ON)
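
Note that enable_single defaults to OFF, so the single-precision library and the psdrive* examples below are only built when a configure run turns it on explicitly, e.g. cmake -Denable_single=ON (standard CMake option syntax; the cache variable is the one declared by the option() line above).
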
diff --git a/EXAMPLE/CMakeLists.txt b/EXAMPLE/CMakeLists.txt
index d94fd05c..59ca46fd 100644
--- a/EXAMPLE/CMakeLists.txt
+++ b/EXAMPLE/CMakeLists.txt
@@ -86,8 +86,37 @@ if(enable_double)
   target_link_libraries(pddrive_spawn ${all_link_libs})
 
 
-endif()
+endif() #### end enable_double
+
+if(enable_single)
+  set(SEXM psdrive.c screate_matrix.c)
+  add_executable(psdrive ${SEXM})
+  target_link_libraries(psdrive ${all_link_libs})
+
+  set(SEXM1 psdrive1.c screate_matrix.c)
+  add_executable(psdrive1 ${SEXM1})
+  target_link_libraries(psdrive1 ${all_link_libs})
+  add_superlu_dist_example(psdrive1 big.rua 2 2)
+
+  set(SEXM2 psdrive2.c screate_matrix.c screate_matrix_perturbed.c)
+  add_executable(psdrive2 ${SEXM2})
+  target_link_libraries(psdrive2 ${all_link_libs})
+  add_superlu_dist_example(psdrive2 big.rua 2 2)
+
+  set(SEXM3 psdrive3.c screate_matrix.c)
+  add_executable(psdrive3 ${SEXM3})
+  target_link_libraries(psdrive3 ${all_link_libs})
+  add_superlu_dist_example(psdrive3 big.rua 2 2)
+
+  set(SEXM4 psdrive4.c screate_matrix.c)
+  add_executable(psdrive4 ${SEXM4})
+  target_link_libraries(psdrive4 ${all_link_libs})
+
+  set(SEXM3D psdrive3d.c screate_matrix.c screate_matrix3d.c)
+  add_executable(psdrive3d ${SEXM3D})
+  target_link_libraries(psdrive3d ${all_link_libs})
 
+endif() #### end enable_single
 
 if(enable_complex16)
 
diff --git a/EXAMPLE/psdrive.c b/EXAMPLE/psdrive.c
new file mode 100644
index 00000000..b34fcefc
--- /dev/null
+++ b/EXAMPLE/psdrive.c
@@ -0,0 +1,287 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file 
+ * \brief Driver program for PSGSSVX example
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 6.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * November 1, 2007
+ * December 6, 2018
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_sdefs.h"
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * The driver program PSDRIVE.
+ *
+ * This example illustrates how to use PSGSSVX with the full
+ * (default) options to solve a linear system.
+ * 
+ * Five basic steps are required:
+ *   1. Initialize the MPI environment and the SuperLU process grid
+ *   2. Set up the input matrix and the right-hand side
+ *   3. Set the options argument
+ *   4. Call psgssvx
+ *   5. Release the process grid and terminate the MPI environment
+ *
+ * With MPICH,  program may be run by typing:
+ *    mpiexec -n <np> psdrive -r <proc rows> -c <proc columns> big.rua
+ * </pre>
+ */
+
+int main(int argc, char *argv[])
+{
+    superlu_dist_options_t options;
+    SuperLUStat_t stat;
+    SuperMatrix A;
+    sScalePermstruct_t ScalePermstruct;
+    sLUstruct_t LUstruct;
+    sSOLVEstruct_t SOLVEstruct;
+    gridinfo_t grid;
+    float *berr;
+    float *b, *xtrue;
+    int m, n;
+    int nprow, npcol;
+    int iam, info, ldb, ldx, nrhs;
+    char **cpp, c, *postfix;;
+    FILE *fp, *fopen();
+    int cpp_defs();
+    int ii, omp_mpi_level;
+
+    nprow = 1;  /* Default process rows. */
+    npcol = 1;  /* Default process columns. */
+    nrhs = 1;   /* Number of right-hand side. */
+
+    /* ------------------------------------------------------------
+       INITIALIZE MPI ENVIRONMENT.
+       ------------------------------------------------------------*/
+    //MPI_Init( &argc, &argv );
+    MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level);
+
+#if ( VAMPIR>=1 )
+    VT_traceoff();
+#endif
+
+#if ( VTUNE>=1 )
+    __itt_pause();
+#endif
+
+    /* Parse command line argv[]. */
+    for (cpp = argv+1; *cpp; ++cpp) {
+        if ( **cpp == '-' ) {
+            c = *(*cpp+1);
+            ++cpp;
+            switch (c) {
+              case 'h':
+                  printf("Options:\n");
+                  printf("\t-r <int>: process rows (default %4d)\n", nprow);
+                  printf("\t-c <int>: process columns (default %4d)\n", npcol);
+                  exit(0);
+                  break;
+              case 'r': nprow = atoi(*cpp);
+                        break;
+              case 'c': npcol = atoi(*cpp);
+                        break;
+            }
+        } else { /* Last arg is considered a filename */
+            if ( !(fp = fopen(*cpp, "r")) ) {
+                ABORT("File does not exist");
+            }
+            break;
+        }
+    }
+
+    /* ------------------------------------------------------------
+       INITIALIZE THE SUPERLU PROCESS GRID.
+       ------------------------------------------------------------*/
+    superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid);
+
+    if(grid.iam==0){
+        MPI_Query_thread(&omp_mpi_level);
+        switch (omp_mpi_level) {
+          case MPI_THREAD_SINGLE:
+              printf("MPI_Query_thread with MPI_THREAD_SINGLE\n");
+              fflush(stdout);
+              break;
+          case MPI_THREAD_FUNNELED:
+              printf("MPI_Query_thread with MPI_THREAD_FUNNELED\n");
+              fflush(stdout);
+              break;
+          case MPI_THREAD_SERIALIZED:
+              printf("MPI_Query_thread with MPI_THREAD_SERIALIZED\n");
+              fflush(stdout);
+              break;
+          case MPI_THREAD_MULTIPLE:
+              printf("MPI_Query_thread with MPI_THREAD_MULTIPLE\n");
+              fflush(stdout);
+              break;
+        }
+    }
+
+    /* Bail out if I do not belong in the grid. */
+    iam = grid.iam;
+    if ( iam == -1 ) goto out;
+    if ( !iam ) {
+        int v_major, v_minor, v_bugfix;
+#ifdef __INTEL_COMPILER
+        printf("__INTEL_COMPILER is defined\n");
+#endif
+        printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__);
+
+        superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix);
+        printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix);
+
+        printf("Input matrix file:\t%s\n", *cpp);
+        printf("Process grid:\t\t%d X %d\n", (int)grid.nprow, (int)grid.npcol);
+        fflush(stdout);
+    }
+
+#if ( VAMPIR>=1 )
+    VT_traceoff();
+#endif
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter main()");
+#endif
+
+    for(ii = 0;ii<strlen(*cpp);ii++){
+        if((*cpp)[ii]=='.'){
+            postfix = &((*cpp)[ii+1]);
+        }
+    }
+
+    /* ------------------------------------------------------------
+       GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE.
+       ------------------------------------------------------------*/
+    screate_matrix_postfix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, postfix, &grid);
+
+    if ( !(berr = floatMalloc_dist(nrhs)) )
+        ABORT("Malloc fails for berr[].");
+
+    /* ------------------------------------------------------------
+       NOW WE SOLVE THE LINEAR SYSTEM.
+       ------------------------------------------------------------*/
+
+    /* Set the default input options:
+       options.Fact = DOFACT;
+       options.Equil = YES;
+       options.ColPerm = METIS_AT_PLUS_A;
+       options.RowPerm = LargeDiag_MC64;
+       options.ReplaceTinyPivot = NO;
+       options.Trans = NOTRANS;
+       options.IterRefine = DOUBLE;
+       options.SolveInitialized = NO;
+       options.RefineInitialized = NO;
+       options.PrintStat = YES;
+     */
+    set_default_options_dist(&options);
+
+    if (!iam) {
+        print_sp_ienv_dist(&options);
+        print_options_dist(&options);
+        fflush(stdout);
+    }
+
+    m = A.nrow;
+    n = A.ncol;
+
+    /* Initialize ScalePermstruct and LUstruct. */
+    sScalePermstructInit(m, n, &ScalePermstruct);
+    sLUstructInit(n, &LUstruct);
+
+    /* Initialize the statistics variables. */
+    PStatInit(&stat);
+
+    /* Call the linear equation solver. */
+    psgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
+            &LUstruct, &SOLVEstruct, berr, &stat, &info);
+
+    /* Check the accuracy of the solution. */
+    psinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
+                     nrhs, b, ldb, xtrue, ldx, grid.comm);
+
+    PStatPrint(&options, &stat, &grid); /* Print the statistics. */
+
+    /* ------------------------------------------------------------
+       DEALLOCATE STORAGE.
+       ------------------------------------------------------------*/
+
+    PStatFree(&stat);
+    Destroy_CompRowLoc_Matrix_dist(&A);
+    sScalePermstructFree(&ScalePermstruct);
+    sDestroy_LU(n, &grid, &LUstruct);
+    sLUstructFree(&LUstruct);
+    if ( options.SolveInitialized ) {
+        sSolveFinalize(&options, &SOLVEstruct);
+    }
+    SUPERLU_FREE(b);
+    SUPERLU_FREE(xtrue);
+    SUPERLU_FREE(berr);
+    fclose(fp);
+
+    /* ------------------------------------------------------------
+       RELEASE THE SUPERLU PROCESS GRID.
+       ------------------------------------------------------------*/
+out:
+    superlu_gridexit(&grid);
+
+    /* ------------------------------------------------------------
+       TERMINATES THE MPI EXECUTION ENVIRONMENT.
+       ------------------------------------------------------------*/
+    MPI_Finalize();
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit main()");
+#endif
+
+}
+
+
+int cpp_defs()
+{
+    printf(".. CPP definitions:\n");
+#if ( PRNTlevel>=1 )
+    printf("\tPRNTlevel = %d\n", PRNTlevel);
+#endif
+#if ( DEBUGlevel>=1 )
+    printf("\tDEBUGlevel = %d\n", DEBUGlevel);
+#endif
+#if ( PROFlevel>=1 )
+    printf("\tPROFlevel = %d\n", PROFlevel);
+#endif
+#if ( StaticPivot>=1 )
+    printf("\tStaticPivot = %d\n", StaticPivot);
+#endif
+    printf("....\n");
+    return 0;
+}
diff --git a/EXAMPLE/psdrive1.c b/EXAMPLE/psdrive1.c
new file mode 100644
index 00000000..6d77ec69
--- /dev/null
+++ b/EXAMPLE/psdrive1.c
@@ -0,0 +1,307 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Driver program for PSGSSVX example
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ * April 5, 2015
+ * January 4 2020
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_sdefs.h"
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * The driver program PSDRIVE1.
+ *
+ * This example illustrates how to use PSGSSVX to
+ * solve systems with the same A but different right-hand side,
+ * possibly with different number of right-hand sides.
+ * In this case, we factorize A only once in the first call to
+ * PSGSSVX, and reuse the following data structures
+ * in the subsequent call to PSGSSVX:
+ *        ScalePermstruct  : DiagScale, R, C, perm_r, perm_c
+ *        LUstruct         : Glu_persist, Llu
+ * 
+ * With MPICH,  program may be run by typing:
+ *    mpiexec -n <np> psdrive1 -r <proc rows> -c <proc columns> big.rua
+ * </pre>
+ */
+int main(int argc, char *argv[])
+{
+    superlu_dist_options_t options;
+    SuperLUStat_t stat;
+    SuperMatrix A;
+    sScalePermstruct_t ScalePermstruct;
+    sLUstruct_t LUstruct;
+    sSOLVEstruct_t SOLVEstruct;
+    gridinfo_t grid;
+    float *berr;
+    float *b, *xtrue, *b1, *b2;
+    int i, j, m, n, m_loc;
+    int nprow, npcol;
+    int iam, info, ldb, ldx, nrhs;
+    char **cpp, c, *postfix;
+    int ii, omp_mpi_level;
+    FILE *fp, *fopen();
+    int cpp_defs();
+
+    nprow = 1;  /* Default process rows. */
+    npcol = 1;  /* Default process columns. */
+    nrhs = 3;   /* Max. number of right-hand sides. */
+
+    /* ------------------------------------------------------------
+       INITIALIZE MPI ENVIRONMENT.
+       ------------------------------------------------------------*/
+    MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level);
+
+    /* Parse command line argv[]. */
+    for (cpp = argv+1; *cpp; ++cpp) {
+        if ( **cpp == '-' ) {
+            c = *(*cpp+1);
+            ++cpp;
+            switch (c) {
+              case 'h':
+                  printf("Options:\n");
+                  printf("\t-r <int>: process rows (default %d)\n", nprow);
+                  printf("\t-c <int>: process columns (default %d)\n", npcol);
+                  exit(0);
+                  break;
+              case 'r': nprow = atoi(*cpp);
+                        break;
+              case 'c': npcol = atoi(*cpp);
+                        break;
+            }
+        } else { /* Last arg is considered a filename */
+            if ( !(fp = fopen(*cpp, "r")) ) {
+                ABORT("File does not exist");
+            }
+            break;
+        }
+    }
+
+    /* ------------------------------------------------------------
+       INITIALIZE THE SUPERLU PROCESS GRID.
+       ------------------------------------------------------------*/
+    superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid);
+
+    /* Bail out if I do not belong in the grid. */
+    iam = grid.iam;
+    if ( iam == -1 ) goto out;
+    if ( !iam ) {
+        int v_major, v_minor, v_bugfix;
+#ifdef __INTEL_COMPILER
+        printf("__INTEL_COMPILER is defined\n");
+#endif
+        printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__);
+
+        superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix);
+        printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix);
+
+        printf("Input matrix file:\t%s\n", *cpp);
+        printf("Process grid:\t\t%d X %d\n", (int)grid.nprow, (int)grid.npcol);
+        fflush(stdout);
+    }
+
+#if ( VAMPIR>=1 )
+    VT_traceoff();
+#endif
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter main()");
+#endif
+
+    for(ii = 0;ii<strlen(*cpp);ii++){
+        if((*cpp)[ii]=='.'){
+            postfix = &((*cpp)[ii+1]);
+        }
+    }
+
+    /* ------------------------------------------------------------
+       GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE.
+       ------------------------------------------------------------*/
+    screate_matrix_postfix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, postfix, &grid);
+
+    if ( !(b1 = floatMalloc_dist(ldb * nrhs)) ) ABORT("Malloc fails for b1[]");
+    if ( !(b2 = floatMalloc_dist(ldb * nrhs)) ) ABORT("Malloc fails for b2[]");
+    for (j = 0; j < nrhs; ++j)
+        for (i = 0; i < ldb; ++i) {
+            b1[i+j*ldb] = b[i+j*ldb];
+            b2[i+j*ldb] = b[i+j*ldb];
+        }
+
+    if ( !(berr = floatMalloc_dist(nrhs)) ) ABORT("Malloc fails for berr[].");
+
+    m = A.nrow;
+    n = A.ncol;
+    m_loc = ((NRformat_loc *)A.Store)->m_loc;
+
+    /* ------------------------------------------------------------
+       1. SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME, WITH 1 RHS.
+       ------------------------------------------------------------*/
+
+    /* Set the default input options:
+       options.Fact = DOFACT;
+       options.Equil = YES;
+       options.ColPerm = METIS_AT_PLUS_A;
+       options.RowPerm = LargeDiag_MC64;
+       options.ReplaceTinyPivot = NO;
+       options.Trans = NOTRANS;
+       options.IterRefine = DOUBLE;
+       options.SolveInitialized = NO;
+       options.RefineInitialized = NO;
+       options.PrintStat = YES;
+     */
+    set_default_options_dist(&options);
+
+    if (!iam) {
+        print_sp_ienv_dist(&options);
+        print_options_dist(&options);
+        fflush(stdout);
+    }
+
+    /* Initialize ScalePermstruct and LUstruct. */
+    sScalePermstructInit(m, n, &ScalePermstruct);
+    sLUstructInit(n, &LUstruct);
+
+    /* Initialize the statistics variables. */
+    PStatInit(&stat);
+
+    /* Call the linear equation solver. */
+    nrhs = 1;
+    psgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
+            &LUstruct, &SOLVEstruct, berr, &stat, &info);
+
+
+    /* Check the accuracy of the solution. */
+    if ( !iam ) printf("\tSolve the first system:\n");
+    psinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, grid.comm);
+
+    PStatPrint(&options, &stat, &grid); /* Print the statistics. */
+    PStatFree(&stat);
+
+    /* ------------------------------------------------------------
+       2. NOW SOLVE ANOTHER SYSTEM WITH THE SAME A BUT DIFFERENT
+       RIGHT-HAND SIDE, WE WILL USE THE EXISTING L AND U FACTORS IN
+       LUSTRUCT OBTAINED FROM A PREVIOUS FATORIZATION.
+       ------------------------------------------------------------*/
+    options.Fact = FACTORED; /* Indicate the factored form of A is supplied. */
+    PStatInit(&stat); /* Initialize the statistics variables. */
+
+    nrhs = 1;
+    psgssvx(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid,
+            &LUstruct, &SOLVEstruct, berr, &stat, &info);
+
+    /* Check the accuracy of the solution. */
+    if ( !iam ) printf("\tSolve the system with a different B:\n");
+    psinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue, ldx, grid.comm);
+
+    PStatPrint(&options, &stat, &grid); /* Print the statistics. */
+    PStatFree(&stat);
+
+    /* ------------------------------------------------------------
+       3. SOLVE ANOTHER SYSTEM WITH THE SAME A BUT DIFFERENT
+       NUMBER OF RIGHT-HAND SIDES, WE WILL USE THE EXISTING L AND U
+       FACTORS IN LUSTRUCT OBTAINED FROM A PREVIOUS FATORIZATION.
+       ------------------------------------------------------------*/
+    options.Fact = FACTORED; /* Indicate the factored form of A is supplied. */
+    PStatInit(&stat); /* Initialize the statistics variables. */
+
+    nrhs = 3;
+
+    /* When changing the number of RHS's, the following counters
+       for communication messages must be reset. */
+    pxgstrs_comm_t *gstrs_comm = SOLVEstruct.gstrs_comm;
+    SUPERLU_FREE(gstrs_comm->B_to_X_SendCnt);
+    SUPERLU_FREE(gstrs_comm->X_to_B_SendCnt);
+    SUPERLU_FREE(gstrs_comm->ptr_to_ibuf);
+    psgstrs_init(n, m_loc, nrhs, ((NRformat_loc *)A.Store)->fst_row,
+                 ScalePermstruct.perm_r, ScalePermstruct.perm_c, &grid,
+                 LUstruct.Glu_persist, &SOLVEstruct);
+
+    psgssvx(&options, &A, &ScalePermstruct, b2, ldb, nrhs, &grid,
+            &LUstruct, &SOLVEstruct, berr, &stat, &info);
+
+    /* Check the accuracy of the solution. */
+    if ( !iam ) printf("\tSolve the system with 3 RHS's:\n");
+    psinf_norm_error(iam, m_loc, nrhs, b2, ldb, xtrue, ldx, grid.comm);
+
+    PStatPrint(&options, &stat, &grid); /* Print the statistics. */
+    PStatFree(&stat);
+
+    /* ------------------------------------------------------------
+       DEALLOCATE STORAGE.
+       ------------------------------------------------------------*/
+    Destroy_CompRowLoc_Matrix_dist(&A);
+    sScalePermstructFree(&ScalePermstruct);
+    sDestroy_LU(n, &grid, &LUstruct);
+    sLUstructFree(&LUstruct);
+    if ( options.SolveInitialized ) {
+        sSolveFinalize(&options, &SOLVEstruct);
+    }
+    SUPERLU_FREE(b);
+    SUPERLU_FREE(b1);
+    SUPERLU_FREE(b2);
+    SUPERLU_FREE(xtrue);
+    SUPERLU_FREE(berr);
+    fclose(fp);
+
+    /* ------------------------------------------------------------
+       RELEASE THE SUPERLU PROCESS GRID.
+       ------------------------------------------------------------*/
+out:
+    superlu_gridexit(&grid);
+
+    /* ------------------------------------------------------------
+       TERMINATES THE MPI EXECUTION ENVIRONMENT.
+       ------------------------------------------------------------*/
+    MPI_Finalize();
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit main()");
+#endif
+
+}
+
+
+int cpp_defs()
+{
+    printf(".. CPP definitions:\n");
+#if ( PRNTlevel>=1 )
+    printf("\tPRNTlevel = %d\n", PRNTlevel);
+#endif
+#if ( DEBUGlevel>=1 )
+    printf("\tDEBUGlevel = %d\n", DEBUGlevel);
+#endif
+#if ( PROFlevel>=1 )
+    printf("\tPROFlevel = %d\n", PROFlevel);
+#endif
+#if ( StaticPivot>=1 )
+    printf("\tStaticPivot = %d\n", StaticPivot);
+#endif
+    printf("....\n");
+    return 0;
+}
diff --git a/EXAMPLE/psdrive1_ABglobal.c b/EXAMPLE/psdrive1_ABglobal.c
new file mode 100644
index 00000000..2ae6fedb
--- /dev/null
+++ b/EXAMPLE/psdrive1_ABglobal.c
@@ -0,0 +1,281 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Driver program for psgssvx_ABglobal example
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * April 5, 2015
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_sdefs.h"
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * The driver program psdrive1_ABglobal.
+ *
+ * This example illustrates how to use psgssvx_ABglobal to
+ * solve systems with the same A but different right-hand side.
+ * In this case, we factorize A only once in the first call to
+ * psgssvx_ABglobal, and reuse the following data structures
+ * in the subsequent call to psgssvx_ABglobal:
+ *        ScalePermstruct  : DiagScale, R, C, perm_r, perm_c
+ *        LUstruct         : Glu_persist, Llu
+ *
+ * On an IBM SP, the program may be run by typing:
+ *    poe psdrive1_ABglobal -r <proc rows> -c <proc columns> <input file> -procs <p>
+ *
+ * </pre>
+ */
+
+int main(int argc, char *argv[])
+{
+    superlu_dist_options_t options;
+    SuperLUStat_t stat;
+    SuperMatrix A;
+    sScalePermstruct_t ScalePermstruct;
+    sLUstruct_t LUstruct;
+    gridinfo_t grid;
+    float *berr;
+    float *a, *b, *b1, *xtrue;
+    int_t *asub, *xa;
+    int_t i, j, m, n, nnz;
+    int_t nprow, npcol;
+    int iam, info, ldb, ldx, nrhs;
+    char trans[1];
+    char **cpp, c;
+    FILE *fp, *fopen();
+    extern int cpp_defs();
+
+    nprow = 1;  /* Default process rows. */
+    npcol = 1;  /* Default process columns. */
+    nrhs = 1;   /* Number of right-hand side. */
+
+    /* ------------------------------------------------------------
+       INITIALIZE MPI ENVIRONMENT.
+       ------------------------------------------------------------*/
+    MPI_Init( &argc, &argv );
+
+    /* Parse command line argv[]. */
+    for (cpp = argv+1; *cpp; ++cpp) {
+        if ( **cpp == '-' ) {
+            c = *(*cpp+1);
+            ++cpp;
+            switch (c) {
+              case 'h':
+                  printf("Options:\n");
+                  printf("\t-r <int>: process rows (default " IFMT ")\n", nprow);
+                  printf("\t-c <int>: process columns (default " IFMT ")\n", npcol);
+                  exit(0);
+                  break;
+              case 'r': nprow = atoi(*cpp);
+                        break;
+              case 'c': npcol = atoi(*cpp);
+                        break;
+            }
+        } else { /* Last arg is considered a filename */
+            if ( !(fp = fopen(*cpp, "r")) ) {
+                ABORT("File does not exist");
+            }
+            break;
+        }
+    }
+
+    /* ------------------------------------------------------------
+       INITIALIZE THE SUPERLU PROCESS GRID.
+       ------------------------------------------------------------*/
+    superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid);
+
+    /* Bail out if I do not belong in the grid. */
+    iam = grid.iam;
+    if ( iam == -1 )
+        goto out;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter main()");
+#endif
+
+    /* ------------------------------------------------------------
+       PROCESS 0 READS THE MATRIX A, AND THEN BROADCASTS IT TO ALL
+       THE OTHER PROCESSES.
+       ------------------------------------------------------------*/
+    if ( !iam ) {
+        /* Print the CPP definitions. */
+        cpp_defs();
+
+        /* Read the matrix stored on disk in Harwell-Boeing format. */
+        sreadhb_dist(iam, fp, &m, &n, &nnz, &a, &asub, &xa);
+
+        printf("Input matrix file: %s\n", *cpp);
+        printf("\tDimension\t" IFMT "x" IFMT "\t # nonzeros " IFMT "\n", m, n, nnz);
+        printf("\tProcess grid\t%d X %d\n", (int) grid.nprow, (int) grid.npcol);
+
+        /* Broadcast matrix A to the other PEs. */
+        MPI_Bcast( &m,   1,   mpi_int_t, 0, grid.comm );
+        MPI_Bcast( &n,   1,   mpi_int_t, 0, grid.comm );
+        MPI_Bcast( &nnz, 1,   mpi_int_t, 0, grid.comm );
+        MPI_Bcast( a,    nnz, MPI_FLOAT, 0, grid.comm );
+        MPI_Bcast( asub, nnz, mpi_int_t, 0, grid.comm );
+        MPI_Bcast( xa,   n+1, mpi_int_t, 0, grid.comm );
+    } else {
+        /* Receive matrix A from PE 0. */
+        MPI_Bcast( &m,   1,   mpi_int_t, 0, grid.comm );
+        MPI_Bcast( &n,   1,   mpi_int_t, 0, grid.comm );
+        MPI_Bcast( &nnz, 1,   mpi_int_t, 0, grid.comm );
+
+        /* Allocate storage for compressed column representation. */
+        sallocateA_dist(n, nnz, &a, &asub, &xa);
+
+        MPI_Bcast( a,    nnz, MPI_FLOAT, 0, grid.comm );
+        MPI_Bcast( asub, nnz, mpi_int_t, 0, grid.comm );
+        MPI_Bcast( xa,   n+1, mpi_int_t, 0, grid.comm );
+    }
+
+    /* Create compressed column matrix for A. */
+    sCreate_CompCol_Matrix_dist(&A, m, n, nnz, a, asub, xa,
+                                SLU_NC, SLU_S, SLU_GE);
+
+    /* Generate the exact solution and compute the right-hand side. */
+    if ( !(b = floatMalloc_dist(m * nrhs)) ) ABORT("Malloc fails for b[]");
+    if ( !(b1 = floatMalloc_dist(m * nrhs)) ) ABORT("Malloc fails for b1[]");
+    if ( !(xtrue = floatMalloc_dist(n*nrhs)) ) ABORT("Malloc fails for xtrue[]");
+    *trans = 'N';
+    ldx = n;
+    ldb = m;
+    sGenXtrue_dist(n, nrhs, xtrue, ldx);
+    sFillRHS_dist(trans, nrhs, xtrue, ldx, &A, b, ldb);
+    for (j = 0; j < nrhs; ++j)
+        for (i = 0; i < m; ++i) b1[i+j*ldb] = b[i+j*ldb];
+
+    if ( !(berr = floatMalloc_dist(nrhs)) )
+        ABORT("Malloc fails for berr[].");
+
+    /* ------------------------------------------------------------
+       WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME.
+       ------------------------------------------------------------*/
+
+    /* Set the default input options:
+       options.Fact = DOFACT;
+       options.Equil = YES;
+       options.ColPerm = METIS_AT_PLUS_A;
+       options.RowPerm = LargeDiag_MC64;
+       options.ReplaceTinyPivot = YES;
+       options.Trans = NOTRANS;
+       options.IterRefine = DOUBLE;
+       options.SolveInitialized = NO;
+       options.RefineInitialized = NO;
+       options.PrintStat = YES;
+     */
+    set_default_options_dist(&options);
+
+    if (!iam) {
+        print_sp_ienv_dist(&options);
+        print_options_dist(&options);
+    }
+
+    /* Initialize ScalePermstruct and LUstruct. */
+    sScalePermstructInit(m, n, &ScalePermstruct);
+    sLUstructInit(n, &LUstruct);
+
+    /* Initialize the statistics variables. */
+    PStatInit(&stat);
+
+    /* Call the linear equation solver: factorize and solve. */
+    psgssvx_ABglobal(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
+                     &LUstruct, berr, &stat, &info);
+
+    /* Check the accuracy of the solution. */
+    if ( !iam ) {
+        sinf_norm_error_dist(n, nrhs, b, ldb, xtrue, ldx, &grid);
+    }
+
+    PStatPrint(&options, &stat, &grid); /* Print the statistics. */
+    PStatFree(&stat);
+
+    /* ------------------------------------------------------------
+       NOW WE SOLVE ANOTHER SYSTEM WITH THE SAME A BUT DIFFERENT
+       RIGHT-HAND SIDE, WE WILL USE THE EXISTING L AND U FACTORS IN
+       LUSTRUCT OBTAINED FROM A PREVIOUS FATORIZATION.
+       ------------------------------------------------------------*/
+    options.Fact = FACTORED; /* Indicate the factored form of A is supplied. */
+    PStatInit(&stat);        /* Initialize the statistics variables. */
+
+    psgssvx_ABglobal(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid,
+                     &LUstruct, berr, &stat, &info);
+
+    /* Check the accuracy of the solution. */
+    if ( !iam ) {
+        printf("Solve the system with a different B.\n");
+        sinf_norm_error_dist(n, nrhs, b1, ldb, xtrue, ldx, &grid);
+    }
+
+    /* Print the statistics. */
+    PStatPrint(&options, &stat, &grid);
+
+    /* ------------------------------------------------------------
+       DEALLOCATE STORAGE.
+       ------------------------------------------------------------*/
+    PStatFree(&stat);
+    Destroy_CompCol_Matrix_dist(&A);
+    sDestroy_LU(n, &grid, &LUstruct);
+    sScalePermstructFree(&ScalePermstruct);
+    sLUstructFree(&LUstruct);
+    SUPERLU_FREE(b);
+    SUPERLU_FREE(b1);
+    SUPERLU_FREE(xtrue);
+    SUPERLU_FREE(berr);
+    fclose(fp);
+
+    /* ------------------------------------------------------------
+       RELEASE THE SUPERLU PROCESS GRID.
+       ------------------------------------------------------------*/
+out:
+    superlu_gridexit(&grid);
+
+    /* ------------------------------------------------------------
+       TERMINATES THE MPI EXECUTION ENVIRONMENT.
+       ------------------------------------------------------------*/
+    MPI_Finalize();
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit main()");
+#endif
+
+}
+
+
+int cpp_defs()
+{
+    printf(".. CPP definitions:\n");
+#if ( PRNTlevel>=1 )
+    printf("\tPRNTlevel = %d\n", PRNTlevel);
+#endif
+#if ( DEBUGlevel>=1 )
+    printf("\tDEBUGlevel = %d\n", DEBUGlevel);
+#endif
+#if ( PROFlevel>=1 )
+    printf("\tPROFlevel = %d\n", PROFlevel);
+#endif
+#if ( StaticPivot>=1 )
+    printf("\tStaticPivot = %d\n", StaticPivot);
+#endif
+    printf("....\n");
+    return 0;
+}
diff --git a/EXAMPLE/psdrive2.c b/EXAMPLE/psdrive2.c
new file mode 100644
index 00000000..6b5be490
--- /dev/null
+++ b/EXAMPLE/psdrive2.c
@@ -0,0 +1,296 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Driver program for PSGSSVX example
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 6.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ * April 5, 2015
+ * December 31, 2016 version 5.1.3
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_sdefs.h"
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * The driver program PSDRIVE2.
+ *
+ * This example illustrates how to use PSGSSVX to solve
+ * systems repeatedly with the same sparsity pattern of matrix A.
+ * In this case, the column permutation vector ScalePermstruct->perm_c is
+ * computed once. The following data structures will be reused in the
+ * subsequent call to PSGSSVX:
+ *        ScalePermstruct : perm_c
+ *        LUstruct        : etree
+ *
+ * With MPICH,  program may be run by typing:
+ *    mpiexec -n <np> psdrive2 -r <proc rows> -c <proc columns> g20.rua
+ * </pre>
+ */
+
+int main(int argc, char *argv[])
+{
+    superlu_dist_options_t options;
+    SuperLUStat_t stat;
+    SuperMatrix A;
+    NRformat_loc *Astore;
+    sScalePermstruct_t ScalePermstruct;
+    sLUstruct_t LUstruct;
+    sSOLVEstruct_t SOLVEstruct;
+    gridinfo_t grid;
+    float *berr;
+    float *b, *b1, *xtrue, *xtrue1;
+    int_t *colind, *colind1, *rowptr, *rowptr1;
+    int_t i, j, m, n, nnz_loc, m_loc;
+    int nprow, npcol;
+    int iam, info, ldb, ldx, nrhs;
+    char **cpp, c, *postfix;
+    int ii, omp_mpi_level;
+    FILE *fp, *fopen();
+    int cpp_defs();
+
+    /* prototypes */
+    extern int screate_matrix_perturbed
+        (SuperMatrix *, int, float **, int *, float **, int *,
+         FILE *, gridinfo_t *);
+    extern int screate_matrix_perturbed_postfix
+        (SuperMatrix *, int, float **, int *, float **, int *,
+         FILE *, char *, gridinfo_t *);
+
+    nprow = 1;  /* Default process rows. */
+    npcol = 1;  /* Default process columns. */
+    nrhs = 1;   /* Number of right-hand side. */
+
+    /* ------------------------------------------------------------
+       INITIALIZE MPI ENVIRONMENT.
+       ------------------------------------------------------------*/
+    MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level);
+
+    /* Parse command line argv[]. */
+    for (cpp = argv+1; *cpp; ++cpp) {
+        if ( **cpp == '-' ) {
+            c = *(*cpp+1);
+            ++cpp;
+            switch (c) {
+              case 'h':
+                  printf("Options:\n");
+                  printf("\t-r <int>: process rows (default %4d)\n", nprow);
+                  printf("\t-c <int>: process columns (default %4d)\n", npcol);
+                  exit(0);
+                  break;
+              case 'r': nprow = atoi(*cpp);
+                        break;
+              case 'c': npcol = atoi(*cpp);
+                        break;
+            }
+        } else { /* Last arg is considered a filename */
+            if ( !(fp = fopen(*cpp, "r")) ) {
+                ABORT("File does not exist");
+            }
+            break;
+        }
+    }
+
+    /* ------------------------------------------------------------
+       INITIALIZE THE SUPERLU PROCESS GRID.
+       ------------------------------------------------------------*/
+    superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid);
+
+    /* Bail out if I do not belong in the grid. */
+    iam = grid.iam;
+    if ( iam == -1 ) goto out;
+    if ( !iam ) {
+        int v_major, v_minor, v_bugfix;
+#ifdef __INTEL_COMPILER
+        printf("__INTEL_COMPILER is defined\n");
+#endif
+        printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__);
+
+        superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix);
+        printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix);
+
+        printf("Input matrix file:\t%s\n", *cpp);
+        printf("Process grid:\t\t%d X %d\n", (int)grid.nprow, (int)grid.npcol);
+        fflush(stdout);
+    }
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter main()");
+#endif
+
+    for(ii = 0;ii<strlen(*cpp);ii++){
+        if((*cpp)[ii]=='.'){
+            postfix = &((*cpp)[ii+1]);
+        }
+    }
+
+    /* ------------------------------------------------------------
+       GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE.
+       ------------------------------------------------------------*/
+    screate_matrix_postfix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, postfix, &grid);
+
+    if ( !(berr = floatMalloc_dist(nrhs)) )
+        ABORT("Malloc fails for berr[].");
+
+    m = A.nrow;
+    n = A.ncol;
+    Astore = (NRformat_loc *) A.Store;
+    m_loc = Astore->m_loc;
+
+    /* ------------------------------------------------------------
+       WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME.
+       ------------------------------------------------------------*/
+
+    /* Set the default input options:
+       options.Fact = DOFACT;
+       options.Equil = YES;
+       options.ColPerm = METIS_AT_PLUS_A;
+       options.RowPerm = LargeDiag_MC64;
+       options.ReplaceTinyPivot = NO;
+       options.Trans = NOTRANS;
+       options.IterRefine = DOUBLE;
+       options.SolveInitialized = NO;
+       options.RefineInitialized = NO;
+       options.PrintStat = YES;
+     */
+    set_default_options_dist(&options);
+
+    if (!iam) {
+        print_sp_ienv_dist(&options);
+        print_options_dist(&options);
+        fflush(stdout);
+    }
+
+    /* Initialize ScalePermstruct and LUstruct. */
+    sScalePermstructInit(m, n, &ScalePermstruct);
+    sLUstructInit(n, &LUstruct);
+
+    /* Initialize the statistics variables. */
+    PStatInit(&stat);
+
+    /* Call the linear equation solver: factorize and solve. */
+    psgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
+            &LUstruct, &SOLVEstruct, berr, &stat, &info);
+
+    /* Check the accuracy of the solution. */
+    psinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, grid.comm);
+
+    PStatPrint(&options, &stat, &grid); /* Print the statistics. */
+    PStatFree(&stat);
+    Destroy_CompRowLoc_Matrix_dist(&A); /* Deallocate storage of matrix A. */
+    sDestroy_LU(n, &grid, &LUstruct); /* Deallocate storage associated with
+                                         the L and U matrices. */
+    SUPERLU_FREE(b);     /* Free storage of right-hand side. */
+    SUPERLU_FREE(xtrue); /* Free storage of the exact solution. */
+
+    /* ------------------------------------------------------------
+       NOW WE SOLVE ANOTHER LINEAR SYSTEM.
+       ONLY THE SPARSITY PATTERN OF MATRIX A IS THE SAME.
+       ------------------------------------------------------------*/
+    options.Fact = SamePattern;
+
+    if (iam==0) {
+        print_options_dist(&options);
+#if ( PRNTlevel>=2 )
+        PrintInt10("perm_r", m, ScalePermstruct.perm_r);
+        PrintInt10("perm_c", n, ScalePermstruct.perm_c);
+#endif
+    }
+
+    /* Get the matrix from file, perturbed some diagonal entries to force
+       a different perm_r[]. Set up the right-hand side. */
+    if ( !(fp = fopen(*cpp, "r")) ) ABORT("File does not exist");
+    screate_matrix_perturbed_postfix(&A, nrhs, &b1, &ldb, &xtrue1, &ldx, fp, postfix, &grid);
+
+    PStatInit(&stat); /* Initialize the statistics variables. */
+
+    /* Solve the linear system. */
+    psgssvx(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid,
+            &LUstruct, &SOLVEstruct, berr, &stat, &info);
+
+    /* Check the accuracy of the solution. */
+    if ( !iam ) printf("Solve the system with the same sparsity pattern.\n");
+    psinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue1, ldx, grid.comm);
+
+#if ( PRNTlevel>=2 )
+    if (iam==0) {
+        PrintInt10("new perm_r", m, ScalePermstruct.perm_r);
+        PrintInt10("new perm_c", n, ScalePermstruct.perm_c);
+    }
+#endif
+    /* Print the statistics. */
+    PStatPrint(&options, &stat, &grid);
+
+    /* ------------------------------------------------------------
+       DEALLOCATE STORAGE.
+       ------------------------------------------------------------*/
+    PStatFree(&stat);
+    Destroy_CompRowLoc_Matrix_dist(&A); /* Deallocate storage of matrix A. */
+    sDestroy_LU(n, &grid, &LUstruct); /* Deallocate storage associated with
+                                         the L and U matrices. */
+    sScalePermstructFree(&ScalePermstruct);
+    sLUstructFree(&LUstruct); /* Deallocate the structure of L and U.*/
+    if ( options.SolveInitialized ) {
+        sSolveFinalize(&options, &SOLVEstruct);
+    }
+    SUPERLU_FREE(b1);     /* Free storage of right-hand side. */
+    SUPERLU_FREE(xtrue1); /* Free storage of the exact solution. */
+    SUPERLU_FREE(berr);
+    fclose(fp);
+
+    /* ------------------------------------------------------------
+       RELEASE THE SUPERLU PROCESS GRID.
+       ------------------------------------------------------------*/
+out:
+    superlu_gridexit(&grid);
+
+    /* ------------------------------------------------------------
+       TERMINATES THE MPI EXECUTION ENVIRONMENT.
+       ------------------------------------------------------------*/
+    MPI_Finalize();
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit main()");
+#endif
+
+}
+
+
+int cpp_defs()
+{
+    printf(".. CPP definitions:\n");
+#if ( PRNTlevel>=1 )
+    printf("\tPRNTlevel = %d\n", PRNTlevel);
+#endif
+#if ( DEBUGlevel>=1 )
+    printf("\tDEBUGlevel = %d\n", DEBUGlevel);
+#endif
+#if ( PROFlevel>=1 )
+    printf("\tPROFlevel = %d\n", PROFlevel);
+#endif
+#if ( StaticPivot>=1 )
+    printf("\tStaticPivot = %d\n", StaticPivot);
+#endif
+    printf("....\n");
+    return 0;
+}
+
+
diff --git a/EXAMPLE/psdrive2_ABglobal.c b/EXAMPLE/psdrive2_ABglobal.c
new file mode 100644
index 00000000..fe4e5a29
--- /dev/null
+++ b/EXAMPLE/psdrive2_ABglobal.c
@@ -0,0 +1,298 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Driver program for psgssvx_ABglobal example
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * April 5, 2015
+ * 
+ */
+
+#include <math.h>
+#include "superlu_sdefs.h"
+
+/*! \brief
+ *
+ *
+ * Purpose
+ * =======
+ *
+ * The driver program psdrive2_ABglobal.
+ *
+ * This example illustrates how to use psgssvx_ABglobal to solve
+ * systems repeatedly with the same sparsity pattern of matrix A.
+ * In this case, the column permutation vector ScalePermstruct->perm_c is
+ * computed once.  The following data structures will be reused in the
+ * subsequent call to psgssvx_ABglobal:
+ *        ScalePermstruct : perm_c
+ *        LUstruct        : etree
+ *
+ * On an IBM SP, the program may be run by typing:
+ *    poe psdrive2_ABglobal -r <proc rows> -c <proc columns> <input_file> -procs <p>
+ *
+ */ + +int main(int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; + sScalePermstruct_t ScalePermstruct; + sLUstruct_t LUstruct; + gridinfo_t grid; + float *berr; + float *a, *a1, *b, *b1, *xtrue; + int_t *asub, *asub1, *xa, *xa1; + int_t i, j, m, n, nnz; + int_t nprow, npcol; + int iam, info, ldb, ldx, nrhs; + char trans[1]; + char **cpp, c; + FILE *fp, *fopen(); + extern int cpp_defs(); + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + nrhs = 1; /* Number of right-hand side. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. + ------------------------------------------------------------*/ + MPI_Init( &argc, &argv ); + + /* Parse command line argv[]. */ + for (cpp = argv+1; *cpp; ++cpp) { + if ( **cpp == '-' ) { + c = *(*cpp+1); + ++cpp; + switch (c) { + case 'h': + printf("Options:\n"); + printf("\t-r : process rows (default " IFMT ")\n", nprow); + printf("\t-c : process columns (default " IFMT ")\n", npcol); + exit(0); + break; + case 'r': nprow = atoi(*cpp); + break; + case 'c': npcol = atoi(*cpp); + break; + } + } else { /* Last arg is considered a filename */ + if ( !(fp = fopen(*cpp, "r")) ) { + ABORT("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------*/ + superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid); + + /* Bail out if I do not belong in the grid. */ + iam = grid.iam; + if ( iam == -1 ) goto out; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter main()"); +#endif + + /* ------------------------------------------------------------ + Process 0 reads the matrix A, and then broadcasts it to all + the other processes. + ------------------------------------------------------------*/ + if ( !iam ) { + /* Print the CPP definitions. */ + cpp_defs(); + + /* Read the matrix stored on disk in Harwell-Boeing format. */ + sreadhb_dist(iam, fp, &m, &n, &nnz, &a, &asub, &xa); + + printf("Input matrix file: %s\n", *cpp); + printf("\tDimension\t" IFMT "x" IFMT "\t # nonzeros " IFMT "\n", m, n, nnz); + printf("\tProcess grid\t%d X %d\n", (int) grid.nprow, (int) grid.npcol); + + /* Broadcast matrix A to the other PEs. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( a, nnz, MPI_FLOAT, 0, grid.comm ); + MPI_Bcast( asub, nnz, mpi_int_t, 0, grid.comm ); + MPI_Bcast( xa, n+1, mpi_int_t, 0, grid.comm ); + } else { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid.comm ); + + /* Allocate storage for compressed column representation. */ + sallocateA_dist(n, nnz, &a, &asub, &xa); + + MPI_Bcast( a, nnz, MPI_FLOAT, 0, grid.comm ); + MPI_Bcast( asub, nnz, mpi_int_t, 0, grid.comm ); + MPI_Bcast( xa, n+1, mpi_int_t, 0, grid.comm ); + } + + /* Create compressed column matrix for A. */ + sCreate_CompCol_Matrix_dist(&A, m, n, nnz, a, asub, xa, + SLU_NC, SLU_S, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. 
*/ + if (!(b=floatMalloc_dist(m * nrhs))) ABORT("Malloc fails for b[]"); + if (!(xtrue=floatMalloc_dist(n*nrhs))) ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + ldx = n; + ldb = m; + sGenXtrue_dist(n, nrhs, xtrue, ldx); + sFillRHS_dist(trans, nrhs, xtrue, ldx, &A, b, ldb); + + /* Save a copy of the right-hand side. */ + if ( !(b1 = floatMalloc_dist(m * nrhs)) ) ABORT("Malloc fails for b1[]"); + for (j = 0; j < nrhs; ++j) + for (i = 0; i < m; ++i) b1[i+j*ldb] = b[i+j*ldb]; + + if ( !(berr = floatMalloc_dist(nrhs)) ) + ABORT("Malloc fails for berr[]."); + + /* Save a copy of the matrix A. */ + sallocateA_dist(n, nnz, &a1, &asub1, &xa1); + for (i = 0; i < nnz; ++i) { a1[i] = a[i]; asub1[i] = asub[i]; } + for (i = 0; i < n+1; ++i) xa1[i] = xa[i]; + + + /* ------------------------------------------------------------ + WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME. + ------------------------------------------------------------*/ + + /* Set the default input options: + options.Fact = DOFACT; + options.Equil = YES; + options.ColPerm = METIS_AT_PLUS_A; + options.RowPerm = LargeDiag_MC64; + options.ReplaceTinyPivot = YES; + options.Trans = NOTRANS; + options.IterRefine = DOUBLE; + options.SolveInitialized = NO; + options.RefineInitialized = NO; + options.PrintStat = YES; + */ + set_default_options_dist(&options); + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + } + + /* Initialize ScalePermstruct and LUstruct. */ + sScalePermstructInit(m, n, &ScalePermstruct); + sLUstructInit(n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit(&stat); + + /* Call the linear equation solver: factorize and solve. */ + psgssvx_ABglobal(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, berr, &stat, &info); + + /* Check the accuracy of the solution. */ + if ( !iam ) { + sinf_norm_error_dist(n, nrhs, b, ldb, xtrue, ldx, &grid); + } + + PStatPrint(&options, &stat, &grid); /* Print the statistics. */ + PStatFree(&stat); + Destroy_CompCol_Matrix_dist(&A); /* Deallocate storage of matrix A. */ + sDestroy_LU(n, &grid, &LUstruct); /* Deallocate storage associated with + the L and U matrices. */ + SUPERLU_FREE(b); /* Free storage of right-hand side. */ + + + /* ------------------------------------------------------------ + NOW WE SOLVE ANOTHER LINEAR SYSTEM. + ONLY THE SPARSITY PATTERN OF MATRIX A IS THE SAME. + ------------------------------------------------------------*/ + options.Fact = SamePattern; + PStatInit(&stat); /* Initialize the statistics variables. */ + + /* Create compressed column matrix for A. */ + sCreate_CompCol_Matrix_dist(&A, m, n, nnz, a1, asub1, xa1, + SLU_NC, SLU_S, SLU_GE); + + /* Solve the linear system. */ + psgssvx_ABglobal(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, + &LUstruct, berr, &stat, &info); + + /* Check the accuracy of the solution. */ + if ( !iam ) { + printf("Solve the system with the same sparsity pattern.\n"); + sinf_norm_error_dist(n, nrhs, b1, ldb, xtrue, ldx, &grid); + } + + /* Print the statistics. */ + PStatPrint(&options, &stat, &grid); + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. + ------------------------------------------------------------*/ + PStatFree(&stat); + Destroy_CompCol_Matrix_dist(&A); /* Deallocate storage of matrix A. */ + sDestroy_LU(n, &grid, &LUstruct); /* Deallocate storage associated with + the L and U matrices. 
*/ + sScalePermstructFree(&ScalePermstruct); + sLUstructFree(&LUstruct); /* Deallocate the structure of L and U.*/ + SUPERLU_FREE(b1); /* Free storage of right-hand side. */ + SUPERLU_FREE(xtrue); /* Free storage of the exact solution. */ + SUPERLU_FREE(berr); + fclose(fp); + + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------*/ +out: + superlu_gridexit(&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------*/ + MPI_Finalize(); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit main()"); +#endif + +} + + +int cpp_defs() +{ + printf(".. CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf("\tPROFlevel = %d\n", PROFlevel); +#endif +#if ( StaticPivot>=1 ) + printf("\tStaticPivot = %d\n", StaticPivot); +#endif + printf("....\n"); + return 0; +} diff --git a/EXAMPLE/psdrive3.c b/EXAMPLE/psdrive3.c new file mode 100644 index 00000000..33fda121 --- /dev/null +++ b/EXAMPLE/psdrive3.c @@ -0,0 +1,304 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Driver program for PSGSSVX example + * + *
+ * -- Distributed SuperLU routine (version 6.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ * April 5, 2015
+ * 
+ */
+
+#include <math.h>
+#include "superlu_sdefs.h"
+
+/*! \brief
+ *
+ *
+ * Purpose
+ * =======
+ *
+ * The driver program PSDRIVE3.
+ *
+ * This example illustrates how to use PSGSSVX to solve
+ * systems repeatedly with the same sparsity pattern and similar
+ * numerical values of matrix A.
+ * In this case, the column permutation vector and symbolic factorization are
+ * computed only once. The following data structures will be reused in the
+ * subsequent call to PSGSSVX:
+ *        ScalePermstruct : DiagScale, R, C, perm_r, perm_c
+ *        LUstruct        : etree, Glu_persist, Llu
+ *
+ * NOTE:
+ * The distributed nonzero structures of L and U remain the same,
+ * although the numerical values are different. So 'Llu' is set up once
+ * in the first call to PSGSSVX, and reused in the subsequent call.
+ *
+ * With MPICH, the program may be run by typing:
+ *    mpiexec -n <np> psdrive3 -r <proc rows> -c <proc columns> big.rua
+ * 
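+ *
+ * In outline (a condensed sketch; identifiers as in the code below):
+ *
+ *    psgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
+ *            &LUstruct, &SOLVEstruct, berr, &stat, &info);
+ *    options.Fact = SamePattern_SameRowPerm;  /* keep perm_r, perm_c, Llu */
+ *    sZeroLblocks(iam, n, &grid, &LUstruct);  /* zero numerical values of L */
+ *    sCreate_CompRowLoc_Matrix_dist(&A, m, n, nnz_loc, m_loc, fst_row,
+ *                                   nzval1, colind1, rowptr1,
+ *                                   SLU_NR_loc, SLU_S, SLU_GE);
+ *    psgssvx(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid,
+ *            &LUstruct, &SOLVEstruct, berr, &stat, &info);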
+ */ + +int main(int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; + NRformat_loc *Astore; + sScalePermstruct_t ScalePermstruct; + sLUstruct_t LUstruct; + sSOLVEstruct_t SOLVEstruct; + gridinfo_t grid; + float *berr; + float *b, *b1, *xtrue, *nzval, *nzval1; + int_t *colind, *colind1, *rowptr, *rowptr1; + int_t i, j, m, n, nnz_loc, m_loc, fst_row; + int nprow, npcol; + int iam, info, ldb, ldx, nrhs; + char **cpp, c, *postfix; + int ii, omp_mpi_level; + FILE *fp, *fopen(); + int cpp_defs(); + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + nrhs = 1; /* Number of right-hand side. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. + ------------------------------------------------------------*/ + MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); + + /* Parse command line argv[]. */ + for (cpp = argv+1; *cpp; ++cpp) { + if ( **cpp == '-' ) { + c = *(*cpp+1); + ++cpp; + switch (c) { + case 'h': + printf("Options:\n"); + printf("\t-r : process rows (default %d)\n", nprow); + printf("\t-c : process columns (default %d)\n", npcol); + exit(0); + break; + case 'r': nprow = atoi(*cpp); + break; + case 'c': npcol = atoi(*cpp); + break; + } + } else { /* Last arg is considered a filename */ + if ( !(fp = fopen(*cpp, "r")) ) { + ABORT("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------*/ + superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid); + + /* Bail out if I do not belong in the grid. */ + iam = grid.iam; + if ( iam == -1 ) goto out; + if ( !iam ) { + int v_major, v_minor, v_bugfix; +#ifdef __INTEL_COMPILER + printf("__INTEL_COMPILER is defined\n"); +#endif + printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__); + + superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix); + printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix); + + printf("Input matrix file:\t%s\n", *cpp); + printf("Process grid:\t\t%d X %d\n", (int)grid.nprow, (int)grid.npcol); + fflush(stdout); + } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter main()"); +#endif + + for(ii = 0;iinnz_loc; + m_loc = Astore->m_loc; + fst_row = Astore->fst_row; + nzval = Astore->nzval; + colind = Astore->colind; + rowptr = Astore->rowptr; + nzval1 = floatMalloc_dist(nnz_loc); + colind1 = intMalloc_dist(nnz_loc); + rowptr1 = intMalloc_dist(m_loc+1); + for (i = 0; i < nnz_loc; ++i) { + nzval1[i] = nzval[i]; + colind1[i] = colind[i]; + } + for (i = 0; i < m_loc+1; ++i) rowptr1[i] = rowptr[i]; + + /* ------------------------------------------------------------ + WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME. + ------------------------------------------------------------*/ + + /* Set the default input options: + options.Fact = DOFACT; + options.Equil = YES; + options.ColPerm = METIS_AT_PLUS_A; + options.RowPerm = LargeDiag_MC64; + options.ReplaceTinyPivot = NO; + options.Trans = NOTRANS; + options.IterRefine = DOUBLE; + options.SolveInitialized = NO; + options.RefineInitialized = NO; + options.PrintStat = YES; + */ + set_default_options_dist(&options); + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + fflush(stdout); + } + + /* Initialize ScalePermstruct and LUstruct. 
*/ + sScalePermstructInit(m, n, &ScalePermstruct); + sLUstructInit(n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit(&stat); + + /* Call the linear equation solver: factorize and solve. */ + psgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + /* Check the accuracy of the solution. */ + psinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, grid.comm); + + PStatPrint(&options, &stat, &grid); /* Print the statistics. */ + PStatFree(&stat); + Destroy_CompRowLoc_Matrix_dist(&A); /* Deallocate storage of matrix A. */ + SUPERLU_FREE(b); /* Free storage of right-hand side. */ + + + /* ------------------------------------------------------------ + NOW WE SOLVE ANOTHER LINEAR SYSTEM. + THE MATRIX A HAS THE SAME SPARSITY PATTERN AND THE SIMILAR + NUMERICAL VALUES AS THAT IN A PREVIOUS SYSTEM. + ------------------------------------------------------------*/ + options.Fact = SamePattern_SameRowPerm; + PStatInit(&stat); /* Initialize the statistics variables. */ + + /* Set up the local A in NR_loc format */ + + /* Perturb the 1st diagonal of the matrix to larger value. + Intention is to change values of A. */ + if (iam == 0) { + } + + /* Zero the numerical values in L. */ + sZeroLblocks(iam, n, &grid, &LUstruct); + + sCreate_CompRowLoc_Matrix_dist(&A, m, n, nnz_loc, m_loc, fst_row, + nzval1, colind1, rowptr1, + SLU_NR_loc, SLU_S, SLU_GE); + + /* Solve the linear system. */ + psgssvx(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + /* Check the accuracy of the solution. */ + if ( !iam ) + printf("Solve a system with the same pattern and similar values.\n"); + psinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue, ldx, grid.comm); + + /* Print the statistics. */ + PStatPrint(&options, &stat, &grid); + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. + ------------------------------------------------------------*/ + PStatFree(&stat); + Destroy_CompRowLoc_Matrix_dist(&A); /* Deallocate storage of matrix A. */ + sDestroy_LU(n, &grid, &LUstruct); /* Deallocate storage associated with + the L and U matrices. */ + sScalePermstructFree(&ScalePermstruct); + sLUstructFree(&LUstruct); /* Deallocate the structure of L and U.*/ + if ( options.SolveInitialized ) { + sSolveFinalize(&options, &SOLVEstruct); + } + SUPERLU_FREE(b1); /* Free storage of right-hand side. */ + SUPERLU_FREE(xtrue); /* Free storage of the exact solution. */ + SUPERLU_FREE(berr); + fclose(fp); + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------*/ +out: + superlu_gridexit(&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------*/ + MPI_Finalize(); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit main()"); +#endif + +} + + +int cpp_defs() +{ + printf(".. 
CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf("\tPROFlevel = %d\n", PROFlevel); +#endif +#if ( StaticPivot>=1 ) + printf("\tStaticPivot = %d\n", StaticPivot); +#endif + printf("....\n"); + return 0; +} diff --git a/EXAMPLE/psdrive3_ABglobal.c b/EXAMPLE/psdrive3_ABglobal.c new file mode 100644 index 00000000..ee06fa2c --- /dev/null +++ b/EXAMPLE/psdrive3_ABglobal.c @@ -0,0 +1,305 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Driver program for psgssvx_ABglobal example + * + *
+ * -- Distributed SuperLU routine (version 4.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * April 5, 2015
+ * 
+ */
+
+#include <math.h>
+#include "superlu_sdefs.h"
+
+/*! \brief
+ *
+ *
+ * Purpose
+ * =======
+ *
+ * The driver program psdrive3_ABglobal.
+ *
+ * This example illustrates how to use psgssvx_ABglobal to solve
+ * systems repeatedly with the same sparsity pattern and similar
+ * numerical values of matrix A.
+ * In this case, the column permutation vector and symbolic factorization are
+ * computed only once. The following data structures will be reused in the
+ * subsequent call to psgssvx_ABglobal:
+ *        ScalePermstruct : DiagScale, R, C, perm_r, perm_c
+ *        LUstruct        : etree, Glu_persist, Llu
+ *
+ * NOTE:
+ * The distributed nonzero structures of L and U remain the same,
+ * although the numerical values are different. So 'Llu' is set up once
+ * in the first call to psgssvx_ABglobal, and reused in the subsequent call.
+ *
+ * On an IBM SP, the program may be run by typing:
+ *    poe psdrive3_ABglobal -r <proc rows> -c <proc columns> <input_file> -procs <p>
+ *
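+ *
+ * In outline (a condensed sketch; identifiers as in the code below; only
+ * the numerical values of A differ between the two calls):
+ *
+ *    psgssvx_ABglobal(&options, &A, &ScalePermstruct, b, ldb, nrhs,
+ *                     &grid, &LUstruct, berr, &stat, &info);
+ *    options.Fact = SamePattern_SameRowPerm;  /* keep perm_r, perm_c, Llu */
+ *    sCreate_CompCol_Matrix_dist(&A, m, n, nnz, a1, asub1, xa1,
+ *                                SLU_NC, SLU_S, SLU_GE);
+ *    psgssvx_ABglobal(&options, &A, &ScalePermstruct, b1, ldb, nrhs,
+ *                     &grid, &LUstruct, berr, &stat, &info);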
+ */ + +int main(int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; + sScalePermstruct_t ScalePermstruct; + sLUstruct_t LUstruct; + gridinfo_t grid; + float *berr; + float *a, *a1, *b, *b1, *xtrue; + int_t *asub, *asub1, *xa, *xa1; + int_t i, j, m, n, nnz; + int_t nprow, npcol; + int iam, info, ldb, ldx, nrhs; + char trans[1]; + char **cpp, c; + FILE *fp, *fopen(); + extern int cpp_defs(); + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + nrhs = 1; /* Number of right-hand side. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. + ------------------------------------------------------------*/ + MPI_Init( &argc, &argv ); + + /* Parse command line argv[]. */ + for (cpp = argv+1; *cpp; ++cpp) { + if ( **cpp == '-' ) { + c = *(*cpp+1); + ++cpp; + switch (c) { + case 'h': + printf("Options:\n"); + printf("\t-r : process rows (default " IFMT ")\n", nprow); + printf("\t-c : process columns (default " IFMT ")\n", npcol); + exit(0); + break; + case 'r': nprow = atoi(*cpp); + break; + case 'c': npcol = atoi(*cpp); + break; + } + } else { /* Last arg is considered a filename */ + if ( !(fp = fopen(*cpp, "r")) ) { + ABORT("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------*/ + superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid); + + /* Bail out if I do not belong in the grid. */ + iam = grid.iam; + if ( iam == -1 ) goto out; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter main()"); +#endif + + /* ------------------------------------------------------------ + PROCESS 0 READS THE MATRIX A, AND THEN BROADCASTS IT TO ALL + THE OTHER PROCESSES. + ------------------------------------------------------------*/ + if ( !iam ) { + /* Print the CPP definitions. */ + cpp_defs(); + + /* Read the matrix stored on disk in Harwell-Boeing format. */ + sreadhb_dist(iam, fp, &m, &n, &nnz, &a, &asub, &xa); + + printf("Input matrix file: %s\n", *cpp); + printf("\tDimension\t" IFMT "x" IFMT "\t # nonzeros " IFMT "\n", m, n, nnz); + printf("\tProcess grid\t%d X %d\n", (int) grid.nprow, (int) grid.npcol); + + /* Broadcast matrix A to the other PEs. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( a, nnz, MPI_FLOAT, 0, grid.comm ); + MPI_Bcast( asub, nnz, mpi_int_t, 0, grid.comm ); + MPI_Bcast( xa, n+1, mpi_int_t, 0, grid.comm ); + } else { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid.comm ); + + /* Allocate storage for compressed column representation. */ + sallocateA_dist(n, nnz, &a, &asub, &xa); + + MPI_Bcast( a, nnz, MPI_FLOAT, 0, grid.comm ); + MPI_Bcast( asub, nnz, mpi_int_t, 0, grid.comm ); + MPI_Bcast( xa, n+1, mpi_int_t, 0, grid.comm ); + } + + /* Create compressed column matrix for A. */ + sCreate_CompCol_Matrix_dist(&A, m, n, nnz, a, asub, xa, + SLU_NC, SLU_S, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. 
*/ + if (!(b=floatMalloc_dist(m*nrhs))) ABORT("Malloc fails for b[]"); + if (!(xtrue=floatMalloc_dist(n*nrhs))) ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + ldx = n; + ldb = m; + sGenXtrue_dist(n, nrhs, xtrue, ldx); + sFillRHS_dist(trans, nrhs, xtrue, ldx, &A, b, ldb); + + /* Save a copy of the right-hand side. */ + if ( !(b1 = floatMalloc_dist(m * nrhs)) ) ABORT("Malloc fails for b1[]"); + for (j = 0; j < nrhs; ++j) + for (i = 0; i < m; ++i) b1[i+j*ldb] = b[i+j*ldb]; + + if ( !(berr = floatMalloc_dist(nrhs)) ) + ABORT("Malloc fails for berr[]."); + + /* Save a copy of the matrix A. */ + sallocateA_dist(n, nnz, &a1, &asub1, &xa1); + for (i = 0; i < nnz; ++i) { a1[i] = a[i]; asub1[i] = asub[i]; } + for (i = 0; i < n+1; ++i) xa1[i] = xa[i]; + + + /* ------------------------------------------------------------ + WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME. + ------------------------------------------------------------*/ + + /* Set the default input options: + options.Fact = DOFACT; + options.Equil = YES; + options.ColPerm = METIS_AT_PLUS_A; + options.RowPerm = LargeDiag_MC64; + options.ReplaceTinyPivot = YES; + options.Trans = NOTRANS; + options.IterRefine = DOUBLE; + options.SolveInitialized = NO; + options.RefineInitialized = NO; + options.PrintStat = YES; + */ + set_default_options_dist(&options); + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + } + + /* Initialize ScalePermstruct and LUstruct. */ + sScalePermstructInit(m, n, &ScalePermstruct); + sLUstructInit(n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit(&stat); + + /* Call the linear equation solver: factorize and solve. */ + psgssvx_ABglobal(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, berr, &stat, &info); + + /* Check the accuracy of the solution. */ + if ( !iam ) { + sinf_norm_error_dist(n, nrhs, b, ldb, xtrue, ldx, &grid); + } + + + PStatPrint(&options, &stat, &grid); /* Print the statistics. */ + PStatFree(&stat); + Destroy_CompCol_Matrix_dist(&A); /* Deallocate storage of matrix A. */ + SUPERLU_FREE(b); /* Free storage of right-hand side. */ + + + /* ------------------------------------------------------------ + NOW WE SOLVE ANOTHER LINEAR SYSTEM. + THE MATRIX A HAS THE SAME SPARSITY PATTERN AND THE SIMILAR + NUMERICAL VALUES AS THAT IN A PREVIOUS SYSTEM. + ------------------------------------------------------------*/ + options.Fact = SamePattern_SameRowPerm; + PStatInit(&stat); /* Initialize the statistics variables. */ + + /* Create compressed column matrix for A. */ + sCreate_CompCol_Matrix_dist(&A, m, n, nnz, a1, asub1, xa1, + SLU_NC, SLU_S, SLU_GE); + + /* Solve the linear system. */ + psgssvx_ABglobal(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, + &LUstruct, berr, &stat, &info); + + /* Check the accuracy of the solution. */ + if ( !iam ) { + printf("Solve a system with the same pattern and similar values.\n"); + sinf_norm_error_dist(n, nrhs, b1, ldb, xtrue, ldx, &grid); + } + + /* Print the statistics. */ + PStatPrint(&options, &stat, &grid); + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. + ------------------------------------------------------------*/ + PStatFree(&stat); + Destroy_CompCol_Matrix_dist(&A); /* Deallocate storage of matrix A. */ + sDestroy_LU(n, &grid, &LUstruct); /* Deallocate storage associated with + the L and U matrices. 
*/ + sScalePermstructFree(&ScalePermstruct); + sLUstructFree(&LUstruct); /* Deallocate the structure of L and U.*/ + SUPERLU_FREE(b1); /* Free storage of right-hand side. */ + SUPERLU_FREE(xtrue); /* Free storage of the exact solution. */ + SUPERLU_FREE(berr); + fclose(fp); + + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------*/ +out: + superlu_gridexit(&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------*/ + MPI_Finalize(); + + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit main()"); +#endif + +} + + +int cpp_defs() +{ + printf(".. CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf("\tPROFlevel = %d\n", PROFlevel); +#endif +#if ( StaticPivot>=1 ) + printf("\tStaticPivot = %d\n", StaticPivot); +#endif + printf("....\n"); + return 0; +} diff --git a/EXAMPLE/psdrive3d.c b/EXAMPLE/psdrive3d.c new file mode 100644 index 00000000..8e2e0ea0 --- /dev/null +++ b/EXAMPLE/psdrive3d.c @@ -0,0 +1,411 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Driver program for PSGSSVX3D example + * + *
+ * -- Distributed SuperLU routine (version 7.0.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab 
+ * May 12, 2021
+ *
+ */
+#include "superlu_sdefs.h"  
+
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *
+ * The driver program PSDRIVE3D.
+ *
+ * This example illustrates how to use PSGSSVX3D with the full
+ * (default) options to solve a linear system.
+ *
+ * Five basic steps are required:
+ *   1. Initialize the MPI environment and the SuperLU process grid
+ *   2. Set up the input matrix and the right-hand side
+ *   3. Set the options argument
+ *   4. Call psgssvx3d
+ *   5. Release the process grid and terminate the MPI environment
+ *
+ * The program may be run by typing
+ *    mpiexec -np <p> psdrive3d -r <proc rows> -c <proc columns> \
+ *                              -d <proc Z-dimension> <input_file>
+ *
+ * NOTE: total number of processes p = r * c * d
+ *       d must be a power-of-two, e.g., 1, 2, 4, ...
+ *
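+ *
+ * The five steps, in outline (a condensed sketch; identifiers as in the
+ * code below):
+ *
+ *    superlu_gridinit3d(MPI_COMM_WORLD, nprow, npcol, npdep, &grid);
+ *    /* ... read A and set up the right-hand side b ... */
+ *    set_default_options_dist(&options);
+ *    sScalePermstructInit(m, n, &ScalePermstruct);
+ *    sLUstructInit(n, &LUstruct);
+ *    PStatInit(&stat);
+ *    psgssvx3d(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
+ *              &LUstruct, &SOLVEstruct, berr, &stat, &info);
+ *    superlu_gridexit3d(&grid);
+ *    MPI_Finalize();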
+ */ + +static void matCheck(int n, int m, float* A, int LDA, + float* B, int LDB) +{ + for(int j=0; jnnz_loc == B->nnz_loc); + assert(A->m_loc == B->m_loc); + assert(A->fst_row == B->fst_row); + +#if 0 + double *Aval = (double *)A->nzval, *Bval = (double *)B->nzval; + PrintDouble5("A", A->nnz_loc, Aval); + PrintDouble5("B", B->nnz_loc, Bval); + fflush(stdout); +#endif + + float * Aval = (float *) A->nzval; + float * Bval = (float *) B->nzval; + for (int_t i = 0; i < A->nnz_loc; i++) + { + assert( Aval[i] == Bval[i] ); + assert((A->colind)[i] == (B->colind)[i]); + printf("colind[] correct\n"); + } + + for (int_t i = 0; i < A->m_loc + 1; i++) + { + assert((A->rowptr)[i] == (B->rowptr)[i]); + } + + printf("Matrix check passed\n"); + +} + +int +main (int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; // Now, A is on all 3D processes + sScalePermstruct_t ScalePermstruct; + sLUstruct_t LUstruct; + sSOLVEstruct_t SOLVEstruct; + gridinfo3d_t grid; + float *berr; + float *b, *xtrue; + int_t m, n; + int nprow, npcol, npdep; + int iam, info, ldb, ldx, nrhs; + char **cpp, c, *suffix; + FILE *fp, *fopen (); + extern int cpp_defs (); + int ii, omp_mpi_level; + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + npdep = 1; /* replication factor must be power of two */ + nrhs = 1; /* Number of right-hand side. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. + ------------------------------------------------------------ */ + // MPI_Init (&argc, &argv); + int required = MPI_THREAD_MULTIPLE; + int provided; + MPI_Init_thread(&argc, &argv, required, &provided); + if (provided < required) + { + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (!rank) printf("The MPI library doesn't provide MPI_THREAD_MULTIPLE \n"); + } + + /* Parse command line argv[]. */ + for (cpp = argv + 1; *cpp; ++cpp) + { + if (**cpp == '-') + { + c = *(*cpp + 1); + ++cpp; + switch (c) + { + case 'h': + printf ("Options:\n"); + printf ("\t-r : process rows (default %d)\n", nprow); + printf ("\t-c : process columns (default %d)\n", npcol); + printf ("\t-d : process Z-dimension (default %d)\n", npdep); + exit (0); + break; + case 'r': + nprow = atoi (*cpp); + break; + case 'c': + npcol = atoi (*cpp); + break; + case 'd': + npdep = atoi (*cpp); + break; + } + } + else + { /* Last arg is considered a filename */ + if (!(fp = fopen (*cpp, "r"))) + { + ABORT ("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ + superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid); + + if(grid.iam==0) { + MPI_Query_thread(&omp_mpi_level); + switch (omp_mpi_level) { + case MPI_THREAD_SINGLE: + printf("MPI_Query_thread with MPI_THREAD_SINGLE\n"); + fflush(stdout); + break; + case MPI_THREAD_FUNNELED: + printf("MPI_Query_thread with MPI_THREAD_FUNNELED\n"); + fflush(stdout); + break; + case MPI_THREAD_SERIALIZED: + printf("MPI_Query_thread with MPI_THREAD_SERIALIZED\n"); + fflush(stdout); + break; + case MPI_THREAD_MULTIPLE: + printf("MPI_Query_thread with MPI_THREAD_MULTIPLE\n"); + fflush(stdout); + break; + } + } + + /* Bail out if I do not belong in the grid. 
*/ + iam = grid.iam; + if (iam == -1) goto out; + if (!iam) { + int v_major, v_minor, v_bugfix; +#ifdef __INTEL_COMPILER + printf("__INTEL_COMPILER is defined\n"); +#endif + printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__); + + superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix); + printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix); + + printf("Input matrix file:\t%s\n", *cpp); + printf("3D process grid: %d X %d X %d\n", nprow, npcol, npdep); + //printf("2D Process grid: %d X %d\n", (int)grid.nprow, (int)grid.npcol); + fflush(stdout); + } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter main()"); +#endif + + /* ------------------------------------------------------------ + GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. + ------------------------------------------------------------ */ + for (ii = 0; iim_loc, nrhs, B2d, Astore->m_loc, bref, ldb); + } + // MPI_Finalize(); exit(0); + #endif +#endif + + if (!(berr = floatMalloc_dist (nrhs))) + ABORT ("Malloc fails for berr[]."); + + /* ------------------------------------------------------------ + NOW WE SOLVE THE LINEAR SYSTEM. + ------------------------------------------------------------ */ + + /* Set the default input options: + options.Fact = DOFACT; + options.Equil = YES; + options.ParSymbFact = NO; + options.ColPerm = METIS_AT_PLUS_A; + options.RowPerm = LargeDiag_MC64; + options.ReplaceTinyPivot = YES; + options.IterRefine = DOUBLE; + options.Trans = NOTRANS; + options.SolveInitialized = NO; + options.RefineInitialized = NO; + options.PrintStat = YES; + options->num_lookaheads = 10; + options->lookahead_etree = NO; + options->SymPattern = NO; + options.DiagInv = NO; + */ + set_default_options_dist (&options); +#if 0 + options.RowPerm = NOROWPERM; + options.IterRefine = NOREFINE; + options.ColPerm = NATURAL; + options.Equil = NO; + options.ReplaceTinyPivot = NO; +#endif + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + fflush(stdout); + } + +#ifdef NRFRMT // matrix is on 3D process grid + m = A.nrow; + n = A.ncol; +#else + if ( grid.zscp.Iam == 0 ) // Process layer 0 + { + m = A.nrow; + n = A.ncol; + } + // broadcast m, n to all the process layers; + MPI_Bcast( &m, 1, mpi_int_t, 0, grid.zscp.comm); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid.zscp.comm); +#endif + + /* Initialize ScalePermstruct and LUstruct. */ + sScalePermstructInit (m, n, &ScalePermstruct); + sLUstructInit (n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit (&stat); + + /* Call the linear equation solver. */ + psgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + /* Check the accuracy of the solution. */ + psinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid.comm); + fflush(stdout); + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. 
+ ------------------------------------------------------------ */ + + if ( grid.zscp.Iam == 0 ) { // process layer 0 + + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + + sDestroy_LU (n, &(grid.grid2d), &LUstruct); + if (options.SolveInitialized) { + sSolveFinalize (&options, &SOLVEstruct); + } + } else { // Process layers not equal 0 + sDeAllocLlu_3d(n, &LUstruct, &grid); + sDeAllocGlu_3d(&LUstruct); + } + + Destroy_CompRowLoc_Matrix_dist (&A); + SUPERLU_FREE (b); + SUPERLU_FREE (xtrue); + SUPERLU_FREE (berr); + sScalePermstructFree (&ScalePermstruct); + sLUstructFree (&LUstruct); + PStatFree (&stat); + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ +out: + superlu_gridexit3d (&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------ */ + MPI_Finalize (); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit main()"); +#endif + +} + + +int +cpp_defs () +{ + printf (".. CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf ("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf ("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf ("\tPROFlevel = %d\n", PROFlevel); +#endif + printf ("....\n"); + return 0; +} diff --git a/EXAMPLE/psdrive4.c b/EXAMPLE/psdrive4.c new file mode 100644 index 00000000..db3216a3 --- /dev/null +++ b/EXAMPLE/psdrive4.c @@ -0,0 +1,301 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief This example illustrates how to divide up the processes into subgroups + * + *
+ * -- Distributed SuperLU routine (version 6.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ * April 5, 2015
+ * 
+ */
+
+#include <math.h>
+#include "superlu_sdefs.h"
+
+/*! \brief
+ *
+ *
+ * Purpose
+ * =======
+ *
+ * The driver program PSDRIVE4.
+ *
+ * This example illustrates how to divide up the processes into
+ * subgroups (multiple grids) such that each subgroup solves a linear
+ * system independently from the other.
+ *
+ * In this example, there are 2 subgroups:
+ *  1. subgroup 1 consists of processes 0 to 5 arranged as
+ *     a 2-by-3 process grid.
+ *  2. subgroup 2 consists of processes 6 to 9 arranged as
+ *     a 2-by-2 process grid.
+ *
+ * With MPICH, the program may be run by typing:
+ *    mpiexec -n 10 psdrive4 big.rua
+ * 
+ */ + +int main(int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; + sScalePermstruct_t ScalePermstruct; + sLUstruct_t LUstruct; + sSOLVEstruct_t SOLVEstruct; + gridinfo_t grid1, grid2; + float *berr; + float *a, *b, *xtrue; + int_t *asub, *xa; + int_t i, j, m, n; + int nprow, npcol, ldumap, p; + int usermap[6]; + int iam, info, ldb, ldx, nprocs; + int nrhs = 1; /* Number of right-hand side. */ + int ii, omp_mpi_level; + char **cpp, c, *postfix; + FILE *fp, *fopen(); + int cpp_defs(); + + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. + ------------------------------------------------------------*/ + MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); + + MPI_Comm_size( MPI_COMM_WORLD, &nprocs ); + if ( nprocs < 10 ) { + fprintf(stderr, "Requires at least 10 processes\n"); + exit(-1); + } + + /* Parse command line argv[]. */ + for (cpp = argv+1; *cpp; ++cpp) { + if ( **cpp == '-' ) { + c = *(*cpp+1); + ++cpp; + switch (c) { + case 'h': + printf("Options:\n"); + printf("\t-r : process rows (default %d)\n", nprow); + printf("\t-c : process columns (default %d)\n", npcol); + exit(0); + break; + case 'r': nprow = atoi(*cpp); + break; + case 'c': npcol = atoi(*cpp); + break; + } + } else { /* Last arg is considered a filename */ + if ( !(fp = fopen(*cpp, "r")) ) { + ABORT("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID 1. + ------------------------------------------------------------*/ + nprow = 2; + npcol = 3; + ldumap = 2; + p = 0; /* Grid 1 starts from process 0. */ + for (i = 0; i < nprow; ++i) + for (j = 0; j < npcol; ++j) usermap[i+j*ldumap] = p++; + superlu_gridmap(MPI_COMM_WORLD, nprow, npcol, usermap, ldumap, &grid1); + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID 2. + ------------------------------------------------------------*/ + nprow = 2; + npcol = 2; + ldumap = 2; + p = 6; /* Grid 2 starts from process 6. */ + for (i = 0; i < nprow; ++i) + for (j = 0; j < npcol; ++j) usermap[i+j*ldumap] = p++; + superlu_gridmap(MPI_COMM_WORLD, nprow, npcol, usermap, ldumap, &grid2); + + /* Bail out if I do not belong in any of the 2 grids. */ + MPI_Comm_rank( MPI_COMM_WORLD, &iam ); + if ( iam == -1 ) goto out; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter main()"); +#endif + + for(ii = 0;ii= 0 && iam < 6 ) { /* I am in grid 1. */ + iam = grid1.iam; /* Get the logical number in the new grid. */ + + /* ------------------------------------------------------------ + GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. + ------------------------------------------------------------*/ + screate_matrix_postfix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, postfix, &grid1); + + if ( !(berr = floatMalloc_dist(nrhs)) ) + ABORT("Malloc fails for berr[]."); + + /* ------------------------------------------------------------ + NOW WE SOLVE THE LINEAR SYSTEM. 
+ ------------------------------------------------------------*/ + + /* Set the default input options: + options.Fact = DOFACT; + options.Equil = YES; + options.ColPerm = METIS_AT_PLUS_A; + options.RowPerm = LargeDiag_MC64; + options.ReplaceTinyPivot = NO; + options.Trans = NOTRANS; + options.IterRefine = DOUBLE; + options.SolveInitialized = NO; + options.RefineInitialized = NO; + options.PrintStat = YES; + */ + set_default_options_dist(&options); + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + } + + m = A.nrow; + n = A.ncol; + + /* Initialize ScalePermstruct and LUstruct. */ + sScalePermstructInit(m, n, &ScalePermstruct); + sLUstructInit(n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit(&stat); + + /* Call the linear equation solver. */ + psgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid1, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + /* Check the accuracy of the solution. */ + psinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid1.comm); + + /* Print the statistics. */ + PStatPrint(&options, &stat, &grid1); + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. + ------------------------------------------------------------*/ + PStatFree(&stat); + Destroy_CompRowLoc_Matrix_dist(&A); + sScalePermstructFree(&ScalePermstruct); + sDestroy_LU(n, &grid1, &LUstruct); + sLUstructFree(&LUstruct); + if ( options.SolveInitialized ) { + sSolveFinalize(&options, &SOLVEstruct); + } + SUPERLU_FREE(b); + SUPERLU_FREE(xtrue); + SUPERLU_FREE(berr); + + } else { /* I am in grid 2. */ + iam = grid2.iam; /* Get the logical number in the new grid. */ + + /* ------------------------------------------------------------ + GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. + ------------------------------------------------------------*/ + screate_matrix_postfix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, postfix, &grid2); + + if ( !(berr = floatMalloc_dist(nrhs)) ) + ABORT("Malloc fails for berr[]."); + + /* ------------------------------------------------------------ + NOW WE SOLVE THE LINEAR SYSTEM. + ------------------------------------------------------------*/ + + /* Set the default input options: + options.Fact = DOFACT; + options.Equil = YES; + options.ColPerm = MMD_AT_PLUS_A; + options.RowPerm = LargeDiag_MC64; + options.ReplaceTinyPivot = YES; + options.Trans = NOTRANS; + options.IterRefine = DOUBLE; + options.SolveInitialized = NO; + options.RefineInitialized = NO; + options.PrintStat = YES; + */ + set_default_options_dist(&options); + + m = A.nrow; + n = A.ncol; + + /* Initialize ScalePermstruct and LUstruct. */ + sScalePermstructInit(m, n, &ScalePermstruct); + sLUstructInit(n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit(&stat); + + /* Call the linear equation solver. */ + psgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid2, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + /* Check the accuracy of the solution. */ + psinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid2.comm); + + /* Print the statistics. */ + PStatPrint(&options, &stat, &grid2); + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. 
+ ------------------------------------------------------------*/ + PStatFree(&stat); + Destroy_CompRowLoc_Matrix_dist(&A); + sScalePermstructFree(&ScalePermstruct); + sDestroy_LU(n, &grid2, &LUstruct); + sLUstructFree(&LUstruct); + if ( options.SolveInitialized ) { + sSolveFinalize(&options, &SOLVEstruct); + } + SUPERLU_FREE(b); + SUPERLU_FREE(xtrue); + SUPERLU_FREE(berr); + } + + fclose(fp); + + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRIDS. + ------------------------------------------------------------*/ + superlu_gridexit(&grid1); + superlu_gridexit(&grid2); + +out: + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------*/ + MPI_Finalize(); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit main()"); +#endif + +} diff --git a/EXAMPLE/psdrive4_ABglobal.c b/EXAMPLE/psdrive4_ABglobal.c new file mode 100644 index 00000000..44736a07 --- /dev/null +++ b/EXAMPLE/psdrive4_ABglobal.c @@ -0,0 +1,361 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief This example illustrates how to divide up the processes into subgroups + * + *
+ * -- Distributed SuperLU routine (version 4.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * April 5, 2015
+ * 
+ */
+
+#include <math.h>
+#include "superlu_sdefs.h"
+
+/*! \brief
+ *
+ *
+ * Purpose
+ * =======
+ *
+ * The driver program psdrive4_ABglobal.
+ *
+ * This example illustrates how to divide up the processes into
+ * subgroups (multiple grids) such that each subgroup solves a linear
+ * system independently from the other.
+ *
+ * In this example, there are 2 subgroups:
+ *  1. subgroup 1 consists of processes 0 to 5 arranged as
+ *     a 2-by-3 process grid.
+ *  2. subgroup 2 consists of processes 6 to 9 arranged as
+ *     a 2-by-2 process grid.
+ *
+ * On an IBM SP, the program may be run by typing
+ *    poe psdrive4_ABglobal <input_file> -procs 10
+ * 
+ */ + +int main(int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; + sScalePermstruct_t ScalePermstruct; + sLUstruct_t LUstruct; + gridinfo_t grid1, grid2; + float *berr; + float *a, *b, *xtrue; + int_t *asub, *xa; + int_t i, j, m, n, nnz; + int_t nprow, npcol, ldumap, p; + int usermap[6]; + int iam, info, ldb, ldx, nprocs; + int nrhs = 1; /* Number of right-hand side. */ + char trans[1]; + char **cpp, c; + FILE *fp, *fopen(); + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. + ------------------------------------------------------------*/ + MPI_Init( &argc, &argv ); + MPI_Comm_size( MPI_COMM_WORLD, &nprocs ); + if ( nprocs < 10 ) { + fprintf(stderr, "Requires at least 10 processes\n"); + exit(-1); + } + + /* Parse command line argv[]. */ + for (cpp = argv+1; *cpp; ++cpp) { + if ( **cpp == '-' ) { + c = *(*cpp+1); + ++cpp; + switch (c) { + case 'h': + printf("Options:\n"); + printf("\t-r : process rows (default " IFMT ")\n", nprow); + printf("\t-c : process columns (default " IFMT ")\n", npcol); + exit(0); + break; + case 'r': nprow = atoi(*cpp); + break; + case 'c': npcol = atoi(*cpp); + break; + } + } else { /* Last arg is considered a filename */ + if ( !(fp = fopen(*cpp, "r")) ) { + ABORT("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID 1. + ------------------------------------------------------------*/ + nprow = 2; + npcol = 3; + ldumap = 2; + p = 0; /* Grid 1 starts from process 0. */ + for (i = 0; i < nprow; ++i) + for (j = 0; j < npcol; ++j) usermap[i+j*ldumap] = p++; + superlu_gridmap(MPI_COMM_WORLD, nprow, npcol, usermap, ldumap, &grid1); + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID 2. + ------------------------------------------------------------*/ + nprow = 2; + npcol = 2; + ldumap = 2; + p = 6; /* Grid 2 starts from process 6. */ + for (i = 0; i < nprow; ++i) + for (j = 0; j < npcol; ++j) usermap[i+j*ldumap] = p++; + superlu_gridmap(MPI_COMM_WORLD, nprow, npcol, usermap, ldumap, &grid2); + + /* Bail out if I do not belong in any of the 2 grids. */ + MPI_Comm_rank( MPI_COMM_WORLD, &iam ); + if ( iam == -1 ) goto out; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter main()"); +#endif + + if ( iam >= 0 && iam < 6 ) { /* I am in grid 1. */ + iam = grid1.iam; /* Get the logical number in the new grid. */ + + /* ------------------------------------------------------------ + PROCESS 0 READS THE MATRIX A, AND THEN BROADCASTS IT TO ALL + THE OTHER PROCESSES. + ------------------------------------------------------------*/ + if ( !iam ) { + /* Read the matrix stored on disk in Harwell-Boeing format. */ + sreadhb_dist(iam, fp, &m, &n, &nnz, &a, &asub, &xa); + + printf("\tDimension\t" IFMT "x" IFMT "\t # nonzeros " IFMT "\n", m, n, nnz); + printf("\tProcess grid\t%d X %d\n", (int) grid1.nprow, (int) grid1.npcol); + + /* Broadcast matrix A to the other PEs. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid1.comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid1.comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid1.comm ); + MPI_Bcast( a, nnz, MPI_FLOAT, 0, grid1.comm ); + MPI_Bcast( asub, nnz, mpi_int_t, 0, grid1.comm ); + MPI_Bcast( xa, n+1, mpi_int_t, 0, grid1.comm ); + } else { + /* Receive matrix A from PE 0. 
*/ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid1.comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid1.comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid1.comm ); + + /* Allocate storage for compressed column representation. */ + sallocateA_dist(n, nnz, &a, &asub, &xa); + + MPI_Bcast( a, nnz, MPI_FLOAT, 0, grid1.comm ); + MPI_Bcast( asub, nnz, mpi_int_t, 0, grid1.comm ); + MPI_Bcast( xa, n+1, mpi_int_t, 0, grid1.comm ); + } + + /* Create compressed column matrix for A. */ + sCreate_CompCol_Matrix_dist(&A, m, n, nnz, a, asub, xa, + SLU_NC, SLU_S, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. */ + if (!(b=floatMalloc_dist(m*nrhs))) ABORT("Malloc fails for b[]"); + if (!(xtrue=floatMalloc_dist(n*nrhs))) ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + ldx = n; + ldb = m; + sGenXtrue_dist(n, nrhs, xtrue, ldx); + sFillRHS_dist(trans, nrhs, xtrue, ldx, &A, b, ldb); + + if ( !(berr = floatMalloc_dist(nrhs)) ) + ABORT("Malloc fails for berr[]."); + + /* ------------------------------------------------------------ + NOW WE SOLVE THE LINEAR SYSTEM. + ------------------------------------------------------------*/ + + /* Set the default input options: + options.Fact = DOFACT; + options.Equil = YES; + options.ColPerm = METIS_AT_PLUS_A; + options.RowPerm = LargeDiag_MC64; + options.ReplaceTinyPivot = YES; + options.Trans = NOTRANS; + options.IterRefine = DOUBLE; + options.SolveInitialized = NO; + options.RefineInitialized = NO; + options.PrintStat = YES; + */ + set_default_options_dist(&options); + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + } + + /* Initialize ScalePermstruct and LUstruct. */ + sScalePermstructInit(m, n, &ScalePermstruct); + sLUstructInit(n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit(&stat); + + /* Call the linear equation solver: factorize and solve. */ + psgssvx_ABglobal(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid1, + &LUstruct, berr, &stat, &info); + + /* Check the accuracy of the solution. */ + if ( !iam ) { + sinf_norm_error_dist(n, nrhs, b, ldb, xtrue, ldx, &grid1); + } + + + /* Print the statistics. */ + PStatPrint(&options, &stat, &grid1); + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. + ------------------------------------------------------------*/ + PStatFree(&stat); + Destroy_CompCol_Matrix_dist(&A); + sDestroy_LU(n, &grid1, &LUstruct); + sScalePermstructFree(&ScalePermstruct); + sLUstructFree(&LUstruct); + SUPERLU_FREE(b); + SUPERLU_FREE(xtrue); + SUPERLU_FREE(berr); + + } else { /* I am in grid 2. */ + iam = grid2.iam; /* Get the logical number in the new grid. */ + + /* ------------------------------------------------------------ + PROCESS 0 READS THE MATRIX A, AND THEN BROADCASTS IT TO ALL + THE OTHER PROCESSES. + ------------------------------------------------------------*/ + if ( !iam ) { + /* Read the matrix stored on disk in Harwell-Boeing format. */ + sreadhb_dist(iam, fp, &m, &n, &nnz, &a, &asub, &xa); + + printf("\tDimension\t" IFMT "x" IFMT "\t # nonzeros " IFMT "\n", m, n, nnz); + printf("\tProcess grid\t%d X %d\n", (int) grid2.nprow, (int) grid2.npcol); + + /* Broadcast matrix A to the other PEs. 
*/ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid2.comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid2.comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid2.comm ); + MPI_Bcast( a, nnz, MPI_FLOAT, 0, grid2.comm ); + MPI_Bcast( asub, nnz, mpi_int_t, 0, grid2.comm ); + MPI_Bcast( xa, n+1, mpi_int_t, 0, grid2.comm ); + } else { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid2.comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid2.comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid2.comm ); + + /* Allocate storage for compressed column representation. */ + sallocateA_dist(n, nnz, &a, &asub, &xa); + + MPI_Bcast( a, nnz, MPI_FLOAT, 0, grid2.comm ); + MPI_Bcast( asub, nnz, mpi_int_t, 0, grid2.comm ); + MPI_Bcast( xa, n+1, mpi_int_t, 0, grid2.comm ); + } + + /* Create compressed column matrix for A. */ + sCreate_CompCol_Matrix_dist(&A, m, n, nnz, a, asub, xa, + SLU_NC, SLU_S, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. */ + if (!(b=floatMalloc_dist(m*nrhs))) ABORT("Malloc fails for b[]"); + if (!(xtrue=floatMalloc_dist(n*nrhs))) ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + ldx = n; + ldb = m; + sGenXtrue_dist(n, nrhs, xtrue, ldx); + sFillRHS_dist(trans, nrhs, xtrue, ldx, &A, b, ldb); + + if ( !(berr = floatMalloc_dist(nrhs)) ) + ABORT("Malloc fails for berr[]."); + + /* ------------------------------------------------------------ + NOW WE SOLVE THE LINEAR SYSTEM. + ------------------------------------------------------------*/ + + /* Set the default input options: + options.Fact = DOFACT; + options.Equil = YES; + options.ColPerm = MMD_AT_PLUS_A; + options.RowPerm = LargeDiag_MC64; + options.ReplaceTinyPivot = YES; + options.Trans = NOTRANS; + options.IterRefine = DOUBLE; + options.SolveInitialized = NO; + options.RefineInitialized = NO; + options.PrintStat = YES; + */ + set_default_options_dist(&options); + + /* Initialize ScalePermstruct and LUstruct. */ + sScalePermstructInit(m, n, &ScalePermstruct); + sLUstructInit(n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit(&stat); + + /* Call the linear equation solver: factorize and solve. */ + psgssvx_ABglobal(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid2, + &LUstruct, berr, &stat, &info); + + /* Check the accuracy of the solution. */ + if ( !iam ) { + sinf_norm_error_dist(n, nrhs, b, ldb, xtrue, ldx, &grid2); + } + + + /* Print the statistics. */ + PStatPrint(&options, &stat, &grid2); + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. + ------------------------------------------------------------*/ + PStatFree(&stat); + Destroy_CompCol_Matrix_dist(&A); + sDestroy_LU(n, &grid2, &LUstruct); + sScalePermstructFree(&ScalePermstruct); + sLUstructFree(&LUstruct); + SUPERLU_FREE(b); + SUPERLU_FREE(xtrue); + SUPERLU_FREE(berr); + } + + fclose(fp); + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRIDS. + ------------------------------------------------------------*/ + superlu_gridexit(&grid1); + superlu_gridexit(&grid2); + +out: + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------*/ + MPI_Finalize(); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit main()"); +#endif + +} diff --git a/EXAMPLE/psdrive_ABglobal.c b/EXAMPLE/psdrive_ABglobal.c new file mode 100644 index 00000000..7d7cdc91 --- /dev/null +++ b/EXAMPLE/psdrive_ABglobal.c @@ -0,0 +1,256 @@ +/*! 
\file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Driver program for psgssvx_ABglobal example + * + *
+ * -- Distributed SuperLU routine (version 1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ *
+ */
+
+#include <math.h>
+#include "superlu_sdefs.h"
+
+/*! \brief
+ *
+ *
+ * Purpose
+ * =======
+ *
+ * The driver program psdrive_ABglobal.
+ *
+ * This example illustrates how to use psgssvx_ABglobal with the full
+ * (default) options to solve a linear system.
+ *
+ * Five basic steps are required:
+ *   1. Initialize the MPI environment and the SuperLU process grid
+ *   2. Set up the input matrix and the right-hand side
+ *   3. Set the options argument
+ *   4. Call psgssvx_ABglobal
+ *   5. Release the process grid and terminate the MPI environment
+ *
+ * On an IBM SP, the program may be run by typing
+ *    poe psdrive_ABglobal -r <proc rows> -c <proc columns> <input_file> -procs <p>
+ *
+ */ + +int main(int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; + sScalePermstruct_t ScalePermstruct; + sLUstruct_t LUstruct; + gridinfo_t grid; + float *berr; + float *a, *b, *xtrue; + int_t *asub, *xa; + int_t m, n, nnz; + int_t nprow, npcol; + int iam, info, ldb, ldx, nrhs; + char trans[1]; + char **cpp, c; + FILE *fp, *fopen(); + extern int cpp_defs(); + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + nrhs = 1; /* Number of right-hand side. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. + ------------------------------------------------------------*/ + MPI_Init( &argc, &argv ); + + /* Parse command line argv[]. */ + for (cpp = argv+1; *cpp; ++cpp) { + if ( **cpp == '-' ) { + c = *(*cpp+1); + ++cpp; + switch (c) { + case 'h': + printf("Options:\n"); + printf("\t-r : process rows (default " IFMT ")\n", nprow); + printf("\t-c : process columns (default " IFMT ")\n", npcol); + exit(0); + break; + case 'r': nprow = atoi(*cpp); + break; + case 'c': npcol = atoi(*cpp); + break; + } + } else { /* Last arg is considered a filename */ + if ( !(fp = fopen(*cpp, "r")) ) { + ABORT("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------*/ + superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid); + + /* Bail out if I do not belong in the grid. */ + iam = grid.iam; + if ( iam == -1 ) goto out; + + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter main()"); +#endif + + /* ------------------------------------------------------------ + PROCESS 0 READS THE MATRIX A, AND THEN BROADCASTS IT TO ALL + THE OTHER PROCESSES. + ------------------------------------------------------------*/ + if ( !iam ) { + /* Print the CPP definitions. */ + cpp_defs(); + + /* Read the matrix stored on disk in Harwell-Boeing format. */ + sreadhb_dist(iam, fp, &m, &n, &nnz, &a, &asub, &xa); + + printf("Input matrix file: %s\n", *cpp); + printf("\tDimension\t" IFMT "x" IFMT "\t # nonzeros " IFMT "\n", m, n, nnz); + printf("\tProcess grid\t%d X %d\n", (int) grid.nprow, (int) grid.npcol); + + /* Broadcast matrix A to the other PEs. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( a, nnz, MPI_FLOAT, 0, grid.comm ); + MPI_Bcast( asub, nnz, mpi_int_t, 0, grid.comm ); + MPI_Bcast( xa, n+1, mpi_int_t, 0, grid.comm ); + } else { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid.comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid.comm ); + + /* Allocate storage for compressed column representation. */ + sallocateA_dist(n, nnz, &a, &asub, &xa); + + MPI_Bcast( a, nnz, MPI_FLOAT, 0, grid.comm ); + MPI_Bcast( asub, nnz, mpi_int_t, 0, grid.comm ); + MPI_Bcast( xa, n+1, mpi_int_t, 0, grid.comm ); + } + + /* Create compressed column matrix for A. */ + sCreate_CompCol_Matrix_dist(&A, m, n, nnz, a, asub, xa, + SLU_NC, SLU_S, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. 
*/ + if (!(b=floatMalloc_dist(m*nrhs))) ABORT("Malloc fails for b[]"); + if (!(xtrue=floatMalloc_dist(n*nrhs))) ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + ldx = n; + ldb = m; + sGenXtrue_dist(n, nrhs, xtrue, ldx); + sFillRHS_dist(trans, nrhs, xtrue, ldx, &A, b, ldb); + + if ( !(berr = floatMalloc_dist(nrhs)) ) + ABORT("Malloc fails for berr[]."); + + /* ------------------------------------------------------------ + NOW WE SOLVE THE LINEAR SYSTEM. + ------------------------------------------------------------*/ + + /* Set the default input options: + options.Fact = DOFACT; + options.Equil = YES; + options.ColPerm = METIS_AT_PLUS_A; + options.RowPerm = LargeDiag_MC64; + options.ReplaceTinyPivot = YES; + options.Trans = NOTRANS; + options.IterRefine = DOUBLE; + options.SolveInitialized = NO; + options.RefineInitialized = NO; + options.PrintStat = YES; + */ + set_default_options_dist(&options); + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + } + + /* Initialize ScalePermstruct and LUstruct. */ + sScalePermstructInit(m, n, &ScalePermstruct); + sLUstructInit(n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit(&stat); + + /* Call the linear equation solver. */ + psgssvx_ABglobal(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, berr, &stat, &info); + + /* Check the accuracy of the solution. */ + if ( !iam ) { + sinf_norm_error_dist(n, nrhs, b, ldb, xtrue, ldx, &grid); + } + PStatPrint(&options, &stat, &grid); /* Print the statistics. */ + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. + ------------------------------------------------------------*/ + PStatFree(&stat); + Destroy_CompCol_Matrix_dist(&A); + sDestroy_LU(n, &grid, &LUstruct); + sScalePermstructFree(&ScalePermstruct); + sLUstructFree(&LUstruct); + SUPERLU_FREE(b); + SUPERLU_FREE(xtrue); + SUPERLU_FREE(berr); + fclose(fp); + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------*/ +out: + superlu_gridexit(&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------*/ + MPI_Finalize(); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit main()"); +#endif + +} + + +int cpp_defs() +{ + printf(".. CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf("\tPROFlevel = %d\n", PROFlevel); +#endif +#if ( StaticPivot>=1 ) + printf("\tStaticPivot = %d\n", StaticPivot); +#endif + printf("....\n"); + return 0; +} diff --git a/EXAMPLE/screate_matrix.c b/EXAMPLE/screate_matrix.c new file mode 100644 index 00000000..2fe905bf --- /dev/null +++ b/EXAMPLE/screate_matrix.c @@ -0,0 +1,427 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Read the matrix from data file + * + *
+ * -- Distributed SuperLU routine (version 2.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ *
+ */
+#include <math.h>
+#include "superlu_sdefs.h"
+
+/* \brief
+ *
+ *
+ * Purpose
+ * =======
+ *
+ * SCREATE_MATRIX reads the matrix from a data file in Harwell-Boeing format,
+ * and distributes it to the processes in a distributed compressed row format.
+ * It also generates the distributed true solution X and the right-hand
+ * side RHS.
+ *
+ *
+ * Arguments
+ * =========
+ *
+ * A     (output) SuperMatrix*
+ *       Local matrix A in NR_loc format.
+ *
+ * NRHS  (input) int_t
+ *       Number of right-hand sides.
+ *
+ * RHS   (output) float**
+ *       The right-hand side matrix.
+ *
+ * LDB   (output) int*
+ *       Leading dimension of the right-hand side matrix.
+ *
+ * X     (output) float**
+ *       The true solution matrix.
+ *
+ * LDX   (output) int*
+ *       The leading dimension of the true solution matrix.
+ *
+ * FP    (input) FILE*
+ *       The matrix file pointer.
+ *
+ * GRID  (input) gridinfo_t*
+ *       The 2D process mesh.
+ *
+ */ + +int screate_matrix(SuperMatrix *A, int nrhs, float **rhs, + int *ldb, float **x, int *ldx, + FILE *fp, gridinfo_t *grid) +{ + SuperMatrix GA; /* global A */ + float *b_global, *xtrue_global; /* replicated on all processes */ + int_t *rowind, *colptr; /* global */ + float *nzval; /* global */ + float *nzval_loc; /* local */ + int_t *colind, *rowptr; /* local */ + int_t m, n, nnz; + int_t m_loc, fst_row, nnz_loc; + int_t m_loc_fst; /* Record m_loc of the first p-1 processors, + when mod(m, p) is not zero. */ + int_t row, col, i, j, relpos; + int iam; + char trans[1]; + int_t *marker; + + iam = grid->iam; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter screate_matrix()"); +#endif + + if ( !iam ) { + double t = SuperLU_timer_(); + + /* Read the matrix stored on disk in Harwell-Boeing format. */ + sreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + + printf("Time to read and distribute matrix %.2f\n", + SuperLU_timer_() - t); fflush(stdout); + + /* Broadcast matrix A to the other PEs. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( nzval, nnz, MPI_FLOAT, 0, grid->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid->comm ); + MPI_Bcast( colptr, n+1, mpi_int_t, 0, grid->comm ); + } else { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid->comm ); + + /* Allocate storage for compressed column representation. */ + sallocateA_dist(n, nnz, &nzval, &rowind, &colptr); + + MPI_Bcast( nzval, nnz, MPI_FLOAT, 0, grid->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid->comm ); + MPI_Bcast( colptr, n+1, mpi_int_t, 0, grid->comm ); + } + +#if 0 + nzval[0]=0.1; +#endif + + /* Compute the number of rows to be distributed to local process */ + m_loc = m / (grid->nprow * grid->npcol); + m_loc_fst = m_loc; + /* When m / procs is not an integer */ + if ((m_loc * grid->nprow * grid->npcol) != m) { + /*m_loc = m_loc+1; + m_loc_fst = m_loc;*/ + if (iam == (grid->nprow * grid->npcol - 1)) /* last proc. gets all*/ + m_loc = m - m_loc * (grid->nprow * grid->npcol - 1); + } + + /* Create compressed column matrix for GA. */ + sCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr, + SLU_NC, SLU_S, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. 
*/ + if ( !(b_global = floatMalloc_dist(m*nrhs)) ) + ABORT("Malloc fails for b[]"); + if ( !(xtrue_global = floatMalloc_dist(n*nrhs)) ) + ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + + sGenXtrue_dist(n, nrhs, xtrue_global, n); + sFillRHS_dist(trans, nrhs, xtrue_global, n, &GA, b_global, m); + + /************************************************* + * Change GA to a local A with NR_loc format * + *************************************************/ + + rowptr = (int_t *) intMalloc_dist(m_loc+1); + marker = (int_t *) intCalloc_dist(n); + + /* Get counts of each row of GA */ + for (i = 0; i < n; ++i) + for (j = colptr[i]; j < colptr[i+1]; ++j) ++marker[rowind[j]]; + /* Set up row pointers */ + rowptr[0] = 0; + fst_row = iam * m_loc_fst; + nnz_loc = 0; + for (j = 0; j < m_loc; ++j) { + row = fst_row + j; + rowptr[j+1] = rowptr[j] + marker[row]; + marker[j] = rowptr[j]; + } + nnz_loc = rowptr[m_loc]; + + nzval_loc = (float *) floatMalloc_dist(nnz_loc); + colind = (int_t *) intMalloc_dist(nnz_loc); + + /* Transfer the matrix into the compressed row storage */ + for (i = 0; i < n; ++i) { + for (j = colptr[i]; j < colptr[i+1]; ++j) { + row = rowind[j]; + if ( (row>=fst_row) && (row=2 ) + if ( !iam ) sPrint_CompCol_Matrix_dist(&GA); +#endif + + /* Destroy GA */ + Destroy_CompCol_Matrix_dist(&GA); + + /******************************************************/ + /* Change GA to a local A with NR_loc format */ + /******************************************************/ + + /* Set up the local A in NR_loc format */ + sCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row, + nzval_loc, colind, rowptr, + SLU_NR_loc, SLU_S, SLU_GE); + + /* Get the local B */ + if ( !((*rhs) = floatMalloc_dist(m_loc*nrhs)) ) + ABORT("Malloc fails for rhs[]"); + for (j =0; j < nrhs; ++j) { + for (i = 0; i < m_loc; ++i) { + row = fst_row + i; + (*rhs)[j*m_loc+i] = b_global[j*n+row]; + } + } + *ldb = m_loc; + + /* Set the true X */ + *ldx = m_loc; + if ( !((*x) = floatMalloc_dist(*ldx * nrhs)) ) + ABORT("Malloc fails for x_loc[]"); + + /* Get the local part of xtrue_global */ + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < m_loc; ++i) + (*x)[i + j*(*ldx)] = xtrue_global[i + fst_row + j*n]; + } + + SUPERLU_FREE(b_global); + SUPERLU_FREE(xtrue_global); + SUPERLU_FREE(marker); + +#if ( DEBUGlevel>=1 ) + printf("sizeof(NRforamt_loc) %lu\n", sizeof(NRformat_loc)); + CHECK_MALLOC(iam, "Exit screate_matrix()"); +#endif + return 0; +} + + + +int screate_matrix_postfix(SuperMatrix *A, int nrhs, float **rhs, + int *ldb, float **x, int *ldx, + FILE *fp, char * postfix, gridinfo_t *grid) +{ + SuperMatrix GA; /* global A */ + float *b_global, *xtrue_global; /* replicated on all processes */ + int_t *rowind, *colptr; /* global */ + float *nzval; /* global */ + float *nzval_loc; /* local */ + int_t *colind, *rowptr; /* local */ + int_t m, n, nnz; + int_t m_loc, fst_row, nnz_loc; + int_t m_loc_fst; /* Record m_loc of the first p-1 processors, + when mod(m, p) is not zero. */ + int_t row, col, i, j, relpos; + int iam; + char trans[1]; + int_t *marker; + + iam = grid->iam; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter screate_matrix()"); +#endif + + if ( !iam ) { + double t = SuperLU_timer_(); + + if(!strcmp(postfix,"rua")){ + /* Read the matrix stored on disk in Harwell-Boeing format. */ + sreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"mtx")){ + /* Read the matrix stored on disk in Matrix Market format. 
*/ + sreadMM_dist(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"rb")){ + /* Read the matrix stored on disk in Rutherford-Boeing format. */ + sreadrb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"dat")){ + /* Read the matrix stored on disk in triplet format. */ + sreadtriple_dist(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"datnh")){ + /* Read the matrix stored on disk in triplet format (without header). */ + sreadtriple_noheader(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"bin")){ + /* Read the matrix stored on disk in binary format. */ + sread_binary(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else { + ABORT("File format not known"); + } + + printf("Time to read and distribute matrix %.2f\n", + SuperLU_timer_() - t); fflush(stdout); + + /* Broadcast matrix A to the other PEs. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( nzval, nnz, MPI_FLOAT, 0, grid->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid->comm ); + MPI_Bcast( colptr, n+1, mpi_int_t, 0, grid->comm ); + } else { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid->comm ); + + /* Allocate storage for compressed column representation. */ + sallocateA_dist(n, nnz, &nzval, &rowind, &colptr); + + MPI_Bcast( nzval, nnz, MPI_FLOAT, 0, grid->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid->comm ); + MPI_Bcast( colptr, n+1, mpi_int_t, 0, grid->comm ); + } + +#if 0 + nzval[0]=0.1; +#endif + + /* Compute the number of rows to be distributed to local process */ + m_loc = m / (grid->nprow * grid->npcol); + m_loc_fst = m_loc; + /* When m / procs is not an integer */ + if ((m_loc * grid->nprow * grid->npcol) != m) { + /*m_loc = m_loc+1; + m_loc_fst = m_loc;*/ + if (iam == (grid->nprow * grid->npcol - 1)) /* last proc. gets all*/ + m_loc = m - m_loc * (grid->nprow * grid->npcol - 1); + } + + /* Create compressed column matrix for GA. */ + sCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr, + SLU_NC, SLU_S, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. 
*/ + if ( !(b_global = floatMalloc_dist(m*nrhs)) ) + ABORT("Malloc fails for b[]"); + if ( !(xtrue_global = floatMalloc_dist(n*nrhs)) ) + ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + + sGenXtrue_dist(n, nrhs, xtrue_global, n); + sFillRHS_dist(trans, nrhs, xtrue_global, n, &GA, b_global, m); + + /************************************************* + * Change GA to a local A with NR_loc format * + *************************************************/ + + rowptr = (int_t *) intMalloc_dist(m_loc+1); + marker = (int_t *) intCalloc_dist(n); + + /* Get counts of each row of GA */ + for (i = 0; i < n; ++i) + for (j = colptr[i]; j < colptr[i+1]; ++j) ++marker[rowind[j]]; + /* Set up row pointers */ + rowptr[0] = 0; + fst_row = iam * m_loc_fst; + nnz_loc = 0; + for (j = 0; j < m_loc; ++j) { + row = fst_row + j; + rowptr[j+1] = rowptr[j] + marker[row]; + marker[j] = rowptr[j]; + } + nnz_loc = rowptr[m_loc]; + + nzval_loc = (float *) floatMalloc_dist(nnz_loc); + colind = (int_t *) intMalloc_dist(nnz_loc); + + /* Transfer the matrix into the compressed row storage */ + for (i = 0; i < n; ++i) { + for (j = colptr[i]; j < colptr[i+1]; ++j) { + row = rowind[j]; + if ( (row>=fst_row) && (row=2 ) + if ( !iam ) sPrint_CompCol_Matrix_dist(&GA); +#endif + + /* Destroy GA */ + Destroy_CompCol_Matrix_dist(&GA); + + /******************************************************/ + /* Change GA to a local A with NR_loc format */ + /******************************************************/ + + /* Set up the local A in NR_loc format */ + sCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row, + nzval_loc, colind, rowptr, + SLU_NR_loc, SLU_S, SLU_GE); + + /* Get the local B */ + if ( !((*rhs) = floatMalloc_dist(m_loc*nrhs)) ) + ABORT("Malloc fails for rhs[]"); + for (j =0; j < nrhs; ++j) { + for (i = 0; i < m_loc; ++i) { + row = fst_row + i; + (*rhs)[j*m_loc+i] = b_global[j*n+row]; + } + } + *ldb = m_loc; + + /* Set the true X */ + *ldx = m_loc; + if ( !((*x) = floatMalloc_dist(*ldx * nrhs)) ) + ABORT("Malloc fails for x_loc[]"); + + /* Get the local part of xtrue_global */ + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < m_loc; ++i) + (*x)[i + j*(*ldx)] = xtrue_global[i + fst_row + j*n]; + } + + SUPERLU_FREE(b_global); + SUPERLU_FREE(xtrue_global); + SUPERLU_FREE(marker); + +#if ( DEBUGlevel>=1 ) + printf("sizeof(NRforamt_loc) %lu\n", sizeof(NRformat_loc)); + CHECK_MALLOC(iam, "Exit screate_matrix()"); +#endif + return 0; +} diff --git a/EXAMPLE/screate_matrix3d.c b/EXAMPLE/screate_matrix3d.c new file mode 100644 index 00000000..f1b0cf64 --- /dev/null +++ b/EXAMPLE/screate_matrix3d.c @@ -0,0 +1,463 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + + +/*! @file + * \brief Read the matrix from data file + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley,
+ * Oak Ridge National Lab.
+ * May 12, 2021
+ *
+ */
+#include <math.h>
+#include "superlu_sdefs.h"
+
+/* \brief
+ *
+ *
+ * Purpose
+ * =======
+ *
+ * SCREATE_MATRIX3D reads the matrix from a data file in Harwell-Boeing
+ * format, and distributes it to the processes in a distributed compressed
+ * row format. It also generates the distributed true solution X and the
+ * right-hand side RHS.
+ *
+ *
+ * Arguments
+ * =========
+ *
+ * A     (output) SuperMatrix*
+ *       Local matrix A in NR_loc format.
+ *
+ * NRHS  (input) int_t
+ *       Number of right-hand sides.
+ *
+ * RHS   (output) float**
+ *       The right-hand side matrix.
+ *
+ * LDB   (output) int*
+ *       Leading dimension of the right-hand side matrix.
+ *
+ * X     (output) float**
+ *       The true solution matrix.
+ *
+ * LDX   (output) int*
+ *       The leading dimension of the true solution matrix.
+ *
+ * FP    (input) FILE*
+ *       The matrix file pointer.
+ *
+ * GRID3D (input) gridinfo3d_t*
+ *       The 3D process grid.
+ *
+ */ + +int screate_matrix3d(SuperMatrix *A, int nrhs, float **rhs, + int *ldb, float **x, int *ldx, + FILE *fp, gridinfo3d_t *grid3d) +{ + SuperMatrix GA; /* global A */ + float *b_global, *xtrue_global; /* replicated on all processes */ + int_t *rowind, *colptr; /* global */ + float *nzval; /* global */ + float *nzval_loc; /* local */ + int_t *colind, *rowptr; /* local */ + int_t m, n, nnz; + int_t m_loc, fst_row, nnz_loc; + int_t m_loc_fst; /* Record m_loc of the first p-1 processors, + when mod(m, p) is not zero. */ + int_t row, col, i, j, relpos; + int iam; + char trans[1]; + int_t *marker; + + iam = grid3d->iam; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter dcreate_matrix3d()"); +#endif + + if ( !iam ) + { + double t = SuperLU_timer_(); + + /* Read the matrix stored on disk in Harwell-Boeing format. */ + sreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + + printf("Time to read and distribute matrix %.2f\n", + SuperLU_timer_() - t); fflush(stdout); + + /* Broadcast matrix A to the other PEs. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( nzval, nnz, MPI_FLOAT, 0, grid3d->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( colptr, n + 1, mpi_int_t, 0, grid3d->comm ); + } + else + { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid3d->comm ); + + /* Allocate storage for compressed column representation. */ + sallocateA_dist(n, nnz, &nzval, &rowind, &colptr); + + MPI_Bcast( nzval, nnz, MPI_FLOAT, 0, grid3d->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid3d->comm ); + MPI_Bcast( colptr, n + 1, mpi_int_t, 0, grid3d->comm ); + } + +#if 0 + nzval[0] = 0.1; +#endif + + /* Compute the number of rows to be distributed to local process */ + m_loc = m / (grid3d->nprow * grid3d->npcol* grid3d->npdep); + m_loc_fst = m_loc; + /* When m / procs is not an integer */ + if ((m_loc * grid3d->nprow * grid3d->npcol* grid3d->npdep) != m) + { + /*m_loc = m_loc+1; + m_loc_fst = m_loc;*/ + if (iam == (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1)) /* last proc. gets all*/ + m_loc = m - m_loc * (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1); + } + + /* Create compressed column matrix for GA. */ + sCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr, + SLU_NC, SLU_S, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. 
*/
+    if ( !(b_global = floatMalloc_dist(m * nrhs)) )
+        ABORT("Malloc fails for b[]");
+    if ( !(xtrue_global = floatMalloc_dist(n * nrhs)) )
+        ABORT("Malloc fails for xtrue[]");
+    *trans = 'N';
+
+    sGenXtrue_dist(n, nrhs, xtrue_global, n);
+    sFillRHS_dist(trans, nrhs, xtrue_global, n, &GA, b_global, m);
+
+    /*************************************************
+     * Change GA to a local A with NR_loc format     *
+     *************************************************/
+
+    rowptr = (int_t *) intMalloc_dist(m_loc + 1);
+    marker = (int_t *) intCalloc_dist(n);
+
+    /* Get counts of each row of GA */
+    for (i = 0; i < n; ++i)
+        for (j = colptr[i]; j < colptr[i + 1]; ++j) ++marker[rowind[j]];
+    /* Set up row pointers */
+    rowptr[0] = 0;
+    fst_row = iam * m_loc_fst;
+    nnz_loc = 0;
+    for (j = 0; j < m_loc; ++j)
+    {
+        row = fst_row + j;
+        rowptr[j + 1] = rowptr[j] + marker[row];
+        marker[j] = rowptr[j];
+    }
+    nnz_loc = rowptr[m_loc];
+
+    nzval_loc = (float *) floatMalloc_dist(nnz_loc);
+    colind = (int_t *) intMalloc_dist(nnz_loc);
+
+    /* Transfer the matrix into the compressed row storage */
+    for (i = 0; i < n; ++i)
+    {
+        for (j = colptr[i]; j < colptr[i + 1]; ++j)
+        {
+            row = rowind[j];
+            if ( (row >= fst_row) && (row < fst_row + m_loc) )
+            {
+                row = row - fst_row;
+                relpos = marker[row];
+                colind[relpos] = i;
+                nzval_loc[relpos] = nzval[j];
+                ++marker[row];
+            }
+        }
+    }
+
+#if ( DEBUGlevel>=2 )
+    if ( !iam ) sPrint_CompCol_Matrix_dist(&GA);
+#endif
+
+    /* Destroy GA */
+    Destroy_CompCol_Matrix_dist(&GA);
+
+    /******************************************************/
+    /* Change GA to a local A with NR_loc format          */
+    /******************************************************/
+
+    /* Set up the local A in NR_loc format */
+    sCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row,
+                                   nzval_loc, colind, rowptr,
+                                   SLU_NR_loc, SLU_S, SLU_GE);
+
+    /* Get the local B */
+    if ( !((*rhs) = floatMalloc_dist(m_loc * nrhs)) )
+        ABORT("Malloc fails for rhs[]");
+    for (j = 0; j < nrhs; ++j)
+    {
+        for (i = 0; i < m_loc; ++i)
+        {
+            row = fst_row + i;
+            (*rhs)[j * m_loc + i] = b_global[j * n + row];
+        }
+    }
+    *ldb = m_loc;
+
+    /* Set the true X */
+    *ldx = m_loc;
+    if ( !((*x) = floatMalloc_dist(*ldx * nrhs)) )
+        ABORT("Malloc fails for x_loc[]");
+
+    /* Get the local part of xtrue_global */
+    for (j = 0; j < nrhs; ++j)
+    {
+        for (i = 0; i < m_loc; ++i)
+            (*x)[i + j * (*ldx)] = xtrue_global[i + fst_row + j * n];
+    }
+
+    SUPERLU_FREE(b_global);
+    SUPERLU_FREE(xtrue_global);
+    SUPERLU_FREE(marker);
+
+#if ( DEBUGlevel>=1 )
+    printf("sizeof(NRformat_loc) %lu\n", sizeof(NRformat_loc));
+    CHECK_MALLOC(iam, "Exit screate_matrix3d()");
+#endif
+    return 0;
+}
+
+
+int screate_matrix_postfix3d(SuperMatrix *A, int nrhs, float **rhs,
+                             int *ldb, float **x, int *ldx,
+                             FILE *fp, char * postfix, gridinfo3d_t *grid3d)
+{
+    SuperMatrix GA;              /* global A */
+    float *b_global, *xtrue_global;  /* replicated on all processes */
+    int_t *rowind, *colptr;      /* global */
+    float *nzval;                /* global */
+    float *nzval_loc;            /* local */
+    int_t *colind, *rowptr;      /* local */
+    int_t m, n, nnz;
+    int_t m_loc, fst_row, nnz_loc;
+    int_t m_loc_fst; /* Record m_loc of the first p-1 processors,
+                        when mod(m, p) is not zero. */
+    int_t row, col, i, j, relpos;
+    int iam;
+    char trans[1];
+    int_t *marker;
+
+    iam = grid3d->iam;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter screate_matrix_postfix3d()");
+#endif
+
+    if ( !iam )
+    {
+        double t = SuperLU_timer_();
+
+        if (!strcmp(postfix, "rua"))
+        {
+            /* Read the matrix stored on disk in Harwell-Boeing format.
*/
+            sreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr);
+        }
+        else if (!strcmp(postfix, "mtx"))
+        {
+            /* Read the matrix stored on disk in Matrix Market format. */
+            sreadMM_dist(fp, &m, &n, &nnz, &nzval, &rowind, &colptr);
+        }
+        else if (!strcmp(postfix, "rb"))
+        {
+            /* Read the matrix stored on disk in Rutherford-Boeing format. */
+            sreadrb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr);
+        }
+        else if (!strcmp(postfix, "dat"))
+        {
+            /* Read the matrix stored on disk in triplet format. */
+            sreadtriple_dist(fp, &m, &n, &nnz, &nzval, &rowind, &colptr);
+        }
+        else if (!strcmp(postfix, "datnh"))
+        {
+            /* Read the matrix stored on disk in triplet format (without header). */
+            sreadtriple_noheader(fp, &m, &n, &nnz, &nzval, &rowind, &colptr);
+        }
+        else if (!strcmp(postfix, "bin"))
+        {
+            /* Read the matrix stored on disk in binary format. */
+            sread_binary(fp, &m, &n, &nnz, &nzval, &rowind, &colptr);
+        }
+        else
+        {
+            ABORT("File format not known");
+        }
+
+        printf("Time to read and distribute matrix %.2f\n",
+               SuperLU_timer_() - t);  fflush(stdout);
+
+        /* Broadcast matrix A to the other PEs. */
+        MPI_Bcast( &m,     1,   mpi_int_t,  0, grid3d->comm );
+        MPI_Bcast( &n,     1,   mpi_int_t,  0, grid3d->comm );
+        MPI_Bcast( &nnz,   1,   mpi_int_t,  0, grid3d->comm );
+        MPI_Bcast( nzval,  nnz, MPI_FLOAT,  0, grid3d->comm );
+        MPI_Bcast( rowind, nnz, mpi_int_t,  0, grid3d->comm );
+        MPI_Bcast( colptr, n + 1, mpi_int_t, 0, grid3d->comm );
+    }
+    else
+    {
+        /* Receive matrix A from PE 0. */
+        MPI_Bcast( &m,   1, mpi_int_t, 0, grid3d->comm );
+        MPI_Bcast( &n,   1, mpi_int_t, 0, grid3d->comm );
+        MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid3d->comm );
+
+        /* Allocate storage for compressed column representation. */
+        sallocateA_dist(n, nnz, &nzval, &rowind, &colptr);
+
+        MPI_Bcast( nzval,  nnz, MPI_FLOAT, 0, grid3d->comm );
+        MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid3d->comm );
+        MPI_Bcast( colptr, n + 1, mpi_int_t, 0, grid3d->comm );
+    }
+
+#if 0
+    nzval[0] = 0.1;
+#endif
+
+    /* Compute the number of rows to be distributed to local process */
+    m_loc = m / (grid3d->nprow * grid3d->npcol * grid3d->npdep);
+    m_loc_fst = m_loc;
+    /* When m / procs is not an integer */
+    if ((m_loc * grid3d->nprow * grid3d->npcol * grid3d->npdep) != m)
+    {
+        /*m_loc = m_loc+1;
+          m_loc_fst = m_loc;*/
+        if (iam == (grid3d->nprow * grid3d->npcol * grid3d->npdep - 1)) /* last proc. gets all*/
+            m_loc = m - m_loc * (grid3d->nprow * grid3d->npcol * grid3d->npdep - 1);
+    }
+
+    /* Create compressed column matrix for GA. */
+    sCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr,
+                                SLU_NC, SLU_S, SLU_GE);
+
+    /* Generate the exact solution and compute the right-hand side.
*/
+    if ( !(b_global = floatMalloc_dist(m * nrhs)) )
+        ABORT("Malloc fails for b[]");
+    if ( !(xtrue_global = floatMalloc_dist(n * nrhs)) )
+        ABORT("Malloc fails for xtrue[]");
+    *trans = 'N';
+
+    sGenXtrue_dist(n, nrhs, xtrue_global, n);
+    sFillRHS_dist(trans, nrhs, xtrue_global, n, &GA, b_global, m);
+
+    /*************************************************
+     * Change GA to a local A with NR_loc format     *
+     *************************************************/
+
+    rowptr = (int_t *) intMalloc_dist(m_loc + 1);
+    marker = (int_t *) intCalloc_dist(n);
+
+    /* Get counts of each row of GA */
+    for (i = 0; i < n; ++i)
+        for (j = colptr[i]; j < colptr[i + 1]; ++j) ++marker[rowind[j]];
+    /* Set up row pointers */
+    rowptr[0] = 0;
+    fst_row = iam * m_loc_fst;
+    nnz_loc = 0;
+    for (j = 0; j < m_loc; ++j)
+    {
+        row = fst_row + j;
+        rowptr[j + 1] = rowptr[j] + marker[row];
+        marker[j] = rowptr[j];
+    }
+    nnz_loc = rowptr[m_loc];
+
+    nzval_loc = (float *) floatMalloc_dist(nnz_loc);
+    colind = (int_t *) intMalloc_dist(nnz_loc);
+
+    /* Transfer the matrix into the compressed row storage */
+    for (i = 0; i < n; ++i)
+    {
+        for (j = colptr[i]; j < colptr[i + 1]; ++j)
+        {
+            row = rowind[j];
+            if ( (row >= fst_row) && (row < fst_row + m_loc) )
+            {
+                row = row - fst_row;
+                relpos = marker[row];
+                colind[relpos] = i;
+                nzval_loc[relpos] = nzval[j];
+                ++marker[row];
+            }
+        }
+    }
+
+#if ( DEBUGlevel>=2 )
+    if ( !iam ) sPrint_CompCol_Matrix_dist(&GA);
+#endif
+
+    /* Destroy GA */
+    Destroy_CompCol_Matrix_dist(&GA);
+
+    /******************************************************/
+    /* Change GA to a local A with NR_loc format          */
+    /******************************************************/
+
+    /* Set up the local A in NR_loc format */
+    sCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row,
+                                   nzval_loc, colind, rowptr,
+                                   SLU_NR_loc, SLU_S, SLU_GE);
+
+    /* Get the local B */
+    if ( !((*rhs) = floatMalloc_dist(m_loc * nrhs)) )
+        ABORT("Malloc fails for rhs[]");
+    for (j = 0; j < nrhs; ++j)
+    {
+        for (i = 0; i < m_loc; ++i)
+        {
+            row = fst_row + i;
+            (*rhs)[j * m_loc + i] = b_global[j * n + row];
+        }
+    }
+    *ldb = m_loc;
+
+    /* Set the true X */
+    *ldx = m_loc;
+    if ( !((*x) = floatMalloc_dist(*ldx * nrhs)) )
+        ABORT("Malloc fails for x_loc[]");
+
+    /* Get the local part of xtrue_global */
+    for (j = 0; j < nrhs; ++j)
+    {
+        for (i = 0; i < m_loc; ++i)
+            (*x)[i + j * (*ldx)] = xtrue_global[i + fst_row + j * n];
+    }
+
+    SUPERLU_FREE(b_global);
+    SUPERLU_FREE(xtrue_global);
+    SUPERLU_FREE(marker);
+
+#if ( DEBUGlevel>=1 )
+    printf("sizeof(NRformat_loc) %lu\n", sizeof(NRformat_loc));
+    CHECK_MALLOC(iam, "Exit screate_matrix_postfix3d()");
+#endif
+    return 0;
+}
diff --git a/EXAMPLE/screate_matrix_perturbed.c b/EXAMPLE/screate_matrix_perturbed.c
new file mode 100644
index 00000000..0f1d6625
--- /dev/null
+++ b/EXAMPLE/screate_matrix_perturbed.c
@@ -0,0 +1,419 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Read the matrix from data file
+ *
+ *
+ * -- Distributed SuperLU routine (version 5.1.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * December 31, 2016
+ *
+ */
+#include <math.h>
+#include "superlu_sdefs.h"
+
+/* \brief
+ *
+ *
+ * Purpose
+ * =======
+ *
+ * SCREATE_MATRIX_PERTURBED reads the matrix from a data file in
+ * Harwell-Boeing format, and distributes it to the processes in a
+ * distributed compressed row format. It also generates the distributed
+ * true solution X and the right-hand side RHS.
+ *
+ * Arguments
+ * =========
+ *
+ * A     (output) SuperMatrix*
+ *       Local matrix A in NR_loc format.
+ *
+ * NRHS  (input) int_t
+ *       Number of right-hand sides.
+ *
+ * RHS   (output) float**
+ *       The right-hand side matrix.
+ *
+ * LDB   (output) int*
+ *       Leading dimension of the right-hand side matrix.
+ *
+ * X     (output) float**
+ *       The true solution matrix.
+ *
+ * LDX   (output) int*
+ *       The leading dimension of the true solution matrix.
+ *
+ * FP    (input) FILE*
+ *       The matrix file pointer.
+ *
+ * GRID  (input) gridinfo_t*
+ *       The 2D process mesh.
+ *
+ */ + +int screate_matrix_perturbed(SuperMatrix *A, int nrhs, float **rhs, + int *ldb, float **x, int *ldx, + FILE *fp, gridinfo_t *grid) +{ + SuperMatrix GA; /* global A */ + float *b_global, *xtrue_global; /* replicated on all processes */ + int_t *rowind, *colptr; /* global */ + float *nzval; /* global */ + float *nzval_loc; /* local */ + int_t *colind, *rowptr; /* local */ + int_t m, n, nnz; + int_t m_loc, fst_row, nnz_loc; + int_t m_loc_fst; /* Record m_loc of the first p-1 processors, + when mod(m, p) is not zero. */ + int_t row, col, i, j, relpos; + int iam; + char trans[1]; + int_t *marker; + + iam = grid->iam; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter screate_matrix()"); +#endif + + if ( !iam ) { + /* Read the matrix stored on disk in Harwell-Boeing format. */ + sreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + + /* Broadcast matrix A to the other PEs. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( nzval, nnz, MPI_FLOAT, 0, grid->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid->comm ); + MPI_Bcast( colptr, n+1, mpi_int_t, 0, grid->comm ); + } else { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid->comm ); + + /* Allocate storage for compressed column representation. */ + sallocateA_dist(n, nnz, &nzval, &rowind, &colptr); + + MPI_Bcast( nzval, nnz, MPI_FLOAT, 0, grid->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid->comm ); + MPI_Bcast( colptr, n+1, mpi_int_t, 0, grid->comm ); + } + + /* Perturbed the 1st and last diagonal of the matrix to lower + values. Intention is to change perm_r[]. */ + nzval[0] *= 0.01; + nzval[nnz-1] *= 0.0001; + + /* Compute the number of rows to be distributed to local process */ + m_loc = m / (grid->nprow * grid->npcol); + m_loc_fst = m_loc; + /* When m / procs is not an integer */ + if ((m_loc * grid->nprow * grid->npcol) != m) { + /*m_loc = m_loc+1; + m_loc_fst = m_loc;*/ + if (iam == (grid->nprow * grid->npcol - 1)) /* last proc. gets all*/ + m_loc = m - m_loc * (grid->nprow * grid->npcol - 1); + } + + /* Create compressed column matrix for GA. */ + sCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr, + SLU_NC, SLU_S, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. 
*/ + if ( !(b_global = floatMalloc_dist(m*nrhs)) ) + ABORT("Malloc fails for b[]"); + if ( !(xtrue_global = floatMalloc_dist(n*nrhs)) ) + ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + + sGenXtrue_dist(n, nrhs, xtrue_global, n); + sFillRHS_dist(trans, nrhs, xtrue_global, n, &GA, b_global, m); + + /************************************************* + * Change GA to a local A with NR_loc format * + *************************************************/ + + rowptr = (int_t *) intMalloc_dist(m_loc+1); + marker = (int_t *) intCalloc_dist(n); + + /* Get counts of each row of GA */ + for (i = 0; i < n; ++i) + for (j = colptr[i]; j < colptr[i+1]; ++j) ++marker[rowind[j]]; + /* Set up row pointers */ + rowptr[0] = 0; + fst_row = iam * m_loc_fst; + nnz_loc = 0; + for (j = 0; j < m_loc; ++j) { + row = fst_row + j; + rowptr[j+1] = rowptr[j] + marker[row]; + marker[j] = rowptr[j]; + } + nnz_loc = rowptr[m_loc]; + + nzval_loc = (float *) floatMalloc_dist(nnz_loc); + colind = (int_t *) intMalloc_dist(nnz_loc); + + /* Transfer the matrix into the compressed row storage */ + for (i = 0; i < n; ++i) { + for (j = colptr[i]; j < colptr[i+1]; ++j) { + row = rowind[j]; + if ( (row>=fst_row) && (row=2 ) + if ( !iam ) sPrint_CompCol_Matrix_dist(&GA); +#endif + + /* Destroy GA */ + Destroy_CompCol_Matrix_dist(&GA); + + /******************************************************/ + /* Change GA to a local A with NR_loc format */ + /******************************************************/ + + /* Set up the local A in NR_loc format */ + sCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row, + nzval_loc, colind, rowptr, + SLU_NR_loc, SLU_S, SLU_GE); + + /* Get the local B */ + if ( !((*rhs) = floatMalloc_dist(m_loc*nrhs)) ) + ABORT("Malloc fails for rhs[]"); + for (j =0; j < nrhs; ++j) { + for (i = 0; i < m_loc; ++i) { + row = fst_row + i; + (*rhs)[j*m_loc+i] = b_global[j*n+row]; + } + } + *ldb = m_loc; + + /* Set the true X */ + *ldx = m_loc; + if ( !((*x) = floatMalloc_dist(*ldx * nrhs)) ) + ABORT("Malloc fails for x_loc[]"); + + /* Get the local part of xtrue_global */ + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < m_loc; ++i) + (*x)[i + j*(*ldx)] = xtrue_global[i + fst_row + j*n]; + } + + SUPERLU_FREE(b_global); + SUPERLU_FREE(xtrue_global); + SUPERLU_FREE(marker); + +#if ( DEBUGlevel>=1 ) + printf("sizeof(NRforamt_loc) %lu\n", sizeof(NRformat_loc)); + CHECK_MALLOC(iam, "Exit screate_matrix()"); +#endif + return 0; +} + + + +int screate_matrix_perturbed_postfix(SuperMatrix *A, int nrhs, float **rhs, + int *ldb, float **x, int *ldx, + FILE *fp, char *postfix, gridinfo_t *grid) +{ + SuperMatrix GA; /* global A */ + float *b_global, *xtrue_global; /* replicated on all processes */ + int_t *rowind, *colptr; /* global */ + float *nzval; /* global */ + float *nzval_loc; /* local */ + int_t *colind, *rowptr; /* local */ + int_t m, n, nnz; + int_t m_loc, fst_row, nnz_loc; + int_t m_loc_fst; /* Record m_loc of the first p-1 processors, + when mod(m, p) is not zero. */ + int_t row, col, i, j, relpos; + int iam; + char trans[1]; + int_t *marker; + + iam = grid->iam; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter screate_matrix()"); +#endif + + if ( !iam ) { + double t = SuperLU_timer_(); + if(!strcmp(postfix,"rua")){ + /* Read the matrix stored on disk in Harwell-Boeing format. */ + sreadhb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"mtx")){ + /* Read the matrix stored on disk in Matrix Market format. 
*/ + sreadMM_dist(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"rb")){ + /* Read the matrix stored on disk in Rutherford-Boeing format. */ + sreadrb_dist(iam, fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"dat")){ + /* Read the matrix stored on disk in triplet format. */ + sreadtriple_dist(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else if(!strcmp(postfix,"bin")){ + /* Read the matrix stored on disk in binary format. */ + sread_binary(fp, &m, &n, &nnz, &nzval, &rowind, &colptr); + }else { + ABORT("File format not known"); + } + + printf("Time to read and distribute matrix %.2f\n", + SuperLU_timer_() - t); fflush(stdout); + + /* Broadcast matrix A to the other PEs. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( nzval, nnz, MPI_FLOAT, 0, grid->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid->comm ); + MPI_Bcast( colptr, n+1, mpi_int_t, 0, grid->comm ); + } else { + /* Receive matrix A from PE 0. */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid->comm ); + MPI_Bcast( &nnz, 1, mpi_int_t, 0, grid->comm ); + + /* Allocate storage for compressed column representation. */ + sallocateA_dist(n, nnz, &nzval, &rowind, &colptr); + + MPI_Bcast( nzval, nnz, MPI_FLOAT, 0, grid->comm ); + MPI_Bcast( rowind, nnz, mpi_int_t, 0, grid->comm ); + MPI_Bcast( colptr, n+1, mpi_int_t, 0, grid->comm ); + } + + /* Perturbed the 1st and last diagonal of the matrix to lower + values. Intention is to change perm_r[]. */ + nzval[0] *= 0.01; + nzval[nnz-1] *= 0.0001; + + /* Compute the number of rows to be distributed to local process */ + m_loc = m / (grid->nprow * grid->npcol); + m_loc_fst = m_loc; + /* When m / procs is not an integer */ + if ((m_loc * grid->nprow * grid->npcol) != m) { + /*m_loc = m_loc+1; + m_loc_fst = m_loc;*/ + if (iam == (grid->nprow * grid->npcol - 1)) /* last proc. gets all*/ + m_loc = m - m_loc * (grid->nprow * grid->npcol - 1); + } + + /* Create compressed column matrix for GA. */ + sCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr, + SLU_NC, SLU_S, SLU_GE); + + /* Generate the exact solution and compute the right-hand side. 
*/ + if ( !(b_global = floatMalloc_dist(m*nrhs)) ) + ABORT("Malloc fails for b[]"); + if ( !(xtrue_global = floatMalloc_dist(n*nrhs)) ) + ABORT("Malloc fails for xtrue[]"); + *trans = 'N'; + + sGenXtrue_dist(n, nrhs, xtrue_global, n); + sFillRHS_dist(trans, nrhs, xtrue_global, n, &GA, b_global, m); + + /************************************************* + * Change GA to a local A with NR_loc format * + *************************************************/ + + rowptr = (int_t *) intMalloc_dist(m_loc+1); + marker = (int_t *) intCalloc_dist(n); + + /* Get counts of each row of GA */ + for (i = 0; i < n; ++i) + for (j = colptr[i]; j < colptr[i+1]; ++j) ++marker[rowind[j]]; + /* Set up row pointers */ + rowptr[0] = 0; + fst_row = iam * m_loc_fst; + nnz_loc = 0; + for (j = 0; j < m_loc; ++j) { + row = fst_row + j; + rowptr[j+1] = rowptr[j] + marker[row]; + marker[j] = rowptr[j]; + } + nnz_loc = rowptr[m_loc]; + + nzval_loc = (float *) floatMalloc_dist(nnz_loc); + colind = (int_t *) intMalloc_dist(nnz_loc); + + /* Transfer the matrix into the compressed row storage */ + for (i = 0; i < n; ++i) { + for (j = colptr[i]; j < colptr[i+1]; ++j) { + row = rowind[j]; + if ( (row>=fst_row) && (row=2 ) + if ( !iam ) sPrint_CompCol_Matrix_dist(&GA); +#endif + + /* Destroy GA */ + Destroy_CompCol_Matrix_dist(&GA); + + /******************************************************/ + /* Change GA to a local A with NR_loc format */ + /******************************************************/ + + /* Set up the local A in NR_loc format */ + sCreate_CompRowLoc_Matrix_dist(A, m, n, nnz_loc, m_loc, fst_row, + nzval_loc, colind, rowptr, + SLU_NR_loc, SLU_S, SLU_GE); + + /* Get the local B */ + if ( !((*rhs) = floatMalloc_dist(m_loc*nrhs)) ) + ABORT("Malloc fails for rhs[]"); + for (j =0; j < nrhs; ++j) { + for (i = 0; i < m_loc; ++i) { + row = fst_row + i; + (*rhs)[j*m_loc+i] = b_global[j*n+row]; + } + } + *ldb = m_loc; + + /* Set the true X */ + *ldx = m_loc; + if ( !((*x) = floatMalloc_dist(*ldx * nrhs)) ) + ABORT("Malloc fails for x_loc[]"); + + /* Get the local part of xtrue_global */ + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < m_loc; ++i) + (*x)[i + j*(*ldx)] = xtrue_global[i + fst_row + j*n]; + } + + SUPERLU_FREE(b_global); + SUPERLU_FREE(xtrue_global); + SUPERLU_FREE(marker); + +#if ( DEBUGlevel>=1 ) + printf("sizeof(NRforamt_loc) %lu\n", sizeof(NRformat_loc)); + CHECK_MALLOC(iam, "Exit screate_matrix()"); +#endif + return 0; +} diff --git a/FORTRAN/superlu_dist_config.fh b/FORTRAN/superlu_dist_config.fh index cbe990cc..878933ff 100644 --- a/FORTRAN/superlu_dist_config.fh +++ b/FORTRAN/superlu_dist_config.fh @@ -1,9 +1,11 @@ +#define HAVE_CUDA TRUE #define HAVE_PARMETIS TRUE +#define XSDK_INDEX_SIZE 64 #if (XSDK_INDEX_SIZE == 64) #define _LONGINT 1 diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt index 3589770f..51f10a42 100644 --- a/SRC/CMakeLists.txt +++ b/SRC/CMakeLists.txt @@ -127,6 +127,68 @@ endif() endif() ########## enable double +if(enable_single) + list(APPEND headers superlu_sdefs.h slustruct_gpu.h) + + list(APPEND sources + slangs_dist.c + sgsequ_dist.c + slaqgs_dist.c + sutil_dist.c + smemory_dist.c + smyblas2_dist.c + ssp_blas2_dist.c + ssp_blas3_dist.c + psgssvx.c + psgssvx_ABglobal.c + sreadhb.c + sreadrb.c + sreadtriple.c + sreadtriple_noheader.c + sbinary_io.c + sreadMM.c + psgsequ.c + pslaqgs.c + sldperm_dist.c + pslangs.c + psutil.c + pssymbfact_distdata.c + sdistribute.c + psdistribute.c + psgstrf.c + sstatic_schedule.c + psgstrf2.c + psgstrs.c + psgstrs1.c + psgstrs_lsum.c + 
psgstrs_Bglobal.c + psgsrfs.c + psgsmv.c + psgsrfs_ABXglobal.c + psgsmv_AXglobal.c + psGetDiagU.c + psgssvx3d.c ## 3D code + snrformat_loc3d.c + psgstrf3d.c + streeFactorization.c + streeFactorizationGPU.c + sgather.c + sscatter3d.c + ps3dcomm.c + strfAux.c + scommunication_aux.c + strfCommWrapper.c + ssuperlu_blas.c + ) +if (HAVE_CUDA) + list(APPEND sources ssuperlu_gpu.cu) +endif() +if (HAVE_COMBBLAS) + list(APPEND sources s_c2cpp_GetHWPM.cpp sHWPM_CombBLAS.hpp) +endif() + +endif() ########## enable single + if(enable_complex16) list(APPEND headers superlu_zdefs.h zlustruct_gpu.h) diff --git a/SRC/TreeInterface.cpp b/SRC/TreeInterface.cpp index 4874b8b4..60c957c5 100644 --- a/SRC/TreeInterface.cpp +++ b/SRC/TreeInterface.cpp @@ -15,11 +15,14 @@ namespace SuperLU_ASYNCOMM{ TreeBcast_slu* BcastTree = TreeBcast_slu::Create(comm,ranks,rank_cnt,msgSize,rseed); return (BcTree) BcastTree; } + if(precision=='s'){ + TreeBcast_slu* BcastTree = TreeBcast_slu::Create(comm,ranks,rank_cnt,msgSize,rseed); + return (BcTree) BcastTree; + } if(precision=='z'){ TreeBcast_slu* BcastTree = TreeBcast_slu::Create(comm,ranks,rank_cnt,msgSize,rseed); return (BcTree) BcastTree; } - return 0; } void BcTree_Destroy(BcTree Tree, char precision){ @@ -27,6 +30,10 @@ namespace SuperLU_ASYNCOMM{ TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; delete BcastTree; } + if(precision=='s'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + delete BcastTree; + } if(precision=='z'){ TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; delete BcastTree; @@ -35,90 +42,112 @@ namespace SuperLU_ASYNCOMM{ } void BcTree_SetTag(BcTree Tree, Int tag, char precision){ - if(precision=='d'){ - TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + if(precision=='d'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; BcastTree->SetTag(tag); - } - if(precision=='z'){ + } + if(precision=='s'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + BcastTree->SetTag(tag); + } + if(precision=='z'){ TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; BcastTree->SetTag(tag); - } + } } yes_no_t BcTree_IsRoot(BcTree Tree, char precision){ if(precision=='d'){ - TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; - return BcastTree->IsRoot()?YES:NO; + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + return BcastTree->IsRoot()?YES:NO; + } + if(precision=='s'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + return BcastTree->IsRoot()?YES:NO; } if(precision=='z'){ TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; return BcastTree->IsRoot()?YES:NO; } - return NO; } void BcTree_forwardMessageSimple(BcTree Tree, void* localBuffer, Int msgSize, char precision){ - if(precision=='d'){ - TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; - BcastTree->forwardMessageSimple((double*)localBuffer,msgSize); - } - if(precision=='z'){ - TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; - BcastTree->forwardMessageSimple((doublecomplex*)localBuffer,msgSize); - } + if(precision=='d'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + BcastTree->forwardMessageSimple((double*)localBuffer,msgSize); + } + if(precision=='s'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + BcastTree->forwardMessageSimple((float*)localBuffer,msgSize); + } + if(precision=='z'){ + TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; + BcastTree->forwardMessageSimple((doublecomplex*)localBuffer,msgSize); + } } - void BcTree_waitSendRequest(BcTree Tree, char precision){ - if(precision=='d'){ - TreeBcast_slu* BcastTree = (TreeBcast_slu*) Tree; - BcastTree->waitSendRequest(); - } - 
-        if(precision=='z'){
-            TreeBcast_slu<doublecomplex>* BcastTree = (TreeBcast_slu<doublecomplex>*) Tree;
-            BcastTree->waitSendRequest();
-        }
+    void BcTree_waitSendRequest(BcTree Tree, char precision) {
+        if(precision=='d'){
+            TreeBcast_slu<double>* BcastTree = (TreeBcast_slu<double>*) Tree;
+            BcastTree->waitSendRequest();
+        }
+        if(precision=='s'){
+            TreeBcast_slu<float>* BcastTree = (TreeBcast_slu<float>*) Tree;
+            BcastTree->waitSendRequest();
+        }
+        if(precision=='z'){
+            TreeBcast_slu<doublecomplex>* BcastTree = (TreeBcast_slu<doublecomplex>*) Tree;
+            BcastTree->waitSendRequest();
+        }
     }
-
     void BcTree_allocateRequest(BcTree Tree, char precision){
-        if(precision=='d'){
-            TreeBcast_slu<double>* BcastTree = (TreeBcast_slu<double>*) Tree;
-            BcastTree->allocateRequest();
-        }
-        if(precision=='z'){
-            TreeBcast_slu<doublecomplex>* BcastTree = (TreeBcast_slu<doublecomplex>*) Tree;
-            BcastTree->allocateRequest();
-        }
+        if(precision=='d'){
+            TreeBcast_slu<double>* BcastTree = (TreeBcast_slu<double>*) Tree;
+            BcastTree->allocateRequest();
+        }
+        if(precision=='s'){
+            TreeBcast_slu<float>* BcastTree = (TreeBcast_slu<float>*) Tree;
+            BcastTree->allocateRequest();
+        }
+        if(precision=='z'){
+            TreeBcast_slu<doublecomplex>* BcastTree = (TreeBcast_slu<doublecomplex>*) Tree;
+            BcastTree->allocateRequest();
+        }
     }

     int BcTree_getDestCount(BcTree Tree, char precision){
-        if(precision=='d'){
-            TreeBcast_slu<double>* BcastTree = (TreeBcast_slu<double>*) Tree;
-            return BcastTree->GetDestCount();
-        }
-        if(precision=='z'){
-            TreeBcast_slu<doublecomplex>* BcastTree = (TreeBcast_slu<doublecomplex>*) Tree;
-            return BcastTree->GetDestCount();
-        }
-        return 0;
+        if(precision=='d'){
+            TreeBcast_slu<double>* BcastTree = (TreeBcast_slu<double>*) Tree;
+            return BcastTree->GetDestCount();
+        }
+        if(precision=='s'){
+            TreeBcast_slu<float>* BcastTree = (TreeBcast_slu<float>*) Tree;
+            return BcastTree->GetDestCount();
+        }
+        if(precision=='z'){
+            TreeBcast_slu<doublecomplex>* BcastTree = (TreeBcast_slu<doublecomplex>*) Tree;
+            return BcastTree->GetDestCount();
+        }
     }

     int BcTree_GetMsgSize(BcTree Tree, char precision){
-        if(precision=='d'){
-            TreeBcast_slu<double>* BcastTree = (TreeBcast_slu<double>*) Tree;
-            return BcastTree->GetMsgSize();
-        }
-        if(precision=='z'){
-            TreeBcast_slu<doublecomplex>* BcastTree = (TreeBcast_slu<doublecomplex>*) Tree;
-            return BcastTree->GetMsgSize();
-        }
-        return 0;
+        if(precision=='d'){
+            TreeBcast_slu<double>* BcastTree = (TreeBcast_slu<double>*) Tree;
+            return BcastTree->GetMsgSize();
+        }
+        if(precision=='s'){
+            TreeBcast_slu<float>* BcastTree = (TreeBcast_slu<float>*) Tree;
+            return BcastTree->GetMsgSize();
+        }
+        if(precision=='z'){
+            TreeBcast_slu<doublecomplex>* BcastTree = (TreeBcast_slu<doublecomplex>*) Tree;
+            return BcastTree->GetMsgSize();
+        }
     }
-
     StdList StdList_Init(){
         std::list<int_t>* lst = new std::list<int_t>();
         return (StdList) lst;
@@ -165,110 +194,139 @@ namespace SuperLU_ASYNCOMM{

     RdTree RdTree_Create(MPI_Comm comm, Int* ranks, Int rank_cnt, Int msgSize, double rseed, char precision){
-        assert(msgSize>0);
-        if(precision=='d'){
-            TreeReduce_slu<double>* ReduceTree = TreeReduce_slu<double>::Create(comm,ranks,rank_cnt,msgSize,rseed);
-            return (RdTree) ReduceTree;
-        }
-        if(precision=='z'){
-            TreeReduce_slu<doublecomplex>* ReduceTree = TreeReduce_slu<doublecomplex>::Create(comm,ranks,rank_cnt,msgSize,rseed);
-            return (RdTree) ReduceTree;
-        }
-        return 0;
+        assert(msgSize>0);
+        if(precision=='d'){
+            TreeReduce_slu<double>* ReduceTree = TreeReduce_slu<double>::Create(comm,ranks,rank_cnt,msgSize,rseed);
+            return (RdTree) ReduceTree;
+        }
+        if(precision=='s'){
+            TreeReduce_slu<float>* ReduceTree = TreeReduce_slu<float>::Create(comm,ranks,rank_cnt,msgSize,rseed);
+            return (RdTree) ReduceTree;
+        }
+        if(precision=='z'){
+            TreeReduce_slu<doublecomplex>* ReduceTree = TreeReduce_slu<doublecomplex>::Create(comm,ranks,rank_cnt,msgSize,rseed);
+            return (RdTree) ReduceTree;
+        }
     }

     void RdTree_Destroy(RdTree Tree, char precision){
-        if(precision=='d'){
-            TreeReduce_slu<double>* ReduceTree = (TreeReduce_slu<double>*) Tree;
-            delete ReduceTree;
-        }
-        if(precision=='z'){
-            TreeReduce_slu<doublecomplex>* ReduceTree = (TreeReduce_slu<doublecomplex>*) Tree;
-            delete ReduceTree;
-        }
+        if(precision=='d'){
+            TreeReduce_slu<double>* ReduceTree = (TreeReduce_slu<double>*) Tree;
+            delete ReduceTree;
+        }
+        if(precision=='s'){
+            TreeReduce_slu<float>* ReduceTree = (TreeReduce_slu<float>*) Tree;
+            delete ReduceTree;
+        }
+        if(precision=='z'){
+            TreeReduce_slu<doublecomplex>* ReduceTree = (TreeReduce_slu<doublecomplex>*) Tree;
+            delete ReduceTree;
+        }
     }

     void RdTree_SetTag(RdTree Tree, Int tag, char precision){
-        if(precision=='d'){
-            TreeReduce_slu<double>* ReduceTree = (TreeReduce_slu<double>*) Tree;
-            ReduceTree->SetTag(tag);
-        }
-        if(precision=='z'){
-            TreeReduce_slu<doublecomplex>* ReduceTree = (TreeReduce_slu<doublecomplex>*) Tree;
-            ReduceTree->SetTag(tag);
-        }
+        if(precision=='d'){
+            TreeReduce_slu<double>* ReduceTree = (TreeReduce_slu<double>*) Tree;
+            ReduceTree->SetTag(tag);
+        }
+        if(precision=='s'){
+            TreeReduce_slu<float>* ReduceTree = (TreeReduce_slu<float>*) Tree;
+            ReduceTree->SetTag(tag);
+        }
+        if(precision=='z'){
+            TreeReduce_slu<doublecomplex>* ReduceTree = (TreeReduce_slu<doublecomplex>*) Tree;
+            ReduceTree->SetTag(tag);
+        }
     }

     int RdTree_GetDestCount(RdTree Tree, char precision){
-        if(precision=='d'){
-            TreeReduce_slu<double>* ReduceTree = (TreeReduce_slu<double>*) Tree;
-            return ReduceTree->GetDestCount();
-        }
-        if(precision=='z'){
-            TreeReduce_slu<doublecomplex>* ReduceTree = (TreeReduce_slu<doublecomplex>*) Tree;
-            return ReduceTree->GetDestCount();
-        }
-        return 0;
+        if(precision=='d'){
+            TreeReduce_slu<double>* ReduceTree = (TreeReduce_slu<double>*) Tree;
+            return ReduceTree->GetDestCount();
+        }
+        if(precision=='s'){
+            TreeReduce_slu<float>* ReduceTree = (TreeReduce_slu<float>*) Tree;
+            return ReduceTree->GetDestCount();
+        }
+        if(precision=='z'){
+            TreeReduce_slu<doublecomplex>* ReduceTree = (TreeReduce_slu<doublecomplex>*) Tree;
+            return ReduceTree->GetDestCount();
+        }
     }

     int RdTree_GetMsgSize(RdTree Tree, char precision){
-        if(precision=='d'){
-            TreeReduce_slu<double>* ReduceTree = (TreeReduce_slu<double>*) Tree;
-            return ReduceTree->GetMsgSize();
-        }
-        if(precision=='z'){
-            TreeReduce_slu<doublecomplex>* ReduceTree = (TreeReduce_slu<doublecomplex>*) Tree;
-            return ReduceTree->GetMsgSize();
-        }
-        return 0;
+        if(precision=='d'){
+            TreeReduce_slu<double>* ReduceTree = (TreeReduce_slu<double>*) Tree;
+            return ReduceTree->GetMsgSize();
+        }
+        if(precision=='s'){
+            TreeReduce_slu<float>* ReduceTree = (TreeReduce_slu<float>*) Tree;
+            return ReduceTree->GetMsgSize();
+        }
+        if(precision=='z'){
+            TreeReduce_slu<doublecomplex>* ReduceTree = (TreeReduce_slu<doublecomplex>*) Tree;
+            return ReduceTree->GetMsgSize();
+        }
     }
-
     yes_no_t RdTree_IsRoot(RdTree Tree, char precision){
-        if(precision=='d'){
-            TreeReduce_slu<double>* ReduceTree = (TreeReduce_slu<double>*) Tree;
-            return ReduceTree->IsRoot()?YES:NO;
-        }
-        if(precision=='z'){
-            TreeReduce_slu<doublecomplex>* ReduceTree = (TreeReduce_slu<doublecomplex>*) Tree;
-            return ReduceTree->IsRoot()?YES:NO;
-        }
-        return NO;
+        if(precision=='d'){
+            TreeReduce_slu<double>* ReduceTree = (TreeReduce_slu<double>*) Tree;
+            return ReduceTree->IsRoot()?YES:NO;
+        }
+        if(precision=='s'){
+            TreeReduce_slu<float>* ReduceTree = (TreeReduce_slu<float>*) Tree;
+            return ReduceTree->IsRoot()?YES:NO;
+        }
+        if(precision=='z'){
+            TreeReduce_slu<doublecomplex>* ReduceTree = (TreeReduce_slu<doublecomplex>*) Tree;
+            return ReduceTree->IsRoot()?YES:NO;
+        }
     }
-
     void RdTree_forwardMessageSimple(RdTree Tree, void* localBuffer, Int msgSize, char precision){
-        if(precision=='d'){
-            TreeReduce_slu<double>* ReduceTree = (TreeReduce_slu<double>*) Tree;
-            ReduceTree->forwardMessageSimple((double*)localBuffer,msgSize);
-        }
-        if(precision=='z'){TreeReduce_slu<doublecomplex>* ReduceTree = (TreeReduce_slu<doublecomplex>*) Tree;
-            ReduceTree->forwardMessageSimple((doublecomplex*)localBuffer,msgSize);
-        }
+        if(precision=='d'){
+            TreeReduce_slu<double>* ReduceTree = (TreeReduce_slu<double>*) Tree;
+            ReduceTree->forwardMessageSimple((double*)localBuffer,msgSize);
+        }
+        if(precision=='s'){
+            TreeReduce_slu<float>* ReduceTree = (TreeReduce_slu<float>*) Tree;
+            ReduceTree->forwardMessageSimple((float*)localBuffer,msgSize);
+        }
+        if(precision=='z'){TreeReduce_slu<doublecomplex>* ReduceTree = (TreeReduce_slu<doublecomplex>*) Tree;
+            ReduceTree->forwardMessageSimple((doublecomplex*)localBuffer,msgSize);
+        }
     }
+
     void RdTree_allocateRequest(RdTree Tree, char precision){
-        if(precision=='d'){
-            TreeReduce_slu<double>* ReduceTree = (TreeReduce_slu<double>*) Tree;
-            ReduceTree->allocateRequest();
-        }
-        if(precision=='z'){
-            TreeReduce_slu<doublecomplex>* ReduceTree = (TreeReduce_slu<doublecomplex>*) Tree;
-            ReduceTree->allocateRequest();
-        }
-
+        if(precision=='d'){
+            TreeReduce_slu<double>* ReduceTree = (TreeReduce_slu<double>*) Tree;
+            ReduceTree->allocateRequest();
+        }
+        if(precision=='s'){
+            TreeReduce_slu<float>* ReduceTree = (TreeReduce_slu<float>*) Tree;
+            ReduceTree->allocateRequest();
+        }
+        if(precision=='z'){
+            TreeReduce_slu<doublecomplex>* ReduceTree = (TreeReduce_slu<doublecomplex>*) Tree;
+            ReduceTree->allocateRequest();
+        }
     }

     void RdTree_waitSendRequest(RdTree Tree, char precision){
-        if(precision=='d'){
-            TreeReduce_slu<double>* ReduceTree = (TreeReduce_slu<double>*) Tree;
-            ReduceTree->waitSendRequest();
-        }
-        if(precision=='z'){
-            TreeReduce_slu<doublecomplex>* ReduceTree = (TreeReduce_slu<doublecomplex>*) Tree;
-            ReduceTree->waitSendRequest();
-        }
+        if(precision=='d'){
+            TreeReduce_slu<double>* ReduceTree = (TreeReduce_slu<double>*) Tree;
+            ReduceTree->waitSendRequest();
+        }
+        if(precision=='s'){
+            TreeReduce_slu<float>* ReduceTree = (TreeReduce_slu<float>*) Tree;
+            ReduceTree->waitSendRequest();
+        }
+        if(precision=='z'){
+            TreeReduce_slu<doublecomplex>* ReduceTree = (TreeReduce_slu<doublecomplex>*) Tree;
+            ReduceTree->waitSendRequest();
+        }
     }

 #ifdef __cplusplus
diff --git a/SRC/ps3dcomm.c b/SRC/ps3dcomm.c
new file mode 100644
index 00000000..2c113562
--- /dev/null
+++ b/SRC/ps3dcomm.c
@@ -0,0 +1,876 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Communication routines for the 3D algorithm.
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * May 12, 2021
+ * </pre>
+ */
+#include "superlu_sdefs.h"
+//#include "cblas.h"
+#if 0
+#include "p3dcomm.h"
+#include "sec_structs.h"
+//#include "load-balance/supernodal_etree.h"
+//#include "load-balance/supernodalForest.h"
+#include "supernodal_etree.h"
+#include "supernodalForest.h"
+#include "trfAux.h"
+#include "treeFactorization.h"
+#include "xtrf3Dpartition.h"
+#endif
+
+// #define MPI_MALLOC
+#define MPI_INT_ALLOC(a, b) (MPI_Alloc_mem( (b)*sizeof(int_t), MPI_INFO_NULL, &(a) ))
+#define MPI_DATATYPE_ALLOC(a, b) (MPI_Alloc_mem((b)*sizeof(float), MPI_INFO_NULL, &(a)))
+
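+/* Note: these wrappers use MPI_Alloc_mem rather than malloc so that, when
+   MPI_MALLOC is enabled, the copied LU panels live in memory the MPI library
+   can register for fast (e.g. RDMA) transfers.  A minimal usage sketch, with
+   hypothetical buffer names:
+
+       int_t *usub_new;  float *uval_new;
+       MPI_INT_ALLOC(usub_new, lens);       // lens int_t entries
+       MPI_DATATYPE_ALLOC(uval_new, lenv);  // lenv float entries
+       ...
+       MPI_Free_mem(usub_new);  MPI_Free_mem(uval_new);
+*/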
+int_t sAllocLlu(int_t nsupers, sLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+{
+    int i;
+    int_t Pc = grid3d->npcol;
+    int_t Pr = grid3d->nprow;
+    
+    int_t nbc = CEILING(nsupers, Pc);
+    int_t nbr = CEILING(nsupers, Pr);
+    
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t   **Lrowind_bc_ptr =
+	(int_t**) SUPERLU_MALLOC(sizeof(int_t*)*nbc); 	/* size ceil(NSUPERS/Pc) */
+    float  **Lnzval_bc_ptr =
+	(float **) SUPERLU_MALLOC(sizeof(float*)*nbc);  /* size ceil(NSUPERS/Pc) */
+
+    for (i = 0; i < nbc ; ++i)
+	{
+	    /* code */
+	    Lrowind_bc_ptr[i] = NULL;
+	    Lnzval_bc_ptr[i] = NULL;
+	}
+    
+    int_t   **Ufstnz_br_ptr =
+	(int_t**) SUPERLU_MALLOC(sizeof(int_t*)*nbr); /* size ceil(NSUPERS/Pr) */
+    float  **Unzval_br_ptr =
+	(float **) SUPERLU_MALLOC(sizeof(float*)*nbr); /* size ceil(NSUPERS/Pr) */
+    
+    for (i = 0; i < nbr ; ++i)
+	{
+	    /* code */
+	    Ufstnz_br_ptr[i] = NULL;
+	    Unzval_br_ptr[i] = NULL;
+	}
+
+   // Sherry: use int type
+                  /* Recv from no one (0), left (1), and up (2).*/
+    int *ToRecv = SUPERLU_MALLOC(nsupers * sizeof(int));
+    for (i = 0; i < nsupers; ++i) ToRecv[i] = 0;
+                  /* Whether need to send down block row. */
+    int *ToSendD = SUPERLU_MALLOC(nbr * sizeof(int));
+    for (i = 0; i < nbr; ++i) ToSendD[i] = 0;
+                  /* List of processes to send right block col. */
+    int **ToSendR = (int **) SUPERLU_MALLOC(nbc * sizeof(int*));
+
+    for (int_t i = 0; i < nbc; ++i)
+	{
+	    /* code */
+	    //ToSendR[i] = INT_T_ALLOC(Pc);
+	    ToSendR[i] = SUPERLU_MALLOC(Pc * sizeof(int));
+	}
+    
+    /*now setup the pointers*/
+    Llu->Lrowind_bc_ptr = Lrowind_bc_ptr ;
+    Llu->Lnzval_bc_ptr = Lnzval_bc_ptr ;
+    Llu->Ufstnz_br_ptr = Ufstnz_br_ptr ;
+    Llu->Unzval_br_ptr = Unzval_br_ptr ;
+    Llu->ToRecv = ToRecv ;
+    Llu->ToSendD = ToSendD ;
+    Llu->ToSendR = ToSendR ;
+    
+    return 0;
+} /* sAllocLlu */
+
+int_t smpiMallocLUStruct(int_t nsupers, sLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+{
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = LUstruct->Glu_persist->xsup;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    float** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    float** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    
+    int_t k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
+    for ( int_t lb = 0; lb < k; ++lb)
+	{
+	    int_t *usub, *usub_new;
+	    usub =  Ufstnz_br_ptr[lb];
+	    
+	    float * uval = Unzval_br_ptr[lb];
+	    float * uval_new;
+	    
+	    /*if non empty set the flag*/
+	    if (usub != NULL)
+		{
+		    int_t lenv, lens;
+		    lenv = usub[1];
+		    lens = usub[2];
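+		    /* Header layout of a U index array (set up in
+		       psdistribute.c): usub[0] = number of column blocks,
+		       usub[1] = total length of nzval[], and
+		       usub[2] = total length of the index array itself. */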
+		    
+		    MPI_INT_ALLOC(usub_new, lens);
+		    memcpy( usub_new, usub, lens * sizeof(int_t));
+		    MPI_DATATYPE_ALLOC(uval_new, lenv);
+		    memcpy( uval_new, uval, lenv * sizeof(float));
+		    Ufstnz_br_ptr[lb] = usub_new;
+		    Unzval_br_ptr[lb] = uval_new;
+		    SUPERLU_FREE(usub);
+		    SUPERLU_FREE(uval);
+		}
+	} /*for ( int_t lb = 0; lb < k; ++lb)*/
+    
+    int_t iam = grid->iam;
+    int_t mycol = MYCOL (iam, grid);
+    
+    /*start broadcasting blocks*/
+    for (int_t jb = 0; jb < nsupers; ++jb)   /* for each block column ... */
+	{
+	    int_t pc = PCOL( jb, grid );
+	    if (mycol == pc)
+		{
+		    int_t ljb = LBj( jb, grid ); /* Local block number */
+		    int_t  *lsub , *lsub_new;
+		    float *lnzval, *lnzval_new;
+		    lsub = Lrowind_bc_ptr[ljb];
+		    lnzval = Lnzval_bc_ptr[ljb];
+		    
+		    if (lsub)
+			{
+			    int_t nrbl, len, len1, len2;
+			    
+			    nrbl  =   lsub[0]; /*number of L blocks */
+			    len   = lsub[1];       /* LDA of the nzval[] */
+			    len1  = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
+			    len2  = SuperSize(jb) * len;
+			    
+			    MPI_INT_ALLOC(lsub_new, len1);
+			    memcpy( lsub_new, lsub, len1 * sizeof(int_t));
+			    MPI_DATATYPE_ALLOC(lnzval_new, len2);
+			    memcpy( lnzval_new, lnzval, len2 * sizeof(float));
+			    Lrowind_bc_ptr[ljb] = lsub_new;
+			    SUPERLU_FREE(lsub );
+			    Lnzval_bc_ptr[ljb] = lnzval_new;
+			    SUPERLU_FREE(lnzval );
+			}
+		} /* if mycol == pc ... */
+	} /* for jb ... */
+    
+    return 0;
+}
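+/* Note: smpiMallocLUStruct() only matters when MPI_MALLOC is defined: it
+   re-allocates the local U and L panels with MPI_Alloc_mem and copies the
+   contents over, so later transfers can use MPI-registered memory.  It is
+   invoked from sp3dScatter() below, guarded by #ifdef MPI_MALLOC. */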
+
+
+int_t szSendLPanel(int_t k, int_t receiver,
+                   sLUstruct_t* LUstruct,  gridinfo3d_t* grid3d, SCT_t* SCT)
+{
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = LUstruct->Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    float** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    int_t iam = grid->iam;
+    int_t mycol = MYCOL (iam, grid);
+
+    int_t pc = PCOL( k, grid );
+    if (mycol == pc)
+	{
+	    int_t lk = LBj( k, grid ); /* Local block number */
+	    int_t  *lsub;
+	    float* lnzval;
+	    lsub = Lrowind_bc_ptr[lk];
+	    lnzval = Lnzval_bc_ptr[lk];
+	    
+	    if (lsub != NULL)
+		{
+		    int_t len   = lsub[1];       /* LDA of the nzval[] */
+		    int_t len2  = SuperSize(k) * len; /* size of nzval of L panel */
+		    
+		    MPI_Send(lnzval, len2, MPI_FLOAT, receiver, k, grid3d->zscp.comm);
+		    SCT->commVolRed += len2 * sizeof(float);
+		}
+	}
+    return 0;
+}
+
+
+int_t szRecvLPanel(int_t k, int_t sender, float alpha, float beta,
+                    float* Lval_buf,
+                    sLUstruct_t* LUstruct,  gridinfo3d_t* grid3d, SCT_t* SCT)
+{
+    
+    // A(k) = alpha*A(k) + beta* A^{sender}(k)
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = LUstruct->Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    float** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    int inc = 1;    
+    int_t iam = grid->iam;
+    int_t mycol = MYCOL (iam, grid);
+    
+    int_t pc = PCOL( k, grid );
+    if (mycol == pc)
+	{
+	    int_t lk = LBj( k, grid ); /* Local block number */
+	    int_t  *lsub;
+	    float* lnzval;
+	    lsub = Lrowind_bc_ptr[lk];
+	    lnzval = Lnzval_bc_ptr[lk];
+	    
+	    if (lsub != NULL)
+		{
+		    int len   = lsub[1];       /* LDA of the nzval[] */
+		    int len2  = SuperSize(k) * len; /* size of nzval of L panels */
+		    
+		    MPI_Status status;
+		    MPI_Recv(Lval_buf , len2, MPI_FLOAT, sender, k,
+			     grid3d->zscp.comm, &status);
+		    
+		    /*reduce the updates*/
+		    superlu_sscal(len2, alpha, lnzval, 1);
+		    superlu_saxpy(len2, beta, Lval_buf, 1, lnzval, 1);
+		}
+	}
+
+    return 0;
+}
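+/* The (alpha, beta) pair selects what the receiver does with the incoming
+   panel: sreduceAncestors3d() below calls szRecvLPanel/szRecvUPanel with
+   alpha = beta = 1, i.e. A(k) += A^{sender}(k), while sgatherFactoredLU()
+   uses alpha = 0, beta = 1, which simply overwrites A(k) with the sender's
+   factored panel. */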
+
+int_t szSendUPanel(int_t k, int_t receiver,
+                    sLUstruct_t* LUstruct,  gridinfo3d_t* grid3d, SCT_t* SCT)
+{
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    float** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    int_t iam = grid->iam;
+
+    int_t myrow = MYROW (iam, grid);
+    int_t pr = PROW( k, grid );
+    if (myrow == pr)
+	{
+	    int_t lk = LBi( k, grid ); /* Local block number */
+	    int_t  *usub;
+	    float* unzval;
+	    usub = Ufstnz_br_ptr[lk];
+	    unzval = Unzval_br_ptr[lk];
+	    
+	    if (usub != NULL)
+		{
+		    int lenv = usub[1];
+		    
+		    /* code */
+		    MPI_Send(unzval, lenv, MPI_FLOAT, receiver, k, grid3d->zscp.comm);
+		    SCT->commVolRed += lenv * sizeof(float);
+		}
+	}
+	
+    return 0;
+}
+
+
+int_t szRecvUPanel(int_t k, int_t sender, float alpha, float beta,
+                    float* Uval_buf, sLUstruct_t* LUstruct,
+                    gridinfo3d_t* grid3d, SCT_t* SCT)
+{
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    float** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    int inc = 1;
+    int_t iam = grid->iam;
+    int_t myrow = MYROW (iam, grid);
+    int_t pr = PROW( k, grid );
+
+    if (myrow == pr)
+	{
+	    int_t lk = LBi( k, grid ); /* Local block number */
+	    int_t  *usub;
+	    float* unzval;
+	    usub = Ufstnz_br_ptr[lk];
+	    unzval = Unzval_br_ptr[lk];
+	    
+	    if (usub != NULL)
+		{
+		    int lenv = usub[1];
+		    MPI_Status status;
+		    MPI_Recv(Uval_buf , lenv, MPI_FLOAT, sender, k,
+			     grid3d->zscp.comm, &status);
+		    
+		    /*reduce the updates*/
+		    superlu_sscal(lenv, alpha, unzval, 1);
+		    superlu_saxpy(lenv, beta, Uval_buf, 1, unzval, 1);
+		}
+	}
+    return 0;
+}
+
+
+int_t sp3dScatter(int_t n, sLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+/* Copies LU structure from layer 0 to all the layers */
+{
+    gridinfo_t* grid = &(grid3d->grid2d);
+    int_t Pc = grid->npcol;
+    int_t Pr = grid->nprow;
+    
+    /* broadcast etree */
+    int_t *etree = LUstruct->etree;
+    MPI_Bcast( etree, n, mpi_int_t, 0,  grid3d->zscp.comm);
+    
+    int_t nsupers;
+    
+    if (!grid3d->zscp.Iam)
+	nsupers = getNsupers(n, LUstruct->Glu_persist);
+    
+    /* broadcast nsupers */
+    MPI_Bcast( &nsupers, 1, mpi_int_t, 0,  grid3d->zscp.comm);
+    
+    /* Scatter and alloc Glu_persist */
+    if ( grid3d->zscp.Iam ) // all process layers other than layer 0
+	sAllocGlu_3d(n, nsupers, LUstruct);
+    
+    /* broadcast Glu_persist */
+    int_t *xsup = LUstruct->Glu_persist->xsup;
+    MPI_Bcast( xsup, nsupers + 1, mpi_int_t, 0,  grid3d->zscp.comm);
+    
+    int_t *supno = LUstruct->Glu_persist->supno;
+    MPI_Bcast( supno, n, mpi_int_t, 0,  grid3d->zscp.comm);
+    
+    /* now broadcast local LU structure */
+    /* first allocating space for it */
+    if ( grid3d->zscp.Iam ) // all process layers other than layer 0
+	sAllocLlu(nsupers, LUstruct, grid3d);
+    
+    sLocalLU_t *Llu = LUstruct->Llu;
+    
+    /*scatter all the L blocks and indexes*/
+    sscatter3dLPanels( nsupers, LUstruct, grid3d);
+
+    /*scatter all the U blocks and indexes*/
+    sscatter3dUPanels( nsupers, LUstruct, grid3d);
+    
+    int_t* bufmax = Llu->bufmax;
+    MPI_Bcast( bufmax, NBUFFERS, mpi_int_t, 0,  grid3d->zscp.comm);
+    
+    /* now sending tosendR etc */
+    int** ToSendR = Llu->ToSendR;
+    int* ToRecv = Llu->ToRecv;
+    int* ToSendD = Llu->ToSendD;
+    
+    int_t nbr = CEILING(nsupers, Pr);
+    int_t nbc = CEILING(nsupers, Pc);
+    //    MPI_Bcast( ToRecv, nsupers, mpi_int_t, 0,  grid3d->zscp.comm);
+    MPI_Bcast( ToRecv, nsupers, MPI_INT, 0,  grid3d->zscp.comm);
+    
+    MPI_Bcast( ToSendD, nbr, MPI_INT, 0,  grid3d->zscp.comm);
+    for (int_t i = 0; i < nbc; ++i)
+	{
+	    /* code */
+	    MPI_Bcast( ToSendR[i], Pc, MPI_INT, 0,  grid3d->zscp.comm);
+	}
+    
+    //
+#ifdef MPI_MALLOC
+    // change MY LU struct into MPI malloc based
+    if (!grid3d->zscp.Iam)
+	smpiMallocLUStruct(nsupers, LUstruct, grid3d);
+#endif
+    return 0;
+} /* sp3dScatter */
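+/* A minimal calling sketch for the scatter/collect pair (the driver code is
+   illustrative, not part of this file):
+
+       sp3dScatter(n, LUstruct, grid3d);         // replicate layer 0's LU
+                                                 // structure on every layer
+       ... 3D factorization over the process layers ...
+       sp3dCollect(ilayer, n, LUstruct, grid3d); // copy layer ilayer's
+                                                 // factored panels to layer 0
+
+   sp3dCollect() is defined later in this file. */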
+
+
+int_t sscatter3dUPanels(int_t nsupers,
+		       sLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+{
+
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    float** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    
+    int_t k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
+    for ( int_t lb = 0; lb < k; ++lb) {
+	int_t *usub;
+	usub =  Ufstnz_br_ptr[lb];
+	
+	float * uval = Unzval_br_ptr[lb];
+	
+	int_t flag = 0;
+	/*if non empty set the flag*/
+	if (!grid3d->zscp.Iam && usub != NULL)
+	    flag = 1;
+	/*bcast the flag*/
+	MPI_Bcast( &flag, 1, mpi_int_t, 0,  grid3d->zscp.comm);
+	
+	if (flag) {
+	    int_t lenv, lens;
+	    lenv = 0;
+	    lens = 0;
+	    
+	    if (!grid3d->zscp.Iam)
+		{
+		    lenv = usub[1];
+		    lens = usub[2];
+		}
+	    
+	    /*broadcast the size of sub array*/
+	    MPI_Bcast( &lens, 1, mpi_int_t, 0,  grid3d->zscp.comm);
+	    MPI_Bcast( &lenv, 1, mpi_int_t, 0,  grid3d->zscp.comm);
+	    
+	    /*allocate lsub*/
+	    if (grid3d->zscp.Iam)
+#ifdef MPI_MALLOC
+		MPI_INT_ALLOC(usub, lens);
+#else
+ 	        usub = INT_T_ALLOC(lens);
+#endif
+
+	    /*bcast usub*/
+	    MPI_Bcast( usub, lens, mpi_int_t, 0,  grid3d->zscp.comm);
+
+	    /*allocate uval*/
+	    if (grid3d->zscp.Iam)
+#ifdef MPI_MALLOC
+		MPI_DATATYPE_ALLOC(uval, lenv);
+#else
+	        uval = floatMalloc_dist(lenv);
+#endif
+	    /*broadcast uval*/
+	    MPI_Bcast( uval, lenv, MPI_FLOAT, 0,  grid3d->zscp.comm);
+	    
+	    /*setup the pointer*/
+	    Unzval_br_ptr[lb] = uval;
+	    Ufstnz_br_ptr[lb] = usub;
+	} /* end if flag */
+
+    } /* end for lb ... */
+    return 0;
+} /* end sscatter3dUPanels */
+
+
+int_t sscatter3dLPanels(int_t nsupers,
+                       sLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+{
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = LUstruct->Glu_persist->xsup;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    float** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    int_t iam = grid->iam;
+    
+    int_t mycol = MYCOL (iam, grid);
+    
+    /*start broadcasting blocks*/
+    for (int_t jb = 0; jb < nsupers; ++jb)   /* for each block column ... */
+    {
+	int_t pc = PCOL( jb, grid );
+	if (mycol == pc)
+        {
+	    int_t ljb = LBj( jb, grid ); /* Local block number */
+	    int_t  *lsub;
+	    float* lnzval;
+	    lsub = Lrowind_bc_ptr[ljb];
+	    lnzval = Lnzval_bc_ptr[ljb];
+		
+	    int_t flag = 0;
+	    /*if non empty set the flag*/
+	    if (!grid3d->zscp.Iam && lsub != NULL)
+		    flag = 1;
+            /*bcast the flag*/
+	    MPI_Bcast( &flag, 1, mpi_int_t, 0,  grid3d->zscp.comm);
+		
+            if (flag) {
+		int_t nrbl, len, len1, len2;
+		if (!grid3d->zscp.Iam)
+		    {
+			nrbl  =   lsub[0]; /*number of L blocks */
+			len   = lsub[1];   /* LDA of the nzval[] */
+			len1  = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
+			len2  = SuperSize(jb) * len;
+		    }
+
+		/*bcast lsub len*/
+		MPI_Bcast( &len1, 1, mpi_int_t, 0,  grid3d->zscp.comm);
+		    
+		/*allocate lsub*/
+		if (grid3d->zscp.Iam)
+#ifdef MPI_MALLOC
+		    MPI_INT_ALLOC(lsub, len1);
+#else
+		    lsub = INT_T_ALLOC(len1);
+#endif
+
+		/*now broadcast lsub*/
+		MPI_Bcast( lsub, len1, mpi_int_t, 0,  grid3d->zscp.comm);
+
+		/*set up pointer*/
+		Lrowind_bc_ptr[ljb] = lsub;
+
+		/*bcast lnzval len*/
+		MPI_Bcast( &len2, 1, mpi_int_t, 0,  grid3d->zscp.comm);
+
+		/*allocate space for nzval*/
+		if (grid3d->zscp.Iam)
+#ifdef MPI_MALLOC
+		    MPI_DATATYPE_ALLOC(lnzval, len2);
+#else
+		    lnzval = floatCalloc_dist(len2);
+#endif
+
+		/*bcast nonzero values*/
+		MPI_Bcast( lnzval, len2, MPI_FLOAT, 0,  grid3d->zscp.comm);
+
+		/*setup the pointers*/
+		Lnzval_bc_ptr[ljb] = lnzval;
+		} /* end if flag */
+
+	} /* end if mycol == pc */
+    } /* end for jb ... */
+
+    return 0;
+} /* sscatter3dLPanels */
+
+int_t scollect3dLpanels(int_t layer, int_t nsupers, sLUstruct_t * LUstruct,
+		       gridinfo3d_t* grid3d)
+{
+
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = LUstruct->Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    float** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+
+    int_t iam = grid->iam;
+    int_t mycol = MYCOL (iam, grid);
+
+    /*start broadcasting blocks*/
+    for (int_t jb = 0; jb < nsupers; ++jb)   /* for each block column ... */
+    {
+	int_t pc = PCOL( jb, grid );
+	if (mycol == pc)
+	{
+	    int_t ljb = LBj( jb, grid ); /* Local block number */
+	    int_t  *lsub;
+	    float* lnzval;
+	    lsub = Lrowind_bc_ptr[ljb];
+	    lnzval = Lnzval_bc_ptr[ljb];
+		    
+	    if (lsub != NULL)
+	    {
+	        int_t len   = lsub[1];       /* LDA of the nzval[] */
+		int_t len2  = SuperSize(jb) * len; /*size of nzval of L panel */
+			    
+	        if (grid3d->zscp.Iam == layer)
+		{
+		    MPI_Send(lnzval, len2, MPI_FLOAT, 0, jb, grid3d->zscp.comm);
+		}
+		if (!grid3d->zscp.Iam)
+		{
+		    MPI_Status status;
+		    MPI_Recv(lnzval, len2, MPI_FLOAT, layer, jb, grid3d->zscp.comm, &status);
+		}
+	     }
+	}
+    } /* for jb ... */
+    return 0;
+}
+
+int_t scollect3dUpanels(int_t layer, int_t nsupers, sLUstruct_t * LUstruct,
+      			 gridinfo3d_t* grid3d)
+{
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    float** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    
+    int_t k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
+    for ( int_t lb = 0; lb < k; ++lb)
+    {
+	int_t *usub;
+	usub =  Ufstnz_br_ptr[lb];
+	float * uval = Unzval_br_ptr[lb];
+	    
+	if (usub)
+	{
+	    /* code */
+	    int lenv = usub[1];
+	    if (grid3d->zscp.Iam == layer)
+		{
+		    MPI_Send(uval, lenv, MPI_FLOAT, 0, lb, grid3d->zscp.comm);
+		}
+		    
+	    if (!grid3d->zscp.Iam)
+		{
+		    MPI_Status status;
+		    MPI_Recv(uval, lenv, MPI_FLOAT, layer, lb, grid3d->zscp.comm, &status);
+		}
+	}
+    } /* for lb ... */
+    return 0;
+}
+
+/* Gather the LU factors on layer-0 */
+int_t sp3dCollect(int_t layer, int_t n, sLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+{
+    int_t nsupers = getNsupers(n, LUstruct->Glu_persist);
+    scollect3dLpanels(layer, nsupers,  LUstruct, grid3d);
+    scollect3dUpanels(layer,  nsupers, LUstruct, grid3d);
+    return 0;
+}
+
+
+/* Zero out LU non zero entries */
+int_t szeroSetLU(int_t nnodes, int_t* nodeList, sLUstruct_t *LUstruct,
+      		 gridinfo3d_t* grid3d)
+{
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    float** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    
+    int_t* xsup = LUstruct->Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    float** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    gridinfo_t* grid = &(grid3d->grid2d);
+    
+    int_t iam = grid->iam;
+    
+    int_t myrow = MYROW (iam, grid);
+    int_t mycol = MYCOL (iam, grid);
+    
+    /*first setting the L blocks to zero*/
+    for (int_t node = 0; node < nnodes; ++node)   /* for each block column ... */
+	{
+	    
+	    int_t jb = nodeList[node];
+	    int_t pc = PCOL( jb, grid );
+	    if (mycol == pc)
+		{
+		    int_t ljb = LBj( jb, grid ); /* Local block number */
+		    int_t  *lsub;
+		    float* lnzval;
+		    lsub = Lrowind_bc_ptr[ljb];
+		    lnzval = Lnzval_bc_ptr[ljb];
+		    
+		    if (lsub != NULL)
+			{
+			    int_t len   = lsub[1];       /* LDA of the nzval[] */
+			    int_t len2  = SuperSize(jb) * len;	/*size of nzval of L panel */
+			    memset( lnzval, 0, len2 * sizeof(float) );
+			}
+		}
+	}
+
+    for (int_t node = 0; node < nnodes; ++node)   /* for each block column ... */
+	{
+	    
+	    int_t ib = nodeList[node];
+	    int_t pr = PROW( ib, grid );
+	    if (myrow == pr)
+		{
+		    int_t lib = LBi( ib, grid ); /* Local block number */
+		    int_t  *usub;
+		    float* unzval;
+		    usub = Ufstnz_br_ptr[lib];
+		    unzval = Unzval_br_ptr[lib];
+		    
+		    if (usub != NULL)
+			{
+			    int lenv = usub[1];
+			    memset( unzval, 0, lenv * sizeof(float) );
+			}
+		}
+	}
+    
+    return 0;
+}
+
+
+int_t sreduceAncestors3d(int_t sender, int_t receiver,
+                        int_t nnodes, int_t* nodeList,
+                        float* Lval_buf, float* Uval_buf,
+                        sLUstruct_t* LUstruct,  gridinfo3d_t* grid3d, SCT_t* SCT)
+{
+    float alpha = 1.0, beta = 1.0;
+    int_t myGrid = grid3d->zscp.Iam;
+    
+    /*first setting the L blocks to zero*/
+    for (int_t node = 0; node < nnodes; ++node)   /* for each block column ... */
+	{
+	    int_t jb = nodeList[node];
+	    
+	    if (myGrid == sender)
+		{
+		    szSendLPanel(jb, receiver, LUstruct,  grid3d, SCT);
+		    szSendUPanel(jb, receiver, LUstruct,  grid3d, SCT);
+		}
+	    else {
+	        szRecvLPanel(jb, sender, alpha, beta, Lval_buf,
+                                LUstruct, grid3d, SCT);
+		szRecvUPanel(jb, sender, alpha, beta, Uval_buf,
+                                LUstruct,  grid3d, SCT);
+	    }
+	    
+	}
+    return 0;
+    
+}
+
+
+int_t sgatherFactoredLU(int_t sender, int_t receiver,
+                        int_t nnodes, int_t *nodeList,
+                        sLUValSubBuf_t* LUvsb,
+                        sLUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT)
+{
+    float alpha = 0.0, beta = 1.0;
+    float * Lval_buf  = LUvsb->Lval_buf;
+    float * Uval_buf  = LUvsb->Uval_buf;
+    int_t myGrid = grid3d->zscp.Iam;
+    for (int_t node = 0; node < nnodes; ++node)   /* for each block column ... */
+	{
+	    int_t jb = nodeList[node];
+	    if (myGrid == sender)
+		{
+		    szSendLPanel(jb, receiver, LUstruct,  grid3d, SCT);
+		    szSendUPanel(jb, receiver, LUstruct,  grid3d, SCT);
+		    
+		}
+	    else
+		{
+		    szRecvLPanel(jb, sender, alpha, beta, Lval_buf,
+                                     LUstruct, grid3d, SCT);
+		    szRecvUPanel(jb, sender, alpha, beta, Uval_buf,
+                                     LUstruct, grid3d, SCT);
+		}
+	}
+    return 0;
+    
+}
+
+
+int_t sinit3DLUstruct( int_t* myTreeIdxs, int_t* myZeroTrIdxs,
+                      int_t* nodeCount, int_t** nodeList, sLUstruct_t* LUstruct,
+		      gridinfo3d_t* grid3d)
+{
+    int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
+    
+    for (int_t lvl = 0; lvl < maxLvl; lvl++)
+	{
+	    if (myZeroTrIdxs[lvl])
+		{
+		    /* code */
+		    int_t treeId = myTreeIdxs[lvl];
+		    szeroSetLU(nodeCount[treeId], nodeList[treeId], LUstruct, grid3d);
+		}
+	}
+    
+    return 0;
+}
+
+
+int sreduceAllAncestors3d(int_t ilvl, int_t* myNodeCount, int_t** treePerm,
+                             sLUValSubBuf_t* LUvsb, sLUstruct_t* LUstruct,
+                             gridinfo3d_t* grid3d, SCT_t* SCT )
+{
+    float * Lval_buf  = LUvsb->Lval_buf;
+    float * Uval_buf  = LUvsb->Uval_buf;
+    int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
+    int_t myGrid = grid3d->zscp.Iam;
+    
+    int_t sender, receiver;
+    if ((myGrid % (1 << (ilvl + 1))) == 0)
+	{
+	    sender = myGrid + (1 << ilvl);
+	    receiver = myGrid;
+	}
+    else
+	{
+	    sender = myGrid;
+	    receiver = myGrid - (1 << ilvl);
+	}
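+    /* Example pairing for Np = 8 process layers: at ilvl = 0 the
+       (sender, receiver) pairs are (1,0), (3,2), (5,4), (7,6); at ilvl = 1
+       they become (2,0) and (6,4).  After level ilvl, only layers that are
+       multiples of 2^(ilvl+1) still hold un-reduced ancestor data. */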
+    
+    /*Reduce all the ancestors*/
+    for (int_t alvl = ilvl + 1; alvl < maxLvl; ++alvl)
+	{
+	    /* code */
+	    // int_t atree = myTreeIdxs[alvl];
+	    int_t nsAncestor = myNodeCount[alvl];
+	    int_t* cAncestorList = treePerm[alvl];
+	    double treduce = SuperLU_timer_();
+	    sreduceAncestors3d(sender, receiver, nsAncestor, cAncestorList,
+			        Lval_buf, Uval_buf, LUstruct, grid3d, SCT);
+	    SCT->ancsReduce += SuperLU_timer_() - treduce;
+	    
+	}
+    return 0;
+}
+
+int_t sgatherAllFactoredLU( trf3Dpartition_t*  trf3Dpartition,
+			   sLUstruct_t* LUstruct, gridinfo3d_t* grid3d, SCT_t* SCT )
+{
+    int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
+    int_t myGrid = grid3d->zscp.Iam;
+    int_t* myZeroTrIdxs = trf3Dpartition->myZeroTrIdxs;
+    sForest_t** sForests = trf3Dpartition->sForests;
+    sLUValSubBuf_t*  LUvsb =  trf3Dpartition->LUvsb;
+    int_t*  gNodeCount = getNodeCountsFr(maxLvl, sForests);
+    int_t** gNodeLists = getNodeListFr(maxLvl, sForests);
+    
+    for (int_t ilvl = 0; ilvl < maxLvl - 1; ++ilvl)
+	{
+	    /* code */
+	    int_t sender, receiver;
+	    if (!myZeroTrIdxs[ilvl])
+		{
+		    if ((myGrid % (1 << (ilvl + 1))) == 0)
+			{
+			    sender = myGrid + (1 << ilvl);
+			    receiver = myGrid;
+			}
+		    else
+			{
+			    sender = myGrid;
+			    receiver = myGrid - (1 << ilvl);
+			}
+		    
+		    for (int_t alvl = 0; alvl <= ilvl; alvl++)
+			{
+			    int_t diffLvl  = ilvl - alvl;
+			    int_t numTrees = 1 << diffLvl;
+			    int_t blvl = maxLvl - alvl - 1;
+			    int_t st = (1 << blvl) - 1 + (sender >> alvl);
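+			    /* The forests are numbered like a binary heap
+			       stored in level order: level blvl occupies
+			       indices [2^blvl - 1, 2^(blvl+1) - 2], so st is
+			       the first of the numTrees consecutive forests
+			       whose data the sender owns at level alvl. */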
+			    
+			    for (int_t tr = st; tr < st + numTrees; ++tr)
+				{
+				    /* code */
+				    sgatherFactoredLU(sender, receiver,
+						     gNodeCount[tr], gNodeLists[tr],
+						     LUvsb,
+						     LUstruct, grid3d, SCT );
+				}
+			}
+		    
+		}
+	} /* for ilvl ... */
+    	
+    SUPERLU_FREE(gNodeCount); // sherry added
+    SUPERLU_FREE(gNodeLists);
+
+    return 0;
+} /* sgatherAllFactoredLU */
+
diff --git a/SRC/psGetDiagU.c b/SRC/psGetDiagU.c
new file mode 100644
index 00000000..9303baa9
--- /dev/null
+++ b/SRC/psGetDiagU.c
@@ -0,0 +1,121 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+/*! @file psGetDiagU.c
+ * \brief Extracts the main diagonal of matrix U
+ *
+ * <pre>
+ * -- Auxiliary routine in distributed SuperLU (version 5.1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * Xiaoye S. Li
+ * Created:  April 16, 2002
+ * Modified: May 15, 2016
+ * </pre>
+ */
+
+
+#include "superlu_sdefs.h"
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * GetDiagU extracts the main diagonal of matrix U of the LU factorization.
+ *
+ * Arguments
+ * =========
+ *
+ * n        (input) int
+ *          Dimension of the matrix.
+ *
+ * LUstruct (input) sLUstruct_t*
+ *          The data structures to store the distributed L and U factors;
+ *          see superlu_sdefs.h for its definition.
+ *
+ * grid     (input) gridinfo_t*
+ *          The 2D process mesh. It contains the MPI communicator, the number
+ *          of process rows (NPROW), the number of process columns (NPCOL),
+ *          and my process rank. It is an input argument to all the
+ *          parallel routines.
+ *
+ * diagU    (output) float*, dimension (n)
+ *          The main diagonal of matrix U.
+ *          On exit, it is available on all processes.
+ *
+ *
+ * Note
+ * ====
+ *
+ * The diagonal blocks of the L and U matrices are stored in the L
+ * data structures, and are on the diagonal processes of the
+ * 2D process grid.
+ *
+ * This routine is modified from gather_diag_to_all() in psgstrs_Bglobal.c.
+ * </pre>
+ */
+void psGetDiagU(int_t n, sLUstruct_t *LUstruct, gridinfo_t *grid,
+                float *diagU)
+{
+    int_t *xsup;
+    int iam, knsupc, pkk;
+    int nsupr; /* number of rows in the block L(:,k) (LDA) */
+    int_t i, j, jj, k, lk, lwork, nsupers, p;
+    int_t num_diag_procs, *diag_procs, *diag_len;
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    sLocalLU_t *Llu = LUstruct->Llu;
+    float *sblock, *swork, *lusup;
+
+    iam = grid->iam;
+    nsupers = Glu_persist->supno[n-1] + 1;
+    xsup = Glu_persist->xsup;
+
+    get_diag_procs(n, Glu_persist, grid, &num_diag_procs,
+                   &diag_procs, &diag_len);
+    jj = diag_len[0];
+    for (j = 1; j < num_diag_procs; ++j) jj = SUPERLU_MAX( jj, diag_len[j] );
+    if ( !(swork = floatMalloc_dist(jj)) ) ABORT("Malloc fails for swork[]");
+
+    for (p = 0; p < num_diag_procs; ++p) {
+        pkk = diag_procs[p];
+        if ( iam == pkk ) {
+            /* Copy diagonal into buffer dwork[]. */
+            lwork = 0;
+            for (k = p; k < nsupers; k += num_diag_procs) {
+                knsupc = SuperSize( k );
+                lk = LBj( k, grid );
+                nsupr = Llu->Lrowind_bc_ptr[lk][1]; /* LDA of lusup[] */
+                lusup = Llu->Lnzval_bc_ptr[lk];
+                for (i = 0; i < knsupc; ++i) /* Copy the diagonal. */
+                    swork[lwork+i] = lusup[i*(nsupr+1)];
+                lwork += knsupc;
+            }
+            MPI_Bcast( swork, lwork, MPI_FLOAT, pkk, grid->comm );
+        } else {
+            MPI_Bcast( swork, diag_len[p], MPI_FLOAT, pkk, grid->comm );
+        }
+
+        /* Scatter swork[] into global diagU vector. */
+        lwork = 0;
+        for (k = p; k < nsupers; k += num_diag_procs) {
+            knsupc = SuperSize( k );
+            sblock = &diagU[FstBlockC( k )];
+            for (i = 0; i < knsupc; ++i) sblock[i] = swork[lwork+i];
+            lwork += knsupc;
+        }
+    } /* for p = ... */
+
+    SUPERLU_FREE(diag_procs);
+    SUPERLU_FREE(diag_len);
+    SUPERLU_FREE(swork);
+}
diff --git a/SRC/psdistribute.c b/SRC/psdistribute.c
new file mode 100644
index 00000000..4d4ed76e
--- /dev/null
+++ b/SRC/psdistribute.c
@@ -0,0 +1,1987 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Re-distribute A on the 2D process mesh.
+ * <pre>
+ * -- Distributed SuperLU routine (version 2.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 15, 2008
+ * </pre>
+ */
+
+#include "superlu_sdefs.h"
+
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *   Re-distribute A on the 2D process mesh.
+ *
+ * Arguments
+ * =========
+ *
+ * A      (input) SuperMatrix*
+ *	  The distributed input matrix A of dimension (A->nrow, A->ncol).
+ *        A may be overwritten by diag(R)*A*diag(C)*Pc^T.
+ *        The type of A can be: Stype = SLU_NR_loc; Dtype = SLU_S; Mtype = SLU_GE.
+ *
+ * ScalePermstruct (input) sScalePermstruct_t*
+ *        The data structure to store the scaling and permutation vectors
+ *        describing the transformations performed to the original matrix A.
+ *
+ * Glu_freeable (input) *Glu_freeable_t
+ *        The global structure describing the graph of L and U.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * colptr (output) int*
+ *
+ * rowind (output) int*
+ *
+ * a      (output) float*
+ *
+ * Return value
+ * ============
+ *   > 0, working storage (in bytes) required to perform redistribution.
+ *        (excluding LU factor size)
+ * </pre>
+ */
+int_t
+sReDistribute_A(SuperMatrix *A, sScalePermstruct_t *ScalePermstruct,
+                Glu_freeable_t *Glu_freeable, int_t *xsup, int_t *supno,
+                gridinfo_t *grid, int_t *colptr[], int_t *rowind[],
+                float *a[])
+{
+    NRformat_loc *Astore;
+    int_t  *perm_r; /* row permutation vector */
+    int_t  *perm_c; /* column permutation vector */
+    int_t  i, irow, fst_row, j, jcol, k, gbi, gbj, n, m_loc, jsize, nnz_tot;
+    int_t  nnz_loc; /* number of local nonzeros */
+    int_t  SendCnt; /* number of remote nonzeros to be sent */
+    int_t  RecvCnt; /* number of remote nonzeros to be received */
+    int_t  *nnzToSend, *nnzToRecv, maxnnzToRecv;
+    int_t  *ia, *ja, **ia_send, *index, *itemp = NULL;
+    int_t  *ptr_to_send;
+    float  *aij, **aij_send, *nzval, *dtemp = NULL;
+    float  *nzval_a;
+    float  asum, asum_tot;
+    int    iam, it, p, procs, iam_g;
+    MPI_Request *send_req;
+    MPI_Status  status;
+
+    /* ------------------------------------------------------------
+       INITIALIZATION.
+       ------------------------------------------------------------*/
+    iam = grid->iam;
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter sReDistribute_A()");
+#endif
+    perm_r = ScalePermstruct->perm_r;
+    perm_c = ScalePermstruct->perm_c;
+    procs = grid->nprow * grid->npcol;
+    Astore = (NRformat_loc *) A->Store;
+    n = A->ncol;
+    m_loc = Astore->m_loc;
+    fst_row = Astore->fst_row;
+    nnzToRecv = intCalloc_dist(2*procs);
+    nnzToSend = nnzToRecv + procs;
+
+    /* ------------------------------------------------------------
+       COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS,
+       THEN ALLOCATE SPACE.
+       THIS ACCOUNTS FOR THE FIRST PASS OF A.
+       ------------------------------------------------------------*/
+    for (i = 0; i < m_loc; ++i) {
+        for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) {
+            irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */
+            jcol = Astore->colind[j];
+            gbi = BlockNum( irow );
+            gbj = BlockNum( jcol );
+            p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid );
+            ++nnzToSend[p];
+        }
+    }
+
+    /* All-to-all communication */
+    MPI_Alltoall( nnzToSend, 1, mpi_int_t, nnzToRecv, 1, mpi_int_t,
+                  grid->comm);
+
+    maxnnzToRecv = 0;
+    nnz_loc = SendCnt = RecvCnt = 0;
+
+    for (p = 0; p < procs; ++p) {
+        if ( p != iam ) {
+            SendCnt += nnzToSend[p];
+            RecvCnt += nnzToRecv[p];
+            maxnnzToRecv = SUPERLU_MAX( nnzToRecv[p], maxnnzToRecv );
+        } else {
+            nnz_loc += nnzToRecv[p];
+            /*assert(nnzToSend[p] == nnzToRecv[p]);*/
+        }
+    }
+    k = nnz_loc + RecvCnt; /* Total nonzeros ended up in my process. */
+
+    /* Allocate space for storing the triplets after redistribution. */
+    if ( k ) { /* count can be zero. */
+        if ( !(ia = intMalloc_dist(2*k)) )
+            ABORT("Malloc fails for ia[].");
+        if ( !(aij = floatMalloc_dist(k)) )
+            ABORT("Malloc fails for aij[].");
+    }
+    ja = ia + k;
+
+    /* Allocate temporary storage for sending/receiving the A triplets. */
+    if ( procs > 1 ) {
+        if ( !(send_req = (MPI_Request *)
+               SUPERLU_MALLOC(2*procs *sizeof(MPI_Request))) )
+            ABORT("Malloc fails for send_req[].");
+        if ( !(ia_send = (int_t **) SUPERLU_MALLOC(procs*sizeof(int_t*))) )
+            ABORT("Malloc fails for ia_send[].");
+        if ( !(aij_send = (float **)SUPERLU_MALLOC(procs*sizeof(float*))) )
+            ABORT("Malloc fails for aij_send[].");
+        if ( SendCnt ) { /* count can be zero */
+            if ( !(index = intMalloc_dist(2*SendCnt)) )
+                ABORT("Malloc fails for index[].");
+            if ( !(nzval = floatMalloc_dist(SendCnt)) )
+                ABORT("Malloc fails for nzval[].");
+        }
+        if ( !(ptr_to_send = intCalloc_dist(procs)) )
+            ABORT("Malloc fails for ptr_to_send[].");
+        if ( maxnnzToRecv ) { /* count can be zero */
+            if ( !(itemp = intMalloc_dist(2*maxnnzToRecv)) )
+                ABORT("Malloc fails for itemp[].");
+            if ( !(dtemp = floatMalloc_dist(maxnnzToRecv)) )
+                ABORT("Malloc fails for dtemp[].");
+        }
+
+        for (i = 0, j = 0, p = 0; p < procs; ++p) {
+            if ( p != iam ) {
+                ia_send[p] = &index[i];
+                i += 2 * nnzToSend[p]; /* ia/ja indices alternate */
+                aij_send[p] = &nzval[j];
+                j += nnzToSend[p];
+            }
+        }
+    } /* if procs > 1 */
+
+    if ( !(*colptr = intCalloc_dist(n+1)) )
+        ABORT("Malloc fails for *colptr[].");
+
+    /* ------------------------------------------------------------
+       LOAD THE ENTRIES OF A INTO THE (IA,JA,AIJ) STRUCTURES TO SEND.
+       THIS ACCOUNTS FOR THE SECOND PASS OF A.
+       ------------------------------------------------------------*/
+    nnz_loc = 0; /* Reset the local nonzero count. */
+    nzval_a = Astore->nzval;
+    for (i = 0; i < m_loc; ++i) {
+        for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) {
+            irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */
+            jcol = Astore->colind[j];
+            gbi = BlockNum( irow );
+            gbj = BlockNum( jcol );
+            p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid );
+
+            if ( p != iam ) { /* remote */
+                k = ptr_to_send[p];
+                ia_send[p][k] = irow;
+                ia_send[p][k + nnzToSend[p]] = jcol;
+                aij_send[p][k] = nzval_a[j];
+                ++ptr_to_send[p];
+            } else { /* local */
+                ia[nnz_loc] = irow;
+                ja[nnz_loc] = jcol;
+                aij[nnz_loc] = nzval_a[j];
+                ++nnz_loc;
+                ++(*colptr)[jcol]; /* Count nonzeros in each column */
+            }
+        }
+    }
+
+    /* ------------------------------------------------------------
+       PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION.
+       NOTE: Can possibly use MPI_Alltoallv.
+       ------------------------------------------------------------*/
+    for (p = 0; p < procs; ++p) {
+        if ( p != iam && nnzToSend[p] > 0 ) {
+        //if ( p != iam ) {
+            it = 2*nnzToSend[p];
+            MPI_Isend( ia_send[p], it, mpi_int_t,
+                       p, iam, grid->comm, &send_req[p] );
+            it = nnzToSend[p];
+            MPI_Isend( aij_send[p], it, MPI_FLOAT,
+                       p, iam+procs, grid->comm, &send_req[procs+p] );
+        }
+    }
+
+    for (p = 0; p < procs; ++p) {
+        if ( p != iam && nnzToRecv[p] > 0 ) {
+        //if ( p != iam ) {
+            it = 2*nnzToRecv[p];
+            MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status );
+            it = nnzToRecv[p];
+            MPI_Recv( dtemp, it, MPI_FLOAT, p, p+procs,
+                      grid->comm, &status );
+            for (i = 0; i < nnzToRecv[p]; ++i) {
+                ia[nnz_loc] = itemp[i];
+                jcol = itemp[i + nnzToRecv[p]];
+                /*assert(jcol<n);*/
+                ja[nnz_loc] = jcol;
+                aij[nnz_loc] = dtemp[i];
+                ++nnz_loc;
+                ++(*colptr)[jcol]; /* Count nonzeros in each column */
+            }
+        }
+    }
+
+    for (p = 0; p < procs; ++p) {
+        if ( p != iam && nnzToSend[p] > 0 ) { // cause two of the tests to hang
+        //if ( p != iam ) {
+            MPI_Wait( &send_req[p], &status);
+            MPI_Wait( &send_req[procs+p], &status);
+        }
+    }
+
+    /* ------------------------------------------------------------
+       DEALLOCATE TEMPORARY STORAGE
+       ------------------------------------------------------------*/
+
+    SUPERLU_FREE(nnzToRecv);
+
+    if ( procs > 1 ) {
+        SUPERLU_FREE(send_req);
+        SUPERLU_FREE(ia_send);
+        SUPERLU_FREE(aij_send);
+        if ( SendCnt ) {
+            SUPERLU_FREE(index);
+            SUPERLU_FREE(nzval);
+        }
+        SUPERLU_FREE(ptr_to_send);
+        if ( maxnnzToRecv ) {
+            SUPERLU_FREE(itemp);
+            SUPERLU_FREE(dtemp);
+        }
+    }
+
+    /* ------------------------------------------------------------
+       CONVERT THE TRIPLET FORMAT INTO THE CCS FORMAT.
+       ------------------------------------------------------------*/
+    if ( nnz_loc ) { /* nnz_loc can be zero */
+        if ( !(*rowind = intMalloc_dist(nnz_loc)) )
+            ABORT("Malloc fails for *rowind[].");
+        if ( !(*a = floatMalloc_dist(nnz_loc)) )
+            ABORT("Malloc fails for *a[].");
+    }
+
+    /* Initialize the array of column pointers */
+    k = 0;
+    jsize = (*colptr)[0];
+    (*colptr)[0] = 0;
+    for (j = 1; j < n; ++j) {
+        k += jsize;
+        jsize = (*colptr)[j];
+        (*colptr)[j] = k;
+    }
+
+    /* Copy the triplets into the column oriented storage */
+    for (i = 0; i < nnz_loc; ++i) {
+        j = ja[i];
+        k = (*colptr)[j];
+        (*rowind)[k] = ia[i];
+        (*a)[k] = aij[i];
+        ++(*colptr)[j];
+    }
+
+    /* Reset the column pointers to the beginning of each column */
+    for (j = n; j > 0; --j) (*colptr)[j] = (*colptr)[j-1];
+    (*colptr)[0] = 0;
+
+    if ( nnz_loc ) {
+        SUPERLU_FREE(ia);
+        SUPERLU_FREE(aij);
+    }
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit sReDistribute_A()");
+#endif
+
+    return 0;
+} /* sReDistribute_A */
+
+float
+psdistribute(fact_t fact, int_t n, SuperMatrix *A,
+             sScalePermstruct_t *ScalePermstruct,
+             Glu_freeable_t *Glu_freeable, sLUstruct_t *LUstruct,
+             gridinfo_t *grid)
+/*
+ * -- Distributed SuperLU routine (version 2.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ *
+ *
+ * Purpose
+ * =======
+ *   Distribute the matrix onto the 2D process mesh.
+ *
+ * Arguments
+ * =========
+ *
+ * fact (input) fact_t
+ *        Specifies whether or not the L and U structures will be re-used.
+ *        = SamePattern_SameRowPerm: L and U structures are input, and
+ *                                   unchanged on exit.
+ *        = DOFACT or SamePattern: L and U structures are computed and output.
+ *
+ * n      (input) int
+ *        Dimension of the matrix.
+ *
+ * A      (input) SuperMatrix*
+ *        The distributed input matrix A of dimension (A->nrow, A->ncol).
+ *        A may be overwritten by diag(R)*A*diag(C)*Pc^T. The type of A can be:
+ *        Stype = SLU_NR_loc; Dtype = SLU_S; Mtype = SLU_GE.
+ *
+ * ScalePermstruct (input) sScalePermstruct_t*
+ *        The data structure to store the scaling and permutation vectors
+ *        describing the transformations performed to the original matrix A.
+ *
+ * Glu_freeable (input) *Glu_freeable_t
+ *        The global structure describing the graph of L and U.
+ *
+ * LUstruct (input) sLUstruct_t*
+ *        Data structures for L and U factors.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * Return value
+ * ============
+ *   > 0, working storage required (in bytes).
+ *
+ */
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t bnnz, fsupc, fsupc1, i, ii, irow, istart, j, ib, jb, jj, k, k1,
+          len, len1, nsupc;
+    int_t lib;  /* local block row number */
+    int_t nlb;  /* local block rows*/
+    int_t ljb;  /* local block column number */
+    int_t nrbl; /* number of L blocks in current block column */
+    int_t nrbu; /* number of U blocks in current block column */
+    int_t gb;   /* global block number; 0 < gb <= nsuper */
+    int_t lb;   /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */
+    int_t ub, gik, iklrow, fnz;
+    int   iam, jbrow, kcol, krow, mycol, myrow, pc, pr;
+    int_t mybufmax[NBUFFERS];
+    NRformat_loc *Astore;
+    float *a;
+    int_t *asub, *xa;
+    int_t *xa_begin, *xa_end;
+    int_t *xsup = Glu_persist->xsup; /* supernode and column mapping */
+    int_t *supno = Glu_persist->supno;
+    int_t *lsub, *xlsub, *usub, *usub1, *xusub;
+    int_t nsupers;
+    int_t next_lind; /* next available position in index[*] */
+    int_t next_lval; /* next available position in nzval[*] */
+    int_t *index;     /* indices consist of headers and row subscripts */
+    int_t *index_srt; /* indices consist of headers and row subscripts */
+    int   *index1;    /* temporary pointer to array of int */
+    float *lusup, *lusup_srt, *uval; /* nonzero values in L and U */
+    float **Lnzval_bc_ptr;  /* size ceil(NSUPERS/Pc) */
+    int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */
+    int_t **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc) */
+    int_t *Unnz; /* size ceil(NSUPERS/Pc) */
+    float **Unzval_br_ptr;  /* size ceil(NSUPERS/Pr) */
+    int_t **Ufstnz_br_ptr;  /* size ceil(NSUPERS/Pr) */
+
+    BcTree *LBtree_ptr; /* size ceil(NSUPERS/Pc) */
+    RdTree *LRtree_ptr; /* size ceil(NSUPERS/Pr) */
+    BcTree *UBtree_ptr; /* size ceil(NSUPERS/Pc) */
+    RdTree *URtree_ptr; /* size ceil(NSUPERS/Pr) */
+    int msgsize;
+
+    int_t *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */
+    Ucb_indptr_t **Ucb_indptr; /* Vertical linked list pointing to Uindex[] */
+    int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */
+
+    /*-- Counts to be used in factorization. --*/
+    int *ToRecv, *ToSendD, **ToSendR;
+
+    /*-- Counts to be used in lower triangular solve. --*/
+    int_t *fmod;          /* Modification count for L-solve. */
+    int_t **fsendx_plist; /* Column process list to send down Xk. */
+    int_t nfrecvx = 0;    /* Number of Xk I will receive. */
+    int_t nfsendx = 0;    /* Number of Xk I will send */
+    int_t kseen;
+
+    /*-- Counts to be used in upper triangular solve. --*/
+    int_t *bmod;          /* Modification count for U-solve. */
+    int_t **bsendx_plist; /* Column process list to send down Xk. */
+    int_t nbrecvx = 0;    /* Number of Xk I will receive. */
+    int_t nbsendx = 0;    /* Number of Xk I will send */
+    int_t *ilsum;         /* starting position of each supernode in
+                             the full array (local) */
+
+    /*-- Auxiliary arrays; freed on return --*/
+    int_t *rb_marker;  /* block hit marker; size ceil(NSUPERS/Pr) */
+    int_t *Urb_length; /* U block length; size ceil(NSUPERS/Pr) */
+    int_t *Urb_indptr; /* pointers to U index[]; size ceil(NSUPERS/Pr) */
+    int_t *Urb_fstnz;  /* # of fstnz in a block row; size ceil(NSUPERS/Pr) */
+    int_t *Ucbs;       /* number of column blocks in a block row */
+    int_t *Lrb_length; /* L block length; size ceil(NSUPERS/Pr) */
+    int_t *Lrb_number; /* global block number; size ceil(NSUPERS/Pr) */
+    int_t *Lrb_indptr; /* pointers to L index[]; size ceil(NSUPERS/Pr) */
+    int_t *Lrb_valptr; /* pointers to L nzval[]; size ceil(NSUPERS/Pr) */
+    int_t *ActiveFlag;
+    int_t *ActiveFlagAll;
+    int_t Iactive;
+    int   *ranks;
+    int_t *idxs;
+    int_t **nzrows;
+    double rseed;
+    int   rank_cnt, rank_cnt_ref, Root;
+    float *dense, *dense_col; /* SPA */
+    float zero = 0.0;
+    int_t ldaspa; /* LDA of SPA */
+    int_t iword, dword;
+    float mem_use = 0.0;
+    float memTRS = 0.; /* memory allocated for storing the meta-data for
+                          triangular solve (positive number) */
+
+    int_t *mod_bit;
+    int_t *frecv, *brecv, *lloc;
+    float **Linv_bc_ptr; /* size ceil(NSUPERS/Pc) */
+    float **Uinv_bc_ptr; /* size ceil(NSUPERS/Pc) */
+    double *SeedSTD_BC, *SeedSTD_RD;
+    int_t idx_indx, idx_lusup;
+    int_t nbrow;
+    int_t ik, il, lk, rel, knsupc, idx_r;
+    int_t lptr1_tmp, idx_i, idx_v, m, uu;
+    int_t nub;
+    int tag;
+
+#if ( PRNTlevel>=1 )
+    int_t nLblocks = 0, nUblocks = 0;
+#endif
+#if ( PROFlevel>=1 )
+    double t, t_u, t_l;
+    int_t u_blks;
+#endif
+
+    /* Initialization. */
+    iam = grid->iam;
+    myrow = MYROW( iam, grid );
+    mycol = MYCOL( iam, grid );
+    for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0;
+    nsupers = supno[n-1] + 1;
+    Astore = (NRformat_loc *) A->Store;
+
+//#if ( PRNTlevel>=1 )
+    iword = sizeof(int_t);
+    dword = sizeof(float);
+//#endif
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter psdistribute()");
+#endif
+#if ( PROFlevel>=1 )
+    t = SuperLU_timer_();
+#endif
+
+    sReDistribute_A(A, ScalePermstruct, Glu_freeable, xsup, supno,
+                    grid, &xa, &asub, &a);
+
+#if ( PROFlevel>=1 )
+    t = SuperLU_timer_() - t;
+    if ( !iam ) printf("--------\n"
+                       ".. Phase 1 - ReDistribute_A time: %.2f\t\n", t);
+#endif
+
+    if ( fact == SamePattern_SameRowPerm ) {
+
+#if ( PROFlevel>=1 )
+        t_l = t_u = 0; u_blks = 0;
+#endif
+        /* We can propagate the new values of A into the existing
+           L and U data structures. */
+        ilsum = Llu->ilsum;
+        ldaspa = Llu->ldalsum;
+        if ( !(dense = floatCalloc_dist(ldaspa * sp_ienv_dist(3))) )
+            ABORT("Calloc fails for SPA dense[].");
+        nrbu = CEILING( nsupers, grid->nprow ); /* No. of local block rows */
+        if ( !(Urb_length = intCalloc_dist(nrbu)) )
+            ABORT("Calloc fails for Urb_length[].");
+        if ( !(Urb_indptr = intMalloc_dist(nrbu)) )
+            ABORT("Malloc fails for Urb_indptr[].");
+        Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+        Lindval_loc_bc_ptr = Llu->Lindval_loc_bc_ptr;
+        Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+        Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+        Unzval_br_ptr = Llu->Unzval_br_ptr;
+        Unnz = Llu->Unnz;
+
+        mem_use += 2.0*nrbu*iword + ldaspa*sp_ienv_dist(3)*dword;
+
+#if ( PROFlevel>=1 )
+        t = SuperLU_timer_();
+#endif
+
+        /* Initialize Uval to zero. */
+        for (lb = 0; lb < nrbu; ++lb) {
+            Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */
+            index = Ufstnz_br_ptr[lb];
+            if ( index ) {
+                uval = Unzval_br_ptr[lb];
+                len = index[1];
+                for (i = 0; i < len; ++i) uval[i] = zero;
+            } /* if index != NULL */
+        } /* for lb ... */
+
+        for (jb = 0; jb < nsupers; ++jb) { /* Loop through each block column */
+            pc = PCOL( jb, grid );
+            if ( mycol == pc ) { /* Block column jb in my process column */
+                fsupc = FstBlockC( jb );
+                nsupc = SuperSize( jb );
+
+                /* Scatter A into SPA (for L), or into U directly. */
+                for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) {
+                    for (i = xa[j]; i < xa[j+1]; ++i) {
+                        irow = asub[i];
+                        gb = BlockNum( irow );
+                        if ( myrow == PROW( gb, grid ) ) {
+                            lb = LBi( gb, grid );
+                            if ( gb < jb ) { /* in U */
+                                index = Ufstnz_br_ptr[lb];
+                                uval = Unzval_br_ptr[lb];
+                                while ( (k = index[Urb_indptr[lb]]) < jb ) {
+                                    /* Skip nonzero values in this block */
+                                    Urb_length[lb] += index[Urb_indptr[lb]+1];
+                                    /* Move pointer to the next block */
+                                    Urb_indptr[lb] += UB_DESCRIPTOR
+                                                      + SuperSize( k );
+                                }
+                                /*assert(k == jb);*/
+                                /* start fstnz */
+                                istart = Urb_indptr[lb] + UB_DESCRIPTOR;
+                                len = Urb_length[lb];
+                                fsupc1 = FstBlockC( gb+1 );
+                                k = j - fsupc;
+                                /* Sum the lengths of the leading columns */
+                                for (jj = 0; jj < k; ++jj)
+                                    len += fsupc1 - index[istart++];
+                                /*assert(irow>=index[istart]);*/
+                                uval[len + irow - index[istart]] = a[i];
+                            } else { /* in L; put in SPA first */
+                                irow = ilsum[lb] + irow - FstBlockC( gb );
+                                dense_col[irow] = a[i];
+                            }
+                        }
+                    } /* for i ... */
+                    dense_col += ldaspa;
+                } /* for j ... */
+
+#if ( PROFlevel>=1 )
+                t_u += SuperLU_timer_() - t;
+                t = SuperLU_timer_();
+#endif
+
+                /* Gather the values of A from SPA into Lnzval[]. */
+                ljb = LBj( jb, grid ); /* Local block number */
+                index = Lrowind_bc_ptr[ljb];
+                if ( index ) {
+                    nrbl = index[0]; /* Number of row blocks. */
+                    len = index[1];  /* LDA of lusup[]. */
+                    lusup = Lnzval_bc_ptr[ljb];
+                    next_lind = BC_HEADER;
+                    next_lval = 0;
+                    for (jj = 0; jj < nrbl; ++jj) {
+                        gb = index[next_lind++];
+                        len1 = index[next_lind++]; /* Rows in the block. */
+                        lb = LBi( gb, grid );
+                        for (bnnz = 0; bnnz < len1; ++bnnz) {
+                            irow = index[next_lind++]; /* Global index. */
+                            irow = ilsum[lb] + irow - FstBlockC( gb );
+                            k = next_lval++;
+                            for (j = 0, dense_col = dense; j < nsupc; ++j) {
+                                lusup[k] = dense_col[irow];
+                                dense_col[irow] = zero;
+                                k += len;
+                                dense_col += ldaspa;
+                            }
+                        } /* for bnnz ... */
+                    } /* for jj ... */
+                } /* if index ... */
+#if ( PROFlevel>=1 )
+                t_l += SuperLU_timer_() - t;
+#endif
+            } /* if mycol == pc */
+        } /* for jb ... */
+
+        SUPERLU_FREE(dense);
+        SUPERLU_FREE(Urb_length);
+        SUPERLU_FREE(Urb_indptr);
+#if ( PROFlevel>=1 )
+        if ( !iam ) printf(".. 2nd distribute time: L %.2f\tU %.2f\tu_blks %d\tnrbu %d\n",
+                           t_l, t_u, u_blks, nrbu);
+#endif
+
+    } else { /* fact is not SamePattern_SameRowPerm */
+        /* ------------------------------------------------------------
+           FIRST TIME CREATING THE L AND U DATA STRUCTURES.
+           ------------------------------------------------------------*/
+
+#if ( PROFlevel>=1 )
+        t_l = t_u = 0; u_blks = 0;
+#endif
+        /* We first need to set up the L and U data structures and then
+         * propagate the values of A into them.
+         */
+        lsub = Glu_freeable->lsub; /* compressed L subscripts */
+        xlsub = Glu_freeable->xlsub;
+        usub = Glu_freeable->usub; /* compressed U subscripts */
+        xusub = Glu_freeable->xusub;
+
+        if ( !(ToRecv = (int *) SUPERLU_MALLOC(nsupers * sizeof(int))) )
+            ABORT("Malloc fails for ToRecv[].");
+        for (i = 0; i < nsupers; ++i) ToRecv[i] = 0;
+
+        k = CEILING( nsupers, grid->npcol ); /* Number of local column blocks */
+        if ( !(ToSendR = (int **) SUPERLU_MALLOC(k*sizeof(int*))) )
+            ABORT("Malloc fails for ToSendR[].");
+        j = k * grid->npcol;
+        if ( !(index1 = SUPERLU_MALLOC(j * sizeof(int))) )
+            ABORT("Malloc fails for index[].");
+
+        mem_use += (float) k*sizeof(int_t*) + (j + nsupers)*iword;
+
+        for (i = 0; i < j; ++i) index1[i] = EMPTY;
+        for (i = 0, j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index1[j];
+        k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
+
+        /* Pointers to the beginning of each block row of U. */
+        if ( !(Unzval_br_ptr =
+               (float**)SUPERLU_MALLOC(k * sizeof(float*))) )
+            ABORT("Malloc fails for Unzval_br_ptr[].");
+        if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
+            ABORT("Malloc fails for Ufstnz_br_ptr[].");
+
+        if ( !(ToSendD = SUPERLU_MALLOC(k * sizeof(int))) )
+            ABORT("Malloc fails for ToSendD[].");
+        for (i = 0; i < k; ++i) ToSendD[i] = NO;
+        if ( !(ilsum = intMalloc_dist(k+1)) )
+            ABORT("Malloc fails for ilsum[].");
+
+        /* Auxiliary arrays used to set up U block data structures.
+           They are freed on return. */
+        if ( !(rb_marker = intCalloc_dist(k)) )
+            ABORT("Calloc fails for rb_marker[].");
+        if ( !(Urb_length = intCalloc_dist(k)) )
+            ABORT("Calloc fails for Urb_length[].");
+        if ( !(Urb_indptr = intMalloc_dist(k)) )
+            ABORT("Malloc fails for Urb_indptr[].");
+        if ( !(Urb_fstnz = intCalloc_dist(k)) )
+            ABORT("Calloc fails for Urb_fstnz[].");
+        if ( !(Ucbs = intCalloc_dist(k)) )
+            ABORT("Calloc fails for Ucbs[].");
+
+        mem_use += 2.0*k*sizeof(int_t*) + (7*k+1)*iword;
+
+        /* Compute ldaspa and ilsum[]. */
+        ldaspa = 0;
+        ilsum[0] = 0;
+        for (gb = 0; gb < nsupers; ++gb) {
+            if ( myrow == PROW( gb, grid ) ) {
+                i = SuperSize( gb );
+                ldaspa += i;
+                lb = LBi( gb, grid );
+                ilsum[lb + 1] = ilsum[lb] + i;
+            }
+        }
+
+#if ( PROFlevel>=1 )
+        t = SuperLU_timer_();
+#endif
+        /* ------------------------------------------------------------
+           COUNT NUMBER OF ROW BLOCKS AND THE LENGTH OF EACH BLOCK IN U.
+           THIS ACCOUNTS FOR ONE-PASS PROCESSING OF G(U).
+           ------------------------------------------------------------*/
+
+        /* Loop through each supernode column. */
+        for (jb = 0; jb < nsupers; ++jb) {
+            pc = PCOL( jb, grid );
+            fsupc = FstBlockC( jb );
+            nsupc = SuperSize( jb );
+            /* Loop through each column in the block. */
+            for (j = fsupc; j < fsupc + nsupc; ++j) {
+                /* usub[*] contains only "first nonzero" in each segment. */
+                for (i = xusub[j]; i < xusub[j+1]; ++i) {
+                    irow = usub[i]; /* First nonzero of the segment. */
+                    gb = BlockNum( irow );
+                    kcol = PCOL( gb, grid );
+                    ljb = LBj( gb, grid );
+                    if ( mycol == kcol && mycol != pc ) ToSendR[ljb][pc] = YES;
+                    pr = PROW( gb, grid );
+                    lb = LBi( gb, grid );
+                    if ( mycol == pc ) {
+                        if ( myrow == pr ) {
+                            ToSendD[lb] = YES;
+                            /* Count nonzeros in entire block row. */
+                            Urb_length[lb] += FstBlockC( gb+1 ) - irow;
+                            if (rb_marker[lb] <= jb) { /* First see the block */
+                                rb_marker[lb] = jb + 1;
+                                Urb_fstnz[lb] += nsupc;
+                                ++Ucbs[lb]; /* Number of column blocks
+                                               in block row lb. */
+#if ( PRNTlevel>=1 )
+                                ++nUblocks;
+#endif
+                            }
+                            ToRecv[gb] = 1;
+                        } else ToRecv[gb] = 2; /* Do I need 0, 1, 2 ? */
+                    }
+                } /* for i ... */
+            } /* for j ... */
+        } /* for jb ... */
+
+        /* Set up the initial pointers for each block row in U. */
+        nrbu = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
+        for (lb = 0; lb < nrbu; ++lb) {
+            len = Urb_length[lb];
+            rb_marker[lb] = 0; /* Reset block marker. */
+            if ( len ) {
+                /* Add room for descriptors */
+                len1 = Urb_fstnz[lb] + BR_HEADER + Ucbs[lb] * UB_DESCRIPTOR;
+                if ( !(index = intMalloc_dist(len1+1)) )
+                    ABORT("Malloc fails for Uindex[].");
+                Ufstnz_br_ptr[lb] = index;
+                if ( !(Unzval_br_ptr[lb] = floatMalloc_dist(len)) )
+                    ABORT("Malloc fails for Unzval_br_ptr[*][].");
+                mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 );
+                mybufmax[3] = SUPERLU_MAX( mybufmax[3], len );
+                index[0] = Ucbs[lb]; /* Number of column blocks */
+                index[1] = len;      /* Total length of nzval[] */
+                index[2] = len1;     /* Total length of index[] */
+                index[len1] = -1;    /* End marker */
+            } else {
+                Ufstnz_br_ptr[lb] = NULL;
+                Unzval_br_ptr[lb] = NULL;
+            }
+            Urb_length[lb] = 0;         /* Reset block length. */
+            Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */
+            Urb_fstnz[lb] = BR_HEADER;
+        } /* for lb ... */
+
+        SUPERLU_FREE(Ucbs);
+
+#if ( PROFlevel>=1 )
+        t = SuperLU_timer_() - t;
+        if ( !iam) printf(".. Phase 2 - setup U struct time: %.2f\t\n", t);
+#endif
+
+        mem_use -= 2.0*k * iword;
+
+        /* Auxiliary arrays used to set up L block data structures.
+           They are freed on return.
+           k is the number of local row blocks. */
+        if ( !(Lrb_length = intCalloc_dist(k)) )
+            ABORT("Calloc fails for Lrb_length[].");
+        if ( !(Lrb_number = intMalloc_dist(k)) )
+            ABORT("Malloc fails for Lrb_number[].");
+        if ( !(Lrb_indptr = intMalloc_dist(k)) )
+            ABORT("Malloc fails for Lrb_indptr[].");
+        if ( !(Lrb_valptr = intMalloc_dist(k)) )
+            ABORT("Malloc fails for Lrb_valptr[].");
+        if ( !(dense = floatCalloc_dist(ldaspa * sp_ienv_dist(3))) )
+            ABORT("Calloc fails for SPA dense[].");
+
+        /* These counts will be used for triangular solves. */
+        if ( !(fmod = intCalloc_dist(k)) )
+            ABORT("Calloc fails for fmod[].");
+        if ( !(bmod = intCalloc_dist(k)) )
+            ABORT("Calloc fails for bmod[].");
+
+        /* ------------------------------------------------ */
+        mem_use += 6.0*k*iword + ldaspa*sp_ienv_dist(3)*dword;
+
+        k = CEILING( nsupers, grid->npcol ); /* Number of local block columns */
+
+        /* Pointers to the beginning of each block column of L. */
+        if ( !(Lnzval_bc_ptr =
+               (float**)SUPERLU_MALLOC(k * sizeof(float*))) )
+            ABORT("Malloc fails for Lnzval_bc_ptr[].");
+        if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
+            ABORT("Malloc fails for Lrowind_bc_ptr[].");
+        Lrowind_bc_ptr[k-1] = NULL;
+
+        if ( !(Lindval_loc_bc_ptr =
+               (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) )
+            ABORT("Malloc fails for Lindval_loc_bc_ptr[].");
+        Lindval_loc_bc_ptr[k-1] = NULL;
+
+        if ( !(Linv_bc_ptr =
+               (float**)SUPERLU_MALLOC(k * sizeof(float*))) ) {
+            fprintf(stderr, "Malloc fails for Linv_bc_ptr[].");
+        }
+        if ( !(Uinv_bc_ptr =
+               (float**)SUPERLU_MALLOC(k * sizeof(float*))) ) {
+            fprintf(stderr, "Malloc fails for Uinv_bc_ptr[].");
+        }
+        Linv_bc_ptr[k-1] = NULL;
+        Uinv_bc_ptr[k-1] = NULL;
+
+        if ( !(Unnz =
+               (int_t*)SUPERLU_MALLOC(k * sizeof(int_t))) )
+            ABORT("Malloc fails for Unnz[].");
+
+        /* These lists of processes will be used for triangular solves. */
+        if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) )
+            ABORT("Malloc fails for fsendx_plist[].");
+        len = k * grid->nprow;
+        if ( !(index = intMalloc_dist(len)) )
+            ABORT("Malloc fails for fsendx_plist[0]");
+        for (i = 0; i < len; ++i) index[i] = EMPTY;
+        for (i = 0, j = 0; i < k; ++i, j += grid->nprow)
+            fsendx_plist[i] = &index[j];
+        if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) )
+            ABORT("Malloc fails for bsendx_plist[].");
+        if ( !(index = intMalloc_dist(len)) )
+            ABORT("Malloc fails for bsendx_plist[0]");
+        for (i = 0; i < len; ++i) index[i] = EMPTY;
+        for (i = 0, j = 0; i < k; ++i, j += grid->nprow)
+            bsendx_plist[i] = &index[j];
+        /* -------------------------------------------------------------- */
+        mem_use += 4.0*k*sizeof(int_t*) + 2.0*len*iword;
+        memTRS += k*sizeof(int_t*) + 2.0*k*sizeof(double*) + k*iword; // account for Lindval_loc_bc_ptr, Unnz, Linv_bc_ptr, Uinv_bc_ptr
+
+        /* ------------------------------------------------------------
+           PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS.
+           THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U.
+           ------------------------------------------------------------*/
+
+        for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */
+            pc = PCOL( jb, grid );
+            if ( mycol == pc ) { /* Block column jb in my process column */
+                fsupc = FstBlockC( jb );
+                nsupc = SuperSize( jb );
+                ljb = LBj( jb, grid ); /* Local block number */
+
+                /* Scatter A into SPA. */
+                for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) {
+                    for (i = xa[j]; i < xa[j+1]; ++i) {
+                        irow = asub[i];
+                        gb = BlockNum( irow );
+                        if ( myrow == PROW( gb, grid ) ) {
+                            lb = LBi( gb, grid );
+                            irow = ilsum[lb] + irow - FstBlockC( gb );
+                            dense_col[irow] = a[i];
+                        }
+                    }
+                    dense_col += ldaspa;
+                } /* for j ... */
+
+                jbrow = PROW( jb, grid );
+
+                /*------------------------------------------------
+                 * SET UP U BLOCKS.
+                 *------------------------------------------------*/
+#if ( PROFlevel>=1 )
+                t = SuperLU_timer_();
+#endif
+                kseen = 0;
+                dense_col = dense;
+                /* Loop through each column in the block column. */
+                for (j = fsupc; j < FstBlockC( jb+1 ); ++j) {
+                    istart = xusub[j];
+                    /* NOTE: Only the first nonzero index of the segment
+                       is stored in usub[]. */
+                    for (i = istart; i < xusub[j+1]; ++i) {
+                        irow = usub[i]; /* First nonzero in the segment. */
+                        gb = BlockNum( irow );
+                        pr = PROW( gb, grid );
+                        if ( pr != jbrow &&
+                             myrow == jbrow && /* diag. proc. owning jb */
+                             bsendx_plist[ljb][pr] == EMPTY ) {
+                            bsendx_plist[ljb][pr] = YES;
+                            ++nbsendx;
+                        }
+                        if ( myrow == pr ) {
+                            lb = LBi( gb, grid ); /* Local block number */
+                            index = Ufstnz_br_ptr[lb];
+                            uval = Unzval_br_ptr[lb];
+                            fsupc1 = FstBlockC( gb+1 );
+                            if (rb_marker[lb] <= jb) { /* First time see
+                                                          the block */
+                                rb_marker[lb] = jb + 1;
+                                Urb_indptr[lb] = Urb_fstnz[lb];
+                                index[Urb_indptr[lb]] = jb; /* Descriptor */
+                                Urb_indptr[lb] += UB_DESCRIPTOR;
+                                /* Record the first location in index[] of the
+                                   next block */
+                                Urb_fstnz[lb] = Urb_indptr[lb] + nsupc;
+                                len = Urb_indptr[lb]; /* Start fstnz in index */
+                                index[len-1] = 0;
+                                for (k = 0; k < nsupc; ++k)
+                                    index[len+k] = fsupc1;
+                                if ( gb != jb ) /* Exclude diagonal block. */
+                                    ++bmod[lb]; /* Mod. count for back solve */
+                                if ( kseen == 0 && myrow != jbrow ) {
+                                    ++nbrecvx;
+                                    kseen = 1;
+                                }
+                            } else { /* Already saw the block */
+                                len = Urb_indptr[lb]; /* Start fstnz in index */
+                            }
+                            jj = j - fsupc;
+                            index[len+jj] = irow;
+                            /* Load the numerical values */
+                            k = fsupc1 - irow; /* No. of nonzeros in segment */
+                            index[len-1] += k; /* Increment block length in
+                                                  Descriptor */
+                            irow = ilsum[lb] + irow - FstBlockC( gb );
+                            for (ii = 0; ii < k; ++ii) {
+                                uval[Urb_length[lb]++] = dense_col[irow + ii];
+                                dense_col[irow + ii] = zero;
+                            }
+                        } /* if myrow == pr ... */
+                    } /* for i ... */
+                    dense_col += ldaspa;
+                } /* for j ... */
+
+#if ( PROFlevel>=1 )
+                t_u += SuperLU_timer_() - t;
+                t = SuperLU_timer_();
+#endif
+                /*------------------------------------------------
+                 * SET UP L BLOCKS.
+                 *------------------------------------------------*/
+
+                /* Count number of blocks and length of each block. */
+                nrbl = 0;
+                len = 0; /* Number of row subscripts I own. */
+                kseen = 0;
+                istart = xlsub[fsupc];
+                for (i = istart; i < xlsub[fsupc+1]; ++i) {
+                    irow = lsub[i];
+                    gb = BlockNum( irow ); /* Global block number */
+                    pr = PROW( gb, grid ); /* Process row owning this block */
+                    if ( pr != jbrow &&
+                         myrow == jbrow && /* diag. proc. owning jb */
+                         fsendx_plist[ljb][pr] == EMPTY /* first time */ ) {
+                        fsendx_plist[ljb][pr] = YES;
+                        ++nfsendx;
+                    }
+                    if ( myrow == pr ) {
+                        lb = LBi( gb, grid ); /* Local block number */
+                        if (rb_marker[lb] <= jb) { /* First see this block */
+                            rb_marker[lb] = jb + 1;
+                            Lrb_length[lb] = 1;
+                            Lrb_number[nrbl++] = gb;
+                            if ( gb != jb ) /* Exclude diagonal block. */
+                                ++fmod[lb]; /* Mod. count for forward solve */
+                            if ( kseen == 0 && myrow != jbrow ) {
+                                ++nfrecvx;
+                                kseen = 1;
+                            }
+#if ( PRNTlevel>=1 )
+                            ++nLblocks;
+#endif
+                        } else {
+                            ++Lrb_length[lb];
+                        }
+                        ++len;
+                    }
+                } /* for i ... */
+
+                if ( nrbl ) { /* Do not ensure the blocks are sorted! */
+                    /* Set up the initial pointers for each block in
+                       index[] and nzval[]. */
+                    /* Add room for descriptors */
+                    len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
+                    if ( !(index = intMalloc_dist(len1)) )
+                        ABORT("Malloc fails for index[]");
+                    if (!(lusup = (float*)SUPERLU_MALLOC(len*nsupc * sizeof(float))))
+                        ABORT("Malloc fails for lusup[]");
+                    if ( !(Lindval_loc_bc_ptr[ljb] = intCalloc_dist(nrbl*3)) )
+                        ABORT("Malloc fails for Lindval_loc_bc_ptr[ljb][]");
+                    if (!(Linv_bc_ptr[ljb] = (float*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(float))))
+                        ABORT("Malloc fails for Linv_bc_ptr[ljb][]");
+                    if (!(Uinv_bc_ptr[ljb] = (float*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(float))))
+                        ABORT("Malloc fails for Uinv_bc_ptr[ljb][]");
+                    mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 );
+                    mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc );
+                    mybufmax[4] = SUPERLU_MAX( mybufmax[4], len );
+                    memTRS += nrbl*3.0*iword + 2.0*nsupc*nsupc*dword; // account for Lindval_loc_bc_ptr[ljb], Linv_bc_ptr[ljb], Uinv_bc_ptr[ljb]
+                    index[0] = nrbl; /* Number of row blocks */
+                    index[1] = len;  /* LDA of the nzval[] */
+                    next_lind = BC_HEADER;
+                    next_lval = 0;
+                    for (k = 0; k < nrbl; ++k) {
+                        gb = Lrb_number[k];
+                        lb = LBi( gb, grid );
+                        len = Lrb_length[lb];
+                        Lindval_loc_bc_ptr[ljb][k] = lb;
+                        Lindval_loc_bc_ptr[ljb][k+nrbl] = next_lind;
+                        Lindval_loc_bc_ptr[ljb][k+nrbl*2] = next_lval;
+                        Lrb_length[lb] = 0; /* Reset vector of block length */
+                        index[next_lind++] = gb; /* Descriptor */
+                        index[next_lind++] = len;
+                        Lrb_indptr[lb] = next_lind;
+                        Lrb_valptr[lb] = next_lval;
+                        next_lind += len;
+                        next_lval += len;
+                    }
+                    /* Propagate the compressed row subscripts to Lindex[],
+                       and the initial values of A from SPA into Lnzval[].
*/ + len = index[1]; /* LDA of lusup[] */ + for (i = istart; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + gb = BlockNum( irow ); + if ( myrow == PROW( gb, grid ) ) { + lb = LBi( gb, grid ); + k = Lrb_indptr[lb]++; /* Random access a block */ + index[k] = irow; + k = Lrb_valptr[lb]++; + irow = ilsum[lb] + irow - FstBlockC( gb ); + for (j = 0, dense_col = dense; j < nsupc; ++j) { + lusup[k] = dense_col[irow]; + dense_col[irow] = 0.0; + k += len; + dense_col += ldaspa; + } + } + } /* for i ... */ + + Lrowind_bc_ptr[ljb] = index; + Lnzval_bc_ptr[ljb] = lusup; + + /* sort Lindval_loc_bc_ptr[ljb], Lrowind_bc_ptr[ljb] + and Lnzval_bc_ptr[ljb] here. */ + if(nrbl>1){ + krow = PROW( jb, grid ); + if(myrow==krow){ /* skip the diagonal block */ + uu=nrbl-2; + lloc = &Lindval_loc_bc_ptr[ljb][1]; + }else{ + uu=nrbl-1; + lloc = Lindval_loc_bc_ptr[ljb]; + } + quickSortM(lloc,0,uu,nrbl,0,3); + } + + + if ( !(index_srt = intMalloc_dist(len1)) ) + ABORT("Malloc fails for index_srt[]"); + if (!(lusup_srt = (float*)SUPERLU_MALLOC(len*nsupc * sizeof(float)))) + ABORT("Malloc fails for lusup_srt[]"); + + idx_indx = BC_HEADER; + idx_lusup = 0; + for (jj=0;jj=1 ) + t_l += SuperLU_timer_() - t; +#endif + } /* if mycol == pc */ + + } /* for jb ... */ + + ///////////////////////////////////////////////////////////////// + + /* Set up additional pointers for the index and value arrays of U. + nub is the number of local block columns. */ + nub = CEILING( nsupers, grid->npcol); /* Number of local block columns. */ + if ( !(Urbs = (int_t *) intCalloc_dist(2*nub)) ) + ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero + blocks in a block column. */ + Urbs1 = Urbs + nub; + if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) ) + ABORT("Malloc fails for Ucb_indptr[]"); + if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) ) + ABORT("Malloc fails for Ucb_valptr[]"); + nlb = CEILING( nsupers, grid->nprow ); /* Number of local block rows. */ + + /* Count number of row blocks in a block column. + One pass of the skeleton graph of U. */ + for (lk = 0; lk < nlb; ++lk) { + usub1 = Ufstnz_br_ptr[lk]; + if ( usub1 ) { /* Not an empty block row. */ + /* usub1[0] -- number of column blocks in this block row. */ + i = BR_HEADER; /* Pointer in index array. */ + for (lb = 0; lb < usub1[0]; ++lb) { /* For all column blocks. */ + k = usub1[i]; /* Global block number */ + ++Urbs[LBj(k,grid)]; + i += UB_DESCRIPTOR + SuperSize( k ); + } + } + } + + /* Set up the vertical linked lists for the row blocks. + One pass of the skeleton graph of U. */ + for (lb = 0; lb < nub; ++lb) { + if ( Urbs[lb] ) { /* Not an empty block column. */ + if ( !(Ucb_indptr[lb] + = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) ) + ABORT("Malloc fails for Ucb_indptr[lb][]"); + if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) ) + ABORT("Malloc fails for Ucb_valptr[lb][]"); + } + } + for (lk = 0; lk < nlb; ++lk) { /* For each block row. */ + usub1 = Ufstnz_br_ptr[lk]; + if ( usub1 ) { /* Not an empty block row. */ + i = BR_HEADER; /* Pointer in index array. */ + j = 0; /* Pointer in nzval array. */ + + for (lb = 0; lb < usub1[0]; ++lb) { /* For all column blocks. */ + k = usub1[i]; /* Global block number, column-wise. */ + ljb = LBj( k, grid ); /* Local block number, column-wise. 
*/ + Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk; + + Ucb_indptr[ljb][Urbs1[ljb]].indpos = i; + Ucb_valptr[ljb][Urbs1[ljb]] = j; + + ++Urbs1[ljb]; + j += usub1[i+1]; + i += UB_DESCRIPTOR + SuperSize( k ); + } + } + } + + +/* Count the nnzs per block column */ + for (lb = 0; lb < nub; ++lb) { + Unnz[lb] = 0; + k = lb * grid->npcol + mycol;/* Global block number, column-wise. */ + knsupc = SuperSize( k ); + for (ub = 0; ub < Urbs[lb]; ++ub) { + ik = Ucb_indptr[lb][ub].lbnum; /* Local block number, row-wise. */ + i = Ucb_indptr[lb][ub].indpos; /* Start of the block in usub[]. */ + i += UB_DESCRIPTOR; + gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */ + iklrow = FstBlockC( gik+1 ); + for (jj = 0; jj < knsupc; ++jj) { + fnz = Ufstnz_br_ptr[ik][i + jj]; + if ( fnz < iklrow ) { + Unnz[lb] +=iklrow-fnz; + } + } /* for jj ... */ + } + } + + ///////////////////////////////////////////////////////////////// + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + /* construct the Bcast tree for L ... */ + + k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ + if ( !(LBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) ) + ABORT("Malloc fails for LBtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->nprow * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + if ( !(SeedSTD_BC = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_BC[]."); + + + for (i=0;icscp.comm); + + for (ljb = 0; ljb nprow*k)) ) + ABORT("Calloc fails for ActiveFlag[]."); + memTRS += k*sizeof(BcTree) + k*dword + grid->nprow*k*iword; //acount for LBtree_ptr, SeedSTD_BC, ActiveFlagAll + for (j=0;jnprow*k;++j)ActiveFlagAll[j]=3*nsupers; + for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */ + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow]=SUPERLU_MIN(ActiveFlagAll[pr+ljb*grid->nprow],gb); + } /* for j ... */ + } + } + + for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... 
*/ + + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow;++j)ActiveFlag[j]=ActiveFlagAll[j+ljb*grid->nprow]; + for (j=0;jnprow;++j)ActiveFlag[j+grid->nprow]=j; + for (j=0;jnprow;++j)ranks[j]=-1; + + Root=-1; + Iactive = 0; + for (j=0;jnprow;++j){ + if(ActiveFlag[j]!=3*nsupers){ + gb = ActiveFlag[j]; + pr = PROW( gb, grid ); + if(gb==jb)Root=pr; + if(myrow==pr)Iactive=1; + } + } + + + quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,0,2); + + if(Iactive==1){ + // printf("jb %5d damn\n",jb); + // fflush(stdout); + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->nprow; ++j){ + if(ActiveFlag[j]!=3*nsupers && ActiveFlag[j+grid->nprow]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->nprow]; + ++rank_cnt; + } + } + + if(rank_cnt>1){ + + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'s'); + BcTree_SetTag(LBtree_ptr[ljb],BC_L,'s'); + + // printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt); + // fflush(stdout); + + // if(iam==15 || iam==3){ + // printf("iam %5d btree lk %5d tag %5d root %5d\n",iam, ljb,jb,BcTree_IsRoot(LBtree_ptr[ljb],'s')); + // fflush(stdout); + // } + + // #if ( PRNTlevel>=1 ) + if(Root==myrow){ + rank_cnt_ref=1; + for (j = 0; j < grid->nprow; ++j) { + if ( fsendx_plist[ljb][j] != EMPTY ) { + ++rank_cnt_ref; + } + } + assert(rank_cnt==rank_cnt_ref); + + // printf("Partial Bcast Procs: col%7d np%4d\n",jb,rank_cnt); + + // // printf("Partial Bcast Procs: %4d %4d: ",iam, rank_cnt); + // // for(j=0;jnprow*k*iword; //acount for SeedSTD_BC, ActiveFlagAll + +#if ( PROFlevel>=1 ) +t = SuperLU_timer_() - t; +if ( !iam) printf(".. Construct Bcast tree for L: %.2f\t\n", t); +#endif + + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + /* construct the Reduce tree for L ... */ + /* the following is used as reference */ + nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(mod_bit = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for mod_bit[]."); + if ( !(frecv = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for frecv[]."); + + for (k = 0; k < nlb; ++k) mod_bit[k] = 0; + for (k = 0; k < nsupers; ++k) { + pr = PROW( k, grid ); + if ( myrow == pr ) { + lib = LBi( k, grid ); /* local block number */ + kcol = PCOL( k, grid ); + if (mycol == kcol || fmod[lib] ) + mod_bit[lib] = 1; /* contribution from off-diagonal and diagonal*/ + } + } + /* Every process receives the count, but it is only useful on the + diagonal processes. */ + MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm); + + + + k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(LRtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) ) + ABORT("Malloc fails for LRtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->npcol * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + + // if ( !(idxs = intCalloc_dist(nsupers)) ) + // ABORT("Calloc fails for idxs[]."); + + // if ( !(nzrows = (int_t**)SUPERLU_MALLOC(nsupers * sizeof(int_t*))) ) + // ABORT("Malloc fails for nzrows[]."); + + if ( !(SeedSTD_RD = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_RD[]."); + + for (i=0;irscp.comm); + + + // for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... 
*/ + // fsupc = FstBlockC( jb ); + // len=xlsub[fsupc+1]-xlsub[fsupc]; + // idxs[jb] = len-1; + // if(len>0){ + // if ( !(nzrows[jb] = intMalloc_dist(len)) ) + // ABORT("Malloc fails for nzrows[jb]"); + // for(i=xlsub[fsupc];inpcol*k)) ) + ABORT("Calloc fails for ActiveFlagAll[]."); + for (j=0;jnpcol*k;++j)ActiveFlagAll[j]=-3*nsupers; + memTRS += k*sizeof(RdTree) + k*dword + grid->npcol*k*iword; //acount for LRtree_ptr, SeedSTD_RD, ActiveFlagAll + for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */ + fsupc = FstBlockC( jb ); + pc = PCOL( jb, grid ); + for(i=xlsub[fsupc];inpcol]=SUPERLU_MAX(ActiveFlagAll[pc+lib*grid->npcol],jb); + } + } + } + + + for (lib=0;libnprow; /* not sure */ + if(ibnpcol;++j)ActiveFlag[j]=ActiveFlagAll[j+lib*grid->npcol];; + for (j=0;jnpcol;++j)ActiveFlag[j+grid->npcol]=j; + for (j=0;jnpcol;++j)ranks[j]=-1; + Root=-1; + Iactive = 0; + + for (j=0;jnpcol;++j){ + if(ActiveFlag[j]!=-3*nsupers){ + jb = ActiveFlag[j]; + pc = PCOL( jb, grid ); + if(jb==ib)Root=pc; + if(mycol==pc)Iactive=1; + } + } + + + quickSortM(ActiveFlag,0,grid->npcol-1,grid->npcol,1,2); + + if(Iactive==1){ + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->npcol; ++j){ + if(ActiveFlag[j]!=-3*nsupers && ActiveFlag[j+grid->npcol]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->npcol]; + ++rank_cnt; + } + } + if(rank_cnt>1){ + + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'s'); + RdTree_SetTag(LRtree_ptr[lib], RD_L,'s'); + // } + + // printf("iam %5d rtree rank_cnt %5d \n",iam,rank_cnt); + // fflush(stdout); + + // if(ib==15 || ib ==16){ + + // if(iam==15 || iam==3){ + // printf("iam %5d rtree lk %5d tag %5d root %5d\n",iam,lib,ib,RdTree_IsRoot(LRtree_ptr[lib],'s')); + // fflush(stdout); + // } + + + // #if ( PRNTlevel>=1 ) + // if(Root==mycol){ + // assert(rank_cnt==frecv[lib]); + // printf("Partial Reduce Procs: row%7d np%4d\n",ib,rank_cnt); + // // printf("Partial Reduce Procs: %4d %4d: ",iam, rank_cnt); + // // // for(j=0;jnprow*k*iword; //acount for SeedSTD_RD, ActiveFlagAll + //////////////////////////////////////////////////////// + +#if ( PROFlevel>=1 ) +t = SuperLU_timer_() - t; +if ( !iam) printf(".. Construct Reduce tree for L: %.2f\t\n", t); +#endif + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + + /* construct the Bcast tree for U ... */ + + k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ + if ( !(UBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) ) + ABORT("Malloc fails for UBtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->nprow * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + if ( !(SeedSTD_BC = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_BC[]."); + + for (i=0;icscp.comm); + + + for (ljb = 0; ljb nprow*k)) ) + ABORT("Calloc fails for ActiveFlagAll[]."); + for (j=0;jnprow*k;++j)ActiveFlagAll[j]=-3*nsupers; + memTRS += k*sizeof(BcTree) + k*dword + grid->nprow*k*iword; //acount for UBtree_ptr, SeedSTD_BC, ActiveFlagAll + + for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... 
*/ + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow]=SUPERLU_MAX(ActiveFlagAll[pr+ljb*grid->nprow],gb); + // printf("gb:%5d jb: %5d nsupers: %5d\n",gb,jb,nsupers); + // fflush(stdout); + //if(gb==jb)Root=pr; + } + + + } + pr = PROW( jb, grid ); // take care of diagonal node stored as L + // printf("jb %5d current: %5d",jb,ActiveFlagAll[pr+ljb*grid->nprow]); + // fflush(stdout); + ActiveFlagAll[pr+ljb*grid->nprow]=SUPERLU_MAX(ActiveFlagAll[pr+ljb*grid->nprow],jb); + } + } + + + + for (ljb = 0; ljb < k; ++ljb) { /* for each block column ... */ + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow;++j)ActiveFlag[j]=ActiveFlagAll[j+ljb*grid->nprow]; + for (j=0;jnprow;++j)ActiveFlag[j+grid->nprow]=j; + for (j=0;jnprow;++j)ranks[j]=-1; + + Root=-1; + Iactive = 0; + for (j=0;jnprow;++j){ + if(ActiveFlag[j]!=-3*nsupers){ + gb = ActiveFlag[j]; + pr = PROW( gb, grid ); + if(gb==jb)Root=pr; + if(myrow==pr)Iactive=1; + } + } + + quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,1,2); + // printf("jb: %5d Iactive %5d\n",jb,Iactive); + // fflush(stdout); + if(Iactive==1){ + // printf("root:%5d jb: %5d\n",Root,jb); + // fflush(stdout); + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->nprow; ++j){ + if(ActiveFlag[j]!=-3*nsupers && ActiveFlag[j+grid->nprow]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->nprow]; + ++rank_cnt; + } + } + // printf("jb: %5d rank_cnt %5d\n",jb,rank_cnt); + // fflush(stdout); + if(rank_cnt>1){ + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'s'); + BcTree_SetTag(UBtree_ptr[ljb],BC_U,'s'); + + // printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt); + // fflush(stdout); + + if(Root==myrow){ + rank_cnt_ref=1; + for (j = 0; j < grid->nprow; ++j) { + // printf("ljb %5d j %5d nprow %5d\n",ljb,j,grid->nprow); + // fflush(stdout); + if ( bsendx_plist[ljb][j] != EMPTY ) { + ++rank_cnt_ref; + } + } + // printf("ljb %5d rank_cnt %5d rank_cnt_ref %5d\n",ljb,rank_cnt,rank_cnt_ref); + // fflush(stdout); + assert(rank_cnt==rank_cnt_ref); + } + } + } + } + } + SUPERLU_FREE(ActiveFlag); + SUPERLU_FREE(ActiveFlagAll); + SUPERLU_FREE(ranks); + SUPERLU_FREE(SeedSTD_BC); + memTRS -= k*dword + grid->nprow*k*iword; //acount for SeedSTD_BC, ActiveFlagAll + +#if ( PROFlevel>=1 ) +t = SuperLU_timer_() - t; +if ( !iam) printf(".. Construct Bcast tree for U: %.2f\t\n", t); +#endif + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + /* construct the Reduce tree for U ... */ + /* the following is used as reference */ + nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(mod_bit = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for mod_bit[]."); + if ( !(brecv = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for brecv[]."); + + for (k = 0; k < nlb; ++k) mod_bit[k] = 0; + for (k = 0; k < nsupers; ++k) { + pr = PROW( k, grid ); + if ( myrow == pr ) { + lib = LBi( k, grid ); /* local block number */ + kcol = PCOL( k, grid ); + if (mycol == kcol || bmod[lib] ) + mod_bit[lib] = 1; /* contribution from off-diagonal and diagonal*/ + } + } + /* Every process receives the count, but it is only useful on the + diagonal processes. 
*/ + MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm); + + + + k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(URtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) ) + ABORT("Malloc fails for URtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->npcol * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + + // if ( !(idxs = intCalloc_dist(nsupers)) ) + // ABORT("Calloc fails for idxs[]."); + + // if ( !(nzrows = (int_t**)SUPERLU_MALLOC(nsupers * sizeof(int_t*))) ) + // ABORT("Malloc fails for nzrows[]."); + + if ( !(SeedSTD_RD = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_RD[]."); + + for (i=0;irscp.comm); + + + // for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */ + // fsupc = FstBlockC( jb ); + // len=0; + // for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { + // istart = xusub[j]; + // /* NOTE: Only the first nonzero index of the segment + // is stored in usub[]. */ + // len += xusub[j+1] - xusub[j]; + // } + + // idxs[jb] = len-1; + + // if(len>0){ + // if ( !(nzrows[jb] = intMalloc_dist(len)) ) + // ABORT("Malloc fails for nzrows[jb]"); + + // fsupc = FstBlockC( jb ); + + // len=0; + + // for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { + // istart = xusub[j]; + // /* NOTE: Only the first nonzero index of the segment + // is stored in usub[]. */ + // for (i = istart; i < xusub[j+1]; ++i) { + // irow = usub[i]; /* First nonzero in the segment. */ + // nzrows[jb][len]=irow; + // len++; + // } + // } + // quickSort(nzrows[jb],0,len-1,0); + // } + // else{ + // nzrows[jb] = NULL; + // } + // } + + + for (lib = 0; lib npcol*k)) ) + ABORT("Calloc fails for ActiveFlagAll[]."); + for (j=0;jnpcol*k;++j)ActiveFlagAll[j]=3*nsupers; + memTRS += k*sizeof(RdTree) + k*dword + grid->npcol*k*iword; //acount for URtree_ptr, SeedSTD_RD, ActiveFlagAll + + for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */ + fsupc = FstBlockC( jb ); + pc = PCOL( jb, grid ); + + fsupc = FstBlockC( jb ); + for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { + istart = xusub[j]; + /* NOTE: Only the first nonzero index of the segment + is stored in usub[]. */ + for (i = istart; i < xusub[j+1]; ++i) { + irow = usub[i]; /* First nonzero in the segment. 
*/ + ib = BlockNum( irow ); + pr = PROW( ib, grid ); + if ( myrow == pr ) { /* Block row ib in my process row */ + lib = LBi( ib, grid ); /* Local block number */ + ActiveFlagAll[pc+lib*grid->npcol]=SUPERLU_MIN(ActiveFlagAll[pc+lib*grid->npcol],jb); + } + } + } + + pr = PROW( jb, grid ); + if ( myrow == pr ) { /* Block row ib in my process row */ + lib = LBi( jb, grid ); /* Local block number */ + ActiveFlagAll[pc+lib*grid->npcol]=SUPERLU_MIN(ActiveFlagAll[pc+lib*grid->npcol],jb); + } + } + + + for (lib=0;libnprow; /* not sure */ + if(ibnpcol;++j)ActiveFlag[j]=ActiveFlagAll[j+lib*grid->npcol];; + for (j=0;jnpcol;++j)ActiveFlag[j+grid->npcol]=j; + for (j=0;jnpcol;++j)ranks[j]=-1; + Root=-1; + Iactive = 0; + + for (j=0;jnpcol;++j){ + if(ActiveFlag[j]!=3*nsupers){ + jb = ActiveFlag[j]; + pc = PCOL( jb, grid ); + if(jb==ib)Root=pc; + if(mycol==pc)Iactive=1; + } + } + + quickSortM(ActiveFlag,0,grid->npcol-1,grid->npcol,0,2); + + if(Iactive==1){ + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->npcol; ++j){ + if(ActiveFlag[j]!=3*nsupers && ActiveFlag[j+grid->npcol]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->npcol]; + ++rank_cnt; + } + } + if(rank_cnt>1){ + + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'s'); + RdTree_SetTag(URtree_ptr[lib], RD_U,'s'); + // } + + // #if ( PRNTlevel>=1 ) + if(Root==mycol){ + // printf("Partial Reduce Procs: %4d %4d %5d \n",iam, rank_cnt,brecv[lib]); + // fflush(stdout); + assert(rank_cnt==brecv[lib]); + // printf("Partial Reduce Procs: row%7d np%4d\n",ib,rank_cnt); + // printf("Partial Reduce Procs: %4d %4d: ",iam, rank_cnt); + // // for(j=0;jnprow*k*iword; //acount for SeedSTD_RD, ActiveFlagAll + +#if ( PROFlevel>=1 ) +t = SuperLU_timer_() - t; +if ( !iam) printf(".. Construct Reduce tree for U: %.2f\t\n", t); +#endif + + //////////////////////////////////////////////////////// + + + Llu->Lrowind_bc_ptr = Lrowind_bc_ptr; + Llu->Lindval_loc_bc_ptr = Lindval_loc_bc_ptr; + Llu->Lnzval_bc_ptr = Lnzval_bc_ptr; + Llu->Ufstnz_br_ptr = Ufstnz_br_ptr; + Llu->Unzval_br_ptr = Unzval_br_ptr; + Llu->Unnz = Unnz; + Llu->ToRecv = ToRecv; + Llu->ToSendD = ToSendD; + Llu->ToSendR = ToSendR; + Llu->fmod = fmod; + Llu->fsendx_plist = fsendx_plist; + Llu->nfrecvx = nfrecvx; + Llu->nfsendx = nfsendx; + Llu->bmod = bmod; + Llu->bsendx_plist = bsendx_plist; + Llu->nbrecvx = nbrecvx; + Llu->nbsendx = nbsendx; + Llu->ilsum = ilsum; + Llu->ldalsum = ldaspa; + + Llu->LRtree_ptr = LRtree_ptr; + Llu->LBtree_ptr = LBtree_ptr; + Llu->URtree_ptr = URtree_ptr; + Llu->UBtree_ptr = UBtree_ptr; + Llu->Linv_bc_ptr = Linv_bc_ptr; + Llu->Uinv_bc_ptr = Uinv_bc_ptr; + Llu->Urbs = Urbs; + Llu->Ucb_indptr = Ucb_indptr; + Llu->Ucb_valptr = Ucb_valptr; + + +#if ( PRNTlevel>=1 ) + if ( !iam ) printf(".. # L blocks " IFMT "\t# U blocks " IFMT "\n", + nLblocks, nUblocks); +#endif + + SUPERLU_FREE(rb_marker); + SUPERLU_FREE(Urb_fstnz); + SUPERLU_FREE(Urb_length); + SUPERLU_FREE(Urb_indptr); + SUPERLU_FREE(Lrb_length); + SUPERLU_FREE(Lrb_number); + SUPERLU_FREE(Lrb_indptr); + SUPERLU_FREE(Lrb_valptr); + SUPERLU_FREE(dense); + + /* Find the maximum buffer size. */ + MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, + MPI_MAX, grid->comm); + + k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(Llu->mod_bit = intMalloc_dist(k)) ) + ABORT("Malloc fails for mod_bit[]."); + +#if ( PROFlevel>=1 ) + if ( !iam ) printf(".. 
1st distribute time:\n "
+             "\tL\t%.2f\n\tU\t%.2f\n"
+             "\tu_blks %d\tnrbu %d\n--------\n",
+             t_l, t_u, u_blks, nrbu);
+#endif
+
+    } /* else fact != SamePattern_SameRowPerm */
+
+    if ( xa[A->ncol] > 0 ) { /* may not have any entries on this process. */
+        SUPERLU_FREE(asub);
+        SUPERLU_FREE(a);
+    }
+    SUPERLU_FREE(xa);
+
+#if ( DEBUGlevel>=1 )
+    /* Memory allocated but not freed:
+       ilsum, fmod, fsendx_plist, bmod, bsendx_plist  */
+    CHECK_MALLOC(iam, "Exit psdistribute()");
+#endif
+
+    return (mem_use+memTRS);
+
+} /* PSDISTRIBUTE */
diff --git a/SRC/psgsequ.c b/SRC/psgsequ.c
new file mode 100644
index 00000000..02bd4a0d
--- /dev/null
+++ b/SRC/psgsequ.c
@@ -0,0 +1,244 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Computes row and column scalings
+ *
+ * File name: psgsequ.c
+ * History: Modified from LAPACK routine SGEEQU
+ */
+#include <math.h>
+#include "superlu_sdefs.h"
+
+/*! \brief
+
+
+    Purpose
+    =======
+
+    PSGSEQU computes row and column scalings intended to equilibrate an
+    M-by-N sparse matrix A and reduce its condition number. R returns the row
+    scale factors and C the column scale factors, chosen to try to make
+    the largest element in each row and column of the matrix B with
+    elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1.
+
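+    As a small illustration (a made-up example, not from the code): for
+        A = [ 1e+8   0
+              0      1e-4 ],
+    the row factors are R = [ 1e-8, 1e+4 ]; after row scaling every
+    nonzero already has magnitude 1, so C = [ 1, 1 ] and each entry of
+    B(i,j) = R(i)*A(i,j)*C(j) has absolute value 1.
+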
+    R(i) and C(j) are restricted to be between SMLNUM = smallest safe
+    number and BIGNUM = largest safe number.  Use of these scaling
+    factors is not guaranteed to reduce the condition number of A but
+    works well in practice.
+
+    See supermatrix.h for the definition of 'SuperMatrix' structure.
+
+    Arguments
+    =========
+
+    A       (input) SuperMatrix*
+            The matrix of dimension (A->nrow, A->ncol) whose equilibration
+            factors are to be computed. The type of A can be:
+            Stype = SLU_NR_loc; Dtype = SLU_S; Mtype = SLU_GE.
+
+    R       (output) float*, size A->nrow
+            If INFO = 0 or INFO > M, R contains the row scale factors
+            for A.
+
+    C       (output) float*, size A->ncol
+            If INFO = 0,  C contains the column scale factors for A.
+
+    ROWCND  (output) float*
+            If INFO = 0 or INFO > M, ROWCND contains the ratio of the
+            smallest R(i) to the largest R(i).  If ROWCND >= 0.1 and
+            AMAX is neither too large nor too small, it is not worth
+            scaling by R.
+
+    COLCND  (output) float*
+            If INFO = 0, COLCND contains the ratio of the smallest
+            C(i) to the largest C(i).  If COLCND >= 0.1, it is not
+            worth scaling by C.
+
+    AMAX    (output) float*
+            Absolute value of largest matrix element.  If AMAX is very
+            close to overflow or very close to underflow, the matrix
+            should be scaled.
+
+    INFO    (output) int*
+            = 0:  successful exit
+            < 0:  if INFO = -i, the i-th argument had an illegal value
+            > 0:  if INFO = i,  and i is
+                  <= M:  the i-th row of A is exactly zero
+                  >  M:  the (i-M)-th column of A is exactly zero
+
+    GRID    (input) gridinfo_t*
+            The 2D process mesh.
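+
+    A typical call sequence (a sketch; everything here except psgsequ()
+    itself is supplied by the caller):
+
+        float *R = floatMalloc_dist(A->nrow);
+        float *C = floatMalloc_dist(A->ncol);
+        float rowcnd, colcnd, amax;
+        int_t info;
+        psgsequ(A, R, C, &rowcnd, &colcnd, &amax, &info, grid);
+
+    On return, scaling is worthwhile only if ROWCND or COLCND is below
+    0.1, or if AMAX is very close to overflow or underflow, per the
+    criteria above.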
+    =====================================================================
+
+*/ + +void +psgsequ(SuperMatrix *A, float *r, float *c, float *rowcnd, + float *colcnd, float *amax, int_t *info, gridinfo_t *grid) +{ + + /* Local variables */ + NRformat_loc *Astore; + float *Aval; + int i, j, irow, jcol, m_loc; + float rcmin, rcmax; + float bignum, smlnum; + float tempmax, tempmin; + float *loc_max; + int *r_sizes, *displs; + float *loc_r; + int_t procs; + + /* Test the input parameters. */ + *info = 0; + if ( A->nrow < 0 || A->ncol < 0 || + A->Stype != SLU_NR_loc || A->Dtype != SLU_S || A->Mtype != SLU_GE ) + *info = -1; + if (*info != 0) { + i = -(*info); + pxerr_dist("psgsequ", grid, i); + return; + } + + /* Quick return if possible */ + if ( A->nrow == 0 || A->ncol == 0 ) { + *rowcnd = 1.; + *colcnd = 1.; + *amax = 0.; + return; + } + + Astore = A->Store; + Aval = Astore->nzval; + m_loc = Astore->m_loc; + + /* Get machine constants. */ + smlnum = smach_dist("S"); + bignum = 1. / smlnum; + + /* Compute row scale factors. */ + for (i = 0; i < A->nrow; ++i) r[i] = 0.; + + /* Find the maximum element in each row. */ + irow = Astore->fst_row; + for (i = 0; i < m_loc; ++i) { + for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) + r[irow] = SUPERLU_MAX( r[irow], fabs(Aval[j]) ); + ++irow; + } + + /* Find the maximum and minimum scale factors. */ + rcmin = bignum; + rcmax = 0.; + for (i = Astore->fst_row; i < Astore->fst_row + m_loc; ++i) { + rcmax = SUPERLU_MAX(rcmax, r[i]); + rcmin = SUPERLU_MIN(rcmin, r[i]); + } + + /* Get the global MAX and MIN for R */ + tempmax = rcmax; + tempmin = rcmin; + MPI_Allreduce( &tempmax, &rcmax, + 1, MPI_FLOAT, MPI_MAX, grid->comm); + MPI_Allreduce( &tempmin, &rcmin, + 1, MPI_FLOAT, MPI_MIN, grid->comm); + + *amax = rcmax; + + if (rcmin == 0.) { + /* Find the first zero scale factor and return an error code. */ + for (i = 0; i < A->nrow; ++i) + if (r[i] == 0.) { + *info = i + 1; + return; + } + } else { + /* Invert the scale factors. */ + for (i = 0; i < A->nrow; ++i) + r[i] = 1. / SUPERLU_MIN( SUPERLU_MAX( r[i], smlnum ), bignum ); + /* Compute ROWCND = min(R(I)) / max(R(I)) */ + *rowcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); + } + + /* Compute column scale factors */ + for (j = 0; j < A->ncol; ++j) c[j] = 0.; + + /* Find the maximum element in each column, assuming the row + scalings computed above. */ + irow = Astore->fst_row; + for (i = 0; i < m_loc; ++i) { + for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { + jcol = Astore->colind[j]; + c[jcol] = SUPERLU_MAX( c[jcol], fabs(Aval[j]) * r[irow] ); + } + ++irow; + } + + /* Find the global maximum for c[j] */ + if ( !(loc_max = floatMalloc_dist(A->ncol))) + ABORT("Malloc fails for loc_max[]."); + for (j = 0; j < A->ncol; ++j) loc_max[j] = c[j]; + MPI_Allreduce(loc_max, c, A->ncol, MPI_FLOAT, MPI_MAX, grid->comm); + SUPERLU_FREE(loc_max); + + /* Find the maximum and minimum scale factors. */ + rcmin = bignum; + rcmax = 0.; + for (j = 0; j < A->ncol; ++j) { + rcmax = SUPERLU_MAX(rcmax, c[j]); + rcmin = SUPERLU_MIN(rcmin, c[j]); + } + + if (rcmin == 0.) { + /* Find the first zero scale factor and return an error code. */ + for (j = 0; j < A->ncol; ++j) + if ( c[j] == 0. ) { + *info = A->nrow + j + 1; + return; + } + } else { + /* Invert the scale factors. */ + for (j = 0; j < A->ncol; ++j) + c[j] = 1. / SUPERLU_MIN( SUPERLU_MAX( c[j], smlnum ), bignum); + /* Compute COLCND = min(C(J)) / max(C(J)) */ + *colcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); + } + + /* gather R from each process to get the global R. 
*/ + + procs = grid->nprow * grid->npcol; + if ( !(r_sizes = SUPERLU_MALLOC(2 * procs * sizeof(int)))) + ABORT("Malloc fails for r_sizes[]."); + displs = r_sizes + procs; + if ( !(loc_r = floatMalloc_dist(m_loc))) + ABORT("Malloc fails for loc_r[]."); + j = Astore->fst_row; + for (i = 0; i < m_loc; ++i) loc_r[i] = r[j++]; + + /* First gather the size of each piece. */ + MPI_Allgather(&m_loc, 1, MPI_INT, r_sizes, 1, MPI_INT, grid->comm); + + /* Set up the displacements for allgatherv */ + displs[0] = 0; + for (i = 1; i < procs; ++i) displs[i] = displs[i-1] + r_sizes[i-1]; + + /* Now gather the actual data */ + MPI_Allgatherv(loc_r, m_loc, MPI_FLOAT, r, r_sizes, displs, + MPI_FLOAT, grid->comm); + + SUPERLU_FREE(r_sizes); + SUPERLU_FREE(loc_r); + + return; + +} /* psgsequ */ diff --git a/SRC/psgsmv.c b/SRC/psgsmv.c new file mode 100644 index 00000000..b0478a9d --- /dev/null +++ b/SRC/psgsmv.c @@ -0,0 +1,383 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Parallel sparse matrix-vector multiplication + * + *
+ * -- Distributed SuperLU routine (version 2.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ * 
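+ *
+ * Typical usage (a sketch; x_loc, ax_loc and row_to_proc are supplied
+ * by the caller, only the three routines named below live in this file):
+ *
+ *     psgsmv_comm_t gsmv_comm;
+ *     psgsmv_init(A, row_to_proc, grid, &gsmv_comm);  -- once per sparsity pattern
+ *     psgsmv(0, A, grid, &gsmv_comm, x_loc, ax_loc);  -- ax_loc := A * x_loc
+ *     psgsmv_finalize(&gsmv_comm);                    -- release communication buffers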
+ */ + +#include +#include "superlu_sdefs.h" + +void psgsmv_init +( + SuperMatrix *A, /* Matrix A permuted by columns (input/output). + The type of A can be: + Stype = SLU_NR_loc; Dtype = SLU_S; Mtype = SLU_GE. */ + int_t *row_to_proc, /* Input. Mapping between rows and processes. */ + gridinfo_t *grid, /* Input */ + psgsmv_comm_t *gsmv_comm /* Output. The data structure for communication. */ + ) +{ + NRformat_loc *Astore; + int iam, p, procs; + int *SendCounts, *RecvCounts; + int_t i, j, k, l, m, m_loc, n, fst_row, jcol; + int_t TotalIndSend, TotalValSend; + int_t *colind, *rowptr; + int_t *ind_tosend = NULL, *ind_torecv = NULL; + int_t *ptr_ind_tosend, *ptr_ind_torecv; + int_t *extern_start, *spa, *itemp; + float *nzval, *val_tosend = NULL, *val_torecv = NULL, t; + MPI_Request *send_req, *recv_req; + MPI_Status status; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(grid->iam, "Enter psgsmv_init()"); +#endif + + /* ------------------------------------------------------------ + INITIALIZATION. + ------------------------------------------------------------*/ + iam = grid->iam; + procs = grid->nprow * grid->npcol; + Astore = (NRformat_loc *) A->Store; + m = A->nrow; + n = A->ncol; + m_loc = Astore->m_loc; + fst_row = Astore->fst_row; + colind = Astore->colind; + rowptr = Astore->rowptr; + nzval = Astore->nzval; + if ( !(SendCounts = SUPERLU_MALLOC(2*procs * sizeof(int))) ) + ABORT("Malloc fails for SendCounts[]"); + /*for (i = 0; i < 2*procs; ++i) SendCounts[i] = 0;*/ + RecvCounts = SendCounts + procs; + if ( !(ptr_ind_tosend = intMalloc_dist(2*(procs+1))) ) + ABORT("Malloc fails for ptr_ind_tosend[]"); + ptr_ind_torecv = ptr_ind_tosend + procs + 1; + if ( !(extern_start = intMalloc_dist(m_loc)) ) + ABORT("Malloc fails for extern_start[]"); + for (i = 0; i < m_loc; ++i) extern_start[i] = rowptr[i]; + + /* ------------------------------------------------------------ + COUNT THE NUMBER OF X ENTRIES TO BE SENT TO EACH PROCESS. + THIS IS THE UNION OF THE COLUMN INDICES OF MY ROWS. + SWAP TO THE BEGINNING THE PART OF A CORRESPONDING TO THE + LOCAL PART OF X. + THIS ACCOUNTS FOR THE FIRST PASS OF ACCESSING MATRIX A. + ------------------------------------------------------------*/ + if ( !(spa = intCalloc_dist(n)) ) /* Aid in global to local translation */ + ABORT("Malloc fails for spa[]"); + for (p = 0; p < procs; ++p) SendCounts[p] = 0; + for (i = 0; i < m_loc; ++i) { /* Loop through each row */ + k = extern_start[i]; + for (j = rowptr[i]; j < rowptr[i+1]; ++j) {/* Each nonzero in row i */ + jcol = colind[j]; + p = row_to_proc[jcol]; + if ( p != iam ) { /* External */ + if ( spa[jcol] == 0 ) { /* First time see this index */ + ++SendCounts[p]; + spa[jcol] = 1; + } + } else { /* Swap to beginning the part of A corresponding + to the local part of X */ + l = colind[k]; + t = nzval[k]; + colind[k] = jcol; + nzval[k] = nzval[j]; + colind[j] = l; + nzval[j] = t; + ++k; + } + } + extern_start[i] = k; + } + + /* ------------------------------------------------------------ + LOAD THE X-INDICES TO BE SENT TO THE OTHER PROCESSES. + THIS ACCOUNTS FOR THE SECOND PASS OF ACCESSING MATRIX A. + ------------------------------------------------------------*/ + /* Build pointers to ind_tosend[]. */ + ptr_ind_tosend[0] = 0; + for (p = 0, TotalIndSend = 0; p < procs; ++p) { + TotalIndSend += SendCounts[p]; /* Total to send. 
*/ + ptr_ind_tosend[p+1] = ptr_ind_tosend[p] + SendCounts[p]; + } +#if 0 + ptr_ind_tosend[iam] = 0; /* Local part of X */ +#endif + if ( TotalIndSend ) { + if ( !(ind_tosend = intMalloc_dist(TotalIndSend)) ) + ABORT("Malloc fails for ind_tosend[]"); /* Exclude local part of X */ + } + + /* Build SPA to aid global to local translation. */ + for (i = 0; i < n; ++i) spa[i] = EMPTY; + for (i = 0; i < m_loc; ++i) { /* Loop through each row of A */ + for (j = rowptr[i]; j < rowptr[i+1]; ++j) { + jcol = colind[j]; + if ( spa[jcol] == EMPTY ) { /* First time see this index */ + p = row_to_proc[jcol]; + if ( p == iam ) { /* Local */ + /*assert(jcol>=fst_row);*/ + spa[jcol] = jcol - fst_row; /* Relative position in local X */ + } else { /* External */ + ind_tosend[ptr_ind_tosend[p]] = jcol; /* Still global */ + spa[jcol] = ptr_ind_tosend[p]; /* Position in ind_tosend[] */ + ++ptr_ind_tosend[p]; + } + } + } + } + + /* ------------------------------------------------------------ + TRANSFORM THE COLUMN INDICES OF MATRIX A INTO LOCAL INDICES. + THIS ACCOUNTS FOR THE THIRD PASS OF ACCESSING MATRIX A. + ------------------------------------------------------------*/ + for (i = 0; i < m_loc; ++i) { + for (j = rowptr[i]; j < rowptr[i+1]; ++j) { + jcol = colind[j]; + colind[j] = spa[jcol]; + } + } + + /* ------------------------------------------------------------ + COMMUNICATE THE EXTERNAL INDICES OF X. + ------------------------------------------------------------*/ + MPI_Alltoall(SendCounts, 1, MPI_INT, RecvCounts, 1, MPI_INT, + grid->comm); + + /* Build pointers to ind_torecv[]. */ + ptr_ind_torecv[0] = 0; + for (p = 0, TotalValSend = 0; p < procs; ++p) { + TotalValSend += RecvCounts[p]; /* Total to receive. */ + ptr_ind_torecv[p+1] = ptr_ind_torecv[p] + RecvCounts[p]; + } + if ( TotalValSend ) { + if ( !(ind_torecv = intMalloc_dist(TotalValSend)) ) + ABORT("Malloc fails for ind_torecv[]"); + } + + if ( !(send_req = (MPI_Request *) + SUPERLU_MALLOC(2*procs *sizeof(MPI_Request)))) + ABORT("Malloc fails for recv_req[]."); + recv_req = send_req + procs; + for (p = 0; p < procs; ++p) { + ptr_ind_tosend[p] -= SendCounts[p]; /* Reset pointer to beginning */ + if ( SendCounts[p] ) { + MPI_Isend(&ind_tosend[ptr_ind_tosend[p]], SendCounts[p], + mpi_int_t, p, iam, grid->comm, &send_req[p]); + } + if ( RecvCounts[p] ) { + MPI_Irecv(&ind_torecv[ptr_ind_torecv[p]], RecvCounts[p], + mpi_int_t, p, p, grid->comm, &recv_req[p]); + } + } + for (p = 0; p < procs; ++p) { + if ( SendCounts[p] ) MPI_Wait(&send_req[p], &status); + if ( RecvCounts[p] ) MPI_Wait(&recv_req[p], &status); + } + + /* Allocate storage for the X values to to transferred. 
*/ + if ( TotalIndSend && + !(val_torecv = floatMalloc_dist(TotalIndSend)) ) + ABORT("Malloc fails for val_torecv[]."); + if ( TotalValSend && + !(val_tosend = floatMalloc_dist(TotalValSend)) ) + ABORT("Malloc fails for val_tosend[]."); + + gsmv_comm->extern_start = extern_start; + gsmv_comm->ind_tosend = ind_tosend; + gsmv_comm->ind_torecv = ind_torecv; + gsmv_comm->ptr_ind_tosend = ptr_ind_tosend; + gsmv_comm->ptr_ind_torecv = ptr_ind_torecv; + gsmv_comm->SendCounts = SendCounts; + gsmv_comm->RecvCounts = RecvCounts; + gsmv_comm->val_tosend = val_tosend; + gsmv_comm->val_torecv = val_torecv; + gsmv_comm->TotalIndSend = TotalIndSend; + gsmv_comm->TotalValSend = TotalValSend; + + SUPERLU_FREE(spa); + SUPERLU_FREE(send_req); + +#if ( DEBUGlevel>=2 ) + PrintInt10("psgsmv_init::rowptr", m_loc+1, rowptr); + PrintInt10("psgsmv_init::extern_start", m_loc, extern_start); +#endif +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit psgsmv_init()"); +#endif + +} /* PSGSMV_INIT */ + + +/* + * Performs sparse matrix-vector multiplication. + */ +void +psgsmv +( + int_t abs, /* Input. Do abs(A)*abs(x). */ + SuperMatrix *A_internal, /* Input. Matrix A permuted by columns. + The column indices are translated into + the relative positions in the gathered x-vector. + The type of A can be: + Stype = NR_loc; Dtype = SLU_S; Mtype = GE. */ + gridinfo_t *grid, /* Input */ + psgsmv_comm_t *gsmv_comm, /* Input. The data structure for communication. */ + float x[], /* Input. The distributed source vector */ + float ax[] /* Output. The distributed destination vector */ +) +{ + NRformat_loc *Astore; + int iam, procs; + int_t i, j, p, m, m_loc, n, fst_row, jcol; + int_t *colind, *rowptr; + int *SendCounts, *RecvCounts; + int_t *ind_tosend, *ind_torecv, *ptr_ind_tosend, *ptr_ind_torecv; + int_t *extern_start, TotalValSend; + float *nzval, *val_tosend, *val_torecv; + float zero = 0.0; + MPI_Request *send_req, *recv_req; + MPI_Status status; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(grid->iam, "Enter psgsmv()"); +#endif + + /* ------------------------------------------------------------ + INITIALIZATION. + ------------------------------------------------------------*/ + iam = grid->iam; + procs = grid->nprow * grid->npcol; + Astore = (NRformat_loc *) A_internal->Store; + m = A_internal->nrow; + n = A_internal->ncol; + m_loc = Astore->m_loc; + fst_row = Astore->fst_row; + colind = Astore->colind; + rowptr = Astore->rowptr; + nzval = (float *) Astore->nzval; + extern_start = gsmv_comm->extern_start; + ind_torecv = gsmv_comm->ind_torecv; + ptr_ind_tosend = gsmv_comm->ptr_ind_tosend; + ptr_ind_torecv = gsmv_comm->ptr_ind_torecv; + SendCounts = gsmv_comm->SendCounts; + RecvCounts = gsmv_comm->RecvCounts; + val_tosend = (float *) gsmv_comm->val_tosend; + val_torecv = (float *) gsmv_comm->val_torecv; + TotalValSend = gsmv_comm->TotalValSend; + + /* ------------------------------------------------------------ + COPY THE X VALUES INTO THE SEND BUFFER. + ------------------------------------------------------------*/ + for (i = 0; i < TotalValSend; ++i) { + j = ind_torecv[i] - fst_row; /* Relative index in x[] */ + val_tosend[i] = x[j]; + } + + /* ------------------------------------------------------------ + COMMUNICATE THE X VALUES. 
+ ------------------------------------------------------------*/ + if ( !(send_req = (MPI_Request *) + SUPERLU_MALLOC(2*procs *sizeof(MPI_Request)))) + ABORT("Malloc fails for recv_req[]."); + recv_req = send_req + procs; + for (p = 0; p < procs; ++p) { + if ( RecvCounts[p] ) { + MPI_Isend(&val_tosend[ptr_ind_torecv[p]], RecvCounts[p], + MPI_FLOAT, p, iam, + grid->comm, &send_req[p]); + } + if ( SendCounts[p] ) { + MPI_Irecv(&val_torecv[ptr_ind_tosend[p]], SendCounts[p], + MPI_FLOAT, p, p, + grid->comm, &recv_req[p]); + } + } + + /* ------------------------------------------------------------ + PERFORM THE ACTUAL MULTIPLICATION. + ------------------------------------------------------------*/ + if ( abs ) { /* Perform abs(A)*abs(x) */ + /* Multiply the local part. */ + for (i = 0; i < m_loc; ++i) { /* Loop through each row */ + ax[i] = 0.0; + for (j = rowptr[i]; j < extern_start[i]; ++j) { + jcol = colind[j]; + ax[i] += fabs(nzval[j]) * fabs(x[jcol]); + } + } + + for (p = 0; p < procs; ++p) { + if ( RecvCounts[p] ) MPI_Wait(&send_req[p], &status); + if ( SendCounts[p] ) MPI_Wait(&recv_req[p], &status); + } + + /* Multiply the external part. */ + for (i = 0; i < m_loc; ++i) { /* Loop through each row */ + for (j = extern_start[i]; j < rowptr[i+1]; ++j) { + jcol = colind[j]; + ax[i] += fabs(nzval[j]) * fabs(val_torecv[jcol]); + } + } + } else { + /* Multiply the local part. */ + for (i = 0; i < m_loc; ++i) { /* Loop through each row */ + ax[i] = zero; + for (j = rowptr[i]; j < extern_start[i]; ++j) { + jcol = colind[j]; + ax[i] += nzval[j] * x[jcol]; + } + } + + for (p = 0; p < procs; ++p) { + if ( RecvCounts[p] ) MPI_Wait(&send_req[p], &status); + if ( SendCounts[p] ) MPI_Wait(&recv_req[p], &status); + } + + /* Multiply the external part. */ + for (i = 0; i < m_loc; ++i) { /* Loop through each row */ + for (j = extern_start[i]; j < rowptr[i+1]; ++j) { + jcol = colind[j]; + ax[i] += nzval[j] * val_torecv[jcol]; + } + } + } + + SUPERLU_FREE(send_req); +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit psgsmv()"); +#endif + +} /* PSGSMV */ + +void psgsmv_finalize(psgsmv_comm_t *gsmv_comm) +{ + int_t *it; + float *dt; + SUPERLU_FREE(gsmv_comm->extern_start); + if ( (it = gsmv_comm->ind_tosend) ) SUPERLU_FREE(it); + if ( (it = gsmv_comm->ind_torecv) ) SUPERLU_FREE(it); + SUPERLU_FREE(gsmv_comm->ptr_ind_tosend); + SUPERLU_FREE(gsmv_comm->SendCounts); + if ( (dt = gsmv_comm->val_tosend) ) SUPERLU_FREE(dt); + if ( (dt = gsmv_comm->val_torecv) ) SUPERLU_FREE(dt); +} + diff --git a/SRC/psgsmv_AXglobal.c b/SRC/psgsmv_AXglobal.c new file mode 100644 index 00000000..b81b3192 --- /dev/null +++ b/SRC/psgsmv_AXglobal.c @@ -0,0 +1,324 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Performs sparse matrix-vector multiplication + * + *
+ * -- Distributed SuperLU routine (version 1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * 
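+ *
+ * psgsmv_AXglobal_setup() assigns each process a contiguous chunk of
+ * roughly n/nprocs rows, with each chunk boundary rounded to a
+ * supernode boundary.  An illustrative case: n = 10 rows on 3
+ * processes with supernodes {0..3}, {4..6}, {7..9} yields the chunks
+ * 0..3, 4..6 and 7..9.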
+ */ + +#include +#include "superlu_sdefs.h" + + +static void screate_msr_matrix(SuperMatrix *, int_t [], int_t, + float **, int_t **); +static void sPrintMSRmatrix(int, float [], int_t [], gridinfo_t *); + + +int psgsmv_AXglobal_setup +( + SuperMatrix *A, /* Matrix A permuted by columns (input). + The type of A can be: + Stype = SLU_NCP; Dtype = SLU_S; Mtype = SLU_GE. */ + Glu_persist_t *Glu_persist, /* input */ + gridinfo_t *grid, /* input */ + int_t *m, /* output */ + int_t *update[], /* output */ + float *val[], /* output */ + int_t *bindx[], /* output */ + int_t *mv_sup_to_proc /* output */ + ) +{ + int n; + int input_option; + int N_update; /* Number of variables updated on this process (output) */ + int iam = grid->iam; + int nprocs = grid->nprow * grid->npcol; + int_t *xsup = Glu_persist->xsup; + int_t *supno = Glu_persist->supno; + int_t nsupers; + int i, nsup, p, t1, t2, t3; + + + /* Initialize the list of global indices. + * NOTE: the list of global indices must be in ascending order. + */ + n = A->nrow; + input_option = SUPER_LINEAR; + nsupers = supno[n-1] + 1; + +#if ( DEBUGlevel>=2 ) + if ( !iam ) { + PrintInt10("xsup", supno[n-1]+1, xsup); + PrintInt10("supno", n, supno); + } +#endif + + if ( input_option == SUPER_LINEAR ) { /* Block partitioning based on + individual rows. */ + /* Figure out mv_sup_to_proc[] on all processes. */ + for (p = 0; p < nprocs; ++p) { + t1 = n / nprocs; /* Number of rows */ + t2 = n - t1 * nprocs; /* left-over, which will be assigned + to the first t2 processes. */ + if ( p >= t2 ) t2 += (p * t1); /* Starting row number */ + else { /* First t2 processes will get one more row. */ + ++t1; /* Number of rows. */ + t2 = p * t1; /* Starting row. */ + } + /* Make sure the starting and ending rows are at the + supernode boundaries. */ + t3 = t2 + t1; /* Ending row. */ + nsup = supno[t2]; + if ( t2 > xsup[nsup] ) { /* Round up the starting row. */ + t1 -= xsup[nsup+1] - t2; + t2 = xsup[nsup+1]; + } + nsup = supno[t3]; + if ( t3 > xsup[nsup] ) /* Round up the ending row. */ + t1 += xsup[nsup+1] - t3; + t3 = t2 + t1 - 1; + if ( t1 ) { + for (i = supno[t2]; i <= supno[t3]; ++i) { + mv_sup_to_proc[i] = p; +#if ( DEBUGlevel>=3 ) + if ( mv_sup_to_proc[i] == p-1 ) { + fprintf(stderr, + "mv_sup_to_proc conflicts at supno %d\n", i); + exit(-1); + } +#endif + } + } + + if ( iam == p ) { + N_update = t1; + if ( N_update ) { + if ( !(*update = intMalloc_dist(N_update)) ) + ABORT("Malloc fails for update[]"); + } + for (i = 0; i < N_update; ++i) (*update)[i] = t2 + i; +#if ( DEBUGlevel>=3 ) + printf("(%2d) N_update = %4d\t" + "supers %4d to %4d\trows %4d to %4d\n", + iam, N_update, supno[t2], supno[t3], t2, t3); +#endif + } + } /* for p ... */ + } else if ( input_option == SUPER_BLOCK ) { /* Block partitioning based on + individual supernodes. */ + /* This may cause bad load balance, because the blocks are usually + small in the beginning and large toward the end. */ + t1 = nsupers / nprocs; + t2 = nsupers - t1 * nprocs; /* left-over */ + if ( iam >= t2 ) t2 += (iam * t1); + else { + ++t1; /* Number of blocks. */ + t2 = iam * t1; /* Starting block. */ + } + N_update = xsup[t2+t1] - xsup[t2]; + if ( !(*update = intMalloc_dist(N_update)) ) + ABORT("Malloc fails for update[]"); + for (i = 0; i < N_update; ++i) (*update)[i] = xsup[t2] + i; + } + + + /* Create an MSR matrix in val/bindx to be used by pdgsmv(). 
*/ + screate_msr_matrix(A, *update, N_update, val, bindx); + +#if ( DEBUGlevel>=2 ) + PrintInt10("mv_sup_to_proc", nsupers, mv_sup_to_proc); + sPrintMSRmatrix(N_update, *val, *bindx, grid); +#endif + + *m = N_update; + return 0; +} /* PSGSMV_AXglobal_SETUP */ + + +/*! \brief + * + *
+ * Create the distributed modified sparse row (MSR) matrix: bindx/val.
+ * For a submatrix of size m-by-n, the MSR arrays are as follows:
+ *    bindx[0]      = m + 1
+ *    bindx[0..m]   = pointer to start of each row
+ *    bindx[ks..ke] = column indices of the off-diagonal nonzeros in row k,
+ *                    where ks = bindx[k], ke = bindx[k+1]-1
+ *    val[k]        = A(k,k), k < m, diagonal elements
+ *    val[m]        = not used
+ *    val[ki]       = A(k, bindx[ki]), where ks <= ki <= ke
+ * Both arrays are of length nnz + 1.
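+ *
+ * Example (illustrative): the 3-by-3 submatrix
+ *    [ 4 0 1 ]
+ *    [ 0 5 0 ]
+ *    [ 2 0 6 ]
+ * has nnz = 5 and is stored as
+ *    bindx = { 4, 5, 5, 6, 2, 0 },  val = { 4., 5., 6., -, 1., 2. }
+ * where bindx[0..3] = {4,5,5,6} are the row pointers, val[0..2] holds
+ * the diagonal, val[3] is unused, and val[4..5] hold A(0,2) and A(2,0).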
+ * 
+*/ +static void screate_msr_matrix +( + SuperMatrix *A, /* Matrix A permuted by columns (input). + The type of A can be: + Stype = SLU_NCP; Dtype = SLU_S; Mtype = SLU_GE. */ + int_t update[], /* input (local) */ + int_t N_update, /* input (local) */ + float **val, /* output */ + int_t **bindx /* output */ +) +{ + int hi, i, irow, j, k, lo, n, nnz_local, nnz_diag; + NCPformat *Astore; + float *nzval; + int_t *rowcnt; + double zero = 0.0; + + if ( !N_update ) return; + + n = A->ncol; + Astore = A->Store; + nzval = Astore->nzval; + + /* One pass of original matrix A to count nonzeros of each row. */ + if ( !(rowcnt = (int_t *) intCalloc_dist(N_update)) ) + ABORT("Malloc fails for rowcnt[]"); + lo = update[0]; + hi = update[N_update-1]; + nnz_local = 0; + nnz_diag = 0; + for (j = 0; j < n; ++j) { + for (i = Astore->colbeg[j]; i < Astore->colend[j]; ++i) { + irow = Astore->rowind[i]; + if ( irow >= lo && irow <= hi ) { + if ( irow != j ) /* Exclude diagonal */ + ++rowcnt[irow - lo]; + else ++nnz_diag; /* Count nonzero diagonal entries */ + ++nnz_local; + } + } + } + + /* Add room for the logical diagonal zeros which are not counted + in nnz_local. */ + nnz_local += (N_update - nnz_diag); + + /* Allocate storage for bindx[] and val[]. */ + if ( !(*val = (float *) floatMalloc_dist(nnz_local+1)) ) + ABORT("Malloc fails for val[]"); + for (i = 0; i < N_update; ++i) (*val)[i] = zero; /* Initialize diagonal */ + if ( !(*bindx = (int_t *) SUPERLU_MALLOC((nnz_local+1) * sizeof(int_t))) ) + ABORT("Malloc fails for bindx[]"); + + /* Set up row pointers. */ + (*bindx)[0] = N_update + 1; + for (j = 1; j <= N_update; ++j) { + (*bindx)[j] = (*bindx)[j-1] + rowcnt[j-1]; + rowcnt[j-1] = (*bindx)[j-1]; + } + + /* One pass of original matrix A to fill in matrix entries. */ + for (j = 0; j < n; ++j) { + for (i = Astore->colbeg[j]; i < Astore->colend[j]; ++i) { + irow = Astore->rowind[i]; + if ( irow >= lo && irow <= hi ) { + if ( irow == j ) /* Diagonal */ + (*val)[irow - lo] = nzval[i]; + else { + irow -= lo; + k = rowcnt[irow]; + (*bindx)[k] = j; + (*val)[k] = nzval[i]; + ++rowcnt[irow]; + } + } + } + } + + SUPERLU_FREE(rowcnt); +} + +/*! \brief + * + *
+ * Performs sparse matrix-vector multiplication.
+ *   - val/bindx stores the distributed MSR matrix A
+ *   - X is global
+ *   - ax product is distributed the same way as A
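+ *   - for local row i (global row update[i]) the routine computes
+ *         ax[i] = val[i]*X[update[i]]
+ *               + sum of val[k]*X[bindx[k]] over k in [bindx[i], bindx[i+1])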
+ * 
+ */ +int +psgsmv_AXglobal(int_t m, int_t update[], float val[], int_t bindx[], + float X[], float ax[]) +{ + int_t i, j, k; + + if ( m <= 0 ) return 0; /* number of rows (local) */ + + for (i = 0; i < m; ++i) { + ax[i] = 0.0; + + for (k = bindx[i]; k < bindx[i+1]; ++k) { + j = bindx[k]; /* column index */ + ax[i] += val[k] * X[j]; + } + ax[i] += val[i] * X[update[i]]; /* diagonal */ + } + return 0; +} /* PSGSMV_AXglobal */ + +/* + * Performs sparse matrix-vector multiplication. + * - val/bindx stores the distributed MSR matrix A + * - X is global + * - ax product is distributed the same way as A + */ +int +psgsmv_AXglobal_abs(int_t m, int_t update[], float val[], int_t bindx[], + float X[], float ax[]) +{ + int_t i, j, k; + + if ( m <= 0 ) return 0; /* number of rows (local) */ + + for (i = 0; i < m; ++i) { + ax[i] = 0.0; + for (k = bindx[i]; k < bindx[i+1]; ++k) { + j = bindx[k]; /* column index */ + ax[i] += fabs(val[k]) * fabs(X[j]); + } + ax[i] += fabs(val[i]) * fabs(X[update[i]]); /* diagonal */ + } + + return 0; +} /* PSGSMV_AXglobal_ABS */ + +/* + * Print the local MSR matrix + */ +static void sPrintMSRmatrix +( + int m, /* Number of rows of the submatrix. */ + float val[], + int_t bindx[], + gridinfo_t *grid +) +{ + int iam, nnzp1; + + if ( !m ) return; + + iam = grid->iam; + nnzp1 = bindx[m]; + printf("(%2d) MSR submatrix has %d rows -->\n", iam, m); + Printfloat5("val", nnzp1, val); + PrintInt10("bindx", nnzp1, bindx); +} diff --git a/SRC/psgsrfs.c b/SRC/psgsrfs.c new file mode 100644 index 00000000..db734585 --- /dev/null +++ b/SRC/psgsrfs.c @@ -0,0 +1,260 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Improves the computed solution to a system of linear equations and provides error bounds and backward error estimates + * + *
+ * -- Distributed SuperLU routine (version 4.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ *
+ * Last modified:
+ * December 31, 2015
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_sdefs.h"
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * PSGSRFS improves the computed solution to a system of linear
+ * equations and provides error bounds and backward error estimates
+ * for the solution.
+ *
+ * Arguments
+ * =========
+ *
+ * n      (input) int (global)
+ *        The order of the system of linear equations.
+ *
+ * A      (input) SuperMatrix*
+ *	  The original matrix A, or the scaled A if equilibration was done.
+ *        A is also permuted into diag(R)*A*diag(C)*Pc'. The type of A can be:
+ *        Stype = SLU_NR_loc; Dtype = SLU_S; Mtype = SLU_GE.
+ *
+ * anorm  (input) float
+ *        The norm of the original matrix A, or the scaled A if
+ *        equilibration was done.
+ *
+ * LUstruct (input) sLUstruct_t*
+ *        The distributed data structures storing L and U factors.
+ *        The L and U factors are obtained from psgstrf for
+ *        the possibly scaled and permuted matrix A.
+ *        See superlu_sdefs.h for the definition of 'sLUstruct_t'.
+ *
+ * ScalePermstruct (input) sScalePermstruct_t* (global)
+ *         The data structure to store the scaling and permutation vectors
+ *         describing the transformations performed to the matrix A.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh. It contains the MPI communicator, the number
+ *        of process rows (NPROW), the number of process columns (NPCOL),
+ *        and my process rank. It is an input argument to all the
+ *        parallel routines.
+ *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *        See superlu_defs.h for the definition of 'gridinfo_t'.
+ *
+ * B      (input) float* (local)
+ *        The m_loc-by-NRHS right-hand side matrix of the possibly
+ *        equilibrated system. That is, B may be overwritten by diag(R)*B.
+ *
+ * ldb    (input) int (local)
+ *        Leading dimension of matrix B.
+ *
+ * X      (input/output) float* (local)
+ *        On entry, the solution matrix Y, as computed by PSGSTRS, of the
+ *            transformed system A1*Y = Pc*Pr*B, where
+ *            A1 = Pc*Pr*diag(R)*A*diag(C)*Pc' and Y = Pc*diag(C)^(-1)*X.
+ *        On exit, the improved solution matrix Y.
+ *
+ *        In order to obtain the solution X to the original system,
+ *        Y should be permuted by Pc^T, and premultiplied by diag(C)
+ *        if DiagScale = COL or BOTH.
+ *        This must be done after this routine is called.
+ *
+ * ldx    (input) int (local)
+ *        Leading dimension of matrix X.
+ *
+ * nrhs   (input) int
+ *        Number of right-hand sides.
+ *
+ * SOLVEstruct (output) sSOLVEstruct_t* (global)
+ *        Contains the information for the communication during the
+ *        solution phase.
+ *
+ * berr   (output) float*, dimension (nrhs)
+ *         The componentwise relative backward error of each solution
+ *         vector X(j) (i.e., the smallest relative change in
+ *         any element of A or B that makes X(j) an exact solution).
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics about the refinement steps.
+ *        See util.h for the definition of SuperLUStat_t.
+ *
+ * info   (output) int*
+ *        = 0: successful exit
+ *        < 0: if info = -i, the i-th argument had an illegal value
+ *
+ * Internal Parameters
+ * ===================
+ *
+ * ITMAX is the maximum number of steps of iterative refinement.
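+ *
+ * A typical call sequence is sketched below (illustrative only; error
+ * handling omitted, and SOLVEstruct must have been initialized by a
+ * prior solve):
+ *
+ *     psgstrs(n, LUstruct, ScalePermstruct, grid, X, m_loc,
+ *             fst_row, ldx, nrhs, SOLVEstruct, stat, &info);
+ *     psgsrfs(n, A, anorm, LUstruct, ScalePermstruct, grid,
+ *             B, ldb, X, ldx, nrhs, SOLVEstruct, berr, stat, &info);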
+ * </pre>
+ */
+void
+psgsrfs(int_t n, SuperMatrix *A, float anorm, sLUstruct_t *LUstruct,
+        sScalePermstruct_t *ScalePermstruct, gridinfo_t *grid,
+        float *B, int_t ldb, float *X, int_t ldx, int nrhs,
+        sSOLVEstruct_t *SOLVEstruct,
+        float *berr, SuperLUStat_t *stat, int *info)
+{
+#define ITMAX 20
+
+    float *ax, *R, *dx, *temp, *work, *B_col, *X_col;
+    int_t count, i, j, lwork, nz;
+    int   iam;
+    float eps, lstres;
+    float s, safmin, safe1, safe2;
+
+    /* Data structures used by matrix-vector multiply routine. */
+    psgsmv_comm_t *gsmv_comm = SOLVEstruct->gsmv_comm;
+    NRformat_loc *Astore;
+    int_t m_loc, fst_row;
+
+
+    /* Initialization. */
+    Astore = (NRformat_loc *) A->Store;
+    m_loc = Astore->m_loc;
+    fst_row = Astore->fst_row;
+    iam = grid->iam;
+
+    /* Test the input parameters. */
+    *info = 0;
+    if ( n < 0 ) *info = -1;
+    else if ( A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc
+              || A->Dtype != SLU_S || A->Mtype != SLU_GE )
+        *info = -2;
+    else if ( ldb < SUPERLU_MAX(0, m_loc) ) *info = -10;
+    else if ( ldx < SUPERLU_MAX(0, m_loc) ) *info = -12;
+    else if ( nrhs < 0 ) *info = -13;
+    if (*info != 0) {
+        i = -(*info);
+        pxerr_dist("PSGSRFS", grid, i);
+        return;
+    }
+
+    /* Quick return if possible. */
+    if ( n == 0 || nrhs == 0 ) {
+        return;
+    }
+
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter psgsrfs()");
+#endif
+
+    lwork = 2 * m_loc;  /* For ax/R/dx and temp */
+    if ( !(work = floatMalloc_dist(lwork)) )
+        ABORT("Malloc fails for work[]");
+    ax = R = dx = work;
+    temp = ax + m_loc;
+
+    /* NZ = maximum number of nonzero elements in each row of A, plus 1 */
+    nz     = A->ncol + 1;
+    eps    = smach_dist("Epsilon");
+    safmin = smach_dist("Safe minimum");
+
+    /* Set SAFE1 essentially to be the underflow threshold times the
+       number of additions in each row. */
+    safe1  = nz * safmin;
+    safe2  = safe1 / eps;
+
+#if ( DEBUGlevel>=1 )
+    if ( !iam ) printf(".. eps = %e\tanorm = %e\tsafe1 = %e\tsafe2 = %e\n",
+                       eps, anorm, safe1, safe2);
+#endif
+
+    /* Do for each right-hand side ... */
+    for (j = 0; j < nrhs; ++j) {
+        count = 0;
+        lstres = 3.;
+        B_col = &B[j*ldb];
+        X_col = &X[j*ldx];
+
+        while (1) { /* Loop until stopping criterion is satisfied. */
+
+            /* Compute residual R = B - op(A) * X,
+               where op(A) = A, A**T, or A**H, depending on TRANS. */
+
+            /* Matrix-vector multiply. */
+            psgsmv(0, A, grid, gsmv_comm, X_col, ax);
+
+            /* Compute residual, stored in R[]. */
+            for (i = 0; i < m_loc; ++i) R[i] = B_col[i] - ax[i];
+
+            /* Compute abs(op(A))*abs(X) + abs(B), stored in temp[]. */
+            psgsmv(1, A, grid, gsmv_comm, X_col, temp);
+            for (i = 0; i < m_loc; ++i) temp[i] += fabs(B_col[i]);
+
+            s = 0.0;
+            for (i = 0; i < m_loc; ++i) {
+                if ( temp[i] > safe2 ) {
+                    s = SUPERLU_MAX(s, fabs(R[i]) / temp[i]);
+                } else if ( temp[i] != 0.0 ) {
+                    /* Adding SAFE1 to the numerator guards against
+                       spuriously zero residuals (underflow). */
+                    s = SUPERLU_MAX(s, (safe1 + fabs(R[i])) / temp[i]);
+                }
+                /* If temp[i] is exactly 0.0 (computed by PxGSMV), then
+                   we know the true residual also must be exactly 0.0. */
+            }
+            MPI_Allreduce( &s, &berr[j], 1, MPI_FLOAT, MPI_MAX, grid->comm );
+
+#if ( PRNTlevel>= 1 )
+            if ( !iam )
+                printf("(%2d) .. Step " IFMT ": berr[j] = %e\n", iam, count, berr[j]);
+#endif
+            if ( berr[j] > eps && berr[j] * 2 <= lstres && count < ITMAX ) {
+                /* Compute new dx. */
+                psgstrs(n, LUstruct, ScalePermstruct, grid,
+                        dx, m_loc, fst_row, m_loc, 1,
+                        SOLVEstruct, stat, info);
+
+                /* Update solution. */
+                for (i = 0; i < m_loc; ++i) X_col[i] += dx[i];
+
+                lstres = berr[j];
+                ++count;
+            } else {
+                break;
+            }
+        } /* end while */
+
+        stat->RefineSteps = count;
+
+    } /* for j ... */
+
+    /* Deallocate storage. */
+    SUPERLU_FREE(work);
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit psgsrfs()");
+#endif
+
+} /* PSGSRFS */
+
diff --git a/SRC/psgsrfs_ABXglobal.c b/SRC/psgsrfs_ABXglobal.c
new file mode 100644
index 00000000..989f591d
--- /dev/null
+++ b/SRC/psgsrfs_ABXglobal.c
@@ -0,0 +1,465 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Improves the computed solution and provides error bounds
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ *
+ * Last modified:
+ * December 31, 2015  version 4.3
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_sdefs.h"
+
+/*-- Function prototypes --*/
+static void gather_1rhs_diag_to_all(int_t, float [], Glu_persist_t *,
+                                    sLocalLU_t *, gridinfo_t *, int_t, int_t [],
+                                    int_t [], float [], float []);
+static void redist_all_to_diag(int_t, float [], Glu_persist_t *,
+                               sLocalLU_t *, gridinfo_t *, int_t [], float []);
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * psgsrfs_ABXglobal improves the computed solution to a system of linear
+ * equations and provides error bounds and backward error estimates
+ * for the solution.
+ *
+ * Arguments
+ * =========
+ *
+ * n      (input) int (global)
+ *        The order of the system of linear equations.
+ *
+ * A      (input) SuperMatrix*
+ *	  The original matrix A, or the scaled A if equilibration was done.
+ *        A is also permuted into the form Pc*Pr*A*Pc', where Pr and Pc
+ *        are permutation matrices. The type of A can be:
+ *        Stype = SLU_NCP; Dtype = SLU_S; Mtype = SLU_GE.
+ *
+ *        NOTE: Currently, A must reside in all processes when calling
+ *              this routine.
+ *
+ * anorm  (input) float
+ *        The norm of the original matrix A, or the scaled A if
+ *        equilibration was done.
+ *
+ * LUstruct (input) sLUstruct_t*
+ *        The distributed data structures storing L and U factors.
+ *        The L and U factors are obtained from psgstrf for
+ *        the possibly scaled and permuted matrix A.
+ *        See superlu_sdefs.h for the definition of 'sLUstruct_t'.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh. It contains the MPI communicator, the number
+ *        of process rows (NPROW), the number of process columns (NPCOL),
+ *        and my process rank. It is an input argument to all the
+ *        parallel routines.
+ *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *        See superlu_defs.h for the definition of 'gridinfo_t'.
+ *
+ * B      (input) float* (global)
+ *        The N-by-NRHS right-hand side matrix of the possibly equilibrated
+ *        and row permuted system.
+ *
+ *        NOTE: Currently, B must reside on all processes when calling
+ *              this routine.
+ *
+ * ldb    (input) int (global)
+ *        Leading dimension of matrix B.
+ *
+ * X      (input/output) float* (global)
+ *        On entry, the solution matrix X, as computed by PSGSTRS.
+ *        On exit, the improved solution matrix X.
+ *        If DiagScale = COL or BOTH, X should be premultiplied by diag(C)
+ *        in order to obtain the solution to the original system.
+ *
+ *        NOTE: Currently, X must reside on all processes when calling
+ *              this routine.
+ *
+ * ldx    (input) int (global)
+ *        Leading dimension of matrix X.
+ *
+ * nrhs   (input) int
+ *        Number of right-hand sides.
+ *
+ * berr   (output) float*, dimension (nrhs)
+ *         The componentwise relative backward error of each solution
+ *         vector X(j) (i.e., the smallest relative change in
+ *         any element of A or B that makes X(j) an exact solution).
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics about the refinement steps.
+ *        See util.h for the definition of SuperLUStat_t.
+ *
+ * info   (output) int*
+ *        = 0: successful exit
+ *        < 0: if info = -i, the i-th argument had an illegal value
+ *
+ * Internal Parameters
+ * ===================
+ *
+ * ITMAX is the maximum number of steps of iterative refinement.
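+ *
+ * The backward error computed here is the componentwise (Oettli-Prager)
+ * measure
+ *
+ *     berr = max_i | b - A*x |_i / ( |A|*|x| + |b| )_i ,
+ *
+ * evaluated over the locally updated rows and reduced with MPI_MAX
+ * across the process grid.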
+ * </pre>
+ */
+
+void
+psgsrfs_ABXglobal(int_t n, SuperMatrix *A, float anorm, sLUstruct_t *LUstruct,
+                  gridinfo_t *grid, float *B, int_t ldb, float *X, int_t ldx,
+                  int nrhs, float *berr, SuperLUStat_t *stat, int *info)
+{
+
+
+#define ITMAX 20
+
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    sLocalLU_t *Llu = LUstruct->Llu;
+    /*
+     * Data structures used by matrix-vector multiply routine.
+     */
+    int_t N_update;  /* Number of variables updated on this process */
+    int_t *update;   /* vector elements (global index) updated
+                        on this processor. */
+    int_t *bindx;
+    float *val;
+    int_t *mv_sup_to_proc;  /* Supernode to process mapping in
+                               matrix-vector multiply. */
+    /*-- end data structures for matrix-vector multiply --*/
+    float *b, *ax, *R, *B_col, *temp, *work, *X_col,
+          *x_trs, *dx_trs;
+    int_t count, ii, j, jj, k, knsupc, lk, lwork,
+          nprow, nsupers, nz, p;
+    int   i, iam, pkk;
+    int_t *ilsum, *xsup;
+    float eps, lstres;
+    float s, safmin, safe1, safe2;
+
+    /* NEW STUFF */
+    int_t num_diag_procs, *diag_procs; /* Record diagonal process numbers. */
+    int_t *diag_len; /* Length of the X vector on diagonal processes. */
+
+    /*-- Function prototypes --*/
+    extern void psgstrs1(int_t, sLUstruct_t *, gridinfo_t *,
+                         float *, int, SuperLUStat_t *, int *);
+
+    /* Test the input parameters. */
+    *info = 0;
+    if ( n < 0 ) *info = -1;
+    else if ( A->nrow != A->ncol || A->nrow < 0 ||
+              A->Stype != SLU_NCP || A->Dtype != SLU_S || A->Mtype != SLU_GE )
+        *info = -2;
+    else if ( ldb < SUPERLU_MAX(0, n) ) *info = -10;
+    else if ( ldx < SUPERLU_MAX(0, n) ) *info = -12;
+    else if ( nrhs < 0 ) *info = -13;
+    if (*info != 0) {
+        i = -(*info);
+        pxerr_dist("psgsrfs_ABXglobal", grid, i);
+        return;
+    }
+
+    /* Quick return if possible. */
+    if ( n == 0 || nrhs == 0 ) {
+        return;
+    }
+
+    /* Initialization. */
+    iam = grid->iam;
+    nprow = grid->nprow;
+    nsupers = Glu_persist->supno[n-1] + 1;
+    xsup = Glu_persist->xsup;
+    ilsum = Llu->ilsum;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter psgsrfs_ABXglobal()");
+#endif
+
+    get_diag_procs(n, Glu_persist, grid, &num_diag_procs,
+                   &diag_procs, &diag_len);
+#if ( PRNTlevel>=1 )
+    if ( !iam ) {
+        printf(".. number of diag processes = " IFMT "\n", num_diag_procs);
+        PrintInt10("diag_procs", num_diag_procs, diag_procs);
+        PrintInt10("diag_len", num_diag_procs, diag_len);
+    }
+#endif
+
+    if ( !(mv_sup_to_proc = intCalloc_dist(nsupers)) )
+        ABORT("Calloc fails for mv_sup_to_proc[]");
+
+    psgsmv_AXglobal_setup(A, Glu_persist, grid, &N_update, &update,
+                          &val, &bindx, mv_sup_to_proc);
+
+    i = CEILING( nsupers, nprow ); /* Number of local block rows */
+    ii = Llu->ldalsum + i * XK_H;
+    k = SUPERLU_MAX(N_update, sp_ienv_dist(3));
+    jj = diag_len[0];
+    for (j = 1; j < num_diag_procs; ++j) jj = SUPERLU_MAX( jj, diag_len[j] );
+    jj = SUPERLU_MAX( jj, N_update );
+    lwork = N_update /* For ax and R */
+          + ii       /* For dx_trs */
+          + ii       /* For x_trs */
+          + k        /* For b */
+          + jj;      /* for temp */
+    if ( !(work = floatMalloc_dist(lwork)) )
+        ABORT("Malloc fails for work[]");
+    ax = R = work;
+    dx_trs = work + N_update;
+    x_trs  = dx_trs + ii;
+    b      = x_trs + ii;
+    temp   = b + k;
+
+#if ( DEBUGlevel>=2 )
+    {
+        float *dwork = floatMalloc_dist(n);
+        for (i = 0; i < n; ++i) {
+            if ( i & 1 ) dwork[i] = 1.;
+            else dwork[i] = 2.;
+        }
+        /* Check correctness of matrix-vector multiply. */
+        psgsmv_AXglobal(N_update, update, val, bindx, dwork, ax);
+        Printfloat5("Mult A*x", N_update, ax);
+        SUPERLU_FREE(dwork);
+    }
+#endif
+
+
+    /* NZ = maximum number of nonzero elements in each row of A, plus 1 */
+    nz     = A->ncol + 1;
+    eps    = smach_dist("Epsilon");
+    safmin = smach_dist("Safe minimum");
+
+    /* Set SAFE1 essentially to be the underflow threshold times the
+       number of additions in each row. */
+    safe1  = nz * safmin;
+    safe2  = safe1 / eps;
+
+#if ( DEBUGlevel>=1 )
+    if ( !iam ) printf(".. eps = %e\tanorm = %e\tsafe1 = %e\tsafe2 = %e\n",
+                       eps, anorm, safe1, safe2);
+#endif
+
+    /* Do for each right-hand side ... */
+    for (j = 0; j < nrhs; ++j) {
+        count = 0;
+        lstres = 3.;
+
+        /* Copy X into x on the diagonal processes. */
+        B_col = &B[j*ldb];
+        X_col = &X[j*ldx];
+        for (p = 0; p < num_diag_procs; ++p) {
+            pkk = diag_procs[p];
+            if ( iam == pkk ) {
+                for (k = p; k < nsupers; k += num_diag_procs) {
+                    knsupc = SuperSize( k );
+                    lk = LBi( k, grid );
+                    ii = ilsum[lk] + (lk+1)*XK_H;
+                    jj = FstBlockC( k );
+                    for (i = 0; i < knsupc; ++i) x_trs[i+ii] = X_col[i+jj];
+                    dx_trs[ii-XK_H] = k; /* Block number prepended in header. */
+                }
+            }
+        }
+        /* Copy B into b distributed the same way as matrix-vector product. */
+        if ( N_update ) ii = update[0];
+        for (i = 0; i < N_update; ++i) b[i] = B_col[i + ii];
+
+        while (1) { /* Loop until stopping criterion is satisfied. */
+
+            /* Compute residual R = B - op(A) * X,
+               where op(A) = A, A**T, or A**H, depending on TRANS. */
+
+            /* Matrix-vector multiply. */
+            psgsmv_AXglobal(N_update, update, val, bindx, X_col, ax);
+
+            /* Compute residual. */
+            for (i = 0; i < N_update; ++i) R[i] = b[i] - ax[i];
+
+            /* Compute abs(op(A))*abs(X) + abs(B). */
+            psgsmv_AXglobal_abs(N_update, update, val, bindx, X_col, temp);
+            for (i = 0; i < N_update; ++i) temp[i] += fabs(b[i]);
+
+            s = 0.0;
+            for (i = 0; i < N_update; ++i) {
+                if ( temp[i] > safe2 ) {
+                    s = SUPERLU_MAX(s, fabs(R[i]) / temp[i]);
+                } else if ( temp[i] != 0.0 ) {
+                    /* Adding SAFE1 to the numerator guards against
+                       spuriously zero residuals (underflow). */
+                    s = SUPERLU_MAX(s, (safe1 + fabs(R[i])) / temp[i]);
+                }
+                /* If temp[i] is exactly 0.0 (computed by PxGSMV), then
+                   we know the true residual also must be exactly 0.0. */
+            }
+            MPI_Allreduce( &s, &berr[j], 1, MPI_FLOAT, MPI_MAX, grid->comm );
+
+#if ( PRNTlevel>= 1 )
+            if ( !iam )
+                printf("(%2d) .. Step " IFMT ": berr[j] = %e\n", iam, count, berr[j]);
+#endif
+            if ( berr[j] > eps && berr[j] * 2 <= lstres && count < ITMAX ) {
+                /* Compute new dx. */
+                redist_all_to_diag(n, R, Glu_persist, Llu, grid,
+                                   mv_sup_to_proc, dx_trs);
+                psgstrs1(n, LUstruct, grid, dx_trs, 1, stat, info);
+
+                /* Update solution. */
+                for (p = 0; p < num_diag_procs; ++p)
+                    if ( iam == diag_procs[p] )
+                        for (k = p; k < nsupers; k += num_diag_procs) {
+                            lk = LBi( k, grid );
+                            ii = ilsum[lk] + (lk+1)*XK_H;
+                            knsupc = SuperSize( k );
+                            for (i = 0; i < knsupc; ++i)
+                                x_trs[i + ii] += dx_trs[i + ii];
+                        }
+                lstres = berr[j];
+                ++count;
+                /* Transfer x_trs (on diagonal processes) into X
+                   (on all processes). */
+                gather_1rhs_diag_to_all(n, x_trs, Glu_persist, Llu, grid,
+                                        num_diag_procs, diag_procs, diag_len,
+                                        X_col, temp);
+            } else {
+                break;
+            }
+        } /* end while */
+
+        stat->RefineSteps = count;
+
+    } /* for j ... */
+
+
+    /* Deallocate storage used by matrix-vector multiplication. */
+    SUPERLU_FREE(diag_procs);
+    SUPERLU_FREE(diag_len);
+    if ( N_update ) {
+        SUPERLU_FREE(update);
+        SUPERLU_FREE(bindx);
+        SUPERLU_FREE(val);
+    }
+    SUPERLU_FREE(mv_sup_to_proc);
+    SUPERLU_FREE(work);
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit psgsrfs_ABXglobal()");
+#endif
+
+} /* PSGSRFS_ABXGLOBAL */
+
+
+/*! \brief
+ *
+ * <pre>
+ * r[] is the residual vector distributed the same way as
+ * matrix-vector product.
+ * </pre>
+ */
+static void
+redist_all_to_diag(int_t n, float r[], Glu_persist_t *Glu_persist,
+                   sLocalLU_t *Llu, gridinfo_t *grid, int_t mv_sup_to_proc[],
+                   float work[])
+{
+    int_t i, ii, k, lk, lr, nsupers;
+    int_t *ilsum, *xsup;
+    int   iam, knsupc, psrc, pkk;
+    MPI_Status status;
+
+    iam = grid->iam;
+    nsupers = Glu_persist->supno[n-1] + 1;
+    xsup = Glu_persist->xsup;
+    ilsum = Llu->ilsum;
+    lr = 0;
+
+    for (k = 0; k < nsupers; ++k) {
+        pkk = PNUM( PROW( k, grid ), PCOL( k, grid ), grid );
+        psrc = mv_sup_to_proc[k];
+        knsupc = SuperSize( k );
+        lk = LBi( k, grid );
+        ii = ilsum[lk] + (lk+1)*XK_H;
+        if ( iam == psrc ) {
+            if ( iam != pkk ) { /* Send X component. */
+                MPI_Send( &r[lr], knsupc, MPI_FLOAT, pkk, Xk,
+                          grid->comm );
+            } else { /* Local copy. */
+                for (i = 0; i < knsupc; ++i)
+                    work[i + ii] = r[i + lr];
+            }
+            lr += knsupc;
+        } else {
+            if ( iam == pkk ) { /* Recv X component. */
+                MPI_Recv( &work[ii], knsupc, MPI_FLOAT, psrc, Xk,
+                          grid->comm, &status );
+            }
+        }
+    }
+} /* REDIST_ALL_TO_DIAG */
+
+
+/*! \brief
+ *
+ * <pre>
+ * Gather the components of x vector on the diagonal processes
+ * onto all processes, and combine them into the global vector y.
+ * </pre>
+ */
+static void
+gather_1rhs_diag_to_all(int_t n, float x[],
+                        Glu_persist_t *Glu_persist, sLocalLU_t *Llu,
+                        gridinfo_t *grid, int_t num_diag_procs,
+                        int_t diag_procs[], int_t diag_len[],
+                        float y[], float work[])
+{
+    int_t i, ii, k, lk, lwork, nsupers, p;
+    int_t *ilsum, *xsup;
+    int   iam, knsupc, pkk;
+
+    iam = grid->iam;
+    nsupers = Glu_persist->supno[n-1] + 1;
+    xsup = Glu_persist->xsup;
+    ilsum = Llu->ilsum;
+
+    for (p = 0; p < num_diag_procs; ++p) {
+        pkk = diag_procs[p];
+        if ( iam == pkk ) {
+            /* Copy x vector into a buffer. */
+            lwork = 0;
+            for (k = p; k < nsupers; k += num_diag_procs) {
+                knsupc = SuperSize( k );
+                lk = LBi( k, grid );
+                ii = ilsum[lk] + (lk+1)*XK_H;
+                for (i = 0; i < knsupc; ++i) work[i+lwork] = x[i+ii];
+                lwork += knsupc;
+            }
+            MPI_Bcast( work, lwork, MPI_FLOAT, pkk, grid->comm );
+        } else {
+            MPI_Bcast( work, diag_len[p], MPI_FLOAT, pkk, grid->comm );
+        }
+        /* Scatter work[] into global y vector. */
+        lwork = 0;
+        for (k = p; k < nsupers; k += num_diag_procs) {
+            knsupc = SuperSize( k );
+            ii = FstBlockC( k );
+            for (i = 0; i < knsupc; ++i) y[i+ii] = work[i+lwork];
+            lwork += knsupc;
+        }
+    }
+} /* GATHER_1RHS_DIAG_TO_ALL */
+
diff --git a/SRC/psgssvx.c b/SRC/psgssvx.c
new file mode 100644
index 00000000..73020a02
--- /dev/null
+++ b/SRC/psgssvx.c
@@ -0,0 +1,1579 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Solves a system of linear equations A*X=B
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 6.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * November 1, 2007
+ * October 22, 2012
+ * October  1, 2014
+ * April 5, 2015
+ * December 31, 2015  version 4.3
+ * December 31, 2016  version 5.1.3
+ * April 10, 2018  version 5.3
+ * September 18, 2018  version 6.0
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_sdefs.h"
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * PSGSSVX solves a system of linear equations A*X=B,
+ * by using Gaussian elimination with "static pivoting" to
+ * compute the LU factorization of A.
+ *
+ * Static pivoting is a technique that combines the numerical stability
+ * of partial pivoting with the scalability of Cholesky (no pivoting),
+ * to run accurately and efficiently on large numbers of processors.
+ * See our paper at http://www.nersc.gov/~xiaoye/SuperLU/ for a detailed
+ * description of the parallel algorithms.
+ *
+ * The input matrices A and B are distributed by block rows.
+ * Here is a graphical illustration (0-based indexing):
+ *
+ *                        A                B
+ *               0 ---------------       ------
+ *                   |           |        |  |
+ *                   |           |   P0   |  |
+ *                   |           |        |  |
+ *                 ---------------       ------
+ *        - fst_row->|           |        |  |
+ *        |          |           |        |  |
+ *       m_loc       |           |   P1   |  |
+ *        |          |           |        |  |
+ *        -          |           |        |  |
+ *                 ---------------       ------
+ *                   |    .      |        |. |
+ *                   |    .      |        |. |
+ *                   |    .      |        |. |
+ *                 ---------------       ------
+ *
+ * where fst_row is the row number of the first local row, and
+ *       m_loc is the number of rows local to this processor.
+ * These are defined in the 'SuperMatrix' structure, see supermatrix.h.
+ *
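+ * For example, a simple block-row distribution of an n-by-n matrix over
+ * nprocs processes can be set up as follows (an illustrative sketch;
+ * the last process takes the remainder rows):
+ *
+ *     m_loc   = n / nprocs;
+ *     fst_row = iam * m_loc;
+ *     if ( iam == nprocs-1 ) m_loc = n - fst_row;
+ *     sCreate_CompRowLoc_Matrix_dist(&A, n, n, nnz_loc, m_loc, fst_row,
+ *                                    nzval, colind, rowptr,
+ *                                    SLU_NR_loc, SLU_S, SLU_GE);
+ *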
+ *
+ * Here are the options for using this code:
+ *
+ *   1. Independent of all the other options specified below, the
+ *      user must supply
+ *
+ *      -  B, the matrix of right-hand sides, distributed by block rows,
+ *            and its dimensions ldb (local) and nrhs (global)
+ *      -  grid, a structure describing the 2D processor mesh
+ *      -  options->IterRefine, which determines whether or not to
+ *            improve the accuracy of the computed solution using
+ *            iterative refinement
+ *
+ *      On output, B is overwritten with the solution X.
+ *
+ *   2. Depending on options->Fact, the user has four options
+ *      for solving A*X=B. The standard option is for factoring
+ *      A "from scratch". (The other options, described below,
+ *      are used when A is sufficiently similar to a previously
+ *      solved problem to save time by reusing part or all of
+ *      the previous factorization.)
+ *
+ *      -  options->Fact = DOFACT: A is factored "from scratch"
+ *
+ *      In this case the user must also supply
+ *
+ *        o  A, the input matrix
+ *
+ *        as well as the following options to determine what matrix to
+ *        factorize.
+ *
+ *        o  options->Equil,   to specify how to scale the rows and columns
+ *                             of A to "equilibrate" it (to try to reduce its
+ *                             condition number and so improve the
+ *                             accuracy of the computed solution)
+ *
+ *        o  options->RowPerm, to specify how to permute the rows of A
+ *                             (typically to control numerical stability)
+ *
+ *        o  options->ColPerm, to specify how to permute the columns of A
+ *                             (typically to control fill-in and enhance
+ *                             parallelism during factorization)
+ *
+ *        o  options->ReplaceTinyPivot, to specify how to deal with tiny
+ *                             pivots encountered during factorization
+ *                             (to control numerical stability)
+ *
+ *      The outputs returned include
+ *
+ *        o  ScalePermstruct,  modified to describe how the input matrix A
+ *                             was equilibrated and permuted:
+ *          .  ScalePermstruct->DiagScale, indicates whether the rows and/or
+ *                                         columns of A were scaled
+ *          .  ScalePermstruct->R, array of row scale factors
+ *          .  ScalePermstruct->C, array of column scale factors
+ *          .  ScalePermstruct->perm_r, row permutation vector
+ *          .  ScalePermstruct->perm_c, column permutation vector
+ *
+ *          (part of ScalePermstruct may also need to be supplied on input,
+ *           depending on options->RowPerm and options->ColPerm as described
+ *           later).
+ *
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix diag(R)*A*diag(C)*Pc^T, where
+ *              Pc is the column permutation matrix determined by
+ *                  ScalePermstruct->perm_c
+ *              diag(R) and diag(C) are diagonal scaling matrices determined
+ *                  by ScalePermstruct->DiagScale, ScalePermstruct->R and
+ *                  ScalePermstruct->C
+ *
+ *        o  LUstruct, which contains the L and U factorization of A1 where
+ *
+ *                A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U
+ *
+ *               (Note that A1 = Pc*Pr*Aout, where Aout is the matrix stored
+ *                in A on output.)
+ *
+ *   3. The second value of options->Fact assumes that a matrix with the same
+ *      sparsity pattern as A has already been factored:
+ *
+ *      -  options->Fact = SamePattern: A is factored, assuming that it has
+ *            the same nonzero pattern as a previously factored matrix. In
+ *            this case the algorithm saves time by reusing the previously
+ *            computed column permutation vector stored in
+ *            ScalePermstruct->perm_c and the "elimination tree" of A
+ *            stored in LUstruct->etree
+ *
+ *      In this case the user must still specify the following options
+ *      as before:
+ *
+ *        o  options->Equil
+ *        o  options->RowPerm
+ *        o  options->ReplaceTinyPivot
+ *
+ *      but not options->ColPerm, whose value is ignored. This is because the
+ *      previous column permutation from ScalePermstruct->perm_c is used as
+ *      input. The user must also supply
+ *
+ *        o  A, the input matrix
+ *        o  ScalePermstruct->perm_c, the column permutation
+ *        o  LUstruct->etree, the elimination tree
+ *
+ *      The outputs returned include
+ *
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix as described above
+ *        o  ScalePermstruct, modified to describe how the input matrix A was
+ *                            equilibrated and row permuted
+ *        o  LUstruct, modified to contain the new L and U factors
+ *
+ *   4. The third value of options->Fact assumes that a matrix B with the same
+ *      sparsity pattern as A has already been factored, and where the
+ *      row permutation of B can be reused for A. This is useful when A and B
+ *      have similar numerical values, so that the same row permutation
+ *      will make both factorizations numerically stable. This lets us reuse
+ *      all of the previously computed structure of L and U.
+ *
+ *      -  options->Fact = SamePattern_SameRowPerm: A is factored,
+ *            assuming not only the same nonzero pattern as the previously
+ *            factored matrix B, but reusing B's row permutation.
+ *
+ *      In this case the user must still specify the following options
+ *      as before:
+ *
+ *        o  options->Equil
+ *        o  options->ReplaceTinyPivot
+ *
+ *      but not options->RowPerm or options->ColPerm, whose values are
+ *      ignored. This is because the permutations from ScalePermstruct->perm_r
+ *      and ScalePermstruct->perm_c are used as input.
+ *
+ *      The user must also supply
+ *
+ *        o  A, the input matrix
+ *        o  ScalePermstruct->DiagScale, how the previous matrix was row
+ *                                       and/or column scaled
+ *        o  ScalePermstruct->R, the row scalings of the previous matrix,
+ *                               if any
+ *        o  ScalePermstruct->C, the column scalings of the previous matrix,
+ *                               if any
+ *        o  ScalePermstruct->perm_r, the row permutation of the previous
+ *                                    matrix
+ *        o  ScalePermstruct->perm_c, the column permutation of the previous
+ *                                    matrix
+ *        o  all of LUstruct, the previously computed information about
+ *                            L and U (the actual numerical values of L and U
+ *                            stored in LUstruct->Llu are ignored)
+ *
+ *      The outputs returned include
+ *
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix as described above
+ *        o  ScalePermstruct,  modified to describe how the input matrix A was
+ *                             equilibrated (thus ScalePermstruct->DiagScale,
+ *                             R and C may be modified)
+ *        o  LUstruct, modified to contain the new L and U factors
+ *
+ *   5. The fourth and last value of options->Fact assumes that A is
+ *      identical to a matrix that has already been factored on a previous
+ *      call, and reuses its entire LU factorization
+ *
+ *      -  options->Fact = Factored: A is identical to a previously
+ *            factorized matrix, so the entire previous factorization
+ *            can be reused.
+ *
+ *      In this case all the other options mentioned above are ignored
+ *      (options->Equil, options->RowPerm, options->ColPerm,
+ *       options->ReplaceTinyPivot)
+ *
+ *      The user must also supply
+ *
+ *        o  A, the unfactored matrix, only in the case that iterative
+ *              refinement is to be done (specifically A must be the output
+ *              A from the previous call, so that it has been scaled and permuted)
+ *        o  all of ScalePermstruct
+ *        o  all of LUstruct, including the actual numerical values of
+ *           L and U
+ *
+ *      all of which are unmodified on output.
+ *
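+ * For example, to factor a sequence of matrices sharing one sparsity
+ * pattern (illustrative sketch; storage lifecycle details omitted):
+ *
+ *     options.Fact = DOFACT;
+ *     psgssvx(&options, &A1, &ScalePermstruct, b1, ldb, nrhs,
+ *             &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info);
+ *     options.Fact = SamePattern;        (reuses perm_c and etree)
+ *     psgssvx(&options, &A2, &ScalePermstruct, b2, ldb, nrhs,
+ *             &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info);
+ *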
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_dist_options_t* (global)
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *         The following fields should be defined for this structure:
+ *
+ *         o Fact (fact_t)
+ *           Specifies whether or not the factored form of the matrix
+ *           A is supplied on entry, and if not, how the matrix A should
+ *           be factorized based on the previous history.
+ *
+ *           = DOFACT: The matrix A will be factorized from scratch.
+ *                 Inputs:  A
+ *                          options->Equil, RowPerm, ColPerm, ReplaceTinyPivot
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          all of ScalePermstruct
+ *                          all of LUstruct
+ *
+ *           = SamePattern: the matrix A will be factorized assuming
+ *             that a factorization of a matrix with the same sparsity
+ *             pattern was performed prior to this one. Therefore, this
+ *             factorization will reuse column permutation vector
+ *             ScalePermstruct->perm_c and the elimination tree
+ *             LUstruct->etree
+ *                 Inputs:  A
+ *                          options->Equil, RowPerm, ReplaceTinyPivot
+ *                          ScalePermstruct->perm_c
+ *                          LUstruct->etree
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          rest of ScalePermstruct (DiagScale, R, C, perm_r)
+ *                          rest of LUstruct (GLU_persist, Llu)
+ *
+ *           = SamePattern_SameRowPerm: the matrix A will be factorized
+ *             assuming that a factorization of a matrix with the same
+ *             sparsity pattern and similar numerical values was performed
+ *             prior to this one. Therefore, this factorization will reuse
+ *             both the row and column scaling factors R and C, the row
+ *             and column permutation vectors perm_r and perm_c, and the
+ *             distributed data structure set up from the previous symbolic
+ *             factorization.
+ *                 Inputs:  A
+ *                          options->Equil, ReplaceTinyPivot
+ *                          all of ScalePermstruct
+ *                          all of LUstruct
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          modified LUstruct->Llu
+ *           = FACTORED: the matrix A is already factored.
+ *                 Inputs:  all of ScalePermstruct
+ *                          all of LUstruct
+ *
+ *         o Equil (yes_no_t)
+ *           Specifies whether to equilibrate the system.
+ *           = NO:  no equilibration.
+ *           = YES: scaling factors are computed to equilibrate the system:
+ *                      diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B.
+ *                  Whether or not the system will be equilibrated depends
+ *                  on the scaling of the matrix A, but if equilibration is
+ *                  used, A is overwritten by diag(R)*A*diag(C) and B by
+ *                  diag(R)*B.
+ *
+ *         o RowPerm (rowperm_t)
+ *           Specifies how to permute rows of the matrix A.
+ *           = NATURAL:   use the natural ordering.
+ *           = LargeDiag_MC64: use the Duff/Koster algorithm to permute rows
+ *                        of the original matrix to make the diagonal large
+ *                        relative to the off-diagonal.
+ *           = LargeDiag_HWPM: use the parallel approximate-weight perfect
+ *                        matching to permute rows of the original matrix
+ *                        to make the diagonal large relative to the
+ *                        off-diagonal.
+ *           = MY_PERMR:  use the ordering given in ScalePermstruct->perm_r
+ *                        input by the user.
+ *
+ *         o ColPerm (colperm_t)
+ *           Specifies what type of column permutation to use to reduce fill.
+ *           = NATURAL:       natural ordering.
+ *           = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A.
+ *           = MMD_ATA:       minimum degree ordering on structure of A'*A.
+ *           = MY_PERMC:      the ordering given in ScalePermstruct->perm_c.
+ *
+ *         o ReplaceTinyPivot (yes_no_t)
+ *           = NO:  do not modify pivots
+ *           = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during
+ *                  LU factorization.
+ *
+ *         o IterRefine (IterRefine_t)
+ *           Specifies how to perform iterative refinement.
+ *           = NO:     no iterative refinement.
+ *           = SLU_DOUBLE: accumulate residual in double precision.
+ *           = SLU_EXTRA:  accumulate residual in extra precision.
+ *
+ *         NOTE: all options must be identical on all processes when
+ *               calling this routine.
+ *
+ * A (input/output) SuperMatrix* (local)
+ *         On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol).
+ *           The number of linear equations is A->nrow. The type of A must be:
+ *           Stype = SLU_NR_loc; Dtype = SLU_S; Mtype = SLU_GE.
+ *           That is, A is stored in distributed compressed row format.
+ *           See supermatrix.h for the definition of 'SuperMatrix'.
+ *           This routine only handles square A; however, the LU factorization
+ *           routine PSGSTRF can factorize rectangular matrices.
+ *         On exit, A may be overwritten by diag(R)*A*diag(C)*Pc^T,
+ *           depending on ScalePermstruct->DiagScale and options->ColPerm:
+ *             if ScalePermstruct->DiagScale != NOEQUIL, A is overwritten by
+ *                diag(R)*A*diag(C).
+ *             if options->ColPerm != NATURAL, A is further overwritten by
+ *                diag(R)*A*diag(C)*Pc^T.
+ *           If all the above conditions are true, the LU decomposition is
+ *           performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T.
+ *
+ * ScalePermstruct (input/output) sScalePermstruct_t* (global)
+ *         The data structure to store the scaling and permutation vectors
+ *         describing the transformations performed to the matrix A.
+ *         It contains the following fields:
+ *
+ *         o DiagScale (DiagScale_t)
+ *           Specifies the form of equilibration that was done.
+ *           = NOEQUIL: no equilibration.
+ *           = ROW:     row equilibration, i.e., A was premultiplied by
+ *                      diag(R).
+ *           = COL:     Column equilibration, i.e., A was postmultiplied
+ *                      by diag(C).
+ *           = BOTH:    both row and column equilibration, i.e., A was
+ *                      replaced by diag(R)*A*diag(C).
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm,
+ *           DiagScale is an input argument; otherwise it is an output
+ *           argument.
+ *
+ *         o perm_r (int*)
+ *           Row permutation vector, which defines the permutation matrix Pr;
+ *           perm_r[i] = j means row i of A is in position j in Pr*A.
+ *           If options->RowPerm = MY_PERMR, or
+ *           options->Fact = SamePattern_SameRowPerm, perm_r is an
+ *           input argument; otherwise it is an output argument.
+ *
+ *         o perm_c (int*)
+ *           Column permutation vector, which defines the
+ *           permutation matrix Pc; perm_c[i] = j means column i of A is
+ *           in position j in A*Pc.
+ *           If options->ColPerm = MY_PERMC or options->Fact = SamePattern
+ *           or options->Fact = SamePattern_SameRowPerm, perm_c is an
+ *           input argument; otherwise, it is an output argument.
+ *           On exit, perm_c may be overwritten by the product of the input
+ *           perm_c and a permutation that postorders the elimination tree
+ *           of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree
+ *           is already in postorder.
+ *
+ *         o R (float *) dimension (A->nrow)
+ *           The row scale factors for A.
+ *           If DiagScale = ROW or BOTH, A is multiplied on the left by
+ *                          diag(R).
+ *           If DiagScale = NOEQUIL or COL, R is not defined.
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm, R is
+ *           an input argument; otherwise, R is an output argument.
+ *
+ *         o C (float *) dimension (A->ncol)
+ *           The column scale factors for A.
+ *           If DiagScale = COL or BOTH, A is multiplied on the right by
+ *                          diag(C).
+ *           If DiagScale = NOEQUIL or ROW, C is not defined.
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm, C is
+ *           an input argument; otherwise, C is an output argument.
+ *
+ * B       (input/output) float* (local)
+ *         On entry, the right-hand side matrix of dimension (m_loc, nrhs),
+ *           where m_loc is the number of rows stored locally on my
+ *           process and is defined in the data structure of matrix A.
+ *         On exit, the solution matrix if info = 0.
+ *
+ * ldb     (input) int (local)
+ *         The leading dimension of matrix B.
+ *
+ * nrhs    (input) int (global)
+ *         The number of right-hand sides.
+ *         If nrhs = 0, only LU decomposition is performed; the forward
+ *         and back substitutions are skipped.
+ *
+ * grid    (input) gridinfo_t* (global)
+ *         The 2D process mesh. It contains the MPI communicator, the number
+ *         of process rows (NPROW), the number of process columns (NPCOL),
+ *         and my process rank. It is an input argument to all the
+ *         parallel routines.
+ *         Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *         See superlu_sdefs.h for the definition of 'gridinfo_t'.
+ *
+ * LUstruct (input/output) sLUstruct_t*
+ *         The data structures to store the distributed L and U factors.
+ *         It contains the following fields:
+ *
+ *         o etree (int*) dimension (A->ncol) (global)
+ *           Elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc'.
+ *           It is computed in sp_colorder() during the first factorization,
+ *           and is reused in the subsequent factorizations of the matrices
+ *           with the same nonzero pattern.
+ *           On exit of sp_colorder(), the columns of A are permuted so that
+ *           the etree is in a certain postorder. This postorder is reflected
+ *           in ScalePermstruct->perm_c.
+ *           NOTE:
+ *           Etree is a vector of parent pointers for a forest whose vertices
+ *           are the integers 0 to A->ncol-1; etree[root]==A->ncol.
+ *
+ *         o Glu_persist (Glu_persist_t*) (global)
+ *           Global data structure (xsup, supno) replicated on all processes,
+ *           describing the supernode partition in the factored matrices
+ *           L and U:
+ *	       xsup[s] is the leading column of the s-th supernode,
+ *             supno[i] is the supernode number to which column i belongs.
+ *
+ *         o Llu (sLocalLU_t*) (local)
+ *           The distributed data structures to store L and U factors.
+ *           See superlu_sdefs.h for the definition of 'sLocalLU_t'.
+ *
+ * SOLVEstruct (input/output) sSOLVEstruct_t*
+ *         The data structure to hold the communication pattern used
+ *         in the phases of triangular solution and iterative refinement.
+ *         This pattern should be initialized only once for repeated solutions.
+ *         If options->SolveInitialized = YES, it is an input argument.
+ *         If options->SolveInitialized = NO and nrhs != 0, it is an output
+ *         argument. See superlu_sdefs.h for the definition of 'sSOLVEstruct_t'.
+ *
+ * berr    (output) float*, dimension (nrhs) (global)
+ *         The componentwise relative backward error of each solution
+ *         vector X(j) (i.e., the smallest relative change in
+ *         any element of A or B that makes X(j) an exact solution).
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics on runtime and floating-point operation count.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info    (output) int*
+ *         = 0: successful exit
+ *         < 0: if info = -i, the i-th argument had an illegal value  
+ *         > 0: if info = i, and i is
+ *             <= A->ncol: U(i,i) is exactly zero. The factorization has
+ *                been completed, but the factor U is exactly singular,
+ *                so the solution could not be computed.
+ *             > A->ncol: number of bytes allocated when memory allocation
+ *                failure occurred, plus A->ncol.
+ *
+ * See superlu_sdefs.h for the definitions of various data types.
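+ *
+ * Typical usage mirrors the pddrive examples (an illustrative sketch;
+ * the init helpers follow the usual precision-prefix naming):
+ *
+ *     superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid);
+ *     set_default_options_dist(&options);
+ *     sScalePermstructInit(m, n, &ScalePermstruct);
+ *     sLUstructInit(n, &LUstruct);
+ *     PStatInit(&stat);
+ *     psgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs,
+ *             &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info);
+ *     PStatPrint(&options, &stat, &grid);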
+ * </pre>
+ */
+
+void
+psgssvx(superlu_dist_options_t *options, SuperMatrix *A,
+        sScalePermstruct_t *ScalePermstruct,
+        float B[], int ldb, int nrhs, gridinfo_t *grid,
+        sLUstruct_t *LUstruct, sSOLVEstruct_t *SOLVEstruct, float *berr,
+        SuperLUStat_t *stat, int *info)
+{
+    NRformat_loc *Astore;
+    SuperMatrix GA;      /* Global A in NC format */
+    NCformat *GAstore;
+    float *a_GA;
+    SuperMatrix GAC;     /* Global A in NCP format (add n end pointers) */
+    NCPformat *GACstore;
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    Glu_freeable_t *Glu_freeable;
+    /* The nonzero structures of L and U factors, which are
+       replicated on all processors.
+         (lsub, xlsub) contains the compressed subscript of
+                       supernodes in L.
+         (usub, xusub) contains the compressed subscript of
+                       nonzero segments in U.
+       If options->Fact != SamePattern_SameRowPerm, they are
+       computed by SYMBFACT routine, and then used by PSDISTRIBUTE
+       routine. They will be freed after PSDISTRIBUTE routine.
+       If options->Fact == SamePattern_SameRowPerm, these
+       structures are not used. */
+    fact_t Fact;
+    float *a;
+    int_t *colptr, *rowind;
+    int_t *perm_r;  /* row permutations from partial pivoting */
+    int_t *perm_c;  /* column permutation vector */
+    int_t *etree;   /* elimination tree */
+    int_t *rowptr, *colind;  /* Local A in NR */
+    int_t colequ, Equil, factored, job, notran, rowequ, need_value;
+    int_t i, iinfo, j, irow, m, n, nnz, permc_spec;
+    int_t nnz_loc, m_loc, fst_row, icol;
+    int   iam, iam_g;
+    int   ldx;  /* LDA for matrix X (local). */
+    char equed[1], norm[1];
+    float *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd;
+    float *X, *b_col, *b_work, *x_col;
+    double t;
+    float GA_mem_use = 0.0;    /* memory usage by global A */
+    float dist_mem_use = 0.0;  /* memory usage during distribution */
+    superlu_dist_mem_usage_t num_mem_usage, symb_mem_usage;
+    int64_t nnzLU;
+    int_t nnz_tot;
+    float *nzval_a;
+    float asum, asum_tot, lsum, lsum_tot;
+    int_t nsupers, nsupers_j;
+    int_t lk, k, knsupc, nsupr;
+    int_t *lsub, *xsup;
+    float *lusup;
+#if ( PRNTlevel>= 2 )
+    double dmin, dsum, dprod;
+#endif
+
+    LUstruct->dt = 's';
+
+    /* Structures needed for parallel symbolic factorization */
+    int_t *sizes, *fstVtxSep, parSymbFact;
+    int   noDomains, nprocs_num;
+    MPI_Comm symb_comm;  /* communicator for symbolic factorization */
+    int   col, key;      /* parameters for creating a new communicator */
+    Pslu_freeable_t Pslu_freeable;
+    float flinfo;
+
+    /* Initialization. */
+    m = A->nrow;
+    n = A->ncol;
+    Astore = (NRformat_loc *) A->Store;
+    nnz_loc = Astore->nnz_loc;
+    m_loc = Astore->m_loc;
+    fst_row = Astore->fst_row;
+    a = (float *) Astore->nzval;
+    rowptr = Astore->rowptr;
+    colind = Astore->colind;
+    sizes = NULL;
+    fstVtxSep = NULL;
+    symb_comm = MPI_COMM_NULL;
+    num_mem_usage.for_lu = num_mem_usage.total = 0.0;
+    symb_mem_usage.for_lu = symb_mem_usage.total = 0.0;
+
+    /* Test the input parameters. */
+    *info = 0;
+    Fact = options->Fact;
+    if ( Fact < DOFACT || Fact > FACTORED )
+        *info = -1;
+    else if ( options->RowPerm < NOROWPERM || options->RowPerm > MY_PERMR )
+        *info = -1;
+    else if ( options->ColPerm < NATURAL || options->ColPerm > MY_PERMC )
+        *info = -1;
+    else if ( options->IterRefine < NOREFINE || options->IterRefine > SLU_EXTRA )
+        *info = -1;
+    else if ( options->IterRefine == SLU_EXTRA ) {
+        *info = -1;
+        printf("ERROR: Extra precise iterative refinement is not yet supported.\n");
+    } else if ( A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc
+                || A->Dtype != SLU_S || A->Mtype != SLU_GE )
+        *info = -2;
+    else if ( ldb < m_loc )
+        *info = -5;
+    else if ( nrhs < 0 )
+        *info = -6;
+    if ( sp_ienv_dist(2) > sp_ienv_dist(3) ) {
+        *info = 1;
+        printf("ERROR: Relaxation (NREL) cannot be larger than max. supernode size (NSUP).\n"
+               "\t-> Check parameter setting in sp_ienv_dist.c to correct error.\n");
+    }
+    if ( *info ) {
+        i = -(*info);
+        pxerr_dist("psgssvx", grid, -*info);
+        return;
+    }
+
+    factored = (Fact == FACTORED);
+    Equil = (!factored && options->Equil == YES);
+    notran = (options->Trans == NOTRANS);
+    parSymbFact = options->ParSymbFact;
+
+    iam = grid->iam;
+    job = 5;
+    if ( factored || (Fact == SamePattern_SameRowPerm && Equil) ) {
+        rowequ = (ScalePermstruct->DiagScale == ROW) ||
+                 (ScalePermstruct->DiagScale == BOTH);
+        colequ = (ScalePermstruct->DiagScale == COL) ||
+                 (ScalePermstruct->DiagScale == BOTH);
+    } else rowequ = colequ = FALSE;
+
+    /* The following arrays are replicated on all processes. */
+    perm_r = ScalePermstruct->perm_r;
+    perm_c = ScalePermstruct->perm_c;
+    etree = LUstruct->etree;
+    R = ScalePermstruct->R;
+    C = ScalePermstruct->C;
+    /********/
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Enter psgssvx()");
+#endif
+
+    /* Not factored & ask for equilibration */
+    if ( Equil && Fact != SamePattern_SameRowPerm ) {
+        /* Allocate storage if not done so before. */
+        switch ( ScalePermstruct->DiagScale ) {
+            case NOEQUIL:
+                if ( !(R = (float *) floatMalloc_dist(m)) )
+                    ABORT("Malloc fails for R[].");
+                if ( !(C = (float *) floatMalloc_dist(n)) )
+                    ABORT("Malloc fails for C[].");
+                ScalePermstruct->R = R;
+                ScalePermstruct->C = C;
+                break;
+            case ROW:
+                if ( !(C = (float *) floatMalloc_dist(n)) )
+                    ABORT("Malloc fails for C[].");
+                ScalePermstruct->C = C;
+                break;
+            case COL:
+                if ( !(R = (float *) floatMalloc_dist(m)) )
+                    ABORT("Malloc fails for R[].");
+                ScalePermstruct->R = R;
+                break;
+            default: break;
+        }
+    }
+
+    /* ------------------------------------------------------------
+     * Diagonal scaling to equilibrate the matrix. (simple scheme)
+     *   for row i = 1:n, A(i,:) <- A(i,:) / max(abs(A(i,:));
+     *   for column j = 1:n, A(:,j) <- A(:, j) / max(abs(A(:,j))
+     * ------------------------------------------------------------*/
+    if ( Equil ) {
+#if ( DEBUGlevel>=1 )
+        CHECK_MALLOC(iam, "Enter equil");
+#endif
+        t = SuperLU_timer_();
+
+        if ( Fact == SamePattern_SameRowPerm ) {
+            /* Reuse R and C. */
+            switch ( ScalePermstruct->DiagScale ) {
+                case NOEQUIL:
+                    break;
+                case ROW:
+                    irow = fst_row;
+                    for (j = 0; j < m_loc; ++j) {
+                        for (i = rowptr[j]; i < rowptr[j+1]; ++i) {
+                            a[i] *= R[irow];  /* Scale rows. */
+                        }
+                        ++irow;
+                    }
+                    break;
+                case COL:
+                    for (j = 0; j < m_loc; ++j)
+                        for (i = rowptr[j]; i < rowptr[j+1]; ++i) {
+                            icol = colind[i];
+                            a[i] *= C[icol];  /* Scale columns. */
+                        }
+                    break;
+                case BOTH:
+                    irow = fst_row;
+                    for (j = 0; j < m_loc; ++j) {
+                        for (i = rowptr[j]; i < rowptr[j+1]; ++i) {
+                            icol = colind[i];
+                            a[i] *= R[irow] * C[icol]; /* Scale rows and cols. */
+                        }
+                        ++irow;
+                    }
+                    break;
+            }
+        } else { /* Compute R & C from scratch */
+            /* Compute the row and column scalings. */
+            psgsequ(A, R, C, &rowcnd, &colcnd, &amax, &iinfo, grid);
+
+            if ( iinfo > 0 ) {
+                if ( iinfo <= m ) {
+#if ( PRNTlevel>=1 )
+                    fprintf(stderr, "The " IFMT "-th row of A is exactly zero\n", iinfo);
+#endif
+                } else {
+#if ( PRNTlevel>=1 )
+                    fprintf(stderr, "The " IFMT "-th column of A is exactly zero\n", iinfo-n);
+#endif
+                }
+            } else if ( iinfo < 0 ) return;
+
+            /* Now iinfo == 0 */
+
+            /* Equilibrate matrix A if it is badly-scaled.
+               A <-- diag(R)*A*diag(C) */
+            pslaqgs(A, R, C, rowcnd, colcnd, amax, equed);
+
+            if ( strncmp(equed, "R", 1)==0 ) {
+                ScalePermstruct->DiagScale = ROW;
+                rowequ = ROW;
+            } else if ( strncmp(equed, "C", 1)==0 ) {
+                ScalePermstruct->DiagScale = COL;
+                colequ = COL;
+            } else if ( strncmp(equed, "B", 1)==0 ) {
+                ScalePermstruct->DiagScale = BOTH;
+                rowequ = ROW;
+                colequ = COL;
+            } else ScalePermstruct->DiagScale = NOEQUIL;
+
+#if ( PRNTlevel>=1 )
+            if ( !iam ) {
+                printf(".. equilibrated? *equed = %c\n", *equed);
+                fflush(stdout);
+            }
+#endif
+        } /* end if Fact ... */
+
+        stat->utime[EQUIL] = SuperLU_timer_() - t;
+#if ( DEBUGlevel>=1 )
+        CHECK_MALLOC(iam, "Exit equil");
+#endif
+    } /* end if Equil ... LAPACK style, not involving MC64 */
+
+    if ( !factored ) { /* Skip this if already factored. */
+        /*
+         * For serial symbolic factorization, gather A from the distributed
+         * compressed row format to global A in compressed column format.
+         * Numerical values are gathered only when a row permutation
+         * for large diagonal is sought after.
+         */
+        if ( Fact != SamePattern_SameRowPerm &&
+             (parSymbFact == NO || options->RowPerm != NO) ) {
+            /* Performs serial symbolic factorization and/or MC64 */
+
+            need_value = (options->RowPerm == LargeDiag_MC64);
+
+            psCompRow_loc_to_CompCol_global(need_value, A, grid, &GA);
+
+            GAstore = (NCformat *) GA.Store;
+            colptr = GAstore->colptr;
+            rowind = GAstore->rowind;
+            nnz = GAstore->nnz;
+            GA_mem_use = (nnz + n + 1) * sizeof(int_t);
+
+            if ( need_value ) {
+                a_GA = (float *) GAstore->nzval;
+                GA_mem_use += nnz * sizeof(float);
+            } else assert(GAstore->nzval == NULL);
+        }
+
+        /* ------------------------------------------------------------
+           Find the row permutation Pr for A, and apply Pr*[GA].
+           GA is overwritten by Pr*[GA].
+           ------------------------------------------------------------*/
+        if ( options->RowPerm != NO ) {
+            t = SuperLU_timer_();
+            if ( Fact != SamePattern_SameRowPerm ) {
+                if ( options->RowPerm == MY_PERMR ) { /* Use user's perm_r. */
+                    /* Permute the global matrix GA for symbfact() */
+                    for (i = 0; i < colptr[n]; ++i) {
+                        irow = rowind[i];
+                        rowind[i] = perm_r[irow];
+                    }
+                } else if ( options->RowPerm == LargeDiag_MC64 ) {
+                    /* Get a new perm_r[] from MC64 */
+                    if ( job == 5 ) {
+                        /* Allocate storage for scaling factors. */
+                        if ( !(R1 = floatMalloc_dist(m)) )
+                            ABORT("SUPERLU_MALLOC fails for R1[]");
+                        if ( !(C1 = floatMalloc_dist(n)) )
+                            ABORT("SUPERLU_MALLOC fails for C1[]");
+                    }
+
+                    if ( !iam ) { /* Process 0 finds a row permutation */
+                        iinfo = sldperm_dist(job, m, nnz, colptr, rowind, a_GA,
+                                             perm_r, R1, C1);
+
+                        MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm );
+                        if ( iinfo == 0 ) {
+                            MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm );
+                            if ( job == 5 && Equil ) {
+                                MPI_Bcast( R1, m, MPI_FLOAT, 0, grid->comm );
+                                MPI_Bcast( C1, n, MPI_FLOAT, 0, grid->comm );
+                            }
+                        }
+                    } else {
+                        MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm );
+                        if ( iinfo == 0 ) {
+                            MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm );
+                            if ( job == 5 && Equil ) {
+                                MPI_Bcast( R1, m, MPI_FLOAT, 0, grid->comm );
+                                MPI_Bcast( C1, n, MPI_FLOAT, 0, grid->comm );
+                            }
+                        }
+                    }
+
+                    if ( iinfo && job == 5) { /* Error return */
+                        SUPERLU_FREE(R1);
+                        SUPERLU_FREE(C1);
+                    }
+#if ( PRNTlevel>=2 )
+                    dmin = smach_dist("Overflow");
+                    dsum = 0.0;
+                    dprod = 1.0;
+#endif
+                    if ( iinfo == 0 ) {
+                        if ( job == 5 ) {
+                            if ( Equil ) {
+                                for (i = 0; i < n; ++i) {
+                                    R1[i] = exp(R1[i]);
+                                    C1[i] = exp(C1[i]);
+                                }
+
+                                /* Scale the distributed matrix further.
+                                   A <-- diag(R1)*A*diag(C1) */
+                                irow = fst_row;
+                                for (j = 0; j < m_loc; ++j) {
+                                    for (i = rowptr[j]; i < rowptr[j+1]; ++i) {
+                                        icol = colind[i];
+                                        a[i] *= R1[irow] * C1[icol];
+#if ( PRNTlevel>=2 )
+                                        if ( perm_r[irow] == icol ) { /* New diagonal */
+                                            if ( job == 2 || job == 3 )
+                                                dmin = SUPERLU_MIN(dmin, fabs(a[i]));
+                                            else if ( job == 4 )
+                                                dsum += fabs(a[i]);
+                                            else if ( job == 5 )
+                                                dprod *= fabs(a[i]);
+                                        }
+#endif
+                                    }
+                                    ++irow;
+                                }
+
+                                /* Multiply together the scaling factors --
+                                   R/C from simple scheme, R1/C1 from MC64. */
+                                if ( rowequ ) for (i = 0; i < m; ++i) R[i] *= R1[i];
+                                else for (i = 0; i < m; ++i) R[i] = R1[i];
+                                if ( colequ ) for (i = 0; i < n; ++i) C[i] *= C1[i];
+                                else for (i = 0; i < n; ++i) C[i] = C1[i];
+
+                                ScalePermstruct->DiagScale = BOTH;
+                                rowequ = colequ = 1;
+
+                            } /* end Equil */
+
+                            /* Now permute global GA to prepare for symbfact() */
+                            for (j = 0; j < n; ++j) {
+                                for (i = colptr[j]; i < colptr[j+1]; ++i) {
+                                    irow = rowind[i];
+                                    rowind[i] = perm_r[irow];
+                                }
+                            }
+                            SUPERLU_FREE (R1);
+                            SUPERLU_FREE (C1);
+                        } else { /* job = 2,3,4 */
+                            for (j = 0; j < n; ++j) {
+                                for (i = colptr[j]; i < colptr[j+1]; ++i) {
+                                    irow = rowind[i];
+                                    rowind[i] = perm_r[irow];
+                                } /* end for i ... */
+                            } /* end for j ... */
+                        } /* end else job ... */
+                    } else { /* if iinfo != 0 */
+                        for (i = 0; i < m; ++i) perm_r[i] = i;
+                    }
+
+#if ( PRNTlevel>=2 )
+                    if ( job == 2 || job == 3 ) {
+                        if ( !iam ) printf("\tsmallest diagonal %e\n", dmin);
+                    } else if ( job == 4 ) {
+                        if ( !iam ) printf("\tsum of diagonal %e\n", dsum);
+                    } else if ( job == 5 ) {
+                        if ( !iam ) printf("\t product of diagonal %e\n", dprod);
+                    }
+#endif
+                } else { /* use LargeDiag_HWPM */
+#ifdef HAVE_COMBBLAS
+                    s_c2cpp_GetHWPM(A, grid, ScalePermstruct);
+#else
+                    if ( iam == 0 ) {
+                        printf("CombBLAS is not available\n"); fflush(stdout);
+                    }
+#endif
+                } /* end if options->RowPerm ... */
+
+                t = SuperLU_timer_() - t;
+                stat->utime[ROWPERM] = t;
+#if ( PRNTlevel>=1 )
+                if ( !iam ) {
+                    printf(".. LDPERM job " IFMT "\t time: %.2f\n", job, t);
+                    fflush(stdout);
+                }
+#endif
+            } /* end if Fact ... */
+
+        } else { /* options->RowPerm == NOROWPERM / NATURAL */
+            for (i = 0; i < m; ++i) perm_r[i] = i;
+        }
+
+#if ( DEBUGlevel>=2 )
+        if ( !iam ) PrintInt10("perm_r", m, perm_r);
+#endif
+    } /* end if (!factored) */
+
+    if ( !factored || options->IterRefine ) {
+        /* Compute norm(A), which will be used to adjust small diagonal. */
+        if ( notran ) *(unsigned char *)norm = '1';
+        else *(unsigned char *)norm = 'I';
+        anorm = pslangs(norm, A, grid);
+#if ( PRNTlevel>=1 )
+        if ( !iam ) { printf(".. anorm %e\n", anorm); fflush(stdout); }
+#endif
+    }
+
+    /* ------------------------------------------------------------
+       Perform the LU factorization: symbolic factorization,
+       redistribution, and numerical factorization.
+       ------------------------------------------------------------*/
+    if ( !factored ) {
+        t = SuperLU_timer_();
+        /*
+         * Get column permutation vector perm_c[], according to permc_spec:
+         *   permc_spec = NATURAL:  natural ordering
+         *   permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A
+         *   permc_spec = MMD_ATA:  minimum degree on structure of A'*A
+         *   permc_spec = METIS_AT_PLUS_A: METIS on structure of A'+A
+         *   permc_spec = PARMETIS: parallel METIS on structure of A'+A
+         *   permc_spec = MY_PERMC: the ordering already supplied in perm_c[]
+         */
+        permc_spec = options->ColPerm;
+
+        if ( parSymbFact == YES || permc_spec == PARMETIS ) {
+            nprocs_num = grid->nprow * grid->npcol;
+            noDomains = (int) ( pow(2, ((int) LOG2( nprocs_num ))));
+
+            /* create a new communicator for the first noDomains
+               processes in grid->comm */
+            key = iam;
+            if (iam < noDomains) col = 0;
+            else col = MPI_UNDEFINED;
+            MPI_Comm_split (grid->comm, col, key, &symb_comm );
+
+            if ( permc_spec == NATURAL || permc_spec == MY_PERMC ) {
+                if ( permc_spec == NATURAL ) {
+                    for (j = 0; j < n; ++j) perm_c[j] = j;
+                }
+                if ( !(sizes = intMalloc_dist(2 * noDomains)) )
+                    ABORT("SUPERLU_MALLOC fails for sizes.");
+                if ( !(fstVtxSep = intMalloc_dist(2 * noDomains)) )
+                    ABORT("SUPERLU_MALLOC fails for fstVtxSep.");
+                for (i = 0; i < 2*noDomains - 2; ++i) {
+                    sizes[i] = 0;
+                    fstVtxSep[i] = 0;
+                }
+                sizes[2*noDomains - 2] = m;
+                fstVtxSep[2*noDomains - 2] = 0;
+            } else if ( permc_spec != PARMETIS ) { /* same as before */
+                printf("{" IFMT "," IFMT "}: psgssvx: invalid ColPerm option when ParSymbfact is used\n",
+                       MYROW(grid->iam, grid), MYCOL(grid->iam, grid));
+            }
+        }
+
+        if ( permc_spec != MY_PERMC && Fact == DOFACT ) {
+            /* Reuse perm_c if Fact == SamePattern, or SamePattern_SameRowPerm */
+            if ( permc_spec == PARMETIS ) {
+                // #pragma omp parallel
+                // {
+                // #pragma omp master
+                // {
+                /* Get column permutation vector in perm_c.                    *
+                 * This routine takes as input the distributed input matrix A  *
+                 * and does not modify it.  It also allocates memory for       *
+                 * sizes[] and fstVtxSep[] arrays, that contain information    *
+                 * on the separator tree computed by ParMETIS.                 */
+                flinfo = get_perm_c_parmetis(A, perm_r, perm_c, nprocs_num,
+                                             noDomains, &sizes, &fstVtxSep,
+                                             grid, &symb_comm);
+                // }
+                // }
+                if (flinfo > 0) {
+#if ( PRNTlevel>=1 )
+                    fprintf(stderr, "Insufficient memory for get_perm_c parmetis\n");
+#endif
+                    *info = flinfo;
+                    return;
+                }
+            } else {
+                get_perm_c_dist(iam, permc_spec, &GA, perm_c);
+            }
+        }
+
+        stat->utime[COLPERM] = SuperLU_timer_() - t;
+
+        /* Symbolic factorization. */
+        if ( Fact != SamePattern_SameRowPerm ) {
+            if ( parSymbFact == NO ) { /* Perform serial symbolic factorization */
+                /* GA = Pr*A, perm_r[] is already applied. */
+                int_t *GACcolbeg, *GACcolend, *GACrowind;
+
+                /* Compute the elimination tree of Pc*(A^T+A)*Pc^T or Pc*A^T*A*Pc^T
+                   (a.k.a. column etree), depending on the choice of ColPerm.
+                   Adjust perm_c[] to be consistent with a postorder of etree.
+                   Permute columns of A to form A*Pc'.
+                   After this routine, GAC = GA*Pc^T. */
+                sp_colorder(options, &GA, perm_c, etree, &GAC);
+
+                /* Form Pc*A*Pc^T to preserve the diagonal of the matrix GAC. */
+                GACstore = (NCPformat *) GAC.Store;
+                GACcolbeg = GACstore->colbeg;
+                GACcolend = GACstore->colend;
+                GACrowind = GACstore->rowind;
+                for (j = 0; j < n; ++j) {
+                    for (i = GACcolbeg[j]; i < GACcolend[j]; ++i) {
+                        irow = GACrowind[i];
+                        GACrowind[i] = perm_c[irow];
+                    }
+                }
+
+                /* Perform a symbolic factorization on Pc*Pr*A*Pc^T and set up
+                   the nonzero data structures for L & U. */
+#if ( PRNTlevel>=1 )
+                if ( !iam ) {
+                    printf(".. symbfact(): relax %d, maxsuper %d, fill %d\n",
+                           sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6));
+                    fflush(stdout);
+                }
+#endif
+                t = SuperLU_timer_();
+                if ( !(Glu_freeable = (Glu_freeable_t *)
+                       SUPERLU_MALLOC(sizeof(Glu_freeable_t))) )
+                    ABORT("Malloc fails for Glu_freeable.");
+
+                /* Every process does this. */
+                iinfo = symbfact(options, iam, &GAC, perm_c, etree,
+                                 Glu_persist, Glu_freeable);
+                nnzLU = Glu_freeable->nnzLU;
+                stat->utime[SYMBFAC] = SuperLU_timer_() - t;
+                if ( iinfo <= 0 ) { /* Successful return */
+                    QuerySpace_dist(n, -iinfo, Glu_freeable, &symb_mem_usage);
+#if ( PRNTlevel>=1 )
+                    if ( !iam ) {
+                        printf("\tNo of supers " IFMT "\n", Glu_persist->supno[n-1]+1);
+                        printf("\tSize of G(L) " IFMT "\n", Glu_freeable->xlsub[n]);
+                        printf("\tSize of G(U) " IFMT "\n", Glu_freeable->xusub[n]);
+                        printf("\tint %lu, short %lu, float %lu, double %lu\n",
+                               sizeof(int_t), sizeof(short),
+                               sizeof(float), sizeof(double));
+                        printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n",
+                               symb_mem_usage.for_lu*1e-6,
+                               symb_mem_usage.total*1e-6,
+                               symb_mem_usage.expansions);
+                        fflush(stdout);
+                    }
+#endif
+                } else { /* symbfact out of memory */
+#if ( PRNTlevel>=1 )
+                    if ( !iam )
+                        fprintf(stderr,"symbfact() error returns " IFMT "\n",iinfo);
+#endif
+                    *info = iinfo;
+                    return;
+                }
+            } /* end serial symbolic factorization */
+            else { /* parallel symbolic factorization */
+                t = SuperLU_timer_();
+                flinfo = symbfact_dist(nprocs_num, noDomains, A, perm_c, perm_r,
+                                       sizes, fstVtxSep, &Pslu_freeable,
+                                       &(grid->comm), &symb_comm,
+                                       &symb_mem_usage);
+                nnzLU = Pslu_freeable.nnzLU;
+                stat->utime[SYMBFAC] = SuperLU_timer_() - t;
+                if (flinfo > 0) {
+#if ( PRNTlevel>=1 )
+                    fprintf(stderr, "Insufficient memory for parallel symbolic factorization.");
+#endif
+                    *info = flinfo;
+                    return;
+                }
+            }
+
+            /* Destroy global GA */
+            if ( parSymbFact == NO || options->RowPerm != NO )
+                Destroy_CompCol_Matrix_dist(&GA);
+            if ( parSymbFact == NO )
+                Destroy_CompCol_Permuted_dist(&GAC);
+
+        } /* end if Fact != SamePattern_SameRowPerm ... */
+
+        if (sizes) SUPERLU_FREE (sizes);
+        if (fstVtxSep) SUPERLU_FREE (fstVtxSep);
+        if (symb_comm != MPI_COMM_NULL) MPI_Comm_free (&symb_comm);
+
+        /* Distribute entries of A into L & U data structures. */
+        //if (parSymbFact == NO || ???? Fact == SamePattern_SameRowPerm) {
+        if ( parSymbFact == NO ) {
+            /* CASE OF SERIAL SYMBOLIC */
+            /* Apply column permutation to the original distributed A */
+            for (j = 0; j < nnz_loc; ++j) colind[j] = perm_c[colind[j]];
+
+            /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc^T into L and U storage.
+               NOTE: the row permutation Pc*Pr is applied internally in the
+               distribution routine. 
*/ + t = SuperLU_timer_(); + dist_mem_use = psdistribute(Fact, n, A, ScalePermstruct, + Glu_freeable, LUstruct, grid); + stat->utime[DIST] = SuperLU_timer_() - t; + + /* Deallocate storage used in symbolic factorization. */ + if ( Fact != SamePattern_SameRowPerm ) { + iinfo = symbfact_SubFree(Glu_freeable); + SUPERLU_FREE(Glu_freeable); + } + } else { /* CASE OF PARALLEL SYMBOLIC */ + /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. + NOTE: the row permutation Pc*Pr is applied internally in the + distribution routine. */ + /* Apply column permutation to the original distributed A */ + for (j = 0; j < nnz_loc; ++j) colind[j] = perm_c[colind[j]]; + + t = SuperLU_timer_(); + dist_mem_use = sdist_psymbtonum(Fact, n, A, ScalePermstruct, + &Pslu_freeable, LUstruct, grid); + if (dist_mem_use > 0) + ABORT ("Not enough memory available for dist_psymbtonum\n"); + + stat->utime[DIST] = SuperLU_timer_() - t; + } + + /*if (!iam) printf ("\tDISTRIBUTE time %8.2f\n", stat->utime[DIST]);*/ + + /* Perform numerical factorization in parallel. */ + t = SuperLU_timer_(); + // #pragma omp parallel + // { + // #pragma omp master + // { + psgstrf(options, m, n, anorm, LUstruct, grid, stat, info); + stat->utime[FACT] = SuperLU_timer_() - t; + // } + // } + + +#if ( PRNTlevel>=2 ) + /* ------------------------------------------------------------ + SUM OVER ALL ENTRIES OF A AND PRINT NNZ AND SIZE OF A. + ------------------------------------------------------------*/ + Astore = (NRformat_loc *) A->Store; + xsup = Glu_persist->xsup; + nzval_a = Astore->nzval; + + + asum=0; + for (i = 0; i < Astore->m_loc; ++i) { + for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { + asum += nzval_a[j]; + } + } + + nsupers = Glu_persist->supno[n-1] + 1; + nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */ + + + + lsum=0.0; + for (lk=0;lkLlu->Lrowind_bc_ptr[lk]; + lusup = LUstruct->Llu->Lnzval_bc_ptr[lk]; + if(lsub){ + k = MYCOL(grid->iam, grid)+lk*grid->npcol; /* not sure */ + knsupc = SuperSize( k ); + nsupr = lsub[1]; + for (j=0; jcomm ); + MPI_Allreduce( &lsum, &lsum_tot,1, MPI_FLOAT, MPI_SUM, grid->comm ); + + + MPI_Allreduce( &Astore->rowptr[Astore->m_loc], &nnz_tot,1, mpi_int_t, MPI_SUM, grid->comm ); + // MPI_Bcast( &nnzLU, 1, mpi_int_t, 0, grid->comm ); + + MPI_Comm_rank( MPI_COMM_WORLD, &iam_g ); + + printf(".. 
Ainfo mygid %5d mysid %5d nnz_loc " IFMT " sum_loc %e lsum_loc %e nnz " IFMT " nnzLU %ld sum %e lsum %e N " IFMT "\n", iam_g,iam,Astore->rowptr[Astore->m_loc],asum, lsum, nnz_tot,nnzLU,asum_tot,lsum_tot,A->ncol); + fflush(stdout); +#endif + +#if 0 + +// #ifdef GPU_PROF + +// if(!iam ) +// { +// char* ttemp; + +// ttemp = getenv("IO_FILE"); +// if(ttemp!=NULL) +// { +// printf("File being opend is %s\n",ttemp ); +// FILE* fp; +// fp = fopen(ttemp,"w"); +// if(!fp) +// { +// fprintf(stderr," Couldn't open output file %s\n",ttemp); +// } + +// int nsup=Glu_persist->supno[n-1]+1; +// int ii; +// for (ii = 0; ii < nsup; ++ii) +// { +// fprintf(fp,"%d,%d,%d,%d,%d,%d\n",gs1.mnk_min_stats[ii],gs1.mnk_min_stats[ii+nsup], +// gs1.mnk_min_stats[ii+2*nsup], +// gs1.mnk_max_stats[ii],gs1.mnk_max_stats[ii+nsup],gs1.mnk_max_stats[ii+2*nsup]); +// } + +// // lastly put the timeing stats that we need + +// fprintf(fp,"Min %lf Max %lf totaltime %lf \n",gs1.osDgemmMin,gs1.osDgemmMax,stat->utime[FACT]); +// fclose(fp); +// } + +// } +// #endif + +#endif + + if ( options->PrintStat ) { + int_t TinyPivots; + float for_lu, total, max, avg, temp; + + sQuerySpace_dist(n, LUstruct, grid, stat, &num_mem_usage); + + if (parSymbFact == TRUE) { + /* The memory used in the redistribution routine + includes the memory used for storing the symbolic + structure and the memory allocated for numerical + factorization */ + temp = SUPERLU_MAX(symb_mem_usage.total, -dist_mem_use); + if ( options->RowPerm != NO ) + temp = SUPERLU_MAX(temp, GA_mem_use); + } else { + temp = SUPERLU_MAX ( + symb_mem_usage.total + GA_mem_use, /* symbfact step */ + symb_mem_usage.for_lu + dist_mem_use + + num_mem_usage.for_lu /* distribution step */ + ); + } + + temp = SUPERLU_MAX(temp, num_mem_usage.total); + + MPI_Reduce( &temp, &max, + 1, MPI_FLOAT, MPI_MAX, 0, grid->comm ); + MPI_Reduce( &temp, &avg, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); + MPI_Allreduce( &stat->TinyPivots, &TinyPivots, 1, mpi_int_t, + MPI_SUM, grid->comm ); + stat->TinyPivots = TinyPivots; + + MPI_Reduce( &num_mem_usage.for_lu, &for_lu, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); + MPI_Reduce( &num_mem_usage.total, &total, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); + + if (!iam) { + printf("\n** Memory Usage **********************************\n"); + printf("** NUMfact space (MB): (sum-of-all-processes)\n" + " L\\U : %8.2f | Total : %8.2f\n", + for_lu * 1e-6, total * 1e-6); + printf("** Total highmark (MB):\n" + " Sum-of-all : %8.2f | Avg : %8.2f | Max : %8.2f\n", + avg * 1e-6, + avg / grid->nprow / grid->npcol * 1e-6, + max * 1e-6); + printf("**************************************************\n\n"); + printf("** number of Tiny Pivots: %8d\n\n", stat->TinyPivots); + fflush(stdout); + } + } /* end printing stats */ + + } /* end if (!factored) */ + + + if ( options->Fact == DOFACT || options->Fact == SamePattern ) { + /* Need to reset the solve's communication pattern, + because perm_r[] and/or perm_c[] is changed. 
*/ + if ( options->SolveInitialized == YES ) { /* Initialized before */ + sSolveFinalize(options, SOLVEstruct); /* Clean up structure */ + options->SolveInitialized = NO; /* Reset the solve state */ + } + } +#if 0 + /* Need to revisit: Why the following is not good enough for X-to-B + distribution -- inv_perm_c changed */ + pxgstrs_finalize(SOLVEstruct->gstrs_comm); + psgstrs_init(A->ncol, m_loc, nrhs, fst_row, perm_r, perm_c, grid, + LUstruct->Glu_persist, SOLVEstruct); +#endif + + + /* ------------------------------------------------------------ + Compute the solution matrix X. + ------------------------------------------------------------*/ + if ( nrhs && *info == 0 ) { + + if ( !(b_work = floatMalloc_dist(n)) ) + ABORT("Malloc fails for b_work[]"); + + /* ------------------------------------------------------------ + Scale the right-hand side if equilibration was performed. + ------------------------------------------------------------*/ + if ( notran ) { + if ( rowequ ) { + b_col = B; + for (j = 0; j < nrhs; ++j) { + irow = fst_row; + for (i = 0; i < m_loc; ++i) { + b_col[i] *= R[irow]; + ++irow; + } + b_col += ldb; + } + } + } else if ( colequ ) { + b_col = B; + for (j = 0; j < nrhs; ++j) { + irow = fst_row; + for (i = 0; i < m_loc; ++i) { + b_col[i] *= C[irow]; + ++irow; + } + b_col += ldb; + } + } + + /* Save a copy of the right-hand side. */ + ldx = ldb; + if ( !(X = floatMalloc_dist(((size_t)ldx) * nrhs)) ) + ABORT("Malloc fails for X[]"); + x_col = X; b_col = B; + for (j = 0; j < nrhs; ++j) { +#if 0 /* Sherry */ + for (i = 0; i < m_loc; ++i) x_col[i] = b_col[i]; +#endif + memcpy(x_col, b_col, m_loc * sizeof(float)); + x_col += ldx; b_col += ldb; + } + + /* ------------------------------------------------------------ + Solve the linear system. + ------------------------------------------------------------*/ + if ( options->SolveInitialized == NO ) { /* First time */ + sSolveInit(options, A, perm_r, perm_c, nrhs, LUstruct, grid, + SOLVEstruct); + /* Inside this routine, SolveInitialized is set to YES. + For repeated call to psgssvx(), no need to re-initialilze + the Solve data & communication structures, unless a new + factorization with Fact == DOFACT or SamePattern is asked for. */ + } + + if ( options->DiagInv==YES && + (options->SolveInitialized == NO || Fact == SamePattern || + Fact == SamePattern_SameRowPerm) ) { + psCompute_Diag_Inv(n, LUstruct, grid, stat, info); + } + + + // #pragma omp parallel + // { + // #pragma omp master + // { + psgstrs(n, LUstruct, ScalePermstruct, grid, X, m_loc, + fst_row, ldb, nrhs, SOLVEstruct, stat, info); + // } + // } + + /* ------------------------------------------------------------ + Use iterative refinement to improve the computed solution and + compute error bounds and backward error estimates for it. + ------------------------------------------------------------*/ + if ( options->IterRefine ) { + /* Improve the solution by iterative refinement. */ + int_t *it; + int_t *colind_gsmv = SOLVEstruct->A_colind_gsmv; + /* This was allocated and set to NULL in sSolveInit() */ + sSOLVEstruct_t *SOLVEstruct1; /* Used by refinement. */ + + t = SuperLU_timer_(); + if ( options->RefineInitialized == NO || Fact == DOFACT ) { + /* All these cases need to re-initialize gsmv structure */ + if ( options->RefineInitialized ) + psgsmv_finalize(SOLVEstruct->gsmv_comm); + psgsmv_init(A, SOLVEstruct->row_to_proc, grid, + SOLVEstruct->gsmv_comm); + + /* Save a copy of the transformed local col indices + in colind_gsmv[]. 
*/ + if ( colind_gsmv ) SUPERLU_FREE(colind_gsmv); + if ( !(it = intMalloc_dist(nnz_loc)) ) + ABORT("Malloc fails for colind_gsmv[]"); + colind_gsmv = SOLVEstruct->A_colind_gsmv = it; + for (i = 0; i < nnz_loc; ++i) colind_gsmv[i] = colind[i]; + options->RefineInitialized = YES; + } else if ( Fact == SamePattern || + Fact == SamePattern_SameRowPerm ) { + float atemp; + int_t k, jcol, p; + /* Swap to beginning the part of A corresponding to the + local part of X, as was done in psgsmv_init() */ + for (i = 0; i < m_loc; ++i) { /* Loop through each row */ + k = rowptr[i]; + for (j = rowptr[i]; j < rowptr[i+1]; ++j) { + jcol = colind[j]; + p = SOLVEstruct->row_to_proc[jcol]; + if ( p == iam ) { /* Local */ + atemp = a[k]; a[k] = a[j]; a[j] = atemp; + ++k; + } + } + } + + /* Re-use the local col indices of A obtained from the + previous call to psgsmv_init() */ + for (i = 0; i < nnz_loc; ++i) colind[i] = colind_gsmv[i]; + } + + if ( nrhs == 1 ) { /* Use the existing solve structure */ + SOLVEstruct1 = SOLVEstruct; + } else { /* For nrhs > 1, since refinement is performed for RHS + one at a time, the communication structure for pdgstrs + is different than the solve with nrhs RHS. + So we use SOLVEstruct1 for the refinement step. + */ + if ( !(SOLVEstruct1 = (sSOLVEstruct_t *) + SUPERLU_MALLOC(sizeof(sSOLVEstruct_t))) ) + ABORT("Malloc fails for SOLVEstruct1"); + /* Copy the same stuff */ + SOLVEstruct1->row_to_proc = SOLVEstruct->row_to_proc; + SOLVEstruct1->inv_perm_c = SOLVEstruct->inv_perm_c; + SOLVEstruct1->num_diag_procs = SOLVEstruct->num_diag_procs; + SOLVEstruct1->diag_procs = SOLVEstruct->diag_procs; + SOLVEstruct1->diag_len = SOLVEstruct->diag_len; + SOLVEstruct1->gsmv_comm = SOLVEstruct->gsmv_comm; + SOLVEstruct1->A_colind_gsmv = SOLVEstruct->A_colind_gsmv; + + /* Initialize the *gstrs_comm for 1 RHS. */ + if ( !(SOLVEstruct1->gstrs_comm = (pxgstrs_comm_t *) + SUPERLU_MALLOC(sizeof(pxgstrs_comm_t))) ) + ABORT("Malloc fails for gstrs_comm[]"); + psgstrs_init(n, m_loc, 1, fst_row, perm_r, perm_c, grid, + Glu_persist, SOLVEstruct1); + } + + psgsrfs(n, A, anorm, LUstruct, ScalePermstruct, grid, + B, ldb, X, ldx, nrhs, SOLVEstruct1, berr, stat, info); + + /* Deallocate the storage associated with SOLVEstruct1 */ + if ( nrhs > 1 ) { + pxgstrs_finalize(SOLVEstruct1->gstrs_comm); + SUPERLU_FREE(SOLVEstruct1); + } + + stat->utime[REFINE] = SuperLU_timer_() - t; + } /* end if IterRefine */ + + /* Permute the solution matrix B <= Pc'*X. */ + psPermute_Dense_Matrix(fst_row, m_loc, SOLVEstruct->row_to_proc, + SOLVEstruct->inv_perm_c, + X, ldx, B, ldb, nrhs, grid); +#if ( DEBUGlevel>=2 ) + printf("\n (%d) .. After psPermute_Dense_Matrix(): b =\n", iam); + for (i = 0; i < m_loc; ++i) + printf("\t(%d)\t%4d\t%.10f\n", iam, i+fst_row, B[i]); +#endif + + /* Transform the solution matrix X to a solution of the original + system before equilibration. */ + if ( notran ) { + if ( colequ ) { + b_col = B; + for (j = 0; j < nrhs; ++j) { + irow = fst_row; + for (i = 0; i < m_loc; ++i) { + b_col[i] *= C[irow]; + ++irow; + } + b_col += ldb; + } + } + } else if ( rowequ ) { + b_col = B; + for (j = 0; j < nrhs; ++j) { + irow = fst_row; + for (i = 0; i < m_loc; ++i) { + b_col[i] *= R[irow]; + ++irow; + } + b_col += ldb; + } + } + + SUPERLU_FREE(b_work); + SUPERLU_FREE(X); + + } /* end if nrhs != 0 && *info == 0 */ + +#if ( PRNTlevel>=1 ) + if ( !iam ) printf(".. DiagScale = %d\n", ScalePermstruct->DiagScale); +#endif + + /* Deallocate R and/or C if it was not used. 
*/ + if ( Equil && Fact != SamePattern_SameRowPerm ) { + switch ( ScalePermstruct->DiagScale ) { + case NOEQUIL: + SUPERLU_FREE(R); + SUPERLU_FREE(C); + break; + case ROW: + SUPERLU_FREE(C); + break; + case COL: + SUPERLU_FREE(R); + break; + default: break; + } + } + +#if 0 + if ( !factored && Fact != SamePattern_SameRowPerm && !parSymbFact) + Destroy_CompCol_Permuted_dist(&GAC); +#endif +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit psgssvx()"); +#endif + +} diff --git a/SRC/psgssvx3d.c b/SRC/psgssvx3d.c new file mode 100644 index 00000000..9b057aed --- /dev/null +++ b/SRC/psgssvx3d.c @@ -0,0 +1,1586 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Solves a system of linear equations A*X=B using 3D process grid. + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ */
+#include "superlu_sdefs.h"
+#if 0
+#include "p3dcomm.h"
+#include "pdgstrf3d.h"
+#include "triangularSolve/pdgstrs.h"
+#include "triangularSolve/pdgstrs3d.h"
+#include "xtrf3Dpartition.h"
+#endif
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *
+ * PSGSSVX3D solves a system of linear equations A*X=B,
+ * by using Gaussian elimination with "static pivoting" to
+ * compute the LU factorization of A.
+ *
+ * Static pivoting is a technique that combines the numerical stability
+ * of partial pivoting with the scalability of Cholesky (no pivoting),
+ * to run accurately and efficiently on large numbers of processors.
+ * See our paper at http://www.nersc.gov/~xiaoye/SuperLU/ for a detailed
+ * description of the parallel algorithms.
+ *
+ * The input matrices A and B are distributed by block rows.
+ * Here is a graphical illustration (0-based indexing):
+ *
+ *                        A                B
+ *               0 ---------------       ------
+ *                   |           |        |  |
+ *                   |           |   P0   |  |
+ *                   |           |        |  |
+ *                 ---------------       ------
+ *        - fst_row->|           |        |  |
+ *        |          |           |        |  |
+ *       m_loc       |           |   P1   |  |
+ *        |          |           |        |  |
+ *        -          |           |        |  |
+ *                 ---------------       ------
+ *                   |    .      |        |. |
+ *                   |    .      |        |. |
+ *                   |    .      |        |. |
+ *                 ---------------       ------
+ *
+ * where fst_row is the global row number of the first local row, and
+ *       m_loc is the number of rows local to this processor.
+ * These are defined in the 'SuperMatrix' structure; see supermatrix.h.
+ *
+ *
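+ * As an illustration, the EXAMPLE/ drivers compute this partition roughly
+ * as follows (a minimal sketch; p and iam stand for the number of
+ * participating processes and this process's rank, and the last process
+ * takes the remainder rows):
+ *
+ *     int_t m_loc = m / p;
+ *     int_t fst_row = iam * m_loc;
+ *     if (iam == p - 1)          /* last process */
+ *         m_loc = m - m_loc * (p - 1);
+ *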
+ * Here are the options for using this code:
+ *
+ *   1. Independent of all the other options specified below, the
+ *      user must supply
+ *
+ *      -  B, the matrix of right-hand sides, distributed by block rows,
+ *            and its dimensions ldb (local) and nrhs (global)
+ *      -  grid3d, a structure describing the 3D process mesh
+ *      -  options->IterRefine, which determines whether or not to
+ *            improve the accuracy of the computed solution using
+ *            iterative refinement
+ *
+ *      On output, B is overwritten with the solution X.
+ *
+ *   2. Depending on options->Fact, the user has four options
+ *      for solving A*X=B. The standard option is for factoring
+ *      A "from scratch". (The other options, described below,
+ *      are used when A is sufficiently similar to a previously
+ *      solved problem to save time by reusing part or all of
+ *      the previous factorization.)
+ *
+ *      -  options->Fact = DOFACT: A is factored "from scratch"
+ *
+ *      In this case the user must also supply
+ *
+ *        o  A, the input matrix
+ *
+ *        as well as the following options to determine what matrix to
+ *        factorize.
+ *
+ *        o  options->Equil,   to specify how to scale the rows and columns
+ *                             of A to "equilibrate" it (to try to reduce its
+ *                             condition number and so improve the
+ *                             accuracy of the computed solution)
+ *
+ *        o  options->RowPerm, to specify how to permute the rows of A
+ *                             (typically to control numerical stability)
+ *
+ *        o  options->ColPerm, to specify how to permute the columns of A
+ *                             (typically to control fill-in and enhance
+ *                             parallelism during factorization)
+ *
+ *        o  options->ReplaceTinyPivot, to specify how to deal with tiny
+ *                             pivots encountered during factorization
+ *                             (to control numerical stability)
+ *
+ *      The outputs returned include
+ *
+ *        o  ScalePermstruct,  modified to describe how the input matrix A
+ *                             was equilibrated and permuted:
+ *          .  ScalePermstruct->DiagScale, indicates whether the rows and/or
+ *                                         columns of A were scaled
+ *          .  ScalePermstruct->R, array of row scale factors
+ *          .  ScalePermstruct->C, array of column scale factors
+ *          .  ScalePermstruct->perm_r, row permutation vector
+ *          .  ScalePermstruct->perm_c, column permutation vector
+ *
+ *          (part of ScalePermstruct may also need to be supplied on input,
+ *           depending on options->RowPerm and options->ColPerm as described
+ *           later).
+ *
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix diag(R)*A*diag(C)*Pc^T, where
+ *              Pc is the column permutation matrix determined by
+ *                  ScalePermstruct->perm_c
+ *              diag(R) and diag(C) are diagonal scaling matrices determined
+ *                  by ScalePermstruct->DiagScale, ScalePermstruct->R and
+ *                  ScalePermstruct->C
+ *
+ *        o  LUstruct, which contains the L and U factorization of A1 where
+ *
+ *                A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U
+ *
+ *               (Note that A1 = Pc*Pr*Aout, where Aout is the matrix stored
+ *                in A on output.)
+ *
+ *   3. The second value of options->Fact assumes that a matrix with the same
+ *      sparsity pattern as A has already been factored:
+ *
+ *      -  options->Fact = SamePattern: A is factored, assuming that it has
+ *            the same nonzero pattern as a previously factored matrix. In
+ *            this case the algorithm saves time by reusing the previously
+ *            computed column permutation vector stored in
+ *            ScalePermstruct->perm_c and the "elimination tree" of A
+ *            stored in LUstruct->etree
+ *
+ *      In this case the user must still specify the following options
+ *      as before:
+ *
+ *        o  options->Equil
+ *        o  options->RowPerm
+ *        o  options->ReplaceTinyPivot
+ *
+ *      but not options->ColPerm, whose value is ignored. This is because the
+ *      previous column permutation from ScalePermstruct->perm_c is used as
+ *      input. The user must also supply
+ *
+ *        o  A, the input matrix
+ *        o  ScalePermstruct->perm_c, the column permutation
+ *        o  LUstruct->etree, the elimination tree
+ *
+ *      The outputs returned include
+ *
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix as described above
+ *        o  ScalePermstruct, modified to describe how the input matrix A was
+ *                            equilibrated and row permuted
+ *        o  LUstruct, modified to contain the new L and U factors
+ *
+ *   4. The third value of options->Fact assumes that a matrix B with the same
+ *      sparsity pattern as A has already been factored, and that the
+ *      row permutation of B can be reused for A. This is useful when A and B
+ *      have similar numerical values, so that the same row permutation
+ *      will make both factorizations numerically stable. This lets us reuse
+ *      all of the previously computed structure of L and U.
+ *
+ *      -  options->Fact = SamePattern_SameRowPerm: A is factored,
+ *            assuming not only the same nonzero pattern as the previously
+ *            factored matrix B, but reusing B's row permutation.
+ *
+ *      In this case the user must still specify the following options
+ *      as before:
+ *
+ *        o  options->Equil
+ *        o  options->ReplaceTinyPivot
+ *
+ *      but not options->RowPerm or options->ColPerm, whose values are
+ *      ignored. This is because the permutations from ScalePermstruct->perm_r
+ *      and ScalePermstruct->perm_c are used as input.
+ *
+ *      The user must also supply
+ *
+ *        o  A, the input matrix
+ *        o  ScalePermstruct->DiagScale, how the previous matrix was row
+ *                                       and/or column scaled
+ *        o  ScalePermstruct->R, the row scalings of the previous matrix,
+ *                               if any
+ *        o  ScalePermstruct->C, the column scalings of the previous matrix,
+ *                               if any
+ *        o  ScalePermstruct->perm_r, the row permutation of the previous
+ *                                    matrix
+ *        o  ScalePermstruct->perm_c, the column permutation of the previous
+ *                                    matrix
+ *        o  all of LUstruct, the previously computed information about
+ *                            L and U (the actual numerical values of L and U
+ *                            stored in LUstruct->Llu are ignored)
+ *
+ *      The outputs returned include
+ *
+ *        o  A, the input matrix A overwritten by the scaled and permuted
+ *              matrix as described above
+ *        o  ScalePermstruct,  modified to describe how the input matrix A was
+ *                             equilibrated (thus ScalePermstruct->DiagScale,
+ *                             R and C may be modified)
+ *        o  LUstruct, modified to contain the new L and U factors
+ *
+ *   5. The fourth and last value of options->Fact assumes that A is
+ *      identical to a matrix that has already been factored on a previous
+ *      call, and reuses its entire LU factorization
+ *
+ *      -  options->Fact = FACTORED: A is identical to a previously
+ *            factorized matrix, so the entire previous factorization
+ *            can be reused.
+ *
+ *      In this case all the other options mentioned above are ignored
+ *      (options->Equil, options->RowPerm, options->ColPerm,
+ *       options->ReplaceTinyPivot)
+ *
+ *      The user must also supply
+ *
+ *        o  A, the unfactored matrix, only in the case that iterative
+ *              refinement is to be done (specifically A must be the output
+ *              A from the previous call, so that it has been scaled and permuted)
+ *        o  all of ScalePermstruct
+ *        o  all of LUstruct, including the actual numerical values of
+ *           L and U
+ *
+ *      all of which are unmodified on output.
+ *
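+ * As a sketch of this reuse pattern (error handling omitted; the
+ * variables are assumed to be set up as described under Arguments below):
+ *
+ *     options.Fact = DOFACT;        /* factor the first matrix from scratch */
+ *     psgssvx3d(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid3d,
+ *               &LUstruct, &SOLVEstruct, berr, &stat, &info);
+ *     /* ... load a new matrix with the same sparsity pattern into A ... */
+ *     options.Fact = SamePattern;   /* reuse perm_c and the etree */
+ *     psgssvx3d(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid3d,
+ *               &LUstruct, &SOLVEstruct, berr, &stat, &info);
+ *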
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_dist_options_t* (global)
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *         The following fields should be defined for this structure:
+ *
+ *         o Fact (fact_t)
+ *           Specifies whether or not the factored form of the matrix
+ *           A is supplied on entry, and if not, how the matrix A should
+ *           be factorized based on the previous history.
+ *
+ *           = DOFACT: The matrix A will be factorized from scratch.
+ *                 Inputs:  A
+ *                          options->Equil, RowPerm, ColPerm, ReplaceTinyPivot
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          all of ScalePermstruct
+ *                          all of LUstruct
+ *
+ *           = SamePattern: the matrix A will be factorized assuming
+ *             that a factorization of a matrix with the same sparsity
+ *             pattern was performed prior to this one. Therefore, this
+ *             factorization will reuse column permutation vector
+ *             ScalePermstruct->perm_c and the elimination tree
+ *             LUstruct->etree
+ *                 Inputs:  A
+ *                          options->Equil, RowPerm, ReplaceTinyPivot
+ *                          ScalePermstruct->perm_c
+ *                          LUstruct->etree
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          rest of ScalePermstruct (DiagScale, R, C, perm_r)
+ *                          rest of LUstruct (Glu_persist, Llu)
+ *
+ *           = SamePattern_SameRowPerm: the matrix A will be factorized
+ *             assuming that a factorization of a matrix with the same
+ *             sparsity pattern and similar numerical values was performed
+ *             prior to this one. Therefore, this factorization will reuse
+ *             the row and column scaling factors R and C, the row and
+ *             column permutation vectors perm_r and perm_c, and the
+ *             distributed data structure set up from the previous symbolic
+ *             factorization.
+ *                 Inputs:  A
+ *                          options->Equil, ReplaceTinyPivot
+ *                          all of ScalePermstruct
+ *                          all of LUstruct
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          modified LUstruct->Llu
+ *           = FACTORED: the matrix A is already factored.
+ *                 Inputs:  all of ScalePermstruct
+ *                          all of LUstruct
+ *
+ *         o Equil (yes_no_t)
+ *           Specifies whether to equilibrate the system.
+ *           = NO:  no equilibration.
+ *           = YES: scaling factors are computed to equilibrate the system:
+ *                      diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B.
+ *                  Whether or not the system will be equilibrated depends
+ *                  on the scaling of the matrix A, but if equilibration is
+ *                  used, A is overwritten by diag(R)*A*diag(C) and B by
+ *                  diag(R)*B.
+ *
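+ *                  As a toy illustration (not taken from the library):
+ *                  with R = [1/2, 1] and C = [1, 1/3], the entry a(0,1)
+ *                  becomes R[0]*a(0,1)*C[1] = a(0,1)/6, and b(0) becomes
+ *                  R[0]*b(0) = b(0)/2.
+ *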
+ *         o RowPerm (rowperm_t)
+ *           Specifies how to permute rows of the matrix A.
+ *           = NATURAL:   use the natural ordering.
+ *           = LargeDiag_MC64: use the Duff/Koster algorithm to permute rows of
+ *                        the original matrix to make the diagonal large
+ *                        relative to the off-diagonal.
+ *           = LargeDiag_HWPM: use the parallel approximate-weight perfect
+ *                        matching to permute rows of the original matrix
+ *                        to make the diagonal large relative to the
+ *                        off-diagonal.
+ *           = MY_PERMR:  use the ordering given in ScalePermstruct->perm_r
+ *                        input by the user.
+ *
+ *         o ColPerm (colperm_t)
+ *           Specifies what type of column permutation to use to reduce fill.
+ *           = NATURAL:       natural ordering.
+ *           = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A.
+ *           = MMD_ATA:       minimum degree ordering on structure of A'*A.
+ *           = MY_PERMC:      the ordering given in ScalePermstruct->perm_c.
+ *
+ *         o ReplaceTinyPivot (yes_no_t)
+ *           = NO:  do not modify pivots
+ *           = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during
+ *                  LU factorization.
+ *
+ *         o IterRefine (IterRefine_t)
+ *           Specifies how to perform iterative refinement.
+ *           = NO:     no iterative refinement.
+ *           = SLU_DOUBLE: accumulate residual in double precision.
+ *           = SLU_EXTRA:  accumulate residual in extra precision.
+ *
+ *         NOTE: all options must be identical on all processes when
+ *               calling this routine.
+ *
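+ *         A typical initialization looks as follows (a minimal sketch;
+ *         set_default_options_dist() supplies the defaults and any field
+ *         may be overridden afterwards):
+ *
+ *             superlu_dist_options_t options;
+ *             set_default_options_dist(&options);
+ *             options.Fact = DOFACT;
+ *             options.Equil = YES;
+ *             options.RowPerm = LargeDiag_MC64;
+ *             options.ColPerm = METIS_AT_PLUS_A;
+ *             options.IterRefine = SLU_DOUBLE;
+ *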
+ * A (input/output) SuperMatrix* (local); A resides only on process layer 0.
+ *         On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol).
+ *           The number of linear equations is A->nrow. The type of A must be:
+ *           Stype = SLU_NR_loc; Dtype = SLU_S; Mtype = SLU_GE.
+ *           That is, A is stored in distributed compressed row format.
+ *           See supermatrix.h for the definition of 'SuperMatrix'.
+ *           This routine only handles square A; however, the LU factorization
+ *           routine PSGSTRF can factorize rectangular matrices.
+ *         On exit, A may be overwritten by diag(R)*A*diag(C)*Pc^T,
+ *           depending on ScalePermstruct->DiagScale and options->ColPerm:
+ *             if ScalePermstruct->DiagScale != NOEQUIL, A is overwritten by
+ *                diag(R)*A*diag(C).
+ *             if options->ColPerm != NATURAL, A is further overwritten by
+ *                diag(R)*A*diag(C)*Pc^T.
+ *           If all the above conditions are true, the LU decomposition is
+ *           performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T.
+ *
+ * ScalePermstruct (input/output) sScalePermstruct_t* (global)
+ *         The data structure to store the scaling and permutation vectors
+ *         describing the transformations performed to the matrix A.
+ *         It contains the following fields:
+ *
+ *         o DiagScale (DiagScale_t)
+ *           Specifies the form of equilibration that was done.
+ *           = NOEQUIL: no equilibration.
+ *           = ROW:     row equilibration, i.e., A was premultiplied by
+ *                      diag(R).
+ *           = COL:     Column equilibration, i.e., A was postmultiplied
+ *                      by diag(C).
+ *           = BOTH:    both row and column equilibration, i.e., A was
+ *                      replaced by diag(R)*A*diag(C).
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm,
+ *           DiagScale is an input argument; otherwise it is an output
+ *           argument.
+ *
+ *         o perm_r (int*)
+ *           Row permutation vector, which defines the permutation matrix Pr;
+ *           perm_r[i] = j means row i of A is in position j in Pr*A.
+ *           If options->RowPerm = MY_PERMR, or
+ *           options->Fact = SamePattern_SameRowPerm, perm_r is an
+ *           input argument; otherwise it is an output argument.
+ *
+ *         o perm_c (int*)
+ *           Column permutation vector, which defines the
+ *           permutation matrix Pc; perm_c[i] = j means column i of A is
+ *           in position j in A*Pc.
+ *           If options->ColPerm = MY_PERMC or options->Fact = SamePattern
+ *           or options->Fact = SamePattern_SameRowPerm, perm_c is an
+ *           input argument; otherwise, it is an output argument.
+ *           On exit, perm_c may be overwritten by the product of the input
+ *           perm_c and a permutation that postorders the elimination tree
+ *           of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree
+ *           is already in postorder.
+ *
+ *         o R (float *) dimension (A->nrow)
+ *           The row scale factors for A.
+ *           If DiagScale = ROW or BOTH, A is multiplied on the left by
+ *                          diag(R).
+ *           If DiagScale = NOEQUIL or COL, R is not defined.
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm, R is
+ *           an input argument; otherwise, R is an output argument.
+ *
+ *         o C (float *) dimension (A->ncol)
+ *           The column scale factors for A.
+ *           If DiagScale = COL or BOTH, A is multiplied on the right by
+ *                          diag(C).
+ *           If DiagScale = NOEQUIL or ROW, C is not defined.
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm, C is
+ *           an input argument; otherwise, C is an output argument.
+ *
+ * B       (input/output) float* (local)
+ *         On entry, the right-hand side matrix of dimension (m_loc, nrhs),
+ *           where, m_loc is the number of rows stored locally on my
+ *           process and is defined in the data structure of matrix A.
+ *         On exit, the solution matrix if info = 0.
+ *
+ * ldb     (input) int (local)
+ *         The leading dimension of matrix B.
+ *
+ * nrhs    (input) int (global)
+ *         The number of right-hand sides.
+ *         If nrhs = 0, only LU decomposition is performed; the forward
+ *         and back substitutions are skipped.
+ *
+ * grid3d  (input) gridinfo3d_t* (global)
+ *         The 3D process mesh. It contains the MPI communicator, the number
+ *         of process rows (NPROW), the number of process columns (NPCOL),
+ *         the number of process layers (NPDEP), and my process rank. It is
+ *         an input argument to all the parallel routines.
+ *         grid3d can be initialized by subroutine SUPERLU_GRIDINIT3D.
+ *         See superlu_defs.h for the definition of 'gridinfo3d_t'.
+ *
+ * LUstruct (input/output) sLUstruct_t*
+ *         The data structures to store the distributed L and U factors.
+ *         It contains the following fields:
+ *
+ *         o etree (int*) dimension (A->ncol) (global)
+ *           Elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc'.
+ *           It is computed in sp_colorder() during the first factorization,
+ *           and is reused in the subsequent factorizations of the matrices
+ *           with the same nonzero pattern.
+ *           On exit of sp_colorder(), the columns of A are permuted so that
+ *           the etree is in a certain postorder. This postorder is reflected
+ *           in ScalePermstruct->perm_c.
+ *           NOTE:
+ *           Etree is a vector of parent pointers for a forest whose vertices
+ *           are the integers 0 to A->ncol-1; etree[root]==A->ncol.
+ *
+ *         o Glu_persist (Glu_persist_t*) (global)
+ *           Global data structure (xsup, supno) replicated on all processes,
+ *           describing the supernode partition in the factored matrices
+ *           L and U:
+ *	       xsup[s] is the leading column of the s-th supernode,
+ *             supno[i] is the supernode number to which column i belongs.
+ *
+ *         o Llu (sLocalLU_t*) (local)
+ *           The distributed data structures to store L and U factors.
+ *           See superlu_sdefs.h for the definition of 'sLocalLU_t'.
+ *
+ * SOLVEstruct (input/output) sSOLVEstruct_t*
+ *         The data structure to hold the communication pattern used
+ *         in the phases of triangular solution and iterative refinement.
+ *         This pattern should be initialized only once for repeated solutions.
+ *         If options->SolveInitialized = YES, it is an input argument.
+ *         If options->SolveInitialized = NO and nrhs != 0, it is an output
+ *         argument. See superlu_sdefs.h for the definition of 'sSOLVEstruct_t'.
+ *
+ * berr    (output) float*, dimension (nrhs) (global)
+ *         The componentwise relative backward error of each solution
+ *         vector X(j) (i.e., the smallest relative change in
+ *         any element of A or B that makes X(j) an exact solution).
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics on runtime and floating-point operation count.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info    (output) int*
+ *         = 0: successful exit
+ *         < 0: if info = -i, the i-th argument had an illegal value  
+ *         > 0: if info = i, and i is
+ *             <= A->ncol: U(i,i) is exactly zero. The factorization has
+ *                been completed, but the factor U is exactly singular,
+ *                so the solution could not be computed.
+ *             > A->ncol: number of bytes allocated when memory allocation
+ *                failure occurred, plus A->ncol.
+ *
+ * See superlu_sdefs.h for the definitions of various data types.
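+ *
+ * A complete call, from grid creation to cleanup, looks roughly as follows
+ * (a minimal sketch modeled on the EXAMPLE/ drivers; matrix and
+ * right-hand-side setup are elided):
+ *
+ *     superlu_gridinit3d(MPI_COMM_WORLD, nprow, npcol, npdep, &grid3d);
+ *     set_default_options_dist(&options);
+ *     sScalePermstructInit(m, n, &ScalePermstruct);
+ *     sLUstructInit(n, &LUstruct);
+ *     PStatInit(&stat);
+ *     psgssvx3d(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid3d,
+ *               &LUstruct, &SOLVEstruct, berr, &stat, &info);
+ *     PStatFree(&stat);
+ *     sScalePermstructFree(&ScalePermstruct);
+ *     sLUstructFree(&LUstruct);
+ *     superlu_gridexit3d(&grid3d);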
+ * 
+ */ + +void +psgssvx3d (superlu_dist_options_t * options, SuperMatrix * A, + sScalePermstruct_t * ScalePermstruct, + float B[], int ldb, int nrhs, gridinfo3d_t * grid3d, + sLUstruct_t * LUstruct, sSOLVEstruct_t * SOLVEstruct, + float *berr, SuperLUStat_t * stat, int *info) +{ + NRformat_loc *Astore; + SuperMatrix GA; /* Global A in NC format */ + NCformat *GAstore; + float *a_GA; + SuperMatrix GAC; /* Global A in NCP format (add n end pointers) */ + NCPformat *GACstore; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + Glu_freeable_t *Glu_freeable; + /* The nonzero structures of L and U factors, which are + replicated on all processrs. + (lsub, xlsub) contains the compressed subscript of + supernodes in L. + (usub, xusub) contains the compressed subscript of + nonzero segments in U. + If options->Fact != SamePattern_SameRowPerm, they are + computed by SYMBFACT routine, and then used by PDDISTRIBUTE + routine. They will be freed after PDDISTRIBUTE routine. + If options->Fact == SamePattern_SameRowPerm, these + structures are not used. */ + yes_no_t parSymbFact = options->ParSymbFact; + fact_t Fact; + float *a; + int_t *colptr, *rowind; + int_t *perm_r; /* row permutations from partial pivoting */ + int_t *perm_c; /* column permutation vector */ + int_t *etree; /* elimination tree */ + int_t *rowptr, *colind; /* Local A in NR */ + int_t colequ, Equil, factored, job, notran, rowequ, need_value; + int_t i, iinfo, j, irow, m, n, nnz, permc_spec; + int_t nnz_loc, m_loc, fst_row, icol; + int iam; + int ldx; /* LDA for matrix X (local). */ + char equed[1], norm[1]; + float *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd; + float *X, *b_col, *b_work, *x_col; + double t; + float GA_mem_use; /* memory usage by global A */ + float dist_mem_use; /* memory usage during distribution */ + superlu_dist_mem_usage_t num_mem_usage, symb_mem_usage; +#if ( PRNTlevel>= 2 ) + double dmin, dsum, dprod; +#endif + + // get the 2d grid + gridinfo_t *grid = &(grid3d->grid2d); + iam = grid->iam; + + /* Initialization. */ + /* Save the inputs: ldb -> ldb3d, and B -> B3d, Astore -> Astore3d + B3d and Astore3d will be restored on return */ + int ldb3d = ldb; + // float *B3d = B; + NRformat_loc *Astore3d = (NRformat_loc *)A->Store; + float *B2d; + NRformat_loc3d *A3d = sGatherNRformat_loc3d((NRformat_loc *)A->Store, + B, ldb, nrhs, grid3d); + B2d = (float *) A3d->B2d; + NRformat_loc *Astore0 = A3d->A_nfmt; // on 2D grid-0 + NRformat_loc *A_orig = A->Store; + + /* definition of factored seen by each process layer */ + Fact = options->Fact; + factored = (Fact == FACTORED); + + /* Test the options choices. 
*/ + *info = 0; + Fact = options->Fact; + if (Fact < 0 || Fact > FACTORED) + *info = -1; + else if (options->RowPerm < 0 || options->RowPerm > MY_PERMR) + *info = -1; + else if (options->ColPerm < 0 || options->ColPerm > MY_PERMC) + *info = -1; + else if (options->IterRefine < 0 || options->IterRefine > SLU_EXTRA) + *info = -1; + else if (options->IterRefine == SLU_EXTRA) { + *info = -1; + fprintf (stderr, + "Extra precise iterative refinement yet to support."); + } else if (A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc + || A->Dtype != SLU_S || A->Mtype != SLU_GE) + *info = -2; + else if (ldb < Astore3d->m_loc) + *info = -5; + else if (nrhs < 0) { + *info = -6; + } + if (*info) { + i = -(*info); + pxerr_dist ("psgssvx3d", grid, -*info); + return; + } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter psgssvx3d()"); +#endif + + /* Perform preprocessing steps on process layer zero, including: + gather 3D matrices {A, B} onto 2D grid-0, + ordering, symbolic factorization, distribution of L & U */ + +#define NRFRMT + + if (grid3d->zscp.Iam == 0) + { + m = A->nrow; + n = A->ncol; + // checkNRFMT(Astore0, (NRformat_loc *) A->Store); +#ifdef NRFRMT + // On input, A->Store is on 3D, now A->Store is re-assigned to 2D store + A->Store = Astore0; + ldb = Astore0->m_loc; + B = B2d; // B is now re-assigned to B2d + //PrintDouble5("after gather B=B2d", ldb, B); +#endif + + /* The following code now works on 2D grid-0 */ + Astore = (NRformat_loc *) A->Store; + nnz_loc = Astore->nnz_loc; + m_loc = Astore->m_loc; + fst_row = Astore->fst_row; + a = (float *) Astore->nzval; + rowptr = Astore->rowptr; + colind = Astore->colind; + + /* Structures needed for parallel symbolic factorization */ + int_t *sizes, *fstVtxSep; + int noDomains, nprocs_num; + MPI_Comm symb_comm; /* communicator for symbolic factorization */ + int col, key; /* parameters for creating a new communicator */ + Pslu_freeable_t Pslu_freeable; + float flinfo; + + sizes = NULL; + fstVtxSep = NULL; + symb_comm = MPI_COMM_NULL; + + Equil = (!factored && options->Equil == YES); + notran = (options->Trans == NOTRANS); + + iam = grid->iam; + job = 5; + if (factored || (Fact == SamePattern_SameRowPerm && Equil)) + { + rowequ = (ScalePermstruct->DiagScale == ROW) || + (ScalePermstruct->DiagScale == BOTH); + colequ = (ScalePermstruct->DiagScale == COL) || + (ScalePermstruct->DiagScale == BOTH); + } + else + rowequ = colequ = FALSE; + + /* The following arrays are replicated on all processes. */ + perm_r = ScalePermstruct->perm_r; + perm_c = ScalePermstruct->perm_c; + etree = LUstruct->etree; + R = ScalePermstruct->R; + C = ScalePermstruct->C; + /********/ + + /* Not factored & ask for equilibration */ + if (Equil && Fact != SamePattern_SameRowPerm) { + /* Allocate storage if not done so before. */ + switch (ScalePermstruct->DiagScale) { + case NOEQUIL: + if (!(R = (float *) floatMalloc_dist (m))) + ABORT ("Malloc fails for R[]."); + if (!(C = (float *) floatMalloc_dist (n))) + ABORT ("Malloc fails for C[]."); + ScalePermstruct->R = R; + ScalePermstruct->C = C; + break; + case ROW: + if (!(C = (float *) floatMalloc_dist (n))) + ABORT ("Malloc fails for C[]."); + ScalePermstruct->C = C; + break; + case COL: + if (!(R = (float *) floatMalloc_dist (m))) + ABORT ("Malloc fails for R[]."); + ScalePermstruct->R = R; + break; + default: break; + } + } + + /* ------------------------------------------------------------ + Diagonal scaling to equilibrate the matrix. 
+ ------------------------------------------------------------ */ + if (Equil) { +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter equil"); +#endif + t = SuperLU_timer_ (); + + if (Fact == SamePattern_SameRowPerm) { + /* Reuse R and C. */ + switch (ScalePermstruct->DiagScale) { + case NOEQUIL: + break; + case ROW: + irow = fst_row; + for (j = 0; j < m_loc; ++j) { + for (i = rowptr[j]; i < rowptr[j + 1]; ++i) { + a[i] *= R[irow]; /* Scale rows. */ + } + ++irow; + } + break; + case COL: + for (j = 0; j < m_loc; ++j) + for (i = rowptr[j]; i < rowptr[j + 1]; ++i) { + icol = colind[i]; + a[i] *= C[icol]; /* Scale columns. */ + } + break; + case BOTH: + irow = fst_row; + for (j = 0; j < m_loc; ++j) + { + for (i = rowptr[j]; i < rowptr[j + 1]; ++i) + { + icol = colind[i]; + a[i] *= R[irow] * C[icol]; /* Scale rows and cols. */ + } + ++irow; + } + break; + } + } else { /* Compute R & C from scratch */ + /* Compute the row and column scalings. */ + psgsequ (A, R, C, &rowcnd, &colcnd, &amax, &iinfo, grid); + + if ( iinfo > 0 ) { + if ( iinfo <= m ) { +#if ( PRNTlevel>=1 ) + fprintf(stderr, "The " IFMT "-th row of A is exactly zero\n", iinfo); +#endif + } else { +#if ( PRNTlevel>=1 ) + fprintf(stderr, "The " IFMT "-th column of A is exactly zero\n", iinfo-n); +#endif + } + } else if ( iinfo < 0 ) return; + + /* Now iinfo == 0 */ + + /* Equilibrate matrix A if it is badly-scaled. + A <-- diag(R)*A*diag(C) */ + pslaqgs (A, R, C, rowcnd, colcnd, amax, equed); + + if ( strncmp(equed, "R", 1)==0 ) { + ScalePermstruct->DiagScale = ROW; + rowequ = ROW; + } else if ( strncmp(equed, "C", 1)==0 ) { + ScalePermstruct->DiagScale = COL; + colequ = COL; + } else if ( strncmp(equed, "B", 1)==0 ) { + ScalePermstruct->DiagScale = BOTH; + rowequ = ROW; + colequ = COL; + } else ScalePermstruct->DiagScale = NOEQUIL; + +#if ( PRNTlevel>=1 ) + if (iam==0) { + printf (".. equilibrated? *equed = %c\n", *equed); + fflush(stdout); + } +#endif + } /* end if-else Fact ... */ + + stat->utime[EQUIL] = SuperLU_timer_ () - t; +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit equil"); +#endif + } /* end if Equil ... LAPACK style, not involving MC64 */ + + if (!factored) { /* Skip this if already factored. */ + /* + * Gather A from the distributed compressed row format to + * global A in compressed column format. + * Numerical values are gathered only when a row permutation + * for large diagonal is sought after. + */ + if (Fact != SamePattern_SameRowPerm && + (parSymbFact == NO || options->RowPerm != NO)) { + + need_value = (options->RowPerm == LargeDiag_MC64); + + psCompRow_loc_to_CompCol_global (need_value, A, grid, &GA); + + GAstore = (NCformat *) GA.Store; + colptr = GAstore->colptr; + rowind = GAstore->rowind; + nnz = GAstore->nnz; + GA_mem_use = (nnz + n + 1) * sizeof (int_t); + + if (need_value) { + a_GA = (float *) GAstore->nzval; + GA_mem_use += nnz * sizeof (float); + } + + else + assert (GAstore->nzval == NULL); + } + + /* ------------------------------------------------------------ + Find the row permutation for A. + ------------------------------------------------------------ */ + if (options->RowPerm != NO) { + t = SuperLU_timer_ (); + if (Fact != SamePattern_SameRowPerm) { + if (options->RowPerm == MY_PERMR) { + /* Use user's perm_r. */ + /* Permute the global matrix GA for symbfact() */ + for (i = 0; i < colptr[n]; ++i) { + irow = rowind[i]; + rowind[i] = perm_r[irow]; + } + } else if ( options->RowPerm == LargeDiag_MC64 ) { + /* Get a new perm_r[] */ + if (job == 5) { + /* Allocate storage for scaling factors. 
*/ + if (!(R1 = floatMalloc_dist (m))) + ABORT ("SUPERLU_MALLOC fails for R1[]"); + if (!(C1 = floatMalloc_dist (n))) + ABORT ("SUPERLU_MALLOC fails for C1[]"); + } + + if ( iam==0 ) { + /* Process 0 finds a row permutation */ + iinfo = sldperm_dist (job, m, nnz, colptr, rowind, a_GA, + perm_r, R1, C1); + MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); + if ( iinfo == 0 ) { + MPI_Bcast (perm_r, m, mpi_int_t, 0, grid->comm); + if (job == 5 && Equil) { + MPI_Bcast (R1, m, MPI_FLOAT, 0, grid->comm); + MPI_Bcast (C1, n, MPI_FLOAT, 0, grid->comm); + } + } + } else { + MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); + if ( iinfo == 0 ) { + MPI_Bcast (perm_r, m, mpi_int_t, 0, grid->comm); + if (job == 5 && Equil) { + MPI_Bcast (R1, m, MPI_FLOAT, 0, grid->comm); + MPI_Bcast (C1, n, MPI_FLOAT, 0, grid->comm); + } + } + } + + if ( iinfo && job == 5) { /* Error return */ + SUPERLU_FREE(R1); + SUPERLU_FREE(C1); + } +#if ( PRNTlevel>=2 ) + dmin = damch_dist ("Overflow"); + dsum = 0.0; + dprod = 1.0; +#endif + if ( iinfo == 0 ) { + if (job == 5) { + if ( Equil ) { + for (i = 0; i < n; ++i) { + R1[i] = exp (R1[i]); + C1[i] = exp (C1[i]); + } + + /* Scale the distributed matrix further. + A <-- diag(R1)*A*diag(C1) */ + irow = fst_row; + for (j = 0; j < m_loc; ++j) { + for (i = rowptr[j]; i < rowptr[j + 1]; ++i) { + icol = colind[i]; + a[i] *= R1[irow] * C1[icol]; +#if ( PRNTlevel>=2 ) + if (perm_r[irow] == icol) { + /* New diagonal */ + if (job == 2 || job == 3) + dmin = SUPERLU_MIN(dmin, fabs(a[i])); + else if (job == 4) + dsum += fabs(a[i]); + else if (job == 5) + dprod *= fabs(a[i]); + } +#endif + } + ++irow; + } + + /* Multiply together the scaling factors -- + R/C from simple scheme, R1/C1 from MC64. */ + if (rowequ) + for (i = 0; i < m; ++i) R[i] *= R1[i]; + else + for (i = 0; i < m; ++i) R[i] = R1[i]; + if (colequ) + for (i = 0; i < n; ++i) C[i] *= C1[i]; + else + for (i = 0; i < n; ++i) C[i] = C1[i]; + + ScalePermstruct->DiagScale = BOTH; + rowequ = colequ = 1; + + } /* end if Equil */ + + /* Now permute global A to prepare for symbfact() */ + for (j = 0; j < n; ++j) { + for (i = colptr[j]; i < colptr[j + 1]; ++i) { + irow = rowind[i]; + rowind[i] = perm_r[irow]; + } + } + SUPERLU_FREE (R1); + SUPERLU_FREE (C1); + } else { /* job = 2,3,4 */ + for (j = 0; j < n; ++j) { + for (i = colptr[j]; i < colptr[j + 1]; ++i) + { + irow = rowind[i]; + rowind[i] = perm_r[irow]; + } /* end for i ... */ + } /* end for j ... */ + } /* end else job ... */ + } else { /* if iinfo != 0 */ + for (i = 0; i < m; ++i) perm_r[i] = i; + } +#if ( PRNTlevel>=2 ) + if (job == 2 || job == 3) { + if (!iam) + printf ("\tsmallest diagonal %e\n", dmin); + } else if (job == 4) { + if (!iam) + printf ("\tsum of diagonal %e\n", dsum); + } else if (job == 5) { + if (!iam) + printf ("\t product of diagonal %e\n", dprod); + } +#endif + } else { /* use LargeDiag_HWPM */ +#ifdef HAVE_COMBBLAS + s_c2cpp_GetHWPM(A, grid, ScalePermstruct); +#else + if ( iam == 0 ) { + printf("CombBLAS is not available\n"); fflush(stdout); + } +#endif + } /* end if-else options->RowPerm ... */ + + t = SuperLU_timer_ () - t; + stat->utime[ROWPERM] = t; +#if ( PRNTlevel>=1 ) + if ( !iam ) { + printf(".. LDPERM job " IFMT "\t time: %.2f\n", job, t); + fflush(stdout); + } +#endif + } /* end if Fact not SamePattern_SameRowPerm ... 
*/ + } else { /* options->RowPerm == NOROWPERM / NATURAL */ + for (i = 0; i < m; ++i) perm_r[i] = i; + } + +#if ( DEBUGlevel>=2 ) + if (!iam) + PrintInt10 ("perm_r", m, perm_r); +#endif + } /* end if (!factored) */ + + if (!factored || options->IterRefine) { + /* Compute norm(A), which will be used to adjust small diagonal. */ + if (notran) + *(unsigned char *) norm = '1'; + else + *(unsigned char *) norm = 'I'; + anorm = pslangs (norm, A, grid); +#if ( PRNTlevel>=1 ) + if (!iam) { + printf (".. anorm %e\n", anorm); fflush(stdout); + } +#endif + } + + + /* ------------------------------------------------------------ + Perform the LU factorization. + ------------------------------------------------------------ */ + if (!factored) { + t = SuperLU_timer_ (); + /* + * Get column permutation vector perm_c[], according to permc_spec: + * permc_spec = NATURAL: natural ordering + * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A + * permc_spec = MMD_ATA: minimum degree on structure of A'*A + * permc_spec = METIS_AT_PLUS_A: METIS on structure of A'+A + * permc_spec = PARMETIS: parallel METIS on structure of A'+A + * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] + */ + permc_spec = options->ColPerm; + + if (parSymbFact == YES || permc_spec == PARMETIS) { + nprocs_num = grid->nprow * grid->npcol; + noDomains = (int) (pow (2, ((int) LOG2 (nprocs_num)))); + + /* create a new communicator for the first noDomains + processes in grid->comm */ + key = iam; + if (iam < noDomains) + col = 0; + else + col = MPI_UNDEFINED; + MPI_Comm_split (grid->comm, col, key, &symb_comm); + + if (permc_spec == NATURAL || permc_spec == MY_PERMC) { + if (permc_spec == NATURAL) + { + for (j = 0; j < n; ++j) + perm_c[j] = j; + } + if (!(sizes = intMalloc_dist (2 * noDomains))) + ABORT ("SUPERLU_MALLOC fails for sizes."); + if (!(fstVtxSep = intMalloc_dist (2 * noDomains))) + ABORT ("SUPERLU_MALLOC fails for fstVtxSep."); + for (i = 0; i < 2 * noDomains - 2; ++i) { + sizes[i] = 0; + fstVtxSep[i] = 0; + } + sizes[2 * noDomains - 2] = m; + fstVtxSep[2 * noDomains - 2] = 0; + } else if (permc_spec != PARMETIS) { + /* same as before */ + printf("{%4d,%4d}: psgssvx3d: invalid ColPerm option when ParSymbfact is used\n", + (int) MYROW(grid->iam, grid), (int) MYCOL(grid->iam, grid)); + } + } /* end ... use parmetis */ + + if (permc_spec != MY_PERMC && Fact == DOFACT) { + if (permc_spec == PARMETIS) { + /* Get column permutation vector in perm_c. * + * This routine takes as input the distributed input matrix A * + * and does not modify it. It also allocates memory for * + * sizes[] and fstVtxSep[] arrays, that contain information * + * on the separator tree computed by ParMETIS. */ + flinfo = get_perm_c_parmetis (A, perm_r, perm_c, nprocs_num, + noDomains, &sizes, &fstVtxSep, + grid, &symb_comm); + if (flinfo > 0) + ABORT ("ERROR in get perm_c parmetis."); + } else { + get_perm_c_dist (iam, permc_spec, &GA, perm_c); + } + } + + stat->utime[COLPERM] = SuperLU_timer_ () - t; + + /* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc' + (a.k.a. column etree), depending on the choice of ColPerm. + Adjust perm_c[] to be consistent with a postorder of etree. + Permute columns of A to form A*Pc'. */ + if (Fact != SamePattern_SameRowPerm) { + if (parSymbFact == NO) { + + int_t *GACcolbeg, *GACcolend, *GACrowind; + + sp_colorder (options, &GA, perm_c, etree, &GAC); + + /* Form Pc*A*Pc' to preserve the diagonal of the matrix GAC. 
*/ + GACstore = (NCPformat *) GAC.Store; + GACcolbeg = GACstore->colbeg; + GACcolend = GACstore->colend; + GACrowind = GACstore->rowind; + for (j = 0; j < n; ++j) { + for (i = GACcolbeg[j]; i < GACcolend[j]; ++i) { + irow = GACrowind[i]; + GACrowind[i] = perm_c[irow]; + } + } + + /* Perform a symbolic factorization on Pc*Pr*A*Pc' and set up + the nonzero data structures for L & U. */ +#if ( PRNTlevel>=1 ) + if (!iam) + printf + (".. symbfact(): relax %4d, maxsuper %4d, fill %4d\n", + sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6)); +#endif + t = SuperLU_timer_ (); + if (!(Glu_freeable = (Glu_freeable_t *) + SUPERLU_MALLOC (sizeof (Glu_freeable_t)))) + ABORT ("Malloc fails for Glu_freeable."); + + /* Every process does this. */ + iinfo = symbfact (options, iam, &GAC, perm_c, etree, + Glu_persist, Glu_freeable); + + stat->utime[SYMBFAC] = SuperLU_timer_ () - t; + if (iinfo < 0) { + /* Successful return */ + QuerySpace_dist (n, -iinfo, Glu_freeable, &symb_mem_usage); + +#if ( PRNTlevel>=1 ) + if (!iam) { + printf ("\tNo of supers %ld\n", + (long) Glu_persist->supno[n - 1] + 1); + printf ("\tSize of G(L) %ld\n", (long) Glu_freeable->xlsub[n]); + printf ("\tSize of G(U) %ld\n", (long) Glu_freeable->xusub[n]); + printf ("\tint %lu, short %lu, float %lu, double %lu\n", + sizeof(int_t), sizeof (short), + sizeof(float), sizeof (double)); + printf + ("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", + symb_mem_usage.for_lu * 1e-6, + symb_mem_usage.total * 1e-6, + symb_mem_usage.expansions); + } +#endif + } else { + if (!iam) { + fprintf (stderr, "symbfact() error returns %d\n", + (int) iinfo); + exit (-1); + } + } + + } /* end serial symbolic factorization */ + else { /* parallel symbolic factorization */ + t = SuperLU_timer_ (); + flinfo = + symbfact_dist (nprocs_num, noDomains, A, perm_c, perm_r, + sizes, fstVtxSep, &Pslu_freeable, + &(grid->comm), &symb_comm, + &symb_mem_usage); + stat->utime[SYMBFAC] = SuperLU_timer_ () - t; + if (flinfo > 0) + ABORT + ("Insufficient memory for parallel symbolic factorization."); + } + + /* Destroy GA */ + if (parSymbFact == NO || options->RowPerm != NO) + Destroy_CompCol_Matrix_dist (&GA); + if (parSymbFact == NO) + Destroy_CompCol_Permuted_dist (&GAC); + + } /* end if Fact not SamePattern_SameRowPerm */ + + if (sizes) + SUPERLU_FREE (sizes); + if (fstVtxSep) + SUPERLU_FREE (fstVtxSep); + if (symb_comm != MPI_COMM_NULL) + MPI_Comm_free (&symb_comm); + + if (parSymbFact == NO || Fact == SamePattern_SameRowPerm) { + /* Apply column permutation to the original distributed A */ + for (j = 0; j < nnz_loc; ++j) + colind[j] = perm_c[colind[j]]; + + /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. + NOTE: the row permutation Pc*Pr is applied internally in the + distribution routine. */ + t = SuperLU_timer_ (); + dist_mem_use = psdistribute (Fact, n, A, ScalePermstruct, + Glu_freeable, LUstruct, grid); + stat->utime[DIST] = SuperLU_timer_ () - t; + + /* Deallocate storage used in symbolic factorization. */ + if (Fact != SamePattern_SameRowPerm) + { + iinfo = symbfact_SubFree (Glu_freeable); + SUPERLU_FREE (Glu_freeable); + } + } else { + /* Distribute Pc*Pr*diag(R)*A*diag(C)*Pc' into L and U storage. + NOTE: the row permutation Pc*Pr is applied internally in the + distribution routine. 
*/ + /* Apply column permutation to the original distributed A */ + for (j = 0; j < nnz_loc; ++j) + colind[j] = perm_c[colind[j]]; + + t = SuperLU_timer_ (); + dist_mem_use = sdist_psymbtonum (Fact, n, A, ScalePermstruct, + &Pslu_freeable, LUstruct, grid); + if (dist_mem_use > 0) + ABORT ("Not enough memory available for dist_psymbtonum\n"); + + stat->utime[DIST] = SuperLU_timer_ () - t; + } + + /*if (!iam) printf ("\tDISTRIBUTE time %8.2f\n", stat->utime[DIST]); */ + } /* end if not Factored */ + } /* end if process layer 0 */ + + trf3Dpartition_t* trf3Dpartition; + + /* Perform numerical factorization in parallel on all process layers.*/ + if ( !factored ) { + + /* send the data across all the layers */ + MPI_Bcast( &m, 1, mpi_int_t, 0, grid3d->zscp.comm); + MPI_Bcast( &n, 1, mpi_int_t, 0, grid3d->zscp.comm); + MPI_Bcast( &anorm, 1, MPI_FLOAT, 0, grid3d->zscp.comm); + + /* send the LU structure to all the grids */ + sp3dScatter(n, LUstruct, grid3d); + + int_t nsupers = getNsupers(n, LUstruct->Glu_persist); + trf3Dpartition = sinitTrf3Dpartition(nsupers, options, LUstruct, grid3d); + + SCT_t *SCT = (SCT_t *) SUPERLU_MALLOC(sizeof(SCT_t)); + SCT_init(SCT); + +#if ( PRNTlevel>=1 ) + if (grid3d->iam == 0) { + printf("after 3D initialization.\n"); fflush(stdout); + } +#endif + + t = SuperLU_timer_ (); + + /*factorize in grid 1*/ + // if(grid3d->zscp.Iam) + + psgstrf3d (options, m, n, anorm, trf3Dpartition, SCT, LUstruct, + grid3d, stat, info); + stat->utime[FACT] = SuperLU_timer_ () - t; + + double tgather = SuperLU_timer_(); + + sgatherAllFactoredLU(trf3Dpartition, LUstruct, grid3d, SCT); + + SCT->gatherLUtimer += SuperLU_timer_() - tgather; + /*print stats for bottom grid*/ + +#if ( PRNTlevel>=1 ) + if (!grid3d->zscp.Iam) + { + SCT_print(grid, SCT); + SCT_print3D(grid3d, SCT); + } + SCT_printComm3D(grid3d, SCT); + + /*print memory usage*/ + s3D_printMemUse( trf3Dpartition, LUstruct, grid3d ); + + /*print forest weight and costs*/ + printForestWeightCost(trf3Dpartition->sForests, SCT, grid3d); + /*reduces stat from all the layers*/ +#endif + + sDestroy_trf3Dpartition(trf3Dpartition, grid3d); + SCT_free(SCT); + + } /* end if not Factored */ + + if ( grid3d->zscp.Iam == 0 ) { // only process layer 0 + if (!factored) { + if (options->PrintStat) { + int_t TinyPivots; + float for_lu, total, max, avg, temp; + + sQuerySpace_dist (n, LUstruct, grid, stat, &num_mem_usage); + + if (parSymbFact == TRUE) { + /* The memory used in the redistribution routine + includes the memory used for storing the symbolic + structure and the memory allocated for numerical factorization */ + temp = SUPERLU_MAX (symb_mem_usage.total, -dist_mem_use); + if (options->RowPerm != NO) + temp = SUPERLU_MAX (temp, GA_mem_use); + } + else { + temp = SUPERLU_MAX (symb_mem_usage.total + GA_mem_use, /* symbfact step */ + symb_mem_usage.for_lu + dist_mem_use + num_mem_usage.for_lu /* distribution step */ + ); + } + + temp = SUPERLU_MAX (temp, num_mem_usage.total); + + MPI_Reduce (&temp, &max, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm); + MPI_Reduce (&temp, &avg, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); + MPI_Allreduce (&stat->TinyPivots, &TinyPivots, 1, mpi_int_t, + MPI_SUM, grid->comm); + stat->TinyPivots = TinyPivots; + + MPI_Reduce (&num_mem_usage.for_lu, &for_lu, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); + MPI_Reduce (&num_mem_usage.total, &total, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); + + if (!iam) { + printf("\tNUMfact space (MB) sum(procs): L\\U\t%.2f\tall\t%.2f\n", + for_lu * 1e-6, total * 1e-6); + printf ("\tTotal highmark (MB): 
" + "All\t%.2f\tAvg\t%.2f\tMax\t%.2f\n", avg * 1e-6, + avg / grid->nprow / grid->npcol * 1e-6, max * 1e-6); + printf("**************************************************\n"); + fflush(stdout); + } + } + + } /* end if not Factored */ + + /* ------------------------------------------------------------ + Compute the solution matrix X. + ------------------------------------------------------------ */ + if (nrhs) { + if (!(b_work = floatMalloc_dist (n))) + ABORT ("Malloc fails for b_work[]"); + + /* ------------------------------------------------------ + Scale the right-hand side if equilibration was performed + ------------------------------------------------------*/ + if (notran) + { + if (rowequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + b_col[i] *= R[irow]; + ++irow; + } + b_col += ldb; + } + } + } + else if (colequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + b_col[i] *= C[irow]; + ++irow; + } + b_col += ldb; + } + } + + /* Save a copy of the right-hand side. */ + ldx = ldb; + if (!(X = floatMalloc_dist (((size_t) ldx) * nrhs))) + ABORT ("Malloc fails for X[]"); + x_col = X; + b_col = B; + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < m_loc; ++i) x_col[i] = b_col[i]; + x_col += ldx; + b_col += ldb; + } + + /* ------------------------------------------------------ + Solve the linear system. + ------------------------------------------------------*/ + if (options->SolveInitialized == NO) /* First time */ + /* Inside this routine, SolveInitialized is set to YES. + For repeated call to psgssvx3d(), no need to re-initialilze + the Solve data & communication structures, unless a new + factorization with Fact == DOFACT or SamePattern is asked for. */ + { + sSolveInit (options, A, perm_r, perm_c, nrhs, LUstruct, + grid, SOLVEstruct); + } + stat->utime[SOLVE] = 0.0; +#if 0 // Sherry: the following interface is needed by 3D trisolve. + psgstrs_vecpar (n, LUstruct, ScalePermstruct, grid, X, m_loc, + fst_row, ldb, nrhs, SOLVEstruct, stat, info); +#else + psgstrs(n, LUstruct, ScalePermstruct, grid, X, m_loc, + fst_row, ldb, nrhs, SOLVEstruct, stat, info); +#endif + + /* ------------------------------------------------------------ + Use iterative refinement to improve the computed solution and + compute error bounds and backward error estimates for it. + ------------------------------------------------------------ */ + if (options->IterRefine) + { + /* Improve the solution by iterative refinement. */ + int_t *it, *colind_gsmv = SOLVEstruct->A_colind_gsmv; + sSOLVEstruct_t *SOLVEstruct1; /* Used by refinement */ + + t = SuperLU_timer_ (); + if (options->RefineInitialized == NO || Fact == DOFACT) { + /* All these cases need to re-initialize gsmv structure */ + if (options->RefineInitialized) + psgsmv_finalize (SOLVEstruct->gsmv_comm); + psgsmv_init (A, SOLVEstruct->row_to_proc, grid, + SOLVEstruct->gsmv_comm); + + /* Save a copy of the transformed local col indices + in colind_gsmv[]. 
*/ + if (colind_gsmv) SUPERLU_FREE (colind_gsmv); + if (!(it = intMalloc_dist (nnz_loc))) + ABORT ("Malloc fails for colind_gsmv[]"); + colind_gsmv = SOLVEstruct->A_colind_gsmv = it; + for (i = 0; i < nnz_loc; ++i) colind_gsmv[i] = colind[i]; + options->RefineInitialized = YES; + } + else if (Fact == SamePattern || Fact == SamePattern_SameRowPerm) { + float at; + int_t k, jcol, p; + /* Swap to beginning the part of A corresponding to the + local part of X, as was done in pdgsmv_init() */ + for (i = 0; i < m_loc; ++i) { /* Loop through each row */ + k = rowptr[i]; + for (j = rowptr[i]; j < rowptr[i + 1]; ++j) + { + jcol = colind[j]; + p = SOLVEstruct->row_to_proc[jcol]; + if (p == iam) + { /* Local */ + at = a[k]; + a[k] = a[j]; + a[j] = at; + ++k; + } + } + } + + /* Re-use the local col indices of A obtained from the + previous call to pdgsmv_init() */ + for (i = 0; i < nnz_loc; ++i) + colind[i] = colind_gsmv[i]; + } + + if (nrhs == 1) + { /* Use the existing solve structure */ + SOLVEstruct1 = SOLVEstruct; + } + else { + /* For nrhs > 1, since refinement is performed for RHS + one at a time, the communication structure for pdgstrs + is different than the solve with nrhs RHS. + So we use SOLVEstruct1 for the refinement step. + */ + if (!(SOLVEstruct1 = (sSOLVEstruct_t *) + SUPERLU_MALLOC(sizeof(sSOLVEstruct_t)))) + ABORT ("Malloc fails for SOLVEstruct1"); + /* Copy the same stuff */ + SOLVEstruct1->row_to_proc = SOLVEstruct->row_to_proc; + SOLVEstruct1->inv_perm_c = SOLVEstruct->inv_perm_c; + SOLVEstruct1->num_diag_procs = SOLVEstruct->num_diag_procs; + SOLVEstruct1->diag_procs = SOLVEstruct->diag_procs; + SOLVEstruct1->diag_len = SOLVEstruct->diag_len; + SOLVEstruct1->gsmv_comm = SOLVEstruct->gsmv_comm; + SOLVEstruct1->A_colind_gsmv = SOLVEstruct->A_colind_gsmv; + + /* Initialize the *gstrs_comm for 1 RHS. */ + if (!(SOLVEstruct1->gstrs_comm = (pxgstrs_comm_t *) + SUPERLU_MALLOC (sizeof (pxgstrs_comm_t)))) + ABORT ("Malloc fails for gstrs_comm[]"); + psgstrs_init (n, m_loc, 1, fst_row, perm_r, perm_c, grid, + Glu_persist, SOLVEstruct1); + } + + psgsrfs (n, A, anorm, LUstruct, ScalePermstruct, grid, + B, ldb, X, ldx, nrhs, SOLVEstruct1, berr, stat, info); + + /* Deallocate the storage associated with SOLVEstruct1 */ + if (nrhs > 1) + { + pxgstrs_finalize (SOLVEstruct1->gstrs_comm); + SUPERLU_FREE (SOLVEstruct1); + } + + stat->utime[REFINE] = SuperLU_timer_ () - t; + } + + /* Permute the solution matrix B <= Pc'*X. */ + psPermute_Dense_Matrix (fst_row, m_loc, SOLVEstruct->row_to_proc, + SOLVEstruct->inv_perm_c, + X, ldx, B, ldb, nrhs, grid); +#if ( DEBUGlevel>=2 ) + printf ("\n (%d) .. After pdPermute_Dense_Matrix(): b =\n", iam); + for (i = 0; i < m_loc; ++i) + printf ("\t(%d)\t%4d\t%.10f\n", iam, i + fst_row, B[i]); +#endif + + /* Transform the solution matrix X to a solution of the original + system before the equilibration. */ + if (notran) + { + if (colequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + b_col[i] *= C[irow]; + ++irow; + } + b_col += ldb; + } + } + } + else if (rowequ) + { + b_col = B; + for (j = 0; j < nrhs; ++j) + { + irow = fst_row; + for (i = 0; i < m_loc; ++i) + { + b_col[i] *= R[irow]; + ++irow; + } + b_col += ldb; + } + } + + SUPERLU_FREE (b_work); + SUPERLU_FREE (X); + + } /* end if nrhs != 0 */ + +#if ( PRNTlevel>=1 ) + if (!iam) + printf (".. DiagScale = %d\n", ScalePermstruct->DiagScale); +#endif + + /* Deallocate R and/or C if it was not used. 
*/ + if (Equil && Fact != SamePattern_SameRowPerm) + { + switch (ScalePermstruct->DiagScale) { + case NOEQUIL: + SUPERLU_FREE (R); + SUPERLU_FREE (C); + break; + case ROW: + SUPERLU_FREE (C); + break; + case COL: + SUPERLU_FREE (R); + break; + default: break; + } + } + +#if 0 + if (!factored && Fact != SamePattern_SameRowPerm && !parSymbFact) + Destroy_CompCol_Permuted_dist (&GAC); +#endif + + } /* process layer 0 done solve */ + +#ifdef NRFRMT + /* Scatter the solution from 2D grid_0 to 3D grid */ + sScatter_B3d(A3d, grid3d); + + B = A3d->B3d; // B is now assigned back to B3d on return + A->Store = Astore3d; // restore Astore to 3D + + /* free A2d and B2d, which are allocated only in 2D layer Grid_0 */ + NRformat_loc *A2d = A3d->A_nfmt; + if (grid3d->zscp.Iam == 0) { + SUPERLU_FREE( A2d->rowptr ); + SUPERLU_FREE( A2d->colind ); + SUPERLU_FREE( A2d->nzval ); + SUPERLU_FREE( A3d->B2d ); + } + SUPERLU_FREE( A2d ); // free 2D structure + SUPERLU_FREE( A3d ); // free 3D structure +#endif + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit psgssvx3d()"); +#endif + +} diff --git a/SRC/psgssvx_ABglobal.c b/SRC/psgssvx_ABglobal.c new file mode 100644 index 00000000..f4d582a3 --- /dev/null +++ b/SRC/psgssvx_ABglobal.c @@ -0,0 +1,1112 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Solves a system of linear equations A*X=B, + * + *
+ * -- Distributed SuperLU routine (version 4.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ *
+ * Last modified:
+ * December 31, 2015   version 4.3
+ * 
+ */ + +#include +#include "superlu_sdefs.h" +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * psgssvx_ABglobal solves a system of linear equations A*X=B,
+ * by using Gaussian elimination with "static pivoting" to
+ * compute the LU factorization of A.
+ *
+ * Static pivoting is a technique that combines the numerical stability
+ * of partial pivoting with the scalability of Cholesky (no pivoting),
+ * to run accurately and efficiently on large numbers of processors.
+ *
+ * See our paper at http://www.nersc.gov/~xiaoye/SuperLU/ for a detailed
+ * description of the parallel algorithms.
+ *
+ * Here are the options for using this code:
+ *
+ *   1. Independent of all the other options specified below, the
+ *      user must supply
+ *
+ *      -  B, the matrix of right hand sides, and its dimensions ldb and nrhs
+ *      -  grid, a structure describing the 2D processor mesh
+ *      -  options->IterRefine, which determines whether or not to
+ *            improve the accuracy of the computed solution using
+ *            iterative refinement
+ *
+ *      On output, B is overwritten with the solution X.
+ *
+ *   2. Depending on options->Fact, the user has several options
+ *      for solving A*X=B. The standard option is for factoring
+ *      A "from scratch". (The other options, described below,
+ *      are used when A is sufficiently similar to a previously
+ *      solved problem to save time by reusing part or all of
+ *      the previous factorization.)
+ *
+ *      -  options->Fact = DOFACT: A is factored "from scratch"
+ *
+ *      In this case the user must also supply
+ *
+ *      -  A, the input matrix
+ *
+ *      as well as the following options, which are described in more
+ *      detail below:
+ *
+ *      -  options->Equil,   to specify how to scale the rows and columns
+ *                           of A to "equilibrate" it (to try to reduce its
+ *                           condition number and so improve the
+ *                           accuracy of the computed solution)
+ *
+ *      -  options->RowPerm, to specify how to permute the rows of A
+ *                           (typically to control numerical stability)
+ *
+ *      -  options->ColPerm, to specify how to permute the columns of A
+ *                           (typically to control fill-in and enhance
+ *                           parallelism during factorization)
+ *
+ *      -  options->ReplaceTinyPivot, to specify how to deal with tiny
+ *                           pivots encountered during factorization
+ *                           (to control numerical stability)
+ *
+ *      The outputs returned include
+ *
+ *      -  ScalePermstruct,  modified to describe how the input matrix A
+ *                           was equilibrated and permuted:
+ *         -  ScalePermstruct->DiagScale, indicates whether the rows and/or
+ *                                        columns of A were scaled
+ *         -  ScalePermstruct->R, array of row scale factors
+ *         -  ScalePermstruct->C, array of column scale factors
+ *         -  ScalePermstruct->perm_r, row permutation vector
+ *         -  ScalePermstruct->perm_c, column permutation vector
+ *
+ *            (part of ScalePermstruct may also need to be supplied on input,
+ *             depending on options->RowPerm and options->ColPerm as described
+ *             later).
+ *
+ *      -  A, the input matrix A overwritten by the scaled and permuted matrix
+ *                Pc*Pr*diag(R)*A*diag(C)
+ *             where
+ *                Pr and Pc are row and columns permutation matrices determined
+ *                  by ScalePermstruct->perm_r and ScalePermstruct->perm_c,
+ *                  respectively, and
+ *                diag(R) and diag(C) are diagonal scaling matrices determined
+ *                  by ScalePermstruct->DiagScale, ScalePermstruct->R and
+ *                  ScalePermstruct->C
+ *
+ *      -  LUstruct, which contains the L and U factorization of A1 where
+ *
+ *                A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U
+ *
+ *              (Note that A1 = Aout * Pc^T, where Aout is the matrix stored
+ *               in A on output.)
+ *
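+ *      As an illustration of this standard case, a minimal calling
+ *      sequence might look as follows. This is a sketch only: the
+ *      s-precision helper names sScalePermstructInit and sLUstructInit
+ *      and their exact signatures are assumptions patterned on the
+ *      drivers in EXAMPLE/, and all matrix/RHS setup and error checking
+ *      are omitted.
+ *
+ *          superlu_dist_options_t options;
+ *          sScalePermstruct_t ScalePermstruct;
+ *          sLUstruct_t LUstruct;
+ *          SuperLUStat_t stat;
+ *          gridinfo_t grid;
+ *          int info;
+ *
+ *          superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid);
+ *          set_default_options_dist(&options);  // options.Fact == DOFACT
+ *          sScalePermstructInit(m, n, &ScalePermstruct);
+ *          sLUstructInit(n, &LUstruct);
+ *          PStatInit(&stat);
+ *          psgssvx_ABglobal(&options, &A, &ScalePermstruct, b, ldb, nrhs,
+ *                           &grid, &LUstruct, berr, &stat, &info);
+ *          PStatFree(&stat);
+ *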
+ *   3. The second value of options->Fact assumes that a matrix with the same
+ *      sparsity pattern as A has already been factored:
+ *
+ *      -  options->Fact = SamePattern: A is factored, assuming that it has
+ *            the same nonzero pattern as a previously factored matrix. In this
+ *            case the algorithm saves time by reusing the previously computed
+ *            column permutation vector stored in ScalePermstruct->perm_c
+ *            and the "elimination tree" of A stored in LUstruct->etree.
+ *
+ *      In this case the user must still specify the following options
+ *      as before:
+ *
+ *      -  options->Equil
+ *      -  options->RowPerm
+ *      -  options->ReplaceTinyPivot
+ *
+ *      but not options->ColPerm, whose value is ignored. This is because the
+ *      previous column permutation from ScalePermstruct->perm_c is used as
+ *      input. The user must also supply
+ *
+ *      -  A, the input matrix
+ *      -  ScalePermstruct->perm_c, the column permutation
+ *      -  LUstruct->etree, the elimination tree
+ *
+ *      The outputs returned include
+ *
+ *      -  A, the input matrix A overwritten by the scaled and permuted matrix
+ *            as described above
+ *      -  ScalePermstruct,  modified to describe how the input matrix A was
+ *                           equilibrated and row permuted
+ *      -  LUstruct, modified to contain the new L and U factors
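+ *
+ *      A sketch of this case, assuming a first DOFACT solve like the one
+ *      above has already populated ScalePermstruct->perm_c and
+ *      LUstruct->etree:
+ *
+ *          // matrix values changed; sparsity pattern unchanged
+ *          options.Fact = SamePattern;
+ *          psgssvx_ABglobal(&options, &A, &ScalePermstruct, b, ldb, nrhs,
+ *                           &grid, &LUstruct, berr, &stat, &info);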
+ *
+ *   4. The third value of options->Fact assumes that a matrix B with the same
+ *      sparsity pattern as A has already been factored, and that the
+ *      row permutation of B can be reused for A. This is useful when A and B
+ *      have similar numerical values, so that the same row permutation
+ *      will make both factorizations numerically stable. This lets us reuse
+ *      all of the previously computed structure of L and U.
+ *
+ *      -  options->Fact = SamePattern_SameRowPerm: A is factored,
+ *            assuming not only the same nonzero pattern as the previously
+ *            factored matrix B, but reusing B's row permutation.
+ *
+ *      In this case the user must still specify the following options
+ *      as before:
+ *
+ *      -  options->Equil
+ *      -  options->ReplaceTinyPivot
+ *
+ *      but not options->RowPerm or options->ColPerm, whose values are ignored.
+ *      This is because the permutations from ScalePermstruct->perm_r and
+ *      ScalePermstruct->perm_c are used as input.
+ *
+ *      The user must also supply
+ *
+ *      -  A, the input matrix
+ *      -  ScalePermstruct->DiagScale, how the previous matrix was row and/or
+ *                                     column scaled
+ *      -  ScalePermstruct->R, the row scalings of the previous matrix, if any
+ *      -  ScalePermstruct->C, the column scalings of the previous matrix,
+ *                             if any
+ *      -  ScalePermstruct->perm_r, the row permutation of the previous matrix
+ *      -  ScalePermstruct->perm_c, the column permutation of the previous
+ *                                  matrix
+ *      -  all of LUstruct, the previously computed information about L and U
+ *                (the actual numerical values of L and U stored in
+ *                 LUstruct->Llu are ignored)
+ *
+ *      The outputs returned include
+ *
+ *      -  A, the input matrix A overwritten by the scaled and permuted matrix
+ *            as described above
+ *      -  ScalePermstruct,  modified to describe how the input matrix A was
+ *                           equilibrated
+ *                  (thus ScalePermstruct->DiagScale, R and C may be modified)
+ *      -  LUstruct, modified to contain the new L and U factors
+ *
+ *   5. The fourth and last value of options->Fact assumes that A is
+ *      identical to a matrix that has already been factored on a previous
+ *      call, and reuses its entire LU factorization
+ *
+ *      -  options->Fact = Factored: A is identical to a previously
+ *            factorized matrix, so the entire previous factorization
+ *            can be reused.
+ *
+ *      In this case all the other options mentioned above are ignored
+ *      (options->Equil, options->RowPerm, options->ColPerm,
+ *       options->ReplaceTinyPivot)
+ *
+ *      The user must also supply
+ *
+ *      -  A, the unfactored matrix, only in the case that iterative refinement
+ *            is to be done (specifically A must be the output A from
+ *            the previous call, so that it has been scaled and permuted)
+ *      -  all of ScalePermstruct
+ *      -  all of LUstruct, including the actual numerical values of L and U
+ *
+ *      all of which are unmodified on output.
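+ *
+ *      For example (sketch, continuing the setup above), a sequence of
+ *      solves with identical A and new right-hand sides can reuse the
+ *      factors from the last call:
+ *
+ *          options.Fact = FACTORED;  // reuse the existing L and U
+ *          for (k = 0; k < nsystems; ++k) {
+ *              fill_rhs(b, k);       // hypothetical helper: load RHS k
+ *              psgssvx_ABglobal(&options, &A, &ScalePermstruct, b, ldb,
+ *                               nrhs, &grid, &LUstruct, berr, &stat,
+ *                               &info);
+ *          }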
+ *
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_dist_options_t*
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *         The following fields should be defined for this structure:
+ *
+ *         o Fact (fact_t)
+ *           Specifies whether or not the factored form of the matrix
+ *           A is supplied on entry, and if not, how the matrix A should
+ *           be factorized based on the previous history.
+ *
+ *           = DOFACT: The matrix A will be factorized from scratch.
+ *                 Inputs:  A
+ *                          options->Equil, RowPerm, ColPerm, ReplaceTinyPivot
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          all of ScalePermstruct
+ *                          all of LUstruct
+ *
+ *           = SamePattern: the matrix A will be factorized assuming
+ *             that a factorization of a matrix with the same sparsity
+ *             pattern was performed prior to this one. Therefore, this
+ *             factorization will reuse column permutation vector
+ *             ScalePermstruct->perm_c and the elimination tree
+ *             LUstruct->etree
+ *                 Inputs:  A
+ *                          options->Equil, RowPerm, ReplaceTinyPivot
+ *                          ScalePermstruct->perm_c
+ *                          LUstruct->etree
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          rest of ScalePermstruct (DiagScale, R, C, perm_r)
+ *                          rest of LUstruct (Glu_persist, Llu)
+ *
+ *           = SamePattern_SameRowPerm: the matrix A will be factorized
+ *             assuming that a factorization of a matrix with the same
+ *             sparsity pattern and similar numerical values was performed
+ *             prior to this one. Therefore, this factorization will reuse
+ *             both the row and column scaling factors R and C, both the
+ *             row and column permutation vectors perm_r and perm_c, and
+ *             the distributed data structure set up from the previous
+ *             symbolic factorization.
+ *                 Inputs:  A
+ *                          options->Equil, ReplaceTinyPivot
+ *                          all of ScalePermstruct
+ *                          all of LUstruct
+ *                 Outputs: modified A
+ *                             (possibly row and/or column scaled and/or
+ *                              permuted)
+ *                          modified LUstruct->Llu
+ *
+ *           = FACTORED: the matrix A is already factored.
+ *                 Inputs:  all of ScalePermstruct
+ *                          all of LUstruct
+ *
+ *         o Equil (yes_no_t)
+ *           Specifies whether to equilibrate the system.
+ *           = NO:  no equilibration.
+ *           = YES: scaling factors are computed to equilibrate the system:
+ *                      diag(R)*A*diag(C)*inv(diag(C))*X = diag(R)*B.
+ *                  Whether or not the system will be equilibrated depends
+ *                  on the scaling of the matrix A, but if equilibration is
+ *                  used, A is overwritten by diag(R)*A*diag(C) and B by
+ *                  diag(R)*B.
+ *
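+ *                  As a concrete illustration (assuming the usual
+ *                  LAPACK-style scalings, R[i] = 1/max_j |A(i,j)| and
+ *                  C[j] = 1/max_i |R[i]*A(i,j)|): if every entry of row i
+ *                  has magnitude about 1e+8, then R[i] is about 1e-8, and
+ *                  each entry of diag(R)*A*diag(C) has magnitude at most 1.
+ *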
+ *         o RowPerm (rowperm_t)
+ *           Specifies how to permute rows of the matrix A.
+ *           = NATURAL:   use the natural ordering.
+ *           = LargeDiag_MC64: use the Duff/Koster algorithm to permute rows
+ *                        of the original matrix to make the diagonal large
+ *                        relative to the off-diagonal.
+ *           = LargeDiag_HWPM: use the parallel approximate-weight perfect
+ *                        matching to permute rows of the original matrix
+ *                        to make the diagonal large relative to the
+ *                        off-diagonal.
+ *           = MY_PERMR:  use the ordering given in ScalePermstruct->perm_r
+ *                        input by the user.
+ *
+ *         o ColPerm (colperm_t)
+ *           Specifies what type of column permutation to use to reduce fill.
+ *           = NATURAL:       natural ordering.
+ *           = MMD_AT_PLUS_A: minimum degree ordering on structure of A'+A.
+ *           = MMD_ATA:       minimum degree ordering on structure of A'*A.
+ *           = MY_PERMC:      the ordering given in ScalePermstruct->perm_c.
+ *
+ *         o ReplaceTinyPivot (yes_no_t)
+ *           = NO:  do not modify pivots
+ *           = YES: replace tiny pivots by sqrt(epsilon)*norm(A) during
+ *                  LU factorization.
+ *
+ *         o IterRefine (IterRefine_t)
+ *           Specifies how to perform iterative refinement.
+ *           = NO:     no iterative refinement.
+ *           = SLU_DOUBLE: accumulate residual in double precision.
+ *           = SLU_EXTRA:  accumulate residual in extra precision.
+ *
+ *         NOTE: all options must be identical on all processes when
+ *               calling this routine.
+ *
+ * A (input/output) SuperMatrix*
+ *         On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol).
+ *         The number of linear equations is A->nrow. The type of A must be:
+ *         Stype = SLU_NC; Dtype = SLU_S; Mtype = SLU_GE. That is, A is stored in
+ *         compressed column format (also known as Harwell-Boeing format).
+ *         See supermatrix.h for the definition of 'SuperMatrix'; a small
+ *         storage example follows at the end of this entry.
+ *         This routine only handles square A; however, the LU factorization
+ *         routine psgstrf can factorize rectangular matrices.
+ *         On exit, A may be overwritten by Pc*Pr*diag(R)*A*diag(C),
+ *         depending on ScalePermstruct->DiagScale, options->RowPerm and
+ *         options->ColPerm:
+ *             if ScalePermstruct->DiagScale != NOEQUIL, A is overwritten by
+ *                diag(R)*A*diag(C).
+ *             if options->RowPerm != NATURAL, A is further overwritten by
+ *                Pr*diag(R)*A*diag(C).
+ *             if options->ColPerm != NATURAL, A is further overwritten by
+ *                Pc*Pr*diag(R)*A*diag(C).
+ *         If all the above conditions are true, the LU decomposition is
+ *         performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T.
+ *
+ *         NOTE: Currently, A must reside in all processes when calling
+ *               this routine.
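+ *
+ *         For illustration, in the SLU_NC (compressed-column) storage
+ *         mentioned above, the 3-by-3 matrix
+ *
+ *             [ 1  0  4 ]
+ *             [ 2  0  0 ]
+ *             [ 0  3  5 ]
+ *
+ *         is held column by column as
+ *
+ *             nzval  = { 1, 2, 3, 4, 5 }
+ *             rowind = { 0, 1, 2, 0, 2 }
+ *             colptr = { 0, 2, 3, 5 }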
+ *
+ * ScalePermstruct (input/output) sScalePermstruct_t*
+ *         The data structure to store the scaling and permutation vectors
+ *         describing the transformations performed to the matrix A.
+ *         It contains the following fields:
+ *
+ *         o DiagScale (DiagScale_t)
+ *           Specifies the form of equilibration that was done.
+ *           = NOEQUIL: no equilibration.
+ *           = ROW:     row equilibration, i.e., A was premultiplied by
+ *                      diag(R).
+ *           = COL:     Column equilibration, i.e., A was postmultiplied
+ *                      by diag(C).
+ *           = BOTH:    both row and column equilibration, i.e., A was
+ *                      replaced by diag(R)*A*diag(C).
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm,
+ *           DiagScale is an input argument; otherwise it is an output
+ *           argument.
+ *
+ *         o perm_r (int*)
+ *           Row permutation vector, which defines the permutation matrix Pr;
+ *           perm_r[i] = j means row i of A is in position j in Pr*A.
+ *           If options->RowPerm = MY_PERMR, or
+ *           options->Fact = SamePattern_SameRowPerm, perm_r is an
+ *           input argument; otherwise it is an output argument.
+ *
+ *         o perm_c (int*)
+ *           Column permutation vector, which defines the
+ *           permutation matrix Pc; perm_c[i] = j means column i of A is
+ *           in position j in A*Pc.
+ *           If options->ColPerm = MY_PERMC or options->Fact = SamePattern
+ *           or options->Fact = SamePattern_SameRowPerm, perm_c is an
+ *           input argument; otherwise, it is an output argument.
+ *           On exit, perm_c may be overwritten by the product of the input
+ *           perm_c and a permutation that postorders the elimination tree
+ *           of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree
+ *           is already in postorder.
+ *
+ *         o R (float*) dimension (A->nrow)
+ *           The row scale factors for A.
+ *           If DiagScale = ROW or BOTH, A is multiplied on the left by
+ *                          diag(R).
+ *           If DiagScale = NOEQUIL or COL, R is not defined.
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm, R is
+ *           an input argument; otherwise, R is an output argument.
+ *
+ *         o C (float*) dimension (A->ncol)
+ *           The column scale factors for A.
+ *           If DiagScale = COL or BOTH, A is multiplied on the right by
+ *                          diag(C).
+ *           If DiagScale = NOEQUIL or ROW, C is not defined.
+ *           If options->Fact = FACTORED or SamePattern_SameRowPerm, C is
+ *           an input argument; otherwise, C is an output argument.
+ *
+ * B       (input/output) float*
+ *         On entry, the right-hand side matrix of dimension (A->nrow, nrhs).
+ *         On exit, the solution matrix if info = 0.
+ *
+ *         NOTE: Currently, B must reside in all processes when calling
+ *               this routine.
+ *
+ * ldb     (input) int (global)
+ *         The leading dimension of matrix B.
+ *
+ * nrhs    (input) int (global)
+ *         The number of right-hand sides.
+ *         If nrhs = 0, only LU decomposition is performed; the forward
+ *         and back substitutions are skipped.
+ *
+ * grid    (input) gridinfo_t*
+ *         The 2D process mesh. It contains the MPI communicator, the number
+ *         of process rows (NPROW), the number of process columns (NPCOL),
+ *         and my process rank. It is an input argument to all the
+ *         parallel routines.
+ *         Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *         See superlu_sdefs.h for the definition of 'gridinfo_t'.
+ *
+ * LUstruct (input/output) sLUstruct_t*
+ *         The data structures to store the distributed L and U factors.
+ *         It contains the following fields:
+ *
+ *         o etree (int_t*) dimension (A->ncol)
+ *           Elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc'.
+ *           It is computed in sp_colorder() during the first factorization,
+ *           and is reused in the subsequent factorizations of the matrices
+ *           with the same nonzero pattern.
+ *           On exit of sp_colorder(), the columns of A are permuted so that
+ *           the etree is in a certain postorder. This postorder is reflected
+ *           in ScalePermstruct->perm_c.
+ *           NOTE:
+ *           Etree is a vector of parent pointers for a forest whose vertices
+ *           are the integers 0 to A->ncol-1; etree[root]==A->ncol.
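+ *           For instance, for a 3-column matrix whose etree is the
+ *           chain 0 -> 1 -> 2, etree = { 1, 2, 3 } (and
+ *           etree[2] == A->ncol == 3 marks the root).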
+ *
+ *         o Glu_persist (Glu_persist_t*)
+ *           Global data structure (xsup, supno) replicated on all processes,
+ *           describing the supernode partition in the factored matrices
+ *           L and U:
+ *             xsup[s] is the leading column of the s-th supernode,
+ *             supno[i] is the supernode number to which column i belongs.
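+ *             For example, if columns 0-2 form supernode 0 and columns
+ *             3-4 form supernode 1 (n = 5), then xsup = { 0, 3, 5 } and
+ *             supno = { 0, 0, 0, 1, 1 }.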
+ *
+ *         o Llu (sLocalLU_t*)
+ *           The distributed data structures to store L and U factors.
+ *           See superlu_sdefs.h for the definition of 'sLocalLU_t'.
+ *
+ * berr    (output) float*, dimension (nrhs)
+ *         The componentwise relative backward error of each solution
+ *         vector X(j) (i.e., the smallest relative change in
+ *         any element of A or B that makes X(j) an exact solution).
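+ *         In the usual componentwise definition (stated here for
+ *         orientation; the precise formula is the one used in
+ *         psgsrfs_ABXglobal), with r = B(j) - A*X(j),
+ *             berr[j] = max_i |r_i| / ( |A|*|X(j)| + |B(j)| )_i .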
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics on runtime and floating-point operation count.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info    (output) int*
+ *         = 0: successful exit
+ *         > 0: if info = i, and i is
+ *             <= A->ncol: U(i,i) is exactly zero. The factorization has
+ *                been completed, but the factor U is exactly singular,
+ *                so the solution could not be computed.
+ *             > A->ncol: number of bytes allocated when memory allocation
+ *                failure occurred, plus A->ncol.
+ *
+ *
+ * See superlu_sdefs.h for the definitions of various data types.
+ * 
+ */ +void +psgssvx_ABglobal(superlu_dist_options_t *options, SuperMatrix *A, + sScalePermstruct_t *ScalePermstruct, + float B[], int ldb, int nrhs, gridinfo_t *grid, + sLUstruct_t *LUstruct, float *berr, + SuperLUStat_t *stat, int *info) +{ + SuperMatrix AC; + NCformat *Astore; + NCPformat *ACstore; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + Glu_freeable_t *Glu_freeable; + /* The nonzero structures of L and U factors, which are + replicated on all processrs. + (lsub, xlsub) contains the compressed subscript of + supernodes in L. + (usub, xusub) contains the compressed subscript of + nonzero segments in U. + If options->Fact != SamePattern_SameRowPerm, they are + computed by SYMBFACT routine, and then used by DDISTRIBUTE + routine. They will be freed after DDISTRIBUTE routine. + If options->Fact == SamePattern_SameRowPerm, these + structures are not used. */ + fact_t Fact; + float *a; + int_t *perm_r; /* row permutations from partial pivoting */ + int_t *perm_c; /* column permutation vector */ + int_t *etree; /* elimination tree */ + int_t *colptr, *rowind; + int_t Equil, factored, job, notran, colequ, rowequ; + int_t i, iinfo, j, irow, m, n, nnz, permc_spec, dist_mem_use; + int iam; + int ldx; /* LDA for matrix X (global). */ + char equed[1], norm[1]; + float *C, *R, *C1, *R1, amax, anorm, colcnd, rowcnd; + float *X, *b_col, *b_work, *x_col; + double t; + static superlu_dist_mem_usage_t num_mem_usage, symb_mem_usage; +#if ( PRNTlevel>= 2 ) + double dmin, dsum, dprod; +#endif + LUstruct->dt = 's'; + + /* Test input parameters. */ + *info = 0; + Fact = options->Fact; + if ( Fact < 0 || Fact > FACTORED ) + *info = -1; + else if ( options->RowPerm < 0 || options->RowPerm > MY_PERMR ) + *info = -1; + else if ( options->ColPerm < 0 || options->ColPerm > MY_PERMC ) + *info = -1; + else if ( options->IterRefine < 0 || options->IterRefine > SLU_EXTRA ) + *info = -1; + else if ( options->IterRefine == SLU_EXTRA ) { + *info = -1; + fprintf(stderr, "Extra precise iterative refinement yet to support."); + } else if ( A->nrow != A->ncol || A->nrow < 0 || + A->Stype != SLU_NC || A->Dtype != SLU_S || A->Mtype != SLU_GE ) + *info = -2; + else if ( ldb < A->nrow ) + *info = -5; + else if ( nrhs < 0 ) + *info = -6; + if ( *info ) { + i = -(*info); + pxerr_dist("psgssvx_ABglobal", grid, -*info); + return; + } + + /* Initialization */ + factored = (Fact == FACTORED); + Equil = (!factored && options->Equil == YES); + notran = (options->Trans == NOTRANS); + iam = grid->iam; + job = 5; + m = A->nrow; + n = A->ncol; + Astore = A->Store; + nnz = Astore->nnz; + a = Astore->nzval; + colptr = Astore->colptr; + rowind = Astore->rowind; + if ( factored || (Fact == SamePattern_SameRowPerm && Equil) ) { + rowequ = (ScalePermstruct->DiagScale == ROW) || + (ScalePermstruct->DiagScale == BOTH); + colequ = (ScalePermstruct->DiagScale == COL) || + (ScalePermstruct->DiagScale == BOTH); + } else rowequ = colequ = FALSE; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter psgssvx_ABglobal()"); +#endif + + perm_r = ScalePermstruct->perm_r; + perm_c = ScalePermstruct->perm_c; + etree = LUstruct->etree; + R = ScalePermstruct->R; + C = ScalePermstruct->C; + if ( Equil && Fact != SamePattern_SameRowPerm ) { + /* Allocate storage if not done so before. 
*/ + switch ( ScalePermstruct->DiagScale ) { + case NOEQUIL: + if ( !(R = (float *) floatMalloc_dist(m)) ) + ABORT("Malloc fails for R[]."); + if ( !(C = (float *) floatMalloc_dist(n)) ) + ABORT("Malloc fails for C[]."); + ScalePermstruct->R = R; + ScalePermstruct->C = C; + break; + case ROW: + if ( !(C = (float *) floatMalloc_dist(n)) ) + ABORT("Malloc fails for C[]."); + ScalePermstruct->C = C; + break; + case COL: + if ( !(R = (float *) floatMalloc_dist(m)) ) + ABORT("Malloc fails for R[]."); + ScalePermstruct->R = R; + break; + default: break; + } + } + + /* ------------------------------------------------------------ + Diagonal scaling to equilibrate the matrix. + ------------------------------------------------------------*/ + if ( Equil ) { +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter equil"); +#endif + t = SuperLU_timer_(); + + if ( Fact == SamePattern_SameRowPerm ) { + /* Reuse R and C. */ + switch ( ScalePermstruct->DiagScale ) { + case NOEQUIL: + break; + case ROW: + for (j = 0; j < n; ++j) { + for (i = colptr[j]; i < colptr[j+1]; ++i) { + irow = rowind[i]; + a[i] *= R[irow]; /* Scale rows. */ + } + } + break; + case COL: + for (j = 0; j < n; ++j) + for (i = colptr[j]; i < colptr[j+1]; ++i) + a[i] *= C[j]; /* Scale columns. */ + break; + case BOTH: + for (j = 0; j < n; ++j) { + for (i = colptr[j]; i < colptr[j+1]; ++i) { + irow = rowind[i]; + a[i] *= R[irow] * C[j]; /* Scale rows and columns. */ + } + } + break; + } + } else { + if ( !iam ) { + /* Compute row and column scalings to equilibrate matrix A. */ + sgsequ_dist(A, R, C, &rowcnd, &colcnd, &amax, &iinfo); + + MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); + if ( iinfo == 0 ) { + MPI_Bcast( R, m, MPI_DOUBLE, 0, grid->comm ); + MPI_Bcast( C, n, MPI_DOUBLE, 0, grid->comm ); + MPI_Bcast( &rowcnd, 1, MPI_DOUBLE, 0, grid->comm ); + MPI_Bcast( &colcnd, 1, MPI_DOUBLE, 0, grid->comm ); + MPI_Bcast( &amax, 1, MPI_DOUBLE, 0, grid->comm ); + } else { + if ( iinfo > 0 ) { + if ( iinfo <= m ) { +#if ( PRNTlevel>=1 ) + fprintf(stderr, "The " IFMT "-th row of A is exactly zero\n", + iinfo); +#endif + } else { +#if ( PRNTlevel>=1 ) + fprintf(stderr, "The " IFMT "-th column of A is exactly zero\n", + iinfo-n); +#endif + } + } + } + } else { + MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); + if ( iinfo == 0 ) { + MPI_Bcast( R, m, MPI_DOUBLE, 0, grid->comm ); + MPI_Bcast( C, n, MPI_DOUBLE, 0, grid->comm ); + MPI_Bcast( &rowcnd, 1, MPI_DOUBLE, 0, grid->comm ); + MPI_Bcast( &colcnd, 1, MPI_DOUBLE, 0, grid->comm ); + MPI_Bcast( &amax, 1, MPI_DOUBLE, 0, grid->comm ); + } + } + + if ( iinfo == 0 ) { + /* Equilibrate matrix A. */ + slaqgs_dist(A, R, C, rowcnd, colcnd, amax, equed); + if ( strncmp(equed, "R", 1)==0 ) { + ScalePermstruct->DiagScale = ROW; + rowequ = ROW; + } else if ( strncmp(equed, "C", 1)==0 ) { + ScalePermstruct->DiagScale = COL; + colequ = COL; + } else if ( strncmp(equed, "B", 1)==0 ) { + ScalePermstruct->DiagScale = BOTH; + rowequ = ROW; + colequ = COL; + } else ScalePermstruct->DiagScale = NOEQUIL; + } + +#if ( PRNTlevel>=1 ) + if ( !iam ) { + printf(".. equilibrated? *equed = %c\n", *equed); + /*fflush(stdout);*/ + } +#endif + } /* if Fact ... */ + + stat->utime[EQUIL] = SuperLU_timer_() - t; +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit equil"); +#endif + } /* end if Equil ... */ + + /* ------------------------------------------------------------ + Permute rows of A. 
+ ------------------------------------------------------------*/ + if ( options->RowPerm != NO ) { + t = SuperLU_timer_(); + + if ( Fact == SamePattern_SameRowPerm /* Reuse perm_r. */ + || options->RowPerm == MY_PERMR ) { /* Use my perm_r. */ + for (i = 0; i < colptr[n]; ++i) { + irow = rowind[i]; + rowind[i] = perm_r[irow]; + } + } else if ( !factored ) { + if ( job == 5 ) { + /* Allocate storage for scaling factors. */ + if ( !(R1 = (float *) SUPERLU_MALLOC(m * sizeof(float))) ) + ABORT("SUPERLU_MALLOC fails for R1[]"); + if ( !(C1 = (float *) SUPERLU_MALLOC(n * sizeof(float))) ) + ABORT("SUPERLU_MALLOC fails for C1[]"); + } + + if ( !iam ) { + /* Process 0 finds a row permutation for large diagonal. */ + iinfo = sldperm_dist(job, m, nnz, colptr, rowind, a, + perm_r, R1, C1); + + MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); + if ( iinfo == 0 ) { + MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm ); + if ( job == 5 && Equil ) { + MPI_Bcast( R1, m, MPI_FLOAT, 0, grid->comm ); + MPI_Bcast( C1, n, MPI_FLOAT, 0, grid->comm ); + } + } + } else { + MPI_Bcast( &iinfo, 1, mpi_int_t, 0, grid->comm ); + if ( iinfo == 0 ) { + MPI_Bcast( perm_r, m, mpi_int_t, 0, grid->comm ); + if ( job == 5 && Equil ) { + MPI_Bcast( R1, m, MPI_FLOAT, 0, grid->comm ); + MPI_Bcast( C1, n, MPI_FLOAT, 0, grid->comm ); + } + } + } + + if ( iinfo && job == 5) { + SUPERLU_FREE(R1); + SUPERLU_FREE(C1); + } + +#if ( PRNTlevel>=2 ) + dmin = smach_dist("Overflow"); + dsum = 0.0; + dprod = 1.0; +#endif + if ( iinfo == 0 ) { + if ( job == 5 ) { + if ( Equil ) { + for (i = 0; i < n; ++i) { + R1[i] = exp(R1[i]); + C1[i] = exp(C1[i]); + } + for (j = 0; j < n; ++j) { + for (i = colptr[j]; i < colptr[j+1]; ++i) { + irow = rowind[i]; + a[i] *= R1[irow] * C1[j]; /* Scale the matrix. */ + rowind[i] = perm_r[irow]; +#if ( PRNTlevel>=2 ) + if ( rowind[i] == j ) /* New diagonal */ + dprod *= fabs(a[i]); +#endif + } + } + + /* Multiply together the scaling factors. */ + if ( rowequ ) for (i = 0; i < m; ++i) R[i] *= R1[i]; + else for (i = 0; i < m; ++i) R[i] = R1[i]; + if ( colequ ) for (i = 0; i < n; ++i) C[i] *= C1[i]; + else for (i = 0; i < n; ++i) C[i] = C1[i]; + + ScalePermstruct->DiagScale = BOTH; + rowequ = colequ = 1; + } else { /* No equilibration. */ + for (i = colptr[0]; i < colptr[n]; ++i) { + irow = rowind[i]; + rowind[i] = perm_r[irow]; + } + } + SUPERLU_FREE (R1); + SUPERLU_FREE (C1); + } else { /* job = 2,3,4 */ + for (j = 0; j < n; ++j) { + for (i = colptr[j]; i < colptr[j+1]; ++i) { + irow = rowind[i]; + rowind[i] = perm_r[irow]; +#if ( PRNTlevel>=2 ) + if ( rowind[i] == j ) { /* New diagonal */ + if ( job == 2 || job == 3 ) + dmin = SUPERLU_MIN(dmin, fabs(a[i])); + else if ( job == 4 ) + dsum += fabs(a[i]); + else if ( job == 5 ) + dprod *= fabs(a[i]); + } +#endif + } /* end for i ... */ + } /* end for j ... */ + } /* end else */ + } else { /* if iinfo != 0 */ + for (i = 0; i < m; ++i) perm_r[i] = i; + } + +#if ( PRNTlevel>=2 ) + if ( job == 2 || job == 3 ) { + if ( !iam ) printf("\tsmallest diagonal %e\n", dmin); + } else if ( job == 4 ) { + if ( !iam ) printf("\tsum of diagonal %e\n", dsum); + } else if ( job == 5 ) { + if ( !iam ) printf("\t product of diagonal %e\n", dprod); + } +#endif + + } /* else !factored */ + + t = SuperLU_timer_() - t; + stat->utime[ROWPERM] = t; +#if ( PRNTlevel>=1 ) + if ( !iam ) printf(".. 
LDPERM job " IFMT "\t time: %.2f\n", job, t); +#endif + + } else { /* options->RowPerm == NOROWPERM */ + for (i = 0; i < m; ++i) perm_r[i] = i; + } + + if ( !factored || options->IterRefine ) { + /* Compute norm(A), which will be used to adjust small diagonal. */ + if ( notran ) *(unsigned char *)norm = '1'; + else *(unsigned char *)norm = 'I'; + anorm = slangs_dist(norm, A); +#if ( PRNTlevel>=1 ) + if ( !iam ) printf(".. anorm %e\n", anorm); +#endif + } + + /* ------------------------------------------------------------ + Perform the LU factorization. + ------------------------------------------------------------*/ + if ( !factored ) { + t = SuperLU_timer_(); + /* + * Get column permutation vector perm_c[], according to permc_spec: + * permc_spec = NATURAL: natural ordering + * permc_spec = MMD_AT_PLUS_A: minimum degree on structure of A'+A + * permc_spec = MMD_ATA: minimum degree on structure of A'*A + * permc_spec = MY_PERMC: the ordering already supplied in perm_c[] + */ + permc_spec = options->ColPerm; + if ( permc_spec != MY_PERMC && Fact == DOFACT ) + /* Use an ordering provided by SuperLU */ + get_perm_c_dist(iam, permc_spec, A, perm_c); + + /* Compute the elimination tree of Pc*(A'+A)*Pc' or Pc*A'*A*Pc' + (a.k.a. column etree), depending on the choice of ColPerm. + Adjust perm_c[] to be consistent with a postorder of etree. + Permute columns of A to form A*Pc'. */ + sp_colorder(options, A, perm_c, etree, &AC); + + /* Form Pc*A*Pc' to preserve the diagonal of the matrix Pr*A. */ + ACstore = AC.Store; + for (j = 0; j < n; ++j) + for (i = ACstore->colbeg[j]; i < ACstore->colend[j]; ++i) { + irow = ACstore->rowind[i]; + ACstore->rowind[i] = perm_c[irow]; + } + stat->utime[COLPERM] = SuperLU_timer_() - t; + + /* Perform a symbolic factorization on matrix A and set up the + nonzero data structures which are suitable for supernodal GENP. */ + if ( Fact != SamePattern_SameRowPerm ) { +#if ( PRNTlevel>=1 ) + if ( !iam ) + printf(".. symbfact(): relax %d, maxsuper %d, fill %d\n", + sp_ienv_dist(2), sp_ienv_dist(3), sp_ienv_dist(6)); +#endif + t = SuperLU_timer_(); + if ( !(Glu_freeable = (Glu_freeable_t *) + SUPERLU_MALLOC(sizeof(Glu_freeable_t))) ) + ABORT("Malloc fails for Glu_freeable."); + + iinfo = symbfact(options, iam, &AC, perm_c, etree, + Glu_persist, Glu_freeable); + + stat->utime[SYMBFAC] = SuperLU_timer_() - t; + + if ( iinfo <= 0 ) { + QuerySpace_dist(n, -iinfo, Glu_freeable, &symb_mem_usage); +#if ( PRNTlevel>=1 ) + if ( !iam ) { + printf("\tNo of supers " IFMT "\n", Glu_persist->supno[n-1]+1); + printf("\tSize of G(L) " IFMT "\n", Glu_freeable->xlsub[n]); + printf("\tSize of G(U) " IFMT "\n", Glu_freeable->xusub[n]); + printf("\tint %d, short %d, float %d, double %d\n", + (int) sizeof(int_t), (int) sizeof(short), + (int) sizeof(float), (int) sizeof(double)); + printf("\tSYMBfact (MB):\tL\\U %.2f\ttotal %.2f\texpansions %d\n", + symb_mem_usage.for_lu*1e-6, + symb_mem_usage.total*1e-6, + symb_mem_usage.expansions); + } +#endif + } else { /* symbfact out of memory */ +#if ( PRNTlevel>=1 ) + if ( !iam ) + fprintf(stderr, "symbfact() error returns " IFMT "\n", iinfo); +#endif + *info = iinfo; + return; + } + } + + /* Distribute the L and U factors onto the process grid. */ + t = SuperLU_timer_(); + dist_mem_use = sdistribute(Fact, n, &AC, Glu_freeable, LUstruct, grid); + stat->utime[DIST] = SuperLU_timer_() - t; + + /* Deallocate storage used in symbolic factor. 
*/ + if ( Fact != SamePattern_SameRowPerm ) { + iinfo = symbfact_SubFree(Glu_freeable); + SUPERLU_FREE(Glu_freeable); + } + + /* Perform numerical factorization in parallel. */ + t = SuperLU_timer_(); + psgstrf(options, m, n, anorm, LUstruct, grid, stat, info); + stat->utime[FACT] = SuperLU_timer_() - t; + +#if ( PRNTlevel>=1 ) + { + int_t TinyPivots; + float for_lu, total, max, avg, temp; + sQuerySpace_dist(n, LUstruct, grid, stat, &num_mem_usage); + MPI_Reduce( &num_mem_usage.for_lu, &for_lu, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); + MPI_Reduce( &num_mem_usage.total, &total, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); + temp = SUPERLU_MAX(symb_mem_usage.total, + symb_mem_usage.for_lu + + (float)dist_mem_use + num_mem_usage.for_lu); + temp = SUPERLU_MAX(temp, num_mem_usage.total); + MPI_Reduce( &temp, &max, + 1, MPI_FLOAT, MPI_MAX, 0, grid->comm ); + MPI_Reduce( &temp, &avg, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); + MPI_Allreduce( &stat->TinyPivots, &TinyPivots, 1, mpi_int_t, + MPI_SUM, grid->comm ); + stat->TinyPivots = TinyPivots; + if ( !iam ) { + printf("\tNUMfact (MB) all PEs:\tL\\U\t%.2f\tall\t%.2f\n", + for_lu*1e-6, total*1e-6); + printf("\tAll space (MB):" + "\t\ttotal\t%.2f\tAvg\t%.2f\tMax\t%.2f\n", + avg*1e-6, avg/grid->nprow/grid->npcol*1e-6, max*1e-6); + printf("\tNumber of tiny pivots: %10d\n", stat->TinyPivots); + printf(".. psgstrf INFO = %d\n", *info); + } + } +#endif + + } else if ( options->IterRefine ) { /* options->Fact==FACTORED */ + /* Permute columns of A to form A*Pc' using the existing perm_c. + * NOTE: rows of A were previously permuted to Pc*A. + */ + sp_colorder(options, A, perm_c, NULL, &AC); + } /* if !factored ... */ + + /* ------------------------------------------------------------ + Compute the solution matrix X. + ------------------------------------------------------------*/ + if ( nrhs && *info == 0 ) { + + if ( !(b_work = floatMalloc_dist(n)) ) + ABORT("Malloc fails for b_work[]"); + + /* ------------------------------------------------------------ + Scale the right-hand side if equilibration was performed. + ------------------------------------------------------------*/ + if ( notran ) { + if ( rowequ ) { + b_col = B; + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < m; ++i) b_col[i] *= R[i]; + b_col += ldb; + } + } + } else if ( colequ ) { + b_col = B; + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < m; ++i) b_col[i] *= C[i]; + b_col += ldb; + } + } + + /* ------------------------------------------------------------ + Permute the right-hand side to form Pr*B. + ------------------------------------------------------------*/ + if ( options->RowPerm != NO ) { + if ( notran ) { + b_col = B; + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < m; ++i) b_work[perm_r[i]] = b_col[i]; + for (i = 0; i < m; ++i) b_col[i] = b_work[i]; + b_col += ldb; + } + } + } + + + /* ------------------------------------------------------------ + Permute the right-hand side to form Pc*B. + ------------------------------------------------------------*/ + if ( notran ) { + b_col = B; + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < m; ++i) b_work[perm_c[i]] = b_col[i]; + for (i = 0; i < m; ++i) b_col[i] = b_work[i]; + b_col += ldb; + } + } + + /* Save a copy of the right-hand side. 
*/ + ldx = ldb; + if ( !(X = floatMalloc_dist(((size_t)ldx) * nrhs)) ) + ABORT("Malloc fails for X[]"); + x_col = X; b_col = B; + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < ldb; ++i) x_col[i] = b_col[i]; + x_col += ldx; b_col += ldb; + } + + /* ------------------------------------------------------------ + Solve the linear system. + ------------------------------------------------------------*/ + psgstrs_Bglobal(n, LUstruct, grid, X, ldb, nrhs, stat, info); + + /* ------------------------------------------------------------ + Use iterative refinement to improve the computed solution and + compute error bounds and backward error estimates for it. + ------------------------------------------------------------*/ + if ( options->IterRefine ) { + /* Improve the solution by iterative refinement. */ + t = SuperLU_timer_(); + psgsrfs_ABXglobal(n, &AC, anorm, LUstruct, grid, B, ldb, + X, ldx, nrhs, berr, stat, info); + stat->utime[REFINE] = SuperLU_timer_() - t; + } + + /* Permute the solution matrix X <= Pc'*X. */ + for (j = 0; j < nrhs; j++) { + b_col = &B[j*ldb]; + x_col = &X[j*ldx]; + for (i = 0; i < n; ++i) b_col[i] = x_col[perm_c[i]]; + } + + /* Transform the solution matrix X to a solution of the original system + before the equilibration. */ + if ( notran ) { + if ( colequ ) { + b_col = B; + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < n; ++i) b_col[i] *= C[i]; + b_col += ldb; + } + } + } else if ( rowequ ) { + b_col = B; + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < n; ++i) b_col[i] *= R[i]; + b_col += ldb; + } + } + + SUPERLU_FREE(b_work); + SUPERLU_FREE(X); + + } /* end if nrhs != 0 */ + +#if ( PRNTlevel>=1 ) + if ( !iam ) printf(".. DiagScale = %d\n", ScalePermstruct->DiagScale); +#endif + + /* Deallocate R and/or C if it is not used. */ + if ( Equil && Fact != SamePattern_SameRowPerm ) { + switch ( ScalePermstruct->DiagScale ) { + case NOEQUIL: + SUPERLU_FREE(R); + SUPERLU_FREE(C); + break; + case ROW: + SUPERLU_FREE(C); + break; + case COL: + SUPERLU_FREE(R); + break; + default: break; + } + } + if ( !factored || (factored && options->IterRefine) ) + Destroy_CompCol_Permuted_dist(&AC); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit psgssvx_ABglobal()"); +#endif +} + diff --git a/SRC/psgstrf.c b/SRC/psgstrf.c new file mode 100644 index 00000000..66113792 --- /dev/null +++ b/SRC/psgstrf.c @@ -0,0 +1,2006 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Performs LU factorization in parallel. + * + *
+ * -- Distributed SuperLU routine (version 6.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 1, 2014
+ *
+ * Modified:
+ *   September 1, 1999
+ *   February 7, 2001  use MPI_Isend/MPI_Irecv
+ *   October 15, 2008  latency-reducing panel factorization
+ *   July    12, 2011  static scheduling and arbitrary look-ahead
+ *   March   13, 2013  change NTAGS to MPI_TAG_UB value
+ *   September 24, 2015 replace xLAMCH by xMACH, using C99 standard.
+ *   December 31, 2015 rename xMACH to xMACH_DIST.
+ *   September 30, 2017 optimization for Intel Knights Landing (KNL) node.
+ *   June 1, 2018      add parallel AWPM pivoting; add back arrive_at_ublock()
+ *   February 8, 2019  version 6.1.1
+ *
+ * Sketch of the algorithm
+ * =======================
+ *
+ * The following relations hold:
+ *     * A_kk = L_kk * U_kk
+ *     * L_ik = A_ik * U_kk^(-1)
+ *     * U_kj = L_kk^(-1) * A_kj
+ *
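+ * (In the code, the two panel relations above are realized with the BLAS
+ *  triangular solve strsm_: roughly strsm_("R","U","N","N",...) computes
+ *  L_ik = A_ik * U_kk^(-1) inside psgstrf2_trsm, and
+ *  strsm_("L","L","N","U",...) computes U_kj = L_kk^(-1) * A_kj inside
+ *  psgstrs2_omp; the exact argument lists here are only a sketch.)
+ *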
+ *              ----------------------------------
+ *              |   |                            |
+ *              ----|-----------------------------
+ *              |   | \ U_kk|                    |
+ *              |   |   \   |        U_kj        |
+ *              |   |L_kk \ |         ||         |
+ *              ----|-------|---------||----------
+ *              |   |       |         \/         |
+ *              |   |       |                    |
+ *              |   |       |                    |
+ *              |   |       |                    |
+ *              |   | L_ik ==>       A_ij        |
+ *              |   |       |                    |
+ *              |   |       |                    |
+ *              |   |       |                    |
+ *              ----------------------------------
+ *
+ * Handle the first block of columns separately.
+ *     * Factor diagonal and subdiagonal blocks and test for exact
+ *       singularity. ( psgstrf2(0), one column at a time )
+ *     * Compute block row of U
+ *     * Update trailing matrix
+ *
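+ * (In this file these three steps map onto PSGSTRF2 -- an alias for
+ *  psgstrf2_trsm -- for the panel factorization, psgstrs2_omp for the
+ *  block row of U, and the look-ahead / Schur-complement update code
+ *  included from slook_ahead_update.c and sSchCompUdt-2Ddynamic.c for
+ *  the trailing matrix.)
+ *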
+ * Loop over the remaining blocks of columns.
+ *   mycol = MYCOL( iam, grid );
+ *   myrow = MYROW( iam, grid );
+ *   N = nsupers;
+ *   For (k = 1; k < N; ++k) {
+ *       krow = PROW( k, grid );
+ *       kcol = PCOL( k, grid );
+ *       Pkk = PNUM( krow, kcol, grid );
+ *
+ *     * Factor diagonal and subdiagonal blocks and test for exact
+ *       singularity.
+ *       if ( mycol == kcol ) {
+ *           psgstrf2(k), one column at a time
+ *       }
+ *
+ *     * Parallel triangular solve
+ *       if ( iam == Pkk ) multicast L_k,k to this process row;
+ *       if ( myrow == krow && mycol != kcol ) {
+ *          Recv L_k,k from process Pkk;
+ *          for (j = k+1; j < N; ++j)
+ *              if ( PCOL( j, grid ) == mycol && A_k,j != 0 )
+ *                 U_k,j = L_k,k \ A_k,j;
+ *       }
+ *
+ *     * Parallel rank-k update
+ *       if ( myrow == krow ) multicast U_k,k+1:N to this process column;
+ *       if ( mycol == kcol ) multicast L_k+1:N,k to this process row;
+ *       if ( myrow != krow ) {
+ *          Pkj = PNUM( krow, mycol, grid );
+ *          Recv U_k,k+1:N from process Pkj;
+ *       }
+ *       if ( mycol != kcol ) {
+ *          Pik = PNUM( myrow, kcol, grid );
+ *          Recv L_k+1:N,k from process Pik;
+ *       }
+ *       for (j = k+1; j < N; ++j) {
+ *          for (i = k+1; i < N; ++i)
+ *              if ( myrow == PROW( i, grid ) && mycol == PCOL( j, grid )
+ *                   && L_i,k != 0 && U_k,j != 0 )
+ *                 A_i,j = A_i,j - L_i,k * U_k,j;
+ *       }
+ *  }
+ *
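+ * Each nonzero pair ( L_i,k, U_k,j ) in the rank-k update above is
+ * computed by one call to sgemm_ into a temporary buffer, followed by a
+ * scatter into the destination block A_i,j; schematically (a hedged
+ * sketch with illustrative dimension names; alpha = 1.0, beta = 0.0,
+ * and the scatter performs the subtraction):
+ *
+ *     sgemm_("N", "N", &nbrow, &ncols, &ksup, &alpha,
+ *            &lusup[luptr], &nsupr, &uval[rukp], &ldu,
+ *            &beta, tempv, &nbrow);
+ *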
+ * 
+ */
+
+#include <math.h>
+#include "superlu_sdefs.h"
+
+#ifdef GPU_ACC
+#include "cublas_utils.h"
+/*#include "cublas_sgemm.h"*/
+// #define NUM_CUDA_STREAMS 16
+#endif
+
+/* Various definitions */
+/*
+   Name : SUPERNODE_PROFILE
+   Purpose : For supernode-level profiling of various measurements, such as
+             gigaflop/sec obtained, bandwidth achieved:
+   Overhead : Low
+*/
+// #define SUPERNODE_PROFILE
+
+/*
+   Name : BASELINE
+   Purpose : baseline to compare performance against
+   Overhead : NA : this won't be used for running experiments
+*/
+// #define BASELINE
+
+/*
+   Name : PHI_FRAMEWORK
+   Purpose : To simulate and test the algorithm used for offloading to Phi
+   Overhead : NA : this won't be used for running experiments
+*/
+#define PHI_FRAMEWORK
+
+#if 0
+#define CACHELINE 64 /* bytes, Xeon Phi KNL */
+#else
+#define CACHELINE 0 /* do not worry about false sharing of different threads */
+#endif
+//#define GEMM_PADLEN 1
+#define GEMM_PADLEN 8
+
+#define PSGSTRF2 psgstrf2_trsm
+
+#ifdef ISORT
+extern void isort (int_t N, int_t * ARRAY1, int_t * ARRAY2);
+extern void isort1 (int_t N, int_t * ARRAY);
+
+#else
+
+int
+superlu_sort_perm (const void *arg1, const void *arg2)
+{
+    const int_t *val1 = (const int_t *) arg1;
+    const int_t *val2 = (const int_t *) arg2;
+    return (*val2 < *val1);
+}
+#endif
+
+
+/************************************************************************/
+
+#include "sscatter.c"
+
+/************************************************************************/
+
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * PSGSTRF performs the LU factorization in parallel.
+ *
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_dist_options_t*
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *         The following field should be defined:
+ *         o ReplaceTinyPivot (yes_no_t)
+ *           Specifies whether to replace the tiny diagonals by
+ *           sqrt(epsilon)*norm(A) during LU factorization.
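+ *           For example, a typical caller sets this up as (a minimal
+ *           sketch):
+ *               superlu_dist_options_t options;
+ *               set_default_options_dist(&options);
+ *               options.ReplaceTinyPivot = YES;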
+ *
+ * m      (input) int
+ *        Number of rows in the matrix.
+ *
+ * n      (input) int
+ *        Number of columns in the matrix.
+ *
+ * anorm  (input) float
+ *        The norm of the original matrix A, or the scaled A if
+ *        equilibration was done.
+ *
+ * LUstruct (input/output) sLUstruct_t*
+ *         The data structures to store the distributed L and U factors.
+ *         The following fields should be defined:
+ *
+ *         o Glu_persist (input) Glu_persist_t*
+ *           Global data structure (xsup, supno) replicated on all processes,
+ *           describing the supernode partition in the factored matrices
+ *           L and U:
+ *             xsup[s] is the leading column of the s-th supernode,
+ *             supno[i] is the supernode number to which column i belongs.
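+ *           For example, supernode s spans columns xsup[s] through
+ *           xsup[s+1]-1, so its width is xsup[s+1] - xsup[s]; this is
+ *           what the SuperSize(s) macro used throughout this file computes.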
+ *
+ *         o Llu (input/output) sLocalLU_t*
+ *           The distributed data structures to store L and U factors.
+ *           See superlu_sdefs.h for the definition of 'sLocalLU_t'.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh. It contains the MPI communicator, the number
+ *        of process rows (NPROW), the number of process columns (NPCOL),
+ *        and my process rank. It is an input argument to all the
+ *        parallel routines.
+ *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *        See superlu_sdefs.h for the definition of 'gridinfo_t'.
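+ *        For example (a minimal sketch):
+ *            gridinfo_t grid;
+ *            superlu_gridinit(MPI_COMM_WORLD, nprow, npcol, &grid);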
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics on runtime and floating-point operation count.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info   (output) int*
+ *        = 0: successful exit
+ *        < 0: if info = -i, the i-th argument had an illegal value
+ *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
+ *             been completed, but the factor U is exactly singular,
+ *             and division by zero will occur if it is used to solve a
+ *             system of equations.
+ * </pre>
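+ *
+ * For example, a caller may test for singularity afterwards (a minimal
+ * sketch):
+ *
+ *     psgstrf(options, m, n, anorm, LUstruct, grid, stat, &info);
+ *     if ( info > 0 )
+ *         fprintf(stderr, "U(%d,%d) is exactly zero\n", info, info);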
+ */ +int_t +psgstrf(superlu_dist_options_t * options, int m, int n, float anorm, + sLUstruct_t * LUstruct, gridinfo_t * grid, SuperLUStat_t * stat, int *info) +{ +#ifdef _CRAY + _fcd ftcs = _cptofcd ("N", strlen ("N")); + _fcd ftcs1 = _cptofcd ("L", strlen ("L")); + _fcd ftcs2 = _cptofcd ("N", strlen ("N")); + _fcd ftcs3 = _cptofcd ("U", strlen ("U")); +#endif + float zero = 0.0, alpha = 1.0, beta = 0.0; + int_t *xsup; + int_t *lsub, *lsub1, *usub, *Usub_buf; + int_t **Lsub_buf_2, **Usub_buf_2; + float **Lval_buf_2, **Uval_buf_2; /* pointers to starts of bufs */ + float *lusup, *lusup1, *uval, *Uval_buf; /* pointer to current buf */ + int_t fnz, i, ib, ijb, ilst, it, iukp, jb, jj, klst, knsupc, + lb, lib, ldv, ljb, lptr, lptr0, lptrj, luptr, luptr0, luptrj, + nlb, nub, nsupc, rel, rukp, il, iu; + int_t Pc, Pr; + int iam, kcol, krow, yourcol, mycol, myrow, pi, pj; + int j, k, lk, nsupers; /* k - current panel to work on */ + int k0; /* counter of the next supernode to be factored */ + int kk, kk0, kk1, kk2, jj0; /* panels in the look-ahead window */ + int iukp0, rukp0, flag0, flag1; + int nsupr, nbrow, segsize; + int msg0, msg2; + int_t **Ufstnz_br_ptr, **Lrowind_bc_ptr; + float **Unzval_br_ptr, **Lnzval_bc_ptr; + int_t *index; + float *nzval; + float *ucol; + int *indirect, *indirect2; + int_t *tempi; + float *tempu, *tempv, *tempr; + /* float *tempv2d, *tempU2d; Sherry */ + int iinfo; + int *ToRecv, *ToSendD, **ToSendR; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + sLocalLU_t *Llu = LUstruct->Llu; + superlu_scope_t *scp; + float s_eps; + double thresh; + /*int full;*/ + int ldt, ldu, lead_zero, ncols, ncb, nrb, p, pr, pc, nblocks; + int_t *etree_supno_l, *etree_supno, *blocks, *blockr, *Ublock, *Urows, + *Lblock, *Lrows, *perm_u, *sf_block, *sf_block_l, *nnodes_l, + *nnodes_u, *edag_supno_l, *recvbuf, **edag_supno; + float edag_supno_l_bytes; +#ifdef ISORT + int_t *iperm_u; +#endif + int *msgcnt; /* Count the size of the message xfer'd in each buffer: + * 0 : transferred in Lsub_buf[] + * 1 : transferred in Lval_buf[] + * 2 : transferred in Usub_buf[] + * 3 : transferred in Uval_buf[] + */ + int **msgcnts, **msgcntsU; /* counts in the look-ahead window */ + int *factored; /* factored[j] == 0 : L col panel j is factorized. */ + int *factoredU; /* factoredU[i] == 1 : U row panel i is factorized. */ + int nnodes, *sendcnts, *sdispls, *recvcnts, *rdispls, *srows, *rrows; + etree_node *head, *tail, *ptr; + int *num_child; + int num_look_aheads, look_id; + int *look_ahead; /* global look_ahead table */ + int_t *perm_c_supno, *iperm_c_supno; + /* perm_c_supno[k] = j means at the k-th step of elimination, + * the j-th supernode is chosen. 
*/ + MPI_Request *recv_req, **recv_reqs, **send_reqs, **send_reqs_u, + **recv_reqs_u; + MPI_Request *send_req, *U_diag_blk_send_req = NULL; + MPI_Status status; + void *attr_val; + int flag; + + /* The following variables are used to pad GEMM dimensions so that + each is a multiple of vector length (8 doubles for KNL) */ + int gemm_m_pad = GEMM_PADLEN, gemm_k_pad = GEMM_PADLEN, + gemm_n_pad = GEMM_PADLEN; + int gemm_padding = 0; + + int iword = sizeof (int_t); + int dword = sizeof (float); + + /* For measuring load imbalence in omp threads */ + double omp_load_imblc = 0.0; + double *omp_loop_time; + + double schur_flop_timer = 0.0; + double pdgstrf2_timer = 0.0; + double pdgstrs2_timer = 0.0; + double lookaheadupdatetimer = 0.0; + double InitTimer = 0.0; /* including compute schedule, malloc */ + double tt_start, tt_end; + +/* #if !defined( GPU_ACC ) */ + /* Counters for memory operations and timings */ + double scatter_mem_op_counter = 0.0; + double scatter_mem_op_timer = 0.0; + double scatterL_mem_op_counter = 0.0; + double scatterL_mem_op_timer = 0.0; + double scatterU_mem_op_counter = 0.0; + double scatterU_mem_op_timer = 0.0; + + /* Counters for flops/gather/scatter and timings */ + double GatherLTimer = 0.0; + double LookAheadRowSepMOP = 0.0; + double GatherUTimer = 0.0; + double GatherMOP = 0.0; + double LookAheadGEMMTimer = 0.0; + double LookAheadGEMMFlOp = 0.0; + double LookAheadScatterTimer = 0.0; + double LookAheadScatterMOP = 0.0; + double RemainGEMMTimer = 0.0; + double RemainGEMM_flops = 0.0; + double RemainScatterTimer = 0.0; + double NetSchurUpTimer = 0.0; + double schur_flop_counter = 0.0; +/* #endif */ + +#if ( PRNTlevel>= 1) + /* count GEMM max dimensions */ + int gemm_max_m = 0, gemm_max_n = 0, gemm_max_k = 0; +#endif + +#if ( DEBUGlevel>=2 ) + int_t num_copy = 0, num_update = 0; +#endif +#if ( PRNTlevel==3 ) + int zero_msg = 0, total_msg = 0; +#endif +#if ( PROFlevel>=1 ) + double t1, t2; + float msg_vol = 0, msg_cnt = 0; + double comm_wait_time = 0.0; + /* Record GEMM dimensions and times */ + FILE *fopen(), *fgemm; + int gemm_count = 0; + typedef struct { + int m, n, k; + double microseconds; + } gemm_profile; + gemm_profile *gemm_stats; +#endif + + /* Test the input parameters. */ + *info = 0; + if (m < 0) + *info = -2; + else if (n < 0) + *info = -3; + if (*info) { + pxerr_dist ("psgstrf", grid, -*info); + return (-1); + } + + /* Quick return if possible. */ + if (m == 0 || n == 0) return 0; + + double tt1 = SuperLU_timer_ (); + + /* + * Initialization. 
+ */ + iam = grid->iam; + Pc = grid->npcol; + Pr = grid->nprow; + myrow = MYROW (iam, grid); + mycol = MYCOL (iam, grid); + nsupers = Glu_persist->supno[n - 1] + 1; + xsup = Glu_persist->xsup; + s_eps = smach_dist("Epsilon"); + thresh = s_eps * anorm; + + MPI_Comm_get_attr (MPI_COMM_WORLD, MPI_TAG_UB, &attr_val, &flag); + if (!flag) { + fprintf (stderr, "Could not get TAG_UB\n"); + return (-1); + } + int tag_ub = *(int *) attr_val; + +#if ( PRNTlevel>=1 ) + if (!iam) { + printf ("MPI tag upper bound = %d\n", tag_ub); fflush(stdout); + } +#endif + +#if ( DEBUGlevel>=1 ) + if (s_eps == 0.0) + printf (" ***** warning s_eps = %e *****\n", s_eps); + CHECK_MALLOC (iam, "Enter psgstrf()"); +#endif +#if (PROFlevel >= 1 ) + gemm_stats = (gemm_profile *) SUPERLU_MALLOC(nsupers * sizeof(gemm_profile)); + if (iam == 0) fgemm = fopen("dgemm_mnk.dat", "w"); + int *prof_sendR = intCalloc_dist(nsupers); +#endif + + stat->ops[FACT] = 0.0; + stat->current_buffer = 0.0; + stat->peak_buffer = 0.0; + stat->gpu_buffer = 0.0; + + /* make sure the range of look-ahead window [0, MAX_LOOKAHEADS-1] */ + num_look_aheads = SUPERLU_MAX(0, SUPERLU_MIN(options->num_lookaheads, MAX_LOOKAHEADS - 1)); + + if (Pr * Pc > 1) { + if (!(U_diag_blk_send_req = + (MPI_Request *) SUPERLU_MALLOC (Pr * sizeof (MPI_Request)))) + ABORT ("Malloc fails for U_diag_blk_send_req[]."); + /* flag no outstanding Isend */ + U_diag_blk_send_req[myrow] = MPI_REQUEST_NULL; /* used 0 before */ + + /* allocating buffers for look-ahead */ + i = Llu->bufmax[0]; + if (i != 0) { + if ( !(Llu->Lsub_buf_2[0] = intMalloc_dist ((num_look_aheads + 1) * ((size_t) i))) ) + ABORT ("Malloc fails for Lsub_buf."); + tempi = Llu->Lsub_buf_2[0]; + for (jj = 0; jj < num_look_aheads; jj++) + Llu->Lsub_buf_2[jj+1] = tempi + i*(jj+1); /* vectorize */ + //Llu->Lsub_buf_2[jj + 1] = Llu->Lsub_buf_2[jj] + i; + } + i = Llu->bufmax[1]; + if (i != 0) { + if (!(Llu->Lval_buf_2[0] = floatMalloc_dist ((num_look_aheads + 1) * ((size_t) i)))) + ABORT ("Malloc fails for Lval_buf[]."); + tempr = Llu->Lval_buf_2[0]; + for (jj = 0; jj < num_look_aheads; jj++) + Llu->Lval_buf_2[jj+1] = tempr + i*(jj+1); /* vectorize */ + //Llu->Lval_buf_2[jj + 1] = Llu->Lval_buf_2[jj] + i; + } + i = Llu->bufmax[2]; + if (i != 0) { + if (!(Llu->Usub_buf_2[0] = intMalloc_dist ((num_look_aheads + 1) * i))) + ABORT ("Malloc fails for Usub_buf_2[]."); + tempi = Llu->Usub_buf_2[0]; + for (jj = 0; jj < num_look_aheads; jj++) + Llu->Usub_buf_2[jj+1] = tempi + i*(jj+1); /* vectorize */ + //Llu->Usub_buf_2[jj + 1] = Llu->Usub_buf_2[jj] + i; + } + i = Llu->bufmax[3]; + if (i != 0) { + if (!(Llu->Uval_buf_2[0] = floatMalloc_dist ((num_look_aheads + 1) * i))) + ABORT ("Malloc fails for Uval_buf_2[]."); + tempr = Llu->Uval_buf_2[0]; + for (jj = 0; jj < num_look_aheads; jj++) + Llu->Uval_buf_2[jj+1] = tempr + i*(jj+1); /* vectorize */ + //Llu->Uval_buf_2[jj + 1] = Llu->Uval_buf_2[jj] + i; + } + } + + log_memory( (Llu->bufmax[0] + Llu->bufmax[2]) * (num_look_aheads + 1) + * iword + + (Llu->bufmax[1] + Llu->bufmax[3]) * (num_look_aheads + 1) + * dword, stat ); + + /* creating pointers to the look-ahead buffers */ + if (! (Lsub_buf_2 = SUPERLU_MALLOC ((1 + num_look_aheads) * sizeof (int_t *)))) + ABORT ("Malloc fails for Lsub_buf_2[]."); + if (! (Lval_buf_2 = SUPERLU_MALLOC ((1 + num_look_aheads) * sizeof (float *)))) + ABORT ("Malloc fails for Lval_buf_2[]."); + if (! (Usub_buf_2 = SUPERLU_MALLOC ((1 + num_look_aheads) * sizeof (int_t *)))) + ABORT ("Malloc fails for Uval_buf_2[]."); + if (! 
(Uval_buf_2 = SUPERLU_MALLOC ((1 + num_look_aheads) * sizeof (float *)))) + ABORT ("Malloc fails for buf_2[]."); + for (i = 0; i <= num_look_aheads; i++) { + Lval_buf_2[i] = Llu->Lval_buf_2[i]; + Lsub_buf_2[i] = Llu->Lsub_buf_2[i]; + Uval_buf_2[i] = Llu->Uval_buf_2[i]; + Usub_buf_2[i] = Llu->Usub_buf_2[i]; + } + + if (!(msgcnts = SUPERLU_MALLOC ((1 + num_look_aheads) * sizeof (int *)))) + ABORT ("Malloc fails for msgcnts[]."); + if (!(msgcntsU = SUPERLU_MALLOC ((1 + num_look_aheads) * sizeof (int *)))) + ABORT ("Malloc fails for msgcntsU[]."); + for (i = 0; i <= num_look_aheads; i++) { + if (!(msgcnts[i] = SUPERLU_MALLOC (4 * sizeof (int)))) + ABORT ("Malloc fails for msgcnts[]."); + if (!(msgcntsU[i] = SUPERLU_MALLOC (4 * sizeof (int)))) + ABORT ("Malloc fails for msgcntsU[]."); + } + + if (! (recv_reqs_u = SUPERLU_MALLOC ((1 + num_look_aheads) * sizeof (MPI_Request *)))) + ABORT ("Malloc fails for recv_reqs_u[]."); + if (! (send_reqs_u = SUPERLU_MALLOC ((1 + num_look_aheads) * sizeof (MPI_Request *)))) + ABORT ("Malloc fails for send_reqs_u[]."); + if (! (send_reqs = SUPERLU_MALLOC ((1 + num_look_aheads) * sizeof (MPI_Request *)))) + ABORT ("Malloc fails for send_reqs_u[]."); + if (! (recv_reqs = SUPERLU_MALLOC ((1 + num_look_aheads) * sizeof (MPI_Request *)))) + ABORT ("Malloc fails for recv_reqs[]."); + for (i = 0; i <= num_look_aheads; i++) { + if (!(recv_reqs_u[i] = (MPI_Request *) SUPERLU_MALLOC (2 * sizeof (MPI_Request)))) + ABORT ("Malloc fails for recv_req_u[i]."); + if (!(send_reqs_u[i] = (MPI_Request *) SUPERLU_MALLOC (2 * Pr * sizeof (MPI_Request)))) + ABORT ("Malloc fails for send_req_u[i]."); + if (!(send_reqs[i] = (MPI_Request *) SUPERLU_MALLOC (2 * Pc * sizeof (MPI_Request)))) + ABORT ("Malloc fails for send_reqs[i]."); + if (!(recv_reqs[i] = (MPI_Request *) SUPERLU_MALLOC (4 * sizeof (MPI_Request)))) + ABORT ("Malloc fails for recv_req[]."); + send_reqs[i][0] = send_reqs[i][1] = MPI_REQUEST_NULL; + recv_reqs[i][0] = recv_reqs[i][1] = MPI_REQUEST_NULL; + } + + if (!(factored = SUPERLU_MALLOC (nsupers * sizeof (int_t)))) + ABORT ("Malloc fails for factored[]."); + if (!(factoredU = SUPERLU_MALLOC (nsupers * sizeof (int_t)))) + ABORT ("Malloc fails for factoredU[]."); + for (i = 0; i < nsupers; i++) factored[i] = factoredU[i] = -1; + + log_memory(2 * nsupers * iword, stat); + + int num_threads = 1; +#ifdef _OPENMP +#pragma omp parallel default(shared) + #pragma omp master + { + num_threads = omp_get_num_threads (); + } +#endif + +#if 0 + omp_loop_time = (double *) _mm_malloc (sizeof (double) * num_threads,64); +#else + omp_loop_time = (double *) SUPERLU_MALLOC(num_threads * sizeof(double)); +#endif + +#if ( PRNTlevel>=1 ) + if(!iam) { + printf(".. Starting with %d OpenMP threads \n", num_threads ); + fflush(stdout); + } +#endif + + nblocks = 0; + ncb = nsupers / Pc; /* number of column blocks, horizontal */ + nrb = nsupers / Pr; /* number of row blocks, vertical */ + + /* in order to have dynamic scheduling */ + int *full_u_cols; + int *blk_ldu; +#if 0 + full_u_cols = (int_t *) _mm_malloc (sizeof (int_t) * ncb,64); + blk_ldu = (int_t *) _mm_malloc (sizeof (int_t) * ncb,64); +#else + full_u_cols = SUPERLU_MALLOC((ncb+1) * sizeof(int)); + blk_ldu = SUPERLU_MALLOC((ncb+1) * sizeof(int)); // +1 to accommodate un-even division +#endif + + log_memory(2 * ncb * iword, stat); + +#if 0 /* Sherry: not used? 
*/ + /* This bunch is used for static scheduling */ + pair *full_col_count = (pair *) _mm_malloc (sizeof (pair) * ncb,64); + int_t *count_cols, *sum_cols, *partition; + count_cols = (int_t *) _mm_malloc (sizeof (int_t) * num_threads,64); + sum_cols = (int_t *) _mm_malloc (sizeof (int_t) * num_threads,64); + partition = (int_t *) _mm_malloc (sizeof (int_t) * num_threads * ncb,64); + int_t ldp = ncb; +#endif + + /* ################################################################## + * Compute a good static schedule based on the factorization task graph. + * ################################################################## */ + perm_c_supno = SUPERLU_MALLOC (2 * nsupers * sizeof (int_t)); + iperm_c_supno = perm_c_supno + nsupers; + + sstatic_schedule(options, m, n, LUstruct, grid, stat, + perm_c_supno, iperm_c_supno, info); + +#if ( DEBUGlevel >= 2 ) + PrintInt10("schedule:perm_c_supno", nsupers, perm_c_supno); + + /* Turn off static schedule */ + printf("[%d] .. Turn off static schedule for debugging ..\n", iam); + for (i = 0; i < nsupers; ++i) perm_c_supno[i] = iperm_c_supno[i] = i; +#endif + /* ################################################################## */ + + /* constructing look-ahead table to indicate the last dependency */ + int *look_ahead_l; /* Sherry: add comment on look_ahead_l[] */ + stat->num_look_aheads = num_look_aheads; + + look_ahead_l = SUPERLU_MALLOC (nsupers * sizeof (int)); + look_ahead = SUPERLU_MALLOC (nsupers * sizeof (int)); + for (lb = 0; lb < nsupers; lb++) look_ahead_l[lb] = -1; /* vectorized */ + log_memory(3 * nsupers * iword, stat); + + /* Sherry: omp parallel? + not worth doing, due to concurrent write to look_ahead_l[jb] */ + for (lb = 0; lb < nrb; ++lb) { /* go through U-factor */ + ib = lb * Pr + myrow; + index = Llu->Ufstnz_br_ptr[lb]; + if (index) { /* Not an empty row */ + k = BR_HEADER; + for (j = 0; j < index[0]; ++j) { + jb = index[k]; /* global block number */ + if (jb != ib) + look_ahead_l[jb] = + SUPERLU_MAX (iperm_c_supno[ib], look_ahead_l[jb]); + k += UB_DESCRIPTOR + SuperSize (index[k]); + } + } + } + if (myrow < nsupers % grid->nprow) { /* leftover block rows */ + ib = nrb * Pr + myrow; + index = Llu->Ufstnz_br_ptr[nrb]; + if (index) { /* Not an empty row */ + k = BR_HEADER; + for (j = 0; j < index[0]; ++j) { + jb = index[k]; + if (jb != ib) + look_ahead_l[jb] = + SUPERLU_MAX (iperm_c_supno[ib], look_ahead_l[jb]); + k += UB_DESCRIPTOR + SuperSize (index[k]); + } + } + } + + if (options->SymPattern == NO) { + /* Sherry: omp parallel? 
+ not worth doing, due to concurrent write to look_ahead_l[jb] */ + for (lb = 0; lb < ncb; lb++) { /* go through L-factor */ + ib = lb * Pc + mycol; + index = Llu->Lrowind_bc_ptr[lb]; + if (index) { + k = BC_HEADER; + for (j = 0; j < index[0]; j++) { + jb = index[k]; + if (jb != ib) + look_ahead_l[jb] = + SUPERLU_MAX (iperm_c_supno[ib], look_ahead_l[jb]); + k += LB_DESCRIPTOR + index[k + 1]; + } + } + } + if (mycol < nsupers % grid->npcol) { /* leftover block columns */ + ib = ncb * Pc + mycol; + index = Llu->Lrowind_bc_ptr[ncb]; + if (index) { + k = BC_HEADER; + for (j = 0; j < index[0]; j++) { + jb = index[k]; + if (jb != ib) + look_ahead_l[jb] = + SUPERLU_MAX (iperm_c_supno[ib], look_ahead_l[jb]); + k += LB_DESCRIPTOR + index[k + 1]; + } + } + } + } + MPI_Allreduce (look_ahead_l, look_ahead, nsupers, MPI_INT, MPI_MAX, grid->comm); + SUPERLU_FREE (look_ahead_l); + +#ifdef ISORT + iperm_u = SUPERLU_MALLOC (nsupers * sizeof (int_t)); + perm_u = SUPERLU_MALLOC (nsupers * sizeof (int_t)); +#else + perm_u = SUPERLU_MALLOC (2 * nsupers * sizeof (int_t)); +#endif + log_memory(nsupers * iword, stat); + + k = sp_ienv_dist (3); /* max supernode size */ +#if 0 + if ( !(Llu->ujrow = floatMalloc_dist(k*(k+1)/2)) ) + ABORT("Malloc fails for ujrow[]."); +#else + /* Instead of half storage, we'll do full storage */ + if (!(Llu->ujrow = floatCalloc_dist (k * k))) + ABORT ("Malloc fails for ujrow[]."); +#endif + log_memory(k * k * iword, stat); + +#if ( PRNTlevel>=1 ) + if (!iam) { + printf (".. thresh = s_eps %e * anorm %e = %e\n", s_eps, anorm, + thresh); + printf + (".. Buffer size: Lsub %ld\tLval %ld\tUsub %ld\tUval %ld\tLDA %ld\n", + (long int) Llu->bufmax[0], (long int) Llu->bufmax[1], + (long int) Llu->bufmax[2], (long int) Llu->bufmax[3], + (long int) Llu->bufmax[4]); + fflush(stdout); + } +#endif + + Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + Unzval_br_ptr = Llu->Unzval_br_ptr; + ToRecv = Llu->ToRecv; + ToSendD = Llu->ToSendD; + ToSendR = Llu->ToSendR; + + ldt = sp_ienv_dist (3); /* Size of maximum supernode */ + k = CEILING (nsupers, Pr); /* Number of local block rows */ + + /* Following code is for finding maximum row dimension of all L panels */ + int local_max_row_size = 0; + int max_row_size; + +#if 0 +#if defined _OPENMP // Sherry: parallel reduction -- seems slower? 
+#pragma omp parallel for reduction(max :local_max_row_size) private(lk,lsub) +#endif +#endif + for (i = mycol; i < nsupers; i += Pc) { /* grab my local columns */ + //int tpc = PCOL (i, grid); + lk = LBj (i, grid); + lsub = Lrowind_bc_ptr[lk]; + if (lsub != NULL) { + if (lsub[1] > local_max_row_size) local_max_row_size = lsub[1]; + } + + } + + /* Max row size is global reduction within a row */ + MPI_Allreduce (&local_max_row_size, &max_row_size, 1, MPI_INT, MPI_MAX, + (grid->rscp.comm)); + + /* Buffer size is max of look-ahead window */ + /* int_t buffer_size = + SUPERLU_MAX (max_row_size * num_threads * ldt, + get_max_buffer_size ()); */ + +#ifdef GPU_ACC /*-------- use GPU --------*/ + int cublas_nb = get_cublas_nb(); // default 64 + int nstreams = get_num_cuda_streams (); // default 8 + + int_t buffer_size = SUPERLU_MAX(max_row_size * nstreams * cublas_nb, sp_ienv_dist(8)); + // get_max_buffer_size()); + /* array holding last column blk for each partition, + used in SchCompUdt-cuda.c */ + #if 0 + int *stream_end_col = (int_t *) _mm_malloc (sizeof (int_t) * nstreams,64); + #else + int *stream_end_col = SUPERLU_MALLOC( nstreams * sizeof(int) ); + #endif + +#else /* not to use GPU */ + + int Threads_per_process = get_thread_per_process(); + int_t buffer_size = SUPERLU_MAX(max_row_size * Threads_per_process * ldt, sp_ienv_dist(8)); + // get_max_buffer_size()); +#endif /* end ifdef GPU_ACC -----------*/ + + int_t max_ncols = 0; +#if 0 + /* symmetric assumption -- using L's supernode to estimate. */ + /* Note that in following expression 8 can be anything + as long as its not too big */ + int bigu_size = 8 * sp_ienv_dist (3) * (max_row_size); +#else + int_t bigu_size = estimate_bigu_size( nsupers, Ufstnz_br_ptr, Glu_persist, + grid, perm_u, &max_ncols ); +#endif + + /* +16 to avoid cache line false sharing */ + // int_t bigv_size = SUPERLU_MAX(max_row_size * (bigu_size / ldt), + int_t bigv_size = SUPERLU_MAX(max_row_size * max_ncols, + (ldt*ldt + CACHELINE / dword) * num_threads); + + /* bigU and bigV are only allocated on CPU, but may be allocated as + page-locked memory accessible to GPU. */ + float* bigU; /* for storing entire U(k,:) panel, prepare for GEMM. */ + float* bigV; /* for storing GEMM output matrix, i.e. update matrix. + bigV is large enough to hold the aggregate GEMM output.*/ + bigU = NULL; /* allocated only on CPU */ + bigV = NULL; + +#if ( PRNTlevel>=1 ) + if(!iam) { + printf("\t.. MAX_BUFFER_SIZE %d set for GPU\n", sp_ienv_dist(8)); + printf("\t.. N_GEMM: %d flops of GEMM done on CPU (1st block always on CPU)\n", sp_ienv_dist(7)); + printf("\t.. GEMM buffer size: max_row_size X max_ncols = %d x " IFMT "\n", + max_row_size, max_ncols); + } + printf("[%d].. BIG U size " IFMT " (on CPU)\n", iam, bigu_size); + fflush(stdout); +#endif + +#ifdef GPU_ACC /*-- use GPU --*/ + + if ( checkCuda(cudaHostAlloc((void**)&bigU, bigu_size * sizeof(float), cudaHostAllocDefault)) ) + ABORT("Malloc fails for sgemm buffer U "); + +#if 0 // !!Sherry fix -- only dC on GPU uses buffer_size + bigv_size = buffer_size; +#endif + +#if ( PRNTlevel>=1 ) + printf("[%d].. 
BIG V size " IFMT " (on CPU), dC buffer_size " IFMT " (on GPU)\n", + iam, bigv_size, buffer_size); + fflush(stdout); +#endif + + if ( checkCuda(cudaHostAlloc((void**)&bigV, bigv_size * sizeof(float) ,cudaHostAllocDefault)) ) + ABORT("Malloc fails for sgemm buffer V"); + +#if ( PRNTlevel>=1 ) + if ( iam==0 ) { + DisplayHeader(); + printf(" Starting with %d Cuda Streams \n",nstreams ); + fflush(stdout); + } +#endif + + cublasHandle_t *handle; + handle = (cublasHandle_t *) SUPERLU_MALLOC(sizeof(cublasHandle_t)*nstreams); + for(int i = 0; i < nstreams; i++) handle[i] = create_handle(); + + // creating streams + cudaStream_t *streams; + streams = (cudaStream_t *) SUPERLU_MALLOC(sizeof(cudaStream_t)*nstreams); + for (int i = 0; i < nstreams; ++i) + checkCuda( cudaStreamCreate(&streams[i]) ); + + // allocating data in device + float *dA, *dB, *dC; + cudaError_t cudaStat; +#if 0 + // cudaStat = cudaMalloc( (void**)&dA, m*k*sizeof(double)); + // HOw much should be the size of dA? + // for time being just making it + // cudaStat = cudaMalloc( (void**)&dA, ((max_row_size*sp_ienv_dist(3)))* sizeof(double)); +#endif + + cudaStat = cudaMalloc( (void**)&dA, max_row_size*sp_ienv_dist(3)* sizeof(float)); + if (cudaStat!= cudaSuccess) { + fprintf(stderr, "!!!! Error in allocating A in the device %ld \n",m*k*sizeof(float) ); + return 1; + } + + // size of B should be bigu_size + cudaStat = cudaMalloc((void**)&dB, bigu_size * sizeof(float)); + if (cudaStat!= cudaSuccess) { + fprintf(stderr, "!!!! Error in allocating B in the device %ld \n",n*k*sizeof(float)); + return 1; + } + + cudaStat = cudaMalloc((void**)&dC, buffer_size * sizeof(float) ); + if (cudaStat!= cudaSuccess) { + fprintf(stderr, "!!!! Error in allocating C in the device \n" ); + return 1; + } + + stat->gpu_buffer += dword * ( max_row_size * sp_ienv_dist(3) // dA + + bigu_size // dB + + buffer_size ); // dC + +#else /*-------- not to use GPU --------*/ + + // for GEMM padding 0 + j = bigu_size / ldt; + bigu_size += (gemm_k_pad * (j + ldt + gemm_n_pad)); + bigv_size += (gemm_m_pad * (j + max_row_size + gemm_n_pad)); + +#if ( PRNTlevel>=1 ) + printf("[%d].. BIG V size " IFMT " (on CPU)\n", iam, bigv_size); + fflush(stdout); +#endif + +//#ifdef __INTEL_COMPILER +// bigU = _mm_malloc(bigu_size * sizeof(float), 1<<12); // align at 4K page +// bigV = _mm_malloc(bigv_size * sizeof(float), 1<<12); +//#else + if ( !(bigU = floatMalloc_dist(bigu_size)) ) + ABORT ("Malloc fails for sgemm U buffer"); + if ( !(bigV = floatMalloc_dist(bigv_size)) ) + ABORT ("Malloc failed for sgemm V buffer"); +//#endif + +#endif +/*************** end ifdef GPU_ACC ****************/ + + log_memory((bigv_size + bigu_size) * dword, stat); + + // mlock(bigU,(bigu_size) * sizeof (double)); + +#if ( PRNTlevel>=1 ) + if(!iam) { + printf (" Max row size is %d \n", max_row_size); + printf (" Threads per process %d \n", num_threads); + fflush(stdout); + } + +#endif + + /* Sherry: (ldt + 16), avoid cache line false sharing. 
+ KNL cacheline size = 64 bytes = 16 int */ + iinfo = ldt + CACHELINE / sizeof(int); + if (!(indirect = SUPERLU_MALLOC (iinfo * num_threads * sizeof(int)))) + ABORT ("Malloc fails for indirect[]."); + if (!(indirect2 = SUPERLU_MALLOC (iinfo * num_threads * sizeof(int)))) + ABORT ("Malloc fails for indirect[]."); + + log_memory(2 * ldt*ldt * dword + 2 * iinfo * num_threads * iword, stat); + + int_t *lookAheadFullRow,*lookAheadStRow,*lookAhead_lptr,*lookAhead_ib, + *RemainStRow,*Remain_lptr,*Remain_ib; + + lookAheadFullRow = intMalloc_dist( (num_look_aheads+1) ); + lookAheadStRow = intMalloc_dist( (num_look_aheads+1) ); + lookAhead_lptr = intMalloc_dist( (num_look_aheads+1) ); + lookAhead_ib = intMalloc_dist( (num_look_aheads+1) ); + + int_t mrb = (nsupers + Pr - 1) / Pr; + int_t mcb = (nsupers + Pc - 1) / Pc; + + RemainStRow = intMalloc_dist(mrb); +#if 0 + Remain_lptr = (int *) _mm_malloc(sizeof(int)*mrb,1); +#else + Remain_lptr = intMalloc_dist(mrb); +#endif + // mlock(Remain_lptr, sizeof(int)*mrb ); + Remain_ib = intMalloc_dist(mrb); + + Remain_info_t *Remain_info; +#if 0 + Remain_info = (Remain_info_t *) _mm_malloc(mrb*sizeof(Remain_info_t),64); +#else + Remain_info = (Remain_info_t *) SUPERLU_MALLOC(mrb*sizeof(Remain_info_t)); +#endif + + float *lookAhead_L_buff, *Remain_L_buff; /* Stores entire L-panel */ + Ublock_info_t *Ublock_info; + ldt = sp_ienv_dist (3); /* max supernode size */ + /* The following is quite loose */ + lookAhead_L_buff = floatMalloc_dist(ldt*ldt* (num_look_aheads+1) ); + +#if 0 + Remain_L_buff = (float *) _mm_malloc( sizeof(float)*(Llu->bufmax[1]),64); + Ublock_info = (Ublock_info_t *) _mm_malloc(mcb*sizeof(Ublock_info_t),64); + /*int * Ublock_info_iukp = (int *) _mm_malloc(mcb*sizeof(int),64); + int * Ublock_info_rukp = (int *) _mm_malloc(mcb*sizeof(int),64); + int * Ublock_info_jb = (int *) _mm_malloc(mcb*sizeof(int),64); */ +#else + j = gemm_m_pad * (ldt + max_row_size + gemm_k_pad); + Remain_L_buff = floatMalloc_dist(Llu->bufmax[1] + j); /* This is loose */ + Ublock_info = (Ublock_info_t *) SUPERLU_MALLOC(mcb*sizeof(Ublock_info_t)); + /*int *Ublock_info_iukp = (int *) SUPERLU_MALLOC(mcb*sizeof(int)); + int *Ublock_info_rukp = (int *) SUPERLU_MALLOC(mcb*sizeof(int)); + int *Ublock_info_jb = (int *) SUPERLU_MALLOC(mcb*sizeof(int)); */ +#endif + + long long alloc_mem = 3 * mrb * iword + mrb * sizeof(Remain_info_t) + + ldt * ldt * (num_look_aheads+1) * dword + + Llu->bufmax[1] * dword ; + log_memory(alloc_mem, stat); + + InitTimer = SuperLU_timer_() - tt1; + + double pxgstrfTimer = SuperLU_timer_(); + + /* ################################################################## + ** Handle first block column separately to start the pipeline. ** + ################################################################## */ + look_id = 0; + msgcnt = msgcnts[0]; /* Lsub[0] to be transferred */ + send_req = send_reqs[0]; + recv_req = recv_reqs[0]; + + k0 = 0; + k = perm_c_supno[0]; + kcol = PCOL (k, grid); + krow = PROW (k, grid); + if (mycol == kcol) { + double ttt1 = SuperLU_timer_(); + + /* panel factorization */ + PSGSTRF2 (options, k0, k, thresh, Glu_persist, grid, Llu, + U_diag_blk_send_req, tag_ub, stat, info); + + pdgstrf2_timer += SuperLU_timer_()-ttt1; + + scp = &grid->rscp; /* The scope of process row. */ + + /* Multicasts numeric values of L(:,0) to process rows. */ + lk = LBj (k, grid); /* Local block number. 
*/ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + if (lsub) { + /* number of entries in Lsub_buf[] to be transferred */ + msgcnt[0] = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR; + /* number of entries in Lval_buf[] to be transferred */ + msgcnt[1] = lsub[1] * SuperSize (k); + } else { + msgcnt[0] = msgcnt[1] = 0; + } + + for (pj = 0; pj < Pc; ++pj) { + if (ToSendR[lk][pj] != EMPTY) { +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + + MPI_Isend (lsub, msgcnt[0], mpi_int_t, pj, + SLU_MPI_TAG (0, 0) /* 0 */, + scp->comm, &send_req[pj]); + MPI_Isend (lusup, msgcnt[1], MPI_FLOAT, pj, + SLU_MPI_TAG (1, 0) /* 1 */, + scp->comm, &send_req[pj + Pc]); +#if ( DEBUGlevel>=2 ) + printf ("[%d] first block cloumn Send L(:,%4d): lsub %4d, lusup %4d to Pc %2d\n", + iam, 0, msgcnt[0], msgcnt[1], pj); +#endif + +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_RIGHT] += t2; + ++prof_sendR[lk]; + msg_cnt += 2; + msg_vol += msgcnt[0] * iword + msgcnt[1] * dword; +#endif + } /* end if */ + } /* end for pj ... */ + } else { /* Post immediate receives. */ + if (ToRecv[k] >= 1) { /* Recv block column L(:,0). */ + scp = &grid->rscp; /* The scope of process row. */ +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + MPI_Irecv (Lsub_buf_2[0], Llu->bufmax[0], mpi_int_t, kcol, + SLU_MPI_TAG (0, 0) /* 0 */ , + scp->comm, &recv_req[0]); + MPI_Irecv (Lval_buf_2[0], Llu->bufmax[1], MPI_FLOAT, kcol, + SLU_MPI_TAG (1, 0) /* 1 */ , + scp->comm, &recv_req[1]); +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_RIGHT] += t2; +#endif + } + } /* end if mycol == 0 */ + + factored[k] = 0; /* flag column k as factored. */ + + /* post receive of first U-row */ + if (myrow != krow) { + if (ToRecv[k] == 2) { /* Recv block row U(k,:). */ + scp = &grid->cscp; /* The scope of process column. */ + Usub_buf = Llu->Usub_buf_2[0]; + Uval_buf = Llu->Uval_buf_2[0]; +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + MPI_Irecv (Usub_buf, Llu->bufmax[2], mpi_int_t, krow, + SLU_MPI_TAG (2, 0) /* 2%tag_ub */ , + scp->comm, &recv_reqs_u[0][0]); + MPI_Irecv (Uval_buf, Llu->bufmax[3], MPI_FLOAT, krow, + SLU_MPI_TAG (3, 0) /* 3%tag_ub */ , + scp->comm, &recv_reqs_u[0][1]); +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_DOWN] += t2; +#endif + } + } + + /* ################################################################## + **** MAIN LOOP **** + ################################################################## */ + for (k0 = 0; k0 < nsupers; ++k0) { + k = perm_c_supno[k0]; + + /* ============================================ * + * ======= look-ahead the new L columns ======= * + * ============================================ */ + /* tt1 = SuperLU_timer_(); */ + if (k0 == 0) { /* look-ahead all the columns in the window */ + kk1 = k0 + 1; + kk2 = SUPERLU_MIN (k0 + num_look_aheads, nsupers - 1); + } else { /* look-ahead one new column after the current window */ + kk1 = k0 + num_look_aheads; + kk2 = SUPERLU_MIN (kk1, nsupers - 1); + } + + for (kk0 = kk1; kk0 <= kk2; kk0++) { + /* loop through look-ahead window in L */ + + kk = perm_c_supno[kk0]; /* use the ordering from static schedule */ + look_id = kk0 % (1 + num_look_aheads); /* which column in window */ + + if (look_ahead[kk] < k0) { /* does not depend on current column k */ + kcol = PCOL (kk, grid); + if (mycol == kcol) { /* I own this panel */ + + /* Panel factorization -- Factor diagonal and subdiagonal + L blocks and test for exact singularity. 
*/ + factored[kk] = 0; /* flag column kk as factored */ + double ttt1 = SuperLU_timer_(); + + PSGSTRF2 (options, kk0, kk, thresh, Glu_persist, + grid, Llu, U_diag_blk_send_req, tag_ub, stat, info); + + pdgstrf2_timer += SuperLU_timer_() - ttt1; + + /* Multicasts numeric values of L(:,kk) to process rows. */ + /* ttt1 = SuperLU_timer_(); */ + msgcnt = msgcnts[look_id]; /* point to the proper count array */ + send_req = send_reqs[look_id]; + + lk = LBj (kk, grid); /* Local block number in L. */ + lsub1 = Lrowind_bc_ptr[lk]; + if (lsub1) { + msgcnt[0] = lsub1[1] + BC_HEADER + lsub1[0] * LB_DESCRIPTOR; /* size of metadata */ + msgcnt[1] = lsub1[1] * SuperSize (kk); /* Lval_buf[] size */ + } else { + msgcnt[0] = 0; + msgcnt[1] = 0; + } + scp = &grid->rscp; /* The scope of process row. */ + for (pj = 0; pj < Pc; ++pj) { + if (ToSendR[lk][pj] != EMPTY) { + lusup1 = Lnzval_bc_ptr[lk]; +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + MPI_Isend (lsub1, msgcnt[0], mpi_int_t, pj, + SLU_MPI_TAG (0, kk0), /* (4*kk0)%tag_ub */ + scp->comm, &send_req[pj]); + MPI_Isend (lusup1, msgcnt[1], MPI_FLOAT, pj, + SLU_MPI_TAG (1, kk0), /* (4*kk0+1)%tag_ub */ + scp->comm, &send_req[pj + Pc]); +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_RIGHT] += t2; + ++prof_sendR[lk]; +#endif +#if ( DEBUGlevel>=2 ) + printf ("[%d] -1- Send L(:,%4d): #lsub1 %4d, #lusup1 %4d right to Pj %2d\n", + iam, kk, msgcnt[0], msgcnt[1], pj); +#endif + } + } + /* stat->time9 += SuperLU_timer_() - ttt1; */ + } else { /* Post Recv of block column L(:,kk). */ + /* double ttt1 = SuperLU_timer_(); */ + if (ToRecv[kk] >= 1) { + scp = &grid->rscp; /* The scope of process row. */ + recv_req = recv_reqs[look_id]; +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + MPI_Irecv (Lsub_buf_2[look_id], Llu->bufmax[0], + mpi_int_t, kcol, SLU_MPI_TAG (0, kk0), /* (4*kk0)%tag_ub */ + scp->comm, &recv_req[0]); + MPI_Irecv (Lval_buf_2[look_id], Llu->bufmax[1], + MPI_FLOAT, kcol, + SLU_MPI_TAG (1, kk0), /* (4*kk0+1)%tag_ub */ + scp->comm, &recv_req[1]); +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_RIGHT] += t2; +#endif + } + /* stat->time10 += SuperLU_timer_() - ttt1; */ + } /* end if mycol == Pc(kk) */ + } /* end if look-ahead in L panels */ + + /* Pre-post irecv for U-row look-ahead */ + krow = PROW (kk, grid); + if (myrow != krow) { + if (ToRecv[kk] == 2) { /* post iRecv block row U(kk,:). */ + scp = &grid->cscp; /* The scope of process column. 
*/ + Usub_buf = Llu->Usub_buf_2[look_id]; + Uval_buf = Llu->Uval_buf_2[look_id]; +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + MPI_Irecv (Usub_buf, Llu->bufmax[2], mpi_int_t, krow, + SLU_MPI_TAG (2, kk0) /* (4*kk0+2)%tag_ub */ , + scp->comm, &recv_reqs_u[look_id][0]); + MPI_Irecv (Uval_buf, Llu->bufmax[3], MPI_FLOAT, krow, + SLU_MPI_TAG (3, kk0) /* (4*kk0+3)%tag_ub */ , + scp->comm, &recv_reqs_u[look_id][1]); +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_DOWN] += t2; +#endif + } + } + + } /* end for each column in look-ahead window for L panels */ + + /* stat->time4 += SuperLU_timer_()-tt1; */ + + /* ================================= * + * ==== look-ahead the U rows === * + * ================================= */ + kk1 = k0; + kk2 = SUPERLU_MIN (k0 + num_look_aheads, nsupers - 1); + for (kk0 = kk1; kk0 < kk2; kk0++) { + kk = perm_c_supno[kk0]; /* order determined from static schedule */ + if (factoredU[kk0] != 1 && look_ahead[kk] < k0) { + /* does not depend on current column k */ + kcol = PCOL (kk, grid); + krow = PROW (kk, grid); + lk = LBj (kk, grid); /* Local block number across row. NOT USED?? -- Sherry */ + + look_id = kk0 % (1 + num_look_aheads); + msgcnt = msgcntsU[look_id]; + recv_req = recv_reqs[look_id]; + + /* ================================================= * + * Check if diagonal block has been received * + * for panel factorization of U in look-ahead window * + * ================================================= */ + + if (mycol == kcol) { /* I own this column panel, no need + to receive L */ + flag0 = flag1 = 1; + msgcnt[0] = msgcnt[1] = -1; /* No need to transfer Lsub, nor Lval */ + } else { /* Check to receive L(:,kk) from the left */ + flag0 = flag1 = 0; + if ( ToRecv[kk] >= 1 ) { +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + if ( recv_req[0] != MPI_REQUEST_NULL ) { + MPI_Test (&recv_req[0], &flag0, &status); + if ( flag0 ) { + MPI_Get_count (&status, mpi_int_t, &msgcnt[0]); + recv_req[0] = MPI_REQUEST_NULL; + } + } else flag0 = 1; + + if ( recv_req[1] != MPI_REQUEST_NULL ) { + MPI_Test (&recv_req[1], &flag1, &status); + if ( flag1 ) { + MPI_Get_count (&status, mpi_int_t, &msgcnt[1]); + recv_req[1] = MPI_REQUEST_NULL; + } + } else flag1 = 1; +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_RIGHT] += t2; +#endif + } else { + msgcnt[0] = 0; + } + } + + if (flag0 && flag1) { /* L(:,kk) is ready */ + /* tt1 = SuperLU_timer_(); */ + scp = &grid->cscp; /* The scope of process column. */ + if (myrow == krow) { + factoredU[kk0] = 1; + /* Parallel triangular solve across process row *krow* -- + U(k,j) = L(k,k) \ A(k,j). */ + double ttt2 = SuperLU_timer_(); +#ifdef _OPENMP +/* #pragma omp parallel */ /* Sherry -- parallel done inside psgstrs2 */ +#endif + { + psgstrs2_omp (kk0, kk, Glu_persist, grid, Llu, + Ublock_info, stat); + } + + pdgstrs2_timer += SuperLU_timer_()-ttt2; + /* stat->time8 += SuperLU_timer_()-ttt2; */ + + /* Multicasts U(kk,:) to process columns. 
*/ + lk = LBi (kk, grid); + usub = Ufstnz_br_ptr[lk]; + uval = Unzval_br_ptr[lk]; + if (usub) { + msgcnt[2] = usub[2]; /* metadata size */ + msgcnt[3] = usub[1]; /* Uval[] size */ + } else { + msgcnt[2] = msgcnt[3] = 0; + } + + if (ToSendD[lk] == YES) { + for (pi = 0; pi < Pr; ++pi) { + if (pi != myrow) { +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + + MPI_Isend (usub, msgcnt[2], mpi_int_t, pi, + SLU_MPI_TAG (2, kk0), /* (4*kk0+2)%tag_ub */ + scp->comm, &send_reqs_u[look_id][pi]); + MPI_Isend (uval, msgcnt[3], MPI_FLOAT, + pi, SLU_MPI_TAG (3, kk0), /* (4*kk0+3)%tag_ub */ + scp->comm, &send_reqs_u[look_id][pi + Pr]); + +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + msg_cnt += 2; + msg_vol += msgcnt[2] * iword + msgcnt[3] * dword; +#endif +#if ( DEBUGlevel>=2 ) + printf ("[%d] Send U(%4d,:) to Pr %2d\n", + iam, k, pi); +#endif + } /* if pi ... */ + } /* for pi ... */ + } /* if ToSendD ... */ + + /* stat->time2 += SuperLU_timer_()-tt1; */ + + } /* end if myrow == krow */ + } /* end if flag0 & flag1 ... */ + } /* end if factoredU[] ... */ + } /* end for kk0 ... */ + + /* ============================================== * + * == start processing the current row of U(k,:) * + * ============================================== */ + knsupc = SuperSize (k); + krow = PROW (k, grid); + kcol = PCOL (k, grid); + + /* tt1 = SuperLU_timer_(); */ + look_id = k0 % (1 + num_look_aheads); + recv_req = recv_reqs[look_id]; + send_req = send_reqs[look_id]; + msgcnt = msgcnts[look_id]; + Usub_buf = Llu->Usub_buf_2[look_id]; + Uval_buf = Llu->Uval_buf_2[look_id]; + + if (mycol == kcol) { + lk = LBj (k, grid); /* Local block number in L */ + +#if ( PROFlevel>=1 ) + TIC(t1); +#endif + for (pj = 0; pj < Pc; ++pj) { + /* Wait for Isend to complete before using lsub/lusup buffer. */ + if (ToSendR[lk][pj] != EMPTY) { + MPI_Wait (&send_req[pj], &status); + MPI_Wait (&send_req[pj + Pc], &status); + } + } +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_RIGHT] += t2; +#endif + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + } else { + if (ToRecv[k] >= 1) { /* Recv block column L(:,k). */ + + scp = &grid->rscp; /* The scope of process row. */ + + /* ============================================= * + * Waiting for L(:,kk) for outer-product uptate * + * if iam in U(kk,:), then the diagonal block * + * did not reach in time for panel factorization * + * of U(k,:). 
* + * ============================================= */ +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + if (recv_req[0] != MPI_REQUEST_NULL) { + MPI_Wait (&recv_req[0], &status); + MPI_Get_count (&status, mpi_int_t, &msgcnt[0]); + recv_req[0] = MPI_REQUEST_NULL; + } else { + msgcnt[0] = msgcntsU[look_id][0]; +#if (DEBUGlevel>=2) + printf("\t[%d] k=%d, look_id=%d, recv_req[0] == MPI_REQUEST_NULL, msgcnt[0] = %d\n", + iam, k, look_id, msgcnt[0]); +#endif + } + + if (recv_req[1] != MPI_REQUEST_NULL) { + MPI_Wait (&recv_req[1], &status); + MPI_Get_count (&status, MPI_FLOAT, &msgcnt[1]); + recv_req[1] = MPI_REQUEST_NULL; + } else { + msgcnt[1] = msgcntsU[look_id][1]; +#if (DEBUGlevel>=2) + printf("\t[%d] k=%d, look_id=%d, recv_req[1] == MPI_REQUEST_NULL, msgcnt[1] = %d\n", + iam, k, look_id, msgcnt[1]); +#endif + } + +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_RIGHT] += t2; +#endif +#if ( DEBUGlevel>=2 ) + printf("[%d] Recv L(:,%4d): #lsub %4d, #lusup %4d from Pc %2d\n", + iam, k, msgcnt[0], msgcnt[1], kcol); + fflush (stdout); +#endif + +#if ( PRNTlevel==3 ) + ++total_msg; + if (!msgcnt[0]) ++zero_msg; +#endif + } else { + msgcnt[0] = 0; + } + + lsub = Lsub_buf_2[look_id]; + lusup = Lval_buf_2[look_id]; + } /* else if mycol = Pc(k) */ + /* stat->time1 += SuperLU_timer_()-tt1; */ + + scp = &grid->cscp; /* The scope of process column. */ + + /* tt1 = SuperLU_timer_(); */ + if (myrow == krow) { /* I own U(k,:) */ + lk = LBi (k, grid); + usub = Ufstnz_br_ptr[lk]; + uval = Unzval_br_ptr[lk]; + + if (factoredU[k0] == -1) { + /* Parallel triangular solve across process row *krow* -- + U(k,j) = L(k,k) \ A(k,j). */ + double ttt2 = SuperLU_timer_(); +#ifdef _OPENMP +/* #pragma omp parallel */ /* Sherry -- parallel done inside psgstrs2 */ +#endif + { + psgstrs2_omp (k0, k, Glu_persist, grid, Llu, + Ublock_info, stat); + } + pdgstrs2_timer += SuperLU_timer_() - ttt2; + + /* Sherry -- need to set factoredU[k0] = 1; ?? */ + + /* Multicasts U(k,:) along process columns. */ + if ( usub ) { + msgcnt[2] = usub[2]; /* metadata size */ + msgcnt[3] = usub[1]; /* Uval[] size */ + } else { + msgcnt[2] = msgcnt[3] = 0; + } + + if (ToSendD[lk] == YES) { + for (pi = 0; pi < Pr; ++pi) { + if (pi != myrow) { /* Matching recv was pre-posted before */ +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + MPI_Send (usub, msgcnt[2], mpi_int_t, pi, + SLU_MPI_TAG (2, k0), /* (4*k0+2)%tag_ub */ + scp->comm); + MPI_Send (uval, msgcnt[3], MPI_FLOAT, pi, + SLU_MPI_TAG (3, k0), /* (4*k0+3)%tag_ub */ + scp->comm); +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_DOWN] += t2; + msg_cnt += 2; + msg_vol += msgcnt[2] * iword + msgcnt[3] * dword; +#endif +#if ( DEBUGlevel>=2 ) + printf ("[%d] Send U(%4d,:) down to Pr %2d\n", iam, k, pi); +#endif + } /* if pi ... */ + } /* for pi ... */ + } /* if ToSendD ... */ + + } else { /* Panel U(k,:) already factorized from previous look-ahead */ + + /* ================================================ * + * Wait for downward sending of U(k,:) to complete * + * for outer-product update. 
* + * ================================================ */ + + if (ToSendD[lk] == YES) { +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + for (pi = 0; pi < Pr; ++pi) { + if (pi != myrow) { + MPI_Wait (&send_reqs_u[look_id][pi], &status); + MPI_Wait (&send_reqs_u[look_id][pi + Pr], &status); + } + } +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_DOWN] += t2; +#endif + } + msgcnt[2] = msgcntsU[look_id][2]; + msgcnt[3] = msgcntsU[look_id][3]; + } + /* stat->time2 += SuperLU_timer_()-tt1; */ + + } else { /* myrow != krow */ + + /* ========================================== * + * Wait for U(k,:) for outer-product updates. * + * ========================================== */ + + if (ToRecv[k] == 2) { /* Recv block row U(k,:). */ +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + MPI_Wait (&recv_reqs_u[look_id][0], &status); + MPI_Get_count (&status, mpi_int_t, &msgcnt[2]); + MPI_Wait (&recv_reqs_u[look_id][1], &status); + MPI_Get_count (&status, MPI_FLOAT, &msgcnt[3]); + +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_DOWN] += t2; +#endif + usub = Usub_buf; + uval = Uval_buf; +#if ( DEBUGlevel>=2 ) + printf ("[%d] Recv U(%4d,:) from Pr %2d\n", iam, k, krow); +#endif +#if ( PRNTlevel==3 ) + ++total_msg; + if (!msgcnt[2]) ++zero_msg; +#endif + } else { + msgcnt[2] = 0; + } + /* stat->time6 += SuperLU_timer_()-tt1; */ + } /* end if myrow == Pr(k) */ + + /* + * Parallel rank-k update; pair up blocks L(i,k) and U(k,j). + * for (j = k+1; k < N; ++k) { + * for (i = k+1; i < N; ++i) + * if ( myrow == PROW( i, grid ) && mycol == PCOL( j, grid ) + * && L(i,k) != 0 && U(k,j) != 0 ) + * A(i,j) = A(i,j) - L(i,k) * U(k,j); + */ + msg0 = msgcnt[0]; + msg2 = msgcnt[2]; + /* tt1 = SuperLU_timer_(); */ + if (msg0 && msg2) { /* L(:,k) and U(k,:) are not empty. */ + nsupr = lsub[1]; /* LDA of lusup. */ + if (myrow == krow) { /* Skip diagonal block L(k,k). */ + lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER + 1]; + luptr0 = knsupc; + nlb = lsub[0] - 1; + } else { + lptr0 = BC_HEADER; + luptr0 = 0; + nlb = lsub[0]; + } + iukp = BR_HEADER; /* Skip header; Pointer to index[] of U(k,:) */ + rukp = 0; /* Pointer to nzval[] of U(k,:) */ + nub = usub[0]; /* Number of blocks in the block row U(k,:) */ + klst = FstBlockC (k + 1); + + /* ------------------------------------------------------------- + Update the look-ahead block columns A(:,k+1:k+num_look_ahead) + ------------------------------------------------------------- */ + iukp0 = iukp; + rukp0 = rukp; + /* reorder the remaining columns in bottome-up */ + /* TAU_STATIC_TIMER_START("LOOK_AHEAD_UPDATE"); */ + for (jj = 0; jj < nub; jj++) { +#ifdef ISORT + iperm_u[jj] = iperm_c_supno[usub[iukp]]; /* Global block number of block U(k,j). */ + perm_u[jj] = jj; +#else + perm_u[2 * jj] = iperm_c_supno[usub[iukp]]; /* Global block number of block U(k,j). */ + perm_u[2 * jj + 1] = jj; +#endif + jb = usub[iukp]; /* Global block number of block U(k,j). */ + nsupc = SuperSize (jb); + iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */ + iukp += nsupc; + } + iukp = iukp0; +#ifdef ISORT + /* iperm_u is sorted based on elimination order; + perm_u reorders the U blocks to match the elimination order. 
*/ + isort (nub, iperm_u, perm_u); +#else + qsort (perm_u, (size_t) nub, 2 * sizeof (int_t), + &superlu_sort_perm); +#endif + +/************************************************************************/ + double ttx =SuperLU_timer_(); + +//#include "slook_ahead_update_v4.c" +#include "slook_ahead_update.c" + + lookaheadupdatetimer += SuperLU_timer_() - ttx; +/************************************************************************/ + + /*ifdef OMP_LOOK_AHEAD */ + /* TAU_STATIC_TIMER_STOP("LOOK_AHEAD_UPDATE"); */ + } /* if L(:,k) and U(k,:) not empty */ + + /* stat->time3 += SuperLU_timer_()-tt1; */ + + /* ================== */ + /* == post receive == */ + /* ================== */ + kk1 = SUPERLU_MIN (k0 + num_look_aheads, nsupers - 1); + for (kk0 = k0 + 1; kk0 <= kk1; kk0++) { + kk = perm_c_supno[kk0]; + kcol = PCOL (kk, grid); + + if (look_ahead[kk] == k0) { + if (mycol != kcol) { + if (ToRecv[kk] >= 1) { + scp = &grid->rscp; /* The scope of process row. */ + + look_id = kk0 % (1 + num_look_aheads); + recv_req = recv_reqs[look_id]; +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + MPI_Irecv (Lsub_buf_2[look_id], Llu->bufmax[0], + mpi_int_t, kcol, SLU_MPI_TAG (0, kk0), /* (4*kk0)%tag_ub */ + scp->comm, &recv_req[0]); + MPI_Irecv (Lval_buf_2[look_id], Llu->bufmax[1], + MPI_FLOAT, kcol, + SLU_MPI_TAG (1, kk0), /* (4*kk0+1)%tag_ub */ + scp->comm, &recv_req[1]); +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_RIGHT] += t2; +#endif + } + } else { + lk = LBj (kk, grid); /* Local block number. */ + lsub1 = Lrowind_bc_ptr[lk]; + lusup1 = Lnzval_bc_ptr[lk]; + if (factored[kk] == -1) { + /* Factor diagonal and subdiagonal blocks and + test for exact singularity. */ + factored[kk] = 0; /* flag column kk as factored */ + double ttt1 = SuperLU_timer_(); + PSGSTRF2 (options, kk0, kk, thresh, + Glu_persist, grid, Llu, U_diag_blk_send_req, + tag_ub, stat, info); + pdgstrf2_timer += SuperLU_timer_() - ttt1; + + /* Process column *kcol+1* multicasts numeric + values of L(:,k+1) to process rows. */ + look_id = kk0 % (1 + num_look_aheads); + send_req = send_reqs[look_id]; + msgcnt = msgcnts[look_id]; + + if (lsub1) { + msgcnt[0] = lsub1[1] + BC_HEADER + lsub1[0] * LB_DESCRIPTOR; + msgcnt[1] = lsub1[1] * SuperSize (kk); + } else { + msgcnt[0] = 0; + msgcnt[1] = 0; + } + + scp = &grid->rscp; /* The scope of process row. */ + for (pj = 0; pj < Pc; ++pj) { + if (ToSendR[lk][pj] != EMPTY) { +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + MPI_Isend (lsub1, msgcnt[0], mpi_int_t, pj, + SLU_MPI_TAG (0, kk0), /* (4*kk0)%tag_ub */ + scp->comm, &send_req[pj]); + MPI_Isend (lusup1, msgcnt[1], MPI_FLOAT, pj, + SLU_MPI_TAG (1, kk0), /* (4*kk0+1)%tag_ub */ + scp->comm, &send_req[pj + Pc]); +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_RIGHT] += t2; + ++prof_sendR[lk]; +#endif + } + } /* end for pj ... */ + } /* if factored[kk] ... */ + } + } + } + + double tsch = SuperLU_timer_(); + + /*******************************************************************/ + +#ifdef GPU_ACC /*-- GPU --*/ + +#include "sSchCompUdt-cuda.c" + +#else + +/*#include "SchCompUdt--Phi-2Ddynamic-alt.c"*/ +//#include "sSchCompUdt-2Ddynamic_v6.c" + +#include "sSchCompUdt-2Ddynamic.c" + +#endif + /*uncomment following to compare against SuperLU 3.3 baseline*/ + /* #include "SchCompUdt--baseline.c" */ + /************************************************************************/ + + NetSchurUpTimer += SuperLU_timer_() - tsch; + + } /* MAIN LOOP for k0 = 0, ... 
*/ + + /* ################################################################## + ** END MAIN LOOP: for k0 = ... + ################################################################## */ + + pxgstrfTimer = SuperLU_timer_() - pxgstrfTimer; + +#if ( PRNTlevel>=2 ) + /* Print detailed statistics */ + /* Updating total flops */ + double allflops; + MPI_Reduce(&RemainGEMM_flops, &allflops, 1, MPI_DOUBLE, MPI_SUM, + 0, grid->comm); + if ( iam==0 ) { + printf("\nInitialization time\t%8.4lf seconds\n" + "\t Serial: compute static schedule, allocate storage\n", InitTimer); + printf("\n==== Time breakdown in factorization (rank 0) ====\n"); + printf("Panel factorization \t %8.4lf seconds\n", + pdgstrf2_timer + pdgstrs2_timer); + printf(".. L-panel pxgstrf2 \t %8.4lf seconds\n", pdgstrf2_timer); + printf(".. U-panel pxgstrs2 \t %8.4lf seconds\n", pdgstrs2_timer); + printf("Time in Look-ahead update \t %8.4lf seconds\n", lookaheadupdatetimer); + printf("Time in Schur update \t\t %8.4lf seconds\n", NetSchurUpTimer); + printf(".. Time to Gather L buffer\t %8.4lf (Separate L panel by Lookahead/Remain)\n", GatherLTimer); + printf(".. Time to Gather U buffer\t %8.4lf \n", GatherUTimer); + + printf(".. Time in GEMM %8.4lf \n", + LookAheadGEMMTimer + RemainGEMMTimer); + printf("\t* Look-ahead\t %8.4lf \n", LookAheadGEMMTimer); + printf("\t* Remain\t %8.4lf\tFlops %8.4le\tGflops %8.4lf\n", + RemainGEMMTimer, allflops, allflops/RemainGEMMTimer*1e-9); + printf(".. Time to Scatter %8.4lf \n", + LookAheadScatterTimer + RemainScatterTimer); + printf("\t* Look-ahead\t %8.4lf \n", LookAheadScatterTimer); + printf("\t* Remain\t %8.4lf \n", RemainScatterTimer); + + printf("Total factorization time \t: %8.4lf seconds, \n", pxgstrfTimer); + printf("--------\n"); + printf("GEMM maximum block: %d-%d-%d\n", gemm_max_m, gemm_max_k, gemm_max_n); + } +#endif + +#if ( DEBUGlevel>=3 ) + for (i = 0; i < Pr * Pc; ++i) { + if (iam == i) { + sPrintLblocks(iam, nsupers, grid, Glu_persist, Llu); + sPrintUblocks(iam, nsupers, grid, Glu_persist, Llu); + printf ("(%d)\n", iam); + PrintInt10 ("Recv", nsupers, Llu->ToRecv); + } + MPI_Barrier (grid->comm); + } +#endif + + /******************************************************** + * Free memory * + ********************************************************/ + + if (Pr * Pc > 1) { + SUPERLU_FREE (Lsub_buf_2[0]); /* also free Lsub_buf_2[1] */ + SUPERLU_FREE (Lval_buf_2[0]); /* also free Lval_buf_2[1] */ + if (Llu->bufmax[2] != 0) + SUPERLU_FREE (Usub_buf_2[0]); + if (Llu->bufmax[3] != 0) + SUPERLU_FREE (Uval_buf_2[0]); + if (U_diag_blk_send_req[myrow] != MPI_REQUEST_NULL) { + /* wait for last Isend requests to complete, deallocate objects */ + for (krow = 0; krow < Pr; ++krow) { + if (krow != myrow) + MPI_Wait (U_diag_blk_send_req + krow, &status); + } + } + SUPERLU_FREE (U_diag_blk_send_req); + } + + log_memory( -((Llu->bufmax[0] + Llu->bufmax[2]) * (num_look_aheads + 1) * iword + + (Llu->bufmax[1] + Llu->bufmax[3]) * (num_look_aheads + 1) * dword), + stat ); + + SUPERLU_FREE (Lsub_buf_2); + SUPERLU_FREE (Lval_buf_2); + SUPERLU_FREE (Usub_buf_2); + SUPERLU_FREE (Uval_buf_2); + SUPERLU_FREE (perm_c_supno); + SUPERLU_FREE (perm_u); +#ifdef ISORT + SUPERLU_FREE (iperm_u); +#endif + SUPERLU_FREE (look_ahead); + SUPERLU_FREE (factoredU); + SUPERLU_FREE (factored); + log_memory(-(6 * nsupers * iword), stat); + + for (i = 0; i <= num_look_aheads; i++) { + SUPERLU_FREE (msgcnts[i]); + SUPERLU_FREE (msgcntsU[i]); + } + SUPERLU_FREE (msgcnts); + SUPERLU_FREE (msgcntsU); + + for (i = 0; i <= num_look_aheads; 
i++) { + SUPERLU_FREE (send_reqs_u[i]); + SUPERLU_FREE (recv_reqs_u[i]); + SUPERLU_FREE (send_reqs[i]); + SUPERLU_FREE (recv_reqs[i]); + } + + SUPERLU_FREE (recv_reqs_u); + SUPERLU_FREE (send_reqs_u); + SUPERLU_FREE (recv_reqs); + SUPERLU_FREE (send_reqs); + +#ifdef GPU_ACC + checkCuda (cudaFreeHost (bigV)); + checkCuda (cudaFreeHost (bigU)); + cudaFree( (void*)dA ); /* Sherry added */ + cudaFree( (void*)dB ); + cudaFree( (void*)dC ); + SUPERLU_FREE( handle ); + SUPERLU_FREE( streams ); + SUPERLU_FREE( stream_end_col ); +#else +// #ifdef __INTEL_COMPILER +// _mm_free (bigU); +// _mm_free (bigV); +// #else + SUPERLU_FREE (bigV); + SUPERLU_FREE (bigU); +// #endif + /* Decrement freed memory from memory stat. */ + log_memory(-(bigv_size + bigu_size) * dword, stat); +#endif + + SUPERLU_FREE (Llu->ujrow); + // SUPERLU_FREE (tempv2d);/* Sherry */ + SUPERLU_FREE (indirect); + SUPERLU_FREE (indirect2); /* Sherry added */ + + ldt = sp_ienv_dist(3); + log_memory( -(3 * ldt *ldt * dword + 2 * ldt * num_threads * iword), stat ); + + /* Sherry added */ + SUPERLU_FREE(omp_loop_time); + SUPERLU_FREE(full_u_cols); + SUPERLU_FREE(blk_ldu); +#if ( PRNTlevel>=1 ) + log_memory(-2 * ncb * dword, stat); +#endif + + SUPERLU_FREE(lookAheadFullRow); + SUPERLU_FREE(lookAheadStRow); + SUPERLU_FREE(lookAhead_lptr); + SUPERLU_FREE(lookAhead_ib); + + SUPERLU_FREE(RemainStRow); + SUPERLU_FREE(Remain_lptr); + SUPERLU_FREE(Remain_ib); + SUPERLU_FREE(Remain_info); + SUPERLU_FREE(lookAhead_L_buff); + SUPERLU_FREE(Remain_L_buff); + log_memory( -(3 * mrb * iword + mrb * sizeof(Remain_info_t) + + ldt * ldt * (num_look_aheads + 1) * dword + + Llu->bufmax[1] * dword), stat ); + + SUPERLU_FREE(Ublock_info); + /*SUPERLU_FREE(Ublock_info_iukp); + SUPERLU_FREE(Ublock_info_rukp); + SUPERLU_FREE(Ublock_info_jb); */ + + +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + + /* Prepare error message - find the smallesr index i that U(i,i)==0 */ + if ( *info == 0 ) *info = n + 1; + MPI_Allreduce (info, &iinfo, 1, MPI_INT, MPI_MIN, grid->comm); + if ( iinfo == n + 1 ) *info = 0; + else *info = iinfo; + +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + { + float msg_vol_max, msg_vol_sum, msg_cnt_max, msg_cnt_sum; + + MPI_Reduce (&msg_cnt, &msg_cnt_sum, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); + MPI_Reduce (&msg_cnt, &msg_cnt_max, + 1, MPI_FLOAT, MPI_MAX, 0, grid->comm); + MPI_Reduce (&msg_vol, &msg_vol_sum, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); + MPI_Reduce (&msg_vol, &msg_vol_max, + 1, MPI_FLOAT, MPI_MAX, 0, grid->comm); + if ( iam==0 ) { + printf ("\tPSGSTRF comm stat:" + "\tAvg\tMax\t\tAvg\tMax\n" + "\t\t\tCount:\t%.0f\t%.0f\tVol(MB)\t%.2f\t%.2f\n", + msg_cnt_sum / Pr / Pc, msg_cnt_max, + msg_vol_sum / Pr / Pc * 1e-6, msg_vol_max * 1e-6); + printf("\t\tcomm time on task 0: %8.2lf\n" + "\t\t\tcomm down DIAG block %8.2lf\n" + "\t\t\tcomm right L panel %8.2lf\n" + "\t\t\tcomm down U panel %8.2lf\n", + stat->utime[COMM], stat->utime[COMM_DIAG], + stat->utime[COMM_RIGHT], stat->utime[COMM_DOWN]); + //#include + //int Digs = DECIMAL_DIG; + printf("gemm_count %d\n", gemm_count); + for (i = 0; i < gemm_count; ++i) + fprintf(fgemm, "%8d%8d%8d\t %20.16e\t%8d\n", gemm_stats[i].m, gemm_stats[i].n, + gemm_stats[i].k, gemm_stats[i].microseconds, prof_sendR[i]); + + fclose(fgemm); + } + SUPERLU_FREE(gemm_stats); + SUPERLU_FREE(prof_sendR); + } +#endif + +#if ( PRNTlevel==3 ) + MPI_Allreduce (&zero_msg, &iinfo, 1, MPI_INT, MPI_SUM, grid->comm); + if (!iam) + printf (".. 
# msg of zero size\t%d\n", iinfo); + MPI_Allreduce (&total_msg, &iinfo, 1, MPI_INT, MPI_SUM, grid->comm); + if (!iam) + printf (".. # total msg\t%d\n", iinfo); +#endif + +#if ( DEBUGlevel>=3 ) + for (i = 0; i < Pr * Pc; ++i) { + if (iam == i) { + sPrintLblocks (iam, nsupers, grid, Glu_persist, Llu); + sPrintUblocks (iam, nsupers, grid, Glu_persist, Llu); + printf ("(%d)\n", iam); + PrintInt10 ("Recv", nsupers, Llu->ToRecv); + } + MPI_Barrier (grid->comm); + } +#endif + +#if ( DEBUGlevel>=3 ) + printf ("(%d) num_copy=%d, num_update=%d\n", iam, num_copy, num_update); +#endif +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit psgstrf()"); +#endif + + return 0; +} /* PSGSTRF */ + diff --git a/SRC/psgstrf2.c b/SRC/psgstrf2.c new file mode 100644 index 00000000..5d0a24c1 --- /dev/null +++ b/SRC/psgstrf2.c @@ -0,0 +1,921 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Performs panel LU factorization. + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * August 15, 2014
+ *
+ * Modified:
+ *   September 30, 2017
+ *   May 10, 2019 version 7.0.0
+ *
+ *
+ * Purpose
+ * =======
+ *   Panel factorization -- block column k
+ *
+ *   Factor diagonal and subdiagonal blocks and test for exact singularity.
+ *   Only the column processes that own block column *k* participate
+ *   in the work.
+ *
+ * Arguments
+ * =========
+ * options (input) superlu_dist_options_t* (global)
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *
+ * k0     (input) int (global)
+ *        Counter of the next supernode to be factorized.
+ *
+ * k      (input) int (global)
+ *        The column number of the block column to be factorized.
+ *
+ * thresh (input) double (global)
+ *        The threshold value = s_eps * anorm.
+ *
+ * Glu_persist (input) Glu_persist_t*
+ *        Global data structures (xsup, supno) replicated on all processes.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * Llu    (input/output) sLocalLU_t*
+ *        Local data structures to store distributed L and U matrices.
+ *
+ * U_diag_blk_send_req (input/output) MPI_Request*
+ *        List of send requests to send down the diagonal block of U.
+ *
+ * tag_ub (input) int
+ *        Upper bound of MPI tag values.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics about the factorization.
+ *        See SuperLUStat_t structure defined in util.h.
+ *
+ * info   (output) int*
+ *        = 0: successful exit
+ *        < 0: if info = -i, the i-th argument had an illegal value
+ *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
+ *             been completed, but the factor U is exactly singular,
+ *             and division by zero will occur if it is used to solve a
+ *             system of equations.
+ *
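For illustration, a minimal standalone sketch of the tiny-pivot replacement
rule described above. It is not part of the patch; the function name and the
tiny_count argument are hypothetical, while the logic mirrors the
ReplaceTinyPivot branch inside the factorization kernels below.

    #include <math.h>

    /* Replace a tiny diagonal entry d, keeping its sign, when
       |d| < thresh = s_eps * anorm; count the replacement as the code
       below does with ++(stat->TinyPivots). */
    static float replace_tiny_pivot(float d, double thresh, int *tiny_count)
    {
        if (fabs(d) < thresh) {
            d = (d < 0.0f) ? (float) -thresh : (float) thresh;
            ++(*tiny_count);
        }
        return d;
    }
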
+ */
+
+#include <math.h>
+#include "superlu_sdefs.h"
+//#include "cblas.h"
+
+/*****************************************************************************
+ * The following psgstrf2_trsm is in version 6 and earlier.
+ *****************************************************************************/
+/*! \brief
+ *
+ *
+ * Purpose
+ * =======
+ *   Panel factorization -- block column k
+ *
+ *   Factor diagonal and subdiagonal blocks and test for exact singularity.
+ *   Only the column processes that own block column *k* participate
+ *   in the work.
+ *
+ * Arguments
+ * =========
+ * options (input) superlu_dist_options_t* (global)
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *
+ * k0     (input) int (global)
+ *        Counter of the next supernode to be factorized.
+ *
+ * k      (input) int (global)
+ *        The column number of the block column to be factorized.
+ *
+ * thresh (input) double (global)
+ *        The threshold value = s_eps * anorm.
+ *
+ * Glu_persist (input) Glu_persist_t*
+ *        Global data structures (xsup, supno) replicated on all processes.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * Llu    (input/output) sLocalLU_t*
+ *        Local data structures to store distributed L and U matrices.
+ *
+ * U_diag_blk_send_req (input/output) MPI_Request*
+ *        List of send requests to send down the diagonal block of U.
+ *
+ * tag_ub (input) int
+ *        Upper bound of MPI tag values.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics about the factorization.
+ *        See SuperLUStat_t structure defined in util.h.
+ *
+ * info   (output) int*
+ *        = 0: successful exit
+ *        < 0: if info = -i, the i-th argument had an illegal value
+ *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
+ *             been completed, but the factor U is exactly singular,
+ *             and division by zero will occur if it is used to solve a
+ *             system of equations.
+ *
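The routine below proceeds in two steps: it factors the nsupc x nsupc
diagonal block U(k,k) in place, then applies one right-sided triangular
solve to the panel rows below it. A dense sketch of that second step
(standalone, not part of the patch; only the Fortran BLAS routine strsm_
is assumed, and the wrapper name is hypothetical):

    /* L21 <- A21 * inv(U11): update the l = nsupr - nsupc subdiagonal
       rows of the panel against the factored diagonal block U11. */
    extern void strsm_(const char*, const char*, const char*, const char*,
                       int*, int*, float*, float*, int*, float*, int*);

    void panel_trsm_sketch(int nsupr, int nsupc,
                           float *ublk_ptr, int ld_ujrow, float *lusup)
    {
        float alpha = 1.0f;
        int l = nsupr - nsupc;   /* rows strictly below the diagonal block */
        strsm_("R", "U", "N", "N", &l, &nsupc, &alpha,
               ublk_ptr, &ld_ujrow, &lusup[nsupc], &nsupr);
    }
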
+ */ +/* This pdgstrf2 is based on TRSM function */ +void +psgstrf2_trsm + (superlu_dist_options_t * options, int_t k0, int_t k, double thresh, + Glu_persist_t * Glu_persist, gridinfo_t * grid, sLocalLU_t * Llu, + MPI_Request * U_diag_blk_send_req, int tag_ub, + SuperLUStat_t * stat, int *info) +{ + /* printf("entering psgstrf2 %d \n", grid->iam); */ + int cols_left, iam, l, pkk, pr; + int incx = 1, incy = 1; + + int nsupr; /* number of rows in the block (LDA) */ + int nsupc; /* number of columns in the block */ + int luptr; + int_t i, myrow, krow, j, jfst, jlst, u_diag_cnt; + int_t *xsup = Glu_persist->xsup; + float *lusup, temp; + float *ujrow, *ublk_ptr; /* pointer to the U block */ + float alpha = -1, zero = 0.0; + int_t Pr; + MPI_Status status; + MPI_Comm comm = (grid->cscp).comm; + double t1, t2; + + /* Initialization. */ + iam = grid->iam; + Pr = grid->nprow; + myrow = MYROW (iam, grid); + krow = PROW (k, grid); + pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid); + j = LBj (k, grid); /* Local block number */ + jfst = FstBlockC (k); + jlst = FstBlockC (k + 1); + lusup = Llu->Lnzval_bc_ptr[j]; + nsupc = SuperSize (k); + if (Llu->Lrowind_bc_ptr[j]) + nsupr = Llu->Lrowind_bc_ptr[j][1]; + else + nsupr = 0; +#ifdef PI_DEBUG + printf ("rank %d Iter %d k=%d \t strsm nsuper %d \n", + iam, k0, k, nsupr); +#endif + ublk_ptr = ujrow = Llu->ujrow; + + luptr = 0; /* Point to the diagonal entries. */ + cols_left = nsupc; /* supernode size */ + int ld_ujrow = nsupc; /* leading dimension of ujrow */ + u_diag_cnt = 0; + incy = ld_ujrow; + + if ( U_diag_blk_send_req && + U_diag_blk_send_req[myrow] != MPI_REQUEST_NULL ) { + /* There are pending sends - wait for all Isend to complete */ +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + for (pr = 0; pr < Pr; ++pr) { + if (pr != myrow) { + MPI_Wait (U_diag_blk_send_req + pr, &status); + } + } +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_DIAG] += t2; +#endif + /* flag no more outstanding send request. */ + U_diag_blk_send_req[myrow] = MPI_REQUEST_NULL; + } + + if (iam == pkk) { /* diagonal process */ + /* ++++ First step compute diagonal block ++++++++++ */ + for (j = 0; j < jlst - jfst; ++j) { /* for each column in panel */ + /* Diagonal pivot */ + i = luptr; + /* May replace zero pivot. */ + if (options->ReplaceTinyPivot == YES ) { + if (fabs (lusup[i]) < thresh) { /* Diagonal */ + +#if ( PRNTlevel>=2 ) + printf ("(%d) .. col %d, tiny pivot %e ", + iam, jfst + j, lusup[i]); +#endif + /* Keep the new diagonal entry with the same sign. */ + if (lusup[i] < 0) lusup[i] = -thresh; + else lusup[i] = thresh; +#if ( PRNTlevel>=2 ) + printf ("replaced by %e\n", lusup[i]); +#endif + ++(stat->TinyPivots); + } + } + +#if 0 + for (l = 0; l < cols_left; ++l, i += nsupr, ++u_diag_cnt) + ublk_ptr[u_diag_cnt] = lusup[i]; /* copy one row of U */ +#endif + + /* storing U in full form */ + int st; + for (l = 0; l < cols_left; ++l, i += nsupr, ++u_diag_cnt) { + st = j * ld_ujrow + j; + ublk_ptr[st + l * ld_ujrow] = lusup[i]; /* copy one row of U */ + } + + if ( ujrow[0] == zero ) { /* Test for singularity. */ + *info = j + jfst + 1; + } else { /* Scale the j-th column within diag. block. */ + temp = 1.0 / ujrow[0]; + for (i = luptr + 1; i < luptr - j + nsupc; ++i) + lusup[i] *= temp; + stat->ops[FACT] += nsupc - j - 1; + } + + /* Rank-1 update of the trailing submatrix within diag. block. 
*/ + if (--cols_left) { + /* l = nsupr - j - 1; */ + l = nsupc - j - 1; /* Piyush */ + sger_ (&l, &cols_left, &alpha, &lusup[luptr + 1], &incx, + &ujrow[ld_ujrow], &incy, &lusup[luptr + nsupr + 1], + &nsupr); + stat->ops[FACT] += 2 * l * cols_left; + } + + /* ujrow = ublk_ptr + u_diag_cnt; */ + ujrow = ujrow + ld_ujrow + 1; /* move to next row of U */ + luptr += nsupr + 1; /* move to next column */ + + } /* for column j ... first loop */ + + /* ++++ Second step compute off-diagonal block with communication ++*/ + + ublk_ptr = ujrow = Llu->ujrow; + + if (U_diag_blk_send_req && iam == pkk) { /* Send the U block downward */ + /** ALWAYS SEND TO ALL OTHERS - TO FIX **/ +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + for (pr = 0; pr < Pr; ++pr) { + if (pr != krow) { + /* tag = ((k0<<2)+2) % tag_ub; */ + /* tag = (4*(nsupers+k0)+2) % tag_ub; */ + MPI_Isend (ublk_ptr, nsupc * nsupc, MPI_FLOAT, pr, + SLU_MPI_TAG (4, k0) /* tag */ , + comm, U_diag_blk_send_req + pr); + + } + } +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_DIAG] += t2; +#endif + + /* flag outstanding Isend */ + U_diag_blk_send_req[krow] = (MPI_Request) TRUE; /* Sherry */ + } + + /* pragma below would be changed by an MKL call */ + + l = nsupr - nsupc; + // n = nsupc; + float alpha = 1.0; +#ifdef PI_DEBUG + printf ("calling strsm\n"); + printf ("strsm diagonal param 11: %d \n", nsupr); +#endif + +#if defined (USE_VENDOR_BLAS) + strsm_ ("R", "U", "N", "N", &l, &nsupc, + &alpha, ublk_ptr, &ld_ujrow, &lusup[nsupc], &nsupr, + 1, 1, 1, 1); +#else + strsm_ ("R", "U", "N", "N", &l, &nsupc, + &alpha, ublk_ptr, &ld_ujrow, &lusup[nsupc], &nsupr); +#endif + stat->ops[FACT] += (flops_t) nsupc * (nsupc+1) * l; + } else { /* non-diagonal process */ + /* ================================================================== * + * Receive the diagonal block of U for panel factorization of L(:,k). * + * Note: we block for panel factorization of L(:,k), but panel * + * factorization of U(:,k) do not block * + * ================================================================== */ + + /* tag = ((k0<<2)+2) % tag_ub; */ + /* tag = (4*(nsupers+k0)+2) % tag_ub; */ + // printf("hello message receiving%d %d\n",(nsupc*(nsupc+1))>>1,SLU_MPI_TAG(4,k0)); +#if ( PROFlevel>=1 ) + TIC (t1); +#endif + MPI_Recv (ublk_ptr, (nsupc * nsupc), MPI_FLOAT, krow, + SLU_MPI_TAG (4, k0) /* tag */ , + comm, &status); +#if ( PROFlevel>=1 ) + TOC (t2, t1); + stat->utime[COMM] += t2; + stat->utime[COMM_DIAG] += t2; +#endif + if (nsupr > 0) { + float alpha = 1.0; + +#ifdef PI_DEBUG + printf ("strsm non diagonal param 11: %d \n", nsupr); + if (!lusup) + printf (" Rank :%d \t Empty block column occurred :\n", iam); +#endif +#if defined (USE_VENDOR_BLAS) + strsm_ ("R", "U", "N", "N", &nsupr, &nsupc, + &alpha, ublk_ptr, &ld_ujrow, lusup, &nsupr, 1, 1, 1, 1); +#else + strsm_ ("R", "U", "N", "N", &nsupr, &nsupc, + &alpha, ublk_ptr, &ld_ujrow, lusup, &nsupr); +#endif + stat->ops[FACT] += (flops_t) nsupc * (nsupc+1) * nsupr; + } + + } /* end if pkk ... */ + + /* printf("exiting psgstrf2 %d \n", grid->iam); */ + +} /* PSGSTRF2_trsm */ + + + +/***************************************************************************** + * The following functions are for the new pdgstrf2_strsm in the 3D code. 
+ *****************************************************************************/ +static +int_t LpanelUpdate(int off0, int nsupc, float* ublk_ptr, int ld_ujrow, + float* lusup, int nsupr, SCT_t* SCT) +{ + int_t l = nsupr - off0; + float alpha = 1.0; + double t1 = SuperLU_timer_(); + +#define GT 32 +#pragma omp parallel for + for (int i = 0; i < CEILING(l, GT); ++i) + { + int_t off = i * GT; + int len = SUPERLU_MIN(GT, l - i * GT); + + superlu_strsm("R", "U", "N", "N", len, nsupc, alpha, + ublk_ptr, ld_ujrow, &lusup[off0 + off], nsupr); + + } /* for i = ... */ + + t1 = SuperLU_timer_() - t1; + + SCT->trf2_flops += (double) l * (double) nsupc * (double)nsupc; + SCT->trf2_time += t1; + SCT->L_PanelUpdate_tl += t1; + return 0; +} + +#pragma GCC push_options +#pragma GCC optimize ("O0") +/*factorizes the diagonal block; called from process that owns the (k,k) block*/ +void Local_Sgstrf2(superlu_dist_options_t *options, int_t k, double thresh, + float *BlockUFactor, /*factored U is overwritten here*/ + Glu_persist_t *Glu_persist, gridinfo_t *grid, sLocalLU_t *Llu, + SuperLUStat_t *stat, int *info, SCT_t* SCT) +{ + //double t1 = SuperLU_timer_(); + int_t *xsup = Glu_persist->xsup; + float alpha = -1, zero = 0.0; + + // printf("Entering dgetrf2 %d \n", k); + /* Initialization. */ + int_t lk = LBj (k, grid); /* Local block number */ + int_t jfst = FstBlockC (k); + int_t jlst = FstBlockC (k + 1); + float *lusup = Llu->Lnzval_bc_ptr[lk]; + int nsupc = SuperSize (k); + int nsupr; + if (Llu->Lrowind_bc_ptr[lk]) + nsupr = Llu->Lrowind_bc_ptr[lk][1]; + else + nsupr = 0; + float *ublk_ptr = BlockUFactor; + float *ujrow = BlockUFactor; + int_t luptr = 0; /* Point_t to the diagonal entries. */ + int cols_left = nsupc; /* supernode size */ + int_t u_diag_cnt = 0; + int_t ld_ujrow = nsupc; /* leading dimension of ujrow */ + int incx = 1; + int incy = ld_ujrow; + + for (int_t j = 0; j < jlst - jfst; ++j) /* for each column in panel */ + { + /* Diagonal pivot */ + int_t i = luptr; + /* Not to replace zero pivot. */ + if (options->ReplaceTinyPivot == YES && lusup[i] != 0.0) + { + if (fabs (lusup[i]) < thresh) { /* Diagonal */ + +#if ( PRNTlevel>=2 ) + printf ("(%d) .. col %d, tiny pivot %e ", + iam, jfst + j, lusup[i]); +#endif + /* Keep the new diagonal entry with the same sign. */ + if (lusup[i] < 0) lusup[i] = -thresh; + else lusup[i] = thresh; +#if ( PRNTlevel>=2 ) + printf ("replaced by %e\n", lusup[i]); +#endif + ++(stat->TinyPivots); + } + } + + for (int_t l = 0; l < cols_left; ++l, i += nsupr, ++u_diag_cnt) + { + int_t st = j * ld_ujrow + j; + ublk_ptr[st + l * ld_ujrow] = lusup[i]; /* copy one row of U */ + } + + if (ujrow[0] == zero) /* Test for singularity. */ + { + *info = j + jfst + 1; + } + else /* Scale the j-th column. */ + { + float temp; + temp = 1.0 / ujrow[0]; + for (int_t i = luptr + 1; i < luptr - j + nsupc; ++i) + lusup[i] *= temp; + stat->ops[FACT] += nsupc - j - 1; + } + + /* Rank-1 update of the trailing submatrix. */ + if (--cols_left) + { + /*following must be int*/ + int l = nsupc - j - 1; + + /* Rank-1 update */ + superlu_sger(l, cols_left, alpha, &lusup[luptr + 1], incx, + &ujrow[ld_ujrow], incy, &lusup[luptr + nsupr + 1], nsupr); + stat->ops[FACT] += 2 * l * cols_left; + } + + ujrow = ujrow + ld_ujrow + 1; /* move to next row of U */ + luptr += nsupr + 1; /* move to next column */ + + } /* for column j ... 
first loop */ + + + //int_t thread_id = omp_get_thread_num(); + // SCT->Local_Dgstrf2_Thread_tl[thread_id * CACHE_LINE_SIZE] += (double) ( SuperLU_timer_() - t1); +} + +#pragma GCC pop_options +/************************************************************************/ +/*! \brief + * + *
+ * Purpose
+ * =======
+ *   Panel factorization -- block column k
+ *
+ *   Factor diagonal and subdiagonal blocks and test for exact singularity.
+ *   Only the column processes that own block column *k* participate
+ *   in the work.
+ *
+ * Arguments
+ * =========
+ * options (input) superlu_dist_options_t* (global)
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *
+ * nsupers (input) int_t (global)
+ *         Number of supernodes.
+ *
+ * k0     (input) int (global)
+ *        Counter of the next supernode to be factorized.
+ *
+ * k      (input) int (global)
+ *        The column number of the block column to be factorized.
+ *
+ * thresh (input) double (global)
+ *        The threshold value = s_eps * anorm.
+ *
+ * Glu_persist (input) Glu_persist_t*
+ *        Global data structures (xsup, supno) replicated on all processes.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * Llu    (input/output) sLocalLU_t*
+ *        Local data structures to store distributed L and U matrices.
+ *
+ * U_diag_blk_send_req (input/output) MPI_Request*
+ *        List of send requests to send down the diagonal block of U.
+ *
+ * tag_ub (input) int
+ *        Upper bound of MPI tag values.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics about the factorization.
+ *        See SuperLUStat_t structure defined in util.h.
+ *
+ * info   (output) int*
+ *        = 0: successful exit
+ *        < 0: if info = -i, the i-th argument had an illegal value
+ *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
+ *             been completed, but the factor U is exactly singular,
+ *             and division by zero will occur if it is used to solve a
+ *             system of equations.
+ *
+ * SCT    (output) SCT_t*
+ *        Additional statistics used in the 3D algorithm.
+ *
+ *
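In the 3D variant the panel update is threaded: LpanelUpdate below cuts the
l = nsupr - off0 panel rows into slabs of GT = 32 rows and gives each slab
its own triangular solve. A self-contained sketch of that scheme, assuming
only OpenMP and a Fortran BLAS strsm_ (the wrapper name is hypothetical):

    #include <omp.h>

    #define GT 32   /* row-slab size, as in LpanelUpdate */

    extern void strsm_(const char*, const char*, const char*, const char*,
                       int*, int*, float*, float*, int*, float*, int*);

    void lpanel_update_sketch(int off0, int nsupc, float *ublk_ptr,
                              int ld_ujrow, float *lusup, int nsupr)
    {
        int l = nsupr - off0;
        float alpha = 1.0f;
    #pragma omp parallel for
        for (int i = 0; i < (l + GT - 1) / GT; ++i) {
            int off = i * GT;                        /* slab start row */
            int len = (l - off < GT) ? (l - off) : GT;
            strsm_("R", "U", "N", "N", &len, &nsupc, &alpha,
                   ublk_ptr, &ld_ujrow, &lusup[off0 + off], &nsupr);
        }
    }

Each slab touches a disjoint row range of lusup, so the iterations are
independent and the loop parallelizes without synchronization.
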
+ */ +void psgstrf2_xtrsm +(superlu_dist_options_t *options, int_t nsupers, + int_t k0, int_t k, double thresh, Glu_persist_t *Glu_persist, + gridinfo_t *grid, sLocalLU_t *Llu, MPI_Request *U_diag_blk_send_req, + int tag_ub, SuperLUStat_t *stat, int *info, SCT_t *SCT) +{ + int cols_left, iam, pkk; + int incy = 1; + + int nsupr; /* number of rows in the block (LDA) */ + int luptr; + int_t myrow, krow, j, jfst, jlst, u_diag_cnt; + int_t nsupc; /* number of columns in the block */ + int_t *xsup = Glu_persist->xsup; + float *lusup; + float *ujrow, *ublk_ptr; /* pointer to the U block */ + int_t Pr; + + /* Quick return. */ + *info = 0; + + /* Initialization. */ + iam = grid->iam; + Pr = grid->nprow; + myrow = MYROW (iam, grid); + krow = PROW (k, grid); + pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid); + j = LBj (k, grid); /* Local block number */ + jfst = FstBlockC (k); + jlst = FstBlockC (k + 1); + lusup = Llu->Lnzval_bc_ptr[j]; + nsupc = SuperSize (k); + if (Llu->Lrowind_bc_ptr[j]) + nsupr = Llu->Lrowind_bc_ptr[j][1]; + else + nsupr = 0; + ublk_ptr = ujrow = Llu->ujrow; + + luptr = 0; /* Point to the diagonal entries. */ + cols_left = nsupc; /* supernode size */ + int ld_ujrow = nsupc; /* leading dimension of ujrow */ + u_diag_cnt = 0; + incy = ld_ujrow; + + if (U_diag_blk_send_req && U_diag_blk_send_req[myrow]) + { + /* There are pending sends - wait for all Isend to complete */ + Wait_UDiagBlockSend(U_diag_blk_send_req, grid, SCT); + } + + if (iam == pkk) /* diagonal process */ + { + /*factorize the diagonal block*/ + Local_Sgstrf2(options, k, thresh, Llu->ujrow, Glu_persist, + grid, Llu, stat, info, SCT); + ublk_ptr = ujrow = Llu->ujrow; + + if (U_diag_blk_send_req && iam == pkk) /* Send the U block */ + { + sISend_UDiagBlock(k0, ublk_ptr, nsupc * nsupc, U_diag_blk_send_req, + grid, tag_ub); + U_diag_blk_send_req[krow] = (MPI_Request) TRUE; /* flag outstanding Isend */ + } + + LpanelUpdate(nsupc, nsupc, ublk_ptr, ld_ujrow, lusup, nsupr, SCT); + } + else /* non-diagonal process */ + { + /* ================================================ * + * Receive the diagonal block of U * + * for panel factorization of L(:,k) * + * note: we block for panel factorization of L(:,k) * + * but panel factorization of U(:,k) don't * + * ================================================ */ + + sRecv_UDiagBlock( k0, ublk_ptr, (nsupc * nsupc), krow, grid, SCT, tag_ub); + + if (nsupr > 0) + { + LpanelUpdate(0, nsupc, ublk_ptr, ld_ujrow, lusup, nsupr, SCT); + } + } /* end if pkk ... */ + +} /* psgstrf2_xtrsm */ + +/***************************************************************************** + * The following functions are for the new psgstrs2_omp in the 3D code. 
+ *****************************************************************************/ + +/* PSGSTRS2 helping kernels*/ + +int_t sTrs2_GatherU(int_t iukp, int_t rukp, int_t klst, + int_t nsupc, int_t ldu, + int_t *usub, + float* uval, float *tempv) +{ + double zero = 0.0; + int_t ncols = 0; + for (int_t jj = iukp; jj < iukp + nsupc; ++jj) + { + int_t segsize = klst - usub[jj]; + if ( segsize ) + { + int_t lead_zero = ldu - segsize; + for (int_t i = 0; i < lead_zero; ++i) tempv[i] = zero; + tempv += lead_zero; + for (int_t i = 0; i < segsize; ++i) + tempv[i] = uval[rukp + i]; + rukp += segsize; + tempv += segsize; + ncols++; + } + } + return ncols; +} + +int_t sTrs2_ScatterU(int_t iukp, int_t rukp, int_t klst, + int_t nsupc, int_t ldu, + int_t *usub, float* uval, float *tempv) +{ + for (int_t jj = 0; jj < nsupc; ++jj) + { + int_t segsize = klst - usub[iukp + jj]; + if (segsize) + { + int_t lead_zero = ldu - segsize; + tempv += lead_zero; + for (int i = 0; i < segsize; ++i) + { + uval[rukp + i] = tempv[i]; + } + tempv += segsize; + rukp += segsize; + } + } /*for jj=0:nsupc */ + return 0; +} + +int_t sTrs2_GatherTrsmScatter(int_t klst, int_t iukp, int_t rukp, + int_t *usub, float *uval, float *tempv, + int_t knsupc, int nsupr, float *lusup, + Glu_persist_t *Glu_persist) /*glupersist for xsup for supersize*/ +{ + float alpha = 1.0; + int_t *xsup = Glu_persist->xsup; + // int_t iukp = Ublock_info.iukp; + // int_t rukp = Ublock_info.rukp; + int_t gb = usub[iukp]; + int_t nsupc = SuperSize (gb); + iukp += UB_DESCRIPTOR; + + // printf("klst inside task%d\n", ); + /*find ldu */ + int ldu = 0; + for (int_t jj = iukp; jj < iukp + nsupc; ++jj) + { + ldu = SUPERLU_MAX( klst - usub[jj], ldu) ; + } + + /*pack U block into a dense Block*/ + int ncols = sTrs2_GatherU(iukp, rukp, klst, nsupc, ldu, usub, + uval, tempv); + + /*now call strsm on packed dense block*/ + int_t luptr = (knsupc - ldu) * (nsupr + 1); + // if(ldu>nsupr) printf("nsupr %d ldu %d\n",nsupr,ldu ); + + superlu_strsm("L", "L", "N", "U", ldu, ncols, alpha, + &lusup[luptr], nsupr, tempv, ldu); + + /*now scatter the output into sparse U block*/ + sTrs2_ScatterU(iukp, rukp, klst, nsupc, ldu, usub, uval, tempv); + + return 0; +} + +/* +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ */ + +#if 1 + +/***************************************************************************** + * The following pdgstrf2_omp is improved for KNL, since Version 5.2.0. + *****************************************************************************/ +void psgstrs2_omp +(int_t k0, int_t k, Glu_persist_t * Glu_persist, gridinfo_t * grid, + sLocalLU_t * Llu, Ublock_info_t *Ublock_info, SuperLUStat_t * stat) +{ +#ifdef PI_DEBUG + printf("====Entering psgstrs2==== \n"); +#endif + int iam, pkk; + int incx = 1; + int nsupr; /* number of rows in the block L(:,k) (LDA) */ + int segsize; + int nsupc; /* number of columns in the block */ + int_t luptr, iukp, rukp; + int_t b, gb, j, klst, knsupc, lk, nb; + int_t *xsup = Glu_persist->xsup; + int_t *usub; + float *lusup, *uval; + +#if 0 + //#ifdef USE_VTUNE + __SSC_MARK(0x111);// start SDE tracing, note uses 2 underscores + __itt_resume(); // start VTune, again use 2 underscores +#endif + + /* Quick return. */ + lk = LBi (k, grid); /* Local block number */ + if (!Llu->Unzval_br_ptr[lk]) return; + + /* Initialization. 
*/ + iam = grid->iam; + pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid); + //int k_row_cycle = k / grid->nprow; /* for which cycle k exist (to assign rowwise thread blocking) */ + //int gb_col_cycle; /* cycle through block columns */ + klst = FstBlockC (k + 1); + knsupc = SuperSize (k); + usub = Llu->Ufstnz_br_ptr[lk]; /* index[] of block row U(k,:) */ + uval = Llu->Unzval_br_ptr[lk]; + if (iam == pkk) { + lk = LBj (k, grid); + nsupr = Llu->Lrowind_bc_ptr[lk][1]; /* LDA of lusup[] */ + lusup = Llu->Lnzval_bc_ptr[lk]; + } else { + nsupr = Llu->Lsub_buf_2[k0 % (1 + stat->num_look_aheads)][1]; /* LDA of lusup[] */ + lusup = Llu->Lval_buf_2[k0 % (1 + stat->num_look_aheads)]; + } + + /////////////////////new-test////////////////////////// + /* !! Taken from Carl/SuperLU_DIST_5.1.0/EXAMPLE/pdgstrf2_v3.c !! */ + + /* Master thread: set up pointers to each block in the row */ + nb = usub[0]; + iukp = BR_HEADER; + rukp = 0; + + /* Sherry: can use the existing Ublock_info[] array, call + Trs2_InitUblock_info(); */ +#undef USE_Ublock_info +#ifdef USE_Ublock_info /** 4/19/2019 **/ + /* Loop through all the row blocks. to get the iukp and rukp*/ + Trs2_InitUblock_info(klst, nb, Ublock_info, usub, Glu_persist, stat ); +#else + int* blocks_index_pointers = SUPERLU_MALLOC (3 * nb * sizeof(int)); + int* blocks_value_pointers = blocks_index_pointers + nb; + int* nsupc_temp = blocks_value_pointers + nb; + for (b = 0; b < nb; b++) { /* set up pointers to each block */ + blocks_index_pointers[b] = iukp + UB_DESCRIPTOR; + blocks_value_pointers[b] = rukp; + gb = usub[iukp]; + rukp += usub[iukp+1]; + nsupc = SuperSize( gb ); + nsupc_temp[b] = nsupc; + iukp += (UB_DESCRIPTOR + nsupc); /* move to the next block */ + } +#endif + + // Sherry: this version is more NUMA friendly compared to pdgstrf2_v2.c + // https://stackoverflow.com/questions/13065943/task-based-programming-pragma-omp-task-versus-pragma-omp-parallel-for +#pragma omp parallel for schedule(static) default(shared) \ + private(b,j,iukp,rukp,segsize) + /* Loop through all the blocks in the row. */ + for (b = 0; b < nb; ++b) { +#ifdef USE_Ublock_info + iukp = Ublock_info[b].iukp; + rukp = Ublock_info[b].rukp; +#else + iukp = blocks_index_pointers[b]; + rukp = blocks_value_pointers[b]; +#endif + + /* Loop through all the segments in the block. */ +#ifdef USE_Ublock_info + gb = usub[iukp]; + nsupc = SuperSize( gb ); + iukp += UB_DESCRIPTOR; + for (j = 0; j < nsupc; j++) { +#else + for (j = 0; j < nsupc_temp[b]; j++) { +#endif + segsize = klst - usub[iukp++]; + if (segsize) { +#ifdef _OPENMP +#pragma omp task default(shared) firstprivate(segsize,rukp) if (segsize > 30) +#endif + { /* Nonzero segment. */ + int_t luptr = (knsupc - segsize) * (nsupr + 1); + //printf("[2] segsize %d, nsupr %d\n", segsize, nsupr); + +#if defined (USE_VENDOR_BLAS) + strsv_ ("L", "N", "U", &segsize, &lusup[luptr], &nsupr, + &uval[rukp], &incx, 1, 1, 1); +#else + strsv_ ("L", "N", "U", &segsize, &lusup[luptr], &nsupr, + &uval[rukp], &incx); +#endif + } /* end task */ + rukp += segsize; +#ifndef USE_Ublock_info + stat->ops[FACT] += segsize * (segsize + 1); +#endif + } /* end if segsize > 0 */ + } /* end for j in parallel ... */ +/* #pragma omp taskwait */ + } /* end for b ... 
*/ + +#ifndef USE_Ublock_info + /* Deallocate memory */ + SUPERLU_FREE(blocks_index_pointers); +#endif + +#if 0 + //#ifdef USE_VTUNE + __itt_pause(); // stop VTune + __SSC_MARK(0x222); // stop SDE tracing +#endif + +} /* psgstrs2_omp */ + +#else /*==== new version from Piyush ====*/ + +void psgstrs2_omp(int_t k0, int_t k, int_t* Lsub_buf, + float *Lval_buf, Glu_persist_t *Glu_persist, + gridinfo_t *grid, sLocalLU_t *Llu, SuperLUStat_t *stat, + Ublock_info_t *Ublock_info, float *bigV, int_t ldt, SCT_t *SCT) +{ + double t1 = SuperLU_timer_(); + int_t *xsup = Glu_persist->xsup; + /* Quick return. */ + int_t lk = LBi (k, grid); /* Local block number */ + + if (!Llu->Unzval_br_ptr[lk]) return; + + /* Initialization. */ + int_t klst = FstBlockC (k + 1); + int_t knsupc = SuperSize (k); + int_t *usub = Llu->Ufstnz_br_ptr[lk]; /* index[] of block row U(k,:) */ + float *uval = Llu->Unzval_br_ptr[lk]; + int_t nb = usub[0]; + + int_t nsupr = Lsub_buf[1]; /* LDA of lusup[] */ + float *lusup = Lval_buf; + + /* Loop through all the row blocks. to get the iukp and rukp*/ + Trs2_InitUbloc_info(klst, nb, Ublock_info, usub, Glu_persist, stat ); + + /* Loop through all the row blocks. */ +#pragma omp parallel for schedule(dynamic,2) + for (int_t b = 0; b < nb; ++b) + { +#ifdef _OPENMP + int_t thread_id = omp_get_thread_num(); +#else + int_t thread_id = 0; +#endif + float *tempv = bigV + thread_id * ldt * ldt; + sTrs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp, + usub, uval, tempv, knsupc, nsupr, lusup, Glu_persist); + } /* for b ... */ + + SCT->PDGSTRS2_tl += (double) ( SuperLU_timer_() - t1); +} /* pdgstrs2_omp new version from Piyush */ + +#endif /* there are 2 versions of psgstrs2_omp */ diff --git a/SRC/psgstrf3d.c b/SRC/psgstrf3d.c new file mode 100644 index 00000000..f071d593 --- /dev/null +++ b/SRC/psgstrf3d.c @@ -0,0 +1,375 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Performs LU factorization in 3D process grid. + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ */
+
+#include "superlu_sdefs.h"
+#if 0
+#include "pdgstrf3d.h"
+#include "trfCommWrapper.h"
+#include "trfAux.h"
+//#include "load-balance/supernodal_etree.h"
+//#include "load-balance/supernodalForest.h"
+#include "supernodal_etree.h"
+#include "supernodalForest.h"
+#include "p3dcomm.h"
+#include "treeFactorization.h"
+#include "ancFactorization.h"
+#include "xtrf3Dpartition.h"
+#endif
+
+#ifdef MAP_PROFILE
+#include "mapsampler_api.h"
+#endif
+
+#ifdef GPU_ACC
+#include "slustruct_gpu.h"
+//#include "acc_aux.c"  //no need anymore
+#endif
+
+
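The driver documented next runs on a 3D process grid. A minimal sketch of
setting one up (the grid dimensions are arbitrary examples; the entry point
superlu_gridinit3d is the routine named SUPERLU_GRIDINIT3D in the
documentation below, and the helper name is hypothetical):

    #include "superlu_sdefs.h"

    /* Create a 2 x 2 x 2 process grid (8 MPI ranks) for psgstrf3d. */
    int make_grid3d_example(gridinfo3d_t *grid3d)
    {
        int nprow = 2, npcol = 2, npdep = 2;  /* example dimensions */
        superlu_gridinit3d(MPI_COMM_WORLD, nprow, npcol, npdep, grid3d);
        /* ranks beyond nprow*npcol*npdep are expected to see iam == -1,
           as with the 2D grid API */
        return (grid3d->iam == -1) ? -1 : 0;
    }
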
+/*! \brief
+ *
+ *
+ * Purpose
+ * =======
+ *
+ * PSGSTRF3D performs the LU factorization in parallel using 3D process grid,
+ * which is a communication-avoiding algorithm compared to the 2D algorithm.
+ *
+ * Arguments
+ * =========
+ *
+ * options (input) superlu_dist_options_t*
+ *         The structure defines the input parameters to control
+ *         how the LU decomposition will be performed.
+ *         The following field should be defined:
+ *         o ReplaceTinyPivot (yes_no_t)
+ *           Specifies whether to replace the tiny diagonals by
+ *           sqrt(epsilon)*norm(A) during LU factorization.
+ *
+ * m      (input) int
+ *        Number of rows in the matrix.
+ *
+ * n      (input) int
+ *        Number of columns in the matrix.
+ *
+ * anorm  (input) float
+ *        The norm of the original matrix A, or the scaled A if
+ *        equilibration was done.
+ *
+ * trf3Dpartition (input) trf3Dpartition_t*
+ *        Matrix partitioning information in 3D process grid.
+ *
+ * SCT    (input/output) SCT_t*
+ *        Various statistics of 3D factorization.
+ *
+ * LUstruct (input/output) sLUstruct_t*
+ *         The data structures to store the distributed L and U factors.
+ *         The following fields should be defined:
+ *
+ *         o Glu_persist (input) Glu_persist_t*
+ *           Global data structure (xsup, supno) replicated on all processes,
+ *           describing the supernode partition in the factored matrices
+ *           L and U:
+ *           xsup[s] is the leading column of the s-th supernode,
+ *           supno[i] is the supernode number to which column i belongs.
+ *
+ *         o Llu (input/output) sLocalLU_t*
+ *           The distributed data structures to store L and U factors.
+ *           See superlu_sdefs.h for the definition of 'sLocalLU_t'.
+ *
+ * grid3d (input) gridinfo3d_t*
+ *        The 3D process mesh. It contains the MPI communicator, the number
+ *        of process rows (NPROW), the number of process columns (NPCOL),
+ *        and replication factor in Z-dimension. It is an input argument to all
+ *        the 3D parallel routines.
+ *        Grid3d can be initialized by subroutine SUPERLU_GRIDINIT3D.
+ *        See superlu_defs.h for the definition of 'gridinfo3d_t'.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics on runtime and floating-point operation count.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info   (output) int*
+ *        = 0: successful exit
+ *        < 0: if info = -i, the i-th argument had an illegal value
+ *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
+ *             been completed, but the factor U is exactly singular,
+ *             and division by zero will occur if it is used to solve a
+ *             system of equations.
+ *
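The body of the routine walks an elimination forest level by level: each
process factors its own subtree with the 2D kernels, then folds its Schur
updates onto the ancestor level. The number of levels is set by the
Z-dimension of the grid as maxLvl = log2i(grid3d->zscp.Np) + 1. A tiny
standalone sketch of that level count (the function name is hypothetical;
npdep is assumed to be a power of two):

    /* Levels in the elimination forest for a Z-dimension of npdep ranks;
       mirrors maxLvl = log2i(grid3d->zscp.Np) + 1 in the driver below. */
    static int forest_levels(int npdep)
    {
        int lvl = 1;
        while (npdep > 1) { npdep >>= 1; ++lvl; }
        return lvl;   /* e.g. npdep = 4 gives 3 levels */
    }
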
+ */ +int_t psgstrf3d(superlu_dist_options_t *options, int m, int n, float anorm, + trf3Dpartition_t* trf3Dpartition, SCT_t *SCT, + sLUstruct_t *LUstruct, gridinfo3d_t * grid3d, + SuperLUStat_t *stat, int *info) +{ + gridinfo_t* grid = &(grid3d->grid2d); + sLocalLU_t *Llu = LUstruct->Llu; + + // problem specific contants + int_t ldt = sp_ienv_dist (3); /* Size of maximum supernode */ + // double s_eps = slamch_ ("Epsilon"); -Sherry + double s_eps = smach_dist("Epsilon"); + double thresh = s_eps * anorm; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (grid3d->iam, "Enter psgstrf3d()"); +#endif + + // Initilize stat + stat->ops[FACT] = 0; + stat->current_buffer = 0.0; + stat->peak_buffer = 0.0; + stat->gpu_buffer = 0.0; + //if (!grid3d->zscp.Iam && !grid3d->iam) printf("Using NSUP=%d\n", (int) ldt); + + //getting Nsupers + int_t nsupers = getNsupers(n, LUstruct->Glu_persist); + + // Grid related Variables + int_t iam = grid->iam; // in 2D grid + int num_threads = getNumThreads(grid3d->iam); + + factStat_t factStat; + initFactStat(nsupers, &factStat); + +#if 0 // sherry: not used + sdiagFactBufs_t dFBuf; + sinitDiagFactBufs(ldt, &dFBuf); + + commRequests_t comReqs; + initCommRequests(&comReqs, grid); + + msgs_t msgs; + initMsgs(&msgs); +#endif + + SCT->tStartup = SuperLU_timer_(); + packLUInfo_t packLUInfo; + initPackLUInfo(nsupers, &packLUInfo); + + sscuBufs_t scuBufs; + sinitScuBufs(ldt, num_threads, nsupers, &scuBufs, LUstruct, grid); + + factNodelists_t fNlists; + initFactNodelists( ldt, num_threads, nsupers, &fNlists); + + // tag_ub initialization + int tag_ub = set_tag_ub(); + int_t maxLvl = log2i(grid3d->zscp.Np) + 1; + +#if ( PRNTlevel>=1 ) + if (grid3d->iam == 0) { + printf ("MPI tag upper bound = %d\n", tag_ub); fflush(stdout); + } +#endif + + // trf3Dpartition_t* trf3Dpartition = initTrf3Dpartition(nsupers, options, LUstruct, grid3d); + gEtreeInfo_t gEtreeInfo = trf3Dpartition->gEtreeInfo; + int_t* iperm_c_supno = trf3Dpartition->iperm_c_supno; + int_t* myNodeCount = trf3Dpartition->myNodeCount; + int_t* myTreeIdxs = trf3Dpartition->myTreeIdxs; + int_t* myZeroTrIdxs = trf3Dpartition->myZeroTrIdxs; + sForest_t** sForests = trf3Dpartition->sForests; + int_t** treePerm = trf3Dpartition->treePerm ; + sLUValSubBuf_t *LUvsb = trf3Dpartition->LUvsb; + + /* Initializing factorization specific buffers */ + + int_t numLA = getNumLookAhead(options); + sLUValSubBuf_t** LUvsbs = sLluBufInitArr( SUPERLU_MAX( numLA, grid3d->zscp.Np ), LUstruct); + msgs_t**msgss = initMsgsArr(numLA); + int_t mxLeafNode = 0; + for (int ilvl = 0; ilvl < maxLvl; ++ilvl) { + if (sForests[myTreeIdxs[ilvl]] && sForests[myTreeIdxs[ilvl]]->topoInfo.eTreeTopLims[1] > mxLeafNode ) + mxLeafNode = sForests[myTreeIdxs[ilvl]]->topoInfo.eTreeTopLims[1]; + } + sdiagFactBufs_t** dFBufs = sinitDiagFactBufsArr(mxLeafNode, ldt, grid); + commRequests_t** comReqss = initCommRequestsArr(SUPERLU_MAX(mxLeafNode, numLA), ldt, grid); + + /* Setting up GPU related data structures */ + + int_t first_l_block_acc = 0; + int_t first_u_block_acc = 0; + int_t Pc = grid->npcol; + int_t Pr = grid->nprow; + int_t mrb = (nsupers + Pr - 1) / Pr; + int_t mcb = (nsupers + Pc - 1) / Pc; + HyP_t *HyP = (HyP_t *) SUPERLU_MALLOC(sizeof(HyP_t)); + + sInit_HyP(HyP, Llu, mcb, mrb); + HyP->first_l_block_acc = first_l_block_acc; + HyP->first_u_block_acc = first_u_block_acc; + + int superlu_acc_offload = HyP->superlu_acc_offload; + + //int_t bigu_size = getBigUSize(nsupers, grid, LUstruct); + int_t bigu_size = getBigUSize(nsupers, grid, + LUstruct->Llu->Lrowind_bc_ptr); 
+ HyP->bigu_size = bigu_size; + int_t buffer_size = sp_ienv_dist(8); // get_max_buffer_size (); + HyP->buffer_size = buffer_size; + HyP->nsupers = nsupers; + +#ifdef GPU_ACC + + /*Now initialize the GPU data structure*/ + sLUstruct_gpu_t *A_gpu, *dA_gpu; + + d2Hreduce_t d2HredObj; + d2Hreduce_t* d2Hred = &d2HredObj; + ssluGPU_t sluGPUobj; + ssluGPU_t *sluGPU = &sluGPUobj; + sluGPU->isNodeInMyGrid = getIsNodeInMyGrid(nsupers, maxLvl, myNodeCount, treePerm); + if (superlu_acc_offload) + { +#if 0 /* Sherry: For GPU code on titan, we do not need performance + lookup tables since due to difference in CPU-GPU performance, + it didn't make much sense to do any Schur-complement update + on CPU, except for the lookahead-update on CPU. Same should + hold for summit as well. (from Piyush) */ + + /*Initilize the lookup tables */ + LookUpTableInit(iam); + acc_async_cost = get_acc_async_cost(); +#ifdef GPU_DEBUG + if (!iam) printf("Using MIC async cost of %lf \n", acc_async_cost); +#endif +#endif + + //OLD: int_t* perm_c_supno = getPerm_c_supno(nsupers, options, LUstruct, grid); + int_t* perm_c_supno = getPerm_c_supno(nsupers, options, + LUstruct->etree, + LUstruct->Glu_persist, + LUstruct->Llu->Lrowind_bc_ptr, + LUstruct->Llu->Ufstnz_br_ptr, + grid); + + /* Initialize GPU data structures */ + sinitSluGPU3D_t(sluGPU, LUstruct, grid3d, perm_c_supno, + n, buffer_size, bigu_size, ldt); + + HyP->first_u_block_acc = sluGPU->A_gpu->first_u_block_gpu; + HyP->first_l_block_acc = sluGPU->A_gpu->first_l_block_gpu; + HyP->nCudaStreams = sluGPU->nCudaStreams; + } + +#endif // end GPU_ACC + + /*==== starting main factorization loop =====*/ + MPI_Barrier( grid3d->comm); + SCT->tStartup = SuperLU_timer_() - SCT->tStartup; + // int_t myGrid = grid3d->zscp.Iam; + +#ifdef ITAC_PROF + VT_traceon(); +#endif +#ifdef MAP_PROFILE + allinea_start_sampling(); +#endif + SCT->pdgstrfTimer = SuperLU_timer_(); + + for (int ilvl = 0; ilvl < maxLvl; ++ilvl) + { + /* if I participate in this level */ + if (!myZeroTrIdxs[ilvl]) + { + //int_t tree = myTreeIdxs[ilvl]; + + sForest_t* sforest = sForests[myTreeIdxs[ilvl]]; + + /* main loop over all the supernodes */ + if (sforest) /* 2D factorization at individual subtree */ + { + double tilvl = SuperLU_timer_(); +#ifdef GPU_ACC + ssparseTreeFactor_ASYNC_GPU( + sforest, + comReqss, &scuBufs, &packLUInfo, + msgss, LUvsbs, dFBufs, &factStat, &fNlists, + &gEtreeInfo, options, iperm_c_supno, ldt, + sluGPU, d2Hred, HyP, LUstruct, grid3d, stat, + thresh, SCT, tag_ub, info); +#else + ssparseTreeFactor_ASYNC(sforest, comReqss, &scuBufs, &packLUInfo, + msgss, LUvsbs, dFBufs, &factStat, &fNlists, + &gEtreeInfo, options, iperm_c_supno, ldt, + HyP, LUstruct, grid3d, stat, + thresh, SCT, tag_ub, info ); +#endif + + /*now reduce the updates*/ + SCT->tFactor3D[ilvl] = SuperLU_timer_() - tilvl; + sForests[myTreeIdxs[ilvl]]->cost = SCT->tFactor3D[ilvl]; + } + + if (ilvl < maxLvl - 1) /*then reduce before factorization*/ + { +#ifdef GPU_ACC + sreduceAllAncestors3d_GPU( + ilvl, myNodeCount, treePerm, LUvsb, + LUstruct, grid3d, sluGPU, d2Hred, &factStat, HyP, + SCT ); +#else + + sreduceAllAncestors3d(ilvl, myNodeCount, treePerm, + LUvsb, LUstruct, grid3d, SCT ); +#endif + + } + } /*if (!myZeroTrIdxs[ilvl]) ... If I participate in this level*/ + + SCT->tSchCompUdt3d[ilvl] = ilvl == 0 ? 
SCT->NetSchurUpTimer + : SCT->NetSchurUpTimer - SCT->tSchCompUdt3d[ilvl - 1]; + } /*for (int_t ilvl = 0; ilvl < maxLvl; ++ilvl)*/ + + MPI_Barrier( grid3d->comm); + SCT->pdgstrfTimer = SuperLU_timer_() - SCT->pdgstrfTimer; + +#ifdef ITAC_PROF + VT_traceoff(); +#endif + +#ifdef MAP_PROFILE + allinea_stop_sampling(); +#endif + + reduceStat(FACT, stat, grid3d); + + // sherry added + /* Deallocate factorization specific buffers */ + freePackLUInfo(&packLUInfo); + sfreeScuBufs(&scuBufs); + freeFactStat(&factStat); + freeFactNodelists(&fNlists); + freeMsgsArr(numLA, msgss); + freeCommRequestsArr(SUPERLU_MAX(mxLeafNode, numLA), comReqss); + sLluBufFreeArr(numLA, LUvsbs); + sfreeDiagFactBufsArr(mxLeafNode, dFBufs); + Free_HyP(HyP); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (grid3d->iam, "Exit psgstrf3d()"); +#endif + return 0; + +} /* psgstrf3d */ diff --git a/SRC/psgstrs.c b/SRC/psgstrs.c new file mode 100644 index 00000000..3658bd6c --- /dev/null +++ b/SRC/psgstrs.c @@ -0,0 +1,2400 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Solves a system of distributed linear equations A*X = B with a + * general N-by-N matrix A using the LU factors computed previously. + * + *
+ * -- Distributed SuperLU routine (version 6.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 15, 2008
+ * September 18, 2018  version 6.0
+ * February 8, 2019  version 6.1.1
+ *
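Throughout the solve, the vectors x[] and lsum[] store one block per
supernode, each preceded by a small header (size 2) whose first entry
records the global block number; the algorithm sketch below relies on this
layout. A standalone illustration of the convention (HDR stands in for the
XK_H macro from superlu_defs.h; the function itself is hypothetical):

    enum { HDR = 2 };   /* header size, as XK_H in superlu_defs.h */

    /* Write block k of knsupc rows and nrhs right-hand sides at offset l,
       tagging it with its global block number first. */
    static void write_block(float *x, int l, int k, const float *vals,
                            int knsupc, int nrhs)
    {
        x[l - HDR] = (float) k;   /* block number prepended in the header */
        for (int j = 0; j < nrhs; ++j)
            for (int i = 0; i < knsupc; ++i)
                x[l + i + j * knsupc] = vals[i + j * knsupc];
    }
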
+ */ +#include +#include "superlu_sdefs.h" +#ifndef CACHELINE +#define CACHELINE 64 /* bytes, Xeon Phi KNL, Cori haswell, Edision */ +#endif + +/* + * Sketch of the algorithm for L-solve: + * ======================= + * + * Self-scheduling loop: + * + * while ( not finished ) { .. use message counter to control + * + * reveive a message; + * + * if ( message is Xk ) { + * perform local block modifications into lsum[]; + * lsum[i] -= L_i,k * X[k] + * if all local updates done, Isend lsum[] to diagonal process; + * + * } else if ( message is LSUM ) { .. this must be a diagonal process + * accumulate LSUM; + * if ( all LSUM are received ) { + * perform triangular solve for Xi; + * Isend Xi down to the current process column; + * perform local block modifications into lsum[]; + * } + * } + * } + * + * + * Auxiliary data structures: lsum[] / ilsum (pointer to lsum array) + * ======================= + * + * lsum[] array (local) + * + lsum has "nrhs" columns, row-wise is partitioned by supernodes + * + stored by row blocks, column wise storage within a row block + * + prepend a header recording the global block number. + * + * lsum[] ilsum[nsupers + 1] + * + * ----- + * | | | <- header of size 2 --- + * --------- <--------------------| | + * | | | | | --- + * | | | | | |-----------| | + * | | | | | | --- + * --------- | |-------| | + * | | | <- header | | --- + * --------- <--------| | |----| | + * | | | | | | | --- + * | | | | | | | + * | | | | | | | + * --------- | | + * | | | <- header | | + * --------- <------------| | + * | | | | | | + * | | | | | | + * | | | | | | + * --------- <---------------| + */ + +/*#define ISEND_IRECV*/ + +/* + * Function prototypes + */ +#ifdef _CRAY +fortran void STRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, float*, + float*, int*, float*, int*); +_fcd ftcs1; +_fcd ftcs2; +_fcd ftcs3; +#endif + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *   Re-distribute B on the diagonal processes of the 2D process mesh.
+ *
+ * Note
+ * ====
+ *   This routine can only be called after the routine psgstrs_init(),
+ *   in which the structures of the send and receive buffers are set up.
+ *
+ * Arguments
+ * =========
+ *
+ * B      (input) float*
+ *        The distributed right-hand side matrix of the possibly
+ *        equilibrated system.
+ *
+ * m_loc  (input) int (local)
+ *        The local row dimension of matrix B.
+ *
+ * nrhs   (input) int (global)
+ *        Number of right-hand sides.
+ *
+ * ldb    (input) int (local)
+ *        Leading dimension of matrix B.
+ *
+ * fst_row (input) int (global)
+ *        The row number of B's first row in the global matrix.
+ *
+ * ilsum  (input) int* (global)
+ *        Starting position of each supernode in a full array.
+ *
+ * x      (output) float*
+ *        The solution vector. It is valid only on the diagonal processes.
+ *
+ * ScalePermstruct (input) sScalePermstruct_t*
+ *        The data structure to store the scaling and permutation vectors
+ *        describing the transformations performed to the original matrix A.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * SOLVEstruct (input) sSOLVEstruct_t*
+ *        Contains the information for the communication during the
+ *        solution phase.
+ *
+ * Return value
+ * ============
+ *
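Per local row, the routine applies the row and column permutations, finds
the supernodal block containing the permuted row, and routes the row to
that block's diagonal owner. A sketch of this bookkeeping (standalone
except for the real BlockNum/PROW/PCOL/PNUM macros and types from
superlu_defs.h; the function name is hypothetical):

    #include "superlu_defs.h"

    /* Diagonal process that owns global row l of B after permutation. */
    static int owner_of_row(int_t l, const int_t *perm_r, const int_t *perm_c,
                            gridinfo_t *grid, Glu_persist_t *Glu_persist)
    {
        int_t *supno = Glu_persist->supno;   /* used by BlockNum() */
        int_t irow = perm_c[perm_r[l]];      /* row number in Pc*Pr*B */
        int_t gbi  = BlockNum( irow );       /* global block (supernode) */
        return PNUM( PROW(gbi, grid), PCOL(gbi, grid), grid );
    }
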
+ */ + +int_t +psReDistribute_B_to_X(float *B, int_t m_loc, int nrhs, int_t ldb, + int_t fst_row, int_t *ilsum, float *x, + sScalePermstruct_t *ScalePermstruct, + Glu_persist_t *Glu_persist, + gridinfo_t *grid, sSOLVEstruct_t *SOLVEstruct) +{ + int *SendCnt, *SendCnt_nrhs, *RecvCnt, *RecvCnt_nrhs; + int *sdispls, *sdispls_nrhs, *rdispls, *rdispls_nrhs; + int *ptr_to_ibuf, *ptr_to_dbuf; + int_t *perm_r, *perm_c; /* row and column permutation vectors */ + int_t *send_ibuf, *recv_ibuf; + float *send_dbuf, *recv_dbuf; + int_t *xsup, *supno; + int_t i, ii, irow, gbi, j, jj, k, knsupc, l, lk, nbrow; + int p, procs; + pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm; + MPI_Request req_i, req_d, *req_send, *req_recv; + MPI_Status status, *status_send, *status_recv; + int Nreq_recv, Nreq_send, pp, pps, ppr; + double t; +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(grid->iam, "Enter psReDistribute_B_to_X()"); +#endif + + /* ------------------------------------------------------------ + INITIALIZATION. + ------------------------------------------------------------*/ + perm_r = ScalePermstruct->perm_r; + perm_c = ScalePermstruct->perm_c; + procs = grid->nprow * grid->npcol; + xsup = Glu_persist->xsup; + supno = Glu_persist->supno; + SendCnt = gstrs_comm->B_to_X_SendCnt; + SendCnt_nrhs = gstrs_comm->B_to_X_SendCnt + procs; + RecvCnt = gstrs_comm->B_to_X_SendCnt + 2*procs; + RecvCnt_nrhs = gstrs_comm->B_to_X_SendCnt + 3*procs; + sdispls = gstrs_comm->B_to_X_SendCnt + 4*procs; + sdispls_nrhs = gstrs_comm->B_to_X_SendCnt + 5*procs; + rdispls = gstrs_comm->B_to_X_SendCnt + 6*procs; + rdispls_nrhs = gstrs_comm->B_to_X_SendCnt + 7*procs; + ptr_to_ibuf = gstrs_comm->ptr_to_ibuf; + ptr_to_dbuf = gstrs_comm->ptr_to_dbuf; + + /* ------------------------------------------------------------ + NOW COMMUNICATE THE ACTUAL DATA. + ------------------------------------------------------------*/ + + if(procs==1){ // faster memory copy when procs=1 + +#ifdef _OPENMP +#pragma omp parallel default (shared) +#endif + { +#ifdef _OPENMP +#pragma omp master +#endif + { + // t = SuperLU_timer_(); +#ifdef _OPENMP +#pragma omp taskloop private (i,l,irow,k,j,knsupc) untied +#endif + for (i = 0; i < m_loc; ++i) { + irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*B */ + + k = BlockNum( irow ); + knsupc = SuperSize( k ); + l = X_BLK( k ); + + x[l - XK_H] = k; /* Block number prepended in the header. 
*/ + + irow = irow - FstBlockC(k); /* Relative row number in X-block */ + RHS_ITERATE(j) { + x[l + irow + j*knsupc] = B[i + j*ldb]; + } + } + } + } + }else{ + k = sdispls[procs-1] + SendCnt[procs-1]; /* Total number of sends */ + l = rdispls[procs-1] + RecvCnt[procs-1]; /* Total number of receives */ + if ( !(send_ibuf = intMalloc_dist(k + l)) ) + ABORT("Malloc fails for send_ibuf[]."); + recv_ibuf = send_ibuf + k; + if ( !(send_dbuf = floatMalloc_dist((k + l)* (size_t)nrhs)) ) + ABORT("Malloc fails for send_dbuf[]."); + recv_dbuf = send_dbuf + k * nrhs; + if ( !(req_send = (MPI_Request*) SUPERLU_MALLOC(procs*sizeof(MPI_Request))) ) + ABORT("Malloc fails for req_send[]."); + if ( !(req_recv = (MPI_Request*) SUPERLU_MALLOC(procs*sizeof(MPI_Request))) ) + ABORT("Malloc fails for req_recv[]."); + if ( !(status_send = (MPI_Status*) SUPERLU_MALLOC(procs*sizeof(MPI_Status))) ) + ABORT("Malloc fails for status_send[]."); + if ( !(status_recv = (MPI_Status*) SUPERLU_MALLOC(procs*sizeof(MPI_Status))) ) + ABORT("Malloc fails for status_recv[]."); + + for (p = 0; p < procs; ++p) { + ptr_to_ibuf[p] = sdispls[p]; + ptr_to_dbuf[p] = sdispls[p] * nrhs; + } + + /* Copy the row indices and values to the send buffer. */ + // t = SuperLU_timer_(); + for (i = 0, l = fst_row; i < m_loc; ++i, ++l) { + irow = perm_c[perm_r[l]]; /* Row number in Pc*Pr*B */ + gbi = BlockNum( irow ); + p = PNUM( PROW(gbi,grid), PCOL(gbi,grid), grid ); /* Diagonal process */ + k = ptr_to_ibuf[p]; + send_ibuf[k] = irow; + ++ptr_to_ibuf[p]; + + k = ptr_to_dbuf[p]; + RHS_ITERATE(j) { /* RHS is stored in row major in the buffer. */ + send_dbuf[k++] = B[i + j*ldb]; + } + ptr_to_dbuf[p] += nrhs; + } + + // t = SuperLU_timer_() - t; + // printf(".. copy to send buffer time\t%8.4f\n", t); + +#if 0 + #if 1 + /* Communicate the (permuted) row indices. */ + MPI_Alltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t, + recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm); + /* Communicate the numerical values. */ + MPI_Alltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, MPI_FLOAT, + recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, MPI_FLOAT, + grid->comm); + #else + /* Communicate the (permuted) row indices. */ + MPI_Ialltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t, + recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm, &req_i); + /* Communicate the numerical values. 
*/ + MPI_Ialltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, MPI_FLOAT, + recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, MPI_FLOAT, + grid->comm, &req_d); + MPI_Wait(&req_i,&status); + MPI_Wait(&req_d,&status); + #endif +#endif + MPI_Barrier( grid->comm ); + + + Nreq_send=0; + Nreq_recv=0; + for (pp=0;ppiam+1+pp; + if(pps>=procs)pps-=procs; + if(pps<0)pps+=procs; + ppr = grid->iam-1+pp; + if(ppr>=procs)ppr-=procs; + if(ppr<0)ppr+=procs; + + if(SendCnt[pps]>0){ + MPI_Isend(&send_ibuf[sdispls[pps]], SendCnt[pps], mpi_int_t, pps, 0, grid->comm, + &req_send[Nreq_send] ); + Nreq_send++; + } + if(RecvCnt[ppr]>0){ + MPI_Irecv(&recv_ibuf[rdispls[ppr]], RecvCnt[ppr], mpi_int_t, ppr, 0, grid->comm, + &req_recv[Nreq_recv] ); + Nreq_recv++; + } + } + + + if(Nreq_send>0)MPI_Waitall(Nreq_send,req_send,status_send); + if(Nreq_recv>0)MPI_Waitall(Nreq_recv,req_recv,status_recv); + + + Nreq_send=0; + Nreq_recv=0; + for (pp=0;ppiam+1+pp; + if(pps>=procs)pps-=procs; + if(pps<0)pps+=procs; + ppr = grid->iam-1+pp; + if(ppr>=procs)ppr-=procs; + if(ppr<0)ppr+=procs; + if(SendCnt_nrhs[pps]>0){ + MPI_Isend(&send_dbuf[sdispls_nrhs[pps]], SendCnt_nrhs[pps], MPI_FLOAT, pps, 1, grid->comm, + &req_send[Nreq_send] ); + Nreq_send++; + } + if(RecvCnt_nrhs[ppr]>0){ + MPI_Irecv(&recv_dbuf[rdispls_nrhs[ppr]], RecvCnt_nrhs[ppr], MPI_FLOAT, ppr, 1, grid->comm, + &req_recv[Nreq_recv] ); + Nreq_recv++; + } + } + + if(Nreq_send>0)MPI_Waitall(Nreq_send,req_send,status_send); + if(Nreq_recv>0)MPI_Waitall(Nreq_recv,req_recv,status_recv); + + + /* ------------------------------------------------------------ + Copy buffer into X on the diagonal processes. + ------------------------------------------------------------*/ + + // t = SuperLU_timer_(); + ii = 0; + for (p = 0; p < procs; ++p) { + jj = rdispls_nrhs[p]; + for (i = 0; i < RecvCnt[p]; ++i) { + /* Only the diagonal processes do this; the off-diagonal processes + have 0 RecvCnt. */ + irow = recv_ibuf[ii]; /* The permuted row index. */ + k = BlockNum( irow ); + knsupc = SuperSize( k ); + lk = LBi( k, grid ); /* Local block number. */ + l = X_BLK( lk ); + x[l - XK_H] = k; /* Block number prepended in the header. */ + + irow = irow - FstBlockC(k); /* Relative row number in X-block */ + RHS_ITERATE(j) { + x[l + irow + j*knsupc] = recv_dbuf[jj++]; + } + ++ii; + } + } + + // t = SuperLU_timer_() - t; + // printf(".. copy to x time\t%8.4f\n", t); + + SUPERLU_FREE(send_ibuf); + SUPERLU_FREE(send_dbuf); + SUPERLU_FREE(req_send); + SUPERLU_FREE(req_recv); + SUPERLU_FREE(status_send); + SUPERLU_FREE(status_recv); + } + + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(grid->iam, "Exit psReDistribute_B_to_X()"); +#endif + return 0; +} /* psReDistribute_B_to_X */ + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *   Re-distribute X on the diagonal processes to B distributed on all
+ *   the processes.
+ *
+ * Note
+ * ====
+ *   This routine can only be called after the routine psgstrs_init(),
+ *   in which the structures of the send and receive buffers are set up.
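+ *
+ * Sketch
+ * ======
+ *   The following is illustrative pseudo-code only, not part of the API;
+ *   all names are those used in the routine body below.  Each diagonal
+ *   process routes every row of its local X blocks to the process that
+ *   owns that row of B, through the row_to_proc[] map built by
+ *   psgstrs_init():
+ *
+ *     for each supernode k owned by this diagonal process
+ *       for each row of k with global index irow
+ *         q = row_to_proc[irow];   /* MPI rank owning row irow of B */
+ *         append irow to send_ibuf[] and x(irow,0:nrhs-1) to send_dbuf[];
+ *     pairwise exchange with MPI_Isend/MPI_Irecv + MPI_Waitall;
+ *     unpack: B[irow - fst_row + j*ldb] = recv_dbuf[k++];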
+ * </pre>
+ */ + +int_t +psReDistribute_X_to_B(int_t n, float *B, int_t m_loc, int_t ldb, int_t fst_row, + int_t nrhs, float *x, int_t *ilsum, + sScalePermstruct_t *ScalePermstruct, + Glu_persist_t *Glu_persist, gridinfo_t *grid, + sSOLVEstruct_t *SOLVEstruct) +{ + int_t i, ii, irow, j, jj, k, knsupc, nsupers, l, lk; + int_t *xsup, *supno; + int *SendCnt, *SendCnt_nrhs, *RecvCnt, *RecvCnt_nrhs; + int *sdispls, *rdispls, *sdispls_nrhs, *rdispls_nrhs; + int *ptr_to_ibuf, *ptr_to_dbuf; + int_t *send_ibuf, *recv_ibuf; + float *send_dbuf, *recv_dbuf; + int_t *row_to_proc = SOLVEstruct->row_to_proc; /* row-process mapping */ + pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm; + int iam, p, q, pkk, procs; + int_t num_diag_procs, *diag_procs; + MPI_Request req_i, req_d, *req_send, *req_recv; + MPI_Status status, *status_send, *status_recv; + int Nreq_recv, Nreq_send, pp,pps,ppr; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(grid->iam, "Enter psReDistribute_X_to_B()"); +#endif + + /* ------------------------------------------------------------ + INITIALIZATION. + ------------------------------------------------------------*/ + xsup = Glu_persist->xsup; + supno = Glu_persist->supno; + nsupers = Glu_persist->supno[n-1] + 1; + iam = grid->iam; + procs = grid->nprow * grid->npcol; + + SendCnt = gstrs_comm->X_to_B_SendCnt; + SendCnt_nrhs = gstrs_comm->X_to_B_SendCnt + procs; + RecvCnt = gstrs_comm->X_to_B_SendCnt + 2*procs; + RecvCnt_nrhs = gstrs_comm->X_to_B_SendCnt + 3*procs; + sdispls = gstrs_comm->X_to_B_SendCnt + 4*procs; + sdispls_nrhs = gstrs_comm->X_to_B_SendCnt + 5*procs; + rdispls = gstrs_comm->X_to_B_SendCnt + 6*procs; + rdispls_nrhs = gstrs_comm->X_to_B_SendCnt + 7*procs; + ptr_to_ibuf = gstrs_comm->ptr_to_ibuf; + ptr_to_dbuf = gstrs_comm->ptr_to_dbuf; + + + if(procs==1){ //faster memory copy when procs=1 + +#ifdef _OPENMP +#pragma omp parallel default (shared) +#endif + { +#ifdef _OPENMP +#pragma omp master +#endif + { + // t = SuperLU_timer_(); +#ifdef _OPENMP +#pragma omp taskloop private (k,knsupc,lk,irow,l,i,j) untied +#endif + for (k = 0; k < nsupers; k++) { + knsupc = SuperSize( k ); + lk = LBi( k, grid ); /* Local block number */ + irow = FstBlockC( k ); + l = X_BLK( lk ); + for (i = 0; i < knsupc; ++i) { + RHS_ITERATE(j) { /* RHS is stored in row major in the buffer. */ + B[irow-fst_row +i + j*ldb] = x[l + i + j*knsupc]; + } + } + } + } + } + }else{ + k = sdispls[procs-1] + SendCnt[procs-1]; /* Total number of sends */ + l = rdispls[procs-1] + RecvCnt[procs-1]; /* Total number of receives */ + if ( !(send_ibuf = intMalloc_dist(k + l)) ) + ABORT("Malloc fails for send_ibuf[]."); + recv_ibuf = send_ibuf + k; + if ( !(send_dbuf = floatMalloc_dist((k + l)*nrhs)) ) + ABORT("Malloc fails for send_dbuf[]."); + if ( !(req_send = (MPI_Request*) SUPERLU_MALLOC(procs*sizeof(MPI_Request))) ) + ABORT("Malloc fails for req_send[]."); + if ( !(req_recv = (MPI_Request*) SUPERLU_MALLOC(procs*sizeof(MPI_Request))) ) + ABORT("Malloc fails for req_recv[]."); + if ( !(status_send = (MPI_Status*) SUPERLU_MALLOC(procs*sizeof(MPI_Status))) ) + ABORT("Malloc fails for status_send[]."); + if ( !(status_recv = (MPI_Status*) SUPERLU_MALLOC(procs*sizeof(MPI_Status))) ) + ABORT("Malloc fails for status_recv[]."); + recv_dbuf = send_dbuf + k * nrhs; + for (p = 0; p < procs; ++p) { + ptr_to_ibuf[p] = sdispls[p]; + ptr_to_dbuf[p] = sdispls_nrhs[p]; + } + num_diag_procs = SOLVEstruct->num_diag_procs; + diag_procs = SOLVEstruct->diag_procs; + for (p = 0; p < num_diag_procs; ++p) { /* For all diagonal processes. 
*/
+	pkk = diag_procs[p];
+	if ( iam == pkk ) {
+	    for (k = p; k < nsupers; k += num_diag_procs) {
+		knsupc = SuperSize( k );
+		lk = LBi( k, grid ); /* Local block number */
+		irow = FstBlockC( k );
+		l = X_BLK( lk );
+		for (i = 0; i < knsupc; ++i) {
+		    #if 0
+		    ii = inv_perm_c[irow]; /* Apply X <== Pc'*Y */
+		    #else
+		    ii = irow;
+		    #endif
+		    q = row_to_proc[ii];
+		    jj = ptr_to_ibuf[q];
+		    send_ibuf[jj] = ii;
+		    jj = ptr_to_dbuf[q];
+		    RHS_ITERATE(j) { /* RHS stored in row major in buffer. */
+			send_dbuf[jj++] = x[l + i + j*knsupc];
+		    }
+		    ++ptr_to_ibuf[q];
+		    ptr_to_dbuf[q] += nrhs;
+		    ++irow;
+		}
+	    }
+	}
+    }
+
+    /* ------------------------------------------------------------
+        COMMUNICATE THE (PERMUTED) ROW INDICES AND NUMERICAL VALUES.
+       ------------------------------------------------------------*/
+#if 0
+	#if 1
+	MPI_Alltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t,
+		      recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm);
+	MPI_Alltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, MPI_FLOAT,
+		      recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, MPI_FLOAT,
+		      grid->comm);
+	#else
+	MPI_Ialltoallv(send_ibuf, SendCnt, sdispls, mpi_int_t,
+		       recv_ibuf, RecvCnt, rdispls, mpi_int_t, grid->comm,&req_i);
+	MPI_Ialltoallv(send_dbuf, SendCnt_nrhs, sdispls_nrhs, MPI_FLOAT,
+		       recv_dbuf, RecvCnt_nrhs, rdispls_nrhs, MPI_FLOAT,
+		       grid->comm,&req_d);
+	MPI_Wait(&req_i,&status);
+	MPI_Wait(&req_d,&status);
+	#endif
+#endif
+
+	MPI_Barrier( grid->comm );
+	Nreq_send=0;
+	Nreq_recv=0;
+	for (pp=0;pp<procs;pp++){
+	    pps = grid->iam+1+pp;
+	    if(pps>=procs)pps-=procs;
+	    if(pps<0)pps+=procs;
+	    ppr = grid->iam-1+pp;
+	    if(ppr>=procs)ppr-=procs;
+	    if(ppr<0)ppr+=procs;
+	    if(SendCnt[pps]>0){
+		MPI_Isend(&send_ibuf[sdispls[pps]], SendCnt[pps], mpi_int_t, pps, 0, grid->comm,
+			  &req_send[Nreq_send] );
+		Nreq_send++;
+	    }
+	    if(RecvCnt[ppr]>0){
+		MPI_Irecv(&recv_ibuf[rdispls[ppr]], RecvCnt[ppr], mpi_int_t, ppr, 0, grid->comm,
+			  &req_recv[Nreq_recv] );
+		Nreq_recv++;
+	    }
+	}
+
+
+	if(Nreq_send>0)MPI_Waitall(Nreq_send,req_send,status_send);
+	if(Nreq_recv>0)MPI_Waitall(Nreq_recv,req_recv,status_recv);
+	// MPI_Barrier( grid->comm );
+
+	Nreq_send=0;
+	Nreq_recv=0;
+	for (pp=0;pp<procs;pp++){
+	    pps = grid->iam+1+pp;
+	    if(pps>=procs)pps-=procs;
+	    if(pps<0)pps+=procs;
+	    ppr = grid->iam-1+pp;
+	    if(ppr>=procs)ppr-=procs;
+	    if(ppr<0)ppr+=procs;
+	    if(SendCnt_nrhs[pps]>0){
+		MPI_Isend(&send_dbuf[sdispls_nrhs[pps]], SendCnt_nrhs[pps], MPI_FLOAT, pps, 1, grid->comm,
+			  &req_send[Nreq_send] );
+		Nreq_send++;
+	    }
+	    if(RecvCnt_nrhs[ppr]>0){
+		MPI_Irecv(&recv_dbuf[rdispls_nrhs[ppr]], RecvCnt_nrhs[ppr], MPI_FLOAT, ppr, 1, grid->comm,
+			  &req_recv[Nreq_recv] );
+		Nreq_recv++;
+	    }
+	}
+
+
+	if(Nreq_send>0)MPI_Waitall(Nreq_send,req_send,status_send);
+	if(Nreq_recv>0)MPI_Waitall(Nreq_recv,req_recv,status_recv);
+	// MPI_Barrier( grid->comm );
+
+
+    /* ------------------------------------------------------------
+       COPY THE BUFFER INTO B.
+       ------------------------------------------------------------*/
+    for (i = 0, k = 0; i < m_loc; ++i) {
+	irow = recv_ibuf[i];
+	irow -= fst_row; /* Relative row number */
+	RHS_ITERATE(j) { /* RHS is stored in row major in the buffer. */
+	    B[irow + j*ldb] = recv_dbuf[k++];
+	}
+    }
+
+	SUPERLU_FREE(send_ibuf);
+	SUPERLU_FREE(send_dbuf);
+	SUPERLU_FREE(req_send);
+	SUPERLU_FREE(req_recv);
+	SUPERLU_FREE(status_send);
+	SUPERLU_FREE(status_recv);
+    }
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(grid->iam, "Exit psReDistribute_X_to_B()");
+#endif
+    return 0;
+
+} /* psReDistribute_X_to_B */
+
+
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *   Compute the inverse of the diagonal blocks of the L and U
+ *   triangular matrices.
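+ *
+ * Note
+ * ====
+ *   A sketch of why this helps (all names are from psgstrs() below):
+ *   with Linv = inv(L(k,k)) and Uinv = inv(U(k,k)) precomputed and
+ *   Llu->inv == 1, each small triangular solve of the solve phase,
+ *
+ *     strsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha,
+ *            lusup, &nsupr, &x[ii], &knsupc);
+ *
+ *   is replaced by a matrix-matrix multiply with the inverse,
+ *
+ *     sgemm_("N", "N", &knsupc, &nrhs, &knsupc, &alpha, Linv, &knsupc,
+ *            &x[ii], &knsupc, &beta, rtemp_loc, &knsupc);
+ *
+ *   which threads and vectorizes better.  This path requires LAPACK
+ *   (guarded by SLU_HAVE_LAPACK).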
+ * </pre>
+ */ +void +psCompute_Diag_Inv(int_t n, sLUstruct_t *LUstruct,gridinfo_t *grid, + SuperLUStat_t *stat, int *info) +{ +#ifdef SLU_HAVE_LAPACK + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + sLocalLU_t *Llu = LUstruct->Llu; + + float *lusup; + float *recvbuf, *tempv; + float *Linv;/* Inverse of diagonal block */ + float *Uinv;/* Inverse of diagonal block */ + + int_t kcol, krow, mycol, myrow; + int_t i, ii, il, j, jj, k, lb, ljb, lk, lptr, luptr; + int_t nb, nlb,nlb_nodiag, nub, nsupers; + int_t *xsup, *supno, *lsub, *usub; + int_t *ilsum; /* Starting position of each supernode in lsum (LOCAL)*/ + int Pc, Pr, iam; + int knsupc, nsupr; + int ldalsum; /* Number of lsum entries locally owned. */ + int maxrecvsz, p, pi; + int_t **Lrowind_bc_ptr; + float **Lnzval_bc_ptr; + float **Linv_bc_ptr; + float **Uinv_bc_ptr; + int INFO; + double t; + + float one = 1.0; + float zero = 0.0; + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + +#if ( PRNTlevel>=2 ) + if ( grid->iam==0 ) { + printf("computing inverse of diagonal blocks...\n"); + fflush(stdout); + } +#endif + + /* + * Initialization. + */ + iam = grid->iam; + Pc = grid->npcol; + Pr = grid->nprow; + myrow = MYROW( iam, grid ); + mycol = MYCOL( iam, grid ); + xsup = Glu_persist->xsup; + supno = Glu_persist->supno; + nsupers = supno[n-1] + 1; + Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + Linv_bc_ptr = Llu->Linv_bc_ptr; + Uinv_bc_ptr = Llu->Uinv_bc_ptr; + Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */ + + Llu->inv = 1; + + /*--------------------------------------------------- + * Compute inverse of L(lk,lk). + *---------------------------------------------------*/ + + for (k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + if ( myrow == krow ) { + lk = LBi( k, grid ); /* local block number */ + kcol = PCOL( k, grid ); + if ( mycol == kcol ) { /* diagonal process */ + + lk = LBj( k, grid ); /* Local block number, column-wise. */ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + Linv = Linv_bc_ptr[lk]; + Uinv = Uinv_bc_ptr[lk]; + nsupr = lsub[1]; + knsupc = SuperSize( k ); + + for (j=0 ; j=1 ) + if( grid->iam==0 ) { + t = SuperLU_timer_() - t; + printf(".. L-diag_inv time\t%10.5f\n", t); + fflush(stdout); + } +#endif + + return; +#endif /* SLU_HAVE_LAPACK */ +} + + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * PSGSTRS solves a system of distributed linear equations
+ * A*X = B with a general N-by-N matrix A using the LU factorization
+ * computed by PSGSTRF.
+ * If equilibration and the row and column permutations were performed,
+ * the LU factorization was performed for A1 where
+ *     A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U
+ * and the linear system solved is
+ *     A1 * Y = Pc*Pr*B1, where B was overwritten by B1 = diag(R)*B, and
+ * the permutation to B1 by Pc*Pr is applied internally in this routine.
+ *
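+ * Consequently the solution of the original system is recovered by the
+ * caller as X = diag(C)*Pc^T*Y: substituting Y = Pc*diag(C)^(-1)*X into
+ *     A1 * Y = Pc*Pr*diag(R)*A*diag(C)*Pc^T * Y = Pc*Pr*diag(R)*A*X
+ * and equating with Pc*Pr*B1 = Pc*Pr*diag(R)*B gives back A*X = B.
+ *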
+ * Arguments
+ * =========
+ *
+ * n      (input) int (global)
+ *        The order of the system of linear equations.
+ *
+ * LUstruct (input) sLUstruct_t*
+ *        The distributed data structures storing L and U factors.
+ *        The L and U factors are obtained from PSGSTRF for
+ *        the possibly scaled and permuted matrix A.
+ *        See superlu_sdefs.h for the definition of 'sLUstruct_t'.
+ *        A may be scaled and permuted into A1, so that
+ *        A1 = Pc*Pr*diag(R)*A*diag(C)*Pc^T = L*U
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh. It contains the MPI communicator, the number
+ *        of process rows (NPROW), the number of process columns (NPCOL),
+ *        and my process rank. It is an input argument to all the
+ *        parallel routines.
+ *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *        See superlu_defs.h for the definition of 'gridinfo_t'.
+ *
+ * B      (input/output) float*
+ *        On entry, the distributed right-hand side matrix of the possibly
+ *        equilibrated system. That is, B may be overwritten by diag(R)*B.
+ *        On exit, the distributed solution matrix Y of the possibly
+ *        equilibrated system if info = 0, where Y = Pc*diag(C)^(-1)*X,
+ *        and X is the solution of the original system.
+ *
+ * m_loc  (input) int (local)
+ *        The local row dimension of matrix B.
+ *
+ * fst_row (input) int (global)
+ *        The row number of B's first row in the global matrix.
+ *
+ * ldb    (input) int (local)
+ *        The leading dimension of matrix B.
+ *
+ * nrhs   (input) int (global)
+ *        Number of right-hand sides.
+ *
+ * SOLVEstruct (input) sSOLVEstruct_t* (global)
+ *        Contains the information for the communication during the
+ *        solution phase.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics about the triangular solves.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info   (output) int*
+ * 	   = 0: successful exit
+ *	   < 0: if info = -i, the i-th argument had an illegal value
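+ *
+ * Typical call sequence (a sketch only; in the library the driver
+ * routine psgssvx performs these steps, and the full argument list of
+ * psgstrs_init is omitted here):
+ *
+ *     psgstrs_init(...);    /* builds SOLVEstruct->row_to_proc and
+ *                              SOLVEstruct->gstrs_comm */
+ *     psgstrs(n, LUstruct, ScalePermstruct, grid, B, m_loc, fst_row,
+ *             ldb, nrhs, SOLVEstruct, stat, &info);
+ *     if ( info ) { /* the (-info)-th argument was illegal */ }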
+ * </pre>
+ */ + +void +psgstrs(int_t n, sLUstruct_t *LUstruct, + sScalePermstruct_t *ScalePermstruct, + gridinfo_t *grid, float *B, + int_t m_loc, int_t fst_row, int_t ldb, int nrhs, + sSOLVEstruct_t *SOLVEstruct, + SuperLUStat_t *stat, int *info) +{ + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + sLocalLU_t *Llu = LUstruct->Llu; + float alpha = 1.0; + float beta = 0.0; + float zero = 0.0; + float *lsum; /* Local running sum of the updates to B-components */ + float *x; /* X component at step k. */ + /* NOTE: x and lsum are of same size. */ + float *lusup, *dest; + float *recvbuf, *recvbuf_on, *tempv, + *recvbufall, *recvbuf_BC_fwd, *recvbuf0, *xin; + float *rtemp, *rtemp_loc; /* Result of full matrix-vector multiply. */ + float *Linv; /* Inverse of diagonal block */ + float *Uinv; /* Inverse of diagonal block */ + int *ipiv; + int_t *leaf_send; + int_t nleaf_send, nleaf_send_tmp; + int_t *root_send; + int_t nroot_send, nroot_send_tmp; + int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + /*-- Data structures used for broadcast and reduction trees. --*/ + BcTree *LBtree_ptr = Llu->LBtree_ptr; + RdTree *LRtree_ptr = Llu->LRtree_ptr; + BcTree *UBtree_ptr = Llu->UBtree_ptr; + RdTree *URtree_ptr = Llu->URtree_ptr; + int_t *Urbs1; /* Number of row blocks in each block column of U. */ + int_t *Urbs = Llu->Urbs; /* Number of row blocks in each block column of U. */ + Ucb_indptr_t **Ucb_indptr = Llu->Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ + int_t **Ucb_valptr = Llu->Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ + int_t kcol, krow, mycol, myrow; + int_t i, ii, il, j, jj, k, kk, lb, ljb, lk, lib, lptr, luptr, gb, nn; + int_t nb, nlb,nlb_nodiag, nub, nsupers, nsupers_j, nsupers_i,maxsuper; + int_t *xsup, *supno, *lsub, *usub; + int_t *ilsum; /* Starting position of each supernode in lsum (LOCAL)*/ + int Pc, Pr, iam; + int knsupc, nsupr, nprobe; + int nbtree, nrtree, outcount; + int ldalsum; /* Number of lsum entries locally owned. */ + int maxrecvsz, p, pi; + int_t **Lrowind_bc_ptr; + float **Lnzval_bc_ptr; + float **Linv_bc_ptr; + float **Uinv_bc_ptr; + float sum; + MPI_Status status,status_on,statusx,statuslsum; + pxgstrs_comm_t *gstrs_comm = SOLVEstruct->gstrs_comm; + SuperLUStat_t **stat_loc; + + double tmax; + /*-- Counts used for L-solve --*/ + int_t *fmod; /* Modification count for L-solve -- + Count the number of local block products to + be summed into lsum[lk]. */ + int_t fmod_tmp; + int_t **fsendx_plist = Llu->fsendx_plist; + int_t nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */ + int_t nfrecvx_buf=0; + int_t *frecv; /* Count of lsum[lk] contributions to be received + from processes in this row. + It is only valid on the diagonal processes. */ + int_t frecv_tmp; + int_t nfrecvmod = 0; /* Count of total modifications to be recv'd. */ + int_t nfrecv = 0; /* Count of total messages to be recv'd. */ + int_t nbrecv = 0; /* Count of total messages to be recv'd. */ + int_t nleaf = 0, nroot = 0; + int_t nleaftmp = 0, nroottmp = 0; + int_t msgsize; + /*-- Counts used for U-solve --*/ + int_t *bmod; /* Modification count for U-solve. */ + int_t bmod_tmp; + int_t **bsendx_plist = Llu->bsendx_plist; + int_t nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */ + int_t nbrecvx_buf=0; + int_t *brecv; /* Count of modifications to be recv'd from + processes in this row. */ + int_t nbrecvmod = 0; /* Count of total modifications to be recv'd. 
*/ + int_t flagx,flaglsum,flag; + int_t *LBTree_active, *LRTree_active, *LBTree_finish, *LRTree_finish, *leafsups, *rootsups; + int_t TAG; + double t1_sol, t2_sol, t; +#if ( DEBUGlevel>=2 ) + int_t Ublocks = 0; +#endif + + int_t gik,iklrow,fnz; + + int_t *mod_bit = Llu->mod_bit; /* flag contribution from each row block */ + int INFO, pad; + int_t tmpresult; + + // #if ( PROFlevel>=1 ) + double t1, t2; + float msg_vol = 0, msg_cnt = 0; + // #endif + + int_t msgcnt[4]; /* Count the size of the message xfer'd in each buffer: + * 0 : transferred in Lsub_buf[] + * 1 : transferred in Lval_buf[] + * 2 : transferred in Usub_buf[] + * 3 : transferred in Uval_buf[] + */ + int iword = sizeof (int_t); + int dword = sizeof (float); + int Nwork; + int_t procs = grid->nprow * grid->npcol; + yes_no_t done; + yes_no_t startforward; + int nbrow; + int_t ik, rel, idx_r, jb, nrbl, irow, pc,iknsupc; + int_t lptr1_tmp, idx_i, idx_v,m; + int_t ready; + int thread_id = 0; + yes_no_t empty; + int_t sizelsum,sizertemp,aln_d,aln_i; + aln_d = ceil(CACHELINE/(double)dword); + aln_i = ceil(CACHELINE/(double)iword); + int num_thread = 1; + + maxsuper = sp_ienv_dist(3); + +//#ifdef _OPENMP +//#pragma omp threadprivate(thread_id) +//#endif + +#ifdef _OPENMP +#pragma omp parallel default(shared) + { + if (omp_get_thread_num () == 0) { + num_thread = omp_get_num_threads (); + } + } +#else + num_thread=1; +#endif + +#if ( PRNTlevel>=1 ) + if( grid->iam==0 ) { + printf("num_thread: %5d\n", num_thread); + fflush(stdout); + } +#endif + + MPI_Barrier( grid->comm ); + t1_sol = SuperLU_timer_(); + t = SuperLU_timer_(); + + /* Test input parameters. */ + *info = 0; + if ( n < 0 ) *info = -1; + else if ( nrhs < 0 ) *info = -9; + if ( *info ) { + pxerr_dist("PSGSTRS", grid, -*info); + return; + } + + /* + * Initialization. + */ + iam = grid->iam; + Pc = grid->npcol; + Pr = grid->nprow; + myrow = MYROW( iam, grid ); + mycol = MYCOL( iam, grid ); + xsup = Glu_persist->xsup; + supno = Glu_persist->supno; + nsupers = supno[n-1] + 1; + Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + Linv_bc_ptr = Llu->Linv_bc_ptr; + Uinv_bc_ptr = Llu->Uinv_bc_ptr; + nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */ + + stat->utime[SOL_COMM] = 0.0; + stat->utime[SOL_GEMM] = 0.0; + stat->utime[SOL_TRSM] = 0.0; + stat->utime[SOL_TOT] = 0.0; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter psgstrs()"); +#endif + + stat->ops[SOLVE] = 0.0; + Llu->SolveMsgSent = 0; + + /* Save the count to be altered so it can be used by + subsequent call to PDGSTRS. */ + if ( !(fmod = intMalloc_dist(nlb*aln_i)) ) + ABORT("Malloc fails for fmod[]."); + for (i = 0; i < nlb; ++i) fmod[i*aln_i] = Llu->fmod[i]; + if ( !(frecv = intCalloc_dist(nlb)) ) + ABORT("Calloc fails for frecv[]."); + Llu->frecv = frecv; + + if ( !(leaf_send = intMalloc_dist((CEILING( nsupers, Pr )+CEILING( nsupers, Pc ))*aln_i)) ) + ABORT("Malloc fails for leaf_send[]."); + nleaf_send=0; + if ( !(root_send = intMalloc_dist((CEILING( nsupers, Pr )+CEILING( nsupers, Pc ))*aln_i)) ) + ABORT("Malloc fails for root_send[]."); + nroot_send=0; + +#ifdef _CRAY + ftcs1 = _cptofcd("L", strlen("L")); + ftcs2 = _cptofcd("N", strlen("N")); + ftcs3 = _cptofcd("U", strlen("U")); +#endif + + + /* Obtain ilsum[] and ldalsum for process column 0. */ + ilsum = Llu->ilsum; + ldalsum = Llu->ldalsum; + + /* Allocate working storage. 
*/ + knsupc = sp_ienv_dist(3); + maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H ); + sizelsum = (((size_t)ldalsum)*nrhs + nlb*LSUM_H); + sizelsum = ((sizelsum + (aln_d - 1)) / aln_d) * aln_d; + +#ifdef _OPENMP + if ( !(lsum = (float*)SUPERLU_MALLOC(sizelsum*num_thread * sizeof(float)))) + ABORT("Malloc fails for lsum[]."); +#pragma omp parallel default(shared) private(ii,thread_id) + { + thread_id = omp_get_thread_num(); //mjc + for (ii=0; ii=2 ) + /* Dump the L factor using matlab triple-let format. */ + sDumpLblocks(iam, nsupers, grid, Glu_persist, Llu); +#endif + + /*--------------------------------------------------- + * Forward solve Ly = b. + *---------------------------------------------------*/ + /* Redistribute B into X on the diagonal processes. */ + psReDistribute_B_to_X(B, m_loc, nrhs, ldb, fst_row, ilsum, x, + ScalePermstruct, Glu_persist, grid, SOLVEstruct); + +#if ( PRNTlevel>=2 ) + t = SuperLU_timer_() - t; + if ( !iam) printf(".. B to X redistribute time\t%8.4f\n", t); + fflush(stdout); + t = SuperLU_timer_(); +#endif + + /* Set up the headers in lsum[]. */ + for (k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + if ( myrow == krow ) { + lk = LBi( k, grid ); /* Local block number. */ + il = LSUM_BLK( lk ); + lsum[il - LSUM_H] = k; /* Block number prepended in the header. */ + } + } + + /* --------------------------------------------------------- + Initialize the async Bcast trees on all processes. + --------------------------------------------------------- */ + nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */ + + nbtree = 0; + for (lk=0;lk0)nfrecvx_buf++; + } + BcTree_allocateRequest(LBtree_ptr[lk],'s'); + } + } + + nsupers_i = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ + if ( !( leafsups = (int_t*)intCalloc_dist(nsupers_i)) ) + ABORT("Calloc fails for leafsups."); + + nrtree = 0; + nleaf=0; + nfrecvmod=0; + + + +if(procs==1){ + for (lk=0;lknprow; /* not sure */ + if(gbnprow; /* not sure */ + if(gb=2 ) + printf("(%2d) nfrecvx %4d, nfrecvmod %4d, nleaf %4d\n, nbtree %4d\n, nrtree %4d\n", + iam, nfrecvx, nfrecvmod, nleaf, nbtree, nrtree); + fflush(stdout); +#endif + +#if ( PRNTlevel>=2 ) + t = SuperLU_timer_() - t; + if ( !iam) printf(".. Setup L-solve time\t%8.4f\n", t); + fflush(stdout); + MPI_Barrier( grid->comm ); + t = SuperLU_timer_(); +#endif + +#if ( VAMPIR>=1 ) + // VT_initialize(); + VT_traceon(); +#endif + +#ifdef USE_VTUNE + __SSC_MARK(0x111);// start SDE tracing, note uses 2 underscores + __itt_resume(); // start VTune, again use 2 underscores +#endif + + /* --------------------------------------------------------- + Solve the leaf nodes first by all the diagonal processes. + --------------------------------------------------------- */ +#if ( DEBUGlevel>=2 ) + printf("(%2d) nleaf %4d\n", iam, nleaf); + fflush(stdout); +#endif + + +#ifdef _OPENMP +#pragma omp parallel default (shared) + { + int thread_id = omp_get_thread_num(); +#else + { + thread_id=0; +#endif + { + + if (Llu->inv == 1) { /* Diagonal is inverted. */ + +#ifdef _OPENMP +#pragma omp for firstprivate(nrhs,beta,alpha,x,rtemp,ldalsum) private (ii,k,knsupc,lk,luptr,lsub,nsupr,lusup,t1,t2,Linv,i,lib,rtemp_loc,nleaf_send_tmp) nowait +#endif + for (jj=0;jj=1 ) + TIC(t1); +#endif + rtemp_loc = &rtemp[sizertemp* thread_id]; + + knsupc = SuperSize( k ); + lk = LBi( k, grid ); + + ii = X_BLK( lk ); + lk = LBj( k, grid ); /* Local block number, column-wise. 
*/ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + + nsupr = lsub[1]; + Linv = Linv_bc_ptr[lk]; +#ifdef _CRAY + SGEMM( ftcs2, ftcs2, &knsupc, &nrhs, &knsupc, + &alpha, Linv, &knsupc, &x[ii], + &knsupc, &beta, rtemp_loc, &knsupc ); +#elif defined (USE_VENDOR_BLAS) + sgemm_( "N", "N", &knsupc, &nrhs, &knsupc, + &alpha, Linv, &knsupc, &x[ii], + &knsupc, &beta, rtemp_loc, &knsupc, 1, 1 ); +#else + sgemm_( "N", "N", &knsupc, &nrhs, &knsupc, + &alpha, Linv, &knsupc, &x[ii], + &knsupc, &beta, rtemp_loc, &knsupc ); +#endif + +#ifdef _OPENMP +#pragma omp simd +#endif + for (i=0 ; i=1 ) + TOC(t2, t1); + stat_loc[thread_id]->utime[SOL_TRSM] += t2; + +#endif + + stat_loc[thread_id]->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs; + // --nleaf; +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, k); +#endif + /* + * Send Xk to process column Pc[k]. + */ + + if(LBtree_ptr[lk]!=NULL){ + lib = LBi( k, grid ); /* Local block number, row-wise. */ + ii = X_BLK( lib ); + +#ifdef _OPENMP +#pragma omp atomic capture +#endif + nleaf_send_tmp = ++nleaf_send; + leaf_send[(nleaf_send_tmp-1)*aln_i] = lk; + // BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],'s'); + } + } + } + } else { /* Diagonal is not inverted. */ +#ifdef _OPENMP +#pragma omp for firstprivate (nrhs,beta,alpha,x,rtemp,ldalsum) private (ii,k,knsupc,lk,luptr,lsub,nsupr,lusup,t1,t2,Linv,i,lib,rtemp_loc,nleaf_send_tmp) nowait +#endif + for (jj=0;jj=1 ) + TIC(t1); +#endif + rtemp_loc = &rtemp[sizertemp* thread_id]; + + knsupc = SuperSize( k ); + lk = LBi( k, grid ); + + ii = X_BLK( lk ); + lk = LBj( k, grid ); /* Local block number, column-wise. */ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + + nsupr = lsub[1]; + +#ifdef _CRAY + STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#elif defined (USE_VENDOR_BLAS) + strsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); +#else + strsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#endif + +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat_loc[thread_id]->utime[SOL_TRSM] += t2; + +#endif + + stat_loc[thread_id]->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs; + + // --nleaf; +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, k); +#endif + + /* + * Send Xk to process column Pc[k]. + */ + + if (LBtree_ptr[lk]!=NULL) { + lib = LBi( k, grid ); /* Local block number, row-wise. */ + ii = X_BLK( lib ); + +#ifdef _OPENMP +#pragma omp atomic capture +#endif + nleaf_send_tmp = ++nleaf_send; + leaf_send[(nleaf_send_tmp-1)*aln_i] = lk; + } + } /* end a block */ + } /* end for jj ... */ + } /* end else ... diagonal is not inverted */ + } + } /* end parallel region */ + + jj=0; + +#ifdef _OPENMP +#pragma omp parallel default (shared) +#endif + { + +#ifdef _OPENMP +#pragma omp master +#endif + { + +#ifdef _OPENMP +#pragma omp taskloop private (k,ii,lk,thread_id) num_tasks(num_thread*8) nogroup +#endif + for (jj=0;jj=0){ // this is a bcast forwarding + gb = mycol+lk*grid->npcol; /* not sure */ + lib = LBi( gb, grid ); /* Local block number, row-wise. 
*/ + ii = X_BLK( lib ); + BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],BcTree_GetMsgSize(LBtree_ptr[lk],'s')*nrhs+XK_H,'s'); + }else{ // this is a reduce forwarding + lk = -lk - 1; + il = LSUM_BLK( lk ); + RdTree_forwardMessageSimple(LRtree_ptr[lk],&lsum[il - LSUM_H ],RdTree_GetMsgSize(LRtree_ptr[lk],'s')*nrhs+LSUM_H,'s'); + } + } + + +#ifdef USE_VTUNE + __itt_pause(); // stop VTune + __SSC_MARK(0x222); // stop SDE tracing +#endif + + /* ----------------------------------------------------------- + Compute the internal nodes asynchronously by all processes. + ----------------------------------------------------------- */ + +#ifdef _OPENMP +#pragma omp parallel default (shared) + { + int thread_id = omp_get_thread_num(); +#else + { + thread_id = 0; +#endif +#ifdef _OPENMP +#pragma omp master +#endif + { + for ( nfrecv =0; nfrecv=1 ) + TIC(t1); + // msgcnt[1] = maxrecvsz; +#endif + + recvbuf0 = &recvbuf_BC_fwd[nfrecvx_buf*maxrecvsz]; + + /* Receive a message. */ + MPI_Recv( recvbuf0, maxrecvsz, MPI_FLOAT, + MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status ); + // MPI_Irecv(recvbuf0,maxrecvsz,MPI_FLOAT,MPI_ANY_SOURCE,MPI_ANY_TAG,grid->comm,&req); + // ready=0; + // while(ready==0){ + // MPI_Test(&req,&ready,&status); + // #pragma omp taskyield + // } + +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat_loc[thread_id]->utime[SOL_COMM] += t2; + + msg_cnt += 1; + msg_vol += maxrecvsz * dword; +#endif + + { + + k = *recvbuf0; + +#if ( DEBUGlevel>=2 ) + printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); +#endif + + if(status.MPI_TAG==BC_L){ + // --nfrecvx; + nfrecvx_buf++; + { + lk = LBj( k, grid ); /* local block number */ + + if(BcTree_getDestCount(LBtree_ptr[lk],'s')>0){ + + BcTree_forwardMessageSimple(LBtree_ptr[lk],recvbuf0,BcTree_GetMsgSize(LBtree_ptr[lk],'s')*nrhs+XK_H,'s'); + // nfrecvx_buf++; + } + + /* + * Perform local block modifications: lsum[i] -= L_i,k * X[k] + */ + + lk = LBj( k, grid ); /* Local block number, column-wise. */ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + if ( lsub ) { + krow = PROW( k, grid ); + if(myrow==krow){ + nb = lsub[0] - 1; + knsupc = SuperSize( k ); + ii = X_BLK( LBi( k, grid ) ); + xin = &x[ii]; + }else{ + nb = lsub[0]; + knsupc = SuperSize( k ); + xin = &recvbuf0[XK_H] ; + } + slsum_fmod_inv_master(lsum, x, xin, rtemp, nrhs, knsupc, k, + fmod, nb, xsup, grid, Llu, + stat_loc,sizelsum,sizertemp,0,maxsuper,thread_id,num_thread); + + } /* if lsub */ + } + }else if(status.MPI_TAG==RD_L){ + // --nfrecvmod; + lk = LBi( k, grid ); /* Local block number, row-wise. 
*/ + + knsupc = SuperSize( k ); + tempv = &recvbuf0[LSUM_H]; + il = LSUM_BLK( lk ); + RHS_ITERATE(j) { + for (i = 0; i < knsupc; ++i) + lsum[i + il + j*knsupc + thread_id*sizelsum] += tempv[i + j*knsupc]; + } + + // #ifdef _OPENMP + // #pragma omp atomic capture + // #endif + fmod_tmp=--fmod[lk*aln_i]; + { + thread_id = 0; + rtemp_loc = &rtemp[sizertemp* thread_id]; + if ( fmod_tmp==0 ) { + if(RdTree_IsRoot(LRtree_ptr[lk],'s')==YES){ + // ii = X_BLK( lk ); + knsupc = SuperSize( k ); + for (ii=1;ii=1 ) + TIC(t1); +#endif + if(Llu->inv == 1){ + Linv = Linv_bc_ptr[lk]; +#ifdef _CRAY + SGEMM( ftcs2, ftcs2, &knsupc, &nrhs, &knsupc, + &alpha, Linv, &knsupc, &x[ii], + &knsupc, &beta, rtemp_loc, &knsupc ); +#elif defined (USE_VENDOR_BLAS) + sgemm_( "N", "N", &knsupc, &nrhs, &knsupc, + &alpha, Linv, &knsupc, &x[ii], + &knsupc, &beta, rtemp_loc, &knsupc, 1, 1 ); +#else + sgemm_( "N", "N", &knsupc, &nrhs, &knsupc, + &alpha, Linv, &knsupc, &x[ii], + &knsupc, &beta, rtemp_loc, &knsupc ); +#endif + #ifdef _OPENMP + #pragma omp simd + #endif + for (i=0 ; iinnv == 0 */ +#ifdef _CRAY + STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#elif defined (USE_VENDOR_BLAS) + strsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); +#else + strsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#endif + } /* end if-else */ + +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat_loc[thread_id]->utime[SOL_TRSM] += t2; +#endif + + stat_loc[thread_id]->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs; + +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, k); +#endif + + /* + * Send Xk to process column Pc[k]. + */ + if(LBtree_ptr[lk]!=NULL){ + BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],BcTree_GetMsgSize(LBtree_ptr[lk],'s')*nrhs+XK_H,'s'); + } + /* + * Perform local block modifications. + */ + lk = LBj( k, grid ); /* Local block number, column-wise. */ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + if ( lsub ) { + krow = PROW( k, grid ); + nb = lsub[0] - 1; + knsupc = SuperSize( k ); + ii = X_BLK( LBi( k, grid ) ); + xin = &x[ii]; + slsum_fmod_inv_master(lsum, x, xin, rtemp, nrhs, knsupc, k, + fmod, nb, xsup, grid, Llu, + stat_loc,sizelsum,sizertemp,0,maxsuper,thread_id,num_thread); + } /* if lsub */ + // } + + }else{ /* fmod_tmp != 0 */ + il = LSUM_BLK( lk ); + knsupc = SuperSize( k ); + for (ii=1;ii=2 ) + t = SuperLU_timer_() - t; + stat->utime[SOL_TOT] += t; + if ( !iam ) { + printf(".. L-solve time\t%8.4f\n", t); + fflush(stdout); + } + + MPI_Reduce (&t, &tmax, 1, MPI_DOUBLE, MPI_MAX, 0, grid->comm); + if ( !iam ) { + printf(".. L-solve time (MAX) \t%8.4f\n", tmax); + fflush(stdout); + } + t = SuperLU_timer_(); +#endif + +#if ( DEBUGlevel==2 ) + { + printf("(%d) .. 
After L-solve: y =\n", iam); + for (i = 0, k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + kcol = PCOL( k, grid ); + if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ + knsupc = SuperSize( k ); + lk = LBi( k, grid ); + ii = X_BLK( lk ); + for (j = 0; j < knsupc; ++j) + printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]); + fflush(stdout); + } + MPI_Barrier( grid->comm ); + } + } +#endif + + SUPERLU_FREE(fmod); + SUPERLU_FREE(frecv); + SUPERLU_FREE(leaf_send); + SUPERLU_FREE(leafsups); + SUPERLU_FREE(recvbuf_BC_fwd); + log_memory(-nlb*aln_i*iword-nlb*iword-(CEILING( nsupers, Pr )+CEILING( nsupers, Pc ))*aln_i*iword- nsupers_i*iword -maxrecvsz*(nfrecvx+1)*dword, stat); //account for fmod, frecv, leaf_send, leafsups, recvbuf_BC_fwd + + for (lk=0;lkcomm ); + +#if ( VAMPIR>=1 ) + VT_traceoff(); + VT_finalize(); +#endif + + + /*--------------------------------------------------- + * Back solve Ux = y. + * + * The Y components from the forward solve is already + * on the diagonal processes. + *---------------------------------------------------*/ + + /* Save the count to be altered so it can be used by + subsequent call to PDGSTRS. */ + if ( !(bmod = intMalloc_dist(nlb*aln_i)) ) + ABORT("Malloc fails for bmod[]."); + for (i = 0; i < nlb; ++i) bmod[i*aln_i] = Llu->bmod[i]; + if ( !(brecv = intCalloc_dist(nlb)) ) + ABORT("Calloc fails for brecv[]."); + Llu->brecv = brecv; + + k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb; + + /* Re-initialize lsum to zero. Each block header is already in place. */ + +#ifdef _OPENMP + +#pragma omp parallel default(shared) private(ii) + { + int thread_id = omp_get_thread_num(); + for(ii=0;ii=2 ) + for (p = 0; p < Pr*Pc; ++p) { + if (iam == p) { + printf("(%2d) .. Ublocks %d\n", iam, Ublocks); + for (lb = 0; lb < nub; ++lb) { + printf("(%2d) Local col %2d: # row blocks %2d\n", + iam, lb, Urbs[lb]); + if ( Urbs[lb] ) { + for (i = 0; i < Urbs[lb]; ++i) + printf("(%2d) .. row blk %2d:\ + lbnum %d, indpos %d, valpos %d\n", + iam, i, + Ucb_indptr[lb][i].lbnum, + Ucb_indptr[lb][i].indpos, + Ucb_valptr[lb][i]); + } + } + } + MPI_Barrier( grid->comm ); + } + for (p = 0; p < Pr*Pc; ++p) { + if ( iam == p ) { + printf("\n(%d) bsendx_plist[][]", iam); + for (lb = 0; lb < nub; ++lb) { + printf("\n(%d) .. local col %2d: ", iam, lb); + for (i = 0; i < Pr; ++i) + printf("%4d", bsendx_plist[lb][i]); + } + printf("\n"); + } + MPI_Barrier( grid->comm ); + } +#endif /* DEBUGlevel */ + + /* --------------------------------------------------------- + Initialize the async Bcast trees on all processes. + --------------------------------------------------------- */ + nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */ + + nbtree = 0; + for (lk=0;lk0)nbrecvx_buf++; + } + BcTree_allocateRequest(UBtree_ptr[lk],'s'); + } + } + + nsupers_i = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ + if ( !( rootsups = (int_t*)intCalloc_dist(nsupers_i)) ) + ABORT("Calloc fails for rootsups."); + + nrtree = 0; + nroot=0; + for (lk=0;lknprow; /* not sure */ + if(gb=2 ) + printf("(%2d) nbrecvx %4d, nbrecvmod %4d, nroot %4d\n, nbtree %4d\n, nrtree %4d\n", + iam, nbrecvx, nbrecvmod, nroot, nbtree, nrtree); + fflush(stdout); +#endif + + +#if ( PRNTlevel>=2 ) + t = SuperLU_timer_() - t; + if ( !iam) printf(".. Setup U-solve time\t%8.4f\n", t); + fflush(stdout); + MPI_Barrier( grid->comm ); + t = SuperLU_timer_(); +#endif + + /* + * Solve the roots first by all the diagonal processes. 
+ */ +#if ( DEBUGlevel>=2 ) + printf("(%2d) nroot %4d\n", iam, nroot); + fflush(stdout); +#endif + +#ifdef _OPENMP +#pragma omp parallel default (shared) +#endif + { +#ifdef _OPENMP +#pragma omp master +#endif + { +#ifdef _OPENMP +#pragma omp taskloop firstprivate (nrhs,beta,alpha,x,rtemp,ldalsum) private (ii,jj,k,knsupc,lk,luptr,lsub,nsupr,lusup,t1,t2,Uinv,i,lib,rtemp_loc,nroot_send_tmp,thread_id) nogroup +#endif + for (jj=0;jj=1 ) + TIC(t1); +#endif +#ifdef _OPENMP + thread_id = omp_get_thread_num (); +#else + thread_id = 0; +#endif + rtemp_loc = &rtemp[sizertemp* thread_id]; + + knsupc = SuperSize( k ); + lk = LBi( k, grid ); /* Local block number, row-wise. */ + + // bmod[lk] = -1; /* Do not solve X[k] in the future. */ + ii = X_BLK( lk ); + lk = LBj( k, grid ); /* Local block number, column-wise */ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + nsupr = lsub[1]; + + if(Llu->inv == 1){ + + Uinv = Uinv_bc_ptr[lk]; +#ifdef _CRAY + SGEMM( ftcs2, ftcs2, &knsupc, &nrhs, &knsupc, + &alpha, Uinv, &knsupc, &x[ii], + &knsupc, &beta, rtemp_loc, &knsupc ); +#elif defined (USE_VENDOR_BLAS) + sgemm_( "N", "N", &knsupc, &nrhs, &knsupc, + &alpha, Uinv, &knsupc, &x[ii], + &knsupc, &beta, rtemp_loc, &knsupc, 1, 1 ); +#else + sgemm_( "N", "N", &knsupc, &nrhs, &knsupc, + &alpha, Uinv, &knsupc, &x[ii], + &knsupc, &beta, rtemp_loc, &knsupc ); +#endif + #ifdef _OPENMP + #pragma omp simd + #endif + for (i=0 ; i=1 ) + TOC(t2, t1); + stat_loc[thread_id]->utime[SOL_TRSM] += t2; +#endif + stat_loc[thread_id]->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs; + +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, k); +#endif + + /* + * Send Xk to process column Pc[k]. + */ + + if(UBtree_ptr[lk]!=NULL){ +#ifdef _OPENMP +#pragma omp atomic capture +#endif + nroot_send_tmp = ++nroot_send; + root_send[(nroot_send_tmp-1)*aln_i] = lk; + + } + } /* for jj ... */ + } /* omp master region */ + } /* omp parallel region */ + + +#ifdef _OPENMP +#pragma omp parallel default (shared) +#endif + { +#ifdef _OPENMP +#pragma omp master +#endif + { +#ifdef _OPENMP +#pragma omp taskloop private (ii,jj,k,lk,thread_id) nogroup +#endif + for (jj=0;jj=0){ // this is a bcast forwarding + gb = mycol+lk*grid->npcol; /* not sure */ + lib = LBi( gb, grid ); /* Local block number, row-wise. */ + ii = X_BLK( lib ); + BcTree_forwardMessageSimple(UBtree_ptr[lk],&x[ii - XK_H],BcTree_GetMsgSize(UBtree_ptr[lk],'s')*nrhs+XK_H,'s'); + }else{ // this is a reduce forwarding + lk = -lk - 1; + il = LSUM_BLK( lk ); + RdTree_forwardMessageSimple(URtree_ptr[lk],&lsum[il - LSUM_H ],RdTree_GetMsgSize(URtree_ptr[lk],'s')*nrhs+LSUM_H,'s'); + } +} + + /* + * Compute the internal nodes asychronously by all processes. + */ + +#ifdef _OPENMP +#pragma omp parallel default (shared) + { + int thread_id=omp_get_thread_num(); +#else + { + thread_id = 0; +#endif +#ifdef _OPENMP +#pragma omp master +#endif + for ( nbrecv =0; nbrecv=1 ) + TIC(t1); +#endif + + recvbuf0 = &recvbuf_BC_fwd[nbrecvx_buf*maxrecvsz]; + + /* Receive a message. 
*/ + MPI_Recv( recvbuf0, maxrecvsz, MPI_FLOAT, + MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status ); + +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat_loc[thread_id]->utime[SOL_COMM] += t2; + msg_cnt += 1; + msg_vol += maxrecvsz * dword; +#endif + + k = *recvbuf0; +#if ( DEBUGlevel>=2 ) + printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); + fflush(stdout); +#endif + if(status.MPI_TAG==BC_U){ + // --nfrecvx; + nbrecvx_buf++; + lk = LBj( k, grid ); /* local block number */ + if(BcTree_getDestCount(UBtree_ptr[lk],'s')>0){ + + BcTree_forwardMessageSimple(UBtree_ptr[lk],recvbuf0,BcTree_GetMsgSize(UBtree_ptr[lk],'s')*nrhs+XK_H,'s'); + // nfrecvx_buf++; + } + + /* + * Perform local block modifications: lsum[i] -= U_i,k * X[k] + */ + + lk = LBj( k, grid ); /* Local block number, column-wise. */ + slsum_bmod_inv_master(lsum, x, &recvbuf0[XK_H], rtemp, nrhs, k, bmod, Urbs, + Ucb_indptr, Ucb_valptr, xsup, grid, Llu, + stat_loc, sizelsum,sizertemp,thread_id,num_thread); + }else if(status.MPI_TAG==RD_U){ + + lk = LBi( k, grid ); /* Local block number, row-wise. */ + + knsupc = SuperSize( k ); + tempv = &recvbuf0[LSUM_H]; + il = LSUM_BLK( lk ); + RHS_ITERATE(j) { + #ifdef _OPENMP + #pragma omp simd + #endif + for (i = 0; i < knsupc; ++i) + lsum[i + il + j*knsupc + thread_id*sizelsum] += tempv[i + j*knsupc]; + } + // #ifdef _OPENMP + // #pragma omp atomic capture + // #endif + bmod_tmp=--bmod[lk*aln_i]; + thread_id = 0; + rtemp_loc = &rtemp[sizertemp* thread_id]; + if ( bmod_tmp==0 ) { + if(RdTree_IsRoot(URtree_ptr[lk],'s')==YES){ + + knsupc = SuperSize( k ); + for (ii=1;iiinv == 1){ + + Uinv = Uinv_bc_ptr[lk]; + +#ifdef _CRAY + SGEMM( ftcs2, ftcs2, &knsupc, &nrhs, &knsupc, + &alpha, Uinv, &knsupc, &x[ii], + &knsupc, &beta, rtemp_loc, &knsupc ); +#elif defined (USE_VENDOR_BLAS) + sgemm_( "N", "N", &knsupc, &nrhs, &knsupc, + &alpha, Uinv, &knsupc, &x[ii], + &knsupc, &beta, rtemp_loc, &knsupc, 1, 1 ); +#else + sgemm_( "N", "N", &knsupc, &nrhs, &knsupc, + &alpha, Uinv, &knsupc, &x[ii], + &knsupc, &beta, rtemp_loc, &knsupc ); +#endif + + #ifdef _OPENMP + #pragma omp simd + #endif + for (i=0 ; iinv == 0 */ +#ifdef _CRAY + STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#elif defined (USE_VENDOR_BLAS) + strsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); +#else + strsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#endif + } + +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat_loc[thread_id]->utime[SOL_TRSM] += t2; +#endif + stat_loc[thread_id]->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs; + +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, k); +#endif + /* + * Send Xk to process column Pc[k]. + */ + if(UBtree_ptr[lk]!=NULL){ + BcTree_forwardMessageSimple(UBtree_ptr[lk],&x[ii - XK_H],BcTree_GetMsgSize(UBtree_ptr[lk],'s')*nrhs+XK_H,'s'); + } + + /* + * Perform local block modifications: + * lsum[i] -= U_i,k * X[k] + */ + if ( Urbs[lk] ) + slsum_bmod_inv_master(lsum, x, &x[ii], rtemp, nrhs, k, bmod, Urbs, + Ucb_indptr, Ucb_valptr, xsup, grid, Llu, + stat_loc, sizelsum,sizertemp,thread_id,num_thread); + + }else{ + il = LSUM_BLK( lk ); + knsupc = SuperSize( k ); + + for (ii=1;ii=2 ) + t = SuperLU_timer_() - t; + stat->utime[SOL_TOT] += t; + if ( !iam ) printf(".. U-solve time\t%8.4f\n", t); + MPI_Reduce (&t, &tmax, 1, MPI_DOUBLE, MPI_MAX, 0, grid->comm); + if ( !iam ) { + printf(".. 
U-solve time (MAX) \t%8.4f\n", tmax); + fflush(stdout); + } + t = SuperLU_timer_(); +#endif + +#if ( DEBUGlevel>=2 ) + { + float *x_col; + int diag; + printf("\n(%d) .. After U-solve: x (ON DIAG PROCS) = \n", iam); + ii = 0; + for (k = 0; k < nsupers; ++k) { + knsupc = SuperSize( k ); + krow = PROW( k, grid ); + kcol = PCOL( k, grid ); + diag = PNUM( krow, kcol, grid); + if ( iam == diag ) { /* Diagonal process. */ + lk = LBi( k, grid ); + jj = X_BLK( lk ); + x_col = &x[jj]; + RHS_ITERATE(j) { + for (i = 0; i < knsupc; ++i) { /* X stored in blocks */ + printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+i, x_col[i]); + } + x_col += knsupc; + } + } + ii += knsupc; + } /* for k ... */ + } +#endif + + psReDistribute_X_to_B(n, B, m_loc, ldb, fst_row, nrhs, x, ilsum, + ScalePermstruct, Glu_persist, grid, SOLVEstruct); + +#if ( PRNTlevel>=2 ) + t = SuperLU_timer_() - t; + if ( !iam) printf(".. X to B redistribute time\t%8.4f\n", t); + t = SuperLU_timer_(); +#endif + + double tmp1=0; + double tmp2=0; + double tmp3=0; + double tmp4=0; + for(i=0;iutime[SOL_TRSM]); + tmp2 = SUPERLU_MAX(tmp2,stat_loc[i]->utime[SOL_GEMM]); + tmp3 = SUPERLU_MAX(tmp3,stat_loc[i]->utime[SOL_COMM]); + tmp4 += stat_loc[i]->ops[SOLVE]; +#if ( PRNTlevel>=2 ) + f(iam==0)printf("thread %5d gemm %9.5f\n",i,stat_loc[i]->utime[SOL_GEMM]); +#endif + } + + stat->utime[SOL_TRSM] += tmp1; + stat->utime[SOL_GEMM] += tmp2; + stat->utime[SOL_COMM] += tmp3; + stat->ops[SOLVE]+= tmp4; + + /* Deallocate storage. */ + for(i=0;icomm ); + + +#if ( PROFlevel>=2 ) + { + float msg_vol_max, msg_vol_sum, msg_cnt_max, msg_cnt_sum; + MPI_Reduce (&msg_cnt, &msg_cnt_sum, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); + MPI_Reduce (&msg_cnt, &msg_cnt_max, + 1, MPI_FLOAT, MPI_MAX, 0, grid->comm); + MPI_Reduce (&msg_vol, &msg_vol_sum, + 1, MPI_FLOAT, MPI_SUM, 0, grid->comm); + MPI_Reduce (&msg_vol, &msg_vol_max, + 1, MPI_FLOAT, MPI_MAX, 0, grid->comm); + if (!iam) { + printf ("\tPDGSTRS comm stat:" + "\tAvg\tMax\t\tAvg\tMax\n" + "\t\t\tCount:\t%.0f\t%.0f\tVol(MB)\t%.2f\t%.2f\n", + msg_cnt_sum / Pr / Pc, msg_cnt_max, + msg_vol_sum / Pr / Pc * 1e-6, msg_vol_max * 1e-6); + } + } +#endif + + stat->utime[SOLVE] = SuperLU_timer_() - t1_sol; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit psgstrs()"); +#endif + +#if ( PRNTlevel>=2 ) + float for_lu, total, max, avg, temp; + superlu_dist_mem_usage_t num_mem_usage; + + sQuerySpace_dist(n, LUstruct, grid, stat, &num_mem_usage); + temp = num_mem_usage.total; + + MPI_Reduce( &temp, &max, 1, MPI_FLOAT, MPI_MAX, 0, grid->comm ); + MPI_Reduce( &temp, &avg, 1, MPI_FLOAT, MPI_SUM, 0, grid->comm ); + if (!iam) { + printf("\n** Memory Usage **********************************\n"); + printf("** Total highmark (MB):\n" + " Sum-of-all : %8.2f | Avg : %8.2f | Max : %8.2f\n", + avg * 1e-6, + avg / grid->nprow / grid->npcol * 1e-6, + max * 1e-6); + printf("**************************************************\n"); + fflush(stdout); + } +#endif + + return; +} /* PSGSTRS */ + diff --git a/SRC/psgstrs1.c b/SRC/psgstrs1.c new file mode 100644 index 00000000..83082157 --- /dev/null +++ b/SRC/psgstrs1.c @@ -0,0 +1,910 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Solves a system of distributed linear equations + * + *
+ * -- Distributed SuperLU routine (version 2.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 15, 2008
+ *
+ * Modified:
+ *     February 7, 2001    use MPI_Isend/MPI_Irecv
+ *     October 2, 2001     use MPI_Isend/MPI_Irecv with MPI_Test
+ *     October 15, 2008  use fewer MPI_Reduce
+ * </pre>
+ */ + +#include "superlu_sdefs.h" + +#define ISEND_IRECV + +/* + * Function prototypes + */ +#ifdef _CRAY +fortran void STRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, float*, + float*, int*, float*, int*); +fortran void SGEMM(_fcd, _fcd, int*, int*, int*, float*, float*, + int*, float*, int*, float*, float*, int*); +_fcd ftcs1; +_fcd ftcs2; +_fcd ftcs3; +#endif + + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * PSGSTRS1 solves a system of distributed linear equations
+ *
+ *                   op( sub(A) ) * X = sub( B )
+ *
+ * with a general N-by-N distributed matrix sub( A ) using the LU
+ * factorization computed by PSGSTRF.
+ *
+ * This routine is used only in the iterative refinement routine
+ * psgsrfs_ABXglobal, assuming that the right-hand side is already
+ * distributed in the diagonal processes.
+ *
+ * Arguments
+ * =========
+ *
+ * n      (input) int (global)
+ *        The order of the system of linear equations.
+ *
+ * LUstruct (input) sLUstruct_t*
+ *        The distributed data structures to store L and U factors,
+ *        and the permutation vectors.
+ *        See superlu_sdefs.h for the definition of 'sLUstruct_t' structure.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * x      (input/output) float*
+ *        On entry, the right hand side matrix.
+ *        On exit, the solution matrix if info = 0.
+ *
+ *        NOTE: the right-hand side matrix is already distributed on
+ *              the diagonal processes.
+ *
+ * nrhs   (input) int (global)
+ *        Number of right-hand sides.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics about the triangular solves;
+ *        See SuperLUStat_t structure defined in util.h.
+ *
+ * info   (output) int*
+ * 	   = 0: successful exit
+ *	   < 0: if info = -i, the i-th argument had an illegal value
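+ *
+ * Note
+ * ====
+ *   Messaging protocol, sketched with the names used below: after the
+ *   leaf (resp. root) supernodes are solved, every process loops on
+ *
+ *     MPI_Recv( recvbuf, maxrecvsz, MPI_FLOAT, MPI_ANY_SOURCE,
+ *               MPI_ANY_TAG, grid->comm, &status );
+ *
+ *   and dispatches on status.MPI_TAG:
+ *     Xk   : a solved block X[k] arrived; fold it into the local
+ *            running sums via slsum_fmod()/slsum_bmod().
+ *     LSUM : a partial sum for block k arrived; once frecv[lk] (resp.
+ *            brecv[lk]) and fmod[lk] (resp. bmod[lk]) are exhausted,
+ *            X[k] is solved with strsm_ and sent to process column Pc[k].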
+ * </pre>
+ */ + +void psgstrs1(int_t n, sLUstruct_t *LUstruct, gridinfo_t *grid, + float *x, int nrhs, SuperLUStat_t *stat, int *info) +{ + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + sLocalLU_t *Llu = LUstruct->Llu; + float alpha = 1.0; + float *lsum; /* Local running sum of the updates to B-components */ + float *lusup, *dest; + float *recvbuf, *tempv; + float *rtemp; /* Result of full matrix-vector multiply. */ + int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + int_t *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */ + Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ + int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ + int iam, kcol, krow, mycol, myrow; + int_t i, ii, il, j, k, lb, ljb, lk, lptr, luptr; + int_t nb, nlb, nub, nsupers; + int_t *xsup, *lsub, *usub; + int_t *ilsum; /* Starting position of each supernode in lsum (LOCAL)*/ + int_t Pc, Pr; + int knsupc, nsupr; + int ldalsum; /* Number of lsum entries locally owned. */ + int maxrecvsz, p, pi; + int_t **Lrowind_bc_ptr; + float **Lnzval_bc_ptr; + MPI_Status status; +#ifdef ISEND_IRECV + MPI_Request *send_req, recv_req; +#endif + + /*-- Counts used for L-solve --*/ + int_t *fmod; /* Modification count for L-solve. */ + int_t **fsendx_plist = Llu->fsendx_plist; + int_t nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */ + int_t *frecv; /* Count of modifications to be recv'd from + processes in this row. */ + int_t nfrecvmod = 0; /* Count of total modifications to be recv'd. */ + int_t nleaf = 0, nroot = 0; + + /*-- Counts used for U-solve --*/ + int_t *bmod; /* Modification count for L-solve. */ + int_t **bsendx_plist = Llu->bsendx_plist; + int_t nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */ + int_t *brecv; /* Count of modifications to be recv'd from + processes in this row. */ + int_t nbrecvmod = 0; /* Count of total modifications to be recv'd. */ + double t; +#if ( DEBUGlevel>=2 ) + int_t Ublocks = 0; +#endif + + int_t *mod_bit = Llu->mod_bit; /* flag contribution from each row block */ + + t = SuperLU_timer_(); + + /* Test input parameters. */ + *info = 0; + if ( n < 0 ) *info = -1; + else if ( nrhs < 0 ) *info = -8; + if ( *info ) { + pxerr_dist("PSGSTRS1", grid, -*info); + return; + } + + /* + * Initialization. + */ + iam = grid->iam; + Pc = grid->npcol; + Pr = grid->nprow; + myrow = MYROW( iam, grid ); + mycol = MYCOL( iam, grid ); + nsupers = Glu_persist->supno[n-1] + 1; + xsup = Glu_persist->xsup; + Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */ + Llu->SolveMsgSent = 0; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter psgstrs1()"); +#endif + + /* Save the count to be altered so it can be used by + subsequent call to PSGSTRS1. */ + if ( !(fmod = intMalloc_dist(nlb)) ) + ABORT("Calloc fails for fmod[]."); + for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i]; + if ( !(frecv = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for frecv[]."); + Llu->frecv = frecv; + +#ifdef ISEND_IRECV + k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb; + if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) ) + ABORT("Malloc fails for send_req[]."); +#endif + +#ifdef _CRAY + ftcs1 = _cptofcd("L", strlen("L")); + ftcs2 = _cptofcd("N", strlen("N")); + ftcs3 = _cptofcd("U", strlen("U")); +#endif + + + /* Compute ilsum[] and ldalsum for process column 0. */ + ilsum = Llu->ilsum; + ldalsum = Llu->ldalsum; + + /* Allocate working storage. 
*/ + knsupc = sp_ienv_dist(3); + if ( !(lsum = floatCalloc_dist(((size_t)ldalsum) * nrhs + + nlb * LSUM_H)) ) + ABORT("Calloc fails for lsum[]."); + maxrecvsz = knsupc * nrhs + SUPERLU_MAX(XK_H, LSUM_H); + if ( !(recvbuf = floatMalloc_dist(maxrecvsz)) ) + ABORT("Malloc fails for recvbuf[]."); + if ( !(rtemp = floatCalloc_dist(maxrecvsz)) ) + ABORT("Malloc fails for rtemp[]."); + + + /*--------------------------------------------------- + * Forward solve Ly = b. + *---------------------------------------------------*/ + + /* + * Prepended the block number in the header for lsum[]. + */ + for (k = 0; k < nsupers; ++k) { + knsupc = SuperSize( k ); + krow = PROW( k, grid ); + if ( myrow == krow ) { + lk = LBi( k, grid ); /* Local block number. */ + il = LSUM_BLK( lk ); + lsum[il - LSUM_H] = k; + } + } + + /* + * Compute frecv[] and nfrecvmod counts on the diagonal processes. + */ + { + superlu_scope_t *scp = &grid->rscp; + +#if 1 + for (k = 0; k < nlb; ++k) mod_bit[k] = 0; + for (k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + if ( myrow == krow ) { + lk = LBi( k, grid ); /* local block number */ + kcol = PCOL( k, grid ); + if ( mycol != kcol && fmod[lk] ) + mod_bit[lk] = 1; /* contribution from off-diagonal */ + } + } + /*PrintInt10("mod_bit", nlb, mod_bit);*/ + + /* Every process receives the count, but it is only useful on the + diagonal processes. */ + MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, scp->comm ); + + for (k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + if ( myrow == krow ) { + lk = LBi( k, grid ); /* local block number */ + kcol = PCOL( k, grid ); + if ( mycol == kcol ) { /* diagonal process */ + nfrecvmod += frecv[lk]; + if ( !frecv[lk] && !fmod[lk] ) ++nleaf; + } + } + } + +#else /* old */ + + for (k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + if ( myrow == krow ) { + lk = LBi( k, grid ); /* Local block number. */ + kcol = PCOL( k, grid ); /* Root process in this row scope. */ + if ( mycol != kcol && fmod[lk] ) + i = 1; /* Contribution from non-diagonal process. */ + else i = 0; + MPI_Reduce( &i, &frecv[lk], 1, mpi_int_t, + MPI_SUM, kcol, scp->comm ); + if ( mycol == kcol ) { /* Diagonal process. */ + nfrecvmod += frecv[lk]; + if ( !frecv[lk] && !fmod[lk] ) ++nleaf; +#if ( DEBUGlevel>=2 ) + printf("(%2d) frecv[%4d] %2d\n", iam, k, frecv[lk]); + assert( frecv[lk] < Pc ); +#endif + } + } + } +#endif + } + + /* --------------------------------------------------------- + Solve the leaf nodes first by all the diagonal processes. + --------------------------------------------------------- */ +#if ( DEBUGlevel>=2 ) + printf("(%2d) nleaf %4d\n", iam, nleaf); +#endif + for (k = 0; k < nsupers && nleaf; ++k) { + krow = PROW( k, grid ); + kcol = PCOL( k, grid ); + if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ + knsupc = SuperSize( k ); + lk = LBi( k, grid ); + if ( !frecv[lk] && !fmod[lk] ) { + fmod[lk] = -1; /* Do not solve X[k] in the future. */ + ii = X_BLK( lk ); + lk = LBj( k, grid ); /* Local block number, column-wise. 
*/ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + nsupr = lsub[1]; +#ifdef _CRAY + STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#elif defined (USE_VENDOR_BLAS) + strsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); +#else + strsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#endif + /*stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;*/ + --nleaf; +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, k); +#endif + + /* + * Send Xk to process column Pc[k]. + */ + for (p = 0; p < Pr; ++p) + if ( fsendx_plist[lk][p] != EMPTY ) { + pi = PNUM( p, kcol, grid ); +#ifdef ISEND_IRECV + MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm, + &send_req[Llu->SolveMsgSent++]); +#else + MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, + pi, Xk, grid->comm ); +#endif +#if ( DEBUGlevel>=2 ) + printf("(%2d) Sent X[%2.0f] to P %2d\n", + iam, x[ii-XK_H], pi); +#endif + } + + /* + * Perform local block modifications: lsum[i] -= L_i,k * X[k] + */ + nb = lsub[0] - 1; + lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; + luptr = knsupc; /* Skip diagonal block L(k,k). */ + + slsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, + fmod, nb, lptr, luptr, xsup, grid, Llu, + send_req, stat); + } + } /* if diagonal process ... */ + } /* for k ... */ + + /* + * Compute the internal nodes asynchronously by all processes. + */ +#if ( DEBUGlevel>=2 ) + printf("(%2d) nfrecvx %4d, nfrecvmod %4d, nleaf %4d\n", + iam, nfrecvx, nfrecvmod, nleaf); +#endif + + while ( nfrecvx || nfrecvmod ) { /* While not finished. */ + + /* Receive a message. */ +#ifdef ISEND_IRECV + /* -MPI- FATAL: Remote protocol queue full */ + MPI_Irecv( recvbuf, maxrecvsz, MPI_FLOAT, MPI_ANY_SOURCE, + MPI_ANY_TAG, grid->comm, &recv_req ); + MPI_Wait( &recv_req, &status ); +#else + MPI_Recv( recvbuf, maxrecvsz, MPI_FLOAT, MPI_ANY_SOURCE, + MPI_ANY_TAG, grid->comm, &status ); +#endif + + k = *recvbuf; + +#if ( DEBUGlevel>=2 ) + printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); +#endif + + switch ( status.MPI_TAG ) { + case Xk: + --nfrecvx; + lk = LBj( k, grid ); /* Local block number, column-wise. */ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + if ( lsub ) { + nb = lsub[0]; + lptr = BC_HEADER; + luptr = 0; + knsupc = SuperSize( k ); + + /* + * Perform local block modifications: lsum[i] -= L_i,k * X[k] + */ + slsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k, + fmod, nb, lptr, luptr, xsup, grid, Llu, + send_req, stat); + } /* if lsub */ + + break; + + case LSUM: + --nfrecvmod; + lk = LBi( k, grid ); /* Local block number, row-wise. */ + ii = X_BLK( lk ); + knsupc = SuperSize( k ); + tempv = &recvbuf[LSUM_H]; + RHS_ITERATE(j) + for (i = 0; i < knsupc; ++i) + x[i + ii + j*knsupc] += tempv[i + j*knsupc]; + + if ( (--frecv[lk])==0 && fmod[lk]==0 ) { + fmod[lk] = -1; /* Do not solve X[k] in the future. */ + lk = LBj( k, grid ); /* Local block number, column-wise. 
*/ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + nsupr = lsub[1]; +#ifdef _CRAY + STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#elif defined (USE_VENDOR_BLAS) + strsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); +#else + strsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#endif + /*stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs;*/ +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, k); +#endif + + /* + * Send Xk to process column Pc[k]. + */ + kcol = PCOL( k, grid ); + for (p = 0; p < Pr; ++p) + if ( fsendx_plist[lk][p] != EMPTY ) { + pi = PNUM( p, kcol, grid ); +#ifdef ISEND_IRECV + MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm, + &send_req[Llu->SolveMsgSent++] ); +#else + MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm ); +#endif +#if ( DEBUGlevel>=2 ) + printf("(%2d) Sent X[%2.0f] to P %2d\n", + iam, x[ii-XK_H], pi); +#endif + } + + /* + * Perform local block modifications. + */ + nb = lsub[0] - 1; + lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; + luptr = knsupc; /* Skip diagonal block L(k,k). */ + + slsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, + fmod, nb, lptr, luptr, xsup, grid, Llu, + send_req, stat); + } /* if */ + + break; + +#if ( DEBUGlevel>=2 ) + default: + printf("(%2d) Recv'd wrong message tag %4d\n", iam, status.MPI_TAG); + break; +#endif + } /* switch */ + + } /* while not finished ... */ + + +#if ( PRNTlevel>=2 ) + t = SuperLU_timer_() - t; + if ( !iam ) printf(".. L-solve time\t%8.2f\n", t); + t = SuperLU_timer_(); +#endif + +#if ( DEBUGlevel>=2 ) + if ( !iam ) printf("\n.. After L-solve: y =\n"); + for (i = 0, k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + kcol = PCOL( k, grid ); + if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ + knsupc = SuperSize( k ); + lk = LBi( k, grid ); + ii = X_BLK( lk ); + for (j = 0; j < knsupc; ++j) + printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]); + } + MPI_Barrier( grid->comm ); + } +#endif + + SUPERLU_FREE(fmod); + SUPERLU_FREE(frecv); + SUPERLU_FREE(rtemp); + +#ifdef ISEND_IRECV + for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]); + Llu->SolveMsgSent = 0; +#endif + + + /*--------------------------------------------------- + * Back solve Ux = y. + * + * The Y components from the forward solve is already + * on the diagonal processes. + *---------------------------------------------------*/ + + /* Save the count to be altered so it can be used by + subsequent call to PSGSTRS1. */ + if ( !(bmod = intMalloc_dist(nlb)) ) + ABORT("Calloc fails for bmod[]."); + for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i]; + if ( !(brecv = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for brecv[]."); + Llu->brecv = brecv; + + /* + * Compute brecv[] and nbrecvmod counts on the diagonal processes. + */ + { + superlu_scope_t *scp = &grid->rscp; + +#if 1 + for (k = 0; k < nlb; ++k) mod_bit[k] = 0; + for (k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + if ( myrow == krow ) { + lk = LBi( k, grid ); /* Local block number. */ + kcol = PCOL( k, grid ); /* Root process in this row scope. */ + if ( mycol != kcol && bmod[lk] ) + mod_bit[lk] = 1; /* Contribution from off-diagonal */ + } + } + + /* Every process receives the count, but it is only useful on the + diagonal processes. 
*/ + MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, scp->comm ); + + for (k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + if ( myrow == krow ) { + lk = LBi( k, grid ); /* Local block number. */ + kcol = PCOL( k, grid ); /* Root process in this row scope. */ + if ( mycol == kcol ) { /* Diagonal process. */ + nbrecvmod += brecv[lk]; + if ( !brecv[lk] && !bmod[lk] ) ++nroot; +#if ( DEBUGlevel>=2 ) + printf("(%2d) brecv[%4d] %2d\n", iam, k, brecv[lk]); + assert( brecv[lk] < Pc ); +#endif + } + } + } + +#else + + for (k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + if ( myrow == krow ) { + lk = LBi( k, grid ); /* Local block number. */ + kcol = PCOL( k, grid ); /* Root process in this row scope. */ + if ( mycol != kcol && bmod[lk] ) + i = 1; /* Contribution from non-diagonal process. */ + else i = 0; + MPI_Reduce( &i, &brecv[lk], 1, mpi_int_t, + MPI_SUM, kcol, scp->comm ); + if ( mycol == kcol ) { /* Diagonal process. */ + nbrecvmod += brecv[lk]; + if ( !brecv[lk] && !bmod[lk] ) ++nroot; +#if ( DEBUGlevel>=2 ) + printf("(%2d) brecv[%4d] %2d\n", iam, k, brecv[lk]); + assert( brecv[lk] < Pc ); +#endif + } + } + } +#endif + } + + /* Re-initialize lsum to zero. Each block header is already in place. */ + for (k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + if ( myrow == krow ) { + knsupc = SuperSize( k ); + lk = LBi( k, grid ); + il = LSUM_BLK( lk ); + dest = &lsum[il]; + RHS_ITERATE(j) + for (i = 0; i < knsupc; ++i) dest[i + j*knsupc] = 0.0; + } + } + + /* Set up additional pointers for the index and value arrays of U. + nlb is the number of local block rows. */ + nub = CEILING( nsupers, Pc ); /* Number of local block columns. */ + if ( !(Urbs = (int_t *) intCalloc_dist(2*((size_t)nub))) ) + ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero + blocks in a block column. */ + Urbs1 = Urbs + nub; + if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) ) + ABORT("Malloc fails for Ucb_indptr[]"); + if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) ) + ABORT("Malloc fails for Ucb_valptr[]"); + + /* Count number of row blocks in a block column. + One pass of the skeleton graph of U. */ + for (lk = 0; lk < nlb; ++lk) { + usub = Ufstnz_br_ptr[lk]; + if ( usub ) { /* Not an empty block row. */ + /* usub[0] -- number of column blocks in this block row. */ +#if ( DEBUGlevel>=2 ) + Ublocks += usub[0]; +#endif + i = BR_HEADER; /* Pointer in index array. */ + for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */ + k = usub[i]; /* Global block number */ + ++Urbs[LBj(k,grid)]; + i += UB_DESCRIPTOR + SuperSize( k ); + } + } + } + + /* Set up the vertical linked lists for the row blocks. + One pass of the skeleton graph of U. */ + for (lb = 0; lb < nub; ++lb) + if ( Urbs[lb] ) { /* Not an empty block column. */ + if ( !(Ucb_indptr[lb] + = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) ) + ABORT("Malloc fails for Ucb_indptr[lb][]"); + if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) ) + ABORT("Malloc fails for Ucb_valptr[lb][]"); + } + for (lk = 0; lk < nlb; ++lk) { /* For each block row. */ + usub = Ufstnz_br_ptr[lk]; + if ( usub ) { /* Not an empty block row. */ + i = BR_HEADER; /* Pointer in index array. */ + j = 0; /* Pointer in nzval array. */ + for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */ + k = usub[i]; /* Global block number, column-wise. */ + ljb = LBj( k, grid ); /* Local block number, column-wise. 
*/ + Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk; + Ucb_indptr[ljb][Urbs1[ljb]].indpos = i; + Ucb_valptr[ljb][Urbs1[ljb]] = j; + ++Urbs1[ljb]; + j += usub[i+1]; + i += UB_DESCRIPTOR + SuperSize( k ); + } + } + } + +#if ( DEBUGlevel>=2 ) + for (p = 0; p < Pr*Pc; ++p) { + if (iam == p) { + printf("(%2d) .. Ublocks %d\n", iam, Ublocks); + for (lb = 0; lb < nub; ++lb) { + printf("(%2d) Local col %2d: # row blocks %2d\n", + iam, lb, Urbs[lb]); + if ( Urbs[lb] ) { + for (i = 0; i < Urbs[lb]; ++i) + printf("(%2d) .. row blk %2d:\ + lbnum %d, indpos %d, valpos %d\n", + iam, i, + Ucb_indptr[lb][i].lbnum, + Ucb_indptr[lb][i].indpos, + Ucb_valptr[lb][i]); + } + } + } + MPI_Barrier( grid->comm ); + } + for (p = 0; p < Pr*Pc; ++p) { + if ( iam == p ) { + printf("\n(%d) bsendx_plist[][]", iam); + for (lb = 0; lb < nub; ++lb) { + printf("\n(%d) .. local col %2d: ", iam, lb); + for (i = 0; i < Pr; ++i) + printf("%4d", bsendx_plist[lb][i]); + } + printf("\n"); + } + MPI_Barrier( grid->comm ); + } +#endif /* DEBUGlevel */ + + +#if ( PRNTlevel>=2 ) + t = SuperLU_timer_() - t; + if ( !iam) printf(".. Setup U-solve time\t%8.2f\n", t); + t = SuperLU_timer_(); +#endif + + /* + * Solve the roots first by all the diagonal processes. + */ +#if ( DEBUGlevel>=2 ) + printf("(%2d) nroot %4d\n", iam, nroot); +#endif + for (k = nsupers-1; k >= 0 && nroot; --k) { + krow = PROW( k, grid ); + kcol = PCOL( k, grid ); + if ( myrow == krow && mycol == kcol ) { /* Diagonal process. */ + knsupc = SuperSize( k ); + lk = LBi( k, grid ); /* Local block number, row-wise. */ + if ( !brecv[lk] && !bmod[lk] ) { + bmod[lk] = -1; /* Do not solve X[k] in the future. */ + ii = X_BLK( lk ); + lk = LBj( k, grid ); /* Local block number, column-wise */ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + nsupr = lsub[1]; +#ifdef _CRAY + STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#elif defined (USE_VENDOR_BLAS) + strsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); +#else + strsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#endif + /*stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs;*/ + --nroot; +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, k); +#endif + /* + * Send Xk to process column Pc[k]. + */ + for (p = 0; p < Pr; ++p) + if ( bsendx_plist[lk][p] != EMPTY ) { + pi = PNUM( p, kcol, grid ); +#ifdef ISEND_IRECV + MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm, + &send_req[Llu->SolveMsgSent++] ); +#else + MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm ); +#endif +#if ( DEBUGlevel>=2 ) + printf("(%2d) Sent X[%2.0f] to P %2d\n", + iam, x[ii-XK_H], pi); +#endif + } + + /* + * Perform local block modifications: lsum[i] -= U_i,k * X[k] + */ + if ( Urbs[lk] ) + slsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs, + Ucb_indptr, Ucb_valptr, xsup, grid, Llu, + send_req, stat); + } /* if root ... */ + } /* if diagonal process ... */ + } /* for k ... */ + + + /* + * Compute the internal nodes asynchronously by all processes. + */ + while ( nbrecvx || nbrecvmod ) { /* While not finished. */ + + /* Receive a message. */ + MPI_Recv( recvbuf, maxrecvsz, MPI_FLOAT, + MPI_ANY_SOURCE, MPI_ANY_TAG, grid->comm, &status ); + k = *recvbuf; + +#if ( DEBUGlevel>=2 ) + printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); +#endif + + switch ( status.MPI_TAG ) { + case Xk: + --nbrecvx; + lk = LBj( k, grid ); /* Local block number, column-wise. 
*/ + /* + * Perform local block modifications: + * lsum[i] -= U_i,k * X[k] + */ + slsum_bmod(lsum, x, &recvbuf[XK_H], nrhs, k, bmod, Urbs, + Ucb_indptr, Ucb_valptr, xsup, grid, Llu, + send_req, stat); + + break; + + case LSUM: + --nbrecvmod; + lk = LBi( k, grid ); /* Local block number, row-wise. */ + ii = X_BLK( lk ); + knsupc = SuperSize( k ); + tempv = &recvbuf[LSUM_H]; + RHS_ITERATE(j) + for (i = 0; i < knsupc; ++i) + x[i + ii + j*knsupc] += tempv[i + j*knsupc]; + + if ( !(--brecv[lk]) && !bmod[lk] ) { + bmod[lk] = -1; /* Do not solve X[k] in the future. */ + lk = LBj( k, grid ); /* Local block number, column-wise. */ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + nsupr = lsub[1]; +#ifdef _CRAY + STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#elif defined (USE_VENDOR_BLAS) + strsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); +#else + strsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#endif + /*stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs;*/ +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, k); +#endif + /* + * Send Xk to process column Pc[k]. + */ + kcol = PCOL( k, grid ); + for (p = 0; p < Pr; ++p) + if ( bsendx_plist[lk][p] != EMPTY ) { + pi = PNUM( p, kcol, grid ); +#ifdef ISEND_IRECV + MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm, + &send_req[Llu->SolveMsgSent++] ); +#else + MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm ); +#endif +#if ( DEBUGlevel>=2 ) + printf("(%2d) Sent X[%2.0f] to P %2d\n", + iam, x[ii - XK_H], pi); +#endif + } + + /* + * Perform local block modifications: + * lsum[i] -= U_i,k * X[k] + */ + if ( Urbs[lk] ) + slsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs, + Ucb_indptr, Ucb_valptr, xsup, grid, Llu, + send_req, stat); + } /* if becomes solvable */ + + break; + +#if ( DEBUGlevel>=2 ) + default: + printf("(%2d) Recv'd wrong message tag %4d\n", iam, status.MPI_TAG); + break; +#endif + + } /* switch */ + + } /* while not finished ... */ + +#if ( PRNTlevel>=2 ) + t = SuperLU_timer_() - t; + if ( !iam ) printf(".. U-solve time\t%8.2f\n", t); +#endif + + stat->utime[SOLVE] = SuperLU_timer_() - t; + + /* Deallocate storage. */ + + SUPERLU_FREE(lsum); + SUPERLU_FREE(recvbuf); + for (i = 0; i < nub; ++i) + if ( Urbs[i] ) { + SUPERLU_FREE(Ucb_indptr[i]); + SUPERLU_FREE(Ucb_valptr[i]); + } + SUPERLU_FREE(Ucb_indptr); + SUPERLU_FREE(Ucb_valptr); + SUPERLU_FREE(Urbs); + SUPERLU_FREE(bmod); + SUPERLU_FREE(brecv); +#ifdef ISEND_IRECV + for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]); + SUPERLU_FREE(send_req); +#endif + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit psgstrs1()"); +#endif + +} /* PSGSTRS1 */ diff --git a/SRC/psgstrs_Bglobal.c b/SRC/psgstrs_Bglobal.c new file mode 100644 index 00000000..eb972a16 --- /dev/null +++ b/SRC/psgstrs_Bglobal.c @@ -0,0 +1,1040 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Solves a system of distributed linear equations A*X = B with a general N-by-N matrix A using the LU factorization + * + *
+ * -- Distributed SuperLU routine (version 2.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 15, 2008
+ *
+ * Modified:
+ *     February 7, 2001    use MPI_Isend/MPI_Irecv
+ *     October 2, 2001     use MPI_Isend/MPI_Irecv with MPI_Test
+ *     October 15, 2008    use fewer MPI_Reduce
+ * 
+ */ + +#include "superlu_sdefs.h" + +#define ISEND_IRECV + +/* + * Function prototypes + */ +#ifdef _CRAY +fortran void STRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, float*, + float*, int*, float*, int*); +fortran void SGEMM(_fcd, _fcd, int*, int*, int*, float*, float*, + int*, float*, int*, float*, float*, int*); +_fcd ftcs1; +_fcd ftcs2; +_fcd ftcs3; +#endif +static void gather_diag_to_all(int_t, int_t, float [], Glu_persist_t *, + sLocalLU_t *, gridinfo_t *, int_t, int_t [], + int_t [], float [], int_t, float []); + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * psgstrs_Bglobal solves a system of distributed linear equations
+ * A*X = B with a general N-by-N matrix A using the LU factorization
+ * computed by psgstrf.
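+ * The solve proceeds in two distributed phases: a forward solve L*y = b,
+ * followed by a back solve U*x = y; both overlap computation with
+ * asynchronous point-to-point MPI communication.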
+ *
+ * Arguments
+ * =========
+ *
+ * n      (input) int (global)
+ *        The order of the system of linear equations.
+ *
+ * LUstruct (input) sLUstruct_t*
+ *        The distributed data structures storing L and U factors.
+ *        The L and U factors are obtained from psgstrf for
+ *        the possibly scaled and permuted matrix A.
+ *        See superlu_sdefs.h for the definition of 'sLUstruct_t'.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh. It contains the MPI communicator, the number
+ *        of process rows (NPROW), the number of process columns (NPCOL),
+ *        and my process rank. It is an input argument to all the
+ *        parallel routines.
+ *        Grid can be initialized by subroutine SUPERLU_GRIDINIT.
+ *        See superlu_defs.h for the definition of 'gridinfo_t'.
+ *
+ * B      (input/output) float*
+ *        On entry, the right-hand side matrix of the possibly equilibrated
+ *        and row permuted system.
+ *        On exit, the solution matrix of the possibly equilibrated
+ *        and row permuted system if info = 0;
+ *
+ *        NOTE: Currently, the N-by-NRHS matrix B must reside on all
+ *              processes when calling this routine.
+ *
+ * ldb    (input) int (global)
+ *        Leading dimension of matrix B.
+ *
+ * nrhs   (input) int (global)
+ *        Number of right-hand sides.
+ *
+ * stat   (output) SuperLUStat_t*
+ *        Record the statistics about the triangular solves.
+ *        See util.h for the definition of 'SuperLUStat_t'.
+ *
+ * info   (output) int*
+ *        = 0: successful exit
+ *        < 0: if info = -i, the i-th argument had an illegal value
+ * 
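+ *
+ * A minimal call sketch (illustrative only; it assumes the factorization,
+ * grid, and statistics structures have already been set up, e.g. by
+ * psgstrf, superlu_gridinit, and PStatInit):
+ *
+ *     int info;
+ *     psgstrs_Bglobal(n, &LUstruct, &grid, B, ldb, nrhs, &stat, &info);
+ *     ... on return, info = 0 and B holds the solution on every process,
+ *     or info = -i if the i-th argument had an illegal value.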
+ */ + +void +psgstrs_Bglobal(int_t n, sLUstruct_t *LUstruct, gridinfo_t *grid, + float *B, int_t ldb, int nrhs, + SuperLUStat_t *stat, int *info) +{ + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + sLocalLU_t *Llu = LUstruct->Llu; + float alpha = 1.0; + float *lsum; /* Local running sum of the updates to B-components */ + float *x; /* X component at step k. */ + float *lusup, *dest; + float *recvbuf, *tempv; + float *rtemp; /* Result of full matrix-vector multiply. */ + int_t **Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + int_t *Urbs, *Urbs1; /* Number of row blocks in each block column of U. */ + Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ + int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ + int_t kcol, krow, mycol, myrow; + int_t i, ii, il, j, jj, k, lb, ljb, lk, lptr, luptr; + int_t nb, nlb, nub, nsupers; + int_t *xsup, *lsub, *usub; + int_t *ilsum; /* Starting position of each supernode in lsum (LOCAL)*/ + int Pc, Pr, iam; + int knsupc, nsupr; + int ldalsum; /* Number of lsum entries locally owned. */ + int maxrecvsz, p, pi; + int_t **Lrowind_bc_ptr; + float **Lnzval_bc_ptr; + MPI_Status status; +#if defined (ISEND_IRECV) || defined (BSEND) + MPI_Request *send_req, recv_req; +#endif + + /*-- Counts used for L-solve --*/ + int_t *fmod; /* Modification count for L-solve. */ + int_t **fsendx_plist = Llu->fsendx_plist; + int_t nfrecvx = Llu->nfrecvx; /* Number of X components to be recv'd. */ + int_t *frecv; /* Count of modifications to be recv'd from + processes in this row. */ + int_t nfrecvmod = 0; /* Count of total modifications to be recv'd. */ + int_t nleaf = 0, nroot = 0; + + /*-- Counts used for U-solve --*/ + int_t *bmod; /* Modification count for L-solve. */ + int_t **bsendx_plist = Llu->bsendx_plist; + int_t nbrecvx = Llu->nbrecvx; /* Number of X components to be recv'd. */ + int_t *brecv; /* Count of modifications to be recv'd from + processes in this row. */ + int_t nbrecvmod = 0; /* Count of total modifications to be recv'd. */ + double t; +#if ( DEBUGlevel>=2 ) + int_t Ublocks = 0; +#endif + + int_t *mod_bit = Llu->mod_bit; /* flag contribution from each row block */ + + t = SuperLU_timer_(); + + /* Test input parameters. */ + *info = 0; + if ( n < 0 ) *info = -1; + else if ( nrhs < 0 ) *info = -9; + if ( *info ) { + pxerr_dist("PSGSTRS_BGLOBAL", grid, -*info); + return; + } + + /* + * Initialization. + */ + iam = grid->iam; + Pc = grid->npcol; + Pr = grid->nprow; + myrow = MYROW( iam, grid ); + mycol = MYCOL( iam, grid ); + nsupers = Glu_persist->supno[n-1] + 1; + xsup = Glu_persist->xsup; + Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + nlb = CEILING( nsupers, Pr ); /* Number of local block rows. */ + stat->ops[SOLVE] = 0.0; + Llu->SolveMsgSent = 0; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter psgstrs_Bglobal()"); +#endif + + /* Save the count to be altered so it can be used by + subsequent call to PDGSTRS_BGLOBAL. 
*/ + if ( !(fmod = intMalloc_dist(nlb)) ) + ABORT("Calloc fails for fmod[]."); + for (i = 0; i < nlb; ++i) fmod[i] = Llu->fmod[i]; + if ( !(frecv = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for frecv[]."); + Llu->frecv = frecv; + +#if defined (ISEND_IRECV) || defined (BSEND) + k = SUPERLU_MAX( Llu->nfsendx, Llu->nbsendx ) + nlb; + if ( !(send_req = (MPI_Request*) SUPERLU_MALLOC(k*sizeof(MPI_Request))) ) + ABORT("Malloc fails for send_req[]."); +#endif + +#ifdef _CRAY + ftcs1 = _cptofcd("L", strlen("L")); + ftcs2 = _cptofcd("N", strlen("N")); + ftcs3 = _cptofcd("U", strlen("U")); +#endif + + + /* Obtain ilsum[] and ldalsum for process column 0. */ + ilsum = Llu->ilsum; + ldalsum = Llu->ldalsum; + + /* Allocate working storage. */ + knsupc = sp_ienv_dist(3); + maxrecvsz = knsupc * nrhs + SUPERLU_MAX( XK_H, LSUM_H ); + if ( !(lsum = floatCalloc_dist(((size_t)ldalsum) * nrhs + + nlb * LSUM_H)) ) + ABORT("Calloc fails for lsum[]."); + if ( !(x = floatMalloc_dist(((size_t)ldalsum) * nrhs + + nlb * XK_H)) ) + ABORT("Malloc fails for x[]."); + if ( !(recvbuf = floatMalloc_dist(maxrecvsz)) ) + ABORT("Malloc fails for recvbuf[]."); + if ( !(rtemp = floatCalloc_dist(maxrecvsz)) ) + ABORT("Malloc fails for rtemp[]."); + + + /*--------------------------------------------------- + * Forward solve Ly = b. + *---------------------------------------------------*/ + + /* + * Copy B into X on the diagonal processes. + */ + ii = 0; + for (k = 0; k < nsupers; ++k) { + knsupc = SuperSize( k ); + krow = PROW( k, grid ); + if ( myrow == krow ) { + lk = LBi( k, grid ); /* Local block number. */ + il = LSUM_BLK( lk ); + lsum[il - LSUM_H] = k; /* Block number prepended in the header. */ + kcol = PCOL( k, grid ); + if ( mycol == kcol ) { /* Diagonal process. */ + jj = X_BLK( lk ); + x[jj - XK_H] = k; /* Block number prepended in the header. */ + RHS_ITERATE(j) + for (i = 0; i < knsupc; ++i) /* X is stored in blocks. */ + x[i + jj + j*knsupc] = B[i + ii + j*ldb]; + } + } + ii += knsupc; + } + + /* + * Compute frecv[] and nfrecvmod counts on the diagonal processes. + */ + { + superlu_scope_t *scp = &grid->rscp; + +#if 1 + for (k = 0; k < nlb; ++k) mod_bit[k] = 0; + for (k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + if ( myrow == krow ) { + lk = LBi( k, grid ); /* Local block number. */ + kcol = PCOL( k, grid ); + if ( mycol != kcol && fmod[lk] ) + mod_bit[lk] = 1; /* contribution from off-diagonal */ + } + } + + /* Every process receives the count, but it is only useful on the + diagonal processes. */ + MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, scp->comm ); + + for (k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + if ( myrow == krow ) { + lk = LBi( k, grid ); /* Local block number. */ + kcol = PCOL( k, grid ); + if ( mycol == kcol ) { /* Diagonal process. */ + nfrecvmod += frecv[lk]; + if ( !frecv[lk] && !fmod[lk] ) ++nleaf; + } + } + } + +#else /* old */ + + for (k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + if ( myrow == krow ) { + lk = LBi( k, grid ); /* Local block number. */ + kcol = PCOL( k, grid ); /* Root process in this row scope. */ + if ( mycol != kcol && fmod[lk] ) + i = 1; /* Contribution from non-diagonal process. */ + else i = 0; + MPI_Reduce( &i, &frecv[lk], 1, mpi_int_t, + MPI_SUM, kcol, scp->comm ); + if ( mycol == kcol ) { /* Diagonal process. 
*/ + nfrecvmod += frecv[lk]; + if ( !frecv[lk] && !fmod[lk] ) ++nleaf; +#if ( DEBUGlevel>=2 ) + printf("(%2d) frecv[%4d] %2d\n", iam, k, frecv[lk]); + assert( frecv[lk] < Pc ); +#endif + } + } + } +#endif + } + + /* --------------------------------------------------------- + Solve the leaf nodes first by all the diagonal processes. + --------------------------------------------------------- */ +#if ( DEBUGlevel>=2 ) + printf("(%2d) nleaf %4d\n", iam, nleaf); +#endif + for (k = 0; k < nsupers && nleaf; ++k) { + krow = PROW( k, grid ); + kcol = PCOL( k, grid ); + if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ + knsupc = SuperSize( k ); + lk = LBi( k, grid ); + if ( frecv[lk]==0 && fmod[lk]==0 ) { + fmod[lk] = -1; /* Do not solve X[k] in the future. */ + ii = X_BLK( lk ); + lk = LBj( k, grid ); /* Local block number, column-wise. */ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + nsupr = lsub[1]; +#ifdef _CRAY + STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#elif defined (USE_VENDOR_BLAS) + strsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); +#else + strsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#endif + stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs; + --nleaf; +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, k); +#endif + + /* + * Send Xk to process column Pc[k]. + */ + for (p = 0; p < Pr; ++p) { + if ( fsendx_plist[lk][p] != EMPTY ) { + pi = PNUM( p, kcol, grid ); +#ifdef ISEND_IRECV + MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm, + &send_req[Llu->SolveMsgSent++]); +#else +#ifdef BSEND + MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm ); +#else + + MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, + pi, Xk, grid->comm ); +#endif +#endif +#if ( DEBUGlevel>=2 ) + printf("(%2d) Sent X[%2.0f] to P %2d\n", + iam, x[ii-XK_H], pi); +#endif + } + } + /* + * Perform local block modifications: lsum[i] -= L_i,k * X[k] + */ + nb = lsub[0] - 1; + lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; + luptr = knsupc; /* Skip diagonal block L(k,k). */ + + slsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, + fmod, nb, lptr, luptr, xsup, grid, Llu, + send_req,stat); + } + } /* if diagonal process ... */ + } /* for k ... */ + + /* ----------------------------------------------------------- + Compute the internal nodes asynchronously by all processes. + ----------------------------------------------------------- */ +#if ( DEBUGlevel>=2 ) + printf("(%2d) nfrecvx %4d, nfrecvmod %4d, nleaf %4d\n", + iam, nfrecvx, nfrecvmod, nleaf); +#endif + + while ( nfrecvx || nfrecvmod ) { /* While not finished. */ + + /* Receive a message. */ +#ifdef ISEND_IRECV + /* -MPI- FATAL: Remote protocol queue full */ + MPI_Irecv( recvbuf, maxrecvsz, MPI_FLOAT, MPI_ANY_SOURCE, + MPI_ANY_TAG, grid->comm, &recv_req ); + MPI_Wait( &recv_req, &status ); +#else + MPI_Recv( recvbuf, maxrecvsz, MPI_FLOAT, MPI_ANY_SOURCE, + MPI_ANY_TAG, grid->comm, &status ); +#endif + + k = *recvbuf; + + + +#if ( DEBUGlevel>=2 ) + printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); +#endif + + switch ( status.MPI_TAG ) { + case Xk: + --nfrecvx; + lk = LBj( k, grid ); /* Local block number, column-wise. 
*/ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + if ( lsub ) { + nb = lsub[0]; + lptr = BC_HEADER; + luptr = 0; + knsupc = SuperSize( k ); + + /* + * Perform local block modifications: lsum[i] -= L_i,k * X[k] + */ + slsum_fmod(lsum, x, &recvbuf[XK_H], rtemp, nrhs, knsupc, k, + fmod, nb, lptr, luptr, xsup, grid, Llu, + send_req, stat); + } /* if lsub */ + + break; + + case LSUM: /* Receiver must be a diagonal process */ + --nfrecvmod; + lk = LBi( k, grid ); /* Local block number, row-wise. */ + ii = X_BLK( lk ); + knsupc = SuperSize( k ); + tempv = &recvbuf[LSUM_H]; + RHS_ITERATE(j) + for (i = 0; i < knsupc; ++i) + x[i + ii + j*knsupc] += tempv[i + j*knsupc]; + + if ( (--frecv[lk])==0 && fmod[lk]==0 ) { + fmod[lk] = -1; /* Do not solve X[k] in the future. */ + lk = LBj( k, grid ); /* Local block number, column-wise. */ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + nsupr = lsub[1]; +#ifdef _CRAY + STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#elif defined (USE_VENDOR_BLAS) + strsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); +#else + strsm_("L", "L", "N", "U", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#endif + stat->ops[SOLVE] += knsupc * (knsupc - 1) * nrhs; + +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, k); +#endif + + /* + * Send Xk to process column Pc[k]. + */ + kcol = PCOL( k, grid ); + for (p = 0; p < Pr; ++p) { + if ( fsendx_plist[lk][p] != EMPTY ) { + pi = PNUM( p, kcol, grid ); +#ifdef ISEND_IRECV + MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm, + &send_req[Llu->SolveMsgSent++]); +#else +#ifdef BSEND + MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm ); +#else + MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm ); +#endif +#endif +#if ( DEBUGlevel>=2 ) + printf("(%2d) Sent X[%2.0f] to P %2d\n", + iam, x[ii-XK_H], pi); +#endif + } + } + /* + * Perform local block modifications. + */ + nb = lsub[0] - 1; + lptr = BC_HEADER + LB_DESCRIPTOR + knsupc; + luptr = knsupc; /* Skip diagonal block L(k,k). */ + + slsum_fmod(lsum, x, &x[ii], rtemp, nrhs, knsupc, k, + fmod, nb, lptr, luptr, xsup, grid, Llu, + send_req, stat); + } /* if */ + + break; + +#if ( DEBUGlevel>=2 ) + default: + printf("(%2d) Recv'd wrong message tag %4d\n", iam, status.MPI_TAG); + break; +#endif + } /* switch */ + + } /* while not finished ... */ + + +#if ( PRNTlevel>=2 ) + t = SuperLU_timer_() - t; + if ( !iam ) printf(".. L-solve time\t%8.2f\n", t); + t = SuperLU_timer_(); +#endif + +#if ( DEBUGlevel>=2 ) + printf("\n(%d) .. After L-solve: y =\n", iam); + for (i = 0, k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + kcol = PCOL( k, grid ); + if ( myrow == krow && mycol == kcol ) { /* Diagonal process */ + knsupc = SuperSize( k ); + lk = LBi( k, grid ); + ii = X_BLK( lk ); + for (j = 0; j < knsupc; ++j) + printf("\t(%d)\t%4d\t%.10f\n", iam, xsup[k]+j, x[ii+j]); + } + MPI_Barrier( grid->comm ); + } +#endif + + SUPERLU_FREE(fmod); + SUPERLU_FREE(frecv); + SUPERLU_FREE(rtemp); + +#ifdef ISEND_IRECV + for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]); + Llu->SolveMsgSent = 0; +#endif + + + /*--------------------------------------------------- + * Back solve Ux = y. + * + * The Y components from the forward solve is already + * on the diagonal processes. 
+ *---------------------------------------------------*/ + + /* Save the count to be altered so it can be used by + subsequent call to PDGSTRS_BGLOBAL. */ + if ( !(bmod = intMalloc_dist(nlb)) ) + ABORT("Calloc fails for bmod[]."); + for (i = 0; i < nlb; ++i) bmod[i] = Llu->bmod[i]; + if ( !(brecv = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for brecv[]."); + Llu->brecv = brecv; + + /* + * Compute brecv[] and nbrecvmod counts on the diagonal processes. + */ + { + superlu_scope_t *scp = &grid->rscp; + +#if 1 + for (k = 0; k < nlb; ++k) mod_bit[k] = 0; + for (k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + if ( myrow == krow ) { + lk = LBi( k, grid ); /* Local block number. */ + kcol = PCOL( k, grid ); /* Root process in this row scope. */ + if ( mycol != kcol && bmod[lk] ) + mod_bit[lk] = 1; /* Contribution from off-diagonal */ + } + } + + /* Every process receives the count, but it is only useful on the + diagonal processes. */ + MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, scp->comm ); + + for (k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + if ( myrow == krow ) { + lk = LBi( k, grid ); /* Local block number. */ + kcol = PCOL( k, grid ); /* Root process in this row scope. */ + if ( mycol == kcol ) { /* Diagonal process. */ + nbrecvmod += brecv[lk]; + if ( !brecv[lk] && !bmod[lk] ) ++nroot; +#if ( DEBUGlevel>=2 ) + printf("(%2d) brecv[%4d] %2d\n", iam, k, brecv[lk]); + assert( brecv[lk] < Pc ); +#endif + } + } + } + +#else /* old */ + + for (k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + if ( myrow == krow ) { + lk = LBi( k, grid ); /* Local block number. */ + kcol = PCOL( k, grid ); /* Root process in this row scope. */ + if ( mycol != kcol && bmod[lk] ) + i = 1; /* Contribution from non-diagonal process. */ + else i = 0; + MPI_Reduce( &i, &brecv[lk], 1, mpi_int_t, + MPI_SUM, kcol, scp->comm ); + if ( mycol == kcol ) { /* Diagonal process. */ + nbrecvmod += brecv[lk]; + if ( !brecv[lk] && !bmod[lk] ) ++nroot; +#if ( DEBUGlevel>=2 ) + printf("(%2d) brecv[%4d] %2d\n", iam, k, brecv[lk]); + assert( brecv[lk] < Pc ); +#endif + } + } + } +#endif + } + + /* Re-initialize lsum to zero. Each block header is already in place. */ + for (k = 0; k < nsupers; ++k) { + krow = PROW( k, grid ); + if ( myrow == krow ) { + knsupc = SuperSize( k ); + lk = LBi( k, grid ); + il = LSUM_BLK( lk ); + dest = &lsum[il]; + RHS_ITERATE(j) + for (i = 0; i < knsupc; ++i) dest[i + j*knsupc] = 0.0; + } + } + + /* Set up additional pointers for the index and value arrays of U. + nub is the number of local block columns. */ + nub = CEILING( nsupers, Pc ); /* Number of local block columns. */ + if ( !(Urbs = (int_t *) intCalloc_dist(2*((size_t)nub))) ) + ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero + blocks in a block column. */ + Urbs1 = Urbs + nub; + if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) ) + ABORT("Malloc fails for Ucb_indptr[]"); + if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) ) + ABORT("Malloc fails for Ucb_valptr[]"); + + /* Count number of row blocks in a block column. + One pass of the skeleton graph of U. */ + for (lk = 0; lk < nlb; ++lk) { + usub = Ufstnz_br_ptr[lk]; + if ( usub ) { /* Not an empty block row. */ + /* usub[0] -- number of column blocks in this block row. */ +#if ( DEBUGlevel>=2 ) + Ublocks += usub[0]; +#endif + i = BR_HEADER; /* Pointer in index array. */ + for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. 
*/ + k = usub[i]; /* Global block number */ + ++Urbs[LBj(k,grid)]; + i += UB_DESCRIPTOR + SuperSize( k ); + } + } + } + + /* Set up the vertical linked lists for the row blocks. + One pass of the skeleton graph of U. */ + for (lb = 0; lb < nub; ++lb) { + if ( Urbs[lb] ) { /* Not an empty block column. */ + if ( !(Ucb_indptr[lb] + = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) ) + ABORT("Malloc fails for Ucb_indptr[lb][]"); + if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) ) + ABORT("Malloc fails for Ucb_valptr[lb][]"); + } + } + for (lk = 0; lk < nlb; ++lk) { /* For each block row. */ + usub = Ufstnz_br_ptr[lk]; + if ( usub ) { /* Not an empty block row. */ + i = BR_HEADER; /* Pointer in index array. */ + j = 0; /* Pointer in nzval array. */ + for (lb = 0; lb < usub[0]; ++lb) { /* For all column blocks. */ + k = usub[i]; /* Global block number, column-wise. */ + ljb = LBj( k, grid ); /* Local block number, column-wise. */ + Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk; + Ucb_indptr[ljb][Urbs1[ljb]].indpos = i; + Ucb_valptr[ljb][Urbs1[ljb]] = j; + ++Urbs1[ljb]; + j += usub[i+1]; + i += UB_DESCRIPTOR + SuperSize( k ); + } + } + } + +#if ( DEBUGlevel>=2 ) + for (p = 0; p < Pr*Pc; ++p) { + if (iam == p) { + printf("(%2d) .. Ublocks %d\n", iam, Ublocks); + for (lb = 0; lb < nub; ++lb) { + printf("(%2d) Local col %2d: # row blocks %2d\n", + iam, lb, Urbs[lb]); + if ( Urbs[lb] ) { + for (i = 0; i < Urbs[lb]; ++i) + printf("(%2d) .. row blk %2d:\ + lbnum %d, indpos %d, valpos %d\n", + iam, i, + Ucb_indptr[lb][i].lbnum, + Ucb_indptr[lb][i].indpos, + Ucb_valptr[lb][i]); + } + } + } + MPI_Barrier( grid->comm ); + } + for (p = 0; p < Pr*Pc; ++p) { + if ( iam == p ) { + printf("\n(%d) bsendx_plist[][]", iam); + for (lb = 0; lb < nub; ++lb) { + printf("\n(%d) .. local col %2d: ", iam, lb); + for (i = 0; i < Pr; ++i) + printf("%4d", bsendx_plist[lb][i]); + } + printf("\n"); + } + MPI_Barrier( grid->comm ); + } +#endif /* DEBUGlevel */ + + +#if ( PRNTlevel>=2 ) + t = SuperLU_timer_() - t; + if ( !iam) printf(".. Setup U-solve time\t%8.2f\n", t); + t = SuperLU_timer_(); +#endif + + /* + * Solve the roots first by all the diagonal processes. + */ +#if ( DEBUGlevel>=2 ) + printf("(%2d) nroot %4d\n", iam, nroot); +#endif + for (k = nsupers-1; k >= 0 && nroot; --k) { + krow = PROW( k, grid ); + kcol = PCOL( k, grid ); + if ( myrow == krow && mycol == kcol ) { /* Diagonal process. */ + knsupc = SuperSize( k ); + lk = LBi( k, grid ); /* Local block number, row-wise. */ + if ( brecv[lk]==0 && bmod[lk]==0 ) { + bmod[lk] = -1; /* Do not solve X[k] in the future. */ + ii = X_BLK( lk ); + lk = LBj( k, grid ); /* Local block number, column-wise */ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + nsupr = lsub[1]; +#ifdef _CRAY + STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#elif defined (USE_VENDOR_BLAS) + strsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); +#else + strsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#endif + stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs; + --nroot; +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, k); +#endif + /* + * Send Xk to process column Pc[k]. 
+ */ + for (p = 0; p < Pr; ++p) { + if ( bsendx_plist[lk][p] != EMPTY ) { + pi = PNUM( p, kcol, grid ); +#ifdef ISEND_IRECV + MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm, + &send_req[Llu->SolveMsgSent++]); +#else +#ifdef BSEND + MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm ); +#else + MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm ); +#endif +#endif +#if ( DEBUGlevel>=2 ) + printf("(%2d) Sent X[%2.0f] to P %2d\n", + iam, x[ii-XK_H], pi); +#endif + } + } + /* + * Perform local block modifications: lsum[i] -= U_i,k * X[k] + */ + if ( Urbs[lk] ) + slsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs, + Ucb_indptr, Ucb_valptr, xsup, grid, Llu, + send_req, stat); + } /* if root ... */ + } /* if diagonal process ... */ + } /* for k ... */ + + + /* + * Compute the internal nodes asynchronously by all processes. + */ + while ( nbrecvx || nbrecvmod ) { /* While not finished. */ + + /* Receive a message. */ + MPI_Recv( recvbuf, maxrecvsz, MPI_FLOAT, MPI_ANY_SOURCE, + MPI_ANY_TAG, grid->comm, &status ); + + k = *recvbuf; + +#if ( DEBUGlevel>=2 ) + printf("(%2d) Recv'd block %d, tag %2d\n", iam, k, status.MPI_TAG); +#endif + + switch ( status.MPI_TAG ) { + case Xk: + --nbrecvx; + lk = LBj( k, grid ); /* Local block number, column-wise. */ + /* + * Perform local block modifications: + * lsum[i] -= U_i,k * X[k] + */ + slsum_bmod(lsum, x, &recvbuf[XK_H], nrhs, k, bmod, Urbs, + Ucb_indptr, Ucb_valptr, xsup, grid, Llu, + send_req, stat); + + break; + + case LSUM: /* Receiver must be a diagonal process */ + --nbrecvmod; + lk = LBi( k, grid ); /* Local block number, row-wise. */ + ii = X_BLK( lk ); + knsupc = SuperSize( k ); + tempv = &recvbuf[LSUM_H]; + RHS_ITERATE(j) + for (i = 0; i < knsupc; ++i) + x[i + ii + j*knsupc] += tempv[i + j*knsupc]; + + if ( (--brecv[lk])==0 && bmod[lk]==0 ) { + bmod[lk] = -1; /* Do not solve X[k] in the future. */ + lk = LBj( k, grid ); /* Local block number, column-wise. */ + lsub = Lrowind_bc_ptr[lk]; + lusup = Lnzval_bc_ptr[lk]; + nsupr = lsub[1]; +#ifdef _CRAY + STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#elif defined (USE_VENDOR_BLAS) + strsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc, 1, 1, 1, 1); +#else + strsm_("L", "U", "N", "N", &knsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &knsupc); +#endif + stat->ops[SOLVE] += knsupc * (knsupc + 1) * nrhs; +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, k); +#endif + /* + * Send Xk to process column Pc[k]. 
+ */ + kcol = PCOL( k, grid ); + for (p = 0; p < Pr; ++p) { + if ( bsendx_plist[lk][p] != EMPTY ) { + pi = PNUM( p, kcol, grid ); +#ifdef ISEND_IRECV + MPI_Isend( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm, + &send_req[Llu->SolveMsgSent++] ); +#else +#ifdef BSEND + MPI_Bsend( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm ); +#else + MPI_Send( &x[ii - XK_H], knsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm ); +#endif +#endif +#if ( DEBUGlevel>=2 ) + printf("(%2d) Sent X[%2.0f] to P %2d\n", + iam, x[ii - XK_H], pi); +#endif + } + } + /* + * Perform local block modifications: + * lsum[i] -= U_i,k * X[k] + */ + if ( Urbs[lk] ) + slsum_bmod(lsum, x, &x[ii], nrhs, k, bmod, Urbs, + Ucb_indptr, Ucb_valptr, xsup, grid, Llu, + send_req, stat); + } /* if becomes solvable */ + + break; + +#if ( DEBUGlevel>=2 ) + default: + printf("(%2d) Recv'd wrong message tag %4d\n", iam, status.MPI_TAG); + break; +#endif + + } /* switch */ + + } /* while not finished ... */ + +#if ( PRNTlevel>=2 ) + t = SuperLU_timer_() - t; + if ( !iam ) printf(".. U-solve time\t%8.2f\n", t); +#endif + + + /* Copy the solution X into B (on all processes). */ + { + int_t num_diag_procs, *diag_procs, *diag_len; + float *work; + + get_diag_procs(n, Glu_persist, grid, &num_diag_procs, + &diag_procs, &diag_len); + jj = diag_len[0]; + for (j = 1; j < num_diag_procs; ++j) jj = SUPERLU_MAX(jj, diag_len[j]); + if ( !(work = floatMalloc_dist(((size_t)jj)*nrhs)) ) + ABORT("Malloc fails for work[]"); + gather_diag_to_all(n, nrhs, x, Glu_persist, Llu, + grid, num_diag_procs, diag_procs, diag_len, + B, ldb, work); + SUPERLU_FREE(diag_procs); + SUPERLU_FREE(diag_len); + SUPERLU_FREE(work); + } + + /* Deallocate storage. */ + + SUPERLU_FREE(lsum); + SUPERLU_FREE(x); + SUPERLU_FREE(recvbuf); + for (i = 0; i < nub; ++i) + if ( Urbs[i] ) { + SUPERLU_FREE(Ucb_indptr[i]); + SUPERLU_FREE(Ucb_valptr[i]); + } + SUPERLU_FREE(Ucb_indptr); + SUPERLU_FREE(Ucb_valptr); + SUPERLU_FREE(Urbs); + SUPERLU_FREE(bmod); + SUPERLU_FREE(brecv); +#ifdef ISEND_IRECV + for (i = 0; i < Llu->SolveMsgSent; ++i) MPI_Request_free(&send_req[i]); + SUPERLU_FREE(send_req); +#endif +#ifdef BSEND + SUPERLU_FREE(send_req); +#endif + + stat->utime[SOLVE] = SuperLU_timer_() - t; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit psgstrs_Bglobal()"); +#endif + +} /* PSGSTRS_BGLOBAL */ + + +/* + * Gather the components of x vector on the diagonal processes + * onto all processes, and combine them into the global vector y. + */ +static void +gather_diag_to_all(int_t n, int_t nrhs, float x[], + Glu_persist_t *Glu_persist, sLocalLU_t *Llu, + gridinfo_t *grid, int_t num_diag_procs, + int_t diag_procs[], int_t diag_len[], + float y[], int_t ldy, float work[]) +{ + int_t i, ii, j, k, lk, lwork, nsupers, p; + int_t *ilsum, *xsup; + int iam, knsupc, pkk; + float *x_col, *y_col; + + iam = grid->iam; + nsupers = Glu_persist->supno[n-1] + 1; + xsup = Glu_persist->xsup; + ilsum = Llu->ilsum; + + for (p = 0; p < num_diag_procs; ++p) { + pkk = diag_procs[p]; + if ( iam == pkk ) { + /* Copy x vector into a buffer. 
*/ + lwork = 0; + for (k = p; k < nsupers; k += num_diag_procs) { + knsupc = SuperSize( k ); + lk = LBi( k, grid ); + ii = X_BLK( lk ); /*ilsum[lk] + (lk+1)*XK_H;*/ + x_col = &x[ii]; + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < knsupc; ++i) work[i+lwork] = x_col[i]; + lwork += knsupc; + x_col += knsupc; + } + } + MPI_Bcast( work, lwork, MPI_FLOAT, pkk, grid->comm ); + } else { + MPI_Bcast( work, diag_len[p]*nrhs, MPI_FLOAT, pkk, grid->comm ); + } + /* Scatter work[] into global y vector. */ + lwork = 0; + for (k = p; k < nsupers; k += num_diag_procs) { + knsupc = SuperSize( k ); + ii = FstBlockC( k ); + y_col = &y[ii]; + for (j = 0; j < nrhs; ++j) { + for (i = 0; i < knsupc; ++i) y_col[i] = work[i+lwork]; + lwork += knsupc; + y_col += ldy; + } + } + } +} /* GATHER_DIAG_TO_ALL */ + diff --git a/SRC/psgstrs_lsum.c b/SRC/psgstrs_lsum.c new file mode 100644 index 00000000..fe0044f3 --- /dev/null +++ b/SRC/psgstrs_lsum.c @@ -0,0 +1,2138 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Perform local block modifications: lsum[i] -= L_i,k * X[k] + * + *
+ * -- Distributed SuperLU routine (version 6.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ *
+ * Modified:
+ *     February 7, 2001    use MPI_Isend/MPI_Irecv
+ *     October 2, 2001     use MPI_Isend/MPI_Irecv with MPI_Test
+ *     February 8, 2019    version 6.1.1
+ * 
+ */

+#include "superlu_sdefs.h"
+#include "superlu_defs.h"
+
+#ifndef CACHELINE
+#define CACHELINE 64  /* bytes, Xeon Phi KNL, Cori Haswell, Edison */
+#endif
+
+#define ISEND_IRECV
+
+/*
+ * Function prototypes
+ */
+#ifdef _CRAY
+fortran void STRSM(_fcd, _fcd, _fcd, _fcd, int*, int*, float*,
+		   float*, int*, float*, int*);
+fortran void SGEMM(_fcd, _fcd, int*, int*, int*, float*, float*,
+		   int*, float*, int*, float*, float*, int*);
+_fcd ftcs1;
+_fcd ftcs2;
+_fcd ftcs3;
+#endif
+
+/************************************************************************/
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *   Perform local block modifications: lsum[i] -= L_i,k * X[k].
+ * 
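+ *
+ * In outline (notation only, not code): for each off-diagonal block
+ * L(i,k) this routine computes one dense GEMM per block row,
+ *
+ *     rtemp(1:nbrow, 1:nrhs) = L(i,k) * X(k),
+ *
+ * then scatters rtemp into lsum using the row indices stored in lsub[],
+ * and forwards lsum (or solves X[i]) once the block's fmod count
+ * drops to zero.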
+ */ +void slsum_fmod +/************************************************************************/ +( + float *lsum, /* Sum of local modifications. */ + float *x, /* X array (local) */ + float *xk, /* X[k]. */ + float *rtemp, /* Result of full matrix-vector multiply. */ + int nrhs, /* Number of right-hand sides. */ + int knsupc, /* Size of supernode k. */ + int_t k, /* The k-th component of X. */ + int_t *fmod, /* Modification count for L-solve. */ + int_t nlb, /* Number of L blocks. */ + int_t lptr, /* Starting position in lsub[*]. */ + int_t luptr, /* Starting position in lusup[*]. */ + int_t *xsup, + gridinfo_t *grid, + sLocalLU_t *Llu, + MPI_Request send_req[], /* input/output */ + SuperLUStat_t *stat +) +{ + float alpha = 1.0, beta = 0.0; + float *lusup, *lusup1; + float *dest; + int iam, iknsupc, myrow, nbrow, nsupr, nsupr1, p, pi; + int_t i, ii, ik, il, ikcol, irow, j, lb, lk, lib, rel; + int_t *lsub, *lsub1, nlb1, lptr1, luptr1; + int_t *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum. */ + int_t *frecv = Llu->frecv; + int_t **fsendx_plist = Llu->fsendx_plist; + MPI_Status status; + int test_flag; + +#if ( PROFlevel>=1 ) + double t1, t2; + float msg_vol = 0, msg_cnt = 0; +#endif +#if ( PROFlevel>=1 ) + TIC(t1); +#endif + + iam = grid->iam; + myrow = MYROW( iam, grid ); + lk = LBj( k, grid ); /* Local block number, column-wise. */ + lsub = Llu->Lrowind_bc_ptr[lk]; + lusup = Llu->Lnzval_bc_ptr[lk]; + nsupr = lsub[1]; + + for (lb = 0; lb < nlb; ++lb) { + ik = lsub[lptr]; /* Global block number, row-wise. */ + nbrow = lsub[lptr+1]; +#ifdef _CRAY + SGEMM( ftcs2, ftcs2, &nbrow, &nrhs, &knsupc, + &alpha, &lusup[luptr], &nsupr, xk, + &knsupc, &beta, rtemp, &nbrow ); +#elif defined (USE_VENDOR_BLAS) + sgemm_( "N", "N", &nbrow, &nrhs, &knsupc, + &alpha, &lusup[luptr], &nsupr, xk, + &knsupc, &beta, rtemp, &nbrow, 1, 1 ); +#else + sgemm_( "N", "N", &nbrow, &nrhs, &knsupc, + &alpha, &lusup[luptr], &nsupr, xk, + &knsupc, &beta, rtemp, &nbrow ); +#endif + stat->ops[SOLVE] += 2 * nbrow * nrhs * knsupc + nbrow * nrhs; + + lk = LBi( ik, grid ); /* Local block number, row-wise. */ + iknsupc = SuperSize( ik ); + il = LSUM_BLK( lk ); + dest = &lsum[il]; + lptr += LB_DESCRIPTOR; + rel = xsup[ik]; /* Global row index of block ik. */ + for (i = 0; i < nbrow; ++i) { + irow = lsub[lptr++] - rel; /* Relative row. */ + RHS_ITERATE(j) + dest[irow + j*iknsupc] -= rtemp[i + j*nbrow]; + } + luptr += nbrow; + +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat->utime[SOL_GEMM] += t2; +#endif + + if ( (--fmod[lk])==0 ) { /* Local accumulation done. */ + ikcol = PCOL( ik, grid ); + p = PNUM( myrow, ikcol, grid ); + if ( iam != p ) { +#ifdef ISEND_IRECV + MPI_Isend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, + MPI_FLOAT, p, LSUM, grid->comm, + &send_req[Llu->SolveMsgSent++] ); +#else +#ifdef BSEND + MPI_Bsend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, + MPI_FLOAT, p, LSUM, grid->comm ); +#else + MPI_Send( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, + MPI_FLOAT, p, LSUM, grid->comm ); +#endif +#endif +#if ( DEBUGlevel>=2 ) + printf("(%2d) Sent LSUM[%2.0f], size %2d, to P %2d\n", + iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p); +#endif + } else { /* Diagonal process: X[i] += lsum[i]. */ + ii = X_BLK( lk ); + RHS_ITERATE(j) + for (i = 0; i < iknsupc; ++i) + x[i + ii + j*iknsupc] += lsum[i + il + j*iknsupc]; + if ( frecv[lk]==0 ) { /* Becomes a leaf node. */ + fmod[lk] = -1; /* Do not solve X[k] in the future. */ + lk = LBj( ik, grid );/* Local block number, column-wise. 
*/ + lsub1 = Llu->Lrowind_bc_ptr[lk]; + lusup1 = Llu->Lnzval_bc_ptr[lk]; + nsupr1 = lsub1[1]; +#if ( PROFlevel>=1 ) + TIC(t1); +#endif +#ifdef _CRAY + STRSM(ftcs1, ftcs1, ftcs2, ftcs3, &iknsupc, &nrhs, &alpha, + lusup1, &nsupr1, &x[ii], &iknsupc); +#elif defined (USE_VENDOR_BLAS) + strsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha, + lusup1, &nsupr1, &x[ii], &iknsupc, 1, 1, 1, 1); +#else + strsm_("L", "L", "N", "U", &iknsupc, &nrhs, &alpha, + lusup1, &nsupr1, &x[ii], &iknsupc); +#endif +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat->utime[SOL_TRSM] += t2; +#endif + + stat->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs; +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, ik); +#endif + + /* + * Send Xk to process column Pc[k]. + */ + for (p = 0; p < grid->nprow; ++p) { + if ( fsendx_plist[lk][p] != EMPTY ) { + pi = PNUM( p, ikcol, grid ); +#ifdef ISEND_IRECV + MPI_Isend( &x[ii - XK_H], iknsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm, + &send_req[Llu->SolveMsgSent++] ); +#else +#ifdef BSEND + MPI_Bsend( &x[ii - XK_H], iknsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm ); +#else + MPI_Send( &x[ii - XK_H], iknsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm ); +#endif +#endif +#if ( DEBUGlevel>=2 ) + printf("(%2d) Sent X[%2.0f] to P %2d\n", + iam, x[ii-XK_H], pi); +#endif + } + } + /* + * Perform local block modifications. + */ + nlb1 = lsub1[0] - 1; + lptr1 = BC_HEADER + LB_DESCRIPTOR + iknsupc; + luptr1 = iknsupc; /* Skip diagonal block L(I,I). */ + + slsum_fmod(lsum, x, &x[ii], rtemp, nrhs, iknsupc, ik, + fmod, nlb1, lptr1, luptr1, xsup, + grid, Llu, send_req, stat); + } /* if frecv[lk] == 0 */ + } /* if iam == p */ + } /* if fmod[lk] == 0 */ + + } /* for lb ... */ + +} /* sLSUM_FMOD */ + + +/************************************************************************/ +void slsum_bmod +/************************************************************************/ +( + float *lsum, /* Sum of local modifications. */ + float *x, /* X array (local). */ + float *xk, /* X[k]. */ + int nrhs, /* Number of right-hand sides. */ + int_t k, /* The k-th component of X. */ + int_t *bmod, /* Modification count for L-solve. */ + int_t *Urbs, /* Number of row blocks in each block column of U.*/ + Ucb_indptr_t **Ucb_indptr,/* Vertical linked list pointing to Uindex[].*/ + int_t **Ucb_valptr, /* Vertical linked list pointing to Unzval[]. */ + int_t *xsup, + gridinfo_t *grid, + sLocalLU_t *Llu, + MPI_Request send_req[], /* input/output */ + SuperLUStat_t *stat + ) +{ +/* + * Purpose + * ======= + * Perform local block modifications: lsum[i] -= U_i,k * X[k]. + */ + float alpha = 1.0, beta = 0.0; + int iam, iknsupc, knsupc, myrow, nsupr, p, pi; + int_t fnz, gik, gikcol, i, ii, ik, ikfrow, iklrow, il, irow, + j, jj, lk, lk1, nub, ub, uptr; + int_t *usub; + float *uval, *dest, *y; + int_t *lsub; + float *lusup; + int_t *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum. */ + int_t *brecv = Llu->brecv; + int_t **bsendx_plist = Llu->bsendx_plist; + MPI_Status status; + int test_flag; + + iam = grid->iam; + myrow = MYROW( iam, grid ); + knsupc = SuperSize( k ); + lk = LBj( k, grid ); /* Local block number, column-wise. */ + nub = Urbs[lk]; /* Number of U blocks in block column lk */ + + for (ub = 0; ub < nub; ++ub) { + ik = Ucb_indptr[lk][ub].lbnum; /* Local block number, row-wise. */ + usub = Llu->Ufstnz_br_ptr[ik]; + uval = Llu->Unzval_br_ptr[ik]; + i = Ucb_indptr[lk][ub].indpos; /* Start of the block in usub[]. 
*/ + i += UB_DESCRIPTOR; + il = LSUM_BLK( ik ); + gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */ + iknsupc = SuperSize( gik ); + ikfrow = FstBlockC( gik ); + iklrow = FstBlockC( gik+1 ); + + RHS_ITERATE(j) { + dest = &lsum[il + j*iknsupc]; + y = &xk[j*knsupc]; + uptr = Ucb_valptr[lk][ub]; /* Start of the block in uval[]. */ + for (jj = 0; jj < knsupc; ++jj) { + fnz = usub[i + jj]; + if ( fnz < iklrow ) { /* Nonzero segment. */ + /* AXPY */ + for (irow = fnz; irow < iklrow; ++irow) + dest[irow - ikfrow] -= uval[uptr++] * y[jj]; + stat->ops[SOLVE] += 2 * (iklrow - fnz); + } + } /* for jj ... */ + } + + if ( (--bmod[ik]) == 0 ) { /* Local accumulation done. */ + gikcol = PCOL( gik, grid ); + p = PNUM( myrow, gikcol, grid ); + if ( iam != p ) { +#ifdef ISEND_IRECV + MPI_Isend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, + MPI_FLOAT, p, LSUM, grid->comm, + &send_req[Llu->SolveMsgSent++] ); +#else +#ifdef BSEND + MPI_Bsend( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, + MPI_FLOAT, p, LSUM, grid->comm ); +#else + MPI_Send( &lsum[il - LSUM_H], iknsupc * nrhs + LSUM_H, + MPI_FLOAT, p, LSUM, grid->comm ); +#endif +#endif +#if ( DEBUGlevel>=2 ) + printf("(%2d) Sent LSUM[%2.0f], size %2d, to P %2d\n", + iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p); +#endif + } else { /* Diagonal process: X[i] += lsum[i]. */ + ii = X_BLK( ik ); + dest = &x[ii]; + RHS_ITERATE(j) + for (i = 0; i < iknsupc; ++i) + dest[i + j*iknsupc] += lsum[i + il + j*iknsupc]; + if ( !brecv[ik] ) { /* Becomes a leaf node. */ + bmod[ik] = -1; /* Do not solve X[k] in the future. */ + lk1 = LBj( gik, grid ); /* Local block number. */ + lsub = Llu->Lrowind_bc_ptr[lk1]; + lusup = Llu->Lnzval_bc_ptr[lk1]; + nsupr = lsub[1]; +#ifdef _CRAY + STRSM(ftcs1, ftcs3, ftcs2, ftcs2, &iknsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &iknsupc); +#elif defined (USE_VENDOR_BLAS) + strsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &iknsupc, 1, 1, 1, 1); +#else + strsm_("L", "U", "N", "N", &iknsupc, &nrhs, &alpha, + lusup, &nsupr, &x[ii], &iknsupc); +#endif + stat->ops[SOLVE] += iknsupc * (iknsupc + 1) * nrhs; +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, gik); +#endif + + /* + * Send Xk to process column Pc[k]. + */ + for (p = 0; p < grid->nprow; ++p) { + if ( bsendx_plist[lk1][p] != EMPTY ) { + pi = PNUM( p, gikcol, grid ); +#ifdef ISEND_IRECV + MPI_Isend( &x[ii - XK_H], iknsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm, + &send_req[Llu->SolveMsgSent++] ); +#else +#ifdef BSEND + MPI_Bsend( &x[ii - XK_H], iknsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm ); +#else + MPI_Send( &x[ii - XK_H], iknsupc * nrhs + XK_H, + MPI_FLOAT, pi, Xk, grid->comm ); +#endif +#endif +#if ( DEBUGlevel>=2 ) + printf("(%2d) Sent X[%2.0f] to P %2d\n", + iam, x[ii-XK_H], pi); +#endif + } + } + /* + * Perform local block modifications. + */ + if ( Urbs[lk1] ) + slsum_bmod(lsum, x, &x[ii], nrhs, gik, bmod, Urbs, + Ucb_indptr, Ucb_valptr, xsup, grid, Llu, + send_req, stat); + } /* if brecv[ik] == 0 */ + } + } /* if bmod[ik] == 0 */ + + } /* for ub ... */ + +} /* slSUM_BMOD */ + + + +/************************************************************************/ +/*! \brief + * + *
+ * Purpose
+ * =======
+ *   Perform local block modifications: lsum[i] -= L_i,k * X[k].
+ * 
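+ *
+ * This inverse-based variant differs from slsum_fmod in two ways that
+ * are visible in the code below: when Llu->inv == 1 the diagonal solve
+ * is performed by multiplying with the precomputed inverse Linv of the
+ * diagonal block (a GEMM instead of a TRSM), and when the trailing row
+ * dimension m is large the nlb L blocks are split into Nchunk pieces
+ * dispatched as OpenMP tasks.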
+ */ +void slsum_fmod_inv +/************************************************************************/ +( + float *lsum, /* Sum of local modifications. */ + float *x, /* X array (local) */ + float *xk, /* X[k]. */ + float *rtemp, /* Result of full matrix-vector multiply. */ + int nrhs, /* Number of right-hand sides. */ + int_t k, /* The k-th component of X. */ + int_t *fmod, /* Modification count for L-solve. */ + int_t *xsup, + gridinfo_t *grid, + sLocalLU_t *Llu, + SuperLUStat_t **stat, + int_t *leaf_send, + int_t *nleaf_send, + int_t sizelsum, + int_t sizertemp, + int_t recurlevel, + int_t maxsuper, + int thread_id, + int num_thread +) +{ + float alpha = 1.0, beta = 0.0,malpha=-1.0; + float *lusup, *lusup1; + float *dest; + float *Linv;/* Inverse of diagonal block */ + int iam, iknsupc, myrow, krow, nbrow, nbrow1, nbrow_ref, nsupr, nsupr1, p, pi, idx_r,m; + int_t i, ii,jj, ik, il, ikcol, irow, j, lb, lk, rel, lib,lready; + int_t *lsub, *lsub1, nlb1, lptr1, luptr1,*lloc; + int_t *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum. */ + int_t *frecv = Llu->frecv; + int_t **fsendx_plist = Llu->fsendx_plist; + int_t luptr_tmp,luptr_tmp1,lptr1_tmp,maxrecvsz, idx_i, idx_v,idx_n, idx_l, fmod_tmp, lbstart,lbend,nn,Nchunk,nlb_loc,remainder; + int thread_id1; + flops_t ops_loc=0.0; + MPI_Status status; + int test_flag; + yes_no_t done; + BcTree *LBtree_ptr = Llu->LBtree_ptr; + RdTree *LRtree_ptr = Llu->LRtree_ptr; + int_t* idx_lsum,idx_lsum1; + float *rtemp_loc; + int_t ldalsum; + int_t nleaf_send_tmp; + int_t lptr; /* Starting position in lsub[*]. */ + int_t luptr; /* Starting position in lusup[*]. */ + int_t iword = sizeof(int_t); + int_t dword = sizeof (float); + int_t aln_d,aln_i; + aln_d = ceil(CACHELINE/(double)dword); + aln_i = ceil(CACHELINE/(double)iword); + int knsupc; /* Size of supernode k. */ + int_t nlb; /* Number of L blocks. */ + + + knsupc = SuperSize( k ); + + lk = LBj( k, grid ); /* Local block number, column-wise. 
*/ + lsub = Llu->Lrowind_bc_ptr[lk]; + nlb = lsub[0] - 1; + + + ldalsum=Llu->ldalsum; + + rtemp_loc = &rtemp[sizertemp* thread_id]; + + // #if ( PROFlevel>=1 ) + double t1, t2, t3, t4; + float msg_vol = 0, msg_cnt = 0; + // #endif + + if(nlb>0){ + + iam = grid->iam; + myrow = MYROW( iam, grid ); + + lusup = Llu->Lnzval_bc_ptr[lk]; + lloc = Llu->Lindval_loc_bc_ptr[lk]; + + nsupr = lsub[1]; + + // printf("nlb: %5d lk: %5d\n",nlb,lk); + // fflush(stdout); + + krow = PROW( k, grid ); + if(myrow==krow){ + idx_n = 1; + idx_i = nlb+2; + idx_v = 2*nlb+3; + luptr_tmp = lloc[idx_v]; + m = nsupr-knsupc; + }else{ + idx_n = 0; + idx_i = nlb; + idx_v = 2*nlb; + luptr_tmp = lloc[idx_v]; + m = nsupr; + } + + assert(m>0); + + if(m>8*maxsuper){ + // if(0){ + + // Nchunk=floor(num_thread/2.0)+1; + Nchunk=SUPERLU_MIN(num_thread,nlb); + // Nchunk=1; + nlb_loc = floor(((double)nlb)/Nchunk); + remainder = nlb % Nchunk; + +#ifdef _OPENMP +#pragma omp taskloop private (lptr1,luptr1,nlb1,thread_id1,lsub1,lusup1,nsupr1,Linv,nn,lbstart,lbend,luptr_tmp1,nbrow,lb,lptr1_tmp,rtemp_loc,nbrow_ref,lptr,nbrow1,ik,rel,lk,iknsupc,il,i,irow,fmod_tmp,ikcol,p,ii,jj,t1,t2,j,nleaf_send_tmp) untied nogroup +#endif + for (nn=0;nn=1 ) + TIC(t1); +#endif + luptr_tmp1 = lloc[lbstart+idx_v]; + nbrow=0; + for (lb = lbstart; lb < lbend; ++lb){ + lptr1_tmp = lloc[lb+idx_i]; + nbrow += lsub[lptr1_tmp+1]; + } + + #ifdef _CRAY + SGEMM( ftcs2, ftcs2, &nbrow, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp1], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &nbrow ); + #elif defined (USE_VENDOR_BLAS) + sgemm_( "N", "N", &nbrow, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp1], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &nbrow, 1, 1 ); + #else + sgemm_( "N", "N", &nbrow, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp1], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &nbrow ); + #endif + + nbrow_ref=0; + for (lb = lbstart; lb < lbend; ++lb){ + lptr1_tmp = lloc[lb+idx_i]; + lptr= lptr1_tmp+2; + nbrow1 = lsub[lptr1_tmp+1]; + ik = lsub[lptr1_tmp]; /* Global block number, row-wise. */ + rel = xsup[ik]; /* Global row index of block ik. */ + + lk = LBi( ik, grid ); /* Local block number, row-wise. */ + + iknsupc = SuperSize( ik ); + il = LSUM_BLK( lk ); + + RHS_ITERATE(j) + #ifdef _OPENMP + #pragma omp simd + #endif + for (i = 0; i < nbrow1; ++i) { + irow = lsub[lptr+i] - rel; /* Relative row. */ + lsum[il+irow + j*iknsupc+sizelsum*thread_id1] -= rtemp_loc[nbrow_ref+i + j*nbrow]; + } + nbrow_ref+=nbrow1; + } + +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat[thread_id1]->utime[SOL_GEMM] += t2; +#endif + + for (lb=lbstart;lb=1 ) + TIC(t1); +#endif + for (ii=1;iiLrowind_bc_ptr[lk]; + lusup1 = Llu->Lnzval_bc_ptr[lk]; + nsupr1 = lsub1[1]; + + if(Llu->inv == 1){ + Linv = Llu->Linv_bc_ptr[lk]; + + +#ifdef _CRAY + SGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); +#elif defined (USE_VENDOR_BLAS) + sgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); +#else + sgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); +#endif + #ifdef _OPENMP + #pragma omp simd + #endif + for (i=0 ; i=1 ) + TOC(t2, t1); + stat[thread_id1]->utime[SOL_TRSM] += t2; + +#endif + + stat[thread_id1]->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs; + +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, ik); + +#endif + + /* + * Send Xk to process column Pc[k]. 
+ */ + + if(LBtree_ptr[lk]!=NULL){ +#ifdef _OPENMP +#pragma omp atomic capture +#endif + nleaf_send_tmp = ++nleaf_send[0]; + leaf_send[(nleaf_send_tmp-1)*aln_i] = lk; + } + + /* + * Perform local block modifications. + */ + + // #ifdef _OPENMP + // #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1,thread_id1) untied priority(1) + // #endif + { + + slsum_fmod_inv(lsum, x, &x[ii], rtemp, nrhs, ik, + fmod, xsup, + grid, Llu, stat, leaf_send, nleaf_send ,sizelsum,sizertemp,1+recurlevel,maxsuper,thread_id1,num_thread); + } + + // } /* if frecv[lk] == 0 */ + } /* if iam == p */ + } /* if fmod[lk] == 0 */ + } + + } + } + + }else{ + +#if ( PROFlevel>=1 ) + TIC(t1); +#endif + +#ifdef _CRAY + SGEMM( ftcs2, ftcs2, &m, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &m ); +#elif defined (USE_VENDOR_BLAS) + sgemm_( "N", "N", &m, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &m, 1, 1 ); +#else + sgemm_( "N", "N", &m, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &m ); +#endif + + nbrow=0; + for (lb = 0; lb < nlb; ++lb){ + lptr1_tmp = lloc[lb+idx_i]; + nbrow += lsub[lptr1_tmp+1]; + } + nbrow_ref=0; + for (lb = 0; lb < nlb; ++lb){ + lptr1_tmp = lloc[lb+idx_i]; + lptr= lptr1_tmp+2; + nbrow1 = lsub[lptr1_tmp+1]; + ik = lsub[lptr1_tmp]; /* Global block number, row-wise. */ + rel = xsup[ik]; /* Global row index of block ik. */ + + lk = LBi( ik, grid ); /* Local block number, row-wise. */ + + iknsupc = SuperSize( ik ); + il = LSUM_BLK( lk ); + + RHS_ITERATE(j) + #ifdef _OPENMP + #pragma omp simd + #endif + for (i = 0; i < nbrow1; ++i) { + irow = lsub[lptr+i] - rel; /* Relative row. */ + + lsum[il+irow + j*iknsupc+sizelsum*thread_id] -= rtemp_loc[nbrow_ref+i + j*nbrow]; + } + nbrow_ref+=nbrow1; + } + + // TOC(t3, t1); + +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat[thread_id]->utime[SOL_GEMM] += t2; +#endif + + for (lb=0;lb=1 ) + TIC(t1); +#endif + for (ii=1;iiLrowind_bc_ptr[lk]; + lusup1 = Llu->Lnzval_bc_ptr[lk]; + nsupr1 = lsub1[1]; + + if(Llu->inv == 1){ + Linv = Llu->Linv_bc_ptr[lk]; +#ifdef _CRAY + SGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); +#elif defined (USE_VENDOR_BLAS) + sgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); +#else + sgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); +#endif + #ifdef _OPENMP + #pragma omp simd + #endif + for (i=0 ; i=1 ) + TOC(t2, t1); + stat[thread_id]->utime[SOL_TRSM] += t2; +#endif + + stat[thread_id]->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs; + +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, ik); +#endif + + /* + * Send Xk to process column Pc[k]. + */ + + if(LBtree_ptr[lk]!=NULL){ + +#ifdef _OPENMP +#pragma omp atomic capture +#endif + nleaf_send_tmp = ++nleaf_send[0]; + // printf("nleaf_send_tmp %5d lk %5d\n",nleaf_send_tmp); + leaf_send[(nleaf_send_tmp-1)*aln_i] = lk; + // BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],'s'); + } + + /* + * Perform local block modifications. 
+ */ + + // #ifdef _OPENMP + // #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1) untied priority(1) + // #endif + + { + slsum_fmod_inv(lsum, x, &x[ii], rtemp, nrhs, ik, + fmod, xsup, + grid, Llu, stat, leaf_send, nleaf_send ,sizelsum,sizertemp,1+recurlevel,maxsuper,thread_id,num_thread); + } + + // } /* if frecv[lk] == 0 */ + } /* if iam == p */ + } /* if fmod[lk] == 0 */ + } + // } +} + + stat[thread_id]->ops[SOLVE] += 2 * m * nrhs * knsupc; + + + +} /* if nlb>0*/ +} /* sLSUM_FMOD_INV */ + +/************************************************************************/ +/*! \brief + * + *
+ * Purpose
+ * =======
+ *   Perform local block modifications: lsum[i] -= L_i,k * X[k].
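+ *
+ *   Unlike slsum_fmod_inv, this variant takes knsupc and nlb as
+ *   arguments and forwards X[k] directly through the broadcast tree
+ *   (BcTree_forwardMessageSimple) instead of queueing finished leaves
+ *   in leaf_send[].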
+ * </pre>
+ */ +void slsum_fmod_inv_master +/************************************************************************/ +( + float *lsum, /* Sum of local modifications. */ + float *x, /* X array (local) */ + float *xk, /* X[k]. */ + float *rtemp, /* Result of full matrix-vector multiply. */ + int nrhs, /* Number of right-hand sides. */ + int knsupc, /* Size of supernode k. */ + int_t k, /* The k-th component of X. */ + int_t *fmod, /* Modification count for L-solve. */ + int_t nlb, /* Number of L blocks. */ + int_t *xsup, + gridinfo_t *grid, + sLocalLU_t *Llu, + SuperLUStat_t **stat, + int_t sizelsum, + int_t sizertemp, + int_t recurlevel, + int_t maxsuper, + int thread_id, + int num_thread +) +{ + float alpha = 1.0, beta = 0.0, malpha=-1.0; + float *lusup, *lusup1; + float *dest; + float *Linv;/* Inverse of diagonal block */ + int iam, iknsupc, myrow, krow, nbrow, nbrow1, nbrow_ref, nsupr, nsupr1, p, pi, idx_r; + int_t i, ii,jj, ik, il, ikcol, irow, j, lb, lk, rel, lib,lready; + int_t *lsub, *lsub1, nlb1, lptr1, luptr1,*lloc; + int_t *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum. */ + int_t *frecv = Llu->frecv; + int_t **fsendx_plist = Llu->fsendx_plist; + int_t luptr_tmp,luptr_tmp1,lptr1_tmp,maxrecvsz, idx_i, idx_v,idx_n, idx_l, fmod_tmp, lbstart,lbend,nn,Nchunk,nlb_loc,remainder; + int thread_id1; + int m; + flops_t ops_loc=0.0; + MPI_Status status; + int test_flag; + yes_no_t done; + BcTree *LBtree_ptr = Llu->LBtree_ptr; + RdTree *LRtree_ptr = Llu->LRtree_ptr; + int_t* idx_lsum,idx_lsum1; + float *rtemp_loc; + int_t ldalsum; + int_t nleaf_send_tmp; + int_t lptr; /* Starting position in lsub[*]. */ + int_t luptr; /* Starting position in lusup[*]. */ + int_t iword = sizeof(int_t); + int_t dword = sizeof (float); + int_t aln_d,aln_i; + aln_d = ceil(CACHELINE/(double)dword); + aln_i = ceil(CACHELINE/(double)iword); + + ldalsum=Llu->ldalsum; + + rtemp_loc = &rtemp[sizertemp* thread_id]; + + // #if ( PROFlevel>=1 ) + double t1, t2, t3, t4; + float msg_vol = 0, msg_cnt = 0; + // #endif + + if(nlb>0){ + + iam = grid->iam; + myrow = MYROW( iam, grid ); + lk = LBj( k, grid ); /* Local block number, column-wise. 
*/ + + // printf("ya1 %5d k %5d lk %5d\n",thread_id,k,lk); + // fflush(stdout); + + lsub = Llu->Lrowind_bc_ptr[lk]; + + // printf("ya2 %5d k %5d lk %5d\n",thread_id,k,lk); + // fflush(stdout); + + lusup = Llu->Lnzval_bc_ptr[lk]; + lloc = Llu->Lindval_loc_bc_ptr[lk]; + // idx_lsum = Llu->Lrowind_bc_2_lsum[lk]; + + nsupr = lsub[1]; + + // printf("nlb: %5d lk: %5d\n",nlb,lk); + // fflush(stdout); + + krow = PROW( k, grid ); + if(myrow==krow){ + idx_n = 1; + idx_i = nlb+2; + idx_v = 2*nlb+3; + luptr_tmp = lloc[idx_v]; + m = nsupr-knsupc; + }else{ + idx_n = 0; + idx_i = nlb; + idx_v = 2*nlb; + luptr_tmp = lloc[idx_v]; + m = nsupr; + } + + assert(m>0); + + if(m>4*maxsuper || nrhs>10){ + // if(m<1){ + // TIC(t1); + Nchunk=num_thread; + nlb_loc = floor(((double)nlb)/Nchunk); + remainder = nlb % Nchunk; + +#ifdef _OPENMP +#pragma omp taskloop private (lptr1,luptr1,nlb1,thread_id1,lsub1,lusup1,nsupr1,Linv,nn,lbstart,lbend,luptr_tmp1,nbrow,lb,lptr1_tmp,rtemp_loc,nbrow_ref,lptr,nbrow1,ik,rel,lk,iknsupc,il,i,irow,fmod_tmp,ikcol,p,ii,jj,t1,t2,j) untied +#endif + for (nn=0;nn=1 ) + TIC(t1); +#endif + luptr_tmp1 = lloc[lbstart+idx_v]; + nbrow=0; + for (lb = lbstart; lb < lbend; ++lb){ + lptr1_tmp = lloc[lb+idx_i]; + nbrow += lsub[lptr1_tmp+1]; + } + + #ifdef _CRAY + SGEMM( ftcs2, ftcs2, &nbrow, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp1], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &nbrow ); + #elif defined (USE_VENDOR_BLAS) + sgemm_( "N", "N", &nbrow, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp1], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &nbrow, 1, 1 ); + #else + sgemm_( "N", "N", &nbrow, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp1], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &nbrow ); + #endif + + nbrow_ref=0; + for (lb = lbstart; lb < lbend; ++lb){ + lptr1_tmp = lloc[lb+idx_i]; + lptr= lptr1_tmp+2; + nbrow1 = lsub[lptr1_tmp+1]; + ik = lsub[lptr1_tmp]; /* Global block number, row-wise. */ + rel = xsup[ik]; /* Global row index of block ik. */ + + lk = LBi( ik, grid ); /* Local block number, row-wise. */ + + iknsupc = SuperSize( ik ); + il = LSUM_BLK( lk ); + + RHS_ITERATE(j) + #ifdef _OPENMP + #pragma omp simd lastprivate(irow) + #endif + for (i = 0; i < nbrow1; ++i) { + irow = lsub[lptr+i] - rel; /* Relative row. */ + lsum[il+irow + j*iknsupc] -= rtemp_loc[nbrow_ref+i + j*nbrow]; + } + nbrow_ref+=nbrow1; + } + +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat[thread_id1]->utime[SOL_GEMM] += t2; +#endif + } + } + + }else{ + +#if ( PROFlevel>=1 ) + TIC(t1); +#endif + +#ifdef _CRAY + SGEMM( ftcs2, ftcs2, &m, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &m ); +#elif defined (USE_VENDOR_BLAS) + sgemm_( "N", "N", &m, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &m, 1, 1 ); +#else + sgemm_( "N", "N", &m, &nrhs, &knsupc, + &alpha, &lusup[luptr_tmp], &nsupr, xk, + &knsupc, &beta, rtemp_loc, &m ); +#endif + + nbrow=0; + for (lb = 0; lb < nlb; ++lb){ + lptr1_tmp = lloc[lb+idx_i]; + nbrow += lsub[lptr1_tmp+1]; + } + nbrow_ref=0; + for (lb = 0; lb < nlb; ++lb){ + lptr1_tmp = lloc[lb+idx_i]; + lptr= lptr1_tmp+2; + nbrow1 = lsub[lptr1_tmp+1]; + ik = lsub[lptr1_tmp]; /* Global block number, row-wise. */ + rel = xsup[ik]; /* Global row index of block ik. */ + + lk = LBi( ik, grid ); /* Local block number, row-wise. */ + + iknsupc = SuperSize( ik ); + il = LSUM_BLK( lk ); + + RHS_ITERATE(j) + #ifdef _OPENMP + #pragma omp simd lastprivate(irow) + #endif + for (i = 0; i < nbrow1; ++i) { + irow = lsub[lptr+i] - rel; /* Relative row. 
*/ + + lsum[il+irow + j*iknsupc+sizelsum*thread_id] -= rtemp_loc[nbrow_ref+i + j*nbrow]; + } + nbrow_ref+=nbrow1; + } +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat[thread_id]->utime[SOL_GEMM] += t2; +#endif + } + // TOC(t3, t1); + rtemp_loc = &rtemp[sizertemp* thread_id]; + + for (lb=0;lb=1 ) + TIC(t1); +#endif + for (ii=1;iiLrowind_bc_ptr[lk]; + lusup1 = Llu->Lnzval_bc_ptr[lk]; + nsupr1 = lsub1[1]; + + if(Llu->inv == 1){ + Linv = Llu->Linv_bc_ptr[lk]; +#ifdef _CRAY + SGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); +#elif defined (USE_VENDOR_BLAS) + sgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); +#else + sgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); +#endif + #ifdef _OPENMP + #pragma omp simd + #endif + for (i=0 ; i=1 ) + TOC(t2, t1); + stat[thread_id]->utime[SOL_TRSM] += t2; + +#endif + + stat[thread_id]->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs; + +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, ik); +#endif + + /* + * Send Xk to process column Pc[k]. + */ + + if(LBtree_ptr[lk]!=NULL) + BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],BcTree_GetMsgSize(LBtree_ptr[lk],'s')*nrhs+XK_H,'s'); + + /* + * Perform local block modifications. + */ + + // #ifdef _OPENMP + // #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1,thread_id1) untied priority(1) + // #endif + { + nlb1 = lsub1[0] - 1; + + + slsum_fmod_inv_master(lsum, x, &x[ii], rtemp, nrhs, iknsupc, ik, + fmod, nlb1, xsup, + grid, Llu, stat,sizelsum,sizertemp,1+recurlevel,maxsuper,thread_id,num_thread); + } + + // } /* if frecv[lk] == 0 */ + } /* if iam == p */ + } /* if fmod[lk] == 0 */ + } + // } + stat[thread_id]->ops[SOLVE] += 2 * m * nrhs * knsupc; + } /* if nlb>0*/ +} /* sLSUM_FMOD_INV */ + + + +/************************************************************************/ +void slsum_bmod_inv +/************************************************************************/ +( + float *lsum, /* Sum of local modifications. */ + float *x, /* X array (local). */ + float *xk, /* X[k]. */ + float *rtemp, /* Result of full matrix-vector multiply. */ + int nrhs, /* Number of right-hand sides. */ + int_t k, /* The k-th component of X. */ + int_t *bmod, /* Modification count for L-solve. */ + int_t *Urbs, /* Number of row blocks in each block column of U.*/ + Ucb_indptr_t **Ucb_indptr,/* Vertical linked list pointing to Uindex[].*/ + int_t **Ucb_valptr, /* Vertical linked list pointing to Unzval[]. */ + int_t *xsup, + gridinfo_t *grid, + sLocalLU_t *Llu, + SuperLUStat_t **stat, + int_t* root_send, + int_t* nroot_send, + int_t sizelsum, + int_t sizertemp, + int thread_id, + int num_thread + ) +{ + /* + * Purpose + * ======= + * Perform local block modifications: lsum[i] -= U_i,k * X[k]. + */ + float alpha = 1.0, beta = 0.0; + int iam, iknsupc, knsupc, myrow, nsupr, p, pi; + int_t fnz, gik, gikcol, i, ii, ik, ikfrow, iklrow, il, irow, + j, jj, lk, lk1, nub, ub, uptr; + int_t *usub; + float *uval, *dest, *y; + int_t *lsub; + float *lusup; + int_t *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum. 
*/ + int_t *brecv = Llu->brecv; + int_t **bsendx_plist = Llu->bsendx_plist; + BcTree *UBtree_ptr = Llu->UBtree_ptr; + RdTree *URtree_ptr = Llu->URtree_ptr; + MPI_Status status; + int test_flag; + int_t bmod_tmp; + int thread_id1; + float *rtemp_loc; + int_t nroot_send_tmp; + float *Uinv;/* Inverse of diagonal block */ + float temp; + double t1, t2; + float msg_vol = 0, msg_cnt = 0; + int_t Nchunk, nub_loc,remainder,nn,lbstart,lbend; + int_t iword = sizeof(int_t); + int_t dword = sizeof(float); + int_t aln_d,aln_i; + aln_d = ceil(CACHELINE/(double)dword); + aln_i = ceil(CACHELINE/(double)iword); + + + iam = grid->iam; + myrow = MYROW( iam, grid ); + knsupc = SuperSize( k ); + lk = LBj( k, grid ); /* Local block number, column-wise. */ + nub = Urbs[lk]; /* Number of U blocks in block column lk */ + + if(Llu->Unnz[lk]>knsupc*64 || nub>16){ + // if(nub>num_thread){ + // if(nub>16){ + // // // // if(Urbs2[lk]>num_thread){ + // if(Urbs2[lk]>0){ + Nchunk=SUPERLU_MIN(num_thread,nub); + nub_loc = floor(((double)nub)/Nchunk); + remainder = nub % Nchunk; + // printf("Unnz: %5d nub: %5d knsupc: %5d\n",Llu->Unnz[lk],nub,knsupc); +#ifdef _OPENMP +#pragma omp taskloop firstprivate (stat) private (thread_id1,Uinv,nn,lbstart,lbend,ub,temp,rtemp_loc,ik,lk1,gik,gikcol,usub,uval,lsub,lusup,iknsupc,il,i,irow,bmod_tmp,p,ii,jj,t1,t2,j,ikfrow,iklrow,dest,y,uptr,fnz,nsupr) untied nogroup +#endif + for (nn=0;nnUfstnz_br_ptr[ik]; + uval = Llu->Unzval_br_ptr[ik]; + i = Ucb_indptr[lk][ub].indpos; /* Start of the block in usub[]. */ + i += UB_DESCRIPTOR; + il = LSUM_BLK( ik ); + gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */ + iknsupc = SuperSize( gik ); + ikfrow = FstBlockC( gik ); + iklrow = FstBlockC( gik+1 ); + +#if ( PROFlevel>=1 ) + TIC(t1); +#endif + + RHS_ITERATE(j) { + dest = &lsum[il + j*iknsupc+sizelsum*thread_id1]; + y = &xk[j*knsupc]; + uptr = Ucb_valptr[lk][ub]; /* Start of the block in uval[]. */ + for (jj = 0; jj < knsupc; ++jj) { + fnz = usub[i + jj]; + if ( fnz < iklrow ) { /* Nonzero segment. */ + /* AXPY */ + #ifdef _OPENMP + #pragma omp simd + #endif + for (irow = fnz; irow < iklrow; ++irow) + dest[irow - ikfrow] -= uval[uptr++] * y[jj]; + stat[thread_id1]->ops[SOLVE] += 2 * (iklrow - fnz); + + } + } /* for jj ... */ + } + +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat[thread_id1]->utime[SOL_GEMM] += t2; +#endif + + + #ifdef _OPENMP + #pragma omp atomic capture + #endif + bmod_tmp=--bmod[ik*aln_i]; + + if ( bmod_tmp == 0 ) { /* Local accumulation done. */ + gikcol = PCOL( gik, grid ); + p = PNUM( myrow, gikcol, grid ); + if ( iam != p ) { + for (ii=1;ii=2 ) + printf("(%2d) Sent LSUM[%2.0f], size %2d, to P %2d\n", + iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p); + #endif + } else { /* Diagonal process: X[i] += lsum[i]. 
*/ + +#if ( PROFlevel>=1 ) + TIC(t1); +#endif + + for (ii=1;iiLrowind_bc_ptr[lk1]; + lusup = Llu->Lnzval_bc_ptr[lk1]; + nsupr = lsub[1]; + + if(Llu->inv == 1){ + Uinv = Llu->Uinv_bc_ptr[lk1]; + #ifdef _CRAY + SGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, + &alpha, Uinv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); + #elif defined (USE_VENDOR_BLAS) + sgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Uinv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); + #else + sgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Uinv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); + #endif + #ifdef _OPENMP + #pragma omp simd + #endif + for (i=0 ; i=1 ) + TOC(t2, t1); + stat[thread_id1]->utime[SOL_TRSM] += t2; + #endif + stat[thread_id1]->ops[SOLVE] += iknsupc * (iknsupc + 1) * nrhs; + + #if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, gik); + #endif + + /* + * Send Xk to process column Pc[k]. + */ + + // for (i=0 ; iUfstnz_br_ptr[ik]; + uval = Llu->Unzval_br_ptr[ik]; + i = Ucb_indptr[lk][ub].indpos; /* Start of the block in usub[]. */ + i += UB_DESCRIPTOR; + il = LSUM_BLK( ik ); + gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */ + iknsupc = SuperSize( gik ); + ikfrow = FstBlockC( gik ); + iklrow = FstBlockC( gik+1 ); + +#if ( PROFlevel>=1 ) + TIC(t1); +#endif + RHS_ITERATE(j) { + dest = &lsum[il + j*iknsupc+sizelsum*thread_id]; + y = &xk[j*knsupc]; + uptr = Ucb_valptr[lk][ub]; /* Start of the block in uval[]. */ + for (jj = 0; jj < knsupc; ++jj) { + fnz = usub[i + jj]; + if ( fnz < iklrow ) { /* Nonzero segment. */ + /* AXPY */ + #ifdef _OPENMP + #pragma omp simd + #endif + for (irow = fnz; irow < iklrow; ++irow) + + dest[irow - ikfrow] -= uval[uptr++] * y[jj]; + stat[thread_id]->ops[SOLVE] += 2 * (iklrow - fnz); + } + } /* for jj ... */ + } + +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat[thread_id]->utime[SOL_GEMM] += t2; +#endif + + #ifdef _OPENMP + #pragma omp atomic capture + #endif + bmod_tmp=--bmod[ik*aln_i]; + + if ( bmod_tmp == 0 ) { /* Local accumulation done. */ + gikcol = PCOL( gik, grid ); + p = PNUM( myrow, gikcol, grid ); + if ( iam != p ) { + for (ii=1;ii=2 ) + printf("(%2d) Sent LSUM[%2.0f], size %2d, to P %2d\n", + iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p); + #endif + } else { /* Diagonal process: X[i] += lsum[i]. */ + +#if ( PROFlevel>=1 ) + TIC(t1); +#endif + + for (ii=1;iiLrowind_bc_ptr[lk1]; + lusup = Llu->Lnzval_bc_ptr[lk1]; + nsupr = lsub[1]; + + if(Llu->inv == 1){ + Uinv = Llu->Uinv_bc_ptr[lk1]; + #ifdef _CRAY + SGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, + &alpha, Uinv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); + #elif defined (USE_VENDOR_BLAS) + sgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Uinv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); + #else + sgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Uinv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); + #endif + #ifdef _OPENMP + #pragma omp simd + #endif + for (i=0 ; i=1 ) + TOC(t2, t1); + stat[thread_id]->utime[SOL_TRSM] += t2; + #endif + stat[thread_id]->ops[SOLVE] += iknsupc * (iknsupc + 1) * nrhs; + #if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, gik); + #endif + + /* + * Send Xk to process column Pc[k]. 
+ */ + + // for (i=0 ; i16){ + // #ifdef _OPENMP + // #pragma omp task firstprivate (Ucb_indptr,Ucb_valptr,Llu,sizelsum,ii,gik,x,rtemp,bmod,Urbs,lsum,stat,nrhs,grid,xsup) untied + // #endif + // slsum_bmod_inv(lsum, x, &x[ii], rtemp, nrhs, gik, bmod, Urbs, + // Ucb_indptr, Ucb_valptr, xsup, grid, Llu, + // stat, root_send, nroot_send, sizelsum,sizertemp); + //}else{ + slsum_bmod_inv(lsum, x, &x[ii], rtemp, nrhs, gik, bmod, Urbs, + Ucb_indptr, Ucb_valptr, xsup, grid, Llu, + stat, root_send, nroot_send, sizelsum,sizertemp,thread_id,num_thread); + //} + + // } /* if brecv[ik] == 0 */ + } + } /* if bmod[ik] == 0 */ + + } /* for ub ... */ + } + +} /* slSUM_BMOD_inv */ + + + +/************************************************************************/ +void slsum_bmod_inv_master +/************************************************************************/ +( + float *lsum, /* Sum of local modifications. */ + float *x, /* X array (local). */ + float *xk, /* X[k]. */ + float *rtemp, /* Result of full matrix-vector multiply. */ + int nrhs, /* Number of right-hand sides. */ + int_t k, /* The k-th component of X. */ + int_t *bmod, /* Modification count for L-solve. */ + int_t *Urbs, /* Number of row blocks in each block column of U.*/ + Ucb_indptr_t **Ucb_indptr,/* Vertical linked list pointing to Uindex[].*/ + int_t **Ucb_valptr, /* Vertical linked list pointing to Unzval[]. */ + int_t *xsup, + gridinfo_t *grid, + sLocalLU_t *Llu, + SuperLUStat_t **stat, + int_t sizelsum, + int_t sizertemp, + int thread_id, + int num_thread + ) +{ + /* + * Purpose + * ======= + * Perform local block modifications: lsum[i] -= U_i,k * X[k]. + */ + float alpha = 1.0, beta = 0.0; + int iam, iknsupc, knsupc, myrow, nsupr, p, pi; + int_t fnz, gik, gikcol, i, ii, ik, ikfrow, iklrow, il, irow, + j, jj, lk, lk1, nub, ub, uptr; + int_t *usub; + float *uval, *dest, *y; + int_t *lsub; + float *lusup; + int_t *ilsum = Llu->ilsum; /* Starting position of each supernode in lsum. */ + int_t *brecv = Llu->brecv; + int_t **bsendx_plist = Llu->bsendx_plist; + BcTree *UBtree_ptr = Llu->UBtree_ptr; + RdTree *URtree_ptr = Llu->URtree_ptr; + MPI_Status status; + int test_flag; + int_t bmod_tmp; + int thread_id1; + float *rtemp_loc; + float temp; + float *Uinv;/* Inverse of diagonal block */ + + double t1, t2; + float msg_vol = 0, msg_cnt = 0; + int_t Nchunk, nub_loc,remainder,nn,lbstart,lbend; + int_t iword = sizeof(int_t); + int_t dword = sizeof (float); + int_t aln_d,aln_i; + aln_d = ceil(CACHELINE/(double)dword); + aln_i = ceil(CACHELINE/(double)iword); + + + rtemp_loc = &rtemp[sizertemp* thread_id]; + + + iam = grid->iam; + myrow = MYROW( iam, grid ); + knsupc = SuperSize( k ); + lk = LBj( k, grid ); /* Local block number, column-wise. */ + nub = Urbs[lk]; /* Number of U blocks in block column lk */ + + // printf("Urbs2[lk] %5d lk %5d nub %5d\n",Urbs2[lk],lk,nub); + // fflush(stdout); + + if(nub>num_thread){ + // if(nub>0){ + Nchunk=num_thread; + nub_loc = floor(((double)nub)/Nchunk); + remainder = nub % Nchunk; + +//#ifdef _OPENMP +//#pragma omp taskloop firstprivate (stat) private (thread_id1,nn,lbstart,lbend,ub,temp,rtemp_loc,ik,gik,usub,uval,iknsupc,il,i,irow,jj,t1,t2,j,ikfrow,iklrow,dest,y,uptr,fnz) untied +//#endif + for (nn=0;nn=1 ) + TIC(t1); +#endif + + if(nnUfstnz_br_ptr[ik]; + uval = Llu->Unzval_br_ptr[ik]; + i = Ucb_indptr[lk][ub].indpos; /* Start of the block in usub[]. */ + i += UB_DESCRIPTOR; + il = LSUM_BLK( ik ); + gik = ik * grid->nprow + myrow;/* Global block number, row-wise. 
*/ + iknsupc = SuperSize( gik ); + ikfrow = FstBlockC( gik ); + iklrow = FstBlockC( gik+1 ); + + RHS_ITERATE(j) { + dest = &lsum[il + j*iknsupc+sizelsum*thread_id1]; + y = &xk[j*knsupc]; + uptr = Ucb_valptr[lk][ub]; /* Start of the block in uval[]. */ + for (jj = 0; jj < knsupc; ++jj) { + fnz = usub[i + jj]; + if ( fnz < iklrow ) { /* Nonzero segment. */ + /* AXPY */ + #ifdef _OPENMP + #pragma omp simd + #endif + for (irow = fnz; irow < iklrow; ++irow) + dest[irow - ikfrow] -= uval[uptr++] * y[jj]; + stat[thread_id1]->ops[SOLVE] += 2 * (iklrow - fnz); + + } + } /* for jj ... */ + } + } +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat[thread_id1]->utime[SOL_GEMM] += t2; +#endif + } + + }else{ + rtemp_loc = &rtemp[sizertemp* thread_id]; +#if ( PROFlevel>=1 ) + TIC(t1); +#endif + for (ub = 0; ub < nub; ++ub) { + ik = Ucb_indptr[lk][ub].lbnum; /* Local block number, row-wise. */ + usub = Llu->Ufstnz_br_ptr[ik]; + uval = Llu->Unzval_br_ptr[ik]; + i = Ucb_indptr[lk][ub].indpos; /* Start of the block in usub[]. */ + i += UB_DESCRIPTOR; + il = LSUM_BLK( ik ); + gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */ + iknsupc = SuperSize( gik ); + ikfrow = FstBlockC( gik ); + iklrow = FstBlockC( gik+1 ); + + RHS_ITERATE(j) { + dest = &lsum[il + j*iknsupc+sizelsum*thread_id]; + y = &xk[j*knsupc]; + uptr = Ucb_valptr[lk][ub]; /* Start of the block in uval[]. */ + for (jj = 0; jj < knsupc; ++jj) { + fnz = usub[i + jj]; + if ( fnz < iklrow ) { /* Nonzero segment. */ + /* AXPY */ + #ifdef _OPENMP + #pragma omp simd + #endif + for (irow = fnz; irow < iklrow; ++irow) + dest[irow - ikfrow] -= uval[uptr++] * y[jj]; + stat[thread_id]->ops[SOLVE] += 2 * (iklrow - fnz); + + } + } /* for jj ... */ + } + } +#if ( PROFlevel>=1 ) + TOC(t2, t1); + stat[thread_id]->utime[SOL_GEMM] += t2; +#endif + } + + + rtemp_loc = &rtemp[sizertemp* thread_id]; + for (ub = 0; ub < nub; ++ub){ + ik = Ucb_indptr[lk][ub].lbnum; /* Local block number, row-wise. */ + il = LSUM_BLK( ik ); + gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */ + iknsupc = SuperSize( gik ); + + // #ifdef _OPENMP + // #pragma omp atomic capture + // #endif + bmod_tmp=--bmod[ik*aln_i]; + + if ( bmod_tmp == 0 ) { /* Local accumulation done. */ + gikcol = PCOL( gik, grid ); + p = PNUM( myrow, gikcol, grid ); + if ( iam != p ) { + for (ii=1;ii=2 ) + printf("(%2d) Sent LSUM[%2.0f], size %2d, to P %2d\n", + iam, lsum[il-LSUM_H], iknsupc*nrhs+LSUM_H, p); +#endif + } else { /* Diagonal process: X[i] += lsum[i]. */ + +#if ( PROFlevel>=1 ) + TIC(t1); +#endif + for (ii=1;iiLrowind_bc_ptr[lk1]; + lusup = Llu->Lnzval_bc_ptr[lk1]; + nsupr = lsub[1]; + + if(Llu->inv == 1){ + Uinv = Llu->Uinv_bc_ptr[lk1]; +#ifdef _CRAY + SGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, + &alpha, Uinv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); +#elif defined (USE_VENDOR_BLAS) + sgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Uinv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); +#else + sgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Uinv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); +#endif + #ifdef _OPENMP + #pragma omp simd + #endif + for (i=0 ; i=1 ) + TOC(t2, t1); + stat[thread_id]->utime[SOL_TRSM] += t2; +#endif + stat[thread_id]->ops[SOLVE] += iknsupc * (iknsupc + 1) * nrhs; +#if ( DEBUGlevel>=2 ) + printf("(%2d) Solve X[%2d]\n", iam, gik); +#endif + + /* + * Send Xk to process column Pc[k]. + */ + + // for (i=0 ; i + * File name: pslangs.c + * History: Modified from lapack routine SLANGE + *
+ */
+#include <math.h>
+#include "superlu_sdefs.h"
+
+/*! \brief
+
+<pre>
+    Purpose
+    =======
+
+    PSLANGS returns the value of the one norm, or the Frobenius norm, or
+    the infinity norm, or the element of largest absolute value of a
+    real matrix A.
+
+    Description
+    ===========
+
+    PSLANGS returns the value
+
+       PSLANGS = ( max(abs(A(i,j))), NORM = 'M' or 'm'
+                 (
+                 ( norm1(A),         NORM = '1', 'O' or 'o'
+                 (
+                 ( normI(A),         NORM = 'I' or 'i'
+                 (
+                 ( normF(A),         NORM = 'F', 'f', 'E' or 'e'
+
+    where  norm1  denotes the  one norm of a matrix (maximum column sum),
+    normI  denotes the  infinity norm  of a matrix  (maximum row sum) and
+    normF  denotes the  Frobenius norm of a matrix (square root of sum of
+    squares).  Note that  max(abs(A(i,j)))  is not a  matrix norm.
+
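+    In this distributed version each process scans only its m_loc local
+    rows; the local results are then combined across the 2D process
+    grid with MPI_Allreduce (MPI_MAX for the 'M' and 'I' norms, and
+    MPI_SUM of the per-column sums, followed by a local maximum, for
+    the one norm).
+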
+    Arguments
+    =========
+
+    NORM    (input) CHARACTER*1
+            Specifies the value to be returned in PSLANGS as described above.
+    A       (input) SuperMatrix*
+            The M by N sparse matrix A.
+    GRID    (input) gridinfo_t*
+            The 2D process mesh.
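+
+    Example (an illustrative sketch only; assumes A is a SuperMatrix of
+    type Stype = SLU_NR_loc, Dtype = SLU_S, and grid is an initialized
+    gridinfo_t):
+
+        float anorm = pslangs("1", &A, &grid);  /* one norm of A */
+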
+   =====================================================================
+</pre>
+*/ + +float pslangs(char *norm, SuperMatrix *A, gridinfo_t *grid) +{ + /* Local variables */ + NRformat_loc *Astore; + int_t m_loc; + float *Aval; + int_t i, j, jcol; + float value=0., sum; + float *rwork; + float tempvalue; + float *temprwork; + + Astore = (NRformat_loc *) A->Store; + m_loc = Astore->m_loc; + Aval = (float *) Astore->nzval; + + if ( SUPERLU_MIN(A->nrow, A->ncol) == 0) { + value = 0.; + } else if ( strncmp(norm, "M", 1)==0 ) { + /* Find max(abs(A(i,j))). */ + value = 0.; + for (i = 0; i < m_loc; ++i) { + for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) + value = SUPERLU_MAX( value, fabs(Aval[j]) ); + } + + MPI_Allreduce(&value, &tempvalue, 1, MPI_FLOAT, MPI_MAX, grid->comm); + value = tempvalue; + + } else if ( strncmp(norm, "O", 1)==0 || *(unsigned char *)norm == '1') { + /* Find norm1(A). */ + value = 0.; +#if 0 + for (j = 0; j < A->ncol; ++j) { + sum = 0.; + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) + sum += fabs(Aval[i]); + value = SUPERLU_MAX(value,sum); + } +#else /* Sherry ==> */ + if ( !(rwork = floatCalloc_dist(A->ncol)) ) + ABORT("floatCalloc_dist fails for rwork."); + for (i = 0; i < m_loc; ++i) { + for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { + jcol = Astore->colind[j]; + rwork[jcol] += fabs(Aval[j]); + } + } + + if ( !(temprwork = floatCalloc_dist(A->ncol)) ) + ABORT("floatCalloc_dist fails for temprwork."); + MPI_Allreduce(rwork, temprwork, A->ncol, MPI_FLOAT, MPI_SUM, grid->comm); + value = 0.; + for (j = 0; j < A->ncol; ++j) { + value = SUPERLU_MAX(value, temprwork[j]); + } + SUPERLU_FREE (temprwork); + SUPERLU_FREE (rwork); +#endif + } else if ( strncmp(norm, "I", 1)==0 ) { + /* Find normI(A). */ + value = 0.; + sum = 0.; + for (i = 0; i < m_loc; ++i) { + for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) + sum += fabs(Aval[j]); + value = SUPERLU_MAX(value, sum); + } + MPI_Allreduce(&value, &tempvalue, 1, MPI_FLOAT, MPI_MAX, grid->comm); + value = tempvalue; + + } else if ( strncmp(norm, "F", 1)==0 || strncmp(norm, "E", 1)==0 ) { + /* Find normF(A). */ + ABORT("Not implemented."); + } else { + ABORT("Illegal norm specified."); + } + + return (value); + +} /* pslangs */ diff --git a/SRC/pslaqgs.c b/SRC/pslaqgs.c new file mode 100644 index 00000000..3fbf9451 --- /dev/null +++ b/SRC/pslaqgs.c @@ -0,0 +1,151 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Equilibrates a general sparse M by N matrix + * + *
+ * File name:	pslaqgs.c
+ * History:     Modified from LAPACK routine SLAQGE
+ * </pre>
+ */
+#include <math.h>
+#include "superlu_sdefs.h"
+
+/*! \brief
+
+<pre>
+    Purpose
+    =======
+
+    PSLAQGS equilibrates a general sparse M by N matrix A using the row
+    and column scaling factors in the vectors R and C.
+
+    See supermatrix.h for the definition of 'SuperMatrix' structure.
+
+    Arguments
+    =========
+
+    A       (input/output) SuperMatrix*
+            On exit, the equilibrated matrix.  See EQUED for the form of
+            the equilibrated matrix. The type of A can be:
+	    Stype = SLU_NR_loc; Dtype = SLU_S; Mtype = SLU_GE.
+
+    R       (input) float*, dimension (A->nrow)
+            The row scale factors for A.
+
+    C       (input) float*, dimension (A->ncol)
+            The column scale factors for A.
+
+    ROWCND  (input) float
+            Ratio of the smallest R(i) to the largest R(i).
+
+    COLCND  (input) float
+            Ratio of the smallest C(i) to the largest C(i).
+
+    AMAX    (input) float
+            Absolute value of largest matrix entry.
+
+    EQUED   (output) char*
+            Specifies the form of equilibration that was done.
+            = 'N':  No equilibration
+            = 'R':  Row equilibration, i.e., A has been premultiplied by
+                    diag(R).
+            = 'C':  Column equilibration, i.e., A has been postmultiplied
+                    by diag(C).
+            = 'B':  Both row and column equilibration, i.e., A has been
+                    replaced by diag(R) * A * diag(C).
+
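+    For example, when ROWCND >= THRESH, COLCND < THRESH, and AMAX lies
+    in [SMALL, LARGE], only column scaling is applied and EQUED = 'C',
+    matching the first branch of the code below.
+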
+    Internal Parameters
+    ===================
+
+    THRESH is a threshold value used to decide if row or column scaling
+    should be done based on the ratio of the row or column scaling
+    factors.  If ROWCND < THRESH, row scaling is done, and if
+    COLCND < THRESH, column scaling is done.
+
+    LARGE and SMALL are threshold values used to decide if row scaling
+    should be done based on the absolute size of the largest matrix
+    element.  If AMAX > LARGE or AMAX < SMALL, row scaling is done.
+
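+    Example (an illustrative sketch only; r, c, rowcnd, colcnd and amax
+    are assumed to come from a separate equilibration-statistics
+    routine):
+
+        char equed;
+        pslaqgs(&A, r, c, rowcnd, colcnd, amax, &equed);
+        if (equed == 'B') { /* A overwritten by diag(R) * A * diag(C) */ }
+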
+    =====================================================================
+</pre>
+*/ + +void +pslaqgs(SuperMatrix *A, float *r, float *c, + float rowcnd, float colcnd, float amax, char *equed) +{ + +#define THRESH (0.1) + + /* Local variables */ + NRformat_loc *Astore; + float *Aval; + int_t i, j, irow, jcol, m_loc; + float large, small; + + /* Quick return if possible */ + if (A->nrow <= 0 || A->ncol <= 0) { + *(unsigned char *)equed = 'N'; + return; + } + + Astore = A->Store; + Aval = Astore->nzval; + m_loc = Astore->m_loc; + + /* Initialize LARGE and SMALL. */ + small = smach_dist("Safe minimum") / smach_dist("Precision"); + large = 1. / small; + + if (rowcnd >= THRESH && amax >= small && amax <= large) { + if (colcnd >= THRESH) + *(unsigned char *)equed = 'N'; + else { + /* Column scaling */ + irow = Astore->fst_row; + for (i = 0; i < m_loc; ++i) { + for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { + jcol = Astore->colind[j]; + Aval[j] *= c[jcol]; + } + ++irow; + } + *(unsigned char *)equed = 'C'; + } + } else if (colcnd >= THRESH) { + /* Row scaling, no column scaling */ + irow = Astore->fst_row; + for (i = 0; i < m_loc; ++i) { + for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) + Aval[j] *= r[irow]; + ++irow; + } + *(unsigned char *)equed = 'R'; + } else { + /* Both row and column scaling */ + irow = Astore->fst_row; + for (i = 0; i < m_loc; ++i) { + for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { + jcol = Astore->colind[j]; + Aval[j] = Aval[j] * r[irow] * c[jcol]; + } + ++irow; + } + *(unsigned char *)equed = 'B'; + } + + return; + +} /* pslaqgs */ + diff --git a/SRC/pssymbfact_distdata.c b/SRC/pssymbfact_distdata.c new file mode 100644 index 00000000..1488ee18 --- /dev/null +++ b/SRC/pssymbfact_distdata.c @@ -0,0 +1,2831 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Redistribute the symbolic structure of L and U from the distribution + * + *
+ * -- Parallel symbolic factorization auxiliary routine (version 2.3) --
+ * -- Distributes the data from parallel symbolic factorization
+ * -- to numeric factorization
+ * INRIA France -  July 1, 2004
+ * Laura Grigori
+ *
+ * November 1, 2007
+ * February 20, 2008
+ * October 15, 2008
+ * </pre>
+ */
+
+/* limits.h: the largest positive integer (INT_MAX) */
+#include <limits.h>
+
+#include "superlu_sdefs.h"
+#include "psymbfact.h"
+
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * Redistribute the symbolic structure of L and U from the distribution
+ * used in the parallel symbolic factorization step to the distribution
+ * used in the parallel numeric factorization step.  On exit, the L and U
+ * structure for the 2D distribution used in the numeric factorization step is
+ * stored in p_xlsub, p_lsub, p_xusub, p_usub.  The global supernodal
+ * information is also computed and it is stored in Glu_persist->supno
+ * and Glu_persist->xsup.
+ *
+ * This routine allocates memory for storing the structure of L and U
+ * and the supernodes information.  This represents the arrays:
+ * p_xlsub, p_lsub, p_xusub, p_usub,
+ * Glu_persist->supno,  Glu_persist->xsup.
+ *
+ * This routine also deallocates memory allocated during symbolic
+ * factorization routine.  That is, the following arrays are freed:
+ * Pslu_freeable->xlsub,  Pslu_freeable->lsub,
+ * Pslu_freeable->xusub, Pslu_freeable->usub,
+ * Pslu_freeable->globToLoc, Pslu_freeable->supno_loc,
+ * Pslu_freeable->xsup_beg_loc, Pslu_freeable->xsup_end_loc.
+ *
+ * Arguments
+ * =========
+ *
+ * n      (Input) int_t
+ *        Order of the input matrix
+ * Pslu_freeable  (Input) Pslu_freeable_t *
+ *        Local L and U structure,
+ *        global to local indexing information.
+ *
+ * Glu_persist (Output) Glu_persist_t *
+ *        Stores on output the information on supernodes mapping.
+ *
+ * p_xlsub (Output) int_t **
+ *         Pointer to structure of L distributed on a 2D grid
+ *         of processors, stored by columns.
+ *
+ * p_lsub  (Output) int_t **
+ *         Structure of L distributed on a 2D grid of processors,
+ *         stored by columns.
+ *
+ * p_xusub (Output) int_t **
+ *         Pointer to structure of U distributed on a 2D grid
+ *         of processors, stored by rows.
+ *
+ * p_usub  (Output) int_t **
+ *         Structure of U distributed on a 2D grid of processors,
+ *         stored by rows.
+ *
+ * grid   (Input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * Return value
+ * ============
+ *   < 0, the negative of the number of bytes allocated on successful
+ *        return from dist_symbLU.
+ *   > 0, number of bytes allocated in this routine when out of memory
+ *        (an approximation).
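+ *
+ * A sketch of this calling convention (variable names illustrative):
+ *
+ *     float mem = dist_symbLU(n, Pslu_freeable, Glu_persist,
+ *                             &xlsub, &lsub, &xusub, &usub, grid);
+ *     if (mem > 0) { /* out of memory: ~mem bytes were requested */ }
+ *     else         { /* success: -mem bytes remain allocated     */ }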
+ * </pre>
+ */ + +static float +dist_symbLU (int_t n, Pslu_freeable_t *Pslu_freeable, + Glu_persist_t *Glu_persist, + int_t **p_xlsub, int_t **p_lsub, int_t **p_xusub, int_t **p_usub, + gridinfo_t *grid + ) +{ + int iam, nprocs, pc, pr, p, np, p_diag; + int_t *nnzToSend, *nnzToRecv, *nnzToSend_l, *nnzToSend_u, + *tmp_ptrToSend, *mem; + int_t *nnzToRecv_l, *nnzToRecv_u; + int_t *send_1, *send_2, nsend_1, nsend_2; + int_t *ptrToSend, *ptrToRecv, sendL, sendU, *snd_luind, *rcv_luind; + int_t nsupers, nsupers_i, nsupers_j; + int *nvtcs, *intBuf1, *intBuf2, *intBuf3, *intBuf4, intNvtcs_loc; + int_t maxszsn, maxNvtcsPProc; + int_t *xsup_n, *supno_n, *temp, *xsup_beg_s, *xsup_end_s, *supno_s; + int_t *xlsub_s, *lsub_s, *xusub_s, *usub_s; + int_t *xlsub_n, *lsub_n, *xusub_n, *usub_n; + int_t *xsub_s, *sub_s, *xsub_n, *sub_n; + int_t *globToLoc, nvtcs_loc; + int_t SendCnt_l, SendCnt_u, nnz_loc_l, nnz_loc_u, nnz_loc, + RecvCnt_l, RecvCnt_u, ind_loc; + int_t i, k, j, gb, szsn, gb_n, gb_s, gb_l, fst_s, fst_s_l, lst_s, i_loc; + int_t nelts, isize; + float memAux; /* Memory used during this routine and freed on return */ + float memRet; /* Memory allocated and not freed on return */ + int_t iword, dword; + + /* ------------------------------------------------------------ + INITIALIZATION. + ------------------------------------------------------------*/ + iam = grid->iam; +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter dist_symbLU()"); +#endif + nprocs = (int) grid->nprow * grid->npcol; + xlsub_s = Pslu_freeable->xlsub; lsub_s = Pslu_freeable->lsub; + xusub_s = Pslu_freeable->xusub; usub_s = Pslu_freeable->usub; + maxNvtcsPProc = Pslu_freeable->maxNvtcsPProc; + globToLoc = Pslu_freeable->globToLoc; + nvtcs_loc = Pslu_freeable->nvtcs_loc; + xsup_beg_s = Pslu_freeable->xsup_beg_loc; + xsup_end_s = Pslu_freeable->xsup_end_loc; + supno_s = Pslu_freeable->supno_loc; + rcv_luind = NULL; + iword = sizeof(int_t); + dword = sizeof(float); + memAux = 0.; memRet = 0.; + + mem = intCalloc_dist(12 * nprocs); + if (!mem) + return (ERROR_RET); + memAux = (float) (12 * nprocs * sizeof(int_t)); + nnzToRecv = mem; + nnzToSend = nnzToRecv + 2*nprocs; + nnzToSend_l = nnzToSend + 2 * nprocs; + nnzToSend_u = nnzToSend_l + nprocs; + send_1 = nnzToSend_u + nprocs; + send_2 = send_1 + nprocs; + tmp_ptrToSend = send_2 + nprocs; + nnzToRecv_l = tmp_ptrToSend + nprocs; + nnzToRecv_u = nnzToRecv_l + nprocs; + + ptrToSend = nnzToSend; + ptrToRecv = nnzToSend + nprocs; + + nvtcs = (int *) SUPERLU_MALLOC(5 * nprocs * sizeof(int)); + intBuf1 = nvtcs + nprocs; + intBuf2 = nvtcs + 2 * nprocs; + intBuf3 = nvtcs + 3 * nprocs; + intBuf4 = nvtcs + 4 * nprocs; + memAux += 5 * nprocs * sizeof(int); + + maxszsn = sp_ienv_dist(3); + + /* Allocate space for storing Glu_persist_n. 
*/ + if ( !(supno_n = intMalloc_dist(n+1)) ) { + fprintf (stderr, "Malloc fails for supno_n[]."); + return (memAux); + } + memRet += (float) ((n+1) * sizeof(int_t)); + + /* ------------------------------------------------------------ + DETERMINE SUPERNODES FOR NUMERICAL FACTORIZATION + ------------------------------------------------------------*/ + + if (nvtcs_loc > INT_MAX) + ABORT("ERROR in dist_symbLU nvtcs_loc > INT_MAX\n"); + intNvtcs_loc = (int) nvtcs_loc; + MPI_Gather (&intNvtcs_loc, 1, MPI_INT, nvtcs, 1, MPI_INT, + 0, grid->comm); + + if (!iam) { + /* set ptrToRecv to point to the beginning of the data for + each processor */ + for (k = 0, p = 0; p < nprocs; p++) { + ptrToRecv[p] = k; + k += nvtcs[p]; + } + } + + if (nprocs > 1) { + temp = NULL; + if (!iam ) { + if ( !(temp = intMalloc_dist (n+1)) ) { + fprintf (stderr, "Malloc fails for temp[]."); + return (memAux + memRet); + } + memAux += (float) (n+1) * iword; + } +#if defined (_LONGINT) + for (p=0; p INT_MAX) + ABORT("ERROR in dist_symbLU size to send > INT_MAX\n"); + intBuf1[p] = (int) ptrToRecv[p]; + } +#else /* Default */ + intBuf1 = ptrToRecv; +#endif + MPI_Gatherv (supno_s, (int) nvtcs_loc, mpi_int_t, + temp, nvtcs, intBuf1, mpi_int_t, 0, grid->comm); + } + else + temp = supno_s; + + if (!iam) { + nsupers = 0; + p = (int) OWNER( globToLoc[0] ); + gb = temp[ptrToRecv[p]]; + supno_n[0] = nsupers; + ptrToRecv[p] ++; + szsn = 1; + for (j = 1; j < n; j ++) { + if (p != (int) OWNER( globToLoc[j] ) || szsn >= maxszsn || gb != temp[ptrToRecv[p]]) { + nsupers ++; + p = (int) OWNER( globToLoc[j] ); + gb = temp[ptrToRecv[p]]; + szsn = 1; + } + else { + szsn ++; + } + ptrToRecv[p] ++; + supno_n[j] = nsupers; + } + nsupers++; + if (nprocs > 1) { + SUPERLU_FREE (temp); + memAux -= (float) (n+1) * iword; + } + supno_n[n] = nsupers; + } + + /* reset to 0 nnzToSend */ + for (p = 0; p < 2 *nprocs; p++) + nnzToSend[p] = 0; + + MPI_Bcast (supno_n, n+1, mpi_int_t, 0, grid->comm); + nsupers = supno_n[n]; + /* Allocate space for storing Glu_persist_n. */ + if ( !(xsup_n = intMalloc_dist(nsupers+1)) ) { + fprintf (stderr, "Malloc fails for xsup_n[]."); + return (memAux + memRet); + } + memRet += (float) (nsupers+1) * iword; + + /* ------------------------------------------------------------ + COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS, + THEN ALLOCATE SPACE. + THIS ACCOUNTS FOR THE FIRST PASS OF L and U. 
+ ------------------------------------------------------------*/ + gb = EMPTY; + for (i = 0; i < n; i++) { + if (gb != supno_n[i]) { + /* a new supernode starts */ + gb = supno_n[i]; + xsup_n[gb] = i; + } + } + xsup_n[nsupers] = n; + + for (p = 0; p < nprocs; p++) { + send_1[p] = FALSE; + send_2[p] = FALSE; + } + for (gb_n = 0; gb_n < nsupers; gb_n ++) { + i = xsup_n[gb_n]; + if (iam == (int) OWNER( globToLoc[i] )) { + pc = PCOL( gb_n, grid ); + pr = PROW( gb_n, grid ); + p_diag = PNUM( pr, pc, grid); + + i_loc = LOCAL_IND( globToLoc[i] ); + gb_s = supno_s[i_loc]; + fst_s = xsup_beg_s[gb_s]; + lst_s = xsup_end_s[gb_s]; + fst_s_l = LOCAL_IND( globToLoc[fst_s] ); + for (j = xlsub_s[fst_s_l]; j < xlsub_s[fst_s_l+1]; j++) { + k = lsub_s[j]; + if (k >= i) { + gb = supno_n[k]; + p = (int) PNUM( PROW(gb, grid), pc, grid ); + nnzToSend[2*p] ++; + send_1[p] = TRUE; + } + } + for (j = xusub_s[fst_s_l]; j < xusub_s[fst_s_l+1]; j++) { + k = usub_s[j]; + if (k >= i + xsup_n[gb_n+1] - xsup_n[gb_n]) { + gb = supno_n[k]; + p = PNUM( pr, PCOL(gb, grid), grid); + nnzToSend[2*p+1] ++; + send_2[p] = TRUE; + } + } + + nsend_2 = 0; + for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) { + nnzToSend[2*p+1] += 2; + if (send_2[p]) nsend_2 ++; + } + for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) + if (send_2[p] || p == p_diag) { + if (p == p_diag && !send_2[p]) + nnzToSend[2*p+1] += nsend_2; + else + nnzToSend[2*p+1] += nsend_2-1; + send_2[p] = FALSE; + } + nsend_1 = 0; + for (p = pc; p < nprocs; p += grid->npcol) { + nnzToSend[2*p] += 2; + if (send_1[p]) nsend_1 ++; + } + for (p = pc; p < nprocs; p += grid->npcol) + if (send_1[p]) { + nnzToSend[2*p] += nsend_1-1; + send_1[p] = FALSE; + } + else + nnzToSend[2*p] += nsend_1; + } + } + + /* All-to-all communication */ + MPI_Alltoall( nnzToSend, 2, mpi_int_t, nnzToRecv, 2, mpi_int_t, + grid->comm); + + nnz_loc_l = nnz_loc_u = 0; + SendCnt_l = SendCnt_u = RecvCnt_l = RecvCnt_u = 0; + for (p = 0; p < nprocs; p++) { + if ( p != iam ) { + SendCnt_l += nnzToSend[2*p]; nnzToSend_l[p] = nnzToSend[2*p]; + SendCnt_u += nnzToSend[2*p+1]; nnzToSend_u[p] = nnzToSend[2*p+1]; + RecvCnt_l += nnzToRecv[2*p]; nnzToRecv_l[p] = nnzToRecv[2*p]; + RecvCnt_u += nnzToRecv[2*p+1]; nnzToRecv_u[p] = nnzToRecv[2*p+1]; + } else { + nnz_loc_l += nnzToRecv[2*p]; + nnz_loc_u += nnzToRecv[2*p+1]; + nnzToSend_l[p] = 0; nnzToSend_u[p] = 0; + nnzToRecv_l[p] = nnzToRecv[2*p]; + nnzToRecv_u[p] = nnzToRecv[2*p+1]; + } + } + + /* Allocate space for storing the symbolic structure after redistribution. */ + nsupers_i = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ + nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */ + if ( !(xlsub_n = intCalloc_dist(nsupers_j+1)) ) { + fprintf (stderr, "Malloc fails for xlsub_n[]."); + return (memAux + memRet); + } + memRet += (float) (nsupers_j+1) * iword; + + if ( !(xusub_n = intCalloc_dist(nsupers_i+1)) ) { + fprintf (stderr, "Malloc fails for xusub_n[]."); + return (memAux + memRet); + } + memRet += (float) (nsupers_i+1) * iword; + + /* Allocate temp storage for sending/receiving the L/U symbolic structure. 
*/ + if ( (RecvCnt_l + nnz_loc_l) || (RecvCnt_u + nnz_loc_u) ) { + if (!(rcv_luind = + intMalloc_dist(SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u))) ) { + fprintf (stderr, "Malloc fails for rcv_luind[]."); + return (memAux + memRet); + } + memAux += (float) SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u) + * iword; + } + if ( nprocs > 1 && (SendCnt_l || SendCnt_u) ) { + if (!(snd_luind = intMalloc_dist(SUPERLU_MAX(SendCnt_l, SendCnt_u))) ) { + fprintf (stderr, "Malloc fails for index[]."); + return (memAux + memRet); + } + memAux += (float) SUPERLU_MAX(SendCnt_l, SendCnt_u) * iword; + } + + /* ------------------------------------------------------------------ + LOAD THE SYMBOLIC STRUCTURE OF L AND U INTO THE STRUCTURES TO SEND. + THIS ACCOUNTS FOR THE SECOND PASS OF L and U. + ------------------------------------------------------------------*/ + sendL = TRUE; + sendU = FALSE; + while (sendL || sendU) { + if (sendL) { + xsub_s = xlsub_s; sub_s = lsub_s; xsub_n = xlsub_n; + nnzToSend = nnzToSend_l; nnzToRecv = nnzToRecv_l; + } + if (sendU) { + xsub_s = xusub_s; sub_s = usub_s; xsub_n = xusub_n; + nnzToSend = nnzToSend_u; nnzToRecv = nnzToRecv_u; + } + for (i = 0, j = 0, p = 0; p < nprocs; p++) { + if ( p != iam ) { + ptrToSend[p] = i; i += nnzToSend[p]; + } + ptrToRecv[p] = j; j += nnzToRecv[p]; + } + nnzToRecv[iam] = 0; + + ind_loc = ptrToRecv[iam]; + for (gb_n = 0; gb_n < nsupers; gb_n++) { + nsend_2 = 0; + i = xsup_n[gb_n]; + if (iam == OWNER( globToLoc[i] )) { + pc = PCOL( gb_n, grid ); + pr = PROW( gb_n, grid ); + p_diag = PNUM( pr, pc, grid ); + + i_loc = LOCAL_IND( globToLoc[i] ); + gb_s = supno_s[i_loc]; + fst_s = xsup_beg_s[gb_s]; + lst_s = xsup_end_s[gb_s]; + fst_s_l = LOCAL_IND( globToLoc[fst_s] ); + + if (sendL) { + p = pc; np = grid->nprow; + } else { + p = pr * grid->npcol; np = grid->npcol; + } + for (j = 0; j < np; j++) { + if (p == iam) { + rcv_luind[ind_loc] = gb_n; + rcv_luind[ind_loc+1] = 0; + tmp_ptrToSend[p] = ind_loc + 1; + ind_loc += 2; + } + else { + snd_luind[ptrToSend[p]] = gb_n; + snd_luind[ptrToSend[p]+1] = 0; + tmp_ptrToSend[p] = ptrToSend[p] + 1; + ptrToSend[p] += 2; + } + if (sendL) p += grid->npcol; + if (sendU) p++; + } + for (j = xsub_s[fst_s_l]; j < xsub_s[fst_s_l+1]; j++) { + k = sub_s[j]; + if ((sendL && k >= i) || (sendU && k >= i + xsup_n[gb_n+1] - xsup_n[gb_n])) { + gb = supno_n[k]; + if (sendL) + p = PNUM( PROW(gb, grid), pc, grid ); + else + p = PNUM( pr, PCOL(gb, grid), grid); + if (send_1[p] == FALSE) { + send_1[p] = TRUE; + send_2[nsend_2] = k; nsend_2 ++; + } + if (p == iam) { + rcv_luind[ind_loc] = k; ind_loc++; + if (sendL) + xsub_n[LBj( gb_n, grid )] ++; + else + xsub_n[LBi( gb_n, grid )] ++; + } + else { + snd_luind[ptrToSend[p]] = k; + ptrToSend[p] ++; snd_luind[tmp_ptrToSend[p]] ++; + } + } + } + if (sendL) + for (p = pc; p < nprocs; p += grid->npcol) { + for (k = 0; k < nsend_2; k++) { + gb = supno_n[send_2[k]]; + if (PNUM(PROW(gb, grid), pc, grid) != p) { + if (p == iam) { + rcv_luind[ind_loc] = send_2[k]; ind_loc++; + xsub_n[LBj( gb_n, grid )] ++; + } + else { + snd_luind[ptrToSend[p]] = send_2[k]; + ptrToSend[p] ++; snd_luind[tmp_ptrToSend[p]] ++; + } + } + } + send_1[p] = FALSE; + } + if (sendU) + for (p = pr * grid->npcol; p < (pr + 1) * grid->npcol; p++) { + if (send_1[p] || p == p_diag) { + for (k = 0; k < nsend_2; k++) { + gb = supno_n[send_2[k]]; + if(PNUM( pr, PCOL(gb, grid), grid) != p) { + if (p == iam) { + rcv_luind[ind_loc] = send_2[k]; ind_loc++; + xsub_n[LBi( gb_n, grid )] ++; + } + else { + 
snd_luind[ptrToSend[p]] = send_2[k]; + ptrToSend[p] ++; snd_luind[tmp_ptrToSend[p]] ++; + } + } + } + send_1[p] = FALSE; + } + } + } + } + + /* reset ptrToSnd to point to the beginning of the data for + each processor (structure needed in MPI_Alltoallv) */ + for (i = 0, p = 0; p < nprocs; p++) { + ptrToSend[p] = i; i += nnzToSend[p]; + } + + /* ------------------------------------------------------------ + PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION. + Note: it uses MPI_Alltoallv. + ------------------------------------------------------------*/ + if (nprocs > 1) { +#if defined (_LONGINT) + nnzToSend[iam] = 0; + for (p=0; p INT_MAX || ptrToSend[p] > INT_MAX || + nnzToRecv[p] > INT_MAX || ptrToRecv[p] > INT_MAX) + ABORT("ERROR in dist_symbLU size to send > INT_MAX\n"); + intBuf1[p] = (int) nnzToSend[p]; + intBuf2[p] = (int) ptrToSend[p]; + intBuf3[p] = (int) nnzToRecv[p]; + intBuf4[p] = (int) ptrToRecv[p]; + } +#else /* Default */ + intBuf1 = nnzToSend; intBuf2 = ptrToSend; + intBuf3 = nnzToRecv; intBuf4 = ptrToRecv; +#endif + + MPI_Alltoallv (snd_luind, intBuf1, intBuf2, mpi_int_t, + rcv_luind, intBuf3, intBuf4, mpi_int_t, + grid->comm); + } + if (sendL) + nnzToRecv[iam] = nnz_loc_l; + else + nnzToRecv[iam] = nnz_loc_u; + + /* ------------------------------------------------------------ + DEALLOCATE TEMPORARY STORAGE. + -------------------------------------------------------------*/ + if (sendU) + if ( nprocs > 1 && (SendCnt_l || SendCnt_u) ) { + SUPERLU_FREE (snd_luind); + memAux -= (float) SUPERLU_MAX(SendCnt_l, SendCnt_u) * iword; + } + + /* ------------------------------------------------------------ + CONVERT THE FORMAT. + ------------------------------------------------------------*/ + /* Initialize the array of column of L/ row of U pointers */ + k = 0; + for (p = 0; p < nprocs; p ++) { + if (p != iam) { + i = k; + while (i < k + nnzToRecv[p]) { + gb = rcv_luind[i]; + nelts = rcv_luind[i+1]; + if (sendL) + xsub_n[LBj( gb, grid )] = nelts; + else + xsub_n[LBi( gb, grid )] = nelts; + i += nelts + 2; + } + } + k += nnzToRecv[p]; + } + + if (sendL) j = nsupers_j; + else j = nsupers_i; + k = 0; + isize = xsub_n[0]; + xsub_n[0] = 0; + for (gb_l = 1; gb_l < j; gb_l++) { + k += isize; + isize = xsub_n[gb_l]; + xsub_n[gb_l] = k; + } + xsub_n[gb_l] = k + isize; + nnz_loc = xsub_n[gb_l]; + if (sendL) { + lsub_n = NULL; + if (nnz_loc) { + if ( !(lsub_n = intMalloc_dist(nnz_loc)) ) { + fprintf (stderr, "Malloc fails for lsub_n[]."); + return (memAux + memRet); + } + memRet += (float) (nnz_loc * iword); + } + sub_n = lsub_n; + } + if (sendU) { + usub_n = NULL; + if (nnz_loc) { + if ( !(usub_n = intMalloc_dist(nnz_loc)) ) { + fprintf (stderr, "Malloc fails for usub_n[]."); + return (memAux + memRet); + } + memRet += (float) (nnz_loc * iword); + } + sub_n = usub_n; + } + + /* Copy the data into the L column / U row oriented storage */ + k = 0; + for (p = 0; p < nprocs; p++) { + i = k; + while (i < k + nnzToRecv[p]) { + gb = rcv_luind[i]; + if (gb >= nsupers) + printf ("Pe[%d] p %d gb " IFMT " nsupers " IFMT " i " IFMT " i-k " IFMT "\n", + iam, p, gb, nsupers, i, i-k); + i += 2; + if (sendL) gb_l = LBj( gb, grid ); + if (sendU) gb_l = LBi( gb, grid ); + for (j = xsub_n[gb_l]; j < xsub_n[gb_l+1]; i++, j++) { + sub_n[j] = rcv_luind[i]; + } + } + k += nnzToRecv[p]; + } + if (sendL) { + sendL = FALSE; sendU = TRUE; + } + else + sendU = FALSE; + } + + /* deallocate memory allocated during symbolic factorization routine */ + if (rcv_luind != NULL) { + SUPERLU_FREE (rcv_luind); + memAux 
-= (float) SUPERLU_MAX(RecvCnt_l+nnz_loc_l, RecvCnt_u+nnz_loc_u) * iword; + } + SUPERLU_FREE (mem); + memAux -= (float) (12 * nprocs * iword); + SUPERLU_FREE(nvtcs); + memAux -= (float) (5 * nprocs * sizeof(int)); + + if (xlsub_s != NULL) { + SUPERLU_FREE (xlsub_s); SUPERLU_FREE (lsub_s); + } + if (xusub_s != NULL) { + SUPERLU_FREE (xusub_s); SUPERLU_FREE (usub_s); + } + SUPERLU_FREE (globToLoc); + if (supno_s != NULL) { + SUPERLU_FREE (xsup_beg_s); SUPERLU_FREE (xsup_end_s); + SUPERLU_FREE (supno_s); + } + + Glu_persist->supno = supno_n; Glu_persist->xsup = xsup_n; + *p_xlsub = xlsub_n; *p_lsub = lsub_n; + *p_xusub = xusub_n; *p_usub = usub_n; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit dist_symbLU()"); +#endif + + return (-memRet); +} + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *   Re-distribute A on the 2D process mesh.  The lower part is
+ *   stored using a column format and the upper part
+ *   is stored using a row format.
+ *
+ * Arguments
+ * =========
+ *
+ * A      (Input) SuperMatrix*
+ *	  The distributed input matrix A of dimension (A->nrow, A->ncol).
+ *        The type of A can be: Stype = SLU_NR_loc; Dtype = SLU_S; Mtype = SLU_GE.
+ *
+ * ScalePermstruct (Input) sScalePermstruct_t*
+ *        The data structure to store the scaling and permutation vectors
+ *        describing the transformations performed to the original matrix A.
+ *
+ * Glu_persist  (Input) Glu_persist_t *
+ *        Information on supernodes mapping.
+ *
+ * grid   (Input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * p_ainf_colptr (Output) int_t**
+ *         Pointer to the lower part of A distributed on a 2D grid
+ *         of processors, stored by columns.
+ *
+ * p_ainf_rowind (Output) int_t**
+ *         Structure of the lower part of A distributed on a
+ *         2D grid of processors, stored by columns.
+ *
+ * p_ainf_val    (Output) float**
+ *         Numerical values of the lower part of A, distributed on a
+ *         2D grid of processors, stored by columns.
+ *
+ * p_asup_rowptr (Output) int_t**
+ *         Pointer to the upper part of A distributed on a 2D grid
+ *         of processors, stored by rows.
+ *
+ * p_asup_colind (Output) int_t**
+ *         Structure of the upper part of A distributed on a
+ *         2D grid of processors, stored by rows.
+ *
+ * p_asup_val    (Output) float**
+ *         Numerical values of the upper part of A, distributed on a
+ *         2D grid of processors, stored by rows.
+ *
+ * ilsum_i  (Input) int_t *
+ *       Starting position of each supernode in
+ *       the full array (local, block row wise).
+ *
+ * ilsum_j  (Input) int_t *
+ *       Starting position of each supernode in
+ *       the full array (local, block column wise).
+ *
+ * Return value
+ * ============
+ *   < 0, the negative of the number of bytes allocated on successful
+ *        return from sdist_A.
+ *   > 0, number of bytes allocated when out of memory
+ *        (an approximation).
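+ *
+ * Note: each entry (irow, jcol) is routed by its block coordinates
+ * gbi = BlockNum(irow), gbj = BlockNum(jcol): entries with gbi >= gbj
+ * go to the lower, column-oriented part (ainf_*), the rest to the
+ * upper, row-oriented part (asup_*), as in the distribution loops
+ * below.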
+ * </pre>
+ */ + +static float +sdist_A(SuperMatrix *A, sScalePermstruct_t *ScalePermstruct, + Glu_persist_t *Glu_persist, gridinfo_t *grid, + int_t **p_ainf_colptr, int_t **p_ainf_rowind, float **p_ainf_val, + int_t **p_asup_rowptr, int_t **p_asup_colind, float **p_asup_val, + int_t *ilsum_i, int_t *ilsum_j + ) +{ + int iam, p, procs; + NRformat_loc *Astore; + int_t *perm_r; /* row permutation vector */ + int_t *perm_c; /* column permutation vector */ + int_t i, it, irow, fst_row, j, jcol, k, gbi, gbj, n, m_loc, jsize, isize; + int_t nsupers, nsupers_i, nsupers_j; + int_t nnz_loc, nnz_loc_ainf, nnz_loc_asup; /* number of local nonzeros */ + int_t SendCnt; /* number of remote nonzeros to be sent */ + int_t RecvCnt; /* number of remote nonzeros to be sent */ + int_t *ainf_colptr, *ainf_rowind, *asup_rowptr, *asup_colind; + float *asup_val, *ainf_val; + int_t *nnzToSend, *nnzToRecv, maxnnzToRecv; + int_t *ia, *ja, **ia_send, *index, *itemp; + int_t *ptr_to_send; + float *aij, **aij_send, *nzval, *dtemp; + float *nzval_a; + MPI_Request *send_req; + MPI_Status status; + int_t *xsup = Glu_persist->xsup; /* supernode and column mapping */ + int_t *supno = Glu_persist->supno; + float memAux; /* Memory used during this routine and freed on return */ + float memRet; /* Memory allocated and not freed on return */ + int_t iword, dword, szbuf; + + /* ------------------------------------------------------------ + INITIALIZATION. + ------------------------------------------------------------*/ + iam = grid->iam; +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter sdist_A()"); +#endif + iword = sizeof(int_t); + dword = sizeof(double); + + perm_r = ScalePermstruct->perm_r; + perm_c = ScalePermstruct->perm_c; + procs = grid->nprow * grid->npcol; + Astore = (NRformat_loc *) A->Store; + n = A->ncol; + m_loc = Astore->m_loc; + fst_row = Astore->fst_row; + if (!(nnzToRecv = intCalloc_dist(2*procs))) { + fprintf (stderr, "Malloc fails for nnzToRecv[]."); + return (ERROR_RET); + } + memAux = (float) (2 * procs * iword); + memRet = 0.; + nnzToSend = nnzToRecv + procs; + nsupers = supno[n-1] + 1; + + /* ------------------------------------------------------------ + COUNT THE NUMBER OF NONZEROS TO BE SENT TO EACH PROCESS, + THEN ALLOCATE SPACE. + THIS ACCOUNTS FOR THE FIRST PASS OF A. + ------------------------------------------------------------*/ + for (i = 0; i < m_loc; ++i) { + for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { + irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */ + jcol = Astore->colind[j]; + gbi = BlockNum( irow ); + gbj = BlockNum( jcol ); + p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid ); + ++nnzToSend[p]; + } + } + + /* All-to-all communication */ + MPI_Alltoall( nnzToSend, 1, mpi_int_t, nnzToRecv, 1, mpi_int_t, + grid->comm); + + maxnnzToRecv = 0; + nnz_loc = SendCnt = RecvCnt = 0; + + for (p = 0; p < procs; ++p) { + if ( p != iam ) { + SendCnt += nnzToSend[p]; + RecvCnt += nnzToRecv[p]; + maxnnzToRecv = SUPERLU_MAX( nnzToRecv[p], maxnnzToRecv ); + } else { + nnz_loc += nnzToRecv[p]; + /*assert(nnzToSend[p] == nnzToRecv[p]);*/ + } + } + k = nnz_loc + RecvCnt; /* Total nonzeros ended up in my process. */ + szbuf = k; + + /* Allocate space for storing the triplets after redistribution. 
*/ + if ( !(ia = intMalloc_dist(2*k)) ) { + fprintf (stderr, "Malloc fails for ia[]."); + return (memAux); + } + memAux += (float) (2*k*iword); + ja = ia + k; + if ( !(aij = floatMalloc_dist(k)) ) { + fprintf (stderr, "Malloc fails for aij[]."); + return (memAux); + } + memAux += (float) (k*dword); + + /* Allocate temporary storage for sending/receiving the A triplets. */ + if ( procs > 1 ) { + if ( !(send_req = (MPI_Request *) + SUPERLU_MALLOC(2*procs *sizeof(MPI_Request))) ) { + fprintf (stderr, "Malloc fails for send_req[]."); + return (memAux); + } + memAux += (float) (2*procs *sizeof(MPI_Request)); + if ( !(ia_send = (int_t **) SUPERLU_MALLOC(procs*sizeof(int_t*))) ) { + fprintf(stderr, "Malloc fails for ia_send[]."); + return (memAux); + } + memAux += (float) (procs*sizeof(int_t*)); + if ( !(aij_send = (float **)SUPERLU_MALLOC(procs*sizeof(float*))) ) { + fprintf(stderr, "Malloc fails for aij_send[]."); + return (memAux); + } + memAux += (float) (procs*sizeof(float*)); + if ( !(index = intMalloc_dist(2*SendCnt)) ) { + fprintf(stderr, "Malloc fails for index[]."); + return (memAux); + } + memAux += (float) (2*SendCnt*iword); + if ( !(nzval = floatMalloc_dist(SendCnt)) ) { + fprintf(stderr, "Malloc fails for nzval[]."); + return (memAux); + } + memAux += (float) (SendCnt * dword); + if ( !(ptr_to_send = intCalloc_dist(procs)) ) { + fprintf(stderr, "Malloc fails for ptr_to_send[]."); + return (memAux); + } + memAux += (float) (procs * iword); + if ( !(itemp = intMalloc_dist(2*maxnnzToRecv)) ) { + fprintf(stderr, "Malloc fails for itemp[]."); + return (memAux); + } + memAux += (float) (2*maxnnzToRecv*iword); + if ( !(dtemp = floatMalloc_dist(maxnnzToRecv)) ) { + fprintf(stderr, "Malloc fails for dtemp[]."); + return (memAux); + } + memAux += (float) (maxnnzToRecv * dword); + + for (i = 0, j = 0, p = 0; p < procs; ++p) { + if ( p != iam ) { + ia_send[p] = &index[i]; + i += 2 * nnzToSend[p]; /* ia/ja indices alternate */ + aij_send[p] = &nzval[j]; + j += nnzToSend[p]; + } + } + } /* if procs > 1 */ + + nsupers_i = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ + nsupers_j = CEILING( nsupers, grid->npcol ); /* Number of local block columns */ + if ( !(ainf_colptr = intCalloc_dist(ilsum_j[nsupers_j] + 1)) ) { + fprintf (stderr, "Malloc fails for *ainf_colptr[]."); + return (memAux); + } + memRet += (float) (ilsum_j[nsupers_j] + 1) * iword; + if ( !(asup_rowptr = intCalloc_dist(ilsum_i[nsupers_i] + 1)) ) { + fprintf (stderr, "Malloc fails for *asup_rowptr[]."); + return (memAux+memRet); + } + memRet += (float) (ilsum_i[nsupers_i] + 1) * iword; + + /* ------------------------------------------------------------ + LOAD THE ENTRIES OF A INTO THE (IA,JA,AIJ) STRUCTURES TO SEND. + THIS ACCOUNTS FOR THE SECOND PASS OF A. + ------------------------------------------------------------*/ + nnz_loc = 0; /* Reset the local nonzero count. 
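+       (Each received or local entry is routed exactly as in the counting
+       pass: gbi = BlockNum(irow), gbj = BlockNum(jcol), and
+       p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid ).  Worked example,
+       assuming the usual cyclic maps PROW(b) = b % nprow and
+       PCOL(b) = b % npcol on a 2 x 3 grid: block row 5, block column 4
+       lands on process row 1, process column 1, that is rank 4 if ranks
+       are laid out row-major.)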
*/ + nnz_loc_ainf = nnz_loc_asup = 0; + nzval_a = Astore->nzval; + for (i = 0; i < m_loc; ++i) { + for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) { + irow = perm_c[perm_r[i+fst_row]]; /* Row number in Pc*Pr*A */ + jcol = Astore->colind[j]; + gbi = BlockNum( irow ); + gbj = BlockNum( jcol ); + p = PNUM( PROW(gbi,grid), PCOL(gbj,grid), grid ); + + if ( p != iam ) { /* remote */ + k = ptr_to_send[p]; + ia_send[p][k] = irow; + ia_send[p][k + nnzToSend[p]] = jcol; + aij_send[p][k] = nzval_a[j]; + ++ptr_to_send[p]; + } else { /* local */ + ia[nnz_loc] = irow; + ja[nnz_loc] = jcol; + aij[nnz_loc] = nzval_a[j]; + ++nnz_loc; + /* Count nonzeros in each column of L / row of U */ + if (gbi >= gbj) { + ainf_colptr[ilsum_j[LBj( gbj, grid )] + jcol - FstBlockC( gbj )] ++; + nnz_loc_ainf ++; + } + else { + asup_rowptr[ilsum_i[LBi( gbi, grid )] + irow - FstBlockC( gbi )] ++; + nnz_loc_asup ++; + } + } + } + } + + /* ------------------------------------------------------------ + PERFORM REDISTRIBUTION. THIS INVOLVES ALL-TO-ALL COMMUNICATION. + NOTE: Can possibly use MPI_Alltoallv. + ------------------------------------------------------------*/ + for (p = 0; p < procs; ++p) { + if ( p != iam ) { + it = 2*nnzToSend[p]; + MPI_Isend( ia_send[p], it, mpi_int_t, + p, iam, grid->comm, &send_req[p] ); + it = nnzToSend[p]; + MPI_Isend( aij_send[p], it, MPI_FLOAT, + p, iam+procs, grid->comm, &send_req[procs+p] ); + } + } + + for (p = 0; p < procs; ++p) { + if ( p != iam ) { + it = 2*nnzToRecv[p]; + MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status ); + it = nnzToRecv[p]; + MPI_Recv( dtemp, it, MPI_FLOAT, p, p+procs, + grid->comm, &status ); + for (i = 0; i < nnzToRecv[p]; ++i) { + ia[nnz_loc] = itemp[i]; + irow = itemp[i]; + jcol = itemp[i + nnzToRecv[p]]; + /* assert(jcol= gbj) { + ainf_colptr[ilsum_j[LBj( gbj, grid )] + jcol - FstBlockC( gbj )] ++; + nnz_loc_ainf ++; + } + else { + asup_rowptr[ilsum_i[LBi( gbi, grid )] + irow - FstBlockC( gbi )] ++; + nnz_loc_asup ++; + } + } + } + } + + for (p = 0; p < procs; ++p) { + if ( p != iam ) { + MPI_Wait( &send_req[p], &status); + MPI_Wait( &send_req[procs+p], &status); + } + } + + /* ------------------------------------------------------------ + DEALLOCATE TEMPORARY STORAGE + ------------------------------------------------------------*/ + + SUPERLU_FREE(nnzToRecv); + memAux -= 2 * procs * iword; + if ( procs > 1 ) { + SUPERLU_FREE(send_req); + SUPERLU_FREE(ia_send); + SUPERLU_FREE(aij_send); + SUPERLU_FREE(index); + SUPERLU_FREE(nzval); + SUPERLU_FREE(ptr_to_send); + SUPERLU_FREE(itemp); + SUPERLU_FREE(dtemp); + memAux -= 2*procs *sizeof(MPI_Request) + procs*sizeof(int_t*) + + procs*sizeof(float*) + 2*SendCnt * iword + + SendCnt* dword + procs*iword + + 2*maxnnzToRecv*iword + maxnnzToRecv*dword; + } + + /* ------------------------------------------------------------ + CONVERT THE TRIPLET FORMAT. 
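+       The conversion below is a counting sort: the per-column and
+       per-row counts accumulated above are first turned into starting
+       offsets, each entry is then scattered while bumping its slot's
+       offset, and the pointers are finally shifted back by one slot.
+       Small illustration: counts {2,1,3} become starts {0,2,3}; after
+       the scatter the pointers read {2,3,6}; the shift-back loop
+       restores {0,2,3} with end marker 6.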
+ ------------------------------------------------------------*/ + if (nnz_loc_ainf != 0) { + if ( !(ainf_rowind = intMalloc_dist(nnz_loc_ainf)) ) { + fprintf (stderr, "Malloc fails for *ainf_rowind[]."); + return (memAux+memRet); + } + memRet += (float) (nnz_loc_ainf * iword); + if ( !(ainf_val = floatMalloc_dist(nnz_loc_ainf)) ) { + fprintf (stderr, "Malloc fails for *ainf_val[]."); + return (memAux+memRet); + } + memRet += (float) (nnz_loc_ainf * dword); + } + else { + ainf_rowind = NULL; + ainf_val = NULL; + } + if (nnz_loc_asup != 0) { + if ( !(asup_colind = intMalloc_dist(nnz_loc_asup)) ) { + fprintf (stderr, "Malloc fails for *asup_colind[]."); + return (memAux + memRet); + } + memRet += (float) (nnz_loc_asup * iword); + if ( !(asup_val = floatMalloc_dist(nnz_loc_asup)) ) { + fprintf (stderr, "Malloc fails for *asup_val[]."); + return (memAux + memRet); + } + memRet += (float) (nnz_loc_asup * dword); + } + else { + asup_colind = NULL; + asup_val = NULL; + } + + /* Initialize the array of column pointers */ + k = 0; + jsize = ainf_colptr[0]; ainf_colptr[0] = 0; + for (j = 1; j < ilsum_j[nsupers_j]; j++) { + k += jsize; + jsize = ainf_colptr[j]; + ainf_colptr[j] = k; + } + ainf_colptr[ilsum_j[nsupers_j]] = k + jsize; + i = 0; + isize = asup_rowptr[0]; asup_rowptr[0] = 0; + for (j = 1; j < ilsum_i[nsupers_i]; j++) { + i += isize; + isize = asup_rowptr[j]; + asup_rowptr[j] = i; + } + asup_rowptr[ilsum_i[nsupers_i]] = i + isize; + + /* Copy the triplets into the column oriented storage */ + for (i = 0; i < nnz_loc; ++i) { + jcol = ja[i]; + irow = ia[i]; + gbi = BlockNum( irow ); + gbj = BlockNum( jcol ); + /* Count nonzeros in each column of L / row of U */ + if (gbi >= gbj) { + j = ilsum_j[LBj( gbj, grid )] + jcol - FstBlockC( gbj ); + k = ainf_colptr[j]; + ainf_rowind[k] = irow; + ainf_val[k] = aij[i]; + ainf_colptr[j] ++; + } + else { + j = ilsum_i[LBi( gbi, grid )] + irow - FstBlockC( gbi ); + k = asup_rowptr[j]; + asup_colind[k] = jcol; + asup_val[k] = aij[i]; + asup_rowptr[j] ++; + } + } + + /* Reset the column pointers to the beginning of each column */ + for (j = ilsum_j[nsupers_j]; j > 0; j--) + ainf_colptr[j] = ainf_colptr[j-1]; + for (j = ilsum_i[nsupers_i]; j > 0; j--) + asup_rowptr[j] = asup_rowptr[j-1]; + ainf_colptr[0] = 0; + asup_rowptr[0] = 0; + + SUPERLU_FREE(ia); + SUPERLU_FREE(aij); + memAux -= 2*szbuf*iword + szbuf*dword; + + *p_ainf_colptr = ainf_colptr; + *p_ainf_rowind = ainf_rowind; + *p_ainf_val = ainf_val; + *p_asup_rowptr = asup_rowptr; + *p_asup_colind = asup_colind; + *p_asup_val = asup_val; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Exit sdist_A()"); + fprintf (stdout, "Size of allocated memory (MB) %.3f\n", memRet*1e-6); +#endif + + return (-memRet); +} /* dist_A */ + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *   Distribute the input matrix onto the 2D process mesh.
+ *
+ * Arguments
+ * =========
+ *
+ * fact   (Input) fact_t
+ *        Specifies whether or not the L and U structures will be re-used.
+ *        = SamePattern_SameRowPerm: L and U structures are input, and
+ *                                   unchanged on exit.
+ *          This routine should not be called for this case; an error
+ *          is generated.  Instead, the psdistribute routine should be
+ *          called.
+ *        = DOFACT or SamePattern: L and U structures are computed and output.
+ *
+ * n      (Input) int
+ *        Dimension of the matrix.
+ *
+ * A      (Input) SuperMatrix*
+ *        The distributed input matrix A of dimension (A->nrow, A->ncol).
+ *        A may be overwritten by diag(R)*A*diag(C)*Pc^T.
+ *        The type of A can be: Stype = NR; Dtype = SLU_S; Mtype = GE.
+ *
+ * ScalePermstruct (Input) sScalePermstruct_t*
+ *        The data structure to store the scaling and permutation vectors
+ *        describing the transformations performed to the original matrix A.
+ *
+ * Pslu_freeable (Input) Pslu_freeable_t*
+ *        The global structure describing the graph of L and U.
+ *
+ * LUstruct (Input) sLUstruct_t*
+ *        Data structures for L and U factors.
+ *
+ * grid   (Input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * Return value
+ * ============
+ *   < 0, number of bytes allocated on return from dist_symbLU.
+ *   > 0, number of bytes allocated for performing the distribution
+ *        of the data when out of memory (an approximation).
+ *
+ */ + +float +sdist_psymbtonum(fact_t fact, int_t n, SuperMatrix *A, + sScalePermstruct_t *ScalePermstruct, + Pslu_freeable_t *Pslu_freeable, + sLUstruct_t *LUstruct, gridinfo_t *grid) +{ + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + Glu_freeable_t Glu_freeable_n; + sLocalLU_t *Llu = LUstruct->Llu; + int_t bnnz, fsupc, i, irow, istart, j, jb, ib, jj, k, k1, + len, len1, nsupc, nsupc_gb, ii, nprocs; + int_t lib; /* local block row number */ + int_t nlb; /* local block rows*/ + int_t ljb; /* local block column number */ + int_t nrbl; /* number of L blocks in current block column */ + int_t nrbu; /* number of U blocks in current block column */ + int_t gb; /* global block number; 0 < gb <= nsuper */ + int_t lb; /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */ + int_t ub,gik,iklrow,fnz; + int iam, jbrow, jbcol, jcol, kcol, krow, mycol, myrow, pc, pr, ljb_i, ljb_j, p; + int_t mybufmax[NBUFFERS]; + NRformat_loc *Astore; + float *a; + int_t *asub, *xa; + int_t *ainf_colptr, *ainf_rowind, *asup_rowptr, *asup_colind; + float *asup_val, *ainf_val; + int_t *xsup, *supno; /* supernode and column mapping */ + int_t *lsub, *xlsub, *usub, *usub1, *xusub; + int_t nsupers, nsupers_i, nsupers_j, nsupers_ij; + int_t next_ind; /* next available position in index[*] */ + int_t next_val; /* next available position in nzval[*] */ + int_t *index; /* indices consist of headers and row subscripts */ + int *index1; /* temporary pointer to array of int */ + float *lusup, *uval; /* nonzero values in L and U */ + int_t *recvBuf; + int *ptrToRecv, *nnzToRecv, *ptrToSend, *nnzToSend; + float **Lnzval_bc_ptr; /* size ceil(NSUPERS/Pc) */ + float **Linv_bc_ptr; /* size ceil(NSUPERS/Pc) */ + float **Uinv_bc_ptr; /* size ceil(NSUPERS/Pc) */ + int_t **Lrowind_bc_ptr; /* size ceil(NSUPERS/Pc) */ + int_t **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc) */ + int_t *index_srt; /* indices consist of headers and row subscripts */ + float *lusup_srt; /* nonzero values in L and U */ + float **Unzval_br_ptr; /* size ceil(NSUPERS/Pr) */ + int_t **Ufstnz_br_ptr; /* size ceil(NSUPERS/Pr) */ + int_t *Unnz; /* size ceil(NSUPERS/Pc) */ + + BcTree *LBtree_ptr; /* size ceil(NSUPERS/Pc) */ + RdTree *LRtree_ptr; /* size ceil(NSUPERS/Pr) */ + BcTree *UBtree_ptr; /* size ceil(NSUPERS/Pc) */ + RdTree *URtree_ptr; /* size ceil(NSUPERS/Pr) */ + int msgsize; + + int_t *Urbs,*Urbs1; /* Number of row blocks in each block column of U. */ + Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */ + int_t **Ucb_valptr; /* Vertical linked list pointing to Unzval[] */ + + + /*-- Counts to be used in factorization. --*/ + int *ToRecv, *ToSendD, **ToSendR; + + /*-- Counts to be used in lower triangular solve. --*/ + int_t *fmod; /* Modification count for L-solve. */ + int_t **fsendx_plist; /* Column process list to send down Xk. */ + int_t nfrecvx = 0; /* Number of Xk I will receive. */ + int_t nfsendx = 0; /* Number of Xk I will send */ + int_t kseen; + + /*-- Counts to be used in upper triangular solve. --*/ + int_t *bmod; /* Modification count for U-solve. */ + int_t **bsendx_plist; /* Column process list to send down Xk. */ + int_t nbrecvx = 0; /* Number of Xk I will receive. 
*/ + int_t nbsendx = 0; /* Number of Xk I will send */ + int_t *ilsum; /* starting position of each supernode in + the full array (local) */ + int_t *ilsum_j, ldaspa_j; /* starting position of each supernode in + the full array (local, block column wise) */ + /*-- Auxiliary arrays; freed on return --*/ + int_t *Urb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */ + int_t *LUb_length; /* L,U block length; size nsupers_ij */ + int_t *LUb_indptr; /* pointers to L,U index[]; size nsupers_ij */ + int_t *LUb_number; /* global block number; size nsupers_ij */ + int_t *LUb_valptr; /* pointers to U nzval[]; size ceil(NSUPERS/Pc) */ + int_t *Lrb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */ + int_t *ActiveFlag; + int_t *ActiveFlagAll; + int_t Iactive; + int *ranks; + int_t *idxs; + int_t **nzrows; + double rseed; + int rank_cnt,rank_cnt_ref,Root; +float *dense, *dense_col; /* SPA */ + float zero = 0.0; + int_t ldaspa; /* LDA of SPA */ + int_t iword, dword; + float mem_use = 0.0; + int_t *mod_bit; + int_t *frecv, *brecv, *lloc; + double *SeedSTD_BC,*SeedSTD_RD; + int_t idx_indx,idx_lusup; + int_t nbrow; + int_t ik, il, lk, rel, knsupc, idx_r; + int_t lptr1_tmp, idx_i, idx_v,m, uu; + int_t nub; + + float memStrLU, memA, + memDist = 0.; /* memory used for redistributing the data, which does + not include the memory for the numerical values + of L and U (positive number)*/ + float memNLU = 0.; /* memory allocated for storing the numerical values of + L and U, that will be used in the numeric + factorization (positive number) */ + float memTRS = 0.; /* memory allocated for storing the meta-data for triangular solve (positive number)*/ + +#if ( PRNTlevel>=1 ) + int_t nLblocks = 0, nUblocks = 0; +#endif +#if ( PROFlevel>=1 ) + double t, t_u, t_l; + int_t u_blks; +#endif + + /* Initialization. */ + iam = grid->iam; +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter dist_psymbtonum()"); +#endif + myrow = MYROW( iam, grid ); + mycol = MYCOL( iam, grid ); + nprocs = grid->npcol * grid->nprow; + for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0; + Astore = (NRformat_loc *) A->Store; + + iword = sizeof(int_t); + dword = sizeof(float); + + if (fact == SamePattern_SameRowPerm) { + ABORT ("ERROR: call of dist_psymbtonum with fact equals SamePattern_SameRowPerm."); + } + + if ((memStrLU = + dist_symbLU (n, Pslu_freeable, + Glu_persist, &xlsub, &lsub, &xusub, &usub, grid)) > 0) + return (memStrLU); + memDist += (-memStrLU); + xsup = Glu_persist->xsup; /* supernode and column mapping */ + supno = Glu_persist->supno; + nsupers = supno[n-1] + 1; + nsupers_i = CEILING( nsupers, grid->nprow );/* No of local row blocks */ + nsupers_j = CEILING( nsupers, grid->npcol );/* No of local column blocks */ + nsupers_ij = SUPERLU_MAX(nsupers_i, nsupers_j); + if ( !(ilsum = intMalloc_dist(nsupers_i+1)) ) { + fprintf (stderr, "Malloc fails for ilsum[]."); + return (memDist + memNLU + memTRS); + } + memNLU += (nsupers_i+1) * iword; + if ( !(ilsum_j = intMalloc_dist(nsupers_j+1)) ) { + fprintf (stderr, "Malloc fails for ilsum_j[]."); + return (memDist + memNLU + memTRS); + } + memDist += (nsupers_j+1) * iword; + + /* Compute ldaspa and ilsum[], ldaspa_j and ilsum_j[]. 
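+       (Both are prefix sums over supernode sizes: e.g. if my process
+       row owns blocks of sizes 4, 3 and 5, then ilsum = {0,4,7,12} and
+       ldaspa = 12; ilsum_j and ldaspa_j are the analogues over my
+       process column.)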
*/ + ilsum[0] = 0; + ldaspa = 0; + for (gb = 0; gb < nsupers; gb++) + if ( myrow == PROW( gb, grid ) ) { + i = SuperSize( gb ); + ldaspa += i; + lb = LBi( gb, grid ); + ilsum[lb + 1] = ilsum[lb] + i; + } + ilsum[nsupers_i] = ldaspa; + + ldaspa_j = 0; ilsum_j[0] = 0; + for (gb = 0; gb < nsupers; gb++) + if (mycol == PCOL( gb, grid )) { + i = SuperSize( gb ); + ldaspa_j += i; + lb = LBj( gb, grid ); + ilsum_j[lb + 1] = ilsum_j[lb] + i; + } + ilsum_j[nsupers_j] = ldaspa_j; + + if ((memA = sdist_A(A, ScalePermstruct, Glu_persist, + grid, &ainf_colptr, &ainf_rowind, &ainf_val, + &asup_rowptr, &asup_colind, &asup_val, + ilsum, ilsum_j)) > 0) + return (memDist + memA + memNLU + memTRS); + memDist += (-memA); + + /* ------------------------------------------------------------ + FIRST TIME CREATING THE L AND U DATA STRUCTURES. + ------------------------------------------------------------*/ + + /* We first need to set up the L and U data structures and then + * propagate the values of A into them. + */ + if ( !(ToRecv = SUPERLU_MALLOC(nsupers * sizeof(int))) ) { + fprintf(stderr, "Calloc fails for ToRecv[]."); + return (memDist + memNLU + memTRS); + } + for (i = 0; i < nsupers; ++i) ToRecv[i] = 0; + memNLU += nsupers * iword; + + k = CEILING( nsupers, grid->npcol ); /* Number of local column blocks */ + if ( !(ToSendR = (int **) SUPERLU_MALLOC(k*sizeof(int*))) ) { + fprintf(stderr, "Malloc fails for ToSendR[]."); + return (memDist + memNLU + memTRS); + } + memNLU += k*sizeof(int_t*); + j = k * grid->npcol; + if ( !(index1 = SUPERLU_MALLOC(j * sizeof(int))) ) { + fprintf(stderr, "Malloc fails for index[]."); + return (memDist + memNLU + memTRS); + } + memNLU += j*iword; + + for (i = 0; i < j; ++i) index1[i] = EMPTY; + for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index1[j]; + + /* Auxiliary arrays used to set up L and U block data structures. + They are freed on return. */ + if ( !(LUb_length = intCalloc_dist(nsupers_ij)) ) { + fprintf(stderr, "Calloc fails for LUb_length[]."); + return (memDist + memNLU + memTRS); + } + if ( !(LUb_indptr = intMalloc_dist(nsupers_ij)) ) { + fprintf(stderr, "Malloc fails for LUb_indptr[]."); + return (memDist + memNLU + memTRS); + } + if ( !(LUb_number = intCalloc_dist(nsupers_ij)) ) { + fprintf(stderr, "Calloc fails for LUb_number[]."); + return (memDist + memNLU + memTRS); + } + if ( !(LUb_valptr = intCalloc_dist(nsupers_ij)) ) { + fprintf(stderr, "Calloc fails for LUb_valptr[]."); + return (memDist + memNLU + memTRS); + } + memDist += 4 * nsupers_ij * iword; + + k = CEILING( nsupers, grid->nprow ); + /* Pointers to the beginning of each block row of U. 
*/ + if ( !(Unzval_br_ptr = + (float**)SUPERLU_MALLOC(nsupers_i * sizeof(float*))) ) { + fprintf(stderr, "Malloc fails for Unzval_br_ptr[]."); + return (memDist + memNLU + memTRS); + } + if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(nsupers_i * sizeof(int_t*))) ) { + fprintf(stderr, "Malloc fails for Ufstnz_br_ptr[]."); + return (memDist + memNLU + memTRS); + } + memNLU += nsupers_i*sizeof(float*) + nsupers_i*sizeof(int_t*); + Unzval_br_ptr[nsupers_i-1] = NULL; + Ufstnz_br_ptr[nsupers_i-1] = NULL; + + if ( !(ToSendD = SUPERLU_MALLOC(nsupers_i * sizeof(int))) ) { + fprintf(stderr, "Malloc fails for ToSendD[]."); + return (memDist + memNLU + memTRS); + } + for (i = 0; i < nsupers_i; ++i) ToSendD[i] = NO; + + memNLU += nsupers_i*iword; + if ( !(Urb_marker = intCalloc_dist(nsupers_j))) { + fprintf(stderr, "Calloc fails for rb_marker[]."); + return (memDist + memNLU + memTRS); + } + if ( !(Lrb_marker = intCalloc_dist( nsupers_i ))) { + fprintf(stderr, "Calloc fails for rb_marker[]."); + return (memDist + memNLU + memTRS); + } + memDist += (nsupers_i + nsupers_j)*iword; + + /* Auxiliary arrays used to set up L, U block data structures. + They are freed on return. + k is the number of local row blocks. */ + if ( !(dense = floatCalloc_dist(SUPERLU_MAX(ldaspa, ldaspa_j) + * sp_ienv_dist(3))) ) { + fprintf(stderr, "Calloc fails for SPA dense[]."); + return (memDist + memNLU + memTRS); + } + /* These counts will be used for triangular solves. */ + if ( !(fmod = intCalloc_dist(nsupers_i)) ) { + fprintf(stderr, "Calloc fails for fmod[]."); + return (memDist + memNLU + memTRS); + } + if ( !(bmod = intCalloc_dist(nsupers_i)) ) { + fprintf(stderr, "Calloc fails for bmod[]."); + return (memDist + memNLU + memTRS); + } + /* ------------------------------------------------ */ + memNLU += 2*nsupers_i*iword + + SUPERLU_MAX(ldaspa, ldaspa_j)*sp_ienv_dist(3)*dword; + + /* Pointers to the beginning of each block column of L. */ + if ( !(Lnzval_bc_ptr = + (float**)SUPERLU_MALLOC(nsupers_j * sizeof(float*))) ) { + fprintf(stderr, "Malloc fails for Lnzval_bc_ptr[]."); + return (memDist + memNLU + memTRS); + } + if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(nsupers_j * sizeof(int_t*))) ) { + fprintf(stderr, "Malloc fails for Lrowind_bc_ptr[]."); + return (memDist + memNLU + memTRS); + } + + if ( !(Linv_bc_ptr = + (float**)SUPERLU_MALLOC(nsupers_j * sizeof(float*))) ) { + fprintf(stderr, "Malloc fails for Linv_bc_ptr[]."); + return (memDist + memNLU + memTRS); + } + if ( !(Uinv_bc_ptr = + (float**)SUPERLU_MALLOC(nsupers_j * sizeof(float*))) ) { + fprintf(stderr, "Malloc fails for Uinv_bc_ptr[]."); + return (memDist + memNLU + memTRS); + } + if ( !(Lindval_loc_bc_ptr = (int_t**)SUPERLU_MALLOC(nsupers_j * sizeof(int_t*))) ){ + fprintf(stderr, "Malloc fails for Lindval_loc_bc_ptr[]."); + return (memDist + memNLU + memTRS); + } + + if ( !(Unnz = (int_t*)SUPERLU_MALLOC(nsupers_j * sizeof(int_t))) ){ + fprintf(stderr, "Malloc fails for Unnz[]."); + return (memDist + memNLU + memTRS); + } + memTRS += nsupers_j*sizeof(int_t*) + 2.0*nsupers_j*sizeof(double*) + nsupers_j*iword; //acount for Lindval_loc_bc_ptr, Unnz, Linv_bc_ptr,Uinv_bc_ptr + + memNLU += nsupers_j * sizeof(double*) + nsupers_j * sizeof(int_t*)+ nsupers_j * sizeof(int_t*); + Lnzval_bc_ptr[nsupers_j-1] = NULL; + Lrowind_bc_ptr[nsupers_j-1] = NULL; + Linv_bc_ptr[nsupers_j-1] = NULL; + Uinv_bc_ptr[nsupers_j-1] = NULL; + Lindval_loc_bc_ptr[nsupers_j-1] = NULL; + + /* These lists of processes will be used for triangular solves. 
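+       (fsendx_plist[ljb][p] is set to YES below when process row p needs
+       the solution block X_k of local column ljb during the forward
+       solve, and bsendx_plist likewise for the backward solve; nfsendx
+       and nbsendx count the YES entries.)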
*/ + if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(nsupers_j*sizeof(int_t*))) ) { + fprintf(stderr, "Malloc fails for fsendx_plist[]."); + return (memDist + memNLU + memTRS); + } + len = nsupers_j * grid->nprow; + if ( !(index = intMalloc_dist(len)) ) { + fprintf(stderr, "Malloc fails for fsendx_plist[0]"); + return (memDist + memNLU + memTRS); + } + for (i = 0; i < len; ++i) index[i] = EMPTY; + for (i = 0, j = 0; i < nsupers_j; ++i, j += grid->nprow) + fsendx_plist[i] = &index[j]; + if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(nsupers_j*sizeof(int_t*))) ) { + fprintf(stderr, "Malloc fails for bsendx_plist[]."); + return (memDist + memNLU + memTRS); + } + if ( !(index = intMalloc_dist(len)) ) { + fprintf(stderr, "Malloc fails for bsendx_plist[0]"); + return (memDist + memNLU + memTRS); + } + for (i = 0; i < len; ++i) index[i] = EMPTY; + for (i = 0, j = 0; i < nsupers_j; ++i, j += grid->nprow) + bsendx_plist[i] = &index[j]; + /* -------------------------------------------------------------- */ + memNLU += 2*nsupers_j*sizeof(int_t*) + 2*len*iword; + + /*------------------------------------------------------------ + PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS. + THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U. + ------------------------------------------------------------*/ + for (jb = 0; jb < nsupers; jb++) { + jbcol = PCOL( jb, grid ); + jbrow = PROW( jb, grid ); + ljb_j = LBj( jb, grid ); /* Local block number column wise */ + ljb_i = LBi( jb, grid); /* Local block number row wise */ + fsupc = FstBlockC( jb ); + nsupc = SuperSize( jb ); + + if ( myrow == jbrow ) { /* Block row jb in my process row */ + /* Scatter A into SPA. */ + for (j = ilsum[ljb_i], dense_col = dense; j < ilsum[ljb_i]+nsupc; j++) { + for (i = asup_rowptr[j]; i < asup_rowptr[j+1]; i++) { + if (i >= asup_rowptr[ilsum[nsupers_i]]) + printf ("ERR7\n"); + jcol = asup_colind[i]; + if (jcol >= n) + printf ("Pe[%d] ERR distsn jb " IFMT " gb " IFMT " j " IFMT " jcol %d\n", + iam, jb, gb, j, jcol); + gb = BlockNum( jcol ); + lb = LBj( gb, grid ); + if (gb >= nsupers || lb >= nsupers_j) printf ("ERR8\n"); + jcol = ilsum_j[lb] + jcol - FstBlockC( gb ); + if (jcol >= ldaspa_j) + printf ("Pe[%d] ERR1 jb " IFMT " gb " IFMT " j " IFMT " jcol %d\n", + iam, jb, gb, j, jcol); + dense_col[jcol] = asup_val[i]; + } + dense_col += ldaspa_j; + } + + /*------------------------------------------------ + * SET UP U BLOCKS. + *------------------------------------------------*/ + /* Count number of blocks and length of each block. */ + nrbu = 0; + len = 0; /* Number of column subscripts I own. 
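+	     For reference, the Ufstnz index array assembled below is laid
+	     out as: index[0] = nrbu, index[1] = total nzval length,
+	     index[2] = total index length; then, per column block, a
+	     descriptor pair (gb, len) followed by SuperSize(gb)
+	     first-nonzero subscripts; and a terminating -1.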
*/ + len1 = 0; /* number of fstnz subscripts */ + for (i = xusub[ljb_i]; i < xusub[ljb_i+1]; i++) { + if (i >= xusub[nsupers_i]) printf ("ERR10\n"); + jcol = usub[i]; + gb = BlockNum( jcol ); /* Global block number */ + + /*if (fsupc <= 146445 && 146445 < fsupc + nsupc && jcol == 397986) + printf ("Pe[%d] [%d %d] elt [%d] jbcol %d pc %d\n", + iam, jb, gb, jcol, jbcol, pc); */ + + lb = LBj( gb, grid ); /* Local block number */ + pc = PCOL( gb, grid ); /* Process col owning this block */ + if (mycol == jbcol) ToSendR[ljb_j][pc] = YES; + /* if (mycol == jbcol && mycol != pc) ToSendR[ljb_j][pc] = YES; */ + pr = PROW( gb, grid ); + if ( pr != jbrow && mycol == pc) + bsendx_plist[lb][jbrow] = YES; + if (mycol == pc) { + len += nsupc; + LUb_length[lb] += nsupc; + ToSendD[ljb_i] = YES; + if (Urb_marker[lb] <= jb) { /* First see this block */ + if (Urb_marker[lb] == FALSE && gb != jb && myrow != pr) nbrecvx ++; + Urb_marker[lb] = jb + 1; + LUb_number[nrbu] = gb; + /* if (gb == 391825 && jb == 145361) + printf ("Pe[%d] T1 [%d %d] nrbu %d \n", + iam, jb, gb, nrbu); */ + nrbu ++; + len1 += SuperSize( gb ); + if ( gb != jb )/* Exclude diagonal block. */ + ++bmod[ljb_i];/* Mod. count for back solve */ +#if ( PRNTlevel>=1 ) + ++nUblocks; +#endif + } + } + } /* for i ... */ + + if ( nrbu ) { + /* Sort the blocks of U in increasing block column index. + SuperLU_DIST assumes this is true */ + /* simple insert sort algorithm */ + /* to be transformed in quick sort */ + for (j = 1; j < nrbu; j++) { + k = LUb_number[j]; + for (i=j-1; i>=0 && LUb_number[i] > k; i--) { + LUb_number[i+1] = LUb_number[i]; + } + LUb_number[i+1] = k; + } + + /* Set up the initial pointers for each block in + index[] and nzval[]. */ + /* Add room for descriptors */ + len1 += BR_HEADER + nrbu * UB_DESCRIPTOR; + if ( !(index = intMalloc_dist(len1+1)) ) { + fprintf (stderr, "Malloc fails for Uindex[]"); + return (memDist + memNLU + memTRS); + } + Ufstnz_br_ptr[ljb_i] = index; + if (!(Unzval_br_ptr[ljb_i] = + floatMalloc_dist(len))) { + fprintf (stderr, "Malloc fails for Unzval_br_ptr[*][]"); + return (memDist + memNLU + memTRS); + } + memNLU += (len1+1)*iword + len*dword; + uval = Unzval_br_ptr[ljb_i]; + mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 ); + mybufmax[3] = SUPERLU_MAX( mybufmax[3], len ); + index[0] = nrbu; /* Number of column blocks */ + index[1] = len; /* Total length of nzval[] */ + index[2] = len1; /* Total length of index */ + index[len1] = -1; /* End marker */ + next_ind = BR_HEADER; + next_val = 0; + for (k = 0; k < nrbu; k++) { + gb = LUb_number[k]; + lb = LBj( gb, grid ); + len = LUb_length[lb]; + LUb_length[lb] = 0; /* Reset vector of block length */ + index[next_ind++] = gb; /* Descriptor */ + index[next_ind++] = len; + LUb_indptr[lb] = next_ind; + for (; next_ind < LUb_indptr[lb] + SuperSize( gb ); next_ind++) + index[next_ind] = FstBlockC( jb + 1 ); + LUb_valptr[lb] = next_val; + next_val += len; + } + /* Propagate the fstnz subscripts to Ufstnz_br_ptr[], + and the initial values of A from SPA into Unzval_br_ptr[]. */ + for (i = xusub[ljb_i]; i < xusub[ljb_i+1]; i++) { + jcol = usub[i]; + gb = BlockNum( jcol ); + + if ( mycol == PCOL( gb, grid ) ) { + lb = LBj( gb, grid ); + k = LUb_indptr[lb]; /* Start fstnz in index */ + index[k + jcol - FstBlockC( gb )] = FstBlockC( jb ); + } + } /* for i ... 
*/ + + for (i = 0; i < nrbu; i++) { + gb = LUb_number[i]; + lb = LBj( gb, grid ); + next_ind = LUb_indptr[lb]; + k = FstBlockC( jb + 1); + jcol = ilsum_j[lb]; + for (jj = 0; jj < SuperSize( gb ); jj++, jcol++) { + dense_col = dense; + j = index[next_ind+jj]; + for (ii = j; ii < k; ii++) { + uval[LUb_valptr[lb]++] = dense_col[jcol]; + dense_col[jcol] = zero; + dense_col += ldaspa_j; + } + } + } + } else { + Ufstnz_br_ptr[ljb_i] = NULL; + Unzval_br_ptr[ljb_i] = NULL; + } /* if nrbu ... */ + } /* if myrow == jbrow */ + + /*------------------------------------------------ + * SET UP L BLOCKS. + *------------------------------------------------*/ + if (mycol == jbcol) { /* Block column jb in my process column */ + /* Scatter A_inf into SPA. */ + for (j = ilsum_j[ljb_j], dense_col = dense; j < ilsum_j[ljb_j] + nsupc; j++) { + for (i = ainf_colptr[j]; i < ainf_colptr[j+1]; i++) { + irow = ainf_rowind[i]; + if (irow >= n) printf ("Pe[%d] ERR1\n", iam); + gb = BlockNum( irow ); + if (gb >= nsupers) printf ("Pe[%d] ERR5\n", iam); + if ( myrow == PROW( gb, grid ) ) { + lb = LBi( gb, grid ); + irow = ilsum[lb] + irow - FstBlockC( gb ); + if (irow >= ldaspa) printf ("Pe[%d] ERR0\n", iam); + dense_col[irow] = ainf_val[i]; + } + } + dense_col += ldaspa; + } + + /* sort the indices of the diagonal block at the beginning of xlsub */ + if (myrow == jbrow) { + k = xlsub[ljb_j]; + for (i = xlsub[ljb_j]; i < xlsub[ljb_j+1]; i++) { + irow = lsub[i]; + if (irow < nsupc + fsupc && i != k+irow-fsupc) { + lsub[i] = lsub[k + irow - fsupc]; + lsub[k + irow - fsupc] = irow; + i --; + } + } + } + + /* Count number of blocks and length of each block. */ + nrbl = 0; + len = 0; /* Number of row subscripts I own. */ + kseen = 0; + for (i = xlsub[ljb_j]; i < xlsub[ljb_j+1]; i++) { + irow = lsub[i]; + gb = BlockNum( irow ); /* Global block number */ + pr = PROW( gb, grid ); /* Process row owning this block */ + if ( pr != jbrow && fsendx_plist[ljb_j][pr] == EMPTY && + myrow == jbrow) { + fsendx_plist[ljb_j][pr] = YES; + ++nfsendx; + } + if ( myrow == pr ) { + lb = LBi( gb, grid ); /* Local block number */ + if (Lrb_marker[lb] <= jb) { /* First see this block */ + Lrb_marker[lb] = jb + 1; + LUb_length[lb] = 1; + LUb_number[nrbl++] = gb; + if ( gb != jb ) /* Exclude diagonal block. */ + ++fmod[lb]; /* Mod. count for forward solve */ + if ( kseen == 0 && myrow != jbrow ) { + ++nfrecvx; + kseen = 1; + } +#if ( PRNTlevel>=1 ) + ++nLblocks; +#endif + } else + ++LUb_length[lb]; + ++len; + } + } /* for i ... */ + + if ( nrbl ) { /* Do not ensure the blocks are sorted! */ + /* Set up the initial pointers for each block in + index[] and nzval[]. */ + /* If I am the owner of the diagonal block, order it first in LUb_number. 
+ Necessary for SuperLU_DIST routines */ + kseen = EMPTY; + for (j = 0; j < nrbl; j++) { + if (LUb_number[j] == jb) + kseen = j; + } + if (kseen != EMPTY && kseen != 0) { + LUb_number[kseen] = LUb_number[0]; + LUb_number[0] = jb; + } + + /* Add room for descriptors */ + len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; + if ( !(index = intMalloc_dist(len1)) ) { + fprintf (stderr, "Malloc fails for index[]"); + return (memDist + memNLU + memTRS); + } + Lrowind_bc_ptr[ljb_j] = index; + if (!(Lnzval_bc_ptr[ljb_j] = + floatMalloc_dist(len*nsupc))) { + fprintf(stderr, "Malloc fails for Lnzval_bc_ptr[*][] col block " IFMT, jb); + return (memDist + memNLU + memTRS); + } + + if (!(Linv_bc_ptr[ljb_j] = (float*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(float)))) + ABORT("Malloc fails for Linv_bc_ptr[ljb_j][]"); + if (!(Uinv_bc_ptr[ljb_j] = (float*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(float)))) + ABORT("Malloc fails for Uinv_bc_ptr[ljb_j][]"); + + memNLU += len1*iword + len*nsupc*dword; + + if ( !(Lindval_loc_bc_ptr[ljb_j] = intCalloc_dist(nrbl*3))) + ABORT("Malloc fails for Lindval_loc_bc_ptr[ljb_j][]"); + memTRS += nrbl*3.0*iword + 2.0*nsupc*nsupc*dword; //acount for Lindval_loc_bc_ptr[ljb],Linv_bc_ptr[ljb],Uinv_bc_ptr[ljb] + + lusup = Lnzval_bc_ptr[ljb_j]; + mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 ); + mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc ); + mybufmax[4] = SUPERLU_MAX( mybufmax[4], len ); + index[0] = nrbl; /* Number of row blocks */ + index[1] = len; /* LDA of the nzval[] */ + next_ind = BC_HEADER; + next_val = 0; + for (k = 0; k < nrbl; ++k) { + gb = LUb_number[k]; + lb = LBi( gb, grid ); + len = LUb_length[lb]; + + Lindval_loc_bc_ptr[ljb_j][k] = lb; + Lindval_loc_bc_ptr[ljb_j][k+nrbl] = next_ind; + Lindval_loc_bc_ptr[ljb_j][k+nrbl*2] = next_val; + + LUb_length[lb] = 0; + index[next_ind++] = gb; /* Descriptor */ + index[next_ind++] = len; + LUb_indptr[lb] = next_ind; + LUb_valptr[lb] = next_val; + next_ind += len; + next_val += len; + } + /* Propagate the compressed row subscripts to Lindex[], + and the initial values of A from SPA into Lnzval[]. */ + len = index[1]; /* LDA of lusup[] */ + for (i = xlsub[ljb_j]; i < xlsub[ljb_j+1]; i++) { + irow = lsub[i]; + gb = BlockNum( irow ); + if ( myrow == PROW( gb, grid ) ) { + lb = LBi( gb, grid ); + k = LUb_indptr[lb]++; /* Random access a block */ + index[k] = irow; + k = LUb_valptr[lb]++; + irow = ilsum[lb] + irow - FstBlockC( gb ); + for (j = 0, dense_col = dense; j < nsupc; ++j) { + lusup[k] = dense_col[irow]; + dense_col[irow] = zero; + k += len; + dense_col += ldaspa; + } + } + } /* for i ... 
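+		       (Layout used by the sort that follows:
+		       Lindval_loc_bc_ptr[ljb_j] holds one triple per row
+		       block as three stride-nrbl slices, [k] = local block
+		       row, [k+nrbl] = offset into index[], [k+2*nrbl] =
+		       offset into lusup[].)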
*/ + + + + /* sort Lindval_loc_bc_ptr[ljb_j], Lrowind_bc_ptr[ljb_j] and Lnzval_bc_ptr[ljb_j] here*/ + if(nrbl>1){ + krow = PROW( jb, grid ); + if(myrow==krow){ /* skip the diagonal block */ + uu=nrbl-2; + lloc = &Lindval_loc_bc_ptr[ljb_j][1]; + }else{ + uu=nrbl-1; + lloc = Lindval_loc_bc_ptr[ljb_j]; + } + quickSortM(lloc,0,uu,nrbl,0,3); + } + + + if ( !(index_srt = intMalloc_dist(len1)) ) + ABORT("Malloc fails for index_srt[]"); + if (!(lusup_srt = (float*)SUPERLU_MALLOC(len*nsupc * sizeof(float)))) + ABORT("Malloc fails for lusup_srt[]"); + + idx_indx = BC_HEADER; + idx_lusup = 0; + for (jj=0;jjnprow, grid->npcol); + if ( !(recvBuf = (int_t *) SUPERLU_MALLOC(nsupers*k*iword)) ) { + fprintf (stderr, "Malloc fails for recvBuf[]."); + return (memDist + memNLU + memTRS); + } + if ( !(nnzToRecv = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) { + fprintf (stderr, "Malloc fails for nnzToRecv[]."); + return (memDist + memNLU + memTRS); + } + if ( !(ptrToRecv = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) { + fprintf (stderr, "Malloc fails for ptrToRecv[]."); + return (memDist + memNLU + memTRS); + } + if ( !(nnzToSend = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) { + fprintf (stderr, "Malloc fails for nnzToRecv[]."); + return (memDist + memNLU + memTRS); + } + if ( !(ptrToSend = (int *) SUPERLU_MALLOC(nprocs*sizeof(int))) ) { + fprintf (stderr, "Malloc fails for ptrToRecv[]."); + return (memDist + memNLU + memTRS); + } + + if (memDist < (nsupers*k*iword +4*nprocs * sizeof(int))) + memDist = nsupers*k*iword +4*nprocs * sizeof(int); + + for (p = 0; p < nprocs; p++) + nnzToRecv[p] = 0; + + for (jb = 0; jb < nsupers; jb++) { + jbcol = PCOL( jb, grid ); + jbrow = PROW( jb, grid ); + p = PNUM(jbrow, jbcol, grid); + nnzToRecv[p] += grid->npcol; + } + i = 0; + for (p = 0; p < nprocs; p++) { + ptrToRecv[p] = i; + i += nnzToRecv[p]; + ptrToSend[p] = 0; + if (p != iam) + nnzToSend[p] = nnzToRecv[iam]; + else + nnzToSend[p] = 0; + } + nnzToRecv[iam] = 0; + i = ptrToRecv[iam]; + for (jb = 0; jb < nsupers; jb++) { + jbcol = PCOL( jb, grid ); + jbrow = PROW( jb, grid ); + p = PNUM(jbrow, jbcol, grid); + if (p == iam) { + ljb_j = LBj( jb, grid ); /* Local block number column wise */ + for (j = 0; j < grid->npcol; j++, i++) + recvBuf[i] = ToSendR[ljb_j][j]; + } + } + + MPI_Alltoallv (&(recvBuf[ptrToRecv[iam]]), nnzToSend, ptrToSend, mpi_int_t, + recvBuf, nnzToRecv, ptrToRecv, mpi_int_t, grid->comm); + + for (jb = 0; jb < nsupers; jb++) { + jbcol = PCOL( jb, grid ); + jbrow = PROW( jb, grid ); + p = PNUM(jbrow, jbcol, grid); + ljb_j = LBj( jb, grid ); /* Local block number column wise */ + ljb_i = LBi( jb, grid ); /* Local block number row wise */ + /* (myrow == jbrow) { + if (ToSendD[ljb_i] == YES) + ToRecv[jb] = 1; + } + else { + if (recvBuf[ptrToRecv[p] + mycol] == YES) + ToRecv[jb] = 2; + } */ + if (recvBuf[ptrToRecv[p] + mycol] == YES) { + if (myrow == jbrow) + ToRecv[jb] = 1; + else + ToRecv[jb] = 2; + } + if (mycol == jbcol) { + for (i = 0, j = ptrToRecv[p]; i < grid->npcol; i++, j++) + ToSendR[ljb_j][i] = recvBuf[j]; + ToSendR[ljb_j][mycol] = EMPTY; + } + ptrToRecv[p] += grid->npcol; + } + + /* exchange information about bsendx_plist in between column of processors */ + MPI_Allreduce ((*bsendx_plist), recvBuf, nsupers_j * grid->nprow, mpi_int_t, + MPI_MAX, grid->cscp.comm); + + for (jb = 0; jb < nsupers; jb ++) { + jbcol = PCOL( jb, grid); + jbrow = PROW( jb, grid); + if (mycol == jbcol) { + ljb_j = LBj( jb, grid ); /* Local block number column wise */ + if (myrow == jbrow ) { + for (k = ljb_j * 
grid->nprow; k < (ljb_j+1) * grid->nprow; k++) { + (*bsendx_plist)[k] = recvBuf[k]; + if ((*bsendx_plist)[k] != EMPTY) + nbsendx ++; + } + } + else { + for (k = ljb_j * grid->nprow; k < (ljb_j+1) * grid->nprow; k++) + (*bsendx_plist)[k] = EMPTY; + } + } + } + + ///////////////////////////////////////////////////////////////// + + /* Set up additional pointers for the index and value arrays of U. + nub is the number of local block columns. */ + nub = CEILING( nsupers, grid->npcol); /* Number of local block columns. */ + if ( !(Urbs = (int_t *) intCalloc_dist(2*nub)) ) + ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero + blocks in a block column. */ + Urbs1 = Urbs + nub; + if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) ) + ABORT("Malloc fails for Ucb_indptr[]"); + if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) ) + ABORT("Malloc fails for Ucb_valptr[]"); + nlb = CEILING( nsupers, grid->nprow ); /* Number of local block rows. */ + + /* Count number of row blocks in a block column. + One pass of the skeleton graph of U. */ + for (lk = 0; lk < nlb; ++lk) { + usub1 = Ufstnz_br_ptr[lk]; + if ( usub1 ) { /* Not an empty block row. */ + /* usub1[0] -- number of column blocks in this block row. */ + i = BR_HEADER; /* Pointer in index array. */ + for (lb = 0; lb < usub1[0]; ++lb) { /* For all column blocks. */ + k = usub1[i]; /* Global block number */ + ++Urbs[LBj(k,grid)]; + i += UB_DESCRIPTOR + SuperSize( k ); + } + } + } + + /* Set up the vertical linked lists for the row blocks. + One pass of the skeleton graph of U. */ + for (lb = 0; lb < nub; ++lb) { + if ( Urbs[lb] ) { /* Not an empty block column. */ + if ( !(Ucb_indptr[lb] + = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) ) + ABORT("Malloc fails for Ucb_indptr[lb][]"); + if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) ) + ABORT("Malloc fails for Ucb_valptr[lb][]"); + } + } + for (lk = 0; lk < nlb; ++lk) { /* For each block row. */ + usub1 = Ufstnz_br_ptr[lk]; + if ( usub1 ) { /* Not an empty block row. */ + i = BR_HEADER; /* Pointer in index array. */ + j = 0; /* Pointer in nzval array. */ + + for (lb = 0; lb < usub1[0]; ++lb) { /* For all column blocks. */ + k = usub1[i]; /* Global block number, column-wise. */ + ljb = LBj( k, grid ); /* Local block number, column-wise. */ + Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk; + + Ucb_indptr[ljb][Urbs1[ljb]].indpos = i; + Ucb_valptr[ljb][Urbs1[ljb]] = j; + + ++Urbs1[ljb]; + j += usub1[i+1]; + i += UB_DESCRIPTOR + SuperSize( k ); + } + } + } + + + +/* Count the nnzs per block column */ + for (lb = 0; lb < nub; ++lb) { + Unnz[lb] = 0; + k = lb * grid->npcol + mycol;/* Global block number, column-wise. */ + knsupc = SuperSize( k ); + for (ub = 0; ub < Urbs[lb]; ++ub) { + ik = Ucb_indptr[lb][ub].lbnum; /* Local block number, row-wise. */ + i = Ucb_indptr[lb][ub].indpos; /* Start of the block in usub[]. */ + i += UB_DESCRIPTOR; + gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */ + iklrow = FstBlockC( gik+1 ); + for (jj = 0; jj < knsupc; ++jj) { + fnz = Ufstnz_br_ptr[ik][i + jj]; + if ( fnz < iklrow ) { + Unnz[lb] +=iklrow-fnz; + } + } /* for jj ... */ + } + } + + ///////////////////////////////////////////////////////////////// + + // if(LSUM=1 ) + t = SuperLU_timer_(); +#endif + /* construct the Bcast tree for L ... 
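+	   For each local block column, the code below marks the active
+	   process rows in ActiveFlagAll, combines them with an
+	   MPI_Allreduce over the column communicator, sorts them so the
+	   owner of the diagonal block comes first, and then builds the
+	   tree.  The call shape (taken from the code that follows; 's'
+	   selects single precision) is
+
+	       LBtree_ptr[ljb] = BcTree_Create(grid->comm, ranks, rank_cnt,
+	                                       msgsize, SeedSTD_BC[ljb], 's');
+	       BcTree_SetTag(LBtree_ptr[ljb], BC_L, 's');
+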
*/ + + k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ + if ( !(LBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) ) + ABORT("Malloc fails for LBtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->nprow * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + if ( !(SeedSTD_BC = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_BC[]."); + + for (i=0;icscp.comm); + + for (ljb = 0; ljb nprow*k)) ) + ABORT("Calloc fails for ActiveFlag[]."); + for (j=0;jnprow*k;++j)ActiveFlagAll[j]=3*nsupers; + memTRS += k*sizeof(BcTree) + k*dword + grid->nprow*k*iword; //acount for LBtree_ptr, SeedSTD_BC, ActiveFlagAll + for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */ + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow]=SUPERLU_MIN(ActiveFlagAll[pr+ljb*grid->nprow],gb); + } /* for j ... */ + } + } + + + MPI_Allreduce(MPI_IN_PLACE,ActiveFlagAll,grid->nprow*k,mpi_int_t,MPI_MIN,grid->cscp.comm); + + + + for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */ + + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow;++j)ActiveFlag[j]=ActiveFlagAll[j+ljb*grid->nprow]; + for (j=0;jnprow;++j)ActiveFlag[j+grid->nprow]=j; + for (j=0;jnprow;++j)ranks[j]=-1; + + Root=-1; + Iactive = 0; + for (j=0;jnprow;++j){ + if(ActiveFlag[j]!=3*nsupers){ + gb = ActiveFlag[j]; + pr = PROW( gb, grid ); + if(gb==jb)Root=pr; + if(myrow==pr)Iactive=1; + } + } + + + quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,0,2); + + if(Iactive==1){ + // printf("jb %5d damn\n",jb); + // fflush(stdout); + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->nprow; ++j){ + if(ActiveFlag[j]!=3*nsupers && ActiveFlag[j+grid->nprow]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->nprow]; + ++rank_cnt; + } + } + + if(rank_cnt>1){ + + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'s'); + BcTree_SetTag(LBtree_ptr[ljb],BC_L,'s'); + + // printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt); + // fflush(stdout); + + // if(iam==15 || iam==3){ + // printf("iam %5d btree lk %5d tag %5d root %5d\n",iam, ljb,jb,BcTree_IsRoot(LBtree_ptr[ljb],'s')); + // fflush(stdout); + // } + + // #if ( PRNTlevel>=1 ) + if(Root==myrow){ + rank_cnt_ref=1; + for (j = 0; j < grid->nprow; ++j) { + if ( fsendx_plist[ljb][j] != EMPTY ) { + ++rank_cnt_ref; + } + } + assert(rank_cnt==rank_cnt_ref); + + // printf("Partial Bcast Procs: col%7d np%4d\n",jb,rank_cnt); + + // // printf("Partial Bcast Procs: %4d %4d: ",iam, rank_cnt); + // // for(j=0;jnprow*k*iword; //acount for SeedSTD_BC, ActiveFlagAll + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_() - t; + if ( !iam) printf(".. Construct Bcast tree for L: %.2f\t\n", t); +#endif + + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + /* construct the Reduce tree for L ... 
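+	   frecv[lib], the MPI_SUM reduction of mod_bit over the row
+	   communicator, counts the processes contributing to block row
+	   lib; after RdTree_Create(grid->comm, ranks, rank_cnt, msgsize,
+	   SeedSTD_RD[lib], 's') the code asserts rank_cnt == frecv[lib]
+	   on the diagonal process as a consistency check.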
*/ + /* the following is used as reference */ + nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(mod_bit = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for mod_bit[]."); + if ( !(frecv = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for frecv[]."); + + for (k = 0; k < nlb; ++k) mod_bit[k] = 0; + for (k = 0; k < nsupers; ++k) { + pr = PROW( k, grid ); + if ( myrow == pr ) { + lib = LBi( k, grid ); /* local block number */ + kcol = PCOL( k, grid ); + if (mycol == kcol || fmod[lib] ) + mod_bit[lib] = 1; /* contribution from off-diagonal and diagonal*/ + } + } + /* Every process receives the count, but it is only useful on the + diagonal processes. */ + MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm); + + + + k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(LRtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) ) + ABORT("Malloc fails for LRtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->npcol * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + + // if ( !(idxs = intCalloc_dist(nsupers)) ) + // ABORT("Calloc fails for idxs[]."); + + // if ( !(nzrows = (int_t**)SUPERLU_MALLOC(nsupers * sizeof(int_t*))) ) + // ABORT("Malloc fails for nzrows[]."); + + if ( !(SeedSTD_RD = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_RD[]."); + + for (i=0;irscp.comm); + + + for (lib = 0; lib npcol*k)) ) + ABORT("Calloc fails for ActiveFlagAll[]."); + for (j=0;jnpcol*k;++j)ActiveFlagAll[j]=-3*nsupers; + memTRS += k*sizeof(RdTree) + k*dword + grid->npcol*k*iword; //acount for LRtree_ptr, SeedSTD_RD, ActiveFlagAll + + + for (ljb = 0; ljb < CEILING( nsupers, grid->npcol); ++ljb) { /* for each local block column ... */ + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnpcol]=SUPERLU_MAX(ActiveFlagAll[pc+lib*grid->npcol],jb); + } + } + } + } + + MPI_Allreduce(MPI_IN_PLACE,ActiveFlagAll,grid->npcol*k,mpi_int_t,MPI_MAX,grid->rscp.comm); + + for (lib=0;libnprow; /* not sure */ + if(ibnpcol;++j)ActiveFlag[j]=ActiveFlagAll[j+lib*grid->npcol];; + for (j=0;jnpcol;++j)ActiveFlag[j+grid->npcol]=j; + for (j=0;jnpcol;++j)ranks[j]=-1; + Root=-1; + Iactive = 0; + + for (j=0;jnpcol;++j){ + if(ActiveFlag[j]!=-3*nsupers){ + jb = ActiveFlag[j]; + pc = PCOL( jb, grid ); + if(jb==ib)Root=pc; + if(mycol==pc)Iactive=1; + } + } + + + quickSortM(ActiveFlag,0,grid->npcol-1,grid->npcol,1,2); + + if(Iactive==1){ + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->npcol; ++j){ + if(ActiveFlag[j]!=-3*nsupers && ActiveFlag[j+grid->npcol]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->npcol]; + ++rank_cnt; + } + } + if(rank_cnt>1){ + + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'s'); + RdTree_SetTag(LRtree_ptr[lib], RD_L,'s'); + // } + + // printf("iam %5d rtree rank_cnt %5d \n",iam,rank_cnt); + // fflush(stdout); + + + #if ( PRNTlevel>=1 ) + if(Root==mycol){ + assert(rank_cnt==frecv[lib]); + // printf("Partial Reduce Procs: row%7d np%4d\n",ib,rank_cnt); + // printf("Partial Reduce Procs: %4d %4d: ",iam, rank_cnt); + // // for(j=0;jnprow*k*iword; //acount for SeedSTD_RD, ActiveFlagAll + //////////////////////////////////////////////////////// + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_() - t; + if ( !iam) printf(".. Construct Reduce tree for L: %.2f\t\n", t); +#endif + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + + /* construct the Bcast tree for U ... 
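+	   This mirrors the L broadcast tree above, except that activity
+	   is derived from the skeleton of U (Ufstnz_br_ptr) plus the
+	   diagonal block, the Allreduce uses MPI_MAX with sentinel
+	   -3*nsupers, and the reference count for the assert comes from
+	   bsendx_plist.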
*/ + + k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ + if ( !(UBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) ) + ABORT("Malloc fails for UBtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->nprow * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + if ( !(SeedSTD_BC = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_BC[]."); + + for (i=0;icscp.comm); + + + for (ljb = 0; ljb nprow*k)) ) + ABORT("Calloc fails for ActiveFlagAll[]."); + for (j=0;jnprow*k;++j)ActiveFlagAll[j]=-3*nsupers; + memTRS += k*sizeof(BcTree) + k*dword + grid->nprow*k*iword; //acount for UBtree_ptr, SeedSTD_BC, ActiveFlagAll + + + for (lib = 0; lib < CEILING( nsupers, grid->nprow); ++lib) { /* for each local block row ... */ + ib = myrow+lib*grid->nprow; /* not sure */ + + // if(ib==0)printf("iam %5d ib %5d\n",iam,ib); + // fflush(stdout); + + if(ibnprow]=SUPERLU_MAX(ActiveFlagAll[pr+ljb*grid->nprow],ib); + } + } /* for i ... */ + pr = PROW( ib, grid ); // take care of diagonal node stored as L + pc = PCOL( ib, grid ); + if ( mycol == pc ) { /* Block column ib in my process column */ + ljb = LBj( ib, grid ); /* local block number */ + ActiveFlagAll[pr+ljb*grid->nprow]=SUPERLU_MAX(ActiveFlagAll[pr+ljb*grid->nprow],ib); + // if(pr+ljb*grid->nprow==0)printf("iam %5d ib %5d ActiveFlagAll %5d pr %5d ljb %5d\n",iam,ib,ActiveFlagAll[pr+ljb*grid->nprow],pr,ljb); + // fflush(stdout); + } + } + } + + // printf("iam %5d ActiveFlagAll %5d\n",iam,ActiveFlagAll[0]); + // fflush(stdout); + + MPI_Allreduce(MPI_IN_PLACE,ActiveFlagAll,grid->nprow*k,mpi_int_t,MPI_MAX,grid->cscp.comm); + + for (ljb = 0; ljb < k; ++ljb) { /* for each block column ... 
*/ + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow;++j)ActiveFlag[j]=ActiveFlagAll[j+ljb*grid->nprow]; + for (j=0;jnprow;++j)ActiveFlag[j+grid->nprow]=j; + for (j=0;jnprow;++j)ranks[j]=-1; + + Root=-1; + Iactive = 0; + for (j=0;jnprow;++j){ + if(ActiveFlag[j]!=-3*nsupers){ + gb = ActiveFlag[j]; + pr = PROW( gb, grid ); + if(gb==jb)Root=pr; + if(myrow==pr)Iactive=1; + } + } + + quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,1,2); + // printf("jb: %5d Iactive %5d\n",jb,Iactive); + // fflush(stdout); + if(Iactive==1){ + // if(jb==0)printf("root:%5d jb: %5d ActiveFlag %5d \n",Root,jb,ActiveFlag[0]); + fflush(stdout); + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->nprow; ++j){ + if(ActiveFlag[j]!=-3*nsupers && ActiveFlag[j+grid->nprow]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->nprow]; + ++rank_cnt; + } + } + // printf("jb: %5d rank_cnt %5d\n",jb,rank_cnt); + // fflush(stdout); + if(rank_cnt>1){ + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'s'); + BcTree_SetTag(UBtree_ptr[ljb],BC_U,'s'); + + // printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt); + // fflush(stdout); + + if(Root==myrow){ + rank_cnt_ref=1; + for (j = 0; j < grid->nprow; ++j) { + // printf("ljb %5d j %5d nprow %5d\n",ljb,j,grid->nprow); + // fflush(stdout); + if ( bsendx_plist[ljb][j] != EMPTY ) { + ++rank_cnt_ref; + } + } + // printf("ljb %5d rank_cnt %5d rank_cnt_ref %5d\n",ljb,rank_cnt,rank_cnt_ref); + // fflush(stdout); + assert(rank_cnt==rank_cnt_ref); + } + } + } + } + } + SUPERLU_FREE(ActiveFlag); + SUPERLU_FREE(ActiveFlagAll); + SUPERLU_FREE(ranks); + SUPERLU_FREE(SeedSTD_BC); + memTRS -= k*dword + grid->nprow*k*iword; //acount for SeedSTD_BC, ActiveFlagAll + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_() - t; + if ( !iam) printf(".. Construct Bcast tree for U: %.2f\t\n", t); +#endif + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + /* construct the Reduce tree for U ... */ + /* the following is used as reference */ + nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(mod_bit = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for mod_bit[]."); + if ( !(brecv = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for brecv[]."); + + for (k = 0; k < nlb; ++k) mod_bit[k] = 0; + for (k = 0; k < nsupers; ++k) { + pr = PROW( k, grid ); + if ( myrow == pr ) { + lib = LBi( k, grid ); /* local block number */ + kcol = PCOL( k, grid ); + if (mycol == kcol || bmod[lib] ) + mod_bit[lib] = 1; /* contribution from off-diagonal and diagonal*/ + } + } + /* Every process receives the count, but it is only useful on the + diagonal processes. 
*/ + MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm); + + + + k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(URtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) ) + ABORT("Malloc fails for URtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->npcol * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + + // if ( !(idxs = intCalloc_dist(nsupers)) ) + // ABORT("Calloc fails for idxs[]."); + + // if ( !(nzrows = (int_t**)SUPERLU_MALLOC(nsupers * sizeof(int_t*))) ) + // ABORT("Malloc fails for nzrows[]."); + + if ( !(SeedSTD_RD = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_RD[]."); + + for (i=0;irscp.comm); + + for (lib = 0; lib npcol*k)) ) + ABORT("Calloc fails for ActiveFlagAll[]."); + for (j=0;jnpcol*k;++j)ActiveFlagAll[j]=3*nsupers; + memTRS += k*sizeof(RdTree) + k*dword + grid->npcol*k*iword; //acount for URtree_ptr, SeedSTD_RD, ActiveFlagAll + + for (lib = 0; lib < CEILING( nsupers, grid->nprow); ++lib) { /* for each local block row ... */ + ib = myrow+lib*grid->nprow; /* not sure */ + if(ibnpcol]=SUPERLU_MIN(ActiveFlagAll[pc+lib*grid->npcol],jb); + } + } /* for i ... */ + pc = PCOL( ib, grid ); + if ( mycol == pc ) { /* Block column ib in my process column */ + ActiveFlagAll[pc+lib*grid->npcol]=SUPERLU_MIN(ActiveFlagAll[pc+lib*grid->npcol],ib); + } + } + } + + MPI_Allreduce(MPI_IN_PLACE,ActiveFlagAll,grid->npcol*k,mpi_int_t,MPI_MIN,grid->rscp.comm); + + for (lib=0;libnprow; /* not sure */ + if(ibnpcol;++j)ActiveFlag[j]=ActiveFlagAll[j+lib*grid->npcol];; + for (j=0;jnpcol;++j)ActiveFlag[j+grid->npcol]=j; + for (j=0;jnpcol;++j)ranks[j]=-1; + Root=-1; + Iactive = 0; + + for (j=0;jnpcol;++j){ + if(ActiveFlag[j]!=3*nsupers){ + jb = ActiveFlag[j]; + pc = PCOL( jb, grid ); + if(jb==ib)Root=pc; + if(mycol==pc)Iactive=1; + } + } + + quickSortM(ActiveFlag,0,grid->npcol-1,grid->npcol,0,2); + + if(Iactive==1){ + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->npcol; ++j){ + if(ActiveFlag[j]!=3*nsupers && ActiveFlag[j+grid->npcol]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->npcol]; + ++rank_cnt; + } + } + if(rank_cnt>1){ + + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'s'); + RdTree_SetTag(URtree_ptr[lib], RD_U,'s'); + // } + + // #if ( PRNTlevel>=1 ) + if(Root==mycol){ + // printf("Partial Reduce Procs: %4d %4d %5d \n",iam, rank_cnt,brecv[lib]); + // fflush(stdout); + assert(rank_cnt==brecv[lib]); + // printf("Partial Reduce Procs: row%7d np%4d\n",ib,rank_cnt); + // printf("Partial Reduce Procs: %4d %4d: ",iam, rank_cnt); + // // for(j=0;jnprow*k*iword; //acount for SeedSTD_RD, ActiveFlagAll + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_() - t; + if ( !iam) printf(".. 
Construct Reduce tree for U: %.2f\t\n", t); +#endif + + //////////////////////////////////////////////////////// + + /* Free the memory used for storing L and U */ + SUPERLU_FREE(xlsub); SUPERLU_FREE(xusub); + if (lsub != NULL) + SUPERLU_FREE(lsub); + if (usub != NULL) + SUPERLU_FREE(usub); + + + SUPERLU_FREE(nnzToRecv); + SUPERLU_FREE(ptrToRecv); + SUPERLU_FREE(nnzToSend); + SUPERLU_FREE(ptrToSend); + SUPERLU_FREE(recvBuf); + + Llu->Lrowind_bc_ptr = Lrowind_bc_ptr; + Llu->Lindval_loc_bc_ptr = Lindval_loc_bc_ptr; + Llu->Lnzval_bc_ptr = Lnzval_bc_ptr; + Llu->Linv_bc_ptr = Linv_bc_ptr; + Llu->Uinv_bc_ptr = Uinv_bc_ptr; + Llu->Ufstnz_br_ptr = Ufstnz_br_ptr; + Llu->Unzval_br_ptr = Unzval_br_ptr; + Llu->Unnz = Unnz; + Llu->ToRecv = ToRecv; + Llu->ToSendD = ToSendD; + Llu->ToSendR = ToSendR; + Llu->fmod = fmod; + Llu->fsendx_plist = fsendx_plist; + Llu->nfrecvx = nfrecvx; + Llu->nfsendx = nfsendx; + Llu->bmod = bmod; + Llu->bsendx_plist = bsendx_plist; + Llu->nbrecvx = nbrecvx; + Llu->nbsendx = nbsendx; + Llu->ilsum = ilsum; + Llu->ldalsum = ldaspa; + LUstruct->Glu_persist = Glu_persist; + Llu->LRtree_ptr = LRtree_ptr; + Llu->LBtree_ptr = LBtree_ptr; + Llu->URtree_ptr = URtree_ptr; + Llu->UBtree_ptr = UBtree_ptr; + Llu->Urbs = Urbs; + Llu->Ucb_indptr = Ucb_indptr; + Llu->Ucb_valptr = Ucb_valptr; + +#if ( PRNTlevel>=1 ) + if ( !iam ) printf(".. # L blocks " IFMT "\t# U blocks " IFMT "\n", + nLblocks, nUblocks); +#endif + + k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(Llu->mod_bit = intMalloc_dist(k)) ) + ABORT("Malloc fails for mod_bit[]."); + + /* Find the maximum buffer size. */ + MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, + MPI_MAX, grid->comm); + +#if ( DEBUGlevel>=1 ) + /* Memory allocated but not freed: + ilsum, fmod, fsendx_plist, bmod, bsendx_plist, + ToRecv, ToSendR, ToSendD, mod_bit + */ + CHECK_MALLOC(iam, "Exit dist_psymbtonum()"); +#endif + + return (- (memDist+memNLU)); +} /* sdist_psymbtonum */ + diff --git a/SRC/psutil.c b/SRC/psutil.c new file mode 100644 index 00000000..e08066be --- /dev/null +++ b/SRC/psutil.c @@ -0,0 +1,868 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Several matrix utilities + * + *
+ * <pre>
+ * -- Distributed SuperLU routine (version 2.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ * </pre>
+ */
+
+#include <math.h>
+#include "superlu_sdefs.h"
+
+/*! \brief Gather A from the distributed compressed row format to global A in compressed column format.
+ */
+int psCompRow_loc_to_CompCol_global
+(
+ int_t need_value, /* Input. Whether need to gather numerical values */
+ SuperMatrix *A,   /* Input. Distributed matrix in NRformat_loc format. */
+ gridinfo_t *grid, /* Input */
+ SuperMatrix *GA   /* Output */
+)
+{
+    NRformat_loc *Astore;
+    NCformat *GAstore;
+    float *a, *a_loc;
+    int_t *colind, *rowptr;
+    int_t *colptr_loc, *rowind_loc;
+    int_t m_loc, n, i, j, k, l;
+    int_t colnnz, fst_row, nnz_loc, nnz;
+    float *a_recv;  /* Buffer to receive the blocks of values. */
+    float *a_buf;   /* Buffer to merge blocks into block columns. */
+    int_t *itemp;
+    int_t *colptr_send; /* Buffer to redistribute the column pointers of the
+			   local block rows.
+			   Use n_loc+1 pointers for each block. */
+    int_t *colptr_blk;  /* The column pointers for each block, after
+			   redistribution to the local block columns.
+			   Use n_loc+1 pointers for each block. */
+    int_t *rowind_recv; /* Buffer to receive the blocks of row indices. */
+    int_t *rowind_buf;  /* Buffer to merge blocks into block columns. */
+    int_t *fst_rows, *n_locs;
+    int   *sendcnts, *sdispls, *recvcnts, *rdispls, *itemp_32;
+    int   it, n_loc, procs;
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(grid->iam, "Enter psCompRow_loc_to_CompCol_global");
+#endif
+
+    /* Initialization. */
+    n = A->ncol;
+    Astore = (NRformat_loc *) A->Store;
+    nnz_loc = Astore->nnz_loc;
+    m_loc = Astore->m_loc;
+    fst_row = Astore->fst_row;
+    a = Astore->nzval;
+    rowptr = Astore->rowptr;
+    colind = Astore->colind;
+    n_loc = m_loc; /* NOTE: CURRENTLY ONLY WORK FOR SQUARE MATRIX */
+
+    /* ------------------------------------------------------------
+       FIRST PHASE: TRANSFORM A INTO DISTRIBUTED COMPRESSED COLUMN.
+       ------------------------------------------------------------*/
+    sCompRow_to_CompCol_dist(m_loc, n, nnz_loc, a, colind, rowptr, &a_loc,
+                             &rowind_loc, &colptr_loc);
+    /* Change local row index numbers to global numbers. */
+    for (i = 0; i < nnz_loc; ++i) rowind_loc[i] += fst_row;
+
+#if ( DEBUGlevel>=2 )
+    printf("Proc %d\n", grid->iam);
+    PrintInt10("rowind_loc", nnz_loc, rowind_loc);
+    PrintInt10("colptr_loc", n+1, colptr_loc);
+#endif
+
+    procs = grid->nprow * grid->npcol;
+    if ( !(fst_rows = (int_t *) intMalloc_dist(2*procs)) )
+        ABORT("Malloc fails for fst_rows[]");
+    n_locs = fst_rows + procs;
+    MPI_Allgather(&fst_row, 1, mpi_int_t, fst_rows, 1, mpi_int_t,
+		  grid->comm);
+    for (i = 0; i < procs-1; ++i) n_locs[i] = fst_rows[i+1] - fst_rows[i];
+    n_locs[procs-1] = n - fst_rows[procs-1];
+    if ( !(recvcnts = SUPERLU_MALLOC(5*procs * sizeof(int))) )
+        ABORT("Malloc fails for recvcnts[]");
+    sendcnts = recvcnts + procs;
+    rdispls = sendcnts + procs;
+    sdispls = rdispls + procs;
+    itemp_32 = sdispls + procs;
+
+    /* All-to-all transfer column pointers of each block.
+       Now the matrix view is P-by-P block-partition. 
*/ + /* n column starts for each column, and procs column ends for each block */ + if ( !(colptr_send = intMalloc_dist(n + procs)) ) + ABORT("Malloc fails for colptr_send[]"); + if ( !(colptr_blk = intMalloc_dist( (((size_t) n_loc)+1)*procs)) ) + ABORT("Malloc fails for colptr_blk[]"); + for (i = 0, j = 0; i < procs; ++i) { + for (k = j; k < j + n_locs[i]; ++k) colptr_send[i+k] = colptr_loc[k]; + colptr_send[i+k] = colptr_loc[k]; /* Add an END marker */ + sendcnts[i] = n_locs[i] + 1; +#if ( DEBUGlevel>=1 ) + assert(j == fst_rows[i]); +#endif + sdispls[i] = j + i; + recvcnts[i] = n_loc + 1; + rdispls[i] = i * (n_loc + 1); + j += n_locs[i]; /* First column of next block in colptr_loc[] */ + } + MPI_Alltoallv(colptr_send, sendcnts, sdispls, mpi_int_t, + colptr_blk, recvcnts, rdispls, mpi_int_t, grid->comm); + + /* Adjust colptr_blk[] so that they contain the local indices of the + column pointers in the receive buffer. */ + nnz = 0; /* The running sum of the nonzeros counted by far */ + k = 0; + for (i = 0; i < procs; ++i) { + for (j = rdispls[i]; j < rdispls[i] + n_loc; ++j) { + colnnz = colptr_blk[j+1] - colptr_blk[j]; + /*assert(k<=j);*/ + colptr_blk[k] = nnz; + nnz += colnnz; /* Start of the next column */ + ++k; + } + colptr_blk[k++] = nnz; /* Add an END marker for each block */ + } + /*assert(k == (n_loc+1)*procs);*/ + + /* Now prepare to transfer row indices and values. */ + sdispls[0] = 0; + for (i = 0; i < procs-1; ++i) { + sendcnts[i] = colptr_loc[fst_rows[i+1]] - colptr_loc[fst_rows[i]]; + sdispls[i+1] = sdispls[i] + sendcnts[i]; + } + sendcnts[procs-1] = colptr_loc[n] - colptr_loc[fst_rows[procs-1]]; + for (i = 0; i < procs; ++i) { + j = rdispls[i]; /* Point to this block in colptr_blk[]. */ + recvcnts[i] = colptr_blk[j+n_loc] - colptr_blk[j]; + } + rdispls[0] = 0; /* Recompute rdispls[] for row indices. */ + for (i = 0; i < procs-1; ++i) rdispls[i+1] = rdispls[i] + recvcnts[i]; + + k = rdispls[procs-1] + recvcnts[procs-1]; /* Total received */ + if ( !(rowind_recv = (int_t *) intMalloc_dist(2*k)) ) + ABORT("Malloc fails for rowind_recv[]"); + rowind_buf = rowind_recv + k; + MPI_Alltoallv(rowind_loc, sendcnts, sdispls, mpi_int_t, + rowind_recv, recvcnts, rdispls, mpi_int_t, grid->comm); + if ( need_value ) { + if ( !(a_recv = (float *) floatMalloc_dist(2*k)) ) + ABORT("Malloc fails for rowind_recv[]"); + a_buf = a_recv + k; + MPI_Alltoallv(a_loc, sendcnts, sdispls, MPI_FLOAT, + a_recv, recvcnts, rdispls, MPI_FLOAT, + grid->comm); + } + + /* Reset colptr_loc[] to point to the n_loc global columns. */ + colptr_loc[0] = 0; + itemp = colptr_send; + for (j = 0; j < n_loc; ++j) { + colnnz = 0; + for (i = 0; i < procs; ++i) { + k = i * (n_loc + 1) + j; /* j-th column in i-th block */ + colnnz += colptr_blk[k+1] - colptr_blk[k]; + } + colptr_loc[j+1] = colptr_loc[j] + colnnz; + itemp[j] = colptr_loc[j]; /* Save a copy of the column starts */ + } + itemp[n_loc] = colptr_loc[n_loc]; + + /* Merge blocks of row indices into columns of row indices. 
*/ + for (i = 0; i < procs; ++i) { + k = i * (n_loc + 1); + for (j = 0; j < n_loc; ++j) { /* i-th block */ + for (l = colptr_blk[k+j]; l < colptr_blk[k+j+1]; ++l) { + rowind_buf[itemp[j]] = rowind_recv[l]; + ++itemp[j]; + } + } + } + + if ( need_value ) { + for (j = 0; j < n_loc+1; ++j) itemp[j] = colptr_loc[j]; + for (i = 0; i < procs; ++i) { + k = i * (n_loc + 1); + for (j = 0; j < n_loc; ++j) { /* i-th block */ + for (l = colptr_blk[k+j]; l < colptr_blk[k+j+1]; ++l) { + a_buf[itemp[j]] = a_recv[l]; + ++itemp[j]; + } + } + } + } + + /* ------------------------------------------------------------ + SECOND PHASE: GATHER TO GLOBAL A IN COMPRESSED COLUMN FORMAT. + ------------------------------------------------------------*/ + GA->nrow = A->nrow; + GA->ncol = A->ncol; + GA->Stype = SLU_NC; + GA->Dtype = A->Dtype; + GA->Mtype = A->Mtype; + GAstore = GA->Store = (NCformat *) SUPERLU_MALLOC ( sizeof(NCformat) ); + if ( !GAstore ) ABORT ("SUPERLU_MALLOC fails for GAstore"); + + /* First gather the size of each piece. */ + nnz_loc = colptr_loc[n_loc]; + MPI_Allgather(&nnz_loc, 1, mpi_int_t, itemp, 1, mpi_int_t, grid->comm); + for (i = 0, nnz = 0; i < procs; ++i) nnz += itemp[i]; + GAstore->nnz = nnz; + + if ( !(GAstore->rowind = (int_t *) intMalloc_dist (nnz)) ) + ABORT ("SUPERLU_MALLOC fails for GAstore->rowind[]"); + if ( !(GAstore->colptr = (int_t *) intMalloc_dist (n+1)) ) + ABORT ("SUPERLU_MALLOC fails for GAstore->colptr[]"); + + /* Allgatherv for row indices. */ + rdispls[0] = 0; + for (i = 0; i < procs-1; ++i) { + rdispls[i+1] = rdispls[i] + itemp[i]; + itemp_32[i] = itemp[i]; + } + itemp_32[procs-1] = itemp[procs-1]; + it = nnz_loc; + MPI_Allgatherv(rowind_buf, it, mpi_int_t, GAstore->rowind, + itemp_32, rdispls, mpi_int_t, grid->comm); + if ( need_value ) { + if ( !(GAstore->nzval = (float *) floatMalloc_dist (nnz)) ) + ABORT ("SUPERLU_MALLOC fails for GAstore->rnzval[]"); + MPI_Allgatherv(a_buf, it, MPI_FLOAT, GAstore->nzval, + itemp_32, rdispls, MPI_FLOAT, grid->comm); + } else GAstore->nzval = NULL; + + /* Now gather the column pointers. */ + rdispls[0] = 0; + for (i = 0; i < procs-1; ++i) { + rdispls[i+1] = rdispls[i] + n_locs[i]; + itemp_32[i] = n_locs[i]; + } + itemp_32[procs-1] = n_locs[procs-1]; + MPI_Allgatherv(colptr_loc, n_loc, mpi_int_t, GAstore->colptr, + itemp_32, rdispls, mpi_int_t, grid->comm); + + /* Recompute column pointers. */ + for (i = 1; i < procs; ++i) { + k = rdispls[i]; + for (j = 0; j < n_locs[i]; ++j) GAstore->colptr[k++] += itemp[i-1]; + itemp[i] += itemp[i-1]; /* prefix sum */ + } + GAstore->colptr[n] = nnz; + +#if ( DEBUGlevel>=2 ) + if ( !grid->iam ) { + printf("After pdCompRow_loc_to_CompCol_global()\n"); + sPrint_CompCol_Matrix_dist(GA); + } +#endif + + SUPERLU_FREE(a_loc); + SUPERLU_FREE(rowind_loc); + SUPERLU_FREE(colptr_loc); + SUPERLU_FREE(fst_rows); + SUPERLU_FREE(recvcnts); + SUPERLU_FREE(colptr_send); + SUPERLU_FREE(colptr_blk); + SUPERLU_FREE(rowind_recv); + if ( need_value) SUPERLU_FREE(a_recv); +#if ( DEBUGlevel>=1 ) + if ( !grid->iam ) printf("sizeof(NCformat) %lu\n", sizeof(NCformat)); + CHECK_MALLOC(grid->iam, "Exit psCompRow_loc_to_CompCol_global"); +#endif + return 0; +} /* psCompRow_loc_to_CompCol_global */ + + +/*! \brief Permute the distributed dense matrix: B <= perm(X). perm[i] = j means the i-th row of X is in the j-th row of B. 
+ */ +int psPermute_Dense_Matrix +( + int_t fst_row, + int_t m_loc, + int_t row_to_proc[], + int_t perm[], + float X[], int ldx, + float B[], int ldb, + int nrhs, + gridinfo_t *grid +) +{ + int_t i, j, k, l; + int p, procs; + int *sendcnts, *sendcnts_nrhs, *recvcnts, *recvcnts_nrhs; + int *sdispls, *sdispls_nrhs, *rdispls, *rdispls_nrhs; + int *ptr_to_ibuf, *ptr_to_dbuf; + int_t *send_ibuf, *recv_ibuf; + float *send_dbuf, *recv_dbuf; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(grid->iam, "Enter psPermute_Dense_Matrix()"); +#endif + + procs = grid->nprow * grid->npcol; + if ( !(sendcnts = SUPERLU_MALLOC(10*procs * sizeof(int))) ) + ABORT("Malloc fails for sendcnts[]."); + sendcnts_nrhs = sendcnts + procs; + recvcnts = sendcnts_nrhs + procs; + recvcnts_nrhs = recvcnts + procs; + sdispls = recvcnts_nrhs + procs; + sdispls_nrhs = sdispls + procs; + rdispls = sdispls_nrhs + procs; + rdispls_nrhs = rdispls + procs; + ptr_to_ibuf = rdispls_nrhs + procs; + ptr_to_dbuf = ptr_to_ibuf + procs; + + for (i = 0; i < procs; ++i) sendcnts[i] = 0; + + /* Count the number of X entries to be sent to each process.*/ + for (i = fst_row; i < fst_row + m_loc; ++i) { + p = row_to_proc[perm[i]]; + ++sendcnts[p]; + } + MPI_Alltoall(sendcnts, 1, MPI_INT, recvcnts, 1, MPI_INT, grid->comm); + sdispls[0] = rdispls[0] = 0; + sdispls_nrhs[0] = rdispls_nrhs[0] = 0; + sendcnts_nrhs[0] = sendcnts[0] * nrhs; + recvcnts_nrhs[0] = recvcnts[0] * nrhs; + for (i = 1; i < procs; ++i) { + sdispls[i] = sdispls[i-1] + sendcnts[i-1]; + sdispls_nrhs[i] = sdispls[i] * nrhs; + rdispls[i] = rdispls[i-1] + recvcnts[i-1]; + rdispls_nrhs[i] = rdispls[i] * nrhs; + sendcnts_nrhs[i] = sendcnts[i] * nrhs; + recvcnts_nrhs[i] = recvcnts[i] * nrhs; + } + k = sdispls[procs-1] + sendcnts[procs-1];/* Total number of sends */ + l = rdispls[procs-1] + recvcnts[procs-1];/* Total number of recvs */ + /*assert(k == m_loc);*/ + /*assert(l == m_loc);*/ + if ( !(send_ibuf = intMalloc_dist(k + l)) ) + ABORT("Malloc fails for send_ibuf[]."); + recv_ibuf = send_ibuf + k; + if ( !(send_dbuf = floatMalloc_dist((k + l)*nrhs)) ) + ABORT("Malloc fails for send_dbuf[]."); + recv_dbuf = send_dbuf + k * nrhs; + + for (i = 0; i < procs; ++i) { + ptr_to_ibuf[i] = sdispls[i]; + ptr_to_dbuf[i] = sdispls_nrhs[i]; + } + + /* Fill in the send buffers: send_ibuf[] and send_dbuf[]. */ + for (i = fst_row; i < fst_row + m_loc; ++i) { + j = perm[i]; + p = row_to_proc[j]; + send_ibuf[ptr_to_ibuf[p]] = j; + j = ptr_to_dbuf[p]; + RHS_ITERATE(k) { /* RHS stored in row major in the buffer */ + send_dbuf[j++] = X[i-fst_row + k*ldx]; + } + ++ptr_to_ibuf[p]; + ptr_to_dbuf[p] += nrhs; + } + + /* Transfer the (permuted) row indices and numerical values. */ + MPI_Alltoallv(send_ibuf, sendcnts, sdispls, mpi_int_t, + recv_ibuf, recvcnts, rdispls, mpi_int_t, grid->comm); + MPI_Alltoallv(send_dbuf, sendcnts_nrhs, sdispls_nrhs, MPI_FLOAT, + recv_dbuf, recvcnts_nrhs, rdispls_nrhs, MPI_FLOAT, + grid->comm); + + /* Copy the buffer into b. */ + for (i = 0, l = 0; i < m_loc; ++i) { + j = recv_ibuf[i] - fst_row; /* Relative row number */ + RHS_ITERATE(k) { /* RHS stored in row major in the buffer */ + B[j + k*ldb] = recv_dbuf[l++]; + } + } + + SUPERLU_FREE(sendcnts); + SUPERLU_FREE(send_ibuf); + SUPERLU_FREE(send_dbuf); +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(grid->iam, "Exit psPermute_Dense_Matrix()"); +#endif + return 0; +} /* psPermute_Dense_Matrix */ + + +/*! 
\brief Allocate storage in LUstruct */
+void sLUstructInit(const int_t n, sLUstruct_t *LUstruct)
+{
+    if ( !(LUstruct->etree = intMalloc_dist(n)) )
+	ABORT("Malloc fails for etree[].");
+    if ( !(LUstruct->Glu_persist = (Glu_persist_t *)
+	   SUPERLU_MALLOC(sizeof(Glu_persist_t))) )
+	ABORT("Malloc fails for Glu_persist_t.");
+    if ( !(LUstruct->Llu = (sLocalLU_t *)
+	   SUPERLU_MALLOC(sizeof(sLocalLU_t))) )
+	ABORT("Malloc fails for LocalLU_t.");
+    LUstruct->Llu->inv = 0;
+}
+
+/*! \brief Deallocate LUstruct */
+void sLUstructFree(sLUstruct_t *LUstruct)
+{
+#if ( DEBUGlevel>=1 )
+    int iam;
+    MPI_Comm_rank( MPI_COMM_WORLD, &iam );
+    CHECK_MALLOC(iam, "Enter sLUstructFree()");
+#endif
+
+    SUPERLU_FREE(LUstruct->etree);
+    SUPERLU_FREE(LUstruct->Glu_persist);
+    SUPERLU_FREE(LUstruct->Llu);
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit sLUstructFree()");
+#endif
+}
+
+/*! \brief Destroy distributed L & U matrices. */
+void
+sDestroy_LU(int_t n, gridinfo_t *grid, sLUstruct_t *LUstruct)
+{
+    int_t i, nb, nsupers;
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    sLocalLU_t *Llu = LUstruct->Llu;
+
+#if ( DEBUGlevel>=1 )
+    int iam;
+    MPI_Comm_rank( MPI_COMM_WORLD, &iam );
+    CHECK_MALLOC(iam, "Enter sDestroy_LU()");
+#endif
+
+    sDestroy_Tree(n, grid, LUstruct);
+
+    nsupers = Glu_persist->supno[n-1] + 1;
+
+    nb = CEILING(nsupers, grid->npcol);
+    for (i = 0; i < nb; ++i)
+	if ( Llu->Lrowind_bc_ptr[i] ) {
+	    SUPERLU_FREE (Llu->Lrowind_bc_ptr[i]);
+#if 0 // Sherry: the following is not allocated with cudaHostAlloc
+    //#ifdef GPU_ACC
+	    checkCuda(cudaFreeHost(Llu->Lnzval_bc_ptr[i]));
+#endif
+	    SUPERLU_FREE (Llu->Lnzval_bc_ptr[i]);
+	}
+    SUPERLU_FREE (Llu->Lrowind_bc_ptr);
+    SUPERLU_FREE (Llu->Lnzval_bc_ptr);
+
+    nb = CEILING(nsupers, grid->nprow);
+    for (i = 0; i < nb; ++i)
+	if ( Llu->Ufstnz_br_ptr[i] ) {
+	    SUPERLU_FREE (Llu->Ufstnz_br_ptr[i]);
+	    SUPERLU_FREE (Llu->Unzval_br_ptr[i]);
+	}
+    SUPERLU_FREE (Llu->Ufstnz_br_ptr);
+    SUPERLU_FREE (Llu->Unzval_br_ptr);
+
+    /* The following can be freed after factorization. */
+    SUPERLU_FREE(Llu->ToRecv);
+    SUPERLU_FREE(Llu->ToSendD);
+    SUPERLU_FREE(Llu->ToSendR[0]);
+    SUPERLU_FREE(Llu->ToSendR);
+
+    /* The following can be freed only after iterative refinement. */
+    SUPERLU_FREE(Llu->ilsum);
+    SUPERLU_FREE(Llu->fmod);
+    SUPERLU_FREE(Llu->fsendx_plist[0]);
+    SUPERLU_FREE(Llu->fsendx_plist);
+    SUPERLU_FREE(Llu->bmod);
+    SUPERLU_FREE(Llu->bsendx_plist[0]);
+    SUPERLU_FREE(Llu->bsendx_plist);
+    SUPERLU_FREE(Llu->mod_bit);
+
+    nb = CEILING(nsupers, grid->npcol);
+    for (i = 0; i < nb; ++i)
+	if ( Llu->Lindval_loc_bc_ptr[i]!=NULL) {
+	    SUPERLU_FREE (Llu->Lindval_loc_bc_ptr[i]);
+	}
+    SUPERLU_FREE(Llu->Lindval_loc_bc_ptr);
+
+    nb = CEILING(nsupers, grid->npcol);
+    for (i=0; i<nb; ++i) {
+	if(Llu->Linv_bc_ptr[i]!=NULL) {
+	    SUPERLU_FREE(Llu->Linv_bc_ptr[i]);
+	}
+	if(Llu->Uinv_bc_ptr[i]!=NULL){
+	    SUPERLU_FREE(Llu->Uinv_bc_ptr[i]);
+	}
+    }
+    SUPERLU_FREE(Llu->Linv_bc_ptr);
+    SUPERLU_FREE(Llu->Uinv_bc_ptr);
+    SUPERLU_FREE(Llu->Unnz);
+
+    nb = CEILING(nsupers, grid->npcol);
+    for (i = 0; i < nb; ++i)
+	if ( Llu->Urbs[i] ) {
+	    SUPERLU_FREE(Llu->Ucb_indptr[i]);
+	    SUPERLU_FREE(Llu->Ucb_valptr[i]);
+	}
+    SUPERLU_FREE(Llu->Ucb_indptr);
+    SUPERLU_FREE(Llu->Ucb_valptr);
+    SUPERLU_FREE(Llu->Urbs);
+
+    SUPERLU_FREE(Glu_persist->xsup);
+    SUPERLU_FREE(Glu_persist->supno);
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit sDestroy_LU()");
+#endif
+}
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *   Set up the communication pattern for redistribution between B and X
+ *   in the triangular solution.
+ * 
+ * Arguments
+ * =========
+ *
+ * n      (input) int (global)
+ *        The dimension of the linear system.
+ *
+ * m_loc  (input) int (local)
+ *        The local row dimension of the distributed input matrix.
+ *
+ * nrhs   (input) int (global)
+ *        Number of right-hand sides.
+ *
+ * fst_row (input) int (global)
+ *        The row number of matrix B's first row in the global matrix.
+ *
+ * perm_r (input) int* (global)
+ *        The row permutation vector.
+ *
+ * perm_c (input) int* (global)
+ *        The column permutation vector.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ * </pre>
+ */ +int_t +psgstrs_init(int_t n, int_t m_loc, int_t nrhs, int_t fst_row, + int_t perm_r[], int_t perm_c[], gridinfo_t *grid, + Glu_persist_t *Glu_persist, sSOLVEstruct_t *SOLVEstruct) +{ + + int *SendCnt, *SendCnt_nrhs, *RecvCnt, *RecvCnt_nrhs; + int *sdispls, *sdispls_nrhs, *rdispls, *rdispls_nrhs; + int *itemp, *ptr_to_ibuf, *ptr_to_dbuf; + int_t *row_to_proc; + int_t i, gbi, k, l, num_diag_procs, *diag_procs; + int_t irow, q, knsupc, nsupers, *xsup, *supno; + int iam, p, pkk, procs; + pxgstrs_comm_t *gstrs_comm; + + procs = grid->nprow * grid->npcol; + iam = grid->iam; + gstrs_comm = SOLVEstruct->gstrs_comm; + xsup = Glu_persist->xsup; + supno = Glu_persist->supno; + nsupers = Glu_persist->supno[n-1] + 1; + row_to_proc = SOLVEstruct->row_to_proc; + + /* ------------------------------------------------------------ + SET UP COMMUNICATION PATTERN FOR ReDistribute_B_to_X. + ------------------------------------------------------------*/ + if ( !(itemp = SUPERLU_MALLOC(8*procs * sizeof(int))) ) + ABORT("Malloc fails for B_to_X_itemp[]."); + SendCnt = itemp; + SendCnt_nrhs = itemp + procs; + RecvCnt = itemp + 2*procs; + RecvCnt_nrhs = itemp + 3*procs; + sdispls = itemp + 4*procs; + sdispls_nrhs = itemp + 5*procs; + rdispls = itemp + 6*procs; + rdispls_nrhs = itemp + 7*procs; + + /* Count the number of elements to be sent to each diagonal process.*/ + for (p = 0; p < procs; ++p) SendCnt[p] = 0; + for (i = 0, l = fst_row; i < m_loc; ++i, ++l) { + irow = perm_c[perm_r[l]]; /* Row number in Pc*Pr*B */ + gbi = BlockNum( irow ); + p = PNUM( PROW(gbi,grid), PCOL(gbi,grid), grid ); /* Diagonal process */ + ++SendCnt[p]; + } + + /* Set up the displacements for alltoall. */ + MPI_Alltoall(SendCnt, 1, MPI_INT, RecvCnt, 1, MPI_INT, grid->comm); + sdispls[0] = rdispls[0] = 0; + for (p = 1; p < procs; ++p) { + sdispls[p] = sdispls[p-1] + SendCnt[p-1]; + rdispls[p] = rdispls[p-1] + RecvCnt[p-1]; + } + for (p = 0; p < procs; ++p) { + SendCnt_nrhs[p] = SendCnt[p] * nrhs; + sdispls_nrhs[p] = sdispls[p] * nrhs; + RecvCnt_nrhs[p] = RecvCnt[p] * nrhs; + rdispls_nrhs[p] = rdispls[p] * nrhs; + } + + /* This is saved for repeated solves, and is freed in pxgstrs_finalize().*/ + gstrs_comm->B_to_X_SendCnt = SendCnt; + + /* ------------------------------------------------------------ + SET UP COMMUNICATION PATTERN FOR ReDistribute_X_to_B. + ------------------------------------------------------------*/ + /* This is freed in pxgstrs_finalize(). 
*/ + if ( !(itemp = SUPERLU_MALLOC(8*procs * sizeof(int))) ) + ABORT("Malloc fails for X_to_B_itemp[]."); + SendCnt = itemp; + SendCnt_nrhs = itemp + procs; + RecvCnt = itemp + 2*procs; + RecvCnt_nrhs = itemp + 3*procs; + sdispls = itemp + 4*procs; + sdispls_nrhs = itemp + 5*procs; + rdispls = itemp + 6*procs; + rdispls_nrhs = itemp + 7*procs; + + /* Count the number of X entries to be sent to each process.*/ + for (p = 0; p < procs; ++p) SendCnt[p] = 0; + num_diag_procs = SOLVEstruct->num_diag_procs; + diag_procs = SOLVEstruct->diag_procs; + + for (p = 0; p < num_diag_procs; ++p) { /* for all diagonal processes */ + pkk = diag_procs[p]; + if ( iam == pkk ) { + for (k = p; k < nsupers; k += num_diag_procs) { + knsupc = SuperSize( k ); + irow = FstBlockC( k ); + for (i = 0; i < knsupc; ++i) { +#if 0 + q = row_to_proc[inv_perm_c[irow]]; +#else + q = row_to_proc[irow]; +#endif + ++SendCnt[q]; + ++irow; + } + } + } + } + + MPI_Alltoall(SendCnt, 1, MPI_INT, RecvCnt, 1, MPI_INT, grid->comm); + sdispls[0] = rdispls[0] = 0; + sdispls_nrhs[0] = rdispls_nrhs[0] = 0; + SendCnt_nrhs[0] = SendCnt[0] * nrhs; + RecvCnt_nrhs[0] = RecvCnt[0] * nrhs; + for (p = 1; p < procs; ++p) { + sdispls[p] = sdispls[p-1] + SendCnt[p-1]; + rdispls[p] = rdispls[p-1] + RecvCnt[p-1]; + sdispls_nrhs[p] = sdispls[p] * nrhs; + rdispls_nrhs[p] = rdispls[p] * nrhs; + SendCnt_nrhs[p] = SendCnt[p] * nrhs; + RecvCnt_nrhs[p] = RecvCnt[p] * nrhs; + } + + /* This is saved for repeated solves, and is freed in pxgstrs_finalize().*/ + gstrs_comm->X_to_B_SendCnt = SendCnt; + + if ( !(ptr_to_ibuf = SUPERLU_MALLOC(2*procs * sizeof(int))) ) + ABORT("Malloc fails for ptr_to_ibuf[]."); + gstrs_comm->ptr_to_ibuf = ptr_to_ibuf; + gstrs_comm->ptr_to_dbuf = ptr_to_ibuf + procs; + + return 0; +} /* PSGSTRS_INIT */ + + +/*! \brief Initialize the data structure for the solution phase. + */ +int sSolveInit(superlu_dist_options_t *options, SuperMatrix *A, + int_t perm_r[], int_t perm_c[], int_t nrhs, + sLUstruct_t *LUstruct, gridinfo_t *grid, + sSOLVEstruct_t *SOLVEstruct) +{ + int_t *row_to_proc, *inv_perm_c, *itemp; + NRformat_loc *Astore; + int_t i, fst_row, m_loc, p; + int procs; + + Astore = (NRformat_loc *) A->Store; + fst_row = Astore->fst_row; + m_loc = Astore->m_loc; + procs = grid->nprow * grid->npcol; + + if ( !(row_to_proc = intMalloc_dist(A->nrow)) ) + ABORT("Malloc fails for row_to_proc[]"); + SOLVEstruct->row_to_proc = row_to_proc; + if ( !(inv_perm_c = intMalloc_dist(A->ncol)) ) + ABORT("Malloc fails for inv_perm_c[]."); + for (i = 0; i < A->ncol; ++i) inv_perm_c[perm_c[i]] = i; + SOLVEstruct->inv_perm_c = inv_perm_c; + + /* ------------------------------------------------------------ + EVERY PROCESS NEEDS TO KNOW GLOBAL PARTITION. + SET UP THE MAPPING BETWEEN ROWS AND PROCESSES. + + NOTE: For those processes that do not own any row, it must + must be set so that fst_row == A->nrow. + ------------------------------------------------------------*/ + if ( !(itemp = intMalloc_dist(procs+1)) ) + ABORT("Malloc fails for itemp[]"); + MPI_Allgather(&fst_row, 1, mpi_int_t, itemp, 1, mpi_int_t, + grid->comm); + itemp[procs] = A->nrow; + for (p = 0; p < procs; ++p) { + for (i = itemp[p] ; i < itemp[p+1]; ++i) row_to_proc[i] = p; + } +#if ( DEBUGlevel>=2 ) + if ( !grid->iam ) { + printf("fst_row = %d\n", fst_row); + PrintInt10("row_to_proc", A->nrow, row_to_proc); + PrintInt10("inv_perm_c", A->ncol, inv_perm_c); + } +#endif + SUPERLU_FREE(itemp); + +#if 0 + /* Compute the mapping between rows and processes. 
*/
+    /* XSL NOTE: What happens if # of mapped processes is smaller
+       than total Procs?  For the processes without any row, let
+       fst_row be EMPTY (-1). Make sure this case works! */
+    MPI_Allgather(&fst_row, 1, mpi_int_t, itemp, 1, mpi_int_t,
+		  grid->comm);
+    itemp[procs] = n;
+    for (p = 0; p < procs; ++p) {
+        j = itemp[p];
+	if ( j != EMPTY ) {
+	    k = itemp[p+1];
+	    if ( k == EMPTY ) k = n;
+	    for (i = j ; i < k; ++i) row_to_proc[i] = p;
+	}
+    }
+#endif
+
+    get_diag_procs(A->ncol, LUstruct->Glu_persist, grid,
+		   &SOLVEstruct->num_diag_procs,
+		   &SOLVEstruct->diag_procs,
+		   &SOLVEstruct->diag_len);
+
+    /* Setup communication pattern for redistribution of B and X. */
+    if ( !(SOLVEstruct->gstrs_comm = (pxgstrs_comm_t *)
+	   SUPERLU_MALLOC(sizeof(pxgstrs_comm_t))) )
+	ABORT("Malloc fails for gstrs_comm[]");
+    psgstrs_init(A->ncol, m_loc, nrhs, fst_row, perm_r, perm_c, grid,
+		 LUstruct->Glu_persist, SOLVEstruct);
+
+    if ( !(SOLVEstruct->gsmv_comm = (psgsmv_comm_t *)
+           SUPERLU_MALLOC(sizeof(psgsmv_comm_t))) )
+        ABORT("Malloc fails for gsmv_comm[]");
+    SOLVEstruct->A_colind_gsmv = NULL;
+
+    options->SolveInitialized = YES;
+    return 0;
+} /* sSolveInit */
+
+/*! \brief Release the resources used for the solution phase.
+ */
+void sSolveFinalize(superlu_dist_options_t *options, sSOLVEstruct_t *SOLVEstruct)
+{
+    pxgstrs_finalize(SOLVEstruct->gstrs_comm);
+
+    if ( options->RefineInitialized ) {
+        psgsmv_finalize(SOLVEstruct->gsmv_comm);
+        options->RefineInitialized = NO;
+    }
+    SUPERLU_FREE(SOLVEstruct->gsmv_comm);
+    SUPERLU_FREE(SOLVEstruct->row_to_proc);
+    SUPERLU_FREE(SOLVEstruct->inv_perm_c);
+    SUPERLU_FREE(SOLVEstruct->diag_procs);
+    SUPERLU_FREE(SOLVEstruct->diag_len);
+    if ( SOLVEstruct->A_colind_gsmv ) SUPERLU_FREE(SOLVEstruct->A_colind_gsmv);
+    options->SolveInitialized = NO;
+} /* sSolveFinalize */
+
+/*! \brief Check the inf-norm of the error vector
+ */
+void psinf_norm_error(int iam, int_t n, int_t nrhs, float x[], int_t ldx,
+		      float xtrue[], int_t ldxtrue, MPI_Comm slucomm)
+{
+    float err, xnorm, temperr, tempxnorm;
+    float *x_work, *xtrue_work;
+    int i, j;
+
+    for (j = 0; j < nrhs; j++) {
+      x_work = &x[j*ldx];
+      xtrue_work = &xtrue[j*ldxtrue];
+      err = xnorm = 0.0;
+      for (i = 0; i < n; i++) {
+	err = SUPERLU_MAX(err, fabs(x_work[i] - xtrue_work[i]));
+	xnorm = SUPERLU_MAX(xnorm, fabs(x_work[i]));
+      }
+
+      /* get the global max err & xnorm */
+      temperr = err;
+      tempxnorm = xnorm;
+      MPI_Allreduce( &temperr, &err, 1, MPI_FLOAT, MPI_MAX, slucomm);
+      MPI_Allreduce( &tempxnorm, &xnorm, 1, MPI_FLOAT, MPI_MAX, slucomm);
+
+      err = err / xnorm;
+      if ( !iam ) printf("\tSol %2d: ||X-Xtrue||/||X|| = %e\n", j, err);
+    }
+}
+
+/*! 
\brief Destroy broadcast and reduction trees used in triangular solve */
+void
+sDestroy_Tree(int_t n, gridinfo_t *grid, sLUstruct_t *LUstruct)
+{
+    int_t i, nb, nsupers;
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    sLocalLU_t *Llu = LUstruct->Llu;
+#if ( DEBUGlevel>=1 )
+    int iam;
+    MPI_Comm_rank( MPI_COMM_WORLD, &iam );
+    CHECK_MALLOC(iam, "Enter sDestroy_Tree()");
+#endif
+
+    nsupers = Glu_persist->supno[n-1] + 1;
+
+    nb = CEILING(nsupers, grid->npcol);
+    for (i=0; i<nb; ++i) {
+	if(Llu->LBtree_ptr[i]!=NULL){
+	    BcTree_Destroy(Llu->LBtree_ptr[i],LUstruct->dt);
+	}
+	if(Llu->UBtree_ptr[i]!=NULL){
+	    BcTree_Destroy(Llu->UBtree_ptr[i],LUstruct->dt);
+	}
+    }
+    SUPERLU_FREE(Llu->LBtree_ptr);
+    SUPERLU_FREE(Llu->UBtree_ptr);
+
+    nb = CEILING(nsupers, grid->nprow);
+    for (i=0; i<nb; ++i) {
+	if(Llu->LRtree_ptr[i]!=NULL){
+	    RdTree_Destroy(Llu->LRtree_ptr[i],LUstruct->dt);
+	}
+	if(Llu->URtree_ptr[i]!=NULL){
+	    RdTree_Destroy(Llu->URtree_ptr[i],LUstruct->dt);
+	}
+    }
+    SUPERLU_FREE(Llu->LRtree_ptr);
+    SUPERLU_FREE(Llu->URtree_ptr);
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC(iam, "Exit sDestroy_Tree()");
+#endif
+}
+
+
diff --git a/SRC/sSchCompUdt-2Ddynamic.c b/SRC/sSchCompUdt-2Ddynamic.c
new file mode 100644
index 00000000..f4dc643e
--- /dev/null
+++ b/SRC/sSchCompUdt-2Ddynamic.c
@@ -0,0 +1,714 @@
+/*! \file
Copyright (c) 2003, The Regents of the University of California, through
Lawrence Berkeley National Laboratory (subject to receipt of any required
approvals from U.S. Dept. of Energy)

All rights reserved.

The source code is distributed under BSD license, see the file License.txt
at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief This file contains the main loop of psgstrf which involves rank k
+ *        update of the Schur complement.
+ *        Uses 2D partitioning for the scatter phase.
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 5.4) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 1, 2014
+ *
+ * Modified:
+ *   September 14, 2017
+ *   - First gather U-panel, then depending on "ldu" (excluding leading zeros),
+ *     gather only trailing columns of the L-panel corresponding to the nonzero
+ *     of U-rows.
+ *   - Padding zeros for nice dimensions of GEMM.
+ *
+ *  June 1, 2018  add parallel AWPM pivoting; add back arrive_at_ublock()
+ * </pre>
+ */
+
+#define SCHEDULE_STRATEGY guided
+
+/*
+ * Buffers:
+ *     [ lookAhead_L_buff | Remain_L_buff ] : stores the gathered L-panel
+ *                                            (A matrix in C := A*B )
+ *     bigU : stores the U-panel (B matrix in C := A*B)
+ *     bigV : stores the block GEMM result (C matrix in C := A*B)
+ */
+
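+/*
+ * Illustrative sketch (editor's summary, not executed code): with ldu
+ * nonzero U rows, Lnbrow look-ahead rows and Rnbrow remaining rows, the
+ * update below amounts to
+ *     tempv1 (temp_nbrow x ncols) := L-block (temp_nbrow x ldu) * bigU (ldu x ncols)
+ * performed block-by-block for the look-ahead rows, and to one aggregated
+ *     bigV (Rnbrow x ncols)      := Remain_L_buff (Rnbrow x ldu) * bigU (ldu x ncols)
+ * for the remaining rows, each followed by a scatter into L and U blocks.
+ */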
+if ( msg0 && msg2 ) { /* L(:,k) and U(k,:) are not empty. */
+    int cum_nrow = 0; /* cumulative number of nonzero rows in L(:,k) */
+    int temp_nbrow;   /* nonzero rows in current block L(i,k) */
+    lptr  = lptr0;
+    luptr = luptr0;
+    int Lnbrow, Rnbrow; /* number of nonzero rows in look-ahead window,
+			   and remaining part.  */
+
+    /*******************************************************************
+     * Separating L blocks into the top part within look-ahead window
+     * and the remaining ones.
+     *******************************************************************/
+
+     int lookAheadBlk=0, RemainBlk=0;
+
+     tt_start = SuperLU_timer_();
+
+     /* Sherry -- can this loop be threaded?? */
+     /* Loop through all blocks in L(:,k) to set up pointers to the start
+      * of each block in the data arrays.
+      *   - lookAheadFullRow[i] := number of nonzero rows from block 0 to i
+      *   - lookAheadStRow[i] := number of nonzero rows before block i
+      *   - lookAhead_lptr[i] := point to the start of block i in L's index[]
+      *   - (ditto Remain_Info[i])
+      */
+     for (int i = 0; i < nlb; ++i) {
+	 ib = lsub[lptr];            /* Block number of L(i,k). */
+	 temp_nbrow = lsub[lptr+1];  /* Number of full rows. */
+
+	 int look_up_flag = 1; /* assume ib is outside look-up window */
+	 for (int j = k0+1; j < SUPERLU_MIN (k0 + num_look_aheads+2, nsupers );
+	      ++j) {
+		 if ( ib == perm_c_supno[j] ) {
+		     look_up_flag = 0; /* flag ib within look-up window */
+                     break;            /* Sherry -- can exit the loop?? */
+                 }
+	 }
+
+	 if ( look_up_flag == 0 ) { /* ib is within look-up window */
+	     if (lookAheadBlk==0) {
+		 lookAheadFullRow[lookAheadBlk] = temp_nbrow;
+	     } else {
+		 lookAheadFullRow[lookAheadBlk] =
+		     temp_nbrow + lookAheadFullRow[lookAheadBlk-1];
+	     }
+	     lookAheadStRow[lookAheadBlk] = cum_nrow;
+	     lookAhead_lptr[lookAheadBlk] = lptr;
+	     lookAhead_ib[lookAheadBlk] = ib;
+	     lookAheadBlk++;
+	 } else { /* ib is not in look-up window */
+	     if ( RemainBlk==0 ) {
+		 Remain_info[RemainBlk].FullRow = temp_nbrow;
+	     } else {
+		 Remain_info[RemainBlk].FullRow =
+		     temp_nbrow + Remain_info[RemainBlk-1].FullRow;
+	     }
+             RemainStRow[RemainBlk] = cum_nrow;
+             // Remain_lptr[RemainBlk] = lptr;
+	     Remain_info[RemainBlk].lptr = lptr;
+	     // Remain_ib[RemainBlk] = ib;
+	     Remain_info[RemainBlk].ib = ib;
+	     RemainBlk++;
+	 }
+
+         cum_nrow += temp_nbrow;
+
+	 lptr += LB_DESCRIPTOR;  /* Skip descriptor. */
+	 lptr += temp_nbrow;     /* Move to next block */
+	 luptr += temp_nbrow;
+     }  /* for i ... set up pointers for all blocks in L(:,k) */
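+     /* At this point lookAheadFullRow[] and Remain_info[].FullRow hold
+      * inclusive prefix sums of the block row counts; e.g., hypothetical
+      * counts {3,5,2} become {3,8,10}, so block i occupies buffer rows
+      * [FullRow[i-1], FullRow[i]), with FullRow[-1] read as 0.
+      */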
+
+     lptr = lptr0;
+     luptr = luptr0;
+
+     /* leading dimension of L look-ahead buffer, same as Lnbrow */
+     //int LDlookAhead_LBuff = lookAheadBlk==0 ? 0 :lookAheadFullRow[lookAheadBlk-1];
+     Lnbrow = lookAheadBlk==0 ? 0 : lookAheadFullRow[lookAheadBlk-1];
+     /* leading dimension of L remaining buffer, same as Rnbrow */
+     //int LDRemain_LBuff = RemainBlk==0 ? 0 : Remain_info[RemainBlk-1].FullRow;
+     Rnbrow = RemainBlk==0 ? 0 : Remain_info[RemainBlk-1].FullRow;
+     /* assert( cum_nrow == (LDlookAhead_LBuff + LDRemain_LBuff) );*/
+     /* Piyush fix */
+     //int LDlookAhead_LBuff = lookAheadBlk==0? 0 : lookAheadFullRow[lookAheadBlk-1];
+
+     nbrow = Lnbrow + Rnbrow; /* total number of rows in L */
+     LookAheadRowSepMOP += 2*knsupc*(nbrow);
+
+     /***********************************************
+      * Gather U blocks (AFTER LOOK-AHEAD WINDOW)   *
+      ***********************************************/
+     tt_start = SuperLU_timer_();
+
+     if ( nbrow > 0 ) { /* L(:,k) is not empty */
+	 /*
+	  * Counting U blocks
+	  */
+     	 ldu = 0; /* Calculate ldu for U(k,:) after look-ahead window. */
+	 ncols = 0; /* Total number of nonzero columns in U(k,:) */
+	 int temp_ncols = 0;
+
+	 /* jj0 contains the look-ahead window that was updated in
+	    slook_ahead_update.c. Now the search can continue from that point,
+	    not to start from block 0. */
+#if 0 // Sherry comment out 5/21/2018
+	 /* Save pointers at location right after look-ahead window
+	    for later restart. */
+	 iukp0 = iukp;
+	 rukp0 = rukp;
+#endif
+
+	 /* if ( iam==0 ) printf("--- k0 %d, k %d, jj0 %d, nub %d\n", k0, k, jj0, nub);*/
+
+         /*
+	  * Loop through all blocks in U(k,:) to set up pointers to the start
+          * of each block in the data arrays, store them in Ublock_info[j]
+          * for block U(k,j).
+  	  */
+	 for (j = jj0; j < nub; ++j) { /* jj0 starts after look-ahead window. */
+	     temp_ncols = 0;
+#if 1
+	     /* Cannot remove following call, since perm_u != Identity  */
+	     arrive_at_ublock(
+			      j, &iukp, &rukp, &jb, &ljb, &nsupc,
+			      iukp0, rukp0, usub, perm_u, xsup, grid
+			      );
+#else
+	     jb = usub[iukp];
+	     /* ljb = LBj (jb, grid);   Local block number of U(k,j). */
+	     nsupc = SuperSize(jb);
+	     iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */
+#endif
+	     Ublock_info[j].iukp = iukp;
+	     Ublock_info[j].rukp = rukp;
+	     Ublock_info[j].jb = jb;
+
+	     /* if ( iam==0 )
+		 printf("j %d: Ublock_info[j].iukp %d, Ublock_info[j].rukp %d,"
+			"Ublock_info[j].jb %d, nsupc %d\n",
+			j, Ublock_info[j].iukp, Ublock_info[j].rukp,
+			Ublock_info[j].jb, nsupc); */
+
+	     /* Prepare to call GEMM. */
+	     jj = iukp;
+	     for (; jj < iukp+nsupc; ++jj) {
+		 segsize = klst - usub[jj];
+		 if ( segsize ) {
+                    ++temp_ncols;
+                    if ( segsize > ldu ) ldu = segsize;
+		 }
+	     }
+
+	     Ublock_info[j].full_u_cols = temp_ncols;
+	     ncols += temp_ncols;
+#if 0 // Sherry comment out 5/31/2018
+	     /* Jump number of nonzeros in block U(k,jj);
+		Move to block U(k,j+1) in nzval[] array.  */
+	     rukp += usub[iukp - 1];
+	     iukp += nsupc;
+#endif
+         } /* end for j ... compute ldu & ncols */
+
+	 /* Now doing prefix sum on full_u_cols.
+	  * After this, full_u_cols is the number of nonzero columns
+          * from block 0 to block j.
+          */
+	 for ( j = jj0+1; j < nub; ++j) {
+	     Ublock_info[j].full_u_cols += Ublock_info[j-1].full_u_cols;
+	 }
+
+	 /* Padding zeros to make {m,n,k} multiple of vector length. */
+	 jj = 8; //n;
+	 if (gemm_padding > 0 && Rnbrow > jj && ncols > jj && ldu > jj) {
+	     gemm_m_pad = Rnbrow + (Rnbrow % GEMM_PADLEN);
+	     gemm_n_pad = ncols + (ncols % GEMM_PADLEN);
+	     //gemm_n_pad = ncols;
+	     //gemm_k_pad = ldu + (ldu % GEMM_PADLEN);
+	     gemm_k_pad = ldu;
+
+	     for (i = Rnbrow; i < gemm_m_pad; ++i)  // padding A matrix
+		 for (j = 0; j < gemm_k_pad; ++j)
+		     Remain_L_buff[i + j*gemm_m_pad] = zero;
+	     for (i = 0; i < Rnbrow; ++i)
+		 for (j = ldu; j < gemm_k_pad; ++j)
+		     Remain_L_buff[i + j*gemm_m_pad] = zero;
+	     for (i = ldu; i < gemm_k_pad; ++i)     // padding B matrix
+		 for (j = 0; j < gemm_n_pad; ++j)
+		     bigU[i + j*gemm_k_pad] = zero;
+	     for (i = 0; i < ldu; ++i)
+		 for (j = ncols; j < gemm_n_pad; ++j)
+		     bigU[i + j*gemm_k_pad] = zero;
+	 } else {
+	     gemm_m_pad = Rnbrow;
+	     gemm_n_pad = ncols;
+	     gemm_k_pad = ldu;
+	 }
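+	 /* Note: m + (m % GEMM_PADLEN) enlarges m but does not necessarily
+	  * round it up to a multiple of GEMM_PADLEN (e.g. 10 + 10%8 = 12);
+	  * either way, the zero-filling above guarantees the padded fringe
+	  * of Remain_L_buff[] and bigU[] cannot change the GEMM result. */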
+
+	 tempu = bigU; /* buffer the entire row block U(k,:) */
+
+         /* Gather U(k,:) into buffer bigU[] to prepare for GEMM */
+#ifdef _OPENMP
+#pragma omp parallel for firstprivate(iukp, rukp) \
+    private(j,tempu, jb, nsupc,ljb,segsize, lead_zero, jj, i) \
+    default (shared) schedule(SCHEDULE_STRATEGY)
+#endif
+        for (j = jj0; j < nub; ++j) { /* jj0 starts after look-ahead window. */
+
+            if (j==jj0) tempu = bigU;
+            //else tempu = bigU + ldu * Ublock_info[j-1].full_u_cols;
+            else tempu = bigU + gemm_k_pad * Ublock_info[j-1].full_u_cols;
+
+            /* == processing each of the remaining columns in parallel == */
+#if 0
+	    /* Can remove following call, since search was already done.  */
+            arrive_at_ublock(j, &iukp, &rukp, &jb, &ljb, &nsupc,
+			     iukp0, rukp0, usub,perm_u, xsup, grid);
+#else
+	    iukp = Ublock_info[j].iukp;
+	    rukp = Ublock_info[j].rukp;
+	    jb = Ublock_info[j].jb;
+	    nsupc = SuperSize (jb );
+#endif
+            /* Copy from U(k,j) to tempu[], padding zeros.  */
+            for (jj = iukp; jj < iukp+nsupc; ++jj) {
+                segsize = klst - usub[jj];
+                if ( segsize ) {
+                    lead_zero = ldu - segsize;
+                    for (i = 0; i < lead_zero; ++i) tempu[i] = zero;
+		    //tempu += lead_zero;
+#if (_OPENMP>=201307)
+#pragma omp simd
+#endif
+		    for (i = 0; i < segsize; ++i)
+                    	tempu[i+lead_zero] = uval[rukp+i];
+                    rukp += segsize;
+                    tempu += gemm_k_pad;
+                }
+	    }
+        }   /* parallel for j = jj0 .. nub */
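+        /* Each nonzero column of U(k,j) now sits in bigU[] as a dense column
+         * with stride gemm_k_pad: lead_zero zeros followed by the segsize
+         * values copied from uval[]; e.g., for ldu = 4 and segsize = 2 a
+         * column is packed as [0, 0, u1, u2]. */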
+
+#if 0
+	if (ldu==0) printf("[%d] .. k0 %d, before updating: ldu %d, Lnbrow %d, Rnbrow %d, ncols %d\n",iam,k0,ldu,Lnbrow,Rnbrow, ncols);
+	fflush(stdout);
+#endif
+
+        GatherMOP += 2*ldu*ncols;
+
+    }  /* end if (nbrow>0), end gather U blocks */
+
+    GatherUTimer += SuperLU_timer_() - tt_start;
+    int jj_cpu = nub;       /* limit between CPU and GPU */
+    int thread_id;
+    /*tempv = bigV;*/
+
+    /**********************
+     * Gather L blocks    *
+     **********************/
+     tt_start = SuperLU_timer_();
+
+     /* Loop through the look-ahead blocks to copy Lval into the buffer */
+#ifdef _OPENMP
+#pragma omp parallel for private(j,jj,tempu,tempv) default (shared)
+#endif
+     for (i = 0; i < lookAheadBlk; ++i) {
+	 int StRowDest, temp_nbrow;
+	 if ( i==0 ) {
+	     StRowDest = 0;
+	     temp_nbrow = lookAheadFullRow[0];
+	 } else {
+	     StRowDest   = lookAheadFullRow[i-1];
+	     temp_nbrow  = lookAheadFullRow[i]-lookAheadFullRow[i-1];
+	 }
+
+	 int StRowSource = lookAheadStRow[i];
+
+	 /* Now copying one block into L lookahead buffer */
+	 /* #pragma omp parallel for (gives slow down) */
+	 // for (int j = 0; j < knsupc; ++j) {
+	 for (j = knsupc-ldu; j < knsupc; ++j) { /* skip leading columns
+						    corresponding to zero U rows */
+#if 1
+	     /* Better let compiler generate memcpy or vectorized code. */
+	     //tempu = &lookAhead_L_buff[StRowDest + j*LDlookAhead_LBuff];
+	     //tempu = &lookAhead_L_buff[StRowDest + j * Lnbrow];
+	     tempu = &lookAhead_L_buff[StRowDest + (j - (knsupc-ldu)) * Lnbrow];
+	     tempv = &lusup[luptr+j*nsupr + StRowSource];
+#if (_OPENMP>=201307)
+#pragma omp simd
+#endif
+	     for (jj = 0; jj < temp_nbrow; ++jj) tempu[jj] = tempv[jj];
+#else
+	     //memcpy(&lookAhead_L_buff[StRowDest + j*LDlookAhead_LBuff],
+	     memcpy(&lookAhead_L_buff[StRowDest + (j - (knsupc-ldu)) * Lnbrow],
+		    &lusup[luptr+j*nsupr + StRowSource],
+		    temp_nbrow * sizeof(float) );
+#endif
+	 } /* end for j ... */
+     } /* parallel for i ... gather Lval blocks from lookahead window */
+
+     /* Loop through the remaining blocks to copy Lval into the buffer */
+#ifdef _OPENMP
+#pragma omp parallel for private(i,j,jj,tempu,tempv) default (shared) \
+    schedule(SCHEDULE_STRATEGY)
+#endif
+     for (int i = 0; i < RemainBlk; ++i) {
+         int StRowDest, temp_nbrow;
+         if ( i==0 )  {
+	     StRowDest  = 0;
+	     temp_nbrow = Remain_info[0].FullRow;
+	 } else  {
+	     StRowDest   = Remain_info[i-1].FullRow;
+	     temp_nbrow  = Remain_info[i].FullRow - Remain_info[i-1].FullRow;
+	 }
+
+	 int StRowSource = RemainStRow[i];
+
+	 /* Now copying a block into L remaining buffer */
+	 // #pragma omp parallel for (gives slow down)
+	 // for (int j = 0; j < knsupc; ++j) {
+	 for (int j = knsupc-ldu; j < knsupc; ++j) {
+	     // printf("StRowDest %d Rnbrow %d StRowSource %d \n", StRowDest,Rnbrow ,StRowSource);
+#if 1
+	     /* Better let compiler generate memcpy or vectorized code. */
+	     //tempu = &Remain_L_buff[StRowDest + j*LDRemain_LBuff];
+	     //tempu = &Remain_L_buff[StRowDest + (j - (knsupc-ldu)) * Rnbrow];
+	     tempu = &Remain_L_buff[StRowDest + (j - (knsupc-ldu)) * gemm_m_pad];
+	     tempv = &lusup[luptr + j*nsupr + StRowSource];
+#if (_OPENMP>=201307)
+#pragma omp simd
+#endif
+	     for (jj = 0; jj < temp_nbrow; ++jj) tempu[jj] = tempv[jj];
+#else
+	     //memcpy(&Remain_L_buff[StRowDest + j*LDRemain_LBuff],
+	     memcpy(&Remain_L_buff[StRowDest + (j - (knsupc-ldu)) * gemm_m_pad],
+		    &lusup[luptr+j*nsupr + StRowSource],
+                    temp_nbrow * sizeof(float) );
+#endif
+	 } /* end for j ... */
+     } /* parallel for i ... copy Lval into the remaining buffer */
+
+     tt_end = SuperLU_timer_();
+     GatherLTimer += tt_end - tt_start;
+
+
+     /*************************************************************************
+      * Perform GEMM (look-ahead L part, and remain L part) followed by Scatter
+      *************************************************************************/
+     tempu = bigU;  /* setting to the start of padded U(k,:) */
+
+     if ( Lnbrow>0 && ldu>0 && ncols>0 ) { /* Both L(:,k) and U(k,:) nonempty */
+	 /***************************************************************
+	  * Updating blocks in look-ahead window of the LU(look-ahead-rows,:)
+	  ***************************************************************/
+
+	 /* Count flops for total GEMM calls */
+	 ncols = Ublock_info[nub-1].full_u_cols;
+ 	 flops_t flps = 2.0 * (flops_t)Lnbrow * ldu * ncols;
+	 LookAheadScatterMOP += 3 * Lnbrow * ncols; /* scatter-add */
+	 schur_flop_counter += flps;
+	 stat->ops[FACT]    += flps;
+	 LookAheadGEMMFlOp  += flps;
+
+#ifdef _OPENMP
+#pragma omp parallel default (shared) private(thread_id)
+	 {
+#ifdef _OPENMP	 
+	   thread_id = omp_get_thread_num();
+#else	   
+	   thread_id = 0;
+#endif
+
+	   /* Ideally, should organize the loop as:
+	      for (j = 0; j < nub; ++j) {
+	          for (lb = 0; lb < lookAheadBlk; ++lb) {
+	               L(lb,k) X U(k,j) -> tempv[]
+		  }
+	      }
+	      But now, we use collapsed loop to achieve more parallelism.
+	      Total number of block updates is:
+	      (# of lookAheadBlk in L(:,k)) X (# of blocks in U(k,:))
+	   */
+
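+	   /* The collapsed index decomposes as
+	    *     ij = (j - jj0) * lookAheadBlk + lb,
+	    * recovered below via lb = ij % lookAheadBlk and
+	    * j = ij / lookAheadBlk + jj0; e.g., with lookAheadBlk = 3 and
+	    * jj0 = 2, ij = 7 gives lb = 1 and j = 4. */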
+	   int i = sizeof(int);
+	   int* indirect_thread    = indirect + (ldt + CACHELINE/i) * thread_id;
+	   int* indirect2_thread   = indirect2 + (ldt + CACHELINE/i) * thread_id;
+
+#pragma omp for \
+    private (nsupc,ljb,lptr,ib,temp_nbrow,cum_nrow)	\
+    schedule(dynamic)
+#else /* not use _OPENMP */
+	   thread_id = 0;
+	   int* indirect_thread    = indirect;
+	   int* indirect2_thread   = indirect2;
+#endif
+	   /* Each thread is assigned one loop index ij, responsible for
+	      block update L(lb,k) * U(k,j) -> tempv[]. */
+	   for (int ij = 0; ij < lookAheadBlk*(nub-jj0); ++ij) {
+	       /* jj0 starts after look-ahead window. */
+            int j   = ij/lookAheadBlk + jj0;
+            int lb  = ij%lookAheadBlk;
+
+            /* Getting U block U(k,j) information */
+            /* unsigned long long ut_start, ut_end; */
+            int_t rukp =  Ublock_info[j].rukp;
+            int_t iukp =  Ublock_info[j].iukp;
+            int jb   =  Ublock_info[j].jb;
+            int nsupc = SuperSize(jb);
+            int ljb = LBj (jb, grid);  /* destination column block */
+            int st_col;
+            int ncols;  /* Local variable counts only columns in the block */
+            if ( j > jj0 ) { /* jj0 starts after look-ahead window. */
+                ncols  = Ublock_info[j].full_u_cols-Ublock_info[j-1].full_u_cols;
+                st_col = Ublock_info[j-1].full_u_cols;
+            } else {
+                ncols  = Ublock_info[j].full_u_cols;
+                st_col = 0;
+            }
+
+            /* Getting L block L(i,k) information */
+            int_t lptr = lookAhead_lptr[lb];
+            int ib   = lookAhead_ib[lb];
+            int temp_nbrow = lsub[lptr+1];
+            lptr += LB_DESCRIPTOR;
+            int cum_nrow = (lb==0 ? 0 : lookAheadFullRow[lb-1]);
+
+	    /* Block-by-block GEMM in look-ahead window */
+#if 0
+	    i = sizeof(float);
+	    float* tempv1 = bigV + thread_id * (ldt*ldt + CACHELINE/i);
+#else
+	    float* tempv1 = bigV + thread_id * (ldt*ldt);
+#endif
+
+#if ( PRNTlevel>= 1)
+	    if (thread_id == 0) tt_start = SuperLU_timer_();
+	    gemm_max_m = SUPERLU_MAX(gemm_max_m, temp_nbrow);
+	    gemm_max_n = SUPERLU_MAX(gemm_max_n, ncols);
+	    gemm_max_k = SUPERLU_MAX(gemm_max_k, ldu);
+#endif
+
+#if defined (USE_VENDOR_BLAS)
+            sgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
+		   //&lookAhead_L_buff[(knsupc-ldu)*Lnbrow+cum_nrow], &Lnbrow,
+		   &lookAhead_L_buff[cum_nrow], &Lnbrow,
+		   &tempu[st_col*ldu], &ldu, &beta, tempv1, &temp_nbrow, 1, 1);
+#else
+            sgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
+		   //&lookAhead_L_buff[(knsupc-ldu)*Lnbrow+cum_nrow], &Lnbrow,
+		   &lookAhead_L_buff[cum_nrow], &Lnbrow,
+		   &tempu[st_col*ldu], &ldu, &beta, tempv1, &temp_nbrow);
+#endif
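+	    /* Note: with USE_VENDOR_BLAS the extra trailing "1, 1" arguments
+	       pass the lengths of the two "N" character arguments, as some
+	       Fortran BLAS calling conventions expect. */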
+
+#if (PRNTlevel>=1 )
+	    if (thread_id == 0) {
+		tt_end = SuperLU_timer_();
+		LookAheadGEMMTimer += tt_end - tt_start;
+		tt_start = tt_end;
+	    }
+#endif
+            if ( ib < jb ) {
+                sscatter_u (
+				 ib, jb,
+				 nsupc, iukp, xsup,
+				 klst, temp_nbrow,
+				 lptr, temp_nbrow, lsub,
+				 usub, tempv1,
+				 Ufstnz_br_ptr, Unzval_br_ptr,
+				 grid
+			        );
+            } else {
+#if 0
+		//#ifdef USE_VTUNE
+	    __SSC_MARK(0x111);// start SDE tracing, note uses 2 underscores
+	    __itt_resume(); // start VTune, again use 2 underscores
+#endif
+                sscatter_l (
+				 ib, ljb,
+				 nsupc, iukp, xsup,
+ 				 klst, temp_nbrow,
+				 lptr, temp_nbrow,
+				 usub, lsub, tempv1,
+				 indirect_thread, indirect2_thread,
+				 Lrowind_bc_ptr, Lnzval_bc_ptr,
+				 grid
+				);
+#if 0
+		//#ifdef USE_VTUNE
+		__itt_pause(); // stop VTune
+		__SSC_MARK(0x222); // stop SDE tracing
+#endif
+            }
+
+#if ( PRNTlevel>=1 )
+	    if (thread_id == 0)
+		LookAheadScatterTimer += SuperLU_timer_() - tt_start;
+#endif
+	   } /* end omp for ij = ... */
+
+#ifdef _OPENMP
+	 } /* end omp parallel */
+#endif
+     } /* end if Lnbrow>0 ... look-ahead GEMM and scatter */
+
+    /***************************************************************
+     * Updating remaining rows and columns on CPU.
+     ***************************************************************/
+    ncols = jj_cpu==0 ? 0 : Ublock_info[jj_cpu-1].full_u_cols;
+
+    if ( Rnbrow>0 && ldu>0 ) { /* There are still blocks remaining ... */
+	double flps = 2.0 * (double)Rnbrow * ldu * ncols;
+	schur_flop_counter  += flps;
+	stat->ops[FACT]     += flps;
+
+#if ( PRNTlevel>=1 )
+	RemainGEMM_flops += flps;
+	gemm_max_m = SUPERLU_MAX(gemm_max_m, Rnbrow);
+	gemm_max_n = SUPERLU_MAX(gemm_max_n, ncols);
+	gemm_max_k = SUPERLU_MAX(gemm_max_k, ldu);
+	tt_start = SuperLU_timer_();
+	/* printf("[%d] .. k0 %d, before large GEMM: %d-%d-%d, RemainBlk %d\n",
+	   iam, k0,Rnbrow,ldu,ncols,RemainBlk);  fflush(stdout);
+	assert( Rnbrow*ncols < bigv_size ); */
+#endif
+	/* calling aggregated large GEMM, result stored in bigV[]. */
+#if defined (USE_VENDOR_BLAS)
+	//sgemm_("N", "N", &Rnbrow, &ncols, &ldu, &alpha,
+	sgemm_("N", "N", &gemm_m_pad, &gemm_n_pad, &gemm_k_pad, &alpha,
+	       //&Remain_L_buff[(knsupc-ldu)*Rnbrow], &Rnbrow,
+	       &Remain_L_buff[0], &gemm_m_pad,
+	       &bigU[0], &gemm_k_pad, &beta, bigV, &gemm_m_pad, 1, 1);
+#else
+	//sgemm_("N", "N", &Rnbrow, &ncols, &ldu, &alpha,
+	sgemm_("N", "N", &gemm_m_pad, &gemm_n_pad, &gemm_k_pad, &alpha,
+	       //&Remain_L_buff[(knsupc-ldu)*Rnbrow], &Rnbrow,
+	       &Remain_L_buff[0], &gemm_m_pad,
+	       &bigU[0], &gemm_k_pad, &beta, bigV, &gemm_m_pad);
+#endif
+
+#if ( PRNTlevel>=1 )
+	tt_end = SuperLU_timer_();
+	RemainGEMMTimer += tt_end - tt_start;
+#if ( PROFlevel>=1 )
+	//fprintf(fgemm, "%8d%8d%8d %16.8e\n", Rnbrow, ncols, ldu,
+	// (tt_end - tt_start)*1e6); // time in microsecond
+	//fflush(fgemm);
+	gemm_stats[gemm_count].m = Rnbrow;
+	gemm_stats[gemm_count].n = ncols;
+	gemm_stats[gemm_count].k = ldu;
+	gemm_stats[gemm_count++].microseconds = (tt_end - tt_start) * 1e6;
+#endif
+	tt_start = SuperLU_timer_();
+#endif
+
+#ifdef USE_VTUNE
+	__SSC_MARK(0x111);// start SDE tracing, note uses 2 underscores
+	__itt_resume(); // start VTune, again use 2 underscores
+#endif
+
+	/* Scatter into destination block-by-block. */
+#ifdef _OPENMP
+#pragma omp parallel default(shared) private(thread_id)
+	{
+#ifdef _OPENMP	
+	    thread_id = omp_get_thread_num();
+#else	    
+	    thread_id = 0;
+#endif
+
+	    /* Ideally, should organize the loop as:
+               for (j = 0; j < jj_cpu; ++j) {
+	           for (lb = 0; lb < RemainBlk; ++lb) {
+	               L(lb,k) X U(k,j) -> tempv[]
+                   }
+               }
+	       But now, we use collapsed loop to achieve more parallelism.
+	       Total number of block updates is:
+	       (# of RemainBlk in L(:,k)) X (# of blocks in U(k,:))
+	    */
+
+	    int i = sizeof(int);
+	    int* indirect_thread = indirect + (ldt + CACHELINE/i) * thread_id;
+	    int* indirect2_thread = indirect2 + (ldt + CACHELINE/i) * thread_id;
+
+#pragma omp for \
+    private (j,lb,rukp,iukp,jb,nsupc,ljb,lptr,ib,temp_nbrow,cum_nrow)	\
+    schedule(dynamic)
+#else /* not use _OPENMP */
+	    thread_id = 0;
+	    int* indirect_thread = indirect;
+	    int* indirect2_thread = indirect2;
+#endif
+	    /* Each thread is assigned one loop index ij, responsible for
+	       block update L(lb,k) * U(k,j) -> tempv[]. */
+	    for (int ij = 0; ij < RemainBlk*(jj_cpu-jj0); ++ij) {
+		/* jj_cpu := nub, jj0 starts after look-ahead window. */
+		int j   = ij / RemainBlk + jj0; /* j-th block in U panel */
+		int lb  = ij % RemainBlk;       /* lb-th block in L panel */
+
+		/* Getting U block U(k,j) information */
+		/* unsigned long long ut_start, ut_end; */
+		int_t rukp =  Ublock_info[j].rukp;
+		int_t iukp =  Ublock_info[j].iukp;
+		int jb   =  Ublock_info[j].jb;
+		int nsupc = SuperSize(jb);
+		int ljb = LBj (jb, grid);
+		int st_col;
+		int ncols;
+		if ( j>jj0 ) {
+		    ncols = Ublock_info[j].full_u_cols - Ublock_info[j-1].full_u_cols;
+		    st_col = Ublock_info[j-1].full_u_cols;
+		} else {
+		    ncols = Ublock_info[j].full_u_cols;
+		    st_col = 0;
+		}
+
+		/* Getting L block L(i,k) information */
+		int_t lptr = Remain_info[lb].lptr;
+		int ib   = Remain_info[lb].ib;
+		int temp_nbrow = lsub[lptr+1];
+		lptr += LB_DESCRIPTOR;
+		int cum_nrow = (lb==0 ? 0 : Remain_info[lb-1].FullRow);
+
+		/* tempv1 points to block(i,j) in bigV : LDA == Rnbrow */
+		//double* tempv1 = bigV + (st_col * Rnbrow + cum_nrow); Sherry
+		float* tempv1 = bigV + (st_col * gemm_m_pad + cum_nrow); /* Sherry */
+
+		// printf("[%d] .. before scatter: ib %d, jb %d, temp_nbrow %d, Rnbrow %d\n", iam, ib, jb, temp_nbrow, Rnbrow); fflush(stdout);
+
+		/* Now scattering the block */
+
+		if ( ib < jb ) {
+		    sscatter_u (
+				ib, jb,
+				nsupc, iukp, xsup,
+				//klst, Rnbrow, /*** klst, temp_nbrow, Sherry */
+				klst, gemm_m_pad, /*** klst, temp_nbrow, Sherry */
+				lptr, temp_nbrow, /* row dimension of the block */
+				lsub, usub, tempv1,
+				Ufstnz_br_ptr, Unzval_br_ptr,
+				grid
+				);
+		} else {
+		    sscatter_l(
+			       ib, ljb,
+			       nsupc, iukp, xsup,
+			       //klst, temp_nbrow, Sherry
+			       klst, gemm_m_pad, /*** temp_nbrow, Sherry */
+			       lptr, temp_nbrow, /* row dimension of the block */
+			       usub, lsub, tempv1,
+			       indirect_thread, indirect2_thread,
+			       Lrowind_bc_ptr,Lnzval_bc_ptr,
+			       grid
+			       );
+		}
+
+	    } /* end omp for (int ij =...) */
+
+#ifdef _OPENMP
+	} /* end omp parallel region */
+#endif
+
+#if ( PRNTlevel>=1 )
+	RemainScatterTimer += SuperLU_timer_() - tt_start;
+#endif
+
+#ifdef USE_VTUNE
+	__itt_pause(); // stop VTune
+	__SSC_MARK(0x222); // stop SDE tracing
+#endif
+
+    } /* end if Rnbrow>0 ... update remaining block */
+
+}  /* end if L(:,k) and U(k,:) are not empty */
diff --git a/SRC/sSchCompUdt-cuda.c b/SRC/sSchCompUdt-cuda.c
new file mode 100644
index 00000000..0fb95e9a
--- /dev/null
+++ b/SRC/sSchCompUdt-cuda.c
@@ -0,0 +1,589 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief This file contains the main loop of psgstrf which involves
+ *        rank k update of the Schur complement.
+ *        Uses CUDA GPU.
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 1, 2014
+ * </pre>
+ */
+
+#define SCHEDULE_STRATEGY dynamic
+
+#define cublasCheckErrors(fn) \
+    do { \
+        cublasStatus_t __err = fn; \
+        if (__err != CUBLAS_STATUS_SUCCESS) { \
+            fprintf(stderr, "Fatal cublas error: %d (at %s:%d)\n", \
+                (int)(__err), \
+                __FILE__, __LINE__); \
+            fprintf(stderr, "*** FAILED - ABORTING\n"); \
+            exit(1); \
+        } \
+    } while(0);
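+/* Usage note: the do { ... } while(0) wrapper makes the macro expand to one
+ * complete statement (it already ends in ';'), e.g.
+ *     cublasCheckErrors(cublasSetStream(handle[0], streams[0]));
+ * where the call-site ';' is then simply an empty statement. */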
+
+int full;
+double gemm_timer = 0.0;
+double scatter_timer = 0.0;
+
+if ( msg0 && msg2 ) {  /* L(:,k) and U(k,:) are not empty. */
+    ldu   =0;
+    full  =1;
+    int cum_nrow;
+    int temp_nbrow;
+
+    lptr = lptr0;
+    luptr = luptr0;
+
+    nbrow= lsub[1];
+    if (myrow==krow) nbrow = lsub[1]-lsub[3];
+
+    if (nbrow>0) {
+
+        // Maximum number of columns that can fit in dC[buffer_size] on GPU 
+#if 0   // max_ldu can be < ldt, so bigu_size/ldt may be smaller, giving false alarm
+        int ncol_max = SUPERLU_MIN(buffer_size/nbrow,bigu_size/ldt);
+#else // Sherry fix
+        int ncol_max = SUPERLU_MIN(buffer_size/nbrow, max_ncols);
+#endif
+	
+        int num_streams_used, /* number of streams that will be used*/
+        ncpu_blks;            /* the leading number of CPU dgemm blks
+			         in each partition */
+        int jjj, jjj_st,jjj_global;
+        for (j = jj0; j < nub; ++j) {
+            arrive_at_ublock( j,&iukp,&rukp,&jb,&ljb,&nsupc,
+	    		      iukp0,rukp0,usub,perm_u,xsup,grid );
+
+            ncols =0 ;  //initialize at 0
+            jj = iukp;
+            int temp_ldu=0;
+            for (; jj < iukp+nsupc; ++jj) {
+                segsize = klst - usub[jj];
+                if ( segsize ) {
+		    ++ncols;
+		}
+                temp_ldu = SUPERLU_MAX(temp_ldu, segsize);
+            }
+
+            full_u_cols[j] = ncols;
+            blk_ldu[j] = temp_ldu;
+        } /* end for j = jj0..nub */
+
+        jjj = jj0; /* jj0 is the first block column after look-ahead window */
+
+        // #pragma omp barrier
+        while ( jjj < nub ) {
+            jjj_st=jjj;
+#ifdef _OPENMP
+#pragma omp single
+#endif
+            {
+                ldu = blk_ldu[jjj_st];
+                for (j = jjj_st; j < nub ; ++j) {
+
+                    /* prefix sum */
+                    if (j != jjj_st) full_u_cols[j] += full_u_cols[j-1];
+
+                    ldu = SUPERLU_MAX(ldu, blk_ldu[j]);
+
+                    /* break condition */
+                    /* the number of columns that can be processed on GPU is
+		       limited by buffer size */
+                    if (full_u_cols[j]+((j+1==nub)?0:full_u_cols[j+1]) > ncol_max) {
+                        break; /* block column j+1 does not fit in GPU memory */
+                    }
+                } /* end for j=jjj_st to nub */
+
+                jjj_global = SUPERLU_MIN(nub, j+1); /* Maximum value of jjj < nub */
+
+                // TAU_STATIC_TIMER_START("work_divison");
+                /* Divide CPU-GPU gemm here.
+		 * If there is only one block, we leave it on CPU.
+		 */
+                gemm_division_cpu_gpu(
+		       &num_streams_used,/*number of streams that will be used*/
+		       stream_end_col,   /*array holding last column blk for each partition*/
+		       &ncpu_blks,       /*number of CPU gemm blks*/
+		       		// Following are inputs
+		       nbrow,            /*number of rows in A matrix*/
+		       ldu,              /*value of k in dgemm*/
+		       nstreams,
+		       full_u_cols + jjj_st, /*array containing prefix sum of GPU workload*/
+		       jjj_global - jjj_st /*number of block columns on GPU.
+		       		             If only one block, leave it on CPU*/
+                );
+                // TAU_STATIC_TIMER_STOP("work_divison");
+
+            } /* pragma omp single */
+
+            jjj = jjj_global; /* Move to the next [ CPU : GPU ] partition */
+	    
+#if 0 // !!Sherry: this test is not necessary
+	    // if jjj_global - jjj_st == 1, everything is on CPU.
+	    // bigv_size is calculated sufficiently large.
+            if (jjj == jjj_st+1 && full_u_cols[jjj_st] > ncol_max) {
+                printf("allocate more memory for buffer !!!!\n"
+		       ".. jjj_st %d, nbrow %d, full_u_cols[jjj_st] %d, ncol_max %d\n",
+		       jjj_st, nbrow, full_u_cols[jjj_st], ncol_max);
+                if(nbrow * full_u_cols[jjj_st] > buffer_size)
+                    printf("[%d] needed %d > buffer_size %d\n",iam,nbrow*full_u_cols[jjj_st],buffer_size );
+		fflush(stdout);
+            }
+#endif
+
+            // #pragma omp barrier
+            /* gathering circuit */
+            assert(jjj_st < jjj);
+            /* (The U segments of block columns jjj_st .. jjj-1 are gathered
+               into bigU, which serves as the B operand of the GEMMs below.) */
+
+	    if ( num_streams_used > 0 ) {
+#ifdef PI_DEBUG
+		printf("nbrow %d *ldu %d  =%d < ldt %d * max_row_size %d =%d \n",nbrow,ldu,nbrow*ldu,ldt,max_row_size,ldt*max_row_size ); fflush(stdout);
+		assert(nbrow*ldu<=ldt*max_row_size);
+#endif
+		cudaMemcpy2DAsync(dA, nbrow*sizeof(float),
+				  &lusup[luptr+(knsupc-ldu)*nsupr],
+				  nsupr*sizeof(float), nbrow*sizeof(float),
+				  ldu, cudaMemcpyHostToDevice, streams[0]);
+	    }
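+	    /* The 2-D copy above moves an nbrow x ldu column-major submatrix
+	       of the L panel to the GPU: destination pitch nbrow*sizeof(float)
+	       (packed), source pitch nsupr*sizeof(float) (panel LDA), with
+	       nbrow*sizeof(float) bytes per column and ldu columns. */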
+
+	    for (int i = 0; i < num_streams_used; ++i) { // streams on GPU
+		int st = (i==0) ? ncpu_blks+jjj_st : jjj_st+stream_end_col[i-1];
+		// st starts after the leading ncpu_blks
+		int st_col = full_u_cols[st-1];
+		int num_col_stream = full_u_cols[jjj_st+stream_end_col[i]-1]-full_u_cols[st-1];
+		tempu = bigU;
+
+		float *tempv1 = bigV + full_u_cols[st-1]*nbrow;
+
+		/* Skip streams that were assigned no columns. */
+		if ( num_col_stream > 0 ) {
+#ifdef GPU_ACC
+		    int stream_id = i;
+		    int b_offset  = ldu * st_col;
+		    int c_offset  = st_col * nbrow;
+		    size_t B_stream_size = ldu * num_col_stream * sizeof(float);
+		    size_t C_stream_size = nbrow * num_col_stream * sizeof(float);
+
+		    assert(nbrow*(st_col+num_col_stream) < buffer_size);
+
+		    cudaMemcpyAsync(dB+b_offset, tempu+b_offset, B_stream_size,
+		    		    cudaMemcpyHostToDevice, streams[stream_id]);
+
+		    cublasCheckErrors(
+				  cublasSetStream(handle[stream_id],
+						  streams[stream_id])
+				     );
+
+		    cublasCheckErrors(
+				  cublasSgemm(handle[stream_id],
+					      CUBLAS_OP_N, CUBLAS_OP_N,
+					      nbrow, num_col_stream, ldu,
+                                              &alpha, dA, nbrow,
+					      &dB[b_offset], ldu,
+					      &beta, &dC[c_offset],
+                                              nbrow)
+				  );
+
+		    checkCuda( cudaMemcpyAsync(tempv1, dC+c_offset,
+					   C_stream_size,
+					   cudaMemcpyDeviceToHost,
+					   streams[stream_id]) );
+#else /*-- on CPU --*/
+
+	            my_sgemm_("N", "N", &nbrow, &num_col_stream, &ldu,
+			      &alpha, &lusup[luptr+(knsupc-ldu)*nsupr],
+			      &nsupr, tempu+ldu*st_col, &ldu, &beta,
+			      tempv1, &nbrow, 1, 1);
+#endif
+   	        } // end if num_col_stream > 0
+
+	    } /* end for i = 1 to num_streams used */
+
+	    /* Special case for CPU -- leading block columns are computed 
+	       on CPU in order to mask the GPU data transfer latency */
+	    int num_col = full_u_cols[jjj_st+ncpu_blks-1];
+	    int st_col = 0; /* leading part on CPU */
+	    tempv = bigV + nbrow * st_col;
+	    tempu = bigU;
+
+	    double tstart = SuperLU_timer_();
+#if defined (USE_VENDOR_BLAS)
+	    sgemm_("N", "N", &nbrow, &num_col, &ldu, &alpha,
+		  &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr,
+		  tempu+ldu*st_col, &ldu, &beta, tempv, &nbrow, 1, 1);
+#else
+	    sgemm_("N", "N", &nbrow, &num_col, &ldu, &alpha,
+		  &lusup[luptr+(knsupc-ldu)*nsupr], &nsupr,
+		  tempu+ldu*st_col, &ldu, &beta, tempv, &nbrow);
+#endif
+	    gemm_timer += SuperLU_timer_() -tstart;
+	    stat->ops[FACT] += 2 * nbrow * ldu * full_u_cols[jjj-1];
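+	    /* GEMM cost: 2*m*n*k flops with m = nbrow, k = ldu, and
+	       n = full_u_cols[jjj-1], the total number of columns (CPU and
+	       GPU partitions together) processed in this round. */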
+
+            /* Now scattering blocks computed by CPU */
+            int temp_ncol;
+
+            /* scatter leading blocks which the CPU has computed */
+            tstart = SuperLU_timer_();
+
+#ifdef _OPENMP
+#pragma omp parallel  \
+    private(j,iukp,rukp, tempu, tempv, cum_nrow, jb, nsupc,ljb,	\
+	    segsize,lead_zero,					\
+	    ib, temp_nbrow,ilst,lib,index,			\
+	    ijb,fnz,ucol,rel,ldv,lptrj,luptrj,			\
+	    nzval,     lb ,                     jj, i)		\
+    firstprivate(luptr,lptr) default (shared)
+#endif
+            {
+#ifdef _OPENMP	    
+                int thread_id = omp_get_thread_num();
+		int num_threads = omp_get_num_threads();
+#else
+                int thread_id = 0;
+		int num_threads = 1;
+#endif		
+
+                int* indirect_thread = indirect + ldt*thread_id;
+                int* indirect2_thread = indirect2 + ldt*thread_id;
+                float* tempv1;
+
+                if ( ncpu_blks < num_threads ) {
+                    // TAU_STATIC_TIMER_START("SPECIAL_CPU_SCATTER");
+
+                    for (j = jjj_st; j < jjj_st+ncpu_blks; ++j) {
+                        /* code */
+#ifdef PI_DEBUG
+			printf("scattering block column %d, jjj_st %d, jjj_st+ncpu_blks %d\n", j, jjj_st, jjj_st+ncpu_blks);
+#endif
+
+                        /* == processing each of the remaining columns == */
+
+                        if(j==jjj_st) tempv1 = bigV;
+                        else tempv1 = bigV + full_u_cols[j-1]*nbrow;
+
+                        arrive_at_ublock( j,&iukp,&rukp,&jb,&ljb,&nsupc,
+					  iukp0,rukp0,usub,perm_u,xsup,grid );
+
+                        cum_nrow =0 ;
+
+                        /* do update with the kth column of L and (k,j)th block of U */
+                        lptr = lptr0;
+                        luptr = luptr0;
+
+#ifdef _OPENMP
+#pragma omp for schedule( SCHEDULE_STRATEGY ) nowait
+#endif
+                        for (lb = 0; lb < nlb; lb++ ) {
+                            int cum_nrow = 0;
+                            int temp_nbrow;
+                            lptr = lptr0;
+                            luptr = luptr0;
+                            for (int i = 0; i < lb; ++i) {
+                                ib = lsub[lptr];        /* Row block L(i,k). */
+                                temp_nbrow = lsub[lptr+1];   /* Number of full rows. */
+                                lptr += LB_DESCRIPTOR;  /* Skip descriptor. */
+                                lptr += temp_nbrow;
+                                luptr += temp_nbrow;
+                                cum_nrow +=temp_nbrow;
+                            }
+
+                            ib = lsub[lptr];       /* Row block L(i,k). */
+                            temp_nbrow = lsub[lptr+1];  /* Number of full rows. */
+                            assert(temp_nbrow<=nbrow);
+
+                            lptr += LB_DESCRIPTOR; /* Skip descriptor. */
+
+                            /* Now gather the result into the destination block. */
+                            if ( ib < jb ) {  /* A(i,j) is in U. */
+                                #ifdef PI_DEBUG
+                                    printf("cpu scatter \n");
+                                    printf("A(%d,%d) goes to U block %d \n", ib,jb,ljb);
+                                #endif
+
+                                tempv = tempv1+cum_nrow;
+                                sscatter_u (
+						 ib,jb,
+						 nsupc,iukp,xsup,
+						 klst,nbrow,
+						 lptr,temp_nbrow,lsub,
+						 usub,tempv,
+						 Ufstnz_br_ptr,
+						 Unzval_br_ptr,
+						 grid
+						 );
+                            } else {    /* A(i,j) is in L. */
+#ifdef PI_DEBUG
+                                printf("cpu scatter \n");
+                                printf("A(%d,%d) goes to L block %d \n", ib,jb,ljb);
+#endif
+
+                                tempv = tempv1+cum_nrow;
+
+                                sscatter_l (
+						 ib, ljb,nsupc,iukp,xsup,klst,nbrow,lptr,
+						 temp_nbrow,usub,lsub,tempv,
+						 indirect_thread,indirect2_thread,
+						 Lrowind_bc_ptr,Lnzval_bc_ptr,grid
+						 );
+                            } /* if ib < jb ... */
+
+                            lptr += temp_nbrow;
+                            luptr += temp_nbrow;
+                            cum_nrow += temp_nbrow;
+
+                        } /* for lb ... */
+
+                        luptr=luptr0;
+                    } /* for j = jjj_st ... */
+
+                    // TAU_STATIC_TIMER_STOP("SPECIAL_CPU_SCATTER");
+                } else { // ncpu_blks >= omp_get_num_threads()
+#ifdef _OPENMP
+#pragma omp for schedule(SCHEDULE_STRATEGY) nowait
+#endif
+                    for (j = jjj_st; j < jjj_st+ncpu_blks; ++j) {
+                        /* code */
+#ifdef PI_DEBUG
+			printf("scattering block column %d\n",j);
+#endif
+
+                        /* == processing each of the remaining columns == */
+                        if(j==jjj_st) tempv1 = bigV;
+                        else tempv1 = bigV + full_u_cols[j-1]*nbrow;
+
+                        arrive_at_ublock( j,&iukp,&rukp,&jb,&ljb,&nsupc,
+					  iukp0,rukp0,usub,perm_u,xsup,grid );
+                        cum_nrow =0 ;
+
+                        /* do update with the kth column of L and (k,j)th block of U */
+                        lptr = lptr0;
+                        luptr = luptr0;
+
+                        for (lb = 0; lb < nlb; lb++ ) {
+                            ib = lsub[lptr];       /* Row block L(i,k). */
+                            temp_nbrow = lsub[lptr+1];  /* Number of full rows. */
+                            assert(temp_nbrow<=nbrow);
+
+                            lptr += LB_DESCRIPTOR; /* Skip descriptor. */
+#ifdef DGEMM_STAT
+			    if(j==jjj_st) {
+				temp_ncol = full_u_cols[j];
+			    } else {
+				temp_ncol = full_u_cols[j]- full_u_cols[j-1];
+			    }
+			    printf("%d %d %d \n",temp_nbrow, temp_ncol,ldu);
+#endif
+
+			    /* Now gather the result into the destination block. */
+			    if ( ib < jb ) {  /* A(i,j) is in U. */
+#ifdef PI_DEBUG
+				printf("cpu scatter \n");
+				printf("A(%d,%d) goes to U block %d \n", ib,jb,ljb);
+#endif
+
+				tempv = tempv1+cum_nrow;
+                                sscatter_u (
+						 ib,jb,
+						 nsupc,iukp,xsup,
+						 klst,nbrow,
+						 lptr,temp_nbrow,lsub,
+						 usub,tempv,
+						 Ufstnz_br_ptr,
+						 Unzval_br_ptr,
+						 grid
+						 );
+			    } else {    /* A(i,j) is in L. */
+#ifdef PI_DEBUG
+                                printf("cpu scatter \n");
+                                printf("A(%d,%d) goes to L block %d \n", ib,jb,ljb);
+#endif
+                                tempv = tempv1+cum_nrow;
+
+                                sscatter_l (
+						 ib, ljb,nsupc,iukp,xsup,klst,nbrow,lptr,
+						 temp_nbrow,usub,lsub,tempv,
+						 indirect_thread,indirect2_thread,
+						 Lrowind_bc_ptr,Lnzval_bc_ptr,grid
+						 );
+			    } /* if ib < jb ... */
+
+			    lptr += temp_nbrow;
+			    luptr += temp_nbrow;
+			    cum_nrow += temp_nbrow;
+
+			} /* for lb ... */
+
+			luptr=luptr0;
+		    } /* for j = jjj_st ... */
+		}     /* else (ncpu_blks >= omp_get_num_threads()) */
+	    }         /* parallel region */
+
+	    scatter_timer += SuperLU_timer_() - tstart;
+	    
+	    // Scatter tempv(:, (jjj_st1 : jjj_global)) computed on GPU.
+#ifdef _OPENMP
+#pragma omp parallel							\
+    private(j,iukp,rukp, tempu, tempv, cum_nrow, jb, nsupc,ljb,		\
+	    segsize,lead_zero,						\
+	    ib, temp_nbrow,ilst,lib,index,				\
+	    ijb,fnz,ucol,rel,ldv,lptrj,luptrj,				\
+	    nzval,     lb ,                     jj, i)			\
+    firstprivate(luptr,lptr) default (shared)
+#endif
+            {
+#ifdef _OPENMP	    
+                int thread_id = omp_get_thread_num();
+#else		
+                int thread_id = 0;
+#endif
+                int* indirect_thread = indirect + ldt*thread_id;
+                int* indirect2_thread = indirect2 + ldt*thread_id;
+                float* tempv1;
+                for(i = 0; i < num_streams_used; i++) { /* i is private variable */
+                    checkCuda(cudaStreamSynchronize (streams[i]));
+		    // jjj_st1 := first block column on GPU stream[i]
+		    int jjj_st1 = (i==0) ? jjj_st + ncpu_blks : jjj_st + stream_end_col[i-1];
+                    int jjj_end = jjj_st + stream_end_col[i];
+                    assert(jjj_end-1 < nub);
+                    assert(jjj_st1 > jjj_st);
+
+                    /* now scatter it */
+#pragma omp for schedule( SCHEDULE_STRATEGY ) nowait
+                    for (j = jjj_st1; j < jjj_end; ++j) {
+                        /* code */
+#ifdef PI_DEBUG
+			printf("scattering block column %d, jjj_end %d, nub %d\n",j,jjj_end,nub); fflush(stdout);
+#endif
+                        /* == processing each of the remaining columns == */
+
+                        if(j==jjj_st) tempv1 = bigV;
+                        else tempv1 = bigV + full_u_cols[j-1]*nbrow;
+
+                        arrive_at_ublock( j,&iukp,&rukp,&jb,&ljb,&nsupc,
+					  iukp0,rukp0,usub,perm_u,xsup,grid );
+                        cum_nrow =0 ;
+
+                        /* do update with the kth column of L and (k,j)th
+			   block of U */
+                        lptr = lptr0;
+                        luptr = luptr0;
+                        for (lb = 0; lb < nlb; lb++) {
+                            ib = lsub[lptr];       /* Row block L(i,k). */
+                            temp_nbrow = lsub[lptr+1];  /* Number of full rows. */
+                            assert(temp_nbrow<=nbrow);
+
+                            lptr += LB_DESCRIPTOR; /* Skip descriptor. */
+#ifdef DGEMM_STAT
+			    if(j==jjj_st) {
+				temp_ncol = full_u_cols[j];
+			    } else {
+				temp_ncol = full_u_cols[j]- full_u_cols[j-1];
+			    }
+			    printf("%d %d %d \n",temp_nbrow, temp_ncol,ldu);
+#endif
+
+                            /* Now scatter result into destination block. */
+                            if ( ib < jb ) { /* A(i,j) is in U. */
+#ifdef PI_DEBUG
+				printf("gpu scatter \n");
+				printf("A(%d,%d) goes to U block %d \n", ib,jb,ljb);
+				fflush(stdout);
+#endif
+                                tempv = tempv1+cum_nrow;
+                                sscatter_u (
+						 ib,jb,
+						 nsupc,iukp,xsup,
+						 klst,nbrow,
+						 lptr,temp_nbrow,lsub,
+						 usub,tempv,
+						 Ufstnz_br_ptr,
+						 Unzval_br_ptr,
+						 grid
+						 );
+                            } else {    /* A(i,j) is in L. */
+#ifdef PI_DEBUG
+                                printf("gpu scatter \n");
+                                printf("A(%d,%d) goes to L block %d \n", ib,jb,ljb);
+				fflush(stdout);
+#endif
+                                tempv = tempv1+cum_nrow;
+
+                                sscatter_l (
+						 ib, ljb,nsupc,iukp,xsup,klst,nbrow,lptr,
+						 temp_nbrow,usub,lsub,tempv,
+						 indirect_thread,indirect2_thread,
+						 Lrowind_bc_ptr,Lnzval_bc_ptr,grid
+						 );
+                            } /* if ib < jb ... */
+
+                            lptr += temp_nbrow;
+                            luptr += temp_nbrow;
+                            cum_nrow += temp_nbrow;
+
+                        } /* for lb ... */
+
+                        luptr=luptr0;
+                    } /* for j = jjj_st ... */
+
+                } /* end for i = 0 to nstreams */
+		
+                // TAU_STATIC_TIMER_STOP("GPU_SCATTER");
+                // TAU_STATIC_TIMER_STOP("INSIDE_OMP");
+		
+            } /* end pragma omp parallel */
+            // TAU_STATIC_TIMER_STOP("OUTSIDE_OMP");
+	    
+        }  /* end while (jjj < nub) */
+
+    }  /* end if nbrow > 0 */
+
+ }   /* end if msg0 && msg2 */
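+
+/* Minimal standalone sketch of the capacity rule used above, written with
+ * raw per-block column counts instead of the prefix sums kept in
+ * full_u_cols[]: a GPU round ends at the first block column whose cumulative
+ * column count would overflow the dC capacity ncol_max.  Names here are
+ * illustrative only.
+ */
+static int gpu_round_end(const int *ncols, int nblks, int ncol_max)
+{
+    int sum = 0;                        /* running number of columns */
+    for (int j = 0; j < nblks; ++j) {
+        if (sum + ncols[j] > ncol_max)  /* column block j no longer fits */
+            return j;
+        sum += ncols[j];
+    }
+    return nblks;                       /* everything fits in one round */
+}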
+
+
+
diff --git a/SRC/sbinary_io.c b/SRC/sbinary_io.c
new file mode 100644
index 00000000..b1a3c81a
--- /dev/null
+++ b/SRC/sbinary_io.c
@@ -0,0 +1,42 @@
+#include "superlu_sdefs.h"
+
+int
+sread_binary(FILE *fp, int_t *m, int_t *n, int_t *nnz, 
+	     float **nzval, int_t **rowind, int_t **colptr)
+{
+    size_t isize = sizeof(int_t), dsize = sizeof(float);
+    int nnz_read;
+    fread(n, isize, 1, fp);
+    fread(nnz, isize, 1, fp);
+    printf("fread n " IFMT "\tnnz " IFMT "\n", *n, *nnz);
+    *m = *n;
+    *colptr = intMalloc_dist(*n+1);
+    *rowind = intMalloc_dist(*nnz);
+    *nzval  = floatMalloc_dist(*nnz);
+    fread(*colptr, isize, (size_t) (*n + 1), fp);
+    fread(*rowind, isize, (size_t) *nnz, fp);
+    nnz_read = fread(*nzval, dsize, (size_t) (*nnz), fp);
+    printf("# of floats fread: %d\n", nnz_read);
+    fclose(fp);
+    return 0;
+}
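+
+/* Usage sketch (illustrative; error handling elided).  Note that
+ * sread_binary() closes fp itself:
+ *
+ *     int_t m, n, nnz, *rowind, *colptr;
+ *     float *nzval;
+ *     FILE *fp = fopen("matrix.bin", "rb");
+ *     if (fp) sread_binary(fp, &m, &n, &nnz, &nzval, &rowind, &colptr);
+ */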
+
+int
+swrite_binary(int_t n, int_t nnz,
+	      float *values, int_t *rowind, int_t *colptr)
+{       
+      FILE  *fp1;
+      int nnz_written;
+      size_t isize = sizeof(int_t), dsize = sizeof(float);
+      fp1 = fopen("matrix.bin", "wb");
+      fwrite(&n, isize, 1, fp1);
+      fwrite(&nnz, isize, 1, fp1);
+      fwrite(colptr, isize, n+1, fp1);
+      fwrite(rowind, isize, nnz, fp1);
+      nnz_written = fwrite(values, dsize, nnz, fp1);
+      printf("n " IFMT ", # of float: " IFMT "\n", n, nnz);
+      printf("dump binary file ... # of float fwrite: %d\n", nnz_written);
+      assert(nnz_written==nnz);
+      fclose(fp1);
+      return 0;
+}
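+
+/* On-disk layout produced above (integers are int_t, values are float):
+ *
+ *     n | nnz | colptr[n+1] | rowind[nnz] | nzval[nnz]
+ *
+ * sread_binary() assumes this same layout and a square matrix (m = n).
+ */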
diff --git a/SRC/scommunication_aux.c b/SRC/scommunication_aux.c
new file mode 100644
index 00000000..55e83851
--- /dev/null
+++ b/SRC/scommunication_aux.c
@@ -0,0 +1,504 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Communication routines.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ */
+#include "superlu_sdefs.h"
+#if 0
+#include "sec_structs.h"
+#include "communication_aux.h"
+#include "compiler.h"
+#endif
+
+int_t sIBcast_LPanel
+/*broadcasts index array lsub and non-zero value
+ array lusup of a newly factored L column to my process row*/
+(int_t k, int_t k0, int_t* lsub, float* lusup, gridinfo_t *grid,
+ int* msgcnt, MPI_Request *send_req, int **ToSendR, int_t *xsup,
+ int tag_ub)
+{
+    int_t Pc = grid->npcol;
+    int_t lk = LBj (k, grid);
+    superlu_scope_t *scp = &grid->rscp;  /* The scope of process row. */
+    if (lsub)
+    {
+        msgcnt[0] = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR;
+        msgcnt[1] = lsub[1] * SuperSize (k);
+    }
+    else
+    {
+        msgcnt[0] = msgcnt[1] = 0;
+    }
+
+    for (int_t pj = 0; pj < Pc; ++pj)
+    {
+        if (ToSendR[lk][pj] != EMPTY)
+        {
+
+
+            MPI_Isend (lsub, msgcnt[0], mpi_int_t, pj,
+                       SLU_MPI_TAG (0, k0) /* 0 */ ,
+                       scp->comm, &send_req[pj]);
+            MPI_Isend (lusup, msgcnt[1], MPI_FLOAT, pj,
+                       SLU_MPI_TAG (1, k0) /* 1 */ ,
+                       scp->comm, &send_req[pj + Pc]);
+
+        }
+    }
+
+    return 0;
+}
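+
+/* Caller-side sketch (illustrative): the requests posted above live at
+ * send_req[pj] and send_req[pj + Pc].  Assuming the array was initialized to
+ * MPI_REQUEST_NULL, all of them can be completed in one call before lsub and
+ * lusup are overwritten:
+ *
+ *     MPI_Waitall(2 * grid->npcol, send_req, MPI_STATUSES_IGNORE);
+ */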
+
+
+int_t sBcast_LPanel
+/*broadcasts index array lsub and non-zero value
+ array lusup of a newly factored L column to my process row*/
+(int_t k, int_t k0, int_t* lsub, float* lusup, gridinfo_t *grid,
+ int* msgcnt,  int **ToSendR, int_t *xsup , SCT_t* SCT,
+ int tag_ub)
+{
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    int_t Pc = grid->npcol;
+    int_t lk = LBj (k, grid);
+    superlu_scope_t *scp = &grid->rscp;  /* The scope of process row. */
+    if (lsub)
+    {
+        msgcnt[0] = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR;
+        msgcnt[1] = lsub[1] * SuperSize (k);
+    }
+    else
+    {
+        msgcnt[0] = msgcnt[1] = 0;
+    }
+
+    for (int_t pj = 0; pj < Pc; ++pj)
+    {
+        if (ToSendR[lk][pj] != EMPTY)
+        {
+
+
+            MPI_Send (lsub, msgcnt[0], mpi_int_t, pj,
+                       SLU_MPI_TAG (0, k0) /* 0 */ ,
+                       scp->comm);
+            MPI_Send (lusup, msgcnt[1], MPI_FLOAT, pj,
+                       SLU_MPI_TAG (1, k0) /* 1 */ ,
+                       scp->comm);
+
+        }
+    }
+    //SCT->Bcast_UPanel_tl += (double) ( _rdtsc() - t1);
+    SCT->Bcast_UPanel_tl += SuperLU_timer_() - t1;
+    return 0;
+}
+
+
+
+int_t sIBcast_UPanel
+/*asynchronously broadcasts U panel to my process row */
+(int_t k, int_t k0, int_t* usub, float* uval, gridinfo_t *grid,
+ int* msgcnt, MPI_Request *send_req_u, int *ToSendD, int tag_ub )
+{
+
+    int_t iam = grid->iam;
+    int_t lk = LBi (k, grid);
+    int_t Pr = grid->nprow;
+    int_t myrow = MYROW (iam, grid);
+    superlu_scope_t *scp = &grid->cscp; /* The scope of process col. */
+    if (usub)
+    {
+        msgcnt[2] = usub[2];
+        msgcnt[3] = usub[1];
+    }
+    else
+    {
+        msgcnt[2] = msgcnt[3] = 0;
+    }
+
+    if (ToSendD[lk] == YES)
+    {
+        for (int_t pi = 0; pi < Pr; ++pi)
+        {
+            if (pi != myrow)
+            {
+
+                MPI_Isend (usub, msgcnt[2], mpi_int_t, pi,
+                           SLU_MPI_TAG (2, k0) /* (4*k0+2)%tag_ub */ ,
+                           scp->comm,
+                           &send_req_u[pi]);
+                MPI_Isend (uval, msgcnt[3], MPI_FLOAT,
+                           pi, SLU_MPI_TAG (3, k0) /* (4*kk0+3)%tag_ub */ ,
+                           scp->comm,
+                           &send_req_u[pi + Pr]);
+
+            }   /* if pi ... */
+        }   /* for pi ... */
+    }       /* if ToSendD ... */
+    return 0;
+}
+
+/*Synchronously broadcasts U panel to my process row */
+int_t sBcast_UPanel(int_t k, int_t k0, int_t* usub,
+                     float* uval, gridinfo_t *grid,
+		   int* msgcnt, int *ToSendD, SCT_t* SCT, int tag_ub)
+
+{
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    int_t iam = grid->iam;
+    int_t lk = LBi (k, grid);
+    int_t Pr = grid->nprow;
+    int_t myrow = MYROW (iam, grid);
+    superlu_scope_t *scp = &grid->cscp; /* The scope of process col. */
+    if (usub)
+    {
+        msgcnt[2] = usub[2];
+        msgcnt[3] = usub[1];
+    }
+    else
+    {
+        msgcnt[2] = msgcnt[3] = 0;
+    }
+
+    if (ToSendD[lk] == YES)
+    {
+        for (int_t pi = 0; pi < Pr; ++pi)
+        {
+            if (pi != myrow)
+            {
+                MPI_Send (usub, msgcnt[2], mpi_int_t, pi,
+                          SLU_MPI_TAG (2, k0) /* (4*k0+2)%tag_ub */ ,
+                          scp->comm);
+                MPI_Send (uval, msgcnt[3], MPI_FLOAT, pi,
+                          SLU_MPI_TAG (3, k0) /* (4*k0+3)%tag_ub */ ,
+                          scp->comm);
+
+            }       /* if pi ... */
+        }           /* for pi ... */
+    }
+    //SCT->Bcast_UPanel_tl += (double) ( _rdtsc() - t1);
+    SCT->Bcast_UPanel_tl +=  SuperLU_timer_() - t1;
+    return 0;
+}
+
+int_t sIrecv_LPanel
+/*it places Irecv call for L panel*/
+(int_t k, int_t k0,  int_t* Lsub_buf, float* Lval_buf,
+ gridinfo_t *grid, MPI_Request *recv_req, sLocalLU_t *Llu, int tag_ub )
+{
+    int_t kcol = PCOL (k, grid);
+
+    superlu_scope_t *scp = &grid->rscp;  /* The scope of process row. */
+    MPI_Irecv (Lsub_buf, Llu->bufmax[0], mpi_int_t, kcol,
+               SLU_MPI_TAG (0, k0) /* 0 */ ,
+               scp->comm, &recv_req[0]);
+    MPI_Irecv (Lval_buf, Llu->bufmax[1], MPI_FLOAT, kcol,
+               SLU_MPI_TAG (1, k0) /* 1 */ ,
+               scp->comm, &recv_req[1]);
+    return 0;
+}
+
+
+int_t sIrecv_UPanel
+/*it places Irecv calls to receive U panels*/
+(int_t k, int_t k0, int_t* Usub_buf, float* Uval_buf, sLocalLU_t *Llu,
+ gridinfo_t* grid, MPI_Request *recv_req_u, int tag_ub )
+{
+    int_t krow = PROW (k, grid);
+    superlu_scope_t *scp = &grid->cscp;  /* The scope of process column. */
+    MPI_Irecv (Usub_buf, Llu->bufmax[2], mpi_int_t, krow,
+               SLU_MPI_TAG (2, k0) /* (4*kk0+2)%tag_ub */ ,
+               scp->comm, &recv_req_u[0]);
+    MPI_Irecv (Uval_buf, Llu->bufmax[3], MPI_FLOAT, krow,
+               SLU_MPI_TAG (3, k0) /* (4*kk0+3)%tag_ub */ ,
+               scp->comm, &recv_req_u[1]);
+
+    return 0;
+}
+
+int_t sWait_URecv
+( MPI_Request *recv_req, int* msgcnt, SCT_t* SCT)
+{
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    MPI_Status status;
+    MPI_Wait (&recv_req[0], &status);
+    MPI_Get_count (&status, mpi_int_t, &msgcnt[2]);
+    MPI_Wait (&recv_req[1], &status);
+    MPI_Get_count (&status, MPI_FLOAT, &msgcnt[3]);
+    //SCT->Wait_URecv_tl += (double) ( _rdtsc() - t1);
+    SCT->Wait_URecv_tl += SuperLU_timer_() - t1;
+    return 0;
+}
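+
+/* Typical pairing (illustrative): post the receive early, overlap other
+ * work, then block just before the U panel is consumed; msgcnt[2] and
+ * msgcnt[3] receive the actual index/value counts:
+ *
+ *     sIrecv_UPanel(k, k0, Usub_buf, Uval_buf, Llu, grid, recv_req_u, tag_ub);
+ *     ...   // overlapped computation
+ *     sWait_URecv(recv_req_u, msgcnt, SCT);
+ */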
+
+int_t sWait_LRecv
+/*waits till L blocks have been received*/
+(  MPI_Request* recv_req, int* msgcnt, int* msgcntsU, gridinfo_t * grid, SCT_t* SCT)
+{
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    MPI_Status status;
+    
+    if (recv_req[0] != MPI_REQUEST_NULL)
+    {
+        MPI_Wait (&recv_req[0], &status);
+        MPI_Get_count (&status, mpi_int_t, &msgcnt[0]);
+        recv_req[0] = MPI_REQUEST_NULL;
+    }
+    else
+    {
+        msgcnt[0] = msgcntsU[0];
+    }
+
+    if (recv_req[1] != MPI_REQUEST_NULL)
+    {
+        MPI_Wait (&recv_req[1], &status);
+        MPI_Get_count (&status, MPI_FLOAT, &msgcnt[1]);
+        recv_req[1] = MPI_REQUEST_NULL;
+    }
+    else
+    {
+        msgcnt[1] = msgcntsU[1];
+    }
+    //SCT->Wait_LRecv_tl += (double) ( _rdtsc() - t1);
+    SCT->Wait_LRecv_tl +=  SuperLU_timer_() - t1;
+    return 0;
+}
+
+
+int_t sISend_UDiagBlock(int_t k0, float *ublk_ptr, /*pointer for the diagonal block*/
+                       int_t size, /*number of elements to be broadcasted*/
+                       MPI_Request *U_diag_blk_send_req,
+                       gridinfo_t * grid, int tag_ub)
+{
+    int_t iam = grid->iam;
+    int_t Pr = grid->nprow;
+    int_t myrow = MYROW (iam, grid);
+    MPI_Comm comm = (grid->cscp).comm;
+    /** ALWAYS SEND TO ALL OTHERS - TO FIX **/
+    for (int_t pr = 0; pr < Pr; ++pr)
+    {
+        if (pr != myrow)
+        {
+            /* tag = ((k0<<2)+2) % tag_ub;        */
+            /* tag = (4*(nsupers+k0)+2) % tag_ub; */
+            MPI_Isend (ublk_ptr, size, MPI_FLOAT, pr,
+                       SLU_MPI_TAG (4, k0) /* tag */ ,
+                       comm, U_diag_blk_send_req + pr);
+        }
+    }
+
+    return 0;
+}
+
+
+int_t sRecv_UDiagBlock(int_t k0, float *ublk_ptr, /*pointer for the diagonal block*/
+                      int_t size, /*number of elements to be broadcasted*/
+                      int_t src,
+                      gridinfo_t * grid, SCT_t* SCT, int tag_ub)
+{
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    MPI_Status status;
+    MPI_Comm comm = (grid->cscp).comm;
+    /* tag = ((k0<<2)+2) % tag_ub;        */
+    /* tag = (4*(nsupers+k0)+2) % tag_ub; */
+
+    MPI_Recv (ublk_ptr, size, MPI_FLOAT, src,
+              SLU_MPI_TAG (4, k0), comm, &status);
+    //SCT->Recv_UDiagBlock_tl += (double) ( _rdtsc() - t1);
+    SCT->Recv_UDiagBlock_tl += SuperLU_timer_() - t1;
+    return 0;
+}
+
+
+int_t sPackLBlock(int_t k, float* Dest, Glu_persist_t *Glu_persist,
+                  gridinfo_t *grid, sLocalLU_t *Llu)
+/*Copies src matrix into dest matrix*/
+{
+    /* Initialization. */
+    int_t *xsup = Glu_persist->xsup;
+    int_t lk = LBj (k, grid);          /* Local block number */
+    float *lusup = Llu->Lnzval_bc_ptr[lk];
+    int_t nsupc = SuperSize (k);
+    int_t nsupr;
+    if (Llu->Lrowind_bc_ptr[lk])
+        nsupr = Llu->Lrowind_bc_ptr[lk][1];
+    else
+        nsupr = 0;
+#if 0
+    LAPACKE_dlacpy (LAPACK_COL_MAJOR, 'A', nsupc, nsupc, lusup, nsupr, Dest, nsupc);
+#else /* Sherry */
+    for (int j = 0; j < nsupc; ++j) {
+	memcpy( &Dest[j * nsupc], &lusup[j * nsupr], nsupc * sizeof(float) );
+    }
+#endif
+    
+    return 0;
+}
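+
+/* The loop above extracts the leading nsupc x nsupc diagonal block from a
+ * column-major panel with leading dimension nsupr into a contiguous buffer
+ * with leading dimension nsupc; it matches the ?lacpy('A', ...) call in the
+ * disabled branch. */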
+
+int_t sISend_LDiagBlock(int_t k0, float *lblk_ptr, /*pointer for the diagonal block*/
+                       int_t size,                                        /*number of elements to be broadcasted*/
+                       MPI_Request *L_diag_blk_send_req,
+                       gridinfo_t * grid, int tag_ub)
+{
+    int_t iam = grid->iam;
+    int_t Pc = grid->npcol;
+    int_t mycol = MYCOL (iam, grid);
+    MPI_Comm comm = (grid->rscp).comm; /*Row communicator*/
+    /** ALWAYS SEND TO ALL OTHERS - TO FIX **/
+    for (int_t pc = 0; pc < Pc; ++pc)
+    {
+        if (pc != mycol)
+        {
+            /* tag = ((k0<<2)+2) % tag_ub;        */
+            /* tag = (4*(nsupers+k0)+2) % tag_ub; */
+            MPI_Isend (lblk_ptr, size, MPI_FLOAT, pc,
+                       SLU_MPI_TAG (5, k0) /* tag */ ,
+                       comm, L_diag_blk_send_req + pc);
+
+        }
+    }
+
+    return 0;
+}
+
+
+int_t sIRecv_UDiagBlock(int_t k0, float *ublk_ptr, /*pointer for the diagonal block*/
+                       int_t size,                                        /*number of elements to be broadcasted*/
+                       int_t src,
+                       MPI_Request *U_diag_blk_recv_req,
+                       gridinfo_t * grid, SCT_t* SCT, int tag_ub)
+{
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    MPI_Comm comm = (grid->cscp).comm;
+    /* tag = ((k0<<2)+2) % tag_ub;        */
+    /* tag = (4*(nsupers+k0)+2) % tag_ub; */
+
+    int_t err = MPI_Irecv (ublk_ptr, size, MPI_FLOAT, src,
+               		   SLU_MPI_TAG (4, k0), comm, U_diag_blk_recv_req);
+    if (err==MPI_ERR_COUNT)
+    {
+        printf("Error in IRecv_UDiagBlock count\n");
+    }
+    //SCT->Recv_UDiagBlock_tl += (double) ( _rdtsc() - t1);
+    SCT->Recv_UDiagBlock_tl += SuperLU_timer_() - t1;
+    return 0;
+}
+
+int_t sIRecv_LDiagBlock(int_t k0, float *L_blk_ptr, /*pointer for the diagonal block*/
+                       int_t size,  /*number of elements to be broadcasted*/
+                       int_t src,
+                       MPI_Request *L_diag_blk_recv_req,
+                       gridinfo_t * grid, SCT_t* SCT, int tag_ub)
+{
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    MPI_Comm comm = (grid->rscp).comm;
+    /* tag = ((k0<<2)+2) % tag_ub;        */
+    /* tag = (4*(nsupers+k0)+2) % tag_ub; */
+
+    int_t err = MPI_Irecv (L_blk_ptr, size, MPI_FLOAT, src,
+                   SLU_MPI_TAG (5, k0),
+                   comm, L_diag_blk_recv_req);
+    if (err==MPI_ERR_COUNT)
+    {
+        printf("Error in IRecv_lDiagBlock count\n");
+    }
+    //SCT->Recv_UDiagBlock_tl += (double) ( _rdtsc() - t1);
+    SCT->Recv_UDiagBlock_tl += SuperLU_timer_() - t1;
+    return 0;
+}
+
+#if (MPI_VERSION>2)
+
+/**** Nonblocking broadcasts based on MPI_Ibcast ****/
+int_t sIBcast_UDiagBlock(int_t k, float *ublk_ptr, /*pointer for the diagonal block*/
+                        int_t size,  /*number of elements to be broadcasted*/
+                        MPI_Request *L_diag_blk_ibcast_req,
+                        gridinfo_t * grid)
+{
+    int_t  krow = PROW (k, grid);
+    MPI_Comm comm = (grid->cscp).comm;
+
+    MPI_Ibcast(ublk_ptr, size, MPI_FLOAT, krow,comm, L_diag_blk_ibcast_req);
+    
+    // MPI_Status status;
+    // MPI_Wait(L_diag_blk_ibcast_req, &status);
+    return 0;
+}
+
+int_t sIBcast_LDiagBlock(int_t k, float *lblk_ptr, /*pointer for the diagonal block*/
+                        int_t size,  /*number of elements to be broadcasted*/
+                        MPI_Request *U_diag_blk_ibcast_req,
+                        gridinfo_t * grid)
+{
+    int_t  kcol = PCOL (k, grid);
+    MPI_Comm comm = (grid->rscp).comm;
+
+    MPI_Ibcast(lblk_ptr, size, MPI_FLOAT, kcol,comm, U_diag_blk_ibcast_req);
+    // MPI_Status status;
+    // MPI_Wait(U_diag_blk_ibcast_req, &status);
+    return 0;
+}
+
+#endif 
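+
+/* Completion sketch (illustrative): MPI_Ibcast is nonblocking on every rank,
+ * including the root, so each rank must complete the request before reading
+ * or reusing the diagonal block:
+ *
+ *     MPI_Request req = MPI_REQUEST_NULL;
+ *     sIBcast_UDiagBlock(k, ublk_ptr, size, &req, grid);
+ *     MPI_Wait(&req, MPI_STATUS_IGNORE);
+ */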
+
+int_t sUDiagBlockRecvWait( int_t k,  int_t* IrecvPlcd_D, int_t* factored_L,
+                           MPI_Request * U_diag_blk_recv_req,
+                           gridinfo_t *grid,
+                           sLUstruct_t *LUstruct, SCT_t *SCT)
+{
+    sLocalLU_t *Llu = LUstruct->Llu;
+
+    int_t iam = grid->iam;
+
+    int_t mycol = MYCOL (iam, grid);
+    int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
+
+    int_t kcol = PCOL (k, grid);
+
+    if (IrecvPlcd_D[k] == 1)
+    {
+        /* code */
+        /*factor the L panel*/
+        if (mycol == kcol  && factored_L[k] == 0 && iam != pkk)
+        {
+            factored_L[k] = 1;
+            int_t lk = LBj (k, grid);
+
+            int_t nsupr;
+            if (Llu->Lrowind_bc_ptr[lk])
+                nsupr = Llu->Lrowind_bc_ptr[lk][1];
+            else
+                nsupr = 0;
+            /*wait for communication to finish*/
+
+            // Wait_UDiagBlock_Recv( U_diag_blk_recv_req, SCT);
+            int_t flag = 0;
+            while (flag == 0)
+            {
+                flag = Test_UDiagBlock_Recv( U_diag_blk_recv_req, SCT);
+            }
+        }
+    }
+    return 0;
+}
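+
+/* Note: the flag loop above polls Test_UDiagBlock_Recv() rather than issuing
+ * a single blocking wait (the commented-out Wait_UDiagBlock_Recv call); the
+ * repeated MPI_Test keeps the MPI progress engine turning while waiting. */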
+
diff --git a/SRC/sdistribute.c b/SRC/sdistribute.c
new file mode 100644
index 00000000..964f7ce4
--- /dev/null
+++ b/SRC/sdistribute.c
@@ -0,0 +1,1652 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Distribute the matrix onto the 2D process mesh.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 2.3) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 15, 2008
+ * 
+ */ +#include "superlu_sdefs.h" + + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *   Distribute the matrix onto the 2D process mesh.
+ *
+ * Arguments
+ * =========
+ *
+ * fact (input) fact_t
+ *        Specifies whether or not the L and U structures will be re-used.
+ *        = SamePattern_SameRowPerm: L and U structures are input, and
+ *                                   unchanged on exit.
+ *        = DOFACT or SamePattern: L and U structures are computed and output.
+ *
+ * n      (input) int
+ *        Dimension of the matrix.
+ *
+ * A      (input) SuperMatrix*
+ *	  The original matrix A, permuted by columns, of dimension
+ *        (A->nrow, A->ncol). The type of A can be:
+ *        Stype = SLU_NCP; Dtype = SLU_S; Mtype = SLU_GE.
+ *
+ * LUstruct (input) sLUstruct_t*
+ *        Data structures for L and U factors.
+ *
+ * grid   (input) gridinfo_t*
+ *        The 2D process mesh.
+ *
+ * Return value
+ * ============
+ *   > 0, working storage (in bytes) required to perform redistribution.
+ *        (excluding LU factor size)
+ * 
+ */
+
+float
+sdistribute(fact_t fact, int_t n, SuperMatrix *A,
+            Glu_freeable_t *Glu_freeable,
+            sLUstruct_t *LUstruct, gridinfo_t *grid)
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t bnnz, fsupc, fsupc1, i, ii, irow, istart, j, ib, jb, jj, k, k1,
+          len, len1, nsupc;
+    int_t lib;   /* local block row number */
+    int_t nlb;   /* local block rows*/
+    int_t ljb;   /* local block column number */
+    int_t nrbl;  /* number of L blocks in current block column */
+    int_t nrbu;  /* number of U blocks in current block column */
+    int_t gb;    /* global block number; 0 < gb <= nsuper */
+    int_t lb;    /* local block number; 0 < lb <= ceil(NSUPERS/Pr) */
+    int_t ub,gik,iklrow,fnz;
+    int iam, jbrow, kcol, krow, mycol, myrow, pc, pr;
+    int_t mybufmax[NBUFFERS];
+    NCPformat *Astore;
+    float *a;
+    int_t *asub;
+    int_t *xa_begin, *xa_end;
+    int_t *xsup = Glu_persist->xsup;    /* supernode and column mapping */
+    int_t *supno = Glu_persist->supno;
+    int_t *lsub, *xlsub, *usub, *usub1, *xusub;
+    int_t nsupers;
+    int_t next_lind;  /* next available position in index[*] */
+    int_t next_lval;  /* next available position in nzval[*] */
+    int_t *index;     /* indices consist of headers and row subscripts */
+    int_t *index_srt; /* indices consist of headers and row subscripts */
+    int *index1;      /* temporary pointer to array of int */
+    float *lusup, *lusup_srt, *uval; /* nonzero values in L and U */
+    float **Lnzval_bc_ptr;      /* size ceil(NSUPERS/Pc) */
+    int_t **Lrowind_bc_ptr;     /* size ceil(NSUPERS/Pc) */
+    int_t **Lindval_loc_bc_ptr; /* size ceil(NSUPERS/Pc) */
+    int_t *Unnz;                /* size ceil(NSUPERS/Pc) */
+    float **Unzval_br_ptr;      /* size ceil(NSUPERS/Pr) */
+    int_t **Ufstnz_br_ptr;      /* size ceil(NSUPERS/Pr) */
+    BcTree *LBtree_ptr;         /* size ceil(NSUPERS/Pc) */
+    RdTree *LRtree_ptr;         /* size ceil(NSUPERS/Pr) */
+    BcTree *UBtree_ptr;         /* size ceil(NSUPERS/Pc) */
+    RdTree *URtree_ptr;         /* size ceil(NSUPERS/Pr) */
+    int msgsize;
+
+    int_t *Urbs,*Urbs1; /* Number of row blocks in each block column of U. */
+    Ucb_indptr_t **Ucb_indptr;/* Vertical linked list pointing to Uindex[] */
+    int_t **Ucb_valptr;       /* Vertical linked list pointing to Unzval[] */
+
+    /*-- Counts to be used in factorization. --*/
+    int *ToRecv, *ToSendD, **ToSendR;
+
+    /*-- Counts to be used in lower triangular solve. --*/
+    int_t *fmod;          /* Modification count for L-solve. */
+    int_t **fsendx_plist; /* Column process list to send down Xk. */
+    int_t nfrecvx = 0;    /* Number of Xk I will receive. */
+    int_t nfsendx = 0;    /* Number of Xk I will send */
+    int_t kseen;
+
+    /*-- Counts to be used in upper triangular solve. --*/
+    int_t *bmod;          /* Modification count for U-solve. */
+    int_t **bsendx_plist; /* Column process list to send down Xk. */
+    int_t nbrecvx = 0;    /* Number of Xk I will receive.
*/ + int_t nbsendx = 0; /* Number of Xk I will send */ + int_t *ilsum; /* starting position of each supernode in + the full array (local) */ + + /*-- Auxiliary arrays; freed on return --*/ + int_t *rb_marker; /* block hit marker; size ceil(NSUPERS/Pr) */ + int_t *Urb_length; /* U block length; size ceil(NSUPERS/Pr) */ + int_t *Urb_indptr; /* pointers to U index[]; size ceil(NSUPERS/Pr) */ + int_t *Urb_fstnz; /* # of fstnz in a block row; size ceil(NSUPERS/Pr) */ + int_t *Ucbs; /* number of column blocks in a block row */ + int_t *Lrb_length; /* L block length; size ceil(NSUPERS/Pr) */ + int_t *Lrb_number; /* global block number; size ceil(NSUPERS/Pr) */ + int_t *Lrb_indptr; /* pointers to L index[]; size ceil(NSUPERS/Pr) */ + int_t *Lrb_valptr; /* pointers to L nzval[]; size ceil(NSUPERS/Pr) */ + int_t *ActiveFlag; + int_t *ActiveFlagAll; + int_t Iactive; + int *ranks; + int_t *idxs; + int_t **nzrows; + double rseed; + int rank_cnt,rank_cnt_ref,Root; + float *dense, *dense_col; /* SPA */ + float zero = 0.0; + int_t ldaspa; /* LDA of SPA */ + int_t iword, sword; + float mem_use = 0.0; + + int_t *mod_bit; + int_t *frecv, *brecv, *lloc; + float **Linv_bc_ptr; /* size ceil(NSUPERS/Pc) */ + float **Uinv_bc_ptr; /* size ceil(NSUPERS/Pc) */ + double *SeedSTD_BC,*SeedSTD_RD; + int_t idx_indx,idx_lusup; + int_t nbrow; + int_t ik, il, lk, rel, knsupc, idx_r; + int_t lptr1_tmp, idx_i, idx_v,m, uu; + int_t nub; + int tag; + +#if ( PRNTlevel>=1 ) + int_t nLblocks = 0, nUblocks = 0; +#endif +#if ( PROFlevel>=1 ) + double t, t_u, t_l; + int_t u_blks; +#endif + + /* Initialization. */ + iam = grid->iam; + myrow = MYROW( iam, grid ); + mycol = MYCOL( iam, grid ); + for (i = 0; i < NBUFFERS; ++i) mybufmax[i] = 0; + nsupers = supno[n-1] + 1; + Astore = A->Store; + a = Astore->nzval; + asub = Astore->rowind; + xa_begin = Astore->colbeg; + xa_end = Astore->colend; +//#if ( PRNTlevel>=1 ) + iword = sizeof(int_t); + sword = sizeof(float); +//#endif + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(iam, "Enter sdistribute()"); +#endif + + if ( fact == SamePattern_SameRowPerm ) { + /* --------------------------------------------------------------- + * REUSE THE L AND U DATA STRUCTURES FROM A PREVIOUS FACTORIZATION. + * --------------------------------------------------------------- */ + +#if ( PROFlevel>=1 ) + t_l = t_u = 0; u_blks = 0; +#endif + /* We can propagate the new values of A into the existing + L and U data structures. */ + ilsum = Llu->ilsum; + ldaspa = Llu->ldalsum; + if ( !(dense = floatCalloc_dist(((size_t)ldaspa) * sp_ienv_dist(3))) ) + ABORT("Calloc fails for SPA dense[]."); + nrbu = CEILING( nsupers, grid->nprow ); /* No. of local block rows */ + if ( !(Urb_length = intCalloc_dist(nrbu)) ) + ABORT("Calloc fails for Urb_length[]."); + if ( !(Urb_indptr = intMalloc_dist(nrbu)) ) + ABORT("Malloc fails for Urb_indptr[]."); + Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + Lindval_loc_bc_ptr = Llu->Lindval_loc_bc_ptr; + Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + Unzval_br_ptr = Llu->Unzval_br_ptr; + Unnz = Llu->Unnz; + + mem_use += 2.0*nrbu*iword + ldaspa*sp_ienv_dist(3)*sword; + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + + /* Initialize Uval to zero. */ + for (lb = 0; lb < nrbu; ++lb) { + Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ + index = Ufstnz_br_ptr[lb]; + if ( index ) { + uval = Unzval_br_ptr[lb]; + len = index[1]; + for (i = 0; i < len; ++i) uval[i] = zero; + } /* if index != NULL */ + } /* for lb ... 
*/ + + for (jb = 0; jb < nsupers; ++jb) { /* Loop through each block column */ + pc = PCOL( jb, grid ); + if ( mycol == pc ) { /* Block column jb in my process column */ + fsupc = FstBlockC( jb ); + nsupc = SuperSize( jb ); + + /* Scatter A into SPA (for L), or into U directly. */ + for (j = fsupc, dense_col = dense; j < FstBlockC(jb+1); ++j) { + for (i = xa_begin[j]; i < xa_end[j]; ++i) { + irow = asub[i]; + gb = BlockNum( irow ); + if ( myrow == PROW( gb, grid ) ) { + lb = LBi( gb, grid ); + if ( gb < jb ) { /* in U */ + index = Ufstnz_br_ptr[lb]; + uval = Unzval_br_ptr[lb]; + while ( (k = index[Urb_indptr[lb]]) < jb ) { + /* Skip nonzero values in this block */ + Urb_length[lb] += index[Urb_indptr[lb]+1]; + /* Move pointer to the next block */ + Urb_indptr[lb] += UB_DESCRIPTOR + + SuperSize( k ); + } + /*assert(k == jb);*/ + /* start fstnz */ + istart = Urb_indptr[lb] + UB_DESCRIPTOR; + len = Urb_length[lb]; + fsupc1 = FstBlockC( gb+1 ); + k = j - fsupc; + /* Sum the lengths of the leading columns */ + for (jj = 0; jj < k; ++jj) + len += fsupc1 - index[istart++]; + /*assert(irow>=index[istart]);*/ + uval[len + irow - index[istart]] = a[i]; + } else { /* in L; put in SPA first */ + irow = ilsum[lb] + irow - FstBlockC( gb ); + dense_col[irow] = a[i]; + } + } + } /* for i ... */ + dense_col += ldaspa; + } /* for j ... */ + +#if ( PROFlevel>=1 ) + t_u += SuperLU_timer_() - t; + t = SuperLU_timer_(); +#endif + + /* Gather the values of A from SPA into Lnzval[]. */ + ljb = LBj( jb, grid ); /* Local block number */ + index = Lrowind_bc_ptr[ljb]; + if ( index ) { + nrbl = index[0]; /* Number of row blocks. */ + len = index[1]; /* LDA of lusup[]. */ + lusup = Lnzval_bc_ptr[ljb]; + next_lind = BC_HEADER; + next_lval = 0; + for (jj = 0; jj < nrbl; ++jj) { + gb = index[next_lind++]; + len1 = index[next_lind++]; /* Rows in the block. */ + lb = LBi( gb, grid ); + for (bnnz = 0; bnnz < len1; ++bnnz) { + irow = index[next_lind++]; /* Global index. */ + irow = ilsum[lb] + irow - FstBlockC( gb ); + k = next_lval++; + for (j = 0, dense_col = dense; j < nsupc; ++j) { + lusup[k] = dense_col[irow]; + dense_col[irow] = zero; + k += len; + dense_col += ldaspa; + } + } /* for bnnz ... */ + } /* for jj ... */ + } /* if index ... */ +#if ( PROFlevel>=1 ) + t_l += SuperLU_timer_() - t; +#endif + } /* if mycol == pc */ + } /* for jb ... */ + + SUPERLU_FREE(dense); + SUPERLU_FREE(Urb_length); + SUPERLU_FREE(Urb_indptr); +#if ( PROFlevel>=1 ) + if ( !iam ) printf(".. 2nd distribute time: L %.2f\tU %.2f\tu_blks %d\tnrbu %d\n", + t_l, t_u, u_blks, nrbu); +#endif + + } else { + /* -------------------------------------------------- + * FIRST TIME CREATING THE L AND U DATA STRUCTURE. + * -------------------------------------------------- */ + +#if ( PROFlevel>=1 ) + t_l = t_u = 0; u_blks = 0; +#endif + /* No L and U data structures are available yet. + We need to set up the L and U data structures and propagate + the values of A into them. 
*/ + lsub = Glu_freeable->lsub; /* compressed L subscripts */ + xlsub = Glu_freeable->xlsub; + usub = Glu_freeable->usub; /* compressed U subscripts */ + xusub = Glu_freeable->xusub; + + if ( !(ToRecv = SUPERLU_MALLOC(nsupers * sizeof(int))) ) + ABORT("Malloc fails for ToRecv[]."); + for (i = 0; i < nsupers; ++i) ToRecv[i] = 0; + + k = CEILING( nsupers, grid->npcol );/* Number of local column blocks */ + if ( !(ToSendR = (int **) SUPERLU_MALLOC(k*sizeof(int*))) ) + ABORT("Malloc fails for ToSendR[]."); + j = k * grid->npcol; + if ( !(index1 = SUPERLU_MALLOC(j * sizeof(int))) ) + ABORT("Malloc fails for index[]."); + + mem_use += (float) k*sizeof(int_t*) + (j + nsupers)*iword; + + for (i = 0; i < j; ++i) index1[i] = EMPTY; + for (i = 0,j = 0; i < k; ++i, j += grid->npcol) ToSendR[i] = &index1[j]; + k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */ + + /* Pointers to the beginning of each block row of U. */ + if ( !(Unzval_br_ptr = + (float**)SUPERLU_MALLOC(k * sizeof(float*))) ) + ABORT("Malloc fails for Unzval_br_ptr[]."); + if ( !(Ufstnz_br_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) + ABORT("Malloc fails for Ufstnz_br_ptr[]."); + + if ( !(ToSendD = SUPERLU_MALLOC(k * sizeof(int))) ) + ABORT("Malloc fails for ToSendD[]."); + for (i = 0; i < k; ++i) ToSendD[i] = NO; + if ( !(ilsum = intMalloc_dist(k+1)) ) + ABORT("Malloc fails for ilsum[]."); + + /* Auxiliary arrays used to set up U block data structures. + They are freed on return. */ + if ( !(rb_marker = intCalloc_dist(k)) ) + ABORT("Calloc fails for rb_marker[]."); + if ( !(Urb_length = intCalloc_dist(k)) ) + ABORT("Calloc fails for Urb_length[]."); + if ( !(Urb_indptr = intMalloc_dist(k)) ) + ABORT("Malloc fails for Urb_indptr[]."); + if ( !(Urb_fstnz = intCalloc_dist(k)) ) + ABORT("Calloc fails for Urb_fstnz[]."); + if ( !(Ucbs = intCalloc_dist(k)) ) + ABORT("Calloc fails for Ucbs[]."); + + mem_use += 2.0*k*sizeof(int_t*) + (7.0*k+1)*iword; + + /* Compute ldaspa and ilsum[]. */ + ldaspa = 0; + ilsum[0] = 0; + for (gb = 0; gb < nsupers; ++gb) { + if ( myrow == PROW( gb, grid ) ) { + i = SuperSize( gb ); + ldaspa += i; + lb = LBi( gb, grid ); + ilsum[lb + 1] = ilsum[lb] + i; + } + } + + + /* ------------------------------------------------------------ + COUNT NUMBER OF ROW BLOCKS AND THE LENGTH OF EACH BLOCK IN U. + THIS ACCOUNTS FOR ONE-PASS PROCESSING OF G(U). + ------------------------------------------------------------*/ + + /* Loop through each supernode column. */ + for (jb = 0; jb < nsupers; ++jb) { + pc = PCOL( jb, grid ); + fsupc = FstBlockC( jb ); + nsupc = SuperSize( jb ); + /* Loop through each column in the block. */ + for (j = fsupc; j < fsupc + nsupc; ++j) { + /* usub[*] contains only "first nonzero" in each segment. */ + for (i = xusub[j]; i < xusub[j+1]; ++i) { + irow = usub[i]; /* First nonzero of the segment. */ + gb = BlockNum( irow ); + kcol = PCOL( gb, grid ); + ljb = LBj( gb, grid ); + if ( mycol == kcol && mycol != pc ) ToSendR[ljb][pc] = YES; + pr = PROW( gb, grid ); + lb = LBi( gb, grid ); + if ( mycol == pc ) { + if ( myrow == pr ) { + ToSendD[lb] = YES; + /* Count nonzeros in entire block row. */ + Urb_length[lb] += FstBlockC( gb+1 ) - irow; + if (rb_marker[lb] <= jb) {/* First see the block */ + rb_marker[lb] = jb + 1; + Urb_fstnz[lb] += nsupc; + ++Ucbs[lb]; /* Number of column blocks + in block row lb. */ +#if ( PRNTlevel>=1 ) + ++nUblocks; +#endif + } + ToRecv[gb] = 1; + } else ToRecv[gb] = 2; /* Do I need 0, 1, 2 ? */ + } + } /* for i ... */ + } /* for j ... 
*/ + } /* for jb ... */ + + /* Set up the initial pointers for each block row in U. */ + nrbu = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + for (lb = 0; lb < nrbu; ++lb) { + len = Urb_length[lb]; + rb_marker[lb] = 0; /* Reset block marker. */ + if ( len ) { + /* Add room for descriptors */ + len1 = Urb_fstnz[lb] + BR_HEADER + Ucbs[lb] * UB_DESCRIPTOR; + if ( !(index = intMalloc_dist(len1+1)) ) + ABORT("Malloc fails for Uindex[]."); + Ufstnz_br_ptr[lb] = index; + if ( !(Unzval_br_ptr[lb] = floatMalloc_dist(len)) ) + ABORT("Malloc fails for Unzval_br_ptr[*][]."); + mybufmax[2] = SUPERLU_MAX( mybufmax[2], len1 ); + mybufmax[3] = SUPERLU_MAX( mybufmax[3], len ); + index[0] = Ucbs[lb]; /* Number of column blocks */ + index[1] = len; /* Total length of nzval[] */ + index[2] = len1; /* Total length of index[] */ + index[len1] = -1; /* End marker */ + } else { + Ufstnz_br_ptr[lb] = NULL; + Unzval_br_ptr[lb] = NULL; + } + Urb_length[lb] = 0; /* Reset block length. */ + Urb_indptr[lb] = BR_HEADER; /* Skip header in U index[]. */ + Urb_fstnz[lb] = BR_HEADER; + } /* for lb ... */ + + SUPERLU_FREE(Ucbs); + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_() - t; + if ( !iam) printf(".. Phase 2 - setup U strut time: %.2f\t\n", t); +#endif + + mem_use -= 2.0*k * iword; + + /* Auxiliary arrays used to set up L block data structures. + They are freed on return. + k is the number of local row blocks. */ + if ( !(Lrb_length = intCalloc_dist(k)) ) + ABORT("Calloc fails for Lrb_length[]."); + if ( !(Lrb_number = intMalloc_dist(k)) ) + ABORT("Malloc fails for Lrb_number[]."); + if ( !(Lrb_indptr = intMalloc_dist(k)) ) + ABORT("Malloc fails for Lrb_indptr[]."); + if ( !(Lrb_valptr = intMalloc_dist(k)) ) + ABORT("Malloc fails for Lrb_valptr[]."); + if (!(dense=floatCalloc_dist(SUPERLU_MAX(1,((size_t)ldaspa) + *sp_ienv_dist(3))))) + ABORT("Calloc fails for SPA dense[]."); + + /* These counts will be used for triangular solves. */ + if ( !(fmod = intCalloc_dist(k)) ) + ABORT("Calloc fails for fmod[]."); + if ( !(bmod = intCalloc_dist(k)) ) + ABORT("Calloc fails for bmod[]."); +#if ( PRNTlevel>=1 ) + mem_use += 6.0*k*iword + ldaspa*sp_ienv_dist(3)*sword; +#endif + k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ + + /* Pointers to the beginning of each block column of L. */ + if ( !(Lnzval_bc_ptr = (float**)SUPERLU_MALLOC(k * sizeof(float*))) ) + ABORT("Malloc fails for Lnzval_bc_ptr[]."); + if ( !(Lrowind_bc_ptr = (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) + ABORT("Malloc fails for Lrowind_bc_ptr[]."); + Lrowind_bc_ptr[k-1] = NULL; + + if ( !(Lindval_loc_bc_ptr = + (int_t**)SUPERLU_MALLOC(k * sizeof(int_t*))) ) + ABORT("Malloc fails for Lindval_loc_bc_ptr[]."); + Lindval_loc_bc_ptr[k-1] = NULL; + + if ( !(Linv_bc_ptr = + (float**)SUPERLU_MALLOC(k * sizeof(float*))) ) { + fprintf(stderr, "Malloc fails for Linv_bc_ptr[]."); + } + if ( !(Uinv_bc_ptr = + (float**)SUPERLU_MALLOC(k * sizeof(float*))) ) { + fprintf(stderr, "Malloc fails for Uinv_bc_ptr[]."); + } + Linv_bc_ptr[k-1] = NULL; + Uinv_bc_ptr[k-1] = NULL; + + if ( !(Unnz = + (int_t*)SUPERLU_MALLOC(k * sizeof(int_t))) ) + ABORT("Malloc fails for Unnz[]."); + + /* These lists of processes will be used for triangular solves. 
*/ + if ( !(fsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) + ABORT("Malloc fails for fsendx_plist[]."); + len = k * grid->nprow; + if ( !(index = intMalloc_dist(len)) ) + ABORT("Malloc fails for fsendx_plist[0]"); + for (i = 0; i < len; ++i) index[i] = EMPTY; + for (i = 0, j = 0; i < k; ++i, j += grid->nprow) + fsendx_plist[i] = &index[j]; + if ( !(bsendx_plist = (int_t **) SUPERLU_MALLOC(k*sizeof(int_t*))) ) + ABORT("Malloc fails for bsendx_plist[]."); + if ( !(index = intMalloc_dist(len)) ) + ABORT("Malloc fails for bsendx_plist[0]"); + for (i = 0; i < len; ++i) index[i] = EMPTY; + for (i = 0, j = 0; i < k; ++i, j += grid->nprow) + bsendx_plist[i] = &index[j]; + + mem_use += 4.0*k*sizeof(int_t*) + 2.0*len*iword; + + /*------------------------------------------------------------ + PROPAGATE ROW SUBSCRIPTS AND VALUES OF A INTO L AND U BLOCKS. + THIS ACCOUNTS FOR ONE-PASS PROCESSING OF A, L AND U. + ------------------------------------------------------------*/ + + for (jb = 0; jb < nsupers; ++jb) { + pc = PCOL( jb, grid ); + if ( mycol == pc ) { /* Block column jb in my process column */ + fsupc = FstBlockC( jb ); + nsupc = SuperSize( jb ); + ljb = LBj( jb, grid ); /* Local block number */ + + /* Scatter A into SPA. */ + for (j = fsupc, dense_col = dense; j < FstBlockC( jb+1 ); ++j){ + for (i = xa_begin[j]; i < xa_end[j]; ++i) { + irow = asub[i]; + gb = BlockNum( irow ); + if ( myrow == PROW( gb, grid ) ) { + lb = LBi( gb, grid ); + irow = ilsum[lb] + irow - FstBlockC( gb ); + dense_col[irow] = a[i]; + } + } + dense_col += ldaspa; + } + + jbrow = PROW( jb, grid ); + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + /*------------------------------------------------ + * SET UP U BLOCKS. + *------------------------------------------------*/ + kseen = 0; + dense_col = dense; + /* Loop through each column in the block column. */ + for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { + istart = xusub[j]; + /* NOTE: Only the first nonzero index of the segment + is stored in usub[]. */ + for (i = istart; i < xusub[j+1]; ++i) { + irow = usub[i]; /* First nonzero in the segment. */ + gb = BlockNum( irow ); + pr = PROW( gb, grid ); + if ( pr != jbrow && + myrow == jbrow && /* diag. proc. owning jb */ + bsendx_plist[ljb][pr] == EMPTY ) { + bsendx_plist[ljb][pr] = YES; + ++nbsendx; + } + if ( myrow == pr ) { + lb = LBi( gb, grid ); /* Local block number */ + index = Ufstnz_br_ptr[lb]; + uval = Unzval_br_ptr[lb]; + fsupc1 = FstBlockC( gb+1 ); + if (rb_marker[lb] <= jb) { /* First time see + the block */ + rb_marker[lb] = jb + 1; + Urb_indptr[lb] = Urb_fstnz[lb];; + index[Urb_indptr[lb]] = jb; /* Descriptor */ + Urb_indptr[lb] += UB_DESCRIPTOR; + /* Record the first location in index[] of the + next block */ + Urb_fstnz[lb] = Urb_indptr[lb] + nsupc; + len = Urb_indptr[lb];/* Start fstnz in index */ + index[len-1] = 0; + for (k = 0; k < nsupc; ++k) + index[len+k] = fsupc1; + if ( gb != jb )/* Exclude diagonal block. */ + ++bmod[lb];/* Mod. count for back solve */ + if ( kseen == 0 && myrow != jbrow ) { + ++nbrecvx; + kseen = 1; + } + } else { /* Already saw the block */ + len = Urb_indptr[lb];/* Start fstnz in index */ + } + jj = j - fsupc; + index[len+jj] = irow; + /* Load the numerical values */ + k = fsupc1 - irow; /* No. 
of nonzeros in segment */ + index[len-1] += k; /* Increment block length in + Descriptor */ + irow = ilsum[lb] + irow - FstBlockC( gb ); + for (ii = 0; ii < k; ++ii) { + uval[Urb_length[lb]++] = dense_col[irow + ii]; + dense_col[irow + ii] = zero; + } + } /* if myrow == pr ... */ + } /* for i ... */ + dense_col += ldaspa; + } /* for j ... */ + +#if ( PROFlevel>=1 ) + t_u += SuperLU_timer_() - t; + t = SuperLU_timer_(); +#endif + + /*------------------------------------------------ + * SET UP L BLOCKS. + *------------------------------------------------*/ + + /* Count number of blocks and length of each block. */ + nrbl = 0; + len = 0; /* Number of row subscripts I own. */ + kseen = 0; + istart = xlsub[fsupc]; + for (i = istart; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + gb = BlockNum( irow ); /* Global block number */ + pr = PROW( gb, grid ); /* Process row owning this block */ + if ( pr != jbrow && + myrow == jbrow && /* diag. proc. owning jb */ + fsendx_plist[ljb][pr] == EMPTY /* first time */ ) { + fsendx_plist[ljb][pr] = YES; + ++nfsendx; + } + if ( myrow == pr ) { + lb = LBi( gb, grid ); /* Local block number */ + if (rb_marker[lb] <= jb) { /* First see this block */ + rb_marker[lb] = jb + 1; + Lrb_length[lb] = 1; + Lrb_number[nrbl++] = gb; + if ( gb != jb ) /* Exclude diagonal block. */ + ++fmod[lb]; /* Mod. count for forward solve */ + if ( kseen == 0 && myrow != jbrow ) { + ++nfrecvx; + kseen = 1; + } +#if ( PRNTlevel>=1 ) + ++nLblocks; +#endif + } else { + ++Lrb_length[lb]; + } + ++len; + } + } /* for i ... */ + + if ( nrbl ) { /* Do not ensure the blocks are sorted! */ + /* Set up the initial pointers for each block in + index[] and nzval[]. */ + /* Add room for descriptors */ + len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; + if ( !(index = intMalloc_dist(len1)) ) + ABORT("Malloc fails for index[]"); + if (!(lusup = (float*)SUPERLU_MALLOC(len*nsupc * sizeof(float)))) + ABORT("Malloc fails for lusup[]"); + if ( !(Lindval_loc_bc_ptr[ljb] = intCalloc_dist(nrbl*3) )) + ABORT("Malloc fails for Lindval_loc_bc_ptr[ljb][]"); + if (!(Linv_bc_ptr[ljb] = (float*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(float)))) + ABORT("Malloc fails for Linv_bc_ptr[ljb][]"); + if (!(Uinv_bc_ptr[ljb] = (float*)SUPERLU_MALLOC(nsupc*nsupc * sizeof(float)))) + ABORT("Malloc fails for Uinv_bc_ptr[ljb][]"); + mybufmax[0] = SUPERLU_MAX( mybufmax[0], len1 ); + mybufmax[1] = SUPERLU_MAX( mybufmax[1], len*nsupc ); + mybufmax[4] = SUPERLU_MAX( mybufmax[4], len ); + index[0] = nrbl; /* Number of row blocks */ + index[1] = len; /* LDA of the nzval[] */ + next_lind = BC_HEADER; + next_lval = 0; + for (k = 0; k < nrbl; ++k) { + gb = Lrb_number[k]; + lb = LBi( gb, grid ); + len = Lrb_length[lb]; + Lindval_loc_bc_ptr[ljb][k] = lb; + Lindval_loc_bc_ptr[ljb][k+nrbl] = next_lind; + Lindval_loc_bc_ptr[ljb][k+nrbl*2] = next_lval; + Lrb_length[lb] = 0; /* Reset vector of block length */ + index[next_lind++] = gb; /* Descriptor */ + index[next_lind++] = len; + Lrb_indptr[lb] = next_lind; + Lrb_valptr[lb] = next_lval; + next_lind += len; + next_lval += len; + } + /* Propagate the compressed row subscripts to Lindex[], and + the initial values of A from SPA into Lnzval[]. 
*/ + + len = index[1]; /* LDA of lusup[] */ + for (i = istart; i < xlsub[fsupc+1]; ++i) { + irow = lsub[i]; + gb = BlockNum( irow ); + if ( myrow == PROW( gb, grid ) ) { + lb = LBi( gb, grid ); + k = Lrb_indptr[lb]++; /* Random access a block */ + index[k] = irow; + k = Lrb_valptr[lb]++; + irow = ilsum[lb] + irow - FstBlockC( gb ); + for (j = 0, dense_col = dense; j < nsupc; ++j) { + lusup[k] = dense_col[irow]; + dense_col[irow] = zero; + k += len; + dense_col += ldaspa; + } + } + } /* for i ... */ + Lrowind_bc_ptr[ljb] = index; + Lnzval_bc_ptr[ljb] = lusup; + + + /* sort Lindval_loc_bc_ptr[ljb], Lrowind_bc_ptr[ljb] and Lnzval_bc_ptr[ljb] here*/ + if(nrbl>1){ + krow = PROW( jb, grid ); + if(myrow==krow){ /* skip the diagonal block */ + uu=nrbl-2; + lloc = &Lindval_loc_bc_ptr[ljb][1]; + }else{ + uu=nrbl-1; + lloc = Lindval_loc_bc_ptr[ljb]; + } + quickSortM(lloc,0,uu,nrbl,0,3); + } + + + if ( !(index_srt = intMalloc_dist(len1)) ) + ABORT("Malloc fails for index_srt[]"); + if (!(lusup_srt = (float*)SUPERLU_MALLOC(len*nsupc * sizeof(float)))) + ABORT("Malloc fails for lusup_srt[]"); + + idx_indx = BC_HEADER; + idx_lusup = 0; + for (jj=0;jj=1 ) + t_l += SuperLU_timer_() - t; +#endif + } /* if mycol == pc */ + + } /* for jb ... */ + + ///////////////////////////////////////////////////////////////// + + /* Set up additional pointers for the index and value arrays of U. + nub is the number of local block columns. */ + nub = CEILING( nsupers, grid->npcol); /* Number of local block columns. */ + if ( !(Urbs = (int_t *) intCalloc_dist(2*nub)) ) + ABORT("Malloc fails for Urbs[]"); /* Record number of nonzero + blocks in a block column. */ + Urbs1 = Urbs + nub; + if ( !(Ucb_indptr = SUPERLU_MALLOC(nub * sizeof(Ucb_indptr_t *))) ) + ABORT("Malloc fails for Ucb_indptr[]"); + if ( !(Ucb_valptr = SUPERLU_MALLOC(nub * sizeof(int_t *))) ) + ABORT("Malloc fails for Ucb_valptr[]"); + nlb = CEILING( nsupers, grid->nprow ); /* Number of local block rows. */ + + /* Count number of row blocks in a block column. + One pass of the skeleton graph of U. */ + for (lk = 0; lk < nlb; ++lk) { + usub1 = Ufstnz_br_ptr[lk]; + if ( usub1 ) { /* Not an empty block row. */ + /* usub1[0] -- number of column blocks in this block row. */ + i = BR_HEADER; /* Pointer in index array. */ + for (lb = 0; lb < usub1[0]; ++lb) { /* For all column blocks. */ + k = usub1[i]; /* Global block number */ + ++Urbs[LBj(k,grid)]; + i += UB_DESCRIPTOR + SuperSize( k ); + } + } + } + + /* Set up the vertical linked lists for the row blocks. + One pass of the skeleton graph of U. */ + for (lb = 0; lb < nub; ++lb) { + if ( Urbs[lb] ) { /* Not an empty block column. */ + if ( !(Ucb_indptr[lb] + = SUPERLU_MALLOC(Urbs[lb] * sizeof(Ucb_indptr_t))) ) + ABORT("Malloc fails for Ucb_indptr[lb][]"); + if ( !(Ucb_valptr[lb] = (int_t *) intMalloc_dist(Urbs[lb])) ) + ABORT("Malloc fails for Ucb_valptr[lb][]"); + } + } + for (lk = 0; lk < nlb; ++lk) { /* For each block row. */ + usub1 = Ufstnz_br_ptr[lk]; + if ( usub1 ) { /* Not an empty block row. */ + i = BR_HEADER; /* Pointer in index array. */ + j = 0; /* Pointer in nzval array. */ + + for (lb = 0; lb < usub1[0]; ++lb) { /* For all column blocks. */ + k = usub1[i]; /* Global block number, column-wise. */ + ljb = LBj( k, grid ); /* Local block number, column-wise. 
*/ + Ucb_indptr[ljb][Urbs1[ljb]].lbnum = lk; + + Ucb_indptr[ljb][Urbs1[ljb]].indpos = i; + Ucb_valptr[ljb][Urbs1[ljb]] = j; + + ++Urbs1[ljb]; + j += usub1[i+1]; + i += UB_DESCRIPTOR + SuperSize( k ); + } + } + } + + +/* Count the nnzs per block column */ + for (lb = 0; lb < nub; ++lb) { + Unnz[lb] = 0; + k = lb * grid->npcol + mycol;/* Global block number, column-wise. */ + knsupc = SuperSize( k ); + for (ub = 0; ub < Urbs[lb]; ++ub) { + ik = Ucb_indptr[lb][ub].lbnum; /* Local block number, row-wise. */ + i = Ucb_indptr[lb][ub].indpos; /* Start of the block in usub[]. */ + i += UB_DESCRIPTOR; + gik = ik * grid->nprow + myrow;/* Global block number, row-wise. */ + iklrow = FstBlockC( gik+1 ); + for (jj = 0; jj < knsupc; ++jj) { + fnz = Ufstnz_br_ptr[ik][i + jj]; + if ( fnz < iklrow ) { + Unnz[lb] +=iklrow-fnz; + } + } /* for jj ... */ + } + } + + ///////////////////////////////////////////////////////////////// + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + /* construct the Bcast tree for L ... */ + + k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ + if ( !(LBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) ) + ABORT("Malloc fails for LBtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->nprow * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + if ( !(SeedSTD_BC = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_BC[]."); + + + for (i=0;icscp.comm); + + for (ljb = 0; ljb nprow*k)) ) + ABORT("Calloc fails for ActiveFlag[]."); + for (j=0;jnprow*k;++j)ActiveFlagAll[j]=3*nsupers; + for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */ + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow]=SUPERLU_MIN(ActiveFlagAll[pr+ljb*grid->nprow],gb); + } /* for j ... */ + } + } + + for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */ + + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow;++j)ActiveFlag[j]=ActiveFlagAll[j+ljb*grid->nprow]; + for (j=0;jnprow;++j)ActiveFlag[j+grid->nprow]=j; + for (j=0;jnprow;++j)ranks[j]=-1; + + Root=-1; + Iactive = 0; + for (j=0;jnprow;++j){ + if(ActiveFlag[j]!=3*nsupers){ + gb = ActiveFlag[j]; + pr = PROW( gb, grid ); + if(gb==jb)Root=pr; + if(myrow==pr)Iactive=1; + } + } + + + quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,0,2); + + if(Iactive==1){ + // printf("jb %5d damn\n",jb); + // fflush(stdout); + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->nprow; ++j){ + if(ActiveFlag[j]!=3*nsupers && ActiveFlag[j+grid->nprow]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->nprow]; + ++rank_cnt; + } + } + + if(rank_cnt>1){ + + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'s'); + BcTree_SetTag(LBtree_ptr[ljb],BC_L,'s'); + + // printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt); + // fflush(stdout); + + // if(iam==15 || iam==3){ + // printf("iam %5d btree lk %5d tag %5d root %5d\n",iam, ljb,jb,BcTree_IsRoot(LBtree_ptr[ljb],'s')); + // fflush(stdout); + // } + + // #if ( PRNTlevel>=1 ) + if(Root==myrow){ + rank_cnt_ref=1; + for (j = 0; j < grid->nprow; ++j) { + if ( fsendx_plist[ljb][j] != EMPTY ) { + ++rank_cnt_ref; + } + } + assert(rank_cnt==rank_cnt_ref); + + // printf("Partial Bcast Procs: col%7d np%4d\n",jb,rank_cnt); + + // // printf("Partial Bcast Procs: %4d %4d: ",iam, rank_cnt); + // // for(j=0;j=1 ) +t = SuperLU_timer_() - t; +if ( !iam) printf(".. 
Construct Bcast tree for L: %.2f\t\n", t); +#endif + + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + /* construct the Reduce tree for L ... */ + /* the following is used as reference */ + nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(mod_bit = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for mod_bit[]."); + if ( !(frecv = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for frecv[]."); + + for (k = 0; k < nlb; ++k) mod_bit[k] = 0; + for (k = 0; k < nsupers; ++k) { + pr = PROW( k, grid ); + if ( myrow == pr ) { + lib = LBi( k, grid ); /* local block number */ + kcol = PCOL( k, grid ); + if (mycol == kcol || fmod[lib] ) + mod_bit[lib] = 1; /* contribution from off-diagonal and diagonal*/ + } + } + /* Every process receives the count, but it is only useful on the + diagonal processes. */ + MPI_Allreduce( mod_bit, frecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm); + + + + k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(LRtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) ) + ABORT("Malloc fails for LRtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->npcol * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + + // if ( !(idxs = intCalloc_dist(nsupers)) ) + // ABORT("Calloc fails for idxs[]."); + + // if ( !(nzrows = (int_t**)SUPERLU_MALLOC(nsupers * sizeof(int_t*))) ) + // ABORT("Malloc fails for nzrows[]."); + + if ( !(SeedSTD_RD = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_RD[]."); + + for (i=0;irscp.comm); + + + // for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */ + // fsupc = FstBlockC( jb ); + // len=xlsub[fsupc+1]-xlsub[fsupc]; + // idxs[jb] = len-1; + // if(len>0){ + // if ( !(nzrows[jb] = intMalloc_dist(len)) ) + // ABORT("Malloc fails for nzrows[jb]"); + // for(i=xlsub[fsupc];inpcol*k)) ) + ABORT("Calloc fails for ActiveFlagAll[]."); + for (j=0;jnpcol*k;++j)ActiveFlagAll[j]=-3*nsupers; + + for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... 
*/ + fsupc = FstBlockC( jb ); + pc = PCOL( jb, grid ); + for(i=xlsub[fsupc];inpcol]=SUPERLU_MAX(ActiveFlagAll[pc+lib*grid->npcol],jb); + } + } + } + + + for (lib=0;libnprow; /* not sure */ + if(ibnpcol;++j)ActiveFlag[j]=ActiveFlagAll[j+lib*grid->npcol];; + for (j=0;jnpcol;++j)ActiveFlag[j+grid->npcol]=j; + for (j=0;jnpcol;++j)ranks[j]=-1; + Root=-1; + Iactive = 0; + + for (j=0;jnpcol;++j){ + if(ActiveFlag[j]!=-3*nsupers){ + jb = ActiveFlag[j]; + pc = PCOL( jb, grid ); + if(jb==ib)Root=pc; + if(mycol==pc)Iactive=1; + } + } + + + quickSortM(ActiveFlag,0,grid->npcol-1,grid->npcol,1,2); + + if(Iactive==1){ + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->npcol; ++j){ + if(ActiveFlag[j]!=-3*nsupers && ActiveFlag[j+grid->npcol]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->npcol]; + ++rank_cnt; + } + } + if(rank_cnt>1){ + + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'s'); + RdTree_SetTag(LRtree_ptr[lib], RD_L,'s'); + // } + + // printf("iam %5d rtree rank_cnt %5d \n",iam,rank_cnt); + // fflush(stdout); + + // if(ib==15 || ib ==16){ + + // if(iam==15 || iam==3){ + // printf("iam %5d rtree lk %5d tag %5d root %5d\n",iam,lib,ib,RdTree_IsRoot(LRtree_ptr[lib],'s')); + // fflush(stdout); + // } + + + // #if ( PRNTlevel>=1 ) + // if(Root==mycol){ + // assert(rank_cnt==frecv[lib]); + // printf("Partial Reduce Procs: row%7d np%4d\n",ib,rank_cnt); + // // printf("Partial Reduce Procs: %4d %4d: ",iam, rank_cnt); + // // // for(j=0;j=1 ) +t = SuperLU_timer_() - t; +if ( !iam) printf(".. Construct Reduce tree for L: %.2f\t\n", t); +#endif + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + + /* construct the Bcast tree for U ... */ + + k = CEILING( nsupers, grid->npcol );/* Number of local block columns */ + if ( !(UBtree_ptr = (BcTree*)SUPERLU_MALLOC(k * sizeof(BcTree))) ) + ABORT("Malloc fails for UBtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->nprow*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->nprow * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + if ( !(SeedSTD_BC = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_BC[]."); + + for (i=0;icscp.comm); + + + for (ljb = 0; ljb nprow*k)) ) + ABORT("Calloc fails for ActiveFlagAll[]."); + for (j=0;jnprow*k;++j)ActiveFlagAll[j]=-3*nsupers; + + for (ljb = 0; ljb < k; ++ljb) { /* for each local block column ... */ + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow]=SUPERLU_MAX(ActiveFlagAll[pr+ljb*grid->nprow],gb); + // printf("gb:%5d jb: %5d nsupers: %5d\n",gb,jb,nsupers); + // fflush(stdout); + //if(gb==jb)Root=pr; + } + + + } + pr = PROW( jb, grid ); // take care of diagonal node stored as L + // printf("jb %5d current: %5d",jb,ActiveFlagAll[pr+ljb*grid->nprow]); + // fflush(stdout); + ActiveFlagAll[pr+ljb*grid->nprow]=SUPERLU_MAX(ActiveFlagAll[pr+ljb*grid->nprow],jb); + } + } + + + + for (ljb = 0; ljb < k; ++ljb) { /* for each block column ... 
*/ + jb = mycol+ljb*grid->npcol; /* not sure */ + if(jbnprow;++j)ActiveFlag[j]=ActiveFlagAll[j+ljb*grid->nprow]; + for (j=0;jnprow;++j)ActiveFlag[j+grid->nprow]=j; + for (j=0;jnprow;++j)ranks[j]=-1; + + Root=-1; + Iactive = 0; + for (j=0;jnprow;++j){ + if(ActiveFlag[j]!=-3*nsupers){ + gb = ActiveFlag[j]; + pr = PROW( gb, grid ); + if(gb==jb)Root=pr; + if(myrow==pr)Iactive=1; + } + } + + quickSortM(ActiveFlag,0,grid->nprow-1,grid->nprow,1,2); + // printf("jb: %5d Iactive %5d\n",jb,Iactive); + // fflush(stdout); + if(Iactive==1){ + // printf("root:%5d jb: %5d\n",Root,jb); + // fflush(stdout); + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->nprow; ++j){ + if(ActiveFlag[j]!=-3*nsupers && ActiveFlag[j+grid->nprow]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->nprow]; + ++rank_cnt; + } + } + // printf("jb: %5d rank_cnt %5d\n",jb,rank_cnt); + // fflush(stdout); + if(rank_cnt>1){ + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_BC[ljb],'s'); + BcTree_SetTag(UBtree_ptr[ljb],BC_U,'s'); + + // printf("iam %5d btree rank_cnt %5d \n",iam,rank_cnt); + // fflush(stdout); + + if(Root==myrow){ + rank_cnt_ref=1; + for (j = 0; j < grid->nprow; ++j) { + // printf("ljb %5d j %5d nprow %5d\n",ljb,j,grid->nprow); + // fflush(stdout); + if ( bsendx_plist[ljb][j] != EMPTY ) { + ++rank_cnt_ref; + } + } + // printf("ljb %5d rank_cnt %5d rank_cnt_ref %5d\n",ljb,rank_cnt,rank_cnt_ref); + // fflush(stdout); + assert(rank_cnt==rank_cnt_ref); + } + } + } + } + } + SUPERLU_FREE(ActiveFlag); + SUPERLU_FREE(ActiveFlagAll); + SUPERLU_FREE(ranks); + SUPERLU_FREE(SeedSTD_BC); + +#if ( PROFlevel>=1 ) +t = SuperLU_timer_() - t; +if ( !iam) printf(".. Construct Bcast tree for U: %.2f\t\n", t); +#endif + +#if ( PROFlevel>=1 ) + t = SuperLU_timer_(); +#endif + /* construct the Reduce tree for U ... */ + /* the following is used as reference */ + nlb = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(mod_bit = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for mod_bit[]."); + if ( !(brecv = intMalloc_dist(nlb)) ) + ABORT("Malloc fails for brecv[]."); + + for (k = 0; k < nlb; ++k) mod_bit[k] = 0; + for (k = 0; k < nsupers; ++k) { + pr = PROW( k, grid ); + if ( myrow == pr ) { + lib = LBi( k, grid ); /* local block number */ + kcol = PCOL( k, grid ); + if (mycol == kcol || bmod[lib] ) + mod_bit[lib] = 1; /* contribution from off-diagonal and diagonal*/ + } + } + /* Every process receives the count, but it is only useful on the + diagonal processes. */ + MPI_Allreduce( mod_bit, brecv, nlb, mpi_int_t, MPI_SUM, grid->rscp.comm); + + + + k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(URtree_ptr = (RdTree*)SUPERLU_MALLOC(k * sizeof(RdTree))) ) + ABORT("Malloc fails for URtree_ptr[]."); + if ( !(ActiveFlag = intCalloc_dist(grid->npcol*2)) ) + ABORT("Calloc fails for ActiveFlag[]."); + if ( !(ranks = (int*)SUPERLU_MALLOC(grid->npcol * sizeof(int))) ) + ABORT("Malloc fails for ranks[]."); + + // if ( !(idxs = intCalloc_dist(nsupers)) ) + // ABORT("Calloc fails for idxs[]."); + + // if ( !(nzrows = (int_t**)SUPERLU_MALLOC(nsupers * sizeof(int_t*))) ) + // ABORT("Malloc fails for nzrows[]."); + + if ( !(SeedSTD_RD = (double*)SUPERLU_MALLOC(k * sizeof(double))) ) + ABORT("Malloc fails for SeedSTD_RD[]."); + + for (i=0;irscp.comm); + + + // for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... 
*/ + // fsupc = FstBlockC( jb ); + // len=0; + // for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { + // istart = xusub[j]; + // /* NOTE: Only the first nonzero index of the segment + // is stored in usub[]. */ + // len += xusub[j+1] - xusub[j]; + // } + + // idxs[jb] = len-1; + + // if(len>0){ + // if ( !(nzrows[jb] = intMalloc_dist(len)) ) + // ABORT("Malloc fails for nzrows[jb]"); + + // fsupc = FstBlockC( jb ); + + // len=0; + + // for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { + // istart = xusub[j]; + // /* NOTE: Only the first nonzero index of the segment + // is stored in usub[]. */ + // for (i = istart; i < xusub[j+1]; ++i) { + // irow = usub[i]; /* First nonzero in the segment. */ + // nzrows[jb][len]=irow; + // len++; + // } + // } + // quickSort(nzrows[jb],0,len-1,0); + // } + // else{ + // nzrows[jb] = NULL; + // } + // } + + + for (lib = 0; lib npcol*k)) ) + ABORT("Calloc fails for ActiveFlagAll[]."); + for (j=0;jnpcol*k;++j)ActiveFlagAll[j]=3*nsupers; + + for (jb = 0; jb < nsupers; ++jb) { /* for each block column ... */ + fsupc = FstBlockC( jb ); + pc = PCOL( jb, grid ); + + fsupc = FstBlockC( jb ); + for (j = fsupc; j < FstBlockC( jb+1 ); ++j) { + istart = xusub[j]; + /* NOTE: Only the first nonzero index of the segment + is stored in usub[]. */ + for (i = istart; i < xusub[j+1]; ++i) { + irow = usub[i]; /* First nonzero in the segment. */ + ib = BlockNum( irow ); + pr = PROW( ib, grid ); + if ( myrow == pr ) { /* Block row ib in my process row */ + lib = LBi( ib, grid ); /* Local block number */ + ActiveFlagAll[pc+lib*grid->npcol]=SUPERLU_MIN(ActiveFlagAll[pc+lib*grid->npcol],jb); + } + } + } + + pr = PROW( jb, grid ); + if ( myrow == pr ) { /* Block row ib in my process row */ + lib = LBi( jb, grid ); /* Local block number */ + ActiveFlagAll[pc+lib*grid->npcol]=SUPERLU_MIN(ActiveFlagAll[pc+lib*grid->npcol],jb); + } + } + + + for (lib=0;libnprow; /* not sure */ + if(ibnpcol;++j)ActiveFlag[j]=ActiveFlagAll[j+lib*grid->npcol];; + for (j=0;jnpcol;++j)ActiveFlag[j+grid->npcol]=j; + for (j=0;jnpcol;++j)ranks[j]=-1; + Root=-1; + Iactive = 0; + + for (j=0;jnpcol;++j){ + if(ActiveFlag[j]!=3*nsupers){ + jb = ActiveFlag[j]; + pc = PCOL( jb, grid ); + if(jb==ib)Root=pc; + if(mycol==pc)Iactive=1; + } + } + + quickSortM(ActiveFlag,0,grid->npcol-1,grid->npcol,0,2); + + if(Iactive==1){ + assert( Root>-1 ); + rank_cnt = 1; + ranks[0]=Root; + for (j = 0; j < grid->npcol; ++j){ + if(ActiveFlag[j]!=3*nsupers && ActiveFlag[j+grid->npcol]!=Root){ + ranks[rank_cnt]=ActiveFlag[j+grid->npcol]; + ++rank_cnt; + } + } + if(rank_cnt>1){ + + for (ii=0;iicomm, ranks, rank_cnt, msgsize,SeedSTD_RD[lib],'s'); + RdTree_SetTag(URtree_ptr[lib], RD_U,'s'); + // } + + // #if ( PRNTlevel>=1 ) + if(Root==mycol){ + // printf("Partial Reduce Procs: %4d %4d %5d \n",iam, rank_cnt,brecv[lib]); + // fflush(stdout); + assert(rank_cnt==brecv[lib]); + // printf("Partial Reduce Procs: row%7d np%4d\n",ib,rank_cnt); + // printf("Partial Reduce Procs: %4d %4d: ",iam, rank_cnt); + // // for(j=0;j=1 ) +t = SuperLU_timer_() - t; +if ( !iam) printf(".. 
Construct Reduce tree for U: %.2f\t\n", t); +#endif + + //////////////////////////////////////////////////////// + + + Llu->Lrowind_bc_ptr = Lrowind_bc_ptr; + Llu->Lindval_loc_bc_ptr = Lindval_loc_bc_ptr; + Llu->Lnzval_bc_ptr = Lnzval_bc_ptr; + Llu->Ufstnz_br_ptr = Ufstnz_br_ptr; + Llu->Unzval_br_ptr = Unzval_br_ptr; + Llu->Unnz = Unnz; + Llu->ToRecv = ToRecv; + Llu->ToSendD = ToSendD; + Llu->ToSendR = ToSendR; + Llu->fmod = fmod; + Llu->fsendx_plist = fsendx_plist; + Llu->nfrecvx = nfrecvx; + Llu->nfsendx = nfsendx; + Llu->bmod = bmod; + Llu->bsendx_plist = bsendx_plist; + Llu->nbrecvx = nbrecvx; + Llu->nbsendx = nbsendx; + Llu->ilsum = ilsum; + Llu->ldalsum = ldaspa; + Llu->LRtree_ptr = LRtree_ptr; + Llu->LBtree_ptr = LBtree_ptr; + Llu->URtree_ptr = URtree_ptr; + Llu->UBtree_ptr = UBtree_ptr; + Llu->Linv_bc_ptr = Linv_bc_ptr; + Llu->Uinv_bc_ptr = Uinv_bc_ptr; + Llu->Urbs = Urbs; + Llu->Ucb_indptr = Ucb_indptr; + Llu->Ucb_valptr = Ucb_valptr; + +#if ( PRNTlevel>=1 ) + if ( !iam ) printf(".. # L blocks " IFMT "\t# U blocks " IFMT "\n", + nLblocks, nUblocks); +#endif + + SUPERLU_FREE(rb_marker); + SUPERLU_FREE(Urb_fstnz); + SUPERLU_FREE(Urb_length); + SUPERLU_FREE(Urb_indptr); + SUPERLU_FREE(Lrb_length); + SUPERLU_FREE(Lrb_number); + SUPERLU_FREE(Lrb_indptr); + SUPERLU_FREE(Lrb_valptr); + SUPERLU_FREE(dense); + + k = CEILING( nsupers, grid->nprow );/* Number of local block rows */ + if ( !(Llu->mod_bit = intMalloc_dist(k)) ) + ABORT("Malloc fails for mod_bit[]."); + + /* Find the maximum buffer size. */ + MPI_Allreduce(mybufmax, Llu->bufmax, NBUFFERS, mpi_int_t, MPI_MAX, grid->comm); + +#if ( PROFlevel>=1 ) + if ( !iam ) printf(".. 1st distribute time:\n " + "\tL\t%.2f\n\tU\t%.2f\n" + "\tu_blks %d\tnrbu %d\n--------\n", + t_l, t_u, u_blks, nrbu); +#endif + + } /* else fact != SamePattern_SameRowPerm */ + +#if ( DEBUGlevel>=1 ) + /* Memory allocated but not freed: + ilsum, fmod, fsendx_plist, bmod, bsendx_plist */ + CHECK_MALLOC(iam, "Exit sdistribute()"); +#endif + + return (mem_use); +} /* SDISTRIBUTE */ + diff --git a/SRC/sgather.c b/SRC/sgather.c new file mode 100644 index 00000000..3a3f29b3 --- /dev/null +++ b/SRC/sgather.c @@ -0,0 +1,394 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Various gather routines. + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ * </pre>
+ */
+#include <stdio.h>
+#include "superlu_sdefs.h"
+#if 0
+#include "scatter.h"
+#include "sec_structs.h"
+#include "superlu_defs.h"
+#include "gather.h"
+#endif
+
+int_t sprintMatrix(char*s, int n, int m, float* A, int LDA)
+{
+    printf("%s\n", s );
+    for(int i = 0; i < n; i++)
+    {
+        for(int j = 0; j < m; j++) printf("%g ", A[i + j*LDA]);
+        printf("\n");
+    }
+    return 0;
+}
+
+/* ... sgather_l() and sgather_u() copy kernels omitted here ... */
+
+void sRgather_L( int_t k, int_t *lsub, float *lusup, gEtreeInfo_t *gEtreeInfo,
+                 Glu_persist_t *Glu_persist, gridinfo_t *grid, HyP_t *HyP,
+                 int_t *myIperm, int_t *iperm_c_supno )
+{
+    int_t ib, temp_nbrow;
+    int_t cum_nrow = 0;
+    int_t *xsup = Glu_persist->xsup;
+    int_t knsupc = SuperSize (k);
+    int_t krow = PROW (k, grid);
+    int_t nlb, lptr0, luptr0;
+    int_t iam = grid->iam;
+    int_t myrow = MYROW (iam, grid);
+
+    HyP->lookAheadBlk = 0, HyP->RemainBlk = 0;
+
+    int_t nsupr = lsub[1];  /* LDA of lusup. */
+    if (myrow == krow)  /* Skip diagonal block L(k,k). */
+    {
+        lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER + 1];
+        luptr0 = knsupc;
+        nlb = lsub[0] - 1;
+    }
+    else
+    {
+        lptr0 = BC_HEADER;
+        luptr0 = 0;
+        nlb = lsub[0];
+    }
+    // printf("nLb =%d ", nlb );
+
+    int_t lptr = lptr0;
+    int_t luptr = luptr0;
+    for (int_t i = 0; i < nlb; ++i)
+    {
+        ib = lsub[lptr];        /* Row block L(i,k). */
+        temp_nbrow = lsub[lptr + 1]; /* Number of full rows. */
+
+        int_t look_up_flag = 1;
+
+        // if elimination order is greater than first block stored on GPU
+        if (iperm_c_supno[ib] < HyP->first_u_block_acc) look_up_flag = 0;
+
+        // if myIperm[ib] is within the look-ahead window
+        if (myIperm[ib]< myIperm[k] + HyP->nCudaStreams && myIperm[ib]>0) look_up_flag = 0;        
+
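+        // if ib is k's parent in the supernodal etree and k is its last
+        // unfinished child, also keep ib on the CPU look-ahead path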
+        if (k <= HyP->nsupers - 2 && gEtreeInfo->setree[k] > 0 )
+        {
+            int_t k_parent = gEtreeInfo->setree[k];
+            if (ib == k_parent && gEtreeInfo->numChildLeft[k_parent]==1 )
+            {
+                look_up_flag = 0;
+            }
+        }
+        // look_up_flag = 0;
+        if (!look_up_flag)
+        {
+            /* ib is within the look-ahead window */
+            HyP->lookAhead_info[HyP->lookAheadBlk].nrows = temp_nbrow;
+            if (HyP->lookAheadBlk == 0)
+            {
+                HyP->lookAhead_info[HyP->lookAheadBlk].FullRow = temp_nbrow;
+            }
+            else
+            {
+                HyP->lookAhead_info[HyP->lookAheadBlk].FullRow
+                    = temp_nbrow + HyP->lookAhead_info[HyP->lookAheadBlk - 1].FullRow;
+            }
+            HyP->lookAhead_info[HyP->lookAheadBlk].StRow = cum_nrow;
+            HyP->lookAhead_info[HyP->lookAheadBlk].lptr = lptr;
+            HyP->lookAhead_info[HyP->lookAheadBlk].ib = ib;
+            HyP->lookAheadBlk++;
+        }
+        else
+        {
+            /* ib is not in the look-ahead window */
+            HyP->Remain_info[HyP->RemainBlk].nrows = temp_nbrow;
+            if (HyP->RemainBlk == 0)
+            {
+                HyP->Remain_info[HyP->RemainBlk].FullRow = temp_nbrow;
+            }
+            else
+            {
+                HyP->Remain_info[HyP->RemainBlk].FullRow
+                    = temp_nbrow + HyP->Remain_info[HyP->RemainBlk - 1].FullRow;
+            }
+            HyP->Remain_info[HyP->RemainBlk].StRow = cum_nrow;
+            HyP->Remain_info[HyP->RemainBlk].lptr = lptr;
+            HyP->Remain_info[HyP->RemainBlk].ib = ib;
+            HyP->RemainBlk++;
+        }
+
+        cum_nrow += temp_nbrow;
+
+        lptr += LB_DESCRIPTOR;  /* Skip descriptor. */
+        lptr += temp_nbrow;
+        luptr += temp_nbrow;
+    }
+    lptr = lptr0;
+    luptr = luptr0;
+
+    sgather_l( HyP->lookAheadBlk, knsupc, HyP->lookAhead_info,
+               &lusup[luptr], nsupr, HyP->lookAhead_L_buff);
+
+    sgather_l( HyP->RemainBlk, knsupc, HyP->Remain_info,
+               &lusup[luptr], nsupr, HyP->Remain_L_buff);
+
+    assert(HyP->lookAheadBlk + HyP->RemainBlk ==nlb );
+    HyP->Lnbrow = HyP->lookAheadBlk == 0 ? 0 : HyP->lookAhead_info[HyP->lookAheadBlk - 1].FullRow;
+    HyP->Rnbrow = HyP->RemainBlk == 0 ? 0 : HyP->Remain_info[HyP->RemainBlk - 1].FullRow;
+
+    // sprintMatrix("LookAhead Block", HyP->Lnbrow, knsupc, HyP->lookAhead_L_buff, HyP->Lnbrow);
+    // sprintMatrix("Remaining Block", HyP->Rnbrow, knsupc, HyP->Remain_L_buff, HyP->Rnbrow);
+}
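
The FullRow field maintained above is a running prefix sum over the row blocks routed to each buffer, so the last entry gives the total row count (what becomes Lnbrow/Rnbrow below). A minimal standalone sketch of that bookkeeping; blk_info and accumulate_rows are illustrative stand-ins, not SuperLU names:

#include <stdio.h>

/* Reduced stand-in for the lookAhead_info/Remain_info entries; only the
   two fields this sketch needs. */
typedef struct { int nrows; int FullRow; } blk_info;

/* Running prefix sum over block row counts, as sRgather_L maintains it. */
static int accumulate_rows(blk_info *info, const int *nrows, int nblk)
{
    for (int i = 0; i < nblk; ++i) {
        info[i].nrows   = nrows[i];
        info[i].FullRow = nrows[i] + (i > 0 ? info[i - 1].FullRow : 0);
    }
    return nblk > 0 ? info[nblk - 1].FullRow : 0;  /* total gathered rows */
}

int main(void)
{
    blk_info info[3];
    const int nrows[3] = { 4, 2, 5 };
    printf("total rows = %d\n", accumulate_rows(info, nrows, 3)); /* 11 */
    return 0;
}
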
+
+// void Rgather_U(int_t k,
+//                 HyP_t *HyP,
+//                int_t st, int_t end,
+//                int_t *usub, double *uval, double *bigU,
+//                Glu_persist_t *Glu_persist, gridinfo_t *grid,
+//                int_t *perm_u)
+
+void sRgather_U( int_t k, int_t jj0, int_t *usub,	float *uval,
+                 float *bigU, gEtreeInfo_t* gEtreeInfo,	
+                 Glu_persist_t *Glu_persist, gridinfo_t *grid, HyP_t *HyP,
+                 int_t* myIperm, int_t *iperm_c_supno, int_t *perm_u)
+{
+    HyP->ldu   = 0;
+    HyP->num_u_blks = 0;
+    HyP->ldu_Phi = 0;
+    HyP->num_u_blks_Phi = 0;
+
+    int_t iukp = BR_HEADER;   /* Skip header; Pointer to index[] of U(k,:) */
+    int_t rukp = 0;           /* Pointer to nzval[] of U(k,:) */
+    int_t     nub = usub[0];      /* Number of blocks in the block row U(k,:) */
+    int_t *xsup = Glu_persist->xsup;
+    // int_t k = perm_c_supno[k0];
+    int_t klst = FstBlockC (k + 1);
+    int_t iukp0 = iukp;
+    int_t rukp0 = rukp;
+    int_t jb, ljb;
+    int_t nsupc;
+    int_t full = 1;
+    int_t full_Phi = 1;
+    int_t temp_ncols = 0;
+    int_t segsize;
+    HyP->num_u_blks = 0;
+    HyP->ldu = 0;
+
+    for (int_t j = jj0; j < nub; ++j)
+    {
+        temp_ncols = 0;
+        arrive_at_ublock(
+            j, &iukp, &rukp, &jb, &ljb, &nsupc,
+            iukp0, rukp0, usub, perm_u, xsup, grid
+        );
+
+        for (int_t jj = iukp; jj < iukp + nsupc; ++jj)
+        {
+            segsize = klst - usub[jj];
+            if ( segsize ) ++temp_ncols;
+        }
+        /* here goes the condition whether block jb exists on the Phi or not */
+        int_t u_blk_acc_cond = 0;
+        // if (j == jj0) u_blk_acc_cond = 1;   /* must schedule first column on cpu */
+        if (iperm_c_supno[jb] < HyP->first_l_block_acc) 
+        {
+            // printf("k=%d jb=%d got at condition-1:%d, %d \n",k,jb, iperm_c_supno[jb] , HyP->first_l_block_acc);
+            u_blk_acc_cond = 1;
+        }
+        // if jb is within lookahead window
+        if (myIperm[jb]< myIperm[k] + HyP->nCudaStreams && myIperm[jb]>0)
+        {
+            // printf("k=%d jb=%d got at condition-2:%d, %d\n ",k,jb, myIperm[jb] , myIperm[k]);
+            u_blk_acc_cond = 1;
+        }
+ 
+        if (k <= HyP->nsupers - 2 && gEtreeInfo->setree[k] > 0 )
+        {
+            int_t k_parent = gEtreeInfo->setree[k];
+            if (jb == k_parent && gEtreeInfo->numChildLeft[k_parent]==1 )
+            {
+                // printf("k=%d jb=%d got at condition-3\n",k,jb);
+                u_blk_acc_cond = 1;
+            }
+        }
+
+
+        if (u_blk_acc_cond)
+        {
+            HyP->Ublock_info[HyP->num_u_blks].iukp = iukp;
+            HyP->Ublock_info[HyP->num_u_blks].rukp = rukp;
+            HyP->Ublock_info[HyP->num_u_blks].jb = jb;
+
+            for (int_t jj = iukp; jj < iukp + nsupc; ++jj)
+            {
+                segsize = klst - usub[jj];
+                if ( segsize )
+                {
+
+                    if ( segsize != HyP->ldu ) full = 0;
+                    if ( segsize > HyP->ldu ) HyP->ldu = segsize;
+                }
+            }
+
+            HyP->Ublock_info[HyP->num_u_blks].ncols = temp_ncols;
+            // ncols += temp_ncols;
+            HyP->num_u_blks++;
+        }
+        else
+        {
+            HyP->Ublock_info_Phi[HyP->num_u_blks_Phi].iukp = iukp;
+            HyP->Ublock_info_Phi[HyP->num_u_blks_Phi].rukp = rukp;
+            HyP->Ublock_info_Phi[HyP->num_u_blks_Phi].jb = jb;
+            HyP->Ublock_info_Phi[HyP->num_u_blks_Phi].eo =  HyP->nsupers - iperm_c_supno[jb]; /*since we want it to be in descending order*/
+
+            /* Prepare to call DGEMM. */
+
+
+            for (int_t jj = iukp; jj < iukp + nsupc; ++jj)
+            {
+                segsize = klst - usub[jj];
+                if ( segsize )
+                {
+
+                    if ( segsize != HyP->ldu_Phi ) full_Phi = 0;
+                    if ( segsize > HyP->ldu_Phi ) HyP->ldu_Phi = segsize;
+                }
+            }
+
+            HyP->Ublock_info_Phi[HyP->num_u_blks_Phi].ncols = temp_ncols;
+            // ncols_Phi += temp_ncols;
+            HyP->num_u_blks_Phi++;
+        }
+    }
+
+    /* Now do a prefix sum on ncols */
+    HyP->Ublock_info[0].full_u_cols = HyP->Ublock_info[0 ].ncols;
+    for (int_t j = 1; j < HyP->num_u_blks; ++j)
+    {
+        HyP->Ublock_info[j].full_u_cols = HyP->Ublock_info[j ].ncols + HyP->Ublock_info[j - 1].full_u_cols;
+    }
+
+    /*sorting u blocks based on elimination order */
+    // sort_U_info_elm(HyP->Ublock_info_Phi,HyP->num_u_blks_Phi );
+    HyP->Ublock_info_Phi[0].full_u_cols = HyP->Ublock_info_Phi[0 ].ncols;
+    for ( int_t j = 1; j < HyP->num_u_blks_Phi; ++j)
+    {
+        HyP->Ublock_info_Phi[j].full_u_cols = HyP->Ublock_info_Phi[j ].ncols + HyP->Ublock_info_Phi[j - 1].full_u_cols;
+    }
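+    /* e.g. (illustrative) ncols { 3, 2, 4 } accumulate to full_u_cols
+       { 3, 5, 9 }; the last entry is the total number of packed columns. */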
+
+    HyP->bigU_Phi = bigU;
+    if ( HyP->num_u_blks_Phi == 0 )  // Sherry fix
+	HyP->bigU_host = bigU;
+    else
+	HyP->bigU_host = bigU + HyP->ldu_Phi * HyP->Ublock_info_Phi[HyP->num_u_blks_Phi - 1].full_u_cols;
+
+    sgather_u(HyP->num_u_blks, HyP->Ublock_info, usub, uval, HyP->bigU_host,
+               HyP->ldu, xsup, klst );
+
+    sgather_u(HyP->num_u_blks_Phi, HyP->Ublock_info_Phi, usub, uval,
+               HyP->bigU_Phi,  HyP->ldu_Phi, xsup, klst );
+
+} /* sRgather_U */
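
The final pointer arithmetic in sRgather_U packs the accelerator (Phi) panels at the base of bigU and starts the host panels immediately after them. A self-contained sketch of that split; the buffer size and block counts here are made-up values for illustration only:

#include <stdio.h>

int main(void)
{
    float bigU[1024];          /* shared gather buffer */
    int   ldu_Phi  = 8;        /* leading dimension of the Phi part */
    int   cols_Phi = 16;       /* full_u_cols of the last Phi block */
    int   num_u_blks_Phi = 2;  /* set to 0 to exercise the "Sherry fix" path */

    float *bigU_Phi  = bigU;
    float *bigU_host = (num_u_blks_Phi == 0)
                     ? bigU                        /* nothing on the Phi  */
                     : bigU + ldu_Phi * cols_Phi;  /* skip the Phi panels */

    printf("host panels start %td floats into bigU\n", bigU_host - bigU_Phi);
    return 0; /* prints 128 */
}
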
diff --git a/SRC/sgsequ_dist.c b/SRC/sgsequ_dist.c
new file mode 100644
index 00000000..a133fe1c
--- /dev/null
+++ b/SRC/sgsequ_dist.c
@@ -0,0 +1,204 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file sgsequ_dist.c
+ * \brief Computes row and column scalings
+ *
+ * <pre>
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ *
+ * Modified from LAPACK routine SGEEQU
+ * </pre>
+ */
+/*
+ * History: Modified from LAPACK routine SGEEQU
+ */
+#include <math.h>
+#include "superlu_sdefs.h"
+
+
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose   
+ *   =======   
+ *
+ *   SGSEQU_DIST computes row and column scalings intended to equilibrate an   
+ *   M-by-N sparse matrix A and reduce its condition number. R returns the row
+ *   scale factors and C the column scale factors, chosen to try to make   
+ *   the largest element in each row and column of the matrix B with   
+ *   elements B(i,j)=R(i)*A(i,j)*C(j) have absolute value 1.   
+ *
+ *   R(i) and C(j) are restricted to be between SMLNUM = smallest safe   
+ *   number and BIGNUM = largest safe number.  Use of these scaling   
+ *   factors is not guaranteed to reduce the condition number of A but   
+ *   works well in practice.   
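+ *
+ *   For example (illustrative): for A = ( 1e+6  2e+6 ; 3e-4  4e-4 ),
+ *   the row factors R = ( 5e-7, 2.5e+3 ) and column factors C = ( 4/3, 1 )
+ *   give B = ( 2/3  1 ; 1  1 ), whose row and column maxima all equal 1;
+ *   here ROWCND = 2e-10 < 0.1, so row scaling is clearly worthwhile.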
+ *
+ *   See supermatrix.h for the definition of 'SuperMatrix' structure.
+ *
+ *   Arguments   
+ *   =========   
+ *
+ *   A       (input) SuperMatrix*
+ *           The matrix of dimension (A->nrow, A->ncol) whose equilibration
+ *           factors are to be computed. The type of A can be:
+ *           Stype = SLU_NC; Dtype = SLU_S; Mtype = SLU_GE.
+ *	    
+ *   R       (output) float*, size A->nrow
+ *           If INFO = 0 or INFO > M, R contains the row scale factors   
+ *           for A.
+ *	    
+ *   C       (output) float*, size A->ncol
+ *           If INFO = 0,  C contains the column scale factors for A.
+ *	    
+ *   ROWCND  (output) float*
+ *           If INFO = 0 or INFO > M, ROWCND contains the ratio of the   
+ *           smallest R(i) to the largest R(i).  If ROWCND >= 0.1 and   
+ *           AMAX is neither too large nor too small, it is not worth   
+ *           scaling by R.
+ *	    
+ *   COLCND  (output) float*
+ *           If INFO = 0, COLCND contains the ratio of the smallest   
+ *           C(i) to the largest C(i).  If COLCND >= 0.1, it is not   
+ *           worth scaling by C.
+ *	    
+ *   AMAX    (output) float*
+ *           Absolute value of largest matrix element.  If AMAX is very   
+ *           close to overflow or very close to underflow, the matrix   
+ *           should be scaled.
+ *	    
+ *   INFO    (output) int*
+ *           = 0:  successful exit   
+ *           < 0:  if INFO = -i, the i-th argument had an illegal value   
+ *           > 0:  if INFO = i,  and i is   
+ *                 <= A->nrow:  the i-th row of A is exactly zero   
+ *                 >  A->ncol:  the (i-M)-th column of A is exactly zero   
+ *
+ *   ===================================================================== 
+ * 
+ */ +void +sgsequ_dist(SuperMatrix *A, float *r, float *c, float *rowcnd, + float *colcnd, float *amax, int_t *info) +{ + + + /* Local variables */ + NCformat *Astore; + float *Aval; + int i, j, irow; + float rcmin, rcmax; + float bignum, smlnum; + + /* Test the input parameters. */ + *info = 0; + if ( A->nrow < 0 || A->ncol < 0 || + A->Stype != SLU_NC || A->Dtype != SLU_S || A->Mtype != SLU_GE ) + *info = -1; + if (*info != 0) { + i = -(*info); + xerr_dist("sgsequ_dist", &i); + return; + } + + /* Quick return if possible */ + if ( A->nrow == 0 || A->ncol == 0 ) { + *rowcnd = 1.; + *colcnd = 1.; + *amax = 0.; + return; + } + + Astore = (NCformat *) A->Store; + Aval = (float *) Astore->nzval; + + /* Get machine constants. */ + smlnum = smach_dist("S"); /* slamch_("S"); */ + bignum = 1. / smlnum; + + /* Compute row scale factors. */ + for (i = 0; i < A->nrow; ++i) r[i] = 0.; + + /* Find the maximum element in each row. */ + for (j = 0; j < A->ncol; ++j) + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { + irow = Astore->rowind[i]; + r[irow] = SUPERLU_MAX( r[irow], fabs(Aval[i]) ); + } + + /* Find the maximum and minimum scale factors. */ + rcmin = bignum; + rcmax = 0.; + for (i = 0; i < A->nrow; ++i) { + rcmax = SUPERLU_MAX(rcmax, r[i]); + rcmin = SUPERLU_MIN(rcmin, r[i]); + } + *amax = rcmax; + + if (rcmin == 0.) { + /* Find the first zero scale factor and return an error code. */ + for (i = 0; i < A->nrow; ++i) + if (r[i] == 0.) { + *info = i + 1; + return; + } + } else { + /* Invert the scale factors. */ + for (i = 0; i < A->nrow; ++i) + r[i] = 1. / SUPERLU_MIN( SUPERLU_MAX( r[i], smlnum ), bignum ); + /* Compute ROWCND = min(R(I)) / max(R(I)) */ + *rowcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); + } + + /* Compute column scale factors */ + for (j = 0; j < A->ncol; ++j) c[j] = 0.; + + /* Find the maximum element in each column, assuming the row + scalings computed above. */ + for (j = 0; j < A->ncol; ++j) + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { + irow = Astore->rowind[i]; + c[j] = SUPERLU_MAX( c[j], fabs(Aval[i]) * r[irow] ); + } + + /* Find the maximum and minimum scale factors. */ + rcmin = bignum; + rcmax = 0.; + for (j = 0; j < A->ncol; ++j) { + rcmax = SUPERLU_MAX(rcmax, c[j]); + rcmin = SUPERLU_MIN(rcmin, c[j]); + } + + if (rcmin == 0.) { + /* Find the first zero scale factor and return an error code. */ + for (j = 0; j < A->ncol; ++j) + if ( c[j] == 0. ) { + *info = A->nrow + j + 1; + return; + } + } else { + /* Invert the scale factors. */ + for (j = 0; j < A->ncol; ++j) + c[j] = 1. / SUPERLU_MIN( SUPERLU_MAX( c[j], smlnum ), bignum); + /* Compute COLCND = min(C(J)) / max(C(J)) */ + *colcnd = SUPERLU_MAX( rcmin, smlnum ) / SUPERLU_MIN( rcmax, bignum ); + } + + return; + +} /* sgsequ_dist */ + + diff --git a/SRC/slangs_dist.c b/SRC/slangs_dist.c new file mode 100644 index 00000000..0d81c5f5 --- /dev/null +++ b/SRC/slangs_dist.c @@ -0,0 +1,130 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file slangs_dist.c + * \brief Returns the value of the one norm, the infinity norm, or the element of largest value + * Modified from SuperLU routine SLANGS + * + *
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ *
+ * </pre>
+ */
+/*
+ * File name: slangs_dist.c
+ * History: Modified from lapack routine SLANGE
+ */
+#include <math.h>
+#include "superlu_sdefs.h"
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose   
+ *   =======   
+ *
+ *   SLANGS_DIST returns the value of the one norm, or the Frobenius norm, or 
+ *   the infinity norm, or the element of largest absolute value of a 
+ *   real matrix A.   
+ *
+ *   Description   
+ *   ===========   
+ *
+ *   SLANGS_DIST returns the value   
+ *
+ *      SLANGE = ( max(abs(A(i,j))), NORM = 'M' or 'm'   
+ *               (   
+ *               ( norm1(A),         NORM = '1', 'O' or 'o'   
+ *               (   
+ *               ( normI(A),         NORM = 'I' or 'i'   
+ *               (   
+ *               ( normF(A),         NORM = 'F', 'f', 'E' or 'e'   
+ *
+ *   where  norm1  denotes the  one norm of a matrix (maximum column sum), 
+ *   normI  denotes the  infinity norm  of a matrix  (maximum row sum) and 
+ *   normF  denotes the  Frobenius norm of a matrix (square root of sum of 
+ *   squares).  Note that  max(abs(A(i,j)))  is not a  matrix norm.   
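+ *
+ *   For example (illustrative): for A = ( 1  -2 ; 0  3 ),
+ *   norm1(A) = 5, normI(A) = 3, max(abs(A(i,j))) = 3 and normF(A) = sqrt(14).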
+ *
+ *   Arguments   
+ *   =========   
+ *
+ *   NORM    (input) CHARACTER*1   
+ *           Specifies the value to be returned in SLANGE as described above.   
+ *   A       (input) SuperMatrix*
+ *           The M by N sparse matrix A. 
+ *
+ *  =====================================================================
+ * 
+ */ + +float slangs_dist(char *norm, SuperMatrix *A) +{ + + /* Local variables */ + NCformat *Astore; + float *Aval; + int i, j, irow; + float value = 0.0, sum; + float *rwork; + + Astore = (NCformat *) A->Store; + Aval = (float *) Astore->nzval; + + if ( SUPERLU_MIN(A->nrow, A->ncol) == 0) { + value = 0.; + + } else if ( (strncmp(norm, "M", 1)==0 ) ) { + /* Find max(abs(A(i,j))). */ + value = 0.; + for (j = 0; j < A->ncol; ++j) + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) + value = SUPERLU_MAX( value, fabs( Aval[i]) ); + + } else if (strncmp(norm, "O", 1)==0 || *(unsigned char *)norm == '1') { + /* Find norm1(A). */ + value = 0.; + for (j = 0; j < A->ncol; ++j) { + sum = 0.; + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) + sum += fabs(Aval[i]); + value = SUPERLU_MAX(value,sum); + } + + } else if (strncmp(norm, "I", 1)==0) { + /* Find normI(A). */ + if ( !(rwork = (float *) SUPERLU_MALLOC(A->nrow * sizeof(float))) ) + ABORT("SUPERLU_MALLOC fails for rwork."); + for (i = 0; i < A->nrow; ++i) rwork[i] = 0.; + for (j = 0; j < A->ncol; ++j) + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; i++) { + irow = Astore->rowind[i]; + rwork[irow] += fabs(Aval[i]); + } + value = 0.; + for (i = 0; i < A->nrow; ++i) + value = SUPERLU_MAX(value, rwork[i]); + + SUPERLU_FREE (rwork); + + } else if (strncmp(norm, "F", 1)==0 || strncmp(norm, "E", 1)==0) { + /* Find normF(A). */ + ABORT("Not implemented."); + } else + ABORT("Illegal norm specified."); + + return (value); + +} /* slangs_dist */ + diff --git a/SRC/slaqgs_dist.c b/SRC/slaqgs_dist.c new file mode 100644 index 00000000..bef736b5 --- /dev/null +++ b/SRC/slaqgs_dist.c @@ -0,0 +1,154 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file slaqgs_dist.c + * \brief Equlibrates a general sprase matrix + * + *
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ * 
+ * Modified from LAPACK routine SLAQGE
+ * </pre>
+ */
+/*
+ * File name: slaqgs_dist.c
+ * History: Modified from LAPACK routine SLAQGE
+ */
+#include <math.h>
+#include "superlu_sdefs.h"
+
+/*! \brief
+ *
+ * <pre>
+ *   Purpose   
+ *   =======   
+ *
+ *   SLAQGS_DIST equilibrates a general sparse M by N matrix A using the row and   
+ *   column scaling factors in the vectors R and C.   
+ *
+ *   See supermatrix.h for the definition of 'SuperMatrix' structure.
+ *
+ *   Arguments   
+ *   =========   
+ *
+ *   A       (input/output) SuperMatrix*
+ *           On exit, the equilibrated matrix.  See EQUED for the form of 
+ *           the equilibrated matrix. The type of A can be:
+ *	    Stype = SLU_NC; Dtype = SLU_S; Mtype = SLU_GE.
+ *	    
+ *   R       (input) float*, dimension (A->nrow)
+ *           The row scale factors for A.
+ *	    
+ *   C       (input) float*, dimension (A->ncol)
+ *           The column scale factors for A.
+ *	    
+ *   ROWCND  (input) float
+ *           Ratio of the smallest R(i) to the largest R(i).
+ *	    
+ *   COLCND  (input) float
+ *           Ratio of the smallest C(i) to the largest C(i).
+ *	    
+ *   AMAX    (input) float
+ *           Absolute value of largest matrix entry.
+ *	    
+ *   EQUED   (output) char*
+ *           Specifies the form of equilibration that was done.   
+ *           = 'N':  No equilibration   
+ *           = 'R':  Row equilibration, i.e., A has been premultiplied by  
+ *                   diag(R).   
+ *           = 'C':  Column equilibration, i.e., A has been postmultiplied  
+ *                   by diag(C).   
+ *           = 'B':  Both row and column equilibration, i.e., A has been
+ *                   replaced by diag(R) * A * diag(C).   
+ *
+ *   Internal Parameters   
+ *   ===================   
+ *
+ *   THRESH is a threshold value used to decide if row or column scaling   
+ *   should be done based on the ratio of the row or column scaling   
+ *   factors.  If ROWCND < THRESH, row scaling is done, and if   
+ *   COLCND < THRESH, column scaling is done.   
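+ *
+ *   For example (illustrative): with THRESH = 0.1, ROWCND = 2e-10 and
+ *   COLCND = 0.8 lead to row scaling only, i.e. EQUED = 'R' on exit.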
+ *
+ *   LARGE and SMALL are threshold values used to decide if row scaling   
+ *   should be done based on the absolute size of the largest matrix   
+ *   element.  If AMAX > LARGE or AMAX < SMALL, row scaling is done.   
+ *
+ *   ===================================================================== 
+ * 
+ */ + +void +slaqgs_dist(SuperMatrix *A, float *r, float *c, + float rowcnd, float colcnd, float amax, char *equed) +{ + +#define THRESH (0.1) + + /* Local variables */ + NCformat *Astore; + float *Aval; + int i, j, irow; + float large, small, cj; + + /* Quick return if possible */ + if (A->nrow <= 0 || A->ncol <= 0) { + *(unsigned char *)equed = 'N'; + return; + } + + Astore = (NCformat *) A->Store; + Aval = (float *) Astore->nzval; + + /* Initialize LARGE and SMALL. */ + small = smach_dist("Safe minimum") / smach_dist("Precision"); + large = 1. / small; + + if (rowcnd >= THRESH && amax >= small && amax <= large) { + if (colcnd >= THRESH) + *(unsigned char *)equed = 'N'; + else { + /* Column scaling */ + for (j = 0; j < A->ncol; ++j) { + cj = c[j]; + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { + Aval[i] *= cj; + } + } + *(unsigned char *)equed = 'C'; + } + } else if (colcnd >= THRESH) { + /* Row scaling, no column scaling */ + for (j = 0; j < A->ncol; ++j) + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { + irow = Astore->rowind[i]; + Aval[i] *= r[irow]; + } + *(unsigned char *)equed = 'R'; + } else { + /* Row and column scaling */ + for (j = 0; j < A->ncol; ++j) { + cj = c[j]; + for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) { + irow = Astore->rowind[i]; + Aval[i] *= cj * r[irow]; + } + } + *(unsigned char *)equed = 'B'; + } + + return; + +} /* slaqgs_dist */ + diff --git a/SRC/sldperm_dist.c b/SRC/sldperm_dist.c new file mode 100644 index 00000000..6178c1b0 --- /dev/null +++ b/SRC/sldperm_dist.c @@ -0,0 +1,175 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Finds a row permutation so that the matrix has large entries on the diagonal + * + *
+ * -- Distributed SuperLU routine (version 1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * </pre>
+ */
+
+#include "superlu_sdefs.h"
+
+extern void mc64ad_dist(int_t*, int_t*, int_t*, int_t [], int_t [], double [],
+                 int_t*, int_t [], int_t*, int_t[], int_t*, double [],
+                 int_t [], int_t []);
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ *   SLDPERM finds a row permutation so that the matrix has large
+ *   entries on the diagonal.
+ *
+ * Arguments
+ * =========
+ *
+ * job    (input) int
+ *        Control the action. Possible values for JOB are:
+ *        = 1 : Compute a row permutation of the matrix so that the
+ *              permuted matrix has as many entries on its diagonal as
+ *              possible. The values on the diagonal are of arbitrary size.
+ *              HSL subroutine MC21A/AD is used for this.
+ *        = 2 : Compute a row permutation of the matrix so that the smallest
+ *              value on the diagonal of the permuted matrix is maximized.
+ *        = 3 : Compute a row permutation of the matrix so that the smallest
+ *              value on the diagonal of the permuted matrix is maximized.
+ *              The algorithm differs from the one used for JOB = 2 and may
+ *              have quite a different performance.
+ *        = 4 : Compute a row permutation of the matrix so that the sum
+ *              of the diagonal entries of the permuted matrix is maximized.
+ *        = 5 : Compute a row permutation of the matrix so that the product
+ *              of the diagonal entries of the permuted matrix is maximized
+ *              and vectors to scale the matrix so that the nonzero diagonal
+ *              entries of the permuted matrix are one in absolute value and
+ *              all the off-diagonal entries are less than or equal to one in
+ *              absolute value.
+ *        Restriction: 1 <= JOB <= 5.
+ *
+ * n      (input) int
+ *        The order of the matrix.
+ *
+ * nnz    (input) int
+ *        The number of nonzeros in the matrix.
+ *
+ * adjncy (input) int*, of size nnz
+ *        The adjacency structure of the matrix, which contains the row
+ *        indices of the nonzeros.
+ *
+ * colptr (input) int*, of size n+1
+ *        The pointers to the beginning of each column in ADJNCY.
+ *
+ * nzval  (input) float*, of size nnz
+ *        The nonzero values of the matrix. nzval[k] is the value of
+ *        the entry corresponding to adjncy[k].
+ *        It is not used if job = 1.
+ *
+ * perm   (output) int*, of size n
+ *        The permutation vector. perm[i] = j means row i in the
+ *        original matrix is in row j of the permuted matrix.
+ *
+ * u      (output) float*, of size n
+ *        If job = 5, the natural logarithms of the row scaling factors.
+ *
+ * v      (output) float*, of size n
+ *        If job = 5, the natural logarithms of the column scaling factors.
+ *        The scaled matrix B has entries b_ij = a_ij * exp(u_i + v_j).
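+ *        For example (illustrative): with job = 5, a matched diagonal
+ *        entry of magnitude 4 and scalings u_i = v_j = -log(2) satisfy
+ *        |b_ii| = 4 * exp(u_i + v_j) = 4 * exp(-2 log 2) = 1.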
+ * 
+ */ + +int +sldperm_dist(int_t job, int_t n, int_t nnz, int_t colptr[], int_t adjncy[], + float nzval[], int_t *perm, float u[], float v[]) +{ + int_t i, liw, ldw, num; + int_t *iw, icntl[10], info[10]; + double *dw; + extern double *doubleMalloc_dist(int_t); + double *nzval_d = doubleMalloc_dist(nnz); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(0, "Enter sldperm_dist()"); +#endif + liw = 5*n; + if ( job == 3 ) liw = 10*n + nnz; + if ( !(iw = intMalloc_dist(liw)) ) ABORT("Malloc fails for iw[]"); + ldw = 3*n + nnz; + if ( !(dw = doubleMalloc_dist(ldw)) ) ABORT("Malloc fails for dw[]"); + + /* Increment one to get 1-based indexing. */ + for (i = 0; i <= n; ++i) ++colptr[i]; + for (i = 0; i < nnz; ++i) ++adjncy[i]; +#if ( DEBUGlevel>=2 ) + printf("LDPERM(): n %d, nnz %d\n", n, nnz); + PrintInt10("colptr", n+1, colptr); + PrintInt10("adjncy", nnz, adjncy); +#endif + + /* + * NOTE: + * ===== + * + * MC64AD assumes that column permutation vector is defined as: + * perm(i) = j means column i of permuted A is in column j of original A. + * + * Since a symmetric permutation preserves the diagonal entries. Then + * by the following relation: + * P'(A*P')P = P'A + * we can apply inverse(perm) to rows of A to get large diagonal entries. + * But, since 'perm' defined in MC64AD happens to be the reverse of + * SuperLU's definition of permutation vector, therefore, it is already + * an inverse for our purpose. We will thus use it directly. + * + */ + mc64id_dist(icntl); + /* Suppress error and warning messages. */ + icntl[0] = -1; + icntl[1] = -1; + + for (i = 0; i < nnz; ++i) nzval_d[i] = nzval[i]; + mc64ad_dist(&job, &n, &nnz, colptr, adjncy, nzval_d, &num, perm, + &liw, iw, &ldw, dw, icntl, info); + +#if ( DEBUGlevel>=2 ) + PrintInt10("perm", n, perm); + printf(".. After MC64AD info %d\tsize of matching %d\n", info[0], num); +#endif + if ( info[0] == 1 ) { /* Structurally singular */ + printf(".. The last " IFMT " permutations:\n", n-num); + PrintInt10("perm", n-num, &perm[num]); + } + + /* Restore to 0-based indexing. */ + for (i = 0; i <= n; ++i) --colptr[i]; + for (i = 0; i < nnz; ++i) --adjncy[i]; + for (i = 0; i < n; ++i) --perm[i]; + + if ( job == 5 ) + for (i = 0; i < n; ++i) { + u[i] = dw[i]; + v[i] = dw[n+i]; + } + + SUPERLU_FREE(iw); + SUPERLU_FREE(dw); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(0, "Exit sldperm_dist()"); +#endif + return (info[0]); +} + diff --git a/SRC/slook_ahead_update.c b/SRC/slook_ahead_update.c new file mode 100644 index 00000000..5a6999cd --- /dev/null +++ b/SRC/slook_ahead_update.c @@ -0,0 +1,278 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/************************************************************************/ +/*! @file + * \brief Look-ahead update of the Schur complement. + * + *
+ * -- Distributed SuperLU routine (version 5.4) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 1, 2014
+ *
+ * Modified:
+ *  September 18, 2017
+ *  June 1, 2018  add parallel AWPM pivoting; add back arrive_at_ublock()
+ *
+ * </pre>
+ */
+
+#include <assert.h>  /* assertion doesn't work if NDEBUG is defined */
+
+iukp = iukp0; /* point to the first block in index[] */
+rukp = rukp0; /* point to the start of nzval[] */
+j = jj0 = 0;  /* After the j-loop, jj0 points to the first block in U
+                 outside look-ahead window. */
+
+#if 0
+for (jj = 0; jj < nub; ++jj) assert(perm_u[jj] == jj); /* Sherry */
+#endif
+
+#ifdef ISORT
+while (j < nub && iperm_u[j] <= k0 + num_look_aheads)
+#else
+while (j < nub && perm_u[2 * j] <= k0 + num_look_aheads)
+#endif
+{
+    float zero = 0.0;
+
+#if 1
+    /* Search is needed because a permutation perm_u is involved for j  */
+    /* Search along the row for the pointers {iukp, rukp} pointing to
+     * block U(k,j).
+     * j    -- current block in look-ahead window, initialized to 0 on entry
+     * iukp -- point to the start of index[] metadata
+     * rukp -- point to the start of nzval[] array
+     * jb   -- block number of block U(k,j), update destination column
+     */
+    arrive_at_ublock(
+		     j, &iukp, &rukp, &jb, &ljb, &nsupc,
+         	     iukp0, rukp0, usub, perm_u, xsup, grid
+		    );
+#else
+    jb = usub[iukp];
+    ljb = LBj (jb, grid);     /* Local block number of U(k,j). */
+    nsupc = SuperSize(jb);
+    iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */
+#endif
+
+    j++;
+    jj0++;
+    jj = iukp;
+
+    while (usub[jj] == klst) ++jj; /* Skip zero segments */
+
+    ldu = klst - usub[jj++];
+    ncols = 1;
+
+    /* This loop computes ldu. */
+    for (; jj < iukp + nsupc; ++jj) { /* for each column jj in block U(k,j) */
+        segsize = klst - usub[jj];
+        if (segsize) {
+            ++ncols;
+            if (segsize > ldu)  ldu = segsize;
+        }
+    }
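+    /* e.g. (illustrative) with klst = 10 and first-nonzero rows { 10, 7, 9 }
+       in usub, the nonzero segments have sizes { 3, 1 }, so ncols = 2
+       and ldu = 3. */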
+#if ( DEBUGlevel>=3 )
+    ++num_update;
+#endif
+
+#if ( DEBUGlevel>=3 )
+    printf ("(%d) k=%d,jb=%d,ldu=%d,ncols=%d,nsupc=%d\n",
+	    iam, k, jb, ldu, ncols, nsupc);
+    ++num_copy;
+#endif
+
+    /* Now copy one block U(k,j) to bigU for GEMM, padding zeros up to ldu. */
+    tempu = bigU; /* Copy one block U(k,j) to bigU for GEMM */
+    for (jj = iukp; jj < iukp + nsupc; ++jj) {
+        segsize = klst - usub[jj];
+        if (segsize) {
+            lead_zero = ldu - segsize;
+            for (i = 0; i < lead_zero; ++i) tempu[i] = zero;
+            tempu += lead_zero;
+            for (i = 0; i < segsize; ++i) {
+                tempu[i] = uval[rukp + i];
+            }
+            rukp += segsize;
+            tempu += segsize;
+        }
+    }
+    tempu = bigU; /* set back to the beginning of the buffer */
+
+    nbrow = lsub[1]; /* number of row subscripts in L(:,k) */
+    if (myrow == krow) nbrow = lsub[1] - lsub[3]; /* skip diagonal block for those rows. */
+    // double ttx =SuperLU_timer_();
+
+    int current_b = 0; /* Each thread starts searching from first block.
+                          This records the moving search target.           */
+    lptr = lptr0; /* point to the start of index[] in supernode L(:,k) */
+    luptr = luptr0;
+
+#ifdef _OPENMP
+    /* Sherry -- examine all the shared variables ??
+       'firstprivate' ensures that the private variables are initialized
+       to the values before entering the loop.  */
+#pragma omp parallel for \
+    firstprivate(lptr,luptr,ib,current_b) private(lb) \
+    default(shared) schedule(dynamic)
+#endif
+    for (lb = 0; lb < nlb; lb++) { /* Loop through each block in L(:,k) */
+        int temp_nbrow; /* automatic variable is private */
+
+        /* Search for the L block that my thread will work on.
+           No need to search from 0, can continue at the point where
+           it is left from last iteration.
+           Note: Blocks may not be sorted in L. Different thread picks up
+	   different lb.   */
+        for (; current_b < lb; ++current_b) {
+            temp_nbrow = lsub[lptr + 1];    /* Number of full rows. */
+            lptr += LB_DESCRIPTOR;  /* Skip descriptor. */
+            lptr += temp_nbrow;   /* move to next block */
+            luptr += temp_nbrow;  /* move to next block */
+        }
+
+#ifdef _OPENMP
+        int_t thread_id = omp_get_thread_num ();
+#else
+        int_t thread_id = 0;
+#endif
+        float * tempv = bigV + ldt*ldt*thread_id;
+
+        int *indirect_thread  = indirect + ldt * thread_id;
+        int *indirect2_thread = indirect2 + ldt * thread_id;
+        ib = lsub[lptr];        /* block number of L(i,k) */
+        temp_nbrow = lsub[lptr + 1];    /* Number of full rows. */
+	/* assert (temp_nbrow <= nbrow); */
+
+        lptr += LB_DESCRIPTOR;  /* Skip descriptor. */
+
+	/*if (thread_id == 0) tt_start = SuperLU_timer_();*/
+
+        /* Call GEMM: tempv := L(i,k) [temp_nbrow x ldu] * U(k,j) [ldu x ncols] */
+	stat->ops[FACT] += 2.0 * (flops_t)temp_nbrow * ldu * ncols;
+#if defined (USE_VENDOR_BLAS)
+        sgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
+                   &lusup[luptr + (knsupc - ldu) * nsupr], &nsupr,
+                   tempu, &ldu, &beta, tempv, &temp_nbrow, 1, 1);
+#else
+        sgemm_("N", "N", &temp_nbrow, &ncols, &ldu, &alpha,
+                   &lusup[luptr + (knsupc - ldu) * nsupr], &nsupr,
+                   tempu, &ldu, &beta, tempv, &temp_nbrow );
+#endif
+
+#if 0
+	if (thread_id == 0) {
+	    tt_end = SuperLU_timer_();
+	    LookAheadGEMMTimer += tt_end - tt_start;
+	    tt_start = tt_end;
+	}
+#endif
+        /* Now scattering the output. */
+        if (ib < jb) {    /* A(i,j) is in U. */
+            sscatter_u (ib, jb,
+                       nsupc, iukp, xsup,
+                       klst, temp_nbrow,
+                       lptr, temp_nbrow, lsub,
+                       usub, tempv, Ufstnz_br_ptr, Unzval_br_ptr, grid);
+        } else {          /* A(i,j) is in L. */
+            sscatter_l (ib, ljb, nsupc, iukp, xsup, klst, temp_nbrow, lptr,
+                       temp_nbrow, usub, lsub, tempv,
+                       indirect_thread, indirect2_thread,
+                       Lrowind_bc_ptr, Lnzval_bc_ptr, grid);
+        }
+
+        ++current_b;         /* Move to next block. */
+        lptr += temp_nbrow;
+        luptr += temp_nbrow;
+
+#if 0
+	if (thread_id == 0) {
+	    tt_end = SuperLU_timer_();
+	    LookAheadScatterTimer += tt_end - tt_start;
+	}
+#endif
+    } /* end parallel for lb = 0, nlb ... all blocks in L(:,k) */
+
+    iukp += nsupc; /* Move to block U(k,j+1) */
+
+    /* =========================================== *
+     * == factorize L(:,j) and send if possible == *
+     * =========================================== */
+    kk = jb; /* destination column that was just updated */
+    kcol = PCOL (kk, grid);
+#ifdef ISORT
+    kk0 = iperm_u[j - 1];
+#else
+    kk0 = perm_u[2 * (j - 1)];
+#endif
+
+    if (look_ahead[kk] == k0 && kcol == mycol) {
+        /* current column is the last dependency */
+        look_id = kk0 % (1 + num_look_aheads);
+
+        /* Factor diagonal and subdiagonal blocks and test for exact
+           singularity.  */
+        factored[kk] = 0;
+
+        double tt1 = SuperLU_timer_();
+
+        PSGSTRF2(options, kk0, kk, thresh, Glu_persist, grid, Llu,
+                  U_diag_blk_send_req, tag_ub, stat, info);
+
+        pdgstrf2_timer += SuperLU_timer_() - tt1;
+
+        /* stat->time7 += SuperLU_timer_() - ttt1; */
+
+        /* Multicasts numeric values of L(:,kk) to process rows. */
+        send_req = send_reqs[look_id];
+        msgcnt = msgcnts[look_id];
+
+        lk = LBj (kk, grid);    /* Local block number. */
+        lsub1 = Lrowind_bc_ptr[lk];
+        lusup1 = Lnzval_bc_ptr[lk];
+        if (lsub1) {
+            msgcnt[0] = lsub1[1] + BC_HEADER + lsub1[0] * LB_DESCRIPTOR;
+            msgcnt[1] = lsub1[1] * SuperSize (kk);
+        } else {
+            msgcnt[0] = 0;
+            msgcnt[1] = 0;
+        }
+
+        scp = &grid->rscp;      /* The scope of process row. */
+        for (pj = 0; pj < Pc; ++pj) {
+            if (ToSendR[lk][pj] != EMPTY) {
+#if ( PROFlevel>=1 )
+                TIC (t1);
+#endif
+                MPI_Isend (lsub1, msgcnt[0], mpi_int_t, pj,
+                           SLU_MPI_TAG (0, kk0) /* (4*kk0)%tag_ub */ ,
+                           scp->comm, &send_req[pj]);
+                MPI_Isend (lusup1, msgcnt[1], MPI_FLOAT, pj,
+                           SLU_MPI_TAG (1, kk0) /* (4*kk0+1)%tag_ub */ ,
+                           scp->comm, &send_req[pj + Pc]);
+#if ( PROFlevel>=1 )
+                TOC (t2, t1);
+                stat->utime[COMM] += t2;
+                msg_cnt += 2;
+                msg_vol += msgcnt[0] * iword + msgcnt[1] * dword;
+#endif
+#if ( DEBUGlevel>=2 )
+                printf ("[%d] -2- Send L(:,%4d): #lsub %4d, #lusup %4d to Pj %2d, tags %d:%d \n",
+                        iam, kk, msgcnt[0], msgcnt[1], pj,
+			SLU_MPI_TAG(0,kk0), SLU_MPI_TAG(1,kk0));
+#endif
+            }  /* end if ( ToSendR[lk][pj] != EMPTY ) */
+        } /* end for pj ... */
+    } /* end if( look_ahead[kk] == k0 && kcol == mycol ) */
+} /* end while j < nub and perm_u[j] */
+
+/*! @file
+ * <pre>
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley,
+ * Georgia Institute of Technology, Oak Ridge National Laboratory
+ * March 14, 2021 version 7.0.0
+ * </pre>
+ */ + +#pragma once // so that this header file is included onle once + +#include "superlu_sdefs.h" + +#ifdef GPU_ACC // enable GPU + +// #include "mkl.h" + +#include +#include +// #include "sec_structs.h" +// #include "supernodal_etree.h" + +/* Constants */ +//#define SLU_TARGET_GPU 0 +//#define MAX_BLOCK_SIZE 10000 +#define MAX_NCUDA_STREAMS 32 + +static +void check(cudaError_t result, char const *const func, const char *const file, int const line) +{ + if (result) + { + fprintf(stderr, "CUDA error at file %s: line %d code=(%s) \"%s\" \n", + file, line, cudaGetErrorString(result), func); + + // Make sure we call CUDA Device Reset before exiting + exit(EXIT_FAILURE); + } +} + +#define checkCudaErrors(val) check ( (val), #val, __FILE__, __LINE__ ) + +typedef struct //SCUbuf_gpu_ +{ + /*Informations for various buffers*/ + float *bigV; + float *bigU; + float *bigU_host; /*pinned location*/ + int_t *indirect; /*for indirect address calculations*/ + int_t *indirect2; /*for indirect address calculations*/ + + float *Remain_L_buff; /* on GPU */ + float *Remain_L_buff_host; /* Sherry: this memory is page-locked, why need another copy on GPU ? */ + + int_t *lsub; + int_t *usub; + + int_t *lsub_buf, *usub_buf; + + Ublock_info_t *Ublock_info; /* on GPU */ + Remain_info_t *Remain_info; + Ublock_info_t *Ublock_info_host; + Remain_info_t *Remain_info_host; + + int_t* usub_IndirectJ3; /* on GPU */ + int_t* usub_IndirectJ3_host; + +} sSCUbuf_gpu_t; + + +typedef struct //LUstruct_gpu_ +{ + int_t *LrowindVec; /* A single vector */ + int_t *LrowindPtr; /* A single vector */ + + float *LnzvalVec; /* A single vector */ + int_t *LnzvalPtr; /* A single vector */ + int_t *LnzvalPtr_host; /* A single vector */ + + int_t *UrowindVec; /* A single vector */ + int_t *UrowindPtr; /* A single vector */ + int_t *UrowindPtr_host; /* A single vector */ + int_t *UnzvalPtr_host; + + float *UnzvalVec; /* A single vector */ + int_t *UnzvalPtr; /* A single vector */ + /*gpu pointers for easy block accesses */ + local_l_blk_info_t *local_l_blk_infoVec; + int_t *local_l_blk_infoPtr; + int_t *jib_lookupVec; + int_t *jib_lookupPtr; + local_u_blk_info_t *local_u_blk_infoVec; + + int_t *local_u_blk_infoPtr; + int_t *ijb_lookupVec; + int_t *ijb_lookupPtr; + + // GPU buffers for performing Schur Complement Update on GPU + sSCUbuf_gpu_t scubufs[MAX_NCUDA_STREAMS]; + float *acc_L_buff, *acc_U_buff; + + /*Informations for various buffers*/ + int_t buffer_size; /**/ + int_t nsupers; /*should have number of supernodes*/ + int_t *xsup; + gridinfo_t *grid; + + + double ScatterMOPCounter; + double ScatterMOPTimer; + double GemmFLOPCounter; + double GemmFLOPTimer; + + double cPCIeH2D; + double cPCIeD2H; + double tHost_PCIeH2D; + double tHost_PCIeD2H; + + /*cuda events to measure DGEMM and SCATTER timing */ + int *isOffloaded; /*stores if any iteration is offloaded or not*/ + cudaEvent_t *GemmStart, *GemmEnd, *ScatterEnd; /*cuda events to store gemm and scatter's begin and end*/ + cudaEvent_t *ePCIeH2D; + cudaEvent_t *ePCIeD2H_Start; + cudaEvent_t *ePCIeD2H_End; + + int_t *xsup_host; + int_t* perm_c_supno; + int_t first_l_block_gpu, first_u_block_gpu; +} sLUstruct_gpu_t; + +typedef struct //sluGPU_t_ +{ + int_t gpuId; // if there are multiple GPUs + sLUstruct_gpu_t *A_gpu, *dA_gpu; + cudaStream_t funCallStreams[MAX_NCUDA_STREAMS], CopyStream; + cublasHandle_t cublasHandles[MAX_NCUDA_STREAMS]; + int_t lastOffloadStream[MAX_NCUDA_STREAMS]; + int_t nCudaStreams; + int_t* isNodeInMyGrid; + double acc_async_cost; +} ssluGPU_t; + + +#ifdef 
__cplusplus +extern "C" { +#endif + +extern int ssparseTreeFactor_ASYNC_GPU( + sForest_t *sforest, + commRequests_t **comReqss, // lists of communication requests, + // size = maxEtree level + sscuBufs_t *scuBufs, // contains buffers for schur complement update + packLUInfo_t *packLUInfo, + msgs_t **msgss, // size = num Look ahead + sLUValSubBuf_t **LUvsbs, // size = num Look ahead + sdiagFactBufs_t **dFBufs, // size = maxEtree level + factStat_t *factStat, + factNodelists_t *fNlists, + gEtreeInfo_t *gEtreeInfo, // global etree info + superlu_dist_options_t *options, + int_t *gIperm_c_supno, + int ldt, + ssluGPU_t *sluGPU, + d2Hreduce_t *d2Hred, + HyP_t *HyP, + sLUstruct_t *LUstruct, gridinfo3d_t *grid3d, + SuperLUStat_t *stat, + double thresh, SCT_t *SCT, int tag_ub, + int *info); + +int sinitD2Hreduce( + int next_k, + d2Hreduce_t* d2Hred, + int last_flag, + // int_t *perm_c_supno, + HyP_t* HyP, + ssluGPU_t *sluGPU, + gridinfo_t *grid, + sLUstruct_t *LUstruct, SCT_t* SCT +); + +extern int sreduceGPUlu(int last_flag, d2Hreduce_t* d2Hred, + ssluGPU_t *sluGPU, SCT_t *SCT, gridinfo_t *grid, + sLUstruct_t *LUstruct); + +extern int swaitGPUscu(int streamId, ssluGPU_t *sluGPU, SCT_t *SCT); +extern int ssendLUpanelGPU2HOST( int_t k0, d2Hreduce_t* d2Hred, ssluGPU_t *sluGPU); +extern int ssendSCUdataHost2GPU( + int_t streamId, int_t* lsub, int_t* usub, float* bigU, int_t bigu_send_size, + int_t Remain_lbuf_send_size, ssluGPU_t *sluGPU, HyP_t* HyP +); + +extern int sinitSluGPU3D_t( + ssluGPU_t *sluGPU, + sLUstruct_t *LUstruct, + gridinfo3d_t * grid3d, + int_t* perm_c_supno, int_t n, int_t buffer_size, int_t bigu_size, int_t ldt +); +int sSchurCompUpdate_GPU( + int_t streamId, + int_t jj_cpu, int_t nub, int_t klst, int_t knsupc, + int_t Rnbrow, int_t RemainBlk, + int_t Remain_lbuf_send_size, + int_t bigu_send_size, int_t ldu, + int_t mcb, + int_t buffer_size, int_t lsub_len, int_t usub_len, + int_t ldt, int_t k0, + ssluGPU_t *sluGPU, gridinfo_t *grid +); + + +extern void sCopyLUToGPU3D (int_t* isNodeInMyGrid, sLocalLU_t *A_host, + ssluGPU_t *sluGPU, Glu_persist_t *Glu_persist, int_t n, + gridinfo3d_t *grid3d, int_t buffer_size, int_t bigu_size, int_t ldt); + +extern int sreduceAllAncestors3d_GPU(int_t ilvl, int_t* myNodeCount, + int_t** treePerm, sLUValSubBuf_t*LUvsb, + sLUstruct_t* LUstruct, gridinfo3d_t* grid3d, + ssluGPU_t *sluGPU, d2Hreduce_t* d2Hred, + factStat_t *factStat, HyP_t* HyP, SCT_t* SCT ); + +extern void ssyncAllfunCallStreams(ssluGPU_t* sluGPU, SCT_t* SCT); +extern int sfree_LUstruct_gpu (sLUstruct_gpu_t *A_gpu); + +//int freeSluGPU(ssluGPU_t *sluGPU); + +extern void sPrint_matrix( char *desc, int_t m, int_t n, float *dA, int_t lda ); + +/*to print out various statistics*/ +void sprintGPUStats(sLUstruct_gpu_t *A_gpu); + +#ifdef __cplusplus +} +#endif + +#endif // matching: enable GPU diff --git a/SRC/smemory_dist.c b/SRC/smemory_dist.c new file mode 100644 index 00000000..8c9ef510 --- /dev/null +++ b/SRC/smemory_dist.c @@ -0,0 +1,286 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Memory utilities + * + *
+ * -- Distributed SuperLU routine (version 4.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 1, 2014
+ * </pre>
+ */ + +#include "superlu_sdefs.h" + + +/* Variables external to this file */ +extern SuperLU_LU_stack_t stack; + + +void *suser_malloc_dist(int_t bytes, int_t which_end) +{ + void *buf; + + if ( SuperLU_StackFull(bytes) ) return (NULL); + + if ( which_end == HEAD ) { + buf = (char*) stack.array + stack.top1; + stack.top1 += bytes; + } else { + stack.top2 -= bytes; + buf = (char*) stack.array + stack.top2; + } + + stack.used += bytes; + return buf; +} + + +void suser_free_dist(int_t bytes, int_t which_end) +{ + if ( which_end == HEAD ) { + stack.top1 -= bytes; + } else { + stack.top2 += bytes; + } + stack.used -= bytes; +} + + + +/*! \brief + * + *
+ * mem_usage consists of the following fields:
+ *    - for_lu (float)
+ *      The amount of space used in bytes for the L\U data structures.
+ *    - total (float)
+ *      The amount of space needed in bytes to perform factorization.
+ *    - expansions (int)
+ *      Number of memory expansions during the LU factorization.
+ * </pre>
+ */ +int_t sQuerySpace_dist(int_t n, sLUstruct_t *LUstruct, gridinfo_t *grid, + SuperLUStat_t *stat, superlu_dist_mem_usage_t *mem_usage) +{ + register int_t dword, gb, iword, k, nb, nsupers; + int_t *index, *xsup; + int iam, mycol, myrow; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + sLocalLU_t *Llu = LUstruct->Llu; + + iam = grid->iam; + myrow = MYROW( iam, grid ); + mycol = MYCOL( iam, grid ); + iword = sizeof(int_t); + dword = sizeof(float); + nsupers = Glu_persist->supno[n-1] + 1; + xsup = Glu_persist->xsup; + mem_usage->for_lu = 0.; + + /* For L factor */ + nb = CEILING( nsupers, grid->npcol ); /* Number of local column blocks */ + for (k = 0; k < nb; ++k) { + gb = k * grid->npcol + mycol; /* Global block number. */ + if ( gb < nsupers ) { + index = Llu->Lrowind_bc_ptr[k]; + if ( index ) { + mem_usage->for_lu += (float) + ((BC_HEADER + index[0]*LB_DESCRIPTOR + index[1]) * iword); + mem_usage->for_lu += (float)(index[1]*SuperSize( gb )*dword); + } + } + } + + /* For U factor */ + nb = CEILING( nsupers, grid->nprow ); /* Number of local row blocks */ + for (k = 0; k < nb; ++k) { + gb = k * grid->nprow + myrow; /* Global block number. */ + if ( gb < nsupers ) { + index = Llu->Ufstnz_br_ptr[k]; + if ( index ) { + mem_usage->for_lu += (float)(index[2] * iword); + mem_usage->for_lu += (float)(index[1] * dword); + } + } + } + + /* Working storage to support factorization */ + mem_usage->total = mem_usage->for_lu; +#if 0 + mem_usage->total += + (float)(( Llu->bufmax[0] + Llu->bufmax[2] ) * iword + + ( Llu->bufmax[1] + Llu->bufmax[3] + maxsup ) * dword ); + /**** another buffer to use mpi_irecv in pdgstrf_irecv.c ****/ + mem_usage->total += + (float)( Llu->bufmax[0] * iword + Llu->bufmax[1] * dword ); + mem_usage->total += (float)( maxsup * maxsup + maxsup) * iword; + k = CEILING( nsupers, grid->nprow ); + mem_usage->total += (float)(2 * k * iword); +#else + /*mem_usage->total += stat->current_buffer;*/ + mem_usage->total += stat->peak_buffer; + +#if ( PRNTlevel>=1 ) + if (iam==0) printf(".. sQuerySpace: peak_buffer %.2f (MB)\n", + stat->peak_buffer * 1.0e-6); +#endif +#endif + return 0; +} /* sQuerySpace_dist */ + + +/* + * Allocate storage for original matrix A + */ +void +sallocateA_dist(int_t n, int_t nnz, float **a, int_t **asub, int_t **xa) +{ + *a = (float *) floatMalloc_dist(nnz); + *asub = (int_t *) intMalloc_dist(nnz); + *xa = (int_t *) intMalloc_dist(n+1); +} + + +float *floatMalloc_dist(int_t n) +{ + float *buf; + buf = (float *) SUPERLU_MALLOC( SUPERLU_MAX(1, n) * sizeof(float) ); + return (buf); +} + +float *floatCalloc_dist(int_t n) +{ + float *buf; + register int_t i; + float zero = 0.0; + buf = (float *) SUPERLU_MALLOC( SUPERLU_MAX(1, n) * sizeof(float)); + if ( !buf ) return (buf); + for (i = 0; i < n; ++i) buf[i] = zero; + return (buf); +} + +/*************************************** + * The following are from 3D code. 
+ ***************************************/ + +double sgetLUMem(int_t nodeId, sLUstruct_t *LUstruct, gridinfo3d_t *grid3d) +{ + double memlu = 0.0; + gridinfo_t* grid = &(grid3d->grid2d); + sLocalLU_t *Llu = LUstruct->Llu; + int_t* xsup = LUstruct->Glu_persist->xsup; + int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + float** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + // double** Unzval_br_ptr = Llu->Unzval_br_ptr; + int_t iam = grid->iam; + + int_t myrow = MYROW (iam, grid); + int_t mycol = MYCOL (iam, grid); + + int_t pc = PCOL( nodeId, grid ); + if (mycol == pc) + { + int_t ljb = LBj( nodeId, grid ); /* Local block number */ + int_t *lsub; + float* lnzval; + lsub = Lrowind_bc_ptr[ljb]; + lnzval = Lnzval_bc_ptr[ljb]; + + if (lsub != NULL) + { + int_t nrbl = lsub[0]; /*number of L blocks */ + int_t len = lsub[1]; /* LDA of the nzval[] */ + int_t len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; + int_t len2 = SuperSize(nodeId) * len; + memlu += 1.0 * (len1 * sizeof(int_t) + len2 * sizeof(float)); + } + } + + int_t pr = PROW( nodeId, grid ); + if (myrow == pr) + { + int_t lib = LBi( nodeId, grid ); /* Local block number */ + int_t *usub; + // double* unzval; + usub = Ufstnz_br_ptr[lib]; + + if (usub != NULL) + { + int_t lenv = usub[1]; + int_t lens = usub[2]; + memlu += 1.0 * (lenv * sizeof(int_t) + lens * sizeof(float)); + } + } + return memlu; +} + +double smemForest(sForest_t*sforest, sLUstruct_t *LUstruct, gridinfo3d_t *grid3d) +{ + double memlu = 0; + + int_t *perm_c_supno = sforest->nodeList; + int_t nnodes = sforest->nNodes; + for (int i = 0; i < nnodes; ++i) + { + memlu += sgetLUMem(perm_c_supno[i], LUstruct, grid3d); + } + + return memlu; +} + +void s3D_printMemUse( trf3Dpartition_t* trf3Dpartition, sLUstruct_t *LUstruct, + gridinfo3d_t * grid3d ) +{ + int_t* myTreeIdxs = trf3Dpartition->myTreeIdxs; + int_t* myZeroTrIdxs = trf3Dpartition->myZeroTrIdxs; + sForest_t** sForests = trf3Dpartition->sForests; + + double memNzLU = 0.0; + double memzLU = 0.0; + int_t maxLvl = log2i(grid3d->zscp.Np) + 1; + + for (int_t ilvl = 0; ilvl < maxLvl; ++ilvl) + { + sForest_t* sforest = sForests[myTreeIdxs[ilvl]]; + + if (sforest) + { + if (!myZeroTrIdxs[ilvl]) + { + memNzLU += smemForest(sforest, LUstruct, grid3d); + } + else + { + memzLU += smemForest(sforest, LUstruct, grid3d); + } + } + } + double sumMem = memNzLU + memzLU; + double maxMem, minMem, avgNzLU, avgzLU; + /*Now reduce it among all the procs*/ + MPI_Reduce(&sumMem, &maxMem, 1, MPI_DOUBLE, MPI_MAX, 0, grid3d->comm); + MPI_Reduce(&sumMem, &minMem, 1, MPI_DOUBLE, MPI_MIN, 0, grid3d->comm); + MPI_Reduce(&memNzLU, &avgNzLU, 1, MPI_DOUBLE, MPI_SUM, 0, grid3d->comm); + MPI_Reduce(&memzLU, &avgzLU, 1, MPI_DOUBLE, MPI_SUM, 0, grid3d->comm); + + int_t nProcs = grid3d->nprow * grid3d->npcol * grid3d->npdep; + if (!(grid3d->iam)) + { + /* code */ + printf("| Total Memory \t| %.2g \t| %.2g \t|%.2g \t|\n", (avgNzLU + avgzLU) / nProcs, maxMem, minMem ); + printf("| LU-LU(repli) \t| %.2g \t| %.2g \t|\n", (avgNzLU) / nProcs, avgzLU / nProcs ); + } +} + diff --git a/SRC/smyblas2_dist.c b/SRC/smyblas2_dist.c new file mode 100644 index 00000000..34281a54 --- /dev/null +++ b/SRC/smyblas2_dist.c @@ -0,0 +1,248 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Level 2 BLAS operations: solves and matvec, written in C
+ *
+ * <pre>
+ * -- SuperLU routine (version 2.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * November 15, 1997
+ * 
+ */ +/* + * File name: smyblas2.c + * Purpose: + * Level 2 BLAS operations: solves and matvec, written in C. + * Note: + * This is only used when the system lacks an efficient BLAS library. + */ + +/*! \brief + * + *
+ * Solves a dense UNIT lower triangular system. The unit lower
+ * triangular matrix is stored in a 2D array M(1:nrow,1:ncol).
+ * The solution will be returned in the rhs vector.
+ * </pre>
+ */ +void slsolve ( int ldm, int ncol, float *M, float *rhs ) +{ + int k; + float x0, x1, x2, x3, x4, x5, x6, x7; + float *M0; + register float *Mki0, *Mki1, *Mki2, *Mki3, *Mki4, *Mki5, *Mki6, *Mki7; + register int firstcol = 0; + + M0 = &M[0]; + + while ( firstcol < ncol - 7 ) { /* Do 8 columns */ + Mki0 = M0 + 1; + Mki1 = Mki0 + ldm + 1; + Mki2 = Mki1 + ldm + 1; + Mki3 = Mki2 + ldm + 1; + Mki4 = Mki3 + ldm + 1; + Mki5 = Mki4 + ldm + 1; + Mki6 = Mki5 + ldm + 1; + Mki7 = Mki6 + ldm + 1; + + x0 = rhs[firstcol]; + x1 = rhs[firstcol+1] - x0 * *Mki0++; + x2 = rhs[firstcol+2] - x0 * *Mki0++ - x1 * *Mki1++; + x3 = rhs[firstcol+3] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++; + x4 = rhs[firstcol+4] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++ + - x3 * *Mki3++; + x5 = rhs[firstcol+5] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++ + - x3 * *Mki3++ - x4 * *Mki4++; + x6 = rhs[firstcol+6] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++ + - x3 * *Mki3++ - x4 * *Mki4++ - x5 * *Mki5++; + x7 = rhs[firstcol+7] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++ + - x3 * *Mki3++ - x4 * *Mki4++ - x5 * *Mki5++ + - x6 * *Mki6++; + + rhs[++firstcol] = x1; + rhs[++firstcol] = x2; + rhs[++firstcol] = x3; + rhs[++firstcol] = x4; + rhs[++firstcol] = x5; + rhs[++firstcol] = x6; + rhs[++firstcol] = x7; + ++firstcol; + + for (k = firstcol; k < ncol; k++) + rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++ + - x2 * *Mki2++ - x3 * *Mki3++ + - x4 * *Mki4++ - x5 * *Mki5++ + - x6 * *Mki6++ - x7 * *Mki7++; + + M0 += 8 * ldm + 8; + } + + while ( firstcol < ncol - 3 ) { /* Do 4 columns */ + Mki0 = M0 + 1; + Mki1 = Mki0 + ldm + 1; + Mki2 = Mki1 + ldm + 1; + Mki3 = Mki2 + ldm + 1; + + x0 = rhs[firstcol]; + x1 = rhs[firstcol+1] - x0 * *Mki0++; + x2 = rhs[firstcol+2] - x0 * *Mki0++ - x1 * *Mki1++; + x3 = rhs[firstcol+3] - x0 * *Mki0++ - x1 * *Mki1++ - x2 * *Mki2++; + + rhs[++firstcol] = x1; + rhs[++firstcol] = x2; + rhs[++firstcol] = x3; + ++firstcol; + + for (k = firstcol; k < ncol; k++) + rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++ + - x2 * *Mki2++ - x3 * *Mki3++; + + M0 += 4 * ldm + 4; + } + + if ( firstcol < ncol - 1 ) { /* Do 2 columns */ + Mki0 = M0 + 1; + Mki1 = Mki0 + ldm + 1; + + x0 = rhs[firstcol]; + x1 = rhs[firstcol+1] - x0 * *Mki0++; + + rhs[++firstcol] = x1; + ++firstcol; + + for (k = firstcol; k < ncol; k++) + rhs[k] = rhs[k] - x0 * *Mki0++ - x1 * *Mki1++; + + } + return; +} + +/*! \brief + * + *
+ * Solves a dense upper triangular system. The upper triangular matrix is
+ * stored in a 2-dim array M(1:ldm,1:ncol). The solution will be returned
+ * in the rhs vector.
+ * </pre>
+ */
+void
+susolve (
+    int ldm,    /* in */
+    int ncol,   /* in */
+    float *M,   /* in */
+    float *rhs  /* modified */
+)
+{
+    float xj;
+    int jcol, j, irow;
+
+    jcol = ncol - 1;
+
+    for (j = 0; j < ncol; j++) {
+
+        xj = rhs[jcol] / M[jcol + jcol*ldm]; /* M(jcol, jcol) */
+        rhs[jcol] = xj;
+
+        for (irow = 0; irow < jcol; irow++)
+            rhs[irow] -= xj * M[irow + jcol*ldm]; /* M(irow, jcol) */
+
+        jcol--;
+
+    }
+    return;
+}
+
+
+/*! \brief
+ *
+ * <pre>
+ * Performs a dense matrix-vector multiply: Mxvec = Mxvec + M * vec.
+ * The input matrix is M(1:nrow,1:ncol); the product is returned in Mxvec[].
+ * </pre>
+ */ +void smatvec ( + int ldm, /* in -- leading dimension of M */ + int nrow, /* in */ + int ncol, /* in */ + float *M, /* in */ + float *vec, /* in */ + float *Mxvec /* in/out */ +) +{ + float vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi7; + float *M0; + register float *Mki0, *Mki1, *Mki2, *Mki3, *Mki4, *Mki5, *Mki6, *Mki7; + register int firstcol = 0; + int k; + + M0 = &M[0]; + while ( firstcol < ncol - 7 ) { /* Do 8 columns */ + + Mki0 = M0; + Mki1 = Mki0 + ldm; + Mki2 = Mki1 + ldm; + Mki3 = Mki2 + ldm; + Mki4 = Mki3 + ldm; + Mki5 = Mki4 + ldm; + Mki6 = Mki5 + ldm; + Mki7 = Mki6 + ldm; + + vi0 = vec[firstcol++]; + vi1 = vec[firstcol++]; + vi2 = vec[firstcol++]; + vi3 = vec[firstcol++]; + vi4 = vec[firstcol++]; + vi5 = vec[firstcol++]; + vi6 = vec[firstcol++]; + vi7 = vec[firstcol++]; + + for (k = 0; k < nrow; k++) + Mxvec[k] += vi0 * *Mki0++ + vi1 * *Mki1++ + + vi2 * *Mki2++ + vi3 * *Mki3++ + + vi4 * *Mki4++ + vi5 * *Mki5++ + + vi6 * *Mki6++ + vi7 * *Mki7++; + + M0 += 8 * ldm; + } + + while ( firstcol < ncol - 3 ) { /* Do 4 columns */ + + Mki0 = M0; + Mki1 = Mki0 + ldm; + Mki2 = Mki1 + ldm; + Mki3 = Mki2 + ldm; + + vi0 = vec[firstcol++]; + vi1 = vec[firstcol++]; + vi2 = vec[firstcol++]; + vi3 = vec[firstcol++]; + for (k = 0; k < nrow; k++) + Mxvec[k] += vi0 * *Mki0++ + vi1 * *Mki1++ + + vi2 * *Mki2++ + vi3 * *Mki3++ ; + + M0 += 4 * ldm; + } + + while ( firstcol < ncol ) { /* Do 1 column */ + + Mki0 = M0; + vi0 = vec[firstcol++]; + for (k = 0; k < nrow; k++) + Mxvec[k] += vi0 * *Mki0++; + + M0 += ldm; + } + return; +} + diff --git a/SRC/snrformat_loc3d.c b/SRC/snrformat_loc3d.c new file mode 100644 index 00000000..57874544 --- /dev/null +++ b/SRC/snrformat_loc3d.c @@ -0,0 +1,315 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + + +/*! @file + * \brief Preprocessing routines for the 3D factorization/solve codes: + * - Gather {A,B} from 3D grid to 2D process layer 0 + * - Scatter B (solution) from 2D process layer 0 to 3D grid + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Oak Ridge National Lab.
+ * May 12, 2021
+ * </pre>
+ */
+
+#include "superlu_sdefs.h"
+
+/* Dst <- BlockByBlock (Src), reshape the block storage. */
+static void matCopy(int n, int m, float *Dst, int lddst, float *Src, int ldsrc)
+{
+    for (int j = 0; j < m; j++)
+        for (int i = 0; i < n; ++i)
+        {
+            Dst[i + lddst * j] = Src[i + ldsrc * j];
+        }
+
+    return;
+}
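
A minimal sketch of how matCopy behaves (an editorial example, not part of the patch; the buffer sizes and values are invented): it copies an n-by-m column-major block between buffers with different leading dimensions.

    /* Hypothetical buffers: Src is 4x5 column-major (leading dimension 4),
       Dst is a tight 2x3 buffer (leading dimension 2). */
    float Src[4*5], Dst[2*3];
    for (int k = 0; k < 4*5; ++k) Src[k] = (float) k;
    matCopy(2, 3, Dst, 2, Src, 4);   /* copy the top-left 2x3 block */
    /* Now Dst[i + 2*j] == Src[i + 4*j], i.e. Dst == {0,1, 4,5, 8,9}. */
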
+
+/*
+ * Gather {A,B} from 3D grid to 2D process layer 0
+ *     Input:  {A, B, ldb} are distributed on 3D process grid
+ *     Output: {A2d, B2d} are distributed on layer 0 2D process grid
+ *             output is in the returned A3d->{} structure.
+ *             see supermatrix.h for nrformat_loc3d{} structure.
+ */
+NRformat_loc3d *sGatherNRformat_loc3d(NRformat_loc *A, // input, on 3D grid
+                                      float *B,       // input
+				      int ldb, int nrhs, // input
+                                      gridinfo3d_t *grid3d)
+{
+    NRformat_loc3d *A3d = SUPERLU_MALLOC(sizeof(NRformat_loc3d));
+    NRformat_loc *A2d = SUPERLU_MALLOC(sizeof(NRformat_loc));
+    A3d->m_loc = A->m_loc;
+    A3d->B3d = (float *) B; // on 3D process grid
+    A3d->ldb = ldb;
+    A3d->nrhs = nrhs;
+
+    // find number of nnzs
+    int_t *nnz_counts; // local nonzero counts, one entry per z-process
+    int_t *row_counts; // local row counts, one entry per z-process
+    int *nnz_counts_int, *row_counts_int; // 32-bit copies for MPI_Gatherv
+    int *nnz_disp, *row_disp; // displacements into the gathered arrays
+    int *b_counts_int; // local B entry counts (scaled by nrhs), per z-process
+    int *b_disp;       // displacements for B, also scaled by nrhs
+
+    nnz_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t));
+    row_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t));
+    nnz_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
+    row_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
+    b_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
+    MPI_Gather(&A->nnz_loc, 1, mpi_int_t, nnz_counts,
+               1, mpi_int_t, 0, grid3d->zscp.comm);
+    MPI_Gather(&A->m_loc, 1, mpi_int_t, row_counts,
+               1, mpi_int_t, 0, grid3d->zscp.comm);
+    nnz_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
+    row_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
+    b_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
+
+    nnz_disp[0] = 0;
+    row_disp[0] = 0;
+    b_disp[0] = 0;
+    for (int i = 0; i < grid3d->npdep; i++)
+    {
+        nnz_disp[i + 1] = nnz_disp[i] + nnz_counts[i];
+        row_disp[i + 1] = row_disp[i] + row_counts[i];
+        b_disp[i + 1] = nrhs * row_disp[i + 1];
+        nnz_counts_int[i] = nnz_counts[i];
+        row_counts_int[i] = row_counts[i];
+        b_counts_int[i] = nrhs * row_counts[i];
+    }
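+    /* Worked example (invented sizes): npdep = 3, row_counts = {5,4,6},
+     * nrhs = 2 give row_disp = {0,5,9,15} and b_disp = {0,10,18,30};
+     * z-process i's rows start at offset row_disp[i] in the gathered A2d,
+     * and its B entries at offset b_disp[i] in the gathered B1. */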
+
+    if (grid3d->zscp.Iam == 0)
+    {
+        A2d->colind = SUPERLU_MALLOC(nnz_disp[grid3d->npdep] * sizeof(int_t));
+        A2d->nzval = SUPERLU_MALLOC(nnz_disp[grid3d->npdep] * sizeof(float));
+        A2d->rowptr = SUPERLU_MALLOC((row_disp[grid3d->npdep] + 1) * sizeof(int_t));
+        A2d->rowptr[0] = 0;
+    }
+
+    MPI_Gatherv(A->nzval, A->nnz_loc, MPI_FLOAT, A2d->nzval,
+                nnz_counts_int, nnz_disp,
+                MPI_FLOAT, 0, grid3d->zscp.comm);
+    MPI_Gatherv(A->colind, A->nnz_loc, mpi_int_t, A2d->colind,
+                nnz_counts_int, nnz_disp,
+                mpi_int_t, 0, grid3d->zscp.comm);
+    MPI_Gatherv(&A->rowptr[1], A->m_loc, mpi_int_t, &A2d->rowptr[1],
+                row_counts_int, row_disp,
+                mpi_int_t, 0, grid3d->zscp.comm);
+
+    if (grid3d->zscp.Iam == 0)
+    {
+        for (int i = 0; i < grid3d->npdep; i++)
+        {
+            for (int j = row_disp[i] + 1; j < row_disp[i + 1] + 1; j++)
+            {
+                // A2d->rowptr[j] += row_disp[i];
+                A2d->rowptr[j] += nnz_disp[i];
+            }
+        }
+        A2d->nnz_loc = nnz_disp[grid3d->npdep];
+        A2d->m_loc = row_disp[grid3d->npdep];
+
+        if (grid3d->rankorder == 1) { // XY-major
+            A2d->fst_row = A->fst_row;
+        } else { // Z-major
+            gridinfo_t *grid2d = &(grid3d->grid2d);
+            int procs2d = grid2d->nprow * grid2d->npcol;
+            int m_loc_2d = A2d->m_loc;
+            int *m_loc_2d_counts = SUPERLU_MALLOC(procs2d * sizeof(int));
+
+            MPI_Allgather(&m_loc_2d, 1, MPI_INT, m_loc_2d_counts, 1, 
+	                  MPI_INT, grid2d->comm);
+
+            int fst_row = 0;
+            for (int p = 0; p < procs2d; ++p)
+            {
+                if (grid2d->iam == p)
+                    A2d->fst_row = fst_row;
+                fst_row += m_loc_2d_counts[p];
+            }
+
+            SUPERLU_FREE(m_loc_2d_counts);
+        }
+    }
+
+    // Btmp <- compact(B): pack the m_loc-by-nrhs local B (leading dimension ldb) contiguously
+    float *Btmp;
+    Btmp = SUPERLU_MALLOC(A->m_loc * nrhs * sizeof(float));
+    matCopy(A->m_loc, nrhs, Btmp, A->m_loc, B, ldb);
+
+    float *B1 = NULL;
+    if (grid3d->zscp.Iam == 0)
+    {
+        B1 = SUPERLU_MALLOC(A2d->m_loc * nrhs * sizeof(float));
+        A3d->B2d = (float *) SUPERLU_MALLOC(A2d->m_loc * nrhs * sizeof(float));
+    }
+
+    // B1 <- gatherv(Btmp)
+    MPI_Gatherv(Btmp, nrhs * A->m_loc, MPI_FLOAT, B1,
+                b_counts_int, b_disp,
+                MPI_FLOAT, 0, grid3d->zscp.comm);
+
+    // B2d <- colMajor(B1)
+    if (grid3d->zscp.Iam == 0)
+    {
+        for (int i = 0; i < grid3d->npdep; ++i)
+        {
+            matCopy(row_counts_int[i], nrhs, ((float*)A3d->B2d) + row_disp[i],
+		    A2d->m_loc, B1 + nrhs * row_disp[i], row_counts_int[i]);
+        }
+
+        SUPERLU_FREE(B1);
+    }
+
+    A3d->A_nfmt = A2d;
+    A3d->b_counts_int = b_counts_int;
+    A3d->b_disp = b_disp;
+    A3d->row_counts_int = row_counts_int;
+    A3d->row_disp = row_disp;
+
+    /* free storage */
+    SUPERLU_FREE(nnz_counts);
+    SUPERLU_FREE(nnz_counts_int);
+    SUPERLU_FREE(row_counts);
+    SUPERLU_FREE(nnz_disp);
+    SUPERLU_FREE(Btmp);
+
+    return A3d;
+
+} /* sGatherNRformat_loc3d */
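
A hedged usage sketch of the gather/solve/scatter round trip (editorial, not part of the patch; `A`, `B`, `ldb`, `nrhs` and `grid3d` are assumed to be set up already, and the layer-0 solve is only indicated):

    NRformat_loc3d *A3d = sGatherNRformat_loc3d(A, B, ldb, nrhs, grid3d);
    if (grid3d->zscp.Iam == 0) {
        /* Layer 0 now owns the 2D-distributed matrix A3d->A_nfmt and the
           right-hand side A3d->B2d; a 2D driver would be called here. */
    }
    sScatter_B3d(A3d, grid3d);   /* solution is redistributed back into B */
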
+
+/*
+ * Scatter B (solution) from 2D process layer 0 to 3D grid
+ *   Output: B3d <- A^{-1} * B2d, i.e. the solution redistributed to the 3D grid
+ */
+int sScatter_B3d(NRformat_loc3d *A3d,  // modified
+		 gridinfo3d_t *grid3d)
+{
+    float *B = (float *) A3d->B3d; // on 3D grid
+    int ldb = A3d->ldb;
+    int nrhs = A3d->nrhs;
+    float *B2d = (float *) A3d->B2d; // on 2D layer 0 
+    NRformat_loc A2d = *(A3d->A_nfmt);
+
+    /* Counts and displacements of local rows, one entry per z-process */
+    int m_loc = A3d->m_loc;
+    int *b_counts_int = A3d->b_counts_int;
+    int *b_disp = A3d->b_disp;
+    int *row_counts_int = A3d->row_counts_int;
+    int *row_disp = A3d->row_disp;
+    int i, p;
+    int iam = grid3d->iam;
+    int rankorder = grid3d->rankorder;
+    gridinfo_t *grid2d = &(grid3d->grid2d);
+
+    float *B1 = NULL;  // on 2D layer 0
+    if (grid3d->zscp.Iam == 0)
+    {
+        B1 = SUPERLU_MALLOC(A2d.m_loc * nrhs * sizeof(float));
+    }
+
+    // B1 <- BlockByBlock(B2d)
+    if (grid3d->zscp.Iam == 0)
+    {
+        for (i = 0; i < grid3d->npdep; ++i)
+        {
+            matCopy(row_counts_int[i], nrhs, B1 + nrhs * row_disp[i], row_counts_int[i],
+                    B2d + row_disp[i], A2d.m_loc);
+        }
+    }
+
+    float *Btmp; // on 3D grid
+    Btmp = SUPERLU_MALLOC(A3d->m_loc * nrhs * sizeof(float));
+
+    // Btmp <- scatterv(B1), block-by-block
+    if ( rankorder == 1 ) { /* XY-major in 3D grid */
+        /*    e.g. 1x3x4 grid: layer0 layer1 layer2 layer3
+	 *                     0      1      2      3
+	 *                     4      5      6      7
+	 *                     8      9      10     11
+	 */
+        MPI_Scatterv(B1, b_counts_int, b_disp, MPI_FLOAT,
+		     Btmp, nrhs * A3d->m_loc, MPI_FLOAT,
+		     0, grid3d->zscp.comm);
+
+    } else { /* Z-major in 3D grid */
+        /*    e.g. 1x3x4 grid: layer0 layer1 layer2 layer3
+	                       0      3      6      9
+ 	                       1      4      7      10      
+	                       2      5      8      11
+	  GATHER:  {A, B} in A * X = B
+	  layer-0:
+    	       B (row space)  X (column space)  SCATTER
+	       ----           ----        ---->>
+           P0  0              0
+(equations     3              1      Proc 0 -> Procs {0, 1, 2, 3}
+ reordered     6              2
+ after gather) 9              3
+	       ----           ----
+	   P1  1              4      Proc 1 -> Procs {4, 5, 6, 7}
+	       4              5
+               7              6
+               10             7
+	       ----           ----
+	   P2  2              8      Proc 2 -> Procs {8, 9, 10, 11}
+	       5              9
+	       8             10
+	       11            11
+	       ----         ----
+	*/
+        MPI_Request recv_req;
+	MPI_Status recv_status;
+	int pxy = grid2d->nprow * grid2d->npcol;
+	int npdep = grid3d->npdep, dest, src, tag;
+	int nprocs = pxy * npdep;
+
+	/* Everyone receives one block (post non-blocking irecv) */
+	src = grid3d->iam / npdep;  // Z-major
+	tag = iam;
+	MPI_Irecv(Btmp, nrhs * A3d->m_loc, MPI_FLOAT,
+		 src, tag, grid3d->comm, &recv_req);
+
+	/* Layer 0 sends to npdep procs */
+	if (grid3d->zscp.Iam == 0) {
+	    int dest, tag;
+	    for (p = 0; p < npdep; ++p) { // send to npdep procs
+	        dest = p + grid2d->iam * npdep; // Z-major order
+		tag = dest;
+
+		MPI_Send(B1 + b_disp[p], b_counts_int[p], 
+			 MPI_FLOAT, dest, tag, grid3d->comm);
+	    }
+	}  /* end layer 0 send */
+    
+	/* Wait for Irecv to complete */
+	MPI_Wait(&recv_req, &recv_status);
+
+    } /* else Z-major */
+
+    // B <- colMajor(Btmp)
+    matCopy(A3d->m_loc, nrhs, B, ldb, Btmp, A3d->m_loc);
+
+    /* free storage */
+    SUPERLU_FREE(A3d->b_counts_int);
+    SUPERLU_FREE(A3d->b_disp);
+    SUPERLU_FREE(A3d->row_counts_int);
+    SUPERLU_FREE(A3d->row_disp);
+    SUPERLU_FREE(Btmp);
+    if (grid3d->zscp.Iam == 0) SUPERLU_FREE(B1);
+
+    return 0;
+} /* sScatter_B3d */
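
The Z-major pairing above can be sanity-checked with a standalone sketch (editorial; it verifies only the index arithmetic used by the send and receive sides, for the 1x3x4 example in the comment):

    #include <stdio.h>

    int main(void) {
        int pxy = 3, npdep = 4;
        /* Layer-0 process q sends block p to rank dest = p + q*npdep;
           each receiver computes its sender as src = rank / npdep.
           The loop confirms src == q for every destination rank. */
        for (int q = 0; q < pxy; q++)
            for (int p = 0; p < npdep; p++) {
                int dest = p + q * npdep;
                printf("q=%d sends to rank %d, which computes src=%d\n",
                       q, dest, dest / npdep);
            }
        return 0;
    }
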
diff --git a/SRC/sreadMM.c b/SRC/sreadMM.c
new file mode 100644
index 00000000..4b309a98
--- /dev/null
+++ b/SRC/sreadMM.c
@@ -0,0 +1,244 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+
+/*! @file
+ * \brief Read a sparse matrix stored in Matrix Market format
+ * Contributed by Francois-Henry Rouet.
+ *
+ */
+#include <ctype.h>
+#include <stdlib.h>
+#include "superlu_sdefs.h"
+
+#undef EXPAND_SYM
+
+/*! \brief
+ *
+ * <pre>
+ * Output parameters
+ * =================
+ *   (nzval, rowind, colptr): (*rowind)[*] contains the row subscripts of
+ *      nonzeros in columns of matrix A; (*nzval)[*] the numerical values;
+ *	column i of A is given by (*nzval)[k], k = (*rowind)[i],...,
+ *      (*rowind)[i+1]-1.
+ * </pre>
+ */ + +void +sreadMM_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz, + float **nzval, int_t **rowind, int_t **colptr) +{ + int_t j, k, jsize, nnz, nz, new_nonz; + float *a, *val; + int_t *asub, *xa, *row, *col; + int_t zero_base = 0; + char *p, line[512], banner[64], mtx[64], crd[64], arith[64], sym[64]; + int expand; + char *cs; + + /* File format: + * %%MatrixMarket matrix coordinate real general/symmetric/... + * % ... + * % (optional comments) + * % ... + * #rows #non-zero + * Triplet in the rest of lines: row col value + */ + + /* 1/ read header */ + cs = fgets(line,512,fp); + for (p=line; *p!='\0'; *p=tolower(*p),p++); + + if (sscanf(line, "%s %s %s %s %s", banner, mtx, crd, arith, sym) != 5) { + printf("Invalid header (first line does not contain 5 tokens)\n"); + exit(-1); + } + + if(strcmp(banner,"%%matrixmarket")) { + printf("Invalid header (first token is not \"%%%%MatrixMarket\")\n"); + exit(-1); + } + + if(strcmp(mtx,"matrix")) { + printf("Not a matrix; this driver cannot handle that.\n"); + exit(-1); + } + + if(strcmp(crd,"coordinate")) { + printf("Not in coordinate format; this driver cannot handle that.\n"); + exit(-1); + } + + if(strcmp(arith,"real")) { + if(!strcmp(arith,"complex")) { + printf("Complex matrix; use zreadMM instead!\n"); + exit(-1); + } + else if(!strcmp(arith, "pattern")) { + printf("Pattern matrix; values are needed!\n"); + exit(-1); + } + else { + printf("Unknown arithmetic\n"); + exit(-1); + } + } + + if(strcmp(sym,"general")) { + printf("Symmetric matrix: will be expanded\n"); + expand=1; + } else + expand=0; + + /* 2/ Skip comments */ + while(banner[0]=='%') { + cs = fgets(line,512,fp); + sscanf(line,"%s",banner); + } + + /* 3/ Read n and nnz */ +#ifdef _LONGINT + sscanf(line, "%ld%ld%ld",m, n, nonz); +#else + sscanf(line, "%d%d%d",m, n, nonz); +#endif + + if(*m!=*n) { + printf("Rectangular matrix!. Abort\n"); + exit(-1); + } + + if(expand) + new_nonz = 2 * *nonz - *n; + else + new_nonz = *nonz; + + *m = *n; + printf("m %lld, n %lld, nonz %lld\n", (long long) *m, (long long) *n, (long long) *nonz); + fflush(stdout); + sallocateA_dist(*n, new_nonz, nzval, rowind, colptr); /* Allocate storage */ + a = *nzval; + asub = *rowind; + xa = *colptr; + + if ( !(val = floatMalloc_dist(new_nonz)) ) + ABORT("Malloc fails for val[]"); + if ( !(row = (int_t *) intMalloc_dist(new_nonz)) ) + ABORT("Malloc fails for row[]"); + if ( !(col = (int_t *) intMalloc_dist(new_nonz)) ) + ABORT("Malloc fails for col[]"); + + for (j = 0; j < *n; ++j) xa[j] = 0; + + /* 4/ Read triplets of values */ + for (nnz = 0, nz = 0; nnz < *nonz; ++nnz) { + + j = fscanf(fp, IFMT IFMT "%f\n", &row[nz], &col[nz], &val[nz]); + + if ( nnz == 0 ) /* first nonzero */ { + if ( row[0] == 0 || col[0] == 0 ) { + zero_base = 1; + printf("triplet file: row/col indices are zero-based.\n"); + } else + printf("triplet file: row/col indices are one-based.\n"); + fflush(stdout); + } + + if ( !zero_base ) { + /* Change to 0-based indexing. 
*/ + --row[nz]; + --col[nz]; + } + + if (row[nz] < 0 || row[nz] >= *m || col[nz] < 0 || col[nz] >= *n + /*|| val[nz] == 0.*/) { + fprintf(stderr, "nz " IFMT ", (" IFMT ", " IFMT ") = %e out of bound, removed\n", + nz, row[nz], col[nz], val[nz]); + exit(-1); + } else { + ++xa[col[nz]]; + if(expand) { + if ( row[nz] != col[nz] ) { /* Excluding diagonal */ + ++nz; + row[nz] = col[nz-1]; + col[nz] = row[nz-1]; + val[nz] = val[nz-1]; + ++xa[col[nz]]; + } + } + ++nz; + } + } + + *nonz = nz; + if(expand) { + printf("new_nonz after symmetric expansion:\t" IFMT "\n", *nonz); + fflush(stdout); + } + + + /* Initialize the array of column pointers */ + k = 0; + jsize = xa[0]; + xa[0] = 0; + for (j = 1; j < *n; ++j) { + k += jsize; + jsize = xa[j]; + xa[j] = k; + } + + /* Copy the triplets into the column oriented storage */ + for (nz = 0; nz < *nonz; ++nz) { + j = col[nz]; + k = xa[j]; + asub[k] = row[nz]; + a[k] = val[nz]; + ++xa[j]; + } + + /* Reset the column pointers to the beginning of each column */ + for (j = *n; j > 0; --j) + xa[j] = xa[j-1]; + xa[0] = 0; + + SUPERLU_FREE(val); + SUPERLU_FREE(row); + SUPERLU_FREE(col); + +#ifdef CHK_INPUT + int i; + for (i = 0; i < *n; i++) { + printf("Col %d, xa %d\n", i, xa[i]); + for (k = xa[i]; k < xa[i+1]; k++) + printf("%d\t%16.10f\n", asub[k], a[k]); + } +#endif + +} + + +static void sreadrhs(int m, float *b) +{ + FILE *fp, *fopen(); + int i; + + if ( !(fp = fopen("b.dat", "r")) ) { + fprintf(stderr, "sreadrhs: file does not exist\n"); + exit(-1); + } + for (i = 0; i < m; ++i) + i = fscanf(fp, "%lf\n", &b[i]); + /*fscanf(fp, "%d%lf\n", &j, &b[i]);*/ + /* readpair_(j, &b[i]);*/ + fclose(fp); +} diff --git a/SRC/sreadhb.c b/SRC/sreadhb.c new file mode 100644 index 00000000..4476af93 --- /dev/null +++ b/SRC/sreadhb.c @@ -0,0 +1,389 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Read a FLOAT PRECISION matrix stored in Harwell-Boeing format + * + *
+ * -- Distributed SuperLU routine (version 1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * 
+ */ +#include +#include +#include +#include "superlu_sdefs.h" + +/* + * Prototypes + */ +static void ReadVector(FILE *, int_t, int_t *, int_t, int_t); +static void sReadValues(FILE *, int_t, float *, int_t, int_t); +static void FormFullA(int_t, int_t *, float **, int_t **, int_t **); +static int DumpLine(FILE *); +static int ParseIntFormat(char *, int_t *, int_t *); +static int ParseFloatFormat(char *, int_t *, int_t *); + +/*! \brief + * + *
+ * Purpose
+ * =======
+ *
+ * Read a FLOAT PRECISION matrix stored in Harwell-Boeing format
+ * as described below.
+ *
+ * Line 1 (A72,A8)
+ *  	Col. 1 - 72   Title (TITLE)
+ *	Col. 73 - 80  Key (KEY)
+ *
+ * Line 2 (5I14)
+ * 	Col. 1 - 14   Total number of lines excluding header (TOTCRD)
+ * 	Col. 15 - 28  Number of lines for pointers (PTRCRD)
+ * 	Col. 29 - 42  Number of lines for row (or variable) indices (INDCRD)
+ * 	Col. 43 - 56  Number of lines for numerical values (VALCRD)
+ *	Col. 57 - 70  Number of lines for right-hand sides (RHSCRD)
+ *                    (including starting guesses and solution vectors
+ *		       if present)
+ *           	      (zero indicates no right-hand side data is present)
+ *
+ * Line 3 (A3, 11X, 4I14)
+ *   	Col. 1 - 3    Matrix type (see below) (MXTYPE)
+ * 	Col. 15 - 28  Number of rows (or variables) (NROW)
+ * 	Col. 29 - 42  Number of columns (or elements) (NCOL)
+ *	Col. 43 - 56  Number of row (or variable) indices (NNZERO)
+ *	              (equal to number of entries for assembled matrices)
+ * 	Col. 57 - 70  Number of elemental matrix entries (NELTVL)
+ *	              (zero in the case of assembled matrices)
+ * Line 4 (2A16, 2A20)
+ * 	Col. 1 - 16   Format for pointers (PTRFMT)
+ *	Col. 17 - 32  Format for row (or variable) indices (INDFMT)
+ *	Col. 33 - 52  Format for numerical values of coefficient matrix (VALFMT)
+ * 	Col. 53 - 72 Format for numerical values of right-hand sides (RHSFMT)
+ *
+ * Line 5 (A3, 11X, 2I14) Only present if there are right-hand sides present
+ *    	Col. 1 	      Right-hand side type:
+ *	         	  F for full storage or M for same format as matrix
+ *    	Col. 2        G if a starting vector(s) (Guess) is supplied. (RHSTYP)
+ *    	Col. 3        X if an exact solution vector(s) is supplied.
+ *	Col. 15 - 28  Number of right-hand sides (NRHS)
+ *	Col. 29 - 42  Number of row indices (NRHSIX)
+ *          	      (ignored in case of unassembled matrices)
+ *
+ * The three character type field on line 3 describes the matrix type.
+ * The following table lists the permitted values for each of the three
+ * characters. As an example of the type field, RSA denotes that the matrix
+ * is real, symmetric, and assembled.
+ *
+ * First Character:
+ *	R Real matrix
+ *	C Complex matrix
+ *	P Pattern only (no numerical values supplied)
+ *
+ * Second Character:
+ *	S Symmetric
+ *	U Unsymmetric
+ *	H Hermitian
+ *	Z Skew symmetric
+ *	R Rectangular
+ *
+ * Third Character:
+ *	A Assembled
+ *	E Elemental matrices (unassembled)
+ * </pre>
+ */ + +void +sreadhb_dist(int iam, FILE *fp, int_t *nrow, int_t *ncol, int_t *nonz, + float **nzval, int_t **rowind, int_t **colptr) +{ + + register int_t i, numer_lines, rhscrd = 0; + int_t tmp, colnum, colsize, rownum, rowsize, valnum, valsize; + char buf[100], type[4]; + int_t sym; + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(0, "Enter sreadhb_dist()"); +#endif + + /* Line 1 */ + fgets(buf, 100, fp); + + /* Line 2 */ + for (i=0; i<5; i++) { + fscanf(fp, "%14c", buf); buf[14] = 0; + tmp = atoi(buf); /*sscanf(buf, "%d", &tmp);*/ + if (i == 3) numer_lines = tmp; + if (i == 4 && tmp) rhscrd = tmp; + } + DumpLine(fp); + + /* Line 3 */ + fscanf(fp, "%3c", type); + fscanf(fp, "%11c", buf); /* pad */ + type[3] = 0; +#if ( DEBUGlevel>=1 ) + if ( !iam ) printf("Matrix type %s\n", type); +#endif + + fscanf(fp, "%14c", buf); *nrow = atoi(buf); + fscanf(fp, "%14c", buf); *ncol = atoi(buf); + fscanf(fp, "%14c", buf); *nonz = atoi(buf); + fscanf(fp, "%14c", buf); tmp = atoi(buf); + + if (tmp != 0) + if ( !iam ) printf("This is not an assembled matrix!\n"); + if (*nrow != *ncol) + if ( !iam ) printf("Matrix is not square.\n"); + DumpLine(fp); + + /* Allocate storage for the three arrays ( nzval, rowind, colptr ) */ + sallocateA_dist(*ncol, *nonz, nzval, rowind, colptr); + + /* Line 4: format statement */ + fscanf(fp, "%16c", buf); + ParseIntFormat(buf, &colnum, &colsize); + fscanf(fp, "%16c", buf); + ParseIntFormat(buf, &rownum, &rowsize); + fscanf(fp, "%20c", buf); + ParseFloatFormat(buf, &valnum, &valsize); + fscanf(fp, "%20c", buf); + DumpLine(fp); + + /* Line 5: right-hand side */ + if ( rhscrd ) DumpLine(fp); /* skip RHSFMT */ + +#if ( DEBUGlevel>=1 ) + if ( !iam ) { + printf(IFMT " rows, " IFMT " nonzeros\n", *nrow, *nonz); + printf("colnum " IFMT ", colsize " IFMT "\n", colnum, colsize); + printf("rownum " IFMT ", rowsize " IFMT "\n", rownum, rowsize); + printf("valnum " IFMT ", valsize " IFMT "\n", valnum, valsize); + } +#endif + + ReadVector(fp, *ncol+1, *colptr, colnum, colsize); +#if ( DEBUGlevel>=1 ) + if ( !iam ) printf("read colptr[" IFMT "] = " IFMT "\n", *ncol, (*colptr)[*ncol]); +#endif + ReadVector(fp, *nonz, *rowind, rownum, rowsize); +#if ( DEBUGlevel>=1 ) + if ( !iam ) printf("read rowind[" IFMT "] = " IFMT "\n", *nonz-1, (*rowind)[*nonz-1]); +#endif + if ( numer_lines ) { + sReadValues(fp, *nonz, *nzval, valnum, valsize); +#if ( DEBUGlevel>=1 ) + if ( !iam ) printf("read nzval[" IFMT "] = %e\n", *nonz-1, (*nzval)[*nonz-1]); +#endif + } + + sym = (type[1] == 'S' || type[1] == 's'); + if ( sym ) { + FormFullA(*ncol, nonz, nzval, rowind, colptr); + } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC(0, "Exit sreadhb_dist()"); +#endif +} + +/* Eat up the rest of the current line */ +static int DumpLine(FILE *fp) +{ + register int c; + while ((c = fgetc(fp)) != '\n') ; + return 0; +} + +static int ParseIntFormat(char *buf, int_t *num, int_t *size) +{ + char *tmp; + + tmp = buf; + while (*tmp++ != '(') ; + *num = atoi(tmp); + while (*tmp != 'I' && *tmp != 'i') ++tmp; + ++tmp; + *size = atoi(tmp); + return 0; +} + +static int ParseFloatFormat(char *buf, int_t *num, int_t *size) +{ + char *tmp, *period; + + tmp = buf; + while (*tmp++ != '(') ; + *num = atoi(tmp); + while (*tmp != 'E' && *tmp != 'e' && *tmp != 'D' && *tmp != 'd' + && *tmp != 'F' && *tmp != 'f') { + /* May find kP before nE/nD/nF, like (1P6F13.6). In this case the + num picked up refers to P, which should be skipped. 
*/ + if (*tmp=='p' || *tmp=='P') { + ++tmp; + *num = atoi(tmp); /*sscanf(tmp, "%d", num);*/ + } else { + ++tmp; + } + } + ++tmp; + period = tmp; + while (*period != '.' && *period != ')') ++period ; + *period = '\0'; + *size = atoi(tmp); + + return 0; +} + +static void +ReadVector(FILE *fp, int_t n, int_t *where, int_t perline, int_t persize) +{ + register int_t i, j, item; + char tmp, buf[100]; + + i = 0; + while (i < n) { + fgets(buf, 100, fp); /* read a line at a time */ + for (j=0; j + * On input, nonz/nzval/rowind/colptr represents lower part of a symmetric + * matrix. On exit, it represents the full matrix with lower and upper parts. + *
+ */ +static void +FormFullA(int_t n, int_t *nonz, float **nzval, int_t **rowind, int_t **colptr) +{ + register int_t i, j, k, col, new_nnz; + int_t *t_rowind, *t_colptr, *al_rowind, *al_colptr, *a_rowind, *a_colptr; + int_t *marker; + float *t_val, *al_val, *a_val; + + al_rowind = *rowind; + al_colptr = *colptr; + al_val = *nzval; + + if ( !(marker =(int_t *) SUPERLU_MALLOC( (n+1) * sizeof(int_t)) ) ) + ABORT("SUPERLU_MALLOC fails for marker[]"); + if ( !(t_colptr = (int_t *) SUPERLU_MALLOC( (n+1) * sizeof(int_t)) ) ) + ABORT("SUPERLU_MALLOC t_colptr[]"); + if ( !(t_rowind = (int_t *) SUPERLU_MALLOC( *nonz * sizeof(int_t)) ) ) + ABORT("SUPERLU_MALLOC fails for t_rowind[]"); + if ( !(t_val = (float*) SUPERLU_MALLOC( *nonz * sizeof(float)) ) ) + ABORT("SUPERLU_MALLOC fails for t_val[]"); + + /* Get counts of each column of T, and set up column pointers */ + for (i = 0; i < n; ++i) marker[i] = 0; + for (j = 0; j < n; ++j) { + for (i = al_colptr[j]; i < al_colptr[j+1]; ++i) + ++marker[al_rowind[i]]; + } + t_colptr[0] = 0; + for (i = 0; i < n; ++i) { + t_colptr[i+1] = t_colptr[i] + marker[i]; + marker[i] = t_colptr[i]; + } + + /* Transpose matrix A to T */ + for (j = 0; j < n; ++j) + for (i = al_colptr[j]; i < al_colptr[j+1]; ++i) { + col = al_rowind[i]; + t_rowind[marker[col]] = j; + t_val[marker[col]] = al_val[i]; + ++marker[col]; + } + + new_nnz = *nonz * 2 - n; + if ( !(a_colptr = (int_t *) SUPERLU_MALLOC( (n+1) * sizeof(int_t)) ) ) + ABORT("SUPERLU_MALLOC a_colptr[]"); + if ( !(a_rowind = (int_t *) SUPERLU_MALLOC( new_nnz * sizeof(int_t)) ) ) + ABORT("SUPERLU_MALLOC fails for a_rowind[]"); + if ( !(a_val = (float*) SUPERLU_MALLOC( new_nnz * sizeof(float)) ) ) + ABORT("SUPERLU_MALLOC fails for a_val[]"); + + a_colptr[0] = 0; + k = 0; + for (j = 0; j < n; ++j) { + for (i = t_colptr[j]; i < t_colptr[j+1]; ++i) { + if ( t_rowind[i] != j ) { /* not diagonal */ + a_rowind[k] = t_rowind[i]; + a_val[k] = t_val[i]; +#if (DEBUGlevel >= 2) + if ( fabs(a_val[k]) < 4.047e-300 ) + printf("%5d: %e\n", k, a_val[k]); +#endif + ++k; + } + } + + for (i = al_colptr[j]; i < al_colptr[j+1]; ++i) { + a_rowind[k] = al_rowind[i]; + a_val[k] = al_val[i]; +#if (DEBUGlevel >= 2) + if ( fabs(a_val[k]) < 4.047e-300 ) + printf("%5d: %e\n", k, a_val[k]); +#endif + ++k; + } + + a_colptr[j+1] = k; + } + + printf("FormFullA: new_nnz = " IFMT ", k = " IFMT "\n", new_nnz, k); + + SUPERLU_FREE(al_val); + SUPERLU_FREE(al_rowind); + SUPERLU_FREE(al_colptr); + SUPERLU_FREE(marker); + SUPERLU_FREE(t_val); + SUPERLU_FREE(t_rowind); + SUPERLU_FREE(t_colptr); + + *nzval = a_val; + *rowind = a_rowind; + *colptr = a_colptr; + *nonz = new_nnz; +} diff --git a/SRC/sreadrb.c b/SRC/sreadrb.c new file mode 100644 index 00000000..7165536e --- /dev/null +++ b/SRC/sreadrb.c @@ -0,0 +1,346 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file sreadrb.c + * \brief Read a matrix stored in Rutherford-Boeing format + * + *
+ * -- Distributed SuperLU routine (version 4.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * August 15, 2014
+ *
+ * 
+ * + * Purpose + * ======= + * + * Read a FLOAT PRECISION matrix stored in Rutherford-Boeing format + * as described below. + * + * Line 1 (A72, A8) + * Col. 1 - 72 Title (TITLE) + * Col. 73 - 80 Matrix name / identifier (MTRXID) + * + * Line 2 (I14, 3(1X, I13)) + * Col. 1 - 14 Total number of lines excluding header (TOTCRD) + * Col. 16 - 28 Number of lines for pointers (PTRCRD) + * Col. 30 - 42 Number of lines for row (or variable) indices (INDCRD) + * Col. 44 - 56 Number of lines for numerical values (VALCRD) + * + * Line 3 (A3, 11X, 4(1X, I13)) + * Col. 1 - 3 Matrix type (see below) (MXTYPE) + * Col. 15 - 28 Compressed Column: Number of rows (NROW) + * Elemental: Largest integer used to index variable (MVAR) + * Col. 30 - 42 Compressed Column: Number of columns (NCOL) + * Elemental: Number of element matrices (NELT) + * Col. 44 - 56 Compressed Column: Number of entries (NNZERO) + * Elemental: Number of variable indeces (NVARIX) + * Col. 58 - 70 Compressed Column: Unused, explicitly zero + * Elemental: Number of elemental matrix entries (NELTVL) + * + * Line 4 (2A16, A20) + * Col. 1 - 16 Fortran format for pointers (PTRFMT) + * Col. 17 - 32 Fortran format for row (or variable) indices (INDFMT) + * Col. 33 - 52 Fortran format for numerical values of coefficient matrix + * (VALFMT) + * (blank in the case of matrix patterns) + * + * The three character type field on line 3 describes the matrix type. + * The following table lists the permitted values for each of the three + * characters. As an example of the type field, RSA denotes that the matrix + * is real, symmetric, and assembled. + * + * First Character: + * R Real matrix + * C Complex matrix + * I integer matrix + * P Pattern only (no numerical values supplied) + * Q Pattern only (numerical values supplied in associated auxiliary value + * file) + * + * Second Character: + * S Symmetric + * U Unsymmetric + * H Hermitian + * Z Skew symmetric + * R Rectangular + * + * Third Character: + * A Compressed column form + * E Elemental form + * + *
+ */ + +#include +#include +#include "superlu_sdefs.h" + +/*! \brief Eat up the rest of the current line */ +static int DumpLine(FILE *fp) +{ + register int c; + while ((c = fgetc(fp)) != '\n') ; + return 0; +} + +static int ParseIntFormat(char *buf, int_t *num, int_t *size) +{ + char *tmp; + + tmp = buf; + while (*tmp++ != '(') ; + *num = atoi(tmp); + while (*tmp != 'I' && *tmp != 'i') ++tmp; + ++tmp; + *size = atoi(tmp); + return 0; +} + +static int ParseFloatFormat(char *buf, int_t *num, int_t *size) +{ + char *tmp, *period; + + tmp = buf; + while (*tmp++ != '(') ; + *num = atoi(tmp); /*sscanf(tmp, "%d", num);*/ + while (*tmp != 'E' && *tmp != 'e' && *tmp != 'D' && *tmp != 'd' + && *tmp != 'F' && *tmp != 'f') { + /* May find kP before nE/nD/nF, like (1P6F13.6). In this case the + num picked up refers to P, which should be skipped. */ + if (*tmp=='p' || *tmp=='P') { + ++tmp; + *num = atoi(tmp); /*sscanf(tmp, "%d", num);*/ + } else { + ++tmp; + } + } + ++tmp; + period = tmp; + while (*period != '.' && *period != ')') ++period ; + *period = '\0'; + *size = atoi(tmp); /*sscanf(tmp, "%2d", size);*/ + + return 0; +} + +static int ReadVector(FILE *fp, int_t n, int_t *where, int_t perline, int_t persize) +{ + register int_t i, j, item; + char tmp, buf[100]; + + i = 0; + while (i < n) { + fgets(buf, 100, fp); /* read a line at a time */ + for (j=0; j + * On input, nonz/nzval/rowind/colptr represents lower part of a symmetric + * matrix. On exit, it represents the full matrix with lower and upper parts. + *
+ */ +static void +FormFullA(int_t n, int_t *nonz, float **nzval, int_t **rowind, int_t **colptr) +{ + register int_t i, j, k, col, new_nnz; + int_t *t_rowind, *t_colptr, *al_rowind, *al_colptr, *a_rowind, *a_colptr; + int_t *marker; + float *t_val, *al_val, *a_val; + + al_rowind = *rowind; + al_colptr = *colptr; + al_val = *nzval; + + if ( !(marker = (int_t *) SUPERLU_MALLOC( (n+1) * sizeof(int_t)) ) ) + ABORT("SUPERLU_MALLOC fails for marker[]"); + if ( !(t_colptr = (int_t *) SUPERLU_MALLOC( (n+1) * sizeof(int_t)) ) ) + ABORT("SUPERLU_MALLOC t_colptr[]"); + if ( !(t_rowind = (int_t *) SUPERLU_MALLOC( *nonz * sizeof(int_t)) ) ) + ABORT("SUPERLU_MALLOC fails for t_rowind[]"); + if ( !(t_val = (float*) SUPERLU_MALLOC( *nonz * sizeof(float)) ) ) + ABORT("SUPERLU_MALLOC fails for t_val[]"); + + /* Get counts of each column of T, and set up column pointers */ + for (i = 0; i < n; ++i) marker[i] = 0; + for (j = 0; j < n; ++j) { + for (i = al_colptr[j]; i < al_colptr[j+1]; ++i) + ++marker[al_rowind[i]]; + } + t_colptr[0] = 0; + for (i = 0; i < n; ++i) { + t_colptr[i+1] = t_colptr[i] + marker[i]; + marker[i] = t_colptr[i]; + } + + /* Transpose matrix A to T */ + for (j = 0; j < n; ++j) + for (i = al_colptr[j]; i < al_colptr[j+1]; ++i) { + col = al_rowind[i]; + t_rowind[marker[col]] = j; + t_val[marker[col]] = al_val[i]; + ++marker[col]; + } + + new_nnz = *nonz * 2 - n; + if ( !(a_colptr = (int_t *) SUPERLU_MALLOC( (n+1) * sizeof(int_t)) ) ) + ABORT("SUPERLU_MALLOC a_colptr[]"); + if ( !(a_rowind = (int_t *) SUPERLU_MALLOC( new_nnz * sizeof(int_t)) ) ) + ABORT("SUPERLU_MALLOC fails for a_rowind[]"); + if ( !(a_val = (float*) SUPERLU_MALLOC( new_nnz * sizeof(float)) ) ) + ABORT("SUPERLU_MALLOC fails for a_val[]"); + + a_colptr[0] = 0; + k = 0; + for (j = 0; j < n; ++j) { + for (i = t_colptr[j]; i < t_colptr[j+1]; ++i) { + if ( t_rowind[i] != j ) { /* not diagonal */ + a_rowind[k] = t_rowind[i]; + a_val[k] = t_val[i]; + ++k; + } + } + + for (i = al_colptr[j]; i < al_colptr[j+1]; ++i) { + a_rowind[k] = al_rowind[i]; + a_val[k] = al_val[i]; + ++k; + } + + a_colptr[j+1] = k; + } + + printf("FormFullA: new_nnz = " IFMT ", k = " IFMT "\n", new_nnz, k); + + SUPERLU_FREE(al_val); + SUPERLU_FREE(al_rowind); + SUPERLU_FREE(al_colptr); + SUPERLU_FREE(marker); + SUPERLU_FREE(t_val); + SUPERLU_FREE(t_rowind); + SUPERLU_FREE(t_colptr); + + *nzval = a_val; + *rowind = a_rowind; + *colptr = a_colptr; + *nonz = new_nnz; +} + +void +sreadrb_dist(int iam, FILE *fp, int_t *nrow, int_t *ncol, int_t *nonz, + float **nzval, int_t **rowind, int_t **colptr) +{ + register int_t i, numer_lines = 0; + int_t tmp, colnum, colsize, rownum, rowsize, valnum, valsize; + char buf[100], type[4]; + int sym; + + /* Line 1 */ + fgets(buf, 100, fp); + fputs(buf, stdout); + + /* Line 2 */ + for (i=0; i<4; i++) { + fscanf(fp, "%14c", buf); buf[14] = 0; + tmp = atoi(buf); /*sscanf(buf, "%d", &tmp);*/ + if (i == 3) numer_lines = tmp; + } + DumpLine(fp); + + /* Line 3 */ + fscanf(fp, "%3c", type); + fscanf(fp, "%11c", buf); /* pad */ + type[3] = 0; +#if (DEBUGlevel >= 1) + if ( !iam ) printf("Matrix type %s\n", type); +#endif + + fscanf(fp, "%14c", buf); *nrow = atoi(buf); + fscanf(fp, "%14c", buf); *ncol = atoi(buf); + fscanf(fp, "%14c", buf); *nonz = atoi(buf); + fscanf(fp, "%14c", buf); tmp = atoi(buf); + + if (tmp != 0) + if ( !iam ) printf("This is not an assembled matrix!\n"); + if (*nrow != *ncol) + if ( !iam ) printf("Matrix is not square.\n"); + DumpLine(fp); + + /* Allocate storage for the three arrays ( nzval, rowind, colptr ) */ + 
+    sallocateA_dist(*ncol, *nonz, nzval, rowind, colptr);
+
+    /* Line 4: format statement */
+    fscanf(fp, "%16c", buf);
+    ParseIntFormat(buf, &colnum, &colsize);
+    fscanf(fp, "%16c", buf);
+    ParseIntFormat(buf, &rownum, &rowsize);
+    fscanf(fp, "%20c", buf);
+    ParseFloatFormat(buf, &valnum, &valsize);
+    DumpLine(fp);
+
+#if (DEBUGlevel >= 1)
+    if ( !iam ) {
+        printf(IFMT " rows, " IFMT " nonzeros\n", *nrow, *nonz);
+        printf("colnum " IFMT ", colsize " IFMT "\n", colnum, colsize);
+        printf("rownum " IFMT ", rowsize " IFMT "\n", rownum, rowsize);
+        printf("valnum " IFMT ", valsize " IFMT "\n", valnum, valsize);
+    }
+#endif
+
+    ReadVector(fp, *ncol+1, *colptr, colnum, colsize);
+    ReadVector(fp, *nonz, *rowind, rownum, rowsize);
+    if ( numer_lines ) {
+        sReadValues(fp, *nonz, *nzval, valnum, valsize);
+    }
+
+    sym = (type[1] == 'S' || type[1] == 's');
+    if ( sym ) {
+        FormFullA(*ncol, nonz, nzval, rowind, colptr);
+    }
+
+}
diff --git a/SRC/sreadtriple.c b/SRC/sreadtriple.c
new file mode 100644
index 00000000..18ec7ca6
--- /dev/null
+++ b/SRC/sreadtriple.c
@@ -0,0 +1,180 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief
+ *
+ */
+#include <stdio.h>
+#include "superlu_sdefs.h"
+
+#undef EXPAND_SYM
+
+/*! brief
+ *
+ * <pre>
+ * Output parameters
+ * =================
+ *   (nzval, rowind, colptr): (*rowind)[*] contains the row subscripts of
+ *      nonzeros in columns of matrix A; (*nzval)[*] the numerical values;
+ *      column i of A is given by (*nzval)[k], k = (*colptr)[i],...,
+ *      (*colptr)[i+1]-1.
+ *
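+ *   For illustration (an assumed example, not part of the file format),
+ *   the 3 x 3 matrix
+ *      [ 1 0 2 ]
+ *      [ 0 3 0 ]
+ *      [ 4 0 5 ]
+ *   would be stored as
+ *      nzval  = { 1, 4, 3, 2, 5 }
+ *      rowind = { 0, 2, 1, 0, 2 }
+ *      colptr = { 0, 2, 3, 5 }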
+ * </pre>
+ */
+
+void
+sreadtriple_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz,
+                 float **nzval, int_t **rowind, int_t **colptr)
+{
+    int_t  j, k, jsize, nnz, nz, new_nonz;
+    float  *a, *val;
+    int_t  *asub, *xa, *row, *col;
+    int_t  zero_base = 0;
+
+    /* File format:
+     *    First line:  #rows    #non-zero
+     *    Triplet in the rest of lines:
+     *                 row    col    value
+     */
+
+#ifdef _LONGINT
+    fscanf(fp, "%ld%ld%ld", m, n, nonz);
+#else
+    fscanf(fp, "%d%d%d", m, n, nonz);
+#endif
+
+#ifdef EXPAND_SYM
+    new_nonz = 2 * *nonz - *n;
+#else
+    new_nonz = *nonz;
+#endif
+    *m = *n;
+    printf("m %lld, n %lld, nonz %lld\n", (long long) *m, (long long) *n, (long long) *nonz);
+    sallocateA_dist(*n, new_nonz, nzval, rowind, colptr); /* Allocate storage */
+    a    = *nzval;
+    asub = *rowind;
+    xa   = *colptr;
+
+    if ( !(val = (float *) SUPERLU_MALLOC(new_nonz * sizeof(float))) )
+        ABORT("Malloc fails for val[]");
+    if ( !(row = (int_t *) SUPERLU_MALLOC(new_nonz * sizeof(int_t))) )
+        ABORT("Malloc fails for row[]");
+    if ( !(col = (int_t *) SUPERLU_MALLOC(new_nonz * sizeof(int_t))) )
+        ABORT("Malloc fails for col[]");
+
+    for (j = 0; j < *n; ++j) xa[j] = 0;
+
+    /* Read into the triplet array from a file */
+    for (nnz = 0, nz = 0; nnz < *nonz; ++nnz) {
+
+#ifdef _LONGINT
+        fscanf(fp, "%ld%ld%f\n", &row[nz], &col[nz], &val[nz]);
+#else // int
+        fscanf(fp, "%d%d%f\n", &row[nz], &col[nz], &val[nz]);
+#endif
+
+        if ( nnz == 0 ) { /* first nonzero */
+            if ( row[0] == 0 || col[0] == 0 ) {
+                zero_base = 1;
+                printf("triplet file: row/col indices are zero-based.\n");
+            } else {
+                printf("triplet file: row/col indices are one-based.\n");
+            }
+        }
+
+        if ( !zero_base ) {
+            /* Change to 0-based indexing. */
+            --row[nz];
+            --col[nz];
+        }
+
+        if (row[nz] < 0 || row[nz] >= *m || col[nz] < 0 || col[nz] >= *n
+            /*|| val[nz] == 0.*/) {
+            fprintf(stderr, "nz " IFMT ", (" IFMT ", " IFMT ") = %e out of bound, removed\n",
+                    nz, row[nz], col[nz], val[nz]);
+            exit(-1);
+        } else {
+            ++xa[col[nz]];
+#ifdef EXPAND_SYM
+            if ( row[nz] != col[nz] ) { /* Excluding diagonal */
+                ++nz;
+                row[nz] = col[nz-1];
+                col[nz] = row[nz-1];
+                val[nz] = val[nz-1];
+                ++xa[col[nz]];
+            }
+#endif
+            ++nz;
+        }
+    }
+
+    *nonz = nz;
+#ifdef EXPAND_SYM
+    printf("new_nonz after symmetric expansion:\t" IFMT "\n", *nonz);
+#endif
+
+
+    /* Initialize the array of column pointers */
+    k = 0;
+    jsize = xa[0];
+    xa[0] = 0;
+    for (j = 1; j < *n; ++j) {
+        k += jsize;
+        jsize = xa[j];
+        xa[j] = k;
+    }
+
+    /* Copy the triplets into the column oriented storage */
+    for (nz = 0; nz < *nonz; ++nz) {
+        j = col[nz];
+        k = xa[j];
+        asub[k] = row[nz];
+        a[k] = val[nz];
+        ++xa[j];
+    }
+
+    /* Reset the column pointers to the beginning of each column */
+    for (j = *n; j > 0; --j)
+        xa[j] = xa[j-1];
+    xa[0] = 0;
+
+    SUPERLU_FREE(val);
+    SUPERLU_FREE(row);
+    SUPERLU_FREE(col);
+
+#ifdef CHK_INPUT
+    int i;
+    for (i = 0; i < *n; i++) {
+        printf("Col %d, xa %d\n", i, xa[i]);
+        for (k = xa[i]; k < xa[i+1]; k++)
+            printf("%d\t%16.10f\n", asub[k], a[k]);
+    }
+#endif
+
+}
+
+
+void sreadrhs(int m, float *b)
+{
+    FILE *fp, *fopen();
+    int i;
+
+    if ( !(fp = fopen("b.dat", "r")) ) {
+        fprintf(stderr, "sreadrhs: file does not exist\n");
+        exit(-1);
+    }
+    for (i = 0; i < m; ++i)
+        fscanf(fp, "%f\n", &b[i]);
+        /* readpair_(j, &b[i]);*/
+
+    fclose(fp);
+}
+
+
diff --git a/SRC/sreadtriple_noheader.c b/SRC/sreadtriple_noheader.c
new file mode 100644
index 00000000..91bd58c3
--- /dev/null
+++ b/SRC/sreadtriple_noheader.c
@@ -0,0 +1,199 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief
+ *
+ */
+#include <stdio.h>
+#include "superlu_sdefs.h"
+
+#undef EXPAND_SYM
+
+/*! brief
+ *
+ * <pre>
+ * Output parameters
+ * =================
+ *   (nzval, rowind, colptr): (*rowind)[*] contains the row subscripts of
+ *      nonzeros in columns of matrix A; (*nzval)[*] the numerical values;
+ *      column i of A is given by (*nzval)[k], k = (*colptr)[i],...,
+ *      (*colptr)[i+1]-1.
+ * </pre>
+ */
+
+void
+sreadtriple_noheader(FILE *fp, int_t *m, int_t *n, int_t *nonz,
+                     float **nzval, int_t **rowind, int_t **colptr)
+{
+    int_t  i, j, k, jsize, lasta, nnz, nz, new_nonz, minn = 100;
+    float  *a, *val, vali;
+    int_t  *asub, *xa, *row, *col;
+    int    zero_base = 0, ret_val = 0;
+
+    /* File format: Triplet in a line for each nonzero entry:
+     *     row    col    value
+     * or  row    col    real_part    imaginary_part
+     */
+
+    /* First pass: determine N and NNZ */
+    nz = *n = 0;
+
+#ifdef _LONGINT
+    ret_val = fscanf(fp, "%ld%ld%f\n", &i, &j, &vali);
+#else // int
+    ret_val = fscanf(fp, "%d%d%f\n", &i, &j, &vali);
+#endif
+
+    while (ret_val != EOF) {
+        *n = SUPERLU_MAX(*n, i);
+        *n = SUPERLU_MAX(*n, j);
+        minn = SUPERLU_MIN(minn, i);
+        minn = SUPERLU_MIN(minn, j);
+        ++nz;
+
+#ifdef _LONGINT
+        ret_val = fscanf(fp, "%ld%ld%f\n", &i, &j, &vali);
+#else // int
+        ret_val = fscanf(fp, "%d%d%f\n", &i, &j, &vali);
+#endif
+    }
+
+    if ( minn == 0 ) { /* zero-based indexing */
+        zero_base = 1;
+        ++(*n);
+        printf("triplet file: row/col indices are zero-based.\n");
+    } else {
+        printf("triplet file: row/col indices are one-based.\n");
+    }
+
+    *m = *n;
+    *nonz = nz;
+    rewind(fp);
+
+#ifdef EXPAND_SYM
+    new_nonz = 2 * *nonz - *n;
+#else
+    new_nonz = *nonz;
+#endif
+
+    /* Second pass: read the actual matrix values */
+    printf("m %ld, n %ld, nonz %ld\n", (long int) *m, (long int) *n, (long int) *nonz);
+    sallocateA_dist(*n, new_nonz, nzval, rowind, colptr); /* Allocate storage */
+    a    = *nzval;
+    asub = *rowind;
+    xa   = *colptr;
+
+    if ( !(val = (float *) SUPERLU_MALLOC(new_nonz * sizeof(float))) )
+        ABORT("Malloc fails for val[]");
+    if ( !(row = (int_t *) SUPERLU_MALLOC(new_nonz * sizeof(int_t))) )
+        ABORT("Malloc fails for row[]");
+    if ( !(col = (int_t *) SUPERLU_MALLOC(new_nonz * sizeof(int_t))) )
+        ABORT("Malloc fails for col[]");
+
+    for (j = 0; j < *n; ++j) xa[j] = 0;
+
+    /* Read into the triplet array from a file */
+    for (nnz = 0, nz = 0; nnz < *nonz; ++nnz) {
+#ifdef _LONGINT
+        fscanf(fp, "%ld%ld%f\n", &row[nz], &col[nz], &val[nz]);
+#else // int32
+        fscanf(fp, "%d%d%f\n", &row[nz], &col[nz], &val[nz]);
+#endif
+
+        if ( !zero_base ) {
+            /* Change to 0-based indexing. */
+            --row[nz];
+            --col[nz];
+        }
+
+        if (row[nz] < 0 || row[nz] >= *m || col[nz] < 0 || col[nz] >= *n
+            /*|| val[nz] == 0.*/) {
+            fprintf(stderr, "nz " IFMT ", (" IFMT ", " IFMT ") = %e out of bound, removed\n",
+                    nz, row[nz], col[nz], val[nz]);
+            exit(-1);
+        } else {
+            ++xa[col[nz]];
+#ifdef EXPAND_SYM
+            if ( row[nz] != col[nz] ) { /* Excluding diagonal */
+                ++nz;
+                row[nz] = col[nz-1];
+                col[nz] = row[nz-1];
+                val[nz] = val[nz-1];
+                ++xa[col[nz]];
+            }
+#endif
+            ++nz;
+        }
+    }
+
+    *nonz = nz;
+#ifdef EXPAND_SYM
+    printf("new_nonz after symmetric expansion:\t" IFMT "\n", *nonz);
+#endif
+
+
+    /* Initialize the array of column pointers */
+    k = 0;
+    jsize = xa[0];
+    xa[0] = 0;
+    for (j = 1; j < *n; ++j) {
+        k += jsize;
+        jsize = xa[j];
+        xa[j] = k;
+    }
+
+    /* Copy the triplets into the column oriented storage */
+    for (nz = 0; nz < *nonz; ++nz) {
+        j = col[nz];
+        k = xa[j];
+        asub[k] = row[nz];
+        a[k] = val[nz];
+        ++xa[j];
+    }
+
+    /* Reset the column pointers to the beginning of each column */
+    for (j = *n; j > 0; --j)
+        xa[j] = xa[j-1];
+    xa[0] = 0;
+
+    SUPERLU_FREE(val);
+    SUPERLU_FREE(row);
+    SUPERLU_FREE(col);
+
+#ifdef CHK_INPUT
+    for (i = 0; i < *n; i++) {
+        printf("Col %d, xa %d\n", i, xa[i]);
+        for (k = xa[i]; k < xa[i+1]; k++)
+            printf("%d\t%16.10f\n", asub[k], a[k]);
+    }
+#endif
+
+}
+
+#if 0
+void sreadrhs(int m, float *b)
+{
+    FILE *fp, *fopen();
+    int i, j;
+
+    if ( !(fp = fopen("b.dat", "r")) ) {
+        fprintf(stderr, "sreadrhs: file does not exist\n");
+        exit(-1);
+    }
+    for (i = 0; i < m; ++i)
+        fscanf(fp, "%f\n", &b[i]);
+
+    fclose(fp);
+}
+#endif
+
diff --git a/SRC/sscatter.c b/SRC/sscatter.c
new file mode 100644
index 00000000..689d4f29
--- /dev/null
+++ b/SRC/sscatter.c
@@ -0,0 +1,524 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Scatter the computed blocks into LU destination.
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 6.1.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * October 1, 2014
+ *
+ * Modified:
+ *   September 18, 2017, enable SIMD vectorized scatter operation.
+ * </pre>
+ */
+#include <math.h>
+#include "superlu_sdefs.h"
+
+void
+sscatter_l_1 (int ib,
+           int ljb,
+           int nsupc,
+           int_t iukp,
+           int_t* xsup,
+           int klst,
+           int nbrow,
+           int_t lptr,
+           int temp_nbrow,
+           int * usub,
+           int * lsub,
+           float *tempv,
+           int * indirect_thread,
+           int_t ** Lrowind_bc_ptr, float **Lnzval_bc_ptr,
+	   gridinfo_t * grid)
+{
+    // TAU_STATIC_TIMER_START("SCATTER_LB");
+    // printf("hello\n");
+    int_t rel, i, segsize, jj;
+    float *nzval;
+    int_t *index = Lrowind_bc_ptr[ljb];
+    int_t ldv = index[1];       /* LDA of the dest lusup. */
+    int_t lptrj = BC_HEADER;
+    int_t luptrj = 0;
+    int_t ijb = index[lptrj];
+    while (ijb != ib)
+    {
+        /* Search for dest block --
+           blocks are not ordered! */
+        luptrj += index[lptrj + 1];
+        lptrj += LB_DESCRIPTOR + index[lptrj + 1];
+
+        ijb = index[lptrj];
+    }
+    /*
+     * Build indirect table. This is needed because the
+     * indices are not sorted for the L blocks.
+     */
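+    /* Illustrative example (assumed values): if this destination block
+       stores rows {7, 5, 9} in that order and fnz = 5, the loop below sets
+       indirect_thread[2] = 0, indirect_thread[0] = 1, indirect_thread[4] = 2,
+       so a source row r lands in local slot indirect_thread[r - fnz]. */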
+    int_t fnz = FstBlockC (ib);
+    lptrj += LB_DESCRIPTOR;
+    for (i = 0; i < index[lptrj - 1]; ++i)
+    {
+        rel = index[lptrj + i] - fnz;
+        indirect_thread[rel] = i;
+
+    }
+
+    nzval = Lnzval_bc_ptr[ljb] + luptrj;
+    // tempv =bigV + (cum_nrow + cum_ncol*nbrow);
+    for (jj = 0; jj < nsupc; ++jj)
+    {
+        segsize = klst - usub[iukp + jj];
+        // printf("segsize %d \n",segsize);
+        if (segsize) {
+            /*#pragma _CRI cache_bypass nzval,tempv */
+            for (i = 0; i < temp_nbrow; ++i) {
+                rel = lsub[lptr + i] - fnz;
+                nzval[indirect_thread[rel]] -= tempv[i];
+                // printf("i (src) %d, perm (dest) %d  \n",i,indirect_thread[rel]);
+#ifdef PI_DEBUG
+                double zz = 0.0;
+                // if(!(*(long*)&zz == *(long*)&tempv[i]) )
+                printf ("(%d %d, %0.3e, %0.3e, %3e ) ", ljb,
+                        nzval - Lnzval_bc_ptr[ljb] + indirect_thread[rel],
+                        nzval[indirect_thread[rel]] + tempv[i],
+                        nzval[indirect_thread[rel]],tempv[i]);
+                //printing triplets (location??, old value, new value ) if none of them is zero
+#endif
+            }
+            // printf("\n");
+            tempv += nbrow;
+#ifdef PI_DEBUG
+            // printf("\n");
+#endif
+        }
+        nzval += ldv;
+        // printf("%d\n",nzval );
+    }
+    // TAU_STATIC_TIMER_STOP("SCATTER_LB");
+} /* sscatter_l_1 */
+
+void
+sscatter_l (
+           int ib,    /* row block number of source block L(i,k) */
+           int ljb,   /* local column block number of dest. block L(i,j) */
+           int nsupc, /* number of columns in destination supernode */
+           int_t iukp, /* point to destination supernode's index[] */
+           int_t* xsup,
+           int klst,
+           int nbrow,  /* LDA of the block in tempv[] */
+           int_t lptr, /* Input, point to index[] location of block L(i,k) */
+	   int temp_nbrow, /* number of rows of source block L(i,k) */
+           int_t* usub,
+           int_t* lsub,
+           float *tempv,
+           int* indirect_thread,int* indirect2,
+           int_t ** Lrowind_bc_ptr, float **Lnzval_bc_ptr,
+           gridinfo_t * grid)
+{
+
+    int_t rel, i, segsize, jj;
+    float *nzval;
+    int_t *index = Lrowind_bc_ptr[ljb];
+    int_t ldv = index[1];       /* LDA of the destination lusup. */
+    int_t lptrj = BC_HEADER;
+    int_t luptrj = 0;
+    int_t ijb = index[lptrj];
+
+    while (ijb != ib)  /* Search for destination block L(i,j) */
+    {
+        luptrj += index[lptrj + 1];
+        lptrj += LB_DESCRIPTOR + index[lptrj + 1];
+        ijb = index[lptrj];
+    }
+
+    /*
+     * Build indirect table. This is needed because the indices are not sorted
+     * in the L blocks.
+     */
+    int_t fnz = FstBlockC (ib);
+    int_t dest_nbrow;
+    lptrj += LB_DESCRIPTOR;
+    dest_nbrow=index[lptrj - 1];
+
+#if (_OPENMP>=201307)
+#pragma omp simd
+#endif
+    for (i = 0; i < dest_nbrow; ++i) {
+        rel = index[lptrj + i] - fnz;
+        indirect_thread[rel] = i;
+
+    }
+
+#if (_OPENMP>=201307)
+#pragma omp simd
+#endif
+    /* can be precalculated? */
+    for (i = 0; i < temp_nbrow; ++i) { /* Source index is a subset of dest. */
+        rel = lsub[lptr + i] - fnz;
+        indirect2[i] =indirect_thread[rel];
+    }
+
+    nzval = Lnzval_bc_ptr[ljb] + luptrj; /* Destination block L(i,j) */
+#ifdef __INTEL_COMPILER
+#pragma ivdep
+#endif
+    for (jj = 0; jj < nsupc; ++jj) {
+        segsize = klst - usub[iukp + jj];
+        if (segsize) {
+#if (_OPENMP>=201307)
+#pragma omp simd
+#endif
+            for (i = 0; i < temp_nbrow; ++i) {
+                nzval[indirect2[i]] -= tempv[i];
+            }
+            tempv += nbrow;
+        }
+        nzval += ldv;
+    }
+
+} /* sscatter_l */
+
+
+void
+sscatter_u (int ib,
+           int jb,
+           int nsupc,
+           int_t iukp,
+           int_t * xsup,
+           int klst,
+ 	   int nbrow,      /* LDA of the block in tempv[] */
+           int_t lptr,     /* point to index location of block L(i,k) */
+	   int temp_nbrow, /* number of rows of source block L(i,k) */
+           int_t* lsub,
+           int_t* usub,
+           float* tempv,
+           int_t ** Ufstnz_br_ptr, float **Unzval_br_ptr,
+           gridinfo_t * grid)
+{
+#ifdef PI_DEBUG
+    printf ("A(%d,%d) goes to U block \n", ib, jb);
+#endif
+    // TAU_STATIC_TIMER_START("SCATTER_U");
+    // TAU_STATIC_TIMER_START("SCATTER_UB");
+
+    int_t jj, i, fnz, rel;
+    int segsize;
+    float *ucol;
+    int_t ilst = FstBlockC (ib + 1);
+    int_t lib = LBi (ib, grid);
+    int_t *index = Ufstnz_br_ptr[lib];
+
+    /* Reinitialize the pointers to the beginning of the k-th column/row of
+     * L/U factors.
+     * usub[] - index array for panel U(k,:)
+     */
+    int_t iuip_lib, ruip_lib;
+    iuip_lib = BR_HEADER;
+    ruip_lib = 0;
+
+    int_t ijb = index[iuip_lib];
+    while (ijb < jb) {   /* Search for destination block. */
+        ruip_lib += index[iuip_lib + 1];
+        // printf("supersize[%ld] \t:%ld \n",ijb,SuperSize( ijb ) );
+        iuip_lib += UB_DESCRIPTOR + SuperSize (ijb);
+        ijb = index[iuip_lib];
+    }
+    /* Skip descriptor. Now point to fstnz index of block U(i,j). */
+    iuip_lib += UB_DESCRIPTOR;
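+    /* Note: each column jj of block U(i,j) stores its segment in rows
+       fnz..ilst-1, which is why ruip_lib advances by (ilst - fnz) at the
+       bottom of the loop below. */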
+
+    // tempv = bigV + (cum_nrow + cum_ncol*nbrow);
+    for (jj = 0; jj < nsupc; ++jj) {
+        segsize = klst - usub[iukp + jj];
+        fnz = index[iuip_lib++];
+        if (segsize) {          /* Nonzero segment in U(k,j). */
+            ucol = &Unzval_br_ptr[lib][ruip_lib];
+
+            // printf("========Entering loop=========\n");
+#if (_OPENMP>=201307)
+#pragma omp simd
+#endif
+            for (i = 0; i < temp_nbrow; ++i) {
+                rel = lsub[lptr + i] - fnz;
+                // printf("%d %d %d %d %d \n",lptr,i,fnz,temp_nbrow,nbrow );
+                // printf("hello   ucol[%d] %d %d : \n",rel,lsub[lptr + i],fnz);
+                ucol[rel] -= tempv[i];
+
+#ifdef PI_DEBUG
+                double zz = 0.0;
+                if (!(*(long *) &zz == *(long *) &tempv[i]))
+                    printf ("(%d, %0.3e, %0.3e ) ", rel, ucol[rel] + tempv[i],
+                            ucol[rel]);
+                //printing triplets (location??, old value, new value ) if none of them is zero
+#endif
+            } /* for i = 0:temp_nbrow */
+            tempv += nbrow; /* Jump LDA to next column */
+#ifdef PI_DEBUG
+            // printf("\n");
+#endif
+        }  /* if segsize */
+
+        ruip_lib += ilst - fnz;
+
+    }  /* for jj = 0:nsupc */
+#ifdef PI_DEBUG
+    // printf("\n");
+#endif
+    // TAU_STATIC_TIMER_STOP("SCATTER_UB");
+} /* sscatter_u */
+
+
+/*Divide CPU-GPU dgemm work here*/
+#ifdef PI_DEBUG
+int Ngem = 2;
+// int_t Ngem = 0;
+int min_gpu_col = 6;
+#else
+
+    // int_t Ngem = 0;
+
+#endif
+
+
+#if 0 // Sherry: moved and corrected in util.c 
+#ifdef GPU_ACC
+
+void
+gemm_division_cpu_gpu(
+    int* num_streams_used,  /*number of streams that will be used */
+    int* stream_end_col,    /*array holding last column blk for each partition */
+    int * ncpu_blks,        /*Number of CPU dgemm blks */
+    /*input */
+    int nbrow,              /*number of row in A matrix */
+    int ldu,                /*number of k in dgemm */
+    int nstreams,
+    int* full_u_cols,       /*array containing prefix sum of work load */
+    int num_blks            /*Number of work load */
+)
+{
+    int Ngem = sp_ienv_dist(7);  /*get_mnk_dgemm ();*/
+    int min_gpu_col = get_cublas_nb ();
+
+    // Ngem = 1000000000;
+    /*
+       Ideally the CPU-to-GPU dgemm split should be 0:1 to hide the total
+       cost. However, the GPU launch latency is around 20,000 ns, in which
+       time roughly 200,000 floating-point operations can complete, so about
+       200,000/(2*nbrow*ldu) columns should be done on the CPU to hide that
+       latency; hence Ngem = 200,000/2.
+     */
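+    /* Worked example with assumed numbers: if sp_ienv_dist(7) returned
+       Ngem = 100000 and nbrow = 64, ldu = 32, then Ngem/(nbrow*ldu) =
+       100000/2048 ~ 48, so blocks stay on the CPU until the prefix sum in
+       full_u_cols[] first exceeds about 48 columns. */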
+    int i, j;
+
+    // {
+    //     *num_streams_used=0;
+    //     *ncpu_blks = num_blks;
+    //     return;
+    // }
+
+    for (int i = 0; i < nstreams; ++i)
+    {
+        stream_end_col[i] = num_blks;
+    }
+
+    *ncpu_blks = 0;
+    /* Easy return case 1: the number of columns is below the threshold. */
+    if (full_u_cols[num_blks - 1] < (Ngem / (nbrow * ldu)) || num_blks == 1 )
+    {
+        *num_streams_used = 0;
+        *ncpu_blks = num_blks;
+#ifdef PI_DEBUG
+        printf ("full_u_cols[num_blks-1] %d  %d \n",
+                full_u_cols[num_blks - 1], (Ngem / (nbrow * ldu)));
+        printf ("Early return \n");
+#endif
+        return;
+
+    }
+
+    /* Easy return case 2: the number of streams is 0. */
+    if (nstreams == 0)
+    {
+        *num_streams_used = 0;
+        *ncpu_blks = num_blks;
+        return;
+        /* code */
+    }
+    /* Find the first block whose prefix column count exceeds
+       Ngem / (nbrow * ldu). */
+
+    for (i = 0; i < num_blks - 1; ++i)  /* a binary search could be used here */
+    {
+        if (full_u_cols[i + 1] > Ngem / (nbrow * ldu))
+            break;
+    }
+    *ncpu_blks = i + 1;
+
+    int_t cols_remain =
+        full_u_cols[num_blks - 1] - full_u_cols[*ncpu_blks - 1];
+
+#ifdef PI_DEBUG
+    printf ("Remaining cols %d num_blks %d cpu_blks %d \n", cols_remain,
+            num_blks, *ncpu_blks);
+#endif
+    if (cols_remain > 0)
+    {
+        *num_streams_used = 1;  /* at least one stream will be used now */
+
+#ifdef PI_DEBUG
+        printf ("%d %d  %d %d \n", full_u_cols[num_blks - 1],
+                full_u_cols[*ncpu_blks], *ncpu_blks, nstreams);
+#endif
+        int_t FP_MIN = 200000 / (nbrow * ldu);
+        int_t cols_per_stream = SUPERLU_MAX (min_gpu_col, cols_remain / nstreams);
+        cols_per_stream = SUPERLU_MAX (cols_per_stream, FP_MIN);
+#ifdef PI_DEBUG
+        printf ("cols_per_stream :\t%d\n", cols_per_stream);
+#endif
+
+        int_t cutoff = cols_per_stream + full_u_cols[*ncpu_blks - 1];
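+        /* Example (assumed numbers): with cols_per_stream = 16 and
+           full_u_cols[*ncpu_blks - 1] = 48, stream 0 takes blocks until the
+           prefix sum first exceeds 64; the cutoff is then rebased to
+           full_u_cols[j] + cols_per_stream for the next stream. */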
+        for (int_t i = 0; i < nstreams; ++i)
+        {
+            stream_end_col[i] = num_blks;
+        }
+        j = *ncpu_blks;
+        for (i = 0; i < nstreams - 1; ++i)
+        {
+            int_t st = (i == 0) ? (*ncpu_blks) : stream_end_col[i - 1];
+
+            for (j = st; j < num_blks - 1; ++j)
+            {
+#ifdef PI_DEBUG
+                printf ("i %d, j %d, %d  %d ", i, j, full_u_cols[j + 1],
+                        cutoff);
+#endif
+                if (full_u_cols[j + 1] > cutoff)
+                {
+#ifdef PI_DEBUG
+                    printf ("cutoff met \n");
+#endif
+                    cutoff = cols_per_stream + full_u_cols[j];
+                    stream_end_col[i] = j + 1;
+                    *num_streams_used += 1;
+                    j++;
+                    break;
+                }
+#ifdef PI_DEBUG
+                printf ("\n");
+#endif
+            }
+
+        }
+
+    }
+}
+
+void
+gemm_division_new (int * num_streams_used,   /*number of streams that will be used */
+                   int * stream_end_col, /*array holding last column blk for each partition */
+                   int * ncpu_blks,  /*Number of CPU dgemm blks */
+                        /*input */
+                   int nbrow,    /*number of row in A matrix */
+                   int ldu,  /*number of k in dgemm */
+                   int nstreams,
+                   Ublock_info_t *Ublock_info,    /*array containing prefix sum of work load */
+                   int num_blks  /*Number of work load */
+    )
+{
+    int Ngem = sp_ienv_dist(7); /*get_mnk_dgemm ();*/
+    int min_gpu_col = get_cublas_nb ();
+
+    // Ngem = 1000000000;
+    /*
+       Ideally the CPU-to-GPU dgemm split should be 0:1 to hide the total
+       cost. However, the GPU launch latency is around 20,000 ns, in which
+       time roughly 200,000 floating-point operations can complete, so about
+       200,000/(2*nbrow*ldu) columns should be done on the CPU to hide that
+       latency; hence Ngem = 200,000/2.
+     */
+    int_t i, j;
+
+
+    for (int i = 0; i < nstreams; ++i)
+    {
+        stream_end_col[i] = num_blks;
+    }
+
+    *ncpu_blks = 0;
+    /* Easy return case 1: the number of columns is below the threshold. */
+    if (Ublock_info[num_blks - 1].full_u_cols < (Ngem / (nbrow * ldu)) || num_blks == 1)
+    {
+        *num_streams_used = 0;
+        *ncpu_blks = num_blks;
+
+        return;
+
+    }
+
+    /* Easy return case 2: the number of streams is 0. */
+    if (nstreams == 0)
+    {
+        *num_streams_used = 0;
+        *ncpu_blks = num_blks;
+        return;
+        /* code */
+    }
+    /* Find the first block whose prefix column count exceeds
+       Ngem / (nbrow * ldu). */
+
+    for (i = 0; i < num_blks - 1; ++i)  /* a binary search could be used here */
+    {
+        if (Ublock_info[i + 1].full_u_cols > Ngem / (nbrow * ldu))
+            break;
+    }
+    *ncpu_blks = i + 1;
+
+    int_t cols_remain =
+       Ublock_info [num_blks - 1].full_u_cols - Ublock_info[*ncpu_blks - 1].full_u_cols;
+
+    if (cols_remain > 0)
+    {
+        *num_streams_used = 1;  /* at least one stream will be used now */
+
+        int_t FP_MIN = 200000 / (nbrow * ldu);
+        int_t cols_per_stream = SUPERLU_MAX (min_gpu_col, cols_remain / nstreams);
+        cols_per_stream = SUPERLU_MAX (cols_per_stream, FP_MIN);
+
+        int_t cutoff = cols_per_stream + Ublock_info[*ncpu_blks - 1].full_u_cols;
+        for (int_t i = 0; i < nstreams; ++i)
+        {
+            stream_end_col[i] = num_blks;
+        }
+        j = *ncpu_blks;
+        for (i = 0; i < nstreams - 1; ++i)
+        {
+            int_t st = (i == 0) ? (*ncpu_blks) : stream_end_col[i - 1];
+
+            for (j = st; j < num_blks - 1; ++j)
+            {
+                if (Ublock_info[j + 1].full_u_cols > cutoff)
+                {
+
+                    cutoff = cols_per_stream + Ublock_info[j].full_u_cols;
+                    stream_end_col[i] = j + 1;
+                    *num_streams_used += 1;
+                    j++;
+                    break;
+                }
+
+            }
+
+        }
+
+    }
+}
+
+#endif  /* defined GPU_ACC */
+
+#endif // comment out the above code 
diff --git a/SRC/sscatter3d.c b/SRC/sscatter3d.c
new file mode 100644
index 00000000..c43a7460
--- /dev/null
+++ b/SRC/sscatter3d.c
@@ -0,0 +1,625 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Scatter the computed blocks into LU destination.
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ * </pre>
+ */
+
+#include "superlu_sdefs.h"
+//#include "scatter.h"
+//#include "compiler.h"
+
+//#include "cblas.h"
+
+
+#define ISORT
+#define SCATTER_U_CPU  scatter_u
+
+static void scatter_u (int_t ib, int_t jb, int_t nsupc, int_t iukp, int_t *xsup,
+                 int_t klst, int_t nbrow, int_t lptr, int_t temp_nbrow,
+ 		 int_t *lsub, int_t *usub, float *tempv,
+		 int *indirect,
+           	 int_t **Ufstnz_br_ptr, float **Unzval_br_ptr, gridinfo_t *grid);
+
+
+#if 0 /**** Sherry: this routine is moved to util.c ****/
+void
+arrive_at_ublock (int_t j,      //block number
+                  int_t *iukp,  // output
+                  int_t *rukp, int_t *jb,   /* Global block number of block U(k,j). */
+                  int_t *ljb,   /* Local block number of U(k,j). */
+                  int_t *nsupc,     /*supernode size of destination block */
+                  int_t iukp0,  //input
+                  int_t rukp0, int_t *usub,     /*usub scripts */
+                  int_t *perm_u,    /*permutation matrix */
+                  int_t *xsup,  /*for SuperSize and LBj */
+                  gridinfo_t *grid)
+{
+    int_t jj;
+    *iukp = iukp0;
+    *rukp = rukp0;
+
+#ifdef ISORT
+    for (jj = 0; jj < perm_u[j]; jj++)
+#else
+    for (jj = 0; jj < perm_u[2 * j + 1]; jj++)
+#endif
+    {
+
+        *jb = usub[*iukp];      /* Global block number of block U(k,j). */
+        *nsupc = SuperSize (*jb);
+        *iukp += UB_DESCRIPTOR; /* Start fstnz of block U(k,j). */
+        *rukp += usub[*iukp - 1];   /* Move to block U(k,j+1) */
+        *iukp += *nsupc;
+    }
+
+    /* Reinitialize the pointers to the beginning of the */
+    /* k-th column/row of the L/U factors               */
+    *jb = usub[*iukp];          /* Global block number of block U(k,j). */
+    *ljb = LBj (*jb, grid);     /* Local block number of U(k,j). */
+    *nsupc = SuperSize (*jb);
+    *iukp += UB_DESCRIPTOR;     /* Start fstnz of block U(k,j). */
+}
+#endif
+/*--------------------------------------------------------------*/
+
+void
+sblock_gemm_scatter( int_t lb, int_t j,
+                    Ublock_info_t *Ublock_info,
+                    Remain_info_t *Remain_info,
+                    float *L_mat, int ldl,
+                    float *U_mat, int ldu,
+                    float *bigV,
+                    // int_t jj0,
+                    int_t knsupc,  int_t klst,
+                    int_t *lsub, int_t *usub, int_t ldt,
+                    int_t thread_id,
+                    int *indirect,
+                    int *indirect2,
+                    int_t **Lrowind_bc_ptr, float **Lnzval_bc_ptr,
+                    int_t **Ufstnz_br_ptr, float **Unzval_br_ptr,
+                    int_t *xsup, gridinfo_t *grid,
+                    SuperLUStat_t *stat
+#ifdef SCATTER_PROFILE
+                    , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer
+#endif
+                  )
+{
+    // return ;
+#ifdef _OPENMP    
+    thread_id = omp_get_thread_num();
+#else    
+    thread_id = 0;
+#endif    
+    int *indirect_thread = indirect + ldt * thread_id;
+    int *indirect2_thread = indirect2 + ldt * thread_id;
+    float *tempv1 = bigV + thread_id * ldt * ldt;
+
+    /* Getting U block information */
+
+    int_t iukp =  Ublock_info[j].iukp;
+    int_t jb   =  Ublock_info[j].jb;
+    int_t nsupc = SuperSize(jb);
+    int_t ljb = LBj (jb, grid);
+    int_t st_col;
+    int ncols;
+    // if (j > jj0)
+    if (j > 0)
+    {
+        ncols  = Ublock_info[j].full_u_cols - Ublock_info[j - 1].full_u_cols;
+        st_col = Ublock_info[j - 1].full_u_cols;
+    }
+    else
+    {
+        ncols  = Ublock_info[j].full_u_cols;
+        st_col = 0;
+    }
+
+    /* Getting L block information */
+    int_t lptr = Remain_info[lb].lptr;
+    int_t ib   = Remain_info[lb].ib;
+    int temp_nbrow = lsub[lptr + 1];
+    lptr += LB_DESCRIPTOR;
+    int cum_nrow = (lb == 0 ? 0 : Remain_info[lb - 1].FullRow);
+    float alpha = 1.0, beta = 0.0;
+
+    /* calling SGEMM */
+    // printf(" m %d n %d k %d ldu %d ldl %d st_col %d \n",temp_nbrow,ncols,ldu,ldl,st_col );
+    superlu_sgemm("N", "N", temp_nbrow, ncols, ldu, alpha,
+                &L_mat[(knsupc - ldu)*ldl + cum_nrow], ldl,
+                &U_mat[st_col * ldu], ldu,
+                beta, tempv1, temp_nbrow);
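+    /* tempv1 (temp_nbrow x ncols) now holds the product of the L-panel rows
+       [cum_nrow, cum_nrow + temp_nbrow) with the ldu x ncols slice of U
+       starting at column st_col; it is scattered into an L or U destination
+       block below. */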
+    
+    // printf("SCU update: (%d, %d)\n",ib,jb );
+#ifdef SCATTER_PROFILE
+    double ttx = SuperLU_timer_();
+#endif
+    /*Now scattering the block*/
+    if (ib < jb)
+    {
+        SCATTER_U_CPU (
+            ib, jb,
+            nsupc, iukp, xsup,
+            klst, temp_nbrow,
+            lptr, temp_nbrow, lsub,
+            usub, tempv1,
+            indirect_thread,
+            Ufstnz_br_ptr,
+            Unzval_br_ptr,
+            grid
+        );
+    }
+    else
+    {
+        //scatter_l (    Sherry
+        sscatter_l (
+            ib, ljb, nsupc, iukp, xsup, klst, temp_nbrow, lptr,
+            temp_nbrow, usub, lsub, tempv1,
+            indirect_thread, indirect2_thread,
+            Lrowind_bc_ptr, Lnzval_bc_ptr, grid
+        );
+
+    }
+
+    // #pragma omp atomic
+    // stat->ops[FACT] += 2*temp_nbrow*ncols*ldu + temp_nbrow*ncols;
+
+#ifdef SCATTER_PROFILE
+    double t_s = SuperLU_timer_() - ttx;
+    Host_TheadScatterMOP[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
+    += 3.0 * (double ) temp_nbrow * (double ) ncols;
+    Host_TheadScatterTimer[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
+    += t_s;
+#endif
+} /* sblock_gemm_scatter */
+
+#ifdef _OPENMP
+/* This version uses a lock to prevent multiple threads from updating the same block. */
+void
+sblock_gemm_scatter_lock( int_t lb, int_t j,
+                         omp_lock_t* lock,
+                         Ublock_info_t *Ublock_info,
+                         Remain_info_t *Remain_info,
+                         float *L_mat, int_t ldl,
+                         float *U_mat, int_t ldu,
+                         float *bigV,
+                         // int_t jj0,
+                         int_t knsupc,  int_t klst,
+                         int_t *lsub, int_t *usub, int_t ldt,
+                         int_t thread_id,
+                         int *indirect,
+                         int *indirect2,
+                         int_t **Lrowind_bc_ptr, float **Lnzval_bc_ptr,
+                         int_t **Ufstnz_br_ptr, float **Unzval_br_ptr,
+                         int_t *xsup, gridinfo_t *grid
+#ifdef SCATTER_PROFILE
+                         , double *Host_TheadScatterMOP, double *Host_TheadScatterTimer
+#endif
+                       )
+{
+    int *indirect_thread = indirect + ldt * thread_id;
+    int *indirect2_thread = indirect2 + ldt * thread_id;
+    float *tempv1 = bigV + thread_id * ldt * ldt;
+
+    /* Getting U block information */
+
+    int_t iukp =  Ublock_info[j].iukp;
+    int_t jb   =  Ublock_info[j].jb;
+    int_t nsupc = SuperSize(jb);
+    int_t ljb = LBj (jb, grid);
+    int_t st_col = Ublock_info[j].StCol;
+    int_t ncols = Ublock_info[j].ncols;
+
+
+    /* Getting L block information */
+    int_t lptr = Remain_info[lb].lptr;
+    int_t ib   = Remain_info[lb].ib;
+    int temp_nbrow = lsub[lptr + 1];
+    lptr += LB_DESCRIPTOR;
+    int cum_nrow =  Remain_info[lb].StRow;
+
+    float alpha = 1.0, beta = 0.0;  /* float, to match superlu_sgemm's arguments */
+
+    /* calling SGEMM */
+    superlu_sgemm("N", "N", temp_nbrow, ncols, ldu, alpha,
+           &L_mat[(knsupc - ldu)*ldl + cum_nrow], ldl,
+           &U_mat[st_col * ldu], ldu, beta, tempv1, temp_nbrow);
+    
+    /*try to get the lock for the block*/
+    if (lock)       /*lock is not null*/
+        while (!omp_test_lock(lock))
+        {
+        }
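+    /* Note: this spins on omp_test_lock(); omp_set_lock(lock) would block
+       equivalently without busy-waiting, at the cost of a possible thread
+       reschedule. */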
+
+#ifdef SCATTER_PROFILE
+    double ttx = SuperLU_timer_();
+#endif
+    /*Now scattering the block*/
+    if (ib < jb)
+    {
+        SCATTER_U_CPU (
+            ib, jb,
+            nsupc, iukp, xsup,
+            klst, temp_nbrow,
+            lptr, temp_nbrow, lsub,
+            usub, tempv1,
+            indirect_thread,
+            Ufstnz_br_ptr,
+            Unzval_br_ptr,
+            grid
+        );
+    }
+    else
+    {
+        //scatter_l (  Sherry
+        sscatter_l ( 
+            ib, ljb, nsupc, iukp, xsup, klst, temp_nbrow, lptr,
+            temp_nbrow, usub, lsub, tempv1,
+            indirect_thread, indirect2_thread,
+            Lrowind_bc_ptr, Lnzval_bc_ptr, grid
+        );
+
+    }
+
+    if (lock)
+        omp_unset_lock(lock);
+
+#ifdef SCATTER_PROFILE
+    //double t_s = (double) __rdtsc() - ttx;
+    double t_s = SuperLU_timer_() - ttx;
+    Host_TheadScatterMOP[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
+    += 3.0 * (double ) temp_nbrow * (double ) ncols;
+    Host_TheadScatterTimer[thread_id * ((192 / 8) * (192 / 8)) + ((CEILING(temp_nbrow, 8) - 1)   +  (192 / 8) * (CEILING(ncols, 8) - 1))]
+    += t_s;
+#endif
+} /* sblock_gemm_scatter_lock */
+#endif  // Only if _OPENMP is defined
+
+
+// The following are the variations of the block_gemm_scatter call:
+/*
++---------------------------------------+
+|          ||                           |
+|  CPU     ||          CPU+TopRight     |
+|  Top     ||                           |
+|  Left    ||                           |
+|          ||                           |
++---------------------------------------+
++---------------------------------------+
+|          ||        |                  |
+|          ||        |                  |
+|          ||        |                  |
+|  CPU     ||  CPU   |Accelerator       |
+|  Bottom  ||  Bottom|                  |
+|  Left    ||  Right |                  |
+|          ||        |                  |
+|          ||        |                  |
++--------------------+------------------+
+                  jj_cpu
+*/
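+/* The four wrappers below select the (L buffer, U buffer) pair for one
+ * region of the Schur complement (an overview, inferred from the calls):
+ *   sblock_gemm_scatterTopLeft    : lookAhead_L_buff x bigU_host
+ *   sblock_gemm_scatterTopRight   : lookAhead_L_buff x bigU_Phi
+ *   sblock_gemm_scatterBottomLeft : Remain_L_buff    x bigU_host
+ *   sblock_gemm_scatterBottomRight: Remain_L_buff    x bigU_Phi
+ * Columns past jj_cpu may be offloaded to the accelerator instead.
+ */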
+
+int_t sblock_gemm_scatterTopLeft( int_t lb, /* block number in L */
+				 int_t j,  /* block number in U */
+                                 float* bigV, int_t knsupc,  int_t klst,
+				 int_t* lsub, int_t * usub, int_t ldt,
+				 int* indirect, int* indirect2, HyP_t* HyP,
+                                 sLUstruct_t *LUstruct,
+                                 gridinfo_t* grid,
+                                 SCT_t*SCT, SuperLUStat_t *stat
+                               )
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    float** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    float** Unzval_br_ptr = Llu->Unzval_br_ptr;
+#ifdef _OPENMP    
+    volatile int_t thread_id = omp_get_thread_num();
+#else    
+    volatile int_t thread_id = 0;
+#endif    
+    
+//    printf("Thread's ID %lld \n", thread_id);
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    sblock_gemm_scatter( lb, j, HyP->Ublock_info, HyP->lookAhead_info,
+			HyP->lookAhead_L_buff, HyP->Lnbrow,
+                        HyP->bigU_host, HyP->ldu,
+                        bigV, knsupc,  klst, lsub,  usub, ldt, thread_id,
+			indirect, indirect2,
+                        Lrowind_bc_ptr, Lnzval_bc_ptr, Ufstnz_br_ptr, Unzval_br_ptr,
+			xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                        , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
+#endif
+                      );
+    //unsigned long long t2 = _rdtsc();
+    double t2 = SuperLU_timer_();
+    SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1);
+    return 0;
+} /* sblock_gemm_scatterTopLeft */
+
+int_t sblock_gemm_scatterTopRight( int_t lb,  int_t j,
+                                  float* bigV, int_t knsupc,  int_t klst, int_t* lsub,
+                                  int_t* usub, int_t ldt, int* indirect, int* indirect2,
+                                  HyP_t* HyP,
+                                  sLUstruct_t *LUstruct,
+                                  gridinfo_t* grid,
+                                  SCT_t*SCT, SuperLUStat_t *stat
+                                )
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    float** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    float** Unzval_br_ptr = Llu->Unzval_br_ptr;
+#ifdef _OPENMP    
+    volatile  int_t thread_id = omp_get_thread_num();
+#else    
+    volatile  int_t thread_id = 0;
+#endif    
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    sblock_gemm_scatter( lb, j, HyP->Ublock_info_Phi, HyP->lookAhead_info, HyP->lookAhead_L_buff, HyP->Lnbrow,
+                        HyP->bigU_Phi, HyP->ldu_Phi,
+                        bigV, knsupc,  klst, lsub,  usub, ldt, thread_id, indirect, indirect2,
+                        Lrowind_bc_ptr, Lnzval_bc_ptr, Ufstnz_br_ptr, Unzval_br_ptr, xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                        , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
+#endif
+                      );
+    //unsigned long long t2 = _rdtsc();
+    double t2 = SuperLU_timer_();
+    SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1);
+    return 0;
+} /* sblock_gemm_scatterTopRight */
+
+int_t sblock_gemm_scatterBottomLeft( int_t lb,  int_t j,
+                                    float* bigV, int_t knsupc,  int_t klst, int_t* lsub,
+                                    int_t* usub, int_t ldt, int* indirect, int* indirect2,
+                                    HyP_t* HyP,
+                                    sLUstruct_t *LUstruct,
+                                    gridinfo_t* grid,
+                                    SCT_t*SCT, SuperLUStat_t *stat
+                                  )
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    float** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    float** Unzval_br_ptr = Llu->Unzval_br_ptr;
+#ifdef _OPENMP    
+    volatile int_t thread_id = omp_get_thread_num();
+#else    
+    volatile int_t thread_id = 0;
+#endif    
+    //printf("Thread's ID %lld \n", thread_id);
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    sblock_gemm_scatter( lb, j, HyP->Ublock_info, HyP->Remain_info, HyP->Remain_L_buff, HyP->Rnbrow,
+                        HyP->bigU_host, HyP->ldu,
+                        bigV, knsupc,  klst, lsub,  usub, ldt, thread_id, indirect, indirect2,
+                        Lrowind_bc_ptr, Lnzval_bc_ptr, Ufstnz_br_ptr, Unzval_br_ptr, xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                        , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
+#endif
+                      );
+    //unsigned long long t2 = _rdtsc();
+    double t2 = SuperLU_timer_();
+    SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1);
+    return 0;
+
+} /* sblock_gemm_scatterBottomLeft */
+
+int_t sblock_gemm_scatterBottomRight( int_t lb,  int_t j,
+                                     float* bigV, int_t knsupc,  int_t klst, int_t* lsub,
+                                     int_t* usub, int_t ldt, int* indirect, int* indirect2,
+                                     HyP_t* HyP,
+                                     sLUstruct_t *LUstruct,
+                                     gridinfo_t* grid,
+                                     SCT_t*SCT, SuperLUStat_t *stat
+                                   )
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    float** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    float** Unzval_br_ptr = Llu->Unzval_br_ptr;
+#ifdef _OPENMP    
+    volatile  int_t thread_id = omp_get_thread_num();
+#else    
+    volatile  int_t thread_id = 0;
+#endif    
+   // printf("Thread's ID %lld \n", thread_id);
+    //unsigned long long t1 = _rdtsc();
+    double t1 = SuperLU_timer_();
+    sblock_gemm_scatter( lb, j, HyP->Ublock_info_Phi, HyP->Remain_info, HyP->Remain_L_buff, HyP->Rnbrow,
+                        HyP->bigU_Phi, HyP->ldu_Phi,
+                        bigV, knsupc,  klst, lsub,  usub, ldt, thread_id, indirect, indirect2,
+                        Lrowind_bc_ptr, Lnzval_bc_ptr, Ufstnz_br_ptr, Unzval_br_ptr, xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                        , SCT->Host_TheadScatterMOP, SCT->Host_TheadScatterTimer
+#endif
+                      );
+
+    //unsigned long long t2 = _rdtsc();
+    double t2 = SuperLU_timer_();
+    SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double) (t2 - t1);
+    return 0;
+
+} /* sblock_gemm_scatterBottomRight */
+
+/******************************************************************
+ * SHERRY: scatter_l is the same as dscatter_l in dscatter.c
+ *         scatter_u is ALMOST the same as dscatter_u in dscatter.c
+ ******************************************************************/
+#if 0
+void
+scatter_l (int_t ib,
+           int_t ljb,
+           int_t nsupc,
+           int_t iukp,
+           int_t *xsup,
+           int_t klst,
+           int_t nbrow,
+           int_t lptr,
+           int_t temp_nbrow,
+           int_t *usub,
+           int_t *lsub,
+           double *tempv,
+           int *indirect_thread, int *indirect2,
+           int_t **Lrowind_bc_ptr, double **Lnzval_bc_ptr, gridinfo_t *grid)
+{
+    int_t rel, i, segsize, jj;
+    double *nzval;
+    int_t *index = Lrowind_bc_ptr[ljb];
+    int_t ldv = index[1];       /* LDA of the dest lusup. */
+    int_t lptrj = BC_HEADER;
+    int_t luptrj = 0;
+    int_t ijb = index[lptrj];
+
+    while (ijb != ib)
+    {
+        luptrj += index[lptrj + 1];
+        lptrj += LB_DESCRIPTOR + index[lptrj + 1];
+        ijb = index[lptrj];
+    }
+
+
+    /*
+     * Build indirect table. This is needed because the
+     * indices are not sorted for the L blocks.
+     */
+    int_t fnz = FstBlockC (ib);
+    int_t dest_nbrow;
+    lptrj += LB_DESCRIPTOR;
+    dest_nbrow = index[lptrj - 1];
+
+    for (i = 0; i < dest_nbrow; ++i)
+    {
+        rel = index[lptrj + i] - fnz;
+        indirect_thread[rel] = i;
+
+    }
+
+    /* can be precalculated */
+    for (i = 0; i < temp_nbrow; ++i)
+    {
+        rel = lsub[lptr + i] - fnz;
+        indirect2[i] = indirect_thread[rel];
+    }
+
+
+    nzval = Lnzval_bc_ptr[ljb] + luptrj;
+    for (jj = 0; jj < nsupc; ++jj)
+    {
+
+        segsize = klst - usub[iukp + jj];
+        if (segsize)
+        {
+            for (i = 0; i < temp_nbrow; ++i)
+            {
+                nzval[indirect2[i]] -= tempv[i];
+            }
+            tempv += nbrow;
+        }
+        nzval += ldv;
+    }
+
+} /* scatter_l */
+#endif // comment out
+
+static void   // SHERRY: ALMOST the same as dscatter_u in dscatter.c
+scatter_u (int_t ib,
+           int_t jb,
+           int_t nsupc,
+           int_t iukp,
+           int_t *xsup,
+           int_t klst,
+           int_t nbrow,
+           int_t lptr,
+           int_t temp_nbrow,
+           int_t *lsub,
+           int_t *usub,
+           float *tempv,
+           int *indirect,
+           int_t **Ufstnz_br_ptr, float **Unzval_br_ptr, gridinfo_t *grid)
+{
+#ifdef PI_DEBUG
+    printf ("A(%d,%d) goes to U block \n", ib, jb);
+#endif
+    int_t jj, i, fnz;
+    int_t segsize;
+    float *ucol;
+    int_t ilst = FstBlockC (ib + 1);
+    int_t lib = LBi (ib, grid);
+    int_t *index = Ufstnz_br_ptr[lib];
+
+    /* reinitialize the pointer to each row of U */
+    int_t iuip_lib, ruip_lib;
+    iuip_lib = BR_HEADER;
+    ruip_lib = 0;
+
+    int_t ijb = index[iuip_lib];
+    while (ijb < jb)            /* Search for dest block. */
+    {
+        ruip_lib += index[iuip_lib + 1];
+
+        iuip_lib += UB_DESCRIPTOR + SuperSize (ijb);
+        ijb = index[iuip_lib];
+    }
+    /* Skip descriptor. Now point to the fstnz index of
+       block U(i,j). */
+
+    for (i = 0; i < temp_nbrow; ++i)
+    {
+        indirect[i] = lsub[lptr + i] ;
+    }
+
+    iuip_lib += UB_DESCRIPTOR;
+
+    ucol = &Unzval_br_ptr[lib][ruip_lib];
+    for (jj = 0; jj < nsupc; ++jj)
+    {
+        segsize = klst - usub[iukp + jj];
+        fnz = index[iuip_lib++];
+        ucol -= fnz;
+        if (segsize)            /* Nonzero segment in U(k,j). */
+        {
+            for (i = 0; i < temp_nbrow; ++i)
+            {
+                ucol[indirect[i]] -= tempv[i];
+            }                   /* for i=0..temp_nbropw */
+            tempv += nbrow;
+
+        } /*if segsize */
+        ucol += ilst ;
+
+    } /*for jj=0:nsupc */
+
+} /* scatter_u */
+
+
diff --git a/SRC/ssp_blas2_dist.c b/SRC/ssp_blas2_dist.c
new file mode 100644
index 00000000..279b97e8
--- /dev/null
+++ b/SRC/ssp_blas2_dist.c
@@ -0,0 +1,501 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Sparse BLAS 2, using some dense BLAS 2 operations
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * </pre>
+ */
+
+/*
+ * File name:   ssp_blas2_dist.c
+ * Purpose:     Sparse BLAS 2, using some dense BLAS 2 operations.
+ */
+
+#include "superlu_sdefs.h"
+
+
+/*
+ * Function prototypes
+ */
+#ifndef USE_VENDOR_BLAS
+extern void susolve(int, int, float*, float*);
+extern void slsolve(int, int, float*, float*);
+extern void smatvec(int, int, int, float*, float*, float*);
+#endif
+
+/*! \brief
+ *
+ * <pre>
+ *   Purpose
+ *   =======
+ *
+ *   sp_strsv_dist() solves one of the systems of equations   
+ *       A*x = b,   or   A'*x = b,
+ *   where b and x are n-element vectors and A is a sparse unit, or
+ *   non-unit, upper or lower triangular matrix.
+ *   No test for singularity or near-singularity is included in this   
+ *   routine. Such tests must be performed before calling this routine.   
+ *
+ *   Parameters   
+ *   ==========   
+ *
+ *   uplo   - (input) char*
+ *            On entry, uplo specifies whether the matrix is an upper or   
+ *             lower triangular matrix as follows:   
+ *                uplo = 'U' or 'u'   A is an upper triangular matrix.   
+ *                uplo = 'L' or 'l'   A is a lower triangular matrix.   
+ *
+ *   trans  - (input) char*
+ *             On entry, trans specifies the equations to be solved as   
+ *             follows:   
+ *                trans = 'N' or 'n'   A*x = b.   
+ *                trans = 'T' or 't'   A'*x = b.   
+ *                trans = 'C' or 'c'   A'*x = b.   
+ *
+ *   diag   - (input) char*
+ *             On entry, diag specifies whether or not A is unit   
+ *             triangular as follows:   
+ *                diag = 'U' or 'u'   A is assumed to be unit triangular.   
+ *                diag = 'N' or 'n'   A is not assumed to be unit   
+ *                                    triangular.   
+ *	     
+ *   L       - (input) SuperMatrix*
+ *	       The factor L from the factorization Pr*A*Pc=L*U. Use
+ *             compressed row subscripts storage for supernodes, i.e.,
+ *             L has types: Stype = SLU_SC, Dtype = SLU_S, Mtype = SLU_TRLU.
+ *
+ *   U       - (input) SuperMatrix*
+ *	        The factor U from the factorization Pr*A*Pc=L*U.
+ *	        U has types: Stype = SLU_NC, Dtype = SLU_S, Mtype = SLU_TRU.
+ *    
+ *   x       - (input/output) float*
+ *             Before entry, the incremented array X must contain the n   
+ *             element right-hand side vector b. On exit, X is overwritten 
+ *             with the solution vector x.
+ *
+ *   info    - (output) int*
+ *             If *info = -i, the i-th argument had an illegal value.
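+ *
+ *   A minimal calling sketch (illustrative; L and U are assumed to hold
+ *   the computed factors, x the right-hand side):
+ *
+ *       int info;
+ *       sp_strsv_dist("L", "N", "U", &L, &U, x, &info); /* x := inv(L)*x */
+ *       sp_strsv_dist("U", "N", "N", &L, &U, x, &info); /* x := inv(U)*x */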
+ * </pre>
+ */
+int
+sp_strsv_dist(char *uplo, char *trans, char *diag, SuperMatrix *L, 
+	      SuperMatrix *U, float *x, int *info)
+{
+
+#ifdef _CRAY
+    _fcd ftcs1, ftcs2, ftcs3;
+#endif
+    SCformat *Lstore;
+    NCformat *Ustore;
+    float   *Lval, *Uval;
+    int incx = 1, incy = 1;
+    float alpha = 1.0, beta = 1.0;
+    int nrow;
+    int fsupc, nsupr, nsupc, luptr, istart, irow;
+    int i, k, iptr, jcol;
+    float *work;
+    flops_t solve_ops;
+    /*extern SuperLUStat_t SuperLUStat;*/
+
+    /* Test the input parameters */
+    *info = 0;
+    if ( strncmp(uplo,"L",1) != 0 && strncmp(uplo, "U",1) !=0 ) *info = -1;
+    else if ( strncmp(trans, "N",1) !=0 && strncmp(trans, "T", 1) !=0 )
+	*info = -2;
+    else if ( strncmp(diag, "U", 1) !=0 && strncmp(diag, "N", 1) != 0 )
+	*info = -3;
+    else if ( L->nrow != L->ncol || L->nrow < 0 ) *info = -4;
+    else if ( U->nrow != U->ncol || U->nrow < 0 ) *info = -5;
+    if ( *info ) {
+	i = -(*info);
+	xerr_dist("sp_strsv_dist", &i);
+	return 0;
+    }
+
+    Lstore = (SCformat *) L->Store;
+    Lval = (float *) Lstore->nzval;
+    Ustore = (NCformat *) U->Store;
+    Uval = (float *) Ustore->nzval;
+    solve_ops = 0;
+
+    if ( !(work = floatCalloc_dist(L->nrow)) )
+	ABORT("Malloc fails for work in sp_strsv_dist().");
+    
+    if ( strncmp(trans, "N", 1)==0 ) {	/* Form x := inv(A)*x. */
+	
+	if ( strncmp(uplo, "L", 1)==0 ) {
+	    /* Form x := inv(L)*x */
+    	    if ( L->nrow == 0 ) return 0; /* Quick return */
+	    
+	    for (k = 0; k <= Lstore->nsuper; k++) {
+		fsupc = SuperLU_L_FST_SUPC(k);
+		istart = SuperLU_L_SUB_START(fsupc);
+		nsupr = SuperLU_L_SUB_START(fsupc+1) - istart;
+		nsupc = SuperLU_L_FST_SUPC(k+1) - fsupc;
+		luptr = SuperLU_L_NZ_START(fsupc);
+		nrow = nsupr - nsupc;
+	        solve_ops += nsupc * (nsupc - 1);
+	        solve_ops += 2 * nrow * nsupc;
+		if ( nsupc == 1 ) {
+		    for (iptr=istart+1; iptr < SuperLU_L_SUB_START(fsupc+1); ++iptr) {
+			irow = SuperLU_L_SUB(iptr);
+			++luptr;
+			x[irow] -= x[fsupc] * Lval[luptr];
+		    }
+		} else {
+#ifdef USE_VENDOR_BLAS
+#ifdef _CRAY
+		    ftcs1 = _cptofcd("L", strlen("L"));
+		    ftcs2 = _cptofcd("N", strlen("N"));
+		    ftcs3 = _cptofcd("U", strlen("U"));
+		    STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
+		       	&x[fsupc], &incx);
+		
+		    SGEMV(ftcs2, &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], 
+		       	&nsupr, &x[fsupc], &incx, &beta, &work[0], &incy);
+#else
+		    strsv_("L", "N", "U", &nsupc, &Lval[luptr], &nsupr,
+		       	&x[fsupc], &incx, 1, 1, 1);
+		
+		    sgemv_("N", &nrow, &nsupc, &alpha, &Lval[luptr+nsupc], 
+		       	&nsupr, &x[fsupc], &incx, &beta, &work[0], &incy, 1);
+#endif /* _CRAY */		
+#else
+		    slsolve (nsupr, nsupc, &Lval[luptr], &x[fsupc]);
+		
+		    smatvec (nsupr, nsupr-nsupc, nsupc, &Lval[luptr+nsupc],
+			&x[fsupc], &work[0] );
+#endif		
+		
+		    iptr = istart + nsupc;
+		    for (i = 0; i < nrow; ++i, ++iptr) {
+			irow = SuperLU_L_SUB(iptr);
+			x[irow] -= work[i];	/* Scatter */
+			work[i] = 0.0;
+		    }
+	 	}
+	    } /* for k ... */
+	    
+	} else {
+	    /* Form x := inv(U)*x */
+	    
+	    if ( U->nrow == 0 ) return 0; /* Quick return */
+	    
+	    for (k = Lstore->nsuper; k >= 0; k--) {
+	    	fsupc = SuperLU_L_FST_SUPC(k);
+	    	nsupr = SuperLU_L_SUB_START(fsupc+1) - SuperLU_L_SUB_START(fsupc);
+	    	nsupc = SuperLU_L_FST_SUPC(k+1) - fsupc;
+	    	luptr = SuperLU_L_NZ_START(fsupc);
+    	        solve_ops += nsupc * (nsupc + 1);
+
+		if ( nsupc == 1 ) {
+		    x[fsupc] /= Lval[luptr];
+		    for (i = SuperLU_U_NZ_START(fsupc); i < SuperLU_U_NZ_START(fsupc+1); ++i) {
+			irow = SuperLU_U_SUB(i);
+			x[irow] -= x[fsupc] * Uval[i];
+		    }
+
+		} else {
+#ifdef USE_VENDOR_BLAS
+#ifdef _CRAY
+		    ftcs1 = _cptofcd("U", strlen("U"));
+		    ftcs2 = _cptofcd("N", strlen("N"));
+		    STRSV(ftcs1, ftcs2, ftcs2, &nsupc, &Lval[luptr], &nsupr,
+		       &x[fsupc], &incx);
+#else
+		    strsv_("U", "N", "N", &nsupc, &Lval[luptr], &nsupr,
+		       &x[fsupc], &incx, 1, 1, 1);
+#endif
+#else		
+		    susolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc] );
+#endif		
+
+		    for (jcol = fsupc; jcol < SuperLU_L_FST_SUPC(k+1); jcol++) {
+		        solve_ops += 2*(SuperLU_U_NZ_START(jcol+1) - SuperLU_U_NZ_START(jcol));
+		    	for (i = SuperLU_U_NZ_START(jcol); i < SuperLU_U_NZ_START(jcol+1); 
+				i++) {
+			    irow = SuperLU_U_SUB(i);
+			    x[irow] -= x[jcol] * Uval[i];
+		    	}
+                    }
+		}
+	    } /* for k ... */
+	    
+	}
+    } else { /* Form x := inv(A')*x */
+	
+	if ( strncmp(uplo, "L", 1)==0 ) {
+	    /* Form x := inv(L')*x */
+    	    if ( L->nrow == 0 ) return 0; /* Quick return */
+	    
+	    for (k = Lstore->nsuper; k >= 0; --k) {
+	    	fsupc = SuperLU_L_FST_SUPC(k);
+	    	istart = SuperLU_L_SUB_START(fsupc);
+	    	nsupr = SuperLU_L_SUB_START(fsupc+1) - istart;
+	    	nsupc = SuperLU_L_FST_SUPC(k+1) - fsupc;
+	    	luptr = SuperLU_L_NZ_START(fsupc);
+
+		solve_ops += 2 * (nsupr - nsupc) * nsupc;
+		for (jcol = fsupc; jcol < SuperLU_L_FST_SUPC(k+1); jcol++) {
+		    iptr = istart + nsupc;
+		    for (i = SuperLU_L_NZ_START(jcol) + nsupc; 
+				i < SuperLU_L_NZ_START(jcol+1); i++) {
+			irow = SuperLU_L_SUB(iptr);
+			x[jcol] -= x[irow] * Lval[i];
+			iptr++;
+		    }
+		}
+		
+		if ( nsupc > 1 ) {
+		    solve_ops += nsupc * (nsupc - 1);
+
+#ifdef USE_VENDOR_BLAS
+#ifdef _CRAY
+                    ftcs1 = _cptofcd("L", strlen("L"));
+                    ftcs2 = _cptofcd("T", strlen("T"));
+                    ftcs3 = _cptofcd("U", strlen("U"));
+		    STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
+			&x[fsupc], &incx);
+#else
+		    strsv_("L", "T", "U", &nsupc, &Lval[luptr], &nsupr,
+			&x[fsupc], &incx, 1, 1, 1);
+#endif
+#else
+		    strsv_("L", "T", "U", &nsupc, &Lval[luptr], &nsupr,
+			&x[fsupc], &incx);
+#endif
+		}
+	    }
+	} else {
+	    /* Form x := inv(U')*x */
+	    if ( U->nrow == 0 ) return 0; /* Quick return */
+	    
+	    for (k = 0; k <= Lstore->nsuper; k++) {
+	    	fsupc = SuperLU_L_FST_SUPC(k);
+	    	nsupr = SuperLU_L_SUB_START(fsupc+1) - SuperLU_L_SUB_START(fsupc);
+	    	nsupc = SuperLU_L_FST_SUPC(k+1) - fsupc;
+	    	luptr = SuperLU_L_NZ_START(fsupc);
+
+		for (jcol = fsupc; jcol < SuperLU_L_FST_SUPC(k+1); jcol++) {
+		    solve_ops += 2*(SuperLU_U_NZ_START(jcol+1) - SuperLU_U_NZ_START(jcol));
+		    for (i = SuperLU_U_NZ_START(jcol); i < SuperLU_U_NZ_START(jcol+1); i++) {
+			irow = SuperLU_U_SUB(i);
+			x[jcol] -= x[irow] * Uval[i];
+		    }
+		}
+
+		solve_ops += nsupc * (nsupc + 1);
+		if ( nsupc == 1 ) {
+		    x[fsupc] /= Lval[luptr];
+		} else {
+#ifdef USE_VENDOR_BLAS
+#ifdef _CRAY
+                    ftcs1 = _cptofcd("U", strlen("U"));
+                    ftcs2 = _cptofcd("T", strlen("T"));
+                    ftcs3 = _cptofcd("N", strlen("N"));
+		    STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
+			    &x[fsupc], &incx);
+#else
+		    strsv_("U", "T", "N", &nsupc, &Lval[luptr], &nsupr,
+			    &x[fsupc], &incx, 1, 1, 1);
+#endif
+#else
+		    strsv_("U", "T", "N", &nsupc, &Lval[luptr], &nsupr,
+			    &x[fsupc], &incx);
+#endif
+		}
+	    } /* for k ... */
+	}
+    }
+
+    /*SuperLUStat.ops[SOLVE] += solve_ops;*/
+    SUPERLU_FREE(work);
+    return 0;
+} /* sp_strsv_dist */
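+
+/* Illustrative sketch: applying sp_strsv_dist() to solve A*x = b from the
+ * supernodal factors L and U.  The prototype is assumed to be
+ *     int sp_strsv_dist(char *uplo, char *trans, char *diag,
+ *                       SuperMatrix *L, SuperMatrix *U, float *x, int *info);
+ * (the single-precision analogue of sp_dtrsv_dist):
+ *
+ *     int info;
+ *     sp_strsv_dist("L", "N", "U", &L, &U, x, &info); // x := inv(L)*x, unit diag
+ *     sp_strsv_dist("U", "N", "N", &L, &U, x, &info); // x := inv(U)*x
+ */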
+
+
+/*! \brief SpGEMV
+
+  Purpose   
+    =======   
+
+    sp_sgemv_dist()  performs one of the matrix-vector operations   
+       y := alpha*A*x + beta*y,   or   y := alpha*A'*x + beta*y,   
+    where alpha and beta are scalars, x and y are vectors and A is a
+    sparse A->nrow by A->ncol matrix.   
+
+    Parameters   
+    ==========   
+
+    TRANS  - (input) char*
+             On entry, TRANS specifies the operation to be performed as   
+             follows:   
+                TRANS = 'N' or 'n'   y := alpha*A*x + beta*y.   
+                TRANS = 'T' or 't'   y := alpha*A'*x + beta*y.   
+                TRANS = 'C' or 'c'   y := alpha*A'*x + beta*y.   
+
+    ALPHA  - (input) float
+             On entry, ALPHA specifies the scalar alpha.   
+
+    A      - (input) SuperMatrix*
+             Matrix A with a sparse format, of dimension (A->nrow, A->ncol).
+             Currently, the type of A can be:
+                 Stype = SLU_NC or SLU_NCP; Dtype = SLU_S; Mtype = SLU_GE. 
+             In the future, more general A can be handled.
+
+    X      - (input) float*, array of DIMENSION at least   
+             ( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n'   
+             and at least   
+             ( 1 + ( m - 1 )*abs( INCX ) ) otherwise.   
+             Before entry, the incremented array X must contain the   
+             vector x.   
+
+    INCX   - (input) int
+             On entry, INCX specifies the increment for the elements of   
+             X. INCX must not be zero.   
+
+    BETA   - (input) float
+             On entry, BETA specifies the scalar beta. When BETA is   
+             supplied as zero then Y need not be set on input.   
+
+    Y      - (output) float*,  array of DIMENSION at least   
+             ( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n'   
+             and at least   
+             ( 1 + ( n - 1 )*abs( INCY ) ) otherwise.   
+             Before entry with BETA non-zero, the incremented array Y   
+             must contain the vector y. On exit, Y is overwritten by the 
+             updated vector y.
+	     
+    INCY   - (input) int
+             On entry, INCY specifies the increment for the elements of   
+             Y. INCY must not be zero.   
+
+    ==== Sparse Level 2 Blas routine.   
+
+*/
+int
+sp_sgemv_dist(char *trans, float alpha, SuperMatrix *A, float *x,
+              int incx, float beta, float *y, int incy)
+{
+
+    /* Local variables */
+    NCformat *Astore;
+    float *Aval;
+    int info;
+    float temp, temp1;
+    int lenx, leny, i, j, irow;
+    int iy, jx, jy, kx, ky;
+    int notran;
+    float zero = 0.0;
+    float one = 1.0;
+
+    notran = (strncmp(trans, "N", 1)==0);
+    Astore = (NCformat *) A->Store;
+    Aval = (float *) Astore->nzval;
+
+    /* Test the input parameters */
+    info = 0;
+    if ( !notran && strncmp(trans, "T", 1) !=0 && strncmp(trans, "C", 1) != 0)
+        info = 1;
+    else if ( A->nrow < 0 || A->ncol < 0 ) info = 3;
+    else if (incx == 0) info = 5;
+    else if (incy == 0) info = 8;
+    if (info != 0) {
+        xerr_dist("sp_sgemv_dist ", &info);
+        return 0;
+    }
+
+    /* Quick return if possible. */
+    if (A->nrow == 0 || A->ncol == 0 || (alpha == 0. && beta == 1.))
+        return 0;
+
+    /* Set LENX and LENY, the lengths of the vectors x and y, and set
+       up the start points in X and Y. */
+    if ( strncmp(trans, "N", 1)==0 ) {
+        lenx = A->ncol;
+        leny = A->nrow;
+    } else {
+        lenx = A->nrow;
+        leny = A->ncol;
+    }
+    if (incx > 0) kx = 0;
+    else kx = - (lenx - 1) * incx;
+    if (incy > 0) ky = 0;
+    else ky = - (leny - 1) * incy;
+
+    /* Start the operations. In this version the elements of A are
+       accessed sequentially with one pass through A. */
+    /* First form y := beta*y. */
+    if (beta != 1.) {
+        if (incy == 1) {
+            if (beta == 0.)
+                for (i = 0; i < leny; ++i) y[i] = zero;
+            else
+                for (i = 0; i < leny; ++i) y[i] = beta * y[i];
+        } else {
+            iy = ky;
+            if (beta == 0.)
+                for (i = 0; i < leny; ++i) {
+                    y[iy] = zero;
+                    iy += incy;
+                }
+            else
+                for (i = 0; i < leny; ++i) {
+                    y[iy] = beta * y[iy];
+                    iy += incy;
+                }
+        }
+    }
+
+    if (alpha == 0.) return 0;
+
+    if ( notran ) {
+        /* Form y := alpha*A*x + y. */
+        jx = kx;
+        if (incy == 1) {
+            for (j = 0; j < A->ncol; ++j) {
+                if (x[jx] != 0.) {
+                    temp = alpha * x[jx];
+                    for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
+                        irow = Astore->rowind[i];
+                        y[irow] += temp * Aval[i];
+                    }
+                }
+                jx += incx;
+            }
+        } else {
+            ABORT("Not implemented.");
+        }
+    } else {
+        /* Form y := alpha*A'*x + y. */
+        jy = ky;
+        if (incx == 1) {
+            for (j = 0; j < A->ncol; ++j) {
+                temp = zero;
+                for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
+                    irow = Astore->rowind[i];
+                    temp += Aval[i] * x[irow];
+                }
+                y[jy] += alpha * temp;
+                jy += incy;
+            }
+        } else {
+            ABORT("Not implemented.");
+        }
+    }
+    return 0;
+} /* sp_sgemv_dist */
diff --git a/SRC/ssp_blas3_dist.c b/SRC/ssp_blas3_dist.c
new file mode 100644
index 00000000..4bf00e01
--- /dev/null
+++ b/SRC/ssp_blas3_dist.c
@@ -0,0 +1,138 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Sparse BLAS3, using some dense BLAS3 operations
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 1.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * September 1, 1999
+ * </pre>
+ */
+
+/*
+ * File name: ssp_blas3_dist.c
+ * Purpose: Sparse BLAS3, using some dense BLAS3 operations.
+ */
+
+#include "superlu_sdefs.h"
+
+/*! \brief
+
+<pre>
+  Purpose   
+    =======   
+
+    sp_sgemm_dist performs one of the matrix-matrix operations   
+
+       C := alpha*op( A )*op( B ) + beta*C,   
+
+    where  op( X ) is one of 
+
+       op( X ) = X   or   op( X ) = X'   or   op( X ) = conjg( X' ),
+
+    alpha and beta are scalars, and A, B and C are matrices, with op( A ) 
+    an m by k matrix,  op( B )  a  k by n matrix and  C an m by n matrix. 
+  
+
+    Parameters   
+    ==========   
+
+    TRANSA - (input) char*
+             On entry, TRANSA specifies the form of op( A ) to be used in 
+             the matrix multiplication as follows:   
+                TRANSA = 'N' or 'n',  op( A ) = A.   
+                TRANSA = 'T' or 't',  op( A ) = A'.   
+                TRANSA = 'C' or 'c',  op( A ) = conjg( A' ).   
+             Unchanged on exit.   
+
+    TRANSB - (input) char*
+             On entry, TRANSB specifies the form of op( B ) to be used in 
+             the matrix multiplication as follows:   
+                TRANSB = 'N' or 'n',  op( B ) = B.   
+                TRANSB = 'T' or 't',  op( B ) = B'.   
+                TRANSB = 'C' or 'c',  op( B ) = conjg( B' ).   
+             Unchanged on exit.   
+
+    M      - (input) int   
+             On entry,  M  specifies  the number of rows of the matrix 
+	     op( A ) and of the matrix C.  M must be at least zero. 
+	     Unchanged on exit.   
+
+    N      - (input) int
+             On entry,  N specifies the number of columns of the matrix 
+	     op( B ) and the number of columns of the matrix C. N must be 
+	     at least zero.
+	     Unchanged on exit.   
+
+    K      - (input) int
+             On entry, K specifies the number of columns of the matrix 
+	     op( A ) and the number of rows of the matrix op( B ). K must 
+	     be at least  zero.   
+             Unchanged on exit.
+	     
+    ALPHA  - (input) float
+             On entry, ALPHA specifies the scalar alpha.   
+
+    A      - (input) SuperMatrix*
+             Matrix A with a sparse format, of dimension (A->nrow, A->ncol).
+             Currently, the type of A can be:
+                 Stype = SLU_NC or SLU_NCP; Dtype = SLU_S; Mtype = SLU_GE. 
+             In the future, more general A can be handled.
+
+    B      - float array of DIMENSION ( LDB, kb ), where kb is 
+             n when TRANSB = 'N' or 'n',  and is  k otherwise.   
+             Before entry with  TRANSB = 'N' or 'n',  the leading k by n 
+             part of the array B must contain the matrix B, otherwise 
+             the leading n by k part of the array B must contain the 
+             matrix B.   
+             Unchanged on exit.   
+
+    LDB    - (input) int
+             On entry, LDB specifies the first dimension of B as declared 
+             in the calling (sub) program. LDB must be at least max( 1, k )
+             when TRANSB = 'N' or 'n', and at least max( 1, n ) otherwise.
+             Unchanged on exit.   
+
+    BETA   - (input) float
+             On entry, BETA specifies the scalar beta. When BETA is   
+             supplied as zero then C need not be set on input.   
+
+    C      - float array of DIMENSION ( LDC, n ).   
+             Before entry, the leading m by n part of the array C must 
+             contain the matrix C,  except when beta is zero, in which 
+             case C need not be set on entry.   
+             On exit, the array C is overwritten by the m by n matrix 
+	     ( alpha*op( A )*op( B ) + beta*C ).   
+
+    LDC    - (input) int
+             On entry, LDC specifies the first dimension of C as declared 
+             in the calling (sub)program. LDC must be at least max(1,m).   
+             Unchanged on exit.   
+
+    ==== Sparse Level 3 Blas routine.  
+
+*/
+int
+sp_sgemm_dist(char *transa, int n, float alpha, SuperMatrix *A,
+              float *b, int ldb, float beta,
+              float *c, int ldc)
+{
+
+    int incx = 1, incy = 1;
+    int j;
+
+    for (j = 0; j < n; ++j) {
+        sp_sgemv_dist(transa, alpha, A, &b[ldb*j], incx, beta, &c[ldc*j], incy);
+    }
+    return 0;
+}
diff --git a/SRC/sstatic_schedule.c b/SRC/sstatic_schedule.c
new file mode 100644
index 00000000..ebb416f5
--- /dev/null
+++ b/SRC/sstatic_schedule.c
@@ -0,0 +1,984 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Performs static scheduling for the look-ahead factorization algorithm.
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 4.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * August 15, 2014
+ *
+ * Modified: February 20, 2020, changed to be precision-dependent.
+ *
+ * Reference:
+ * 
+ * 
+ */ + +#include "superlu_sdefs.h" + +#ifdef ISORT +extern void isort (int_t N, int_t * ARRAY1, int_t * ARRAY2); +extern void isort1 (int_t N, int_t * ARRAY); + +#else + +static int +superlu_sort_perm (const void *arg1, const void *arg2) +{ + const int_t *val1 = (const int_t *) arg1; + const int_t *val2 = (const int_t *) arg2; + return (*val2 < *val1); +} +#endif + +int +sstatic_schedule(superlu_dist_options_t * options, int m, int n, + sLUstruct_t * LUstruct, gridinfo_t * grid, SuperLUStat_t * stat, + int_t *perm_c_supno, int_t *iperm_c_supno, int *info) +{ +/* + * Arguments + * ========= + * + * perm_c_supno (output) + * perm_c_supno[k] = j means at the k-th step of elimination, the j-th + * supernode is chosen. + * iperm_c_supno (output), inverse of perm_c_supno[] + * iperm_c_supno[j] = k means the j-th supernode will be scheduled + * at the k-th step of elimination. + * + */ + int_t *xsup; + int_t i, ib, jb, lb, nlb, il, iu; + int_t Pc, Pr; + int iam, krow, yourcol, mycol, myrow; + int j, k, nsupers; /* k - current panel to work on */ + int_t *index; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + sLocalLU_t *Llu = LUstruct->Llu; + int ncb, nrb, p, pr, pc, nblocks; + int_t *etree_supno_l, *etree_supno, *blocks, *blockr, *Ublock, *Urows, + *Lblock, *Lrows, *sf_block, *sf_block_l, *nnodes_l, + *nnodes_u, *edag_supno_l, *recvbuf, **edag_supno; + float edag_supno_l_bytes; + int nnodes, *sendcnts, *sdispls, *recvcnts, *rdispls, *srows, *rrows; + etree_node *head, *tail, *ptr; + int *num_child; + + int iword = sizeof (int_t); + + /* Test the input parameters. */ + *info = 0; + if (m < 0) *info = -2; + else if (n < 0) *info = -3; + if (*info) { + pxerr_dist ("static_schedule", grid, -*info); + return (-1); + } + + /* Quick return if possible. */ + if (m == 0 || n == 0) return 0; + + /* + * Initialization. + */ + iam = grid->iam; + Pc = grid->npcol; + Pr = grid->nprow; + myrow = MYROW (iam, grid); + mycol = MYCOL (iam, grid); + nsupers = Glu_persist->supno[n - 1] + 1; + xsup = Glu_persist->xsup; + nblocks = 0; + ncb = nsupers / Pc; + nrb = nsupers / Pr; + +#if ( DEBUGlevel >= 1 ) + print_memorylog(stat, "before static schedule"); +#endif + + /* ================================================== * + * static scheduling of j-th step of LU-factorization * + * ================================================== */ + if (options->lookahead_etree == YES && /* use e-tree of symmetrized matrix and */ + (options->ParSymbFact == NO || /* 1) symmetric fact with serial symbolic, or */ + (options->SymPattern == YES && /* 2) symmetric pattern, and */ + options->RowPerm == NOROWPERM))) { /* no rowperm to destroy symmetry */ + + /* if symmetric pattern or using e-tree of |A^T|+|A|, + then we can use a simple tree structure for static schduling */ + + if (options->ParSymbFact == NO) { + /* Use the etree computed from serial symb. fact., and turn it + into supernodal tree. 
*/ + int_t *etree = LUstruct->etree; +#if ( PRNTlevel>=1 ) + if (grid->iam == 0) + printf (" === using column e-tree ===\n"); +#endif + + /* look for the first off-diagonal blocks */ + etree_supno = SUPERLU_MALLOC (nsupers * sizeof (int_t)); + log_memory(nsupers * iword, stat); + + for (i = 0; i < nsupers; i++) etree_supno[i] = nsupers; + + for (j = 0, lb = 0; lb < nsupers; lb++) { + for (k = 0; k < SuperSize (lb); k++) { + jb = Glu_persist->supno[etree[j + k]]; + if (jb != lb) + etree_supno[lb] = SUPERLU_MIN (etree_supno[lb], jb); + } + j += SuperSize (lb); + } + } else { /* ParSymbFACT==YES and SymPattern==YES and RowPerm == NOROWPERM */ + /* Compute an "etree" based on struct(L), + assuming struct(U) = struct(L'). */ +#if ( PRNTlevel>=1 ) + if (grid->iam == 0) + printf (" === using supernodal e-tree ===\n"); +#endif + + /* find the first block in each supernodal-column of local L-factor */ + etree_supno_l = SUPERLU_MALLOC (nsupers * sizeof (int_t)); + log_memory(nsupers * iword, stat); + + for (i = 0; i < nsupers; i++) etree_supno_l[i] = nsupers; + for (lb = 0; lb < ncb; lb++) { + jb = lb * grid->npcol + mycol; + index = Llu->Lrowind_bc_ptr[lb]; + if (index) { /* Not an empty column */ + i = index[0]; + k = BC_HEADER; + krow = PROW (jb, grid); + if (krow == myrow) { /* skip the diagonal block */ + k += LB_DESCRIPTOR + index[k + 1]; + i--; + } + if (i > 0) + { + etree_supno_l[jb] = index[k]; + k += LB_DESCRIPTOR + index[k + 1]; + i--; + } + + for (j = 0; j < i; j++) + { + etree_supno_l[jb] = + SUPERLU_MIN (etree_supno_l[jb], index[k]); + k += LB_DESCRIPTOR + index[k + 1]; + } + } + } + if (mycol < nsupers % grid->npcol) { + jb = ncb * grid->npcol + mycol; + index = Llu->Lrowind_bc_ptr[ncb]; + if (index) { /* Not an empty column */ + i = index[0]; + k = BC_HEADER; + krow = PROW (jb, grid); + if (krow == myrow) { /* skip the diagonal block */ + k += LB_DESCRIPTOR + index[k + 1]; + i--; + } + if (i > 0) { + etree_supno_l[jb] = index[k]; + k += LB_DESCRIPTOR + index[k + 1]; + i--; + } + for (j = 0; j < i; j++) { + etree_supno_l[jb] = + SUPERLU_MIN (etree_supno_l[jb], index[k]); + k += LB_DESCRIPTOR + index[k + 1]; + } + } + } + + /* form global e-tree */ + etree_supno = SUPERLU_MALLOC (nsupers * sizeof (int_t)); + + MPI_Allreduce (etree_supno_l, etree_supno, nsupers, mpi_int_t, + MPI_MIN, grid->comm); + + SUPERLU_FREE (etree_supno_l); + } + + /* initialize number of children for each node */ + num_child = SUPERLU_MALLOC (nsupers * sizeof (int_t)); + for (i = 0; i < nsupers; i++) num_child[i] = 0; + for (i = 0; i < nsupers; i++) + if (etree_supno[i] != nsupers) num_child[etree_supno[i]]++; + + /* push initial leaves to the fifo queue */ + nnodes = 0; + for (i = 0; i < nsupers; i++) { + if (num_child[i] == 0) { + ptr = SUPERLU_MALLOC (sizeof (etree_node)); + ptr->id = i; + ptr->next = NULL; + /*printf( " == push leaf %d (%d) ==\n",i,nnodes ); */ + nnodes++; + + if (nnodes == 1) { + head = ptr; + tail = ptr; + } else { + tail->next = ptr; + tail = ptr; + } + } + } + + /* process fifo queue, and compute the ordering */ + i = 0; + + while (nnodes > 0) { + ptr = head; + j = ptr->id; + head = ptr->next; + perm_c_supno[i] = j; + SUPERLU_FREE (ptr); + i++; + nnodes--; + + if (etree_supno[j] != nsupers) { + num_child[etree_supno[j]]--; + if (num_child[etree_supno[j]] == 0) { + nnodes++; + + ptr = SUPERLU_MALLOC (sizeof (etree_node)); + ptr->id = etree_supno[j]; + ptr->next = NULL; + + /*printf( "=== push %d ===\n",ptr->id ); */ + if (nnodes == 1) { + head = ptr; + tail = ptr; + } else { + tail->next = 
ptr; + tail = ptr; + } + } + } + /*printf( "\n" ); */ + } + SUPERLU_FREE (num_child); + SUPERLU_FREE (etree_supno); + log_memory(-2 * nsupers * iword, stat); + + } else { /* Unsymmetric pattern */ + + /* Need to process both L- and U-factors, use the symmetrically + pruned graph of L & U instead of tree (very naive implementation) */ + int nrbp1 = nrb + 1; + float Ublock_bytes, Urows_bytes, Lblock_bytes, Lrows_bytes; + + /* allocate some workspace */ + if (! (sendcnts = SUPERLU_MALLOC ((4 + 2 * nrbp1) * Pr * Pc * sizeof (int)))) + ABORT ("Malloc fails for sendcnts[]."); + log_memory((4 + 2 * nrbp1) * Pr * Pc * sizeof (int), stat); + + sdispls = &sendcnts[Pr * Pc]; + recvcnts = &sdispls[Pr * Pc]; + rdispls = &recvcnts[Pr * Pc]; + srows = &rdispls[Pr * Pc]; + rrows = &srows[Pr * Pc * nrbp1]; + + myrow = MYROW (iam, grid); +#if ( PRNTlevel>=1 ) + if (grid->iam == 0) + printf (" === using DAG ===\n"); +#endif + + /* send supno block of local U-factor to a processor * + * who owns the corresponding block of L-factor */ + + /* srows : # of block to send to a processor from each supno row */ + /* sendcnts: total # of blocks to send to a processor */ + for (p = 0; p < Pr * Pc * nrbp1; p++) srows[p] = 0; + for (p = 0; p < Pr * Pc; p++) sendcnts[p] = 0; + + /* sending blocks of U-factors corresponding to L-factors */ + /* count the number of blocks to send */ + for (lb = 0; lb < nrb; ++lb) { + jb = lb * Pr + myrow; + pc = jb % Pc; + index = Llu->Ufstnz_br_ptr[lb]; + + if (index) { /* Not an empty row */ + k = BR_HEADER; + nblocks += index[0]; + for (j = 0; j < index[0]; ++j) { + ib = index[k]; + pr = ib % Pr; + p = pr * Pc + pc; + sendcnts[p]++; + srows[p * nrbp1 + lb]++; + + k += UB_DESCRIPTOR + SuperSize (index[k]); + } + } + } + + if (myrow < nsupers % grid->nprow) { + jb = nrb * Pr + myrow; + pc = jb % Pc; + index = Llu->Ufstnz_br_ptr[nrb]; + + if (index) { /* Not an empty row */ + k = BR_HEADER; + nblocks += index[0]; + for (j = 0; j < index[0]; ++j) { + ib = index[k]; + pr = ib % Pr; + p = pr * Pc + pc; + sendcnts[p]++; + srows[p * nrbp1 + nrb]++; + k += UB_DESCRIPTOR + SuperSize (index[k]); + } + } + } + + /* insert blocks to send */ + sdispls[0] = 0; + for (p = 1; p < Pr * Pc; p++) sdispls[p] = sdispls[p - 1] + sendcnts[p - 1]; + if (!(blocks = intMalloc_dist (nblocks))) + ABORT ("Malloc fails for blocks[]."); + log_memory( nblocks * iword, stat ); + + for (lb = 0; lb < nrb; ++lb) { + jb = lb * Pr + myrow; + pc = jb % Pc; + index = Llu->Ufstnz_br_ptr[lb]; + + if (index) { /* Not an empty row */ + k = BR_HEADER; + for (j = 0; j < index[0]; ++j) { + ib = index[k]; + pr = ib % Pr; + p = pr * Pc + pc; + blocks[sdispls[p]] = ib; + sdispls[p]++; + + k += UB_DESCRIPTOR + SuperSize (index[k]); + } + } + } + + if (myrow < nsupers % grid->nprow) { + jb = nrb * Pr + myrow; + pc = jb % Pc; + index = Llu->Ufstnz_br_ptr[nrb]; + + if (index) { /* Not an empty row */ + k = BR_HEADER; + for (j = 0; j < index[0]; ++j) { + ib = index[k]; + pr = ib % Pr; + p = pr * Pc + pc; + blocks[sdispls[p]] = ib; + sdispls[p]++; + + k += UB_DESCRIPTOR + SuperSize (index[k]); + } + } + } + + /* communication */ + MPI_Alltoall (sendcnts, 1, MPI_INT, recvcnts, 1, MPI_INT, grid->comm); + MPI_Alltoall (srows, nrbp1, MPI_INT, rrows, nrbp1, MPI_INT, grid->comm); + + log_memory( -(nblocks * iword), stat ); /* blocks[] to be freed soon */ + + nblocks = recvcnts[0]; + rdispls[0] = sdispls[0] = 0; + for (p = 1; p < Pr * Pc; p++) { + rdispls[p] = rdispls[p - 1] + recvcnts[p - 1]; + sdispls[p] = sdispls[p - 1] + sendcnts[p - 1]; + 
nblocks += recvcnts[p]; + } + + if (!(blockr = intMalloc_dist (nblocks))) ABORT ("Malloc fails for blockr[]."); + log_memory( nblocks * iword, stat ); + + MPI_Alltoallv (blocks, sendcnts, sdispls, mpi_int_t, blockr, recvcnts, + rdispls, mpi_int_t, grid->comm); + + SUPERLU_FREE (blocks); /* memory logged before */ + + + /* store the received U-blocks by rows */ + nlb = nsupers / Pc; + if (!(Ublock = intMalloc_dist (nblocks))) ABORT ("Malloc fails for Ublock[]."); + if (!(Urows = intMalloc_dist (1 + nlb))) ABORT ("Malloc fails for Urows[]."); + + Ublock_bytes = nblocks * iword; + Urows_bytes = (1 + nlb) * iword; + log_memory( Ublock_bytes + Urows_bytes, stat ); + + k = 0; + for (jb = 0; jb < nlb; jb++) { + j = jb * Pc + mycol; + pr = j % Pr; + lb = j / Pr; + Urows[jb] = 0; + + for (pc = 0; pc < Pc; pc++) { + p = pr * Pc + pc; /* the processor owning this block of U-factor */ + + for (i = rdispls[p]; i < rdispls[p] + rrows[p * nrbp1 + lb]; + i++) { + Ublock[k] = blockr[i]; + k++; + Urows[jb]++; + } + rdispls[p] += rrows[p * nrbp1 + lb]; + } + /* sort by the column indices to make things easier for later on */ + +#ifdef ISORT + isort1 (Urows[jb], &(Ublock[k - Urows[jb]])); +#else + qsort (&(Ublock[k - Urows[jb]]), (size_t) (Urows[jb]), + sizeof (int_t), &superlu_sort_perm); +#endif + } + if (mycol < nsupers % grid->npcol) { + j = nlb * Pc + mycol; + pr = j % Pr; + lb = j / Pr; + Urows[nlb] = 0; + + for (pc = 0; pc < Pc; pc++) { + p = pr * Pc + pc; + for (i = rdispls[p]; i < rdispls[p] + rrows[p * nrbp1 + lb]; + i++) { + Ublock[k] = blockr[i]; + k++; + Urows[nlb]++; + } + rdispls[p] += rrows[p * nrb + lb]; + } +#ifdef ISORT + isort1 (Urows[nlb], &(Ublock[k - Urows[nlb]])); +#else + qsort (&(Ublock[k - Urows[nlb]]), (size_t) (Urows[nlb]), + sizeof (int_t), &superlu_sort_perm); +#endif + } + SUPERLU_FREE (blockr); + log_memory( -nblocks * iword, stat ); + + /* sort the block in L-factor */ + nblocks = 0; + for (lb = 0; lb < ncb; lb++) { + jb = lb * Pc + mycol; + index = Llu->Lrowind_bc_ptr[lb]; + if (index) { /* Not an empty column */ + nblocks += index[0]; + } + } + if (mycol < nsupers % grid->npcol) { + jb = ncb * Pc + mycol; + index = Llu->Lrowind_bc_ptr[ncb]; + if (index) { /* Not an empty column */ + nblocks += index[0]; + } + } + + if (!(Lblock = intMalloc_dist (nblocks))) ABORT ("Malloc fails for Lblock[]."); + if (!(Lrows = intMalloc_dist (1 + ncb))) ABORT ("Malloc fails for Lrows[]."); + + Lblock_bytes = nblocks * iword; + Lrows_bytes = (1 + ncb) * iword; + log_memory(Lblock_bytes + Lrows_bytes, stat); + + for (lb = 0; lb <= ncb; lb++) Lrows[lb] = 0; + nblocks = 0; + for (lb = 0; lb < ncb; lb++) { + Lrows[lb] = 0; + + jb = lb * Pc + mycol; + index = Llu->Lrowind_bc_ptr[lb]; + if (index) { /* Not an empty column */ + i = index[0]; + k = BC_HEADER; + krow = PROW (jb, grid); + if (krow == myrow) /* skip the diagonal block */ + { + k += LB_DESCRIPTOR + index[k + 1]; + i--; + } + + for (j = 0; j < i; j++) { + Lblock[nblocks] = index[k]; + Lrows[lb]++; + nblocks++; + + k += LB_DESCRIPTOR + index[k + 1]; + } + } +#ifdef ISORT + isort1 (Lrows[lb], &(Lblock[nblocks - Lrows[lb]])); +#else + qsort (&(Lblock[nblocks - Lrows[lb]]), (size_t) (Lrows[lb]), + sizeof (int_t), &superlu_sort_perm); +#endif + } + if (mycol < nsupers % grid->npcol) { + Lrows[ncb] = 0; + jb = ncb * Pc + mycol; + index = Llu->Lrowind_bc_ptr[ncb]; + if (index) { /* Not an empty column */ + i = index[0]; + k = BC_HEADER; + krow = PROW (jb, grid); + if (krow == myrow) { /* skip the diagonal block */ + k += LB_DESCRIPTOR + index[k 
+ 1]; + i--; + } + for (j = 0; j < i; j++) { + Lblock[nblocks] = index[k]; + Lrows[ncb]++; + nblocks++; + k += LB_DESCRIPTOR + index[k + 1]; + } +#ifdef ISORT + isort1 (Lrows[ncb], &(Lblock[nblocks - Lrows[ncb]])); +#else + qsort (&(Lblock[nblocks - Lrows[ncb]]), (size_t) (Lrows[ncb]), + sizeof (int_t), &superlu_sort_perm); +#endif + } + } + + /* look for the first local symmetric nonzero block match */ + if (!(sf_block = intMalloc_dist (nsupers))) ABORT ("Malloc fails for sf_block[]."); + if (!(sf_block_l = intMalloc_dist (nsupers))) ABORT ("Malloc fails for sf_block_l[]."); + + log_memory( 2 * nsupers * iword, stat ); + + for (lb = 0; lb < nsupers; lb++) + sf_block_l[lb] = nsupers; + i = 0; + j = 0; + for (jb = 0; jb < nlb; jb++) { + if (Urows[jb] > 0) { + ib = i + Urows[jb]; + lb = jb * Pc + mycol; + for (k = 0; k < Lrows[jb]; k++) { + while (Ublock[i] < Lblock[j] && i + 1 < ib) + i++; + + if (Ublock[i] == Lblock[j]) { + sf_block_l[lb] = Lblock[j]; + j += (Lrows[jb] - k); + k = Lrows[jb]; + } else { + j++; + } + } + i = ib; + } else { + j += Lrows[jb]; + } + } + if (mycol < nsupers % grid->npcol) { + if (Urows[nlb] > 0) { + ib = i + Urows[nlb]; + lb = nlb * Pc + mycol; + for (k = 0; k < Lrows[nlb]; k++) { + while (Ublock[i] < Lblock[j] && i + 1 < ib) + i++; + + if (Ublock[i] == Lblock[j]) + { + sf_block_l[lb] = Lblock[j]; + j += (Lrows[nlb] - k); + k = Lrows[nlb]; + } + else + { + j++; + } + } + i = ib; + } else { + j += Lrows[nlb]; + } + } + + /* compute the first global symmetric matchs */ + MPI_Allreduce (sf_block_l, sf_block, nsupers, mpi_int_t, MPI_MIN, + grid->comm); + SUPERLU_FREE (sf_block_l); + log_memory( -nsupers * iword, stat ); + + /* count number of nodes in DAG (i.e., the number of blocks on and above the first match) */ + if (!(nnodes_l = intMalloc_dist (nsupers))) ABORT ("Malloc fails for nnodes_l[]."); + if (!(nnodes_u = intMalloc_dist (nsupers))) ABORT ("Malloc fails for nnodes_u[]."); + log_memory( 2 * nsupers * iword, stat ); + + for (lb = 0; lb < nsupers; lb++) nnodes_l[lb] = 0; + for (lb = 0; lb < nsupers; lb++) nnodes_u[lb] = 0; + + nblocks = 0; + /* from U-factor */ + for (i = 0, jb = 0; jb < nlb; jb++) { + lb = jb * Pc + mycol; + ib = i + Urows[jb]; + while (i < ib) { + if (Ublock[i] <= sf_block[lb]) { + nnodes_u[lb]++; + i++; + nblocks++; + } else { /* get out */ + i = ib; + } + } + i = ib; + } + if (mycol < nsupers % grid->npcol) { + lb = nlb * Pc + mycol; + ib = i + Urows[nlb]; + while (i < ib) { + if (Ublock[i] <= sf_block[lb]) { + nnodes_u[lb]++; + i++; + nblocks++; + } else { /* get out */ + i = ib; + } + } + i = ib; + } + + /* from L-factor */ + for (i = 0, jb = 0; jb < nlb; jb++) { + lb = jb * Pc + mycol; + ib = i + Lrows[jb]; + while (i < ib) { + if (Lblock[i] < sf_block[lb]) { + nnodes_l[lb]++; + i++; + nblocks++; + } else { + i = ib; + } + } + i = ib; + } + if (mycol < nsupers % grid->npcol) { + lb = nlb * Pc + mycol; + ib = i + Lrows[nlb]; + while (i < ib) { + if (Lblock[i] < sf_block[lb]) { + nnodes_l[lb]++; + i++; + nblocks++; + } else { + i = ib; + } + } + i = ib; + } + +#ifdef USE_ALLGATHER + /* insert local nodes in DAG */ + if (!(edag_supno_l = intMalloc_dist (nsupers + nblocks))) + ABORT ("Malloc fails for edag_supno_l[]."); + edag_supno_l_bytes = (nsupers + nblocks) * iword; + log_memory(edag_supno_l_bytes, stat); + + iu = il = nblocks = 0; + for (lb = 0; lb < nsupers; lb++) { + j = lb / Pc; + pc = lb % Pc; + + edag_supno_l[nblocks] = nnodes_l[lb] + nnodes_u[lb]; + nblocks++; + if (mycol == pc) { + /* from U-factor */ + ib = iu + Urows[j]; + 
for (jb = 0; jb < nnodes_u[lb]; jb++) { + edag_supno_l[nblocks] = Ublock[iu]; + iu++; + nblocks++; + } + iu = ib; + + /* from L-factor */ + ib = il + Lrows[j]; + for (jb = 0; jb < nnodes_l[lb]; jb++) { + edag_supno_l[nblocks] = Lblock[il]; + il++; + nblocks++; + } + il = ib; + } + } + SUPERLU_FREE (nnodes_u); + log_memory(-nsupers * iword, stat); + + /* form global DAG on each processor */ + MPI_Allgather (&nblocks, 1, MPI_INT, recvcnts, 1, MPI_INT, + grid->comm); + nblocks = recvcnts[0]; + rdispls[0] = 0; + for (lb = 1; lb < Pc * Pr; lb++) { + rdispls[lb] = nblocks; + nblocks += recvcnts[lb]; + } + if (!(recvbuf = intMalloc_dist (nblocks))) ABORT ("Malloc fails for recvbuf[]."); + log_memory(nblocks * iword, stat); + + MPI_Allgatherv (edag_supno_l, recvcnts[iam], mpi_int_t, + recvbuf, recvcnts, rdispls, mpi_int_t, grid->comm); + SUPERLU_FREE (edag_supno_l); + log_memory(-edag_supno_l_bytes, stat); + + if (!(edag_supno = SUPERLU_MALLOC (nsupers * sizeof (int_t *)))) + ABORT ("Malloc fails for edag_supno[]."); + log_memory(nsupers * iword, stat); + + k = 0; + for (lb = 0; lb < nsupers; lb++) nnodes_l[lb] = 0; + for (p = 0; p < Pc * Pr; p++) { + for (lb = 0; lb < nsupers; lb++) { + nnodes_l[lb] += recvbuf[k]; + k += (1 + recvbuf[k]); + } + } + for (lb = 0; lb < nsupers; lb++) { + if (nnodes_l[lb] > 0) + if (!(edag_supno[lb] = intMalloc_dist (nnodes_l[lb]))) + ABORT ("Malloc fails for edag_supno[lb]."); + nnodes_l[lb] = 0; + } + k = 0; + for (p = 0; p < Pc * Pr; p++) { + for (lb = 0; lb < nsupers; lb++) { + jb = k + recvbuf[k] + 1; + k++; + for (; k < jb; k++) { + edag_supno[lb][nnodes_l[lb]] = recvbuf[k]; + nnodes_l[lb]++; + } + } + } + SUPERLU_FREE (recvbuf); + log_memory(-nblocks * iword, stat); + +#else /* not USE_ALLGATHER */ + int nlsupers = nsupers / Pc; + if (mycol < nsupers % Pc) nlsupers++; + + /* insert local nodes in DAG */ + if (!(edag_supno_l = intMalloc_dist (nlsupers + nblocks))) + ABORT ("Malloc fails for edag_supno_l[]."); + edag_supno_l_bytes = (nlsupers + nblocks) * iword; + log_memory(edag_supno_l_bytes, stat); + + iu = il = nblocks = 0; + for (lb = 0; lb < nsupers; lb++) { + j = lb / Pc; + pc = lb % Pc; + if (mycol == pc) { + edag_supno_l[nblocks] = nnodes_l[lb] + nnodes_u[lb]; + nblocks++; + /* from U-factor */ + ib = iu + Urows[j]; + for (jb = 0; jb < nnodes_u[lb]; jb++) { + edag_supno_l[nblocks] = Ublock[iu]; + iu++; + nblocks++; + } + iu = ib; + + /* from L-factor */ + ib = il + Lrows[j]; + for (jb = 0; jb < nnodes_l[lb]; jb++) { + edag_supno_l[nblocks] = Lblock[il]; + il++; + nblocks++; + } + il = ib; + } else if (nnodes_l[lb] + nnodes_u[lb] != 0) + printf (" # %d: nnodes[" IFMT "]=" IFMT "+" IFMT "\n", + grid->iam, lb, nnodes_l[lb], nnodes_u[lb]); + } + SUPERLU_FREE (nnodes_u); + log_memory(-nsupers * iword, stat); + + /* form global DAG on each processor */ + MPI_Allgather (&nblocks, 1, MPI_INT, recvcnts, 1, MPI_INT, grid->comm); + nblocks = recvcnts[0]; + rdispls[0] = 0; + for (lb = 1; lb < Pc * Pr; lb++) { + rdispls[lb] = nblocks; + nblocks += recvcnts[lb]; + } + if (!(recvbuf = intMalloc_dist (nblocks))) ABORT ("Malloc fails for recvbuf[]."); + log_memory(nblocks * iword, stat); + + MPI_Allgatherv (edag_supno_l, recvcnts[iam], mpi_int_t, + recvbuf, recvcnts, rdispls, mpi_int_t, grid->comm); + + SUPERLU_FREE (edag_supno_l); + log_memory(-edag_supno_l_bytes, stat); + + if (!(edag_supno = SUPERLU_MALLOC (nsupers * sizeof (int_t *)))) + ABORT ("Malloc fails for edag_supno[]."); + log_memory(nsupers * sizeof(int_t *), stat); + + k = 0; + for (lb = 0; lb < nsupers; 
lb++) nnodes_l[lb] = 0; + for (p = 0; p < Pc * Pr; p++) { + yourcol = MYCOL (p, grid); + + for (lb = 0; lb < nsupers; lb++) { + j = lb / Pc; + pc = lb % Pc; + if (yourcol == pc) { + nnodes_l[lb] += recvbuf[k]; + k += (1 + recvbuf[k]); + } + } + } + for (lb = 0; lb < nsupers; lb++) { + if (nnodes_l[lb] > 0) + if (!(edag_supno[lb] = intMalloc_dist (nnodes_l[lb]))) + ABORT ("Malloc fails for edag_supno[lb]."); + nnodes_l[lb] = 0; + } + k = 0; + for (p = 0; p < Pc * Pr; p++) { + yourcol = MYCOL (p, grid); + + for (lb = 0; lb < nsupers; lb++) { + j = lb / Pc; + pc = lb % Pc; + if (yourcol == pc) + { + jb = k + recvbuf[k] + 1; + k++; + for (; k < jb; k++) + { + edag_supno[lb][nnodes_l[lb]] = recvbuf[k]; + nnodes_l[lb]++; + } + } + } + } + SUPERLU_FREE (recvbuf); + log_memory( -nblocks * iword , stat); + +#endif /* end USE_ALL_GATHER */ + + /* initialize the num of child for each node */ + num_child = SUPERLU_MALLOC (nsupers * sizeof (int_t)); + for (i = 0; i < nsupers; i++) num_child[i] = 0; + for (i = 0; i < nsupers; i++) { + for (jb = 0; jb < nnodes_l[i]; jb++) { + num_child[edag_supno[i][jb]]++; + } + } + + /* push initial leaves to the fifo queue */ + nnodes = 0; + for (i = 0; i < nsupers; i++) { + if (num_child[i] == 0) { + ptr = SUPERLU_MALLOC (sizeof (etree_node)); + ptr->id = i; + ptr->next = NULL; + /*printf( " == push leaf %d (%d) ==\n",i,nnodes ); */ + nnodes++; + + if (nnodes == 1) { + head = ptr; + tail = ptr; + } else { + tail->next = ptr; + tail = ptr; + } + } + } + + /* process fifo queue, and compute the ordering */ + i = 0; + + while (nnodes > 0) { + /*printf( "=== pop %d (%d) ===\n",head->id,i ); */ + ptr = head; + j = ptr->id; + head = ptr->next; + + perm_c_supno[i] = j; + SUPERLU_FREE (ptr); + i++; + nnodes--; + + for (jb = 0; jb < nnodes_l[j]; jb++) { + num_child[edag_supno[j][jb]]--; + if (num_child[edag_supno[j][jb]] == 0) { + nnodes++; + + ptr = SUPERLU_MALLOC (sizeof (etree_node)); + ptr->id = edag_supno[j][jb]; + ptr->next = NULL; + + /*printf( "=== push %d ===\n",ptr->id ); */ + if (nnodes == 1) { + head = ptr; + tail = ptr; + } else { + tail->next = ptr; + tail = ptr; + } + } + } + /*printf( "\n" ); */ + } + for (lb = 0; lb < nsupers; lb++) + if (nnodes_l[lb] > 0) SUPERLU_FREE (edag_supno[lb]); + + SUPERLU_FREE (num_child); + SUPERLU_FREE (edag_supno); + SUPERLU_FREE (nnodes_l); + SUPERLU_FREE (sf_block); + SUPERLU_FREE (sendcnts); + + log_memory(-(4 * nsupers + (4 + 2 * nrbp1)*Pr*Pc) * iword, stat); + + SUPERLU_FREE (Ublock); + SUPERLU_FREE (Urows); + SUPERLU_FREE (Lblock); + SUPERLU_FREE (Lrows); + log_memory(-(Ublock_bytes + Urows_bytes + Lblock_bytes + Lrows_bytes), stat); + } + /* ======================== * + * end of static scheduling * + * ======================== */ + + for (lb = 0; lb < nsupers; lb++) iperm_c_supno[perm_c_supno[lb]] = lb; + +#if ( DEBUGlevel >= 1 ) + print_memorylog(stat, "after static schedule"); + check_perm_dist("perm_c_supno", nsupers, perm_c_supno); + check_perm_dist("iperm_c_supno", nsupers, iperm_c_supno); +#endif + + return 0; +} /* STATIC_SCHEDULE */ + diff --git a/SRC/ssuperlu_blas.c b/SRC/ssuperlu_blas.c new file mode 100644 index 00000000..5d820665 --- /dev/null +++ b/SRC/ssuperlu_blas.c @@ -0,0 +1,123 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. 
+*/
+
+
+/*! @file
+ * \brief Wrapper functions to call BLAS.
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Oak Ridge National Lab
+ * December 6, 2020
+ * </pre>
+ */
+
+#include "superlu_sdefs.h"
+
+#ifdef _CRAY
+_fcd ftcs = _cptofcd("N", strlen("N"));
+_fcd ftcs1 = _cptofcd("L", strlen("L"));
+_fcd ftcs2 = _cptofcd("N", strlen("N"));
+_fcd ftcs3 = _cptofcd("U", strlen("U"));
+#endif
+
+int superlu_sgemm(const char *transa, const char *transb,
+                  int m, int n, int k, float alpha, float *a,
+                  int lda, float *b, int ldb, float beta, float *c, int ldc)
+{
+#ifdef _CRAY
+    _fcd ftcs = _cptofcd(transa, strlen(transa));
+    _fcd ftcs1 = _cptofcd(transb, strlen(transb));
+    return SGEMM(ftcs, ftcs1, &m, &n, &k,
+                 &alpha, a, &lda, b, &ldb, &beta, c, &ldc);
+#elif defined(USE_VENDOR_BLAS)
+    sgemm_(transa, transb, &m, &n, &k,
+           &alpha, a, &lda, b, &ldb, &beta, c, &ldc, 1, 1);
+    return 0;
+#else
+    return sgemm_(transa, transb, &m, &n, &k,
+                  &alpha, a, &lda, b, &ldb, &beta, c, &ldc);
+#endif
+}
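+
+/* Usage sketch (illustrative): compute C := 2*A*B + C on small column-major
+ * blocks.  The arrays below are hypothetical local buffers, not library
+ * symbols:
+ *
+ *     float A[4] = {1., 2., 3., 4.};   // 2 x 2, lda = 2
+ *     float B[4] = {5., 6., 7., 8.};   // 2 x 2, ldb = 2
+ *     float C[4] = {0., 0., 0., 0.};   // 2 x 2, ldc = 2
+ *     superlu_sgemm("N", "N", 2, 2, 2, 2.0, A, 2, B, 2, 1.0, C, 2);
+ */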
+
+int superlu_strsm(const char *sideRL, const char *uplo,
+                  const char *transa, const char *diag,
+                  const int m, const int n,
+                  const float alpha, const float *a,
+                  const int lda, float *b, const int ldb)
+{
+#if defined(USE_VENDOR_BLAS)
+    strsm_(sideRL, uplo, transa, diag,
+           &m, &n, &alpha, a, &lda, b, &ldb,
+           1, 1, 1, 1);
+    return 0;
+#else
+    return strsm_(sideRL, uplo, transa, diag,
+                  &m, &n, &alpha, a, &lda, b, &ldb);
+#endif
+}
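+
+/* Usage sketch (illustrative): multiple right-hand-side triangular solve
+ * B := inv(L)*B, where a holds a unit lower-triangular m x m block
+ * (column-major, leading dimension lda) and b is m x n with leading
+ * dimension ldb; all names are caller-supplied:
+ *
+ *     superlu_strsm("L", "L", "N", "U", m, n, 1.0, a, lda, b, ldb);
+ */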
+
+int superlu_sger(const int m, const int n, const float alpha,
+                 const float *x, const int incx, const float *y,
+                 const int incy, float *a, const int lda)
+{
+#ifdef _CRAY
+    SGER(&m, &n, &alpha, x, &incx, y, &incy, a, &lda);
+#else
+    sger_(&m, &n, &alpha, x, &incx, y, &incy, a, &lda);
+#endif
+
+    return 0;
+}
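+
+/* Usage sketch (illustrative): rank-1 update A := A - x*y', the elementary
+ * step of right-looking LU on a dense m x n block stored column-major in a
+ * (m, n, x, y, a, lda are caller-supplied):
+ *
+ *     superlu_sger(m, n, -1.0, x, 1, y, 1, a, lda);
+ */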
+
+int superlu_sscal(const int n, const float alpha, float *x, const int incx)
+{
+    sscal_(&n, &alpha, x, &incx);
+    return 0;
+}
+
+int superlu_saxpy(const int n, const float alpha,
+    const float *x, const int incx, float *y, const int incy)
+{
+    saxpy_(&n, &alpha, x, &incx, y, &incy);
+    return 0;
+}
+
+int superlu_sgemv(const char *trans, const int m,
+                  const int n, const float alpha, const float *a,
+                  const int lda, const float *x, const int incx,
+                  const float beta, float *y, const int incy)
+{
+#ifdef USE_VENDOR_BLAS
+    sgemv_(trans, &m, &n, &alpha, a, &lda, x, &incx, &beta, y, &incy, 1);
+#else
+    sgemv_(trans, &m, &n, &alpha, a, &lda, x, &incx, &beta, y, &incy);
+#endif
+    
+    return 0;
+}
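+
+/* Usage sketch (illustrative): y := A*x for a column-major m x n matrix a
+ * with leading dimension lda and unit strides (all names caller-supplied):
+ *
+ *     superlu_sgemv("N", m, n, 1.0, a, lda, x, 1, 0.0, y, 1);
+ */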
+
+int superlu_strsv(char *uplo, char *trans, char *diag,
+                  int n, float *a, int lda, float *x, int incx)
+{
+#ifdef _CRAY
+    // _fcd ftcs = _cptofcd("N", strlen("N"));
+    STRSV(_cptofcd(uplo, strlen(uplo)), _cptofcd(trans, strlen(trans)), _cptofcd(diag, strlen(diag)), 
+         &n, a, &lda, x, &incx);
+#elif defined (USE_VENDOR_BLAS)
+    strsv_(uplo, trans, diag, &n, a, &lda, x, &incx, 1, 1, 1);
+#else
+    strsv_(uplo, trans, diag, &n, a, &lda, x, &incx);
+#endif
+    
+    return 0;
+}
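+
+/* Usage sketch (illustrative): solve the unit lower-triangular system
+ * L*x = b in place, where a holds L column-major with leading dimension
+ * lda and x contains b on entry (n, a, lda, x are caller-supplied):
+ *
+ *     superlu_strsv("L", "N", "U", n, a, lda, x, 1);
+ */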
+
diff --git a/SRC/ssuperlu_gpu.cu b/SRC/ssuperlu_gpu.cu
new file mode 100644
index 00000000..7b86574f
--- /dev/null
+++ b/SRC/ssuperlu_gpu.cu
@@ -0,0 +1,1788 @@
+
+
+/*! @file
+ * \brief Descriptions and declarations for structures used in GPU
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley,
+ * Georgia Institute of Technology, Oak Ridge National Laboratory
+ * March 14, 2021 version 7.0.0
+ * </pre>
+ */ + +//#define GPU_DEBUG + +#include "mpi.h" +// #include "sec_structs.h" +#include +#include +#include + +#undef Reduce +#include "cub/cub.cuh" +//#include + +#include "slustruct_gpu.h" + + +//extern "C" { +// void cblas_daxpy(const int N, const double alpha, const double *X, +// const int incX, double *Y, const int incY); +//} + +// cublasStatus_t checkCublas(cublasStatus_t result) +// { +// #if defined(DEBUG) || defined(_DEBUG) +// if (result != CUBLAS_STATUS_SUCCESS) +// { +// fprintf(stderr, "CUDA Blas Runtime Error: %s\n", cublasGetErrorString(result)); +// assert(result == CUBLAS_STATUS_SUCCESS); +// } +// #endif +// return result; +// } + + +// #define UNIT_STRIDE + +#if 0 ////////// this routine is not used anymore +__device__ inline +void device_scatter_l (int_t thread_id, + int_t nsupc, int_t temp_nbrow, + int_t *usub, int_t iukp, int_t klst, + float *nzval, int_t ldv, + float *tempv, int_t nbrow, + // int_t *indirect2_thread + int *indirect2_thread + ) +{ + + + int_t segsize, jj; + + for (jj = 0; jj < nsupc; ++jj) + { + segsize = klst - usub[iukp + jj]; + if (segsize) + { + if (thread_id < temp_nbrow) + { + +#ifndef UNIT_STRIDE + nzval[indirect2_thread[thread_id]] -= tempv[thread_id]; +#else + nzval[thread_id] -= tempv[thread_id]; /*making access unit strided*/ +#endif + } + tempv += nbrow; + } + nzval += ldv; + } +} +#endif ///////////// not used + +#define THREAD_BLOCK_SIZE 256 /* Sherry: was 192. should be <= MAX_SUPER_SIZE */ + +__device__ inline +void sdevice_scatter_l_2D (int thread_id, + int nsupc, int temp_nbrow, + int_t *usub, int iukp, int_t klst, + float *nzval, int ldv, + const float *tempv, int nbrow, + int *indirect2_thread, + int nnz_cols, int ColPerBlock, + int *IndirectJ3 + ) +{ + int i; + if ( thread_id < temp_nbrow * ColPerBlock ) { + int thread_id_x = thread_id % temp_nbrow; + int thread_id_y = thread_id / temp_nbrow; + +#define UNROLL_ITER 8 + +#pragma unroll 4 + for (int col = thread_id_y; col < nnz_cols ; col += ColPerBlock) + { + i = ldv * IndirectJ3[col] + indirect2_thread[thread_id_x]; + nzval[i] -= tempv[nbrow * col + thread_id_x]; + } + } +} + +/* Sherry: this routine is not used */ +#if 0 ////////////////////////////////////////////// +__global__ +void cub_scan_test(void) +{ + int thread_id = threadIdx.x; + typedef cub::BlockScan BlockScan; /*1D int data type*/ + + __shared__ typename BlockScan::TempStorage temp_storage; /*storage temp*/ + + __shared__ int IndirectJ1[MAX_SUPER_SIZE]; + __shared__ int IndirectJ2[MAX_SUPER_SIZE]; + + if (thread_id < MAX_SUPER_SIZE) + { + IndirectJ1[thread_id] = (thread_id + 1) % 2; + } + + __syncthreads(); + if (thread_id < MAX_SUPER_SIZE) + BlockScan(temp_storage).InclusiveSum (IndirectJ1[thread_id], IndirectJ2[thread_id]); + + + if (thread_id < MAX_SUPER_SIZE) + printf("%d %d\n", thread_id, IndirectJ2[thread_id]); + +} +#endif /////////////////////////////////// not used + + +__device__ inline +void device_scatter_u_2D (int thread_id, + int temp_nbrow, int nsupc, + float * ucol, + int_t * usub, int iukp, + int_t ilst, int_t klst, + int_t * index, int iuip_lib, + float * tempv, int nbrow, + int *indirect, + int nnz_cols, int ColPerBlock, + int *IndirectJ1, + int *IndirectJ3 + ) +{ + int i; + + if ( thread_id < temp_nbrow * ColPerBlock ) + { + /* 1D threads are logically arranged in 2D shape. 
*/ + int thread_id_x = thread_id % temp_nbrow; + int thread_id_y = thread_id / temp_nbrow; + +#pragma unroll 4 + for (int col = thread_id_y; col < nnz_cols ; col += ColPerBlock) + { + i = IndirectJ1[IndirectJ3[col]] + indirect[thread_id_x]; + ucol[i] -= tempv[nbrow * col + thread_id_x]; + } + } +} + + +__device__ inline +void device_scatter_u (int_t thread_id, + int_t temp_nbrow, int_t nsupc, + float * ucol, + int_t * usub, int_t iukp, + int_t ilst, int_t klst, + int_t * index, int_t iuip_lib, + float * tempv, int_t nbrow, + // int_t *indirect + int *indirect + ) +{ + int_t segsize, fnz, jj; + for (jj = 0; jj < nsupc; ++jj) + { + segsize = klst - usub[iukp + jj]; + fnz = index[iuip_lib++]; + ucol -= fnz; + if (segsize) { /* Nonzero segment in U(k.j). */ + if (thread_id < temp_nbrow) + { +#ifndef UNIT_STRIDE + ucol[indirect[thread_id]] -= tempv[thread_id]; +#else + /* making access unit strided; + it doesn't work; it is for measurements */ + ucol[thread_id] -= tempv[thread_id]; +#endif + } + tempv += nbrow; + } + ucol += ilst ; + } +} + + +__global__ +void Scatter_GPU_kernel( + int_t streamId, + int_t ii_st, int_t ii_end, + int_t jj_st, int_t jj_end, /* defines rectangular Schur block to be scatter */ + int_t klst, + int_t jj0, /* 0 on entry */ + int_t nrows, int_t ldt, int_t npcol, int_t nprow, + sLUstruct_gpu_t * A_gpu) +{ + + /* initializing pointers */ + int_t *xsup = A_gpu->xsup; + int_t *UrowindPtr = A_gpu->UrowindPtr; + int_t *UrowindVec = A_gpu->UrowindVec; + int_t *UnzvalPtr = A_gpu->UnzvalPtr; + float *UnzvalVec = A_gpu->UnzvalVec; + int_t *LrowindPtr = A_gpu->LrowindPtr; + int_t *LrowindVec = A_gpu->LrowindVec; + int_t *LnzvalPtr = A_gpu->LnzvalPtr; + float *LnzvalVec = A_gpu->LnzvalVec; + float *bigV = A_gpu->scubufs[streamId].bigV; + local_l_blk_info_t *local_l_blk_infoVec = A_gpu->local_l_blk_infoVec; + local_u_blk_info_t *local_u_blk_infoVec = A_gpu->local_u_blk_infoVec; + int_t *local_l_blk_infoPtr = A_gpu->local_l_blk_infoPtr; + int_t *local_u_blk_infoPtr = A_gpu->local_u_blk_infoPtr; + Remain_info_t *Remain_info = A_gpu->scubufs[streamId].Remain_info; + Ublock_info_t *Ublock_info = A_gpu->scubufs[streamId].Ublock_info; + int_t *lsub = A_gpu->scubufs[streamId].lsub; + int_t *usub = A_gpu->scubufs[streamId].usub; + + /* thread block assignment: this thread block is + assigned to block (lb, j) in 2D grid */ + int lb = blockIdx.x + ii_st; + int j = blockIdx.y + jj_st; + __shared__ int indirect_thread[MAX_SUPER_SIZE]; /* row-wise */ + __shared__ int indirect2_thread[MAX_SUPER_SIZE]; /* row-wise */ + __shared__ int IndirectJ1[THREAD_BLOCK_SIZE]; /* column-wise */ + __shared__ int IndirectJ3[THREAD_BLOCK_SIZE]; /* column-wise */ + + /* see CUB page https://nvlabs.github.io/cub/. Implement threads collectives */ + typedef cub::BlockScan BlockScan; /*1D int data type*/ + __shared__ typename BlockScan::TempStorage temp_storage; /*storage temp*/ + + int thread_id = threadIdx.x; + + int iukp = Ublock_info[j].iukp; + int jb = Ublock_info[j].jb; + int nsupc = SuperSize (jb); + int ljb = jb / npcol; + + float *tempv1; + if (jj_st == jj0) + { + tempv1 = (j == jj_st) ? bigV + : bigV + Ublock_info[j - 1].full_u_cols * nrows; + } + else + { + tempv1 = (j == jj_st) ? bigV + : bigV + (Ublock_info[j - 1].full_u_cols - + Ublock_info[jj_st - 1].full_u_cols) * nrows; + } + + /* # of nonzero columns in block j */ + int nnz_cols = (j == 0) ? Ublock_info[j].full_u_cols + : (Ublock_info[j].full_u_cols - Ublock_info[j - 1].full_u_cols); + int cum_ncol = (j == 0) ? 
0 : Ublock_info[j - 1].full_u_cols; + + int lptr = Remain_info[lb].lptr; + int ib = Remain_info[lb].ib; + int temp_nbrow = lsub[lptr + 1]; /* number of rows in the current L block */ + lptr += LB_DESCRIPTOR; + + int_t cum_nrow; + if (ii_st == 0) + { + cum_nrow = (lb == 0 ? 0 : Remain_info[lb - 1].FullRow); + } + else + { + cum_nrow = (lb == 0 ? 0 : Remain_info[lb - 1].FullRow - Remain_info[ii_st - 1].FullRow); + } + + tempv1 += cum_nrow; + + if (ib < jb) /*scatter U code */ + { + int ilst = FstBlockC (ib + 1); + int lib = ib / nprow; /* local index of row block ib */ + int_t *index = &UrowindVec[UrowindPtr[lib]]; + + int num_u_blocks = index[0]; + + int ljb = (jb) / npcol; /* local index of column block jb */ + + /* Each thread is responsible for one block column */ + __shared__ int ljb_ind; + /*do a search ljb_ind at local row lib*/ + int blks_per_threads = CEILING(num_u_blocks, THREAD_BLOCK_SIZE); + for (int i = 0; i < blks_per_threads; ++i) + /* each thread is assigned a chunk of consecutive U blocks to search */ + { + /* only one thread finds the block index matching ljb */ + if (thread_id * blks_per_threads + i < num_u_blocks && + local_u_blk_infoVec[ local_u_blk_infoPtr[lib] + thread_id * blks_per_threads + i ].ljb == ljb) + { + ljb_ind = thread_id * blks_per_threads + i; + } + } + __syncthreads(); + + int iuip_lib = local_u_blk_infoVec[ local_u_blk_infoPtr[lib] + ljb_ind].iuip; + int ruip_lib = local_u_blk_infoVec[ local_u_blk_infoPtr[lib] + ljb_ind].ruip; + iuip_lib += UB_DESCRIPTOR; + float *Unzval_lib = &UnzvalVec[UnzvalPtr[lib]]; + float *ucol = &Unzval_lib[ruip_lib]; + + if (thread_id < temp_nbrow) /* row-wise */ + { + /* cyclically map each thread to a row */ + indirect_thread[thread_id] = (int) lsub[lptr + thread_id]; + } + + /* column-wise: each thread is assigned one column */ + if (thread_id < nnz_cols) + IndirectJ3[thread_id] = A_gpu->scubufs[streamId].usub_IndirectJ3[cum_ncol + thread_id]; + /* indirectJ3[j] == kk means the j-th nonzero segment + points to column kk in this supernode */ + + __syncthreads(); + + /* threads are divided into multiple columns */ + int ColPerBlock = THREAD_BLOCK_SIZE / temp_nbrow; + + if (thread_id < THREAD_BLOCK_SIZE) + IndirectJ1[thread_id] = 0; + + if (thread_id < THREAD_BLOCK_SIZE) + { + if (thread_id < nsupc) + { + /* fstnz subscript of each column in the block */ + IndirectJ1[thread_id] = index[iuip_lib + thread_id]; + } + } + + /* perform an inclusive block-wide prefix sum among all threads */ + if (thread_id < THREAD_BLOCK_SIZE) + BlockScan(temp_storage).InclusiveSum(IndirectJ1[thread_id], IndirectJ1[thread_id]); + + if (thread_id < THREAD_BLOCK_SIZE) + IndirectJ1[thread_id] = -IndirectJ1[thread_id] + ilst * thread_id; + + __syncthreads(); + + device_scatter_u_2D ( + thread_id, + temp_nbrow, nsupc, + ucol, + usub, iukp, + ilst, klst, + index, iuip_lib, + tempv1, nrows, + indirect_thread, + nnz_cols, ColPerBlock, + IndirectJ1, + IndirectJ3 ); + + } + else /* ib >= jb, scatter L code */ + { + + int rel; + float *nzval; + int_t *index = &LrowindVec[LrowindPtr[ljb]]; + int num_l_blocks = index[0]; + int ldv = index[1]; + + int fnz = FstBlockC (ib); + int lib = ib / nprow; + + __shared__ int lib_ind; + /*do a search lib_ind for lib*/ + int blks_per_threads = CEILING(num_l_blocks, THREAD_BLOCK_SIZE); + for (int i = 0; i < blks_per_threads; ++i) + { + if (thread_id * blks_per_threads + i < num_l_blocks && + local_l_blk_infoVec[ local_l_blk_infoPtr[ljb] + thread_id * blks_per_threads + i ].lib == lib) + { + lib_ind = thread_id * 
blks_per_threads + i; + } + } + __syncthreads(); + + int lptrj = local_l_blk_infoVec[ local_l_blk_infoPtr[ljb] + lib_ind].lptrj; + int luptrj = local_l_blk_infoVec[ local_l_blk_infoPtr[ljb] + lib_ind].luptrj; + lptrj += LB_DESCRIPTOR; + int dest_nbrow = index[lptrj - 1]; + + if (thread_id < dest_nbrow) + { + rel = index[lptrj + thread_id] - fnz; + indirect_thread[rel] = thread_id; + } + __syncthreads(); + + /* can be precalculated */ + if (thread_id < temp_nbrow) + { + rel = lsub[lptr + thread_id] - fnz; + indirect2_thread[thread_id] = indirect_thread[rel]; + } + if (thread_id < nnz_cols) + IndirectJ3[thread_id] = (int) A_gpu->scubufs[streamId].usub_IndirectJ3[cum_ncol + thread_id]; + __syncthreads(); + + int ColPerBlock = THREAD_BLOCK_SIZE / temp_nbrow; + + nzval = &LnzvalVec[LnzvalPtr[ljb]] + luptrj; + sdevice_scatter_l_2D( + thread_id, + nsupc, temp_nbrow, + usub, iukp, klst, + nzval, ldv, + tempv1, nrows, indirect2_thread, + nnz_cols, ColPerBlock, + IndirectJ3); + } /* end else ib >= jb */ + +} /* end Scatter_GPU_kernel */ + + +#define GPU_2D_SCHUDT /* Not used */ + +int sSchurCompUpdate_GPU( + int_t streamId, + int_t jj_cpu, /* 0 on entry, pointing to the start of Phi part */ + int_t nub, /* jj_cpu on entry, pointing to the end of the Phi part */ + int_t klst, int_t knsupc, + int_t Rnbrow, int_t RemainBlk, + int_t Remain_lbuf_send_size, + int_t bigu_send_size, int_t ldu, + int_t mcb, /* num_u_blks_hi */ + int_t buffer_size, int_t lsub_len, int_t usub_len, + int_t ldt, int_t k0, + ssluGPU_t *sluGPU, gridinfo_t *grid +) +{ + + sLUstruct_gpu_t * A_gpu = sluGPU->A_gpu; + sLUstruct_gpu_t * dA_gpu = sluGPU->dA_gpu; + int_t nprow = grid->nprow; + int_t npcol = grid->npcol; + + cudaStream_t FunCallStream = sluGPU->funCallStreams[streamId]; + cublasHandle_t cublas_handle0 = sluGPU->cublasHandles[streamId]; + int_t * lsub = A_gpu->scubufs[streamId].lsub_buf; + int_t * usub = A_gpu->scubufs[streamId].usub_buf; + Remain_info_t *Remain_info = A_gpu->scubufs[streamId].Remain_info_host; + float * Remain_L_buff = A_gpu->scubufs[streamId].Remain_L_buff_host; + Ublock_info_t *Ublock_info = A_gpu->scubufs[streamId].Ublock_info_host; + float * bigU = A_gpu->scubufs[streamId].bigU_host; + + A_gpu->isOffloaded[k0] = 1; + /* start by sending data to */ + int_t *xsup = A_gpu->xsup_host; + int_t col_back = (jj_cpu == 0) ? 0 : Ublock_info[jj_cpu - 1].full_u_cols; + // if(nub<1) return; + int_t ncols = Ublock_info[nub - 1].full_u_cols - col_back; + + /* Sherry: can get max_super_size from sp_ienv(3) */ + int_t indirectJ1[MAX_SUPER_SIZE]; // 0 indicates an empry segment + int_t indirectJ2[MAX_SUPER_SIZE]; // # of nonzero segments so far + int_t indirectJ3[MAX_SUPER_SIZE]; /* indirectJ3[j] == k means the + j-th nonzero segment points + to column k in this supernode */ + /* calculate usub_indirect */ + for (int jj = jj_cpu; jj < nub; ++jj) + { + int_t iukp = Ublock_info[jj].iukp; + int_t jb = Ublock_info[jj].jb; + int_t nsupc = SuperSize (jb); + int_t addr = (jj == 0) ? 0 + : Ublock_info[jj - 1].full_u_cols - col_back; + + for (int_t kk = 0; kk < nsupc; ++kk) // old: MAX_SUPER_SIZE + { + indirectJ1[kk] = 0; + } + + for (int_t kk = 0; kk < nsupc; ++kk) + { + indirectJ1[kk] = ((klst - usub[iukp + kk]) == 0) ? 
0 : 1; + } + + /*prefix sum - indicates # of nonzero segments up to column kk */ + indirectJ2[0] = indirectJ1[0]; + for (int_t kk = 1; kk < nsupc; ++kk) // old: MAX_SUPER_SIZE + { + indirectJ2[kk] = indirectJ2[kk - 1] + indirectJ1[kk]; + } + + /* total number of nonzero segments in this supernode */ + int nnz_col = indirectJ2[nsupc - 1]; // old: MAX_SUPER_SIZE + + /* compactation */ + for (int_t kk = 0; kk < nsupc; ++kk) // old: MAX_SUPER_SIZE + { + if (indirectJ1[kk]) /* kk is a nonzero segment */ + { + /* indirectJ3[j] == kk means the j-th nonzero segment + points to column kk in this supernode */ + indirectJ3[indirectJ2[kk] - 1] = kk; + } + } + + for (int i = 0; i < nnz_col; ++i) + { + /* addr == total # of full columns before current block jj */ + A_gpu->scubufs[streamId].usub_IndirectJ3_host[addr + i] = indirectJ3[i]; + } + } /* end for jj ... calculate usub_indirect */ + + //printf("sSchurCompUpdate_GPU[3]: jj_cpu %d, nub %d\n", jj_cpu, nub); fflush(stdout); + + /*sizeof RemainLbuf = Rnbuf*knsupc */ + double tTmp = SuperLU_timer_(); + cudaEventRecord(A_gpu->ePCIeH2D[k0], FunCallStream); + + checkCuda(cudaMemcpyAsync(A_gpu->scubufs[streamId].usub_IndirectJ3, + A_gpu->scubufs[streamId].usub_IndirectJ3_host, + ncols * sizeof(int_t), cudaMemcpyHostToDevice, + FunCallStream)) ; + + checkCuda(cudaMemcpyAsync(A_gpu->scubufs[streamId].Remain_L_buff, Remain_L_buff, + Remain_lbuf_send_size * sizeof(float), + cudaMemcpyHostToDevice, FunCallStream)) ; + + checkCuda(cudaMemcpyAsync(A_gpu->scubufs[streamId].bigU, bigU, + bigu_send_size * sizeof(float), + cudaMemcpyHostToDevice, FunCallStream) ); + + checkCuda(cudaMemcpyAsync(A_gpu->scubufs[streamId].Remain_info, Remain_info, + RemainBlk * sizeof(Remain_info_t), + cudaMemcpyHostToDevice, FunCallStream) ); + + checkCuda(cudaMemcpyAsync(A_gpu->scubufs[streamId].Ublock_info, Ublock_info, + mcb * sizeof(Ublock_info_t), cudaMemcpyHostToDevice, + FunCallStream) ); + + checkCuda(cudaMemcpyAsync(A_gpu->scubufs[streamId].lsub, lsub, + lsub_len * sizeof(int_t), cudaMemcpyHostToDevice, + FunCallStream) ); + + checkCuda(cudaMemcpyAsync(A_gpu->scubufs[streamId].usub, usub, + usub_len * sizeof(int_t), cudaMemcpyHostToDevice, + FunCallStream) ); + + A_gpu->tHost_PCIeH2D += SuperLU_timer_() - tTmp; + A_gpu->cPCIeH2D += Remain_lbuf_send_size * sizeof(float) + + bigu_send_size * sizeof(float) + + RemainBlk * sizeof(Remain_info_t) + + mcb * sizeof(Ublock_info_t) + + lsub_len * sizeof(int_t) + + usub_len * sizeof(int_t); + + float alpha = 1.0, beta = 0.0; + + int_t ii_st = 0; + int_t ii_end = 0; + int_t maxGemmBlockDim = (int) sqrt(buffer_size); + // int_t maxGemmBlockDim = 8000; + + /* Organize GEMM by blocks of [ii_st : ii_end, jj_st : jj_end] that + fits in the buffer_size */ + while (ii_end < RemainBlk) { + ii_st = ii_end; + ii_end = RemainBlk; + int_t nrow_max = maxGemmBlockDim; +// nrow_max = Rnbrow; + int_t remaining_rows = (ii_st == 0) ? Rnbrow : Rnbrow - Remain_info[ii_st - 1].FullRow; + nrow_max = (remaining_rows / nrow_max) > 0 ? remaining_rows / CEILING(remaining_rows, nrow_max) : nrow_max; + + int_t ResRow = (ii_st == 0) ? 
0 : Remain_info[ii_st - 1].FullRow; + for (int_t i = ii_st; i < RemainBlk - 1; ++i) + { + if ( Remain_info[i + 1].FullRow > ResRow + nrow_max) + { + ii_end = i; + break; /* row dimension reaches nrow_max */ + } + } + + int_t nrows; /* actual row dimension for GEMM */ + int_t st_row; + if (ii_st > 0) + { + nrows = Remain_info[ii_end - 1].FullRow - Remain_info[ii_st - 1].FullRow; + st_row = Remain_info[ii_st - 1].FullRow; + } + else + { + nrows = Remain_info[ii_end - 1].FullRow; + st_row = 0; + } + + int jj_st = jj_cpu; + int jj_end = jj_cpu; + + while (jj_end < nub && nrows > 0 ) + { + int_t remaining_cols = (jj_st == jj_cpu) ? ncols : ncols - Ublock_info[jj_st - 1].full_u_cols; + if ( remaining_cols * nrows < buffer_size) + { + jj_st = jj_end; + jj_end = nub; + } + else /* C matrix cannot fit in buffer, need to break into pieces */ + { + int_t ncol_max = buffer_size / nrows; + /** Must revisit **/ + ncol_max = SUPERLU_MIN(ncol_max, maxGemmBlockDim); + ncol_max = (remaining_cols / ncol_max) > 0 ? + remaining_cols / CEILING(remaining_cols, ncol_max) + : ncol_max; + + jj_st = jj_end; + jj_end = nub; + + int_t ResCol = (jj_st == 0) ? 0 : Ublock_info[jj_st - 1].full_u_cols; + for (int_t j = jj_st; j < nub - 1; ++j) + { + if (Ublock_info[j + 1].full_u_cols > ResCol + ncol_max) + { + jj_end = j; + break; + } + } + } /* end-if-else */ + + int ncols; + int st_col; + if (jj_st > 0) + { + ncols = Ublock_info[jj_end - 1].full_u_cols - Ublock_info[jj_st - 1].full_u_cols; + st_col = Ublock_info[jj_st - 1].full_u_cols; + if (ncols == 0) exit(0); + } + else + { + ncols = Ublock_info[jj_end - 1].full_u_cols; + st_col = 0; + } + + /* none of the matrix dimension is zero. */ + if (nrows > 0 && ldu > 0 && ncols > 0) + { + if (nrows * ncols > buffer_size) { + printf("!! Matrix size %lld x %lld exceeds buffer_size \n", + nrows, ncols, buffer_size); + fflush(stdout); + } + assert(nrows * ncols <= buffer_size); + cublasSetStream(cublas_handle0, FunCallStream); + cudaEventRecord(A_gpu->GemmStart[k0], FunCallStream); + cublasSgemm(cublas_handle0, CUBLAS_OP_N, CUBLAS_OP_N, + nrows, ncols, ldu, &alpha, + &A_gpu->scubufs[streamId].Remain_L_buff[(knsupc - ldu) * Rnbrow + st_row], Rnbrow, + &A_gpu->scubufs[streamId].bigU[st_col * ldu], ldu, + &beta, A_gpu->scubufs[streamId].bigV, nrows); + +// #define SCATTER_OPT +#ifdef SCATTER_OPT + cudaStreamSynchronize(FunCallStream); +#warning this function is synchronous +#endif + cudaEventRecord(A_gpu->GemmEnd[k0], FunCallStream); + + A_gpu->GemmFLOPCounter += 2.0 * (double) nrows * ncols * ldu; + + /* + * Scattering the output + */ + dim3 dimBlock(THREAD_BLOCK_SIZE); // 1d thread + + dim3 dimGrid(ii_end - ii_st, jj_end - jj_st); + + Scatter_GPU_kernel <<< dimGrid, dimBlock, 0, FunCallStream>>> + (streamId, ii_st, ii_end, jj_st, jj_end, klst, + 0, nrows, ldt, npcol, nprow, dA_gpu); +#ifdef SCATTER_OPT + cudaStreamSynchronize(FunCallStream); +#warning this function is synchrnous +#endif + + cudaEventRecord(A_gpu->ScatterEnd[k0], FunCallStream); + + A_gpu->ScatterMOPCounter += 3.0 * (double) nrows * ncols; + } /* endif ... none of the matrix dimension is zero. 
*/ + + } /* end while jj_end < nub */ + + } /* end while (ii_end < RemainBlk) */ + + return 0; +} /* end sSchurCompUpdate_GPU */ + + +static void print_occupancy() +{ + int blockSize; // The launch configurator returned block size + int minGridSize; /* The minimum grid size needed to achieve the + best potential occupancy */ + + cudaOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize, + Scatter_GPU_kernel, 0, 0); + printf("Occupancy: MinGridSize %d blocksize %d \n", minGridSize, blockSize); +} + +static void printDevProp(cudaDeviceProp devProp) +{ + size_t mfree, mtotal; + cudaMemGetInfo (&mfree, &mtotal); + + printf("pciBusID: %d\n", devProp.pciBusID); + printf("pciDeviceID: %d\n", devProp.pciDeviceID); + printf("GPU Name: %s\n", devProp.name); + printf("Total global memory: %zu\n", devProp.totalGlobalMem); + printf("Total free memory: %zu\n", mfree); + printf("Clock rate: %d\n", devProp.clockRate); + + return; +} + + +static size_t get_acc_memory () +{ + + size_t mfree, mtotal; + cudaMemGetInfo (&mfree, &mtotal); +#if 0 + printf("Total memory %zu & free memory %zu\n", mtotal, mfree); +#endif + return (size_t) (0.9 * (double) mfree) / get_mpi_process_per_gpu (); + + +} + +int sfree_LUstruct_gpu (sLUstruct_gpu_t * A_gpu) +{ + checkCuda(cudaFree(A_gpu->LrowindVec)); + checkCuda(cudaFree(A_gpu->LrowindPtr)); + + checkCuda(cudaFree(A_gpu->LnzvalVec)); + checkCuda(cudaFree(A_gpu->LnzvalPtr)); + free(A_gpu->LnzvalPtr_host); + /*freeing the pinned memory*/ + int_t streamId = 0; + checkCuda (cudaFreeHost (A_gpu->scubufs[streamId].Remain_info_host)); + checkCuda (cudaFreeHost (A_gpu->scubufs[streamId].Ublock_info_host)); + checkCuda (cudaFreeHost (A_gpu->scubufs[streamId].Remain_L_buff_host)); + checkCuda (cudaFreeHost (A_gpu->scubufs[streamId].bigU_host)); + + checkCuda(cudaFreeHost(A_gpu->acc_L_buff)); + checkCuda(cudaFreeHost(A_gpu->acc_U_buff)); + checkCuda(cudaFreeHost(A_gpu->scubufs[streamId].lsub_buf)); + checkCuda(cudaFreeHost(A_gpu->scubufs[streamId].usub_buf)); + + + free(A_gpu->isOffloaded); + free(A_gpu->GemmStart); + free(A_gpu->GemmEnd); + free(A_gpu->ScatterEnd); + free(A_gpu->ePCIeH2D); + + free(A_gpu->ePCIeD2H_Start); + free(A_gpu->ePCIeD2H_End); + + checkCuda(cudaFree(A_gpu->UrowindVec)); + checkCuda(cudaFree(A_gpu->UrowindPtr)); + + free(A_gpu->UrowindPtr_host); + + checkCuda(cudaFree(A_gpu->UnzvalVec)); + checkCuda(cudaFree(A_gpu->UnzvalPtr)); + + checkCuda(cudaFree(A_gpu->grid)); + + + + checkCuda(cudaFree(A_gpu->scubufs[streamId].bigV)); + checkCuda(cudaFree(A_gpu->scubufs[streamId].bigU)); + + checkCuda(cudaFree(A_gpu->scubufs[streamId].Remain_L_buff)); + checkCuda(cudaFree(A_gpu->scubufs[streamId].Ublock_info)); + checkCuda(cudaFree(A_gpu->scubufs[streamId].Remain_info)); + + // checkCuda(cudaFree(A_gpu->indirect)); + // checkCuda(cudaFree(A_gpu->indirect2)); + checkCuda(cudaFree(A_gpu->xsup)); + + checkCuda(cudaFree(A_gpu->scubufs[streamId].lsub)); + checkCuda(cudaFree(A_gpu->scubufs[streamId].usub)); + + + checkCuda(cudaFree(A_gpu->local_l_blk_infoVec)); + checkCuda(cudaFree(A_gpu->local_l_blk_infoPtr)); + checkCuda(cudaFree(A_gpu->jib_lookupVec)); + checkCuda(cudaFree(A_gpu->jib_lookupPtr)); + checkCuda(cudaFree(A_gpu->local_u_blk_infoVec)); + checkCuda(cudaFree(A_gpu->local_u_blk_infoPtr)); + checkCuda(cudaFree(A_gpu->ijb_lookupVec)); + checkCuda(cudaFree(A_gpu->ijb_lookupPtr)); + + return 0; +} + + + +void sPrint_matrix( char *desc, int_t m, int_t n, float * dA, int_t lda ) +{ + float *cPtr = (float *) malloc(sizeof(float) * lda * n); + checkCuda(cudaMemcpy( cPtr, 
dA, + lda * n * sizeof(float), cudaMemcpyDeviceToHost)) ; + + int_t i, j; + printf( "\n %s\n", desc ); + for ( i = 0; i < m; i++ ) + { + for ( j = 0; j < n; j++ ) printf( " %.3e", cPtr[i + j * lda] ); + printf( "\n" ); + } + free(cPtr); +} + +void sprintGPUStats(sLUstruct_gpu_t * A_gpu) +{ + double tGemm = 0; + double tScatter = 0; + double tPCIeH2D = 0; + double tPCIeD2H = 0; + + for (int_t i = 0; i < A_gpu->nsupers; ++i) + { + float milliseconds = 0; + + if (A_gpu->isOffloaded[i]) + { + cudaEventElapsedTime(&milliseconds, A_gpu->ePCIeH2D[i], A_gpu->GemmStart[i]); + tPCIeH2D += 1e-3 * (double) milliseconds; + milliseconds = 0; + cudaEventElapsedTime(&milliseconds, A_gpu->GemmStart[i], A_gpu->GemmEnd[i]); + tGemm += 1e-3 * (double) milliseconds; + milliseconds = 0; + cudaEventElapsedTime(&milliseconds, A_gpu->GemmEnd[i], A_gpu->ScatterEnd[i]); + tScatter += 1e-3 * (double) milliseconds; + } + + milliseconds = 0; + cudaEventElapsedTime(&milliseconds, A_gpu->ePCIeD2H_Start[i], A_gpu->ePCIeD2H_End[i]); + tPCIeD2H += 1e-3 * (double) milliseconds; + } + + printf("GPU: Flops offloaded %.3e Time spent %lf Flop rate %lf GF/sec \n", + A_gpu->GemmFLOPCounter, tGemm, 1e-9 * A_gpu->GemmFLOPCounter / tGemm ); + printf("GPU: Mop offloaded %.3e Time spent %lf Bandwidth %lf GByte/sec \n", + A_gpu->ScatterMOPCounter, tScatter, 8e-9 * A_gpu->ScatterMOPCounter / tScatter ); + printf("PCIe Data Transfer H2D:\n\tData Sent %.3e(GB)\n\tTime observed from CPU %lf\n\tActual time spent %lf\n\tBandwidth %lf GByte/sec \n", + 1e-9 * A_gpu->cPCIeH2D, A_gpu->tHost_PCIeH2D, tPCIeH2D, 1e-9 * A_gpu->cPCIeH2D / tPCIeH2D ); + printf("PCIe Data Transfer D2H:\n\tData Sent %.3e(GB)\n\tTime observed from CPU %lf\n\tActual time spent %lf\n\tBandwidth %lf GByte/sec \n", + 1e-9 * A_gpu->cPCIeD2H, A_gpu->tHost_PCIeD2H, tPCIeD2H, 1e-9 * A_gpu->cPCIeD2H / tPCIeD2H ); + fflush(stdout); + +} /* end printGPUStats */ + + +int sinitSluGPU3D_t( + ssluGPU_t *sluGPU, + sLUstruct_t *LUstruct, + gridinfo3d_t * grid3d, + int_t* perm_c_supno, + int_t n, + int_t buffer_size, /* read from env variable MAX_BUFFER_SIZE */ + int_t bigu_size, + int_t ldt /* NSUP read from sp_ienv(3) */ +) +{ + checkCudaErrors(cudaDeviceReset ()) ; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + sLocalLU_t *Llu = LUstruct->Llu; + int_t* isNodeInMyGrid = sluGPU->isNodeInMyGrid; + + sluGPU->nCudaStreams = getnCudaStreams(); + if (grid3d->iam == 0) + { + printf("sinitSluGPU3D_t: Using hardware acceleration, with %d cuda streams, max_buffer_size %d\n", sluGPU->nCudaStreams, (int) buffer_size); + fflush(stdout); + if ( MAX_SUPER_SIZE < ldt ) + { + ABORT("MAX_SUPER_SIZE smaller than requested NSUP"); + } + } + + cudaStreamCreate(&(sluGPU->CopyStream)); + + for (int streamId = 0; streamId < sluGPU->nCudaStreams; streamId++) + { + cudaStreamCreate(&(sluGPU->funCallStreams[streamId])); + cublasCreate(&(sluGPU->cublasHandles[streamId])); + sluGPU->lastOffloadStream[streamId] = -1; + } + + sluGPU->A_gpu = (sLUstruct_gpu_t *) malloc (sizeof(sLUstruct_gpu_t)); + sluGPU->A_gpu->perm_c_supno = perm_c_supno; + sCopyLUToGPU3D ( isNodeInMyGrid, + Llu, /* referred to as A_host */ + sluGPU, Glu_persist, n, grid3d, buffer_size, bigu_size, ldt + ); + + return 0; +} /* end sinitSluGPU3D_t */ + +int sinitD2Hreduce( + int next_k, d2Hreduce_t* d2Hred, int last_flag, HyP_t* HyP, + ssluGPU_t *sluGPU, gridinfo_t *grid, sLUstruct_t *LUstruct, SCT_t* SCT +) +{ + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + sLocalLU_t *Llu = LUstruct->Llu; + int_t* xsup = Glu_persist->xsup; + int_t 
iam = grid->iam; + int_t myrow = MYROW (iam, grid); + int_t mycol = MYCOL (iam, grid); + int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + + + // int_t next_col = SUPERLU_MIN (k0 + num_look_aheads + 1, nsupers - 1); + // int_t next_k = perm_c_supno[next_col]; /* global block number for next colum*/ + int_t mkcol, mkrow; + + int_t kljb = LBj( next_k, grid ); /*local block number for next block*/ + int_t kijb = LBi( next_k, grid ); /*local block number for next block*/ + + int_t *kindexL ; /*for storing index vectors*/ + int_t *kindexU ; + mkrow = PROW (next_k, grid); + mkcol = PCOL (next_k, grid); + int_t ksup_size = SuperSize(next_k); + + int_t copyL_kljb = 0; + int_t copyU_kljb = 0; + int_t l_copy_len = 0; + int_t u_copy_len = 0; + + if (mkcol == mycol && Lrowind_bc_ptr[kljb] != NULL && last_flag) + { + if (HyP->Lblock_dirty_bit[kljb] > -1) + { + copyL_kljb = 1; + int_t lastk0 = HyP->Lblock_dirty_bit[kljb]; + int_t streamIdk0Offload = lastk0 % sluGPU->nCudaStreams; + if (sluGPU->lastOffloadStream[streamIdk0Offload] == lastk0 && lastk0 != -1) + { + // printf("Waiting for Offload =%d to finish StreamId=%d\n", lastk0, streamIdk0Offload); + double ttx = SuperLU_timer_(); + cudaStreamSynchronize(sluGPU->funCallStreams[streamIdk0Offload]); + SCT->PhiWaitTimer += SuperLU_timer_() - ttx; + sluGPU->lastOffloadStream[streamIdk0Offload] = -1; + } + } + + kindexL = Lrowind_bc_ptr[kljb]; + l_copy_len = kindexL[1] * ksup_size; + } + + if ( mkrow == myrow && Ufstnz_br_ptr[kijb] != NULL && last_flag ) + { + if (HyP->Ublock_dirty_bit[kijb] > -1) + { + copyU_kljb = 1; + int_t lastk0 = HyP->Ublock_dirty_bit[kijb]; + int_t streamIdk0Offload = lastk0 % sluGPU->nCudaStreams; + if (sluGPU->lastOffloadStream[streamIdk0Offload] == lastk0 && lastk0 != -1) + { + // printf("Waiting for Offload =%d to finish StreamId=%d\n", lastk0, streamIdk0Offload); + double ttx = SuperLU_timer_(); + cudaStreamSynchronize(sluGPU->funCallStreams[streamIdk0Offload]); + SCT->PhiWaitTimer += SuperLU_timer_() - ttx; + sluGPU->lastOffloadStream[streamIdk0Offload] = -1; + } + } + // copyU_kljb = HyP->Ublock_dirty_bit[kijb]>-1? 
1: 0; + kindexU = Ufstnz_br_ptr[kijb]; + u_copy_len = kindexU[1]; + } + + // wait for streams if they have not been finished + + // d2Hred->next_col = next_col; + d2Hred->next_k = next_k; + d2Hred->kljb = kljb; + d2Hred->kijb = kijb; + d2Hred->copyL_kljb = copyL_kljb; + d2Hred->copyU_kljb = copyU_kljb; + d2Hred->l_copy_len = l_copy_len; + d2Hred->u_copy_len = u_copy_len; + d2Hred->kindexU = kindexU; + d2Hred->kindexL = kindexL; + d2Hred->mkrow = mkrow; + d2Hred->mkcol = mkcol; + d2Hred->ksup_size = ksup_size; + return 0; +} /* sinitD2Hreduce */ + +int sreduceGPUlu( + int last_flag, + d2Hreduce_t* d2Hred, + ssluGPU_t *sluGPU, + SCT_t *SCT, + gridinfo_t *grid, + sLUstruct_t *LUstruct +) +{ + sLocalLU_t *Llu = LUstruct->Llu; + int iam = grid->iam; + int_t myrow = MYROW (iam, grid); + int_t mycol = MYCOL (iam, grid); + int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr; + float** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr; + int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr; + float** Unzval_br_ptr = Llu->Unzval_br_ptr; + + cudaStream_t CopyStream; + sLUstruct_gpu_t *A_gpu; + A_gpu = sluGPU->A_gpu; + CopyStream = sluGPU->CopyStream; + + int_t kljb = d2Hred->kljb; + int_t kijb = d2Hred->kijb; + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + int_t mkrow = d2Hred->mkrow; + int_t mkcol = d2Hred->mkcol; + int_t ksup_size = d2Hred->ksup_size; + int_t *kindex; + if ((copyL_kljb || copyU_kljb) && last_flag ) + { + double ttx = SuperLU_timer_(); + cudaStreamSynchronize(CopyStream); + SCT->PhiWaitTimer_2 += SuperLU_timer_() - ttx; + } + + double tt_start = SuperLU_timer_(); + + if (last_flag) { + if (mkcol == mycol && Lrowind_bc_ptr[kljb] != NULL ) + { + kindex = Lrowind_bc_ptr[kljb]; + int_t len = kindex[1]; + + if (copyL_kljb) + { + float *nzval_host; + nzval_host = Lnzval_bc_ptr[kljb]; + int_t llen = ksup_size * len; + float alpha = 1; + superlu_saxpy (llen, alpha, A_gpu->acc_L_buff, 1, nzval_host, 1); + } + + } + } + if (last_flag) { + if (mkrow == myrow && Ufstnz_br_ptr[kijb] != NULL ) + { + kindex = Ufstnz_br_ptr[kijb]; + int_t len = kindex[1]; + + if (copyU_kljb) + { + float *nzval_host; + nzval_host = Unzval_br_ptr[kijb]; + + float alpha = 1; + superlu_saxpy (len, alpha, A_gpu->acc_U_buff, 1, nzval_host, 1); + } + } + } + + double tt_end = SuperLU_timer_(); + SCT->AssemblyTimer += tt_end - tt_start; + return 0; +} /* sreduceGPUlu */ + + +int swaitGPUscu(int streamId, ssluGPU_t *sluGPU, SCT_t *SCT) +{ + double ttx = SuperLU_timer_(); + cudaStreamSynchronize(sluGPU->funCallStreams[streamId]); + SCT->PhiWaitTimer += SuperLU_timer_() - ttx; + return 0; +} + +int ssendLUpanelGPU2HOST( + int_t k0, + d2Hreduce_t* d2Hred, + ssluGPU_t *sluGPU +) +{ + int_t kljb = d2Hred->kljb; + int_t kijb = d2Hred->kijb; + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + int_t l_copy_len = d2Hred->l_copy_len; + int_t u_copy_len = d2Hred->u_copy_len; + cudaStream_t CopyStream = sluGPU->CopyStream;; + sLUstruct_gpu_t *A_gpu = sluGPU->A_gpu; + double tty = SuperLU_timer_(); + cudaEventRecord(A_gpu->ePCIeD2H_Start[k0], CopyStream); + if (copyL_kljb) + checkCuda(cudaMemcpyAsync(A_gpu->acc_L_buff, &A_gpu->LnzvalVec[A_gpu->LnzvalPtr_host[kljb]], + l_copy_len * sizeof(float), cudaMemcpyDeviceToHost, CopyStream ) ); + + if (copyU_kljb) + checkCuda(cudaMemcpyAsync(A_gpu->acc_U_buff, &A_gpu->UnzvalVec[A_gpu->UnzvalPtr_host[kijb]], + u_copy_len * sizeof(float), cudaMemcpyDeviceToHost, CopyStream ) ); + cudaEventRecord(A_gpu->ePCIeD2H_End[k0], CopyStream); + A_gpu->tHost_PCIeD2H += 
SuperLU_timer_() - tty; + A_gpu->cPCIeD2H += u_copy_len * sizeof(float) + l_copy_len * sizeof(float); + + return 0; +} + +/* Copy L and U panel data structures from host to the host part of the + data structures in A_gpu. + GPU is not involved in this routine. */ +int ssendSCUdataHost2GPU( + int_t streamId, + int_t* lsub, + int_t* usub, + float* bigU, + int_t bigu_send_size, + int_t Remain_lbuf_send_size, + ssluGPU_t *sluGPU, + HyP_t* HyP +) +{ + //{printf("....[enter] ssendSCUdataHost2GPU, bigu_send_size %d\n", bigu_send_size); fflush(stdout);} + + int_t usub_len = usub[2]; + int_t lsub_len = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR; + //{printf("....[2] in ssendSCUdataHost2GPU, lsub_len %d\n", lsub_len); fflush(stdout);} + sLUstruct_gpu_t *A_gpu = sluGPU->A_gpu; + memcpy(A_gpu->scubufs[streamId].lsub_buf, lsub, sizeof(int_t)*lsub_len); + memcpy(A_gpu->scubufs[streamId].usub_buf, usub, sizeof(int_t)*usub_len); + memcpy(A_gpu->scubufs[streamId].Remain_info_host, HyP->Remain_info, + sizeof(Remain_info_t)*HyP->RemainBlk); + memcpy(A_gpu->scubufs[streamId].Ublock_info_host, HyP->Ublock_info_Phi, + sizeof(Ublock_info_t)*HyP->num_u_blks_Phi); + memcpy(A_gpu->scubufs[streamId].Remain_L_buff_host, HyP->Remain_L_buff, + sizeof(float)*Remain_lbuf_send_size); + memcpy(A_gpu->scubufs[streamId].bigU_host, bigU, + sizeof(float)*bigu_send_size); + + return 0; +} + +/* Sherry: not used ?*/ +#if 0 +int freeSluGPU(ssluGPU_t *sluGPU) +{ + return 0; +} +#endif + +void sCopyLUToGPU3D ( + int_t* isNodeInMyGrid, + sLocalLU_t *A_host, /* distributed LU structure on host */ + ssluGPU_t *sluGPU, + Glu_persist_t *Glu_persist, int_t n, + gridinfo3d_t *grid3d, + int_t buffer_size, /* bigV size on GPU for Schur complement update */ + int_t bigu_size, + int_t ldt +) +{ + gridinfo_t* grid = &(grid3d->grid2d); + sLUstruct_gpu_t * A_gpu = sluGPU->A_gpu; + sLUstruct_gpu_t **dA_gpu = &(sluGPU->dA_gpu); + +#if ( PRNTlevel>=1 ) + if ( grid3d->iam == 0 ) print_occupancy(); +#endif + +#ifdef GPU_DEBUG + // if ( grid3d->iam == 0 ) + { + cudaDeviceProp devProp; + cudaGetDeviceProperties(&devProp, 0); + printDevProp(devProp); + } +#endif + int_t *xsup ; + xsup = Glu_persist->xsup; + int iam = grid->iam; + int nsupers = Glu_persist->supno[n - 1] + 1; + int_t Pc = grid->npcol; + int_t Pr = grid->nprow; + int_t myrow = MYROW (iam, grid); + int_t mycol = MYCOL (iam, grid); + int_t mrb = (nsupers + Pr - 1) / Pr; + int_t mcb = (nsupers + Pc - 1) / Pc; + int_t remain_l_max = A_host->bufmax[1]; + + /*copies of scalars for easy access*/ + A_gpu->nsupers = nsupers; + A_gpu->ScatterMOPCounter = 0; + A_gpu->GemmFLOPCounter = 0; + A_gpu->cPCIeH2D = 0; + A_gpu->cPCIeD2H = 0; + A_gpu->tHost_PCIeH2D = 0; + A_gpu->tHost_PCIeD2H = 0; + + /*initializing memory*/ + size_t max_gpu_memory = get_acc_memory (); + size_t gpu_mem_used = 0; + + void *tmp_ptr; + + A_gpu->xsup_host = xsup; + + int_t nCudaStreams = sluGPU->nCudaStreams; + /*pinned memory allocations. 
+       Page-locked memory allocated by cudaMallocHost is accessible to the device. */
+    for (int streamId = 0; streamId < nCudaStreams; streamId++ ) {
+	void *tmp_ptr;
+	checkCudaErrors(cudaMallocHost( &tmp_ptr, (n) * sizeof(int_t) )) ;
+	A_gpu->scubufs[streamId].usub_IndirectJ3_host = (int_t*) tmp_ptr;
+
+	checkCudaErrors(cudaMalloc( &tmp_ptr, (n) * sizeof(int_t) ));
+	A_gpu->scubufs[streamId].usub_IndirectJ3 = (int_t*) tmp_ptr;
+	gpu_mem_used += (n) * sizeof(int_t);
+	checkCudaErrors(cudaMallocHost( &tmp_ptr, mrb * sizeof(Remain_info_t) )) ;
+	A_gpu->scubufs[streamId].Remain_info_host = (Remain_info_t*)tmp_ptr;
+	checkCudaErrors(cudaMallocHost( &tmp_ptr, mcb * sizeof(Ublock_info_t) )) ;
+	A_gpu->scubufs[streamId].Ublock_info_host = (Ublock_info_t*)tmp_ptr;
+	checkCudaErrors(cudaMallocHost( &tmp_ptr, remain_l_max * sizeof(float) )) ;
+	A_gpu->scubufs[streamId].Remain_L_buff_host = (float *) tmp_ptr;
+	checkCudaErrors(cudaMallocHost( &tmp_ptr, bigu_size * sizeof(float) )) ;
+	A_gpu->scubufs[streamId].bigU_host = (float *) tmp_ptr;
+
+	/* Note: acc_L_buff and acc_U_buff are (re)allocated once per stream;
+	   only the allocation from the last iteration is kept. */
+	checkCudaErrors(cudaMallocHost ( &tmp_ptr, sizeof(float) * (A_host->bufmax[1])));
+	A_gpu->acc_L_buff = (float *) tmp_ptr;
+	checkCudaErrors(cudaMallocHost ( &tmp_ptr, sizeof(float) * (A_host->bufmax[3])));
+	A_gpu->acc_U_buff = (float *) tmp_ptr;
+	checkCudaErrors(cudaMallocHost ( &tmp_ptr, sizeof(int_t) * (A_host->bufmax[0])));
+	A_gpu->scubufs[streamId].lsub_buf = (int_t *) tmp_ptr;
+	checkCudaErrors(cudaMallocHost ( &tmp_ptr, sizeof(int_t) * (A_host->bufmax[2])));
+	A_gpu->scubufs[streamId].usub_buf = (int_t *) tmp_ptr;
+
+	checkCudaErrors(cudaMalloc( &tmp_ptr, remain_l_max * sizeof(float) )) ;
+	A_gpu->scubufs[streamId].Remain_L_buff = (float *) tmp_ptr;
+	gpu_mem_used += remain_l_max * sizeof(float);
+	checkCudaErrors(cudaMalloc( &tmp_ptr, bigu_size * sizeof(float) )) ;
+	A_gpu->scubufs[streamId].bigU = (float *) tmp_ptr;
+	gpu_mem_used += bigu_size * sizeof(float);
+	checkCudaErrors(cudaMalloc( &tmp_ptr, mcb * sizeof(Ublock_info_t) )) ;
+	A_gpu->scubufs[streamId].Ublock_info = (Ublock_info_t *) tmp_ptr;
+	gpu_mem_used += mcb * sizeof(Ublock_info_t);
+	checkCudaErrors(cudaMalloc( &tmp_ptr, mrb * sizeof(Remain_info_t) )) ;
+	A_gpu->scubufs[streamId].Remain_info = (Remain_info_t *) tmp_ptr;
+	gpu_mem_used += mrb * sizeof(Remain_info_t);
+	checkCudaErrors(cudaMalloc( &tmp_ptr, buffer_size * sizeof(float))) ;
+	A_gpu->scubufs[streamId].bigV = (float *) tmp_ptr;
+	gpu_mem_used += buffer_size * sizeof(float);
+	checkCudaErrors(cudaMalloc( &tmp_ptr, A_host->bufmax[0]*sizeof(int_t))) ;
+	A_gpu->scubufs[streamId].lsub = (int_t *) tmp_ptr;
+	gpu_mem_used += A_host->bufmax[0] * sizeof(int_t);
+	checkCudaErrors(cudaMalloc( &tmp_ptr, A_host->bufmax[2]*sizeof(int_t))) ;
+	A_gpu->scubufs[streamId].usub = (int_t *) tmp_ptr;
+	gpu_mem_used += A_host->bufmax[2] * sizeof(int_t);
+
+    } /* endfor streamID ... 
allocate page-locked memory */
+
+    A_gpu->isOffloaded = (int *) SUPERLU_MALLOC (sizeof(int) * nsupers);
+    A_gpu->GemmStart = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers);
+    A_gpu->GemmEnd = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers);
+    A_gpu->ScatterEnd = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers);
+    A_gpu->ePCIeH2D = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers);
+    A_gpu->ePCIeD2H_Start = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers);
+    A_gpu->ePCIeD2H_End = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers);
+
+    for (int i = 0; i < nsupers; ++i)
+    {
+        A_gpu->isOffloaded[i] = 0;
+        checkCudaErrors(cudaEventCreate(&(A_gpu->GemmStart[i])));
+        checkCudaErrors(cudaEventCreate(&(A_gpu->GemmEnd[i])));
+        checkCudaErrors(cudaEventCreate(&(A_gpu->ScatterEnd[i])));
+        checkCudaErrors(cudaEventCreate(&(A_gpu->ePCIeH2D[i])));
+        checkCudaErrors(cudaEventCreate(&(A_gpu->ePCIeD2H_Start[i])));
+        checkCudaErrors(cudaEventCreate(&(A_gpu->ePCIeD2H_End[i])));
+    }
+
+    /*---- Copy L data structure to GPU ----*/
+
+    /*pointers and address of local blocks for easy accessibility */
+    local_l_blk_info_t *local_l_blk_infoVec;
+    int_t * local_l_blk_infoPtr;
+    local_l_blk_infoPtr = (int_t *) malloc( CEILING(nsupers, Pc) * sizeof(int_t) );
+
+    /* First pass: count total L blocks */
+    int_t cum_num_l_blocks = 0; /* total number of L blocks I own */
+    for (int_t i = 0; i < CEILING(nsupers, Pc); ++i)
+    {
+        /* going through each block column I own */
+
+        if (A_host->Lrowind_bc_ptr[i] != NULL && isNodeInMyGrid[i * Pc + mycol] == 1)
+        {
+            int_t *index = A_host->Lrowind_bc_ptr[i];
+            int_t num_l_blocks = index[0];
+            cum_num_l_blocks += num_l_blocks;
+        }
+    }
+
+    /*allocating memory*/
+    local_l_blk_infoVec = (local_l_blk_info_t *) malloc(cum_num_l_blocks * sizeof(local_l_blk_info_t));
+
+    /* Second pass: set up the meta-data for the L structure */
+    cum_num_l_blocks = 0;
+
+    /*initializing vectors */
+    for (int_t i = 0; i < CEILING(nsupers, Pc); ++i)
+    {
+        if (A_host->Lrowind_bc_ptr[i] != NULL && isNodeInMyGrid[i * Pc + mycol] == 1)
+        {
+            int_t *index = A_host->Lrowind_bc_ptr[i];
+            int_t num_l_blocks = index[0]; /* # L blocks in this column */
+
+            if (num_l_blocks > 0)
+            {
+
+                local_l_blk_info_t *local_l_blk_info_i = local_l_blk_infoVec + cum_num_l_blocks;
+                local_l_blk_infoPtr[i] = cum_num_l_blocks;
+
+                int_t lptrj = BC_HEADER;
+                int_t luptrj = 0;
+
+                for (int_t j = 0; j < num_l_blocks ; ++j)
+                {
+
+                    int_t ijb = index[lptrj];
+
+                    local_l_blk_info_i[j].lib = ijb / Pr;
+                    local_l_blk_info_i[j].lptrj = lptrj;
+                    local_l_blk_info_i[j].luptrj = luptrj;
+                    luptrj += index[lptrj + 1];
+                    lptrj += LB_DESCRIPTOR + index[lptrj + 1];
+
+                }
+            }
+            cum_num_l_blocks += num_l_blocks;
+        }
+
+    } /* endfor all block columns */
+
+    /* Allocate L memory on GPU, and copy the values from CPU to GPU */
+    checkCudaErrors(cudaMalloc( &tmp_ptr, cum_num_l_blocks * sizeof(local_l_blk_info_t))) ;
+    A_gpu->local_l_blk_infoVec = (local_l_blk_info_t *) tmp_ptr;
+    gpu_mem_used += cum_num_l_blocks * sizeof(local_l_blk_info_t);
+    checkCudaErrors(cudaMemcpy( (A_gpu->local_l_blk_infoVec), local_l_blk_infoVec, cum_num_l_blocks * sizeof(local_l_blk_info_t), cudaMemcpyHostToDevice)) ;
+
+    checkCudaErrors(cudaMalloc( &tmp_ptr, CEILING(nsupers, Pc)*sizeof(int_t))) ;
+    A_gpu->local_l_blk_infoPtr = (int_t *) tmp_ptr;
+    gpu_mem_used += CEILING(nsupers, Pc) * sizeof(int_t);
+    checkCudaErrors(cudaMemcpy( (A_gpu->local_l_blk_infoPtr), local_l_blk_infoPtr, CEILING(nsupers, 
Pc)*sizeof(int_t), cudaMemcpyHostToDevice)) ; + + /*---- Copy U data structure to GPU ----*/ + + local_u_blk_info_t *local_u_blk_infoVec; + int_t * local_u_blk_infoPtr; + local_u_blk_infoPtr = (int_t *) malloc( CEILING(nsupers, Pr) * sizeof(int_t ) ); + + /* First pass: count total U blocks */ + int_t cum_num_u_blocks = 0; + + for (int_t i = 0; i < CEILING(nsupers, Pr); ++i) + { + + if (A_host->Ufstnz_br_ptr[i] != NULL && isNodeInMyGrid[i * Pr + myrow] == 1) + { + int_t *index = A_host->Ufstnz_br_ptr[i]; + int_t num_u_blocks = index[0]; + cum_num_u_blocks += num_u_blocks; + + } + } + + local_u_blk_infoVec = (local_u_blk_info_t *) malloc(cum_num_u_blocks * sizeof(local_u_blk_info_t)); + + /* Second pass: set up the meta-data for the U structure */ + cum_num_u_blocks = 0; + + for (int_t i = 0; i < CEILING(nsupers, Pr); ++i) + { + if (A_host->Ufstnz_br_ptr[i] != NULL && isNodeInMyGrid[i * Pr + myrow] == 1) + { + int_t *index = A_host->Ufstnz_br_ptr[i]; + int_t num_u_blocks = index[0]; + + if (num_u_blocks > 0) + { + local_u_blk_info_t *local_u_blk_info_i = local_u_blk_infoVec + cum_num_u_blocks; + local_u_blk_infoPtr[i] = cum_num_u_blocks; + + int_t iuip_lib, ruip_lib; + iuip_lib = BR_HEADER; + ruip_lib = 0; + + for (int_t j = 0; j < num_u_blocks ; ++j) + { + + int_t ijb = index[iuip_lib]; + local_u_blk_info_i[j].ljb = ijb / Pc; + local_u_blk_info_i[j].iuip = iuip_lib; + local_u_blk_info_i[j].ruip = ruip_lib; + + ruip_lib += index[iuip_lib + 1]; + iuip_lib += UB_DESCRIPTOR + SuperSize (ijb); + + } + } + cum_num_u_blocks += num_u_blocks; + } + } + + checkCudaErrors(cudaMalloc( &tmp_ptr, cum_num_u_blocks * sizeof(local_u_blk_info_t))) ; + A_gpu->local_u_blk_infoVec = (local_u_blk_info_t *) tmp_ptr; + gpu_mem_used += cum_num_u_blocks * sizeof(local_u_blk_info_t); + checkCudaErrors(cudaMemcpy( (A_gpu->local_u_blk_infoVec), local_u_blk_infoVec, cum_num_u_blocks * sizeof(local_u_blk_info_t), cudaMemcpyHostToDevice)) ; + + checkCudaErrors(cudaMalloc( &tmp_ptr, CEILING(nsupers, Pr)*sizeof(int_t))) ; + A_gpu->local_u_blk_infoPtr = (int_t *) tmp_ptr; + gpu_mem_used += CEILING(nsupers, Pr) * sizeof(int_t); + checkCudaErrors(cudaMemcpy( (A_gpu->local_u_blk_infoPtr), local_u_blk_infoPtr, CEILING(nsupers, Pr)*sizeof(int_t), cudaMemcpyHostToDevice)) ; + + /* Copy the actual L indices and values */ + int_t l_k = CEILING( nsupers, grid->npcol ); /* # of local block columns */ + int_t *temp_LrowindPtr = (int_t *) malloc(sizeof(int_t) * l_k); + int_t *temp_LnzvalPtr = (int_t *) malloc(sizeof(int_t) * l_k); + int_t *Lnzval_size = (int_t *) malloc(sizeof(int_t) * l_k); + int_t l_ind_len = 0; + int_t l_val_len = 0; + for (int_t jb = 0; jb < nsupers; ++jb) /* for each block column ... */ + { + int_t pc = PCOL( jb, grid ); + if (mycol == pc && isNodeInMyGrid[jb] == 1) + { + int_t ljb = LBj( jb, grid ); /* Local block number */ + int_t *index_host; + index_host = A_host->Lrowind_bc_ptr[ljb]; + + temp_LrowindPtr[ljb] = l_ind_len; + temp_LnzvalPtr[ljb] = l_val_len; // ### + Lnzval_size[ljb] = 0; //### + if (index_host != NULL) + { + int_t nrbl = index_host[0]; /* number of L blocks */ + int_t len = index_host[1]; /* LDA of the nzval[] */ + int_t len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; + + /* Global block number is mycol + ljb*Pc */ + int_t nsupc = SuperSize(jb); + + l_ind_len += len1; + l_val_len += len * nsupc; + Lnzval_size[ljb] = len * nsupc ; // ### + } + else + { + Lnzval_size[ljb] = 0 ; // ### + } + } + } /* endfor jb = 0 ... 
*/
+
+    /* Copy the actual U indices and values */
+    int_t u_k = CEILING( nsupers, grid->nprow ); /* Number of local block rows */
+    int_t *temp_UrowindPtr = (int_t *) malloc(sizeof(int_t) * u_k);
+    int_t *temp_UnzvalPtr = (int_t *) malloc(sizeof(int_t) * u_k);
+    int_t *Unzval_size = (int_t *) malloc(sizeof(int_t) * u_k);
+    int_t u_ind_len = 0;
+    int_t u_val_len = 0;
+    for (int_t lb = 0; lb < u_k; ++lb)
+    {
+        int_t *index_host;
+        index_host = A_host->Ufstnz_br_ptr[lb];
+        temp_UrowindPtr[lb] = u_ind_len;
+        temp_UnzvalPtr[lb] = u_val_len;
+        Unzval_size[lb] = 0;
+        if (index_host != NULL && isNodeInMyGrid[lb * Pr + myrow] == 1)
+        {
+            int_t len = index_host[1];
+            int_t len1 = index_host[2];
+
+            u_ind_len += len1;
+            u_val_len += len;
+            Unzval_size[lb] = len;
+        }
+        else
+        {
+            Unzval_size[lb] = 0;
+        }
+    }
+
+    gpu_mem_used += l_ind_len * sizeof(int_t);
+    gpu_mem_used += 2 * l_k * sizeof(int_t);
+    gpu_mem_used += u_ind_len * sizeof(int_t);
+    gpu_mem_used += 2 * u_k * sizeof(int_t);
+
+    /* The remaining device memory is divided between the L and U factors. */
+
+    for (int_t i = 0; i < l_k; ++i)
+    {
+        temp_LnzvalPtr[i] = -1;
+    }
+
+    for (int_t i = 0; i < u_k; ++i)
+    {
+        temp_UnzvalPtr[i] = -1;
+    }
+
+    /*setting these pointers back */
+    l_val_len = 0;
+    u_val_len = 0;
+
+    int_t num_gpu_l_blocks = 0;
+    int_t num_gpu_u_blocks = 0;
+    size_t mem_l_block, mem_u_block;
+
+    /* Find the trailing matrix size that can fit into GPU memory */
+    for (int_t i = nsupers - 1; i > -1; --i)
+    {
+        /* walk the elimination tree in reverse (bottom-up) order */
+        int_t i_sup = A_gpu->perm_c_supno[i];
+
+        int_t pc = PCOL( i_sup, grid );
+        if (isNodeInMyGrid[i_sup] == 1)
+        {
+            if (mycol == pc )
+            {
+                int_t ljb = LBj(i_sup, grid);
+                mem_l_block = sizeof(float) * Lnzval_size[ljb];
+                if (gpu_mem_used + mem_l_block > max_gpu_memory)
+                {
+                    break;
+                }
+                else
+                {
+                    gpu_mem_used += mem_l_block;
+                    temp_LnzvalPtr[ljb] = l_val_len;
+                    l_val_len += Lnzval_size[ljb];
+                    num_gpu_l_blocks++;
+                    A_gpu->first_l_block_gpu = i;
+                }
+            }
+
+            int_t pr = PROW( i_sup, grid );
+            if (myrow == pr)
+            {
+                int_t lib = LBi(i_sup, grid);
+                mem_u_block = sizeof(float) * Unzval_size[lib];
+                if (gpu_mem_used + mem_u_block > max_gpu_memory)
+                {
+                    break;
+                }
+                else
+                {
+                    gpu_mem_used += mem_u_block;
+                    temp_UnzvalPtr[lib] = u_val_len;
+                    u_val_len += Unzval_size[lib];
+                    num_gpu_u_blocks++;
+                    A_gpu->first_u_block_gpu = i;
+                }
+            }
+        } /* endif */
+
+    } /* endfor i .... nsupers */
+
+#if (PRNTlevel>=2)
+    printf("(%d) Number of L blocks in GPU %d, U blocks %d\n",
+           grid3d->iam, (int) num_gpu_l_blocks, (int) num_gpu_u_blocks );
+    printf("(%d) elimination order of first block in GPU: L block %d, U block %d\n",
+           grid3d->iam, (int) A_gpu->first_l_block_gpu, (int) A_gpu->first_u_block_gpu);
+    printf("(%d) Memory of L %.1f GB, memory for U %.1f GB, Total device memory used %.1f GB, Memory allowed %.1f GB \n", grid3d->iam,
+           l_val_len * sizeof(float) * 1e-9,
+           u_val_len * sizeof(float) * 1e-9,
+           gpu_mem_used * 1e-9, max_gpu_memory * 1e-9);
+    fflush(stdout);
+#endif
+
+    /* Assemble index vector on temp */
+    int_t *indtemp = (int_t *) malloc(sizeof(int_t) * l_ind_len);
+    for (int_t jb = 0; jb < nsupers; ++jb) /* for each block column ... 
*/ + { + int_t pc = PCOL( jb, grid ); + if (mycol == pc && isNodeInMyGrid[jb] == 1) + { + int_t ljb = LBj( jb, grid ); /* Local block number */ + int_t *index_host; + index_host = A_host->Lrowind_bc_ptr[ljb]; + + if (index_host != NULL) + { + int_t nrbl = index_host[0]; /* number of L blocks */ + int_t len = index_host[1]; /* LDA of the nzval[] */ + int_t len1 = len + BC_HEADER + nrbl * LB_DESCRIPTOR; + + memcpy(&indtemp[temp_LrowindPtr[ljb]] , index_host, len1 * sizeof(int_t)) ; + } + } + } + + checkCudaErrors(cudaMalloc( &tmp_ptr, l_ind_len * sizeof(int_t))) ; + A_gpu->LrowindVec = (int_t *) tmp_ptr; + checkCudaErrors(cudaMemcpy( (A_gpu->LrowindVec), indtemp, l_ind_len * sizeof(int_t), cudaMemcpyHostToDevice)) ; + + checkCudaErrors(cudaMalloc( &tmp_ptr, l_val_len * sizeof(float))); + A_gpu->LnzvalVec = (float *) tmp_ptr; + checkCudaErrors(cudaMemset( (A_gpu->LnzvalVec), 0, l_val_len * sizeof(float))); + + checkCudaErrors(cudaMalloc( &tmp_ptr, l_k * sizeof(int_t))) ; + A_gpu->LrowindPtr = (int_t *) tmp_ptr; + checkCudaErrors(cudaMemcpy( (A_gpu->LrowindPtr), temp_LrowindPtr, l_k * sizeof(int_t), cudaMemcpyHostToDevice)) ; + + checkCudaErrors(cudaMalloc( &tmp_ptr, l_k * sizeof(int_t))) ; + A_gpu->LnzvalPtr = (int_t *) tmp_ptr; + checkCudaErrors(cudaMemcpy( (A_gpu->LnzvalPtr), temp_LnzvalPtr, l_k * sizeof(int_t), cudaMemcpyHostToDevice)) ; + + A_gpu->LnzvalPtr_host = temp_LnzvalPtr; + + int_t *indtemp1 = (int_t *) malloc(sizeof(int_t) * u_ind_len); + for ( int_t lb = 0; lb < u_k; ++lb) + { + int_t *index_host; + index_host = A_host->Ufstnz_br_ptr[lb]; + + if (index_host != NULL && isNodeInMyGrid[lb * Pr + myrow] == 1) + { + int_t len1 = index_host[2]; + memcpy(&indtemp1[temp_UrowindPtr[lb]] , index_host, sizeof(int_t)*len1); + } + } + + checkCudaErrors(cudaMalloc( &tmp_ptr, u_ind_len * sizeof(int_t))) ; + A_gpu->UrowindVec = (int_t *) tmp_ptr; + checkCudaErrors(cudaMemcpy( (A_gpu->UrowindVec), indtemp1, u_ind_len * sizeof(int_t), cudaMemcpyHostToDevice)) ; + + checkCudaErrors(cudaMalloc( &tmp_ptr, u_val_len * sizeof(float))); + A_gpu->UnzvalVec = (float *) tmp_ptr; + checkCudaErrors(cudaMemset( (A_gpu->UnzvalVec), 0, u_val_len * sizeof(float))); + + checkCudaErrors(cudaMalloc( &tmp_ptr, u_k * sizeof(int_t))) ; + A_gpu->UrowindPtr = (int_t *) tmp_ptr; + checkCudaErrors(cudaMemcpy( (A_gpu->UrowindPtr), temp_UrowindPtr, u_k * sizeof(int_t), cudaMemcpyHostToDevice)) ; + + A_gpu->UnzvalPtr_host = temp_UnzvalPtr; + + checkCudaErrors(cudaMalloc( &tmp_ptr, u_k * sizeof(int_t))) ; + A_gpu->UnzvalPtr = (int_t *) tmp_ptr; + checkCudaErrors(cudaMemcpy( (A_gpu->UnzvalPtr), temp_UnzvalPtr, u_k * sizeof(int_t), cudaMemcpyHostToDevice)) ; + + checkCudaErrors(cudaMalloc( &tmp_ptr, (nsupers + 1)*sizeof(int_t))) ; + A_gpu->xsup = (int_t *) tmp_ptr; + checkCudaErrors(cudaMemcpy( (A_gpu->xsup), xsup, (nsupers + 1)*sizeof(int_t), cudaMemcpyHostToDevice)) ; + + checkCudaErrors(cudaMalloc( &tmp_ptr, sizeof(sLUstruct_gpu_t))) ; + *dA_gpu = (sLUstruct_gpu_t *) tmp_ptr; + checkCudaErrors(cudaMemcpy( *dA_gpu, A_gpu, sizeof(sLUstruct_gpu_t), cudaMemcpyHostToDevice)) ; + + free (temp_LrowindPtr); + free (temp_UrowindPtr); + free (indtemp1); + free (indtemp); + +} /* end sCopyLUToGPU3D */ + + + +int sreduceAllAncestors3d_GPU(int_t ilvl, int_t* myNodeCount, + int_t** treePerm, + sLUValSubBuf_t*LUvsb, + sLUstruct_t* LUstruct, + gridinfo3d_t* grid3d, + ssluGPU_t *sluGPU, + d2Hreduce_t* d2Hred, + factStat_t *factStat, + HyP_t* HyP, SCT_t* SCT ) +{ + // first synchronize all cuda streams + int superlu_acc_offload = 
HyP->superlu_acc_offload; + + int_t maxLvl = log2i( (int_t) grid3d->zscp.Np) + 1; + int_t myGrid = grid3d->zscp.Iam; + gridinfo_t* grid = &(grid3d->grid2d); + int_t* gpuLUreduced = factStat->gpuLUreduced; + + int_t sender; + if ((myGrid % (1 << (ilvl + 1))) == 0) + { + sender = myGrid + (1 << ilvl); + + } + else + { + sender = myGrid; + } + + /*Reduce all the ancestors from the GPU*/ + if (myGrid == sender && superlu_acc_offload) + { + for (int_t streamId = 0; streamId < sluGPU->nCudaStreams; streamId++) + { + double ttx = SuperLU_timer_(); + cudaStreamSynchronize(sluGPU->funCallStreams[streamId]); + SCT->PhiWaitTimer += SuperLU_timer_() - ttx; + sluGPU->lastOffloadStream[streamId] = -1; + } + + for (int_t alvl = ilvl + 1; alvl < maxLvl; ++alvl) + { + /* code */ + // int_t atree = myTreeIdxs[alvl]; + int_t nsAncestor = myNodeCount[alvl]; + int_t* cAncestorList = treePerm[alvl]; + + for (int_t node = 0; node < nsAncestor; node++ ) + { + int_t k = cAncestorList[node]; + if (!gpuLUreduced[k]) + { + sinitD2Hreduce(k, d2Hred, 1, + HyP, sluGPU, grid, LUstruct, SCT); + int_t copyL_kljb = d2Hred->copyL_kljb; + int_t copyU_kljb = d2Hred->copyU_kljb; + + double tt_start1 = SuperLU_timer_(); + SCT->PhiMemCpyTimer += SuperLU_timer_() - tt_start1; + if (copyL_kljb || copyU_kljb) SCT->PhiMemCpyCounter++; + ssendLUpanelGPU2HOST(k, d2Hred, sluGPU); + /* + Reduce the LU panels from GPU + */ + sreduceGPUlu(1, d2Hred, sluGPU, SCT, grid, LUstruct); + gpuLUreduced[k] = 1; + } + } + } + } /*if (myGrid == sender)*/ + + sreduceAllAncestors3d(ilvl, myNodeCount, treePerm, + LUvsb, LUstruct, grid3d, SCT ); + return 0; +} /* sreduceAllAncestors3d_GPU */ + + +void ssyncAllfunCallStreams(ssluGPU_t* sluGPU, SCT_t* SCT) +{ + for (int streamId = 0; streamId < sluGPU->nCudaStreams; streamId++) + { + double ttx = SuperLU_timer_(); + cudaStreamSynchronize(sluGPU->funCallStreams[streamId]); + SCT->PhiWaitTimer += SuperLU_timer_() - ttx; + sluGPU->lastOffloadStream[streamId] = -1; + } +} diff --git a/SRC/streeFactorization.c b/SRC/streeFactorization.c new file mode 100644 index 00000000..e56bca43 --- /dev/null +++ b/SRC/streeFactorization.c @@ -0,0 +1,746 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Factorization routines for the subtree using 2D process grid. + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ */
+#include "superlu_sdefs.h"
+#if 0
+#include "treeFactorization.h"
+#include "trfCommWrapper.h"
+#endif
+
+int_t sLluBufInit(sLUValSubBuf_t* LUvsb, sLUstruct_t *LUstruct)
+{
+    sLocalLU_t *Llu = LUstruct->Llu;
+    LUvsb->Lsub_buf = intMalloc_dist(Llu->bufmax[0]); //INT_T_ALLOC(Llu->bufmax[0]);
+    LUvsb->Lval_buf = floatMalloc_dist(Llu->bufmax[1]); //DOUBLE_ALLOC(Llu->bufmax[1]);
+    LUvsb->Usub_buf = intMalloc_dist(Llu->bufmax[2]); //INT_T_ALLOC(Llu->bufmax[2]);
+    LUvsb->Uval_buf = floatMalloc_dist(Llu->bufmax[3]); //DOUBLE_ALLOC(Llu->bufmax[3]);
+    return 0;
+}
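
The four buffers mirror the bufmax[] communication bounds computed at distribution time, so one sLUValSubBuf_t can hold any incoming L or U panel. For reference, a minimal matching release routine, assuming the buffers were allocated exactly as above (this single-buffer helper is illustrative only; the released code frees them through sLluBufFreeArr below):

    #include "superlu_sdefs.h"

    /* Hypothetical single-buffer release, mirroring sLluBufInit(). */
    static int sLluBufFree(sLUValSubBuf_t* LUvsb)
    {
        SUPERLU_FREE(LUvsb->Lsub_buf);  /* bufmax[0]: L index buffer */
        SUPERLU_FREE(LUvsb->Lval_buf);  /* bufmax[1]: L value buffer */
        SUPERLU_FREE(LUvsb->Usub_buf);  /* bufmax[2]: U index buffer */
        SUPERLU_FREE(LUvsb->Uval_buf);  /* bufmax[3]: U value buffer */
        return 0;
    }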
+
+sdiagFactBufs_t** sinitDiagFactBufsArr(int_t mxLeafNode, int_t ldt, gridinfo_t* grid)
+{
+    sdiagFactBufs_t** dFBufs = NULL; /* stays NULL when mxLeafNode == 0 */
+
+    /* Sherry fix:
+     * mxLeafNode can be 0 for the replicated layers of the processes ?? */
+    if ( mxLeafNode ) dFBufs = (sdiagFactBufs_t** )
+                          SUPERLU_MALLOC(mxLeafNode * sizeof(sdiagFactBufs_t*));
+
+    for (int i = 0; i < mxLeafNode; ++i)
+    {
+        /* code */
+        dFBufs[i] = (sdiagFactBufs_t* ) SUPERLU_MALLOC(sizeof(sdiagFactBufs_t));
+        assert(dFBufs[i]);
+        sinitDiagFactBufs(ldt, dFBufs[i]);
+
+    } /* end for i ... mxLeafNode */
+
+    return dFBufs;
+}
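
Since mxLeafNode can be 0 on the replicated process layers (see the note above), the array pointer stays NULL in that case and the paired free below is a no-op. A typical allocate/use/release pairing, with mxLeafNode and ldt assumed to come from the elimination forest and sp_ienv_dist(3) respectively (demo wrapper name hypothetical):

    #include "superlu_sdefs.h"

    /* Hypothetical pairing of sinitDiagFactBufsArr / sfreeDiagFactBufsArr. */
    static void diag_bufs_demo(int_t mxLeafNode, int_t ldt, gridinfo_t *grid)
    {
        sdiagFactBufs_t **dFBufs = sinitDiagFactBufsArr(mxLeafNode, ldt, grid);
        /* ... each subtree factorization uses dFBufs[offset]->BlockLFactor
               and dFBufs[offset]->BlockUFactor as diagonal-block scratch ... */
        sfreeDiagFactBufsArr(mxLeafNode, dFBufs);
    }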
+
+// sherry added
+int sfreeDiagFactBufsArr(int_t mxLeafNode, sdiagFactBufs_t** dFBufs)
+{
+    for (int i = 0; i < mxLeafNode; ++i) {
+	SUPERLU_FREE(dFBufs[i]->BlockUFactor);
+	SUPERLU_FREE(dFBufs[i]->BlockLFactor);
+	SUPERLU_FREE(dFBufs[i]);
+    }
+
+    /* Sherry fix:
+     * mxLeafNode can be 0 for the replicated layers of the processes ?? */
+    if ( mxLeafNode ) SUPERLU_FREE(dFBufs);
+
+    return 0;
+}
+
+sLUValSubBuf_t** sLluBufInitArr(int_t numLA, sLUstruct_t *LUstruct)
+{
+    sLUValSubBuf_t** LUvsbs = (sLUValSubBuf_t**) SUPERLU_MALLOC(numLA * sizeof(sLUValSubBuf_t*));
+    for (int_t i = 0; i < numLA; ++i)
+    {
+        /* code */
+        LUvsbs[i] = (sLUValSubBuf_t*) SUPERLU_MALLOC(sizeof(sLUValSubBuf_t));
+        sLluBufInit(LUvsbs[i], LUstruct);
+    } /* end for i ... numLA */
+
+    return LUvsbs;
+}
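
One buffer set is allocated per look-ahead slot, and slot k0 % numLA is reused every numLA supernodes. A sketch of the pairing with sLluBufFreeArr below, assuming numLA is obtained from getNumLookAhead() as elsewhere in this file (demo wrapper name hypothetical):

    #include "superlu_sdefs.h"

    /* Hypothetical driver fragment: one buffer set per look-ahead slot. */
    static void lookahead_bufs_demo(superlu_dist_options_t *options,
                                    sLUstruct_t *LUstruct)
    {
        int_t numLA = getNumLookAhead(options);
        sLUValSubBuf_t **LUvsbs = sLluBufInitArr(numLA, LUstruct);
        /* ... the panels of supernode k0 are received into LUvsbs[k0 % numLA] ... */
        sLluBufFreeArr(numLA, LUvsbs);
    }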
+
+// sherry added
+int sLluBufFreeArr(int_t numLA, sLUValSubBuf_t **LUvsbs)
+{
+    for (int_t i = 0; i < numLA; ++i) {
+	SUPERLU_FREE(LUvsbs[i]->Lsub_buf);
+	SUPERLU_FREE(LUvsbs[i]->Lval_buf);
+	SUPERLU_FREE(LUvsbs[i]->Usub_buf);
+	SUPERLU_FREE(LUvsbs[i]->Uval_buf);
+	SUPERLU_FREE(LUvsbs[i]);
+    }
+    SUPERLU_FREE(LUvsbs);
+    return 0;
+}
+
+
+int_t sinitScuBufs(int_t ldt, int_t num_threads, int_t nsupers,
+                  sscuBufs_t* scuBufs,
+                  sLUstruct_t* LUstruct,
+                  gridinfo_t * grid)
+{
+    scuBufs->bigV = sgetBigV(ldt, num_threads);
+    scuBufs->bigU = sgetBigU(nsupers, grid, LUstruct);
+    return 0;
+}
+
+// sherry added
+int sfreeScuBufs(sscuBufs_t* scuBufs)
+{
+    SUPERLU_FREE(scuBufs->bigV);
+    SUPERLU_FREE(scuBufs->bigU);
+    return 0;
+}
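
Here bigV is the shared GEMM output scratch (sized by ldt and the thread count) and bigU holds the packed U panel. A minimal wrapper showing the intended lifetime, assuming an initialized 2D grid and a distributed LUstruct (wrapper name hypothetical):

    #include "superlu_sdefs.h"

    /* Hypothetical wrapper: allocate, use, and release the Schur buffers. */
    static int with_scu_bufs(int_t ldt, int_t num_threads, int_t nsupers,
                             sLUstruct_t *LUstruct, gridinfo_t *grid)
    {
        sscuBufs_t scuBufs;
        sinitScuBufs(ldt, num_threads, nsupers, &scuBufs, LUstruct, grid);
        /* ... factorization kernels write GEMM tiles into scuBufs.bigV
               and read packed U panels from scuBufs.bigU ... */
        return sfreeScuBufs(&scuBufs);
    }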
+
+int_t sinitDiagFactBufs(int_t ldt, sdiagFactBufs_t* dFBuf)
+{
+    dFBuf->BlockUFactor = floatMalloc_dist(ldt * ldt); //DOUBLE_ALLOC( ldt * ldt);
+    dFBuf->BlockLFactor = floatMalloc_dist(ldt * ldt); //DOUBLE_ALLOC( ldt * ldt);
+    return 0;
+}
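
Both factor blocks are dense ldt-by-ldt scratch panels; assuming they are addressed column-major with leading dimension ldt, as allocated here, entry (i, j) lives at offset i + j*ldt. A toy accessor (hypothetical helper, not part of the library):

    #include "superlu_sdefs.h"

    /* Read entry (i, j) of the diagonal U factor, column-major,
       leading dimension ldt (assumed). */
    static float diag_U_entry(const sdiagFactBufs_t *dFBuf,
                              int_t i, int_t j, int_t ldt)
    {
        return dFBuf->BlockUFactor[i + j * ldt];
    }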
+
+int_t sdenseTreeFactor(
+    int_t nnodes,          // number of nodes in the tree
+    int_t *perm_c_supno,    // list of nodes in the order of factorization
+    commRequests_t *comReqs,    // lists of communication requests
+    sscuBufs_t *scuBufs,   // contains buffers for schur complement update
+    packLUInfo_t*packLUInfo,
+    msgs_t*msgs,
+    sLUValSubBuf_t* LUvsb,
+    sdiagFactBufs_t *dFBuf,
+    factStat_t *factStat,
+    factNodelists_t  *fNlists,
+    superlu_dist_options_t *options,
+    int_t * gIperm_c_supno,
+    int_t ldt,
+    sLUstruct_t *LUstruct, gridinfo3d_t * grid3d, SuperLUStat_t *stat,
+    double thresh,  SCT_t *SCT, int tag_ub,
+    int *info
+)
+{
+    gridinfo_t* grid = &(grid3d->grid2d);
+    sLocalLU_t *Llu = LUstruct->Llu;
+
+    /*main loop over all the super nodes*/
+    for (int_t k0 = 0; k0 < nnodes   ; ++k0)
+    {
+        int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+
+        /* diagonal factorization */
+#if 0
+        sDiagFactIBCast(k,  dFBuf, factStat, comReqs, grid,
+                        options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#else
+	sDiagFactIBCast(k, k, dFBuf->BlockUFactor, dFBuf->BlockLFactor,
+			factStat->IrecvPlcd_D,
+			comReqs->U_diag_blk_recv_req, 
+			comReqs->L_diag_blk_recv_req,
+			comReqs->U_diag_blk_send_req, 
+			comReqs->L_diag_blk_send_req,
+			grid, options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#endif
+
+#if 0
+        /*L update */
+        sLPanelUpdate(k,  dFBuf, factStat, comReqs, grid, LUstruct, SCT);
+        /*L Ibcast*/
+        sIBcastRecvLPanel( k, comReqs,  LUvsb,  msgs, factStat, grid, LUstruct, SCT, tag_ub );
+        /*U update*/
+        sUPanelUpdate(k, ldt, dFBuf, factStat, comReqs, scuBufs,
+                      packLUInfo, grid, LUstruct, stat, SCT);
+        /*U bcast*/
+        sIBcastRecvUPanel( k, comReqs,  LUvsb,  msgs, factStat, grid, LUstruct, SCT, tag_ub );
+        /*Wait for L panel*/
+        sWaitL(k, comReqs, msgs, grid, LUstruct, SCT);
+        /*Wait for U panel*/
+        sWaitU(k, comReqs, msgs, grid, LUstruct, SCT);
+#else
+        /*L update */
+	sLPanelUpdate(k, factStat->IrecvPlcd_D, factStat->factored_L,
+		      comReqs->U_diag_blk_recv_req, dFBuf->BlockUFactor, grid, LUstruct, SCT);
+        /*L Ibcast*/
+	sIBcastRecvLPanel(k, k, msgs->msgcnt, comReqs->send_req, comReqs->recv_req,
+			  LUvsb->Lsub_buf, LUvsb->Lval_buf, factStat->factored, 
+			  grid, LUstruct, SCT, tag_ub);
+        /*U update*/
+	sUPanelUpdate(k, factStat->factored_U, comReqs->L_diag_blk_recv_req,
+		      dFBuf->BlockLFactor, scuBufs->bigV, ldt,
+		      packLUInfo->Ublock_info, grid, LUstruct, stat, SCT);
+        /*U bcast*/
+	sIBcastRecvUPanel(k, k, msgs->msgcnt, comReqs->send_requ, comReqs->recv_requ,
+			  LUvsb->Usub_buf, LUvsb->Uval_buf, 
+			  grid, LUstruct, SCT, tag_ub);
+	sWaitL(k, msgs->msgcnt, msgs->msgcntU, comReqs->send_req, comReqs->recv_req,
+	       grid, LUstruct, SCT);
+	sWaitU(k, msgs->msgcnt, comReqs->send_requ, comReqs->recv_requ, grid, LUstruct, SCT);
+#endif
+        double tsch = SuperLU_timer_();
+#if 0
+        int_t LU_nonempty = sSchurComplementSetup(k,
+                            msgs, packLUInfo, gIperm_c_supno, perm_c_supno,
+                            fNlists, scuBufs,  LUvsb, grid, LUstruct);
+#else
+	int_t LU_nonempty= sSchurComplementSetup(k, msgs->msgcnt,
+				 packLUInfo->Ublock_info, packLUInfo->Remain_info,
+				 packLUInfo->uPanelInfo, packLUInfo->lPanelInfo,
+				 gIperm_c_supno, fNlists->iperm_u, fNlists->perm_u,
+				 scuBufs->bigU, LUvsb->Lsub_buf, LUvsb->Lval_buf,
+				 LUvsb->Usub_buf, LUvsb->Uval_buf,
+				 grid, LUstruct);
+#endif
+        if (LU_nonempty)
+        {
+            Ublock_info_t* Ublock_info = packLUInfo->Ublock_info;
+            Remain_info_t*  Remain_info = packLUInfo->Remain_info;
+            uPanelInfo_t* uPanelInfo = packLUInfo->uPanelInfo;
+            lPanelInfo_t* lPanelInfo = packLUInfo->lPanelInfo;
+            int* indirect  = fNlists->indirect;
+            int* indirect2  = fNlists->indirect2;
+            /* Schur complement update */
+            int_t nub = uPanelInfo->nub;
+            int_t nlb = lPanelInfo->nlb;
+            float* bigV = scuBufs->bigV;
+            float* bigU = scuBufs->bigU;
+
+#pragma omp parallel for schedule(dynamic)
+            for (int_t ij = 0; ij < nub * nlb; ++ij)
+            {
+                /* code */
+                int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+                float** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+                int_t** Ufstnz_br_ptr = LUstruct->Llu->Ufstnz_br_ptr;
+                float** Unzval_br_ptr = LUstruct->Llu->Unzval_br_ptr;
+                int_t* xsup = LUstruct->Glu_persist->xsup;
+                int_t ub = ij / nlb;
+                int_t lb = ij % nlb;
+                float *L_mat = lPanelInfo->lusup;
+                int_t ldl = lPanelInfo->nsupr;
+                int_t luptr0 = lPanelInfo->luptr0;
+                float *U_mat = bigU;
+                int_t ldu = uPanelInfo->ldu;
+                int_t knsupc = SuperSize(k);
+                int_t klst = FstBlockC (k + 1);
+                int_t *lsub = lPanelInfo->lsub;
+                int_t *usub = uPanelInfo->usub;
+#ifdef _OPENMP		
+                int_t thread_id = omp_get_thread_num();
+#else		
+                int_t thread_id = 0;
+#endif		
+                sblock_gemm_scatter( lb, ub,
+                                    Ublock_info,
+                                    Remain_info,
+                                    &L_mat[luptr0], ldl,
+                                    U_mat, ldu,
+                                    bigV,
+                                    knsupc, klst,
+                                    lsub, usub, ldt,
+                                    thread_id, indirect, indirect2,
+                                    Lrowind_bc_ptr, Lnzval_bc_ptr,
+                                    Ufstnz_br_ptr, Unzval_br_ptr,
+                                    xsup, grid, stat
+#ifdef SCATTER_PROFILE
+                                    , Host_TheadScatterMOP, Host_TheadScatterTimer
+#endif
+                                  );
+            } /*for (int_t ij = 0; ij < nub * nlb;*/
+        } /*if (LU_nonempty)*/
+        SCT->NetSchurUpTimer += SuperLU_timer_() - tsch;
+#if 0
+        sWait_LUDiagSend(k,  comReqs, grid, SCT);
+#else
+	Wait_LUDiagSend(k, comReqs->U_diag_blk_send_req, comReqs->L_diag_blk_send_req, 
+			grid, SCT);
+#endif
+    } /* end main loop over all the supernodes (k0 < nnodes) */
+
+    return 0;
+} /* sdenseTreeFactor */
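
The OpenMP loop above flattens the nub-by-nlb grid of (U block, L block) pairs into a single index so one schedule(dynamic) loop load-balances both dimensions; ub = ij / nlb and lb = ij % nlb invert the flattening. A standalone sketch of that mapping (values hypothetical):

    #include <assert.h>

    /* Invert ij = ub * nlb + lb, as used in sdenseTreeFactor(). */
    static void unflatten_pair(int ij, int nlb, int *lb, int *ub)
    {
        *ub = ij / nlb;   /* U block (column) index */
        *lb = ij % nlb;   /* L block (row) index    */
    }

    int main(void)
    {
        int nub = 3, nlb = 4, lb, ub;
        for (int ij = 0; ij < nub * nlb; ++ij) {
            unflatten_pair(ij, nlb, &lb, &ub);
            assert(ij == ub * nlb + lb);   /* round trip */
        }
        return 0;
    }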
+
+/*
+ * 2D factorization of an individual subtree. -- CPU only.
+ */
+int_t ssparseTreeFactor_ASYNC(
+    sForest_t* sforest,
+    commRequests_t **comReqss,    // lists of communication requests // size maxEtree level
+    sscuBufs_t *scuBufs,       // contains buffers for schur complement update
+    packLUInfo_t*packLUInfo,
+    msgs_t**msgss,                  // size=num Look ahead
+    sLUValSubBuf_t** LUvsbs,          // size=num Look ahead
+    sdiagFactBufs_t **dFBufs,         // size maxEtree level
+    factStat_t *factStat,
+    factNodelists_t  *fNlists,
+    gEtreeInfo_t*   gEtreeInfo,        // global etree info
+    superlu_dist_options_t *options,
+    int_t * gIperm_c_supno,
+    int_t ldt,
+    HyP_t* HyP,
+    sLUstruct_t *LUstruct, gridinfo3d_t * grid3d, SuperLUStat_t *stat,
+    double thresh,  SCT_t *SCT, int tag_ub,
+    int *info
+)
+{
+    int_t nnodes =   sforest->nNodes ;      // number of nodes in the tree
+    if (nnodes < 1)
+    {
+        return 1;
+    }
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (grid3d->iam, "Enter ssparseTreeFactor_ASYNC()");
+#endif
+
+    int_t *perm_c_supno = sforest->nodeList ;  // list of nodes in the order of factorization
+    treeTopoInfo_t* treeTopoInfo = &sforest->topoInfo;
+    int_t* myIperm = treeTopoInfo->myIperm;
+
+    gridinfo_t* grid = &(grid3d->grid2d);
+    /*main loop over all the levels*/
+
+    int_t maxTopoLevel = treeTopoInfo->numLvl;
+    int_t* eTreeTopLims = treeTopoInfo->eTreeTopLims;
+    int_t * IrecvPlcd_D = factStat->IrecvPlcd_D;
+    int_t* factored_D = factStat->factored_D;
+    int_t * factored_L = factStat->factored_L;
+    int_t * factored_U = factStat->factored_U;
+    int_t* IbcastPanel_L = factStat->IbcastPanel_L;
+    int_t* IbcastPanel_U = factStat->IbcastPanel_U;
+    int_t* xsup = LUstruct->Glu_persist->xsup;
+
+    int_t numLAMax = getNumLookAhead(options);
+    int_t numLA = numLAMax;
+
+    for (int_t k0 = 0; k0 < eTreeTopLims[1]; ++k0)
+    {
+        int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+        int_t offset = k0;
+        /* k-th diagonal factorization */
+        /*Now factor and broadcast diagonal block*/
+#if 0
+        sDiagFactIBCast(k,  dFBufs[offset], factStat, comReqss[offset], grid,
+                        options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#else
+	sDiagFactIBCast(k, k, dFBufs[offset]->BlockUFactor, dFBufs[offset]->BlockLFactor,
+			factStat->IrecvPlcd_D,
+			comReqss[offset]->U_diag_blk_recv_req, 
+			comReqss[offset]->L_diag_blk_recv_req,
+			comReqss[offset]->U_diag_blk_send_req, 
+			comReqss[offset]->L_diag_blk_send_req,
+			grid, options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#endif
+        factored_D[k] = 1;
+    }
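+
+    /* All diagonal factorizations of the first (leaf) topological level
+       are issued eagerly above, before the pipelined loop over levels
+       below, so their broadcasts can overlap with the first Schur
+       complement updates. */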
+
+    for (int_t topoLvl = 0; topoLvl < maxTopoLevel; ++topoLvl)
+    {
+        /* code */
+        int_t k_st = eTreeTopLims[topoLvl];
+        int_t k_end = eTreeTopLims[topoLvl + 1];
+        for (int_t k0 = k_st; k0 < k_end; ++k0)
+        {
+            int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+            int_t offset = k0 - k_st;
+            /* diagonal factorization */
+            if (!factored_D[k] )
+            {
+                /*If LU panels from GPU are not reduced then reduce
+                them before diagonal factorization*/
+#if 0
+                sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid,
+                                options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#else
+		sDiagFactIBCast(k, k, dFBufs[offset]->BlockUFactor,
+				dFBufs[offset]->BlockLFactor, factStat->IrecvPlcd_D,
+				comReqss[offset]->U_diag_blk_recv_req, 
+				comReqss[offset]->L_diag_blk_recv_req,
+				comReqss[offset]->U_diag_blk_send_req, 
+				comReqss[offset]->L_diag_blk_send_req,
+				grid, options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#endif
+            }
+        }
+        double t_apt = SuperLU_timer_();
+
+        for (int_t k0 = k_st; k0 < k_end; ++k0)
+        {
+            int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+            int_t offset = k0 - k_st;
+
+            /*L update */
+            if (factored_L[k] == 0)
+            {  
+#if 0
+		sLPanelUpdate(k, dFBufs[offset], factStat, comReqss[offset],
+			      grid, LUstruct, SCT);
+#else
+		sLPanelUpdate(k, factStat->IrecvPlcd_D, factStat->factored_L,
+			      comReqss[offset]->U_diag_blk_recv_req, 
+			      dFBufs[offset]->BlockUFactor, grid, LUstruct, SCT);
+#endif
+                factored_L[k] = 1;
+            }
+            /*U update*/
+            if (factored_U[k] == 0)
+            {
+#if 0
+		sUPanelUpdate(k, ldt, dFBufs[offset], factStat, comReqss[offset],
+			      scuBufs, packLUInfo, grid, LUstruct, stat, SCT);
+#else
+		sUPanelUpdate(k, factStat->factored_U, comReqss[offset]->L_diag_blk_recv_req,
+			      dFBufs[offset]->BlockLFactor, scuBufs->bigV, ldt,
+			      packLUInfo->Ublock_info, grid, LUstruct, stat, SCT);
+#endif
+                factored_U[k] = 1;
+            }
+        }
+
+        for (int_t k0 = k_st; k0 < SUPERLU_MIN(k_end, k_st + numLA); ++k0)
+        {
+            int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+            int_t offset = k0 % numLA;
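+            /* Look-ahead slots are reused round-robin: the same requests
+               and buffers serve k0, k0 + numLA, k0 + 2*numLA, ... */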
+
+            /*L Ibcast*/
+            if (IbcastPanel_L[k] == 0)
+	    {
+#if 0
+                sIBcastRecvLPanel( k, comReqss[offset],  LUvsbs[offset],
+                                   msgss[offset], factStat, grid, LUstruct, SCT, tag_ub );
+#else
+		sIBcastRecvLPanel(k, k, msgss[offset]->msgcnt, comReqss[offset]->send_req,
+				  comReqss[offset]->recv_req, LUvsbs[offset]->Lsub_buf,
+				  LUvsbs[offset]->Lval_buf, factStat->factored, 
+				  grid, LUstruct, SCT, tag_ub);
+#endif
+                IbcastPanel_L[k] = 1; /* for consistency; unused later */
+            }
+
+            /*U Ibcast*/
+            if (IbcastPanel_U[k] == 0)
+            {
+#if 0
+                sIBcastRecvUPanel( k, comReqss[offset],  LUvsbs[offset],
+                                   msgss[offset], factStat, grid, LUstruct, SCT, tag_ub );
+#else
+		sIBcastRecvUPanel(k, k, msgss[offset]->msgcnt, comReqss[offset]->send_requ,
+				  comReqss[offset]->recv_requ, LUvsbs[offset]->Usub_buf,
+				  LUvsbs[offset]->Uval_buf, grid, LUstruct, SCT, tag_ub);
+#endif
+                IbcastPanel_U[k] = 1;
+            }
+        }
+
+        // if (topoLvl) SCT->tAsyncPipeTail += SuperLU_timer_() - t_apt;
+        SCT->tAsyncPipeTail += SuperLU_timer_() - t_apt;
+
+        for (int_t k0 = k_st; k0 < k_end; ++k0)
+        {
+            int_t k = perm_c_supno[k0];   // direct computation no perm_c_supno
+            int_t offset = k0 % numLA;
+
+#if 0
+            sWaitL(k, comReqss[offset], msgss[offset], grid, LUstruct, SCT);
+            /*Wait for U panel*/
+            sWaitU(k, comReqss[offset], msgss[offset], grid, LUstruct, SCT);
+#else
+	    sWaitL(k, msgss[offset]->msgcnt, msgss[offset]->msgcntU, 
+		   comReqss[offset]->send_req, comReqss[offset]->recv_req,
+		   grid, LUstruct, SCT);
+	    sWaitU(k, msgss[offset]->msgcnt, comReqss[offset]->send_requ, 
+		   comReqss[offset]->recv_requ, grid, LUstruct, SCT);
+#endif
+            double tsch = SuperLU_timer_();
+            int_t LU_nonempty = sSchurComplementSetupGPU(k,
+							 msgss[offset], packLUInfo,
+							 myIperm, gIperm_c_supno, 
+							 perm_c_supno, gEtreeInfo,
+							 fNlists, scuBufs,
+							 LUvsbs[offset],
+							 grid, LUstruct, HyP);
+            // initializing D2H data transfer
+            int_t jj_cpu = 0;
+
+            scuStatUpdate( SuperSize(k), HyP,  SCT, stat);
+            uPanelInfo_t* uPanelInfo = packLUInfo->uPanelInfo;
+            lPanelInfo_t* lPanelInfo = packLUInfo->lPanelInfo;
+            int_t *lsub = lPanelInfo->lsub;
+            int_t *usub = uPanelInfo->usub;
+            int* indirect  = fNlists->indirect;
+            int* indirect2  = fNlists->indirect2;
+
+            /* Schur complement update */
+
+            int_t knsupc = SuperSize(k);
+            int_t klst = FstBlockC (k + 1);
+
+            float* bigV = scuBufs->bigV;
+
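+            /* The Schur update of the trailing submatrix is organized as
+               quadrants of the (L row block) x (U column block) plane:
+               look-ahead rows x look-ahead columns (top-left),
+               look-ahead rows x remaining columns (top-right), and
+               remaining rows x look-ahead columns (bottom-left) are done
+               first, so the panels needed by the next look-ahead steps
+               become available as early as possible; the remaining x
+               remaining quadrant (bottom-right) is done further below. */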
+#pragma omp parallel
+            {
+#pragma omp for schedule(dynamic,2) nowait
+		/* Each thread is assigned one loop index ij, responsible for
+		   block update L(lb,k) * U(k,j) -> tempv[]. */
+                for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks; ++ij)
+                {
+		    /* Get the entire area of L (look-ahead) X U (all-blocks). */
+		    /* for each j-block in U, go through all L-blocks in the
+		       look-ahead window. */
+                    int_t j   = ij / HyP->lookAheadBlk;
+                    int_t lb  = ij % HyP->lookAheadBlk;
+                    sblock_gemm_scatterTopLeft( lb,  j, bigV, knsupc, klst, lsub,
+					       usub, ldt,  indirect, indirect2, HyP,
+					       LUstruct, grid, SCT, stat );
+                }
+
+#pragma omp for schedule(dynamic,2) nowait
+                for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks_Phi; ++ij)
+                {
+                    int_t j   = ij / HyP->lookAheadBlk ;
+                    int_t lb  = ij % HyP->lookAheadBlk;
+                    sblock_gemm_scatterTopRight( lb,  j, bigV, knsupc, klst, lsub,
+                                                usub, ldt,  indirect, indirect2, HyP,
+						LUstruct, grid, SCT, stat);
+                }
+
+#pragma omp for schedule(dynamic,2) nowait
+                for (int_t ij = 0; ij < HyP->RemainBlk * HyP->num_u_blks; ++ij) //
+                {
+                    int_t j   = ij / HyP->RemainBlk;
+                    int_t lb  = ij % HyP->RemainBlk;
+                    sblock_gemm_scatterBottomLeft( lb,  j, bigV, knsupc, klst, lsub,
+                                                  usub, ldt,  indirect, indirect2,
+						  HyP, LUstruct, grid, SCT, stat);
+                } /*for (int_t ij =*/
+            }
+
+            if (topoLvl < maxTopoLevel - 1)
+            {
+                int_t k_parent = gEtreeInfo->setree[k];
+                gEtreeInfo->numChildLeft[k_parent]--;
+                if (gEtreeInfo->numChildLeft[k_parent] == 0)
+                {
+                    int_t k0_parent =  myIperm[k_parent];
+                    if (k0_parent > 0)
+                    {
+                        /* code */
+                        assert(k0_parent < nnodes);
+                        int_t offset = k0_parent - k_end;
+#if 0
+                        sDiagFactIBCast(k_parent,  dFBufs[offset], factStat,
+					comReqss[offset], grid, options, thresh,
+					LUstruct, stat, info, SCT, tag_ub);
+#else
+			sDiagFactIBCast(k_parent, k_parent, dFBufs[offset]->BlockUFactor,
+					dFBufs[offset]->BlockLFactor, factStat->IrecvPlcd_D,
+					comReqss[offset]->U_diag_blk_recv_req, 
+					comReqss[offset]->L_diag_blk_recv_req,
+					comReqss[offset]->U_diag_blk_send_req, 
+					comReqss[offset]->L_diag_blk_send_req,
+					grid, options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#endif
+                        factored_D[k_parent] = 1;
+                    }
+
+                }
+            }
+
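+            /* Bottom-right quadrant: the remaining rows times the U
+               columns outside the look-ahead window (j >= jj_cpu; jj_cpu
+               is 0 in this CPU-only path). It is updated last so the
+               parent's diagonal factorization above can be issued first. */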
+#pragma omp parallel
+            {
+#pragma omp for schedule(dynamic,2) nowait
+                for (int_t ij = 0; ij < HyP->RemainBlk * (HyP->num_u_blks_Phi - jj_cpu) ; ++ij)
+                {
+                    int_t j   = ij / HyP->RemainBlk + jj_cpu;
+                    int_t lb  = ij % HyP->RemainBlk;
+                    sblock_gemm_scatterBottomRight( lb,  j, bigV, knsupc, klst, lsub,
+                                                   usub, ldt,  indirect, indirect2,
+						   HyP, LUstruct, grid, SCT, stat);
+                } /*for (int_t ij =*/
+
+            }
+
+            SCT->NetSchurUpTimer += SuperLU_timer_() - tsch;
+            // finish waiting for diag block send
+            int_t abs_offset = k0 - k_st;
+#if 0
+            sWait_LUDiagSend(k,  comReqss[abs_offset], grid, SCT);
+#else
+	    Wait_LUDiagSend(k, comReqss[abs_offset]->U_diag_blk_send_req, 
+			    comReqss[abs_offset]->L_diag_blk_send_req, 
+			    grid, SCT);
+#endif
+            /*Schedule next I bcasts*/
+            for (int_t next_k0 = k0 + 1; next_k0 < SUPERLU_MIN( k0 + 1 + numLA, nnodes); ++next_k0)
+            {
+                /* code */
+                int_t next_k = perm_c_supno[next_k0];
+                int_t offset = next_k0 % numLA;
+
+                /*L Ibcast*/
+                if (IbcastPanel_L[next_k] == 0 && factored_L[next_k])
+                {
+#if 0
+                    sIBcastRecvLPanel( next_k, comReqss[offset], 
+				       LUvsbs[offset], msgss[offset], factStat,
+				       grid, LUstruct, SCT, tag_ub );
+#else
+		    sIBcastRecvLPanel(next_k, next_k, msgss[offset]->msgcnt, 
+				      comReqss[offset]->send_req, comReqss[offset]->recv_req,
+				      LUvsbs[offset]->Lsub_buf, LUvsbs[offset]->Lval_buf,
+				      factStat->factored, grid, LUstruct, SCT, tag_ub);
+#endif
+                    IbcastPanel_L[next_k] = 1; /*will be used later*/
+                }
+                /*U Ibcast*/
+                if (IbcastPanel_U[next_k] == 0 && factored_U[next_k])
+                {
+#if 0
+                    sIBcastRecvUPanel( next_k, comReqss[offset],
+				       LUvsbs[offset], msgss[offset], factStat,
+				       grid, LUstruct, SCT, tag_ub );
+#else
+		    sIBcastRecvUPanel(next_k, next_k, msgss[offset]->msgcnt, 
+				      comReqss[offset]->send_requ, comReqss[offset]->recv_requ,
+				      LUvsbs[offset]->Usub_buf, LUvsbs[offset]->Uval_buf, 
+				      grid, LUstruct, SCT, tag_ub);
+#endif
+                    IbcastPanel_U[next_k] = 1;
+                }
+            }
+
+            if (topoLvl < maxTopoLevel - 1)
+            {
+
+                /*look ahead LU factorization*/
+                int_t kx_st = eTreeTopLims[topoLvl + 1];
+                int_t kx_end = eTreeTopLims[topoLvl + 2];
+                for (int_t k0x = kx_st; k0x < kx_end; k0x++)
+                {
+                    /* code */
+                    int_t kx = perm_c_supno[k0x];
+                    int_t offset = k0x - kx_st;
+                    if (IrecvPlcd_D[kx] && !factored_L[kx])
+                    {
+                        /*check if received*/
+                        int_t recvUDiag = checkRecvUDiag(kx, comReqss[offset],
+                                                         grid, SCT);
+                        if (recvUDiag)
+                        {
+#if 0
+                            sLPanelTrSolve( kx,  dFBufs[offset],
+                                            factStat, comReqss[offset],
+                                            grid, LUstruct, SCT);
+#else
+			    sLPanelTrSolve( kx, factStat->factored_L, 
+					    dFBufs[offset]->BlockUFactor, grid, LUstruct);
+#endif
+
+                            factored_L[kx] = 1;
+
+                            /*check if an L_Ibcast is possible*/
+
+                            if (IbcastPanel_L[kx] == 0 &&
+                                    k0x - k0 < numLA + 1  && // is within lookahead window
+                                    factored_L[kx])
+                            {
+                                int_t offset1 = k0x % numLA;
+#if 0
+                                sIBcastRecvLPanel( kx, comReqss[offset1], LUvsbs[offset1],
+                                                   msgss[offset1], factStat,
+						   grid, LUstruct, SCT, tag_ub);
+#else
+				sIBcastRecvLPanel(kx, kx, msgss[offset1]->msgcnt, 
+						  comReqss[offset1]->send_req,
+						  comReqss[offset1]->recv_req,
+						  LUvsbs[offset1]->Lsub_buf,
+						  LUvsbs[offset1]->Lval_buf, 
+						  factStat->factored, 
+						  grid, LUstruct, SCT, tag_ub);
+#endif
+                                IbcastPanel_L[kx] = 1; /*will be used later*/
+                            }
+
+                        }
+                    }
+
+                    if (IrecvPlcd_D[kx] && !factored_U[kx])
+                    {
+                        /*check if received*/
+                        int_t recvLDiag = checkRecvLDiag( kx, comReqss[offset],
+                                                          grid, SCT);
+                        if (recvLDiag)
+                        {
+#if 0
+                            sUPanelTrSolve( kx, ldt, dFBufs[offset], scuBufs, packLUInfo,
+                                            grid, LUstruct, stat, SCT);
+#else
+			    sUPanelTrSolve( kx, dFBufs[offset]->BlockLFactor,
+                                            scuBufs->bigV,
+					    ldt, packLUInfo->Ublock_info, 
+					    grid, LUstruct, stat, SCT);
+#endif
+                            factored_U[kx] = 1;
+                            /*check if a U_Ibcast is possible*/
+
+                            if (IbcastPanel_U[kx] == 0 &&
+                                    k0x - k0 < numLA + 1  && // is within lookahead window
+                                    factored_U[kx])
+                            {
+                                int_t offset = k0x % numLA;
+#if 0
+                                sIBcastRecvUPanel( kx, comReqss[offset],
+						   LUvsbs[offset],
+						   msgss[offset], factStat,
+						   grid, LUstruct, SCT, tag_ub);
+#else
+				sIBcastRecvUPanel(kx, kx, msgss[offset]->msgcnt, 
+						  comReqss[offset]->send_requ,
+						  comReqss[offset]->recv_requ,
+						  LUvsbs[offset]->Usub_buf,
+						  LUvsbs[offset]->Uval_buf, 
+						  grid, LUstruct, SCT, tag_ub);
+#endif
+                                IbcastPanel_U[kx] = 1; /*will be used later*/
+                            }
+                        }
+                    }
+                }
+
+            }
+        }/*for main loop (int_t k0 = 0; k0 < gNodeCount[tree]; ++k0)*/
+
+    }
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (grid3d->iam, "Exit ssparseTreeFactor_ASYNC()");
+#endif
+
+    return 0;
+} /* ssparseTreeFactor_ASYNC */
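
The three `omp for` loops above walk a two-dimensional grid of (L block, U block) pairs through one flattened index `ij`, so the dynamic schedule can balance blocks of very different cost. A minimal standalone sketch of that index decomposition, with made-up block counts (not SuperLU code):

#include <stdio.h>

int main(void)
{
    int nLblocks = 3, nUblocks = 4;   /* hypothetical block counts */
    for (int ij = 0; ij < nLblocks * nUblocks; ++ij) {
        int j  = ij / nLblocks;       /* U-block (column) index */
        int lb = ij % nLblocks;       /* L-block (row) index, varies fastest */
        printf("update block (lb=%d, j=%d)\n", lb, j);
    }
    return 0;
}

Because lb varies fastest, consecutive iterations touch different L blocks of the same U column, which is what lets a dynamic schedule spread uneven row-block sizes across threads.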
diff --git a/SRC/streeFactorizationGPU.c b/SRC/streeFactorizationGPU.c
new file mode 100644
index 00000000..2875ed5c
--- /dev/null
+++ b/SRC/streeFactorizationGPU.c
@@ -0,0 +1,735 @@
+
+
+/*! @file
+ * \brief Factorization routines for the subtree using 2D process grid, with GPUs.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley,
+ * Georgia Institute of Technology, Oak Ridge National Laboratory
+ * May 12, 2021
+ * 
+ */
+// #include "treeFactorization.h"
+// #include "trfCommWrapper.h"
+#include "slustruct_gpu.h"
+
+//#include "cblas.h"
+
+#ifdef GPU_ACC ///////////////// enable GPU
+
+/*
+ /-- num_u_blks --\ /-- num_u_blks_Phi --\
+ -----------------------------------------
+ |    host_cols   ||   GPU    |   host    |
+ -----------------------------------------
+                  ^           ^
+                  0           jj_cpu
+*/
+static int_t getAccUPartition(HyP_t *HyP)
+{
+    /* Sherry: what if num_u_blks_phi == 0 ? Need to fix the bug */
+    int_t total_cols_1 = HyP->Ublock_info_Phi[HyP->num_u_blks_Phi - 1].full_u_cols;
+
+    int_t host_cols = HyP->Ublock_info[HyP->num_u_blks - 1].full_u_cols;
+    double cpu_time_0 = estimate_cpu_time(HyP->Lnbrow, total_cols_1, HyP->ldu_Phi) +
+                        estimate_cpu_time(HyP->Rnbrow, host_cols, HyP->ldu) + estimate_cpu_time(HyP->Lnbrow, host_cols, HyP->ldu);
+
+    int jj_cpu;
+
+#if 0 /* Ignore those estimates */
+    jj_cpu = tuned_partition(HyP->num_u_blks_Phi, HyP->Ublock_info_Phi,
+                             HyP->Remain_info, HyP->RemainBlk, cpu_time_0, HyP->Rnbrow, HyP->ldu_Phi );
+#else /* Sherry: new */
+    jj_cpu = HyP->num_u_blks_Phi;
+#endif
+
+    if (jj_cpu != 0 && HyP->Rnbrow > 0) // ###
+    {
+        HyP->offloadCondition = 1;
+    }
+    else
+    {
+        HyP->offloadCondition = 0;
+        jj_cpu = 0; // ###
+    }
+
+    return jj_cpu;
+}
+
+int ssparseTreeFactor_ASYNC_GPU(
+    sForest_t *sforest,
+    commRequests_t **comReqss, // lists of communication requests,
+                               // size = maxEtree level
+    sscuBufs_t *scuBufs,       // contains buffers for Schur complement update
+    packLUInfo_t *packLUInfo,
+    msgs_t **msgss,            // size = num Look ahead
+    sLUValSubBuf_t **LUvsbs,   // size = num Look ahead
+    sdiagFactBufs_t **dFBufs,  // size = maxEtree level
+    factStat_t *factStat,
+    factNodelists_t *fNlists,
+    gEtreeInfo_t *gEtreeInfo,  // global etree info
+    superlu_dist_options_t *options,
+    int_t *gIperm_c_supno,
+    int ldt,
+    ssluGPU_t *sluGPU,
+    d2Hreduce_t *d2Hred,
+    HyP_t *HyP,
+    sLUstruct_t *LUstruct, gridinfo3d_t *grid3d, SuperLUStat_t *stat,
+    double thresh, SCT_t *SCT, int tag_ub,
+    int *info)
+{
+    // sforest.nNodes, sforest.nodeList,
+    // &sforest.topoInfo,
+    int_t nnodes = sforest->nNodes; // number of nodes in supernodal etree
+    if (nnodes < 1)
+    {
+        return 1;
+    }
+
+    int_t *perm_c_supno = sforest->nodeList; // list of nodes in the order of factorization
+    treeTopoInfo_t *treeTopoInfo = &sforest->topoInfo;
+    int_t *myIperm = treeTopoInfo->myIperm;
+
+    gridinfo_t *grid = &(grid3d->grid2d);
+    /*main loop over all the levels*/
+
+    int_t maxTopoLevel = treeTopoInfo->numLvl;
+    int_t *eTreeTopLims = treeTopoInfo->eTreeTopLims;
+    int_t *IrecvPlcd_D = factStat->IrecvPlcd_D;
+    int_t *factored_D = factStat->factored_D;
+    int_t *factored_L = factStat->factored_L;
+    int_t *factored_U = factStat->factored_U;
+    int_t *IbcastPanel_L = factStat->IbcastPanel_L;
+    int_t *IbcastPanel_U = factStat->IbcastPanel_U;
+    int_t *gpuLUreduced = factStat->gpuLUreduced;
+    int_t *xsup = LUstruct->Glu_persist->xsup;
+
+    // int_t numLAMax = getNumLookAhead();
+    int_t numLAMax = getNumLookAhead(options);
+    int_t numLA = numLAMax; // number of look-ahead panels
+    int_t superlu_acc_offload = HyP->superlu_acc_offload;
+    int_t last_flag = 1; /* for updating nsuper-1 only once */
+    int_t nCudaStreams = sluGPU->nCudaStreams; // number of cuda streams
+
+    if (superlu_acc_offload)
+        ssyncAllfunCallStreams(sluGPU, SCT);
+
+    /* Go through each leaf node */
+    for (int_t k0 = 0; k0 < eTreeTopLims[1]; ++k0)
+    {
+        int_t k = perm_c_supno[k0]; // direct computation no perm_c_supno
+        int_t offset = k0;
+        /* k-th diagonal factorization */
+
+        /* If LU panels from GPU are not reduced, then reduce
+           them before diagonal factorization */
+        if (!gpuLUreduced[k] && superlu_acc_offload)
+        {
+            double tt_start1 = SuperLU_timer_();
+
+            sinitD2Hreduce(k, d2Hred, last_flag,
+                           HyP, sluGPU, grid, LUstruct, SCT);
+            int_t copyL_kljb = d2Hred->copyL_kljb;
+            int_t copyU_kljb = d2Hred->copyU_kljb;
+
+            if (copyL_kljb || copyU_kljb)
+                SCT->PhiMemCpyCounter++;
+            ssendLUpanelGPU2HOST(k, d2Hred, sluGPU);
+
+            sreduceGPUlu(last_flag, d2Hred, sluGPU, SCT, grid, LUstruct);
+
+            gpuLUreduced[k] = 1;
+            SCT->PhiMemCpyTimer += SuperLU_timer_() - tt_start1;
+        }
+
+        double t1 = SuperLU_timer_();
+
+        /*Now factor and broadcast diagonal block*/
+        // sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid,
+        //                 options, thresh, LUstruct, stat, info, SCT);
+
+#if 0
+        sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid,
+                        options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#else
+        sDiagFactIBCast(k, k, dFBufs[offset]->BlockUFactor, dFBufs[offset]->BlockLFactor,
+                        factStat->IrecvPlcd_D,
+                        comReqss[offset]->U_diag_blk_recv_req,
+                        comReqss[offset]->L_diag_blk_recv_req,
+                        comReqss[offset]->U_diag_blk_send_req,
+                        comReqss[offset]->L_diag_blk_send_req,
+                        grid, options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#endif
+        factored_D[k] = 1;
+
+        SCT->pdgstrf2_timer += (SuperLU_timer_() - t1);
+    } /* for all leaves ... */
+
+    //printf(".. SparseFactor_GPU: after leaves\n"); fflush(stdout);
+
+    /* Process supernodal etree level by level */
+    for (int topoLvl = 0; topoLvl < maxTopoLevel; ++topoLvl)
+    // for (int_t topoLvl = 0; topoLvl < 1; ++topoLvl)
+    {
+        // printf("(%d) factor level %d, maxTopoLevel %d\n",grid3d->iam,topoLvl,maxTopoLevel); fflush(stdout);
+        /* code */
+        int k_st = eTreeTopLims[topoLvl];
+        int k_end = eTreeTopLims[topoLvl + 1];
+
+        /* Process all the nodes in 'topoLvl': diagonal factorization */
+        for (int k0 = k_st; k0 < k_end; ++k0)
+        {
+            int k = perm_c_supno[k0]; // direct computation no perm_c_supno
+            int offset = k0 - k_st;
+
+            if (!factored_D[k])
+            {
+                /*If LU panels from GPU are not reduced then reduce
+                  them before diagonal factorization*/
+                if (!gpuLUreduced[k] && superlu_acc_offload)
+                {
+                    double tt_start1 = SuperLU_timer_();
+                    sinitD2Hreduce(k, d2Hred, last_flag,
+                                   HyP, sluGPU, grid, LUstruct, SCT);
+                    int_t copyL_kljb = d2Hred->copyL_kljb;
+                    int_t copyU_kljb = d2Hred->copyU_kljb;
+
+                    if (copyL_kljb || copyU_kljb)
+                        SCT->PhiMemCpyCounter++;
+                    ssendLUpanelGPU2HOST(k, d2Hred, sluGPU);
+                    /*
+                      Reduce the LU panels from GPU
+                    */
+                    sreduceGPUlu(last_flag, d2Hred, sluGPU, SCT, grid,
+                                 LUstruct);
+
+                    gpuLUreduced[k] = 1;
+                    SCT->PhiMemCpyTimer += SuperLU_timer_() - tt_start1;
+                }
+
+                double t1 = SuperLU_timer_();
+                /* Factor diagonal block on CPU */
+                // sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid,
+                //                 options, thresh, LUstruct, stat, info, SCT);
+#if 0
+                sDiagFactIBCast(k, dFBufs[offset], factStat, comReqss[offset], grid,
+                                options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#else
+                sDiagFactIBCast(k, k, dFBufs[offset]->BlockUFactor, dFBufs[offset]->BlockLFactor,
+                                factStat->IrecvPlcd_D,
+                                comReqss[offset]->U_diag_blk_recv_req,
+                                comReqss[offset]->L_diag_blk_recv_req,
+                                comReqss[offset]->U_diag_blk_send_req,
+                                comReqss[offset]->L_diag_blk_send_req,
+                                grid, options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#endif
+                SCT->pdgstrf2_timer += (SuperLU_timer_() - t1);
+            }
+        } /* for all nodes in this level */
+
+        //printf(".. SparseFactor_GPU: after diag factorization\n"); fflush(stdout);
+
+        double t_apt = SuperLU_timer_(); /* Async Pipe Timer */
+
+        /* Process all the nodes in 'topoLvl': panel updates on CPU */
+        for (int k0 = k_st; k0 < k_end; ++k0)
+        {
+            int k = perm_c_supno[k0]; // direct computation no perm_c_supno
+            int offset = k0 - k_st;
+
+            /*L update */
+            if (factored_L[k] == 0)
+            {
+#if 0
+                sLPanelUpdate(k, dFBufs[offset], factStat, comReqss[offset],
+                              grid, LUstruct, SCT);
+#else
+                sLPanelUpdate(k, factStat->IrecvPlcd_D, factStat->factored_L,
+                              comReqss[offset]->U_diag_blk_recv_req,
+                              dFBufs[offset]->BlockUFactor, grid, LUstruct, SCT);
+#endif
+
+                factored_L[k] = 1;
+            }
+            /*U update*/
+            if (factored_U[k] == 0)
+            {
+#if 0
+                sUPanelUpdate(k, ldt, dFBufs[offset], factStat, comReqss[offset],
+                              scuBufs, packLUInfo, grid, LUstruct, stat, SCT);
+#else
+                sUPanelUpdate(k, factStat->factored_U, comReqss[offset]->L_diag_blk_recv_req,
+                              dFBufs[offset]->BlockLFactor, scuBufs->bigV, ldt,
+                              packLUInfo->Ublock_info, grid, LUstruct, stat, SCT);
+#endif
+                factored_U[k] = 1;
+            }
+        } /* end panel update */
+
+        //printf(".. after CPU panel updates. numLA %d\n", numLA); fflush(stdout);
+
+        /* Process all the panels in look-ahead window:
+           broadcast L and U panels. */
+        for (int k0 = k_st; k0 < SUPERLU_MIN(k_end, k_st + numLA); ++k0)
+        {
+            int k = perm_c_supno[k0]; // direct computation no perm_c_supno
+            int offset = k0 % numLA;
+            /* diagonal factorization */
+
+            /*L Ibcast*/
+            if (IbcastPanel_L[k] == 0)
+            {
+#if 0
+                sIBcastRecvLPanel( k, comReqss[offset], LUvsbs[offset],
+                                   msgss[offset], factStat, grid, LUstruct, SCT, tag_ub );
+#else
+                sIBcastRecvLPanel(k, k, msgss[offset]->msgcnt, comReqss[offset]->send_req,
+                                  comReqss[offset]->recv_req, LUvsbs[offset]->Lsub_buf,
+                                  LUvsbs[offset]->Lval_buf, factStat->factored,
+                                  grid, LUstruct, SCT, tag_ub);
+#endif
+                IbcastPanel_L[k] = 1; /*for consistency; unused later*/
+            }
+
+            /*U Ibcast*/
+            if (IbcastPanel_U[k] == 0)
+            {
+#if 0
+                sIBcastRecvUPanel( k, comReqss[offset], LUvsbs[offset],
+                                   msgss[offset], factStat, grid, LUstruct, SCT, tag_ub );
+#else
+                sIBcastRecvUPanel(k, k, msgss[offset]->msgcnt, comReqss[offset]->send_requ,
+                                  comReqss[offset]->recv_requ, LUvsbs[offset]->Usub_buf,
+                                  LUvsbs[offset]->Uval_buf, grid, LUstruct, SCT, tag_ub);
+#endif
+                IbcastPanel_U[k] = 1;
+            }
+        } /* end for panels in look-ahead window */
+
+        //printf(".. after CPU look-ahead updates\n"); fflush(stdout);
+
+        // if (topoLvl) SCT->tAsyncPipeTail += SuperLU_timer_() - t_apt;
+        SCT->tAsyncPipeTail += (SuperLU_timer_() - t_apt);
+
+        /* Process all the nodes in level 'topoLvl': Schur complement update
+           (no MPI communication) */
+        for (int k0 = k_st; k0 < k_end; ++k0)
+        {
+            int k = perm_c_supno[k0]; // direct computation no perm_c_supno
+            int offset = k0 % numLA;
+
+            double tsch = SuperLU_timer_();
+
+#if 0
+            sWaitL(k, comReqss[offset], msgss[offset], grid, LUstruct, SCT);
+            /*Wait for U panel*/
+            sWaitU(k, comReqss[offset], msgss[offset], grid, LUstruct, SCT);
+#else
+            sWaitL(k, msgss[offset]->msgcnt, msgss[offset]->msgcntU,
+                   comReqss[offset]->send_req, comReqss[offset]->recv_req,
+                   grid, LUstruct, SCT);
+            sWaitU(k, msgss[offset]->msgcnt, comReqss[offset]->send_requ,
+                   comReqss[offset]->recv_requ, grid, LUstruct, SCT);
+#endif
+
+            int_t LU_nonempty = sSchurComplementSetupGPU(k,
+                                                         msgss[offset], packLUInfo,
+                                                         myIperm, gIperm_c_supno, perm_c_supno,
+                                                         gEtreeInfo, fNlists, scuBufs,
+                                                         LUvsbs[offset], grid, LUstruct, HyP);
+            // initializing D2H data transfer. D2H = Device To Host.
+            int_t jj_cpu; /* limit between CPU and GPU */
+
+#if 1
+            if (superlu_acc_offload)
+            {
+                jj_cpu = HyP->num_u_blks_Phi; // -1 ??
+                HyP->offloadCondition = 1;
+            }
+            else
+            {
+                /* code */
+                HyP->offloadCondition = 0;
+                jj_cpu = 0;
+            }
+
+#else
+            if (superlu_acc_offload)
+            {
+                jj_cpu = getAccUPartition(HyP);
+
+                if (jj_cpu > 0)
+                    jj_cpu = HyP->num_u_blks_Phi;
+
+                /* Sherry force this --> */
+                jj_cpu = HyP->num_u_blks_Phi; // -1 ??
+                HyP->offloadCondition = 1;
+            }
+            else
+            {
+                jj_cpu = 0;
+            }
+#endif
+
+            // int_t jj_cpu = HyP->num_u_blks_Phi-1;
+            // if (HyP->Rnbrow > 0 && jj_cpu>=0)
+            //     HyP->offloadCondition = 1;
+            // else
+            //     HyP->offloadCondition = 0;
+            // jj_cpu=0;
+#if 0
+            if ( HyP->offloadCondition ) {
+                printf("(%d) k=%d, nub=%d, nub_host=%d, nub_phi=%d, jj_cpu %d, offloadCondition %d\n",
+                       grid3d->iam, k, HyP->num_u_blks+HyP->num_u_blks_Phi ,
+                       HyP->num_u_blks, HyP->num_u_blks_Phi,
+                       jj_cpu, HyP->offloadCondition);
+                fflush(stdout);
+            }
+#endif
+            scuStatUpdate(SuperSize(k), HyP, SCT, stat);
+
+            int_t offload_condition = HyP->offloadCondition;
+            uPanelInfo_t *uPanelInfo = packLUInfo->uPanelInfo;
+            lPanelInfo_t *lPanelInfo = packLUInfo->lPanelInfo;
+            int_t *lsub = lPanelInfo->lsub;
+            int_t *usub = uPanelInfo->usub;
+            int *indirect = fNlists->indirect;
+            int *indirect2 = fNlists->indirect2;
+
+            /* Schur Complement Update */
+
+            int_t knsupc = SuperSize(k);
+            int_t klst = FstBlockC(k + 1);
+
+            float *bigV = scuBufs->bigV;
+            float *bigU = scuBufs->bigU;
+
+            double t1 = SuperLU_timer_();
+
+#pragma omp parallel /* Look-ahead update on CPU */
+            {
+                int_t thread_id = omp_get_thread_num();
+
+#pragma omp for
+                for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks; ++ij)
+                {
+                    int_t j = ij / HyP->lookAheadBlk;
+                    int_t lb = ij % HyP->lookAheadBlk;
+                    sblock_gemm_scatterTopLeft(lb, j, bigV, knsupc, klst, lsub,
+                                               usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat);
+                }
+
+#pragma omp for
+                for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks_Phi; ++ij)
+                {
+                    int_t j = ij / HyP->lookAheadBlk;
+                    int_t lb = ij % HyP->lookAheadBlk;
+                    sblock_gemm_scatterTopRight(lb, j, bigV, knsupc, klst, lsub,
+                                                usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat);
+                }
+
+#pragma omp for
+                for (int_t ij = 0; ij < HyP->RemainBlk * HyP->num_u_blks; ++ij)
+                {
+                    int_t j = ij / HyP->RemainBlk;
+                    int_t lb = ij % HyP->RemainBlk;
+                    sblock_gemm_scatterBottomLeft(lb, j, bigV, knsupc, klst, lsub,
+                                                  usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat);
+                } /* for int_t ij = ... */
+            } /* end parallel region ... end look-ahead update */
+
+            SCT->lookaheadupdatetimer += (SuperLU_timer_() - t1);
+
+            //printf("... after look-ahead update, topoLvl %d\t maxTopoLevel %d\n", topoLvl, maxTopoLevel); fflush(stdout);
+
+            /* Reduce the L & U panels from GPU to CPU. */
+            if (topoLvl < maxTopoLevel - 1)
+            { /* Not the root */
+                int_t k_parent = gEtreeInfo->setree[k];
+                gEtreeInfo->numChildLeft[k_parent]--;
+                if (gEtreeInfo->numChildLeft[k_parent] == 0 && k_parent < nnodes)
+                { /* if k is the last child in this level */
+                    int_t k0_parent = myIperm[k_parent];
+                    if (k0_parent > 0)
+                    {
+                        /* code */
+                        // printf("Before assert: iam %d, k %d, k_parent %d, k0_parent %d, nnodes %d\n", grid3d->iam, k, k_parent, k0_parent, nnodes); fflush(stdout);
+                        // exit(-1);
+                        assert(k0_parent < nnodes);
+                        int offset = k0_parent - k_end;
+                        if (!gpuLUreduced[k_parent] && superlu_acc_offload)
+                        {
+                            double tt_start1 = SuperLU_timer_();
+
+                            sinitD2Hreduce(k_parent, d2Hred, last_flag,
+                                           HyP, sluGPU, grid, LUstruct, SCT);
+                            int_t copyL_kljb = d2Hred->copyL_kljb;
+                            int_t copyU_kljb = d2Hred->copyU_kljb;
+
+                            if (copyL_kljb || copyU_kljb)
+                                SCT->PhiMemCpyCounter++;
+                            ssendLUpanelGPU2HOST(k_parent, d2Hred, sluGPU);
+
+                            /* Reduce the LU panels from GPU */
+                            sreduceGPUlu(last_flag, d2Hred,
+                                         sluGPU, SCT, grid, LUstruct);
+
+                            gpuLUreduced[k_parent] = 1;
+                            SCT->PhiMemCpyTimer += SuperLU_timer_() - tt_start1;
+                        }
+
+                        /* Factorize diagonal block on CPU */
+#if 0
+                        sDiagFactIBCast(k_parent, dFBufs[offset], factStat,
+                                        comReqss[offset], grid, options, thresh,
+                                        LUstruct, stat, info, SCT, tag_ub);
+#else
+                        sDiagFactIBCast(k_parent, k_parent, dFBufs[offset]->BlockUFactor,
+                                        dFBufs[offset]->BlockLFactor, factStat->IrecvPlcd_D,
+                                        comReqss[offset]->U_diag_blk_recv_req,
+                                        comReqss[offset]->L_diag_blk_recv_req,
+                                        comReqss[offset]->U_diag_blk_send_req,
+                                        comReqss[offset]->L_diag_blk_send_req,
+                                        grid, options, thresh, LUstruct, stat, info, SCT, tag_ub);
+#endif
+                        factored_D[k_parent] = 1;
+                    } /* end if k0_parent > 0 */
+
+                } /* end if all children are done */
+            } /* end if non-root */
+
+#pragma omp parallel
+            {
+                /* Master thread performs Schur complement update on GPU. */
+#pragma omp master
+                {
+                    if (superlu_acc_offload)
+                    {
+                        int thread_id = omp_get_thread_num();
+                        double t1 = SuperLU_timer_();
+
+                        if (offload_condition)
+                        {
+                            SCT->datatransfer_count++;
+                            int streamId = k0 % nCudaStreams;
+
+                            /*wait for previous offload to get finished*/
+                            if (sluGPU->lastOffloadStream[streamId] != -1)
+                            {
+                                swaitGPUscu(streamId, sluGPU, SCT);
+                                sluGPU->lastOffloadStream[streamId] = -1;
+                            }
+
+                            int_t Remain_lbuf_send_size = knsupc * HyP->Rnbrow;
+                            int_t bigu_send_size = jj_cpu < 1 ? 0 : HyP->ldu_Phi * HyP->Ublock_info_Phi[jj_cpu - 1].full_u_cols;
+                            assert(bigu_send_size < HyP->bigu_size);
+
+                            /* !! Sherry add the test to avoid seg_fault inside
+                               sendSCUdataHost2GPU */
+                            if (bigu_send_size > 0)
+                            {
+                                ssendSCUdataHost2GPU(streamId, lsub, usub,
+                                                     bigU, bigu_send_size,
+                                                     Remain_lbuf_send_size, sluGPU, HyP);
+
+                                sluGPU->lastOffloadStream[streamId] = k0;
+                                int_t usub_len = usub[2];
+                                int_t lsub_len = lsub[1] + BC_HEADER + lsub[0] * LB_DESCRIPTOR;
+                                //{printf("... before SchurCompUpdate_GPU, bigu_send_size %d\n", bigu_send_size); fflush(stdout);}
+
+                                sSchurCompUpdate_GPU(
+                                    streamId, 0, jj_cpu, klst, knsupc, HyP->Rnbrow, HyP->RemainBlk,
+                                    Remain_lbuf_send_size, bigu_send_size, HyP->ldu_Phi, HyP->num_u_blks_Phi,
+                                    HyP->buffer_size, lsub_len, usub_len, ldt, k0, sluGPU, grid);
+                            } /* endif bigu_send_size > 0 */
+
+                            // sendLUpanelGPU2HOST( k0, d2Hred, sluGPU);
+
+                            SCT->schurPhiCallCount++;
+                            HyP->jj_cpu = jj_cpu;
+                            updateDirtyBit(k0, HyP, grid);
+                        } /* endif (offload_condition) */
+
+                        double t2 = SuperLU_timer_();
+                        SCT->SchurCompUdtThreadTime[thread_id * CACHE_LINE_SIZE] += (double)(t2 - t1); /* not used */
+                        SCT->CPUOffloadTimer += (double)(t2 - t1); // Sherry added
+
+                    } /* endif (superlu_acc_offload) */
+
+                } /* end omp master thread */
+
+#pragma omp for
+                /* The following update is on CPU. Should not be necessary now,
+                   because we set jj_cpu equal to num_u_blks_Phi. */
+                for (int_t ij = 0; ij < HyP->RemainBlk * (HyP->num_u_blks_Phi - jj_cpu); ++ij)
+                {
+                    //printf(".. WARNING: should NOT get here\n");
+                    int_t j = ij / HyP->RemainBlk + jj_cpu;
+                    int_t lb = ij % HyP->RemainBlk;
+                    sblock_gemm_scatterBottomRight(lb, j, bigV, knsupc, klst, lsub,
+                                                   usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat);
+                } /* for int_t ij = ... */
+
+            } /* end omp parallel region */
+
+            //SCT->NetSchurUpTimer += SuperLU_timer_() - tsch;
+
+            // finish waiting for diag block send
+            int_t abs_offset = k0 - k_st;
+#if 0
+            sWait_LUDiagSend(k, comReqss[abs_offset], grid, SCT);
+#else
+            Wait_LUDiagSend(k, comReqss[abs_offset]->U_diag_blk_send_req,
+                            comReqss[abs_offset]->L_diag_blk_send_req,
+                            grid, SCT);
+#endif
+
+            /*Schedule next I bcasts within look-ahead window */
+            for (int next_k0 = k0 + 1; next_k0 < SUPERLU_MIN(k0 + 1 + numLA, nnodes); ++next_k0)
+            {
+                /* code */
+                int_t next_k = perm_c_supno[next_k0];
+                int_t offset = next_k0 % numLA;
+
+                /*L Ibcast*/
+                if (IbcastPanel_L[next_k] == 0 && factored_L[next_k])
+                {
+#if 0
+                    sIBcastRecvLPanel( next_k, comReqss[offset],
+                                       LUvsbs[offset], msgss[offset], factStat,
+                                       grid, LUstruct, SCT, tag_ub );
+#else
+                    sIBcastRecvLPanel(next_k, next_k, msgss[offset]->msgcnt,
+                                      comReqss[offset]->send_req, comReqss[offset]->recv_req,
+                                      LUvsbs[offset]->Lsub_buf, LUvsbs[offset]->Lval_buf,
+                                      factStat->factored, grid, LUstruct, SCT, tag_ub);
+#endif
+                    IbcastPanel_L[next_k] = 1; /*will be used later*/
+                }
+                /*U Ibcast*/
+                if (IbcastPanel_U[next_k] == 0 && factored_U[next_k])
+                {
+#if 0
+                    sIBcastRecvUPanel( next_k, comReqss[offset],
+                                       LUvsbs[offset], msgss[offset], factStat,
+                                       grid, LUstruct, SCT, tag_ub );
+#else
+                    sIBcastRecvUPanel(next_k, next_k, msgss[offset]->msgcnt,
+                                      comReqss[offset]->send_requ, comReqss[offset]->recv_requ,
+                                      LUvsbs[offset]->Usub_buf, LUvsbs[offset]->Uval_buf,
+                                      grid, LUstruct, SCT, tag_ub);
+#endif
+                    IbcastPanel_U[next_k] = 1;
+                }
+            } /* end for look-ahead window */
+
+            if (topoLvl < maxTopoLevel - 1) /* not root */
+            {
+                /*look-ahead LU factorization*/
+                int kx_st = eTreeTopLims[topoLvl + 1];
+                int kx_end = eTreeTopLims[topoLvl + 2];
+                for (int k0x = kx_st; k0x < kx_end; k0x++)
+                {
+                    /* code */
+                    int kx = perm_c_supno[k0x];
+                    int offset = k0x - kx_st;
+                    if (IrecvPlcd_D[kx] && !factored_L[kx])
+                    {
+                        /*check if received*/
+                        int_t recvUDiag = checkRecvUDiag(kx, comReqss[offset],
+                                                         grid, SCT);
+                        if (recvUDiag)
+                        {
+#if 0
+                            sLPanelTrSolve( kx, dFBufs[offset],
+                                            factStat, comReqss[offset],
+                                            grid, LUstruct, SCT);
+#else
+                            sLPanelTrSolve(kx, factStat->factored_L,
+                                           dFBufs[offset]->BlockUFactor, grid, LUstruct);
+#endif
+
+                            factored_L[kx] = 1;
+
+                            /*check if an L_Ibcast is possible*/
+
+                            if (IbcastPanel_L[kx] == 0 &&
+                                k0x - k0 < numLA + 1 && // is within look-ahead window
+                                factored_L[kx])
+                            {
+                                int_t offset1 = k0x % numLA;
+#if 0
+                                sIBcastRecvLPanel( kx, comReqss[offset1], LUvsbs[offset1],
+                                                   msgss[offset1], factStat,
+                                                   grid, LUstruct, SCT, tag_ub);
+#else
+                                sIBcastRecvLPanel(kx, kx, msgss[offset1]->msgcnt,
+                                                  comReqss[offset1]->send_req,
+                                                  comReqss[offset1]->recv_req,
+                                                  LUvsbs[offset1]->Lsub_buf,
+                                                  LUvsbs[offset1]->Lval_buf,
+                                                  factStat->factored,
+                                                  grid, LUstruct, SCT, tag_ub);
+#endif
+                                IbcastPanel_L[kx] = 1; /*will be used later*/
+                            }
+                        }
+                    }
+
+                    if (IrecvPlcd_D[kx] && !factored_U[kx])
+                    {
+                        /*check if received*/
+                        int_t recvLDiag = checkRecvLDiag(kx, comReqss[offset],
+                                                         grid, SCT);
+                        if (recvLDiag)
+                        {
+#if 0
+                            sUPanelTrSolve( kx, ldt, dFBufs[offset], scuBufs, packLUInfo,
+                                            grid, LUstruct, stat, SCT);
+#else
+                            sUPanelTrSolve(kx, dFBufs[offset]->BlockLFactor,
+                                           scuBufs->bigV,
+                                           ldt, packLUInfo->Ublock_info,
+                                           grid, LUstruct, stat, SCT);
+#endif
+                            factored_U[kx] = 1;
+                            /*check if a U_Ibcast is possible*/
+
+                            if (IbcastPanel_U[kx] == 0 &&
+                                k0x - k0 < numLA + 1 && // is within look-ahead window
+                                factored_U[kx])
+                            {
+                                int_t offset = k0x % numLA;
+#if 0
+                                sIBcastRecvUPanel( kx, comReqss[offset],
+                                                   LUvsbs[offset],
+                                                   msgss[offset], factStat,
+                                                   grid, LUstruct, SCT, tag_ub);
+#else
+                                sIBcastRecvUPanel(kx, kx, msgss[offset]->msgcnt,
+                                                  comReqss[offset]->send_requ,
+                                                  comReqss[offset]->recv_requ,
+                                                  LUvsbs[offset]->Usub_buf,
+                                                  LUvsbs[offset]->Uval_buf,
+                                                  grid, LUstruct, SCT, tag_ub);
+#endif
+                                IbcastPanel_U[kx] = 1; /*will be used later*/
+                            }
+                        }
+                    }
+                } /* end look-ahead */
+
+            } /* end if non-root level */
+
+            /* end Schur complement update */
+            SCT->NetSchurUpTimer += SuperLU_timer_() - tsch;
+
+        } /* end Schur update for all the nodes in level 'topoLvl' */
+
+    } /* end for all levels of the tree */
+
+    return 0;
+} /* end ssparseTreeFactor_ASYNC_GPU */
+
+#endif // matching: enable GPU
diff --git a/SRC/strfAux.c b/SRC/strfAux.c
new file mode 100644
index 00000000..81fe3266
--- /dev/null
+++ b/SRC/strfAux.c
@@ -0,0 +1,758 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Auxiliary routine for 3D factorization.
+ *
+ *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ */
+
+#include "superlu_sdefs.h"
+
+#if 0
+#include "pdgstrf3d.h"
+#include "trfAux.h"
+#endif
+
+/* Initialize the data structure to assist HALO offload of Schur-complement. */
+void sInit_HyP(HyP_t* HyP, sLocalLU_t *Llu, int_t mcb, int_t mrb )
+{
+    HyP->last_offload = -1;
+#if 0
+    HyP->lookAhead_info = (Remain_info_t *) _mm_malloc((mrb) * sizeof(Remain_info_t), 64);
+
+    HyP->lookAhead_L_buff = (float *) _mm_malloc( sizeof(float) * (Llu->bufmax[1]), 64);
+
+    HyP->Remain_L_buff = (float *) _mm_malloc( sizeof(float) * (Llu->bufmax[1]), 64);
+    HyP->Remain_info = (Remain_info_t *) _mm_malloc(mrb * sizeof(Remain_info_t), 64);
+    HyP->Ublock_info_Phi = (Ublock_info_t *) _mm_malloc(mcb * sizeof(Ublock_info_t), 64);
+    HyP->Ublock_info = (Ublock_info_t *) _mm_malloc(mcb * sizeof(Ublock_info_t), 64);
+    HyP->Lblock_dirty_bit = (int_t *) _mm_malloc(mcb * sizeof(int_t), 64);
+    HyP->Ublock_dirty_bit = (int_t *) _mm_malloc(mrb * sizeof(int_t), 64);
+#else
+    HyP->lookAhead_info = (Remain_info_t *) SUPERLU_MALLOC((mrb) * sizeof(Remain_info_t));
+    HyP->lookAhead_L_buff = (float *) floatMalloc_dist((Llu->bufmax[1]));
+    HyP->Remain_L_buff = (float *) floatMalloc_dist((Llu->bufmax[1]));
+    HyP->Remain_info = (Remain_info_t *) SUPERLU_MALLOC(mrb * sizeof(Remain_info_t));
+    HyP->Ublock_info_Phi = (Ublock_info_t *) SUPERLU_MALLOC(mcb * sizeof(Ublock_info_t));
+    HyP->Ublock_info = (Ublock_info_t *) SUPERLU_MALLOC(mcb * sizeof(Ublock_info_t));
+    HyP->Lblock_dirty_bit = (int_t *) intMalloc_dist(mcb);
+    HyP->Ublock_dirty_bit = (int_t *) intMalloc_dist(mrb);
+#endif
+
+    for (int_t i = 0; i < mcb; ++i)
+    {
+        HyP->Lblock_dirty_bit[i] = -1;
+    }
+
+    for (int_t i = 0; i < mrb; ++i)
+    {
+        HyP->Ublock_dirty_bit[i] = -1;
+    }
+
+    HyP->last_offload = -1;
+    HyP->superlu_acc_offload = get_acc_offload ();
+
+    HyP->nCudaStreams =0;
+} /* sInit_HyP */
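
The disabled branch above gets 64-byte-aligned buffers from _mm_malloc, while the live branch falls back to SUPERLU_MALLOC and the typed *_dist allocators. A minimal sketch of a portable alternative using C11 aligned_alloc (alloc64 is a hypothetical helper, not part of SuperLU); note aligned_alloc requires the size to be a multiple of the alignment:

#include <stdlib.h>
#include <string.h>

static void *alloc64(size_t nbytes)
{
    size_t rounded = (nbytes + 63) & ~(size_t)63;  /* round size up to 64 */
    void *p = aligned_alloc(64, rounded);          /* 64-byte aligned block */
    if (p) memset(p, 0, rounded);
    return p;                                      /* release with free() */
}

int main(void)
{
    double *buf = alloc64(1000 * sizeof(double));  /* aligned scratch buffer */
    free(buf);
    return 0;
}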
+
+/*init3DLUstruct with forest interface */
+void sinit3DLUstructForest( int_t* myTreeIdxs, int_t* myZeroTrIdxs,
+                           sForest_t**  sForests, sLUstruct_t* LUstruct,
+                           gridinfo3d_t* grid3d)
+{
+    int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
+    int_t numForests = (1 << maxLvl) - 1;
+    int_t* gNodeCount = INT_T_ALLOC (numForests);
+    int_t** gNodeLists =  (int_t**) SUPERLU_MALLOC(numForests * sizeof(int_t*));
+
+    for (int i = 0; i < numForests; ++i)
+	{
+	    gNodeCount[i] = 0;
+	    gNodeLists[i] = NULL;
+	    /* code */
+	    if (sForests[i])
+		{	
+                    gNodeCount[i] = sForests[i]->nNodes;
+		    gNodeLists[i] = sForests[i]->nodeList;
+		}
+	}
+    
+    /*call the old forest*/
+    sinit3DLUstruct( myTreeIdxs, myZeroTrIdxs,
+		     gNodeCount, gNodeLists, LUstruct, grid3d);
+
+    SUPERLU_FREE(gNodeCount);  // sherry added
+    SUPERLU_FREE(gNodeLists);
+}
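
sinit3DLUstructForest flattens the forests into per-forest node counts and node lists. The numForests = 2^maxLvl - 1 forests are assumed here to be numbered like a complete binary tree stored in heap order, so the children of forest f would sit at 2f+1 and 2f+2; a tiny standalone sketch of that numbering:

#include <stdio.h>

int main(void)
{
    int maxLvl = 3;                     /* hypothetical: 4 z-process levels -> 7 forests */
    int numForests = (1 << maxLvl) - 1;
    for (int f = 0; f < numForests; ++f) {
        int left = 2 * f + 1, right = 2 * f + 2;
        if (left < numForests)
            printf("forest %d -> children %d, %d\n", f, left, right);
        else
            printf("forest %d is a leaf\n", f);  /* bottom-level forest */
    }
    return 0;
}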
+
+int_t sSchurComplementSetup(
+    int_t k,
+    int *msgcnt,
+    Ublock_info_t*  Ublock_info,
+    Remain_info_t*  Remain_info,
+    uPanelInfo_t *uPanelInfo,
+    lPanelInfo_t *lPanelInfo,
+    int_t* iperm_c_supno,
+    int_t * iperm_u,
+    int_t * perm_u,
+    float *bigU,
+    int_t* Lsub_buf,
+    float *Lval_buf,
+    int_t* Usub_buf,
+    float *Uval_buf,
+    gridinfo_t *grid,
+    sLUstruct_t *LUstruct
+)
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+
+    int* ToRecv = Llu->ToRecv;
+    int_t iam = grid->iam;
+
+    int_t myrow = MYROW (iam, grid);
+    int_t mycol = MYCOL (iam, grid);
+
+    int_t krow = PROW (k, grid);
+    int_t kcol = PCOL (k, grid);
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    float** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    float** Unzval_br_ptr = Llu->Unzval_br_ptr;
+
+    int_t *usub;
+    float* uval;
+    int_t* lsub;
+    float* lusup;
+
+    if (mycol == kcol)
+    {
+        /*send the L panel to myrow*/
+        int_t  lk = LBj (k, grid);     /* Local block number. */
+        lsub = Lrowind_bc_ptr[lk];
+        lPanelInfo->lsub = Lrowind_bc_ptr[lk];
+        lusup = Lnzval_bc_ptr[lk];
+        lPanelInfo->lusup = Lnzval_bc_ptr[lk];
+    }
+    else
+    {
+        lsub = Lsub_buf;
+        lPanelInfo->lsub = Lsub_buf;
+        lusup = Lval_buf;
+        lPanelInfo->lusup = Lval_buf;
+    }
+
+    if (myrow == krow)
+    {
+        int_t  lk = LBi (k, grid);
+        usub = Ufstnz_br_ptr[lk];
+        uval = Unzval_br_ptr[lk];
+        uPanelInfo->usub = usub;
+    }
+    else
+    {
+        if (ToRecv[k] == 2)
+        {
+            usub = Usub_buf;
+            uval = Uval_buf;
+            uPanelInfo->usub = usub;
+        }
+    }
+
+    /* now each process does the Schur complement update */
+    int_t msg0 = msgcnt[0];
+    int_t msg2 = msgcnt[2];
+    int_t knsupc = SuperSize (k);
+
+    int_t lptr0, luptr0;
+    int_t LU_nonempty = msg0 && msg2;
+    if (LU_nonempty == 0) return 0;
+    if (msg0 && msg2)       /* L(:,k) and U(k,:) are not empty. */
+    {
+        lPanelInfo->nsupr = lsub[1];
+        int_t nlb;
+        if (myrow == krow)  /* Skip diagonal block L(k,k). */
+        {
+            lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER + 1];
+            luptr0 = knsupc;
+            nlb = lsub[0] - 1;
+            lPanelInfo->nlb = nlb;
+        }
+        else
+        {
+            lptr0 = BC_HEADER;
+            luptr0 = 0;
+            nlb = lsub[0];
+            lPanelInfo->nlb = nlb;
+        }
+        int_t iukp = BR_HEADER;   /* Skip header; Pointer to index[] of U(k,:) */
+        int_t rukp = 0;           /* Pointer to nzval[] of U(k,:) */
+        int_t nub = usub[0];      /* Number of blocks in the block row U(k,:) */
+        int_t klst = FstBlockC (k + 1);
+        uPanelInfo->klst = klst;
+
+        /* --------------------------------------------------------------
+           Update the look-ahead block columns A(:,k+1:k+num_look_ahead).
+           -------------------------------------------------------------- */
+        int_t iukp0 = iukp;
+        int_t rukp0 = rukp;
+
+        /* reorder the remaining columns in bottom-up order */
+        for (int_t jj = 0; jj < nub; jj++)
+        {
+#ifdef ISORT
+            iperm_u[jj] = iperm_c_supno[usub[iukp]];    /* Global block number of block U(k,j). */
+            perm_u[jj] = jj;
+#else
+            perm_u[2 * jj] = iperm_c_supno[usub[iukp]]; /* Global block number of block U(k,j). */
+            perm_u[2 * jj + 1] = jj;
+#endif
+            int_t jb = usub[iukp];    /* Global block number of block U(k,j). */
+            int_t nsupc = SuperSize (jb);
+            iukp += UB_DESCRIPTOR;  /* Start fstnz of block U(k,j). */
+            iukp += nsupc;
+        }
+        iukp = iukp0;
+#ifdef ISORT
+        isort (nub, iperm_u, perm_u);
+#else
+        qsort (perm_u, (size_t) nub, 2 * sizeof (int_t),
+               &superlu_sort_perm);
+#endif
+        // j = jj0 = 0;
+
+        int_t ldu   = 0;
+        int_t full  = 1;
+        int_t num_u_blks = 0;
+
+        for (int_t j = 0; j < nub ; ++j)
+        {
+            int_t iukp, temp_ncols;
+
+            temp_ncols = 0;
+            int_t  rukp, jb, ljb, nsupc, segsize;
+            arrive_at_ublock(
+                j, &iukp, &rukp, &jb, &ljb, &nsupc,
+                iukp0, rukp0, usub, perm_u, xsup, grid
+            );
+
+            int_t jj = iukp;
+            for (; jj < iukp + nsupc; ++jj)
+            {
+                segsize = klst - usub[jj];
+                if ( segsize ) ++temp_ncols;
+            }
+            Ublock_info[num_u_blks].iukp = iukp;
+            Ublock_info[num_u_blks].rukp = rukp;
+            Ublock_info[num_u_blks].jb = jb;
+            Ublock_info[num_u_blks].eo = iperm_c_supno[jb];
+            /* Prepare to call DGEMM. */
+            jj = iukp;
+
+            for (; jj < iukp + nsupc; ++jj)
+            {
+                segsize = klst - usub[jj];
+                if ( segsize )
+                {
+                    if ( segsize != ldu ) full = 0;
+                    if ( segsize > ldu ) ldu = segsize;
+                }
+            }
+
+            Ublock_info[num_u_blks].ncols = temp_ncols;
+            // ncols += temp_ncols;
+            num_u_blks++;
+
+        }
+
+        uPanelInfo->ldu = ldu;
+        uPanelInfo->nub = num_u_blks;
+
+        Ublock_info[0].full_u_cols = Ublock_info[0 ].ncols;
+        Ublock_info[0].StCol = 0;
+        for ( int_t j = 1; j < num_u_blks; ++j)
+        {
+            Ublock_info[j].full_u_cols = Ublock_info[j ].ncols + Ublock_info[j - 1].full_u_cols;
+            Ublock_info[j].StCol = Ublock_info[j - 1].StCol + Ublock_info[j - 1].ncols;
+        }
+
+        sgather_u(num_u_blks, Ublock_info, usub,  uval,  bigU,  ldu, xsup, klst );
+
+        sort_U_info_elm(Ublock_info, num_u_blks );
+
+        int_t cum_nrow = 0;
+        int_t RemainBlk = 0;
+
+        int_t lptr = lptr0;
+        int_t luptr = luptr0;
+        for (int_t i = 0; i < nlb; ++i)
+        {
+            int_t ib = lsub[lptr];        /* Row block L(i,k). */
+            int_t temp_nbrow = lsub[lptr + 1]; /* Number of full rows. */
+
+            Remain_info[RemainBlk].nrows = temp_nbrow;
+            Remain_info[RemainBlk].StRow = cum_nrow;
+            Remain_info[RemainBlk].FullRow = cum_nrow;
+            Remain_info[RemainBlk].lptr = lptr;
+            Remain_info[RemainBlk].ib = ib;
+            Remain_info[RemainBlk].eo = iperm_c_supno[ib];
+            RemainBlk++;
+
+            cum_nrow += temp_nbrow;
+            lptr += LB_DESCRIPTOR;  /* Skip descriptor. */
+            lptr += temp_nbrow;
+            luptr += temp_nbrow;
+        }
+
+        lptr = lptr0;
+        luptr = luptr0;
+        sort_R_info_elm( Remain_info, lPanelInfo->nlb );
+        lPanelInfo->luptr0 = luptr0;
+    }
+    return LU_nonempty;
+} /* sSchurComplementSetup */
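
The full_u_cols and StCol fields filled in above are running sums of the per-block column counts: full_u_cols is the inclusive prefix sum (columns through block j) and StCol the exclusive one (the starting column of block j in the packed buffer). A self-contained sketch with hypothetical block widths:

#include <stdio.h>

int main(void)
{
    int ncols[] = {3, 1, 4, 2};        /* hypothetical U-block widths */
    int n = 4, full = 0, st = 0;
    for (int j = 0; j < n; ++j) {
        st   = full;                   /* StCol: columns before block j */
        full += ncols[j];              /* full_u_cols: columns through block j */
        printf("block %d: StCol=%d full_u_cols=%d\n", j, st, full);
    }
    return 0;
}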
+
+/* 
+ * Gather L and U panels into respective buffers, to prepare for GEMM call.
+ * Divide Schur complement update into two parts: CPU vs. GPU.
+ */
+int_t sSchurComplementSetupGPU(
+    int_t k, msgs_t* msgs,
+    packLUInfo_t* packLUInfo,
+    int_t* myIperm, 
+    int_t* iperm_c_supno, int_t*perm_c_supno,
+    gEtreeInfo_t*   gEtreeInfo, factNodelists_t* fNlists,
+    sscuBufs_t* scuBufs, sLUValSubBuf_t* LUvsb,
+    gridinfo_t *grid, sLUstruct_t *LUstruct,
+    HyP_t* HyP)
+{
+    int_t * Lsub_buf  = LUvsb->Lsub_buf;
+    float * Lval_buf  = LUvsb->Lval_buf;
+    int_t * Usub_buf  = LUvsb->Usub_buf;
+    float * Uval_buf  = LUvsb->Uval_buf;
+    uPanelInfo_t* uPanelInfo = packLUInfo->uPanelInfo;
+    lPanelInfo_t* lPanelInfo = packLUInfo->lPanelInfo;
+    int* msgcnt  = msgs->msgcnt;
+    int_t* iperm_u  = fNlists->iperm_u;
+    int_t* perm_u  = fNlists->perm_u;
+    float* bigU = scuBufs->bigU;
+
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+
+    int* ToRecv = Llu->ToRecv;
+    int_t iam = grid->iam;
+
+    int_t myrow = MYROW (iam, grid);
+    int_t mycol = MYCOL (iam, grid);
+
+    int_t krow = PROW (k, grid);
+    int_t kcol = PCOL (k, grid);
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    float** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    float** Unzval_br_ptr = Llu->Unzval_br_ptr;
+
+    int_t *usub;
+    float* uval;
+    int_t* lsub;
+    float* lusup;
+
+    HyP->lookAheadBlk = 0, HyP->RemainBlk = 0;
+    HyP->Lnbrow =0, HyP->Rnbrow=0;
+    HyP->num_u_blks_Phi=0;
+    HyP->num_u_blks=0;
+
+    if (mycol == kcol)
+    {
+        /*send the L panel to myrow*/
+        int_t  lk = LBj (k, grid);     /* Local block number. */
+        lsub = Lrowind_bc_ptr[lk];
+        lPanelInfo->lsub = Lrowind_bc_ptr[lk];
+        lusup = Lnzval_bc_ptr[lk];
+        lPanelInfo->lusup = Lnzval_bc_ptr[lk];
+    }
+    else
+    {
+        lsub = Lsub_buf;
+        lPanelInfo->lsub = Lsub_buf;
+        lusup = Lval_buf;
+        lPanelInfo->lusup = Lval_buf;
+    }
+    if (myrow == krow)
+    {
+        int_t  lk = LBi (k, grid);
+        usub = Ufstnz_br_ptr[lk];
+        uval = Unzval_br_ptr[lk];
+        uPanelInfo->usub = usub;
+    }
+    else
+    {
+        if (ToRecv[k] == 2)
+        {
+            usub = Usub_buf;
+            uval = Uval_buf;
+            uPanelInfo->usub = usub;
+        }
+    }
+
+    /* now each process does the Schur complement update */
+    int_t msg0 = msgcnt[0];
+    int_t msg2 = msgcnt[2];
+    int_t knsupc = SuperSize (k);
+
+    int_t lptr0, luptr0;
+    int_t LU_nonempty = msg0 && msg2;
+    if (LU_nonempty == 0) return 0;
+    if (msg0 && msg2)       /* L(:,k) and U(k,:) are not empty. */
+    {
+        lPanelInfo->nsupr = lsub[1];
+        int_t nlb;
+        if (myrow == krow)  /* Skip diagonal block L(k,k). */
+        {
+            lptr0 = BC_HEADER + LB_DESCRIPTOR + lsub[BC_HEADER + 1];
+            luptr0 = knsupc;
+            nlb = lsub[0] - 1;
+            lPanelInfo->nlb = nlb;
+        }
+        else
+        {
+            lptr0 = BC_HEADER;
+            luptr0 = 0;
+            nlb = lsub[0];
+            lPanelInfo->nlb = nlb;
+        }
+        int_t iukp = BR_HEADER;   /* Skip header; Pointer to index[] of U(k,:) */
+
+        int_t nub = usub[0];      /* Number of blocks in the block row U(k,:) */
+        int_t klst = FstBlockC (k + 1);
+        uPanelInfo->klst = klst;
+
+        /* --------------------------------------------------------------
+           Update the look-ahead block columns A(:,k+1:k+num_look_ahead).
+           -------------------------------------------------------------- */
+        int_t iukp0 = iukp;
+
+        /* reorder the remaining columns in bottom-up order */
+        for (int_t jj = 0; jj < nub; jj++)
+        {
+#ifdef ISORT
+            iperm_u[jj] = iperm_c_supno[usub[iukp]];    /* Global block number of block U(k,j). */
+            perm_u[jj] = jj;
+#else
+            perm_u[2 * jj] = iperm_c_supno[usub[iukp]]; /* Global block number of block U(k,j). */
+            perm_u[2 * jj + 1] = jj;
+#endif
+            int_t jb = usub[iukp];    /* Global block number of block U(k,j). */
+            int_t nsupc = SuperSize (jb);
+            iukp += UB_DESCRIPTOR;  /* Start fstnz of block U(k,j). */
+            iukp += nsupc;
+        }
+        iukp = iukp0;
+#ifdef ISORT
+        isort (nub, iperm_u, perm_u);
+#else
+        qsort (perm_u, (size_t) nub, 2 * sizeof (int_t),
+               &superlu_sort_perm);
+#endif
+        HyP->Lnbrow = 0;
+        HyP->Rnbrow = 0;
+        HyP->num_u_blks_Phi=0;
+	HyP->num_u_blks=0;
+
+        sRgather_L(k, lsub, lusup,  gEtreeInfo, Glu_persist, grid, HyP, myIperm, iperm_c_supno);
+        if (HyP->Lnbrow + HyP->Rnbrow > 0)
+        {
+            sRgather_U( k, 0, usub, uval, bigU,  gEtreeInfo, Glu_persist, grid, HyP, myIperm, iperm_c_supno, perm_u);
+        }/*if(nbrow>0) */
+
+    }
+
+    return LU_nonempty;
+} /* sSchurComplementSetupGPU */
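
In the non-ISORT path above, perm_u packs (key, original position) pairs into consecutive array entries and sorts them by key with qsort. A minimal sketch of that pairing; cmp_pair mirrors what superlu_sort_perm is assumed to do, and the key values are made up:

#include <stdio.h>
#include <stdlib.h>

static int cmp_pair(const void *a, const void *b)
{
    const long *pa = (const long *)a, *pb = (const long *)b;
    return (pa[0] > pb[0]) - (pa[0] < pb[0]);   /* compare keys only */
}

int main(void)
{
    long perm_u[] = { 42, 0,  7, 1,  19, 2 };   /* (key, position) pairs */
    qsort(perm_u, 3, 2 * sizeof(long), cmp_pair);
    for (int j = 0; j < 3; ++j)
        printf("key %ld came from position %ld\n", perm_u[2*j], perm_u[2*j+1]);
    return 0;
}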
+
+
+float* sgetBigV(int_t ldt, int_t num_threads)
+{
+    float *bigV;
+    if (!(bigV = floatMalloc_dist (8 * ldt * ldt * num_threads)))
+        ABORT ("Malloc failed for sgemm buffV");
+    return bigV;
+}
+
+float* sgetBigU(int_t nsupers, gridinfo_t *grid, sLUstruct_t *LUstruct)
+{
+    int_t Pr = grid->nprow;
+    int_t Pc = grid->npcol;
+    int_t iam = grid->iam;
+    int_t mycol = MYCOL (iam, grid);
+
+    /* The following loop finds the maximum local block row size */
+    int local_max_row_size = 0;
+    int max_row_size;
+
+    for (int_t i = 0; i < nsupers; ++i)
+    {
+        int_t tpc = PCOL (i, grid);
+        if (mycol == tpc)
+        {
+            int_t lk = LBj (i, grid);
+            int_t* lsub = LUstruct->Llu->Lrowind_bc_ptr[lk];
+            if (lsub != NULL)
+            {
+                local_max_row_size = SUPERLU_MAX (local_max_row_size, lsub[1]);
+            }
+        }
+
+    }
+
+    /* Max row size is global reduction of within A row */
+    MPI_Allreduce (&local_max_row_size, &max_row_size, 1, MPI_INT, MPI_MAX,
+                   (grid->rscp.comm));
+
+    // int_t Threads_per_process = get_thread_per_process ();
+
+    /* Buffer size is the max over the look-ahead window */
+
+    int_t bigu_size =
+	8 * sp_ienv_dist (3) * (max_row_size) * SUPERLU_MAX(Pr / Pc, 1);
+	//Sherry: 8 * sp_ienv_dist (3) * (max_row_size) * MY_MAX(Pr / Pc, 1);
+
+    // printf("Size of big U is %d\n",bigu_size );
+    float* bigU = floatMalloc_dist(bigu_size);
+
+    return bigU;
+} /* sgetBigU */
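
sgetBigU sizes its buffer from the largest panel row count anywhere in the process row: each rank computes a local maximum, then one MPI_Allreduce with MPI_MAX publishes the global maximum to every rank. A minimal sketch of that pattern (local_max is a stand-in value; compile with mpicc):

#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    int local_max = 10 + rank;          /* stand-in for a local row size */
    int global_max;
    MPI_Allreduce(&local_max, &global_max, 1, MPI_INT, MPI_MAX,
                  MPI_COMM_WORLD);      /* every rank gets the same result */
    printf("rank %d sees global max %d\n", rank, global_max);
    MPI_Finalize();
    return 0;
}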
+
+
+trf3Dpartition_t* sinitTrf3Dpartition(int_t nsupers,
+				      superlu_dist_options_t *options,
+				      sLUstruct_t *LUstruct, gridinfo3d_t * grid3d
+				      )
+{
+    gridinfo_t* grid = &(grid3d->grid2d);
+
+#if ( DEBUGlevel>=1 )
+    int iam = grid3d->iam;
+    CHECK_MALLOC (iam, "Enter sinitTrf3Dpartition()");
+#endif
+    int_t* perm_c_supno = getPerm_c_supno(nsupers, options,
+                                         LUstruct->etree,
+    	   		                 LUstruct->Glu_persist,
+		                         LUstruct->Llu->Lrowind_bc_ptr,
+					 LUstruct->Llu->Ufstnz_br_ptr, grid);
+    int_t* iperm_c_supno = getFactIperm(perm_c_supno, nsupers);
+
+    // calculating tree factorization
+    int_t *setree = supernodal_etree(nsupers, LUstruct->etree, LUstruct->Glu_persist->supno, LUstruct->Glu_persist->xsup);
+    treeList_t* treeList = setree2list(nsupers, setree );
+
+    /*update treelist with weight and depth*/
+    getSCUweight(nsupers, treeList, LUstruct->Glu_persist->xsup,
+		  LUstruct->Llu->Lrowind_bc_ptr, LUstruct->Llu->Ufstnz_br_ptr,
+		  grid3d);
+
+    calcTreeWeight(nsupers, setree, treeList, LUstruct->Glu_persist->xsup);
+
+    gEtreeInfo_t gEtreeInfo;
+    gEtreeInfo.setree = setree;
+    gEtreeInfo.numChildLeft = (int_t* ) SUPERLU_MALLOC(sizeof(int_t) * nsupers);
+    for (int_t i = 0; i < nsupers; ++i)
+    {
+        /* code */
+        gEtreeInfo.numChildLeft[i] = treeList[i].numChild;
+    }
+
+    int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
+    sForest_t**  sForests = getForests( maxLvl, nsupers, setree, treeList);
+    /*indexes of trees for my process grid in gNodeList size(maxLvl)*/
+    int_t* myTreeIdxs = getGridTrees(grid3d);
+    int_t* myZeroTrIdxs = getReplicatedTrees(grid3d);
+    int_t*  gNodeCount = getNodeCountsFr(maxLvl, sForests);
+    int_t** gNodeLists = getNodeListFr(maxLvl, sForests); // reuse NodeLists stored in sForests[]
+
+    sinit3DLUstructForest(myTreeIdxs, myZeroTrIdxs,
+                         sForests, LUstruct, grid3d);
+    int_t* myNodeCount = getMyNodeCountsFr(maxLvl, myTreeIdxs, sForests);
+    int_t** treePerm = getTreePermFr( myTreeIdxs, sForests, grid3d);
+
+    sLUValSubBuf_t *LUvsb = SUPERLU_MALLOC(sizeof(sLUValSubBuf_t));
+    sLluBufInit(LUvsb, LUstruct);
+
+    int_t* supernode2treeMap = SUPERLU_MALLOC(nsupers*sizeof(int_t));
+    int_t numForests = (1 << maxLvl) - 1;
+    for (int_t Fr = 0; Fr < numForests; ++Fr)
+    {
+        /* code */
+        for (int_t nd = 0; nd < gNodeCount[Fr]; ++nd)
+        {
+            /* code */
+            supernode2treeMap[gNodeLists[Fr][nd]]=Fr;
+        }
+    }
+
+    trf3Dpartition_t*  trf3Dpartition = SUPERLU_MALLOC(sizeof(trf3Dpartition_t));
+
+    trf3Dpartition->gEtreeInfo = gEtreeInfo;
+    trf3Dpartition->iperm_c_supno = iperm_c_supno;
+    trf3Dpartition->myNodeCount = myNodeCount;
+    trf3Dpartition->myTreeIdxs = myTreeIdxs;
+    trf3Dpartition->myZeroTrIdxs = myZeroTrIdxs;
+    trf3Dpartition->sForests = sForests;
+    trf3Dpartition->treePerm = treePerm;
+    trf3Dpartition->LUvsb = LUvsb;
+    trf3Dpartition->supernode2treeMap = supernode2treeMap;
+
+    // Sherry added
+    // Deallocate storage
+    SUPERLU_FREE(gNodeCount); 
+    SUPERLU_FREE(gNodeLists); 
+    SUPERLU_FREE(perm_c_supno);
+    free_treelist(nsupers, treeList);
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (iam, "Exit sinitTrf3Dpartition()");
+#endif
+    return trf3Dpartition;
+} /* sinitTrf3Dpartition */
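
iperm_c_supno above is the inverse of perm_c_supno, in the sense assumed here that iperm[perm[i]] = i: perm maps a factorization step to a supernode, and iperm recovers the step of a supernode. Building such an inverse takes a single pass; a tiny sketch with a made-up ordering:

#include <stdio.h>

int main(void)
{
    int perm[]  = {2, 0, 3, 1};        /* hypothetical factorization order */
    int n = 4, iperm[4];
    for (int i = 0; i < n; ++i)
        iperm[perm[i]] = i;            /* invert the permutation in one pass */
    for (int s = 0; s < n; ++s)
        printf("supernode %d is factored at step %d\n", s, iperm[s]);
    return 0;
}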
+
+/* Free memory allocated for trf3Dpartition structure. Sherry added this routine */
+void sDestroy_trf3Dpartition(trf3Dpartition_t *trf3Dpartition, gridinfo3d_t *grid3d)
+{
+    int i;
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (grid3d->iam, "Enter sDestroy_trf3Dpartition()");
+#endif
+    SUPERLU_FREE(trf3Dpartition->gEtreeInfo.setree);
+    SUPERLU_FREE(trf3Dpartition->gEtreeInfo.numChildLeft);
+    SUPERLU_FREE(trf3Dpartition->iperm_c_supno);
+    SUPERLU_FREE(trf3Dpartition->myNodeCount);
+    SUPERLU_FREE(trf3Dpartition->myTreeIdxs);
+    SUPERLU_FREE(trf3Dpartition->myZeroTrIdxs);
+    SUPERLU_FREE(trf3Dpartition->treePerm); // double pointer pointing to sForests->nodeList
+
+    int_t maxLvl = log2i(grid3d->zscp.Np) + 1;
+    int_t numForests = (1 << maxLvl) - 1;
+    sForest_t** sForests = trf3Dpartition->sForests;
+    for (i = 0; i < numForests; ++i) {
+	if ( sForests[i] ) {
+	    SUPERLU_FREE(sForests[i]->nodeList);
+	    SUPERLU_FREE((sForests[i]->topoInfo).eTreeTopLims);
+	    SUPERLU_FREE((sForests[i]->topoInfo).myIperm);
+	    SUPERLU_FREE(sForests[i]); // Sherry added
+	}
+    }
+    SUPERLU_FREE(trf3Dpartition->sForests); // double pointer 
+    SUPERLU_FREE(trf3Dpartition->supernode2treeMap);
+
+    SUPERLU_FREE((trf3Dpartition->LUvsb)->Lsub_buf);
+    SUPERLU_FREE((trf3Dpartition->LUvsb)->Lval_buf);
+    SUPERLU_FREE((trf3Dpartition->LUvsb)->Usub_buf);
+    SUPERLU_FREE((trf3Dpartition->LUvsb)->Uval_buf);
+    SUPERLU_FREE(trf3Dpartition->LUvsb); // Sherry: check this ...
+
+    SUPERLU_FREE(trf3Dpartition);
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (grid3d->iam, "Exit sDestroy_trf3Dpartition()");
+#endif
+}
+
+
+#if 0  //**** Sherry: following two routines are old, the new ones are in util.c
+int_t num_full_cols_U(int_t kk,  int_t **Ufstnz_br_ptr, int_t *xsup,
+                      gridinfo_t *grid, int_t *perm_u)
+{
+    int_t lk = LBi (kk, grid);
+    int_t *usub = Ufstnz_br_ptr[lk];
+
+    if (usub == NULL)
+    {
+        /* code */
+        return 0;
+    }
+    int_t iukp = BR_HEADER;   /* Skip header; Pointer to index[] of U(k,:) */
+    int_t rukp = 0;           /* Pointer to nzval[] of U(k,:) */
+    int_t nub = usub[0];      /* Number of blocks in the block row U(k,:) */
+
+    int_t klst = FstBlockC (kk + 1);
+    int_t iukp0 = iukp;
+    int_t rukp0 = rukp;
+    int_t jb, ljb;
+    int_t nsupc;
+    int_t temp_ncols = 0;
+    int_t segsize;
+
+    temp_ncols = 0;
+
+    for (int_t j = 0; j < nub; ++j)
+    {
+        arrive_at_ublock(
+            j, &iukp, &rukp, &jb, &ljb, &nsupc,
+            iukp0, rukp0, usub, perm_u, xsup, grid
+        );
+
+        for (int_t jj = iukp; jj < iukp + nsupc; ++jj)
+        {
+            segsize = klst - usub[jj];
+            if ( segsize ) ++temp_ncols;
+        }
+    }
+    return temp_ncols;
+}
+
+// Sherry: this is old; new version is in util.c 
+int_t estimate_bigu_size( int_t nsupers, int_t ldt, int_t**Ufstnz_br_ptr,
+                          Glu_persist_t *Glu_persist,  gridinfo_t* grid, int_t* perm_u)
+{
+
+    int_t iam = grid->iam;
+
+    int_t Pr = grid->nprow;
+    int_t myrow = MYROW (iam, grid);
+
+    int_t* xsup = Glu_persist->xsup;
+
+    int ncols = 0;
+    int_t ldu = 0;
+
+    /* initialize perm_u */
+    for (int i = 0; i < nsupers; ++i)
+    {
+        perm_u[i] = i;
+    }
+
+    for (int lk = myrow; lk < nsupers; lk += Pr )
+    {
+        ncols = SUPERLU_MAX(ncols, num_full_cols_U(lk, Ufstnz_br_ptr,
+						   xsup, grid, perm_u, &ldu));
+    }
+
+    int_t max_ncols = 0;
+
+    MPI_Allreduce(&ncols, &max_ncols, 1, mpi_int_t, MPI_MAX, grid->cscp.comm);
+
+    printf("max_ncols =%d, bigu_size=%ld\n", (int) max_ncols, (long long) ldt * max_ncols);
+    return ldt * max_ncols;
+} /* old estimate_bigu_size. New one is in util.c */
+#endif /**** end old ones ****/
+
+
diff --git a/SRC/strfCommWrapper.c b/SRC/strfCommWrapper.c
new file mode 100644
index 00000000..b5126c60
--- /dev/null
+++ b/SRC/strfCommWrapper.c
@@ -0,0 +1,534 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Communication wrapper routines for 2D factorization.
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
+ */
+
+#include "superlu_sdefs.h"
+
+#if 0
+#include "pdgstrf3d.h"
+#include "trfCommWrapper.h"
+#endif
+
+//#include "cblas.h"
+
+int_t sDiagFactIBCast(int_t k,  int_t k0,      // supernode to be factored
+                     float *BlockUFactor,
+                     float *BlockLFactor,
+                     int_t* IrecvPlcd_D,
+                     MPI_Request *U_diag_blk_recv_req,
+                     MPI_Request *L_diag_blk_recv_req,
+                     MPI_Request *U_diag_blk_send_req,
+                     MPI_Request *L_diag_blk_send_req,
+                     gridinfo_t *grid,
+                     superlu_dist_options_t *options,
+                     double thresh,
+                     sLUstruct_t *LUstruct,
+                     SuperLUStat_t *stat, int *info,
+                     SCT_t *SCT,
+		     int tag_ub
+                    )
+{
+    // unpacking variables
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+
+    int_t iam = grid->iam;
+    int_t Pc = grid->npcol;
+    int_t Pr = grid->nprow;
+    int_t myrow = MYROW (iam, grid);
+    int_t mycol = MYCOL (iam, grid);
+    int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
+    int_t krow = PROW (k, grid);
+    int_t kcol = PCOL (k, grid);
+
+    // xsup is needed by SuperSize()
+
+    /*Place Irecvs first*/
+    // if (IrecvPlcd_D[k] == 0 )
+    // {
+    int_t nsupc = SuperSize (k);
+    if (mycol == kcol && iam != pkk)
+    {
+        sIRecv_UDiagBlock(k0, BlockUFactor,  /*pointer for the diagonal block*/
+                         nsupc * nsupc, krow,
+                         U_diag_blk_recv_req, grid, SCT, tag_ub);
+    }
+
+    if (myrow == krow && iam != pkk)
+    {
+        sIRecv_LDiagBlock(k0, BlockLFactor,  /*pointer for the diagonal block*/
+                         nsupc * nsupc, kcol,
+                         L_diag_blk_recv_req, grid, SCT, tag_ub);
+    }
+    IrecvPlcd_D[k] = 1;
+    // }
+
+    /*DiagFact and send */
+    // if ( factored_D[k] == 0 )
+    // {
+
+    // int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
+    // int_t krow = PROW (k, grid);
+    // int_t kcol = PCOL (k, grid);
+    /* factorize the leaf node and broadcast it to the
+       process row and process column */
+    if (iam == pkk)
+    {
+        // printf("Entering factorization %d\n", k);
+        // int_t offset = (k0 - k_st); // offset is input
+        /*factorize A[kk]*/
+        Local_Sgstrf2(options, k, thresh,
+                      BlockUFactor, /*factored U is overwritten here*/
+                      Glu_persist, grid, Llu, stat, info, SCT);
+
+        /*Pack L[kk] into blockLfactor*/
+        sPackLBlock(k, BlockLFactor, Glu_persist, grid, Llu);
+
+        /*Isend U blocks to the process row*/
+        int_t nsupc = SuperSize(k);
+        sISend_UDiagBlock(k0, BlockLFactor,
+                         nsupc * nsupc, U_diag_blk_send_req , grid, tag_ub);
+
+        /*Isend L blocks to the process col*/
+        sISend_LDiagBlock(k0, BlockLFactor,
+                         nsupc * nsupc, L_diag_blk_send_req, grid, tag_ub);
+        SCT->commVolFactor += 1.0 * nsupc * nsupc * (Pr + Pc);
+    }
+    // }
+    return 0;
+}
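
sDiagFactIBCast follows an Irecv-first discipline: consumers post receives for the diagonal block before the owner factors and sends it, and the waits complete much later (see Wait_LUDiagSend above), so communication overlaps the Schur update. A minimal MPI sketch of the same shape, with rank 0 playing the owner and a made-up 4x4 block; the real code uses per-destination Isends and separate L/U requests:

#include <mpi.h>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    int rank, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    float blk[16];                               /* 4x4 diagonal block */
    MPI_Request req = MPI_REQUEST_NULL;
    if (rank != 0) {
        /* post the receive early, before the block even exists */
        MPI_Irecv(blk, 16, MPI_FLOAT, 0, 7, MPI_COMM_WORLD, &req);
    } else {
        for (int i = 0; i < 16; ++i) blk[i] = (float)i;  /* "factor" the block */
        for (int p = 1; p < nprocs; ++p)
            MPI_Send(blk, 16, MPI_FLOAT, p, 7, MPI_COMM_WORLD);
    }
    MPI_Wait(&req, MPI_STATUS_IGNORE);           /* other work could run before this */
    MPI_Finalize();
    return 0;
}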
+
+int_t sLPanelTrSolve( int_t k,   int_t* factored_L,
+		      float* BlockUFactor,
+		      gridinfo_t *grid,
+		      sLUstruct_t *LUstruct)
+{
+    double alpha = 1.0;
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+
+    int_t iam = grid->iam;
+
+    int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
+    int_t kcol = PCOL (k, grid);
+    int_t mycol = MYCOL (iam, grid);
+    int nsupc = SuperSize(k);
+
+    /*factor the L panel*/
+    if (mycol == kcol  && iam != pkk)
+    {
+        // factored_L[k] = 1;
+        int_t lk = LBj (k, grid);
+        float *lusup = Llu->Lnzval_bc_ptr[lk];
+        int nsupr;
+        if (Llu->Lrowind_bc_ptr[lk])
+            nsupr = Llu->Lrowind_bc_ptr[lk][1];
+        else
+            nsupr = 0;
+        /*wait for communication to finish*/
+
+        // Wait_UDiagBlock_Recv( U_diag_blk_recv_req, SCT);
+        // int_t flag = 0;
+        // while (flag == 0)
+        // {
+        //     flag = Test_UDiagBlock_Recv( U_diag_blk_recv_req, SCT);
+        // }
+
+        int_t l = nsupr;
+        float* ublk_ptr = BlockUFactor;
+        int ld_ujrow = nsupc;
+
+        // unsigned long long t1 = _rdtsc();
+
+        // #pragma omp for schedule(dynamic) nowait
+#define BL  32
+        for (int i = 0; i < CEILING(l, BL); ++i)
+        {
+            #pragma omp task
+            {
+                int_t off = i * BL;
+                // Sherry: int_t len = MY_MIN(BL, l - i * BL);
+                int len = SUPERLU_MIN(BL, l - i * BL);
+
+                superlu_strsm("R", "U", "N", "N", len, nsupc, alpha,
+			      ublk_ptr, ld_ujrow, &lusup[off], nsupr);
+            }
+        }
+    }
+
+    if (iam == pkk)
+    {
+        /* if (factored_L[k] == 0)
+         { */
+        /* code */
+        factored_L[k] = 1;
+        int_t lk = LBj (k, grid);
+        float *lusup = Llu->Lnzval_bc_ptr[lk];
+        int nsupr;
+        if (Llu->Lrowind_bc_ptr[lk]) nsupr = Llu->Lrowind_bc_ptr[lk][1];
+        else nsupr = 0;
+
+        /*factorize A[kk]*/
+
+        int_t l = nsupr - nsupc;
+
+        float* ublk_ptr = BlockUFactor;
+        int ld_ujrow = nsupc;
+        // printf("%d: L update \n",k );
+
+#define BL  32
+        // #pragma omp parallel for
+        for (int i = 0; i < CEILING(l, BL); ++i)
+        {
+            int_t off = i * BL;
+            // Sherry: int_t len = MY_MIN(BL, l - i * BL);
+            int len = SUPERLU_MIN(BL, (l - i * BL));
+#pragma omp task
+            {
+                superlu_strsm("R", "U", "N", "N", len, nsupc, alpha,
+			      ublk_ptr, ld_ujrow, &lusup[nsupc + off], nsupr);
+            }
+        }
+    }
+
+    return 0;
+}  /* sLPanelTrSolve */
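+
+/* The strsm calls above solve X * U(k,k) = L-panel one BL-row strip at a
+ * time, each strip an independent OpenMP task.  A scalar sketch of what one
+ * strip computes (illustration only; column-major with leading dim nsupr):
+ *
+ *   for (int col = 0; col < nsupc; ++col)
+ *     for (int row = 0; row < len; ++row) {
+ *       float s = lusup[off + row + col*nsupr];
+ *       for (int t = 0; t < col; ++t)
+ *         s -= lusup[off + row + t*nsupr] * ublk_ptr[t + col*ld_ujrow];
+ *       lusup[off + row + col*nsupr] = s / ublk_ptr[col + col*ld_ujrow];
+ *     }
+ */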
+
+int_t sLPanelUpdate( int_t k,  int_t* IrecvPlcd_D, int_t* factored_L,
+                    MPI_Request * U_diag_blk_recv_req,
+                    float* BlockUFactor,
+                    gridinfo_t *grid,
+                    sLUstruct_t *LUstruct, SCT_t *SCT)
+{
+
+    sUDiagBlockRecvWait( k,  IrecvPlcd_D, factored_L,
+                         U_diag_blk_recv_req, grid, LUstruct, SCT);
+
+    sLPanelTrSolve( k, factored_L, BlockUFactor, grid, LUstruct );
+
+    return 0;
+}  /* sLPanelUpdate */
+
+#define BL  32
+
+int_t sUPanelTrSolve( int_t k,  
+                     float* BlockLFactor,
+                     float* bigV,
+                     int_t ldt,
+                     Ublock_info_t* Ublock_info,
+                     gridinfo_t *grid,
+                     sLUstruct_t *LUstruct,
+                     SuperLUStat_t *stat, SCT_t *SCT)
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int_t iam = grid->iam;
+    int_t myrow = MYROW (iam, grid);
+    int_t pkk = PNUM (PROW (k, grid), PCOL (k, grid), grid);
+    int_t krow = PROW (k, grid);
+    int_t nsupc = SuperSize(k);
+
+    /*factor the U panel*/
+    if (myrow == krow  && iam != pkk)
+    {
+        int_t lk = LBi (k, grid);         /* Local block number */
+        if (!Llu->Unzval_br_ptr[lk])
+            return 0;
+        /* Initialization. */
+        int_t klst = FstBlockC (k + 1);
+
+        int_t *usub = Llu->Ufstnz_br_ptr[lk];  /* index[] of block row U(k,:) */
+        float *uval = Llu->Unzval_br_ptr[lk];
+        int_t nb = usub[0];
+
+        // int_t nsupr = Lsub_buf[1];   /* LDA of lusup[] */
+        float *lusup = BlockLFactor;
+
+        /* Loop through all the row blocks to set up iukp and rukp. */
+        Trs2_InitUblock_info(klst, nb, Ublock_info, usub, Glu_persist, stat );
+
+        /* Loop through all the row blocks. */
+        // #pragma omp for schedule(dynamic,2) nowait
+        for (int_t b = 0; b < nb; ++b)
+        {
+            #pragma omp task
+            {
+#ifdef _OPENMP	    
+                int_t thread_id = omp_get_thread_num();
+#else		
+                int_t thread_id = 0;
+#endif		
+                float *tempv = bigV +  thread_id * ldt * ldt;
+                sTrs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp,
+				       usub, uval, tempv, nsupc, nsupc, lusup, Glu_persist);
+            }
+        }
+    }
+
+    /*factor the U panel on the diagonal process*/
+    if (iam == pkk)
+    {
+        /* code */
+        // factored_U[k] = 1;
+        int_t *Lsub_buf;
+        float *Lval_buf;
+        int_t lk = LBj (k, grid);
+        Lsub_buf = Llu->Lrowind_bc_ptr[lk];
+        Lval_buf = Llu->Lnzval_bc_ptr[lk];
+
+
+        /* calculate U panel */
+        // PDGSTRS2 (n, k0, k, Lsub_buf, Lval_buf, Glu_persist, grid, Llu,
+        //           stat, HyP->Ublock_info, bigV, ldt, SCT);
+
+        lk = LBi (k, grid);         /* Local block number */
+        if (Llu->Unzval_br_ptr[lk])
+        {
+            /* Initialization. */
+            int_t klst = FstBlockC (k + 1);
+
+            int_t *usub = Llu->Ufstnz_br_ptr[lk];  /* index[] of block row U(k,:) */
+            float *uval = Llu->Unzval_br_ptr[lk];
+            int_t nb = usub[0];
+
+            // int_t nsupr = Lsub_buf[1];   /* LDA of lusup[] */
+            int_t nsupr = Lsub_buf[1];   /* LDA of lusup[] */
+            float *lusup = Lval_buf;
+
+            /* Loop through all the row blocks to set up iukp and rukp. */
+            Trs2_InitUblock_info(klst, nb, Ublock_info, usub, Glu_persist, stat );
+
+            /* Loop through all the row blocks. */
+            // printf("%d :U update \n", k);
+            for (int_t b = 0; b < nb; ++b)
+            {
+                #pragma omp task
+                {
+#ifdef _OPENMP		
+                    int_t thread_id = omp_get_thread_num();
+#else		    
+                    int_t thread_id = 0;
+#endif		    
+                    float *tempv = bigV +  thread_id * ldt * ldt;
+                    sTrs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp,
+					   usub, uval, tempv, nsupc, nsupr, lusup, Glu_persist);
+                }
+
+            }
+        }
+    }
+
+    return 0;
+} /* sUPanelTrSolve */
+
+int_t sUPanelUpdate( int_t k,  int_t* factored_U,
+                    MPI_Request * L_diag_blk_recv_req,
+                    float* BlockLFactor,
+                    float* bigV,
+                    int_t ldt,
+                    Ublock_info_t* Ublock_info,
+                    gridinfo_t *grid,
+                    sLUstruct_t *LUstruct,
+                    SuperLUStat_t *stat, SCT_t *SCT)
+{
+
+    LDiagBlockRecvWait( k, factored_U, L_diag_blk_recv_req, grid);
+
+    sUPanelTrSolve( k, BlockLFactor, bigV, ldt, Ublock_info, grid,
+                       LUstruct, stat, SCT);
+    return 0;
+}
+
+int_t sIBcastRecvLPanel(
+    int_t k,
+    int_t k0,
+    int* msgcnt,
+    MPI_Request *send_req,
+    MPI_Request *recv_req ,
+    int_t* Lsub_buf,
+    float* Lval_buf,
+    int_t * factored,
+    gridinfo_t *grid,
+    sLUstruct_t *LUstruct,
+    SCT_t *SCT,
+    int tag_ub
+)
+{
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int_t* xsup = Glu_persist->xsup;
+    int** ToSendR = Llu->ToSendR;
+    int* ToRecv = Llu->ToRecv;
+    int_t iam = grid->iam;
+    int_t Pc = grid->npcol;
+    int_t mycol = MYCOL (iam, grid);
+    int_t kcol = PCOL (k, grid);
+    int_t** Lrowind_bc_ptr = Llu->Lrowind_bc_ptr;
+    float** Lnzval_bc_ptr = Llu->Lnzval_bc_ptr;
+    /* code */
+    if (mycol == kcol)
+    {
+        /*send the L panel to myrow*/
+
+        int_t lk = LBj (k, grid);     /* Local block number. */
+        int_t* lsub = Lrowind_bc_ptr[lk];
+        float* lusup = Lnzval_bc_ptr[lk];
+
+        sIBcast_LPanel (k, k0, lsub, lusup, grid, msgcnt, send_req,
+		       ToSendR, xsup, tag_ub);
+
+        if (lsub)
+        {
+            int_t nrbl  =   lsub[0]; /*number of L blocks */
+            int_t   len   = lsub[1];       /* LDA of the nzval[] */
+            int_t len1  = len + BC_HEADER + nrbl * LB_DESCRIPTOR;
+            int_t len2  = SuperSize(lk) * len;
+            SCT->commVolFactor += 1.0 * (Pc - 1) * (len1 * sizeof(int_t) + len2 * sizeof(float));
+        }
+    }
+    else
+    {
+        /*receive factored L panels*/
+        if (ToRecv[k] >= 1)     /* Recv block column L(:,k). */
+        {
+            /*place Irecv*/
+            sIrecv_LPanel (k, k0, Lsub_buf, Lval_buf, grid, recv_req, Llu, tag_ub);
+        }
+        else
+        {
+            msgcnt[0] = 0;
+        }
+
+    }
+    factored[k] = 0;
+
+    return 0;
+}
+
+int_t sIBcastRecvUPanel(int_t k, int_t k0, int* msgcnt,
+    			     MPI_Request *send_requ,
+    			     MPI_Request *recv_requ,
+    			     int_t* Usub_buf, float* Uval_buf,
+    			     gridinfo_t *grid, sLUstruct_t *LUstruct,
+    			     SCT_t *SCT, int tag_ub)
+{
+    sLocalLU_t *Llu = LUstruct->Llu;
+
+    int* ToSendD = Llu->ToSendD;
+    int* ToRecv = Llu->ToRecv;
+    int_t iam = grid->iam;
+    int_t Pr = grid->nprow;
+    int_t myrow = MYROW (iam, grid);
+    int_t krow = PROW (k, grid);
+
+    int_t** Ufstnz_br_ptr = Llu->Ufstnz_br_ptr;
+    float** Unzval_br_ptr = Llu->Unzval_br_ptr;
+    if (myrow == krow)
+    {
+        /*send U panel to myrow*/
+        int_t   lk = LBi (k, grid);
+        int_t*  usub = Ufstnz_br_ptr[lk];
+        float* uval = Unzval_br_ptr[lk];
+        sIBcast_UPanel(k, k0, usub, uval, grid, msgcnt,
+                        send_requ, ToSendD, tag_ub);
+        if (usub)
+        {
+            /* code */
+            int_t lenv = usub[1];
+            int_t lens = usub[2];
+            SCT->commVolFactor += 1.0 * (Pr - 1) * (lens * sizeof(int_t) + lenv * sizeof(float));
+        }
+    }
+    else
+    {
+        /*receive U panels */
+        if (ToRecv[k] == 2)     /* Recv block row U(k,:). */
+        {
+            sIrecv_UPanel (k, k0, Usub_buf, Uval_buf, Llu, grid, recv_requ, tag_ub);
+        }
+        else
+        {
+            msgcnt[2] = 0;
+        }
+    }
+
+    return 0;
+}
+
+int_t sWaitL( int_t k, int* msgcnt, int* msgcntU,
+              MPI_Request *send_req, MPI_Request *recv_req,
+    	      gridinfo_t *grid, sLUstruct_t *LUstruct, SCT_t *SCT)
+{
+    sLocalLU_t *Llu = LUstruct->Llu;
+    int** ToSendR = Llu->ToSendR;
+    int* ToRecv = Llu->ToRecv;
+    int_t iam = grid->iam;
+    int_t mycol = MYCOL (iam, grid);
+    int_t kcol = PCOL (k, grid);
+    if (mycol == kcol)
+    {
+        /*send the L panel to myrow*/
+        Wait_LSend (k, grid, ToSendR, send_req, SCT);
+    }
+    else
+    {
+        /*receive factored L panels*/
+        if (ToRecv[k] >= 1)     /* Recv block column L(:,k). */
+        {
+            /*force wait for I recv to complete*/
+            sWait_LRecv( recv_req,  msgcnt, msgcntU, grid, SCT);
+        }
+    }
+
+    return 0;
+}
+
+int_t sWaitU( int_t k, int* msgcnt,
+              MPI_Request *send_requ, MPI_Request *recv_requ,
+    	      gridinfo_t *grid, sLUstruct_t *LUstruct, SCT_t *SCT)
+{
+    sLocalLU_t *Llu = LUstruct->Llu;
+
+    int* ToRecv = Llu->ToRecv;
+    int* ToSendD = Llu->ToSendD;
+    int_t iam = grid->iam;
+    int_t myrow = MYROW (iam, grid);
+    int_t krow = PROW (k, grid);
+    if (myrow == krow)
+    {
+        int_t lk = LBi (k, grid);
+        if (ToSendD[lk] == YES)
+            Wait_USend(send_requ, grid, SCT);
+    }
+    else
+    {
+        /*receive U panels */
+        if (ToRecv[k] == 2)     /* Recv block row U(k,:). */
+        {
+            /*force wait*/
+            sWait_URecv( recv_requ, msgcnt, SCT);
+        }
+    }
+    return 0;
+}
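+
+/* A minimal sketch of how the wrappers above are typically sequenced within
+ * one elimination step k of a right-looking factorization loop (illustration
+ * only; the actual 3D driver interleaves these across a look-ahead window):
+ *
+ *   sDiagFactIBCast(k, k0, BlockUFactor, BlockLFactor, IrecvPlcd_D,
+ *                   U_diag_blk_recv_req, L_diag_blk_recv_req,
+ *                   U_diag_blk_send_req, L_diag_blk_send_req,
+ *                   grid, options, thresh, LUstruct, stat, info, SCT, tag_ub);
+ *   sLPanelUpdate(k, IrecvPlcd_D, factored_L, U_diag_blk_recv_req,
+ *                 BlockUFactor, grid, LUstruct, SCT);
+ *   sUPanelUpdate(k, factored_U, L_diag_blk_recv_req, BlockLFactor,
+ *                 bigV, ldt, Ublock_info, grid, LUstruct, stat, SCT);
+ *   sIBcastRecvLPanel(k, k0, msgcnt, send_req, recv_req, Lsub_buf, Lval_buf,
+ *                     factored, grid, LUstruct, SCT, tag_ub);
+ *   sIBcastRecvUPanel(k, k0, msgcnt, send_requ, recv_requ, Usub_buf,
+ *                     Uval_buf, grid, LUstruct, SCT, tag_ub);
+ *   sWaitL(k, msgcnt, msgcntU, send_req, recv_req, grid, LUstruct, SCT);
+ *   sWaitU(k, msgcnt, send_requ, recv_requ, grid, LUstruct, SCT);
+ *   ... Schur-complement update using the received L and U panels ...
+ */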
diff --git a/SRC/superlu_FortranCInterface.h b/SRC/superlu_FortranCInterface.h
index 467bfb65..c9fee77d 100644
--- a/SRC/superlu_FortranCInterface.h
+++ b/SRC/superlu_FortranCInterface.h
@@ -2,15 +2,15 @@
 #define FC_HEADER_INCLUDED
 
 /* Mangling for Fortran global symbols without underscores. */
-#define FC_GLOBAL(name,NAME) name##_
+#define FC_GLOBAL(name,NAME) name
 
 /* Mangling for Fortran global symbols with underscores. */
-#define FC_GLOBAL_(name,NAME) name##_
+#define FC_GLOBAL_(name,NAME) name
 
 /* Mangling for Fortran module symbols without underscores. */
-#define FC_MODULE(mod_name,name, mod_NAME,NAME) __##mod_name##_MOD_##name
+#define FC_MODULE(mod_name,name, mod_NAME,NAME) __##mod_name##_NMOD_##name
 
 /* Mangling for Fortran module symbols with underscores. */
-#define FC_MODULE_(mod_name,name, mod_NAME,NAME) __##mod_name##_MOD_##name
+#define FC_MODULE_(mod_name,name, mod_NAME,NAME) __##mod_name##_NMOD_##name
 
 #endif
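
For reference, C code consumes these macros to spell a Fortran symbol
portably; a minimal sketch with hypothetical symbol names:

    /* Fortran: subroutine foo(n); module m containing subroutine bar(n) */
    extern void FC_GLOBAL(foo, FOO)(int *n);
    extern void FC_MODULE(m, bar, M, BAR)(int *n);

Note this revision drops the trailing underscore from FC_GLOBAL (name rather
than name##_) and switches module mangling to _NMOD_; patch 105 below reverts
both changes.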
diff --git a/SRC/superlu_dist_config.h b/SRC/superlu_dist_config.h
deleted file mode 100644
index ffd061a2..00000000
--- a/SRC/superlu_dist_config.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* superlu_dist_config.h.in */
-
-/* Enable CUDA */
-/* #undef HAVE_CUDA */
-
-/* Enable parmetis */
-#define HAVE_PARMETIS TRUE
-
-/* Enable LAPACK */
-/* #undef SLU_HAVE_LAPACK */
-
-/* Enable CombBLAS */
-/* #undef HAVE_COMBBLAS */
-
-/* enable 64bit index mode */
-/* #undef XSDK_INDEX_SIZE */
-
-#if (XSDK_INDEX_SIZE == 64)
-#define _LONGINT 1
-#endif
diff --git a/SRC/superlu_gpu_utils.cu b/SRC/superlu_gpu_utils.cu
new file mode 100644
index 00000000..877ad865
--- /dev/null
+++ b/SRC/superlu_gpu_utils.cu
@@ -0,0 +1,14 @@
+#include <stdio.h>  /* include target lost in extraction; stdio.h assumed for fprintf (assert.h may also be required) */
+
+/*error reporting functions */
+cudaError_t checkCuda(cudaError_t result)
+{
+#if defined(DEBUG) || defined(_DEBUG)
+    if (result != cudaSuccess) {
+        fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
+        assert(result == cudaSuccess);
+    }
+#endif
+    return result;
+}
+
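A typical use of this wrapper folds error checking into each CUDA runtime
call (sketch; the buffer below is hypothetical):

    double *d_buf;
    checkCuda( cudaMalloc((void**)&d_buf, 1024 * sizeof(double)) );
    checkCuda( cudaMemset(d_buf, 0, 1024 * sizeof(double)) );

The check compiles in only when DEBUG or _DEBUG is defined; release builds
pass the status through unchanged.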
diff --git a/SRC/sutil_dist.c b/SRC/sutil_dist.c
new file mode 100644
index 00000000..c957b30b
--- /dev/null
+++ b/SRC/sutil_dist.c
@@ -0,0 +1,945 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Several matrix utilities
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 6.1.1) --
+ * Lawrence Berkeley National Lab, Univ. of California Berkeley.
+ * March 15, 2003
+ * </pre>
+ */
+
+#include <math.h>  /* include target lost in extraction; math.h assumed for fabs() */
+#include "superlu_sdefs.h"
+
+void
+sCreate_CompCol_Matrix_dist(SuperMatrix *A, int_t m, int_t n, int_t nnz,
+			    float *nzval, int_t *rowind, int_t *colptr,
+			    Stype_t stype, Dtype_t dtype, Mtype_t mtype)
+{
+    NCformat *Astore;
+
+    A->Stype = stype;
+    A->Dtype = dtype;
+    A->Mtype = mtype;
+    A->nrow = m;
+    A->ncol = n;
+    A->Store = (void *) SUPERLU_MALLOC( sizeof(NCformat) );
+    if ( !(A->Store) ) ABORT("SUPERLU_MALLOC fails for A->Store");
+    Astore = (NCformat *) A->Store;
+    Astore->nnz = nnz;
+    Astore->nzval = nzval;
+    Astore->rowind = rowind;
+    Astore->colptr = colptr;
+}
+
+void
+sCreate_CompRowLoc_Matrix_dist(SuperMatrix *A, int_t m, int_t n,
+			       int_t nnz_loc, int_t m_loc, int_t fst_row,
+			       float *nzval, int_t *colind, int_t *rowptr,
+			       Stype_t stype, Dtype_t dtype, Mtype_t mtype)
+{
+    NRformat_loc *Astore;
+
+    A->Stype = stype;
+    A->Dtype = dtype;
+    A->Mtype = mtype;
+    A->nrow = m;
+    A->ncol = n;
+    A->Store = (void *) SUPERLU_MALLOC( sizeof(NRformat_loc) );
+    if ( !(A->Store) ) ABORT("SUPERLU_MALLOC fails for A->Store");
+    Astore = (NRformat_loc *) A->Store;
+    Astore->nnz_loc = nnz_loc;
+    Astore->fst_row = fst_row;
+    Astore->m_loc = m_loc;
+    Astore->nzval = nzval;
+    Astore->colind = colind;
+    Astore->rowptr = rowptr;
+}
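+
+/* Example use (a sketch; the arrays are assumed filled by the caller):
+ *
+ *   SuperMatrix A;
+ *   sCreate_CompRowLoc_Matrix_dist(&A, m, n, nnz_loc, m_loc, fst_row,
+ *                                  nzval, colind, rowptr,
+ *                                  SLU_NR_loc, SLU_S, SLU_GE);
+ */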
+
+/*! \brief Convert a row compressed storage into a column compressed storage.
+ */
+void
+sCompRow_to_CompCol_dist(int_t m, int_t n, int_t nnz,
+                         float *a, int_t *colind, int_t *rowptr,
+                         float **at, int_t **rowind, int_t **colptr)
+{
+    register int i, j, col, relpos;
+    int_t *marker;
+
+    /* Allocate storage for another copy of the matrix. */
+    *at = (float *) floatMalloc_dist(nnz);
+    *rowind = intMalloc_dist(nnz);
+    *colptr = intMalloc_dist(n+1);
+    marker = intCalloc_dist(n);
+
+    /* Get counts of each column of A, and set up column pointers */
+    for (i = 0; i < m; ++i)
+	for (j = rowptr[i]; j < rowptr[i+1]; ++j) ++marker[colind[j]];
+    (*colptr)[0] = 0;
+    for (j = 0; j < n; ++j) {
+	(*colptr)[j+1] = (*colptr)[j] + marker[j];
+	marker[j] = (*colptr)[j];
+    }
+
+    /* Transfer the matrix into the compressed column storage. */
+    for (i = 0; i < m; ++i) {
+	for (j = rowptr[i]; j < rowptr[i+1]; ++j) {
+	    col = colind[j];
+	    relpos = marker[col];
+	    (*rowind)[relpos] = i;
+	    (*at)[relpos] = a[j];
+	    ++marker[col];
+	}
+    }
+
+    SUPERLU_FREE(marker);
+}
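+
+/* Worked example of the conversion above (0-based, m = n = 3):
+ *   A = [ 1 0 2 ; 0 3 0 ; 4 0 5 ]
+ *   CSR in :  a = {1,2,3,4,5}, colind = {0,2,1,0,2}, rowptr = {0,2,3,5}
+ *   CSC out: at = {1,4,3,2,5}, rowind = {0,2,1,0,2}, colptr = {0,2,3,5}
+ * The first pass counts the entries of each column in marker[]; the second
+ * pass scatters each a[j] into its column segment, advancing marker[col]. */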
+
+/*! \brief Copy matrix A into matrix B. */
+void
+sCopy_CompCol_Matrix_dist(SuperMatrix *A, SuperMatrix *B)
+{
+    NCformat *Astore, *Bstore;
+    int      ncol, nnz, i;
+
+    B->Stype = A->Stype;
+    B->Dtype = A->Dtype;
+    B->Mtype = A->Mtype;
+    B->nrow  = A->nrow;
+    B->ncol  = ncol = A->ncol;
+    Astore   = (NCformat *) A->Store;
+    Bstore   = (NCformat *) B->Store;
+    Bstore->nnz = nnz = Astore->nnz;
+    for (i = 0; i < nnz; ++i)
+	((float *)Bstore->nzval)[i] = ((float *)Astore->nzval)[i];
+    for (i = 0; i < nnz; ++i) Bstore->rowind[i] = Astore->rowind[i];
+    for (i = 0; i <= ncol; ++i) Bstore->colptr[i] = Astore->colptr[i];
+}
+
+
+void sPrint_CompCol_Matrix_dist(SuperMatrix *A)
+{
+    NCformat     *Astore;
+    register int i;
+    float       *dp;
+
+    printf("\nCompCol matrix: ");
+    printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
+    Astore = (NCformat *) A->Store;
+    printf("nrow %lld, ncol %lld, nnz %lld\n", (long long) A->nrow,
+	    (long long) A->ncol, (long long) Astore->nnz);
+    if ( (dp = (float *) Astore->nzval) != NULL ) {
+        printf("nzval:\n");
+        for (i = 0; i < Astore->nnz; ++i) printf("%f  ", dp[i]);
+    }
+    printf("\nrowind:\n");
+    for (i = 0; i < Astore->nnz; ++i)
+        printf("%lld  ", (long long) Astore->rowind[i]);
+    printf("\ncolptr:\n");
+    for (i = 0; i <= A->ncol; ++i)
+        printf("%lld  ", (long long) Astore->colptr[i]);
+    printf("\nend CompCol matrix.\n");
+}
+
+void sPrint_Dense_Matrix_dist(SuperMatrix *A)
+{
+    DNformat     *Astore;
+    register int i;
+    float       *dp;
+
+    printf("\nDense matrix: ");
+    printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
+    Astore = (DNformat *) A->Store;
+    dp = (float *) Astore->nzval;
+    printf("nrow %lld, ncol %lld, lda %lld\n",
+        (long long) A->nrow, (long long) A->ncol, (long long) Astore->lda);
+    printf("\nnzval: ");
+    for (i = 0; i < A->nrow; ++i) printf("%f  ", dp[i]);
+    printf("\nend Dense matrix.\n");
+}
+
+int sPrint_CompRowLoc_Matrix_dist(SuperMatrix *A)
+{
+    NRformat_loc  *Astore;
+    int_t  nnz_loc, m_loc;
+    float  *dp;
+
+    printf("\n==== CompRowLoc matrix: ");
+    printf("Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
+    Astore = (NRformat_loc *) A->Store;
+    printf("nrow %ld, ncol %ld\n",
+            (long int) A->nrow, (long int) A->ncol);
+    nnz_loc = Astore->nnz_loc; m_loc = Astore->m_loc;
+    printf("nnz_loc %ld, m_loc %ld, fst_row %ld\n", (long int) nnz_loc,
+            (long int) m_loc, (long int) Astore->fst_row);
+    PrintInt10("rowptr", m_loc+1, Astore->rowptr);
+    PrintInt10("colind", nnz_loc, Astore->colind);
+    if ( (dp = (float *) Astore->nzval) != NULL )
+        Printfloat5("nzval", nnz_loc, dp);
+    printf("==== end CompRowLoc matrix\n");
+    return 0;
+}
+
+int file_sPrint_CompRowLoc_Matrix_dist(FILE *fp, SuperMatrix *A)
+{
+    NRformat_loc     *Astore;
+    int_t  nnz_loc, m_loc;
+    float       *dp;
+
+    fprintf(fp, "\n==== CompRowLoc matrix: ");
+    fprintf(fp, "Stype %d, Dtype %d, Mtype %d\n", A->Stype,A->Dtype,A->Mtype);
+    Astore = (NRformat_loc *) A->Store;
+    fprintf(fp, "nrow %ld, ncol %ld\n", (long int) A->nrow, (long int) A->ncol);
+    nnz_loc = Astore->nnz_loc; m_loc = Astore->m_loc;
+    fprintf(fp, "nnz_loc %ld, m_loc %ld, fst_row %ld\n", (long int) nnz_loc,
+            (long int) m_loc, (long int) Astore->fst_row);
+    file_PrintInt10(fp, "rowptr", m_loc+1, Astore->rowptr);
+    file_PrintInt10(fp, "colind", nnz_loc, Astore->colind);
+    if ( (dp = (float *) Astore->nzval) != NULL )
+        file_Printfloat5(fp, "nzval", nnz_loc, dp);
+    fprintf(fp, "==== end CompRowLoc matrix\n");
+    return 0;
+}
+
+void
+sCreate_Dense_Matrix_dist(SuperMatrix *X, int_t m, int_t n, float *x,
+			  int_t ldx, Stype_t stype, Dtype_t dtype,
+			  Mtype_t mtype)
+{
+    DNformat    *Xstore;
+
+    X->Stype = stype;
+    X->Dtype = dtype;
+    X->Mtype = mtype;
+    X->nrow = m;
+    X->ncol = n;
+    X->Store = (void *) SUPERLU_MALLOC( sizeof(DNformat) );
+    if ( !(X->Store) ) ABORT("SUPERLU_MALLOC fails for X->Store");
+    Xstore = (DNformat *) X->Store;
+    Xstore->lda = ldx;
+    Xstore->nzval = (float *) x;
+}
+
+void
+sCopy_Dense_Matrix_dist(int_t M, int_t N, float *X, int_t ldx,
+			float *Y, int_t ldy)
+{
+/*! \brief
+ *
+ * <pre>
+ *  Purpose
+ *  =======
+ *
+ *  Copies a two-dimensional matrix X to another matrix Y.
+ * </pre>
+ */
+    int  i, j;
+
+    for (j = 0; j < N; ++j)
+        for (i = 0; i < M; ++i)
+            Y[i + j*ldy] = X[i + j*ldx];
+}
+
+void
+sCreate_SuperNode_Matrix_dist(SuperMatrix *L, int_t m, int_t n, int_t nnz,
+			      float *nzval, int_t *nzval_colptr,
+			      int_t *rowind, int_t *rowind_colptr,
+			      int_t *col_to_sup, int_t *sup_to_col,
+			      Stype_t stype, Dtype_t dtype, Mtype_t mtype)
+{
+    SCformat *Lstore;
+
+    L->Stype = stype;
+    L->Dtype = dtype;
+    L->Mtype = mtype;
+    L->nrow = m;
+    L->ncol = n;
+    L->Store = (void *) SUPERLU_MALLOC( sizeof(SCformat) );
+    if ( !(L->Store) ) ABORT("SUPERLU_MALLOC fails for L->Store");
+    Lstore = L->Store;
+    Lstore->nnz = nnz;
+    Lstore->nsuper = col_to_sup[n];
+    Lstore->nzval = nzval;
+    Lstore->nzval_colptr = nzval_colptr;
+    Lstore->rowind = rowind;
+    Lstore->rowind_colptr = rowind_colptr;
+    Lstore->col_to_sup = col_to_sup;
+    Lstore->sup_to_col = sup_to_col;
+
+}
+
+/**** The following utilities are added per request of SUNDIALS ****/
+
+/*! \brief Clone: Allocate memory for a new matrix B, which is of the same type
+ *  and shape as A.
+ *  The clone operation copies all the non-pointer structure members like
+ *  nrow, ncol, Stype, Dtype, Mtype from A and allocates a new nested Store
+ *  structure. It also copies nnz_loc, m_loc, fst_row from A->Store
+ *  into B->Store. It does not copy the matrix entries, row pointers,
+ *  or column indices.
+ */
+void sClone_CompRowLoc_Matrix_dist(SuperMatrix *A, SuperMatrix *B)
+{
+    NRformat_loc  *Astore, *Bstore;
+
+    B->Stype = A->Stype;
+    B->Dtype = A->Dtype;
+    B->Mtype = A->Mtype;
+    B->nrow  = A->nrow;
+    B->ncol  = A->ncol;
+    Astore   = (NRformat_loc *) A->Store;
+    B->Store = (void *) SUPERLU_MALLOC( sizeof(NRformat_loc) );
+    if ( !(B->Store) ) ABORT("SUPERLU_MALLOC fails for B->Store");
+    Bstore = (NRformat_loc *) B->Store;
+
+    Bstore->nnz_loc = Astore->nnz_loc;
+    Bstore->m_loc = Astore->m_loc;
+    Bstore->fst_row = Astore->fst_row;
+    if ( !(Bstore->nzval = (float *) floatMalloc_dist(Bstore->nnz_loc)) )
+        ABORT("floatMalloc_dist fails for Bstore->nzval");
+    if ( !(Bstore->colind = (int_t *) intMalloc_dist(Bstore->nnz_loc)) )
+        ABORT("intMalloc_dist fails for Bstore->colind");
+    if ( !(Bstore->rowptr = (int_t *) intMalloc_dist(Bstore->m_loc + 1)) )
+        ABORT("intMalloc_dist fails for Bstore->rowptr");
+
+    return;
+}
+
+/* \brief Copy: copies all entries, row pointers, and column indices of
+ *  a matrix into another matrix of the same type,
+ *  B_{i,j}=A_{i,j}, for i,j=1,...,n
+ */
+void sCopy_CompRowLoc_Matrix_dist(SuperMatrix *A, SuperMatrix *B)
+{
+    NRformat_loc  *Astore, *Bstore;
+
+    Astore = (NRformat_loc *) A->Store;
+    Bstore = (NRformat_loc *) B->Store;
+
+    memcpy(Bstore->nzval, Astore->nzval, Astore->nnz_loc * sizeof(float));
+    memcpy(Bstore->colind, Astore->colind, Astore->nnz_loc * sizeof(int_t));
+    memcpy(Bstore->rowptr, Astore->rowptr, (Astore->m_loc+1) * sizeof(int_t));
+
+    return;
+}
+
+/*! \brief Sets all entries of a matrix to zero, A_{i,j}=0, for i,j=1,..,n */
+void sZero_CompRowLoc_Matrix_dist(SuperMatrix *A)
+{
+    float zero = 0.0;
+    NRformat_loc  *Astore = A->Store;
+    float *aval;
+    int_t i;
+
+    aval = (float *) Astore->nzval;
+    for (i = 0; i < Astore->nnz_loc; ++i) aval[i] = zero;
+
+    return;
+}
+
+/*! \brief Scale and add I: scales a matrix and adds an identity.
+ *  A_{i,j} = c * A_{i,j} + \delta_{i,j} for i,j=1,...,n and
+ *  \delta_{i,j} is the Kronecker delta.
+ */
+void sScaleAddId_CompRowLoc_Matrix_dist(SuperMatrix *A, float c)
+{
+    float one = 1.0;
+    NRformat_loc  *Astore = A->Store;
+    float *aval = (float *) Astore->nzval;
+    int i, j;
+    float temp;
+
+    for (i = 0; i < Astore->m_loc; ++i) { /* Loop through each row */
+        for (j = Astore->rowptr[i]; j < Astore->rowptr[i+1]; ++j) {
+            if ( (Astore->fst_row + i) == Astore->colind[j] ) {  /* diagonal */
+                temp = aval[j] * c;
+                aval[j] = temp + one;
+            } else {
+                aval[j] *= c;
+            }
+        }
+    }
+
+    return;
+}
+
+/*! \brief Scale and add: adds a scalar multiple of one matrix to another.
+ *  A_{i,j} = c * A_{i,j} + B_{i,j} for i,j=1,...,n
+ */
+void sScaleAdd_CompRowLoc_Matrix_dist(SuperMatrix *A, SuperMatrix *B, float c)
+{
+    NRformat_loc  *Astore = A->Store;
+    NRformat_loc  *Bstore = B->Store;
+    float *aval = (float *) Astore->nzval, *bval = (float *) Bstore->nzval;
+    int_t i;
+    float temp;
+
+    for (i = 0; i < Astore->nnz_loc; ++i) { /* Loop through each nonzero */
+        aval[i] = c * aval[i] + bval[i];
+    }
+
+    return;
+}
+
+/*! \brief Allocate storage in ScalePermstruct */
+void sScalePermstructInit(const int_t m, const int_t n,
+                          sScalePermstruct_t *ScalePermstruct)
+{
+    ScalePermstruct->DiagScale = NOEQUIL;
+    if ( !(ScalePermstruct->perm_r = intMalloc_dist(m)) )
+        ABORT("Malloc fails for perm_r[].");
+    if ( !(ScalePermstruct->perm_c = intMalloc_dist(n)) )
+        ABORT("Malloc fails for perm_c[].");
+}
+
+/*! \brief Deallocate ScalePermstruct */
+void sScalePermstructFree(sScalePermstruct_t *ScalePermstruct)
+{
+    SUPERLU_FREE(ScalePermstruct->perm_r);
+    SUPERLU_FREE(ScalePermstruct->perm_c);
+    switch ( ScalePermstruct->DiagScale ) {
+      case ROW:
+        SUPERLU_FREE(ScalePermstruct->R);
+        break;
+      case COL:
+        SUPERLU_FREE(ScalePermstruct->C);
+        break;
+      case BOTH:
+        SUPERLU_FREE(ScalePermstruct->R);
+        SUPERLU_FREE(ScalePermstruct->C);
+        break;
+      default: break;
+    }
+}
+
+/*
+ * The following are from 3D code p3dcomm.c
+ */
+
+int sAllocGlu_3d(int_t n, int_t nsupers, sLUstruct_t * LUstruct)
+{
+    /*broadcasting Glu_persist*/
+    LUstruct->Glu_persist->xsup  = intMalloc_dist(nsupers+1); //INT_T_ALLOC(nsupers+1);
+    LUstruct->Glu_persist->supno = intMalloc_dist(n); //INT_T_ALLOC(n);
+    return 0;
+}
+
+// Sherry added
+int sDeAllocGlu_3d(sLUstruct_t * LUstruct)
+{
+    SUPERLU_FREE(LUstruct->Glu_persist->xsup);
+    SUPERLU_FREE(LUstruct->Glu_persist->supno);
+    return 0;
+}
+
+int sDeAllocLlu_3d(int_t n, sLUstruct_t * LUstruct, gridinfo3d_t* grid3d)
+{
+    int i, nbc, nbr, nsupers;
+    sLocalLU_t *Llu = LUstruct->Llu;
+
+    nsupers = (LUstruct->Glu_persist)->supno[n-1] + 1;
+
+    nbc = CEILING(nsupers, grid3d->npcol);
+    for (i = 0; i < nbc; ++i)
+        if ( Llu->Lrowind_bc_ptr[i] ) {
+            SUPERLU_FREE (Llu->Lrowind_bc_ptr[i]);
+            SUPERLU_FREE (Llu->Lnzval_bc_ptr[i]);
+        }
+    SUPERLU_FREE (Llu->Lrowind_bc_ptr);
+    SUPERLU_FREE (Llu->Lnzval_bc_ptr);
+
+    nbr = CEILING(nsupers, grid3d->nprow);
+    for (i = 0; i < nbr; ++i)
+        if ( Llu->Ufstnz_br_ptr[i] ) {
+            SUPERLU_FREE (Llu->Ufstnz_br_ptr[i]);
+            SUPERLU_FREE (Llu->Unzval_br_ptr[i]);
+        }
+    SUPERLU_FREE (Llu->Ufstnz_br_ptr);
+    SUPERLU_FREE (Llu->Unzval_br_ptr);
+
+    /* The following can be freed after factorization. */
+    SUPERLU_FREE(Llu->ToRecv);
+    SUPERLU_FREE(Llu->ToSendD);
+    for (i = 0; i < nbc; ++i) SUPERLU_FREE(Llu->ToSendR[i]);
+    SUPERLU_FREE(Llu->ToSendR);
+    return 0;
+} /* sDeAllocLlu_3d */
+
+
+/**** Other utilities ****/
+void
+sGenXtrue_dist(int_t n, int_t nrhs, float *x, int_t ldx)
+{
+    int i, j;
+    for (j = 0; j < nrhs; ++j)
+        for (i = 0; i < n; ++i) {
+            if ( i % 2 ) x[i + j*ldx] = 1.0 + (double)(i+1.)/n;
+            else x[i + j*ldx] = 1.0 - (double)(i+1.)/n;
+        }
+}
+
+/*! \brief Let rhs[i] = sum of i-th row of A, so the solution vector is all 1's
+ */
+void
+sFillRHS_dist(char *trans, int_t nrhs, float *x, int_t ldx,
+              SuperMatrix *A, float *rhs, int_t ldb)
+{
+    float one = 1.0;
+    float zero = 0.0;
+
+    sp_sgemm_dist(trans, nrhs, one, A, x, ldx, zero, rhs, ldb);
+
+}
+
+/*! \brief Fills a float precision array with a given value.
+ */
+void
+sfill_dist(float *a, int_t alen, float dval)
+{
+    register int_t i;
+    for (i = 0; i < alen; i++) a[i] = dval;
+}
+
+
+
+/*! \brief Check the inf-norm of the error vector
+ */
+void sinf_norm_error_dist(int_t n, int_t nrhs, float *x, int_t ldx,
+                          float *xtrue, int_t ldxtrue,
+                          gridinfo_t *grid)
+{
+    double err, xnorm;
+    float *x_work, *xtrue_work;
+    int i, j;
+
+    for (j = 0; j < nrhs; j++) {
+        x_work = &x[j*ldx];
+        xtrue_work = &xtrue[j*ldxtrue];
+        err = xnorm = 0.0;
+        for (i = 0; i < n; i++) {
+            err = SUPERLU_MAX(err, fabs(x_work[i] - xtrue_work[i]));
+            xnorm = SUPERLU_MAX(xnorm, fabs(x_work[i]));
+        }
+        err = err / xnorm;
+        printf("\tRHS %2d: ||X-Xtrue||/||X|| = %e\n", j, err);
+    }
+}
+
+void Printfloat5(char *name, int_t len, float *x)
+{
+    register int_t i;
+
+    printf("%10s:", name);
+    for (i = 0; i < len; ++i) {
+        if ( i % 5 == 0 ) printf("\n[%ld-%ld] ", (long int) i, (long int) i+4);
+        printf("%14e", x[i]);
+    }
+    printf("\n");
+}
+
+int file_Printfloat5(FILE *fp, char *name, int_t len, float *x)
+{
+    register int_t i;
+
+    fprintf(fp, "%10s:", name);
+    for (i = 0; i < len; ++i) {
+        if ( i % 5 == 0 ) fprintf(fp, "\n[%ld-%ld] ", (long int) i, (long int) i+4);
+        fprintf(fp, "%14e", x[i]);
+    }
+    fprintf(fp, "\n");
+    return 0;
+}
+
+/*! \brief Print the blocks in the factored matrix L.
+ */
+void sPrintLblocks(int iam, int_t nsupers, gridinfo_t *grid,
+                   Glu_persist_t *Glu_persist, sLocalLU_t *Llu)
+{
+    register int c, extra, gb, j, lb, nsupc, nsupr, len, nb, ncb;
+    register int_t k, mycol, r;
+    int_t *xsup = Glu_persist->xsup;
+    int_t *index;
+    float *nzval;
+
+    printf("\n[%d] L BLOCKS IN COLUMN-MAJOR ORDER -->\n", iam);
+    ncb = nsupers / grid->npcol;
+    extra = nsupers % grid->npcol;
+    mycol = MYCOL( iam, grid );
+    if ( mycol < extra ) ++ncb;
+    for (lb = 0; lb < ncb; ++lb) {
+        index = Llu->Lrowind_bc_ptr[lb];
+        if ( index ) { /* Not an empty column */
+            nzval = Llu->Lnzval_bc_ptr[lb];
+            nb = index[0];
+            nsupr = index[1];
+            gb = lb * grid->npcol + mycol;
+            nsupc = SuperSize( gb );
+            printf("[%d] block column %d (local # %d), nsupc %d, # row blocks %d\n",
+                   iam, gb, lb, nsupc, nb);
+            for (c = 0, k = BC_HEADER, r = 0; c < nb; ++c) {
+                len = index[k+1];
+                printf("[%d] row-block %d: block # " IFMT "\tlength %d\n",
+                       iam, c, index[k], len);
+                PrintInt10("lsub", len, &index[k+LB_DESCRIPTOR]);
+                for (j = 0; j < nsupc; ++j) {
+                    Printfloat5("nzval", len, &nzval[r + j*nsupr]);
+                }
+                k += LB_DESCRIPTOR + len;
+                r += len;
+            }
+        }
+        printf("(%d)", iam);
+        PrintInt32("ToSendR[]", grid->npcol, Llu->ToSendR[lb]);
+        PrintInt10("fsendx_plist[]", grid->nprow, Llu->fsendx_plist[lb]);
+    }
+    printf("nfrecvx " IFMT "\n", Llu->nfrecvx);
+    k = CEILING( nsupers, grid->nprow );
+    PrintInt10("fmod", k, Llu->fmod);
+
+} /* SPRINTLBLOCKS */
+
+
+/*! \brief Sets all entries of matrix L to zero.
+ */
+void sZeroLblocks(int iam, int_t n, gridinfo_t *grid, sLUstruct_t *LUstruct)
+{
+    float zero = 0.0;
+    register int extra, gb, j, lb, nsupc, nsupr, ncb;
+    register int_t k, mycol, r;
+    sLocalLU_t *Llu = LUstruct->Llu;
+    Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
+    int_t *xsup = Glu_persist->xsup;
+    int_t *index;
+    float *nzval;
+    int_t nsupers = Glu_persist->supno[n-1] + 1;
+
+    ncb = nsupers / grid->npcol;
+    extra = nsupers % grid->npcol;
+    mycol = MYCOL( iam, grid );
+    if ( mycol < extra ) ++ncb;
+    for (lb = 0; lb < ncb; ++lb) {
+        index = Llu->Lrowind_bc_ptr[lb];
+        if ( index ) { /* Not an empty column */
+            nzval = Llu->Lnzval_bc_ptr[lb];
+            nsupr = index[1];
+            gb = lb * grid->npcol + mycol;
+            nsupc = SuperSize( gb );
+            for (j = 0; j < nsupc; ++j) {
+                for (r = 0; r < nsupr; ++r) {
+                    nzval[r + j*nsupr] = zero;
+                }
+            }
+        }
+    }
+} /* sZeroLblocks */
+
+
+/*! \brief Dump the factored matrix L using matlab triplet format
+ */
+void sDumpLblocks(int iam, int_t nsupers, gridinfo_t *grid,
+                  Glu_persist_t *Glu_persist, sLocalLU_t *Llu)
+{
+    register int c, extra, gb, j, i, lb, nsupc, nsupr, len, nb, ncb;
+    int k, mycol, r, n, nmax;
+    int_t nnzL;
+    int_t *xsup = Glu_persist->xsup;
+    int_t *index;
+    float *nzval;
+    char filename[256];
+    FILE *fp, *fopen();
+
+    // assert(grid->npcol*grid->nprow==1);
+
+    // count nonzeros in the first pass
+    nnzL = 0;
+    n = 0;
+    ncb = nsupers / grid->npcol;
+    extra = nsupers % grid->npcol;
+    mycol = MYCOL( iam, grid );
+    if ( mycol < extra ) ++ncb;
+    for (lb = 0; lb < ncb; ++lb) {
+        index = Llu->Lrowind_bc_ptr[lb];
+        if ( index ) { /* Not an empty column */
+            nzval = Llu->Lnzval_bc_ptr[lb];
+            nb = index[0];
+            nsupr = index[1];
+            gb = lb * grid->npcol + mycol;
+            nsupc = SuperSize( gb );
+            for (c = 0, k = BC_HEADER, r = 0; c < nb; ++c) {
+                len = index[k+1];
+
+                for (j = 0; j < nsupc; ++j) {
+                    /* inner loop bound reconstructed; "i < len" and the
+                       comparison were lost in extraction */
+                    for (i = 0; i < len; ++i) {
+                        if (index[k+LB_DESCRIPTOR+i]+1 >= xsup[gb]+j+1) {
+                            nnzL ++;
+                            nmax = SUPERLU_MAX(n,index[k+LB_DESCRIPTOR+i]+1);
+                            n = nmax;
+                        }
+
+                    }
+                }
+                k += LB_DESCRIPTOR + len;
+                r += len;
+            }
+        }
+    }
+    MPI_Allreduce(MPI_IN_PLACE,&nnzL,1,mpi_int_t,MPI_SUM,grid->comm);
+    MPI_Allreduce(MPI_IN_PLACE,&n,1,mpi_int_t,MPI_MAX,grid->comm);
+
+    snprintf(filename, sizeof(filename), "%s-%d", "L", iam);
+    printf("Dumping L factor to --> %s\n", filename);
+    if ( !(fp = fopen(filename, "w")) ) {
+        ABORT("File open failed");
+    }
+
+    if(grid->iam==0){
+        fprintf(fp, "%d %d " IFMT "\n", n,n,nnzL);
+    }
+
+    ncb = nsupers / grid->npcol;
+    extra = nsupers % grid->npcol;
+    mycol = MYCOL( iam, grid );
+    if ( mycol < extra ) ++ncb;
+    for (lb = 0; lb < ncb; ++lb) {
+        index = Llu->Lrowind_bc_ptr[lb];
+        if ( index ) { /* Not an empty column */
+            nzval = Llu->Lnzval_bc_ptr[lb];
+            nb = index[0];
+            nsupr = index[1];
+            gb = lb * grid->npcol + mycol;
+            nsupc = SuperSize( gb );
+            for (c = 0, k = BC_HEADER, r = 0; c < nb; ++c) {
+                len = index[k+1];
+
+                for (j = 0; j < nsupc; ++j) {
+                    /* triplet-writing loop reconstructed from the
+                       d-precision counterpart; the body was lost in
+                       extraction */
+                    for (i = 0; i < len; ++i) {
+                        fprintf(fp, "%lld %lld %e\n",
+                                (long long) index[k+LB_DESCRIPTOR+i]+1,
+                                (long long) xsup[gb]+j+1,
+                                nzval[r + i + j*nsupr]);
+                    }
+                }
+                k += LB_DESCRIPTOR + len;
+                r += len;
+            }
+        }
+    }
+    fclose(fp);
+
+} /* sDumpLblocks */
+
+
+/*! \brief Print the blocks in the factored matrix U.
+ */
+void sPrintUblocks(int iam, int_t nsupers, gridinfo_t *grid,
+                   Glu_persist_t *Glu_persist, sLocalLU_t *Llu)
+{
+    register int c, extra, jb, k, lb, len, nb, nrb, nsupc;
+    register int_t myrow, r;
+    int_t *xsup = Glu_persist->xsup;
+    int_t *index;
+    float *nzval;
+
+    printf("\n[%d] U BLOCKS IN ROW-MAJOR ORDER -->\n", iam);
+    nrb = nsupers / grid->nprow;
+    extra = nsupers % grid->nprow;
+    myrow = MYROW( iam, grid );
+    if ( myrow < extra ) ++nrb;
+    for (lb = 0; lb < nrb; ++lb) {
+        index = Llu->Ufstnz_br_ptr[lb];
+        if ( index ) { /* Not an empty row */
+            nzval = Llu->Unzval_br_ptr[lb];
+            nb = index[0];
+            printf("[%d] block row " IFMT " (local # %d), # column blocks %d\n",
+                   iam, lb*grid->nprow+myrow, lb, nb);
+            r  = 0;
+            for (c = 0, k = BR_HEADER; c < nb; ++c) {
+                jb = index[k];
+                len = index[k+1];
+                printf("[%d] col-block %d: block # %d\tlength " IFMT "\n",
+                       iam, c, jb, index[k+1]);
+                nsupc = SuperSize( jb );
+                PrintInt10("fstnz", nsupc, &index[k+UB_DESCRIPTOR]);
+                Printfloat5("nzval", len, &nzval[r]);
+                k += UB_DESCRIPTOR + nsupc;
+                r += len;
+            }
+
+            printf("[%d] ToSendD[] %d\n", iam, Llu->ToSendD[lb]);
+        }
+    }
+} /* SPRINTUBLOCKS */
+
+int
+sprint_gsmv_comm(FILE *fp, int_t m_loc, psgsmv_comm_t *gsmv_comm,
+                 gridinfo_t *grid)
+{
+    int_t procs = grid->nprow*grid->npcol;
+    fprintf(fp, "TotalIndSend " IFMT "\tTotalValSend " IFMT "\n", gsmv_comm->TotalIndSend,
+            gsmv_comm->TotalValSend);
+    file_PrintInt10(fp, "extern_start", m_loc, gsmv_comm->extern_start);
+    file_PrintInt10(fp, "ind_tosend", gsmv_comm->TotalIndSend, gsmv_comm->ind_tosend);
+    file_PrintInt10(fp, "ind_torecv", gsmv_comm->TotalValSend, gsmv_comm->ind_torecv);
+    file_PrintInt10(fp, "ptr_ind_tosend", procs+1, gsmv_comm->ptr_ind_tosend);
+    file_PrintInt10(fp, "ptr_ind_torecv", procs+1, gsmv_comm->ptr_ind_torecv);
+    file_PrintInt32(fp, "SendCounts", procs, gsmv_comm->SendCounts);
+    file_PrintInt32(fp, "RecvCounts", procs, gsmv_comm->RecvCounts);
+    return 0;
+}
+
+
+void
+sGenXtrueRHS(int nrhs, SuperMatrix *A, Glu_persist_t *Glu_persist,
+             gridinfo_t *grid, float **xact, int *ldx, float **b, int *ldb)
+{
+    int_t  gb, gbrow, i, iam, irow, j, lb, lsup, myrow, n, nlrows,
+           nsupr, nsupers, rel;
+    int_t  *supno, *xsup, *lxsup;
+    float  *x, *bb;
+    NCformat *Astore;
+    float  *aval;
+
+    n = A->ncol;
+    *ldb = 0;
+    supno = Glu_persist->supno;
+    xsup = Glu_persist->xsup;
+    nsupers = supno[n-1] + 1;
+    iam = grid->iam;
+    myrow = MYROW( iam, grid );
+    Astore = (NCformat *) A->Store;
+    aval = Astore->nzval;
+    lb = CEILING( nsupers, grid->nprow ) + 1;
+    if ( !(lxsup = intMalloc_dist(lb)) )
+        ABORT("Malloc fails for lxsup[].");
+
+    lsup = 0;
+    nlrows = 0;
+    for (j = 0; j < nsupers; ++j) {
+        i = PROW( j, grid );
+        if ( myrow == i ) {
+            nsupr = SuperSize( j );
+            *ldb += nsupr;
+            lxsup[lsup++] = nlrows;
+            nlrows += nsupr;
+        }
+    }
+    *ldx = n;
+    if ( !(x = floatMalloc_dist(((size_t)*ldx) * nrhs)) )
+        ABORT("Malloc fails for x[].");
+    if ( !(bb = floatCalloc_dist(*ldb * nrhs)) )
+        ABORT("Calloc fails for bb[].");
+    for (j = 0; j < nrhs; ++j)
+        for (i = 0; i < n; ++i) x[i + j*(*ldx)] = 1.0;
+
+    /* Form b = A*x. */
+    for (j = 0; j < n; ++j)
+        for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
+            irow = Astore->rowind[i];
+            gb = supno[irow];
+            gbrow = PROW( gb, grid );
+            if ( myrow == gbrow ) {
+                rel = irow - xsup[gb];
+                lb = LBi( gb, grid );
+                bb[lxsup[lb] + rel] += aval[i] * x[j];
+            }
+        }
+
+    /* Memory allocated but not freed: xact, b */
+    *xact = x;
+    *b = bb;
+
+    SUPERLU_FREE(lxsup);
+
+#if ( PRNTlevel>=2 )
+    for (i = 0; i < grid->nprow*grid->npcol; ++i) {
+        if ( iam == i ) {
+            printf("\n(%d)\n", iam);
+            Printfloat5("rhs", *ldb, *b);
+        }
+        MPI_Barrier( grid->comm );
+    }
+#endif
+
+} /* GENXTRUERHS */
+
+/* g5.rua
+          b = A*x    y = L\b
+    0     1          1.0000
+    1     0          0.2500
+    2     1          1.0000
+    3     2          2.0000
+    4     1          1.7500
+    5     1          1.8917
+    6     0          1.1879
+    7     2          2.0000
+    8     2          2.0000
+    9     1          1.0000
+   10     1          1.7500
+   11     0          0
+   12     1          1.8750
+   13     2          2.0000
+   14     1          1.0000
+   15     0          0.2500
+   16     1          1.7667
+   17     0          0.6419
+   18     1          2.2504
+   19     0          1.1563
+   20     0          0.9069
+   21     0          1.4269
+   22     1          2.7510
+   23     1          2.2289
+   24     0          2.4332
+
+   g6.rua
+          b=A*x      y=L\b
+    0     0          0
+    1     1          1.0000
+    2     1          1.0000
+    3     2          2.5000
+    4     0          0
+    5     2          2.0000
+    6     1          1.0000
+    7     1          1.7500
+    8     1          1.0000
+    9     0          0.2500
+   10     0          0.5667
+   11     1          2.0787
+   12     0          0.8011
+   13     1          1.9838
+   14     1          1.0000
+   15     1          1.0000
+   16     2          2.5000
+   17     0          0.8571
+   18     0          0
+   19     1          1.0000
+   20     0          0.2500
+   21     1          1.0000
+   22     2          2.0000
+   23     1          1.7500
+   24     1          1.8917
+   25     0          1.1879
+   26     0          0.8011
+   27     1          1.9861
+   28     1          2.0199
+   29     0          1.3620
+   30     0          0.6136
+   31     1          2.3677
+   32     0          1.1011
+   33     0          1.5258
+   34     0          1.7628
+   35     0          2.1658
+*/
diff --git a/SRC/treeFactorizationGPU.c b/SRC/treeFactorizationGPU.c
index 45d4f8c0..547e53c5 100644
--- a/SRC/treeFactorizationGPU.c
+++ b/SRC/treeFactorizationGPU.c
@@ -1,11 +1,6 @@
 // #include "treeFactorization.h"
 // #include "trfCommWrapper.h"
 #include "dlustruct_gpu.h"
-#ifdef __INTEL_COMPILER
-#include "mkl.h"
-#else
-//#include "cblas.h"
-#endif
 /*
  /-- num_u_blks--\ /-- num_u_blks_Phi --\

From 060aa595eaecad0e4b00ade66d377b53325d8657 Mon Sep 17 00:00:00 2001
From: Yang Liu
Date: Tue, 6 Jul 2021 22:42:48 -0700
Subject: [PATCH 104/147] added LUstruct->dt = 'x' in pxgssvx3d

---
 SRC/pdgssvx3d.c | 2 +-
 SRC/psgssvx3d.c | 2 +-
 SRC/pzgssvx3d.c | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/SRC/pdgssvx3d.c b/SRC/pdgssvx3d.c
index 74ec2be4..c4dd8d91 100644
--- a/SRC/pdgssvx3d.c
+++ b/SRC/pdgssvx3d.c
@@ -548,7 +548,7 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 #if ( PRNTlevel>= 2 )
     double dmin, dsum, dprod;
 #endif
-
+    LUstruct->dt = 'd';
     // get the 2d grid
     gridinfo_t *grid = &(grid3d->grid2d);
     iam = grid->iam;
diff --git a/SRC/psgssvx3d.c b/SRC/psgssvx3d.c
index 9b057aed..ce73ad8a 100644
--- a/SRC/psgssvx3d.c
+++ b/SRC/psgssvx3d.c
@@ -548,7 +548,7 @@ psgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 #if ( PRNTlevel>= 2 )
     double dmin, dsum, dprod;
 #endif
-
+    LUstruct->dt = 's';
     // get the 2d grid
     gridinfo_t *grid = &(grid3d->grid2d);
     iam = grid->iam;
diff --git a/SRC/pzgssvx3d.c b/SRC/pzgssvx3d.c
index 479296ae..eba2489c 100644
--- a/SRC/pzgssvx3d.c
+++ b/SRC/pzgssvx3d.c
@@ -547,7 +547,7 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 #if ( PRNTlevel>= 2 )
     double dmin, dsum, dprod;
 #endif
-
+    LUstruct->dt = 'z';
     // get the 2d grid
     gridinfo_t *grid = &(grid3d->grid2d);
     iam = grid->iam;

From 22cb04fc137ecfd4ecdd2bf57b09b882c062e8b3 Mon Sep 17 00:00:00 2001
From: Sherry Li
Date: Tue, 13 Jul 2021 14:28:15 -0700
Subject: [PATCH 105/147] update the default blocking sizes to be larger.

---
 FORTRAN/superlu_dist_config.fh  | 2 --
 SRC/sp_ienv.c                   | 8 ++++----
 SRC/superlu_FortranCInterface.h | 8 ++++----
 3 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/FORTRAN/superlu_dist_config.fh b/FORTRAN/superlu_dist_config.fh
index 878933ff..cbe990cc 100644
--- a/FORTRAN/superlu_dist_config.fh
+++ b/FORTRAN/superlu_dist_config.fh
@@ -1,11 +1,9 @@
-#define HAVE_CUDA TRUE
 
 #define HAVE_PARMETIS TRUE
 
-#define XSDK_INDEX_SIZE 64
 
 #if (XSDK_INDEX_SIZE == 64)
 #define _LONGINT 1
diff --git a/SRC/sp_ienv.c b/SRC/sp_ienv.c
index f4014dd2..84cf47fe 100644
--- a/SRC/sp_ienv.c
+++ b/SRC/sp_ienv.c
@@ -93,7 +93,7 @@ sp_ienv_dist(int ispec)
 	    return(atoi(ttemp));
 	}
 	else
-	    return 20;
+	    return 60; // 20
 
        case 3:
 	    ttemp = getenv("NSUP"); // take min of MAX_SUPER_SIZE in superlu_defs.h
@@ -102,7 +102,7 @@
 	    {
 		int k = SUPERLU_MIN( atoi(ttemp), MAX_SUPER_SIZE );
 		return (k);
 	    }
-	    else return 128;
+	    else return 256; // 128;
 #endif
 
        case 6:
@@ -112,11 +112,11 @@
        case 7:
 	    ttemp = getenv ("N_GEMM");
 	    if (ttemp) return atoi (ttemp);
-	    else return 10000;
+	    else return 100; // 10000;
 
        case 8:
 	    ttemp = getenv ("MAX_BUFFER_SIZE");
 	    if (ttemp) return atoi (ttemp);
-	    else return 256000000; // 16000^2
+	    else return 1000000000; // 256000000 = 16000^2
     }
 
     /* Invalid value for ISPEC */
diff --git a/SRC/superlu_FortranCInterface.h b/SRC/superlu_FortranCInterface.h
index c9fee77d..467bfb65 100644
--- a/SRC/superlu_FortranCInterface.h
+++ b/SRC/superlu_FortranCInterface.h
@@ -2,15 +2,15 @@
 #define FC_HEADER_INCLUDED
 
 /* Mangling for Fortran global symbols without underscores. */
-#define FC_GLOBAL(name,NAME) name
+#define FC_GLOBAL(name,NAME) name##_
 
 /* Mangling for Fortran global symbols with underscores. */
-#define FC_GLOBAL_(name,NAME) name
+#define FC_GLOBAL_(name,NAME) name##_
 
 /* Mangling for Fortran module symbols without underscores. */
-#define FC_MODULE(mod_name,name, mod_NAME,NAME) __##mod_name##_NMOD_##name
+#define FC_MODULE(mod_name,name, mod_NAME,NAME) __##mod_name##_MOD_##name
 
 /* Mangling for Fortran module symbols with underscores. */
-#define FC_MODULE_(mod_name,name, mod_NAME,NAME) __##mod_name##_NMOD_##name
+#define FC_MODULE_(mod_name,name, mod_NAME,NAME) __##mod_name##_MOD_##name
 
 #endif

From f4048dab828bf54a18a497cd76bf21004286fa3b Mon Sep 17 00:00:00 2001
From: Piyush Sao
Date: Thu, 22 Jul 2021 16:52:00 -0400
Subject: [PATCH 106/147] Adding a scan code

---
 SRC/dsuperlu_gpu.cu |  43 ++++++++++++++++
 SRC/scan.cu         | 123 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 166 insertions(+)
 create mode 100644 SRC/scan.cu

diff --git a/SRC/dsuperlu_gpu.cu b/SRC/dsuperlu_gpu.cu
index fe91bae5..91e07442 100644
--- a/SRC/dsuperlu_gpu.cu
+++ b/SRC/dsuperlu_gpu.cu
@@ -223,6 +223,49 @@ void device_scatter_u (int_t thread_id,
 	}
 }
 
+typedef int pfx_dtype ;
+__global__ void prescan(pfx_dtype *outArr, pfx_dtype *inArr, int n)
+{
+    extern __shared__ pfx_dtype temp[];
+    int thread_id = threadIdx.x;
+    int offset = 1;
+    temp[2*thread_id] = inArr[2*thread_id];
+    temp[2*thread_id+1] = inArr[2*thread_id+1];
+    /* up-sweep (reduce) phase of the Blelloch scan;
+       e.g. n=4, inArr={1,2,3,4}: up-sweep yields {1,3,3,7} */
+    for (int d = n>>1; d > 0; d >>= 1)
+    {
+        __syncthreads();
+        if (thread_id < d)
+        {
+            int ai = offset*(2*thread_id+1)-1;
+            int bi = offset*(2*thread_id+2)-1;
+            temp[bi] += temp[ai];
+        }
+        offset *= 2;
+    }
+
+    if (thread_id == 0) { temp[n - 1] = 0; }
+    /* down-sweep phase turns the reduction tree into an exclusive scan:
+       {1,3,3,7} -> {0,1,3,6} */
+    for (int d = 1; d < n; d *= 2)
+    {
+        offset >>= 1;
+        __syncthreads();
+        if (thread_id < d)
+        {
+            int ai = offset*(2*thread_id+1)-1;
+            int bi = offset*(2*thread_id+2)-1;
+            pfx_dtype t = temp[ai];
+            temp[ai] = temp[bi];
+            temp[bi] += t;
+        }
+    }
+    __syncthreads();
+    /* adding the input back gives the inclusive scan: {1,3,6,10} */
+    outArr[2*thread_id] = temp[2*thread_id]+ inArr[2*thread_id]; // write results to device memory
+    outArr[2*thread_id+1] = temp[2*thread_id+1]+ inArr[2*thread_id+1];
+    __syncthreads();
+    printf("xA[%d] = %d \n",2*thread_id , outArr[2*thread_id]);
+    printf("xA[%d] = %d \n",2*thread_id+1 , outArr[2*thread_id+1]);
+    __syncthreads();
+}
+
 
 __global__
 void Scatter_GPU_kernel(
diff --git a/SRC/scan.cu b/SRC/scan.cu
new file mode 100644
index 00000000..1080c25c
--- /dev/null
+++ b/SRC/scan.cu
@@ -0,0 +1,123 @@
+#include <cuda_runtime.h>  /* include targets lost in extraction; cuda_runtime.h and iostream assumed */
+#include <iostream>
+
+// typedef float pfx_dtype ;
+typedef int pfx_dtype ;
+__global__ void prescan(pfx_dtype *outArr, pfx_dtype *inArr, int n)
+{
+    extern __shared__ pfx_dtype temp[];
+    int thread_id = threadIdx.x;
+    int offset = 1;
+    temp[2*thread_id] = inArr[2*thread_id];
+    temp[2*thread_id+1] = inArr[2*thread_id+1];
+    for (int d = n>>1; d > 0; d >>= 1)
+    {
+        __syncthreads();
+        if (thread_id < d)
+        {
+            int ai = offset*(2*thread_id+1)-1;
+            int bi = offset*(2*thread_id+2)-1;
+            temp[bi] += temp[ai];
+        }
+        offset *= 2;
+    }
+
+    if (thread_id == 0) { temp[n - 1] = 0; }
+    for (int d = 1; d < n; d *= 2)
+    {
+        offset >>= 1;
+        __syncthreads();
+        if (thread_id < d)
+        {
+            int ai = offset*(2*thread_id+1)-1;
+            int bi = offset*(2*thread_id+2)-1;
+            pfx_dtype t = temp[ai];
+            temp[ai] = temp[bi];
+            temp[bi] += t;
+        }
+    }
+    __syncthreads();
+    outArr[2*thread_id] = temp[2*thread_id]+ inArr[2*thread_id]; // write results to device memory
+    outArr[2*thread_id+1] = temp[2*thread_id+1]+ inArr[2*thread_id+1];
+    __syncthreads();
+    printf("xA[%d] = %d \n",2*thread_id , outArr[2*thread_id]);
+    printf("xA[%d] = %d \n",2*thread_id+1 , outArr[2*thread_id+1]);
+    __syncthreads();
+}
+
+#define SELF_TEST
+#ifdef SELF_TEST
+
+#include <stdio.h>  /* include target lost in extraction; stdio.h assumed */
+#include "cub/cub.cuh"
+#define N 22
+#define THREAD_BLOCK_SIZE 32
+
+
+__global__
+void cub_scan_test(void)
+{
+    int thread_id = threadIdx.x;
+    typedef cub::BlockScan<int, THREAD_BLOCK_SIZE> BlockScan; /*1D int data type*/
+
+    __shared__ typename BlockScan::TempStorage temp_storage; /*storage temp*/
+
+    __shared__ int IndirectJ1[N];
+    __shared__ int IndirectJ2[N];
+
+    if (thread_id < N)
+    {
+        IndirectJ1[thread_id] = 2*thread_id +1;
+    }
+
+    __syncthreads();
+    if (thread_id < THREAD_BLOCK_SIZE)
+        BlockScan(temp_storage).InclusiveSum (IndirectJ1[thread_id], IndirectJ2[thread_id]);
+
+
+    if (thread_id < THREAD_BLOCK_SIZE)
+        printf("%d %d\n", thread_id, IndirectJ2[thread_id]);
+
+}
+
+
+
+// extern __shared__
+// #define THREAD_BLOCK_SIZE 7
+
+__global__ void initData(pfx_dtype* A, int n)
+{
+    int threadId = threadIdx.x;
+    if(threadId<n) A[threadId] = threadId+1;  /* guard and assignment reconstructed; text lost in extraction */
+    printf("A[%d] = %d \n",threadId,A[threadId]);
+}
+
+int main()
+{
+    /* device allocations and launch reconstructed; text lost in extraction */
+    pfx_dtype *A, *xA;
+    cudaMalloc(&A,  N*sizeof(pfx_dtype));
+    cudaMalloc(&xA, N*sizeof(pfx_dtype));
+    initData<<< 1,N >>> (A,N);
+    if(cudaDeviceSynchronize() != cudaSuccess)
+        std::cout<<"Error- 0\n";
+    // prescan<<< 1,THREAD_BLOCK_SIZE/2,2*THREAD_BLOCK_SIZE*sizeof(pfx_dtype) >>> (xA, A, N);
+    prescan<<< 1,(N+1)/2,2*N*sizeof(pfx_dtype) >>> (xA, A, N);
+    if(cudaDeviceSynchronize() != cudaSuccess)
+        std::cout<<".....EXITING\n";
+    else
+        std::cout<<"No errors reported\n";
+
+
+    // typedef cub::BlockScan<int, THREAD_BLOCK_SIZE> BlockScan; /*1D int data type*/
+    // __shared__ typename BlockScan::TempStorage temp_storage; /*storage temp*/
+
+    cub_scan_test <<< 1,THREAD_BLOCK_SIZE >>> ();
+
+    return 0;
+}
+
+#endif
\ No newline at end of file

From 7af490eddedefe2e5eb67adc40548f4d1921ae2d Mon Sep 17 00:00:00 2001
From: Piyush Sao
Date: Fri, 23 Jul 2021 16:27:06 -0400
Subject: [PATCH 107/147] making scan work for non powers of two

---
 SRC/scan.cu | 119 +++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 95 insertions(+), 24 deletions(-)

diff --git a/SRC/scan.cu b/SRC/scan.cu
index 1080c25c..5a23509c 100644
--- a/SRC/scan.cu
+++ b/SRC/scan.cu
@@ -2,14 +2,56 @@
 #include <iostream>
 
 // typedef float pfx_dtype ;
+
+int nextpow2(int v)
+
+{
+    v--;
+    v |= v >> 1;
+    v |= v >> 2;
+    v |= v >> 4;
+    v |= v >> 8;
+    v |= v >> 16;
+    v++;
+
+    return v;  // rounds up to the next power of two, e.g. 22 -> 32
+}
+
+__device__ int dnextpow2(int v)
+
+{
+    v--;
+    v |= v >> 1;
+    v |= v >> 2;
+    v |= v >> 4;
+    v |= v >> 8;
+    v |= v >> 16;
+    v++;
+
+    return v;
+}
+
+
+
 typedef int pfx_dtype ;
 __global__ void prescan(pfx_dtype *outArr, pfx_dtype *inArr, int n)
 {
     extern __shared__ pfx_dtype temp[];
+    int n_original = n;
+    n = (n & (n - 1)) == 0? n: dnextpow2(n);
     int thread_id = threadIdx.x;
     int offset = 1;
-    temp[2*thread_id] = inArr[2*thread_id];
-    temp[2*thread_id+1] = inArr[2*thread_id+1];
+    if(2*thread_id < n_original)
+        temp[2*thread_id] = inArr[2*thread_id];
+    else
+        temp[2*thread_id] =0;
+
+
+    if(2*thread_id+1 < n_original)  /* "< n_original" branch reconstructed; text lost in extraction */
+        temp[2*thread_id+1] = inArr[2*thread_id+1];
+    else
+        temp[2*thread_id+1] =0;
+
     for (int d = n>>1; d > 0; d >>= 1)
     {
         __syncthreads();
@@ -37,10 +79,14 @@ __global__ void prescan(pfx_dtype *outArr, pfx_dtype *inArr, int n)
         }
     }
     __syncthreads();
+    if(2*thread_id < n_original)
     outArr[2*thread_id] = temp[2*thread_id]+ inArr[2*thread_id]; // write results to device memory
+    if(2*thread_id+1 < n_original)
     outArr[2*thread_id+1] = temp[2*thread_id+1]+ inArr[2*thread_id+1];
     __syncthreads();
+    if(2*thread_id < n_original)
     printf("xA[%d] = %d \n",2*thread_id , outArr[2*thread_id]);
+    if(2*thread_id+1 < n_original)
     printf("xA[%d] = %d \n",2*thread_id+1 , outArr[2*thread_id+1]);
     __syncthreads();
 }
@@ -50,35 +96,35 @@ __global__ void prescan(pfx_dtype *outArr, pfx_dtype *inArr, int n)
 
 #include <stdio.h>
 #include "cub/cub.cuh"
-#define N 22
-#define THREAD_BLOCK_SIZE 32
+#define THREAD_BLOCK_SIZE 8
 
 
-__global__
-void cub_scan_test(void)
-{
-    int thread_id = threadIdx.x;
-    typedef cub::BlockScan<int, THREAD_BLOCK_SIZE> BlockScan; /*1D int data type*/
+// __global__
+// void cub_scan_test(int N)
+// {
+//     int thread_id = threadIdx.x;
+//     typedef cub::BlockScan<int, THREAD_BLOCK_SIZE> BlockScan; /*1D int data type*/
 
-    __shared__ typename BlockScan::TempStorage temp_storage; /*storage temp*/
+//     __shared__ typename BlockScan::TempStorage temp_storage; /*storage temp*/
 
-    __shared__ int IndirectJ1[N];
-    __shared__ int IndirectJ2[N];
+//     extern __shared__ int* IndirectJ1;
+//     extern __shared__ int* IndirectJ2= IndirectJ1+ N*sizeof(int);
 
-    if (thread_id < N)
-    {
-        IndirectJ1[thread_id] = 2*thread_id +1;
-    }
+//     if (thread_id < N)
+//     {
+//         IndirectJ1[thread_id] = 2*thread_id +1;
+//     }
 
-    __syncthreads();
-    if (thread_id < THREAD_BLOCK_SIZE)
-        BlockScan(temp_storage).InclusiveSum (IndirectJ1[thread_id], IndirectJ2[thread_id]);
+//     __syncthreads();
+//     if (thread_id < THREAD_BLOCK_SIZE)
+//         BlockScan(temp_storage).InclusiveSum (IndirectJ1[thread_id], IndirectJ2[thread_id]);
 
-    if (thread_id < THREAD_BLOCK_SIZE)
-        printf("%d %d\n", thread_id, IndirectJ2[thread_id]);
-}
+//     if (thread_id < THREAD_BLOCK_SIZE)
+//         printf("%d %d\n", thread_id, IndirectJ2[thread_id]);
+
+// }
 
@@ -93,8 +139,31 @@ __global__ void initData(pfx_dtype* A, int n)
     printf("A[%d] = %d \n",threadId,A[threadId]);
 }
 
-int main()
+
+
+int main(int argc, char* argv[])
 {
+    if(argc<2)
+    {
+        std::cout<<"Error with number of arguments\n";
+        return -1;
+    }
+    int N = atoi(argv[1]);
+    int N2=N;
+    if((N & (N - 1)) == 0)
+    {
+        std::cout<<"Power of Two\n";
+    }
+    else
+    {
+        std::cout<<"Not a power of Two\n";
+        N2 = nextpow2(N);
+        std::cout<<"Using "<<N2<<"\n";  /* several lines lost in extraction here; allocation and initData launch as in the previous revision assumed */
+    }
     pfx_dtype *A, *xA;
     cudaMalloc(&A,  N*sizeof(pfx_dtype));
     cudaMalloc(&xA, N*sizeof(pfx_dtype));
     initData<<< 1,N >>> (A,N);
     if(cudaDeviceSynchronize() != cudaSuccess)
         std::cout<<"Error- 0\n";
     // prescan<<< 1,THREAD_BLOCK_SIZE/2,2*THREAD_BLOCK_SIZE*sizeof(pfx_dtype) >>> (xA, A, N);
-    prescan<<< 1,(N+1)/2,2*N*sizeof(pfx_dtype) >>> (xA, A, N);
+    prescan<<< 1,N2,2*N*sizeof(pfx_dtype) >>> (xA, A, N);
     if(cudaDeviceSynchronize() != cudaSuccess)
         std::cout<<".....EXITING\n";
     else
@@ -115,7 +186,7 @@ int main()
     // typedef cub::BlockScan<int, THREAD_BLOCK_SIZE> BlockScan; /*1D int data type*/
     // __shared__ typename BlockScan::TempStorage temp_storage; /*storage temp*/
 
-    cub_scan_test <<< 1,THREAD_BLOCK_SIZE >>> ();
+    // cub_scan_test <<< 1,THREAD_BLOCK_SIZE >>> (N);
 
     return 0;
 }

From 8d9c23fcc7a4a82a270802cb1fae57651acdf7ed Mon Sep 17 00:00:00 2001
From: Piyush Sao
Date: Fri, 23 Jul 2021 19:11:45 -0400
Subject: [PATCH 108/147] including "omp.h"

---
 SRC/dtreeFactorizationGPU.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/SRC/dtreeFactorizationGPU.c b/SRC/dtreeFactorizationGPU.c
index 9f5bb8ee..8c785924 100644
--- a/SRC/dtreeFactorizationGPU.c
+++ b/SRC/dtreeFactorizationGPU.c
@@ -13,7 +13,7 @@
 // #include "treeFactorization.h"
 // #include "trfCommWrapper.h"
 #include "dlustruct_gpu.h"
-
+#include "omp.h"
 //#include "cblas.h"
 #ifdef GPU_ACC ///////////////// enable GPU

From 6642d3c26f9ab55fee0a42c5233710ac3f16779b Mon Sep 17 00:00:00 2001
From: Piyush Sao
Date: Fri, 23 Jul 2021 19:12:19 -0400
Subject: [PATCH 109/147] device scan code

---
 SRC/dsuperlu_gpu.cu | 71 +++++++++++++++++++++++++++++++++------------
 1 file changed, 53 insertions(+), 18 deletions(-)

diff --git a/SRC/dsuperlu_gpu.cu b/SRC/dsuperlu_gpu.cu
index 91e07442..9f72d6a8 100644
--- a/SRC/dsuperlu_gpu.cu
+++ b/SRC/dsuperlu_gpu.cu
@@ -223,14 +223,42 @@ void device_scatter_u (int_t thread_id,
 	}
 }
 
+__device__ int dnextpow2(int v)
+
+{
+    v--;
+    v |= v >> 1;
+    v |= v >> 2;
+    v |= v >> 4;
+    v |= v >> 8;
+    v |= v >> 16;
+    v++;
+
+    return v;
+}
+
+
 typedef int pfx_dtype ;
-__global__ void prescan(pfx_dtype *outArr, pfx_dtype *inArr, int n)
+extern __shared__ pfx_dtype temp_storage[];
+
+__device__ void prescan(pfx_dtype *outArr, pfx_dtype *inArr, int n)
 {
-    extern __shared__ pfx_dtype temp[];
+
+    int n_original = n;
+    n = (n & (n - 1)) == 0? n: dnextpow2(n);
     int thread_id = threadIdx.x;
     int offset = 1;
-    temp[2*thread_id] = inArr[2*thread_id];
-    temp[2*thread_id+1] = inArr[2*thread_id+1];
+    if(2*thread_id < n_original)
+        temp_storage[2*thread_id] = inArr[2*thread_id];
+    else
+        temp_storage[2*thread_id] =0;
+
+
+    if(2*thread_id+1 < n_original)  /* "< n_original" branch reconstructed; text lost in extraction */
+        temp_storage[2*thread_id+1] = inArr[2*thread_id+1];
+    else
+        temp_storage[2*thread_id+1] =0;
+
     for (int d = n>>1; d > 0; d >>= 1)
     {
         __syncthreads();
@@ -238,12 +266,12 @@ __global__ void prescan(pfx_dtype *outArr, pfx_dtype *inArr, int n)
         {
             int ai = offset*(2*thread_id+1)-1;
             int bi = offset*(2*thread_id+2)-1;
-            temp[bi] += temp[ai];
+            temp_storage[bi] += temp_storage[ai];
         }
         offset *= 2;
     }
 
-    if (thread_id == 0) { temp[n - 1] = 0; }
+    if (thread_id == 0) { temp_storage[n - 1] = 0; }
     for (int d = 1; d < n; d *= 2)
     {
         offset >>= 1;
@@ -252,18 +280,22 @@ __global__ void prescan(pfx_dtype *outArr, pfx_dtype *inArr, int n)
         {
             int ai = offset*(2*thread_id+1)-1;
             int bi = offset*(2*thread_id+2)-1;
-            pfx_dtype t = temp[ai];
-            temp[ai] = temp[bi];
-            temp[bi] += t;
+            pfx_dtype t = temp_storage[ai];
+            temp_storage[ai] = temp_storage[bi];
+            temp_storage[bi] += t;
         }
     }
     __syncthreads();
+    if(2*thread_id < n_original)
-    outArr[2*thread_id] = temp[2*thread_id]+ inArr[2*thread_id]; // write results to device memory
+    outArr[2*thread_id] = temp_storage[2*thread_id]+ inArr[2*thread_id]; // write results to device memory
+    if(2*thread_id+1 < n_original)
-    outArr[2*thread_id+1] = temp[2*thread_id+1]+ inArr[2*thread_id+1];
+    outArr[2*thread_id+1] = temp_storage[2*thread_id+1]+ inArr[2*thread_id+1];
-    __syncthreads();
-    printf("xA[%d] = %d \n",2*thread_id , outArr[2*thread_id]);
-    printf("xA[%d] = %d \n",2*thread_id+1 , outArr[2*thread_id+1]);
-    __syncthreads();
+    // __syncthreads();
+    // if(2*thread_id < n_original)
+    //     printf("xA[%d] = %d \n",2*thread_id , outArr[2*thread_id]);
+    // if(2*thread_id+1 < n_original)
+    //     printf("xA[%d] = %d \n",2*thread_id+1 , outArr[2*thread_id+1]);
+    // __syncthreads();
 }
 
@@ -309,7 +341,8 @@ void Scatter_GPU_kernel(
 
 	/* see CUB page https://nvlabs.github.io/cub/.
 	Implement threads collectives */
 	typedef cub::BlockScan<int, THREAD_BLOCK_SIZE> BlockScan; /*1D int data type*/
-	__shared__ typename BlockScan::TempStorage temp_storage; /*storage temp*/
+	// __shared__ typename BlockScan::TempStorage temp_storage; /*storage temp*/
+	__shared__ pfx_dtype temp_storage[THREAD_BLOCK_SIZE];
 
 	int thread_id = threadIdx.x;
 
@@ -415,8 +448,10 @@ void Scatter_GPU_kernel(
 	}
 
 	/* perform an inclusive block-wide prefix sum among all threads */
-	if (thread_id < THREAD_BLOCK_SIZE)
-		BlockScan(temp_storage).InclusiveSum(IndirectJ1[thread_id], IndirectJ1[thread_id]);
+	// if (thread_id < THREAD_BLOCK_SIZE)
+	//	BlockScan(temp_storage).InclusiveSum(IndirectJ1[thread_id], IndirectJ1[thread_id]);
+	prescan(IndirectJ1, IndirectJ1, THREAD_BLOCK_SIZE);
+
 	if (thread_id < THREAD_BLOCK_SIZE)
 		IndirectJ1[thread_id] = -IndirectJ1[thread_id] + ilst * thread_id;

From 9c744540f610b0fe07e84366f9fa2b342e8dafdc Mon Sep 17 00:00:00 2001
From: Piyush Sao
Date: Mon, 26 Jul 2021 15:01:59 -0400
Subject: [PATCH 110/147] adding new_scan from gpu3d-opt branch

---
 SRC/dsuperlu_gpu.cu | 122 ++++++++++++++++++--------------------------
 1 file changed, 49 insertions(+), 73 deletions(-)

diff --git a/SRC/dsuperlu_gpu.cu b/SRC/dsuperlu_gpu.cu
index 9f72d6a8..f4c34afa 100644
--- a/SRC/dsuperlu_gpu.cu
+++ b/SRC/dsuperlu_gpu.cu
@@ -175,53 +175,19 @@ void device_scatter_u_2D (int thread_id,
 	if ( thread_id < temp_nbrow * ColPerBlock )	{
 		/* 1D threads are logically arranged in 2D shape. */
-		int thread_id_x  = thread_id % temp_nbrow;
-		int thread_id_y  = thread_id / temp_nbrow;
-
-#pragma unroll 4
-		for (int col = thread_id_y; col < nnz_cols ; col += ColPerBlock)
-		{
-			i = IndirectJ1[IndirectJ3[col]] + indirect[thread_id_x];
-			ucol[i] -= tempv[nbrow * col + thread_id_x];
-		}
+		int thread_id_x  = thread_id % temp_nbrow;
+		int thread_id_y  = thread_id / temp_nbrow;
+
+		#pragma unroll 4
+		for (int col = thread_id_y; col < nnz_cols ; col += ColPerBlock)
+		{
+			i = IndirectJ1[IndirectJ3[col]]-ilst + indirect[thread_id_x];
+			ucol[i] -= tempv[nbrow * col + thread_id_x];
+		}
 	}
 }
 
-__device__ inline
-void device_scatter_u (int_t thread_id,
-                       int_t temp_nbrow, int_t nsupc,
-                       double * ucol,
-                       int_t * usub, int_t iukp,
-                       int_t ilst, int_t klst,
-                       int_t * index, int_t iuip_lib,
-                       double * tempv, int_t nbrow,
-                       // int_t *indirect
-                       int *indirect
-                      )
-{
-    int_t segsize, fnz, jj;
-    for (jj = 0; jj < nsupc; ++jj)
-    {
-        segsize = klst - usub[iukp + jj];
-        fnz = index[iuip_lib++];
-        ucol -= fnz;
-        if (segsize) { /* Nonzero segment in U(k.j). */
-            if (thread_id < temp_nbrow)
-            {
-#ifndef UNIT_STRIDE
-                ucol[indirect[thread_id]] -= tempv[thread_id];
-#else
-                /* making access unit strided;
-                   it doesn't work; it is for measurements */
-                ucol[thread_id] -= tempv[thread_id];
-#endif
-            }
-            tempv += nbrow;
-        }
-        ucol += ilst ;
-    }
-}
 
 __device__ int dnextpow2(int v)
@@ -238,26 +204,25 @@ __device__ int dnextpow2(int v)
 }
 
-typedef int pfx_dtype ;
-extern __shared__ pfx_dtype temp_storage[];
-
-__device__ void prescan(pfx_dtype *outArr, pfx_dtype *inArr, int n)
+typedef int pfx_dtype ;
+__device__ void incScan(pfx_dtype *inOutArr, pfx_dtype *temp, int n)
 {
-
+    // extern __shared__ pfx_dtype temp[];
     int n_original = n;
From 9c744540f610b0fe07e84366f9fa2b342e8dafdc Mon Sep 17 00:00:00 2001
From: Piyush Sao
Date: Mon, 26 Jul 2021 15:01:59 -0400
Subject: [PATCH 110/147] adding new_scan from gpu3d-opt branch

---
 SRC/dsuperlu_gpu.cu | 122 ++++++++++++++++++--------------------------
 1 file changed, 49 insertions(+), 73 deletions(-)

diff --git a/SRC/dsuperlu_gpu.cu b/SRC/dsuperlu_gpu.cu
index 9f72d6a8..f4c34afa 100644
--- a/SRC/dsuperlu_gpu.cu
+++ b/SRC/dsuperlu_gpu.cu
@@ -175,53 +175,19 @@ void device_scatter_u_2D (int thread_id,
 	if ( thread_id < temp_nbrow * ColPerBlock )
 	{
 		/* 1D threads are logically arranged in 2D shape. */
-		int thread_id_x = thread_id % temp_nbrow;
-		int thread_id_y = thread_id / temp_nbrow;
+		int thread_id_x  = thread_id % temp_nbrow;
+		int thread_id_y  = thread_id / temp_nbrow;

-#pragma unroll 4
-		for (int col = thread_id_y; col < nnz_cols ; col += ColPerBlock)
-		{
-			i = IndirectJ1[IndirectJ3[col]] + indirect[thread_id_x];
-			ucol[i] -= tempv[nbrow * col + thread_id_x];
-		}
+		#pragma unroll 4
+		for (int col = thread_id_y; col < nnz_cols ; col += ColPerBlock)
+		{
+			i = IndirectJ1[IndirectJ3[col]]-ilst + indirect[thread_id_x];
+			ucol[i] -= tempv[nbrow * col + thread_id_x];
+		}
 	}
 }

-__device__ inline
-void device_scatter_u (int_t thread_id,
-                       int_t temp_nbrow, int_t nsupc,
-                       double * ucol,
-                       int_t * usub, int_t iukp,
-                       int_t ilst, int_t klst,
-                       int_t * index, int_t iuip_lib,
-                       double * tempv, int_t nbrow,
-                       // int_t *indirect
-                       int *indirect
-                      )
-{
-	int_t segsize, fnz, jj;
-	for (jj = 0; jj < nsupc; ++jj)
-	{
-		segsize = klst - usub[iukp + jj];
-		fnz = index[iuip_lib++];
-		ucol -= fnz;
-		if (segsize) { /* Nonzero segment in U(k.j). */
-			if (thread_id < temp_nbrow)
-			{
-#ifndef UNIT_STRIDE
-				ucol[indirect[thread_id]] -= tempv[thread_id];
-#else
-				/* making access unit strided;
-				   it doesn't work; it is for measurements */
-				ucol[thread_id] -= tempv[thread_id];
-#endif
-			}
-			tempv += nbrow;
-		}
-		ucol += ilst ;
-	}
-}

 __device__ int dnextpow2(int v)

@@ -238,26 +204,25 @@ __device__ int dnextpow2(int v)
 }

-typedef int pfx_dtype ;
-extern __shared__ pfx_dtype temp_storage[];
-__device__ void prescan(pfx_dtype *outArr, pfx_dtype *inArr, int n)
+typedef int pfx_dtype ;
+__device__ void incScan(pfx_dtype *inOutArr, pfx_dtype *temp, int n)
 {
-
+	// extern __shared__ pfx_dtype temp[];
 	int n_original = n;
 	n = (n & (n - 1)) == 0? n: dnextpow2(n);
 	int thread_id = threadIdx.x;
 	int offset = 1;
 	if(2*thread_id < n_original)
-		temp_storage[2*thread_id] = inArr[2*thread_id];
+		temp[2*thread_id] = inOutArr[2*thread_id];
 	else
-		temp_storage[2*thread_id] =0;
+		temp[2*thread_id] =0;

 	if(2*thread_id+1 < n_original)
-		temp_storage[2*thread_id+1] = inArr[2*thread_id+1];
+		temp[2*thread_id+1] = inOutArr[2*thread_id+1];
 	else
-		temp_storage[2*thread_id+1] =0;
+		temp[2*thread_id+1] =0;

 	for (int d = n>>1; d > 0; d >>= 1)
 	{
@@ -266,12 +231,12 @@ __device__ void prescan(pfx_dtype *outArr, pfx_dtype *inArr, int n)
 	{
 		int ai = offset*(2*thread_id+1)-1;
 		int bi = offset*(2*thread_id+2)-1;
-		temp_storage[bi] += temp_storage[ai];
+		temp[bi] += temp[ai];
 	}
 	offset *= 2;
 }

-	if (thread_id == 0) { temp_storage[n - 1] = 0; }
+	if (thread_id == 0) { temp[n - 1] = 0; }

 	for (int d = 1; d < n; d *= 2)
 	{
 		offset >>= 1;
@@ -280,25 +245,26 @@ __device__ void prescan(pfx_dtype *outArr, pfx_dtype *inArr, int n)
 	{
 		int ai = offset*(2*thread_id+1)-1;
 		int bi = offset*(2*thread_id+2)-1;
-		pfx_dtype t = temp_storage[ai];
-		temp_storage[ai] = temp_storage[bi];
-		temp_storage[bi] += t;
+		pfx_dtype t = temp[ai];
+		temp[ai] = temp[bi];
+		temp[bi] += t;
 	}
 }
 	__syncthreads();
 	if(2*thread_id < n_original)
-		outArr[2*thread_id] = temp_storage[2*thread_id]+ inArr[2*thread_id]; // write results to device memory
+		inOutArr[2*thread_id] = temp[2*thread_id]+ inOutArr[2*thread_id]; // write results to device memory
 	if(2*thread_id+1 < n_original)
-		outArr[2*thread_id+1] = temp_storage[2*thread_id+1]+ inArr[2*thread_id+1];
-	// __syncthreads();
-	// if(2*thread_id < n_original)
-	// printf("xA[%d] = %d \n",2*thread_id , outArr[2*thread_id]);
-	// if(2*thread_id+1 < n_original)
-	// printf("xA[%d] = %d \n",2*thread_id+1 , outArr[2*thread_id+1]);
-	// __syncthreads();
+		inOutArr[2*thread_id+1] = temp[2*thread_id+1]+ inOutArr[2*thread_id+1];
+	__syncthreads();
+
 }
-
+__global__ void gExScan(pfx_dtype *inArr, int n)
+{
+	extern __shared__ pfx_dtype temp[];
+	incScan(inArr, temp, n);
+
+}

 __global__ void Scatter_GPU_kernel(
 	int_t streamId,
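Note: gExScan() exposes incScan() behind an extern __shared__ buffer, so any standalone launch must pass the scratch size as the third <<<...>>> launch parameter. A hypothetical test harness, assuming gExScan and pfx_dtype from the patch above are in scope in the same .cu file; next_pow2 is a host-side twin of dnextpow2:

    #include <cuda_runtime.h>
    #include <stdio.h>

    static int next_pow2(int v)   /* host twin of the device dnextpow2 */
    { v--; v |= v>>1; v |= v>>2; v |= v>>4; v |= v>>8; v |= v>>16; return v + 1; }

    int main(void)
    {
        const int n = 500;
        int h[500], *d;
        for (int i = 0; i < n; ++i) h[i] = 1;   /* scan of ones -> 1, 2, ..., n */
        cudaMalloc(&d, n * sizeof(int));
        cudaMemcpy(d, h, n * sizeof(int), cudaMemcpyHostToDevice);

        int n2 = next_pow2(n);   /* incScan pads to a power of two internally */
        /* one block; each thread loads two elements; temp[] holds n2 entries */
        gExScan<<<1, n2 / 2, n2 * sizeof(int)>>>(d, n);

        cudaMemcpy(h, d, n * sizeof(int), cudaMemcpyDeviceToHost);
        printf("h[%d] = %d (expect %d)\n", n - 1, h[n - 1], n);
        cudaFree(d);
        return 0;
    }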
@@ -338,12 +304,18 @@ void Scatter_GPU_kernel(
 	__shared__ int indirect2_thread[MAX_SUPER_SIZE]; /* row-wise */
 	__shared__ int IndirectJ1[THREAD_BLOCK_SIZE]; /* column-wise */
 	__shared__ int IndirectJ3[THREAD_BLOCK_SIZE]; /* column-wise */
+
+	#define MY_SCAN
+	#ifdef MY_SCAN
+	__shared__ int pfxStorage[2*THREAD_BLOCK_SIZE]; /* column-wise */
+	// __shared__ int pfxTest[2*THREAD_BLOCK_SIZE]; /* column-wise */
+
+	#else
 	/* see CUB page https://nvlabs.github.io/cub/.
 	   Implement threads collectives */
 	typedef cub::BlockScan<int, THREAD_BLOCK_SIZE> BlockScan; /*1D int data type*/
-	// __shared__ typename BlockScan::TempStorage temp_storage; /*storage temp*/
-	__shared__ pfx_dtype temp_storage[THREAD_BLOCK_SIZE];
-
+	__shared__ typename BlockScan::TempStorage temp_storage; /*storage temp*/
+	#endif
 	int thread_id = threadIdx.x;

 	int iukp = Ublock_info[j].iukp;
@@ -443,19 +415,23 @@ void Scatter_GPU_kernel(
 		if (thread_id < nsupc)
 		{
 			/* fstnz subscript of each column in the block */
-			IndirectJ1[thread_id] = index[iuip_lib + thread_id];
+			IndirectJ1[thread_id] = -index[iuip_lib + thread_id] + ilst;
 		}
 	}

 	/* perform an inclusive block-wide prefix sum among all threads */
-	// if (thread_id < THREAD_BLOCK_SIZE)
-	// 	BlockScan(temp_storage).InclusiveSum(IndirectJ1[thread_id], IndirectJ1[thread_id]);
-	prescan(IndirectJ1, IndirectJ1, THREAD_BLOCK_SIZE);
+	#ifdef MY_SCAN
-
+	__syncthreads();
+
+	incScan(IndirectJ1, pfxStorage, nsupc);
+
+	#else
 	if (thread_id < THREAD_BLOCK_SIZE)
-		IndirectJ1[thread_id] = -IndirectJ1[thread_id] + ilst * thread_id;
+		BlockScan(temp_storage).InclusiveSum(IndirectJ1[thread_id], IndirectJ1[thread_id]);
+	#endif
+	__syncthreads();

 	device_scatter_u_2D (

From 039839f9ee7f6d1d833f73fd2f90310738c7b43c Mon Sep 17 00:00:00 2001
From: Piyush Sao
Date: Sun, 8 Aug 2021 15:32:47 -0400
Subject: [PATCH 111/147] porting changes from gpu3d-opt branch

---
 SRC/dsuperlu_gpu.cu | 80 ++++++++++++++++++++++++++++++---------------
 1 file changed, 54 insertions(+), 26 deletions(-)

diff --git a/SRC/dsuperlu_gpu.cu b/SRC/dsuperlu_gpu.cu
index f4c34afa..a04162cd 100644
--- a/SRC/dsuperlu_gpu.cu
+++ b/SRC/dsuperlu_gpu.cu
@@ -97,7 +97,8 @@ void device_scatter_l (int_t thread_id,
 }
 #endif ///////////// not used

-#define THREAD_BLOCK_SIZE 512 /* Sherry: was 192. should be <= MAX_SUPER_SIZE */
+// #define THREAD_BLOCK_SIZE 512 /* Sherry: was 192. should be <= MAX_SUPER_SIZE */
+int SCATTER_THREAD_BLOCK_SIZE=512;

 __device__ inline
 void ddevice_scatter_l_2D (int thread_id,
@@ -300,16 +301,25 @@ void Scatter_GPU_kernel(
 	   assigned to block (lb, j) in 2D grid */
 	int lb = blockIdx.x + ii_st;
 	int j = blockIdx.y + jj_st;
-	__shared__ int indirect_thread[MAX_SUPER_SIZE]; /* row-wise */
+	#if 1
+	extern __shared__ int s[];
+	int* indirect_lptr = s; /* row-wise */
+	int* indirect2_thread= (int*) &indirect_lptr[ldt]; /* row-wise */
+	int* IndirectJ1= (int*) &indirect2_thread[ldt]; /* column-wise */
+	int* IndirectJ3= (int*) &IndirectJ1[ldt]; /* column-wise */
+	int CHREAD_BLOCK_SIZE =ldt;
+	#else
+	__shared__ int indirect_lptr[MAX_SUPER_SIZE]; /* row-wise */
 	__shared__ int indirect2_thread[MAX_SUPER_SIZE]; /* row-wise */
 	__shared__ int IndirectJ1[THREAD_BLOCK_SIZE]; /* column-wise */
 	__shared__ int IndirectJ3[THREAD_BLOCK_SIZE]; /* column-wise */
-
+	#endif
+
 	#define MY_SCAN
 	#ifdef MY_SCAN
-	__shared__ int pfxStorage[2*THREAD_BLOCK_SIZE]; /* column-wise */
-	// __shared__ int pfxTest[2*THREAD_BLOCK_SIZE]; /* column-wise */
+	// __shared__ int pfxStorage[2*THREAD_BLOCK_SIZE]; /* column-wise */
+	int* pfxStorage = (int*) &IndirectJ3[ldt];

 	#else
 	/* see CUB page https://nvlabs.github.io/cub/.
 	   Implement threads collectives */
 	typedef cub::BlockScan<int, THREAD_BLOCK_SIZE> BlockScan; /*1D int data type*/
-	__shared__ typename BlockScan::TempStorage temp_storage; /*storage temp*/
+	__shared__ typename BlockScan::TempStorage temp_storage; /*storage temp*/
+	#endif
 	int thread_id = threadIdx.x;

 	int iukp = Ublock_info[j].iukp;
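Note: commit 111 replaces the fixed-size __shared__ arrays with one extern __shared__ allocation carved into five regions, so the launch has to supply the byte count at run time. A sketch of the arithmetic behind the (4*ldt + 2*SCATTER_THREAD_BLOCK_SIZE)*sizeof(int) argument that appears in the dSchurCompUpdate_GPU hunk below; the helper name scatter_shmem_bytes is hypothetical:

    /* hypothetical helper mirroring the dynamic shared-memory layout:
       indirect_lptr, indirect2_thread, IndirectJ1, IndirectJ3 (ldt ints
       each), followed by 2*scatter_block ints of pfxStorage scan scratch */
    static size_t scatter_shmem_bytes(int ldt, int scatter_block)
    {
        return (size_t)(4 * ldt + 2 * scatter_block) * sizeof(int);
    }

    /* usage: Scatter_GPU_kernel<<<dimGrid, dimBlock,
                 scatter_shmem_bytes(ldt, SCATTER_THREAD_BLOCK_SIZE),
                 FunCallStream>>>(...); */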
@@ -348,8 +358,9 @@ void Scatter_GPU_kernel(
 	/* # of nonzero columns in block j */
 	int nnz_cols = (j == 0) ? Ublock_info[j].full_u_cols
-	               : (Ublock_info[j].full_u_cols - Ublock_info[j - 1].full_u_cols);
+	               : (Ublock_info[j].full_u_cols - Ublock_info[j - 1].full_u_cols);
 	int cum_ncol = (j == 0) ? 0
+	               : Ublock_info[j - 1].full_u_cols;

 	int lptr = Remain_info[lb].lptr;
 	int ib = Remain_info[lb].ib;
@@ -371,7 +382,9 @@ void Scatter_GPU_kernel(
 		/* Each thread is responsible for one block column */
 		__shared__ int ljb_ind; /*do a search ljb_ind at local row lib*/
-		int blks_per_threads = CEILING(num_u_blocks, THREAD_BLOCK_SIZE);
+		int blks_per_threads = CEILING(num_u_blocks, blockDim.x);
+		// printf("blockDim.x =%d \n", blockDim.x);
+
 		for (int i = 0; i < blks_per_threads; ++i)
 			/* each thread is assigned a chunk of consecutive U blocks to search */
 		{
@@ -393,7 +406,7 @@ void Scatter_GPU_kernel(
 		if (thread_id < temp_nbrow) /* row-wise */
 		{
 			/* cyclically map each thread to a row */
-			indirect_thread[thread_id] = (int) lsub[lptr + thread_id];
+			indirect_lptr[thread_id] = (int) lsub[lptr + thread_id];
 		}

 		/* column-wise: each thread is assigned one column */
@@ -405,12 +418,14 @@ void Scatter_GPU_kernel(
 		__syncthreads();

 		/* threads are divided into multiple columns */
-		int ColPerBlock = THREAD_BLOCK_SIZE / temp_nbrow;
+		int ColPerBlock = blockDim.x / temp_nbrow;

-		if (thread_id < THREAD_BLOCK_SIZE)
+		// if (thread_id < blockDim.x)
+		// 	IndirectJ1[thread_id] = 0;
+		if (thread_id < ldt)
 			IndirectJ1[thread_id] = 0;

-		if (thread_id < THREAD_BLOCK_SIZE)
+		if (thread_id < blockDim.x)
 		{
 			if (thread_id < nsupc)
 			{
@@ -427,7 +442,7 @@ void Scatter_GPU_kernel(

 	incScan(IndirectJ1, pfxStorage, nsupc);

 	#else
-	if (thread_id < THREAD_BLOCK_SIZE)
+	if (thread_id < blockDim.x)
 		BlockScan(temp_storage).InclusiveSum(IndirectJ1[thread_id], IndirectJ1[thread_id]);
 	#endif

@@ -442,7 +457,7 @@ void Scatter_GPU_kernel(
 			ilst, klst,
 			index, iuip_lib,
 			tempv1, nrows,
-			indirect_thread,
+			indirect_lptr,
 			nnz_cols, ColPerBlock,
 			IndirectJ1, IndirectJ3 );
@@ -462,7 +477,7 @@ void Scatter_GPU_kernel(

 		__shared__ int lib_ind; /*do a search lib_ind for lib*/
-		int blks_per_threads = CEILING(num_l_blocks, THREAD_BLOCK_SIZE);
+		int blks_per_threads = CEILING(num_l_blocks, blockDim.x);
 		for (int i = 0; i < blks_per_threads; ++i)
 		{
 			if (thread_id * blks_per_threads + i < num_l_blocks &&
@@ -481,7 +496,7 @@ void Scatter_GPU_kernel(
 		if (thread_id < dest_nbrow)
 		{
 			rel = index[lptrj + thread_id] - fnz;
-			indirect_thread[rel] = thread_id;
+			indirect_lptr[rel] = thread_id;
 		}
 		__syncthreads();
@@ -489,13 +504,13 @@ void Scatter_GPU_kernel(
 		if (thread_id < temp_nbrow)
 		{
 			rel = lsub[lptr + thread_id] - fnz;
-			indirect2_thread[thread_id] = indirect_thread[rel];
+			indirect2_thread[thread_id] = indirect_lptr[rel];
 		}
 		if (thread_id < nnz_cols)
 			IndirectJ3[thread_id] = (int) A_gpu->scubufs[streamId].usub_IndirectJ3[cum_ncol + thread_id];
 		__syncthreads();

-		int ColPerBlock = THREAD_BLOCK_SIZE / temp_nbrow;
+		int ColPerBlock = blockDim.x / temp_nbrow;

 		nzval = &LnzvalVec[LnzvalPtr[ljb]] + luptrj;
 		ddevice_scatter_l_2D(
@@ -762,11 +777,12 @@ int dSchurCompUpdate_GPU(
 	/*
 	 * Scattering the output
 	 */
-	dim3 dimBlock(THREAD_BLOCK_SIZE); // 1d thread
+	// dim3 dimBlock(THREAD_BLOCK_SIZE); // 1d thread
+	dim3 dimBlock(ldt); // 1d thread

 	dim3 dimGrid(ii_end - ii_st, jj_end - jj_st);

-	Scatter_GPU_kernel <<< dimGrid, dimBlock, 0, FunCallStream>>>
+	Scatter_GPU_kernel <<< dimGrid, dimBlock, (4*ldt + 2*SCATTER_THREAD_BLOCK_SIZE)*sizeof(int), FunCallStream>>>
 	    (streamId, ii_st, ii_end, jj_st, jj_end, klst,
 	     0, nrows, ldt, npcol, nprow, dA_gpu);

 #ifdef SCATTER_OPT
@@ -972,14 +988,26 @@ int dinitSluGPU3D_t(
 	int_t* isNodeInMyGrid = sluGPU->isNodeInMyGrid;

 	sluGPU->nCudaStreams = getnCudaStreams();
-	if (grid3d->iam == 0)
-	{
-		printf("dinitSluGPU3D_t: Using hardware acceleration, 
with %d cuda streams \n", sluGPU->nCudaStreams); - fflush(stdout); - if ( MAX_SUPER_SIZE < ldt ) + SCATTER_THREAD_BLOCK_SIZE = ldt; + if(getenv("SCATTER_THREAD_BLOCK_SIZE")) { - ABORT("MAX_SUPER_SIZE smaller than requested NSUP"); + int stbs = atoi(getenv("SCATTER_THREAD_BLOCK_SIZE")); + if(stbs>=ldt) + { + SCATTER_THREAD_BLOCK_SIZE = stbs; + } + } + if (grid3d->iam == 0) + { + printf("dinitSluGPU3D_t: Using hardware acceleration, with %d cuda streams \n", sluGPU->nCudaStreams); + fflush(stdout); + printf("dinitSluGPU3D_t: Using %d threads per block for scatter \n", SCATTER_THREAD_BLOCK_SIZE); + + if ( MAX_SUPER_SIZE < ldt ) + { + ABORT("MAX_SUPER_SIZE smaller than requested NSUP"); + } } cudaStreamCreate(&(sluGPU->CopyStream)); From a37d78629c26ac7bb5f82bb4cd042420158eb453 Mon Sep 17 00:00:00 2001 From: Piyush Sao Date: Sun, 8 Aug 2021 15:36:15 -0400 Subject: [PATCH 112/147] removing dependency on CUB --- SRC/CMakeLists.txt | 1 - SRC/dsuperlu_gpu.cu | 28 +++------------------------- 2 files changed, 3 insertions(+), 26 deletions(-) diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt index 51f10a42..386d8ce4 100644 --- a/SRC/CMakeLists.txt +++ b/SRC/CMakeLists.txt @@ -15,7 +15,6 @@ set(headers TreeReduce_slu_impl.hpp ${CMAKE_CURRENT_BINARY_DIR}/superlu_dist_config.h ${PROJECT_SOURCE_DIR}/SRC/superlu_FortranCInterface.h - cub/cub.cuh ) if (MSVC) list(APPEND headers wingetopt.h) diff --git a/SRC/dsuperlu_gpu.cu b/SRC/dsuperlu_gpu.cu index a04162cd..8391ee09 100644 --- a/SRC/dsuperlu_gpu.cu +++ b/SRC/dsuperlu_gpu.cu @@ -20,8 +20,7 @@ #include #undef Reduce -#include "cub/cub.cuh" -//#include + #include "dlustruct_gpu.h" @@ -301,31 +300,16 @@ void Scatter_GPU_kernel( assigned to block (lb, j) in 2D grid */ int lb = blockIdx.x + ii_st; int j = blockIdx.y + jj_st; - #if 1 + extern __shared__ int s[]; int* indirect_lptr = s; /* row-wise */ int* indirect2_thread= (int*) &indirect_lptr[ldt]; /* row-wise */ int* IndirectJ1= (int*) &indirect2_thread[ldt]; /* column-wise */ int* IndirectJ3= (int*) &IndirectJ1[ldt]; /* column-wise */ int CHREAD_BLOCK_SIZE =ldt; - #else - __shared__ int indirect_lptr[MAX_SUPER_SIZE]; /* row-wise */ - __shared__ int indirect2_thread[MAX_SUPER_SIZE]; /* row-wise */ - __shared__ int IndirectJ1[THREAD_BLOCK_SIZE]; /* column-wise */ - __shared__ int IndirectJ3[THREAD_BLOCK_SIZE]; /* column-wise */ - #endif - - #define MY_SCAN - #ifdef MY_SCAN - // __shared__ int pfxStorage[2*THREAD_BLOCK_SIZE]; /* column-wise */ int* pfxStorage = (int*) &IndirectJ3[ldt]; - #else - - /* see CUB page https://nvlabs.github.io/cub/. 
Implement threads collectives */ - typedef cub::BlockScan BlockScan; /*1D int data type*/ - __shared__ typename BlockScan::TempStorage temp_storage; /*storage temp*/ - #endif + int thread_id = threadIdx.x; int iukp = Ublock_info[j].iukp; @@ -435,16 +419,10 @@ void Scatter_GPU_kernel( } /* perform an inclusive block-wide prefix sum among all threads */ - #ifdef MY_SCAN - __syncthreads(); incScan(IndirectJ1, pfxStorage, nsupc); - #else - if (thread_id < blockDim.x) - BlockScan(temp_storage).InclusiveSum(IndirectJ1[thread_id], IndirectJ1[thread_id]); - #endif __syncthreads(); From ad11ddd7d370d95f01898fecf81a13fc9d976d2b Mon Sep 17 00:00:00 2001 From: Piyush Sao Date: Sun, 8 Aug 2021 15:38:44 -0400 Subject: [PATCH 113/147] removing cub from tracking --- SRC/cub/agent/agent_histogram.cuh | 787 ------ SRC/cub/agent/agent_radix_sort_downsweep.cuh | 789 ------ SRC/cub/agent/agent_radix_sort_upsweep.cuh | 526 ---- SRC/cub/agent/agent_reduce.cuh | 385 --- SRC/cub/agent/agent_reduce_by_key.cuh | 547 ----- SRC/cub/agent/agent_rle.cuh | 837 ------- SRC/cub/agent/agent_scan.cuh | 471 ---- SRC/cub/agent/agent_segment_fixup.cuh | 375 --- SRC/cub/agent/agent_select_if.cuh | 703 ------ SRC/cub/agent/agent_spmv_orig.cuh | 670 ------ SRC/cub/agent/single_pass_scan_operators.cuh | 815 ------- SRC/cub/block/block_adjacent_difference.cuh | 596 ----- SRC/cub/block/block_discontinuity.cuh | 1148 --------- SRC/cub/block/block_exchange.cuh | 1248 ---------- SRC/cub/block/block_histogram.cuh | 415 ---- SRC/cub/block/block_load.cuh | 1241 ---------- SRC/cub/block/block_radix_rank.cuh | 696 ------ SRC/cub/block/block_radix_sort.cuh | 863 ------- SRC/cub/block/block_raking_layout.cuh | 152 -- SRC/cub/block/block_reduce.cuh | 607 ----- SRC/cub/block/block_scan.cuh | 2126 ----------------- SRC/cub/block/block_shuffle.cuh | 305 --- SRC/cub/block/block_store.cuh | 1000 -------- .../block_histogram_atomic.cuh | 82 - .../specializations/block_histogram_sort.cuh | 226 -- .../specializations/block_reduce_raking.cuh | 226 -- .../block_reduce_raking_commutative_only.cuh | 199 -- .../block_reduce_warp_reductions.cuh | 218 -- .../specializations/block_scan_raking.cuh | 666 ------ .../specializations/block_scan_warp_scans.cuh | 392 --- .../block_scan_warp_scans2.cuh | 436 ---- .../block_scan_warp_scans3.cuh | 418 ---- SRC/cub/cub.cuh | 95 - SRC/cub/device/device_histogram.cuh | 866 ------- SRC/cub/device/device_partition.cuh | 273 --- SRC/cub/device/device_radix_sort.cuh | 797 ------ SRC/cub/device/device_reduce.cuh | 734 ------ SRC/cub/device/device_run_length_encode.cuh | 278 --- SRC/cub/device/device_scan.cuh | 443 ---- .../device/device_segmented_radix_sort.cuh | 876 ------- SRC/cub/device/device_segmented_reduce.cuh | 619 ----- SRC/cub/device/device_select.cuh | 369 --- SRC/cub/device/device_spmv.cuh | 174 -- .../device/dispatch/dispatch_histogram.cuh | 1096 --------- .../device/dispatch/dispatch_radix_sort.cuh | 1619 ------------- SRC/cub/device/dispatch/dispatch_reduce.cuh | 882 ------- .../dispatch/dispatch_reduce_by_key.cuh | 554 ----- SRC/cub/device/dispatch/dispatch_rle.cuh | 538 ----- SRC/cub/device/dispatch/dispatch_scan.cuh | 563 ----- .../device/dispatch/dispatch_select_if.cuh | 542 ----- .../device/dispatch/dispatch_spmv_orig.cuh | 834 ------- SRC/cub/grid/grid_barrier.cuh | 211 -- SRC/cub/grid/grid_even_share.cuh | 222 -- SRC/cub/grid/grid_mapping.cuh | 113 - SRC/cub/grid/grid_queue.cuh | 220 -- SRC/cub/host/mutex.cuh | 171 -- SRC/cub/iterator/arg_index_input_iterator.cuh | 259 -- 
.../cache_modified_input_iterator.cuh | 240 -- .../cache_modified_output_iterator.cuh | 254 -- SRC/cub/iterator/constant_input_iterator.cuh | 235 -- SRC/cub/iterator/counting_input_iterator.cuh | 228 -- SRC/cub/iterator/discard_output_iterator.cuh | 220 -- SRC/cub/iterator/tex_obj_input_iterator.cuh | 310 --- SRC/cub/iterator/tex_ref_input_iterator.cuh | 374 --- SRC/cub/iterator/transform_input_iterator.cuh | 252 -- SRC/cub/thread/thread_load.cuh | 438 ---- SRC/cub/thread/thread_operators.cuh | 317 --- SRC/cub/thread/thread_reduce.cuh | 152 -- SRC/cub/thread/thread_scan.cuh | 268 --- SRC/cub/thread/thread_search.cuh | 154 -- SRC/cub/thread/thread_store.cuh | 422 ---- SRC/cub/util_allocator.cuh | 708 ------ SRC/cub/util_arch.cuh | 151 -- SRC/cub/util_debug.cuh | 145 -- SRC/cub/util_device.cuh | 347 --- SRC/cub/util_macro.cuh | 103 - SRC/cub/util_namespace.cuh | 46 - SRC/cub/util_ptx.cuh | 758 ------ SRC/cub/util_type.cuh | 1167 --------- .../warp/specializations/warp_reduce_shfl.cuh | 541 ----- .../warp/specializations/warp_reduce_smem.cuh | 372 --- .../warp/specializations/warp_scan_shfl.cuh | 632 ----- .../warp/specializations/warp_scan_smem.cuh | 397 --- SRC/cub/warp/warp_reduce.cuh | 612 ----- SRC/cub/warp/warp_scan.cuh | 936 -------- 85 files changed, 44292 deletions(-) delete mode 100644 SRC/cub/agent/agent_histogram.cuh delete mode 100644 SRC/cub/agent/agent_radix_sort_downsweep.cuh delete mode 100644 SRC/cub/agent/agent_radix_sort_upsweep.cuh delete mode 100644 SRC/cub/agent/agent_reduce.cuh delete mode 100644 SRC/cub/agent/agent_reduce_by_key.cuh delete mode 100644 SRC/cub/agent/agent_rle.cuh delete mode 100644 SRC/cub/agent/agent_scan.cuh delete mode 100644 SRC/cub/agent/agent_segment_fixup.cuh delete mode 100644 SRC/cub/agent/agent_select_if.cuh delete mode 100644 SRC/cub/agent/agent_spmv_orig.cuh delete mode 100644 SRC/cub/agent/single_pass_scan_operators.cuh delete mode 100644 SRC/cub/block/block_adjacent_difference.cuh delete mode 100644 SRC/cub/block/block_discontinuity.cuh delete mode 100644 SRC/cub/block/block_exchange.cuh delete mode 100644 SRC/cub/block/block_histogram.cuh delete mode 100644 SRC/cub/block/block_load.cuh delete mode 100644 SRC/cub/block/block_radix_rank.cuh delete mode 100644 SRC/cub/block/block_radix_sort.cuh delete mode 100644 SRC/cub/block/block_raking_layout.cuh delete mode 100644 SRC/cub/block/block_reduce.cuh delete mode 100644 SRC/cub/block/block_scan.cuh delete mode 100644 SRC/cub/block/block_shuffle.cuh delete mode 100644 SRC/cub/block/block_store.cuh delete mode 100644 SRC/cub/block/specializations/block_histogram_atomic.cuh delete mode 100644 SRC/cub/block/specializations/block_histogram_sort.cuh delete mode 100644 SRC/cub/block/specializations/block_reduce_raking.cuh delete mode 100644 SRC/cub/block/specializations/block_reduce_raking_commutative_only.cuh delete mode 100644 SRC/cub/block/specializations/block_reduce_warp_reductions.cuh delete mode 100644 SRC/cub/block/specializations/block_scan_raking.cuh delete mode 100644 SRC/cub/block/specializations/block_scan_warp_scans.cuh delete mode 100644 SRC/cub/block/specializations/block_scan_warp_scans2.cuh delete mode 100644 SRC/cub/block/specializations/block_scan_warp_scans3.cuh delete mode 100644 SRC/cub/cub.cuh delete mode 100644 SRC/cub/device/device_histogram.cuh delete mode 100644 SRC/cub/device/device_partition.cuh delete mode 100644 SRC/cub/device/device_radix_sort.cuh delete mode 100644 SRC/cub/device/device_reduce.cuh delete mode 100644 SRC/cub/device/device_run_length_encode.cuh delete 
mode 100644 SRC/cub/device/device_scan.cuh delete mode 100644 SRC/cub/device/device_segmented_radix_sort.cuh delete mode 100644 SRC/cub/device/device_segmented_reduce.cuh delete mode 100644 SRC/cub/device/device_select.cuh delete mode 100644 SRC/cub/device/device_spmv.cuh delete mode 100644 SRC/cub/device/dispatch/dispatch_histogram.cuh delete mode 100644 SRC/cub/device/dispatch/dispatch_radix_sort.cuh delete mode 100644 SRC/cub/device/dispatch/dispatch_reduce.cuh delete mode 100644 SRC/cub/device/dispatch/dispatch_reduce_by_key.cuh delete mode 100644 SRC/cub/device/dispatch/dispatch_rle.cuh delete mode 100644 SRC/cub/device/dispatch/dispatch_scan.cuh delete mode 100644 SRC/cub/device/dispatch/dispatch_select_if.cuh delete mode 100644 SRC/cub/device/dispatch/dispatch_spmv_orig.cuh delete mode 100644 SRC/cub/grid/grid_barrier.cuh delete mode 100644 SRC/cub/grid/grid_even_share.cuh delete mode 100644 SRC/cub/grid/grid_mapping.cuh delete mode 100644 SRC/cub/grid/grid_queue.cuh delete mode 100644 SRC/cub/host/mutex.cuh delete mode 100644 SRC/cub/iterator/arg_index_input_iterator.cuh delete mode 100644 SRC/cub/iterator/cache_modified_input_iterator.cuh delete mode 100644 SRC/cub/iterator/cache_modified_output_iterator.cuh delete mode 100644 SRC/cub/iterator/constant_input_iterator.cuh delete mode 100644 SRC/cub/iterator/counting_input_iterator.cuh delete mode 100644 SRC/cub/iterator/discard_output_iterator.cuh delete mode 100644 SRC/cub/iterator/tex_obj_input_iterator.cuh delete mode 100644 SRC/cub/iterator/tex_ref_input_iterator.cuh delete mode 100644 SRC/cub/iterator/transform_input_iterator.cuh delete mode 100644 SRC/cub/thread/thread_load.cuh delete mode 100644 SRC/cub/thread/thread_operators.cuh delete mode 100644 SRC/cub/thread/thread_reduce.cuh delete mode 100644 SRC/cub/thread/thread_scan.cuh delete mode 100644 SRC/cub/thread/thread_search.cuh delete mode 100644 SRC/cub/thread/thread_store.cuh delete mode 100644 SRC/cub/util_allocator.cuh delete mode 100644 SRC/cub/util_arch.cuh delete mode 100644 SRC/cub/util_debug.cuh delete mode 100644 SRC/cub/util_device.cuh delete mode 100644 SRC/cub/util_macro.cuh delete mode 100644 SRC/cub/util_namespace.cuh delete mode 100644 SRC/cub/util_ptx.cuh delete mode 100644 SRC/cub/util_type.cuh delete mode 100644 SRC/cub/warp/specializations/warp_reduce_shfl.cuh delete mode 100644 SRC/cub/warp/specializations/warp_reduce_smem.cuh delete mode 100644 SRC/cub/warp/specializations/warp_scan_shfl.cuh delete mode 100644 SRC/cub/warp/specializations/warp_scan_smem.cuh delete mode 100644 SRC/cub/warp/warp_reduce.cuh delete mode 100644 SRC/cub/warp/warp_scan.cuh diff --git a/SRC/cub/agent/agent_histogram.cuh b/SRC/cub/agent/agent_histogram.cuh deleted file mode 100644 index 37b1ec97..00000000 --- a/SRC/cub/agent/agent_histogram.cuh +++ /dev/null @@ -1,787 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. 
- * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram . - */ - -#pragma once - -#include - -#include "../util_type.cuh" -#include "../block/block_load.cuh" -#include "../grid/grid_queue.cuh" -#include "../iterator/cache_modified_input_iterator.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/****************************************************************************** - * Tuning policy - ******************************************************************************/ - -/** - * - */ -enum BlockHistogramMemoryPreference -{ - GMEM, - SMEM, - BLEND -}; - - -/** - * Parameterizable tuning policy type for AgentHistogram - */ -template < - int _BLOCK_THREADS, ///< Threads per thread block - int _PIXELS_PER_THREAD, ///< Pixels per thread (per tile of input) - BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use - CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements - bool _RLE_COMPRESS, ///< Whether to perform localized RLE to compress samples before histogramming - BlockHistogramMemoryPreference _MEM_PREFERENCE, ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins) - bool _WORK_STEALING> ///< Whether to dequeue tiles from a global work queue -struct AgentHistogramPolicy -{ - enum - { - BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block - PIXELS_PER_THREAD = _PIXELS_PER_THREAD, ///< Pixels per thread (per tile of input) - IS_RLE_COMPRESS = _RLE_COMPRESS, ///< Whether to perform localized RLE to compress samples before histogramming - MEM_PREFERENCE = _MEM_PREFERENCE, ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins) - IS_WORK_STEALING = _WORK_STEALING, ///< Whether to dequeue tiles from a global work queue - }; - - static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use - static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements -}; - - -/****************************************************************************** - * 
Thread block abstractions - ******************************************************************************/ - -/** - * \brief AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram . - */ -template < - typename AgentHistogramPolicyT, ///< Parameterized AgentHistogramPolicy tuning policy type - int PRIVATIZED_SMEM_BINS, ///< Number of privatized shared-memory histogram bins of any channel. Zero indicates privatized counters to be maintained in device-accessible memory. - int NUM_CHANNELS, ///< Number of channels interleaved in the input data. Supports up to four channels. - int NUM_ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed - typename SampleIteratorT, ///< Random-access input iterator type for reading samples - typename CounterT, ///< Integer type for counting sample occurrences per histogram bin - typename PrivatizedDecodeOpT, ///< The transform operator type for determining privatized counter indices from samples, one for each channel - typename OutputDecodeOpT, ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel - typename OffsetT, ///< Signed integer type for global offsets - int PTX_ARCH = CUB_PTX_ARCH> ///< PTX compute capability -struct AgentHistogram -{ - //--------------------------------------------------------------------- - // Types and constants - //--------------------------------------------------------------------- - - /// The sample type of the input iterator - typedef typename std::iterator_traits::value_type SampleT; - - /// The pixel type of SampleT - typedef typename CubVector::Type PixelT; - - /// The quad type of SampleT - typedef typename CubVector::Type QuadT; - - /// Constants - enum - { - BLOCK_THREADS = AgentHistogramPolicyT::BLOCK_THREADS, - - PIXELS_PER_THREAD = AgentHistogramPolicyT::PIXELS_PER_THREAD, - SAMPLES_PER_THREAD = PIXELS_PER_THREAD * NUM_CHANNELS, - QUADS_PER_THREAD = SAMPLES_PER_THREAD / 4, - - TILE_PIXELS = PIXELS_PER_THREAD * BLOCK_THREADS, - TILE_SAMPLES = SAMPLES_PER_THREAD * BLOCK_THREADS, - - IS_RLE_COMPRESS = AgentHistogramPolicyT::IS_RLE_COMPRESS, - - MEM_PREFERENCE = (PRIVATIZED_SMEM_BINS > 0) ? 
- AgentHistogramPolicyT::MEM_PREFERENCE : - GMEM, - - IS_WORK_STEALING = AgentHistogramPolicyT::IS_WORK_STEALING, - }; - - /// Cache load modifier for reading input elements - static const CacheLoadModifier LOAD_MODIFIER = AgentHistogramPolicyT::LOAD_MODIFIER; - - - /// Input iterator wrapper type (for applying cache modifier) - typedef typename If::VALUE, - CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedInputIterator - SampleIteratorT>::Type // Directly use the supplied input iterator type - WrappedSampleIteratorT; - - /// Pixel input iterator type (for applying cache modifier) - typedef CacheModifiedInputIterator - WrappedPixelIteratorT; - - /// Qaud input iterator type (for applying cache modifier) - typedef CacheModifiedInputIterator - WrappedQuadIteratorT; - - /// Parameterized BlockLoad type for samples - typedef BlockLoad< - SampleT, - BLOCK_THREADS, - SAMPLES_PER_THREAD, - AgentHistogramPolicyT::LOAD_ALGORITHM> - BlockLoadSampleT; - - /// Parameterized BlockLoad type for pixels - typedef BlockLoad< - PixelT, - BLOCK_THREADS, - PIXELS_PER_THREAD, - AgentHistogramPolicyT::LOAD_ALGORITHM> - BlockLoadPixelT; - - /// Parameterized BlockLoad type for quads - typedef BlockLoad< - QuadT, - BLOCK_THREADS, - QUADS_PER_THREAD, - AgentHistogramPolicyT::LOAD_ALGORITHM> - BlockLoadQuadT; - - /// Shared memory type required by this thread block - struct _TempStorage - { - CounterT histograms[NUM_ACTIVE_CHANNELS][PRIVATIZED_SMEM_BINS + 1]; // Smem needed for block-privatized smem histogram (with 1 word of padding) - - int tile_idx; - - // Aliasable storage layout - union Aliasable - { - typename BlockLoadSampleT::TempStorage sample_load; // Smem needed for loading a tile of samples - typename BlockLoadPixelT::TempStorage pixel_load; // Smem needed for loading a tile of pixels - typename BlockLoadQuadT::TempStorage quad_load; // Smem needed for loading a tile of quads - - } aliasable; - }; - - - /// Temporary storage type (unionable) - struct TempStorage : Uninitialized<_TempStorage> {}; - - - //--------------------------------------------------------------------- - // Per-thread fields - //--------------------------------------------------------------------- - - /// Reference to temp_storage - _TempStorage &temp_storage; - - /// Sample input iterator (with cache modifier applied, if possible) - WrappedSampleIteratorT d_wrapped_samples; - - /// Native pointer for input samples (possibly NULL if unavailable) - SampleT* d_native_samples; - - /// The number of output bins for each channel - int (&num_output_bins)[NUM_ACTIVE_CHANNELS]; - - /// The number of privatized bins for each channel - int (&num_privatized_bins)[NUM_ACTIVE_CHANNELS]; - - /// Reference to gmem privatized histograms for each channel - CounterT* d_privatized_histograms[NUM_ACTIVE_CHANNELS]; - - /// Reference to final output histograms (gmem) - CounterT* (&d_output_histograms)[NUM_ACTIVE_CHANNELS]; - - /// The transform operator for determining output bin-ids from privatized counter indices, one for each channel - OutputDecodeOpT (&output_decode_op)[NUM_ACTIVE_CHANNELS]; - - /// The transform operator for determining privatized counter indices from samples, one for each channel - PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS]; - - /// Whether to prefer privatized smem counters vs privatized global counters - bool prefer_smem; - - - //--------------------------------------------------------------------- - // Initialize privatized bin counters - 
//--------------------------------------------------------------------- - - // Initialize privatized bin counters - __device__ __forceinline__ void InitBinCounters(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]) - { - // Initialize histogram bin counts to zeros - #pragma unroll - for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) - { - for (int privatized_bin = threadIdx.x; privatized_bin < num_privatized_bins[CHANNEL]; privatized_bin += BLOCK_THREADS) - { - privatized_histograms[CHANNEL][privatized_bin] = 0; - } - } - - // Barrier to make sure all threads are done updating counters - CTA_SYNC(); - } - - - // Initialize privatized bin counters. Specialized for privatized shared-memory counters - __device__ __forceinline__ void InitSmemBinCounters() - { - CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]; - - for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) - privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL]; - - InitBinCounters(privatized_histograms); - } - - - // Initialize privatized bin counters. Specialized for privatized global-memory counters - __device__ __forceinline__ void InitGmemBinCounters() - { - InitBinCounters(d_privatized_histograms); - } - - - //--------------------------------------------------------------------- - // Update final output histograms - //--------------------------------------------------------------------- - - // Update final output histograms from privatized histograms - __device__ __forceinline__ void StoreOutput(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]) - { - // Barrier to make sure all threads are done updating counters - CTA_SYNC(); - - // Apply privatized bin counts to output bin counts - #pragma unroll - for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) - { - int channel_bins = num_privatized_bins[CHANNEL]; - for (int privatized_bin = threadIdx.x; - privatized_bin < channel_bins; - privatized_bin += BLOCK_THREADS) - { - int output_bin = -1; - CounterT count = privatized_histograms[CHANNEL][privatized_bin]; - bool is_valid = count > 0; - - output_decode_op[CHANNEL].template BinSelect((SampleT) privatized_bin, output_bin, is_valid); - - if (output_bin >= 0) - { - atomicAdd(&d_output_histograms[CHANNEL][output_bin], count); - } - - } - } - } - - - // Update final output histograms from privatized histograms. Specialized for privatized shared-memory counters - __device__ __forceinline__ void StoreSmemOutput() - { - CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]; - for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) - privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL]; - - StoreOutput(privatized_histograms); - } - - - // Update final output histograms from privatized histograms. Specialized for privatized global-memory counters - __device__ __forceinline__ void StoreGmemOutput() - { - StoreOutput(d_privatized_histograms); - } - - - //--------------------------------------------------------------------- - // Tile accumulation - //--------------------------------------------------------------------- - - // Accumulate pixels. Specialized for RLE compression. 
- __device__ __forceinline__ void AccumulatePixels( - SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], - bool is_valid[PIXELS_PER_THREAD], - CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS], - Int2Type is_rle_compress) - { - #pragma unroll - for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) - { - // Bin pixels - int bins[PIXELS_PER_THREAD]; - - #pragma unroll - for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL) - { - bins[PIXEL] = -1; - privatized_decode_op[CHANNEL].template BinSelect(samples[PIXEL][CHANNEL], bins[PIXEL], is_valid[PIXEL]); - } - - CounterT accumulator = 1; - - #pragma unroll - for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD - 1; ++PIXEL) - { - if (bins[PIXEL] != bins[PIXEL + 1]) - { - if (bins[PIXEL] >= 0) - atomicAdd(privatized_histograms[CHANNEL] + bins[PIXEL], accumulator); - - accumulator = 0; - } - accumulator++; - } - - // Last pixel - if (bins[PIXELS_PER_THREAD - 1] >= 0) - atomicAdd(privatized_histograms[CHANNEL] + bins[PIXELS_PER_THREAD - 1], accumulator); - } - } - - - // Accumulate pixels. Specialized for individual accumulation of each pixel. - __device__ __forceinline__ void AccumulatePixels( - SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], - bool is_valid[PIXELS_PER_THREAD], - CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS], - Int2Type is_rle_compress) - { - #pragma unroll - for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL) - { - #pragma unroll - for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) - { - int bin = -1; - privatized_decode_op[CHANNEL].template BinSelect(samples[PIXEL][CHANNEL], bin, is_valid[PIXEL]); - if (bin >= 0) - atomicAdd(privatized_histograms[CHANNEL] + bin, 1); - } - } - } - - - /** - * Accumulate pixel, specialized for smem privatized histogram - */ - __device__ __forceinline__ void AccumulateSmemPixels( - SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], - bool is_valid[PIXELS_PER_THREAD]) - { - CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]; - - for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) - privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL]; - - AccumulatePixels(samples, is_valid, privatized_histograms, Int2Type()); - } - - - /** - * Accumulate pixel, specialized for gmem privatized histogram - */ - __device__ __forceinline__ void AccumulateGmemPixels( - SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], - bool is_valid[PIXELS_PER_THREAD]) - { - AccumulatePixels(samples, is_valid, d_privatized_histograms, Int2Type()); - } - - - - //--------------------------------------------------------------------- - // Tile loading - //--------------------------------------------------------------------- - - // Load full, aligned tile using pixel iterator (multi-channel) - template - __device__ __forceinline__ void LoadFullAlignedTile( - OffsetT block_offset, - int valid_samples, - SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], - Int2Type<_NUM_ACTIVE_CHANNELS> num_active_channels) - { - typedef PixelT AliasedPixels[PIXELS_PER_THREAD]; - - WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset)); - - // Load using a wrapped pixel iterator - BlockLoadPixelT(temp_storage.aliasable.pixel_load).Load( - d_wrapped_pixels, - reinterpret_cast(samples)); - } - - // Load full, aligned tile using quad iterator (single-channel) - __device__ __forceinline__ void LoadFullAlignedTile( - OffsetT block_offset, - int valid_samples, - SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], - Int2Type<1> num_active_channels) - { - typedef QuadT 
AliasedQuads[QUADS_PER_THREAD]; - - WrappedQuadIteratorT d_wrapped_quads((QuadT*) (d_native_samples + block_offset)); - - // Load using a wrapped quad iterator - BlockLoadQuadT(temp_storage.aliasable.quad_load).Load( - d_wrapped_quads, - reinterpret_cast(samples)); - } - - // Load full, aligned tile - __device__ __forceinline__ void LoadTile( - OffsetT block_offset, - int valid_samples, - SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], - Int2Type is_full_tile, - Int2Type is_aligned) - { - LoadFullAlignedTile(block_offset, valid_samples, samples, Int2Type()); - } - - // Load full, mis-aligned tile using sample iterator - __device__ __forceinline__ void LoadTile( - OffsetT block_offset, - int valid_samples, - SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], - Int2Type is_full_tile, - Int2Type is_aligned) - { - typedef SampleT AliasedSamples[SAMPLES_PER_THREAD]; - - // Load using sample iterator - BlockLoadSampleT(temp_storage.aliasable.sample_load).Load( - d_wrapped_samples + block_offset, - reinterpret_cast(samples)); - } - - // Load partially-full, aligned tile using the pixel iterator - __device__ __forceinline__ void LoadTile( - OffsetT block_offset, - int valid_samples, - SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], - Int2Type is_full_tile, - Int2Type is_aligned) - { - typedef PixelT AliasedPixels[PIXELS_PER_THREAD]; - - WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset)); - - int valid_pixels = valid_samples / NUM_CHANNELS; - - // Load using a wrapped pixel iterator - BlockLoadPixelT(temp_storage.aliasable.pixel_load).Load( - d_wrapped_pixels, - reinterpret_cast(samples), - valid_pixels); - } - - // Load partially-full, mis-aligned tile using sample iterator - __device__ __forceinline__ void LoadTile( - OffsetT block_offset, - int valid_samples, - SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], - Int2Type is_full_tile, - Int2Type is_aligned) - { - typedef SampleT AliasedSamples[SAMPLES_PER_THREAD]; - - BlockLoadSampleT(temp_storage.aliasable.sample_load).Load( - d_wrapped_samples + block_offset, - reinterpret_cast(samples), - valid_samples); - } - - - //--------------------------------------------------------------------- - // Tile processing - //--------------------------------------------------------------------- - - // Consume a tile of data samples - template < - bool IS_ALIGNED, // Whether the tile offset is aligned (quad-aligned for single-channel, pixel-aligned for multi-channel) - bool IS_FULL_TILE> // Whether the tile is full - __device__ __forceinline__ void ConsumeTile(OffsetT block_offset, int valid_samples) - { - SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS]; - bool is_valid[PIXELS_PER_THREAD]; - - // Load tile - LoadTile( - block_offset, - valid_samples, - samples, - Int2Type(), - Int2Type()); - - // Set valid flags - #pragma unroll - for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL) - is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x * PIXELS_PER_THREAD + PIXEL) * NUM_CHANNELS) < valid_samples); - - // Accumulate samples -#if CUB_PTX_ARCH >= 120 - if (prefer_smem) - AccumulateSmemPixels(samples, is_valid); - else - AccumulateGmemPixels(samples, is_valid); -#else - AccumulateGmemPixels(samples, is_valid); -#endif - - } - - - // Consume row tiles. 
Specialized for work-stealing from queue - template - __device__ __forceinline__ void ConsumeTiles( - OffsetT num_row_pixels, ///< The number of multi-channel pixels per row in the region of interest - OffsetT num_rows, ///< The number of rows in the region of interest - OffsetT row_stride_samples, ///< The number of samples between starts of consecutive rows in the region of interest - int tiles_per_row, ///< Number of image tiles per row - GridQueue tile_queue, - Int2Type is_work_stealing) - { - - int num_tiles = num_rows * tiles_per_row; - int tile_idx = (blockIdx.y * gridDim.x) + blockIdx.x; - OffsetT num_even_share_tiles = gridDim.x * gridDim.y; - - while (tile_idx < num_tiles) - { - int row = tile_idx / tiles_per_row; - int col = tile_idx - (row * tiles_per_row); - OffsetT row_offset = row * row_stride_samples; - OffsetT col_offset = (col * TILE_SAMPLES); - OffsetT tile_offset = row_offset + col_offset; - - if (col == tiles_per_row - 1) - { - // Consume a partially-full tile at the end of the row - OffsetT num_remaining = (num_row_pixels * NUM_CHANNELS) - col_offset; - ConsumeTile(tile_offset, num_remaining); - } - else - { - // Consume full tile - ConsumeTile(tile_offset, TILE_SAMPLES); - } - - CTA_SYNC(); - - // Get next tile - if (threadIdx.x == 0) - temp_storage.tile_idx = tile_queue.Drain(1) + num_even_share_tiles; - - CTA_SYNC(); - - tile_idx = temp_storage.tile_idx; - } - } - - - // Consume row tiles. Specialized for even-share (striped across thread blocks) - template - __device__ __forceinline__ void ConsumeTiles( - OffsetT num_row_pixels, ///< The number of multi-channel pixels per row in the region of interest - OffsetT num_rows, ///< The number of rows in the region of interest - OffsetT row_stride_samples, ///< The number of samples between starts of consecutive rows in the region of interest - int tiles_per_row, ///< Number of image tiles per row - GridQueue tile_queue, - Int2Type is_work_stealing) - { - for (int row = blockIdx.y; row < num_rows; row += gridDim.y) - { - OffsetT row_begin = row * row_stride_samples; - OffsetT row_end = row_begin + (num_row_pixels * NUM_CHANNELS); - OffsetT tile_offset = row_begin + (blockIdx.x * TILE_SAMPLES); - - while (tile_offset < row_end) - { - OffsetT num_remaining = row_end - tile_offset; - - if (num_remaining < TILE_SAMPLES) - { - // Consume partial tile - ConsumeTile(tile_offset, num_remaining); - break; - } - - // Consume full tile - ConsumeTile(tile_offset, TILE_SAMPLES); - tile_offset += gridDim.x * TILE_SAMPLES; - } - } - } - - - //--------------------------------------------------------------------- - // Parameter extraction - //--------------------------------------------------------------------- - - // Return a native pixel pointer (specialized for CacheModifiedInputIterator types) - template < - CacheLoadModifier _MODIFIER, - typename _ValueT, - typename _OffsetT> - __device__ __forceinline__ SampleT* NativePointer(CacheModifiedInputIterator<_MODIFIER, _ValueT, _OffsetT> itr) - { - return itr.ptr; - } - - // Return a native pixel pointer (specialized for other types) - template - __device__ __forceinline__ SampleT* NativePointer(IteratorT itr) - { - return NULL; - } - - - - //--------------------------------------------------------------------- - // Interface - //--------------------------------------------------------------------- - - - /** - * Constructor - */ - __device__ __forceinline__ AgentHistogram( - TempStorage &temp_storage, ///< Reference to temp_storage - SampleIteratorT d_samples, ///< Input data to reduce - 
int (&num_output_bins)[NUM_ACTIVE_CHANNELS], ///< The number bins per final output histogram - int (&num_privatized_bins)[NUM_ACTIVE_CHANNELS], ///< The number bins per privatized histogram - CounterT* (&d_output_histograms)[NUM_ACTIVE_CHANNELS], ///< Reference to final output histograms - CounterT* (&d_privatized_histograms)[NUM_ACTIVE_CHANNELS], ///< Reference to privatized histograms - OutputDecodeOpT (&output_decode_op)[NUM_ACTIVE_CHANNELS], ///< The transform operator for determining output bin-ids from privatized counter indices, one for each channel - PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS]) ///< The transform operator for determining privatized counter indices from samples, one for each channel - : - temp_storage(temp_storage.Alias()), - d_wrapped_samples(d_samples), - num_output_bins(num_output_bins), - num_privatized_bins(num_privatized_bins), - d_output_histograms(d_output_histograms), - privatized_decode_op(privatized_decode_op), - output_decode_op(output_decode_op), - d_native_samples(NativePointer(d_wrapped_samples)), - prefer_smem((MEM_PREFERENCE == SMEM) ? - true : // prefer smem privatized histograms - (MEM_PREFERENCE == GMEM) ? - false : // prefer gmem privatized histograms - blockIdx.x & 1) // prefer blended privatized histograms - { - int blockId = (blockIdx.y * gridDim.x) + blockIdx.x; - - // Initialize the locations of this block's privatized histograms - for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) - this->d_privatized_histograms[CHANNEL] = d_privatized_histograms[CHANNEL] + (blockId * num_privatized_bins[CHANNEL]); - } - - - /** - * Consume image - */ - __device__ __forceinline__ void ConsumeTiles( - OffsetT num_row_pixels, ///< The number of multi-channel pixels per row in the region of interest - OffsetT num_rows, ///< The number of rows in the region of interest - OffsetT row_stride_samples, ///< The number of samples between starts of consecutive rows in the region of interest - int tiles_per_row, ///< Number of image tiles per row - GridQueue tile_queue) ///< Queue descriptor for assigning tiles of work to thread blocks - { - // Check whether all row starting offsets are quad-aligned (in single-channel) or pixel-aligned (in multi-channel) - int quad_mask = AlignBytes::ALIGN_BYTES - 1; - int pixel_mask = AlignBytes::ALIGN_BYTES - 1; - size_t row_bytes = sizeof(SampleT) * row_stride_samples; - - bool quad_aligned_rows = (NUM_CHANNELS == 1) && (SAMPLES_PER_THREAD % 4 == 0) && // Single channel - ((size_t(d_native_samples) & quad_mask) == 0) && // ptr is quad-aligned - ((num_rows == 1) || ((row_bytes & quad_mask) == 0)); // number of row-samples is a multiple of the alignment of the quad - - bool pixel_aligned_rows = (NUM_CHANNELS > 1) && // Multi channel - ((size_t(d_native_samples) & pixel_mask) == 0) && // ptr is pixel-aligned - ((row_bytes & pixel_mask) == 0); // number of row-samples is a multiple of the alignment of the pixel - - // Whether rows are aligned and can be vectorized - if ((d_native_samples != NULL) && (quad_aligned_rows || pixel_aligned_rows)) - ConsumeTiles(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type()); - else - ConsumeTiles(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type()); - } - - - /** - * Initialize privatized bin counters. 
Specialized for privatized shared-memory counters - */ - __device__ __forceinline__ void InitBinCounters() - { - if (prefer_smem) - InitSmemBinCounters(); - else - InitGmemBinCounters(); - } - - - /** - * Store privatized histogram to device-accessible memory. Specialized for privatized shared-memory counters - */ - __device__ __forceinline__ void StoreOutput() - { - if (prefer_smem) - StoreSmemOutput(); - else - StoreGmemOutput(); - } - - -}; - - - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/SRC/cub/agent/agent_radix_sort_downsweep.cuh b/SRC/cub/agent/agent_radix_sort_downsweep.cuh deleted file mode 100644 index faea8813..00000000 --- a/SRC/cub/agent/agent_radix_sort_downsweep.cuh +++ /dev/null @@ -1,789 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * AgentRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep . 
- */ - - -#pragma once - -#include - -#include "../thread/thread_load.cuh" -#include "../block/block_load.cuh" -#include "../block/block_store.cuh" -#include "../block/block_radix_rank.cuh" -#include "../block/block_exchange.cuh" -#include "../util_type.cuh" -#include "../iterator/cache_modified_input_iterator.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/****************************************************************************** - * Tuning policy types - ******************************************************************************/ - -/** - * Radix ranking algorithm - */ -enum RadixRankAlgorithm -{ - RADIX_RANK_BASIC, - RADIX_RANK_MEMOIZE, - RADIX_RANK_MATCH -}; - -/** - * Parameterizable tuning policy type for AgentRadixSortDownsweep - */ -template < - int _BLOCK_THREADS, ///< Threads per thread block - int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) - BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use - CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading keys (and values) - RadixRankAlgorithm _RANK_ALGORITHM, ///< The radix ranking algorithm to use - BlockScanAlgorithm _SCAN_ALGORITHM, ///< The block scan algorithm to use - int _RADIX_BITS> ///< The number of radix bits, i.e., log2(bins) -struct AgentRadixSortDownsweepPolicy -{ - enum - { - BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block - ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) - RADIX_BITS = _RADIX_BITS, ///< The number of radix bits, i.e., log2(bins) - }; - - static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use - static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading keys (and values) - static const RadixRankAlgorithm RANK_ALGORITHM = _RANK_ALGORITHM; ///< The radix ranking algorithm to use - static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use -}; - - -/****************************************************************************** - * Thread block abstractions - ******************************************************************************/ - - - - - -/** - * \brief AgentRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep . 
- */ -template < - typename AgentRadixSortDownsweepPolicy, ///< Parameterized AgentRadixSortDownsweepPolicy tuning policy type - bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low - typename KeyT, ///< KeyT type - typename ValueT, ///< ValueT type - typename OffsetT> ///< Signed integer type for global offsets -struct AgentRadixSortDownsweep -{ - //--------------------------------------------------------------------- - // Type definitions and constants - //--------------------------------------------------------------------- - - // Appropriate unsigned-bits representation of KeyT - typedef typename Traits::UnsignedBits UnsignedBits; - - static const UnsignedBits LOWEST_KEY = Traits::LOWEST_KEY; - static const UnsignedBits MAX_KEY = Traits::MAX_KEY; - - static const BlockLoadAlgorithm LOAD_ALGORITHM = AgentRadixSortDownsweepPolicy::LOAD_ALGORITHM; - static const CacheLoadModifier LOAD_MODIFIER = AgentRadixSortDownsweepPolicy::LOAD_MODIFIER; - static const RadixRankAlgorithm RANK_ALGORITHM = AgentRadixSortDownsweepPolicy::RANK_ALGORITHM; - static const BlockScanAlgorithm SCAN_ALGORITHM = AgentRadixSortDownsweepPolicy::SCAN_ALGORITHM; - - enum - { - BLOCK_THREADS = AgentRadixSortDownsweepPolicy::BLOCK_THREADS, - ITEMS_PER_THREAD = AgentRadixSortDownsweepPolicy::ITEMS_PER_THREAD, - RADIX_BITS = AgentRadixSortDownsweepPolicy::RADIX_BITS, - TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - - RADIX_DIGITS = 1 << RADIX_BITS, - KEYS_ONLY = Equals::VALUE, - }; - - // Input iterator wrapper type (for applying cache modifier)s - typedef CacheModifiedInputIterator KeysItr; - typedef CacheModifiedInputIterator ValuesItr; - - // Radix ranking type to use - typedef typename If<(RANK_ALGORITHM == RADIX_RANK_BASIC), - BlockRadixRank, - typename If<(RANK_ALGORITHM == RADIX_RANK_MEMOIZE), - BlockRadixRank, - BlockRadixRankMatch - >::Type - >::Type BlockRadixRankT; - - enum - { - /// Number of bin-starting offsets tracked per thread - BINS_TRACKED_PER_THREAD = BlockRadixRankT::BINS_TRACKED_PER_THREAD - }; - - // BlockLoad type (keys) - typedef BlockLoad< - UnsignedBits, - BLOCK_THREADS, - ITEMS_PER_THREAD, - LOAD_ALGORITHM> BlockLoadKeysT; - - // BlockLoad type (values) - typedef BlockLoad< - ValueT, - BLOCK_THREADS, - ITEMS_PER_THREAD, - LOAD_ALGORITHM> BlockLoadValuesT; - - // Value exchange array type - typedef ValueT ValueExchangeT[TILE_ITEMS]; - - /** - * Shared memory storage layout - */ - union __align__(16) _TempStorage - { - typename BlockLoadKeysT::TempStorage load_keys; - typename BlockLoadValuesT::TempStorage load_values; - typename BlockRadixRankT::TempStorage radix_rank; - - struct - { - UnsignedBits exchange_keys[TILE_ITEMS]; - OffsetT relative_bin_offsets[RADIX_DIGITS]; - }; - - Uninitialized exchange_values; - - OffsetT exclusive_digit_prefix[RADIX_DIGITS]; - }; - - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - //--------------------------------------------------------------------- - // Thread fields - //--------------------------------------------------------------------- - - // Shared storage for this CTA - _TempStorage &temp_storage; - - // Input and output device pointers - KeysItr d_keys_in; - ValuesItr d_values_in; - UnsignedBits *d_keys_out; - ValueT *d_values_out; - - // The global scatter base offset for each digit (valid in the first RADIX_DIGITS threads) - OffsetT bin_offset[BINS_TRACKED_PER_THREAD]; - - // The least-significant bit position of the current digit to extract - int current_bit; - - // 
Number of bits in current digit - int num_bits; - - // Whether to short-circuit - int short_circuit; - - //--------------------------------------------------------------------- - // Utility methods - //--------------------------------------------------------------------- - - - /** - * Scatter ranked keys through shared memory, then to device-accessible memory - */ - template <bool FULL_TILE> - __device__ __forceinline__ void ScatterKeys( - UnsignedBits (&twiddled_keys)[ITEMS_PER_THREAD], - OffsetT (&relative_bin_offsets)[ITEMS_PER_THREAD], - int (&ranks)[ITEMS_PER_THREAD], - OffsetT valid_items) - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - temp_storage.exchange_keys[ranks[ITEM]] = twiddled_keys[ITEM]; - } - - CTA_SYNC(); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - UnsignedBits key = temp_storage.exchange_keys[threadIdx.x + (ITEM * BLOCK_THREADS)]; - UnsignedBits digit = BFE(key, current_bit, num_bits); - relative_bin_offsets[ITEM] = temp_storage.relative_bin_offsets[digit]; - - // Un-twiddle - key = Traits<KeyT>::TwiddleOut(key); - - if (FULL_TILE || - (static_cast<OffsetT>(threadIdx.x + (ITEM * BLOCK_THREADS)) < valid_items)) - { - d_keys_out[relative_bin_offsets[ITEM] + threadIdx.x + (ITEM * BLOCK_THREADS)] = key; - } - } - } - - - /** - * Scatter ranked values through shared memory, then to device-accessible memory - */ - template <bool FULL_TILE> - __device__ __forceinline__ void ScatterValues( - ValueT (&values)[ITEMS_PER_THREAD], - OffsetT (&relative_bin_offsets)[ITEMS_PER_THREAD], - int (&ranks)[ITEMS_PER_THREAD], - OffsetT valid_items) - { - CTA_SYNC(); - - ValueExchangeT &exchange_values = temp_storage.exchange_values.Alias(); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - exchange_values[ranks[ITEM]] = values[ITEM]; - } - - CTA_SYNC(); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - ValueT value = exchange_values[threadIdx.x + (ITEM * BLOCK_THREADS)]; - - if (FULL_TILE || - (static_cast<OffsetT>(threadIdx.x + (ITEM * BLOCK_THREADS)) < valid_items)) - { - d_values_out[relative_bin_offsets[ITEM] + threadIdx.x + (ITEM * BLOCK_THREADS)] = value; - } - } - } - - /** - * Load a tile of keys (specialized for full tile, any ranking algorithm) - */ - template <int _RANK_ALGORITHM> - __device__ __forceinline__ void LoadKeys( - UnsignedBits (&keys)[ITEMS_PER_THREAD], - OffsetT block_offset, - OffsetT valid_items, - UnsignedBits oob_item, - Int2Type<true> is_full_tile, - Int2Type<_RANK_ALGORITHM> rank_algorithm) - { - BlockLoadKeysT(temp_storage.load_keys).Load( - d_keys_in + block_offset, keys); - - CTA_SYNC(); - } - - - /** - * Load a tile of keys (specialized for partial tile, any ranking algorithm) - */ - template <int _RANK_ALGORITHM> - __device__ __forceinline__ void LoadKeys( - UnsignedBits (&keys)[ITEMS_PER_THREAD], - OffsetT block_offset, - OffsetT valid_items, - UnsignedBits oob_item, - Int2Type<false> is_full_tile, - Int2Type<_RANK_ALGORITHM> rank_algorithm) - { - // Register pressure work-around: moving valid_items through shfl prevents compiler - // from reusing guards/addressing from prior guarded loads - valid_items = ShuffleIndex<CUB_PTX_WARP_THREADS>(valid_items, 0, 0xffffffff); - - BlockLoadKeysT(temp_storage.load_keys).Load( - d_keys_in + block_offset, keys, valid_items, oob_item); - - CTA_SYNC(); - } - - - /** - * Load a tile of keys (specialized for full tile, match ranking algorithm) - */ - __device__ __forceinline__ void LoadKeys( - UnsignedBits (&keys)[ITEMS_PER_THREAD], - OffsetT block_offset, - OffsetT valid_items, - UnsignedBits oob_item, - Int2Type<true> is_full_tile, -
Int2Type rank_algorithm) - { - LoadDirectWarpStriped(threadIdx.x, d_keys_in + block_offset, keys); - } - - - /** - * Load a tile of keys (specialized for partial tile, match ranking algorithm) - */ - __device__ __forceinline__ void LoadKeys( - UnsignedBits (&keys)[ITEMS_PER_THREAD], - OffsetT block_offset, - OffsetT valid_items, - UnsignedBits oob_item, - Int2Type is_full_tile, - Int2Type rank_algorithm) - { - // Register pressure work-around: moving valid_items through shfl prevents compiler - // from reusing guards/addressing from prior guarded loads - valid_items = ShuffleIndex(valid_items, 0, 0xffffffff); - - LoadDirectWarpStriped(threadIdx.x, d_keys_in + block_offset, keys, valid_items, oob_item); - } - - - /** - * Load a tile of values (specialized for full tile, any ranking algorithm) - */ - template - __device__ __forceinline__ void LoadValues( - ValueT (&values)[ITEMS_PER_THREAD], - OffsetT block_offset, - OffsetT valid_items, - Int2Type is_full_tile, - Int2Type<_RANK_ALGORITHM> rank_algorithm) - { - BlockLoadValuesT(temp_storage.load_values).Load( - d_values_in + block_offset, values); - - CTA_SYNC(); - } - - - /** - * Load a tile of values (specialized for partial tile, any ranking algorithm) - */ - template - __device__ __forceinline__ void LoadValues( - ValueT (&values)[ITEMS_PER_THREAD], - OffsetT block_offset, - OffsetT valid_items, - Int2Type is_full_tile, - Int2Type<_RANK_ALGORITHM> rank_algorithm) - { - // Register pressure work-around: moving valid_items through shfl prevents compiler - // from reusing guards/addressing from prior guarded loads - valid_items = ShuffleIndex(valid_items, 0, 0xffffffff); - - BlockLoadValuesT(temp_storage.load_values).Load( - d_values_in + block_offset, values, valid_items); - - CTA_SYNC(); - } - - - /** - * Load a tile of items (specialized for full tile, match ranking algorithm) - */ - __device__ __forceinline__ void LoadValues( - ValueT (&values)[ITEMS_PER_THREAD], - OffsetT block_offset, - OffsetT valid_items, - Int2Type is_full_tile, - Int2Type rank_algorithm) - { - LoadDirectWarpStriped(threadIdx.x, d_values_in + block_offset, values); - } - - - /** - * Load a tile of items (specialized for partial tile, match ranking algorithm) - */ - __device__ __forceinline__ void LoadValues( - ValueT (&values)[ITEMS_PER_THREAD], - OffsetT block_offset, - OffsetT valid_items, - Int2Type is_full_tile, - Int2Type rank_algorithm) - { - // Register pressure work-around: moving valid_items through shfl prevents compiler - // from reusing guards/addressing from prior guarded loads - valid_items = ShuffleIndex(valid_items, 0, 0xffffffff); - - LoadDirectWarpStriped(threadIdx.x, d_values_in + block_offset, values, valid_items); - } - - - /** - * Truck along associated values - */ - template - __device__ __forceinline__ void GatherScatterValues( - OffsetT (&relative_bin_offsets)[ITEMS_PER_THREAD], - int (&ranks)[ITEMS_PER_THREAD], - OffsetT block_offset, - OffsetT valid_items, - Int2Type /*is_keys_only*/) - { - ValueT values[ITEMS_PER_THREAD]; - - CTA_SYNC(); - - LoadValues( - values, - block_offset, - valid_items, - Int2Type(), - Int2Type()); - - ScatterValues( - values, - relative_bin_offsets, - ranks, - valid_items); - } - - - /** - * Truck along associated values (specialized for key-only sorting) - */ - template - __device__ __forceinline__ void GatherScatterValues( - OffsetT (&/*relative_bin_offsets*/)[ITEMS_PER_THREAD], - int (&/*ranks*/)[ITEMS_PER_THREAD], - OffsetT /*block_offset*/, - OffsetT /*valid_items*/, - Int2Type /*is_keys_only*/) - {} - - - 
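The load and scatter machinery above, and ProcessTile below, lean on two bit-level tricks: keys are "twiddled" into an order-preserving unsigned representation, and each pass pulls one digit out with BFE. A minimal standalone sketch of both, assuming 32-bit signed integer keys (twiddle_in and extract_digit are illustrative names, not CUB's API):

__device__ __forceinline__ unsigned int twiddle_in(int key)
{
    // Flip the sign bit so two's-complement ints compare correctly as
    // unsigned radix keys (the role Traits<KeyT>::TwiddleIn plays here).
    return static_cast<unsigned int>(key) ^ 0x80000000u;
}

__device__ __forceinline__ unsigned int extract_digit(unsigned int key,
                                                      int current_bit,
                                                      int num_bits)
{
    // Software equivalent of BFE(key, current_bit, num_bits): shift the
    // digit down to bit 0 and mask off everything above it.
    return (key >> current_bit) & ((1u << num_bits) - 1u);
}

ProcessTile twiddles each key once on load, ranks and scatters in twiddled space, and ScatterKeys un-twiddles on the way out to global memory.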
/** - * Process tile - */ - template - __device__ __forceinline__ void ProcessTile( - OffsetT block_offset, - const OffsetT &valid_items = TILE_ITEMS) - { - UnsignedBits keys[ITEMS_PER_THREAD]; - int ranks[ITEMS_PER_THREAD]; - OffsetT relative_bin_offsets[ITEMS_PER_THREAD]; - - // Assign default (min/max) value to all keys - UnsignedBits default_key = (IS_DESCENDING) ? LOWEST_KEY : MAX_KEY; - - // Load tile of keys - LoadKeys( - keys, - block_offset, - valid_items, - default_key, - Int2Type(), - Int2Type()); - - // Twiddle key bits if necessary - #pragma unroll - for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) - { - keys[KEY] = Traits::TwiddleIn(keys[KEY]); - } - - // Rank the twiddled keys - int exclusive_digit_prefix[BINS_TRACKED_PER_THREAD]; - BlockRadixRankT(temp_storage.radix_rank).RankKeys( - keys, - ranks, - current_bit, - num_bits, - exclusive_digit_prefix); - - CTA_SYNC(); - - // Share exclusive digit prefix - #pragma unroll - for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) - { - int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; - if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) - { - // Store exclusive prefix - temp_storage.exclusive_digit_prefix[bin_idx] = - exclusive_digit_prefix[track]; - } - } - - CTA_SYNC(); - - // Get inclusive digit prefix - int inclusive_digit_prefix[BINS_TRACKED_PER_THREAD]; - - #pragma unroll - for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) - { - int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; - if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) - { - if (IS_DESCENDING) - { - // Get inclusive digit prefix from exclusive prefix (higher bins come first) - inclusive_digit_prefix[track] = (bin_idx == 0) ? - (BLOCK_THREADS * ITEMS_PER_THREAD) : - temp_storage.exclusive_digit_prefix[bin_idx - 1]; - } - else - { - // Get inclusive digit prefix from exclusive prefix (lower bins come first) - inclusive_digit_prefix[track] = (bin_idx == RADIX_DIGITS - 1) ? 
- (BLOCK_THREADS * ITEMS_PER_THREAD) : - temp_storage.exclusive_digit_prefix[bin_idx + 1]; - } - } - } - - CTA_SYNC(); - - // Update global scatter base offsets for each digit - #pragma unroll - for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) - { - int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; - if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) - { - bin_offset[track] -= exclusive_digit_prefix[track]; - temp_storage.relative_bin_offsets[bin_idx] = bin_offset[track]; - bin_offset[track] += inclusive_digit_prefix[track]; - } - } - - CTA_SYNC(); - - // Scatter keys - ScatterKeys(keys, relative_bin_offsets, ranks, valid_items); - - // Gather/scatter values - GatherScatterValues(relative_bin_offsets , ranks, block_offset, valid_items, Int2Type()); - } - - //--------------------------------------------------------------------- - // Copy shortcut - //--------------------------------------------------------------------- - - /** - * Copy tiles within the range of input - */ - template < - typename InputIteratorT, - typename T> - __device__ __forceinline__ void Copy( - InputIteratorT d_in, - T *d_out, - OffsetT block_offset, - OffsetT block_end) - { - // Simply copy the input - while (block_offset + TILE_ITEMS <= block_end) - { - T items[ITEMS_PER_THREAD]; - - LoadDirectStriped(threadIdx.x, d_in + block_offset, items); - CTA_SYNC(); - StoreDirectStriped(threadIdx.x, d_out + block_offset, items); - - block_offset += TILE_ITEMS; - } - - // Clean up last partial tile with guarded-I/O - if (block_offset < block_end) - { - OffsetT valid_items = block_end - block_offset; - - T items[ITEMS_PER_THREAD]; - - LoadDirectStriped(threadIdx.x, d_in + block_offset, items, valid_items); - CTA_SYNC(); - StoreDirectStriped(threadIdx.x, d_out + block_offset, items, valid_items); - } - } - - - /** - * Copy tiles within the range of input (specialized for NullType) - */ - template - __device__ __forceinline__ void Copy( - InputIteratorT /*d_in*/, - NullType * /*d_out*/, - OffsetT /*block_offset*/, - OffsetT /*block_end*/) - {} - - - //--------------------------------------------------------------------- - // Interface - //--------------------------------------------------------------------- - - /** - * Constructor - */ - __device__ __forceinline__ AgentRadixSortDownsweep( - TempStorage &temp_storage, - OffsetT (&bin_offset)[BINS_TRACKED_PER_THREAD], - OffsetT num_items, - const KeyT *d_keys_in, - KeyT *d_keys_out, - const ValueT *d_values_in, - ValueT *d_values_out, - int current_bit, - int num_bits) - : - temp_storage(temp_storage.Alias()), - d_keys_in(reinterpret_cast(d_keys_in)), - d_values_in(d_values_in), - d_keys_out(reinterpret_cast(d_keys_out)), - d_values_out(d_values_out), - current_bit(current_bit), - num_bits(num_bits), - short_circuit(1) - { - #pragma unroll - for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) - { - this->bin_offset[track] = bin_offset[track]; - - int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; - if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) - { - // Short circuit if the histogram has only bin counts of only zeros or problem-size - short_circuit = short_circuit && ((bin_offset[track] == 0) || (bin_offset[track] == num_items)); - } - } - - short_circuit = CTA_SYNC_AND(short_circuit); - } - - - /** - * Constructor - */ - __device__ __forceinline__ AgentRadixSortDownsweep( - TempStorage &temp_storage, - OffsetT num_items, - OffsetT *d_spine, - const KeyT *d_keys_in, - KeyT *d_keys_out, - const ValueT 
*d_values_in, - ValueT *d_values_out, - int current_bit, - int num_bits) - : - temp_storage(temp_storage.Alias()), - d_keys_in(reinterpret_cast(d_keys_in)), - d_values_in(d_values_in), - d_keys_out(reinterpret_cast(d_keys_out)), - d_values_out(d_values_out), - current_bit(current_bit), - num_bits(num_bits), - short_circuit(1) - { - #pragma unroll - for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) - { - int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; - - // Load digit bin offsets (each of the first RADIX_DIGITS threads will load an offset for that digit) - if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) - { - if (IS_DESCENDING) - bin_idx = RADIX_DIGITS - bin_idx - 1; - - // Short circuit if the first block's histogram has only bin counts of only zeros or problem-size - OffsetT first_block_bin_offset = d_spine[gridDim.x * bin_idx]; - short_circuit = short_circuit && ((first_block_bin_offset == 0) || (first_block_bin_offset == num_items)); - - // Load my block's bin offset for my bin - bin_offset[track] = d_spine[(gridDim.x * bin_idx) + blockIdx.x]; - } - } - - short_circuit = CTA_SYNC_AND(short_circuit); - } - - - /** - * Distribute keys from a segment of input tiles. - */ - __device__ __forceinline__ void ProcessRegion( - OffsetT block_offset, - OffsetT block_end) - { - if (short_circuit) - { - // Copy keys - Copy(d_keys_in, d_keys_out, block_offset, block_end); - - // Copy values - Copy(d_values_in, d_values_out, block_offset, block_end); - } - else - { - // Process full tiles of tile_items - #pragma unroll 1 - while (block_offset + TILE_ITEMS <= block_end) - { - ProcessTile(block_offset); - block_offset += TILE_ITEMS; - - CTA_SYNC(); - } - - // Clean up last partial tile with guarded-I/O - if (block_offset < block_end) - { - ProcessTile(block_offset, block_end - block_offset); - } - - } - } - -}; - - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/SRC/cub/agent/agent_radix_sort_upsweep.cuh b/SRC/cub/agent/agent_radix_sort_upsweep.cuh deleted file mode 100644 index 2081cefb..00000000 --- a/SRC/cub/agent/agent_radix_sort_upsweep.cuh +++ /dev/null @@ -1,526 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep . - */ - -#pragma once - -#include "../thread/thread_reduce.cuh" -#include "../thread/thread_load.cuh" -#include "../warp/warp_reduce.cuh" -#include "../block/block_load.cuh" -#include "../util_type.cuh" -#include "../iterator/cache_modified_input_iterator.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/****************************************************************************** - * Tuning policy types - ******************************************************************************/ - -/** - * Parameterizable tuning policy type for AgentRadixSortUpsweep - */ -template < - int _BLOCK_THREADS, ///< Threads per thread block - int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) - CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading keys - int _RADIX_BITS> ///< The number of radix bits, i.e., log2(bins) -struct AgentRadixSortUpsweepPolicy -{ - enum - { - BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block - ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) - RADIX_BITS = _RADIX_BITS, ///< The number of radix bits, i.e., log2(bins) - }; - - static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading keys -}; - - -/****************************************************************************** - * Thread block abstractions - ******************************************************************************/ - -/** - * \brief AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep . 
- */ -template < - typename AgentRadixSortUpsweepPolicy, ///< Parameterized AgentRadixSortUpsweepPolicy tuning policy type - typename KeyT, ///< KeyT type - typename OffsetT> ///< Signed integer type for global offsets -struct AgentRadixSortUpsweep -{ - - //--------------------------------------------------------------------- - // Type definitions and constants - //--------------------------------------------------------------------- - - typedef typename Traits::UnsignedBits UnsignedBits; - - // Integer type for digit counters (to be packed into words of PackedCounters) - typedef unsigned char DigitCounter; - - // Integer type for packing DigitCounters into columns of shared memory banks - typedef unsigned int PackedCounter; - - static const CacheLoadModifier LOAD_MODIFIER = AgentRadixSortUpsweepPolicy::LOAD_MODIFIER; - - enum - { - RADIX_BITS = AgentRadixSortUpsweepPolicy::RADIX_BITS, - BLOCK_THREADS = AgentRadixSortUpsweepPolicy::BLOCK_THREADS, - KEYS_PER_THREAD = AgentRadixSortUpsweepPolicy::ITEMS_PER_THREAD, - - RADIX_DIGITS = 1 << RADIX_BITS, - - LOG_WARP_THREADS = CUB_PTX_LOG_WARP_THREADS, - WARP_THREADS = 1 << LOG_WARP_THREADS, - WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, - - TILE_ITEMS = BLOCK_THREADS * KEYS_PER_THREAD, - - BYTES_PER_COUNTER = sizeof(DigitCounter), - LOG_BYTES_PER_COUNTER = Log2::VALUE, - - PACKING_RATIO = sizeof(PackedCounter) / sizeof(DigitCounter), - LOG_PACKING_RATIO = Log2::VALUE, - - LOG_COUNTER_LANES = CUB_MAX(0, RADIX_BITS - LOG_PACKING_RATIO), - COUNTER_LANES = 1 << LOG_COUNTER_LANES, - - // To prevent counter overflow, we must periodically unpack and aggregate the - // digit counters back into registers. Each counter lane is assigned to a - // warp for aggregation. - - LANES_PER_WARP = CUB_MAX(1, (COUNTER_LANES + WARPS - 1) / WARPS), - - // Unroll tiles in batches without risk of counter overflow - UNROLL_COUNT = CUB_MIN(64, 255 / KEYS_PER_THREAD), - UNROLLED_ELEMENTS = UNROLL_COUNT * TILE_ITEMS, - }; - - - // Input iterator wrapper type (for applying cache modifier)s - typedef CacheModifiedInputIterator KeysItr; - - /** - * Shared memory storage layout - */ - union __align__(16) _TempStorage - { - DigitCounter thread_counters[COUNTER_LANES][BLOCK_THREADS][PACKING_RATIO]; - PackedCounter packed_thread_counters[COUNTER_LANES][BLOCK_THREADS]; - OffsetT block_counters[WARP_THREADS][RADIX_DIGITS]; - }; - - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - //--------------------------------------------------------------------- - // Thread fields (aggregate state bundle) - //--------------------------------------------------------------------- - - // Shared storage for this CTA - _TempStorage &temp_storage; - - // Thread-local counters for periodically aggregating composite-counter lanes - OffsetT local_counts[LANES_PER_WARP][PACKING_RATIO]; - - // Input and output device pointers - KeysItr d_keys_in; - - // The least-significant bit position of the current digit to extract - int current_bit; - - // Number of bits in current digit - int num_bits; - - - - //--------------------------------------------------------------------- - // Helper structure for templated iteration - //--------------------------------------------------------------------- - - // Iterate - template - struct Iterate - { - // BucketKeys - static __device__ __forceinline__ void BucketKeys( - AgentRadixSortUpsweep &cta, - UnsignedBits keys[KEYS_PER_THREAD]) - { - cta.Bucket(keys[COUNT]); - - // Next - 
Iterate::BucketKeys(cta, keys); - } - }; - - // Terminate - template - struct Iterate - { - // BucketKeys - static __device__ __forceinline__ void BucketKeys(AgentRadixSortUpsweep &/*cta*/, UnsignedBits /*keys*/[KEYS_PER_THREAD]) {} - }; - - - //--------------------------------------------------------------------- - // Utility methods - //--------------------------------------------------------------------- - - /** - * Decode a key and increment corresponding smem digit counter - */ - __device__ __forceinline__ void Bucket(UnsignedBits key) - { - // Perform transform op - UnsignedBits converted_key = Traits::TwiddleIn(key); - - // Extract current digit bits - UnsignedBits digit = BFE(converted_key, current_bit, num_bits); - - // Get sub-counter offset - UnsignedBits sub_counter = digit & (PACKING_RATIO - 1); - - // Get row offset - UnsignedBits row_offset = digit >> LOG_PACKING_RATIO; - - // Increment counter - temp_storage.thread_counters[row_offset][threadIdx.x][sub_counter]++; - } - - - /** - * Reset composite counters - */ - __device__ __forceinline__ void ResetDigitCounters() - { - #pragma unroll - for (int LANE = 0; LANE < COUNTER_LANES; LANE++) - { - temp_storage.packed_thread_counters[LANE][threadIdx.x] = 0; - } - } - - - /** - * Reset the unpacked counters in each thread - */ - __device__ __forceinline__ void ResetUnpackedCounters() - { - #pragma unroll - for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) - { - #pragma unroll - for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) - { - local_counts[LANE][UNPACKED_COUNTER] = 0; - } - } - } - - - /** - * Extracts and aggregates the digit counters for each counter lane - * owned by this warp - */ - __device__ __forceinline__ void UnpackDigitCounts() - { - unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; - unsigned int warp_tid = LaneId(); - - #pragma unroll - for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) - { - const int counter_lane = (LANE * WARPS) + warp_id; - if (counter_lane < COUNTER_LANES) - { - #pragma unroll - for (int PACKED_COUNTER = 0; PACKED_COUNTER < BLOCK_THREADS; PACKED_COUNTER += WARP_THREADS) - { - #pragma unroll - for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) - { - OffsetT counter = temp_storage.thread_counters[counter_lane][warp_tid + PACKED_COUNTER][UNPACKED_COUNTER]; - local_counts[LANE][UNPACKED_COUNTER] += counter; - } - } - } - } - } - - - /** - * Processes a single, full tile - */ - __device__ __forceinline__ void ProcessFullTile(OffsetT block_offset) - { - // Tile of keys - UnsignedBits keys[KEYS_PER_THREAD]; - - LoadDirectStriped(threadIdx.x, d_keys_in + block_offset, keys); - - // Prevent hoisting - CTA_SYNC(); - - // Bucket tile of keys - Iterate<0, KEYS_PER_THREAD>::BucketKeys(*this, keys); - } - - - /** - * Processes a single load (may have some threads masked off) - */ - __device__ __forceinline__ void ProcessPartialTile( - OffsetT block_offset, - const OffsetT &block_end) - { - // Process partial tile if necessary using single loads - block_offset += threadIdx.x; - while (block_offset < block_end) - { - // Load and bucket key - UnsignedBits key = d_keys_in[block_offset]; - Bucket(key); - block_offset += BLOCK_THREADS; - } - } - - - //--------------------------------------------------------------------- - // Interface - //--------------------------------------------------------------------- - - /** - * Constructor - */ - __device__ __forceinline__ AgentRadixSortUpsweep( - TempStorage &temp_storage, - const KeyT *d_keys_in, 
- int current_bit, - int num_bits) - : - temp_storage(temp_storage.Alias()), - d_keys_in(reinterpret_cast(d_keys_in)), - current_bit(current_bit), - num_bits(num_bits) - {} - - - /** - * Compute radix digit histograms from a segment of input tiles. - */ - __device__ __forceinline__ void ProcessRegion( - OffsetT block_offset, - const OffsetT &block_end) - { - // Reset digit counters in smem and unpacked counters in registers - ResetDigitCounters(); - ResetUnpackedCounters(); - - // Unroll batches of full tiles - while (block_offset + UNROLLED_ELEMENTS <= block_end) - { - for (int i = 0; i < UNROLL_COUNT; ++i) - { - ProcessFullTile(block_offset); - block_offset += TILE_ITEMS; - } - - CTA_SYNC(); - - // Aggregate back into local_count registers to prevent overflow - UnpackDigitCounts(); - - CTA_SYNC(); - - // Reset composite counters in lanes - ResetDigitCounters(); - } - - // Unroll single full tiles - while (block_offset + TILE_ITEMS <= block_end) - { - ProcessFullTile(block_offset); - block_offset += TILE_ITEMS; - } - - // Process partial tile if necessary - ProcessPartialTile( - block_offset, - block_end); - - CTA_SYNC(); - - // Aggregate back into local_count registers - UnpackDigitCounts(); - } - - - /** - * Extract counts (saving them to the external array) - */ - template - __device__ __forceinline__ void ExtractCounts( - OffsetT *counters, - int bin_stride = 1, - int bin_offset = 0) - { - unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; - unsigned int warp_tid = LaneId(); - - // Place unpacked digit counters in shared memory - #pragma unroll - for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) - { - int counter_lane = (LANE * WARPS) + warp_id; - if (counter_lane < COUNTER_LANES) - { - int digit_row = counter_lane << LOG_PACKING_RATIO; - - #pragma unroll - for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) - { - int bin_idx = digit_row + UNPACKED_COUNTER; - - temp_storage.block_counters[warp_tid][bin_idx] = - local_counts[LANE][UNPACKED_COUNTER]; - } - } - } - - CTA_SYNC(); - - // Rake-reduce bin_count reductions - - // Whole blocks - #pragma unroll - for (int BIN_BASE = RADIX_DIGITS % BLOCK_THREADS; - (BIN_BASE + BLOCK_THREADS) <= RADIX_DIGITS; - BIN_BASE += BLOCK_THREADS) - { - int bin_idx = BIN_BASE + threadIdx.x; - - OffsetT bin_count = 0; - #pragma unroll - for (int i = 0; i < WARP_THREADS; ++i) - bin_count += temp_storage.block_counters[i][bin_idx]; - - if (IS_DESCENDING) - bin_idx = RADIX_DIGITS - bin_idx - 1; - - counters[(bin_stride * bin_idx) + bin_offset] = bin_count; - } - - // Remainder - if ((RADIX_DIGITS % BLOCK_THREADS != 0) && (threadIdx.x < RADIX_DIGITS)) - { - int bin_idx = threadIdx.x; - - OffsetT bin_count = 0; - #pragma unroll - for (int i = 0; i < WARP_THREADS; ++i) - bin_count += temp_storage.block_counters[i][bin_idx]; - - if (IS_DESCENDING) - bin_idx = RADIX_DIGITS - bin_idx - 1; - - counters[(bin_stride * bin_idx) + bin_offset] = bin_count; - } - } - - - /** - * Extract counts - */ - template - __device__ __forceinline__ void ExtractCounts( - OffsetT (&bin_count)[BINS_TRACKED_PER_THREAD]) ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... 
(threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] - { - unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; - unsigned int warp_tid = LaneId(); - - // Place unpacked digit counters in shared memory - #pragma unroll - for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) - { - int counter_lane = (LANE * WARPS) + warp_id; - if (counter_lane < COUNTER_LANES) - { - int digit_row = counter_lane << LOG_PACKING_RATIO; - - #pragma unroll - for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) - { - int bin_idx = digit_row + UNPACKED_COUNTER; - - temp_storage.block_counters[warp_tid][bin_idx] = - local_counts[LANE][UNPACKED_COUNTER]; - } - } - } - - CTA_SYNC(); - - // Rake-reduce bin_count reductions - #pragma unroll - for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) - { - int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; - - if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) - { - bin_count[track] = 0; - - #pragma unroll - for (int i = 0; i < WARP_THREADS; ++i) - bin_count[track] += temp_storage.block_counters[i][bin_idx]; - } - } - } - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/SRC/cub/agent/agent_reduce.cuh b/SRC/cub/agent/agent_reduce.cuh deleted file mode 100644 index 000a905c..00000000 --- a/SRC/cub/agent/agent_reduce.cuh +++ /dev/null @@ -1,385 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::AgentReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction . 
- */ - -#pragma once - -#include - -#include "../block/block_load.cuh" -#include "../block/block_reduce.cuh" -#include "../grid/grid_mapping.cuh" -#include "../grid/grid_even_share.cuh" -#include "../util_type.cuh" -#include "../iterator/cache_modified_input_iterator.cuh" -#include "../util_namespace.cuh" - - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/****************************************************************************** - * Tuning policy types - ******************************************************************************/ - -/** - * Parameterizable tuning policy type for AgentReduce - */ -template < - int _BLOCK_THREADS, ///< Threads per thread block - int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) - int _VECTOR_LOAD_LENGTH, ///< Number of items per vectorized load - BlockReduceAlgorithm _BLOCK_ALGORITHM, ///< Cooperative block-wide reduction algorithm to use - CacheLoadModifier _LOAD_MODIFIER> ///< Cache load modifier for reading input elements -struct AgentReducePolicy -{ - enum - { - BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block - ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) - VECTOR_LOAD_LENGTH = _VECTOR_LOAD_LENGTH, ///< Number of items per vectorized load - }; - - static const BlockReduceAlgorithm BLOCK_ALGORITHM = _BLOCK_ALGORITHM; ///< Cooperative block-wide reduction algorithm to use - static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements -}; - - - -/****************************************************************************** - * Thread block abstractions - ******************************************************************************/ - -/** - * \brief AgentReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction . - * - * Each thread reduces only the values it loads. If \p FIRST_TILE, this - * partial reduction is stored into \p thread_aggregate. Otherwise it is - * accumulated into \p thread_aggregate. - */ -template < - typename AgentReducePolicy, ///< Parameterized AgentReducePolicy tuning policy type - typename InputIteratorT, ///< Random-access iterator type for input - typename OutputIteratorT, ///< Random-access iterator type for output - typename OffsetT, ///< Signed integer type for global offsets - typename ReductionOp> ///< Binary reduction operator type having member T operator()(const T &a, const T &b) -struct AgentReduce -{ - - //--------------------------------------------------------------------- - // Types and constants - //--------------------------------------------------------------------- - - /// The input value type - typedef typename std::iterator_traits::value_type InputT; - - /// The output value type - typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... then the input iterator's value type, - typename std::iterator_traits::value_type>::Type OutputT; // ... 
else the output iterator's value type - - /// Vector type of InputT for data movement - typedef typename CubVector::Type VectorT; - - /// Input iterator wrapper type (for applying cache modifier) - typedef typename If::VALUE, - CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedInputIterator - InputIteratorT>::Type // Directly use the supplied input iterator type - WrappedInputIteratorT; - - /// Constants - enum - { - BLOCK_THREADS = AgentReducePolicy::BLOCK_THREADS, - ITEMS_PER_THREAD = AgentReducePolicy::ITEMS_PER_THREAD, - VECTOR_LOAD_LENGTH = CUB_MIN(ITEMS_PER_THREAD, AgentReducePolicy::VECTOR_LOAD_LENGTH), - TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - - // Can vectorize according to the policy if the input iterator is a native pointer to a primitive type - ATTEMPT_VECTORIZATION = (VECTOR_LOAD_LENGTH > 1) && - (ITEMS_PER_THREAD % VECTOR_LOAD_LENGTH == 0) && - (IsPointer::VALUE) && Traits::PRIMITIVE, - - }; - - static const CacheLoadModifier LOAD_MODIFIER = AgentReducePolicy::LOAD_MODIFIER; - static const BlockReduceAlgorithm BLOCK_ALGORITHM = AgentReducePolicy::BLOCK_ALGORITHM; - - /// Parameterized BlockReduce primitive - typedef BlockReduce BlockReduceT; - - /// Shared memory type required by this thread block - struct _TempStorage - { - typename BlockReduceT::TempStorage reduce; - }; - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - //--------------------------------------------------------------------- - // Per-thread fields - //--------------------------------------------------------------------- - - _TempStorage& temp_storage; ///< Reference to temp_storage - InputIteratorT d_in; ///< Input data to reduce - WrappedInputIteratorT d_wrapped_in; ///< Wrapped input data to reduce - ReductionOp reduction_op; ///< Binary reduction operator - - - //--------------------------------------------------------------------- - // Utility - //--------------------------------------------------------------------- - - - // Whether or not the input is aligned with the vector type (specialized for types we can vectorize) - template - static __device__ __forceinline__ bool IsAligned( - Iterator d_in, - Int2Type /*can_vectorize*/) - { - return (size_t(d_in) & (sizeof(VectorT) - 1)) == 0; - } - - // Whether or not the input is aligned with the vector type (specialized for types we cannot vectorize) - template - static __device__ __forceinline__ bool IsAligned( - Iterator /*d_in*/, - Int2Type /*can_vectorize*/) - { - return false; - } - - - //--------------------------------------------------------------------- - // Constructor - //--------------------------------------------------------------------- - - /** - * Constructor - */ - __device__ __forceinline__ AgentReduce( - TempStorage& temp_storage, ///< Reference to temp_storage - InputIteratorT d_in, ///< Input data to reduce - ReductionOp reduction_op) ///< Binary reduction operator - : - temp_storage(temp_storage.Alias()), - d_in(d_in), - d_wrapped_in(d_in), - reduction_op(reduction_op) - {} - - - //--------------------------------------------------------------------- - // Tile consumption - //--------------------------------------------------------------------- - - /** - * Consume a full tile of input (non-vectorized) - */ - template - __device__ __forceinline__ void ConsumeTile( - OutputT &thread_aggregate, - OffsetT block_offset, ///< The offset the tile to consume - int /*valid_items*/, ///< The number of valid items in the tile - Int2Type 
/*is_full_tile*/, ///< Whether or not this is a full tile - Int2Type /*can_vectorize*/) ///< Whether or not we can vectorize loads - { - OutputT items[ITEMS_PER_THREAD]; - - // Load items in striped fashion - LoadDirectStriped(threadIdx.x, d_wrapped_in + block_offset, items); - - // Reduce items within each thread stripe - thread_aggregate = (IS_FIRST_TILE) ? - internal::ThreadReduce(items, reduction_op) : - internal::ThreadReduce(items, reduction_op, thread_aggregate); - } - - - /** - * Consume a full tile of input (vectorized) - */ - template - __device__ __forceinline__ void ConsumeTile( - OutputT &thread_aggregate, - OffsetT block_offset, ///< The offset the tile to consume - int /*valid_items*/, ///< The number of valid items in the tile - Int2Type /*is_full_tile*/, ///< Whether or not this is a full tile - Int2Type /*can_vectorize*/) ///< Whether or not we can vectorize loads - { - // Alias items as an array of VectorT and load it in striped fashion - enum { WORDS = ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH }; - - // Fabricate a vectorized input iterator - InputT *d_in_unqualified = const_cast(d_in) + block_offset + (threadIdx.x * VECTOR_LOAD_LENGTH); - CacheModifiedInputIterator d_vec_in( - reinterpret_cast(d_in_unqualified)); - - // Load items as vector items - InputT input_items[ITEMS_PER_THREAD]; - VectorT *vec_items = reinterpret_cast(input_items); - #pragma unroll - for (int i = 0; i < WORDS; ++i) - vec_items[i] = d_vec_in[BLOCK_THREADS * i]; - - // Convert from input type to output type - OutputT items[ITEMS_PER_THREAD]; - #pragma unroll - for (int i = 0; i < ITEMS_PER_THREAD; ++i) - items[i] = input_items[i]; - - // Reduce items within each thread stripe - thread_aggregate = (IS_FIRST_TILE) ? - internal::ThreadReduce(items, reduction_op) : - internal::ThreadReduce(items, reduction_op, thread_aggregate); - } - - - /** - * Consume a partial tile of input - */ - template - __device__ __forceinline__ void ConsumeTile( - OutputT &thread_aggregate, - OffsetT block_offset, ///< The offset the tile to consume - int valid_items, ///< The number of valid items in the tile - Int2Type /*is_full_tile*/, ///< Whether or not this is a full tile - Int2Type /*can_vectorize*/) ///< Whether or not we can vectorize loads - { - // Partial tile - int thread_offset = threadIdx.x; - - // Read first item - if ((IS_FIRST_TILE) && (thread_offset < valid_items)) - { - thread_aggregate = d_wrapped_in[block_offset + thread_offset]; - thread_offset += BLOCK_THREADS; - } - - // Continue reading items (block-striped) - while (thread_offset < valid_items) - { - OutputT item = d_wrapped_in[block_offset + thread_offset]; - thread_aggregate = reduction_op(thread_aggregate, item); - thread_offset += BLOCK_THREADS; - } - } - - - //--------------------------------------------------------------- - // Consume a contiguous segment of tiles - //--------------------------------------------------------------------- - - /** - * \brief Reduce a contiguous segment of input tiles - */ - template - __device__ __forceinline__ OutputT ConsumeRange( - GridEvenShare &even_share, ///< GridEvenShare descriptor - Int2Type can_vectorize) ///< Whether or not we can vectorize loads - { - OutputT thread_aggregate; - - if (even_share.block_offset + TILE_ITEMS > even_share.block_end) - { - // First tile isn't full (not all threads have valid items) - int valid_items = even_share.block_end - even_share.block_offset; - ConsumeTile(thread_aggregate, even_share.block_offset, valid_items, Int2Type(), can_vectorize); - return 
BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op, valid_items); - } - - // At least one full block - ConsumeTile(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type(), can_vectorize); - even_share.block_offset += even_share.block_stride; - - // Consume subsequent full tiles of input - while (even_share.block_offset + TILE_ITEMS <= even_share.block_end) - { - ConsumeTile(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type(), can_vectorize); - even_share.block_offset += even_share.block_stride; - } - - // Consume a partially-full tile - if (even_share.block_offset < even_share.block_end) - { - int valid_items = even_share.block_end - even_share.block_offset; - ConsumeTile(thread_aggregate, even_share.block_offset, valid_items, Int2Type(), can_vectorize); - } - - // Compute block-wide reduction (all threads have valid items) - return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op); - } - - - /** - * \brief Reduce a contiguous segment of input tiles - */ - __device__ __forceinline__ OutputT ConsumeRange( - OffsetT block_offset, ///< [in] Threadblock begin offset (inclusive) - OffsetT block_end) ///< [in] Threadblock end offset (exclusive) - { - GridEvenShare even_share; - even_share.template BlockInit(block_offset, block_end); - - return (IsAligned(d_in + block_offset, Int2Type())) ? - ConsumeRange(even_share, Int2Type()) : - ConsumeRange(even_share, Int2Type()); - } - - - /** - * Reduce a contiguous segment of input tiles - */ - __device__ __forceinline__ OutputT ConsumeTiles( - GridEvenShare &even_share) ///< [in] GridEvenShare descriptor - { - // Initialize GRID_MAPPING_STRIP_MINE even-share descriptor for this thread block - even_share.template BlockInit(); - - return (IsAligned(d_in, Int2Type())) ? - ConsumeRange(even_share, Int2Type()) : - ConsumeRange(even_share, Int2Type()); - - } - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/SRC/cub/agent/agent_reduce_by_key.cuh b/SRC/cub/agent/agent_reduce_by_key.cuh deleted file mode 100644 index 51964d3e..00000000 --- a/SRC/cub/agent/agent_reduce_by_key.cuh +++ /dev/null @@ -1,547 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::AgentReduceByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key. - */ - -#pragma once - -#include - -#include "single_pass_scan_operators.cuh" -#include "../block/block_load.cuh" -#include "../block/block_store.cuh" -#include "../block/block_scan.cuh" -#include "../block/block_discontinuity.cuh" -#include "../iterator/cache_modified_input_iterator.cuh" -#include "../iterator/constant_input_iterator.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/****************************************************************************** - * Tuning policy types - ******************************************************************************/ - -/** - * Parameterizable tuning policy type for AgentReduceByKey - */ -template < - int _BLOCK_THREADS, ///< Threads per thread block - int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) - BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use - CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements - BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use -struct AgentReduceByKeyPolicy -{ - enum - { - BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block - ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) - }; - - static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use - static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements - static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use -}; - - -/****************************************************************************** - * Thread block abstractions - ******************************************************************************/ - -/** - * \brief AgentReduceByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key - */ -template < - typename AgentReduceByKeyPolicyT, ///< Parameterized AgentReduceByKeyPolicy tuning policy type - typename KeysInputIteratorT, ///< Random-access input iterator type for keys - typename UniqueOutputIteratorT, ///< Random-access output iterator type for keys - typename ValuesInputIteratorT, ///< Random-access input iterator type for values - typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values - typename NumRunsOutputIteratorT, ///< Output iterator type for recording number of items selected - typename EqualityOpT, ///< KeyT equality operator type - typename ReductionOpT, ///< ValueT reduction operator type - typename OffsetT> ///< Signed integer type for global offsets -struct AgentReduceByKey -{ - //--------------------------------------------------------------------- 
- // Types and constants - //--------------------------------------------------------------------- - - // The input keys type - typedef typename std::iterator_traits::value_type KeyInputT; - - // The output keys type - typedef typename If<(Equals::value_type, void>::VALUE), // KeyOutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... then the input iterator's value type, - typename std::iterator_traits::value_type>::Type KeyOutputT; // ... else the output iterator's value type - - // The input values type - typedef typename std::iterator_traits::value_type ValueInputT; - - // The output values type - typedef typename If<(Equals::value_type, void>::VALUE), // ValueOutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... then the input iterator's value type, - typename std::iterator_traits::value_type>::Type ValueOutputT; // ... else the output iterator's value type - - // Tuple type for scanning (pairs accumulated segment-value with segment-index) - typedef KeyValuePair OffsetValuePairT; - - // Tuple type for pairing keys and values - typedef KeyValuePair KeyValuePairT; - - // Tile status descriptor interface type - typedef ReduceByKeyScanTileState ScanTileStateT; - - // Guarded inequality functor - template - struct GuardedInequalityWrapper - { - _EqualityOpT op; ///< Wrapped equality operator - int num_remaining; ///< Items remaining - - /// Constructor - __host__ __device__ __forceinline__ - GuardedInequalityWrapper(_EqualityOpT op, int num_remaining) : op(op), num_remaining(num_remaining) {} - - /// Boolean inequality operator, returns (a != b) - template - __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b, int idx) const - { - if (idx < num_remaining) - return !op(a, b); // In bounds - - // Return true if first out-of-bounds item, false otherwise - return (idx == num_remaining); - } - }; - - - // Constants - enum - { - BLOCK_THREADS = AgentReduceByKeyPolicyT::BLOCK_THREADS, - ITEMS_PER_THREAD = AgentReduceByKeyPolicyT::ITEMS_PER_THREAD, - TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - TWO_PHASE_SCATTER = (ITEMS_PER_THREAD > 1), - - // Whether or not the scan operation has a zero-valued identity value (true if we're performing addition on a primitive type) - HAS_IDENTITY_ZERO = (Equals::VALUE) && (Traits::PRIMITIVE), - }; - - // Cache-modified Input iterator wrapper type (for applying cache modifier) for keys - typedef typename If::VALUE, - CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator - KeysInputIteratorT>::Type // Directly use the supplied input iterator type - WrappedKeysInputIteratorT; - - // Cache-modified Input iterator wrapper type (for applying cache modifier) for values - typedef typename If::VALUE, - CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator - ValuesInputIteratorT>::Type // Directly use the supplied input iterator type - WrappedValuesInputIteratorT; - - // Cache-modified Input iterator wrapper type (for applying cache modifier) for fixup values - typedef typename If::VALUE, - CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator - AggregatesOutputIteratorT>::Type // Directly use the supplied input iterator type - WrappedFixupInputIteratorT; - - // Reduce-value-by-segment scan operator - typedef ReduceBySegmentOp ReduceBySegmentOpT; - - // Parameterized BlockLoad type for keys - typedef BlockLoad< - 
KeyOutputT, - BLOCK_THREADS, - ITEMS_PER_THREAD, - AgentReduceByKeyPolicyT::LOAD_ALGORITHM> - BlockLoadKeysT; - - // Parameterized BlockLoad type for values - typedef BlockLoad< - ValueOutputT, - BLOCK_THREADS, - ITEMS_PER_THREAD, - AgentReduceByKeyPolicyT::LOAD_ALGORITHM> - BlockLoadValuesT; - - // Parameterized BlockDiscontinuity type for keys - typedef BlockDiscontinuity< - KeyOutputT, - BLOCK_THREADS> - BlockDiscontinuityKeys; - - // Parameterized BlockScan type - typedef BlockScan< - OffsetValuePairT, - BLOCK_THREADS, - AgentReduceByKeyPolicyT::SCAN_ALGORITHM> - BlockScanT; - - // Callback type for obtaining tile prefix during block scan - typedef TilePrefixCallbackOp< - OffsetValuePairT, - ReduceBySegmentOpT, - ScanTileStateT> - TilePrefixCallbackOpT; - - // Key and value exchange types - typedef KeyOutputT KeyExchangeT[TILE_ITEMS + 1]; - typedef ValueOutputT ValueExchangeT[TILE_ITEMS + 1]; - - // Shared memory type for this thread block - union _TempStorage - { - struct - { - typename BlockScanT::TempStorage scan; // Smem needed for tile scanning - typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback - typename BlockDiscontinuityKeys::TempStorage discontinuity; // Smem needed for discontinuity detection - }; - - // Smem needed for loading keys - typename BlockLoadKeysT::TempStorage load_keys; - - // Smem needed for loading values - typename BlockLoadValuesT::TempStorage load_values; - - // Smem needed for compacting key value pairs(allows non POD items in this union) - Uninitialized raw_exchange; - }; - - // Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - //--------------------------------------------------------------------- - // Per-thread fields - //--------------------------------------------------------------------- - - _TempStorage& temp_storage; ///< Reference to temp_storage - WrappedKeysInputIteratorT d_keys_in; ///< Input keys - UniqueOutputIteratorT d_unique_out; ///< Unique output keys - WrappedValuesInputIteratorT d_values_in; ///< Input values - AggregatesOutputIteratorT d_aggregates_out; ///< Output value aggregates - NumRunsOutputIteratorT d_num_runs_out; ///< Output pointer for total number of segments identified - EqualityOpT equality_op; ///< KeyT equality operator - ReductionOpT reduction_op; ///< Reduction operator - ReduceBySegmentOpT scan_op; ///< Reduce-by-segment scan operator - - - //--------------------------------------------------------------------- - // Constructor - //--------------------------------------------------------------------- - - // Constructor - __device__ __forceinline__ - AgentReduceByKey( - TempStorage& temp_storage, ///< Reference to temp_storage - KeysInputIteratorT d_keys_in, ///< Input keys - UniqueOutputIteratorT d_unique_out, ///< Unique output keys - ValuesInputIteratorT d_values_in, ///< Input values - AggregatesOutputIteratorT d_aggregates_out, ///< Output value aggregates - NumRunsOutputIteratorT d_num_runs_out, ///< Output pointer for total number of segments identified - EqualityOpT equality_op, ///< KeyT equality operator - ReductionOpT reduction_op) ///< ValueT reduction operator - : - temp_storage(temp_storage.Alias()), - d_keys_in(d_keys_in), - d_unique_out(d_unique_out), - d_values_in(d_values_in), - d_aggregates_out(d_aggregates_out), - d_num_runs_out(d_num_runs_out), - equality_op(equality_op), - reduction_op(reduction_op), - scan_op(reduction_op) - {} - - - 
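The tile scan further below zips each value with its segment head flag and folds both through one associative operator. A minimal sketch of those semantics, using int flags and float sums (FlaggedSum and reduce_by_segment are illustrative stand-ins for the KeyValuePair-based ReduceBySegmentOpT declared above):

struct FlaggedSum
{
    int   flag;   // 1 at a segment head, 0 elsewhere; scans into a segment count
    float value;  // running reduction of the current segment
};

__host__ __device__ __forceinline__
FlaggedSum reduce_by_segment(FlaggedSum a, FlaggedSum b)
{
    FlaggedSum out;
    out.flag  = a.flag + b.flag;                       // heads seen so far
    out.value = b.flag ? b.value : a.value + b.value;  // restart at a new segment
    return out;
}

Run through an exclusive scan, this pairs every head flag with the index and aggregate of the segment that just ended, which is why the scatter logic that follows pairs each head with the previous segment's key and reduced value.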
//--------------------------------------------------------------------- - // Scatter utility methods - //--------------------------------------------------------------------- - - /** - * Directly scatter flagged items to output offsets - */ - __device__ __forceinline__ void ScatterDirect( - KeyValuePairT (&scatter_items)[ITEMS_PER_THREAD], - OffsetT (&segment_flags)[ITEMS_PER_THREAD], - OffsetT (&segment_indices)[ITEMS_PER_THREAD]) - { - // Scatter flagged keys and values - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - if (segment_flags[ITEM]) - { - d_unique_out[segment_indices[ITEM]] = scatter_items[ITEM].key; - d_aggregates_out[segment_indices[ITEM]] = scatter_items[ITEM].value; - } - } - } - - - /** - * 2-phase scatter flagged items to output offsets - * - * The exclusive scan causes each head flag to be paired with the previous - * value aggregate: the scatter offsets must be decremented for value aggregates - */ - __device__ __forceinline__ void ScatterTwoPhase( - KeyValuePairT (&scatter_items)[ITEMS_PER_THREAD], - OffsetT (&segment_flags)[ITEMS_PER_THREAD], - OffsetT (&segment_indices)[ITEMS_PER_THREAD], - OffsetT num_tile_segments, - OffsetT num_tile_segments_prefix) - { - CTA_SYNC(); - - // Compact and scatter pairs - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - if (segment_flags[ITEM]) - { - temp_storage.raw_exchange.Alias()[segment_indices[ITEM] - num_tile_segments_prefix] = scatter_items[ITEM]; - } - } - - CTA_SYNC(); - - for (int item = threadIdx.x; item < num_tile_segments; item += BLOCK_THREADS) - { - KeyValuePairT pair = temp_storage.raw_exchange.Alias()[item]; - d_unique_out[num_tile_segments_prefix + item] = pair.key; - d_aggregates_out[num_tile_segments_prefix + item] = pair.value; - } - } - - - /** - * Scatter flagged items - */ - __device__ __forceinline__ void Scatter( - KeyValuePairT (&scatter_items)[ITEMS_PER_THREAD], - OffsetT (&segment_flags)[ITEMS_PER_THREAD], - OffsetT (&segment_indices)[ITEMS_PER_THREAD], - OffsetT num_tile_segments, - OffsetT num_tile_segments_prefix) - { - // Do a one-phase scatter if (a) two-phase is disabled or (b) the average number of selected items per thread is less than one - if (TWO_PHASE_SCATTER && (num_tile_segments > BLOCK_THREADS)) - { - ScatterTwoPhase( - scatter_items, - segment_flags, - segment_indices, - num_tile_segments, - num_tile_segments_prefix); - } - else - { - ScatterDirect( - scatter_items, - segment_flags, - segment_indices); - } - } - - - //--------------------------------------------------------------------- - // Cooperatively scan a device-wide sequence of tiles with other CTAs - //--------------------------------------------------------------------- - - /** - * Process a tile of input (dynamic chained scan) - */ - template ///< Whether the current tile is the last tile - __device__ __forceinline__ void ConsumeTile( - OffsetT num_remaining, ///< Number of global input items remaining (including this tile) - int tile_idx, ///< Tile index - OffsetT tile_offset, ///< Tile offset - ScanTileStateT& tile_state) ///< Global tile state descriptor - { - KeyOutputT keys[ITEMS_PER_THREAD]; // Tile keys - KeyOutputT prev_keys[ITEMS_PER_THREAD]; // Tile keys shuffled up - ValueOutputT values[ITEMS_PER_THREAD]; // Tile values - OffsetT head_flags[ITEMS_PER_THREAD]; // Segment head flags - OffsetT segment_indices[ITEMS_PER_THREAD]; // Segment indices - OffsetValuePairT scan_items[ITEMS_PER_THREAD]; // Zipped values and segment flags|indices - KeyValuePairT 
scatter_items[ITEMS_PER_THREAD]; // Zipped key value pairs for scattering - - // Load keys - if (IS_LAST_TILE) - BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys, num_remaining); - else - BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys); - - // Load tile predecessor key in first thread - KeyOutputT tile_predecessor; - if (threadIdx.x == 0) - { - tile_predecessor = (tile_idx == 0) ? - keys[0] : // First tile gets repeat of first item (thus first item will not be flagged as a head) - d_keys_in[tile_offset - 1]; // Subsequent tiles get last key from previous tile - } - - CTA_SYNC(); - - // Load values - if (IS_LAST_TILE) - BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + tile_offset, values, num_remaining); - else - BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + tile_offset, values); - - CTA_SYNC(); - - // Initialize head-flags and shuffle up the previous keys - if (IS_LAST_TILE) - { - // Use custom flag operator to additionally flag the first out-of-bounds item - GuardedInequalityWrapper flag_op(equality_op, num_remaining); - BlockDiscontinuityKeys(temp_storage.discontinuity).FlagHeads( - head_flags, keys, prev_keys, flag_op, tile_predecessor); - } - else - { - InequalityWrapper flag_op(equality_op); - BlockDiscontinuityKeys(temp_storage.discontinuity).FlagHeads( - head_flags, keys, prev_keys, flag_op, tile_predecessor); - } - - // Zip values and head flags - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - scan_items[ITEM].value = values[ITEM]; - scan_items[ITEM].key = head_flags[ITEM]; - } - - // Perform exclusive tile scan - OffsetValuePairT block_aggregate; // Inclusive block-wide scan aggregate - OffsetT num_segments_prefix; // Number of segments prior to this tile - OffsetValuePairT total_aggregate; // The tile prefix folded with block_aggregate - if (tile_idx == 0) - { - // Scan first tile - BlockScanT(temp_storage.scan).ExclusiveScan(scan_items, scan_items, scan_op, block_aggregate); - num_segments_prefix = 0; - total_aggregate = block_aggregate; - - // Update tile status if there are successor tiles - if ((!IS_LAST_TILE) && (threadIdx.x == 0)) - tile_state.SetInclusive(0, block_aggregate); - } - else - { - // Scan non-first tile - TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, scan_op, tile_idx); - BlockScanT(temp_storage.scan).ExclusiveScan(scan_items, scan_items, scan_op, prefix_op); - - block_aggregate = prefix_op.GetBlockAggregate(); - num_segments_prefix = prefix_op.GetExclusivePrefix().key; - total_aggregate = prefix_op.GetInclusivePrefix(); - } - - // Rezip scatter items and segment indices - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - scatter_items[ITEM].key = prev_keys[ITEM]; - scatter_items[ITEM].value = scan_items[ITEM].value; - segment_indices[ITEM] = scan_items[ITEM].key; - } - - // At this point, each flagged segment head has: - // - The key for the previous segment - // - The reduced value from the previous segment - // - The segment index for the reduced value - - // Scatter flagged keys and values - OffsetT num_tile_segments = block_aggregate.key; - Scatter(scatter_items, head_flags, segment_indices, num_tile_segments, num_segments_prefix); - - // Last thread in last tile will output final count (and last pair, if necessary) - if ((IS_LAST_TILE) && (threadIdx.x == BLOCK_THREADS - 1)) - { - OffsetT num_segments = num_segments_prefix + num_tile_segments; - - // If the last tile is a whole tile, output the final_value 
- if (num_remaining == TILE_ITEMS) - { - d_unique_out[num_segments] = keys[ITEMS_PER_THREAD - 1]; - d_aggregates_out[num_segments] = total_aggregate.value; - num_segments++; - } - - // Output the total number of items selected - *d_num_runs_out = num_segments; - } - } - - - /** - * Scan tiles of items as part of a dynamic chained scan - */ - __device__ __forceinline__ void ConsumeRange( - int num_items, ///< Total number of input items - ScanTileStateT& tile_state, ///< Global tile state descriptor - int start_tile) ///< The starting tile for the current grid - { - // Blocks are launched in increasing order, so just assign one tile per block - int tile_idx = start_tile + blockIdx.x; // Current tile index - OffsetT tile_offset = OffsetT(TILE_ITEMS) * tile_idx; // Global offset for the current tile - OffsetT num_remaining = num_items - tile_offset; // Remaining items (including this tile) - - if (num_remaining > TILE_ITEMS) - { - // Not last tile - ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); - } - else if (num_remaining > 0) - { - // Last tile - ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); - } - } - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/SRC/cub/agent/agent_rle.cuh b/SRC/cub/agent/agent_rle.cuh deleted file mode 100644 index cb7a4a65..00000000 --- a/SRC/cub/agent/agent_rle.cuh +++ /dev/null @@ -1,837 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::AgentRle implements a stateful abstraction of CUDA thread blocks for participating in device-wide run-length-encode. 
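As the flag zip further down suggests (a run head only counts when it is not also a tail), this agent records only non-trivial runs, i.e. runs of length at least two. A sequential sketch of that contract follows; the helper name non_trivial_runs_reference is illustrative, not part of CUB.

#include <cstddef>
#include <vector>

template <typename T>
std::size_t non_trivial_runs_reference(const std::vector<T>&    in,
                                       std::vector<std::size_t>& offsets_out,
                                       std::vector<std::size_t>& lengths_out)
{
    offsets_out.clear();
    lengths_out.clear();
    std::size_t i = 0;
    while (i < in.size())
    {
        std::size_t run_begin = i;
        while (i + 1 < in.size() && in[i + 1] == in[run_begin])
            ++i;
        ++i;                                      // i is now one past the run
        if (i - run_begin >= 2)                   // trivial length-1 runs are skipped
        {
            offsets_out.push_back(run_begin);     // analogue of d_offsets_out
            lengths_out.push_back(i - run_begin); // analogue of d_lengths_out
        }
    }
    return offsets_out.size();                    // analogue of d_num_runs_out
}

Each recorded run contributes one entry to the offsets output and one to the lengths output; the warp-level scatter machinery below distributes exactly these writes.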
- */ - -#pragma once - -#include - -#include "single_pass_scan_operators.cuh" -#include "../block/block_load.cuh" -#include "../block/block_store.cuh" -#include "../block/block_scan.cuh" -#include "../block/block_exchange.cuh" -#include "../block/block_discontinuity.cuh" -#include "../grid/grid_queue.cuh" -#include "../iterator/cache_modified_input_iterator.cuh" -#include "../iterator/constant_input_iterator.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/****************************************************************************** - * Tuning policy types - ******************************************************************************/ - -/** - * Parameterizable tuning policy type for AgentRle - */ -template < - int _BLOCK_THREADS, ///< Threads per thread block - int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) - BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use - CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements - bool _STORE_WARP_TIME_SLICING, ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage) - BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use -struct AgentRlePolicy -{ - enum - { - BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block - ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) - STORE_WARP_TIME_SLICING = _STORE_WARP_TIME_SLICING, ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage) - }; - - static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use - static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements - static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use -}; - - - - - -/****************************************************************************** - * Thread block abstractions - ******************************************************************************/ - -/** - * \brief AgentRle implements a stateful abstraction of CUDA thread blocks for participating in device-wide run-length-encode - */ -template < - typename AgentRlePolicyT, ///< Parameterized AgentRlePolicyT tuning policy type - typename InputIteratorT, ///< Random-access input iterator type for data - typename OffsetsOutputIteratorT, ///< Random-access output iterator type for offset values - typename LengthsOutputIteratorT, ///< Random-access output iterator type for length values - typename EqualityOpT, ///< T equality operator type - typename OffsetT> ///< Signed integer type for global offsets -struct AgentRle -{ - //--------------------------------------------------------------------- - // Types and constants - //--------------------------------------------------------------------- - - /// The input value type - typedef typename std::iterator_traits::value_type T; - - /// The lengths output value type - typedef typename If<(Equals::value_type, void>::VALUE), // LengthT = (if output iterator's value type is void) ? - OffsetT, // ... then the OffsetT type, - typename std::iterator_traits::value_type>::Type LengthT; // ... 
else the output iterator's value type - - /// Tuple type for scanning (pairs run-length and run-index) - typedef KeyValuePair LengthOffsetPair; - - /// Tile status descriptor interface type - typedef ReduceByKeyScanTileState ScanTileStateT; - - // Constants - enum - { - WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), - BLOCK_THREADS = AgentRlePolicyT::BLOCK_THREADS, - ITEMS_PER_THREAD = AgentRlePolicyT::ITEMS_PER_THREAD, - WARP_ITEMS = WARP_THREADS * ITEMS_PER_THREAD, - TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, - - /// Whether or not to sync after loading data - SYNC_AFTER_LOAD = (AgentRlePolicyT::LOAD_ALGORITHM != BLOCK_LOAD_DIRECT), - - /// Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage) - STORE_WARP_TIME_SLICING = AgentRlePolicyT::STORE_WARP_TIME_SLICING, - ACTIVE_EXCHANGE_WARPS = (STORE_WARP_TIME_SLICING) ? 1 : WARPS, - }; - - - /** - * Special operator that signals all out-of-bounds items are not equal to everything else, - * forcing both (1) the last item to be tail-flagged and (2) all oob items to be marked - * trivial. - */ - template - struct OobInequalityOp - { - OffsetT num_remaining; - EqualityOpT equality_op; - - __device__ __forceinline__ OobInequalityOp( - OffsetT num_remaining, - EqualityOpT equality_op) - : - num_remaining(num_remaining), - equality_op(equality_op) - {} - - template - __host__ __device__ __forceinline__ bool operator()(T first, T second, Index idx) - { - if (!LAST_TILE || (idx < num_remaining)) - return !equality_op(first, second); - else - return true; - } - }; - - - // Cache-modified Input iterator wrapper type (for applying cache modifier) for data - typedef typename If::VALUE, - CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedVLengthnputIterator - InputIteratorT>::Type // Directly use the supplied input iterator type - WrappedInputIteratorT; - - // Parameterized BlockLoad type for data - typedef BlockLoad< - T, - AgentRlePolicyT::BLOCK_THREADS, - AgentRlePolicyT::ITEMS_PER_THREAD, - AgentRlePolicyT::LOAD_ALGORITHM> - BlockLoadT; - - // Parameterized BlockDiscontinuity type for data - typedef BlockDiscontinuity BlockDiscontinuityT; - - // Parameterized WarpScan type - typedef WarpScan WarpScanPairs; - - // Reduce-length-by-run scan operator - typedef ReduceBySegmentOp ReduceBySegmentOpT; - - // Callback type for obtaining tile prefix during block scan - typedef TilePrefixCallbackOp< - LengthOffsetPair, - ReduceBySegmentOpT, - ScanTileStateT> - TilePrefixCallbackOpT; - - // Warp exchange types - typedef WarpExchange WarpExchangePairs; - - typedef typename If::Type WarpExchangePairsStorage; - - typedef WarpExchange WarpExchangeOffsets; - typedef WarpExchange WarpExchangeLengths; - - typedef LengthOffsetPair WarpAggregates[WARPS]; - - // Shared memory type for this thread block - struct _TempStorage - { - // Aliasable storage layout - union Aliasable - { - struct - { - typename BlockDiscontinuityT::TempStorage discontinuity; // Smem needed for discontinuity detection - typename WarpScanPairs::TempStorage warp_scan[WARPS]; // Smem needed for warp-synchronous scans - Uninitialized warp_aggregates; // Smem needed for sharing warp-wide aggregates - typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback - }; - - // Smem needed for input loading - typename BlockLoadT::TempStorage 
load; - - // Aliasable layout needed for two-phase scatter - union ScatterAliasable - { - unsigned long long align; - WarpExchangePairsStorage exchange_pairs[ACTIVE_EXCHANGE_WARPS]; - typename WarpExchangeOffsets::TempStorage exchange_offsets[ACTIVE_EXCHANGE_WARPS]; - typename WarpExchangeLengths::TempStorage exchange_lengths[ACTIVE_EXCHANGE_WARPS]; - - } scatter_aliasable; - - } aliasable; - - OffsetT tile_idx; // Shared tile index - LengthOffsetPair tile_inclusive; // Inclusive tile prefix - LengthOffsetPair tile_exclusive; // Exclusive tile prefix - }; - - // Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - //--------------------------------------------------------------------- - // Per-thread fields - //--------------------------------------------------------------------- - - _TempStorage& temp_storage; ///< Reference to temp_storage - - WrappedInputIteratorT d_in; ///< Pointer to input sequence of data items - OffsetsOutputIteratorT d_offsets_out; ///< Input run offsets - LengthsOutputIteratorT d_lengths_out; ///< Output run lengths - - EqualityOpT equality_op; ///< T equality operator - ReduceBySegmentOpT scan_op; ///< Reduce-length-by-flag scan operator - OffsetT num_items; ///< Total number of input items - - - //--------------------------------------------------------------------- - // Constructor - //--------------------------------------------------------------------- - - // Constructor - __device__ __forceinline__ - AgentRle( - TempStorage &temp_storage, ///< [in] Reference to temp_storage - InputIteratorT d_in, ///< [in] Pointer to input sequence of data items - OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to output sequence of run offsets - LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to output sequence of run lengths - EqualityOpT equality_op, ///< [in] T equality operator - OffsetT num_items) ///< [in] Total number of input items - : - temp_storage(temp_storage.Alias()), - d_in(d_in), - d_offsets_out(d_offsets_out), - d_lengths_out(d_lengths_out), - equality_op(equality_op), - scan_op(cub::Sum()), - num_items(num_items) - {} - - - //--------------------------------------------------------------------- - // Utility methods for initializing the selections - //--------------------------------------------------------------------- - - template - __device__ __forceinline__ void InitializeSelections( - OffsetT tile_offset, - OffsetT num_remaining, - T (&items)[ITEMS_PER_THREAD], - LengthOffsetPair (&lengths_and_num_runs)[ITEMS_PER_THREAD]) - { - bool head_flags[ITEMS_PER_THREAD]; - bool tail_flags[ITEMS_PER_THREAD]; - - OobInequalityOp inequality_op(num_remaining, equality_op); - - if (FIRST_TILE && LAST_TILE) - { - // First-and-last-tile always head-flags the first item and tail-flags the last item - - BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails( - head_flags, tail_flags, items, inequality_op); - } - else if (FIRST_TILE) - { - // First-tile always head-flags the first item - - // Get the first item from the next tile - T tile_successor_item; - if (threadIdx.x == BLOCK_THREADS - 1) - tile_successor_item = d_in[tile_offset + TILE_ITEMS]; - - BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails( - head_flags, tail_flags, tile_successor_item, items, inequality_op); - } - else if (LAST_TILE) - { - // Last-tile always flags the last item - - // Get the last item from the previous tile - T tile_predecessor_item; - if (threadIdx.x == 0) - 
tile_predecessor_item = d_in[tile_offset - 1]; - - BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails( - head_flags, tile_predecessor_item, tail_flags, items, inequality_op); - } - else - { - // Get the first item from the next tile - T tile_successor_item; - if (threadIdx.x == BLOCK_THREADS - 1) - tile_successor_item = d_in[tile_offset + TILE_ITEMS]; - - // Get the last item from the previous tile - T tile_predecessor_item; - if (threadIdx.x == 0) - tile_predecessor_item = d_in[tile_offset - 1]; - - BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails( - head_flags, tile_predecessor_item, tail_flags, tile_successor_item, items, inequality_op); - } - - // Zip counts and runs - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - lengths_and_num_runs[ITEM].key = head_flags[ITEM] && (!tail_flags[ITEM]); - lengths_and_num_runs[ITEM].value = ((!head_flags[ITEM]) || (!tail_flags[ITEM])); - } - } - - //--------------------------------------------------------------------- - // Scan utility methods - //--------------------------------------------------------------------- - - /** - * Scan of allocations - */ - __device__ __forceinline__ void WarpScanAllocations( - LengthOffsetPair &tile_aggregate, - LengthOffsetPair &warp_aggregate, - LengthOffsetPair &warp_exclusive_in_tile, - LengthOffsetPair &thread_exclusive_in_warp, - LengthOffsetPair (&lengths_and_num_runs)[ITEMS_PER_THREAD]) - { - // Perform warpscans - unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS); - int lane_id = LaneId(); - - LengthOffsetPair identity; - identity.key = 0; - identity.value = 0; - - LengthOffsetPair thread_inclusive; - LengthOffsetPair thread_aggregate = internal::ThreadReduce(lengths_and_num_runs, scan_op); - WarpScanPairs(temp_storage.aliasable.warp_scan[warp_id]).Scan( - thread_aggregate, - thread_inclusive, - thread_exclusive_in_warp, - identity, - scan_op); - - // Last lane in each warp shares its warp-aggregate - if (lane_id == WARP_THREADS - 1) - temp_storage.aliasable.warp_aggregates.Alias()[warp_id] = thread_inclusive; - - CTA_SYNC(); - - // Accumulate total selected and the warp-wide prefix - warp_exclusive_in_tile = identity; - warp_aggregate = temp_storage.aliasable.warp_aggregates.Alias()[warp_id]; - tile_aggregate = temp_storage.aliasable.warp_aggregates.Alias()[0]; - - #pragma unroll - for (int WARP = 1; WARP < WARPS; ++WARP) - { - if (warp_id == WARP) - warp_exclusive_in_tile = tile_aggregate; - - tile_aggregate = scan_op(tile_aggregate, temp_storage.aliasable.warp_aggregates.Alias()[WARP]); - } - } - - - //--------------------------------------------------------------------- - // Utility methods for scattering selections - //--------------------------------------------------------------------- - - /** - * Two-phase scatter, specialized for warp time-slicing - */ - template - __device__ __forceinline__ void ScatterTwoPhase( - OffsetT tile_num_runs_exclusive_in_global, - OffsetT warp_num_runs_aggregate, - OffsetT warp_num_runs_exclusive_in_tile, - OffsetT (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD], - LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD], - Int2Type is_warp_time_slice) - { - unsigned int warp_id = ((WARPS == 1) ? 
0 : threadIdx.x / WARP_THREADS); - int lane_id = LaneId(); - - // Locally compact items within the warp (first warp) - if (warp_id == 0) - { - WarpExchangePairs(temp_storage.aliasable.scatter_aliasable.exchange_pairs[0]).ScatterToStriped( - lengths_and_offsets, thread_num_runs_exclusive_in_warp); - } - - // Locally compact items within the warp (remaining warps) - #pragma unroll - for (int SLICE = 1; SLICE < WARPS; ++SLICE) - { - CTA_SYNC(); - - if (warp_id == SLICE) - { - WarpExchangePairs(temp_storage.aliasable.scatter_aliasable.exchange_pairs[0]).ScatterToStriped( - lengths_and_offsets, thread_num_runs_exclusive_in_warp); - } - } - - // Global scatter - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - if ((ITEM * WARP_THREADS) < warp_num_runs_aggregate - lane_id) - { - OffsetT item_offset = - tile_num_runs_exclusive_in_global + - warp_num_runs_exclusive_in_tile + - (ITEM * WARP_THREADS) + lane_id; - - // Scatter offset - d_offsets_out[item_offset] = lengths_and_offsets[ITEM].key; - - // Scatter length if not the first (global) length - if ((!FIRST_TILE) || (ITEM != 0) || (threadIdx.x > 0)) - { - d_lengths_out[item_offset - 1] = lengths_and_offsets[ITEM].value; - } - } - } - } - - - /** - * Two-phase scatter - */ - template - __device__ __forceinline__ void ScatterTwoPhase( - OffsetT tile_num_runs_exclusive_in_global, - OffsetT warp_num_runs_aggregate, - OffsetT warp_num_runs_exclusive_in_tile, - OffsetT (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD], - LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD], - Int2Type is_warp_time_slice) - { - unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS); - int lane_id = LaneId(); - - // Unzip - OffsetT run_offsets[ITEMS_PER_THREAD]; - LengthT run_lengths[ITEMS_PER_THREAD]; - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - run_offsets[ITEM] = lengths_and_offsets[ITEM].key; - run_lengths[ITEM] = lengths_and_offsets[ITEM].value; - } - - WarpExchangeOffsets(temp_storage.aliasable.scatter_aliasable.exchange_offsets[warp_id]).ScatterToStriped( - run_offsets, thread_num_runs_exclusive_in_warp); - - WARP_SYNC(0xffffffff); - - WarpExchangeLengths(temp_storage.aliasable.scatter_aliasable.exchange_lengths[warp_id]).ScatterToStriped( - run_lengths, thread_num_runs_exclusive_in_warp); - - // Global scatter - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - if ((ITEM * WARP_THREADS) + lane_id < warp_num_runs_aggregate) - { - OffsetT item_offset = - tile_num_runs_exclusive_in_global + - warp_num_runs_exclusive_in_tile + - (ITEM * WARP_THREADS) + lane_id; - - // Scatter offset - d_offsets_out[item_offset] = run_offsets[ITEM]; - - // Scatter length if not the first (global) length - if ((!FIRST_TILE) || (ITEM != 0) || (threadIdx.x > 0)) - { - d_lengths_out[item_offset - 1] = run_lengths[ITEM]; - } - } - } - } - - - /** - * Direct scatter - */ - template - __device__ __forceinline__ void ScatterDirect( - OffsetT tile_num_runs_exclusive_in_global, - OffsetT warp_num_runs_aggregate, - OffsetT warp_num_runs_exclusive_in_tile, - OffsetT (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD], - LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD]) - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - if (thread_num_runs_exclusive_in_warp[ITEM] < warp_num_runs_aggregate) - { - OffsetT item_offset = - tile_num_runs_exclusive_in_global + - warp_num_runs_exclusive_in_tile + - thread_num_runs_exclusive_in_warp[ITEM]; - - // Scatter 
offset - d_offsets_out[item_offset] = lengths_and_offsets[ITEM].key; - - // Scatter length if not the first (global) length - if (item_offset >= 1) - { - d_lengths_out[item_offset - 1] = lengths_and_offsets[ITEM].value; - } - } - } - } - - - /** - * Scatter - */ - template - __device__ __forceinline__ void Scatter( - OffsetT tile_num_runs_aggregate, - OffsetT tile_num_runs_exclusive_in_global, - OffsetT warp_num_runs_aggregate, - OffsetT warp_num_runs_exclusive_in_tile, - OffsetT (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD], - LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD]) - { - if ((ITEMS_PER_THREAD == 1) || (tile_num_runs_aggregate < BLOCK_THREADS)) - { - // Direct scatter if the warp has any items - if (warp_num_runs_aggregate) - { - ScatterDirect( - tile_num_runs_exclusive_in_global, - warp_num_runs_aggregate, - warp_num_runs_exclusive_in_tile, - thread_num_runs_exclusive_in_warp, - lengths_and_offsets); - } - } - else - { - // Scatter two phase - ScatterTwoPhase( - tile_num_runs_exclusive_in_global, - warp_num_runs_aggregate, - warp_num_runs_exclusive_in_tile, - thread_num_runs_exclusive_in_warp, - lengths_and_offsets, - Int2Type()); - } - } - - - - //--------------------------------------------------------------------- - // Cooperatively scan a device-wide sequence of tiles with other CTAs - //--------------------------------------------------------------------- - - /** - * Process a tile of input (dynamic chained scan) - */ - template < - bool LAST_TILE> - __device__ __forceinline__ LengthOffsetPair ConsumeTile( - OffsetT num_items, ///< Total number of global input items - OffsetT num_remaining, ///< Number of global input items remaining (including this tile) - int tile_idx, ///< Tile index - OffsetT tile_offset, ///< Tile offset - ScanTileStateT &tile_status) ///< Global list of tile status - { - if (tile_idx == 0) - { - // First tile - - // Load items - T items[ITEMS_PER_THREAD]; - if (LAST_TILE) - BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items, num_remaining, T()); - else - BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items); - - if (SYNC_AFTER_LOAD) - CTA_SYNC(); - - // Set flags - LengthOffsetPair lengths_and_num_runs[ITEMS_PER_THREAD]; - - InitializeSelections( - tile_offset, - num_remaining, - items, - lengths_and_num_runs); - - // Exclusive scan of lengths and runs - LengthOffsetPair tile_aggregate; - LengthOffsetPair warp_aggregate; - LengthOffsetPair warp_exclusive_in_tile; - LengthOffsetPair thread_exclusive_in_warp; - - WarpScanAllocations( - tile_aggregate, - warp_aggregate, - warp_exclusive_in_tile, - thread_exclusive_in_warp, - lengths_and_num_runs); - - // Update tile status if this is not the last tile - if (!LAST_TILE && (threadIdx.x == 0)) - tile_status.SetInclusive(0, tile_aggregate); - - // Update thread_exclusive_in_warp to fold in warp run-length - if (thread_exclusive_in_warp.key == 0) - thread_exclusive_in_warp.value += warp_exclusive_in_tile.value; - - LengthOffsetPair lengths_and_offsets[ITEMS_PER_THREAD]; - OffsetT thread_num_runs_exclusive_in_warp[ITEMS_PER_THREAD]; - LengthOffsetPair lengths_and_num_runs2[ITEMS_PER_THREAD]; - - // Downsweep scan through lengths_and_num_runs - internal::ThreadScanExclusive(lengths_and_num_runs, lengths_and_num_runs2, scan_op, thread_exclusive_in_warp); - - // Zip - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - lengths_and_offsets[ITEM].value = lengths_and_num_runs2[ITEM].value; - lengths_and_offsets[ITEM].key = tile_offset 
+ (threadIdx.x * ITEMS_PER_THREAD) + ITEM; - thread_num_runs_exclusive_in_warp[ITEM] = (lengths_and_num_runs[ITEM].key) ? - lengths_and_num_runs2[ITEM].key : // keep - WARP_THREADS * ITEMS_PER_THREAD; // discard - } - - OffsetT tile_num_runs_aggregate = tile_aggregate.key; - OffsetT tile_num_runs_exclusive_in_global = 0; - OffsetT warp_num_runs_aggregate = warp_aggregate.key; - OffsetT warp_num_runs_exclusive_in_tile = warp_exclusive_in_tile.key; - - // Scatter - Scatter( - tile_num_runs_aggregate, - tile_num_runs_exclusive_in_global, - warp_num_runs_aggregate, - warp_num_runs_exclusive_in_tile, - thread_num_runs_exclusive_in_warp, - lengths_and_offsets); - - // Return running total (inclusive of this tile) - return tile_aggregate; - } - else - { - // Not first tile - - // Load items - T items[ITEMS_PER_THREAD]; - if (LAST_TILE) - BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items, num_remaining, T()); - else - BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items); - - if (SYNC_AFTER_LOAD) - CTA_SYNC(); - - // Set flags - LengthOffsetPair lengths_and_num_runs[ITEMS_PER_THREAD]; - - InitializeSelections( - tile_offset, - num_remaining, - items, - lengths_and_num_runs); - - // Exclusive scan of lengths and runs - LengthOffsetPair tile_aggregate; - LengthOffsetPair warp_aggregate; - LengthOffsetPair warp_exclusive_in_tile; - LengthOffsetPair thread_exclusive_in_warp; - - WarpScanAllocations( - tile_aggregate, - warp_aggregate, - warp_exclusive_in_tile, - thread_exclusive_in_warp, - lengths_and_num_runs); - - // First warp computes tile prefix in lane 0 - TilePrefixCallbackOpT prefix_op(tile_status, temp_storage.aliasable.prefix, Sum(), tile_idx); - unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS); - if (warp_id == 0) - { - prefix_op(tile_aggregate); - if (threadIdx.x == 0) - temp_storage.tile_exclusive = prefix_op.exclusive_prefix; - } - - CTA_SYNC(); - - LengthOffsetPair tile_exclusive_in_global = temp_storage.tile_exclusive; - - // Update thread_exclusive_in_warp to fold in warp and tile run-lengths - LengthOffsetPair thread_exclusive = scan_op(tile_exclusive_in_global, warp_exclusive_in_tile); - if (thread_exclusive_in_warp.key == 0) - thread_exclusive_in_warp.value += thread_exclusive.value; - - // Downsweep scan through lengths_and_num_runs - LengthOffsetPair lengths_and_num_runs2[ITEMS_PER_THREAD]; - LengthOffsetPair lengths_and_offsets[ITEMS_PER_THREAD]; - OffsetT thread_num_runs_exclusive_in_warp[ITEMS_PER_THREAD]; - - internal::ThreadScanExclusive(lengths_and_num_runs, lengths_and_num_runs2, scan_op, thread_exclusive_in_warp); - - // Zip - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - lengths_and_offsets[ITEM].value = lengths_and_num_runs2[ITEM].value; - lengths_and_offsets[ITEM].key = tile_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM; - thread_num_runs_exclusive_in_warp[ITEM] = (lengths_and_num_runs[ITEM].key) ? 
- lengths_and_num_runs2[ITEM].key : // keep - WARP_THREADS * ITEMS_PER_THREAD; // discard - } - - OffsetT tile_num_runs_aggregate = tile_aggregate.key; - OffsetT tile_num_runs_exclusive_in_global = tile_exclusive_in_global.key; - OffsetT warp_num_runs_aggregate = warp_aggregate.key; - OffsetT warp_num_runs_exclusive_in_tile = warp_exclusive_in_tile.key; - - // Scatter - Scatter( - tile_num_runs_aggregate, - tile_num_runs_exclusive_in_global, - warp_num_runs_aggregate, - warp_num_runs_exclusive_in_tile, - thread_num_runs_exclusive_in_warp, - lengths_and_offsets); - - // Return running total (inclusive of this tile) - return prefix_op.inclusive_prefix; - } - } - - - /** - * Scan tiles of items as part of a dynamic chained scan - */ - template ///< Output iterator type for recording number of items selected - __device__ __forceinline__ void ConsumeRange( - int num_tiles, ///< Total number of input tiles - ScanTileStateT& tile_status, ///< Global list of tile status - NumRunsIteratorT d_num_runs_out) ///< Output pointer for total number of runs identified - { - // Blocks are launched in increasing order, so just assign one tile per block - int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index - OffsetT tile_offset = tile_idx * TILE_ITEMS; // Global offset for the current tile - OffsetT num_remaining = num_items - tile_offset; // Remaining items (including this tile) - - if (tile_idx < num_tiles - 1) - { - // Not the last tile (full) - ConsumeTile(num_items, num_remaining, tile_idx, tile_offset, tile_status); - } - else if (num_remaining > 0) - { - // The last tile (possibly partially-full) - LengthOffsetPair running_total = ConsumeTile(num_items, num_remaining, tile_idx, tile_offset, tile_status); - - if (threadIdx.x == 0) - { - // Output the total number of items selected - *d_num_runs_out = running_total.key; - - // The inclusive prefix contains accumulated length reduction for the last run - if (running_total.key > 0) - d_lengths_out[running_total.key - 1] = running_total.value; - } - } - } -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/SRC/cub/agent/agent_scan.cuh b/SRC/cub/agent/agent_scan.cuh deleted file mode 100644 index 9368615e..00000000 --- a/SRC/cub/agent/agent_scan.cuh +++ /dev/null @@ -1,471 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::AgentScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan . - */ - -#pragma once - -#include - -#include "single_pass_scan_operators.cuh" -#include "../block/block_load.cuh" -#include "../block/block_store.cuh" -#include "../block/block_scan.cuh" -#include "../grid/grid_queue.cuh" -#include "../iterator/cache_modified_input_iterator.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/****************************************************************************** - * Tuning policy types - ******************************************************************************/ - -/** - * Parameterizable tuning policy type for AgentScan - */ -template < - int _BLOCK_THREADS, ///< Threads per thread block - int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) - BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use - CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements - BlockStoreAlgorithm _STORE_ALGORITHM, ///< The BlockStore algorithm to use - BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use -struct AgentScanPolicy -{ - enum - { - BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block - ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) - }; - - static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use - static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements - static const BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM; ///< The BlockStore algorithm to use - static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use -}; - - - - -/****************************************************************************** - * Thread block abstractions - ******************************************************************************/ - -/** - * \brief AgentScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan . 
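The agent computes, tile by tile and with a decoupled tile-prefix callback, the same result as a sequential scan. A minimal sketch of the exclusive-scan contract it must match, assuming the cub convention that init_value seeds element 0; the helper name exclusive_scan_reference is illustrative only.

#include <cstddef>
#include <functional>
#include <vector>

template <typename T, typename ScanOp = std::plus<T>>
std::vector<T> exclusive_scan_reference(const std::vector<T>& in,
                                        T                     init_value,
                                        ScanOp                scan_op = {})
{
    std::vector<T> out(in.size());
    T running = init_value;
    for (std::size_t i = 0; i < in.size(); ++i)
    {
        out[i]  = running;                  // each output excludes its own input
        running = scan_op(running, in[i]);
    }
    return out;
}

For the inclusive variant (InitValueT = cub::NullType, per the IS_INCLUSIVE constant below), out[i] would instead be assigned after folding in in[i].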
- */ -template < - typename AgentScanPolicyT, ///< Parameterized AgentScanPolicyT tuning policy type - typename InputIteratorT, ///< Random-access input iterator type - typename OutputIteratorT, ///< Random-access output iterator type - typename ScanOpT, ///< Scan functor type - typename InitValueT, ///< The init_value element for ScanOpT type (cub::NullType for inclusive scan) - typename OffsetT> ///< Signed integer type for global offsets -struct AgentScan -{ - //--------------------------------------------------------------------- - // Types and constants - //--------------------------------------------------------------------- - - // The input value type - typedef typename std::iterator_traits::value_type InputT; - - // The output value type - typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... then the input iterator's value type, - typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type - - // Tile status descriptor interface type - typedef ScanTileState ScanTileStateT; - - // Input iterator wrapper type (for applying cache modifier) - typedef typename If::VALUE, - CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedInputIterator - InputIteratorT>::Type // Directly use the supplied input iterator type - WrappedInputIteratorT; - - // Constants - enum - { - IS_INCLUSIVE = Equals::VALUE, // Inclusive scan if no init_value type is provided - BLOCK_THREADS = AgentScanPolicyT::BLOCK_THREADS, - ITEMS_PER_THREAD = AgentScanPolicyT::ITEMS_PER_THREAD, - TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - }; - - // Parameterized BlockLoad type - typedef BlockLoad< - OutputT, - AgentScanPolicyT::BLOCK_THREADS, - AgentScanPolicyT::ITEMS_PER_THREAD, - AgentScanPolicyT::LOAD_ALGORITHM> - BlockLoadT; - - // Parameterized BlockStore type - typedef BlockStore< - OutputT, - AgentScanPolicyT::BLOCK_THREADS, - AgentScanPolicyT::ITEMS_PER_THREAD, - AgentScanPolicyT::STORE_ALGORITHM> - BlockStoreT; - - // Parameterized BlockScan type - typedef BlockScan< - OutputT, - AgentScanPolicyT::BLOCK_THREADS, - AgentScanPolicyT::SCAN_ALGORITHM> - BlockScanT; - - // Callback type for obtaining tile prefix during block scan - typedef TilePrefixCallbackOp< - OutputT, - ScanOpT, - ScanTileStateT> - TilePrefixCallbackOpT; - - // Stateful BlockScan prefix callback type for managing a running total while scanning consecutive tiles - typedef BlockScanRunningPrefixOp< - OutputT, - ScanOpT> - RunningPrefixCallbackOp; - - // Shared memory type for this thread block - union _TempStorage - { - typename BlockLoadT::TempStorage load; // Smem needed for tile loading - typename BlockStoreT::TempStorage store; // Smem needed for tile storing - - struct - { - typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback - typename BlockScanT::TempStorage scan; // Smem needed for tile scanning - }; - }; - - // Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - //--------------------------------------------------------------------- - // Per-thread fields - //--------------------------------------------------------------------- - - _TempStorage& temp_storage; ///< Reference to temp_storage - WrappedInputIteratorT d_in; ///< Input data - OutputIteratorT d_out; ///< Output data - ScanOpT scan_op; ///< Binary scan operator - InitValueT init_value; ///< The init_value 
element for ScanOpT - - - //--------------------------------------------------------------------- - // Block scan utility methods - //--------------------------------------------------------------------- - - /** - * Exclusive scan specialization (first tile) - */ - __device__ __forceinline__ - void ScanTile( - OutputT (&items)[ITEMS_PER_THREAD], - OutputT init_value, - ScanOpT scan_op, - OutputT &block_aggregate, - Int2Type /*is_inclusive*/) - { - BlockScanT(temp_storage.scan).ExclusiveScan(items, items, init_value, scan_op, block_aggregate); - block_aggregate = scan_op(init_value, block_aggregate); - } - - - /** - * Inclusive scan specialization (first tile) - */ - __device__ __forceinline__ - void ScanTile( - OutputT (&items)[ITEMS_PER_THREAD], - InitValueT /*init_value*/, - ScanOpT scan_op, - OutputT &block_aggregate, - Int2Type /*is_inclusive*/) - { - BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, block_aggregate); - } - - - /** - * Exclusive scan specialization (subsequent tiles) - */ - template - __device__ __forceinline__ - void ScanTile( - OutputT (&items)[ITEMS_PER_THREAD], - ScanOpT scan_op, - PrefixCallback &prefix_op, - Int2Type /*is_inclusive*/) - { - BlockScanT(temp_storage.scan).ExclusiveScan(items, items, scan_op, prefix_op); - } - - - /** - * Inclusive scan specialization (subsequent tiles) - */ - template - __device__ __forceinline__ - void ScanTile( - OutputT (&items)[ITEMS_PER_THREAD], - ScanOpT scan_op, - PrefixCallback &prefix_op, - Int2Type /*is_inclusive*/) - { - BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, prefix_op); - } - - - //--------------------------------------------------------------------- - // Constructor - //--------------------------------------------------------------------- - - // Constructor - __device__ __forceinline__ - AgentScan( - TempStorage& temp_storage, ///< Reference to temp_storage - InputIteratorT d_in, ///< Input data - OutputIteratorT d_out, ///< Output data - ScanOpT scan_op, ///< Binary scan operator - InitValueT init_value) ///< Initial value to seed the exclusive scan - : - temp_storage(temp_storage.Alias()), - d_in(d_in), - d_out(d_out), - scan_op(scan_op), - init_value(init_value) - {} - - - //--------------------------------------------------------------------- - // Cooperatively scan a device-wide sequence of tiles with other CTAs - //--------------------------------------------------------------------- - - /** - * Process a tile of input (dynamic chained scan) - */ - template ///< Whether the current tile is the last tile - __device__ __forceinline__ void ConsumeTile( - OffsetT num_remaining, ///< Number of global input items remaining (including this tile) - int tile_idx, ///< Tile index - OffsetT tile_offset, ///< Tile offset - ScanTileStateT& tile_state) ///< Global tile state descriptor - { - // Load items - OutputT items[ITEMS_PER_THREAD]; - - if (IS_LAST_TILE) - BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items, num_remaining); - else - BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items); - - CTA_SYNC(); - - // Perform tile scan - if (tile_idx == 0) - { - // Scan first tile - OutputT block_aggregate; - ScanTile(items, init_value, scan_op, block_aggregate, Int2Type()); - if ((!IS_LAST_TILE) && (threadIdx.x == 0)) - tile_state.SetInclusive(0, block_aggregate); - } - else - { - // Scan non-first tile - TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, scan_op, tile_idx); - ScanTile(items, scan_op, prefix_op, Int2Type()); - } - - CTA_SYNC(); - - // 
Store items - if (IS_LAST_TILE) - BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items, num_remaining); - else - BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items); - } - - - /** - * Scan tiles of items as part of a dynamic chained scan - */ - __device__ __forceinline__ void ConsumeRange( - int num_items, ///< Total number of input items - ScanTileStateT& tile_state, ///< Global tile state descriptor - int start_tile) ///< The starting tile for the current grid - { - // Blocks are launched in increasing order, so just assign one tile per block - int tile_idx = start_tile + blockIdx.x; // Current tile index - OffsetT tile_offset = OffsetT(TILE_ITEMS) * tile_idx; // Global offset for the current tile - OffsetT num_remaining = num_items - tile_offset; // Remaining items (including this tile) - - if (num_remaining > TILE_ITEMS) - { - // Not last tile - ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); - } - else if (num_remaining > 0) - { - // Last tile - ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); - } - } - - - //--------------------------------------------------------------------- - // Scan an sequence of consecutive tiles (independent of other thread blocks) - //--------------------------------------------------------------------- - - /** - * Process a tile of input - */ - template < - bool IS_FIRST_TILE, - bool IS_LAST_TILE> - __device__ __forceinline__ void ConsumeTile( - OffsetT tile_offset, ///< Tile offset - RunningPrefixCallbackOp& prefix_op, ///< Running prefix operator - int valid_items = TILE_ITEMS) ///< Number of valid items in the tile - { - // Load items - OutputT items[ITEMS_PER_THREAD]; - - if (IS_LAST_TILE) - BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items, valid_items); - else - BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items); - - CTA_SYNC(); - - // Block scan - if (IS_FIRST_TILE) - { - OutputT block_aggregate; - ScanTile(items, init_value, scan_op, block_aggregate, Int2Type()); - prefix_op.running_total = block_aggregate; - } - else - { - ScanTile(items, scan_op, prefix_op, Int2Type()); - } - - CTA_SYNC(); - - // Store items - if (IS_LAST_TILE) - BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items, valid_items); - else - BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items); - } - - - /** - * Scan a consecutive share of input tiles - */ - __device__ __forceinline__ void ConsumeRange( - OffsetT range_offset, ///< [in] Threadblock begin offset (inclusive) - OffsetT range_end) ///< [in] Threadblock end offset (exclusive) - { - BlockScanRunningPrefixOp prefix_op(scan_op); - - if (range_offset + TILE_ITEMS <= range_end) - { - // Consume first tile of input (full) - ConsumeTile(range_offset, prefix_op); - range_offset += TILE_ITEMS; - - // Consume subsequent full tiles of input - while (range_offset + TILE_ITEMS <= range_end) - { - ConsumeTile(range_offset, prefix_op); - range_offset += TILE_ITEMS; - } - - // Consume a partially-full tile - if (range_offset < range_end) - { - int valid_items = range_end - range_offset; - ConsumeTile(range_offset, prefix_op, valid_items); - } - } - else - { - // Consume the first tile of input (partially-full) - int valid_items = range_end - range_offset; - ConsumeTile(range_offset, prefix_op, valid_items); - } - } - - - /** - * Scan a consecutive share of input tiles, seeded with the specified prefix value - */ - __device__ __forceinline__ void ConsumeRange( - OffsetT range_offset, ///< [in] Threadblock begin offset (inclusive) - OffsetT 
range_end, ///< [in] Threadblock end offset (exclusive) - OutputT prefix) ///< [in] The prefix to apply to the scan segment - { - BlockScanRunningPrefixOp prefix_op(prefix, scan_op); - - // Consume full tiles of input - while (range_offset + TILE_ITEMS <= range_end) - { - ConsumeTile(range_offset, prefix_op); - range_offset += TILE_ITEMS; - } - - // Consume a partially-full tile - if (range_offset < range_end) - { - int valid_items = range_end - range_offset; - ConsumeTile(range_offset, prefix_op, valid_items); - } - } - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/SRC/cub/agent/agent_segment_fixup.cuh b/SRC/cub/agent/agent_segment_fixup.cuh deleted file mode 100644 index e2de58ed..00000000 --- a/SRC/cub/agent/agent_segment_fixup.cuh +++ /dev/null @@ -1,375 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::AgentSegmentFixup implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key. 
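This fixup pass exists to merge partial aggregates: upstream kernels may emit several (key, partial value) pairs for the same key, one per contributing thread block, and each must be folded into the aggregate already stored at that key's output slot. Sequentially the contract reduces to the sketch below; the helper name segment_fixup_reference and its pair-vector interface are illustrative only.

#include <cstddef>
#include <utility>
#include <vector>

template <typename Offset, typename Value, typename ReductionOp>
void segment_fixup_reference(const std::vector<std::pair<Offset, Value> >& partials,
                             std::vector<Value>& aggregates_out,
                             ReductionOp         reduction_op)
{
    // Fold each partial into the value already present at its key's slot
    for (std::size_t i = 0; i < partials.size(); ++i)
        aggregates_out[partials[i].first] =
            reduction_op(aggregates_out[partials[i].first], partials[i].second);
}

The two ConsumeTile specializations below are alternative parallelizations of this loop: one coalesces runs of equal keys and applies atomicAdd (when the value type and sm_35+ hardware allow it), the other performs a reduce-by-key exclusive scan followed by guarded scatters.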
- */ - -#pragma once - -#include - -#include "single_pass_scan_operators.cuh" -#include "../block/block_load.cuh" -#include "../block/block_store.cuh" -#include "../block/block_scan.cuh" -#include "../block/block_discontinuity.cuh" -#include "../iterator/cache_modified_input_iterator.cuh" -#include "../iterator/constant_input_iterator.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/****************************************************************************** - * Tuning policy types - ******************************************************************************/ - -/** - * Parameterizable tuning policy type for AgentSegmentFixup - */ -template < - int _BLOCK_THREADS, ///< Threads per thread block - int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) - BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use - CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements - BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use -struct AgentSegmentFixupPolicy -{ - enum - { - BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block - ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) - }; - - static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use - static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements - static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use -}; - - -/****************************************************************************** - * Thread block abstractions - ******************************************************************************/ - -/** - * \brief AgentSegmentFixup implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key - */ -template < - typename AgentSegmentFixupPolicyT, ///< Parameterized AgentSegmentFixupPolicy tuning policy type - typename PairsInputIteratorT, ///< Random-access input iterator type for keys - typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values - typename EqualityOpT, ///< KeyT equality operator type - typename ReductionOpT, ///< ValueT reduction operator type - typename OffsetT> ///< Signed integer type for global offsets -struct AgentSegmentFixup -{ - //--------------------------------------------------------------------- - // Types and constants - //--------------------------------------------------------------------- - - // Data type of key-value input iterator - typedef typename std::iterator_traits::value_type KeyValuePairT; - - // Value type - typedef typename KeyValuePairT::Value ValueT; - - // Tile status descriptor interface type - typedef ReduceByKeyScanTileState ScanTileStateT; - - // Constants - enum - { - BLOCK_THREADS = AgentSegmentFixupPolicyT::BLOCK_THREADS, - ITEMS_PER_THREAD = AgentSegmentFixupPolicyT::ITEMS_PER_THREAD, - TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - - // Whether or not do fixup using RLE + global atomics - USE_ATOMIC_FIXUP = (CUB_PTX_ARCH >= 350) && - (Equals::VALUE || - Equals::VALUE || - Equals::VALUE || - Equals::VALUE), - - // Whether or not the scan operation has a zero-valued identity value (true if we're performing addition on a primitive type) - HAS_IDENTITY_ZERO = (Equals::VALUE) && (Traits::PRIMITIVE), - }; - - // Cache-modified Input iterator wrapper type (for applying 
cache modifier) for keys - typedef typename If::VALUE, - CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator - PairsInputIteratorT>::Type // Directly use the supplied input iterator type - WrappedPairsInputIteratorT; - - // Cache-modified Input iterator wrapper type (for applying cache modifier) for fixup values - typedef typename If::VALUE, - CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator - AggregatesOutputIteratorT>::Type // Directly use the supplied input iterator type - WrappedFixupInputIteratorT; - - // Reduce-value-by-segment scan operator - typedef ReduceByKeyOp ReduceBySegmentOpT; - - // Parameterized BlockLoad type for pairs - typedef BlockLoad< - KeyValuePairT, - BLOCK_THREADS, - ITEMS_PER_THREAD, - AgentSegmentFixupPolicyT::LOAD_ALGORITHM> - BlockLoadPairs; - - // Parameterized BlockScan type - typedef BlockScan< - KeyValuePairT, - BLOCK_THREADS, - AgentSegmentFixupPolicyT::SCAN_ALGORITHM> - BlockScanT; - - // Callback type for obtaining tile prefix during block scan - typedef TilePrefixCallbackOp< - KeyValuePairT, - ReduceBySegmentOpT, - ScanTileStateT> - TilePrefixCallbackOpT; - - // Shared memory type for this thread block - union _TempStorage - { - struct - { - typename BlockScanT::TempStorage scan; // Smem needed for tile scanning - typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback - }; - - // Smem needed for loading keys - typename BlockLoadPairs::TempStorage load_pairs; - }; - - // Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - //--------------------------------------------------------------------- - // Per-thread fields - //--------------------------------------------------------------------- - - _TempStorage& temp_storage; ///< Reference to temp_storage - WrappedPairsInputIteratorT d_pairs_in; ///< Input keys - AggregatesOutputIteratorT d_aggregates_out; ///< Output value aggregates - WrappedFixupInputIteratorT d_fixup_in; ///< Fixup input values - InequalityWrapper inequality_op; ///< KeyT inequality operator - ReductionOpT reduction_op; ///< Reduction operator - ReduceBySegmentOpT scan_op; ///< Reduce-by-segment scan operator - - - //--------------------------------------------------------------------- - // Constructor - //--------------------------------------------------------------------- - - // Constructor - __device__ __forceinline__ - AgentSegmentFixup( - TempStorage& temp_storage, ///< Reference to temp_storage - PairsInputIteratorT d_pairs_in, ///< Input keys - AggregatesOutputIteratorT d_aggregates_out, ///< Output value aggregates - EqualityOpT equality_op, ///< KeyT equality operator - ReductionOpT reduction_op) ///< ValueT reduction operator - : - temp_storage(temp_storage.Alias()), - d_pairs_in(d_pairs_in), - d_aggregates_out(d_aggregates_out), - d_fixup_in(d_aggregates_out), - inequality_op(equality_op), - reduction_op(reduction_op), - scan_op(reduction_op) - {} - - - //--------------------------------------------------------------------- - // Cooperatively scan a device-wide sequence of tiles with other CTAs - //--------------------------------------------------------------------- - - - /** - * Process input tile. 
Specialized for atomic-fixup - */ - template - __device__ __forceinline__ void ConsumeTile( - OffsetT num_remaining, ///< Number of global input items remaining (including this tile) - int tile_idx, ///< Tile index - OffsetT tile_offset, ///< Tile offset - ScanTileStateT& tile_state, ///< Global tile state descriptor - Int2Type use_atomic_fixup) ///< Marker whether to use atomicAdd (instead of reduce-by-key) - { - KeyValuePairT pairs[ITEMS_PER_THREAD]; - - // Load pairs - KeyValuePairT oob_pair; - oob_pair.key = -1; - - if (IS_LAST_TILE) - BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs, num_remaining, oob_pair); - else - BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs); - - // RLE - #pragma unroll - for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - ValueT* d_scatter = d_aggregates_out + pairs[ITEM - 1].key; - if (pairs[ITEM].key != pairs[ITEM - 1].key) - atomicAdd(d_scatter, pairs[ITEM - 1].value); - else - pairs[ITEM].value = reduction_op(pairs[ITEM - 1].value, pairs[ITEM].value); - } - - // Flush last item if valid - ValueT* d_scatter = d_aggregates_out + pairs[ITEMS_PER_THREAD - 1].key; - if ((!IS_LAST_TILE) || (pairs[ITEMS_PER_THREAD - 1].key >= 0)) - atomicAdd(d_scatter, pairs[ITEMS_PER_THREAD - 1].value); - } - - - /** - * Process input tile. Specialized for reduce-by-key fixup - */ - template - __device__ __forceinline__ void ConsumeTile( - OffsetT num_remaining, ///< Number of global input items remaining (including this tile) - int tile_idx, ///< Tile index - OffsetT tile_offset, ///< Tile offset - ScanTileStateT& tile_state, ///< Global tile state descriptor - Int2Type use_atomic_fixup) ///< Marker whether to use atomicAdd (instead of reduce-by-key) - { - KeyValuePairT pairs[ITEMS_PER_THREAD]; - KeyValuePairT scatter_pairs[ITEMS_PER_THREAD]; - - // Load pairs - KeyValuePairT oob_pair; - oob_pair.key = -1; - - if (IS_LAST_TILE) - BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs, num_remaining, oob_pair); - else - BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs); - - CTA_SYNC(); - - KeyValuePairT tile_aggregate; - if (tile_idx == 0) - { - // Exclusive scan of values and segment_flags - BlockScanT(temp_storage.scan).ExclusiveScan(pairs, scatter_pairs, scan_op, tile_aggregate); - - // Update tile status if this is not the last tile - if (threadIdx.x == 0) - { - // Set first segment id to not trigger a flush (invalid from exclusive scan) - scatter_pairs[0].key = pairs[0].key; - - if (!IS_LAST_TILE) - tile_state.SetInclusive(0, tile_aggregate); - - } - } - else - { - // Exclusive scan of values and segment_flags - TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, scan_op, tile_idx); - BlockScanT(temp_storage.scan).ExclusiveScan(pairs, scatter_pairs, scan_op, prefix_op); - tile_aggregate = prefix_op.GetBlockAggregate(); - } - - // Scatter updated values - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - if (scatter_pairs[ITEM].key != pairs[ITEM].key) - { - // Update the value at the key location - ValueT value = d_fixup_in[scatter_pairs[ITEM].key]; - value = reduction_op(value, scatter_pairs[ITEM].value); - - d_aggregates_out[scatter_pairs[ITEM].key] = value; - } - } - - // Finalize the last item - if (IS_LAST_TILE) - { - // Last thread will output final count and last item, if necessary - if (threadIdx.x == BLOCK_THREADS - 1) - { - // If the last tile is a whole tile, the inclusive prefix contains accumulated value 
reduction for the last segment - if (num_remaining == TILE_ITEMS) - { - // Update the value at the key location - OffsetT last_key = pairs[ITEMS_PER_THREAD - 1].key; - d_aggregates_out[last_key] = reduction_op(tile_aggregate.value, d_fixup_in[last_key]); - } - } - } - } - - - /** - * Scan tiles of items as part of a dynamic chained scan - */ - __device__ __forceinline__ void ConsumeRange( - int num_items, ///< Total number of input items - int num_tiles, ///< Total number of input tiles - ScanTileStateT& tile_state) ///< Global tile state descriptor - { - // Blocks are launched in increasing order, so just assign one tile per block - int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index - OffsetT tile_offset = tile_idx * TILE_ITEMS; // Global offset for the current tile - OffsetT num_remaining = num_items - tile_offset; // Remaining items (including this tile) - - if (num_remaining > TILE_ITEMS) - { - // Not the last tile (full) - ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state, Int2Type()); - } - else if (num_remaining > 0) - { - // The last tile (possibly partially-full) - ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state, Int2Type()); - } - } - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/SRC/cub/agent/agent_select_if.cuh b/SRC/cub/agent/agent_select_if.cuh deleted file mode 100644 index 52ca9fc2..00000000 --- a/SRC/cub/agent/agent_select_if.cuh +++ /dev/null @@ -1,703 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::AgentSelectIf implements a stateful abstraction of CUDA thread blocks for participating in device-wide select. 
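Before the header itself, the selection contract is easiest to see in sequential form. A host-side reference for the two-partition layout this agent produces when rejects are kept (selected items packed to the front in input order, rejects packed from the back); the helper name partition_reference is illustrative:

#include <vector>

// Sketch: selected items keep their relative order at the front; rejected
// items fill the tail back-to-front, i.e. in reverse input order, mirroring
// the reversed reject scatter offsets in the agent. Returns the selected count.
template <typename T, typename Pred>
int partition_reference(const std::vector<T>& in, Pred pred, std::vector<T>& out)
{
    out.resize(in.size());
    int front = 0;
    int back  = static_cast<int>(in.size());
    for (const T& x : in)
    {
        if (pred(x)) out[front++] = x;
        else         out[--back]  = x;   // rejects grow downward from the end
    }
    return front;
}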
- */ - -#pragma once - -#include - -#include "single_pass_scan_operators.cuh" -#include "../block/block_load.cuh" -#include "../block/block_store.cuh" -#include "../block/block_scan.cuh" -#include "../block/block_exchange.cuh" -#include "../block/block_discontinuity.cuh" -#include "../grid/grid_queue.cuh" -#include "../iterator/cache_modified_input_iterator.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/****************************************************************************** - * Tuning policy types - ******************************************************************************/ - -/** - * Parameterizable tuning policy type for AgentSelectIf - */ -template < - int _BLOCK_THREADS, ///< Threads per thread block - int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) - BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use - CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements - BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use -struct AgentSelectIfPolicy -{ - enum - { - BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block - ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) - }; - - static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use - static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements - static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use -}; - - - - -/****************************************************************************** - * Thread block abstractions - ******************************************************************************/ - - -/** - * \brief AgentSelectIf implements a stateful abstraction of CUDA thread blocks for participating in device-wide selection - * - * Performs functor-based selection if SelectOpT functor type != NullType - * Otherwise performs flag-based selection if FlagsInputIterator's value type != NullType - * Otherwise performs discontinuity selection (keep unique) - */ -template < - typename AgentSelectIfPolicyT, ///< Parameterized AgentSelectIfPolicy tuning policy type - typename InputIteratorT, ///< Random-access input iterator type for selection items - typename FlagsInputIteratorT, ///< Random-access input iterator type for selections (NullType* if a selection functor or discontinuity flagging is to be used for selection) - typename SelectedOutputIteratorT, ///< Random-access input iterator type for selection_flags items - typename SelectOpT, ///< Selection operator type (NullType if selections or discontinuity flagging is to be used for selection) - typename EqualityOpT, ///< Equality operator type (NullType if selection functor or selections is to be used for selection) - typename OffsetT, ///< Signed integer type for global offsets - bool KEEP_REJECTS> ///< Whether or not we push rejected items to the back of the output -struct AgentSelectIf -{ - //--------------------------------------------------------------------- - // Types and constants - //--------------------------------------------------------------------- - - // The input value type - typedef typename std::iterator_traits::value_type InputT; - - // The output value type - typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? 
typename std::iterator_traits<InputIteratorT>::value_type,                                   // ... then the input iterator's value type,
-        typename std::iterator_traits<SelectedOutputIteratorT>::value_type>::Type OutputT;          // ... else the output iterator's value type
-
-    // The flag value type
-    typedef typename std::iterator_traits<FlagsInputIteratorT>::value_type FlagT;
-
-    // Tile status descriptor interface type
-    typedef ScanTileState<OffsetT> ScanTileStateT;
-
-    // Constants
-    enum
-    {
-        USE_SELECT_OP,
-        USE_SELECT_FLAGS,
-        USE_DISCONTINUITY,
-
-        BLOCK_THREADS           = AgentSelectIfPolicyT::BLOCK_THREADS,
-        ITEMS_PER_THREAD        = AgentSelectIfPolicyT::ITEMS_PER_THREAD,
-        TILE_ITEMS              = BLOCK_THREADS * ITEMS_PER_THREAD,
-        TWO_PHASE_SCATTER       = (ITEMS_PER_THREAD > 1),
-
-        SELECT_METHOD           = (!Equals<SelectOpT, NullType>::VALUE) ?
-                                      USE_SELECT_OP :
-                                      (!Equals<FlagT, NullType>::VALUE) ?
-                                          USE_SELECT_FLAGS :
-                                          USE_DISCONTINUITY
-    };
-
-    // Cache-modified Input iterator wrapper type (for applying cache modifier) for items
-    typedef typename If<IsPointer<InputIteratorT>::VALUE,
-            CacheModifiedInputIterator<AgentSelectIfPolicyT::LOAD_MODIFIER, InputT, OffsetT>,    // Wrap the native input pointer with CacheModifiedValuesInputIterator
-            InputIteratorT>::Type                                                                // Directly use the supplied input iterator type
-        WrappedInputIteratorT;
-
-    // Cache-modified Input iterator wrapper type (for applying cache modifier) for values
-    typedef typename If<IsPointer<FlagsInputIteratorT>::VALUE,
-            CacheModifiedInputIterator<AgentSelectIfPolicyT::LOAD_MODIFIER, FlagT, OffsetT>,     // Wrap the native input pointer with CacheModifiedValuesInputIterator
-            FlagsInputIteratorT>::Type                                                           // Directly use the supplied input iterator type
-        WrappedFlagsInputIteratorT;
-
-    // Parameterized BlockLoad type for input data
-    typedef BlockLoad<
-            OutputT,
-            BLOCK_THREADS,
-            ITEMS_PER_THREAD,
-            AgentSelectIfPolicyT::LOAD_ALGORITHM>
-        BlockLoadT;
-
-    // Parameterized BlockLoad type for flags
-    typedef BlockLoad<
-            FlagT,
-            BLOCK_THREADS,
-            ITEMS_PER_THREAD,
-            AgentSelectIfPolicyT::LOAD_ALGORITHM>
-        BlockLoadFlags;
-
-    // Parameterized BlockDiscontinuity type for items
-    typedef BlockDiscontinuity<
-            OutputT,
-            BLOCK_THREADS>
-        BlockDiscontinuityT;
-
-    // Parameterized BlockScan type
-    typedef BlockScan<
-            OffsetT,
-            BLOCK_THREADS,
-            AgentSelectIfPolicyT::SCAN_ALGORITHM>
-        BlockScanT;
-
-    // Callback type for obtaining tile prefix during block scan
-    typedef TilePrefixCallbackOp<
-            OffsetT,
-            cub::Sum,
-            ScanTileStateT>
-        TilePrefixCallbackOpT;
-
-    // Item exchange type
-    typedef OutputT ItemExchangeT[TILE_ITEMS];
-
-    // Shared memory type for this thread block
-    union _TempStorage
-    {
-        struct
-        {
-            typename BlockScanT::TempStorage                scan;           // Smem needed for tile scanning
-            typename TilePrefixCallbackOpT::TempStorage     prefix;         // Smem needed for cooperative prefix callback
-            typename BlockDiscontinuityT::TempStorage       discontinuity;  // Smem needed for discontinuity detection
-        };
-
-        // Smem needed for loading items
-        typename BlockLoadT::TempStorage load_items;
-
-        // Smem needed for loading values
-        typename BlockLoadFlags::TempStorage load_flags;
-
-        // Smem needed for compacting items (allows non POD items in this union)
-        Uninitialized<ItemExchangeT> raw_exchange;
-    };
-
-    // Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Per-thread fields
-    //---------------------------------------------------------------------
-
-    _TempStorage&                   temp_storage;       ///< Reference to temp_storage
-    WrappedInputIteratorT           d_in;               ///< Input items
-    SelectedOutputIteratorT         d_selected_out;     ///< Unique output items
-    WrappedFlagsInputIteratorT      d_flags_in;         ///< Input selection flags (if applicable)
-    InequalityWrapper<EqualityOpT>  inequality_op;      ///< T inequality operator
-
SelectOpT select_op; ///< Selection operator - OffsetT num_items; ///< Total number of input items - - - //--------------------------------------------------------------------- - // Constructor - //--------------------------------------------------------------------- - - // Constructor - __device__ __forceinline__ - AgentSelectIf( - TempStorage &temp_storage, ///< Reference to temp_storage - InputIteratorT d_in, ///< Input data - FlagsInputIteratorT d_flags_in, ///< Input selection flags (if applicable) - SelectedOutputIteratorT d_selected_out, ///< Output data - SelectOpT select_op, ///< Selection operator - EqualityOpT equality_op, ///< Equality operator - OffsetT num_items) ///< Total number of input items - : - temp_storage(temp_storage.Alias()), - d_in(d_in), - d_flags_in(d_flags_in), - d_selected_out(d_selected_out), - select_op(select_op), - inequality_op(equality_op), - num_items(num_items) - {} - - - //--------------------------------------------------------------------- - // Utility methods for initializing the selections - //--------------------------------------------------------------------- - - /** - * Initialize selections (specialized for selection operator) - */ - template - __device__ __forceinline__ void InitializeSelections( - OffsetT /*tile_offset*/, - OffsetT num_tile_items, - OutputT (&items)[ITEMS_PER_THREAD], - OffsetT (&selection_flags)[ITEMS_PER_THREAD], - Int2Type /*select_method*/) - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - // Out-of-bounds items are selection_flags - selection_flags[ITEM] = 1; - - if (!IS_LAST_TILE || (OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM < num_tile_items)) - selection_flags[ITEM] = select_op(items[ITEM]); - } - } - - - /** - * Initialize selections (specialized for valid flags) - */ - template - __device__ __forceinline__ void InitializeSelections( - OffsetT tile_offset, - OffsetT num_tile_items, - OutputT (&/*items*/)[ITEMS_PER_THREAD], - OffsetT (&selection_flags)[ITEMS_PER_THREAD], - Int2Type /*select_method*/) - { - CTA_SYNC(); - - FlagT flags[ITEMS_PER_THREAD]; - - if (IS_LAST_TILE) - { - // Out-of-bounds items are selection_flags - BlockLoadFlags(temp_storage.load_flags).Load(d_flags_in + tile_offset, flags, num_tile_items, 1); - } - else - { - BlockLoadFlags(temp_storage.load_flags).Load(d_flags_in + tile_offset, flags); - } - - // Convert flag type to selection_flags type - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - selection_flags[ITEM] = flags[ITEM]; - } - } - - - /** - * Initialize selections (specialized for discontinuity detection) - */ - template - __device__ __forceinline__ void InitializeSelections( - OffsetT tile_offset, - OffsetT num_tile_items, - OutputT (&items)[ITEMS_PER_THREAD], - OffsetT (&selection_flags)[ITEMS_PER_THREAD], - Int2Type /*select_method*/) - { - if (IS_FIRST_TILE) - { - CTA_SYNC(); - - // Set head selection_flags. 
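For the discontinuity ("unique") path being initialized here, the head flags reduce, sequentially, to an adjacent-difference pass. A short host-side sketch, assuming T supports operator!=; the helper name head_flags is illustrative:

#include <vector>

// Sketch: flag[i] = 1 iff item i differs from its predecessor (item 0 is
// always a head). An exclusive scan of these flags then yields the scatter
// offsets used for compaction.
template <typename T>
std::vector<int> head_flags(const std::vector<T>& items)
{
    std::vector<int> flags(items.size());
    for (size_t i = 0; i < items.size(); ++i)
        flags[i] = (i == 0) || (items[i] != items[i - 1]);
    return flags;
}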
First tile sets the first flag for the first item - BlockDiscontinuityT(temp_storage.discontinuity).FlagHeads(selection_flags, items, inequality_op); - } - else - { - OutputT tile_predecessor; - if (threadIdx.x == 0) - tile_predecessor = d_in[tile_offset - 1]; - - CTA_SYNC(); - - BlockDiscontinuityT(temp_storage.discontinuity).FlagHeads(selection_flags, items, inequality_op, tile_predecessor); - } - - // Set selection flags for out-of-bounds items - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - // Set selection_flags for out-of-bounds items - if ((IS_LAST_TILE) && (OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM >= num_tile_items)) - selection_flags[ITEM] = 1; - } - } - - - //--------------------------------------------------------------------- - // Scatter utility methods - //--------------------------------------------------------------------- - - /** - * Scatter flagged items to output offsets (specialized for direct scattering) - */ - template - __device__ __forceinline__ void ScatterDirect( - OutputT (&items)[ITEMS_PER_THREAD], - OffsetT (&selection_flags)[ITEMS_PER_THREAD], - OffsetT (&selection_indices)[ITEMS_PER_THREAD], - OffsetT num_selections) - { - // Scatter flagged items - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - if (selection_flags[ITEM]) - { - if ((!IS_LAST_TILE) || selection_indices[ITEM] < num_selections) - { - d_selected_out[selection_indices[ITEM]] = items[ITEM]; - } - } - } - } - - - /** - * Scatter flagged items to output offsets (specialized for two-phase scattering) - */ - template - __device__ __forceinline__ void ScatterTwoPhase( - OutputT (&items)[ITEMS_PER_THREAD], - OffsetT (&selection_flags)[ITEMS_PER_THREAD], - OffsetT (&selection_indices)[ITEMS_PER_THREAD], - int /*num_tile_items*/, ///< Number of valid items in this tile - int num_tile_selections, ///< Number of selections in this tile - OffsetT num_selections_prefix, ///< Total number of selections prior to this tile - OffsetT /*num_rejected_prefix*/, ///< Total number of rejections prior to this tile - Int2Type /*is_keep_rejects*/) ///< Marker type indicating whether to keep rejected items in the second partition - { - CTA_SYNC(); - - // Compact and scatter items - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - int local_scatter_offset = selection_indices[ITEM] - num_selections_prefix; - if (selection_flags[ITEM]) - { - temp_storage.raw_exchange.Alias()[local_scatter_offset] = items[ITEM]; - } - } - - CTA_SYNC(); - - for (int item = threadIdx.x; item < num_tile_selections; item += BLOCK_THREADS) - { - d_selected_out[num_selections_prefix + item] = temp_storage.raw_exchange.Alias()[item]; - } - } - - - /** - * Scatter flagged items to output offsets (specialized for two-phase scattering) - */ - template - __device__ __forceinline__ void ScatterTwoPhase( - OutputT (&items)[ITEMS_PER_THREAD], - OffsetT (&selection_flags)[ITEMS_PER_THREAD], - OffsetT (&selection_indices)[ITEMS_PER_THREAD], - int num_tile_items, ///< Number of valid items in this tile - int num_tile_selections, ///< Number of selections in this tile - OffsetT num_selections_prefix, ///< Total number of selections prior to this tile - OffsetT num_rejected_prefix, ///< Total number of rejections prior to this tile - Int2Type /*is_keep_rejects*/) ///< Marker type indicating whether to keep rejected items in the second partition - { - CTA_SYNC(); - - int tile_num_rejections = num_tile_items - num_tile_selections; - - // Scatter items to shared memory 
(rejections first) - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - int item_idx = (threadIdx.x * ITEMS_PER_THREAD) + ITEM; - int local_selection_idx = selection_indices[ITEM] - num_selections_prefix; - int local_rejection_idx = item_idx - local_selection_idx; - int local_scatter_offset = (selection_flags[ITEM]) ? - tile_num_rejections + local_selection_idx : - local_rejection_idx; - - temp_storage.raw_exchange.Alias()[local_scatter_offset] = items[ITEM]; - } - - CTA_SYNC(); - - // Gather items from shared memory and scatter to global - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - int item_idx = (ITEM * BLOCK_THREADS) + threadIdx.x; - int rejection_idx = item_idx; - int selection_idx = item_idx - tile_num_rejections; - OffsetT scatter_offset = (item_idx < tile_num_rejections) ? - num_items - num_rejected_prefix - rejection_idx - 1 : - num_selections_prefix + selection_idx; - - OutputT item = temp_storage.raw_exchange.Alias()[item_idx]; - - if (!IS_LAST_TILE || (item_idx < num_tile_items)) - { - d_selected_out[scatter_offset] = item; - } - } - } - - - /** - * Scatter flagged items - */ - template - __device__ __forceinline__ void Scatter( - OutputT (&items)[ITEMS_PER_THREAD], - OffsetT (&selection_flags)[ITEMS_PER_THREAD], - OffsetT (&selection_indices)[ITEMS_PER_THREAD], - int num_tile_items, ///< Number of valid items in this tile - int num_tile_selections, ///< Number of selections in this tile - OffsetT num_selections_prefix, ///< Total number of selections prior to this tile - OffsetT num_rejected_prefix, ///< Total number of rejections prior to this tile - OffsetT num_selections) ///< Total number of selections including this tile - { - // Do a two-phase scatter if (a) keeping both partitions or (b) two-phase is enabled and the average number of selection_flags items per thread is greater than one - if (KEEP_REJECTS || (TWO_PHASE_SCATTER && (num_tile_selections > BLOCK_THREADS))) - { - ScatterTwoPhase( - items, - selection_flags, - selection_indices, - num_tile_items, - num_tile_selections, - num_selections_prefix, - num_rejected_prefix, - Int2Type()); - } - else - { - ScatterDirect( - items, - selection_flags, - selection_indices, - num_selections); - } - } - - //--------------------------------------------------------------------- - // Cooperatively scan a device-wide sequence of tiles with other CTAs - //--------------------------------------------------------------------- - - - /** - * Process first tile of input (dynamic chained scan). 
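Both tile consumers below participate in the dynamic chained scan: tile 0 publishes its aggregate through the tile-state descriptor, and every later tile obtains the running total of all of its predecessors as its exclusive prefix. Stripped of the decoupled look-back machinery, the contract reduces to the following sequential sketch (helper name illustrative):

#include <vector>

// Sketch: what TilePrefixCallbackOp hands each tile. exclusive[t] is the sum
// of aggregates of tiles 0..t-1; adding tile t's own aggregate produces the
// inclusive status it publishes for its successors.
inline std::vector<int> chained_prefixes(const std::vector<int>& tile_aggregates)
{
    std::vector<int> exclusive(tile_aggregates.size());
    int running = 0;
    for (size_t t = 0; t < tile_aggregates.size(); ++t)
    {
        exclusive[t] = running;
        running += tile_aggregates[t];
    }
    return exclusive;
}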
Returns the running count of selections (including this tile) - */ - template - __device__ __forceinline__ OffsetT ConsumeFirstTile( - int num_tile_items, ///< Number of input items comprising this tile - OffsetT tile_offset, ///< Tile offset - ScanTileStateT& tile_state) ///< Global tile state descriptor - { - OutputT items[ITEMS_PER_THREAD]; - OffsetT selection_flags[ITEMS_PER_THREAD]; - OffsetT selection_indices[ITEMS_PER_THREAD]; - - // Load items - if (IS_LAST_TILE) - BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items, num_tile_items); - else - BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items); - - // Initialize selection_flags - InitializeSelections( - tile_offset, - num_tile_items, - items, - selection_flags, - Int2Type()); - - CTA_SYNC(); - - // Exclusive scan of selection_flags - OffsetT num_tile_selections; - BlockScanT(temp_storage.scan).ExclusiveSum(selection_flags, selection_indices, num_tile_selections); - - if (threadIdx.x == 0) - { - // Update tile status if this is not the last tile - if (!IS_LAST_TILE) - tile_state.SetInclusive(0, num_tile_selections); - } - - // Discount any out-of-bounds selections - if (IS_LAST_TILE) - num_tile_selections -= (TILE_ITEMS - num_tile_items); - - // Scatter flagged items - Scatter( - items, - selection_flags, - selection_indices, - num_tile_items, - num_tile_selections, - 0, - 0, - num_tile_selections); - - return num_tile_selections; - } - - - /** - * Process subsequent tile of input (dynamic chained scan). Returns the running count of selections (including this tile) - */ - template - __device__ __forceinline__ OffsetT ConsumeSubsequentTile( - int num_tile_items, ///< Number of input items comprising this tile - int tile_idx, ///< Tile index - OffsetT tile_offset, ///< Tile offset - ScanTileStateT& tile_state) ///< Global tile state descriptor - { - OutputT items[ITEMS_PER_THREAD]; - OffsetT selection_flags[ITEMS_PER_THREAD]; - OffsetT selection_indices[ITEMS_PER_THREAD]; - - // Load items - if (IS_LAST_TILE) - BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items, num_tile_items); - else - BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items); - - // Initialize selection_flags - InitializeSelections( - tile_offset, - num_tile_items, - items, - selection_flags, - Int2Type()); - - CTA_SYNC(); - - // Exclusive scan of values and selection_flags - TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, cub::Sum(), tile_idx); - BlockScanT(temp_storage.scan).ExclusiveSum(selection_flags, selection_indices, prefix_op); - - OffsetT num_tile_selections = prefix_op.GetBlockAggregate(); - OffsetT num_selections = prefix_op.GetInclusivePrefix(); - OffsetT num_selections_prefix = prefix_op.GetExclusivePrefix(); - OffsetT num_rejected_prefix = (tile_idx * TILE_ITEMS) - num_selections_prefix; - - // Discount any out-of-bounds selections - if (IS_LAST_TILE) - { - int num_discount = TILE_ITEMS - num_tile_items; - num_selections -= num_discount; - num_tile_selections -= num_discount; - } - - // Scatter flagged items - Scatter( - items, - selection_flags, - selection_indices, - num_tile_items, - num_tile_selections, - num_selections_prefix, - num_rejected_prefix, - num_selections); - - return num_selections; - } - - - /** - * Process a tile of input - */ - template - __device__ __forceinline__ OffsetT ConsumeTile( - int num_tile_items, ///< Number of input items comprising this tile - int tile_idx, ///< Tile index - OffsetT tile_offset, ///< Tile offset - ScanTileStateT& tile_state) 
///< Global tile state descriptor - { - OffsetT num_selections; - if (tile_idx == 0) - { - num_selections = ConsumeFirstTile(num_tile_items, tile_offset, tile_state); - } - else - { - num_selections = ConsumeSubsequentTile(num_tile_items, tile_idx, tile_offset, tile_state); - } - - return num_selections; - } - - - /** - * Scan tiles of items as part of a dynamic chained scan - */ - template ///< Output iterator type for recording number of items selection_flags - __device__ __forceinline__ void ConsumeRange( - int num_tiles, ///< Total number of input tiles - ScanTileStateT& tile_state, ///< Global tile state descriptor - NumSelectedIteratorT d_num_selected_out) ///< Output total number selection_flags - { - // Blocks are launched in increasing order, so just assign one tile per block - int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index - OffsetT tile_offset = tile_idx * TILE_ITEMS; // Global offset for the current tile - - if (tile_idx < num_tiles - 1) - { - // Not the last tile (full) - ConsumeTile(TILE_ITEMS, tile_idx, tile_offset, tile_state); - } - else - { - // The last tile (possibly partially-full) - OffsetT num_remaining = num_items - tile_offset; - OffsetT num_selections = ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); - - if (threadIdx.x == 0) - { - // Output the total number of items selection_flags - *d_num_selected_out = num_selections; - } - } - } - -}; - - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/SRC/cub/agent/agent_spmv_orig.cuh b/SRC/cub/agent/agent_spmv_orig.cuh deleted file mode 100644 index 54e2a139..00000000 --- a/SRC/cub/agent/agent_spmv_orig.cuh +++ /dev/null @@ -1,670 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
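The file removed next implements SpMV (in the general case y = alpha*A*x + beta*y) on a CSR matrix addressed by row-end offsets. As a sequential baseline for what the merge-path agent parallelizes, assuming row i spans nonzeros [row_end[i-1], row_end[i]) with the start of row 0 taken as 0; the helper name spmv_reference is illustrative:

// Sketch: plain CSR SpMV using the same row-end-offsets convention as the
// agent's d_row_end_offsets array.
inline void spmv_reference(const float* values, const int* row_end,
                           const int* col_idx, const float* x,
                           float* y, int num_rows, float alpha, float beta)
{
    int begin = 0;
    for (int row = 0; row < num_rows; ++row)
    {
        float sum = 0.f;
        for (int nz = begin; nz < row_end[row]; ++nz)
            sum += values[nz] * x[col_idx[nz]];
        y[row] = alpha * sum + beta * y[row];
        begin = row_end[row];
    }
}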
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::AgentSpmv implements a stateful abstraction of CUDA thread blocks for participating in device-wide SpMV.
- */
-
-#pragma once
-
-#include <iterator>
-
-#include "../util_type.cuh"
-#include "../block/block_reduce.cuh"
-#include "../block/block_scan.cuh"
-#include "../block/block_exchange.cuh"
-#include "../thread/thread_search.cuh"
-#include "../thread/thread_operators.cuh"
-#include "../iterator/cache_modified_input_iterator.cuh"
-#include "../iterator/counting_input_iterator.cuh"
-#include "../iterator/tex_ref_input_iterator.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Tuning policy
- ******************************************************************************/
-
-/**
- * Parameterizable tuning policy type for AgentSpmv
- */
-template <
-    int                             _BLOCK_THREADS,                         ///< Threads per thread block
-    int                             _ITEMS_PER_THREAD,                      ///< Items per thread (per tile of input)
-    CacheLoadModifier               _ROW_OFFSETS_SEARCH_LOAD_MODIFIER,      ///< Cache load modifier for reading CSR row-offsets during search
-    CacheLoadModifier               _ROW_OFFSETS_LOAD_MODIFIER,             ///< Cache load modifier for reading CSR row-offsets
-    CacheLoadModifier               _COLUMN_INDICES_LOAD_MODIFIER,          ///< Cache load modifier for reading CSR column-indices
-    CacheLoadModifier               _VALUES_LOAD_MODIFIER,                  ///< Cache load modifier for reading CSR values
-    CacheLoadModifier               _VECTOR_VALUES_LOAD_MODIFIER,           ///< Cache load modifier for reading vector values
-    bool                            _DIRECT_LOAD_NONZEROS,                  ///< Whether to load nonzeros directly from global during sequential merging (vs. pre-staged through shared memory)
-    BlockScanAlgorithm              _SCAN_ALGORITHM>                        ///< The BlockScan algorithm to use
-struct AgentSpmvPolicy
-{
-    enum
-    {
-        BLOCK_THREADS           = _BLOCK_THREADS,           ///< Threads per thread block
-        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,        ///< Items per thread (per tile of input)
-        DIRECT_LOAD_NONZEROS    = _DIRECT_LOAD_NONZEROS,    ///< Whether to load nonzeros directly from global during sequential merging (pre-staged through shared memory)
-    };
-
-    static const CacheLoadModifier  ROW_OFFSETS_SEARCH_LOAD_MODIFIER    = _ROW_OFFSETS_SEARCH_LOAD_MODIFIER;    ///< Cache load modifier for reading CSR row-offsets
-    static const CacheLoadModifier  ROW_OFFSETS_LOAD_MODIFIER           = _ROW_OFFSETS_LOAD_MODIFIER;           ///< Cache load modifier for reading CSR row-offsets
-    static const CacheLoadModifier  COLUMN_INDICES_LOAD_MODIFIER        = _COLUMN_INDICES_LOAD_MODIFIER;        ///< Cache load modifier for reading CSR column-indices
-    static const CacheLoadModifier  VALUES_LOAD_MODIFIER                = _VALUES_LOAD_MODIFIER;                ///< Cache load modifier for reading CSR values
-    static const CacheLoadModifier  VECTOR_VALUES_LOAD_MODIFIER         = _VECTOR_VALUES_LOAD_MODIFIER;         ///< Cache load modifier for reading vector values
-    static const BlockScanAlgorithm SCAN_ALGORITHM                      = _SCAN_ALGORITHM;                      ///< The BlockScan algorithm to use
-
-};
-
-
-/******************************************************************************
- * Thread block abstractions
- ******************************************************************************/
-
-template <
-    typename        ValueT,         ///< Matrix and vector value type
-    typename        OffsetT>        ///< Signed integer type for sequence offsets
-struct SpmvParams
-{
-    ValueT*         d_values;            ///< Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix A.
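Work in this agent is split by merge-path search: conceptually, the row-end offsets (list A) are merged with the natural numbers 0..num_nonzeros-1 (list B), and each equally spaced diagonal of that merge grid is located by binary search so every thread gets the same amount of merged work. A host-side sketch of the search performed by MergePathSearch above the kernel; the names Coord and merge_path_search are illustrative:

#include <algorithm>

struct Coord { int x, y; };   // x: row index, y: nonzero index

// Sketch: find where a diagonal crosses the merge path of row_end[0..num_rows)
// against the implicit counting sequence 0..nnz-1.
inline Coord merge_path_search(int diagonal, const int* row_end,
                               int num_rows, int nnz)
{
    int x_min = std::max(diagonal - nnz, 0);
    int x_max = std::min(diagonal, num_rows);
    while (x_min < x_max)                       // binary search along the diagonal
    {
        int pivot = (x_min + x_max) / 2;
        if (row_end[pivot] <= diagonal - pivot - 1)
            x_min = pivot + 1;                  // path crosses below: move down
        else
            x_max = pivot;                      // path stays above: move left
    }
    return Coord{ std::min(x_min, num_rows), diagonal - x_min };
}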
- OffsetT* d_row_end_offsets; ///< Pointer to the array of \p m offsets demarcating the end of every row in \p d_column_indices and \p d_values - OffsetT* d_column_indices; ///< Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix A. (Indices are zero-valued.) - ValueT* d_vector_x; ///< Pointer to the array of \p num_cols values corresponding to the dense input vector x - ValueT* d_vector_y; ///< Pointer to the array of \p num_rows values corresponding to the dense output vector y - int num_rows; ///< Number of rows of matrix A. - int num_cols; ///< Number of columns of matrix A. - int num_nonzeros; ///< Number of nonzero elements of matrix A. - ValueT alpha; ///< Alpha multiplicand - ValueT beta; ///< Beta addend-multiplicand - - TexRefInputIterator t_vector_x; -}; - - -/** - * \brief AgentSpmv implements a stateful abstraction of CUDA thread blocks for participating in device-wide SpMV. - */ -template < - typename AgentSpmvPolicyT, ///< Parameterized AgentSpmvPolicy tuning policy type - typename ValueT, ///< Matrix and vector value type - typename OffsetT, ///< Signed integer type for sequence offsets - bool HAS_ALPHA, ///< Whether the input parameter \p alpha is 1 - bool HAS_BETA, ///< Whether the input parameter \p beta is 0 - int PTX_ARCH = CUB_PTX_ARCH> ///< PTX compute capability -struct AgentSpmv -{ - //--------------------------------------------------------------------- - // Types and constants - //--------------------------------------------------------------------- - - /// Constants - enum - { - BLOCK_THREADS = AgentSpmvPolicyT::BLOCK_THREADS, - ITEMS_PER_THREAD = AgentSpmvPolicyT::ITEMS_PER_THREAD, - TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - }; - - /// 2D merge path coordinate type - typedef typename CubVector::Type CoordinateT; - - /// Input iterator wrapper types (for applying cache modifiers) - - typedef CacheModifiedInputIterator< - AgentSpmvPolicyT::ROW_OFFSETS_SEARCH_LOAD_MODIFIER, - OffsetT, - OffsetT> - RowOffsetsSearchIteratorT; - - typedef CacheModifiedInputIterator< - AgentSpmvPolicyT::ROW_OFFSETS_LOAD_MODIFIER, - OffsetT, - OffsetT> - RowOffsetsIteratorT; - - typedef CacheModifiedInputIterator< - AgentSpmvPolicyT::COLUMN_INDICES_LOAD_MODIFIER, - OffsetT, - OffsetT> - ColumnIndicesIteratorT; - - typedef CacheModifiedInputIterator< - AgentSpmvPolicyT::VALUES_LOAD_MODIFIER, - ValueT, - OffsetT> - ValueIteratorT; - - typedef CacheModifiedInputIterator< - AgentSpmvPolicyT::VECTOR_VALUES_LOAD_MODIFIER, - ValueT, - OffsetT> - VectorValueIteratorT; - - // Tuple type for scanning (pairs accumulated segment-value with segment-index) - typedef KeyValuePair KeyValuePairT; - - // Reduce-value-by-segment scan operator - typedef ReduceByKeyOp ReduceBySegmentOpT; - - // BlockReduce specialization - typedef BlockReduce< - ValueT, - BLOCK_THREADS, - BLOCK_REDUCE_WARP_REDUCTIONS> - BlockReduceT; - - // BlockScan specialization - typedef BlockScan< - KeyValuePairT, - BLOCK_THREADS, - AgentSpmvPolicyT::SCAN_ALGORITHM> - BlockScanT; - - // BlockScan specialization - typedef BlockScan< - ValueT, - BLOCK_THREADS, - AgentSpmvPolicyT::SCAN_ALGORITHM> - BlockPrefixSumT; - - // BlockExchange specialization - typedef BlockExchange< - ValueT, - BLOCK_THREADS, - ITEMS_PER_THREAD> - BlockExchangeT; - - /// Merge item type (either a non-zero value or a row-end offset) - union MergeItem - { - // Value type to pair with index type OffsetT (NullType if loading values directly during merge) - typedef typename If::Type MergeValueT; - - OffsetT 
row_end_offset; - MergeValueT nonzero; - }; - - /// Shared memory type required by this thread block - struct _TempStorage - { - CoordinateT tile_coords[2]; - - union Aliasable - { - // Smem needed for tile of merge items - MergeItem merge_items[ITEMS_PER_THREAD + TILE_ITEMS + 1]; - - // Smem needed for block exchange - typename BlockExchangeT::TempStorage exchange; - - // Smem needed for block-wide reduction - typename BlockReduceT::TempStorage reduce; - - // Smem needed for tile scanning - typename BlockScanT::TempStorage scan; - - // Smem needed for tile prefix sum - typename BlockPrefixSumT::TempStorage prefix_sum; - - } aliasable; - }; - - /// Temporary storage type (unionable) - struct TempStorage : Uninitialized<_TempStorage> {}; - - - //--------------------------------------------------------------------- - // Per-thread fields - //--------------------------------------------------------------------- - - - _TempStorage& temp_storage; /// Reference to temp_storage - - SpmvParams& spmv_params; - - ValueIteratorT wd_values; ///< Wrapped pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix A. - RowOffsetsIteratorT wd_row_end_offsets; ///< Wrapped Pointer to the array of \p m offsets demarcating the end of every row in \p d_column_indices and \p d_values - ColumnIndicesIteratorT wd_column_indices; ///< Wrapped Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix A. (Indices are zero-valued.) - VectorValueIteratorT wd_vector_x; ///< Wrapped Pointer to the array of \p num_cols values corresponding to the dense input vector x - VectorValueIteratorT wd_vector_y; ///< Wrapped Pointer to the array of \p num_cols values corresponding to the dense input vector x - - - //--------------------------------------------------------------------- - // Interface - //--------------------------------------------------------------------- - - /** - * Constructor - */ - __device__ __forceinline__ AgentSpmv( - TempStorage& temp_storage, ///< Reference to temp_storage - SpmvParams& spmv_params) ///< SpMV input parameter bundle - : - temp_storage(temp_storage.Alias()), - spmv_params(spmv_params), - wd_values(spmv_params.d_values), - wd_row_end_offsets(spmv_params.d_row_end_offsets), - wd_column_indices(spmv_params.d_column_indices), - wd_vector_x(spmv_params.d_vector_x), - wd_vector_y(spmv_params.d_vector_y) - {} - - - - - /** - * Consume a merge tile, specialized for direct-load of nonzeros - */ - __device__ __forceinline__ KeyValuePairT ConsumeTile( - int tile_idx, - CoordinateT tile_start_coord, - CoordinateT tile_end_coord, - Int2Type is_direct_load) ///< Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch - { - int tile_num_rows = tile_end_coord.x - tile_start_coord.x; - int tile_num_nonzeros = tile_end_coord.y - tile_start_coord.y; - OffsetT* s_tile_row_end_offsets = &temp_storage.aliasable.merge_items[0].row_end_offset; - - // Gather the row end-offsets for the merge tile into shared memory - for (int item = threadIdx.x; item <= tile_num_rows; item += BLOCK_THREADS) - { - s_tile_row_end_offsets[item] = wd_row_end_offsets[tile_start_coord.x + item]; - } - - CTA_SYNC(); - - // Search for the thread's starting coordinate within the merge tile - CountingInputIterator tile_nonzero_indices(tile_start_coord.y); - CoordinateT thread_start_coord; - - MergePathSearch( - OffsetT(threadIdx.x * ITEMS_PER_THREAD), // Diagonal - s_tile_row_end_offsets, // List A - 
tile_nonzero_indices, // List B - tile_num_rows, - tile_num_nonzeros, - thread_start_coord); - - CTA_SYNC(); // Perf-sync - - // Compute the thread's merge path segment - CoordinateT thread_current_coord = thread_start_coord; - KeyValuePairT scan_segment[ITEMS_PER_THREAD]; - - ValueT running_total = 0.0; - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - OffsetT nonzero_idx = CUB_MIN(tile_nonzero_indices[thread_current_coord.y], spmv_params.num_nonzeros - 1); - OffsetT column_idx = wd_column_indices[nonzero_idx]; - ValueT value = wd_values[nonzero_idx]; - - ValueT vector_value = spmv_params.t_vector_x[column_idx]; -#if (CUB_PTX_ARCH >= 350) - vector_value = wd_vector_x[column_idx]; -#endif - ValueT nonzero = value * vector_value; - - OffsetT row_end_offset = s_tile_row_end_offsets[thread_current_coord.x]; - - if (tile_nonzero_indices[thread_current_coord.y] < row_end_offset) - { - // Move down (accumulate) - running_total += nonzero; - scan_segment[ITEM].value = running_total; - scan_segment[ITEM].key = tile_num_rows; - ++thread_current_coord.y; - } - else - { - // Move right (reset) - scan_segment[ITEM].value = running_total; - scan_segment[ITEM].key = thread_current_coord.x; - running_total = 0.0; - ++thread_current_coord.x; - } - } - - CTA_SYNC(); - - // Block-wide reduce-value-by-segment - KeyValuePairT tile_carry; - ReduceBySegmentOpT scan_op; - KeyValuePairT scan_item; - - scan_item.value = running_total; - scan_item.key = thread_current_coord.x; - - BlockScanT(temp_storage.aliasable.scan).ExclusiveScan(scan_item, scan_item, scan_op, tile_carry); - - if (tile_num_rows > 0) - { - if (threadIdx.x == 0) - scan_item.key = -1; - - // Direct scatter - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - if (scan_segment[ITEM].key < tile_num_rows) - { - if (scan_item.key == scan_segment[ITEM].key) - scan_segment[ITEM].value = scan_item.value + scan_segment[ITEM].value; - - if (HAS_ALPHA) - { - scan_segment[ITEM].value *= spmv_params.alpha; - } - - if (HAS_BETA) - { - // Update the output vector element - ValueT addend = spmv_params.beta * wd_vector_y[tile_start_coord.x + scan_segment[ITEM].key]; - scan_segment[ITEM].value += addend; - } - - // Set the output vector element - spmv_params.d_vector_y[tile_start_coord.x + scan_segment[ITEM].key] = scan_segment[ITEM].value; - } - } - } - - // Return the tile's running carry-out - return tile_carry; - } - - - - /** - * Consume a merge tile, specialized for indirect load of nonzeros - */ - __device__ __forceinline__ KeyValuePairT ConsumeTile( - int tile_idx, - CoordinateT tile_start_coord, - CoordinateT tile_end_coord, - Int2Type is_direct_load) ///< Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch - { - int tile_num_rows = tile_end_coord.x - tile_start_coord.x; - int tile_num_nonzeros = tile_end_coord.y - tile_start_coord.y; - -#if (CUB_PTX_ARCH >= 520) - - OffsetT* s_tile_row_end_offsets = &temp_storage.aliasable.merge_items[0].row_end_offset; - ValueT* s_tile_nonzeros = &temp_storage.aliasable.merge_items[tile_num_rows + ITEMS_PER_THREAD].nonzero; - - // Gather the nonzeros for the merge tile into shared memory - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - int nonzero_idx = threadIdx.x + (ITEM * BLOCK_THREADS); - - ValueIteratorT a = wd_values + tile_start_coord.y + nonzero_idx; - ColumnIndicesIteratorT ci = wd_column_indices + tile_start_coord.y + nonzero_idx; - ValueT* s = s_tile_nonzeros + nonzero_idx; - - if 
(nonzero_idx < tile_num_nonzeros) - { - - OffsetT column_idx = *ci; - ValueT value = *a; - - ValueT vector_value = spmv_params.t_vector_x[column_idx]; - vector_value = wd_vector_x[column_idx]; - - ValueT nonzero = value * vector_value; - - *s = nonzero; - } - } - - -#else - - OffsetT* s_tile_row_end_offsets = &temp_storage.aliasable.merge_items[0].row_end_offset; - ValueT* s_tile_nonzeros = &temp_storage.aliasable.merge_items[tile_num_rows + ITEMS_PER_THREAD].nonzero; - - // Gather the nonzeros for the merge tile into shared memory - if (tile_num_nonzeros > 0) - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - int nonzero_idx = threadIdx.x + (ITEM * BLOCK_THREADS); - nonzero_idx = CUB_MIN(nonzero_idx, tile_num_nonzeros - 1); - - OffsetT column_idx = wd_column_indices[tile_start_coord.y + nonzero_idx]; - ValueT value = wd_values[tile_start_coord.y + nonzero_idx]; - - ValueT vector_value = spmv_params.t_vector_x[column_idx]; -#if (CUB_PTX_ARCH >= 350) - vector_value = wd_vector_x[column_idx]; -#endif - ValueT nonzero = value * vector_value; - - s_tile_nonzeros[nonzero_idx] = nonzero; - } - } - -#endif - - // Gather the row end-offsets for the merge tile into shared memory - #pragma unroll 1 - for (int item = threadIdx.x; item <= tile_num_rows; item += BLOCK_THREADS) - { - s_tile_row_end_offsets[item] = wd_row_end_offsets[tile_start_coord.x + item]; - } - - CTA_SYNC(); - - // Search for the thread's starting coordinate within the merge tile - CountingInputIterator tile_nonzero_indices(tile_start_coord.y); - CoordinateT thread_start_coord; - - MergePathSearch( - OffsetT(threadIdx.x * ITEMS_PER_THREAD), // Diagonal - s_tile_row_end_offsets, // List A - tile_nonzero_indices, // List B - tile_num_rows, - tile_num_nonzeros, - thread_start_coord); - - CTA_SYNC(); // Perf-sync - - // Compute the thread's merge path segment - CoordinateT thread_current_coord = thread_start_coord; - KeyValuePairT scan_segment[ITEMS_PER_THREAD]; - ValueT running_total = 0.0; - - OffsetT row_end_offset = s_tile_row_end_offsets[thread_current_coord.x]; - ValueT nonzero = s_tile_nonzeros[thread_current_coord.y]; - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - if (tile_nonzero_indices[thread_current_coord.y] < row_end_offset) - { - // Move down (accumulate) - scan_segment[ITEM].value = nonzero; - running_total += nonzero; - ++thread_current_coord.y; - nonzero = s_tile_nonzeros[thread_current_coord.y]; - } - else - { - // Move right (reset) - scan_segment[ITEM].value = 0.0; - running_total = 0.0; - ++thread_current_coord.x; - row_end_offset = s_tile_row_end_offsets[thread_current_coord.x]; - } - - scan_segment[ITEM].key = thread_current_coord.x; - } - - CTA_SYNC(); - - // Block-wide reduce-value-by-segment - KeyValuePairT tile_carry; - ReduceBySegmentOpT scan_op; - KeyValuePairT scan_item; - - scan_item.value = running_total; - scan_item.key = thread_current_coord.x; - - BlockScanT(temp_storage.aliasable.scan).ExclusiveScan(scan_item, scan_item, scan_op, tile_carry); - - if (threadIdx.x == 0) - { - scan_item.key = thread_start_coord.x; - scan_item.value = 0.0; - } - - if (tile_num_rows > 0) - { - - CTA_SYNC(); - - // Scan downsweep and scatter - ValueT* s_partials = &temp_storage.aliasable.merge_items[0].nonzero; - - if (scan_item.key != scan_segment[0].key) - { - s_partials[scan_item.key] = scan_item.value; - } - else - { - scan_segment[0].value += scan_item.value; - } - - #pragma unroll - for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - if 
(scan_segment[ITEM - 1].key != scan_segment[ITEM].key) - { - s_partials[scan_segment[ITEM - 1].key] = scan_segment[ITEM - 1].value; - } - else - { - scan_segment[ITEM].value += scan_segment[ITEM - 1].value; - } - } - - CTA_SYNC(); - - #pragma unroll 1 - for (int item = threadIdx.x; item < tile_num_rows; item += BLOCK_THREADS) - { - spmv_params.d_vector_y[tile_start_coord.x + item] = s_partials[item]; - } - } - - // Return the tile's running carry-out - return tile_carry; - } - - - /** - * Consume input tile - */ - __device__ __forceinline__ void ConsumeTile( - CoordinateT* d_tile_coordinates, ///< [in] Pointer to the temporary array of tile starting coordinates - KeyValuePairT* d_tile_carry_pairs, ///< [out] Pointer to the temporary array carry-out dot product row-ids, one per block - int num_merge_tiles) ///< [in] Number of merge tiles - { - int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index - - if (tile_idx >= num_merge_tiles) - return; - - // Read our starting coordinates - if (threadIdx.x < 2) - { - if (d_tile_coordinates == NULL) - { - // Search our starting coordinates - OffsetT diagonal = (tile_idx + threadIdx.x) * TILE_ITEMS; - CoordinateT tile_coord; - CountingInputIterator nonzero_indices(0); - - // Search the merge path - MergePathSearch( - diagonal, - RowOffsetsSearchIteratorT(spmv_params.d_row_end_offsets), - nonzero_indices, - spmv_params.num_rows, - spmv_params.num_nonzeros, - tile_coord); - - temp_storage.tile_coords[threadIdx.x] = tile_coord; - } - else - { - temp_storage.tile_coords[threadIdx.x] = d_tile_coordinates[tile_idx + threadIdx.x]; - } - } - - CTA_SYNC(); - - CoordinateT tile_start_coord = temp_storage.tile_coords[0]; - CoordinateT tile_end_coord = temp_storage.tile_coords[1]; - - // Consume multi-segment tile - KeyValuePairT tile_carry = ConsumeTile( - tile_idx, - tile_start_coord, - tile_end_coord, - Int2Type()); - - // Output the tile's carry-out - if (threadIdx.x == 0) - { - if (HAS_ALPHA) - tile_carry.value *= spmv_params.alpha; - - tile_carry.key += tile_start_coord.x; - d_tile_carry_pairs[tile_idx] = tile_carry; - } - } - - -}; - - - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/SRC/cub/agent/single_pass_scan_operators.cuh b/SRC/cub/agent/single_pass_scan_operators.cuh deleted file mode 100644 index 53409bde..00000000 --- a/SRC/cub/agent/single_pass_scan_operators.cuh +++ /dev/null @@ -1,815 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. 
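The deleted header below supplies the prefix-callback machinery for single-pass, chained scans. Its simplest member, BlockScanRunningPrefixOp, merely threads a running total through consecutive block-wide scans. A sequential analogue, with an illustrative functor name:

// Sketch: called once per tile with that tile's block aggregate; returns the
// exclusive prefix (the total of all earlier tiles) and folds the new
// aggregate into the running total.
struct RunningPrefix
{
    int total = 0;
    int operator()(int block_aggregate)
    {
        int old = total;
        total += block_aggregate;
        return old;
    }
};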
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Callback operator types for supplying BlockScan prefixes - */ - -#pragma once - -#include - -#include "../thread/thread_load.cuh" -#include "../thread/thread_store.cuh" -#include "../warp/warp_reduce.cuh" -#include "../util_arch.cuh" -#include "../util_device.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/****************************************************************************** - * Prefix functor type for maintaining a running prefix while scanning a - * region independent of other thread blocks - ******************************************************************************/ - -/** - * Stateful callback operator type for supplying BlockScan prefixes. - * Maintains a running prefix that can be applied to consecutive - * BlockScan operations. - */ -template < - typename T, ///< BlockScan value type - typename ScanOpT> ///< Wrapped scan operator type -struct BlockScanRunningPrefixOp -{ - ScanOpT op; ///< Wrapped scan operator - T running_total; ///< Running block-wide prefix - - /// Constructor - __device__ __forceinline__ BlockScanRunningPrefixOp(ScanOpT op) - : - op(op) - {} - - /// Constructor - __device__ __forceinline__ BlockScanRunningPrefixOp( - T starting_prefix, - ScanOpT op) - : - op(op), - running_total(starting_prefix) - {} - - /** - * Prefix callback operator. Returns the block-wide running_total in thread-0. - */ - __device__ __forceinline__ T operator()( - const T &block_aggregate) ///< The aggregate sum of the BlockScan inputs - { - T retval = running_total; - running_total = op(running_total, block_aggregate); - return retval; - } -}; - - -/****************************************************************************** - * Generic tile status interface types for block-cooperative scans - ******************************************************************************/ - -/** - * Enumerations of tile status - */ -enum ScanTileStatus -{ - SCAN_TILE_OOB, // Out-of-bounds (e.g., padding) - SCAN_TILE_INVALID = 99, // Not yet processed - SCAN_TILE_PARTIAL, // Tile aggregate is available - SCAN_TILE_INCLUSIVE, // Inclusive tile prefix is available -}; - - -/** - * Tile status interface. - */ -template < - typename T, - bool SINGLE_WORD = Traits::PRIMITIVE> -struct ScanTileState; - - -/** - * Tile status interface specialized for scan status and value types - * that can be combined into one machine word that can be - * read/written coherently in a single access. 
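The single-word specialization below exists so that a peer block can observe a tile's status flag and its value in one coherent memory transaction, meaning the two can never be seen out of sync. A host-side analogue using C++ atomics; the field layout and names are illustrative, acquire/release stands in for the device-side fences, and 99 is SCAN_TILE_INVALID from the enum above:

#include <atomic>
#include <cstdint>
#include <cstring>

struct PackedTile { int32_t status; float value; };   // packs into one 64-bit word
static_assert(sizeof(PackedTile) == sizeof(uint64_t), "must fit one machine word");

// Publish status and value indivisibly (models SetInclusive/SetPartial).
inline void publish(std::atomic<uint64_t>& slot, PackedTile d)
{
    uint64_t w;
    std::memcpy(&w, &d, sizeof w);
    slot.store(w, std::memory_order_release);
}

// Spin until the tile is no longer invalid (models WaitForValid).
inline PackedTile wait_valid(const std::atomic<uint64_t>& slot)
{
    PackedTile d;
    do {
        uint64_t w = slot.load(std::memory_order_acquire);
        std::memcpy(&d, &w, sizeof d);
    } while (d.status == 99);                          // SCAN_TILE_INVALID
    return d;
}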
- */ -template -struct ScanTileState -{ - // Status word type - typedef typename If<(sizeof(T) == 8), - long long, - typename If<(sizeof(T) == 4), - int, - typename If<(sizeof(T) == 2), - short, - char>::Type>::Type>::Type StatusWord; - - - // Unit word type - typedef typename If<(sizeof(T) == 8), - longlong2, - typename If<(sizeof(T) == 4), - int2, - typename If<(sizeof(T) == 2), - int, - uchar2>::Type>::Type>::Type TxnWord; - - - // Device word type - struct TileDescriptor - { - StatusWord status; - T value; - }; - - - // Constants - enum - { - TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS, - }; - - - // Device storage - TxnWord *d_tile_descriptors; - - /// Constructor - __host__ __device__ __forceinline__ - ScanTileState() - : - d_tile_descriptors(NULL) - {} - - - /// Initializer - __host__ __device__ __forceinline__ - cudaError_t Init( - int /*num_tiles*/, ///< [in] Number of tiles - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t /*temp_storage_bytes*/) ///< [in] Size in bytes of \t d_temp_storage allocation - { - d_tile_descriptors = reinterpret_cast(d_temp_storage); - return cudaSuccess; - } - - - /** - * Compute device memory needed for tile status - */ - __host__ __device__ __forceinline__ - static cudaError_t AllocationSize( - int num_tiles, ///< [in] Number of tiles - size_t &temp_storage_bytes) ///< [out] Size in bytes of \t d_temp_storage allocation - { - temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TileDescriptor); // bytes needed for tile status descriptors - return cudaSuccess; - } - - - /** - * Initialize (from device) - */ - __device__ __forceinline__ void InitializeStatus(int num_tiles) - { - int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; - - TxnWord val = TxnWord(); - TileDescriptor *descriptor = reinterpret_cast(&val); - - if (tile_idx < num_tiles) - { - // Not-yet-set - descriptor->status = StatusWord(SCAN_TILE_INVALID); - d_tile_descriptors[TILE_STATUS_PADDING + tile_idx] = val; - } - - if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING)) - { - // Padding - descriptor->status = StatusWord(SCAN_TILE_OOB); - d_tile_descriptors[threadIdx.x] = val; - } - } - - - /** - * Update the specified tile's inclusive value and corresponding status - */ - __device__ __forceinline__ void SetInclusive(int tile_idx, T tile_inclusive) - { - TileDescriptor tile_descriptor; - tile_descriptor.status = SCAN_TILE_INCLUSIVE; - tile_descriptor.value = tile_inclusive; - - TxnWord alias; - *reinterpret_cast(&alias) = tile_descriptor; - ThreadStore(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias); - } - - - /** - * Update the specified tile's partial value and corresponding status - */ - __device__ __forceinline__ void SetPartial(int tile_idx, T tile_partial) - { - TileDescriptor tile_descriptor; - tile_descriptor.status = SCAN_TILE_PARTIAL; - tile_descriptor.value = tile_partial; - - TxnWord alias; - *reinterpret_cast(&alias) = tile_descriptor; - ThreadStore(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias); - } - - /** - * Wait for the corresponding tile to become non-invalid - */ - __device__ __forceinline__ void WaitForValid( - int tile_idx, - StatusWord &status, - T &value) - { - TileDescriptor tile_descriptor; - do - { - __threadfence_block(); // prevent hoisting loads from loop - TxnWord alias = ThreadLoad(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx); - tile_descriptor = 
reinterpret_cast(alias); - - } while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff)); - - status = tile_descriptor.status; - value = tile_descriptor.value; - } - -}; - - - -/** - * Tile status interface specialized for scan status and value types that - * cannot be combined into one machine word. - */ -template -struct ScanTileState -{ - // Status word type - typedef char StatusWord; - - // Constants - enum - { - TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS, - }; - - // Device storage - StatusWord *d_tile_status; - T *d_tile_partial; - T *d_tile_inclusive; - - /// Constructor - __host__ __device__ __forceinline__ - ScanTileState() - : - d_tile_status(NULL), - d_tile_partial(NULL), - d_tile_inclusive(NULL) - {} - - - /// Initializer - __host__ __device__ __forceinline__ - cudaError_t Init( - int num_tiles, ///< [in] Number of tiles - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t temp_storage_bytes) ///< [in] Size in bytes of \t d_temp_storage allocation - { - cudaError_t error = cudaSuccess; - do - { - void* allocations[3]; - size_t allocation_sizes[3]; - - allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord); // bytes needed for tile status descriptors - allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // bytes needed for partials - allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // bytes needed for inclusives - - // Compute allocation pointers into the single storage blob - if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; - - // Alias the offsets - d_tile_status = reinterpret_cast(allocations[0]); - d_tile_partial = reinterpret_cast(allocations[1]); - d_tile_inclusive = reinterpret_cast(allocations[2]); - } - while (0); - - return error; - } - - - /** - * Compute device memory needed for tile status - */ - __host__ __device__ __forceinline__ - static cudaError_t AllocationSize( - int num_tiles, ///< [in] Number of tiles - size_t &temp_storage_bytes) ///< [out] Size in bytes of \t d_temp_storage allocation - { - // Specify storage allocation requirements - size_t allocation_sizes[3]; - allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord); // bytes needed for tile status descriptors - allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // bytes needed for partials - allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // bytes needed for inclusives - - // Set the necessary size of the blob - void* allocations[3]; - return CubDebug(AliasTemporaries(NULL, temp_storage_bytes, allocations, allocation_sizes)); - } - - - /** - * Initialize (from device) - */ - __device__ __forceinline__ void InitializeStatus(int num_tiles) - { - int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; - if (tile_idx < num_tiles) - { - // Not-yet-set - d_tile_status[TILE_STATUS_PADDING + tile_idx] = StatusWord(SCAN_TILE_INVALID); - } - - if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING)) - { - // Padding - d_tile_status[threadIdx.x] = StatusWord(SCAN_TILE_OOB); - } - } - - - /** - * Update the specified tile's inclusive value and corresponding status - */ - __device__ __forceinline__ void SetInclusive(int tile_idx, T tile_inclusive) - { - // Update tile inclusive value - ThreadStore(d_tile_inclusive + 
TILE_STATUS_PADDING + tile_idx, tile_inclusive); - - // Fence - __threadfence(); - - // Update tile status - ThreadStore(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_INCLUSIVE)); - } - - - /** - * Update the specified tile's partial value and corresponding status - */ - __device__ __forceinline__ void SetPartial(int tile_idx, T tile_partial) - { - // Update tile partial value - ThreadStore(d_tile_partial + TILE_STATUS_PADDING + tile_idx, tile_partial); - - // Fence - __threadfence(); - - // Update tile status - ThreadStore(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_PARTIAL)); - } - - /** - * Wait for the corresponding tile to become non-invalid - */ - __device__ __forceinline__ void WaitForValid( - int tile_idx, - StatusWord &status, - T &value) - { - do { - status = ThreadLoad(d_tile_status + TILE_STATUS_PADDING + tile_idx); - - __threadfence(); // prevent hoisting loads from loop or loads below above this one - - } while (status == SCAN_TILE_INVALID); - - if (status == StatusWord(SCAN_TILE_PARTIAL)) - value = ThreadLoad(d_tile_partial + TILE_STATUS_PADDING + tile_idx); - else - value = ThreadLoad(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx); - } -}; - - -/****************************************************************************** - * ReduceByKey tile status interface types for block-cooperative scans - ******************************************************************************/ - -/** - * Tile status interface for reduction by key. - * - */ -template < - typename ValueT, - typename KeyT, - bool SINGLE_WORD = (Traits::PRIMITIVE) && (sizeof(ValueT) + sizeof(KeyT) < 16)> -struct ReduceByKeyScanTileState; - - -/** - * Tile status interface for reduction by key, specialized for scan status and value types that - * cannot be combined into one machine word. - */ -template < - typename ValueT, - typename KeyT> -struct ReduceByKeyScanTileState : - ScanTileState > -{ - typedef ScanTileState > SuperClass; - - /// Constructor - __host__ __device__ __forceinline__ - ReduceByKeyScanTileState() : SuperClass() {} -}; - - -/** - * Tile status interface for reduction by key, specialized for scan status and value types that - * can be combined into one machine word that can be read/written coherently in a single access. 
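Because the specialization above keeps status, partials, and inclusives in three separate arrays, a consumer could otherwise observe a fresh status word next to a stale value; the __threadfence() between the value store and the status store (and between the status poll and the value load) is what makes the handoff safe. A stripped-down model of that publication protocol, with illustrative names and an int/double payload standing in for the real interface:

// Writer side: publish the value first, then flip the status word.
__device__ void PublishPartial(int *d_status, double *d_partial, int tile, double v)
{
    d_partial[tile] = v;
    __threadfence();    // make the value visible device-wide...
    d_status[tile] = 1; // ...before the status (1 stands in for SCAN_TILE_PARTIAL)
}

// Reader side: poll the status; only then is it safe to read the value.
__device__ double ConsumePartial(volatile int *d_status, double *d_partial, int tile)
{
    while (d_status[tile] == 0) {} // 0 stands in for SCAN_TILE_INVALID
    __threadfence();               // order the value load after the poll
    return d_partial[tile];
}

The ThreadLoad/ThreadStore helpers in the real code serve the same purpose while also applying explicit cache modifiers.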
- */ -template < - typename ValueT, - typename KeyT> -struct ReduceByKeyScanTileState -{ - typedef KeyValuePairKeyValuePairT; - - // Constants - enum - { - PAIR_SIZE = sizeof(ValueT) + sizeof(KeyT), - TXN_WORD_SIZE = 1 << Log2::VALUE, - STATUS_WORD_SIZE = TXN_WORD_SIZE - PAIR_SIZE, - - TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS, - }; - - // Status word type - typedef typename If<(STATUS_WORD_SIZE == 8), - long long, - typename If<(STATUS_WORD_SIZE == 4), - int, - typename If<(STATUS_WORD_SIZE == 2), - short, - char>::Type>::Type>::Type StatusWord; - - // Status word type - typedef typename If<(TXN_WORD_SIZE == 16), - longlong2, - typename If<(TXN_WORD_SIZE == 8), - long long, - int>::Type>::Type TxnWord; - - // Device word type (for when sizeof(ValueT) == sizeof(KeyT)) - struct TileDescriptorBigStatus - { - KeyT key; - ValueT value; - StatusWord status; - }; - - // Device word type (for when sizeof(ValueT) != sizeof(KeyT)) - struct TileDescriptorLittleStatus - { - ValueT value; - StatusWord status; - KeyT key; - }; - - // Device word type - typedef typename If< - (sizeof(ValueT) == sizeof(KeyT)), - TileDescriptorBigStatus, - TileDescriptorLittleStatus>::Type - TileDescriptor; - - - // Device storage - TxnWord *d_tile_descriptors; - - - /// Constructor - __host__ __device__ __forceinline__ - ReduceByKeyScanTileState() - : - d_tile_descriptors(NULL) - {} - - - /// Initializer - __host__ __device__ __forceinline__ - cudaError_t Init( - int /*num_tiles*/, ///< [in] Number of tiles - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t /*temp_storage_bytes*/) ///< [in] Size in bytes of \t d_temp_storage allocation - { - d_tile_descriptors = reinterpret_cast(d_temp_storage); - return cudaSuccess; - } - - - /** - * Compute device memory needed for tile status - */ - __host__ __device__ __forceinline__ - static cudaError_t AllocationSize( - int num_tiles, ///< [in] Number of tiles - size_t &temp_storage_bytes) ///< [out] Size in bytes of \t d_temp_storage allocation - { - temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TileDescriptor); // bytes needed for tile status descriptors - return cudaSuccess; - } - - - /** - * Initialize (from device) - */ - __device__ __forceinline__ void InitializeStatus(int num_tiles) - { - int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; - TxnWord val = TxnWord(); - TileDescriptor *descriptor = reinterpret_cast(&val); - - if (tile_idx < num_tiles) - { - // Not-yet-set - descriptor->status = StatusWord(SCAN_TILE_INVALID); - d_tile_descriptors[TILE_STATUS_PADDING + tile_idx] = val; - } - - if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING)) - { - // Padding - descriptor->status = StatusWord(SCAN_TILE_OOB); - d_tile_descriptors[threadIdx.x] = val; - } - } - - - /** - * Update the specified tile's inclusive value and corresponding status - */ - __device__ __forceinline__ void SetInclusive(int tile_idx, KeyValuePairT tile_inclusive) - { - TileDescriptor tile_descriptor; - tile_descriptor.status = SCAN_TILE_INCLUSIVE; - tile_descriptor.value = tile_inclusive.value; - tile_descriptor.key = tile_inclusive.key; - - TxnWord alias; - *reinterpret_cast(&alias) = tile_descriptor; - ThreadStore(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias); - } - - - /** - * Update the specified tile's partial value and corresponding status - */ - __device__ __forceinline__ void SetPartial(int tile_idx, KeyValuePairT 
tile_partial)
-    {
-        TileDescriptor tile_descriptor;
-        tile_descriptor.status = SCAN_TILE_PARTIAL;
-        tile_descriptor.value = tile_partial.value;
-        tile_descriptor.key = tile_partial.key;
-
-        TxnWord alias;
-        *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
-        ThreadStore<STORE_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias);
-    }
-
-    /**
-     * Wait for the corresponding tile to become non-invalid
-     */
-    __device__ __forceinline__ void WaitForValid(
-        int tile_idx,
-        StatusWord &status,
-        KeyValuePairT &value)
-    {
-        TileDescriptor tile_descriptor;
-        do
-        {
-            __threadfence_block(); // prevent hoisting loads from loop
-            TxnWord alias = ThreadLoad<LOAD_CG>(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx);
-            tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);
-
-        } while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff));
-
-        status = tile_descriptor.status;
-        value.value = tile_descriptor.value;
-        value.key = tile_descriptor.key;
-    }
-
-};
-
-
-/******************************************************************************
- * Prefix call-back operator for coupling local block scan within a
- * block-cooperative scan
- ******************************************************************************/
-
-/**
- * Stateful block-scan prefix functor. Provides the running prefix for
- * the current tile by using the call-back warp to wait on
- * aggregates/prefixes from predecessor tiles to become available.
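A quick worked example of the sizing enums in ReduceByKeyScanTileState above, assuming upstream CUB's definition TXN_WORD_SIZE = 1 << Log2<PAIR_SIZE + 1>::VALUE with a round-up Log2: for KeyT = int and ValueT = float, PAIR_SIZE = 8, so TXN_WORD_SIZE = 16 and STATUS_WORD_SIZE = 8, making StatusWord a long long and TxnWord a longlong2. Key, value, and status then travel in one coherent 16-byte access, which is why WaitForValid can spin on a single load with no fence between reading the status and the value. For KeyT = int and ValueT = short the pair occupies 6 bytes, TXN_WORD_SIZE drops to 8, and the status squeezes into the remaining 2 bytes as a short.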
- */ -template < - typename T, - typename ScanOpT, - typename ScanTileStateT, - int PTX_ARCH = CUB_PTX_ARCH> -struct TilePrefixCallbackOp -{ - // Parameterized warp reduce - typedef WarpReduce WarpReduceT; - - // Temporary storage type - struct _TempStorage - { - typename WarpReduceT::TempStorage warp_reduce; - T exclusive_prefix; - T inclusive_prefix; - T block_aggregate; - }; - - // Alias wrapper allowing temporary storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - // Type of status word - typedef typename ScanTileStateT::StatusWord StatusWord; - - // Fields - _TempStorage& temp_storage; ///< Reference to a warp-reduction instance - ScanTileStateT& tile_status; ///< Interface to tile status - ScanOpT scan_op; ///< Binary scan operator - int tile_idx; ///< The current tile index - T exclusive_prefix; ///< Exclusive prefix for the tile - T inclusive_prefix; ///< Inclusive prefix for the tile - - // Constructor - __device__ __forceinline__ - TilePrefixCallbackOp( - ScanTileStateT &tile_status, - TempStorage &temp_storage, - ScanOpT scan_op, - int tile_idx) - : - temp_storage(temp_storage.Alias()), - tile_status(tile_status), - scan_op(scan_op), - tile_idx(tile_idx) {} - - - // Block until all predecessors within the warp-wide window have non-invalid status - __device__ __forceinline__ - void ProcessWindow( - int predecessor_idx, ///< Preceding tile index to inspect - StatusWord &predecessor_status, ///< [out] Preceding tile status - T &window_aggregate) ///< [out] Relevant partial reduction from this window of preceding tiles - { - T value; - tile_status.WaitForValid(predecessor_idx, predecessor_status, value); - - // Perform a segmented reduction to get the prefix for the current window. - // Use the swizzled scan operator because we are now scanning *down* towards thread0. 
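// (Intuition for the window: lane 0 of the calling warp inspects the tile
//  immediately preceding this one and higher lanes look progressively further
//  back; tail_flag below marks the lane whose tile already published an
//  INCLUSIVE prefix. TailSegmentedReduce folds the window into lane 0, and the
//  swizzled operator reverses operand order so older tiles still combine on
//  the left under non-commutative scan operators.)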
- - int tail_flag = (predecessor_status == StatusWord(SCAN_TILE_INCLUSIVE)); - window_aggregate = WarpReduceT(temp_storage.warp_reduce).TailSegmentedReduce( - value, - tail_flag, - SwizzleScanOp(scan_op)); - } - - - // BlockScan prefix callback functor (called by the first warp) - __device__ __forceinline__ - T operator()(T block_aggregate) - { - - // Update our status with our tile-aggregate - if (threadIdx.x == 0) - { - temp_storage.block_aggregate = block_aggregate; - tile_status.SetPartial(tile_idx, block_aggregate); - } - - int predecessor_idx = tile_idx - threadIdx.x - 1; - StatusWord predecessor_status; - T window_aggregate; - - // Wait for the warp-wide window of predecessor tiles to become valid - ProcessWindow(predecessor_idx, predecessor_status, window_aggregate); - - // The exclusive tile prefix starts out as the current window aggregate - exclusive_prefix = window_aggregate; - - // Keep sliding the window back until we come across a tile whose inclusive prefix is known - while (WARP_ALL((predecessor_status != StatusWord(SCAN_TILE_INCLUSIVE)), 0xffffffff)) - { - predecessor_idx -= CUB_PTX_WARP_THREADS; - - // Update exclusive tile prefix with the window prefix - ProcessWindow(predecessor_idx, predecessor_status, window_aggregate); - exclusive_prefix = scan_op(window_aggregate, exclusive_prefix); - } - - // Compute the inclusive tile prefix and update the status for this tile - if (threadIdx.x == 0) - { - inclusive_prefix = scan_op(exclusive_prefix, block_aggregate); - tile_status.SetInclusive(tile_idx, inclusive_prefix); - - temp_storage.exclusive_prefix = exclusive_prefix; - temp_storage.inclusive_prefix = inclusive_prefix; - } - - // Return exclusive_prefix - return exclusive_prefix; - } - - // Get the exclusive prefix stored in temporary storage - __device__ __forceinline__ - T GetExclusivePrefix() - { - return temp_storage.exclusive_prefix; - } - - // Get the inclusive prefix stored in temporary storage - __device__ __forceinline__ - T GetInclusivePrefix() - { - return temp_storage.inclusive_prefix; - } - - // Get the block aggregate stored in temporary storage - __device__ __forceinline__ - T GetBlockAggregate() - { - return temp_storage.block_aggregate; - } - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/SRC/cub/block/block_adjacent_difference.cuh b/SRC/cub/block/block_adjacent_difference.cuh deleted file mode 100644 index acef9f05..00000000 --- a/SRC/cub/block/block_adjacent_difference.cuh +++ /dev/null @@ -1,596 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. 
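Taken together, the types in the file just closed compose into a single-pass device scan whose kernel is shaped roughly like the sketch below (condensed from the way CUB's device-scan agents drive them; the kernel name, 128-thread block with one item per thread, int/cub::Sum payload, and a tile_state already initialized by a prior InitializeStatus launch are all assumptions):

#include <cub/cub.cuh>

template <typename ScanTileStateT> // e.g. cub::ScanTileState<int>
__global__ void SinglePassScanSketch(const int *d_in, int *d_out, ScanTileStateT tile_state)
{
    typedef cub::BlockScan<int, 128> BlockScanT;
    typedef cub::TilePrefixCallbackOp<int, cub::Sum, ScanTileStateT> PrefixOpT;

    __shared__ union
    {
        typename BlockScanT::TempStorage scan;
        typename PrefixOpT::TempStorage prefix;
    } smem;

    int tile_idx = blockIdx.x;
    int item = d_in[tile_idx * 128 + threadIdx.x];

    if (tile_idx == 0)
    {
        // The first tile has no predecessors: scan locally, then publish the
        // tile aggregate directly as an INCLUSIVE prefix.
        int block_aggregate;
        BlockScanT(smem.scan).ExclusiveSum(item, item, block_aggregate);
        if (threadIdx.x == 0)
            tile_state.SetInclusive(0, block_aggregate);
    }
    else
    {
        // Later tiles hand the callback to BlockScan: it publishes this tile's
        // PARTIAL aggregate and the first warp looks back through predecessor
        // windows until an INCLUSIVE one is found.
        PrefixOpT prefix_op(tile_state, smem.prefix, cub::Sum(), tile_idx);
        BlockScanT(smem.scan).ExclusiveSum(item, item, prefix_op);
    }

    d_out[tile_idx * 128 + threadIdx.x] = item;
}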
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * The cub::BlockDiscontinuity class provides [collective](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block. - */ - -#pragma once - -#include "../util_type.cuh" -#include "../util_ptx.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -template < - typename T, - int BLOCK_DIM_X, - int BLOCK_DIM_Y = 1, - int BLOCK_DIM_Z = 1, - int PTX_ARCH = CUB_PTX_ARCH> -class BlockAdjacentDifference -{ -private: - - /****************************************************************************** - * Constants and type definitions - ******************************************************************************/ - - /// Constants - enum - { - /// The thread block size in threads - BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, - }; - - - /// Shared memory storage layout type (last element from each thread's input) - struct _TempStorage - { - T first_items[BLOCK_THREADS]; - T last_items[BLOCK_THREADS]; - }; - - - /****************************************************************************** - * Utility methods - ******************************************************************************/ - - /// Internal storage allocator - __device__ __forceinline__ _TempStorage& PrivateStorage() - { - __shared__ _TempStorage private_storage; - return private_storage; - } - - - /// Specialization for when FlagOp has third index param - template ::HAS_PARAM> - struct ApplyOp - { - // Apply flag operator - static __device__ __forceinline__ T FlagT(FlagOp flag_op, const T &a, const T &b, int idx) - { - return flag_op(b, a, idx); - } - }; - - /// Specialization for when FlagOp does not have a third index param - template - struct ApplyOp - { - // Apply flag operator - static __device__ __forceinline__ T FlagT(FlagOp flag_op, const T &a, const T &b, int /*idx*/) - { - return flag_op(b, a); - } - }; - - /// Templated unrolling of item comparison (inductive case) - template - struct Iterate - { - // Head flags - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - static __device__ __forceinline__ void FlagHeads( - int linear_tid, - FlagT (&flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items - FlagOp flag_op) ///< [in] Binary boolean flag predicate - { - preds[ITERATION] = input[ITERATION - 1]; - - flags[ITERATION] = ApplyOp::FlagT( - flag_op, - preds[ITERATION], - input[ITERATION], - (linear_tid * 
ITEMS_PER_THREAD) + ITERATION); - - Iterate::FlagHeads(linear_tid, flags, input, preds, flag_op); - } - - // Tail flags - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - static __device__ __forceinline__ void FlagTails( - int linear_tid, - FlagT (&flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate - { - flags[ITERATION] = ApplyOp::FlagT( - flag_op, - input[ITERATION], - input[ITERATION + 1], - (linear_tid * ITEMS_PER_THREAD) + ITERATION + 1); - - Iterate::FlagTails(linear_tid, flags, input, flag_op); - } - - }; - - /// Templated unrolling of item comparison (termination case) - template - struct Iterate - { - // Head flags - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - static __device__ __forceinline__ void FlagHeads( - int /*linear_tid*/, - FlagT (&/*flags*/)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T (&/*input*/)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&/*preds*/)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items - FlagOp /*flag_op*/) ///< [in] Binary boolean flag predicate - {} - - // Tail flags - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - static __device__ __forceinline__ void FlagTails( - int /*linear_tid*/, - FlagT (&/*flags*/)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T (&/*input*/)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp /*flag_op*/) ///< [in] Binary boolean flag predicate - {} - }; - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - /// Shared storage reference - _TempStorage &temp_storage; - - /// Linear thread-id - unsigned int linear_tid; - - -public: - - /// \smemstorage{BlockDiscontinuity} - struct TempStorage : Uninitialized<_TempStorage> {}; - - - /******************************************************************//** - * \name Collective constructors - *********************************************************************/ - //@{ - - /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. - */ - __device__ __forceinline__ BlockAdjacentDifference() - : - temp_storage(PrivateStorage()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. 
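Before the flag operations themselves, a minimal sketch of how this class is driven (the 128-thread block, four items per thread, direct blocked I/O, and the kernel name are illustrative assumptions; FlagHeads is the public wrapper defined further down):

#include <cub/cub.cuh>

__global__ void FlagHeadsSketch(const int *d_in, int *d_flags)
{
    typedef cub::BlockAdjacentDifference<int, 128> BlockAdjDiffT;
    __shared__ typename BlockAdjDiffT::TempStorage temp_storage;

    // Blocked arrangement: each of the 128 threads owns 4 consecutive items.
    int items[4];
    cub::LoadDirectBlocked(threadIdx.x, d_in, items);

    // head_flags[i] becomes nonzero wherever items[i] differs from its
    // predecessor (crossing thread boundaries through shared memory).
    int head_flags[4];
    BlockAdjDiffT(temp_storage).FlagHeads(head_flags, items, cub::Inequality());

    cub::StoreDirectBlocked(threadIdx.x, d_flags, head_flags);
}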
- */ - __device__ __forceinline__ BlockAdjacentDifference( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - //@} end member group - /******************************************************************//** - * \name Head flag operations - *********************************************************************/ - //@{ - - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagHeads( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items - FlagOp flag_op) ///< [in] Binary boolean flag predicate - { - // Share last item - temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - - CTA_SYNC(); - - if (linear_tid == 0) - { - // Set flag for first thread-item (preds[0] is undefined) - head_flags[0] = 1; - } - else - { - preds[0] = temp_storage.last_items[linear_tid - 1]; - head_flags[0] = ApplyOp::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); - } - - // Set head_flags for remaining items - Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); - } - - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagHeads( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items - FlagOp flag_op, ///< [in] Binary boolean flag predicate - T tile_predecessor_item) ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). - { - // Share last item - temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - - CTA_SYNC(); - - // Set flag for first thread-item - preds[0] = (linear_tid == 0) ? - tile_predecessor_item : // First thread - temp_storage.last_items[linear_tid - 1]; - - head_flags[0] = ApplyOp::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); - - // Set head_flags for remaining items - Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); - } - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagHeads( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate - { - T preds[ITEMS_PER_THREAD]; - FlagHeads(head_flags, input, preds, flag_op); - } - - - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagHeads( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op, ///< [in] Binary boolean flag predicate - T tile_predecessor_item) ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). 
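// (Only thread0's tile_predecessor_item is actually read: it becomes preds[0]
//  for thread0 and is compared against that thread's input[0]; the remaining
//  threads may pass any value for this argument.)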
- { - T preds[ITEMS_PER_THREAD]; - FlagHeads(head_flags, input, preds, flag_op, tile_predecessor_item); - } - - - - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagTails( - FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate - { - // Share first item - temp_storage.first_items[linear_tid] = input[0]; - - CTA_SYNC(); - - // Set flag for last thread-item - tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? - 1 : // Last thread - ApplyOp::FlagT( - flag_op, - input[ITEMS_PER_THREAD - 1], - temp_storage.first_items[linear_tid + 1], - (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); - - // Set tail_flags for remaining items - Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); - } - - - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagTails( - FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op, ///< [in] Binary boolean flag predicate - T tile_successor_item) ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). - { - // Share first item - temp_storage.first_items[linear_tid] = input[0]; - - CTA_SYNC(); - - // Set flag for last thread-item - T successor_item = (linear_tid == BLOCK_THREADS - 1) ? - tile_successor_item : // Last thread - temp_storage.first_items[linear_tid + 1]; - - tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( - flag_op, - input[ITEMS_PER_THREAD - 1], - successor_item, - (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); - - // Set tail_flags for remaining items - Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); - } - - - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagHeadsAndTails( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate - { - // Share first and last items - temp_storage.first_items[linear_tid] = input[0]; - temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - - CTA_SYNC(); - - T preds[ITEMS_PER_THREAD]; - - // Set flag for first thread-item - preds[0] = temp_storage.last_items[linear_tid - 1]; - if (linear_tid == 0) - { - head_flags[0] = 1; - } - else - { - head_flags[0] = ApplyOp::FlagT( - flag_op, - preds[0], - input[0], - linear_tid * ITEMS_PER_THREAD); - } - - - // Set flag for last thread-item - tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? 
- 1 : // Last thread - ApplyOp::FlagT( - flag_op, - input[ITEMS_PER_THREAD - 1], - temp_storage.first_items[linear_tid + 1], - (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); - - // Set head_flags for remaining items - Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); - - // Set tail_flags for remaining items - Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); - } - - - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagHeadsAndTails( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags - T tile_successor_item, ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate - { - // Share first and last items - temp_storage.first_items[linear_tid] = input[0]; - temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - - CTA_SYNC(); - - T preds[ITEMS_PER_THREAD]; - - // Set flag for first thread-item - if (linear_tid == 0) - { - head_flags[0] = 1; - } - else - { - preds[0] = temp_storage.last_items[linear_tid - 1]; - head_flags[0] = ApplyOp::FlagT( - flag_op, - preds[0], - input[0], - linear_tid * ITEMS_PER_THREAD); - } - - // Set flag for last thread-item - T successor_item = (linear_tid == BLOCK_THREADS - 1) ? - tile_successor_item : // Last thread - temp_storage.first_items[linear_tid + 1]; - - tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( - flag_op, - input[ITEMS_PER_THREAD - 1], - successor_item, - (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); - - // Set head_flags for remaining items - Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); - - // Set tail_flags for remaining items - Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); - } - - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagHeadsAndTails( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T tile_predecessor_item, ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). - FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate - { - // Share first and last items - temp_storage.first_items[linear_tid] = input[0]; - temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - - CTA_SYNC(); - - T preds[ITEMS_PER_THREAD]; - - // Set flag for first thread-item - preds[0] = (linear_tid == 0) ? - tile_predecessor_item : // First thread - temp_storage.last_items[linear_tid - 1]; - - head_flags[0] = ApplyOp::FlagT( - flag_op, - preds[0], - input[0], - linear_tid * ITEMS_PER_THREAD); - - // Set flag for last thread-item - tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? 
- 1 : // Last thread - ApplyOp::FlagT( - flag_op, - input[ITEMS_PER_THREAD - 1], - temp_storage.first_items[linear_tid + 1], - (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); - - // Set head_flags for remaining items - Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); - - // Set tail_flags for remaining items - Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); - } - - - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagHeadsAndTails( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T tile_predecessor_item, ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). - FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags - T tile_successor_item, ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate - { - // Share first and last items - temp_storage.first_items[linear_tid] = input[0]; - temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - - CTA_SYNC(); - - T preds[ITEMS_PER_THREAD]; - - // Set flag for first thread-item - preds[0] = (linear_tid == 0) ? - tile_predecessor_item : // First thread - temp_storage.last_items[linear_tid - 1]; - - head_flags[0] = ApplyOp::FlagT( - flag_op, - preds[0], - input[0], - linear_tid * ITEMS_PER_THREAD); - - // Set flag for last thread-item - T successor_item = (linear_tid == BLOCK_THREADS - 1) ? - tile_successor_item : // Last thread - temp_storage.first_items[linear_tid + 1]; - - tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( - flag_op, - input[ITEMS_PER_THREAD - 1], - successor_item, - (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); - - // Set head_flags for remaining items - Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); - - // Set tail_flags for remaining items - Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); - } - - - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/block/block_discontinuity.cuh b/SRC/cub/block/block_discontinuity.cuh deleted file mode 100644 index 503e3e0b..00000000 --- a/SRC/cub/block/block_discontinuity.cuh +++ /dev/null @@ -1,1148 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. 
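One detail worth noting at the seam between these two files: BlockAdjacentDifference's ApplyOp (above) invokes flag_op(b, a[, idx]) with the predecessor second and leaves the flag type as T, while BlockDiscontinuity's ApplyOp (below) invokes flag_op(a, b[, idx]) and returns bool. The (b, a) order lets a functor compute an actual adjacent difference rather than a boolean flag, e.g. (the functor name is illustrative):

// With BlockAdjacentDifference, the "flags" array can carry differences:
struct SubtractLeft
{
    __device__ int operator()(int current, int predecessor)
    {
        // Receives (b, a) = (current item, predecessor item).
        return current - predecessor;
    }
};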
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * The cub::BlockDiscontinuity class provides [collective](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block. - */ - -#pragma once - -#include "../util_type.cuh" -#include "../util_ptx.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \brief The BlockDiscontinuity class provides [collective](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block. ![](discont_logo.png) - * \ingroup BlockModule - * - * \tparam T The data type to be flagged. - * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension - * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) - * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam PTX_ARCH [optional] \ptxversion - * - * \par Overview - * - A set of "head flags" (or "tail flags") is often used to indicate corresponding items - * that differ from their predecessors (or successors). For example, head flags are convenient - * for demarcating disjoint data segments as part of a segmented scan or reduction. - * - \blocked - * - * \par Performance Considerations - * - \granularity - * - * \par A Simple Example - * \blockcollective{BlockDiscontinuity} - * \par - * The code snippet below illustrates the head flagging of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int - * typedef cub::BlockDiscontinuity BlockDiscontinuity; - * - * // Allocate shared memory for BlockDiscontinuity - * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Collectively compute head flags for discontinuities in the segment - * int head_flags[4]; - * BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is - * { [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }. - * The corresponding output \p head_flags in those threads will be - * { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. 
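To make the expected output above concrete: thread0 sees [0,0,1,1] and produces [1,0,1,0], since input0 is always flagged, 0==0 and 1==1 are not discontinuities, and the 0-to-1 step is. thread1's [1,1,1,1] yields [0,0,0,0] because its first item matches thread0's last item (shared through last_items) and nothing changes within the thread. thread2's [2,3,3,3] yields [1,1,0,0]: the 1-to-2 step crosses the thread boundary, and 2-to-3 flags again inside the thread.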
- * - * \par Performance Considerations - * - Incurs zero bank conflicts for most types - * - */ -template < - typename T, - int BLOCK_DIM_X, - int BLOCK_DIM_Y = 1, - int BLOCK_DIM_Z = 1, - int PTX_ARCH = CUB_PTX_ARCH> -class BlockDiscontinuity -{ -private: - - /****************************************************************************** - * Constants and type definitions - ******************************************************************************/ - - /// Constants - enum - { - /// The thread block size in threads - BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, - }; - - - /// Shared memory storage layout type (last element from each thread's input) - struct _TempStorage - { - T first_items[BLOCK_THREADS]; - T last_items[BLOCK_THREADS]; - }; - - - /****************************************************************************** - * Utility methods - ******************************************************************************/ - - /// Internal storage allocator - __device__ __forceinline__ _TempStorage& PrivateStorage() - { - __shared__ _TempStorage private_storage; - return private_storage; - } - - - /// Specialization for when FlagOp has third index param - template ::HAS_PARAM> - struct ApplyOp - { - // Apply flag operator - static __device__ __forceinline__ bool FlagT(FlagOp flag_op, const T &a, const T &b, int idx) - { - return flag_op(a, b, idx); - } - }; - - /// Specialization for when FlagOp does not have a third index param - template - struct ApplyOp - { - // Apply flag operator - static __device__ __forceinline__ bool FlagT(FlagOp flag_op, const T &a, const T &b, int /*idx*/) - { - return flag_op(a, b); - } - }; - - /// Templated unrolling of item comparison (inductive case) - template - struct Iterate - { - // Head flags - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - static __device__ __forceinline__ void FlagHeads( - int linear_tid, - FlagT (&flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items - FlagOp flag_op) ///< [in] Binary boolean flag predicate - { - preds[ITERATION] = input[ITERATION - 1]; - - flags[ITERATION] = ApplyOp::FlagT( - flag_op, - preds[ITERATION], - input[ITERATION], - (linear_tid * ITEMS_PER_THREAD) + ITERATION); - - Iterate::FlagHeads(linear_tid, flags, input, preds, flag_op); - } - - // Tail flags - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - static __device__ __forceinline__ void FlagTails( - int linear_tid, - FlagT (&flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate - { - flags[ITERATION] = ApplyOp::FlagT( - flag_op, - input[ITERATION], - input[ITERATION + 1], - (linear_tid * ITEMS_PER_THREAD) + ITERATION + 1); - - Iterate::FlagTails(linear_tid, flags, input, flag_op); - } - - }; - - /// Templated unrolling of item comparison (termination case) - template - struct Iterate - { - // Head flags - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - static __device__ __forceinline__ void FlagHeads( - int /*linear_tid*/, - FlagT (&/*flags*/)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T (&/*input*/)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&/*preds*/)[ITEMS_PER_THREAD], ///< [out] 
Calling thread's predecessor items - FlagOp /*flag_op*/) ///< [in] Binary boolean flag predicate - {} - - // Tail flags - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - static __device__ __forceinline__ void FlagTails( - int /*linear_tid*/, - FlagT (&/*flags*/)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T (&/*input*/)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp /*flag_op*/) ///< [in] Binary boolean flag predicate - {} - }; - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - /// Shared storage reference - _TempStorage &temp_storage; - - /// Linear thread-id - unsigned int linear_tid; - - -public: - - /// \smemstorage{BlockDiscontinuity} - struct TempStorage : Uninitialized<_TempStorage> {}; - - - /******************************************************************//** - * \name Collective constructors - *********************************************************************/ - //@{ - - /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. - */ - __device__ __forceinline__ BlockDiscontinuity() - : - temp_storage(PrivateStorage()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. - */ - __device__ __forceinline__ BlockDiscontinuity( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - //@} end member group - /******************************************************************//** - * \name Head flag operations - *********************************************************************/ - //@{ - - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagHeads( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items - FlagOp flag_op) ///< [in] Binary boolean flag predicate - { - // Share last item - temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - - CTA_SYNC(); - - if (linear_tid == 0) - { - // Set flag for first thread-item (preds[0] is undefined) - head_flags[0] = 1; - } - else - { - preds[0] = temp_storage.last_items[linear_tid - 1]; - head_flags[0] = ApplyOp::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); - } - - // Set head_flags for remaining items - Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); - } - - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagHeads( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items - FlagOp flag_op, ///< [in] Binary boolean flag predicate - T tile_predecessor_item) ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). 
- { - // Share last item - temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - - CTA_SYNC(); - - // Set flag for first thread-item - preds[0] = (linear_tid == 0) ? - tile_predecessor_item : // First thread - temp_storage.last_items[linear_tid - 1]; - - head_flags[0] = ApplyOp::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); - - // Set head_flags for remaining items - Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); - } - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - - /** - * \brief Sets head flags indicating discontinuities between items partitioned across the thread block, for which the first item has no reference and is always flagged. - * - * \par - * - The flag head_flagsi is set for item - * inputi when - * flag_op(previous-item, inputi) - * returns \p true (where previous-item is either the preceding item - * in the same thread or the last item in the previous thread). - * - For thread0, item input0 is always flagged. - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates the head-flagging of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int - * typedef cub::BlockDiscontinuity BlockDiscontinuity; - * - * // Allocate shared memory for BlockDiscontinuity - * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Collectively compute head flags for discontinuities in the segment - * int head_flags[4]; - * BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is - * { [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }. - * The corresponding output \p head_flags in those threads will be - * { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam FlagT [inferred] The flag type (must be an integer type) - * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. - */ - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagHeads( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate - { - T preds[ITEMS_PER_THREAD]; - FlagHeads(head_flags, input, preds, flag_op); - } - - - /** - * \brief Sets head flags indicating discontinuities between items partitioned across the thread block. - * - * \par - * - The flag head_flagsi is set for item - * inputi when - * flag_op(previous-item, inputi) - * returns \p true (where previous-item is either the preceding item - * in the same thread or the last item in the previous thread). 
- * - For thread0, item input0 is compared - * against \p tile_predecessor_item. - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates the head-flagging of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int - * typedef cub::BlockDiscontinuity BlockDiscontinuity; - * - * // Allocate shared memory for BlockDiscontinuity - * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Have thread0 obtain the predecessor item for the entire tile - * int tile_predecessor_item; - * if (threadIdx.x == 0) tile_predecessor_item == ... - * - * // Collectively compute head flags for discontinuities in the segment - * int head_flags[4]; - * BlockDiscontinuity(temp_storage).FlagHeads( - * head_flags, thread_data, cub::Inequality(), tile_predecessor_item); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is - * { [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }, - * and that \p tile_predecessor_item is \p 0. The corresponding output \p head_flags in those threads will be - * { [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam FlagT [inferred] The flag type (must be an integer type) - * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. - */ - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagHeads( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op, ///< [in] Binary boolean flag predicate - T tile_predecessor_item) ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). - { - T preds[ITEMS_PER_THREAD]; - FlagHeads(head_flags, input, preds, flag_op, tile_predecessor_item); - } - - - - //@} end member group - /******************************************************************//** - * \name Tail flag operations - *********************************************************************/ - //@{ - - - /** - * \brief Sets tail flags indicating discontinuities between items partitioned across the thread block, for which the last item has no reference and is always flagged. - * - * \par - * - The flag tail_flagsi is set for item - * inputi when - * flag_op(inputi, next-item) - * returns \p true (where next-item is either the next item - * in the same thread or the first item in the next thread). - * - For threadBLOCK_THREADS-1, item - * inputITEMS_PER_THREAD-1 is always flagged. 
- * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates the tail-flagging of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int - * typedef cub::BlockDiscontinuity BlockDiscontinuity; - * - * // Allocate shared memory for BlockDiscontinuity - * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Collectively compute tail flags for discontinuities in the segment - * int tail_flags[4]; - * BlockDiscontinuity(temp_storage).FlagTails(tail_flags, thread_data, cub::Inequality()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is - * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }. - * The corresponding output \p tail_flags in those threads will be - * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam FlagT [inferred] The flag type (must be an integer type) - * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. - */ - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagTails( - FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate - { - // Share first item - temp_storage.first_items[linear_tid] = input[0]; - - CTA_SYNC(); - - // Set flag for last thread-item - tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? - 1 : // Last thread - ApplyOp::FlagT( - flag_op, - input[ITEMS_PER_THREAD - 1], - temp_storage.first_items[linear_tid + 1], - (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); - - // Set tail_flags for remaining items - Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); - } - - - /** - * \brief Sets tail flags indicating discontinuities between items partitioned across the thread block. - * - * \par - * - The flag tail_flagsi is set for item - * inputi when - * flag_op(inputi, next-item) - * returns \p true (where next-item is either the next item - * in the same thread or the first item in the next thread). - * - For threadBLOCK_THREADS-1, item - * inputITEMS_PER_THREAD-1 is compared - * against \p tile_successor_item. - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates the tail-flagging of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) 
- * { - * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int - * typedef cub::BlockDiscontinuity BlockDiscontinuity; - * - * // Allocate shared memory for BlockDiscontinuity - * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Have thread127 obtain the successor item for the entire tile - * int tile_successor_item; - * if (threadIdx.x == 127) tile_successor_item == ... - * - * // Collectively compute tail flags for discontinuities in the segment - * int tail_flags[4]; - * BlockDiscontinuity(temp_storage).FlagTails( - * tail_flags, thread_data, cub::Inequality(), tile_successor_item); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is - * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] } - * and that \p tile_successor_item is \p 125. The corresponding output \p tail_flags in those threads will be - * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam FlagT [inferred] The flag type (must be an integer type) - * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. - */ - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagTails( - FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op, ///< [in] Binary boolean flag predicate - T tile_successor_item) ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). - { - // Share first item - temp_storage.first_items[linear_tid] = input[0]; - - CTA_SYNC(); - - // Set flag for last thread-item - T successor_item = (linear_tid == BLOCK_THREADS - 1) ? - tile_successor_item : // Last thread - temp_storage.first_items[linear_tid + 1]; - - tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( - flag_op, - input[ITEMS_PER_THREAD - 1], - successor_item, - (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); - - // Set tail_flags for remaining items - Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); - } - - - //@} end member group - /******************************************************************//** - * \name Head & tail flag operations - *********************************************************************/ - //@{ - - - /** - * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block. - * - * \par - * - The flag head_flagsi is set for item - * inputi when - * flag_op(previous-item, inputi) - * returns \p true (where previous-item is either the preceding item - * in the same thread or the last item in the previous thread). - * - For thread0, item input0 is always flagged. - * - The flag tail_flagsi is set for item - * inputi when - * flag_op(inputi, next-item) - * returns \p true (where next-item is either the next item - * in the same thread or the first item in the next thread). 
- * - For thread BLOCK_THREADS-1, item
- *   input[ITEMS_PER_THREAD-1] is always flagged.
- * - \blocked
- * - \granularity
- * - \smemreuse
- *
- * \par Snippet
- * The code snippet below illustrates the head- and tail-flagging of 512 integer items that
- * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads
- * where each thread owns 4 consecutive items.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
- *
- * __global__ void ExampleKernel(...)
- * {
- *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
- *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
- *
- *     // Allocate shared memory for BlockDiscontinuity
- *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
- *
- *     // Obtain a segment of consecutive items that are blocked across threads
- *     int thread_data[4];
- *     ...
- *
- *     // Collectively compute head and tail flags for discontinuities in the segment
- *     int head_flags[4];
- *     int tail_flags[4];
- *     BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
- *         head_flags, tail_flags, thread_data, cub::Inequality());
- *
- * \endcode
- * \par
- * Suppose the set of input \p thread_data across the block of threads is
- * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }.
- * The corresponding output \p head_flags
- * in those threads will be { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... },
- * and the corresponding output \p tail_flags in those threads will be
- * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }.
- *
- * \tparam ITEMS_PER_THREAD  [inferred] The number of consecutive items partitioned onto each thread.
- * \tparam FlagT             [inferred] The flag type (must be an integer type)
- * \tparam FlagOp            [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data.
- */
- template <
-     int         ITEMS_PER_THREAD,
-     typename    FlagT,
-     typename    FlagOp>
- __device__ __forceinline__ void FlagHeadsAndTails(
-     FlagT       (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
-     FlagT       (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
-     T           (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-     FlagOp      flag_op)                            ///< [in] Binary boolean flag predicate
- {
-     // Share first and last items
-     temp_storage.first_items[linear_tid] = input[0];
-     temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
-
-     CTA_SYNC();
-
-     T preds[ITEMS_PER_THREAD];
-
-     // Set flag for first thread-item
-     preds[0] = temp_storage.last_items[linear_tid - 1];
-     if (linear_tid == 0)
-     {
-         head_flags[0] = 1;
-     }
-     else
-     {
-         head_flags[0] = ApplyOp<FlagOp>::FlagT(
-             flag_op,
-             preds[0],
-             input[0],
-             linear_tid * ITEMS_PER_THREAD);
-     }
-
-
-     // Set flag for last thread-item
-     tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
-         1 :                             // Last thread
-         ApplyOp<FlagOp>::FlagT(
-             flag_op,
-             input[ITEMS_PER_THREAD - 1],
-             temp_storage.first_items[linear_tid + 1],
-             (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
-
-     // Set head_flags for remaining items
-     Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
-
-     // Set tail_flags for remaining items
-     Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
- }
-
-
- /**
- * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
- *
- * \par
- * - The flag head_flags[i] is set for item
- *   input[i] when
- *   flag_op(previous-item, input[i])
- *   returns \p true (where previous-item is either the preceding item
- *   in the same thread or the last item in the previous thread).
- * - For thread 0, item input[0] is always flagged.
- * - The flag tail_flags[i] is set for item
- *   input[i] when
- *   flag_op(input[i], next-item)
- *   returns \p true (where next-item is either the next item
- *   in the same thread or the first item in the next thread).
- * - For thread BLOCK_THREADS-1, item
- *   input[ITEMS_PER_THREAD-1] is compared
- *   against \p tile_successor_item.
- * - \blocked
- * - \granularity
- * - \smemreuse
- *
- * \par Snippet
- * The code snippet below illustrates the tail-flagging of 512 integer items that
- * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads
- * where each thread owns 4 consecutive items.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
- *
- * __global__ void ExampleKernel(...)
- * {
- *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
- *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
- *
- *     // Allocate shared memory for BlockDiscontinuity
- *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
- *
- *     // Obtain a segment of consecutive items that are blocked across threads
- *     int thread_data[4];
- *     ...
- *
- *     // Have thread 127 obtain the successor item for the entire tile
- *     int tile_successor_item;
- *     if (threadIdx.x == 127) tile_successor_item = ...
- *
- *     // Collectively compute head and tail flags for discontinuities in the segment
- *     int head_flags[4];
- *     int tail_flags[4];
- *     BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
- *         head_flags, tail_flags, tile_successor_item, thread_data, cub::Inequality());
- *
- * \endcode
- * \par
- * Suppose the set of input \p thread_data across the block of threads is
- * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }
- * and that the \p tile_successor_item is \p 125. The corresponding output \p head_flags
- * in those threads will be { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... },
- * and the corresponding output \p tail_flags in those threads will be
- * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }.
- *
- * \tparam ITEMS_PER_THREAD  [inferred] The number of consecutive items partitioned onto each thread.
- * \tparam FlagT             [inferred] The flag type (must be an integer type)
- * \tparam FlagOp            [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data.
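Taken together, head and tail flags make run-oriented computations purely local: each run in the tile contributes exactly one head flag, so the tile's run count is the block-wide sum of head flags. A hedged sketch combining BlockDiscontinuity with BlockReduce (hypothetical kernel and buffer names; 128 threads, 4 items per thread, union-aliased shared storage):

    #include <cub/cub.cuh>

    __global__ void CountRuns(const int *d_in, int *d_num_runs)
    {
        typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
        typedef cub::BlockReduce<int, 128>        BlockReduce;
        __shared__ union {
            typename BlockDiscontinuity::TempStorage discontinuity;
            typename BlockReduce::TempStorage        reduce;
        } temp_storage;

        int thread_data[4];
        cub::LoadDirectBlocked(threadIdx.x, d_in, thread_data);

        int head_flags[4], tail_flags[4];
        BlockDiscontinuity(temp_storage.discontinuity).FlagHeadsAndTails(
            head_flags, tail_flags, thread_data, cub::Inequality());

        __syncthreads();    // the union aliases both temp storages

        // Each run has exactly one head, so the run count is the sum of head flags
        int heads = head_flags[0] + head_flags[1] + head_flags[2] + head_flags[3];
        int num_runs = BlockReduce(temp_storage.reduce).Sum(heads);
        if (threadIdx.x == 0) *d_num_runs = num_runs;
    }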
- */ - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagHeadsAndTails( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags - T tile_successor_item, ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate - { - // Share first and last items - temp_storage.first_items[linear_tid] = input[0]; - temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - - CTA_SYNC(); - - T preds[ITEMS_PER_THREAD]; - - // Set flag for first thread-item - if (linear_tid == 0) - { - head_flags[0] = 1; - } - else - { - preds[0] = temp_storage.last_items[linear_tid - 1]; - head_flags[0] = ApplyOp::FlagT( - flag_op, - preds[0], - input[0], - linear_tid * ITEMS_PER_THREAD); - } - - // Set flag for last thread-item - T successor_item = (linear_tid == BLOCK_THREADS - 1) ? - tile_successor_item : // Last thread - temp_storage.first_items[linear_tid + 1]; - - tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( - flag_op, - input[ITEMS_PER_THREAD - 1], - successor_item, - (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); - - // Set head_flags for remaining items - Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); - - // Set tail_flags for remaining items - Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); - } - - - /** - * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block. - * - * \par - * - The flag head_flagsi is set for item - * inputi when - * flag_op(previous-item, inputi) - * returns \p true (where previous-item is either the preceding item - * in the same thread or the last item in the previous thread). - * - For thread0, item input0 is compared - * against \p tile_predecessor_item. - * - The flag tail_flagsi is set for item - * inputi when - * flag_op(inputi, next-item) - * returns \p true (where next-item is either the next item - * in the same thread or the first item in the next thread). - * - For threadBLOCK_THREADS-1, item - * inputITEMS_PER_THREAD-1 is always flagged. - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates the head- and tail-flagging of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int - * typedef cub::BlockDiscontinuity BlockDiscontinuity; - * - * // Allocate shared memory for BlockDiscontinuity - * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Have thread0 obtain the predecessor item for the entire tile - * int tile_predecessor_item; - * if (threadIdx.x == 0) tile_predecessor_item == ... - * - * // Have thread127 obtain the successor item for the entire tile - * int tile_successor_item; - * if (threadIdx.x == 127) tile_successor_item == ... 
- *
- *     // Collectively compute head and tail flags for discontinuities in the segment
- *     int head_flags[4];
- *     int tail_flags[4];
- *     BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
- *         head_flags, tile_predecessor_item, tail_flags,
- *         thread_data, cub::Inequality());
- *
- * \endcode
- * \par
- * Suppose the set of input \p thread_data across the block of threads is
- * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }
- * and that the \p tile_predecessor_item is \p 0. The corresponding output \p head_flags
- * in those threads will be { [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... },
- * and the corresponding output \p tail_flags in those threads will be
- * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }.
- *
- * \tparam ITEMS_PER_THREAD  [inferred] The number of consecutive items partitioned onto each thread.
- * \tparam FlagT             [inferred] The flag type (must be an integer type)
- * \tparam FlagOp            [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data.
- */
- template <
-     int         ITEMS_PER_THREAD,
-     typename    FlagT,
-     typename    FlagOp>
- __device__ __forceinline__ void FlagHeadsAndTails(
-     FlagT       (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
-     T           tile_predecessor_item,              ///< [in] [thread 0 only] Item with which to compare the first tile item (input[0] from thread 0).
-     FlagT       (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
-     T           (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-     FlagOp      flag_op)                            ///< [in] Binary boolean flag predicate
- {
-     // Share first and last items
-     temp_storage.first_items[linear_tid] = input[0];
-     temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
-
-     CTA_SYNC();
-
-     T preds[ITEMS_PER_THREAD];
-
-     // Set flag for first thread-item
-     preds[0] = (linear_tid == 0) ?
-         tile_predecessor_item :              // First thread
-         temp_storage.last_items[linear_tid - 1];
-
-     head_flags[0] = ApplyOp<FlagOp>::FlagT(
-         flag_op,
-         preds[0],
-         input[0],
-         linear_tid * ITEMS_PER_THREAD);
-
-     // Set flag for last thread-item
-     tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
-         1 :                             // Last thread
-         ApplyOp<FlagOp>::FlagT(
-             flag_op,
-             input[ITEMS_PER_THREAD - 1],
-             temp_storage.first_items[linear_tid + 1],
-             (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD);
-
-     // Set head_flags for remaining items
-     Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op);
-
-     // Set tail_flags for remaining items
-     Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
- }
-
-
- /**
- * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
- *
- * \par
- * - The flag head_flags[i] is set for item
- *   input[i] when
- *   flag_op(previous-item, input[i])
- *   returns \p true (where previous-item is either the preceding item
- *   in the same thread or the last item in the previous thread).
- * - For thread 0, item input[0] is compared
- *   against \p tile_predecessor_item.
- * - The flag tail_flagsi is set for item - * inputi when - * flag_op(inputi, next-item) - * returns \p true (where next-item is either the next item - * in the same thread or the first item in the next thread). - * - For threadBLOCK_THREADS-1, item - * inputITEMS_PER_THREAD-1 is compared - * against \p tile_successor_item. - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates the head- and tail-flagging of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int - * typedef cub::BlockDiscontinuity BlockDiscontinuity; - * - * // Allocate shared memory for BlockDiscontinuity - * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Have thread0 obtain the predecessor item for the entire tile - * int tile_predecessor_item; - * if (threadIdx.x == 0) tile_predecessor_item == ... - * - * // Have thread127 obtain the successor item for the entire tile - * int tile_successor_item; - * if (threadIdx.x == 127) tile_successor_item == ... - * - * // Collectively compute head and flags for discontinuities in the segment - * int head_flags[4]; - * int tail_flags[4]; - * BlockDiscontinuity(temp_storage).FlagTails( - * head_flags, tile_predecessor_item, tail_flags, tile_successor_item, - * thread_data, cub::Inequality()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is - * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }, - * that the \p tile_predecessor_item is \p 0, and that the - * \p tile_successor_item is \p 125. The corresponding output \p head_flags - * in those threads will be { [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. - * and the corresponding output \p tail_flags in those threads will be - * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam FlagT [inferred] The flag type (must be an integer type) - * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. - */ - template < - int ITEMS_PER_THREAD, - typename FlagT, - typename FlagOp> - __device__ __forceinline__ void FlagHeadsAndTails( - FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags - T tile_predecessor_item, ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). - FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags - T tile_successor_item, ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). 
- T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - FlagOp flag_op) ///< [in] Binary boolean flag predicate - { - // Share first and last items - temp_storage.first_items[linear_tid] = input[0]; - temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; - - CTA_SYNC(); - - T preds[ITEMS_PER_THREAD]; - - // Set flag for first thread-item - preds[0] = (linear_tid == 0) ? - tile_predecessor_item : // First thread - temp_storage.last_items[linear_tid - 1]; - - head_flags[0] = ApplyOp::FlagT( - flag_op, - preds[0], - input[0], - linear_tid * ITEMS_PER_THREAD); - - // Set flag for last thread-item - T successor_item = (linear_tid == BLOCK_THREADS - 1) ? - tile_successor_item : // Last thread - temp_storage.first_items[linear_tid + 1]; - - tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( - flag_op, - input[ITEMS_PER_THREAD - 1], - successor_item, - (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); - - // Set head_flags for remaining items - Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); - - // Set tail_flags for remaining items - Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); - } - - - - - //@} end member group - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/block/block_exchange.cuh b/SRC/cub/block/block_exchange.cuh deleted file mode 100644 index 3ae99343..00000000 --- a/SRC/cub/block/block_exchange.cuh +++ /dev/null @@ -1,1248 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * The cub::BlockExchange class provides [collective](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block. 
- */
-
-#pragma once
-
-#include "../util_ptx.cuh"
-#include "../util_arch.cuh"
-#include "../util_macro.cuh"
-#include "../util_type.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \brief The BlockExchange class provides [collective](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block. ![](transpose_logo.png)
- * \ingroup BlockModule
- *
- * \tparam T                  The data type to be exchanged.
- * \tparam BLOCK_DIM_X        The thread block length in threads along the X dimension
- * \tparam ITEMS_PER_THREAD   The number of items partitioned onto each thread.
- * \tparam WARP_TIME_SLICING  [optional] When \p true, only use enough shared memory for a single warp's worth of tile data, time-slicing the block-wide exchange over multiple synchronized rounds. Yields a smaller memory footprint at the expense of decreased parallelism. (Default: false)
- * \tparam BLOCK_DIM_Y        [optional] The thread block length in threads along the Y dimension (default: 1)
- * \tparam BLOCK_DIM_Z        [optional] The thread block length in threads along the Z dimension (default: 1)
- * \tparam PTX_ARCH           [optional] \ptxversion
- *
- * \par Overview
- * - It is commonplace for blocks of threads to rearrange data items between
- *   threads. For example, the device-accessible memory subsystem prefers access patterns
- *   where data items are "striped" across threads (where consecutive threads access consecutive items),
- *   yet most block-wide operations prefer a "blocked" partitioning of items across threads
- *   (where consecutive items belong to a single thread).
- * - BlockExchange supports the following types of data exchanges:
- *   - Transposing between [blocked](index.html#sec5sec3) and [striped](index.html#sec5sec3) arrangements
- *   - Transposing between [blocked](index.html#sec5sec3) and [warp-striped](index.html#sec5sec3) arrangements
- *   - Scattering ranked items to a [blocked arrangement](index.html#sec5sec3)
- *   - Scattering ranked items to a [striped arrangement](index.html#sec5sec3)
- * - \rowmajor
- *
- * \par A Simple Example
- * \blockcollective{BlockExchange}
- * \par
- * The code snippet below illustrates the conversion from a "striped" to a "blocked" arrangement
- * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/block/block_exchange.cuh>
- *
- * __global__ void ExampleKernel(int *d_data, ...)
- * {
- *     // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
- *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
- *
- *     // Allocate shared memory for BlockExchange
- *     __shared__ typename BlockExchange::TempStorage temp_storage;
- *
- *     // Load a tile of data striped across threads
- *     int thread_data[4];
- *     cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data);
- *
- *     // Collectively exchange data into a blocked arrangement across threads
- *     BlockExchange(temp_storage).StripedToBlocked(thread_data);
- *
- * \endcode
- * \par
- * Suppose the set of striped input \p thread_data across the block of threads is
- * { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }.
- * The corresponding output \p thread_data in those threads will be
- * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }.
- *
- * \par Performance Considerations
- * - Proper device-specific padding ensures zero bank conflicts for most types.
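The canonical use of BlockExchange is the coalesced-load pattern sketched below: read the tile striped so that consecutive threads touch consecutive addresses, transpose to blocked for register-resident per-thread work, then transpose back before a striped store. A hedged sketch (hypothetical kernel name; 128 threads, 4 items per thread; note the barrier between the two exchanges, since both reuse the same temp_storage):

    #include <cub/cub.cuh>

    __global__ void ScaleTile(int *d_data)
    {
        typedef cub::BlockExchange<int, 128, 4> BlockExchange;
        __shared__ typename BlockExchange::TempStorage temp_storage;

        // Coalesced read: consecutive threads load consecutive addresses
        int thread_data[4];
        cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data);

        BlockExchange(temp_storage).StripedToBlocked(thread_data, thread_data);

        // thread_data now holds 4 consecutive items; operate on them in registers
        for (int i = 0; i < 4; ++i)
            thread_data[i] *= 2;

        __syncthreads();    // temp_storage is reused by the second exchange

        BlockExchange(temp_storage).BlockedToStriped(thread_data, thread_data);
        cub::StoreDirectStriped<128>(threadIdx.x, d_data, thread_data);
    }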
- * - */ -template < - typename InputT, - int BLOCK_DIM_X, - int ITEMS_PER_THREAD, - bool WARP_TIME_SLICING = false, - int BLOCK_DIM_Y = 1, - int BLOCK_DIM_Z = 1, - int PTX_ARCH = CUB_PTX_ARCH> -class BlockExchange -{ -private: - - /****************************************************************************** - * Constants - ******************************************************************************/ - - /// Constants - enum - { - /// The thread block size in threads - BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, - - LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH), - WARP_THREADS = 1 << LOG_WARP_THREADS, - WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, - - LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(PTX_ARCH), - SMEM_BANKS = 1 << LOG_SMEM_BANKS, - - TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - - TIME_SLICES = (WARP_TIME_SLICING) ? WARPS : 1, - - TIME_SLICED_THREADS = (WARP_TIME_SLICING) ? CUB_MIN(BLOCK_THREADS, WARP_THREADS) : BLOCK_THREADS, - TIME_SLICED_ITEMS = TIME_SLICED_THREADS * ITEMS_PER_THREAD, - - WARP_TIME_SLICED_THREADS = CUB_MIN(BLOCK_THREADS, WARP_THREADS), - WARP_TIME_SLICED_ITEMS = WARP_TIME_SLICED_THREADS * ITEMS_PER_THREAD, - - // Insert padding to avoid bank conflicts during raking when items per thread is a power of two and > 4 (otherwise we can typically use 128b loads) - INSERT_PADDING = (ITEMS_PER_THREAD > 4) && (PowerOfTwo::VALUE), - PADDING_ITEMS = (INSERT_PADDING) ? (TIME_SLICED_ITEMS >> LOG_SMEM_BANKS) : 0, - }; - - /****************************************************************************** - * Type definitions - ******************************************************************************/ - - /// Shared memory storage layout type - struct __align__(16) _TempStorage - { - InputT buff[TIME_SLICED_ITEMS + PADDING_ITEMS]; - }; - -public: - - /// \smemstorage{BlockExchange} - struct TempStorage : Uninitialized<_TempStorage> {}; - -private: - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - /// Shared storage reference - _TempStorage &temp_storage; - - /// Linear thread-id - unsigned int linear_tid; - unsigned int lane_id; - unsigned int warp_id; - unsigned int warp_offset; - - - /****************************************************************************** - * Utility methods - ******************************************************************************/ - - /// Internal storage allocator - __device__ __forceinline__ _TempStorage& PrivateStorage() - { - __shared__ _TempStorage private_storage; - return private_storage; - } - - - /** - * Transposes data items from blocked arrangement to striped arrangement. Specialized for no timeslicing. - */ - template - __device__ __forceinline__ void BlockedToStriped( - InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. - OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. 
- Int2Type /*time_slicing*/) - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM; - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - temp_storage.buff[item_offset] = input_items[ITEM]; - } - - CTA_SYNC(); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - output_items[ITEM] = temp_storage.buff[item_offset]; - } - } - - - /** - * Transposes data items from blocked arrangement to striped arrangement. Specialized for warp-timeslicing. - */ - template - __device__ __forceinline__ void BlockedToStriped( - InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. - OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. - Int2Type /*time_slicing*/) - { - InputT temp_items[ITEMS_PER_THREAD]; - - #pragma unroll - for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) - { - const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS; - const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS; - - CTA_SYNC(); - - if (warp_id == SLICE) - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM; - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - temp_storage.buff[item_offset] = input_items[ITEM]; - } - } - - CTA_SYNC(); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - // Read a strip of items - const int STRIP_OFFSET = ITEM * BLOCK_THREADS; - const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS; - - if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET)) - { - int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET; - if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS)) - { - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - temp_items[ITEM] = temp_storage.buff[item_offset]; - } - } - } - } - - // Copy - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - output_items[ITEM] = temp_items[ITEM]; - } - } - - - /** - * Transposes data items from blocked arrangement to warp-striped arrangement. Specialized for no timeslicing - */ - template - __device__ __forceinline__ void BlockedToWarpStriped( - InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. - OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. - Int2Type /*time_slicing*/) - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD); - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - temp_storage.buff[item_offset] = input_items[ITEM]; - } - - WARP_SYNC(0xffffffff); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - output_items[ITEM] = temp_storage.buff[item_offset]; - } - } - - /** - * Transposes data items from blocked arrangement to warp-striped arrangement. 
Specialized for warp-timeslicing - */ - template - __device__ __forceinline__ void BlockedToWarpStriped( - InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. - OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. - Int2Type /*time_slicing*/) - { - if (warp_id == 0) - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD); - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - temp_storage.buff[item_offset] = input_items[ITEM]; - } - - WARP_SYNC(0xffffffff); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - output_items[ITEM] = temp_storage.buff[item_offset]; - } - } - - #pragma unroll - for (unsigned int SLICE = 1; SLICE < TIME_SLICES; ++SLICE) - { - CTA_SYNC(); - - if (warp_id == SLICE) - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD); - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - temp_storage.buff[item_offset] = input_items[ITEM]; - } - - WARP_SYNC(0xffffffff); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - output_items[ITEM] = temp_storage.buff[item_offset]; - } - } - } - } - - - /** - * Transposes data items from striped arrangement to blocked arrangement. Specialized for no timeslicing. - */ - template - __device__ __forceinline__ void StripedToBlocked( - InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. - OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. - Int2Type /*time_slicing*/) - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - temp_storage.buff[item_offset] = input_items[ITEM]; - } - - CTA_SYNC(); - - // No timeslicing - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM; - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - output_items[ITEM] = temp_storage.buff[item_offset]; - } - } - - - /** - * Transposes data items from striped arrangement to blocked arrangement. Specialized for warp-timeslicing. - */ - template - __device__ __forceinline__ void StripedToBlocked( - InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. - OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. 
- Int2Type /*time_slicing*/) - { - // Warp time-slicing - InputT temp_items[ITEMS_PER_THREAD]; - - #pragma unroll - for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) - { - const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS; - const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS; - - CTA_SYNC(); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - // Write a strip of items - const int STRIP_OFFSET = ITEM * BLOCK_THREADS; - const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS; - - if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET)) - { - int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET; - if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS)) - { - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - temp_storage.buff[item_offset] = input_items[ITEM]; - } - } - } - - CTA_SYNC(); - - if (warp_id == SLICE) - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM; - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - temp_items[ITEM] = temp_storage.buff[item_offset]; - } - } - } - - // Copy - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - output_items[ITEM] = temp_items[ITEM]; - } - } - - - /** - * Transposes data items from warp-striped arrangement to blocked arrangement. Specialized for no timeslicing - */ - template - __device__ __forceinline__ void WarpStripedToBlocked( - InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. - OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. - Int2Type /*time_slicing*/) - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - temp_storage.buff[item_offset] = input_items[ITEM]; - } - - WARP_SYNC(0xffffffff); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD); - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - output_items[ITEM] = temp_storage.buff[item_offset]; - } - } - - - /** - * Transposes data items from warp-striped arrangement to blocked arrangement. Specialized for warp-timeslicing - */ - template - __device__ __forceinline__ void WarpStripedToBlocked( - InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. - OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. 
- Int2Type /*time_slicing*/) - { - #pragma unroll - for (unsigned int SLICE = 0; SLICE < TIME_SLICES; ++SLICE) - { - CTA_SYNC(); - - if (warp_id == SLICE) - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - temp_storage.buff[item_offset] = input_items[ITEM]; - } - - WARP_SYNC(0xffffffff); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD); - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - output_items[ITEM] = temp_storage.buff[item_offset]; - } - } - } - } - - - /** - * Exchanges data items annotated by rank into blocked arrangement. Specialized for no timeslicing. - */ - template - __device__ __forceinline__ void ScatterToBlocked( - InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. - OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. - OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks - Int2Type /*time_slicing*/) - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = ranks[ITEM]; - if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); - temp_storage.buff[item_offset] = input_items[ITEM]; - } - - CTA_SYNC(); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM; - if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); - output_items[ITEM] = temp_storage.buff[item_offset]; - } - } - - /** - * Exchanges data items annotated by rank into blocked arrangement. Specialized for warp-timeslicing. - */ - template - __device__ __forceinline__ void ScatterToBlocked( - InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. - OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. - OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks - Int2Type /*time_slicing*/) - { - InputT temp_items[ITEMS_PER_THREAD]; - - #pragma unroll - for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) - { - CTA_SYNC(); - - const int SLICE_OFFSET = TIME_SLICED_ITEMS * SLICE; - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = ranks[ITEM] - SLICE_OFFSET; - if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS)) - { - if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); - temp_storage.buff[item_offset] = input_items[ITEM]; - } - } - - CTA_SYNC(); - - if (warp_id == SLICE) - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM; - if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); - temp_items[ITEM] = temp_storage.buff[item_offset]; - } - } - } - - // Copy - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - output_items[ITEM] = temp_items[ITEM]; - } - } - - - /** - * Exchanges data items annotated by rank into striped arrangement. Specialized for no timeslicing. 
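The time-sliced specializations above exist to shrink the shared-memory footprint: with WARP_TIME_SLICING the class stages only one warp's worth of items at a time instead of the whole block's tile, paying for it with extra synchronized rounds. A small compile-time illustration of the trade-off (requires C++11; assumes the template parameter order T, BLOCK_DIM_X, ITEMS_PER_THREAD, WARP_TIME_SLICING from this header):

    // 128 threads x 4 items: the full tile stages 512 ints, the sliced tile 128.
    typedef cub::BlockExchange<int, 128, 4, false> FullExchange;
    typedef cub::BlockExchange<int, 128, 4, true>  SlicedExchange;
    static_assert(sizeof(typename SlicedExchange::TempStorage) <
                  sizeof(typename FullExchange::TempStorage),
                  "time-slicing should shrink the shared-memory tile");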
- */ - template - __device__ __forceinline__ void ScatterToStriped( - InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. - OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. - OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks - Int2Type /*time_slicing*/) - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = ranks[ITEM]; - if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); - temp_storage.buff[item_offset] = input_items[ITEM]; - } - - CTA_SYNC(); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; - if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); - output_items[ITEM] = temp_storage.buff[item_offset]; - } - } - - - /** - * Exchanges data items annotated by rank into striped arrangement. Specialized for warp-timeslicing. - */ - template - __device__ __forceinline__ void ScatterToStriped( - InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. - OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. - OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks - Int2Type /*time_slicing*/) - { - InputT temp_items[ITEMS_PER_THREAD]; - - #pragma unroll - for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) - { - const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS; - const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS; - - CTA_SYNC(); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = ranks[ITEM] - SLICE_OFFSET; - if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS)) - { - if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); - temp_storage.buff[item_offset] = input_items[ITEM]; - } - } - - CTA_SYNC(); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - // Read a strip of items - const int STRIP_OFFSET = ITEM * BLOCK_THREADS; - const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS; - - if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET)) - { - int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET; - if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS)) - { - if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; - temp_items[ITEM] = temp_storage.buff[item_offset]; - } - } - } - } - - // Copy - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - output_items[ITEM] = temp_items[ITEM]; - } - } - - -public: - - /******************************************************************//** - * \name Collective constructors - *********************************************************************/ - //@{ - - /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. - */ - __device__ __forceinline__ BlockExchange() - : - temp_storage(PrivateStorage()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), - warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS), - lane_id(LaneId()), - warp_offset(warp_id * WARP_TIME_SLICED_ITEMS) - {} - - - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. 
- */ - __device__ __forceinline__ BlockExchange( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), - lane_id(LaneId()), - warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS), - warp_offset(warp_id * WARP_TIME_SLICED_ITEMS) - {} - - - //@} end member group - /******************************************************************//** - * \name Structured exchanges - *********************************************************************/ - //@{ - - /** - * \brief Transposes data items from striped arrangement to blocked arrangement. - * - * \par - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates the conversion from a "striped" to a "blocked" arrangement - * of 512 integer items partitioned across 128 threads where each thread owns 4 items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(int *d_data, ...) - * { - * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each - * typedef cub::BlockExchange BlockExchange; - * - * // Allocate shared memory for BlockExchange - * __shared__ typename BlockExchange::TempStorage temp_storage; - * - * // Load a tile of ordered data into a striped arrangement across block threads - * int thread_data[4]; - * cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data); - * - * // Collectively exchange data into a blocked arrangement across threads - * BlockExchange(temp_storage).StripedToBlocked(thread_data, thread_data); - * - * \endcode - * \par - * Suppose the set of striped input \p thread_data across the block of threads is - * { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] } after loading from device-accessible memory. - * The corresponding output \p thread_data in those threads will be - * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. - * - */ - template - __device__ __forceinline__ void StripedToBlocked( - InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. - OutputT output_items[ITEMS_PER_THREAD]) ///< [out] Items from exchange, converting between striped and blocked arrangements. - { - StripedToBlocked(input_items, output_items, Int2Type()); - } - - - /** - * \brief Transposes data items from blocked arrangement to striped arrangement. - * - * \par - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement - * of 512 integer items partitioned across 128 threads where each thread owns 4 items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(int *d_data, ...) - * { - * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each - * typedef cub::BlockExchange BlockExchange; - * - * // Allocate shared memory for BlockExchange - * __shared__ typename BlockExchange::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... 
- * - * // Collectively exchange data into a striped arrangement across threads - * BlockExchange(temp_storage).BlockedToStriped(thread_data, thread_data); - * - * // Store data striped across block threads into an ordered tile - * cub::StoreDirectStriped(threadIdx.x, d_data, thread_data); - * - * \endcode - * \par - * Suppose the set of blocked input \p thread_data across the block of threads is - * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. - * The corresponding output \p thread_data in those threads will be - * { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] } in - * preparation for storing to device-accessible memory. - * - */ - template - __device__ __forceinline__ void BlockedToStriped( - InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. - OutputT output_items[ITEMS_PER_THREAD]) ///< [out] Items from exchange, converting between striped and blocked arrangements. - { - BlockedToStriped(input_items, output_items, Int2Type()); - } - - - - /** - * \brief Transposes data items from warp-striped arrangement to blocked arrangement. - * - * \par - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates the conversion from a "warp-striped" to a "blocked" arrangement - * of 512 integer items partitioned across 128 threads where each thread owns 4 items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(int *d_data, ...) - * { - * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each - * typedef cub::BlockExchange BlockExchange; - * - * // Allocate shared memory for BlockExchange - * __shared__ typename BlockExchange::TempStorage temp_storage; - * - * // Load a tile of ordered data into a warp-striped arrangement across warp threads - * int thread_data[4]; - * cub::LoadSWarptriped(threadIdx.x, d_data, thread_data); - * - * // Collectively exchange data into a blocked arrangement across threads - * BlockExchange(temp_storage).WarpStripedToBlocked(thread_data); - * - * \endcode - * \par - * Suppose the set of warp-striped input \p thread_data across the block of threads is - * { [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] } - * after loading from device-accessible memory. (The first 128 items are striped across - * the first warp of 32 threads, the second 128 items are striped across the second warp, etc.) - * The corresponding output \p thread_data in those threads will be - * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. - * - */ - template - __device__ __forceinline__ void WarpStripedToBlocked( - InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. - OutputT output_items[ITEMS_PER_THREAD]) ///< [out] Items from exchange, converting between striped and blocked arrangements. - { - WarpStripedToBlocked(input_items, output_items, Int2Type()); - } - - - - /** - * \brief Transposes data items from blocked arrangement to warp-striped arrangement. - * - * \par - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates the conversion from a "blocked" to a "warp-striped" arrangement - * of 512 integer items partitioned across 128 threads where each thread owns 4 items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(int *d_data, ...) 
- * { - * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each - * typedef cub::BlockExchange BlockExchange; - * - * // Allocate shared memory for BlockExchange - * __shared__ typename BlockExchange::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Collectively exchange data into a warp-striped arrangement across threads - * BlockExchange(temp_storage).BlockedToWarpStriped(thread_data, thread_data); - * - * // Store data striped across warp threads into an ordered tile - * cub::StoreDirectStriped(threadIdx.x, d_data, thread_data); - * - * \endcode - * \par - * Suppose the set of blocked input \p thread_data across the block of threads is - * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. - * The corresponding output \p thread_data in those threads will be - * { [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] } - * in preparation for storing to device-accessible memory. (The first 128 items are striped across - * the first warp of 32 threads, the second 128 items are striped across the second warp, etc.) - * - */ - template - __device__ __forceinline__ void BlockedToWarpStriped( - InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. - OutputT output_items[ITEMS_PER_THREAD]) ///< [out] Items from exchange, converting between striped and blocked arrangements. - { - BlockedToWarpStriped(input_items, output_items, Int2Type()); - } - - - - //@} end member group - /******************************************************************//** - * \name Scatter exchanges - *********************************************************************/ - //@{ - - - /** - * \brief Exchanges data items annotated by rank into blocked arrangement. - * - * \par - * - \smemreuse - * - * \tparam OffsetT [inferred] Signed integer type for local offsets - */ - template - __device__ __forceinline__ void ScatterToBlocked( - InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. - OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items from exchange, converting between striped and blocked arrangements. - OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks - { - ScatterToBlocked(input_items, output_items, ranks, Int2Type()); - } - - - - /** - * \brief Exchanges data items annotated by rank into striped arrangement. - * - * \par - * - \smemreuse - * - * \tparam OffsetT [inferred] Signed integer type for local offsets - */ - template - __device__ __forceinline__ void ScatterToStriped( - InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. - OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items from exchange, converting between striped and blocked arrangements. - OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks - { - ScatterToStriped(input_items, output_items, ranks, Int2Type()); - } - - - - /** - * \brief Exchanges data items annotated by rank into striped arrangement. Items with rank -1 are not exchanged. - * - * \par - * - \smemreuse - * - * \tparam OffsetT [inferred] Signed integer type for local offsets - */ - template - __device__ __forceinline__ void ScatterToStripedGuarded( - InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. 
- OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items from exchange, converting between striped and blocked arrangements. - OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = ranks[ITEM]; - if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); - if (ranks[ITEM] >= 0) - temp_storage.buff[item_offset] = input_items[ITEM]; - } - - CTA_SYNC(); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; - if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); - output_items[ITEM] = temp_storage.buff[item_offset]; - } - } - - - - - /** - * \brief Exchanges valid data items annotated by rank into striped arrangement. - * - * \par - * - \smemreuse - * - * \tparam OffsetT [inferred] Signed integer type for local offsets - * \tparam ValidFlag [inferred] FlagT type denoting which items are valid - */ - template - __device__ __forceinline__ void ScatterToStripedFlagged( - InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. - OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items from exchange, converting between striped and blocked arrangements. - OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks - ValidFlag is_valid[ITEMS_PER_THREAD]) ///< [in] Corresponding flag denoting item validity - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = ranks[ITEM]; - if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); - if (is_valid[ITEM]) - temp_storage.buff[item_offset] = input_items[ITEM]; - } - - CTA_SYNC(); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; - if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); - output_items[ITEM] = temp_storage.buff[item_offset]; - } - } - - - //@} end member group - - - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - - __device__ __forceinline__ void StripedToBlocked( - InputT items[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between striped and blocked arrangements. - { - StripedToBlocked(items, items); - } - - __device__ __forceinline__ void BlockedToStriped( - InputT items[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between striped and blocked arrangements. - { - BlockedToStriped(items, items); - } - - __device__ __forceinline__ void WarpStripedToBlocked( - InputT items[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between striped and blocked arrangements. - { - WarpStripedToBlocked(items, items); - } - - __device__ __forceinline__ void BlockedToWarpStriped( - InputT items[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between striped and blocked arrangements. - { - BlockedToWarpStriped(items, items); - } - - template - __device__ __forceinline__ void ScatterToBlocked( - InputT items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between striped and blocked arrangements. 
- OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks - { - ScatterToBlocked(items, items, ranks); - } - - template - __device__ __forceinline__ void ScatterToStriped( - InputT items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between striped and blocked arrangements. - OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks - { - ScatterToStriped(items, items, ranks); - } - - template - __device__ __forceinline__ void ScatterToStripedGuarded( - InputT items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between striped and blocked arrangements. - OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks - { - ScatterToStripedGuarded(items, items, ranks); - } - - template - __device__ __forceinline__ void ScatterToStripedFlagged( - InputT items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between striped and blocked arrangements. - OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks - ValidFlag is_valid[ITEMS_PER_THREAD]) ///< [in] Corresponding flag denoting item validity - { - ScatterToStriped(items, items, ranks, is_valid); - } - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - -}; - - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - -template < - typename T, - int ITEMS_PER_THREAD, - int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, - int PTX_ARCH = CUB_PTX_ARCH> -class WarpExchange -{ -private: - - /****************************************************************************** - * Constants - ******************************************************************************/ - - /// Constants - enum - { - // Whether the logical warp size and the PTX warp size coincide - IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), - - WARP_ITEMS = (ITEMS_PER_THREAD * LOGICAL_WARP_THREADS) + 1, - - LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(PTX_ARCH), - SMEM_BANKS = 1 << LOG_SMEM_BANKS, - - // Insert padding if the number of items per thread is a power of two and > 4 (otherwise we can typically use 128b loads) - INSERT_PADDING = (ITEMS_PER_THREAD > 4) && (PowerOfTwo::VALUE), - PADDING_ITEMS = (INSERT_PADDING) ? (WARP_ITEMS >> LOG_SMEM_BANKS) : 0, - }; - - /****************************************************************************** - * Type definitions - ******************************************************************************/ - - /// Shared memory storage layout type - struct _TempStorage - { - T buff[WARP_ITEMS + PADDING_ITEMS]; - }; - -public: - - /// \smemstorage{WarpExchange} - struct TempStorage : Uninitialized<_TempStorage> {}; - -private: - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - _TempStorage &temp_storage; - int lane_id; - -public: - - /****************************************************************************** - * Construction - ******************************************************************************/ - - /// Constructor - __device__ __forceinline__ WarpExchange( - TempStorage &temp_storage) - : - temp_storage(temp_storage.Alias()), - lane_id(IS_ARCH_WARP ? - LaneId() : - LaneId() % LOGICAL_WARP_THREADS) - {} - - - /****************************************************************************** - * Interface - ******************************************************************************/ - - /** - * \brief Exchanges valid data items annotated by rank into striped arrangement. 
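-     *
-     * \par
-     * A minimal usage sketch (illustrative only; the ranks written by one
-     * warp are assumed to form a permutation of
-     * [0, ITEMS_PER_THREAD * LOGICAL_WARP_THREADS)):
-     * \code
-     * // Specialize WarpExchange for 4 integer items per lane
-     * typedef cub::WarpExchange<int, 4> WarpExchange;
-     *
-     * // One warp's worth of storage (a real kernel provisions one per warp)
-     * __shared__ typename WarpExchange::TempStorage temp_storage;
-     *
-     * int items[4];
-     * int ranks[4];
-     * ...  // compute per-item scatter ranks
-     * WarpExchange(temp_storage).ScatterToStriped(items, ranks);
-     * \endcode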
- * - * \par - * - \smemreuse - * - * \tparam OffsetT [inferred] Signed integer type for local offsets - */ - template - __device__ __forceinline__ void ScatterToStriped( - T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange - OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks - { - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - if (INSERT_PADDING) ranks[ITEM] = SHR_ADD(ranks[ITEM], LOG_SMEM_BANKS, ranks[ITEM]); - temp_storage.buff[ranks[ITEM]] = items[ITEM]; - } - - WARP_SYNC(0xffffffff); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - int item_offset = (ITEM * LOGICAL_WARP_THREADS) + lane_id; - if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); - items[ITEM] = temp_storage.buff[item_offset]; - } - } - -}; - - - - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - - - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/SRC/cub/block/block_histogram.cuh b/SRC/cub/block/block_histogram.cuh deleted file mode 100644 index b7cb9700..00000000 --- a/SRC/cub/block/block_histogram.cuh +++ /dev/null @@ -1,415 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * The cub::BlockHistogram class provides [collective](index.html#sec0) methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 
- */
-
-#pragma once
-
-#include "specializations/block_histogram_sort.cuh"
-#include "specializations/block_histogram_atomic.cuh"
-#include "../util_ptx.cuh"
-#include "../util_arch.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Algorithmic variants
- ******************************************************************************/
-
-/**
- * \brief BlockHistogramAlgorithm enumerates alternative algorithms for the parallel construction of block-wide histograms.
- */
-enum BlockHistogramAlgorithm
-{
-
-    /**
-     * \par Overview
-     * Sorting followed by differentiation.  Execution comprises two phases:
-     * -# Sort the data using an efficient radix sort
-     * -# Look for "runs" of same-valued keys by detecting discontinuities; the run-lengths are histogram bin counts.
-     *
-     * \par Performance Considerations
-     * Delivers consistent throughput regardless of sample bin distribution.
-     */
-    BLOCK_HISTO_SORT,
-
-
-    /**
-     * \par Overview
-     * Use atomic addition to update byte counts directly.
-     *
-     * \par Performance Considerations
-     * Performance is strongly tied to the hardware implementation of atomic
-     * addition, and may be significantly degraded for non-uniformly-random
-     * input distributions where many concurrent updates are likely to be
-     * made to the same bin counter.
-     */
-    BLOCK_HISTO_ATOMIC,
-};
-
-
-
-/******************************************************************************
- * Block histogram
- ******************************************************************************/
-
-
-/**
- * \brief The BlockHistogram class provides [collective](index.html#sec0) methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. ![](histogram_logo.png)
- * \ingroup BlockModule
- *
- * \tparam T                   The sample type being histogrammed (must be castable to an integer bin identifier)
- * \tparam BLOCK_DIM_X         The thread block length in threads along the X dimension
- * \tparam ITEMS_PER_THREAD    The number of items per thread
- * \tparam BINS                The number of bins within the histogram
- * \tparam ALGORITHM           [optional] cub::BlockHistogramAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_HISTO_SORT)
- * \tparam BLOCK_DIM_Y         [optional] The thread block length in threads along the Y dimension (default: 1)
- * \tparam BLOCK_DIM_Z         [optional] The thread block length in threads along the Z dimension (default: 1)
- * \tparam PTX_ARCH            [optional] \ptxversion
- *
- * \par Overview
- * - A histogram
- *   counts the number of observations that fall into each of the disjoint categories (known as bins).
- * - BlockHistogram can be optionally specialized to use different algorithms:
- *   -# cub::BLOCK_HISTO_SORT.  Sorting followed by differentiation. [More...](\ref cub::BlockHistogramAlgorithm)
- *   -# cub::BLOCK_HISTO_ATOMIC.  Use atomic addition to update byte counts directly. [More...](\ref cub::BlockHistogramAlgorithm)
- *
- * \par Performance Considerations
- * - \granularity
- *
- * \par A Simple Example
- * \blockcollective{BlockHistogram}
- * \par
- * The code snippet below illustrates a 256-bin histogram of 512 integer samples that
- * are partitioned across 128 threads where each thread owns 4 samples.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/block/block_histogram.cuh>
- *
- * __global__ void ExampleKernel(...)
- * { - * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each - * typedef cub::BlockHistogram BlockHistogram; - * - * // Allocate shared memory for BlockHistogram - * __shared__ typename BlockHistogram::TempStorage temp_storage; - * - * // Allocate shared memory for block-wide histogram bin counts - * __shared__ unsigned int smem_histogram[256]; - * - * // Obtain input samples per thread - * unsigned char data[4]; - * ... - * - * // Compute the block-wide histogram - * BlockHistogram(temp_storage).Histogram(data, smem_histogram); - * - * \endcode - * - * \par Performance and Usage Considerations - * - The histogram output can be constructed in shared or device-accessible memory - * - See cub::BlockHistogramAlgorithm for performance details regarding algorithmic alternatives - * - */ -template < - typename T, - int BLOCK_DIM_X, - int ITEMS_PER_THREAD, - int BINS, - BlockHistogramAlgorithm ALGORITHM = BLOCK_HISTO_SORT, - int BLOCK_DIM_Y = 1, - int BLOCK_DIM_Z = 1, - int PTX_ARCH = CUB_PTX_ARCH> -class BlockHistogram -{ -private: - - /****************************************************************************** - * Constants and type definitions - ******************************************************************************/ - - /// Constants - enum - { - /// The thread block size in threads - BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, - }; - - /** - * Ensure the template parameterization meets the requirements of the - * targeted device architecture. BLOCK_HISTO_ATOMIC can only be used - * on version SM120 or later. Otherwise BLOCK_HISTO_SORT is used - * regardless. - */ - static const BlockHistogramAlgorithm SAFE_ALGORITHM = - ((ALGORITHM == BLOCK_HISTO_ATOMIC) && (PTX_ARCH < 120)) ? - BLOCK_HISTO_SORT : - ALGORITHM; - - /// Internal specialization. - typedef typename If<(SAFE_ALGORITHM == BLOCK_HISTO_SORT), - BlockHistogramSort, - BlockHistogramAtomic >::Type InternalBlockHistogram; - - /// Shared memory storage layout type for BlockHistogram - typedef typename InternalBlockHistogram::TempStorage _TempStorage; - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - /// Shared storage reference - _TempStorage &temp_storage; - - /// Linear thread-id - unsigned int linear_tid; - - - /****************************************************************************** - * Utility methods - ******************************************************************************/ - - /// Internal storage allocator - __device__ __forceinline__ _TempStorage& PrivateStorage() - { - __shared__ _TempStorage private_storage; - return private_storage; - } - - -public: - - /// \smemstorage{BlockHistogram} - struct TempStorage : Uninitialized<_TempStorage> {}; - - - /******************************************************************//** - * \name Collective constructors - *********************************************************************/ - //@{ - - /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. - */ - __device__ __forceinline__ BlockHistogram() - : - temp_storage(PrivateStorage()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. 
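-     *
-     * \par
-     * For example (a brief sketch; the 128/4/256 specialization is
-     * illustrative, not prescribed by this interface):
-     * \code
-     * typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
-     *
-     * // Wrap an externally declared temporary storage allocation
-     * __shared__ typename BlockHistogram::TempStorage temp_storage;
-     * BlockHistogram block_histogram(temp_storage);
-     * \endcode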
- */
-    __device__ __forceinline__ BlockHistogram(
-        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Histogram operations
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Initialize the shared histogram counters to zero.
-     *
-     * \par Snippet
-     * The code snippet below illustrates the initialization and update of a
-     * histogram of 512 integer samples that are partitioned across 128 threads
-     * where each thread owns 4 samples.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_histogram.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each
-     *     typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
-     *
-     *     // Allocate shared memory for BlockHistogram
-     *     __shared__ typename BlockHistogram::TempStorage temp_storage;
-     *
-     *     // Allocate shared memory for block-wide histogram bin counts
-     *     __shared__ unsigned int smem_histogram[256];
-     *
-     *     // Obtain input samples per thread
-     *     unsigned char thread_samples[4];
-     *     ...
-     *
-     *     // Initialize the block-wide histogram
-     *     BlockHistogram(temp_storage).InitHistogram(smem_histogram);
-     *
-     *     // Update the block-wide histogram
-     *     BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram);
-     *
-     * \endcode
-     *
-     * \tparam CounterT              [inferred] Histogram counter type
-     */
-    template <typename CounterT>
-    __device__ __forceinline__ void InitHistogram(CounterT histogram[BINS])
-    {
-        // Initialize histogram bin counts to zeros
-        int histo_offset = 0;
-
-        #pragma unroll
-        for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS)
-        {
-            histogram[histo_offset + linear_tid] = 0;
-        }
-        // Finish up with guarded initialization if necessary
-        if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS))
-        {
-            histogram[histo_offset + linear_tid] = 0;
-        }
-    }
-
-
-    /**
-     * \brief Constructs a block-wide histogram in shared/device-accessible memory.  Each thread contributes an array of input elements.
-     *
-     * \par
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a 256-bin histogram of 512 integer samples that
-     * are partitioned across 128 threads where each thread owns 4 samples.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_histogram.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each
-     *     typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
-     *
-     *     // Allocate shared memory for BlockHistogram
-     *     __shared__ typename BlockHistogram::TempStorage temp_storage;
-     *
-     *     // Allocate shared memory for block-wide histogram bin counts
-     *     __shared__ unsigned int smem_histogram[256];
-     *
-     *     // Obtain input samples per thread
-     *     unsigned char thread_samples[4];
-     *     ...
- * - * // Compute the block-wide histogram - * BlockHistogram(temp_storage).Histogram(thread_samples, smem_histogram); - * - * \endcode - * - * \tparam CounterT [inferred] Histogram counter type - */ - template < - typename CounterT > - __device__ __forceinline__ void Histogram( - T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram - CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram - { - // Initialize histogram bin counts to zeros - InitHistogram(histogram); - - CTA_SYNC(); - - // Composite the histogram - InternalBlockHistogram(temp_storage).Composite(items, histogram); - } - - - - /** - * \brief Updates an existing block-wide histogram in shared/device-accessible memory. Each thread composites an array of input elements. - * - * \par - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a the initialization and update of a - * histogram of 512 integer samples that are partitioned across 128 threads - * where each thread owns 4 samples. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each - * typedef cub::BlockHistogram BlockHistogram; - * - * // Allocate shared memory for BlockHistogram - * __shared__ typename BlockHistogram::TempStorage temp_storage; - * - * // Allocate shared memory for block-wide histogram bin counts - * __shared__ unsigned int smem_histogram[256]; - * - * // Obtain input samples per thread - * unsigned char thread_samples[4]; - * ... - * - * // Initialize the block-wide histogram - * BlockHistogram(temp_storage).InitHistogram(smem_histogram); - * - * // Update the block-wide histogram - * BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram); - * - * \endcode - * - * \tparam CounterT [inferred] Histogram counter type - */ - template < - typename CounterT > - __device__ __forceinline__ void Composite( - T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram - CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram - { - InternalBlockHistogram(temp_storage).Composite(items, histogram); - } - -}; - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/SRC/cub/block/block_load.cuh b/SRC/cub/block/block_load.cuh deleted file mode 100644 index 217f5212..00000000 --- a/SRC/cub/block/block_load.cuh +++ /dev/null @@ -1,1241 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Operations for reading linear tiles of data into the CUDA thread block. - */ - -#pragma once - -#include - -#include "block_exchange.cuh" -#include "../iterator/cache_modified_input_iterator.cuh" -#include "../util_ptx.cuh" -#include "../util_macro.cuh" -#include "../util_type.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \addtogroup UtilIo - * @{ - */ - - -/******************************************************************//** - * \name Blocked arrangement I/O (direct) - *********************************************************************/ -//@{ - - -/** - * \brief Load a linear segment of items into a blocked arrangement across the thread block. - * - * \blocked - * - * \tparam T [inferred] The data type to load. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. - */ -template < - typename InputT, - int ITEMS_PER_THREAD, - typename InputIteratorT> -__device__ __forceinline__ void LoadDirectBlocked( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load -{ - InputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD); - - // Load directly in thread-blocked order - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - items[ITEM] = thread_itr[ITEM]; - } -} - - -/** - * \brief Load a linear segment of items into a blocked arrangement across the thread block, guarded by range. - * - * \blocked - * - * \tparam T [inferred] The data type to load. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. 
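- *
- * \par
- * For instance, a sketch of a guarded call (here \p d_in and \p num_items
- * are assumed kernel arguments, not part of this interface):
- * \code
- * int thread_data[4];
- * cub::LoadDirectBlocked(threadIdx.x, d_in, thread_data, num_items);
- * \endcode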
- */ -template < - typename InputT, - int ITEMS_PER_THREAD, - typename InputIteratorT> -__device__ __forceinline__ void LoadDirectBlocked( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items) ///< [in] Number of valid items to load -{ - InputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD); - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - if ((linear_tid * ITEMS_PER_THREAD) + ITEM < valid_items) - { - items[ITEM] = thread_itr[ITEM]; - } - } -} - - -/** - * \brief Load a linear segment of items into a blocked arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.. - * - * \blocked - * - * \tparam T [inferred] The data type to load. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. - */ -template < - typename InputT, - typename DefaultT, - int ITEMS_PER_THREAD, - typename InputIteratorT> -__device__ __forceinline__ void LoadDirectBlocked( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items, ///< [in] Number of valid items to load - DefaultT oob_default) ///< [in] Default value to assign out-of-bound items -{ - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - items[ITEM] = oob_default; - - LoadDirectBlocked(linear_tid, block_itr, items, valid_items); -} - - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - -/** - * Internal implementation for load vectorization - */ -template < - CacheLoadModifier MODIFIER, - typename T, - int ITEMS_PER_THREAD> -__device__ __forceinline__ void InternalLoadDirectBlockedVectorized( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - T *block_ptr, ///< [in] Input pointer for loading from - T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load -{ - // Biggest memory access word that T is a whole multiple of - typedef typename UnitWord::DeviceWord DeviceWord; - - enum - { - TOTAL_WORDS = sizeof(items) / sizeof(DeviceWord), - - VECTOR_SIZE = (TOTAL_WORDS % 4 == 0) ? - 4 : - (TOTAL_WORDS % 2 == 0) ? - 2 : - 1, - - VECTORS_PER_THREAD = TOTAL_WORDS / VECTOR_SIZE, - }; - - // Vector type - typedef typename CubVector::Type Vector; - - // Vector items - Vector vec_items[VECTORS_PER_THREAD]; - - // Aliased input ptr - Vector* vec_ptr = reinterpret_cast(block_ptr) + (linear_tid * VECTORS_PER_THREAD); - - // Load directly in thread-blocked order - #pragma unroll - for (int ITEM = 0; ITEM < VECTORS_PER_THREAD; ITEM++) - { - vec_items[ITEM] = ThreadLoad(vec_ptr + ITEM); - } - - // Copy - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - items[ITEM] = *(reinterpret_cast(vec_items) + ITEM); - } -} - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - -/** - * \brief Load a linear segment of items into a blocked arrangement across the thread block. 
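- *
- * \par
- * A rough usage sketch (assumes \p d_in is a quad-item-aligned int*; the
- * name is illustrative):
- * \code
- * int thread_data[4];
- * cub::LoadDirectBlockedVectorized(threadIdx.x, d_in, thread_data);
- * \endcode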
- * - * \blocked - * - * The input offset (\p block_ptr + \p block_offset) must be quad-item aligned - * - * The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT: - * - \p ITEMS_PER_THREAD is odd - * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.) - * - * \tparam T [inferred] The data type to load. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - */ -template < - typename T, - int ITEMS_PER_THREAD> -__device__ __forceinline__ void LoadDirectBlockedVectorized( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - T *block_ptr, ///< [in] Input pointer for loading from - T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load -{ - InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); -} - - -//@} end member group -/******************************************************************//** - * \name Striped arrangement I/O (direct) - *********************************************************************/ -//@{ - - -/** - * \brief Load a linear segment of items into a striped arrangement across the thread block. - * - * \striped - * - * \tparam BLOCK_THREADS The thread block size in threads - * \tparam T [inferred] The data type to load. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. - */ -template < - int BLOCK_THREADS, - typename InputT, - int ITEMS_PER_THREAD, - typename InputIteratorT> -__device__ __forceinline__ void LoadDirectStriped( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load -{ - InputIteratorT thread_itr = block_itr + linear_tid; - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - items[ITEM] = thread_itr[ITEM * BLOCK_THREADS]; - } -} - - -/** - * \brief Load a linear segment of items into a striped arrangement across the thread block, guarded by range - * - * \striped - * - * \tparam BLOCK_THREADS The thread block size in threads - * \tparam T [inferred] The data type to load. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. 
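- *
- * \par
- * A short illustrative call, with the block size of 128 supplied explicitly
- * (\p d_in and \p num_items are assumed kernel arguments):
- * \code
- * int thread_data[4];
- * cub::LoadDirectStriped<128>(threadIdx.x, d_in, thread_data, num_items);
- * \endcode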
- */ -template < - int BLOCK_THREADS, - typename InputT, - int ITEMS_PER_THREAD, - typename InputIteratorT> -__device__ __forceinline__ void LoadDirectStriped( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items) ///< [in] Number of valid items to load -{ - InputIteratorT thread_itr = block_itr + linear_tid; - - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - if (linear_tid + (ITEM * BLOCK_THREADS) < valid_items) - { - items[ITEM] = thread_itr[ITEM * BLOCK_THREADS]; - } - } -} - - -/** - * \brief Load a linear segment of items into a striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements. - * - * \striped - * - * \tparam BLOCK_THREADS The thread block size in threads - * \tparam T [inferred] The data type to load. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. - */ -template < - int BLOCK_THREADS, - typename InputT, - typename DefaultT, - int ITEMS_PER_THREAD, - typename InputIteratorT> -__device__ __forceinline__ void LoadDirectStriped( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items, ///< [in] Number of valid items to load - DefaultT oob_default) ///< [in] Default value to assign out-of-bound items -{ - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - items[ITEM] = oob_default; - - LoadDirectStriped(linear_tid, block_itr, items, valid_items); -} - - - -//@} end member group -/******************************************************************//** - * \name Warp-striped arrangement I/O (direct) - *********************************************************************/ -//@{ - - -/** - * \brief Load a linear segment of items into a warp-striped arrangement across the thread block. - * - * \warpstriped - * - * \par Usage Considerations - * The number of threads in the thread block must be a multiple of the architecture's warp size. - * - * \tparam T [inferred] The data type to load. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. 
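- *
- * \par
- * For example (a sketch; assumes a block size that is a multiple of the
- * warp size, with \p d_in an assumed input pointer):
- * \code
- * int thread_data[4];
- * cub::LoadDirectWarpStriped(threadIdx.x, d_in, thread_data);
- * \endcode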
- */ -template < - typename InputT, - int ITEMS_PER_THREAD, - typename InputIteratorT> -__device__ __forceinline__ void LoadDirectWarpStriped( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load -{ - int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); - int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; - int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; - - InputIteratorT thread_itr = block_itr + warp_offset + tid ; - - // Load directly in warp-striped order - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - items[ITEM] = thread_itr[(ITEM * CUB_PTX_WARP_THREADS)]; - } -} - - -/** - * \brief Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range - * - * \warpstriped - * - * \par Usage Considerations - * The number of threads in the thread block must be a multiple of the architecture's warp size. - * - * \tparam T [inferred] The data type to load. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. - */ -template < - typename InputT, - int ITEMS_PER_THREAD, - typename InputIteratorT> -__device__ __forceinline__ void LoadDirectWarpStriped( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items) ///< [in] Number of valid items to load -{ - int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); - int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; - int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; - - InputIteratorT thread_itr = block_itr + warp_offset + tid ; - - // Load directly in warp-striped order - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items) - { - items[ITEM] = thread_itr[(ITEM * CUB_PTX_WARP_THREADS)]; - } - } -} - - -/** - * \brief Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements. - * - * \warpstriped - * - * \par Usage Considerations - * The number of threads in the thread block must be a multiple of the architecture's warp size. - * - * \tparam T [inferred] The data type to load. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. 
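- *
- * \par
- * A sketch of the guarded, defaulted variant (\p d_in, \p num_items, and
- * the \p -1 default are illustrative):
- * \code
- * int thread_data[4];
- * cub::LoadDirectWarpStriped(threadIdx.x, d_in, thread_data, num_items, -1);
- * \endcode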
- */
-template <
-    typename        InputT,
-    typename        DefaultT,
-    int             ITEMS_PER_THREAD,
-    typename        InputIteratorT>
-__device__ __forceinline__ void LoadDirectWarpStriped(
-    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks)
-    InputIteratorT  block_itr,                  ///< [in] The thread block's base input iterator for loading from
-    InputT          (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
-    int             valid_items,                ///< [in] Number of valid items to load
-    DefaultT        oob_default)                ///< [in] Default value to assign out-of-bound items
-{
-    // Load directly in warp-striped order
-    #pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        items[ITEM] = oob_default;
-
-    LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items);
-}
-
-
-
-//@}  end member group
-
-/** @} */       // end group UtilIo
-
-
-
-//-----------------------------------------------------------------------------
-// Generic BlockLoad abstraction
-//-----------------------------------------------------------------------------
-
-/**
- * \brief cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data from memory into a blocked arrangement across a CUDA thread block.
- */
-enum BlockLoadAlgorithm
-{
-    /**
-     * \par Overview
-     *
-     * A [blocked arrangement](index.html#sec5sec3) of data is read
-     * directly from memory.
-     *
-     * \par Performance Considerations
-     * - The utilization of memory transactions (coalescing) decreases as the
-     *   access stride between threads increases (i.e., the number of items per thread).
-     */
-    BLOCK_LOAD_DIRECT,
-
-    /**
-     * \par Overview
-     *
-     * A [blocked arrangement](index.html#sec5sec3) of data is read
-     * from memory using CUDA's built-in vectorized loads as a coalescing optimization.
-     * For example, ld.global.v4.s32 instructions will be generated
-     * when \p T = \p int and \p ITEMS_PER_THREAD % 4 == 0.
-     *
-     * \par Performance Considerations
-     * - The utilization of memory transactions (coalescing) remains high until the
-     *   access stride between threads (i.e., the number of items per thread) exceeds the
-     *   maximum vector load width (typically 4 items or 64B, whichever is lower).
-     * - The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT:
-     *   - \p ITEMS_PER_THREAD is odd
-     *   - The \p InputIteratorT is not a simple pointer type
-     *   - The block input offset is not quadword-aligned
-     *   - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.)
-     */
-    BLOCK_LOAD_VECTORIZE,
-
-    /**
-     * \par Overview
-     *
-     * A [striped arrangement](index.html#sec5sec3) of data is read
-     * efficiently from memory and then locally transposed into a
-     * [blocked arrangement](index.html#sec5sec3).
-     *
-     * \par Performance Considerations
-     * - The utilization of memory transactions (coalescing) remains high regardless
-     *   of items loaded per thread.
-     * - The local reordering incurs slightly longer latencies and lower throughput than the
-     *   direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives.
- */ - BLOCK_LOAD_TRANSPOSE, - - - /** - * \par Overview - * - * A [warp-striped arrangement](index.html#sec5sec3) of data is - * read efficiently from memory and then locally transposed into a - * [blocked arrangement](index.html#sec5sec3). - * - * \par Usage Considerations - * - BLOCK_THREADS must be a multiple of WARP_THREADS - * - * \par Performance Considerations - * - The utilization of memory transactions (coalescing) remains high regardless - * of items loaded per thread. - * - The local reordering incurs slightly larger latencies than the - * direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives. - * - Provisions more shared storage, but incurs smaller latencies than the - * BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED alternative. - */ - BLOCK_LOAD_WARP_TRANSPOSE, - - - /** - * \par Overview - * - * Like \p BLOCK_LOAD_WARP_TRANSPOSE, a [warp-striped arrangement](index.html#sec5sec3) - * of data is read directly from memory and then is locally transposed into a - * [blocked arrangement](index.html#sec5sec3). To reduce the shared memory - * requirement, only one warp's worth of shared memory is provisioned and is - * subsequently time-sliced among warps. - * - * \par Usage Considerations - * - BLOCK_THREADS must be a multiple of WARP_THREADS - * - * \par Performance Considerations - * - The utilization of memory transactions (coalescing) remains high regardless - * of items loaded per thread. - * - Provisions less shared memory temporary storage, but incurs larger - * latencies than the BLOCK_LOAD_WARP_TRANSPOSE alternative. - */ - BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, -}; - - -/** - * \brief The BlockLoad class provides [collective](index.html#sec0) data movement methods for loading a linear segment of items from memory into a [blocked arrangement](index.html#sec5sec3) across a CUDA thread block. ![](block_load_logo.png) - * \ingroup BlockModule - * \ingroup UtilIo - * - * \tparam InputT The data type to read into (which must be convertible from the input iterator's value type). - * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension - * \tparam ITEMS_PER_THREAD The number of consecutive items partitioned onto each thread. - * \tparam ALGORITHM [optional] cub::BlockLoadAlgorithm tuning policy. default: cub::BLOCK_LOAD_DIRECT. - * \tparam WARP_TIME_SLICING [optional] Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage). (default: false) - * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) - * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam PTX_ARCH [optional] \ptxversion - * - * \par Overview - * - The BlockLoad class provides a single data movement abstraction that can be specialized - * to implement different cub::BlockLoadAlgorithm strategies. This facilitates different - * performance policies for different architectures, data types, granularity sizes, etc. - * - BlockLoad can be optionally specialized by different data movement strategies: - * -# cub::BLOCK_LOAD_DIRECT. A [blocked arrangement](index.html#sec5sec3) - * of data is read directly from memory. [More...](\ref cub::BlockLoadAlgorithm) - * -# cub::BLOCK_LOAD_VECTORIZE. A [blocked arrangement](index.html#sec5sec3) - * of data is read directly from memory using CUDA's built-in vectorized loads as a - * coalescing optimization. 
[More...](\ref cub::BlockLoadAlgorithm)
- *   -# cub::BLOCK_LOAD_TRANSPOSE.  A [striped arrangement](index.html#sec5sec3)
- *      of data is read directly from memory and is then locally transposed into a
- *      [blocked arrangement](index.html#sec5sec3). [More...](\ref cub::BlockLoadAlgorithm)
- *   -# cub::BLOCK_LOAD_WARP_TRANSPOSE.  A [warp-striped arrangement](index.html#sec5sec3)
- *      of data is read directly from memory and is then locally transposed into a
- *      [blocked arrangement](index.html#sec5sec3). [More...](\ref cub::BlockLoadAlgorithm)
- *   -# cub::BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED.  A [warp-striped arrangement](index.html#sec5sec3)
- *      of data is read directly from memory and is then locally transposed into a
- *      [blocked arrangement](index.html#sec5sec3) one warp at a time. [More...](\ref cub::BlockLoadAlgorithm)
- * - \rowmajor
- *
- * \par A Simple Example
- * \blockcollective{BlockLoad}
- * \par
- * The code snippet below illustrates the loading of a linear
- * segment of 512 integers into a "blocked" arrangement across 128 threads where each
- * thread owns 4 consecutive items.  The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE,
- * meaning memory references are efficiently coalesced using a warp-striped access
- * pattern (after which items are locally reordered among threads).
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>
- *
- * __global__ void ExampleKernel(int *d_data, ...)
- * {
- *     // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
- *     typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
- *
- *     // Allocate shared memory for BlockLoad
- *     __shared__ typename BlockLoad::TempStorage temp_storage;
- *
- *     // Load a segment of consecutive items that are blocked across threads
- *     int thread_data[4];
- *     BlockLoad(temp_storage).Load(d_data, thread_data);
- *
- * \endcode
- * \par
- * Suppose the input \p d_data is 0, 1, 2, 3, 4, 5, ....
- * The set of \p thread_data across the threads of the block will be
- * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }.
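- *
- * \par
- * The same specialization also supports guarded loads; e.g., a sketch in
- * which \p valid_items is an assumed kernel argument and \p -1 fills the
- * out-of-bounds items:
- * \code
- * BlockLoad(temp_storage).Load(d_data, thread_data, valid_items, -1);
- * \endcode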
- * - */ -template < - typename InputT, - int BLOCK_DIM_X, - int ITEMS_PER_THREAD, - BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, - int BLOCK_DIM_Y = 1, - int BLOCK_DIM_Z = 1, - int PTX_ARCH = CUB_PTX_ARCH> -class BlockLoad -{ -private: - - /****************************************************************************** - * Constants and typed definitions - ******************************************************************************/ - - /// Constants - enum - { - /// The thread block size in threads - BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, - }; - - - /****************************************************************************** - * Algorithmic variants - ******************************************************************************/ - - /// Load helper - template - struct LoadInternal; - - - /** - * BLOCK_LOAD_DIRECT specialization of load helper - */ - template - struct LoadInternal - { - /// Shared memory storage layout type - typedef NullType TempStorage; - - /// Linear thread-id - int linear_tid; - - /// Constructor - __device__ __forceinline__ LoadInternal( - TempStorage &/*temp_storage*/, - int linear_tid) - : - linear_tid(linear_tid) - {} - - /// Load a linear segment of items from memory - template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load - { - LoadDirectBlocked(linear_tid, block_itr, items); - } - - /// Load a linear segment of items from memory, guarded by range - template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items) ///< [in] Number of valid items to load - { - LoadDirectBlocked(linear_tid, block_itr, items, valid_items); - } - - /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements - template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items, ///< [in] Number of valid items to load - DefaultT oob_default) ///< [in] Default value to assign out-of-bound items - { - LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default); - } - - }; - - - /** - * BLOCK_LOAD_VECTORIZE specialization of load helper - */ - template - struct LoadInternal - { - /// Shared memory storage layout type - typedef NullType TempStorage; - - /// Linear thread-id - int linear_tid; - - /// Constructor - __device__ __forceinline__ LoadInternal( - TempStorage &/*temp_storage*/, - int linear_tid) - : - linear_tid(linear_tid) - {} - - /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization) - template - __device__ __forceinline__ void Load( - InputT *block_ptr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load - { - InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); - } - - /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization) - template - __device__ __forceinline__ void Load( - const InputT *block_ptr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load - { - 
InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); - } - - /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization) - template < - CacheLoadModifier MODIFIER, - typename ValueType, - typename OffsetT> - __device__ __forceinline__ void Load( - CacheModifiedInputIterator block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load - { - InternalLoadDirectBlockedVectorized(linear_tid, block_itr.ptr, items); - } - - /// Load a linear segment of items from memory, specialized for opaque input iterators (skips vectorization) - template - __device__ __forceinline__ void Load( - _InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load - { - LoadDirectBlocked(linear_tid, block_itr, items); - } - - /// Load a linear segment of items from memory, guarded by range (skips vectorization) - template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items) ///< [in] Number of valid items to load - { - LoadDirectBlocked(linear_tid, block_itr, items, valid_items); - } - - /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements (skips vectorization) - template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items, ///< [in] Number of valid items to load - DefaultT oob_default) ///< [in] Default value to assign out-of-bound items - { - LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default); - } - - }; - - - /** - * BLOCK_LOAD_TRANSPOSE specialization of load helper - */ - template - struct LoadInternal - { - // BlockExchange utility type for keys - typedef BlockExchange BlockExchange; - - /// Shared memory storage layout type - struct _TempStorage : BlockExchange::TempStorage - {}; - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - /// Thread reference to shared storage - _TempStorage &temp_storage; - - /// Linear thread-id - int linear_tid; - - /// Constructor - __device__ __forceinline__ LoadInternal( - TempStorage &temp_storage, - int linear_tid) - : - temp_storage(temp_storage.Alias()), - linear_tid(linear_tid) - {} - - /// Load a linear segment of items from memory - template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load{ - { - LoadDirectStriped(linear_tid, block_itr, items); - BlockExchange(temp_storage).StripedToBlocked(items, items); - } - - /// Load a linear segment of items from memory, guarded by range - template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items) ///< [in] Number of valid items to load - { - LoadDirectStriped(linear_tid, block_itr, items, valid_items); - BlockExchange(temp_storage).StripedToBlocked(items, items); - } - - /// Load a linear segment of items from memory, guarded by range, 
with a fall-back assignment of out-of-bound elements - template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items, ///< [in] Number of valid items to load - DefaultT oob_default) ///< [in] Default value to assign out-of-bound items - { - LoadDirectStriped(linear_tid, block_itr, items, valid_items, oob_default); - BlockExchange(temp_storage).StripedToBlocked(items, items); - } - - }; - - - /** - * BLOCK_LOAD_WARP_TRANSPOSE specialization of load helper - */ - template - struct LoadInternal - { - enum - { - WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH) - }; - - // Assert BLOCK_THREADS must be a multiple of WARP_THREADS - CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); - - // BlockExchange utility type for keys - typedef BlockExchange BlockExchange; - - /// Shared memory storage layout type - struct _TempStorage : BlockExchange::TempStorage - {}; - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - /// Thread reference to shared storage - _TempStorage &temp_storage; - - /// Linear thread-id - int linear_tid; - - /// Constructor - __device__ __forceinline__ LoadInternal( - TempStorage &temp_storage, - int linear_tid) - : - temp_storage(temp_storage.Alias()), - linear_tid(linear_tid) - {} - - /// Load a linear segment of items from memory - template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load{ - { - LoadDirectWarpStriped(linear_tid, block_itr, items); - BlockExchange(temp_storage).WarpStripedToBlocked(items, items); - } - - /// Load a linear segment of items from memory, guarded by range - template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items) ///< [in] Number of valid items to load - { - LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items); - BlockExchange(temp_storage).WarpStripedToBlocked(items, items); - } - - - /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements - template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items, ///< [in] Number of valid items to load - DefaultT oob_default) ///< [in] Default value to assign out-of-bound items - { - LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items, oob_default); - BlockExchange(temp_storage).WarpStripedToBlocked(items, items); - } - }; - - - /** - * BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED specialization of load helper - */ - template - struct LoadInternal - { - enum - { - WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH) - }; - - // Assert BLOCK_THREADS must be a multiple of WARP_THREADS - CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); - - // BlockExchange utility type for keys - typedef BlockExchange BlockExchange; - - /// Shared memory storage layout type - struct _TempStorage : BlockExchange::TempStorage - {}; - - /// Alias wrapper allowing storage to be 
unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - /// Thread reference to shared storage - _TempStorage &temp_storage; - - /// Linear thread-id - int linear_tid; - - /// Constructor - __device__ __forceinline__ LoadInternal( - TempStorage &temp_storage, - int linear_tid) - : - temp_storage(temp_storage.Alias()), - linear_tid(linear_tid) - {} - - /// Load a linear segment of items from memory - template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load{ - { - LoadDirectWarpStriped(linear_tid, block_itr, items); - BlockExchange(temp_storage).WarpStripedToBlocked(items, items); - } - - /// Load a linear segment of items from memory, guarded by range - template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items) ///< [in] Number of valid items to load - { - LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items); - BlockExchange(temp_storage).WarpStripedToBlocked(items, items); - } - - - /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements - template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items, ///< [in] Number of valid items to load - DefaultT oob_default) ///< [in] Default value to assign out-of-bound items - { - LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items, oob_default); - BlockExchange(temp_storage).WarpStripedToBlocked(items, items); - } - }; - - - /****************************************************************************** - * Type definitions - ******************************************************************************/ - - /// Internal load implementation to use - typedef LoadInternal InternalLoad; - - - /// Shared memory storage layout type - typedef typename InternalLoad::TempStorage _TempStorage; - - - /****************************************************************************** - * Utility methods - ******************************************************************************/ - - /// Internal storage allocator - __device__ __forceinline__ _TempStorage& PrivateStorage() - { - __shared__ _TempStorage private_storage; - return private_storage; - } - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - /// Thread reference to shared storage - _TempStorage &temp_storage; - - /// Linear thread-id - int linear_tid; - -public: - - /// \smemstorage{BlockLoad} - struct TempStorage : Uninitialized<_TempStorage> {}; - - - /******************************************************************//** - * \name Collective constructors - *********************************************************************/ - //@{ - - /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. - */ - __device__ __forceinline__ BlockLoad() - : - temp_storage(PrivateStorage()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. 
- */ - __device__ __forceinline__ BlockLoad( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - - - //@} end member group - /******************************************************************//** - * \name Data movement - *********************************************************************/ - //@{ - - - /** - * \brief Load a linear segment of items from memory. - * - * \par - * - \blocked - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates the loading of a linear - * segment of 512 integers into a "blocked" arrangement across 128 threads where each - * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE, - * meaning memory references are efficiently coalesced using a warp-striped access - * pattern (after which items are locally reordered among threads). - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(int *d_data, ...) - * { - * // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each - * typedef cub::BlockLoad BlockLoad; - * - * // Allocate shared memory for BlockLoad - * __shared__ typename BlockLoad::TempStorage temp_storage; - * - * // Load a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * BlockLoad(temp_storage).Load(d_data, thread_data); - * - * \endcode - * \par - * Suppose the input \p d_data is 0, 1, 2, 3, 4, 5, .... - * The set of \p thread_data across the block of threads in those threads will be - * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. - * - */ - template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load - { - InternalLoad(temp_storage, linear_tid).Load(block_itr, items); - } - - - /** - * \brief Load a linear segment of items from memory, guarded by range. - * - * \par - * - \blocked - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates the guarded loading of a linear - * segment of 512 integers into a "blocked" arrangement across 128 threads where each - * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE, - * meaning memory references are efficiently coalesced using a warp-striped access - * pattern (after which items are locally reordered among threads). - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(int *d_data, int valid_items, ...) - * { - * // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each - * typedef cub::BlockLoad BlockLoad; - * - * // Allocate shared memory for BlockLoad - * __shared__ typename BlockLoad::TempStorage temp_storage; - * - * // Load a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * BlockLoad(temp_storage).Load(d_data, thread_data, valid_items); - * - * \endcode - * \par - * Suppose the input \p d_data is 0, 1, 2, 3, 4, 5, 6... and \p valid_items is \p 5. - * The set of \p thread_data across the block of threads in those threads will be - * { [0,1,2,3], [4,?,?,?], ..., [?,?,?,?] }, with only the first two threads - * being unmasked to load portions of valid data (and other items remaining unassigned). 
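For orientation, a minimal self-contained kernel exercising this guarded overload (together with the out-of-bounds-default variant documented just below) might look like the following sketch. The kernel name, buffer names, and the assumed launch of 128 threads per block are illustrative assumptions, not part of the patch:

#include <cub/cub.cuh>

// Assumes a launch with 128 threads per block; names are illustrative.
__global__ void GuardedLoadKernel(const int *d_in, int *d_out, int valid_items)
{
    // 128 threads x 4 items = one 512-item tile
    typedef cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
    __shared__ typename BlockLoad::TempStorage temp_storage;

    int thread_data[4];
    // Guarded load: items past valid_items are assigned the fall-back value -1
    // (omit the last argument to leave out-of-range items unassigned instead)
    BlockLoad(temp_storage).Load(d_in, thread_data, valid_items, -1);

    // thread_data is now in blocked arrangement; write it back out
    for (int i = 0; i < 4; ++i)
        d_out[threadIdx.x * 4 + i] = thread_data[i];
}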
- * - */ - template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items) ///< [in] Number of valid items to load - { - InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items); - } - - - /** - * \brief Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements - * - * \par - * - \blocked - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates the guarded loading of a linear - * segment of 512 integers into a "blocked" arrangement across 128 threads where each - * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE, - * meaning memory references are efficiently coalesced using a warp-striped access - * pattern (after which items are locally reordered among threads). - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(int *d_data, int valid_items, ...) - * { - * // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each - * typedef cub::BlockLoad BlockLoad; - * - * // Allocate shared memory for BlockLoad - * __shared__ typename BlockLoad::TempStorage temp_storage; - * - * // Load a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * BlockLoad(temp_storage).Load(d_data, thread_data, valid_items, -1); - * - * \endcode - * \par - * Suppose the input \p d_data is 0, 1, 2, 3, 4, 5, 6..., - * \p valid_items is \p 5, and the out-of-bounds default is \p -1. - * The set of \p thread_data across the block of threads in those threads will be - * { [0,1,2,3], [4,-1,-1,-1], ..., [-1,-1,-1,-1] }, with only the first two threads - * being unmasked to load portions of valid data (and other items are assigned \p -1) - * - */ - template - __device__ __forceinline__ void Load( - InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from - InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load - int valid_items, ///< [in] Number of valid items to load - DefaultT oob_default) ///< [in] Default value to assign out-of-bound items - { - InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items, oob_default); - } - - - //@} end member group - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/SRC/cub/block/block_radix_rank.cuh b/SRC/cub/block/block_radix_rank.cuh deleted file mode 100644 index c26451c6..00000000 --- a/SRC/cub/block/block_radix_rank.cuh +++ /dev/null @@ -1,696 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block
- */
-
-#pragma once
-
-#include <stdint.h>
-
-#include "../thread/thread_reduce.cuh"
-#include "../thread/thread_scan.cuh"
-#include "../block/block_scan.cuh"
-#include "../util_ptx.cuh"
-#include "../util_arch.cuh"
-#include "../util_type.cuh"
-#include "../util_namespace.cuh"
-
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \brief BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
- * \ingroup BlockModule
- *
- * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
- * \tparam RADIX_BITS           The number of radix bits per digit place
- * \tparam IS_DESCENDING        Whether or not the sorted-order is high-to-low
- * \tparam MEMOIZE_OUTER_SCAN   [optional] Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise). See BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE for more details.
- * \tparam INNER_SCAN_ALGORITHM [optional] The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS)
- * \tparam SMEM_CONFIG          [optional] Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte)
- * \tparam BLOCK_DIM_Y          [optional] The thread block length in threads along the Y dimension (default: 1)
- * \tparam BLOCK_DIM_Z          [optional] The thread block length in threads along the Z dimension (default: 1)
- * \tparam PTX_ARCH             [optional] \ptxversion
- *
- * \par Overview
- * BlockRadixRank computes the rank (output position) of each key within a tile based on the digit extracted at the current bit position, using shared-memory digit counters and a block-wide prefix scan.
- * - Keys must be in a form suitable for radix ranking (i.e., unsigned bits).
- * - \blocked
- *
- * \par Performance Considerations
- * - \granularity
- *
- * \par Examples
- * \par
- * - Example 1: Simple radix rank of 32-bit integer keys
- *      \code
- *      #include <cub/cub.cuh>
- *
- *      template <int BLOCK_THREADS>
- *      __global__ void ExampleKernel(...)
- *      {
- *
- *      \endcode
- */
-template <
-    int                     BLOCK_DIM_X,
-    int                     RADIX_BITS,
-    bool                    IS_DESCENDING,
-    bool                    MEMOIZE_OUTER_SCAN      = (CUB_PTX_ARCH >= 350) ?
true : false, - BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, - cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte, - int BLOCK_DIM_Y = 1, - int BLOCK_DIM_Z = 1, - int PTX_ARCH = CUB_PTX_ARCH> -class BlockRadixRank -{ -private: - - /****************************************************************************** - * Type definitions and constants - ******************************************************************************/ - - // Integer type for digit counters (to be packed into words of type PackedCounters) - typedef unsigned short DigitCounter; - - // Integer type for packing DigitCounters into columns of shared memory banks - typedef typename If<(SMEM_CONFIG == cudaSharedMemBankSizeEightByte), - unsigned long long, - unsigned int>::Type PackedCounter; - - enum - { - // The thread block size in threads - BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, - - RADIX_DIGITS = 1 << RADIX_BITS, - - LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH), - WARP_THREADS = 1 << LOG_WARP_THREADS, - WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, - - BYTES_PER_COUNTER = sizeof(DigitCounter), - LOG_BYTES_PER_COUNTER = Log2::VALUE, - - PACKING_RATIO = sizeof(PackedCounter) / sizeof(DigitCounter), - LOG_PACKING_RATIO = Log2::VALUE, - - LOG_COUNTER_LANES = CUB_MAX((RADIX_BITS - LOG_PACKING_RATIO), 0), // Always at least one lane - COUNTER_LANES = 1 << LOG_COUNTER_LANES, - - // The number of packed counters per thread (plus one for padding) - PADDED_COUNTER_LANES = COUNTER_LANES + 1, - RAKING_SEGMENT = PADDED_COUNTER_LANES, - }; - -public: - - enum - { - /// Number of bin-starting offsets tracked per thread - BINS_TRACKED_PER_THREAD = CUB_MAX(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS), - }; - -private: - - - /// BlockScan type - typedef BlockScan< - PackedCounter, - BLOCK_DIM_X, - INNER_SCAN_ALGORITHM, - BLOCK_DIM_Y, - BLOCK_DIM_Z, - PTX_ARCH> - BlockScan; - - - /// Shared memory storage layout type for BlockRadixRank - struct __align__(16) _TempStorage - { - union Aliasable - { - DigitCounter digit_counters[PADDED_COUNTER_LANES][BLOCK_THREADS][PACKING_RATIO]; - PackedCounter raking_grid[BLOCK_THREADS][RAKING_SEGMENT]; - - } aliasable; - - // Storage for scanning local ranks - typename BlockScan::TempStorage block_scan; - }; - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - /// Shared storage reference - _TempStorage &temp_storage; - - /// Linear thread-id - unsigned int linear_tid; - - /// Copy of raking segment, promoted to registers - PackedCounter cached_segment[RAKING_SEGMENT]; - - - /****************************************************************************** - * Utility methods - ******************************************************************************/ - - /** - * Internal storage allocator - */ - __device__ __forceinline__ _TempStorage& PrivateStorage() - { - __shared__ _TempStorage private_storage; - return private_storage; - } - - - /** - * Performs upsweep raking reduction, returning the aggregate - */ - __device__ __forceinline__ PackedCounter Upsweep() - { - PackedCounter *smem_raking_ptr = temp_storage.aliasable.raking_grid[linear_tid]; - PackedCounter *raking_ptr; - - if (MEMOIZE_OUTER_SCAN) - { - // Copy data into registers - #pragma unroll - for (int i = 0; i < RAKING_SEGMENT; i++) - { - cached_segment[i] = smem_raking_ptr[i]; - } - raking_ptr = cached_segment; - } - else - { - 
raking_ptr = smem_raking_ptr; - } - - return internal::ThreadReduce(raking_ptr, Sum()); - } - - - /// Performs exclusive downsweep raking scan - __device__ __forceinline__ void ExclusiveDownsweep( - PackedCounter raking_partial) - { - PackedCounter *smem_raking_ptr = temp_storage.aliasable.raking_grid[linear_tid]; - - PackedCounter *raking_ptr = (MEMOIZE_OUTER_SCAN) ? - cached_segment : - smem_raking_ptr; - - // Exclusive raking downsweep scan - internal::ThreadScanExclusive(raking_ptr, raking_ptr, Sum(), raking_partial); - - if (MEMOIZE_OUTER_SCAN) - { - // Copy data back to smem - #pragma unroll - for (int i = 0; i < RAKING_SEGMENT; i++) - { - smem_raking_ptr[i] = cached_segment[i]; - } - } - } - - - /** - * Reset shared memory digit counters - */ - __device__ __forceinline__ void ResetCounters() - { - // Reset shared memory digit counters - #pragma unroll - for (int LANE = 0; LANE < PADDED_COUNTER_LANES; LANE++) - { - *((PackedCounter*) temp_storage.aliasable.digit_counters[LANE][linear_tid]) = 0; - } - } - - - /** - * Block-scan prefix callback - */ - struct PrefixCallBack - { - __device__ __forceinline__ PackedCounter operator()(PackedCounter block_aggregate) - { - PackedCounter block_prefix = 0; - - // Propagate totals in packed fields - #pragma unroll - for (int PACKED = 1; PACKED < PACKING_RATIO; PACKED++) - { - block_prefix += block_aggregate << (sizeof(DigitCounter) * 8 * PACKED); - } - - return block_prefix; - } - }; - - - /** - * Scan shared memory digit counters. - */ - __device__ __forceinline__ void ScanCounters() - { - // Upsweep scan - PackedCounter raking_partial = Upsweep(); - - // Compute exclusive sum - PackedCounter exclusive_partial; - PrefixCallBack prefix_call_back; - BlockScan(temp_storage.block_scan).ExclusiveSum(raking_partial, exclusive_partial, prefix_call_back); - - // Downsweep scan with exclusive partial - ExclusiveDownsweep(exclusive_partial); - } - -public: - - /// \smemstorage{BlockScan} - struct TempStorage : Uninitialized<_TempStorage> {}; - - - /******************************************************************//** - * \name Collective constructors - *********************************************************************/ - //@{ - - /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. - */ - __device__ __forceinline__ BlockRadixRank() - : - temp_storage(PrivateStorage()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. - */ - __device__ __forceinline__ BlockRadixRank( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - //@} end member group - /******************************************************************//** - * \name Raking - *********************************************************************/ - //@{ - - /** - * \brief Rank keys. 
- */ - template < - typename UnsignedBits, - int KEYS_PER_THREAD> - __device__ __forceinline__ void RankKeys( - UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile - int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile - int current_bit, ///< [in] The least-significant bit position of the current digit to extract - int num_bits) ///< [in] The number of bits in the current digit - { - DigitCounter thread_prefixes[KEYS_PER_THREAD]; // For each key, the count of previous keys in this tile having the same digit - DigitCounter* digit_counters[KEYS_PER_THREAD]; // For each key, the byte-offset of its corresponding digit counter in smem - - // Reset shared memory digit counters - ResetCounters(); - - #pragma unroll - for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM) - { - // Get digit - unsigned int digit = BFE(keys[ITEM], current_bit, num_bits); - - // Get sub-counter - unsigned int sub_counter = digit >> LOG_COUNTER_LANES; - - // Get counter lane - unsigned int counter_lane = digit & (COUNTER_LANES - 1); - - if (IS_DESCENDING) - { - sub_counter = PACKING_RATIO - 1 - sub_counter; - counter_lane = COUNTER_LANES - 1 - counter_lane; - } - - // Pointer to smem digit counter - digit_counters[ITEM] = &temp_storage.aliasable.digit_counters[counter_lane][linear_tid][sub_counter]; - - // Load thread-exclusive prefix - thread_prefixes[ITEM] = *digit_counters[ITEM]; - - // Store inclusive prefix - *digit_counters[ITEM] = thread_prefixes[ITEM] + 1; - } - - CTA_SYNC(); - - // Scan shared memory counters - ScanCounters(); - - CTA_SYNC(); - - // Extract the local ranks of each key - for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM) - { - // Add in thread block exclusive prefix - ranks[ITEM] = thread_prefixes[ITEM] + *digit_counters[ITEM]; - } - } - - - /** - * \brief Rank keys. For the lower \p RADIX_DIGITS threads, digit counts for each digit are provided for the corresponding thread. - */ - template < - typename UnsignedBits, - int KEYS_PER_THREAD> - __device__ __forceinline__ void RankKeys( - UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile - int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile (out parameter) - int current_bit, ///< [in] The least-significant bit position of the current digit to extract - int num_bits, ///< [in] The number of bits in the current digit - int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD]) ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] - { - // Rank keys - RankKeys(keys, ranks, current_bit, num_bits); - - // Get the inclusive and exclusive digit totals corresponding to the calling thread. - #pragma unroll - for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) - { - int bin_idx = (linear_tid * BINS_TRACKED_PER_THREAD) + track; - - if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) - { - if (IS_DESCENDING) - bin_idx = RADIX_DIGITS - bin_idx - 1; - - // Obtain ex/inclusive digit counts. (Unfortunately these all reside in the - // first counter column, resulting in unavoidable bank conflicts.) 
- unsigned int counter_lane = (bin_idx & (COUNTER_LANES - 1)); - unsigned int sub_counter = bin_idx >> (LOG_COUNTER_LANES); - - exclusive_digit_prefix[track] = temp_storage.aliasable.digit_counters[counter_lane][0][sub_counter]; - } - } - } -}; - - - - - -/** - * Radix-rank using match.any - */ -template < - int BLOCK_DIM_X, - int RADIX_BITS, - bool IS_DESCENDING, - BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, - int BLOCK_DIM_Y = 1, - int BLOCK_DIM_Z = 1, - int PTX_ARCH = CUB_PTX_ARCH> -class BlockRadixRankMatch -{ -private: - - /****************************************************************************** - * Type definitions and constants - ******************************************************************************/ - - typedef int32_t RankT; - typedef int32_t DigitCounterT; - - enum - { - // The thread block size in threads - BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, - - RADIX_DIGITS = 1 << RADIX_BITS, - - LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH), - WARP_THREADS = 1 << LOG_WARP_THREADS, - WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, - - PADDED_WARPS = ((WARPS & 0x1) == 0) ? - WARPS + 1 : - WARPS, - - COUNTERS = PADDED_WARPS * RADIX_DIGITS, - RAKING_SEGMENT = (COUNTERS + BLOCK_THREADS - 1) / BLOCK_THREADS, - PADDED_RAKING_SEGMENT = ((RAKING_SEGMENT & 0x1) == 0) ? - RAKING_SEGMENT + 1 : - RAKING_SEGMENT, - }; - -public: - - enum - { - /// Number of bin-starting offsets tracked per thread - BINS_TRACKED_PER_THREAD = CUB_MAX(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS), - }; - -private: - - /// BlockScan type - typedef BlockScan< - DigitCounterT, - BLOCK_THREADS, - INNER_SCAN_ALGORITHM, - BLOCK_DIM_Y, - BLOCK_DIM_Z, - PTX_ARCH> - BlockScanT; - - - /// Shared memory storage layout type for BlockRadixRank - struct __align__(16) _TempStorage - { - typename BlockScanT::TempStorage block_scan; - - union __align__(16) Aliasable - { - volatile DigitCounterT warp_digit_counters[RADIX_DIGITS][PADDED_WARPS]; - DigitCounterT raking_grid[BLOCK_THREADS][PADDED_RAKING_SEGMENT]; - - } aliasable; - }; - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - /// Shared storage reference - _TempStorage &temp_storage; - - /// Linear thread-id - unsigned int linear_tid; - - - -public: - - /// \smemstorage{BlockScan} - struct TempStorage : Uninitialized<_TempStorage> {}; - - - /******************************************************************//** - * \name Collective constructors - *********************************************************************/ - //@{ - - - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. - */ - __device__ __forceinline__ BlockRadixRankMatch( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - //@} end member group - /******************************************************************//** - * \name Raking - *********************************************************************/ - //@{ - - /** - * \brief Rank keys. 
- */ - template < - typename UnsignedBits, - int KEYS_PER_THREAD> - __device__ __forceinline__ void RankKeys( - UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile - int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile - int current_bit, ///< [in] The least-significant bit position of the current digit to extract - int num_bits) ///< [in] The number of bits in the current digit - { - // Initialize shared digit counters - - #pragma unroll - for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM) - temp_storage.aliasable.raking_grid[linear_tid][ITEM] = 0; - - CTA_SYNC(); - - // Each warp will strip-mine its section of input, one strip at a time - - volatile DigitCounterT *digit_counters[KEYS_PER_THREAD]; - uint32_t warp_id = linear_tid >> LOG_WARP_THREADS; - uint32_t lane_mask_lt = LaneMaskLt(); - - #pragma unroll - for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM) - { - // My digit - uint32_t digit = BFE(keys[ITEM], current_bit, num_bits); - - if (IS_DESCENDING) - digit = RADIX_DIGITS - digit - 1; - - // Mask of peers who have same digit as me - uint32_t peer_mask = MatchAny(digit); - - // Pointer to smem digit counter for this key - digit_counters[ITEM] = &temp_storage.aliasable.warp_digit_counters[digit][warp_id]; - - // Number of occurrences in previous strips - DigitCounterT warp_digit_prefix = *digit_counters[ITEM]; - - // Warp-sync - WARP_SYNC(0xFFFFFFFF); - - // Number of peers having same digit as me - int32_t digit_count = __popc(peer_mask); - - // Number of lower-ranked peers having same digit seen so far - int32_t peer_digit_prefix = __popc(peer_mask & lane_mask_lt); - - if (peer_digit_prefix == 0) - { - // First thread for each digit updates the shared warp counter - *digit_counters[ITEM] = DigitCounterT(warp_digit_prefix + digit_count); - } - - // Warp-sync - WARP_SYNC(0xFFFFFFFF); - - // Number of prior keys having same digit - ranks[ITEM] = warp_digit_prefix + DigitCounterT(peer_digit_prefix); - } - - CTA_SYNC(); - - // Scan warp counters - - DigitCounterT scan_counters[PADDED_RAKING_SEGMENT]; - - #pragma unroll - for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM) - scan_counters[ITEM] = temp_storage.aliasable.raking_grid[linear_tid][ITEM]; - - BlockScanT(temp_storage.block_scan).ExclusiveSum(scan_counters, scan_counters); - - #pragma unroll - for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM) - temp_storage.aliasable.raking_grid[linear_tid][ITEM] = scan_counters[ITEM]; - - CTA_SYNC(); - - // Seed ranks with counter values from previous warps - #pragma unroll - for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM) - ranks[ITEM] += *digit_counters[ITEM]; - } - - - /** - * \brief Rank keys. For the lower \p RADIX_DIGITS threads, digit counts for each digit are provided for the corresponding thread. - */ - template < - typename UnsignedBits, - int KEYS_PER_THREAD> - __device__ __forceinline__ void RankKeys( - UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile - int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile (out parameter) - int current_bit, ///< [in] The least-significant bit position of the current digit to extract - int num_bits, ///< [in] The number of bits in the current digit - int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD]) ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... 
(threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] - { - RankKeys(keys, ranks, current_bit, num_bits); - - // Get exclusive count for each digit - #pragma unroll - for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) - { - int bin_idx = (linear_tid * BINS_TRACKED_PER_THREAD) + track; - - if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) - { - if (IS_DESCENDING) - bin_idx = RADIX_DIGITS - bin_idx - 1; - - exclusive_digit_prefix[track] = temp_storage.aliasable.warp_digit_counters[bin_idx][0]; - } - } - } -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/SRC/cub/block/block_radix_sort.cuh b/SRC/cub/block/block_radix_sort.cuh deleted file mode 100644 index ac0c9f85..00000000 --- a/SRC/cub/block/block_radix_sort.cuh +++ /dev/null @@ -1,863 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * The cub::BlockRadixSort class provides [collective](index.html#sec0) methods for radix sorting of items partitioned across a CUDA thread block. - */ - - -#pragma once - -#include "block_exchange.cuh" -#include "block_radix_rank.cuh" -#include "../util_ptx.cuh" -#include "../util_arch.cuh" -#include "../util_type.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \brief The BlockRadixSort class provides [collective](index.html#sec0) methods for sorting items partitioned across a CUDA thread block using a radix sorting method. 
![](sorting_logo.png) - * \ingroup BlockModule - * - * \tparam KeyT KeyT type - * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension - * \tparam ITEMS_PER_THREAD The number of items per thread - * \tparam ValueT [optional] ValueT type (default: cub::NullType, which indicates a keys-only sort) - * \tparam RADIX_BITS [optional] The number of radix bits per digit place (default: 4 bits) - * \tparam MEMOIZE_OUTER_SCAN [optional] Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise). - * \tparam INNER_SCAN_ALGORITHM [optional] The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS) - * \tparam SMEM_CONFIG [optional] Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte) - * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) - * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam PTX_ARCH [optional] \ptxversion - * - * \par Overview - * - The [radix sorting method](http://en.wikipedia.org/wiki/Radix_sort) arranges - * items into ascending order. It relies upon a positional representation for - * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits, - * characters, etc.) specified from least-significant to most-significant. For a - * given input sequence of keys and a set of rules specifying a total ordering - * of the symbolic alphabet, the radix sorting method produces a lexicographic - * ordering of those keys. - * - BlockRadixSort can sort all of the built-in C++ numeric primitive types - * (unsigned char, \p int, \p double, etc.) as well as CUDA's \p __half - * half-precision floating-point type. Within each key, the implementation treats fixed-length - * bit-sequences of \p RADIX_BITS as radix digit places. Although the direct radix sorting - * method can only be applied to unsigned integral types, BlockRadixSort - * is able to sort signed and floating-point types via simple bit-wise transformations - * that ensure lexicographic key ordering. - * - \rowmajor - * - * \par Performance Considerations - * - \granularity - * - * \par A Simple Example - * \blockcollective{BlockRadixSort} - * \par - * The code snippet below illustrates a sort of 512 integer keys that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer items each - * typedef cub::BlockRadixSort BlockRadixSort; - * - * // Allocate shared memory for BlockRadixSort - * __shared__ typename BlockRadixSort::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_keys[4]; - * ... - * - * // Collectively sort the keys - * BlockRadixSort(temp_storage).Sort(thread_keys); - * - * ... - * \endcode - * \par - * Suppose the set of input \p thread_keys across the block of threads is - * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The - * corresponding output \p thread_keys in those threads will be - * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. 
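For reference, this simple example might be fleshed out into a complete, self-contained kernel along the lines of the sketch below, which also shows the \smemreuse idiom of placing the collectives' temporary storage in a shared-memory union. The load and store stages, the kernel and buffer names, and the assumption that d_keys holds at least 512 items per block are illustrative additions, not part of the patch:

#include <cub/cub.cuh>

// Assumes a launch with 128 threads per block and >= 512 keys per block tile.
__global__ void SortTileKernel(int *d_keys)
{
    typedef cub::BlockRadixSort<int, 128, 4>                          BlockRadixSort;
    typedef cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_TRANSPOSE>    BlockLoad;
    typedef cub::BlockStore<int, 128, 4, cub::BLOCK_STORE_TRANSPOSE>  BlockStore;

    // The three collectives are used in sequence, so their storage can alias
    __shared__ union {
        typename BlockLoad::TempStorage      load;
        typename BlockRadixSort::TempStorage sort;
        typename BlockStore::TempStorage     store;
    } temp_storage;

    int thread_keys[4];
    BlockLoad(temp_storage.load).Load(d_keys, thread_keys);
    __syncthreads();   // barrier before reusing the unioned shared memory

    BlockRadixSort(temp_storage.sort).Sort(thread_keys);
    __syncthreads();

    BlockStore(temp_storage.store).Store(d_keys, thread_keys);
}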
- *
- */
-template <
-    typename                KeyT,
-    int                     BLOCK_DIM_X,
-    int                     ITEMS_PER_THREAD,
-    typename                ValueT                  = NullType,
-    int                     RADIX_BITS              = 4,
-    bool                    MEMOIZE_OUTER_SCAN      = (CUB_PTX_ARCH >= 350) ? true : false,
-    BlockScanAlgorithm      INNER_SCAN_ALGORITHM    = BLOCK_SCAN_WARP_SCANS,
-    cudaSharedMemConfig     SMEM_CONFIG             = cudaSharedMemBankSizeFourByte,
-    int                     BLOCK_DIM_Y             = 1,
-    int                     BLOCK_DIM_Z             = 1,
-    int                     PTX_ARCH                = CUB_PTX_ARCH>
-class BlockRadixSort
-{
-private:
-
-    /******************************************************************************
-     * Constants and type definitions
-     ******************************************************************************/
-
-    enum
-    {
-        // The thread block size in threads
-        BLOCK_THREADS               = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-
-        // Whether or not there are values to be trucked along with keys
-        KEYS_ONLY                   = Equals<ValueT, NullType>::VALUE,
-    };
-
-    // KeyT traits and unsigned bits type
-    typedef Traits<KeyT>                        KeyTraits;
-    typedef typename KeyTraits::UnsignedBits    UnsignedBits;
-
-    /// Ascending BlockRadixRank utility type
-    typedef BlockRadixRank<
-            BLOCK_DIM_X,
-            RADIX_BITS,
-            false,
-            MEMOIZE_OUTER_SCAN,
-            INNER_SCAN_ALGORITHM,
-            SMEM_CONFIG,
-            BLOCK_DIM_Y,
-            BLOCK_DIM_Z,
-            PTX_ARCH>
-        AscendingBlockRadixRank;
-
-    /// Descending BlockRadixRank utility type
-    typedef BlockRadixRank<
-            BLOCK_DIM_X,
-            RADIX_BITS,
-            true,
-            MEMOIZE_OUTER_SCAN,
-            INNER_SCAN_ALGORITHM,
-            SMEM_CONFIG,
-            BLOCK_DIM_Y,
-            BLOCK_DIM_Z,
-            PTX_ARCH>
-        DescendingBlockRadixRank;
-
-    /// BlockExchange utility type for keys
-    typedef BlockExchange<KeyT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchangeKeys;
-
-    /// BlockExchange utility type for values
-    typedef BlockExchange<ValueT, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchangeValues;
-
-    /// Shared memory storage layout type
-    union _TempStorage
-    {
-        typename AscendingBlockRadixRank::TempStorage   ascending_ranking_storage;
-        typename DescendingBlockRadixRank::TempStorage  descending_ranking_storage;
-        typename BlockExchangeKeys::TempStorage         exchange_keys;
-        typename BlockExchangeValues::TempStorage       exchange_values;
-    };
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    /// Shared storage reference
-    _TempStorage &temp_storage;
-
-    /// Linear thread-id
-    unsigned int linear_tid;
-
-    /******************************************************************************
-     * Utility methods
-     ******************************************************************************/
-
-    /// Internal storage allocator
-    __device__ __forceinline__ _TempStorage& PrivateStorage()
-    {
-        __shared__ _TempStorage private_storage;
-        return private_storage;
-    }
-
-    /// Rank keys (specialized for ascending sort)
-    __device__ __forceinline__ void RankKeys(
-        UnsignedBits    (&unsigned_keys)[ITEMS_PER_THREAD],
-        int             (&ranks)[ITEMS_PER_THREAD],
-        int             begin_bit,
-        int             pass_bits,
-        Int2Type<false> /*is_descending*/)
-    {
-        AscendingBlockRadixRank(temp_storage.ascending_ranking_storage).RankKeys(
-            unsigned_keys,
-            ranks,
-            begin_bit,
-            pass_bits);
-    }
-
-    /// Rank keys (specialized for descending sort)
-    __device__ __forceinline__ void RankKeys(
-        UnsignedBits    (&unsigned_keys)[ITEMS_PER_THREAD],
-        int             (&ranks)[ITEMS_PER_THREAD],
-        int             begin_bit,
-        int             pass_bits,
-        Int2Type<true>  /*is_descending*/)
-    {
-        DescendingBlockRadixRank(temp_storage.descending_ranking_storage).RankKeys(
-            unsigned_keys,
-            ranks,
-            begin_bit,
-            pass_bits);
-    }
-
-    /// ExchangeValues (specialized for key-value sort, to-blocked arrangement)
-    __device__ __forceinline__ void ExchangeValues(
-        ValueT
(&values)[ITEMS_PER_THREAD], - int (&ranks)[ITEMS_PER_THREAD], - Int2Type /*is_keys_only*/, - Int2Type /*is_blocked*/) - { - CTA_SYNC(); - - // Exchange values through shared memory in blocked arrangement - BlockExchangeValues(temp_storage.exchange_values).ScatterToBlocked(values, ranks); - } - - /// ExchangeValues (specialized for key-value sort, to-striped arrangement) - __device__ __forceinline__ void ExchangeValues( - ValueT (&values)[ITEMS_PER_THREAD], - int (&ranks)[ITEMS_PER_THREAD], - Int2Type /*is_keys_only*/, - Int2Type /*is_blocked*/) - { - CTA_SYNC(); - - // Exchange values through shared memory in blocked arrangement - BlockExchangeValues(temp_storage.exchange_values).ScatterToStriped(values, ranks); - } - - /// ExchangeValues (specialized for keys-only sort) - template - __device__ __forceinline__ void ExchangeValues( - ValueT (&/*values*/)[ITEMS_PER_THREAD], - int (&/*ranks*/)[ITEMS_PER_THREAD], - Int2Type /*is_keys_only*/, - Int2Type /*is_blocked*/) - {} - - /// Sort blocked arrangement - template - __device__ __forceinline__ void SortBlocked( - KeyT (&keys)[ITEMS_PER_THREAD], ///< Keys to sort - ValueT (&values)[ITEMS_PER_THREAD], ///< Values to sort - int begin_bit, ///< The beginning (least-significant) bit index needed for key comparison - int end_bit, ///< The past-the-end (most-significant) bit index needed for key comparison - Int2Type is_descending, ///< Tag whether is a descending-order sort - Int2Type is_keys_only) ///< Tag whether is keys-only sort - { - UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] = - reinterpret_cast(keys); - - // Twiddle bits if necessary - #pragma unroll - for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) - { - unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]); - } - - // Radix sorting passes - while (true) - { - int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit); - - // Rank the blocked keys - int ranks[ITEMS_PER_THREAD]; - RankKeys(unsigned_keys, ranks, begin_bit, pass_bits, is_descending); - begin_bit += RADIX_BITS; - - CTA_SYNC(); - - // Exchange keys through shared memory in blocked arrangement - BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks); - - // Exchange values through shared memory in blocked arrangement - ExchangeValues(values, ranks, is_keys_only, Int2Type()); - - // Quit if done - if (begin_bit >= end_bit) break; - - CTA_SYNC(); - } - - // Untwiddle bits if necessary - #pragma unroll - for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) - { - unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]); - } - } - -public: - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - /// Sort blocked -> striped arrangement - template - __device__ __forceinline__ void SortBlockedToStriped( - KeyT (&keys)[ITEMS_PER_THREAD], ///< Keys to sort - ValueT (&values)[ITEMS_PER_THREAD], ///< Values to sort - int begin_bit, ///< The beginning (least-significant) bit index needed for key comparison - int end_bit, ///< The past-the-end (most-significant) bit index needed for key comparison - Int2Type is_descending, ///< Tag whether is a descending-order sort - Int2Type is_keys_only) ///< Tag whether is keys-only sort - { - UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] = - reinterpret_cast(keys); - - // Twiddle bits if necessary - #pragma unroll - for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) - { - unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]); - } - - // Radix sorting passes - while (true) - { - int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit); - - // Rank the blocked 
keys - int ranks[ITEMS_PER_THREAD]; - RankKeys(unsigned_keys, ranks, begin_bit, pass_bits, is_descending); - begin_bit += RADIX_BITS; - - CTA_SYNC(); - - // Check if this is the last pass - if (begin_bit >= end_bit) - { - // Last pass exchanges keys through shared memory in striped arrangement - BlockExchangeKeys(temp_storage.exchange_keys).ScatterToStriped(keys, ranks); - - // Last pass exchanges through shared memory in striped arrangement - ExchangeValues(values, ranks, is_keys_only, Int2Type()); - - // Quit - break; - } - - // Exchange keys through shared memory in blocked arrangement - BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks); - - // Exchange values through shared memory in blocked arrangement - ExchangeValues(values, ranks, is_keys_only, Int2Type()); - - CTA_SYNC(); - } - - // Untwiddle bits if necessary - #pragma unroll - for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) - { - unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]); - } - } - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - /// \smemstorage{BlockRadixSort} - struct TempStorage : Uninitialized<_TempStorage> {}; - - - /******************************************************************//** - * \name Collective constructors - *********************************************************************/ - //@{ - - /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. - */ - __device__ __forceinline__ BlockRadixSort() - : - temp_storage(PrivateStorage()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. - */ - __device__ __forceinline__ BlockRadixSort( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - //@} end member group - /******************************************************************//** - * \name Sorting (blocked arrangements) - *********************************************************************/ - //@{ - - /** - * \brief Performs an ascending block-wide radix sort over a [blocked arrangement](index.html#sec5sec3) of keys. - * - * \par - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a sort of 512 integer keys that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive keys. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each - * typedef cub::BlockRadixSort BlockRadixSort; - * - * // Allocate shared memory for BlockRadixSort - * __shared__ typename BlockRadixSort::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_keys[4]; - * ... - * - * // Collectively sort the keys - * BlockRadixSort(temp_storage).Sort(thread_keys); - * - * \endcode - * \par - * Suppose the set of input \p thread_keys across the block of threads is - * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. - * The corresponding output \p thread_keys in those threads will be - * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. 
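The key-value Sort() overload documented further below can carry one tile of associated values alongside the keys. A common use, sketched here under assumed names and a 128-thread launch, pairs each key with its original index so the sorted indices can serve as a gather vector for any other associated tile data:

#include <cub/cub.cuh>

// Assumes a launch with 128 threads per block; names are illustrative.
__global__ void SortPairsKernel(int *d_keys, int *d_indices)
{
    typedef cub::BlockRadixSort<int, 128, 4, int> BlockRadixSort;
    __shared__ typename BlockRadixSort::TempStorage temp_storage;

    int thread_keys[4], thread_values[4];
    for (int i = 0; i < 4; ++i)
    {
        thread_keys[i]   = d_keys[threadIdx.x * 4 + i];
        thread_values[i] = threadIdx.x * 4 + i;   // enumerate original positions
    }

    // Sort the (key, index) pairs together
    BlockRadixSort(temp_storage).Sort(thread_keys, thread_values);

    // thread_values now holds the gather indices of the sorted keys
    for (int i = 0; i < 4; ++i)
        d_indices[threadIdx.x * 4 + i] = thread_values[i];
}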
- */ - __device__ __forceinline__ void Sort( - KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort - int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison - int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison - { - NullType values[ITEMS_PER_THREAD]; - - SortBlocked(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); - } - - - /** - * \brief Performs an ascending block-wide radix sort across a [blocked arrangement](index.html#sec5sec3) of keys and values. - * - * \par - * - BlockRadixSort can only accommodate one associated tile of values. To "truck along" - * more than one tile of values, simply perform a key-value sort of the keys paired - * with a temporary value array that enumerates the key indices. The reordered indices - * can then be used as a gather-vector for exchanging other associated tile data through - * shared memory. - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a sort of 512 integer keys and values that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive pairs. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each - * typedef cub::BlockRadixSort BlockRadixSort; - * - * // Allocate shared memory for BlockRadixSort - * __shared__ typename BlockRadixSort::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_keys[4]; - * int thread_values[4]; - * ... - * - * // Collectively sort the keys and values among block threads - * BlockRadixSort(temp_storage).Sort(thread_keys, thread_values); - * - * \endcode - * \par - * Suppose the set of input \p thread_keys across the block of threads is - * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The - * corresponding output \p thread_keys in those threads will be - * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. - * - */ - __device__ __forceinline__ void Sort( - KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort - ValueT (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort - int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison - int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison - { - SortBlocked(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); - } - - /** - * \brief Performs a descending block-wide radix sort over a [blocked arrangement](index.html#sec5sec3) of keys. - * - * \par - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a sort of 512 integer keys that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive keys. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) 
- * { - * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each - * typedef cub::BlockRadixSort BlockRadixSort; - * - * // Allocate shared memory for BlockRadixSort - * __shared__ typename BlockRadixSort::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_keys[4]; - * ... - * - * // Collectively sort the keys - * BlockRadixSort(temp_storage).Sort(thread_keys); - * - * \endcode - * \par - * Suppose the set of input \p thread_keys across the block of threads is - * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. - * The corresponding output \p thread_keys in those threads will be - * { [511,510,509,508], [11,10,9,8], [7,6,5,4], ..., [3,2,1,0] }. - */ - __device__ __forceinline__ void SortDescending( - KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort - int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison - int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison - { - NullType values[ITEMS_PER_THREAD]; - - SortBlocked(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); - } - - - /** - * \brief Performs a descending block-wide radix sort across a [blocked arrangement](index.html#sec5sec3) of keys and values. - * - * \par - * - BlockRadixSort can only accommodate one associated tile of values. To "truck along" - * more than one tile of values, simply perform a key-value sort of the keys paired - * with a temporary value array that enumerates the key indices. The reordered indices - * can then be used as a gather-vector for exchanging other associated tile data through - * shared memory. - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a sort of 512 integer keys and values that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive pairs. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each - * typedef cub::BlockRadixSort BlockRadixSort; - * - * // Allocate shared memory for BlockRadixSort - * __shared__ typename BlockRadixSort::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_keys[4]; - * int thread_values[4]; - * ... - * - * // Collectively sort the keys and values among block threads - * BlockRadixSort(temp_storage).Sort(thread_keys, thread_values); - * - * \endcode - * \par - * Suppose the set of input \p thread_keys across the block of threads is - * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The - * corresponding output \p thread_keys in those threads will be - * { [511,510,509,508], [11,10,9,8], [7,6,5,4], ..., [3,2,1,0] }. 
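The optional begin_bit/end_bit arguments restrict the comparison to a sub-range of the key bits, which saves radix passes when only part of the key is significant. A minimal sketch, where the kernel name and the choice of the low 16 bits are assumptions for illustration:

#include <cub/cub.cuh>

// Assumes a launch with 128 threads per block; only bits [0, 16) participate,
// so keys that tie in that range end up in no particular relative order.
__global__ void SortLowBitsDescending(unsigned int *d_keys)
{
    typedef cub::BlockRadixSort<unsigned int, 128, 4> BlockRadixSort;
    __shared__ typename BlockRadixSort::TempStorage temp_storage;

    unsigned int thread_keys[4];
    for (int i = 0; i < 4; ++i)
        thread_keys[i] = d_keys[threadIdx.x * 4 + i];

    // Descending sort on the low 16 bits only (two 8-bit passes at RADIX_BITS=4 x4)
    BlockRadixSort(temp_storage).SortDescending(thread_keys, 0, 16);

    for (int i = 0; i < 4; ++i)
        d_keys[threadIdx.x * 4 + i] = thread_keys[i];
}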
- * - */ - __device__ __forceinline__ void SortDescending( - KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort - ValueT (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort - int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison - int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison - { - SortBlocked(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); - } - - - //@} end member group - /******************************************************************//** - * \name Sorting (blocked arrangement -> striped arrangement) - *********************************************************************/ - //@{ - - - /** - * \brief Performs an ascending radix sort across a [blocked arrangement](index.html#sec5sec3) of keys, leaving them in a [striped arrangement](index.html#sec5sec3). - * - * \par - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a sort of 512 integer keys that - * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive keys. The final partitioning is striped. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each - * typedef cub::BlockRadixSort BlockRadixSort; - * - * // Allocate shared memory for BlockRadixSort - * __shared__ typename BlockRadixSort::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_keys[4]; - * ... - * - * // Collectively sort the keys - * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys); - * - * \endcode - * \par - * Suppose the set of input \p thread_keys across the block of threads is - * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The - * corresponding output \p thread_keys in those threads will be - * { [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }. - * - */ - __device__ __forceinline__ void SortBlockedToStriped( - KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort - int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison - int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison - { - NullType values[ITEMS_PER_THREAD]; - - SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); - } - - - /** - * \brief Performs an ascending radix sort across a [blocked arrangement](index.html#sec5sec3) of keys and values, leaving them in a [striped arrangement](index.html#sec5sec3). - * - * \par - * - BlockRadixSort can only accommodate one associated tile of values. To "truck along" - * more than one tile of values, simply perform a key-value sort of the keys paired - * with a temporary value array that enumerates the key indices. The reordered indices - * can then be used as a gather-vector for exchanging other associated tile data through - * shared memory. - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a sort of 512 integer keys and values that - * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive pairs. The final partitioning is striped. 
- * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each - * typedef cub::BlockRadixSort BlockRadixSort; - * - * // Allocate shared memory for BlockRadixSort - * __shared__ typename BlockRadixSort::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_keys[4]; - * int thread_values[4]; - * ... - * - * // Collectively sort the keys and values among block threads - * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_values); - * - * \endcode - * \par - * Suppose the set of input \p thread_keys across the block of threads is - * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The - * corresponding output \p thread_keys in those threads will be - * { [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }. - * - */ - __device__ __forceinline__ void SortBlockedToStriped( - KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort - ValueT (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort - int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison - int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison - { - SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); - } - - - /** - * \brief Performs a descending radix sort across a [blocked arrangement](index.html#sec5sec3) of keys, leaving them in a [striped arrangement](index.html#sec5sec3). - * - * \par - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a sort of 512 integer keys that - * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive keys. The final partitioning is striped. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each - * typedef cub::BlockRadixSort BlockRadixSort; - * - * // Allocate shared memory for BlockRadixSort - * __shared__ typename BlockRadixSort::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_keys[4]; - * ... - * - * // Collectively sort the keys - * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys); - * - * \endcode - * \par - * Suppose the set of input \p thread_keys across the block of threads is - * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The - * corresponding output \p thread_keys in those threads will be - * { [511,383,255,127], [386,258,130,2], [385,257,128,1], ..., [384,256,128,0] }. 
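One practical reason to prefer the blocked-to-striped variants is that a striped output arrangement lets the subsequent global store coalesce without a separate exchange step. A minimal sketch under assumed names and a 128-thread launch:

#include <cub/cub.cuh>

// Assumes a launch with 128 threads per block; names are illustrative.
__global__ void SortDescendingToStriped(int *d_keys)
{
    enum { BLOCK_THREADS = 128, ITEMS_PER_THREAD = 4 };
    typedef cub::BlockRadixSort<int, BLOCK_THREADS, ITEMS_PER_THREAD> BlockRadixSort;
    __shared__ typename BlockRadixSort::TempStorage temp_storage;

    int thread_keys[ITEMS_PER_THREAD];
    // Blocked input: thread t owns items 4t .. 4t+3
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
        thread_keys[i] = d_keys[threadIdx.x * ITEMS_PER_THREAD + i];

    BlockRadixSort(temp_storage).SortDescendingBlockedToStriped(thread_keys);

    // Striped output: thread t now owns items t, t+128, t+256, t+384,
    // so this store is fully coalesced
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
        d_keys[threadIdx.x + i * BLOCK_THREADS] = thread_keys[i];
}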
- * - */ - __device__ __forceinline__ void SortDescendingBlockedToStriped( - KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort - int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison - int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison - { - NullType values[ITEMS_PER_THREAD]; - - SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); - } - - - /** - * \brief Performs a descending radix sort across a [blocked arrangement](index.html#sec5sec3) of keys and values, leaving them in a [striped arrangement](index.html#sec5sec3). - * - * \par - * - BlockRadixSort can only accommodate one associated tile of values. To "truck along" - * more than one tile of values, simply perform a key-value sort of the keys paired - * with a temporary value array that enumerates the key indices. The reordered indices - * can then be used as a gather-vector for exchanging other associated tile data through - * shared memory. - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a sort of 512 integer keys and values that - * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive pairs. The final partitioning is striped. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each - * typedef cub::BlockRadixSort BlockRadixSort; - * - * // Allocate shared memory for BlockRadixSort - * __shared__ typename BlockRadixSort::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_keys[4]; - * int thread_values[4]; - * ... - * - * // Collectively sort the keys and values among block threads - * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_values); - * - * \endcode - * \par - * Suppose the set of input \p thread_keys across the block of threads is - * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The - * corresponding output \p thread_keys in those threads will be - * { [511,383,255,127], [386,258,130,2], [385,257,128,1], ..., [384,256,128,0] }. - * - */ - __device__ __forceinline__ void SortDescendingBlockedToStriped( - KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort - ValueT (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort - int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison - int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison - { - SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); - } - - - //@} end member group - -}; - -/** - * \example example_block_radix_sort.cu - */ - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/SRC/cub/block/block_raking_layout.cuh b/SRC/cub/block/block_raking_layout.cuh deleted file mode 100644 index 35006168..00000000 --- a/SRC/cub/block/block_raking_layout.cuh +++ /dev/null @@ -1,152 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::BlockRakingLayout provides a conflict-free shared memory layout abstraction for warp-raking across thread block data. - */ - - -#pragma once - -#include "../util_macro.cuh" -#include "../util_arch.cuh" -#include "../util_type.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \brief BlockRakingLayout provides a conflict-free shared memory layout abstraction for 1D raking across thread block data. ![](raking.png) - * \ingroup BlockModule - * - * \par Overview - * This type facilitates a shared memory usage pattern where a block of CUDA - * threads places elements into shared memory and then reduces the active - * parallelism to one "raking" warp of threads for serially aggregating consecutive - * sequences of shared items. Padding is inserted to eliminate bank conflicts - * (for most data types). - * - * \tparam T The data type to be exchanged. - * \tparam BLOCK_THREADS The thread block size in threads. 
- * \tparam PTX_ARCH [optional] \ptxversion - */ -template < - typename T, - int BLOCK_THREADS, - int PTX_ARCH = CUB_PTX_ARCH> -struct BlockRakingLayout -{ - //--------------------------------------------------------------------- - // Constants and type definitions - //--------------------------------------------------------------------- - - enum - { - /// The total number of elements that need to be cooperatively reduced - SHARED_ELEMENTS = BLOCK_THREADS, - - /// Maximum number of warp-synchronous raking threads - MAX_RAKING_THREADS = CUB_MIN(BLOCK_THREADS, CUB_WARP_THREADS(PTX_ARCH)), - - /// Number of raking elements per warp-synchronous raking thread (rounded up) - SEGMENT_LENGTH = (SHARED_ELEMENTS + MAX_RAKING_THREADS - 1) / MAX_RAKING_THREADS, - - /// Never use a raking thread that will have no valid data (e.g., when BLOCK_THREADS is 62 and SEGMENT_LENGTH is 2, we should only use 31 raking threads) - RAKING_THREADS = (SHARED_ELEMENTS + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH, - - /// Whether we will have bank conflicts (technically we should find out if the GCD is > 1) - HAS_CONFLICTS = (CUB_SMEM_BANKS(PTX_ARCH) % SEGMENT_LENGTH == 0), - - /// Degree of bank conflicts (e.g., 4-way) - CONFLICT_DEGREE = (HAS_CONFLICTS) ? - (MAX_RAKING_THREADS * SEGMENT_LENGTH) / CUB_SMEM_BANKS(PTX_ARCH) : - 1, - - /// Pad each segment length with one element if segment length is not relatively prime to warp size and can't be optimized as a vector load - USE_SEGMENT_PADDING = ((SEGMENT_LENGTH & 1) == 0) && (SEGMENT_LENGTH > 2), - - /// Total number of elements in the raking grid - GRID_ELEMENTS = RAKING_THREADS * (SEGMENT_LENGTH + USE_SEGMENT_PADDING), - - /// Whether or not we need bounds checking during raking (the number of reduction elements is not a multiple of the number of raking threads) - UNGUARDED = (SHARED_ELEMENTS % RAKING_THREADS == 0), - }; - - - /** - * \brief Shared memory storage type - */ - struct __align__(16) _TempStorage - { - T buff[BlockRakingLayout::GRID_ELEMENTS]; - }; - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - /** - * \brief Returns the location for the calling thread to place data into the grid - */ - static __device__ __forceinline__ T* PlacementPtr( - TempStorage &temp_storage, - unsigned int linear_tid) - { - // Offset for partial - unsigned int offset = linear_tid; - - // Add in one padding element for every segment - if (USE_SEGMENT_PADDING > 0) - { - offset += offset / SEGMENT_LENGTH; - } - - // Incorporating a block of padding partials every shared memory segment - return temp_storage.Alias().buff + offset; - } - - - /** - * \brief Returns the location for the calling thread to begin sequential raking - */ - static __device__ __forceinline__ T* RakingPtr( - TempStorage &temp_storage, - unsigned int linear_tid) - { - return temp_storage.Alias().buff + (linear_tid * (SEGMENT_LENGTH + USE_SEGMENT_PADDING)); - } -}; - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/SRC/cub/block/block_reduce.cuh b/SRC/cub/block/block_reduce.cuh deleted file mode 100644 index 261f2ea6..00000000 --- a/SRC/cub/block/block_reduce.cuh +++ /dev/null @@ -1,607 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
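Because every quantity in BlockRakingLayout above is a compile-time enum, a concrete layout can be sanity-checked on the host. A sketch for 128 int partials, assuming a 32-thread warp and C++11 static_assert (the typedef name is illustrative); the values follow directly from the enum definitions above:

#include <cub/block/block_raking_layout.cuh>

typedef cub::BlockRakingLayout<int, 128> Layout;

static_assert(Layout::MAX_RAKING_THREADS  == 32,  "one 32-thread raking warp");
static_assert(Layout::SEGMENT_LENGTH      == 4,   "ceil(128 / 32) partials per raking thread");
static_assert(Layout::RAKING_THREADS      == 32,  "ceil(128 / 4) raking threads carry data");
static_assert(Layout::USE_SEGMENT_PADDING == 1,   "even segment length > 2 gets one pad element");
static_assert(Layout::GRID_ELEMENTS       == 160, "32 raking threads * (4 + 1) padded slots");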
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * The cub::BlockReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block. - */ - -#pragma once - -#include "specializations/block_reduce_raking.cuh" -#include "specializations/block_reduce_raking_commutative_only.cuh" -#include "specializations/block_reduce_warp_reductions.cuh" -#include "../util_ptx.cuh" -#include "../util_type.cuh" -#include "../thread/thread_operators.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - - -/****************************************************************************** - * Algorithmic variants - ******************************************************************************/ - -/** - * BlockReduceAlgorithm enumerates alternative algorithms for parallel - * reduction across a CUDA thread block. - */ -enum BlockReduceAlgorithm -{ - - /** - * \par Overview - * An efficient "raking" reduction algorithm that only supports commutative - * reduction operators (true for most operations, e.g., addition). - * - * \par - * Execution is comprised of three phases: - * -# Upsweep sequential reduction in registers (if threads contribute more - * than one input each). Threads in warps other than the first warp place - * their partial reductions into shared memory. - * -# Upsweep sequential reduction in shared memory. Threads within the first - * warp continue to accumulate by raking across segments of shared partial reductions - * -# A warp-synchronous Kogge-Stone style reduction within the raking warp. - * - * \par - * \image html block_reduce.png - *
- *   \p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.
- * - * \par Performance Considerations - * - This variant performs less communication than BLOCK_REDUCE_RAKING_NON_COMMUTATIVE - * and is preferable when the reduction operator is commutative. This variant - * applies fewer reduction operators than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall - * throughput across the GPU when suitably occupied. However, turn-around latency may be - * higher than to BLOCK_REDUCE_WARP_REDUCTIONS and thus less-desirable - * when the GPU is under-occupied. - */ - BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY, - - - /** - * \par Overview - * An efficient "raking" reduction algorithm that supports commutative - * (e.g., addition) and non-commutative (e.g., string concatenation) reduction - * operators. \blocked. - * - * \par - * Execution is comprised of three phases: - * -# Upsweep sequential reduction in registers (if threads contribute more - * than one input each). Each thread then places the partial reduction - * of its item(s) into shared memory. - * -# Upsweep sequential reduction in shared memory. Threads within a - * single warp rake across segments of shared partial reductions. - * -# A warp-synchronous Kogge-Stone style reduction within the raking warp. - * - * \par - * \image html block_reduce.png - *
- *   \p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.
- * - * \par Performance Considerations - * - This variant performs more communication than BLOCK_REDUCE_RAKING - * and is only preferable when the reduction operator is non-commutative. This variant - * applies fewer reduction operators than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall - * throughput across the GPU when suitably occupied. However, turn-around latency may be - * higher than to BLOCK_REDUCE_WARP_REDUCTIONS and thus less-desirable - * when the GPU is under-occupied. - */ - BLOCK_REDUCE_RAKING, - - - /** - * \par Overview - * A quick "tiled warp-reductions" reduction algorithm that supports commutative - * (e.g., addition) and non-commutative (e.g., string concatenation) reduction - * operators. - * - * \par - * Execution is comprised of four phases: - * -# Upsweep sequential reduction in registers (if threads contribute more - * than one input each). Each thread then places the partial reduction - * of its item(s) into shared memory. - * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style - * reduction within each warp. - * -# A propagation phase where the warp reduction outputs in each warp are - * updated with the aggregate from each preceding warp. - * - * \par - * \image html block_scan_warpscans.png - *
- *   \p BLOCK_REDUCE_WARP_REDUCTIONS data flow for a hypothetical 16-thread thread block and 4-thread raking warp.
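As a sketch of how the enumerators above are chosen in practice, the ALGORITHM template parameter of cub::BlockReduce selects the variant at compile time (kernel and pointer names are illustrative; a 128-thread launch is assumed):

#include <cub/block/block_reduce.cuh>

__global__ void SelectAlgorithmKernel(const int *d_in, int *d_out)
{
    // Throughput-oriented variant; valid here because addition is commutative
    typedef cub::BlockReduce<int, 128, cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY> ThroughputReduce;

    // Latency-oriented variant (also the default), safe for non-commutative operators:
    // typedef cub::BlockReduce<int, 128, cub::BLOCK_REDUCE_WARP_REDUCTIONS> LatencyReduce;

    __shared__ typename ThroughputReduce::TempStorage temp_storage;

    // Block-wide sum; the result is only defined in thread0
    int sum = ThroughputReduce(temp_storage).Sum(d_in[threadIdx.x]);
    if (threadIdx.x == 0) *d_out = sum;
}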
- * - * \par Performance Considerations - * - This variant applies more reduction operators than BLOCK_REDUCE_RAKING - * or BLOCK_REDUCE_RAKING_NON_COMMUTATIVE, which may result in lower overall - * throughput across the GPU. However turn-around latency may be lower and - * thus useful when the GPU is under-occupied. - */ - BLOCK_REDUCE_WARP_REDUCTIONS, -}; - - -/****************************************************************************** - * Block reduce - ******************************************************************************/ - -/** - * \brief The BlockReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block. ![](reduce_logo.png) - * \ingroup BlockModule - * - * \tparam T Data type being reduced - * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension - * \tparam ALGORITHM [optional] cub::BlockReduceAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_REDUCE_WARP_REDUCTIONS) - * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) - * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam PTX_ARCH [optional] \ptxversion - * - * \par Overview - * - A reduction (or fold) - * uses a binary combining operator to compute a single aggregate from a list of input elements. - * - \rowmajor - * - BlockReduce can be optionally specialized by algorithm to accommodate different latency/throughput workload profiles: - * -# cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY. An efficient "raking" reduction algorithm that only supports commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm) - * -# cub::BLOCK_REDUCE_RAKING. An efficient "raking" reduction algorithm that supports commutative and non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm) - * -# cub::BLOCK_REDUCE_WARP_REDUCTIONS. A quick "tiled warp-reductions" reduction algorithm that supports commutative and non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm) - * - * \par Performance Considerations - * - \granularity - * - Very efficient (only one synchronization barrier). - * - Incurs zero bank conflicts for most types - * - Computation is slightly more efficient (i.e., having lower instruction overhead) for: - * - Summation (vs. generic reduction) - * - \p BLOCK_THREADS is a multiple of the architecture's warp size - * - Every thread has a valid input (i.e., full vs. partial-tiles) - * - See cub::BlockReduceAlgorithm for performance details regarding algorithmic alternatives - * - * \par A Simple Example - * \blockcollective{BlockReduce} - * \par - * The code snippet below illustrates a sum reduction of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockReduce for a 1D block of 128 threads on type int - * typedef cub::BlockReduce BlockReduce; - * - * // Allocate shared memory for BlockReduce - * __shared__ typename BlockReduce::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... 
- * - * // Compute the block-wide sum for thread0 - * int aggregate = BlockReduce(temp_storage).Sum(thread_data); - * - * \endcode - * - */ -template < - typename T, - int BLOCK_DIM_X, - BlockReduceAlgorithm ALGORITHM = BLOCK_REDUCE_WARP_REDUCTIONS, - int BLOCK_DIM_Y = 1, - int BLOCK_DIM_Z = 1, - int PTX_ARCH = CUB_PTX_ARCH> -class BlockReduce -{ -private: - - /****************************************************************************** - * Constants and type definitions - ******************************************************************************/ - - /// Constants - enum - { - /// The thread block size in threads - BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, - }; - - typedef BlockReduceWarpReductions WarpReductions; - typedef BlockReduceRakingCommutativeOnly RakingCommutativeOnly; - typedef BlockReduceRaking Raking; - - /// Internal specialization type - typedef typename If<(ALGORITHM == BLOCK_REDUCE_WARP_REDUCTIONS), - WarpReductions, - typename If<(ALGORITHM == BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY), - RakingCommutativeOnly, - Raking>::Type>::Type InternalBlockReduce; // BlockReduceRaking - - /// Shared memory storage layout type for BlockReduce - typedef typename InternalBlockReduce::TempStorage _TempStorage; - - - /****************************************************************************** - * Utility methods - ******************************************************************************/ - - /// Internal storage allocator - __device__ __forceinline__ _TempStorage& PrivateStorage() - { - __shared__ _TempStorage private_storage; - return private_storage; - } - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - /// Shared storage reference - _TempStorage &temp_storage; - - /// Linear thread-id - unsigned int linear_tid; - - -public: - - /// \smemstorage{BlockReduce} - struct TempStorage : Uninitialized<_TempStorage> {}; - - - /******************************************************************//** - * \name Collective constructors - *********************************************************************/ - //@{ - - /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. - */ - __device__ __forceinline__ BlockReduce() - : - temp_storage(PrivateStorage()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. - */ - __device__ __forceinline__ BlockReduce( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - //@} end member group - /******************************************************************//** - * \name Generic reductions - *********************************************************************/ - //@{ - - - /** - * \brief Computes a block-wide reduction for thread0 using the specified binary reduction functor. Each thread contributes one input element. - * - * \par - * - The return value is undefined in threads other than thread0. - * - \rowmajor - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a max reduction of 128 integer items that - * are partitioned across 128 threads. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) 
- * { - * // Specialize BlockReduce for a 1D block of 128 threads on type int - * typedef cub::BlockReduce BlockReduce; - * - * // Allocate shared memory for BlockReduce - * __shared__ typename BlockReduce::TempStorage temp_storage; - * - * // Each thread obtains an input item - * int thread_data; - * ... - * - * // Compute the block-wide max for thread0 - * int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max()); - * - * \endcode - * - * \tparam ReductionOp [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ T Reduce( - T input, ///< [in] Calling thread's input - ReductionOp reduction_op) ///< [in] Binary reduction functor - { - return InternalBlockReduce(temp_storage).template Reduce(input, BLOCK_THREADS, reduction_op); - } - - - /** - * \brief Computes a block-wide reduction for thread0 using the specified binary reduction functor. Each thread contributes an array of consecutive input elements. - * - * \par - * - The return value is undefined in threads other than thread0. - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a max reduction of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockReduce for a 1D block of 128 threads on type int - * typedef cub::BlockReduce BlockReduce; - * - * // Allocate shared memory for BlockReduce - * __shared__ typename BlockReduce::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Compute the block-wide max for thread0 - * int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max()); - * - * \endcode - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam ReductionOp [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) - */ - template < - int ITEMS_PER_THREAD, - typename ReductionOp> - __device__ __forceinline__ T Reduce( - T (&inputs)[ITEMS_PER_THREAD], ///< [in] Calling thread's input segment - ReductionOp reduction_op) ///< [in] Binary reduction functor - { - // Reduce partials - T partial = internal::ThreadReduce(inputs, reduction_op); - return Reduce(partial, reduction_op); - } - - - /** - * \brief Computes a block-wide reduction for thread0 using the specified binary reduction functor. The first \p num_valid threads each contribute one input element. - * - * \par - * - The return value is undefined in threads other than thread0. - * - \rowmajor - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a max reduction of a partially-full tile of integer items that - * are partitioned across 128 threads. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(int num_valid, ...) - * { - * // Specialize BlockReduce for a 1D block of 128 threads on type int - * typedef cub::BlockReduce BlockReduce; - * - * // Allocate shared memory for BlockReduce - * __shared__ typename BlockReduce::TempStorage temp_storage; - * - * // Each thread obtains an input item - * int thread_data; - * if (threadIdx.x < num_valid) thread_data = ... 
- * - * // Compute the block-wide max for thread0 - * int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max(), num_valid); - * - * \endcode - * - * \tparam ReductionOp [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ T Reduce( - T input, ///< [in] Calling thread's input - ReductionOp reduction_op, ///< [in] Binary reduction functor - int num_valid) ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS) - { - // Determine if we scan skip bounds checking - if (num_valid >= BLOCK_THREADS) - { - return InternalBlockReduce(temp_storage).template Reduce(input, num_valid, reduction_op); - } - else - { - return InternalBlockReduce(temp_storage).template Reduce(input, num_valid, reduction_op); - } - } - - - //@} end member group - /******************************************************************//** - * \name Summation reductions - *********************************************************************/ - //@{ - - - /** - * \brief Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. Each thread contributes one input element. - * - * \par - * - The return value is undefined in threads other than thread0. - * - \rowmajor - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a sum reduction of 128 integer items that - * are partitioned across 128 threads. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockReduce for a 1D block of 128 threads on type int - * typedef cub::BlockReduce BlockReduce; - * - * // Allocate shared memory for BlockReduce - * __shared__ typename BlockReduce::TempStorage temp_storage; - * - * // Each thread obtains an input item - * int thread_data; - * ... - * - * // Compute the block-wide sum for thread0 - * int aggregate = BlockReduce(temp_storage).Sum(thread_data); - * - * \endcode - * - */ - __device__ __forceinline__ T Sum( - T input) ///< [in] Calling thread's input - { - return InternalBlockReduce(temp_storage).template Sum(input, BLOCK_THREADS); - } - - /** - * \brief Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. Each thread contributes an array of consecutive input elements. - * - * \par - * - The return value is undefined in threads other than thread0. - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a sum reduction of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockReduce for a 1D block of 128 threads on type int - * typedef cub::BlockReduce BlockReduce; - * - * // Allocate shared memory for BlockReduce - * __shared__ typename BlockReduce::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Compute the block-wide sum for thread0 - * int aggregate = BlockReduce(temp_storage).Sum(thread_data); - * - * \endcode - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
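The max-reduction snippet in the documentation above also lost its template arguments; a compilable reconstruction under the same assumptions (128 threads, one int per thread; d_in/d_out are illustrative):

#include <cub/block/block_reduce.cuh>

__global__ void MaxKernel(const int *d_in, int *d_out)
{
    // Specialize BlockReduce for a 1D block of 128 threads on type int
    typedef cub::BlockReduce<int, 128> BlockReduce;

    // Allocate shared memory for BlockReduce
    __shared__ typename BlockReduce::TempStorage temp_storage;

    // Each thread obtains an input item
    int thread_data = d_in[threadIdx.x];

    // Compute the block-wide max for thread0 (undefined in other threads)
    int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max());

    if (threadIdx.x == 0) *d_out = aggregate;
}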
- */ - template - __device__ __forceinline__ T Sum( - T (&inputs)[ITEMS_PER_THREAD]) ///< [in] Calling thread's input segment - { - // Reduce partials - T partial = internal::ThreadReduce(inputs, cub::Sum()); - return Sum(partial); - } - - - /** - * \brief Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. The first \p num_valid threads each contribute one input element. - * - * \par - * - The return value is undefined in threads other than thread0. - * - \rowmajor - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a sum reduction of a partially-full tile of integer items that - * are partitioned across 128 threads. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(int num_valid, ...) - * { - * // Specialize BlockReduce for a 1D block of 128 threads on type int - * typedef cub::BlockReduce BlockReduce; - * - * // Allocate shared memory for BlockReduce - * __shared__ typename BlockReduce::TempStorage temp_storage; - * - * // Each thread obtains an input item (up to num_items) - * int thread_data; - * if (threadIdx.x < num_valid) - * thread_data = ... - * - * // Compute the block-wide sum for thread0 - * int aggregate = BlockReduce(temp_storage).Sum(thread_data, num_valid); - * - * \endcode - * - */ - __device__ __forceinline__ T Sum( - T input, ///< [in] Calling thread's input - int num_valid) ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS) - { - // Determine if we scan skip bounds checking - if (num_valid >= BLOCK_THREADS) - { - return InternalBlockReduce(temp_storage).template Sum(input, num_valid); - } - else - { - return InternalBlockReduce(temp_storage).template Sum(input, num_valid); - } - } - - - //@} end member group -}; - -/** - * \example example_block_reduce.cu - */ - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/SRC/cub/block/block_scan.cuh b/SRC/cub/block/block_scan.cuh deleted file mode 100644 index 27ea7ed4..00000000 --- a/SRC/cub/block/block_scan.cuh +++ /dev/null @@ -1,2126 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * The cub::BlockScan class provides [collective](index.html#sec0) methods for computing a parallel prefix sum/scan of items partitioned across a CUDA thread block. - */ - -#pragma once - -#include "specializations/block_scan_raking.cuh" -#include "specializations/block_scan_warp_scans.cuh" -#include "../util_arch.cuh" -#include "../util_type.cuh" -#include "../util_ptx.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/****************************************************************************** - * Algorithmic variants - ******************************************************************************/ - -/** - * \brief BlockScanAlgorithm enumerates alternative algorithms for cub::BlockScan to compute a parallel prefix scan across a CUDA thread block. - */ -enum BlockScanAlgorithm -{ - - /** - * \par Overview - * An efficient "raking reduce-then-scan" prefix scan algorithm. Execution is comprised of five phases: - * -# Upsweep sequential reduction in registers (if threads contribute more than one input each). Each thread then places the partial reduction of its item(s) into shared memory. - * -# Upsweep sequential reduction in shared memory. Threads within a single warp rake across segments of shared partial reductions. - * -# A warp-synchronous Kogge-Stone style exclusive scan within the raking warp. - * -# Downsweep sequential exclusive scan in shared memory. Threads within a single warp rake across segments of shared partial reductions, seeded with the warp-scan output. - * -# Downsweep sequential scan in registers (if threads contribute more than one input), seeded with the raking scan output. - * - * \par - * \image html block_scan_raking.png - *
- *   \p BLOCK_SCAN_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.
- * - * \par Performance Considerations - * - Although this variant may suffer longer turnaround latencies when the - * GPU is under-occupied, it can often provide higher overall throughput - * across the GPU when suitably occupied. - */ - BLOCK_SCAN_RAKING, - - - /** - * \par Overview - * Similar to cub::BLOCK_SCAN_RAKING, but with fewer shared memory reads at - * the expense of higher register pressure. Raking threads preserve their - * "upsweep" segment of values in registers while performing warp-synchronous - * scan, allowing the "downsweep" not to re-read them from shared memory. - */ - BLOCK_SCAN_RAKING_MEMOIZE, - - - /** - * \par Overview - * A quick "tiled warpscans" prefix scan algorithm. Execution is comprised of four phases: - * -# Upsweep sequential reduction in registers (if threads contribute more than one input each). Each thread then places the partial reduction of its item(s) into shared memory. - * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style scan within each warp. - * -# A propagation phase where the warp scan outputs in each warp are updated with the aggregate from each preceding warp. - * -# Downsweep sequential scan in registers (if threads contribute more than one input), seeded with the raking scan output. - * - * \par - * \image html block_scan_warpscans.png - *
- *   \p BLOCK_SCAN_WARP_SCANS data flow for a hypothetical 16-thread thread block and 4-thread raking warp.
- * - * \par Performance Considerations - * - Although this variant may suffer lower overall throughput across the - * GPU because due to a heavy reliance on inefficient warpscans, it can - * often provide lower turnaround latencies when the GPU is under-occupied. - */ - BLOCK_SCAN_WARP_SCANS, -}; - - -/****************************************************************************** - * Block scan - ******************************************************************************/ - -/** - * \brief The BlockScan class provides [collective](index.html#sec0) methods for computing a parallel prefix sum/scan of items partitioned across a CUDA thread block. ![](block_scan_logo.png) - * \ingroup BlockModule - * - * \tparam T Data type being scanned - * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension - * \tparam ALGORITHM [optional] cub::BlockScanAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_SCAN_RAKING) - * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) - * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam PTX_ARCH [optional] \ptxversion - * - * \par Overview - * - Given a list of input elements and a binary reduction operator, a [prefix scan](http://en.wikipedia.org/wiki/Prefix_sum) - * produces an output list where each element is computed to be the reduction - * of the elements occurring earlier in the input list. Prefix sum - * connotes a prefix scan with the addition operator. The term \em inclusive indicates - * that the ith output reduction incorporates the ith input. - * The term \em exclusive indicates the ith input is not incorporated into - * the ith output reduction. - * - \rowmajor - * - BlockScan can be optionally specialized by algorithm to accommodate different workload profiles: - * -# cub::BLOCK_SCAN_RAKING. An efficient (high throughput) "raking reduce-then-scan" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm) - * -# cub::BLOCK_SCAN_RAKING_MEMOIZE. Similar to cub::BLOCK_SCAN_RAKING, but having higher throughput at the expense of additional register pressure for intermediate storage. [More...](\ref cub::BlockScanAlgorithm) - * -# cub::BLOCK_SCAN_WARP_SCANS. A quick (low latency) "tiled warpscans" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm) - * - * \par Performance Considerations - * - \granularity - * - Uses special instructions when applicable (e.g., warp \p SHFL) - * - Uses synchronization-free communication between warp lanes when applicable - * - Invokes a minimal number of minimal block-wide synchronization barriers (only - * one or two depending on algorithm selection) - * - Incurs zero bank conflicts for most types - * - Computation is slightly more efficient (i.e., having lower instruction overhead) for: - * - Prefix sum variants (vs. generic scan) - * - \blocksize - * - See cub::BlockScanAlgorithm for performance details regarding algorithmic alternatives - * - * \par A Simple Example - * \blockcollective{BlockScan} - * \par - * The code snippet below illustrates an exclusive prefix sum of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) 
- * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Collectively compute the block-wide exclusive prefix sum - * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is - * {[1,1,1,1], [1,1,1,1], ..., [1,1,1,1]}. - * The corresponding output \p thread_data in those threads will be - * {[0,1,2,3], [4,5,6,7], ..., [508,509,510,511]}. - * - */ -template < - typename T, - int BLOCK_DIM_X, - BlockScanAlgorithm ALGORITHM = BLOCK_SCAN_RAKING, - int BLOCK_DIM_Y = 1, - int BLOCK_DIM_Z = 1, - int PTX_ARCH = CUB_PTX_ARCH> -class BlockScan -{ -private: - - /****************************************************************************** - * Constants and type definitions - ******************************************************************************/ - - /// Constants - enum - { - /// The thread block size in threads - BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, - }; - - /** - * Ensure the template parameterization meets the requirements of the - * specified algorithm. Currently, the BLOCK_SCAN_WARP_SCANS policy - * cannot be used with thread block sizes not a multiple of the - * architectural warp size. - */ - static const BlockScanAlgorithm SAFE_ALGORITHM = - ((ALGORITHM == BLOCK_SCAN_WARP_SCANS) && (BLOCK_THREADS % CUB_WARP_THREADS(PTX_ARCH) != 0)) ? - BLOCK_SCAN_RAKING : - ALGORITHM; - - typedef BlockScanWarpScans WarpScans; - typedef BlockScanRaking Raking; - - /// Define the delegate type for the desired algorithm - typedef typename If<(SAFE_ALGORITHM == BLOCK_SCAN_WARP_SCANS), - WarpScans, - Raking>::Type InternalBlockScan; - - /// Shared memory storage layout type for BlockScan - typedef typename InternalBlockScan::TempStorage _TempStorage; - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - /// Shared storage reference - _TempStorage &temp_storage; - - /// Linear thread-id - unsigned int linear_tid; - - - /****************************************************************************** - * Utility methods - ******************************************************************************/ - - /// Internal storage allocator - __device__ __forceinline__ _TempStorage& PrivateStorage() - { - __shared__ _TempStorage private_storage; - return private_storage; - } - - - /****************************************************************************** - * Public types - ******************************************************************************/ -public: - - /// \smemstorage{BlockScan} - struct TempStorage : Uninitialized<_TempStorage> {}; - - - /******************************************************************//** - * \name Collective constructors - *********************************************************************/ - //@{ - - /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. 
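A short sketch of the SAFE_ALGORITHM fallback just described, assuming a 32-thread warp (typedef and kernel names are illustrative): requesting BLOCK_SCAN_WARP_SCANS with a block size that is not a warp multiple quietly degrades to BLOCK_SCAN_RAKING.

#include <cub/block/block_scan.cuh>

// 96 threads is a multiple of the 32-thread warp: WARP_SCANS is used as requested
typedef cub::BlockScan<int, 96, cub::BLOCK_SCAN_WARP_SCANS> WarpScans96;

// 100 threads is not: this specialization silently falls back to BLOCK_SCAN_RAKING
typedef cub::BlockScan<int, 100, cub::BLOCK_SCAN_WARP_SCANS> Demoted100;

__global__ void ScanKernel(int *d_data)   // launched with 96 threads
{
    __shared__ typename WarpScans96::TempStorage temp_storage;

    int x = d_data[threadIdx.x];
    WarpScans96(temp_storage).ExclusiveSum(x, x);   // output may alias input
    d_data[threadIdx.x] = x;
}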
- */ - __device__ __forceinline__ BlockScan() - : - temp_storage(PrivateStorage()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. - */ - __device__ __forceinline__ BlockScan( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - - //@} end member group - /******************************************************************//** - * \name Exclusive prefix sum operations - *********************************************************************/ - //@{ - - - /** - * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. The value of 0 is applied as the initial value, and is assigned to \p output in thread0. - * - * \par - * - \identityzero - * - \rowmajor - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates an exclusive prefix sum of 128 integer items that - * are partitioned across 128 threads. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain input item for each thread - * int thread_data; - * ... - * - * // Collectively compute the block-wide exclusive prefix sum - * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The - * corresponding output \p thread_data in those threads will be 0, 1, ..., 127. - * - */ - __device__ __forceinline__ void ExclusiveSum( - T input, ///< [in] Calling thread's input item - T &output) ///< [out] Calling thread's output item (may be aliased to \p input) - { - T initial_value = 0; - ExclusiveScan(input, output, initial_value, cub::Sum()); - } - - - /** - * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. The value of 0 is applied as the initial value, and is assigned to \p output in thread0. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - \identityzero - * - \rowmajor - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates an exclusive prefix sum of 128 integer items that - * are partitioned across 128 threads. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain input item for each thread - * int thread_data; - * ... - * - * // Collectively compute the block-wide exclusive prefix sum - * int block_aggregate; - * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The - * corresponding output \p thread_data in those threads will be 0, 1, ..., 127. 
- * Furthermore the value \p 128 will be stored in \p block_aggregate for all threads. - * - */ - __device__ __forceinline__ void ExclusiveSum( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items - { - T initial_value = 0; - ExclusiveScan(input, output, initial_value, cub::Sum(), block_aggregate); - } - - - /** - * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - \identityzero - * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the first warp of threads in the block, however only the return value from - * lane0 is applied as the block-wide prefix. Can be stateful. - * - \rowmajor - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a single thread block that progressively - * computes an exclusive prefix sum over multiple "tiles" of input using a - * prefix functor to maintain a running total between block-wide scans. Each tile consists - * of 128 integer items that are partitioned across 128 threads. - * \par - * \code - * #include // or equivalently - * - * // A stateful callback functor that maintains a running prefix to be applied - * // during consecutive scan operations. - * struct BlockPrefixCallbackOp - * { - * // Running prefix - * int running_total; - * - * // Constructor - * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} - * - * // Callback operator to be entered by the first warp of threads in the block. - * // Thread-0 is responsible for returning a value for seeding the block-wide scan. - * __device__ int operator()(int block_aggregate) - * { - * int old_prefix = running_total; - * running_total += block_aggregate; - * return old_prefix; - * } - * }; - * - * __global__ void ExampleKernel(int *d_data, int num_items, ...) - * { - * // Specialize BlockScan for a 1D block of 128 threads - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Initialize running total - * BlockPrefixCallbackOp prefix_op(0); - * - * // Have the block iterate over segments of items - * for (int block_offset = 0; block_offset < num_items; block_offset += 128) - * { - * // Load a segment of consecutive items that are blocked across threads - * int thread_data = d_data[block_offset]; - * - * // Collectively compute the block-wide exclusive prefix sum - * BlockScan(temp_storage).ExclusiveSum( - * thread_data, thread_data, prefix_op); - * CTA_SYNC(); - * - * // Store scanned items to output segment - * d_data[block_offset] = thread_data; - * } - * \endcode - * \par - * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... - * The corresponding output for the first segment will be 0, 1, ..., 127. 
- * The output for the second segment will be 128, 129, ..., 255. - * - * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) - */ - template - __device__ __forceinline__ void ExclusiveSum( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. - { - ExclusiveScan(input, output, cub::Sum(), block_prefix_callback_op); - } - - - //@} end member group - /******************************************************************//** - * \name Exclusive prefix sum operations (multiple data per thread) - *********************************************************************/ - //@{ - - - /** - * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. The value of 0 is applied as the initial value, and is assigned to \p output[0] in thread0. - * - * \par - * - \identityzero - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates an exclusive prefix sum of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Collectively compute the block-wide exclusive prefix sum - * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The - * corresponding output \p thread_data in those threads will be { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - */ - template - __device__ __forceinline__ void ExclusiveSum( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD]) ///< [out] Calling thread's output items (may be aliased to \p input) - { - T initial_value = 0; - ExclusiveScan(input, output, initial_value, cub::Sum()); - } - - - /** - * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. The value of 0 is applied as the initial value, and is assigned to \p output[0] in thread0. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - \identityzero - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates an exclusive prefix sum of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) 
- * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Collectively compute the block-wide exclusive prefix sum - * int block_aggregate; - * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The - * corresponding output \p thread_data in those threads will be { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. - * Furthermore the value \p 512 will be stored in \p block_aggregate for all threads. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - */ - template - __device__ __forceinline__ void ExclusiveSum( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items - { - // Reduce consecutive thread items in registers - T initial_value = 0; - ExclusiveScan(input, output, initial_value, cub::Sum(), block_aggregate); - } - - - /** - * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - \identityzero - * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the first warp of threads in the block, however only the return value from - * lane0 is applied as the block-wide prefix. Can be stateful. - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a single thread block that progressively - * computes an exclusive prefix sum over multiple "tiles" of input using a - * prefix functor to maintain a running total between block-wide scans. Each tile consists - * of 512 integer items that are partitioned in a [blocked arrangement](index.html#sec5sec3) - * across 128 threads where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * // A stateful callback functor that maintains a running prefix to be applied - * // during consecutive scan operations. - * struct BlockPrefixCallbackOp - * { - * // Running prefix - * int running_total; - * - * // Constructor - * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} - * - * // Callback operator to be entered by the first warp of threads in the block. - * // Thread-0 is responsible for returning a value for seeding the block-wide scan. 
- * __device__ int operator()(int block_aggregate) - * { - * int old_prefix = running_total; - * running_total += block_aggregate; - * return old_prefix; - * } - * }; - * - * __global__ void ExampleKernel(int *d_data, int num_items, ...) - * { - * // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread - * typedef cub::BlockLoad BlockLoad; - * typedef cub::BlockStore BlockStore; - * typedef cub::BlockScan BlockScan; - * - * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan - * __shared__ union { - * typename BlockLoad::TempStorage load; - * typename BlockScan::TempStorage scan; - * typename BlockStore::TempStorage store; - * } temp_storage; - * - * // Initialize running total - * BlockPrefixCallbackOp prefix_op(0); - * - * // Have the block iterate over segments of items - * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4) - * { - * // Load a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); - * CTA_SYNC(); - * - * // Collectively compute the block-wide exclusive prefix sum - * int block_aggregate; - * BlockScan(temp_storage.scan).ExclusiveSum( - * thread_data, thread_data, prefix_op); - * CTA_SYNC(); - * - * // Store scanned items to output segment - * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); - * CTA_SYNC(); - * } - * \endcode - * \par - * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... - * The corresponding output for the first segment will be 0, 1, 2, 3, ..., 510, 511. - * The output for the second segment will be 512, 513, 514, 515, ..., 1022, 1023. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) - */ - template < - int ITEMS_PER_THREAD, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void ExclusiveSum( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. - { - ExclusiveScan(input, output, cub::Sum(), block_prefix_callback_op); - } - - - - //@} end member group // Exclusive prefix sums - /******************************************************************//** - * \name Exclusive prefix scan operations - *********************************************************************/ - //@{ - - - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. - * - * \par - * - Supports non-commutative scan operators. - * - \rowmajor - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates an exclusive prefix max scan of 128 integer items that - * are partitioned across 128 threads. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain input item for each thread - * int thread_data; - * ... 
- * - * // Collectively compute the block-wide exclusive prefix max scan - * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The - * corresponding output \p thread_data in those threads will be INT_MIN, 0, 0, 2, ..., 124, 126. - * - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - T initial_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in thread0) - ScanOp scan_op) ///< [in] Binary scan functor - { - InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op); - } - - - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - Supports non-commutative scan operators. - * - \rowmajor - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates an exclusive prefix max scan of 128 integer items that - * are partitioned across 128 threads. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain input item for each thread - * int thread_data; - * ... - * - * // Collectively compute the block-wide exclusive prefix max scan - * int block_aggregate; - * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The - * corresponding output \p thread_data in those threads will be INT_MIN, 0, 0, 2, ..., 124, 126. - * Furthermore the value \p 126 will be stored in \p block_aggregate for all threads. - * - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input items - T &output, ///< [out] Calling thread's output items (may be aliased to \p input) - T initial_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in thread0) - ScanOp scan_op, ///< [in] Binary scan functor - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items - { - InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate); - } - - - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
- * - * \par - * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the first warp of threads in the block, however only the return value from - * lane0 is applied as the block-wide prefix. Can be stateful. - * - Supports non-commutative scan operators. - * - \rowmajor - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a single thread block that progressively - * computes an exclusive prefix max scan over multiple "tiles" of input using a - * prefix functor to maintain a running total between block-wide scans. Each tile consists - * of 128 integer items that are partitioned across 128 threads. - * \par - * \code - * #include // or equivalently - * - * // A stateful callback functor that maintains a running prefix to be applied - * // during consecutive scan operations. - * struct BlockPrefixCallbackOp - * { - * // Running prefix - * int running_total; - * - * // Constructor - * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} - * - * // Callback operator to be entered by the first warp of threads in the block. - * // Thread-0 is responsible for returning a value for seeding the block-wide scan. - * __device__ int operator()(int block_aggregate) - * { - * int old_prefix = running_total; - * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; - * return old_prefix; - * } - * }; - * - * __global__ void ExampleKernel(int *d_data, int num_items, ...) - * { - * // Specialize BlockScan for a 1D block of 128 threads - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Initialize running total - * BlockPrefixCallbackOp prefix_op(INT_MIN); - * - * // Have the block iterate over segments of items - * for (int block_offset = 0; block_offset < num_items; block_offset += 128) - * { - * // Load a segment of consecutive items that are blocked across threads - * int thread_data = d_data[block_offset]; - * - * // Collectively compute the block-wide exclusive prefix max scan - * BlockScan(temp_storage).ExclusiveScan( - * thread_data, thread_data, INT_MIN, cub::Max(), prefix_op); - * CTA_SYNC(); - * - * // Store scanned items to output segment - * d_data[block_offset] = thread_data; - * } - * \endcode - * \par - * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... - * The corresponding output for the first segment will be INT_MIN, 0, 0, 2, ..., 124, 126. - * The output for the second segment will be 126, 128, 128, 130, ..., 252, 254. - * - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) - */ - template < - typename ScanOp, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan functor - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. 
- { - InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_prefix_callback_op); - } - - - //@} end member group // Inclusive prefix sums - /******************************************************************//** - * \name Exclusive prefix scan operations (multiple data per thread) - *********************************************************************/ - //@{ - - - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. - * - * \par - * - Supports non-commutative scan operators. - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates an exclusive prefix max scan of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Collectively compute the block-wide exclusive prefix max scan - * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is - * { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. - * The corresponding output \p thread_data in those threads will be - * { [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - */ - template < - int ITEMS_PER_THREAD, - typename ScanOp> - __device__ __forceinline__ void ExclusiveScan( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - T initial_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in thread0) - ScanOp scan_op) ///< [in] Binary scan functor - { - // Reduce consecutive thread items in registers - T thread_prefix = internal::ThreadReduce(input, scan_op); - - // Exclusive thread block-scan - ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op); - - // Exclusive scan in registers with prefix as seed - internal::ThreadScanExclusive(input, output, scan_op, thread_prefix); - } - - - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - Supports non-commutative scan operators. - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates an exclusive prefix max scan of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) 
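// A hedged end-to-end version of the multi-item exclusive max scan described
// above (kernel name and 128x4 geometry are assumptions).
#include <climits>
#include <cub/cub.cuh>

__global__ void ExclusiveMaxItemsSketch(const int *d_in, int *d_out)
{
    typedef cub::BlockScan<int, 128> BlockScan;
    __shared__ typename BlockScan::TempStorage temp_storage;

    int thread_data[4];
    int base = threadIdx.x * 4;
    for (int i = 0; i < 4; ++i)
        thread_data[i] = d_in[base + i];

    // Seeded with INT_MIN: each output holds the max over all preceding items
    BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max());

    for (int i = 0; i < 4; ++i)
        d_out[base + i] = thread_data[i];
}
// (The documentation snippet resumes below.)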
- * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Collectively compute the block-wide exclusive prefix max scan - * int block_aggregate; - * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. The - * corresponding output \p thread_data in those threads will be { [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }. - * Furthermore the value \p 510 will be stored in \p block_aggregate for all threads. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - */ - template < - int ITEMS_PER_THREAD, - typename ScanOp> - __device__ __forceinline__ void ExclusiveScan( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - T initial_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in thread0) - ScanOp scan_op, ///< [in] Binary scan functor - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items - { - // Reduce consecutive thread items in registers - T thread_prefix = internal::ThreadReduce(input, scan_op); - - // Exclusive thread block-scan - ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op, block_aggregate); - - // Exclusive scan in registers with prefix as seed - internal::ThreadScanExclusive(input, output, scan_op, thread_prefix); - } - - - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the first warp of threads in the block, however only the return value from - * lane0 is applied as the block-wide prefix. Can be stateful. - * - Supports non-commutative scan operators. - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a single thread block that progressively - * computes an exclusive prefix max scan over multiple "tiles" of input using a - * prefix functor to maintain a running total between block-wide scans. Each tile consists - * of 128 integer items that are partitioned across 128 threads. - * \par - * \code - * #include // or equivalently - * - * // A stateful callback functor that maintains a running prefix to be applied - * // during consecutive scan operations. 
- * struct BlockPrefixCallbackOp - * { - * // Running prefix - * int running_total; - * - * // Constructor - * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} - * - * // Callback operator to be entered by the first warp of threads in the block. - * // Thread-0 is responsible for returning a value for seeding the block-wide scan. - * __device__ int operator()(int block_aggregate) - * { - * int old_prefix = running_total; - * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; - * return old_prefix; - * } - * }; - * - * __global__ void ExampleKernel(int *d_data, int num_items, ...) - * { - * // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread - * typedef cub::BlockLoad BlockLoad; - * typedef cub::BlockStore BlockStore; - * typedef cub::BlockScan BlockScan; - * - * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan - * __shared__ union { - * typename BlockLoad::TempStorage load; - * typename BlockScan::TempStorage scan; - * typename BlockStore::TempStorage store; - * } temp_storage; - * - * // Initialize running total - * BlockPrefixCallbackOp prefix_op(0); - * - * // Have the block iterate over segments of items - * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4) - * { - * // Load a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); - * CTA_SYNC(); - * - * // Collectively compute the block-wide exclusive prefix max scan - * BlockScan(temp_storage.scan).ExclusiveScan( - * thread_data, thread_data, INT_MIN, cub::Max(), prefix_op); - * CTA_SYNC(); - * - * // Store scanned items to output segment - * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); - * CTA_SYNC(); - * } - * \endcode - * \par - * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... - * The corresponding output for the first segment will be INT_MIN, 0, 0, 2, 2, 4, ..., 508, 510. - * The output for the second segment will be 510, 512, 512, 514, 514, 516, ..., 1020, 1022. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) - */ - template < - int ITEMS_PER_THREAD, - typename ScanOp, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void ExclusiveScan( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan functor - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. 
- { - // Reduce consecutive thread items in registers - T thread_prefix = internal::ThreadReduce(input, scan_op); - - // Exclusive thread block-scan - ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_prefix_callback_op); - - // Exclusive scan in registers with prefix as seed - internal::ThreadScanExclusive(input, output, scan_op, thread_prefix); - } - - - //@} end member group -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document no-initial-value scans - - /******************************************************************//** - * \name Exclusive prefix scan operations (no initial value, single datum per thread) - *********************************************************************/ - //@{ - - - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined. - * - * \par - * - Supports non-commutative scan operators. - * - \rowmajor - * - \smemreuse - * - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan functor - { - InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op); - } - - - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. - * - * \par - * - Supports non-commutative scan operators. - * - \rowmajor - * - \smemreuse - * - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan functor - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items - { - InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_aggregate); - } - - //@} end member group - /******************************************************************//** - * \name Exclusive prefix scan operations (no initial value, multiple data per thread) - *********************************************************************/ - //@{ - - - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. With no initial value, the output computed for thread0 is undefined. - * - * \par - * - Supports non-commutative scan operators. - * - \blocked - * - \granularity - * - \smemreuse - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
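// The no-initial-value overloads above leave thread_0's output undefined, so
// a caller that needs a defined value must supply one itself. A minimal
// sketch (kernel name and 128-thread geometry are assumptions):
#include <climits>
#include <cub/cub.cuh>

__global__ void ExclusiveMaxNoSeedSketch(const int *d_in, int *d_out)
{
    typedef cub::BlockScan<int, 128> BlockScan;
    __shared__ typename BlockScan::TempStorage temp_storage;

    int thread_data = d_in[threadIdx.x];
    BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, cub::Max());

    // Mask out the undefined thread_0 result explicitly
    d_out[threadIdx.x] = (threadIdx.x == 0) ? INT_MIN : thread_data;
}
// (Template-parameter documentation continues below.)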
- * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - */ - template < - int ITEMS_PER_THREAD, - typename ScanOp> - __device__ __forceinline__ void ExclusiveScan( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan functor - { - // Reduce consecutive thread items in registers - T thread_partial = internal::ThreadReduce(input, scan_op); - - // Exclusive thread block-scan - ExclusiveScan(thread_partial, thread_partial, scan_op); - - // Exclusive scan in registers with prefix - internal::ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0)); - } - - - /** - * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. - * - * \par - * - Supports non-commutative scan operators. - * - \blocked - * - \granularity - * - \smemreuse - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - */ - template < - int ITEMS_PER_THREAD, - typename ScanOp> - __device__ __forceinline__ void ExclusiveScan( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan functor - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items - { - // Reduce consecutive thread items in registers - T thread_partial = internal::ThreadReduce(input, scan_op); - - // Exclusive thread block-scan - ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate); - - // Exclusive scan in registers with prefix - internal::ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0)); - } - - - //@} end member group -#endif // DOXYGEN_SHOULD_SKIP_THIS // Do not document no-initial-value scans - - /******************************************************************//** - * \name Inclusive prefix sum operations - *********************************************************************/ - //@{ - - - /** - * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. - * - * \par - * - \rowmajor - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates an inclusive prefix sum of 128 integer items that - * are partitioned across 128 threads. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain input item for each thread - * int thread_data; - * ... - * - * // Collectively compute the block-wide inclusive prefix sum - * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. 
The - * corresponding output \p thread_data in those threads will be 1, 2, ..., 128. - * - */ - __device__ __forceinline__ void InclusiveSum( - T input, ///< [in] Calling thread's input item - T &output) ///< [out] Calling thread's output item (may be aliased to \p input) - { - InclusiveScan(input, output, cub::Sum()); - } - - - /** - * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - \rowmajor - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates an inclusive prefix sum of 128 integer items that - * are partitioned across 128 threads. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain input item for each thread - * int thread_data; - * ... - * - * // Collectively compute the block-wide inclusive prefix sum - * int block_aggregate; - * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The - * corresponding output \p thread_data in those threads will be 1, 2, ..., 128. - * Furthermore the value \p 128 will be stored in \p block_aggregate for all threads. - * - */ - __device__ __forceinline__ void InclusiveSum( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items - { - InclusiveScan(input, output, cub::Sum(), block_aggregate); - } - - - - /** - * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the first warp of threads in the block, however only the return value from - * lane0 is applied as the block-wide prefix. Can be stateful. - * - \rowmajor - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a single thread block that progressively - * computes an inclusive prefix sum over multiple "tiles" of input using a - * prefix functor to maintain a running total between block-wide scans. Each tile consists - * of 128 integer items that are partitioned across 128 threads. - * \par - * \code - * #include // or equivalently - * - * // A stateful callback functor that maintains a running prefix to be applied - * // during consecutive scan operations. 
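// A minimal sketch of the single-item InclusiveSum variants above; the
// kernel name and 128-thread geometry are assumptions.
#include <cub/cub.cuh>

__global__ void InclusiveSumSketch(const int *d_in, int *d_out, int *d_total)
{
    typedef cub::BlockScan<int, 128> BlockScan;
    __shared__ typename BlockScan::TempStorage temp_storage;

    int thread_data = d_in[threadIdx.x];

    // Inclusive running sum; every thread also receives the block total
    int block_aggregate;
    BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate);

    d_out[threadIdx.x] = thread_data;     // 1, 2, ..., 128 for all-ones input
    if (threadIdx.x == 0)
        *d_total = block_aggregate;       // 128 for all-ones input
}
// (The stateful callback functor described above follows below.)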
- * struct BlockPrefixCallbackOp - * { - * // Running prefix - * int running_total; - * - * // Constructor - * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} - * - * // Callback operator to be entered by the first warp of threads in the block. - * // Thread-0 is responsible for returning a value for seeding the block-wide scan. - * __device__ int operator()(int block_aggregate) - * { - * int old_prefix = running_total; - * running_total += block_aggregate; - * return old_prefix; - * } - * }; - * - * __global__ void ExampleKernel(int *d_data, int num_items, ...) - * { - * // Specialize BlockScan for a 1D block of 128 threads - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Initialize running total - * BlockPrefixCallbackOp prefix_op(0); - * - * // Have the block iterate over segments of items - * for (int block_offset = 0; block_offset < num_items; block_offset += 128) - * { - * // Load a segment of consecutive items that are blocked across threads - * int thread_data = d_data[block_offset]; - * - * // Collectively compute the block-wide inclusive prefix sum - * BlockScan(temp_storage).InclusiveSum( - * thread_data, thread_data, prefix_op); - * CTA_SYNC(); - * - * // Store scanned items to output segment - * d_data[block_offset] = thread_data; - * } - * \endcode - * \par - * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... - * The corresponding output for the first segment will be 1, 2, ..., 128. - * The output for the second segment will be 129, 130, ..., 256. - * - * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) - */ - template - __device__ __forceinline__ void InclusiveSum( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. - { - InclusiveScan(input, output, cub::Sum(), block_prefix_callback_op); - } - - - //@} end member group - /******************************************************************//** - * \name Inclusive prefix sum operations (multiple data per thread) - *********************************************************************/ - //@{ - - - /** - * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. - * - * \par - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates an inclusive prefix sum of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... 
- * - * // Collectively compute the block-wide inclusive prefix sum - * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The - * corresponding output \p thread_data in those threads will be { [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - */ - template - __device__ __forceinline__ void InclusiveSum( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD]) ///< [out] Calling thread's output items (may be aliased to \p input) - { - if (ITEMS_PER_THREAD == 1) - { - InclusiveSum(input[0], output[0]); - } - else - { - // Reduce consecutive thread items in registers - Sum scan_op; - T thread_prefix = internal::ThreadReduce(input, scan_op); - - // Exclusive thread block-scan - ExclusiveSum(thread_prefix, thread_prefix); - - // Inclusive scan in registers with prefix as seed - internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0)); - } - } - - - /** - * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates an inclusive prefix sum of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Collectively compute the block-wide inclusive prefix sum - * int block_aggregate; - * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is - * { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The - * corresponding output \p thread_data in those threads will be - * { [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }. - * Furthermore the value \p 512 will be stored in \p block_aggregate for all threads. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
- * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void InclusiveSum( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items - { - if (ITEMS_PER_THREAD == 1) - { - InclusiveSum(input[0], output[0], block_aggregate); - } - else - { - // Reduce consecutive thread items in registers - Sum scan_op; - T thread_prefix = internal::ThreadReduce(input, scan_op); - - // Exclusive thread block-scan - ExclusiveSum(thread_prefix, thread_prefix, block_aggregate); - - // Inclusive scan in registers with prefix as seed - internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0)); - } - } - - - /** - * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the first warp of threads in the block, however only the return value from - * lane0 is applied as the block-wide prefix. Can be stateful. - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a single thread block that progressively - * computes an inclusive prefix sum over multiple "tiles" of input using a - * prefix functor to maintain a running total between block-wide scans. Each tile consists - * of 512 integer items that are partitioned in a [blocked arrangement](index.html#sec5sec3) - * across 128 threads where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * // A stateful callback functor that maintains a running prefix to be applied - * // during consecutive scan operations. - * struct BlockPrefixCallbackOp - * { - * // Running prefix - * int running_total; - * - * // Constructor - * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} - * - * // Callback operator to be entered by the first warp of threads in the block. - * // Thread-0 is responsible for returning a value for seeding the block-wide scan. - * __device__ int operator()(int block_aggregate) - * { - * int old_prefix = running_total; - * running_total += block_aggregate; - * return old_prefix; - * } - * }; - * - * __global__ void ExampleKernel(int *d_data, int num_items, ...) 
- * {
- * // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread
- * typedef cub::BlockLoad<int, 128, 4, BLOCK_LOAD_TRANSPOSE> BlockLoad;
- * typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_TRANSPOSE> BlockStore;
- * typedef cub::BlockScan<int, 128> BlockScan;
- *
- * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
- * __shared__ union {
- * typename BlockLoad::TempStorage load;
- * typename BlockScan::TempStorage scan;
- * typename BlockStore::TempStorage store;
- * } temp_storage;
- *
- * // Initialize running total
- * BlockPrefixCallbackOp prefix_op(0);
- *
- * // Have the block iterate over segments of items
- * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
- * {
- * // Load a segment of consecutive items that are blocked across threads
- * int thread_data[4];
- * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
- * CTA_SYNC();
- *
- * // Collectively compute the block-wide inclusive prefix sum
- * BlockScan(temp_storage.scan).InclusiveSum(
- * thread_data, thread_data, prefix_op);
- * CTA_SYNC();
- *
- * // Store scanned items to output segment
- * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
- * CTA_SYNC();
- * }
- * \endcode
- * \par
- * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, ....
- * The corresponding output for the first segment will be 1, 2, 3, 4, ..., 511, 512.
- * The output for the second segment will be 513, 514, 515, 516, ..., 1023, 1024.
- *
- * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
- * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate)
- */
- template <
- int ITEMS_PER_THREAD,
- typename BlockPrefixCallbackOp>
- __device__ __forceinline__ void InclusiveSum(
- T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
- T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input)
- BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
- {
- if (ITEMS_PER_THREAD == 1)
- {
- InclusiveSum(input[0], output[0], block_prefix_callback_op);
- }
- else
- {
- // Reduce consecutive thread items in registers
- Sum scan_op;
- T thread_prefix = internal::ThreadReduce(input, scan_op);
-
- // Exclusive thread block-scan
- ExclusiveSum(thread_prefix, thread_prefix, block_prefix_callback_op);
-
- // Inclusive scan in registers with prefix as seed
- internal::ThreadScanInclusive(input, output, scan_op, thread_prefix);
- }
- }
-
-
- //@} end member group
- /******************************************************************//**
- * \name Inclusive prefix scan operations
- *********************************************************************/
- //@{
-
-
- /**
- * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element.
- *
- * \par
- * - Supports non-commutative scan operators.
- * - \rowmajor
- * - \smemreuse
- *
- * \par Snippet
- * The code snippet below illustrates an inclusive prefix max scan of 128 integer items that
- * are partitioned across 128 threads.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
- *
- * __global__ void ExampleKernel(...)
- * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain input item for each thread - * int thread_data; - * ... - * - * // Collectively compute the block-wide inclusive prefix max scan - * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The - * corresponding output \p thread_data in those threads will be 0, 0, 2, 2, ..., 126, 126. - * - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan functor - { - InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op); - } - - - /** - * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - Supports non-commutative scan operators. - * - \rowmajor - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates an inclusive prefix max scan of 128 integer items that - * are partitioned across 128 threads. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain input item for each thread - * int thread_data; - * ... - * - * // Collectively compute the block-wide inclusive prefix max scan - * int block_aggregate; - * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The - * corresponding output \p thread_data in those threads will be 0, 0, 2, 2, ..., 126, 126. - * Furthermore the value \p 126 will be stored in \p block_aggregate for all threads. - * - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan functor - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items - { - InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_aggregate); - } - - - /** - * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
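// A minimal sketch of the aggregate-returning inclusive max scan above;
// kernel name and 128-thread geometry are assumptions.
#include <cub/cub.cuh>

__global__ void InclusiveMaxSketch(const int *d_in, int *d_out, int *d_max)
{
    typedef cub::BlockScan<int, 128> BlockScan;
    __shared__ typename BlockScan::TempStorage temp_storage;

    int thread_data = d_in[threadIdx.x];

    int block_aggregate;
    BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate);

    d_out[threadIdx.x] = thread_data;      // running max over items 0..i
    if (threadIdx.x == 0)
        *d_max = block_aggregate;          // overall block maximum
}
// (The running-prefix callback variant is documented next.)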
- * - * \par - * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the first warp of threads in the block, however only the return value from - * lane0 is applied as the block-wide prefix. Can be stateful. - * - Supports non-commutative scan operators. - * - \rowmajor - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a single thread block that progressively - * computes an inclusive prefix max scan over multiple "tiles" of input using a - * prefix functor to maintain a running total between block-wide scans. Each tile consists - * of 128 integer items that are partitioned across 128 threads. - * \par - * \code - * #include // or equivalently - * - * // A stateful callback functor that maintains a running prefix to be applied - * // during consecutive scan operations. - * struct BlockPrefixCallbackOp - * { - * // Running prefix - * int running_total; - * - * // Constructor - * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} - * - * // Callback operator to be entered by the first warp of threads in the block. - * // Thread-0 is responsible for returning a value for seeding the block-wide scan. - * __device__ int operator()(int block_aggregate) - * { - * int old_prefix = running_total; - * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; - * return old_prefix; - * } - * }; - * - * __global__ void ExampleKernel(int *d_data, int num_items, ...) - * { - * // Specialize BlockScan for a 1D block of 128 threads - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Initialize running total - * BlockPrefixCallbackOp prefix_op(INT_MIN); - * - * // Have the block iterate over segments of items - * for (int block_offset = 0; block_offset < num_items; block_offset += 128) - * { - * // Load a segment of consecutive items that are blocked across threads - * int thread_data = d_data[block_offset]; - * - * // Collectively compute the block-wide inclusive prefix max scan - * BlockScan(temp_storage).InclusiveScan( - * thread_data, thread_data, cub::Max(), prefix_op); - * CTA_SYNC(); - * - * // Store scanned items to output segment - * d_data[block_offset] = thread_data; - * } - * \endcode - * \par - * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... - * The corresponding output for the first segment will be 0, 0, 2, 2, ..., 126, 126. - * The output for the second segment will be 128, 128, 130, 130, ..., 254, 254. - * - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) - */ - template < - typename ScanOp, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan functor - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. 
- { - InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_prefix_callback_op); - } - - - //@} end member group - /******************************************************************//** - * \name Inclusive prefix scan operations (multiple data per thread) - *********************************************************************/ - //@{ - - - /** - * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. - * - * \par - * - Supports non-commutative scan operators. - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates an inclusive prefix max scan of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Collectively compute the block-wide inclusive prefix max scan - * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. The - * corresponding output \p thread_data in those threads will be { [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - */ - template < - int ITEMS_PER_THREAD, - typename ScanOp> - __device__ __forceinline__ void InclusiveScan( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan functor - { - if (ITEMS_PER_THREAD == 1) - { - InclusiveScan(input[0], output[0], scan_op); - } - else - { - // Reduce consecutive thread items in registers - T thread_prefix = internal::ThreadReduce(input, scan_op); - - // Exclusive thread block-scan - ExclusiveScan(thread_prefix, thread_prefix, scan_op); - - // Inclusive scan in registers with prefix as seed (first thread does not seed) - internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0)); - } - } - - - /** - * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - Supports non-commutative scan operators. - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates an inclusive prefix max scan of 512 integer items that - * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads - * where each thread owns 4 consecutive items. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) 
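// Every multi-item overload in this group follows the same three-step recipe
// visible in the implementations: reduce each thread's items in registers,
// run one block-wide scan over the per-thread partials, then re-scan the
// items in registers seeded with the scanned partial. A hedged end-to-end
// sketch (kernel name and 128x4 geometry are assumptions):
#include <cub/cub.cuh>

__global__ void InclusiveMaxItemsSketch(const int *d_in, int *d_out)
{
    typedef cub::BlockScan<int, 128> BlockScan;
    __shared__ typename BlockScan::TempStorage temp_storage;

    int thread_data[4];
    int base = threadIdx.x * 4;
    for (int i = 0; i < 4; ++i)
        thread_data[i] = d_in[base + i];

    // Block-wide inclusive max over the 512-item blocked arrangement
    BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max());

    for (int i = 0; i < 4; ++i)
        d_out[base + i] = thread_data[i];
}
// (The documentation snippet resumes below.)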
- * { - * // Specialize BlockScan for a 1D block of 128 threads on type int - * typedef cub::BlockScan BlockScan; - * - * // Allocate shared memory for BlockScan - * __shared__ typename BlockScan::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Collectively compute the block-wide inclusive prefix max scan - * int block_aggregate; - * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is - * { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. - * The corresponding output \p thread_data in those threads will be - * { [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }. - * Furthermore the value \p 510 will be stored in \p block_aggregate for all threads. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - */ - template < - int ITEMS_PER_THREAD, - typename ScanOp> - __device__ __forceinline__ void InclusiveScan( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan functor - T &block_aggregate) ///< [out] block-wide aggregate reduction of input items - { - if (ITEMS_PER_THREAD == 1) - { - InclusiveScan(input[0], output[0], scan_op, block_aggregate); - } - else - { - // Reduce consecutive thread items in registers - T thread_prefix = internal::ThreadReduce(input, scan_op); - - // Exclusive thread block-scan (with no initial value) - ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_aggregate); - - // Inclusive scan in registers with prefix as seed (first thread does not seed) - internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0)); - } - } - - - /** - * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - * - * \par - * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). - * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. - * The functor will be invoked by the first warp of threads in the block, however only the return value from - * lane0 is applied as the block-wide prefix. Can be stateful. - * - Supports non-commutative scan operators. - * - \blocked - * - \granularity - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates a single thread block that progressively - * computes an inclusive prefix max scan over multiple "tiles" of input using a - * prefix functor to maintain a running total between block-wide scans. Each tile consists - * of 128 integer items that are partitioned across 128 threads. 
- * \par - * \code - * #include // or equivalently - * - * // A stateful callback functor that maintains a running prefix to be applied - * // during consecutive scan operations. - * struct BlockPrefixCallbackOp - * { - * // Running prefix - * int running_total; - * - * // Constructor - * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} - * - * // Callback operator to be entered by the first warp of threads in the block. - * // Thread-0 is responsible for returning a value for seeding the block-wide scan. - * __device__ int operator()(int block_aggregate) - * { - * int old_prefix = running_total; - * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; - * return old_prefix; - * } - * }; - * - * __global__ void ExampleKernel(int *d_data, int num_items, ...) - * { - * // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread - * typedef cub::BlockLoad BlockLoad; - * typedef cub::BlockStore BlockStore; - * typedef cub::BlockScan BlockScan; - * - * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan - * __shared__ union { - * typename BlockLoad::TempStorage load; - * typename BlockScan::TempStorage scan; - * typename BlockStore::TempStorage store; - * } temp_storage; - * - * // Initialize running total - * BlockPrefixCallbackOp prefix_op(0); - * - * // Have the block iterate over segments of items - * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4) - * { - * // Load a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); - * CTA_SYNC(); - * - * // Collectively compute the block-wide inclusive prefix max scan - * BlockScan(temp_storage.scan).InclusiveScan( - * thread_data, thread_data, cub::Max(), prefix_op); - * CTA_SYNC(); - * - * // Store scanned items to output segment - * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); - * CTA_SYNC(); - * } - * \endcode - * \par - * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... - * The corresponding output for the first segment will be 0, 0, 2, 2, 4, 4, ..., 510, 510. - * The output for the second segment will be 512, 512, 514, 514, 516, 516, ..., 1022, 1022. - * - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) - */ - template < - int ITEMS_PER_THREAD, - typename ScanOp, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void InclusiveScan( - T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items - T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan functor - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. 
- { - if (ITEMS_PER_THREAD == 1) - { - InclusiveScan(input[0], output[0], scan_op, block_prefix_callback_op); - } - else - { - // Reduce consecutive thread items in registers - T thread_prefix = internal::ThreadReduce(input, scan_op); - - // Exclusive thread block-scan - ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_prefix_callback_op); - - // Inclusive scan in registers with prefix as seed - internal::ThreadScanInclusive(input, output, scan_op, thread_prefix); - } - } - - //@} end member group - - -}; - -/** - * \example example_block_scan.cu - */ - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/SRC/cub/block/block_shuffle.cuh b/SRC/cub/block/block_shuffle.cuh deleted file mode 100644 index a0cc71d2..00000000 --- a/SRC/cub/block/block_shuffle.cuh +++ /dev/null @@ -1,305 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * The cub::BlockShuffle class provides [collective](index.html#sec0) methods for shuffling data partitioned across a CUDA thread block. - */ - -#pragma once - -#include "../util_arch.cuh" -#include "../util_ptx.cuh" -#include "../util_macro.cuh" -#include "../util_type.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \brief The BlockShuffle class provides [collective](index.html#sec0) methods for shuffling data partitioned across a CUDA thread block. - * \ingroup BlockModule - * - * \tparam T The data type to be exchanged. 
- * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension - * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) - * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam PTX_ARCH [optional] \ptxversion - * - * \par Overview - * It is commonplace for blocks of threads to rearrange data items between - * threads. The BlockShuffle abstraction allows threads to efficiently shift items - * either (a) up to their successor or (b) down to their predecessor. - * - */ -template < - typename T, - int BLOCK_DIM_X, - int BLOCK_DIM_Y = 1, - int BLOCK_DIM_Z = 1, - int PTX_ARCH = CUB_PTX_ARCH> -class BlockShuffle -{ -private: - - /****************************************************************************** - * Constants - ******************************************************************************/ - - enum - { - BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, - - LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH), - WARP_THREADS = 1 << LOG_WARP_THREADS, - WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, - }; - - /****************************************************************************** - * Type definitions - ******************************************************************************/ - - /// Shared memory storage layout type (last element from each thread's input) - struct _TempStorage - { - T prev[BLOCK_THREADS]; - T next[BLOCK_THREADS]; - }; - - -public: - - /// \smemstorage{BlockShuffle} - struct TempStorage : Uninitialized<_TempStorage> {}; - -private: - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - /// Shared storage reference - _TempStorage &temp_storage; - - /// Linear thread-id - unsigned int linear_tid; - - - /****************************************************************************** - * Utility methods - ******************************************************************************/ - - /// Internal storage allocator - __device__ __forceinline__ _TempStorage& PrivateStorage() - { - __shared__ _TempStorage private_storage; - return private_storage; - } - - -public: - - /******************************************************************//** - * \name Collective constructors - *********************************************************************/ - //@{ - - /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. - */ - __device__ __forceinline__ BlockShuffle() - : - temp_storage(PrivateStorage()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. - */ - __device__ __forceinline__ BlockShuffle( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - //@} end member group - /******************************************************************//** - * \name Shuffle movement - *********************************************************************/ - //@{ - - - /** - * \brief Each threadi obtains the \p input provided by threadi+distance. The offset \p distance may be negative. 
- *
- * \par
- * - \smemreuse
- */
-    __device__ __forceinline__ void Offset(
-        T   input,                  ///< [in] The input item from the calling thread (thread_i)
-        T&  output,                 ///< [out] The \p input item from the successor (or predecessor) thread thread_{i+distance} (may be aliased to \p input). This value is only updated for thread_i when 0 <= (i + \p distance) < BLOCK_THREADS
-        int distance = 1)           ///< [in] Offset distance (may be negative)
-    {
-        temp_storage.prev[linear_tid] = input;
-
-        CTA_SYNC();
-
-        if ((linear_tid + distance >= 0) && (linear_tid + distance < BLOCK_THREADS))
-            output = temp_storage.prev[linear_tid + distance];
-    }
-
-
-    /**
-     * \brief Each thread_i obtains the \p input provided by thread_{i+distance}.
-     *
-     * \par
-     * - \smemreuse
-     */
-    __device__ __forceinline__ void Rotate(
-        T   input,                  ///< [in] The calling thread's input item
-        T&  output,                 ///< [out] The \p input item from thread thread_{(i+distance) % BLOCK_THREADS} (may be aliased to \p input). This value is not updated for thread_{BLOCK_THREADS-1}
-        unsigned int distance = 1)  ///< [in] Offset distance (0 < \p distance < BLOCK_THREADS)
-    {
-        temp_storage.prev[linear_tid] = input;
-
-        CTA_SYNC();
-
-        unsigned int offset = linear_tid + distance;
-        if (offset >= BLOCK_THREADS)
-            offset -= BLOCK_THREADS;
-
-        output = temp_storage.prev[offset];
-    }
-
-
-    /**
-     * \brief The thread block rotates its [blocked arrangement](index.html#sec5sec3) of \p input items, shifting it up by one item
-     *
-     * \par
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     */
-    template <int ITEMS_PER_THREAD>
-    __device__ __forceinline__ void Up(
-        T (&input)[ITEMS_PER_THREAD],   ///< [in] The calling thread's input items
-        T (&prev)[ITEMS_PER_THREAD])    ///< [out] The corresponding predecessor items (may be aliased to \p input). The item \p prev[0] is not updated for thread_0.
-    {
-        temp_storage.prev[linear_tid] = input[ITEMS_PER_THREAD - 1];
-
-        CTA_SYNC();
-
-        #pragma unroll
-        for (int ITEM = ITEMS_PER_THREAD - 1; ITEM > 0; --ITEM)
-            prev[ITEM] = input[ITEM - 1];
-
-
-        if (linear_tid > 0)
-            prev[0] = temp_storage.prev[linear_tid - 1];
-    }
-
-
-    /**
-     * \brief The thread block rotates its [blocked arrangement](index.html#sec5sec3) of \p input items, shifting it up by one item. All threads receive the \p input provided by thread_{BLOCK_THREADS-1}.
-     *
-     * \par
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     */
-    template <int ITEMS_PER_THREAD>
-    __device__ __forceinline__ void Up(
-        T (&input)[ITEMS_PER_THREAD],   ///< [in] The calling thread's input items
-        T (&prev)[ITEMS_PER_THREAD],    ///< [out] The corresponding predecessor items (may be aliased to \p input). The item \p prev[0] is not updated for thread_0.
-        T &block_suffix)                ///< [out] The item \p input[ITEMS_PER_THREAD-1] from thread_{BLOCK_THREADS-1}, provided to all threads
-    {
-        Up(input, prev);
-        block_suffix = temp_storage.prev[BLOCK_THREADS - 1];
-    }
-
-
-    /**
-     * \brief The thread block rotates its [blocked arrangement](index.html#sec5sec3) of \p input items, shifting it down by one item
-     *
-     * \par
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     */
-    template <int ITEMS_PER_THREAD>
-    __device__ __forceinline__ void Down(
-        T (&input)[ITEMS_PER_THREAD],   ///< [in] The calling thread's input items
-        T (&prev)[ITEMS_PER_THREAD])    ///< [out] The corresponding predecessor items (may be aliased to \p input). The value \p prev[0] is not updated for thread_{BLOCK_THREADS-1}.
-    {
-        temp_storage.prev[linear_tid] = input[ITEMS_PER_THREAD - 1];
-
-        CTA_SYNC();
-
-        #pragma unroll
-        for (int ITEM = ITEMS_PER_THREAD - 1; ITEM > 0; --ITEM)
-            prev[ITEM] = input[ITEM - 1];
-
-        if (linear_tid > 0)
-            prev[0] = temp_storage.prev[linear_tid - 1];
-    }
-
-
-    /**
-     * \brief The thread block rotates its [blocked arrangement](index.html#sec5sec3) of input items, shifting it down by one item. All threads receive \p input[0] provided by thread_0.
-     *
-     * \par
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     */
-    template <int ITEMS_PER_THREAD>
-    __device__ __forceinline__ void Down(
-        T (&input)[ITEMS_PER_THREAD],   ///< [in] The calling thread's input items
-        T (&prev)[ITEMS_PER_THREAD],    ///< [out] The corresponding predecessor items (may be aliased to \p input). The value \p prev[0] is not updated for thread_{BLOCK_THREADS-1}.
-        T &block_prefix)                ///< [out] The item \p input[0] from thread_0, provided to all threads
-    {
-        Down(input, prev);
-        block_prefix = temp_storage.prev[BLOCK_THREADS - 1];
-    }
-
-    //@}  end member group
-
-
-};
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/SRC/cub/block/block_store.cuh b/SRC/cub/block/block_store.cuh
deleted file mode 100644
index 648bf9ff..00000000
--- a/SRC/cub/block/block_store.cuh
+++ /dev/null
@@ -1,1000 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
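The shift pattern that BlockShuffle::Offset implements is easier to see without the class machinery. The following is a minimal standalone sketch, not code from this tree; the kernel name and layout are illustrative, and a 1D thread block is assumed:

    // Sketch of the BlockShuffle::Offset pattern: publish each thread's item
    // to shared memory, synchronize, then read the neighbor at tid + distance.
    template <int BLOCK_THREADS>
    __global__ void ShiftKernel(const int *d_in, int *d_out, int distance)
    {
        __shared__ int smem[BLOCK_THREADS];

        int tid = threadIdx.x;
        int val = d_in[blockIdx.x * BLOCK_THREADS + tid];

        smem[tid] = val;      // publish (temp_storage.prev[linear_tid] = input)
        __syncthreads();      // CTA_SYNC() in CUB

        int src = tid + distance;
        if (src >= 0 && src < BLOCK_THREADS)   // same guard as Offset()
            val = smem[src];

        d_out[blockIdx.x * BLOCK_THREADS + tid] = val;
    }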
- * - ******************************************************************************/ - -/** - * \file - * Operations for writing linear segments of data from the CUDA thread block - */ - -#pragma once - -#include - -#include "block_exchange.cuh" -#include "../util_ptx.cuh" -#include "../util_macro.cuh" -#include "../util_type.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \addtogroup UtilIo - * @{ - */ - - -/******************************************************************//** - * \name Blocked arrangement I/O (direct) - *********************************************************************/ -//@{ - -/** - * \brief Store a blocked arrangement of items across a thread block into a linear segment of items. - * - * \blocked - * - * \tparam T [inferred] The data type to store. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. - */ -template < - typename T, - int ITEMS_PER_THREAD, - typename OutputIteratorT> -__device__ __forceinline__ void StoreDirectBlocked( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store -{ - OutputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD); - - // Store directly in thread-blocked order - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - thread_itr[ITEM] = items[ITEM]; - } -} - - -/** - * \brief Store a blocked arrangement of items across a thread block into a linear segment of items, guarded by range - * - * \blocked - * - * \tparam T [inferred] The data type to store. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. - */ -template < - typename T, - int ITEMS_PER_THREAD, - typename OutputIteratorT> -__device__ __forceinline__ void StoreDirectBlocked( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int valid_items) ///< [in] Number of valid items to write -{ - OutputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD); - - // Store directly in thread-blocked order - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - if (ITEM + (linear_tid * ITEMS_PER_THREAD) < valid_items) - { - thread_itr[ITEM] = items[ITEM]; - } - } -} - - -/** - * \brief Store a blocked arrangement of items across a thread block into a linear segment of items. - * - * \blocked - * - * The output offset (\p block_ptr + \p block_offset) must be quad-item aligned, - * which is the default starting offset returned by \p cudaMalloc() - * - * \par - * The following conditions will prevent vectorization and storing will fall back to cub::BLOCK_STORE_DIRECT: - * - \p ITEMS_PER_THREAD is odd - * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.) 
- *
- * \tparam T                    [inferred] The data type to store.
- * \tparam ITEMS_PER_THREAD     [inferred] The number of consecutive items partitioned onto each thread.
- *
- */
-template <
-    typename        T,
-    int             ITEMS_PER_THREAD>
-__device__ __forceinline__ void StoreDirectBlockedVectorized(
-    int     linear_tid,     ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks)
-    T       *block_ptr,     ///< [in] Input pointer for storing from
-    T       (&items)[ITEMS_PER_THREAD])     ///< [in] Data to store
-{
-    enum
-    {
-        // Maximum CUDA vector size is 4 elements
-        MAX_VEC_SIZE = CUB_MIN(4, ITEMS_PER_THREAD),
-
-        // Vector size must be a power of two and an even divisor of the items per thread
-        VEC_SIZE = ((((MAX_VEC_SIZE - 1) & MAX_VEC_SIZE) == 0) && ((ITEMS_PER_THREAD % MAX_VEC_SIZE) == 0)) ?
-            MAX_VEC_SIZE :
-            1,
-
-        VECTORS_PER_THREAD = ITEMS_PER_THREAD / VEC_SIZE,
-    };
-
-    // Vector type
-    typedef typename CubVector<T, VEC_SIZE>::Type Vector;
-
-    // Alias global pointer
-    Vector *block_ptr_vectors = reinterpret_cast<Vector*>(const_cast<T*>(block_ptr));
-
-    // Alias pointers (use "raw" array here which should get optimized away to prevent conservative PTXAS lmem spilling)
-    Vector raw_vector[VECTORS_PER_THREAD];
-    T *raw_items = reinterpret_cast<T*>(raw_vector);
-
-    // Copy
-    #pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-    {
-        raw_items[ITEM] = items[ITEM];
-    }
-
-    // Direct-store using vector types
-    StoreDirectBlocked(linear_tid, block_ptr_vectors, raw_vector);
-}
-
-
-
-//@}  end member group
-/******************************************************************//**
- * \name Striped arrangement I/O (direct)
- *********************************************************************/
-//@{
-
-
-/**
- * \brief Store a striped arrangement of data across the thread block into a linear segment of items.
- *
- * \striped
- *
- * \tparam BLOCK_THREADS        The thread block size in threads
- * \tparam T                    [inferred] The data type to store.
- * \tparam ITEMS_PER_THREAD     [inferred] The number of consecutive items partitioned onto each thread.
- * \tparam OutputIteratorT      [inferred] The random-access iterator type for output \iterator.
- */
-template <
-    int             BLOCK_THREADS,
-    typename        T,
-    int             ITEMS_PER_THREAD,
-    typename        OutputIteratorT>
-__device__ __forceinline__ void StoreDirectStriped(
-    int             linear_tid,     ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks)
-    OutputIteratorT block_itr,      ///< [in] The thread block's base output iterator for storing to
-    T               (&items)[ITEMS_PER_THREAD])     ///< [in] Data to store
-{
-    OutputIteratorT thread_itr = block_itr + linear_tid;
-
-    // Store directly in striped order
-    #pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-    {
-        thread_itr[(ITEM * BLOCK_THREADS)] = items[ITEM];
-    }
-}
-
-
-/**
- * \brief Store a striped arrangement of data across the thread block into a linear segment of items, guarded by range
- *
- * \striped
- *
- * \tparam BLOCK_THREADS        The thread block size in threads
- * \tparam T                    [inferred] The data type to store.
- * \tparam ITEMS_PER_THREAD     [inferred] The number of consecutive items partitioned onto each thread.
- * \tparam OutputIteratorT      [inferred] The random-access iterator type for output \iterator.
- */ -template < - int BLOCK_THREADS, - typename T, - int ITEMS_PER_THREAD, - typename OutputIteratorT> -__device__ __forceinline__ void StoreDirectStriped( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int valid_items) ///< [in] Number of valid items to write -{ - OutputIteratorT thread_itr = block_itr + linear_tid; - - // Store directly in striped order - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - if ((ITEM * BLOCK_THREADS) + linear_tid < valid_items) - { - thread_itr[(ITEM * BLOCK_THREADS)] = items[ITEM]; - } - } -} - - - -//@} end member group -/******************************************************************//** - * \name Warp-striped arrangement I/O (direct) - *********************************************************************/ -//@{ - - -/** - * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items. - * - * \warpstriped - * - * \par Usage Considerations - * The number of threads in the thread block must be a multiple of the architecture's warp size. - * - * \tparam T [inferred] The data type to store. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. - */ -template < - typename T, - int ITEMS_PER_THREAD, - typename OutputIteratorT> -__device__ __forceinline__ void StoreDirectWarpStriped( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load -{ - int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); - int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; - int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; - - OutputIteratorT thread_itr = block_itr + warp_offset + tid; - - // Store directly in warp-striped order - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - thread_itr[(ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM]; - } -} - - -/** - * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items, guarded by range - * - * \warpstriped - * - * \par Usage Considerations - * The number of threads in the thread block must be a multiple of the architecture's warp size. - * - * \tparam T [inferred] The data type to store. - * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. - * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. 
- */ -template < - typename T, - int ITEMS_PER_THREAD, - typename OutputIteratorT> -__device__ __forceinline__ void StoreDirectWarpStriped( - int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int valid_items) ///< [in] Number of valid items to write -{ - int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); - int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; - int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; - - OutputIteratorT thread_itr = block_itr + warp_offset + tid; - - // Store directly in warp-striped order - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) - { - if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items) - { - thread_itr[(ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM]; - } - } -} - - -//@} end member group - - -/** @} */ // end group UtilIo - - -//----------------------------------------------------------------------------- -// Generic BlockStore abstraction -//----------------------------------------------------------------------------- - -/** - * \brief cub::BlockStoreAlgorithm enumerates alternative algorithms for cub::BlockStore to write a blocked arrangement of items across a CUDA thread block to a linear segment of memory. - */ -enum BlockStoreAlgorithm -{ - /** - * \par Overview - * - * A [blocked arrangement](index.html#sec5sec3) of data is written - * directly to memory. - * - * \par Performance Considerations - * - The utilization of memory transactions (coalescing) decreases as the - * access stride between threads increases (i.e., the number items per thread). - */ - BLOCK_STORE_DIRECT, - - /** - * \par Overview - * - * A [blocked arrangement](index.html#sec5sec3) of data is written directly - * to memory using CUDA's built-in vectorized stores as a coalescing optimization. - * For example, st.global.v4.s32 instructions will be generated - * when \p T = \p int and \p ITEMS_PER_THREAD % 4 == 0. - * - * \par Performance Considerations - * - The utilization of memory transactions (coalescing) remains high until the the - * access stride between threads (i.e., the number items per thread) exceeds the - * maximum vector store width (typically 4 items or 64B, whichever is lower). - * - The following conditions will prevent vectorization and writing will fall back to cub::BLOCK_STORE_DIRECT: - * - \p ITEMS_PER_THREAD is odd - * - The \p OutputIteratorT is not a simple pointer type - * - The block output offset is not quadword-aligned - * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.) - */ - BLOCK_STORE_VECTORIZE, - - /** - * \par Overview - * A [blocked arrangement](index.html#sec5sec3) is locally - * transposed and then efficiently written to memory as a [striped arrangement](index.html#sec5sec3). - * - * \par Performance Considerations - * - The utilization of memory transactions (coalescing) remains high regardless - * of items written per thread. - * - The local reordering incurs slightly longer latencies and throughput than the - * direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives. 
- */ - BLOCK_STORE_TRANSPOSE, - - /** - * \par Overview - * A [blocked arrangement](index.html#sec5sec3) is locally - * transposed and then efficiently written to memory as a - * [warp-striped arrangement](index.html#sec5sec3) - * - * \par Usage Considerations - * - BLOCK_THREADS must be a multiple of WARP_THREADS - * - * \par Performance Considerations - * - The utilization of memory transactions (coalescing) remains high regardless - * of items written per thread. - * - The local reordering incurs slightly longer latencies and throughput than the - * direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives. - */ - BLOCK_STORE_WARP_TRANSPOSE, - - /** - * \par Overview - * A [blocked arrangement](index.html#sec5sec3) is locally - * transposed and then efficiently written to memory as a - * [warp-striped arrangement](index.html#sec5sec3) - * To reduce the shared memory requirement, only one warp's worth of shared - * memory is provisioned and is subsequently time-sliced among warps. - * - * \par Usage Considerations - * - BLOCK_THREADS must be a multiple of WARP_THREADS - * - * \par Performance Considerations - * - The utilization of memory transactions (coalescing) remains high regardless - * of items written per thread. - * - Provisions less shared memory temporary storage, but incurs larger - * latencies than the BLOCK_STORE_WARP_TRANSPOSE alternative. - */ - BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, - -}; - - -/** - * \brief The BlockStore class provides [collective](index.html#sec0) data movement methods for writing a [blocked arrangement](index.html#sec5sec3) of items partitioned across a CUDA thread block to a linear segment of memory. ![](block_store_logo.png) - * \ingroup BlockModule - * \ingroup UtilIo - * - * \tparam T The type of data to be written. - * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension - * \tparam ITEMS_PER_THREAD The number of consecutive items partitioned onto each thread. - * \tparam ALGORITHM [optional] cub::BlockStoreAlgorithm tuning policy enumeration. default: cub::BLOCK_STORE_DIRECT. - * \tparam WARP_TIME_SLICING [optional] Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage). (default: false) - * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) - * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) - * \tparam PTX_ARCH [optional] \ptxversion - * - * \par Overview - * - The BlockStore class provides a single data movement abstraction that can be specialized - * to implement different cub::BlockStoreAlgorithm strategies. This facilitates different - * performance policies for different architectures, data types, granularity sizes, etc. - * - BlockStore can be optionally specialized by different data movement strategies: - * -# cub::BLOCK_STORE_DIRECT. A [blocked arrangement](index.html#sec5sec3) of data is written - * directly to memory. [More...](\ref cub::BlockStoreAlgorithm) - * -# cub::BLOCK_STORE_VECTORIZE. A [blocked arrangement](index.html#sec5sec3) - * of data is written directly to memory using CUDA's built-in vectorized stores as a - * coalescing optimization. [More...](\ref cub::BlockStoreAlgorithm) - * -# cub::BLOCK_STORE_TRANSPOSE. 
A [blocked arrangement](index.html#sec5sec3)
- *      is locally transposed into a [striped arrangement](index.html#sec5sec3) which is
- *      then written to memory. [More...](\ref cub::BlockStoreAlgorithm)
- *   -# cub::BLOCK_STORE_WARP_TRANSPOSE. A [blocked arrangement](index.html#sec5sec3)
- *      is locally transposed into a [warp-striped arrangement](index.html#sec5sec3) which is
- *      then written to memory. [More...](\ref cub::BlockStoreAlgorithm)
- * - \rowmajor
- *
- * \par A Simple Example
- * \blockcollective{BlockStore}
- * \par
- * The code snippet below illustrates the storing of a "blocked" arrangement
- * of 512 integers across 128 threads (where each thread owns 4 consecutive items)
- * into a linear segment of memory. The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE,
- * meaning items are locally reordered among threads so that memory references will be
- * efficiently coalesced using a warp-striped access pattern.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/block/block_store.cuh>
- *
- * __global__ void ExampleKernel(int *d_data, ...)
- * {
- *     // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each
- *     typedef cub::BlockStore<int, 128, 4, BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
- *
- *     // Allocate shared memory for BlockStore
- *     __shared__ typename BlockStore::TempStorage temp_storage;
- *
- *     // Obtain a segment of consecutive items that are blocked across threads
- *     int thread_data[4];
- *     ...
- *
- *     // Store items to linear memory
- *     BlockStore(temp_storage).Store(d_data, thread_data);
- *
- * \endcode
- * \par
- * Suppose the set of \p thread_data across the block of threads is
- * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }.
- * The output \p d_data will be 0, 1, 2, 3, 4, 5, ....
- *
- */
-template <
-    typename            T,
-    int                 BLOCK_DIM_X,
-    int                 ITEMS_PER_THREAD,
-    BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT,
-    int                 BLOCK_DIM_Y = 1,
-    int                 BLOCK_DIM_Z = 1,
-    int                 PTX_ARCH = CUB_PTX_ARCH>
-class BlockStore
-{
-private:
-    /******************************************************************************
-     * Constants and typed definitions
-     ******************************************************************************/
-
-    /// Constants
-    enum
-    {
-        /// The thread block size in threads
-        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-    };
-
-
-    /******************************************************************************
-     * Algorithmic variants
-     ******************************************************************************/
-
-    /// Store helper
-    template <BlockStoreAlgorithm _POLICY, int DUMMY>
-    struct StoreInternal;
-
-
-    /**
-     * BLOCK_STORE_DIRECT specialization of store helper
-     */
-    template <int DUMMY>
-    struct StoreInternal<BLOCK_STORE_DIRECT, DUMMY>
-    {
-        /// Shared memory storage layout type
-        typedef NullType TempStorage;
-
-        /// Linear thread-id
-        int linear_tid;
-
-        /// Constructor
-        __device__ __forceinline__ StoreInternal(
-            TempStorage &/*temp_storage*/,
-            int linear_tid)
-        :
-            linear_tid(linear_tid)
-        {}
-
-        /// Store items into a linear segment of memory
-        template <typename OutputIteratorT>
-        __device__ __forceinline__ void Store(
-            OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
-            T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
-        {
-            StoreDirectBlocked(linear_tid, block_itr, items);
-        }
-
-        /// Store items into a linear segment of memory, guarded by range
-        template <typename OutputIteratorT>
-        __device__ __forceinline__ void Store(
-            OutputIteratorT     block_itr,                  ///< [in] The thread block's base output iterator for storing to
-            T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
-            int                 valid_items)                ///< [in] Number of valid
items to write - { - StoreDirectBlocked(linear_tid, block_itr, items, valid_items); - } - }; - - - /** - * BLOCK_STORE_VECTORIZE specialization of store helper - */ - template - struct StoreInternal - { - /// Shared memory storage layout type - typedef NullType TempStorage; - - /// Linear thread-id - int linear_tid; - - /// Constructor - __device__ __forceinline__ StoreInternal( - TempStorage &/*temp_storage*/, - int linear_tid) - : - linear_tid(linear_tid) - {} - - /// Store items into a linear segment of memory, specialized for native pointer types (attempts vectorization) - __device__ __forceinline__ void Store( - T *block_ptr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store - { - StoreDirectBlockedVectorized(linear_tid, block_ptr, items); - } - - /// Store items into a linear segment of memory, specialized for opaque input iterators (skips vectorization) - template - __device__ __forceinline__ void Store( - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store - { - StoreDirectBlocked(linear_tid, block_itr, items); - } - - /// Store items into a linear segment of memory, guarded by range - template - __device__ __forceinline__ void Store( - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int valid_items) ///< [in] Number of valid items to write - { - StoreDirectBlocked(linear_tid, block_itr, items, valid_items); - } - }; - - - /** - * BLOCK_STORE_TRANSPOSE specialization of store helper - */ - template - struct StoreInternal - { - // BlockExchange utility type for keys - typedef BlockExchange BlockExchange; - - /// Shared memory storage layout type - struct _TempStorage : BlockExchange::TempStorage - { - /// Temporary storage for partially-full block guard - volatile int valid_items; - }; - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - /// Thread reference to shared storage - _TempStorage &temp_storage; - - /// Linear thread-id - int linear_tid; - - /// Constructor - __device__ __forceinline__ StoreInternal( - TempStorage &temp_storage, - int linear_tid) - : - temp_storage(temp_storage.Alias()), - linear_tid(linear_tid) - {} - - /// Store items into a linear segment of memory - template - __device__ __forceinline__ void Store( - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store - { - BlockExchange(temp_storage).BlockedToStriped(items); - StoreDirectStriped(linear_tid, block_itr, items); - } - - /// Store items into a linear segment of memory, guarded by range - template - __device__ __forceinline__ void Store( - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int valid_items) ///< [in] Number of valid items to write - { - BlockExchange(temp_storage).BlockedToStriped(items); - if (linear_tid == 0) - temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads - CTA_SYNC(); - StoreDirectStriped(linear_tid, block_itr, items, temp_storage.valid_items); - } - }; - - - /** - * BLOCK_STORE_WARP_TRANSPOSE specialization of store helper - */ - template - struct StoreInternal - { - enum - { - WARP_THREADS 
= CUB_WARP_THREADS(PTX_ARCH) - }; - - // Assert BLOCK_THREADS must be a multiple of WARP_THREADS - CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); - - // BlockExchange utility type for keys - typedef BlockExchange BlockExchange; - - /// Shared memory storage layout type - struct _TempStorage : BlockExchange::TempStorage - { - /// Temporary storage for partially-full block guard - volatile int valid_items; - }; - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - /// Thread reference to shared storage - _TempStorage &temp_storage; - - /// Linear thread-id - int linear_tid; - - /// Constructor - __device__ __forceinline__ StoreInternal( - TempStorage &temp_storage, - int linear_tid) - : - temp_storage(temp_storage.Alias()), - linear_tid(linear_tid) - {} - - /// Store items into a linear segment of memory - template - __device__ __forceinline__ void Store( - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store - { - BlockExchange(temp_storage).BlockedToWarpStriped(items); - StoreDirectWarpStriped(linear_tid, block_itr, items); - } - - /// Store items into a linear segment of memory, guarded by range - template - __device__ __forceinline__ void Store( - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int valid_items) ///< [in] Number of valid items to write - { - BlockExchange(temp_storage).BlockedToWarpStriped(items); - if (linear_tid == 0) - temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads - CTA_SYNC(); - StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items); - } - }; - - - /** - * BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED specialization of store helper - */ - template - struct StoreInternal - { - enum - { - WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH) - }; - - // Assert BLOCK_THREADS must be a multiple of WARP_THREADS - CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); - - // BlockExchange utility type for keys - typedef BlockExchange BlockExchange; - - /// Shared memory storage layout type - struct _TempStorage : BlockExchange::TempStorage - { - /// Temporary storage for partially-full block guard - volatile int valid_items; - }; - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - /// Thread reference to shared storage - _TempStorage &temp_storage; - - /// Linear thread-id - int linear_tid; - - /// Constructor - __device__ __forceinline__ StoreInternal( - TempStorage &temp_storage, - int linear_tid) - : - temp_storage(temp_storage.Alias()), - linear_tid(linear_tid) - {} - - /// Store items into a linear segment of memory - template - __device__ __forceinline__ void Store( - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store - { - BlockExchange(temp_storage).BlockedToWarpStriped(items); - StoreDirectWarpStriped(linear_tid, block_itr, items); - } - - /// Store items into a linear segment of memory, guarded by range - template - __device__ __forceinline__ void Store( - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T 
(&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int valid_items) ///< [in] Number of valid items to write - { - BlockExchange(temp_storage).BlockedToWarpStriped(items); - if (linear_tid == 0) - temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads - CTA_SYNC(); - StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items); - } - }; - - /****************************************************************************** - * Type definitions - ******************************************************************************/ - - /// Internal load implementation to use - typedef StoreInternal InternalStore; - - - /// Shared memory storage layout type - typedef typename InternalStore::TempStorage _TempStorage; - - - /****************************************************************************** - * Utility methods - ******************************************************************************/ - - /// Internal storage allocator - __device__ __forceinline__ _TempStorage& PrivateStorage() - { - __shared__ _TempStorage private_storage; - return private_storage; - } - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - /// Thread reference to shared storage - _TempStorage &temp_storage; - - /// Linear thread-id - int linear_tid; - -public: - - - /// \smemstorage{BlockStore} - struct TempStorage : Uninitialized<_TempStorage> {}; - - - /******************************************************************//** - * \name Collective constructors - *********************************************************************/ - //@{ - - /** - * \brief Collective constructor using a private static allocation of shared memory as temporary storage. - */ - __device__ __forceinline__ BlockStore() - : - temp_storage(PrivateStorage()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. - */ - __device__ __forceinline__ BlockStore( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - //@} end member group - /******************************************************************//** - * \name Data movement - *********************************************************************/ - //@{ - - - /** - * \brief Store items into a linear segment of memory. - * - * \par - * - \blocked - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates the storing of a "blocked" arrangement - * of 512 integers across 128 threads (where each thread owns 4 consecutive items) - * into a linear segment of memory. The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE, - * meaning items are locally reordered among threads so that memory references will be - * efficiently coalesced using a warp-striped access pattern. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(int *d_data, ...) 
- * { - * // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each - * typedef cub::BlockStore BlockStore; - * - * // Allocate shared memory for BlockStore - * __shared__ typename BlockStore::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Store items to linear memory - * int thread_data[4]; - * BlockStore(temp_storage).Store(d_data, thread_data); - * - * \endcode - * \par - * Suppose the set of \p thread_data across the block of threads is - * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. - * The output \p d_data will be 0, 1, 2, 3, 4, 5, .... - * - */ - template - __device__ __forceinline__ void Store( - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store - { - InternalStore(temp_storage, linear_tid).Store(block_itr, items); - } - - /** - * \brief Store items into a linear segment of memory, guarded by range. - * - * \par - * - \blocked - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates the guarded storing of a "blocked" arrangement - * of 512 integers across 128 threads (where each thread owns 4 consecutive items) - * into a linear segment of memory. The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE, - * meaning items are locally reordered among threads so that memory references will be - * efficiently coalesced using a warp-striped access pattern. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(int *d_data, int valid_items, ...) - * { - * // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each - * typedef cub::BlockStore BlockStore; - * - * // Allocate shared memory for BlockStore - * __shared__ typename BlockStore::TempStorage temp_storage; - * - * // Obtain a segment of consecutive items that are blocked across threads - * int thread_data[4]; - * ... - * - * // Store items to linear memory - * int thread_data[4]; - * BlockStore(temp_storage).Store(d_data, thread_data, valid_items); - * - * \endcode - * \par - * Suppose the set of \p thread_data across the block of threads is - * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] } and \p valid_items is \p 5. - * The output \p d_data will be 0, 1, 2, 3, 4, ?, ?, ?, ..., with - * only the first two threads being unmasked to store portions of valid data. - * - */ - template - __device__ __forceinline__ void Store( - OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to - T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store - int valid_items) ///< [in] Number of valid items to write - { - InternalStore(temp_storage, linear_tid).Store(block_itr, items, valid_items); - } -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/SRC/cub/block/specializations/block_histogram_atomic.cuh b/SRC/cub/block/specializations/block_histogram_atomic.cuh deleted file mode 100644 index 29db0df7..00000000 --- a/SRC/cub/block/specializations/block_histogram_atomic.cuh +++ /dev/null @@ -1,82 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
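The difference between the blocked and striped mappings that the BlockStore machinery above selects between comes down to one line of index arithmetic. The following device helpers are a minimal sketch, not code from this tree; the function names are illustrative, and BLOCK_THREADS and ITEMS_PER_THREAD mirror the template parameters of the real helpers:

    // Blocked: thread t owns the contiguous range [t*IPT, (t+1)*IPT)
    template <int BLOCK_THREADS, int ITEMS_PER_THREAD, typename T>
    __device__ void StoreBlockedSketch(int linear_tid, T *out,
                                       const T (&items)[ITEMS_PER_THREAD], int valid_items)
    {
        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
        {
            int idx = linear_tid * ITEMS_PER_THREAD + i;
            if (idx < valid_items) out[idx] = items[i];
        }
    }

    // Striped: thread t owns t, t + BLOCK_THREADS, t + 2*BLOCK_THREADS, ...
    // so in each round consecutive threads touch consecutive addresses (coalesced).
    template <int BLOCK_THREADS, int ITEMS_PER_THREAD, typename T>
    __device__ void StoreStripedSketch(int linear_tid, T *out,
                                       const T (&items)[ITEMS_PER_THREAD], int valid_items)
    {
        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
        {
            int idx = i * BLOCK_THREADS + linear_tid;
            if (idx < valid_items) out[idx] = items[i];
        }
    }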
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * The cub::BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. - */ - -#pragma once - -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \brief The BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. - */ -template -struct BlockHistogramAtomic -{ - /// Shared memory storage layout type - struct TempStorage {}; - - - /// Constructor - __device__ __forceinline__ BlockHistogramAtomic( - TempStorage &temp_storage) - {} - - - /// Composite data onto an existing histogram - template < - typename T, - typename CounterT, - int ITEMS_PER_THREAD> - __device__ __forceinline__ void Composite( - T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram - CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram - { - // Update histogram - #pragma unroll - for (int i = 0; i < ITEMS_PER_THREAD; ++i) - { - atomicAdd(histogram + items[i], 1); - } - } - -}; - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/SRC/cub/block/specializations/block_histogram_sort.cuh b/SRC/cub/block/specializations/block_histogram_sort.cuh deleted file mode 100644 index 9ef417ad..00000000 --- a/SRC/cub/block/specializations/block_histogram_sort.cuh +++ /dev/null @@ -1,226 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
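The atomic compositing strategy of BlockHistogramAtomic::Composite is easy to see end-to-end in a plain kernel. The following is a minimal sketch, not code from this tree; the kernel name and sizes are illustrative, and it assumes every sample value is below BINS:

    // Zero a shared histogram, atomicAdd one count per sample (the loop inside
    // Composite), then flush the block-local counts to the global histogram.
    template <int BLOCK_THREADS, int BINS, int ITEMS_PER_THREAD>
    __global__ void BlockHistogramSketch(const unsigned char *d_samples,
                                         unsigned int *d_histogram)
    {
        __shared__ unsigned int hist[BINS];

        // Cooperatively zero the shared histogram
        for (int bin = threadIdx.x; bin < BINS; bin += BLOCK_THREADS)
            hist[bin] = 0;
        __syncthreads();

        // Composite this thread's samples (blocked arrangement assumed)
        int base = (blockIdx.x * BLOCK_THREADS + threadIdx.x) * ITEMS_PER_THREAD;
        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
            atomicAdd(&hist[d_samples[base + i]], 1u);
        __syncthreads();

        // Flush block-local counts to the global histogram
        for (int bin = threadIdx.x; bin < BINS; bin += BLOCK_THREADS)
            atomicAdd(&d_histogram[bin], hist[bin]);
    }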
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * The cub::BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. - */ - -#pragma once - -#include "../../block/block_radix_sort.cuh" -#include "../../block/block_discontinuity.cuh" -#include "../../util_ptx.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - - -/** - * \brief The BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. - */ -template < - typename T, ///< Sample type - int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension - int ITEMS_PER_THREAD, ///< The number of samples per thread - int BINS, ///< The number of bins into which histogram samples may fall - int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension - int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension - int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective -struct BlockHistogramSort -{ - /// Constants - enum - { - /// The thread block size in threads - BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, - }; - - // Parameterize BlockRadixSort type for our thread block - typedef BlockRadixSort< - T, - BLOCK_DIM_X, - ITEMS_PER_THREAD, - NullType, - 4, - (PTX_ARCH >= 350) ? 
true : false, - BLOCK_SCAN_WARP_SCANS, - cudaSharedMemBankSizeFourByte, - BLOCK_DIM_Y, - BLOCK_DIM_Z, - PTX_ARCH> - BlockRadixSortT; - - // Parameterize BlockDiscontinuity type for our thread block - typedef BlockDiscontinuity< - T, - BLOCK_DIM_X, - BLOCK_DIM_Y, - BLOCK_DIM_Z, - PTX_ARCH> - BlockDiscontinuityT; - - /// Shared memory - union _TempStorage - { - // Storage for sorting bin values - typename BlockRadixSortT::TempStorage sort; - - struct - { - // Storage for detecting discontinuities in the tile of sorted bin values - typename BlockDiscontinuityT::TempStorage flag; - - // Storage for noting begin/end offsets of bin runs in the tile of sorted bin values - unsigned int run_begin[BINS]; - unsigned int run_end[BINS]; - }; - }; - - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - // Thread fields - _TempStorage &temp_storage; - unsigned int linear_tid; - - - /// Constructor - __device__ __forceinline__ BlockHistogramSort( - TempStorage &temp_storage) - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - // Discontinuity functor - struct DiscontinuityOp - { - // Reference to temp_storage - _TempStorage &temp_storage; - - // Constructor - __device__ __forceinline__ DiscontinuityOp(_TempStorage &temp_storage) : - temp_storage(temp_storage) - {} - - // Discontinuity predicate - __device__ __forceinline__ bool operator()(const T &a, const T &b, int b_index) - { - if (a != b) - { - // Note the begin/end offsets in shared storage - temp_storage.run_begin[b] = b_index; - temp_storage.run_end[a] = b_index; - - return true; - } - else - { - return false; - } - } - }; - - - // Composite data onto an existing histogram - template < - typename CounterT > - __device__ __forceinline__ void Composite( - T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram - CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram - { - enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD }; - - // Sort bytes in blocked arrangement - BlockRadixSortT(temp_storage.sort).Sort(items); - - CTA_SYNC(); - - // Initialize the shared memory's run_begin and run_end for each bin - int histo_offset = 0; - - #pragma unroll - for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) - { - temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE; - temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE; - } - // Finish up with guarded initialization if necessary - if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS)) - { - temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE; - temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE; - } - - CTA_SYNC(); - - int flags[ITEMS_PER_THREAD]; // unused - - // Compute head flags to demarcate contiguous runs of the same bin in the sorted tile - DiscontinuityOp flag_op(temp_storage); - BlockDiscontinuityT(temp_storage.flag).FlagHeads(flags, items, flag_op); - - // Update begin for first item - if (linear_tid == 0) temp_storage.run_begin[items[0]] = 0; - - CTA_SYNC(); - - // Composite into histogram - histo_offset = 0; - - #pragma unroll - for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) - { - int thread_offset = histo_offset + linear_tid; - CounterT count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset]; - histogram[thread_offset] += count; - } - - // Finish up with guarded composition if 
necessary - if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS)) - { - int thread_offset = histo_offset + linear_tid; - CounterT count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset]; - histogram[thread_offset] += count; - } - } - -}; - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/SRC/cub/block/specializations/block_reduce_raking.cuh b/SRC/cub/block/specializations/block_reduce_raking.cuh deleted file mode 100644 index aff97fc9..00000000 --- a/SRC/cub/block/specializations/block_reduce_raking.cuh +++ /dev/null @@ -1,226 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. - */ - -#pragma once - -#include "../../block/block_raking_layout.cuh" -#include "../../warp/warp_reduce.cuh" -#include "../../thread/thread_reduce.cuh" -#include "../../util_ptx.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \brief BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. - * - * Supports non-commutative binary reduction operators. Unlike commutative - * reduction operators (e.g., addition), the application of a non-commutative - * reduction operator (e.g, string concatenation) across a sequence of inputs must - * honor the relative ordering of items and partial reductions when applying the - * reduction operator. 
- * - * Compared to the implementation of BlockReduceRaking (which does not support - * non-commutative operators), this implementation requires a few extra - * rounds of inter-thread communication. - */ -template < - typename T, ///< Data type being reduced - int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension - int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension - int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension - int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective -struct BlockReduceRaking -{ - /// Constants - enum - { - /// The thread block size in threads - BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, - }; - - /// Layout type for padded thread block raking grid - typedef BlockRakingLayout BlockRakingLayout; - - /// WarpReduce utility type - typedef typename WarpReduce::InternalWarpReduce WarpReduce; - - /// Constants - enum - { - /// Number of raking threads - RAKING_THREADS = BlockRakingLayout::RAKING_THREADS, - - /// Number of raking elements per warp synchronous raking thread - SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH, - - /// Cooperative work can be entirely warp synchronous - WARP_SYNCHRONOUS = (RAKING_THREADS == BLOCK_THREADS), - - /// Whether or not warp-synchronous reduction should be unguarded (i.e., the warp-reduction elements is a power of two - WARP_SYNCHRONOUS_UNGUARDED = PowerOfTwo::VALUE, - - /// Whether or not accesses into smem are unguarded - RAKING_UNGUARDED = BlockRakingLayout::UNGUARDED, - - }; - - - /// Shared memory storage layout type - union _TempStorage - { - typename WarpReduce::TempStorage warp_storage; ///< Storage for warp-synchronous reduction - typename BlockRakingLayout::TempStorage raking_grid; ///< Padded thread block raking grid - }; - - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - // Thread fields - _TempStorage &temp_storage; - unsigned int linear_tid; - - - /// Constructor - __device__ __forceinline__ BlockReduceRaking( - TempStorage &temp_storage) - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - template - __device__ __forceinline__ T RakingReduction( - ReductionOp reduction_op, ///< [in] Binary scan operator - T *raking_segment, - T partial, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items - int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) - Int2Type /*iteration*/) - { - // Update partial if addend is in range - if ((IS_FULL_TILE && RAKING_UNGUARDED) || ((linear_tid * SEGMENT_LENGTH) + ITERATION < num_valid)) - { - T addend = raking_segment[ITERATION]; - partial = reduction_op(partial, addend); - } - return RakingReduction(reduction_op, raking_segment, partial, num_valid, Int2Type()); - } - - template - __device__ __forceinline__ T RakingReduction( - ReductionOp /*reduction_op*/, ///< [in] Binary scan operator - T * /*raking_segment*/, - T partial, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items - int /*num_valid*/, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) - Int2Type /*iteration*/) - { - return partial; - } - - - - /// Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. 
- template < - bool IS_FULL_TILE, - typename ReductionOp> - __device__ __forceinline__ T Reduce( - T partial, ///< [in] Calling thread's input partial reductions - int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) - ReductionOp reduction_op) ///< [in] Binary reduction operator - { - if (WARP_SYNCHRONOUS) - { - // Short-circuit directly to warp synchronous reduction (unguarded if active threads is a power-of-two) - partial = WarpReduce(temp_storage.warp_storage).template Reduce( - partial, - num_valid, - reduction_op); - } - else - { - // Place partial into shared memory grid. - *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid) = partial; - - CTA_SYNC(); - - // Reduce parallelism to one warp - if (linear_tid < RAKING_THREADS) - { - // Raking reduction in grid - T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); - partial = raking_segment[0]; - - partial = RakingReduction(reduction_op, raking_segment, partial, num_valid, Int2Type<1>()); - - int valid_raking_threads = (IS_FULL_TILE) ? - RAKING_THREADS : - (num_valid + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH; - - partial = WarpReduce(temp_storage.warp_storage).template Reduce( - partial, - valid_raking_threads, - reduction_op); - - } - } - - return partial; - } - - - /// Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. - template - __device__ __forceinline__ T Sum( - T partial, ///< [in] Calling thread's input partial reductions - int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) - { - cub::Sum reduction_op; - - return Reduce(partial, num_valid, reduction_op); - } - - - -}; - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/SRC/cub/block/specializations/block_reduce_raking_commutative_only.cuh b/SRC/cub/block/specializations/block_reduce_raking_commutative_only.cuh deleted file mode 100644 index 454fdafa..00000000 --- a/SRC/cub/block/specializations/block_reduce_raking_commutative_only.cuh +++ /dev/null @@ -1,199 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
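For orientation, the BlockReduceRaking header removed here is the engine behind cub::BlockReduce when BLOCK_REDUCE_RAKING is requested: all threads deposit partials into a padded shared-memory grid, the lanes of a single warp each serially rake one segment of that grid, and a final warp-synchronous reduction combines the raked totals, preserving input order so non-commutative operators remain correct. A hedged usage sketch; the kernel name and the 256-thread block are illustrative assumptions:

#include <cub/cub.cuh>

__global__ void BlockSumRaking(const int *d_in, int *d_block_sums)
{
    typedef cub::BlockReduce<int, 256, cub::BLOCK_REDUCE_RAKING> BlockReduceT;
    __shared__ typename BlockReduceT::TempStorage temp_storage;

    // One partial per thread; ordering survives the raking grid, so the
    // same path also serves non-commutative reduction operators.
    int partial   = d_in[blockIdx.x * 256 + threadIdx.x];
    int block_sum = BlockReduceT(temp_storage).Sum(partial);

    // Per the contract documented above, the aggregate is valid in thread 0 only.
    if (threadIdx.x == 0)
        d_block_sums[blockIdx.x] = block_sum;
}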
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block. Does not support non-commutative reduction operators. - */ - -#pragma once - -#include "block_reduce_raking.cuh" -#include "../../warp/warp_reduce.cuh" -#include "../../thread/thread_reduce.cuh" -#include "../../util_ptx.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \brief BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block. Does not support non-commutative reduction operators. Does not support block sizes that are not a multiple of the warp size. - */ -template < - typename T, ///< Data type being reduced - int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension - int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension - int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension - int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective -struct BlockReduceRakingCommutativeOnly -{ - /// Constants - enum - { - /// The thread block size in threads - BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, - }; - - // The fall-back implementation to use when BLOCK_THREADS is not a multiple of the warp size or not all threads have valid values - typedef BlockReduceRaking FallBack; - - /// Constants - enum - { - /// Number of warp threads - WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), - - /// Whether or not to use fall-back - USE_FALLBACK = ((BLOCK_THREADS % WARP_THREADS != 0) || (BLOCK_THREADS <= WARP_THREADS)), - - /// Number of raking threads - RAKING_THREADS = WARP_THREADS, - - /// Number of threads actually sharing items with the raking threads - SHARING_THREADS = CUB_MAX(1, BLOCK_THREADS - RAKING_THREADS), - - /// Number of raking elements per warp synchronous raking thread - SEGMENT_LENGTH = SHARING_THREADS / WARP_THREADS, - }; - - /// WarpReduce utility type - typedef WarpReduce WarpReduce; - - /// Layout type for padded thread block raking grid - typedef BlockRakingLayout BlockRakingLayout; - - /// Shared memory storage layout type - union _TempStorage - { - struct - { - typename WarpReduce::TempStorage warp_storage; ///< Storage for warp-synchronous reduction - typename BlockRakingLayout::TempStorage raking_grid; ///< Padded thread block raking grid - }; - typename FallBack::TempStorage fallback_storage; ///< Fall-back storage for non-commutative block scan - }; - - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - // Thread fields - _TempStorage &temp_storage; - unsigned int linear_tid; - - - /// Constructor - __device__ __forceinline__ BlockReduceRakingCommutativeOnly( - TempStorage &temp_storage) - : - temp_storage(temp_storage.Alias()), - 
linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - /// Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. - template - __device__ __forceinline__ T Sum( - T partial, ///< [in] Calling thread's input partial reductions - int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) - { - if (USE_FALLBACK || !FULL_TILE) - { - return FallBack(temp_storage.fallback_storage).template Sum(partial, num_valid); - } - else - { - // Place partial into shared memory grid - if (linear_tid >= RAKING_THREADS) - *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid - RAKING_THREADS) = partial; - - CTA_SYNC(); - - // Reduce parallelism to one warp - if (linear_tid < RAKING_THREADS) - { - // Raking reduction in grid - T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); - partial = internal::ThreadReduce(raking_segment, cub::Sum(), partial); - - // Warpscan - partial = WarpReduce(temp_storage.warp_storage).Sum(partial); - } - } - - return partial; - } - - - /// Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. - template < - bool FULL_TILE, - typename ReductionOp> - __device__ __forceinline__ T Reduce( - T partial, ///< [in] Calling thread's input partial reductions - int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) - ReductionOp reduction_op) ///< [in] Binary reduction operator - { - if (USE_FALLBACK || !FULL_TILE) - { - return FallBack(temp_storage.fallback_storage).template Reduce(partial, num_valid, reduction_op); - } - else - { - // Place partial into shared memory grid - if (linear_tid >= RAKING_THREADS) - *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid - RAKING_THREADS) = partial; - - CTA_SYNC(); - - // Reduce parallelism to one warp - if (linear_tid < RAKING_THREADS) - { - // Raking reduction in grid - T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); - partial = internal::ThreadReduce(raking_segment, reduction_op, partial); - - // Warpscan - partial = WarpReduce(temp_storage.warp_storage).Reduce(partial, reduction_op); - } - } - - return partial; - } - -}; - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/SRC/cub/block/specializations/block_reduce_warp_reductions.cuh b/SRC/cub/block/specializations/block_reduce_warp_reductions.cuh deleted file mode 100644 index 10ba303b..00000000 --- a/SRC/cub/block/specializations/block_reduce_warp_reductions.cuh +++ /dev/null @@ -1,218 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
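The commutative-only raking variant deleted just above gains its advantage by keeping the raking warp out of the sharing step: only the other BLOCK_THREADS - RAKING_THREADS threads write partials to the grid, so each raking lane reduces a shorter segment. This is valid only when partials may be reordered, and for partial tiles or block sizes that are not a larger multiple of the warp size it falls back to the general BlockReduceRaking above. Selecting it explicitly, under the same illustrative assumptions as the previous sketch:

#include <cub/cub.cuh>

__global__ void BlockSumCommutative(const float *d_in, float *d_block_sums)
{
    // Addition commutes, so opting into the cheaper specialization is safe;
    // choosing this enum is the caller's promise that reordering is harmless.
    typedef cub::BlockReduce<float, 256,
        cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY> BlockReduceT;
    __shared__ typename BlockReduceT::TempStorage temp_storage;

    float block_sum =
        BlockReduceT(temp_storage).Sum(d_in[blockIdx.x * 256 + threadIdx.x]);

    if (threadIdx.x == 0)
        d_block_sums[blockIdx.x] = block_sum;
}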
- * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. - */ - -#pragma once - -#include "../../warp/warp_reduce.cuh" -#include "../../util_ptx.cuh" -#include "../../util_arch.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \brief BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. - */ -template < - typename T, ///< Data type being reduced - int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension - int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension - int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension - int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective -struct BlockReduceWarpReductions -{ - /// Constants - enum - { - /// The thread block size in threads - BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, - - /// Number of warp threads - WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), - - /// Number of active warps - WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, - - /// The logical warp size for warp reductions - LOGICAL_WARP_SIZE = CUB_MIN(BLOCK_THREADS, WARP_THREADS), - - /// Whether or not the logical warp size evenly divides the thread block size - EVEN_WARP_MULTIPLE = (BLOCK_THREADS % LOGICAL_WARP_SIZE == 0) - }; - - - /// WarpReduce utility type - typedef typename WarpReduce::InternalWarpReduce WarpReduce; - - - /// Shared memory storage layout type - struct _TempStorage - { - typename WarpReduce::TempStorage warp_reduce[WARPS]; ///< Buffer for warp-synchronous scan - T warp_aggregates[WARPS]; ///< Shared totals from each warp-synchronous scan - T block_prefix; ///< Shared prefix for the entire thread block - }; - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - // Thread fields - _TempStorage &temp_storage; - int linear_tid; - int warp_id; - int lane_id; - - - /// Constructor - __device__ __forceinline__ BlockReduceWarpReductions( - TempStorage &temp_storage) - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), - warp_id((WARPS == 1) ? 
0 : linear_tid / WARP_THREADS), - lane_id(LaneId()) - {} - - - template - __device__ __forceinline__ T ApplyWarpAggregates( - ReductionOp reduction_op, ///< [in] Binary scan operator - T warp_aggregate, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items - int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) - Int2Type /*successor_warp*/) - { - if (FULL_TILE || (SUCCESSOR_WARP * LOGICAL_WARP_SIZE < num_valid)) - { - T addend = temp_storage.warp_aggregates[SUCCESSOR_WARP]; - warp_aggregate = reduction_op(warp_aggregate, addend); - } - return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid, Int2Type()); - } - - template - __device__ __forceinline__ T ApplyWarpAggregates( - ReductionOp /*reduction_op*/, ///< [in] Binary scan operator - T warp_aggregate, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items - int /*num_valid*/, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) - Int2Type /*successor_warp*/) - { - return warp_aggregate; - } - - - /// Returns block-wide aggregate in thread0. - template < - bool FULL_TILE, - typename ReductionOp> - __device__ __forceinline__ T ApplyWarpAggregates( - ReductionOp reduction_op, ///< [in] Binary scan operator - T warp_aggregate, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items - int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) - { - // Share lane aggregates - if (lane_id == 0) - { - temp_storage.warp_aggregates[warp_id] = warp_aggregate; - } - - CTA_SYNC(); - - // Update total aggregate in warp 0, lane 0 - if (linear_tid == 0) - { - warp_aggregate = ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid, Int2Type<1>()); - } - - return warp_aggregate; - } - - - /// Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. - template - __device__ __forceinline__ T Sum( - T input, ///< [in] Calling thread's input partial reductions - int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) - { - cub::Sum reduction_op; - int warp_offset = (warp_id * LOGICAL_WARP_SIZE); - int warp_num_valid = ((FULL_TILE && EVEN_WARP_MULTIPLE) || (warp_offset + LOGICAL_WARP_SIZE <= num_valid)) ? - LOGICAL_WARP_SIZE : - num_valid - warp_offset; - - // Warp reduction in every warp - T warp_aggregate = WarpReduce(temp_storage.warp_reduce[warp_id]).template Reduce<(FULL_TILE && EVEN_WARP_MULTIPLE)>( - input, - warp_num_valid, - cub::Sum()); - - // Update outputs and block_aggregate with warp-wide aggregates from lane-0s - return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid); - } - - - /// Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. - template < - bool FULL_TILE, - typename ReductionOp> - __device__ __forceinline__ T Reduce( - T input, ///< [in] Calling thread's input partial reductions - int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) - ReductionOp reduction_op) ///< [in] Binary reduction operator - { - int warp_offset = warp_id * LOGICAL_WARP_SIZE; - int warp_num_valid = ((FULL_TILE && EVEN_WARP_MULTIPLE) || (warp_offset + LOGICAL_WARP_SIZE <= num_valid)) ? 
- LOGICAL_WARP_SIZE : - num_valid - warp_offset; - - // Warp reduction in every warp - T warp_aggregate = WarpReduce(temp_storage.warp_reduce[warp_id]).template Reduce<(FULL_TILE && EVEN_WARP_MULTIPLE)>( - input, - warp_num_valid, - reduction_op); - - // Update outputs and block_aggregate with warp-wide aggregates from lane-0s - return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid); - } - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/SRC/cub/block/specializations/block_scan_raking.cuh b/SRC/cub/block/specializations/block_scan_raking.cuh deleted file mode 100644 index a855cda0..00000000 --- a/SRC/cub/block/specializations/block_scan_raking.cuh +++ /dev/null @@ -1,666 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - - -/** - * \file - * cub::BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA thread block. - */ - -#pragma once - -#include "../../util_ptx.cuh" -#include "../../util_arch.cuh" -#include "../../block/block_raking_layout.cuh" -#include "../../thread/thread_reduce.cuh" -#include "../../thread/thread_scan.cuh" -#include "../../warp/warp_scan.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \brief BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA thread block. 
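BlockReduceWarpReductions, whose removal also completes above, takes the opposite route from raking: every warp reduces its own partials concurrently, lane 0 of each warp publishes its warp aggregate to shared memory, and thread 0 then folds the WARPS aggregates together (the deleted ApplyWarpAggregates chain unrolls that fold at compile time). A dependency-free sketch of the same two-stage shape, assuming a full tile and a block size that is a multiple of 32:

// Two-stage block sum in the style of BlockReduceWarpReductions; a sketch,
// not the deleted implementation itself.
template <int BLOCK_THREADS>
__device__ int BlockSumWarpReductions(int partial)
{
    const int WARPS = BLOCK_THREADS / 32;
    __shared__ int warp_aggregates[WARPS];

    int lane_id = threadIdx.x % 32;
    int warp_id = threadIdx.x / 32;

    // Stage 1: all warps reduce concurrently with register shuffles.
    for (int offset = 16; offset > 0; offset >>= 1)
        partial += __shfl_down_sync(0xffffffffu, partial, offset);

    // Lane 0 of each warp shares its warp-wide total.
    if (lane_id == 0)
        warp_aggregates[warp_id] = partial;
    __syncthreads();

    // Stage 2: thread 0 serially combines the per-warp totals; the deleted
    // header unrolls this loop with Int2Type template recursion instead.
    int block_sum = 0;
    if (threadIdx.x == 0)
        for (int w = 0; w < WARPS; ++w)
            block_sum += warp_aggregates[w];

    return block_sum;   // valid in thread 0 only, matching the contract above
}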
- */ -template < - typename T, ///< Data type being scanned - int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension - int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension - int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension - bool MEMOIZE, ///< Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure - int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective -struct BlockScanRaking -{ - //--------------------------------------------------------------------- - // Types and constants - //--------------------------------------------------------------------- - - /// Constants - enum - { - /// The thread block size in threads - BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, - }; - - /// Layout type for padded thread block raking grid - typedef BlockRakingLayout BlockRakingLayout; - - /// Constants - enum - { - /// Number of raking threads - RAKING_THREADS = BlockRakingLayout::RAKING_THREADS, - - /// Number of raking elements per warp synchronous raking thread - SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH, - - /// Cooperative work can be entirely warp synchronous - WARP_SYNCHRONOUS = (BLOCK_THREADS == RAKING_THREADS), - }; - - /// WarpScan utility type - typedef WarpScan WarpScan; - - /// Shared memory storage layout type - struct _TempStorage - { - typename WarpScan::TempStorage warp_scan; ///< Buffer for warp-synchronous scan - typename BlockRakingLayout::TempStorage raking_grid; ///< Padded thread block raking grid - T block_aggregate; ///< Block aggregate - }; - - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - //--------------------------------------------------------------------- - // Per-thread fields - //--------------------------------------------------------------------- - - // Thread fields - _TempStorage &temp_storage; - unsigned int linear_tid; - T cached_segment[SEGMENT_LENGTH]; - - - //--------------------------------------------------------------------- - // Utility methods - //--------------------------------------------------------------------- - - /// Templated reduction - template - __device__ __forceinline__ T GuardedReduce( - T* raking_ptr, ///< [in] Input array - ScanOp scan_op, ///< [in] Binary reduction operator - T raking_partial, ///< [in] Prefix to seed reduction with - Int2Type /*iteration*/) - { - if ((BlockRakingLayout::UNGUARDED) || (((linear_tid * SEGMENT_LENGTH) + ITERATION) < BLOCK_THREADS)) - { - T addend = raking_ptr[ITERATION]; - raking_partial = scan_op(raking_partial, addend); - } - - return GuardedReduce(raking_ptr, scan_op, raking_partial, Int2Type()); - } - - - /// Templated reduction (base case) - template - __device__ __forceinline__ T GuardedReduce( - T* /*raking_ptr*/, ///< [in] Input array - ScanOp /*scan_op*/, ///< [in] Binary reduction operator - T raking_partial, ///< [in] Prefix to seed reduction with - Int2Type /*iteration*/) - { - return raking_partial; - } - - - /// Templated copy - template - __device__ __forceinline__ void CopySegment( - T* out, ///< [out] Out array - T* in, ///< [in] Input array - Int2Type /*iteration*/) - { - out[ITERATION] = in[ITERATION]; - CopySegment(out, in, Int2Type()); - } - - - /// Templated copy (base case) - __device__ __forceinline__ void CopySegment( - T* /*out*/, ///< [out] Out array - T* /*in*/, ///< [in] Input array - Int2Type /*iteration*/) - {} 
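The empty Int2Type base cases that close GuardedReduce and CopySegment above are the recurring CUB idiom for full compile-time unrolling: the loop index travels as a distinct type, each step is its own template instantiation, and an overload on the terminal index ends the recursion with no residual code. A standalone illustration of the idiom; SegmentSum and its names are invented for the example:

// Compile-time loop over a fixed-length segment via type-tagged recursion.
template <int N> struct Int2Type { enum { VALUE = N }; };

template <int SEGMENT_LENGTH>
struct SegmentSum
{
    // Recursive case: fold element ITERATION into the partial, advance the tag.
    template <int ITERATION>
    static __device__ __forceinline__ int Run(
        const int *seg, int partial, Int2Type<ITERATION>)
    {
        return Run(seg, partial + seg[ITERATION], Int2Type<ITERATION + 1>());
    }

    // Base case: Int2Type<SEGMENT_LENGTH> is a concrete type in this scope,
    // so this non-template overload wins once the index reaches the end.
    static __device__ __forceinline__ int Run(
        const int *, int partial, Int2Type<SEGMENT_LENGTH>)
    {
        return partial;
    }
};

// Usage: a fully unrolled sum of a 4-element raking segment.
//   int total = SegmentSum<4>::Run(raking_ptr, 0, Int2Type<0>());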
- - - /// Performs upsweep raking reduction, returning the aggregate - template - __device__ __forceinline__ T Upsweep( - ScanOp scan_op) - { - T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); - - // Read data into registers - CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>()); - - T raking_partial = cached_segment[0]; - - return GuardedReduce(cached_segment, scan_op, raking_partial, Int2Type<1>()); - } - - - /// Performs exclusive downsweep raking scan - template - __device__ __forceinline__ void ExclusiveDownsweep( - ScanOp scan_op, - T raking_partial, - bool apply_prefix = true) - { - T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); - - // Read data back into registers - if (!MEMOIZE) - { - CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>()); - } - - internal::ThreadScanExclusive(cached_segment, cached_segment, scan_op, raking_partial, apply_prefix); - - // Write data back to smem - CopySegment(smem_raking_ptr, cached_segment, Int2Type<0>()); - } - - - /// Performs inclusive downsweep raking scan - template - __device__ __forceinline__ void InclusiveDownsweep( - ScanOp scan_op, - T raking_partial, - bool apply_prefix = true) - { - T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); - - // Read data back into registers - if (!MEMOIZE) - { - CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>()); - } - - internal::ThreadScanInclusive(cached_segment, cached_segment, scan_op, raking_partial, apply_prefix); - - // Write data back to smem - CopySegment(smem_raking_ptr, cached_segment, Int2Type<0>()); - } - - - //--------------------------------------------------------------------- - // Constructors - //--------------------------------------------------------------------- - - /// Constructor - __device__ __forceinline__ BlockScanRaking( - TempStorage &temp_storage) - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) - {} - - - //--------------------------------------------------------------------- - // Exclusive scans - //--------------------------------------------------------------------- - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined. 
- template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan operator - { - if (WARP_SYNCHRONOUS) - { - // Short-circuit directly to warp-synchronous scan - WarpScan(temp_storage.warp_scan).ExclusiveScan(input, exclusive_output, scan_op); - } - else - { - // Place thread partial into shared memory raking grid - T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); - *placement_ptr = input; - - CTA_SYNC(); - - // Reduce parallelism down to just raking threads - if (linear_tid < RAKING_THREADS) - { - // Raking upsweep reduction across shared partials - T upsweep_partial = Upsweep(scan_op); - - // Warp-synchronous scan - T exclusive_partial; - WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, scan_op); - - // Exclusive raking downsweep scan - ExclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); - } - - CTA_SYNC(); - - // Grab thread prefix from shared memory - exclusive_output = *placement_ptr; - } - } - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input items - T &output, ///< [out] Calling thread's output items (may be aliased to \p input) - const T &initial_value, ///< [in] Initial value to seed the exclusive scan - ScanOp scan_op) ///< [in] Binary scan operator - { - if (WARP_SYNCHRONOUS) - { - // Short-circuit directly to warp-synchronous scan - WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, initial_value, scan_op); - } - else - { - // Place thread partial into shared memory raking grid - T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); - *placement_ptr = input; - - CTA_SYNC(); - - // Reduce parallelism down to just raking threads - if (linear_tid < RAKING_THREADS) - { - // Raking upsweep reduction across shared partials - T upsweep_partial = Upsweep(scan_op); - - // Exclusive Warp-synchronous scan - T exclusive_partial; - WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, initial_value, scan_op); - - // Exclusive raking downsweep scan - ExclusiveDownsweep(scan_op, exclusive_partial); - } - - CTA_SYNC(); - - // Grab exclusive partial from shared memory - output = *placement_ptr; - } - } - - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. 
- template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items - { - if (WARP_SYNCHRONOUS) - { - // Short-circuit directly to warp-synchronous scan - WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, scan_op, block_aggregate); - } - else - { - // Place thread partial into shared memory raking grid - T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); - *placement_ptr = input; - - CTA_SYNC(); - - // Reduce parallelism down to just raking threads - if (linear_tid < RAKING_THREADS) - { - // Raking upsweep reduction across shared partials - T upsweep_partial= Upsweep(scan_op); - - // Warp-synchronous scan - T inclusive_partial; - T exclusive_partial; - WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, scan_op); - - // Exclusive raking downsweep scan - ExclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); - - // Broadcast aggregate to all threads - if (linear_tid == RAKING_THREADS - 1) - temp_storage.block_aggregate = inclusive_partial; - } - - CTA_SYNC(); - - // Grab thread prefix from shared memory - output = *placement_ptr; - - // Retrieve block aggregate - block_aggregate = temp_storage.block_aggregate; - } - } - - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input items - T &output, ///< [out] Calling thread's output items (may be aliased to \p input) - const T &initial_value, ///< [in] Initial value to seed the exclusive scan - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items - { - if (WARP_SYNCHRONOUS) - { - // Short-circuit directly to warp-synchronous scan - WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate); - } - else - { - // Place thread partial into shared memory raking grid - T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); - *placement_ptr = input; - - CTA_SYNC(); - - // Reduce parallelism down to just raking threads - if (linear_tid < RAKING_THREADS) - { - // Raking upsweep reduction across shared partials - T upsweep_partial = Upsweep(scan_op); - - // Warp-synchronous scan - T exclusive_partial; - WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, initial_value, scan_op, block_aggregate); - - // Exclusive raking downsweep scan - ExclusiveDownsweep(scan_op, exclusive_partial); - - // Broadcast aggregate to other threads - if (linear_tid == 0) - temp_storage.block_aggregate = block_aggregate; - } - - CTA_SYNC(); - - // Grab exclusive partial from shared memory - output = *placement_ptr; - - // Retrieve block aggregate - block_aggregate = temp_storage.block_aggregate; - } - } - - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. 
the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - template < - typename ScanOp, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. - { - if (WARP_SYNCHRONOUS) - { - // Short-circuit directly to warp-synchronous scan - T block_aggregate; - WarpScan warp_scan(temp_storage.warp_scan); - warp_scan.ExclusiveScan(input, output, scan_op, block_aggregate); - - // Obtain warp-wide prefix in lane0, then broadcast to other lanes - T block_prefix = block_prefix_callback_op(block_aggregate); - block_prefix = warp_scan.Broadcast(block_prefix, 0); - - output = scan_op(block_prefix, output); - if (linear_tid == 0) - output = block_prefix; - } - else - { - // Place thread partial into shared memory raking grid - T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); - *placement_ptr = input; - - CTA_SYNC(); - - // Reduce parallelism down to just raking threads - if (linear_tid < RAKING_THREADS) - { - WarpScan warp_scan(temp_storage.warp_scan); - - // Raking upsweep reduction across shared partials - T upsweep_partial = Upsweep(scan_op); - - // Warp-synchronous scan - T exclusive_partial, block_aggregate; - warp_scan.ExclusiveScan(upsweep_partial, exclusive_partial, scan_op, block_aggregate); - - // Obtain block-wide prefix in lane0, then broadcast to other lanes - T block_prefix = block_prefix_callback_op(block_aggregate); - block_prefix = warp_scan.Broadcast(block_prefix, 0); - - // Update prefix with warpscan exclusive partial - T downsweep_prefix = scan_op(block_prefix, exclusive_partial); - if (linear_tid == 0) - downsweep_prefix = block_prefix; - - // Exclusive raking downsweep scan - ExclusiveDownsweep(scan_op, downsweep_prefix); - } - - CTA_SYNC(); - - // Grab thread prefix from shared memory - output = *placement_ptr; - } - } - - - //--------------------------------------------------------------------- - // Inclusive scans - //--------------------------------------------------------------------- - - /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. 
- template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan operator - { - if (WARP_SYNCHRONOUS) - { - // Short-circuit directly to warp-synchronous scan - WarpScan(temp_storage.warp_scan).InclusiveScan(input, output, scan_op); - } - else - { - // Place thread partial into shared memory raking grid - T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); - *placement_ptr = input; - - CTA_SYNC(); - - // Reduce parallelism down to just raking threads - if (linear_tid < RAKING_THREADS) - { - // Raking upsweep reduction across shared partials - T upsweep_partial = Upsweep(scan_op); - - // Exclusive Warp-synchronous scan - T exclusive_partial; - WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, scan_op); - - // Inclusive raking downsweep scan - InclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); - } - - CTA_SYNC(); - - // Grab thread prefix from shared memory - output = *placement_ptr; - } - } - - - /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. - template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items - { - if (WARP_SYNCHRONOUS) - { - // Short-circuit directly to warp-synchronous scan - WarpScan(temp_storage.warp_scan).InclusiveScan(input, output, scan_op, block_aggregate); - } - else - { - // Place thread partial into shared memory raking grid - T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); - *placement_ptr = input; - - CTA_SYNC(); - - // Reduce parallelism down to just raking threads - if (linear_tid < RAKING_THREADS) - { - // Raking upsweep reduction across shared partials - T upsweep_partial = Upsweep(scan_op); - - // Warp-synchronous scan - T inclusive_partial; - T exclusive_partial; - WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, scan_op); - - // Inclusive raking downsweep scan - InclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); - - // Broadcast aggregate to all threads - if (linear_tid == RAKING_THREADS - 1) - temp_storage.block_aggregate = inclusive_partial; - } - - CTA_SYNC(); - - // Grab thread prefix from shared memory - output = *placement_ptr; - - // Retrieve block aggregate - block_aggregate = temp_storage.block_aggregate; - } - } - - - /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
- template < - typename ScanOp, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. - { - if (WARP_SYNCHRONOUS) - { - // Short-circuit directly to warp-synchronous scan - T block_aggregate; - WarpScan warp_scan(temp_storage.warp_scan); - warp_scan.InclusiveScan(input, output, scan_op, block_aggregate); - - // Obtain warp-wide prefix in lane0, then broadcast to other lanes - T block_prefix = block_prefix_callback_op(block_aggregate); - block_prefix = warp_scan.Broadcast(block_prefix, 0); - - // Update prefix with exclusive warpscan partial - output = scan_op(block_prefix, output); - } - else - { - // Place thread partial into shared memory raking grid - T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); - *placement_ptr = input; - - CTA_SYNC(); - - // Reduce parallelism down to just raking threads - if (linear_tid < RAKING_THREADS) - { - WarpScan warp_scan(temp_storage.warp_scan); - - // Raking upsweep reduction across shared partials - T upsweep_partial = Upsweep(scan_op); - - // Warp-synchronous scan - T exclusive_partial, block_aggregate; - warp_scan.ExclusiveScan(upsweep_partial, exclusive_partial, scan_op, block_aggregate); - - // Obtain block-wide prefix in lane0, then broadcast to other lanes - T block_prefix = block_prefix_callback_op(block_aggregate); - block_prefix = warp_scan.Broadcast(block_prefix, 0); - - // Update prefix with warpscan exclusive partial - T downsweep_prefix = scan_op(block_prefix, exclusive_partial); - if (linear_tid == 0) - downsweep_prefix = block_prefix; - - // Inclusive raking downsweep scan - InclusiveDownsweep(scan_op, downsweep_prefix); - } - - CTA_SYNC(); - - // Grab thread prefix from shared memory - output = *placement_ptr; - } - } - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/SRC/cub/block/specializations/block_scan_warp_scans.cuh b/SRC/cub/block/specializations/block_scan_warp_scans.cuh deleted file mode 100644 index 85e4d613..00000000 --- a/SRC/cub/block/specializations/block_scan_warp_scans.cuh +++ /dev/null @@ -1,392 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. 
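The block_scan_raking.cuh removal that ends above implements the three-phase block scan used by cub::BlockScan under BLOCK_SCAN_RAKING and BLOCK_SCAN_RAKING_MEMOIZE: threads deposit partials into the padded raking grid, RAKING_THREADS lanes perform the serial upsweep and a warp-synchronous scan of the segment totals, and the exclusive or inclusive downsweep writes results back through the grid (MEMOIZE keeps the upsweep's segment cached in registers so the downsweep can skip one shared-memory read). The sketch below drives that path through the public interface and also exercises the block_prefix_callback_op hook documented above; the RunningPrefix functor, kernel name, and tile sizes are assumptions for the example:

#include <cub/cub.cuh>

// Carries a running total across tiles; the first warp invokes it once per
// tile and lane 0's return value seeds that tile's scan.
struct RunningPrefix
{
    int total;
    __device__ RunningPrefix(int init) : total(init) {}
    __device__ int operator()(int tile_aggregate)
    {
        int old_prefix = total;
        total += tile_aggregate;
        return old_prefix;      // exclusive prefix for the incoming tile
    }
};

// Exclusive prefix sum over num_items with a single 128-thread block,
// one tile per iteration (launch as ExclusiveSumSingleBlock<<<1, 128>>>).
__global__ void ExclusiveSumSingleBlock(
    const int *d_in, int *d_out, int num_items)
{
    typedef cub::BlockScan<int, 128,
                           cub::BLOCK_SCAN_RAKING_MEMOIZE> BlockScanT;
    __shared__ typename BlockScanT::TempStorage temp_storage;

    RunningPrefix prefix(0);

    for (int base = 0; base < num_items; base += 128)
    {
        int idx  = base + threadIdx.x;
        int item = (idx < num_items) ? d_in[idx] : 0;   // pad the last tile

        int result;
        BlockScanT(temp_storage).ExclusiveSum(item, result, prefix);
        __syncthreads();        // temp_storage is reused by the next tile

        if (idx < num_items)
            d_out[idx] = result;
    }
}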
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::BlockScanWarpscans provides warpscan-based variants of parallel prefix scan across a CUDA thread block. - */ - -#pragma once - -#include "../../util_arch.cuh" -#include "../../util_ptx.cuh" -#include "../../warp/warp_scan.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread block. - */ -template < - typename T, - int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension - int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension - int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension - int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective -struct BlockScanWarpScans -{ - //--------------------------------------------------------------------- - // Types and constants - //--------------------------------------------------------------------- - - /// Constants - enum - { - /// Number of warp threads - WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), - - /// The thread block size in threads - BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, - - /// Number of active warps - WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, - }; - - /// WarpScan utility type - typedef WarpScan WarpScanT; - - /// WarpScan utility type - typedef WarpScan WarpAggregateScan; - - /// Shared memory storage layout type - - struct __align__(32) _TempStorage - { - T warp_aggregates[WARPS]; - typename WarpScanT::TempStorage warp_scan[WARPS]; ///< Buffer for warp-synchronous scans - T block_prefix; ///< Shared prefix for the entire thread block - }; - - - /// Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - //--------------------------------------------------------------------- - // Per-thread fields - //--------------------------------------------------------------------- - - // Thread fields - _TempStorage &temp_storage; - unsigned int linear_tid; - unsigned int warp_id; - unsigned int lane_id; - - - //--------------------------------------------------------------------- - // Constructors - //--------------------------------------------------------------------- - - /// Constructor - __device__ __forceinline__ BlockScanWarpScans( - TempStorage &temp_storage) - : - temp_storage(temp_storage.Alias()), - linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), - warp_id((WARPS == 1) ? 
0 : linear_tid / WARP_THREADS), - lane_id(LaneId()) - {} - - - //--------------------------------------------------------------------- - // Utility methods - //--------------------------------------------------------------------- - - template - __device__ __forceinline__ void ApplyWarpAggregates( - T &warp_prefix, ///< [out] The calling thread's partial reduction - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items - Int2Type /*addend_warp*/) - { - if (warp_id == WARP) - warp_prefix = block_aggregate; - - T addend = temp_storage.warp_aggregates[WARP]; - block_aggregate = scan_op(block_aggregate, addend); - - ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type()); - } - - template - __device__ __forceinline__ void ApplyWarpAggregates( - T &/*warp_prefix*/, ///< [out] The calling thread's partial reduction - ScanOp /*scan_op*/, ///< [in] Binary scan operator - T &/*block_aggregate*/, ///< [out] Threadblock-wide aggregate reduction of input items - Int2Type /*addend_warp*/) - {} - - - /// Use the warp-wide aggregates to compute the calling warp's prefix. Also returns block-wide aggregate in all threads. - template - __device__ __forceinline__ T ComputeWarpPrefix( - ScanOp scan_op, ///< [in] Binary scan operator - T warp_aggregate, ///< [in] [laneWARP_THREADS - 1 only] Warp-wide aggregate reduction of input items - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items - { - // Last lane in each warp shares its warp-aggregate - if (lane_id == WARP_THREADS - 1) - temp_storage.warp_aggregates[warp_id] = warp_aggregate; - - CTA_SYNC(); - - // Accumulate block aggregates and save the one that is our warp's prefix - T warp_prefix; - block_aggregate = temp_storage.warp_aggregates[0]; - - // Use template unrolling (since the PTX backend can't handle unrolling it for SM1x) - ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type<1>()); -/* - #pragma unroll - for (int WARP = 1; WARP < WARPS; ++WARP) - { - if (warp_id == WARP) - warp_prefix = block_aggregate; - - T addend = temp_storage.warp_aggregates[WARP]; - block_aggregate = scan_op(block_aggregate, addend); - } -*/ - - return warp_prefix; - } - - - /// Use the warp-wide aggregates and initial-value to compute the calling warp's prefix. Also returns block-wide aggregate in all threads. - template - __device__ __forceinline__ T ComputeWarpPrefix( - ScanOp scan_op, ///< [in] Binary scan operator - T warp_aggregate, ///< [in] [laneWARP_THREADS - 1 only] Warp-wide aggregate reduction of input items - T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items - const T &initial_value) ///< [in] Initial value to seed the exclusive scan - { - T warp_prefix = ComputeWarpPrefix(scan_op, warp_aggregate, block_aggregate); - - warp_prefix = scan_op(initial_value, warp_prefix); - - if (warp_id == 0) - warp_prefix = initial_value; - - return warp_prefix; - } - - //--------------------------------------------------------------------- - // Exclusive scans - //--------------------------------------------------------------------- - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined. 
- template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan operator - { - // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid. - T block_aggregate; - ExclusiveScan(input, exclusive_output, scan_op, block_aggregate); - } - - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input items - T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) - const T &initial_value, ///< [in] Initial value to seed the exclusive scan - ScanOp scan_op) ///< [in] Binary scan operator - { - T block_aggregate; - ExclusiveScan(input, exclusive_output, initial_value, scan_op, block_aggregate); - } - - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items - { - // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. - T inclusive_output; - WarpScanT(temp_storage.warp_scan[warp_id]).Scan(input, inclusive_output, exclusive_output, scan_op); - - // Compute the warp-wide prefix and block-wide aggregate for each warp. Warp prefix for warp0 is invalid. - T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate); - - // Apply warp prefix to our lane's partial - if (warp_id != 0) - { - exclusive_output = scan_op(warp_prefix, exclusive_output); - if (lane_id == 0) - exclusive_output = warp_prefix; - } - } - - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input items - T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) - const T &initial_value, ///< [in] Initial value to seed the exclusive scan - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items - { - // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. 
- T inclusive_output; - WarpScanT(temp_storage.warp_scan[warp_id]).Scan(input, inclusive_output, exclusive_output, scan_op); - - // Compute the warp-wide prefix and block-wide aggregate for each warp - T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate, initial_value); - - // Apply warp prefix to our lane's partial - exclusive_output = scan_op(warp_prefix, exclusive_output); - if (lane_id == 0) - exclusive_output = warp_prefix; - } - - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - template < - typename ScanOp, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. - { - // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid. - T block_aggregate; - ExclusiveScan(input, exclusive_output, scan_op, block_aggregate); - - // Use the first warp to determine the thread block prefix, returning the result in lane0 - if (warp_id == 0) - { - T block_prefix = block_prefix_callback_op(block_aggregate); - if (lane_id == 0) - { - // Share the prefix with all threads - temp_storage.block_prefix = block_prefix; - exclusive_output = block_prefix; // The block prefix is the exclusive output for tid0 - } - } - - CTA_SYNC(); - - // Incorporate thread block prefix into outputs - T block_prefix = temp_storage.block_prefix; - if (linear_tid > 0) - { - exclusive_output = scan_op(block_prefix, exclusive_output); - } - } - - - //--------------------------------------------------------------------- - // Inclusive scans - //--------------------------------------------------------------------- - - /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. - template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan operator - { - T block_aggregate; - InclusiveScan(input, inclusive_output, scan_op, block_aggregate); - } - - - /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
- template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items - { - WarpScanT(temp_storage.warp_scan[warp_id]).InclusiveScan(input, inclusive_output, scan_op); - - // Compute the warp-wide prefix and block-wide aggregate for each warp. Warp prefix for warp0 is invalid. - T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate); - - // Apply warp prefix to our lane's partial - if (warp_id != 0) - { - inclusive_output = scan_op(warp_prefix, inclusive_output); - } - } - - - /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - template < - typename ScanOp, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. - { - T block_aggregate; - InclusiveScan(input, exclusive_output, scan_op, block_aggregate); - - // Use the first warp to determine the thread block prefix, returning the result in lane0 - if (warp_id == 0) - { - T block_prefix = block_prefix_callback_op(block_aggregate); - if (lane_id == 0) - { - // Share the prefix with all threads - temp_storage.block_prefix = block_prefix; - } - } - - CTA_SYNC(); - - // Incorporate thread block prefix into outputs - T block_prefix = temp_storage.block_prefix; - exclusive_output = scan_op(block_prefix, exclusive_output); - } - - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/SRC/cub/block/specializations/block_scan_warp_scans2.cuh b/SRC/cub/block/specializations/block_scan_warp_scans2.cuh deleted file mode 100644 index 4de7c69b..00000000 --- a/SRC/cub/block/specializations/block_scan_warp_scans2.cuh +++ /dev/null @@ -1,436 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. 
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::BlockScanWarpscans provides warpscan-based variants of parallel prefix scan across a CUDA thread block.
- */
-
-#pragma once
-
-#include "../../util_arch.cuh"
-#include "../../util_ptx.cuh"
-#include "../../warp/warp_scan.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread block.
- */
-template <
-    typename T,
-    int      BLOCK_DIM_X,    ///< The thread block length in threads along the X dimension
-    int      BLOCK_DIM_Y,    ///< The thread block length in threads along the Y dimension
-    int      BLOCK_DIM_Z,    ///< The thread block length in threads along the Z dimension
-    int      PTX_ARCH>       ///< The PTX compute capability for which to specialize this collective
-struct BlockScanWarpScans
-{
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    /// Constants
-    enum
-    {
-        /// Number of warp threads
-        WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),
-
-        /// The thread block size in threads
-        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-
-        /// Number of active warps
-        WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
-    };
-
-    /// WarpScan utility type
-    typedef WarpScan<T, WARP_THREADS, PTX_ARCH> WarpScanT;
-
-    /// WarpScan utility type
-    typedef WarpScan<T, WARPS, PTX_ARCH> WarpAggregateScanT;
-
-    /// Shared memory storage layout type
-    struct _TempStorage
-    {
-        typename WarpAggregateScanT::TempStorage    inner_scan[WARPS];  ///< Buffer for warp-synchronous scans
-        typename WarpScanT::TempStorage             warp_scan[WARPS];   ///< Buffer for warp-synchronous scans
-        T                                           warp_aggregates[WARPS];
-        T                                           block_prefix;       ///< Shared prefix for the entire thread block
-    };
-
-
-    /// Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Per-thread fields
-    //---------------------------------------------------------------------
-
-    // Thread fields
-    _TempStorage &temp_storage;
-    unsigned int linear_tid;
-    unsigned int warp_id;
-    unsigned int lane_id;
-
-
-    //---------------------------------------------------------------------
-    // Constructors
-    //---------------------------------------------------------------------
-
-    /// Constructor
-    __device__ __forceinline__ BlockScanWarpScans(
-        TempStorage &temp_storage)
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
-        warp_id((WARPS == 1) ?
0 : linear_tid / WARP_THREADS), - lane_id(LaneId()) - {} - - - //--------------------------------------------------------------------- - // Utility methods - //--------------------------------------------------------------------- - - template - __device__ __forceinline__ void ApplyWarpAggregates( - T &warp_prefix, ///< [out] The calling thread's partial reduction - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items - Int2Type addend_warp) - { - if (warp_id == WARP) - warp_prefix = block_aggregate; - - T addend = temp_storage.warp_aggregates[WARP]; - block_aggregate = scan_op(block_aggregate, addend); - - ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type()); - } - - template - __device__ __forceinline__ void ApplyWarpAggregates( - T &warp_prefix, ///< [out] The calling thread's partial reduction - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items - Int2Type addend_warp) - {} - - - /// Use the warp-wide aggregates to compute the calling warp's prefix. Also returns block-wide aggregate in all threads. - template - __device__ __forceinline__ T ComputeWarpPrefix( - ScanOp scan_op, ///< [in] Binary scan operator - T warp_aggregate, ///< [in] [laneWARP_THREADS - 1 only] Warp-wide aggregate reduction of input items - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items - { - // Last lane in each warp shares its warp-aggregate - if (lane_id == WARP_THREADS - 1) - temp_storage.warp_aggregates[warp_id] = warp_aggregate; - - CTA_SYNC(); - - // Accumulate block aggregates and save the one that is our warp's prefix - T warp_prefix; - block_aggregate = temp_storage.warp_aggregates[0]; - - // Use template unrolling (since the PTX backend can't handle unrolling it for SM1x) - ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type<1>()); -/* - #pragma unroll - for (int WARP = 1; WARP < WARPS; ++WARP) - { - if (warp_id == WARP) - warp_prefix = block_aggregate; - - T addend = temp_storage.warp_aggregates[WARP]; - block_aggregate = scan_op(block_aggregate, addend); - } -*/ - - return warp_prefix; - } - - - /// Use the warp-wide aggregates and initial-value to compute the calling warp's prefix. Also returns block-wide aggregate in all threads. - template - __device__ __forceinline__ T ComputeWarpPrefix( - ScanOp scan_op, ///< [in] Binary scan operator - T warp_aggregate, ///< [in] [laneWARP_THREADS - 1 only] Warp-wide aggregate reduction of input items - T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items - const T &initial_value) ///< [in] Initial value to seed the exclusive scan - { - T warp_prefix = ComputeWarpPrefix(scan_op, warp_aggregate, block_aggregate); - - warp_prefix = scan_op(initial_value, warp_prefix); - - if (warp_id == 0) - warp_prefix = initial_value; - - return warp_prefix; - } - - //--------------------------------------------------------------------- - // Exclusive scans - //--------------------------------------------------------------------- - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined. 
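// NOTE (editorial sketch, not from the original sources): what the
// ApplyWarpAggregates/ComputeWarpPrefix pair above computes, written as a
// plain sequential loop.  Warp w's prefix combines the aggregates of warps
// 0..w-1 under the scan operator; the block aggregate combines all of them.
// The scan bodies below reach the same prefixes by scanning the per-warp
// aggregates with a second warp scan (WarpAggregateScanT) instead.
//
//   template <typename T, typename ScanOp>
//   __host__ __device__ T WarpPrefixReference(
//       const T *warp_aggregates,   // one reduction per warp
//       int      warps,             // number of warps in the block
//       int      my_warp,           // calling thread's warp id
//       ScanOp   scan_op,           // binary associative operator
//       T       &block_aggregate)   // out: total across all warps
//   {
//       T warp_prefix = T();                    // undefined for warp 0, as in CUB
//       block_aggregate = warp_aggregates[0];
//       for (int w = 1; w < warps; ++w)
//       {
//           if (my_warp == w)
//               warp_prefix = block_aggregate;  // everything before warp w
//           block_aggregate = scan_op(block_aggregate, warp_aggregates[w]);
//       }
//       return warp_prefix;
//   }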
- template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan operator - { - // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid. - T block_aggregate; - ExclusiveScan(input, exclusive_output, scan_op, block_aggregate); - } - - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input items - T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) - const T &initial_value, ///< [in] Initial value to seed the exclusive scan - ScanOp scan_op) ///< [in] Binary scan operator - { - T block_aggregate; - ExclusiveScan(input, exclusive_output, initial_value, scan_op, block_aggregate); - } - - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items - { - WarpScanT my_warp_scan(temp_storage.warp_scan[warp_id]); - - // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. - T inclusive_output; - my_warp_scan.Scan(input, inclusive_output, exclusive_output, scan_op); - - // Compute the warp-wide prefix and block-wide aggregate for each warp. Warp prefix for warp0 is invalid. -// T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate); - -//-------------------------------------------------- - // Last lane in each warp shares its warp-aggregate - if (lane_id == WARP_THREADS - 1) - temp_storage.warp_aggregates[warp_id] = inclusive_output; - - CTA_SYNC(); - - // Get the warp scan partial - T warp_inclusive, warp_prefix; - if (lane_id < WARPS) - { - // Scan the warpscan partials - T warp_val = temp_storage.warp_aggregates[lane_id]; - WarpAggregateScanT(temp_storage.inner_scan[warp_id]).Scan(warp_val, warp_inclusive, warp_prefix, scan_op); - } - - warp_prefix = my_warp_scan.Broadcast(warp_prefix, warp_id); - block_aggregate = my_warp_scan.Broadcast(warp_inclusive, WARPS - 1); -//-------------------------------------------------- - - // Apply warp prefix to our lane's partial - if (warp_id != 0) - { - exclusive_output = scan_op(warp_prefix, exclusive_output); - if (lane_id == 0) - exclusive_output = warp_prefix; - } - } - - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
- template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input items - T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) - const T &initial_value, ///< [in] Initial value to seed the exclusive scan - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items - { - WarpScanT my_warp_scan(temp_storage.warp_scan[warp_id]); - - // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. - T inclusive_output; - my_warp_scan.Scan(input, inclusive_output, exclusive_output, scan_op); - - // Compute the warp-wide prefix and block-wide aggregate for each warp -// T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate, initial_value); - -//-------------------------------------------------- - // Last lane in each warp shares its warp-aggregate - if (lane_id == WARP_THREADS - 1) - temp_storage.warp_aggregates[warp_id] = inclusive_output; - - CTA_SYNC(); - - // Get the warp scan partial - T warp_inclusive, warp_prefix; - if (lane_id < WARPS) - { - // Scan the warpscan partials - T warp_val = temp_storage.warp_aggregates[lane_id]; - WarpAggregateScanT(temp_storage.inner_scan[warp_id]).Scan(warp_val, warp_inclusive, warp_prefix, initial_value, scan_op); - } - - warp_prefix = my_warp_scan.Broadcast(warp_prefix, warp_id); - block_aggregate = my_warp_scan.Broadcast(warp_inclusive, WARPS - 1); -//-------------------------------------------------- - - // Apply warp prefix to our lane's partial - exclusive_output = scan_op(warp_prefix, exclusive_output); - if (lane_id == 0) - exclusive_output = warp_prefix; - } - - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - template < - typename ScanOp, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. - { - // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid. 
- T block_aggregate; - ExclusiveScan(input, exclusive_output, scan_op, block_aggregate); - - // Use the first warp to determine the thread block prefix, returning the result in lane0 - if (warp_id == 0) - { - T block_prefix = block_prefix_callback_op(block_aggregate); - if (lane_id == 0) - { - // Share the prefix with all threads - temp_storage.block_prefix = block_prefix; - exclusive_output = block_prefix; // The block prefix is the exclusive output for tid0 - } - } - - CTA_SYNC(); - - // Incorporate thread block prefix into outputs - T block_prefix = temp_storage.block_prefix; - if (linear_tid > 0) - { - exclusive_output = scan_op(block_prefix, exclusive_output); - } - } - - - //--------------------------------------------------------------------- - // Inclusive scans - //--------------------------------------------------------------------- - - /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. - template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan operator - { - T block_aggregate; - InclusiveScan(input, inclusive_output, scan_op, block_aggregate); - } - - - /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. - template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items - { - WarpScanT(temp_storage.warp_scan[warp_id]).InclusiveScan(input, inclusive_output, scan_op); - - // Compute the warp-wide prefix and block-wide aggregate for each warp. Warp prefix for warp0 is invalid. - T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate); - - // Apply warp prefix to our lane's partial - if (warp_id != 0) - { - inclusive_output = scan_op(warp_prefix, inclusive_output); - } - } - - - /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. - template < - typename ScanOp, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. 
- { - T block_aggregate; - InclusiveScan(input, exclusive_output, scan_op, block_aggregate); - - // Use the first warp to determine the thread block prefix, returning the result in lane0 - if (warp_id == 0) - { - T block_prefix = block_prefix_callback_op(block_aggregate); - if (lane_id == 0) - { - // Share the prefix with all threads - temp_storage.block_prefix = block_prefix; - } - } - - CTA_SYNC(); - - // Incorporate thread block prefix into outputs - T block_prefix = temp_storage.block_prefix; - exclusive_output = scan_op(block_prefix, exclusive_output); - } - - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/SRC/cub/block/specializations/block_scan_warp_scans3.cuh b/SRC/cub/block/specializations/block_scan_warp_scans3.cuh deleted file mode 100644 index 147ca4c5..00000000 --- a/SRC/cub/block/specializations/block_scan_warp_scans3.cuh +++ /dev/null @@ -1,418 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::BlockScanWarpscans provides warpscan-based variants of parallel prefix scan across a CUDA thread block. - */ - -#pragma once - -#include "../../util_arch.cuh" -#include "../../util_ptx.cuh" -#include "../../warp/warp_scan.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread block. 
- */
-template <
-    typename T,
-    int      BLOCK_DIM_X,    ///< The thread block length in threads along the X dimension
-    int      BLOCK_DIM_Y,    ///< The thread block length in threads along the Y dimension
-    int      BLOCK_DIM_Z,    ///< The thread block length in threads along the Z dimension
-    int      PTX_ARCH>       ///< The PTX compute capability for which to specialize this collective
-struct BlockScanWarpScans
-{
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    /// Constants
-    enum
-    {
-        /// The thread block size in threads
-        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-
-        /// Number of warp threads
-        INNER_WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),
-        OUTER_WARP_THREADS = BLOCK_THREADS / INNER_WARP_THREADS,
-
-        /// Number of outer scan warps
-        OUTER_WARPS = INNER_WARP_THREADS
-    };
-
-    /// Outer WarpScan utility type
-    typedef WarpScan<T, OUTER_WARP_THREADS, PTX_ARCH> OuterWarpScanT;
-
-    /// Inner WarpScan utility type
-    typedef WarpScan<T, INNER_WARP_THREADS, PTX_ARCH> InnerWarpScanT;
-
-    typedef typename OuterWarpScanT::TempStorage OuterScanArray[OUTER_WARPS];
-
-
-    /// Shared memory storage layout type
-    struct _TempStorage
-    {
-        union Aliasable
-        {
-            Uninitialized<OuterScanArray>           outer_warp_scan;    ///< Buffer for warp-synchronous outer scans
-            typename InnerWarpScanT::TempStorage    inner_warp_scan;    ///< Buffer for warp-synchronous inner scan
-
-        } aliasable;
-
-        T                   warp_aggregates[OUTER_WARPS];
-
-        T                   block_aggregate;    ///< Shared prefix for the entire thread block
-    };
-
-
-    /// Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Per-thread fields
-    //---------------------------------------------------------------------
-
-    // Thread fields
-    _TempStorage &temp_storage;
-    unsigned int linear_tid;
-    unsigned int warp_id;
-    unsigned int lane_id;
-
-
-    //---------------------------------------------------------------------
-    // Constructors
-    //---------------------------------------------------------------------
-
-    /// Constructor
-    __device__ __forceinline__ BlockScanWarpScans(
-        TempStorage &temp_storage)
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
-        warp_id((OUTER_WARPS == 1) ? 0 : linear_tid / OUTER_WARP_THREADS),
-        lane_id((OUTER_WARPS == 1) ? linear_tid : linear_tid % OUTER_WARP_THREADS)
-    {}
-
-
-    //---------------------------------------------------------------------
-    // Exclusive scans
-    //---------------------------------------------------------------------
-
-    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined.
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input item
-        T               &exclusive_output,  ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op)            ///< [in] Binary scan operator
-    {
-        // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid.
-        T block_aggregate;
-        ExclusiveScan(input, exclusive_output, scan_op, block_aggregate);
-    }
-
-
-    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element.
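// NOTE (editorial aside): worked example of the two-level decomposition
// above for a 256-thread block with 32-thread hardware warps:
//   INNER_WARP_THREADS = 32             (one hardware warp)
//   OUTER_WARP_THREADS = 256 / 32 = 8   (threads per outer scan group)
//   OUTER_WARPS        = 32             (outer groups, one aggregate each)
// The 32 outer-group aggregates fit exactly into the single inner warp, so
// the scan over aggregates needs no serial accumulation loop at all.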
- template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input items - T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) - const T &initial_value, ///< [in] Initial value to seed the exclusive scan - ScanOp scan_op) ///< [in] Binary scan operator - { - T block_aggregate; - ExclusiveScan(input, exclusive_output, initial_value, scan_op, block_aggregate); - } - - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items - { - // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. - T inclusive_output; - OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).Scan( - input, inclusive_output, exclusive_output, scan_op); - - // Share outer warp total - if (lane_id == OUTER_WARP_THREADS - 1) - temp_storage.warp_aggregates[warp_id] = inclusive_output; - - CTA_SYNC(); - - if (linear_tid < INNER_WARP_THREADS) - { - T outer_warp_input = temp_storage.warp_aggregates[linear_tid]; - T outer_warp_exclusive; - - InnerWarpScanT(temp_storage.aliasable.inner_warp_scan).ExclusiveScan( - outer_warp_input, outer_warp_exclusive, scan_op, block_aggregate); - - temp_storage.block_aggregate = block_aggregate; - temp_storage.warp_aggregates[linear_tid] = outer_warp_exclusive; - } - - CTA_SYNC(); - - if (warp_id != 0) - { - // Retrieve block aggregate - block_aggregate = temp_storage.block_aggregate; - - // Apply warp prefix to our lane's partial - T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id]; - exclusive_output = scan_op(outer_warp_exclusive, exclusive_output); - if (lane_id == 0) - exclusive_output = outer_warp_exclusive; - } - } - - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input items - T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) - const T &initial_value, ///< [in] Initial value to seed the exclusive scan - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items - { - // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. 
- T inclusive_output; - OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).Scan( - input, inclusive_output, exclusive_output, scan_op); - - // Share outer warp total - if (lane_id == OUTER_WARP_THREADS - 1) - { - temp_storage.warp_aggregates[warp_id] = inclusive_output; - } - - CTA_SYNC(); - - if (linear_tid < INNER_WARP_THREADS) - { - T outer_warp_input = temp_storage.warp_aggregates[linear_tid]; - T outer_warp_exclusive; - - InnerWarpScanT(temp_storage.aliasable.inner_warp_scan).ExclusiveScan( - outer_warp_input, outer_warp_exclusive, initial_value, scan_op, block_aggregate); - - temp_storage.block_aggregate = block_aggregate; - temp_storage.warp_aggregates[linear_tid] = outer_warp_exclusive; - } - - CTA_SYNC(); - - // Retrieve block aggregate - block_aggregate = temp_storage.block_aggregate; - - // Apply warp prefix to our lane's partial - T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id]; - exclusive_output = scan_op(outer_warp_exclusive, exclusive_output); - if (lane_id == 0) - exclusive_output = outer_warp_exclusive; - } - - - /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. The call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. - template < - typename ScanOp, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item - T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. - { - // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. 
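// NOTE (editorial aside): in this callback overload the inner warp does the
// whole downsweep bookkeeping: it exclusive-scans the outer-group
// aggregates, invokes block_prefix_callback_op on the block aggregate,
// keeps lane 0's return value via inner_scan.Broadcast(block_prefix, 0),
// and folds that prefix into each group's downsweep prefix before the
// barrier releases the outer warps to update their partials.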
- T inclusive_output; - OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).Scan( - input, inclusive_output, exclusive_output, scan_op); - - // Share outer warp total - if (lane_id == OUTER_WARP_THREADS - 1) - temp_storage.warp_aggregates[warp_id] = inclusive_output; - - CTA_SYNC(); - - if (linear_tid < INNER_WARP_THREADS) - { - InnerWarpScanT inner_scan(temp_storage.aliasable.inner_warp_scan); - - T upsweep = temp_storage.warp_aggregates[linear_tid]; - T downsweep_prefix, block_aggregate; - - inner_scan.ExclusiveScan(upsweep, downsweep_prefix, scan_op, block_aggregate); - - // Use callback functor to get block prefix in lane0 and then broadcast to other lanes - T block_prefix = block_prefix_callback_op(block_aggregate); - block_prefix = inner_scan.Broadcast(block_prefix, 0); - - downsweep_prefix = scan_op(block_prefix, downsweep_prefix); - if (linear_tid == 0) - downsweep_prefix = block_prefix; - - temp_storage.warp_aggregates[linear_tid] = downsweep_prefix; - } - - CTA_SYNC(); - - // Apply warp prefix to our lane's partial (or assign it if partial is invalid) - T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id]; - exclusive_output = scan_op(outer_warp_exclusive, exclusive_output); - if (lane_id == 0) - exclusive_output = outer_warp_exclusive; - } - - - //--------------------------------------------------------------------- - // Inclusive scans - //--------------------------------------------------------------------- - - /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. - template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan operator - { - T block_aggregate; - InclusiveScan(input, inclusive_output, scan_op, block_aggregate); - } - - - /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. - template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items - { - // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. 
- OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).InclusiveScan( - input, inclusive_output, scan_op); - - // Share outer warp total - if (lane_id == OUTER_WARP_THREADS - 1) - temp_storage.warp_aggregates[warp_id] = inclusive_output; - - CTA_SYNC(); - - if (linear_tid < INNER_WARP_THREADS) - { - T outer_warp_input = temp_storage.warp_aggregates[linear_tid]; - T outer_warp_exclusive; - - InnerWarpScanT(temp_storage.aliasable.inner_warp_scan).ExclusiveScan( - outer_warp_input, outer_warp_exclusive, scan_op, block_aggregate); - - temp_storage.block_aggregate = block_aggregate; - temp_storage.warp_aggregates[linear_tid] = outer_warp_exclusive; - } - - CTA_SYNC(); - - if (warp_id != 0) - { - // Retrieve block aggregate - block_aggregate = temp_storage.block_aggregate; - - // Apply warp prefix to our lane's partial - T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id]; - inclusive_output = scan_op(outer_warp_exclusive, inclusive_output); - } - } - - - /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. - template < - typename ScanOp, - typename BlockPrefixCallbackOp> - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item - T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. - { - // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. - OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).InclusiveScan( - input, inclusive_output, scan_op); - - // Share outer warp total - if (lane_id == OUTER_WARP_THREADS - 1) - temp_storage.warp_aggregates[warp_id] = inclusive_output; - - CTA_SYNC(); - - if (linear_tid < INNER_WARP_THREADS) - { - InnerWarpScanT inner_scan(temp_storage.aliasable.inner_warp_scan); - - T upsweep = temp_storage.warp_aggregates[linear_tid]; - T downsweep_prefix, block_aggregate; - inner_scan.ExclusiveScan(upsweep, downsweep_prefix, scan_op, block_aggregate); - - // Use callback functor to get block prefix in lane0 and then broadcast to other lanes - T block_prefix = block_prefix_callback_op(block_aggregate); - block_prefix = inner_scan.Broadcast(block_prefix, 0); - - downsweep_prefix = scan_op(block_prefix, downsweep_prefix); - if (linear_tid == 0) - downsweep_prefix = block_prefix; - - temp_storage.warp_aggregates[linear_tid] = downsweep_prefix; - } - - CTA_SYNC(); - - // Apply warp prefix to our lane's partial - T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id]; - inclusive_output = scan_op(outer_warp_exclusive, inclusive_output); - } - - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/SRC/cub/cub.cuh b/SRC/cub/cub.cuh deleted file mode 100644 index 3ece0f65..00000000 --- a/SRC/cub/cub.cuh +++ /dev/null @@ -1,95 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
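// NOTE (editorial aside): cub.cuh, whose deletion begins here, is CUB's
// umbrella header; it pulls in every block-, device-, grid-, thread-,
// warp- and iterator-level component listed in the file body below.
// Code that needs a single collective can include just that component's
// own header instead, e.g.
//   #include "cub/block/block_scan.cuh"   // rather than "cub/cub.cuh"
// which keeps compile times down relative to the full umbrella include.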
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * CUB umbrella include file - */ - -#pragma once - - -// Block -#include "block/block_histogram.cuh" -#include "block/block_discontinuity.cuh" -#include "block/block_exchange.cuh" -#include "block/block_load.cuh" -#include "block/block_radix_rank.cuh" -#include "block/block_radix_sort.cuh" -#include "block/block_reduce.cuh" -#include "block/block_scan.cuh" -#include "block/block_store.cuh" -//#include "block/block_shift.cuh" - -// Device -#include "device/device_histogram.cuh" -#include "device/device_partition.cuh" -#include "device/device_radix_sort.cuh" -#include "device/device_reduce.cuh" -#include "device/device_run_length_encode.cuh" -#include "device/device_scan.cuh" -#include "device/device_segmented_radix_sort.cuh" -#include "device/device_segmented_reduce.cuh" -#include "device/device_select.cuh" -#include "device/device_spmv.cuh" - -// Grid -//#include "grid/grid_barrier.cuh" -#include "grid/grid_even_share.cuh" -#include "grid/grid_mapping.cuh" -#include "grid/grid_queue.cuh" - -// Thread -#include "thread/thread_load.cuh" -#include "thread/thread_operators.cuh" -#include "thread/thread_reduce.cuh" -#include "thread/thread_scan.cuh" -#include "thread/thread_store.cuh" - -// Warp -#include "warp/warp_reduce.cuh" -#include "warp/warp_scan.cuh" - -// Iterator -#include "iterator/arg_index_input_iterator.cuh" -#include "iterator/cache_modified_input_iterator.cuh" -#include "iterator/cache_modified_output_iterator.cuh" -#include "iterator/constant_input_iterator.cuh" -#include "iterator/counting_input_iterator.cuh" -#include "iterator/tex_obj_input_iterator.cuh" -#include "iterator/tex_ref_input_iterator.cuh" -#include "iterator/transform_input_iterator.cuh" - -// Util -#include "util_arch.cuh" -#include "util_debug.cuh" -#include "util_device.cuh" -#include "util_macro.cuh" -#include "util_ptx.cuh" -#include "util_type.cuh" - diff --git a/SRC/cub/device/device_histogram.cuh b/SRC/cub/device/device_histogram.cuh 
deleted file mode 100644 index a2556a6b..00000000 --- a/SRC/cub/device/device_histogram.cuh +++ /dev/null @@ -1,866 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory. - */ - -#pragma once - -#include -#include -#include - -#include "dispatch/dispatch_histogram.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \brief DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory. ![](histogram_logo.png) - * \ingroup SingleModule - * - * \par Overview - * A histogram - * counts the number of observations that fall into each of the disjoint categories (known as bins). - * - * \par Usage Considerations - * \cdp_class{DeviceHistogram} - * - */ -struct DeviceHistogram -{ - /******************************************************************//** - * \name Evenly-segmented bin ranges - *********************************************************************/ - //@{ - - /** - * \brief Computes an intensity histogram from a sequence of data samples using equal-width bins. 
- * - * \par - * - The number of histogram bins is (\p num_levels - 1) - * - All bins comprise the same width of sample values: (\p upper_level - \p lower_level) / (\p num_levels - 1) - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the computation of a six-bin histogram - * from a sequence of float samples - * - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input samples and - * // output histogram - * int num_samples; // e.g., 10 - * float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, 0.3, 2.9, 2.0, 6.1, 999.5] - * int* d_histogram; // e.g., [ -, -, -, -, -, -, -, -] - * int num_levels; // e.g., 7 (seven level boundaries for six bins) - * float lower_level; // e.g., 0.0 (lower sample value boundary of lowest bin) - * float upper_level; // e.g., 12.0 (upper sample value boundary of upper bin) - * ... - * - * // Determine temporary device storage requirements - * void* d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Compute histograms - * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples); - * - * // d_histogram <-- [1, 0, 5, 0, 3, 0, 0, 0]; - * - * \endcode - * - * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator - * \tparam CounterT [inferred] Integer type for histogram bin counters - * \tparam LevelT [inferred] Type for specifying boundaries (levels) - * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 - */ - template < - typename SampleIteratorT, - typename CounterT, - typename LevelT, - typename OffsetT> - CUB_RUNTIME_FUNCTION - static cudaError_t HistogramEven( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of data samples. - CounterT* d_histogram, ///< [out] The pointer to the histogram counter output array of length num_levels - 1. - int num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is num_levels - 1. - LevelT lower_level, ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin. - LevelT upper_level, ///< [in] The upper sample value bound (exclusive) for the highest histogram bin. - OffsetT num_samples, ///< [in] The number of input samples (i.e., the length of \p d_samples) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
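// NOTE (editorial aside): with equal-width bins, HistogramEven maps a
// sample s to
//   bin = (int) ((s - lower_level) * (num_levels - 1) / (upper_level - lower_level))
// and ignores samples outside [lower_level, upper_level).  For the snippet
// above (lower 0.0, upper 12.0, seven levels = six bins of width 2.0), the
// sample 7.1 lands in bin (int)(7.1 * 6 / 12) = 3, and 999.5 is discarded.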
- { - /// The sample value type of the input iterator - typedef typename std::iterator_traits::value_type SampleT; - - CounterT* d_histogram1[1] = {d_histogram}; - int num_levels1[1] = {num_levels}; - LevelT lower_level1[1] = {lower_level}; - LevelT upper_level1[1] = {upper_level}; - - return MultiHistogramEven<1, 1>( - d_temp_storage, - temp_storage_bytes, - d_samples, - d_histogram1, - num_levels1, - lower_level1, - upper_level1, - num_samples, - 1, - sizeof(SampleT) * num_samples, - stream, - debug_synchronous); - } - - - /** - * \brief Computes an intensity histogram from a sequence of data samples using equal-width bins. - * - * \par - * - A two-dimensional region of interest within \p d_samples can be specified - * using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters. - * - The row stride must be a whole multiple of the sample data type - * size, i.e., (row_stride_bytes % sizeof(SampleT)) == 0. - * - The number of histogram bins is (\p num_levels - 1) - * - All bins comprise the same width of sample values: (\p upper_level - \p lower_level) / (\p num_levels - 1) - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the computation of a six-bin histogram - * from a 2x5 region of interest within a flattened 2x7 array of float samples. - * - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input samples and - * // output histogram - * int num_row_samples; // e.g., 5 - * int num_rows; // e.g., 2; - * size_t row_stride_bytes; // e.g., 7 * sizeof(float) - * float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, -, -, - * // 0.3, 2.9, 2.0, 6.1, 999.5, -, -] - * int* d_histogram; // e.g., [ -, -, -, -, -, -, -, -] - * int num_levels; // e.g., 7 (seven level boundaries for six bins) - * float lower_level; // e.g., 0.0 (lower sample value boundary of lowest bin) - * float upper_level; // e.g., 12.0 (upper sample value boundary of upper bin) - * ... - * - * // Determine temporary device storage requirements - * void* d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, lower_level, upper_level, - * num_row_samples, num_rows, row_stride_bytes); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Compute histograms - * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, d_samples, d_histogram, - * d_samples, d_histogram, num_levels, lower_level, upper_level, - * num_row_samples, num_rows, row_stride_bytes); - * - * // d_histogram <-- [1, 0, 5, 0, 3, 0, 0, 0]; - * - * \endcode - * - * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator - * \tparam CounterT [inferred] Integer type for histogram bin counters - * \tparam LevelT [inferred] Type for specifying boundaries (levels) - * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 - */ - template < - typename SampleIteratorT, - typename CounterT, - typename LevelT, - typename OffsetT> - CUB_RUNTIME_FUNCTION - static cudaError_t HistogramEven( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of data samples. - CounterT* d_histogram, ///< [out] The pointer to the histogram counter output array of length num_levels - 1. - int num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is num_levels - 1. - LevelT lower_level, ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin. - LevelT upper_level, ///< [in] The upper sample value bound (exclusive) for the highest histogram bin. - OffsetT num_row_samples, ///< [in] The number of data samples per row in the region of interest - OffsetT num_rows, ///< [in] The number of rows in the region of interest - size_t row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - CounterT* d_histogram1[1] = {d_histogram}; - int num_levels1[1] = {num_levels}; - LevelT lower_level1[1] = {lower_level}; - LevelT upper_level1[1] = {upper_level}; - - return MultiHistogramEven<1, 1>( - d_temp_storage, - temp_storage_bytes, - d_samples, - d_histogram1, - num_levels1, - lower_level1, - upper_level1, - num_row_samples, - num_rows, - row_stride_bytes, - stream, - debug_synchronous); - } - - /** - * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using equal-width bins. - * - * \par - * - The input is a sequence of pixel structures, where each pixel comprises - * a record of \p NUM_CHANNELS consecutive data samples (e.g., an RGBA pixel). - * - Of the \p NUM_CHANNELS specified, the function will only compute histograms - * for the first \p NUM_ACTIVE_CHANNELS (e.g., only RGB histograms from RGBA - * pixel samples). - * - The number of histogram bins for channeli is num_levels[i] - 1. - * - For channeli, the range of values for all histogram bins - * have the same width: (upper_level[i] - lower_level[i]) / ( num_levels[i] - 1) - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the computation of three 256-bin RGB histograms - * from a quad-channel sequence of RGBA pixels (8 bits per channel per pixel) - * - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input samples - * // and output histograms - * int num_pixels; // e.g., 5 - * unsigned char* d_samples; // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2), - * // (0, 6, 7, 5), (3, 0, 2, 6)] - * int* d_histogram[3]; // e.g., three device pointers to three device buffers, - * // each allocated with 256 integer counters - * int num_levels[3]; // e.g., {257, 257, 257}; - * unsigned int lower_level[3]; // e.g., {0, 0, 0}; - * unsigned int upper_level[3]; // e.g., {256, 256, 256}; - * ... 
- * - * // Determine temporary device storage requirements - * void* d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, lower_level, upper_level, num_pixels); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Compute histograms - * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, lower_level, upper_level, num_pixels); - * - * // d_histogram <-- [ [1, 0, 1, 2, 0, 0, 0, 1, 0, 0, 0, ..., 0], - * // [0, 3, 0, 0, 0, 0, 2, 0, 0, 0, 0, ..., 0], - * // [0, 0, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ] - * - * \endcode - * - * \tparam NUM_CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) - * \tparam NUM_ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed - * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator - * \tparam CounterT [inferred] Integer type for histogram bin counters - * \tparam LevelT [inferred] Type for specifying boundaries (levels) - * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 - */ - template < - int NUM_CHANNELS, - int NUM_ACTIVE_CHANNELS, - typename SampleIteratorT, - typename CounterT, - typename LevelT, - typename OffsetT> - CUB_RUNTIME_FUNCTION - static cudaError_t MultiHistogramEven( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). - CounterT* d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histogram[i] should be num_levels[i] - 1. - int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. - LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. - LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. - OffsetT num_pixels, ///< [in] The number of multi-channel pixels (i.e., the length of \p d_samples / NUM_CHANNELS) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
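// NOTE (editorial aside): for interleaved RGBA pixels, NUM_CHANNELS = 4 and
// NUM_ACTIVE_CHANNELS = 3 histograms R, G and B while skipping alpha.
// Channel c of pixel i is read at d_samples[i * NUM_CHANNELS + c], so the
// data never needs to be de-interleaved before calling MultiHistogramEven.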
-    {
-        /// The sample value type of the input iterator
-        typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
-
-        return MultiHistogramEven<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_samples,
-            d_histogram,
-            num_levels,
-            lower_level,
-            upper_level,
-            num_pixels,
-            1,
-            sizeof(SampleT) * NUM_CHANNELS * num_pixels,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using equal-width bins.
-     *
-     * \par
-     * - The input is a sequence of pixel structures, where each pixel comprises
-     *   a record of \p NUM_CHANNELS consecutive data samples (e.g., an RGBA pixel).
-     * - Of the \p NUM_CHANNELS specified, the function will only compute histograms
-     *   for the first \p NUM_ACTIVE_CHANNELS (e.g., only RGB histograms from RGBA
-     *   pixel samples).
-     * - A two-dimensional region of interest within \p d_samples can be specified
-     *   using the \p num_row_samples, \p num_rows, and \p row_stride_bytes parameters.
-     * - The row stride must be a whole multiple of the sample data type
-     *   size, i.e., (row_stride_bytes % sizeof(SampleT)) == 0.
-     * - The number of histogram bins for channel i is num_levels[i] - 1.
-     * - For channel i, the value ranges of all histogram bins have the
-     *   same width: (upper_level[i] - lower_level[i]) / (num_levels[i] - 1)
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the computation of three 256-bin RGB histograms from a 2x3 region of
-     * interest within a flattened 2x4 array of quad-channel RGBA pixels (8 bits per channel per pixel).
-     *
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input samples
-     * // and output histograms
-     * int num_row_pixels;          // e.g., 3
-     * int num_rows;                // e.g., 2
-     * size_t row_stride_bytes;     // e.g., 4 * sizeof(unsigned char) * NUM_CHANNELS
-     * unsigned char* d_samples;    // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2), (-, -, -, -),
-     *                              //        (0, 6, 7, 5), (3, 0, 2, 6), (1, 1, 1, 1), (-, -, -, -)]
-     * int* d_histogram[3];         // e.g., three device pointers to three device buffers,
-     *                              //       each allocated with 256 integer counters
-     * int num_levels[3];           // e.g., {257, 257, 257};
-     * unsigned int lower_level[3]; // e.g., {0, 0, 0};
-     * unsigned int upper_level[3]; // e.g., {256, 256, 256};
-     * ...
- * - * // Determine temporary device storage requirements - * void* d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, lower_level, upper_level, - * num_row_pixels, num_rows, row_stride_bytes); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Compute histograms - * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, lower_level, upper_level, - * num_row_pixels, num_rows, row_stride_bytes); - * - * // d_histogram <-- [ [1, 1, 1, 2, 0, 0, 0, 1, 0, 0, 0, ..., 0], - * // [0, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, ..., 0], - * // [0, 1, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ] - * - * \endcode - * - * \tparam NUM_CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) - * \tparam NUM_ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed - * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator - * \tparam CounterT [inferred] Integer type for histogram bin counters - * \tparam LevelT [inferred] Type for specifying boundaries (levels) - * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 - */ - template < - int NUM_CHANNELS, - int NUM_ACTIVE_CHANNELS, - typename SampleIteratorT, - typename CounterT, - typename LevelT, - typename OffsetT> - CUB_RUNTIME_FUNCTION - static cudaError_t MultiHistogramEven( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). - CounterT* d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histogram[i] should be num_levels[i] - 1. - int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. - LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. - LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. - OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest - OffsetT num_rows, ///< [in] The number of rows in the region of interest - size_t row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
-    {
-        /// The sample value type of the input iterator
-        typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
-        Int2Type<sizeof(SampleT) == 1> is_byte_sample;
-
-        if ((sizeof(OffsetT) > sizeof(int)) &&
-            ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) std::numeric_limits<int>::max()))
-        {
-            // Down-convert OffsetT data type
-            return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, int>::DispatchEven(
-                d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level,
-                (int) num_row_pixels, (int) num_rows, (int) (row_stride_bytes / sizeof(SampleT)),
-                stream, debug_synchronous, is_byte_sample);
-        }
-
-        return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT>::DispatchEven(
-            d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level,
-            num_row_pixels, num_rows, (OffsetT) (row_stride_bytes / sizeof(SampleT)),
-            stream, debug_synchronous, is_byte_sample);
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Custom bin ranges
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels.
-     *
-     * \par
-     * - The number of histogram bins is (\p num_levels - 1)
-     * - The value range for bin i is [level[i], level[i+1])
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the computation of a six-bin histogram
-     * from a sequence of float samples
-     *
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input samples and
-     * // output histogram
-     * int num_samples;    // e.g., 10
-     * float* d_samples;   // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, 0.3, 2.9, 2.0, 6.1, 999.5]
-     * int* d_histogram;   // e.g., [ -, -, -, -, -, -, -, -]
-     * int num_levels;     // e.g., 7 (seven level boundaries for six bins)
-     * float* d_levels;    // e.g., [0.0, 2.0, 4.0, 6.0, 8.0, 12.0, 16.0]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void* d_temp_storage = NULL;
-     * size_t temp_storage_bytes = 0;
-     * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes,
-     *     d_samples, d_histogram, num_levels, d_levels, num_samples);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Compute histograms
-     * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes,
-     *     d_samples, d_histogram, num_levels, d_levels, num_samples);
-     *
-     * // d_histogram <-- [1, 0, 5, 0, 3, 0, 0, 0];
-     *
-     * \endcode
-     *
-     * \tparam SampleIteratorT  [inferred] Random-access input iterator type for reading input samples. \iterator
-     * \tparam CounterT         [inferred] Integer type for histogram bin counters
-     * \tparam LevelT           [inferred] Type for specifying boundaries (levels)
-     * \tparam OffsetT          [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1
-     */
-    template <
-        typename SampleIteratorT,
-        typename CounterT,
-        typename LevelT,
-        typename OffsetT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t HistogramRange(
-        void* d_temp_storage,        ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t& temp_storage_bytes,  ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        SampleIteratorT d_samples,   ///< [in] The pointer to the input sequence of data samples.
- CounterT* d_histogram, ///< [out] The pointer to the histogram counter output array of length num_levels - 1. - int num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is num_levels - 1. - LevelT* d_levels, ///< [in] The pointer to the array of boundaries (levels). Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. - OffsetT num_samples, ///< [in] The number of data samples per row in the region of interest - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - /// The sample value type of the input iterator - typedef typename std::iterator_traits::value_type SampleT; - - CounterT* d_histogram1[1] = {d_histogram}; - int num_levels1[1] = {num_levels}; - LevelT* d_levels1[1] = {d_levels}; - - return MultiHistogramRange<1, 1>( - d_temp_storage, - temp_storage_bytes, - d_samples, - d_histogram1, - num_levels1, - d_levels1, - num_samples, - 1, - sizeof(SampleT) * num_samples, - stream, - debug_synchronous); - } - - - /** - * \brief Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels. - * - * \par - * - A two-dimensional region of interest within \p d_samples can be specified - * using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters. - * - The row stride must be a whole multiple of the sample data type - * size, i.e., (row_stride_bytes % sizeof(SampleT)) == 0. - * - The number of histogram bins is (\p num_levels - 1) - * - The value range for bini is [level[i], level[i+1]) - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the computation of a six-bin histogram - * from a 2x5 region of interest within a flattened 2x7 array of float samples. - * - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input samples and - * // output histogram - * int num_row_samples; // e.g., 5 - * int num_rows; // e.g., 2; - * int row_stride_bytes; // e.g., 7 * sizeof(float) - * float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, -, -, - * // 0.3, 2.9, 2.0, 6.1, 999.5, -, -] - * int* d_histogram; // e.g., [ , , , , , , , ] - * int num_levels // e.g., 7 (seven level boundaries for six bins) - * float *d_levels; // e.g., [0.0, 2.0, 4.0, 6.0, 8.0, 12.0, 16.0] - * ... - * - * // Determine temporary device storage requirements - * void* d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, d_levels, - * num_row_samples, num_rows, row_stride_bytes); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Compute histograms - * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, d_levels, - * num_row_samples, num_rows, row_stride_bytes); - * - * // d_histogram <-- [1, 0, 5, 0, 3, 0, 0, 0]; - * - * \endcode - * - * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. 
\iterator - * \tparam CounterT [inferred] Integer type for histogram bin counters - * \tparam LevelT [inferred] Type for specifying boundaries (levels) - * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 - */ - template < - typename SampleIteratorT, - typename CounterT, - typename LevelT, - typename OffsetT> - CUB_RUNTIME_FUNCTION - static cudaError_t HistogramRange( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of data samples. - CounterT* d_histogram, ///< [out] The pointer to the histogram counter output array of length num_levels - 1. - int num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is num_levels - 1. - LevelT* d_levels, ///< [in] The pointer to the array of boundaries (levels). Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. - OffsetT num_row_samples, ///< [in] The number of data samples per row in the region of interest - OffsetT num_rows, ///< [in] The number of rows in the region of interest - size_t row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - CounterT* d_histogram1[1] = {d_histogram}; - int num_levels1[1] = {num_levels}; - LevelT* d_levels1[1] = {d_levels}; - - return MultiHistogramRange<1, 1>( - d_temp_storage, - temp_storage_bytes, - d_samples, - d_histogram1, - num_levels1, - d_levels1, - num_row_samples, - num_rows, - row_stride_bytes, - stream, - debug_synchronous); - } - - /** - * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using the specified bin boundary levels. - * - * \par - * - The input is a sequence of pixel structures, where each pixel comprises - * a record of \p NUM_CHANNELS consecutive data samples (e.g., an RGBA pixel). - * - Of the \p NUM_CHANNELS specified, the function will only compute histograms - * for the first \p NUM_ACTIVE_CHANNELS (e.g., RGB histograms from RGBA - * pixel samples). - * - The number of histogram bins for channeli is num_levels[i] - 1. 
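The level-array semantics documented in these bullets can be mirrored on the host. The sketch below is an illustration only, not CUB code, and RangeBin is a hypothetical helper: bin i of a channel covers [level[i], level[i+1]), lower boundary inclusive and upper boundary exclusive.

    // Host-side sketch of the range-binning rule (illustration only).
    #include <algorithm>
    #include <cstdio>
    #include <vector>

    // Returns the bin index for `sample`, or -1 if it falls outside all bins.
    static int RangeBin(const std::vector<float>& levels, float sample)
    {
        // Locate the first boundary strictly greater than the sample.
        std::vector<float>::const_iterator it =
            std::upper_bound(levels.begin(), levels.end(), sample);
        if (it == levels.begin() || it == levels.end()) return -1;  // out of range
        return (int) (it - levels.begin()) - 1;
    }

    int main()
    {
        float bounds[] = {0.0f, 2.0f, 4.0f, 6.0f, 8.0f};   // 5 boundaries -> 4 bins
        std::vector<float> levels(bounds, bounds + 5);
        std::printf("%d %d %d\n",
            RangeBin(levels, 0.3f),    // 0  (first bin)
            RangeBin(levels, 6.0f),    // 3  (lower boundary is inclusive)
            RangeBin(levels, 8.0f));   // -1 (upper boundary is exclusive)
        return 0;
    }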
- * - For channeli, the range of values for all histogram bins - * have the same width: (upper_level[i] - lower_level[i]) / ( num_levels[i] - 1) - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the computation of three 4-bin RGB histograms - * from a quad-channel sequence of RGBA pixels (8 bits per channel per pixel) - * - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input samples - * // and output histograms - * int num_pixels; // e.g., 5 - * unsigned char *d_samples; // e.g., [(2, 6, 7, 5),(3, 0, 2, 1),(7, 0, 6, 2), - * // (0, 6, 7, 5),(3, 0, 2, 6)] - * unsigned int *d_histogram[3]; // e.g., [[ -, -, -, -],[ -, -, -, -],[ -, -, -, -]]; - * int num_levels[3]; // e.g., {5, 5, 5}; - * unsigned int *d_levels[3]; // e.g., [ [0, 2, 4, 6, 8], - * // [0, 2, 4, 6, 8], - * // [0, 2, 4, 6, 8] ]; - * ... - * - * // Determine temporary device storage requirements - * void* d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, d_levels, num_pixels); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Compute histograms - * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, d_levels, num_pixels); - * - * // d_histogram <-- [ [1, 3, 0, 1], - * // [3, 0, 0, 2], - * // [0, 2, 0, 3] ] - * - * \endcode - * - * \tparam NUM_CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) - * \tparam NUM_ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed - * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator - * \tparam CounterT [inferred] Integer type for histogram bin counters - * \tparam LevelT [inferred] Type for specifying boundaries (levels) - * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 - */ - template < - int NUM_CHANNELS, - int NUM_ACTIVE_CHANNELS, - typename SampleIteratorT, - typename CounterT, - typename LevelT, - typename OffsetT> - CUB_RUNTIME_FUNCTION - static cudaError_t MultiHistogramRange( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). - CounterT* d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histogram[i] should be num_levels[i] - 1. - int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. - LevelT* d_levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. 
Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. - OffsetT num_pixels, ///< [in] The number of multi-channel pixels (i.e., the length of \p d_samples / NUM_CHANNELS) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - /// The sample value type of the input iterator - typedef typename std::iterator_traits::value_type SampleT; - - return MultiHistogramRange( - d_temp_storage, - temp_storage_bytes, - d_samples, - d_histogram, - num_levels, - d_levels, - num_pixels, - 1, - sizeof(SampleT) * NUM_CHANNELS * num_pixels, - stream, - debug_synchronous); - } - - - /** - * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using the specified bin boundary levels. - * - * \par - * - The input is a sequence of pixel structures, where each pixel comprises - * a record of \p NUM_CHANNELS consecutive data samples (e.g., an RGBA pixel). - * - Of the \p NUM_CHANNELS specified, the function will only compute histograms - * for the first \p NUM_ACTIVE_CHANNELS (e.g., RGB histograms from RGBA - * pixel samples). - * - A two-dimensional region of interest within \p d_samples can be specified - * using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters. - * - The row stride must be a whole multiple of the sample data type - * size, i.e., (row_stride_bytes % sizeof(SampleT)) == 0. - * - The number of histogram bins for channeli is num_levels[i] - 1. - * - For channeli, the range of values for all histogram bins - * have the same width: (upper_level[i] - lower_level[i]) / ( num_levels[i] - 1) - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the computation of three 4-bin RGB histograms from a 2x3 region of - * interest of within a flattened 2x4 array of quad-channel RGBA pixels (8 bits per channel per pixel). - * - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input samples - * // and output histograms - * int num_row_pixels; // e.g., 3 - * int num_rows; // e.g., 2 - * size_t row_stride_bytes; // e.g., 4 * sizeof(unsigned char) * NUM_CHANNELS - * unsigned char* d_samples; // e.g., [(2, 6, 7, 5),(3, 0, 2, 1),(1, 1, 1, 1),(-, -, -, -), - * // (7, 0, 6, 2),(0, 6, 7, 5),(3, 0, 2, 6),(-, -, -, -)] - * int* d_histogram[3]; // e.g., [[ -, -, -, -],[ -, -, -, -],[ -, -, -, -]]; - * int num_levels[3]; // e.g., {5, 5, 5}; - * unsigned int* d_levels[3]; // e.g., [ [0, 2, 4, 6, 8], - * // [0, 2, 4, 6, 8], - * // [0, 2, 4, 6, 8] ]; - * ... 
- * - * // Determine temporary device storage requirements - * void* d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, d_levels, num_row_pixels, num_rows, row_stride_bytes); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Compute histograms - * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes, - * d_samples, d_histogram, num_levels, d_levels, num_row_pixels, num_rows, row_stride_bytes); - * - * // d_histogram <-- [ [2, 3, 0, 1], - * // [3, 0, 0, 2], - * // [1, 2, 0, 3] ] - * - * \endcode - * - * \tparam NUM_CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) - * \tparam NUM_ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed - * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator - * \tparam CounterT [inferred] Integer type for histogram bin counters - * \tparam LevelT [inferred] Type for specifying boundaries (levels) - * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 - */ - template < - int NUM_CHANNELS, - int NUM_ACTIVE_CHANNELS, - typename SampleIteratorT, - typename CounterT, - typename LevelT, - typename OffsetT> - CUB_RUNTIME_FUNCTION - static cudaError_t MultiHistogramRange( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). - CounterT* d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histogram[i] should be num_levels[i] - 1. - int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. - LevelT* d_levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. - OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest - OffsetT num_rows, ///< [in] The number of rows in the region of interest - size_t row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
- { - /// The sample value type of the input iterator - typedef typename std::iterator_traits::value_type SampleT; - Int2Type is_byte_sample; - - if ((sizeof(OffsetT) > sizeof(int)) && - ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) std::numeric_limits::max())) - { - // Down-convert OffsetT data type - return DipatchHistogram::DispatchRange( - d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, - (int) num_row_pixels, (int) num_rows, (int) (row_stride_bytes / sizeof(SampleT)), - stream, debug_synchronous, is_byte_sample); - } - - return DipatchHistogram::DispatchRange( - d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels, - num_row_pixels, num_rows, (OffsetT) (row_stride_bytes / sizeof(SampleT)), - stream, debug_synchronous, is_byte_sample); - } - - - - //@} end member group -}; - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/SRC/cub/device/device_partition.cuh b/SRC/cub/device/device_partition.cuh deleted file mode 100644 index 50535400..00000000 --- a/SRC/cub/device/device_partition.cuh +++ /dev/null @@ -1,273 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::DevicePartition provides device-wide, parallel operations for partitioning sequences of data items residing within device-accessible memory. - */ - -#pragma once - -#include -#include - -#include "dispatch/dispatch_select_if.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \brief DevicePartition provides device-wide, parallel operations for partitioning sequences of data items residing within device-accessible memory. 
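The output layout this class documents (selected items compacted stably to the front, rejected items written to the rear in reverse order) can be sketched on the host as follows; a minimal illustration of the documented semantics, not CUB code, using the same values as the Flagged snippet further below:

    #include <cstdio>
    #include <vector>

    int main()
    {
        int  in_vals[]  = {1, 2, 3, 4, 5, 6, 7, 8};
        char in_flags[] = {1, 0, 0, 1, 0, 1, 1, 0};
        std::vector<int> out(8);

        size_t front = 0, back = out.size();
        for (size_t i = 0; i < out.size(); ++i) {
            if (in_flags[i]) out[front++] = in_vals[i];  // stable at the front
            else             out[--back]  = in_vals[i];  // reversed at the rear
        }
        // out   <-- [1, 4, 6, 7, 8, 5, 3, 2]
        // front <-- 4 (the number of selected items)
        std::printf("num selected = %zu\n", front);
        return 0;
    }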
![](partition_logo.png) - * \ingroup SingleModule - * - * \par Overview - * These operations apply a selection criterion to construct a partitioned output sequence from items selected/unselected from - * a specified input sequence. - * - * \par Usage Considerations - * \cdp_class{DevicePartition} - * - * \par Performance - * \linear_performance{partition} - * - * \par - * The following chart illustrates DevicePartition::If - * performance across different CUDA architectures for \p int32 items, - * where 50% of the items are randomly selected for the first partition. - * \plots_below - * - * \image html partition_if_int32_50_percent.png - * - */ -struct DevicePartition -{ - /** - * \brief Uses the \p d_flags sequence to split the corresponding items from \p d_in into a partitioned sequence \p d_out. The total number of items copied into the first partition is written to \p d_num_selected_out. ![](partition_flags_logo.png) - * - * \par - * - The value type of \p d_flags must be castable to \p bool (e.g., \p bool, \p char, \p int, etc.). - * - Copies of the selected items are compacted into \p d_out and maintain their original - * relative ordering, however copies of the unselected items are compacted into the - * rear of \p d_out in reverse order. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the compaction of items selected from an \p int device vector. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input, flags, and output - * int num_items; // e.g., 8 - * int *d_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] - * char *d_flags; // e.g., [1, 0, 0, 1, 0, 1, 1, 0] - * int *d_out; // e.g., [ , , , , , , , ] - * int *d_num_selected_out; // e.g., [ ] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run selection - * cub::DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items); - * - * // d_out <-- [1, 4, 6, 7, 8, 5, 3, 2] - * // d_num_selected_out <-- [4] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator - * \tparam FlagIterator [inferred] Random-access input iterator type for reading selection flags \iterator - * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing output items \iterator - * \tparam NumSelectedIteratorT [inferred] Output iterator type for recording the number of items selected \iterator - */ - template < - typename InputIteratorT, - typename FlagIterator, - typename OutputIteratorT, - typename NumSelectedIteratorT> - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Flagged( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
-        size_t &temp_storage_bytes,              ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT d_in,                     ///< [in] Pointer to the input sequence of data items
-        FlagIterator d_flags,                    ///< [in] Pointer to the input sequence of selection flags
-        OutputIteratorT d_out,                   ///< [out] Pointer to the output sequence of partitioned data items
-        NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition)
-        int num_items,                           ///< [in] Total number of items to select from
-        cudaStream_t stream = 0,                 ///< [in] [optional] CUDA stream to launch kernels within. Default is stream 0.
-        bool debug_synchronous = false)          ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
-    {
-        typedef int OffsetT;            // Signed integer type for global offsets
-        typedef NullType SelectOp;      // Selection op (not used)
-        typedef NullType EqualityOp;    // Equality operator (not used)
-
-        return DispatchSelectIf<InputIteratorT, FlagIterator, OutputIteratorT, NumSelectedIteratorT, SelectOp, EqualityOp, OffsetT, true>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            d_flags,
-            d_out,
-            d_num_selected_out,
-            SelectOp(),
-            EqualityOp(),
-            num_items,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Uses the \p select_op functor to split the corresponding items from \p d_in into a partitioned sequence \p d_out. The total number of items copied into the first partition is written to \p d_num_selected_out. ![](partition_logo.png)
-     *
-     * \par
-     * - Copies of the selected items are compacted into \p d_out and maintain their original
-     *   relative ordering; however, copies of the unselected items are compacted into the
-     *   rear of \p d_out in reverse order.
-     * - \devicestorage
-     *
-     * \par Performance
-     * The following charts illustrate saturated partition-if performance across different
-     * CUDA architectures for \p int32 and \p int64 items, respectively. Items are
-     * selected for the first partition with 50% probability.
-     *
-     * \image html partition_if_int32_50_percent.png
-     * \image html partition_if_int64_50_percent.png
-     *
-     * \par
-     * The following charts are similar, but with a 5% selection probability for the first partition:
-     *
-     * \image html partition_if_int32_5_percent.png
-     * \image html partition_if_int64_5_percent.png
-     *
-     * \par Snippet
-     * The code snippet below illustrates the compaction of items selected from an \p int device vector.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_partition.cuh>
-     *
-     * // Functor type for selecting values less than some criteria
-     * struct LessThan
-     * {
-     *     int compare;
-     *
-     *     CUB_RUNTIME_FUNCTION __forceinline__
-     *     LessThan(int compare) : compare(compare) {}
-     *
-     *     CUB_RUNTIME_FUNCTION __forceinline__
-     *     bool operator()(const int &a) const {
-     *         return (a < compare);
-     *     }
-     * };
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input and output
-     * int  num_items;              // e.g., 8
-     * int  *d_in;                  // e.g., [0, 2, 3, 9, 5, 2, 81, 8]
-     * int  *d_out;                 // e.g., [ , , , , , , , ]
-     * int  *d_num_selected_out;    // e.g., [ ]
-     * LessThan select_op(7);
-     * ...
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run selection - * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op); - * - * // d_out <-- [0, 2, 3, 5, 2, 8, 81, 9] - * // d_num_selected_out <-- [5] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator - * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing output items \iterator - * \tparam NumSelectedIteratorT [inferred] Output iterator type for recording the number of items selected \iterator - * \tparam SelectOp [inferred] Selection functor type having member bool operator()(const T &a) - */ - template < - typename InputIteratorT, - typename OutputIteratorT, - typename NumSelectedIteratorT, - typename SelectOp> - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t If( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output sequence of partitioned data items - NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition) - int num_items, ///< [in] Total number of items to select from - SelectOp select_op, ///< [in] Unary selection operator - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - typedef int OffsetT; // Signed integer type for global offsets - typedef NullType* FlagIterator; // FlagT iterator type (not used) - typedef NullType EqualityOp; // Equality operator (not used) - - return DispatchSelectIf::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - NULL, - d_out, - d_num_selected_out, - select_op, - EqualityOp(), - num_items, - stream, - debug_synchronous); - } - -}; - -/** - * \example example_device_partition_flagged.cu - * \example example_device_partition_if.cu - */ - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/SRC/cub/device/device_radix_sort.cuh b/SRC/cub/device/device_radix_sort.cuh deleted file mode 100644 index 1c0bdbea..00000000 --- a/SRC/cub/device/device_radix_sort.cuh +++ /dev/null @@ -1,797 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. 
- * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory. - */ - -#pragma once - -#include -#include - -#include "dispatch/dispatch_radix_sort.cuh" -#include "../util_arch.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \brief DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory. ![](sorting_logo.png) - * \ingroup SingleModule - * - * \par Overview - * The [radix sorting method](http://en.wikipedia.org/wiki/Radix_sort) arranges - * items into ascending (or descending) order. The algorithm relies upon a positional representation for - * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits, - * characters, etc.) specified from least-significant to most-significant. For a - * given input sequence of keys and a set of rules specifying a total ordering - * of the symbolic alphabet, the radix sorting method produces a lexicographic - * ordering of those keys. - * - * \par - * DeviceRadixSort can sort all of the built-in C++ numeric primitive types - * (unsigned char, \p int, \p double, etc.) as well as CUDA's \p __half - * half-precision floating-point type. Although the direct radix sorting - * method can only be applied to unsigned integral types, DeviceRadixSort - * is able to sort signed and floating-point types via simple bit-wise transformations - * that ensure lexicographic key ordering. - * - * \par Usage Considerations - * \cdp_class{DeviceRadixSort} - * - * \par Performance - * \linear_performance{radix sort} The following chart illustrates DeviceRadixSort::SortKeys - * performance across different CUDA architectures for uniform-random \p uint32 keys. 
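The "simple bit-wise transformations" mentioned in the overview above follow the standard order-preserving scheme; the sketch below illustrates the float case only and is not CUB's internal code (CUB implements this generically through its key-traits machinery):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Map a float to an unsigned key whose unsigned order matches float order:
    // negative values have all bits flipped (reversing their relative order),
    // while non-negative values only have the sign bit set (placing them above
    // all negatives).
    static uint32_t FloatToOrderedBits(float f)
    {
        uint32_t u;
        std::memcpy(&u, &f, sizeof(u));     // type-pun without aliasing issues
        return (u & 0x80000000u) ? ~u : (u | 0x80000000u);
    }

    int main()
    {
        const float v[4] = {-2.5f, -0.1f, 0.0f, 3.7f};
        for (int i = 0; i < 4; ++i)         // printed keys increase monotonically
            std::printf("%8.2f -> %08x\n", v[i], (unsigned) FloatToOrderedBits(v[i]));
        return 0;
    }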
- * \plots_below - * - * \image html lsb_radix_sort_int32_keys.png - * - */ -struct DeviceRadixSort -{ - - /******************************************************************//** - * \name KeyT-value pairs - *********************************************************************/ - //@{ - - /** - * \brief Sorts key-value pairs into ascending order. (~2N auxiliary storage required) - * - * \par - * - The contents of the input data are not altered by the sorting operation - * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. - * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. - * - \devicestorage - * - * \par Performance - * The following charts illustrate saturated sorting performance across different - * CUDA architectures for uniform-random uint32,uint32 and - * uint64,uint64 pairs, respectively. - * - * \image html lsb_radix_sort_int32_pairs.png - * \image html lsb_radix_sort_int64_pairs.png - * - * \par Snippet - * The code snippet below illustrates the sorting of a device vector of \p int keys - * with associated vector of \p int values. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for sorting data - * int num_items; // e.g., 7 - * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_keys_out; // e.g., [ ... ] - * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] - * int *d_values_out; // e.g., [ ... ] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, - * d_keys_in, d_keys_out, d_values_in, d_values_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, - * d_keys_in, d_keys_out, d_values_in, d_values_out, num_items); - * - * // d_keys_out <-- [0, 3, 5, 6, 7, 8, 9] - * // d_values_out <-- [5, 4, 3, 1, 2, 0, 6] - * - * \endcode - * - * \tparam KeyT [inferred] KeyT type - * \tparam ValueT [inferred] ValueT type - */ - template < - typename KeyT, - typename ValueT> - CUB_RUNTIME_FUNCTION - static cudaError_t SortPairs( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - const KeyT *d_keys_in, ///< [in] Pointer to the input data of key data to sort - KeyT *d_keys_out, ///< [out] Pointer to the sorted output sequence of key data - const ValueT *d_values_in, ///< [in] Pointer to the corresponding input sequence of associated value items - ValueT *d_values_out, ///< [out] Pointer to the correspondingly-reordered output sequence of associated value items - int num_items, ///< [in] Number of items to sort - int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison - int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. 
Default is stream 0.
-        bool debug_synchronous = false)     ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        DoubleBuffer<KeyT>   d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
-        DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out);
-
-        return DispatchRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            begin_bit,
-            end_bit,
-            false,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Sorts key-value pairs into ascending order. (~N auxiliary storage required)
-     *
-     * \par
-     * - The sorting operation is given a pair of key buffers and a corresponding
-     *   pair of associated value buffers.  Each pair is managed by a DoubleBuffer
-     *   structure that indicates which of the two buffers is "current" (and thus
-     *   contains the input data to be sorted).
-     * - The contents of both buffers within each pair may be altered by the sorting
-     *   operation.
-     * - Upon completion, the sorting operation will update the "current" indicator
-     *   within each DoubleBuffer wrapper to reference which of the two buffers
-     *   now contains the sorted output sequence (a function of the number of key bits
-     *   specified and the targeted device architecture).
-     * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement.
-     * - \devicestorageP
-     * - \devicestorage
-     *
-     * \par Performance
-     * The following charts illustrate saturated sorting performance across different
-     * CUDA architectures for uniform-random uint32,uint32 and
-     * uint64,uint64 pairs, respectively.
-     *
-     * \image html lsb_radix_sort_int32_pairs.png
-     * \image html lsb_radix_sort_int64_pairs.png
-     *
-     * \par Snippet
-     * The code snippet below illustrates the sorting of a device vector of \p int keys
-     * with an associated vector of \p int values.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_key_alt_buf;     // e.g., [ ... ]
-     * int  *d_value_buf;       // e.g., [0, 1, 2, 3, 4, 5, 6]
-     * int  *d_value_alt_buf;   // e.g., [ ... ]
-     * ...
-     *
-     * // Create a set of DoubleBuffers to wrap pairs of device pointers
-     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
-     * cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
-     *
-     * // Determine temporary device storage requirements
-     * void *d_temp_storage = NULL;
-     * size_t temp_storage_bytes = 0;
-     * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
-     *
-     * // d_keys.Current()      <-- [0, 3, 5, 6, 7, 8, 9]
-     * // d_values.Current()    <-- [5, 4, 3, 1, 2, 0, 6]
-     *
-     * \endcode
-     *
-     * \tparam KeyT    [inferred] KeyT type
-     * \tparam ValueT  [inferred] ValueT type
-     */
-    template <
-        typename KeyT,
-        typename ValueT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortPairs(
-        void *d_temp_storage,                   ///< [in] %Device-accessible allocation of temporary storage.
When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys - DoubleBuffer &d_values, ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values - int num_items, ///< [in] Number of items to sort - int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison - int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - return DispatchRadixSort::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - begin_bit, - end_bit, - true, - stream, - debug_synchronous); - } - - - /** - * \brief Sorts key-value pairs into descending order. (~2N auxiliary storage required). - * - * \par - * - The contents of the input data are not altered by the sorting operation - * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. - * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. - * - \devicestorage - * - * \par Performance - * Performance is similar to DeviceRadixSort::SortPairs. - * - * \par Snippet - * The code snippet below illustrates the sorting of a device vector of \p int keys - * with associated vector of \p int values. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for sorting data - * int num_items; // e.g., 7 - * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_keys_out; // e.g., [ ... ] - * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] - * int *d_values_out; // e.g., [ ... ] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, - * d_keys_in, d_keys_out, d_values_in, d_values_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, - * d_keys_in, d_keys_out, d_values_in, d_values_out, num_items); - * - * // d_keys_out <-- [9, 8, 7, 6, 5, 3, 0] - * // d_values_out <-- [6, 0, 2, 1, 3, 4, 5] - * - * \endcode - * - * \tparam KeyT [inferred] KeyT type - * \tparam ValueT [inferred] ValueT type - */ - template < - typename KeyT, - typename ValueT> - CUB_RUNTIME_FUNCTION - static cudaError_t SortPairsDescending( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - const KeyT *d_keys_in, ///< [in] Pointer to the input data of key data to sort - KeyT *d_keys_out, ///< [out] Pointer to the sorted output sequence of key data - const ValueT *d_values_in, ///< [in] Pointer to the corresponding input sequence of associated value items - ValueT *d_values_out, ///< [out] Pointer to the correspondingly-reordered output sequence of associated value items - int num_items, ///< [in] Number of items to sort - int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison - int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); - DoubleBuffer d_values(const_cast(d_values_in), d_values_out); - - return DispatchRadixSort::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - begin_bit, - end_bit, - false, - stream, - debug_synchronous); - } - - - /** - * \brief Sorts key-value pairs into descending order. (~N auxiliary storage required). - * - * \par - * - The sorting operation is given a pair of key buffers and a corresponding - * pair of associated value buffers. Each pair is managed by a DoubleBuffer - * structure that indicates which of the two buffers is "current" (and thus - * contains the input data to be sorted). - * - The contents of both buffers within each pair may be altered by the sorting - * operation. - * - Upon completion, the sorting operation will update the "current" indicator - * within each DoubleBuffer wrapper to reference which of the two buffers - * now contains the sorted output sequence (a function of the number of key bits - * specified and the targeted device architecture). - * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. 
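A usage sketch of the bit-subrange option with this double-buffer overload follows; it assumes the CUB headers this patch removes, and the buffer names and the choice of the low 16 bits are illustrative only:

    #include <cub/cub.cuh>

    // Sort pairs in descending order over key bit subrange [0, 16), for keys
    // known to differ only in their low 16 bits.
    void SortPairsDescendingLow16(int *d_key_buf, int *d_key_alt_buf,
                                  int *d_value_buf, int *d_value_alt_buf,
                                  int num_items)
    {
        cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
        cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);

        void   *d_temp_storage = NULL;
        size_t temp_storage_bytes = 0;

        // Size query pass: d_temp_storage == NULL, nothing is sorted yet.
        cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes,
            d_keys, d_values, num_items, 0, 16);

        cudaMalloc(&d_temp_storage, temp_storage_bytes);

        // Sorting pass over bit subrange [0, 16).
        cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes,
            d_keys, d_values, num_items, 0, 16);

        cudaFree(d_temp_storage);
        // d_keys.Current() / d_values.Current() now reference the sorted output.
    }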
- * - \devicestorageP - * - \devicestorage - * - * \par Performance - * Performance is similar to DeviceRadixSort::SortPairs. - * - * \par Snippet - * The code snippet below illustrates the sorting of a device vector of \p int keys - * with associated vector of \p int values. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for sorting data - * int num_items; // e.g., 7 - * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_key_alt_buf; // e.g., [ ... ] - * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] - * int *d_value_alt_buf; // e.g., [ ... ] - * ... - * - * // Create a set of DoubleBuffers to wrap pairs of device pointers - * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); - * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items); - * - * // d_keys.Current() <-- [9, 8, 7, 6, 5, 3, 0] - * // d_values.Current() <-- [6, 0, 2, 1, 3, 4, 5] - * - * \endcode - * - * \tparam KeyT [inferred] KeyT type - * \tparam ValueT [inferred] ValueT type - */ - template < - typename KeyT, - typename ValueT> - CUB_RUNTIME_FUNCTION - static cudaError_t SortPairsDescending( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys - DoubleBuffer &d_values, ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values - int num_items, ///< [in] Number of items to sort - int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison - int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - return DispatchRadixSort::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - begin_bit, - end_bit, - true, - stream, - debug_synchronous); - } - - - //@} end member group - /******************************************************************//** - * \name Keys-only - *********************************************************************/ - //@{ - - - /** - * \brief Sorts keys into ascending order. 
(~2N auxiliary storage required) - * - * \par - * - The contents of the input data are not altered by the sorting operation - * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. - * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. - * - \devicestorage - * - * \par Performance - * The following charts illustrate saturated sorting performance across different - * CUDA architectures for uniform-random \p uint32 and \p uint64 keys, respectively. - * - * \image html lsb_radix_sort_int32_keys.png - * \image html lsb_radix_sort_int64_keys.png - * - * \par Snippet - * The code snippet below illustrates the sorting of a device vector of \p int keys. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for sorting data - * int num_items; // e.g., 7 - * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_keys_out; // e.g., [ ... ] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items); - * - * // d_keys_out <-- [0, 3, 5, 6, 7, 8, 9] - * - * \endcode - * - * \tparam KeyT [inferred] KeyT type - */ - template - CUB_RUNTIME_FUNCTION - static cudaError_t SortKeys( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - const KeyT *d_keys_in, ///< [in] Pointer to the input data of key data to sort - KeyT *d_keys_out, ///< [out] Pointer to the sorted output sequence of key data - int num_items, ///< [in] Number of items to sort - int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison - int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - // Null value type - DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); - DoubleBuffer d_values; - - return DispatchRadixSort::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - begin_bit, - end_bit, - false, - stream, - debug_synchronous); - } - - - /** - * \brief Sorts keys into ascending order. (~N auxiliary storage required). - * - * \par - * - The sorting operation is given a pair of key buffers managed by a - * DoubleBuffer structure that indicates which of the two buffers is - * "current" (and thus contains the input data to be sorted). 
- * - The contents of both buffers may be altered by the sorting operation. - * - Upon completion, the sorting operation will update the "current" indicator - * within the DoubleBuffer wrapper to reference which of the two buffers - * now contains the sorted output sequence (a function of the number of key bits - * specified and the targeted device architecture). - * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. - * - \devicestorageP - * - \devicestorage - * - * \par Performance - * The following charts illustrate saturated sorting performance across different - * CUDA architectures for uniform-random \p uint32 and \p uint64 keys, respectively. - * - * \image html lsb_radix_sort_int32_keys.png - * \image html lsb_radix_sort_int64_keys.png - * - * \par Snippet - * The code snippet below illustrates the sorting of a device vector of \p int keys. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for sorting data - * int num_items; // e.g., 7 - * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_key_alt_buf; // e.g., [ ... ] - * ... - * - * // Create a DoubleBuffer to wrap the pair of device pointers - * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items); - * - * // d_keys.Current() <-- [0, 3, 5, 6, 7, 8, 9] - * - * \endcode - * - * \tparam KeyT [inferred] KeyT type - */ - template - CUB_RUNTIME_FUNCTION - static cudaError_t SortKeys( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys - int num_items, ///< [in] Number of items to sort - int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison - int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - // Null value type - DoubleBuffer d_values; - - return DispatchRadixSort::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - begin_bit, - end_bit, - true, - stream, - debug_synchronous); - } - - /** - * \brief Sorts keys into descending order. (~2N auxiliary storage required). 
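For reference, the keys-only calls documented above assemble into the following self-contained host program. This is an illustrative sketch, not part of the patch: it assumes the CUB 1.8 headers this commit removes from SRC/cub, restores the angle-bracketed include path and template arguments that the snippets lost in transit, and omits error checking.

    #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
    #include <cstdio>

    int main()
    {
        const int num_items = 7;
        int h_keys[num_items] = {8, 6, 7, 5, 3, 0, 9};

        int *d_keys_in, *d_keys_out;
        cudaMalloc(&d_keys_in,  num_items * sizeof(int));
        cudaMalloc(&d_keys_out, num_items * sizeof(int));
        cudaMemcpy(d_keys_in, h_keys, num_items * sizeof(int), cudaMemcpyHostToDevice);

        // First call only sizes the temporary storage; the second call sorts.
        void  *d_temp_storage = NULL;
        size_t temp_storage_bytes = 0;
        cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes,
                                       d_keys_in, d_keys_out, num_items);
        cudaMalloc(&d_temp_storage, temp_storage_bytes);
        cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes,
                                       d_keys_in, d_keys_out, num_items);

        cudaMemcpy(h_keys, d_keys_out, num_items * sizeof(int), cudaMemcpyDeviceToHost);
        for (int i = 0; i < num_items; ++i) printf("%d ", h_keys[i]);   // 0 3 5 6 7 8 9
        return 0;
    }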
- *
- * \par
- * - The contents of the input data are not altered by the sorting operation
- * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement.
- * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below.
- * - \devicestorage
- *
- * \par Performance
- * Performance is similar to DeviceRadixSort::SortKeys.
- *
- * \par Snippet
- * The code snippet below illustrates the sorting of a device vector of \p int keys.
- * \par
- * \code
- * #include <cub/cub.cuh> // or equivalently <cub/device/device_radix_sort.cuh>
- *
- * // Declare, allocate, and initialize device-accessible pointers for sorting data
- * int num_items; // e.g., 7
- * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
- * int *d_keys_out; // e.g., [ ... ]
- * ...
- *
- * // Determine temporary device storage requirements
- * void *d_temp_storage = NULL;
- * size_t temp_storage_bytes = 0;
- * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
- *
- * // Allocate temporary storage
- * cudaMalloc(&d_temp_storage, temp_storage_bytes);
- *
- * // Run sorting operation
- * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items);
- *
- * // d_keys_out <-- [9, 8, 7, 6, 5, 3, 0]
- *
- * \endcode
- *
- * \tparam KeyT [inferred] Key type
- */
- template <typename KeyT>
- CUB_RUNTIME_FUNCTION
- static cudaError_t SortKeysDescending(
- void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
- const KeyT *d_keys_in, ///< [in] Pointer to the input data of key data to sort
- KeyT *d_keys_out, ///< [out] Pointer to the sorted output sequence of key data
- int num_items, ///< [in] Number of items to sort
- int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison
- int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
- cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0.
- bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
- {
- // Signed integer type for global offsets
- typedef int OffsetT;
-
- DoubleBuffer<KeyT> d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
- DoubleBuffer<NullType> d_values;
-
- return DispatchRadixSort<true, KeyT, NullType, OffsetT>::Dispatch(
- d_temp_storage,
- temp_storage_bytes,
- d_keys,
- d_values,
- num_items,
- begin_bit,
- end_bit,
- false,
- stream,
- debug_synchronous);
- }
-
-
- /**
- * \brief Sorts keys into descending order. (~N auxiliary storage required).
- *
- * \par
- * - The sorting operation is given a pair of key buffers managed by a
- * DoubleBuffer structure that indicates which of the two buffers is
- * "current" (and thus contains the input data to be sorted).
- * - The contents of both buffers may be altered by the sorting operation.
- * - Upon completion, the sorting operation will update the "current" indicator - * within the DoubleBuffer wrapper to reference which of the two buffers - * now contains the sorted output sequence (a function of the number of key bits - * specified and the targeted device architecture). - * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. - * - \devicestorageP - * - \devicestorage - * - * \par Performance - * Performance is similar to DeviceRadixSort::SortKeys. - * - * \par Snippet - * The code snippet below illustrates the sorting of a device vector of \p int keys. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for sorting data - * int num_items; // e.g., 7 - * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_key_alt_buf; // e.g., [ ... ] - * ... - * - * // Create a DoubleBuffer to wrap the pair of device pointers - * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, num_items); - * - * // d_keys.Current() <-- [9, 8, 7, 6, 5, 3, 0] - * - * \endcode - * - * \tparam KeyT [inferred] KeyT type - */ - template - CUB_RUNTIME_FUNCTION - static cudaError_t SortKeysDescending( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys - int num_items, ///< [in] Number of items to sort - int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison - int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
- { - // Signed integer type for global offsets - typedef int OffsetT; - - // Null value type - DoubleBuffer d_values; - - return DispatchRadixSort::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - begin_bit, - end_bit, - true, - stream, - debug_synchronous); - } - - - //@} end member group - - -}; - -/** - * \example example_device_radix_sort.cu - */ - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/SRC/cub/device/device_reduce.cuh b/SRC/cub/device/device_reduce.cuh deleted file mode 100644 index 13c7a72d..00000000 --- a/SRC/cub/device/device_reduce.cuh +++ /dev/null @@ -1,734 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory. - */ - -#pragma once - -#include -#include -#include - -#include "../iterator/arg_index_input_iterator.cuh" -#include "dispatch/dispatch_reduce.cuh" -#include "dispatch/dispatch_reduce_by_key.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \brief DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory. ![](reduce_logo.png) - * \ingroup SingleModule - * - * \par Overview - * A reduction (or fold) - * uses a binary combining operator to compute a single aggregate from a sequence of input elements. 
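Concretely, the semantics DeviceReduce parallelizes are those of a plain sequential left fold. A minimal host-side reference, illustrative only (fold is a hypothetical helper written for this sketch, not a CUB API):

    #include <vector>

    template <typename T, typename Op>
    T fold(const std::vector<T> &in, T init, Op op)
    {
        T acc = init;
        for (const T &x : in)
            acc = op(acc, x);   // one binary combine per input element
        return acc;
    }

    // fold({8, 6, 7, 5, 3, 0, 9}, 0, [](int a, int b) { return a + b; }) == 38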
- * - * \par Usage Considerations - * \cdp_class{DeviceReduce} - * - * \par Performance - * \linear_performance{reduction, reduce-by-key, and run-length encode} - * - * \par - * The following chart illustrates DeviceReduce::Sum - * performance across different CUDA architectures for \p int32 keys. - * - * \image html reduce_int32.png - * - * \par - * The following chart illustrates DeviceReduce::ReduceByKey (summation) - * performance across different CUDA architectures for \p fp32 - * values. Segments are identified by \p int32 keys, and have lengths uniformly sampled from [1,1000]. - * - * \image html reduce_by_key_fp32_len_500.png - * - * \par - * \plots_below - * - */ -struct DeviceReduce -{ - /** - * \brief Computes a device-wide reduction using the specified binary \p reduction_op functor and initial value \p init. - * - * \par - * - Does not support binary reduction operators that are non-commutative. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates a user-defined min-reduction of a device vector of \p int data elements. - * \par - * \code - * #include // or equivalently - * - * // CustomMin functor - * struct CustomMin - * { - * template - * __device__ __forceinline__ - * T operator()(const T &a, const T &b) const { - * return (b < a) ? b : a; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [-] - * CustomMin min_op; - * int init; // e.g., INT_MAX - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, min_op, init); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run reduction - * cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, min_op, init); - * - * // d_out <-- [0] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator - * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator - * \tparam ReductionOpT [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) - * \tparam T [inferred] Data element type that is convertible to the \p value type of \p InputIteratorT - */ - template < - typename InputIteratorT, - typename OutputIteratorT, - typename ReductionOpT, - typename T> - CUB_RUNTIME_FUNCTION - static cudaError_t Reduce( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
- InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
- OutputIteratorT d_out, ///< [out] Pointer to the output aggregate
- int num_items, ///< [in] Total number of input items (i.e., length of \p d_in)
- ReductionOpT reduction_op, ///< [in] Binary reduction functor
- T init, ///< [in] Initial value of the reduction
- cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0.
- bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false.
- {
- // Signed integer type for global offsets
- typedef int OffsetT;
-
- return DispatchReduce<InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT>::Dispatch(
- d_temp_storage,
- temp_storage_bytes,
- d_in,
- d_out,
- num_items,
- reduction_op,
- init,
- stream,
- debug_synchronous);
- }
-
-
- /**
- * \brief Computes a device-wide sum using the addition (\p +) operator.
- *
- * \par
- * - Uses \p 0 as the initial value of the reduction.
- * - Does not support \p + operators that are non-commutative.
- * - Provides "run-to-run" determinism for pseudo-associative reduction
- * (e.g., addition of floating point types) on the same GPU device.
- * However, results for pseudo-associative reduction may be inconsistent
- * from one device to another device of a different compute-capability
- * because CUB can employ different tile-sizing for different architectures.
- * - \devicestorage
- *
- * \par Performance
- * The following charts illustrate saturated sum-reduction performance across different
- * CUDA architectures for \p int32 and \p int64 items, respectively.
- *
- * \image html reduce_int32.png
- * \image html reduce_int64.png
- *
- * \par Snippet
- * The code snippet below illustrates the sum-reduction of a device vector of \p int data elements.
- * \par
- * \code
- * #include <cub/cub.cuh> // or equivalently <cub/device/device_reduce.cuh>
- *
- * // Declare, allocate, and initialize device-accessible pointers for input and output
- * int num_items; // e.g., 7
- * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
- * int *d_out; // e.g., [-]
- * ...
- *
- * // Determine temporary device storage requirements
- * void *d_temp_storage = NULL;
- * size_t temp_storage_bytes = 0;
- * cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
- *
- * // Allocate temporary storage
- * cudaMalloc(&d_temp_storage, temp_storage_bytes);
- *
- * // Run sum-reduction
- * cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
- *
- * // d_out <-- [38]
- *
- * \endcode
- *
- * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator
- * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator
- */
- template <
- typename InputIteratorT,
- typename OutputIteratorT>
- CUB_RUNTIME_FUNCTION
- static cudaError_t Sum(
- void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - // The output value type - typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... then the input iterator's value type, - typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type - - return DispatchReduce::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_items, - cub::Sum(), - OutputT(), // zero-initialize - stream, - debug_synchronous); - } - - - /** - * \brief Computes a device-wide minimum using the less-than ('<') operator. - * - * \par - * - Uses std::numeric_limits::max() as the initial value of the reduction. - * - Does not support \p < operators that are non-commutative. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the min-reduction of a device vector of \p int data elements. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [-] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run min-reduction - * cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); - * - * // d_out <-- [0] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator - * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator - */ - template < - typename InputIteratorT, - typename OutputIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t Min( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - // The input value type - typedef typename std::iterator_traits::value_type InputT; - - return DispatchReduce::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_items, - cub::Min(), - Traits::Max(), // replace with std::numeric_limits::max() when C++11 support is more prevalent - stream, - debug_synchronous); - } - - - /** - * \brief Finds the first device-wide minimum using the less-than ('<') operator, also returning the index of that item. - * - * \par - * - The output value type of \p d_out is cub::KeyValuePair (assuming the value type of \p d_in is \p T) - * - The minimum is written to d_out.value and its offset in the input array is written to d_out.key. - * - The {1, std::numeric_limits::max()} tuple is produced for zero-length inputs - * - Does not support \p < operators that are non-commutative. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the argmin-reduction of a device vector of \p int data elements. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * KeyValuePair *d_out; // e.g., [{-,-}] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run argmin-reduction - * cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items); - * - * // d_out <-- [{5, 0}] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items (of some type \p T) \iterator - * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate (having value type cub::KeyValuePair) \iterator - */ - template < - typename InputIteratorT, - typename OutputIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t ArgMin( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - // The input type - typedef typename std::iterator_traits::value_type InputValueT; - - // The output tuple type - typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? - KeyValuePair, // ... then the key value pair OffsetT + InputValueT - typename std::iterator_traits::value_type>::Type OutputTupleT; // ... else the output iterator's value type - - // The output value type - typedef typename OutputTupleT::Value OutputValueT; - - // Wrapped input iterator to produce index-value tuples - typedef ArgIndexInputIterator ArgIndexInputIteratorT; - ArgIndexInputIteratorT d_indexed_in(d_in); - - // Initial value - OutputTupleT initial_value(1, Traits::Max()); // replace with std::numeric_limits::max() when C++11 support is more prevalent - - return DispatchReduce::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_indexed_in, - d_out, - num_items, - cub::ArgMin(), - initial_value, - stream, - debug_synchronous); - } - - - /** - * \brief Computes a device-wide maximum using the greater-than ('>') operator. - * - * \par - * - Uses std::numeric_limits::lowest() as the initial value of the reduction. - * - Does not support \p > operators that are non-commutative. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the max-reduction of a device vector of \p int data elements. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [-] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_max, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run max-reduction - * cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_max, num_items); - * - * // d_out <-- [9] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator - * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator - */ - template < - typename InputIteratorT, - typename OutputIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t Max( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - // The input value type - typedef typename std::iterator_traits::value_type InputT; - - return DispatchReduce::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_items, - cub::Max(), - Traits::Lowest(), // replace with std::numeric_limits::lowest() when C++11 support is more prevalent - stream, - debug_synchronous); - } - - - /** - * \brief Finds the first device-wide maximum using the greater-than ('>') operator, also returning the index of that item - * - * \par - * - The output value type of \p d_out is cub::KeyValuePair (assuming the value type of \p d_in is \p T) - * - The maximum is written to d_out.value and its offset in the input array is written to d_out.key. - * - The {1, std::numeric_limits::lowest()} tuple is produced for zero-length inputs - * - Does not support \p > operators that are non-commutative. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the argmax-reduction of a device vector of \p int data elements. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * KeyValuePair *d_out; // e.g., [{-,-}] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run argmax-reduction - * cub::DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items); - * - * // d_out <-- [{6, 9}] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items (of some type \p T) \iterator - * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate (having value type cub::KeyValuePair) \iterator - */ - template < - typename InputIteratorT, - typename OutputIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t ArgMax( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - // The input type - typedef typename std::iterator_traits::value_type InputValueT; - - // The output tuple type - typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? - KeyValuePair, // ... then the key value pair OffsetT + InputValueT - typename std::iterator_traits::value_type>::Type OutputTupleT; // ... else the output iterator's value type - - // The output value type - typedef typename OutputTupleT::Value OutputValueT; - - // Wrapped input iterator to produce index-value tuples - typedef ArgIndexInputIterator ArgIndexInputIteratorT; - ArgIndexInputIteratorT d_indexed_in(d_in); - - // Initial value - OutputTupleT initial_value(1, Traits::Lowest()); // replace with std::numeric_limits::lowest() when C++11 support is more prevalent - - return DispatchReduce::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_indexed_in, - d_out, - num_items, - cub::ArgMax(), - initial_value, - stream, - debug_synchronous); - } - - - /** - * \brief Reduces segments of values, where segments are demarcated by corresponding runs of identical keys. - * - * \par - * This operation computes segmented reductions within \p d_values_in using - * the specified binary \p reduction_op functor. The segments are identified by - * "runs" of corresponding keys in \p d_keys_in, where runs are maximal ranges of - * consecutive, identical keys. For the ith run encountered, - * the first key of the run and the corresponding value aggregate of that run are - * written to d_unique_out[i] and d_aggregates_out[i], - * respectively. The total number of runs encountered is written to \p d_num_runs_out. 
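The run-segmentation rule can be pinned down with a short sequential reference. Illustrative only; reduce_by_key below is a hypothetical helper written for this sketch, not a CUB API:

    #include <vector>

    template <typename K, typename V, typename Op>
    int reduce_by_key(const std::vector<K> &keys, const std::vector<V> &vals,
                      std::vector<K> &unique_out, std::vector<V> &aggregates_out, Op op)
    {
        size_t i = 0;
        while (i < keys.size()) {
            K k   = keys[i];
            V agg = vals[i++];
            while (i < keys.size() && keys[i] == k)   // extend the maximal run
                agg = op(agg, vals[i++]);
            unique_out.push_back(k);          // first key of the run
            aggregates_out.push_back(agg);    // value aggregate of the run
        }
        return (int) unique_out.size();       // total number of runs
    }

With a min functor and the arrays from the snippet below, this yields unique keys [0, 2, 9, 5, 8], aggregates [0, 1, 6, 2, 4], and 5 runs.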
- * - * \par - * - The == equality operator is used to determine whether keys are equivalent - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. - * - \devicestorage - * - * \par Performance - * The following chart illustrates reduction-by-key (sum) performance across - * different CUDA architectures for \p fp32 and \p fp64 values, respectively. Segments - * are identified by \p int32 keys, and have lengths uniformly sampled from [1,1000]. - * - * \image html reduce_by_key_fp32_len_500.png - * \image html reduce_by_key_fp64_len_500.png - * - * \par - * The following charts are similar, but with segment lengths uniformly sampled from [1,10]: - * - * \image html reduce_by_key_fp32_len_5.png - * \image html reduce_by_key_fp64_len_5.png - * - * \par Snippet - * The code snippet below illustrates the segmented reduction of \p int values grouped - * by runs of associated \p int keys. - * \par - * \code - * #include // or equivalently - * - * // CustomMin functor - * struct CustomMin - * { - * template - * CUB_RUNTIME_FUNCTION __forceinline__ - * T operator()(const T &a, const T &b) const { - * return (b < a) ? b : a; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 8 - * int *d_keys_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] - * int *d_values_in; // e.g., [0, 7, 1, 6, 2, 5, 3, 4] - * int *d_unique_out; // e.g., [-, -, -, -, -, -, -, -] - * int *d_aggregates_out; // e.g., [-, -, -, -, -, -, -, -] - * int *d_num_runs_out; // e.g., [-] - * CustomMin reduction_op; - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, reduction_op, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run reduce-by-key - * cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, reduction_op, num_items); - * - * // d_unique_out <-- [0, 2, 9, 5, 8] - * // d_aggregates_out <-- [0, 1, 6, 2, 4] - * // d_num_runs_out <-- [5] - * - * \endcode - * - * \tparam KeysInputIteratorT [inferred] Random-access input iterator type for reading input keys \iterator - * \tparam UniqueOutputIteratorT [inferred] Random-access output iterator type for writing unique output keys \iterator - * \tparam ValuesInputIteratorT [inferred] Random-access input iterator type for reading input values \iterator - * \tparam AggregatesOutputIterator [inferred] Random-access output iterator type for writing output value aggregates \iterator - * \tparam NumRunsOutputIteratorT [inferred] Output iterator type for recording the number of runs encountered \iterator - * \tparam ReductionOpT [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) - */ - template < - typename KeysInputIteratorT, - typename UniqueOutputIteratorT, - typename ValuesInputIteratorT, - typename AggregatesOutputIteratorT, - typename NumRunsOutputIteratorT, - typename ReductionOpT> - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t ReduceByKey( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - KeysInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys - UniqueOutputIteratorT d_unique_out, ///< [out] Pointer to the output sequence of unique keys (one key per run) - ValuesInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of corresponding values - AggregatesOutputIteratorT d_aggregates_out, ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run) - NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out) - ReductionOpT reduction_op, ///< [in] Binary reduction functor - int num_items, ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_in_keys and \p d_in_values) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
- { - // Signed integer type for global offsets - typedef int OffsetT; - - // FlagT iterator type (not used) - - // Selection op (not used) - - // Default == operator - typedef Equality EqualityOp; - - return DispatchReduceByKey::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_unique_out, - d_values_in, - d_aggregates_out, - d_num_runs_out, - EqualityOp(), - reduction_op, - num_items, - stream, - debug_synchronous); - } - -}; - -/** - * \example example_device_reduce.cu - */ - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/SRC/cub/device/device_run_length_encode.cuh b/SRC/cub/device/device_run_length_encode.cuh deleted file mode 100644 index 7a2e82d9..00000000 --- a/SRC/cub/device/device_run_length_encode.cuh +++ /dev/null @@ -1,278 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::DeviceRunLengthEncode provides device-wide, parallel operations for computing a run-length encoding across a sequence of data items residing within device-accessible memory. - */ - -#pragma once - -#include -#include - -#include "dispatch/dispatch_rle.cuh" -#include "dispatch/dispatch_reduce_by_key.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \brief DeviceRunLengthEncode provides device-wide, parallel operations for demarcating "runs" of same-valued items within a sequence residing within device-accessible memory. 
![](run_length_encode_logo.png) - * \ingroup SingleModule - * - * \par Overview - * A run-length encoding - * computes a simple compressed representation of a sequence of input elements such that each - * maximal "run" of consecutive same-valued data items is encoded as a single data value along with a - * count of the elements in that run. - * - * \par Usage Considerations - * \cdp_class{DeviceRunLengthEncode} - * - * \par Performance - * \linear_performance{run-length encode} - * - * \par - * The following chart illustrates DeviceRunLengthEncode::RunLengthEncode performance across - * different CUDA architectures for \p int32 items. - * Segments have lengths uniformly sampled from [1,1000]. - * - * \image html rle_int32_len_500.png - * - * \par - * \plots_below - * - */ -struct DeviceRunLengthEncode -{ - - /** - * \brief Computes a run-length encoding of the sequence \p d_in. - * - * \par - * - For the ith run encountered, the first key of the run and its length are written to - * d_unique_out[i] and d_counts_out[i], - * respectively. - * - The total number of runs encountered is written to \p d_num_runs_out. - * - The == equality operator is used to determine whether values are equivalent - * - \devicestorage - * - * \par Performance - * The following charts illustrate saturated encode performance across different - * CUDA architectures for \p int32 and \p int64 items, respectively. Segments have - * lengths uniformly sampled from [1,1000]. - * - * \image html rle_int32_len_500.png - * \image html rle_int64_len_500.png - * - * \par - * The following charts are similar, but with segment lengths uniformly sampled from [1,10]: - * - * \image html rle_int32_len_5.png - * \image html rle_int64_len_5.png - * - * \par Snippet - * The code snippet below illustrates the run-length encoding of a sequence of \p int values. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 8 - * int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] - * int *d_unique_out; // e.g., [ , , , , , , , ] - * int *d_counts_out; // e.g., [ , , , , , , , ] - * int *d_num_runs_out; // e.g., [ ] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run encoding - * cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items); - * - * // d_unique_out <-- [0, 2, 9, 5, 8] - * // d_counts_out <-- [1, 2, 1, 3, 1] - * // d_num_runs_out <-- [5] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator - * \tparam UniqueOutputIteratorT [inferred] Random-access output iterator type for writing unique output items \iterator - * \tparam LengthsOutputIteratorT [inferred] Random-access output iterator type for writing output counts \iterator - * \tparam NumRunsOutputIteratorT [inferred] Output iterator type for recording the number of runs encountered \iterator - */ - template < - typename InputIteratorT, - typename UniqueOutputIteratorT, - typename LengthsOutputIteratorT, - typename NumRunsOutputIteratorT> - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Encode( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of keys - UniqueOutputIteratorT d_unique_out, ///< [out] Pointer to the output sequence of unique keys (one key per run) - LengthsOutputIteratorT d_counts_out, ///< [out] Pointer to the output sequence of run-lengths (one count per run) - NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs - int num_items, ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_in_keys and \p d_in_values) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - typedef int OffsetT; // Signed integer type for global offsets - typedef NullType* FlagIterator; // FlagT iterator type (not used) - typedef NullType SelectOp; // Selection op (not used) - typedef Equality EqualityOp; // Default == operator - typedef cub::Sum ReductionOp; // Value reduction operator - - // The lengths output value type - typedef typename If<(Equals::value_type, void>::VALUE), // LengthT = (if output iterator's value type is void) ? - OffsetT, // ... then the OffsetT type, - typename std::iterator_traits::value_type>::Type LengthT; // ... 
else the output iterator's value type - - // Generator type for providing 1s values for run-length reduction - typedef ConstantInputIterator LengthsInputIteratorT; - - return DispatchReduceByKey::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_unique_out, - LengthsInputIteratorT((LengthT) 1), - d_counts_out, - d_num_runs_out, - EqualityOp(), - ReductionOp(), - num_items, - stream, - debug_synchronous); - } - - - /** - * \brief Enumerates the starting offsets and lengths of all non-trivial runs (of length > 1) of same-valued keys in the sequence \p d_in. - * - * \par - * - For the ith non-trivial run, the run's starting offset - * and its length are written to d_offsets_out[i] and - * d_lengths_out[i], respectively. - * - The total number of runs encountered is written to \p d_num_runs_out. - * - The == equality operator is used to determine whether values are equivalent - * - \devicestorage - * - * \par Performance - * - * \par Snippet - * The code snippet below illustrates the identification of non-trivial runs within a sequence of \p int values. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 8 - * int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] - * int *d_offsets_out; // e.g., [ , , , , , , , ] - * int *d_lengths_out; // e.g., [ , , , , , , , ] - * int *d_num_runs_out; // e.g., [ ] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceRunLengthEncode::NonTrivialRuns(d_temp_storage, temp_storage_bytes, d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run encoding - * cub::DeviceRunLengthEncode::NonTrivialRuns(d_temp_storage, temp_storage_bytes, d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items); - * - * // d_offsets_out <-- [1, 4] - * // d_lengths_out <-- [2, 3] - * // d_num_runs_out <-- [2] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator - * \tparam OffsetsOutputIteratorT [inferred] Random-access output iterator type for writing run-offset values \iterator - * \tparam LengthsOutputIteratorT [inferred] Random-access output iterator type for writing run-length values \iterator - * \tparam NumRunsOutputIteratorT [inferred] Output iterator type for recording the number of runs encountered \iterator - */ - template < - typename InputIteratorT, - typename OffsetsOutputIteratorT, - typename LengthsOutputIteratorT, - typename NumRunsOutputIteratorT> - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t NonTrivialRuns( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to input sequence of data items - OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to output sequence of run-offsets (one offset per non-trivial run) - LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to output sequence of run-lengths (one count per non-trivial run) - NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs (i.e., length of \p d_offsets_out) - int num_items, ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_in_keys and \p d_in_values) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - typedef int OffsetT; // Signed integer type for global offsets - typedef Equality EqualityOp; // Default == operator - - return DeviceRleDispatch::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_offsets_out, - d_lengths_out, - d_num_runs_out, - EqualityOp(), - num_items, - stream, - debug_synchronous); - } - - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/SRC/cub/device/device_scan.cuh b/SRC/cub/device/device_scan.cuh deleted file mode 100644 index e86fefe3..00000000 --- a/SRC/cub/device/device_scan.cuh +++ /dev/null @@ -1,443 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within device-accessible memory. 
- */ - -#pragma once - -#include -#include - -#include "dispatch/dispatch_scan.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \brief DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within device-accessible memory. ![](device_scan.png) - * \ingroup SingleModule - * - * \par Overview - * Given a sequence of input elements and a binary reduction operator, a [prefix scan](http://en.wikipedia.org/wiki/Prefix_sum) - * produces an output sequence where each element is computed to be the reduction - * of the elements occurring earlier in the input sequence. Prefix sum - * connotes a prefix scan with the addition operator. The term \em inclusive indicates - * that the ith output reduction incorporates the ith input. - * The term \em exclusive indicates the ith input is not incorporated into - * the ith output reduction. - * - * \par - * As of CUB 1.0.1 (2013), CUB's device-wide scan APIs have implemented our "decoupled look-back" algorithm - * for performing global prefix scan with only a single pass through the - * input data, as described in our 2016 technical report [1]. The central - * idea is to leverage a small, constant factor of redundant work in order to overlap the latencies - * of global prefix propagation with local computation. As such, our algorithm requires only - * ~2n data movement (n inputs are read, n outputs are written), and typically - * proceeds at "memcpy" speeds. - * - * \par - * [1] [Duane Merrill and Michael Garland. "Single-pass Parallel Prefix Scan with Decoupled Look-back", NVIDIA Technical Report NVR-2016-002, 2016.](https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back) - * - * \par Usage Considerations - * \cdp_class{DeviceScan} - * - * \par Performance - * \linear_performance{prefix scan} - * - * \par - * The following chart illustrates DeviceScan::ExclusiveSum - * performance across different CUDA architectures for \p int32 keys. - * \plots_below - * - * \image html scan_int32.png - * - */ -struct DeviceScan -{ - /******************************************************************//** - * \name Exclusive scans - *********************************************************************/ - //@{ - - /** - * \brief Computes a device-wide exclusive prefix sum. The value of 0 is applied as the initial value, and is assigned to *d_out. - * - * \par - * - Supports non-commutative sum operators. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. - * - \devicestorage - * - * \par Performance - * The following charts illustrate saturated exclusive sum performance across different - * CUDA architectures for \p int32 and \p int64 items, respectively. - * - * \image html scan_int32.png - * \image html scan_int64.png - * - * \par Snippet - * The code snippet below illustrates the exclusive prefix sum of an \p int device vector. 
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_scan.cuh>
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input and output
-     * int  num_items;      // e.g., 7
-     * int  *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_out;         // e.g., [ , , , , , , ]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run exclusive prefix sum
-     * cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
-     *
-     * // d_out <-- [0, 8, 14, 21, 26, 29, 29]
-     *
-     * \endcode
-     *
-     * \tparam InputIteratorT     [inferred] Random-access input iterator type for reading scan inputs \iterator
-     * \tparam OutputIteratorT    [inferred] Random-access output iterator type for writing scan outputs \iterator
-     */
-    template <
-        typename        InputIteratorT,
-        typename        OutputIteratorT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t ExclusiveSum(
-        void            *d_temp_storage,            ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t          &temp_storage_bytes,        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT  d_in,                       ///< [in] Pointer to the input sequence of data items
-        OutputIteratorT d_out,                      ///< [out] Pointer to the output sequence of data items
-        int             num_items,                  ///< [in] Total number of input items (i.e., the length of \p d_in)
-        cudaStream_t    stream = 0,                 ///< [in] [optional] CUDA stream to launch kernels within.  Default is stream 0.
-        bool            debug_synchronous = false)  ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        // The output value type
-        typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT = (if output iterator's value type is void) ?
-            typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
-            typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type
-
-        // Initial value
-        OutputT init_value = 0;
-
-        return DispatchScan<InputIteratorT, OutputIteratorT, Sum, OutputT, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            d_out,
-            Sum(),
-            init_value,
-            num_items,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Computes a device-wide exclusive prefix scan using the specified binary \p scan_op functor.  The \p init_value value is applied as the initial value, and is assigned to *d_out.
-     *
-     * \par
-     * - Supports non-commutative scan operators.
-     * - Provides "run-to-run" determinism for pseudo-associative reduction
-     *   (e.g., addition of floating point types) on the same GPU device.
-     *   However, results for pseudo-associative reduction may be inconsistent
-     *   from one device to another device of a different compute-capability
-     *   because CUB can employ different tile-sizing for different architectures.
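     * (Aside: reference semantics for the exclusive variants, as a sequential
     * model rather than CUB's parallel implementation; the helper name is an
     * illustrative assumption.)
     *
     *     template <typename T, typename ScanOp>
     *     void exclusive_scan_reference(const T *in, T *out, int n, ScanOp op, T init)
     *     {
     *         T running = init;
     *         for (int i = 0; i < n; ++i)
     *         {
     *             out[i] = running;             // out[i] excludes in[i]
     *             running = op(running, in[i]); // fold in the current item
     *         }
     *     }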
-     * - \devicestorage
-     *
-     * \par Snippet
-     * The code snippet below illustrates the exclusive prefix min-scan of an \p int device vector.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_scan.cuh>
-     *
-     * // CustomMin functor
-     * struct CustomMin
-     * {
-     *     template <typename T>
-     *     CUB_RUNTIME_FUNCTION __forceinline__
-     *     T operator()(const T &a, const T &b) const {
-     *         return (b < a) ? b : a;
-     *     }
-     * };
-     *
-     * // Declare, allocate, and initialize device-accessible pointers for input and output
-     * int          num_items;      // e.g., 7
-     * int          *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int          *d_out;         // e.g., [ , , , , , , ]
-     * CustomMin    min_op;
-     * ...
-     *
-     * // Determine temporary device storage requirements for exclusive prefix scan
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, INT_MAX, num_items);
-     *
-     * // Allocate temporary storage for exclusive prefix scan
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run exclusive prefix min-scan
-     * cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, INT_MAX, num_items);
-     *
-     * // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0]
-     *
-     * \endcode
-     *
-     * \tparam InputIteratorT   [inferred] Random-access input iterator type for reading scan inputs \iterator
-     * \tparam OutputIteratorT  [inferred] Random-access output iterator type for writing scan outputs \iterator
-     * \tparam ScanOpT          [inferred] Binary scan functor type having member T operator()(const T &a, const T &b)
-     * \tparam InitValueT       [inferred] Type of the \p init_value value used to seed the exclusive scan
-     */
-    template <
-        typename        InputIteratorT,
-        typename        OutputIteratorT,
-        typename        ScanOpT,
-        typename        InitValueT>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t ExclusiveScan(
-        void            *d_temp_storage,            ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t          &temp_storage_bytes,        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT  d_in,                       ///< [in] Pointer to the input sequence of data items
-        OutputIteratorT d_out,                      ///< [out] Pointer to the output sequence of data items
-        ScanOpT         scan_op,                    ///< [in] Binary scan functor
-        InitValueT      init_value,                 ///< [in] Initial value to seed the exclusive scan (and is assigned to *d_out)
-        int             num_items,                  ///< [in] Total number of input items (i.e., the length of \p d_in)
-        cudaStream_t    stream = 0,                 ///< [in] [optional] CUDA stream to launch kernels within.  Default is stream 0.
-        bool            debug_synchronous = false)  ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, InitValueT, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            d_out,
-            scan_op,
-            init_value,
-            num_items,
-            stream,
-            debug_synchronous);
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Inclusive scans
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes a device-wide inclusive prefix sum.
-     *
-     * \par
-     * - Supports non-commutative sum operators.
- * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the inclusive prefix sum of an \p int device vector. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [ , , , , , , ] - * ... - * - * // Determine temporary device storage requirements for inclusive prefix sum - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); - * - * // Allocate temporary storage for inclusive prefix sum - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run inclusive prefix sum - * cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); - * - * // d_out <-- [8, 14, 21, 26, 29, 29, 38] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading scan inputs \iterator - * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing scan outputs \iterator - */ - template < - typename InputIteratorT, - typename OutputIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t InclusiveSum( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items - int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - return DispatchScan::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - Sum(), - NullType(), - num_items, - stream, - debug_synchronous); - } - - - /** - * \brief Computes a device-wide inclusive prefix scan using the specified binary \p scan_op functor. - * - * \par - * - Supports non-commutative scan operators. - * - Provides "run-to-run" determinism for pseudo-associative reduction - * (e.g., addition of floating point types) on the same GPU device. - * However, results for pseudo-associative reduction may be inconsistent - * from one device to a another device of a different compute-capability - * because CUB can employ different tile-sizing for different architectures. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the inclusive prefix min-scan of an \p int device vector. 
- * \par - * \code - * #include // or equivalently - * - * // CustomMin functor - * struct CustomMin - * { - * template - * CUB_RUNTIME_FUNCTION __forceinline__ - * T operator()(const T &a, const T &b) const { - * return (b < a) ? b : a; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 7 - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [ , , , , , , ] - * CustomMin min_op; - * ... - * - * // Determine temporary device storage requirements for inclusive prefix scan - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, num_items); - * - * // Allocate temporary storage for inclusive prefix scan - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run inclusive prefix min-scan - * cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, num_items); - * - * // d_out <-- [8, 6, 6, 5, 3, 0, 0] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading scan inputs \iterator - * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing scan outputs \iterator - * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) - */ - template < - typename InputIteratorT, - typename OutputIteratorT, - typename ScanOpT> - CUB_RUNTIME_FUNCTION - static cudaError_t InclusiveScan( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items - ScanOpT scan_op, ///< [in] Binary scan functor - int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - return DispatchScan::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - scan_op, - NullType(), - num_items, - stream, - debug_synchronous); - } - - //@} end member group - -}; - -/** - * \example example_device_scan.cu - */ - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/SRC/cub/device/device_segmented_radix_sort.cuh b/SRC/cub/device/device_segmented_radix_sort.cuh deleted file mode 100644 index 0d360762..00000000 --- a/SRC/cub/device/device_segmented_radix_sort.cuh +++ /dev/null @@ -1,876 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::DeviceSegmentedRadixSort provides device-wide, parallel operations for computing a batched radix sort across multiple, non-overlapping sequences of data items residing within device-accessible memory. - */ - -#pragma once - -#include -#include - -#include "dispatch/dispatch_radix_sort.cuh" -#include "../util_arch.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \brief DeviceSegmentedRadixSort provides device-wide, parallel operations for computing a batched radix sort across multiple, non-overlapping sequences of data items residing within device-accessible memory. ![](segmented_sorting_logo.png) - * \ingroup SegmentedModule - * - * \par Overview - * The [radix sorting method](http://en.wikipedia.org/wiki/Radix_sort) arranges - * items into ascending (or descending) order. The algorithm relies upon a positional representation for - * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits, - * characters, etc.) specified from least-significant to most-significant. For a - * given input sequence of keys and a set of rules specifying a total ordering - * of the symbolic alphabet, the radix sorting method produces a lexicographic - * ordering of those keys. - * - * \par - * DeviceSegmentedRadixSort can sort all of the built-in C++ numeric primitive types - * (unsigned char, \p int, \p double, etc.) as well as CUDA's \p __half - * half-precision floating-point type. Although the direct radix sorting - * method can only be applied to unsigned integral types, DeviceSegmentedRadixSort - * is able to sort signed and floating-point types via simple bit-wise transformations - * that ensure lexicographic key ordering. 
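 * (Aside: the "simple bit-wise transformations" mentioned above, sketched for
 * 32-bit keys. The function names are illustrative assumptions; CUB's own
 * key-traits machinery implements the real equivalents.)
 *
 *     #include <cstdint>
 *     #include <cstring>
 *
 *     // Two's-complement int: flipping the sign bit makes signed order
 *     // match unsigned (radix) order.
 *     static inline uint32_t twiddle_in_int(int32_t key)
 *     {
 *         return static_cast<uint32_t>(key) ^ 0x80000000u;
 *     }
 *
 *     // IEEE-754 float: flip all bits of negatives (reversing their order),
 *     // set the sign bit of non-negatives; the result then sorts correctly
 *     // when compared as an unsigned integer.
 *     static inline uint32_t twiddle_in_float(float key)
 *     {
 *         uint32_t bits;
 *         std::memcpy(&bits, &key, sizeof(bits));
 *         return (bits & 0x80000000u) ? ~bits : (bits | 0x80000000u);
 *     }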
- * - * \par Usage Considerations - * \cdp_class{DeviceSegmentedRadixSort} - * - */ -struct DeviceSegmentedRadixSort -{ - - /******************************************************************//** - * \name Key-value pairs - *********************************************************************/ - //@{ - - /** - * \brief Sorts segments of key-value pairs into ascending order. (~2N auxiliary storage required) - * - * \par - * - The contents of the input data are not altered by the sorting operation - * - When input a contiguous sequence of segments, a single sequence - * \p segment_offsets (of length num_segments+1) can be aliased - * for both the \p d_begin_offsets and \p d_end_offsets parameters (where - * the latter is specified as segment_offsets+1). - * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. - * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys - * with associated vector of \p int values. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] - * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] - * int *d_values_out; // e.g., [-, -, -, -, -, -, -] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, - * d_keys_in, d_keys_out, d_values_in, d_values_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, - * d_keys_in, d_keys_out, d_values_in, d_values_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] - * // d_values_out <-- [1, 2, 0, 5, 4, 3, 6] - * - * \endcode - * - * \tparam KeyT [inferred] Key type - * \tparam ValueT [inferred] Value type - * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator - */ - template < - typename KeyT, - typename ValueT, - typename OffsetIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t SortPairs( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
-        size_t          &temp_storage_bytes,        ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        const KeyT      *d_keys_in,                 ///< [in] %Device-accessible pointer to the input data of key data to sort
-        KeyT            *d_keys_out,                ///< [out] %Device-accessible pointer to the sorted output sequence of key data
-        const ValueT    *d_values_in,               ///< [in] %Device-accessible pointer to the corresponding input sequence of associated value items
-        ValueT          *d_values_out,              ///< [out] %Device-accessible pointer to the correspondingly-reordered output sequence of associated value items
-        int             num_items,                  ///< [in] The total number of items to sort (across all segments)
-        int             num_segments,               ///< [in] The number of segments that comprise the sorting data
-        OffsetIteratorT d_begin_offsets,            ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_*
-        OffsetIteratorT d_end_offsets,              ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*.  If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty.
-        int             begin_bit = 0,              ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison
-        int             end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8)
-        cudaStream_t    stream = 0,                 ///< [in] [optional] CUDA stream to launch kernels within.  Default is stream 0.
-        bool            debug_synchronous = false)  ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int OffsetT;
-
-        DoubleBuffer<KeyT>   d_keys(const_cast<KeyT*>(d_keys_in), d_keys_out);
-        DoubleBuffer<ValueT> d_values(const_cast<ValueT*>(d_values_in), d_values_out);
-
-        return DispatchSegmentedRadixSort<false, KeyT, ValueT, OffsetIteratorT, OffsetT>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            num_segments,
-            d_begin_offsets,
-            d_end_offsets,
-            begin_bit,
-            end_bit,
-            false,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Sorts segments of key-value pairs into ascending order.  (~N auxiliary storage required)
-     *
-     * \par
-     * - The sorting operation is given a pair of key buffers and a corresponding
-     *   pair of associated value buffers.  Each pair is managed by a DoubleBuffer
-     *   structure that indicates which of the two buffers is "current" (and thus
-     *   contains the input data to be sorted).
-     * - The contents of both buffers within each pair may be altered by the sorting
-     *   operation.
-     * - Upon completion, the sorting operation will update the "current" indicator
-     *   within each DoubleBuffer wrapper to reference which of the two buffers
-     *   now contains the sorted output sequence (a function of the number of key bits
-     *   specified and the targeted device architecture).
-     * - When input a contiguous sequence of segments, a single sequence
-     *   \p segment_offsets (of length num_segments+1) can be aliased
-     *   for both the \p d_begin_offsets and \p d_end_offsets parameters (where
-     *   the latter is specified as segment_offsets+1).
-     * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified.  This can reduce overall sorting overhead and yield a corresponding performance improvement.
- * - \devicestorageP - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys - * with associated vector of \p int values. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] - * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] - * int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] - * ... - * - * // Create a set of DoubleBuffers to wrap pairs of device pointers - * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); - * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] - * // d_values.Current() <-- [5, 4, 3, 1, 2, 0, 6] - * - * \endcode - * - * \tparam KeyT [inferred] Key type - * \tparam ValueT [inferred] Value type - * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator - */ - template < - typename KeyT, - typename ValueT, - typename OffsetIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t SortPairs( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys - DoubleBuffer &d_values, ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values - int num_items, ///< [in] The total number of items to sort (across all segments) - int num_segments, ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. 
- int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison - int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - return DispatchSegmentedRadixSort::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - true, - stream, - debug_synchronous); - } - - - /** - * \brief Sorts segments of key-value pairs into descending order. (~2N auxiliary storage required). - * - * \par - * - The contents of the input data are not altered by the sorting operation - * - When input a contiguous sequence of segments, a single sequence - * \p segment_offsets (of length num_segments+1) can be aliased - * for both the \p d_begin_offsets and \p d_end_offsets parameters (where - * the latter is specified as segment_offsets+1). - * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. - * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys - * with associated vector of \p int values. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] - * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] - * int *d_values_out; // e.g., [-, -, -, -, -, -, -] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, - * d_keys_in, d_keys_out, d_values_in, d_values_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, - * d_keys_in, d_keys_out, d_values_in, d_values_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] - * // d_values_out <-- [0, 2, 1, 6, 3, 4, 5] - * - * \endcode - * - * \tparam KeyT [inferred] Key type - * \tparam ValueT [inferred] Value type - * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator - */ - template < - typename KeyT, - typename ValueT, - typename OffsetIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t SortPairsDescending( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - const KeyT *d_keys_in, ///< [in] %Device-accessible pointer to the input data of key data to sort - KeyT *d_keys_out, ///< [out] %Device-accessible pointer to the sorted output sequence of key data - const ValueT *d_values_in, ///< [in] %Device-accessible pointer to the corresponding input sequence of associated value items - ValueT *d_values_out, ///< [out] %Device-accessible pointer to the correspondingly-reordered output sequence of associated value items - int num_items, ///< [in] The total number of items to sort (across all segments) - int num_segments, ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison - int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
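    // A hedged reading of the call in the body below: the trailing boolean
    // passed to Dispatch ahead of `stream` marks whether the double buffers
    // may be overwritten. This ~2N overload passes `false`, so the inputs
    // d_keys_in / d_values_in are preserved and results land in d_keys_out /
    // d_values_out; descending order itself is selected in CUB's dispatch
    // layer by template configuration, not by a runtime argument.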
- { - // Signed integer type for global offsets - typedef int OffsetT; - - DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); - DoubleBuffer d_values(const_cast(d_values_in), d_values_out); - - return DispatchSegmentedRadixSort::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - false, - stream, - debug_synchronous); - } - - - /** - * \brief Sorts segments of key-value pairs into descending order. (~N auxiliary storage required). - * - * \par - * - The sorting operation is given a pair of key buffers and a corresponding - * pair of associated value buffers. Each pair is managed by a DoubleBuffer - * structure that indicates which of the two buffers is "current" (and thus - * contains the input data to be sorted). - * - The contents of both buffers within each pair may be altered by the sorting - * operation. - * - Upon completion, the sorting operation will update the "current" indicator - * within each DoubleBuffer wrapper to reference which of the two buffers - * now contains the sorted output sequence (a function of the number of key bits - * specified and the targeted device architecture). - * - When input a contiguous sequence of segments, a single sequence - * \p segment_offsets (of length num_segments+1) can be aliased - * for both the \p d_begin_offsets and \p d_end_offsets parameters (where - * the latter is specified as segment_offsets+1). - * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. - * - \devicestorageP - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys - * with associated vector of \p int values. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] - * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] - * int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] - * ... 
- * - * // Create a set of DoubleBuffers to wrap pairs of device pointers - * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); - * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] - * // d_values.Current() <-- [0, 2, 1, 6, 3, 4, 5] - * - * \endcode - * - * \tparam KeyT [inferred] Key type - * \tparam ValueT [inferred] Value type - * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator - */ - template < - typename KeyT, - typename ValueT, - typename OffsetIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t SortPairsDescending( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys - DoubleBuffer &d_values, ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values - int num_items, ///< [in] The total number of items to sort (across all segments) - int num_segments, ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison - int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
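    // As with the ascending DoubleBuffer overload, the body below passes
    // `true` for the overwrite flag: both buffers inside d_keys and d_values
    // are usable as scratch, which is what reduces the auxiliary footprint
    // from ~2N to ~N. Results must be read through d_keys.Current() and
    // d_values.Current() afterwards.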
- { - // Signed integer type for global offsets - typedef int OffsetT; - - return DispatchSegmentedRadixSort::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - true, - stream, - debug_synchronous); - } - - - //@} end member group - /******************************************************************//** - * \name Keys-only - *********************************************************************/ - //@{ - - - /** - * \brief Sorts segments of keys into ascending order. (~2N auxiliary storage required) - * - * \par - * - The contents of the input data are not altered by the sorting operation - * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. - * - When input a contiguous sequence of segments, a single sequence - * \p segment_offsets (of length num_segments+1) can be aliased - * for both the \p d_begin_offsets and \p d_end_offsets parameters (where - * the latter is specified as segment_offsets+1). - * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] - * - * \endcode - * - * \tparam KeyT [inferred] Key type - * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator - */ - template < - typename KeyT, - typename OffsetIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t SortKeys( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - const KeyT *d_keys_in, ///< [in] %Device-accessible pointer to the input data of key data to sort - KeyT *d_keys_out, ///< [out] %Device-accessible pointer to the sorted output sequence of key data - int num_items, ///< [in] The total number of items to sort (across all segments) - int num_segments, ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison - int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - // Null value type - DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); - DoubleBuffer d_values; - - return DispatchSegmentedRadixSort::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - false, - stream, - debug_synchronous); - } - - - /** - * \brief Sorts segments of keys into ascending order. (~N auxiliary storage required). - * - * \par - * - The sorting operation is given a pair of key buffers managed by a - * DoubleBuffer structure that indicates which of the two buffers is - * "current" (and thus contains the input data to be sorted). - * - The contents of both buffers may be altered by the sorting operation. - * - Upon completion, the sorting operation will update the "current" indicator - * within the DoubleBuffer wrapper to reference which of the two buffers - * now contains the sorted output sequence (a function of the number of key bits - * specified and the targeted device architecture). - * - When input a contiguous sequence of segments, a single sequence - * \p segment_offsets (of length num_segments+1) can be aliased - * for both the \p d_begin_offsets and \p d_end_offsets parameters (where - * the latter is specified as segment_offsets+1). - * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. - * - \devicestorageP - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys. 
- * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] - * ... - * - * // Create a DoubleBuffer to wrap the pair of device pointers - * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] - * - * \endcode - * - * \tparam KeyT [inferred] Key type - * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator - */ - template < - typename KeyT, - typename OffsetIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t SortKeys( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys - int num_items, ///< [in] The total number of items to sort (across all segments) - int num_segments, ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison - int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
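    // A short reminder of the DoubleBuffer contract assumed here: the wrapper
    // holds two pointers plus a selector. Before the call, Current() must
    // return the buffer holding the unsorted keys; after the call, the
    // selector may have flipped, so for example:
    //
    //     cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
    //     // ... sort ...
    //     int *d_sorted  = d_keys.Current();    // always read through Current()
    //     int *d_scratch = d_keys.Alternate();  // reusable scratch buffer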
- { - // Signed integer type for global offsets - typedef int OffsetT; - - // Null value type - DoubleBuffer d_values; - - return DispatchSegmentedRadixSort::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - true, - stream, - debug_synchronous); - } - - /** - * \brief Sorts segments of keys into descending order. (~2N auxiliary storage required). - * - * \par - * - The contents of the input data are not altered by the sorting operation - * - When input a contiguous sequence of segments, a single sequence - * \p segment_offsets (of length num_segments+1) can be aliased - * for both the \p d_begin_offsets and \p d_end_offsets parameters (where - * the latter is specified as segment_offsets+1). - * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. - * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] - * ... - * - * // Create a DoubleBuffer to wrap the pair of device pointers - * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] - * - * \endcode - * - * \tparam KeyT [inferred] Key type - * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator - */ - template < - typename KeyT, - typename OffsetIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t SortKeysDescending( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - const KeyT *d_keys_in, ///< [in] %Device-accessible pointer to the input data of key data to sort - KeyT *d_keys_out, ///< [out] %Device-accessible pointer to the sorted output sequence of key data - int num_items, ///< [in] The total number of items to sort (across all segments) - int num_segments, ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison - int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); - DoubleBuffer d_values; - - return DispatchSegmentedRadixSort::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - false, - stream, - debug_synchronous); - } - - - /** - * \brief Sorts segments of keys into descending order. (~N auxiliary storage required). - * - * \par - * - The sorting operation is given a pair of key buffers managed by a - * DoubleBuffer structure that indicates which of the two buffers is - * "current" (and thus contains the input data to be sorted). - * - The contents of both buffers may be altered by the sorting operation. - * - Upon completion, the sorting operation will update the "current" indicator - * within the DoubleBuffer wrapper to reference which of the two buffers - * now contains the sorted output sequence (a function of the number of key bits - * specified and the targeted device architecture). - * - When input a contiguous sequence of segments, a single sequence - * \p segment_offsets (of length num_segments+1) can be aliased - * for both the \p d_begin_offsets and \p d_end_offsets parameters (where - * the latter is specified as segment_offsets+1). - * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. - * - \devicestorageP - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys. 
- * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for sorting data - * int num_items; // e.g., 7 - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] - * ... - * - * // Create a DoubleBuffer to wrap the pair of device pointers - * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sorting operation - * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, - * num_items, num_segments, d_offsets, d_offsets + 1); - * - * // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] - * - * \endcode - * - * \tparam KeyT [inferred] Key type - * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator - */ - template < - typename KeyT, - typename OffsetIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t SortKeysDescending( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys - int num_items, ///< [in] The total number of items to sort (across all segments) - int num_segments, ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison - int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
- { - // Signed integer type for global offsets - typedef int OffsetT; - - // Null value type - DoubleBuffer d_values; - - return DispatchSegmentedRadixSort::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys, - d_values, - num_items, - num_segments, - d_begin_offsets, - d_end_offsets, - begin_bit, - end_bit, - true, - stream, - debug_synchronous); - } - - - //@} end member group - - -}; - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/SRC/cub/device/device_segmented_reduce.cuh b/SRC/cub/device/device_segmented_reduce.cuh deleted file mode 100644 index 6c3b54a0..00000000 --- a/SRC/cub/device/device_segmented_reduce.cuh +++ /dev/null @@ -1,619 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::DeviceSegmentedReduce provides device-wide, parallel operations for computing a batched reduction across multiple sequences of data items residing within device-accessible memory. - */ - -#pragma once - -#include -#include - -#include "../iterator/arg_index_input_iterator.cuh" -#include "dispatch/dispatch_reduce.cuh" -#include "dispatch/dispatch_reduce_by_key.cuh" -#include "../util_type.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \brief DeviceSegmentedReduce provides device-wide, parallel operations for computing a reduction across multiple sequences of data items residing within device-accessible memory. ![](reduce_logo.png) - * \ingroup SegmentedModule - * - * \par Overview - * A reduction (or fold) - * uses a binary combining operator to compute a single aggregate from a sequence of input elements. 
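For the DoubleBuffer overload of SortKeysDescending deleted above, the two-call pattern (one call to size the temporary storage, one to sort) and the aliasing of a single offsets array for both begin and end offsets read as follows. This is a minimal host-side sketch, not part of the deleted header: it reuses the snippet's own data, assumes the CUB headers are reachable as <cub/cub.cuh>, and omits error checking.

#include <cub/cub.cuh>

int main()
{
    int h_keys[7]    = {8, 6, 7, 5, 3, 0, 9};
    int h_offsets[4] = {0, 3, 3, 7};              // segment 1 is empty

    int *d_key_buf, *d_key_alt_buf, *d_offsets;
    cudaMalloc(&d_key_buf,     7 * sizeof(int));
    cudaMalloc(&d_key_alt_buf, 7 * sizeof(int));
    cudaMalloc(&d_offsets,     4 * sizeof(int));
    cudaMemcpy(d_key_buf, h_keys,    7 * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_offsets, h_offsets, 4 * sizeof(int), cudaMemcpyHostToDevice);

    cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);

    // One offsets array of length num_segments + 1 serves as both the
    // begin offsets (d_offsets) and the end offsets (d_offsets + 1).
    void  *d_temp_storage     = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceSegmentedRadixSort::SortKeysDescending(
        d_temp_storage, temp_storage_bytes, d_keys,
        7, 3, d_offsets, d_offsets + 1);          // sizing pass only
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceSegmentedRadixSort::SortKeysDescending(
        d_temp_storage, temp_storage_bytes, d_keys,
        7, 3, d_offsets, d_offsets + 1);          // sorting pass

    // The wrapper's "current" selector now names the buffer that holds
    // the sorted output: [8, 7, 6,  9, 5, 3, 0]
    cudaMemcpy(h_keys, d_keys.Current(), 7 * sizeof(int),
               cudaMemcpyDeviceToHost);
    return 0;
}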
- * - * \par Usage Considerations - * \cdp_class{DeviceSegmentedReduce} - * - */ -struct DeviceSegmentedReduce -{ - /** - * \brief Computes a device-wide segmented reduction using the specified binary \p reduction_op functor. - * - * \par - * - Does not support binary reduction operators that are non-commutative. - * - When input a contiguous sequence of segments, a single sequence - * \p segment_offsets (of length num_segments+1) can be aliased - * for both the \p d_begin_offsets and \p d_end_offsets parameters (where - * the latter is specified as segment_offsets+1). - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates a custom min-reduction of a device vector of \p int data elements. - * \par - * \code - * #include // or equivalently - * - * // CustomMin functor - * struct CustomMin - * { - * template - * CUB_RUNTIME_FUNCTION __forceinline__ - * T operator()(const T &a, const T &b) const { - * return (b < a) ? b : a; - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [-, -, -] - * CustomMin min_op; - * int initial_value; // e.g., INT_MAX - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1, min_op, initial_value); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run reduction - * cub::DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1, min_op, initial_value); - * - * // d_out <-- [6, INT_MAX, 0] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator - * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator - * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator - * \tparam ReductionOp [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) - * \tparam T [inferred] Data element type that is convertible to the \p value type of \p InputIteratorT - */ - template < - typename InputIteratorT, - typename OutputIteratorT, - typename OffsetIteratorT, - typename ReductionOp, - typename T> - CUB_RUNTIME_FUNCTION - static cudaError_t Reduce( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_segments, ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - ReductionOp reduction_op, ///< [in] Binary reduction functor - T initial_value, ///< [in] Initial value of the reduction for each segment - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - return DispatchSegmentedReduce::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_segments, - d_begin_offsets, - d_end_offsets, - reduction_op, - initial_value, - stream, - debug_synchronous); - } - - - /** - * \brief Computes a device-wide segmented sum using the addition ('+') operator. - * - * \par - * - Uses \p 0 as the initial value of the reduction for each segment. - * - When input a contiguous sequence of segments, a single sequence - * \p segment_offsets (of length num_segments+1) can be aliased - * for both the \p d_begin_offsets and \p d_end_offsets parameters (where - * the latter is specified as segment_offsets+1). - * - Does not support \p + operators that are non-commutative.. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the sum reduction of a device vector of \p int data elements. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [-, -, -] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run sum-reduction - * cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // d_out <-- [21, 0, 17] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator - * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator - * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator - */ - template < - typename InputIteratorT, - typename OutputIteratorT, - typename OffsetIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t Sum( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_segments, ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - // The output value type - typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... then the input iterator's value type, - typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type - - return DispatchSegmentedReduce::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_segments, - d_begin_offsets, - d_end_offsets, - cub::Sum(), - OutputT(), // zero-initialize - stream, - debug_synchronous); - } - - - /** - * \brief Computes a device-wide segmented minimum using the less-than ('<') operator. - * - * \par - * - Uses std::numeric_limits::max() as the initial value of the reduction for each segment. - * - When input a contiguous sequence of segments, a single sequence - * \p segment_offsets (of length num_segments+1) can be aliased - * for both the \p d_begin_offsets and \p d_end_offsets parameters (where - * the latter is specified as segment_offsets+1). 
- * - Does not support \p < operators that are non-commutative. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the min-reduction of a device vector of \p int data elements. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [-, -, -] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run min-reduction - * cub::DeviceSegmentedReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // d_out <-- [6, INT_MAX, 0] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator - * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator - * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator - */ - template < - typename InputIteratorT, - typename OutputIteratorT, - typename OffsetIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t Min( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_segments, ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - // The input value type - typedef typename std::iterator_traits::value_type InputT; - - return DispatchSegmentedReduce::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_segments, - d_begin_offsets, - d_end_offsets, - cub::Min(), - Traits::Max(), // replace with std::numeric_limits::max() when C++11 support is more prevalent - stream, - debug_synchronous); - } - - - /** - * \brief Finds the first device-wide minimum in each segment using the less-than ('<') operator, also returning the in-segment index of that item. 
- * - * \par - * - The output value type of \p d_out is cub::KeyValuePair (assuming the value type of \p d_in is \p T) - * - The minimum of the ith segment is written to d_out[i].value and its offset in that segment is written to d_out[i].key. - * - The {1, std::numeric_limits::max()} tuple is produced for zero-length inputs - * - When input a contiguous sequence of segments, a single sequence - * \p segment_offsets (of length num_segments+1) can be aliased - * for both the \p d_begin_offsets and \p d_end_offsets parameters (where - * the latter is specified as segment_offsets+1). - * - Does not support \p < operators that are non-commutative. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the argmin-reduction of a device vector of \p int data elements. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * KeyValuePair *d_out; // e.g., [{-,-}, {-,-}, {-,-}] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run argmin-reduction - * cub::DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // d_out <-- [{1,6}, {1,INT_MAX}, {2,0}] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items (of some type \p T) \iterator - * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate (having value type KeyValuePair) \iterator - * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator - */ - template < - typename InputIteratorT, - typename OutputIteratorT, - typename OffsetIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t ArgMin( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_segments, ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. 
Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - // The input type - typedef typename std::iterator_traits::value_type InputValueT; - - // The output tuple type - typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? - KeyValuePair, // ... then the key value pair OffsetT + InputValueT - typename std::iterator_traits::value_type>::Type OutputTupleT; // ... else the output iterator's value type - - // The output value type - typedef typename OutputTupleT::Value OutputValueT; - - // Wrapped input iterator to produce index-value tuples - typedef ArgIndexInputIterator ArgIndexInputIteratorT; - ArgIndexInputIteratorT d_indexed_in(d_in); - - // Initial value - OutputTupleT initial_value(1, Traits::Max()); // replace with std::numeric_limits::max() when C++11 support is more prevalent - - return DispatchSegmentedReduce::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_indexed_in, - d_out, - num_segments, - d_begin_offsets, - d_end_offsets, - cub::ArgMin(), - initial_value, - stream, - debug_synchronous); - } - - - /** - * \brief Computes a device-wide segmented maximum using the greater-than ('>') operator. - * - * \par - * - Uses std::numeric_limits::lowest() as the initial value of the reduction. - * - When input a contiguous sequence of segments, a single sequence - * \p segment_offsets (of length num_segments+1) can be aliased - * for both the \p d_begin_offsets and \p d_end_offsets parameters (where - * the latter is specified as segment_offsets+1). - * - Does not support \p > operators that are non-commutative. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the max-reduction of a device vector of \p int data elements. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * int *d_out; // e.g., [-, -, -] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run max-reduction - * cub::DeviceSegmentedReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // d_out <-- [8, INT_MIN, 9] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator - * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator - * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator - */ - template < - typename InputIteratorT, - typename OutputIteratorT, - typename OffsetIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t Max( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_segments, ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - // The input value type - typedef typename std::iterator_traits::value_type InputT; - - return DispatchSegmentedReduce::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - num_segments, - d_begin_offsets, - d_end_offsets, - cub::Max(), - Traits::Lowest(), // replace with std::numeric_limits::lowest() when C++11 support is more prevalent - stream, - debug_synchronous); - } - - - /** - * \brief Finds the first device-wide maximum in each segment using the greater-than ('>') operator, also returning the in-segment index of that item - * - * \par - * - The output value type of \p d_out is cub::KeyValuePair (assuming the value type of \p d_in is \p T) - * - The maximum of the ith segment is written to d_out[i].value and its offset in that segment is written to d_out[i].key. - * - The {1, std::numeric_limits::lowest()} tuple is produced for zero-length inputs - * - When input a contiguous sequence of segments, a single sequence - * \p segment_offsets (of length num_segments+1) can be aliased - * for both the \p d_begin_offsets and \p d_end_offsets parameters (where - * the latter is specified as segment_offsets+1). - * - Does not support \p > operators that are non-commutative. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the argmax-reduction of a device vector of \p int data elements. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_segments; // e.g., 3 - * int *d_offsets; // e.g., [0, 3, 3, 7] - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * KeyValuePair *d_out; // e.g., [{-,-}, {-,-}, {-,-}] - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run argmax-reduction - * cub::DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, - * num_segments, d_offsets, d_offsets + 1); - * - * // d_out <-- [{0,8}, {1,INT_MIN}, {3,9}] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items (of some type \p T) \iterator - * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate (having value type KeyValuePair) \iterator - * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator - */ - template < - typename InputIteratorT, - typename OutputIteratorT, - typename OffsetIteratorT> - CUB_RUNTIME_FUNCTION - static cudaError_t ArgMax( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_segments, ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - // Signed integer type for global offsets - typedef int OffsetT; - - // The input type - typedef typename std::iterator_traits::value_type InputValueT; - - // The output tuple type - typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? - KeyValuePair, // ... then the key value pair OffsetT + InputValueT - typename std::iterator_traits::value_type>::Type OutputTupleT; // ... 
else the output iterator's value type - - // The output value type - typedef typename OutputTupleT::Value OutputValueT; - - // Wrapped input iterator to produce index-value tuples - typedef ArgIndexInputIterator ArgIndexInputIteratorT; - ArgIndexInputIteratorT d_indexed_in(d_in); - - // Initial value - OutputTupleT initial_value(1, Traits::Lowest()); // replace with std::numeric_limits::lowest() when C++11 support is more prevalent - - return DispatchSegmentedReduce::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_indexed_in, - d_out, - num_segments, - d_begin_offsets, - d_end_offsets, - cub::ArgMax(), - initial_value, - stream, - debug_synchronous); - } - -}; - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/SRC/cub/device/device_select.cuh b/SRC/cub/device/device_select.cuh deleted file mode 100644 index 52a3e126..00000000 --- a/SRC/cub/device/device_select.cuh +++ /dev/null @@ -1,369 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::DeviceSelect provides device-wide, parallel operations for compacting selected items from sequences of data items residing within device-accessible memory. - */ - -#pragma once - -#include -#include - -#include "dispatch/dispatch_select_if.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \brief DeviceSelect provides device-wide, parallel operations for compacting selected items from sequences of data items residing within device-accessible memory. ![](select_logo.png) - * \ingroup SingleModule - * - * \par Overview - * These operations apply a selection criterion to selectively copy - * items from a specified input sequence to a compact output sequence. 
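The ArgMin/ArgMax reductions deleted above emit cub::KeyValuePair tuples rather than plain aggregates, so a host caller has to unpack .key (the in-segment offset) and .value (the extremum), with empty segments yielding the documented {1, INT_MAX} sentinel for ArgMin. A minimal sketch under the same assumptions as the header's own snippets (<cub/cub.cuh> available, error checking elided):

#include <cub/cub.cuh>
#include <cstdio>

int main()
{
    int h_in[7]      = {8, 6, 7, 5, 3, 0, 9};
    int h_offsets[4] = {0, 3, 3, 7};

    int *d_in, *d_offsets;
    cub::KeyValuePair<int, int> *d_out;
    cudaMalloc(&d_in,      7 * sizeof(int));
    cudaMalloc(&d_offsets, 4 * sizeof(int));
    cudaMalloc(&d_out,     3 * sizeof(*d_out));
    cudaMemcpy(d_in,      h_in,      7 * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_offsets, h_offsets, 4 * sizeof(int), cudaMemcpyHostToDevice);

    void  *d_temp_storage     = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes,
        d_in, d_out, 3, d_offsets, d_offsets + 1);   // sizing pass
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes,
        d_in, d_out, 3, d_offsets, d_offsets + 1);   // reduction pass

    // Expected: {1,6}, {1,INT_MAX} for the empty segment, {2,0}
    cub::KeyValuePair<int, int> h_out[3];
    cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
    for (int i = 0; i < 3; ++i)
        printf("segment %d: min %d at in-segment offset %d\n",
               i, h_out[i].value, h_out[i].key);
    return 0;
}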
- * - * \par Usage Considerations - * \cdp_class{DeviceSelect} - * - * \par Performance - * \linear_performance{select-flagged, select-if, and select-unique} - * - * \par - * The following chart illustrates DeviceSelect::If - * performance across different CUDA architectures for \p int32 items, - * where 50% of the items are randomly selected. - * - * \image html select_if_int32_50_percent.png - * - * \par - * The following chart illustrates DeviceSelect::Unique - * performance across different CUDA architectures for \p int32 items - * where segments have lengths uniformly sampled from [1,1000]. - * - * \image html select_unique_int32_len_500.png - * - * \par - * \plots_below - * - */ -struct DeviceSelect -{ - /** - * \brief Uses the \p d_flags sequence to selectively copy the corresponding items from \p d_in into \p d_out. The total number of items selected is written to \p d_num_selected_out. ![](select_flags_logo.png) - * - * \par - * - The value type of \p d_flags must be castable to \p bool (e.g., \p bool, \p char, \p int, etc.). - * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering. - * - \devicestorage - * - * \par Snippet - * The code snippet below illustrates the compaction of items selected from an \p int device vector. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input, flags, and output - * int num_items; // e.g., 8 - * int *d_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] - * char *d_flags; // e.g., [1, 0, 0, 1, 0, 1, 1, 0] - * int *d_out; // e.g., [ , , , , , , , ] - * int *d_num_selected_out; // e.g., [ ] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run selection - * cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items); - * - * // d_out <-- [1, 4, 6, 7] - * // d_num_selected_out <-- [4] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator - * \tparam FlagIterator [inferred] Random-access input iterator type for reading selection flags \iterator - * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing selected items \iterator - * \tparam NumSelectedIteratorT [inferred] Output iterator type for recording the number of items selected \iterator - */ - template < - typename InputIteratorT, - typename FlagIterator, - typename OutputIteratorT, - typename NumSelectedIteratorT> - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Flagged( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - FlagIterator d_flags, ///< [in] Pointer to the input sequence of selection flags - OutputIteratorT d_out, ///< [out] Pointer to the output sequence of selected data items - NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out) - int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - typedef int OffsetT; // Signed integer type for global offsets - typedef NullType SelectOp; // Selection op (not used) - typedef NullType EqualityOp; // Equality operator (not used) - - return DispatchSelectIf::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_flags, - d_out, - d_num_selected_out, - SelectOp(), - EqualityOp(), - num_items, - stream, - debug_synchronous); - } - - - /** - * \brief Uses the \p select_op functor to selectively copy items from \p d_in into \p d_out. The total number of items selected is written to \p d_num_selected_out. ![](select_logo.png) - * - * \par - * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering. - * - \devicestorage - * - * \par Performance - * The following charts illustrate saturated select-if performance across different - * CUDA architectures for \p int32 and \p int64 items, respectively. Items are - * selected with 50% probability. - * - * \image html select_if_int32_50_percent.png - * \image html select_if_int64_50_percent.png - * - * \par - * The following charts are similar, but 5% selection probability: - * - * \image html select_if_int32_5_percent.png - * \image html select_if_int64_5_percent.png - * - * \par Snippet - * The code snippet below illustrates the compaction of items selected from an \p int device vector. - * \par - * \code - * #include // or equivalently - * - * // Functor type for selecting values less than some criteria - * struct LessThan - * { - * int compare; - * - * CUB_RUNTIME_FUNCTION __forceinline__ - * LessThan(int compare) : compare(compare) {} - * - * CUB_RUNTIME_FUNCTION __forceinline__ - * bool operator()(const int &a) const { - * return (a < compare); - * } - * }; - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 8 - * int *d_in; // e.g., [0, 2, 3, 9, 5, 2, 81, 8] - * int *d_out; // e.g., [ , , , , , , , ] - * int *d_num_selected_out; // e.g., [ ] - * LessThan select_op(7); - * ... 
- * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run selection - * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op); - * - * // d_out <-- [0, 2, 3, 5, 2] - * // d_num_selected_out <-- [5] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator - * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing selected items \iterator - * \tparam NumSelectedIteratorT [inferred] Output iterator type for recording the number of items selected \iterator - * \tparam SelectOp [inferred] Selection operator type having member bool operator()(const T &a) - */ - template < - typename InputIteratorT, - typename OutputIteratorT, - typename NumSelectedIteratorT, - typename SelectOp> - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t If( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output sequence of selected data items - NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out) - int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - SelectOp select_op, ///< [in] Unary selection operator - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - typedef int OffsetT; // Signed integer type for global offsets - typedef NullType* FlagIterator; // FlagT iterator type (not used) - typedef NullType EqualityOp; // Equality operator (not used) - - return DispatchSelectIf::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - NULL, - d_out, - d_num_selected_out, - select_op, - EqualityOp(), - num_items, - stream, - debug_synchronous); - } - - - /** - * \brief Given an input sequence \p d_in having runs of consecutive equal-valued keys, only the first key from each run is selectively copied to \p d_out. The total number of items selected is written to \p d_num_selected_out. ![](unique_logo.png) - * - * \par - * - The == equality operator is used to determine whether keys are equivalent - * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering. - * - \devicestorage - * - * \par Performance - * The following charts illustrate saturated select-unique performance across different - * CUDA architectures for \p int32 and \p int64 items, respectively. Segments have - * lengths uniformly sampled from [1,1000]. 
- * - * \image html select_unique_int32_len_500.png - * \image html select_unique_int64_len_500.png - * - * \par - * The following charts are similar, but with segment lengths uniformly sampled from [1,10]: - * - * \image html select_unique_int32_len_5.png - * \image html select_unique_int64_len_5.png - * - * \par Snippet - * The code snippet below illustrates the compaction of items selected from an \p int device vector. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input and output - * int num_items; // e.g., 8 - * int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] - * int *d_out; // e.g., [ , , , , , , , ] - * int *d_num_selected_out; // e.g., [ ] - * ... - * - * // Determine temporary device storage requirements - * void *d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run selection - * cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items); - * - * // d_out <-- [0, 2, 9, 5, 8] - * // d_num_selected_out <-- [5] - * - * \endcode - * - * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator - * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing selected items \iterator - * \tparam NumSelectedIteratorT [inferred] Output iterator type for recording the number of items selected \iterator - */ - template < - typename InputIteratorT, - typename OutputIteratorT, - typename NumSelectedIteratorT> - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Unique( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output sequence of selected data items - NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out) - int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
- { - typedef int OffsetT; // Signed integer type for global offsets - typedef NullType* FlagIterator; // FlagT iterator type (not used) - typedef NullType SelectOp; // Selection op (not used) - typedef Equality EqualityOp; // Default == operator - - return DispatchSelectIf::Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - NULL, - d_out, - d_num_selected_out, - SelectOp(), - EqualityOp(), - num_items, - stream, - debug_synchronous); - } - -}; - -/** - * \example example_device_select_flagged.cu - * \example example_device_select_if.cu - * \example example_device_select_unique.cu - */ - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/SRC/cub/device/device_spmv.cuh b/SRC/cub/device/device_spmv.cuh deleted file mode 100644 index 63b6a7e8..00000000 --- a/SRC/cub/device/device_spmv.cuh +++ /dev/null @@ -1,174 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector multiplication (SpMV). - */ - -#pragma once - -#include -#include -#include - -#include "dispatch/dispatch_spmv_orig.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \brief DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * dense-vector multiplication (SpMV). 
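DeviceSelect::Unique, deleted above, reports the compacted length only through d_num_selected_out, so a caller that needs the result on the host typically copies that counter back before reading d_out; entries past it are unspecified. A minimal sketch with the header's own example data, again assuming <cub/cub.cuh> and eliding error checking:

#include <cub/cub.cuh>

int main()
{
    int h_in[8] = {0, 2, 2, 9, 5, 5, 5, 8};

    int *d_in, *d_out, *d_num_selected_out;
    cudaMalloc(&d_in,  8 * sizeof(int));
    cudaMalloc(&d_out, 8 * sizeof(int));
    cudaMalloc(&d_num_selected_out, sizeof(int));
    cudaMemcpy(d_in, h_in, 8 * sizeof(int), cudaMemcpyHostToDevice);

    void  *d_temp_storage     = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes,
        d_in, d_out, d_num_selected_out, 8);      // sizing pass
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes,
        d_in, d_out, d_num_selected_out, 8);      // selection pass

    // Only the first h_num_selected entries of d_out are meaningful:
    // here 5 entries, [0, 2, 9, 5, 8].
    int h_num_selected = 0;
    cudaMemcpy(&h_num_selected, d_num_selected_out, sizeof(int),
               cudaMemcpyDeviceToHost);
    int h_out[8];
    cudaMemcpy(h_out, d_out, h_num_selected * sizeof(int),
               cudaMemcpyDeviceToHost);
    return 0;
}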
- * \ingroup SingleModule - * - * \par Overview - * The [SpMV computation](http://en.wikipedia.org/wiki/Sparse_matrix-vector_multiplication) - * performs the matrix-vector operation - * y = alpha*A*x + beta*y, - * where: - * - A is an mxn sparse matrix whose non-zero structure is specified in - * [compressed-storage-row (CSR) format](http://en.wikipedia.org/wiki/Sparse_matrix#Compressed_row_Storage_.28CRS_or_CSR.29) - * (i.e., three arrays: values, row_offsets, and column_indices) - * - x and y are dense vectors - * - alpha and beta are scalar multiplicands - * - * \par Usage Considerations - * \cdp_class{DeviceSpmv} - * - */ -struct DeviceSpmv -{ - /******************************************************************//** - * \name CSR matrix operations - *********************************************************************/ - //@{ - - /** - * \brief This function performs the matrix-vector operation y = A*x. - * - * \par Snippet - * The code snippet below illustrates SpMV upon a 9x9 CSR matrix A - * representing a 3x3 lattice (24 non-zeros). - * - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize device-accessible pointers for input matrix A, input vector x, - * // and output vector y - * int num_rows = 9; - * int num_cols = 9; - * int num_nonzeros = 24; - * - * float* d_values; // e.g., [1, 1, 1, 1, 1, 1, 1, 1, - * // 1, 1, 1, 1, 1, 1, 1, 1, - * // 1, 1, 1, 1, 1, 1, 1, 1] - * - * int* d_column_indices; // e.g., [1, 3, 0, 2, 4, 1, 5, 0, - * // 4, 6, 1, 3, 5, 7, 2, 4, - * // 8, 3, 7, 4, 6, 8, 5, 7] - * - * int* d_row_offsets; // e.g., [0, 2, 5, 7, 10, 14, 17, 19, 22, 24] - * - * float* d_vector_x; // e.g., [1, 1, 1, 1, 1, 1, 1, 1, 1] - * float* d_vector_y; // e.g., [ , , , , , , , , ] - * ... - * - * // Determine temporary device storage requirements - * void* d_temp_storage = NULL; - * size_t temp_storage_bytes = 0; - * cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values, - * d_row_offsets, d_column_indices, d_vector_x, d_vector_y, - * num_rows, num_cols, num_nonzeros, alpha, beta); - * - * // Allocate temporary storage - * cudaMalloc(&d_temp_storage, temp_storage_bytes); - * - * // Run SpMV - * cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values, - * d_row_offsets, d_column_indices, d_vector_x, d_vector_y, - * num_rows, num_cols, num_nonzeros, alpha, beta); - * - * // d_vector_y <-- [2, 3, 2, 3, 4, 3, 2, 3, 2] - * - * \endcode - * - * \tparam ValueT [inferred] Matrix and vector value type (e.g., /p float, /p double, etc.) - */ - template < - typename ValueT> - CUB_RUNTIME_FUNCTION - static cudaError_t CsrMV( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - ValueT* d_values, ///< [in] Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix A. - int* d_row_offsets, ///< [in] Pointer to the array of \p m + 1 offsets demarcating the start of every row in \p d_column_indices and \p d_values (with the final entry being equal to \p num_nonzeros) - int* d_column_indices, ///< [in] Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix A. (Indices are zero-valued.) 
- ValueT* d_vector_x, ///< [in] Pointer to the array of \p num_cols values corresponding to the dense input vector x - ValueT* d_vector_y, ///< [out] Pointer to the array of \p num_rows values corresponding to the dense output vector y - int num_rows, ///< [in] number of rows of matrix A. - int num_cols, ///< [in] number of columns of matrix A. - int num_nonzeros, ///< [in] number of nonzero elements of matrix A. - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - SpmvParams spmv_params; - spmv_params.d_values = d_values; - spmv_params.d_row_end_offsets = d_row_offsets + 1; - spmv_params.d_column_indices = d_column_indices; - spmv_params.d_vector_x = d_vector_x; - spmv_params.d_vector_y = d_vector_y; - spmv_params.num_rows = num_rows; - spmv_params.num_cols = num_cols; - spmv_params.num_nonzeros = num_nonzeros; - spmv_params.alpha = 1.0; - spmv_params.beta = 0.0; - - return DispatchSpmv::Dispatch( - d_temp_storage, - temp_storage_bytes, - spmv_params, - stream, - debug_synchronous); - } - - //@} end member group -}; - - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/SRC/cub/device/dispatch/dispatch_histogram.cuh b/SRC/cub/device/dispatch/dispatch_histogram.cuh deleted file mode 100644 index ab08e8ed..00000000 --- a/SRC/cub/device/dispatch/dispatch_histogram.cuh +++ /dev/null @@ -1,1096 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
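One quirk in the CsrMV documentation deleted above: the snippet passes alpha and beta, but the overload's signature takes neither, and the body fixes spmv_params.alpha = 1.0 and spmv_params.beta = 0.0, so this entry point only computes y = A*x. A call matching the actual signature, sketched with the snippet's 3x3-lattice data (<cub/cub.cuh> assumed, error checking elided):

#include <cub/cub.cuh>

int main()
{
    // 9x9 CSR matrix for a 3x3 lattice (24 non-zeros), as in the snippet.
    float h_values[24];
    for (int i = 0; i < 24; ++i) h_values[i] = 1.0f;
    int h_column_indices[24] = {1, 3, 0, 2, 4, 1, 5, 0,
                                4, 6, 1, 3, 5, 7, 2, 4,
                                8, 3, 7, 4, 6, 8, 5, 7};
    int h_row_offsets[10]    = {0, 2, 5, 7, 10, 14, 17, 19, 22, 24};
    float h_x[9] = {1, 1, 1, 1, 1, 1, 1, 1, 1};

    float *d_values, *d_vector_x, *d_vector_y;
    int   *d_column_indices, *d_row_offsets;
    cudaMalloc(&d_values,         24 * sizeof(float));
    cudaMalloc(&d_column_indices, 24 * sizeof(int));
    cudaMalloc(&d_row_offsets,    10 * sizeof(int));
    cudaMalloc(&d_vector_x,        9 * sizeof(float));
    cudaMalloc(&d_vector_y,        9 * sizeof(float));
    cudaMemcpy(d_values, h_values, 24 * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_column_indices, h_column_indices, 24 * sizeof(int),
               cudaMemcpyHostToDevice);
    cudaMemcpy(d_row_offsets, h_row_offsets, 10 * sizeof(int),
               cudaMemcpyHostToDevice);
    cudaMemcpy(d_vector_x, h_x, 9 * sizeof(float), cudaMemcpyHostToDevice);

    void  *d_temp_storage     = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes,
        d_values, d_row_offsets, d_column_indices,
        d_vector_x, d_vector_y, 9, 9, 24);        // sizing pass
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes,
        d_values, d_row_offsets, d_column_indices,
        d_vector_x, d_vector_y, 9, 9, 24);        // y = A * x

    // d_vector_y <-- [2, 3, 2, 3, 4, 3, 2, 3, 2]
    return 0;
}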
- * - ******************************************************************************/ - -/** - * \file - * cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory. - */ - -#pragma once - -#include -#include -#include - -#include "../../agent/agent_histogram.cuh" -#include "../../util_debug.cuh" -#include "../../util_device.cuh" -#include "../../thread/thread_search.cuh" -#include "../../grid/grid_queue.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - - -/****************************************************************************** - * Histogram kernel entry points - *****************************************************************************/ - -/** - * Histogram initialization kernel entry point - */ -template < - int NUM_ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed - typename CounterT, ///< Integer type for counting sample occurrences per histogram bin - typename OffsetT> ///< Signed integer type for global offsets -__global__ void DeviceHistogramInitKernel( - ArrayWrapper num_output_bins_wrapper, ///< Number of output histogram bins per channel - ArrayWrapper d_output_histograms_wrapper, ///< Histogram counter data having logical dimensions CounterT[NUM_ACTIVE_CHANNELS][num_bins.array[CHANNEL]] - GridQueue tile_queue) ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks -{ - if ((threadIdx.x == 0) && (blockIdx.x == 0)) - tile_queue.ResetDrain(); - - int output_bin = (blockIdx.x * blockDim.x) + threadIdx.x; - - #pragma unroll - for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) - { - if (output_bin < num_output_bins_wrapper.array[CHANNEL]) - d_output_histograms_wrapper.array[CHANNEL][output_bin] = 0; - } -} - - -/** - * Histogram privatized sweep kernel entry point (multi-block). Computes privatized histograms, one per thread block. - */ -template < - typename AgentHistogramPolicyT, ///< Parameterized AgentHistogramPolicy tuning policy type - int PRIVATIZED_SMEM_BINS, ///< Maximum number of histogram bins per channel (e.g., up to 256) - int NUM_CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) - int NUM_ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed - typename SampleIteratorT, ///< The input iterator type. \iterator. 
- typename CounterT, ///< Integer type for counting sample occurrences per histogram bin - typename PrivatizedDecodeOpT, ///< The transform operator type for determining privatized counter indices from samples, one for each channel - typename OutputDecodeOpT, ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel - typename OffsetT> ///< Signed integer type for global offsets -__launch_bounds__ (int(AgentHistogramPolicyT::BLOCK_THREADS)) -__global__ void DeviceHistogramSweepKernel( - SampleIteratorT d_samples, ///< Input data to reduce - ArrayWrapper num_output_bins_wrapper, ///< The number bins per final output histogram - ArrayWrapper num_privatized_bins_wrapper, ///< The number bins per privatized histogram - ArrayWrapper d_output_histograms_wrapper, ///< Reference to final output histograms - ArrayWrapper d_privatized_histograms_wrapper, ///< Reference to privatized histograms - ArrayWrapper output_decode_op_wrapper, ///< The transform operator for determining output bin-ids from privatized counter indices, one for each channel - ArrayWrapper privatized_decode_op_wrapper, ///< The transform operator for determining privatized counter indices from samples, one for each channel - OffsetT num_row_pixels, ///< The number of multi-channel pixels per row in the region of interest - OffsetT num_rows, ///< The number of rows in the region of interest - OffsetT row_stride_samples, ///< The number of samples between starts of consecutive rows in the region of interest - int tiles_per_row, ///< Number of image tiles per row - GridQueue tile_queue) ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks -{ - // Thread block type for compositing input tiles - typedef AgentHistogram< - AgentHistogramPolicyT, - PRIVATIZED_SMEM_BINS, - NUM_CHANNELS, - NUM_ACTIVE_CHANNELS, - SampleIteratorT, - CounterT, - PrivatizedDecodeOpT, - OutputDecodeOpT, - OffsetT> - AgentHistogramT; - - // Shared memory for AgentHistogram - __shared__ typename AgentHistogramT::TempStorage temp_storage; - - AgentHistogramT agent( - temp_storage, - d_samples, - num_output_bins_wrapper.array, - num_privatized_bins_wrapper.array, - d_output_histograms_wrapper.array, - d_privatized_histograms_wrapper.array, - output_decode_op_wrapper.array, - privatized_decode_op_wrapper.array); - - // Initialize counters - agent.InitBinCounters(); - - // Consume input tiles - agent.ConsumeTiles( - num_row_pixels, - num_rows, - row_stride_samples, - tiles_per_row, - tile_queue); - - // Store output to global (if necessary) - agent.StoreOutput(); - -} - - - - - - -/****************************************************************************** - * Dispatch - ******************************************************************************/ - -/** - * Utility class for dispatching the appropriately-tuned kernels for DeviceHistogram - */ -template < - int NUM_CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) - int NUM_ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed - typename SampleIteratorT, ///< Random-access input iterator type for reading input items \iterator - typename CounterT, ///< Integer type for counting sample occurrences per histogram bin - typename LevelT, ///< Type for specifying bin level boundaries - typename OffsetT> ///< Signed integer type for global offsets -struct DipatchHistogram -{ - 
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    /// The sample value type of the input iterator
-    typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
-
-    enum
-    {
-        // Maximum number of bins per channel for which we will use a privatized smem strategy
-        MAX_PRIVATIZED_SMEM_BINS = 256
-    };
-
-
-    //---------------------------------------------------------------------
-    // Transform functors for converting samples to bin-ids
-    //---------------------------------------------------------------------
-
-    // Searches for bin given a list of bin-boundary levels
-    template <typename LevelIteratorT>
-    struct SearchTransform
-    {
-        LevelIteratorT  d_levels;           // Pointer to levels array
-        int             num_output_levels;  // Number of levels in array
-
-        // Initializer
-        __host__ __device__ __forceinline__ void Init(
-            LevelIteratorT  d_levels,           // Pointer to levels array
-            int             num_output_levels)  // Number of levels in array
-        {
-            this->d_levels          = d_levels;
-            this->num_output_levels = num_output_levels;
-        }
-
-        // Method for converting samples to bin-ids
-        template <typename _SampleT>
-        __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid)
-        {
-            /// Level iterator wrapper type
-            typedef typename If<IsPointer<LevelIteratorT>::VALUE,
-                    CacheModifiedInputIterator<LOAD_LDG, LevelT, OffsetT>,  // Wrap the native input pointer with CacheModifiedInputIterator
-                    LevelIteratorT>::Type                                   // Directly use the supplied input iterator type
-                WrappedLevelIteratorT;
-
-            WrappedLevelIteratorT wrapped_levels(d_levels);
-
-            int num_bins = num_output_levels - 1;
-            if (valid)
-            {
-                bin = UpperBound(wrapped_levels, num_output_levels, (LevelT) sample) - 1;
-                if (bin >= num_bins)
-                    bin = -1;
-            }
-        }
-    };
-
-
-    // Scales samples to evenly-spaced bins
-    struct ScaleTransform
-    {
-        int    num_bins;    // Number of bins (one less than the number of levels)
-        LevelT max;         // Max sample level (exclusive)
-        LevelT min;         // Min sample level (inclusive)
-        LevelT scale;       // Bin scaling factor
-
-        // Initializer
-        template <typename _LevelT>
-        __host__ __device__ __forceinline__ void Init(
-            int     num_output_levels,  // Number of levels in array
-            _LevelT max,                // Max sample level (exclusive)
-            _LevelT min,                // Min sample level (inclusive)
-            _LevelT scale)              // Bin scaling factor
-        {
-            this->num_bins = num_output_levels - 1;
-            this->max      = max;
-            this->min      = min;
-            this->scale    = scale;
-        }
-
-        // Initializer (float specialization)
-        __host__ __device__ __forceinline__ void Init(
-            int   num_output_levels,    // Number of levels in array
-            float max,                  // Max sample level (exclusive)
-            float min,                  // Min sample level (inclusive)
-            float scale)                // Bin scaling factor
-        {
-            this->num_bins = num_output_levels - 1;
-            this->max      = max;
-            this->min      = min;
-            this->scale    = float(1.0) / scale;
-        }
-
-        // Initializer (double specialization)
-        __host__ __device__ __forceinline__ void Init(
-            int    num_output_levels,   // Number of levels in array
-            double max,                 // Max sample level (exclusive)
-            double min,                 // Min sample level (inclusive)
-            double scale)               // Bin scaling factor
-        {
-            this->num_bins = num_output_levels - 1;
-            this->max      = max;
-            this->min      = min;
-            this->scale    = double(1.0) / scale;
-        }
-
-        // Method for converting samples to bin-ids
-        template <typename _SampleT>
-        __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid)
-        {
-            LevelT level_sample = (LevelT) sample;
-
-            if (valid && (level_sample >= min) && (level_sample < max))
-                bin = (int) ((level_sample - min) / scale);
-        }
-
-        // Method for converting samples to bin-ids (float specialization)
-
template - __host__ __device__ __forceinline__ void BinSelect(float sample, int &bin, bool valid) - { - LevelT level_sample = (LevelT) sample; - - if (valid && (level_sample >= min) && (level_sample < max)) - bin = (int) ((level_sample - min) * scale); - } - - // Method for converting samples to bin-ids (double specialization) - template - __host__ __device__ __forceinline__ void BinSelect(double sample, int &bin, bool valid) - { - LevelT level_sample = (LevelT) sample; - - if (valid && (level_sample >= min) && (level_sample < max)) - bin = (int) ((level_sample - min) * scale); - } - }; - - - // Pass-through bin transform operator - struct PassThruTransform - { - // Method for converting samples to bin-ids - template - __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid) - { - if (valid) - bin = (int) sample; - } - }; - - - - //--------------------------------------------------------------------- - // Tuning policies - //--------------------------------------------------------------------- - - template - struct TScale - { - enum - { - V_SCALE = (sizeof(SampleT) + sizeof(int) - 1) / sizeof(int), - VALUE = CUB_MAX((NOMINAL_ITEMS_PER_THREAD / NUM_ACTIVE_CHANNELS / V_SCALE), 1) - }; - }; - - - /// SM11 - struct Policy110 - { - // HistogramSweepPolicy - typedef AgentHistogramPolicy< - 512, - (NUM_CHANNELS == 1) ? 8 : 2, - BLOCK_LOAD_DIRECT, - LOAD_DEFAULT, - true, - GMEM, - false> - HistogramSweepPolicy; - }; - - /// SM20 - struct Policy200 - { - // HistogramSweepPolicy - typedef AgentHistogramPolicy< - (NUM_CHANNELS == 1) ? 256 : 128, - (NUM_CHANNELS == 1) ? 8 : 3, - (NUM_CHANNELS == 1) ? BLOCK_LOAD_DIRECT : BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - true, - SMEM, - false> - HistogramSweepPolicy; - }; - - /// SM30 - struct Policy300 - { - // HistogramSweepPolicy - typedef AgentHistogramPolicy< - 512, - (NUM_CHANNELS == 1) ? 
-                8 : 2,
-                BLOCK_LOAD_DIRECT,
-                LOAD_DEFAULT,
-                true,
-                GMEM,
-                false>
-            HistogramSweepPolicy;
-    };
-
-    /// SM35
-    struct Policy350
-    {
-        // HistogramSweepPolicy
-        typedef AgentHistogramPolicy<
-                128,
-                TScale<8>::VALUE,
-                BLOCK_LOAD_DIRECT,
-                LOAD_LDG,
-                true,
-                BLEND,
-                true>
-            HistogramSweepPolicy;
-    };
-
-    /// SM50
-    struct Policy500
-    {
-        // HistogramSweepPolicy
-        typedef AgentHistogramPolicy<
-                384,
-                TScale<16>::VALUE,
-                BLOCK_LOAD_DIRECT,
-                LOAD_LDG,
-                true,
-                SMEM,
-                false>
-            HistogramSweepPolicy;
-    };
-
-
-
-    //---------------------------------------------------------------------
-    // Tuning policies of current PTX compiler pass
-    //---------------------------------------------------------------------
-
-#if (CUB_PTX_ARCH >= 500)
-    typedef Policy500 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 350)
-    typedef Policy350 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 300)
-    typedef Policy300 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 200)
-    typedef Policy200 PtxPolicy;
-
-#else
-    typedef Policy110 PtxPolicy;
-
-#endif
-
-    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
-    struct PtxHistogramSweepPolicy : PtxPolicy::HistogramSweepPolicy {};
-
-
-    //---------------------------------------------------------------------
-    // Utilities
-    //---------------------------------------------------------------------
-
-    /**
-     * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
-     */
-    template <typename KernelConfig>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t InitConfigs(
-        int          ptx_version,
-        KernelConfig &histogram_sweep_config)
-    {
-    #if (CUB_PTX_ARCH > 0)
-
-        // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
-        return histogram_sweep_config.template Init<PtxHistogramSweepPolicy>();
-
-    #else
-
-        // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
-        if (ptx_version >= 500)
-        {
-            return histogram_sweep_config.template Init<typename Policy500::HistogramSweepPolicy>();
-        }
-        else if (ptx_version >= 350)
-        {
-            return histogram_sweep_config.template Init<typename Policy350::HistogramSweepPolicy>();
-        }
-        else if (ptx_version >= 300)
-        {
-            return histogram_sweep_config.template Init<typename Policy300::HistogramSweepPolicy>();
-        }
-        else if (ptx_version >= 200)
-        {
-            return histogram_sweep_config.template Init<typename Policy200::HistogramSweepPolicy>();
-        }
-        else if (ptx_version >= 110)
-        {
-            return histogram_sweep_config.template Init<typename Policy110::HistogramSweepPolicy>();
-        }
-        else
-        {
-            // No global atomic support
-            return cudaErrorNotSupported;
-        }
-
-    #endif
-    }
-
-
-    /**
-     * Kernel dispatch configuration
-     */
-    struct KernelConfig
-    {
-        int block_threads;
-        int pixels_per_thread;
-
-        template <typename BlockPolicy>
-        CUB_RUNTIME_FUNCTION __forceinline__
-        cudaError_t Init()
-        {
-            block_threads     = BlockPolicy::BLOCK_THREADS;
-            pixels_per_thread = BlockPolicy::PIXELS_PER_THREAD;
-
-            return cudaSuccess;
-        }
-    };
-
-
-    //---------------------------------------------------------------------
-    // Dispatch entrypoints
-    //---------------------------------------------------------------------
-
-    /**
-     * Privatization-based dispatch routine
-     */
-    template <
-        typename PrivatizedDecodeOpT,           ///< The transform operator type for determining privatized counter indices from samples, one for each channel
-        typename OutputDecodeOpT,               ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel
-        typename DeviceHistogramInitKernelT,    ///< Function type of cub::DeviceHistogramInitKernel
-        typename DeviceHistogramSweepKernelT>   ///< Function type of cub::DeviceHistogramSweepKernel
-    CUB_RUNTIME_FUNCTION
-    __forceinline__
-    static cudaError_t PrivatizedDispatch(
-        void*                       d_temp_storage,                             ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&                     temp_storage_bytes,                         ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        SampleIteratorT             d_samples,                                  ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
-        CounterT*                   d_output_histograms[NUM_ACTIVE_CHANNELS],   ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel i, the allocation length of d_histograms[i] should be num_output_levels[i] - 1.
-        int                         num_privatized_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of privatized bins for channel i is num_privatized_levels[i] - 1.
-        PrivatizedDecodeOpT         privatized_decode_op[NUM_ACTIVE_CHANNELS],  ///< [in] Transform operators for determining privatized bin-ids from samples, one for each channel
-        int                         num_output_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channel i is num_output_levels[i] - 1.
-        OutputDecodeOpT             output_decode_op[NUM_ACTIVE_CHANNELS],      ///< [in] Transform operators for determining output bin-ids from privatized bin-ids, one for each channel
-        int                         max_num_output_bins,                        ///< [in] Maximum number of output bins in any channel
-        OffsetT                     num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
-        OffsetT                     num_rows,                                   ///< [in] The number of rows in the region of interest
-        OffsetT                     row_stride_samples,                         ///< [in] The number of samples between starts of consecutive rows in the region of interest
-        DeviceHistogramInitKernelT  histogram_init_kernel,                      ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramInitKernel
-        DeviceHistogramSweepKernelT histogram_sweep_kernel,                     ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramSweepKernel
-        KernelConfig                histogram_sweep_config,                     ///< [in] Dispatch parameters that match the policy that \p histogram_sweep_kernel was compiled for
-        cudaStream_t                stream,                                     ///< [in] CUDA stream to launch kernels within. Default is stream 0.
-        bool                        debug_synchronous)                          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
-    {
-    #ifndef CUB_RUNTIME_ENABLED
-
-        // Kernel launch not supported from this device
-        return CubDebug(cudaErrorNotSupported);
-
-    #else
-
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get device ordinal
-            int device_ordinal;
-            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
-
-            // Get SM count
-            int sm_count;
-            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
-
-            // Get SM occupancy for histogram_sweep_kernel
-            int histogram_sweep_sm_occupancy;
-            if (CubDebug(error = MaxSmOccupancy(
-                histogram_sweep_sm_occupancy,
-                histogram_sweep_kernel,
-                histogram_sweep_config.block_threads))) break;
-
-            // Get device occupancy for histogram_sweep_kernel
-            int histogram_sweep_occupancy = histogram_sweep_sm_occupancy * sm_count;
-
-            if (num_row_pixels * NUM_CHANNELS == row_stride_samples)
-            {
-                // Treat as a single linear array of samples
-                num_row_pixels      *= num_rows;
-                num_rows            = 1;
-                row_stride_samples  = num_row_pixels * NUM_CHANNELS;
-            }
-
-            // Get grid dimensions, trying to keep total blocks ~histogram_sweep_occupancy
-            int pixels_per_tile     = histogram_sweep_config.block_threads * histogram_sweep_config.pixels_per_thread;
-            int tiles_per_row       = int(num_row_pixels + pixels_per_tile - 1) / pixels_per_tile;
-            int blocks_per_row      = CUB_MIN(histogram_sweep_occupancy, tiles_per_row);
-            int blocks_per_col      = (blocks_per_row > 0) ?
-                                        int(CUB_MIN(histogram_sweep_occupancy / blocks_per_row, num_rows)) :
-                                        0;
-            int num_thread_blocks   = blocks_per_row * blocks_per_col;
-
-            dim3 sweep_grid_dims;
-            sweep_grid_dims.x = (unsigned int) blocks_per_row;
-            sweep_grid_dims.y = (unsigned int) blocks_per_col;
-            sweep_grid_dims.z = 1;
-
-            // Temporary storage allocation requirements
-            const int   NUM_ALLOCATIONS = NUM_ACTIVE_CHANNELS + 1;
-            void*       allocations[NUM_ALLOCATIONS];
-            size_t      allocation_sizes[NUM_ALLOCATIONS];
-
-            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-                allocation_sizes[CHANNEL] = size_t(num_thread_blocks) * (num_privatized_levels[CHANNEL] - 1) * sizeof(CounterT);
-
-            allocation_sizes[NUM_ALLOCATIONS - 1] = GridQueue<int>::AllocationSize();
-
-            // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
-            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
-            if (d_temp_storage == NULL)
-            {
-                // Return if the caller is simply requesting the size of the storage allocation
-                break;
-            }
-
-            // Construct the grid queue descriptor
-            GridQueue<int> tile_queue(allocations[NUM_ALLOCATIONS - 1]);
-
-            // Setup array wrapper for histogram channel output (because we can't pass static arrays as kernel parameters)
-            ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_output_histograms_wrapper;
-            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-                d_output_histograms_wrapper.array[CHANNEL] = d_output_histograms[CHANNEL];
-
-            // Setup array wrapper for privatized per-block histogram channel output (because we can't pass static arrays as kernel parameters)
-            ArrayWrapper<CounterT*, NUM_ACTIVE_CHANNELS> d_privatized_histograms_wrapper;
-            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-                d_privatized_histograms_wrapper.array[CHANNEL] = (CounterT*) allocations[CHANNEL];
-
-            // Setup array wrapper for sweep bin transforms (because we can't pass static arrays as kernel parameters)
-            ArrayWrapper<PrivatizedDecodeOpT, NUM_ACTIVE_CHANNELS> privatized_decode_op_wrapper;
-            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-                privatized_decode_op_wrapper.array[CHANNEL] = privatized_decode_op[CHANNEL];
-
-            // Setup array wrapper for aggregation bin transforms (because we can't pass static arrays as kernel parameters)
-            ArrayWrapper<OutputDecodeOpT, NUM_ACTIVE_CHANNELS> output_decode_op_wrapper;
-            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-                output_decode_op_wrapper.array[CHANNEL] = output_decode_op[CHANNEL];
-
-            // Setup array wrapper for num privatized bins (because we can't pass static arrays as kernel parameters)
-            ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_privatized_bins_wrapper;
-            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-                num_privatized_bins_wrapper.array[CHANNEL] = num_privatized_levels[CHANNEL] - 1;
-
-            // Setup array wrapper for num output bins (because we can't pass static arrays as kernel parameters)
-            ArrayWrapper<int, NUM_ACTIVE_CHANNELS> num_output_bins_wrapper;
-            for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL)
-                num_output_bins_wrapper.array[CHANNEL] = num_output_levels[CHANNEL] - 1;
-
-            int histogram_init_block_threads    = 256;
-            int histogram_init_grid_dims        = (max_num_output_bins + histogram_init_block_threads - 1) / histogram_init_block_threads;
-
-            // Log DeviceHistogramInitKernel configuration
-            if (debug_synchronous) _CubLog("Invoking DeviceHistogramInitKernel<<<%d, %d, 0, %lld>>>()\n",
-                histogram_init_grid_dims, histogram_init_block_threads, (long long) stream);
-
-            // Invoke histogram_init_kernel
-            histogram_init_kernel<<<histogram_init_grid_dims, histogram_init_block_threads, 0, stream>>>(
-                num_output_bins_wrapper,
-                d_output_histograms_wrapper,
-                tile_queue);
-
-            // Return if empty problem
-            if ((blocks_per_row == 0) || (blocks_per_col == 0))
-                break;
-
-            // Log histogram_sweep_kernel configuration
-            if (debug_synchronous) _CubLog("Invoking histogram_sweep_kernel<<<{%d, %d, %d}, %d, 0, %lld>>>(), %d pixels per thread, %d SM occupancy\n",
-                sweep_grid_dims.x, sweep_grid_dims.y, sweep_grid_dims.z,
-                histogram_sweep_config.block_threads, (long long) stream, histogram_sweep_config.pixels_per_thread, histogram_sweep_sm_occupancy);
-
-            // Invoke histogram_sweep_kernel
-            histogram_sweep_kernel<<<sweep_grid_dims, histogram_sweep_config.block_threads, 0, stream>>>(
-                d_samples,
-                num_output_bins_wrapper,
-                num_privatized_bins_wrapper,
-                d_output_histograms_wrapper,
-                d_privatized_histograms_wrapper,
-                output_decode_op_wrapper,
-                privatized_decode_op_wrapper,
-                num_row_pixels,
-                num_rows,
-                row_stride_samples,
-                tiles_per_row,
-                tile_queue);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-
-        }
-        while (0);
-
-        return error;
-
-    #endif // CUB_RUNTIME_ENABLED
-    }
-
-
-
-    /**
-     * Dispatch routine for HistogramRange, specialized for sample types larger than 8-bit
-     */
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t DispatchRange(
-        void*           d_temp_storage,                             ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t&         temp_storage_bytes,                         ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        SampleIteratorT d_samples,                                  ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples).
-        CounterT*       d_output_histograms[NUM_ACTIVE_CHANNELS],   ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channel i, the allocation length of d_histograms[i] should be num_output_levels[i] - 1.
-        int             num_output_levels[NUM_ACTIVE_CHANNELS],     ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channel i is num_output_levels[i] - 1.
-        LevelT          *d_levels[NUM_ACTIVE_CHANNELS],             ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive.
-        OffsetT         num_row_pixels,                             ///< [in] The number of multi-channel pixels per row in the region of interest
-        OffsetT         num_rows,                                   ///< [in] The number of rows in the region of interest
-        OffsetT         row_stride_samples,                         ///< [in] The number of samples between starts of consecutive rows in the region of interest
-        cudaStream_t    stream,                                     ///< [in] CUDA stream to launch kernels within. Default is stream 0.
-        bool            debug_synchronous,                          ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
-        Int2Type<false> is_byte_sample)                             ///< [in] Marker type indicating whether or not SampleT is an 8-bit type
-    {
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get PTX version
-            int ptx_version;
-        #if (CUB_PTX_ARCH == 0)
-            if (CubDebug(error = PtxVersion(ptx_version))) break;
-        #else
-            ptx_version = CUB_PTX_ARCH;
-        #endif
-
-            // Get kernel dispatch configurations
-            KernelConfig histogram_sweep_config;
-            if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config)))
-                break;
-
-            // Use the search transform op for converting samples to privatized bins
-            typedef SearchTransform<LevelT*> PrivatizedDecodeOpT;
-
-            // Use the pass-thru transform op for converting privatized bins to output bins
-            typedef PassThruTransform OutputDecodeOpT;
-
-            PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS];
-            OutputDecodeOpT     output_decode_op[NUM_ACTIVE_CHANNELS];
-            int                 max_levels = num_output_levels[0];
-
-            for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel)
-            {
-                privatized_decode_op[channel].Init(d_levels[channel], num_output_levels[channel]);
-                if (num_output_levels[channel] > max_levels)
-                    max_levels = num_output_levels[channel];
-            }
-            int max_num_output_bins = max_levels - 1;
-
-            // Dispatch
-            if (max_num_output_bins > MAX_PRIVATIZED_SMEM_BINS)
-            {
-                // Too many bins to keep in shared memory.
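-                // (With PRIVATIZED_SMEM_BINS == 0 the sweep kernel is compiled
-                // without shared-memory bin counters, and each thread block instead
-                // accumulates into its own global-memory copy of the histogram,
-                // carved out of the temp-storage blob that PrivatizedDispatch sizes
-                // as num_thread_blocks * (num_privatized_levels[c] - 1) counters
-                // per channel.)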
- const int PRIVATIZED_SMEM_BINS = 0; - - if (CubDebug(error = PrivatizedDispatch( - d_temp_storage, - temp_storage_bytes, - d_samples, - d_output_histograms, - num_output_levels, - privatized_decode_op, - num_output_levels, - output_decode_op, - max_num_output_bins, - num_row_pixels, - num_rows, - row_stride_samples, - DeviceHistogramInitKernel, - DeviceHistogramSweepKernel, - histogram_sweep_config, - stream, - debug_synchronous))) break; - } - else - { - // Dispatch shared-privatized approach - const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS; - - if (CubDebug(error = PrivatizedDispatch( - d_temp_storage, - temp_storage_bytes, - d_samples, - d_output_histograms, - num_output_levels, - privatized_decode_op, - num_output_levels, - output_decode_op, - max_num_output_bins, - num_row_pixels, - num_rows, - row_stride_samples, - DeviceHistogramInitKernel, - DeviceHistogramSweepKernel, - histogram_sweep_config, - stream, - debug_synchronous))) break; - } - - } while (0); - - return error; - } - - - /** - * Dispatch routine for HistogramRange, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels) - */ - CUB_RUNTIME_FUNCTION - static cudaError_t DispatchRange( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). - CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. - int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. - LevelT *d_levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. - OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest - OffsetT num_rows, ///< [in] The number of rows in the region of interest - OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest - cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
- Int2Type is_byte_sample) ///< [in] Marker type indicating whether or not SampleT is a 8b type - { - cudaError error = cudaSuccess; - do - { - // Get PTX version - int ptx_version; - #if (CUB_PTX_ARCH == 0) - if (CubDebug(error = PtxVersion(ptx_version))) break; - #else - ptx_version = CUB_PTX_ARCH; - #endif - - // Get kernel dispatch configurations - KernelConfig histogram_sweep_config; - if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config))) - break; - - // Use the pass-thru transform op for converting samples to privatized bins - typedef PassThruTransform PrivatizedDecodeOpT; - - // Use the search transform op for converting privatized bins to output bins - typedef SearchTransform OutputDecodeOpT; - - int num_privatized_levels[NUM_ACTIVE_CHANNELS]; - PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS]; - OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS]; - int max_levels = num_output_levels[0]; // Maximum number of levels in any channel - - for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) - { - num_privatized_levels[channel] = 257; - output_decode_op[channel].Init(d_levels[channel], num_output_levels[channel]); - - if (num_output_levels[channel] > max_levels) - max_levels = num_output_levels[channel]; - } - int max_num_output_bins = max_levels - 1; - - const int PRIVATIZED_SMEM_BINS = 256; - - if (CubDebug(error = PrivatizedDispatch( - d_temp_storage, - temp_storage_bytes, - d_samples, - d_output_histograms, - num_privatized_levels, - privatized_decode_op, - num_output_levels, - output_decode_op, - max_num_output_bins, - num_row_pixels, - num_rows, - row_stride_samples, - DeviceHistogramInitKernel, - DeviceHistogramSweepKernel, - histogram_sweep_config, - stream, - debug_synchronous))) break; - - } while (0); - - return error; - } - - - /** - * Dispatch routine for HistogramEven, specialized for sample types larger than 8-bit - */ - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t DispatchEven( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). - CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. - int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. - LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. - LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. 
- OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest - OffsetT num_rows, ///< [in] The number of rows in the region of interest - OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest - cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - Int2Type is_byte_sample) ///< [in] Marker type indicating whether or not SampleT is a 8b type - { - cudaError error = cudaSuccess; - do - { - // Get PTX version - int ptx_version; - #if (CUB_PTX_ARCH == 0) - if (CubDebug(error = PtxVersion(ptx_version))) break; - #else - ptx_version = CUB_PTX_ARCH; - #endif - - // Get kernel dispatch configurations - KernelConfig histogram_sweep_config; - if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config))) - break; - - // Use the scale transform op for converting samples to privatized bins - typedef ScaleTransform PrivatizedDecodeOpT; - - // Use the pass-thru transform op for converting privatized bins to output bins - typedef PassThruTransform OutputDecodeOpT; - - PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS]; - OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS]; - int max_levels = num_output_levels[0]; - - for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) - { - int bins = num_output_levels[channel] - 1; - LevelT scale = (upper_level[channel] - lower_level[channel]) / bins; - - privatized_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale); - - if (num_output_levels[channel] > max_levels) - max_levels = num_output_levels[channel]; - } - int max_num_output_bins = max_levels - 1; - - if (max_num_output_bins > MAX_PRIVATIZED_SMEM_BINS) - { - // Dispatch shared-privatized approach - const int PRIVATIZED_SMEM_BINS = 0; - - if (CubDebug(error = PrivatizedDispatch( - d_temp_storage, - temp_storage_bytes, - d_samples, - d_output_histograms, - num_output_levels, - privatized_decode_op, - num_output_levels, - output_decode_op, - max_num_output_bins, - num_row_pixels, - num_rows, - row_stride_samples, - DeviceHistogramInitKernel, - DeviceHistogramSweepKernel, - histogram_sweep_config, - stream, - debug_synchronous))) break; - } - else - { - // Dispatch shared-privatized approach - const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS; - - if (CubDebug(error = PrivatizedDispatch( - d_temp_storage, - temp_storage_bytes, - d_samples, - d_output_histograms, - num_output_levels, - privatized_decode_op, - num_output_levels, - output_decode_op, - max_num_output_bins, - num_row_pixels, - num_rows, - row_stride_samples, - DeviceHistogramInitKernel, - DeviceHistogramSweepKernel, - histogram_sweep_config, - stream, - debug_synchronous))) break; - } - } - while (0); - - return error; - } - - - /** - * Dispatch routine for HistogramEven, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels) - */ - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t DispatchEven( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). - CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. - int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. - LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. - LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. - OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest - OffsetT num_rows, ///< [in] The number of rows in the region of interest - OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest - cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - Int2Type is_byte_sample) ///< [in] Marker type indicating whether or not SampleT is a 8b type - { - cudaError error = cudaSuccess; - do - { - // Get PTX version - int ptx_version; - #if (CUB_PTX_ARCH == 0) - if (CubDebug(error = PtxVersion(ptx_version))) break; - #else - ptx_version = CUB_PTX_ARCH; - #endif - - // Get kernel dispatch configurations - KernelConfig histogram_sweep_config; - if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config))) - break; - - // Use the pass-thru transform op for converting samples to privatized bins - typedef PassThruTransform PrivatizedDecodeOpT; - - // Use the scale transform op for converting privatized bins to output bins - typedef ScaleTransform OutputDecodeOpT; - - int num_privatized_levels[NUM_ACTIVE_CHANNELS]; - PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS]; - OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS]; - int max_levels = num_output_levels[0]; - - for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) - { - num_privatized_levels[channel] = 257; - - int bins = num_output_levels[channel] - 1; - LevelT scale = (upper_level[channel] - lower_level[channel]) / bins; - output_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale); - - if (num_output_levels[channel] > max_levels) - max_levels = num_output_levels[channel]; - } - int max_num_output_bins = max_levels - 1; - - const int PRIVATIZED_SMEM_BINS = 256; - - if (CubDebug(error = PrivatizedDispatch( - d_temp_storage, - temp_storage_bytes, - d_samples, - d_output_histograms, - num_privatized_levels, - privatized_decode_op, - num_output_levels, - output_decode_op, - max_num_output_bins, - num_row_pixels, - num_rows, - row_stride_samples, - DeviceHistogramInitKernel, - DeviceHistogramSweepKernel, - histogram_sweep_config, - stream, - debug_synchronous))) break; - - 
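-                // Worked example of this 8-bit even-binning path (illustrative
-                // values): samples pass through to one of 256 privatized bins, and
-                // the ScaleTransform output decode then maps privatized bin b to
-                // output bin (b - lower_level) / scale. With num_output_levels = 5,
-                // lower_level = 0, and upper_level = 256, scale = (256 - 0) / 4 = 64,
-                // so privatized bin 130 lands in output bin (130 - 0) / 64 = 2,
-                // i.e. the range [128,192).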
} - while (0); - - return error; - } - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/SRC/cub/device/dispatch/dispatch_radix_sort.cuh b/SRC/cub/device/dispatch/dispatch_radix_sort.cuh deleted file mode 100644 index d1a992d4..00000000 --- a/SRC/cub/device/dispatch/dispatch_radix_sort.cuh +++ /dev/null @@ -1,1619 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory. - */ - -#pragma once - -#include -#include - -#include "../../agent/agent_radix_sort_upsweep.cuh" -#include "../../agent/agent_radix_sort_downsweep.cuh" -#include "../../agent/agent_scan.cuh" -#include "../../block/block_radix_sort.cuh" -#include "../../grid/grid_even_share.cuh" -#include "../../util_type.cuh" -#include "../../util_debug.cuh" -#include "../../util_device.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/****************************************************************************** - * Kernel entry points - *****************************************************************************/ - -/** - * Upsweep digit-counting kernel entry point (multi-block). Computes privatized digit histograms, one per block. - */ -template < - typename ChainedPolicyT, ///< Chained tuning policy - bool ALT_DIGIT_BITS, ///< Whether or not to use the alternate (lower-bits) policy - bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low - typename KeyT, ///< Key type - typename OffsetT> ///< Signed integer type for global offsets -__launch_bounds__ (int((ALT_DIGIT_BITS) ? 
-        ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS :
-        ChainedPolicyT::ActivePolicy::UpsweepPolicy::BLOCK_THREADS))
-__global__ void DeviceRadixSortUpsweepKernel(
-    const KeyT              *d_keys,        ///< [in] Input keys buffer
-    OffsetT                 *d_spine,       ///< [out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
-    OffsetT                 /*num_items*/,  ///< [in] Total number of input data items
-    int                     current_bit,    ///< [in] Bit position of current radix digit
-    int                     num_bits,       ///< [in] Number of bits of current radix digit
-    GridEvenShare<OffsetT>  even_share)     ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block
-{
-    enum {
-        TILE_ITEMS = ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS *
-                     ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::ITEMS_PER_THREAD
-    };
-
-    // Parameterize AgentRadixSortUpsweep type for the current configuration
-    typedef AgentRadixSortUpsweep<
-            typename If<(ALT_DIGIT_BITS),
-                typename ChainedPolicyT::ActivePolicy::AltUpsweepPolicy,
-                typename ChainedPolicyT::ActivePolicy::UpsweepPolicy>::Type,
-            KeyT,
-            OffsetT>
-        AgentRadixSortUpsweepT;
-
-    // Shared memory storage
-    __shared__ typename AgentRadixSortUpsweepT::TempStorage temp_storage;
-
-    // Initialize GRID_MAPPING_RAKE even-share descriptor for this thread block
-    even_share.template BlockInit<TILE_ITEMS, GRID_MAPPING_RAKE>();
-
-    AgentRadixSortUpsweepT upsweep(temp_storage, d_keys, current_bit, num_bits);
-
-    upsweep.ProcessRegion(even_share.block_offset, even_share.block_end);
-
-    CTA_SYNC();
-
-    // Write out digit counts (striped)
-    upsweep.template ExtractCounts<IS_DESCENDING>(d_spine, gridDim.x, blockIdx.x);
-}
-
-
-/**
- * Spine scan kernel entry point (single-block). Computes an exclusive prefix sum over the privatized digit histograms
- */
-template <
-    typename ChainedPolicyT,    ///< Chained tuning policy
-    typename OffsetT>           ///< Signed integer type for global offsets
-__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ScanPolicy::BLOCK_THREADS), 1)
-__global__ void RadixSortScanBinsKernel(
-    OffsetT *d_spine,       ///< [in,out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
-    int     num_counts)     ///< [in] Total number of bin-counts
-{
-    // Parameterize the AgentScan type for the current configuration
-    typedef AgentScan<
-            typename ChainedPolicyT::ActivePolicy::ScanPolicy,
-            OffsetT*,
-            OffsetT*,
-            cub::Sum,
-            OffsetT,
-            OffsetT>
-        AgentScanT;
-
-    // Shared memory storage
-    __shared__ typename AgentScanT::TempStorage temp_storage;
-
-    // Block scan instance
-    AgentScanT block_scan(temp_storage, d_spine, d_spine, cub::Sum(), OffsetT(0)) ;
-
-    // Process full input tiles
-    int block_offset = 0;
-    BlockScanRunningPrefixOp<OffsetT, Sum> prefix_op(0, Sum());
-    while (block_offset + AgentScanT::TILE_ITEMS <= num_counts)
-    {
-        block_scan.template ConsumeTile<false, false>(block_offset, prefix_op);
-        block_offset += AgentScanT::TILE_ITEMS;
-    }
-}
-
-
-/**
- * Downsweep pass kernel entry point (multi-block). Scatters keys (and values) into corresponding bins for the current digit place.
- */
-template <
-    typename ChainedPolicyT,    ///< Chained tuning policy
-    bool     ALT_DIGIT_BITS,    ///< Whether or not to use the alternate (lower-bits) policy
-    bool     IS_DESCENDING,     ///< Whether or not the sorted-order is high-to-low
-    typename KeyT,              ///< Key type
-    typename ValueT,            ///< Value type
-    typename OffsetT>           ///< Signed integer type for global offsets
-__launch_bounds__ (int((ALT_DIGIT_BITS) ?
- ChainedPolicyT::ActivePolicy::AltDownsweepPolicy::BLOCK_THREADS : - ChainedPolicyT::ActivePolicy::DownsweepPolicy::BLOCK_THREADS)) -__global__ void DeviceRadixSortDownsweepKernel( - const KeyT *d_keys_in, ///< [in] Input keys buffer - KeyT *d_keys_out, ///< [in] Output keys buffer - const ValueT *d_values_in, ///< [in] Input values buffer - ValueT *d_values_out, ///< [in] Output values buffer - OffsetT *d_spine, ///< [in] Scan of privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.) - OffsetT num_items, ///< [in] Total number of input data items - int current_bit, ///< [in] Bit position of current radix digit - int num_bits, ///< [in] Number of bits of current radix digit - GridEvenShare even_share) ///< [in] Even-share descriptor for mapan equal number of tiles onto each thread block -{ - enum { - TILE_ITEMS = ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS * - ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::ITEMS_PER_THREAD - }; - - // Parameterize AgentRadixSortDownsweep type for the current configuration - typedef AgentRadixSortDownsweep< - typename If<(ALT_DIGIT_BITS), - typename ChainedPolicyT::ActivePolicy::AltDownsweepPolicy, - typename ChainedPolicyT::ActivePolicy::DownsweepPolicy>::Type, - IS_DESCENDING, - KeyT, - ValueT, - OffsetT> - AgentRadixSortDownsweepT; - - // Shared memory storage - __shared__ typename AgentRadixSortDownsweepT::TempStorage temp_storage; - - // Initialize even-share descriptor for this thread block - even_share.template BlockInit(); - - // Process input tiles - AgentRadixSortDownsweepT(temp_storage, num_items, d_spine, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit, num_bits).ProcessRegion( - even_share.block_offset, - even_share.block_end); -} - - -/** - * Single pass kernel entry point (single-block). Fully sorts a tile of input. 
- */ -template < - typename ChainedPolicyT, ///< Chained tuning policy - bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low - typename KeyT, ///< Key type - typename ValueT, ///< Value type - typename OffsetT> ///< Signed integer type for global offsets -__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1) -__global__ void DeviceRadixSortSingleTileKernel( - const KeyT *d_keys_in, ///< [in] Input keys buffer - KeyT *d_keys_out, ///< [in] Output keys buffer - const ValueT *d_values_in, ///< [in] Input values buffer - ValueT *d_values_out, ///< [in] Output values buffer - OffsetT num_items, ///< [in] Total number of input data items - int current_bit, ///< [in] Bit position of current radix digit - int end_bit) ///< [in] The past-the-end (most-significant) bit index needed for key comparison -{ - // Constants - enum - { - BLOCK_THREADS = ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS, - ITEMS_PER_THREAD = ChainedPolicyT::ActivePolicy::SingleTilePolicy::ITEMS_PER_THREAD, - KEYS_ONLY = Equals::VALUE, - }; - - // BlockRadixSort type - typedef BlockRadixSort< - KeyT, - BLOCK_THREADS, - ITEMS_PER_THREAD, - ValueT, - ChainedPolicyT::ActivePolicy::SingleTilePolicy::RADIX_BITS, - (ChainedPolicyT::ActivePolicy::SingleTilePolicy::RANK_ALGORITHM == RADIX_RANK_MEMOIZE), - ChainedPolicyT::ActivePolicy::SingleTilePolicy::SCAN_ALGORITHM> - BlockRadixSortT; - - // BlockLoad type (keys) - typedef BlockLoad< - KeyT, - BLOCK_THREADS, - ITEMS_PER_THREAD, - ChainedPolicyT::ActivePolicy::SingleTilePolicy::LOAD_ALGORITHM> BlockLoadKeys; - - // BlockLoad type (values) - typedef BlockLoad< - ValueT, - BLOCK_THREADS, - ITEMS_PER_THREAD, - ChainedPolicyT::ActivePolicy::SingleTilePolicy::LOAD_ALGORITHM> BlockLoadValues; - - // Unsigned word for key bits - typedef typename Traits::UnsignedBits UnsignedBitsT; - - // Shared memory storage - __shared__ union TempStorage - { - typename BlockRadixSortT::TempStorage sort; - typename BlockLoadKeys::TempStorage load_keys; - typename BlockLoadValues::TempStorage load_values; - - } temp_storage; - - // Keys and values for the block - KeyT keys[ITEMS_PER_THREAD]; - ValueT values[ITEMS_PER_THREAD]; - - // Get default (min/max) value for out-of-bounds keys - UnsignedBitsT default_key_bits = (IS_DESCENDING) ? 
Traits::LOWEST_KEY : Traits::MAX_KEY; - KeyT default_key = reinterpret_cast(default_key_bits); - - // Load keys - BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in, keys, num_items, default_key); - - CTA_SYNC(); - - // Load values - if (!KEYS_ONLY) - { - // Register pressure work-around: moving num_items through shfl prevents compiler - // from reusing guards/addressing from prior guarded loads - num_items = ShuffleIndex(num_items, 0, 0xffffffff); - - BlockLoadValues(temp_storage.load_values).Load(d_values_in, values, num_items); - - CTA_SYNC(); - } - - // Sort tile - BlockRadixSortT(temp_storage.sort).SortBlockedToStriped( - keys, - values, - current_bit, - end_bit, - Int2Type(), - Int2Type()); - - // Store keys and values - #pragma unroll - for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) - { - int item_offset = ITEM * BLOCK_THREADS + threadIdx.x; - if (item_offset < num_items) - { - d_keys_out[item_offset] = keys[ITEM]; - if (!KEYS_ONLY) - d_values_out[item_offset] = values[ITEM]; - } - } -} - - -/** - * Segmented radix sorting pass (one block per segment) - */ -template < - typename ChainedPolicyT, ///< Chained tuning policy - bool ALT_DIGIT_BITS, ///< Whether or not to use the alternate (lower-bits) policy - bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low - typename KeyT, ///< Key type - typename ValueT, ///< Value type - typename OffsetIteratorT, ///< Random-access input iterator type for reading segment offsets \iterator - typename OffsetT> ///< Signed integer type for global offsets -__launch_bounds__ (int((ALT_DIGIT_BITS) ? - ChainedPolicyT::ActivePolicy::AltSegmentedPolicy::BLOCK_THREADS : - ChainedPolicyT::ActivePolicy::SegmentedPolicy::BLOCK_THREADS)) -__global__ void DeviceSegmentedRadixSortKernel( - const KeyT *d_keys_in, ///< [in] Input keys buffer - KeyT *d_keys_out, ///< [in] Output keys buffer - const ValueT *d_values_in, ///< [in] Input values buffer - ValueT *d_values_out, ///< [in] Output values buffer - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. 
- int /*num_segments*/, ///< [in] The number of segments that comprise the sorting data - int current_bit, ///< [in] Bit position of current radix digit - int pass_bits) ///< [in] Number of bits of current radix digit -{ - // - // Constants - // - - typedef typename If<(ALT_DIGIT_BITS), - typename ChainedPolicyT::ActivePolicy::AltSegmentedPolicy, - typename ChainedPolicyT::ActivePolicy::SegmentedPolicy>::Type SegmentedPolicyT; - - enum - { - BLOCK_THREADS = SegmentedPolicyT::BLOCK_THREADS, - ITEMS_PER_THREAD = SegmentedPolicyT::ITEMS_PER_THREAD, - RADIX_BITS = SegmentedPolicyT::RADIX_BITS, - TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - RADIX_DIGITS = 1 << RADIX_BITS, - KEYS_ONLY = Equals::VALUE, - }; - - // Upsweep type - typedef AgentRadixSortUpsweep< - AgentRadixSortUpsweepPolicy, - KeyT, - OffsetT> - BlockUpsweepT; - - // Digit-scan type - typedef BlockScan DigitScanT; - - // Downsweep type - typedef AgentRadixSortDownsweep BlockDownsweepT; - - enum - { - /// Number of bin-starting offsets tracked per thread - BINS_TRACKED_PER_THREAD = BlockDownsweepT::BINS_TRACKED_PER_THREAD - }; - - // - // Process input tiles - // - - // Shared memory storage - __shared__ union - { - typename BlockUpsweepT::TempStorage upsweep; - typename BlockDownsweepT::TempStorage downsweep; - struct - { - volatile OffsetT reverse_counts_in[RADIX_DIGITS]; - volatile OffsetT reverse_counts_out[RADIX_DIGITS]; - typename DigitScanT::TempStorage scan; - }; - - } temp_storage; - - OffsetT segment_begin = d_begin_offsets[blockIdx.x]; - OffsetT segment_end = d_end_offsets[blockIdx.x]; - OffsetT num_items = segment_end - segment_begin; - - // Check if empty segment - if (num_items <= 0) - return; - - // Upsweep - BlockUpsweepT upsweep(temp_storage.upsweep, d_keys_in, current_bit, pass_bits); - upsweep.ProcessRegion(segment_begin, segment_end); - - CTA_SYNC(); - - // The count of each digit value in this pass (valid in the first RADIX_DIGITS threads) - OffsetT bin_count[BINS_TRACKED_PER_THREAD]; - upsweep.ExtractCounts(bin_count); - - CTA_SYNC(); - - if (IS_DESCENDING) - { - // Reverse bin counts - #pragma unroll - for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) - { - int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; - - if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) - temp_storage.reverse_counts_in[bin_idx] = bin_count[track]; - } - - CTA_SYNC(); - - #pragma unroll - for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) - { - int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; - - if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) - bin_count[track] = temp_storage.reverse_counts_in[RADIX_DIGITS - bin_idx - 1]; - } - } - - // Scan - OffsetT bin_offset[BINS_TRACKED_PER_THREAD]; // The global scatter base offset for each digit value in this pass (valid in the first RADIX_DIGITS threads) - DigitScanT(temp_storage.scan).ExclusiveSum(bin_count, bin_offset); - - #pragma unroll - for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) - { - bin_offset[track] += segment_begin; - } - - if (IS_DESCENDING) - { - // Reverse bin offsets - #pragma unroll - for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) - { - int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; - - if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) - temp_storage.reverse_counts_out[threadIdx.x] = bin_offset[track]; - } - - CTA_SYNC(); - - #pragma unroll - for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) - { - int bin_idx = 
(threadIdx.x * BINS_TRACKED_PER_THREAD) + track; - - if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) - bin_offset[track] = temp_storage.reverse_counts_out[RADIX_DIGITS - bin_idx - 1]; - } - } - - CTA_SYNC(); - - // Downsweep - BlockDownsweepT downsweep(temp_storage.downsweep, bin_offset, num_items, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit, pass_bits); - downsweep.ProcessRegion(segment_begin, segment_end); -} - - - -/****************************************************************************** - * Policy - ******************************************************************************/ - -/** - * Tuning policy for kernel specialization - */ -template < - typename KeyT, ///< Key type - typename ValueT, ///< Value type - typename OffsetT> ///< Signed integer type for global offsets -struct DeviceRadixSortPolicy -{ - //------------------------------------------------------------------------------ - // Constants - //------------------------------------------------------------------------------ - - enum - { - // Whether this is a keys-only (or key-value) sort - KEYS_ONLY = (Equals::VALUE), - }; - - // Dominant-sized key/value type - typedef typename If<(sizeof(ValueT) > 4) && (sizeof(KeyT) < sizeof(ValueT)), ValueT, KeyT>::Type DominantT; - - //------------------------------------------------------------------------------ - // Architecture-specific tuning policies - //------------------------------------------------------------------------------ - - /// SM20 - struct Policy200 : ChainedPolicy<200, Policy200, Policy200> - { - enum { - PRIMARY_RADIX_BITS = 5, - ALT_RADIX_BITS = PRIMARY_RADIX_BITS - 1, - - // Relative size of KeyT type to a 4-byte word - SCALE_FACTOR_4B = (CUB_MAX(sizeof(KeyT), sizeof(ValueT)) + 3) / 4, - }; - - // Keys-only upsweep policies - typedef AgentRadixSortUpsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyKeys; - typedef AgentRadixSortUpsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyKeys; - - // Key-value pairs upsweep policies - typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyPairs; - typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyPairs; - - // Upsweep policies - typedef typename If::Type UpsweepPolicy; - typedef typename If::Type AltUpsweepPolicy; - - // Scan policy - typedef AgentScanPolicy <512, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; - - // Keys-only downsweep policies - typedef AgentRadixSortDownsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyKeys; - typedef AgentRadixSortDownsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyKeys; - - // Key-value pairs downsweep policies - typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyPairs; - typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyPairs; - - // Downsweep 
policies - typedef typename If::Type DownsweepPolicy; - typedef typename If::Type AltDownsweepPolicy; - - // Single-tile policy - typedef DownsweepPolicy SingleTilePolicy; - - // Segmented policies - typedef DownsweepPolicy SegmentedPolicy; - typedef AltDownsweepPolicy AltSegmentedPolicy; - }; - - /// SM30 - struct Policy300 : ChainedPolicy<300, Policy300, Policy200> - { - enum { - PRIMARY_RADIX_BITS = 5, - ALT_RADIX_BITS = PRIMARY_RADIX_BITS - 1, - - // Relative size of KeyT type to a 4-byte word - SCALE_FACTOR_4B = (CUB_MAX(sizeof(KeyT), sizeof(ValueT)) + 3) / 4, - }; - - // Keys-only upsweep policies - typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 7 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyKeys; - typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 7 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyKeys; - - // Key-value pairs upsweep policies - typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 5 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyPairs; - typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 5 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyPairs; - - // Upsweep policies - typedef typename If::Type UpsweepPolicy; - typedef typename If::Type AltUpsweepPolicy; - - // Scan policy - typedef AgentScanPolicy <1024, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_WARP_SCANS> ScanPolicy; - - // Keys-only downsweep policies - typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 14 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyKeys; - typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 14 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyKeys; - - // Key-value pairs downsweep policies - typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 10 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyPairs; - typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 10 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyPairs; - - // Downsweep policies - typedef typename If::Type DownsweepPolicy; - typedef typename If::Type AltDownsweepPolicy; - - // Single-tile policy - typedef DownsweepPolicy SingleTilePolicy; - - // Segmented policies - typedef DownsweepPolicy SegmentedPolicy; - typedef AltDownsweepPolicy AltSegmentedPolicy; - }; - - - /// SM35 - struct Policy350 : ChainedPolicy<350, Policy350, Policy300> - { - enum { - PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 
6 : 5, // 1.72B 32b keys/s, 1.17B 32b pairs/s, 1.55B 32b segmented keys/s (K40m) - }; - - // Scan policy - typedef AgentScanPolicy <1024, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_WARP_SCANS> ScanPolicy; - - // Keys-only downsweep policies - typedef AgentRadixSortDownsweepPolicy DownsweepPolicyKeys; - typedef AgentRadixSortDownsweepPolicy AltDownsweepPolicyKeys; - - // Key-value pairs downsweep policies - typedef DownsweepPolicyKeys DownsweepPolicyPairs; - typedef AgentRadixSortDownsweepPolicy AltDownsweepPolicyPairs; - - // Downsweep policies - typedef typename If::Type DownsweepPolicy; - typedef typename If::Type AltDownsweepPolicy; - - // Upsweep policies - typedef DownsweepPolicy UpsweepPolicy; - typedef AltDownsweepPolicy AltUpsweepPolicy; - - // Single-tile policy - typedef DownsweepPolicy SingleTilePolicy; - - // Segmented policies - typedef DownsweepPolicy SegmentedPolicy; - typedef AltDownsweepPolicy AltSegmentedPolicy; - - - }; - - - /// SM50 - struct Policy500 : ChainedPolicy<500, Policy500, Policy350> - { - enum { - PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 3.5B 32b keys/s, 1.92B 32b pairs/s (TitanX) - SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, - SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, // 3.1B 32b segmented keys/s (TitanX) - }; - - // ScanPolicy - typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; - - // Downsweep policies - typedef AgentRadixSortDownsweepPolicy DownsweepPolicy; - typedef AgentRadixSortDownsweepPolicy AltDownsweepPolicy; - - // Upsweep policies - typedef DownsweepPolicy UpsweepPolicy; - typedef AltDownsweepPolicy AltUpsweepPolicy; - - // Single-tile policy - typedef AgentRadixSortDownsweepPolicy SingleTilePolicy; - - // Segmented policies - typedef AgentRadixSortDownsweepPolicy SegmentedPolicy; - typedef AgentRadixSortDownsweepPolicy AltSegmentedPolicy; - }; - - - /// SM60 (GP100) - struct Policy600 : ChainedPolicy<600, Policy600, Policy500> - { - enum { - PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 6.9B 32b keys/s (Quadro P100) - SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, - SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, // 5.9B 32b segmented keys/s (Quadro P100) - }; - - // ScanPolicy - typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; - - // Downsweep policies - typedef AgentRadixSortDownsweepPolicy DownsweepPolicy; - typedef AgentRadixSortDownsweepPolicy AltDownsweepPolicy; - - // Upsweep policies - typedef DownsweepPolicy UpsweepPolicy; - typedef AltDownsweepPolicy AltUpsweepPolicy; - - // Single-tile policy - typedef AgentRadixSortDownsweepPolicy SingleTilePolicy; - - // Segmented policies - typedef AgentRadixSortDownsweepPolicy SegmentedPolicy; - typedef AgentRadixSortDownsweepPolicy AltSegmentedPolicy; - - }; - - - /// SM61 (GP104) - struct Policy610 : ChainedPolicy<610, Policy610, Policy600> - { - enum { - PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 3.4B 32b keys/s, 1.83B 32b pairs/s (1080) - SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, - SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 
6 : 5, // 3.3B 32b segmented keys/s (1080) - }; - - // ScanPolicy - typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; - - // Downsweep policies - typedef AgentRadixSortDownsweepPolicy DownsweepPolicy; - typedef AgentRadixSortDownsweepPolicy AltDownsweepPolicy; - - // Upsweep policies - typedef AgentRadixSortUpsweepPolicy UpsweepPolicy; - typedef AgentRadixSortUpsweepPolicy AltUpsweepPolicy; - - // Single-tile policy - typedef AgentRadixSortDownsweepPolicy SingleTilePolicy; - - // Segmented policies - typedef AgentRadixSortDownsweepPolicy SegmentedPolicy; - typedef AgentRadixSortDownsweepPolicy AltSegmentedPolicy; - }; - - - /// SM62 (Tegra, less RF) - struct Policy620 : ChainedPolicy<620, Policy620, Policy610> - { - enum { - PRIMARY_RADIX_BITS = 5, - ALT_RADIX_BITS = PRIMARY_RADIX_BITS - 1, - }; - - // ScanPolicy - typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; - - // Downsweep policies - typedef AgentRadixSortDownsweepPolicy DownsweepPolicy; - typedef AgentRadixSortDownsweepPolicy AltDownsweepPolicy; - - // Upsweep policies - typedef DownsweepPolicy UpsweepPolicy; - typedef AltDownsweepPolicy AltUpsweepPolicy; - - // Single-tile policy - typedef AgentRadixSortDownsweepPolicy SingleTilePolicy; - - // Segmented policies - typedef DownsweepPolicy SegmentedPolicy; - typedef AltDownsweepPolicy AltSegmentedPolicy; - }; - - - /// SM70 (GV100) - struct Policy700 : ChainedPolicy<700, Policy700, Policy620> - { - enum { - PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 7.62B 32b keys/s (GV100) - SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, - SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 
6 : 5, // 8.7B 32b segmented keys/s (GV100) - }; - - // ScanPolicy - typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; - - // Downsweep policies - typedef AgentRadixSortDownsweepPolicy DownsweepPolicy; - typedef AgentRadixSortDownsweepPolicy AltDownsweepPolicy; - - // Upsweep policies - typedef DownsweepPolicy UpsweepPolicy; - typedef AltDownsweepPolicy AltUpsweepPolicy; - - // Single-tile policy - typedef AgentRadixSortDownsweepPolicy SingleTilePolicy; - - // Segmented policies - typedef AgentRadixSortDownsweepPolicy SegmentedPolicy; - typedef AgentRadixSortDownsweepPolicy AltSegmentedPolicy; - }; - - - /// MaxPolicy - typedef Policy700 MaxPolicy; - - -}; - - - -/****************************************************************************** - * Single-problem dispatch - ******************************************************************************/ - -/** - * Utility class for dispatching the appropriately-tuned kernels for device-wide radix sort - */ -template < - bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low - typename KeyT, ///< Key type - typename ValueT, ///< Value type - typename OffsetT> ///< Signed integer type for global offsets -struct DispatchRadixSort : - DeviceRadixSortPolicy -{ - //------------------------------------------------------------------------------ - // Constants - //------------------------------------------------------------------------------ - - enum - { - // Whether this is a keys-only (or key-value) sort - KEYS_ONLY = (Equals::VALUE), - }; - - - //------------------------------------------------------------------------------ - // Problem state - //------------------------------------------------------------------------------ - - void *d_temp_storage; ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - DoubleBuffer &d_keys; ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys - DoubleBuffer &d_values; ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values - OffsetT num_items; ///< [in] Number of items to sort - int begin_bit; ///< [in] The beginning (least-significant) bit index needed for key comparison - int end_bit; ///< [in] The past-the-end (most-significant) bit index needed for key comparison - cudaStream_t stream; ///< [in] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous; ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
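The problem-state members here encode CUB's standard two-phase calling convention: every Dispatch entry point is called once with d_temp_storage == NULL to obtain temp_storage_bytes, and once more to do the work. A minimal host-side sketch of that idiom, assuming the stock cub::DeviceRadixSort front-end over this dispatcher (error checks omitted):

    #include <cub/cub.cuh>

    void sort_keys_u32(unsigned int *d_buf, unsigned int *d_alt, int num_items)
    {
        cub::DoubleBuffer<unsigned int> d_keys(d_buf, d_alt);

        void   *d_temp_storage     = NULL;
        size_t  temp_storage_bytes = 0;

        // Phase 1: d_temp_storage == NULL, so only the required size is
        // written to temp_storage_bytes; no sorting is done.
        cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes,
                                       d_keys, num_items);
        cudaMalloc(&d_temp_storage, temp_storage_bytes);

        // Phase 2: the actual sort. The DoubleBuffer overload corresponds to
        // is_overwrite_okay == true; afterwards d_keys.Current() (tracked by
        // the selector updates in InvokePasses) points at the sorted keys.
        cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes,
                                       d_keys, num_items);
        cudaFree(d_temp_storage);
    }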
- int ptx_version; ///< [in] PTX version - bool is_overwrite_okay; ///< [in] Whether is okay to overwrite source buffers - - - //------------------------------------------------------------------------------ - // Constructor - //------------------------------------------------------------------------------ - - /// Constructor - CUB_RUNTIME_FUNCTION __forceinline__ - DispatchRadixSort( - void* d_temp_storage, - size_t &temp_storage_bytes, - DoubleBuffer &d_keys, - DoubleBuffer &d_values, - OffsetT num_items, - int begin_bit, - int end_bit, - bool is_overwrite_okay, - cudaStream_t stream, - bool debug_synchronous, - int ptx_version) - : - d_temp_storage(d_temp_storage), - temp_storage_bytes(temp_storage_bytes), - d_keys(d_keys), - d_values(d_values), - num_items(num_items), - begin_bit(begin_bit), - end_bit(end_bit), - stream(stream), - debug_synchronous(debug_synchronous), - ptx_version(ptx_version), - is_overwrite_okay(is_overwrite_okay) - {} - - - //------------------------------------------------------------------------------ - // Small-problem (single tile) invocation - //------------------------------------------------------------------------------ - - /// Invoke a single block to sort in-core - template < - typename ActivePolicyT, ///< Umbrella policy active for the target device - typename SingleTileKernelT> ///< Function type of cub::DeviceRadixSortSingleTileKernel - CUB_RUNTIME_FUNCTION __forceinline__ - cudaError_t InvokeSingleTile( - SingleTileKernelT single_tile_kernel) ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortSingleTileKernel - { -#ifndef CUB_RUNTIME_ENABLED - (void)single_tile_kernel; - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported ); -#else - cudaError error = cudaSuccess; - do - { - // Return if the caller is simply requesting the size of the storage allocation - if (d_temp_storage == NULL) - { - temp_storage_bytes = 1; - break; - } - - // Return if empty problem - if (num_items == 0) - break; - - // Log single_tile_kernel configuration - if (debug_synchronous) - _CubLog("Invoking single_tile_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n", - 1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, (long long) stream, - ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD, 1, begin_bit, ActivePolicyT::SingleTilePolicy::RADIX_BITS); - - // Invoke upsweep_kernel with same grid size as downsweep_kernel - single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>( - d_keys.Current(), - d_keys.Alternate(), - d_values.Current(), - d_values.Alternate(), - num_items, - begin_bit, - end_bit); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - - // Update selector - d_keys.selector ^= 1; - d_values.selector ^= 1; - } - while (0); - - return error; - -#endif // CUB_RUNTIME_ENABLED - } - - - //------------------------------------------------------------------------------ - // Normal problem size invocation - //------------------------------------------------------------------------------ - - /** - * Invoke a three-kernel sorting pass at the current bit. 
-     */
-    template <typename PassConfigT>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    cudaError_t InvokePass(
-        const KeyT      *d_keys_in,
-        KeyT            *d_keys_out,
-        const ValueT    *d_values_in,
-        ValueT          *d_values_out,
-        OffsetT         *d_spine,
-        int             spine_length,
-        int             &current_bit,
-        PassConfigT     &pass_config)
-    {
-        cudaError error = cudaSuccess;
-        do
-        {
-            int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit));
-
-            // Log upsweep_kernel configuration
-            if (debug_synchronous)
-                _CubLog("Invoking upsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",
-                pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, (long long) stream,
-                pass_config.upsweep_config.items_per_thread, pass_config.upsweep_config.sm_occupancy, current_bit, pass_bits);
-
-            // Invoke upsweep_kernel with same grid size as downsweep_kernel
-            pass_config.upsweep_kernel<<<pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, 0, stream>>>(
-                d_keys_in,
-                d_spine,
-                num_items,
-                current_bit,
-                pass_bits,
-                pass_config.even_share);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-
-            // Log scan_kernel configuration
-            if (debug_synchronous) _CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n",
-                1, pass_config.scan_config.block_threads, (long long) stream, pass_config.scan_config.items_per_thread);
-
-            // Invoke scan_kernel
-            pass_config.scan_kernel<<<1, pass_config.scan_config.block_threads, 0, stream>>>(
-                d_spine,
-                spine_length);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-
-            // Log downsweep_kernel configuration
-            if (debug_synchronous) _CubLog("Invoking downsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
-                pass_config.even_share.grid_size, pass_config.downsweep_config.block_threads, (long long) stream,
-                pass_config.downsweep_config.items_per_thread, pass_config.downsweep_config.sm_occupancy);
-
-            // Invoke downsweep_kernel
-            pass_config.downsweep_kernel<<<pass_config.even_share.grid_size, pass_config.downsweep_config.block_threads, 0, stream>>>(
-                d_keys_in,
-                d_keys_out,
-                d_values_in,
-                d_values_out,
-                d_spine,
-                num_items,
-                current_bit,
-                pass_bits,
-                pass_config.even_share);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-
-            // Update current bit
-            current_bit += pass_bits;
-        }
-        while (0);
-
-        return error;
-    }
-
-
-
-    /// Pass configuration structure
-    template <
-        typename UpsweepKernelT,
-        typename ScanKernelT,
-        typename DownsweepKernelT>
-    struct PassConfig
-    {
-        UpsweepKernelT          upsweep_kernel;
-        KernelConfig            upsweep_config;
-        ScanKernelT             scan_kernel;
-        KernelConfig            scan_config;
-        DownsweepKernelT        downsweep_kernel;
-        KernelConfig            downsweep_config;
-        int                     radix_bits;
-        int                     radix_digits;
-        int                     max_downsweep_grid_size;
-        GridEvenShare<OffsetT>  even_share;
-
-        /// Initialize pass configuration
-        template <
-            typename UpsweepPolicyT,
-            typename ScanPolicyT,
-            typename DownsweepPolicyT>
-        CUB_RUNTIME_FUNCTION __forceinline__
-        cudaError_t InitPassConfig(
-            UpsweepKernelT      upsweep_kernel,
-            ScanKernelT         scan_kernel,
-            DownsweepKernelT    downsweep_kernel,
-            int                 ptx_version,
-            int                 sm_count,
-            int                 num_items)
-        {
-            cudaError error = cudaSuccess;
-            do
-            {
-
this->upsweep_kernel = upsweep_kernel; - this->scan_kernel = scan_kernel; - this->downsweep_kernel = downsweep_kernel; - radix_bits = DownsweepPolicyT::RADIX_BITS; - radix_digits = 1 << radix_bits; - - if (CubDebug(error = upsweep_config.Init(upsweep_kernel))) break; - if (CubDebug(error = scan_config.Init(scan_kernel))) break; - if (CubDebug(error = downsweep_config.Init(downsweep_kernel))) break; - - max_downsweep_grid_size = (downsweep_config.sm_occupancy * sm_count) * CUB_SUBSCRIPTION_FACTOR(ptx_version); - - even_share.DispatchInit( - num_items, - max_downsweep_grid_size, - CUB_MAX(downsweep_config.tile_size, upsweep_config.tile_size)); - - } - while (0); - return error; - } - - }; - - - /// Invocation (run multiple digit passes) - template < - typename ActivePolicyT, ///< Umbrella policy active for the target device - typename UpsweepKernelT, ///< Function type of cub::DeviceRadixSortUpsweepKernel - typename ScanKernelT, ///< Function type of cub::SpineScanKernel - typename DownsweepKernelT> ///< Function type of cub::DeviceRadixSortDownsweepKernel - CUB_RUNTIME_FUNCTION __forceinline__ - cudaError_t InvokePasses( - UpsweepKernelT upsweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel - UpsweepKernelT alt_upsweep_kernel, ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel - ScanKernelT scan_kernel, ///< [in] Kernel function pointer to parameterization of cub::SpineScanKernel - DownsweepKernelT downsweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortDownsweepKernel - DownsweepKernelT alt_downsweep_kernel) ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceRadixSortDownsweepKernel - { -#ifndef CUB_RUNTIME_ENABLED - (void)upsweep_kernel; - (void)alt_upsweep_kernel; - (void)scan_kernel; - (void)downsweep_kernel; - (void)alt_downsweep_kernel; - - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported ); -#else - - cudaError error = cudaSuccess; - do - { - // Get device ordinal - int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; - - // Get SM count - int sm_count; - if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; - - // Init regular and alternate-digit kernel configurations - PassConfig pass_config, alt_pass_config; - if ((error = pass_config.template InitPassConfig< - typename ActivePolicyT::UpsweepPolicy, - typename ActivePolicyT::ScanPolicy, - typename ActivePolicyT::DownsweepPolicy>( - upsweep_kernel, scan_kernel, downsweep_kernel, ptx_version, sm_count, num_items))) break; - - if ((error = alt_pass_config.template InitPassConfig< - typename ActivePolicyT::AltUpsweepPolicy, - typename ActivePolicyT::ScanPolicy, - typename ActivePolicyT::AltDownsweepPolicy>( - alt_upsweep_kernel, scan_kernel, alt_downsweep_kernel, ptx_version, sm_count, num_items))) break; - - // Get maximum spine length - int max_grid_size = CUB_MAX(pass_config.max_downsweep_grid_size, alt_pass_config.max_downsweep_grid_size); - int spine_length = (max_grid_size * pass_config.radix_digits) + pass_config.scan_config.tile_size; - - // Temporary storage allocation requirements - void* allocations[3]; - size_t allocation_sizes[3] = - { - spine_length * sizeof(OffsetT), // bytes needed for privatized block digit histograms - (is_overwrite_okay) ? 
0 : num_items * sizeof(KeyT), // bytes needed for 3rd keys buffer - (is_overwrite_okay || (KEYS_ONLY)) ? 0 : num_items * sizeof(ValueT), // bytes needed for 3rd values buffer - }; - - // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) - if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; - - // Return if the caller is simply requesting the size of the storage allocation - if (d_temp_storage == NULL) - return cudaSuccess; - - // Pass planning. Run passes of the alternate digit-size configuration until we have an even multiple of our preferred digit size - int num_bits = end_bit - begin_bit; - int num_passes = (num_bits + pass_config.radix_bits - 1) / pass_config.radix_bits; - bool is_num_passes_odd = num_passes & 1; - int max_alt_passes = (num_passes * pass_config.radix_bits) - num_bits; - int alt_end_bit = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_pass_config.radix_bits)); - - // Alias the temporary storage allocations - OffsetT *d_spine = static_cast(allocations[0]); - - DoubleBuffer d_keys_remaining_passes( - (is_overwrite_okay || is_num_passes_odd) ? d_keys.Alternate() : static_cast(allocations[1]), - (is_overwrite_okay) ? d_keys.Current() : (is_num_passes_odd) ? static_cast(allocations[1]) : d_keys.Alternate()); - - DoubleBuffer d_values_remaining_passes( - (is_overwrite_okay || is_num_passes_odd) ? d_values.Alternate() : static_cast(allocations[2]), - (is_overwrite_okay) ? d_values.Current() : (is_num_passes_odd) ? static_cast(allocations[2]) : d_values.Alternate()); - - // Run first pass, consuming from the input's current buffers - int current_bit = begin_bit; - if (CubDebug(error = InvokePass( - d_keys.Current(), d_keys_remaining_passes.Current(), - d_values.Current(), d_values_remaining_passes.Current(), - d_spine, spine_length, current_bit, - (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break; - - // Run remaining passes - while (current_bit < end_bit) - { - if (CubDebug(error = InvokePass( - d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1], - d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1], - d_spine, spine_length, current_bit, - (current_bit < alt_end_bit) ? 
alt_pass_config : pass_config))) break;; - - // Invert selectors - d_keys_remaining_passes.selector ^= 1; - d_values_remaining_passes.selector ^= 1; - } - - // Update selector - if (!is_overwrite_okay) { - num_passes = 1; // Sorted data always ends up in the other vector - } - - d_keys.selector = (d_keys.selector + num_passes) & 1; - d_values.selector = (d_values.selector + num_passes) & 1; - } - while (0); - - return error; - -#endif // CUB_RUNTIME_ENABLED - } - - - //------------------------------------------------------------------------------ - // Chained policy invocation - //------------------------------------------------------------------------------ - - /// Invocation - template - CUB_RUNTIME_FUNCTION __forceinline__ - cudaError_t Invoke() - { - typedef typename DispatchRadixSort::MaxPolicy MaxPolicyT; - typedef typename ActivePolicyT::SingleTilePolicy SingleTilePolicyT; - - // Force kernel code-generation in all compiler passes - if (num_items <= (SingleTilePolicyT::BLOCK_THREADS * SingleTilePolicyT::ITEMS_PER_THREAD)) - { - // Small, single tile size - return InvokeSingleTile( - DeviceRadixSortSingleTileKernel); - } - else - { - // Regular size - return InvokePasses( - DeviceRadixSortUpsweepKernel< MaxPolicyT, false, IS_DESCENDING, KeyT, OffsetT>, - DeviceRadixSortUpsweepKernel< MaxPolicyT, true, IS_DESCENDING, KeyT, OffsetT>, - RadixSortScanBinsKernel< MaxPolicyT, OffsetT>, - DeviceRadixSortDownsweepKernel< MaxPolicyT, false, IS_DESCENDING, KeyT, ValueT, OffsetT>, - DeviceRadixSortDownsweepKernel< MaxPolicyT, true, IS_DESCENDING, KeyT, ValueT, OffsetT>); - } - } - - - //------------------------------------------------------------------------------ - // Dispatch entrypoints - //------------------------------------------------------------------------------ - - /** - * Internal dispatch routine - */ - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - DoubleBuffer &d_keys, ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys - DoubleBuffer &d_values, ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values - OffsetT num_items, ///< [in] Number of items to sort - int begin_bit, ///< [in] The beginning (least-significant) bit index needed for key comparison - int end_bit, ///< [in] The past-the-end (most-significant) bit index needed for key comparison - bool is_overwrite_okay, ///< [in] Whether is okay to overwrite source buffers - cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
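The pass-planning arithmetic in InvokePasses above is easiest to see with concrete numbers. A worked sketch for a full 32-bit key under a Policy500-style tuning (PRIMARY_RADIX_BITS = 7 for multi-byte keys, with the alternate configuration one bit narrower, following the ALT_RADIX_BITS = PRIMARY_RADIX_BITS - 1 convention of the earlier policies):

    int num_bits       = 32 - 0;            // end_bit - begin_bit       = 32
    int num_passes     = (32 + 7 - 1) / 7;  // ceil(num_bits/radix_bits) = 5
    int max_alt_passes = (5 * 7) - 32;      // digit-size slack          = 3
    int alt_end_bit    = 0 + (3 * 6);       // min'd with end_bit        = 18

    // The five passes therefore consume 6 + 6 + 6 + 7 + 7 = 32 bits: three
    // alternate (6-bit) passes run while current_bit < 18, then two primary
    // (7-bit) passes finish the key, with no wasted partial pass.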
- { - typedef typename DispatchRadixSort::MaxPolicy MaxPolicyT; - - cudaError_t error; - do { - // Get PTX version - int ptx_version; - if (CubDebug(error = PtxVersion(ptx_version))) break; - - // Create dispatch functor - DispatchRadixSort dispatch( - d_temp_storage, temp_storage_bytes, - d_keys, d_values, - num_items, begin_bit, end_bit, is_overwrite_okay, - stream, debug_synchronous, ptx_version); - - // Dispatch to chained policy - if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break; - - } while (0); - - return error; - } -}; - - - - -/****************************************************************************** - * Segmented dispatch - ******************************************************************************/ - -/** - * Utility class for dispatching the appropriately-tuned kernels for segmented device-wide radix sort - */ -template < - bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low - typename KeyT, ///< Key type - typename ValueT, ///< Value type - typename OffsetIteratorT, ///< Random-access input iterator type for reading segment offsets \iterator - typename OffsetT> ///< Signed integer type for global offsets -struct DispatchSegmentedRadixSort : - DeviceRadixSortPolicy -{ - //------------------------------------------------------------------------------ - // Constants - //------------------------------------------------------------------------------ - - enum - { - // Whether this is a keys-only (or key-value) sort - KEYS_ONLY = (Equals::VALUE), - }; - - - //------------------------------------------------------------------------------ - // Parameter members - //------------------------------------------------------------------------------ - - void *d_temp_storage; ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - DoubleBuffer &d_keys; ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys - DoubleBuffer &d_values; ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values - OffsetT num_items; ///< [in] Number of items to sort - OffsetT num_segments; ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets; ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets; ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - int begin_bit; ///< [in] The beginning (least-significant) bit index needed for key comparison - int end_bit; ///< [in] The past-the-end (most-significant) bit index needed for key comparison - cudaStream_t stream; ///< [in] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous; ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
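Note that the begin/end offset convention documented above lets one CSR-style array of num_segments + 1 offsets describe every segment, with an empty segment expressed as equal begin/end values. A minimal sketch, assuming the stock cub::DeviceSegmentedRadixSort front-end (allocation and the temp-storage size query as in the unsegmented example):

    void sort_3_segments(unsigned int *d_keys_in, unsigned int *d_keys_out,
                         int *d_offsets,  // device copy of {0, 3, 3, 7}
                         void *d_temp_storage, size_t temp_storage_bytes)
    {
        // 3 segments over 7 keys; segment 1 ([3,3)) is deliberately empty.
        cub::DeviceSegmentedRadixSort::SortKeys(
            d_temp_storage, temp_storage_bytes,
            d_keys_in, d_keys_out,
            7,                  // num_items
            3,                  // num_segments
            d_offsets,          // d_begin_offsets
            d_offsets + 1);     // d_end_offsets
    }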
-    int                     ptx_version;        ///< [in] PTX version
-    bool                    is_overwrite_okay;  ///< [in] Whether is okay to overwrite source buffers
-
-
-    //------------------------------------------------------------------------------
-    // Constructors
-    //------------------------------------------------------------------------------
-
-    /// Constructor
-    CUB_RUNTIME_FUNCTION __forceinline__
-    DispatchSegmentedRadixSort(
-        void*                   d_temp_storage,
-        size_t                  &temp_storage_bytes,
-        DoubleBuffer<KeyT>      &d_keys,
-        DoubleBuffer<ValueT>    &d_values,
-        OffsetT                 num_items,
-        OffsetT                 num_segments,
-        OffsetIteratorT         d_begin_offsets,
-        OffsetIteratorT         d_end_offsets,
-        int                     begin_bit,
-        int                     end_bit,
-        bool                    is_overwrite_okay,
-        cudaStream_t            stream,
-        bool                    debug_synchronous,
-        int                     ptx_version)
-    :
-        d_temp_storage(d_temp_storage),
-        temp_storage_bytes(temp_storage_bytes),
-        d_keys(d_keys),
-        d_values(d_values),
-        num_items(num_items),
-        num_segments(num_segments),
-        d_begin_offsets(d_begin_offsets),
-        d_end_offsets(d_end_offsets),
-        begin_bit(begin_bit),
-        end_bit(end_bit),
-        is_overwrite_okay(is_overwrite_okay),
-        stream(stream),
-        debug_synchronous(debug_synchronous),
-        ptx_version(ptx_version)
-    {}
-
-
-    //------------------------------------------------------------------------------
-    // Multi-segment invocation
-    //------------------------------------------------------------------------------
-
-    /// Invoke a three-kernel sorting pass at the current bit.
-    template <typename PassConfigT>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    cudaError_t InvokePass(
-        const KeyT      *d_keys_in,
-        KeyT            *d_keys_out,
-        const ValueT    *d_values_in,
-        ValueT          *d_values_out,
-        int             &current_bit,
-        PassConfigT     &pass_config)
-    {
-        cudaError error = cudaSuccess;
-        do
-        {
-            int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit));
-
-            // Log kernel configuration
-            if (debug_synchronous)
-                _CubLog("Invoking segmented_kernels<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n",
-                num_segments, pass_config.segmented_config.block_threads, (long long) stream,
-                pass_config.segmented_config.items_per_thread, pass_config.segmented_config.sm_occupancy, current_bit, pass_bits);
-
-            pass_config.segmented_kernel<<<num_segments, pass_config.segmented_config.block_threads, 0, stream>>>(
-                d_keys_in, d_keys_out,
-                d_values_in,  d_values_out,
-                d_begin_offsets, d_end_offsets, num_segments,
-                current_bit, pass_bits);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-
-            // Update current bit
-            current_bit += pass_bits;
-        }
-        while (0);
-
-        return error;
-    }
-
-
-    /// PassConfig data structure
-    template <typename SegmentedKernelT>
-    struct PassConfig
-    {
-        SegmentedKernelT    segmented_kernel;
-        KernelConfig        segmented_config;
-        int                 radix_bits;
-        int                 radix_digits;
-
-        /// Initialize pass configuration
-        template <typename SegmentedPolicyT>
-        CUB_RUNTIME_FUNCTION __forceinline__
-        cudaError_t InitPassConfig(SegmentedKernelT segmented_kernel)
-        {
-            this->segmented_kernel = segmented_kernel;
-            this->radix_bits       = SegmentedPolicyT::RADIX_BITS;
-            this->radix_digits     = 1 << radix_bits;
-
-            return CubDebug(segmented_config.Init<SegmentedPolicyT>(segmented_kernel));
-        }
-    };
-
-
-    /// Invocation (run multiple digit passes)
-    template <
-        typename ActivePolicyT,             ///< Umbrella policy active for the target device
-        typename SegmentedKernelT>          ///< Function type of cub::DeviceSegmentedRadixSortKernel
-    CUB_RUNTIME_FUNCTION __forceinline__
-    cudaError_t InvokePasses(
-        SegmentedKernelT segmented_kernel,  ///< [in] Kernel function pointer to parameterization
of cub::DeviceSegmentedRadixSortKernel - SegmentedKernelT alt_segmented_kernel) ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceSegmentedRadixSortKernel - { -#ifndef CUB_RUNTIME_ENABLED - (void)segmented_kernel; - (void)alt_segmented_kernel; - - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported ); -#else - - cudaError error = cudaSuccess; - do - { - // Init regular and alternate kernel configurations - PassConfig pass_config, alt_pass_config; - if ((error = pass_config.template InitPassConfig(segmented_kernel))) break; - if ((error = alt_pass_config.template InitPassConfig(alt_segmented_kernel))) break; - - // Temporary storage allocation requirements - void* allocations[2]; - size_t allocation_sizes[2] = - { - (is_overwrite_okay) ? 0 : num_items * sizeof(KeyT), // bytes needed for 3rd keys buffer - (is_overwrite_okay || (KEYS_ONLY)) ? 0 : num_items * sizeof(ValueT), // bytes needed for 3rd values buffer - }; - - // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) - if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; - - // Return if the caller is simply requesting the size of the storage allocation - if (d_temp_storage == NULL) - { - if (temp_storage_bytes == 0) - temp_storage_bytes = 1; - return cudaSuccess; - } - - // Pass planning. Run passes of the alternate digit-size configuration until we have an even multiple of our preferred digit size - int radix_bits = ActivePolicyT::SegmentedPolicy::RADIX_BITS; - int alt_radix_bits = ActivePolicyT::AltSegmentedPolicy::RADIX_BITS; - int num_bits = end_bit - begin_bit; - int num_passes = (num_bits + radix_bits - 1) / radix_bits; - bool is_num_passes_odd = num_passes & 1; - int max_alt_passes = (num_passes * radix_bits) - num_bits; - int alt_end_bit = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_radix_bits)); - - DoubleBuffer d_keys_remaining_passes( - (is_overwrite_okay || is_num_passes_odd) ? d_keys.Alternate() : static_cast(allocations[0]), - (is_overwrite_okay) ? d_keys.Current() : (is_num_passes_odd) ? static_cast(allocations[0]) : d_keys.Alternate()); - - DoubleBuffer d_values_remaining_passes( - (is_overwrite_okay || is_num_passes_odd) ? d_values.Alternate() : static_cast(allocations[1]), - (is_overwrite_okay) ? d_values.Current() : (is_num_passes_odd) ? static_cast(allocations[1]) : d_values.Alternate()); - - // Run first pass, consuming from the input's current buffers - int current_bit = begin_bit; - - if (CubDebug(error = InvokePass( - d_keys.Current(), d_keys_remaining_passes.Current(), - d_values.Current(), d_values_remaining_passes.Current(), - current_bit, - (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break; - - // Run remaining passes - while (current_bit < end_bit) - { - if (CubDebug(error = InvokePass( - d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1], - d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1], - current_bit, - (current_bit < alt_end_bit) ? 
alt_pass_config : pass_config))) break; - - // Invert selectors and update current bit - d_keys_remaining_passes.selector ^= 1; - d_values_remaining_passes.selector ^= 1; - } - - // Update selector - if (!is_overwrite_okay) { - num_passes = 1; // Sorted data always ends up in the other vector - } - - d_keys.selector = (d_keys.selector + num_passes) & 1; - d_values.selector = (d_values.selector + num_passes) & 1; - } - while (0); - - return error; - -#endif // CUB_RUNTIME_ENABLED - } - - - //------------------------------------------------------------------------------ - // Chained policy invocation - //------------------------------------------------------------------------------ - - /// Invocation - template - CUB_RUNTIME_FUNCTION __forceinline__ - cudaError_t Invoke() - { - typedef typename DispatchSegmentedRadixSort::MaxPolicy MaxPolicyT; - - // Force kernel code-generation in all compiler passes - return InvokePasses( - DeviceSegmentedRadixSortKernel, - DeviceSegmentedRadixSortKernel); - } - - - //------------------------------------------------------------------------------ - // Dispatch entrypoints - //------------------------------------------------------------------------------ - - - /// Internal dispatch routine - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - DoubleBuffer &d_keys, ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys - DoubleBuffer &d_values, ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values - int num_items, ///< [in] Number of items to sort - int num_segments, ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - int begin_bit, ///< [in] The beginning (least-significant) bit index needed for key comparison - int end_bit, ///< [in] The past-the-end (most-significant) bit index needed for key comparison - bool is_overwrite_okay, ///< [in] Whether is okay to overwrite source buffers - cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
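The body below only probes the PTX version and then hands the fully-populated dispatch functor to MaxPolicyT::Invoke; it is the ChainedPolicy chain that decides which architecture tuning actually runs. A minimal sketch of that selection pattern (an illustrative re-creation, not the CUB original):

    template <int PTX_THRESHOLD, typename PolicyT, typename PrevPolicyT>
    struct ChainedPolicySketch
    {
        template <typename FunctorT>
        static cudaError_t Invoke(int ptx_version, FunctorT &op)
        {
            if (ptx_version < PTX_THRESHOLD)
                return PrevPolicyT::Invoke(ptx_version, op); // walk down-chain
            return op.template Invoke<PolicyT>();            // tuning is active
        }
    };

    // The oldest policy names itself as its own predecessor; this partial
    // specialization terminates the walk unconditionally.
    template <int PTX_THRESHOLD, typename PolicyT>
    struct ChainedPolicySketch<PTX_THRESHOLD, PolicyT, PolicyT>
    {
        template <typename FunctorT>
        static cudaError_t Invoke(int /*ptx_version*/, FunctorT &op)
        {
            return op.template Invoke<PolicyT>();
        }
    };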
- { - typedef typename DispatchSegmentedRadixSort::MaxPolicy MaxPolicyT; - - cudaError_t error; - do { - // Get PTX version - int ptx_version; - if (CubDebug(error = PtxVersion(ptx_version))) break; - - // Create dispatch functor - DispatchSegmentedRadixSort dispatch( - d_temp_storage, temp_storage_bytes, - d_keys, d_values, - num_items, num_segments, d_begin_offsets, d_end_offsets, - begin_bit, end_bit, is_overwrite_okay, - stream, debug_synchronous, ptx_version); - - // Dispatch to chained policy - if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break; - - } while (0); - - return error; - } -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/SRC/cub/device/dispatch/dispatch_reduce.cuh b/SRC/cub/device/dispatch/dispatch_reduce.cuh deleted file mode 100644 index e9d1b7ac..00000000 --- a/SRC/cub/device/dispatch/dispatch_reduce.cuh +++ /dev/null @@ -1,882 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory. 
- */ - -#pragma once - -#include -#include - -#include "../../agent/agent_reduce.cuh" -#include "../../iterator/arg_index_input_iterator.cuh" -#include "../../thread/thread_operators.cuh" -#include "../../grid/grid_even_share.cuh" -#include "../../iterator/arg_index_input_iterator.cuh" -#include "../../util_debug.cuh" -#include "../../util_device.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/****************************************************************************** - * Kernel entry points - *****************************************************************************/ - -/** - * Reduce region kernel entry point (multi-block). Computes privatized reductions, one per thread block. - */ -template < - typename ChainedPolicyT, ///< Chained tuning policy - typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator - typename OutputIteratorT, ///< Output iterator type for recording the reduced aggregate \iterator - typename OffsetT, ///< Signed integer type for global offsets - typename ReductionOpT> ///< Binary reduction functor type having member T operator()(const T &a, const T &b) -__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) -__global__ void DeviceReduceKernel( - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - OffsetT num_items, ///< [in] Total number of input data items - GridEvenShare even_share, ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block - ReductionOpT reduction_op) ///< [in] Binary reduction functor -{ - // The output value type - typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... then the input iterator's value type, - typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type - - // Thread block type for reducing input tiles - typedef AgentReduce< - typename ChainedPolicyT::ActivePolicy::ReducePolicy, - InputIteratorT, - OutputIteratorT, - OffsetT, - ReductionOpT> - AgentReduceT; - - // Shared memory storage - __shared__ typename AgentReduceT::TempStorage temp_storage; - - // Consume input tiles - OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeTiles(even_share); - - // Output result - if (threadIdx.x == 0) - d_out[blockIdx.x] = block_aggregate; -} - - -/** - * Reduce a single tile kernel entry point (single-block). Can be used to aggregate privatized thread block reductions from a previous multi-block reduction pass. 
- */ -template < - typename ChainedPolicyT, ///< Chained tuning policy - typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator - typename OutputIteratorT, ///< Output iterator type for recording the reduced aggregate \iterator - typename OffsetT, ///< Signed integer type for global offsets - typename ReductionOpT, ///< Binary reduction functor type having member T operator()(const T &a, const T &b) - typename OuputT> ///< Data element type that is convertible to the \p value type of \p OutputIteratorT -__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1) -__global__ void DeviceReduceSingleTileKernel( - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - OffsetT num_items, ///< [in] Total number of input data items - ReductionOpT reduction_op, ///< [in] Binary reduction functor - OuputT init) ///< [in] The initial value of the reduction -{ - // Thread block type for reducing input tiles - typedef AgentReduce< - typename ChainedPolicyT::ActivePolicy::SingleTilePolicy, - InputIteratorT, - OutputIteratorT, - OffsetT, - ReductionOpT> - AgentReduceT; - - // Shared memory storage - __shared__ typename AgentReduceT::TempStorage temp_storage; - - // Check if empty problem - if (num_items == 0) - { - if (threadIdx.x == 0) - *d_out = init; - return; - } - - // Consume input tiles - OuputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange( - OffsetT(0), - num_items); - - // Output result - if (threadIdx.x == 0) - *d_out = reduction_op(init, block_aggregate); -} - - -/// Normalize input iterator to segment offset -template -__device__ __forceinline__ -void NormalizeReductionOutput( - T &/*val*/, - OffsetT /*base_offset*/, - IteratorT /*itr*/) -{} - - -/// Normalize input iterator to segment offset (specialized for arg-index) -template -__device__ __forceinline__ -void NormalizeReductionOutput( - KeyValuePairT &val, - OffsetT base_offset, - ArgIndexInputIterator /*itr*/) -{ - val.key -= base_offset; -} - - -/** - * Segmented reduction (one block per segment) - */ -template < - typename ChainedPolicyT, ///< Chained tuning policy - typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator - typename OutputIteratorT, ///< Output iterator type for recording the reduced aggregate \iterator - typename OffsetIteratorT, ///< Random-access input iterator type for reading segment offsets \iterator - typename OffsetT, ///< Signed integer type for global offsets - typename ReductionOpT, ///< Binary reduction functor type having member T operator()(const T &a, const T &b) - typename OutputT> ///< Data element type that is convertible to the \p value type of \p OutputIteratorT -__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) -__global__ void DeviceSegmentedReduceKernel( - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. 
If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - int /*num_segments*/, ///< [in] The number of segments that comprise the sorting data - ReductionOpT reduction_op, ///< [in] Binary reduction functor - OutputT init) ///< [in] The initial value of the reduction -{ - // Thread block type for reducing input tiles - typedef AgentReduce< - typename ChainedPolicyT::ActivePolicy::ReducePolicy, - InputIteratorT, - OutputIteratorT, - OffsetT, - ReductionOpT> - AgentReduceT; - - // Shared memory storage - __shared__ typename AgentReduceT::TempStorage temp_storage; - - OffsetT segment_begin = d_begin_offsets[blockIdx.x]; - OffsetT segment_end = d_end_offsets[blockIdx.x]; - - // Check if empty problem - if (segment_begin == segment_end) - { - if (threadIdx.x == 0) - d_out[blockIdx.x] = init; - return; - } - - // Consume input tiles - OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange( - segment_begin, - segment_end); - - // Normalize as needed - NormalizeReductionOutput(block_aggregate, segment_begin, d_in); - - if (threadIdx.x == 0) - d_out[blockIdx.x] = reduction_op(init, block_aggregate);; -} - - - - -/****************************************************************************** - * Policy - ******************************************************************************/ - -template < - typename OuputT, ///< Data type - typename OffsetT, ///< Signed integer type for global offsets - typename ReductionOpT> ///< Binary reduction functor type having member T operator()(const T &a, const T &b) -struct DeviceReducePolicy -{ - //------------------------------------------------------------------------------ - // Architecture-specific tuning policies - //------------------------------------------------------------------------------ - - /// SM13 - struct Policy130 : ChainedPolicy<130, Policy130, Policy130> - { - // ReducePolicy - typedef AgentReducePolicy< - CUB_SCALED_GRANULARITIES(128, 8, OuputT), ///< Threads per block, items per thread - 2, ///< Number of items per vectorized load - BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use - LOAD_DEFAULT> ///< Cache load modifier - ReducePolicy; - - // SingleTilePolicy - typedef ReducePolicy SingleTilePolicy; - - // SegmentedReducePolicy - typedef ReducePolicy SegmentedReducePolicy; - }; - - - /// SM20 - struct Policy200 : ChainedPolicy<200, Policy200, Policy130> - { - // ReducePolicy (GTX 580: 178.9 GB/s @ 48M 4B items, 158.1 GB/s @ 192M 1B items) - typedef AgentReducePolicy< - CUB_SCALED_GRANULARITIES(128, 8, OuputT), ///< Threads per block, items per thread - 4, ///< Number of items per vectorized load - BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use - LOAD_DEFAULT> ///< Cache load modifier - ReducePolicy; - - // SingleTilePolicy - typedef ReducePolicy SingleTilePolicy; - - // SegmentedReducePolicy - typedef ReducePolicy SegmentedReducePolicy; - }; - - - /// SM30 - struct Policy300 : ChainedPolicy<300, Policy300, Policy200> - { - // ReducePolicy (GTX670: 154.0 @ 48M 4B items) - typedef AgentReducePolicy< - CUB_SCALED_GRANULARITIES(256, 20, OuputT), ///< Threads per block, items per thread - 2, ///< Number of items per vectorized load - BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use - LOAD_DEFAULT> ///< Cache load modifier - ReducePolicy; - - // SingleTilePolicy - typedef ReducePolicy SingleTilePolicy; - - // SegmentedReducePolicy - typedef ReducePolicy SegmentedReducePolicy; - }; - - - /// SM35 - struct 
Policy350 : ChainedPolicy<350, Policy350, Policy300> - { - // ReducePolicy (GTX Titan: 255.1 GB/s @ 48M 4B items; 228.7 GB/s @ 192M 1B items) - typedef AgentReducePolicy< - CUB_SCALED_GRANULARITIES(256, 20, OuputT), ///< Threads per block, items per thread - 4, ///< Number of items per vectorized load - BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use - LOAD_LDG> ///< Cache load modifier - ReducePolicy; - - // SingleTilePolicy - typedef ReducePolicy SingleTilePolicy; - - // SegmentedReducePolicy - typedef ReducePolicy SegmentedReducePolicy; - }; - - /// SM60 - struct Policy600 : ChainedPolicy<600, Policy600, Policy350> - { - // ReducePolicy (P100: 591 GB/s @ 64M 4B items; 583 GB/s @ 256M 1B items) - typedef AgentReducePolicy< - CUB_SCALED_GRANULARITIES(256, 16, OuputT), ///< Threads per block, items per thread - 4, ///< Number of items per vectorized load - BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use - LOAD_LDG> ///< Cache load modifier - ReducePolicy; - - // SingleTilePolicy - typedef ReducePolicy SingleTilePolicy; - - // SegmentedReducePolicy - typedef ReducePolicy SegmentedReducePolicy; - }; - - - /// MaxPolicy - typedef Policy600 MaxPolicy; - -}; - - - -/****************************************************************************** - * Single-problem dispatch - ******************************************************************************/ - -/** - * Utility class for dispatching the appropriately-tuned kernels for device-wide reduction - */ -template < - typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator - typename OutputIteratorT, ///< Output iterator type for recording the reduced aggregate \iterator - typename OffsetT, ///< Signed integer type for global offsets - typename ReductionOpT> ///< Binary reduction functor type having member T operator()(const T &a, const T &b) -struct DispatchReduce : - DeviceReducePolicy< - typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... then the input iterator's value type, - typename std::iterator_traits::value_type>::Type, // ... else the output iterator's value type - OffsetT, - ReductionOpT> -{ - //------------------------------------------------------------------------------ - // Constants - //------------------------------------------------------------------------------ - - // Data type of output iterator - typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... then the input iterator's value type, - typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type - - - //------------------------------------------------------------------------------ - // Problem state - //------------------------------------------------------------------------------ - - void *d_temp_storage; ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
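This dispatcher follows the same NULL-query protocol as the sorts above, but its InvokePasses is a two-kernel scheme: DeviceReduceKernel writes one privatized partial per thread block, and DeviceReduceSingleTileKernel then folds those partials together with init. A minimal front-end sketch, assuming the stock cub::DeviceReduce entry point and a hypothetical MaxOp functor (error checks and the size-query phase omitted):

    #include <cub/cub.cuh>
    #include <climits>

    struct MaxOp   // a reduction_op: any functor with T operator()(a, b)
    {
        __host__ __device__ int operator()(int a, int b) const
        {
            return (b > a) ? b : a;
        }
    };

    void max_reduce(int *d_in, int *d_out, int num_items,
                    void *d_temp_storage, size_t temp_storage_bytes)
    {
        // 'init' (INT_MIN here) seeds the reduction exactly as the 'init'
        // member below does, and is the result for an empty input.
        cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes,
                                  d_in, d_out, num_items, MaxOp(), INT_MIN);
    }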
- size_t &temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in; ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out; ///< [out] Pointer to the output aggregate - OffsetT num_items; ///< [in] Total number of input items (i.e., length of \p d_in) - ReductionOpT reduction_op; ///< [in] Binary reduction functor - OutputT init; ///< [in] The initial value of the reduction - cudaStream_t stream; ///< [in] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous; ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - int ptx_version; ///< [in] PTX version - - //------------------------------------------------------------------------------ - // Constructor - //------------------------------------------------------------------------------ - - /// Constructor - CUB_RUNTIME_FUNCTION __forceinline__ - DispatchReduce( - void* d_temp_storage, - size_t &temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - OffsetT num_items, - ReductionOpT reduction_op, - OutputT init, - cudaStream_t stream, - bool debug_synchronous, - int ptx_version) - : - d_temp_storage(d_temp_storage), - temp_storage_bytes(temp_storage_bytes), - d_in(d_in), - d_out(d_out), - num_items(num_items), - reduction_op(reduction_op), - init(init), - stream(stream), - debug_synchronous(debug_synchronous), - ptx_version(ptx_version) - {} - - - //------------------------------------------------------------------------------ - // Small-problem (single tile) invocation - //------------------------------------------------------------------------------ - - /// Invoke a single block block to reduce in-core - template < - typename ActivePolicyT, ///< Umbrella policy active for the target device - typename SingleTileKernelT> ///< Function type of cub::DeviceReduceSingleTileKernel - CUB_RUNTIME_FUNCTION __forceinline__ - cudaError_t InvokeSingleTile( - SingleTileKernelT single_tile_kernel) ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceSingleTileKernel - { -#ifndef CUB_RUNTIME_ENABLED - (void)single_tile_kernel; - - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported ); -#else - cudaError error = cudaSuccess; - do - { - // Return if the caller is simply requesting the size of the storage allocation - if (d_temp_storage == NULL) - { - temp_storage_bytes = 1; - break; - } - - // Log single_reduce_sweep_kernel configuration - if (debug_synchronous) _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), %d items per thread\n", - ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, - (long long) stream, - ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD); - - // Invoke single_reduce_sweep_kernel - single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>( - d_in, - d_out, - num_items, - reduction_op, - init); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - } - while (0); - - return error; - -#endif // CUB_RUNTIME_ENABLED - } - - - //------------------------------------------------------------------------------ - // Normal problem size invocation (two-pass) - 
-    //------------------------------------------------------------------------------
-
-    /// Invoke two-passes to reduce
-    template <
-        typename                ActivePolicyT,          ///< Umbrella policy active for the target device
-        typename                ReduceKernelT,          ///< Function type of cub::DeviceReduceKernel
-        typename                SingleTileKernelT>      ///< Function type of cub::DeviceReduceSingleTileKernel
-    CUB_RUNTIME_FUNCTION __forceinline__
-    cudaError_t InvokePasses(
-        ReduceKernelT           reduce_kernel,          ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceKernel
-        SingleTileKernelT       single_tile_kernel)     ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceSingleTileKernel
-    {
-#ifndef CUB_RUNTIME_ENABLED
-        (void) reduce_kernel;
-        (void) single_tile_kernel;
-
-        // Kernel launch not supported from this device
-        return CubDebug(cudaErrorNotSupported);
-#else
-
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get device ordinal
-            int device_ordinal;
-            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
-
-            // Get SM count
-            int sm_count;
-            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
-
-            // Init regular kernel configuration
-            KernelConfig reduce_config;
-            if (CubDebug(error = reduce_config.Init<typename ActivePolicyT::ReducePolicy>(reduce_kernel))) break;
-            int reduce_device_occupancy = reduce_config.sm_occupancy * sm_count;
-
-            // Even-share work distribution
-            int max_blocks = reduce_device_occupancy * CUB_SUBSCRIPTION_FACTOR(ptx_version);
-            GridEvenShare<OffsetT> even_share;
-            even_share.DispatchInit(num_items, max_blocks, reduce_config.tile_size);
-
-            // Temporary storage allocation requirements
-            void* allocations[1];
-            size_t allocation_sizes[1] =
-            {
-                max_blocks * sizeof(OutputT)    // bytes needed for privatized block reductions
-            };
-
-            // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
-            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
-            if (d_temp_storage == NULL)
-            {
-                // Return if the caller is simply requesting the size of the storage allocation
-                return cudaSuccess;
-            }
-
-            // Alias the allocation for the privatized per-block reductions
-            OutputT *d_block_reductions = (OutputT*) allocations[0];
-
-            // Get grid size for device_reduce_sweep_kernel
-            int reduce_grid_size = even_share.grid_size;
-
-            // Log device_reduce_sweep_kernel configuration
-            if (debug_synchronous) _CubLog("Invoking DeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
-                reduce_grid_size,
-                ActivePolicyT::ReducePolicy::BLOCK_THREADS,
-                (long long) stream,
-                ActivePolicyT::ReducePolicy::ITEMS_PER_THREAD,
-                reduce_config.sm_occupancy);
-
-            // Invoke DeviceReduceKernel
-            reduce_kernel<<<reduce_grid_size, ActivePolicyT::ReducePolicy::BLOCK_THREADS, 0, stream>>>(
-                d_in,
-                d_block_reductions,
-                num_items,
-                even_share,
-                reduction_op);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-
-            // Log single_reduce_sweep_kernel configuration
-            if (debug_synchronous) _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), %d items per thread\n",
-                ActivePolicyT::SingleTilePolicy::BLOCK_THREADS,
-                (long long) stream,
-                ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD);
-
-            // Invoke DeviceReduceSingleTileKernel
-            single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>(
-                d_block_reductions,
-                d_out,
-                reduce_grid_size,
-                reduction_op,
-                init);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-        }
-        while (0);
-
-        return error;
-
-#endif // CUB_RUNTIME_ENABLED
-
-    }
-
-
-    //------------------------------------------------------------------------------
-    // Chained policy invocation
-    //------------------------------------------------------------------------------
-
-    /// Invocation
-    template <typename ActivePolicyT>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    cudaError_t Invoke()
-    {
-        typedef typename ActivePolicyT::SingleTilePolicy    SingleTilePolicyT;
-        typedef typename DispatchReduce::MaxPolicy          MaxPolicyT;
-
-        // Force kernel code-generation in all compiler passes
-        if (num_items <= (SingleTilePolicyT::BLOCK_THREADS * SingleTilePolicyT::ITEMS_PER_THREAD))
-        {
-            // Small, single tile size
-            return InvokeSingleTile<ActivePolicyT>(
-                DeviceReduceSingleTileKernel<MaxPolicyT, InputIteratorT, OutputIteratorT, OffsetT, ReductionOpT, OutputT>);
-        }
-        else
-        {
-            // Regular size
-            return InvokePasses<ActivePolicyT>(
-                DeviceReduceKernel<typename DispatchReduce::MaxPolicy, InputIteratorT, OffsetT, ReductionOpT>,
-                DeviceReduceSingleTileKernel<MaxPolicyT, OutputT*, OutputIteratorT, int, ReductionOpT, OutputT>);
-        }
-    }
-
-
-    //------------------------------------------------------------------------------
-    // Dispatch entrypoints
-    //------------------------------------------------------------------------------
-
-    /**
-     * Internal dispatch routine for computing a device-wide reduction
-     */
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Dispatch(
-        void            *d_temp_storage,        ///< [in] %Device-accessible allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t          &temp_storage_bytes,    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIteratorT  d_in,                   ///< [in] Pointer to the input sequence of data items
-        OutputIteratorT d_out,                  ///< [out] Pointer to the output aggregate
-        OffsetT         num_items,              ///< [in] Total number of input items (i.e., length of \p d_in)
-        ReductionOpT    reduction_op,           ///< [in] Binary reduction functor
-        OutputT         init,                   ///< [in] The initial value of the reduction
-        cudaStream_t    stream,                 ///< [in] [optional] CUDA stream to launch kernels within.  Default is stream0.
-        bool            debug_synchronous)      ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
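// The body below queries the PTX version once, wraps the problem state in a
// DispatchReduce functor, and hands it to MaxPolicy::Invoke, which walks the
// chained tuning policies (Policy600, then Policy350, and so on) and runs
// the newest one not exceeding the target architecture. A hedged sketch of
// reaching this path through the public wrapper with a user-supplied
// operator (cub::Max is CUB's stock functor; <limits> assumed included;
// two-call temporary-storage idiom elided):
//
//     cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes,
//                               d_in, d_out, num_items,
//                               cub::Max(), std::numeric_limits<float>::lowest());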
- { - typedef typename DispatchReduce::MaxPolicy MaxPolicyT; - - cudaError error = cudaSuccess; - do - { - // Get PTX version - int ptx_version; - if (CubDebug(error = PtxVersion(ptx_version))) break; - - // Create dispatch functor - DispatchReduce dispatch( - d_temp_storage, temp_storage_bytes, - d_in, d_out, num_items, reduction_op, init, - stream, debug_synchronous, ptx_version); - - // Dispatch to chained policy - if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break; - } - while (0); - - return error; - } -}; - - - -/****************************************************************************** - * Segmented dispatch - ******************************************************************************/ - -/** - * Utility class for dispatching the appropriately-tuned kernels for device-wide reduction - */ -template < - typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator - typename OutputIteratorT, ///< Output iterator type for recording the reduced aggregate \iterator - typename OffsetIteratorT, ///< Random-access input iterator type for reading segment offsets \iterator - typename OffsetT, ///< Signed integer type for global offsets - typename ReductionOpT> ///< Binary reduction functor type having member T operator()(const T &a, const T &b) -struct DispatchSegmentedReduce : - DeviceReducePolicy< - typename std::iterator_traits::value_type, - OffsetT, - ReductionOpT> -{ - //------------------------------------------------------------------------------ - // Constants - //------------------------------------------------------------------------------ - - /// The output value type - typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... then the input iterator's value type, - typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type - - - //------------------------------------------------------------------------------ - // Problem state - //------------------------------------------------------------------------------ - - void *d_temp_storage; ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in; ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out; ///< [out] Pointer to the output aggregate - OffsetT num_segments; ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets; ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets; ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - ReductionOpT reduction_op; ///< [in] Binary reduction functor - OutputT init; ///< [in] The initial value of the reduction - cudaStream_t stream; ///< [in] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous; ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. 
Also causes launch configurations to be printed to the console. Default is \p false. - int ptx_version; ///< [in] PTX version - - //------------------------------------------------------------------------------ - // Constructor - //------------------------------------------------------------------------------ - - /// Constructor - CUB_RUNTIME_FUNCTION __forceinline__ - DispatchSegmentedReduce( - void* d_temp_storage, - size_t &temp_storage_bytes, - InputIteratorT d_in, - OutputIteratorT d_out, - OffsetT num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - ReductionOpT reduction_op, - OutputT init, - cudaStream_t stream, - bool debug_synchronous, - int ptx_version) - : - d_temp_storage(d_temp_storage), - temp_storage_bytes(temp_storage_bytes), - d_in(d_in), - d_out(d_out), - num_segments(num_segments), - d_begin_offsets(d_begin_offsets), - d_end_offsets(d_end_offsets), - reduction_op(reduction_op), - init(init), - stream(stream), - debug_synchronous(debug_synchronous), - ptx_version(ptx_version) - {} - - - - //------------------------------------------------------------------------------ - // Chained policy invocation - //------------------------------------------------------------------------------ - - /// Invocation - template < - typename ActivePolicyT, ///< Umbrella policy active for the target device - typename DeviceSegmentedReduceKernelT> ///< Function type of cub::DeviceSegmentedReduceKernel - CUB_RUNTIME_FUNCTION __forceinline__ - cudaError_t InvokePasses( - DeviceSegmentedReduceKernelT segmented_reduce_kernel) ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentedReduceKernel - { -#ifndef CUB_RUNTIME_ENABLED - (void)segmented_reduce_kernel; - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported ); -#else - cudaError error = cudaSuccess; - do - { - // Return if the caller is simply requesting the size of the storage allocation - if (d_temp_storage == NULL) - { - temp_storage_bytes = 1; - return cudaSuccess; - } - - // Init kernel configuration - KernelConfig segmented_reduce_config; - if (CubDebug(error = segmented_reduce_config.Init(segmented_reduce_kernel))) break; - - // Log device_reduce_sweep_kernel configuration - if (debug_synchronous) _CubLog("Invoking SegmentedDeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", - num_segments, - ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS, - (long long) stream, - ActivePolicyT::SegmentedReducePolicy::ITEMS_PER_THREAD, - segmented_reduce_config.sm_occupancy); - - // Invoke DeviceReduceKernel - segmented_reduce_kernel<<>>( - d_in, - d_out, - d_begin_offsets, - d_end_offsets, - num_segments, - reduction_op, - init); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - } - while (0); - - return error; - -#endif // CUB_RUNTIME_ENABLED - - } - - - /// Invocation - template - CUB_RUNTIME_FUNCTION __forceinline__ - cudaError_t Invoke() - { - typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT; - - // Force kernel code-generation in all compiler passes - return InvokePasses( - DeviceSegmentedReduceKernel); - } - - - //------------------------------------------------------------------------------ - // Dispatch entrypoints - //------------------------------------------------------------------------------ - - /** - * Internal dispatch routine 
for computing a device-wide reduction - */ - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output aggregate - int num_segments, ///< [in] The number of segments that comprise the sorting data - OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* - OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. - ReductionOpT reduction_op, ///< [in] Binary reduction functor - OutputT init, ///< [in] The initial value of the reduction - cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT; - - if (num_segments <= 0) - return cudaSuccess; - - cudaError error = cudaSuccess; - do - { - // Get PTX version - int ptx_version; - if (CubDebug(error = PtxVersion(ptx_version))) break; - - // Create dispatch functor - DispatchSegmentedReduce dispatch( - d_temp_storage, temp_storage_bytes, - d_in, d_out, - num_segments, d_begin_offsets, d_end_offsets, - reduction_op, init, - stream, debug_synchronous, ptx_version); - - // Dispatch to chained policy - if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break; - } - while (0); - - return error; - } -}; - - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/SRC/cub/device/dispatch/dispatch_reduce_by_key.cuh b/SRC/cub/device/dispatch/dispatch_reduce_by_key.cuh deleted file mode 100644 index 6f4837b7..00000000 --- a/SRC/cub/device/dispatch/dispatch_reduce_by_key.cuh +++ /dev/null @@ -1,554 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::DeviceReduceByKey provides device-wide, parallel operations for reducing segments of values residing within device-accessible memory. - */ - -#pragma once - -#include -#include - -#include "dispatch_scan.cuh" -#include "../../agent/agent_reduce_by_key.cuh" -#include "../../thread/thread_operators.cuh" -#include "../../grid/grid_queue.cuh" -#include "../../util_device.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/****************************************************************************** - * Kernel entry points - *****************************************************************************/ - -/** - * Multi-block reduce-by-key sweep kernel entry point - */ -template < - typename AgentReduceByKeyPolicyT, ///< Parameterized AgentReduceByKeyPolicyT tuning policy type - typename KeysInputIteratorT, ///< Random-access input iterator type for keys - typename UniqueOutputIteratorT, ///< Random-access output iterator type for keys - typename ValuesInputIteratorT, ///< Random-access input iterator type for values - typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values - typename NumRunsOutputIteratorT, ///< Output iterator type for recording number of segments encountered - typename ScanTileStateT, ///< Tile status interface type - typename EqualityOpT, ///< KeyT equality operator type - typename ReductionOpT, ///< ValueT reduction operator type - typename OffsetT> ///< Signed integer type for global offsets -__launch_bounds__ (int(AgentReduceByKeyPolicyT::BLOCK_THREADS)) -__global__ void DeviceReduceByKeyKernel( - KeysInputIteratorT d_keys_in, ///< Pointer to the input sequence of keys - UniqueOutputIteratorT d_unique_out, ///< Pointer to the output sequence of unique keys (one key per run) - ValuesInputIteratorT d_values_in, ///< Pointer to the input sequence of corresponding values - AggregatesOutputIteratorT d_aggregates_out, ///< Pointer to the output sequence of value aggregates (one aggregate per run) - NumRunsOutputIteratorT d_num_runs_out, ///< Pointer to total number of runs encountered (i.e., the length of d_unique_out) - ScanTileStateT tile_state, ///< Tile status interface - int start_tile, ///< The starting tile for the current grid - EqualityOpT equality_op, ///< KeyT equality operator - ReductionOpT reduction_op, ///< ValueT reduction operator - OffsetT num_items) ///< Total number of items to select from -{ - // Thread block type for reducing tiles of value segments - typedef AgentReduceByKey< - AgentReduceByKeyPolicyT, - KeysInputIteratorT, - UniqueOutputIteratorT, - 
ValuesInputIteratorT, - AggregatesOutputIteratorT, - NumRunsOutputIteratorT, - EqualityOpT, - ReductionOpT, - OffsetT> - AgentReduceByKeyT; - - // Shared memory for AgentReduceByKey - __shared__ typename AgentReduceByKeyT::TempStorage temp_storage; - - // Process tiles - AgentReduceByKeyT(temp_storage, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, equality_op, reduction_op).ConsumeRange( - num_items, - tile_state, - start_tile); -} - - - - -/****************************************************************************** - * Dispatch - ******************************************************************************/ - -/** - * Utility class for dispatching the appropriately-tuned kernels for DeviceReduceByKey - */ -template < - typename KeysInputIteratorT, ///< Random-access input iterator type for keys - typename UniqueOutputIteratorT, ///< Random-access output iterator type for keys - typename ValuesInputIteratorT, ///< Random-access input iterator type for values - typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values - typename NumRunsOutputIteratorT, ///< Output iterator type for recording number of segments encountered - typename EqualityOpT, ///< KeyT equality operator type - typename ReductionOpT, ///< ValueT reduction operator type - typename OffsetT> ///< Signed integer type for global offsets -struct DispatchReduceByKey -{ - //------------------------------------------------------------------------- - // Types and constants - //------------------------------------------------------------------------- - - // The input keys type - typedef typename std::iterator_traits::value_type KeyInputT; - - // The output keys type - typedef typename If<(Equals::value_type, void>::VALUE), // KeyOutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... then the input iterator's value type, - typename std::iterator_traits::value_type>::Type KeyOutputT; // ... else the output iterator's value type - - // The input values type - typedef typename std::iterator_traits::value_type ValueInputT; - - // The output values type - typedef typename If<(Equals::value_type, void>::VALUE), // ValueOutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... then the input iterator's value type, - typename std::iterator_traits::value_type>::Type ValueOutputT; // ... else the output iterator's value type - - enum - { - INIT_KERNEL_THREADS = 128, - MAX_INPUT_BYTES = CUB_MAX(sizeof(KeyOutputT), sizeof(ValueOutputT)), - COMBINED_INPUT_BYTES = sizeof(KeyOutputT) + sizeof(ValueOutputT), - }; - - // Tile status descriptor interface type - typedef ReduceByKeyScanTileState ScanTileStateT; - - - //------------------------------------------------------------------------- - // Tuning policies - //------------------------------------------------------------------------- - - /// SM35 - struct Policy350 - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 6, - ITEMS_PER_THREAD = (MAX_INPUT_BYTES <= 8) ? 
6 : CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)), - }; - - typedef AgentReduceByKeyPolicy< - 128, - ITEMS_PER_THREAD, - BLOCK_LOAD_DIRECT, - LOAD_LDG, - BLOCK_SCAN_WARP_SCANS> - ReduceByKeyPolicyT; - }; - - /// SM30 - struct Policy300 - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 6, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)), - }; - - typedef AgentReduceByKeyPolicy< - 128, - ITEMS_PER_THREAD, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS> - ReduceByKeyPolicyT; - }; - - /// SM20 - struct Policy200 - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 11, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)), - }; - - typedef AgentReduceByKeyPolicy< - 128, - ITEMS_PER_THREAD, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS> - ReduceByKeyPolicyT; - }; - - /// SM13 - struct Policy130 - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 7, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)), - }; - - typedef AgentReduceByKeyPolicy< - 128, - ITEMS_PER_THREAD, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS> - ReduceByKeyPolicyT; - }; - - /// SM11 - struct Policy110 - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 5, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 8) / COMBINED_INPUT_BYTES)), - }; - - typedef AgentReduceByKeyPolicy< - 64, - ITEMS_PER_THREAD, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_RAKING> - ReduceByKeyPolicyT; - }; - - - /****************************************************************************** - * Tuning policies of current PTX compiler pass - ******************************************************************************/ - -#if (CUB_PTX_ARCH >= 350) - typedef Policy350 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 300) - typedef Policy300 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 200) - typedef Policy200 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 130) - typedef Policy130 PtxPolicy; - -#else - typedef Policy110 PtxPolicy; - -#endif - - // "Opaque" policies (whose parameterizations aren't reflected in the type signature) - struct PtxReduceByKeyPolicy : PtxPolicy::ReduceByKeyPolicyT {}; - - - /****************************************************************************** - * Utilities - ******************************************************************************/ - - /** - * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use - */ - template - CUB_RUNTIME_FUNCTION __forceinline__ - static void InitConfigs( - int ptx_version, - KernelConfig &reduce_by_key_config) - { - #if (CUB_PTX_ARCH > 0) - (void)ptx_version; - - // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy - reduce_by_key_config.template Init(); - - #else - - // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version - if (ptx_version >= 350) - { - reduce_by_key_config.template Init(); - } - else if (ptx_version >= 300) - { - reduce_by_key_config.template Init(); - } - else if (ptx_version >= 200) - { - reduce_by_key_config.template Init(); 
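// Worked example of the ITEMS_PER_THREAD scaling used by the policies above,
// for Policy350 (NOMINAL_4B_ITEMS_PER_THREAD = 6), under assumed key/value
// types:
//
//     int keys, double values:  MAX_INPUT_BYTES = 8, COMBINED_INPUT_BYTES = 12
//         MAX_INPUT_BYTES <= 8, so ITEMS_PER_THREAD = 6
//     int keys, 16-byte values: MAX_INPUT_BYTES = 16, COMBINED_INPUT_BYTES = 20
//         ITEMS_PER_THREAD = CUB_MIN(6, CUB_MAX(1, (6*8 + 20 - 1) / 20)) = 3
//
// Wider key/value tuples therefore get fewer items per thread, keeping each
// tile's staging footprint roughly constant.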
- } - else if (ptx_version >= 130) - { - reduce_by_key_config.template Init(); - } - else - { - reduce_by_key_config.template Init(); - } - - #endif - } - - - /** - * Kernel kernel dispatch configuration. - */ - struct KernelConfig - { - int block_threads; - int items_per_thread; - int tile_items; - - template - CUB_RUNTIME_FUNCTION __forceinline__ - void Init() - { - block_threads = PolicyT::BLOCK_THREADS; - items_per_thread = PolicyT::ITEMS_PER_THREAD; - tile_items = block_threads * items_per_thread; - } - }; - - - //--------------------------------------------------------------------- - // Dispatch entrypoints - //--------------------------------------------------------------------- - - /** - * Internal dispatch routine for computing a device-wide reduce-by-key using the - * specified kernel functions. - */ - template < - typename ScanInitKernelT, ///< Function type of cub::DeviceScanInitKernel - typename ReduceByKeyKernelT> ///< Function type of cub::DeviceReduceByKeyKernelT - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - KeysInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys - UniqueOutputIteratorT d_unique_out, ///< [out] Pointer to the output sequence of unique keys (one key per run) - ValuesInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of corresponding values - AggregatesOutputIteratorT d_aggregates_out, ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run) - NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out) - EqualityOpT equality_op, ///< [in] KeyT equality operator - ReductionOpT reduction_op, ///< [in] ValueT reduction operator - OffsetT num_items, ///< [in] Total number of items to select from - cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
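// The parameters below inject the kernel instantiations and the launch
// configuration they were compiled for, keeping this routine independent of
// any particular tuning policy. From user code the whole chain is reached
// through the public wrapper; a hedged sketch (per-key sum with CUB's stock
// cub::Sum functor, two-call temporary-storage idiom elided):
//
//     cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes,
//         d_keys_in, d_unique_out, d_values_in, d_aggregates_out,
//         d_num_runs_out, cub::Sum(), num_items);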
- int /*ptx_version*/, ///< [in] PTX version of dispatch kernels - ScanInitKernelT init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel - ReduceByKeyKernelT reduce_by_key_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceByKeyKernel - KernelConfig reduce_by_key_config) ///< [in] Dispatch parameters that match the policy that \p reduce_by_key_kernel was compiled for - { - -#ifndef CUB_RUNTIME_ENABLED - (void)d_temp_storage; - (void)temp_storage_bytes; - (void)d_keys_in; - (void)d_unique_out; - (void)d_values_in; - (void)d_aggregates_out; - (void)d_num_runs_out; - (void)equality_op; - (void)reduction_op; - (void)num_items; - (void)stream; - (void)debug_synchronous; - (void)init_kernel; - (void)reduce_by_key_kernel; - (void)reduce_by_key_config; - - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported); - -#else - - cudaError error = cudaSuccess; - do - { - // Get device ordinal - int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; - - // Get SM count - int sm_count; - if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; - - // Number of input tiles - int tile_size = reduce_by_key_config.block_threads * reduce_by_key_config.items_per_thread; - int num_tiles = (num_items + tile_size - 1) / tile_size; - - // Specify temporary storage allocation requirements - size_t allocation_sizes[1]; - if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors - - // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob) - void* allocations[1]; - if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; - if (d_temp_storage == NULL) - { - // Return if the caller is simply requesting the size of the storage allocation - break; - } - - // Construct the tile status interface - ScanTileStateT tile_state; - if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; - - // Log init_kernel configuration - int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS); - if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); - - // Invoke init_kernel to initialize tile descriptors - init_kernel<<>>( - tile_state, - num_tiles, - d_num_runs_out); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - - // Return if empty problem - if (num_items == 0) - break; - - // Get SM occupancy for reduce_by_key_kernel - int reduce_by_key_sm_occupancy; - if (CubDebug(error = MaxSmOccupancy( - reduce_by_key_sm_occupancy, // out - reduce_by_key_kernel, - reduce_by_key_config.block_threads))) break; - - // Get max x-dimension of grid - int max_dim_x; - if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;; - - // Run grids in epochs (in case number of tiles exceeds max x-dimension - int scan_grid_size = CUB_MIN(num_tiles, max_dim_x); - for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size) - { - // Log reduce_by_key_kernel configuration - if (debug_synchronous) 
_CubLog("Invoking %d reduce_by_key_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", - start_tile, scan_grid_size, reduce_by_key_config.block_threads, (long long) stream, reduce_by_key_config.items_per_thread, reduce_by_key_sm_occupancy); - - // Invoke reduce_by_key_kernel - reduce_by_key_kernel<<>>( - d_keys_in, - d_unique_out, - d_values_in, - d_aggregates_out, - d_num_runs_out, - tile_state, - start_tile, - equality_op, - reduction_op, - num_items); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - } - } - while (0); - - return error; - -#endif // CUB_RUNTIME_ENABLED - } - - - /** - * Internal dispatch routine - */ - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - KeysInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys - UniqueOutputIteratorT d_unique_out, ///< [out] Pointer to the output sequence of unique keys (one key per run) - ValuesInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of corresponding values - AggregatesOutputIteratorT d_aggregates_out, ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run) - NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out) - EqualityOpT equality_op, ///< [in] KeyT equality operator - ReductionOpT reduction_op, ///< [in] ValueT reduction operator - OffsetT num_items, ///< [in] Total number of items to select from - cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - cudaError error = cudaSuccess; - do - { - // Get PTX version - int ptx_version; - #if (CUB_PTX_ARCH == 0) - if (CubDebug(error = PtxVersion(ptx_version))) break; - #else - ptx_version = CUB_PTX_ARCH; - #endif - - // Get kernel kernel dispatch configurations - KernelConfig reduce_by_key_config; - InitConfigs(ptx_version, reduce_by_key_config); - - // Dispatch - if (CubDebug(error = Dispatch( - d_temp_storage, - temp_storage_bytes, - d_keys_in, - d_unique_out, - d_values_in, - d_aggregates_out, - d_num_runs_out, - equality_op, - reduction_op, - num_items, - stream, - debug_synchronous, - ptx_version, - DeviceCompactInitKernel, - DeviceReduceByKeyKernel, - reduce_by_key_config))) break; - } - while (0); - - return error; - } -}; - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/SRC/cub/device/dispatch/dispatch_rle.cuh b/SRC/cub/device/dispatch/dispatch_rle.cuh deleted file mode 100644 index 98c3681f..00000000 --- a/SRC/cub/device/dispatch/dispatch_rle.cuh +++ /dev/null @@ -1,538 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::DeviceRle provides device-wide, parallel operations for run-length-encoding sequences of data items residing within device-accessible memory. - */ - -#pragma once - -#include -#include - -#include "dispatch_scan.cuh" -#include "../../agent/agent_rle.cuh" -#include "../../thread/thread_operators.cuh" -#include "../../grid/grid_queue.cuh" -#include "../../util_device.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/****************************************************************************** - * Kernel entry points - *****************************************************************************/ - -/** - * Select kernel entry point (multi-block) - * - * Performs functor-based selection if SelectOp functor type != NullType - * Otherwise performs flag-based selection if FlagIterator's value type != NullType - * Otherwise performs discontinuity selection (keep unique) - */ -template < - typename AgentRlePolicyT, ///< Parameterized AgentRlePolicyT tuning policy type - typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator - typename OffsetsOutputIteratorT, ///< Random-access output iterator type for writing run-offset values \iterator - typename LengthsOutputIteratorT, ///< Random-access output iterator type for writing run-length values \iterator - typename NumRunsOutputIteratorT, ///< Output iterator type for recording the number of runs encountered \iterator - typename ScanTileStateT, ///< Tile status interface type - typename EqualityOpT, ///< T equality operator type - typename OffsetT> ///< Signed integer type for global offsets -__launch_bounds__ (int(AgentRlePolicyT::BLOCK_THREADS)) -__global__ void DeviceRleSweepKernel( - InputIteratorT d_in, ///< [in] Pointer to input sequence of data items - OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to output sequence of run-offsets - 
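// The two output sequences below encode each non-trivial run (two or more
// consecutive equal items) in d_in as a starting offset plus a length. A
// hedged host-side sketch via the public cub::DeviceRunLengthEncode wrapper
// over this dispatcher, two-call temporary-storage idiom elided:
//
//     // d_in = {0, 0, 1, 5, 5, 5, 8}  ->  offsets {0, 3}, lengths {2, 3}, 2 runs
//     cub::DeviceRunLengthEncode::NonTrivialRuns(d_temp_storage, temp_storage_bytes,
//         d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items);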
LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to output sequence of run-lengths - NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs (i.e., length of \p d_offsets_out) - ScanTileStateT tile_status, ///< [in] Tile status interface - EqualityOpT equality_op, ///< [in] Equality operator for input items - OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - int num_tiles) ///< [in] Total number of tiles for the entire problem -{ - // Thread block type for selecting data from input tiles - typedef AgentRle< - AgentRlePolicyT, - InputIteratorT, - OffsetsOutputIteratorT, - LengthsOutputIteratorT, - EqualityOpT, - OffsetT> AgentRleT; - - // Shared memory for AgentRle - __shared__ typename AgentRleT::TempStorage temp_storage; - - // Process tiles - AgentRleT(temp_storage, d_in, d_offsets_out, d_lengths_out, equality_op, num_items).ConsumeRange( - num_tiles, - tile_status, - d_num_runs_out); -} - - - - -/****************************************************************************** - * Dispatch - ******************************************************************************/ - -/** - * Utility class for dispatching the appropriately-tuned kernels for DeviceRle - */ -template < - typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator - typename OffsetsOutputIteratorT, ///< Random-access output iterator type for writing run-offset values \iterator - typename LengthsOutputIteratorT, ///< Random-access output iterator type for writing run-length values \iterator - typename NumRunsOutputIteratorT, ///< Output iterator type for recording the number of runs encountered \iterator - typename EqualityOpT, ///< T equality operator type - typename OffsetT> ///< Signed integer type for global offsets -struct DeviceRleDispatch -{ - /****************************************************************************** - * Types and constants - ******************************************************************************/ - - // The input value type - typedef typename std::iterator_traits::value_type T; - - // The lengths output value type - typedef typename If<(Equals::value_type, void>::VALUE), // LengthT = (if output iterator's value type is void) ? - OffsetT, // ... then the OffsetT type, - typename std::iterator_traits::value_type>::Type LengthT; // ... 
else the output iterator's value type - - enum - { - INIT_KERNEL_THREADS = 128, - }; - - // Tile status descriptor interface type - typedef ReduceByKeyScanTileState ScanTileStateT; - - - /****************************************************************************** - * Tuning policies - ******************************************************************************/ - - /// SM35 - struct Policy350 - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 15, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), - }; - - typedef AgentRlePolicy< - 96, - ITEMS_PER_THREAD, - BLOCK_LOAD_DIRECT, - LOAD_LDG, - true, - BLOCK_SCAN_WARP_SCANS> - RleSweepPolicy; - }; - - /// SM30 - struct Policy300 - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 5, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), - }; - - typedef AgentRlePolicy< - 256, - ITEMS_PER_THREAD, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - true, - BLOCK_SCAN_RAKING_MEMOIZE> - RleSweepPolicy; - }; - - /// SM20 - struct Policy200 - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 15, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), - }; - - typedef AgentRlePolicy< - 128, - ITEMS_PER_THREAD, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - false, - BLOCK_SCAN_WARP_SCANS> - RleSweepPolicy; - }; - - /// SM13 - struct Policy130 - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 9, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), - }; - - typedef AgentRlePolicy< - 64, - ITEMS_PER_THREAD, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - true, - BLOCK_SCAN_RAKING_MEMOIZE> - RleSweepPolicy; - }; - - /// SM10 - struct Policy100 - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 9, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), - }; - - typedef AgentRlePolicy< - 256, - ITEMS_PER_THREAD, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - true, - BLOCK_SCAN_RAKING_MEMOIZE> - RleSweepPolicy; - }; - - - /****************************************************************************** - * Tuning policies of current PTX compiler pass - ******************************************************************************/ - -#if (CUB_PTX_ARCH >= 350) - typedef Policy350 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 300) - typedef Policy300 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 200) - typedef Policy200 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 130) - typedef Policy130 PtxPolicy; - -#else - typedef Policy100 PtxPolicy; - -#endif - - // "Opaque" policies (whose parameterizations aren't reflected in the type signature) - struct PtxRleSweepPolicy : PtxPolicy::RleSweepPolicy {}; - - - /****************************************************************************** - * Utilities - ******************************************************************************/ - - /** - * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use - */ - template - CUB_RUNTIME_FUNCTION __forceinline__ - static void InitConfigs( - int ptx_version, - KernelConfig& device_rle_config) - { - #if (CUB_PTX_ARCH > 0) - - // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy - device_rle_config.template Init(); - - #else - - // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that 
match the device's PTX version - if (ptx_version >= 350) - { - device_rle_config.template Init(); - } - else if (ptx_version >= 300) - { - device_rle_config.template Init(); - } - else if (ptx_version >= 200) - { - device_rle_config.template Init(); - } - else if (ptx_version >= 130) - { - device_rle_config.template Init(); - } - else - { - device_rle_config.template Init(); - } - - #endif - } - - - /** - * Kernel kernel dispatch configuration. Mirrors the constants within AgentRlePolicyT. - */ - struct KernelConfig - { - int block_threads; - int items_per_thread; - BlockLoadAlgorithm load_policy; - bool store_warp_time_slicing; - BlockScanAlgorithm scan_algorithm; - - template - CUB_RUNTIME_FUNCTION __forceinline__ - void Init() - { - block_threads = AgentRlePolicyT::BLOCK_THREADS; - items_per_thread = AgentRlePolicyT::ITEMS_PER_THREAD; - load_policy = AgentRlePolicyT::LOAD_ALGORITHM; - store_warp_time_slicing = AgentRlePolicyT::STORE_WARP_TIME_SLICING; - scan_algorithm = AgentRlePolicyT::SCAN_ALGORITHM; - } - - CUB_RUNTIME_FUNCTION __forceinline__ - void Print() - { - printf("%d, %d, %d, %d, %d", - block_threads, - items_per_thread, - load_policy, - store_warp_time_slicing, - scan_algorithm); - } - }; - - - /****************************************************************************** - * Dispatch entrypoints - ******************************************************************************/ - - /** - * Internal dispatch routine for computing a device-wide run-length-encode using the - * specified kernel functions. - */ - template < - typename DeviceScanInitKernelPtr, ///< Function type of cub::DeviceScanInitKernel - typename DeviceRleSweepKernelPtr> ///< Function type of cub::DeviceRleSweepKernelPtr - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to the output sequence of run-offsets - LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to the output sequence of run-lengths - NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to the total number of runs encountered (i.e., length of \p d_offsets_out) - EqualityOpT equality_op, ///< [in] Equality operator for input items - OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
- int ptx_version, ///< [in] PTX version of dispatch kernels - DeviceScanInitKernelPtr device_scan_init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel - DeviceRleSweepKernelPtr device_rle_sweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceRleSweepKernel - KernelConfig device_rle_config) ///< [in] Dispatch parameters that match the policy that \p device_rle_sweep_kernel was compiled for - { - -#ifndef CUB_RUNTIME_ENABLED - - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported); - -#else - - cudaError error = cudaSuccess; - do - { - // Get device ordinal - int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; - - // Get SM count - int sm_count; - if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; - - // Number of input tiles - int tile_size = device_rle_config.block_threads * device_rle_config.items_per_thread; - int num_tiles = (num_items + tile_size - 1) / tile_size; - - // Specify temporary storage allocation requirements - size_t allocation_sizes[1]; - if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors - - // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob) - void* allocations[1]; - if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; - if (d_temp_storage == NULL) - { - // Return if the caller is simply requesting the size of the storage allocation - break; - } - - // Construct the tile status interface - ScanTileStateT tile_status; - if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; - - // Log device_scan_init_kernel configuration - int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS); - if (debug_synchronous) _CubLog("Invoking device_scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); - - // Invoke device_scan_init_kernel to initialize tile descriptors and queue descriptors - device_scan_init_kernel<<>>( - tile_status, - num_tiles, - d_num_runs_out); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - - // Return if empty problem - if (num_items == 0) - break; - - // Get SM occupancy for device_rle_sweep_kernel - int device_rle_kernel_sm_occupancy; - if (CubDebug(error = MaxSmOccupancy( - device_rle_kernel_sm_occupancy, // out - device_rle_sweep_kernel, - device_rle_config.block_threads))) break; - - // Get max x-dimension of grid - int max_dim_x; - if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;; - - // Get grid size for scanning tiles - dim3 scan_grid_size; - scan_grid_size.z = 1; - scan_grid_size.y = ((unsigned int) num_tiles + max_dim_x - 1) / max_dim_x; - scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x); - - // Log device_rle_sweep_kernel configuration - if (debug_synchronous) _CubLog("Invoking device_rle_sweep_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", - scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, device_rle_config.block_threads, (long long) stream, 
device_rle_config.items_per_thread, device_rle_kernel_sm_occupancy); - - // Invoke device_rle_sweep_kernel - device_rle_sweep_kernel<<>>( - d_in, - d_offsets_out, - d_lengths_out, - d_num_runs_out, - tile_status, - equality_op, - num_items, - num_tiles); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - - } - while (0); - - return error; - -#endif // CUB_RUNTIME_ENABLED - } - - - /** - * Internal dispatch routine - */ - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to input sequence of data items - OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to output sequence of run-offsets - LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to output sequence of run-lengths - NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs (i.e., length of \p d_offsets_out) - EqualityOpT equality_op, ///< [in] Equality operator for input items - OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - cudaError error = cudaSuccess; - do - { - // Get PTX version - int ptx_version; - #if (CUB_PTX_ARCH == 0) - if (CubDebug(error = PtxVersion(ptx_version))) break; - #else - ptx_version = CUB_PTX_ARCH; - #endif - - // Get kernel kernel dispatch configurations - KernelConfig device_rle_config; - InitConfigs(ptx_version, device_rle_config); - - // Dispatch - if (CubDebug(error = Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_offsets_out, - d_lengths_out, - d_num_runs_out, - equality_op, - num_items, - stream, - debug_synchronous, - ptx_version, - DeviceCompactInitKernel, - DeviceRleSweepKernel, - device_rle_config))) break; - } - while (0); - - return error; - } -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/SRC/cub/device/dispatch/dispatch_scan.cuh b/SRC/cub/device/dispatch/dispatch_scan.cuh deleted file mode 100644 index 3ef720a4..00000000 --- a/SRC/cub/device/dispatch/dispatch_scan.cuh +++ /dev/null @@ -1,563 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within device-accessible memory. - */ - -#pragma once - -#include -#include - -#include "../../agent/agent_scan.cuh" -#include "../../thread/thread_operators.cuh" -#include "../../grid/grid_queue.cuh" -#include "../../util_arch.cuh" -#include "../../util_debug.cuh" -#include "../../util_device.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/****************************************************************************** - * Kernel entry points - *****************************************************************************/ - -/** - * Initialization kernel for tile status initialization (multi-block) - */ -template < - typename ScanTileStateT> ///< Tile status interface type -__global__ void DeviceScanInitKernel( - ScanTileStateT tile_state, ///< [in] Tile status interface - int num_tiles) ///< [in] Number of tiles -{ - // Initialize tile status - tile_state.InitializeStatus(num_tiles); -} - -/** - * Initialization kernel for tile status initialization (multi-block) - */ -template < - typename ScanTileStateT, ///< Tile status interface type - typename NumSelectedIteratorT> ///< Output iterator type for recording the number of items selected -__global__ void DeviceCompactInitKernel( - ScanTileStateT tile_state, ///< [in] Tile status interface - int num_tiles, ///< [in] Number of tiles - NumSelectedIteratorT d_num_selected_out) ///< [out] Pointer to the total number of items selected (i.e., length of \p d_selected_out) -{ - // Initialize tile status - tile_state.InitializeStatus(num_tiles); - - // Initialize d_num_selected_out - if ((blockIdx.x == 0) && (threadIdx.x == 0)) - *d_num_selected_out = 0; -} - - -/** - * Scan kernel entry point (multi-block) - */ -template < - typename ScanPolicyT, ///< Parameterized ScanPolicyT tuning policy type - typename InputIteratorT, ///< Random-access input iterator type for reading scan inputs \iterator - typename OutputIteratorT, ///< Random-access output iterator type for writing scan outputs \iterator - typename ScanTileStateT, ///< Tile status interface type - typename ScanOpT, ///< Binary scan functor type having member T operator()(const T &a, const T &b) - typename InitValueT, ///< Initial value to seed the exclusive scan (cub::NullType for inclusive scans) - typename 
-/**
- * Scan kernel entry point (multi-block)
- */
-template <
-    typename ScanPolicyT,       ///< Parameterized ScanPolicyT tuning policy type
-    typename InputIteratorT,    ///< Random-access input iterator type for reading scan inputs \iterator
-    typename OutputIteratorT,   ///< Random-access output iterator type for writing scan outputs \iterator
-    typename ScanTileStateT,    ///< Tile status interface type
-    typename ScanOpT,           ///< Binary scan functor type having member T operator()(const T &a, const T &b)
-    typename InitValueT,        ///< Initial value to seed the exclusive scan (cub::NullType for inclusive scans)
-    typename OffsetT>           ///< Signed integer type for global offsets
-__launch_bounds__ (int(ScanPolicyT::BLOCK_THREADS))
-__global__ void DeviceScanKernel(
-    InputIteratorT  d_in,           ///< Input data
-    OutputIteratorT d_out,          ///< Output data
-    ScanTileStateT  tile_state,     ///< Tile status interface
-    int             start_tile,     ///< The starting tile for the current grid
-    ScanOpT         scan_op,        ///< Binary scan functor
-    InitValueT      init_value,     ///< Initial value to seed the exclusive scan
-    OffsetT         num_items)      ///< Total number of scan items for the entire problem
-{
-    // Thread block type for scanning input tiles
-    typedef AgentScan<
-        ScanPolicyT,
-        InputIteratorT,
-        OutputIteratorT,
-        ScanOpT,
-        InitValueT,
-        OffsetT> AgentScanT;
-
-    // Shared memory for AgentScan
-    __shared__ typename AgentScanT::TempStorage temp_storage;
-
-    // Process tiles
-    AgentScanT(temp_storage, d_in, d_out, scan_op, init_value).ConsumeRange(
-        num_items,
-        tile_state,
-        start_tile);
-}
-
-
-/******************************************************************************
- * Dispatch
- ******************************************************************************/
-
-/**
- * Utility class for dispatching the appropriately-tuned kernels for DeviceScan
- */
-template <
-    typename InputIteratorT,    ///< Random-access input iterator type for reading scan inputs \iterator
-    typename OutputIteratorT,   ///< Random-access output iterator type for writing scan outputs \iterator
-    typename ScanOpT,           ///< Binary scan functor type having member T operator()(const T &a, const T &b)
-    typename InitValueT,        ///< The init_value element type for ScanOpT (cub::NullType for inclusive scans)
-    typename OffsetT>           ///< Signed integer type for global offsets
-struct DispatchScan
-{
-    //---------------------------------------------------------------------
-    // Constants and Types
-    //---------------------------------------------------------------------
-
-    enum
-    {
-        INIT_KERNEL_THREADS = 128
-    };
-
-    // The output value type
-    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT = (if output iterator's value type is void) ?
-        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
-        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ...
else the output iterator's value type - - // Tile status descriptor interface type - typedef ScanTileState ScanTileStateT; - - - //--------------------------------------------------------------------- - // Tuning policies - //--------------------------------------------------------------------- - - /// SM600 - struct Policy600 - { - typedef AgentScanPolicy< - CUB_SCALED_GRANULARITIES(128, 15, OutputT), ///< Threads per block, items per thread - BLOCK_LOAD_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_STORE_TRANSPOSE, - BLOCK_SCAN_WARP_SCANS> - ScanPolicyT; - }; - - - /// SM520 - struct Policy520 - { - // Titan X: 32.47B items/s @ 48M 32-bit T - typedef AgentScanPolicy< - CUB_SCALED_GRANULARITIES(128, 12, OutputT), ///< Threads per block, items per thread - BLOCK_LOAD_DIRECT, - LOAD_LDG, - BLOCK_STORE_WARP_TRANSPOSE, - BLOCK_SCAN_WARP_SCANS> - ScanPolicyT; - }; - - - /// SM35 - struct Policy350 - { - // GTX Titan: 29.5B items/s (232.4 GB/s) @ 48M 32-bit T - typedef AgentScanPolicy< - CUB_SCALED_GRANULARITIES(128, 12, OutputT), ///< Threads per block, items per thread - BLOCK_LOAD_DIRECT, - LOAD_LDG, - BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, - BLOCK_SCAN_RAKING> - ScanPolicyT; - }; - - /// SM30 - struct Policy300 - { - typedef AgentScanPolicy< - CUB_SCALED_GRANULARITIES(256, 9, OutputT), ///< Threads per block, items per thread - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_STORE_WARP_TRANSPOSE, - BLOCK_SCAN_WARP_SCANS> - ScanPolicyT; - }; - - /// SM20 - struct Policy200 - { - // GTX 580: 20.3B items/s (162.3 GB/s) @ 48M 32-bit T - typedef AgentScanPolicy< - CUB_SCALED_GRANULARITIES(128, 12, OutputT), ///< Threads per block, items per thread - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_STORE_WARP_TRANSPOSE, - BLOCK_SCAN_WARP_SCANS> - ScanPolicyT; - }; - - /// SM13 - struct Policy130 - { - typedef AgentScanPolicy< - CUB_SCALED_GRANULARITIES(96, 21, OutputT), ///< Threads per block, items per thread - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_STORE_WARP_TRANSPOSE, - BLOCK_SCAN_RAKING_MEMOIZE> - ScanPolicyT; - }; - - /// SM10 - struct Policy100 - { - typedef AgentScanPolicy< - CUB_SCALED_GRANULARITIES(64, 9, OutputT), ///< Threads per block, items per thread - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_STORE_WARP_TRANSPOSE, - BLOCK_SCAN_WARP_SCANS> - ScanPolicyT; - }; - - - //--------------------------------------------------------------------- - // Tuning policies of current PTX compiler pass - //--------------------------------------------------------------------- - -#if (CUB_PTX_ARCH >= 600) - typedef Policy600 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 520) - typedef Policy520 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 350) - typedef Policy350 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 300) - typedef Policy300 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 200) - typedef Policy200 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 130) - typedef Policy130 PtxPolicy; - -#else - typedef Policy100 PtxPolicy; - -#endif - - // "Opaque" policies (whose parameterizations aren't reflected in the type signature) - struct PtxAgentScanPolicy : PtxPolicy::ScanPolicyT {}; - - - //--------------------------------------------------------------------- - // Utilities - //--------------------------------------------------------------------- - - /** - * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use - */ - template - CUB_RUNTIME_FUNCTION __forceinline__ - static void InitConfigs( - int ptx_version, - KernelConfig &scan_kernel_config) - { - #if (CUB_PTX_ARCH > 0) - 
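// A sketch, not from this file: the #if above takes the device-side branch,
// where only the current compiler pass's CUB_PTX_ARCH is visible, so the
// matching policy is chosen statically; the host-side #else below queries the
// runtime PTX version and walks the same version ladder the policy structs
// define. A standalone rendering of that ladder (function name is a
// placeholder; cub::PtxVersion is the library's real runtime query):
int SelectPolicyArch()
{
    int ptx_version = 0;
    cub::PtxVersion(ptx_version);              // runtime PTX-ISA query
    if      (ptx_version >= 600) return 600;   // Policy600
    else if (ptx_version >= 520) return 520;   // Policy520
    else if (ptx_version >= 350) return 350;   // Policy350
    else if (ptx_version >= 300) return 300;   // Policy300
    else if (ptx_version >= 200) return 200;   // Policy200
    else if (ptx_version >= 130) return 130;   // Policy130
    else                         return 100;   // Policy100
}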
(void)ptx_version; - - // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy - scan_kernel_config.template Init(); - - #else - - // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version - if (ptx_version >= 600) - { - scan_kernel_config.template Init(); - } - else if (ptx_version >= 520) - { - scan_kernel_config.template Init(); - } - else if (ptx_version >= 350) - { - scan_kernel_config.template Init(); - } - else if (ptx_version >= 300) - { - scan_kernel_config.template Init(); - } - else if (ptx_version >= 200) - { - scan_kernel_config.template Init(); - } - else if (ptx_version >= 130) - { - scan_kernel_config.template Init(); - } - else - { - scan_kernel_config.template Init(); - } - - #endif - } - - - /** - * Kernel kernel dispatch configuration. - */ - struct KernelConfig - { - int block_threads; - int items_per_thread; - int tile_items; - - template - CUB_RUNTIME_FUNCTION __forceinline__ - void Init() - { - block_threads = PolicyT::BLOCK_THREADS; - items_per_thread = PolicyT::ITEMS_PER_THREAD; - tile_items = block_threads * items_per_thread; - } - }; - - - //--------------------------------------------------------------------- - // Dispatch entrypoints - //--------------------------------------------------------------------- - - /** - * Internal dispatch routine for computing a device-wide prefix scan using the - * specified kernel functions. - */ - template < - typename ScanInitKernelPtrT, ///< Function type of cub::DeviceScanInitKernel - typename ScanSweepKernelPtrT> ///< Function type of cub::DeviceScanKernelPtrT - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items - ScanOpT scan_op, ///< [in] Binary scan functor - InitValueT init_value, ///< [in] Initial value to seed the exclusive scan - OffsetT num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) - cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
- int /*ptx_version*/, ///< [in] PTX version of dispatch kernels - ScanInitKernelPtrT init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel - ScanSweepKernelPtrT scan_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanKernel - KernelConfig scan_kernel_config) ///< [in] Dispatch parameters that match the policy that \p scan_kernel was compiled for - { - -#ifndef CUB_RUNTIME_ENABLED - (void)d_temp_storage; - (void)temp_storage_bytes; - (void)d_in; - (void)d_out; - (void)scan_op; - (void)init_value; - (void)num_items; - (void)stream; - (void)debug_synchronous; - (void)init_kernel; - (void)scan_kernel; - (void)scan_kernel_config; - - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported); - -#else - cudaError error = cudaSuccess; - do - { - // Get device ordinal - int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; - - // Get SM count - int sm_count; - if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; - - // Number of input tiles - int tile_size = scan_kernel_config.block_threads * scan_kernel_config.items_per_thread; - int num_tiles = (num_items + tile_size - 1) / tile_size; - - // Specify temporary storage allocation requirements - size_t allocation_sizes[1]; - if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors - - // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob) - void* allocations[1]; - if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; - if (d_temp_storage == NULL) - { - // Return if the caller is simply requesting the size of the storage allocation - break; - } - - // Return if empty problem - if (num_items == 0) - break; - - // Construct the tile status interface - ScanTileStateT tile_state; - if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; - - // Log init_kernel configuration - int init_grid_size = (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS; - if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); - - // Invoke init_kernel to initialize tile descriptors - init_kernel<<>>( - tile_state, - num_tiles); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - - // Get SM occupancy for scan_kernel - int scan_sm_occupancy; - if (CubDebug(error = MaxSmOccupancy( - scan_sm_occupancy, // out - scan_kernel, - scan_kernel_config.block_threads))) break; - - // Get max x-dimension of grid - int max_dim_x; - if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;; - - // Run grids in epochs (in case number of tiles exceeds max x-dimension - int scan_grid_size = CUB_MIN(num_tiles, max_dim_x); - for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size) - { - // Log scan_kernel configuration - if (debug_synchronous) _CubLog("Invoking %d scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", - start_tile, scan_grid_size, scan_kernel_config.block_threads, (long long) stream, scan_kernel_config.items_per_thread, 
scan_sm_occupancy); - - // Invoke scan_kernel - scan_kernel<<>>( - d_in, - d_out, - tile_state, - start_tile, - scan_op, - init_value, - num_items); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - } - } - while (0); - - return error; - -#endif // CUB_RUNTIME_ENABLED - } - - - /** - * Internal dispatch routine - */ - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items - ScanOpT scan_op, ///< [in] Binary scan functor - InitValueT init_value, ///< [in] Initial value to seed the exclusive scan - OffsetT num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) - cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - cudaError error = cudaSuccess; - do - { - // Get PTX version - int ptx_version; - if (CubDebug(error = PtxVersion(ptx_version))) break; - - // Get kernel kernel dispatch configurations - KernelConfig scan_kernel_config; - InitConfigs(ptx_version, scan_kernel_config); - - // Dispatch - if (CubDebug(error = Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_out, - scan_op, - init_value, - num_items, - stream, - debug_synchronous, - ptx_version, - DeviceScanInitKernel, - DeviceScanKernel, - scan_kernel_config))) break; - } - while (0); - - return error; - } -}; - - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/SRC/cub/device/dispatch/dispatch_select_if.cuh b/SRC/cub/device/dispatch/dispatch_select_if.cuh deleted file mode 100644 index 60b33133..00000000 --- a/SRC/cub/device/dispatch/dispatch_select_if.cuh +++ /dev/null @@ -1,542 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. 
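The select/partition dispatch deleted below follows the same shape as the scan dispatch above: an init kernel seeds tile status, then a sweep kernel streams tiles with decoupled look-back. For orientation, a host-side sketch of the public entry point that lands in DispatchSelectIf; names ending in `Sketch` and the functor are placeholders, while `cub::DeviceSelect::If` and its two-phase calling convention are the documented API.

#include <cub/cub.cuh>

struct GreaterThanZero
{
    __host__ __device__ bool operator()(const int &a) const { return a > 0; }
};

cudaError_t SelectSketch(int *d_in, int *d_out, int *d_num_selected_out, int num_items)
{
    void   *d_temp_storage    = NULL;
    size_t  temp_storage_bytes = 0;
    GreaterThanZero op;

    // Size query, then the real pass, as with all CUB device-wide algorithms.
    cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, op);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, op);
    return cudaFree(d_temp_storage);
}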
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::DeviceSelect provides device-wide, parallel operations for selecting items from sequences of data items residing within device-accessible memory. - */ - -#pragma once - -#include -#include - -#include "dispatch_scan.cuh" -#include "../../agent/agent_select_if.cuh" -#include "../../thread/thread_operators.cuh" -#include "../../grid/grid_queue.cuh" -#include "../../util_device.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/****************************************************************************** - * Kernel entry points - *****************************************************************************/ - -/** - * Select kernel entry point (multi-block) - * - * Performs functor-based selection if SelectOpT functor type != NullType - * Otherwise performs flag-based selection if FlagsInputIterator's value type != NullType - * Otherwise performs discontinuity selection (keep unique) - */ -template < - typename AgentSelectIfPolicyT, ///< Parameterized AgentSelectIfPolicyT tuning policy type - typename InputIteratorT, ///< Random-access input iterator type for reading input items - typename FlagsInputIteratorT, ///< Random-access input iterator type for reading selection flags (NullType* if a selection functor or discontinuity flagging is to be used for selection) - typename SelectedOutputIteratorT, ///< Random-access output iterator type for writing selected items - typename NumSelectedIteratorT, ///< Output iterator type for recording the number of items selected - typename ScanTileStateT, ///< Tile status interface type - typename SelectOpT, ///< Selection operator type (NullType if selection flags or discontinuity flagging is to be used for selection) - typename EqualityOpT, ///< Equality operator type (NullType if selection functor or selection flags is to be used for selection) - typename OffsetT, ///< Signed integer type for global offsets - bool KEEP_REJECTS> ///< Whether or not we push rejected items to the back of the output -__launch_bounds__ (int(AgentSelectIfPolicyT::BLOCK_THREADS)) -__global__ void DeviceSelectSweepKernel( - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - FlagsInputIteratorT d_flags, ///< [in] Pointer to the input sequence of selection flags (if applicable) - SelectedOutputIteratorT d_selected_out, ///< [out] Pointer to the output sequence of selected data items - NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the total number of items selected (i.e., length of \p d_selected_out) - ScanTileStateT tile_status, ///< [in] Tile status interface - SelectOpT 
select_op,              ///< [in] Selection operator
-    EqualityOpT             equality_op,            ///< [in] Equality operator
-    OffsetT                 num_items,              ///< [in] Total number of input items (i.e., length of \p d_in)
-    int                     num_tiles)              ///< [in] Total number of tiles for the entire problem
-{
-    // Thread block type for selecting data from input tiles
-    typedef AgentSelectIf<
-        AgentSelectIfPolicyT,
-        InputIteratorT,
-        FlagsInputIteratorT,
-        SelectedOutputIteratorT,
-        SelectOpT,
-        EqualityOpT,
-        OffsetT,
-        KEEP_REJECTS> AgentSelectIfT;
-
-    // Shared memory for AgentSelectIf
-    __shared__ typename AgentSelectIfT::TempStorage temp_storage;
-
-    // Process tiles
-    AgentSelectIfT(temp_storage, d_in, d_flags, d_selected_out, select_op, equality_op, num_items).ConsumeRange(
-        num_tiles,
-        tile_status,
-        d_num_selected_out);
-}
-
-
-/******************************************************************************
- * Dispatch
- ******************************************************************************/
-
-/**
- * Utility class for dispatching the appropriately-tuned kernels for DeviceSelect
- */
-template <
-    typename InputIteratorT,            ///< Random-access input iterator type for reading input items
-    typename FlagsInputIteratorT,       ///< Random-access input iterator type for reading selection flags (NullType* if a selection functor or discontinuity flagging is to be used for selection)
-    typename SelectedOutputIteratorT,   ///< Random-access output iterator type for writing selected items
-    typename NumSelectedIteratorT,      ///< Output iterator type for recording the number of items selected
-    typename SelectOpT,                 ///< Selection operator type (NullType if selection flags or discontinuity flagging is to be used for selection)
-    typename EqualityOpT,               ///< Equality operator type (NullType if selection functor or selection flags is to be used for selection)
-    typename OffsetT,                   ///< Signed integer type for global offsets
-    bool KEEP_REJECTS>                  ///< Whether or not we push rejected items to the back of the output
-struct DispatchSelectIf
-{
-    /******************************************************************************
-     * Types and constants
-     ******************************************************************************/
-
-    // The output value type
-    typedef typename If<(Equals<typename std::iterator_traits<SelectedOutputIteratorT>::value_type, void>::VALUE),  // OutputT = (if output iterator's value type is void) ?
-        typename std::iterator_traits<InputIteratorT>::value_type,                                                  // ... then the input iterator's value type,
-        typename std::iterator_traits<SelectedOutputIteratorT>::value_type>::Type OutputT;                          // ...
else the output iterator's value type - - // The flag value type - typedef typename std::iterator_traits::value_type FlagT; - - enum - { - INIT_KERNEL_THREADS = 128, - }; - - // Tile status descriptor interface type - typedef ScanTileState ScanTileStateT; - - - /****************************************************************************** - * Tuning policies - ******************************************************************************/ - - /// SM35 - struct Policy350 - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 10, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))), - }; - - typedef AgentSelectIfPolicy< - 128, - ITEMS_PER_THREAD, - BLOCK_LOAD_DIRECT, - LOAD_LDG, - BLOCK_SCAN_WARP_SCANS> - SelectIfPolicyT; - }; - - /// SM30 - struct Policy300 - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 7, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(3, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))), - }; - - typedef AgentSelectIfPolicy< - 128, - ITEMS_PER_THREAD, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS> - SelectIfPolicyT; - }; - - /// SM20 - struct Policy200 - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = (KEEP_REJECTS) ? 7 : 15, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))), - }; - - typedef AgentSelectIfPolicy< - 128, - ITEMS_PER_THREAD, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS> - SelectIfPolicyT; - }; - - /// SM13 - struct Policy130 - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 9, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))), - }; - - typedef AgentSelectIfPolicy< - 64, - ITEMS_PER_THREAD, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_RAKING_MEMOIZE> - SelectIfPolicyT; - }; - - /// SM10 - struct Policy100 - { - enum { - NOMINAL_4B_ITEMS_PER_THREAD = 9, - ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))), - }; - - typedef AgentSelectIfPolicy< - 64, - ITEMS_PER_THREAD, - BLOCK_LOAD_WARP_TRANSPOSE, - LOAD_DEFAULT, - BLOCK_SCAN_RAKING> - SelectIfPolicyT; - }; - - - /****************************************************************************** - * Tuning policies of current PTX compiler pass - ******************************************************************************/ - -#if (CUB_PTX_ARCH >= 350) - typedef Policy350 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 300) - typedef Policy300 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 200) - typedef Policy200 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 130) - typedef Policy130 PtxPolicy; - -#else - typedef Policy100 PtxPolicy; - -#endif - - // "Opaque" policies (whose parameterizations aren't reflected in the type signature) - struct PtxSelectIfPolicyT : PtxPolicy::SelectIfPolicyT {}; - - - /****************************************************************************** - * Utilities - ******************************************************************************/ - - /** - * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use - */ - template - CUB_RUNTIME_FUNCTION __forceinline__ - static void InitConfigs( - int ptx_version, - KernelConfig &select_if_config) - { - #if (CUB_PTX_ARCH > 0) - (void)ptx_version; - - // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy - select_if_config.template 
Init(); - - #else - - // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version - if (ptx_version >= 350) - { - select_if_config.template Init(); - } - else if (ptx_version >= 300) - { - select_if_config.template Init(); - } - else if (ptx_version >= 200) - { - select_if_config.template Init(); - } - else if (ptx_version >= 130) - { - select_if_config.template Init(); - } - else - { - select_if_config.template Init(); - } - - #endif - } - - - /** - * Kernel kernel dispatch configuration. - */ - struct KernelConfig - { - int block_threads; - int items_per_thread; - int tile_items; - - template - CUB_RUNTIME_FUNCTION __forceinline__ - void Init() - { - block_threads = PolicyT::BLOCK_THREADS; - items_per_thread = PolicyT::ITEMS_PER_THREAD; - tile_items = block_threads * items_per_thread; - } - }; - - - /****************************************************************************** - * Dispatch entrypoints - ******************************************************************************/ - - /** - * Internal dispatch routine for computing a device-wide selection using the - * specified kernel functions. - */ - template < - typename ScanInitKernelPtrT, ///< Function type of cub::DeviceScanInitKernel - typename SelectIfKernelPtrT> ///< Function type of cub::SelectIfKernelPtrT - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - FlagsInputIteratorT d_flags, ///< [in] Pointer to the input sequence of selection flags (if applicable) - SelectedOutputIteratorT d_selected_out, ///< [in] Pointer to the output sequence of selected data items - NumSelectedIteratorT d_num_selected_out, ///< [in] Pointer to the total number of items selected (i.e., length of \p d_selected_out) - SelectOpT select_op, ///< [in] Selection operator - EqualityOpT equality_op, ///< [in] Equality operator - OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
- int /*ptx_version*/, ///< [in] PTX version of dispatch kernels - ScanInitKernelPtrT scan_init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel - SelectIfKernelPtrT select_if_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceSelectSweepKernel - KernelConfig select_if_config) ///< [in] Dispatch parameters that match the policy that \p select_if_kernel was compiled for - { - -#ifndef CUB_RUNTIME_ENABLED - (void)d_temp_storage; - (void)temp_storage_bytes; - (void)d_in; - (void)d_flags; - (void)d_selected_out; - (void)d_num_selected_out; - (void)select_op; - (void)equality_op; - (void)num_items; - (void)stream; - (void)debug_synchronous; - (void)scan_init_kernel; - (void)select_if_kernel; - (void)select_if_config; - - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported); - -#else - - cudaError error = cudaSuccess; - do - { - // Get device ordinal - int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; - - // Get SM count - int sm_count; - if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; - - // Number of input tiles - int tile_size = select_if_config.block_threads * select_if_config.items_per_thread; - int num_tiles = (num_items + tile_size - 1) / tile_size; - - // Specify temporary storage allocation requirements - size_t allocation_sizes[1]; - if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors - - // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob) - void* allocations[1]; - if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; - if (d_temp_storage == NULL) - { - // Return if the caller is simply requesting the size of the storage allocation - break; - } - - // Construct the tile status interface - ScanTileStateT tile_status; - if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; - - // Log scan_init_kernel configuration - int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS); - if (debug_synchronous) _CubLog("Invoking scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); - - // Invoke scan_init_kernel to initialize tile descriptors - scan_init_kernel<<>>( - tile_status, - num_tiles, - d_num_selected_out); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - - // Return if empty problem - if (num_items == 0) - break; - - // Get SM occupancy for select_if_kernel - int range_select_sm_occupancy; - if (CubDebug(error = MaxSmOccupancy( - range_select_sm_occupancy, // out - select_if_kernel, - select_if_config.block_threads))) break; - - // Get max x-dimension of grid - int max_dim_x; - if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;; - - // Get grid size for scanning tiles - dim3 scan_grid_size; - scan_grid_size.z = 1; - scan_grid_size.y = ((unsigned int) num_tiles + max_dim_x - 1) / max_dim_x; - scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x); - - // Log select_if_kernel configuration - if (debug_synchronous) _CubLog("Invoking 
select_if_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", - scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, select_if_config.block_threads, (long long) stream, select_if_config.items_per_thread, range_select_sm_occupancy); - - // Invoke select_if_kernel - select_if_kernel<<>>( - d_in, - d_flags, - d_selected_out, - d_num_selected_out, - tile_status, - select_op, - equality_op, - num_items, - num_tiles); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - } - while (0); - - return error; - -#endif // CUB_RUNTIME_ENABLED - } - - - /** - * Internal dispatch routine - */ - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items - FlagsInputIteratorT d_flags, ///< [in] Pointer to the input sequence of selection flags (if applicable) - SelectedOutputIteratorT d_selected_out, ///< [in] Pointer to the output sequence of selected data items - NumSelectedIteratorT d_num_selected_out, ///< [in] Pointer to the total number of items selected (i.e., length of \p d_selected_out) - SelectOpT select_op, ///< [in] Selection operator - EqualityOpT equality_op, ///< [in] Equality operator - OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) - cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. - { - cudaError error = cudaSuccess; - do - { - // Get PTX version - int ptx_version; - #if (CUB_PTX_ARCH == 0) - if (CubDebug(error = PtxVersion(ptx_version))) break; - #else - ptx_version = CUB_PTX_ARCH; - #endif - - // Get kernel kernel dispatch configurations - KernelConfig select_if_config; - InitConfigs(ptx_version, select_if_config); - - // Dispatch - if (CubDebug(error = Dispatch( - d_temp_storage, - temp_storage_bytes, - d_in, - d_flags, - d_selected_out, - d_num_selected_out, - select_op, - equality_op, - num_items, - stream, - debug_synchronous, - ptx_version, - DeviceCompactInitKernel, - DeviceSelectSweepKernel, - select_if_config))) break; - } - while (0); - - return error; - } -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/SRC/cub/device/dispatch/dispatch_spmv_orig.cuh b/SRC/cub/device/dispatch/dispatch_spmv_orig.cuh deleted file mode 100644 index ab9c5346..00000000 --- a/SRC/cub/device/dispatch/dispatch_spmv_orig.cuh +++ /dev/null @@ -1,834 +0,0 @@ - -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
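The SpMV implementation deleted below decomposes y = A·x over a CSR matrix into equal-sized merge-path tiles, then fixes up rows that straddle tile boundaries. A host-side sketch of the public entry point it backs; `SpmvSketch` is a placeholder name, while `cub::DeviceSpmv::CsrMV` and its argument order are the documented API.

#include <cub/cub.cuh>

cudaError_t SpmvSketch(float *d_values, int *d_row_offsets, int *d_column_indices,
                       float *d_x, float *d_y, int num_rows, int num_cols, int num_nonzeros)
{
    void   *d_temp_storage    = NULL;
    size_t  temp_storage_bytes = 0;

    // Size query, allocation, then the real multiply (y = A*x).
    cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values, d_row_offsets,
                           d_column_indices, d_x, d_y, num_rows, num_cols, num_nonzeros);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values, d_row_offsets,
                           d_column_indices, d_x, d_y, num_rows, num_cols, num_nonzeros);
    return cudaFree(d_temp_storage);
}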
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector multiplication (SpMV). - */ - -#pragma once - -#include -#include - -#include "../../agent/single_pass_scan_operators.cuh" -#include "../../agent/agent_segment_fixup.cuh" -#include "../../agent/agent_spmv_orig.cuh" -#include "../../util_type.cuh" -#include "../../util_debug.cuh" -#include "../../util_device.cuh" -#include "../../thread/thread_search.cuh" -#include "../../grid/grid_queue.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/****************************************************************************** - * SpMV kernel entry points - *****************************************************************************/ - -/** - * Spmv search kernel. Identifies merge path starting coordinates for each tile. - */ -template < - typename AgentSpmvPolicyT, ///< Parameterized SpmvPolicy tuning policy type - typename ValueT, ///< Matrix and vector value type - typename OffsetT> ///< Signed integer type for sequence offsets -__global__ void DeviceSpmv1ColKernel( - SpmvParams spmv_params) ///< [in] SpMV input parameter bundle -{ - typedef CacheModifiedInputIterator< - AgentSpmvPolicyT::VECTOR_VALUES_LOAD_MODIFIER, - ValueT, - OffsetT> - VectorValueIteratorT; - - VectorValueIteratorT wrapped_vector_x(spmv_params.d_vector_x); - - int row_idx = (blockIdx.x * blockDim.x) + threadIdx.x; - if (row_idx < spmv_params.num_rows) - { - OffsetT end_nonzero_idx = spmv_params.d_row_end_offsets[row_idx]; - OffsetT nonzero_idx = spmv_params.d_row_end_offsets[row_idx - 1]; - - ValueT value = 0.0; - if (end_nonzero_idx != nonzero_idx) - { - value = spmv_params.d_values[nonzero_idx] * wrapped_vector_x[spmv_params.d_column_indices[nonzero_idx]]; - } - - spmv_params.d_vector_y[row_idx] = value; - } -} - - -/** - * Spmv search kernel. 
Identifies merge path starting coordinates for each tile. - */ -template < - typename SpmvPolicyT, ///< Parameterized SpmvPolicy tuning policy type - typename OffsetT, ///< Signed integer type for sequence offsets - typename CoordinateT, ///< Merge path coordinate type - typename SpmvParamsT> ///< SpmvParams type -__global__ void DeviceSpmvSearchKernel( - int num_merge_tiles, ///< [in] Number of SpMV merge tiles (spmv grid size) - CoordinateT* d_tile_coordinates, ///< [out] Pointer to the temporary array of tile starting coordinates - SpmvParamsT spmv_params) ///< [in] SpMV input parameter bundle -{ - /// Constants - enum - { - BLOCK_THREADS = SpmvPolicyT::BLOCK_THREADS, - ITEMS_PER_THREAD = SpmvPolicyT::ITEMS_PER_THREAD, - TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, - }; - - typedef CacheModifiedInputIterator< - SpmvPolicyT::ROW_OFFSETS_SEARCH_LOAD_MODIFIER, - OffsetT, - OffsetT> - RowOffsetsSearchIteratorT; - - // Find the starting coordinate for all tiles (plus the end coordinate of the last one) - int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; - if (tile_idx < num_merge_tiles + 1) - { - OffsetT diagonal = (tile_idx * TILE_ITEMS); - CoordinateT tile_coordinate; - CountingInputIterator nonzero_indices(0); - - // Search the merge path - MergePathSearch( - diagonal, - RowOffsetsSearchIteratorT(spmv_params.d_row_end_offsets), - nonzero_indices, - spmv_params.num_rows, - spmv_params.num_nonzeros, - tile_coordinate); - - // Output starting offset - d_tile_coordinates[tile_idx] = tile_coordinate; - } -} - - -/** - * Spmv agent entry point - */ -template < - typename SpmvPolicyT, ///< Parameterized SpmvPolicy tuning policy type - typename ScanTileStateT, ///< Tile status interface type - typename ValueT, ///< Matrix and vector value type - typename OffsetT, ///< Signed integer type for sequence offsets - typename CoordinateT, ///< Merge path coordinate type - bool HAS_ALPHA, ///< Whether the input parameter Alpha is 1 - bool HAS_BETA> ///< Whether the input parameter Beta is 0 -__launch_bounds__ (int(SpmvPolicyT::BLOCK_THREADS)) -__global__ void DeviceSpmvKernel( - SpmvParams spmv_params, ///< [in] SpMV input parameter bundle - CoordinateT* d_tile_coordinates, ///< [in] Pointer to the temporary array of tile starting coordinates - KeyValuePair* d_tile_carry_pairs, ///< [out] Pointer to the temporary array carry-out dot product row-ids, one per block - int num_tiles, ///< [in] Number of merge tiles - ScanTileStateT tile_state, ///< [in] Tile status interface for fixup reduce-by-key kernel - int num_segment_fixup_tiles) ///< [in] Number of reduce-by-key tiles (fixup grid size) -{ - // Spmv agent type specialization - typedef AgentSpmv< - SpmvPolicyT, - ValueT, - OffsetT, - HAS_ALPHA, - HAS_BETA> - AgentSpmvT; - - // Shared memory for AgentSpmv - __shared__ typename AgentSpmvT::TempStorage temp_storage; - - AgentSpmvT(temp_storage, spmv_params).ConsumeTile( - d_tile_coordinates, - d_tile_carry_pairs, - num_tiles); - - // Initialize fixup tile status - tile_state.InitializeStatus(num_segment_fixup_tiles); - -} - - -/** - * Multi-block reduce-by-key sweep kernel entry point - */ -template < - typename AgentSegmentFixupPolicyT, ///< Parameterized AgentSegmentFixupPolicy tuning policy type - typename PairsInputIteratorT, ///< Random-access input iterator type for keys - typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values - typename OffsetT, ///< Signed integer type for global offsets - typename ScanTileStateT> ///< Tile status interface type 
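// The search kernel above relies on merge-path partitioning: SpMV is viewed
// as a "merge" of the row-end offsets with the counting sequence
// 0..num_nonzeros-1, and each tile starts where its diagonal crosses the
// merge path. A sequential sketch of that coordinate search, mirroring the
// MergePathSearch call above (struct and function names are illustrative):
struct Coord { int row; int nonzero; };

Coord MergePathSearchRef(int diagonal, const int *row_end_offsets,
                         int num_rows, int num_nonzeros)
{
    // Binary-search the diagonal for the split between consumed row endings
    // (x) and consumed nonzeros (y = diagonal - x).
    int x_min = diagonal > num_nonzeros ? diagonal - num_nonzeros : 0;
    int x_max = diagonal < num_rows ? diagonal : num_rows;
    while (x_min < x_max)
    {
        int pivot = (x_min + x_max) / 2;
        if (row_end_offsets[pivot] <= diagonal - pivot - 1)
            x_min = pivot + 1;   // row ends before the diagonal: move down A
        else
            x_max = pivot;       // otherwise move down B (the nonzeros)
    }
    return Coord{ x_min, diagonal - x_min };
}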
-__launch_bounds__ (int(AgentSegmentFixupPolicyT::BLOCK_THREADS)) -__global__ void DeviceSegmentFixupKernel( - PairsInputIteratorT d_pairs_in, ///< [in] Pointer to the array carry-out dot product row-ids, one per spmv block - AggregatesOutputIteratorT d_aggregates_out, ///< [in,out] Output value aggregates - OffsetT num_items, ///< [in] Total number of items to select from - int num_tiles, ///< [in] Total number of tiles for the entire problem - ScanTileStateT tile_state) ///< [in] Tile status interface -{ - // Thread block type for reducing tiles of value segments - typedef AgentSegmentFixup< - AgentSegmentFixupPolicyT, - PairsInputIteratorT, - AggregatesOutputIteratorT, - cub::Equality, - cub::Sum, - OffsetT> - AgentSegmentFixupT; - - // Shared memory for AgentSegmentFixup - __shared__ typename AgentSegmentFixupT::TempStorage temp_storage; - - // Process tiles - AgentSegmentFixupT(temp_storage, d_pairs_in, d_aggregates_out, cub::Equality(), cub::Sum()).ConsumeRange( - num_items, - num_tiles, - tile_state); -} - - -/****************************************************************************** - * Dispatch - ******************************************************************************/ - -/** - * Utility class for dispatching the appropriately-tuned kernels for DeviceSpmv - */ -template < - typename ValueT, ///< Matrix and vector value type - typename OffsetT> ///< Signed integer type for global offsets -struct DispatchSpmv -{ - //--------------------------------------------------------------------- - // Constants and Types - //--------------------------------------------------------------------- - - enum - { - INIT_KERNEL_THREADS = 128 - }; - - // SpmvParams bundle type - typedef SpmvParams SpmvParamsT; - - // 2D merge path coordinate type - typedef typename CubVector::Type CoordinateT; - - // Tile status descriptor interface type - typedef ReduceByKeyScanTileState ScanTileStateT; - - // Tuple type for scanning (pairs accumulated segment-value with segment-index) - typedef KeyValuePair KeyValuePairT; - - - //--------------------------------------------------------------------- - // Tuning policies - //--------------------------------------------------------------------- - - /// SM11 - struct Policy110 - { - typedef AgentSpmvPolicy< - 128, - 1, - LOAD_DEFAULT, - LOAD_DEFAULT, - LOAD_DEFAULT, - LOAD_DEFAULT, - LOAD_DEFAULT, - false, - BLOCK_SCAN_WARP_SCANS> - SpmvPolicyT; - - typedef AgentSegmentFixupPolicy< - 128, - 4, - BLOCK_LOAD_VECTORIZE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS> - SegmentFixupPolicyT; - }; - - /// SM20 - struct Policy200 - { - typedef AgentSpmvPolicy< - 96, - 18, - LOAD_DEFAULT, - LOAD_DEFAULT, - LOAD_DEFAULT, - LOAD_DEFAULT, - LOAD_DEFAULT, - false, - BLOCK_SCAN_RAKING> - SpmvPolicyT; - - typedef AgentSegmentFixupPolicy< - 128, - 4, - BLOCK_LOAD_VECTORIZE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS> - SegmentFixupPolicyT; - - }; - - - - /// SM30 - struct Policy300 - { - typedef AgentSpmvPolicy< - 96, - 6, - LOAD_DEFAULT, - LOAD_DEFAULT, - LOAD_DEFAULT, - LOAD_DEFAULT, - LOAD_DEFAULT, - false, - BLOCK_SCAN_WARP_SCANS> - SpmvPolicyT; - - typedef AgentSegmentFixupPolicy< - 128, - 4, - BLOCK_LOAD_VECTORIZE, - LOAD_DEFAULT, - BLOCK_SCAN_WARP_SCANS> - SegmentFixupPolicyT; - - }; - - - /// SM35 - struct Policy350 - { - typedef AgentSpmvPolicy< - (sizeof(ValueT) > 4) ? 96 : 128, - (sizeof(ValueT) > 4) ? 4 : 7, - LOAD_LDG, - LOAD_CA, - LOAD_LDG, - LOAD_LDG, - LOAD_LDG, - (sizeof(ValueT) > 4) ? 
true : false, - BLOCK_SCAN_WARP_SCANS> - SpmvPolicyT; - - typedef AgentSegmentFixupPolicy< - 128, - 3, - BLOCK_LOAD_VECTORIZE, - LOAD_LDG, - BLOCK_SCAN_WARP_SCANS> - SegmentFixupPolicyT; - }; - - - /// SM37 - struct Policy370 - { - - typedef AgentSpmvPolicy< - (sizeof(ValueT) > 4) ? 128 : 128, - (sizeof(ValueT) > 4) ? 9 : 14, - LOAD_LDG, - LOAD_CA, - LOAD_LDG, - LOAD_LDG, - LOAD_LDG, - false, - BLOCK_SCAN_WARP_SCANS> - SpmvPolicyT; - - typedef AgentSegmentFixupPolicy< - 128, - 3, - BLOCK_LOAD_VECTORIZE, - LOAD_LDG, - BLOCK_SCAN_WARP_SCANS> - SegmentFixupPolicyT; - }; - - /// SM50 - struct Policy500 - { - typedef AgentSpmvPolicy< - (sizeof(ValueT) > 4) ? 64 : 128, - (sizeof(ValueT) > 4) ? 6 : 7, - LOAD_LDG, - LOAD_DEFAULT, - (sizeof(ValueT) > 4) ? LOAD_LDG : LOAD_DEFAULT, - (sizeof(ValueT) > 4) ? LOAD_LDG : LOAD_DEFAULT, - LOAD_LDG, - (sizeof(ValueT) > 4) ? true : false, - (sizeof(ValueT) > 4) ? BLOCK_SCAN_WARP_SCANS : BLOCK_SCAN_RAKING_MEMOIZE> - SpmvPolicyT; - - - typedef AgentSegmentFixupPolicy< - 128, - 3, - BLOCK_LOAD_VECTORIZE, - LOAD_LDG, - BLOCK_SCAN_RAKING_MEMOIZE> - SegmentFixupPolicyT; - }; - - - /// SM60 - struct Policy600 - { - typedef AgentSpmvPolicy< - (sizeof(ValueT) > 4) ? 64 : 128, - (sizeof(ValueT) > 4) ? 5 : 7, - LOAD_DEFAULT, - LOAD_DEFAULT, - LOAD_DEFAULT, - LOAD_DEFAULT, - LOAD_DEFAULT, - false, - BLOCK_SCAN_WARP_SCANS> - SpmvPolicyT; - - - typedef AgentSegmentFixupPolicy< - 128, - 3, - BLOCK_LOAD_DIRECT, - LOAD_LDG, - BLOCK_SCAN_WARP_SCANS> - SegmentFixupPolicyT; - }; - - - - //--------------------------------------------------------------------- - // Tuning policies of current PTX compiler pass - //--------------------------------------------------------------------- - -#if (CUB_PTX_ARCH >= 600) - typedef Policy600 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 500) - typedef Policy500 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 370) - typedef Policy370 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 350) - typedef Policy350 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 300) - typedef Policy300 PtxPolicy; - -#elif (CUB_PTX_ARCH >= 200) - typedef Policy200 PtxPolicy; - -#else - typedef Policy110 PtxPolicy; - -#endif - - // "Opaque" policies (whose parameterizations aren't reflected in the type signature) - struct PtxSpmvPolicyT : PtxPolicy::SpmvPolicyT {}; - struct PtxSegmentFixupPolicy : PtxPolicy::SegmentFixupPolicyT {}; - - - //--------------------------------------------------------------------- - // Utilities - //--------------------------------------------------------------------- - - /** - * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use - */ - template - CUB_RUNTIME_FUNCTION __forceinline__ - static void InitConfigs( - int ptx_version, - KernelConfig &spmv_config, - KernelConfig &segment_fixup_config) - { - #if (CUB_PTX_ARCH > 0) - - // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy - spmv_config.template Init(); - segment_fixup_config.template Init(); - - #else - - // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version - if (ptx_version >= 600) - { - spmv_config.template Init(); - segment_fixup_config.template Init(); - } - else if (ptx_version >= 500) - { - spmv_config.template Init(); - segment_fixup_config.template Init(); - } - else if (ptx_version >= 370) - { - spmv_config.template Init(); - segment_fixup_config.template Init(); - } - else if (ptx_version >= 350) - { - spmv_config.template Init(); - 
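// The per-arch SpMV policies above scale threads and items-per-thread with
// sizeof(ValueT) so each thread moves roughly the same number of bytes for
// float and double. The select-if policies earlier spell the same rule out as
// CUB_MIN(NOMINAL, CUB_MAX(1, NOMINAL*4/sizeof(T))); a plain C++ rendering of
// that formula (standalone sketch, not from this file):
template <typename T>
constexpr int ItemsPerThread(int nominal_4byte_items)
{
    // Never exceed the 4-byte nominal, shrink in proportion to sizeof(T),
    // and floor at one item per thread.
    return nominal_4byte_items * 4 / (int) sizeof(T) < 1
               ? 1
               : (nominal_4byte_items * 4 / (int) sizeof(T) > nominal_4byte_items
                      ? nominal_4byte_items
                      : nominal_4byte_items * 4 / (int) sizeof(T));
}
// e.g. ItemsPerThread<double>(10) == 5, ItemsPerThread<char>(10) == 10.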
segment_fixup_config.template Init(); - } - else if (ptx_version >= 300) - { - spmv_config.template Init(); - segment_fixup_config.template Init(); - - } - else if (ptx_version >= 200) - { - spmv_config.template Init(); - segment_fixup_config.template Init(); - } - else - { - spmv_config.template Init(); - segment_fixup_config.template Init(); - } - - #endif - } - - - /** - * Kernel kernel dispatch configuration. - */ - struct KernelConfig - { - int block_threads; - int items_per_thread; - int tile_items; - - template - CUB_RUNTIME_FUNCTION __forceinline__ - void Init() - { - block_threads = PolicyT::BLOCK_THREADS; - items_per_thread = PolicyT::ITEMS_PER_THREAD; - tile_items = block_threads * items_per_thread; - } - }; - - - //--------------------------------------------------------------------- - // Dispatch entrypoints - //--------------------------------------------------------------------- - - /** - * Internal dispatch routine for computing a device-wide reduction using the - * specified kernel functions. - * - * If the input is larger than a single tile, this method uses two-passes of - * kernel invocations. - */ - template < - typename Spmv1ColKernelT, ///< Function type of cub::DeviceSpmv1ColKernel - typename SpmvSearchKernelT, ///< Function type of cub::AgentSpmvSearchKernel - typename SpmvKernelT, ///< Function type of cub::AgentSpmvKernel - typename SegmentFixupKernelT> ///< Function type of cub::DeviceSegmentFixupKernelT - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - SpmvParamsT& spmv_params, ///< SpMV input parameter bundle - cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. - bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
- Spmv1ColKernelT spmv_1col_kernel, ///< [in] Kernel function pointer to parameterization of DeviceSpmv1ColKernel - SpmvSearchKernelT spmv_search_kernel, ///< [in] Kernel function pointer to parameterization of AgentSpmvSearchKernel - SpmvKernelT spmv_kernel, ///< [in] Kernel function pointer to parameterization of AgentSpmvKernel - SegmentFixupKernelT segment_fixup_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentFixupKernel - KernelConfig spmv_config, ///< [in] Dispatch parameters that match the policy that \p spmv_kernel was compiled for - KernelConfig segment_fixup_config) ///< [in] Dispatch parameters that match the policy that \p segment_fixup_kernel was compiled for - { -#ifndef CUB_RUNTIME_ENABLED - - // Kernel launch not supported from this device - return CubDebug(cudaErrorNotSupported ); - -#else - cudaError error = cudaSuccess; - do - { - if (spmv_params.num_cols == 1) - { - if (d_temp_storage == NULL) - { - // Return if the caller is simply requesting the size of the storage allocation - temp_storage_bytes = 1; - break; - } - - // Get search/init grid dims - int degen_col_kernel_block_size = INIT_KERNEL_THREADS; - int degen_col_kernel_grid_size = (spmv_params.num_rows + degen_col_kernel_block_size - 1) / degen_col_kernel_block_size; - - if (debug_synchronous) _CubLog("Invoking spmv_1col_kernel<<<%d, %d, 0, %lld>>>()\n", - degen_col_kernel_grid_size, degen_col_kernel_block_size, (long long) stream); - - // Invoke spmv_search_kernel - spmv_1col_kernel<<>>( - spmv_params); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - - break; - } - - // Get device ordinal - int device_ordinal; - if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; - - // Get SM count - int sm_count; - if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; - - // Get max x-dimension of grid - int max_dim_x; - if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;; - - // Total number of spmv work items - int num_merge_items = spmv_params.num_rows + spmv_params.num_nonzeros; - - // Tile sizes of kernels - int merge_tile_size = spmv_config.block_threads * spmv_config.items_per_thread; - int segment_fixup_tile_size = segment_fixup_config.block_threads * segment_fixup_config.items_per_thread; - - // Number of tiles for kernels - unsigned int num_merge_tiles = (num_merge_items + merge_tile_size - 1) / merge_tile_size; - unsigned int num_segment_fixup_tiles = (num_merge_tiles + segment_fixup_tile_size - 1) / segment_fixup_tile_size; - - // Get SM occupancy for kernels - int spmv_sm_occupancy; - if (CubDebug(error = MaxSmOccupancy( - spmv_sm_occupancy, - spmv_kernel, - spmv_config.block_threads))) break; - - int segment_fixup_sm_occupancy; - if (CubDebug(error = MaxSmOccupancy( - segment_fixup_sm_occupancy, - segment_fixup_kernel, - segment_fixup_config.block_threads))) break; - - // Get grid dimensions - dim3 spmv_grid_size( - CUB_MIN(num_merge_tiles, max_dim_x), - (num_merge_tiles + max_dim_x - 1) / max_dim_x, - 1); - - dim3 segment_fixup_grid_size( - CUB_MIN(num_segment_fixup_tiles, max_dim_x), - (num_segment_fixup_tiles + max_dim_x - 1) / max_dim_x, - 1); - - // Get the temporary storage allocation requirements - size_t allocation_sizes[3]; - if (CubDebug(error = 
ScanTileStateT::AllocationSize(num_segment_fixup_tiles, allocation_sizes[0]))) break; // bytes needed for reduce-by-key tile status descriptors - allocation_sizes[1] = num_merge_tiles * sizeof(KeyValuePairT); // bytes needed for block carry-out pairs - allocation_sizes[2] = (num_merge_tiles + 1) * sizeof(CoordinateT); // bytes needed for tile starting coordinates - - // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) - void* allocations[3]; - if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; - if (d_temp_storage == NULL) - { - // Return if the caller is simply requesting the size of the storage allocation - break; - } - - // Construct the tile status interface - ScanTileStateT tile_state; - if (CubDebug(error = tile_state.Init(num_segment_fixup_tiles, allocations[0], allocation_sizes[0]))) break; - - // Alias the other allocations - KeyValuePairT* d_tile_carry_pairs = (KeyValuePairT*) allocations[1]; // Agent carry-out pairs - CoordinateT* d_tile_coordinates = (CoordinateT*) allocations[2]; // Agent starting coordinates - - // Get search/init grid dims - int search_block_size = INIT_KERNEL_THREADS; - int search_grid_size = (num_merge_tiles + 1 + search_block_size - 1) / search_block_size; - -#if (CUB_PTX_ARCH == 0) - // Init textures - if (CubDebug(error = spmv_params.t_vector_x.BindTexture(spmv_params.d_vector_x))) break; -#endif - - if (search_grid_size < sm_count) -// if (num_merge_tiles < spmv_sm_occupancy * sm_count) - { - // Not enough spmv tiles to saturate the device: have spmv blocks search their own staring coords - d_tile_coordinates = NULL; - } - else - { - // Use separate search kernel if we have enough spmv tiles to saturate the device - - // Log spmv_search_kernel configuration - if (debug_synchronous) _CubLog("Invoking spmv_search_kernel<<<%d, %d, 0, %lld>>>()\n", - search_grid_size, search_block_size, (long long) stream); - - // Invoke spmv_search_kernel - spmv_search_kernel<<>>( - num_merge_tiles, - d_tile_coordinates, - spmv_params); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - } - - // Log spmv_kernel configuration - if (debug_synchronous) _CubLog("Invoking spmv_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", - spmv_grid_size.x, spmv_grid_size.y, spmv_grid_size.z, spmv_config.block_threads, (long long) stream, spmv_config.items_per_thread, spmv_sm_occupancy); - - // Invoke spmv_kernel - spmv_kernel<<>>( - spmv_params, - d_tile_coordinates, - d_tile_carry_pairs, - num_merge_tiles, - tile_state, - num_segment_fixup_tiles); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - - // Run reduce-by-key fixup if necessary - if (num_merge_tiles > 1) - { - // Log segment_fixup_kernel configuration - if (debug_synchronous) _CubLog("Invoking segment_fixup_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", - segment_fixup_grid_size.x, segment_fixup_grid_size.y, segment_fixup_grid_size.z, segment_fixup_config.block_threads, (long long) stream, segment_fixup_config.items_per_thread, segment_fixup_sm_occupancy); - - // Invoke segment_fixup_kernel - 
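// Before the launch below: each SpMV block emitted one carry-out
// (row id, partial dot product) pair for the row straddling its tile
// boundary, and the fixup pass reduces those pairs by row id into y.
// A sequential sketch of what the kernel computes (illustrative only;
// KeyValuePairT exposes .key/.value as in cub::KeyValuePair):
template <typename KeyValuePairT, typename ValueT>
void SegmentFixupRef(const KeyValuePairT *d_pairs, ValueT *d_vector_y, int num_pairs)
{
    // Scatter-add every block's carry-out partial sum into its row of y.
    for (int i = 0; i < num_pairs; ++i)
        d_vector_y[d_pairs[i].key] += d_pairs[i].value;
}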
segment_fixup_kernel<<<segment_fixup_grid_size, segment_fixup_config.block_threads, 0, stream>>>( - d_tile_carry_pairs, - spmv_params.d_vector_y, - num_merge_tiles, - num_segment_fixup_tiles, - tile_state); - - // Check for failure to launch - if (CubDebug(error = cudaPeekAtLastError())) break; - - // Sync the stream if specified to flush runtime errors - if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; - } - -#if (CUB_PTX_ARCH == 0) - // Free textures - if (CubDebug(error = spmv_params.t_vector_x.UnbindTexture())) break; -#endif - } - while (0); - - return error; - -#endif // CUB_RUNTIME_ENABLED - } - - - /** - * Internal dispatch routine for computing a device-wide SpMV - */ - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Dispatch( - void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. - size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation - SpmvParamsT& spmv_params, ///< SpMV input parameter bundle - cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream 0. - bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. - { - cudaError error = cudaSuccess; - do - { - // Get PTX version - int ptx_version; - #if (CUB_PTX_ARCH == 0) - if (CubDebug(error = PtxVersion(ptx_version))) break; - #else - ptx_version = CUB_PTX_ARCH; - #endif - - // Get kernel dispatch configurations - KernelConfig spmv_config, segment_fixup_config; - InitConfigs(ptx_version, spmv_config, segment_fixup_config); - - if (CubDebug(error = Dispatch( - d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous, - DeviceSpmv1ColKernel, - DeviceSpmvSearchKernel, - DeviceSpmvKernel, - DeviceSegmentFixupKernel, - spmv_config, segment_fixup_config))) break; - - } - while (0); - - return error; - } -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/SRC/cub/grid/grid_barrier.cuh b/SRC/cub/grid/grid_barrier.cuh deleted file mode 100644 index 461fb442..00000000 --- a/SRC/cub/grid/grid_barrier.cuh +++ /dev/null @@ -1,211 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED.
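For orientation, the public entry point that funnels into the Dispatch overloads above is cub::DeviceSpmv::CsrMV, which follows CUB's usual two-phase temporary-storage idiom: a first call with d_temp_storage == NULL only writes the required size, and a second call performs the SpMV. A minimal editorial sketch (not part of this patch; d_values, d_row_offsets, d_column_indices, d_x, d_y and the three size variables are assumed to describe an existing CSR matrix and vectors):

    // Phase 1: size query only (d_temp_storage == NULL writes temp_storage_bytes)
    void   *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes,
                           d_values, d_row_offsets, d_column_indices,
                           d_x, d_y, num_rows, num_cols, num_nonzeros);
    // Phase 2: allocate the blob and perform the actual SpMV
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes,
                           d_values, d_row_offsets, d_column_indices,
                           d_x, d_y, num_rows, num_cols, num_nonzeros);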
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::GridBarrier implements a software global barrier among thread blocks within a CUDA grid - */ - -#pragma once - -#include "../util_debug.cuh" -#include "../util_namespace.cuh" -#include "../thread/thread_load.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \addtogroup GridModule - * @{ - */ - - -/** - * \brief GridBarrier implements a software global barrier among thread blocks within a CUDA grid - */ -class GridBarrier -{ -protected : - - typedef unsigned int SyncFlag; - - // Counters in global device memory - SyncFlag* d_sync; - -public: - - /** - * Constructor - */ - GridBarrier() : d_sync(NULL) {} - - - /** - * Synchronize - */ - __device__ __forceinline__ void Sync() const - { - volatile SyncFlag *d_vol_sync = d_sync; - - // Threadfence and syncthreads to make sure global writes are visible before - // thread-0 reports in with its sync counter - __threadfence(); - CTA_SYNC(); - - if (blockIdx.x == 0) - { - // Report in ourselves - if (threadIdx.x == 0) - { - d_vol_sync[blockIdx.x] = 1; - } - - CTA_SYNC(); - - // Wait for everyone else to report in - for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) - { - while (ThreadLoad(d_sync + peer_block) == 0) - { - __threadfence_block(); - } - } - - CTA_SYNC(); - - // Let everyone know it's safe to proceed - for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) - { - d_vol_sync[peer_block] = 0; - } - } - else - { - if (threadIdx.x == 0) - { - // Report in - d_vol_sync[blockIdx.x] = 1; - - // Wait for acknowledgment - while (ThreadLoad(d_sync + blockIdx.x) == 1) - { - __threadfence_block(); - } - } - - CTA_SYNC(); - } - } -}; - - -/** - * \brief GridBarrierLifetime extends GridBarrier to provide lifetime management of the temporary device storage needed for cooperation. - * - * Uses RAII for lifetime, i.e., device resources are reclaimed when - * the destructor is called. 
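The protocol above only makes progress when every block of the grid is simultaneously resident on the device, so the caller must bound the grid size by occupancy and allocate the counters with GridBarrierLifetime::Setup(grid_size) on the host. An editorial sketch of two-phase usage (kernel and buffer names are hypothetical, not from CUB or this patch):

    __global__ void TwoPhaseKernel(float *d_partials, cub::GridBarrier barrier)
    {
        // Phase 1: stand-in for real per-block work
        if (threadIdx.x == 0)
            d_partials[blockIdx.x] = (float) blockIdx.x;

        barrier.Sync();    // no block proceeds until every block has reported in

        // Phase 2: block 0 can now safely read every block's phase-1 result
        if (blockIdx.x == 0 && threadIdx.x == 0)
        {
            float sum = 0;
            for (int b = 0; b < gridDim.x; ++b) sum += d_partials[b];
            d_partials[0] = sum;
        }
    }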
- */ -class GridBarrierLifetime : public GridBarrier -{ -protected: - - // Number of bytes backed by d_sync - size_t sync_bytes; - -public: - - /** - * Constructor - */ - GridBarrierLifetime() : GridBarrier(), sync_bytes(0) {} - - - /** - * DeviceFrees and resets the progress counters - */ - cudaError_t HostReset() - { - cudaError_t retval = cudaSuccess; - if (d_sync) - { - CubDebug(retval = cudaFree(d_sync)); - d_sync = NULL; - } - sync_bytes = 0; - return retval; - } - - - /** - * Destructor - */ - virtual ~GridBarrierLifetime() - { - HostReset(); - } - - - /** - * Sets up the progress counters for the next kernel launch (lazily - * allocating and initializing them if necessary) - */ - cudaError_t Setup(int sweep_grid_size) - { - cudaError_t retval = cudaSuccess; - do { - size_t new_sync_bytes = sweep_grid_size * sizeof(SyncFlag); - if (new_sync_bytes > sync_bytes) - { - if (d_sync) - { - if (CubDebug(retval = cudaFree(d_sync))) break; - } - - sync_bytes = new_sync_bytes; - - // Allocate and initialize to zero - if (CubDebug(retval = cudaMalloc((void**) &d_sync, sync_bytes))) break; - if (CubDebug(retval = cudaMemset(d_sync, 0, new_sync_bytes))) break; - } - } while (0); - - return retval; - } -}; - - -/** @} */ // end group GridModule - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/SRC/cub/grid/grid_even_share.cuh b/SRC/cub/grid/grid_even_share.cuh deleted file mode 100644 index f0b3a69a..00000000 --- a/SRC/cub/grid/grid_even_share.cuh +++ /dev/null @@ -1,222 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::GridEvenShare is a descriptor utility for distributing input among CUDA thread blocks in an "even-share" fashion. Each thread block gets roughly the same number of fixed-size work units (grains). 
- */ - - -#pragma once - -#include "../util_namespace.cuh" -#include "../util_macro.cuh" -#include "grid_mapping.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \addtogroup GridModule - * @{ - */ - - -/** - * \brief GridEvenShare is a descriptor utility for distributing input among - * CUDA thread blocks in an "even-share" fashion. Each thread block gets roughly - * the same number of input tiles. - * - * \par Overview - * Each thread block is assigned a consecutive sequence of input tiles. To help - * preserve alignment and eliminate the overhead of guarded loads for all but the - * last thread block, GridEvenShare assigns one of three different amounts of - * work to a given thread block: "big", "normal", or "last". The "big" workloads - * are one scheduling grain larger than "normal". The "last" work unit for the - * last thread block may be partially-full if the input is not an even multiple of - * the scheduling grain size. - * - * \par - * Before invoking a child grid, a parent thread will typically construct an - * instance of GridEvenShare. The instance can be passed to child thread blocks - * which can initialize their per-thread block offsets using \p BlockInit(). - */ -template <typename OffsetT> -struct GridEvenShare -{ -private: - - OffsetT total_tiles; - int big_shares; - OffsetT big_share_items; - OffsetT normal_share_items; - OffsetT normal_base_offset; - -public: - - /// Total number of input items - OffsetT num_items; - - /// Grid size in thread blocks - int grid_size; - - /// OffsetT into input marking the beginning of the owning thread block's segment of input tiles - OffsetT block_offset; - - /// OffsetT into input marking the end (one-past) of the owning thread block's segment of input tiles - OffsetT block_end; - - /// Stride between input tiles - OffsetT block_stride; - - - /** - * \brief Constructor. - */ - __host__ __device__ __forceinline__ GridEvenShare() : - total_tiles(0), - big_shares(0), - big_share_items(0), - normal_share_items(0), - normal_base_offset(0), - num_items(0), - grid_size(0), - block_offset(0), - block_end(0), - block_stride(0) - {} - - - /** - * \brief Dispatch initializer. To be called prior to kernel launch. - */ - __host__ __device__ __forceinline__ void DispatchInit( - OffsetT num_items, ///< Total number of input items - int max_grid_size, ///< Maximum grid size allowable (actual grid size may be less if not warranted by the number of input items) - int tile_items) ///< Number of data items per input tile - { - this->block_offset = num_items; // Initialize past-the-end - this->block_end = num_items; // Initialize past-the-end - this->num_items = num_items; - this->total_tiles = (num_items + tile_items - 1) / tile_items; - this->grid_size = CUB_MIN(total_tiles, max_grid_size); - OffsetT avg_tiles_per_block = total_tiles / grid_size; - this->big_shares = total_tiles - (avg_tiles_per_block * grid_size); // leftover grains go to big blocks - this->normal_share_items = avg_tiles_per_block * tile_items; - this->normal_base_offset = big_shares * tile_items; - this->big_share_items = normal_share_items + tile_items; - } - - - /** - * \brief Initializes ranges for the specified thread block index. Specialized - * for a "raking" access pattern in which each thread block is assigned a - * consecutive sequence of input tiles.
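To make the arithmetic in DispatchInit() concrete, a small worked example (numbers chosen purely for illustration):

    // num_items = 1000, tile_items = 100, max_grid_size = 4:
    //   total_tiles         = (1000 + 100 - 1) / 100 = 10
    //   grid_size           = min(10, 4)             = 4
    //   avg_tiles_per_block = 10 / 4                 = 2
    //   big_shares          = 10 - 2*4               = 2   (blocks 0 and 1 take 3 tiles)
    //   normal_share_items  = 2 * 100                = 200
    //   normal_base_offset  = 2 * 100                = 200
    //   big_share_items     = 200 + 100              = 300
    // Resulting raking ranges: block 0 -> [0,300), block 1 -> [300,600),
    //                          block 2 -> [600,800), block 3 -> [800,1000).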
- */ - template <int TILE_ITEMS> - __device__ __forceinline__ void BlockInit( - int block_id, - Int2Type<GRID_MAPPING_RAKE> /*strategy_tag*/) - { - block_stride = TILE_ITEMS; - if (block_id < big_shares) - { - // This thread block gets a big share of grains (avg_tiles_per_block + 1) - block_offset = (block_id * big_share_items); - block_end = block_offset + big_share_items; - } - else if (block_id < total_tiles) - { - // This thread block gets a normal share of grains (avg_tiles_per_block) - block_offset = normal_base_offset + (block_id * normal_share_items); - block_end = CUB_MIN(num_items, block_offset + normal_share_items); - } - // Else default past-the-end - } - - - /** - * \brief Block-initialization, specialized for a "strip mining" access - * pattern in which the input tiles assigned to each thread block are - * separated by a stride equal to the extent of the grid. - */ - template <int TILE_ITEMS> - __device__ __forceinline__ void BlockInit( - int block_id, - Int2Type<GRID_MAPPING_STRIP_MINE> /*strategy_tag*/) - { - block_stride = grid_size * TILE_ITEMS; - block_offset = (block_id * TILE_ITEMS); - block_end = num_items; - } - - - /** - * \brief Block-initialization, dispatching to the specialization for the - * given grid mapping strategy. - */ - template < - int TILE_ITEMS, - GridMappingStrategy STRATEGY> - __device__ __forceinline__ void BlockInit() - { - BlockInit<TILE_ITEMS>(blockIdx.x, Int2Type<STRATEGY>()); - } - - - /** - * \brief Block-initialization, specialized for a "raking" access - * pattern in which each thread block is assigned a consecutive sequence - * of input tiles. - */ - template <int TILE_ITEMS> - __device__ __forceinline__ void BlockInit( - OffsetT block_offset, ///< [in] Threadblock begin offset (inclusive) - OffsetT block_end) ///< [in] Threadblock end offset (exclusive) - { - this->block_offset = block_offset; - this->block_end = block_end; - this->block_stride = TILE_ITEMS; - } - - -}; - - - - - -/** @} */ // end group GridModule - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/grid/grid_mapping.cuh b/SRC/cub/grid/grid_mapping.cuh deleted file mode 100644 index f0e9fded..00000000 --- a/SRC/cub/grid/grid_mapping.cuh +++ /dev/null @@ -1,113 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED.
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks. - */ - -#pragma once - -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \addtogroup GridModule - * @{ - */ - - -/****************************************************************************** - * Mapping policies - *****************************************************************************/ - - -/** - * \brief cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks. - */ -enum GridMappingStrategy -{ - /** - * \brief A "raking" access pattern in which each thread block is - * assigned a consecutive sequence of input tiles - * - * \par Overview - * The input is evenly partitioned into \p p segments, where \p p is - * constant and corresponds loosely to the number of thread blocks that may - * actively reside on the target device. Each segment is comprised of - * consecutive tiles, where a tile is a small, constant-sized unit of input - * to be processed to completion before the thread block terminates or - * obtains more work. The kernel invokes \p p thread blocks, each - * of which iteratively consumes a segment of n/p elements - * in tile-size increments. - */ - GRID_MAPPING_RAKE, - - /** - * \brief A "strip mining" access pattern in which the input tiles assigned - * to each thread block are separated by a stride equal to the extent of - * the grid. - * - * \par Overview - * The input is evenly partitioned into \p p sets, where \p p is - * constant and corresponds loosely to the number of thread blocks that may - * actively reside on the target device. Each set is comprised of - * data tiles separated by stride \p tiles, where a tile is a small, - * constant-sized unit of input to be processed to completion before the - * thread block terminates or obtains more work. The kernel invokes \p p - * thread blocks, each of which iteratively consumes a segment of - * n/p elements in tile-size increments. - */ - GRID_MAPPING_STRIP_MINE, - - /** - * \brief A dynamic "queue-based" strategy for assigning input tiles to thread blocks. - * - * \par Overview - * The input is treated as a queue to be dynamically consumed by a grid of - * thread blocks. Work is atomically dequeued in tiles, where a tile is a - * unit of input to be processed to completion before the thread block - * terminates or obtains more work. The grid size \p p is constant, - * loosely corresponding to the number of thread blocks that may actively - * reside on the target device.
- */ - GRID_MAPPING_DYNAMIC, -}; - - -/** @} */ // end group GridModule - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/SRC/cub/grid/grid_queue.cuh b/SRC/cub/grid/grid_queue.cuh deleted file mode 100644 index 9615b14d..00000000 --- a/SRC/cub/grid/grid_queue.cuh +++ /dev/null @@ -1,220 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::GridQueue is a descriptor utility for dynamic queue management. - */ - -#pragma once - -#include "../util_namespace.cuh" -#include "../util_debug.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \addtogroup GridModule - * @{ - */ - - -/** - * \brief GridQueue is a descriptor utility for dynamic queue management. - * - * \par Overview - * GridQueue descriptors provide abstractions for "filling" or - * "draining" globally-shared vectors. - * - * \par - * A "filling" GridQueue works by atomically-adding to a zero-initialized counter, - * returning a unique offset for the calling thread to write its items. - * The GridQueue maintains the total "fill-size". The fill counter must be reset - * using GridQueue::ResetFill by the host or kernel instance prior to the kernel instance that - * will be filling. - * - * \par - * Similarly, a "draining" GridQueue works by atomically-incrementing a - * zero-initialized counter, returning a unique offset for the calling thread to - * read its items. Threads can safely drain until the array's logical fill-size is - * exceeded. The drain counter must be reset using GridQueue::ResetDrain or - * GridQueue::FillAndResetDrain by the host or kernel instance prior to the kernel instance that - * will be draining.
(For dynamic work distribution of existing data, the corresponding fill-size - * is simply the number of elements in the array.) - * - * \par - * Iterative work management can be implemented simply with a pair of flip-flopping - * work buffers, each with an associated set of fill and drain GridQueue descriptors. - * - * \tparam OffsetT Signed integer type for global offsets - */ -template -class GridQueue -{ -private: - - /// Counter indices - enum - { - FILL = 0, - DRAIN = 1, - }; - - /// Pair of counters - OffsetT *d_counters; - -public: - - /// Returns the device allocation size in bytes needed to construct a GridQueue instance - __host__ __device__ __forceinline__ - static size_t AllocationSize() - { - return sizeof(OffsetT) * 2; - } - - - /// Constructs an invalid GridQueue descriptor - __host__ __device__ __forceinline__ GridQueue() - : - d_counters(NULL) - {} - - - /// Constructs a GridQueue descriptor around the device storage allocation - __host__ __device__ __forceinline__ GridQueue( - void *d_storage) ///< Device allocation to back the GridQueue. Must be at least as big as AllocationSize(). - : - d_counters((OffsetT*) d_storage) - {} - - - /// This operation sets the fill-size and resets the drain counter, preparing the GridQueue for draining in the next kernel instance. To be called by the host or by a kernel prior to that which will be draining. - __host__ __device__ __forceinline__ cudaError_t FillAndResetDrain( - OffsetT fill_size, - cudaStream_t stream = 0) - { -#if (CUB_PTX_ARCH > 0) - (void)stream; - d_counters[FILL] = fill_size; - d_counters[DRAIN] = 0; - return cudaSuccess; -#else - OffsetT counters[2]; - counters[FILL] = fill_size; - counters[DRAIN] = 0; - return CubDebug(cudaMemcpyAsync(d_counters, counters, sizeof(OffsetT) * 2, cudaMemcpyHostToDevice, stream)); -#endif - } - - - /// This operation resets the drain so that it may advance to meet the existing fill-size. To be called by the host or by a kernel prior to that which will be draining. - __host__ __device__ __forceinline__ cudaError_t ResetDrain(cudaStream_t stream = 0) - { -#if (CUB_PTX_ARCH > 0) - (void)stream; - d_counters[DRAIN] = 0; - return cudaSuccess; -#else - return CubDebug(cudaMemsetAsync(d_counters + DRAIN, 0, sizeof(OffsetT), stream)); -#endif - } - - - /// This operation resets the fill counter. To be called by the host or by a kernel prior to that which will be filling. - __host__ __device__ __forceinline__ cudaError_t ResetFill(cudaStream_t stream = 0) - { -#if (CUB_PTX_ARCH > 0) - (void)stream; - d_counters[FILL] = 0; - return cudaSuccess; -#else - return CubDebug(cudaMemsetAsync(d_counters + FILL, 0, sizeof(OffsetT), stream)); -#endif - } - - - /// Returns the fill-size established by the parent or by the previous kernel. - __host__ __device__ __forceinline__ cudaError_t FillSize( - OffsetT &fill_size, - cudaStream_t stream = 0) - { -#if (CUB_PTX_ARCH > 0) - (void)stream; - fill_size = d_counters[FILL]; - return cudaSuccess; -#else - return CubDebug(cudaMemcpyAsync(&fill_size, d_counters + FILL, sizeof(OffsetT), cudaMemcpyDeviceToHost, stream)); -#endif - } - - - /// Drain \p num_items from the queue. Returns offset from which to read items. To be called from CUDA kernel. - __device__ __forceinline__ OffsetT Drain(OffsetT num_items) - { - return atomicAdd(d_counters + DRAIN, num_items); - } - - - /// Fill \p num_items into the queue. Returns offset from which to write items. To be called from CUDA kernel. 
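Putting the fill/drain protocol together, a hypothetical draining kernel might look as follows (an editorial sketch with illustrative names, not part of CUB or this patch; the queue must have been primed with FillAndResetDrain(num_items) by the host or a prior kernel, and each block is launched with one thread per tile item):

    template <typename OffsetT>
    __global__ void DrainCopyKernel(cub::GridQueue<OffsetT> queue,
                                    const int *d_in, int *d_out)
    {
        const OffsetT TILE_ITEMS = blockDim.x;
        OffsetT fill_size;
        queue.FillSize(fill_size);                     // read the fill counter
        while (true)
        {
            __shared__ OffsetT tile_offset;
            if (threadIdx.x == 0)
                tile_offset = queue.Drain(TILE_ITEMS); // atomically claim a tile
            __syncthreads();
            if (tile_offset >= fill_size) break;       // queue exhausted
            OffsetT i = tile_offset + threadIdx.x;
            if (i < fill_size) d_out[i] = d_in[i];     // per-item work (placeholder)
            __syncthreads();                           // protect tile_offset reuse
        }
    }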
- __device__ __forceinline__ OffsetT Fill(OffsetT num_items) - { - return atomicAdd(d_counters + FILL, num_items); - } -}; - - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - -/** - * Reset grid queue (call with 1 block of 1 thread) - */ -template -__global__ void FillAndResetDrainKernel( - GridQueue grid_queue, - OffsetT num_items) -{ - grid_queue.FillAndResetDrain(num_items); -} - - - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - -/** @} */ // end group GridModule - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - - diff --git a/SRC/cub/host/mutex.cuh b/SRC/cub/host/mutex.cuh deleted file mode 100644 index ff7ec90d..00000000 --- a/SRC/cub/host/mutex.cuh +++ /dev/null @@ -1,171 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
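The matching host-side priming for that sketch (again illustrative; num_items, grid_size, d_in and d_out are assumed to exist):

    // Allocate the two counters behind the queue, set the fill-size, launch.
    void *d_queue_storage;
    cudaMalloc(&d_queue_storage, cub::GridQueue<int>::AllocationSize());
    cub::GridQueue<int> queue(d_queue_storage);
    queue.FillAndResetDrain(num_items);            // host path: async memcpy of both counters
    DrainCopyKernel<<<grid_size, 128>>>(queue, d_in, d_out);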
- * - ******************************************************************************/ - -/** - * \file - * Simple portable mutex - */ - - -#pragma once - -#if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800) - #include -#else - #if defined(_WIN32) || defined(_WIN64) - #include - - #define WIN32_LEAN_AND_MEAN - #define NOMINMAX - #include - #undef WIN32_LEAN_AND_MEAN - #undef NOMINMAX - - /** - * Compiler read/write barrier - */ - #pragma intrinsic(_ReadWriteBarrier) - - #endif -#endif - -#include "../util_namespace.cuh" - - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * Simple portable mutex - * - Wraps std::mutex when compiled with C++11 or newer (supported on all platforms) - * - Uses GNU/Windows spinlock mechanisms for pre C++11 (supported on x86/x64 when compiled with cl.exe or g++) - */ -struct Mutex -{ -#if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800) - - std::mutex mtx; - - void Lock() - { - mtx.lock(); - } - - void Unlock() - { - mtx.unlock(); - } - - void TryLock() - { - mtx.try_lock(); - } - -#else //__cplusplus > 199711L - - #if defined(_MSC_VER) - - // Microsoft VC++ - typedef long Spinlock; - - #else - - // GNU g++ - typedef int Spinlock; - - /** - * Compiler read/write barrier - */ - __forceinline__ void _ReadWriteBarrier() - { - __sync_synchronize(); - } - - /** - * Atomic exchange - */ - __forceinline__ long _InterlockedExchange(volatile int * const Target, const int Value) - { - // NOTE: __sync_lock_test_and_set would be an acquire barrier, so we force a full barrier - _ReadWriteBarrier(); - return __sync_lock_test_and_set(Target, Value); - } - - /** - * Pause instruction to prevent excess processor bus usage - */ - __forceinline__ void YieldProcessor() - { - } - - #endif // defined(_MSC_VER) - - /// Lock member - volatile Spinlock lock; - - /** - * Constructor - */ - Mutex() : lock(0) {} - - /** - * Return when the specified spinlock has been acquired - */ - __forceinline__ void Lock() - { - while (1) - { - if (!_InterlockedExchange(&lock, 1)) return; - while (lock) YieldProcessor(); - } - } - - - /** - * Release the specified spinlock - */ - __forceinline__ void Unlock() - { - _ReadWriteBarrier(); - lock = 0; - } - -#endif // __cplusplus > 199711L - -}; - - - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - diff --git a/SRC/cub/iterator/arg_index_input_iterator.cuh b/SRC/cub/iterator/arg_index_input_iterator.cuh deleted file mode 100644 index 95a84a57..00000000 --- a/SRC/cub/iterator/arg_index_input_iterator.cuh +++ /dev/null @@ -1,259 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. 
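A minimal host-side illustration of the Mutex wrapper (hypothetical caller code; note that TryLock() forwards to std::mutex::try_lock but discards its result, so only the Lock()/Unlock() pair is sketched):

    cub::Mutex g_mutex;            // zero-initialized spinlock on the pre-C++11 path
    void AppendResult(std::vector<int> &results, int value)
    {
        g_mutex.Lock();            // std::mutex::lock() under C++11, else spin
        results.push_back(value);
        g_mutex.Unlock();
    }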
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Random-access iterator types - */ - -#pragma once - -#include -#include - -#include "../thread/thread_load.cuh" -#include "../thread/thread_store.cuh" -#include "../util_device.cuh" -#include "../util_namespace.cuh" - -#include - -#if (THRUST_VERSION >= 100700) - // This iterator is compatible with Thrust API 1.7 and newer - #include - #include -#endif // THRUST_VERSION - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \addtogroup UtilIterator - * @{ - */ - - -/** - * \brief A random-access input wrapper for pairing dereferenced values with their corresponding indices (forming \p KeyValuePair tuples). - * - * \par Overview - * - ArgIndexInputIteratorTwraps a random access input iterator \p itr of type \p InputIteratorT. - * Dereferencing an ArgIndexInputIteratorTat offset \p i produces a \p KeyValuePair value whose - * \p key field is \p i and whose \p value field is itr[i]. - * - Can be used with any data type. - * - Can be constructed, manipulated, and exchanged within and between host and device - * functions. Wrapped host memory can only be dereferenced on the host, and wrapped - * device memory can only be dereferenced on the device. - * - Compatible with Thrust API v1.7 or newer. 
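Because operator*() returns the KeyValuePair by value, the dereference idiom is as follows (editorial sketch; in the snippet further below, the line "Tuple item_offset_pair.key = *itr;" appears to carry a stray ".key"):

    cub::ArgIndexInputIterator<double*> itr(d_in);          // d_in as in the snippet below
    typedef cub::ArgIndexInputIterator<double*>::value_type Tuple;
    Tuple item_offset_pair = *itr;                          // key = 0, value = d_in[0]
    itr = itr + 6;
    item_offset_pair = *itr;                                // key = 6, value = d_in[6]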
- * - * \par Snippet - * The code snippet below illustrates the use of \p ArgIndexInputIteratorTto - * dereference an array of doubles - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize a device array - * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0] - * - * // Create an iterator wrapper - * cub::ArgIndexInputIterator itr(d_in); - * - * // Within device code: - * typedef typename cub::ArgIndexInputIterator::value_type Tuple; - * Tuple item_offset_pair.key = *itr; - * printf("%f @ %d\n", - * item_offset_pair.value, - * item_offset_pair.key); // 8.0 @ 0 - * - * itr = itr + 6; - * item_offset_pair.key = *itr; - * printf("%f @ %d\n", - * item_offset_pair.value, - * item_offset_pair.key); // 9.0 @ 6 - * - * \endcode - * - * \tparam InputIteratorT The value type of the wrapped input iterator - * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) - * \tparam OutputValueT The paired value type of the tuple (Default: value type of input iterator) - */ -template < - typename InputIteratorT, - typename OffsetT = ptrdiff_t, - typename OutputValueT = typename std::iterator_traits::value_type> -class ArgIndexInputIterator -{ -public: - - // Required iterator traits - typedef ArgIndexInputIterator self_type; ///< My own type - typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another - typedef KeyValuePair value_type; ///< The type of the element the iterator can point to - typedef value_type* pointer; ///< The type of a pointer to an element the iterator can point to - typedef value_type reference; ///< The type of a reference to an element the iterator can point to - -#if (THRUST_VERSION >= 100700) - // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods - typedef typename thrust::detail::iterator_facade_category< - thrust::any_system_tag, - thrust::random_access_traversal_tag, - value_type, - reference - >::type iterator_category; ///< The iterator category -#else - typedef std::random_access_iterator_tag iterator_category; ///< The iterator category -#endif // THRUST_VERSION - -private: - - InputIteratorT itr; - difference_type offset; - -public: - - /// Constructor - __host__ __device__ __forceinline__ ArgIndexInputIterator( - InputIteratorT itr, ///< Input iterator to wrap - difference_type offset = 0) ///< OffsetT (in items) from \p itr denoting the position of the iterator - : - itr(itr), - offset(offset) - {} - - /// Postfix increment - __host__ __device__ __forceinline__ self_type operator++(int) - { - self_type retval = *this; - offset++; - return retval; - } - - /// Prefix increment - __host__ __device__ __forceinline__ self_type operator++() - { - offset++; - return *this; - } - - /// Indirection - __host__ __device__ __forceinline__ reference operator*() const - { - value_type retval; - retval.value = itr[offset]; - retval.key = offset; - return retval; - } - - /// Addition - template - __host__ __device__ __forceinline__ self_type operator+(Distance n) const - { - self_type retval(itr, offset + n); - return retval; - } - - /// Addition assignment - template - __host__ __device__ __forceinline__ self_type& operator+=(Distance n) - { - offset += n; - return *this; - } - - /// Subtraction - template - __host__ __device__ __forceinline__ self_type operator-(Distance n) const - { - self_type retval(itr, offset - n); - return retval; - } - - /// Subtraction assignment - template - __host__ __device__ __forceinline__ 
self_type& operator-=(Distance n) - { - offset -= n; - return *this; - } - - /// Distance - __host__ __device__ __forceinline__ difference_type operator-(self_type other) const - { - return offset - other.offset; - } - - /// Array subscript - template - __host__ __device__ __forceinline__ reference operator[](Distance n) const - { - self_type offset = (*this) + n; - return *offset; - } - - /// Structure dereference - __host__ __device__ __forceinline__ pointer operator->() - { - return &(*(*this)); - } - - /// Equal to - __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) - { - return ((itr == rhs.itr) && (offset == rhs.offset)); - } - - /// Not equal to - __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) - { - return ((itr != rhs.itr) || (offset != rhs.offset)); - } - - /// Normalize - __host__ __device__ __forceinline__ void normalize() - { - itr += offset; - offset = 0; - } - - /// ostream operator - friend std::ostream& operator<<(std::ostream& os, const self_type& /*itr*/) - { - return os; - } -}; - - - -/** @} */ // end group UtilIterator - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/iterator/cache_modified_input_iterator.cuh b/SRC/cub/iterator/cache_modified_input_iterator.cuh deleted file mode 100644 index b4ad91e2..00000000 --- a/SRC/cub/iterator/cache_modified_input_iterator.cuh +++ /dev/null @@ -1,240 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - ******************************************************************************/ - -/** - * \file - * Random-access iterator types - */ - -#pragma once - -#include -#include - -#include "../thread/thread_load.cuh" -#include "../thread/thread_store.cuh" -#include "../util_device.cuh" -#include "../util_namespace.cuh" - -#if (THRUST_VERSION >= 100700) - // This iterator is compatible with Thrust API 1.7 and newer - #include - #include -#endif // THRUST_VERSION - - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - - -/** - * \addtogroup UtilIterator - * @{ - */ - - -/** - * \brief A random-access input wrapper for dereferencing array values using a PTX cache load modifier. - * - * \par Overview - * - CacheModifiedInputIteratorTis a random-access input iterator that wraps a native - * device pointer of type ValueType*. \p ValueType references are - * made by reading \p ValueType values through loads modified by \p MODIFIER. - * - Can be used to load any data type from memory using PTX cache load modifiers (e.g., "LOAD_LDG", - * "LOAD_CG", "LOAD_CA", "LOAD_CS", "LOAD_CV", etc.). - * - Can be constructed, manipulated, and exchanged within and between host and device - * functions, but can only be dereferenced within device functions. - * - Compatible with Thrust API v1.7 or newer. - * - * \par Snippet - * The code snippet below illustrates the use of \p CacheModifiedInputIteratorTto - * dereference a device array of double using the "ldg" PTX load modifier - * (i.e., load values through texture cache). - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize a device array - * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0] - * - * // Create an iterator wrapper - * cub::CacheModifiedInputIterator itr(d_in); - * - * // Within device code: - * printf("%f\n", itr[0]); // 8.0 - * printf("%f\n", itr[1]); // 6.0 - * printf("%f\n", itr[6]); // 9.0 - * - * \endcode - * - * \tparam CacheLoadModifier The cub::CacheLoadModifier to use when accessing data - * \tparam ValueType The value type of this iterator - * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) - */ -template < - CacheLoadModifier MODIFIER, - typename ValueType, - typename OffsetT = ptrdiff_t> -class CacheModifiedInputIterator -{ -public: - - // Required iterator traits - typedef CacheModifiedInputIterator self_type; ///< My own type - typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another - typedef ValueType value_type; ///< The type of the element the iterator can point to - typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to - typedef ValueType reference; ///< The type of a reference to an element the iterator can point to - -#if (THRUST_VERSION >= 100700) - // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods - typedef typename thrust::detail::iterator_facade_category< - thrust::device_system_tag, - thrust::random_access_traversal_tag, - value_type, - reference - >::type iterator_category; ///< The iterator category -#else - typedef std::random_access_iterator_tag iterator_category; ///< The iterator category -#endif // THRUST_VERSION - - -public: - - /// Wrapped native pointer - ValueType* ptr; - - /// Constructor - template - __host__ __device__ __forceinline__ CacheModifiedInputIterator( - QualifiedValueType* ptr) ///< Native pointer to wrap - : - ptr(const_cast::Type *>(ptr)) 
- {} - - /// Postfix increment - __host__ __device__ __forceinline__ self_type operator++(int) - { - self_type retval = *this; - ptr++; - return retval; - } - - /// Prefix increment - __host__ __device__ __forceinline__ self_type operator++() - { - ptr++; - return *this; - } - - /// Indirection - __device__ __forceinline__ reference operator*() const - { - return ThreadLoad(ptr); - } - - /// Addition - template - __host__ __device__ __forceinline__ self_type operator+(Distance n) const - { - self_type retval(ptr + n); - return retval; - } - - /// Addition assignment - template - __host__ __device__ __forceinline__ self_type& operator+=(Distance n) - { - ptr += n; - return *this; - } - - /// Subtraction - template - __host__ __device__ __forceinline__ self_type operator-(Distance n) const - { - self_type retval(ptr - n); - return retval; - } - - /// Subtraction assignment - template - __host__ __device__ __forceinline__ self_type& operator-=(Distance n) - { - ptr -= n; - return *this; - } - - /// Distance - __host__ __device__ __forceinline__ difference_type operator-(self_type other) const - { - return ptr - other.ptr; - } - - /// Array subscript - template - __device__ __forceinline__ reference operator[](Distance n) const - { - return ThreadLoad(ptr + n); - } - - /// Structure dereference - __device__ __forceinline__ pointer operator->() - { - return &ThreadLoad(ptr); - } - - /// Equal to - __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) - { - return (ptr == rhs.ptr); - } - - /// Not equal to - __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) - { - return (ptr != rhs.ptr); - } - - /// ostream operator - friend std::ostream& operator<<(std::ostream& os, const self_type& /*itr*/) - { - return os; - } -}; - - - -/** @} */ // end group UtilIterator - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/iterator/cache_modified_output_iterator.cuh b/SRC/cub/iterator/cache_modified_output_iterator.cuh deleted file mode 100644 index c3e3321d..00000000 --- a/SRC/cub/iterator/cache_modified_output_iterator.cuh +++ /dev/null @@ -1,254 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Random-access iterator types - */ - -#pragma once - -#include -#include - -#include "../thread/thread_load.cuh" -#include "../thread/thread_store.cuh" -#include "../util_device.cuh" -#include "../util_namespace.cuh" - -#if (THRUST_VERSION >= 100700) - // This iterator is compatible with Thrust API 1.7 and newer - #include - #include -#endif // THRUST_VERSION - - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \addtogroup UtilIterator - * @{ - */ - - -/** - * \brief A random-access output wrapper for storing array values using a PTX cache-modifier. - * - * \par Overview - * - CacheModifiedOutputIterator is a random-access output iterator that wraps a native - * device pointer of type ValueType*. \p ValueType references are - * made by writing \p ValueType values through stores modified by \p MODIFIER. - * - Can be used to store any data type to memory using PTX cache store modifiers (e.g., "STORE_WB", - * "STORE_CG", "STORE_CS", "STORE_WT", etc.). - * - Can be constructed, manipulated, and exchanged within and between host and device - * functions, but can only be dereferenced within device functions. - * - Compatible with Thrust API v1.7 or newer. - * - * \par Snippet - * The code snippet below illustrates the use of \p CacheModifiedOutputIterator to - * dereference a device array of doubles using the "wt" PTX load modifier - * (i.e., write-through to system memory). 
- * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize a device array - * double *d_out; // e.g., [, , , , , , ] - * - * // Create an iterator wrapper - * cub::CacheModifiedOutputIterator itr(d_out); - * - * // Within device code: - * itr[0] = 8.0; - * itr[1] = 66.0; - * itr[55] = 24.0; - * - * \endcode - * - * \par Usage Considerations - * - Can only be dereferenced within device code - * - * \tparam CacheStoreModifier The cub::CacheStoreModifier to use when accessing data - * \tparam ValueType The value type of this iterator - * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) - */ -template < - CacheStoreModifier MODIFIER, - typename ValueType, - typename OffsetT = ptrdiff_t> -class CacheModifiedOutputIterator -{ -private: - - // Proxy object - struct Reference - { - ValueType* ptr; - - /// Constructor - __host__ __device__ __forceinline__ Reference(ValueType* ptr) : ptr(ptr) {} - - /// Assignment - __device__ __forceinline__ ValueType operator =(ValueType val) - { - ThreadStore(ptr, val); - return val; - } - }; - -public: - - // Required iterator traits - typedef CacheModifiedOutputIterator self_type; ///< My own type - typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another - typedef void value_type; ///< The type of the element the iterator can point to - typedef void pointer; ///< The type of a pointer to an element the iterator can point to - typedef Reference reference; ///< The type of a reference to an element the iterator can point to - -#if (THRUST_VERSION >= 100700) - // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods - typedef typename thrust::detail::iterator_facade_category< - thrust::device_system_tag, - thrust::random_access_traversal_tag, - value_type, - reference - >::type iterator_category; ///< The iterator category -#else - typedef std::random_access_iterator_tag iterator_category; ///< The iterator category -#endif // THRUST_VERSION - -private: - - ValueType* ptr; - -public: - - /// Constructor - template - __host__ __device__ __forceinline__ CacheModifiedOutputIterator( - QualifiedValueType* ptr) ///< Native pointer to wrap - : - ptr(const_cast::Type *>(ptr)) - {} - - /// Postfix increment - __host__ __device__ __forceinline__ self_type operator++(int) - { - self_type retval = *this; - ptr++; - return retval; - } - - - /// Prefix increment - __host__ __device__ __forceinline__ self_type operator++() - { - ptr++; - return *this; - } - - /// Indirection - __host__ __device__ __forceinline__ reference operator*() const - { - return Reference(ptr); - } - - /// Addition - template - __host__ __device__ __forceinline__ self_type operator+(Distance n) const - { - self_type retval(ptr + n); - return retval; - } - - /// Addition assignment - template - __host__ __device__ __forceinline__ self_type& operator+=(Distance n) - { - ptr += n; - return *this; - } - - /// Subtraction - template - __host__ __device__ __forceinline__ self_type operator-(Distance n) const - { - self_type retval(ptr - n); - return retval; - } - - /// Subtraction assignment - template - __host__ __device__ __forceinline__ self_type& operator-=(Distance n) - { - ptr -= n; - return *this; - } - - /// Distance - __host__ __device__ __forceinline__ difference_type operator-(self_type other) const - { - return ptr - other.ptr; - } - - /// Array subscript - template - __host__ __device__ __forceinline__ reference operator[](Distance n) const - 
{ - return Reference(ptr + n); - } - - /// Equal to - __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) - { - return (ptr == rhs.ptr); - } - - /// Not equal to - __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) - { - return (ptr != rhs.ptr); - } - - /// ostream operator - friend std::ostream& operator<<(std::ostream& os, const self_type& itr) - { - return os; - } -}; - - -/** @} */ // end group UtilIterator - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/iterator/constant_input_iterator.cuh b/SRC/cub/iterator/constant_input_iterator.cuh deleted file mode 100644 index 1e0a9104..00000000 --- a/SRC/cub/iterator/constant_input_iterator.cuh +++ /dev/null @@ -1,235 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Random-access iterator types - */ - -#pragma once - -#include -#include - -#include "../thread/thread_load.cuh" -#include "../thread/thread_store.cuh" -#include "../util_namespace.cuh" - -#if (THRUST_VERSION >= 100700) - // This iterator is compatible with Thrust API 1.7 and newer - #include - #include -#endif // THRUST_VERSION - - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \addtogroup UtilIterator - * @{ - */ - - -/** - * \brief A random-access input generator for dereferencing a sequence of homogeneous values - * - * \par Overview - * - Read references to a ConstantInputIteratorTiterator always return the supplied constant - * of type \p ValueType. - * - Can be used with any data type. - * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device - * functions. - * - Compatible with Thrust API v1.7 or newer. 
- *
- * \par Snippet
- * The code snippet below illustrates the use of \p ConstantInputIterator to
- * dereference a sequence of homogeneous doubles.
- * \par
- * \code
- * #include <cub/iterator/constant_input_iterator.cuh>   // or equivalently <cub/cub.cuh>
- *
- * cub::ConstantInputIterator<double> itr(5.0);
- *
- * printf("%f\n", itr[0]);      // 5.0
- * printf("%f\n", itr[1]);      // 5.0
- * printf("%f\n", itr[2]);      // 5.0
- * printf("%f\n", itr[50]);     // 5.0
- *
- * \endcode
- *
- * \tparam ValueType            The value type of this iterator
- * \tparam OffsetT              The difference type of this iterator (Default: \p ptrdiff_t)
- */
-template <
-    typename ValueType,
-    typename OffsetT = ptrdiff_t>
-class ConstantInputIterator
-{
-public:
-
-    // Required iterator traits
-    typedef ConstantInputIterator               self_type;              ///< My own type
-    typedef OffsetT                             difference_type;        ///< Type to express the result of subtracting one iterator from another
-    typedef ValueType                           value_type;             ///< The type of the element the iterator can point to
-    typedef ValueType*                          pointer;                ///< The type of a pointer to an element the iterator can point to
-    typedef ValueType                           reference;              ///< The type of a reference to an element the iterator can point to
-
-#if (THRUST_VERSION >= 100700)
-    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
-    typedef typename thrust::detail::iterator_facade_category<
-        thrust::any_system_tag,
-        thrust::random_access_traversal_tag,
-        value_type,
-        reference
-      >::type iterator_category;                                        ///< The iterator category
-#else
-    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
-#endif  // THRUST_VERSION
-
-private:
-
-    ValueType   val;
-    OffsetT     offset;
-#ifdef _WIN32
-    OffsetT     pad[CUB_MAX(1, (16 / sizeof(OffsetT) - 1))];    // Workaround for win32 parameter-passing bug (ulonglong2 argmin DeviceReduce)
-#endif
-
-public:
-
-    /// Constructor
-    __host__ __device__ __forceinline__ ConstantInputIterator(
-        ValueType   val,            ///< Starting value for the iterator instance to report
-        OffsetT     offset = 0)     ///< Base offset
-    :
-        val(val),
-        offset(offset)
-    {}
-
-    /// Postfix increment
-    __host__ __device__ __forceinline__ self_type operator++(int)
-    {
-        self_type retval = *this;
-        offset++;
-        return retval;
-    }
-
-    /// Prefix increment
-    __host__ __device__ __forceinline__ self_type operator++()
-    {
-        offset++;
-        return *this;
-    }
-
-    /// Indirection
-    __host__ __device__ __forceinline__ reference operator*() const
-    {
-        return val;
-    }
-
-    /// Addition
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
-    {
-        self_type retval(val, offset + n);
-        return retval;
-    }
-
-    /// Addition assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
-    {
-        offset += n;
-        return *this;
-    }
-
-    /// Subtraction
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
-    {
-        self_type retval(val, offset - n);
-        return retval;
-    }
-
-    /// Subtraction assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
-    {
-        offset -= n;
-        return *this;
-    }
-
-    /// Distance
-    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
-    {
-        return offset - other.offset;
-    }
-
-    /// Array subscript
-    template <typename Distance>
-    __host__ __device__ __forceinline__ reference operator[](Distance /*n*/) const
-    {
-        return val;
-    }
-
-    /// Structure dereference
-    __host__ __device__ __forceinline__ pointer operator->()
-    {
-        return &val;
-    }
-
-    /// Equal to
-    __host__ __device__ __forceinline__ bool operator==(const
self_type& rhs) - { - return (offset == rhs.offset) && ((val == rhs.val)); - } - - /// Not equal to - __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) - { - return (offset != rhs.offset) || (val!= rhs.val); - } - - /// ostream operator - friend std::ostream& operator<<(std::ostream& os, const self_type& itr) - { - os << "[" << itr.val << "," << itr.offset << "]"; - return os; - } - -}; - - -/** @} */ // end group UtilIterator - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/iterator/counting_input_iterator.cuh b/SRC/cub/iterator/counting_input_iterator.cuh deleted file mode 100644 index 7f49348d..00000000 --- a/SRC/cub/iterator/counting_input_iterator.cuh +++ /dev/null @@ -1,228 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Random-access iterator types - */ - -#pragma once - -#include -#include - -#include "../thread/thread_load.cuh" -#include "../thread/thread_store.cuh" -#include "../util_device.cuh" -#include "../util_namespace.cuh" - -#if (THRUST_VERSION >= 100700) - // This iterator is compatible with Thrust API 1.7 and newer - #include - #include -#endif // THRUST_VERSION - - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \addtogroup UtilIterator - * @{ - */ - -/** - * \brief A random-access input generator for dereferencing a sequence of incrementing integer values. - * - * \par Overview - * - After initializing a CountingInputIteratorTto a certain integer \p base, read references - * at \p offset will return the value \p base + \p offset. - * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device - * functions. - * - Compatible with Thrust API v1.7 or newer. 
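The constant iterator removed above composes directly with CUB's device-wide algorithms without materializing any input array. A minimal sketch, using CUB's usual two-phase temp-storage idiom (the function and buffer names are illustrative assumptions):

#include <cub/iterator/constant_input_iterator.cuh>
#include <cub/device/device_reduce.cuh>

// Hypothetical helper: reduce n copies of 5.0 so that d_sum[0] == 5.0 * n.
void sum_constant(double *d_sum, int n)
{
    cub::ConstantInputIterator<double> itr(5.0);

    void  *d_temp     = NULL;
    size_t temp_bytes = 0;
    cub::DeviceReduce::Sum(d_temp, temp_bytes, itr, d_sum, n);   // size query
    cudaMalloc(&d_temp, temp_bytes);
    cub::DeviceReduce::Sum(d_temp, temp_bytes, itr, d_sum, n);   // actual reduction
    cudaFree(d_temp);
}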
- * - * \par Snippet - * The code snippet below illustrates the use of \p CountingInputIteratorTto - * dereference a sequence of incrementing integers. - * \par - * \code - * #include // or equivalently - * - * cub::CountingInputIterator itr(5); - * - * printf("%d\n", itr[0]); // 5 - * printf("%d\n", itr[1]); // 6 - * printf("%d\n", itr[2]); // 7 - * printf("%d\n", itr[50]); // 55 - * - * \endcode - * - * \tparam ValueType The value type of this iterator - * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) - */ -template < - typename ValueType, - typename OffsetT = ptrdiff_t> -class CountingInputIterator -{ -public: - - // Required iterator traits - typedef CountingInputIterator self_type; ///< My own type - typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another - typedef ValueType value_type; ///< The type of the element the iterator can point to - typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to - typedef ValueType reference; ///< The type of a reference to an element the iterator can point to - -#if (THRUST_VERSION >= 100700) - // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods - typedef typename thrust::detail::iterator_facade_category< - thrust::any_system_tag, - thrust::random_access_traversal_tag, - value_type, - reference - >::type iterator_category; ///< The iterator category -#else - typedef std::random_access_iterator_tag iterator_category; ///< The iterator category -#endif // THRUST_VERSION - -private: - - ValueType val; - -public: - - /// Constructor - __host__ __device__ __forceinline__ CountingInputIterator( - const ValueType &val) ///< Starting value for the iterator instance to report - : - val(val) - {} - - /// Postfix increment - __host__ __device__ __forceinline__ self_type operator++(int) - { - self_type retval = *this; - val++; - return retval; - } - - /// Prefix increment - __host__ __device__ __forceinline__ self_type operator++() - { - val++; - return *this; - } - - /// Indirection - __host__ __device__ __forceinline__ reference operator*() const - { - return val; - } - - /// Addition - template - __host__ __device__ __forceinline__ self_type operator+(Distance n) const - { - self_type retval(val + (ValueType) n); - return retval; - } - - /// Addition assignment - template - __host__ __device__ __forceinline__ self_type& operator+=(Distance n) - { - val += (ValueType) n; - return *this; - } - - /// Subtraction - template - __host__ __device__ __forceinline__ self_type operator-(Distance n) const - { - self_type retval(val - (ValueType) n); - return retval; - } - - /// Subtraction assignment - template - __host__ __device__ __forceinline__ self_type& operator-=(Distance n) - { - val -= n; - return *this; - } - - /// Distance - __host__ __device__ __forceinline__ difference_type operator-(self_type other) const - { - return (difference_type) (val - other.val); - } - - /// Array subscript - template - __host__ __device__ __forceinline__ reference operator[](Distance n) const - { - return val + (ValueType) n; - } - - /// Structure dereference - __host__ __device__ __forceinline__ pointer operator->() - { - return &val; - } - - /// Equal to - __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) - { - return (val == rhs.val); - } - - /// Not equal to - __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) - { - return (val != rhs.val); - } - - /// ostream 
operator - friend std::ostream& operator<<(std::ostream& os, const self_type& itr) - { - os << "[" << itr.val << "]"; - return os; - } - -}; - - - -/** @} */ // end group UtilIterator - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/iterator/discard_output_iterator.cuh b/SRC/cub/iterator/discard_output_iterator.cuh deleted file mode 100644 index 28473e5f..00000000 --- a/SRC/cub/iterator/discard_output_iterator.cuh +++ /dev/null @@ -1,220 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - ******************************************************************************/ - -/** - * \file - * Random-access iterator types - */ - -#pragma once - -#include -#include - -#include "../util_namespace.cuh" -#include "../util_macro.cuh" - -#if (THRUST_VERSION >= 100700) - // This iterator is compatible with Thrust API 1.7 and newer - #include - #include -#endif // THRUST_VERSION - - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \addtogroup UtilIterator - * @{ - */ - - -/** - * \brief A discard iterator - */ -template -class DiscardOutputIterator -{ -public: - - // Required iterator traits - typedef DiscardOutputIterator self_type; ///< My own type - typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another - typedef void value_type; ///< The type of the element the iterator can point to - typedef void pointer; ///< The type of a pointer to an element the iterator can point to - typedef void reference; ///< The type of a reference to an element the iterator can point to - -#if (THRUST_VERSION >= 100700) - // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods - typedef typename thrust::detail::iterator_facade_category< - thrust::any_system_tag, - thrust::random_access_traversal_tag, - value_type, - reference - >::type iterator_category; ///< The iterator category -#else - typedef std::random_access_iterator_tag iterator_category; ///< The iterator category -#endif // THRUST_VERSION - -private: - - OffsetT offset; - -#if defined(_WIN32) || !defined(_WIN64) - // Workaround for win32 parameter-passing bug (ulonglong2 argmin DeviceReduce) - OffsetT pad[CUB_MAX(1, (16 / sizeof(OffsetT) - 1))]; -#endif - -public: - - /// Constructor - __host__ __device__ __forceinline__ DiscardOutputIterator( - OffsetT offset = 0) ///< Base offset - : - offset(offset) - {} - - /// Postfix increment - __host__ __device__ __forceinline__ self_type operator++(int) - { - self_type retval = *this; - offset++; - return retval; - } - - /// Prefix increment - __host__ __device__ __forceinline__ self_type operator++() - { - offset++; - return *this; - } - - /// Indirection - __host__ __device__ __forceinline__ self_type& operator*() - { - // return self reference, which can be assigned to anything - return *this; - } - - /// Addition - template - __host__ __device__ __forceinline__ self_type operator+(Distance n) const - { - self_type retval(offset + n); - return retval; - } - - /// Addition assignment - template - __host__ __device__ __forceinline__ self_type& operator+=(Distance n) - { - offset += n; - return *this; - } - - /// Subtraction - template - __host__ __device__ __forceinline__ self_type operator-(Distance n) const - { - self_type retval(offset - n); - return retval; - } - - /// Subtraction assignment - template - __host__ __device__ __forceinline__ self_type& operator-=(Distance n) - { - offset -= n; - return *this; - } - - /// Distance - __host__ __device__ __forceinline__ difference_type operator-(self_type other) const - { - return offset - other.offset; - } - - /// Array subscript - template - __host__ __device__ __forceinline__ self_type& operator[](Distance n) - { - // return self reference, which can be assigned to anything - return *this; - } - - /// Structure dereference - __host__ __device__ __forceinline__ pointer operator->() - { - return; - } - - /// Assignment to self (no-op) - __host__ __device__ __forceinline__ void operator=(self_type const& 
other) - { - offset = other.offset; - } - - /// Assignment to anything else (no-op) - template - __host__ __device__ __forceinline__ void operator=(T const&) - {} - - /// Cast to void* operator - __host__ __device__ __forceinline__ operator void*() const { return NULL; } - - /// Equal to - __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) - { - return (offset == rhs.offset); - } - - /// Not equal to - __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) - { - return (offset != rhs.offset); - } - - /// ostream operator - friend std::ostream& operator<<(std::ostream& os, const self_type& itr) - { - os << "[" << itr.offset << "]"; - return os; - } - -}; - - -/** @} */ // end group UtilIterator - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/iterator/tex_obj_input_iterator.cuh b/SRC/cub/iterator/tex_obj_input_iterator.cuh deleted file mode 100644 index b99103ec..00000000 --- a/SRC/cub/iterator/tex_obj_input_iterator.cuh +++ /dev/null @@ -1,310 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Random-access iterator types - */ - -#pragma once - -#include -#include - -#include "../thread/thread_load.cuh" -#include "../thread/thread_store.cuh" -#include "../util_device.cuh" -#include "../util_debug.cuh" -#include "../util_namespace.cuh" - -#if (THRUST_VERSION >= 100700) - // This iterator is compatible with Thrust API 1.7 and newer - #include - #include -#endif // THRUST_VERSION - - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \addtogroup UtilIterator - * @{ - */ - - - -/** - * \brief A random-access input wrapper for dereferencing array values through texture cache. Uses newer Kepler-style texture objects. 
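A common use of the discard iterator defined above is to suppress an output stream whose count is all that matters, since every assignment through it is a no-op. A minimal sketch (the names and the DeviceSelect pairing are illustrative assumptions):

#include <cub/iterator/discard_output_iterator.cuh>
#include <cub/device/device_select.cuh>

// Hypothetical helper: count runs of unique keys without storing them.
void count_unique(const int *d_in, int *d_num_selected, int n)
{
    cub::DiscardOutputIterator<> d_out;   // OffsetT defaults to ptrdiff_t

    void  *d_temp     = NULL;
    size_t temp_bytes = 0;
    cub::DeviceSelect::Unique(d_temp, temp_bytes, d_in, d_out, d_num_selected, n);
    cudaMalloc(&d_temp, temp_bytes);
    cub::DeviceSelect::Unique(d_temp, temp_bytes, d_in, d_out, d_num_selected, n);
    cudaFree(d_temp);
}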
- * - * \par Overview - * - TexObjInputIteratorTwraps a native device pointer of type ValueType*. References - * to elements are to be loaded through texture cache. - * - Can be used to load any data type from memory through texture cache. - * - Can be manipulated and exchanged within and between host and device - * functions, can only be constructed within host functions, and can only be - * dereferenced within device functions. - * - With regard to nested/dynamic parallelism, TexObjInputIteratorTiterators may only be - * created by the host thread, but can be used by any descendant kernel. - * - Compatible with Thrust API v1.7 or newer. - * - * \par Snippet - * The code snippet below illustrates the use of \p TexRefInputIteratorTto - * dereference a device array of doubles through texture cache. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize a device array - * int num_items; // e.g., 7 - * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0] - * - * // Create an iterator wrapper - * cub::TexObjInputIterator itr; - * itr.BindTexture(d_in, sizeof(double) * num_items); - * ... - * - * // Within device code: - * printf("%f\n", itr[0]); // 8.0 - * printf("%f\n", itr[1]); // 6.0 - * printf("%f\n", itr[6]); // 9.0 - * - * ... - * itr.UnbindTexture(); - * - * \endcode - * - * \tparam T The value type of this iterator - * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) - */ -template < - typename T, - typename OffsetT = ptrdiff_t> -class TexObjInputIterator -{ -public: - - // Required iterator traits - typedef TexObjInputIterator self_type; ///< My own type - typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another - typedef T value_type; ///< The type of the element the iterator can point to - typedef T* pointer; ///< The type of a pointer to an element the iterator can point to - typedef T reference; ///< The type of a reference to an element the iterator can point to - -#if (THRUST_VERSION >= 100700) - // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods - typedef typename thrust::detail::iterator_facade_category< - thrust::device_system_tag, - thrust::random_access_traversal_tag, - value_type, - reference - >::type iterator_category; ///< The iterator category -#else - typedef std::random_access_iterator_tag iterator_category; ///< The iterator category -#endif // THRUST_VERSION - -private: - - // Largest texture word we can use in device - typedef typename UnitWord::TextureWord TextureWord; - - // Number of texture words per T - enum { - TEXTURE_MULTIPLE = sizeof(T) / sizeof(TextureWord) - }; - -private: - - T* ptr; - difference_type tex_offset; - cudaTextureObject_t tex_obj; - -public: - - /// Constructor - __host__ __device__ __forceinline__ TexObjInputIterator() - : - ptr(NULL), - tex_offset(0), - tex_obj(0) - {} - - /// Use this iterator to bind \p ptr with a texture reference - template - cudaError_t BindTexture( - QualifiedT *ptr, ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment - size_t bytes = size_t(-1), ///< Number of bytes in the range - size_t tex_offset = 0) ///< OffsetT (in items) from \p ptr denoting the position of the iterator - { - this->ptr = const_cast::Type *>(ptr); - this->tex_offset = tex_offset; - - cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc(); - cudaResourceDesc res_desc; - cudaTextureDesc tex_desc; - memset(&res_desc, 0, 
sizeof(cudaResourceDesc)); - memset(&tex_desc, 0, sizeof(cudaTextureDesc)); - res_desc.resType = cudaResourceTypeLinear; - res_desc.res.linear.devPtr = this->ptr; - res_desc.res.linear.desc = channel_desc; - res_desc.res.linear.sizeInBytes = bytes; - tex_desc.readMode = cudaReadModeElementType; - return cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL); - } - - /// Unbind this iterator from its texture reference - cudaError_t UnbindTexture() - { - return cudaDestroyTextureObject(tex_obj); - } - - /// Postfix increment - __host__ __device__ __forceinline__ self_type operator++(int) - { - self_type retval = *this; - tex_offset++; - return retval; - } - - /// Prefix increment - __host__ __device__ __forceinline__ self_type operator++() - { - tex_offset++; - return *this; - } - - /// Indirection - __host__ __device__ __forceinline__ reference operator*() const - { -#if (CUB_PTX_ARCH == 0) - // Simply dereference the pointer on the host - return ptr[tex_offset]; -#else - // Move array of uninitialized words, then alias and assign to return value - TextureWord words[TEXTURE_MULTIPLE]; - - #pragma unroll - for (int i = 0; i < TEXTURE_MULTIPLE; ++i) - { - words[i] = tex1Dfetch( - tex_obj, - (tex_offset * TEXTURE_MULTIPLE) + i); - } - - // Load from words - return *reinterpret_cast(words); -#endif - } - - /// Addition - template - __host__ __device__ __forceinline__ self_type operator+(Distance n) const - { - self_type retval; - retval.ptr = ptr; - retval.tex_obj = tex_obj; - retval.tex_offset = tex_offset + n; - return retval; - } - - /// Addition assignment - template - __host__ __device__ __forceinline__ self_type& operator+=(Distance n) - { - tex_offset += n; - return *this; - } - - /// Subtraction - template - __host__ __device__ __forceinline__ self_type operator-(Distance n) const - { - self_type retval; - retval.ptr = ptr; - retval.tex_obj = tex_obj; - retval.tex_offset = tex_offset - n; - return retval; - } - - /// Subtraction assignment - template - __host__ __device__ __forceinline__ self_type& operator-=(Distance n) - { - tex_offset -= n; - return *this; - } - - /// Distance - __host__ __device__ __forceinline__ difference_type operator-(self_type other) const - { - return tex_offset - other.tex_offset; - } - - /// Array subscript - template - __host__ __device__ __forceinline__ reference operator[](Distance n) const - { - self_type offset = (*this) + n; - return *offset; - } - - /// Structure dereference - __host__ __device__ __forceinline__ pointer operator->() - { - return &(*(*this)); - } - - /// Equal to - __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) - { - return ((ptr == rhs.ptr) && (tex_offset == rhs.tex_offset) && (tex_obj == rhs.tex_obj)); - } - - /// Not equal to - __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) - { - return ((ptr != rhs.ptr) || (tex_offset != rhs.tex_offset) || (tex_obj != rhs.tex_obj)); - } - - /// ostream operator - friend std::ostream& operator<<(std::ostream& os, const self_type& itr) - { - return os; - } - -}; - - - -/** @} */ // end group UtilIterator - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/iterator/tex_ref_input_iterator.cuh b/SRC/cub/iterator/tex_ref_input_iterator.cuh deleted file mode 100644 index 95d0ffbc..00000000 --- a/SRC/cub/iterator/tex_ref_input_iterator.cuh +++ /dev/null @@ -1,374 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. 
- * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Random-access iterator types - */ - -#pragma once - -#include -#include - -#include "../thread/thread_load.cuh" -#include "../thread/thread_store.cuh" -#include "../util_device.cuh" -#include "../util_debug.cuh" -#include "../util_namespace.cuh" - -#if (CUDA_VERSION >= 5050) || defined(DOXYGEN_ACTIVE) // This iterator is compatible with CUDA 5.5 and newer - -#if (THRUST_VERSION >= 100700) // This iterator is compatible with Thrust API 1.7 and newer - #include - #include -#endif // THRUST_VERSION - - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/****************************************************************************** - * Static file-scope Tesla/Fermi-style texture references - *****************************************************************************/ - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - -// Anonymous namespace -namespace { - -/// Global texture reference specialized by type -template -struct IteratorTexRef -{ - /// And by unique ID - template - struct TexId - { - // Largest texture word we can use in device - typedef typename UnitWord::DeviceWord DeviceWord; - typedef typename UnitWord::TextureWord TextureWord; - - // Number of texture words per T - enum { - DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord), - TEXTURE_MULTIPLE = sizeof(T) / sizeof(TextureWord) - }; - - // Texture reference type - typedef texture TexRef; - - // Texture reference - static TexRef ref; - - /// Bind texture - static cudaError_t BindTexture(void *d_in, size_t &offset) - { - if (d_in) - { - cudaChannelFormatDesc tex_desc = cudaCreateChannelDesc(); - ref.channelDesc = tex_desc; - return (CubDebug(cudaBindTexture(&offset, ref, d_in))); - } - - return cudaSuccess; - } - - /// Unbind texture - static cudaError_t UnbindTexture() - { - return CubDebug(cudaUnbindTexture(ref)); - } - - /// Fetch element - 
template - static __device__ __forceinline__ T Fetch(Distance tex_offset) - { - DeviceWord temp[DEVICE_MULTIPLE]; - TextureWord *words = reinterpret_cast(temp); - - #pragma unroll - for (int i = 0; i < TEXTURE_MULTIPLE; ++i) - { - words[i] = tex1Dfetch(ref, (tex_offset * TEXTURE_MULTIPLE) + i); - } - - return reinterpret_cast(temp); - } - }; -}; - -// Texture reference definitions -template -template -typename IteratorTexRef::template TexId::TexRef IteratorTexRef::template TexId::ref = 0; - - -} // Anonymous namespace - - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - - -/** - * \addtogroup UtilIterator - * @{ - */ - - - -/** - * \brief A random-access input wrapper for dereferencing array values through texture cache. Uses older Tesla/Fermi-style texture references. - * - * \par Overview - * - TexRefInputIteratorTwraps a native device pointer of type ValueType*. References - * to elements are to be loaded through texture cache. - * - Can be used to load any data type from memory through texture cache. - * - Can be manipulated and exchanged within and between host and device - * functions, can only be constructed within host functions, and can only be - * dereferenced within device functions. - * - The \p UNIQUE_ID template parameter is used to statically name the underlying texture - * reference. Only one TexRefInputIteratorTinstance can be bound at any given time for a - * specific combination of (1) data type \p T, (2) \p UNIQUE_ID, (3) host - * thread, and (4) compilation .o unit. - * - With regard to nested/dynamic parallelism, TexRefInputIteratorTiterators may only be - * created by the host thread and used by a top-level kernel (i.e. the one which is launched - * from the host). - * - Compatible with Thrust API v1.7 or newer. - * - Compatible with CUDA toolkit v5.5 or newer. - * - * \par Snippet - * The code snippet below illustrates the use of \p TexRefInputIteratorTto - * dereference a device array of doubles through texture cache. - * \par - * \code - * #include // or equivalently - * - * // Declare, allocate, and initialize a device array - * int num_items; // e.g., 7 - * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0] - * - * // Create an iterator wrapper - * cub::TexRefInputIterator itr; - * itr.BindTexture(d_in, sizeof(double) * num_items); - * ... - * - * // Within device code: - * printf("%f\n", itr[0]); // 8.0 - * printf("%f\n", itr[1]); // 6.0 - * printf("%f\n", itr[6]); // 9.0 - * - * ... 
- * itr.UnbindTexture(); - * - * \endcode - * - * \tparam T The value type of this iterator - * \tparam UNIQUE_ID A globally-unique identifier (within the compilation unit) to name the underlying texture reference - * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) - */ -template < - typename T, - int UNIQUE_ID, - typename OffsetT = ptrdiff_t> -class TexRefInputIterator -{ -public: - - // Required iterator traits - typedef TexRefInputIterator self_type; ///< My own type - typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another - typedef T value_type; ///< The type of the element the iterator can point to - typedef T* pointer; ///< The type of a pointer to an element the iterator can point to - typedef T reference; ///< The type of a reference to an element the iterator can point to - -#if (THRUST_VERSION >= 100700) - // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods - typedef typename thrust::detail::iterator_facade_category< - thrust::device_system_tag, - thrust::random_access_traversal_tag, - value_type, - reference - >::type iterator_category; ///< The iterator category -#else - typedef std::random_access_iterator_tag iterator_category; ///< The iterator category -#endif // THRUST_VERSION - -private: - - T* ptr; - difference_type tex_offset; - - // Texture reference wrapper (old Tesla/Fermi-style textures) - typedef typename IteratorTexRef::template TexId TexId; - -public: -/* - /// Constructor - __host__ __device__ __forceinline__ TexRefInputIterator() - : - ptr(NULL), - tex_offset(0) - {} -*/ - /// Use this iterator to bind \p ptr with a texture reference - template - cudaError_t BindTexture( - QualifiedT *ptr, ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment - size_t bytes = size_t(-1), ///< Number of bytes in the range - size_t tex_offset = 0) ///< OffsetT (in items) from \p ptr denoting the position of the iterator - { - this->ptr = const_cast::Type *>(ptr); - size_t offset; - cudaError_t retval = TexId::BindTexture(this->ptr + tex_offset, offset); - this->tex_offset = (difference_type) (offset / sizeof(QualifiedT)); - return retval; - } - - /// Unbind this iterator from its texture reference - cudaError_t UnbindTexture() - { - return TexId::UnbindTexture(); - } - - /// Postfix increment - __host__ __device__ __forceinline__ self_type operator++(int) - { - self_type retval = *this; - tex_offset++; - return retval; - } - - /// Prefix increment - __host__ __device__ __forceinline__ self_type operator++() - { - tex_offset++; - return *this; - } - - /// Indirection - __host__ __device__ __forceinline__ reference operator*() const - { -#if (CUB_PTX_ARCH == 0) - // Simply dereference the pointer on the host - return ptr[tex_offset]; -#else - // Use the texture reference - return TexId::Fetch(tex_offset); -#endif - } - - /// Addition - template - __host__ __device__ __forceinline__ self_type operator+(Distance n) const - { - self_type retval; - retval.ptr = ptr; - retval.tex_offset = tex_offset + n; - return retval; - } - - /// Addition assignment - template - __host__ __device__ __forceinline__ self_type& operator+=(Distance n) - { - tex_offset += n; - return *this; - } - - /// Subtraction - template - __host__ __device__ __forceinline__ self_type operator-(Distance n) const - { - self_type retval; - retval.ptr = ptr; - retval.tex_offset = tex_offset - n; - return retval; - } - - /// Subtraction assignment - template - __host__ 
__device__ __forceinline__ self_type& operator-=(Distance n) - { - tex_offset -= n; - return *this; - } - - /// Distance - __host__ __device__ __forceinline__ difference_type operator-(self_type other) const - { - return tex_offset - other.tex_offset; - } - - /// Array subscript - template - __host__ __device__ __forceinline__ reference operator[](Distance n) const - { - self_type offset = (*this) + n; - return *offset; - } - - /// Structure dereference - __host__ __device__ __forceinline__ pointer operator->() - { - return &(*(*this)); - } - - /// Equal to - __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) - { - return ((ptr == rhs.ptr) && (tex_offset == rhs.tex_offset)); - } - - /// Not equal to - __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) - { - return ((ptr != rhs.ptr) || (tex_offset != rhs.tex_offset)); - } - - /// ostream operator - friend std::ostream& operator<<(std::ostream& os, const self_type& itr) - { - return os; - } - -}; - - - -/** @} */ // end group UtilIterator - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) - -#endif // CUDA_VERSION diff --git a/SRC/cub/iterator/transform_input_iterator.cuh b/SRC/cub/iterator/transform_input_iterator.cuh deleted file mode 100644 index dad1f500..00000000 --- a/SRC/cub/iterator/transform_input_iterator.cuh +++ /dev/null @@ -1,252 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - ******************************************************************************/ - -/** - * \file - * Random-access iterator types - */ - -#pragma once - -#include -#include - -#include "../thread/thread_load.cuh" -#include "../thread/thread_store.cuh" -#include "../util_device.cuh" -#include "../util_namespace.cuh" - -#if (THRUST_VERSION >= 100700) - // This iterator is compatible with Thrust API 1.7 and newer - #include - #include -#endif // THRUST_VERSION - - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \addtogroup UtilIterator - * @{ - */ - - -/** - * \brief A random-access input wrapper for transforming dereferenced values. - * - * \par Overview - * - TransformInputIteratorTwraps a unary conversion functor of type \p - * ConversionOp and a random-access input iterator of type InputIteratorT, - * using the former to produce references of type \p ValueType from the latter. - * - Can be used with any data type. - * - Can be constructed, manipulated, and exchanged within and between host and device - * functions. Wrapped host memory can only be dereferenced on the host, and wrapped - * device memory can only be dereferenced on the device. - * - Compatible with Thrust API v1.7 or newer. - * - * \par Snippet - * The code snippet below illustrates the use of \p TransformInputIteratorTto - * dereference an array of integers, tripling the values and converting them to doubles. - * \par - * \code - * #include // or equivalently - * - * // Functor for tripling integer values and converting to doubles - * struct TripleDoubler - * { - * __host__ __device__ __forceinline__ - * double operator()(const int &a) const { - * return double(a * 3); - * } - * }; - * - * // Declare, allocate, and initialize a device array - * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] - * TripleDoubler conversion_op; - * - * // Create an iterator wrapper - * cub::TransformInputIterator itr(d_in, conversion_op); - * - * // Within device code: - * printf("%f\n", itr[0]); // 24.0 - * printf("%f\n", itr[1]); // 18.0 - * printf("%f\n", itr[6]); // 27.0 - * - * \endcode - * - * \tparam ValueType The value type of this iterator - * \tparam ConversionOp Unary functor type for mapping objects of type \p InputType to type \p ValueType. Must have member ValueType operator()(const InputType &datum). 
- * \tparam InputIteratorT The type of the wrapped input iterator - * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) - * - */ -template < - typename ValueType, - typename ConversionOp, - typename InputIteratorT, - typename OffsetT = ptrdiff_t> -class TransformInputIterator -{ -public: - - // Required iterator traits - typedef TransformInputIterator self_type; ///< My own type - typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another - typedef ValueType value_type; ///< The type of the element the iterator can point to - typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to - typedef ValueType reference; ///< The type of a reference to an element the iterator can point to - -#if (THRUST_VERSION >= 100700) - // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods - typedef typename thrust::detail::iterator_facade_category< - thrust::any_system_tag, - thrust::random_access_traversal_tag, - value_type, - reference - >::type iterator_category; ///< The iterator category -#else - typedef std::random_access_iterator_tag iterator_category; ///< The iterator category -#endif // THRUST_VERSION - -private: - - ConversionOp conversion_op; - InputIteratorT input_itr; - -public: - - /// Constructor - __host__ __device__ __forceinline__ TransformInputIterator( - InputIteratorT input_itr, ///< Input iterator to wrap - ConversionOp conversion_op) ///< Conversion functor to wrap - : - conversion_op(conversion_op), - input_itr(input_itr) - {} - - /// Postfix increment - __host__ __device__ __forceinline__ self_type operator++(int) - { - self_type retval = *this; - input_itr++; - return retval; - } - - /// Prefix increment - __host__ __device__ __forceinline__ self_type operator++() - { - input_itr++; - return *this; - } - - /// Indirection - __host__ __device__ __forceinline__ reference operator*() const - { - return conversion_op(*input_itr); - } - - /// Addition - template - __host__ __device__ __forceinline__ self_type operator+(Distance n) const - { - self_type retval(input_itr + n, conversion_op); - return retval; - } - - /// Addition assignment - template - __host__ __device__ __forceinline__ self_type& operator+=(Distance n) - { - input_itr += n; - return *this; - } - - /// Subtraction - template - __host__ __device__ __forceinline__ self_type operator-(Distance n) const - { - self_type retval(input_itr - n, conversion_op); - return retval; - } - - /// Subtraction assignment - template - __host__ __device__ __forceinline__ self_type& operator-=(Distance n) - { - input_itr -= n; - return *this; - } - - /// Distance - __host__ __device__ __forceinline__ difference_type operator-(self_type other) const - { - return input_itr - other.input_itr; - } - - /// Array subscript - template - __host__ __device__ __forceinline__ reference operator[](Distance n) const - { - return conversion_op(input_itr[n]); - } - - /// Structure dereference - __host__ __device__ __forceinline__ pointer operator->() - { - return &conversion_op(*input_itr); - } - - /// Equal to - __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) - { - return (input_itr == rhs.input_itr); - } - - /// Not equal to - __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) - { - return (input_itr != rhs.input_itr); - } - - /// ostream operator - friend std::ostream& operator<<(std::ostream& os, const self_type& itr) - { - return os; - } -}; - - - 
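Putting the pieces together, the transform iterator above lets a conversion functor fuse into whatever algorithm consumes it, so no intermediate array is written. A minimal sum-of-squares sketch (the functor and names are illustrative assumptions):

#include <cub/iterator/transform_input_iterator.cuh>
#include <cub/device/device_reduce.cuh>

// Hypothetical functor: square each element on the fly.
struct Square
{
    __host__ __device__ __forceinline__
    double operator()(const double &x) const { return x * x; }
};

void sum_of_squares(const double *d_in, double *d_sum, int n)
{
    Square op;
    cub::TransformInputIterator<double, Square, const double*> itr(d_in, op);

    void  *d_temp     = NULL;
    size_t temp_bytes = 0;
    cub::DeviceReduce::Sum(d_temp, temp_bytes, itr, d_sum, n);
    cudaMalloc(&d_temp, temp_bytes);
    cub::DeviceReduce::Sum(d_temp, temp_bytes, itr, d_sum, n);   // d_sum[0] = sum of x_i^2
    cudaFree(d_temp);
}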
-/** @} */ // end group UtilIterator - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/thread/thread_load.cuh b/SRC/cub/thread/thread_load.cuh deleted file mode 100644 index b1ca412f..00000000 --- a/SRC/cub/thread/thread_load.cuh +++ /dev/null @@ -1,438 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Thread utilities for reading memory using PTX cache modifiers. - */ - -#pragma once - -#include - -#include - -#include "../util_ptx.cuh" -#include "../util_type.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \addtogroup UtilIo - * @{ - */ - -//----------------------------------------------------------------------------- -// Tags and constants -//----------------------------------------------------------------------------- - -/** - * \brief Enumeration of cache modifiers for memory load operations. - */ -enum CacheLoadModifier -{ - LOAD_DEFAULT, ///< Default (no modifier) - LOAD_CA, ///< Cache at all levels - LOAD_CG, ///< Cache at global level - LOAD_CS, ///< Cache streaming (likely to be accessed once) - LOAD_CV, ///< Cache as volatile (including cached system lines) - LOAD_LDG, ///< Cache as texture - LOAD_VOLATILE, ///< Volatile (any memory space) -}; - - -/** - * \name Thread I/O (cache modified) - * @{ - */ - -/** - * \brief Thread utility for reading memory using cub::CacheLoadModifier cache modifiers. Can be used to load any data type. 
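Of the modifiers defined below, LOAD_LDG routes reads through the read-only (texture) data cache on sm_35 and newer, much as the __ldg() intrinsic does. A minimal sketch (the kernel and names are illustrative assumptions, and correctness assumes d_in is not written concurrently):

#include <cub/thread/thread_load.cuh>

__global__ void ldg_demo(const double *d_in, double *d_out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
    {
        double a = cub::ThreadLoad<cub::LOAD_LDG>(d_in + i);  // ld.global.nc on sm_35+
        double b = __ldg(d_in + i);                           // intrinsic analogue
        d_out[i] = a + b;
    }
}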
- * - * \par Example - * \code - * #include // or equivalently - * - * // 32-bit load using cache-global modifier: - * int *d_in; - * int val = cub::ThreadLoad(d_in + threadIdx.x); - * - * // 16-bit load using default modifier - * short *d_in; - * short val = cub::ThreadLoad(d_in + threadIdx.x); - * - * // 256-bit load using cache-volatile modifier - * double4 *d_in; - * double4 val = cub::ThreadLoad(d_in + threadIdx.x); - * - * // 96-bit load using cache-streaming modifier - * struct TestFoo { bool a; short b; }; - * TestFoo *d_struct; - * TestFoo val = cub::ThreadLoad(d_in + threadIdx.x); - * \endcode - * - * \tparam MODIFIER [inferred] CacheLoadModifier enumeration - * \tparam InputIteratorT [inferred] Input iterator type \iterator - */ -template < - CacheLoadModifier MODIFIER, - typename InputIteratorT> -__device__ __forceinline__ typename std::iterator_traits::value_type ThreadLoad(InputIteratorT itr); - - -//@} end member group - - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - -/// Helper structure for templated load iteration (inductive case) -template -struct IterateThreadLoad -{ - template - static __device__ __forceinline__ void Load(T const *ptr, T *vals) - { - vals[COUNT] = ThreadLoad(ptr + COUNT); - IterateThreadLoad::template Load(ptr, vals); - } - - template - static __device__ __forceinline__ void Dereference(InputIteratorT itr, T *vals) - { - vals[COUNT] = itr[COUNT]; - IterateThreadLoad::Dereference(itr, vals); - } -}; - - -/// Helper structure for templated load iteration (termination case) -template -struct IterateThreadLoad -{ - template - static __device__ __forceinline__ void Load(T const * /*ptr*/, T * /*vals*/) {} - - template - static __device__ __forceinline__ void Dereference(InputIteratorT /*itr*/, T * /*vals*/) {} -}; - - -/** - * Define a uint4 (16B) ThreadLoad specialization for the given Cache load modifier - */ -#define _CUB_LOAD_16(cub_modifier, ptx_modifier) \ - template<> \ - __device__ __forceinline__ uint4 ThreadLoad(uint4 const *ptr) \ - { \ - uint4 retval; \ - asm volatile ("ld."#ptx_modifier".v4.u32 {%0, %1, %2, %3}, [%4];" : \ - "=r"(retval.x), \ - "=r"(retval.y), \ - "=r"(retval.z), \ - "=r"(retval.w) : \ - _CUB_ASM_PTR_(ptr)); \ - return retval; \ - } \ - template<> \ - __device__ __forceinline__ ulonglong2 ThreadLoad(ulonglong2 const *ptr) \ - { \ - ulonglong2 retval; \ - asm volatile ("ld."#ptx_modifier".v2.u64 {%0, %1}, [%2];" : \ - "=l"(retval.x), \ - "=l"(retval.y) : \ - _CUB_ASM_PTR_(ptr)); \ - return retval; \ - } - -/** - * Define a uint2 (8B) ThreadLoad specialization for the given Cache load modifier - */ -#define _CUB_LOAD_8(cub_modifier, ptx_modifier) \ - template<> \ - __device__ __forceinline__ ushort4 ThreadLoad(ushort4 const *ptr) \ - { \ - ushort4 retval; \ - asm volatile ("ld."#ptx_modifier".v4.u16 {%0, %1, %2, %3}, [%4];" : \ - "=h"(retval.x), \ - "=h"(retval.y), \ - "=h"(retval.z), \ - "=h"(retval.w) : \ - _CUB_ASM_PTR_(ptr)); \ - return retval; \ - } \ - template<> \ - __device__ __forceinline__ uint2 ThreadLoad(uint2 const *ptr) \ - { \ - uint2 retval; \ - asm volatile ("ld."#ptx_modifier".v2.u32 {%0, %1}, [%2];" : \ - "=r"(retval.x), \ - "=r"(retval.y) : \ - _CUB_ASM_PTR_(ptr)); \ - return retval; \ - } \ - template<> \ - __device__ __forceinline__ unsigned long long ThreadLoad(unsigned long long const *ptr) \ - { \ - unsigned long long retval; \ - asm volatile ("ld."#ptx_modifier".u64 %0, [%1];" : \ - "=l"(retval) : \ - _CUB_ASM_PTR_(ptr)); \ - return retval; \ - } - -/** - * Define a uint (4B) ThreadLoad 
specialization for the given Cache load modifier - */ -#define _CUB_LOAD_4(cub_modifier, ptx_modifier) \ - template<> \ - __device__ __forceinline__ unsigned int ThreadLoad(unsigned int const *ptr) \ - { \ - unsigned int retval; \ - asm volatile ("ld."#ptx_modifier".u32 %0, [%1];" : \ - "=r"(retval) : \ - _CUB_ASM_PTR_(ptr)); \ - return retval; \ - } - - -/** - * Define a unsigned short (2B) ThreadLoad specialization for the given Cache load modifier - */ -#define _CUB_LOAD_2(cub_modifier, ptx_modifier) \ - template<> \ - __device__ __forceinline__ unsigned short ThreadLoad(unsigned short const *ptr) \ - { \ - unsigned short retval; \ - asm volatile ("ld."#ptx_modifier".u16 %0, [%1];" : \ - "=h"(retval) : \ - _CUB_ASM_PTR_(ptr)); \ - return retval; \ - } - - -/** - * Define an unsigned char (1B) ThreadLoad specialization for the given Cache load modifier - */ -#define _CUB_LOAD_1(cub_modifier, ptx_modifier) \ - template<> \ - __device__ __forceinline__ unsigned char ThreadLoad(unsigned char const *ptr) \ - { \ - unsigned short retval; \ - asm volatile ( \ - "{" \ - " .reg .u8 datum;" \ - " ld."#ptx_modifier".u8 datum, [%1];" \ - " cvt.u16.u8 %0, datum;" \ - "}" : \ - "=h"(retval) : \ - _CUB_ASM_PTR_(ptr)); \ - return (unsigned char) retval; \ - } - - -/** - * Define powers-of-two ThreadLoad specializations for the given Cache load modifier - */ -#define _CUB_LOAD_ALL(cub_modifier, ptx_modifier) \ - _CUB_LOAD_16(cub_modifier, ptx_modifier) \ - _CUB_LOAD_8(cub_modifier, ptx_modifier) \ - _CUB_LOAD_4(cub_modifier, ptx_modifier) \ - _CUB_LOAD_2(cub_modifier, ptx_modifier) \ - _CUB_LOAD_1(cub_modifier, ptx_modifier) \ - - -/** - * Define powers-of-two ThreadLoad specializations for the various Cache load modifiers - */ -#if CUB_PTX_ARCH >= 200 - _CUB_LOAD_ALL(LOAD_CA, ca) - _CUB_LOAD_ALL(LOAD_CG, cg) - _CUB_LOAD_ALL(LOAD_CS, cs) - _CUB_LOAD_ALL(LOAD_CV, cv) -#else - _CUB_LOAD_ALL(LOAD_CA, global) - // Use volatile to ensure coherent reads when this PTX is JIT'd to run on newer architectures with L1 - _CUB_LOAD_ALL(LOAD_CG, volatile.global) - _CUB_LOAD_ALL(LOAD_CS, global) - _CUB_LOAD_ALL(LOAD_CV, volatile.global) -#endif - -#if CUB_PTX_ARCH >= 350 - _CUB_LOAD_ALL(LOAD_LDG, global.nc) -#else - _CUB_LOAD_ALL(LOAD_LDG, global) -#endif - - -// Macro cleanup -#undef _CUB_LOAD_ALL -#undef _CUB_LOAD_1 -#undef _CUB_LOAD_2 -#undef _CUB_LOAD_4 -#undef _CUB_LOAD_8 -#undef _CUB_LOAD_16 - - - -/** - * ThreadLoad definition for LOAD_DEFAULT modifier on iterator types - */ -template -__device__ __forceinline__ typename std::iterator_traits::value_type ThreadLoad( - InputIteratorT itr, - Int2Type /*modifier*/, - Int2Type /*is_pointer*/) -{ - return *itr; -} - - -/** - * ThreadLoad definition for LOAD_DEFAULT modifier on pointer types - */ -template -__device__ __forceinline__ T ThreadLoad( - T *ptr, - Int2Type /*modifier*/, - Int2Type /*is_pointer*/) -{ - return *ptr; -} - - -/** - * ThreadLoad definition for LOAD_VOLATILE modifier on primitive pointer types - */ -template -__device__ __forceinline__ T ThreadLoadVolatilePointer( - T *ptr, - Int2Type /*is_primitive*/) -{ - T retval = *reinterpret_cast(ptr); - return retval; -} - - -/** - * ThreadLoad definition for LOAD_VOLATILE modifier on non-primitive pointer types - */ -template -__device__ __forceinline__ T ThreadLoadVolatilePointer( - T *ptr, - Int2Type /*is_primitive*/) -{ - typedef typename UnitWord::VolatileWord VolatileWord; // Word type for memcopying - - const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord); -/* - VolatileWord 
words[VOLATILE_MULTIPLE]; - - IterateThreadLoad<0, VOLATILE_MULTIPLE>::Dereference( - reinterpret_cast(ptr), - words); - - return *reinterpret_cast(words); -*/ - - T retval; - VolatileWord *words = reinterpret_cast(&retval); - IterateThreadLoad<0, VOLATILE_MULTIPLE>::Dereference( - reinterpret_cast(ptr), - words); - return retval; -} - - -/** - * ThreadLoad definition for LOAD_VOLATILE modifier on pointer types - */ -template -__device__ __forceinline__ T ThreadLoad( - T *ptr, - Int2Type /*modifier*/, - Int2Type /*is_pointer*/) -{ - // Apply tags for partial-specialization - return ThreadLoadVolatilePointer(ptr, Int2Type::PRIMITIVE>()); -} - - -/** - * ThreadLoad definition for generic modifiers on pointer types - */ -template -__device__ __forceinline__ T ThreadLoad( - T const *ptr, - Int2Type /*modifier*/, - Int2Type /*is_pointer*/) -{ - typedef typename UnitWord::DeviceWord DeviceWord; - - const int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord); - - DeviceWord words[DEVICE_MULTIPLE]; - - IterateThreadLoad<0, DEVICE_MULTIPLE>::template Load( - reinterpret_cast(const_cast(ptr)), - words); - - return *reinterpret_cast(words); -} - - -/** - * ThreadLoad definition for generic modifiers - */ -template < - CacheLoadModifier MODIFIER, - typename InputIteratorT> -__device__ __forceinline__ typename std::iterator_traits::value_type ThreadLoad(InputIteratorT itr) -{ - // Apply tags for partial-specialization - return ThreadLoad( - itr, - Int2Type(), - Int2Type::VALUE>()); -} - - - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - -/** @} */ // end group UtilIo - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/thread/thread_operators.cuh b/SRC/cub/thread/thread_operators.cuh deleted file mode 100644 index 76cd800f..00000000 --- a/SRC/cub/thread/thread_operators.cuh +++ /dev/null @@ -1,317 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Simple binary operator functor types - */ - -/****************************************************************************** - * Simple functor operators - ******************************************************************************/ - -#pragma once - -#include "../util_macro.cuh" -#include "../util_type.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \addtogroup UtilModule - * @{ - */ - -/** - * \brief Default equality functor - */ -struct Equality -{ - /// Boolean equality operator, returns (a == b) - template - __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const - { - return a == b; - } -}; - - -/** - * \brief Default inequality functor - */ -struct Inequality -{ - /// Boolean inequality operator, returns (a != b) - template - __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const - { - return a != b; - } -}; - - -/** - * \brief Inequality functor (wraps equality functor) - */ -template -struct InequalityWrapper -{ - /// Wrapped equality operator - EqualityOp op; - - /// Constructor - __host__ __device__ __forceinline__ - InequalityWrapper(EqualityOp op) : op(op) {} - - /// Boolean inequality operator, returns (a != b) - template - __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) - { - return !op(a, b); - } -}; - - -/** - * \brief Default sum functor - */ -struct Sum -{ - /// Boolean sum operator, returns a + b - template - __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const - { - return a + b; - } -}; - - -/** - * \brief Default max functor - */ -struct Max -{ - /// Boolean max operator, returns (a > b) ? a : b - template - __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const - { - return CUB_MAX(a, b); - } -}; - - -/** - * \brief Arg max functor (keeps the value and offset of the first occurrence of the larger item) - */ -struct ArgMax -{ - /// Boolean max operator, preferring the item having the smaller offset in case of ties - template - __host__ __device__ __forceinline__ KeyValuePair operator()( - const KeyValuePair &a, - const KeyValuePair &b) const - { -// Mooch BUG (device reduce argmax gk110 3.2 million random fp32) -// return ((b.value > a.value) || ((a.value == b.value) && (b.key < a.key))) ? b : a; - - if ((b.value > a.value) || ((a.value == b.value) && (b.key < a.key))) - return b; - return a; - } -}; - - -/** - * \brief Default min functor - */ -struct Min -{ - /// Boolean min operator, returns (a < b) ? 
a : b - template - __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const - { - return CUB_MIN(a, b); - } -}; - - -/** - * \brief Arg min functor (keeps the value and offset of the first occurrence of the smallest item) - */ -struct ArgMin -{ - /// Boolean min operator, preferring the item having the smaller offset in case of ties - template - __host__ __device__ __forceinline__ KeyValuePair operator()( - const KeyValuePair &a, - const KeyValuePair &b) const - { -// Mooch BUG (device reduce argmax gk110 3.2 million random fp32) -// return ((b.value < a.value) || ((a.value == b.value) && (b.key < a.key))) ? b : a; - - if ((b.value < a.value) || ((a.value == b.value) && (b.key < a.key))) - return b; - return a; - } -}; - - -/** - * \brief Default cast functor - */ -template -struct CastOp -{ - /// Cast operator, returns (B) a - template - __host__ __device__ __forceinline__ B operator()(const A &a) const - { - return (B) a; - } -}; - - -/** - * \brief Binary operator wrapper for switching non-commutative scan arguments - */ -template -class SwizzleScanOp -{ -private: - - /// Wrapped scan operator - ScanOp scan_op; - -public: - - /// Constructor - __host__ __device__ __forceinline__ - SwizzleScanOp(ScanOp scan_op) : scan_op(scan_op) {} - - /// Switch the scan arguments - template - __host__ __device__ __forceinline__ - T operator()(const T &a, const T &b) - { - T _a(a); - T _b(b); - - return scan_op(_b, _a); - } -}; - - -/** - * \brief Reduce-by-segment functor. - * - * Given two cub::KeyValuePair inputs \p a and \p b and a - * binary associative combining operator \p f(const T &x, const T &y), - * an instance of this functor returns a cub::KeyValuePair whose \p key - * field is a.key + b.key, and whose \p value field - * is either b.value if b.key is non-zero, or f(a.value, b.value) otherwise. - * - * ReduceBySegmentOp is an associative, non-commutative binary combining operator - * for input sequences of cub::KeyValuePair pairings. Such - * sequences are typically used to represent a segmented set of values to be reduced - * and a corresponding set of {0,1}-valued integer "head flags" demarcating the - * first value of each segment. - * - */ -template ///< Binary reduction operator to apply to values -struct ReduceBySegmentOp -{ - /// Wrapped reduction operator - ReductionOpT op; - - /// Constructor - __host__ __device__ __forceinline__ ReduceBySegmentOp() {} - - /// Constructor - __host__ __device__ __forceinline__ ReduceBySegmentOp(ReductionOpT op) : op(op) {} - - /// Scan operator - template ///< KeyValuePair pairing of T (value) and OffsetT (head flag) - __host__ __device__ __forceinline__ KeyValuePairT operator()( - const KeyValuePairT &first, ///< First partial reduction - const KeyValuePairT &second) ///< Second partial reduction - { - KeyValuePairT retval; - retval.key = first.key + second.key; - retval.value = (second.key) ? 
- second.value : // The second partial reduction spans a segment reset, so it's value aggregate becomes the running aggregate - op(first.value, second.value); // The second partial reduction does not span a reset, so accumulate both into the running aggregate - return retval; - } -}; - - - -template ///< Binary reduction operator to apply to values -struct ReduceByKeyOp -{ - /// Wrapped reduction operator - ReductionOpT op; - - /// Constructor - __host__ __device__ __forceinline__ ReduceByKeyOp() {} - - /// Constructor - __host__ __device__ __forceinline__ ReduceByKeyOp(ReductionOpT op) : op(op) {} - - /// Scan operator - template - __host__ __device__ __forceinline__ KeyValuePairT operator()( - const KeyValuePairT &first, ///< First partial reduction - const KeyValuePairT &second) ///< Second partial reduction - { - KeyValuePairT retval = second; - - if (first.key == second.key) - retval.value = op(first.value, retval.value); - - return retval; - } -}; - - - - - - - -/** @} */ // end group UtilModule - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/thread/thread_reduce.cuh b/SRC/cub/thread/thread_reduce.cuh deleted file mode 100644 index 4c13688f..00000000 --- a/SRC/cub/thread/thread_reduce.cuh +++ /dev/null @@ -1,152 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
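[Editor's sketch to make the ReduceBySegmentOp contract above concrete: it combines (head-flag, value) pairs so that a non-zero flag in the second operand resets the running aggregate instead of accumulating across a segment boundary. A small host-side demonstration (built with nvcc, since the functors are __host__ __device__), assuming cub::KeyValuePair from the vendored util_type.cuh, which this header pulls in:

#include <cstdio>
#include <cub/thread/thread_operators.cuh>

int main()
{
    typedef cub::KeyValuePair<int, int> Pair;  // key = head flag, value = payload
    cub::ReduceBySegmentOp<cub::Sum> op;

    Pair a(0, 5);   // running aggregate, no segment boundary yet
    Pair b(1, 7);   // head flag set: b opens a new segment
    Pair c(0, 2);   // continuation of b's segment

    Pair ab  = op(a, b);    // b.key != 0, so the value resets to b.value = 7
    Pair abc = op(ab, c);   // c.key == 0, so values accumulate: 7 + 2 = 9
    printf("flags=%d value=%d\n", abc.key, abc.value);  // flags=1 value=9
    return 0;
}
]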
- * - ******************************************************************************/ - -/** - * \file - * Thread utilities for sequential reduction over statically-sized array types - */ - -#pragma once - -#include "../thread/thread_operators.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/// Internal namespace (to prevent ADL mishaps between static functions when mixing different CUB installations) -namespace internal { - -/** - * Sequential reduction over statically-sized array types - */ -template < - int LENGTH, - typename T, - typename ReductionOp> -__device__ __forceinline__ T ThreadReduce( - T* input, ///< [in] Input array - ReductionOp reduction_op, ///< [in] Binary reduction operator - T prefix, ///< [in] Prefix to seed reduction with - Int2Type /*length*/) -{ - T retval = prefix; - - #pragma unroll - for (int i = 0; i < LENGTH; ++i) - retval = reduction_op(retval, input[i]); - - return retval; -} - - -/** - * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. - * - * \tparam LENGTH LengthT of input array - * \tparam T [inferred] The data type to be reduced. - * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) - */ -template < - int LENGTH, - typename T, - typename ReductionOp> -__device__ __forceinline__ T ThreadReduce( - T* input, ///< [in] Input array - ReductionOp reduction_op, ///< [in] Binary reduction operator - T prefix) ///< [in] Prefix to seed reduction with -{ - return ThreadReduce(input, reduction_op, prefix, Int2Type()); -} - - -/** - * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array. The aggregate is returned. - * - * \tparam LENGTH LengthT of input array - * \tparam T [inferred] The data type to be reduced. - * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) - */ -template < - int LENGTH, - typename T, - typename ReductionOp> -__device__ __forceinline__ T ThreadReduce( - T* input, ///< [in] Input array - ReductionOp reduction_op) ///< [in] Binary reduction operator -{ - T prefix = input[0]; - return ThreadReduce(input + 1, reduction_op, prefix); -} - - -/** - * \brief Perform a sequential reduction over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. - * - * \tparam LENGTH [inferred] LengthT of \p input array - * \tparam T [inferred] The data type to be reduced. - * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) - */ -template < - int LENGTH, - typename T, - typename ReductionOp> -__device__ __forceinline__ T ThreadReduce( - T (&input)[LENGTH], ///< [in] Input array - ReductionOp reduction_op, ///< [in] Binary reduction operator - T prefix) ///< [in] Prefix to seed reduction with -{ - return ThreadReduce(input, reduction_op, prefix, Int2Type()); -} - - -/** - * \brief Serial reduction with the specified operator - * - * \tparam LENGTH [inferred] LengthT of \p input array - * \tparam T [inferred] The data type to be reduced. 
- * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) - */ -template < - int LENGTH, - typename T, - typename ReductionOp> -__device__ __forceinline__ T ThreadReduce( - T (&input)[LENGTH], ///< [in] Input array - ReductionOp reduction_op) ///< [in] Binary reduction operator -{ - return ThreadReduce((T*) input, reduction_op); -} - - -} // internal namespace -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/thread/thread_scan.cuh b/SRC/cub/thread/thread_scan.cuh deleted file mode 100644 index 8d67549a..00000000 --- a/SRC/cub/thread/thread_scan.cuh +++ /dev/null @@ -1,268 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
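[Editor's sketch for the ThreadReduce overloads deleted above: they unroll a purely sequential reduction over a statically sized, register-resident array, and serve as the serial leaves under CUB's warp- and block-wide reductions. A minimal device-side sketch, assuming the vendored headers (note that in this snapshot ThreadReduce sits in the cub::internal namespace); the kernel and buffer names are illustrative:

#include <cub/thread/thread_reduce.cuh>

__global__ void per_thread_sums(const float *d_in, float *d_out)
{
    // Each thread owns four consecutive elements, held in registers.
    float items[4];
    #pragma unroll
    for (int i = 0; i < 4; ++i)
        items[i] = d_in[threadIdx.x * 4 + i];

    // Unrolled sequential reduction; returns the aggregate of items[0..3].
    d_out[threadIdx.x] = cub::internal::ThreadReduce<4>(items, cub::Sum());
}
]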
- * - ******************************************************************************/ - -/** - * \file - * Thread utilities for sequential prefix scan over statically-sized array types - */ - -#pragma once - -#include "../thread/thread_operators.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/// Internal namespace (to prevent ADL mishaps between static functions when mixing different CUB installations) -namespace internal { - - -/** - * \addtogroup UtilModule - * @{ - */ - -/** - * \name Sequential prefix scan over statically-sized array types - * @{ - */ - -template < - int LENGTH, - typename T, - typename ScanOp> -__device__ __forceinline__ T ThreadScanExclusive( - T inclusive, - T exclusive, - T *input, ///< [in] Input array - T *output, ///< [out] Output array (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - Int2Type /*length*/) -{ - #pragma unroll - for (int i = 0; i < LENGTH; ++i) - { - inclusive = scan_op(exclusive, input[i]); - output[i] = exclusive; - exclusive = inclusive; - } - - return inclusive; -} - - - -/** - * \brief Perform a sequential exclusive prefix scan over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. - * - * \tparam LENGTH LengthT of \p input and \p output arrays - * \tparam T [inferred] The data type to be scanned. - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ -template < - int LENGTH, - typename T, - typename ScanOp> -__device__ __forceinline__ T ThreadScanExclusive( - T *input, ///< [in] Input array - T *output, ///< [out] Output array (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T prefix, ///< [in] Prefix to seed scan with - bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. If not, the first output element is undefined. (Handy for preventing thread-0 from applying a prefix.) -{ - T inclusive = input[0]; - if (apply_prefix) - { - inclusive = scan_op(prefix, inclusive); - } - output[0] = prefix; - T exclusive = inclusive; - - return ThreadScanExclusive(inclusive, exclusive, input + 1, output + 1, scan_op, Int2Type()); -} - - -/** - * \brief Perform a sequential exclusive prefix scan over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. - * - * \tparam LENGTH [inferred] LengthT of \p input and \p output arrays - * \tparam T [inferred] The data type to be scanned. - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ -template < - int LENGTH, - typename T, - typename ScanOp> -__device__ __forceinline__ T ThreadScanExclusive( - T (&input)[LENGTH], ///< [in] Input array - T (&output)[LENGTH], ///< [out] Output array (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T prefix, ///< [in] Prefix to seed scan with - bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. (Handy for preventing thread-0 from applying a prefix.) 
-{ - return ThreadScanExclusive((T*) input, (T*) output, scan_op, prefix, apply_prefix); -} - - - - - - - - - -template < - int LENGTH, - typename T, - typename ScanOp> -__device__ __forceinline__ T ThreadScanInclusive( - T inclusive, - T *input, ///< [in] Input array - T *output, ///< [out] Output array (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - Int2Type /*length*/) -{ - #pragma unroll - for (int i = 0; i < LENGTH; ++i) - { - inclusive = scan_op(inclusive, input[i]); - output[i] = inclusive; - } - - return inclusive; -} - - -/** - * \brief Perform a sequential inclusive prefix scan over \p LENGTH elements of the \p input array. The aggregate is returned. - * - * \tparam LENGTH LengthT of \p input and \p output arrays - * \tparam T [inferred] The data type to be scanned. - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ -template < - int LENGTH, - typename T, - typename ScanOp> -__device__ __forceinline__ T ThreadScanInclusive( - T *input, ///< [in] Input array - T *output, ///< [out] Output array (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan operator -{ - T inclusive = input[0]; - output[0] = inclusive; - - // Continue scan - return ThreadScanInclusive(inclusive, input + 1, output + 1, scan_op, Int2Type()); -} - - -/** - * \brief Perform a sequential inclusive prefix scan over the statically-sized \p input array. The aggregate is returned. - * - * \tparam LENGTH [inferred] LengthT of \p input and \p output arrays - * \tparam T [inferred] The data type to be scanned. - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ -template < - int LENGTH, - typename T, - typename ScanOp> -__device__ __forceinline__ T ThreadScanInclusive( - T (&input)[LENGTH], ///< [in] Input array - T (&output)[LENGTH], ///< [out] Output array (may be aliased to \p input) - ScanOp scan_op) ///< [in] Binary scan operator -{ - return ThreadScanInclusive((T*) input, (T*) output, scan_op); -} - - -/** - * \brief Perform a sequential inclusive prefix scan over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. - * - * \tparam LENGTH LengthT of \p input and \p output arrays - * \tparam T [inferred] The data type to be scanned. - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ -template < - int LENGTH, - typename T, - typename ScanOp> -__device__ __forceinline__ T ThreadScanInclusive( - T *input, ///< [in] Input array - T *output, ///< [out] Output array (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T prefix, ///< [in] Prefix to seed scan with - bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. (Handy for preventing thread-0 from applying a prefix.) -{ - T inclusive = input[0]; - if (apply_prefix) - { - inclusive = scan_op(prefix, inclusive); - } - output[0] = inclusive; - - // Continue scan - return ThreadScanInclusive(inclusive, input + 1, output + 1, scan_op, Int2Type()); -} - - -/** - * \brief Perform a sequential inclusive prefix scan over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. - * - * \tparam LENGTH [inferred] LengthT of \p input and \p output arrays - * \tparam T [inferred] The data type to be scanned. 
- * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ -template < - int LENGTH, - typename T, - typename ScanOp> -__device__ __forceinline__ T ThreadScanInclusive( - T (&input)[LENGTH], ///< [in] Input array - T (&output)[LENGTH], ///< [out] Output array (may be aliased to \p input) - ScanOp scan_op, ///< [in] Binary scan operator - T prefix, ///< [in] Prefix to seed scan with - bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. (Handy for preventing thread-0 from applying a prefix.) -{ - return ThreadScanInclusive((T*) input, (T*) output, scan_op, prefix, apply_prefix); -} - - -//@} end member group - -/** @} */ // end group UtilModule - - -} // internal namespace -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/thread/thread_search.cuh b/SRC/cub/thread/thread_search.cuh deleted file mode 100644 index 3099080a..00000000 --- a/SRC/cub/thread/thread_search.cuh +++ /dev/null @@ -1,154 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
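[Editor's sketch for the per-thread scans deleted above: ThreadScanExclusive/ThreadScanInclusive are the serial prefix scans that seed CUB's wider scan primitives. A device-side sketch of the exclusive form, assuming the vendored headers; for items {1,2,3,4} and a prefix of 0 the outputs are {0,1,3,6} and the returned aggregate is 10. Names are illustrative:

#include <cub/thread/thread_scan.cuh>

__global__ void per_thread_scan(const int *d_in, int *d_excl, int *d_agg)
{
    int items[4], excl[4];
    #pragma unroll
    for (int i = 0; i < 4; ++i)
        items[i] = d_in[threadIdx.x * 4 + i];

    // Exclusive prefix sum seeded with 0; excl[0] receives the prefix and
    // the thread-wide aggregate comes back as the return value.
    d_agg[threadIdx.x] =
        cub::internal::ThreadScanExclusive<4>(items, excl, cub::Sum(), 0);

    #pragma unroll
    for (int i = 0; i < 4; ++i)
        d_excl[threadIdx.x * 4 + i] = excl[i];
}
]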
- * - ******************************************************************************/ - -/** - * \file - * Thread utilities for sequential search - */ - -#pragma once - -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * Computes the begin offsets into A and B for the specific diagonal - */ -template < - typename AIteratorT, - typename BIteratorT, - typename OffsetT, - typename CoordinateT> -__host__ __device__ __forceinline__ void MergePathSearch( - OffsetT diagonal, - AIteratorT a, - BIteratorT b, - OffsetT a_len, - OffsetT b_len, - CoordinateT& path_coordinate) -{ - /// The value type of the input iterator - typedef typename std::iterator_traits::value_type T; - - OffsetT split_min = CUB_MAX(diagonal - b_len, 0); - OffsetT split_max = CUB_MIN(diagonal, a_len); - - while (split_min < split_max) - { - OffsetT split_pivot = (split_min + split_max) >> 1; - if (a[split_pivot] <= b[diagonal - split_pivot - 1]) - { - // Move candidate split range up A, down B - split_min = split_pivot + 1; - } - else - { - // Move candidate split range up B, down A - split_max = split_pivot; - } - } - - path_coordinate.x = CUB_MIN(split_min, a_len); - path_coordinate.y = diagonal - split_min; -} - - - -/** - * \brief Returns the offset of the first value within \p input which does not compare less than \p val - */ -template < - typename InputIteratorT, - typename OffsetT, - typename T> -__device__ __forceinline__ OffsetT LowerBound( - InputIteratorT input, ///< [in] Input sequence - OffsetT num_items, ///< [in] Input sequence length - T val) ///< [in] Search key -{ - OffsetT retval = 0; - while (num_items > 0) - { - OffsetT half = num_items >> 1; - if (input[retval + half] < val) - { - retval = retval + (half + 1); - num_items = num_items - (half + 1); - } - else - { - num_items = half; - } - } - - return retval; -} - - -/** - * \brief Returns the offset of the first value within \p input which compares greater than \p val - */ -template < - typename InputIteratorT, - typename OffsetT, - typename T> -__device__ __forceinline__ OffsetT UpperBound( - InputIteratorT input, ///< [in] Input sequence - OffsetT num_items, ///< [in] Input sequence length - T val) ///< [in] Search key -{ - OffsetT retval = 0; - while (num_items > 0) - { - OffsetT half = num_items >> 1; - if (val < input[retval + half]) - { - num_items = half; - } - else - { - retval = retval + (half + 1); - num_items = num_items - (half + 1); - } - } - - return retval; -} - - - - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/thread/thread_store.cuh b/SRC/cub/thread/thread_store.cuh deleted file mode 100644 index ec20b36f..00000000 --- a/SRC/cub/thread/thread_store.cuh +++ /dev/null @@ -1,422 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
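[Editor's sketch for thread_search.cuh, removed just above. It carries three sequential searches: MergePathSearch, which splits a diagonal of the merge grid between two sorted sequences (the load-balancing step behind CUB's merge-based primitives), and the classic LowerBound/UpperBound halving loops. A device-side sketch of the bounds, assuming the vendored header and a sorted input; the kernel and buffer names are illustrative:

#include <cub/thread/thread_search.cuh>

__global__ void bounds_demo(const int *d_sorted, int num_items, int *d_out)
{
    int key = (int) threadIdx.x;
    // Offset of the first element that does not compare less than key.
    int lo = cub::LowerBound(d_sorted, num_items, key);
    // Offset of the first element that compares greater than key.
    int hi = cub::UpperBound(d_sorted, num_items, key);
    // hi - lo is the number of occurrences of key in d_sorted.
    d_out[2 * threadIdx.x]     = lo;
    d_out[2 * threadIdx.x + 1] = hi;
}
]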
- * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Thread utilities for writing memory using PTX cache modifiers. - */ - -#pragma once - -#include - -#include "../util_ptx.cuh" -#include "../util_type.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \addtogroup UtilIo - * @{ - */ - - -//----------------------------------------------------------------------------- -// Tags and constants -//----------------------------------------------------------------------------- - -/** - * \brief Enumeration of cache modifiers for memory store operations. - */ -enum CacheStoreModifier -{ - STORE_DEFAULT, ///< Default (no modifier) - STORE_WB, ///< Cache write-back all coherent levels - STORE_CG, ///< Cache at global level - STORE_CS, ///< Cache streaming (likely to be accessed once) - STORE_WT, ///< Cache write-through (to system memory) - STORE_VOLATILE, ///< Volatile shared (any memory space) -}; - - -/** - * \name Thread I/O (cache modified) - * @{ - */ - -/** - * \brief Thread utility for writing memory using cub::CacheStoreModifier cache modifiers. Can be used to store any data type. 
- * - * \par Example - * \code - * #include // or equivalently - * - * // 32-bit store using cache-global modifier: - * int *d_out; - * int val; - * cub::ThreadStore(d_out + threadIdx.x, val); - * - * // 16-bit store using default modifier - * short *d_out; - * short val; - * cub::ThreadStore(d_out + threadIdx.x, val); - * - * // 256-bit store using write-through modifier - * double4 *d_out; - * double4 val; - * cub::ThreadStore(d_out + threadIdx.x, val); - * - * // 96-bit store using cache-streaming cache modifier - * struct TestFoo { bool a; short b; }; - * TestFoo *d_struct; - * TestFoo val; - * cub::ThreadStore(d_out + threadIdx.x, val); - * \endcode - * - * \tparam MODIFIER [inferred] CacheStoreModifier enumeration - * \tparam InputIteratorT [inferred] Output iterator type \iterator - * \tparam T [inferred] Data type of output value - */ -template < - CacheStoreModifier MODIFIER, - typename OutputIteratorT, - typename T> -__device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val); - - -//@} end member group - - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - -/// Helper structure for templated store iteration (inductive case) -template -struct IterateThreadStore -{ - template - static __device__ __forceinline__ void Store(T *ptr, T *vals) - { - ThreadStore(ptr + COUNT, vals[COUNT]); - IterateThreadStore::template Store(ptr, vals); - } - - template - static __device__ __forceinline__ void Dereference(OutputIteratorT ptr, T *vals) - { - ptr[COUNT] = vals[COUNT]; - IterateThreadStore::Dereference(ptr, vals); - } - -}; - -/// Helper structure for templated store iteration (termination case) -template -struct IterateThreadStore -{ - template - static __device__ __forceinline__ void Store(T * /*ptr*/, T * /*vals*/) {} - - template - static __device__ __forceinline__ void Dereference(OutputIteratorT /*ptr*/, T * /*vals*/) {} -}; - - -/** - * Define a uint4 (16B) ThreadStore specialization for the given Cache load modifier - */ -#define _CUB_STORE_16(cub_modifier, ptx_modifier) \ - template<> \ - __device__ __forceinline__ void ThreadStore(uint4* ptr, uint4 val) \ - { \ - asm volatile ("st."#ptx_modifier".v4.u32 [%0], {%1, %2, %3, %4};" : : \ - _CUB_ASM_PTR_(ptr), \ - "r"(val.x), \ - "r"(val.y), \ - "r"(val.z), \ - "r"(val.w)); \ - } \ - template<> \ - __device__ __forceinline__ void ThreadStore(ulonglong2* ptr, ulonglong2 val) \ - { \ - asm volatile ("st."#ptx_modifier".v2.u64 [%0], {%1, %2};" : : \ - _CUB_ASM_PTR_(ptr), \ - "l"(val.x), \ - "l"(val.y)); \ - } - - -/** - * Define a uint2 (8B) ThreadStore specialization for the given Cache load modifier - */ -#define _CUB_STORE_8(cub_modifier, ptx_modifier) \ - template<> \ - __device__ __forceinline__ void ThreadStore(ushort4* ptr, ushort4 val) \ - { \ - asm volatile ("st."#ptx_modifier".v4.u16 [%0], {%1, %2, %3, %4};" : : \ - _CUB_ASM_PTR_(ptr), \ - "h"(val.x), \ - "h"(val.y), \ - "h"(val.z), \ - "h"(val.w)); \ - } \ - template<> \ - __device__ __forceinline__ void ThreadStore(uint2* ptr, uint2 val) \ - { \ - asm volatile ("st."#ptx_modifier".v2.u32 [%0], {%1, %2};" : : \ - _CUB_ASM_PTR_(ptr), \ - "r"(val.x), \ - "r"(val.y)); \ - } \ - template<> \ - __device__ __forceinline__ void ThreadStore(unsigned long long* ptr, unsigned long long val) \ - { \ - asm volatile ("st."#ptx_modifier".u64 [%0], %1;" : : \ - _CUB_ASM_PTR_(ptr), \ - "l"(val)); \ - } - -/** - * Define a unsigned int (4B) ThreadStore specialization for the given Cache load modifier - */ -#define _CUB_STORE_4(cub_modifier, ptx_modifier) \ - template<> \ - 
__device__ __forceinline__ void ThreadStore(unsigned int* ptr, unsigned int val) \ - { \ - asm volatile ("st."#ptx_modifier".u32 [%0], %1;" : : \ - _CUB_ASM_PTR_(ptr), \ - "r"(val)); \ - } - - -/** - * Define a unsigned short (2B) ThreadStore specialization for the given Cache load modifier - */ -#define _CUB_STORE_2(cub_modifier, ptx_modifier) \ - template<> \ - __device__ __forceinline__ void ThreadStore(unsigned short* ptr, unsigned short val) \ - { \ - asm volatile ("st."#ptx_modifier".u16 [%0], %1;" : : \ - _CUB_ASM_PTR_(ptr), \ - "h"(val)); \ - } - - -/** - * Define a unsigned char (1B) ThreadStore specialization for the given Cache load modifier - */ -#define _CUB_STORE_1(cub_modifier, ptx_modifier) \ - template<> \ - __device__ __forceinline__ void ThreadStore(unsigned char* ptr, unsigned char val) \ - { \ - asm volatile ( \ - "{" \ - " .reg .u8 datum;" \ - " cvt.u8.u16 datum, %1;" \ - " st."#ptx_modifier".u8 [%0], datum;" \ - "}" : : \ - _CUB_ASM_PTR_(ptr), \ - "h"((unsigned short) val)); \ - } - -/** - * Define powers-of-two ThreadStore specializations for the given Cache load modifier - */ -#define _CUB_STORE_ALL(cub_modifier, ptx_modifier) \ - _CUB_STORE_16(cub_modifier, ptx_modifier) \ - _CUB_STORE_8(cub_modifier, ptx_modifier) \ - _CUB_STORE_4(cub_modifier, ptx_modifier) \ - _CUB_STORE_2(cub_modifier, ptx_modifier) \ - _CUB_STORE_1(cub_modifier, ptx_modifier) \ - - -/** - * Define ThreadStore specializations for the various Cache load modifiers - */ -#if CUB_PTX_ARCH >= 200 - _CUB_STORE_ALL(STORE_WB, wb) - _CUB_STORE_ALL(STORE_CG, cg) - _CUB_STORE_ALL(STORE_CS, cs) - _CUB_STORE_ALL(STORE_WT, wt) -#else - _CUB_STORE_ALL(STORE_WB, global) - _CUB_STORE_ALL(STORE_CG, global) - _CUB_STORE_ALL(STORE_CS, global) - _CUB_STORE_ALL(STORE_WT, volatile.global) -#endif - - -// Macro cleanup -#undef _CUB_STORE_ALL -#undef _CUB_STORE_1 -#undef _CUB_STORE_2 -#undef _CUB_STORE_4 -#undef _CUB_STORE_8 -#undef _CUB_STORE_16 - - -/** - * ThreadStore definition for STORE_DEFAULT modifier on iterator types - */ -template -__device__ __forceinline__ void ThreadStore( - OutputIteratorT itr, - T val, - Int2Type /*modifier*/, - Int2Type /*is_pointer*/) -{ - *itr = val; -} - - -/** - * ThreadStore definition for STORE_DEFAULT modifier on pointer types - */ -template -__device__ __forceinline__ void ThreadStore( - T *ptr, - T val, - Int2Type /*modifier*/, - Int2Type /*is_pointer*/) -{ - *ptr = val; -} - - -/** - * ThreadStore definition for STORE_VOLATILE modifier on primitive pointer types - */ -template -__device__ __forceinline__ void ThreadStoreVolatilePtr( - T *ptr, - T val, - Int2Type /*is_primitive*/) -{ - *reinterpret_cast(ptr) = val; -} - - -/** - * ThreadStore definition for STORE_VOLATILE modifier on non-primitive pointer types - */ -template -__device__ __forceinline__ void ThreadStoreVolatilePtr( - T *ptr, - T val, - Int2Type /*is_primitive*/) -{ - // Create a temporary using shuffle-words, then store using volatile-words - typedef typename UnitWord::VolatileWord VolatileWord; - typedef typename UnitWord::ShuffleWord ShuffleWord; - - const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord); - const int SHUFFLE_MULTIPLE = sizeof(T) / sizeof(ShuffleWord); - - VolatileWord words[VOLATILE_MULTIPLE]; - - #pragma unroll - for (int i = 0; i < SHUFFLE_MULTIPLE; ++i) - reinterpret_cast(words)[i] = reinterpret_cast(&val)[i]; - - IterateThreadStore<0, VOLATILE_MULTIPLE>::template Dereference( - reinterpret_cast(ptr), - words); -} - - -/** - * ThreadStore definition for STORE_VOLATILE modifier on 
pointer types - */ -template -__device__ __forceinline__ void ThreadStore( - T *ptr, - T val, - Int2Type /*modifier*/, - Int2Type /*is_pointer*/) -{ - ThreadStoreVolatilePtr(ptr, val, Int2Type::PRIMITIVE>()); -} - - -/** - * ThreadStore definition for generic modifiers on pointer types - */ -template -__device__ __forceinline__ void ThreadStore( - T *ptr, - T val, - Int2Type /*modifier*/, - Int2Type /*is_pointer*/) -{ - // Create a temporary using shuffle-words, then store using device-words - typedef typename UnitWord::DeviceWord DeviceWord; - typedef typename UnitWord::ShuffleWord ShuffleWord; - - const int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord); - const int SHUFFLE_MULTIPLE = sizeof(T) / sizeof(ShuffleWord); - - DeviceWord words[DEVICE_MULTIPLE]; - - #pragma unroll - for (int i = 0; i < SHUFFLE_MULTIPLE; ++i) - reinterpret_cast(words)[i] = reinterpret_cast(&val)[i]; - - IterateThreadStore<0, DEVICE_MULTIPLE>::template Store( - reinterpret_cast(ptr), - words); -} - - -/** - * ThreadStore definition for generic modifiers - */ -template -__device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val) -{ - ThreadStore( - itr, - val, - Int2Type(), - Int2Type::VALUE>()); -} - - - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - -/** @} */ // end group UtilIo - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/util_allocator.cuh b/SRC/cub/util_allocator.cuh deleted file mode 100644 index 0e6dd048..00000000 --- a/SRC/cub/util_allocator.cuh +++ /dev/null @@ -1,708 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/****************************************************************************** - * Simple caching allocator for device memory allocations. 
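[Editor's sketch for ThreadStore, whose definition ends just above. It mirrors ThreadLoad on the write side: the modifier selects a PTX "st" variant, and non-primitive types are staged through shuffle-words before being written out as device-words. A kernel-side sketch, assuming the vendored header; the kernel and buffer names are illustrative:

#include <cub/thread/thread_store.cuh>

__global__ void stream_out(const float *d_in, float *d_out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
    {
        // Cache-streaming store: hints the line is unlikely to be re-read.
        // The deleted specializations emit "st.cs" for this on sm_20+.
        cub::ThreadStore<cub::STORE_CS>(d_out + i, d_in[i]);
    }
}
]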
The allocator is - * thread-safe and capable of managing device allocations on multiple devices. - ******************************************************************************/ - -#pragma once - -#include "util_namespace.cuh" -#include "util_debug.cuh" - -#include -#include - -#include "host/mutex.cuh" -#include - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \addtogroup UtilMgmt - * @{ - */ - - -/****************************************************************************** - * CachingDeviceAllocator (host use) - ******************************************************************************/ - -/** - * \brief A simple caching allocator for device memory allocations. - * - * \par Overview - * The allocator is thread-safe and stream-safe and is capable of managing cached - * device allocations on multiple devices. It behaves as follows: - * - * \par - * - Allocations from the allocator are associated with an \p active_stream. Once freed, - * the allocation becomes available immediately for reuse within the \p active_stream - * with which it was associated with during allocation, and it becomes available for - * reuse within other streams when all prior work submitted to \p active_stream has completed. - * - Allocations are categorized and cached by bin size. A new allocation request of - * a given size will only consider cached allocations within the corresponding bin. - * - Bin limits progress geometrically in accordance with the growth factor - * \p bin_growth provided during construction. Unused device allocations within - * a larger bin cache are not reused for allocation requests that categorize to - * smaller bin sizes. - * - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to - * (\p bin_growth ^ \p min_bin). - * - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest - * bin and are simply freed when they are deallocated instead of being returned - * to a bin-cache. - * - %If the total storage of cached allocations on a given device will exceed - * \p max_cached_bytes, allocations for that device are simply freed when they are - * deallocated instead of being returned to their bin-cache. 
- * - * \par - * For example, the default-constructed CachingDeviceAllocator is configured with: - * - \p bin_growth = 8 - * - \p min_bin = 3 - * - \p max_bin = 7 - * - \p max_cached_bytes = 6MB - 1B - * - * \par - * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB - * and sets a maximum of 6,291,455 cached bytes per device - * - */ -struct CachingDeviceAllocator -{ - - //--------------------------------------------------------------------- - // Constants - //--------------------------------------------------------------------- - - /// Out-of-bounds bin - static const unsigned int INVALID_BIN = (unsigned int) -1; - - /// Invalid size - static const size_t INVALID_SIZE = (size_t) -1; - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - /// Invalid device ordinal - static const int INVALID_DEVICE_ORDINAL = -1; - - //--------------------------------------------------------------------- - // Type definitions and helper types - //--------------------------------------------------------------------- - - /** - * Descriptor for device memory allocations - */ - struct BlockDescriptor - { - void* d_ptr; // Device pointer - size_t bytes; // Size of allocation in bytes - unsigned int bin; // Bin enumeration - int device; // device ordinal - cudaStream_t associated_stream; // Associated associated_stream - cudaEvent_t ready_event; // Signal when associated stream has run to the point at which this block was freed - - // Constructor (suitable for searching maps for a specific block, given its pointer and device) - BlockDescriptor(void *d_ptr, int device) : - d_ptr(d_ptr), - bytes(0), - bin(INVALID_BIN), - device(device), - associated_stream(0), - ready_event(0) - {} - - // Constructor (suitable for searching maps for a range of suitable blocks, given a device) - BlockDescriptor(int device) : - d_ptr(NULL), - bytes(0), - bin(INVALID_BIN), - device(device), - associated_stream(0), - ready_event(0) - {} - - // Comparison functor for comparing device pointers - static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b) - { - if (a.device == b.device) - return (a.d_ptr < b.d_ptr); - else - return (a.device < b.device); - } - - // Comparison functor for comparing allocation sizes - static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b) - { - if (a.device == b.device) - return (a.bytes < b.bytes); - else - return (a.device < b.device); - } - }; - - /// BlockDescriptor comparator function interface - typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &); - - class TotalBytes { - public: - size_t free; - size_t live; - TotalBytes() { free = live = 0; } - }; - - /// Set type for cached blocks (ordered by size) - typedef std::multiset CachedBlocks; - - /// Set type for live blocks (ordered by ptr) - typedef std::multiset BusyBlocks; - - /// Map type of device ordinals to the number of cached bytes cached by each device - typedef std::map GpuCachedBytes; - - - //--------------------------------------------------------------------- - // Utility functions - //--------------------------------------------------------------------- - - /** - * Integer pow function for unsigned base and exponent - */ - static unsigned int IntPow( - unsigned int base, - unsigned int exp) - { - unsigned int retval = 1; - while (exp > 0) - { - if (exp & 1) { - retval = retval * base; // multiply the result by the current base - } - base = base * base; // square the base - exp = exp >> 1; // divide the exponent in half - } - return retval; - } - - - /** - * Round 
up to the nearest power-of - */ - void NearestPowerOf( - unsigned int &power, - size_t &rounded_bytes, - unsigned int base, - size_t value) - { - power = 0; - rounded_bytes = 1; - - if (value * base < value) - { - // Overflow - power = sizeof(size_t) * 8; - rounded_bytes = size_t(0) - 1; - return; - } - - while (rounded_bytes < value) - { - rounded_bytes *= base; - power++; - } - } - - - //--------------------------------------------------------------------- - // Fields - //--------------------------------------------------------------------- - - cub::Mutex mutex; /// Mutex for thread-safety - - unsigned int bin_growth; /// Geometric growth factor for bin-sizes - unsigned int min_bin; /// Minimum bin enumeration - unsigned int max_bin; /// Maximum bin enumeration - - size_t min_bin_bytes; /// Minimum bin size - size_t max_bin_bytes; /// Maximum bin size - size_t max_cached_bytes; /// Maximum aggregate cached bytes per device - - const bool skip_cleanup; /// Whether or not to skip a call to FreeAllCached() when destructor is called. (The CUDA runtime may have already shut down for statically declared allocators) - bool debug; /// Whether or not to print (de)allocation events to stdout - - GpuCachedBytes cached_bytes; /// Map of device ordinal to aggregate cached bytes on that device - CachedBlocks cached_blocks; /// Set of cached device allocations available for reuse - BusyBlocks live_blocks; /// Set of live device allocations currently in use - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - //--------------------------------------------------------------------- - // Methods - //--------------------------------------------------------------------- - - /** - * \brief Constructor. - */ - CachingDeviceAllocator( - unsigned int bin_growth, ///< Geometric growth factor for bin-sizes - unsigned int min_bin = 1, ///< Minimum bin (default is bin_growth ^ 1) - unsigned int max_bin = INVALID_BIN, ///< Maximum bin (default is no max bin) - size_t max_cached_bytes = INVALID_SIZE, ///< Maximum aggregate cached bytes per device (default is no limit) - bool skip_cleanup = false, ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called (default is to deallocate) - bool debug = false) ///< Whether or not to print (de)allocation events to stdout (default is no stderr output) - : - bin_growth(bin_growth), - min_bin(min_bin), - max_bin(max_bin), - min_bin_bytes(IntPow(bin_growth, min_bin)), - max_bin_bytes(IntPow(bin_growth, max_bin)), - max_cached_bytes(max_cached_bytes), - skip_cleanup(skip_cleanup), - debug(debug), - cached_blocks(BlockDescriptor::SizeCompare), - live_blocks(BlockDescriptor::PtrCompare) - {} - - - /** - * \brief Default constructor. - * - * Configured with: - * \par - * - \p bin_growth = 8 - * - \p min_bin = 3 - * - \p max_bin = 7 - * - \p max_cached_bytes = (\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes - * - * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB and - * sets a maximum of 6,291,455 cached bytes per device - */ - CachingDeviceAllocator( - bool skip_cleanup = false, - bool debug = false) - : - bin_growth(8), - min_bin(3), - max_bin(7), - min_bin_bytes(IntPow(bin_growth, min_bin)), - max_bin_bytes(IntPow(bin_growth, max_bin)), - max_cached_bytes((max_bin_bytes * 3) - 1), - skip_cleanup(skip_cleanup), - debug(debug), - cached_blocks(BlockDescriptor::SizeCompare), - live_blocks(BlockDescriptor::PtrCompare) - {} - - - /** - * \brief Sets the limit on the number bytes this allocator is allowed to cache per device. 
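[Editor's note to make the binning arithmetic above concrete: NearestPowerOf rounds a request up to the nearest power of bin_growth, so with the defaults (bin_growth = 8, min_bin = 3, max_bin = 7) the cache holds bins of 8^3..8^7 bytes, i.e. 512 B, 4 KB, 32 KB, 256 KB, and 2 MB, and max_cached_bytes defaults to 3 * 8^7 - 1 = 6,291,455 bytes per device. A host-side sketch of the same rounding, re-implemented standalone (round_to_bin is our name, not the allocator's, and the original's overflow guard is omitted for brevity):

#include <cstdio>

// Smallest power of `base` that is >= `value`, as in NearestPowerOf.
static void round_to_bin(unsigned int base, size_t value,
                         unsigned int &power, size_t &rounded)
{
    power = 0;
    rounded = 1;
    while (rounded < value) { rounded *= base; ++power; }
}

int main()
{
    unsigned int bin; size_t bytes;
    round_to_bin(8, 1000, bin, bytes);
    printf("bin=%u bytes=%zu\n", bin, bytes);  // bin=4 bytes=4096
    return 0;
}
]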
- * - * Changing the ceiling of cached bytes does not cause any allocations (in-use or - * cached-in-reserve) to be freed. See \p FreeAllCached(). - */ - cudaError_t SetMaxCachedBytes( - size_t max_cached_bytes) - { - // Lock - mutex.Lock(); - - if (debug) _CubLog("Changing max_cached_bytes (%lld -> %lld)\n", (long long) this->max_cached_bytes, (long long) max_cached_bytes); - - this->max_cached_bytes = max_cached_bytes; - - // Unlock - mutex.Unlock(); - - return cudaSuccess; - } - - - /** - * \brief Provides a suitable allocation of device memory for the given size on the specified device. - * - * Once freed, the allocation becomes available immediately for reuse within the \p active_stream - * with which it was associated with during allocation, and it becomes available for reuse within other - * streams when all prior work submitted to \p active_stream has completed. - */ - cudaError_t DeviceAllocate( - int device, ///< [in] Device on which to place the allocation - void **d_ptr, ///< [out] Reference to pointer to the allocation - size_t bytes, ///< [in] Minimum number of bytes for the allocation - cudaStream_t active_stream = 0) ///< [in] The stream to be associated with this allocation - { - *d_ptr = NULL; - int entrypoint_device = INVALID_DEVICE_ORDINAL; - cudaError_t error = cudaSuccess; - - if (device == INVALID_DEVICE_ORDINAL) - { - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error; - device = entrypoint_device; - } - - // Create a block descriptor for the requested allocation - bool found = false; - BlockDescriptor search_key(device); - search_key.associated_stream = active_stream; - NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes); - - if (search_key.bin > max_bin) - { - // Bin is greater than our maximum bin: allocate the request - // exactly and give out-of-bounds bin. It will not be cached - // for reuse when returned. - search_key.bin = INVALID_BIN; - search_key.bytes = bytes; - } - else - { - // Search for a suitable cached allocation: lock - mutex.Lock(); - - if (search_key.bin < min_bin) - { - // Bin is less than minimum bin: round up - search_key.bin = min_bin; - search_key.bytes = min_bin_bytes; - } - - // Iterate through the range of cached blocks on the same device in the same bin - CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key); - while ((block_itr != cached_blocks.end()) - && (block_itr->device == device) - && (block_itr->bin == search_key.bin)) - { - // To prevent races with reusing blocks returned by the host but still - // in use by the device, only consider cached blocks that are - // either (from the active stream) or (from an idle stream) - if ((active_stream == block_itr->associated_stream) || - (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady)) - { - // Reuse existing cache block. Insert into live blocks. 
- found = true; - search_key = *block_itr; - search_key.associated_stream = active_stream; - live_blocks.insert(search_key); - - // Remove from free blocks - cached_bytes[device].free -= search_key.bytes; - cached_bytes[device].live += search_key.bytes; - - if (debug) _CubLog("\tDevice %d reused cached block at %p (%lld bytes) for stream %lld (previously associated with stream %lld).\n", - device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) block_itr->associated_stream); - - cached_blocks.erase(block_itr); - - break; - } - block_itr++; - } - - // Done searching: unlock - mutex.Unlock(); - } - - // Allocate the block if necessary - if (!found) - { - // Set runtime's current device to specified device (entrypoint may not be set) - if (device != entrypoint_device) - { - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error; - if (CubDebug(error = cudaSetDevice(device))) return error; - } - - // Attempt to allocate - if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes)) == cudaErrorMemoryAllocation) - { - // The allocation attempt failed: free all cached blocks on device and retry - if (debug) _CubLog("\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations", - device, (long long) search_key.bytes, (long long) search_key.associated_stream); - - error = cudaSuccess; // Reset the error we will return - cudaGetLastError(); // Reset CUDART's error - - // Lock - mutex.Lock(); - - // Iterate the range of free blocks on the same device - BlockDescriptor free_key(device); - CachedBlocks::iterator block_itr = cached_blocks.lower_bound(free_key); - - while ((block_itr != cached_blocks.end()) && (block_itr->device == device)) - { - // No need to worry about synchronization with the device: cudaFree is - // blocking and will synchronize across all kernels executing - // on the current device - - // Free device memory and destroy stream event. 
- if (CubDebug(error = cudaFree(block_itr->d_ptr))) break; - if (CubDebug(error = cudaEventDestroy(block_itr->ready_event))) break; - - // Reduce balance and erase entry - cached_bytes[device].free -= block_itr->bytes; - - if (debug) _CubLog("\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", - device, (long long) block_itr->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live); - - cached_blocks.erase(block_itr); - - block_itr++; - } - - // Unlock - mutex.Unlock(); - - // Return under error - if (error) return error; - - // Try to allocate again - if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) return error; - } - - // Create ready event - if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming))) - return error; - - // Insert into live blocks - mutex.Lock(); - live_blocks.insert(search_key); - cached_bytes[device].live += search_key.bytes; - mutex.Unlock(); - - if (debug) _CubLog("\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld).\n", - device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream); - - // Attempt to revert back to previous device if necessary - if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) - { - if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; - } - } - - // Copy device pointer to output parameter - *d_ptr = search_key.d_ptr; - - if (debug) _CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n", - (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live); - - return error; - } - - - /** - * \brief Provides a suitable allocation of device memory for the given size on the current device. - * - * Once freed, the allocation becomes available immediately for reuse within the \p active_stream - * with which it was associated with during allocation, and it becomes available for reuse within other - * streams when all prior work submitted to \p active_stream has completed. - */ - cudaError_t DeviceAllocate( - void **d_ptr, ///< [out] Reference to pointer to the allocation - size_t bytes, ///< [in] Minimum number of bytes for the allocation - cudaStream_t active_stream = 0) ///< [in] The stream to be associated with this allocation - { - return DeviceAllocate(INVALID_DEVICE_ORDINAL, d_ptr, bytes, active_stream); - } - - - /** - * \brief Frees a live allocation of device memory on the specified device, returning it to the allocator. - * - * Once freed, the allocation becomes available immediately for reuse within the \p active_stream - * with which it was associated with during allocation, and it becomes available for reuse within other - * streams when all prior work submitted to \p active_stream has completed. 
- */ - cudaError_t DeviceFree( - int device, - void* d_ptr) - { - int entrypoint_device = INVALID_DEVICE_ORDINAL; - cudaError_t error = cudaSuccess; - - if (device == INVALID_DEVICE_ORDINAL) - { - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) - return error; - device = entrypoint_device; - } - - // Lock - mutex.Lock(); - - // Find corresponding block descriptor - bool recached = false; - BlockDescriptor search_key(d_ptr, device); - BusyBlocks::iterator block_itr = live_blocks.find(search_key); - if (block_itr != live_blocks.end()) - { - // Remove from live blocks - search_key = *block_itr; - live_blocks.erase(block_itr); - cached_bytes[device].live -= search_key.bytes; - - // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold - if ((search_key.bin != INVALID_BIN) && (cached_bytes[device].free + search_key.bytes <= max_cached_bytes)) - { - // Insert returned allocation into free blocks - recached = true; - cached_blocks.insert(search_key); - cached_bytes[device].free += search_key.bytes; - - if (debug) _CubLog("\tDevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n", - device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), - (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live); - } - } - - // Unlock - mutex.Unlock(); - - // First set to specified device (entrypoint may not be set) - if (device != entrypoint_device) - { - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error; - if (CubDebug(error = cudaSetDevice(device))) return error; - } - - if (recached) - { - // Insert the ready event in the associated stream (must have current device set properly) - if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream))) return error; - } - else - { - // Free the allocation from the runtime and cleanup the event. - if (CubDebug(error = cudaFree(d_ptr))) return error; - if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) return error; - - if (debug) _CubLog("\tDevice %d freed %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", - device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live); - } - - // Reset device - if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) - { - if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; - } - - return error; - } - - - /** - * \brief Frees a live allocation of device memory on the current device, returning it to the allocator. - * - * Once freed, the allocation becomes available immediately for reuse within the \p active_stream - * with which it was associated with during allocation, and it becomes available for reuse within other - * streams when all prior work submitted to \p active_stream has completed. 
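- *
- * \par Snippet
- * A minimal sketch of the intended round trip, assuming a default-constructed
- * allocator and the usual cub/util_allocator.cuh include (only entry points
- * declared in this class are used):
- * \code
- * cub::CachingDeviceAllocator allocator;
- *
- * // Obtain a 1 MB block associated with the default stream
- * void *d_scratch = NULL;
- * CubDebugExit(allocator.DeviceAllocate(&d_scratch, 1024 * 1024));
- *
- * // ... launch kernels that use d_scratch on stream 0 ...
- *
- * // Return the block: re-cached for reuse when the bin/threshold checks
- * // above allow it, otherwise released with cudaFree
- * CubDebugExit(allocator.DeviceFree(d_scratch));
- * \endcode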
- */ - cudaError_t DeviceFree( - void* d_ptr) - { - return DeviceFree(INVALID_DEVICE_ORDINAL, d_ptr); - } - - - /** - * \brief Frees all cached device allocations on all devices - */ - cudaError_t FreeAllCached() - { - cudaError_t error = cudaSuccess; - int entrypoint_device = INVALID_DEVICE_ORDINAL; - int current_device = INVALID_DEVICE_ORDINAL; - - mutex.Lock(); - - while (!cached_blocks.empty()) - { - // Get first block - CachedBlocks::iterator begin = cached_blocks.begin(); - - // Get entry-point device ordinal if necessary - if (entrypoint_device == INVALID_DEVICE_ORDINAL) - { - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break; - } - - // Set current device ordinal if necessary - if (begin->device != current_device) - { - if (CubDebug(error = cudaSetDevice(begin->device))) break; - current_device = begin->device; - } - - // Free device memory - if (CubDebug(error = cudaFree(begin->d_ptr))) break; - if (CubDebug(error = cudaEventDestroy(begin->ready_event))) break; - - // Reduce balance and erase entry - cached_bytes[current_device].free -= begin->bytes; - - if (debug) _CubLog("\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", - current_device, (long long) begin->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device].free, (long long) live_blocks.size(), (long long) cached_bytes[current_device].live); - - cached_blocks.erase(begin); - } - - mutex.Unlock(); - - // Attempt to revert back to entry-point device if necessary - if (entrypoint_device != INVALID_DEVICE_ORDINAL) - { - if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; - } - - return error; - } - - - /** - * \brief Destructor - */ - virtual ~CachingDeviceAllocator() - { - if (!skip_cleanup) - FreeAllCached(); - } - -}; - - - - -/** @} */ // end group UtilMgmt - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/util_arch.cuh b/SRC/cub/util_arch.cuh deleted file mode 100644 index 28d81e7c..00000000 --- a/SRC/cub/util_arch.cuh +++ /dev/null @@ -1,151 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Static architectural properties by SM version. - */ - -#pragma once - -#include "util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - -#if (__CUDACC_VER_MAJOR__ >= 9) && !defined(CUB_USE_COOPERATIVE_GROUPS) - #define CUB_USE_COOPERATIVE_GROUPS -#endif - -/// CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host pass). -#ifndef CUB_PTX_ARCH - #ifndef __CUDA_ARCH__ - #define CUB_PTX_ARCH 0 - #else - #define CUB_PTX_ARCH __CUDA_ARCH__ - #endif -#endif - - -/// Whether or not the source targeted by the active compiler pass is allowed to invoke device kernels or methods from the CUDA runtime API. -#ifndef CUB_RUNTIME_FUNCTION - #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__>= 350 && defined(__CUDACC_RDC__)) - #define CUB_RUNTIME_ENABLED - #define CUB_RUNTIME_FUNCTION __host__ __device__ - #else - #define CUB_RUNTIME_FUNCTION __host__ - #endif -#endif - - -/// Number of threads per warp -#ifndef CUB_LOG_WARP_THREADS - #define CUB_LOG_WARP_THREADS(arch) \ - (5) - #define CUB_WARP_THREADS(arch) \ - (1 << CUB_LOG_WARP_THREADS(arch)) - - #define CUB_PTX_WARP_THREADS CUB_WARP_THREADS(CUB_PTX_ARCH) - #define CUB_PTX_LOG_WARP_THREADS CUB_LOG_WARP_THREADS(CUB_PTX_ARCH) -#endif - - -/// Number of smem banks -#ifndef CUB_LOG_SMEM_BANKS - #define CUB_LOG_SMEM_BANKS(arch) \ - ((arch >= 200) ? \ - (5) : \ - (4)) - #define CUB_SMEM_BANKS(arch) \ - (1 << CUB_LOG_SMEM_BANKS(arch)) - - #define CUB_PTX_LOG_SMEM_BANKS CUB_LOG_SMEM_BANKS(CUB_PTX_ARCH) - #define CUB_PTX_SMEM_BANKS CUB_SMEM_BANKS(CUB_PTX_ARCH) -#endif - - -/// Oversubscription factor -#ifndef CUB_SUBSCRIPTION_FACTOR - #define CUB_SUBSCRIPTION_FACTOR(arch) \ - ((arch >= 300) ? \ - (5) : \ - ((arch >= 200) ? \ - (3) : \ - (10))) - #define CUB_PTX_SUBSCRIPTION_FACTOR CUB_SUBSCRIPTION_FACTOR(CUB_PTX_ARCH) -#endif - - -/// Prefer padding overhead vs X-way conflicts greater than this threshold -#ifndef CUB_PREFER_CONFLICT_OVER_PADDING - #define CUB_PREFER_CONFLICT_OVER_PADDING(arch) \ - ((arch >= 300) ? \ - (1) : \ - (4)) - #define CUB_PTX_PREFER_CONFLICT_OVER_PADDING CUB_PREFER_CONFLICT_OVER_PADDING(CUB_PTX_ARCH) -#endif - - -/// Scale down the number of threads to keep same amount of scratch storage as the nominal configuration for 4B data. Minimum of two warps. -#ifndef CUB_SCALED_BLOCK_THREADS - #define CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH) \ - (CUB_MIN( \ - NOMINAL_4B_BLOCK_THREADS, \ - CUB_WARP_THREADS(PTX_ARCH) * CUB_MAX( \ - 2, \ - (NOMINAL_4B_BLOCK_THREADS / CUB_WARP_THREADS(PTX_ARCH)) * 4 / sizeof(T)))) -#endif - -/// Scale down number of items per thread to keep the same amount of register storage as the nominal configuration for 4B data. 
Minimum 1 item per thread -#ifndef CUB_SCALED_ITEMS_PER_THREAD - #define CUB_SCALED_ITEMS_PER_THREAD(NOMINAL_4B_ITEMS_PER_THREAD, NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH) \ - CUB_MAX( \ - 1, \ - (sizeof(T) < 4) ? \ - ((NOMINAL_4B_ITEMS_PER_THREAD * NOMINAL_4B_BLOCK_THREADS * 4) / CUB_MAX(4, sizeof(T))) / CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH) / 2 : \ - ((NOMINAL_4B_ITEMS_PER_THREAD * NOMINAL_4B_BLOCK_THREADS * 4) / CUB_MAX(4, sizeof(T))) / CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH)) -#endif - -/// Define both nominal threads-per-block and items-per-thread -#ifndef CUB_SCALED_GRANULARITIES - #define CUB_SCALED_GRANULARITIES(NOMINAL_4B_BLOCK_THREADS, NOMINAL_4B_ITEMS_PER_THREAD, T) \ - CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, 200), \ - CUB_SCALED_ITEMS_PER_THREAD(NOMINAL_4B_ITEMS_PER_THREAD, NOMINAL_4B_BLOCK_THREADS, T, 200) -#endif - - - -#endif // Do not document - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/util_debug.cuh b/SRC/cub/util_debug.cuh deleted file mode 100644 index 3ad832e7..00000000 --- a/SRC/cub/util_debug.cuh +++ /dev/null @@ -1,145 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Error and event logging routines. - * - * The following macros definitions are supported: - * - \p CUB_LOG. Simple event messages are printed to \p stdout. 
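- *
- * \par
- * A minimal usage sketch (wrapping a CUDA runtime call in the \p CubDebug
- * macro defined below, as the rest of this library does):
- * \code
- * void *d_ptr = NULL;
- * cudaError_t error = CubDebug(cudaMalloc(&d_ptr, 1024));
- * // On failure, the error and its file/line context are printed to stderr
- * \endcode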
- */ - -#pragma once - -#include -#include "util_namespace.cuh" -#include "util_arch.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \addtogroup UtilMgmt - * @{ - */ - - -/// CUB error reporting macro (prints error messages to stderr) -#if (defined(DEBUG) || defined(_DEBUG)) && !defined(CUB_STDERR) - #define CUB_STDERR -#endif - - - -/** - * \brief %If \p CUB_STDERR is defined and \p error is not \p cudaSuccess, the corresponding error message is printed to \p stderr (or \p stdout in device code) along with the supplied source context. - * - * \return The CUDA error. - */ -__host__ __device__ __forceinline__ cudaError_t Debug( - cudaError_t error, - const char* filename, - int line) -{ - (void)filename; - (void)line; -#ifdef CUB_STDERR - if (error) - { - #if (CUB_PTX_ARCH == 0) - fprintf(stderr, "CUDA error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error)); - fflush(stderr); - #elif (CUB_PTX_ARCH >= 200) - printf("CUDA error %d [block (%d,%d,%d) thread (%d,%d,%d), %s, %d]\n", error, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, filename, line); - #endif - } -#endif - return error; -} - - -/** - * \brief Debug macro - */ -#ifndef CubDebug - #define CubDebug(e) cub::Debug((cudaError_t) (e), __FILE__, __LINE__) -#endif - - -/** - * \brief Debug macro with exit - */ -#ifndef CubDebugExit - #define CubDebugExit(e) if (cub::Debug((cudaError_t) (e), __FILE__, __LINE__)) { exit(1); } -#endif - - -/** - * \brief Log macro for printf statements. - */ -#if !defined(_CubLog) - #if !(defined(__clang__) && defined(__CUDA__)) - #if (CUB_PTX_ARCH == 0) - #define _CubLog(format, ...) printf(format,__VA_ARGS__); - #elif (CUB_PTX_ARCH >= 200) - #define _CubLog(format, ...) printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, __VA_ARGS__); - #endif - #else - // XXX shameless hack for clang around variadic printf... - // Compilies w/o supplying -std=c++11 but shows warning, - // so we sielence them :) - #pragma clang diagnostic ignored "-Wc++11-extensions" - #pragma clang diagnostic ignored "-Wunnamed-type-template-args" - template - inline __host__ __device__ void va_printf(char const* format, Args const&... args) - { - #ifdef __CUDA_ARCH__ - printf(format, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, args...); - #else - printf(format, args...); - #endif - } - #ifndef __CUDA_ARCH__ - #define _CubLog(format, ...) va_printf(format,__VA_ARGS__); - #else - #define _CubLog(format, ...) va_printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, __VA_ARGS__); - #endif - #endif -#endif - - - - -/** @} */ // end group UtilMgmt - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/util_device.cuh b/SRC/cub/util_device.cuh deleted file mode 100644 index a5f3b614..00000000 --- a/SRC/cub/util_device.cuh +++ /dev/null @@ -1,347 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. 
- * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Properties of a given CUDA device and the corresponding PTX bundle - */ - -#pragma once - -#include "util_type.cuh" -#include "util_arch.cuh" -#include "util_debug.cuh" -#include "util_namespace.cuh" -#include "util_macro.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \addtogroup UtilMgmt - * @{ - */ - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - -/** - * Alias temporaries to externally-allocated device storage (or simply return the amount of storage needed). - */ -template -__host__ __device__ __forceinline__ -cudaError_t AliasTemporaries( - void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
- size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \t d_temp_storage allocation - void* (&allocations)[ALLOCATIONS], ///< [in,out] Pointers to device allocations needed - size_t (&allocation_sizes)[ALLOCATIONS]) ///< [in] Sizes in bytes of device allocations needed -{ - const int ALIGN_BYTES = 256; - const int ALIGN_MASK = ~(ALIGN_BYTES - 1); - - // Compute exclusive prefix sum over allocation requests - size_t allocation_offsets[ALLOCATIONS]; - size_t bytes_needed = 0; - for (int i = 0; i < ALLOCATIONS; ++i) - { - size_t allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK; - allocation_offsets[i] = bytes_needed; - bytes_needed += allocation_bytes; - } - bytes_needed += ALIGN_BYTES - 1; - - // Check if the caller is simply requesting the size of the storage allocation - if (!d_temp_storage) - { - temp_storage_bytes = bytes_needed; - return cudaSuccess; - } - - // Check if enough storage provided - if (temp_storage_bytes < bytes_needed) - { - return CubDebug(cudaErrorInvalidValue); - } - - // Alias - d_temp_storage = (void *) ((size_t(d_temp_storage) + ALIGN_BYTES - 1) & ALIGN_MASK); - for (int i = 0; i < ALLOCATIONS; ++i) - { - allocations[i] = static_cast(d_temp_storage) + allocation_offsets[i]; - } - - return cudaSuccess; -} - - -/** - * Empty kernel for querying PTX manifest metadata (e.g., version) for the current device - */ -template -__global__ void EmptyKernel(void) { } - - -#endif // DOXYGEN_SHOULD_SKIP_THIS - -/** - * \brief Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10) - */ -CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int &ptx_version) -{ - struct Dummy - { - /// Type definition of the EmptyKernel kernel entry point - typedef void (*EmptyKernelPtr)(); - - /// Force EmptyKernel to be generated if this class is used - CUB_RUNTIME_FUNCTION __forceinline__ - EmptyKernelPtr Empty() - { - return EmptyKernel; - } - }; - - -#ifndef CUB_RUNTIME_ENABLED - (void)ptx_version; - - // CUDA API calls not supported from this device - return cudaErrorInvalidConfiguration; - -#elif (CUB_PTX_ARCH > 0) - - ptx_version = CUB_PTX_ARCH; - return cudaSuccess; - -#else - - cudaError_t error = cudaSuccess; - do - { - cudaFuncAttributes empty_kernel_attrs; - if (CubDebug(error = cudaFuncGetAttributes(&empty_kernel_attrs, EmptyKernel))) break; - ptx_version = empty_kernel_attrs.ptxVersion * 10; - } - while (0); - - return error; - -#endif -} - - -/** - * \brief Retrieves the SM version (major * 100 + minor * 10) - */ -CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersion(int &sm_version, int device_ordinal) -{ -#ifndef CUB_RUNTIME_ENABLED - (void)sm_version; - (void)device_ordinal; - - // CUDA API calls not supported from this device - return cudaErrorInvalidConfiguration; - -#else - - cudaError_t error = cudaSuccess; - do - { - // Fill in SM version - int major, minor; - if (CubDebug(error = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device_ordinal))) break; - if (CubDebug(error = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device_ordinal))) break; - sm_version = major * 100 + minor * 10; - } - while (0); - - return error; - -#endif -} - - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - -/** - * Synchronize the stream if specified - */ -CUB_RUNTIME_FUNCTION __forceinline__ -static cudaError_t SyncStream(cudaStream_t stream) -{ -#if (CUB_PTX_ARCH == 0) - return cudaStreamSynchronize(stream); -#else - (void)stream; - // Device can't yet sync on a specific 
stream - return cudaDeviceSynchronize(); -#endif -} - - -/** - * \brief Computes maximum SM occupancy in thread blocks for executing the given kernel function pointer \p kernel_ptr on the current device with \p block_threads per thread block. - * - * \par Snippet - * The code snippet below illustrates the use of the MaxSmOccupancy function. - * \par - * \code - * #include // or equivalently - * - * template - * __global__ void ExampleKernel() - * { - * // Allocate shared memory for BlockScan - * __shared__ volatile T buffer[4096]; - * - * ... - * } - * - * ... - * - * // Determine SM occupancy for ExampleKernel specialized for unsigned char - * int max_sm_occupancy; - * MaxSmOccupancy(max_sm_occupancy, ExampleKernel, 64); - * - * // max_sm_occupancy <-- 4 on SM10 - * // max_sm_occupancy <-- 8 on SM20 - * // max_sm_occupancy <-- 12 on SM35 - * - * \endcode - * - */ -template -CUB_RUNTIME_FUNCTION __forceinline__ -cudaError_t MaxSmOccupancy( - int &max_sm_occupancy, ///< [out] maximum number of thread blocks that can reside on a single SM - KernelPtr kernel_ptr, ///< [in] Kernel pointer for which to compute SM occupancy - int block_threads, ///< [in] Number of threads per thread block - int dynamic_smem_bytes = 0) -{ -#ifndef CUB_RUNTIME_ENABLED - (void)dynamic_smem_bytes; - (void)block_threads; - (void)kernel_ptr; - (void)max_sm_occupancy; - - // CUDA API calls not supported from this device - return CubDebug(cudaErrorInvalidConfiguration); - -#else - - return cudaOccupancyMaxActiveBlocksPerMultiprocessor ( - &max_sm_occupancy, - kernel_ptr, - block_threads, - dynamic_smem_bytes); - -#endif // CUB_RUNTIME_ENABLED -} - - -/****************************************************************************** - * Policy management - ******************************************************************************/ - -/** - * Kernel dispatch configuration - */ -struct KernelConfig -{ - int block_threads; - int items_per_thread; - int tile_size; - int sm_occupancy; - - CUB_RUNTIME_FUNCTION __forceinline__ - KernelConfig() : block_threads(0), items_per_thread(0), tile_size(0), sm_occupancy(0) {} - - template - CUB_RUNTIME_FUNCTION __forceinline__ - cudaError_t Init(KernelPtrT kernel_ptr) - { - block_threads = AgentPolicyT::BLOCK_THREADS; - items_per_thread = AgentPolicyT::ITEMS_PER_THREAD; - tile_size = block_threads * items_per_thread; - cudaError_t retval = MaxSmOccupancy(sm_occupancy, kernel_ptr, block_threads); - return retval; - } -}; - - - -/// Helper for dispatching into a policy chain -template -struct ChainedPolicy -{ - /// The policy for the active compiler pass - typedef typename If<(CUB_PTX_ARCH < PTX_VERSION), typename PrevPolicyT::ActivePolicy, PolicyT>::Type ActivePolicy; - - /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version - template - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Invoke(int ptx_version, FunctorT &op) - { - if (ptx_version < PTX_VERSION) { - return PrevPolicyT::Invoke(ptx_version, op); - } - return op.template Invoke(); - } -}; - -/// Helper for dispatching into a policy chain (end-of-chain specialization) -template -struct ChainedPolicy -{ - /// The policy for the active compiler pass - typedef PolicyT ActivePolicy; - - /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version - template - CUB_RUNTIME_FUNCTION __forceinline__ - static cudaError_t Invoke(int /*ptx_version*/, FunctorT &op) { - return op.template Invoke(); - } -}; - - - - -#endif // Do not 
document - - - - -/** @} */ // end group UtilMgmt - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/util_macro.cuh b/SRC/cub/util_macro.cuh deleted file mode 100644 index ff863654..00000000 --- a/SRC/cub/util_macro.cuh +++ /dev/null @@ -1,103 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/****************************************************************************** - * Common C/C++ macro utilities - ******************************************************************************/ - -#pragma once - -#include "util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \addtogroup UtilModule - * @{ - */ - -#ifndef CUB_ALIGN - #if defined(_WIN32) || defined(_WIN64) - /// Align struct - #define CUB_ALIGN(bytes) __declspec(align(32)) - #else - /// Align struct - #define CUB_ALIGN(bytes) __attribute__((aligned(bytes))) - #endif -#endif - -#ifndef CUB_MAX - /// Select maximum(a, b) - #define CUB_MAX(a, b) (((b) > (a)) ? (b) : (a)) -#endif - -#ifndef CUB_MIN - /// Select minimum(a, b) - #define CUB_MIN(a, b) (((b) < (a)) ? 
(b) : (a)) -#endif - -#ifndef CUB_QUOTIENT_FLOOR - /// Quotient of x/y rounded down to nearest integer - #define CUB_QUOTIENT_FLOOR(x, y) ((x) / (y)) -#endif - -#ifndef CUB_QUOTIENT_CEILING - /// Quotient of x/y rounded up to nearest integer - #define CUB_QUOTIENT_CEILING(x, y) (((x) + (y) - 1) / (y)) -#endif - -#ifndef CUB_ROUND_UP_NEAREST - /// x rounded up to the nearest multiple of y - #define CUB_ROUND_UP_NEAREST(x, y) ((((x) + (y) - 1) / (y)) * y) -#endif - -#ifndef CUB_ROUND_DOWN_NEAREST - /// x rounded down to the nearest multiple of y - #define CUB_ROUND_DOWN_NEAREST(x, y) (((x) / (y)) * y) -#endif - - -#ifndef CUB_STATIC_ASSERT - #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - #define CUB_CAT_(a, b) a ## b - #define CUB_CAT(a, b) CUB_CAT_(a, b) - #endif // DOXYGEN_SHOULD_SKIP_THIS - - /// Static assert - #define CUB_STATIC_ASSERT(cond, msg) typedef int CUB_CAT(cub_static_assert, __LINE__)[(cond) ? 1 : -1] -#endif - -/** @} */ // end group UtilModule - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/util_namespace.cuh b/SRC/cub/util_namespace.cuh deleted file mode 100644 index c8991d08..00000000 --- a/SRC/cub/util_namespace.cuh +++ /dev/null @@ -1,46 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - ******************************************************************************/ - -/** - * \file - * Place-holder for prefixing the cub namespace - */ - -#pragma once - -// For example: -//#define CUB_NS_PREFIX namespace thrust{ namespace detail { -//#define CUB_NS_POSTFIX } } - -#ifndef CUB_NS_PREFIX -#define CUB_NS_PREFIX -#endif - -#ifndef CUB_NS_POSTFIX -#define CUB_NS_POSTFIX -#endif diff --git a/SRC/cub/util_ptx.cuh b/SRC/cub/util_ptx.cuh deleted file mode 100644 index 582ca0d8..00000000 --- a/SRC/cub/util_ptx.cuh +++ /dev/null @@ -1,758 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - ******************************************************************************/ - -/** - * \file - * PTX intrinsics - */ - - -#pragma once - -#include "util_type.cuh" -#include "util_arch.cuh" -#include "util_namespace.cuh" -#include "util_debug.cuh" - - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \addtogroup UtilPtx - * @{ - */ - - -/****************************************************************************** - * PTX helper macros - ******************************************************************************/ - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - -/** - * Register modifier for pointer-types (for inlining PTX assembly) - */ -#if defined(_WIN64) || defined(__LP64__) - #define __CUB_LP64__ 1 - // 64-bit register modifier for inlined asm - #define _CUB_ASM_PTR_ "l" - #define _CUB_ASM_PTR_SIZE_ "u64" -#else - #define __CUB_LP64__ 0 - // 32-bit register modifier for inlined asm - #define _CUB_ASM_PTR_ "r" - #define _CUB_ASM_PTR_SIZE_ "u32" -#endif - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - -/****************************************************************************** - * Inlined PTX intrinsics - ******************************************************************************/ - -/** - * \brief Shift-right then add. Returns (\p x >> \p shift) + \p addend. - */ -__device__ __forceinline__ unsigned int SHR_ADD( - unsigned int x, - unsigned int shift, - unsigned int addend) -{ - unsigned int ret; -#if CUB_PTX_ARCH >= 200 - asm ("vshr.u32.u32.u32.clamp.add %0, %1, %2, %3;" : - "=r"(ret) : "r"(x), "r"(shift), "r"(addend)); -#else - ret = (x >> shift) + addend; -#endif - return ret; -} - - -/** - * \brief Shift-left then add. Returns (\p x << \p shift) + \p addend. - */ -__device__ __forceinline__ unsigned int SHL_ADD( - unsigned int x, - unsigned int shift, - unsigned int addend) -{ - unsigned int ret; -#if CUB_PTX_ARCH >= 200 - asm ("vshl.u32.u32.u32.clamp.add %0, %1, %2, %3;" : - "=r"(ret) : "r"(x), "r"(shift), "r"(addend)); -#else - ret = (x << shift) + addend; -#endif - return ret; -} - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - -/** - * Bitfield-extract. - */ -template -__device__ __forceinline__ unsigned int BFE( - UnsignedBits source, - unsigned int bit_start, - unsigned int num_bits, - Int2Type /*byte_len*/) -{ - unsigned int bits; -#if CUB_PTX_ARCH >= 200 - asm ("bfe.u32 %0, %1, %2, %3;" : "=r"(bits) : "r"((unsigned int) source), "r"(bit_start), "r"(num_bits)); -#else - const unsigned int MASK = (1 << num_bits) - 1; - bits = (source >> bit_start) & MASK; -#endif - return bits; -} - - -/** - * Bitfield-extract for 64-bit types. - */ -template -__device__ __forceinline__ unsigned int BFE( - UnsignedBits source, - unsigned int bit_start, - unsigned int num_bits, - Int2Type<8> /*byte_len*/) -{ - const unsigned long long MASK = (1ull << num_bits) - 1; - return (source >> bit_start) & MASK; -} - -#endif // DOXYGEN_SHOULD_SKIP_THIS - -/** - * \brief Bitfield-extract. Extracts \p num_bits from \p source starting at bit-offset \p bit_start. The input \p source may be an 8b, 16b, 32b, or 64b unsigned integer type. - */ -template -__device__ __forceinline__ unsigned int BFE( - UnsignedBits source, - unsigned int bit_start, - unsigned int num_bits) -{ - return BFE(source, bit_start, num_bits, Int2Type()); -} - - -/** - * \brief Bitfield insert. Inserts the \p num_bits least significant bits of \p y into \p x at bit-offset \p bit_start. 
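- *
- * \par
- * A minimal sketch of the semantics:
- * \code
- * unsigned int ret;
- * BFI(ret, 0x00000000, 0xFFFFFFFF, 4, 8);   // ret <-- 0x00000FF0 (bits [4,12) taken from y)
- * \endcode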
- */ -__device__ __forceinline__ void BFI( - unsigned int &ret, - unsigned int x, - unsigned int y, - unsigned int bit_start, - unsigned int num_bits) -{ -#if CUB_PTX_ARCH >= 200 - asm ("bfi.b32 %0, %1, %2, %3, %4;" : - "=r"(ret) : "r"(y), "r"(x), "r"(bit_start), "r"(num_bits)); -#else - x <<= bit_start; - unsigned int MASK_X = ((1 << num_bits) - 1) << bit_start; - unsigned int MASK_Y = ~MASK_X; - ret = (y & MASK_Y) | (x & MASK_X); -#endif -} - - -/** - * \brief Three-operand add. Returns \p x + \p y + \p z. - */ -__device__ __forceinline__ unsigned int IADD3(unsigned int x, unsigned int y, unsigned int z) -{ -#if CUB_PTX_ARCH >= 200 - asm ("vadd.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(x) : "r"(x), "r"(y), "r"(z)); -#else - x = x + y + z; -#endif - return x; -} - - -/** - * \brief Byte-permute. Pick four arbitrary bytes from two 32-bit registers, and reassemble them into a 32-bit destination register. For SM2.0 or later. - * - * \par - * The bytes in the two source registers \p a and \p b are numbered from 0 to 7: - * {\p b, \p a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}. For each of the four bytes - * {b3, b2, b1, b0} selected in the return value, a 4-bit selector is defined within - * the four lower "nibbles" of \p index: {\p index } = {n7, n6, n5, n4, n3, n2, n1, n0} - * - * \par Snippet - * The code snippet below illustrates byte-permute. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * int a = 0x03020100; - * int b = 0x07060504; - * int index = 0x00007531; - * - * int selected = PRMT(a, b, index); // 0x07050301 - * - * \endcode - * - */ -__device__ __forceinline__ int PRMT(unsigned int a, unsigned int b, unsigned int index) -{ - int ret; - asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index)); - return ret; -} - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - -/** - * Sync-threads barrier. 
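- * (Emits \p bar.sync on named barrier 1, waiting until \p count threads arrive.)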
- */ -__device__ __forceinline__ void BAR(int count) -{ - asm volatile("bar.sync 1, %0;" : : "r"(count)); -} - -/** - * CTA barrier - */ -__device__ __forceinline__ void CTA_SYNC() -{ - __syncthreads(); -} - - -/** - * CTA barrier with predicate - */ -__device__ __forceinline__ int CTA_SYNC_AND(int p) -{ - return __syncthreads_and(p); -} - - -/** - * Warp barrier - */ -__device__ __forceinline__ void WARP_SYNC(unsigned int member_mask) -{ -#ifdef CUB_USE_COOPERATIVE_GROUPS - __syncwarp(member_mask); -#endif -} - - -/** - * Warp any - */ -__device__ __forceinline__ int WARP_ANY(int predicate, unsigned int member_mask) -{ -#ifdef CUB_USE_COOPERATIVE_GROUPS - return __any_sync(member_mask, predicate); -#else - return ::__any(predicate); -#endif -} - - -/** - * Warp any - */ -__device__ __forceinline__ int WARP_ALL(int predicate, unsigned int member_mask) -{ -#ifdef CUB_USE_COOPERATIVE_GROUPS - return __all_sync(member_mask, predicate); -#else - return ::__all(predicate); -#endif -} - - -/** - * Warp ballot - */ -__device__ __forceinline__ int WARP_BALLOT(int predicate, unsigned int member_mask) -{ -#ifdef CUB_USE_COOPERATIVE_GROUPS - return __ballot_sync(member_mask, predicate); -#else - return __ballot(predicate); -#endif -} - -/** - * Warp synchronous shfl_up - */ -__device__ __forceinline__ -unsigned int SHFL_UP_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask) -{ -#ifdef CUB_USE_COOPERATIVE_GROUPS - asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;" - : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags), "r"(member_mask)); -#else - asm volatile("shfl.up.b32 %0, %1, %2, %3;" - : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags)); -#endif - return word; -} - -/** - * Warp synchronous shfl_down - */ -__device__ __forceinline__ -unsigned int SHFL_DOWN_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask) -{ -#ifdef CUB_USE_COOPERATIVE_GROUPS - asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;" - : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags), "r"(member_mask)); -#else - asm volatile("shfl.down.b32 %0, %1, %2, %3;" - : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags)); -#endif - return word; -} - -/** - * Warp synchronous shfl_idx - */ -__device__ __forceinline__ -unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, int flags, unsigned int member_mask) -{ -#ifdef CUB_USE_COOPERATIVE_GROUPS - asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;" - : "=r"(word) : "r"(word), "r"(src_lane), "r"(flags), "r"(member_mask)); -#else - asm volatile("shfl.idx.b32 %0, %1, %2, %3;" - : "=r"(word) : "r"(word), "r"(src_lane), "r"(flags)); -#endif - return word; -} - -/** - * Floating point multiply. (Mantissa LSB rounds towards zero.) - */ -__device__ __forceinline__ float FMUL_RZ(float a, float b) -{ - float d; - asm ("mul.rz.f32 %0, %1, %2;" : "=f"(d) : "f"(a), "f"(b)); - return d; -} - - -/** - * Floating point multiply-add. (Mantissa LSB rounds towards zero.) 
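- * (Returns a * b + c as a single fused operation; e.g., FFMA_RZ(2.0f, 3.0f, 1.0f) yields 7.0f.)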
- */ -__device__ __forceinline__ float FFMA_RZ(float a, float b, float c) -{ - float d; - asm ("fma.rz.f32 %0, %1, %2, %3;" : "=f"(d) : "f"(a), "f"(b), "f"(c)); - return d; -} - -#endif // DOXYGEN_SHOULD_SKIP_THIS - -/** - * \brief Terminates the calling thread - */ -__device__ __forceinline__ void ThreadExit() { - asm volatile("exit;"); -} - - -/** - * \brief Abort execution and generate an interrupt to the host CPU - */ -__device__ __forceinline__ void ThreadTrap() { - asm volatile("trap;"); -} - - -/** - * \brief Returns the row-major linear thread identifier for a multidimensional thread block - */ -__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z) -{ - return ((block_dim_z == 1) ? 0 : (threadIdx.z * block_dim_x * block_dim_y)) + - ((block_dim_y == 1) ? 0 : (threadIdx.y * block_dim_x)) + - threadIdx.x; -} - - -/** - * \brief Returns the warp lane ID of the calling thread - */ -__device__ __forceinline__ unsigned int LaneId() -{ - unsigned int ret; - asm ("mov.u32 %0, %%laneid;" : "=r"(ret) ); - return ret; -} - - -/** - * \brief Returns the warp ID of the calling thread. Warp ID is guaranteed to be unique among warps, but may not correspond to a zero-based ranking within the thread block. - */ -__device__ __forceinline__ unsigned int WarpId() -{ - unsigned int ret; - asm ("mov.u32 %0, %%warpid;" : "=r"(ret) ); - return ret; -} - -/** - * \brief Returns the warp lane mask of all lanes less than the calling thread - */ -__device__ __forceinline__ unsigned int LaneMaskLt() -{ - unsigned int ret; - asm ("mov.u32 %0, %%lanemask_lt;" : "=r"(ret) ); - return ret; -} - -/** - * \brief Returns the warp lane mask of all lanes less than or equal to the calling thread - */ -__device__ __forceinline__ unsigned int LaneMaskLe() -{ - unsigned int ret; - asm ("mov.u32 %0, %%lanemask_le;" : "=r"(ret) ); - return ret; -} - -/** - * \brief Returns the warp lane mask of all lanes greater than the calling thread - */ -__device__ __forceinline__ unsigned int LaneMaskGt() -{ - unsigned int ret; - asm ("mov.u32 %0, %%lanemask_gt;" : "=r"(ret) ); - return ret; -} - -/** - * \brief Returns the warp lane mask of all lanes greater than or equal to the calling thread - */ -__device__ __forceinline__ unsigned int LaneMaskGe() -{ - unsigned int ret; - asm ("mov.u32 %0, %%lanemask_ge;" : "=r"(ret) ); - return ret; -} - -/** @} */ // end group UtilPtx - - - - -/** - * \brief Shuffle-up for any data type. Each warp-lanei obtains the value \p input contributed by warp-lanei-src_offset. For thread lanes \e i < src_offset, the thread's own \p input is returned to the thread. ![](shfl_up_logo.png) - * \ingroup WarpModule - * - * \tparam LOGICAL_WARP_THREADS The number of threads per "logical" warp. Must be a power-of-two <= 32. - * \tparam T [inferred] The input/output element type - * - * \par - * - Available only for SM3.0 or newer - * - * \par Snippet - * The code snippet below illustrates each thread obtaining a \p double value from the - * predecessor of its predecessor. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Obtain one input item per thread - * double thread_data = ... - * - * // Obtain item from two ranks below - * double peer_data = ShuffleUp<32>(thread_data, 2, 0, 0xffffffff); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the first warp of threads is {1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}. - * The corresponding output \p peer_data will be {1.0, 2.0, 1.0, 2.0, 3.0, ..., 30.0}. 
- * - */ -template < - int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp - typename T> -__device__ __forceinline__ T ShuffleUp( - T input, ///< [in] The value to broadcast - int src_offset, ///< [in] The relative down-offset of the peer to read from - int first_thread, ///< [in] Index of first lane in logical warp (typically 0) - unsigned int member_mask) ///< [in] 32-bit mask of participating warp lanes -{ - /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up - enum { - SHFL_C = (32 - LOGICAL_WARP_THREADS) << 8 - }; - - typedef typename UnitWord::ShuffleWord ShuffleWord; - - const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); - - T output; - ShuffleWord *output_alias = reinterpret_cast(&output); - ShuffleWord *input_alias = reinterpret_cast(&input); - - unsigned int shuffle_word; - shuffle_word = SHFL_UP_SYNC((unsigned int)input_alias[0], src_offset, first_thread | SHFL_C, member_mask); - output_alias[0] = shuffle_word; - - #pragma unroll - for (int WORD = 1; WORD < WORDS; ++WORD) - { - shuffle_word = SHFL_UP_SYNC((unsigned int)input_alias[WORD], src_offset, first_thread | SHFL_C, member_mask); - output_alias[WORD] = shuffle_word; - } - - return output; -} - - -/** - * \brief Shuffle-down for any data type. Each warp-lanei obtains the value \p input contributed by warp-lanei+src_offset. For thread lanes \e i >= WARP_THREADS, the thread's own \p input is returned to the thread. ![](shfl_down_logo.png) - * \ingroup WarpModule - * - * \tparam LOGICAL_WARP_THREADS The number of threads per "logical" warp. Must be a power-of-two <= 32. - * \tparam T [inferred] The input/output element type - * - * \par - * - Available only for SM3.0 or newer - * - * \par Snippet - * The code snippet below illustrates each thread obtaining a \p double value from the - * successor of its successor. - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Obtain one input item per thread - * double thread_data = ... - * - * // Obtain item from two ranks below - * double peer_data = ShuffleDown<32>(thread_data, 2, 31, 0xffffffff); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the first warp of threads is {1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}. - * The corresponding output \p peer_data will be {3.0, 4.0, 5.0, 6.0, 7.0, ..., 32.0}. 
- * - */ -template < - int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp - typename T> -__device__ __forceinline__ T ShuffleDown( - T input, ///< [in] The value to broadcast - int src_offset, ///< [in] The relative up-offset of the peer to read from - int last_thread, ///< [in] Index of last thread in logical warp (typically 31 for a 32-thread warp) - unsigned int member_mask) ///< [in] 32-bit mask of participating warp lanes -{ - /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up - enum { - SHFL_C = (32 - LOGICAL_WARP_THREADS) << 8 - }; - - typedef typename UnitWord::ShuffleWord ShuffleWord; - - const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); - - T output; - ShuffleWord *output_alias = reinterpret_cast(&output); - ShuffleWord *input_alias = reinterpret_cast(&input); - - unsigned int shuffle_word; - shuffle_word = SHFL_DOWN_SYNC((unsigned int)input_alias[0], src_offset, last_thread | SHFL_C, member_mask); - output_alias[0] = shuffle_word; - - #pragma unroll - for (int WORD = 1; WORD < WORDS; ++WORD) - { - shuffle_word = SHFL_DOWN_SYNC((unsigned int)input_alias[WORD], src_offset, last_thread | SHFL_C, member_mask); - output_alias[WORD] = shuffle_word; - } - - return output; -} - - -/** - * \brief Shuffle-broadcast for any data type. Each warp-lanei obtains the value \p input - * contributed by warp-lanesrc_lane. For \p src_lane < 0 or \p src_lane >= WARP_THREADS, - * then the thread's own \p input is returned to the thread. ![](shfl_broadcast_logo.png) - * - * \tparam LOGICAL_WARP_THREADS The number of threads per "logical" warp. Must be a power-of-two <= 32. - * \tparam T [inferred] The input/output element type - * - * \ingroup WarpModule - * - * \par - * - Available only for SM3.0 or newer - * - * \par Snippet - * The code snippet below illustrates each thread obtaining a \p double value from warp-lane0. - * - * \par - * \code - * #include // or equivalently - * - * __global__ void ExampleKernel(...) - * { - * // Obtain one input item per thread - * double thread_data = ... - * - * // Obtain item from thread 0 - * double peer_data = ShuffleIndex<32>(thread_data, 0, 0xffffffff); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the first warp of threads is {1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}. - * The corresponding output \p peer_data will be {1.0, 1.0, 1.0, 1.0, 1.0, ..., 1.0}. 
- * - */ -template < - int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp - typename T> -__device__ __forceinline__ T ShuffleIndex( - T input, ///< [in] The value to broadcast - int src_lane, ///< [in] Which warp lane is to do the broadcasting - unsigned int member_mask) ///< [in] 32-bit mask of participating warp lanes -{ - /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up - enum { - SHFL_C = ((32 - LOGICAL_WARP_THREADS) << 8) | (LOGICAL_WARP_THREADS - 1) - }; - - typedef typename UnitWord::ShuffleWord ShuffleWord; - - const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); - - T output; - ShuffleWord *output_alias = reinterpret_cast(&output); - ShuffleWord *input_alias = reinterpret_cast(&input); - - unsigned int shuffle_word; - shuffle_word = SHFL_IDX_SYNC((unsigned int)input_alias[0], - src_lane, - SHFL_C, - member_mask); - - output_alias[0] = shuffle_word; - - #pragma unroll - for (int WORD = 1; WORD < WORDS; ++WORD) - { - shuffle_word = SHFL_IDX_SYNC((unsigned int)input_alias[WORD], - src_lane, - SHFL_C, - member_mask); - - output_alias[WORD] = shuffle_word; - } - - return output; -} - - - -/** - * Compute a 32b mask of threads having the same least-significant - * LABEL_BITS of \p label as the calling thread. - */ -template -inline __device__ unsigned int MatchAny(unsigned int label) -{ - unsigned int retval; - - // Extract masks of common threads for each bit - #pragma unroll - for (int BIT = 0; BIT < LABEL_BITS; ++BIT) - { - unsigned int mask; - unsigned int current_bit = 1 << BIT; - asm ("{\n" - " .reg .pred p;\n" - " and.b32 %0, %1, %2;" - " setp.eq.u32 p, %0, %2;\n" -#ifdef CUB_USE_COOPERATIVE_GROUPS - " vote.ballot.sync.b32 %0, p, 0xffffffff;\n" -#else - " vote.ballot.b32 %0, p;\n" -#endif - " @!p not.b32 %0, %0;\n" - "}\n" : "=r"(mask) : "r"(label), "r"(current_bit)); - - // Remove peers who differ - retval = (BIT == 0) ? mask : retval & mask; - } - - return retval; - -// // VOLTA match -// unsigned int retval; -// asm ("{\n" -// " match.any.sync.b32 %0, %1, 0xffffffff;\n" -// "}\n" : "=r"(retval) : "r"(label)); -// return retval; - -} - - - - - - - - - - - - - - - - - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/util_type.cuh b/SRC/cub/util_type.cuh deleted file mode 100644 index 0ba41e1e..00000000 --- a/SRC/cub/util_type.cuh +++ /dev/null @@ -1,1167 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * Common type manipulation (metaprogramming) utilities - */ - -#pragma once - -#include -#include -#include - -#if (__CUDACC_VER_MAJOR__ >= 9) - #include -#endif - -#include "util_macro.cuh" -#include "util_arch.cuh" -#include "util_namespace.cuh" - - - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \addtogroup UtilModule - * @{ - */ - - - -/****************************************************************************** - * Type equality - ******************************************************************************/ - -/** - * \brief Type selection (IF ? ThenType : ElseType) - */ -template -struct If -{ - /// Conditional type result - typedef ThenType Type; // true -}; - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - -template -struct If -{ - typedef ElseType Type; // false -}; - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - - -/****************************************************************************** - * Conditional types - ******************************************************************************/ - -/** - * \brief Type equality test - */ -template -struct Equals -{ - enum { - VALUE = 0, - NEGATE = 1 - }; -}; - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - -template -struct Equals -{ - enum { - VALUE = 1, - NEGATE = 0 - }; -}; - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - -/****************************************************************************** - * Static math - ******************************************************************************/ - -/** - * \brief Statically determine log2(N), rounded up. - * - * For example: - * Log2<8>::VALUE // 3 - * Log2<3>::VALUE // 2 - */ -template -struct Log2 -{ - /// Static logarithm value - enum { VALUE = Log2> 1), COUNT + 1>::VALUE }; // Inductive case -}; - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - -template -struct Log2 -{ - enum {VALUE = (1 << (COUNT - 1) < N) ? // Base case - COUNT : - COUNT - 1 }; -}; - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - -/** - * \brief Statically determine if N is a power-of-two - */ -template -struct PowerOfTwo -{ - enum { VALUE = ((N & (N - 1)) == 0) }; -}; - - - -/****************************************************************************** - * Pointer vs. iterator detection - ******************************************************************************/ - -/** - * \brief Pointer vs. 
iterator - */ -template -struct IsPointer -{ - enum { VALUE = 0 }; -}; - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - -template -struct IsPointer -{ - enum { VALUE = 1 }; -}; - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - - -/****************************************************************************** - * Qualifier detection - ******************************************************************************/ - -/** - * \brief Volatile modifier test - */ -template -struct IsVolatile -{ - enum { VALUE = 0 }; -}; - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - -template -struct IsVolatile -{ - enum { VALUE = 1 }; -}; - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - -/****************************************************************************** - * Qualifier removal - ******************************************************************************/ - -/** - * \brief Removes \p const and \p volatile qualifiers from type \p Tp. - * - * For example: - * typename RemoveQualifiers::Type // int; - */ -template -struct RemoveQualifiers -{ - /// Type without \p const and \p volatile qualifiers - typedef Up Type; -}; - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - -template -struct RemoveQualifiers -{ - typedef Up Type; -}; - -template -struct RemoveQualifiers -{ - typedef Up Type; -}; - -template -struct RemoveQualifiers -{ - typedef Up Type; -}; - - -/****************************************************************************** - * Marker types - ******************************************************************************/ - -/** - * \brief A simple "NULL" marker type - */ -struct NullType -{ -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - template - __host__ __device__ __forceinline__ NullType& operator =(const T&) { return *this; } - - __host__ __device__ __forceinline__ bool operator ==(const NullType&) { return true; } - - __host__ __device__ __forceinline__ bool operator !=(const NullType&) { return false; } - -#endif // DOXYGEN_SHOULD_SKIP_THIS -}; - - -/** - * \brief Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static call dispatch based on constant integral values) - */ -template -struct Int2Type -{ - enum {VALUE = A}; -}; - - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - -/****************************************************************************** - * Size and alignment - ******************************************************************************/ - -/// Structure alignment -template -struct AlignBytes -{ - struct Pad - { - T val; - char byte; - }; - - enum - { - /// The "true CUDA" alignment of T in bytes - ALIGN_BYTES = sizeof(Pad) - sizeof(T) - }; - - /// The "truly aligned" type - typedef T Type; -}; - -// Specializations where host C++ compilers (e.g., 32-bit Windows) may disagree -// with device C++ compilers (EDG) on types passed as template parameters through -// kernel functions - -#define __CUB_ALIGN_BYTES(t, b) \ - template <> struct AlignBytes \ - { enum { ALIGN_BYTES = b }; typedef __align__(b) t Type; }; - -__CUB_ALIGN_BYTES(short4, 8) -__CUB_ALIGN_BYTES(ushort4, 8) -__CUB_ALIGN_BYTES(int2, 8) -__CUB_ALIGN_BYTES(uint2, 8) -__CUB_ALIGN_BYTES(long long, 8) -__CUB_ALIGN_BYTES(unsigned long long, 8) -__CUB_ALIGN_BYTES(float2, 8) -__CUB_ALIGN_BYTES(double, 8) -#ifdef _WIN32 - __CUB_ALIGN_BYTES(long2, 8) - __CUB_ALIGN_BYTES(ulong2, 8) -#else - __CUB_ALIGN_BYTES(long2, 16) - __CUB_ALIGN_BYTES(ulong2, 16) -#endif -__CUB_ALIGN_BYTES(int4, 16) -__CUB_ALIGN_BYTES(uint4, 16) 
-__CUB_ALIGN_BYTES(float4, 16) -__CUB_ALIGN_BYTES(long4, 16) -__CUB_ALIGN_BYTES(ulong4, 16) -__CUB_ALIGN_BYTES(longlong2, 16) -__CUB_ALIGN_BYTES(ulonglong2, 16) -__CUB_ALIGN_BYTES(double2, 16) -__CUB_ALIGN_BYTES(longlong4, 16) -__CUB_ALIGN_BYTES(ulonglong4, 16) -__CUB_ALIGN_BYTES(double4, 16) - -template struct AlignBytes : AlignBytes {}; -template struct AlignBytes : AlignBytes {}; -template struct AlignBytes : AlignBytes {}; - - -/// Unit-words of data movement -template -struct UnitWord -{ - enum { - ALIGN_BYTES = AlignBytes::ALIGN_BYTES - }; - - template - struct IsMultiple - { - enum { - UNIT_ALIGN_BYTES = AlignBytes::ALIGN_BYTES, - IS_MULTIPLE = (sizeof(T) % sizeof(Unit) == 0) && (ALIGN_BYTES % UNIT_ALIGN_BYTES == 0) - }; - }; - - /// Biggest shuffle word that T is a whole multiple of and is not larger than the alignment of T - typedef typename If::IS_MULTIPLE, - unsigned int, - typename If::IS_MULTIPLE, - unsigned short, - unsigned char>::Type>::Type ShuffleWord; - - /// Biggest volatile word that T is a whole multiple of and is not larger than the alignment of T - typedef typename If::IS_MULTIPLE, - unsigned long long, - ShuffleWord>::Type VolatileWord; - - /// Biggest memory-access word that T is a whole multiple of and is not larger than the alignment of T - typedef typename If::IS_MULTIPLE, - ulonglong2, - VolatileWord>::Type DeviceWord; - - /// Biggest texture reference word that T is a whole multiple of and is not larger than the alignment of T - typedef typename If::IS_MULTIPLE, - uint4, - typename If::IS_MULTIPLE, - uint2, - ShuffleWord>::Type>::Type TextureWord; -}; - - -// float2 specialization workaround (for SM10-SM13) -template <> -struct UnitWord -{ - typedef int ShuffleWord; -#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130) - typedef float VolatileWord; - typedef uint2 DeviceWord; -#else - typedef unsigned long long VolatileWord; - typedef unsigned long long DeviceWord; -#endif - typedef float2 TextureWord; -}; - -// float4 specialization workaround (for SM10-SM13) -template <> -struct UnitWord -{ - typedef int ShuffleWord; -#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130) - typedef float VolatileWord; - typedef uint4 DeviceWord; -#else - typedef unsigned long long VolatileWord; - typedef ulonglong2 DeviceWord; -#endif - typedef float4 TextureWord; -}; - - -// char2 specialization workaround (for SM10-SM13) -template <> -struct UnitWord -{ - typedef unsigned short ShuffleWord; -#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130) - typedef unsigned short VolatileWord; - typedef short DeviceWord; -#else - typedef unsigned short VolatileWord; - typedef unsigned short DeviceWord; -#endif - typedef unsigned short TextureWord; -}; - - -template struct UnitWord : UnitWord {}; -template struct UnitWord : UnitWord {}; -template struct UnitWord : UnitWord {}; - - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - - -/****************************************************************************** - * Vector type inference utilities. - ******************************************************************************/ - -/** - * \brief Exposes a member typedef \p Type that names the corresponding CUDA vector type if one exists. Otherwise \p Type refers to the CubVector structure itself, which will wrap the corresponding \p x, \p y, etc. vector fields. 
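// A host-only sketch of the compile-time helpers in this header
// (Log2 rounded up, PowerOfTwo, and the AlignBytes "pad" trick),
// written as simplified constexpr equivalents rather than the CUB
// originals.
#include <cstdio>

constexpr int log2_ceil(int n)            // cf. Log2<N>::VALUE
{
    return (n <= 1) ? 0 : 1 + log2_ceil((n + 1) / 2);
}

constexpr bool is_power_of_two(int n)     // cf. PowerOfTwo<N>::VALUE
{
    return (n & (n - 1)) == 0;
}

template <typename T>
struct align_bytes                        // cf. AlignBytes<T>
{
    struct Pad { T val; char byte; };
    enum { VALUE = sizeof(Pad) - sizeof(T) };  // padding reveals alignment
};

static_assert(log2_ceil(8) == 3, "Log2<8>");
static_assert(log2_ceil(3) == 2, "Log2<3>, rounded up");
static_assert(is_power_of_two(16) && !is_power_of_two(24), "PowerOfTwo");

int main()
{
    printf("alignment of double: %d bytes\n", (int)align_bytes<double>::VALUE);
    return 0;
}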
- */ -template struct CubVector; - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - -enum -{ - /// The maximum number of elements in CUDA vector types - MAX_VEC_ELEMENTS = 4, -}; - - -/** - * Generic vector-1 type - */ -template -struct CubVector -{ - T x; - - typedef T BaseType; - typedef CubVector Type; -}; - -/** - * Generic vector-2 type - */ -template -struct CubVector -{ - T x; - T y; - - typedef T BaseType; - typedef CubVector Type; -}; - -/** - * Generic vector-3 type - */ -template -struct CubVector -{ - T x; - T y; - T z; - - typedef T BaseType; - typedef CubVector Type; -}; - -/** - * Generic vector-4 type - */ -template -struct CubVector -{ - T x; - T y; - T z; - T w; - - typedef T BaseType; - typedef CubVector Type; -}; - - -/** - * Macro for expanding partially-specialized built-in vector types - */ -#define CUB_DEFINE_VECTOR_TYPE(base_type,short_type) \ - \ - template<> struct CubVector : short_type##1 \ - { \ - typedef base_type BaseType; \ - typedef short_type##1 Type; \ - __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const { \ - CubVector retval; \ - retval.x = x + other.x; \ - return retval; \ - } \ - __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const { \ - CubVector retval; \ - retval.x = x - other.x; \ - return retval; \ - } \ - }; \ - \ - template<> struct CubVector : short_type##2 \ - { \ - typedef base_type BaseType; \ - typedef short_type##2 Type; \ - __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const { \ - CubVector retval; \ - retval.x = x + other.x; \ - retval.y = y + other.y; \ - return retval; \ - } \ - __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const { \ - CubVector retval; \ - retval.x = x - other.x; \ - retval.y = y - other.y; \ - return retval; \ - } \ - }; \ - \ - template<> struct CubVector : short_type##3 \ - { \ - typedef base_type BaseType; \ - typedef short_type##3 Type; \ - __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const { \ - CubVector retval; \ - retval.x = x + other.x; \ - retval.y = y + other.y; \ - retval.z = z + other.z; \ - return retval; \ - } \ - __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const { \ - CubVector retval; \ - retval.x = x - other.x; \ - retval.y = y - other.y; \ - retval.z = z - other.z; \ - return retval; \ - } \ - }; \ - \ - template<> struct CubVector : short_type##4 \ - { \ - typedef base_type BaseType; \ - typedef short_type##4 Type; \ - __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const { \ - CubVector retval; \ - retval.x = x + other.x; \ - retval.y = y + other.y; \ - retval.z = z + other.z; \ - retval.w = w + other.w; \ - return retval; \ - } \ - __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const { \ - CubVector retval; \ - retval.x = x - other.x; \ - retval.y = y - other.y; \ - retval.z = z - other.z; \ - retval.w = w - other.w; \ - return retval; \ - } \ - }; - - - -// Expand CUDA vector types for built-in primitives -CUB_DEFINE_VECTOR_TYPE(char, char) -CUB_DEFINE_VECTOR_TYPE(signed char, char) -CUB_DEFINE_VECTOR_TYPE(short, short) -CUB_DEFINE_VECTOR_TYPE(int, int) -CUB_DEFINE_VECTOR_TYPE(long, long) -CUB_DEFINE_VECTOR_TYPE(long long, longlong) -CUB_DEFINE_VECTOR_TYPE(unsigned char, uchar) -CUB_DEFINE_VECTOR_TYPE(unsigned short, ushort) -CUB_DEFINE_VECTOR_TYPE(unsigned int, uint) -CUB_DEFINE_VECTOR_TYPE(unsigned long, ulong) 
-CUB_DEFINE_VECTOR_TYPE(unsigned long long, ulonglong) -CUB_DEFINE_VECTOR_TYPE(float, float) -CUB_DEFINE_VECTOR_TYPE(double, double) -CUB_DEFINE_VECTOR_TYPE(bool, uchar) - -// Undefine macros -#undef CUB_DEFINE_VECTOR_TYPE - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - - -/****************************************************************************** - * Wrapper types - ******************************************************************************/ - -/** - * \brief A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions - */ -template -struct Uninitialized -{ - /// Biggest memory-access word that T is a whole multiple of and is not larger than the alignment of T - typedef typename UnitWord::DeviceWord DeviceWord; - - enum - { - WORDS = sizeof(T) / sizeof(DeviceWord) - }; - - /// Backing storage - DeviceWord storage[WORDS]; - - /// Alias - __host__ __device__ __forceinline__ T& Alias() - { - return reinterpret_cast(*this); - } -}; - - -/** - * \brief A key identifier paired with a corresponding value - */ -template < - typename _Key, - typename _Value -#if defined(_WIN32) && !defined(_WIN64) - , bool KeyIsLT = (AlignBytes<_Key>::ALIGN_BYTES < AlignBytes<_Value>::ALIGN_BYTES) - , bool ValIsLT = (AlignBytes<_Value>::ALIGN_BYTES < AlignBytes<_Key>::ALIGN_BYTES) -#endif // #if defined(_WIN32) && !defined(_WIN64) - > -struct KeyValuePair -{ - typedef _Key Key; ///< Key data type - typedef _Value Value; ///< Value data type - - Key key; ///< Item key - Value value; ///< Item value - - /// Constructor - __host__ __device__ __forceinline__ - KeyValuePair() {} - - /// Constructor - __host__ __device__ __forceinline__ - KeyValuePair(Key const& key, Value const& value) : key(key), value(value) {} - - /// Inequality operator - __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b) - { - return (value != b.value) || (key != b.key); - } -}; - -#if defined(_WIN32) && !defined(_WIN64) - -/** - * Win32 won't do 16B alignment. 
This can present two problems for - * should-be-16B-aligned (but actually 8B aligned) built-in and intrinsics members: - * 1) If a smaller-aligned item were to be listed first, the host compiler places the - * should-be-16B item at too early an offset (and disagrees with device compiler) - * 2) Or, if a smaller-aligned item lists second, the host compiler gets the size - * of the struct wrong (and disagrees with device compiler) - * - * So we put the larger-should-be-aligned item first, and explicitly pad the - * end of the struct - */ - -/// Smaller key specialization -template -struct KeyValuePair -{ - typedef K Key; - typedef V Value; - - typedef char Pad[AlignBytes::ALIGN_BYTES - AlignBytes::ALIGN_BYTES]; - - Value value; // Value has larger would-be alignment and goes first - Key key; - Pad pad; - - /// Constructor - __host__ __device__ __forceinline__ - KeyValuePair() {} - - /// Constructor - __host__ __device__ __forceinline__ - KeyValuePair(Key const& key, Value const& value) : key(key), value(value) {} - - /// Inequality operator - __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b) - { - return (value != b.value) || (key != b.key); - } -}; - - -/// Smaller value specialization -template -struct KeyValuePair -{ - typedef K Key; - typedef V Value; - - typedef char Pad[AlignBytes::ALIGN_BYTES - AlignBytes::ALIGN_BYTES]; - - Key key; // Key has larger would-be alignment and goes first - Value value; - Pad pad; - - /// Constructor - __host__ __device__ __forceinline__ - KeyValuePair() {} - - /// Constructor - __host__ __device__ __forceinline__ - KeyValuePair(Key const& key, Value const& value) : key(key), value(value) {} - - /// Inequality operator - __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b) - { - return (value != b.value) || (key != b.key); - } -}; - -#endif // #if defined(_WIN32) && !defined(_WIN64) - - -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - -/** - * \brief A wrapper for passing simple static arrays as kernel parameters - */ -template -struct ArrayWrapper -{ - - /// Statically-sized array of type \p T - T array[COUNT]; - - /// Constructor - __host__ __device__ __forceinline__ ArrayWrapper() {} -}; - -#endif // DOXYGEN_SHOULD_SKIP_THIS - -/** - * \brief Double-buffer storage wrapper for multi-pass stream transformations that require more than one storage array for streaming intermediate results back and forth. - * - * Many multi-pass computations require a pair of "ping-pong" storage - * buffers (e.g., one for reading from and the other for writing to, and then - * vice-versa for the subsequent pass). This structure wraps a set of device - * buffers and a "selector" member to track which is "current". 
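// A minimal host-side sketch of the ping-pong pattern DoubleBuffer
// encapsulates: read from Current(), write to Alternate(), then flip
// the selector so the output becomes the next pass's input. The
// simplified type below is illustrative, not CUB's.
#include <cstdio>

template <typename T>
struct PingPongSketch
{
    T*  d_buffers[2];
    int selector;
    T* Current()   { return d_buffers[selector]; }
    T* Alternate() { return d_buffers[selector ^ 1]; }
};

int main()
{
    int a[4] = {3, 1, 4, 1}, b[4] = {0, 0, 0, 0};
    PingPongSketch<int> buf = {{a, b}, 0};
    for (int pass = 0; pass < 2; ++pass)
    {
        for (int i = 0; i < 4; ++i)        // a trivial per-pass transform
            buf.Alternate()[i] = buf.Current()[i] + 1;
        buf.selector ^= 1;                 // ping-pong
    }
    printf("%d %d %d %d\n", buf.Current()[0], buf.Current()[1],
           buf.Current()[2], buf.Current()[3]);  // 5 3 6 3
    return 0;
}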
- */ -template -struct DoubleBuffer -{ - /// Pair of device buffer pointers - T *d_buffers[2]; - - /// Selector into \p d_buffers (i.e., the active/valid buffer) - int selector; - - /// \brief Constructor - __host__ __device__ __forceinline__ DoubleBuffer() - { - selector = 0; - d_buffers[0] = NULL; - d_buffers[1] = NULL; - } - - /// \brief Constructor - __host__ __device__ __forceinline__ DoubleBuffer( - T *d_current, ///< The currently valid buffer - T *d_alternate) ///< Alternate storage buffer of the same size as \p d_current - { - selector = 0; - d_buffers[0] = d_current; - d_buffers[1] = d_alternate; - } - - /// \brief Return pointer to the currently valid buffer - __host__ __device__ __forceinline__ T* Current() { return d_buffers[selector]; } - - /// \brief Return pointer to the currently invalid buffer - __host__ __device__ __forceinline__ T* Alternate() { return d_buffers[selector ^ 1]; } - -}; - - - -/****************************************************************************** - * Typedef-detection - ******************************************************************************/ - - -/** - * \brief Defines a structure \p detector_name that is templated on type \p T. The \p detector_name struct exposes a constant member \p VALUE indicating whether or not parameter \p T exposes a nested type \p nested_type_name - */ -#define CUB_DEFINE_DETECT_NESTED_TYPE(detector_name, nested_type_name) \ - template \ - struct detector_name \ - { \ - template \ - static char& test(typename C::nested_type_name*); \ - template \ - static int& test(...); \ - enum \ - { \ - VALUE = sizeof(test(0)) < sizeof(int) \ - }; \ - }; - - - -/****************************************************************************** - * Simple enable-if (similar to Boost) - ******************************************************************************/ - -/** - * \brief Simple enable-if (similar to Boost) - */ -template -struct EnableIf -{ - /// Enable-if type for SFINAE dummy variables - typedef T Type; -}; - - -template -struct EnableIf {}; - - - -/****************************************************************************** - * Typedef-detection - ******************************************************************************/ - -/** - * \brief Determine whether or not BinaryOp's functor is of the form bool operator()(const T& a, const T&b) or bool operator()(const T& a, const T&b, unsigned int idx) - */ -template -struct BinaryOpHasIdxParam -{ -private: -/* - template struct SFINAE1 {}; - template struct SFINAE2 {}; - template struct SFINAE3 {}; - template struct SFINAE4 {}; -*/ - template struct SFINAE5 {}; - template struct SFINAE6 {}; - template struct SFINAE7 {}; - template struct SFINAE8 {}; -/* - template static char Test(SFINAE1 *); - template static char Test(SFINAE2 *); - template static char Test(SFINAE3 *); - template static char Test(SFINAE4 *); -*/ - template __host__ __device__ static char Test(SFINAE5 *); - template __host__ __device__ static char Test(SFINAE6 *); - template __host__ __device__ static char Test(SFINAE7 *); - template __host__ __device__ static char Test(SFINAE8 *); - - template static int Test(...); - -public: - - /// Whether the functor BinaryOp has a third unsigned int index param - static const bool HAS_PARAM = sizeof(Test(NULL)) == sizeof(char); -}; - - - - -/****************************************************************************** - * Simple type traits utilities. 
- * - * For example: - * Traits::CATEGORY // SIGNED_INTEGER - * Traits::NULL_TYPE // true - * Traits::CATEGORY // NOT_A_NUMBER - * Traits::PRIMITIVE; // false - * - ******************************************************************************/ - -/** - * \brief Basic type traits categories - */ -enum Category -{ - NOT_A_NUMBER, - SIGNED_INTEGER, - UNSIGNED_INTEGER, - FLOATING_POINT -}; - - -/** - * \brief Basic type traits - */ -template -struct BaseTraits -{ - /// Category - static const Category CATEGORY = _CATEGORY; - enum - { - PRIMITIVE = _PRIMITIVE, - NULL_TYPE = _NULL_TYPE, - }; -}; - - -/** - * Basic type traits (unsigned primitive specialization) - */ -template -struct BaseTraits -{ - typedef _UnsignedBits UnsignedBits; - - static const Category CATEGORY = UNSIGNED_INTEGER; - static const UnsignedBits LOWEST_KEY = UnsignedBits(0); - static const UnsignedBits MAX_KEY = UnsignedBits(-1); - - enum - { - PRIMITIVE = true, - NULL_TYPE = false, - }; - - - static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key) - { - return key; - } - - static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key) - { - return key; - } - - static __host__ __device__ __forceinline__ T Max() - { - UnsignedBits retval = MAX_KEY; - return reinterpret_cast(retval); - } - - static __host__ __device__ __forceinline__ T Lowest() - { - UnsignedBits retval = LOWEST_KEY; - return reinterpret_cast(retval); - } -}; - - -/** - * Basic type traits (signed primitive specialization) - */ -template -struct BaseTraits -{ - typedef _UnsignedBits UnsignedBits; - - static const Category CATEGORY = SIGNED_INTEGER; - static const UnsignedBits HIGH_BIT = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1); - static const UnsignedBits LOWEST_KEY = HIGH_BIT; - static const UnsignedBits MAX_KEY = UnsignedBits(-1) ^ HIGH_BIT; - - enum - { - PRIMITIVE = true, - NULL_TYPE = false, - }; - - static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key) - { - return key ^ HIGH_BIT; - }; - - static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key) - { - return key ^ HIGH_BIT; - }; - - static __host__ __device__ __forceinline__ T Max() - { - UnsignedBits retval = MAX_KEY; - return reinterpret_cast(retval); - } - - static __host__ __device__ __forceinline__ T Lowest() - { - UnsignedBits retval = LOWEST_KEY; - return reinterpret_cast(retval); - } -}; - -template -struct FpLimits; - -template <> -struct FpLimits -{ - static __host__ __device__ __forceinline__ float Max() { - return FLT_MAX; - } - - static __host__ __device__ __forceinline__ float Lowest() { - return FLT_MAX * float(-1); - } -}; - -template <> -struct FpLimits -{ - static __host__ __device__ __forceinline__ double Max() { - return DBL_MAX; - } - - static __host__ __device__ __forceinline__ double Lowest() { - return DBL_MAX * double(-1); - } -}; - - -#if (__CUDACC_VER_MAJOR__ >= 9) -template <> -struct FpLimits<__half> -{ - static __host__ __device__ __forceinline__ __half Max() { - unsigned short max_word = 0x7BFF; - return reinterpret_cast<__half&>(max_word); - } - - static __host__ __device__ __forceinline__ __half Lowest() { - unsigned short lowest_word = 0xFBFF; - return reinterpret_cast<__half&>(lowest_word); - } -}; -#endif - - -/** - * Basic type traits (fp primitive specialization) - */ -template -struct BaseTraits -{ - typedef _UnsignedBits UnsignedBits; - - static const Category CATEGORY = FLOATING_POINT; - static const UnsignedBits HIGH_BIT = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1); - 
static const UnsignedBits LOWEST_KEY = UnsignedBits(-1); - static const UnsignedBits MAX_KEY = UnsignedBits(-1) ^ HIGH_BIT; - - enum - { - PRIMITIVE = true, - NULL_TYPE = false, - }; - - static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key) - { - UnsignedBits mask = (key & HIGH_BIT) ? UnsignedBits(-1) : HIGH_BIT; - return key ^ mask; - }; - - static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key) - { - UnsignedBits mask = (key & HIGH_BIT) ? HIGH_BIT : UnsignedBits(-1); - return key ^ mask; - }; - - static __host__ __device__ __forceinline__ T Max() { - return FpLimits::Max(); - } - - static __host__ __device__ __forceinline__ T Lowest() { - return FpLimits::Lowest(); - } -}; - - -/** - * \brief Numeric type traits - */ -template struct NumericTraits : BaseTraits {}; - -template <> struct NumericTraits : BaseTraits {}; - -template <> struct NumericTraits : BaseTraits<(std::numeric_limits::is_signed) ? SIGNED_INTEGER : UNSIGNED_INTEGER, true, false, unsigned char, char> {}; -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; - -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; - -template <> struct NumericTraits : BaseTraits {}; -template <> struct NumericTraits : BaseTraits {}; -#if (__CUDACC_VER_MAJOR__ >= 9) - template <> struct NumericTraits<__half> : BaseTraits {}; -#endif - -template <> struct NumericTraits : BaseTraits::VolatileWord, bool> {}; - - - -/** - * \brief Type traits - */ -template -struct Traits : NumericTraits::Type> {}; - - -#endif // DOXYGEN_SHOULD_SKIP_THIS - - -/** @} */ // end group UtilModule - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/warp/specializations/warp_reduce_shfl.cuh b/SRC/cub/warp/specializations/warp_reduce_shfl.cuh deleted file mode 100644 index bbbf37e5..00000000 --- a/SRC/cub/warp/specializations/warp_reduce_shfl.cuh +++ /dev/null @@ -1,541 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
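// A standalone sketch of the key-twiddling scheme in the traits
// above: map keys to unsigned bit patterns whose unsigned order
// matches numeric order (flip the sign bit of signed integers; for
// floats, flip the sign bit of positives and all bits of negatives).
// Function names are illustrative.
#include <cstdint>
#include <cstring>
#include <cstdio>

uint32_t twiddle_in_i32(int32_t k)
{
    return uint32_t(k) ^ 0x80000000u;       // signed: flip high bit
}

uint32_t twiddle_in_f32(float f)
{
    uint32_t u;
    std::memcpy(&u, &f, sizeof(u));         // bit pattern of the float
    uint32_t mask = (u & 0x80000000u) ? 0xffffffffu : 0x80000000u;
    return u ^ mask;
}

int main()
{
    // -5 < 3 and -2.5f < 1.0f must survive as unsigned comparisons.
    printf("int:   %u < %u\n", twiddle_in_i32(-5), twiddle_in_i32(3));
    printf("float: %u < %u\n", twiddle_in_f32(-2.5f), twiddle_in_f32(1.0f));
    return 0;
}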
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp. - */ - -#pragma once - -#include "../../thread/thread_operators.cuh" -#include "../../util_ptx.cuh" -#include "../../util_type.cuh" -#include "../../util_macro.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \brief WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp. - * - * LOGICAL_WARP_THREADS must be a power-of-two - */ -template < - typename T, ///< Data type being reduced - int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp - int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective -struct WarpReduceShfl -{ - //--------------------------------------------------------------------- - // Constants and type definitions - //--------------------------------------------------------------------- - - enum - { - /// Whether the logical warp size and the PTX warp size coincide - IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), - - /// The number of warp reduction steps - STEPS = Log2::VALUE, - - /// Number of logical warps in a PTX warp - LOGICAL_WARPS = CUB_WARP_THREADS(PTX_ARCH) / LOGICAL_WARP_THREADS, - - /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up - SHFL_C = (CUB_WARP_THREADS(PTX_ARCH) - LOGICAL_WARP_THREADS) << 8 - - }; - - template - struct IsInteger - { - enum { - ///Whether the data type is a small (32b or less) integer for which we can use a single SFHL instruction per exchange - IS_SMALL_UNSIGNED = (Traits::CATEGORY == UNSIGNED_INTEGER) && (sizeof(S) <= sizeof(unsigned int)) - }; - }; - - - /// Shared memory storage layout type - typedef NullType TempStorage; - - - //--------------------------------------------------------------------- - // Thread fields - //--------------------------------------------------------------------- - - /// Lane index in logical warp - unsigned int lane_id; - - /// Logical warp index in 32-thread physical warp - unsigned int warp_id; - - /// 32-thread physical warp member mask of logical warp - unsigned int member_mask; - - - //--------------------------------------------------------------------- - // Construction - //--------------------------------------------------------------------- - - /// Constructor - __device__ __forceinline__ WarpReduceShfl( - TempStorage &/*temp_storage*/) - { - lane_id = LaneId(); - warp_id = 0; - member_mask = 0xffffffffu >> (CUB_WARP_THREADS(PTX_ARCH) - LOGICAL_WARP_THREADS); - - if (!IS_ARCH_WARP) - { - warp_id = lane_id / LOGICAL_WARP_THREADS; - lane_id = lane_id % LOGICAL_WARP_THREADS; - member_mask = member_mask << (warp_id * LOGICAL_WARP_THREADS); - } - } - - - 
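// An intrinsic-level sketch of the shuffle reduction that the
// ReduceStep specializations below express in inline PTX: log2(32)
// steps, each folding in a value pulled from `offset` lanes down.
// Assumes a full warp and a commutative operator; the PTX versions
// additionally use the shuffle predicate to reject out-of-segment
// peers.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void warp_sum_demo()
{
    int value = threadIdx.x;                  // lane id as the input
    for (int offset = 16; offset > 0; offset >>= 1)
        value += __shfl_down_sync(0xffffffffu, value, offset);
    if (threadIdx.x == 0)
        printf("warp sum = %d\n", value);     // 0 + 1 + ... + 31 = 496
}

int main()
{
    warp_sum_demo<<<1, 32>>>();
    cudaDeviceSynchronize();
    return 0;
}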
//--------------------------------------------------------------------- - // Reduction steps - //--------------------------------------------------------------------- - - /// Reduction (specialized for summation across uint32 types) - __device__ __forceinline__ unsigned int ReduceStep( - unsigned int input, ///< [in] Calling thread's input item. - cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator - int last_lane, ///< [in] Index of last lane in segment - int offset) ///< [in] Up-offset to pull from - { - unsigned int output; - int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane) - - // Use predicate set from SHFL to guard against invalid peers -#ifdef CUB_USE_COOPERATIVE_GROUPS - asm volatile( - "{" - " .reg .u32 r0;" - " .reg .pred p;" - " shfl.sync.down.b32 r0|p, %1, %2, %3, %5;" - " @p add.u32 r0, r0, %4;" - " mov.u32 %0, r0;" - "}" - : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input), "r"(member_mask)); -#else - asm volatile( - "{" - " .reg .u32 r0;" - " .reg .pred p;" - " shfl.down.b32 r0|p, %1, %2, %3;" - " @p add.u32 r0, r0, %4;" - " mov.u32 %0, r0;" - "}" - : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input)); -#endif - - return output; - } - - - /// Reduction (specialized for summation across fp32 types) - __device__ __forceinline__ float ReduceStep( - float input, ///< [in] Calling thread's input item. - cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator - int last_lane, ///< [in] Index of last lane in segment - int offset) ///< [in] Up-offset to pull from - { - float output; - int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane) - - // Use predicate set from SHFL to guard against invalid peers -#ifdef CUB_USE_COOPERATIVE_GROUPS - asm volatile( - "{" - " .reg .f32 r0;" - " .reg .pred p;" - " shfl.sync.down.b32 r0|p, %1, %2, %3, %5;" - " @p add.f32 r0, r0, %4;" - " mov.f32 %0, r0;" - "}" - : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input), "r"(member_mask)); -#else - asm volatile( - "{" - " .reg .f32 r0;" - " .reg .pred p;" - " shfl.down.b32 r0|p, %1, %2, %3;" - " @p add.f32 r0, r0, %4;" - " mov.f32 %0, r0;" - "}" - : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input)); -#endif - - return output; - } - - - /// Reduction (specialized for summation across unsigned long long types) - __device__ __forceinline__ unsigned long long ReduceStep( - unsigned long long input, ///< [in] Calling thread's input item. 
- cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator - int last_lane, ///< [in] Index of last lane in segment - int offset) ///< [in] Up-offset to pull from - { - unsigned long long output; - int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane) - -#ifdef CUB_USE_COOPERATIVE_GROUPS - asm volatile( - "{" - " .reg .u32 lo;" - " .reg .u32 hi;" - " .reg .pred p;" - " mov.b64 {lo, hi}, %1;" - " shfl.sync.down.b32 lo|p, lo, %2, %3, %4;" - " shfl.sync.down.b32 hi|p, hi, %2, %3, %4;" - " mov.b64 %0, {lo, hi};" - " @p add.u64 %0, %0, %1;" - "}" - : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "r"(member_mask)); -#else - asm volatile( - "{" - " .reg .u32 lo;" - " .reg .u32 hi;" - " .reg .pred p;" - " mov.b64 {lo, hi}, %1;" - " shfl.down.b32 lo|p, lo, %2, %3;" - " shfl.down.b32 hi|p, hi, %2, %3;" - " mov.b64 %0, {lo, hi};" - " @p add.u64 %0, %0, %1;" - "}" - : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c)); -#endif - - return output; - } - - - /// Reduction (specialized for summation across long long types) - __device__ __forceinline__ long long ReduceStep( - long long input, ///< [in] Calling thread's input item. - cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator - int last_lane, ///< [in] Index of last lane in segment - int offset) ///< [in] Up-offset to pull from - { - long long output; - int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane) - - // Use predicate set from SHFL to guard against invalid peers -#ifdef CUB_USE_COOPERATIVE_GROUPS - asm volatile( - "{" - " .reg .u32 lo;" - " .reg .u32 hi;" - " .reg .pred p;" - " mov.b64 {lo, hi}, %1;" - " shfl.sync.down.b32 lo|p, lo, %2, %3, %4;" - " shfl.sync.down.b32 hi|p, hi, %2, %3, %4;" - " mov.b64 %0, {lo, hi};" - " @p add.s64 %0, %0, %1;" - "}" - : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "r"(member_mask)); -#else - asm volatile( - "{" - " .reg .u32 lo;" - " .reg .u32 hi;" - " .reg .pred p;" - " mov.b64 {lo, hi}, %1;" - " shfl.down.b32 lo|p, lo, %2, %3;" - " shfl.down.b32 hi|p, hi, %2, %3;" - " mov.b64 %0, {lo, hi};" - " @p add.s64 %0, %0, %1;" - "}" - : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c)); -#endif - - return output; - } - - - /// Reduction (specialized for summation across double types) - __device__ __forceinline__ double ReduceStep( - double input, ///< [in] Calling thread's input item. 
- cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator - int last_lane, ///< [in] Index of last lane in segment - int offset) ///< [in] Up-offset to pull from - { - double output; - int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane) - - // Use predicate set from SHFL to guard against invalid peers -#ifdef CUB_USE_COOPERATIVE_GROUPS - asm volatile( - "{" - " .reg .u32 lo;" - " .reg .u32 hi;" - " .reg .pred p;" - " .reg .f64 r0;" - " mov.b64 %0, %1;" - " mov.b64 {lo, hi}, %1;" - " shfl.sync.down.b32 lo|p, lo, %2, %3, %4;" - " shfl.sync.down.b32 hi|p, hi, %2, %3, %4;" - " mov.b64 r0, {lo, hi};" - " @p add.f64 %0, %0, r0;" - "}" - : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c), "r"(member_mask)); -#else - asm volatile( - "{" - " .reg .u32 lo;" - " .reg .u32 hi;" - " .reg .pred p;" - " .reg .f64 r0;" - " mov.b64 %0, %1;" - " mov.b64 {lo, hi}, %1;" - " shfl.down.b32 lo|p, lo, %2, %3;" - " shfl.down.b32 hi|p, hi, %2, %3;" - " mov.b64 r0, {lo, hi};" - " @p add.f64 %0, %0, r0;" - "}" - : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c)); -#endif - - return output; - } - - - /// Reduction (specialized for swizzled ReduceByKeyOp across KeyValuePair types) - template - __device__ __forceinline__ KeyValuePair ReduceStep( - KeyValuePair input, ///< [in] Calling thread's input item. - SwizzleScanOp > /*reduction_op*/, ///< [in] Binary reduction operator - int last_lane, ///< [in] Index of last lane in segment - int offset) ///< [in] Up-offset to pull from - { - KeyValuePair output; - - KeyT other_key = ShuffleDown(input.key, offset, last_lane, member_mask); - - output.key = input.key; - output.value = ReduceStep( - input.value, - cub::Sum(), - last_lane, - offset, - Int2Type::IS_SMALL_UNSIGNED>()); - - if (input.key != other_key) - output.value = input.value; - - return output; - } - - - - /// Reduction (specialized for swizzled ReduceBySegmentOp across KeyValuePair types) - template - __device__ __forceinline__ KeyValuePair ReduceStep( - KeyValuePair input, ///< [in] Calling thread's input item. - SwizzleScanOp > /*reduction_op*/, ///< [in] Binary reduction operator - int last_lane, ///< [in] Index of last lane in segment - int offset) ///< [in] Up-offset to pull from - { - KeyValuePair output; - - output.value = ReduceStep(input.value, cub::Sum(), last_lane, offset, Int2Type::IS_SMALL_UNSIGNED>()); - output.key = ReduceStep(input.key, cub::Sum(), last_lane, offset, Int2Type::IS_SMALL_UNSIGNED>()); - - if (input.key > 0) - output.value = input.value; - - return output; - } - - - /// Reduction step (generic) - template - __device__ __forceinline__ _T ReduceStep( - _T input, ///< [in] Calling thread's input item. - ReductionOp reduction_op, ///< [in] Binary reduction operator - int last_lane, ///< [in] Index of last lane in segment - int offset) ///< [in] Up-offset to pull from - { - _T output = input; - - _T temp = ShuffleDown(output, offset, last_lane, member_mask); - - // Perform reduction op if valid - if (offset + lane_id <= last_lane) - output = reduction_op(input, temp); - - return output; - } - - - /// Reduction step (specialized for small unsigned integers size 32b or less) - template - __device__ __forceinline__ _T ReduceStep( - _T input, ///< [in] Calling thread's input item. 
- ReductionOp reduction_op, ///< [in] Binary reduction operator - int last_lane, ///< [in] Index of last lane in segment - int offset, ///< [in] Up-offset to pull from - Int2Type /*is_small_unsigned*/) ///< [in] Marker type indicating whether T is a small unsigned integer - { - return ReduceStep(input, reduction_op, last_lane, offset); - } - - - /// Reduction step (specialized for types other than small unsigned integers size 32b or less) - template - __device__ __forceinline__ _T ReduceStep( - _T input, ///< [in] Calling thread's input item. - ReductionOp reduction_op, ///< [in] Binary reduction operator - int last_lane, ///< [in] Index of last lane in segment - int offset, ///< [in] Up-offset to pull from - Int2Type /*is_small_unsigned*/) ///< [in] Marker type indicating whether T is a small unsigned integer - { - return ReduceStep(input, reduction_op, last_lane, offset); - } - - - //--------------------------------------------------------------------- - // Templated inclusive scan iteration - //--------------------------------------------------------------------- - - template - __device__ __forceinline__ void ReduceStep( - T& input, ///< [in] Calling thread's input item. - ReductionOp reduction_op, ///< [in] Binary reduction operator - int last_lane, ///< [in] Index of last lane in segment - Int2Type /*step*/) - { - input = ReduceStep(input, reduction_op, last_lane, 1 << STEP, Int2Type::IS_SMALL_UNSIGNED>()); - - ReduceStep(input, reduction_op, last_lane, Int2Type()); - } - - template - __device__ __forceinline__ void ReduceStep( - T& /*input*/, ///< [in] Calling thread's input item. - ReductionOp /*reduction_op*/, ///< [in] Binary reduction operator - int /*last_lane*/, ///< [in] Index of last lane in segment - Int2Type /*step*/) - {} - - - //--------------------------------------------------------------------- - // Reduction operations - //--------------------------------------------------------------------- - - /// Reduction - template < - bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items - typename ReductionOp> - __device__ __forceinline__ T Reduce( - T input, ///< [in] Calling thread's input - int valid_items, ///< [in] Total number of valid items across the logical warp - ReductionOp reduction_op) ///< [in] Binary reduction operator - { - int last_lane = (ALL_LANES_VALID) ? - LOGICAL_WARP_THREADS - 1 : - valid_items - 1; - - T output = input; - -// // Iterate reduction steps -// #pragma unroll -// for (int STEP = 0; STEP < STEPS; STEP++) -// { -// output = ReduceStep(output, reduction_op, last_lane, 1 << STEP, Int2Type::IS_SMALL_UNSIGNED>()); -// } - - // Template-iterate reduction steps - ReduceStep(output, reduction_op, last_lane, Int2Type<0>()); - - return output; - } - - - /// Segmented reduction - template < - bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail - typename FlagT, - typename ReductionOp> - __device__ __forceinline__ T SegmentedReduce( - T input, ///< [in] Calling thread's input - FlagT flag, ///< [in] Whether or not the current lane is a segment head/tail - ReductionOp reduction_op) ///< [in] Binary reduction operator - { - // Get the start flags for each thread in the warp. 
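// A standalone sketch of the "locate my segment's last lane" step
// performed here: ballot the tail flags, keep only lanes at or above
// the caller (LaneMaskGe), force the warp boundary to act as a tail,
// then find the lowest set bit with __brev + __clz. Assumes a full
// 32-thread warp and tail-segmented flags; names are illustrative.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void last_lane_demo()
{
    unsigned int lane = threadIdx.x;
    int tail_flag = (lane == 7 || lane == 19);       // two segment tails
    unsigned int flags = __ballot_sync(0xffffffffu, tail_flag);
    flags &= ~((1u << lane) - 1);                    // keep lanes >= me
    flags |= 1u << 31;                               // warp end is a tail
    int last_lane = __clz(__brev(flags));            // lowest set bit
    printf("lane %2u -> last lane %2d\n", lane, last_lane);
}

int main()
{
    last_lane_demo<<<1, 32>>>();
    cudaDeviceSynchronize();
    return 0;
}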
- int warp_flags = WARP_BALLOT(flag, member_mask); - - // Convert to tail-segmented - if (HEAD_SEGMENTED) - warp_flags >>= 1; - - // Mask out the bits below the current thread - warp_flags &= LaneMaskGe(); - - // Mask of physical lanes outside the logical warp and convert to logical lanemask - if (!IS_ARCH_WARP) - { - warp_flags = (warp_flags & member_mask) >> (warp_id * LOGICAL_WARP_THREADS); - } - - // Mask in the last lane of logical warp - warp_flags |= 1u << (LOGICAL_WARP_THREADS - 1); - - // Find the next set flag - int last_lane = __clz(__brev(warp_flags)); - - T output = input; - -// // Iterate reduction steps -// #pragma unroll -// for (int STEP = 0; STEP < STEPS; STEP++) -// { -// output = ReduceStep(output, reduction_op, last_lane, 1 << STEP, Int2Type::IS_SMALL_UNSIGNED>()); -// } - - // Template-iterate reduction steps - ReduceStep(output, reduction_op, last_lane, Int2Type<0>()); - - return output; - } -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/warp/specializations/warp_reduce_smem.cuh b/SRC/cub/warp/specializations/warp_reduce_smem.cuh deleted file mode 100644 index 7baa573b..00000000 --- a/SRC/cub/warp/specializations/warp_reduce_smem.cuh +++ /dev/null @@ -1,372 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned across a CUDA thread warp. 
- */ - -#pragma once - -#include "../../thread/thread_operators.cuh" -#include "../../thread/thread_load.cuh" -#include "../../thread/thread_store.cuh" -#include "../../util_type.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \brief WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned across a CUDA thread warp. - */ -template < - typename T, ///< Data type being reduced - int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp - int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective -struct WarpReduceSmem -{ - /****************************************************************************** - * Constants and type definitions - ******************************************************************************/ - - enum - { - /// Whether the logical warp size and the PTX warp size coincide - IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), - - /// Whether the logical warp size is a power-of-two - IS_POW_OF_TWO = PowerOfTwo::VALUE, - - /// The number of warp scan steps - STEPS = Log2::VALUE, - - /// The number of threads in half a warp - HALF_WARP_THREADS = 1 << (STEPS - 1), - - /// The number of shared memory elements per warp - WARP_SMEM_ELEMENTS = LOGICAL_WARP_THREADS + HALF_WARP_THREADS, - - /// FlagT status (when not using ballot) - UNSET = 0x0, // Is initially unset - SET = 0x1, // Is initially set - SEEN = 0x2, // Has seen another head flag from a successor peer - }; - - /// Shared memory flag type - typedef unsigned char SmemFlag; - - /// Shared memory storage layout type (1.5 warps-worth of elements for each warp) - struct _TempStorage - { - T reduce[WARP_SMEM_ELEMENTS]; - SmemFlag flags[WARP_SMEM_ELEMENTS]; - }; - - // Alias wrapper allowing storage to be unioned - struct TempStorage : Uninitialized<_TempStorage> {}; - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - _TempStorage &temp_storage; - unsigned int lane_id; - unsigned int member_mask; - - - /****************************************************************************** - * Construction - ******************************************************************************/ - - /// Constructor - __device__ __forceinline__ WarpReduceSmem( - TempStorage &temp_storage) - : - temp_storage(temp_storage.Alias()), - - lane_id(IS_ARCH_WARP ? - LaneId() : - LaneId() % LOGICAL_WARP_THREADS), - - member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP || !IS_POW_OF_TWO ) ? 
- 0 : // arch-width and non-power-of-two subwarps cannot be tiled with the arch-warp - ((LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS))) - {} - - /****************************************************************************** - * Utility methods - ******************************************************************************/ - - //--------------------------------------------------------------------- - // Regular reduction - //--------------------------------------------------------------------- - - /** - * Reduction step - */ - template < - bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items - typename ReductionOp, - int STEP> - __device__ __forceinline__ T ReduceStep( - T input, ///< [in] Calling thread's input - int valid_items, ///< [in] Total number of valid items across the logical warp - ReductionOp reduction_op, ///< [in] Reduction operator - Int2Type /*step*/) - { - const int OFFSET = 1 << STEP; - - // Share input through buffer - ThreadStore(&temp_storage.reduce[lane_id], input); - - WARP_SYNC(member_mask); - - // Update input if peer_addend is in range - if ((ALL_LANES_VALID && IS_POW_OF_TWO) || ((lane_id + OFFSET) < valid_items)) - { - T peer_addend = ThreadLoad(&temp_storage.reduce[lane_id + OFFSET]); - input = reduction_op(input, peer_addend); - } - - WARP_SYNC(member_mask); - - return ReduceStep(input, valid_items, reduction_op, Int2Type()); - } - - - /** - * Reduction step (terminate) - */ - template < - bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items - typename ReductionOp> - __device__ __forceinline__ T ReduceStep( - T input, ///< [in] Calling thread's input - int valid_items, ///< [in] Total number of valid items across the logical warp - ReductionOp /*reduction_op*/, ///< [in] Reduction operator - Int2Type /*step*/) - { - return input; - } - - - //--------------------------------------------------------------------- - // Segmented reduction - //--------------------------------------------------------------------- - - - /** - * Ballot-based segmented reduce - */ - template < - bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail - typename FlagT, - typename ReductionOp> - __device__ __forceinline__ T SegmentedReduce( - T input, ///< [in] Calling thread's input - FlagT flag, ///< [in] Whether or not the current lane is a segment head/tail - ReductionOp reduction_op, ///< [in] Reduction operator - Int2Type /*has_ballot*/) ///< [in] Marker type for whether the target arch has ballot functionality - { - // Get the start flags for each thread in the warp. - int warp_flags = WARP_BALLOT(flag, member_mask); - - if (!HEAD_SEGMENTED) - warp_flags <<= 1; - - // Keep bits above the current thread. 
- warp_flags &= LaneMaskGt(); - - // Accommodate packing of multiple logical warps in a single physical warp - if (!IS_ARCH_WARP) - { - warp_flags >>= (LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS; - } - - // Find next flag - int next_flag = __clz(__brev(warp_flags)); - - // Clip the next segment at the warp boundary if necessary - if (LOGICAL_WARP_THREADS != 32) - next_flag = CUB_MIN(next_flag, LOGICAL_WARP_THREADS); - - #pragma unroll - for (int STEP = 0; STEP < STEPS; STEP++) - { - const int OFFSET = 1 << STEP; - - // Share input into buffer - ThreadStore(&temp_storage.reduce[lane_id], input); - - WARP_SYNC(member_mask); - - // Update input if peer_addend is in range - if (OFFSET + lane_id < next_flag) - { - T peer_addend = ThreadLoad(&temp_storage.reduce[lane_id + OFFSET]); - input = reduction_op(input, peer_addend); - } - - WARP_SYNC(member_mask); - } - - return input; - } - - - /** - * Smem-based segmented reduce - */ - template < - bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail - typename FlagT, - typename ReductionOp> - __device__ __forceinline__ T SegmentedReduce( - T input, ///< [in] Calling thread's input - FlagT flag, ///< [in] Whether or not the current lane is a segment head/tail - ReductionOp reduction_op, ///< [in] Reduction operator - Int2Type /*has_ballot*/) ///< [in] Marker type for whether the target arch has ballot functionality - { - enum - { - UNSET = 0x0, // Is initially unset - SET = 0x1, // Is initially set - SEEN = 0x2, // Has seen another head flag from a successor peer - }; - - // Alias flags onto shared data storage - volatile SmemFlag *flag_storage = temp_storage.flags; - - SmemFlag flag_status = (flag) ? SET : UNSET; - - for (int STEP = 0; STEP < STEPS; STEP++) - { - const int OFFSET = 1 << STEP; - - // Share input through buffer - ThreadStore(&temp_storage.reduce[lane_id], input); - - WARP_SYNC(member_mask); - - // Get peer from buffer - T peer_addend = ThreadLoad(&temp_storage.reduce[lane_id + OFFSET]); - - WARP_SYNC(member_mask); - - // Share flag through buffer - flag_storage[lane_id] = flag_status; - - // Get peer flag from buffer - SmemFlag peer_flag_status = flag_storage[lane_id + OFFSET]; - - // Update input if peer was in range - if (lane_id < LOGICAL_WARP_THREADS - OFFSET) - { - if (HEAD_SEGMENTED) - { - // Head-segmented - if ((flag_status & SEEN) == 0) - { - // Has not seen a more distant head flag - if (peer_flag_status & SET) - { - // Has now seen a head flag - flag_status |= SEEN; - } - else - { - // Peer is not a head flag: grab its count - input = reduction_op(input, peer_addend); - } - - // Update seen status to include that of peer - flag_status |= (peer_flag_status & SEEN); - } - } - else - { - // Tail-segmented. 
Simply propagate flag status - if (!flag_status) - { - input = reduction_op(input, peer_addend); - flag_status |= peer_flag_status; - } - - } - } - } - - return input; - } - - - /****************************************************************************** - * Interface - ******************************************************************************/ - - /** - * Reduction - */ - template < - bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items - typename ReductionOp> - __device__ __forceinline__ T Reduce( - T input, ///< [in] Calling thread's input - int valid_items, ///< [in] Total number of valid items across the logical warp - ReductionOp reduction_op) ///< [in] Reduction operator - { - return ReduceStep(input, valid_items, reduction_op, Int2Type<0>()); - } - - - /** - * Segmented reduction - */ - template < - bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail - typename FlagT, - typename ReductionOp> - __device__ __forceinline__ T SegmentedReduce( - T input, ///< [in] Calling thread's input - FlagT flag, ///< [in] Whether or not the current lane is a segment head/tail - ReductionOp reduction_op) ///< [in] Reduction operator - { - return SegmentedReduce(input, flag, reduction_op, Int2Type<(PTX_ARCH >= 200)>()); - } - - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/warp/specializations/warp_scan_shfl.cuh b/SRC/cub/warp/specializations/warp_scan_shfl.cuh deleted file mode 100644 index 7f4e1c94..00000000 --- a/SRC/cub/warp/specializations/warp_scan_shfl.cuh +++ /dev/null @@ -1,632 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
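// An intrinsic-level sketch of the shuffle scan this header
// implements in inline PTX below: each step pulls a partial sum from
// `offset` lanes up, and lanes whose source would fall outside the
// warp skip the add (the role of the PTX predicate). Assumes a full
// 32-thread warp.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void warp_scan_demo()
{
    int value = 1;                                 // all-ones input
    for (int offset = 1; offset < 32; offset <<= 1)
    {
        int up = __shfl_up_sync(0xffffffffu, value, offset);
        if (threadIdx.x >= (unsigned)offset)       // guard invalid peers
            value += up;
    }
    // Inclusive scan of ones yields lane_id + 1.
    printf("lane %2u inclusive sum = %2d\n", threadIdx.x, value);
}

int main()
{
    warp_scan_demo<<<1, 32>>>();
    cudaDeviceSynchronize();
    return 0;
}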
- * - ******************************************************************************/ - -/** - * \file - * cub::WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned across a CUDA thread warp. - */ - -#pragma once - -#include "../../thread/thread_operators.cuh" -#include "../../util_type.cuh" -#include "../../util_ptx.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \brief WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned across a CUDA thread warp. - * - * LOGICAL_WARP_THREADS must be a power-of-two - */ -template < - typename T, ///< Data type being scanned - int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp - int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective -struct WarpScanShfl -{ - //--------------------------------------------------------------------- - // Constants and type definitions - //--------------------------------------------------------------------- - - enum - { - /// Whether the logical warp size and the PTX warp size coincide - IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), - - /// The number of warp scan steps - STEPS = Log2::VALUE, - - /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up - SHFL_C = (CUB_WARP_THREADS(PTX_ARCH) - LOGICAL_WARP_THREADS) << 8 - }; - - template - struct IntegerTraits - { - enum { - ///Whether the data type is a small (32b or less) integer for which we can use a single SFHL instruction per exchange - IS_SMALL_UNSIGNED = (Traits::CATEGORY == UNSIGNED_INTEGER) && (sizeof(S) <= sizeof(unsigned int)) - }; - }; - - /// Shared memory storage layout type - struct TempStorage {}; - - - //--------------------------------------------------------------------- - // Thread fields - //--------------------------------------------------------------------- - - /// Lane index in logical warp - unsigned int lane_id; - - /// Logical warp index in 32-thread physical warp - unsigned int warp_id; - - /// 32-thread physical warp member mask of logical warp - unsigned int member_mask; - - //--------------------------------------------------------------------- - // Construction - //--------------------------------------------------------------------- - - /// Constructor - __device__ __forceinline__ WarpScanShfl( - TempStorage &/*temp_storage*/) - { - lane_id = LaneId(); - warp_id = 0; - member_mask = 0xffffffffu >> (CUB_WARP_THREADS(PTX_ARCH) - LOGICAL_WARP_THREADS); - - if (!IS_ARCH_WARP) - { - warp_id = lane_id / LOGICAL_WARP_THREADS; - lane_id = lane_id % LOGICAL_WARP_THREADS; - member_mask = member_mask << (warp_id * LOGICAL_WARP_THREADS); - } - } - - - //--------------------------------------------------------------------- - // Inclusive scan steps - //--------------------------------------------------------------------- - - /// Inclusive prefix scan step (specialized for summation across int32 types) - __device__ __forceinline__ int InclusiveScanStep( - int input, ///< [in] Calling thread's input item. 
- cub::Sum /*scan_op*/, ///< [in] Binary scan operator - int first_lane, ///< [in] Index of first lane in segment - int offset) ///< [in] Up-offset to pull from - { - int output; - int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) - - // Use predicate set from SHFL to guard against invalid peers -#ifdef CUB_USE_COOPERATIVE_GROUPS - asm volatile( - "{" - " .reg .s32 r0;" - " .reg .pred p;" - " shfl.sync.up.b32 r0|p, %1, %2, %3, %5;" - " @p add.s32 r0, r0, %4;" - " mov.s32 %0, r0;" - "}" - : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input), "r"(member_mask)); -#else - asm volatile( - "{" - " .reg .s32 r0;" - " .reg .pred p;" - " shfl.up.b32 r0|p, %1, %2, %3;" - " @p add.s32 r0, r0, %4;" - " mov.s32 %0, r0;" - "}" - : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input)); -#endif - - return output; - } - - /// Inclusive prefix scan step (specialized for summation across uint32 types) - __device__ __forceinline__ unsigned int InclusiveScanStep( - unsigned int input, ///< [in] Calling thread's input item. - cub::Sum /*scan_op*/, ///< [in] Binary scan operator - int first_lane, ///< [in] Index of first lane in segment - int offset) ///< [in] Up-offset to pull from - { - unsigned int output; - int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) - - // Use predicate set from SHFL to guard against invalid peers -#ifdef CUB_USE_COOPERATIVE_GROUPS - asm volatile( - "{" - " .reg .u32 r0;" - " .reg .pred p;" - " shfl.sync.up.b32 r0|p, %1, %2, %3, %5;" - " @p add.u32 r0, r0, %4;" - " mov.u32 %0, r0;" - "}" - : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input), "r"(member_mask)); -#else - asm volatile( - "{" - " .reg .u32 r0;" - " .reg .pred p;" - " shfl.up.b32 r0|p, %1, %2, %3;" - " @p add.u32 r0, r0, %4;" - " mov.u32 %0, r0;" - "}" - : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input)); -#endif - - return output; - } - - - /// Inclusive prefix scan step (specialized for summation across fp32 types) - __device__ __forceinline__ float InclusiveScanStep( - float input, ///< [in] Calling thread's input item. - cub::Sum /*scan_op*/, ///< [in] Binary scan operator - int first_lane, ///< [in] Index of first lane in segment - int offset) ///< [in] Up-offset to pull from - { - float output; - int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) - - // Use predicate set from SHFL to guard against invalid peers -#ifdef CUB_USE_COOPERATIVE_GROUPS - asm volatile( - "{" - " .reg .f32 r0;" - " .reg .pred p;" - " shfl.sync.up.b32 r0|p, %1, %2, %3, %5;" - " @p add.f32 r0, r0, %4;" - " mov.f32 %0, r0;" - "}" - : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input), "r"(member_mask)); -#else - asm volatile( - "{" - " .reg .f32 r0;" - " .reg .pred p;" - " shfl.up.b32 r0|p, %1, %2, %3;" - " @p add.f32 r0, r0, %4;" - " mov.f32 %0, r0;" - "}" - : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input)); -#endif - - return output; - } - - - /// Inclusive prefix scan step (specialized for summation across unsigned long long types) - __device__ __forceinline__ unsigned long long InclusiveScanStep( - unsigned long long input, ///< [in] Calling thread's input item. 
- cub::Sum /*scan_op*/, ///< [in] Binary scan operator - int first_lane, ///< [in] Index of first lane in segment - int offset) ///< [in] Up-offset to pull from - { - unsigned long long output; - int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) - - // Use predicate set from SHFL to guard against invalid peers -#ifdef CUB_USE_COOPERATIVE_GROUPS - asm volatile( - "{" - " .reg .u64 r0;" - " .reg .u32 lo;" - " .reg .u32 hi;" - " .reg .pred p;" - " mov.b64 {lo, hi}, %1;" - " shfl.sync.up.b32 lo|p, lo, %2, %3, %5;" - " shfl.sync.up.b32 hi|p, hi, %2, %3, %5;" - " mov.b64 r0, {lo, hi};" - " @p add.u64 r0, r0, %4;" - " mov.u64 %0, r0;" - "}" - : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input), "r"(member_mask)); -#else - asm volatile( - "{" - " .reg .u64 r0;" - " .reg .u32 lo;" - " .reg .u32 hi;" - " .reg .pred p;" - " mov.b64 {lo, hi}, %1;" - " shfl.up.b32 lo|p, lo, %2, %3;" - " shfl.up.b32 hi|p, hi, %2, %3;" - " mov.b64 r0, {lo, hi};" - " @p add.u64 r0, r0, %4;" - " mov.u64 %0, r0;" - "}" - : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input)); -#endif - - return output; - } - - - /// Inclusive prefix scan step (specialized for summation across long long types) - __device__ __forceinline__ long long InclusiveScanStep( - long long input, ///< [in] Calling thread's input item. - cub::Sum /*scan_op*/, ///< [in] Binary scan operator - int first_lane, ///< [in] Index of first lane in segment - int offset) ///< [in] Up-offset to pull from - { - long long output; - int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) - - // Use predicate set from SHFL to guard against invalid peers -#ifdef CUB_USE_COOPERATIVE_GROUPS - asm volatile( - "{" - " .reg .s64 r0;" - " .reg .u32 lo;" - " .reg .u32 hi;" - " .reg .pred p;" - " mov.b64 {lo, hi}, %1;" - " shfl.sync.up.b32 lo|p, lo, %2, %3, %5;" - " shfl.sync.up.b32 hi|p, hi, %2, %3, %5;" - " mov.b64 r0, {lo, hi};" - " @p add.s64 r0, r0, %4;" - " mov.s64 %0, r0;" - "}" - : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input), "r"(member_mask)); -#else - asm volatile( - "{" - " .reg .s64 r0;" - " .reg .u32 lo;" - " .reg .u32 hi;" - " .reg .pred p;" - " mov.b64 {lo, hi}, %1;" - " shfl.up.b32 lo|p, lo, %2, %3;" - " shfl.up.b32 hi|p, hi, %2, %3;" - " mov.b64 r0, {lo, hi};" - " @p add.s64 r0, r0, %4;" - " mov.s64 %0, r0;" - "}" - : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input)); -#endif - - return output; - } - - - /// Inclusive prefix scan step (specialized for summation across fp64 types) - __device__ __forceinline__ double InclusiveScanStep( - double input, ///< [in] Calling thread's input item. 
- cub::Sum /*scan_op*/, ///< [in] Binary scan operator - int first_lane, ///< [in] Index of first lane in segment - int offset) ///< [in] Up-offset to pull from - { - double output; - int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) - - // Use predicate set from SHFL to guard against invalid peers -#ifdef CUB_USE_COOPERATIVE_GROUPS - asm volatile( - "{" - " .reg .u32 lo;" - " .reg .u32 hi;" - " .reg .pred p;" - " .reg .f64 r0;" - " mov.b64 %0, %1;" - " mov.b64 {lo, hi}, %1;" - " shfl.sync.up.b32 lo|p, lo, %2, %3, %4;" - " shfl.sync.up.b32 hi|p, hi, %2, %3, %4;" - " mov.b64 r0, {lo, hi};" - " @p add.f64 %0, %0, r0;" - "}" - : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c), "r"(member_mask)); -#else - asm volatile( - "{" - " .reg .u32 lo;" - " .reg .u32 hi;" - " .reg .pred p;" - " .reg .f64 r0;" - " mov.b64 %0, %1;" - " mov.b64 {lo, hi}, %1;" - " shfl.up.b32 lo|p, lo, %2, %3;" - " shfl.up.b32 hi|p, hi, %2, %3;" - " mov.b64 r0, {lo, hi};" - " @p add.f64 %0, %0, r0;" - "}" - : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c)); -#endif - - return output; - } - - -/* - /// Inclusive prefix scan (specialized for ReduceBySegmentOp across KeyValuePair types) - template - __device__ __forceinline__ KeyValuePairInclusiveScanStep( - KeyValuePair input, ///< [in] Calling thread's input item. - ReduceBySegmentOp scan_op, ///< [in] Binary scan operator - int first_lane, ///< [in] Index of first lane in segment - int offset) ///< [in] Up-offset to pull from - { - KeyValuePair output; - - output.value = InclusiveScanStep(input.value, cub::Sum(), first_lane, offset, Int2Type::IS_SMALL_UNSIGNED>()); - output.key = InclusiveScanStep(input.key, cub::Sum(), first_lane, offset, Int2Type::IS_SMALL_UNSIGNED>()); - - if (input.key > 0) - output.value = input.value; - - return output; - } -*/ - - /// Inclusive prefix scan step (generic) - template - __device__ __forceinline__ _T InclusiveScanStep( - _T input, ///< [in] Calling thread's input item. - ScanOpT scan_op, ///< [in] Binary scan operator - int first_lane, ///< [in] Index of first lane in segment - int offset) ///< [in] Up-offset to pull from - { - _T temp = ShuffleUp(input, offset, first_lane, member_mask); - - // Perform scan op if from a valid peer - _T output = scan_op(temp, input); - if (static_cast(lane_id) < first_lane + offset) - output = input; - - return output; - } - - - /// Inclusive prefix scan step (specialized for small integers size 32b or less) - template - __device__ __forceinline__ _T InclusiveScanStep( - _T input, ///< [in] Calling thread's input item. - ScanOpT scan_op, ///< [in] Binary scan operator - int first_lane, ///< [in] Index of first lane in segment - int offset, ///< [in] Up-offset to pull from - Int2Type /*is_small_unsigned*/) ///< [in] Marker type indicating whether T is a small integer - { - return InclusiveScanStep(input, scan_op, first_lane, offset); - } - - - /// Inclusive prefix scan step (specialized for types other than small integers size 32b or less) - template - __device__ __forceinline__ _T InclusiveScanStep( - _T input, ///< [in] Calling thread's input item. 
- ScanOpT scan_op, ///< [in] Binary scan operator - int first_lane, ///< [in] Index of first lane in segment - int offset, ///< [in] Up-offset to pull from - Int2Type /*is_small_unsigned*/) ///< [in] Marker type indicating whether T is a small integer - { - return InclusiveScanStep(input, scan_op, first_lane, offset); - } - - - /****************************************************************************** - * Interface - ******************************************************************************/ - - //--------------------------------------------------------------------- - // Broadcast - //--------------------------------------------------------------------- - - /// Broadcast - __device__ __forceinline__ T Broadcast( - T input, ///< [in] The value to broadcast - int src_lane) ///< [in] Which warp lane is to do the broadcasting - { - return ShuffleIndex(input, src_lane, member_mask); - } - - - //--------------------------------------------------------------------- - // Inclusive operations - //--------------------------------------------------------------------- - - /// Inclusive scan - template - __device__ __forceinline__ void InclusiveScan( - _T input, ///< [in] Calling thread's input item. - _T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. - ScanOpT scan_op) ///< [in] Binary scan operator - { - inclusive_output = input; - - // Iterate scan steps - int segment_first_lane = 0; - - // Iterate scan steps - #pragma unroll - for (int STEP = 0; STEP < STEPS; STEP++) - { - inclusive_output = InclusiveScanStep( - inclusive_output, - scan_op, - segment_first_lane, - (1 << STEP), - Int2Type::IS_SMALL_UNSIGNED>()); - } - - } - - /// Inclusive scan, specialized for reduce-value-by-key - template - __device__ __forceinline__ void InclusiveScan( - KeyValuePair input, ///< [in] Calling thread's input item. - KeyValuePair &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. - ReduceByKeyOp scan_op) ///< [in] Binary scan operator - { - inclusive_output = input; - - KeyT pred_key = ShuffleUp(inclusive_output.key, 1, 0, member_mask); - - unsigned int ballot = WARP_BALLOT((pred_key != inclusive_output.key), member_mask); - - // Mask away all lanes greater than ours - ballot = ballot & LaneMaskLe(); - - // Find index of first set bit - int segment_first_lane = CUB_MAX(0, 31 - __clz(ballot)); - - // Iterate scan steps - #pragma unroll - for (int STEP = 0; STEP < STEPS; STEP++) - { - inclusive_output.value = InclusiveScanStep( - inclusive_output.value, - scan_op.op, - segment_first_lane, - (1 << STEP), - Int2Type::IS_SMALL_UNSIGNED>()); - } - } - - - /// Inclusive scan with aggregate - template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item. - T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. - ScanOpT scan_op, ///< [in] Binary scan operator - T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. 
- { - InclusiveScan(input, inclusive_output, scan_op); - - // Grab aggregate from last warp lane - warp_aggregate = ShuffleIndex(inclusive_output, LOGICAL_WARP_THREADS - 1, member_mask); - } - - - //--------------------------------------------------------------------- - // Get exclusive from inclusive - //--------------------------------------------------------------------- - - /// Update inclusive and exclusive using input and inclusive - template - __device__ __forceinline__ void Update( - T /*input*/, ///< [in] - T &inclusive, ///< [in, out] - T &exclusive, ///< [out] - ScanOpT /*scan_op*/, ///< [in] - IsIntegerT /*is_integer*/) ///< [in] - { - // initial value unknown - exclusive = ShuffleUp(inclusive, 1, 0, member_mask); - } - - /// Update inclusive and exclusive using input and inclusive (specialized for summation of integer types) - __device__ __forceinline__ void Update( - T input, - T &inclusive, - T &exclusive, - cub::Sum /*scan_op*/, - Int2Type /*is_integer*/) - { - // initial value presumed 0 - exclusive = inclusive - input; - } - - /// Update inclusive and exclusive using initial value using input, inclusive, and initial value - template - __device__ __forceinline__ void Update ( - T /*input*/, - T &inclusive, - T &exclusive, - ScanOpT scan_op, - T initial_value, - IsIntegerT /*is_integer*/) - { - inclusive = scan_op(initial_value, inclusive); - exclusive = ShuffleUp(inclusive, 1, 0, member_mask); - - if (lane_id == 0) - exclusive = initial_value; - } - - /// Update inclusive and exclusive using initial value using input and inclusive (specialized for summation of integer types) - __device__ __forceinline__ void Update ( - T input, - T &inclusive, - T &exclusive, - cub::Sum scan_op, - T initial_value, - Int2Type /*is_integer*/) - { - inclusive = scan_op(initial_value, inclusive); - exclusive = inclusive - input; - } - - - /// Update inclusive, exclusive, and warp aggregate using input and inclusive - template - __device__ __forceinline__ void Update ( - T input, - T &inclusive, - T &exclusive, - T &warp_aggregate, - ScanOpT scan_op, - IsIntegerT is_integer) - { - warp_aggregate = ShuffleIndex(inclusive, LOGICAL_WARP_THREADS - 1, member_mask); - Update(input, inclusive, exclusive, scan_op, is_integer); - } - - /// Update inclusive, exclusive, and warp aggregate using input, inclusive, and initial value - template - __device__ __forceinline__ void Update ( - T input, - T &inclusive, - T &exclusive, - T &warp_aggregate, - ScanOpT scan_op, - T initial_value, - IsIntegerT is_integer) - { - warp_aggregate = ShuffleIndex(inclusive, LOGICAL_WARP_THREADS - 1, member_mask); - Update(input, inclusive, exclusive, scan_op, initial_value, is_integer); - } - - - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/warp/specializations/warp_scan_smem.cuh b/SRC/cub/warp/specializations/warp_scan_smem.cuh deleted file mode 100644 index 3237fcbf..00000000 --- a/SRC/cub/warp/specializations/warp_scan_smem.cuh +++ /dev/null @@ -1,397 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. 
- * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * cub::WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned across a CUDA thread warp. - */ - -#pragma once - -#include "../../thread/thread_operators.cuh" -#include "../../thread/thread_load.cuh" -#include "../../thread/thread_store.cuh" -#include "../../util_type.cuh" -#include "../../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \brief WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned across a CUDA thread warp. 
- */
-template <
-    typename    T,                      ///< Data type being scanned
-    int         LOGICAL_WARP_THREADS,   ///< Number of threads per logical warp
-    int         PTX_ARCH>               ///< The PTX compute capability for which to specialize this collective
-struct WarpScanSmem
-{
-    /******************************************************************************
-     * Constants and type definitions
-     ******************************************************************************/
-
-    enum
-    {
-        /// Whether the logical warp size and the PTX warp size coincide
-        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
-
-        /// Whether the logical warp size is a power-of-two
-        IS_POW_OF_TWO = PowerOfTwo<LOGICAL_WARP_THREADS>::VALUE,
-
-        /// The number of warp scan steps
-        STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,
-
-        /// The number of threads in half a warp
-        HALF_WARP_THREADS = 1 << (STEPS - 1),
-
-        /// The number of shared memory elements per warp
-        WARP_SMEM_ELEMENTS = LOGICAL_WARP_THREADS + HALF_WARP_THREADS,
-    };
-
-    /// Storage cell type (workaround for SM1x compiler bugs with custom-ops like Max() on signed chars)
-    typedef typename If<((Equals<T, char>::VALUE || Equals<T, signed char>::VALUE) && (PTX_ARCH < 200)), int, T>::Type CellT;
-
-    /// Shared memory storage layout type (1.5 warps-worth of elements for each warp)
-    typedef CellT _TempStorage[WARP_SMEM_ELEMENTS];
-
-    // Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    _TempStorage    &temp_storage;
-    unsigned int    lane_id;
-    unsigned int    member_mask;
-
-
-    /******************************************************************************
-     * Construction
-     ******************************************************************************/
-
-    /// Constructor
-    __device__ __forceinline__ WarpScanSmem(
-        TempStorage     &temp_storage)
-    :
-        temp_storage(temp_storage.Alias()),
-
-        lane_id(IS_ARCH_WARP ?
-            LaneId() :
-            LaneId() % LOGICAL_WARP_THREADS),
-
-        member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP || !IS_POW_OF_TWO ) ?
- 0 : // arch-width and non-power-of-two subwarps cannot be tiled with the arch-warp - ((LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS))) - {} - - - /****************************************************************************** - * Utility methods - ******************************************************************************/ - - /// Basic inclusive scan iteration (template unrolled, inductive-case specialization) - template < - bool HAS_IDENTITY, - int STEP, - typename ScanOp> - __device__ __forceinline__ void ScanStep( - T &partial, - ScanOp scan_op, - Int2Type /*step*/) - { - const int OFFSET = 1 << STEP; - - // Share partial into buffer - ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) partial); - - WARP_SYNC(member_mask); - - // Update partial if addend is in range - if (HAS_IDENTITY || (lane_id >= OFFSET)) - { - T addend = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - OFFSET]); - partial = scan_op(addend, partial); - } - WARP_SYNC(member_mask); - - ScanStep(partial, scan_op, Int2Type()); - } - - - /// Basic inclusive scan iteration(template unrolled, base-case specialization) - template < - bool HAS_IDENTITY, - typename ScanOp> - __device__ __forceinline__ void ScanStep( - T &/*partial*/, - ScanOp /*scan_op*/, - Int2Type /*step*/) - {} - - - /// Inclusive prefix scan (specialized for summation across primitive types) - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item. - T &output, ///< [out] Calling thread's output item. May be aliased with \p input. - Sum scan_op, ///< [in] Binary scan operator - Int2Type /*is_primitive*/) ///< [in] Marker type indicating whether T is primitive type - { - T identity = 0; - ThreadStore(&temp_storage[lane_id], (CellT) identity); - - WARP_SYNC(member_mask); - - // Iterate scan steps - output = input; - ScanStep(output, scan_op, Int2Type<0>()); - } - - - /// Inclusive prefix scan - template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item. - T &output, ///< [out] Calling thread's output item. May be aliased with \p input. - ScanOp scan_op, ///< [in] Binary scan operator - Int2Type /*is_primitive*/) ///< [in] Marker type indicating whether T is primitive type - { - // Iterate scan steps - output = input; - ScanStep(output, scan_op, Int2Type<0>()); - } - - - /****************************************************************************** - * Interface - ******************************************************************************/ - - //--------------------------------------------------------------------- - // Broadcast - //--------------------------------------------------------------------- - - /// Broadcast - __device__ __forceinline__ T Broadcast( - T input, ///< [in] The value to broadcast - unsigned int src_lane) ///< [in] Which warp lane is to do the broadcasting - { - if (lane_id == src_lane) - { - ThreadStore(temp_storage, (CellT) input); - } - - WARP_SYNC(member_mask); - - return (T)ThreadLoad(temp_storage); - } - - - //--------------------------------------------------------------------- - // Inclusive operations - //--------------------------------------------------------------------- - - /// Inclusive scan - template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item. - T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. 
- ScanOp scan_op) ///< [in] Binary scan operator - { - InclusiveScan(input, inclusive_output, scan_op, Int2Type::PRIMITIVE>()); - } - - - /// Inclusive scan with aggregate - template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item. - T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. - ScanOp scan_op, ///< [in] Binary scan operator - T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. - { - InclusiveScan(input, inclusive_output, scan_op); - - // Retrieve aggregate - ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive_output); - - WARP_SYNC(member_mask); - - warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); - - WARP_SYNC(member_mask); - } - - - //--------------------------------------------------------------------- - // Get exclusive from inclusive - //--------------------------------------------------------------------- - - /// Update inclusive and exclusive using input and inclusive - template - __device__ __forceinline__ void Update( - T /*input*/, ///< [in] - T &inclusive, ///< [in, out] - T &exclusive, ///< [out] - ScanOpT /*scan_op*/, ///< [in] - IsIntegerT /*is_integer*/) ///< [in] - { - // initial value unknown - ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); - - WARP_SYNC(member_mask); - - exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 1]); - } - - /// Update inclusive and exclusive using input and inclusive (specialized for summation of integer types) - __device__ __forceinline__ void Update( - T input, - T &inclusive, - T &exclusive, - cub::Sum /*scan_op*/, - Int2Type /*is_integer*/) - { - // initial value presumed 0 - exclusive = inclusive - input; - } - - /// Update inclusive and exclusive using initial value using input, inclusive, and initial value - template - __device__ __forceinline__ void Update ( - T /*input*/, - T &inclusive, - T &exclusive, - ScanOpT scan_op, - T initial_value, - IsIntegerT /*is_integer*/) - { - inclusive = scan_op(initial_value, inclusive); - ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); - - WARP_SYNC(member_mask); - - exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 1]); - if (lane_id == 0) - exclusive = initial_value; - } - - /// Update inclusive and exclusive using initial value using input and inclusive (specialized for summation of integer types) - __device__ __forceinline__ void Update ( - T input, - T &inclusive, - T &exclusive, - cub::Sum scan_op, - T initial_value, - Int2Type /*is_integer*/) - { - inclusive = scan_op(initial_value, inclusive); - exclusive = inclusive - input; - } - - - /// Update inclusive, exclusive, and warp aggregate using input and inclusive - template - __device__ __forceinline__ void Update ( - T /*input*/, - T &inclusive, - T &exclusive, - T &warp_aggregate, - ScanOpT /*scan_op*/, - IsIntegerT /*is_integer*/) - { - // Initial value presumed to be unknown or identity (either way our padding is correct) - ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); - - WARP_SYNC(member_mask); - - exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 1]); - warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); - } - - /// Update inclusive, exclusive, and warp aggregate using input and inclusive (specialized for summation of integer types) - __device__ __forceinline__ void Update ( - T input, - T &inclusive, - T 
&exclusive, - T &warp_aggregate, - cub::Sum /*scan_o*/, - Int2Type /*is_integer*/) - { - // Initial value presumed to be unknown or identity (either way our padding is correct) - ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); - - WARP_SYNC(member_mask); - - warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); - exclusive = inclusive - input; - } - - /// Update inclusive, exclusive, and warp aggregate using input, inclusive, and initial value - template - __device__ __forceinline__ void Update ( - T /*input*/, - T &inclusive, - T &exclusive, - T &warp_aggregate, - ScanOpT scan_op, - T initial_value, - IsIntegerT /*is_integer*/) - { - // Broadcast warp aggregate - ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); - - WARP_SYNC(member_mask); - - warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); - - WARP_SYNC(member_mask); - - // Update inclusive with initial value - inclusive = scan_op(initial_value, inclusive); - - // Get exclusive from exclusive - ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id - 1], (CellT) inclusive); - - WARP_SYNC(member_mask); - - exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 2]); - - if (lane_id == 0) - exclusive = initial_value; - } - - -}; - - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/warp/warp_reduce.cuh b/SRC/cub/warp/warp_reduce.cuh deleted file mode 100644 index 189896b0..00000000 --- a/SRC/cub/warp/warp_reduce.cuh +++ /dev/null @@ -1,612 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * The cub::WarpReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread warp. 
- */ - -#pragma once - -#include "specializations/warp_reduce_shfl.cuh" -#include "specializations/warp_reduce_smem.cuh" -#include "../thread/thread_operators.cuh" -#include "../util_arch.cuh" -#include "../util_type.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - - -/** - * \addtogroup WarpModule - * @{ - */ - -/** - * \brief The WarpReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread warp. ![](warp_reduce_logo.png) - * - * \tparam T The reduction input/output element type - * \tparam LOGICAL_WARP_THREADS [optional] The number of threads per "logical" warp (may be less than the number of hardware warp threads). Default is the warp size of the targeted CUDA compute-capability (e.g., 32 threads for SM20). - * \tparam PTX_ARCH [optional] \ptxversion - * - * \par Overview - * - A reduction (or fold) - * uses a binary combining operator to compute a single aggregate from a list of input elements. - * - Supports "logical" warps smaller than the physical warp size (e.g., logical warps of 8 threads) - * - The number of entrant threads must be an multiple of \p LOGICAL_WARP_THREADS - * - * \par Performance Considerations - * - Uses special instructions when applicable (e.g., warp \p SHFL instructions) - * - Uses synchronization-free communication between warp lanes when applicable - * - Incurs zero bank conflicts for most types - * - Computation is slightly more efficient (i.e., having lower instruction overhead) for: - * - Summation (vs. generic reduction) - * - The architecture's warp size is a whole multiple of \p LOGICAL_WARP_THREADS - * - * \par Simple Examples - * \warpcollective{WarpReduce} - * \par - * The code snippet below illustrates four concurrent warp sum reductions within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpReduce for type int - * typedef cub::WarpReduce WarpReduce; - * - * // Allocate WarpReduce shared memory for 4 warps - * __shared__ typename WarpReduce::TempStorage temp_storage[4]; - * - * // Obtain one input item per thread - * int thread_data = ... - * - * // Return the warp-wide sums to each lane0 (threads 0, 32, 64, and 96) - * int warp_id = threadIdx.x / 32; - * int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is {0, 1, 2, 3, ..., 127}. - * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will \p 496, \p 1520, - * \p 2544, and \p 3568, respectively (and is undefined in other threads). - * - * \par - * The code snippet below illustrates a single warp sum reduction within a block of - * 128 threads. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpReduce for type int - * typedef cub::WarpReduce WarpReduce; - * - * // Allocate WarpReduce shared memory for one warp - * __shared__ typename WarpReduce::TempStorage temp_storage; - * ... - * - * // Only the first warp performs a reduction - * if (threadIdx.x < 32) - * { - * // Obtain one input item per thread - * int thread_data = ... 
- * - * // Return the warp-wide sum to lane0 - * int aggregate = WarpReduce(temp_storage).Sum(thread_data); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the warp of threads is {0, 1, 2, 3, ..., 31}. - * The corresponding output \p aggregate in thread0 will be \p 496 (and is undefined in other threads). - * - */ -template < - typename T, - int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, - int PTX_ARCH = CUB_PTX_ARCH> -class WarpReduce -{ -private: - - /****************************************************************************** - * Constants and type definitions - ******************************************************************************/ - - enum - { - /// Whether the logical warp size and the PTX warp size coincide - IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), - - /// Whether the logical warp size is a power-of-two - IS_POW_OF_TWO = PowerOfTwo::VALUE, - }; - -public: - - #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document - - /// Internal specialization. Use SHFL-based reduction if (architecture is >= SM30) and (LOGICAL_WARP_THREADS is a power-of-two) - typedef typename If<(PTX_ARCH >= 300) && (IS_POW_OF_TWO), - WarpReduceShfl, - WarpReduceSmem >::Type InternalWarpReduce; - - #endif // DOXYGEN_SHOULD_SKIP_THIS - - -private: - - /// Shared memory storage layout type for WarpReduce - typedef typename InternalWarpReduce::TempStorage _TempStorage; - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - /// Shared storage reference - _TempStorage &temp_storage; - - - /****************************************************************************** - * Utility methods - ******************************************************************************/ - -public: - - /// \smemstorage{WarpReduce} - struct TempStorage : Uninitialized<_TempStorage> {}; - - - /******************************************************************//** - * \name Collective constructors - *********************************************************************/ - //@{ - - - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. Logical warp and lane identifiers are constructed from threadIdx.x. - */ - __device__ __forceinline__ WarpReduce( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()) - {} - - - //@} end member group - /******************************************************************//** - * \name Summation reductions - *********************************************************************/ - //@{ - - - /** - * \brief Computes a warp-wide sum in the calling warp. The output is valid in warp lane0. - * - * \smemreuse - * - * \par Snippet - * The code snippet below illustrates four concurrent warp sum reductions within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpReduce for type int - * typedef cub::WarpReduce WarpReduce; - * - * // Allocate WarpReduce shared memory for 4 warps - * __shared__ typename WarpReduce::TempStorage temp_storage[4]; - * - * // Obtain one input item per thread - * int thread_data = ... 
- * - * // Return the warp-wide sums to each lane0 - * int warp_id = threadIdx.x / 32; - * int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is {0, 1, 2, 3, ..., 127}. - * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will \p 496, \p 1520, - * \p 2544, and \p 3568, respectively (and is undefined in other threads). - * - */ - __device__ __forceinline__ T Sum( - T input) ///< [in] Calling thread's input - { - return InternalWarpReduce(temp_storage).template Reduce(input, LOGICAL_WARP_THREADS, cub::Sum()); - } - - /** - * \brief Computes a partially-full warp-wide sum in the calling warp. The output is valid in warp lane0. - * - * All threads across the calling warp must agree on the same value for \p valid_items. Otherwise the result is undefined. - * - * \smemreuse - * - * \par Snippet - * The code snippet below illustrates a sum reduction within a single, partially-full - * block of 32 threads (one warp). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(int *d_data, int valid_items) - * { - * // Specialize WarpReduce for type int - * typedef cub::WarpReduce WarpReduce; - * - * // Allocate WarpReduce shared memory for one warp - * __shared__ typename WarpReduce::TempStorage temp_storage; - * - * // Obtain one input item per thread if in range - * int thread_data; - * if (threadIdx.x < valid_items) - * thread_data = d_data[threadIdx.x]; - * - * // Return the warp-wide sums to each lane0 - * int aggregate = WarpReduce(temp_storage).Sum( - * thread_data, valid_items); - * - * \endcode - * \par - * Suppose the input \p d_data is {0, 1, 2, 3, 4, ... and \p valid_items - * is \p 4. The corresponding output \p aggregate in thread0 is \p 6 (and is - * undefined in other threads). - * - */ - __device__ __forceinline__ T Sum( - T input, ///< [in] Calling thread's input - int valid_items) ///< [in] Total number of valid items in the calling thread's logical warp (may be less than \p LOGICAL_WARP_THREADS) - { - // Determine if we don't need bounds checking - return InternalWarpReduce(temp_storage).template Reduce(input, valid_items, cub::Sum()); - } - - - /** - * \brief Computes a segmented sum in the calling warp where segments are defined by head-flags. The sum of each segment is returned to the first lane in that segment (which always includes lane0). - * - * \smemreuse - * - * \par Snippet - * The code snippet below illustrates a head-segmented warp sum - * reduction within a block of 32 threads (one warp). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpReduce for type int - * typedef cub::WarpReduce WarpReduce; - * - * // Allocate WarpReduce shared memory for one warp - * __shared__ typename WarpReduce::TempStorage temp_storage; - * - * // Obtain one input item and flag per thread - * int thread_data = ... - * int head_flag = ... - * - * // Return the warp-wide sums to each lane0 - * int aggregate = WarpReduce(temp_storage).HeadSegmentedSum( - * thread_data, head_flag); - * - * \endcode - * \par - * Suppose the set of input \p thread_data and \p head_flag across the block of threads - * is {0, 1, 2, 3, ..., 31 and is {1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0, - * respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be - * \p 6, \p 22, \p 38, etc. (and is undefined in other threads). 
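// [Editor's sketch, not part of the patch] Host-side restatement of the
// head-segmented-sum semantics documented above: a head flag of 1 opens a new
// segment, and each segment's sum is delivered to that segment's first lane
// only (the value is undefined in the other lanes). All names are
// illustrative; this mirrors the 32-thread example in the comment above.
#include <cstdio>

int main()
{
    int data[32], flag[32], result[32];
    for (int i = 0; i < 32; ++i)
    {
        data[i] = i;
        flag[i] = (i % 4 == 0);   // heads at lanes 0, 4, 8, ...
        result[i] = -1;           // stands in for "undefined"
    }
    for (int head = 0; head < 32; )
    {
        int tail = head + 1;
        while (tail < 32 && !flag[tail]) ++tail;   // advance to the next head
        int sum = 0;
        for (int i = head; i < tail; ++i) sum += data[i];
        result[head] = sum;       // only the segment head receives the sum
        head = tail;
    }
    printf("%d %d %d\n", result[0], result[4], result[8]);   // prints: 6 22 38
    return 0;
}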
- * - * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) - * - */ - template < - typename FlagT> - __device__ __forceinline__ T HeadSegmentedSum( - T input, ///< [in] Calling thread's input - FlagT head_flag) ///< [in] Head flag denoting whether or not \p input is the start of a new segment - { - return HeadSegmentedReduce(input, head_flag, cub::Sum()); - } - - - /** - * \brief Computes a segmented sum in the calling warp where segments are defined by tail-flags. The sum of each segment is returned to the first lane in that segment (which always includes lane0). - * - * \smemreuse - * - * \par Snippet - * The code snippet below illustrates a tail-segmented warp sum - * reduction within a block of 32 threads (one warp). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpReduce for type int - * typedef cub::WarpReduce WarpReduce; - * - * // Allocate WarpReduce shared memory for one warp - * __shared__ typename WarpReduce::TempStorage temp_storage; - * - * // Obtain one input item and flag per thread - * int thread_data = ... - * int tail_flag = ... - * - * // Return the warp-wide sums to each lane0 - * int aggregate = WarpReduce(temp_storage).TailSegmentedSum( - * thread_data, tail_flag); - * - * \endcode - * \par - * Suppose the set of input \p thread_data and \p tail_flag across the block of threads - * is {0, 1, 2, 3, ..., 31 and is {0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1, - * respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be - * \p 6, \p 22, \p 38, etc. (and is undefined in other threads). - * - * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) - */ - template < - typename FlagT> - __device__ __forceinline__ T TailSegmentedSum( - T input, ///< [in] Calling thread's input - FlagT tail_flag) ///< [in] Head flag denoting whether or not \p input is the start of a new segment - { - return TailSegmentedReduce(input, tail_flag, cub::Sum()); - } - - - - //@} end member group - /******************************************************************//** - * \name Generic reductions - *********************************************************************/ - //@{ - - /** - * \brief Computes a warp-wide reduction in the calling warp using the specified binary reduction functor. The output is valid in warp lane0. - * - * Supports non-commutative reduction operators - * - * \smemreuse - * - * \par Snippet - * The code snippet below illustrates four concurrent warp max reductions within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpReduce for type int - * typedef cub::WarpReduce WarpReduce; - * - * // Allocate WarpReduce shared memory for 4 warps - * __shared__ typename WarpReduce::TempStorage temp_storage[4]; - * - * // Obtain one input item per thread - * int thread_data = ... - * - * // Return the warp-wide reductions to each lane0 - * int warp_id = threadIdx.x / 32; - * int aggregate = WarpReduce(temp_storage[warp_id]).Reduce( - * thread_data, cub::Max()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is {0, 1, 2, 3, ..., 127}. - * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will \p 31, \p 63, - * \p 95, and \p 127, respectively (and is undefined in other threads). 
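// [Editor's sketch, not part of the patch] The snippets in these comments have
// lost their angle-bracket contents to text extraction (e.g. "#include" with
// no header name, "cub::WarpReduce" with no type argument). A complete,
// compilable version of the max-reduction example above would read as follows,
// assuming CUB is on the include path as <cub/cub.cuh> and a 128-thread block;
// the kernel name is illustrative.
#include <cub/cub.cuh>

__global__ void WarpMaxExample(const int *d_in, int *d_out)
{
    // Specialize WarpReduce for type int
    typedef cub::WarpReduce<int> WarpReduce;

    // Allocate WarpReduce shared memory for 4 warps (128 threads)
    __shared__ typename WarpReduce::TempStorage temp_storage[4];

    // Obtain one input item per thread
    int thread_data = d_in[threadIdx.x];

    // Return the warp-wide reductions to each lane0
    int warp_id = threadIdx.x / 32;
    int aggregate = WarpReduce(temp_storage[warp_id]).Reduce(thread_data, cub::Max());

    if (threadIdx.x % 32 == 0)
        d_out[warp_id] = aggregate;   // aggregate is only defined in lane 0
}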
- * - * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ T Reduce( - T input, ///< [in] Calling thread's input - ReductionOp reduction_op) ///< [in] Binary reduction operator - { - return InternalWarpReduce(temp_storage).template Reduce(input, LOGICAL_WARP_THREADS, reduction_op); - } - - /** - * \brief Computes a partially-full warp-wide reduction in the calling warp using the specified binary reduction functor. The output is valid in warp lane0. - * - * All threads across the calling warp must agree on the same value for \p valid_items. Otherwise the result is undefined. - * - * Supports non-commutative reduction operators - * - * \smemreuse - * - * \par Snippet - * The code snippet below illustrates a max reduction within a single, partially-full - * block of 32 threads (one warp). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(int *d_data, int valid_items) - * { - * // Specialize WarpReduce for type int - * typedef cub::WarpReduce WarpReduce; - * - * // Allocate WarpReduce shared memory for one warp - * __shared__ typename WarpReduce::TempStorage temp_storage; - * - * // Obtain one input item per thread if in range - * int thread_data; - * if (threadIdx.x < valid_items) - * thread_data = d_data[threadIdx.x]; - * - * // Return the warp-wide reductions to each lane0 - * int aggregate = WarpReduce(temp_storage).Reduce( - * thread_data, cub::Max(), valid_items); - * - * \endcode - * \par - * Suppose the input \p d_data is {0, 1, 2, 3, 4, ... and \p valid_items - * is \p 4. The corresponding output \p aggregate in thread0 is \p 3 (and is - * undefined in other threads). - * - * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ T Reduce( - T input, ///< [in] Calling thread's input - ReductionOp reduction_op, ///< [in] Binary reduction operator - int valid_items) ///< [in] Total number of valid items in the calling thread's logical warp (may be less than \p LOGICAL_WARP_THREADS) - { - return InternalWarpReduce(temp_storage).template Reduce(input, valid_items, reduction_op); - } - - - /** - * \brief Computes a segmented reduction in the calling warp where segments are defined by head-flags. The reduction of each segment is returned to the first lane in that segment (which always includes lane0). - * - * Supports non-commutative reduction operators - * - * \smemreuse - * - * \par Snippet - * The code snippet below illustrates a head-segmented warp max - * reduction within a block of 32 threads (one warp). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpReduce for type int - * typedef cub::WarpReduce WarpReduce; - * - * // Allocate WarpReduce shared memory for one warp - * __shared__ typename WarpReduce::TempStorage temp_storage; - * - * // Obtain one input item and flag per thread - * int thread_data = ... - * int head_flag = ... - * - * // Return the warp-wide reductions to each lane0 - * int aggregate = WarpReduce(temp_storage).HeadSegmentedReduce( - * thread_data, head_flag, cub::Max()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data and \p head_flag across the block of threads - * is {0, 1, 2, 3, ..., 31 and is {1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0, - * respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be - * \p 3, \p 7, \p 11, etc. 
(and is undefined in other threads). - * - * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) - */ - template < - typename ReductionOp, - typename FlagT> - __device__ __forceinline__ T HeadSegmentedReduce( - T input, ///< [in] Calling thread's input - FlagT head_flag, ///< [in] Head flag denoting whether or not \p input is the start of a new segment - ReductionOp reduction_op) ///< [in] Reduction operator - { - return InternalWarpReduce(temp_storage).template SegmentedReduce(input, head_flag, reduction_op); - } - - - /** - * \brief Computes a segmented reduction in the calling warp where segments are defined by tail-flags. The reduction of each segment is returned to the first lane in that segment (which always includes lane0). - * - * Supports non-commutative reduction operators - * - * \smemreuse - * - * \par Snippet - * The code snippet below illustrates a tail-segmented warp max - * reduction within a block of 32 threads (one warp). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpReduce for type int - * typedef cub::WarpReduce WarpReduce; - * - * // Allocate WarpReduce shared memory for one warp - * __shared__ typename WarpReduce::TempStorage temp_storage; - * - * // Obtain one input item and flag per thread - * int thread_data = ... - * int tail_flag = ... - * - * // Return the warp-wide reductions to each lane0 - * int aggregate = WarpReduce(temp_storage).TailSegmentedReduce( - * thread_data, tail_flag, cub::Max()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data and \p tail_flag across the block of threads - * is {0, 1, 2, 3, ..., 31 and is {0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1, - * respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be - * \p 3, \p 7, \p 11, etc. (and is undefined in other threads). - * - * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) - */ - template < - typename ReductionOp, - typename FlagT> - __device__ __forceinline__ T TailSegmentedReduce( - T input, ///< [in] Calling thread's input - FlagT tail_flag, ///< [in] Tail flag denoting whether or not \p input is the end of the current segment - ReductionOp reduction_op) ///< [in] Reduction operator - { - return InternalWarpReduce(temp_storage).template SegmentedReduce(input, tail_flag, reduction_op); - } - - - - //@} end member group -}; - -/** @} */ // end group WarpModule - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/SRC/cub/warp/warp_scan.cuh b/SRC/cub/warp/warp_scan.cuh deleted file mode 100644 index c7af0d34..00000000 --- a/SRC/cub/warp/warp_scan.cuh +++ /dev/null @@ -1,936 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ - -/** - * \file - * The cub::WarpScan class provides [collective](index.html#sec0) methods for computing a parallel prefix scan of items partitioned across a CUDA thread warp. - */ - -#pragma once - -#include "specializations/warp_scan_shfl.cuh" -#include "specializations/warp_scan_smem.cuh" -#include "../thread/thread_operators.cuh" -#include "../util_arch.cuh" -#include "../util_type.cuh" -#include "../util_namespace.cuh" - -/// Optional outer namespace(s) -CUB_NS_PREFIX - -/// CUB namespace -namespace cub { - -/** - * \addtogroup WarpModule - * @{ - */ - -/** - * \brief The WarpScan class provides [collective](index.html#sec0) methods for computing a parallel prefix scan of items partitioned across a CUDA thread warp. ![](warp_scan_logo.png) - * - * \tparam T The scan input/output element type - * \tparam LOGICAL_WARP_THREADS [optional] The number of threads per "logical" warp (may be less than the number of hardware warp threads). Default is the warp size associated with the CUDA Compute Capability targeted by the compiler (e.g., 32 threads for SM20). - * \tparam PTX_ARCH [optional] \ptxversion - * - * \par Overview - * - Given a list of input elements and a binary reduction operator, a [prefix scan](http://en.wikipedia.org/wiki/Prefix_sum) - * produces an output list where each element is computed to be the reduction - * of the elements occurring earlier in the input list. Prefix sum - * connotes a prefix scan with the addition operator. The term \em inclusive indicates - * that the ith output reduction incorporates the ith input. - * The term \em exclusive indicates the ith input is not incorporated into - * the ith output reduction. - * - Supports non-commutative scan operators - * - Supports "logical" warps smaller than the physical warp size (e.g., a logical warp of 8 threads) - * - The number of entrant threads must be an multiple of \p LOGICAL_WARP_THREADS - * - * \par Performance Considerations - * - Uses special instructions when applicable (e.g., warp \p SHFL) - * - Uses synchronization-free communication between warp lanes when applicable - * - Incurs zero bank conflicts for most types - * - Computation is slightly more efficient (i.e., having lower instruction overhead) for: - * - Summation (vs. 
generic scan) - * - The architecture's warp size is a whole multiple of \p LOGICAL_WARP_THREADS - * - * \par Simple Examples - * \warpcollective{WarpScan} - * \par - * The code snippet below illustrates four concurrent warp prefix sums within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpScan for type int - * typedef cub::WarpScan WarpScan; - * - * // Allocate WarpScan shared memory for 4 warps - * __shared__ typename WarpScan::TempStorage temp_storage[4]; - * - * // Obtain one input item per thread - * int thread_data = ... - * - * // Compute warp-wide prefix sums - * int warp_id = threadIdx.x / 32; - * WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}. - * The corresponding output \p thread_data in each of the four warps of threads will be - * 0, 1, 2, 3, ..., 31}. - * - * \par - * The code snippet below illustrates a single warp prefix sum within a block of - * 128 threads. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpScan for type int - * typedef cub::WarpScan WarpScan; - * - * // Allocate WarpScan shared memory for one warp - * __shared__ typename WarpScan::TempStorage temp_storage; - * ... - * - * // Only the first warp performs a prefix sum - * if (threadIdx.x < 32) - * { - * // Obtain one input item per thread - * int thread_data = ... - * - * // Compute warp-wide prefix sums - * WarpScan(temp_storage).ExclusiveSum(thread_data, thread_data); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the warp of threads is {1, 1, 1, 1, ...}. - * The corresponding output \p thread_data will be {0, 1, 2, 3, ..., 31}. - * - */ -template < - typename T, - int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, - int PTX_ARCH = CUB_PTX_ARCH> -class WarpScan -{ -private: - - /****************************************************************************** - * Constants and type definitions - ******************************************************************************/ - - enum - { - /// Whether the logical warp size and the PTX warp size coincide - IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), - - /// Whether the logical warp size is a power-of-two - IS_POW_OF_TWO = ((LOGICAL_WARP_THREADS & (LOGICAL_WARP_THREADS - 1)) == 0), - - /// Whether the data type is an integer (which has fully-associative addition) - IS_INTEGER = ((Traits::CATEGORY == SIGNED_INTEGER) || (Traits::CATEGORY == UNSIGNED_INTEGER)) - }; - - /// Internal specialization. 
Use SHFL-based scan if (architecture is >= SM30) and (LOGICAL_WARP_THREADS is a power-of-two) - typedef typename If<(PTX_ARCH >= 300) && (IS_POW_OF_TWO), - WarpScanShfl, - WarpScanSmem >::Type InternalWarpScan; - - /// Shared memory storage layout type for WarpScan - typedef typename InternalWarpScan::TempStorage _TempStorage; - - - /****************************************************************************** - * Thread fields - ******************************************************************************/ - - /// Shared storage reference - _TempStorage &temp_storage; - unsigned int lane_id; - - - - /****************************************************************************** - * Public types - ******************************************************************************/ - -public: - - /// \smemstorage{WarpScan} - struct TempStorage : Uninitialized<_TempStorage> {}; - - - /******************************************************************//** - * \name Collective constructors - *********************************************************************/ - //@{ - - /** - * \brief Collective constructor using the specified memory allocation as temporary storage. Logical warp and lane identifiers are constructed from threadIdx.x. - */ - __device__ __forceinline__ WarpScan( - TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage - : - temp_storage(temp_storage.Alias()), - lane_id(IS_ARCH_WARP ? - LaneId() : - LaneId() % LOGICAL_WARP_THREADS) - {} - - - //@} end member group - /******************************************************************//** - * \name Inclusive prefix sums - *********************************************************************/ - //@{ - - - /** - * \brief Computes an inclusive prefix sum across the calling warp. - * - * \par - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates four concurrent warp-wide inclusive prefix sums within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpScan for type int - * typedef cub::WarpScan WarpScan; - * - * // Allocate WarpScan shared memory for 4 warps - * __shared__ typename WarpScan::TempStorage temp_storage[4]; - * - * // Obtain one input item per thread - * int thread_data = ... - * - * // Compute inclusive warp-wide prefix sums - * int warp_id = threadIdx.x / 32; - * WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}. - * The corresponding output \p thread_data in each of the four warps of threads will be - * 1, 2, 3, ..., 32}. - */ - __device__ __forceinline__ void InclusiveSum( - T input, ///< [in] Calling thread's input item. - T &inclusive_output) ///< [out] Calling thread's output item. May be aliased with \p input. - { - InclusiveScan(input, inclusive_output, cub::Sum()); - } - - - /** - * \brief Computes an inclusive prefix sum across the calling warp. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. - * - * \par - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates four concurrent warp-wide inclusive prefix sums within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) 
- * { - * // Specialize WarpScan for type int - * typedef cub::WarpScan WarpScan; - * - * // Allocate WarpScan shared memory for 4 warps - * __shared__ typename WarpScan::TempStorage temp_storage[4]; - * - * // Obtain one input item per thread - * int thread_data = ... - * - * // Compute inclusive warp-wide prefix sums - * int warp_aggregate; - * int warp_id = threadIdx.x / 32; - * WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data, warp_aggregate); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}. - * The corresponding output \p thread_data in each of the four warps of threads will be - * 1, 2, 3, ..., 32}. Furthermore, \p warp_aggregate for all threads in all warps will be \p 32. - */ - __device__ __forceinline__ void InclusiveSum( - T input, ///< [in] Calling thread's input item. - T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. - T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. - { - InclusiveScan(input, inclusive_output, cub::Sum(), warp_aggregate); - } - - - //@} end member group - /******************************************************************//** - * \name Exclusive prefix sums - *********************************************************************/ - //@{ - - - /** - * \brief Computes an exclusive prefix sum across the calling warp. The value of 0 is applied as the initial value, and is assigned to \p exclusive_output in thread0. - * - * \par - * - \identityzero - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates four concurrent warp-wide exclusive prefix sums within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpScan for type int - * typedef cub::WarpScan WarpScan; - * - * // Allocate WarpScan shared memory for 4 warps - * __shared__ typename WarpScan::TempStorage temp_storage[4]; - * - * // Obtain one input item per thread - * int thread_data = ... - * - * // Compute exclusive warp-wide prefix sums - * int warp_id = threadIdx.x / 32; - * WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}. - * The corresponding output \p thread_data in each of the four warps of threads will be - * 0, 1, 2, ..., 31}. - * - */ - __device__ __forceinline__ void ExclusiveSum( - T input, ///< [in] Calling thread's input item. - T &exclusive_output) ///< [out] Calling thread's output item. May be aliased with \p input. - { - T initial_value = 0; - ExclusiveScan(input, exclusive_output, initial_value, cub::Sum()); - } - - - /** - * \brief Computes an exclusive prefix sum across the calling warp. The value of 0 is applied as the initial value, and is assigned to \p exclusive_output in thread0. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. - * - * \par - * - \identityzero - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates four concurrent warp-wide exclusive prefix sums within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) 
- * { - * // Specialize WarpScan for type int - * typedef cub::WarpScan WarpScan; - * - * // Allocate WarpScan shared memory for 4 warps - * __shared__ typename WarpScan::TempStorage temp_storage[4]; - * - * // Obtain one input item per thread - * int thread_data = ... - * - * // Compute exclusive warp-wide prefix sums - * int warp_aggregate; - * int warp_id = threadIdx.x / 32; - * WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data, warp_aggregate); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}. - * The corresponding output \p thread_data in each of the four warps of threads will be - * 0, 1, 2, ..., 31}. Furthermore, \p warp_aggregate for all threads in all warps will be \p 32. - */ - __device__ __forceinline__ void ExclusiveSum( - T input, ///< [in] Calling thread's input item. - T &exclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. - T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. - { - T initial_value = 0; - ExclusiveScan(input, exclusive_output, initial_value, cub::Sum(), warp_aggregate); - } - - - //@} end member group - /******************************************************************//** - * \name Inclusive prefix scans - *********************************************************************/ - //@{ - - /** - * \brief Computes an inclusive prefix scan using the specified binary scan functor across the calling warp. - * - * \par - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpScan for type int - * typedef cub::WarpScan WarpScan; - * - * // Allocate WarpScan shared memory for 4 warps - * __shared__ typename WarpScan::TempStorage temp_storage[4]; - * - * // Obtain one input item per thread - * int thread_data = ... - * - * // Compute inclusive warp-wide prefix max scans - * int warp_id = threadIdx.x / 32; - * WarpScan(temp_storage[warp_id]).InclusiveScan(thread_data, thread_data, cub::Max()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. - * The corresponding output \p thread_data in the first warp would be - * 0, 0, 2, 2, ..., 30, 30, the output for the second warp would be 32, 32, 34, 34, ..., 62, 62, etc. - * - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item. - T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. - ScanOp scan_op) ///< [in] Binary scan operator - { - InternalWarpScan(temp_storage).InclusiveScan(input, inclusive_output, scan_op); - } - - - /** - * \brief Computes an inclusive prefix scan using the specified binary scan functor across the calling warp. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. - * - * \par - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) 
- * { - * // Specialize WarpScan for type int - * typedef cub::WarpScan WarpScan; - * - * // Allocate WarpScan shared memory for 4 warps - * __shared__ typename WarpScan::TempStorage temp_storage[4]; - * - * // Obtain one input item per thread - * int thread_data = ... - * - * // Compute inclusive warp-wide prefix max scans - * int warp_aggregate; - * int warp_id = threadIdx.x / 32; - * WarpScan(temp_storage[warp_id]).InclusiveScan( - * thread_data, thread_data, cub::Max(), warp_aggregate); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. - * The corresponding output \p thread_data in the first warp would be - * 0, 0, 2, 2, ..., 30, 30, the output for the second warp would be 32, 32, 34, 34, ..., 62, 62, etc. - * Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads - * in the second warp, etc. - * - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void InclusiveScan( - T input, ///< [in] Calling thread's input item. - T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. - ScanOp scan_op, ///< [in] Binary scan operator - T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. - { - InternalWarpScan(temp_storage).InclusiveScan(input, inclusive_output, scan_op, warp_aggregate); - } - - - //@} end member group - /******************************************************************//** - * \name Exclusive prefix scans - *********************************************************************/ - //@{ - - /** - * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp. Because no initial value is supplied, the \p output computed for warp-lane0 is undefined. - * - * \par - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpScan for type int - * typedef cub::WarpScan WarpScan; - * - * // Allocate WarpScan shared memory for 4 warps - * __shared__ typename WarpScan::TempStorage temp_storage[4]; - * - * // Obtain one input item per thread - * int thread_data = ... - * - * // Compute exclusive warp-wide prefix max scans - * int warp_id = threadIdx.x / 32; - * WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, cub::Max()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. - * The corresponding output \p thread_data in the first warp would be - * ?, 0, 0, 2, ..., 28, 30, the output for the second warp would be ?, 32, 32, 34, ..., 60, 62, etc. - * (The output \p thread_data in warp lane0 is undefined.) - * - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item. - T &exclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. 
- ScanOp scan_op) ///< [in] Binary scan operator - { - InternalWarpScan internal(temp_storage); - - T inclusive_output; - internal.InclusiveScan(input, inclusive_output, scan_op); - - internal.Update( - input, - inclusive_output, - exclusive_output, - scan_op, - Int2Type()); - } - - - /** - * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp. - * - * \par - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpScan for type int - * typedef cub::WarpScan WarpScan; - * - * // Allocate WarpScan shared memory for 4 warps - * __shared__ typename WarpScan::TempStorage temp_storage[4]; - * - * // Obtain one input item per thread - * int thread_data = ... - * - * // Compute exclusive warp-wide prefix max scans - * int warp_id = threadIdx.x / 32; - * WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. - * The corresponding output \p thread_data in the first warp would be - * INT_MIN, 0, 0, 2, ..., 28, 30, the output for the second warp would be 30, 32, 32, 34, ..., 60, 62, etc. - * - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item. - T &exclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. - T initial_value, ///< [in] Initial value to seed the exclusive scan - ScanOp scan_op) ///< [in] Binary scan operator - { - InternalWarpScan internal(temp_storage); - - T inclusive_output; - internal.InclusiveScan(input, inclusive_output, scan_op); - - internal.Update( - input, - inclusive_output, - exclusive_output, - scan_op, - initial_value, - Int2Type()); - } - - - /** - * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp. Because no initial value is supplied, the \p output computed for warp-lane0 is undefined. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. - * - * \par - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpScan for type int - * typedef cub::WarpScan WarpScan; - * - * // Allocate WarpScan shared memory for 4 warps - * __shared__ typename WarpScan::TempStorage temp_storage[4]; - * - * // Obtain one input item per thread - * int thread_data = ... - * - * // Compute exclusive warp-wide prefix max scans - * int warp_aggregate; - * int warp_id = threadIdx.x / 32; - * WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, cub::Max(), warp_aggregate); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. - * The corresponding output \p thread_data in the first warp would be - * ?, 0, 0, 2, ..., 28, 30, the output for the second warp would be ?, 32, 32, 34, ..., 60, 62, etc. 
- * (The output \p thread_data in warp lane0 is undefined.) Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads - * in the second warp, etc. - * - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item. - T &exclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. - ScanOp scan_op, ///< [in] Binary scan operator - T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. - { - InternalWarpScan internal(temp_storage); - - T inclusive_output; - internal.InclusiveScan(input, inclusive_output, scan_op); - - internal.Update( - input, - inclusive_output, - exclusive_output, - warp_aggregate, - scan_op, - Int2Type()); - } - - - /** - * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. - * - * \par - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpScan for type int - * typedef cub::WarpScan WarpScan; - * - * // Allocate WarpScan shared memory for 4 warps - * __shared__ typename WarpScan::TempStorage temp_storage[4]; - * - * // Obtain one input item per thread - * int thread_data = ... - * - * // Compute exclusive warp-wide prefix max scans - * int warp_aggregate; - * int warp_id = threadIdx.x / 32; - * WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), warp_aggregate); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. - * The corresponding output \p thread_data in the first warp would be - * INT_MIN, 0, 0, 2, ..., 28, 30, the output for the second warp would be 30, 32, 32, 34, ..., 60, 62, etc. - * Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads - * in the second warp, etc. - * - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void ExclusiveScan( - T input, ///< [in] Calling thread's input item. - T &exclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. - T initial_value, ///< [in] Initial value to seed the exclusive scan - ScanOp scan_op, ///< [in] Binary scan operator - T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. - { - InternalWarpScan internal(temp_storage); - - T inclusive_output; - internal.InclusiveScan(input, inclusive_output, scan_op); - - internal.Update( - input, - inclusive_output, - exclusive_output, - warp_aggregate, - scan_op, - initial_value, - Int2Type()); - } - - - //@} end member group - /******************************************************************//** - * \name Combination (inclusive & exclusive) prefix scans - *********************************************************************/ - //@{ - - - /** - * \brief Computes both inclusive and exclusive prefix scans using the specified binary scan functor across the calling warp. 
Because no initial value is supplied, the \p exclusive_output computed for warp-lane0 is undefined. - * - * \par - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpScan for type int - * typedef cub::WarpScan WarpScan; - * - * // Allocate WarpScan shared memory for 4 warps - * __shared__ typename WarpScan::TempStorage temp_storage[4]; - * - * // Obtain one input item per thread - * int thread_data = ... - * - * // Compute exclusive warp-wide prefix max scans - * int inclusive_partial, exclusive_partial; - * WarpScan(temp_storage[warp_id]).Scan(thread_data, inclusive_partial, exclusive_partial, cub::Max()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. - * The corresponding output \p inclusive_partial in the first warp would be - * 0, 0, 2, 2, ..., 30, 30, the output for the second warp would be 32, 32, 34, 34, ..., 62, 62, etc. - * The corresponding output \p exclusive_partial in the first warp would be - * ?, 0, 0, 2, ..., 28, 30, the output for the second warp would be ?, 32, 32, 34, ..., 60, 62, etc. - * (The output \p thread_data in warp lane0 is undefined.) - * - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void Scan( - T input, ///< [in] Calling thread's input item. - T &inclusive_output, ///< [out] Calling thread's inclusive-scan output item. - T &exclusive_output, ///< [out] Calling thread's exclusive-scan output item. - ScanOp scan_op) ///< [in] Binary scan operator - { - InternalWarpScan internal(temp_storage); - - internal.InclusiveScan(input, inclusive_output, scan_op); - - internal.Update( - input, - inclusive_output, - exclusive_output, - scan_op, - Int2Type()); - } - - - /** - * \brief Computes both inclusive and exclusive prefix scans using the specified binary scan functor across the calling warp. - * - * \par - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates four concurrent warp-wide prefix max scans within a block of - * 128 threads (one per each of the 32-thread warps). - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpScan for type int - * typedef cub::WarpScan WarpScan; - * - * // Allocate WarpScan shared memory for 4 warps - * __shared__ typename WarpScan::TempStorage temp_storage[4]; - * - * // Obtain one input item per thread - * int thread_data = ... - * - * // Compute inclusive warp-wide prefix max scans - * int warp_id = threadIdx.x / 32; - * int inclusive_partial, exclusive_partial; - * WarpScan(temp_storage[warp_id]).Scan(thread_data, inclusive_partial, exclusive_partial, INT_MIN, cub::Max()); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. - * The corresponding output \p inclusive_partial in the first warp would be - * 0, 0, 2, 2, ..., 30, 30, the output for the second warp would be 32, 32, 34, 34, ..., 62, 62, etc. - * The corresponding output \p exclusive_partial in the first warp would be - * INT_MIN, 0, 0, 2, ..., 28, 30, the output for the second warp would be 30, 32, 32, 34, ..., 60, 62, etc. 
- * - * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) - */ - template - __device__ __forceinline__ void Scan( - T input, ///< [in] Calling thread's input item. - T &inclusive_output, ///< [out] Calling thread's inclusive-scan output item. - T &exclusive_output, ///< [out] Calling thread's exclusive-scan output item. - T initial_value, ///< [in] Initial value to seed the exclusive scan - ScanOp scan_op) ///< [in] Binary scan operator - { - InternalWarpScan internal(temp_storage); - - internal.InclusiveScan(input, inclusive_output, scan_op); - - internal.Update( - input, - inclusive_output, - exclusive_output, - scan_op, - initial_value, - Int2Type()); - } - - - - //@} end member group - /******************************************************************//** - * \name Data exchange - *********************************************************************/ - //@{ - - /** - * \brief Broadcast the value \p input from warp-lanesrc_lane to all lanes in the warp - * - * \par - * - \smemreuse - * - * \par Snippet - * The code snippet below illustrates the warp-wide broadcasts of values from - * lanes0 in each of four warps to all other threads in those warps. - * \par - * \code - * #include - * - * __global__ void ExampleKernel(...) - * { - * // Specialize WarpScan for type int - * typedef cub::WarpScan WarpScan; - * - * // Allocate WarpScan shared memory for 4 warps - * __shared__ typename WarpScan::TempStorage temp_storage[4]; - * - * // Obtain one input item per thread - * int thread_data = ... - * - * // Broadcast from lane0 in each warp to all other threads in the warp - * int warp_id = threadIdx.x / 32; - * thread_data = WarpScan(temp_storage[warp_id]).Broadcast(thread_data, 0); - * - * \endcode - * \par - * Suppose the set of input \p thread_data across the block of threads is {0, 1, 2, 3, ..., 127}. - * The corresponding output \p thread_data will be - * {0, 0, ..., 0} in warp0, - * {32, 32, ..., 32} in warp1, - * {64, 64, ..., 64} in warp2, etc. - */ - __device__ __forceinline__ T Broadcast( - T input, ///< [in] The value to broadcast - unsigned int src_lane) ///< [in] Which warp lane is to do the broadcasting - { - return InternalWarpScan(temp_storage).Broadcast(input, src_lane); - } - - //@} end member group - -}; - -/** @} */ // end group WarpModule - -} // CUB namespace -CUB_NS_POSTFIX // Optional outer namespace(s) From 6ed88c3b0e7fa972716e78c67ae6b644f7c43666 Mon Sep 17 00:00:00 2001 From: Sherry Li Date: Thu, 2 Sep 2021 08:08:46 -0700 Subject: [PATCH 114/147] separate Factor and Solve for reuse. 
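This change lets a driver factor once and then call the solve phase
repeatedly: the 2D structures gathered from the 3D grid are now kept in
SOLVEstruct->A3d instead of being rebuilt and freed on every call. A
hypothetical driver fragment illustrating the intended reuse (the new
EXAMPLE/pddrive3d1.c is not reproduced here; b2 and the surrounding
declarations are assumed to be set up as in pddrive3d.c):

    /* First call: equilibrate, order, factor, and solve A*x = b. */
    pdgssvx3d(&options, &A, &ScalePermstruct, b, ldb, nrhs,
              &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info);

    /* Later call: reuse the L and U factors and the gathered 2D
       copies of {A, B} for a different right-hand side b2. */
    options.Fact = FACTORED;
    pdgssvx3d(&options, &A, &ScalePermstruct, b2, ldb, nrhs,
              &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info);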
--- EXAMPLE/Makefile | 12 +- EXAMPLE/pddrive.c | 5 +- EXAMPLE/pddrive3d.c | 10 +- SRC/dnrformat_loc3d.c | 359 +++++++++++++++------------- SRC/pdgssvx3d.c | 526 +++++++++++++++++++++--------------------- SRC/pdutil.c | 43 +++- SRC/superlu_ddefs.h | 11 +- SRC/superlu_defs.h | 1 + SRC/superlu_grid3d.c | 4 + SRC/supermatrix.h | 15 +- SRC/util.c | 2 +- 11 files changed, 535 insertions(+), 453 deletions(-) diff --git a/EXAMPLE/Makefile b/EXAMPLE/Makefile index ddc81a86..c52691fa 100644 --- a/EXAMPLE/Makefile +++ b/EXAMPLE/Makefile @@ -38,6 +38,7 @@ DEXM3 = pddrive3.o dcreate_matrix.o DEXM4 = pddrive4.o dcreate_matrix.o DEXM3D = pddrive3d.o dcreate_matrix.o dcreate_matrix3d.o +DEXM3D1 = pddrive3d1.o dcreate_matrix.o dcreate_matrix3d.o # dtrfAux.o dtreeFactorization.o treeFactorization.o pd3dcomm.o superlu_grid3d.o pdgstrf3d.o DEXMG = pddrive_ABglobal.o @@ -52,6 +53,7 @@ ZEXM2 = pzdrive2.o zcreate_matrix.o zcreate_matrix_perturbed.o ZEXM3 = pzdrive3.o zcreate_matrix.o ZEXM4 = pzdrive4.o zcreate_matrix.o ZEXM3D = pzdrive3d.o zcreate_matrix.o zcreate_matrix3d.o +ZEXM3D1 = pzdrive3d1.o zcreate_matrix.o zcreate_matrix3d.o ZEXMG = pzdrive_ABglobal.o ZEXMG1 = pzdrive1_ABglobal.o ZEXMG2 = pzdrive2_ABglobal.o @@ -59,10 +61,10 @@ ZEXMG3 = pzdrive3_ABglobal.o ZEXMG4 = pzdrive4_ABglobal.o -all: double complex16 +all: pddrive3d1 pddrive3d #double complex16 double: pddrive pddrive1 pddrive2 pddrive3 pddrive4 \ - pddrive3d \ + pddrive3d pddrive3d1 \ pddrive_ABglobal pddrive1_ABglobal pddrive2_ABglobal \ pddrive3_ABglobal pddrive4_ABglobal @@ -89,6 +91,9 @@ pddrive4: $(DEXM4) $(DSUPERLULIB) pddrive3d: $(DEXM3D) $(DSUPERLULIB) $(LOADER) $(LOADOPTS) $(DEXM3D) $(LIBS) -lm -o $@ +pddrive3d1: $(DEXM3D1) $(DSUPERLULIB) + $(LOADER) $(LOADOPTS) $(DEXM3D1) $(LIBS) -lm -o $@ + pddrive_ABglobal: $(DEXMG) $(DSUPERLULIB) $(LOADER) $(LOADOPTS) $(DEXMG) $(LIBS) -lm -o $@ @@ -125,6 +130,9 @@ pzdrive4: $(ZEXM4) $(DSUPERLULIB) pzdrive3d: $(ZEXM3D) $(DSUPERLULIB) $(LOADER) $(LOADOPTS) $(ZEXM3D) $(LIBS) -lm -o $@ +pzdrive3d1: $(ZEXM3D1) $(DSUPERLULIB) + $(LOADER) $(LOADOPTS) $(ZEXM3D1) $(LIBS) -lm -o $@ + pzdrive_ABglobal: $(ZEXMG) $(DSUPERLULIB) $(LOADER) $(LOADOPTS) $(ZEXMG) $(LIBS) -lm -o $@ diff --git a/EXAMPLE/pddrive.c b/EXAMPLE/pddrive.c index 001acf77..7ab06437 100644 --- a/EXAMPLE/pddrive.c +++ b/EXAMPLE/pddrive.c @@ -242,9 +242,8 @@ int main(int argc, char *argv[]) dScalePermstructFree(&ScalePermstruct); dDestroy_LU(n, &grid, &LUstruct); dLUstructFree(&LUstruct); - if ( options.SolveInitialized ) { - dSolveFinalize(&options, &SOLVEstruct); - } + //if ( options.SolveInitialized ) { + dSolveFinalize(&options, &SOLVEstruct); SUPERLU_FREE(b); SUPERLU_FREE(xtrue); SUPERLU_FREE(berr); diff --git a/EXAMPLE/pddrive3d.c b/EXAMPLE/pddrive3d.c index 3ac75d0f..c151924c 100644 --- a/EXAMPLE/pddrive3d.c +++ b/EXAMPLE/pddrive3d.c @@ -76,8 +76,8 @@ static void checkNRFMT(NRformat_loc*A, NRformat_loc*B) #if 0 double *Aval = (double *)A->nzval, *Bval = (double *)B->nzval; - PrintDouble5("A", A->nnz_loc, Aval); - PrintDouble5("B", B->nnz_loc, Bval); + Printdouble5("A", A->nnz_loc, Aval); + Printdouble5("B", B->nnz_loc, Bval); fflush(stdout); #endif @@ -358,14 +358,14 @@ main (int argc, char *argv[]) PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ dDestroy_LU (n, &(grid.grid2d), &LUstruct); - if (options.SolveInitialized) { - dSolveFinalize (&options, &SOLVEstruct); - } + dSolveFinalize (&options, &SOLVEstruct); } else { // Process layers not equal 0 dDeAllocLlu_3d(n, &LUstruct, &grid); dDeAllocGlu_3d(&LUstruct); } + 
dDestroy_A3d_gathered_on_2d(&SOLVEstruct, &grid); + Destroy_CompRowLoc_Matrix_dist (&A); SUPERLU_FREE (b); SUPERLU_FREE (xtrue); diff --git a/SRC/dnrformat_loc3d.c b/SRC/dnrformat_loc3d.c index 08f8d04e..2fc40fe1 100644 --- a/SRC/dnrformat_loc3d.c +++ b/SRC/dnrformat_loc3d.c @@ -44,154 +44,200 @@ static void matCopy(int n, int m, double *Dst, int lddst, double *Src, int ldsrc * output is in the returned A3d->{} structure. * see supermatrix.h for nrformat_loc3d{} structure. */ -NRformat_loc3d *dGatherNRformat_loc3d(NRformat_loc *A, // input, on 3D grid - double *B, // input - int ldb, int nrhs, // input - gridinfo3d_t *grid3d) +void dGatherNRformat_loc3d +( + fact_t Fact, // how matrix A will be factorized + NRformat_loc *A, // input, on 3D grid + double *B, // input + int ldb, int nrhs, // input + gridinfo3d_t *grid3d, + NRformat_loc3d **A3d_addr /* if Fact == DOFACT, it is an input. + Otherwise, it is both input and may be modified */ + ) { - NRformat_loc3d *A3d = SUPERLU_MALLOC(sizeof(NRformat_loc3d)); - NRformat_loc *A2d = SUPERLU_MALLOC(sizeof(NRformat_loc)); - A3d->m_loc = A->m_loc; - A3d->B3d = (double *) B; // on 3D process grid - A3d->ldb = ldb; - A3d->nrhs = nrhs; - - // find number of nnzs - int_t *nnz_counts; // number of local nonzeros relative to all processes - int_t *row_counts; // number of local rows relative to all processes - int *nnz_counts_int, *row_counts_int; // 32-bit - int *nnz_disp, *row_disp; // displacement - int *b_counts_int; // number of local B entries relative to all processes - int *b_disp; // including 'nrhs' - - nnz_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t)); - row_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t)); - nnz_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int)); - row_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int)); - b_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int)); - MPI_Gather(&A->nnz_loc, 1, mpi_int_t, nnz_counts, - 1, mpi_int_t, 0, grid3d->zscp.comm); - MPI_Gather(&A->m_loc, 1, mpi_int_t, row_counts, - 1, mpi_int_t, 0, grid3d->zscp.comm); - nnz_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int)); - row_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int)); - b_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int)); - - nnz_disp[0] = 0; - row_disp[0] = 0; - b_disp[0] = 0; - for (int i = 0; i < grid3d->npdep; i++) - { - nnz_disp[i + 1] = nnz_disp[i] + nnz_counts[i]; - row_disp[i + 1] = row_disp[i] + row_counts[i]; - b_disp[i + 1] = nrhs * row_disp[i + 1]; - nnz_counts_int[i] = nnz_counts[i]; - row_counts_int[i] = row_counts[i]; - b_counts_int[i] = nrhs * row_counts[i]; - } - - if (grid3d->zscp.Iam == 0) - { - A2d->colind = SUPERLU_MALLOC(nnz_disp[grid3d->npdep] * sizeof(int_t)); - A2d->nzval = SUPERLU_MALLOC(nnz_disp[grid3d->npdep] * sizeof(double)); - A2d->rowptr = SUPERLU_MALLOC((row_disp[grid3d->npdep] + 1) * sizeof(int_t)); - A2d->rowptr[0] = 0; - } - - MPI_Gatherv(A->nzval, A->nnz_loc, MPI_DOUBLE, A2d->nzval, - nnz_counts_int, nnz_disp, - MPI_DOUBLE, 0, grid3d->zscp.comm); - MPI_Gatherv(A->colind, A->nnz_loc, mpi_int_t, A2d->colind, - nnz_counts_int, nnz_disp, - mpi_int_t, 0, grid3d->zscp.comm); - MPI_Gatherv(&A->rowptr[1], A->m_loc, mpi_int_t, &A2d->rowptr[1], - row_counts_int, row_disp, - mpi_int_t, 0, grid3d->zscp.comm); - - if (grid3d->zscp.Iam == 0) - { - for (int i = 0; i < grid3d->npdep; i++) - { - for (int j = row_disp[i] + 1; j < row_disp[i + 1] + 1; j++) - { - // A2d->rowptr[j] += row_disp[i]; - A2d->rowptr[j] += nnz_disp[i]; - } - } - A2d->nnz_loc = nnz_disp[grid3d->npdep]; 
- A2d->m_loc = row_disp[grid3d->npdep]; - - if (grid3d->rankorder == 1) { // XY-major - A2d->fst_row = A->fst_row; - } else { // Z-major - gridinfo_t *grid2d = &(grid3d->grid2d); - int procs2d = grid2d->nprow * grid2d->npcol; - int m_loc_2d = A2d->m_loc; - int *m_loc_2d_counts = SUPERLU_MALLOC(procs2d * sizeof(int)); - - MPI_Allgather(&m_loc_2d, 1, MPI_INT, m_loc_2d_counts, 1, - MPI_INT, grid2d->comm); - - int fst_row = 0; - for (int p = 0; p < procs2d; ++p) - { - if (grid2d->iam == p) - A2d->fst_row = fst_row; - fst_row += m_loc_2d_counts[p]; - } - - SUPERLU_FREE(m_loc_2d_counts); - } - } - - // Btmp <- compact(B) - // compacting B - double *Btmp; - Btmp = SUPERLU_MALLOC(A->m_loc * nrhs * sizeof(double)); - matCopy(A->m_loc, nrhs, Btmp, A->m_loc, B, ldb); - - double *B1; - if (grid3d->zscp.Iam == 0) - { - B1 = SUPERLU_MALLOC(A2d->m_loc * nrhs * sizeof(double)); - A3d->B2d = (double *) SUPERLU_MALLOC(A2d->m_loc * nrhs * sizeof(double)); - } - - // B1 <- gatherv(Btmp) - MPI_Gatherv(Btmp, nrhs * A->m_loc, MPI_DOUBLE, B1, - b_counts_int, b_disp, - MPI_DOUBLE, 0, grid3d->zscp.comm); - - // B2d <- colMajor(B1) - if (grid3d->zscp.Iam == 0) - { - for (int i = 0; i < grid3d->npdep; ++i) - { - /* code */ - matCopy(row_counts_int[i], nrhs, ((double*)A3d->B2d) + row_disp[i], - A2d->m_loc, B1 + nrhs * row_disp[i], row_counts_int[i]); - } + NRformat_loc3d *A3d = (NRformat_loc3d *) *A3d_addr; + NRformat_loc *A2d; + int *row_counts_int; // 32-bit, number of local rows relative to all processes + int *row_disp; // displacement + int *b_counts_int; // number of local B entries relative to all processes + int *b_disp; // including 'nrhs' + + /********* Gather A2d *********/ + if ( Fact == SamePattern || Fact == SamePattern_SameRowPerm ) { + /* A3d is input. No need to recompute count. + Only need to gather A2d matrix. */ + b_counts_int = A3d->b_counts_int; + b_disp = A3d->b_disp;; + row_counts_int = A3d->row_counts_int; + row_disp = A3d->row_disp; + + if (grid3d->iam==0) printf("TO BE COMPLETED!\n"); + } else if ( Fact != FACTORED ) { + /* A3d is output. 
Compute counts from scratch */ + A3d = SUPERLU_MALLOC(sizeof(NRformat_loc3d)); + A2d = SUPERLU_MALLOC(sizeof(NRformat_loc)); + - SUPERLU_FREE(B1); - } + // find number of nnzs + int_t *nnz_counts; // number of local nonzeros relative to all processes + int_t *row_counts; // number of local rows relative to all processes + int *nnz_counts_int; // 32-bit + int *nnz_disp; // displacement + + nnz_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t)); + row_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t)); + nnz_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int)); + row_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int)); + b_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int)); + MPI_Gather(&A->nnz_loc, 1, mpi_int_t, nnz_counts, + 1, mpi_int_t, 0, grid3d->zscp.comm); + MPI_Gather(&A->m_loc, 1, mpi_int_t, row_counts, + 1, mpi_int_t, 0, grid3d->zscp.comm); + nnz_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int)); + row_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int)); + b_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int)); + + nnz_disp[0] = 0; + row_disp[0] = 0; + b_disp[0] = 0; + for (int i = 0; i < grid3d->npdep; i++) + { + nnz_disp[i + 1] = nnz_disp[i] + nnz_counts[i]; + row_disp[i + 1] = row_disp[i] + row_counts[i]; + b_disp[i + 1] = nrhs * row_disp[i + 1]; + nnz_counts_int[i] = nnz_counts[i]; + row_counts_int[i] = row_counts[i]; + b_counts_int[i] = nrhs * row_counts[i]; + } - A3d->A_nfmt = A2d; - A3d->b_counts_int = b_counts_int; - A3d->b_disp = b_disp; - A3d->row_counts_int = row_counts_int; - A3d->row_disp = row_disp; + if (grid3d->zscp.Iam == 0) + { + A2d->colind = SUPERLU_MALLOC(nnz_disp[grid3d->npdep] * sizeof(int_t)); + A2d->nzval = SUPERLU_MALLOC(nnz_disp[grid3d->npdep] * sizeof(double)); + A2d->rowptr = SUPERLU_MALLOC((row_disp[grid3d->npdep] + 1) * sizeof(int_t)); + A2d->rowptr[0] = 0; + } - /* free storage */ - SUPERLU_FREE(nnz_counts); - SUPERLU_FREE(nnz_counts_int); - SUPERLU_FREE(row_counts); - SUPERLU_FREE(nnz_disp); - SUPERLU_FREE(Btmp); + MPI_Gatherv(A->nzval, A->nnz_loc, MPI_DOUBLE, A2d->nzval, + nnz_counts_int, nnz_disp, + MPI_DOUBLE, 0, grid3d->zscp.comm); + MPI_Gatherv(A->colind, A->nnz_loc, mpi_int_t, A2d->colind, + nnz_counts_int, nnz_disp, + mpi_int_t, 0, grid3d->zscp.comm); + MPI_Gatherv(&A->rowptr[1], A->m_loc, mpi_int_t, &A2d->rowptr[1], + row_counts_int, row_disp, + mpi_int_t, 0, grid3d->zscp.comm); + + if (grid3d->zscp.Iam == 0) + { + for (int i = 0; i < grid3d->npdep; i++) + { + for (int j = row_disp[i] + 1; j < row_disp[i + 1] + 1; j++) + { + // A2d->rowptr[j] += row_disp[i]; + A2d->rowptr[j] += nnz_disp[i]; + } + } + A2d->nnz_loc = nnz_disp[grid3d->npdep]; + A2d->m_loc = row_disp[grid3d->npdep]; + + if (grid3d->rankorder == 1) { // XY-major + A2d->fst_row = A->fst_row; + } else { // Z-major + gridinfo_t *grid2d = &(grid3d->grid2d); + int procs2d = grid2d->nprow * grid2d->npcol; + int m_loc_2d = A2d->m_loc; + int *m_loc_2d_counts = SUPERLU_MALLOC(procs2d * sizeof(int)); + + MPI_Allgather(&m_loc_2d, 1, MPI_INT, m_loc_2d_counts, 1, + MPI_INT, grid2d->comm); + + int fst_row = 0; + for (int p = 0; p < procs2d; ++p) + { + if (grid2d->iam == p) + A2d->fst_row = fst_row; + fst_row += m_loc_2d_counts[p]; + } + + SUPERLU_FREE(m_loc_2d_counts); + } + } /* end 2D layer grid-0 */ + + A3d->A_nfmt = A2d; + A3d->b_counts_int = b_counts_int; + A3d->b_disp = b_disp; + A3d->row_counts_int = row_counts_int; + A3d->row_disp = row_disp; + + /* free storage */ + SUPERLU_FREE(nnz_counts); + SUPERLU_FREE(nnz_counts_int); + 
SUPERLU_FREE(row_counts); + SUPERLU_FREE(nnz_disp); + + *A3d_addr = A3d; // return the pointer to A3d matrix + } /* end else: Factor from scratch */ + + A3d->m_loc = A->m_loc; + A3d->B3d = (double *) B; /* save the pointer to the original B + stored on 3D process grid. */ + A3d->ldb = ldb; + A3d->nrhs = nrhs; + + printf("dGather_loc3d(1): Fact %d; A3d %p\n", Fact, A3d); fflush(stdout); + + /********* Gather B2d **********/ + if ( nrhs > 0 ) { + + A2d = (NRformat_loc *) A3d->A_nfmt; // matrix A gathered on 2D grid-0 + b_counts_int = A3d->b_counts_int; + b_disp = A3d->b_disp;; + row_counts_int = A3d->row_counts_int; + row_disp = A3d->row_disp; + + if ( Fact == FACTORED ) { + } + + printf("dGather_loc3d(2): row_disp %p, A3d %p\n", row_disp, A3d); fflush(stdout); + + /* Btmp <- compact(B), compacting B */ + double *Btmp; + Btmp = SUPERLU_MALLOC(A->m_loc * nrhs * sizeof(double)); + matCopy(A->m_loc, nrhs, Btmp, A->m_loc, B, ldb); + + double *B1; + if (grid3d->zscp.Iam == 0) + { + B1 = doubleMalloc_dist(A2d->m_loc * nrhs); + A3d->B2d = (double *) doubleMalloc_dist(A2d->m_loc * nrhs); + } - return A3d; + // B1 <- gatherv(Btmp) + MPI_Gatherv(Btmp, nrhs * A->m_loc, MPI_DOUBLE, B1, + b_counts_int, b_disp, + MPI_DOUBLE, 0, grid3d->zscp.comm); + SUPERLU_FREE(Btmp); + + printf("dGather_loc3d(3): nrhs %d\n", nrhs); fflush(stdout); + // B2d <- colMajor(B1) + if (grid3d->zscp.Iam == 0) + { + for (int i = 0; i < grid3d->npdep; ++i) + { + /* code */ + matCopy(row_counts_int[i], nrhs, ((double*)A3d->B2d) + row_disp[i], + A2d->m_loc, B1 + nrhs * row_disp[i], row_counts_int[i]); + } + + SUPERLU_FREE(B1); + } + printf("dGather_loc3d(4): nrhs %d\n", nrhs); fflush(stdout); + } /* end gather B2d */ } /* dGatherNRformat_loc3d */ + /* * Scatter B (solution) from 2D process layer 0 to 3D grid * Output: X3d <- A^{-1} B2d @@ -199,18 +245,18 @@ NRformat_loc3d *dGatherNRformat_loc3d(NRformat_loc *A, // input, on 3D grid int dScatter_B3d(NRformat_loc3d *A3d, // modified gridinfo3d_t *grid3d) { - double *B = (double *) A3d->B3d; // on 3D grid + double *B = (double *) A3d->B3d; // retrieve the original pointer on 3D grid int ldb = A3d->ldb; int nrhs = A3d->nrhs; - double *B2d = (double *) A3d->B2d; // on 2D layer 0 - NRformat_loc A2d = *(A3d->A_nfmt); + double *B2d = (double *) A3d->B2d; // only on 2D layer grid_0 + NRformat_loc *A2d = A3d->A_nfmt; - /* The following are the number of local rows relative to all processes */ - int m_loc = A3d->m_loc; - int *b_counts_int = A3d->b_counts_int; - int *b_disp = A3d->b_disp; + /* The following are the number of local rows relative to Z-dimension */ + int m_loc = A3d->m_loc; + int *b_counts_int = A3d->b_counts_int; + int *b_disp = A3d->b_disp; int *row_counts_int = A3d->row_counts_int; - int *row_disp = A3d->row_disp; + int *row_disp = A3d->row_disp; int i, p; int iam = grid3d->iam; int rankorder = grid3d->rankorder; @@ -219,7 +265,7 @@ int dScatter_B3d(NRformat_loc3d *A3d, // modified double *B1; // on 2D layer 0 if (grid3d->zscp.Iam == 0) { - B1 = SUPERLU_MALLOC(A2d.m_loc * nrhs * sizeof(double)); + B1 = doubleMalloc_dist(A2d->m_loc * nrhs); } // B1 <- BlockByBlock(B2d) @@ -229,7 +275,7 @@ int dScatter_B3d(NRformat_loc3d *A3d, // modified { /* code */ matCopy(row_counts_int[i], nrhs, B1 + nrhs * row_disp[i], row_counts_int[i], - B2d + row_disp[i], A2d.m_loc); + B2d + row_disp[i], A2d->m_loc); } } @@ -304,13 +350,14 @@ int dScatter_B3d(NRformat_loc3d *A3d, // modified // B <- colMajor(Btmp) matCopy(A3d->m_loc, nrhs, B, ldb, Btmp, A3d->m_loc); + //Printdouble5("scatter_B3d(2):: 
B:", 5, B); + /* free storage */ - SUPERLU_FREE(A3d->b_counts_int); - SUPERLU_FREE(A3d->b_disp); - SUPERLU_FREE(A3d->row_counts_int); - SUPERLU_FREE(A3d->row_disp); SUPERLU_FREE(Btmp); - if (grid3d->zscp.Iam == 0) SUPERLU_FREE(B1); - + if (grid3d->zscp.Iam == 0) { + SUPERLU_FREE(B1); + SUPERLU_FREE(B2d); + } + return 0; } /* dScatter_B3d */ diff --git a/SRC/pdgssvx3d.c b/SRC/pdgssvx3d.c index c4dd8d91..589af92e 100644 --- a/SRC/pdgssvx3d.c +++ b/SRC/pdgssvx3d.c @@ -20,13 +20,7 @@ at the top-level directory. * May 12, 2021 */ #include "superlu_ddefs.h" -#if 0 -#include "p3dcomm.h" -#include "pdgstrf3d.h" -#include "triangularSolve/pdgstrs.h" -#include "triangularSolve/pdgstrs3d.h" -#include "xtrf3Dpartition.h" -#endif + /*! \brief * *
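The reworked dGatherNRformat_loc3d() above now returns the gathered
matrix through its A3d_addr argument: an output when factoring from
scratch, and an input to be reused otherwise (see the Fact-dependent
code paths above). A minimal sketch of the calling convention,
mirroring the code added to pdgssvx3d() below:

    NRformat_loc3d *A3d = SOLVEstruct->A3d; /* previous gather, if any */
    dGatherNRformat_loc3d(options->Fact, (NRformat_loc *) A->Store,
                          B, ldb, nrhs, grid3d, &A3d);
    SOLVEstruct->A3d = A3d; /* must persist across pdgssvx3d() calls */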
@@ -506,7 +500,7 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
            dLUstruct_t * LUstruct, dSOLVEstruct_t * SOLVEstruct,
            double *berr, SuperLUStat_t * stat, int *info)
 {
-    NRformat_loc *Astore;
+    NRformat_loc *Astore = A->Store;
     SuperMatrix GA;        /* Global A in NC format */
     NCformat *GAstore;
     double *a_GA;
@@ -548,27 +542,12 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 #if ( PRNTlevel>= 2 )
     double dmin, dsum, dprod;
 #endif
-	LUstruct->dt = 'd';
+    
+    LUstruct->dt = 'd';
+    
     // get the 2d grid
     gridinfo_t *grid  = &(grid3d->grid2d);
     iam = grid->iam;
-    
-    /* Initialization. */
-    /* Save the inputs: ldb -> ldb3d, and B -> B3d, Astore -> Astore3d 
-       B3d and Astore3d will be restored on return  */
-    int ldb3d = ldb;
-    // double *B3d = B;
-    NRformat_loc *Astore3d = (NRformat_loc *)A->Store;
-    double *B2d;
-    NRformat_loc3d *A3d = dGatherNRformat_loc3d((NRformat_loc *)A->Store,
-		   	  			B, ldb, nrhs, grid3d);
-    B2d = (double *) A3d->B2d; 
-    NRformat_loc *Astore0 = A3d->A_nfmt; // on 2D grid-0
-    NRformat_loc *A_orig = A->Store;
-    
-    /* definition of factored seen by each process layer */
-    Fact = options->Fact;
-    factored = (Fact == FACTORED);
 
     /* Test the options choices. */
     *info = 0;
@@ -588,39 +567,69 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
     } else if (A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc
 	     || A->Dtype != SLU_D || A->Mtype != SLU_GE)
 	 *info = -2;
-    else if (ldb < Astore3d->m_loc)
+    else if (ldb < Astore->m_loc)
          *info = -5;
     else if (nrhs < 0) {
 	 *info = -6;
     }
+    
     if (*info) {
 	i = -(*info);
-	pxerr_dist ("pdgssvx3d", grid, -*info);
+	pxerr_dist ("pdgssvx3d", grid, -(*info));
 	return;
     }
     
+    /* Initialization. */
+
+    options->Algo3d = YES;
+	
+    /* definition of factored seen by each process layer */
+    factored = (Fact == FACTORED);
+    
+    /* Save the inputs: ldb -> ldb3d, and B -> B3d, Astore -> Astore3d,
+       so that the names {ldb, B, and Astore} can be used internally.
+       B3d and Astore3d will be assigned back to B and Astore on return.  */
+    int ldb3d = ldb;
+    NRformat_loc *Astore3d = (NRformat_loc *)A->Store;
+    NRformat_loc3d *A3d = SOLVEstruct->A3d;
+
+    /* B3d is aliased to B;
+       B2d is allocated; 
+       B is then aliased to B2d in the following 2D solve;
+    */
+    dGatherNRformat_loc3d(Fact, (NRformat_loc *)A->Store,
+			  B, ldb, nrhs, grid3d, &A3d);
+    
+    B = (double *) A3d->B2d; /* B is now pointing to B2d, 
+				allocated in dGatherNRformat_loc3d.  */
+    //PrintDouble5("after gather B=B2d", ldb, B);
+    
+    SOLVEstruct->A3d = A3d; /* This structure needs to be persistent across
+			       multiple calls of pdgssvx3d()   */
+    printf("pdgssvx3d(1) factored %d, A3d %p\n", factored, A3d); fflush(stdout);
+    
+    NRformat_loc *Astore0 = A3d->A_nfmt; // on 2D grid-0
+    NRformat_loc *A_orig = A->Store;
+    
 #if ( DEBUGlevel>=1 )
-	CHECK_MALLOC (iam, "Enter pdgssvx3d()");
+    CHECK_MALLOC (iam, "Enter pdgssvx3d()");
 #endif
 	
     /* Perform preprocessing steps on process layer zero, including:
-       gather 3D matrices {A, B} onto 2D grid-0,
-       ordering, symbolic factorization, distribution of L & U */
-
-#define NRFRMT
-
-    if (grid3d->zscp.Iam == 0)
+       gather 3D matrices {A, B} onto 2D grid-0, then perform:
+       - equilibration,
+       - ordering, 
+       - symbolic factorization,
+       - distribution of L & U                                      */
+    if (grid3d->zscp.Iam == 0)  /* on 2D grid-0 */
     {
         m = A->nrow;
     	n = A->ncol;
 	// checkNRFMT(Astore0, (NRformat_loc *) A->Store);
-#ifdef NRFRMT
+
 	// On input, A->Store is on 3D, now A->Store is re-assigned to 2D store
-	A->Store = Astore0;
+	A->Store = Astore0;  // on 2D grid-0
 	ldb = Astore0->m_loc;
-	B = B2d; // B is now re-assigned to B2d
-	//PrintDouble5("after gather B=B2d", ldb, B);
-#endif
 
 	/* The following code now works on 2D grid-0 */
     	Astore = (NRformat_loc *) A->Store;
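On process layer zero the SuperMatrix is then pointed at the gathered
2D storage. Schematically (the final restore is promised by the
"assigned back ... on return" comment above; it happens beyond these
hunks):

    Astore3d = (NRformat_loc *) A->Store; /* save the 3D storage         */
    A->Store = Astore0;                   /* gathered 2D storage, grid-0 */
    ldb      = Astore0->m_loc;            /* leading dimension on grid-0 */
    /* ... preprocessing, factorization, solve on 2D grid-0 ... */
    A->Store = Astore3d;                  /* assigned back before return */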
@@ -648,15 +657,15 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 	
 	iam = grid->iam;
 	job = 5;
-	if (factored || (Fact == SamePattern_SameRowPerm && Equil))
-	    {
+	/* Extract equilibration status from a previous factorization */
+	if (factored || (Fact == SamePattern_SameRowPerm && Equil)) {
 		rowequ = (ScalePermstruct->DiagScale == ROW) ||
 		    (ScalePermstruct->DiagScale == BOTH);
 		colequ = (ScalePermstruct->DiagScale == COL) ||
 		    (ScalePermstruct->DiagScale == BOTH);
-	    }
-	else
+	} else {
 	    rowequ = colequ = FALSE;
+	}
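	/* For reference, the DiagScale values decode as follows under the
	   usual SuperLU_DIST convention assumed by the two tests above:
	       NOEQUIL -> rowequ = FALSE, colequ = FALSE
	       ROW     -> rowequ = TRUE,  colequ = FALSE
	       COL     -> rowequ = FALSE, colequ = TRUE
	       BOTH    -> rowequ = TRUE,  colequ = TRUE   */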
 	
 	/* The following arrays are replicated on all processes. */
 	perm_r = ScalePermstruct->perm_r;
@@ -695,7 +704,7 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 	/* ------------------------------------------------------------
 	   Diagonal scaling to equilibrate the matrix.
 	   ------------------------------------------------------------ */
-	if (Equil) {
+	if ( Equil ) {
 #if ( DEBUGlevel>=1 )
 	    CHECK_MALLOC (iam, "Enter equil");
 #endif
@@ -783,7 +792,7 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 #endif
 	} /* end if Equil ... LAPACK style, not involving MC64 */
 
-	if (!factored) { /* Skip this if already factored. */
+	if ( !factored ) { /* Skip this if already factored. */
 	    /*
 	     * Gather A from the distributed compressed row format to
 	     * global A in compressed column format.
@@ -975,7 +984,7 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 #endif
 	} /* end if (!factored) */
 
-	if (!factored || options->IterRefine) {
+	if ( !factored || options->IterRefine ) {
 	    /* Compute norm(A), which will be used to adjust small diagonal. */
 	    if (notran)
 		*(unsigned char *) norm = '1';
@@ -989,11 +998,10 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 #endif
 	}
 	
-	
 	/* ------------------------------------------------------------
 	   Perform the LU factorization.
 	   ------------------------------------------------------------ */
-	if (!factored) {
+	if ( !factored ) {
 	    t = SuperLU_timer_ ();
 	    /*
 	     * Get column permutation vector perm_c[], according to permc_spec:
@@ -1262,7 +1270,7 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
     } /* end if not Factored */
     
     if ( grid3d->zscp.Iam == 0 ) { // only process layer 0
-	if (!factored) {
+	if ( !factored ) {
 	    if (options->PrintStat) {
 		int_t TinyPivots;
 		float for_lu, total, max, avg, temp;
@@ -1312,227 +1320,228 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 	/* ------------------------------------------------------------
 	   Compute the solution matrix X.
 	   ------------------------------------------------------------ */
-	if (nrhs) {
-		if (!(b_work = doubleMalloc_dist (n)))
-		    ABORT ("Malloc fails for b_work[]");
-
-		/* ------------------------------------------------------
-		   Scale the right-hand side if equilibration was performed
-		   ------------------------------------------------------*/
-		if (notran)
-		    {
-			if (rowequ)
-			    {
-				b_col = B;
-				for (j = 0; j < nrhs; ++j)
-				    {
-					irow = fst_row;
-					for (i = 0; i < m_loc; ++i)
-					    {
-		                                b_col[i] *= R[irow];
-						++irow;
-					    }
-					b_col += ldb;
-				    }
-			    }
-		    }
-		else if (colequ)
-		    {
-			b_col = B;
-			for (j = 0; j < nrhs; ++j)
-			    {
-				irow = fst_row;
-				for (i = 0; i < m_loc; ++i)
-				    {
-		                        b_col[i] *= C[irow];
-					++irow;
-				    }
-				b_col += ldb;
-			    }
-		    }
+	if ( nrhs > 0 ) {
+	    if (!(b_work = doubleMalloc_dist (n)))
+		ABORT ("Malloc fails for b_work[]");
 
-		/* Save a copy of the right-hand side. */
-		ldx = ldb;
-		if (!(X = doubleMalloc_dist (((size_t) ldx) * nrhs)))
-		    ABORT ("Malloc fails for X[]");
-		x_col = X;
-		b_col = B;
-		for (j = 0; j < nrhs; ++j) {
-		    for (i = 0; i < m_loc; ++i) x_col[i] = b_col[i];
-		    x_col += ldx;
-		    b_col += ldb;
+	    /* ------------------------------------------------------
+	       Scale the right-hand side if equilibration was performed
+	       ------------------------------------------------------*/
+	    if (notran)
+		{
+		    if (rowequ)
+			{
+			    b_col = B;
+			    for (j = 0; j < nrhs; ++j)
+				{
+				    irow = fst_row;
+				    for (i = 0; i < m_loc; ++i)
+					{
+					    b_col[i] *= R[irow];
+					    ++irow;
+					}
+				    b_col += ldb;
+				}
+			}
+		}
+	    else if (colequ)
+		{
+		    b_col = B;
+		    for (j = 0; j < nrhs; ++j)
+			{
+			    irow = fst_row;
+			    for (i = 0; i < m_loc; ++i)
+				{
+				    b_col[i] *= C[irow];
+				    ++irow;
+				}
+			    b_col += ldb;
+			}
 		}
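	    /* A sketch of the algebra behind this scaling, assuming the
	       Equil phase formed Aeq = diag(R) * A * diag(C) (LAPACK
	       style, as noted earlier in this routine):
	         no transpose: A x = b   <=>  Aeq (C^{-1} x) = diag(R) b,
	                       so b is scaled by R here and the solution
	                       is unscaled by C after the solve;
	         transpose:    A'x = b   <=>  Aeq' (R^{-1} x) = diag(C) b,
	                       so b is scaled by C and the solution by R. */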
 
-		/* ------------------------------------------------------
-		   Solve the linear system.
-		   ------------------------------------------------------*/
-		if (options->SolveInitialized == NO) /* First time */
-                   /* Inside this routine, SolveInitialized is set to YES.
-	              For repeated call to pdgssvx3d(), no need to re-initialilze
-	              the Solve data & communication structures, unless a new
-	              factorization with Fact == DOFACT or SamePattern is asked for. */
-		    {
-			dSolveInit (options, A, perm_r, perm_c, nrhs, LUstruct,
-			            grid, SOLVEstruct);
-		    }
-		stat->utime[SOLVE] = 0.0;
+	    /* Save a copy of the right-hand side. */
+	    ldx = ldb;
+	    if (!(X = doubleMalloc_dist (((size_t) ldx) * nrhs)))
+		ABORT ("Malloc fails for X[]");
+	    x_col = X;
+	    b_col = B;
+	    for (j = 0; j < nrhs; ++j) {
+		for (i = 0; i < m_loc; ++i) x_col[i] = b_col[i];
+		x_col += ldx;
+		b_col += ldb;
+	    }
+
+	    /* ------------------------------------------------------
+	       Solve the linear system.
+	       ------------------------------------------------------*/
+	    if (options->SolveInitialized == NO) /* First time */
+		/* Inside this routine, SolveInitialized is set to YES.
+		   For repeated calls to pdgssvx3d(), no need to re-initialize
+		   the Solve data & communication structures, unless a new
+		   factorization with Fact == DOFACT or SamePattern is asked for. */
+		{
+		    dSolveInit (options, A, perm_r, perm_c, nrhs, LUstruct,
+				grid, SOLVEstruct);
+		}
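	    /* Driver-side counterpart (a note, not part of this routine):
	       the EXAMPLE programs updated in this patch now release what
	       dSolveInit() allocates by calling
	           dSolveFinalize(&options, &SOLVEstruct);
	       unconditionally after the last solve. */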
+	    stat->utime[SOLVE] = 0.0;
 #if 0 // Sherry: the following interface is needed by 3D trisolve.
-		pdgstrs_vecpar (n, LUstruct, ScalePermstruct, grid, X, m_loc,
+	    pdgstrs_vecpar (n, LUstruct, ScalePermstruct, grid, X, m_loc,
 				fst_row, ldb, nrhs, SOLVEstruct, stat, info);
 #else
-		pdgstrs(n, LUstruct, ScalePermstruct, grid, X, m_loc,
-			fst_row, ldb, nrhs, SOLVEstruct, stat, info);
+	    pdgstrs(n, LUstruct, ScalePermstruct, grid, X, m_loc,
+		    fst_row, ldb, nrhs, SOLVEstruct, stat, info);
 #endif
 
-		/* ------------------------------------------------------------
-		   Use iterative refinement to improve the computed solution and
-		   compute error bounds and backward error estimates for it.
-		   ------------------------------------------------------------ */
-		if (options->IterRefine)
-		    {
-			/* Improve the solution by iterative refinement. */
-			int_t *it, *colind_gsmv = SOLVEstruct->A_colind_gsmv;
-			dSOLVEstruct_t *SOLVEstruct1; /* Used by refinement */
-
-			t = SuperLU_timer_ ();
-			if (options->RefineInitialized == NO || Fact == DOFACT) {
-			    /* All these cases need to re-initialize gsmv structure */
-			    if (options->RefineInitialized)
-				pdgsmv_finalize (SOLVEstruct->gsmv_comm);
-			    pdgsmv_init (A, SOLVEstruct->row_to_proc, grid,
-					 SOLVEstruct->gsmv_comm);
-
-			    /* Save a copy of the transformed local col indices
-			       in colind_gsmv[]. */
-			    if (colind_gsmv) SUPERLU_FREE (colind_gsmv);
-			    if (!(it = intMalloc_dist (nnz_loc)))
-				ABORT ("Malloc fails for colind_gsmv[]");
-			    colind_gsmv = SOLVEstruct->A_colind_gsmv = it;
-			    for (i = 0; i < nnz_loc; ++i) colind_gsmv[i] = colind[i];
-			    options->RefineInitialized = YES;
+	    /* ------------------------------------------------------------
+	       Use iterative refinement to improve the computed solution and
+	       compute error bounds and backward error estimates for it.
+	       ------------------------------------------------------------ */
+	    if (options->IterRefine)
+		{
+		    /* Improve the solution by iterative refinement. */
+		    int_t *it, *colind_gsmv = SOLVEstruct->A_colind_gsmv;
+		    dSOLVEstruct_t *SOLVEstruct1; /* Used by refinement */
+
+		    t = SuperLU_timer_ ();
+		    if (options->RefineInitialized == NO || Fact == DOFACT) {
+			/* All these cases need to re-initialize gsmv structure */
+			if (options->RefineInitialized)
+			    pdgsmv_finalize (SOLVEstruct->gsmv_comm);
+			pdgsmv_init (A, SOLVEstruct->row_to_proc, grid,
+				     SOLVEstruct->gsmv_comm);
+			
+			/* Save a copy of the transformed local col indices
+			   in colind_gsmv[]. */
+			if (colind_gsmv) SUPERLU_FREE (colind_gsmv);
+			if (!(it = intMalloc_dist (nnz_loc)))
+			    ABORT ("Malloc fails for colind_gsmv[]");
+			colind_gsmv = SOLVEstruct->A_colind_gsmv = it;
+			for (i = 0; i < nnz_loc; ++i) colind_gsmv[i] = colind[i];
+			options->RefineInitialized = YES;
+		    }
+		    else if (Fact == SamePattern || Fact == SamePattern_SameRowPerm) {
+			double at;
+			int_t k, jcol, p;
+			/* Swap to beginning the part of A corresponding to the
+			   local part of X, as was done in pdgsmv_init() */
+			for (i = 0; i < m_loc; ++i) { /* Loop through each row */
+			    k = rowptr[i];
+			    for (j = rowptr[i]; j < rowptr[i + 1]; ++j)
+				{
+				    jcol = colind[j];
+				    p = SOLVEstruct->row_to_proc[jcol];
+				    if (p == iam)
+					{	/* Local */
+					    at = a[k];
+					    a[k] = a[j];
+					    a[j] = at;
+					    ++k;
+					}
+				}
 			}
-			else if (Fact == SamePattern || Fact == SamePattern_SameRowPerm) {
-			    double at;
-			    int_t k, jcol, p;
-			    /* Swap to beginning the part of A corresponding to the
-			       local part of X, as was done in pdgsmv_init() */
-			    for (i = 0; i < m_loc; ++i) { /* Loop through each row */
-				k = rowptr[i];
-				for (j = rowptr[i]; j < rowptr[i + 1]; ++j)
-				    {
-					jcol = colind[j];
-					p = SOLVEstruct->row_to_proc[jcol];
-					if (p == iam)
-					    {	/* Local */
-						at = a[k];
-						a[k] = a[j];
-						a[j] = at;
-						++k;
-					    }
-				    }
-			    }
 			    
-			    /* Re-use the local col indices of A obtained from the
-			       previous call to pdgsmv_init() */
-			    for (i = 0; i < nnz_loc; ++i)
-				colind[i] = colind_gsmv[i];
-			}
+			/* Re-use the local col indices of A obtained from the
+			   previous call to pdgsmv_init() */
+			for (i = 0; i < nnz_loc; ++i)
+			    colind[i] = colind_gsmv[i];
+		    }
 			
-			if (nrhs == 1)
-			    {	/* Use the existing solve structure */
-				SOLVEstruct1 = SOLVEstruct;
-			    }
-			else {
-             /* For nrhs > 1, since refinement is performed for RHS
-		one at a time, the communication structure for pdgstrs
-		is different than the solve with nrhs RHS.
-		So we use SOLVEstruct1 for the refinement step.
-	      */
-				if (!(SOLVEstruct1 = (dSOLVEstruct_t *)
-				      SUPERLU_MALLOC(sizeof(dSOLVEstruct_t))))
-				    ABORT ("Malloc fails for SOLVEstruct1");
-				/* Copy the same stuff */
-				SOLVEstruct1->row_to_proc = SOLVEstruct->row_to_proc;
-				SOLVEstruct1->inv_perm_c = SOLVEstruct->inv_perm_c;
-				SOLVEstruct1->num_diag_procs = SOLVEstruct->num_diag_procs;
-				SOLVEstruct1->diag_procs = SOLVEstruct->diag_procs;
-				SOLVEstruct1->diag_len = SOLVEstruct->diag_len;
-				SOLVEstruct1->gsmv_comm = SOLVEstruct->gsmv_comm;
-				SOLVEstruct1->A_colind_gsmv = SOLVEstruct->A_colind_gsmv;
+		    if (nrhs == 1)
+			{	/* Use the existing solve structure */
+			    SOLVEstruct1 = SOLVEstruct;
+			}
+		    else {
+			/* For nrhs > 1, refinement is performed one RHS at
+			   a time, so the communication structure for pdgstrs
+			   differs from that of the solve with nrhs RHS's.
+			   Hence we use SOLVEstruct1 for the refinement step.
+			*/
+			if (!(SOLVEstruct1 = (dSOLVEstruct_t *)
+			      SUPERLU_MALLOC(sizeof(dSOLVEstruct_t))))
+			    ABORT ("Malloc fails for SOLVEstruct1");
+			/* Copy the same stuff */
+			SOLVEstruct1->row_to_proc = SOLVEstruct->row_to_proc;
+			SOLVEstruct1->inv_perm_c = SOLVEstruct->inv_perm_c;
+			SOLVEstruct1->num_diag_procs = SOLVEstruct->num_diag_procs;
+			SOLVEstruct1->diag_procs = SOLVEstruct->diag_procs;
+			SOLVEstruct1->diag_len = SOLVEstruct->diag_len;
+			SOLVEstruct1->gsmv_comm = SOLVEstruct->gsmv_comm;
+			SOLVEstruct1->A_colind_gsmv = SOLVEstruct->A_colind_gsmv;
 				
-				/* Initialize the *gstrs_comm for 1 RHS. */
-				if (!(SOLVEstruct1->gstrs_comm = (pxgstrs_comm_t *)
-				      SUPERLU_MALLOC (sizeof (pxgstrs_comm_t))))
-				    ABORT ("Malloc fails for gstrs_comm[]");
-				pdgstrs_init (n, m_loc, 1, fst_row, perm_r, perm_c, grid,
-					      Glu_persist, SOLVEstruct1);
-			    }
+			/* Initialize the *gstrs_comm for 1 RHS. */
+			if (!(SOLVEstruct1->gstrs_comm = (pxgstrs_comm_t *)
+			      SUPERLU_MALLOC (sizeof (pxgstrs_comm_t))))
+			    ABORT ("Malloc fails for gstrs_comm[]");
+			pdgstrs_init (n, m_loc, 1, fst_row, perm_r, perm_c, grid,
+				      Glu_persist, SOLVEstruct1);
+		    }
 			
-			pdgsrfs (n, A, anorm, LUstruct, ScalePermstruct, grid,
-				 B, ldb, X, ldx, nrhs, SOLVEstruct1, berr, stat, info);
+		    pdgsrfs (n, A, anorm, LUstruct, ScalePermstruct, grid,
+			     B, ldb, X, ldx, nrhs, SOLVEstruct1, berr, stat, info);
 			
-			/* Deallocate the storage associated with SOLVEstruct1 */
-			if (nrhs > 1)
-			    {
-				pxgstrs_finalize (SOLVEstruct1->gstrs_comm);
-				SUPERLU_FREE (SOLVEstruct1);
-			    }
+		    /* Deallocate the storage associated with SOLVEstruct1 */
+		    if (nrhs > 1)
+			{
+			    pxgstrs_finalize (SOLVEstruct1->gstrs_comm);
+			    SUPERLU_FREE (SOLVEstruct1);
+			}
 			
-			stat->utime[REFINE] = SuperLU_timer_ () - t;
-		    }
+		    stat->utime[REFINE] = SuperLU_timer_ () - t;
+		} /* end IterRefine */
 		
-		/* Permute the solution matrix B <= Pc'*X. */
-		pdPermute_Dense_Matrix (fst_row, m_loc, SOLVEstruct->row_to_proc,
-					SOLVEstruct->inv_perm_c,
-					X, ldx, B, ldb, nrhs, grid);
+	    /* Permute the solution matrix B <= Pc'*X. */
+	    pdPermute_Dense_Matrix (fst_row, m_loc, SOLVEstruct->row_to_proc,
+				    SOLVEstruct->inv_perm_c,
+				    X, ldx, B, ldb, nrhs, grid);
 #if ( DEBUGlevel>=2 )
-		printf ("\n (%d) .. After pdPermute_Dense_Matrix(): b =\n", iam);
-		for (i = 0; i < m_loc; ++i)
-		    printf ("\t(%d)\t%4d\t%.10f\n", iam, i + fst_row, B[i]);
+	    printf ("\n (%d) .. After pdPermute_Dense_Matrix(): b =\n", iam);
+	    for (i = 0; i < m_loc; ++i)
+		printf ("\t(%d)\t%4d\t%.10f\n", iam, i + fst_row, B[i]);
 #endif
 		
-		/* Transform the solution matrix X to a solution of the original
-		   system before the equilibration. */
-		if (notran)
-		    {
-			if (colequ)
-			    {
-				b_col = B;
-				for (j = 0; j < nrhs; ++j)
-				    {
-					irow = fst_row;
-					for (i = 0; i < m_loc; ++i)
-					    {
-						b_col[i] *= C[irow];
-						++irow;
-					    }
-					b_col += ldb;
-				    }
-			    }
-		    }
-		else if (rowequ)
-		    {
-			b_col = B;
-			for (j = 0; j < nrhs; ++j)
-			    {
-				irow = fst_row;
-				for (i = 0; i < m_loc; ++i)
-				    {
-					b_col[i] *= R[irow];
-					++irow;
-				    }
-				b_col += ldb;
-			    }
-		    }
-		
-		SUPERLU_FREE (b_work);
-		SUPERLU_FREE (X);
+	    /* Transform the solution matrix X to a solution of the original
+	       system before the equilibration. */
+	    if (notran)
+		{
+		    if (colequ)
+			{
+			    b_col = B;
+			    for (j = 0; j < nrhs; ++j)
+				{
+				    irow = fst_row;
+				    for (i = 0; i < m_loc; ++i)
+					{
+					    b_col[i] *= C[irow];
+					    ++irow;
+					}
+				    b_col += ldb;
+				}
+			}
+		}
+	    else if (rowequ)
+		{
+		    b_col = B;
+		    for (j = 0; j < nrhs; ++j)
+			{
+			    irow = fst_row;
+			    for (i = 0; i < m_loc; ++i)
+				{
+				    b_col[i] *= R[irow];
+				    ++irow;
+				}
+			    b_col += ldb;
+			}
+		}
 		
-	    }                           /* end if nrhs != 0 */
+	    SUPERLU_FREE (b_work);
+	    SUPERLU_FREE (X);
+
+	}    /* end if nrhs > 0 */
 	
 #if ( PRNTlevel>=1 )
-	if (!iam)
+	if (!iam) {
 	    printf (".. DiagScale = %d\n", ScalePermstruct->DiagScale);
+	}
 #endif
 	
 	/* Deallocate R and/or C if it was not used. */
@@ -1560,27 +1569,14 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 
     } /* process layer 0 done solve */
 
-#ifdef NRFRMT
-    /* Scatter the solution from 2D grid_0 to 3D grid */
-    dScatter_B3d(A3d, grid3d);
-
+    /* Scatter the solution from 2D grid-0 to 3D grid */
+    if ( nrhs > 0 ) dScatter_B3d(A3d, grid3d);
+    
     B = A3d->B3d; // B is now assigned back to B3d on return
     A->Store = Astore3d; // restore Astore to 3D
-    
-    /* free A2d and B2d, which are allocated only in 2D layer Grid_0 */
-    NRformat_loc *A2d = A3d->A_nfmt;
-    if (grid3d->zscp.Iam == 0) {
-       SUPERLU_FREE( A2d->rowptr );
-       SUPERLU_FREE( A2d->colind );
-       SUPERLU_FREE( A2d->nzval );
-       SUPERLU_FREE( A3d->B2d );
-    }
-    SUPERLU_FREE( A2d );         // free 2D structure
-    SUPERLU_FREE( A3d );         // free 3D structure
-#endif
 
 #if ( DEBUGlevel>=1 )
-	CHECK_MALLOC (iam, "Exit pdgssvx3d()");
+    CHECK_MALLOC (iam, "Exit pdgssvx3d()");
 #endif
 
 }
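
Note: with the solve phase guarded by nrhs > 0, the same driver can call
pdgssvx3d() once to factorize only, then again later to solve. A minimal
sketch of that calling pattern, assuming the usual driver setup (A, b, the
grids, and ScalePermstruct/LUstruct/SOLVEstruct already initialized as in the
examples further below):

    set_default_options_dist(&options);     /* options.Fact == DOFACT       */
    /* Factorize only: nrhs = 0 skips the solve and the 2D->3D scatter.     */
    pdgssvx3d(&options, &A, &ScalePermstruct, b, ldb, 0, &grid,
              &LUstruct, &SOLVEstruct, berr, &stat, &info);

    options.Fact = FACTORED;                /* reuse the computed L and U   */
    pdgssvx3d(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
              &LUstruct, &SOLVEstruct, berr, &stat, &info);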
diff --git a/SRC/pdutil.c b/SRC/pdutil.c
index 591c5f2a..02e5da1f 100644
--- a/SRC/pdutil.c
+++ b/SRC/pdutil.c
@@ -777,21 +777,42 @@ int dSolveInit(superlu_dist_options_t *options, SuperMatrix *A,
  */
 void dSolveFinalize(superlu_dist_options_t *options, dSOLVEstruct_t *SOLVEstruct)
 {
-    pxgstrs_finalize(SOLVEstruct->gstrs_comm);
+    if ( options->SolveInitialized ) {
+	pxgstrs_finalize(SOLVEstruct->gstrs_comm);
 
-    if ( options->RefineInitialized ) {
-        pdgsmv_finalize(SOLVEstruct->gsmv_comm);
-	options->RefineInitialized = NO;
+	if ( options->RefineInitialized ) {
+	    pdgsmv_finalize(SOLVEstruct->gsmv_comm);
+	    options->RefineInitialized = NO;
+	}
+	SUPERLU_FREE(SOLVEstruct->gsmv_comm);
+	SUPERLU_FREE(SOLVEstruct->row_to_proc);
+	SUPERLU_FREE(SOLVEstruct->inv_perm_c);
+	SUPERLU_FREE(SOLVEstruct->diag_procs);
+	SUPERLU_FREE(SOLVEstruct->diag_len);
+	if ( SOLVEstruct->A_colind_gsmv ) SUPERLU_FREE(SOLVEstruct->A_colind_gsmv);
+	options->SolveInitialized = NO;
     }
-    SUPERLU_FREE(SOLVEstruct->gsmv_comm);
-    SUPERLU_FREE(SOLVEstruct->row_to_proc);
-    SUPERLU_FREE(SOLVEstruct->inv_perm_c);
-    SUPERLU_FREE(SOLVEstruct->diag_procs);
-    SUPERLU_FREE(SOLVEstruct->diag_len);
-    if ( SOLVEstruct->A_colind_gsmv ) SUPERLU_FREE(SOLVEstruct->A_colind_gsmv);
-    options->SolveInitialized = NO;
 } /* dSolveFinalize */
 
+void dDestroy_A3d_gathered_on_2d(dSOLVEstruct_t *SOLVEstruct, gridinfo3d_t *grid3d)
+{
+    /* free A2d and B2d, which are allocated only in 2D layer grid-0 */
+    NRformat_loc3d *A3d = SOLVEstruct->A3d;
+    NRformat_loc *A2d = A3d->A_nfmt;
+    if (grid3d->zscp.Iam == 0) {
+	SUPERLU_FREE( A2d->rowptr );
+	SUPERLU_FREE( A2d->colind );
+	SUPERLU_FREE( A2d->nzval );
+    }
+    SUPERLU_FREE(A3d->b_counts_int);  // free the gather counts and displacements
+    SUPERLU_FREE(A3d->b_disp);
+    SUPERLU_FREE(A3d->row_counts_int);
+    SUPERLU_FREE(A3d->row_disp);
+    SUPERLU_FREE( A2d );         // free 2D structure
+    SUPERLU_FREE( A3d );         // free 3D structure
+} /* dDestroy_A3d_gathered_on_2d */
+
+
 /*! \brief Check the inf-norm of the error vector
  */
 void pdinf_norm_error(int iam, int_t n, int_t nrhs, double x[], int_t ldx,
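
The intended teardown order after these changes, as exercised by the updated
drivers (a sketch; all 3D processes call the last routine, and dSolveFinalize()
is now safe to call unconditionally because it checks SolveInitialized first):

    if (grid.zscp.Iam == 0) {              /* 2D layer 0 owns the factors  */
        dDestroy_LU(n, &(grid.grid2d), &LUstruct);
        dSolveFinalize(&options, &SOLVEstruct);
    } else {                               /* other layers hold 3D copies  */
        dDeAllocLlu_3d(n, &LUstruct, &grid);
        dDeAllocGlu_3d(&LUstruct);
    }
    dDestroy_A3d_gathered_on_2d(&SOLVEstruct, &grid);  /* frees A2d, B2d,
                                                          counts, and A3d  */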
diff --git a/SRC/superlu_ddefs.h b/SRC/superlu_ddefs.h
index 97fe6387..cb4c29f9 100644
--- a/SRC/superlu_ddefs.h
+++ b/SRC/superlu_ddefs.h
@@ -225,6 +225,9 @@ typedef struct {
                              positions in the gathered x-vector.
                              This is re-used in repeated calls to pdgsmv() */
     int_t *xrow_to_proc; /* used by PDSLin */
+    NRformat_loc3d* A3d; /* Points to the 3D {A, B} gathered on layer 0 of
+			    the 2D process grid. This needs to be persistent
+			    between the 3D factorization and solve phases. */
 } dSOLVEstruct_t;
 
 
@@ -425,6 +428,7 @@ extern void  pdCompute_Diag_Inv(int_t, dLUstruct_t *,gridinfo_t *, SuperLUStat_t
 extern int  dSolveInit(superlu_dist_options_t *, SuperMatrix *, int_t [], int_t [],
 		       int_t, dLUstruct_t *, gridinfo_t *, dSOLVEstruct_t *);
 extern void dSolveFinalize(superlu_dist_options_t *, dSOLVEstruct_t *);
+extern void dDestroy_A3d_gathered_on_2d(dSOLVEstruct_t *, gridinfo3d_t *);
 extern int_t pdgstrs_init(int_t, int_t, int_t, int_t,
                           int_t [], int_t [], gridinfo_t *grid,
 	                  Glu_persist_t *, dSOLVEstruct_t *);
@@ -645,9 +649,10 @@ extern int dcreate_matrix_postfix3d(SuperMatrix *A, int nrhs, double **rhs,
     
 /* Matrix distributed in NRformat_loc in 3D process grid. It converts 
    it to a NRformat_loc distributed in 2D grid in grid-0 */
-extern NRformat_loc3d *dGatherNRformat_loc3d(NRformat_loc *A, double *B,
-					     int ldb, int nrhs,
-					     gridinfo3d_t *grid3d);
+//extern NRformat_loc3d *dGatherNRformat_loc3d(NRformat_loc *A, double *B,
+extern void dGatherNRformat_loc3d(fact_t Fact, NRformat_loc *A, double *B,
+				   int ldb, int nrhs, gridinfo3d_t *grid3d,
+				   NRformat_loc3d **);
 extern int dScatter_B3d(NRformat_loc3d *A3d, gridinfo3d_t *grid3d);
 
 extern void pdgssvx3d (superlu_dist_options_t *, SuperMatrix *,
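
A sketch of how the revised gather/scatter interface is meant to be called
inside the driver (the surrounding variable names are assumptions; the key
point is that the output NRformat_loc3d handle is now passed by reference, so
it can be kept in SOLVEstruct->A3d and reused when Fact == FACTORED):

    NRformat_loc3d *A3d = NULL;
    dGatherNRformat_loc3d(options->Fact, (NRformat_loc *) A->Store, B,
                          ldb, nrhs, grid3d, &A3d);   /* 3D {A,B} -> grid-0 */
    /* ... factor and/or solve on the 2D grid ... */
    if (nrhs > 0) dScatter_B3d(A3d, grid3d);          /* X back to 3D grid  */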
diff --git a/SRC/superlu_defs.h b/SRC/superlu_defs.h
index 75f91112..c6d50d7b 100644
--- a/SRC/superlu_defs.h
+++ b/SRC/superlu_defs.h
@@ -705,6 +705,7 @@ typedef struct {
     yes_no_t      lookahead_etree; /* use etree computed from the
 				      serial symbolic factorization */
     yes_no_t      SymPattern;      /* symmetric factorization          */
+    yes_no_t      Algo3d;          /* use 3D factorization/solve algorithms */
 } superlu_dist_options_t;
 
 typedef struct {
diff --git a/SRC/superlu_grid3d.c b/SRC/superlu_grid3d.c
index 9dc05867..0b423b0e 100644
--- a/SRC/superlu_grid3d.c
+++ b/SRC/superlu_grid3d.c
@@ -49,6 +49,10 @@ void superlu_gridinit3d(MPI_Comm Bcomm, /* The base communicator upon which
 
 
 /*! \brief All processes in the MPI communicator must call this routine.
+ *  On output, if a process is not in the SuperLU group, the following
+ *  values are assigned to it:
+ *      grid->comm = MPI_COMM_NULL
+ *      grid->iam = -1
  */
 void superlu_gridmap3d(
     MPI_Comm Bcomm, /* The base communicator upon which
diff --git a/SRC/supermatrix.h b/SRC/supermatrix.h
index 727e966c..e50360b6 100644
--- a/SRC/supermatrix.h
+++ b/SRC/supermatrix.h
@@ -188,18 +188,19 @@ typedef struct {
 } NRformat_loc;
 
 
-/* Data structure for storing 3D matrix on layer 0 of the 2D process grid */
+/* Data structure for storing the 3D matrix on layer 0 of the 2D process grid.
+   Only grid-0 has meaningful values in these data structures.   */
 typedef struct NRformat_loc3d
 {
-    NRformat_loc* A_nfmt; 
+    NRformat_loc* A_nfmt; // Gathered A matrix on 2D grid-0 
     void* B3d;  // on the entire 3D process grid
-    int  ldb;
+    int  ldb;   // relative to 3D process grid
     int nrhs;
-    int m_loc; 
-    void* B2d;  // on 2D process layer Grid_0
+    int m_loc;  // relative to 3D process grid
+    void* B2d;  // on 2D process layer grid-0
 
-    int* row_counts_int; // these counts are for {A, B} distributed on 2D layer 0
-    int* row_disp;
+    int* row_counts_int; // these counts are stored on 2D layer grid-0,
+    int* row_disp;       // but count the number of {A, B} rows along Z-dimension
     int* b_counts_int;
     int* b_disp;
 } NRformat_loc3d;
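
For orientation, these counts/displacements follow the usual MPI "v-variant"
convention along the Z-dimension of the process grid. A schematic (not code
from this patch; the actual packing in dnrformat_loc3d.c also accounts for
ldb) of how b_counts_int/b_disp would drive the gather of B onto layer grid-0:

    /* Each process in the Z-communicator contributes its local B rows;
       the layer-0 process receives them packed into B2d. */
    MPI_Gatherv(B_local, m_loc * nrhs, MPI_DOUBLE,
                A3d->B2d, A3d->b_counts_int, A3d->b_disp, MPI_DOUBLE,
                0, grid3d->zscp.comm);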
diff --git a/SRC/util.c b/SRC/util.c
index 883c3823..58c0a95c 100644
--- a/SRC/util.c
+++ b/SRC/util.c
@@ -85,7 +85,6 @@ void Destroy_Dense_Matrix_dist(SuperMatrix *A)
     SUPERLU_FREE(A->Store);
 }
 
-
 /*! \brief
  *
  * 
@@ -216,6 +215,7 @@ void set_default_options_dist(superlu_dist_options_t *options)
     options->num_lookaheads = 10;
     options->lookahead_etree = NO;
     options->SymPattern = NO;
+    options->Algo3d = NO;
 #ifdef SLU_HAVE_LAPACK
     options->DiagInv = YES;
 #else

From 9e180ec8706859d5f1237225f2f6444e4ab453f2 Mon Sep 17 00:00:00 2001
From: Sherry Li 
Date: Sun, 5 Sep 2021 14:03:15 -0700
Subject: [PATCH 115/147] Restructure driver *pxgssvx3d* to allow separate
 calls to Factorization and Solve. The *A3d* structure is persistent across
 multiple calls.

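The new *3d1 examples exercise this pattern; in outline (from pddrive3d1.c
below):

    set_default_options_dist(&options);       /* first call: Fact == DOFACT */
    pdgssvx3d(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
              &LUstruct, &SOLVEstruct, berr, &stat, &info);

    options.Fact = FACTORED;                  /* second call: reuse L, U,
                                                 and the gathered A3d       */
    pdgssvx3d(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid,
              &LUstruct, &SOLVEstruct, berr, &stat, &info);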
---
 EXAMPLE/CMakeLists.txt         |  12 +
 EXAMPLE/Makefile               |   5 +-
 EXAMPLE/pddrive3d.c            |  14 +-
 EXAMPLE/pddrive3d1.c           | 433 +++++++++++++++++++++++++++++++++
 EXAMPLE/psdrive3d1.c           | 433 +++++++++++++++++++++++++++++++++
 EXAMPLE/pzdrive3d.c            |  20 +-
 EXAMPLE/pzdrive3d1.c           | 433 +++++++++++++++++++++++++++++++++
 FORTRAN/f_pddrive3d.F90        |  10 +-
 FORTRAN/f_pzdrive3d.F90        |  10 +-
 FORTRAN/superlu_c2f_dwrap.c    |  14 +-
 FORTRAN/superlu_c2f_zwrap.c    |  14 +-
 FORTRAN/superlu_dist_config.fh |   1 +
 SRC/dnrformat_loc3d.c          |  60 ++---
 SRC/pdgssvx3d.c                | 409 +++++++++++++++----------------
 SRC/pdutil.c                   |  23 +-
 SRC/psgssvx3d.c                | 175 +++++++------
 SRC/psutil.c                   |  46 +++-
 SRC/pzgssvx3d.c                | 185 +++++++-------
 SRC/pzutil.c                   |  46 +++-
 SRC/snrformat_loc3d.c          | 348 ++++++++++++++------------
 SRC/superlu_ddefs.h            |   7 +-
 SRC/superlu_defs.h             |   2 +-
 SRC/superlu_grid3d.c           |   5 +-
 SRC/superlu_zdefs.h            |  10 +-
 SRC/znrformat_loc3d.c          | 353 +++++++++++++++------------
 25 files changed, 2254 insertions(+), 814 deletions(-)
 create mode 100644 EXAMPLE/pddrive3d1.c
 create mode 100644 EXAMPLE/psdrive3d1.c
 create mode 100644 EXAMPLE/pzdrive3d1.c

diff --git a/EXAMPLE/CMakeLists.txt b/EXAMPLE/CMakeLists.txt
index 59ca46fd..468b6a2e 100644
--- a/EXAMPLE/CMakeLists.txt
+++ b/EXAMPLE/CMakeLists.txt
@@ -61,6 +61,10 @@ if(enable_double)
   add_executable(pddrive3d ${DEXM3D})
   target_link_libraries(pddrive3d ${all_link_libs})
 
+  set(DEXM3D1 pddrive3d1.c dcreate_matrix.c dcreate_matrix3d.c)
+  add_executable(pddrive3d1 ${DEXM3D1})
+  target_link_libraries(pddrive3d1 ${all_link_libs})
+  
   set(DEXMG pddrive_ABglobal.c)
   add_executable(pddrive_ABglobal ${DEXMG})
   target_link_libraries(pddrive_ABglobal ${all_link_libs})
@@ -116,6 +120,10 @@ if(enable_single)
   add_executable(psdrive3d ${SEXM3D})
   target_link_libraries(psdrive3d ${all_link_libs})
 
+  set(SEXM3D1 psdrive3d1.c screate_matrix.c screate_matrix3d.c)
+  add_executable(psdrive3d1 ${SEXM3D1})
+  target_link_libraries(psdrive3d1 ${all_link_libs})
+  
 endif() #### end enable_single
 
 if(enable_complex16)
@@ -147,6 +155,10 @@ if(enable_complex16)
   add_executable(pzdrive3d ${ZEXM3D})
   target_link_libraries(pzdrive3d ${all_link_libs})
 
+  set(ZEXM3D1 pzdrive3d1.c zcreate_matrix.c zcreate_matrix3d.c)
+  add_executable(pzdrive3d1 ${ZEXM3D1})
+  target_link_libraries(pzdrive3d1 ${all_link_libs})
+  
   set(ZEXMG pzdrive_ABglobal.c)
   add_executable(pzdrive_ABglobal ${ZEXMG})
   target_link_libraries(pzdrive_ABglobal ${all_link_libs})
diff --git a/EXAMPLE/Makefile b/EXAMPLE/Makefile
index c52691fa..674811d9 100644
--- a/EXAMPLE/Makefile
+++ b/EXAMPLE/Makefile
@@ -54,6 +54,7 @@ ZEXM3	= pzdrive3.o zcreate_matrix.o
 ZEXM4	= pzdrive4.o zcreate_matrix.o
 ZEXM3D	= pzdrive3d.o zcreate_matrix.o zcreate_matrix3d.o
+ZEXM3D1	= pzdrive3d1.o zcreate_matrix.o zcreate_matrix3d.o
 ZEXMG	= pzdrive_ABglobal.o
 ZEXMG1	= pzdrive1_ABglobal.o
 ZEXMG2	= pzdrive2_ABglobal.o
@@ -61,7 +62,7 @@ ZEXMG3	= pzdrive3_ABglobal.o
 ZEXMG4	= pzdrive4_ABglobal.o
 
 
-all: pddrive3d1 pddrive3d #double complex16
+all: pddrive3d1 pddrive3d double complex16
 
 double:    pddrive pddrive1 pddrive2 pddrive3 pddrive4 \
 	   pddrive3d pddrive3d1 \
@@ -69,7 +70,7 @@ double:    pddrive pddrive1 pddrive2 pddrive3 pddrive4 \
 	   pddrive3_ABglobal pddrive4_ABglobal
 
 complex16: pzdrive pzdrive1 pzdrive2 pzdrive3 pzdrive4 \
-	   pzdrive3d \
+	   pzdrive3d pzdrive3d1 \
 	   pzdrive_ABglobal pzdrive1_ABglobal pzdrive2_ABglobal \
 	   pzdrive3_ABglobal pzdrive4_ABglobal
 
diff --git a/EXAMPLE/pddrive3d.c b/EXAMPLE/pddrive3d.c
index c151924c..06c30590 100644
--- a/EXAMPLE/pddrive3d.c
+++ b/EXAMPLE/pddrive3d.c
@@ -15,8 +15,9 @@ at the top-level directory.
  *
  * 
  * -- Distributed SuperLU routine (version 7.0.0) --
- * Lawrence Berkeley National Lab, Georgia Institute of Technology.
- * May 10, 2019
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab 
+ * May 12, 2021
  *
  */
 #include "superlu_ddefs.h"  
@@ -135,7 +136,10 @@ main (int argc, char *argv[])
     {
         int rank;
         MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-        if (!rank) printf("The MPI library doesn't provide MPI_THREAD_MULTIPLE \n");
+        if (!rank) {
+	    printf("The MPI library doesn't provide MPI_THREAD_MULTIPLE \n");
+	    printf("\tprovided omp_mpi_level: %d\n", provided);
+        }
     }
 
     /* Parse command line argv[]. */
@@ -358,12 +362,12 @@ main (int argc, char *argv[])
 	PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/
 
         dDestroy_LU (n, &(grid.grid2d), &LUstruct);
-	dSolveFinalize (&options, &SOLVEstruct);
+        dSolveFinalize (&options, &SOLVEstruct);
     } else { // Process layers not equal 0
         dDeAllocLlu_3d(n, &LUstruct, &grid);
         dDeAllocGlu_3d(&LUstruct);
     }
-
+    
     dDestroy_A3d_gathered_on_2d(&SOLVEstruct, &grid);
 
     Destroy_CompRowLoc_Matrix_dist (&A);
diff --git a/EXAMPLE/pddrive3d1.c b/EXAMPLE/pddrive3d1.c
new file mode 100644
index 00000000..fe45a5e8
--- /dev/null
+++ b/EXAMPLE/pddrive3d1.c
@@ -0,0 +1,433 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Driver program for PDGSSVX3D example
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab 
+ * September 10, 2021
+ *
+ */
+#include "superlu_ddefs.h"  
+
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *
+ * The driver program PDDRIVE3D1.
+ *
+ * This example illustrates how to use PDGSSVX3D to solve systems
+ * with the same A but different right-hand sides, possibly with a
+ * different number of right-hand sides.
+ * In this case, we factorize A only once in the first call to PDGSSVX3D,
+ * and reuse the following data structures in the subsequent call to
+ * PDGSSVX3D:
+ *        ScalePermstruct  : DiagScale, R, C, perm_r, perm_c
+ *        LUstruct         : Glu_persist, Llu
+ *        SOLVEstruct      : communication metadata for SpTRSV, SpMV, and
+ *                           3D<->2D gather/scatter of {A,B} stored in A3d.
+ * 
+ * The program may be run by typing:
+ *    mpiexec -np <p> pddrive3d -r <proc rows> -c <proc columns> \
+ *                   -d <proc Z-dimension> <input_matrix_file>
+ * NOTE: total number of processes p = r * c * d
+ *       (d must be a power-of-two, e.g., 1, 2, 4, ...)
+ *
+ * </pre>
+ */
+
+static void matCheck(int n, int m, double* A, int LDA,
+	   double* B, int LDB)
+{
+    for(int j = 0; j < m; j++)
+	for (int i = 0; i < n; ++i) {
+	    assert(A[i + LDA*j] == B[i + LDB*j]);
+	}
+    printf("B check passed\n");
+}
+
+static void checkNRFMT(NRformat_loc *A, NRformat_loc *B)
+{
+    assert(A->nnz_loc == B->nnz_loc);
+    assert(A->m_loc == B->m_loc);
+    assert(A->fst_row == B->fst_row);
+
+#if 0
+    double *Aval = (double *)A->nzval, *Bval = (double *)B->nzval;
+    Printdouble5("A", A->nnz_loc, Aval);
+    Printdouble5("B", B->nnz_loc, Bval);
+    fflush(stdout);
+#endif
+
+    double * Aval = (double *) A->nzval;
+    double * Bval = (double *) B->nzval;
+    for (int_t i = 0; i < A->nnz_loc; i++)
+    {
+        assert( Aval[i] == Bval[i] );
+        assert((A->colind)[i] == (B->colind)[i]);
+        printf("colind[] correct\n");
+    }
+
+    for (int_t i = 0; i < A->m_loc + 1; i++)
+    {
+        assert((A->rowptr)[i] == (B->rowptr)[i]);
+    }
+
+    printf("Matrix check passed\n");
+
+}
+
+int
+main (int argc, char *argv[])
+{
+    superlu_dist_options_t options;
+    SuperLUStat_t stat;
+    SuperMatrix A;  // Now, A is on all 3D processes
+    dScalePermstruct_t ScalePermstruct;
+    dLUstruct_t LUstruct;
+    dSOLVEstruct_t SOLVEstruct;
+    gridinfo3d_t grid;
+    double *berr;
+    double *b, *xtrue, *b1, *b2;
+    int m, n, i, j, m_loc;
+    int nprow, npcol, npdep;
+    int iam, info, ldb, ldx, nrhs;
+    char **cpp, c, *suffix;
+    FILE *fp, *fopen ();
+    extern int cpp_defs ();
+    int ii, omp_mpi_level;
+
+    nprow = 1;  /* Default process rows.      */
+    npcol = 1;  /* Default process columns.   */
+    npdep = 1;  /* replication factor must be power of two */
+    nrhs = 1;   /* Number of right-hand side. */
+
+    /* ------------------------------------------------------------
+       INITIALIZE MPI ENVIRONMENT.
+       ------------------------------------------------------------ */
+    // MPI_Init (&argc, &argv);
+    int required = MPI_THREAD_MULTIPLE;
+    int provided;
+    MPI_Init_thread(&argc, &argv, required, &provided);
+    if (provided < required)
+    {
+        int rank;
+        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+        if (!rank) {
+            printf("The MPI library doesn't provide MPI_THREAD_MULTIPLE \n");
+            printf("\tprovided omp_mpi_level: %d\n", provided);
+        }
+    }
+
+    /* Parse command line argv[]. */
+    for (cpp = argv + 1; *cpp; ++cpp)
+    {
+        if (**cpp == '-')
+        {
+            c = *(*cpp + 1);
+            ++cpp;
+            switch (c)
+            {
+            case 'h':
+                printf ("Options:\n");
+                printf ("\t-r <int>: process rows    (default %d)\n", nprow);
+                printf ("\t-c <int>: process columns (default %d)\n", npcol);
+                printf ("\t-d <int>: process Z-dimension (default %d)\n", npdep);
+                exit (0);
+                break;
+            case 'r':
+                nprow = atoi (*cpp);
+                break;
+            case 'c':
+                npcol = atoi (*cpp);
+                break;
+            case 'd':
+                npdep = atoi (*cpp);
+                break;
+            }
+        }
+        else
+        {   /* Last arg is considered a filename */
+            if (!(fp = fopen (*cpp, "r")))
+            {
+                ABORT ("File does not exist");
+            }
+            break;
+        }
+    }
+
+    /* ------------------------------------------------------------
+       INITIALIZE THE SUPERLU PROCESS GRID.
+       ------------------------------------------------------------ */
+    superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid);
+
+    if(grid.iam==0) {
+        MPI_Query_thread(&omp_mpi_level);
+        switch (omp_mpi_level) {
+        case MPI_THREAD_SINGLE:
+            printf("MPI_Query_thread with MPI_THREAD_SINGLE\n");
+            fflush(stdout);
+            break;
+        case MPI_THREAD_FUNNELED:
+            printf("MPI_Query_thread with MPI_THREAD_FUNNELED\n");
+            fflush(stdout);
+            break;
+        case MPI_THREAD_SERIALIZED:
+            printf("MPI_Query_thread with MPI_THREAD_SERIALIZED\n");
+            fflush(stdout);
+            break;
+        case MPI_THREAD_MULTIPLE:
+            printf("MPI_Query_thread with MPI_THREAD_MULTIPLE\n");
+            fflush(stdout);
+            break;
+        }
+    }
+
+    /* Bail out if I do not belong in the grid. */
+    iam = grid.iam;
+    if (iam == -1) goto out;
+    if (!iam) {
+        int v_major, v_minor, v_bugfix;
+#ifdef __INTEL_COMPILER
+        printf("__INTEL_COMPILER is defined\n");
+#endif
+        printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__);
+
+        superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix);
+        printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix);
+
+        printf("Input matrix file:\t%s\n", *cpp);
+        printf("3D process grid: %d X %d X %d\n", nprow, npcol, npdep);
+        //printf("2D Process grid: %d X %d\n", (int)grid.nprow, (int)grid.npcol);
+        fflush(stdout);
+    }
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (iam, "Enter main()");
+#endif
+
+    /* ------------------------------------------------------------
+       GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE.
+       ------------------------------------------------------------ */
+    for (ii = 0; ii < strlen(*cpp); ii++) {
+        if ((*cpp)[ii] == '.') {
+            suffix = &((*cpp)[ii+1]);
+        }
+    }
+    dcreate_matrix_postfix3d(&A, nrhs, &b, &ldb,
+                             &xtrue, &ldx, fp, suffix, &(grid));
+
+#if 0  // checking code for the Gather routine
+        matCheck(Astore->m_loc, nrhs, B2d, Astore->m_loc, bref, ldb);
+    }
+    // MPI_Finalize(); exit(0);
+#endif
+
+    /* Save two copies of the RHS */
+    if ( !(b1 = doubleMalloc_dist(ldb * nrhs)) )
+        ABORT("Malloc fails for b1[]");
+    if ( !(b2 = doubleMalloc_dist(ldb * nrhs)) )
+        ABORT("Malloc fails for b2[]");
+    for (j = 0; j < nrhs; ++j) {
+        for (i = 0; i < ldb; ++i) {
+            b1[i+j*ldb] = b[i+j*ldb];
+            b2[i+j*ldb] = b[i+j*ldb];
+        }
+    }
+
+    if (!(berr = doubleMalloc_dist (nrhs)))
+        ABORT ("Malloc fails for berr[].");
+
+    /* ------------------------------------------------------------
+       1. SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME, WITH 1 RHS.
+       ------------------------------------------------------------*/
+    /* Set the default input options:
+       options.Fact = DOFACT;
+       options.Equil = YES;
+       options.ParSymbFact = NO;
+       options.ColPerm = METIS_AT_PLUS_A;
+       options.RowPerm = LargeDiag_MC64;
+       options.ReplaceTinyPivot = YES;
+       options.IterRefine = DOUBLE;
+       options.Trans = NOTRANS;
+       options.SolveInitialized = NO;
+       options.RefineInitialized = NO;
+       options.PrintStat = YES;
+       options.num_lookaheads = 10;
+       options.lookahead_etree = NO;
+       options.SymPattern = NO;
+       options.DiagInv = NO;
+     */
+    set_default_options_dist (&options);
+#if 0
+    options.RowPerm = NOROWPERM;
+    options.IterRefine = NOREFINE;
+    options.ColPerm = NATURAL;
+    options.Equil = NO;
+    options.ReplaceTinyPivot = NO;
+#endif
+
+    if (!iam) {
+        print_sp_ienv_dist(&options);
+        print_options_dist(&options);
+        fflush(stdout);
+    }
+
+    // matrix is on 3D process grid
+    m = A.nrow;
+    n = A.ncol;
+
+    /* Initialize ScalePermstruct and LUstruct. */
+    dScalePermstructInit (m, n, &ScalePermstruct);
+    dLUstructInit (n, &LUstruct);
+
+    /* Initialize the statistics variables. */
+    PStatInit (&stat);
+
+    /* Call the linear equation solver. */
+    pdgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
+               &LUstruct, &SOLVEstruct, berr, &stat, &info);
+
+    /* Check the accuracy of the solution. */
+    if ( !iam ) printf("\tSolve the first system:\n");
+    pdinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc,
+                      nrhs, b, ldb, xtrue, ldx, grid.comm);
+
+    if ( grid.zscp.Iam == 0 ) { // process layer 0
+        PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/
+    }
+    PStatFree (&stat);
+    fflush(stdout);
+
+    /* ------------------------------------------------------------
+       2. NOW SOLVE ANOTHER SYSTEM WITH THE SAME A BUT DIFFERENT
+       RIGHT-HAND SIDE. WE WILL USE THE EXISTING L AND U FACTORS IN
+       LUSTRUCT OBTAINED FROM A PREVIOUS FACTORIZATION.
+       ------------------------------------------------------------*/
+    options.Fact = FACTORED;  /* Indicate the factored form of A is supplied. */
+    PStatInit(&stat);         /* Initialize the statistics variables. */
+
+    nrhs = 1;
+    pdgssvx3d (&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid,
+               &LUstruct, &SOLVEstruct, berr, &stat, &info);
+
+    /* Check the accuracy of the solution. */
+    if ( !iam ) printf("\tSolve the system with a different B:\n");
+    pdinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc,
+                      nrhs, b1, ldb, xtrue, ldx, grid.comm);
+
+    /* ------------------------------------------------------------
+       DEALLOCATE STORAGE.
+       ------------------------------------------------------------ */
+    if ( grid.zscp.Iam == 0 ) { // process layer 0
+
+        PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/
+
+        dDestroy_LU (n, &(grid.grid2d), &LUstruct);
+        dSolveFinalize (&options, &SOLVEstruct);
+    } else { // Process layers not equal 0
+        dDeAllocLlu_3d(n, &LUstruct, &grid);
+        dDeAllocGlu_3d(&LUstruct);
+    }
+
+    dDestroy_A3d_gathered_on_2d(&SOLVEstruct, &grid);
+
+    Destroy_CompRowLoc_Matrix_dist (&A);
+    SUPERLU_FREE (b);
+    SUPERLU_FREE (b1);
+    SUPERLU_FREE (b2);
+    SUPERLU_FREE (xtrue);
+    SUPERLU_FREE (berr);
+    dScalePermstructFree (&ScalePermstruct);
+    dLUstructFree (&LUstruct);
+    PStatFree (&stat);
+
+    /* ------------------------------------------------------------
+       RELEASE THE SUPERLU PROCESS GRID.
+       ------------------------------------------------------------ */
+out:
+    superlu_gridexit3d (&grid);
+
+    /* ------------------------------------------------------------
+       TERMINATES THE MPI EXECUTION ENVIRONMENT.
+       ------------------------------------------------------------ */
+    MPI_Finalize ();
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (iam, "Exit main()");
+#endif
+
+}
+
+
+int
+cpp_defs ()
+{
+    printf (".. CPP definitions:\n");
+#if ( PRNTlevel>=1 )
+    printf ("\tPRNTlevel = %d\n", PRNTlevel);
+#endif
+#if ( DEBUGlevel>=1 )
+    printf ("\tDEBUGlevel = %d\n", DEBUGlevel);
+#endif
+#if ( PROFlevel>=1 )
+    printf ("\tPROFlevel = %d\n", PROFlevel);
+#endif
+    printf ("....\n");
+    return 0;
+}
diff --git a/EXAMPLE/psdrive3d1.c b/EXAMPLE/psdrive3d1.c
new file mode 100644
index 00000000..57f5df58
--- /dev/null
+++ b/EXAMPLE/psdrive3d1.c
@@ -0,0 +1,433 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Driver program for PSGSSVX3D example
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab 
+ * September 10, 2021
+ *
+ */
+#include "superlu_sdefs.h"  
+
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *
+ * The driver program PSDRIVE3D1.
+ *
+ * This example illustrates how to use PSGSSVX3D to solve systems
+ * with the same A but different right-hand sides, possibly with a
+ * different number of right-hand sides.
+ * In this case, we factorize A only once in the first call to PSGSSVX3D,
+ * and reuse the following data structures in the subsequent call to
+ * PSGSSVX3D:
+ *        ScalePermstruct  : DiagScale, R, C, perm_r, perm_c
+ *        LUstruct         : Glu_persist, Llu
+ *        SOLVEstruct      : communication metadata for SpTRSV, SpMV, and
+ *                           3D<->2D gather/scatter of {A,B} stored in A3d.
+ * 
+ * The program may be run by typing:
+ *    mpiexec -np <p> psdrive3d -r <proc rows> -c <proc columns> \
+ *                   -d <proc Z-dimension> <input_matrix_file>
+ * NOTE: total number of processes p = r * c * d
+ *       (d must be a power-of-two, e.g., 1, 2, 4, ...)
+ *
+ * </pre>
+ */
+
+static void matCheck(int n, int m, float* A, int LDA,
+	   float* B, int LDB)
+{
+    for(int j = 0; j < m; j++)
+	for (int i = 0; i < n; ++i) {
+	    assert(A[i + LDA*j] == B[i + LDB*j]);
+	}
+    printf("B check passed\n");
+}
+
+static void checkNRFMT(NRformat_loc *A, NRformat_loc *B)
+{
+    assert(A->nnz_loc == B->nnz_loc);
+    assert(A->m_loc == B->m_loc);
+    assert(A->fst_row == B->fst_row);
+
+#if 0
+    double *Aval = (double *)A->nzval, *Bval = (double *)B->nzval;
+    Printdouble5("A", A->nnz_loc, Aval);
+    Printdouble5("B", B->nnz_loc, Bval);
+    fflush(stdout);
+#endif
+
+    float * Aval = (float *) A->nzval;
+    float * Bval = (float *) B->nzval;
+    for (int_t i = 0; i < A->nnz_loc; i++)
+    {
+        assert( Aval[i] == Bval[i] );
+        assert((A->colind)[i] == (B->colind)[i]);
+        printf("colind[] correct\n");
+    }
+
+    for (int_t i = 0; i < A->m_loc + 1; i++)
+    {
+        assert((A->rowptr)[i] == (B->rowptr)[i]);
+    }
+
+    printf("Matrix check passed\n");
+
+}
+
+int
+main (int argc, char *argv[])
+{
+    superlu_dist_options_t options;
+    SuperLUStat_t stat;
+    SuperMatrix A;  // Now, A is on all 3D processes
+    sScalePermstruct_t ScalePermstruct;
+    sLUstruct_t LUstruct;
+    sSOLVEstruct_t SOLVEstruct;
+    gridinfo3d_t grid;
+    float *berr;
+    float *b, *xtrue, *b1, *b2;
+    int m, n, i, j, m_loc;
+    int nprow, npcol, npdep;
+    int iam, info, ldb, ldx, nrhs;
+    char **cpp, c, *suffix;
+    FILE *fp, *fopen ();
+    extern int cpp_defs ();
+    int ii, omp_mpi_level;
+
+    nprow = 1;  /* Default process rows.      */
+    npcol = 1;  /* Default process columns.   */
+    npdep = 1;  /* replication factor must be power of two */
+    nrhs = 1;   /* Number of right-hand side. */
+
+    /* ------------------------------------------------------------
+       INITIALIZE MPI ENVIRONMENT.
+       ------------------------------------------------------------ */
+    // MPI_Init (&argc, &argv);
+    int required = MPI_THREAD_MULTIPLE;
+    int provided;
+    MPI_Init_thread(&argc, &argv, required, &provided);
+    if (provided < required)
+    {
+        int rank;
+        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+        if (!rank) {
+            printf("The MPI library doesn't provide MPI_THREAD_MULTIPLE \n");
+            printf("\tprovided omp_mpi_level: %d\n", provided);
+        }
+    }
+
+    /* Parse command line argv[]. */
+    for (cpp = argv + 1; *cpp; ++cpp)
+    {
+        if (**cpp == '-')
+        {
+            c = *(*cpp + 1);
+            ++cpp;
+            switch (c)
+            {
+            case 'h':
+                printf ("Options:\n");
+                printf ("\t-r <int>: process rows    (default %d)\n", nprow);
+                printf ("\t-c <int>: process columns (default %d)\n", npcol);
+                printf ("\t-d <int>: process Z-dimension (default %d)\n", npdep);
+                exit (0);
+                break;
+            case 'r':
+                nprow = atoi (*cpp);
+                break;
+            case 'c':
+                npcol = atoi (*cpp);
+                break;
+            case 'd':
+                npdep = atoi (*cpp);
+                break;
+            }
+        }
+        else
+        {   /* Last arg is considered a filename */
+            if (!(fp = fopen (*cpp, "r")))
+            {
+                ABORT ("File does not exist");
+            }
+            break;
+        }
+    }
+
+    /* ------------------------------------------------------------
+       INITIALIZE THE SUPERLU PROCESS GRID.
+       ------------------------------------------------------------ */
+    superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid);
+
+    if(grid.iam==0) {
+        MPI_Query_thread(&omp_mpi_level);
+        switch (omp_mpi_level) {
+        case MPI_THREAD_SINGLE:
+            printf("MPI_Query_thread with MPI_THREAD_SINGLE\n");
+            fflush(stdout);
+            break;
+        case MPI_THREAD_FUNNELED:
+            printf("MPI_Query_thread with MPI_THREAD_FUNNELED\n");
+            fflush(stdout);
+            break;
+        case MPI_THREAD_SERIALIZED:
+            printf("MPI_Query_thread with MPI_THREAD_SERIALIZED\n");
+            fflush(stdout);
+            break;
+        case MPI_THREAD_MULTIPLE:
+            printf("MPI_Query_thread with MPI_THREAD_MULTIPLE\n");
+            fflush(stdout);
+            break;
+        }
+    }
+
+    /* Bail out if I do not belong in the grid. */
+    iam = grid.iam;
+    if (iam == -1) goto out;
+    if (!iam) {
+        int v_major, v_minor, v_bugfix;
+#ifdef __INTEL_COMPILER
+        printf("__INTEL_COMPILER is defined\n");
+#endif
+        printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__);
+
+        superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix);
+        printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix);
+
+        printf("Input matrix file:\t%s\n", *cpp);
+        printf("3D process grid: %d X %d X %d\n", nprow, npcol, npdep);
+        //printf("2D Process grid: %d X %d\n", (int)grid.nprow, (int)grid.npcol);
+        fflush(stdout);
+    }
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (iam, "Enter main()");
+#endif
+
+    /* ------------------------------------------------------------
+       GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE.
+       ------------------------------------------------------------ */
+    for (ii = 0; ii < strlen(*cpp); ii++) {
+        if ((*cpp)[ii] == '.') {
+            suffix = &((*cpp)[ii+1]);
+        }
+    }
+    screate_matrix_postfix3d(&A, nrhs, &b, &ldb,
+                             &xtrue, &ldx, fp, suffix, &(grid));
+
+#if 0  // checking code for the Gather routine
+        matCheck(Astore->m_loc, nrhs, B2d, Astore->m_loc, bref, ldb);
+    }
+    // MPI_Finalize(); exit(0);
+#endif
+
+    /* Save two copies of the RHS */
+    if ( !(b1 = floatMalloc_dist(ldb * nrhs)) )
+        ABORT("Malloc fails for b1[]");
+    if ( !(b2 = floatMalloc_dist(ldb * nrhs)) )
+        ABORT("Malloc fails for b2[]");
+    for (j = 0; j < nrhs; ++j) {
+        for (i = 0; i < ldb; ++i) {
+            b1[i+j*ldb] = b[i+j*ldb];
+            b2[i+j*ldb] = b[i+j*ldb];
+        }
+    }
+
+    if (!(berr = floatMalloc_dist (nrhs)))
+        ABORT ("Malloc fails for berr[].");
+
+    /* ------------------------------------------------------------
+       1. SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME, WITH 1 RHS.
+       ------------------------------------------------------------*/
+    /* Set the default input options:
+       options.Fact = DOFACT;
+       options.Equil = YES;
+       options.ParSymbFact = NO;
+       options.ColPerm = METIS_AT_PLUS_A;
+       options.RowPerm = LargeDiag_MC64;
+       options.ReplaceTinyPivot = YES;
+       options.IterRefine = DOUBLE;
+       options.Trans = NOTRANS;
+       options.SolveInitialized = NO;
+       options.RefineInitialized = NO;
+       options.PrintStat = YES;
+       options.num_lookaheads = 10;
+       options.lookahead_etree = NO;
+       options.SymPattern = NO;
+       options.DiagInv = NO;
+     */
+    set_default_options_dist (&options);
+#if 0
+    options.RowPerm = NOROWPERM;
+    options.IterRefine = NOREFINE;
+    options.ColPerm = NATURAL;
+    options.Equil = NO;
+    options.ReplaceTinyPivot = NO;
+#endif
+
+    if (!iam) {
+        print_sp_ienv_dist(&options);
+        print_options_dist(&options);
+        fflush(stdout);
+    }
+
+    // matrix is on 3D process grid
+    m = A.nrow;
+    n = A.ncol;
+
+    /* Initialize ScalePermstruct and LUstruct. */
+    sScalePermstructInit (m, n, &ScalePermstruct);
+    sLUstructInit (n, &LUstruct);
+
+    /* Initialize the statistics variables. */
+    PStatInit (&stat);
+
+    /* Call the linear equation solver. */
+    psgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
+               &LUstruct, &SOLVEstruct, berr, &stat, &info);
+
+    /* Check the accuracy of the solution. */
+    if ( !iam ) printf("\tSolve the first system:\n");
+    psinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc,
+                      nrhs, b, ldb, xtrue, ldx, grid.comm);
+
+    if ( grid.zscp.Iam == 0 ) { // process layer 0
+        PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/
+    }
+    PStatFree (&stat);
+    fflush(stdout);
+
+    /* ------------------------------------------------------------
+       2. NOW SOLVE ANOTHER SYSTEM WITH THE SAME A BUT DIFFERENT
+       RIGHT-HAND SIDE. WE WILL USE THE EXISTING L AND U FACTORS IN
+       LUSTRUCT OBTAINED FROM A PREVIOUS FACTORIZATION.
+       ------------------------------------------------------------*/
+    options.Fact = FACTORED;  /* Indicate the factored form of A is supplied. */
+    PStatInit(&stat);         /* Initialize the statistics variables. */
+
+    nrhs = 1;
+    psgssvx3d (&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid,
+               &LUstruct, &SOLVEstruct, berr, &stat, &info);
+
+    /* Check the accuracy of the solution. */
+    if ( !iam ) printf("\tSolve the system with a different B:\n");
+    psinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc,
+                      nrhs, b1, ldb, xtrue, ldx, grid.comm);
+
+    /* ------------------------------------------------------------
+       DEALLOCATE STORAGE.
+       ------------------------------------------------------------ */
+    if ( grid.zscp.Iam == 0 ) { // process layer 0
+
+        PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/
+
+        sDestroy_LU (n, &(grid.grid2d), &LUstruct);
+        sSolveFinalize (&options, &SOLVEstruct);
+    } else { // Process layers not equal 0
+        sDeAllocLlu_3d(n, &LUstruct, &grid);
+        sDeAllocGlu_3d(&LUstruct);
+    }
+
+    sDestroy_A3d_gathered_on_2d(&SOLVEstruct, &grid);
+
+    Destroy_CompRowLoc_Matrix_dist (&A);
+    SUPERLU_FREE (b);
+    SUPERLU_FREE (b1);
+    SUPERLU_FREE (b2);
+    SUPERLU_FREE (xtrue);
+    SUPERLU_FREE (berr);
+    sScalePermstructFree (&ScalePermstruct);
+    sLUstructFree (&LUstruct);
+    PStatFree (&stat);
+
+    /* ------------------------------------------------------------
+       RELEASE THE SUPERLU PROCESS GRID.
+       ------------------------------------------------------------ */
+out:
+    superlu_gridexit3d (&grid);
+
+    /* ------------------------------------------------------------
+       TERMINATES THE MPI EXECUTION ENVIRONMENT.
+       ------------------------------------------------------------ */
+    MPI_Finalize ();
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (iam, "Exit main()");
+#endif
+
+}
+
+
+int
+cpp_defs ()
+{
+    printf (".. CPP definitions:\n");
+#if ( PRNTlevel>=1 )
+    printf ("\tPRNTlevel = %d\n", PRNTlevel);
+#endif
+#if ( DEBUGlevel>=1 )
+    printf ("\tDEBUGlevel = %d\n", DEBUGlevel);
+#endif
+#if ( PROFlevel>=1 )
+    printf ("\tPROFlevel = %d\n", PROFlevel);
+#endif
+    printf ("....\n");
+    return 0;
+}
diff --git a/EXAMPLE/pzdrive3d.c b/EXAMPLE/pzdrive3d.c
index 2eb12796..8bbed679 100644
--- a/EXAMPLE/pzdrive3d.c
+++ b/EXAMPLE/pzdrive3d.c
@@ -14,8 +14,9 @@ at the top-level directory.
 *
 * <pre>
  * -- Distributed SuperLU routine (version 7.0.0) --
- * Lawrence Berkeley National Lab, Georgia Institute of Technology.
- * May 10, 2019
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab 
+ * May 12, 2021
  *
  */
 #include "superlu_zdefs.h"  
@@ -76,8 +77,8 @@ static void checkNRFMT(NRformat_loc*A, NRformat_loc*B)
 
 #if 0
     double *Aval = (double *)A->nzval, *Bval = (double *)B->nzval;
-    PrintDouble5("A", A->nnz_loc, Aval);
-    PrintDouble5("B", B->nnz_loc, Bval);
+    Printdouble5("A", A->nnz_loc, Aval);
+    Printdouble5("B", B->nnz_loc, Bval);
     fflush(stdout);
 #endif
 
@@ -135,7 +136,10 @@ main (int argc, char *argv[])
     {
         int rank;
         MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-        if (!rank) printf("The MPI library doesn't provide MPI_THREAD_MULTIPLE \n");
+        if (!rank) {
+	    printf("The MPI library doesn't provide MPI_THREAD_MULTIPLE \n");
+	    printf("\tprovided omp_mpi_level: %d\n", provided);
+        }
     }
 
     /* Parse command line argv[]. */
@@ -358,13 +362,13 @@ main (int argc, char *argv[])
 	PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/
 
         zDestroy_LU (n, &(grid.grid2d), &LUstruct);
-        if (options.SolveInitialized) {
-            zSolveFinalize (&options, &SOLVEstruct);
-        }
+        zSolveFinalize (&options, &SOLVEstruct);
     } else { // Process layers not equal 0
         zDeAllocLlu_3d(n, &LUstruct, &grid);
         zDeAllocGlu_3d(&LUstruct);
     }
+    
+    zDestroy_A3d_gathered_on_2d(&SOLVEstruct, &grid);
 
     Destroy_CompRowLoc_Matrix_dist (&A);
     SUPERLU_FREE (b);
diff --git a/EXAMPLE/pzdrive3d1.c b/EXAMPLE/pzdrive3d1.c
new file mode 100644
index 00000000..4a9f22a6
--- /dev/null
+++ b/EXAMPLE/pzdrive3d1.c
@@ -0,0 +1,433 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required 
+approvals from U.S. Dept. of Energy) 
+
+All rights reserved. 
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file
+ * \brief Driver program for PZGSSVX3D example
+ *
+ * 
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab 
+ * September 10, 2021
+ *
+ */
+#include "superlu_zdefs.h"  
+
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *
+ * The driver program PZDRIVE3D1.
+ *
+ * This example illustrates how to use PZGSSVX3D to solve systems
+ * with the same A but different right-hand sides, possibly with a
+ * different number of right-hand sides.
+ * In this case, we factorize A only once in the first call to PZGSSVX3D,
+ * and reuse the following data structures in the subsequent call to
+ * PZGSSVX3D:
+ *        ScalePermstruct  : DiagScale, R, C, perm_r, perm_c
+ *        LUstruct         : Glu_persist, Llu
+ *        SOLVEstruct      : communication metadata for SpTRSV, SpMV, and
+ *                           3D<->2D gather/scatter of {A,B} stored in A3d.
+ * 
+ * The program may be run by typing:
+ *    mpiexec -np <p> pzdrive3d -r <proc rows> -c <proc columns> \
+ *                   -d <proc Z-dimension> <input_matrix_file>
+ * NOTE: total number of processes p = r * c * d
+ *       (d must be a power-of-two, e.g., 1, 2, 4, ...)
+ *
+ * </pre>
+ */
+
+static void matCheck(int n, int m, doublecomplex* A, int LDA,
+	   doublecomplex* B, int LDB)
+{
+    for(int j = 0; j < m; j++)
+	for (int i = 0; i < n; ++i) {
+	    assert(A[i + LDA*j].r == B[i + LDB*j].r);
+	    assert(A[i + LDA*j].i == B[i + LDB*j].i);
+	}
+    printf("B check passed\n");
+}
+
+static void checkNRFMT(NRformat_loc *A, NRformat_loc *B)
+{
+    assert(A->nnz_loc == B->nnz_loc);
+    assert(A->m_loc == B->m_loc);
+    assert(A->fst_row == B->fst_row);
+
+#if 0
+    double *Aval = (double *)A->nzval, *Bval = (double *)B->nzval;
+    Printdouble5("A", A->nnz_loc, Aval);
+    Printdouble5("B", B->nnz_loc, Bval);
+    fflush(stdout);
+#endif
+
+    doublecomplex * Aval = (doublecomplex *) A->nzval;
+    doublecomplex * Bval = (doublecomplex *) B->nzval;
+    for (int_t i = 0; i < A->nnz_loc; i++)
+    {
+        assert( (Aval[i].r == Bval[i].r) && (Aval[i].i == Bval[i].i) );
+        assert((A->colind)[i] == (B->colind)[i]);
+        printf("colind[] correct\n");
+    }
+
+    for (int_t i = 0; i < A->m_loc + 1; i++)
+    {
+        assert((A->rowptr)[i] == (B->rowptr)[i]);
+    }
+
+    printf("Matrix check passed\n");
+
+}
+
+int
+main (int argc, char *argv[])
+{
+    superlu_dist_options_t options;
+    SuperLUStat_t stat;
+    SuperMatrix A;  // Now, A is on all 3D processes
+    zScalePermstruct_t ScalePermstruct;
+    zLUstruct_t LUstruct;
+    zSOLVEstruct_t SOLVEstruct;
+    gridinfo3d_t grid;
+    double *berr;
+    doublecomplex *b, *xtrue, *b1, *b2;
+    int m, n, i, j, m_loc;
+    int nprow, npcol, npdep;
+    int iam, info, ldb, ldx, nrhs;
+    char **cpp, c, *suffix;
+    FILE *fp, *fopen ();
+    extern int cpp_defs ();
+    int ii, omp_mpi_level;
+
+    nprow = 1;  /* Default process rows.      */
+    npcol = 1;  /* Default process columns.   */
+    npdep = 1;  /* replication factor must be power of two */
+    nrhs = 1;   /* Number of right-hand side. */
+
+    /* ------------------------------------------------------------
+       INITIALIZE MPI ENVIRONMENT.
+       ------------------------------------------------------------ */
+    // MPI_Init (&argc, &argv);
+    int required = MPI_THREAD_MULTIPLE;
+    int provided;
+    MPI_Init_thread(&argc, &argv, required, &provided);
+    if (provided < required)
+    {
+        int rank;
+        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+        if (!rank) {
+            printf("The MPI library doesn't provide MPI_THREAD_MULTIPLE \n");
+            printf("\tprovided omp_mpi_level: %d\n", provided);
+        }
+    }
+
+    /* Parse command line argv[]. */
+    for (cpp = argv + 1; *cpp; ++cpp)
+    {
+        if (**cpp == '-')
+        {
+            c = *(*cpp + 1);
+            ++cpp;
+            switch (c)
+            {
+            case 'h':
+                printf ("Options:\n");
+                printf ("\t-r <int>: process rows    (default %d)\n", nprow);
+                printf ("\t-c <int>: process columns (default %d)\n", npcol);
+                printf ("\t-d <int>: process Z-dimension (default %d)\n", npdep);
+                exit (0);
+                break;
+            case 'r':
+                nprow = atoi (*cpp);
+                break;
+            case 'c':
+                npcol = atoi (*cpp);
+                break;
+            case 'd':
+                npdep = atoi (*cpp);
+                break;
+            }
+        }
+        else
+        {   /* Last arg is considered a filename */
+            if (!(fp = fopen (*cpp, "r")))
+            {
+                ABORT ("File does not exist");
+            }
+            break;
+        }
+    }
+
+    /* ------------------------------------------------------------
+       INITIALIZE THE SUPERLU PROCESS GRID.
+       ------------------------------------------------------------ */
+    superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid);
+
+    if(grid.iam==0) {
+        MPI_Query_thread(&omp_mpi_level);
+        switch (omp_mpi_level) {
+        case MPI_THREAD_SINGLE:
+            printf("MPI_Query_thread with MPI_THREAD_SINGLE\n");
+            fflush(stdout);
+            break;
+        case MPI_THREAD_FUNNELED:
+            printf("MPI_Query_thread with MPI_THREAD_FUNNELED\n");
+            fflush(stdout);
+            break;
+        case MPI_THREAD_SERIALIZED:
+            printf("MPI_Query_thread with MPI_THREAD_SERIALIZED\n");
+            fflush(stdout);
+            break;
+        case MPI_THREAD_MULTIPLE:
+            printf("MPI_Query_thread with MPI_THREAD_MULTIPLE\n");
+            fflush(stdout);
+            break;
+        }
+    }
+
+    /* Bail out if I do not belong in the grid. */
+    iam = grid.iam;
+    if (iam == -1) goto out;
+    if (!iam) {
+        int v_major, v_minor, v_bugfix;
+#ifdef __INTEL_COMPILER
+        printf("__INTEL_COMPILER is defined\n");
+#endif
+        printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__);
+
+        superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix);
+        printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix);
+
+        printf("Input matrix file:\t%s\n", *cpp);
+        printf("3D process grid: %d X %d X %d\n", nprow, npcol, npdep);
+        //printf("2D Process grid: %d X %d\n", (int)grid.nprow, (int)grid.npcol);
+        fflush(stdout);
+    }
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (iam, "Enter main()");
+#endif
+
+    /* ------------------------------------------------------------
+       GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE.
+       ------------------------------------------------------------ */
+    for (ii = 0; ii < strlen(*cpp); ii++) {
+        if ((*cpp)[ii] == '.') {
+            suffix = &((*cpp)[ii+1]);
+        }
+    }
+    zcreate_matrix_postfix3d(&A, nrhs, &b, &ldb,
+                             &xtrue, &ldx, fp, suffix, &(grid));
+
+#if 0  // checking code for the Gather routine
+        matCheck(Astore->m_loc, nrhs, B2d, Astore->m_loc, bref, ldb);
+    }
+    // MPI_Finalize(); exit(0);
+#endif
+
+    /* Save two copies of the RHS */
+    if ( !(b1 = doublecomplexMalloc_dist(ldb * nrhs)) )
+        ABORT("Malloc fails for b1[]");
+    if ( !(b2 = doublecomplexMalloc_dist(ldb * nrhs)) )
+        ABORT("Malloc fails for b2[]");
+    for (j = 0; j < nrhs; ++j) {
+        for (i = 0; i < ldb; ++i) {
+            b1[i+j*ldb] = b[i+j*ldb];
+            b2[i+j*ldb] = b[i+j*ldb];
+        }
+    }
+
+    if (!(berr = doubleMalloc_dist (nrhs)))
+        ABORT ("Malloc fails for berr[].");
+
+    /* ------------------------------------------------------------
+       1. SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME, WITH 1 RHS.
+       ------------------------------------------------------------*/
+    /* Set the default input options:
+       options.Fact = DOFACT;
+       options.Equil = YES;
+       options.ParSymbFact = NO;
+       options.ColPerm = METIS_AT_PLUS_A;
+       options.RowPerm = LargeDiag_MC64;
+       options.ReplaceTinyPivot = YES;
+       options.IterRefine = DOUBLE;
+       options.Trans = NOTRANS;
+       options.SolveInitialized = NO;
+       options.RefineInitialized = NO;
+       options.PrintStat = YES;
+       options.num_lookaheads = 10;
+       options.lookahead_etree = NO;
+       options.SymPattern = NO;
+       options.DiagInv = NO;
+     */
+    set_default_options_dist (&options);
+#if 0
+    options.RowPerm = NOROWPERM;
+    options.IterRefine = NOREFINE;
+    options.ColPerm = NATURAL;
+    options.Equil = NO;
+    options.ReplaceTinyPivot = NO;
+#endif
+
+    if (!iam) {
+        print_sp_ienv_dist(&options);
+        print_options_dist(&options);
+        fflush(stdout);
+    }
+
+    // matrix is on 3D process grid
+    m = A.nrow;
+    n = A.ncol;
+
+    /* Initialize ScalePermstruct and LUstruct. */
+    zScalePermstructInit (m, n, &ScalePermstruct);
+    zLUstructInit (n, &LUstruct);
+
+    /* Initialize the statistics variables. */
+    PStatInit (&stat);
+
+    /* Call the linear equation solver. */
+    pzgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
+               &LUstruct, &SOLVEstruct, berr, &stat, &info);
+
+    /* Check the accuracy of the solution. */
+    if ( !iam ) printf("\tSolve the first system:\n");
+    pzinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc,
+                      nrhs, b, ldb, xtrue, ldx, grid.comm);
+
+    if ( grid.zscp.Iam == 0 ) { // process layer 0
+        PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/
+    }
+    PStatFree (&stat);
+    fflush(stdout);
+
+    /* ------------------------------------------------------------
+       2. NOW SOLVE ANOTHER SYSTEM WITH THE SAME A BUT DIFFERENT
+       RIGHT-HAND SIDE. WE WILL USE THE EXISTING L AND U FACTORS IN
+       LUSTRUCT OBTAINED FROM A PREVIOUS FACTORIZATION.
+       ------------------------------------------------------------*/
+    options.Fact = FACTORED;  /* Indicate the factored form of A is supplied. */
+    PStatInit(&stat);         /* Initialize the statistics variables. */
+
+    nrhs = 1;
+    pzgssvx3d (&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid,
+               &LUstruct, &SOLVEstruct, berr, &stat, &info);
+
+    /* Check the accuracy of the solution. */
+    if ( !iam ) printf("\tSolve the system with a different B:\n");
+    pzinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc,
+                      nrhs, b1, ldb, xtrue, ldx, grid.comm);
+
+    /* ------------------------------------------------------------
+       DEALLOCATE STORAGE.
+       ------------------------------------------------------------ */
+    if ( grid.zscp.Iam == 0 ) { // process layer 0
+
+        PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/
+
+        zDestroy_LU (n, &(grid.grid2d), &LUstruct);
+        zSolveFinalize (&options, &SOLVEstruct);
+    } else { // Process layers not equal 0
+        zDeAllocLlu_3d(n, &LUstruct, &grid);
+        zDeAllocGlu_3d(&LUstruct);
+    }
+
+    zDestroy_A3d_gathered_on_2d(&SOLVEstruct, &grid);
+
+    Destroy_CompRowLoc_Matrix_dist (&A);
+    SUPERLU_FREE (b);
+    SUPERLU_FREE (b1);
+    SUPERLU_FREE (b2);
+    SUPERLU_FREE (xtrue);
+    SUPERLU_FREE (berr);
+    zScalePermstructFree (&ScalePermstruct);
+    zLUstructFree (&LUstruct);
+    PStatFree (&stat);
+
+    /* ------------------------------------------------------------
+       RELEASE THE SUPERLU PROCESS GRID.
+       ------------------------------------------------------------ */
+out:
+    superlu_gridexit3d (&grid);
+
+    /* ------------------------------------------------------------
+       TERMINATES THE MPI EXECUTION ENVIRONMENT.
+       ------------------------------------------------------------ */
+    MPI_Finalize ();
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (iam, "Exit main()");
+#endif
+
+}
+
+
+int
+cpp_defs ()
+{
+    printf (".. CPP definitions:\n");
+#if ( PRNTlevel>=1 )
+    printf ("\tPRNTlevel = %d\n", PRNTlevel);
+#endif
+#if ( DEBUGlevel>=1 )
+    printf ("\tDEBUGlevel = %d\n", DEBUGlevel);
+#endif
+#if ( PROFlevel>=1 )
+    printf ("\tPROFlevel = %d\n", PROFlevel);
+#endif
+    printf ("....\n");
+    return 0;
+}
diff --git a/FORTRAN/f_pddrive3d.F90 b/FORTRAN/f_pddrive3d.F90
index eed4f9c4..b29eeb7d 100644
--- a/FORTRAN/f_pddrive3d.F90
+++ b/FORTRAN/f_pddrive3d.F90
@@ -16,7 +16,7 @@
 !! <pre>
 !! -- Distributed SuperLU routine (version 7.0) --
 !! Lawrence Berkeley National Lab, Univ. of California Berkeley.
-!! January 2, 2021
+!! May 12, 2021
 !! </pre>
 !
@@ -47,7 +47,6 @@ program f_pddrive3d
       include 'mpif.h'
       integer maxn, maxnz, maxnrhs
      parameter ( maxn = 10000, maxnz = 100000, maxnrhs = 10 )
-      integer rowind(maxnz), colptr(maxn)
       real*8  values(maxnz), b(maxn), berr(maxnrhs), xtrue(maxn)
 #if (XSDK_INDEX_SIZE==64)
       integer*8 nnz
@@ -138,11 +137,8 @@ program f_pddrive3d
       call f_Destroy_CompRowLoc_Mat_dist(A)
       call f_dScalePermstructFree(ScalePermstruct)
       call f_dDestroy_LU_SOLVE_struct_3d(options, n, grid, LUstruct, SOLVEstruct)
-!     call f_LUstructFree(LUstruct)
-!     call get_superlu_options(options, SolveInitialized=init)
-!     if (init == YES) then
-!         call f_dSolveFinalize(options, SOLVEstruct)
-!     endif
+
+      call f_dDestroy_A3d_gathered_on_2d(SOLVEstruct, grid)
 
 ! Release the SuperLU process grid
 100   call f_superlu_gridexit(grid)
diff --git a/FORTRAN/f_pzdrive3d.F90 b/FORTRAN/f_pzdrive3d.F90
index 07fd9c0f..a299d7b3 100644
--- a/FORTRAN/f_pzdrive3d.F90
+++ b/FORTRAN/f_pzdrive3d.F90
@@ -15,7 +15,7 @@
 !! <pre>
 !! -- Distributed SuperLU routine (version 7.0) --
 !! Lawrence Berkeley National Lab, Univ. of California Berkeley.
-!! January 2, 2021
+!! May 12, 2021
 !! </pre>
 !
@@ -46,7 +46,6 @@ program f_pzdrive3d
       include 'mpif.h'
       integer maxn, maxnz, maxnrhs
      parameter ( maxn = 10000, maxnz = 100000, maxnrhs = 10 )
-      integer rowind(maxnz), colptr(maxn)
       double complex values(maxnz), b(maxn), xtrue(maxn)
       real*8  berr(maxnrhs)
 #if (XSDK_INDEX_SIZE==64)
@@ -138,11 +137,8 @@ program f_pzdrive3d
       call f_Destroy_CompRowLoc_Mat_dist(A)
       call f_zScalePermstructFree(ScalePermstruct)
       call f_zDestroy_LU_SOLVE_struct_3d(options, n, grid, LUstruct, SOLVEstruct)
-!     call f_LUstructFree(LUstruct)
-!     call get_superlu_options(options, SolveInitialized=init)
-!     if (init == YES) then
-!         call f_dSolveFinalize(options, SOLVEstruct)
-!     endif
+
+      call f_zDestroy_A3d_gathered_on_2d(SOLVEstruct, grid)
 
 ! Release the SuperLU process grid
 100   call f_superlu_gridexit(grid)
diff --git a/FORTRAN/superlu_c2f_dwrap.c b/FORTRAN/superlu_c2f_dwrap.c
index 018d24a2..04755024 100644
--- a/FORTRAN/superlu_c2f_dwrap.c
+++ b/FORTRAN/superlu_c2f_dwrap.c
@@ -4,10 +4,11 @@
  * \brief C interface functions for the Fortran90 wrapper.
  *
  * <pre>
- * -- Distributed SuperLU routine (version 4.1) --
+ * -- Distributed SuperLU routine (version 7.0) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * October 2012
  * April 5, 2015
+ * May 12, 2021
  */
 
 #include "superlu_ddefs.h"
@@ -85,9 +86,7 @@ void f_dDestroy_LU_SOLVE_struct_3d(fptr *options, int *n, fptr *grid,
     
     if ( grid3d->zscp.Iam == 0 ) { // process layer 0
 	dDestroy_LU(*n, &(grid3d->grid2d), LUstruct_ptr);
-	if ( opt->SolveInitialized ) {
-	    dSolveFinalize(opt, (dSOLVEstruct_t *) *SOLVEstruct);
-	}
+	dSolveFinalize(opt, (dSOLVEstruct_t *) *SOLVEstruct);
     } else { // process layers not equal 0
         dDeAllocLlu_3d(*n, LUstruct_ptr, grid3d);
         dDeAllocGlu_3d(LUstruct_ptr);
@@ -96,6 +95,13 @@ void f_dDestroy_LU_SOLVE_struct_3d(fptr *options, int *n, fptr *grid,
     dLUstructFree(LUstruct_ptr);
 }
 
+void f_dDestroy_A3d_gathered_on_2d(fptr *SOLVEstruct, fptr *grid)
+{
+    dDestroy_A3d_gathered_on_2d((dSOLVEstruct_t *) *SOLVEstruct,
+                                (gridinfo3d_t *) *grid);
+}
+
+
 void f_dCreate_CompRowLoc_Mat_dist(fptr *A, int *m, int *n, int *nnz_loc,
 				   int *m_loc, int *fst_row, double *nzval,
 				   int_t *colind, int_t *rowptr, int *stype,
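
Aside on the wrapper added above: the Fortran interface passes every SuperLU object through an opaque integer handle (fptr), and each C wrapper casts the dereferenced handle back to the library's pointer type before calling the underlying routine, as f_dDestroy_A3d_gathered_on_2d does. A minimal sketch of the idiom follows; the fptr typedef and the struct are illustrative assumptions, not the library's definitions.

    /* Sketch of the opaque-handle idiom used by the f_* wrappers above.
       Assumption: fptr is an integer type wide enough to hold a pointer. */
    #include <stdint.h>

    typedef int64_t fptr;                      /* hypothetical definition */

    typedef struct { int initialized; } obj_t; /* stand-in for a SuperLU struct */

    /* Fortran passes the handle by reference; C recovers the pointer. */
    void f_touch_object(fptr *handle)
    {
        obj_t *p = (obj_t *) *handle;
        p->initialized = 1;
    }
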
diff --git a/FORTRAN/superlu_c2f_zwrap.c b/FORTRAN/superlu_c2f_zwrap.c
index 467d4131..48f050bf 100644
--- a/FORTRAN/superlu_c2f_zwrap.c
+++ b/FORTRAN/superlu_c2f_zwrap.c
@@ -3,10 +3,11 @@
  * \brief C interface functions for the Fortran90 wrapper.
  *
  * 
- * -- Distributed SuperLU routine (version 4.1) --
+ * -- Distributed SuperLU routine (version 7.0) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * October 2012
  * April 5, 2015
+ * May 12, 2021
  */
 
 #include "superlu_zdefs.h"
@@ -84,9 +85,7 @@ void f_zDestroy_LU_SOLVE_struct_3d(fptr *options, int *n, fptr *grid,
     
     if ( grid3d->zscp.Iam == 0 ) { // process layer 0
 	zDestroy_LU(*n, &(grid3d->grid2d), LUstruct_ptr);
-	if ( opt->SolveInitialized ) {
-	    zSolveFinalize(opt, (zSOLVEstruct_t *) *SOLVEstruct);
-	}
+	zSolveFinalize(opt, (zSOLVEstruct_t *) *SOLVEstruct);
     } else { // process layers not equal 0
         zDeAllocLlu_3d(*n, LUstruct_ptr, grid3d);
         zDeAllocGlu_3d(LUstruct_ptr);
@@ -95,6 +94,13 @@ void f_zDestroy_LU_SOLVE_struct_3d(fptr *options, int *n, fptr *grid,
     zLUstructFree(LUstruct_ptr);
 }
 
+void f_zDestroy_A3d_gathered_on_2d(fptr *SOLVEstruct, fptr *grid)
+{
+    zDestroy_A3d_gathered_on_2d((zSOLVEstruct_t *) *SOLVEstruct,
+                                (gridinfo3d_t *) *grid);
+}
+
+
 void f_zCreate_CompRowLoc_Mat_dist(fptr *A, int *m, int *n, int *nnz_loc,
 				   int *m_loc, int *fst_row, doublecomplex *nzval,
 				   int_t *colind, int_t *rowptr, int *stype,
diff --git a/FORTRAN/superlu_dist_config.fh b/FORTRAN/superlu_dist_config.fh
index cbe990cc..caa86f6b 100644
--- a/FORTRAN/superlu_dist_config.fh
+++ b/FORTRAN/superlu_dist_config.fh
@@ -4,6 +4,7 @@
 
 
 
+#define XSDK_INDEX_SIZE 64
 
 #if (XSDK_INDEX_SIZE == 64)
 #define _LONGINT 1
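
The XSDK_INDEX_SIZE 64 definition added above turns on 64-bit matrix indices for the Fortran side; _LONGINT then governs the width of int_t throughout the library. A sketch of the assumed convention (the authoritative typedef lives in SRC/superlu_defs.h):

    /* Sketch: how _LONGINT is assumed to select the index width. */
    #if (XSDK_INDEX_SIZE == 64)
    #define _LONGINT 1
    #endif

    #ifdef _LONGINT
    typedef long long int int_t;   /* 64-bit row/column indices */
    #else
    typedef int int_t;             /* default 32-bit indices */
    #endif
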
diff --git a/SRC/dnrformat_loc3d.c b/SRC/dnrformat_loc3d.c
index 2fc40fe1..3f1cb401 100644
--- a/SRC/dnrformat_loc3d.c
+++ b/SRC/dnrformat_loc3d.c
@@ -18,9 +18,8 @@ at the top-level directory.
  *
  * 
  * -- Distributed SuperLU routine (version 7.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley,
- * Oak Ridge National Lab.
- * October 22, 2020
+ * Lawrence Berkeley National Lab, Oak Ridge National Lab.
+ * May 12, 2021
  */
 
 #include "superlu_ddefs.h"
@@ -51,8 +50,8 @@ void dGatherNRformat_loc3d
  double *B,       // input
  int ldb, int nrhs, // input
  gridinfo3d_t *grid3d, 
- NRformat_loc3d **A3d_addr /* if Fact == DOFACT, it is an input.
-			      Otherwise, it is both input and may be modified */
+ NRformat_loc3d **A3d_addr /* If Fact == DOFACT, it is an input;
+ 		              otherwise it is input/output and may be modified */
  )
 {
     NRformat_loc3d *A3d = (NRformat_loc3d *) *A3d_addr;
@@ -77,7 +76,6 @@ void dGatherNRformat_loc3d
 	A3d = SUPERLU_MALLOC(sizeof(NRformat_loc3d));
 	A2d = SUPERLU_MALLOC(sizeof(NRformat_loc));
     
-
 	// find number of nnzs
 	int_t *nnz_counts; // number of local nonzeros relative to all processes
 	int_t *row_counts; // number of local rows relative to all processes
@@ -112,9 +110,9 @@ void dGatherNRformat_loc3d
 
 	if (grid3d->zscp.Iam == 0)
 	    {
-		A2d->colind = SUPERLU_MALLOC(nnz_disp[grid3d->npdep] * sizeof(int_t));
-		A2d->nzval = SUPERLU_MALLOC(nnz_disp[grid3d->npdep] * sizeof(double));
-		A2d->rowptr = SUPERLU_MALLOC((row_disp[grid3d->npdep] + 1) * sizeof(int_t));
+		A2d->colind = intMalloc_dist(nnz_disp[grid3d->npdep]);
+		A2d->nzval = doubleMalloc_dist(nnz_disp[grid3d->npdep]);
+		A2d->rowptr = intMalloc_dist((row_disp[grid3d->npdep] + 1));
 		A2d->rowptr[0] = 0;
 	    }
 
@@ -164,11 +162,11 @@ void dGatherNRformat_loc3d
 		}
 	    } /* end 2D layer grid-0 */
 
-	A3d->A_nfmt = A2d;
-	A3d->b_counts_int = b_counts_int;
-	A3d->b_disp = b_disp;
+	A3d->A_nfmt         = A2d;
+	A3d->b_counts_int   = b_counts_int;
+	A3d->b_disp         = b_disp;
 	A3d->row_counts_int = row_counts_int;
-	A3d->row_disp = row_disp;
+	A3d->row_disp       = row_disp;
 
 	/* free storage */
 	SUPERLU_FREE(nnz_counts);
@@ -176,28 +174,24 @@ void dGatherNRformat_loc3d
 	SUPERLU_FREE(row_counts);
 	SUPERLU_FREE(nnz_disp);
 	
-	*A3d_addr = A3d;  // return the pointer to A3d matrix
+	*A3d_addr = (NRformat_loc3d *) A3d; // return pointer to A3d struct
+	
     } /* end else: Factor from scratch */
 
-	A3d->m_loc = A->m_loc;
-	A3d->B3d = (double *) B; /* save the pointer to the original B
+    A3d->m_loc = A->m_loc;
+    A3d->B3d = (double *) B; /* save the pointer to the original B
 				    stored on 3D process grid.  */
-	A3d->ldb = ldb;
-	A3d->nrhs = nrhs;
+    A3d->ldb = ldb;
+    A3d->nrhs = nrhs;
 	
-    printf("dGather_loc3d(1): Fact %d; A3d %p\n", Fact, A3d); fflush(stdout);
-    
     /********* Gather B2d **********/
     if ( nrhs > 0 ) {
 	
 	A2d = (NRformat_loc *) A3d->A_nfmt; // matrix A gathered on 2D grid-0
-	b_counts_int = A3d->b_counts_int;
-	b_disp       = A3d->b_disp;;
+	b_counts_int   = A3d->b_counts_int;
+	b_disp         = A3d->b_disp;
 	row_counts_int = A3d->row_counts_int;
-	row_disp     = A3d->row_disp;
-	
-	if ( Fact == FACTORED ) {
-	}
+	row_disp       = A3d->row_disp;
 	
 	printf("dGather_loc3d(2): row_disp %p, A3d %p\n", row_disp, A3d); fflush(stdout);
 
@@ -210,7 +204,7 @@ void dGatherNRformat_loc3d
 	if (grid3d->zscp.Iam == 0)
 	    {
 		B1 = doubleMalloc_dist(A2d->m_loc * nrhs);
-		A3d->B2d = (double *) doubleMalloc_dist(A2d->m_loc * nrhs);
+		A3d->B2d = doubleMalloc_dist(A2d->m_loc * nrhs);
 	    }
 
 	// B1 <- gatherv(Btmp)
@@ -219,7 +213,6 @@ void dGatherNRformat_loc3d
 		    MPI_DOUBLE, 0, grid3d->zscp.comm);
 	SUPERLU_FREE(Btmp);
 
-    printf("dGather_loc3d(3): nrhs %d\n", nrhs); fflush(stdout);
 	// B2d <- colMajor(B1)
 	if (grid3d->zscp.Iam == 0)
 	    {
@@ -232,12 +225,11 @@ void dGatherNRformat_loc3d
 		
 		SUPERLU_FREE(B1);
 	    }
-    printf("dGather_loc3d(4): nrhs %d\n", nrhs); fflush(stdout);
+
     } /* end gather B2d */
 
 } /* dGatherNRformat_loc3d */
 
-
 /*
  * Scatter B (solution) from 2D process layer 0 to 3D grid
  *   Output: X3d <- A^{-1} B2d
@@ -245,7 +237,7 @@ void dGatherNRformat_loc3d
 int dScatter_B3d(NRformat_loc3d *A3d,  // modified
 		 gridinfo3d_t *grid3d)
 {
-    double *B = (double *) A3d->B3d; // retrieve the original pointer on 3D grid
+    double *B = (double *) A3d->B3d; // retrieve original pointer on 3D grid
     int ldb = A3d->ldb;
     int nrhs = A3d->nrhs;
     double *B2d = (double *) A3d->B2d; // only on 2D layer grid_0 
@@ -280,7 +272,7 @@ int dScatter_B3d(NRformat_loc3d *A3d,  // modified
     }
 
     double *Btmp; // on 3D grid
-    Btmp = SUPERLU_MALLOC(A3d->m_loc * nrhs * sizeof(double));
+    Btmp = doubleMalloc_dist(A3d->m_loc * nrhs);
 
     // Btmp <- scatterv(B1), block-by-block
     if ( rankorder == 1 ) { /* XY-major in 3D grid */
@@ -350,14 +342,12 @@ int dScatter_B3d(NRformat_loc3d *A3d,  // modified
     // B <- colMajor(Btmp)
     matCopy(A3d->m_loc, nrhs, B, ldb, Btmp, A3d->m_loc);
 
-    //Printdouble5("scatter_B3d(2):: B:", 5, B);
-    
     /* free storage */
     SUPERLU_FREE(Btmp);
     if (grid3d->zscp.Iam == 0) {
 	SUPERLU_FREE(B1);
 	SUPERLU_FREE(B2d);
     }
-    
+
     return 0;
 } /* dScatter_B3d */
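
The gather in dGatherNRformat_loc3d above relies on MPI_Gatherv, which needs per-process element counts and exclusive prefix-sum displacements along the Z dimension of the 3D grid; the same prefix sums also size the gathered B (scaled by nrhs). A standalone sketch of that step, with illustrative names:

    /* Sketch: derive MPI_Gatherv displacements from per-process counts,
       as done for nnz_disp, row_disp, and b_disp in the routine above. */
    void build_displacements(const int *counts, int nproc, int nrhs,
                             int *disp, int *b_counts, int *b_disp)
    {
        disp[0] = 0;
        b_disp[0] = 0;
        for (int i = 0; i < nproc; i++) {
            disp[i + 1] = disp[i] + counts[i];   /* exclusive prefix sum */
            b_counts[i] = nrhs * counts[i];      /* each row carries nrhs entries */
            b_disp[i + 1] = nrhs * disp[i + 1];
        }
    }
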
diff --git a/SRC/pdgssvx3d.c b/SRC/pdgssvx3d.c
index 589af92e..ffa99035 100644
--- a/SRC/pdgssvx3d.c
+++ b/SRC/pdgssvx3d.c
@@ -397,7 +397,7 @@ at the top-level directory.
  *           of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree
  *           is already in postorder.
  *
- *         o R (double*) dimension (A->nrow)
+ *         o R (double *) dimension (A->nrow)
  *           The row scale factors for A.
  *           If DiagScale = ROW or BOTH, A is multiplied on the left by
  *                          diag(R).
@@ -405,7 +405,7 @@ at the top-level directory.
  *           If options->Fact = FACTORED or SamePattern_SameRowPerm, R is
  *           an input argument; otherwise, R is an output argument.
  *
- *         o C (double*) dimension (A->ncol)
+ *         o C (double *) dimension (A->ncol)
  *           The column scale factors for A.
  *           If DiagScale = COL or BOTH, A is multiplied on the right by
  *                          diag(C).
@@ -504,7 +504,7 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
     SuperMatrix GA;        /* Global A in NC format */
     NCformat *GAstore;
     double *a_GA;
-    SuperMatrix GAC;       /* Global A in NCP format (add n end pointers) */
+    SuperMatrix GAC; /* Global A in NCP format (add n end pointers) */
     NCPformat *GACstore;
     Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
     Glu_freeable_t *Glu_freeable;
@@ -542,13 +542,13 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 #if ( PRNTlevel>= 2 )
     double dmin, dsum, dprod;
 #endif
-    
+
     LUstruct->dt = 'd';
     
     // get the 2d grid
     gridinfo_t *grid  = &(grid3d->grid2d);
     iam = grid->iam;
-
+    
     /* Test the options choices. */
     *info = 0;
     Fact = options->Fact;
@@ -572,7 +572,6 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
     else if (nrhs < 0) {
 	 *info = -6;
     }
-    
     if (*info) {
 	i = -(*info);
 	pxerr_dist ("pdgssvx3d", grid, -(*info));
@@ -588,14 +587,14 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
     
     /* Save the inputs: ldb -> ldb3d, and B -> B3d, Astore -> Astore3d,
        so that the names {ldb, B, and Astore} can be used internally.
        B3d and Astore3d will be assigned back to B and Astore on return.  */
     int ldb3d = ldb;
     NRformat_loc *Astore3d = (NRformat_loc *)A->Store;
     NRformat_loc3d *A3d = SOLVEstruct->A3d;
 
     /* B3d is aliased to B;
        B2d is allocated; 
-       B is then aliased to B2d in the following 2D solve;
+       B is then aliased to B2d for the following 2D solve;
     */
     dGatherNRformat_loc3d(Fact, (NRformat_loc *)A->Store,
 			  B, ldb, nrhs, grid3d, &A3d);
@@ -606,11 +605,11 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
     
     SOLVEstruct->A3d = A3d; /* This structure need to be persistent across
 			       multiple calls of pdgssvx3d()   */
-    printf("pdgssvx3d(1) factored %d, A3d %p\n", factored, A3d); fflush(stdout);
     
     NRformat_loc *Astore0 = A3d->A_nfmt; // on 2D grid-0
     NRformat_loc *A_orig = A->Store;
-    
+
 #if ( DEBUGlevel>=1 )
     CHECK_MALLOC (iam, "Enter pdgssvx3d()");
 #endif
@@ -621,6 +620,7 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
        - ordering, 
        - symbolic factorization,
        - distribution of L & U                                      */
+
     if (grid3d->zscp.Iam == 0)  /* on 2D grid-0 */
     {
         m = A->nrow;
@@ -630,7 +630,7 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 	// On input, A->Store is on 3D, now A->Store is re-assigned to 2D store
 	A->Store = Astore0;  // on 2D grid-0
 	ldb = Astore0->m_loc;
-
+	
 	/* The following code now works on 2D grid-0 */
     	Astore = (NRformat_loc *) A->Store;
     	nnz_loc = Astore->nnz_loc;
@@ -658,12 +658,14 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 	iam = grid->iam;
 	job = 5;
 	/* Extract equilibration status from a previous factorization */
-	if (factored || (Fact == SamePattern_SameRowPerm && Equil)) {
+	if (factored || (Fact == SamePattern_SameRowPerm && Equil))
+	    {
 		rowequ = (ScalePermstruct->DiagScale == ROW) ||
 		    (ScalePermstruct->DiagScale == BOTH);
 		colequ = (ScalePermstruct->DiagScale == COL) ||
 		    (ScalePermstruct->DiagScale == BOTH);
-	} else {
+	    }
+	else {
 	    rowequ = colequ = FALSE;
 	}
 	
@@ -680,15 +682,15 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 	    /* Allocate storage if not done so before. */
 	    switch (ScalePermstruct->DiagScale)	{
 		case NOEQUIL:
 		    if (!(R = (double *) doubleMalloc_dist (m)))
 			ABORT ("Malloc fails for R[].");
 		    if (!(C = (double *) doubleMalloc_dist (n)))
 			ABORT ("Malloc fails for C[].");
 		    ScalePermstruct->R = R;
 		    ScalePermstruct->C = C;
 		    break;
 		case ROW:
 		    if (!(C = (double *) doubleMalloc_dist (n)))
 			ABORT ("Malloc fails for C[].");
 		    ScalePermstruct->C = C;
 		    break;
@@ -998,6 +1000,7 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 #endif
 	}
 	
+	
 	/* ------------------------------------------------------------
 	   Perform the LU factorization.
 	   ------------------------------------------------------------ */
@@ -1270,7 +1273,7 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
     } /* end if not Factored */
     
     if ( grid3d->zscp.Iam == 0 ) { // only process layer 0
-	if ( !factored ) {
+	if (!factored) {
 	    if (options->PrintStat) {
 		int_t TinyPivots;
 		float for_lu, total, max, avg, temp;
@@ -1322,226 +1325,226 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 	   ------------------------------------------------------------ */
 	if ( nrhs > 0 ) {
 	    if (!(b_work = doubleMalloc_dist (n)))
-		ABORT ("Malloc fails for b_work[]");
+	        ABORT ("Malloc fails for b_work[]");
 
 	    /* ------------------------------------------------------
 	       Scale the right-hand side if equilibration was performed
 	       ------------------------------------------------------*/
 	    if (notran)
-		{
-		    if (rowequ)
-			{
+	        {
+	    	    if (rowequ)
+		        {
 			    b_col = B;
 			    for (j = 0; j < nrhs; ++j)
-				{
-				    irow = fst_row;
+			        {
+			    	    irow = fst_row;
 				    for (i = 0; i < m_loc; ++i)
-					{
-					    b_col[i] *= R[irow];
-					    ++irow;
-					}
+				    {
+				        b_col[i] *= R[irow];
+				        ++irow;
+				    }
 				    b_col += ldb;
 				}
 			}
-		}
-	    else if (colequ)
-		{
-		    b_col = B;
-		    for (j = 0; j < nrhs; ++j)
-			{
-			    irow = fst_row;
-			    for (i = 0; i < m_loc; ++i)
+		    }
+		else if (colequ)
+		    {
+			b_col = B;
+			for (j = 0; j < nrhs; ++j)
+			    {
+				irow = fst_row;
+				for (i = 0; i < m_loc; ++i)
 				{
-				    b_col[i] *= C[irow];
+				    b_col[i] *= C[irow];
 				    ++irow;
 				}
-			    b_col += ldb;
-			}
-		}
-
-	    /* Save a copy of the right-hand side. */
-	    ldx = ldb;
-	    if (!(X = doubleMalloc_dist (((size_t) ldx) * nrhs)))
-		ABORT ("Malloc fails for X[]");
-	    x_col = X;
-	    b_col = B;
-	    for (j = 0; j < nrhs; ++j) {
-		for (i = 0; i < m_loc; ++i) x_col[i] = b_col[i];
-		x_col += ldx;
-		b_col += ldb;
-	    }
+				b_col += ldb;
+			    }
+		    }
 
-	    /* ------------------------------------------------------
-	       Solve the linear system.
-	       ------------------------------------------------------*/
-	    if (options->SolveInitialized == NO) /* First time */
-		/* Inside this routine, SolveInitialized is set to YES.
-		   For repeated call to pdgssvx3d(), no need to re-initialilze
-		   the Solve data & communication structures, unless a new
-		   factorization with Fact == DOFACT or SamePattern is asked for. */
-		{
-		    dSolveInit (options, A, perm_r, perm_c, nrhs, LUstruct,
-				grid, SOLVEstruct);
+		/* Save a copy of the right-hand side. */
+		ldx = ldb;
+		if (!(X = doubleMalloc_dist (((size_t) ldx) * nrhs)))
+		    ABORT ("Malloc fails for X[]");
+		x_col = X;
+		b_col = B;
+		for (j = 0; j < nrhs; ++j) {
+		    for (i = 0; i < m_loc; ++i) x_col[i] = b_col[i];
+		    x_col += ldx;
+		    b_col += ldb;
 		}
-	    stat->utime[SOLVE] = 0.0;
+
+		/* ------------------------------------------------------
+		   Solve the linear system.
+		   ------------------------------------------------------*/
+		if (options->SolveInitialized == NO) /* First time */
+                   /* Inside this routine, SolveInitialized is set to YES.
+	              For repeated calls to pdgssvx3d(), no need to re-initialize
+	              the Solve data & communication structures, unless a new
+	              factorization with Fact == DOFACT or SamePattern is asked for. */
+		    {
+			dSolveInit (options, A, perm_r, perm_c, nrhs, LUstruct,
+			            grid, SOLVEstruct);
+		    }
+		stat->utime[SOLVE] = 0.0;
 #if 0 // Sherry: the following interface is needed by 3D trisolve.
-	    pdgstrs_vecpar (n, LUstruct, ScalePermstruct, grid, X, m_loc,
+		pdgstrs_vecpar (n, LUstruct, ScalePermstruct, grid, X, m_loc,
 				fst_row, ldb, nrhs, SOLVEstruct, stat, info);
 #else
-	    pdgstrs(n, LUstruct, ScalePermstruct, grid, X, m_loc,
-		    fst_row, ldb, nrhs, SOLVEstruct, stat, info);
+		pdgstrs(n, LUstruct, ScalePermstruct, grid, X, m_loc,
+			fst_row, ldb, nrhs, SOLVEstruct, stat, info);
 #endif
 
-	    /* ------------------------------------------------------------
-	       Use iterative refinement to improve the computed solution and
-	       compute error bounds and backward error estimates for it.
-	       ------------------------------------------------------------ */
-	    if (options->IterRefine)
-		{
-		    /* Improve the solution by iterative refinement. */
-		    int_t *it, *colind_gsmv = SOLVEstruct->A_colind_gsmv;
-		    dSOLVEstruct_t *SOLVEstruct1; /* Used by refinement */
-
-		    t = SuperLU_timer_ ();
-		    if (options->RefineInitialized == NO || Fact == DOFACT) {
-			/* All these cases need to re-initialize gsmv structure */
-			if (options->RefineInitialized)
-			    pdgsmv_finalize (SOLVEstruct->gsmv_comm);
-			pdgsmv_init (A, SOLVEstruct->row_to_proc, grid,
-				     SOLVEstruct->gsmv_comm);
-			
-			/* Save a copy of the transformed local col indices
-			   in colind_gsmv[]. */
-			if (colind_gsmv) SUPERLU_FREE (colind_gsmv);
-			if (!(it = intMalloc_dist (nnz_loc)))
-			    ABORT ("Malloc fails for colind_gsmv[]");
-			colind_gsmv = SOLVEstruct->A_colind_gsmv = it;
-			for (i = 0; i < nnz_loc; ++i) colind_gsmv[i] = colind[i];
-			options->RefineInitialized = YES;
-		    }
-		    else if (Fact == SamePattern || Fact == SamePattern_SameRowPerm) {
-			double at;
-			int_t k, jcol, p;
-			/* Swap to beginning the part of A corresponding to the
-			   local part of X, as was done in pdgsmv_init() */
-			for (i = 0; i < m_loc; ++i) { /* Loop through each row */
-			    k = rowptr[i];
-			    for (j = rowptr[i]; j < rowptr[i + 1]; ++j)
-				{
-				    jcol = colind[j];
-				    p = SOLVEstruct->row_to_proc[jcol];
-				    if (p == iam)
-					{	/* Local */
-					    at = a[k];
-					    a[k] = a[j];
-					    a[j] = at;
-					    ++k;
-					}
-				}
+		/* ------------------------------------------------------------
+		   Use iterative refinement to improve the computed solution and
+		   compute error bounds and backward error estimates for it.
+		   ------------------------------------------------------------ */
+		if (options->IterRefine)
+		    {
+			/* Improve the solution by iterative refinement. */
+			int_t *it, *colind_gsmv = SOLVEstruct->A_colind_gsmv;
+			dSOLVEstruct_t *SOLVEstruct1; /* Used by refinement */
+
+			t = SuperLU_timer_ ();
+			if (options->RefineInitialized == NO || Fact == DOFACT) {
+			    /* All these cases need to re-initialize gsmv structure */
+			    if (options->RefineInitialized)
+				pdgsmv_finalize (SOLVEstruct->gsmv_comm);
+			    pdgsmv_init (A, SOLVEstruct->row_to_proc, grid,
+					 SOLVEstruct->gsmv_comm);
+
+			    /* Save a copy of the transformed local col indices
+			       in colind_gsmv[]. */
+			    if (colind_gsmv) SUPERLU_FREE (colind_gsmv);
+			    if (!(it = intMalloc_dist (nnz_loc)))
+				ABORT ("Malloc fails for colind_gsmv[]");
+			    colind_gsmv = SOLVEstruct->A_colind_gsmv = it;
+			    for (i = 0; i < nnz_loc; ++i) colind_gsmv[i] = colind[i];
+			    options->RefineInitialized = YES;
 			}
+			else if (Fact == SamePattern || Fact == SamePattern_SameRowPerm) {
+			    double at;
+			    int_t k, jcol, p;
+			    /* Swap to beginning the part of A corresponding to the
+			       local part of X, as was done in pdgsmv_init() */
+			    for (i = 0; i < m_loc; ++i) { /* Loop through each row */
+				k = rowptr[i];
+				for (j = rowptr[i]; j < rowptr[i + 1]; ++j)
+				    {
+					jcol = colind[j];
+					p = SOLVEstruct->row_to_proc[jcol];
+					if (p == iam)
+					    {	/* Local */
+						at = a[k];
+						a[k] = a[j];
+						a[j] = at;
+						++k;
+					    }
+				    }
+			    }
 			    
-			/* Re-use the local col indices of A obtained from the
-			   previous call to pdgsmv_init() */
-			for (i = 0; i < nnz_loc; ++i)
-			    colind[i] = colind_gsmv[i];
-		    }
-			
-		    if (nrhs == 1)
-			{	/* Use the existing solve structure */
-			    SOLVEstruct1 = SOLVEstruct;
+			    /* Re-use the local col indices of A obtained from the
+			       previous call to pdgsmv_init() */
+			    for (i = 0; i < nnz_loc; ++i)
+				colind[i] = colind_gsmv[i];
 			}
-		    else {
-			/* For nrhs > 1, since refinement is performed for RHS
-			   one at a time, the communication structure for pdgstrs
-			   is different than the solve with nrhs RHS.
-			   So we use SOLVEstruct1 for the refinement step.
-			*/
-			if (!(SOLVEstruct1 = (dSOLVEstruct_t *)
-			      SUPERLU_MALLOC(sizeof(dSOLVEstruct_t))))
-			    ABORT ("Malloc fails for SOLVEstruct1");
-			/* Copy the same stuff */
-			SOLVEstruct1->row_to_proc = SOLVEstruct->row_to_proc;
-			SOLVEstruct1->inv_perm_c = SOLVEstruct->inv_perm_c;
-			SOLVEstruct1->num_diag_procs = SOLVEstruct->num_diag_procs;
-			SOLVEstruct1->diag_procs = SOLVEstruct->diag_procs;
-			SOLVEstruct1->diag_len = SOLVEstruct->diag_len;
-			SOLVEstruct1->gsmv_comm = SOLVEstruct->gsmv_comm;
-			SOLVEstruct1->A_colind_gsmv = SOLVEstruct->A_colind_gsmv;
+			
+			if (nrhs == 1)
+			    {	/* Use the existing solve structure */
+				SOLVEstruct1 = SOLVEstruct;
+			    }
+			else {
+				/* For nrhs > 1, since refinement is performed for one
+				   RHS at a time, the communication structure for pdgstrs
+				   is different than the solve with nrhs RHSs,
+				   so we use SOLVEstruct1 for the refinement step. */
+				if (!(SOLVEstruct1 = (dSOLVEstruct_t *)
+				      SUPERLU_MALLOC(sizeof(dSOLVEstruct_t))))
+				    ABORT ("Malloc fails for SOLVEstruct1");
+				/* Copy the same stuff */
+				SOLVEstruct1->row_to_proc = SOLVEstruct->row_to_proc;
+				SOLVEstruct1->inv_perm_c = SOLVEstruct->inv_perm_c;
+				SOLVEstruct1->num_diag_procs = SOLVEstruct->num_diag_procs;
+				SOLVEstruct1->diag_procs = SOLVEstruct->diag_procs;
+				SOLVEstruct1->diag_len = SOLVEstruct->diag_len;
+				SOLVEstruct1->gsmv_comm = SOLVEstruct->gsmv_comm;
+				SOLVEstruct1->A_colind_gsmv = SOLVEstruct->A_colind_gsmv;
 				
-			/* Initialize the *gstrs_comm for 1 RHS. */
-			if (!(SOLVEstruct1->gstrs_comm = (pxgstrs_comm_t *)
-			      SUPERLU_MALLOC (sizeof (pxgstrs_comm_t))))
-			    ABORT ("Malloc fails for gstrs_comm[]");
-			pdgstrs_init (n, m_loc, 1, fst_row, perm_r, perm_c, grid,
-				      Glu_persist, SOLVEstruct1);
-		    }
+				/* Initialize the *gstrs_comm for 1 RHS. */
+				if (!(SOLVEstruct1->gstrs_comm = (pxgstrs_comm_t *)
+				      SUPERLU_MALLOC (sizeof (pxgstrs_comm_t))))
+				    ABORT ("Malloc fails for gstrs_comm[]");
+				pdgstrs_init (n, m_loc, 1, fst_row, perm_r, perm_c, grid,
+					      Glu_persist, SOLVEstruct1);
+			    }
 			
-		    pdgsrfs (n, A, anorm, LUstruct, ScalePermstruct, grid,
-			     B, ldb, X, ldx, nrhs, SOLVEstruct1, berr, stat, info);
+			pdgsrfs (n, A, anorm, LUstruct, ScalePermstruct, grid,
+				 B, ldb, X, ldx, nrhs, SOLVEstruct1, berr, stat, info);
 			
-		    /* Deallocate the storage associated with SOLVEstruct1 */
-		    if (nrhs > 1)
-			{
-			    pxgstrs_finalize (SOLVEstruct1->gstrs_comm);
-			    SUPERLU_FREE (SOLVEstruct1);
-			}
+			/* Deallocate the storage associated with SOLVEstruct1 */
+			if (nrhs > 1)
+			    {
+				pxgstrs_finalize (SOLVEstruct1->gstrs_comm);
+				SUPERLU_FREE (SOLVEstruct1);
+			    }
 			
-		    stat->utime[REFINE] = SuperLU_timer_ () - t;
-		} /* end IterRefine */
+			stat->utime[REFINE] = SuperLU_timer_ () - t;
+		    } /* end IterRefine */
 		
-	    /* Permute the solution matrix B <= Pc'*X. */
-	    pdPermute_Dense_Matrix (fst_row, m_loc, SOLVEstruct->row_to_proc,
-				    SOLVEstruct->inv_perm_c,
-				    X, ldx, B, ldb, nrhs, grid);
+		/* Permute the solution matrix B <= Pc'*X. */
+		pdPermute_Dense_Matrix (fst_row, m_loc, SOLVEstruct->row_to_proc,
+					SOLVEstruct->inv_perm_c,
+					X, ldx, B, ldb, nrhs, grid);
 #if ( DEBUGlevel>=2 )
-	    printf ("\n (%d) .. After pdPermute_Dense_Matrix(): b =\n", iam);
-	    for (i = 0; i < m_loc; ++i)
-		printf ("\t(%d)\t%4d\t%.10f\n", iam, i + fst_row, B[i]);
+		printf ("\n (%d) .. After pdPermute_Dense_Matrix(): b =\n", iam);
+		for (i = 0; i < m_loc; ++i)
+		    printf ("\t(%d)\t%4d\t%.10f\n", iam, i + fst_row, B[i]);
 #endif
 		
-	    /* Transform the solution matrix X to a solution of the original
-	       system before the equilibration. */
-	    if (notran)
-		{
-		    if (colequ)
-			{
-			    b_col = B;
-			    for (j = 0; j < nrhs; ++j)
-				{
-				    irow = fst_row;
-				    for (i = 0; i < m_loc; ++i)
-					{
-					    b_col[i] *= C[irow];
-					    ++irow;
-					}
-				    b_col += ldb;
-				}
-			}
-		}
-	    else if (rowequ)
-		{
-		    b_col = B;
-		    for (j = 0; j < nrhs; ++j)
-			{
-			    irow = fst_row;
-			    for (i = 0; i < m_loc; ++i)
-				{
-				    b_col[i] *= R[irow];
-				    ++irow;
-				}
-			    b_col += ldb;
-			}
-		}
+		/* Transform the solution matrix X to a solution of the original
+		   system before the equilibration. */
+		if (notran)
+		    {
+			if (colequ)
+			    {
+				b_col = B;
+				for (j = 0; j < nrhs; ++j)
+				    {
+					irow = fst_row;
+					for (i = 0; i < m_loc; ++i)
+					    {
+						b_col[i] *= C[irow];
+						++irow;
+					    }
+					b_col += ldb;
+				    }
+			    }
+		    }
+		else if (rowequ)
+		    {
+			b_col = B;
+			for (j = 0; j < nrhs; ++j)
+			    {
+				irow = fst_row;
+				for (i = 0; i < m_loc; ++i)
+				    {
+					b_col[i] *= R[irow];
+					++irow;
+				    }
+				b_col += ldb;
+			    }
+		    }
 		
-	    SUPERLU_FREE (b_work);
-	    SUPERLU_FREE (X);
-
-	}    /* end if nrhs > 0 */
+		SUPERLU_FREE (b_work);
+		SUPERLU_FREE (X);
+		
+	    } /* end if nrhs > 0 */
 	
 #if ( PRNTlevel>=1 )
 	if (!iam) {
 	    printf (".. DiagScale = %d\n", ScalePermstruct->DiagScale);
 	}
 #endif
 	
 	/* Deallocate R and/or C if it was not used. */
@@ -1571,12 +1574,12 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 
     /* Scatter the solution from 2D grid-0 to 3D grid */
     if ( nrhs > 0 ) dScatter_B3d(A3d, grid3d);
-    
+
     B = A3d->B3d; // B is now assigned back to B3d on return
     A->Store = Astore3d; // restore Astore to 3D
-
+    
 #if ( DEBUGlevel>=1 )
     CHECK_MALLOC (iam, "Exit pdgssvx3d()");
 #endif
 
 }
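
Because pdgssvx3d now keeps SOLVEstruct->A3d alive across calls, a driver can factor once and re-solve with a new right-hand side by flipping options->Fact, as the example drivers do. A hedged sketch of that calling sequence (setup, error checks, and statistics are elided; objects are assumed initialized as in the EXAMPLE drivers):

    #include "superlu_ddefs.h"

    /* Sketch: factor once, then reuse the factors and the gathered 2D
       copy of A for a second right-hand side. */
    void solve_twice_sketch(superlu_dist_options_t *options, SuperMatrix *A,
                            dScalePermstruct_t *ScalePermstruct,
                            double *b, double *b1, int ldb, int nrhs,
                            gridinfo3d_t *grid, dLUstruct_t *LUstruct,
                            dSOLVEstruct_t *SOLVEstruct, double *berr,
                            SuperLUStat_t *stat, int *info)
    {
        options->Fact = DOFACT;      /* first call: equilibrate, factor, solve */
        pdgssvx3d(options, A, ScalePermstruct, b, ldb, nrhs, grid,
                  LUstruct, SOLVEstruct, berr, stat, info);

        options->Fact = FACTORED;    /* reuse L, U, and SOLVEstruct->A3d */
        pdgssvx3d(options, A, ScalePermstruct, b1, ldb, nrhs, grid,
                  LUstruct, SOLVEstruct, berr, stat, info);
    }
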
diff --git a/SRC/pdutil.c b/SRC/pdutil.c
index 02e5da1f..13465637 100644
--- a/SRC/pdutil.c
+++ b/SRC/pdutil.c
@@ -778,19 +778,20 @@ int dSolveInit(superlu_dist_options_t *options, SuperMatrix *A,
 void dSolveFinalize(superlu_dist_options_t *options, dSOLVEstruct_t *SOLVEstruct)
 {
     if ( options->SolveInitialized ) {
-	pxgstrs_finalize(SOLVEstruct->gstrs_comm);
+        pxgstrs_finalize(SOLVEstruct->gstrs_comm);
 
-	if ( options->RefineInitialized ) {
-	    pdgsmv_finalize(SOLVEstruct->gsmv_comm);
+        if ( options->RefineInitialized ) {
+            pdgsmv_finalize(SOLVEstruct->gsmv_comm);
 	    options->RefineInitialized = NO;
-	}
-	SUPERLU_FREE(SOLVEstruct->gsmv_comm);
-	SUPERLU_FREE(SOLVEstruct->row_to_proc);
-	SUPERLU_FREE(SOLVEstruct->inv_perm_c);
-	SUPERLU_FREE(SOLVEstruct->diag_procs);
-	SUPERLU_FREE(SOLVEstruct->diag_len);
-	if ( SOLVEstruct->A_colind_gsmv ) SUPERLU_FREE(SOLVEstruct->A_colind_gsmv);
-	options->SolveInitialized = NO;
+        }
+        SUPERLU_FREE(SOLVEstruct->gsmv_comm);
+        SUPERLU_FREE(SOLVEstruct->row_to_proc);
+        SUPERLU_FREE(SOLVEstruct->inv_perm_c);
+        SUPERLU_FREE(SOLVEstruct->diag_procs);
+        SUPERLU_FREE(SOLVEstruct->diag_len);
+        if ( SOLVEstruct->A_colind_gsmv )
+	    SUPERLU_FREE(SOLVEstruct->A_colind_gsmv);
+        options->SolveInitialized = NO;
     }
 } /* dSolveFinalize */
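
The rewritten dSolveFinalize frees its communication structures only when options->SolveInitialized is set, then clears the flag, so a redundant call (for example from both a driver and f_dDestroy_LU_SOLVE_struct_3d) is harmless. The guard-and-clear idiom in isolation, with illustrative types:

    #include <stdlib.h>

    /* Sketch of the idempotent-finalize idiom used above. */
    typedef struct { int initialized; void *resource; } ctx_t;

    void ctx_finalize(ctx_t *ctx)
    {
        if (!ctx->initialized) return;  /* second call becomes a no-op */
        free(ctx->resource);
        ctx->resource = NULL;
        ctx->initialized = 0;           /* clear so repeats stay safe */
    }
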
 
diff --git a/SRC/psgssvx3d.c b/SRC/psgssvx3d.c
index ce73ad8a..727da4dc 100644
--- a/SRC/psgssvx3d.c
+++ b/SRC/psgssvx3d.c
@@ -20,13 +20,7 @@ at the top-level directory.
  * May 12, 2021
  */
 #include "superlu_sdefs.h"
-#if 0
-#include "p3dcomm.h"
-#include "pdgstrf3d.h"
-#include "triangularSolve/pdgstrs.h"
-#include "triangularSolve/pdgstrs3d.h"
-#include "xtrf3Dpartition.h"
-#endif
+
 /*! \brief
  *
  * 
@@ -506,11 +500,11 @@ psgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
            sLUstruct_t * LUstruct, sSOLVEstruct_t * SOLVEstruct,
            float *berr, SuperLUStat_t * stat, int *info)
 {
-    NRformat_loc *Astore;
+    NRformat_loc *Astore = A->Store;
     SuperMatrix GA;        /* Global A in NC format */
     NCformat *GAstore;
     float *a_GA;
-    SuperMatrix GAC;       /* Global A in NCP format (add n end pointers) */
+    SuperMatrix GAC; /* Global A in NCP format (add n end pointers) */
     NCPformat *GACstore;
     Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
     Glu_freeable_t *Glu_freeable;
@@ -548,28 +542,13 @@ psgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 #if ( PRNTlevel>= 2 )
     double dmin, dsum, dprod;
 #endif
-	LUstruct->dt = 's';
+
+    LUstruct->dt = 's';
+    
     // get the 2d grid
     gridinfo_t *grid  = &(grid3d->grid2d);
     iam = grid->iam;
     
-    /* Initialization. */
-    /* Save the inputs: ldb -> ldb3d, and B -> B3d, Astore -> Astore3d 
-       B3d and Astore3d will be restored on return  */
-    int ldb3d = ldb;
-    // float *B3d = B;
-    NRformat_loc *Astore3d = (NRformat_loc *)A->Store;
-    float *B2d;
-    NRformat_loc3d *A3d = sGatherNRformat_loc3d((NRformat_loc *)A->Store,
-		   	  			B, ldb, nrhs, grid3d);
-    B2d = (float *) A3d->B2d; 
-    NRformat_loc *Astore0 = A3d->A_nfmt; // on 2D grid-0
-    NRformat_loc *A_orig = A->Store;
-    
-    /* definition of factored seen by each process layer */
-    Fact = options->Fact;
-    factored = (Fact == FACTORED);
-
     /* Test the options choices. */
     *info = 0;
     Fact = options->Fact;
@@ -588,40 +567,70 @@ psgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
     } else if (A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc
 	     || A->Dtype != SLU_S || A->Mtype != SLU_GE)
 	 *info = -2;
-    else if (ldb < Astore3d->m_loc)
+    else if (ldb < Astore->m_loc)
          *info = -5;
     else if (nrhs < 0) {
 	 *info = -6;
     }
     if (*info) {
 	i = -(*info);
-	pxerr_dist ("psgssvx3d", grid, -*info);
+	pxerr_dist ("psgssvx3d", grid, -(*info));
 	return;
     }
     
+    /* Initialization. */
+
+    options->Algo3d = YES;
+	
+    /* definition of factored seen by each process layer */
+    factored = (Fact == FACTORED);
+    
+    /* Save the inputs: ldb -> ldb3d, and B -> B3d, Astore -> Astore3d,
+       so that the names {ldb, B, and Astore} can be used internally.
+       B3d and Astore3d will be assigned back to B and Astore on return. */
+    int ldb3d = ldb;
+    NRformat_loc *Astore3d = (NRformat_loc *)A->Store;
+    NRformat_loc3d *A3d = SOLVEstruct->A3d;
+
+    /* B3d is aliased to B;
+       B2d is allocated; 
+       B is then aliased to B2d for the following 2D solve;
+    */
+    sGatherNRformat_loc3d(Fact, (NRformat_loc *)A->Store,
+			  B, ldb, nrhs, grid3d, &A3d);
+    
+    B = (float *) A3d->B2d; /* B is now pointing to B2d, 
+				allocated in sGatherNRformat_loc3d.  */
+    //PrintDouble5("after gather B=B2d", ldb, B);
+    
+    SOLVEstruct->A3d = A3d; /* This structure needs to be persistent across
+			       multiple calls of psgssvx3d()   */
+    
+    NRformat_loc *Astore0 = A3d->A_nfmt; // on 2D grid-0
+    NRformat_loc *A_orig = A->Store;
+
 #if ( DEBUGlevel>=1 )
-	CHECK_MALLOC (iam, "Enter psgssvx3d()");
+    CHECK_MALLOC (iam, "Enter psgssvx3d()");
 #endif
 	
     /* Perform preprocessing steps on process layer zero, including:
-       gather 3D matrices {A, B} onto 2D grid-0,
-       ordering, symbolic factorization, distribution of L & U */
-
-#define NRFRMT
+       gather 3D matrices {A, B} onto 2D grid-0, preprocessing steps: 
+       - equilibration,
+       - ordering, 
+       - symbolic factorization,
+       - distribution of L & U                                      */
 
-    if (grid3d->zscp.Iam == 0)
+    if (grid3d->zscp.Iam == 0)  /* on 2D grid-0 */
     {
         m = A->nrow;
     	n = A->ncol;
 	// checkNRFMT(Astore0, (NRformat_loc *) A->Store);
-#ifdef NRFRMT
+
 	// On input, A->Store is on 3D, now A->Store is re-assigned to 2D store
-	A->Store = Astore0;
+	A->Store = Astore0;  // on 2D grid-0
 	ldb = Astore0->m_loc;
-	B = B2d; // B is now re-assigned to B2d
-	//PrintDouble5("after gather B=B2d", ldb, B);
-#endif
-
+	
 	/* The following code now works on 2D grid-0 */
     	Astore = (NRformat_loc *) A->Store;
     	nnz_loc = Astore->nnz_loc;
@@ -648,6 +657,7 @@ psgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 	
 	iam = grid->iam;
 	job = 5;
+	/* Extract equilibration status from a previous factorization */
 	if (factored || (Fact == SamePattern_SameRowPerm && Equil))
 	    {
 		rowequ = (ScalePermstruct->DiagScale == ROW) ||
@@ -655,8 +665,9 @@ psgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 		colequ = (ScalePermstruct->DiagScale == COL) ||
 		    (ScalePermstruct->DiagScale == BOTH);
 	    }
-	else
+	else {
 	    rowequ = colequ = FALSE;
+	}
 	
 	/* The following arrays are replicated on all processes. */
 	perm_r = ScalePermstruct->perm_r;
@@ -695,7 +706,7 @@ psgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 	/* ------------------------------------------------------------
 	   Diagonal scaling to equilibrate the matrix.
 	   ------------------------------------------------------------ */
-	if (Equil) {
+	if ( Equil ) {
 #if ( DEBUGlevel>=1 )
 	    CHECK_MALLOC (iam, "Enter equil");
 #endif
@@ -783,7 +794,7 @@ psgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 #endif
 	} /* end if Equil ... LAPACK style, not involving MC64 */
 
-	if (!factored) { /* Skip this if already factored. */
+	if ( !factored ) { /* Skip this if already factored. */
 	    /*
 	     * Gather A from the distributed compressed row format to
 	     * global A in compressed column format.
@@ -975,7 +986,7 @@ psgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 #endif
 	} /* end if (!factored) */
 
-	if (!factored || options->IterRefine) {
+	if ( !factored || options->IterRefine ) {
 	    /* Compute norm(A), which will be used to adjust small diagonal. */
 	    if (notran)
 		*(unsigned char *) norm = '1';
@@ -993,7 +1004,7 @@ psgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 	/* ------------------------------------------------------------
 	   Perform the LU factorization.
 	   ------------------------------------------------------------ */
-	if (!factored) {
+	if ( !factored ) {
 	    t = SuperLU_timer_ ();
 	    /*
 	     * Get column permutation vector perm_c[], according to permc_spec:
@@ -1312,29 +1323,29 @@ psgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 	/* ------------------------------------------------------------
 	   Compute the solution matrix X.
 	   ------------------------------------------------------------ */
-	if (nrhs) {
-		if (!(b_work = floatMalloc_dist (n)))
-		    ABORT ("Malloc fails for b_work[]");
+	if ( nrhs > 0 ) {
+	    if (!(b_work = floatMalloc_dist (n)))
+	        ABORT ("Malloc fails for b_work[]");
 
-		/* ------------------------------------------------------
-		   Scale the right-hand side if equilibration was performed
-		   ------------------------------------------------------*/
-		if (notran)
-		    {
-			if (rowequ)
-			    {
-				b_col = B;
-				for (j = 0; j < nrhs; ++j)
+	    /* ------------------------------------------------------
+	       Scale the right-hand side if equilibration was performed
+	       ------------------------------------------------------*/
+	    if (notran)
+	        {
+	    	    if (rowequ)
+		        {
+			    b_col = B;
+			    for (j = 0; j < nrhs; ++j)
+			        {
+			    	    irow = fst_row;
+				    for (i = 0; i < m_loc; ++i)
 				    {
-					irow = fst_row;
-					for (i = 0; i < m_loc; ++i)
-					    {
-		                                b_col[i] *= R[irow];
-						++irow;
-					    }
-					b_col += ldb;
+				        b_col[i] *= R[irow];
+				        ++irow;
 				    }
-			    }
+				    b_col += ldb;
+				}
+			}
 		    }
 		else if (colequ)
 		    {
@@ -1343,10 +1354,10 @@ psgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 			    {
 				irow = fst_row;
 				for (i = 0; i < m_loc; ++i)
-				    {
-		                        b_col[i] *= C[irow];
-					++irow;
-				    }
+				{
+				    b_col[i] *= C[irow];
+				    ++irow;
+				}
 				b_col += ldb;
 			    }
 		    }
@@ -1479,7 +1490,7 @@ psgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 			    }
 			
 			stat->utime[REFINE] = SuperLU_timer_ () - t;
-		    }
+		    } /* end IterRefine */
 		
 		/* Permute the solution matrix B <= Pc'*X. */
 		psPermute_Dense_Matrix (fst_row, m_loc, SOLVEstruct->row_to_proc,
@@ -1528,11 +1539,12 @@ psgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 		SUPERLU_FREE (b_work);
 		SUPERLU_FREE (X);
 		
-	    }                           /* end if nrhs != 0 */
+	    } /* end if nrhs > 0 */
 	
 #if ( PRNTlevel>=1 )
-	if (!iam)
+	if (!iam) {
 	    printf (".. DiagScale = %d\n", ScalePermstruct->DiagScale);
+	}
 #endif
 	
 	/* Deallocate R and/or C if it was not used. */
@@ -1560,25 +1572,12 @@ psgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 
     } /* process layer 0 done solve */
 
-#ifdef NRFRMT
-    /* Scatter the solution from 2D grid_0 to 3D grid */
-    sScatter_B3d(A3d, grid3d);
+    /* Scatter the solution from 2D grid-0 to 3D grid */
+    if ( nrhs > 0 ) sScatter_B3d(A3d, grid3d);
 
     B = A3d->B3d; // B is now assigned back to B3d on return
     A->Store = Astore3d; // restore Astore to 3D
     
-    /* free A2d and B2d, which are allocated only in 2D layer Grid_0 */
-    NRformat_loc *A2d = A3d->A_nfmt;
-    if (grid3d->zscp.Iam == 0) {
-       SUPERLU_FREE( A2d->rowptr );
-       SUPERLU_FREE( A2d->colind );
-       SUPERLU_FREE( A2d->nzval );
-       SUPERLU_FREE( A3d->B2d );
-    }
-    SUPERLU_FREE( A2d );         // free 2D structure
-    SUPERLU_FREE( A3d );         // free 3D structure
-#endif
-
 #if ( DEBUGlevel>=1 )
 	CHECK_MALLOC (iam, "Exit psgssvx3d()");
 #endif
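
The reindented block in psgssvx3d above scales the local right-hand side by the row factors R[] (or by C[] for the transposed system) before the 2D solve. The kernel, stripped of the diff noise and wrapped for clarity (the wrapper name is illustrative; the variable roles match the routine):

    /* Sketch: row-equilibration scaling of the local, column-major B,
       as done when notran && rowequ. */
    static void scale_rhs_rows(float *B, int ldb, int nrhs, int m_loc,
                               int fst_row, const double *R)
    {
        float *b_col = B;
        for (int j = 0; j < nrhs; ++j) {
            int irow = fst_row;            /* first global row owned locally */
            for (int i = 0; i < m_loc; ++i) {
                b_col[i] *= R[irow];       /* B(i,j) <- R(row) * B(i,j) */
                ++irow;
            }
            b_col += ldb;                  /* advance to the next column */
        }
    }
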
diff --git a/SRC/psutil.c b/SRC/psutil.c
index e08066be..edb66875 100644
--- a/SRC/psutil.c
+++ b/SRC/psutil.c
@@ -777,21 +777,43 @@ int sSolveInit(superlu_dist_options_t *options, SuperMatrix *A,
  */
 void sSolveFinalize(superlu_dist_options_t *options, sSOLVEstruct_t *SOLVEstruct)
 {
-    pxgstrs_finalize(SOLVEstruct->gstrs_comm);
-
-    if ( options->RefineInitialized ) {
-        psgsmv_finalize(SOLVEstruct->gsmv_comm);
-	options->RefineInitialized = NO;
+    if ( options->SolveInitialized ) {
+        pxgstrs_finalize(SOLVEstruct->gstrs_comm);
+
+        if ( options->RefineInitialized ) {
+            psgsmv_finalize(SOLVEstruct->gsmv_comm);
+	    options->RefineInitialized = NO;
+        }
+        SUPERLU_FREE(SOLVEstruct->gsmv_comm);
+        SUPERLU_FREE(SOLVEstruct->row_to_proc);
+        SUPERLU_FREE(SOLVEstruct->inv_perm_c);
+        SUPERLU_FREE(SOLVEstruct->diag_procs);
+        SUPERLU_FREE(SOLVEstruct->diag_len);
+        if ( SOLVEstruct->A_colind_gsmv )
+	    SUPERLU_FREE(SOLVEstruct->A_colind_gsmv);
+        options->SolveInitialized = NO;
     }
-    SUPERLU_FREE(SOLVEstruct->gsmv_comm);
-    SUPERLU_FREE(SOLVEstruct->row_to_proc);
-    SUPERLU_FREE(SOLVEstruct->inv_perm_c);
-    SUPERLU_FREE(SOLVEstruct->diag_procs);
-    SUPERLU_FREE(SOLVEstruct->diag_len);
-    if ( SOLVEstruct->A_colind_gsmv ) SUPERLU_FREE(SOLVEstruct->A_colind_gsmv);
-    options->SolveInitialized = NO;
 } /* sSolveFinalize */
 
+void sDestroy_A3d_gathered_on_2d(sSOLVEstruct_t *SOLVEstruct, gridinfo3d_t *grid3d)
+{
+    /* free A2d and B2d, which are allocated only in 2D layer grid-0 */
+    NRformat_loc3d *A3d = SOLVEstruct->A3d;
+    NRformat_loc *A2d = A3d->A_nfmt;
+    if (grid3d->zscp.Iam == 0) {
+	SUPERLU_FREE( A2d->rowptr );
+	SUPERLU_FREE( A2d->colind );
+	SUPERLU_FREE( A2d->nzval );
+    }
+    SUPERLU_FREE(A3d->b_counts_int);  // free the count and displacement arrays
+    SUPERLU_FREE(A3d->b_disp);
+    SUPERLU_FREE(A3d->row_counts_int);
+    SUPERLU_FREE(A3d->row_disp);
+    SUPERLU_FREE( A2d );         // free 2D structure
+    SUPERLU_FREE( A3d );         // free 3D structure
+} /* sDestroy_A3d_gathered_on_2d */
+
+
 /*! \brief Check the inf-norm of the error vector
  */
 void psinf_norm_error(int iam, int_t n, int_t nrhs, float x[], int_t ldx,
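
sDestroy_A3d_gathered_on_2d frees the layer-0 copy of A first, then the count and displacement arrays that every layer holds, then the two structures themselves, so each grid layer must call it exactly once. A sketch of where it fits in a driver's teardown (mirroring the 3D example drivers; setup is elided):

    #include "superlu_sdefs.h"

    /* Sketch: teardown order after the last psgssvx3d() call. */
    static void cleanup_sketch(superlu_dist_options_t *options, int n,
                               gridinfo3d_t *grid, sLUstruct_t *LUstruct,
                               sSOLVEstruct_t *SOLVEstruct)
    {
        if (grid->zscp.Iam == 0) {              /* layer 0 owns the 2D objects */
            sDestroy_LU(n, &(grid->grid2d), LUstruct);
            sSolveFinalize(options, SOLVEstruct);
        } else {                                /* other layers free 3D copies */
            sDeAllocLlu_3d(n, LUstruct, grid);
            sDeAllocGlu_3d(LUstruct);
        }
        sDestroy_A3d_gathered_on_2d(SOLVEstruct, grid); /* all layers */
    }
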
diff --git a/SRC/pzgssvx3d.c b/SRC/pzgssvx3d.c
index eba2489c..8212ec39 100644
--- a/SRC/pzgssvx3d.c
+++ b/SRC/pzgssvx3d.c
@@ -19,13 +19,7 @@ at the top-level directory.
  * May 12, 2021
  */
 #include "superlu_zdefs.h"
-#if 0
-#include "p3dcomm.h"
-#include "pdgstrf3d.h"
-#include "triangularSolve/pdgstrs.h"
-#include "triangularSolve/pdgstrs3d.h"
-#include "xtrf3Dpartition.h"
-#endif
+
 /*! \brief
  *
  * 
@@ -402,7 +396,7 @@ at the top-level directory.
  *           of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree
  *           is already in postorder.
  *
- *         o R (double*) dimension (A->nrow)
+ *         o R (double *) dimension (A->nrow)
  *           The row scale factors for A.
  *           If DiagScale = ROW or BOTH, A is multiplied on the left by
  *                          diag(R).
@@ -410,7 +404,7 @@ at the top-level directory.
  *           If options->Fact = FACTORED or SamePattern_SameRowPerm, R is
  *           an input argument; otherwise, R is an output argument.
  *
- *         o C (double*) dimension (A->ncol)
+ *         o C (double *) dimension (A->ncol)
  *           The column scale factors for A.
  *           If DiagScale = COL or BOTH, A is multiplied on the right by
  *                          diag(C).
@@ -505,11 +499,11 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
            zLUstruct_t * LUstruct, zSOLVEstruct_t * SOLVEstruct,
            double *berr, SuperLUStat_t * stat, int *info)
 {
-    NRformat_loc *Astore;
+    NRformat_loc *Astore = A->Store;
     SuperMatrix GA;        /* Global A in NC format */
     NCformat *GAstore;
     doublecomplex *a_GA;
-    SuperMatrix GAC;       /* Global A in NCP format (add n end pointers) */
+    SuperMatrix GAC; /* Global A in NCP format (add n end pointers) */
     NCPformat *GACstore;
     Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
     Glu_freeable_t *Glu_freeable;
@@ -547,28 +541,13 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 #if ( PRNTlevel>= 2 )
     double dmin, dsum, dprod;
 #endif
-	LUstruct->dt = 'z';
+
+    LUstruct->dt = 'z';
+    
     // get the 2d grid
     gridinfo_t *grid  = &(grid3d->grid2d);
     iam = grid->iam;
     
-    /* Initialization. */
-    /* Save the inputs: ldb -> ldb3d, and B -> B3d, Astore -> Astore3d 
-       B3d and Astore3d will be restored on return  */
-    int ldb3d = ldb;
-    // doublecomplex *B3d = B;
-    NRformat_loc *Astore3d = (NRformat_loc *)A->Store;
-    doublecomplex *B2d;
-    NRformat_loc3d *A3d = zGatherNRformat_loc3d((NRformat_loc *)A->Store,
-		   	  			B, ldb, nrhs, grid3d);
-    B2d = (doublecomplex *) A3d->B2d; 
-    NRformat_loc *Astore0 = A3d->A_nfmt; // on 2D grid-0
-    NRformat_loc *A_orig = A->Store;
-    
-    /* definition of factored seen by each process layer */
-    Fact = options->Fact;
-    factored = (Fact == FACTORED);
-
     /* Test the options choices. */
     *info = 0;
     Fact = options->Fact;
@@ -587,40 +566,70 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
     } else if (A->nrow != A->ncol || A->nrow < 0 || A->Stype != SLU_NR_loc
 	     || A->Dtype != SLU_Z || A->Mtype != SLU_GE)
 	 *info = -2;
-    else if (ldb < Astore3d->m_loc)
+    else if (ldb < Astore->m_loc)
          *info = -5;
     else if (nrhs < 0) {
 	 *info = -6;
     }
     if (*info) {
 	i = -(*info);
-	pxerr_dist ("pzgssvx3d", grid, -*info);
+	pxerr_dist ("pzgssvx3d", grid, -(*info));
 	return;
     }
     
+    /* Initialization. */
+
+    options->Algo3d = YES;
+	
+    /* definition of factored seen by each process layer */
+    factored = (Fact == FACTORED);
+    
+    /* Save the inputs: ldb -> ldb3d, and B -> B3d, Astore -> Astore3d,
+       so that the names {ldb, B, and Astore} can be used internally.
+       B3d and Astore3d will be assigned back to B and Astore on return. */
+    int ldb3d = ldb;
+    NRformat_loc *Astore3d = (NRformat_loc *)A->Store;
+    NRformat_loc3d *A3d = SOLVEstruct->A3d;
+
+    /* B3d is aliased to B;
+       B2d is allocated; 
+       B is then aliased to B2d for the following 2D solve;
+    */
+    zGatherNRformat_loc3d(Fact, (NRformat_loc *)A->Store,
+			  B, ldb, nrhs, grid3d, &A3d);
+    
+    B = (doublecomplex *) A3d->B2d; /* B is now pointing to B2d, 
+				allocated in zGatherNRformat_loc3d.  */
+    //PrintDouble5("after gather B=B2d", ldb, B);
+    
+    SOLVEstruct->A3d = A3d; /* This structure needs to be persistent across
+			       multiple calls of pzgssvx3d()   */
+    
+    NRformat_loc *Astore0 = A3d->A_nfmt; // on 2D grid-0
+    NRformat_loc *A_orig = A->Store;
+
 #if ( DEBUGlevel>=1 )
-	CHECK_MALLOC (iam, "Enter pzgssvx3d()");
+    CHECK_MALLOC (iam, "Enter pzgssvx3d()");
 #endif
 	
     /* Perform preprocessing steps on process layer zero, including:
-       gather 3D matrices {A, B} onto 2D grid-0,
-       ordering, symbolic factorization, distribution of L & U */
+       gather 3D matrices {A, B} onto 2D grid-0, preprocessing steps: 
+       - equilibration,
+       - ordering, 
+       - symbolic factorization,
+       - distribution of L & U                                      */
 
-#define NRFRMT
-
-    if (grid3d->zscp.Iam == 0)
+    if (grid3d->zscp.Iam == 0)  /* on 2D grid-0 */
     {
         m = A->nrow;
     	n = A->ncol;
 	// checkNRFMT(Astore0, (NRformat_loc *) A->Store);
-#ifdef NRFRMT
+
 	// On input, A->Store is on 3D, now A->Store is re-assigned to 2D store
-	A->Store = Astore0;
+	A->Store = Astore0;  // on 2D grid-0
 	ldb = Astore0->m_loc;
-	B = B2d; // B is now re-assigned to B2d
-	//PrintDouble5("after gather B=B2d", ldb, B);
-#endif
-
+	
 	/* The following code now works on 2D grid-0 */
     	Astore = (NRformat_loc *) A->Store;
     	nnz_loc = Astore->nnz_loc;
@@ -647,6 +656,7 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 	
 	iam = grid->iam;
 	job = 5;
+	/* Extract equilibration status from a previous factorization */
 	if (factored || (Fact == SamePattern_SameRowPerm && Equil))
 	    {
 		rowequ = (ScalePermstruct->DiagScale == ROW) ||
@@ -654,8 +664,9 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 		colequ = (ScalePermstruct->DiagScale == COL) ||
 		    (ScalePermstruct->DiagScale == BOTH);
 	    }
-	else
+	else {
 	    rowequ = colequ = FALSE;
+	}
 	
 	/* The following arrays are replicated on all processes. */
 	perm_r = ScalePermstruct->perm_r;
@@ -670,15 +681,15 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 	    /* Allocate storage if not done so before. */
 	    switch (ScalePermstruct->DiagScale)	{
 		case NOEQUIL:
 		    if (!(R = (double *) doubleMalloc_dist (m)))
 			ABORT ("Malloc fails for R[].");
 		    if (!(C = (double *) doubleMalloc_dist (n)))
 			ABORT ("Malloc fails for C[].");
 		    ScalePermstruct->R = R;
 		    ScalePermstruct->C = C;
 		    break;
 		case ROW:
 		    if (!(C = (double *) doubleMalloc_dist (n)))
 			ABORT ("Malloc fails for C[].");
 		    ScalePermstruct->C = C;
 		    break;
@@ -694,7 +705,7 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 	/* ------------------------------------------------------------
 	   Diagonal scaling to equilibrate the matrix.
 	   ------------------------------------------------------------ */
-	if (Equil) {
+	if ( Equil ) {
 #if ( DEBUGlevel>=1 )
 	    CHECK_MALLOC (iam, "Enter equil");
 #endif
@@ -783,7 +794,7 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 #endif
 	} /* end if Equil ... LAPACK style, not involving MC64 */
 
-	if (!factored) { /* Skip this if already factored. */
+	if ( !factored ) { /* Skip this if already factored. */
 	    /*
 	     * Gather A from the distributed compressed row format to
 	     * global A in compressed column format.
@@ -976,7 +987,7 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 #endif
 	} /* end if (!factored) */
 
-	if (!factored || options->IterRefine) {
+	if ( !factored || options->IterRefine ) {
 	    /* Compute norm(A), which will be used to adjust small diagonal. */
 	    if (notran)
 		*(unsigned char *) norm = '1';
@@ -994,7 +1005,7 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 	/* ------------------------------------------------------------
 	   Perform the LU factorization.
 	   ------------------------------------------------------------ */
-	if (!factored) {
+	if ( !factored ) {
 	    t = SuperLU_timer_ ();
 	    /*
 	     * Get column permutation vector perm_c[], according to permc_spec:
@@ -1313,29 +1324,29 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 	/* ------------------------------------------------------------
 	   Compute the solution matrix X.
 	   ------------------------------------------------------------ */
-	if (nrhs) {
-		if (!(b_work = doublecomplexMalloc_dist (n)))
-		    ABORT ("Malloc fails for b_work[]");
+	if ( nrhs > 0 ) {
+	    if (!(b_work = doublecomplexMalloc_dist (n)))
+	        ABORT ("Malloc fails for b_work[]");
 
-		/* ------------------------------------------------------
-		   Scale the right-hand side if equilibration was performed
-		   ------------------------------------------------------*/
-		if (notran)
-		    {
-			if (rowequ)
-			    {
-				b_col = B;
-				for (j = 0; j < nrhs; ++j)
+	    /* ------------------------------------------------------
+	       Scale the right-hand side if equilibration was performed
+	       ------------------------------------------------------*/
+	    if (notran)
+	        {
+	    	    if (rowequ)
+		        {
+			    b_col = B;
+			    for (j = 0; j < nrhs; ++j)
+			        {
+			    	    irow = fst_row;
+				    for (i = 0; i < m_loc; ++i)
 				    {
-					irow = fst_row;
-					for (i = 0; i < m_loc; ++i)
-					    {
-                                                zd_mult(&b_col[i], &b_col[i], R[irow]);
-						++irow;
-					    }
-					b_col += ldb;
+				        zd_mult(&b_col[i], &b_col[i], R[irow]);
+				        ++irow;
 				    }
-			    }
+				    b_col += ldb;
+				}
+			}
 		    }
 		else if (colequ)
 		    {
@@ -1344,10 +1355,10 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 			    {
 				irow = fst_row;
 				for (i = 0; i < m_loc; ++i)
-				    {
-		                        zd_mult(&b_col[i], &b_col[i], C[irow]);
-					++irow;
-				    }
+				{
+				    zd_mult(&b_col[i], &b_col[i], C[irow]);
+				    ++irow;
+				}
 				b_col += ldb;
 			    }
 		    }
@@ -1480,7 +1491,7 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 			    }
 			
 			stat->utime[REFINE] = SuperLU_timer_ () - t;
-		    }
+		    } /* end IterRefine */
 		
 		/* Permute the solution matrix B <= Pc'*X. */
 		pzPermute_Dense_Matrix (fst_row, m_loc, SOLVEstruct->row_to_proc,
@@ -1529,11 +1540,12 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 		SUPERLU_FREE (b_work);
 		SUPERLU_FREE (X);
 		
-	    }                           /* end if nrhs != 0 */
+	    } /* end if nrhs > 0 */
 	
 #if ( PRNTlevel>=1 )
-	if (!iam)
+	if (!iam) {
 	    printf (".. DiagScale = %d\n", ScalePermstruct->DiagScale);
+	}
 #endif
 	
 	/* Deallocate R and/or C if it was not used. */
@@ -1561,25 +1573,12 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 
     } /* process layer 0 done solve */
 
-#ifdef NRFRMT
-    /* Scatter the solution from 2D grid_0 to 3D grid */
-    zScatter_B3d(A3d, grid3d);
+    /* Scatter the solution from 2D grid-0 to 3D grid */
+    if ( nrhs > 0 ) zScatter_B3d(A3d, grid3d);
 
     B = A3d->B3d; // B is now assigned back to B3d on return
     A->Store = Astore3d; // restore Astore to 3D
     
-    /* free A2d and B2d, which are allocated only in 2D layer Grid_0 */
-    NRformat_loc *A2d = A3d->A_nfmt;
-    if (grid3d->zscp.Iam == 0) {
-       SUPERLU_FREE( A2d->rowptr );
-       SUPERLU_FREE( A2d->colind );
-       SUPERLU_FREE( A2d->nzval );
-       SUPERLU_FREE( A3d->B2d );
-    }
-    SUPERLU_FREE( A2d );         // free 2D structure
-    SUPERLU_FREE( A3d );         // free 3D structure
-#endif
-
 #if ( DEBUGlevel>=1 )
 	CHECK_MALLOC (iam, "Exit pzgssvx3d()");
 #endif
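
pzgssvx3d scales the complex right-hand side with zd_mult rather than a plain `*=`, since B holds doublecomplex entries while the equilibration factors R[] and C[] are real. A standalone sketch of what that operation computes; the library's actual definition lives in its complex-arithmetic headers, so this version is for illustration only:

    /* Sketch: multiply a double-complex value by a real scalar,
       i.e. what the zd_mult() calls above compute. */
    typedef struct { double r, i; } doublecomplex_sketch;

    static void zd_mult_sketch(doublecomplex_sketch *c,
                               const doublecomplex_sketch *a, double b)
    {
        c->r = a->r * b;   /* scale the real part */
        c->i = a->i * b;   /* scale the imaginary part */
    }
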
diff --git a/SRC/pzutil.c b/SRC/pzutil.c
index bd77323e..c340c061 100644
--- a/SRC/pzutil.c
+++ b/SRC/pzutil.c
@@ -776,21 +776,43 @@ int zSolveInit(superlu_dist_options_t *options, SuperMatrix *A,
  */
 void zSolveFinalize(superlu_dist_options_t *options, zSOLVEstruct_t *SOLVEstruct)
 {
-    pxgstrs_finalize(SOLVEstruct->gstrs_comm);
-
-    if ( options->RefineInitialized ) {
-        pzgsmv_finalize(SOLVEstruct->gsmv_comm);
-	options->RefineInitialized = NO;
+    if ( options->SolveInitialized ) {
+        pxgstrs_finalize(SOLVEstruct->gstrs_comm);
+
+        if ( options->RefineInitialized ) {
+            pzgsmv_finalize(SOLVEstruct->gsmv_comm);
+	    options->RefineInitialized = NO;
+        }
+        SUPERLU_FREE(SOLVEstruct->gsmv_comm);
+        SUPERLU_FREE(SOLVEstruct->row_to_proc);
+        SUPERLU_FREE(SOLVEstruct->inv_perm_c);
+        SUPERLU_FREE(SOLVEstruct->diag_procs);
+        SUPERLU_FREE(SOLVEstruct->diag_len);
+        if ( SOLVEstruct->A_colind_gsmv )
+	    SUPERLU_FREE(SOLVEstruct->A_colind_gsmv);
+        options->SolveInitialized = NO;
     }
-    SUPERLU_FREE(SOLVEstruct->gsmv_comm);
-    SUPERLU_FREE(SOLVEstruct->row_to_proc);
-    SUPERLU_FREE(SOLVEstruct->inv_perm_c);
-    SUPERLU_FREE(SOLVEstruct->diag_procs);
-    SUPERLU_FREE(SOLVEstruct->diag_len);
-    if ( SOLVEstruct->A_colind_gsmv ) SUPERLU_FREE(SOLVEstruct->A_colind_gsmv);
-    options->SolveInitialized = NO;
 } /* zSolveFinalize */
 
+void zDestroy_A3d_gathered_on_2d(zSOLVEstruct_t *SOLVEstruct, gridinfo3d_t *grid3d)
+{
+    /* free A2d and B2d, which are allocated only in 2D layer grid-0 */
+    NRformat_loc3d *A3d = SOLVEstruct->A3d;
+    NRformat_loc *A2d = A3d->A_nfmt;
+    if (grid3d->zscp.Iam == 0) {
+	SUPERLU_FREE( A2d->rowptr );
+	SUPERLU_FREE( A2d->colind );
+	SUPERLU_FREE( A2d->nzval );
+    }
+    SUPERLU_FREE(A3d->b_counts_int);  // free the count and displacement arrays
+    SUPERLU_FREE(A3d->b_disp);
+    SUPERLU_FREE(A3d->row_counts_int);
+    SUPERLU_FREE(A3d->row_disp);
+    SUPERLU_FREE( A2d );         // free 2D structure
+    SUPERLU_FREE( A3d );         // free 3D structure
+} /* zDestroy_A3d_gathered_on_2d */
+
+
 /*! \brief Check the inf-norm of the error vector
  */
 void pzinf_norm_error(int iam, int_t n, int_t nrhs, doublecomplex x[], int_t ldx,
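
The z-precision changes above follow the same pattern as the d and s files: xGatherNRformat_loc3d no longer returns a freshly allocated structure but takes Fact plus the caller's A3d handle by address, so a repeated factorization can reuse the gathered 2D matrix (the new signature is shown in the snrformat_loc3d.c hunk below). The implied call-site change, sketched for the z precision:

    #include "superlu_zdefs.h"

    /* Sketch: new call sequence for the gather; the handle lives in
       SOLVEstruct->A3d (typically NULL before the first factorization)
       and persists across repeated pzgssvx3d() calls. */
    static void gather_sketch(fact_t Fact, SuperMatrix *A, doublecomplex *B,
                              int ldb, int nrhs, gridinfo3d_t *grid3d,
                              zSOLVEstruct_t *SOLVEstruct)
    {
        NRformat_loc3d *A3d = SOLVEstruct->A3d;
        zGatherNRformat_loc3d(Fact, (NRformat_loc *) A->Store,
                              B, ldb, nrhs, grid3d, &A3d);
        SOLVEstruct->A3d = A3d;    /* keep the handle for later calls */
    }
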
diff --git a/SRC/snrformat_loc3d.c b/SRC/snrformat_loc3d.c
index 57874544..01855bf5 100644
--- a/SRC/snrformat_loc3d.c
+++ b/SRC/snrformat_loc3d.c
@@ -43,151 +43,190 @@ static void matCopy(int n, int m, float *Dst, int lddst, float *Src, int ldsrc)
 *             output is in the A3d->{} structure returned through A3d_addr.
  *             see supermatrix.h for nrformat_loc3d{} structure.
  */
-NRformat_loc3d *sGatherNRformat_loc3d(NRformat_loc *A, // input, on 3D grid
-                                      float *B,       // input
-				      int ldb, int nrhs, // input
-                                      gridinfo3d_t *grid3d)
+void sGatherNRformat_loc3d
+(
+ fact_t Fact,     // how matrix A will be factorized
+ NRformat_loc *A, // input, on 3D grid
+ float *B,       // input
+ int ldb, int nrhs, // input
+ gridinfo3d_t *grid3d, 
+ NRformat_loc3d **A3d_addr /* If Fact == SamePattern or SamePattern_SameRowPerm,
+ 		              it is an input (reused); otherwise it is an
+ 		              output, gathered from scratch */
+ )
 {
-    NRformat_loc3d *A3d = SUPERLU_MALLOC(sizeof(NRformat_loc3d));
-    NRformat_loc *A2d = SUPERLU_MALLOC(sizeof(NRformat_loc));
-    A3d->m_loc = A->m_loc;
-    A3d->B3d = (float *) B; // on 3D process grid
-    A3d->ldb = ldb;
-    A3d->nrhs = nrhs;
-
-    // find number of nnzs
-    int_t *nnz_counts; // number of local nonzeros relative to all processes
-    int_t *row_counts; // number of local rows relative to all processes
-    int *nnz_counts_int, *row_counts_int; // 32-bit
-    int *nnz_disp, *row_disp; // displacement
-    int *b_counts_int; // number of local B entries relative to all processes 
-    int *b_disp;       // including 'nrhs'
-
-    nnz_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t));
-    row_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t));
-    nnz_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
-    row_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
-    b_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
-    MPI_Gather(&A->nnz_loc, 1, mpi_int_t, nnz_counts,
-               1, mpi_int_t, 0, grid3d->zscp.comm);
-    MPI_Gather(&A->m_loc, 1, mpi_int_t, row_counts,
-               1, mpi_int_t, 0, grid3d->zscp.comm);
-    nnz_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
-    row_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
-    b_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
-
-    nnz_disp[0] = 0;
-    row_disp[0] = 0;
-    b_disp[0] = 0;
-    for (int i = 0; i < grid3d->npdep; i++)
-    {
-        nnz_disp[i + 1] = nnz_disp[i] + nnz_counts[i];
-        row_disp[i + 1] = row_disp[i] + row_counts[i];
-        b_disp[i + 1] = nrhs * row_disp[i + 1];
-        nnz_counts_int[i] = nnz_counts[i];
-        row_counts_int[i] = row_counts[i];
-        b_counts_int[i] = nrhs * row_counts[i];
-    }
-
-    if (grid3d->zscp.Iam == 0)
-    {
-        A2d->colind = SUPERLU_MALLOC(nnz_disp[grid3d->npdep] * sizeof(int_t));
-        A2d->nzval = SUPERLU_MALLOC(nnz_disp[grid3d->npdep] * sizeof(float));
-        A2d->rowptr = SUPERLU_MALLOC((row_disp[grid3d->npdep] + 1) * sizeof(int_t));
-        A2d->rowptr[0] = 0;
-    }
-
-    MPI_Gatherv(A->nzval, A->nnz_loc, MPI_FLOAT, A2d->nzval,
-                nnz_counts_int, nnz_disp,
-                MPI_FLOAT, 0, grid3d->zscp.comm);
-    MPI_Gatherv(A->colind, A->nnz_loc, mpi_int_t, A2d->colind,
-                nnz_counts_int, nnz_disp,
-                mpi_int_t, 0, grid3d->zscp.comm);
-    MPI_Gatherv(&A->rowptr[1], A->m_loc, mpi_int_t, &A2d->rowptr[1],
-                row_counts_int, row_disp,
-                mpi_int_t, 0, grid3d->zscp.comm);
-
-    if (grid3d->zscp.Iam == 0)
-    {
-        for (int i = 0; i < grid3d->npdep; i++)
-        {
-            for (int j = row_disp[i] + 1; j < row_disp[i + 1] + 1; j++)
-            {
-                // A2d->rowptr[j] += row_disp[i];
-                A2d->rowptr[j] += nnz_disp[i];
-            }
-        }
-        A2d->nnz_loc = nnz_disp[grid3d->npdep];
-        A2d->m_loc = row_disp[grid3d->npdep];
-
-        if (grid3d->rankorder == 1) { // XY-major
-     	    A2d->fst_row = A->fst_row;
-	} else { // Z-major
-	    gridinfo_t *grid2d = &(grid3d->grid2d);
-            int procs2d = grid2d->nprow * grid2d->npcol;
-            int m_loc_2d = A2d->m_loc;
-            int *m_loc_2d_counts = SUPERLU_MALLOC(procs2d * sizeof(int));
-
-            MPI_Allgather(&m_loc_2d, 1, MPI_INT, m_loc_2d_counts, 1, 
-	                  MPI_INT, grid2d->comm);
-
-            int fst_row = 0;
-            for (int p = 0; p < procs2d; ++p)
-            {
-		if (grid2d->iam == p)
-                   A2d->fst_row = fst_row;
-            	fst_row += m_loc_2d_counts[p];
-            }
-
-            SUPERLU_FREE(m_loc_2d_counts);
-        }
-    }
-
-    // Btmp <- compact(B)
-    // compacting B
-    float *Btmp;
-    Btmp = SUPERLU_MALLOC(A->m_loc * nrhs * sizeof(float));
-    matCopy(A->m_loc, nrhs, Btmp, A->m_loc, B, ldb);
-
-    float *B1;
-    if (grid3d->zscp.Iam == 0)
-    {
-        B1 = SUPERLU_MALLOC(A2d->m_loc * nrhs * sizeof(float));
-        A3d->B2d = (float *) SUPERLU_MALLOC(A2d->m_loc * nrhs * sizeof(float));
-    }
-
-    // B1 <- gatherv(Btmp)
-    MPI_Gatherv(Btmp, nrhs * A->m_loc, MPI_FLOAT, B1,
-                b_counts_int, b_disp,
-                MPI_FLOAT, 0, grid3d->zscp.comm);
+    NRformat_loc3d *A3d = (NRformat_loc3d *) *A3d_addr;
+    NRformat_loc *A2d;
+    int *row_counts_int; // 32-bit, number of local rows relative to all processes
+    int *row_disp;       // displacement
+    int *b_counts_int;   // number of local B entries relative to all processes 
+    int *b_disp;         // including 'nrhs'
+	
+    /********* Gather A2d *********/
+    if ( Fact == SamePattern || Fact == SamePattern_SameRowPerm ) {
+	/* A3d is input. No need to recompute the counts.
+	   Only need to gather the A2d matrix.  */
+	b_counts_int   = A3d->b_counts_int;
+	b_disp         = A3d->b_disp;
+	row_counts_int = A3d->row_counts_int;
+	row_disp       = A3d->row_disp;
+
+	if (grid3d->iam==0) printf("TO BE COMPLETED!\n");
+    } else if ( Fact != FACTORED ) {
+	/* A3d is output. Compute counts from scratch */
+	A3d = SUPERLU_MALLOC(sizeof(NRformat_loc3d));
+	A2d = SUPERLU_MALLOC(sizeof(NRformat_loc));
+    
+	// find number of nnzs
+	int_t *nnz_counts; // number of local nonzeros relative to all processes
+	int_t *row_counts; // number of local rows relative to all processes
+	int *nnz_counts_int; // 32-bit
+	int *nnz_disp; // displacement
+
+	nnz_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t));
+	row_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t));
+	nnz_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
+	row_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
+	b_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
+	MPI_Gather(&A->nnz_loc, 1, mpi_int_t, nnz_counts,
+		   1, mpi_int_t, 0, grid3d->zscp.comm);
+	MPI_Gather(&A->m_loc, 1, mpi_int_t, row_counts,
+		   1, mpi_int_t, 0, grid3d->zscp.comm);
+	nnz_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
+	row_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
+	b_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
+
+	nnz_disp[0] = 0;
+	row_disp[0] = 0;
+	b_disp[0] = 0;
+	for (int i = 0; i < grid3d->npdep; i++)
+	    {
+		nnz_disp[i + 1] = nnz_disp[i] + nnz_counts[i];
+		row_disp[i + 1] = row_disp[i] + row_counts[i];
+		b_disp[i + 1] = nrhs * row_disp[i + 1];
+		nnz_counts_int[i] = nnz_counts[i];
+		row_counts_int[i] = row_counts[i];
+		b_counts_int[i] = nrhs * row_counts[i];
+	    }
 
-    // B2d <- colMajor(B1)
-    if (grid3d->zscp.Iam == 0)
-    {
-        for (int i = 0; i < grid3d->npdep; ++i)
-        {
-            /* code */
-            matCopy(row_counts_int[i], nrhs, ((float*)A3d->B2d) + row_disp[i],
-		    A2d->m_loc, B1 + nrhs * row_disp[i], row_counts_int[i]);
-        }
+	if (grid3d->zscp.Iam == 0)
+	    {
+		A2d->colind = intMalloc_dist(nnz_disp[grid3d->npdep]);
+		A2d->nzval = floatMalloc_dist(nnz_disp[grid3d->npdep]);
+		A2d->rowptr = intMalloc_dist((row_disp[grid3d->npdep] + 1));
+		A2d->rowptr[0] = 0;
+	    }
 
-        SUPERLU_FREE(B1);
-    }
+	MPI_Gatherv(A->nzval, A->nnz_loc, MPI_FLOAT, A2d->nzval,
+		    nnz_counts_int, nnz_disp,
+		    MPI_FLOAT, 0, grid3d->zscp.comm);
+	MPI_Gatherv(A->colind, A->nnz_loc, mpi_int_t, A2d->colind,
+		    nnz_counts_int, nnz_disp,
+		    mpi_int_t, 0, grid3d->zscp.comm);
+	MPI_Gatherv(&A->rowptr[1], A->m_loc, mpi_int_t, &A2d->rowptr[1],
+		    row_counts_int, row_disp,
+		    mpi_int_t, 0, grid3d->zscp.comm);
+
+	if (grid3d->zscp.Iam == 0)
+	    {
+		for (int i = 0; i < grid3d->npdep; i++)
+		    {
+			for (int j = row_disp[i] + 1; j < row_disp[i + 1] + 1; j++)
+			    {
+				// A2d->rowptr[j] += row_disp[i];
+				A2d->rowptr[j] += nnz_disp[i];
+			    }
+		    }
+		A2d->nnz_loc = nnz_disp[grid3d->npdep];
+		A2d->m_loc = row_disp[grid3d->npdep];
+
+		if (grid3d->rankorder == 1) { // XY-major
+		    A2d->fst_row = A->fst_row;
+		} else { // Z-major
+		    gridinfo_t *grid2d = &(grid3d->grid2d);
+		    int procs2d = grid2d->nprow * grid2d->npcol;
+		    int m_loc_2d = A2d->m_loc;
+		    int *m_loc_2d_counts = SUPERLU_MALLOC(procs2d * sizeof(int));
+
+		    MPI_Allgather(&m_loc_2d, 1, MPI_INT, m_loc_2d_counts, 1, 
+				  MPI_INT, grid2d->comm);
+
+		    int fst_row = 0;
+		    for (int p = 0; p < procs2d; ++p)
+			{
+			    if (grid2d->iam == p)
+				A2d->fst_row = fst_row;
+			    fst_row += m_loc_2d_counts[p];
+			}
+
+		    SUPERLU_FREE(m_loc_2d_counts);
+		}
+	    } /* end 2D layer grid-0 */
+
+	A3d->A_nfmt         = A2d;
+	A3d->b_counts_int   = b_counts_int;
+	A3d->b_disp         = b_disp;
+	A3d->row_counts_int = row_counts_int;
+	A3d->row_disp       = row_disp;
+
+	/* free storage */
+	SUPERLU_FREE(nnz_counts);
+	SUPERLU_FREE(nnz_counts_int);
+	SUPERLU_FREE(row_counts);
+	SUPERLU_FREE(nnz_disp);
+	
+	*A3d_addr = (NRformat_loc3d *) A3d; // return pointer to A3d struct
+	
+    } /* end else: Factor from scratch */
 
-    A3d->A_nfmt = A2d;
-    A3d->b_counts_int = b_counts_int;
-    A3d->b_disp = b_disp;
-    A3d->row_counts_int = row_counts_int;
-    A3d->row_disp = row_disp;
+    A3d->m_loc = A->m_loc;
+    A3d->B3d = (float *) B; /* save the pointer to the original B
+				    stored on 3D process grid.  */
+    A3d->ldb = ldb;
+    A3d->nrhs = nrhs;
+	
+    /********* Gather B2d **********/
+    if ( nrhs > 0 ) {
+	
+	A2d = (NRformat_loc *) A3d->A_nfmt; // matrix A gathered on 2D grid-0
+	b_counts_int   = A3d->b_counts_int;
+	b_disp         = A3d->b_disp;
+	row_counts_int = A3d->row_counts_int;
+	row_disp       = A3d->row_disp;
+
+	/* Btmp <- compact(B), compacting B */
+	float *Btmp;
+	Btmp = SUPERLU_MALLOC(A->m_loc * nrhs * sizeof(float));
+	matCopy(A->m_loc, nrhs, Btmp, A->m_loc, B, ldb);
+
+	float *B1;
+	if (grid3d->zscp.Iam == 0)
+	    {
+		B1 = floatMalloc_dist(A2d->m_loc * nrhs);
+		A3d->B2d = floatMalloc_dist(A2d->m_loc * nrhs);
+	    }
 
-    /* free storage */
-    SUPERLU_FREE(nnz_counts);
-    SUPERLU_FREE(nnz_counts_int);
-    SUPERLU_FREE(row_counts);
-    SUPERLU_FREE(nnz_disp);
-    SUPERLU_FREE(Btmp);
+	// B1 <- gatherv(Btmp)
+	MPI_Gatherv(Btmp, nrhs * A->m_loc, MPI_FLOAT, B1,
+		    b_counts_int, b_disp,
+		    MPI_FLOAT, 0, grid3d->zscp.comm);
+	SUPERLU_FREE(Btmp);
+
+	// B2d <- colMajor(B1)
+	if (grid3d->zscp.Iam == 0)
+	    {
+		for (int i = 0; i < grid3d->npdep; ++i)
+		    {
+			/* code */
+			matCopy(row_counts_int[i], nrhs, ((float*)A3d->B2d) + row_disp[i],
+				A2d->m_loc, B1 + nrhs * row_disp[i], row_counts_int[i]);
+		    }
+		
+		SUPERLU_FREE(B1);
+	    }
 
-    return A3d;
+    } /* end gather B2d */
 
 } /* sGatherNRformat_loc3d */
 
@@ -198,18 +237,18 @@ NRformat_loc3d *sGatherNRformat_loc3d(NRformat_loc *A, // input, on 3D grid
 int sScatter_B3d(NRformat_loc3d *A3d,  // modified
 		 gridinfo3d_t *grid3d)
 {
-    float *B = (float *) A3d->B3d; // on 3D grid
+    float *B = (float *) A3d->B3d; // retrieve original pointer on 3D grid
     int ldb = A3d->ldb;
     int nrhs = A3d->nrhs;
-    float *B2d = (float *) A3d->B2d; // on 2D layer 0 
-    NRformat_loc A2d = *(A3d->A_nfmt);
+    float *B2d = (float *) A3d->B2d; // only on 2D layer grid_0 
+    NRformat_loc *A2d = A3d->A_nfmt;
 
-    /* The following are the number of local rows relative to all processes */
-    int m_loc = A3d->m_loc;
-    int *b_counts_int = A3d->b_counts_int;
-    int *b_disp = A3d->b_disp;
+    /* The following are the number of local rows relative to Z-dimension */
+    int m_loc           = A3d->m_loc;
+    int *b_counts_int   = A3d->b_counts_int;
+    int *b_disp         = A3d->b_disp;
     int *row_counts_int = A3d->row_counts_int;
-    int *row_disp = A3d->row_disp;
+    int *row_disp       = A3d->row_disp;
     int i, p;
     int iam = grid3d->iam;
     int rankorder = grid3d->rankorder;
@@ -218,7 +257,7 @@ int sScatter_B3d(NRformat_loc3d *A3d,  // modified
     float *B1;  // on 2D layer 0
     if (grid3d->zscp.Iam == 0)
     {
-        B1 = SUPERLU_MALLOC(A2d.m_loc * nrhs * sizeof(float));
+        B1 = floatMalloc_dist(A2d->m_loc * nrhs);
     }
 
     // B1 <- BlockByBlock(B2d)
@@ -228,12 +267,12 @@ int sScatter_B3d(NRformat_loc3d *A3d,  // modified
         {
             /* code */
             matCopy(row_counts_int[i], nrhs, B1 + nrhs * row_disp[i], row_counts_int[i],
-                    B2d + row_disp[i], A2d.m_loc);
+                    B2d + row_disp[i], A2d->m_loc);
         }
     }
 
     float *Btmp; // on 3D grid
-    Btmp = SUPERLU_MALLOC(A3d->m_loc * nrhs * sizeof(float));
+    Btmp = floatMalloc_dist(A3d->m_loc * nrhs);
 
     // Btmp <- scatterv(B1), block-by-block
     if ( rankorder == 1 ) { /* XY-major in 3D grid */
@@ -304,12 +343,11 @@ int sScatter_B3d(NRformat_loc3d *A3d,  // modified
     matCopy(A3d->m_loc, nrhs, B, ldb, Btmp, A3d->m_loc);
 
     /* free storage */
-    SUPERLU_FREE(A3d->b_counts_int);
-    SUPERLU_FREE(A3d->b_disp);
-    SUPERLU_FREE(A3d->row_counts_int);
-    SUPERLU_FREE(A3d->row_disp);
     SUPERLU_FREE(Btmp);
-    if (grid3d->zscp.Iam == 0) SUPERLU_FREE(B1);
+    if (grid3d->zscp.Iam == 0) {
+	SUPERLU_FREE(B1);
+	SUPERLU_FREE(B2d);
+    }
 
     return 0;
 } /* sScatter_B3d */
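
Taken together, the reworked gather and scatter give the 3D driver the
following per-call shape; this is a rough sketch using the double-precision
counterparts declared in superlu_ddefs.h below (variable names are
hypothetical):

```c
NRformat_loc3d *A3d = NULL;   /* persists across factor/solve calls */
dGatherNRformat_loc3d(options->Fact, Astore, B, ldb, nrhs, grid3d, &A3d);
if (grid3d->zscp.Iam == 0) {
    /* ... 2D factorization and solve on layer grid-0 ... */
}
if (nrhs > 0) dScatter_B3d(A3d, grid3d); /* return solution to 3D grid */
```
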
diff --git a/SRC/superlu_ddefs.h b/SRC/superlu_ddefs.h
index cb4c29f9..1a724af8 100644
--- a/SRC/superlu_ddefs.h
+++ b/SRC/superlu_ddefs.h
@@ -225,9 +225,9 @@ typedef struct {
                              positions in the gathered x-vector.
                              This is re-used in repeated calls to pdgsmv() */
     int_t *xrow_to_proc; /* used by PDSLin */
-    NRformat_loc3d* A3d; /* Point to 3D {A, B} gathered on layer 0 of the 2D process grid.
-			    This needs to be peresistent between 3D factorization
-			    and solve.  */
+    NRformat_loc3d* A3d; /* Points to 3D {A, B} gathered on 2D layer 0.
+                            This needs to be persistent between
+			    3D factorization and solve.  */
 } dSOLVEstruct_t;
 
 
@@ -649,7 +649,6 @@ extern int dcreate_matrix_postfix3d(SuperMatrix *A, int nrhs, double **rhs,
     
 /* Matrix distributed in NRformat_loc in 3D process grid. It converts 
    it to a NRformat_loc distributed in 2D grid in grid-0 */
-//extern NRformat_loc3d *dGatherNRformat_loc3d(NRformat_loc *A, double *B,
 extern void dGatherNRformat_loc3d(fact_t Fact, NRformat_loc *A, double *B,
 				   int ldb, int nrhs, gridinfo3d_t *grid3d,
 				   NRformat_loc3d **);
diff --git a/SRC/superlu_defs.h b/SRC/superlu_defs.h
index c6d50d7b..db21c7fc 100644
--- a/SRC/superlu_defs.h
+++ b/SRC/superlu_defs.h
@@ -87,7 +87,7 @@ at the top-level directory.
 #elif defined (_LONGINT)
   typedef int64_t int_t;
   #define mpi_int_t   MPI_LONG_LONG_INT
-  #define IFMT "%ld"
+  #define IFMT "%lld"
 #else /* Default */
   typedef int int_t;
   #define mpi_int_t   MPI_INT
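
With -D_LONGINT, int_t is int64_t, so the old "%ld" under-specifies the
width on LLP64 platforms such as Windows, where long is 32-bit. A small
self-contained illustration (not part of the library) of pairing IFMT
with a matching cast:

```c
#include <stdio.h>
#include <stdint.h>

typedef int64_t int_t;   /* the _LONGINT configuration */
#define IFMT "%lld"

int main(void) {
    int_t nnz = 5000000000LL;                    /* exceeds 32-bit range */
    printf("nnz = " IFMT "\n", (long long) nnz); /* cast matches %lld    */
    return 0;
}
```
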
diff --git a/SRC/superlu_grid3d.c b/SRC/superlu_grid3d.c
index 0b423b0e..a19b6bf6 100644
--- a/SRC/superlu_grid3d.c
+++ b/SRC/superlu_grid3d.c
@@ -223,7 +223,7 @@ void superlu_gridmap3d(
 
     // grid->grid2d.cscp = grid->cscp;
 
-#if 0
+#if 1
     if ( (grid->zscp).Iam == 0) {
       printf("(3d grid: layer 0) iam %d, grid->grid2d.iam %d\n",
 	     grid->iam, (grid->grid2d).iam);
@@ -231,11 +231,12 @@ void superlu_gridmap3d(
     fflush(stdout);
 #endif
 
+    MPI_Comm_free( &superlu3d_comm );  // Sherry added
+    
  gridmap_out:    
     SUPERLU_FREE(pranks);
     MPI_Group_free( &superlu_grp );
     MPI_Group_free( &mpi_base_group );
-    MPI_Comm_free( &superlu3d_comm );  // Sherry added
 }
 
 void superlu_gridexit3d(gridinfo3d_t *grid)
diff --git a/SRC/superlu_zdefs.h b/SRC/superlu_zdefs.h
index 0b63cf6c..ed7c53ea 100644
--- a/SRC/superlu_zdefs.h
+++ b/SRC/superlu_zdefs.h
@@ -225,6 +225,9 @@ typedef struct {
                              positions in the gathered x-vector.
                              This is re-used in repeated calls to pzgsmv() */
     int_t *xrow_to_proc; /* used by PDSLin */
+    NRformat_loc3d* A3d; /* Points to 3D {A, B} gathered on 2D layer 0.
+                            This needs to be persistent between
+			    3D factorization and solve.  */
 } zSOLVEstruct_t;
 
 
@@ -425,6 +428,7 @@ extern void  pzCompute_Diag_Inv(int_t, zLUstruct_t *,gridinfo_t *, SuperLUStat_t
 extern int  zSolveInit(superlu_dist_options_t *, SuperMatrix *, int_t [], int_t [],
 		       int_t, zLUstruct_t *, gridinfo_t *, zSOLVEstruct_t *);
 extern void zSolveFinalize(superlu_dist_options_t *, zSOLVEstruct_t *);
+extern void zDestroy_A3d_gathered_on_2d(zSOLVEstruct_t *, gridinfo3d_t *);
 extern int_t pzgstrs_init(int_t, int_t, int_t, int_t,
                           int_t [], int_t [], gridinfo_t *grid,
 	                  Glu_persist_t *, zSOLVEstruct_t *);
@@ -647,9 +651,9 @@ extern int zcreate_matrix_postfix3d(SuperMatrix *A, int nrhs, doublecomplex **rh
     
 /* Matrix distributed in NRformat_loc in 3D process grid. It converts 
    it to a NRformat_loc distributed in 2D grid in grid-0 */
-extern NRformat_loc3d *zGatherNRformat_loc3d(NRformat_loc *A, doublecomplex *B,
-					     int ldb, int nrhs,
-					     gridinfo3d_t *grid3d);
+extern void zGatherNRformat_loc3d(fact_t Fact, NRformat_loc *A, doublecomplex *B,
+				   int ldb, int nrhs, gridinfo3d_t *grid3d,
+				   NRformat_loc3d **);
 extern int zScatter_B3d(NRformat_loc3d *A3d, gridinfo3d_t *grid3d);
 
 extern void pzgssvx3d (superlu_dist_options_t *, SuperMatrix *,
diff --git a/SRC/znrformat_loc3d.c b/SRC/znrformat_loc3d.c
index d3f16e05..8d7d046d 100644
--- a/SRC/znrformat_loc3d.c
+++ b/SRC/znrformat_loc3d.c
@@ -17,9 +17,8 @@ at the top-level directory.
  *
  * 
  * -- Distributed SuperLU routine (version 7.0) --
- * Lawrence Berkeley National Lab, Univ. of California Berkeley,
- * Oak Ridge National Lab.
- * October 22, 2020
+ * Lawrence Berkeley National Lab, Oak Ridge National Lab.
+ * May 12, 2021
  */
 
 #include "superlu_zdefs.h"
@@ -43,151 +42,190 @@ static void matCopy(int n, int m, doublecomplex *Dst, int lddst, doublecomplex *
 *             output is in the A3d->{} structure returned through A3d_addr.
  *             see supermatrix.h for nrformat_loc3d{} structure.
  */
-NRformat_loc3d *zGatherNRformat_loc3d(NRformat_loc *A, // input, on 3D grid
-                                      doublecomplex *B,       // input
-				      int ldb, int nrhs, // input
-                                      gridinfo3d_t *grid3d)
+void zGatherNRformat_loc3d
+(
+ fact_t Fact,     // how matrix A will be factorized
+ NRformat_loc *A, // input, on 3D grid
+ doublecomplex *B,       // input
+ int ldb, int nrhs, // input
+ gridinfo3d_t *grid3d, 
+ NRformat_loc3d **A3d_addr /* If Fact == SamePattern or SamePattern_SameRowPerm,
+ 		              it is an input (reused); otherwise it is an
+ 		              output, gathered from scratch */
+ )
 {
-    NRformat_loc3d *A3d = SUPERLU_MALLOC(sizeof(NRformat_loc3d));
-    NRformat_loc *A2d = SUPERLU_MALLOC(sizeof(NRformat_loc));
-    A3d->m_loc = A->m_loc;
-    A3d->B3d = (doublecomplex *) B; // on 3D process grid
-    A3d->ldb = ldb;
-    A3d->nrhs = nrhs;
-
-    // find number of nnzs
-    int_t *nnz_counts; // number of local nonzeros relative to all processes
-    int_t *row_counts; // number of local rows relative to all processes
-    int *nnz_counts_int, *row_counts_int; // 32-bit
-    int *nnz_disp, *row_disp; // displacement
-    int *b_counts_int; // number of local B entries relative to all processes 
-    int *b_disp;       // including 'nrhs'
-
-    nnz_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t));
-    row_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t));
-    nnz_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
-    row_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
-    b_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
-    MPI_Gather(&A->nnz_loc, 1, mpi_int_t, nnz_counts,
-               1, mpi_int_t, 0, grid3d->zscp.comm);
-    MPI_Gather(&A->m_loc, 1, mpi_int_t, row_counts,
-               1, mpi_int_t, 0, grid3d->zscp.comm);
-    nnz_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
-    row_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
-    b_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
-
-    nnz_disp[0] = 0;
-    row_disp[0] = 0;
-    b_disp[0] = 0;
-    for (int i = 0; i < grid3d->npdep; i++)
-    {
-        nnz_disp[i + 1] = nnz_disp[i] + nnz_counts[i];
-        row_disp[i + 1] = row_disp[i] + row_counts[i];
-        b_disp[i + 1] = nrhs * row_disp[i + 1];
-        nnz_counts_int[i] = nnz_counts[i];
-        row_counts_int[i] = row_counts[i];
-        b_counts_int[i] = nrhs * row_counts[i];
-    }
-
-    if (grid3d->zscp.Iam == 0)
-    {
-        A2d->colind = SUPERLU_MALLOC(nnz_disp[grid3d->npdep] * sizeof(int_t));
-        A2d->nzval = SUPERLU_MALLOC(nnz_disp[grid3d->npdep] * sizeof(doublecomplex));
-        A2d->rowptr = SUPERLU_MALLOC((row_disp[grid3d->npdep] + 1) * sizeof(int_t));
-        A2d->rowptr[0] = 0;
-    }
-
-    MPI_Gatherv(A->nzval, A->nnz_loc, SuperLU_MPI_DOUBLE_COMPLEX, A2d->nzval,
-                nnz_counts_int, nnz_disp,
-                SuperLU_MPI_DOUBLE_COMPLEX, 0, grid3d->zscp.comm);
-    MPI_Gatherv(A->colind, A->nnz_loc, mpi_int_t, A2d->colind,
-                nnz_counts_int, nnz_disp,
-                mpi_int_t, 0, grid3d->zscp.comm);
-    MPI_Gatherv(&A->rowptr[1], A->m_loc, mpi_int_t, &A2d->rowptr[1],
-                row_counts_int, row_disp,
-                mpi_int_t, 0, grid3d->zscp.comm);
-
-    if (grid3d->zscp.Iam == 0)
-    {
-        for (int i = 0; i < grid3d->npdep; i++)
-        {
-            for (int j = row_disp[i] + 1; j < row_disp[i + 1] + 1; j++)
-            {
-                // A2d->rowptr[j] += row_disp[i];
-                A2d->rowptr[j] += nnz_disp[i];
-            }
-        }
-        A2d->nnz_loc = nnz_disp[grid3d->npdep];
-        A2d->m_loc = row_disp[grid3d->npdep];
-
-        if (grid3d->rankorder == 1) { // XY-major
-     	    A2d->fst_row = A->fst_row;
-	} else { // Z-major
-	    gridinfo_t *grid2d = &(grid3d->grid2d);
-            int procs2d = grid2d->nprow * grid2d->npcol;
-            int m_loc_2d = A2d->m_loc;
-            int *m_loc_2d_counts = SUPERLU_MALLOC(procs2d * sizeof(int));
-
-            MPI_Allgather(&m_loc_2d, 1, MPI_INT, m_loc_2d_counts, 1, 
-	                  MPI_INT, grid2d->comm);
-
-            int fst_row = 0;
-            for (int p = 0; p < procs2d; ++p)
-            {
-		if (grid2d->iam == p)
-                   A2d->fst_row = fst_row;
-            	fst_row += m_loc_2d_counts[p];
-            }
-
-            SUPERLU_FREE(m_loc_2d_counts);
-        }
-    }
-
-    // Btmp <- compact(B)
-    // compacting B
-    doublecomplex *Btmp;
-    Btmp = SUPERLU_MALLOC(A->m_loc * nrhs * sizeof(doublecomplex));
-    matCopy(A->m_loc, nrhs, Btmp, A->m_loc, B, ldb);
-
-    doublecomplex *B1;
-    if (grid3d->zscp.Iam == 0)
-    {
-        B1 = SUPERLU_MALLOC(A2d->m_loc * nrhs * sizeof(doublecomplex));
-        A3d->B2d = (doublecomplex *) SUPERLU_MALLOC(A2d->m_loc * nrhs * sizeof(doublecomplex));
-    }
-
-    // B1 <- gatherv(Btmp)
-    MPI_Gatherv(Btmp, nrhs * A->m_loc, SuperLU_MPI_DOUBLE_COMPLEX, B1,
-                b_counts_int, b_disp,
-                SuperLU_MPI_DOUBLE_COMPLEX, 0, grid3d->zscp.comm);
+    NRformat_loc3d *A3d = (NRformat_loc3d *) *A3d_addr;
+    NRformat_loc *A2d;
+    int *row_counts_int; // 32-bit, number of local rows relative to all processes
+    int *row_disp;       // displacement
+    int *b_counts_int;   // number of local B entries relative to all processes 
+    int *b_disp;         // including 'nrhs'
+	
+    /********* Gather A2d *********/
+    if ( Fact == SamePattern || Fact == SamePattern_SameRowPerm ) {
+	/* A3d is input. No need to recompute the counts.
+	   Only need to gather the A2d matrix.  */
+	b_counts_int   = A3d->b_counts_int;
+	b_disp         = A3d->b_disp;
+	row_counts_int = A3d->row_counts_int;
+	row_disp       = A3d->row_disp;
+
+	if (grid3d->iam==0) printf("TO BE COMPLETED!\n");
+    } else if ( Fact != FACTORED ) {
+	/* A3d is output. Compute counts from scratch */
+	A3d = SUPERLU_MALLOC(sizeof(NRformat_loc3d));
+	A2d = SUPERLU_MALLOC(sizeof(NRformat_loc));
+    
+	// find number of nnzs
+	int_t *nnz_counts; // number of local nonzeros relative to all processes
+	int_t *row_counts; // number of local rows relative to all processes
+	int *nnz_counts_int; // 32-bit
+	int *nnz_disp; // displacement
+
+	nnz_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t));
+	row_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t));
+	nnz_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
+	row_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
+	b_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
+	MPI_Gather(&A->nnz_loc, 1, mpi_int_t, nnz_counts,
+		   1, mpi_int_t, 0, grid3d->zscp.comm);
+	MPI_Gather(&A->m_loc, 1, mpi_int_t, row_counts,
+		   1, mpi_int_t, 0, grid3d->zscp.comm);
+	nnz_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
+	row_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
+	b_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
+
+	nnz_disp[0] = 0;
+	row_disp[0] = 0;
+	b_disp[0] = 0;
+	for (int i = 0; i < grid3d->npdep; i++)
+	    {
+		nnz_disp[i + 1] = nnz_disp[i] + nnz_counts[i];
+		row_disp[i + 1] = row_disp[i] + row_counts[i];
+		b_disp[i + 1] = nrhs * row_disp[i + 1];
+		nnz_counts_int[i] = nnz_counts[i];
+		row_counts_int[i] = row_counts[i];
+		b_counts_int[i] = nrhs * row_counts[i];
+	    }
 
-    // B2d <- colMajor(B1)
-    if (grid3d->zscp.Iam == 0)
-    {
-        for (int i = 0; i < grid3d->npdep; ++i)
-        {
-            /* code */
-            matCopy(row_counts_int[i], nrhs, ((doublecomplex*)A3d->B2d) + row_disp[i],
-		    A2d->m_loc, B1 + nrhs * row_disp[i], row_counts_int[i]);
-        }
+	if (grid3d->zscp.Iam == 0)
+	    {
+		A2d->colind = intMalloc_dist(nnz_disp[grid3d->npdep]);
+		A2d->nzval = doublecomplexMalloc_dist(nnz_disp[grid3d->npdep]);
+		A2d->rowptr = intMalloc_dist((row_disp[grid3d->npdep] + 1));
+		A2d->rowptr[0] = 0;
+	    }
 
-        SUPERLU_FREE(B1);
-    }
+	MPI_Gatherv(A->nzval, A->nnz_loc, SuperLU_MPI_DOUBLE_COMPLEX, A2d->nzval,
+		    nnz_counts_int, nnz_disp,
+		    SuperLU_MPI_DOUBLE_COMPLEX, 0, grid3d->zscp.comm);
+	MPI_Gatherv(A->colind, A->nnz_loc, mpi_int_t, A2d->colind,
+		    nnz_counts_int, nnz_disp,
+		    mpi_int_t, 0, grid3d->zscp.comm);
+	MPI_Gatherv(&A->rowptr[1], A->m_loc, mpi_int_t, &A2d->rowptr[1],
+		    row_counts_int, row_disp,
+		    mpi_int_t, 0, grid3d->zscp.comm);
+
+	if (grid3d->zscp.Iam == 0)
+	    {
+		for (int i = 0; i < grid3d->npdep; i++)
+		    {
+			for (int j = row_disp[i] + 1; j < row_disp[i + 1] + 1; j++)
+			    {
+				// A2d->rowptr[j] += row_disp[i];
+				A2d->rowptr[j] += nnz_disp[i];
+			    }
+		    }
+		A2d->nnz_loc = nnz_disp[grid3d->npdep];
+		A2d->m_loc = row_disp[grid3d->npdep];
+
+		if (grid3d->rankorder == 1) { // XY-major
+		    A2d->fst_row = A->fst_row;
+		} else { // Z-major
+		    gridinfo_t *grid2d = &(grid3d->grid2d);
+		    int procs2d = grid2d->nprow * grid2d->npcol;
+		    int m_loc_2d = A2d->m_loc;
+		    int *m_loc_2d_counts = SUPERLU_MALLOC(procs2d * sizeof(int));
+
+		    MPI_Allgather(&m_loc_2d, 1, MPI_INT, m_loc_2d_counts, 1, 
+				  MPI_INT, grid2d->comm);
+
+		    int fst_row = 0;
+		    for (int p = 0; p < procs2d; ++p)
+			{
+			    if (grid2d->iam == p)
+				A2d->fst_row = fst_row;
+			    fst_row += m_loc_2d_counts[p];
+			}
+
+		    SUPERLU_FREE(m_loc_2d_counts);
+		}
+	    } /* end 2D layer grid-0 */
+
+	A3d->A_nfmt         = A2d;
+	A3d->b_counts_int   = b_counts_int;
+	A3d->b_disp         = b_disp;
+	A3d->row_counts_int = row_counts_int;
+	A3d->row_disp       = row_disp;
+
+	/* free storage */
+	SUPERLU_FREE(nnz_counts);
+	SUPERLU_FREE(nnz_counts_int);
+	SUPERLU_FREE(row_counts);
+	SUPERLU_FREE(nnz_disp);
+	
+	*A3d_addr = (NRformat_loc3d *) A3d; // return pointer to A3d struct
+	
+    } /* end else: Factor from scratch */
 
-    A3d->A_nfmt = A2d;
-    A3d->b_counts_int = b_counts_int;
-    A3d->b_disp = b_disp;
-    A3d->row_counts_int = row_counts_int;
-    A3d->row_disp = row_disp;
+    A3d->m_loc = A->m_loc;
+    A3d->B3d = (doublecomplex *) B; /* save the pointer to the original B
+				    stored on 3D process grid.  */
+    A3d->ldb = ldb;
+    A3d->nrhs = nrhs;
+	
+    /********* Gather B2d **********/
+    if ( nrhs > 0 ) {
+	
+	A2d = (NRformat_loc *) A3d->A_nfmt; // matrix A gathered on 2D grid-0
+	b_counts_int   = A3d->b_counts_int;
+	b_disp         = A3d->b_disp;
+	row_counts_int = A3d->row_counts_int;
+	row_disp       = A3d->row_disp;
+
+	/* Btmp <- compact(B), compacting B */
+	doublecomplex *Btmp;
+	Btmp = SUPERLU_MALLOC(A->m_loc * nrhs * sizeof(doublecomplex));
+	matCopy(A->m_loc, nrhs, Btmp, A->m_loc, B, ldb);
+
+	doublecomplex *B1;
+	if (grid3d->zscp.Iam == 0)
+	    {
+		B1 = doublecomplexMalloc_dist(A2d->m_loc * nrhs);
+		A3d->B2d = doublecomplexMalloc_dist(A2d->m_loc * nrhs);
+	    }
 
-    /* free storage */
-    SUPERLU_FREE(nnz_counts);
-    SUPERLU_FREE(nnz_counts_int);
-    SUPERLU_FREE(row_counts);
-    SUPERLU_FREE(nnz_disp);
-    SUPERLU_FREE(Btmp);
+	// B1 <- gatherv(Btmp)
+	MPI_Gatherv(Btmp, nrhs * A->m_loc, SuperLU_MPI_DOUBLE_COMPLEX, B1,
+		    b_counts_int, b_disp,
+		    SuperLU_MPI_DOUBLE_COMPLEX, 0, grid3d->zscp.comm);
+	SUPERLU_FREE(Btmp);
+
+	// B2d <- colMajor(B1)
+	if (grid3d->zscp.Iam == 0)
+	    {
+		for (int i = 0; i < grid3d->npdep; ++i)
+		    {
+			/* code */
+			matCopy(row_counts_int[i], nrhs, ((doublecomplex*)A3d->B2d) + row_disp[i],
+				A2d->m_loc, B1 + nrhs * row_disp[i], row_counts_int[i]);
+		    }
+		
+		SUPERLU_FREE(B1);
+	    }
 
-    return A3d;
+    } /* end gather B2d */
 
 } /* zGatherNRformat_loc3d */
 
@@ -198,18 +236,18 @@ NRformat_loc3d *zGatherNRformat_loc3d(NRformat_loc *A, // input, on 3D grid
 int zScatter_B3d(NRformat_loc3d *A3d,  // modified
 		 gridinfo3d_t *grid3d)
 {
-    doublecomplex *B = (doublecomplex *) A3d->B3d; // on 3D grid
+    doublecomplex *B = (doublecomplex *) A3d->B3d; // retrieve original pointer on 3D grid
     int ldb = A3d->ldb;
     int nrhs = A3d->nrhs;
-    doublecomplex *B2d = (doublecomplex *) A3d->B2d; // on 2D layer 0 
-    NRformat_loc A2d = *(A3d->A_nfmt);
+    doublecomplex *B2d = (doublecomplex *) A3d->B2d; // only on 2D layer grid_0 
+    NRformat_loc *A2d = A3d->A_nfmt;
 
-    /* The following are the number of local rows relative to all processes */
-    int m_loc = A3d->m_loc;
-    int *b_counts_int = A3d->b_counts_int;
-    int *b_disp = A3d->b_disp;
+    /* The following are the number of local rows relative to Z-dimension */
+    int m_loc           = A3d->m_loc;
+    int *b_counts_int   = A3d->b_counts_int;
+    int *b_disp         = A3d->b_disp;
     int *row_counts_int = A3d->row_counts_int;
-    int *row_disp = A3d->row_disp;
+    int *row_disp       = A3d->row_disp;
     int i, p;
     int iam = grid3d->iam;
     int rankorder = grid3d->rankorder;
@@ -218,7 +256,7 @@ int zScatter_B3d(NRformat_loc3d *A3d,  // modified
     doublecomplex *B1;  // on 2D layer 0
     if (grid3d->zscp.Iam == 0)
     {
-        B1 = SUPERLU_MALLOC(A2d.m_loc * nrhs * sizeof(doublecomplex));
+        B1 = doublecomplexMalloc_dist(A2d->m_loc * nrhs);
     }
 
     // B1 <- BlockByBlock(B2d)
@@ -228,12 +266,12 @@ int zScatter_B3d(NRformat_loc3d *A3d,  // modified
         {
             /* code */
             matCopy(row_counts_int[i], nrhs, B1 + nrhs * row_disp[i], row_counts_int[i],
-                    B2d + row_disp[i], A2d.m_loc);
+                    B2d + row_disp[i], A2d->m_loc);
         }
     }
 
     doublecomplex *Btmp; // on 3D grid
-    Btmp = SUPERLU_MALLOC(A3d->m_loc * nrhs * sizeof(doublecomplex));
+    Btmp = doublecomplexMalloc_dist(A3d->m_loc * nrhs);
 
     // Btmp <- scatterv(B1), block-by-block
     if ( rankorder == 1 ) { /* XY-major in 3D grid */
@@ -304,12 +342,11 @@ int zScatter_B3d(NRformat_loc3d *A3d,  // modified
     matCopy(A3d->m_loc, nrhs, B, ldb, Btmp, A3d->m_loc);
 
     /* free storage */
-    SUPERLU_FREE(A3d->b_counts_int);
-    SUPERLU_FREE(A3d->b_disp);
-    SUPERLU_FREE(A3d->row_counts_int);
-    SUPERLU_FREE(A3d->row_disp);
     SUPERLU_FREE(Btmp);
-    if (grid3d->zscp.Iam == 0) SUPERLU_FREE(B1);
+    if (grid3d->zscp.Iam == 0) {
+	SUPERLU_FREE(B1);
+	SUPERLU_FREE(B2d);
+    }
 
     return 0;
 } /* zScatter_B3d */

From 8d76b620011b0b0aae4e4e4479ba90ff9f9c7814 Mon Sep 17 00:00:00 2001
From: Sherry Li 
Date: Mon, 6 Sep 2021 11:33:54 -0700
Subject: [PATCH 116/147] Update README.md and EXAMPLE/README.

---
 EXAMPLE/README |  8 ++++--
 README.md      | 71 +++++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 65 insertions(+), 14 deletions(-)

diff --git a/EXAMPLE/README b/EXAMPLE/README
index 7cfba7de..f06bd2b2 100644
--- a/EXAMPLE/README
+++ b/EXAMPLE/README
@@ -7,8 +7,12 @@ examples to suit your applications.
 
 The examples illustrate the following functionalities:
   0. pddrive3d.c
-     Use PDGSSVX3D communication-avoiding 3D algorithm with the default options
-     to solve a linear system
+        Use the PxGSSVX3D communication-avoiding 3D algorithm with the
+        default options to solve a linear system
+     pddrive3d1.c:
+        Use PxGSSVX3D to solve systems with the same A but different
+	right-hand sides. (Reuse the factored form of A.)
+     
   1. pddrive.c, pddrive_ABglobal.c
      Use PDGSSVX with the full (default) options to solve a linear system.
   2. pddrive1.c, pddrive1_ABglobal.c
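
The reuse pattern exercised by pddrive3d1.c amounts to factoring once,
then re-solving with a new right-hand side. A rough sketch, with the
argument list abbreviated from the pddrive3d drivers (variable names
hypothetical):

```c
pdgssvx3d(&options, &A, &ScalePermstruct, b, ldb, nrhs,
          &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info);
options.Fact = FACTORED;      /* keep the computed L and U of A */
/* ... overwrite b with the new right-hand side ... */
pdgssvx3d(&options, &A, &ScalePermstruct, b, ldb, nrhs,
          &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info);
```
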
diff --git a/README.md b/README.md
index 0c78f131..0315a4d5 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,32 @@
+
+Table of Contents
+=================
+
+* [SuperLU_DIST (version 7.0)   superlu](#superlu_dist-version-70---)
+* [Table of Contents](#table-of-contents)
+* [Directory structure of the source code](#directory-structure-of-the-source-code)
+* [Installation](#installation)
+   * [Installation option 1: Using CMake build system.](#installation-option-1-using-cmake-build-system)
+      * [Dependent external libraries: BLAS and ParMETIS](#dependent-external-libraries-blas-and-parmetis)
+      * [Optional external libraries: LAPACK, CombBLAS](#optional-external-libraries-lapack-combblas)
+      * [Use GPU](#use-gpu)
+      * [Summary of the CMake definitions.](#summary-of-the-cmake-definitions)
+   * [Installation option 2: Manual installation with makefile.](#installation-option-2-manual-installation-with-makefile)
+      * [2.1 Edit the make.inc include file.](#21-edit-the-makeinc-include-file)
+      * [2.2. The BLAS library.](#22-the-blas-library)
+      * [2.3. External libraries.](#23-external-libraries)
+         * [2.3.1 Metis and ParMetis.](#231-metis-and-parmetis)
+         * [2.3.2 LAPACK.](#232-lapack)
+         * [2.3.3 CombBLAS.](#233-combblas)
+      * [2.4. C preprocessor definition CDEFS. (Replaced by cmake module FortranCInterface.)](#24-c-preprocessor-definition-cdefs-replaced-by-cmake-module-fortrancinterface)
+      * [2.5. Multicore and GPU.](#25-multicore-and-gpu)
+* [Summary of the environment variables.](#summary-of-the-environment-variables)
+* [Windows Usage](#windows-usage)
+* [Reading sparse matrix files](#reading-sparse-matrix-files)
+* [REFERENCES](#references)
+* [RELEASE VERSIONS](#release-versions)
+
+Created by [gh-md-toc](https://github.com/ekalinin/github-markdown-toc)
 # SuperLU_DIST (version 7.0)   superlu
 
 [![Build Status](https://travis-ci.org/xiaoyeli/superlu_dist.svg?branch=master)](https://travis-ci.org/xiaoyeli/superlu_dist) 
@@ -80,16 +109,33 @@ The procedures are described below.
 ## Installation option 1: Using CMake build system.
 You will need to create a build tree from which to invoke CMake.
 
-First, in order to use parallel symbolic factorization function, you
+### Dependent external libraries: BLAS and ParMETIS
+If you have a BLAS library on your machine, you can link against it
+with the following cmake definition:
+```
+-DTPL_BLAS_LIBRARIES=""
+```
+The CBLAS/ subdirectory contains the subset of C BLAS routines
+(single threaded) needed by SuperLU_DIST, but they are not optimized.
+You can compile and use it with the following cmake definition:
+```
+-DTPL_ENABLE_INTERNAL_BLASLIB=ON
+```
+
+The default sparsity ordering is METIS. But in order to use the parallel
+symbolic factorization function, you
 need to install the ParMETIS parallel ordering package and define the
 two environment variables: PARMETIS_ROOT and PARMETIS_BUILD_DIR
 
+(Note: the ParMETIS library also contains the serial METIS library.)
+
 ```
 export PARMETIS_ROOT=
 export PARMETIS_BUILD_DIR=${PARMETIS_ROOT}/build/Linux-x86_64
 ```
 
-Second, in order to use parallel weighted matching HWPM (Heavy Weight
+### Optional external libraries: LAPACK, CombBLAS
+In order to use parallel weighted matching HWPM (Heavy Weight
 Perfect Matching) for numerical pre-pivoting, you need to install 
 CombBLAS and define the environment variables:
 
@@ -98,8 +144,15 @@ export COMBBLAS_ROOT=
 export COMBBLAS_BUILD_DIR=${COMBBLAS_ROOT}/_build
 ```
 
-Once these needed third-party libraries are in place, SuperLU installation
-can be done as follows from the top level directory:
+### Use GPU
+You can enable GPU with CUDA with the following cmake option:
+```
+`-DTPL_ENABLE_CUDALIB=TRUE`
+`-DTPL_CUDA_LIBRARIES="/libcublas.so;/libcudart.so"`
+```
+
+Once these needed third-party libraries are in place, the installation
+can be done as follows at the top-level directory:
 
 For a simple installation with default setting, do:
 (ParMETIS is needed, i.e., TPL_ENABLE_PARMETISLIB=ON)
@@ -109,7 +162,7 @@ cmake .. \
     -DTPL_PARMETIS_INCLUDE_DIRS="${PARMETIS_ROOT}/include;${PARMETIS_ROOT}/metis/include" \
     -DTPL_PARMETIS_LIBRARIES="${PARMETIS_BUILD_DIR}/libparmetis/libparmetis.a;${PARMETIS_BUILD_DIR}/libmetis/libmetis.a" \
 ```
-For a more sophisticated installation including third-part libraries, do:
+For a more sophisticated installation including third-party libraries, do:
 ```
 cmake .. \
     -DTPL_PARMETIS_INCLUDE_DIRS="${PARMETIS_ROOT}/include;${PARMETIS_ROOT}/metis/include" \
@@ -128,16 +181,10 @@ OOT}/Applications/BipartiteMatchings" \
 
 ( see example cmake script: run_cmake_build.sh )
 ```
-You can enable GPU with CUDA with the following cmake option:
-```
-`-DTPL_ENABLE_CUDALIB=TRUE`
-`-DTPL_CUDA_LIBRARIES="/libcublas.so;/libcudart.so"`
-```
 
-You can disable LAPACK, ParMetis or CombBLAS with the following cmake option:
+You can disable LAPACK or CombBLAS with the following cmake option:
 ```
 `-DTPL_ENABLE_LAPACKLIB=FALSE`
-`-DTPL_ENABLE_PARMETISLIB=FALSE`
 `-DTPL_ENABLE_COMBBLASLIB=FALSE`
 ```
 

From 43c5f5b20c5be1a4e62bf2cec822ffd7898cb8b4 Mon Sep 17 00:00:00 2001
From: Sherry Li 
Date: Mon, 6 Sep 2021 11:44:17 -0700
Subject: [PATCH 117/147] Update README.md

---
 README.md | 28 +---------------------------
 1 file changed, 1 insertion(+), 27 deletions(-)

diff --git a/README.md b/README.md
index 0315a4d5..046ab4f8 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,6 @@ Table of Contents
 =================
 
 * [SuperLU_DIST (version 7.0)   superlu](#superlu_dist-version-70---)
-* [Table of Contents](#table-of-contents)
 * [Directory structure of the source code](#directory-structure-of-the-source-code)
 * [Installation](#installation)
    * [Installation option 1: Using CMake build system.](#installation-option-1-using-cmake-build-system)
@@ -27,6 +26,7 @@ Table of Contents
 * [RELEASE VERSIONS](#release-versions)
 
 Created by [gh-md-toc](https://github.com/ekalinin/github-markdown-toc)
+
 # SuperLU_DIST (version 7.0)   superlu
 
 [![Build Status](https://travis-ci.org/xiaoyeli/superlu_dist.svg?branch=master)](https://travis-ci.org/xiaoyeli/superlu_dist) 
@@ -51,32 +51,6 @@ acceleration capabilities.
 
 
 
-Table of Contents
-=================
-
-* [SuperLU_DIST (version 7.0)   superlu](#superlu_dist-version-70---)
-* [Directory structure of the source code](#directory-structure-of-the-source-code)
-* [Installation](#installation)
-   * [Installation option 1: Using CMake build system.](#installation-option-1-using-cmake-build-system)
-      * [Summary of the CMake definitions.](#summary-of-the-cmake-definitions)
-   * [Installation option 2: Manual installation with makefile.](#installation-option-2-manual-installation-with-makefile)
-      * [2.1 Edit the make.inc include file.](#21-edit-the-makeinc-include-file)
-      * [2.2. The BLAS library.](#22-the-blas-library)
-      * [2.3. External libraries.](#23-external-libraries)
-         * [2.3.1 Metis and ParMetis.](#231-metis-and-parmetis)
-         * [2.3.2 LAPACK.](#232-lapack)
-         * [2.3.3 CombBLAS.](#233-combblas)
-      * [2.4. C preprocessor definition CDEFS. (Replaced by cmake module FortranCInterface.)](#24-c-preprocessor-definition-cdefs-replaced-by-cmake-module-fortrancinterface)
-      * [2.5. Multicore and GPU.](#25-multicore-and-gpu)
-* [Summary of the environment variables.](#summary-of-the-environment-variables)
-* [Windows Usage](#windows-usage)
-* [Reading sparse matrix files](#reading-sparse-matrix-files)
-* [REFERENCES](#references)
-* [RELEASE VERSIONS](#release-versions)
-
-Created by [gh-md-toc](https://github.com/ekalinin/github-markdown-toc)
-
-
 # Directory structure of the source code
 
 ```

From 51d5481b3f49d89faf361e06fe43cf152d43b222 Mon Sep 17 00:00:00 2001
From: Sherry Li 
Date: Mon, 6 Sep 2021 11:58:07 -0700
Subject: [PATCH 118/147] Update README.md

---
 README.md | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 046ab4f8..cf158d6b 100644
--- a/README.md
+++ b/README.md
@@ -7,7 +7,7 @@ Table of Contents
 * [Installation](#installation)
    * [Installation option 1: Using CMake build system.](#installation-option-1-using-cmake-build-system)
       * [Dependent external libraries: BLAS and ParMETIS](#dependent-external-libraries-blas-and-parmetis)
-      * [Optional external libraries: LAPACK, CombBLAS](#optional-external-libraries-lapack-combblas)
+      * [Optional external libraries: CombBLAS, LAPACK](#optional-external-libraries-combblas-lapack)
       * [Use GPU](#use-gpu)
       * [Summary of the CMake definitions.](#summary-of-the-cmake-definitions)
    * [Installation option 2: Manual installation with makefile.](#installation-option-2-manual-installation-with-makefile)
@@ -108,7 +108,8 @@ export PARMETIS_ROOT=
 export PARMETIS_BUILD_DIR=${PARMETIS_ROOT}/build/Linux-x86_64
 ```
 
-### Optional external libraries: LAPACK, CombBLAS
+### Optional external libraries: CombBLAS, LAPACK
+
 In order to use parallel weighted matching HWPM (Heavy Weight
 Perfect Matching) for numerical pre-pivoting, you need to install 
 CombBLAS and define the environment variables:
@@ -117,12 +118,23 @@ CombBLAS and define the environment variable:
 export COMBBLAS_ROOT=
 export COMBBLAS_BUILD_DIR=${COMBBLAS_ROOT}/_build
 ```
+Then, install with the cmake option:
+```
+-DTPL_ENABLE_COMBBLASLIB=ON
+```
+
+By default, LAPACK is not needed. Only the triangular solve routine may
+use LAPACK, to explicitly invert the dense diagonal blocks to improve
+speed. You can enable it with the following cmake option:
+```
+-DTPL_ENABLE_LAPACKLIB=ON
+```
 
 ### Use GPU
 You can enable GPU with CUDA with the following cmake option:
 ```
-`-DTPL_ENABLE_CUDALIB=TRUE`
-`-DTPL_CUDA_LIBRARIES="/libcublas.so;/libcudart.so"`
+-DTPL_ENABLE_CUDALIB=TRUE
+-DTPL_CUDA_LIBRARIES="/libcublas.so;/libcudart.so"
 ```
 
 Once these needed third-party libraries are in place, the installation
@@ -156,10 +168,10 @@ OOT}/Applications/BipartiteMatchings" \
 ( see example cmake script: run_cmake_build.sh )
 ```
 
-You can disable LAPACK or CombBLAS with the following cmake option:
+You can disable CombBLAS or LAPACK with the following cmake options:
 ```
-`-DTPL_ENABLE_LAPACKLIB=FALSE`
-`-DTPL_ENABLE_COMBBLASLIB=FALSE`
+-DTPL_ENABLE_LAPACKLIB=FALSE
+-DTPL_ENABLE_COMBBLASLIB=FALSE
 ```
 
 To actually build (compile), type:

From ed1de7d0eafb81ce55adc317454d7b8f7f68d3e8 Mon Sep 17 00:00:00 2001
From: Sherry Li 
Date: Mon, 6 Sep 2021 12:01:19 -0700
Subject: [PATCH 119/147] update README

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index cf158d6b..7865d01e 100644
--- a/README.md
+++ b/README.md
@@ -89,8 +89,8 @@ with the following cmake definition:
 ```
 -DTPL_BLAS_LIBRARIES=""
 ```
-The CBLAS/ subdirectory contains the subset of C BLAS routines
-(single threaded) needed by SuperLU_DIST, but they are not optimized.
+Otherwise, the CBLAS/ subdirectory contains the subset of C BLAS routines
+(single threaded) needed by SuperLU_DIST, but they are not optimized.
 You can compile and use it with the following cmake definition:
 ```
 -DTPL_ENABLE_INTERNAL_BLASLIB=ON

From 8ae690d509e6bf7e3d53b25dcc5c86b95617670e Mon Sep 17 00:00:00 2001
From: Yang Liu 
Date: Wed, 8 Sep 2021 16:53:22 -0700
Subject: [PATCH 120/147] automatically assigning MPI ranks to GPU devices
 in a round-robin fashion

---
 EXAMPLE/pddrive.c           | 10 +++++++++-
 EXAMPLE/pddrive1.c          |  7 ++++++-
 EXAMPLE/pddrive1_ABglobal.c |  6 ++++++
 EXAMPLE/pddrive2.c          |  7 ++++++-
 EXAMPLE/pddrive2_ABglobal.c |  7 ++++++-
 EXAMPLE/pddrive3.c          |  7 ++++++-
 EXAMPLE/pddrive3_ABglobal.c |  7 ++++++-
 EXAMPLE/pddrive3d.c         |  7 ++++++-
 EXAMPLE/pddrive4.c          |  7 ++++++-
 EXAMPLE/pddrive4_ABglobal.c |  6 ++++++
 EXAMPLE/pddrive_ABglobal.c  |  7 ++++++-
 EXAMPLE/pddrive_spawn.c     |  7 ++++++-
 EXAMPLE/pzdrive.c           |  7 ++++++-
 EXAMPLE/pzdrive1.c          |  7 ++++++-
 EXAMPLE/pzdrive1_ABglobal.c |  7 ++++++-
 EXAMPLE/pzdrive2.c          |  7 ++++++-
 EXAMPLE/pzdrive2_ABglobal.c |  7 ++++++-
 EXAMPLE/pzdrive3.c          |  7 ++++++-
 EXAMPLE/pzdrive3_ABglobal.c |  7 ++++++-
 EXAMPLE/pzdrive3d.c         |  6 ++++++
 EXAMPLE/pzdrive4.c          |  7 ++++++-
 EXAMPLE/pzdrive4_ABglobal.c |  6 ++++++
 EXAMPLE/pzdrive_ABglobal.c  |  7 ++++++-
 EXAMPLE/pzdrive_spawn.c     |  7 ++++++-
 SRC/superlu_ddefs.h         |  4 ++--
 SRC/superlu_zdefs.h         |  4 ++--
 26 files changed, 151 insertions(+), 24 deletions(-)
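
The change inserts the same guarded block near MPI initialization in every
example driver, binding each rank to a device round-robin:

```c
#ifdef GPU_ACC
    int rank, devs;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    cudaGetDeviceCount(&devs);
    cudaSetDevice(rank % devs);
#endif
```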

diff --git a/EXAMPLE/pddrive.c b/EXAMPLE/pddrive.c
index 001acf77..8bef2df8 100644
--- a/EXAMPLE/pddrive.c
+++ b/EXAMPLE/pddrive.c
@@ -75,7 +75,15 @@ int main(int argc, char *argv[])
        ------------------------------------------------------------*/
     //MPI_Init( &argc, &argv );
     MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); 
-	
+
+#ifdef GPU_ACC
+    int rank, devs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    cudaGetDeviceCount(&devs);
+    cudaSetDevice(rank % devs);
+#endif
+
+
 
 #if ( VAMPIR>=1 )
     VT_traceoff(); 
diff --git a/EXAMPLE/pddrive1.c b/EXAMPLE/pddrive1.c
index c2ccd067..72c88a6e 100644
--- a/EXAMPLE/pddrive1.c
+++ b/EXAMPLE/pddrive1.c
@@ -73,7 +73,12 @@ int main(int argc, char *argv[])
        INITIALIZE MPI ENVIRONMENT. 
        ------------------------------------------------------------*/
     MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); 
-
+#ifdef GPU_ACC
+    int rank, devs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    cudaGetDeviceCount(&devs);
+    cudaSetDevice(rank % devs);
+#endif
     /* Parse command line argv[]. */
     for (cpp = argv+1; *cpp; ++cpp) {
 	if ( **cpp == '-' ) {
diff --git a/EXAMPLE/pddrive1_ABglobal.c b/EXAMPLE/pddrive1_ABglobal.c
index 7f06b70e..7686b79c 100644
--- a/EXAMPLE/pddrive1_ABglobal.c
+++ b/EXAMPLE/pddrive1_ABglobal.c
@@ -72,6 +72,12 @@ int main(int argc, char *argv[])
        INITIALIZE MPI ENVIRONMENT. 
        ------------------------------------------------------------*/
     MPI_Init( &argc, &argv );
+#ifdef GPU_ACC
+    int rank, devs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    cudaGetDeviceCount(&devs);
+    cudaSetDevice(rank % devs);
+#endif
 
     /* Parse command line argv[]. */
     for (cpp = argv+1; *cpp; ++cpp) {
diff --git a/EXAMPLE/pddrive2.c b/EXAMPLE/pddrive2.c
index f60326bb..2c293b8b 100644
--- a/EXAMPLE/pddrive2.c
+++ b/EXAMPLE/pddrive2.c
@@ -83,7 +83,12 @@ int main(int argc, char *argv[])
        INITIALIZE MPI ENVIRONMENT. 
        ------------------------------------------------------------*/
     MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); 
-
+#ifdef GPU_ACC
+    int rank, devs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    cudaGetDeviceCount(&devs);
+    cudaSetDevice(rank % devs);
+#endif
     /* Parse command line argv[]. */
     for (cpp = argv+1; *cpp; ++cpp) {
 	if ( **cpp == '-' ) {
diff --git a/EXAMPLE/pddrive2_ABglobal.c b/EXAMPLE/pddrive2_ABglobal.c
index 57ebadf3..e908a6ca 100644
--- a/EXAMPLE/pddrive2_ABglobal.c
+++ b/EXAMPLE/pddrive2_ABglobal.c
@@ -72,7 +72,12 @@ int main(int argc, char *argv[])
        INITIALIZE MPI ENVIRONMENT. 
        ------------------------------------------------------------*/
     MPI_Init( &argc, &argv );
-
+#ifdef GPU_ACC
+    int rank, devs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    cudaGetDeviceCount(&devs);
+    cudaSetDevice(rank % devs);
+#endif
     /* Parse command line argv[]. */
     for (cpp = argv+1; *cpp; ++cpp) {
 	if ( **cpp == '-' ) {
diff --git a/EXAMPLE/pddrive3.c b/EXAMPLE/pddrive3.c
index f2886945..920be224 100644
--- a/EXAMPLE/pddrive3.c
+++ b/EXAMPLE/pddrive3.c
@@ -80,7 +80,12 @@ int main(int argc, char *argv[])
        INITIALIZE MPI ENVIRONMENT. 
        ------------------------------------------------------------*/
     MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); 
-
+#ifdef GPU_ACC
+    int rank, devs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    cudaGetDeviceCount(&devs);
+    cudaSetDevice(rank % devs);
+#endif
     /* Parse command line argv[]. */
     for (cpp = argv+1; *cpp; ++cpp) {
 	if ( **cpp == '-' ) {
diff --git a/EXAMPLE/pddrive3_ABglobal.c b/EXAMPLE/pddrive3_ABglobal.c
index 2e2a7433..e20c664d 100644
--- a/EXAMPLE/pddrive3_ABglobal.c
+++ b/EXAMPLE/pddrive3_ABglobal.c
@@ -78,7 +78,12 @@ int main(int argc, char *argv[])
        INITIALIZE MPI ENVIRONMENT. 
        ------------------------------------------------------------*/
     MPI_Init( &argc, &argv );
-
+#ifdef GPU_ACC
+    int rank, devs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    cudaGetDeviceCount(&devs);
+    cudaSetDevice(rank % devs);
+#endif
     /* Parse command line argv[]. */
     for (cpp = argv+1; *cpp; ++cpp) {
 	if ( **cpp == '-' ) {
diff --git a/EXAMPLE/pddrive3d.c b/EXAMPLE/pddrive3d.c
index 3ac75d0f..acbce467 100644
--- a/EXAMPLE/pddrive3d.c
+++ b/EXAMPLE/pddrive3d.c
@@ -137,7 +137,12 @@ main (int argc, char *argv[])
         MPI_Comm_rank(MPI_COMM_WORLD, &rank);
         if (!rank) printf("The MPI library doesn't provide MPI_THREAD_MULTIPLE \n");
     }
-
+#ifdef GPU_ACC
+    int rank, devs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    cudaGetDeviceCount(&devs);
+    cudaSetDevice(rank % devs);
+#endif
     /* Parse command line argv[]. */
     for (cpp = argv + 1; *cpp; ++cpp)
     {
diff --git a/EXAMPLE/pddrive4.c b/EXAMPLE/pddrive4.c
index ca984a81..dbe9ee08 100644
--- a/EXAMPLE/pddrive4.c
+++ b/EXAMPLE/pddrive4.c
@@ -74,7 +74,12 @@ int main(int argc, char *argv[])
        INITIALIZE MPI ENVIRONMENT. 
        ------------------------------------------------------------*/
     MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); 
-
+#ifdef GPU_ACC
+    int rank, devs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    cudaGetDeviceCount(&devs);
+    cudaSetDevice(rank % devs);
+#endif
     MPI_Comm_size( MPI_COMM_WORLD, &nprocs );
     if ( nprocs < 10 ) {
 	fprintf(stderr, "Requires at least 10 processes\n");
diff --git a/EXAMPLE/pddrive4_ABglobal.c b/EXAMPLE/pddrive4_ABglobal.c
index 9ff46dd8..2cf76078 100644
--- a/EXAMPLE/pddrive4_ABglobal.c
+++ b/EXAMPLE/pddrive4_ABglobal.c
@@ -71,6 +71,12 @@ int main(int argc, char *argv[])
        INITIALIZE MPI ENVIRONMENT. 
        ------------------------------------------------------------*/
     MPI_Init( &argc, &argv );
+#ifdef GPU_ACC
+    int rank, devs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    cudaGetDeviceCount(&devs);
+    cudaSetDevice(rank % devs);
+#endif	
     MPI_Comm_size( MPI_COMM_WORLD, &nprocs );
     if ( nprocs < 10 ) {
 	fprintf(stderr, "Requires at least 10 processes\n");
diff --git a/EXAMPLE/pddrive_ABglobal.c b/EXAMPLE/pddrive_ABglobal.c
index a47388b5..3541ab92 100644
--- a/EXAMPLE/pddrive_ABglobal.c
+++ b/EXAMPLE/pddrive_ABglobal.c
@@ -73,7 +73,12 @@ int main(int argc, char *argv[])
        INITIALIZE MPI ENVIRONMENT. 
        ------------------------------------------------------------*/
     MPI_Init( &argc, &argv );
-
+#ifdef GPU_ACC
+    int rank, devs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    cudaGetDeviceCount(&devs);
+    cudaSetDevice(rank % devs);
+#endif
     /* Parse command line argv[]. */
     for (cpp = argv+1; *cpp; ++cpp) {
 	if ( **cpp == '-' ) {
diff --git a/EXAMPLE/pddrive_spawn.c b/EXAMPLE/pddrive_spawn.c
index 131ea29c..b119b46e 100755
--- a/EXAMPLE/pddrive_spawn.c
+++ b/EXAMPLE/pddrive_spawn.c
@@ -82,7 +82,12 @@ int main(int argc, char *argv[])
     //MPI_Init( &argc, &argv );
     MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); 
 	MPI_Comm_get_parent(&parent);   	
-	
+#ifdef GPU_ACC
+    int rank, devs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    cudaGetDeviceCount(&devs);
+    cudaSetDevice(rank % devs);
+#endif	
 	
 
 #if ( VAMPIR>=1 )
diff --git a/EXAMPLE/pzdrive.c b/EXAMPLE/pzdrive.c
index 3878558d..342b91a5 100644
--- a/EXAMPLE/pzdrive.c
+++ b/EXAMPLE/pzdrive.c
@@ -74,7 +74,12 @@ int main(int argc, char *argv[])
        ------------------------------------------------------------*/
     //MPI_Init( &argc, &argv );
     MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); 
-	
+#ifdef GPU_ACC
+    int rank, devs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    cudaGetDeviceCount(&devs);
+    cudaSetDevice(rank % devs);
+#endif
 
 #if ( VAMPIR>=1 )
     VT_traceoff(); 
diff --git a/EXAMPLE/pzdrive1.c b/EXAMPLE/pzdrive1.c
index b65733b2..69aea1d0 100644
--- a/EXAMPLE/pzdrive1.c
+++ b/EXAMPLE/pzdrive1.c
@@ -72,7 +72,12 @@ int main(int argc, char *argv[])
        INITIALIZE MPI ENVIRONMENT. 
        ------------------------------------------------------------*/
     MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); 
-
+#ifdef GPU_ACC
+    int rank, devs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    cudaGetDeviceCount(&devs);
+    cudaSetDevice(rank % devs);
+#endif
     /* Parse command line argv[]. */
     for (cpp = argv+1; *cpp; ++cpp) {
 	if ( **cpp == '-' ) {
diff --git a/EXAMPLE/pzdrive1_ABglobal.c b/EXAMPLE/pzdrive1_ABglobal.c
index 4437e4a8..bf47169f 100644
--- a/EXAMPLE/pzdrive1_ABglobal.c
+++ b/EXAMPLE/pzdrive1_ABglobal.c
@@ -71,7 +71,12 @@ int main(int argc, char *argv[])
        INITIALIZE MPI ENVIRONMENT. 
        ------------------------------------------------------------*/
     MPI_Init( &argc, &argv );
-
+#ifdef GPU_ACC
+    int rank, devs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    cudaGetDeviceCount(&devs);
+    cudaSetDevice(rank % devs);
+#endif
     /* Parse command line argv[]. */
     for (cpp = argv+1; *cpp; ++cpp) {
 	if ( **cpp == '-' ) {
diff --git a/EXAMPLE/pzdrive2.c b/EXAMPLE/pzdrive2.c
index ce28390e..0104a135 100644
--- a/EXAMPLE/pzdrive2.c
+++ b/EXAMPLE/pzdrive2.c
@@ -82,7 +82,12 @@ int main(int argc, char *argv[])
        INITIALIZE MPI ENVIRONMENT. 
        ------------------------------------------------------------*/
     MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); 
-
+#ifdef GPU_ACC
+    int rank, devs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    cudaGetDeviceCount(&devs);
+    cudaSetDevice(rank % devs);
+#endif
     /* Parse command line argv[]. */
     for (cpp = argv+1; *cpp; ++cpp) {
 	if ( **cpp == '-' ) {
diff --git a/EXAMPLE/pzdrive2_ABglobal.c b/EXAMPLE/pzdrive2_ABglobal.c
index 9959465b..96866c35 100644
--- a/EXAMPLE/pzdrive2_ABglobal.c
+++ b/EXAMPLE/pzdrive2_ABglobal.c
@@ -71,7 +71,12 @@ int main(int argc, char *argv[])
        INITIALIZE MPI ENVIRONMENT. 
        ------------------------------------------------------------*/
     MPI_Init( &argc, &argv );
-
+#ifdef GPU_ACC
+    int rank, devs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    cudaGetDeviceCount(&devs);
+    cudaSetDevice(rank % devs);
+#endif
     /* Parse command line argv[]. */
     for (cpp = argv+1; *cpp; ++cpp) {
 	if ( **cpp == '-' ) {
diff --git a/EXAMPLE/pzdrive3.c b/EXAMPLE/pzdrive3.c
index 47b33b43..983c0895 100644
--- a/EXAMPLE/pzdrive3.c
+++ b/EXAMPLE/pzdrive3.c
@@ -79,7 +79,12 @@ int main(int argc, char *argv[])
        INITIALIZE MPI ENVIRONMENT. 
        ------------------------------------------------------------*/
     MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); 
-
+#ifdef GPU_ACC
+    int rank, devs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    cudaGetDeviceCount(&devs);
+    cudaSetDevice(rank % devs);
+#endif
     /* Parse command line argv[]. */
     for (cpp = argv+1; *cpp; ++cpp) {
 	if ( **cpp == '-' ) {
diff --git a/EXAMPLE/pzdrive3_ABglobal.c b/EXAMPLE/pzdrive3_ABglobal.c
index c83cf1a3..144e3f6e 100644
--- a/EXAMPLE/pzdrive3_ABglobal.c
+++ b/EXAMPLE/pzdrive3_ABglobal.c
@@ -77,7 +77,12 @@ int main(int argc, char *argv[])
        INITIALIZE MPI ENVIRONMENT. 
        ------------------------------------------------------------*/
     MPI_Init( &argc, &argv );
-
+#ifdef GPU_ACC
+    int rank, devs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    cudaGetDeviceCount(&devs);
+    cudaSetDevice(rank % devs);
+#endif
     /* Parse command line argv[]. */
     for (cpp = argv+1; *cpp; ++cpp) {
 	if ( **cpp == '-' ) {
diff --git a/EXAMPLE/pzdrive3d.c b/EXAMPLE/pzdrive3d.c
index 2eb12796..90390a43 100644
--- a/EXAMPLE/pzdrive3d.c
+++ b/EXAMPLE/pzdrive3d.c
@@ -131,6 +131,12 @@ main (int argc, char *argv[])
     int required = MPI_THREAD_MULTIPLE;
     int provided;
     MPI_Init_thread(&argc, &argv, required, &provided);
+#ifdef GPU_ACC
+    int rank, devs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    cudaGetDeviceCount(&devs);
+    cudaSetDevice(rank % devs);
+#endif    
     if (provided < required)
     {
         int rank;
diff --git a/EXAMPLE/pzdrive4.c b/EXAMPLE/pzdrive4.c
index 7453d2ac..33451140 100644
--- a/EXAMPLE/pzdrive4.c
+++ b/EXAMPLE/pzdrive4.c
@@ -73,7 +73,12 @@ int main(int argc, char *argv[])
        INITIALIZE MPI ENVIRONMENT. 
        ------------------------------------------------------------*/
     MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); 
-
+#ifdef GPU_ACC
+    int rank, devs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    cudaGetDeviceCount(&devs);
+    cudaSetDevice(rank % devs);
+#endif
     MPI_Comm_size( MPI_COMM_WORLD, &nprocs );
     if ( nprocs < 10 ) {
 	fprintf(stderr, "Requires at least 10 processes\n");
diff --git a/EXAMPLE/pzdrive4_ABglobal.c b/EXAMPLE/pzdrive4_ABglobal.c
index 5515e885..9b3ff81b 100644
--- a/EXAMPLE/pzdrive4_ABglobal.c
+++ b/EXAMPLE/pzdrive4_ABglobal.c
@@ -70,6 +70,12 @@ int main(int argc, char *argv[])
        INITIALIZE MPI ENVIRONMENT. 
        ------------------------------------------------------------*/
     MPI_Init( &argc, &argv );
+#ifdef GPU_ACC
+    int rank, devs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    cudaGetDeviceCount(&devs);
+    cudaSetDevice(rank % devs);
+#endif	
     MPI_Comm_size( MPI_COMM_WORLD, &nprocs );
     if ( nprocs < 10 ) {
 	fprintf(stderr, "Requires at least 10 processes\n");
diff --git a/EXAMPLE/pzdrive_ABglobal.c b/EXAMPLE/pzdrive_ABglobal.c
index c3d798c1..b6f48554 100644
--- a/EXAMPLE/pzdrive_ABglobal.c
+++ b/EXAMPLE/pzdrive_ABglobal.c
@@ -72,7 +72,12 @@ int main(int argc, char *argv[])
        INITIALIZE MPI ENVIRONMENT. 
        ------------------------------------------------------------*/
     MPI_Init( &argc, &argv );
-
+#ifdef GPU_ACC
+    int rank, devs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    cudaGetDeviceCount(&devs);
+    cudaSetDevice(rank % devs);
+#endif
     /* Parse command line argv[]. */
     for (cpp = argv+1; *cpp; ++cpp) {
 	if ( **cpp == '-' ) {
diff --git a/EXAMPLE/pzdrive_spawn.c b/EXAMPLE/pzdrive_spawn.c
index 30a28dd1..8dab3751 100755
--- a/EXAMPLE/pzdrive_spawn.c
+++ b/EXAMPLE/pzdrive_spawn.c
@@ -82,7 +82,12 @@ int main(int argc, char *argv[])
     //MPI_Init( &argc, &argv );
     MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); 
 	MPI_Comm_get_parent(&parent);   	
-	
+#ifdef GPU_ACC
+    int rank, devs;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    cudaGetDeviceCount(&devs);
+    cudaSetDevice(rank % devs);
+#endif	
 	
 
 #if ( VAMPIR>=1 )
diff --git a/SRC/superlu_ddefs.h b/SRC/superlu_ddefs.h
index 97fe6387..81be65a2 100644
--- a/SRC/superlu_ddefs.h
+++ b/SRC/superlu_ddefs.h
@@ -630,10 +630,10 @@ extern int superlu_dgemv(const char *trans, const int m,
 extern int superlu_dtrsv(char *uplo, char *trans, char *diag,
                   int n, double *a, int lda, double *x, int incx);
 
-
+#ifdef SLU_HAVE_LAPACK
 // LAPACK routine
 extern void dtrtri_(char*, char*, int*, double*, int*, int*);
-
+#endif
 
 /*==== For 3D code ====*/
 extern int dcreate_matrix3d(SuperMatrix *A, int nrhs, double **rhs,
diff --git a/SRC/superlu_zdefs.h b/SRC/superlu_zdefs.h
index 0b63cf6c..95c00407 100644
--- a/SRC/superlu_zdefs.h
+++ b/SRC/superlu_zdefs.h
@@ -632,10 +632,10 @@ extern int superlu_zgemv(const char *trans, const int m,
 extern int superlu_ztrsv(char *uplo, char *trans, char *diag,
                   int n, doublecomplex *a, int lda, doublecomplex *x, int incx);
 
-
+#ifdef SLU_HAVE_LAPACK
 // LAPACK routine
 extern void ztrtri_(char*, char*, int*, doublecomplex*, int*, int*);
-
+#endif
 
 /*==== For 3D code ====*/
 extern int zcreate_matrix3d(SuperMatrix *A, int nrhs, doublecomplex **rhs,
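With the new SLU_HAVE_LAPACK guard, dtrtri_/ztrtri_ are declared only when the build links LAPACK, so code wanting the LAPACK triangular inverse has to guard its call sites the same way. A hedged sketch of such a call site (function and variable names here are illustrative, not SuperLU_DIST routines):

    void invert_diag_block(int n, double *a, int lda)
    {
    #ifdef SLU_HAVE_LAPACK
        int info = 0;
        /* LAPACK: in-place inverse of an upper triangular n x n block */
        dtrtri_("U", "N", &n, a, &lda, &info);
    #else
        /* no LAPACK in this build: fall back to a local triangular solve */
    #endif
    }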

From 27f818aa08a9d138d120f5a0493e5d020b5d44cb Mon Sep 17 00:00:00 2001
From: Sherry Li 
Date: Thu, 9 Sep 2021 08:24:33 -0700
Subject: [PATCH 121/147] Allow factorization-only (nrhs==0) for the call to
 p_gssvx3d. Add MPI-to-CUDA device binding in drivers p_gssvx and p_gssvx3d.

---
 EXAMPLE/Makefile            |  3 +--
 EXAMPLE/pddrive.c           |  3 ++-
 FORTRAN/superlu_c2f_dwrap.c |  2 +-
 FORTRAN/superlu_c2f_zwrap.c |  2 +-
 SRC/dnrformat_loc3d.c       | 11 ++++++-----
 SRC/pdgssvx.c               | 21 +++++++++++++++------
 SRC/pdgssvx3d.c             |  9 +++++++++
 SRC/psgssvx.c               |  9 +++++++++
 SRC/psgssvx3d.c             |  9 +++++++++
 SRC/pzgssvx.c               | 21 +++++++++++++++------
 SRC/pzgssvx3d.c             |  9 +++++++++
 SRC/snrformat_loc3d.c       | 11 ++++++-----
 SRC/superlu_ddefs.h         |  5 ++---
 SRC/superlu_zdefs.h         |  5 ++---
 SRC/znrformat_loc3d.c       | 11 ++++++-----
 15 files changed, 93 insertions(+), 38 deletions(-)

diff --git a/EXAMPLE/Makefile b/EXAMPLE/Makefile
index 674811d9..8c36ed2a 100644
--- a/EXAMPLE/Makefile
+++ b/EXAMPLE/Makefile
@@ -30,8 +30,7 @@
 #######################################################################
 include ../make.inc
 
-DEXM	= pddrive.o dcreate_matrix.o 
-	#pdgssvx.o pdgstrf2.o
+DEXM	= pddrive.o dcreate_matrix.o #pdgssvx.o #pdgstrf2.o
 DEXM1	= pddrive1.o dcreate_matrix.o
 DEXM2	= pddrive2.o dcreate_matrix.o dcreate_matrix_perturbed.o
 DEXM3	= pddrive3.o dcreate_matrix.o
diff --git a/EXAMPLE/pddrive.c b/EXAMPLE/pddrive.c
index 7ab06437..861a7b9a 100644
--- a/EXAMPLE/pddrive.c
+++ b/EXAMPLE/pddrive.c
@@ -69,7 +69,8 @@ int main(int argc, char *argv[])
     nprow = 1;  /* Default process rows.      */
     npcol = 1;  /* Default process columns.   */
     nrhs = 1;   /* Number of right-hand side. */
-
+    printf("MAIN ...\n"); fflush(stdout);
+			      
     /* ------------------------------------------------------------
        INITIALIZE MPI ENVIRONMENT. 
        ------------------------------------------------------------*/
diff --git a/FORTRAN/superlu_c2f_dwrap.c b/FORTRAN/superlu_c2f_dwrap.c
index 04755024..fb5122e8 100644
--- a/FORTRAN/superlu_c2f_dwrap.c
+++ b/FORTRAN/superlu_c2f_dwrap.c
@@ -95,7 +95,7 @@ void f_dDestroy_LU_SOLVE_struct_3d(fptr *options, int *n, fptr *grid,
     dLUstructFree(LUstruct_ptr);
 }
 
-void f_dDestroy_A3d_gathered_on_2d(fptr *SOLVEstruct, fptr *grid)
+void f_dDestroy_A3d_gathered_on_2d(fptr *SOLVEstruct, fptr *grid3d)
 {
     dDestroy_A3d_gathered_on_2d((dSOLVEstruct_t *) *SOLVEstruct,
                                       (gridinfo3d_t *) *grid3d);
diff --git a/FORTRAN/superlu_c2f_zwrap.c b/FORTRAN/superlu_c2f_zwrap.c
index 48f050bf..70f66607 100644
--- a/FORTRAN/superlu_c2f_zwrap.c
+++ b/FORTRAN/superlu_c2f_zwrap.c
@@ -94,7 +94,7 @@ void f_zDestroy_LU_SOLVE_struct_3d(fptr *options, int *n, fptr *grid,
     zLUstructFree(LUstruct_ptr);
 }
 
-void f_zDestroy_A3d_gathered_on_2d(fptr *SOLVEstruct, fptr *grid)
+void f_zDestroy_A3d_gathered_on_2d(fptr *SOLVEstruct, fptr *grid3d)
 {
     zDestroy_A3d_gathered_on_2d((zSOLVEstruct_t *) *SOLVEstruct,
                                       (gridinfo3d_t *) *grid3d);
diff --git a/SRC/dnrformat_loc3d.c b/SRC/dnrformat_loc3d.c
index 3f1cb401..41544e1f 100644
--- a/SRC/dnrformat_loc3d.c
+++ b/SRC/dnrformat_loc3d.c
@@ -98,14 +98,17 @@ void dGatherNRformat_loc3d
 	nnz_disp[0] = 0;
 	row_disp[0] = 0;
 	b_disp[0] = 0;
+	int nrhs1 = nrhs; // input 
+	if ( nrhs <= 0 ) nrhs1 = 1; /* Make sure to compute offsets and
+	                               counts for future use.   */
 	for (int i = 0; i < grid3d->npdep; i++)
 	    {
 		nnz_disp[i + 1] = nnz_disp[i] + nnz_counts[i];
 		row_disp[i + 1] = row_disp[i] + row_counts[i];
-		b_disp[i + 1] = nrhs * row_disp[i + 1];
+		b_disp[i + 1] = nrhs1 * row_disp[i + 1];
 		nnz_counts_int[i] = nnz_counts[i];
 		row_counts_int[i] = row_counts[i];
-		b_counts_int[i] = nrhs * row_counts[i];
+		b_counts_int[i] = nrhs1 * row_counts[i];
 	    }
 
 	if (grid3d->zscp.Iam == 0)
@@ -182,7 +185,7 @@ void dGatherNRformat_loc3d
     A3d->B3d = (double *) B; /* save the pointer to the original B
 				    stored on 3D process grid.  */
     A3d->ldb = ldb;
-    A3d->nrhs = nrhs;
+    A3d->nrhs = nrhs; // record the input 
 	
     /********* Gather B2d **********/
     if ( nrhs > 0 ) {
@@ -193,8 +196,6 @@ void dGatherNRformat_loc3d
 	row_counts_int = A3d->row_counts_int;
 	row_disp       = A3d->row_disp;
 	
-	printf("dGather_loc3d(2): row_disp %p, A3d %p\n", row_disp, A3d); fflush(stdout);
-
 	/* Btmp <- compact(B), compacting B */
 	double *Btmp;
 	Btmp = SUPERLU_MALLOC(A->m_loc * nrhs * sizeof(double));
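The nrhs1 clamp above is what makes factorization-only calls work: with nrhs == 0 there is no B to gather, but the count and displacement arrays are cached in A3d for a later solve, so they are computed as if one right-hand side were present. A small worked sketch of the arithmetic, assuming npdep = 2 and per-layer row counts {3, 4}:

    int nrhs = 0;                               /* factorize only      */
    int nrhs1 = (nrhs <= 0) ? 1 : nrhs;         /* clamp, as above     */
    int row_counts[2] = {3, 4};
    int row_disp[3] = {0}, b_disp[3] = {0}, b_counts[2];
    for (int i = 0; i < 2; i++) {
        row_disp[i+1] = row_disp[i] + row_counts[i];  /* {0,3,7} */
        b_disp[i+1]   = nrhs1 * row_disp[i+1];        /* {0,3,7} */
        b_counts[i]   = nrhs1 * row_counts[i];        /* {3,4}   */
    }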
diff --git a/SRC/pdgssvx.c b/SRC/pdgssvx.c
index 822d8ebf..aa28622c 100644
--- a/SRC/pdgssvx.c
+++ b/SRC/pdgssvx.c
@@ -406,7 +406,7 @@ at the top-level directory.
  *           of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree
  *           is already in postorder.
  *
- *         o R (double*) dimension (A->nrow)
+ *         o R (double *) dimension (A->nrow)
  *           The row scale factors for A.
  *           If DiagScale = ROW or BOTH, A is multiplied on the left by
  *                          diag(R).
@@ -414,7 +414,7 @@ at the top-level directory.
  *           If options->Fact = FACTORED or SamePattern_SameRowPerm, R is
  *           an input argument; otherwise, R is an output argument.
  *
- *         o C (double*) dimension (A->ncol)
+ *         o C (double *) dimension (A->ncol)
  *           The column scale factors for A.
  *           If DiagScale = COL or BOTH, A is multiplied on the right by
  *                          diag(C).
@@ -588,13 +588,13 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A,
     /* Test the input parameters. */
     *info = 0;
     Fact = options->Fact;
-    if ( Fact < 0 || Fact > FACTORED )
+    if ( Fact < DOFACT || Fact > FACTORED )
 	*info = -1;
-    else if ( options->RowPerm < 0 || options->RowPerm > MY_PERMR )
+    else if ( options->RowPerm < NOROWPERM || options->RowPerm > MY_PERMR )
 	*info = -1;
-    else if ( options->ColPerm < 0 || options->ColPerm > MY_PERMC )
+    else if ( options->ColPerm < NATURAL || options->ColPerm > MY_PERMC )
 	*info = -1;
-    else if ( options->IterRefine < 0 || options->IterRefine > SLU_EXTRA )
+    else if ( options->IterRefine < NOREFINE || options->IterRefine > SLU_EXTRA )
 	*info = -1;
     else if ( options->IterRefine == SLU_EXTRA ) {
 	*info = -1;
@@ -639,6 +639,15 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A,
     C = ScalePermstruct->C;
     /********/
 
+#ifdef GPU_ACC
+    /* Binding each MPI to a CUDA device */
+    int devs;
+    // MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    cudaGetDeviceCount(&devs); // Returns the number of compute-capable devices
+    cudaSetDevice(iam % devs); // Set device to be used for GPU executions
+    ////
+#endif
+
 #if ( DEBUGlevel>=1 )
     CHECK_MALLOC(iam, "Enter pdgssvx()");
 #endif
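The parameter checks above now compare against the first enumerator of each option enum (DOFACT, NOROWPERM, NATURAL, NOREFINE) instead of the literal 0, so the tests stay correct even if the enums are ever renumbered. The idiom, reduced to a standalone sketch (the enum below is a stand-in, not SuperLU_DIST's actual definition):

    typedef enum { DOFACT, SamePattern, SamePattern_SameRowPerm, FACTORED } fact_t;

    /* -1 mirrors the drivers' *info = -1 convention for a bad argument */
    static int check_fact_range(fact_t Fact)
    {
        return (Fact < DOFACT || Fact > FACTORED) ? -1 : 0;
    }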
diff --git a/SRC/pdgssvx3d.c b/SRC/pdgssvx3d.c
index ffa99035..9a140a36 100644
--- a/SRC/pdgssvx3d.c
+++ b/SRC/pdgssvx3d.c
@@ -580,6 +580,15 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
     
     /* Initialization. */
 
+#ifdef GPU_ACC
+    /* Binding each MPI to a CUDA device */
+    int devs;
+    // MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    cudaGetDeviceCount(&devs); // Returns the number of compute-capable devices
+    cudaSetDevice(iam % devs); // Set device to be used for GPU executions
+    ////
+#endif
+
     options->Algo3d = YES;
 	
     /* definition of factored seen by each process layer */
diff --git a/SRC/psgssvx.c b/SRC/psgssvx.c
index 73020a02..2e922695 100644
--- a/SRC/psgssvx.c
+++ b/SRC/psgssvx.c
@@ -639,6 +639,15 @@ psgssvx(superlu_dist_options_t *options, SuperMatrix *A,
     C = ScalePermstruct->C;
     /********/
 
+#ifdef GPU_ACC
+    /* Binding each MPI to a CUDA device */
+    int devs;
+    // MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    cudaGetDeviceCount(&devs); // Returns the number of compute-capable devices
+    cudaSetDevice(iam % devs); // Set device to be used for GPU executions
+    ////
+#endif
+
 #if ( DEBUGlevel>=1 )
     CHECK_MALLOC(iam, "Enter psgssvx()");
 #endif
diff --git a/SRC/psgssvx3d.c b/SRC/psgssvx3d.c
index 727da4dc..6548c470 100644
--- a/SRC/psgssvx3d.c
+++ b/SRC/psgssvx3d.c
@@ -580,6 +580,15 @@ psgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
     
     /* Initialization. */
 
+#ifdef GPU_ACC
+    /* Binding each MPI to a CUDA device */
+    int devs;
+    // MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    cudaGetDeviceCount(&devs); // Returns the number of compute-capable devices
+    cudaSetDevice(iam % devs); // Set device to be used for GPU executions
+    ////
+#endif
+
     options->Algo3d = YES;
 	
     /* definition of factored seen by each process layer */
diff --git a/SRC/pzgssvx.c b/SRC/pzgssvx.c
index a1ea6bcf..606bd676 100644
--- a/SRC/pzgssvx.c
+++ b/SRC/pzgssvx.c
@@ -405,7 +405,7 @@ at the top-level directory.
  *           of Pc*A'*A*Pc'; perm_c is not changed if the elimination tree
  *           is already in postorder.
  *
- *         o R (double*) dimension (A->nrow)
+ *         o R (double *) dimension (A->nrow)
  *           The row scale factors for A.
  *           If DiagScale = ROW or BOTH, A is multiplied on the left by
  *                          diag(R).
@@ -413,7 +413,7 @@ at the top-level directory.
  *           If options->Fact = FACTORED or SamePattern_SameRowPerm, R is
  *           an input argument; otherwise, R is an output argument.
  *
- *         o C (double*) dimension (A->ncol)
+ *         o C (double *) dimension (A->ncol)
  *           The column scale factors for A.
  *           If DiagScale = COL or BOTH, A is multiplied on the right by
  *                          diag(C).
@@ -587,13 +587,13 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A,
     /* Test the input parameters. */
     *info = 0;
     Fact = options->Fact;
-    if ( Fact < 0 || Fact > FACTORED )
+    if ( Fact < DOFACT || Fact > FACTORED )
 	*info = -1;
-    else if ( options->RowPerm < 0 || options->RowPerm > MY_PERMR )
+    else if ( options->RowPerm < NOROWPERM || options->RowPerm > MY_PERMR )
 	*info = -1;
-    else if ( options->ColPerm < 0 || options->ColPerm > MY_PERMC )
+    else if ( options->ColPerm < NATURAL || options->ColPerm > MY_PERMC )
 	*info = -1;
-    else if ( options->IterRefine < 0 || options->IterRefine > SLU_EXTRA )
+    else if ( options->IterRefine < NOREFINE || options->IterRefine > SLU_EXTRA )
 	*info = -1;
     else if ( options->IterRefine == SLU_EXTRA ) {
 	*info = -1;
@@ -638,6 +638,15 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A,
     C = ScalePermstruct->C;
     /********/
 
+#ifdef GPU_ACC
+    /* Binding each MPI to a CUDA device */
+    int devs;
+    // MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    cudaGetDeviceCount(&devs); // Returns the number of compute-capable devices
+    cudaSetDevice(iam % devs); // Set device to be used for GPU executions
+    ////
+#endif
+
 #if ( DEBUGlevel>=1 )
     CHECK_MALLOC(iam, "Enter pzgssvx()");
 #endif
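One caveat about iam % devs (and rank % devs in the example drivers): it spreads ranks evenly only when ranks are numbered consecutively within each node. A hedged alternative, not used by this patch, derives a node-local rank with MPI-3's shared-memory split and binds with that instead:

    MPI_Comm node_comm;
    int local_rank, devs;
    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
                        MPI_INFO_NULL, &node_comm);
    MPI_Comm_rank(node_comm, &local_rank);   /* rank within this node  */
    cudaGetDeviceCount(&devs);
    cudaSetDevice(local_rank % devs);        /* one GPU per local rank */
    MPI_Comm_free(&node_comm);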
diff --git a/SRC/pzgssvx3d.c b/SRC/pzgssvx3d.c
index 8212ec39..0843881d 100644
--- a/SRC/pzgssvx3d.c
+++ b/SRC/pzgssvx3d.c
@@ -579,6 +579,15 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
     
     /* Initialization. */
 
+#ifdef GPU_ACC
+    /* Binding each MPI to a CUDA device */
+    int devs;
+    // MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    cudaGetDeviceCount(&devs); // Returns the number of compute-capable devices
+    cudaSetDevice(iam % devs); // Set device to be used for GPU executions
+    ////
+#endif
+
     options->Algo3d = YES;
 	
     /* definition of factored seen by each process layer */
diff --git a/SRC/snrformat_loc3d.c b/SRC/snrformat_loc3d.c
index 01855bf5..99e37c06 100644
--- a/SRC/snrformat_loc3d.c
+++ b/SRC/snrformat_loc3d.c
@@ -98,14 +98,17 @@ void sGatherNRformat_loc3d
 	nnz_disp[0] = 0;
 	row_disp[0] = 0;
 	b_disp[0] = 0;
+	int nrhs1 = nrhs; // input 
+	if ( nrhs <= 0 ) nrhs1 = 1; /* Make sure to compute offsets and
+	                               counts for future use.   */
 	for (int i = 0; i < grid3d->npdep; i++)
 	    {
 		nnz_disp[i + 1] = nnz_disp[i] + nnz_counts[i];
 		row_disp[i + 1] = row_disp[i] + row_counts[i];
-		b_disp[i + 1] = nrhs * row_disp[i + 1];
+		b_disp[i + 1] = nrhs1 * row_disp[i + 1];
 		nnz_counts_int[i] = nnz_counts[i];
 		row_counts_int[i] = row_counts[i];
-		b_counts_int[i] = nrhs * row_counts[i];
+		b_counts_int[i] = nrhs1 * row_counts[i];
 	    }
 
 	if (grid3d->zscp.Iam == 0)
@@ -182,7 +185,7 @@ void sGatherNRformat_loc3d
     A3d->B3d = (float *) B; /* save the pointer to the original B
 				    stored on 3D process grid.  */
     A3d->ldb = ldb;
-    A3d->nrhs = nrhs;
+    A3d->nrhs = nrhs; // record the input 
 	
     /********* Gather B2d **********/
     if ( nrhs > 0 ) {
@@ -193,8 +196,6 @@ void sGatherNRformat_loc3d
 	row_counts_int = A3d->row_counts_int;
 	row_disp       = A3d->row_disp;
 	
-	printf("dGather_loc3d(2): row_disp %p, A3d %p\n", row_disp, A3d); fflush(stdout);
-
 	/* Btmp <- compact(B), compacting B */
 	float *Btmp;
 	Btmp = SUPERLU_MALLOC(A->m_loc * nrhs * sizeof(float));
diff --git a/SRC/superlu_ddefs.h b/SRC/superlu_ddefs.h
index 1a724af8..05e3c8d8 100644
--- a/SRC/superlu_ddefs.h
+++ b/SRC/superlu_ddefs.h
@@ -634,10 +634,9 @@ extern int superlu_dgemv(const char *trans, const int m,
 extern int superlu_dtrsv(char *uplo, char *trans, char *diag,
                   int n, double *a, int lda, double *x, int incx);
 
-
-// LAPACK routine
+#ifdef SLU_HAVE_LAPACK
 extern void dtrtri_(char*, char*, int*, double*, int*, int*);
-
+#endif
 
 /*==== For 3D code ====*/
 extern int dcreate_matrix3d(SuperMatrix *A, int nrhs, double **rhs,
diff --git a/SRC/superlu_zdefs.h b/SRC/superlu_zdefs.h
index ed7c53ea..e2e2817e 100644
--- a/SRC/superlu_zdefs.h
+++ b/SRC/superlu_zdefs.h
@@ -636,10 +636,9 @@ extern int superlu_zgemv(const char *trans, const int m,
 extern int superlu_ztrsv(char *uplo, char *trans, char *diag,
                   int n, doublecomplex *a, int lda, doublecomplex *x, int incx);
 
-
-// LAPACK routine
+#ifdef SLU_HAVE_LAPACK
 extern void ztrtri_(char*, char*, int*, doublecomplex*, int*, int*);
-
+#endif
 
 /*==== For 3D code ====*/
 extern int zcreate_matrix3d(SuperMatrix *A, int nrhs, doublecomplex **rhs,
diff --git a/SRC/znrformat_loc3d.c b/SRC/znrformat_loc3d.c
index 8d7d046d..19535d5e 100644
--- a/SRC/znrformat_loc3d.c
+++ b/SRC/znrformat_loc3d.c
@@ -97,14 +97,17 @@ void zGatherNRformat_loc3d
 	nnz_disp[0] = 0;
 	row_disp[0] = 0;
 	b_disp[0] = 0;
+	int nrhs1 = nrhs; // input 
+	if ( nrhs <= 0 ) nrhs1 = 1; /* Make sure to compute offsets and
+	                               counts for future use.   */
 	for (int i = 0; i < grid3d->npdep; i++)
 	    {
 		nnz_disp[i + 1] = nnz_disp[i] + nnz_counts[i];
 		row_disp[i + 1] = row_disp[i] + row_counts[i];
-		b_disp[i + 1] = nrhs * row_disp[i + 1];
+		b_disp[i + 1] = nrhs1 * row_disp[i + 1];
 		nnz_counts_int[i] = nnz_counts[i];
 		row_counts_int[i] = row_counts[i];
-		b_counts_int[i] = nrhs * row_counts[i];
+		b_counts_int[i] = nrhs1 * row_counts[i];
 	    }
 
 	if (grid3d->zscp.Iam == 0)
@@ -181,7 +184,7 @@ void zGatherNRformat_loc3d
     A3d->B3d = (doublecomplex *) B; /* save the pointer to the original B
 				    stored on 3D process grid.  */
     A3d->ldb = ldb;
-    A3d->nrhs = nrhs;
+    A3d->nrhs = nrhs; // record the input 
 	
     /********* Gather B2d **********/
     if ( nrhs > 0 ) {
@@ -192,8 +195,6 @@ void zGatherNRformat_loc3d
 	row_counts_int = A3d->row_counts_int;
 	row_disp       = A3d->row_disp;
 	
-	printf("dGather_loc3d(2): row_disp %p, A3d %p\n", row_disp, A3d); fflush(stdout);
-
 	/* Btmp <- compact(B), compacting B */
 	doublecomplex *Btmp;
 	Btmp = SUPERLU_MALLOC(A->m_loc * nrhs * sizeof(doublecomplex));

From 021946b9c33743edb5c470423d8b3552ff0cfb35 Mon Sep 17 00:00:00 2001
From: Sherry Li 
Date: Sun, 12 Sep 2021 10:19:07 -0700
Subject: [PATCH 122/147] Add 3D examples for SamePattern and
 SamePattern_SameRowPerm. Add xZeroUblocks() in xutil_dist.c

---
 EXAMPLE/CMakeLists.txt     |  24 +++
 EXAMPLE/Makefile           |  24 ++-
 EXAMPLE/README             |  18 +-
 EXAMPLE/dnrformat_loc3d.c  | 404 +++++++++++++++++++++++++++++++++++
 EXAMPLE/pddrive2.c         |  24 ++-
 EXAMPLE/pddrive3.c         |  11 +-
 EXAMPLE/pddrive3d1.c       |   4 +-
 EXAMPLE/pddrive3d2.c       | 410 ++++++++++++++++++++++++++++++++++++
 EXAMPLE/pddrive3d3.c       | 416 +++++++++++++++++++++++++++++++++++++
 EXAMPLE/psdrive2.c         |  24 ++-
 EXAMPLE/psdrive3.c         |  11 +-
 EXAMPLE/psdrive3d1.c       |   4 +-
 EXAMPLE/psdrive3d2.c       | 410 ++++++++++++++++++++++++++++++++++++
 EXAMPLE/psdrive3d3.c       | 416 +++++++++++++++++++++++++++++++++++++
 EXAMPLE/pzdrive2.c         |  24 ++-
 EXAMPLE/pzdrive3.c         |  11 +-
 EXAMPLE/pzdrive3d1.c       |   4 +-
 EXAMPLE/pzdrive3d2.c       | 410 ++++++++++++++++++++++++++++++++++++
 EXAMPLE/pzdrive3d3.c       | 416 +++++++++++++++++++++++++++++++++++++
 SRC/dlustruct_gpu.h        |   5 +-
 SRC/dnrformat_loc3d.c      |  84 ++++++--
 SRC/dreadMM.c              |   2 +-
 SRC/dreadtriple.c          |   7 +-
 SRC/dreadtriple_noheader.c |   6 +-
 SRC/dsuperlu_gpu.cu        |  37 ++--
 SRC/dutil_dist.c           |  40 +++-
 SRC/pdgssvx3d.c            |  10 +-
 SRC/pdutil.c               |   8 +-
 SRC/psgssvx3d.c            |  12 +-
 SRC/psutil.c               |   8 +-
 SRC/pzgssvx3d.c            |  12 +-
 SRC/pzutil.c               |   8 +-
 SRC/slustruct_gpu.h        |   5 +-
 SRC/snrformat_loc3d.c      |  84 ++++++--
 SRC/sreadMM.c              |   2 +-
 SRC/sreadtriple.c          |   7 +-
 SRC/sreadtriple_noheader.c |   6 +-
 SRC/ssuperlu_gpu.cu        |  14 +-
 SRC/superlu_ddefs.h        |   3 +-
 SRC/superlu_zdefs.h        |   3 +-
 SRC/supermatrix.h          |  18 +-
 SRC/sutil_dist.c           |  40 +++-
 SRC/zlustruct_gpu.h        |   5 +-
 SRC/znrformat_loc3d.c      |  84 ++++++--
 SRC/zreadMM.c              |   2 +-
 SRC/zreadtriple.c          |   7 +-
 SRC/zreadtriple_noheader.c |   6 +-
 SRC/zsuperlu_gpu.cu        |  20 +-
 SRC/zutil_dist.c           |  43 +++-
 49 files changed, 3433 insertions(+), 220 deletions(-)
 create mode 100644 EXAMPLE/dnrformat_loc3d.c
 create mode 100644 EXAMPLE/pddrive3d2.c
 create mode 100644 EXAMPLE/pddrive3d3.c
 create mode 100644 EXAMPLE/psdrive3d2.c
 create mode 100644 EXAMPLE/psdrive3d3.c
 create mode 100644 EXAMPLE/pzdrive3d2.c
 create mode 100644 EXAMPLE/pzdrive3d3.c

diff --git a/EXAMPLE/CMakeLists.txt b/EXAMPLE/CMakeLists.txt
index 468b6a2e..f22ef414 100644
--- a/EXAMPLE/CMakeLists.txt
+++ b/EXAMPLE/CMakeLists.txt
@@ -65,6 +65,14 @@ if(enable_double)
   add_executable(pddrive3d1 ${DEXM3D1})
   target_link_libraries(pddrive3d1 ${all_link_libs})
   
+  set(DEXM3D2 pddrive3d2.c dcreate_matrix.c dcreate_matrix3d.c)
+  add_executable(pddrive3d2 ${DEXM3D2})
+  target_link_libraries(pddrive3d2 ${all_link_libs})
+  
+  set(DEXM3D3 pddrive3d3.c dcreate_matrix.c dcreate_matrix3d.c)
+  add_executable(pddrive3d3 ${DEXM3D3})
+  target_link_libraries(pddrive3d3 ${all_link_libs})
+  
   set(DEXMG pddrive_ABglobal.c)
   add_executable(pddrive_ABglobal ${DEXMG})
   target_link_libraries(pddrive_ABglobal ${all_link_libs})
@@ -124,6 +132,14 @@ if(enable_single)
   add_executable(psdrive3d1 ${SEXM3D1})
   target_link_libraries(psdrive3d1 ${all_link_libs})
   
+  set(SEXM3D2 psdrive3d2.c screate_matrix.c screate_matrix3d.c)
+  add_executable(psdrive3d2 ${SEXM3D2})
+  target_link_libraries(psdrive3d2 ${all_link_libs})
+  
+  set(SEXM3D3 psdrive3d3.c screate_matrix.c screate_matrix3d.c)
+  add_executable(psdrive3d3 ${SEXM3D3})
+  target_link_libraries(psdrive3d3 ${all_link_libs})
+  
 endif() #### end enable_single
 
 if(enable_complex16)
@@ -159,6 +175,14 @@ if(enable_complex16)
   add_executable(pzdrive3d1 ${ZEXM3D1})
   target_link_libraries(pzdrive3d1 ${all_link_libs})
   
+  set(ZEXM3D2 pzdrive3d2.c zcreate_matrix.c zcreate_matrix3d.c)
+  add_executable(pzdrive3d2 ${ZEXM3D2})
+  target_link_libraries(pzdrive3d2 ${all_link_libs})
+  
+  set(ZEXM3D3 pzdrive3d3.c zcreate_matrix.c zcreate_matrix3d.c)
+  add_executable(pzdrive3d3 ${ZEXM3D3})
+  target_link_libraries(pzdrive3d3 ${all_link_libs})
+  
   set(ZEXMG pzdrive_ABglobal.c)
   add_executable(pzdrive_ABglobal ${ZEXMG})
   target_link_libraries(pzdrive_ABglobal ${all_link_libs})
diff --git a/EXAMPLE/Makefile b/EXAMPLE/Makefile
index 8c36ed2a..37387b52 100644
--- a/EXAMPLE/Makefile
+++ b/EXAMPLE/Makefile
@@ -38,6 +38,8 @@ DEXM4	= pddrive4.o dcreate_matrix.o
 
 DEXM3D	= pddrive3d.o dcreate_matrix.o dcreate_matrix3d.o 
 DEXM3D1	= pddrive3d1.o dcreate_matrix.o dcreate_matrix3d.o 
+DEXM3D2	= pddrive3d2.o dcreate_matrix.o dcreate_matrix3d.o
+DEXM3D3	= pddrive3d3.o dcreate_matrix.o dcreate_matrix3d.o
 
 #	   dtrfAux.o dtreeFactorization.o treeFactorization.o pd3dcomm.o superlu_grid3d.o pdgstrf3d.o
 DEXMG	= pddrive_ABglobal.o
@@ -53,7 +55,9 @@ ZEXM3	= pzdrive3.o zcreate_matrix.o
 ZEXM4	= pzdrive4.o zcreate_matrix.o
 ZEXM3D	= pzdrive3d.o zcreate_matrix.o zcreate_matrix3d.o
 ZEXM3D1	= pzdrive3d1.o zcreate_matrix.o zcreate_matrix3d.o
-ZEXM3D1	= pzdrive3d1.o zcreate_matrix.o zcreate_matrix3d.o
+ZEXM3D2	= pzdrive3d2.o zcreate_matrix.o zcreate_matrix3d.o
+ZEXM3D3	= pzdrive3d3.o zcreate_matrix.o zcreate_matrix3d.o
+
 ZEXMG	= pzdrive_ABglobal.o
 ZEXMG1	= pzdrive1_ABglobal.o
 ZEXMG2	= pzdrive2_ABglobal.o
@@ -61,15 +65,15 @@ ZEXMG3	= pzdrive3_ABglobal.o
 ZEXMG4	= pzdrive4_ABglobal.o
 
 
-all: pddrive3d1 pddrive3d double complex16
+all: double complex16
 
 double:    pddrive pddrive1 pddrive2 pddrive3 pddrive4 \
-	   pddrive3d pddrive3d1 \
+	   pddrive3d pddrive3d1 pddrive3d2 pddrive3d3 \
 	   pddrive_ABglobal pddrive1_ABglobal pddrive2_ABglobal \
 	   pddrive3_ABglobal pddrive4_ABglobal
 
 complex16: pzdrive pzdrive1 pzdrive2 pzdrive3 pzdrive4 \
-	   pzdrive3d pzdrive3d1 \
+	   pzdrive3d pzdrive3d1 pzdrive3d2 pzdrive3d3 \
 	   pzdrive_ABglobal pzdrive1_ABglobal pzdrive2_ABglobal \
 	   pzdrive3_ABglobal pzdrive4_ABglobal
 
@@ -94,6 +98,12 @@ pddrive3d: $(DEXM3D) $(DSUPERLULIB)
 pddrive3d1: $(DEXM3D1) $(DSUPERLULIB)
 	$(LOADER) $(LOADOPTS) $(DEXM3D1) $(LIBS) -lm -o $@
 
+pddrive3d2: $(DEXM3D2) $(DSUPERLULIB)
+	$(LOADER) $(LOADOPTS) $(DEXM3D2) $(LIBS) -lm -o $@
+
+pddrive3d3: $(DEXM3D3) $(DSUPERLULIB)
+	$(LOADER) $(LOADOPTS) $(DEXM3D3) $(LIBS) -lm -o $@
+
 pddrive_ABglobal: $(DEXMG) $(DSUPERLULIB)
 	$(LOADER) $(LOADOPTS) $(DEXMG) $(LIBS) -lm -o $@
 
@@ -133,6 +143,12 @@ pzdrive3d: $(ZEXM3D) $(DSUPERLULIB)
 pzdrive3d1: $(ZEXM3D1) $(DSUPERLULIB)
 	$(LOADER) $(LOADOPTS) $(ZEXM3D1) $(LIBS) -lm -o $@
 
+pzdrive3d2: $(ZEXM3D2) $(DSUPERLULIB)
+	$(LOADER) $(LOADOPTS) $(ZEXM3D2) $(LIBS) -lm -o $@
+
+pzdrive3d3: $(ZEXM3D3) $(DSUPERLULIB)
+	$(LOADER) $(LOADOPTS) $(ZEXM3D3) $(LIBS) -lm -o $@
+
 pzdrive_ABglobal: $(ZEXMG) $(DSUPERLULIB)
 	$(LOADER) $(LOADOPTS) $(ZEXMG) $(LIBS) -lm -o $@
 
diff --git a/EXAMPLE/README b/EXAMPLE/README
index 7cfba7de..110484d2 100644
--- a/EXAMPLE/README
+++ b/EXAMPLE/README
@@ -4,11 +4,22 @@
 This directory contains sample programs to illustrate how to use
 various functions provided in SuperLU_DIST. You can modify these
 examples to suit your applications.
+(double real: pddrive*
+ double complex: pzdrive* )
 
 The examples illustrate the following functionalities:
-  0. pddrive3d.c
-     Use PDGSSVX3D communication-avoiding 3D algorithm with the default options
-     to solve a linear system
+  0. pddrive3d.c: (invokes the new communication-avoiding 3D algorithms)
+         Use PxGSSVX3D with the default options to solve a linear system
+     pddrive3d1.c:
+         Use PxGSSVX3D to solve the systems with the same A but different
+         right-hand sides. (Reuse the factored form of A)
+     pddrive3d2.c:
+         Use PxGSSVX3D to solve the systems with the same sparsity pattern
+         of A. (Reuse the sparsity ordering)
+     pddrive3d3.c:
+         Use PxGSSVX3D to solve the systems with the same sparsity pattern
+         and similar values. (Reuse the sparsity ordering and row pivoting)
+     
   1. pddrive.c, pddrive_ABglobal.c
      Use PDGSSVX with the full (default) options to solve a linear system.
   2. pddrive1.c, pddrive1_ABglobal.c
@@ -19,6 +30,7 @@ The examples illustrate the following functionalities:
      (Reuse the sparsity ordering)
   4. pddrive3.c, pddrive3_ABglobal.c
      Solve the systems with the same sparsity pattern and similar values.
+     (Reuse sparsity ordering and row pivoting)     
   5. pddrive4.c, pddrive4_ABglobal.c
      Divide the processors into two subgroups (two grids) such that each
      subgroup solves a linear system independently from the other.
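The three new 3D drivers differ only in how much of the first factorization the second solve reuses, controlled by options.Fact. The shape of the workflow, sketched with the pdgssvx3d interface these examples use (setup, error handling, and deallocation omitted):

    set_default_options_dist(&options);           /* options.Fact == DOFACT   */
    pdgssvx3d(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
              &LUstruct, &SOLVEstruct, berr, &stat, &info);

    options.Fact = SamePattern;                   /* pddrive3d2: reuse perm_c */
    /* reload A with the same pattern but new values; set up b1 */
    pdgssvx3d(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid,
              &LUstruct, &SOLVEstruct, berr, &stat, &info);

pddrive3d1 instead keeps Fact = FACTORED (same A, new right-hand sides), and pddrive3d3 uses SamePattern_SameRowPerm (same pattern and row permutation, similar values).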
diff --git a/EXAMPLE/dnrformat_loc3d.c b/EXAMPLE/dnrformat_loc3d.c
new file mode 100644
index 00000000..1fbaca69
--- /dev/null
+++ b/EXAMPLE/dnrformat_loc3d.c
@@ -0,0 +1,404 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+
+/*! @file
+ * \brief Preprocessing routines for the 3D factorization/solve codes:
+ *        - Gather {A,B} from 3D grid to 2D process layer 0
+ *        - Scatter B (solution) from 2D process layer 0 to 3D grid
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Oak Ridge National Lab.
+ * May 12, 2021
+ */
+
+#include "superlu_ddefs.h"
+
+/* Dst <- BlockByBlock (Src), reshape the block storage. */
+static void matCopy(int n, int m, double *Dst, int lddst, double *Src, int ldsrc)
+{
+    for (int j = 0; j < m; j++)
+        for (int i = 0; i < n; ++i)
+        {
+            Dst[i + lddst * j] = Src[i + ldsrc * j];
+        }
+
+    return;
+}
+
+/*
+ * Gather {A,B} from 3D grid to 2D process layer 0
+ *     Input:  {A, B, ldb} are distributed on 3D process grid
+ *     Output: {A2d, B2d} are distributed on layer 0 2D process grid
+ *             output is in the returned A3d->{} structure.
+ *             see supermatrix.h for nrformat_loc3d{} structure.
+ */
+void dGatherNRformat_loc3d
+(
+ fact_t Fact,     // how matrix A will be factorized
+ NRformat_loc *A, // input, on 3D grid
+ double *B,       // input
+ int ldb, int nrhs, // input
+ gridinfo3d_t *grid3d, 
+ NRformat_loc3d **A3d_addr /* If Fact == DOFACT, it is an input;
+ 		              Else it is both input and may be modified */
+ )
+{
+    NRformat_loc3d *A3d = (NRformat_loc3d *) *A3d_addr;
+    NRformat_loc *A2d;
+    int *row_counts_int; // 32-bit, number of local rows relative to all processes
+    int *row_disp;       // displacement
+    int *nnz_counts_int; // number of local nnz relative to all processes
+    int *nnz_disp;       // displacement
+    int *b_counts_int;   // number of local B entries relative to all processes 
+    int *b_disp;         // including 'nrhs'
+	
+    /********* Gather A2d *********/
+    if ( Fact == DOFACT ) { /* Factorize from scratch */
+	/* A3d is output. Compute counts from scratch */
+	A3d = SUPERLU_MALLOC(sizeof(NRformat_loc3d));
+	A2d = SUPERLU_MALLOC(sizeof(NRformat_loc));
+    
+	// find number of nnzs
+	int_t *nnz_counts; // number of local nonzeros relative to all processes
+	int_t *row_counts; // number of local rows relative to all processes
+	int *nnz_counts_int; // 32-bit
+	int *nnz_disp; // displacement
+
+	nnz_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t));
+	row_counts = SUPERLU_MALLOC(grid3d->npdep * sizeof(int_t));
+	nnz_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
+	row_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
+	b_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
+	MPI_Gather(&A->nnz_loc, 1, mpi_int_t, nnz_counts,
+		   1, mpi_int_t, 0, grid3d->zscp.comm);
+	MPI_Gather(&A->m_loc, 1, mpi_int_t, row_counts,
+		   1, mpi_int_t, 0, grid3d->zscp.comm);
+	nnz_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
+	row_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
+	b_disp = SUPERLU_MALLOC((grid3d->npdep + 1) * sizeof(int));
+
+	nnz_disp[0] = 0;
+	row_disp[0] = 0;
+	b_disp[0] = 0;
+	int nrhs1 = nrhs; // input 
+	if ( nrhs <= 0 ) nrhs1 = 1; /* Make sure to compute offsets and
+	                               counts for future use.   */
+	for (int i = 0; i < grid3d->npdep; i++)
+	    {
+		nnz_disp[i + 1] = nnz_disp[i] + nnz_counts[i];
+		row_disp[i + 1] = row_disp[i] + row_counts[i];
+		b_disp[i + 1] = nrhs1 * row_disp[i + 1];
+		nnz_counts_int[i] = nnz_counts[i];
+		row_counts_int[i] = row_counts[i];
+		b_counts_int[i] = nrhs1 * row_counts[i];
+	    }
+
+	if (grid3d->zscp.Iam == 0)
+	    {
+		A2d->colind = intMalloc_dist(nnz_disp[grid3d->npdep]);
+		A2d->nzval = doubleMalloc_dist(nnz_disp[grid3d->npdep]);
+		A2d->rowptr = intMalloc_dist((row_disp[grid3d->npdep] + 1));
+	    }
+
+	MPI_Gatherv(A->nzval, A->nnz_loc, MPI_DOUBLE, A2d->nzval,
+		    nnz_counts_int, nnz_disp,
+		    MPI_DOUBLE, 0, grid3d->zscp.comm);
+	MPI_Gatherv(A->colind, A->nnz_loc, mpi_int_t, A2d->colind,
+		    nnz_counts_int, nnz_disp,
+		    mpi_int_t, 0, grid3d->zscp.comm);
+	MPI_Gatherv(&A->rowptr[1], A->m_loc, mpi_int_t, &A2d->rowptr[1],
+		    row_counts_int, row_disp,
+		    mpi_int_t, 0, grid3d->zscp.comm);
+
+	if (grid3d->zscp.Iam == 0)
+	    {
+		A2d->rowptr[0] = 0;
+		for (int i = 0; i < grid3d->npdep; i++)
+		    {
+			for (int j = row_disp[i] + 1; j < row_disp[i + 1] + 1; j++)
+			    {
+				// A2d->rowptr[j] += row_disp[i];
+				A2d->rowptr[j] += nnz_disp[i];
+			    }
+		    }
+		A2d->nnz_loc = nnz_disp[grid3d->npdep];
+		A2d->m_loc = row_disp[grid3d->npdep];
+
+		if (grid3d->rankorder == 1) { // XY-major
+		    A2d->fst_row = A->fst_row;
+		} else { // Z-major
+		    gridinfo_t *grid2d = &(grid3d->grid2d);
+		    int procs2d = grid2d->nprow * grid2d->npcol;
+		    int m_loc_2d = A2d->m_loc;
+		    int *m_loc_2d_counts = SUPERLU_MALLOC(procs2d * sizeof(int));
+
+		    MPI_Allgather(&m_loc_2d, 1, MPI_INT, m_loc_2d_counts, 1, 
+				  MPI_INT, grid2d->comm);
+
+		    int fst_row = 0;
+		    for (int p = 0; p < procs2d; ++p)
+			{
+			    if (grid2d->iam == p)
+				A2d->fst_row = fst_row;
+			    fst_row += m_loc_2d_counts[p];
+			}
+
+		    SUPERLU_FREE(m_loc_2d_counts);
+		}
+	    } /* end 2D layer grid-0 */
+
+	A3d->A_nfmt         = A2d;
+	A3d->row_counts_int = row_counts_int;
+	A3d->row_disp       = row_disp;
+	A3d->nnz_counts_int = nnz_counts_int;
+	A3d->nnz_disp       = nnz_disp;
+	A3d->b_counts_int   = b_counts_int;
+	A3d->b_disp         = b_disp;
+
+	/* free storage */
+	SUPERLU_FREE(nnz_counts);
+	SUPERLU_FREE(row_counts);
+	
+	*A3d_addr = (NRformat_loc3d *) A3d; // return pointer to A3d struct
+	
+    } else if ( Fact == SamePattern || Fact == SamePattern_SameRowPerm ) {
+	/* A3d is input. No need to recompute count.
+	   Only need to gather A2d matrix; the previous 2D matrix
+	   was overwritten by equilibration, perm_r and perm_c.  */
+	NRformat_loc *A2d = A3d->A_nfmt;
+	row_counts_int = A3d->row_counts_int;
+	row_disp       = A3d->row_disp;
+	nnz_counts_int = A3d->nnz_counts_int;
+	nnz_disp       = A3d->nnz_disp;
+
+	MPI_Gatherv(A->nzval, A->nnz_loc, MPI_DOUBLE, A2d->nzval,
+		    nnz_counts_int, nnz_disp,
+		    MPI_DOUBLE, 0, grid3d->zscp.comm);
+	MPI_Gatherv(A->colind, A->nnz_loc, mpi_int_t, A2d->colind,
+		    nnz_counts_int, nnz_disp,
+		    mpi_int_t, 0, grid3d->zscp.comm);
+	MPI_Gatherv(&A->rowptr[1], A->m_loc, mpi_int_t, &A2d->rowptr[1],
+		    row_counts_int, row_disp,
+		    mpi_int_t, 0, grid3d->zscp.comm);
+	
+	if (grid3d->zscp.Iam == 0) {
+		A2d->rowptr[0] = 0;
+		
+		for (int i = 0; i < grid3d->npdep; i++)
+		    {
+			for (int j = row_disp[i] + 1; j < row_disp[i + 1] + 1; j++)
+			    {
+				// A2d->rowptr[j] += row_disp[i];
+				A2d->rowptr[j] += nnz_disp[i];
+			    }
+		    }
+		A2d->nnz_loc = nnz_disp[grid3d->npdep];
+		A2d->m_loc = row_disp[grid3d->npdep];
+
+		if (grid3d->rankorder == 1) { // XY-major
+		    A2d->fst_row = A->fst_row;
+		} else { // Z-major
+		    gridinfo_t *grid2d = &(grid3d->grid2d);
+		    int procs2d = grid2d->nprow * grid2d->npcol;
+		    int m_loc_2d = A2d->m_loc;
+		    int *m_loc_2d_counts = SUPERLU_MALLOC(procs2d * sizeof(int));
+
+		    MPI_Allgather(&m_loc_2d, 1, MPI_INT, m_loc_2d_counts, 1, 
+				  MPI_INT, grid2d->comm);
+
+		    int fst_row = 0;
+		    for (int p = 0; p < procs2d; ++p)
+			{
+			    if (grid2d->iam == p)
+				A2d->fst_row = fst_row;
+			    fst_row += m_loc_2d_counts[p];
+			}
+
+		    SUPERLU_FREE(m_loc_2d_counts);
+		}
+	} /* end 2D layer grid-0 */
+		    
+    } /* SamePattern or SamePattern_SameRowPerm */
+
+    A3d->m_loc = A->m_loc;
+    A3d->B3d = (double *) B; /* save the pointer to the original B
+				    stored on 3D process grid.  */
+    A3d->ldb = ldb;
+    A3d->nrhs = nrhs; // record the input 
+	
+    /********* Gather B2d **********/
+    if ( nrhs > 0 ) {
+	
+	A2d = (NRformat_loc *) A3d->A_nfmt; // matrix A gathered on 2D grid-0
+	row_counts_int = A3d->row_counts_int;
+	row_disp       = A3d->row_disp;
+	b_counts_int   = A3d->b_counts_int;
+	b_disp         = A3d->b_disp;
+	
+	/* Btmp <- compact(B), compacting B */
+	double *Btmp;
+	Btmp = SUPERLU_MALLOC(A->m_loc * nrhs * sizeof(double));
+	matCopy(A->m_loc, nrhs, Btmp, A->m_loc, B, ldb);
+
+	double *B1;
+	if (grid3d->zscp.Iam == 0)
+	    {
+		B1 = doubleMalloc_dist(A2d->m_loc * nrhs);
+		A3d->B2d = doubleMalloc_dist(A2d->m_loc * nrhs);
+	    }
+
+	// B1 <- gatherv(Btmp)
+	MPI_Gatherv(Btmp, nrhs * A->m_loc, MPI_DOUBLE, B1,
+		    b_counts_int, b_disp,
+		    MPI_DOUBLE, 0, grid3d->zscp.comm);
+	SUPERLU_FREE(Btmp);
+
+	// B2d <- colMajor(B1)
+	if (grid3d->zscp.Iam == 0)
+	    {
+		for (int i = 0; i < grid3d->npdep; ++i)
+		    {
+			/* code */
+			matCopy(row_counts_int[i], nrhs, ((double*)A3d->B2d) + row_disp[i],
+				A2d->m_loc, B1 + nrhs * row_disp[i], row_counts_int[i]);
+		    }
+		
+		SUPERLU_FREE(B1);
+	    }
+
+    } /* end gather B2d */
+
+} /* dGatherNRformat_loc3d */
+
+/*
+ * Scatter B (solution) from 2D process layer 0 to 3D grid
+ *   Output: X3d <- A^{-1} B2d
+ */
+int dScatter_B3d(NRformat_loc3d *A3d,  // modified
+		 gridinfo3d_t *grid3d)
+{
+    double *B = (double *) A3d->B3d; // retrieve original pointer on 3D grid
+    int ldb = A3d->ldb;
+    int nrhs = A3d->nrhs;
+    double *B2d = (double *) A3d->B2d; // only on 2D layer grid_0 
+    NRformat_loc *A2d = A3d->A_nfmt;
+
+    /* The following are the number of local rows relative to Z-dimension */
+    int m_loc           = A3d->m_loc;
+    int *b_counts_int   = A3d->b_counts_int;
+    int *b_disp         = A3d->b_disp;
+    int *row_counts_int = A3d->row_counts_int;
+    int *row_disp       = A3d->row_disp;
+    int i, p;
+    int iam = grid3d->iam;
+    int rankorder = grid3d->rankorder;
+    gridinfo_t *grid2d = &(grid3d->grid2d);
+
+    double *B1;  // on 2D layer 0
+    if (grid3d->zscp.Iam == 0)
+    {
+        B1 = doubleMalloc_dist(A2d->m_loc * nrhs);
+    }
+
+    // B1 <- BlockByBlock(B2d)
+    if (grid3d->zscp.Iam == 0)
+    {
+        for (i = 0; i < grid3d->npdep; ++i)
+        {
+            /* code */
+            matCopy(row_counts_int[i], nrhs, B1 + nrhs * row_disp[i], row_counts_int[i],
+                    B2d + row_disp[i], A2d->m_loc);
+        }
+    }
+
+    double *Btmp; // on 3D grid
+    Btmp = doubleMalloc_dist(A3d->m_loc * nrhs);
+
+    // Btmp <- scatterv(B1), block-by-block
+    if ( rankorder == 1 ) { /* XY-major in 3D grid */
+        /*    e.g. 1x3x4 grid: layer0 layer1 layer2 layer3
+	 *                     0      1      2      3
+	 *                     4      5      6      7
+	 *                     8      9      10     11
+	 */
+        MPI_Scatterv(B1, b_counts_int, b_disp, MPI_DOUBLE,
+		     Btmp, nrhs * A3d->m_loc, MPI_DOUBLE,
+		     0, grid3d->zscp.comm);
+
+    } else { /* Z-major in 3D grid */
+        /*    e.g. 1x3x4 grid: layer0 layer1 layer2 layer3
+	                       0      3      6      9
+ 	                       1      4      7      10      
+	                       2      5      8      11
+	  GATHER:  {A, B} in A * X = B
+	  layer-0:
+    	       B (row space)  X (column space)  SCATTER
+	       ----           ----        ---->>
+           P0  0              0
+(equations     3              1      Proc 0 -> Procs {0, 1, 2, 3}
+ reordered     6              2
+ after gather) 9              3
+	       ----           ----
+	   P1  1              4      Proc 1 -> Procs {4, 5, 6, 7}
+	       4              5
+               7              6
+               10             7
+	       ----           ----
+	   P2  2              8      Proc 2 -> Procs {8, 9, 10, 11}
+	       5              9
+	       8             10
+	       11            11
+	       ----         ----
+	*/
+        MPI_Request recv_req;
+	MPI_Status recv_status;
+	int pxy = grid2d->nprow * grid2d->npcol;
+	int npdep = grid3d->npdep, dest, src, tag;
+	int nprocs = pxy * npdep;
+
+	/* Everyone receives one block (post non-blocking irecv) */
+	src = grid3d->iam / npdep;  // Z-major
+	tag = iam;
+	MPI_Irecv(Btmp, nrhs * A3d->m_loc, MPI_DOUBLE,
+		 src, tag, grid3d->comm, &recv_req);
+
+	/* Layer 0 sends to npdep procs */
+	if (grid3d->zscp.Iam == 0) {
+	    int dest, tag;
+	    for (p = 0; p < npdep; ++p) { // send to npdep procs
+	        dest = p + grid2d->iam * npdep; // Z-major order
+		tag = dest;
+
+		MPI_Send(B1 + b_disp[p], b_counts_int[p], 
+			 MPI_DOUBLE, dest, tag, grid3d->comm);
+	    }
+	}  /* end layer 0 send */
+    
+	/* Wait for Irecv to complete */
+	MPI_Wait(&recv_req, &recv_status);
+
+    } /* else Z-major */
+
+    // B <- colMajor(Btmp)
+    matCopy(A3d->m_loc, nrhs, B, ldb, Btmp, A3d->m_loc);
+
+    /* free storage */
+    SUPERLU_FREE(Btmp);
+    if (grid3d->zscp.Iam == 0) {
+	SUPERLU_FREE(B1);
+	SUPERLU_FREE(B2d);
+    }
+
+    return 0;
+} /* dScatter_B3d */
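In the Z-major branch of dScatter_B3d above, the pairing between layer-0 senders and 3D receivers is pure index arithmetic: layer-0 2D rank q owns the consecutive block of npdep 3D ranks starting at q*npdep. As a standalone sketch:

    /* Z-major rank pairing, as in dScatter_B3d */
    int src_of(int iam3d, int npdep)       { return iam3d / npdep; }    /* receiver's sender */
    int dest_of(int q2d, int p, int npdep) { return q2d * npdep + p; }  /* sender's p-th target */

For the 1x3x4 example in the comment (npdep = 4), 3D rank 10 receives from layer-0 rank 10/4 = 2, matching "Proc 2 -> Procs {8, 9, 10, 11}".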
diff --git a/EXAMPLE/pddrive2.c b/EXAMPLE/pddrive2.c
index f60326bb..40da9732 100644
--- a/EXAMPLE/pddrive2.c
+++ b/EXAMPLE/pddrive2.c
@@ -33,8 +33,8 @@ at the top-level directory.
  *
  * The driver program PDDRIVE2.
  *
- * This example illustrates how to use  to solve
- * systems repeatedly with the same sparsity pattern of matrix A.
+ * This example illustrates how to use PDGSSVX to solve systems
+ * repeatedly with the same sparsity pattern of matrix A.
  * In this case, the column permutation vector ScalePermstruct->perm_c is
  * computed once. The following data structures will be reused in the
  * subsequent call to PDGSSVX:
@@ -147,7 +147,8 @@ int main(int argc, char *argv[])
        GET THE MATRIX FROM FILE AND SETUP THE RIGHT-HAND SIDE. 
        ------------------------------------------------------------*/
     dcreate_matrix_postfix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, postfix, &grid);
-
+    fclose(fp);
+    
     if ( !(berr = doubleMalloc_dist(nrhs)) )
 	ABORT("Malloc fails for berr[].");
     m = A.nrow;
@@ -156,7 +157,7 @@ int main(int argc, char *argv[])
     m_loc = Astore->m_loc;
 
     /* ------------------------------------------------------------
-       WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME.
+       1. WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME.
        ------------------------------------------------------------*/
 
     /* Set the default input options:
@@ -197,13 +198,13 @@ int main(int argc, char *argv[])
     PStatFree(&stat);
     Destroy_CompRowLoc_Matrix_dist(&A); /* Deallocate storage of matrix A.  */ 
     dDestroy_LU(n, &grid, &LUstruct); /* Deallocate storage associated with 
-					the L and U matrices.               */
-    SUPERLU_FREE(b);                 /* Free storage of right-hand side.    */
-    SUPERLU_FREE(xtrue);             /* Free storage of the exact solution. */
+					the L and U matrices.  */
+    SUPERLU_FREE(b);      /* Free storage of right-hand side.    */
+    SUPERLU_FREE(xtrue);  /* Free storage of the exact solution.*/
 
     /* ------------------------------------------------------------
-       NOW WE SOLVE ANOTHER LINEAR SYSTEM.
-       ONLY THE SPARSITY PATTERN OF MATRIX A IS THE SAME.
+       2. NOW WE SOLVE ANOTHER LINEAR SYSTEM.
+       	  ONLY THE SPARSITY PATTERN OF MATRIX A IS THE SAME.
        ------------------------------------------------------------*/
     options.Fact = SamePattern;
 
@@ -218,8 +219,9 @@ int main(int argc, char *argv[])
     /* Get the matrix from file, perturbed some diagonal entries to force
        a different perm_r[]. Set up the right-hand side.   */
     if ( !(fp = fopen(*cpp, "r")) ) ABORT("File does not exist");
-    dcreate_matrix_perturbed_postfix(&A, nrhs, &b1, &ldb, &xtrue1, &ldx, fp, postfix, &grid);
-
+    dcreate_matrix_perturbed_postfix(&A, nrhs, &b1, &ldb,
+                                  &xtrue1, &ldx, fp, postfix, &grid);
+			     
     PStatInit(&stat); /* Initialize the statistics variables. */
 
     /* Solve the linear system. */
diff --git a/EXAMPLE/pddrive3.c b/EXAMPLE/pddrive3.c
index f2886945..51c60d9f 100644
--- a/EXAMPLE/pddrive3.c
+++ b/EXAMPLE/pddrive3.c
@@ -35,9 +35,9 @@ at the top-level directory.
  * This example illustrates how to use PDGSSVX to solve
  * systems repeatedly with the same sparsity pattern and similar
  * numerical values of matrix A.
- * In this case, the column permutation vector and symbolic factorization are
- * computed only once. The following data structures will be reused in the
- * subsequent call to PDGSSVX:
+ * In this case, the row and column permutation vectors and symbolic
+ * factorization are computed only once. The following data structures
+ * will be reused in the subsequent call to PDGSSVX:
  *        ScalePermstruct : DiagScale, R, C, perm_r, perm_c
  *        LUstruct        : etree, Glu_persist, Llu
  *
@@ -231,8 +231,9 @@ int main(int argc, char *argv[])
         nzval1[0] += 1.0e-8;
     }
 
-    /* Zero the numerical values in L.  */
+    /* Zero the numerical values in L and U.  */
     dZeroLblocks(iam, n, &grid, &LUstruct);
+    dZeroUblocks(iam, n, &grid, &LUstruct);
 
     dCreate_CompRowLoc_Matrix_dist(&A, m, n, nnz_loc, m_loc, fst_row,
 				   nzval1, colind1, rowptr1,
@@ -251,7 +252,7 @@ int main(int argc, char *argv[])
     PStatPrint(&options, &stat, &grid);
 
     /* ------------------------------------------------------------
-       DEALLOCATE STORAGE.
+       DEALLOCATE ALL STORAGE.
        ------------------------------------------------------------*/
     PStatFree(&stat);
     Destroy_CompRowLoc_Matrix_dist(&A); /* Deallocate storage of matrix A.  */
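pddrive3.c now zeroes both triangular factors before the second factorization: under Fact = SamePattern_SameRowPerm the L and U index structures are reused in place, so stale numerical values from the first factorization must be cleared. The step, as a sketch of the call sequence (dZeroUblocks is the routine this commit adds):

    options.Fact = SamePattern_SameRowPerm;
    dZeroLblocks(iam, n, &grid, &LUstruct);   /* zero numerical values in L */
    dZeroUblocks(iam, n, &grid, &LUstruct);   /* zero numerical values in U */
    pdgssvx(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid,
            &LUstruct, &SOLVEstruct, berr, &stat, &info);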
diff --git a/EXAMPLE/pddrive3d1.c b/EXAMPLE/pddrive3d1.c
index fe45a5e8..bc226e32 100644
--- a/EXAMPLE/pddrive3d1.c
+++ b/EXAMPLE/pddrive3d1.c
@@ -42,8 +42,8 @@ at the top-level directory.
  *                           3D<->2D gather/scatter of {A,B} stored in A3d.
  * 
  * The program may be run by typing:
- *    mpiexec -np <p> pddrive3d -r <proc rows> -c <proc columns> \
- *                    -d <proc Z-dimension>
+ *    mpiexec -np <p> pddrive3d1 -r <proc rows> -c <proc columns> \
+ *                    -d <proc Z-dimension>
  * NOTE: total number of processes p = r * c * d
  * (d must be a power-of-two, e.g., 1, 2, 4, ...)
  *
diff --git a/EXAMPLE/pddrive3d2.c b/EXAMPLE/pddrive3d2.c
new file mode 100644
index 00000000..b39f4189
--- /dev/null
+++ b/EXAMPLE/pddrive3d2.c
@@ -0,0 +1,410 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Driver program for PDGSSVX3D example
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab 
+ * September 10, 2021
+ * </pre>
+ */
+#include "superlu_ddefs.h"  
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * The driver program PDDRIVE3D2.
+ *
+ * This example illustrates how to use PDGSSVX3D to solve
+ * the systems with the same sparsity pattern of matrix A.
+ * In this case, the column permutation vector ScalePermstruct->perm_c is
+ * computed once. The following data structures will be reused in the
+ * subsequent call to PDGSSVX3D:
+ *        ScalePermstruct : perm_c
+ *        LUstruct        : etree
+ *        SOLVEstruct     : communication metadata for SpTRSV, SpMV, and
+ *                          3D<->2D gather/scatter of {A,B} stored in A3d.
+ * 
+ * The program may be run by typing:
+ *    mpiexec -np <p> pddrive3d2 -r <proc rows> -c <proc columns> \
+ *                    -d <proc Z-dimension>
+ * NOTE: total number of processes p = r * c * d
+ * (d must be a power-of-two, e.g., 1, 2, 4, ...)
+ *
+ * </pre>
+ */
+
+static void matCheck(int n, int m, double* A, int LDA,
+                     double* B, int LDB)
+{
+    for(int j=0; j<m; j++)
+        for (int i = 0; i < n; ++i) {
+            assert(A[i + LDA*j] == B[i + LDB*j]);
+        }
+    printf("B check passed\n");
+    return;
+}
+
+static void checkNRFMT(NRformat_loc *A, NRformat_loc *B)
+{
+    assert(A->nnz_loc == B->nnz_loc);
+    assert(A->m_loc == B->m_loc);
+    assert(A->fst_row == B->fst_row);
+
+#if 0
+    double *Aval = (double *)A->nzval, *Bval = (double *)B->nzval;
+    Printdouble5("A", A->nnz_loc, Aval);
+    Printdouble5("B", B->nnz_loc, Bval);
+    fflush(stdout);
+#endif
+
+    double * Aval = (double *) A->nzval;
+    double * Bval = (double *) B->nzval;
+    for (int_t i = 0; i < A->nnz_loc; i++)
+    {
+        assert( Aval[i] == Bval[i] );
+        assert((A->colind)[i] == (B->colind)[i]);
+        printf("colind[] correct\n");
+    }
+
+    for (int_t i = 0; i < A->m_loc + 1; i++)
+    {
+        assert((A->rowptr)[i] == (B->rowptr)[i]);
+    }
+
+    printf("Matrix check passed\n");
+
+}
+
+int
+main (int argc, char *argv[])
+{
+    superlu_dist_options_t options;
+    SuperLUStat_t stat;
+    SuperMatrix A;  // Now, A is on all 3D processes
+    dScalePermstruct_t ScalePermstruct;
+    dLUstruct_t LUstruct;
+    dSOLVEstruct_t SOLVEstruct;
+    gridinfo3d_t grid;
+    double *berr;
+    double *b, *b1, *xtrue, *xtrue1;
+    int m, n, i, j, m_loc;
+    int nprow, npcol, npdep;
+    int iam, info, ldb, ldx, nrhs;
+    char **cpp, c, *suffix;
+    FILE *fp, *fopen ();
+    extern int cpp_defs ();
+    int ii, omp_mpi_level;
+
+    /* prototypes */
+    extern int dcreate_matrix_perturbed
+    (SuperMatrix *, int, double **, int *, double **, int *,
+     FILE *, gridinfo_t *);
+    extern int dcreate_matrix_perturbed_postfix
+    (SuperMatrix *, int, double **, int *, double **, int *,
+     FILE *, char *, gridinfo_t *);
+
+    nprow = 1;  /* Default process rows.      */
+    npcol = 1;  /* Default process columns.   */
+    npdep = 1;  /* replication factor must be power of two */
+    nrhs = 1;   /* Number of right-hand side. */
+
+    /* ------------------------------------------------------------
+       INITIALIZE MPI ENVIRONMENT.
+       ------------------------------------------------------------ */
+    // MPI_Init (&argc, &argv);
+    int required = MPI_THREAD_MULTIPLE;
+    int provided;
+    MPI_Init_thread(&argc, &argv, required, &provided);
+    if (provided < required)
+    {
+        int rank;
+        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+        if (!rank) {
+            printf("The MPI library doesn't provide MPI_THREAD_MULTIPLE \n");
+            printf("\tprovided omp_mpi_level: %d\n", provided);
+        }
+    }
+
+    /* Parse command line argv[]. */
+    for (cpp = argv + 1; *cpp; ++cpp)
+    {
+        if (**cpp == '-')
+        {
+            c = *(*cpp + 1);
+            ++cpp;
+            switch (c)
+            {
+            case 'h':
+                printf ("Options:\n");
+                printf ("\t-r <int>: process rows (default %d)\n", nprow);
+                printf ("\t-c <int>: process columns (default %d)\n", npcol);
+                printf ("\t-d <int>: process Z-dimension (default %d)\n", npdep);
+                exit (0);
+                break;
+            case 'r':
+                nprow = atoi (*cpp);
+                break;
+            case 'c':
+                npcol = atoi (*cpp);
+                break;
+            case 'd':
+                npdep = atoi (*cpp);
+                break;
+            }
+        }
+        else
+        {   /* Last arg is considered a filename */
+            if (!(fp = fopen (*cpp, "r")))
+            {
+                ABORT ("File does not exist");
+            }
+            break;
+        }
+    }
+
+    /* ------------------------------------------------------------
+       INITIALIZE THE SUPERLU PROCESS GRID.
+       ------------------------------------------------------------ */
+    superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid);
+
+    if(grid.iam==0) {
+        MPI_Query_thread(&omp_mpi_level);
+        switch (omp_mpi_level) {
+        case MPI_THREAD_SINGLE:
+            printf("MPI_Query_thread with MPI_THREAD_SINGLE\n");
+            fflush(stdout);
+            break;
+        case MPI_THREAD_FUNNELED:
+            printf("MPI_Query_thread with MPI_THREAD_FUNNELED\n");
+            fflush(stdout);
+            break;
+        case MPI_THREAD_SERIALIZED:
+            printf("MPI_Query_thread with MPI_THREAD_SERIALIZED\n");
+            fflush(stdout);
+            break;
+        case MPI_THREAD_MULTIPLE:
+            printf("MPI_Query_thread with MPI_THREAD_MULTIPLE\n");
+            fflush(stdout);
+            break;
+        }
+    }
+
+    /* Bail out if I do not belong in the grid. */
+    iam = grid.iam;
+    if (iam == -1) goto out;
+    if (!iam) {
+        int v_major, v_minor, v_bugfix;
+#ifdef __INTEL_COMPILER
+        printf("__INTEL_COMPILER is defined\n");
+#endif
+        printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__);
+
+        superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix);
+        printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix);
+
+        printf("Input matrix file:\t%s\n", *cpp);
+        printf("3D process grid: %d X %d X %d\n", nprow, npcol, npdep);
+        //printf("2D Process grid: %d X %d\n", (int)grid.nprow, (int)grid.npcol);
+        fflush(stdout);
+    }
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (iam, "Enter main()");
+#endif
+
+    /* ------------------------------------------------------------
+       GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE.
+       ------------------------------------------------------------ */
+    for (ii = 0; ii < strlen(*cpp); ii++) {
+        if ((*cpp)[ii] == '.') {
+            suffix = &((*cpp)[ii + 1]);
+        }
+    }
+    dcreate_matrix_postfix3d(&A, nrhs, &b, &ldb,
+                             &xtrue, &ldx, fp, suffix, &(grid));
+    fclose(fp);
+
+    if (!(berr = doubleMalloc_dist(nrhs)))
+        ABORT("Malloc fails for berr[].");
+
+    /* ------------------------------------------------------------
+       1. SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME.
+       ------------------------------------------------------------*/
+
+    /* Set the default input options:
+       ...
+       options->num_lookaheads = 10;
+       options->lookahead_etree = NO;
+       options->SymPattern = NO;
+       options.DiagInv = NO;
+     */
+    set_default_options_dist (&options);
+#if 0
+    options.RowPerm = NOROWPERM;
+    options.IterRefine = NOREFINE;
+    options.ColPerm = NATURAL;
+    options.Equil = NO;
+    options.ReplaceTinyPivot = NO;
+#endif
+
+    if (!iam) {
+        print_sp_ienv_dist(&options);
+        print_options_dist(&options);
+        fflush(stdout);
+    }
+
+    // matrix is on 3D process grid
+    m = A.nrow;
+    n = A.ncol;
+
+    /* Initialize ScalePermstruct and LUstruct. */
+    dScalePermstructInit (m, n, &ScalePermstruct);
+    dLUstructInit (n, &LUstruct);
+
+    /* Initialize the statistics variables. */
+    PStatInit (&stat);
+
+    /* Call the linear equation solver. */
+    pdgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
+               &LUstruct, &SOLVEstruct, berr, &stat, &info);
+
+    /* Check the accuracy of the solution. */
+    if ( !iam ) printf("\tSolve the first system:\n");
+    pdinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc,
+                      nrhs, b, ldb, xtrue, ldx, grid.comm);
+
+    /* Deallocate some storage, keep around 2D matrix meta structure */
+    Destroy_CompRowLoc_Matrix_dist (&A);
+    if ( grid.zscp.Iam == 0 ) { // process layer 0
+        PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/
+        /* Deallocate storage associated with the L and U matrices.*/
+        dDestroy_LU(n, &(grid.grid2d), &LUstruct);
+    } else { // Process layers not equal 0
+        dDeAllocLlu_3d(n, &LUstruct, &grid);
+        dDeAllocGlu_3d(&LUstruct);
+    }
+
+    PStatFree(&stat);
+    SUPERLU_FREE(b);     /* Free storage of right-hand side.*/
+    SUPERLU_FREE(xtrue); /* Free storage of the exact solution.*/
+
+    /* ------------------------------------------------------------
+       2. NOW WE SOLVE ANOTHER LINEAR SYSTEM.
+          ONLY THE SPARSITY PATTERN OF MATRIX A IS THE SAME.
+       ------------------------------------------------------------*/
+    options.Fact = SamePattern;
+    /* Get the matrix from file, perturbed some diagonal entries to force
+       a different perm_r[]. Set up the right-hand side.   */
+    if ( !(fp = fopen(*cpp, "r")) ) ABORT("File does not exist");
+    dcreate_matrix_postfix3d(&A, nrhs, &b1, &ldb,
+                             &xtrue1, &ldx, fp, suffix, &(grid));
+
+    PStatInit(&stat); /* Initialize the statistics variables. */
+
+    nrhs = 1;
+    pdgssvx3d (&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid,
+               &LUstruct, &SOLVEstruct, berr, &stat, &info);
+
+    /* Check the accuracy of the solution. */
+    if ( !iam ) printf("Solve the system with the same sparsity pattern.\n");
+    pdinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc,
+                      nrhs, b1, ldb, xtrue1, ldx, grid.comm);
+
+    /* ------------------------------------------------------------
+       DEALLOCATE STORAGE.
+       ------------------------------------------------------------ */
+    Destroy_CompRowLoc_Matrix_dist (&A);
+    if ( grid.zscp.Iam == 0 ) { // process layer 0
+
+        PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/
+
+        dDestroy_LU (n, &(grid.grid2d), &LUstruct);
+        dSolveFinalize (&options, &SOLVEstruct);
+    } else { // Process layers not equal 0
+        dDeAllocLlu_3d(n, &LUstruct, &grid);
+        dDeAllocGlu_3d(&LUstruct);
+    }
+
+    dDestroy_A3d_gathered_on_2d(&SOLVEstruct, &grid); // After all factorization
+
+    dScalePermstructFree (&ScalePermstruct);
+    dLUstructFree (&LUstruct);
+    PStatFree (&stat);
+    SUPERLU_FREE (b1);
+    SUPERLU_FREE (xtrue1);
+    SUPERLU_FREE (berr);
+    fclose(fp);
+
+    /* ------------------------------------------------------------
+       RELEASE THE SUPERLU PROCESS GRID.
+       ------------------------------------------------------------ */
+out:
+    superlu_gridexit3d (&grid);
+
+    /* ------------------------------------------------------------
+       TERMINATES THE MPI EXECUTION ENVIRONMENT.
+       ------------------------------------------------------------ */
+    MPI_Finalize ();
+
+#if ( DEBUGlevel>=1 )
+    CHECK_MALLOC (iam, "Exit main()");
+#endif
+
+}
+
+
+int
+cpp_defs ()
+{
+    printf (".. CPP definitions:\n");
+#if ( PRNTlevel>=1 )
+    printf ("\tPRNTlevel = %d\n", PRNTlevel);
+#endif
+#if ( DEBUGlevel>=1 )
+    printf ("\tDEBUGlevel = %d\n", DEBUGlevel);
+#endif
+#if ( PROFlevel>=1 )
+    printf ("\tPROFlevel = %d\n", PROFlevel);
+#endif
+    printf ("....\n");
+    return 0;
+}
diff --git a/EXAMPLE/pddrive3d3.c b/EXAMPLE/pddrive3d3.c
new file mode 100644
index 00000000..69c7894a
--- /dev/null
+++ b/EXAMPLE/pddrive3d3.c
@@ -0,0 +1,416 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Driver program for PDGSSVX3D example
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab 
+ * September 10, 2021
+ * </pre>
+ */
+#include "superlu_ddefs.h"  
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * The driver program PDDRIVE3D3.
+ *
+ * This example illustrates how to use PDGSSVX3D to solve
+ * systems with the same sparsity pattern and similar numerical
+ * values of matrix A.
+ * In this case, the row and column permutation vectors and symbolic
+ * factorization are computed only once. The following data structures
+ * will be reused in the subsequent call to PDGSSVX3D:
+ *        ScalePermstruct : DiagScale, R, C, perm_r, perm_c
+ *        LUstruct        : etree, Glu_persist, Llu
+ *        SOLVEstruct     : communication metadata for SpTRSV, SpMV, and
+ *                          3D<->2D gather/scatter of {A,B} stored in A3d.
+ *
+ * NOTE:
+ * The distributed nonzero structures of L and U remain the same,
+ * although the numerical values are different. So 'Llu' is set up once
+ * in the first call to PDGSSVX3D, and reused in the subsequent call.
+ *
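+ * A condensed sketch of this reuse pattern, taken from main() below
+ * (declarations, matrix input, and error checks omitted):
+ *
+ *     set_default_options_dist(&options);      // options.Fact == DOFACT
+ *     pdgssvx3d(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
+ *               &LUstruct, &SOLVEstruct, berr, &stat, &info);
+ *     options.Fact = SamePattern_SameRowPerm;  // reuse perms and symbolic
+ *     if ( grid.zscp.Iam == 0 ) {              // zero L/U values on layer 0
+ *         dZeroLblocks(iam, n, &(grid.grid2d), &LUstruct);
+ *         dZeroUblocks(iam, n, &(grid.grid2d), &LUstruct);
+ *     }
+ *     pdgssvx3d(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid,
+ *               &LUstruct, &SOLVEstruct, berr, &stat, &info);
+ *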
+ * The program may be run by typing:
+ *    mpiexec -np <p> pddrive3d3 -r <proc rows> -c <proc columns> \
+ *                                   -d <proc Z-dimension> <input_matrix>
+ * NOTE: total number of processes p = r * c * d
+ *       (d must be a power-of-two, e.g., 1, 2, 4, ...)
+ *
+ * </pre>
+ */ + +static void matCheck(int n, int m, double* A, int LDA, + double* B, int LDB) +{ + for(int j=0; jnnz_loc == B->nnz_loc); + assert(A->m_loc == B->m_loc); + assert(A->fst_row == B->fst_row); + +#if 0 + double *Aval = (double *)A->nzval, *Bval = (double *)B->nzval; + Printdouble5("A", A->nnz_loc, Aval); + Printdouble5("B", B->nnz_loc, Bval); + fflush(stdout); +#endif + + double * Aval = (double *) A->nzval; + double * Bval = (double *) B->nzval; + for (int_t i = 0; i < A->nnz_loc; i++) + { + assert( Aval[i] == Bval[i] ); + assert((A->colind)[i] == (B->colind)[i]); + printf("colind[] correct\n"); + } + + for (int_t i = 0; i < A->m_loc + 1; i++) + { + assert((A->rowptr)[i] == (B->rowptr)[i]); + } + + printf("Matrix check passed\n"); + +} + +int +main (int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; // Now, A is on all 3D processes + dScalePermstruct_t ScalePermstruct; + dLUstruct_t LUstruct; + dSOLVEstruct_t SOLVEstruct; + gridinfo3d_t grid; + double *berr; + double *b, *b1, *xtrue, *xtrue1; + int m, n, i, j, m_loc; + int nprow, npcol, npdep; + int iam, info, ldb, ldx, nrhs, ii, omp_mpi_level; + char **cpp, c, *suffix; + FILE *fp, *fopen (); + extern int cpp_defs (); + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + npdep = 1; /* replication factor must be power of two */ + nrhs = 1; /* Number of right-hand side. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. + ------------------------------------------------------------ */ + // MPI_Init (&argc, &argv); + int required = MPI_THREAD_MULTIPLE; + int provided; + MPI_Init_thread(&argc, &argv, required, &provided); + if (provided < required) + { + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (!rank) { + printf("The MPI library doesn't provide MPI_THREAD_MULTIPLE \n"); + printf("\tprovided omp_mpi_level: %d\n", provided); + } + } + + /* Parse command line argv[]. */ + for (cpp = argv + 1; *cpp; ++cpp) + { + if (**cpp == '-') + { + c = *(*cpp + 1); + ++cpp; + switch (c) + { + case 'h': + printf ("Options:\n"); + printf ("\t-r : process rows (default %d)\n", nprow); + printf ("\t-c : process columns (default %d)\n", npcol); + printf ("\t-d : process Z-dimension (default %d)\n", npdep); + exit (0); + break; + case 'r': + nprow = atoi (*cpp); + break; + case 'c': + npcol = atoi (*cpp); + break; + case 'd': + npdep = atoi (*cpp); + break; + } + } + else + { /* Last arg is considered a filename */ + if (!(fp = fopen (*cpp, "r"))) + { + ABORT ("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ + superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid); + + if (grid.iam==0) { + MPI_Query_thread(&omp_mpi_level); + switch (omp_mpi_level) { + case MPI_THREAD_SINGLE: + printf("MPI_Query_thread with MPI_THREAD_SINGLE\n"); + fflush(stdout); + break; + case MPI_THREAD_FUNNELED: + printf("MPI_Query_thread with MPI_THREAD_FUNNELED\n"); + fflush(stdout); + break; + case MPI_THREAD_SERIALIZED: + printf("MPI_Query_thread with MPI_THREAD_SERIALIZED\n"); + fflush(stdout); + break; + case MPI_THREAD_MULTIPLE: + printf("MPI_Query_thread with MPI_THREAD_MULTIPLE\n"); + fflush(stdout); + break; + } + } + + /* Bail out if I do not belong in the grid. 
*/ + iam = grid.iam; + if (iam == -1) goto out; + if (!iam) { + int v_major, v_minor, v_bugfix; +#ifdef __INTEL_COMPILER + printf("__INTEL_COMPILER is defined\n"); +#endif + printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__); + + superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix); + printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix); + + printf("Input matrix file:\t%s\n", *cpp); + printf("3D process grid: %d X %d X %d\n", nprow, npcol, npdep); + //printf("2D Process grid: %d X %d\n", (int)grid.nprow, (int)grid.npcol); + fflush(stdout); + } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter main()"); +#endif + + /* ------------------------------------------------------------ + GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. + ------------------------------------------------------------ */ + for (ii = 0; iinum_lookaheads = 10; + options->lookahead_etree = NO; + options->SymPattern = NO; + options.DiagInv = NO; + */ + set_default_options_dist (&options); +#if 0 + options.RowPerm = NOROWPERM; + options.IterRefine = NOREFINE; + options.ColPerm = NATURAL; + options.Equil = NO; + options.ReplaceTinyPivot = NO; +#endif + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + fflush(stdout); + } + + // matrix is on 3D process grid + m = A.nrow; + n = A.ncol; + + /* Initialize ScalePermstruct and LUstruct. */ + dScalePermstructInit (m, n, &ScalePermstruct); + dLUstructInit (n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit (&stat); + + /* Call the linear equation solver. */ + pdgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + /* Check the accuracy of the solution. */ + if ( !iam ) printf("\tSolve the first system:\n"); + pdinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid.comm); + + /* Deallocate some storage, including replicated LU structure along + the Z dimension. keep around 2D matrix meta structure, including + the LU data structure on the host side. */ + Destroy_CompRowLoc_Matrix_dist (&A); + + if ( (grid.zscp).Iam == 0 ) { // process layer 0 + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + } else { // Process layers not equal 0 + dDeAllocLlu_3d(n, &LUstruct, &grid); + dDeAllocGlu_3d(&LUstruct); + } + + PStatFree(&stat); + SUPERLU_FREE(b); /* Free storage of right-hand side.*/ + SUPERLU_FREE(xtrue); /* Free storage of the exact solution.*/ + + /* ------------------------------------------------------------ + 2. NOW WE SOLVE ANOTHER LINEAR SYSTEM. + ONLY THE SPARSITY PATTERN OF MATRIX A IS THE SAME. + ------------------------------------------------------------*/ + options.Fact = SamePattern_SameRowPerm; + + /* Zero the numerical values in L and U. */ + if ( (grid.zscp).Iam == 0 ) { /* on 2D grid-0 */ + dZeroLblocks(iam, n, &(grid.grid2d), &LUstruct); + dZeroUblocks(iam, n, &(grid.grid2d), &LUstruct); + } + + /* Get the matrix from file, perturbed some diagonal entries to force + a different perm_r[]. Set up the right-hand side. */ + if ( !(fp = fopen(*cpp, "r")) ) ABORT("File does not exist"); + dcreate_matrix_postfix3d(&A, nrhs, &b1, &ldb, + &xtrue1, &ldx, fp, suffix, &(grid)); + fclose(fp); + + PStatInit(&stat); /* Initialize the statistics variables. */ + + nrhs = 1; + pdgssvx3d (&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + /* Check the accuracy of the solution. 
*/ + if ( !iam ) printf("Solve a system with the same pattern and similar values.\n"); + pdinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b1, ldb, xtrue1, ldx, grid.comm); + + /* ------------------------------------------------------------ + DEALLOCATE ALL STORAGE. + ------------------------------------------------------------ */ + Destroy_CompRowLoc_Matrix_dist (&A); + if ( grid.zscp.Iam == 0 ) { // process layer 0 + + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + + dDestroy_LU (n, &(grid.grid2d), &LUstruct); + dSolveFinalize (&options, &SOLVEstruct); + } else { // Process layers not equal 0 + dDeAllocLlu_3d(n, &LUstruct, &grid); + dDeAllocGlu_3d(&LUstruct); + } + + dDestroy_A3d_gathered_on_2d(&SOLVEstruct, &grid); + + dScalePermstructFree (&ScalePermstruct); + dLUstructFree (&LUstruct); + PStatFree (&stat); + SUPERLU_FREE (b1); + SUPERLU_FREE (xtrue1); + SUPERLU_FREE (berr); + fclose(fp); + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ +out: + superlu_gridexit3d (&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------ */ + MPI_Finalize (); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit main()"); +#endif + +} + + +int +cpp_defs () +{ + printf (".. CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf ("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf ("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf ("\tPROFlevel = %d\n", PROFlevel); +#endif + printf ("....\n"); + return 0; +} diff --git a/EXAMPLE/psdrive2.c b/EXAMPLE/psdrive2.c index 6b5be490..3db5e0a9 100644 --- a/EXAMPLE/psdrive2.c +++ b/EXAMPLE/psdrive2.c @@ -33,8 +33,8 @@ at the top-level directory. * * The driver program PSDRIVE2. * - * This example illustrates how to use to solve - * systems repeatedly with the same sparsity pattern of matrix A. + * This example illustrates how to use PSGSSVX to solve systems + * repeatedly with the same sparsity pattern of matrix A. * In this case, the column permutation vector ScalePermstruct->perm_c is * computed once. The following data structures will be reused in the * subsequent call to PSGSSVX: @@ -147,7 +147,8 @@ int main(int argc, char *argv[]) GET THE MATRIX FROM FILE AND SETUP THE RIGHT-HAND SIDE. ------------------------------------------------------------*/ screate_matrix_postfix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, postfix, &grid); - + fclose(fp); + if ( !(berr = floatMalloc_dist(nrhs)) ) ABORT("Malloc fails for berr[]."); m = A.nrow; @@ -156,7 +157,7 @@ int main(int argc, char *argv[]) m_loc = Astore->m_loc; /* ------------------------------------------------------------ - WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME. + 1. WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME. ------------------------------------------------------------*/ /* Set the default input options: @@ -197,13 +198,13 @@ int main(int argc, char *argv[]) PStatFree(&stat); Destroy_CompRowLoc_Matrix_dist(&A); /* Deallocate storage of matrix A. */ sDestroy_LU(n, &grid, &LUstruct); /* Deallocate storage associated with - the L and U matrices. */ - SUPERLU_FREE(b); /* Free storage of right-hand side. */ - SUPERLU_FREE(xtrue); /* Free storage of the exact solution. */ + the L and U matrices. */ + SUPERLU_FREE(b); /* Free storage of right-hand side. 
*/ + SUPERLU_FREE(xtrue); /* Free storage of the exact solution.*/ /* ------------------------------------------------------------ - NOW WE SOLVE ANOTHER LINEAR SYSTEM. - ONLY THE SPARSITY PATTERN OF MATRIX A IS THE SAME. + 2. NOW WE SOLVE ANOTHER LINEAR SYSTEM. + ONLY THE SPARSITY PATTERN OF MATRIX A IS THE SAME. ------------------------------------------------------------*/ options.Fact = SamePattern; @@ -218,8 +219,9 @@ int main(int argc, char *argv[]) /* Get the matrix from file, perturbed some diagonal entries to force a different perm_r[]. Set up the right-hand side. */ if ( !(fp = fopen(*cpp, "r")) ) ABORT("File does not exist"); - screate_matrix_perturbed_postfix(&A, nrhs, &b1, &ldb, &xtrue1, &ldx, fp, postfix, &grid); - + screate_matrix_perturbed_postfix(&A, nrhs, &b1, &ldb, + &xtrue1, &ldx, fp, postfix, &grid); + PStatInit(&stat); /* Initialize the statistics variables. */ /* Solve the linear system. */ diff --git a/EXAMPLE/psdrive3.c b/EXAMPLE/psdrive3.c index 33fda121..1c23fab5 100644 --- a/EXAMPLE/psdrive3.c +++ b/EXAMPLE/psdrive3.c @@ -35,9 +35,9 @@ at the top-level directory. * This example illustrates how to use PSGSSVX to solve * systems repeatedly with the same sparsity pattern and similar * numerical values of matrix A. - * In this case, the column permutation vector and symbolic factorization are - * computed only once. The following data structures will be reused in the - * subsequent call to PSGSSVX: + * In this case, the row and column permutation vectors and symbolic + * factorization are computed only once. The following data structures + * will be reused in the subsequent call to PSGSSVX: * ScalePermstruct : DiagScale, R, C, perm_r, perm_c * LUstruct : etree, Glu_persist, Llu * @@ -230,8 +230,9 @@ int main(int argc, char *argv[]) if (iam == 0) { } - /* Zero the numerical values in L. */ + /* Zero the numerical values in L and U. */ sZeroLblocks(iam, n, &grid, &LUstruct); + sZeroUblocks(iam, n, &grid, &LUstruct); sCreate_CompRowLoc_Matrix_dist(&A, m, n, nnz_loc, m_loc, fst_row, nzval1, colind1, rowptr1, @@ -250,7 +251,7 @@ int main(int argc, char *argv[]) PStatPrint(&options, &stat, &grid); /* ------------------------------------------------------------ - DEALLOCATE STORAGE. + DEALLOCATE ALL STORAGE. ------------------------------------------------------------*/ PStatFree(&stat); Destroy_CompRowLoc_Matrix_dist(&A); /* Deallocate storage of matrix A. */ diff --git a/EXAMPLE/psdrive3d1.c b/EXAMPLE/psdrive3d1.c index 57f5df58..869915c3 100644 --- a/EXAMPLE/psdrive3d1.c +++ b/EXAMPLE/psdrive3d1.c @@ -42,8 +42,8 @@ at the top-level directory. * 3D<->2D gather/scatter of {A,B} stored in A3d. * * The program may be run by typing: - * mpiexec -np

<p> psdrive3d -r <proc rows> -c <proc columns> \
- *                                   -d <proc Z-dimension> <input_matrix>
+ *    mpiexec -np <p> psdrive3d1 -r <proc rows> -c <proc columns> \
+ *                                    -d <proc Z-dimension> <input_matrix>
 * NOTE: total number of processes p = r * c * d
 *       (d must be a power-of-two, e.g., 1, 2, 4, ...)
 *
diff --git a/EXAMPLE/psdrive3d2.c b/EXAMPLE/psdrive3d2.c
new file mode 100644
index 00000000..c69e0fa2
--- /dev/null
+++ b/EXAMPLE/psdrive3d2.c
@@ -0,0 +1,410 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+
+/*! @file
+ * \brief Driver program for PSGSSVX3D example
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab 
+ * September 10, 2021
+ * </pre>
+ */
+#include "superlu_sdefs.h"  
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * The driver program PSDRIVE3D2.
+ *
+ * This example illustrates how to use PSGSSVX3D to solve
+ * systems with the same sparsity pattern of matrix A.
+ * In this case, the column permutation vector ScalePermstruct->perm_c is
+ * computed once. The following data structures will be reused in the
+ * subsequent call to PSGSSVX3D:
+ *        ScalePermstruct : perm_c
+ *        LUstruct        : etree
+ *        SOLVEstruct     : communication metadata for SpTRSV, SpMV, and
+ *                          3D<->2D gather/scatter of {A,B} stored in A3d.
+ * 
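+ * A condensed sketch of this reuse pattern, taken from main() below
+ * (declarations, matrix input, and error checks omitted):
+ *
+ *     psgssvx3d(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
+ *               &LUstruct, &SOLVEstruct, berr, &stat, &info);
+ *     if ( grid.zscp.Iam == 0 )                // 2D layer 0 only
+ *         sDestroy_LU(n, &(grid.grid2d), &LUstruct);
+ *     options.Fact = SamePattern;              // keep perm_c and etree
+ *     screate_matrix_postfix3d(&A, nrhs, &b1, &ldb, &xtrue1, &ldx,
+ *                              fp, suffix, &grid);
+ *     psgssvx3d(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid,
+ *               &LUstruct, &SOLVEstruct, berr, &stat, &info);
+ *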
+ * The program may be run by typing:
+ *    mpiexec -np <p> psdrive3d2 -r <proc rows> -c <proc columns> \
+ *                                   -d <proc Z-dimension> <input_matrix>
+ * NOTE: total number of processes p = r * c * d
+ *       (d must be a power-of-two, e.g., 1, 2, 4, ...)
+ *
+ * </pre>
+ */ + +static void matCheck(int n, int m, float* A, int LDA, + float* B, int LDB) +{ + for(int j=0; jnnz_loc == B->nnz_loc); + assert(A->m_loc == B->m_loc); + assert(A->fst_row == B->fst_row); + +#if 0 + double *Aval = (double *)A->nzval, *Bval = (double *)B->nzval; + Printdouble5("A", A->nnz_loc, Aval); + Printdouble5("B", B->nnz_loc, Bval); + fflush(stdout); +#endif + + float * Aval = (float *) A->nzval; + float * Bval = (float *) B->nzval; + for (int_t i = 0; i < A->nnz_loc; i++) + { + assert( Aval[i] == Bval[i] ); + assert((A->colind)[i] == (B->colind)[i]); + printf("colind[] correct\n"); + } + + for (int_t i = 0; i < A->m_loc + 1; i++) + { + assert((A->rowptr)[i] == (B->rowptr)[i]); + } + + printf("Matrix check passed\n"); + +} + +int +main (int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; // Now, A is on all 3D processes + sScalePermstruct_t ScalePermstruct; + sLUstruct_t LUstruct; + sSOLVEstruct_t SOLVEstruct; + gridinfo3d_t grid; + float *berr; + float *b, *b1, *xtrue, *xtrue1; + int m, n, i, j, m_loc; + int nprow, npcol, npdep; + int iam, info, ldb, ldx, nrhs; + char **cpp, c, *suffix; + FILE *fp, *fopen (); + extern int cpp_defs (); + int ii, omp_mpi_level; + + /* prototypes */ + extern int screate_matrix_perturbed + (SuperMatrix *, int, float **, int *, float **, int *, + FILE *, gridinfo_t *); + extern int screate_matrix_perturbed_postfix + (SuperMatrix *, int, float **, int *, float **, int *, + FILE *, char *, gridinfo_t *); + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + npdep = 1; /* replication factor must be power of two */ + nrhs = 1; /* Number of right-hand side. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. + ------------------------------------------------------------ */ + // MPI_Init (&argc, &argv); + int required = MPI_THREAD_MULTIPLE; + int provided; + MPI_Init_thread(&argc, &argv, required, &provided); + if (provided < required) + { + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (!rank) { + printf("The MPI library doesn't provide MPI_THREAD_MULTIPLE \n"); + printf("\tprovided omp_mpi_level: %d\n", provided); + } + } + + /* Parse command line argv[]. */ + for (cpp = argv + 1; *cpp; ++cpp) + { + if (**cpp == '-') + { + c = *(*cpp + 1); + ++cpp; + switch (c) + { + case 'h': + printf ("Options:\n"); + printf ("\t-r : process rows (default %d)\n", nprow); + printf ("\t-c : process columns (default %d)\n", npcol); + printf ("\t-d : process Z-dimension (default %d)\n", npdep); + exit (0); + break; + case 'r': + nprow = atoi (*cpp); + break; + case 'c': + npcol = atoi (*cpp); + break; + case 'd': + npdep = atoi (*cpp); + break; + } + } + else + { /* Last arg is considered a filename */ + if (!(fp = fopen (*cpp, "r"))) + { + ABORT ("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. 
+ ------------------------------------------------------------ */ + superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid); + + if(grid.iam==0) { + MPI_Query_thread(&omp_mpi_level); + switch (omp_mpi_level) { + case MPI_THREAD_SINGLE: + printf("MPI_Query_thread with MPI_THREAD_SINGLE\n"); + fflush(stdout); + break; + case MPI_THREAD_FUNNELED: + printf("MPI_Query_thread with MPI_THREAD_FUNNELED\n"); + fflush(stdout); + break; + case MPI_THREAD_SERIALIZED: + printf("MPI_Query_thread with MPI_THREAD_SERIALIZED\n"); + fflush(stdout); + break; + case MPI_THREAD_MULTIPLE: + printf("MPI_Query_thread with MPI_THREAD_MULTIPLE\n"); + fflush(stdout); + break; + } + } + + /* Bail out if I do not belong in the grid. */ + iam = grid.iam; + if (iam == -1) goto out; + if (!iam) { + int v_major, v_minor, v_bugfix; +#ifdef __INTEL_COMPILER + printf("__INTEL_COMPILER is defined\n"); +#endif + printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__); + + superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix); + printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix); + + printf("Input matrix file:\t%s\n", *cpp); + printf("3D process grid: %d X %d X %d\n", nprow, npcol, npdep); + //printf("2D Process grid: %d X %d\n", (int)grid.nprow, (int)grid.npcol); + fflush(stdout); + } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter main()"); +#endif + + /* ------------------------------------------------------------ + GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. + ------------------------------------------------------------ */ + for (ii = 0; iinum_lookaheads = 10; + options->lookahead_etree = NO; + options->SymPattern = NO; + options.DiagInv = NO; + */ + set_default_options_dist (&options); +#if 0 + options.RowPerm = NOROWPERM; + options.IterRefine = NOREFINE; + options.ColPerm = NATURAL; + options.Equil = NO; + options.ReplaceTinyPivot = NO; +#endif + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + fflush(stdout); + } + + // matrix is on 3D process grid + m = A.nrow; + n = A.ncol; + + /* Initialize ScalePermstruct and LUstruct. */ + sScalePermstructInit (m, n, &ScalePermstruct); + sLUstructInit (n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit (&stat); + + /* Call the linear equation solver. */ + psgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + /* Check the accuracy of the solution. */ + if ( !iam ) printf("\tSolve the first system:\n"); + psinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid.comm); + + /* Deallocate some storage, keep around 2D matrix meta structure */ + Destroy_CompRowLoc_Matrix_dist (&A); + if ( grid.zscp.Iam == 0 ) { // process layer 0 + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + /* Deallocate storage associated with the L and U matrices.*/ + sDestroy_LU(n, &(grid.grid2d), &LUstruct); + } else { // Process layers not equal 0 + sDeAllocLlu_3d(n, &LUstruct, &grid); + sDeAllocGlu_3d(&LUstruct); + } + + PStatFree(&stat); + SUPERLU_FREE(b); /* Free storage of right-hand side.*/ + SUPERLU_FREE(xtrue); /* Free storage of the exact solution.*/ + + /* ------------------------------------------------------------ + 2. NOW WE SOLVE ANOTHER LINEAR SYSTEM. + ONLY THE SPARSITY PATTERN OF MATRIX A IS THE SAME. 
+ ------------------------------------------------------------*/ + options.Fact = SamePattern; + /* Get the matrix from file, perturbed some diagonal entries to force + a different perm_r[]. Set up the right-hand side. */ + if ( !(fp = fopen(*cpp, "r")) ) ABORT("File does not exist"); + screate_matrix_postfix3d(&A, nrhs, &b1, &ldb, + &xtrue1, &ldx, fp, suffix, &(grid)); + + PStatInit(&stat); /* Initialize the statistics variables. */ + + nrhs = 1; + psgssvx3d (&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + /* Check the accuracy of the solution. */ + if ( !iam ) printf("Solve the system with the same sparsity pattern.\n"); + psinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b1, ldb, xtrue1, ldx, grid.comm); + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. + ------------------------------------------------------------ */ + Destroy_CompRowLoc_Matrix_dist (&A); + if ( grid.zscp.Iam == 0 ) { // process layer 0 + + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + + sDestroy_LU (n, &(grid.grid2d), &LUstruct); + sSolveFinalize (&options, &SOLVEstruct); + } else { // Process layers not equal 0 + sDeAllocLlu_3d(n, &LUstruct, &grid); + sDeAllocGlu_3d(&LUstruct); + } + + sDestroy_A3d_gathered_on_2d(&SOLVEstruct, &grid); // After all factorization + + sScalePermstructFree (&ScalePermstruct); + sLUstructFree (&LUstruct); + PStatFree (&stat); + SUPERLU_FREE (b1); + SUPERLU_FREE (xtrue1); + SUPERLU_FREE (berr); + fclose(fp); + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ +out: + superlu_gridexit3d (&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------ */ + MPI_Finalize (); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit main()"); +#endif + +} + + +int +cpp_defs () +{ + printf (".. CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf ("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf ("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf ("\tPROFlevel = %d\n", PROFlevel); +#endif + printf ("....\n"); + return 0; +} diff --git a/EXAMPLE/psdrive3d3.c b/EXAMPLE/psdrive3d3.c new file mode 100644 index 00000000..e2cd45f9 --- /dev/null +++ b/EXAMPLE/psdrive3d3.c @@ -0,0 +1,416 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + + +/*! @file + * \brief Driver program for PSGSSVX3D example + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab 
+ * September 10, 2021
+ * </pre>
+ */
+#include "superlu_sdefs.h"  
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * The driver program PSDRIVE3D3.
+ *
+ * This example illustrates how to use PSGSSVX3D to solve
+ * systems with the same sparsity pattern and similar numerical
+ * values of matrix A.
+ * In this case, the row and column permutation vectors and symbolic
+ * factorization are computed only once. The following data structures
+ * will be reused in the subsequent call to PSGSSVX3D:
+ *        ScalePermstruct : DiagScale, R, C, perm_r, perm_c
+ *        LUstruct        : etree, Glu_persist, Llu
+ *        SOLVEstruct     : communication metadata for SpTRSV, SpMV, and
+ *                          3D<->2D gather/scatter of {A,B} stored in A3d.
+ *
+ * NOTE:
+ * The distributed nonzero structures of L and U remain the same,
+ * although the numerical values are different. So 'Llu' is set up once
+ * in the first call to PSGSSVX3D, and reused in the subsequent call.
+ *
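+ * A condensed sketch of this reuse pattern, taken from main() below
+ * (declarations, matrix input, and error checks omitted):
+ *
+ *     set_default_options_dist(&options);      // options.Fact == DOFACT
+ *     psgssvx3d(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
+ *               &LUstruct, &SOLVEstruct, berr, &stat, &info);
+ *     options.Fact = SamePattern_SameRowPerm;  // reuse perms and symbolic
+ *     if ( grid.zscp.Iam == 0 ) {              // zero L/U values on layer 0
+ *         sZeroLblocks(iam, n, &(grid.grid2d), &LUstruct);
+ *         sZeroUblocks(iam, n, &(grid.grid2d), &LUstruct);
+ *     }
+ *     psgssvx3d(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid,
+ *               &LUstruct, &SOLVEstruct, berr, &stat, &info);
+ *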
+ * The program may be run by typing:
+ *    mpiexec -np <p> psdrive3d3 -r <proc rows> -c <proc columns> \
+ *                                   -d <proc Z-dimension> <input_matrix>
+ * NOTE: total number of processes p = r * c * d
+ *       (d must be a power-of-two, e.g., 1, 2, 4, ...)
+ *
+ * </pre>
+ */ + +static void matCheck(int n, int m, float* A, int LDA, + float* B, int LDB) +{ + for(int j=0; jnnz_loc == B->nnz_loc); + assert(A->m_loc == B->m_loc); + assert(A->fst_row == B->fst_row); + +#if 0 + double *Aval = (double *)A->nzval, *Bval = (double *)B->nzval; + Printdouble5("A", A->nnz_loc, Aval); + Printdouble5("B", B->nnz_loc, Bval); + fflush(stdout); +#endif + + float * Aval = (float *) A->nzval; + float * Bval = (float *) B->nzval; + for (int_t i = 0; i < A->nnz_loc; i++) + { + assert( Aval[i] == Bval[i] ); + assert((A->colind)[i] == (B->colind)[i]); + printf("colind[] correct\n"); + } + + for (int_t i = 0; i < A->m_loc + 1; i++) + { + assert((A->rowptr)[i] == (B->rowptr)[i]); + } + + printf("Matrix check passed\n"); + +} + +int +main (int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; // Now, A is on all 3D processes + sScalePermstruct_t ScalePermstruct; + sLUstruct_t LUstruct; + sSOLVEstruct_t SOLVEstruct; + gridinfo3d_t grid; + float *berr; + float *b, *b1, *xtrue, *xtrue1; + int m, n, i, j, m_loc; + int nprow, npcol, npdep; + int iam, info, ldb, ldx, nrhs, ii, omp_mpi_level; + char **cpp, c, *suffix; + FILE *fp, *fopen (); + extern int cpp_defs (); + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + npdep = 1; /* replication factor must be power of two */ + nrhs = 1; /* Number of right-hand side. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. + ------------------------------------------------------------ */ + // MPI_Init (&argc, &argv); + int required = MPI_THREAD_MULTIPLE; + int provided; + MPI_Init_thread(&argc, &argv, required, &provided); + if (provided < required) + { + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (!rank) { + printf("The MPI library doesn't provide MPI_THREAD_MULTIPLE \n"); + printf("\tprovided omp_mpi_level: %d\n", provided); + } + } + + /* Parse command line argv[]. */ + for (cpp = argv + 1; *cpp; ++cpp) + { + if (**cpp == '-') + { + c = *(*cpp + 1); + ++cpp; + switch (c) + { + case 'h': + printf ("Options:\n"); + printf ("\t-r : process rows (default %d)\n", nprow); + printf ("\t-c : process columns (default %d)\n", npcol); + printf ("\t-d : process Z-dimension (default %d)\n", npdep); + exit (0); + break; + case 'r': + nprow = atoi (*cpp); + break; + case 'c': + npcol = atoi (*cpp); + break; + case 'd': + npdep = atoi (*cpp); + break; + } + } + else + { /* Last arg is considered a filename */ + if (!(fp = fopen (*cpp, "r"))) + { + ABORT ("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ + superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid); + + if (grid.iam==0) { + MPI_Query_thread(&omp_mpi_level); + switch (omp_mpi_level) { + case MPI_THREAD_SINGLE: + printf("MPI_Query_thread with MPI_THREAD_SINGLE\n"); + fflush(stdout); + break; + case MPI_THREAD_FUNNELED: + printf("MPI_Query_thread with MPI_THREAD_FUNNELED\n"); + fflush(stdout); + break; + case MPI_THREAD_SERIALIZED: + printf("MPI_Query_thread with MPI_THREAD_SERIALIZED\n"); + fflush(stdout); + break; + case MPI_THREAD_MULTIPLE: + printf("MPI_Query_thread with MPI_THREAD_MULTIPLE\n"); + fflush(stdout); + break; + } + } + + /* Bail out if I do not belong in the grid. 
*/ + iam = grid.iam; + if (iam == -1) goto out; + if (!iam) { + int v_major, v_minor, v_bugfix; +#ifdef __INTEL_COMPILER + printf("__INTEL_COMPILER is defined\n"); +#endif + printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__); + + superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix); + printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix); + + printf("Input matrix file:\t%s\n", *cpp); + printf("3D process grid: %d X %d X %d\n", nprow, npcol, npdep); + //printf("2D Process grid: %d X %d\n", (int)grid.nprow, (int)grid.npcol); + fflush(stdout); + } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter main()"); +#endif + + /* ------------------------------------------------------------ + GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. + ------------------------------------------------------------ */ + for (ii = 0; iinum_lookaheads = 10; + options->lookahead_etree = NO; + options->SymPattern = NO; + options.DiagInv = NO; + */ + set_default_options_dist (&options); +#if 0 + options.RowPerm = NOROWPERM; + options.IterRefine = NOREFINE; + options.ColPerm = NATURAL; + options.Equil = NO; + options.ReplaceTinyPivot = NO; +#endif + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + fflush(stdout); + } + + // matrix is on 3D process grid + m = A.nrow; + n = A.ncol; + + /* Initialize ScalePermstruct and LUstruct. */ + sScalePermstructInit (m, n, &ScalePermstruct); + sLUstructInit (n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit (&stat); + + /* Call the linear equation solver. */ + psgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + /* Check the accuracy of the solution. */ + if ( !iam ) printf("\tSolve the first system:\n"); + psinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid.comm); + + /* Deallocate some storage, including replicated LU structure along + the Z dimension. keep around 2D matrix meta structure, including + the LU data structure on the host side. */ + Destroy_CompRowLoc_Matrix_dist (&A); + + if ( (grid.zscp).Iam == 0 ) { // process layer 0 + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + } else { // Process layers not equal 0 + sDeAllocLlu_3d(n, &LUstruct, &grid); + sDeAllocGlu_3d(&LUstruct); + } + + PStatFree(&stat); + SUPERLU_FREE(b); /* Free storage of right-hand side.*/ + SUPERLU_FREE(xtrue); /* Free storage of the exact solution.*/ + + /* ------------------------------------------------------------ + 2. NOW WE SOLVE ANOTHER LINEAR SYSTEM. + ONLY THE SPARSITY PATTERN OF MATRIX A IS THE SAME. + ------------------------------------------------------------*/ + options.Fact = SamePattern_SameRowPerm; + + /* Zero the numerical values in L and U. */ + if ( (grid.zscp).Iam == 0 ) { /* on 2D grid-0 */ + sZeroLblocks(iam, n, &(grid.grid2d), &LUstruct); + sZeroUblocks(iam, n, &(grid.grid2d), &LUstruct); + } + + /* Get the matrix from file, perturbed some diagonal entries to force + a different perm_r[]. Set up the right-hand side. */ + if ( !(fp = fopen(*cpp, "r")) ) ABORT("File does not exist"); + screate_matrix_postfix3d(&A, nrhs, &b1, &ldb, + &xtrue1, &ldx, fp, suffix, &(grid)); + fclose(fp); + + PStatInit(&stat); /* Initialize the statistics variables. */ + + nrhs = 1; + psgssvx3d (&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + /* Check the accuracy of the solution. 
*/ + if ( !iam ) printf("Solve a system with the same pattern and similar values.\n"); + psinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b1, ldb, xtrue1, ldx, grid.comm); + + /* ------------------------------------------------------------ + DEALLOCATE ALL STORAGE. + ------------------------------------------------------------ */ + Destroy_CompRowLoc_Matrix_dist (&A); + if ( grid.zscp.Iam == 0 ) { // process layer 0 + + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + + sDestroy_LU (n, &(grid.grid2d), &LUstruct); + sSolveFinalize (&options, &SOLVEstruct); + } else { // Process layers not equal 0 + sDeAllocLlu_3d(n, &LUstruct, &grid); + sDeAllocGlu_3d(&LUstruct); + } + + sDestroy_A3d_gathered_on_2d(&SOLVEstruct, &grid); + + sScalePermstructFree (&ScalePermstruct); + sLUstructFree (&LUstruct); + PStatFree (&stat); + SUPERLU_FREE (b1); + SUPERLU_FREE (xtrue1); + SUPERLU_FREE (berr); + fclose(fp); + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ +out: + superlu_gridexit3d (&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------ */ + MPI_Finalize (); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit main()"); +#endif + +} + + +int +cpp_defs () +{ + printf (".. CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf ("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf ("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf ("\tPROFlevel = %d\n", PROFlevel); +#endif + printf ("....\n"); + return 0; +} diff --git a/EXAMPLE/pzdrive2.c b/EXAMPLE/pzdrive2.c index ce28390e..65665895 100644 --- a/EXAMPLE/pzdrive2.c +++ b/EXAMPLE/pzdrive2.c @@ -32,8 +32,8 @@ at the top-level directory. * * The driver program PZDRIVE2. * - * This example illustrates how to use to solve - * systems repeatedly with the same sparsity pattern of matrix A. + * This example illustrates how to use PZGSSVX to solve systems + * repeatedly with the same sparsity pattern of matrix A. * In this case, the column permutation vector ScalePermstruct->perm_c is * computed once. The following data structures will be reused in the * subsequent call to PZGSSVX: @@ -146,7 +146,8 @@ int main(int argc, char *argv[]) GET THE MATRIX FROM FILE AND SETUP THE RIGHT-HAND SIDE. ------------------------------------------------------------*/ zcreate_matrix_postfix(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, postfix, &grid); - + fclose(fp); + if ( !(berr = doubleMalloc_dist(nrhs)) ) ABORT("Malloc fails for berr[]."); m = A.nrow; @@ -155,7 +156,7 @@ int main(int argc, char *argv[]) m_loc = Astore->m_loc; /* ------------------------------------------------------------ - WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME. + 1. WE SOLVE THE LINEAR SYSTEM FOR THE FIRST TIME. ------------------------------------------------------------*/ /* Set the default input options: @@ -196,13 +197,13 @@ int main(int argc, char *argv[]) PStatFree(&stat); Destroy_CompRowLoc_Matrix_dist(&A); /* Deallocate storage of matrix A. */ zDestroy_LU(n, &grid, &LUstruct); /* Deallocate storage associated with - the L and U matrices. */ - SUPERLU_FREE(b); /* Free storage of right-hand side. */ - SUPERLU_FREE(xtrue); /* Free storage of the exact solution. */ + the L and U matrices. */ + SUPERLU_FREE(b); /* Free storage of right-hand side. 
*/ + SUPERLU_FREE(xtrue); /* Free storage of the exact solution.*/ /* ------------------------------------------------------------ - NOW WE SOLVE ANOTHER LINEAR SYSTEM. - ONLY THE SPARSITY PATTERN OF MATRIX A IS THE SAME. + 2. NOW WE SOLVE ANOTHER LINEAR SYSTEM. + ONLY THE SPARSITY PATTERN OF MATRIX A IS THE SAME. ------------------------------------------------------------*/ options.Fact = SamePattern; @@ -217,8 +218,9 @@ int main(int argc, char *argv[]) /* Get the matrix from file, perturbed some diagonal entries to force a different perm_r[]. Set up the right-hand side. */ if ( !(fp = fopen(*cpp, "r")) ) ABORT("File does not exist"); - zcreate_matrix_perturbed_postfix(&A, nrhs, &b1, &ldb, &xtrue1, &ldx, fp, postfix, &grid); - + zcreate_matrix_perturbed_postfix(&A, nrhs, &b1, &ldb, + &xtrue1, &ldx, fp, postfix, &grid); + PStatInit(&stat); /* Initialize the statistics variables. */ /* Solve the linear system. */ diff --git a/EXAMPLE/pzdrive3.c b/EXAMPLE/pzdrive3.c index 47b33b43..d3c44a78 100644 --- a/EXAMPLE/pzdrive3.c +++ b/EXAMPLE/pzdrive3.c @@ -34,9 +34,9 @@ at the top-level directory. * This example illustrates how to use PZGSSVX to solve * systems repeatedly with the same sparsity pattern and similar * numerical values of matrix A. - * In this case, the column permutation vector and symbolic factorization are - * computed only once. The following data structures will be reused in the - * subsequent call to PZGSSVX: + * In this case, the row and column permutation vectors and symbolic + * factorization are computed only once. The following data structures + * will be reused in the subsequent call to PZGSSVX: * ScalePermstruct : DiagScale, R, C, perm_r, perm_c * LUstruct : etree, Glu_persist, Llu * @@ -230,8 +230,9 @@ int main(int argc, char *argv[]) nzval1[0].r += 1.0e-8; nzval1[0].i += 1.0e-8; } - /* Zero the numerical values in L. */ + /* Zero the numerical values in L and U. */ zZeroLblocks(iam, n, &grid, &LUstruct); + zZeroUblocks(iam, n, &grid, &LUstruct); zCreate_CompRowLoc_Matrix_dist(&A, m, n, nnz_loc, m_loc, fst_row, nzval1, colind1, rowptr1, @@ -250,7 +251,7 @@ int main(int argc, char *argv[]) PStatPrint(&options, &stat, &grid); /* ------------------------------------------------------------ - DEALLOCATE STORAGE. + DEALLOCATE ALL STORAGE. ------------------------------------------------------------*/ PStatFree(&stat); Destroy_CompRowLoc_Matrix_dist(&A); /* Deallocate storage of matrix A. */ diff --git a/EXAMPLE/pzdrive3d1.c b/EXAMPLE/pzdrive3d1.c index 4a9f22a6..55702daa 100644 --- a/EXAMPLE/pzdrive3d1.c +++ b/EXAMPLE/pzdrive3d1.c @@ -41,8 +41,8 @@ at the top-level directory. * 3D<->2D gather/scatter of {A,B} stored in A3d. * * The program may be run by typing: - * mpiexec -np

<p> pzdrive3d -r <proc rows> -c <proc columns> \
- *                                   -d <proc Z-dimension> <input_matrix>
+ *    mpiexec -np <p> pzdrive3d1 -r <proc rows> -c <proc columns> \
+ *                                    -d <proc Z-dimension> <input_matrix>
 * NOTE: total number of processes p = r * c * d
 *       (d must be a power-of-two, e.g., 1, 2, 4, ...)
 *
diff --git a/EXAMPLE/pzdrive3d2.c b/EXAMPLE/pzdrive3d2.c
new file mode 100644
index 00000000..a6204d45
--- /dev/null
+++ b/EXAMPLE/pzdrive3d2.c
@@ -0,0 +1,410 @@
+/*! \file
+Copyright (c) 2003, The Regents of the University of California, through
+Lawrence Berkeley National Laboratory (subject to receipt of any required
+approvals from U.S. Dept. of Energy)
+
+All rights reserved.
+
+The source code is distributed under BSD license, see the file License.txt
+at the top-level directory.
+*/
+
+/*! @file
+ * \brief Driver program for PZGSSVX3D example
+ *
+ * <pre>
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab 
+ * September 10, 2021
+ * </pre>
+ */
+#include "superlu_zdefs.h"  
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * The driver program PZDRIVE3D2.
+ *
+ * This example illustrates how to use PZGSSVX3D to solve
+ * systems with the same sparsity pattern of matrix A.
+ * In this case, the column permutation vector ScalePermstruct->perm_c is
+ * computed once. The following data structures will be reused in the
+ * subsequent call to PZGSSVX3D:
+ *        ScalePermstruct : perm_c
+ *        LUstruct        : etree
+ *        SOLVEstruct     : communication metadata for SpTRSV, SpMV, and
+ *                          3D<->2D gather/scatter of {A,B} stored in A3d.
+ * 
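+ * A condensed sketch of this reuse pattern, taken from main() below
+ * (declarations, matrix input, and error checks omitted):
+ *
+ *     pzgssvx3d(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
+ *               &LUstruct, &SOLVEstruct, berr, &stat, &info);
+ *     if ( grid.zscp.Iam == 0 )                // 2D layer 0 only
+ *         zDestroy_LU(n, &(grid.grid2d), &LUstruct);
+ *     options.Fact = SamePattern;              // keep perm_c and etree
+ *     zcreate_matrix_postfix3d(&A, nrhs, &b1, &ldb, &xtrue1, &ldx,
+ *                              fp, suffix, &grid);
+ *     pzgssvx3d(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid,
+ *               &LUstruct, &SOLVEstruct, berr, &stat, &info);
+ *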
+ * The program may be run by typing:
+ *    mpiexec -np <p> pzdrive3d2 -r <proc rows> -c <proc columns> \
+ *                                   -d <proc Z-dimension> <input_matrix>
+ * NOTE: total number of processes p = r * c * d
+ *       (d must be a power-of-two, e.g., 1, 2, 4, ...)
+ *
+ * </pre>
+ */ + +static void matCheck(int n, int m, doublecomplex* A, int LDA, + doublecomplex* B, int LDB) +{ + for(int j=0; jnnz_loc == B->nnz_loc); + assert(A->m_loc == B->m_loc); + assert(A->fst_row == B->fst_row); + +#if 0 + double *Aval = (double *)A->nzval, *Bval = (double *)B->nzval; + Printdouble5("A", A->nnz_loc, Aval); + Printdouble5("B", B->nnz_loc, Bval); + fflush(stdout); +#endif + + doublecomplex * Aval = (doublecomplex *) A->nzval; + doublecomplex * Bval = (doublecomplex *) B->nzval; + for (int_t i = 0; i < A->nnz_loc; i++) + { + assert( (Aval[i].r == Bval[i].r) && (Aval[i].i == Bval[i].i) ); + assert((A->colind)[i] == (B->colind)[i]); + printf("colind[] correct\n"); + } + + for (int_t i = 0; i < A->m_loc + 1; i++) + { + assert((A->rowptr)[i] == (B->rowptr)[i]); + } + + printf("Matrix check passed\n"); + +} + +int +main (int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; // Now, A is on all 3D processes + zScalePermstruct_t ScalePermstruct; + zLUstruct_t LUstruct; + zSOLVEstruct_t SOLVEstruct; + gridinfo3d_t grid; + double *berr; + doublecomplex *b, *b1, *xtrue, *xtrue1; + int m, n, i, j, m_loc; + int nprow, npcol, npdep; + int iam, info, ldb, ldx, nrhs; + char **cpp, c, *suffix; + FILE *fp, *fopen (); + extern int cpp_defs (); + int ii, omp_mpi_level; + + /* prototypes */ + extern int zcreate_matrix_perturbed + (SuperMatrix *, int, doublecomplex **, int *, doublecomplex **, int *, + FILE *, gridinfo_t *); + extern int zcreate_matrix_perturbed_postfix + (SuperMatrix *, int, doublecomplex **, int *, doublecomplex **, int *, + FILE *, char *, gridinfo_t *); + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + npdep = 1; /* replication factor must be power of two */ + nrhs = 1; /* Number of right-hand side. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. + ------------------------------------------------------------ */ + // MPI_Init (&argc, &argv); + int required = MPI_THREAD_MULTIPLE; + int provided; + MPI_Init_thread(&argc, &argv, required, &provided); + if (provided < required) + { + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (!rank) { + printf("The MPI library doesn't provide MPI_THREAD_MULTIPLE \n"); + printf("\tprovided omp_mpi_level: %d\n", provided); + } + } + + /* Parse command line argv[]. */ + for (cpp = argv + 1; *cpp; ++cpp) + { + if (**cpp == '-') + { + c = *(*cpp + 1); + ++cpp; + switch (c) + { + case 'h': + printf ("Options:\n"); + printf ("\t-r : process rows (default %d)\n", nprow); + printf ("\t-c : process columns (default %d)\n", npcol); + printf ("\t-d : process Z-dimension (default %d)\n", npdep); + exit (0); + break; + case 'r': + nprow = atoi (*cpp); + break; + case 'c': + npcol = atoi (*cpp); + break; + case 'd': + npdep = atoi (*cpp); + break; + } + } + else + { /* Last arg is considered a filename */ + if (!(fp = fopen (*cpp, "r"))) + { + ABORT ("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. 
+ ------------------------------------------------------------ */ + superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid); + + if(grid.iam==0) { + MPI_Query_thread(&omp_mpi_level); + switch (omp_mpi_level) { + case MPI_THREAD_SINGLE: + printf("MPI_Query_thread with MPI_THREAD_SINGLE\n"); + fflush(stdout); + break; + case MPI_THREAD_FUNNELED: + printf("MPI_Query_thread with MPI_THREAD_FUNNELED\n"); + fflush(stdout); + break; + case MPI_THREAD_SERIALIZED: + printf("MPI_Query_thread with MPI_THREAD_SERIALIZED\n"); + fflush(stdout); + break; + case MPI_THREAD_MULTIPLE: + printf("MPI_Query_thread with MPI_THREAD_MULTIPLE\n"); + fflush(stdout); + break; + } + } + + /* Bail out if I do not belong in the grid. */ + iam = grid.iam; + if (iam == -1) goto out; + if (!iam) { + int v_major, v_minor, v_bugfix; +#ifdef __INTEL_COMPILER + printf("__INTEL_COMPILER is defined\n"); +#endif + printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__); + + superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix); + printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix); + + printf("Input matrix file:\t%s\n", *cpp); + printf("3D process grid: %d X %d X %d\n", nprow, npcol, npdep); + //printf("2D Process grid: %d X %d\n", (int)grid.nprow, (int)grid.npcol); + fflush(stdout); + } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter main()"); +#endif + + /* ------------------------------------------------------------ + GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. + ------------------------------------------------------------ */ + for (ii = 0; iinum_lookaheads = 10; + options->lookahead_etree = NO; + options->SymPattern = NO; + options.DiagInv = NO; + */ + set_default_options_dist (&options); +#if 0 + options.RowPerm = NOROWPERM; + options.IterRefine = NOREFINE; + options.ColPerm = NATURAL; + options.Equil = NO; + options.ReplaceTinyPivot = NO; +#endif + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + fflush(stdout); + } + + // matrix is on 3D process grid + m = A.nrow; + n = A.ncol; + + /* Initialize ScalePermstruct and LUstruct. */ + zScalePermstructInit (m, n, &ScalePermstruct); + zLUstructInit (n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit (&stat); + + /* Call the linear equation solver. */ + pzgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + /* Check the accuracy of the solution. */ + if ( !iam ) printf("\tSolve the first system:\n"); + pzinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid.comm); + + /* Deallocate some storage, keep around 2D matrix meta structure */ + Destroy_CompRowLoc_Matrix_dist (&A); + if ( grid.zscp.Iam == 0 ) { // process layer 0 + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + /* Deallocate storage associated with the L and U matrices.*/ + zDestroy_LU(n, &(grid.grid2d), &LUstruct); + } else { // Process layers not equal 0 + zDeAllocLlu_3d(n, &LUstruct, &grid); + zDeAllocGlu_3d(&LUstruct); + } + + PStatFree(&stat); + SUPERLU_FREE(b); /* Free storage of right-hand side.*/ + SUPERLU_FREE(xtrue); /* Free storage of the exact solution.*/ + + /* ------------------------------------------------------------ + 2. NOW WE SOLVE ANOTHER LINEAR SYSTEM. + ONLY THE SPARSITY PATTERN OF MATRIX A IS THE SAME. 
+ ------------------------------------------------------------*/ + options.Fact = SamePattern; + /* Get the matrix from file, perturbed some diagonal entries to force + a different perm_r[]. Set up the right-hand side. */ + if ( !(fp = fopen(*cpp, "r")) ) ABORT("File does not exist"); + zcreate_matrix_postfix3d(&A, nrhs, &b1, &ldb, + &xtrue1, &ldx, fp, suffix, &(grid)); + + PStatInit(&stat); /* Initialize the statistics variables. */ + + nrhs = 1; + pzgssvx3d (&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + /* Check the accuracy of the solution. */ + if ( !iam ) printf("Solve the system with the same sparsity pattern.\n"); + pzinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b1, ldb, xtrue1, ldx, grid.comm); + + /* ------------------------------------------------------------ + DEALLOCATE STORAGE. + ------------------------------------------------------------ */ + Destroy_CompRowLoc_Matrix_dist (&A); + if ( grid.zscp.Iam == 0 ) { // process layer 0 + + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + + zDestroy_LU (n, &(grid.grid2d), &LUstruct); + zSolveFinalize (&options, &SOLVEstruct); + } else { // Process layers not equal 0 + zDeAllocLlu_3d(n, &LUstruct, &grid); + zDeAllocGlu_3d(&LUstruct); + } + + zDestroy_A3d_gathered_on_2d(&SOLVEstruct, &grid); // After all factorization + + zScalePermstructFree (&ScalePermstruct); + zLUstructFree (&LUstruct); + PStatFree (&stat); + SUPERLU_FREE (b1); + SUPERLU_FREE (xtrue1); + SUPERLU_FREE (berr); + fclose(fp); + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ +out: + superlu_gridexit3d (&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------ */ + MPI_Finalize (); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit main()"); +#endif + +} + + +int +cpp_defs () +{ + printf (".. CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf ("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf ("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf ("\tPROFlevel = %d\n", PROFlevel); +#endif + printf ("....\n"); + return 0; +} diff --git a/EXAMPLE/pzdrive3d3.c b/EXAMPLE/pzdrive3d3.c new file mode 100644 index 00000000..c3aa53a3 --- /dev/null +++ b/EXAMPLE/pzdrive3d3.c @@ -0,0 +1,416 @@ +/*! \file +Copyright (c) 2003, The Regents of the University of California, through +Lawrence Berkeley National Laboratory (subject to receipt of any required +approvals from U.S. Dept. of Energy) + +All rights reserved. + +The source code is distributed under BSD license, see the file License.txt +at the top-level directory. +*/ + +/*! @file + * \brief Driver program for PZGSSVX3D example + * + *
+ * -- Distributed SuperLU routine (version 7.0) --
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab 
+ * September 10, 2021
+ * </pre>
+ */
+#include "superlu_zdefs.h"  
+
+/*! \brief
+ *
+ * <pre>
+ * Purpose
+ * =======
+ *
+ * The driver program PZDRIVE3D3.
+ *
+ * This example illustrates how to use PZGSSVX3D to solve
+ * systems with the same sparsity pattern and similar numerical
+ * values of matrix A.
+ * In this case, the row and column permutation vectors and symbolic
+ * factorization are computed only once. The following data structures
+ * will be reused in the subsequent call to PZGSSVX3D:
+ *        ScalePermstruct : DiagScale, R, C, perm_r, perm_c
+ *        LUstruct        : etree, Glu_persist, Llu
+ *        SOLVEstruct     : communication metadata for SpTRSV, SpMV, and
+ *                          3D<->2D gather/scatter of {A,B} stored in A3d.
+ *
+ * NOTE:
+ * The distributed nonzero structures of L and U remain the same,
+ * although the numerical values are different. So 'Llu' is set up once
+ * in the first call to PZGSSVX3D, and reused in the subsequent call.
+ *
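+ * A condensed sketch of this reuse pattern, taken from main() below
+ * (declarations, matrix input, and error checks omitted):
+ *
+ *     set_default_options_dist(&options);      // options.Fact == DOFACT
+ *     pzgssvx3d(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
+ *               &LUstruct, &SOLVEstruct, berr, &stat, &info);
+ *     options.Fact = SamePattern_SameRowPerm;  // reuse perms and symbolic
+ *     if ( grid.zscp.Iam == 0 ) {              // zero L/U values on layer 0
+ *         zZeroLblocks(iam, n, &(grid.grid2d), &LUstruct);
+ *         zZeroUblocks(iam, n, &(grid.grid2d), &LUstruct);
+ *     }
+ *     pzgssvx3d(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid,
+ *               &LUstruct, &SOLVEstruct, berr, &stat, &info);
+ *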
+ * The program may be run by typing:
+ *    mpiexec -np <p> pzdrive3d3 -r <proc rows> -c <proc columns> \
+ *                                   -d <proc Z-dimension> <input_matrix>
+ * NOTE: total number of processes p = r * c * d
+ *       (d must be a power-of-two, e.g., 1, 2, 4, ...)
+ *
+ * </pre>
+ */ + +static void matCheck(int n, int m, doublecomplex* A, int LDA, + doublecomplex* B, int LDB) +{ + for(int j=0; jnnz_loc == B->nnz_loc); + assert(A->m_loc == B->m_loc); + assert(A->fst_row == B->fst_row); + +#if 0 + double *Aval = (double *)A->nzval, *Bval = (double *)B->nzval; + Printdouble5("A", A->nnz_loc, Aval); + Printdouble5("B", B->nnz_loc, Bval); + fflush(stdout); +#endif + + doublecomplex * Aval = (doublecomplex *) A->nzval; + doublecomplex * Bval = (doublecomplex *) B->nzval; + for (int_t i = 0; i < A->nnz_loc; i++) + { + assert( (Aval[i].r == Bval[i].r) && (Aval[i].i == Bval[i].i) ); + assert((A->colind)[i] == (B->colind)[i]); + printf("colind[] correct\n"); + } + + for (int_t i = 0; i < A->m_loc + 1; i++) + { + assert((A->rowptr)[i] == (B->rowptr)[i]); + } + + printf("Matrix check passed\n"); + +} + +int +main (int argc, char *argv[]) +{ + superlu_dist_options_t options; + SuperLUStat_t stat; + SuperMatrix A; // Now, A is on all 3D processes + zScalePermstruct_t ScalePermstruct; + zLUstruct_t LUstruct; + zSOLVEstruct_t SOLVEstruct; + gridinfo3d_t grid; + double *berr; + doublecomplex *b, *b1, *xtrue, *xtrue1; + int m, n, i, j, m_loc; + int nprow, npcol, npdep; + int iam, info, ldb, ldx, nrhs, ii, omp_mpi_level; + char **cpp, c, *suffix; + FILE *fp, *fopen (); + extern int cpp_defs (); + + nprow = 1; /* Default process rows. */ + npcol = 1; /* Default process columns. */ + npdep = 1; /* replication factor must be power of two */ + nrhs = 1; /* Number of right-hand side. */ + + /* ------------------------------------------------------------ + INITIALIZE MPI ENVIRONMENT. + ------------------------------------------------------------ */ + // MPI_Init (&argc, &argv); + int required = MPI_THREAD_MULTIPLE; + int provided; + MPI_Init_thread(&argc, &argv, required, &provided); + if (provided < required) + { + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (!rank) { + printf("The MPI library doesn't provide MPI_THREAD_MULTIPLE \n"); + printf("\tprovided omp_mpi_level: %d\n", provided); + } + } + + /* Parse command line argv[]. */ + for (cpp = argv + 1; *cpp; ++cpp) + { + if (**cpp == '-') + { + c = *(*cpp + 1); + ++cpp; + switch (c) + { + case 'h': + printf ("Options:\n"); + printf ("\t-r : process rows (default %d)\n", nprow); + printf ("\t-c : process columns (default %d)\n", npcol); + printf ("\t-d : process Z-dimension (default %d)\n", npdep); + exit (0); + break; + case 'r': + nprow = atoi (*cpp); + break; + case 'c': + npcol = atoi (*cpp); + break; + case 'd': + npdep = atoi (*cpp); + break; + } + } + else + { /* Last arg is considered a filename */ + if (!(fp = fopen (*cpp, "r"))) + { + ABORT ("File does not exist"); + } + break; + } + } + + /* ------------------------------------------------------------ + INITIALIZE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ + superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid); + + if (grid.iam==0) { + MPI_Query_thread(&omp_mpi_level); + switch (omp_mpi_level) { + case MPI_THREAD_SINGLE: + printf("MPI_Query_thread with MPI_THREAD_SINGLE\n"); + fflush(stdout); + break; + case MPI_THREAD_FUNNELED: + printf("MPI_Query_thread with MPI_THREAD_FUNNELED\n"); + fflush(stdout); + break; + case MPI_THREAD_SERIALIZED: + printf("MPI_Query_thread with MPI_THREAD_SERIALIZED\n"); + fflush(stdout); + break; + case MPI_THREAD_MULTIPLE: + printf("MPI_Query_thread with MPI_THREAD_MULTIPLE\n"); + fflush(stdout); + break; + } + } + + /* Bail out if I do not belong in the grid. 
*/ + iam = grid.iam; + if (iam == -1) goto out; + if (!iam) { + int v_major, v_minor, v_bugfix; +#ifdef __INTEL_COMPILER + printf("__INTEL_COMPILER is defined\n"); +#endif + printf("__STDC_VERSION__ %ld\n", __STDC_VERSION__); + + superlu_dist_GetVersionNumber(&v_major, &v_minor, &v_bugfix); + printf("Library version:\t%d.%d.%d\n", v_major, v_minor, v_bugfix); + + printf("Input matrix file:\t%s\n", *cpp); + printf("3D process grid: %d X %d X %d\n", nprow, npcol, npdep); + //printf("2D Process grid: %d X %d\n", (int)grid.nprow, (int)grid.npcol); + fflush(stdout); + } + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Enter main()"); +#endif + + /* ------------------------------------------------------------ + GET THE MATRIX FROM FILE AND SETUP THE RIGHT HAND SIDE. + ------------------------------------------------------------ */ + for (ii = 0; iinum_lookaheads = 10; + options->lookahead_etree = NO; + options->SymPattern = NO; + options.DiagInv = NO; + */ + set_default_options_dist (&options); +#if 0 + options.RowPerm = NOROWPERM; + options.IterRefine = NOREFINE; + options.ColPerm = NATURAL; + options.Equil = NO; + options.ReplaceTinyPivot = NO; +#endif + + if (!iam) { + print_sp_ienv_dist(&options); + print_options_dist(&options); + fflush(stdout); + } + + // matrix is on 3D process grid + m = A.nrow; + n = A.ncol; + + /* Initialize ScalePermstruct and LUstruct. */ + zScalePermstructInit (m, n, &ScalePermstruct); + zLUstructInit (n, &LUstruct); + + /* Initialize the statistics variables. */ + PStatInit (&stat); + + /* Call the linear equation solver. */ + pzgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + /* Check the accuracy of the solution. */ + if ( !iam ) printf("\tSolve the first system:\n"); + pzinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid.comm); + + /* Deallocate some storage, including replicated LU structure along + the Z dimension. keep around 2D matrix meta structure, including + the LU data structure on the host side. */ + Destroy_CompRowLoc_Matrix_dist (&A); + + if ( (grid.zscp).Iam == 0 ) { // process layer 0 + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + } else { // Process layers not equal 0 + zDeAllocLlu_3d(n, &LUstruct, &grid); + zDeAllocGlu_3d(&LUstruct); + } + + PStatFree(&stat); + SUPERLU_FREE(b); /* Free storage of right-hand side.*/ + SUPERLU_FREE(xtrue); /* Free storage of the exact solution.*/ + + /* ------------------------------------------------------------ + 2. NOW WE SOLVE ANOTHER LINEAR SYSTEM. + ONLY THE SPARSITY PATTERN OF MATRIX A IS THE SAME. + ------------------------------------------------------------*/ + options.Fact = SamePattern_SameRowPerm; + + /* Zero the numerical values in L and U. */ + if ( (grid.zscp).Iam == 0 ) { /* on 2D grid-0 */ + zZeroLblocks(iam, n, &(grid.grid2d), &LUstruct); + zZeroUblocks(iam, n, &(grid.grid2d), &LUstruct); + } + + /* Get the matrix from file, perturbed some diagonal entries to force + a different perm_r[]. Set up the right-hand side. */ + if ( !(fp = fopen(*cpp, "r")) ) ABORT("File does not exist"); + zcreate_matrix_postfix3d(&A, nrhs, &b1, &ldb, + &xtrue1, &ldx, fp, suffix, &(grid)); + fclose(fp); + + PStatInit(&stat); /* Initialize the statistics variables. */ + + nrhs = 1; + pzgssvx3d (&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, + &LUstruct, &SOLVEstruct, berr, &stat, &info); + + /* Check the accuracy of the solution. 
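+       pzinf_norm_error() reports the infinity-norm error of the computed solution measured against the reference solution xtrue1 that was generated together with the matrix.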
*/ + if ( !iam ) printf("Solve a system with the same pattern and similar values.\n"); + pzinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b1, ldb, xtrue1, ldx, grid.comm); + + /* ------------------------------------------------------------ + DEALLOCATE ALL STORAGE. + ------------------------------------------------------------ */ + Destroy_CompRowLoc_Matrix_dist (&A); + if ( grid.zscp.Iam == 0 ) { // process layer 0 + + PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ + + zDestroy_LU (n, &(grid.grid2d), &LUstruct); + zSolveFinalize (&options, &SOLVEstruct); + } else { // Process layers not equal 0 + zDeAllocLlu_3d(n, &LUstruct, &grid); + zDeAllocGlu_3d(&LUstruct); + } + + zDestroy_A3d_gathered_on_2d(&SOLVEstruct, &grid); + + zScalePermstructFree (&ScalePermstruct); + zLUstructFree (&LUstruct); + PStatFree (&stat); + SUPERLU_FREE (b1); + SUPERLU_FREE (xtrue1); + SUPERLU_FREE (berr); + fclose(fp); + + /* ------------------------------------------------------------ + RELEASE THE SUPERLU PROCESS GRID. + ------------------------------------------------------------ */ +out: + superlu_gridexit3d (&grid); + + /* ------------------------------------------------------------ + TERMINATES THE MPI EXECUTION ENVIRONMENT. + ------------------------------------------------------------ */ + MPI_Finalize (); + +#if ( DEBUGlevel>=1 ) + CHECK_MALLOC (iam, "Exit main()"); +#endif + +} + + +int +cpp_defs () +{ + printf (".. CPP definitions:\n"); +#if ( PRNTlevel>=1 ) + printf ("\tPRNTlevel = %d\n", PRNTlevel); +#endif +#if ( DEBUGlevel>=1 ) + printf ("\tDEBUGlevel = %d\n", DEBUGlevel); +#endif +#if ( PROFlevel>=1 ) + printf ("\tPROFlevel = %d\n", PROFlevel); +#endif + printf ("....\n"); + return 0; +} diff --git a/SRC/dlustruct_gpu.h b/SRC/dlustruct_gpu.h index e0513d07..23187e66 100644 --- a/SRC/dlustruct_gpu.h +++ b/SRC/dlustruct_gpu.h @@ -71,7 +71,7 @@ typedef struct //SCUbuf_gpu_ } dSCUbuf_gpu_t; - +/* Holds the L & U data structures on the GPU side */ typedef struct //LUstruct_gpu_ { int_t *LrowindVec; /* A single vector */ @@ -135,7 +135,8 @@ typedef struct //LUstruct_gpu_ typedef struct //sluGPU_t_ { int_t gpuId; // if there are multiple GPUs - dLUstruct_gpu_t *A_gpu, *dA_gpu; + dLUstruct_gpu_t *A_gpu; // holds the LU structure on GPU + //*dA_gpu; not used cudaStream_t funCallStreams[MAX_NCUDA_STREAMS], CopyStream; cublasHandle_t cublasHandles[MAX_NCUDA_STREAMS]; int_t lastOffloadStream[MAX_NCUDA_STREAMS]; diff --git a/SRC/dnrformat_loc3d.c b/SRC/dnrformat_loc3d.c index 41544e1f..625df8ea 100644 --- a/SRC/dnrformat_loc3d.c +++ b/SRC/dnrformat_loc3d.c @@ -58,20 +58,13 @@ void dGatherNRformat_loc3d NRformat_loc *A2d; int *row_counts_int; // 32-bit, number of local rows relative to all processes int *row_disp; // displacement + int *nnz_counts_int; // number of local nnz relative to all processes + int *nnz_disp; // displacement int *b_counts_int; // number of local B entries relative to all processes int *b_disp; // including 'nrhs' /********* Gather A2d *********/ - if ( Fact == SamePattern || Fact == SamePattern_SameRowPerm ) { - /* A3d is input. No need to recompute count. - Only need to gather A2d matrix. */ - b_counts_int = A3d->b_counts_int; - b_disp = A3d->b_disp;; - row_counts_int = A3d->row_counts_int; - row_disp = A3d->row_disp; - - if (grid3d->iam==0) printf("TO BE COMPLETED!\n"); - } else if ( Fact != FACTORED ) { + if ( Fact == DOFACT ) { /* Factorize from scratch */ /* A3d is output. 
Compute counts from scratch */ A3d = SUPERLU_MALLOC(sizeof(NRformat_loc3d)); A2d = SUPERLU_MALLOC(sizeof(NRformat_loc)); @@ -129,7 +122,7 @@ void dGatherNRformat_loc3d row_counts_int, row_disp, mpi_int_t, 0, grid3d->zscp.comm); - if (grid3d->zscp.Iam == 0) + if (grid3d->zscp.Iam == 0) /* Set up rowptr[] relative to 2D grid-0 */ { for (int i = 0; i < grid3d->npdep; i++) { @@ -166,20 +159,75 @@ void dGatherNRformat_loc3d } /* end 2D layer grid-0 */ A3d->A_nfmt = A2d; - A3d->b_counts_int = b_counts_int; - A3d->b_disp = b_disp; A3d->row_counts_int = row_counts_int; A3d->row_disp = row_disp; + A3d->nnz_counts_int = nnz_counts_int; + A3d->nnz_disp = nnz_disp; + A3d->b_counts_int = b_counts_int; + A3d->b_disp = b_disp; /* free storage */ SUPERLU_FREE(nnz_counts); - SUPERLU_FREE(nnz_counts_int); SUPERLU_FREE(row_counts); - SUPERLU_FREE(nnz_disp); *A3d_addr = (NRformat_loc3d *) A3d; // return pointer to A3d struct - } /* end else: Factor from scratch */ + } else if ( Fact == SamePattern || Fact == SamePattern_SameRowPerm ) { + /* A3d is input. No need to recompute count. + Only need to gather A2d matrix; the previous 2D matrix + was overwritten by equilibration, perm_r and perm_c. */ + NRformat_loc *A2d = A3d->A_nfmt; + row_counts_int = A3d->row_counts_int; + row_disp = A3d->row_disp; + nnz_counts_int = A3d->nnz_counts_int; + nnz_disp = A3d->nnz_disp; + + MPI_Gatherv(A->nzval, A->nnz_loc, MPI_DOUBLE, A2d->nzval, + nnz_counts_int, nnz_disp, + MPI_DOUBLE, 0, grid3d->zscp.comm); + MPI_Gatherv(A->colind, A->nnz_loc, mpi_int_t, A2d->colind, + nnz_counts_int, nnz_disp, + mpi_int_t, 0, grid3d->zscp.comm); + MPI_Gatherv(&A->rowptr[1], A->m_loc, mpi_int_t, &A2d->rowptr[1], + row_counts_int, row_disp, + mpi_int_t, 0, grid3d->zscp.comm); + + if (grid3d->zscp.Iam == 0) { /* Set up rowptr[] relative to 2D grid-0 */ + A2d->rowptr[0] = 0; + for (int i = 0; i < grid3d->npdep; i++) + { + for (int j = row_disp[i] + 1; j < row_disp[i + 1] + 1; j++) + { + // A2d->rowptr[j] += row_disp[i]; + A2d->rowptr[j] += nnz_disp[i]; + } + } + A2d->nnz_loc = nnz_disp[grid3d->npdep]; + A2d->m_loc = row_disp[grid3d->npdep]; + + if (grid3d->rankorder == 1) { // XY-major + A2d->fst_row = A->fst_row; + } else { // Z-major + gridinfo_t *grid2d = &(grid3d->grid2d); + int procs2d = grid2d->nprow * grid2d->npcol; + int m_loc_2d = A2d->m_loc; + int *m_loc_2d_counts = SUPERLU_MALLOC(procs2d * sizeof(int)); + + MPI_Allgather(&m_loc_2d, 1, MPI_INT, m_loc_2d_counts, 1, + MPI_INT, grid2d->comm); + + int fst_row = 0; + for (int p = 0; p < procs2d; ++p) + { + if (grid2d->iam == p) + A2d->fst_row = fst_row; + fst_row += m_loc_2d_counts[p]; + } + + SUPERLU_FREE(m_loc_2d_counts); + } + } /* end 2D layer grid-0 */ + } /* SamePattern or SamePattern_SameRowPerm */ A3d->m_loc = A->m_loc; A3d->B3d = (double *) B; /* save the pointer to the original B @@ -191,10 +239,10 @@ void dGatherNRformat_loc3d if ( nrhs > 0 ) { A2d = (NRformat_loc *) A3d->A_nfmt; // matrix A gathered on 2D grid-0 - b_counts_int = A3d->b_counts_int; - b_disp = A3d->b_disp;; row_counts_int = A3d->row_counts_int; row_disp = A3d->row_disp; + b_counts_int = A3d->b_counts_int; + b_disp = A3d->b_disp;; /* Btmp <- compact(B), compacting B */ double *Btmp; diff --git a/SRC/dreadMM.c b/SRC/dreadMM.c index b36712cc..8be6ba3c 100644 --- a/SRC/dreadMM.c +++ b/SRC/dreadMM.c @@ -108,7 +108,7 @@ dreadMM_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz, /* 3/ Read n and nnz */ #ifdef _LONGINT - sscanf(line, "%ld%ld%ld",m, n, nonz); + sscanf(line, "%lld%lld%lld", m, n, nonz); #else sscanf(line, 
"%d%d%d",m, n, nonz); #endif diff --git a/SRC/dreadtriple.c b/SRC/dreadtriple.c index 8053e69a..523e2596 100644 --- a/SRC/dreadtriple.c +++ b/SRC/dreadtriple.c @@ -47,7 +47,7 @@ dreadtriple_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz, */ #ifdef _LONGINT - fscanf(fp, "%ld%ld%ld", m, n, nonz); + fscanf(fp, "%lld%lld%lld", m, n, nonz); #else fscanf(fp, "%d%d%d", m, n, nonz); #endif @@ -77,7 +77,7 @@ dreadtriple_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz, for (nnz = 0, nz = 0; nnz < *nonz; ++nnz) { #ifdef _LONGINT - fscanf(fp, "%ld%ld%lf\n", &row[nz], &col[nz], &val[nz]); + fscanf(fp, "%lld%lld%lf\n", &row[nz], &col[nz], &val[nz]); #else // int fscanf(fp, "%d%d%lf\n", &row[nz], &col[nz], &val[nz]); #endif @@ -86,8 +86,9 @@ dreadtriple_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz, if ( row[0] == 0 || col[0] == 0 ) { zero_base = 1; printf("triplet file: row/col indices are zero-based.\n"); - } else + } else { printf("triplet file: row/col indices are one-based.\n"); + } if ( !zero_base ) { /* Change to 0-based indexing. */ diff --git a/SRC/dreadtriple_noheader.c b/SRC/dreadtriple_noheader.c index b5982c22..fce23583 100644 --- a/SRC/dreadtriple_noheader.c +++ b/SRC/dreadtriple_noheader.c @@ -49,7 +49,7 @@ dreadtriple_noheader(FILE *fp, int_t *m, int_t *n, int_t *nonz, nz = *n = 0; #ifdef _LONGINT - ret_val = fscanf(fp, "%ld%ld%lf\n", &i, &j, &vali); + ret_val = fscanf(fp, "%lld%lld%lf\n", &i, &j, &vali); #else // int ret_val = fscanf(fp, "%d%d%lf\n", &i, &j, &vali); #endif @@ -62,7 +62,7 @@ dreadtriple_noheader(FILE *fp, int_t *m, int_t *n, int_t *nonz, ++nz; #ifdef _LONGINT - ret_val = fscanf(fp, "%ld%ld%lf\n", &i, &j, &vali); + ret_val = fscanf(fp, "%lld%lld%lf\n", &i, &j, &vali); #else // int ret_val = fscanf(fp, "%d%d%lf\n", &i, &j, &vali); #endif @@ -105,7 +105,7 @@ dreadtriple_noheader(FILE *fp, int_t *m, int_t *n, int_t *nonz, /* Read into the triplet array from a file */ for (nnz = 0, nz = 0; nnz < *nonz; ++nnz) { #ifdef _LONGINT - fscanf(fp, "%ld%ld%lf\n", &row[nz], &col[nz], &val[nz]); + fscanf(fp, "%lld%lld%lf\n", &row[nz], &col[nz], &val[nz]); #else // int32 fscanf(fp, "%d%d%lf\n", &row[nz], &col[nz], &val[nz]); #endif diff --git a/SRC/dsuperlu_gpu.cu b/SRC/dsuperlu_gpu.cu index fe91bae5..93c72c18 100644 --- a/SRC/dsuperlu_gpu.cu +++ b/SRC/dsuperlu_gpu.cu @@ -31,21 +31,6 @@ // const int incX, double *Y, const int incY); //} -/*error reporting functions */ -//static -cudaError_t checkCuda(cudaError_t result) -{ -#if defined(DEBUG) || defined(_DEBUG) - if (result != cudaSuccess) - { - fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result)); - assert(result == cudaSuccess); - } -#endif - return result; -} - - // cublasStatus_t checkCublas(cublasStatus_t result) // { // #if defined(DEBUG) || defined(_DEBUG) @@ -97,7 +82,7 @@ void device_scatter_l (int_t thread_id, } #endif ///////////// not used -#define THREAD_BLOCK_SIZE 512 /* Sherry: was 192. should be <= MAX_SUPER_SIZE */ +#define THREAD_BLOCK_SIZE 256 /* Sherry: was 192. 
should be <= MAX_SUPER_SIZE */ __device__ inline void ddevice_scatter_l_2D (int thread_id, @@ -127,7 +112,7 @@ void ddevice_scatter_l_2D (int thread_id, } /* Sherry: this routine is not used */ -#if 0 +#if 0 ////////////////////////////////////////////// __global__ void cub_scan_test(void) { @@ -153,7 +138,7 @@ void cub_scan_test(void) printf("%d %d\n", thread_id, IndirectJ2[thread_id]); } -#endif // not used +#endif /////////////////////////////////// not used __device__ inline @@ -900,9 +885,9 @@ void dprintGPUStats(dLUstruct_gpu_t * A_gpu) } /* end printGPUStats */ - +/* Initialize the GPU side of the data structure. */ int dinitSluGPU3D_t( - dsluGPU_t *sluGPU, + dsluGPU_t *sluGPU, // LU structures on GPU, see dlustruct_gpu.h dLUstruct_t *LUstruct, gridinfo3d_t * grid3d, int_t* perm_c_supno, @@ -920,7 +905,7 @@ int dinitSluGPU3D_t( sluGPU->nCudaStreams = getnCudaStreams(); if (grid3d->iam == 0) { - printf("dinitSluGPU3D_t: Using hardware acceleration, with %d cuda streams \n", sluGPU->nCudaStreams); + printf("dinitSluGPU3D_t: Using hardware acceleration, with %d cuda streams, max_buffer_size %d\n", sluGPU->nCudaStreams, (int) buffer_size); fflush(stdout); if ( MAX_SUPER_SIZE < ldt ) { @@ -939,6 +924,9 @@ int dinitSluGPU3D_t( sluGPU->A_gpu = (dLUstruct_gpu_t *) malloc (sizeof(dLUstruct_gpu_t)); sluGPU->A_gpu->perm_c_supno = perm_c_supno; + + /* Allocate GPU memory for the LU data structures, and copy + the host LU structure to GPU side. */ dCopyLUToGPU3D ( isNodeInMyGrid, Llu, /* referred to as A_host */ sluGPU, Glu_persist, n, grid3d, buffer_size, bigu_size, ldt @@ -947,6 +935,7 @@ int dinitSluGPU3D_t( return 0; } /* end dinitSluGPU3D_t */ + int dinitD2Hreduce( int next_k, d2Hreduce_t* d2Hred, int last_flag, HyP_t* HyP, dsluGPU_t *sluGPU, gridinfo_t *grid, dLUstruct_t *LUstruct, SCT_t* SCT @@ -1200,10 +1189,14 @@ int freeSluGPU(dsluGPU_t *sluGPU) } #endif +/* Allocate GPU memory for the LU data structures, and copy + the host LU structure to GPU side. + After factorization, the GPU LU structure should be freed by + calling dfree_LUsstruct_gpu(). */ void dCopyLUToGPU3D ( int_t* isNodeInMyGrid, dLocalLU_t *A_host, /* distributed LU structure on host */ - dsluGPU_t *sluGPU, + dsluGPU_t *sluGPU, /* hold LU structure on GPU */ Glu_persist_t *Glu_persist, int_t n, gridinfo3d_t *grid3d, int_t buffer_size, /* bigV size on GPU for Schur complement update */ diff --git a/SRC/dutil_dist.c b/SRC/dutil_dist.c index a6ccdf83..5c231e7e 100644 --- a/SRC/dutil_dist.c +++ b/SRC/dutil_dist.c @@ -392,6 +392,7 @@ void dScaleAdd_CompRowLoc_Matrix_dist(SuperMatrix *A, SuperMatrix *B, double c) return; } +/**** end utilities added for SUNDIALS ****/ /*! \brief Allocate storage in ScalePermstruct */ void dScalePermstructInit(const int_t m, const int_t n, @@ -437,6 +438,7 @@ int dAllocGlu_3d(int_t n, int_t nsupers, dLUstruct_t * LUstruct) } // Sherry added +/* Free the replicated data on 3D process layer that is not grid-0 */ int dDeAllocGlu_3d(dLUstruct_t * LUstruct) { SUPERLU_FREE(LUstruct->Glu_persist->xsup); @@ -444,6 +446,7 @@ int dDeAllocGlu_3d(dLUstruct_t * LUstruct) return 0; } +/* Free the replicated data on 3D process layer that is not grid-0 */ int dDeAllocLlu_3d(int_t n, dLUstruct_t * LUstruct, gridinfo3d_t* grid3d) { int i, nbc, nbr, nsupers; @@ -613,7 +616,7 @@ void dPrintLblocks(int iam, int_t nsupers, gridinfo_t *grid, /*! \brief Sets all entries of matrix L to zero. 
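 *         Used together with dZeroUblocks() when refactorizing with options->Fact = SamePattern_SameRowPerm: the numerical values of L are reset while the symbolic structure is kept.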
*/ -void dZeroLblocks(int iam, int_t n, gridinfo_t *grid, dLUstruct_t *LUstruct) +void dZeroLblocks(int iam, int n, gridinfo_t *grid, dLUstruct_t *LUstruct) { double zero = 0.0; register int extra, gb, j, lb, nsupc, nsupr, ncb; @@ -643,7 +646,7 @@ void dZeroLblocks(int iam, int_t n, gridinfo_t *grid, dLUstruct_t *LUstruct) } } } -} /* dZeroLblocks */ +} /* end dZeroLblocks */ /*! \brief Dump the factored matrix L using matlab triple-let format @@ -742,7 +745,6 @@ void dDumpLblocks(int iam, int_t nsupers, gridinfo_t *grid, } /* dDumpLblocks */ - /*! \brief Print the blocks in the factored matrix U. */ void dPrintUblocks(int iam, int_t nsupers, gridinfo_t *grid, @@ -782,7 +784,37 @@ void dPrintUblocks(int iam, int_t nsupers, gridinfo_t *grid, printf("[%d] ToSendD[] %d\n", iam, Llu->ToSendD[lb]); } } -} /* DPRINTUBLOCKS */ +} /* end dPrintUlocks */ + +/*! \brief Sets all entries of matrix U to zero. + */ +void dZeroUblocks(int iam, int n, gridinfo_t *grid, dLUstruct_t *LUstruct) +{ + double zero = 0.0; + register int i, extra, lb, len, nrb; + register int myrow, r; + dLocalLU_t *Llu = LUstruct->Llu; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + int_t *xsup = Glu_persist->xsup; + int_t *index; + double *nzval; + int nsupers = Glu_persist->supno[n-1] + 1; + + nrb = nsupers / grid->nprow; + extra = nsupers % grid->nprow; + myrow = MYROW( iam, grid ); + if ( myrow < extra ) ++nrb; + for (lb = 0; lb < nrb; ++lb) { + index = Llu->Ufstnz_br_ptr[lb]; + if ( index ) { /* Not an empty row */ + nzval = Llu->Unzval_br_ptr[lb]; + len = index[1]; // number of entries in nzval[]; + for (i = 0; i < len; ++i) { + nzval[i] = zero; + } + } + } +} /* end dZeroUlocks */ int dprint_gsmv_comm(FILE *fp, int_t m_loc, pdgsmv_comm_t *gsmv_comm, diff --git a/SRC/pdgssvx3d.c b/SRC/pdgssvx3d.c index 9a140a36..d2bcb4a7 100644 --- a/SRC/pdgssvx3d.c +++ b/SRC/pdgssvx3d.c @@ -343,7 +343,7 @@ at the top-level directory. * NOTE: all options must be indentical on all processes when * calling this routine. * - * A (input/output) SuperMatrix* (local); A resides only on process layer 0. + * A (input) SuperMatrix* (local); A resides on all 3D processes. * On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol). * The number of linear equations is A->nrow. The type of A must be: * Stype = SLU_NR_loc; Dtype = SLU_D; Mtype = SLU_GE. @@ -351,11 +351,13 @@ at the top-level directory. * See supermatrix.h for the definition of 'SuperMatrix'. * This routine only handles square A, however, the LU factorization * routine PDGSTRF can factorize rectangular matrices. - * On exit, A may be overwtirren by diag(R)*A*diag(C)*Pc^T, + * + * Internally, A is gathered on 2D processs grid-0, call it A2d. + * On exit, A2d may be overwtirren by diag(R)*A*diag(C)*Pc^T, * depending on ScalePermstruct->DiagScale and options->ColPerm: - * if ScalePermstruct->DiagScale != NOEQUIL, A is overwritten by + * if ScalePermstruct->DiagScale != NOEQUIL, A2d is overwritten by * diag(R)*A*diag(C). - * if options->ColPerm != NATURAL, A is further overwritten by + * if options->ColPerm != NATURAL, A2d is further overwritten by * diag(R)*A*diag(C)*Pc^T. * If all the above condition are true, the LU decomposition is * performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T. 
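To make the overwriting described above concrete, here is a minimal sketch of the diag(R)*A*diag(C) row/column scaling on one process's local NRformat_loc block, assuming R and C are replicated locally. The NRformat_loc fields used (m_loc, fst_row, rowptr, colind, nzval) are the real structure members; applyRowColScaling() itself is a hypothetical helper written for illustration, not a routine in this patch.

/* Hypothetical helper (illustration only): scale a local CSR block in
   place so that nzval[] holds diag(R)*A*diag(C).  R is indexed by the
   global row number, C by the global column number; both are assumed
   replicated on this process for simplicity. */
static void applyRowColScaling(NRformat_loc *A, const double *R, const double *C)
{
    double *val = (double *) A->nzval;
    for (int_t i = 0; i < A->m_loc; ++i) {        /* loop over local rows */
        int_t grow = A->fst_row + i;              /* global row index */
        for (int_t k = A->rowptr[i]; k < A->rowptr[i+1]; ++k)
            val[k] *= R[grow] * C[A->colind[k]];  /* r_i * a_ij * c_j */
    }
}
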
diff --git a/SRC/pdutil.c b/SRC/pdutil.c index 13465637..3a5fcc28 100644 --- a/SRC/pdutil.c +++ b/SRC/pdutil.c @@ -805,10 +805,12 @@ void dDestroy_A3d_gathered_on_2d(dSOLVEstruct_t *SOLVEstruct, gridinfo3d_t *grid SUPERLU_FREE( A2d->colind ); SUPERLU_FREE( A2d->nzval ); } - SUPERLU_FREE(A3d->b_counts_int); // free displacement counts - SUPERLU_FREE(A3d->b_disp); - SUPERLU_FREE(A3d->row_counts_int); + SUPERLU_FREE(A3d->row_counts_int); // free displacements and counts SUPERLU_FREE(A3d->row_disp); + SUPERLU_FREE(A3d->nnz_counts_int); + SUPERLU_FREE(A3d->nnz_disp); + SUPERLU_FREE(A3d->b_counts_int); + SUPERLU_FREE(A3d->b_disp); SUPERLU_FREE( A2d ); // free 2D structure SUPERLU_FREE( A3d ); // free 3D structure } /* dDestroy_A3d_gathered_on_2d */ diff --git a/SRC/psgssvx3d.c b/SRC/psgssvx3d.c index 6548c470..caef481b 100644 --- a/SRC/psgssvx3d.c +++ b/SRC/psgssvx3d.c @@ -343,19 +343,21 @@ at the top-level directory. * NOTE: all options must be indentical on all processes when * calling this routine. * - * A (input/output) SuperMatrix* (local); A resides only on process layer 0. + * A (input) SuperMatrix* (local); A resides on all 3D processes. * On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol). * The number of linear equations is A->nrow. The type of A must be: * Stype = SLU_NR_loc; Dtype = SLU_S; Mtype = SLU_GE. * That is, A is stored in distributed compressed row format. * See supermatrix.h for the definition of 'SuperMatrix'. * This routine only handles square A, however, the LU factorization - * routine PDGSTRF can factorize rectangular matrices. - * On exit, A may be overwtirren by diag(R)*A*diag(C)*Pc^T, + * routine PSGSTRF can factorize rectangular matrices. + * + * Internally, A is gathered on 2D processs grid-0, call it A2d. + * On exit, A2d may be overwtirren by diag(R)*A*diag(C)*Pc^T, * depending on ScalePermstruct->DiagScale and options->ColPerm: - * if ScalePermstruct->DiagScale != NOEQUIL, A is overwritten by + * if ScalePermstruct->DiagScale != NOEQUIL, A2d is overwritten by * diag(R)*A*diag(C). - * if options->ColPerm != NATURAL, A is further overwritten by + * if options->ColPerm != NATURAL, A2d is further overwritten by * diag(R)*A*diag(C)*Pc^T. * If all the above condition are true, the LU decomposition is * performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T. diff --git a/SRC/psutil.c b/SRC/psutil.c index edb66875..25e0faa2 100644 --- a/SRC/psutil.c +++ b/SRC/psutil.c @@ -805,10 +805,12 @@ void sDestroy_A3d_gathered_on_2d(sSOLVEstruct_t *SOLVEstruct, gridinfo3d_t *grid SUPERLU_FREE( A2d->colind ); SUPERLU_FREE( A2d->nzval ); } - SUPERLU_FREE(A3d->b_counts_int); // free displacement counts - SUPERLU_FREE(A3d->b_disp); - SUPERLU_FREE(A3d->row_counts_int); + SUPERLU_FREE(A3d->row_counts_int); // free displacements and counts SUPERLU_FREE(A3d->row_disp); + SUPERLU_FREE(A3d->nnz_counts_int); + SUPERLU_FREE(A3d->nnz_disp); + SUPERLU_FREE(A3d->b_counts_int); + SUPERLU_FREE(A3d->b_disp); SUPERLU_FREE( A2d ); // free 2D structure SUPERLU_FREE( A3d ); // free 3D structure } /* sDestroy_A3d_gathered_on_2d */ diff --git a/SRC/pzgssvx3d.c b/SRC/pzgssvx3d.c index 0843881d..05fdf000 100644 --- a/SRC/pzgssvx3d.c +++ b/SRC/pzgssvx3d.c @@ -342,19 +342,21 @@ at the top-level directory. * NOTE: all options must be indentical on all processes when * calling this routine. * - * A (input/output) SuperMatrix* (local); A resides only on process layer 0. + * A (input) SuperMatrix* (local); A resides on all 3D processes. * On entry, matrix A in A*X=B, of dimension (A->nrow, A->ncol). 
* The number of linear equations is A->nrow. The type of A must be: * Stype = SLU_NR_loc; Dtype = SLU_Z; Mtype = SLU_GE. * That is, A is stored in distributed compressed row format. * See supermatrix.h for the definition of 'SuperMatrix'. * This routine only handles square A, however, the LU factorization - * routine PDGSTRF can factorize rectangular matrices. - * On exit, A may be overwtirren by diag(R)*A*diag(C)*Pc^T, + * routine PZGSTRF can factorize rectangular matrices. + * + * Internally, A is gathered on 2D processs grid-0, call it A2d. + * On exit, A2d may be overwtirren by diag(R)*A*diag(C)*Pc^T, * depending on ScalePermstruct->DiagScale and options->ColPerm: - * if ScalePermstruct->DiagScale != NOEQUIL, A is overwritten by + * if ScalePermstruct->DiagScale != NOEQUIL, A2d is overwritten by * diag(R)*A*diag(C). - * if options->ColPerm != NATURAL, A is further overwritten by + * if options->ColPerm != NATURAL, A2d is further overwritten by * diag(R)*A*diag(C)*Pc^T. * If all the above condition are true, the LU decomposition is * performed on the matrix Pc*Pr*diag(R)*A*diag(C)*Pc^T. diff --git a/SRC/pzutil.c b/SRC/pzutil.c index c340c061..c784c497 100644 --- a/SRC/pzutil.c +++ b/SRC/pzutil.c @@ -804,10 +804,12 @@ void zDestroy_A3d_gathered_on_2d(zSOLVEstruct_t *SOLVEstruct, gridinfo3d_t *grid SUPERLU_FREE( A2d->colind ); SUPERLU_FREE( A2d->nzval ); } - SUPERLU_FREE(A3d->b_counts_int); // free displacement counts - SUPERLU_FREE(A3d->b_disp); - SUPERLU_FREE(A3d->row_counts_int); + SUPERLU_FREE(A3d->row_counts_int); // free displacements and counts SUPERLU_FREE(A3d->row_disp); + SUPERLU_FREE(A3d->nnz_counts_int); + SUPERLU_FREE(A3d->nnz_disp); + SUPERLU_FREE(A3d->b_counts_int); + SUPERLU_FREE(A3d->b_disp); SUPERLU_FREE( A2d ); // free 2D structure SUPERLU_FREE( A3d ); // free 3D structure } /* zDestroy_A3d_gathered_on_2d */ diff --git a/SRC/slustruct_gpu.h b/SRC/slustruct_gpu.h index 9ab3983a..48038dbf 100644 --- a/SRC/slustruct_gpu.h +++ b/SRC/slustruct_gpu.h @@ -71,7 +71,7 @@ typedef struct //SCUbuf_gpu_ } sSCUbuf_gpu_t; - +/* Holds the L & U data structures on the GPU side */ typedef struct //LUstruct_gpu_ { int_t *LrowindVec; /* A single vector */ @@ -135,7 +135,8 @@ typedef struct //LUstruct_gpu_ typedef struct //sluGPU_t_ { int_t gpuId; // if there are multiple GPUs - sLUstruct_gpu_t *A_gpu, *dA_gpu; + sLUstruct_gpu_t *A_gpu; // holds the LU structure on GPU + //*dA_gpu; not used cudaStream_t funCallStreams[MAX_NCUDA_STREAMS], CopyStream; cublasHandle_t cublasHandles[MAX_NCUDA_STREAMS]; int_t lastOffloadStream[MAX_NCUDA_STREAMS]; diff --git a/SRC/snrformat_loc3d.c b/SRC/snrformat_loc3d.c index 99e37c06..7e932f77 100644 --- a/SRC/snrformat_loc3d.c +++ b/SRC/snrformat_loc3d.c @@ -58,20 +58,13 @@ void sGatherNRformat_loc3d NRformat_loc *A2d; int *row_counts_int; // 32-bit, number of local rows relative to all processes int *row_disp; // displacement + int *nnz_counts_int; // number of local nnz relative to all processes + int *nnz_disp; // displacement int *b_counts_int; // number of local B entries relative to all processes int *b_disp; // including 'nrhs' /********* Gather A2d *********/ - if ( Fact == SamePattern || Fact == SamePattern_SameRowPerm ) { - /* A3d is input. No need to recompute count. - Only need to gather A2d matrix. 
*/ - b_counts_int = A3d->b_counts_int; - b_disp = A3d->b_disp;; - row_counts_int = A3d->row_counts_int; - row_disp = A3d->row_disp; - - if (grid3d->iam==0) printf("TO BE COMPLETED!\n"); - } else if ( Fact != FACTORED ) { + if ( Fact == DOFACT ) { /* Factorize from scratch */ /* A3d is output. Compute counts from scratch */ A3d = SUPERLU_MALLOC(sizeof(NRformat_loc3d)); A2d = SUPERLU_MALLOC(sizeof(NRformat_loc)); @@ -129,7 +122,7 @@ void sGatherNRformat_loc3d row_counts_int, row_disp, mpi_int_t, 0, grid3d->zscp.comm); - if (grid3d->zscp.Iam == 0) + if (grid3d->zscp.Iam == 0) /* Set up rowptr[] relative to 2D grid-0 */ { for (int i = 0; i < grid3d->npdep; i++) { @@ -166,20 +159,75 @@ void sGatherNRformat_loc3d } /* end 2D layer grid-0 */ A3d->A_nfmt = A2d; - A3d->b_counts_int = b_counts_int; - A3d->b_disp = b_disp; A3d->row_counts_int = row_counts_int; A3d->row_disp = row_disp; + A3d->nnz_counts_int = nnz_counts_int; + A3d->nnz_disp = nnz_disp; + A3d->b_counts_int = b_counts_int; + A3d->b_disp = b_disp; /* free storage */ SUPERLU_FREE(nnz_counts); - SUPERLU_FREE(nnz_counts_int); SUPERLU_FREE(row_counts); - SUPERLU_FREE(nnz_disp); *A3d_addr = (NRformat_loc3d *) A3d; // return pointer to A3d struct - } /* end else: Factor from scratch */ + } else if ( Fact == SamePattern || Fact == SamePattern_SameRowPerm ) { + /* A3d is input. No need to recompute count. + Only need to gather A2d matrix; the previous 2D matrix + was overwritten by equilibration, perm_r and perm_c. */ + NRformat_loc *A2d = A3d->A_nfmt; + row_counts_int = A3d->row_counts_int; + row_disp = A3d->row_disp; + nnz_counts_int = A3d->nnz_counts_int; + nnz_disp = A3d->nnz_disp; + + MPI_Gatherv(A->nzval, A->nnz_loc, MPI_FLOAT, A2d->nzval, + nnz_counts_int, nnz_disp, + MPI_FLOAT, 0, grid3d->zscp.comm); + MPI_Gatherv(A->colind, A->nnz_loc, mpi_int_t, A2d->colind, + nnz_counts_int, nnz_disp, + mpi_int_t, 0, grid3d->zscp.comm); + MPI_Gatherv(&A->rowptr[1], A->m_loc, mpi_int_t, &A2d->rowptr[1], + row_counts_int, row_disp, + mpi_int_t, 0, grid3d->zscp.comm); + + if (grid3d->zscp.Iam == 0) { /* Set up rowptr[] relative to 2D grid-0 */ + A2d->rowptr[0] = 0; + for (int i = 0; i < grid3d->npdep; i++) + { + for (int j = row_disp[i] + 1; j < row_disp[i + 1] + 1; j++) + { + // A2d->rowptr[j] += row_disp[i]; + A2d->rowptr[j] += nnz_disp[i]; + } + } + A2d->nnz_loc = nnz_disp[grid3d->npdep]; + A2d->m_loc = row_disp[grid3d->npdep]; + + if (grid3d->rankorder == 1) { // XY-major + A2d->fst_row = A->fst_row; + } else { // Z-major + gridinfo_t *grid2d = &(grid3d->grid2d); + int procs2d = grid2d->nprow * grid2d->npcol; + int m_loc_2d = A2d->m_loc; + int *m_loc_2d_counts = SUPERLU_MALLOC(procs2d * sizeof(int)); + + MPI_Allgather(&m_loc_2d, 1, MPI_INT, m_loc_2d_counts, 1, + MPI_INT, grid2d->comm); + + int fst_row = 0; + for (int p = 0; p < procs2d; ++p) + { + if (grid2d->iam == p) + A2d->fst_row = fst_row; + fst_row += m_loc_2d_counts[p]; + } + + SUPERLU_FREE(m_loc_2d_counts); + } + } /* end 2D layer grid-0 */ + } /* SamePattern or SamePattern_SameRowPerm */ A3d->m_loc = A->m_loc; A3d->B3d = (float *) B; /* save the pointer to the original B @@ -191,10 +239,10 @@ void sGatherNRformat_loc3d if ( nrhs > 0 ) { A2d = (NRformat_loc *) A3d->A_nfmt; // matrix A gathered on 2D grid-0 - b_counts_int = A3d->b_counts_int; - b_disp = A3d->b_disp;; row_counts_int = A3d->row_counts_int; row_disp = A3d->row_disp; + b_counts_int = A3d->b_counts_int; + b_disp = A3d->b_disp;; /* Btmp <- compact(B), compacting B */ float *Btmp; diff --git a/SRC/sreadMM.c b/SRC/sreadMM.c index 
4b309a98..d2ac8c8e 100644 --- a/SRC/sreadMM.c +++ b/SRC/sreadMM.c @@ -108,7 +108,7 @@ sreadMM_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz, /* 3/ Read n and nnz */ #ifdef _LONGINT - sscanf(line, "%ld%ld%ld",m, n, nonz); + sscanf(line, "%lld%lld%lld", m, n, nonz); #else sscanf(line, "%d%d%d",m, n, nonz); #endif diff --git a/SRC/sreadtriple.c b/SRC/sreadtriple.c index 18ec7ca6..7e10f4ac 100644 --- a/SRC/sreadtriple.c +++ b/SRC/sreadtriple.c @@ -47,7 +47,7 @@ sreadtriple_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz, */ #ifdef _LONGINT - fscanf(fp, "%ld%ld%ld", m, n, nonz); + fscanf(fp, "%lld%lld%lld", m, n, nonz); #else fscanf(fp, "%d%d%d", m, n, nonz); #endif @@ -77,7 +77,7 @@ sreadtriple_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz, for (nnz = 0, nz = 0; nnz < *nonz; ++nnz) { #ifdef _LONGINT - fscanf(fp, "%ld%ld%f\n", &row[nz], &col[nz], &val[nz]); + fscanf(fp, "%lld%lld%f\n", &row[nz], &col[nz], &val[nz]); #else // int fscanf(fp, "%d%d%f\n", &row[nz], &col[nz], &val[nz]); #endif @@ -86,8 +86,9 @@ sreadtriple_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz, if ( row[0] == 0 || col[0] == 0 ) { zero_base = 1; printf("triplet file: row/col indices are zero-based.\n"); - } else + } else { printf("triplet file: row/col indices are one-based.\n"); + } if ( !zero_base ) { /* Change to 0-based indexing. */ diff --git a/SRC/sreadtriple_noheader.c b/SRC/sreadtriple_noheader.c index 91bd58c3..32905984 100644 --- a/SRC/sreadtriple_noheader.c +++ b/SRC/sreadtriple_noheader.c @@ -49,7 +49,7 @@ sreadtriple_noheader(FILE *fp, int_t *m, int_t *n, int_t *nonz, nz = *n = 0; #ifdef _LONGINT - ret_val = fscanf(fp, "%ld%ld%f\n", &i, &j, &vali); + ret_val = fscanf(fp, "%lld%lld%f\n", &i, &j, &vali); #else // int ret_val = fscanf(fp, "%d%d%f\n", &i, &j, &vali); #endif @@ -62,7 +62,7 @@ sreadtriple_noheader(FILE *fp, int_t *m, int_t *n, int_t *nonz, ++nz; #ifdef _LONGINT - ret_val = fscanf(fp, "%ld%ld%f\n", &i, &j, &vali); + ret_val = fscanf(fp, "%lld%lld%f\n", &i, &j, &vali); #else // int ret_val = fscanf(fp, "%d%d%f\n", &i, &j, &vali); #endif @@ -105,7 +105,7 @@ sreadtriple_noheader(FILE *fp, int_t *m, int_t *n, int_t *nonz, /* Read into the triplet array from a file */ for (nnz = 0, nz = 0; nnz < *nonz; ++nnz) { #ifdef _LONGINT - fscanf(fp, "%ld%ld%f\n", &row[nz], &col[nz], &val[nz]); + fscanf(fp, "%lld%lld%f\n", &row[nz], &col[nz], &val[nz]); #else // int32 fscanf(fp, "%d%d%f\n", &row[nz], &col[nz], &val[nz]); #endif diff --git a/SRC/ssuperlu_gpu.cu b/SRC/ssuperlu_gpu.cu index 7b86574f..adb67693 100644 --- a/SRC/ssuperlu_gpu.cu +++ b/SRC/ssuperlu_gpu.cu @@ -885,9 +885,9 @@ void sprintGPUStats(sLUstruct_gpu_t * A_gpu) } /* end printGPUStats */ - +/* Initialize the GPU side of the data structure. */ int sinitSluGPU3D_t( - ssluGPU_t *sluGPU, + ssluGPU_t *sluGPU, // LU structures on GPU, see slustruct_gpu.h sLUstruct_t *LUstruct, gridinfo3d_t * grid3d, int_t* perm_c_supno, @@ -924,6 +924,9 @@ int sinitSluGPU3D_t( sluGPU->A_gpu = (sLUstruct_gpu_t *) malloc (sizeof(sLUstruct_gpu_t)); sluGPU->A_gpu->perm_c_supno = perm_c_supno; + + /* Allocate GPU memory for the LU data structures, and copy + the host LU structure to GPU side. 
*/ sCopyLUToGPU3D ( isNodeInMyGrid, Llu, /* referred to as A_host */ sluGPU, Glu_persist, n, grid3d, buffer_size, bigu_size, ldt @@ -932,6 +935,7 @@ int sinitSluGPU3D_t( return 0; } /* end sinitSluGPU3D_t */ + int sinitD2Hreduce( int next_k, d2Hreduce_t* d2Hred, int last_flag, HyP_t* HyP, ssluGPU_t *sluGPU, gridinfo_t *grid, sLUstruct_t *LUstruct, SCT_t* SCT @@ -1185,10 +1189,14 @@ int freeSluGPU(ssluGPU_t *sluGPU) } #endif +/* Allocate GPU memory for the LU data structures, and copy + the host LU structure to GPU side. + After factorization, the GPU LU structure should be freed by + calling sfree_LUsstruct_gpu(). */ void sCopyLUToGPU3D ( int_t* isNodeInMyGrid, sLocalLU_t *A_host, /* distributed LU structure on host */ - ssluGPU_t *sluGPU, + ssluGPU_t *sluGPU, /* hold LU structure on GPU */ Glu_persist_t *Glu_persist, int_t n, gridinfo3d_t *grid3d, int_t buffer_size, /* bigV size on GPU for Schur complement update */ diff --git a/SRC/superlu_ddefs.h b/SRC/superlu_ddefs.h index 05e3c8d8..10056a7c 100644 --- a/SRC/superlu_ddefs.h +++ b/SRC/superlu_ddefs.h @@ -536,7 +536,8 @@ extern void dCopy_CompRowLoc_Matrix_dist(SuperMatrix *, SuperMatrix *); extern void dZero_CompRowLoc_Matrix_dist(SuperMatrix *); extern void dScaleAddId_CompRowLoc_Matrix_dist(SuperMatrix *, double); extern void dScaleAdd_CompRowLoc_Matrix_dist(SuperMatrix *, SuperMatrix *, double); -extern void dZeroLblocks(int, int_t, gridinfo_t *, dLUstruct_t *); +extern void dZeroLblocks(int, int, gridinfo_t *, dLUstruct_t *); +extern void dZeroUblocks(int iam, int n, gridinfo_t *, dLUstruct_t *); extern void dfill_dist (double *, int_t, double); extern void dinf_norm_error_dist (int_t, int_t, double*, int_t, double*, int_t, gridinfo_t*); diff --git a/SRC/superlu_zdefs.h b/SRC/superlu_zdefs.h index e2e2817e..d5a5cb8f 100644 --- a/SRC/superlu_zdefs.h +++ b/SRC/superlu_zdefs.h @@ -538,7 +538,8 @@ extern void zCopy_CompRowLoc_Matrix_dist(SuperMatrix *, SuperMatrix *); extern void zZero_CompRowLoc_Matrix_dist(SuperMatrix *); extern void zScaleAddId_CompRowLoc_Matrix_dist(SuperMatrix *, doublecomplex); extern void zScaleAdd_CompRowLoc_Matrix_dist(SuperMatrix *, SuperMatrix *, doublecomplex); -extern void zZeroLblocks(int, int_t, gridinfo_t *, zLUstruct_t *); +extern void zZeroLblocks(int, int, gridinfo_t *, zLUstruct_t *); +extern void zZeroUblocks(int iam, int n, gridinfo_t *, zLUstruct_t *); extern void zfill_dist (doublecomplex *, int_t, doublecomplex); extern void zinf_norm_error_dist (int_t, int_t, doublecomplex*, int_t, doublecomplex*, int_t, gridinfo_t*); diff --git a/SRC/supermatrix.h b/SRC/supermatrix.h index e50360b6..9913aa6b 100644 --- a/SRC/supermatrix.h +++ b/SRC/supermatrix.h @@ -192,17 +192,19 @@ typedef struct { Only grid-0 has meanful values of these data structures. 
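   The *_counts_int / *_disp pairs below follow the MPI_Gatherv convention: disp[i] is the prefix sum of counts[0..i-1] over the npdep Z-processes, so disp[npdep] gives the gathered totals (A2d->m_loc and A2d->nnz_loc).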
*/ typedef struct NRformat_loc3d { - NRformat_loc* A_nfmt; // Gathered A matrix on 2D grid-0 - void* B3d; // on the entire 3D process grid + NRformat_loc *A_nfmt; // Gathered A matrix on 2D grid-0 + void *B3d; // on the entire 3D process grid int ldb; // relative to 3D process grid int nrhs; int m_loc; // relative to 3D process grid - void* B2d; // on 2D process layer grid-0 - - int* row_counts_int; // these counts are stored on 2D layer grid-0, - int* row_disp; // but count the number of {A, B} rows along Z-dimension - int* b_counts_int; - int* b_disp; + void *B2d; // on 2D process layer grid-0 + + int *row_counts_int; // these counts are stored on 2D layer grid-0, + int *row_disp; // but count the number of {A, B} rows along Z-dimension + int *nnz_counts_int; + int *nnz_disp; + int *b_counts_int; + int *b_disp; } NRformat_loc3d; diff --git a/SRC/sutil_dist.c b/SRC/sutil_dist.c index c957b30b..4dce22a1 100644 --- a/SRC/sutil_dist.c +++ b/SRC/sutil_dist.c @@ -392,6 +392,7 @@ void sScaleAdd_CompRowLoc_Matrix_dist(SuperMatrix *A, SuperMatrix *B, float c) return; } +/**** end utilities added for SUNDIALS ****/ /*! \brief Allocate storage in ScalePermstruct */ void sScalePermstructInit(const int_t m, const int_t n, @@ -437,6 +438,7 @@ int sAllocGlu_3d(int_t n, int_t nsupers, sLUstruct_t * LUstruct) } // Sherry added +/* Free the replicated data on 3D process layer that is not grid-0 */ int sDeAllocGlu_3d(sLUstruct_t * LUstruct) { SUPERLU_FREE(LUstruct->Glu_persist->xsup); @@ -444,6 +446,7 @@ int sDeAllocGlu_3d(sLUstruct_t * LUstruct) return 0; } +/* Free the replicated data on 3D process layer that is not grid-0 */ int sDeAllocLlu_3d(int_t n, sLUstruct_t * LUstruct, gridinfo3d_t* grid3d) { int i, nbc, nbr, nsupers; @@ -613,7 +616,7 @@ void sPrintLblocks(int iam, int_t nsupers, gridinfo_t *grid, /*! \brief Sets all entries of matrix L to zero. */ -void sZeroLblocks(int iam, int_t n, gridinfo_t *grid, sLUstruct_t *LUstruct) +void sZeroLblocks(int iam, int n, gridinfo_t *grid, sLUstruct_t *LUstruct) { float zero = 0.0; register int extra, gb, j, lb, nsupc, nsupr, ncb; @@ -643,7 +646,7 @@ void sZeroLblocks(int iam, int_t n, gridinfo_t *grid, sLUstruct_t *LUstruct) } } } -} /* sZeroLblocks */ +} /* end sZeroLblocks */ /*! \brief Dump the factored matrix L using matlab triple-let format @@ -742,7 +745,6 @@ void sDumpLblocks(int iam, int_t nsupers, gridinfo_t *grid, } /* sDumpLblocks */ - /*! \brief Print the blocks in the factored matrix U. */ void sPrintUblocks(int iam, int_t nsupers, gridinfo_t *grid, @@ -782,7 +784,37 @@ void sPrintUblocks(int iam, int_t nsupers, gridinfo_t *grid, printf("[%d] ToSendD[] %d\n", iam, Llu->ToSendD[lb]); } } -} /* SPRINTUBLOCKS */ +} /* end sPrintUlocks */ + +/*! \brief Sets all entries of matrix U to zero. 
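+ *         Companion to sZeroLblocks(); only the len = index[1] entries actually stored in each nonempty local block row of U are reset.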
+ */ +void sZeroUblocks(int iam, int n, gridinfo_t *grid, sLUstruct_t *LUstruct) +{ + float zero = 0.0; + register int i, extra, lb, len, nrb; + register int myrow, r; + sLocalLU_t *Llu = LUstruct->Llu; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + int_t *xsup = Glu_persist->xsup; + int_t *index; + float *nzval; + int nsupers = Glu_persist->supno[n-1] + 1; + + nrb = nsupers / grid->nprow; + extra = nsupers % grid->nprow; + myrow = MYROW( iam, grid ); + if ( myrow < extra ) ++nrb; + for (lb = 0; lb < nrb; ++lb) { + index = Llu->Ufstnz_br_ptr[lb]; + if ( index ) { /* Not an empty row */ + nzval = Llu->Unzval_br_ptr[lb]; + len = index[1]; // number of entries in nzval[]; + for (i = 0; i < len; ++i) { + nzval[i] = zero; + } + } + } +} /* end sZeroUlocks */ int sprint_gsmv_comm(FILE *fp, int_t m_loc, psgsmv_comm_t *gsmv_comm, diff --git a/SRC/zlustruct_gpu.h b/SRC/zlustruct_gpu.h index 04819221..1064d424 100644 --- a/SRC/zlustruct_gpu.h +++ b/SRC/zlustruct_gpu.h @@ -70,7 +70,7 @@ typedef struct //SCUbuf_gpu_ } zSCUbuf_gpu_t; - +/* Holds the L & U data structures on the GPU side */ typedef struct //LUstruct_gpu_ { int_t *LrowindVec; /* A single vector */ @@ -134,7 +134,8 @@ typedef struct //LUstruct_gpu_ typedef struct //sluGPU_t_ { int_t gpuId; // if there are multiple GPUs - zLUstruct_gpu_t *A_gpu, *dA_gpu; + zLUstruct_gpu_t *A_gpu; // holds the LU structure on GPU + //*dA_gpu; not used cudaStream_t funCallStreams[MAX_NCUDA_STREAMS], CopyStream; cublasHandle_t cublasHandles[MAX_NCUDA_STREAMS]; int_t lastOffloadStream[MAX_NCUDA_STREAMS]; diff --git a/SRC/znrformat_loc3d.c b/SRC/znrformat_loc3d.c index 19535d5e..89de8dd0 100644 --- a/SRC/znrformat_loc3d.c +++ b/SRC/znrformat_loc3d.c @@ -57,20 +57,13 @@ void zGatherNRformat_loc3d NRformat_loc *A2d; int *row_counts_int; // 32-bit, number of local rows relative to all processes int *row_disp; // displacement + int *nnz_counts_int; // number of local nnz relative to all processes + int *nnz_disp; // displacement int *b_counts_int; // number of local B entries relative to all processes int *b_disp; // including 'nrhs' /********* Gather A2d *********/ - if ( Fact == SamePattern || Fact == SamePattern_SameRowPerm ) { - /* A3d is input. No need to recompute count. - Only need to gather A2d matrix. */ - b_counts_int = A3d->b_counts_int; - b_disp = A3d->b_disp;; - row_counts_int = A3d->row_counts_int; - row_disp = A3d->row_disp; - - if (grid3d->iam==0) printf("TO BE COMPLETED!\n"); - } else if ( Fact != FACTORED ) { + if ( Fact == DOFACT ) { /* Factorize from scratch */ /* A3d is output. 
Compute counts from scratch */ A3d = SUPERLU_MALLOC(sizeof(NRformat_loc3d)); A2d = SUPERLU_MALLOC(sizeof(NRformat_loc)); @@ -128,7 +121,7 @@ void zGatherNRformat_loc3d row_counts_int, row_disp, mpi_int_t, 0, grid3d->zscp.comm); - if (grid3d->zscp.Iam == 0) + if (grid3d->zscp.Iam == 0) /* Set up rowptr[] relative to 2D grid-0 */ { for (int i = 0; i < grid3d->npdep; i++) { @@ -165,20 +158,75 @@ void zGatherNRformat_loc3d } /* end 2D layer grid-0 */ A3d->A_nfmt = A2d; - A3d->b_counts_int = b_counts_int; - A3d->b_disp = b_disp; A3d->row_counts_int = row_counts_int; A3d->row_disp = row_disp; + A3d->nnz_counts_int = nnz_counts_int; + A3d->nnz_disp = nnz_disp; + A3d->b_counts_int = b_counts_int; + A3d->b_disp = b_disp; /* free storage */ SUPERLU_FREE(nnz_counts); - SUPERLU_FREE(nnz_counts_int); SUPERLU_FREE(row_counts); - SUPERLU_FREE(nnz_disp); *A3d_addr = (NRformat_loc3d *) A3d; // return pointer to A3d struct - } /* end else: Factor from scratch */ + } else if ( Fact == SamePattern || Fact == SamePattern_SameRowPerm ) { + /* A3d is input. No need to recompute count. + Only need to gather A2d matrix; the previous 2D matrix + was overwritten by equilibration, perm_r and perm_c. */ + NRformat_loc *A2d = A3d->A_nfmt; + row_counts_int = A3d->row_counts_int; + row_disp = A3d->row_disp; + nnz_counts_int = A3d->nnz_counts_int; + nnz_disp = A3d->nnz_disp; + + MPI_Gatherv(A->nzval, A->nnz_loc, SuperLU_MPI_DOUBLE_COMPLEX, A2d->nzval, + nnz_counts_int, nnz_disp, + SuperLU_MPI_DOUBLE_COMPLEX, 0, grid3d->zscp.comm); + MPI_Gatherv(A->colind, A->nnz_loc, mpi_int_t, A2d->colind, + nnz_counts_int, nnz_disp, + mpi_int_t, 0, grid3d->zscp.comm); + MPI_Gatherv(&A->rowptr[1], A->m_loc, mpi_int_t, &A2d->rowptr[1], + row_counts_int, row_disp, + mpi_int_t, 0, grid3d->zscp.comm); + + if (grid3d->zscp.Iam == 0) { /* Set up rowptr[] relative to 2D grid-0 */ + A2d->rowptr[0] = 0; + for (int i = 0; i < grid3d->npdep; i++) + { + for (int j = row_disp[i] + 1; j < row_disp[i + 1] + 1; j++) + { + // A2d->rowptr[j] += row_disp[i]; + A2d->rowptr[j] += nnz_disp[i]; + } + } + A2d->nnz_loc = nnz_disp[grid3d->npdep]; + A2d->m_loc = row_disp[grid3d->npdep]; + + if (grid3d->rankorder == 1) { // XY-major + A2d->fst_row = A->fst_row; + } else { // Z-major + gridinfo_t *grid2d = &(grid3d->grid2d); + int procs2d = grid2d->nprow * grid2d->npcol; + int m_loc_2d = A2d->m_loc; + int *m_loc_2d_counts = SUPERLU_MALLOC(procs2d * sizeof(int)); + + MPI_Allgather(&m_loc_2d, 1, MPI_INT, m_loc_2d_counts, 1, + MPI_INT, grid2d->comm); + + int fst_row = 0; + for (int p = 0; p < procs2d; ++p) + { + if (grid2d->iam == p) + A2d->fst_row = fst_row; + fst_row += m_loc_2d_counts[p]; + } + + SUPERLU_FREE(m_loc_2d_counts); + } + } /* end 2D layer grid-0 */ + } /* SamePattern or SamePattern_SameRowPerm */ A3d->m_loc = A->m_loc; A3d->B3d = (doublecomplex *) B; /* save the pointer to the original B @@ -190,10 +238,10 @@ void zGatherNRformat_loc3d if ( nrhs > 0 ) { A2d = (NRformat_loc *) A3d->A_nfmt; // matrix A gathered on 2D grid-0 - b_counts_int = A3d->b_counts_int; - b_disp = A3d->b_disp;; row_counts_int = A3d->row_counts_int; row_disp = A3d->row_disp; + b_counts_int = A3d->b_counts_int; + b_disp = A3d->b_disp;; /* Btmp <- compact(B), compacting B */ doublecomplex *Btmp; diff --git a/SRC/zreadMM.c b/SRC/zreadMM.c index 3d0048cf..993de064 100644 --- a/SRC/zreadMM.c +++ b/SRC/zreadMM.c @@ -107,7 +107,7 @@ zreadMM_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz, /* 3/ Read n and nnz */ #ifdef _LONGINT - sscanf(line, "%ld%ld%ld",m, n, nonz); + sscanf(line, 
"%lld%lld%lld", m, n, nonz); #else sscanf(line, "%d%d%d",m, n, nonz); #endif diff --git a/SRC/zreadtriple.c b/SRC/zreadtriple.c index a52eae5f..d8e4c9a2 100644 --- a/SRC/zreadtriple.c +++ b/SRC/zreadtriple.c @@ -46,7 +46,7 @@ zreadtriple_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz, */ #ifdef _LONGINT - fscanf(fp, "%ld%ld%ld", m, n, nonz); + fscanf(fp, "%lld%lld%lld", m, n, nonz); #else fscanf(fp, "%d%d%d", m, n, nonz); #endif @@ -76,7 +76,7 @@ zreadtriple_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz, for (nnz = 0, nz = 0; nnz < *nonz; ++nnz) { #ifdef _LONGINT - fscanf(fp, "%ld%ld%lf%lf\n", &row[nz], &col[nz], &val[nz].r, &val[nz].i); + fscanf(fp, "%lld%lld%lf%lf\n", &row[nz], &col[nz], &val[nz].r, &val[nz].i); #else // int fscanf(fp, "%d%d%lf%lf\n", &row[nz], &col[nz], &val[nz].r, &val[nz].i); #endif @@ -85,8 +85,9 @@ zreadtriple_dist(FILE *fp, int_t *m, int_t *n, int_t *nonz, if ( row[0] == 0 || col[0] == 0 ) { zero_base = 1; printf("triplet file: row/col indices are zero-based.\n"); - } else + } else { printf("triplet file: row/col indices are one-based.\n"); + } if ( !zero_base ) { /* Change to 0-based indexing. */ diff --git a/SRC/zreadtriple_noheader.c b/SRC/zreadtriple_noheader.c index 3ddedb01..8410540b 100644 --- a/SRC/zreadtriple_noheader.c +++ b/SRC/zreadtriple_noheader.c @@ -48,7 +48,7 @@ zreadtriple_noheader(FILE *fp, int_t *m, int_t *n, int_t *nonz, nz = *n = 0; #ifdef _LONGINT - ret_val = fscanf(fp, "%ld%ld%lf%lf\n", &i, &j, &vali.r, &vali.i); + ret_val = fscanf(fp, "%lld%lld%lf%lf\n", &i, &j, &vali.r, &vali.i); #else // int ret_val = fscanf(fp, "%d%d%lf%lf\n", &i, &j, &vali.r, &vali.i); #endif @@ -61,7 +61,7 @@ zreadtriple_noheader(FILE *fp, int_t *m, int_t *n, int_t *nonz, ++nz; #ifdef _LONGINT - ret_val = fscanf(fp, "%ld%ld%lf%lf\n", &i, &j, &vali.r, &vali.i); + ret_val = fscanf(fp, "%lld%lld%lf%lf\n", &i, &j, &vali.r, &vali.i); #else // int ret_val = fscanf(fp, "%d%d%lf%lf\n", &i, &j, &vali.r, &vali.i); #endif @@ -104,7 +104,7 @@ zreadtriple_noheader(FILE *fp, int_t *m, int_t *n, int_t *nonz, /* Read into the triplet array from a file */ for (nnz = 0, nz = 0; nnz < *nonz; ++nnz) { #ifdef _LONGINT - fscanf(fp, "%ld%ld%lf%lf\n", &row[nz], &col[nz], &val[nz].r, &val[nz].i); + fscanf(fp, "%lld%lld%lf%lf\n", &row[nz], &col[nz], &val[nz].r, &val[nz].i); #else // int32 fscanf(fp, "%d%d%lf%lf\n", &row[nz], &col[nz], &val[nz].r, &val[nz].i); #endif diff --git a/SRC/zsuperlu_gpu.cu b/SRC/zsuperlu_gpu.cu index af54ae63..a0746dc1 100644 --- a/SRC/zsuperlu_gpu.cu +++ b/SRC/zsuperlu_gpu.cu @@ -112,7 +112,7 @@ void zdevice_scatter_l_2D (int thread_id, } /* Sherry: this routine is not used */ -#if 0 +#if 0 ////////////////////////////////////////////// __global__ void cub_scan_test(void) { @@ -138,7 +138,7 @@ void cub_scan_test(void) printf("%d %d\n", thread_id, IndirectJ2[thread_id]); } -#endif // not used +#endif /////////////////////////////////// not used __device__ inline @@ -894,9 +894,9 @@ void zprintGPUStats(zLUstruct_gpu_t * A_gpu) } /* end printGPUStats */ - +/* Initialize the GPU side of the data structure. 
*/ int zinitSluGPU3D_t( - zsluGPU_t *sluGPU, + zsluGPU_t *sluGPU, // LU structures on GPU, see zlustruct_gpu.h zLUstruct_t *LUstruct, gridinfo3d_t * grid3d, int_t* perm_c_supno, @@ -914,7 +914,7 @@ int zinitSluGPU3D_t( sluGPU->nCudaStreams = getnCudaStreams(); if (grid3d->iam == 0) { - printf("zinitSluGPU3D_t: Using hardware acceleration, with %d cuda streams \n", sluGPU->nCudaStreams); + printf("zinitSluGPU3D_t: Using hardware acceleration, with %d cuda streams, max_buffer_size %d\n", sluGPU->nCudaStreams, (int) buffer_size); fflush(stdout); if ( MAX_SUPER_SIZE < ldt ) { @@ -933,6 +933,9 @@ int zinitSluGPU3D_t( sluGPU->A_gpu = (zLUstruct_gpu_t *) malloc (sizeof(zLUstruct_gpu_t)); sluGPU->A_gpu->perm_c_supno = perm_c_supno; + + /* Allocate GPU memory for the LU data structures, and copy + the host LU structure to GPU side. */ zCopyLUToGPU3D ( isNodeInMyGrid, Llu, /* referred to as A_host */ sluGPU, Glu_persist, n, grid3d, buffer_size, bigu_size, ldt @@ -941,6 +944,7 @@ int zinitSluGPU3D_t( return 0; } /* end zinitSluGPU3D_t */ + int zinitD2Hreduce( int next_k, d2Hreduce_t* d2Hred, int last_flag, HyP_t* HyP, zsluGPU_t *sluGPU, gridinfo_t *grid, zLUstruct_t *LUstruct, SCT_t* SCT @@ -1194,10 +1198,14 @@ int freeSluGPU(zsluGPU_t *sluGPU) } #endif +/* Allocate GPU memory for the LU data structures, and copy + the host LU structure to GPU side. + After factorization, the GPU LU structure should be freed by + calling zfree_LUsstruct_gpu(). */ void zCopyLUToGPU3D ( int_t* isNodeInMyGrid, zLocalLU_t *A_host, /* distributed LU structure on host */ - zsluGPU_t *sluGPU, + zsluGPU_t *sluGPU, /* hold LU structure on GPU */ Glu_persist_t *Glu_persist, int_t n, gridinfo3d_t *grid3d, int_t buffer_size, /* bigV size on GPU for Schur complement update */ diff --git a/SRC/zutil_dist.c b/SRC/zutil_dist.c index ca27f360..34d7f308 100644 --- a/SRC/zutil_dist.c +++ b/SRC/zutil_dist.c @@ -393,6 +393,7 @@ void zScaleAdd_CompRowLoc_Matrix_dist(SuperMatrix *A, SuperMatrix *B, doublecomp return; } +/**** end utilities added for SUNDIALS ****/ /*! \brief Allocate storage in ScalePermstruct */ void zScalePermstructInit(const int_t m, const int_t n, @@ -438,6 +439,7 @@ int zAllocGlu_3d(int_t n, int_t nsupers, zLUstruct_t * LUstruct) } // Sherry added +/* Free the replicated data on 3D process layer that is not grid-0 */ int zDeAllocGlu_3d(zLUstruct_t * LUstruct) { SUPERLU_FREE(LUstruct->Glu_persist->xsup); @@ -445,6 +447,7 @@ int zDeAllocGlu_3d(zLUstruct_t * LUstruct) return 0; } +/* Free the replicated data on 3D process layer that is not grid-0 */ int zDeAllocLlu_3d(int_t n, zLUstruct_t * LUstruct, gridinfo3d_t* grid3d) { int i, nbc, nbr, nsupers; @@ -456,8 +459,7 @@ int zDeAllocLlu_3d(int_t n, zLUstruct_t * LUstruct, gridinfo3d_t* grid3d) for (i = 0; i < nbc; ++i) if ( Llu->Lrowind_bc_ptr[i] ) { SUPERLU_FREE (Llu->Lrowind_bc_ptr[i]); - SUPERLU_FREE (Llu->Lnzval_bc_ptr[i]); - + SUPERLU_FREE (Llu->Lnzval_bc_ptr[i]); } SUPERLU_FREE (Llu->Lrowind_bc_ptr); SUPERLU_FREE (Llu->Lnzval_bc_ptr); @@ -617,7 +619,7 @@ void zPrintLblocks(int iam, int_t nsupers, gridinfo_t *grid, /*! \brief Sets all entries of matrix L to zero. */ -void zZeroLblocks(int iam, int_t n, gridinfo_t *grid, zLUstruct_t *LUstruct) +void zZeroLblocks(int iam, int n, gridinfo_t *grid, zLUstruct_t *LUstruct) { doublecomplex zero = {0.0, 0.0}; register int extra, gb, j, lb, nsupc, nsupr, ncb; @@ -647,7 +649,7 @@ void zZeroLblocks(int iam, int_t n, gridinfo_t *grid, zLUstruct_t *LUstruct) } } } -} /* zZeroLblocks */ +} /* end zZeroLblocks */ /*! 
\brief Dump the factored matrix L using matlab triple-let format @@ -746,7 +748,6 @@ void zDumpLblocks(int iam, int_t nsupers, gridinfo_t *grid, } /* zDumpLblocks */ - /*! \brief Print the blocks in the factored matrix U. */ void zPrintUblocks(int iam, int_t nsupers, gridinfo_t *grid, @@ -786,7 +787,37 @@ void zPrintUblocks(int iam, int_t nsupers, gridinfo_t *grid, printf("[%d] ToSendD[] %d\n", iam, Llu->ToSendD[lb]); } } -} /* ZPRINTUBLOCKS */ +} /* end zPrintUlocks */ + +/*! \brief Sets all entries of matrix U to zero. + */ +void zZeroUblocks(int iam, int n, gridinfo_t *grid, zLUstruct_t *LUstruct) +{ + doublecomplex zero = {0.0, 0.0}; + register int i, extra, lb, len, nrb; + register int myrow, r; + zLocalLU_t *Llu = LUstruct->Llu; + Glu_persist_t *Glu_persist = LUstruct->Glu_persist; + int_t *xsup = Glu_persist->xsup; + int_t *index; + doublecomplex *nzval; + int nsupers = Glu_persist->supno[n-1] + 1; + + nrb = nsupers / grid->nprow; + extra = nsupers % grid->nprow; + myrow = MYROW( iam, grid ); + if ( myrow < extra ) ++nrb; + for (lb = 0; lb < nrb; ++lb) { + index = Llu->Ufstnz_br_ptr[lb]; + if ( index ) { /* Not an empty row */ + nzval = Llu->Unzval_br_ptr[lb]; + len = index[1]; // number of entries in nzval[]; + for (i = 0; i < len; ++i) { + nzval[i] = zero; + } + } + } +} /* end zZeroUlocks */ int zprint_gsmv_comm(FILE *fp, int_t m_loc, pzgsmv_comm_t *gsmv_comm, From d458710ebba339879e417e2bc261e3fef37f98a9 Mon Sep 17 00:00:00 2001 From: Sherry Li Date: Mon, 13 Sep 2021 21:42:15 -0700 Subject: [PATCH 123/147] Change default to enable_complex16=ON. Change printing format macro IFMT to %lld for 64-bit build. --- CMakeLists.txt | 2 +- FORTRAN/superlu_dist_config.fh | 1 + SRC/superlu_defs.h | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0f13a12a..dce869fe 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -21,7 +21,7 @@ list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") option(enable_doc "Build doxygen documentation" OFF) option(enable_double "Enable double precision library" ON) option(enable_single "Enable single precision library" OFF) -option(enable_complex16 "Enable complex16 precision library" OFF) +option(enable_complex16 "Enable complex16 precision library" ON) option(enable_tests "Build tests" ON) option(enable_examples "Build examples" ON) option(XSDK_ENABLE_Fortran "Enable Fortran" ON) diff --git a/FORTRAN/superlu_dist_config.fh b/FORTRAN/superlu_dist_config.fh index cbe990cc..caa86f6b 100644 --- a/FORTRAN/superlu_dist_config.fh +++ b/FORTRAN/superlu_dist_config.fh @@ -4,6 +4,7 @@ +#define XSDK_INDEX_SIZE 64 #if (XSDK_INDEX_SIZE == 64) #define _LONGINT 1 diff --git a/SRC/superlu_defs.h b/SRC/superlu_defs.h index 75f91112..337c7ae2 100644 --- a/SRC/superlu_defs.h +++ b/SRC/superlu_defs.h @@ -87,7 +87,7 @@ at the top-level directory. #elif defined (_LONGINT) typedef int64_t int_t; #define mpi_int_t MPI_LONG_LONG_INT - #define IFMT "%ld" + #define IFMT "%lld" #else /* Default */ typedef int int_t; #define mpi_int_t MPI_INT From 73758804531e6097d8ee33dd1baf9d54e56f8561 Mon Sep 17 00:00:00 2001 From: Sherry Li Date: Tue, 14 Sep 2021 08:17:53 -0700 Subject: [PATCH 124/147] Add fclose() in pxdrive3d1. 
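As a minimal sketch, the resource pattern the 3D example drivers follow is the one below (file-name handling simplified for illustration); the fclose() call is the line this patch adds to pddrive3d1, psdrive3d1 and pzdrive3d1:

    FILE *fp;
    if ( !(fp = fopen(argv[argc-1], "r")) ) ABORT("File does not exist");
    /* read the matrix and right-hand side from the open stream */
    zcreate_matrix_postfix3d(&A, nrhs, &b, &ldb, &xtrue, &ldx, fp, suffix, &grid);
    /* ... factor, solve, free ... */
    fclose(fp);   /* previously missing; the stream stayed open until exit */
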
--- EXAMPLE/pddrive3d1.c | 1 + EXAMPLE/psdrive3d1.c | 1 + EXAMPLE/pzdrive3d1.c | 1 + 3 files changed, 3 insertions(+) diff --git a/EXAMPLE/pddrive3d1.c b/EXAMPLE/pddrive3d1.c index bc226e32..8a1baeeb 100644 --- a/EXAMPLE/pddrive3d1.c +++ b/EXAMPLE/pddrive3d1.c @@ -396,6 +396,7 @@ main (int argc, char *argv[]) dScalePermstructFree (&ScalePermstruct); dLUstructFree (&LUstruct); PStatFree (&stat); + fclose(fp); /* ------------------------------------------------------------ RELEASE THE SUPERLU PROCESS GRID. diff --git a/EXAMPLE/psdrive3d1.c b/EXAMPLE/psdrive3d1.c index 869915c3..e925420b 100644 --- a/EXAMPLE/psdrive3d1.c +++ b/EXAMPLE/psdrive3d1.c @@ -396,6 +396,7 @@ main (int argc, char *argv[]) sScalePermstructFree (&ScalePermstruct); sLUstructFree (&LUstruct); PStatFree (&stat); + fclose(fp); /* ------------------------------------------------------------ RELEASE THE SUPERLU PROCESS GRID. diff --git a/EXAMPLE/pzdrive3d1.c b/EXAMPLE/pzdrive3d1.c index 55702daa..37938cd4 100644 --- a/EXAMPLE/pzdrive3d1.c +++ b/EXAMPLE/pzdrive3d1.c @@ -396,6 +396,7 @@ main (int argc, char *argv[]) zScalePermstructFree (&ScalePermstruct); zLUstructFree (&LUstruct); PStatFree (&stat); + fclose(fp); /* ------------------------------------------------------------ RELEASE THE SUPERLU PROCESS GRID. From fd71062bd8bed168173af0054292b86160c69827 Mon Sep 17 00:00:00 2001 From: Sherry Li Date: Mon, 20 Sep 2021 20:50:49 -0700 Subject: [PATCH 125/147] Updated zScatter_B3d() to support uneven block row partition of {A,B} in 3D interface. Only complex16 is implemented. --- CMakeLists.txt | 8 +- EXAMPLE/Makefile | 2 +- EXAMPLE/pddrive1.c | 7 +- EXAMPLE/pddrive1_ABglobal.c | 6 ++ EXAMPLE/pddrive2_ABglobal.c | 7 +- EXAMPLE/pddrive3_ABglobal.c | 7 +- EXAMPLE/pddrive4.c | 7 +- EXAMPLE/pddrive4_ABglobal.c | 6 ++ EXAMPLE/pddrive_ABglobal.c | 7 +- EXAMPLE/pddrive_spawn.c | 7 +- EXAMPLE/pzdrive.c | 7 +- EXAMPLE/pzdrive1.c | 7 +- EXAMPLE/pzdrive1_ABglobal.c | 7 +- EXAMPLE/pzdrive2_ABglobal.c | 7 +- EXAMPLE/pzdrive3_ABglobal.c | 7 +- EXAMPLE/pzdrive3d.c | 1 + EXAMPLE/pzdrive4.c | 7 +- EXAMPLE/pzdrive4_ABglobal.c | 6 ++ EXAMPLE/pzdrive_ABglobal.c | 7 +- EXAMPLE/pzdrive_spawn.c | 7 +- EXAMPLE/zcreate_matrix3d.c | 51 +++++++--- SRC/CMakeLists.txt | 6 ++ SRC/dlustruct_gpu.h | 8 +- SRC/dnrformat_loc3d.c | 2 + SRC/dsuperlu_gpu.cu | 4 +- SRC/pdgstrf3d.c | 16 +++- SRC/psgstrf3d.c | 9 +- SRC/pzgstrf3d.c | 16 +++- SRC/pzutil.c | 6 +- SRC/slustruct_gpu.h | 8 +- SRC/ssuperlu_gpu.cu | 4 +- SRC/supermatrix.h | 9 ++ SRC/zlustruct_gpu.h | 8 +- SRC/znrformat_loc3d.c | 179 ++++++++++++++++++++++++++++++++++-- SRC/zsuperlu_gpu.cu | 4 +- run_cmake_build.sh | 3 + 36 files changed, 386 insertions(+), 74 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0f13a12a..2d52b6fa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -21,7 +21,7 @@ list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") option(enable_doc "Build doxygen documentation" OFF) option(enable_double "Enable double precision library" ON) option(enable_single "Enable single precision library" OFF) -option(enable_complex16 "Enable complex16 precision library" OFF) +option(enable_complex16 "Enable complex16 precision library" ON) option(enable_tests "Build tests" ON) option(enable_examples "Build examples" ON) option(XSDK_ENABLE_Fortran "Enable Fortran" ON) @@ -224,9 +224,9 @@ if (enable_openmp) if(OPENMP_FOUND) set(CMAKE_C_FLAGS "${OpenMP_C_FLAGS} ${CMAKE_C_FLAGS}") set(CMAKE_CXX_FLAGS "${OpenMP_CXX_FLAGS} ${CMAKE_CXX_FLAGS}") -# On edison, OpenMP_EXE_LINKER_FLAGS is empty 
-# set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_C_FLAGS}") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") +# The following causes problem with cmake/3.20.+ +# set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_C_FLAGS}") message("-- OpenMP_EXE_LINKER_FLAGS='${OpenMP_EXE_LINKER_FLAGS}'") message("-- CMAKE_EXE_LINKER_FLAGS='${CMAKE_EXE_LINKER_FLAGS}'") endif() diff --git a/EXAMPLE/Makefile b/EXAMPLE/Makefile index 37387b52..ca5620b3 100644 --- a/EXAMPLE/Makefile +++ b/EXAMPLE/Makefile @@ -53,7 +53,7 @@ ZEXM1 = pzdrive1.o zcreate_matrix.o ZEXM2 = pzdrive2.o zcreate_matrix.o zcreate_matrix_perturbed.o ZEXM3 = pzdrive3.o zcreate_matrix.o ZEXM4 = pzdrive4.o zcreate_matrix.o -ZEXM3D = pzdrive3d.o zcreate_matrix.o zcreate_matrix3d.o +ZEXM3D = pzdrive3d.o zcreate_matrix.o zcreate_matrix3d.o #znrformat_loc3d.o ZEXM3D1 = pzdrive3d1.o zcreate_matrix.o zcreate_matrix3d.o ZEXM3D2 = pzdrive3d2.o zcreate_matrix.o zcreate_matrix3d.o ZEXM3D3 = pzdrive3d3.o zcreate_matrix.o zcreate_matrix3d.o diff --git a/EXAMPLE/pddrive1.c b/EXAMPLE/pddrive1.c index c2ccd067..72c88a6e 100644 --- a/EXAMPLE/pddrive1.c +++ b/EXAMPLE/pddrive1.c @@ -73,7 +73,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif /* Parse command line argv[]. */ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { diff --git a/EXAMPLE/pddrive1_ABglobal.c b/EXAMPLE/pddrive1_ABglobal.c index 7f06b70e..7686b79c 100644 --- a/EXAMPLE/pddrive1_ABglobal.c +++ b/EXAMPLE/pddrive1_ABglobal.c @@ -72,6 +72,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init( &argc, &argv ); +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif /* Parse command line argv[]. */ for (cpp = argv+1; *cpp; ++cpp) { diff --git a/EXAMPLE/pddrive2_ABglobal.c b/EXAMPLE/pddrive2_ABglobal.c index 57ebadf3..e908a6ca 100644 --- a/EXAMPLE/pddrive2_ABglobal.c +++ b/EXAMPLE/pddrive2_ABglobal.c @@ -72,7 +72,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init( &argc, &argv ); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif /* Parse command line argv[]. */ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { diff --git a/EXAMPLE/pddrive3_ABglobal.c b/EXAMPLE/pddrive3_ABglobal.c index 2e2a7433..e20c664d 100644 --- a/EXAMPLE/pddrive3_ABglobal.c +++ b/EXAMPLE/pddrive3_ABglobal.c @@ -78,7 +78,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init( &argc, &argv ); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif /* Parse command line argv[]. 
*/ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { diff --git a/EXAMPLE/pddrive4.c b/EXAMPLE/pddrive4.c index ca984a81..dbe9ee08 100644 --- a/EXAMPLE/pddrive4.c +++ b/EXAMPLE/pddrive4.c @@ -74,7 +74,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif MPI_Comm_size( MPI_COMM_WORLD, &nprocs ); if ( nprocs < 10 ) { fprintf(stderr, "Requires at least 10 processes\n"); diff --git a/EXAMPLE/pddrive4_ABglobal.c b/EXAMPLE/pddrive4_ABglobal.c index 9ff46dd8..2cf76078 100644 --- a/EXAMPLE/pddrive4_ABglobal.c +++ b/EXAMPLE/pddrive4_ABglobal.c @@ -71,6 +71,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init( &argc, &argv ); +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif MPI_Comm_size( MPI_COMM_WORLD, &nprocs ); if ( nprocs < 10 ) { fprintf(stderr, "Requires at least 10 processes\n"); diff --git a/EXAMPLE/pddrive_ABglobal.c b/EXAMPLE/pddrive_ABglobal.c index a47388b5..3541ab92 100644 --- a/EXAMPLE/pddrive_ABglobal.c +++ b/EXAMPLE/pddrive_ABglobal.c @@ -73,7 +73,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init( &argc, &argv ); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif /* Parse command line argv[]. */ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { diff --git a/EXAMPLE/pddrive_spawn.c b/EXAMPLE/pddrive_spawn.c index 131ea29c..b119b46e 100755 --- a/EXAMPLE/pddrive_spawn.c +++ b/EXAMPLE/pddrive_spawn.c @@ -82,7 +82,12 @@ int main(int argc, char *argv[]) //MPI_Init( &argc, &argv ); MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); MPI_Comm_get_parent(&parent); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif #if ( VAMPIR>=1 ) diff --git a/EXAMPLE/pzdrive.c b/EXAMPLE/pzdrive.c index 3878558d..342b91a5 100644 --- a/EXAMPLE/pzdrive.c +++ b/EXAMPLE/pzdrive.c @@ -74,7 +74,12 @@ int main(int argc, char *argv[]) ------------------------------------------------------------*/ //MPI_Init( &argc, &argv ); MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif #if ( VAMPIR>=1 ) VT_traceoff(); diff --git a/EXAMPLE/pzdrive1.c b/EXAMPLE/pzdrive1.c index b65733b2..69aea1d0 100644 --- a/EXAMPLE/pzdrive1.c +++ b/EXAMPLE/pzdrive1.c @@ -72,7 +72,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif /* Parse command line argv[]. 
*/ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { diff --git a/EXAMPLE/pzdrive1_ABglobal.c b/EXAMPLE/pzdrive1_ABglobal.c index 4437e4a8..bf47169f 100644 --- a/EXAMPLE/pzdrive1_ABglobal.c +++ b/EXAMPLE/pzdrive1_ABglobal.c @@ -71,7 +71,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init( &argc, &argv ); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif /* Parse command line argv[]. */ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { diff --git a/EXAMPLE/pzdrive2_ABglobal.c b/EXAMPLE/pzdrive2_ABglobal.c index 9959465b..96866c35 100644 --- a/EXAMPLE/pzdrive2_ABglobal.c +++ b/EXAMPLE/pzdrive2_ABglobal.c @@ -71,7 +71,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init( &argc, &argv ); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif /* Parse command line argv[]. */ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { diff --git a/EXAMPLE/pzdrive3_ABglobal.c b/EXAMPLE/pzdrive3_ABglobal.c index c83cf1a3..144e3f6e 100644 --- a/EXAMPLE/pzdrive3_ABglobal.c +++ b/EXAMPLE/pzdrive3_ABglobal.c @@ -77,7 +77,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init( &argc, &argv ); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif /* Parse command line argv[]. */ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { diff --git a/EXAMPLE/pzdrive3d.c b/EXAMPLE/pzdrive3d.c index 8bbed679..3e7f1463 100644 --- a/EXAMPLE/pzdrive3d.c +++ b/EXAMPLE/pzdrive3d.c @@ -183,6 +183,7 @@ main (int argc, char *argv[]) INITIALIZE THE SUPERLU PROCESS GRID. ------------------------------------------------------------ */ superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid); + // grid.rankorder = 1; if(grid.iam==0) { MPI_Query_thread(&omp_mpi_level); diff --git a/EXAMPLE/pzdrive4.c b/EXAMPLE/pzdrive4.c index 7453d2ac..33451140 100644 --- a/EXAMPLE/pzdrive4.c +++ b/EXAMPLE/pzdrive4.c @@ -73,7 +73,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif MPI_Comm_size( MPI_COMM_WORLD, &nprocs ); if ( nprocs < 10 ) { fprintf(stderr, "Requires at least 10 processes\n"); diff --git a/EXAMPLE/pzdrive4_ABglobal.c b/EXAMPLE/pzdrive4_ABglobal.c index 5515e885..9b3ff81b 100644 --- a/EXAMPLE/pzdrive4_ABglobal.c +++ b/EXAMPLE/pzdrive4_ABglobal.c @@ -70,6 +70,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. 
------------------------------------------------------------*/ MPI_Init( &argc, &argv ); +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif MPI_Comm_size( MPI_COMM_WORLD, &nprocs ); if ( nprocs < 10 ) { fprintf(stderr, "Requires at least 10 processes\n"); diff --git a/EXAMPLE/pzdrive_ABglobal.c b/EXAMPLE/pzdrive_ABglobal.c index c3d798c1..b6f48554 100644 --- a/EXAMPLE/pzdrive_ABglobal.c +++ b/EXAMPLE/pzdrive_ABglobal.c @@ -72,7 +72,12 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init( &argc, &argv ); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif /* Parse command line argv[]. */ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { diff --git a/EXAMPLE/pzdrive_spawn.c b/EXAMPLE/pzdrive_spawn.c index 30a28dd1..8dab3751 100755 --- a/EXAMPLE/pzdrive_spawn.c +++ b/EXAMPLE/pzdrive_spawn.c @@ -82,7 +82,12 @@ int main(int argc, char *argv[]) //MPI_Init( &argc, &argv ); MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); MPI_Comm_get_parent(&parent); - +#ifdef GPU_ACC + int rank, devs; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cudaGetDeviceCount(&devs); + cudaSetDevice(rank % devs); +#endif #if ( VAMPIR>=1 ) diff --git a/EXAMPLE/zcreate_matrix3d.c b/EXAMPLE/zcreate_matrix3d.c index b3c43ffd..5f0f7b6f 100644 --- a/EXAMPLE/zcreate_matrix3d.c +++ b/EXAMPLE/zcreate_matrix3d.c @@ -341,17 +341,44 @@ int zcreate_matrix_postfix3d(SuperMatrix *A, int nrhs, doublecomplex **rhs, nzval[0] = 0.1; #endif - /* Compute the number of rows to be distributed to local process */ - m_loc = m / (grid3d->nprow * grid3d->npcol* grid3d->npdep); - m_loc_fst = m_loc; - /* When m / procs is not an integer */ - if ((m_loc * grid3d->nprow * grid3d->npcol* grid3d->npdep) != m) - { - /*m_loc = m_loc+1; - m_loc_fst = m_loc;*/ - if (iam == (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1)) /* last proc. gets all*/ - m_loc = m - m_loc * (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1); - } +// /* Compute the number of rows to be distributed to local process */ +// m_loc = m / (grid3d->nprow * grid3d->npcol* grid3d->npdep); +// m_loc_fst = m_loc; +// /* When m / procs is not an integer */ +// if ((m_loc * grid3d->nprow * grid3d->npcol* grid3d->npdep) != m) +// { +// /*m_loc = m_loc+1; +// m_loc_fst = m_loc;*/ +// if (iam == (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1)) /* last proc. gets all*/ +// m_loc = m - m_loc * (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1); +// } + + switch(iam) { + case 0: + m_loc=111; fst_row=0; + break; + case 1: + m_loc=84; fst_row=111; + break; + case 2: + m_loc=108; fst_row=195; + break; + case 3: + m_loc=84; fst_row=303; + break; + case 4: + m_loc=108; fst_row=387; + break; + case 5: + m_loc=84; fst_row=495; + break; + case 6: + m_loc=108; fst_row=579; + break; + case 7: + m_loc=84; fst_row=687; + break; + } /* Create compressed column matrix for GA. 
*/ zCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr, @@ -379,7 +406,7 @@ int zcreate_matrix_postfix3d(SuperMatrix *A, int nrhs, doublecomplex **rhs, for (j = colptr[i]; j < colptr[i + 1]; ++j) ++marker[rowind[j]]; /* Set up row pointers */ rowptr[0] = 0; - fst_row = iam * m_loc_fst; +// fst_row = iam * m_loc_fst; nnz_loc = 0; for (j = 0; j < m_loc; ++j) { diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt index 51f10a42..ef071485 100644 --- a/SRC/CMakeLists.txt +++ b/SRC/CMakeLists.txt @@ -294,6 +294,12 @@ if(CUDAToolkit_FOUND) # this is found in top-level CMakeLists.txt target_link_libraries(superlu_dist CUDA::cudart CUDA::cublas) endif() +# This is recommended by modern cmake: +# https://cliutils.gitlab.io/modern-cmake/chapters/packages/OpenMP.html +if(OpenMP_FOUND) # this is found in top-level CMakeLists.txt + target_link_libraries(superlu_dist OpenMP::OpenMP_C) +endif() + target_compile_definitions(superlu_dist PRIVATE SUPERLU_DIST_EXPORTS) if(MSVC AND BUILD_SHARED_LIBS) set_target_properties(superlu_dist PROPERTIES diff --git a/SRC/dlustruct_gpu.h b/SRC/dlustruct_gpu.h index 23187e66..1cdc366d 100644 --- a/SRC/dlustruct_gpu.h +++ b/SRC/dlustruct_gpu.h @@ -78,8 +78,8 @@ typedef struct //LUstruct_gpu_ int_t *LrowindPtr; /* A single vector */ double *LnzvalVec; /* A single vector */ - int_t *LnzvalPtr; /* A single vector */ - int_t *LnzvalPtr_host; /* A single vector */ + int_t *LnzvalPtr; /* A single vector */ + int_t *LnzvalPtr_host; /* A single vector */ int_t *UrowindVec; /* A single vector */ int_t *UrowindPtr; /* A single vector */ @@ -87,7 +87,8 @@ typedef struct //LUstruct_gpu_ int_t *UnzvalPtr_host; double *UnzvalVec; /* A single vector */ - int_t *UnzvalPtr; /* A single vector */ + int_t *UnzvalPtr; /* A single vector */ + /*gpu pointers for easy block accesses */ local_l_blk_info_t *local_l_blk_infoVec; int_t *local_l_blk_infoPtr; @@ -109,7 +110,6 @@ typedef struct //LUstruct_gpu_ int_t *xsup; gridinfo_t *grid; - double ScatterMOPCounter; double ScatterMOPTimer; double GemmFLOPCounter; diff --git a/SRC/dnrformat_loc3d.c b/SRC/dnrformat_loc3d.c index 625df8ea..ae1f1a86 100644 --- a/SRC/dnrformat_loc3d.c +++ b/SRC/dnrformat_loc3d.c @@ -80,6 +80,8 @@ void dGatherNRformat_loc3d nnz_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int)); row_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int)); b_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int)); + + /* Gathered to layer 0. 
Other procs do not have these counts */ MPI_Gather(&A->nnz_loc, 1, mpi_int_t, nnz_counts, 1, mpi_int_t, 0, grid3d->zscp.comm); MPI_Gather(&A->m_loc, 1, mpi_int_t, row_counts, diff --git a/SRC/dsuperlu_gpu.cu b/SRC/dsuperlu_gpu.cu index 93c72c18..832879d2 100644 --- a/SRC/dsuperlu_gpu.cu +++ b/SRC/dsuperlu_gpu.cu @@ -766,6 +766,7 @@ int dfree_LUstruct_gpu (dLUstruct_gpu_t * A_gpu) checkCuda(cudaFree(A_gpu->LnzvalVec)); checkCuda(cudaFree(A_gpu->LnzvalPtr)); free(A_gpu->LnzvalPtr_host); + /*freeing the pinned memory*/ int_t streamId = 0; checkCuda (cudaFreeHost (A_gpu->scubufs[streamId].Remain_info_host)); @@ -798,8 +799,6 @@ int dfree_LUstruct_gpu (dLUstruct_gpu_t * A_gpu) checkCuda(cudaFree(A_gpu->grid)); - - checkCuda(cudaFree(A_gpu->scubufs[streamId].bigV)); checkCuda(cudaFree(A_gpu->scubufs[streamId].bigU)); @@ -814,7 +813,6 @@ int dfree_LUstruct_gpu (dLUstruct_gpu_t * A_gpu) checkCuda(cudaFree(A_gpu->scubufs[streamId].lsub)); checkCuda(cudaFree(A_gpu->scubufs[streamId].usub)); - checkCuda(cudaFree(A_gpu->local_l_blk_infoVec)); checkCuda(cudaFree(A_gpu->local_l_blk_infoPtr)); checkCuda(cudaFree(A_gpu->jib_lookupVec)); diff --git a/SRC/pdgstrf3d.c b/SRC/pdgstrf3d.c index dbef9da9..0deebd74 100644 --- a/SRC/pdgstrf3d.c +++ b/SRC/pdgstrf3d.c @@ -15,8 +15,9 @@ at the top-level directory. * *
  * -- Distributed SuperLU routine (version 7.0) --
- * Lawrence Berkeley National Lab, Georgia Institute of Technology.
- * May 10, 2019
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
  */
 
 #include "superlu_ddefs.h"
@@ -225,14 +226,14 @@ int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
     int_t bigu_size = getBigUSize(nsupers, grid,
     	  	                  LUstruct->Llu->Lrowind_bc_ptr);
     HyP->bigu_size = bigu_size;
-    int_t buffer_size =sp_ienv_dist(8); // get_max_buffer_size ();
+    int_t buffer_size = sp_ienv_dist(8); // get_max_buffer_size ();
     HyP->buffer_size = buffer_size;
     HyP->nsupers = nsupers;
 
 #ifdef GPU_ACC
 
     /*Now initialize the GPU data structure*/
-    dLUstruct_gpu_t *A_gpu, *dA_gpu;
+    // dLUstruct_gpu_t *A_gpu, *dA_gpu; // not used
 
     d2Hreduce_t d2HredObj;
     d2Hreduce_t* d2Hred = &d2HredObj;
@@ -339,8 +340,13 @@ int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
 
         SCT->tSchCompUdt3d[ilvl] = ilvl == 0 ? SCT->NetSchurUpTimer
 	    : SCT->NetSchurUpTimer - SCT->tSchCompUdt3d[ilvl - 1];
-    } /*for (int_t ilvl = 0; ilvl < maxLvl; ++ilvl)*/
+    } /* end for (int ilvl = 0; ilvl < maxLvl; ++ilvl) */
 
+#ifdef GPU_ACC
+    /* This frees the GPU storage allocated in initSluGPU3D_t() */
+    dfree_LUstruct_gpu (sluGPU->A_gpu);
+#endif
+    
     MPI_Barrier( grid3d->comm);
     SCT->pdgstrfTimer = SuperLU_timer_() - SCT->pdgstrfTimer;
 
diff --git a/SRC/psgstrf3d.c b/SRC/psgstrf3d.c
index f071d593..2c73580d 100644
--- a/SRC/psgstrf3d.c
+++ b/SRC/psgstrf3d.c
@@ -233,7 +233,7 @@ int_t psgstrf3d(superlu_dist_options_t *options, int m, int n, float anorm,
 #ifdef GPU_ACC
 
     /*Now initialize the GPU data structure*/
-    sLUstruct_gpu_t *A_gpu, *dA_gpu;
+    // sLUstruct_gpu_t *A_gpu, *dA_gpu; // not used
 
     d2Hreduce_t d2HredObj;
     d2Hreduce_t* d2Hred = &d2HredObj;
@@ -340,8 +340,13 @@ int_t psgstrf3d(superlu_dist_options_t *options, int m, int n, float anorm,
 
         SCT->tSchCompUdt3d[ilvl] = ilvl == 0 ? SCT->NetSchurUpTimer
 	    : SCT->NetSchurUpTimer - SCT->tSchCompUdt3d[ilvl - 1];
-    } /*for (int_t ilvl = 0; ilvl < maxLvl; ++ilvl)*/
+    } /* end for (int ilvl = 0; ilvl < maxLvl; ++ilvl) */
 
+#ifdef GPU_ACC
+    /* This frees the GPU storage allocated in initSluGPU3D_t() */
+    sfree_LUstruct_gpu (sluGPU->A_gpu);
+#endif
+    
     MPI_Barrier( grid3d->comm);
     SCT->pdgstrfTimer = SuperLU_timer_() - SCT->pdgstrfTimer;
 
diff --git a/SRC/pzgstrf3d.c b/SRC/pzgstrf3d.c
index 88eafa21..03e60b2e 100644
--- a/SRC/pzgstrf3d.c
+++ b/SRC/pzgstrf3d.c
@@ -14,8 +14,9 @@ at the top-level directory.
  *
  * 
  * -- Distributed SuperLU routine (version 7.0) --
- * Lawrence Berkeley National Lab, Georgia Institute of Technology.
- * May 10, 2019
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
  */
 
 #include "superlu_zdefs.h"
@@ -224,14 +225,14 @@ int_t pzgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
     int_t bigu_size = getBigUSize(nsupers, grid,
     	  	                  LUstruct->Llu->Lrowind_bc_ptr);
     HyP->bigu_size = bigu_size;
-    int_t buffer_size =sp_ienv_dist(8); // get_max_buffer_size ();
+    int_t buffer_size = sp_ienv_dist(8); // get_max_buffer_size ();
     HyP->buffer_size = buffer_size;
     HyP->nsupers = nsupers;
 
 #ifdef GPU_ACC
 
     /*Now initialize the GPU data structure*/
-    zLUstruct_gpu_t *A_gpu, *dA_gpu;
+    // zLUstruct_gpu_t *A_gpu, *dA_gpu; // not used
 
     d2Hreduce_t d2HredObj;
     d2Hreduce_t* d2Hred = &d2HredObj;
@@ -338,8 +339,13 @@ int_t pzgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
 
         SCT->tSchCompUdt3d[ilvl] = ilvl == 0 ? SCT->NetSchurUpTimer
 	    : SCT->NetSchurUpTimer - SCT->tSchCompUdt3d[ilvl - 1];
-    } /*for (int_t ilvl = 0; ilvl < maxLvl; ++ilvl)*/
+    } /* end for (int ilvl = 0; ilvl < maxLvl; ++ilvl) */
 
+#ifdef GPU_ACC
+    /* This frees the GPU storage allocated in initSluGPU3D_t() */
+    zfree_LUstruct_gpu (sluGPU->A_gpu);
+#endif
+    
     MPI_Barrier( grid3d->comm);
     SCT->pdgstrfTimer = SuperLU_timer_() - SCT->pdgstrfTimer;
 
diff --git a/SRC/pzutil.c b/SRC/pzutil.c
index c784c497..cd5c69e4 100644
--- a/SRC/pzutil.c
+++ b/SRC/pzutil.c
@@ -810,6 +810,10 @@ void zDestroy_A3d_gathered_on_2d(zSOLVEstruct_t *SOLVEstruct, gridinfo3d_t *grid
     SUPERLU_FREE(A3d->nnz_disp);
     SUPERLU_FREE(A3d->b_counts_int);
     SUPERLU_FREE(A3d->b_disp);
+    SUPERLU_FREE(A3d->procs_to_send_list);
+    SUPERLU_FREE(A3d->send_count_list);
+    SUPERLU_FREE(A3d->procs_recv_from_list);
+    SUPERLU_FREE(A3d->recv_count_list);
     SUPERLU_FREE( A2d );         // free 2D structure
     SUPERLU_FREE( A3d );         // free 3D structure
 } /* zDestroy_A3d_gathered_on_2d */
@@ -889,5 +893,3 @@ zDestroy_Tree(int_t n, gridinfo_t *grid, zLUstruct_t *LUstruct)
     CHECK_MALLOC(iam, "Exit zDestroy_Tree()");
 #endif
 }
-
-
diff --git a/SRC/slustruct_gpu.h b/SRC/slustruct_gpu.h
index 48038dbf..9475fba8 100644
--- a/SRC/slustruct_gpu.h
+++ b/SRC/slustruct_gpu.h
@@ -78,8 +78,8 @@ typedef struct //LUstruct_gpu_
     int_t   *LrowindPtr;      /* A single vector */
 
     float  *LnzvalVec;       /* A single vector */
-    int_t   *LnzvalPtr;       /* A single vector */
-    int_t   *LnzvalPtr_host;  /* A single vector */
+    int_t   *LnzvalPtr;        /* A single vector */
+    int_t   *LnzvalPtr_host;   /* A single vector */
 
     int_t   *UrowindVec;            /* A single vector */
     int_t   *UrowindPtr;            /* A single vector */
@@ -87,7 +87,8 @@ typedef struct //LUstruct_gpu_
     int_t   *UnzvalPtr_host;
 
     float  *UnzvalVec;       /* A single vector */
-    int_t   *UnzvalPtr;      /* A single vector */
+    int_t   *UnzvalPtr;        /* A single vector */
+    
     /*gpu pointers for easy block accesses */
     local_l_blk_info_t *local_l_blk_infoVec;
     int_t *local_l_blk_infoPtr;
@@ -109,7 +110,6 @@ typedef struct //LUstruct_gpu_
     int_t *xsup;
     gridinfo_t *grid;
 
-
     double ScatterMOPCounter;
     double ScatterMOPTimer;
     double GemmFLOPCounter;
diff --git a/SRC/ssuperlu_gpu.cu b/SRC/ssuperlu_gpu.cu
index adb67693..78b70b83 100644
--- a/SRC/ssuperlu_gpu.cu
+++ b/SRC/ssuperlu_gpu.cu
@@ -766,6 +766,7 @@ int sfree_LUstruct_gpu (sLUstruct_gpu_t * A_gpu)
 	checkCuda(cudaFree(A_gpu->LnzvalVec));
 	checkCuda(cudaFree(A_gpu->LnzvalPtr));
 	free(A_gpu->LnzvalPtr_host);
+	
 	/*freeing the pinned memory*/
 	int_t streamId = 0;
 	checkCuda (cudaFreeHost (A_gpu->scubufs[streamId].Remain_info_host));
@@ -798,8 +799,6 @@ int sfree_LUstruct_gpu (sLUstruct_gpu_t * A_gpu)
 
 	checkCuda(cudaFree(A_gpu->grid));
 
-
-
 	checkCuda(cudaFree(A_gpu->scubufs[streamId].bigV));
 	checkCuda(cudaFree(A_gpu->scubufs[streamId].bigU));
 
@@ -814,7 +813,6 @@ int sfree_LUstruct_gpu (sLUstruct_gpu_t * A_gpu)
 	checkCuda(cudaFree(A_gpu->scubufs[streamId].lsub));
 	checkCuda(cudaFree(A_gpu->scubufs[streamId].usub));
 
-
 	checkCuda(cudaFree(A_gpu->local_l_blk_infoVec));
 	checkCuda(cudaFree(A_gpu->local_l_blk_infoPtr));
 	checkCuda(cudaFree(A_gpu->jib_lookupVec));
diff --git a/SRC/supermatrix.h b/SRC/supermatrix.h
index 9913aa6b..1d720355 100644
--- a/SRC/supermatrix.h
+++ b/SRC/supermatrix.h
@@ -205,6 +205,15 @@ typedef struct NRformat_loc3d
     int *nnz_disp;
     int *b_counts_int;
     int *b_disp;
+
+    /* The following 4 structures are used for scattering
+       solution X from 2D grid-0 back to 3D processes */
+    int num_procs_to_send;  
+    int *procs_to_send_list;
+    int *send_count_list;
+    int num_procs_to_recv;
+    int *procs_recv_from_list;
+    int *recv_count_list;
 } NRformat_loc3d;
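
[Editor's sketch] The six fields added above cache the X(2d) -> X(3d) scatter schedule so it is computed once and then reused across repeated solves; num_procs_to_send doubles as the "not yet built" flag, set to EMPTY (-1) when A3d is created. Below is a minimal standalone lifecycle sketch, not library code: the struct is reduced to just these six fields, EMPTY is assumed to be -1 as in superlu_defs.h, and the sample counts are made-up illustration values.

    /* Standalone sketch: lifecycle of the scatter-schedule fields. */
    #include <stdio.h>
    #include <stdlib.h>

    #define EMPTY (-1)

    typedef struct {
        int  num_procs_to_send;      /* EMPTY until the first solve builds it */
        int *procs_to_send_list;
        int *send_count_list;
        int  num_procs_to_recv;
        int *procs_recv_from_list;
        int *recv_count_list;
    } schedule_t;

    int main(void)
    {
        schedule_t s;
        s.num_procs_to_send = EMPTY;          /* done where A3d is malloc'd */

        if (s.num_procs_to_send == EMPTY) {   /* first solve: build + persist */
            s.num_procs_to_send   = 2;
            s.procs_to_send_list  = malloc(2 * sizeof(int));
            s.send_count_list     = malloc(2 * sizeof(int));
            s.procs_to_send_list[0] = 2;  s.send_count_list[0] = 5;
            s.procs_to_send_list[1] = 3;  s.send_count_list[1] = 4;
            s.num_procs_to_recv    = 1;
            s.procs_recv_from_list = malloc(sizeof(int));
            s.recv_count_list      = malloc(sizeof(int));
            s.procs_recv_from_list[0] = 0;  s.recv_count_list[0] = 3;
        }
        /* later solves skip the setup and reuse the lists */
        printf("sends: %d, recvs: %d\n", s.num_procs_to_send, s.num_procs_to_recv);
        free(s.procs_to_send_list);   free(s.send_count_list);
        free(s.procs_recv_from_list); free(s.recv_count_list);
        return 0;
    }

In the library, the real schedule is built inside xScatter_B3d() on the first call (see the znrformat_loc3d.c hunk below) and the four lists are freed in xDestroy_A3d_gathered_on_2d().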
 
 
diff --git a/SRC/zlustruct_gpu.h b/SRC/zlustruct_gpu.h
index 1064d424..39288000 100644
--- a/SRC/zlustruct_gpu.h
+++ b/SRC/zlustruct_gpu.h
@@ -77,8 +77,8 @@ typedef struct //LUstruct_gpu_
     int_t   *LrowindPtr;      /* A single vector */
 
     doublecomplex  *LnzvalVec;       /* A single vector */
-    int_t   *LnzvalPtr;       /* A single vector */
-    int_t   *LnzvalPtr_host;  /* A single vector */
+    int_t   *LnzvalPtr;        /* A single vector */
+    int_t   *LnzvalPtr_host;   /* A single vector */
 
     int_t   *UrowindVec;            /* A single vector */
     int_t   *UrowindPtr;            /* A single vector */
@@ -86,7 +86,8 @@ typedef struct //LUstruct_gpu_
     int_t   *UnzvalPtr_host;
 
     doublecomplex  *UnzvalVec;       /* A single vector */
-    int_t   *UnzvalPtr;      /* A single vector */
+    int_t   *UnzvalPtr;        /* A single vector */
+    
     /*gpu pointers for easy block accesses */
     local_l_blk_info_t *local_l_blk_infoVec;
     int_t *local_l_blk_infoPtr;
@@ -108,7 +109,6 @@ typedef struct //LUstruct_gpu_
     int_t *xsup;
     gridinfo_t *grid;
 
-
     double ScatterMOPCounter;
     double ScatterMOPTimer;
     double GemmFLOPCounter;
diff --git a/SRC/znrformat_loc3d.c b/SRC/znrformat_loc3d.c
index 89de8dd0..f93cb215 100644
--- a/SRC/znrformat_loc3d.c
+++ b/SRC/znrformat_loc3d.c
@@ -22,6 +22,7 @@ at the top-level directory.
  */
 
 #include "superlu_zdefs.h"
+#include 
 
 /* Dst <- BlockByBlock (Src), reshape the block storage. */
 static void matCopy(int n, int m, doublecomplex *Dst, int lddst, doublecomplex *Src, int ldsrc)
@@ -66,6 +67,7 @@ void zGatherNRformat_loc3d
     if ( Fact == DOFACT ) { /* Factorize from scratch */
 	/* A3d is output. Compute counts from scratch */
 	A3d = SUPERLU_MALLOC(sizeof(NRformat_loc3d));
+	A3d->num_procs_to_send = EMPTY; // No X(2d) -> X(3d) comm. schedule yet
 	A2d = SUPERLU_MALLOC(sizeof(NRformat_loc));
     
 	// find number of nnzs
@@ -109,6 +111,7 @@ void zGatherNRformat_loc3d
 		A2d->nzval = doublecomplexMalloc_dist(nnz_disp[grid3d->npdep]);
 		A2d->rowptr = intMalloc_dist((row_disp[grid3d->npdep] + 1));
 		A2d->rowptr[0] = 0;
+		printf(" Gather layer-0: iam %d\n", grid3d->iam); fflush(stdout);
 	    }
 
 	MPI_Gatherv(A->nzval, A->nnz_loc, SuperLU_MPI_DOUBLE_COMPLEX, A2d->nzval,
@@ -297,7 +300,8 @@ int zScatter_B3d(NRformat_loc3d *A3d,  // modified
     int *b_disp         = A3d->b_disp;
     int *row_counts_int = A3d->row_counts_int;
     int *row_disp       = A3d->row_disp;
-    int i, p;
+    int i, j, k, p;
+    int num_procs_to_send, num_procs_to_recv; // persistent across multiple solves
     int iam = grid3d->iam;
     int rankorder = grid3d->rankorder;
     gridinfo_t *grid2d = &(grid3d->grid2d);
@@ -333,14 +337,14 @@ int zScatter_B3d(NRformat_loc3d *A3d,  // modified
 		     Btmp, nrhs * A3d->m_loc, SuperLU_MPI_DOUBLE_COMPLEX,
 		     0, grid3d->zscp.comm);
 
-    } else { /* Z-major in 3D grid */
+    } else { /* Z-major in 3D grid (default) */
         /*    e.g. 1x3x4 grid: layer0 layer1 layer2 layer3
 	                       0      3      6      9
  	                       1      4      7      10      
 	                       2      5      8      11
 	  GATHER:  {A, B} in A * X = B
 	  layer-0:
-    	       B (row space)  X (column space)  SCATTER
+	  B (row space)  X (column space)  SCATTER
 	       ----           ----        ---->>
            P0  0              0
 (equations     3              1      Proc 0 -> Procs {0, 1, 2, 3}
@@ -358,15 +362,172 @@ int zScatter_B3d(NRformat_loc3d *A3d,  // modified
 	       11            11
 	       ----         ----
 	*/
-        MPI_Request recv_req;
 	MPI_Status recv_status;
 	int pxy = grid2d->nprow * grid2d->npcol;
 	int npdep = grid3d->npdep, dest, src, tag;
-	int nprocs = pxy * npdep;
-
+	int nprocs = pxy * npdep; // all procs in 3D grid 
+	MPI_Request *recv_reqs = (MPI_Request*) SUPERLU_MALLOC(npdep * sizeof(MPI_Request));
+	int num_procs_to_send;
+	int *procs_to_send_list;
+	int *send_count_list;
+	int num_procs_to_recv;
+	int *procs_recv_from_list;
+	int *recv_count_list;
+
+	if ( A3d->num_procs_to_send == -1 ) { /* First time: set up communication schedule */
+	    /* 1. Set up the destination processes from each source process,
+	       and the send counts.	
+	       - Only grid-0 processes need to send.
+	       - row_disp[] recorded the prefix sum of the block rows of RHS
+	       - row_disp[] records the prefix sum of the block rows of RHS
+	       along the Z-dimension of the process grid.
+	       (equals A2d->m_loc.)
+	       A2d->fst_row records the boundary of the partition on grid-0.
+	       - Need to compute the prefix sum of the block rows of X
+	       among all the processes.
+	       A->fst_row has this info, but is available only locally.
+	    */
+	
+	    int *m_loc_3d_counts = SUPERLU_MALLOC(nprocs * sizeof(int));
+	
+	    /* related to m_loc in 3D partition */
+	    int *x_send_counts = SUPERLU_MALLOC(nprocs * sizeof(int));
+	    int *x_recv_counts = SUPERLU_MALLOC(nprocs * sizeof(int));
+	
+	    /* The following should be persistent across multiple solves.
+	       These lists avoid All-to-All communication. */
+	    procs_to_send_list = SUPERLU_MALLOC(nprocs * sizeof(int));
+	    send_count_list = SUPERLU_MALLOC(nprocs * sizeof(int));
+	    procs_recv_from_list = SUPERLU_MALLOC(nprocs * sizeof(int));
+	    recv_count_list = SUPERLU_MALLOC(nprocs * sizeof(int));
+
+	    for (p = 0; p < nprocs; ++p) {
+		x_send_counts[p] = 0;
+		x_recv_counts[p] = 0;
+		procs_to_send_list[p] = EMPTY; // (-1)
+		procs_recv_from_list[p] = EMPTY;
+	    }
+	    
+	    /* All procs participate */
+	    MPI_Allgather(&(A3d->m_loc), 1, MPI_INT, m_loc_3d_counts, 1,
+			  MPI_INT, grid3d->comm);
+	    
+	    /* Layer 0 sets up the send info. The other layers have 0 send counts. */
+	    if (grid3d->zscp.Iam == 0) {
+		int x_fst_row = A2d->fst_row; // start from a layer 0 boundary
+		int x_end_row = A2d->fst_row + A2d->m_loc; // end of boundary + 1
+		int sum_m_loc; // prefix sum of m_loc among all processes
+		
+		/* Loop through all processes.
+		   Search for 1st X-interval in grid-0's B-interval */
+		num_procs_to_send = sum_m_loc = 0;
+		for (p = 0; p < nprocs; ++p) {
+		    
+		    sum_m_loc += m_loc_3d_counts[p];
+		    
+		    if (sum_m_loc > x_end_row) { // reach the 2D block boundary
+			x_send_counts[p] = x_end_row - x_fst_row;
+			procs_to_send_list[num_procs_to_send] = p;
+			send_count_list[num_procs_to_send] = x_send_counts[p];
+			num_procs_to_send++;
+			break;
+		    } else if (x_fst_row < sum_m_loc) {
+			x_send_counts[p] = sum_m_loc - x_fst_row;
+			procs_to_send_list[num_procs_to_send] = p;
+			send_count_list[num_procs_to_send] = x_send_counts[p];
+			num_procs_to_send++;
+			x_fst_row = sum_m_loc; //+= m_loc_3d_counts[p];
+			if (x_fst_row >= x_end_row) break;
+		    }
+		    
+		    //sum_m_loc += m_loc_3d_counts[p+1];
+		} /* end for p ... */
+	    } else { /* end layer 0 */
+		num_procs_to_send = 0;
+	    }
+	    
+	    /* 2. Set up the source processes from each destination process,
+	       and the recv counts.
+	       All processes may need to receive something from grid-0. */
+	    /* The following transposes x_send_counts matrix to
+	       x_recv_counts matrix */
+	    MPI_Alltoall(x_send_counts, 1, MPI_INT, x_recv_counts, 1, MPI_INT,
+			 grid3d->comm);
+	    
+	    j = 0; // tracking the number of procs to receive from
+	    for (p = 0; p < nprocs; ++p) {
+		if (x_recv_counts[p]) {
+		    procs_recv_from_list[j] = p;
+		    recv_count_list[j] = x_recv_counts[p];
+		    src = p;  tag = iam;
+		    //printf("RECV: src %d -> iam %d, x_recv_counts[p] %d, tag %d\n",
+		    //src, iam, x_recv_counts[p], tag);
+		    //fflush(stdout);
+		    ++j;
+		}
+	    }
+	    num_procs_to_recv = j;
+
+	    /* Persist in A3d structure */
+	    A3d->num_procs_to_send = num_procs_to_send;
+	    A3d->procs_to_send_list = procs_to_send_list;
+	    A3d->send_count_list = send_count_list;
+	    A3d->num_procs_to_recv = num_procs_to_recv;
+	    A3d->procs_recv_from_list = procs_recv_from_list;
+	    A3d->recv_count_list = recv_count_list;
+
+	    SUPERLU_FREE(m_loc_3d_counts);
+	    SUPERLU_FREE(x_send_counts);
+	    SUPERLU_FREE(x_recv_counts);
+	} else { /* Reuse the communication schedule */
+	    num_procs_to_send = A3d->num_procs_to_send;
+	    procs_to_send_list = A3d->procs_to_send_list;
+	    send_count_list = A3d->send_count_list;
+	    num_procs_to_recv = A3d->num_procs_to_recv;
+	    procs_recv_from_list = A3d->procs_recv_from_list;
+	    recv_count_list = A3d->recv_count_list;
+	}
+	
+	/* 3. Perform the actual communication */
+	    
+	/* Post irecv first */
+	i = 0; // tracking offset in the recv buffer Btmp[]
+	for (j = 0; j < num_procs_to_recv; ++j) {
+	    src = procs_recv_from_list[j];
+	    tag = iam;
+	    k = nrhs * recv_count_list[j]; // recv count
+	    MPI_Irecv( Btmp + i, k, SuperLU_MPI_DOUBLE_COMPLEX,
+		       src, tag, grid3d->comm, &recv_reqs[j] );
+	    i += k;
+	}
+	    
+	/* Send */
+	/* Layer 0 sends to *num_procs_to_send* procs */
+	if (grid3d->zscp.Iam == 0) {
+	    int dest, tag;
+	    for (i = 0, p = 0; p < num_procs_to_send; ++p) { 
+		dest = procs_to_send_list[p]; //p + grid2d->iam * npdep;
+		tag = dest;
+		/*printf("SEND: iam %d -> %d, send_count_list[p] %d, tag %d\n",
+		  iam,dest, send_count_list[p], tag);
+		  fflush(stdout); */
+		    
+		MPI_Send(B1 + i, nrhs * send_count_list[p], 
+			 SuperLU_MPI_DOUBLE_COMPLEX, dest, tag, grid3d->comm);
+		i += nrhs * send_count_list[p];
+	    }
+	}  /* end layer 0 send */
+	    
+	/* Wait for all Irecv's to complete */
+	for (i = 0; i < num_procs_to_recv; ++i)
+	    MPI_Wait(&recv_reqs[i], &recv_status);
+	    
+	///////////	
+#if 0 // The following code works only with even block distribution of RHS 
 	/* Everyone receives one block (post non-blocking irecv) */
 	src = grid3d->iam / npdep;  // Z-major
 	tag = iam;
+	
 	MPI_Irecv(Btmp, nrhs * A3d->m_loc, SuperLU_MPI_DOUBLE_COMPLEX,
 		 src, tag, grid3d->comm, &recv_req);
 
@@ -381,10 +542,12 @@ int zScatter_B3d(NRformat_loc3d *A3d,  // modified
 			 SuperLU_MPI_DOUBLE_COMPLEX, dest, tag, grid3d->comm);
 	    }
 	}  /* end layer 0 send */
-    
+	
 	/* Wait for Irecv to complete */
 	MPI_Wait(&recv_req, &recv_status);
-
+#endif
+	///////////	
+	
     } /* else Z-major */
 
     // B <- colMajor(Btmp)
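
[Editor's sketch] The schedule construction above is an interval-overlap computation on prefix sums: a grid-0 process owns rows [A2d->fst_row, A2d->fst_row + A2d->m_loc) of X in the 2D partition and walks the prefix sums of m_loc_3d_counts[] to find every 3D process whose row interval intersects its own. The following standalone sketch, not SuperLU code, reproduces that loop with plain ints; the partition sizes are made-up illustration values.

    /* Standalone sketch of the layer-0 send-schedule loop in xScatter_B3d(). */
    #include <stdio.h>

    int main(void)
    {
        /* m_loc of X per process in the 3D partition (what MPI_Allgather
           collects into m_loc_3d_counts[] in the real code). */
        int m_loc_3d_counts[] = { 3, 4, 5, 4, 2, 6 };
        int nprocs = 6;

        /* This grid-0 process owns rows [x_fst_row, x_end_row) of X. */
        int x_fst_row = 7, x_end_row = 16;

        int procs_to_send_list[6], send_count_list[6];
        int num_procs_to_send = 0, sum_m_loc = 0, p;

        for (p = 0; p < nprocs; ++p) {
            sum_m_loc += m_loc_3d_counts[p];   /* prefix sum = end of p's rows */
            if (sum_m_loc > x_end_row) {       /* p's interval passes my boundary */
                procs_to_send_list[num_procs_to_send] = p;
                send_count_list[num_procs_to_send++] = x_end_row - x_fst_row;
                break;
            } else if (x_fst_row < sum_m_loc) { /* p's interval overlaps mine */
                procs_to_send_list[num_procs_to_send] = p;
                send_count_list[num_procs_to_send++] = sum_m_loc - x_fst_row;
                x_fst_row = sum_m_loc;
                if (x_fst_row >= x_end_row) break;
            }
        }

        for (p = 0; p < num_procs_to_send; ++p)
            printf("send %d rows of X to proc %d\n", send_count_list[p],
                   procs_to_send_list[p]);
        return 0;
    }

For the interval [7, 16) this prints sends of 5 rows to proc 2 and 4 rows to proc 3, which together cover the 9 rows owned on grid-0.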
diff --git a/SRC/zsuperlu_gpu.cu b/SRC/zsuperlu_gpu.cu
index a0746dc1..b5b6b361 100644
--- a/SRC/zsuperlu_gpu.cu
+++ b/SRC/zsuperlu_gpu.cu
@@ -775,6 +775,7 @@ int zfree_LUstruct_gpu (zLUstruct_gpu_t * A_gpu)
 	checkCuda(cudaFree(A_gpu->LnzvalVec));
 	checkCuda(cudaFree(A_gpu->LnzvalPtr));
 	free(A_gpu->LnzvalPtr_host);
+	
 	/*freeing the pinned memory*/
 	int_t streamId = 0;
 	checkCuda (cudaFreeHost (A_gpu->scubufs[streamId].Remain_info_host));
@@ -807,8 +808,6 @@ int zfree_LUstruct_gpu (zLUstruct_gpu_t * A_gpu)
 
 	checkCuda(cudaFree(A_gpu->grid));
 
-
-
 	checkCuda(cudaFree(A_gpu->scubufs[streamId].bigV));
 	checkCuda(cudaFree(A_gpu->scubufs[streamId].bigU));
 
@@ -823,7 +822,6 @@ int zfree_LUstruct_gpu (zLUstruct_gpu_t * A_gpu)
 	checkCuda(cudaFree(A_gpu->scubufs[streamId].lsub));
 	checkCuda(cudaFree(A_gpu->scubufs[streamId].usub));
 
-
 	checkCuda(cudaFree(A_gpu->local_l_blk_infoVec));
 	checkCuda(cudaFree(A_gpu->local_l_blk_infoPtr));
 	checkCuda(cudaFree(A_gpu->jib_lookupVec));
diff --git a/run_cmake_build.sh b/run_cmake_build.sh
index a5284dde..b318c0ce 100755
--- a/run_cmake_build.sh
+++ b/run_cmake_build.sh
@@ -57,7 +57,10 @@ then
     -DCMAKE_C_FLAGS="-std=c99 -O3 -g -DPRNTlevel=0 -DDEBUGlevel=0" \
     -DCMAKE_C_COMPILER=mpicc \
     -DCMAKE_CXX_COMPILER=mpicxx \
+    -DCMAKE_CXX_FLAGS="-std=c++11" \
     -DCMAKE_Fortran_COMPILER=mpif90 \
+    -DCMAKE_LINKER=mpicxx \
+    -Denable_openmp=ON \
     -DTPL_ENABLE_INTERNAL_BLASLIB=OFF \
     -DTPL_ENABLE_COMBBLASLIB=OFF \
     -DTPL_ENABLE_LAPACKLIB=OFF \

From e92107c6b779625a38734414ae164bf99fa5c750 Mon Sep 17 00:00:00 2001
From: Sherry Li 
Date: Thu, 23 Sep 2021 09:23:39 -0700
Subject: [PATCH 126/147] Add more comments in 3D B->X scatter code.

---
 EXAMPLE/Makefile           |  6 ++---
 EXAMPLE/pzdrive3d1.c       |  1 +
 EXAMPLE/zcreate_matrix3d.c | 49 +++++++++-----------------------------
 SRC/pdutil.c               |  4 ++++
 SRC/psutil.c               |  4 ++++
 SRC/znrformat_loc3d.c      | 17 ++++++++++---
 6 files changed, 37 insertions(+), 44 deletions(-)

diff --git a/EXAMPLE/Makefile b/EXAMPLE/Makefile
index ca5620b3..7ab5a475 100644
--- a/EXAMPLE/Makefile
+++ b/EXAMPLE/Makefile
@@ -53,9 +53,9 @@ ZEXM1	= pzdrive1.o zcreate_matrix.o
 ZEXM2	= pzdrive2.o zcreate_matrix.o zcreate_matrix_perturbed.o
 ZEXM3	= pzdrive3.o zcreate_matrix.o
 ZEXM4	= pzdrive4.o zcreate_matrix.o
-ZEXM3D	= pzdrive3d.o zcreate_matrix.o zcreate_matrix3d.o #znrformat_loc3d.o
-ZEXM3D1	= pzdrive3d1.o zcreate_matrix.o zcreate_matrix3d.o
-ZEXM3D2	= pzdrive3d2.o zcreate_matrix.o zcreate_matrix3d.o
+ZEXM3D	= pzdrive3d.o zcreate_matrix.o zcreate_matrix3d_Jake.o #znrformat_loc3d.o
+ZEXM3D1	= pzdrive3d1.o zcreate_matrix.o zcreate_matrix3d_Jake.o 
+ZEXM3D2	= pzdrive3d2.o zcreate_matrix.o zcreate_matrix3d_Jake.o
 ZEXM3D3	= pzdrive3d3.o zcreate_matrix.o zcreate_matrix3d.o
 
 ZEXMG	= pzdrive_ABglobal.o
diff --git a/EXAMPLE/pzdrive3d1.c b/EXAMPLE/pzdrive3d1.c
index 37938cd4..660c34e1 100644
--- a/EXAMPLE/pzdrive3d1.c
+++ b/EXAMPLE/pzdrive3d1.c
@@ -340,6 +340,7 @@ main (int argc, char *argv[])
     PStatInit (&stat);
 
     /* Call the linear equation solver. */
+    nrhs = 0;
     pzgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
                &LUstruct, &SOLVEstruct, berr, &stat, &info);
 
diff --git a/EXAMPLE/zcreate_matrix3d.c b/EXAMPLE/zcreate_matrix3d.c
index 5f0f7b6f..45f4fbf2 100644
--- a/EXAMPLE/zcreate_matrix3d.c
+++ b/EXAMPLE/zcreate_matrix3d.c
@@ -341,45 +341,18 @@ int zcreate_matrix_postfix3d(SuperMatrix *A, int nrhs, doublecomplex **rhs,
     nzval[0] = 0.1;
 #endif
 
-//    /* Compute the number of rows to be distributed to local process */
-//    m_loc = m / (grid3d->nprow * grid3d->npcol* grid3d->npdep);
-//    m_loc_fst = m_loc;
-//    /* When m / procs is not an integer */
-//    if ((m_loc * grid3d->nprow * grid3d->npcol* grid3d->npdep) != m)
-//    {
-//        /*m_loc = m_loc+1;
-//          m_loc_fst = m_loc;*/
-//        if (iam == (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1)) /* last proc. gets all*/
-//            m_loc = m - m_loc * (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1);
-//    }
+    /* Compute the number of rows to be distributed to local process */
+    m_loc = m / (grid3d->nprow * grid3d->npcol* grid3d->npdep);
+    m_loc_fst = m_loc;
+    /* When m / procs is not an integer */
+    if ((m_loc * grid3d->nprow * grid3d->npcol* grid3d->npdep) != m)
+    {
+        /*m_loc = m_loc+1;
+          m_loc_fst = m_loc;*/
+        if (iam == (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1)) /* last proc. gets all*/
+            m_loc = m - m_loc * (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1);
+    }
  
-    switch(iam) {
-      case 0:
-       m_loc=111; fst_row=0;
-       break;
-      case 1:
-       m_loc=84; fst_row=111;
-       break;
-      case 2:
-       m_loc=108; fst_row=195;
-       break;
-      case 3:
-       m_loc=84; fst_row=303;
-       break;
-      case 4:
-       m_loc=108; fst_row=387;
-       break;
-      case 5:
-       m_loc=84; fst_row=495;
-       break;
-      case 6:
-       m_loc=108; fst_row=579;
-       break;
-      case 7:
-       m_loc=84; fst_row=687;
-       break;
-     }
-
     /* Create compressed column matrix for GA. */
     zCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr,
                                 SLU_NC, SLU_D, SLU_GE);
diff --git a/SRC/pdutil.c b/SRC/pdutil.c
index 3a5fcc28..2bb33d54 100644
--- a/SRC/pdutil.c
+++ b/SRC/pdutil.c
@@ -811,6 +811,10 @@ void dDestroy_A3d_gathered_on_2d(dSOLVEstruct_t *SOLVEstruct, gridinfo3d_t *grid
     SUPERLU_FREE(A3d->nnz_disp);
     SUPERLU_FREE(A3d->b_counts_int);
     SUPERLU_FREE(A3d->b_disp);
+    SUPERLU_FREE(A3d->procs_to_send_list);
+    SUPERLU_FREE(A3d->send_count_list);
+    SUPERLU_FREE(A3d->procs_recv_from_list);
+    SUPERLU_FREE(A3d->recv_count_list);
     SUPERLU_FREE( A2d );         // free 2D structure
     SUPERLU_FREE( A3d );         // free 3D structure
 } /* dDestroy_A3d_gathered_on_2d */
diff --git a/SRC/psutil.c b/SRC/psutil.c
index 25e0faa2..270584a8 100644
--- a/SRC/psutil.c
+++ b/SRC/psutil.c
@@ -811,6 +811,10 @@ void sDestroy_A3d_gathered_on_2d(sSOLVEstruct_t *SOLVEstruct, gridinfo3d_t *grid
     SUPERLU_FREE(A3d->nnz_disp);
     SUPERLU_FREE(A3d->b_counts_int);
     SUPERLU_FREE(A3d->b_disp);
+    SUPERLU_FREE(A3d->procs_to_send_list);
+    SUPERLU_FREE(A3d->send_count_list);
+    SUPERLU_FREE(A3d->procs_recv_from_list);
+    SUPERLU_FREE(A3d->recv_count_list);
     SUPERLU_FREE( A2d );         // free 2D structure
     SUPERLU_FREE( A3d );         // free 3D structure
 } /* sDestroy_A3d_gathered_on_2d */
diff --git a/SRC/znrformat_loc3d.c b/SRC/znrformat_loc3d.c
index f93cb215..56a28e8d 100644
--- a/SRC/znrformat_loc3d.c
+++ b/SRC/znrformat_loc3d.c
@@ -361,6 +361,15 @@ int zScatter_B3d(NRformat_loc3d *A3d,  // modified
 	       8             10
 	       11            11
 	       ----         ----
+
+         In the most general case, the block rows of B are not of even size; the
+	 Layer 0 partition may then overlap with the 3D partition in an arbitrary manner.
+	 For example:
+	                  P0        P1        P2       P3
+             X on grid-0: |-----------|---------|---------|-----------|
+
+	     X on 3D:     |___|____|_____|_______|____|______|____|___|
+	                  P0  P1   P2    P3      P4   P5     P6   P7
 	*/
 	MPI_Status recv_status;
 	int pxy = grid2d->nprow * grid2d->npcol;
@@ -460,10 +469,12 @@ int zScatter_B3d(NRformat_loc3d *A3d,  // modified
 		    procs_recv_from_list[j] = p;
 		    recv_count_list[j] = x_recv_counts[p];
 		    src = p;  tag = iam;
-		    //printf("RECV: src %d -> iam %d, x_recv_counts[p] %d, tag %d\n",
-		    //src, iam, x_recv_counts[p], tag);
-		    //fflush(stdout);
 		    ++j;
+#if 1
+		    printf("RECV: src %d -> iam %d, x_recv_counts[p] %d, tag %d\n",
+			   src, iam, x_recv_counts[p], tag);
+		    fflush(stdout);
+#endif		    
 		}
 	    }
 	    num_procs_to_recv = j;
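
[Editor's sketch] Step 2 above works because an MPI_Alltoall of one integer per process pair transposes the global send-count matrix: row i (what process i sends to each process) arrives as column i (what each process receives from process i), so no explicit handshaking is needed. A standalone MPI sketch of just this transpose, with made-up counts and rank 0 standing in for a layer-0 sender:

    /* Standalone sketch: transpose send counts into recv counts.
     * Compile with: mpicc alltoall_transpose.c -o alltoall_transpose */
    #include <mpi.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main(int argc, char *argv[])
    {
        int nprocs, iam, p;
        MPI_Init(&argc, &argv);
        MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
        MPI_Comm_rank(MPI_COMM_WORLD, &iam);

        int *x_send_counts = calloc(nprocs, sizeof(int));
        int *x_recv_counts = calloc(nprocs, sizeof(int));

        /* Pretend rank 0 is the only layer-0 sender: it sends p+1 rows
           to each process p (illustration values only). */
        if (iam == 0)
            for (p = 0; p < nprocs; ++p) x_send_counts[p] = p + 1;

        /* Row i of the send matrix becomes column i on the receivers. */
        MPI_Alltoall(x_send_counts, 1, MPI_INT, x_recv_counts, 1, MPI_INT,
                     MPI_COMM_WORLD);

        for (p = 0; p < nprocs; ++p)
            if (x_recv_counts[p])
                printf("rank %d receives %d rows from rank %d\n",
                       iam, x_recv_counts[p], p);

        free(x_send_counts); free(x_recv_counts);
        MPI_Finalize();
        return 0;
    }

Run with e.g. mpirun -np 4; each rank p reports receiving p+1 rows from rank 0.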

From 523f006ca18d35ede8a1232b9361a9e39814e842 Mon Sep 17 00:00:00 2001
From: Sherry Li 
Date: Wed, 29 Sep 2021 10:34:46 -0700
Subject: [PATCH 127/147] Updated the real version of xScatter_B3d() to support
 uneven block row partition of {A,B} in 3D interface.

---
 SRC/dnrformat_loc3d.c | 184 ++++++++++++++++++++++++++++++++++++++++--
 SRC/pdgssvx.c         |   9 ---
 SRC/pdgssvx3d.c       |   8 --
 SRC/psgssvx.c         |   9 ---
 SRC/psgssvx3d.c       |   8 --
 SRC/pzgssvx.c         |   9 ---
 SRC/pzgssvx3d.c       |   8 --
 SRC/snrformat_loc3d.c | 182 +++++++++++++++++++++++++++++++++++++++--
 SRC/superlu_grid.c    |   9 +++
 SRC/superlu_grid3d.c  |  10 +++
 SRC/znrformat_loc3d.c |  34 ++++----
 11 files changed, 389 insertions(+), 81 deletions(-)

diff --git a/SRC/dnrformat_loc3d.c b/SRC/dnrformat_loc3d.c
index ae1f1a86..4ae5a8e6 100644
--- a/SRC/dnrformat_loc3d.c
+++ b/SRC/dnrformat_loc3d.c
@@ -67,6 +67,7 @@ void dGatherNRformat_loc3d
     if ( Fact == DOFACT ) { /* Factorize from scratch */
 	/* A3d is output. Compute counts from scratch */
 	A3d = SUPERLU_MALLOC(sizeof(NRformat_loc3d));
+	A3d->num_procs_to_send = EMPTY; // No X(2d) -> X(3d) comm. schedule yet
 	A2d = SUPERLU_MALLOC(sizeof(NRformat_loc));
     
 	// find number of nnzs
@@ -80,8 +81,6 @@ void dGatherNRformat_loc3d
 	nnz_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
 	row_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
 	b_counts_int = SUPERLU_MALLOC(grid3d->npdep * sizeof(int));
-
-	/* Gathered to layer 0. Other procs do not have these counts */
 	MPI_Gather(&A->nnz_loc, 1, mpi_int_t, nnz_counts,
 		   1, mpi_int_t, 0, grid3d->zscp.comm);
 	MPI_Gather(&A->m_loc, 1, mpi_int_t, row_counts,
@@ -300,7 +299,8 @@ int dScatter_B3d(NRformat_loc3d *A3d,  // modified
     int *b_disp         = A3d->b_disp;
     int *row_counts_int = A3d->row_counts_int;
     int *row_disp       = A3d->row_disp;
-    int i, p;
+    int i, j, k, p;
+    int num_procs_to_send, num_procs_to_recv; // persistent across multiple solves
     int iam = grid3d->iam;
     int rankorder = grid3d->rankorder;
     gridinfo_t *grid2d = &(grid3d->grid2d);
@@ -336,7 +336,7 @@ int dScatter_B3d(NRformat_loc3d *A3d,  // modified
 		     Btmp, nrhs * A3d->m_loc, MPI_DOUBLE,
 		     0, grid3d->zscp.comm);
 
-    } else { /* Z-major in 3D grid */
+    } else { /* Z-major in 3D grid (default) */
         /*    e.g. 1x3x4 grid: layer0 layer1 layer2 layer3
 	                       0      3      6      9
  	                       1      4      7      10      
@@ -360,13 +360,181 @@ int dScatter_B3d(NRformat_loc3d *A3d,  // modified
 	       8             10
 	       11            11
 	       ----         ----
+         In the most general case, the block rows of B are not of even size; the
+	 Layer 0 partition may then overlap with the 3D partition in an arbitrary manner.
+	 For example:
+	                  P0        P1        P2       P3
+             X on grid-0: |___________|__________|_________|________|
+
+	     X on 3D:     |___|____|_____|____|__|______|_____|_____|
+	                  P0  P1   P2    P3   P4   P5     P6   P7  
 	*/
-        MPI_Request recv_req;
 	MPI_Status recv_status;
 	int pxy = grid2d->nprow * grid2d->npcol;
 	int npdep = grid3d->npdep, dest, src, tag;
-	int nprocs = pxy * npdep;
+	int nprocs = pxy * npdep; // all procs in 3D grid 
+	MPI_Request *recv_reqs = (MPI_Request*) SUPERLU_MALLOC(npdep * sizeof(MPI_Request));
+	int num_procs_to_send;
+	int *procs_to_send_list;
+	int *send_count_list;
+	int num_procs_to_recv;
+	int *procs_recv_from_list;
+	int *recv_count_list;
+
+	if ( A3d->num_procs_to_send == -1 ) { /* First time: set up communication schedule */
+	    /* 1. Set up the destination processes from each source process,
+	       and the send counts.	
+	       - Only grid-0 processes need to send.
+	       - row_disp[] records the prefix sum of the block rows of RHS
+	       	 	    along the Z-dimension of the process grid.
+	         row_disp[npdep] is the total number of X entries on my proc.
+	       	     (equals A2d->m_loc.)
+	         A2d->fst_row records the boundary of the partition on grid-0.
+	       - Need to compute the prefix sum of the block rows of X
+	       	 among all the processes.
+	       	 A->fst_row has this info, but is available only locally.
+	    */
+	
+	    int *m_loc_3d_counts = SUPERLU_MALLOC(nprocs * sizeof(int));
+	
+	    /* related to m_loc in 3D partition */
+	    int *x_send_counts = SUPERLU_MALLOC(nprocs * sizeof(int));
+	    int *x_recv_counts = SUPERLU_MALLOC(nprocs * sizeof(int));
+	
+	    /* The following should be persistent across multiple solves.
+	       These lists avoid All-to-All communication. */
+	    procs_to_send_list = SUPERLU_MALLOC(nprocs * sizeof(int));
+	    send_count_list = SUPERLU_MALLOC(nprocs * sizeof(int));
+	    procs_recv_from_list = SUPERLU_MALLOC(nprocs * sizeof(int));
+	    recv_count_list = SUPERLU_MALLOC(nprocs * sizeof(int));
+
+	    for (p = 0; p < nprocs; ++p) {
+		x_send_counts[p] = 0;
+		x_recv_counts[p] = 0;
+		procs_to_send_list[p] = EMPTY; // (-1)
+		procs_recv_from_list[p] = EMPTY;
+	    }
+	    
+	    /* All procs participate */
+	    MPI_Allgather(&(A3d->m_loc), 1, MPI_INT, m_loc_3d_counts, 1,
+			  MPI_INT, grid3d->comm);
+	    
+	    /* Layer 0 sets up the send info. The other layers have 0 send counts. */
+	    if (grid3d->zscp.Iam == 0) {
+		int x_fst_row = A2d->fst_row; // start from a layer 0 boundary
+		int x_end_row = A2d->fst_row + A2d->m_loc; // end of boundary + 1
+		int sum_m_loc; // prefix sum of m_loc among all processes
+		
+		/* Loop through all processes.
+		   Search for 1st X-interval in grid-0's B-interval */
+		num_procs_to_send = sum_m_loc = 0;
+		for (p = 0; p < nprocs; ++p) {
+		    
+		    sum_m_loc += m_loc_3d_counts[p];
+		    
+		    if (sum_m_loc > x_end_row) { // reach the 2D block boundary
+			x_send_counts[p] = x_end_row - x_fst_row;
+			procs_to_send_list[num_procs_to_send] = p;
+			send_count_list[num_procs_to_send] = x_send_counts[p];
+			num_procs_to_send++;
+			break;
+		    } else if (x_fst_row < sum_m_loc) {
+			x_send_counts[p] = sum_m_loc - x_fst_row;
+			procs_to_send_list[num_procs_to_send] = p;
+			send_count_list[num_procs_to_send] = x_send_counts[p];
+			num_procs_to_send++;
+			x_fst_row = sum_m_loc; //+= m_loc_3d_counts[p];
+			if (x_fst_row >= x_end_row) break;
+		    }
+		    
+		    //sum_m_loc += m_loc_3d_counts[p+1];
+		} /* end for p ... */
+	    } else { /* end layer 0 */
+		num_procs_to_send = 0;
+	    }
+	    
+	    /* 2. Set up the source processes from each destination process,
+	       and the recv counts.
+	       All processes may need to receive something from grid-0. */
+	    /* The following transposes x_send_counts matrix to
+	       x_recv_counts matrix */
+	    MPI_Alltoall(x_send_counts, 1, MPI_INT, x_recv_counts, 1, MPI_INT,
+			 grid3d->comm);
+	    
+	    j = 0; // tracking the number of procs to receive from
+	    for (p = 0; p < nprocs; ++p) {
+		if (x_recv_counts[p]) {
+		    procs_recv_from_list[j] = p;
+		    recv_count_list[j] = x_recv_counts[p];
+		    src = p;  tag = iam;
+		    ++j;
+#if 0		    
+		    printf("RECV: src %d -> iam %d, x_recv_counts[p] %d, tag %d\n",
+			   src, iam, x_recv_counts[p], tag);
+		    fflush(stdout);
+#endif		    
+		}
+	    }
+	    num_procs_to_recv = j;
+
+	    /* Persist in A3d structure */
+	    A3d->num_procs_to_send = num_procs_to_send;
+	    A3d->procs_to_send_list = procs_to_send_list;
+	    A3d->send_count_list = send_count_list;
+	    A3d->num_procs_to_recv = num_procs_to_recv;
+	    A3d->procs_recv_from_list = procs_recv_from_list;
+	    A3d->recv_count_list = recv_count_list;
+
+	    SUPERLU_FREE(m_loc_3d_counts);
+	    SUPERLU_FREE(x_send_counts);
+	    SUPERLU_FREE(x_recv_counts);
+	} else { /* Reuse the communication schedule */
+	    num_procs_to_send = A3d->num_procs_to_send;
+	    procs_to_send_list = A3d->procs_to_send_list;
+	    send_count_list = A3d->send_count_list;
+	    num_procs_to_recv = A3d->num_procs_to_recv;
+	    procs_recv_from_list = A3d->procs_recv_from_list;
+	    recv_count_list = A3d->recv_count_list;
+	}
+	
+	/* 3. Perform the actual communication */
+	    
+	/* Post irecv first */
+	i = 0; // tracking offset in the recv buffer Btmp[]
+	for (j = 0; j < num_procs_to_recv; ++j) {
+	    src = procs_recv_from_list[j];
+	    tag = iam;
+	    k = nrhs * recv_count_list[j]; // recv count
+	    MPI_Irecv( Btmp + i, k, MPI_DOUBLE,
+		       src, tag, grid3d->comm, &recv_reqs[j] );
+	    i += k;
+	}
+	    
+	/* Send */
+	/* Layer 0 sends to *num_procs_to_send* procs */
+	if (grid3d->zscp.Iam == 0) {
+	    int dest, tag;
+	    for (i = 0, p = 0; p < num_procs_to_send; ++p) { 
+		dest = procs_to_send_list[p]; //p + grid2d->iam * npdep;
+		tag = dest;
+		/*printf("SEND: iam %d -> %d, send_count_list[p] %d, tag %d\n",
+		  iam,dest, send_count_list[p], tag);
+		  fflush(stdout); */
+		    
+		MPI_Send(B1 + i, nrhs * send_count_list[p], 
+			 MPI_DOUBLE, dest, tag, grid3d->comm);
+		i += nrhs * send_count_list[p];
+	    }
+	}  /* end layer 0 send */
+	    
+	/* Wait for all Irecv's to complete */
+	for (i = 0; i < num_procs_to_recv; ++i)
+	    MPI_Wait(&recv_reqs[i], &recv_status);
 
+        SUPERLU_FREE(recv_reqs);
+
+	///////////	
+#if 0 // The following code works only with even block distribution of RHS 
 	/* Everyone receives one block (post non-blocking irecv) */
 	src = grid3d->iam / npdep;  // Z-major
 	tag = iam;
@@ -387,7 +555,9 @@ int dScatter_B3d(NRformat_loc3d *A3d,  // modified
     
 	/* Wait for Irecv to complete */
 	MPI_Wait(&recv_req, &recv_status);
-
+#endif
+	///////////
+	
     } /* else Z-major */
 
     // B <- colMajor(Btmp)
diff --git a/SRC/pdgssvx.c b/SRC/pdgssvx.c
index aa28622c..53bf4e0a 100644
--- a/SRC/pdgssvx.c
+++ b/SRC/pdgssvx.c
@@ -639,15 +639,6 @@ pdgssvx(superlu_dist_options_t *options, SuperMatrix *A,
     C = ScalePermstruct->C;
     /********/
 
-#ifdef GPU_ACC
-    /* Binding each MPI to a CUDA device */
-    int devs;
-    // MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-    cudaGetDeviceCount(&devs); // Returns the number of compute-capable devices
-    cudaSetDevice(iam % devs); // Set device to be used for GPU executions
-    ////
-#endif
-
 #if ( DEBUGlevel>=1 )
     CHECK_MALLOC(iam, "Enter pdgssvx()");
 #endif
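
[Editor's sketch] The binding removed here (and from the other solver routines in this patch) moved into the application: patch 125 added the same two CUDA calls to each EXAMPLE driver right after MPI_Init, so the device is selected before any library call. Below is a minimal standalone sketch of that driver-side pattern; the two CUDA calls are verbatim from the drivers, while the surrounding main() is illustrative only.

    #include <mpi.h>
    #ifdef GPU_ACC
    #include <cuda_runtime.h>
    #endif

    int main(int argc, char *argv[])
    {
        MPI_Init(&argc, &argv);
    #ifdef GPU_ACC
        /* Bind each MPI rank to a GPU, round-robin over the visible
           devices, before any SuperLU_DIST call touches the GPU. */
        int rank, devs;
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        cudaGetDeviceCount(&devs);
        cudaSetDevice(rank % devs);
    #endif
        /* ... set up the process grid and call the p?gssvx* drivers ... */
        MPI_Finalize();
        return 0;
    }

As in the drivers, rank % devs assumes the ranks-per-node layout makes modulo binding sensible.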
diff --git a/SRC/pdgssvx3d.c b/SRC/pdgssvx3d.c
index d2bcb4a7..55580f54 100644
--- a/SRC/pdgssvx3d.c
+++ b/SRC/pdgssvx3d.c
@@ -582,14 +582,6 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
     
     /* Initialization. */
 
-#ifdef GPU_ACC
-    /* Binding each MPI to a CUDA device */
-    int devs;
-    // MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-    cudaGetDeviceCount(&devs); // Returns the number of compute-capable devices
-    cudaSetDevice(iam % devs); // Set device to be used for GPU executions
-    ////
-#endif
 
     options->Algo3d = YES;
 	
diff --git a/SRC/psgssvx.c b/SRC/psgssvx.c
index 2e922695..73020a02 100644
--- a/SRC/psgssvx.c
+++ b/SRC/psgssvx.c
@@ -639,15 +639,6 @@ psgssvx(superlu_dist_options_t *options, SuperMatrix *A,
     C = ScalePermstruct->C;
     /********/
 
-#ifdef GPU_ACC
-    /* Binding each MPI to a CUDA device */
-    int devs;
-    // MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-    cudaGetDeviceCount(&devs); // Returns the number of compute-capable devices
-    cudaSetDevice(iam % devs); // Set device to be used for GPU executions
-    ////
-#endif
-
 #if ( DEBUGlevel>=1 )
     CHECK_MALLOC(iam, "Enter psgssvx()");
 #endif
diff --git a/SRC/psgssvx3d.c b/SRC/psgssvx3d.c
index caef481b..2f00ca73 100644
--- a/SRC/psgssvx3d.c
+++ b/SRC/psgssvx3d.c
@@ -582,14 +582,6 @@ psgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
     
     /* Initialization. */
 
-#ifdef GPU_ACC
-    /* Binding each MPI to a CUDA device */
-    int devs;
-    // MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-    cudaGetDeviceCount(&devs); // Returns the number of compute-capable devices
-    cudaSetDevice(iam % devs); // Set device to be used for GPU executions
-    ////
-#endif
 
     options->Algo3d = YES;
 	
diff --git a/SRC/pzgssvx.c b/SRC/pzgssvx.c
index 606bd676..5decae78 100644
--- a/SRC/pzgssvx.c
+++ b/SRC/pzgssvx.c
@@ -638,15 +638,6 @@ pzgssvx(superlu_dist_options_t *options, SuperMatrix *A,
     C = ScalePermstruct->C;
     /********/
 
-#ifdef GPU_ACC
-    /* Binding each MPI to a CUDA device */
-    int devs;
-    // MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-    cudaGetDeviceCount(&devs); // Returns the number of compute-capable devices
-    cudaSetDevice(iam % devs); // Set device to be used for GPU executions
-    ////
-#endif
-
 #if ( DEBUGlevel>=1 )
     CHECK_MALLOC(iam, "Enter pzgssvx()");
 #endif
diff --git a/SRC/pzgssvx3d.c b/SRC/pzgssvx3d.c
index 05fdf000..7c481faf 100644
--- a/SRC/pzgssvx3d.c
+++ b/SRC/pzgssvx3d.c
@@ -581,14 +581,6 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
     
     /* Initialization. */
 
-#ifdef GPU_ACC
-    /* Binding each MPI to a CUDA device */
-    int devs;
-    // MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-    cudaGetDeviceCount(&devs); // Returns the number of compute-capable devices
-    cudaSetDevice(iam % devs); // Set device to be used for GPU executions
-    ////
-#endif
 
     options->Algo3d = YES;
 	
diff --git a/SRC/snrformat_loc3d.c b/SRC/snrformat_loc3d.c
index 7e932f77..ea63f95f 100644
--- a/SRC/snrformat_loc3d.c
+++ b/SRC/snrformat_loc3d.c
@@ -67,6 +67,7 @@ void sGatherNRformat_loc3d
     if ( Fact == DOFACT ) { /* Factorize from scratch */
 	/* A3d is output. Compute counts from scratch */
 	A3d = SUPERLU_MALLOC(sizeof(NRformat_loc3d));
+	A3d->num_procs_to_send = EMPTY; // No X(2d) -> X(3d) comm. schedule yet
 	A2d = SUPERLU_MALLOC(sizeof(NRformat_loc));
     
 	// find number of nnzs
@@ -298,7 +299,8 @@ int sScatter_B3d(NRformat_loc3d *A3d,  // modified
     int *b_disp         = A3d->b_disp;
     int *row_counts_int = A3d->row_counts_int;
     int *row_disp       = A3d->row_disp;
-    int i, p;
+    int i, j, k, p;
+    int num_procs_to_send, num_procs_to_recv; // persistent across multiple solves
     int iam = grid3d->iam;
     int rankorder = grid3d->rankorder;
     gridinfo_t *grid2d = &(grid3d->grid2d);
@@ -334,7 +336,7 @@ int sScatter_B3d(NRformat_loc3d *A3d,  // modified
 		     Btmp, nrhs * A3d->m_loc, MPI_FLOAT,
 		     0, grid3d->zscp.comm);
 
-    } else { /* Z-major in 3D grid */
+    } else { /* Z-major in 3D grid (default) */
         /*    e.g. 1x3x4 grid: layer0 layer1 layer2 layer3
 	                       0      3      6      9
  	                       1      4      7      10      
@@ -358,13 +360,181 @@ int sScatter_B3d(NRformat_loc3d *A3d,  // modified
 	       8             10
 	       11            11
 	       ----         ----
+         In the most general case, block rows of B are not of even size, so the
+	 Layer 0 partition may overlap with the 3D partition in an arbitrary manner.
+	 For example:
+	                  P0        P1        P2       P3
+             X on grid-0: |___________|__________|_________|________|
+
+	     X on 3D:     |___|____|_____|____|__|______|_____|_____|
+	                  P0  P1   P2    P3   P4   P5     P6   P7  
 	*/
-        MPI_Request recv_req;
 	MPI_Status recv_status;
 	int pxy = grid2d->nprow * grid2d->npcol;
 	int npdep = grid3d->npdep, dest, src, tag;
-	int nprocs = pxy * npdep;
+	int nprocs = pxy * npdep; // all procs in 3D grid 
+	MPI_Request *recv_reqs = (MPI_Request*) SUPERLU_MALLOC(npdep * sizeof(MPI_Request));
+	int num_procs_to_send;
+	int *procs_to_send_list;
+	int *send_count_list;
+	int num_procs_to_recv;
+	int *procs_recv_from_list;
+	int *recv_count_list;
+
+	if ( A3d->num_procs_to_send == -1 ) { /* First time: set up communication schedule */
+	    /* 1. Set up the destination processes from each source process,
+	       and the send counts.	
+	       - Only grid-0 processes need to send.
+	       - row_disp[] recorded the prefix sum of the block rows of RHS
+	       	 	    along the processes Z-dimension.
+	         row_disp[npdep] is the total number of X entries on my proc.
+	       	     (equals A2d->m_loc.)
+	         A2d->fst_row records the boundary of the partition on grid-0.
+	       - Need to compute the prefix sum of the block rows of X
+	       	 among all the processes.
+	       	 A->fst_row has this info, but is available only locally.
+	    */
+	
+	    int *m_loc_3d_counts = SUPERLU_MALLOC(nprocs * sizeof(int));
+	
+	    /* related to m_loc in 3D partition */
+	    int *x_send_counts = SUPERLU_MALLOC(nprocs * sizeof(int));
+	    int *x_recv_counts = SUPERLU_MALLOC(nprocs * sizeof(int));
+	
+	    /* The following should be persistent across multiple solves.
+	       These lists avoid All-to-All communication. */
+	    procs_to_send_list = SUPERLU_MALLOC(nprocs * sizeof(int));
+	    send_count_list = SUPERLU_MALLOC(nprocs * sizeof(int));
+	    procs_recv_from_list = SUPERLU_MALLOC(nprocs * sizeof(int));
+	    recv_count_list = SUPERLU_MALLOC(nprocs * sizeof(int));
+
+	    for (p = 0; p < nprocs; ++p) {
+		x_send_counts[p] = 0;
+		x_recv_counts[p] = 0;
+		procs_to_send_list[p] = EMPTY; // (-1)
+		procs_recv_from_list[p] = EMPTY;
+	    }
+	    
+	    /* All procs participate */
+	    MPI_Allgather(&(A3d->m_loc), 1, MPI_INT, m_loc_3d_counts, 1,
+			  MPI_INT, grid3d->comm);
+	    
+	    /* Layer 0 sets up the send info. The other layers have 0 send counts. */
+	    if (grid3d->zscp.Iam == 0) {
+		int x_fst_row = A2d->fst_row; // start from a layer 0 boundary
+		int x_end_row = A2d->fst_row + A2d->m_loc; // end of boundary + 1
+		int sum_m_loc; // prefix sum of m_loc among all processes
+		
+		/* Loop through all processes.
+		   Search for 1st X-interval in grid-0's B-interval */
+		num_procs_to_send = sum_m_loc = 0;
+		for (p = 0; p < nprocs; ++p) {
+		    
+		    sum_m_loc += m_loc_3d_counts[p];
+		    
+		    if (sum_m_loc > x_end_row) { // reach the 2D block boundary
+			x_send_counts[p] = x_end_row - x_fst_row;
+			procs_to_send_list[num_procs_to_send] = p;
+			send_count_list[num_procs_to_send] = x_send_counts[p];
+			num_procs_to_send++;
+			break;
+		    } else if (x_fst_row < sum_m_loc) {
+			x_send_counts[p] = sum_m_loc - x_fst_row;
+			procs_to_send_list[num_procs_to_send] = p;
+			send_count_list[num_procs_to_send] = x_send_counts[p];
+			num_procs_to_send++;
+			x_fst_row = sum_m_loc; //+= m_loc_3d_counts[p];
+			if (x_fst_row >= x_end_row) break;
+		    }
+		    
+		    //sum_m_loc += m_loc_3d_counts[p+1];
+		} /* end for p ... */
+	    } else { /* end layer 0 */
+		num_procs_to_send = 0;
+	    }
+	    
+	    /* 2. Set up the source processes from each destination process,
+	       and the recv counts.
+	       All processes may need to receive something from grid-0. */
+	    /* The following transposes x_send_counts matrix to
+	       x_recv_counts matrix */
+	    MPI_Alltoall(x_send_counts, 1, MPI_INT, x_recv_counts, 1, MPI_INT,
+			 grid3d->comm);
+	    
+	    j = 0; // tracking the number of procs to receive from
+	    for (p = 0; p < nprocs; ++p) {
+		if (x_recv_counts[p]) {
+		    procs_recv_from_list[j] = p;
+		    recv_count_list[j] = x_recv_counts[p];
+		    src = p;  tag = iam;
+		    ++j;
+#if 0		    
+		    printf("RECV: src %d -> iam %d, x_recv_counts[p] %d, tag %d\n",
+			   src, iam, x_recv_counts[p], tag);
+		    fflush(stdout);
+#endif		    
+		}
+	    }
+	    num_procs_to_recv = j;
+
+	    /* Persist in A3d structure */
+	    A3d->num_procs_to_send = num_procs_to_send;
+	    A3d->procs_to_send_list = procs_to_send_list;
+	    A3d->send_count_list = send_count_list;
+	    A3d->num_procs_to_recv = num_procs_to_recv;
+	    A3d->procs_recv_from_list = procs_recv_from_list;
+	    A3d->recv_count_list = recv_count_list;
+
+	    SUPERLU_FREE(m_loc_3d_counts);
+	    SUPERLU_FREE(x_send_counts);
+	    SUPERLU_FREE(x_recv_counts);
+	} else { /* Reuse the communication schedule */
+	    num_procs_to_send = A3d->num_procs_to_send;
+	    procs_to_send_list = A3d->procs_to_send_list;
+	    send_count_list = A3d->send_count_list;
+	    num_procs_to_recv = A3d->num_procs_to_recv;
+	    procs_recv_from_list = A3d->procs_recv_from_list;
+	    recv_count_list = A3d->recv_count_list;
+	}
+	
+	/* 3. Perform the actual communication */
+	    
+	/* Post irecv first */
+	i = 0; // tracking offset in the recv buffer Btmp[]
+	for (j = 0; j < num_procs_to_recv; ++j) {
+	    src = procs_recv_from_list[j];
+	    tag = iam;
+	    k = nrhs * recv_count_list[j]; // recv count
+	    MPI_Irecv( Btmp + i, k, MPI_FLOAT,
+		       src, tag, grid3d->comm, &recv_reqs[j] );
+	    i += k;
+	}
+	    
+	/* Send */
+	/* Layer 0 sends to *num_procs_to_send* procs */
+	if (grid3d->zscp.Iam == 0) {
+	    int dest, tag;
+	    for (i = 0, p = 0; p < num_procs_to_send; ++p) { 
+		dest = procs_to_send_list[p]; //p + grid2d->iam * npdep;
+		tag = dest;
+		/*printf("SEND: iam %d -> %d, send_count_list[p] %d, tag %d\n",
+		  iam,dest, send_count_list[p], tag);
+		  fflush(stdout); */
+		    
+		MPI_Send(B1 + i, nrhs * send_count_list[p], 
+			 MPI_FLOAT, dest, tag, grid3d->comm);
+		i += nrhs * send_count_list[p];
+	    }
+	}  /* end layer 0 send */
+	    
+	/* Wait for all Irecv's to complete */
+	for (i = 0; i < num_procs_to_recv; ++i)
+	    MPI_Wait(&recv_reqs[i], &recv_status);
+
+        SUPERLU_FREE(recv_reqs);
 
+	///////////	
+#if 0 // The following code works only with even block distribution of RHS 
 	/* Everyone receives one block (post non-blocking irecv) */
 	src = grid3d->iam / npdep;  // Z-major
 	tag = iam;
@@ -385,7 +555,9 @@ int sScatter_B3d(NRformat_loc3d *A3d,  // modified
     
 	/* Wait for Irecv to complete */
 	MPI_Wait(&recv_req, &recv_status);
-
+#endif
+	///////////
+	
     } /* else Z-major */
 
     // B <- colMajor(Btmp)
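
    The schedule construction above boils down to intersecting this rank's
    half-open row interval [fst_row, end_row) with the running prefix sum of
    the 3D owners' block sizes; the resulting (destination, count) pairs are
    then cached in A3d so repeated solves skip the Allgather/Alltoall. A
    minimal standalone sketch of that interval-overlap step (hypothetical
    names, plain C, no MPI -- an illustration, not the library's API):

	#include <stdio.h>

	/* Sketch: split my 2D interval [fst_row, end_row) over the 3D owners,
	 * whose consecutive block sizes are m_loc_3d[0..nprocs-1].
	 * Mirrors the scheduling loop above; returns the number of dests. */
	static int overlap_schedule(int fst_row, int end_row,
	                            const int *m_loc_3d, int nprocs,
	                            int *dest, int *count)
	{
	    int ndest = 0, sum = 0;          /* prefix sum of 3D block rows */
	    for (int p = 0; p < nprocs; ++p) {
	        sum += m_loc_3d[p];
	        if (sum > end_row) {         /* 3D block passes my right end */
	            dest[ndest] = p;  count[ndest++] = end_row - fst_row;
	            break;
	        } else if (fst_row < sum) {  /* 3D block overlaps my interval */
	            dest[ndest] = p;  count[ndest++] = sum - fst_row;
	            fst_row = sum;           /* advance to the next boundary */
	            if (fst_row >= end_row) break;
	        }
	    }
	    return ndest;
	}

	int main(void)
	{
	    /* 12 rows total; this grid-0 rank owns rows [3, 9);
	       the 3D blocks have sizes 4, 2, 3, 3. */
	    int m_loc_3d[] = {4, 2, 3, 3}, dest[4], count[4];
	    int n = overlap_schedule(3, 9, m_loc_3d, 4, dest, count);
	    for (int j = 0; j < n; ++j)
	        printf("send %d rows to proc %d\n", count[j], dest[j]);
	    return 0;
	}

    Here the rank sends 1, 2, and 3 rows to 3D procs 0, 1, and 2, matching the
    picture of arbitrarily overlapping partitions drawn in the comment above.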
diff --git a/SRC/superlu_grid.c b/SRC/superlu_grid.c
index 8aa61d79..ca0a999c 100644
--- a/SRC/superlu_grid.c
+++ b/SRC/superlu_grid.c
@@ -60,6 +60,15 @@ void superlu_gridinit(MPI_Comm Bcomm, /* The base communicator upon which
     superlu_gridmap(Bcomm, nprow, npcol, usermap, nprow, grid);
     
     SUPERLU_FREE(usermap);
+    
+#ifdef GPU_ACC
+    /* Binding each MPI to a CUDA device */
+    int devs, rank;
+    MPI_Comm_rank(Bcomm, &rank); // MPI_COMM_WORLD??
+    cudaGetDeviceCount(&devs);  // Returns the number of compute-capable devices
+    cudaSetDevice(rank % devs); // Set device to be used for GPU executions
+    ////
+#endif
 }
 
 
diff --git a/SRC/superlu_grid3d.c b/SRC/superlu_grid3d.c
index a19b6bf6..aec8d53d 100644
--- a/SRC/superlu_grid3d.c
+++ b/SRC/superlu_grid3d.c
@@ -45,6 +45,16 @@ void superlu_gridinit3d(MPI_Comm Bcomm, /* The base communicator upon which
     superlu_gridmap3d(Bcomm, nprow, npcol, npdep, grid);
 
     // SUPERLU_FREE(usermap);
+    
+#ifdef GPU_ACC
+    /* Binding each MPI to a CUDA device */
+    int devs, rank;
+    MPI_Comm_rank(Bcomm, &rank); // MPI_COMM_WORLD??
+    cudaGetDeviceCount(&devs);  // Returns the number of compute-capable devices
+    cudaSetDevice(rank % devs); // Set device to be used for GPU executions
+    ////
+#endif
+    
 }
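
    Both grid constructors now do the device binding the same way: rank modulo
    the visible device count, with the rank taken from the caller's base
    communicator rather than a hard-wired MPI_COMM_WORLD. A self-contained
    sketch of the pattern (assumes CUDA and MPI are available; the error check
    is an illustrative addition, not part of the patch):

	#include <mpi.h>
	#include <cuda_runtime.h>

	/* Round-robin binding of MPI ranks to CUDA devices, as done inside
	 * superlu_gridinit()/superlu_gridinit3d() above. */
	static void bind_rank_to_device(MPI_Comm comm)
	{
	    int rank, devs;
	    MPI_Comm_rank(comm, &rank);
	    if (cudaGetDeviceCount(&devs) == cudaSuccess && devs > 0)
	        cudaSetDevice(rank % devs); /* node-local ranks cycle over GPUs */
	}

    Doing this once inside grid creation replaces the per-driver copies of the
    same block that the earlier hunks deleted from the p?gssvx*.c drivers.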
 
 
diff --git a/SRC/znrformat_loc3d.c b/SRC/znrformat_loc3d.c
index 56a28e8d..6f8f535e 100644
--- a/SRC/znrformat_loc3d.c
+++ b/SRC/znrformat_loc3d.c
@@ -22,7 +22,6 @@ at the top-level directory.
  */
 
 #include "superlu_zdefs.h"
-#include 
 
 /* Dst <- BlockByBlock (Src), reshape the block storage. */
 static void matCopy(int n, int m, doublecomplex *Dst, int lddst, doublecomplex *Src, int ldsrc)
@@ -111,7 +110,6 @@ void zGatherNRformat_loc3d
 		A2d->nzval = doublecomplexMalloc_dist(nnz_disp[grid3d->npdep]);
 		A2d->rowptr = intMalloc_dist((row_disp[grid3d->npdep] + 1));
 		A2d->rowptr[0] = 0;
-		printf(" Gather layer-0: iam %d\n", grid3d->iam); fflush(stdout);
 	    }
 
 	MPI_Gatherv(A->nzval, A->nnz_loc, SuperLU_MPI_DOUBLE_COMPLEX, A2d->nzval,
@@ -344,7 +342,7 @@ int zScatter_B3d(NRformat_loc3d *A3d,  // modified
 	                       2      5      8      11
 	  GATHER:  {A, B} in A * X = B
 	  layer-0:
-	  B (row space)  X (column space)  SCATTER
+    	       B (row space)  X (column space)  SCATTER
 	       ----           ----        ---->>
            P0  0              0
 (equations     3              1      Proc 0 -> Procs {0, 1, 2, 3}
@@ -361,15 +359,14 @@ int zScatter_B3d(NRformat_loc3d *A3d,  // modified
 	       8             10
 	       11            11
 	       ----         ----
-
         In the most general case, block rows of B are not of even size, so the
	 Layer 0 partition may overlap with the 3D partition in an arbitrary manner.
 	 For example:
 	                  P0        P1        P2       P3
-             X on grid-0: |-----------|---------|---------|-----------|
+             X on grid-0: |___________|__________|_________|________|
 
-	     X on 3D:     |___|____|_____|_______|____|______|____|___|
-	                  P0  P1   P2    P3      P4   P5     P6   P7  P8
+	     X on 3D:     |___|____|_____|____|__|______|_____|_____|
+	                  P0  P1   P2    P3   P4   P5     P6   P7  
 	*/
 	MPI_Status recv_status;
 	int pxy = grid2d->nprow * grid2d->npcol;
@@ -388,13 +385,13 @@ int zScatter_B3d(NRformat_loc3d *A3d,  // modified
 	       and the send counts.	
 	       - Only grid-0 processes need to send.
 	       - row_disp[] recorded the prefix sum of the block rows of RHS
-	       along the processes Z-dimension.
-	       row_disp[npdep] is the total number of X entries on my proc.
-	       (equals A2d->m_loc.)
-	       A2d->fst_row records the boundary of the partition on grid-0.
+	       	 	    along the processes Z-dimension.
+	         row_disp[npdep] is the total number of X entries on my proc.
+	       	     (equals A2d->m_loc.)
+	         A2d->fst_row records the boundary of the partition on grid-0.
 	       - Need to compute the prefix sum of the block rows of X
-	       among all the processes.
-	       A->fst_row has this info, but is available only locally.
+	       	 among all the processes.
+	       	 A->fst_row has this info, but is available only locally.
 	    */
 	
 	    int *m_loc_3d_counts = SUPERLU_MALLOC(nprocs * sizeof(int));
@@ -470,7 +467,7 @@ int zScatter_B3d(NRformat_loc3d *A3d,  // modified
 		    recv_count_list[j] = x_recv_counts[p];
 		    src = p;  tag = iam;
 		    ++j;
-#if 1
+#if 0		    
 		    printf("RECV: src %d -> iam %d, x_recv_counts[p] %d, tag %d\n",
 			   src, iam, x_recv_counts[p], tag);
 		    fflush(stdout);
@@ -532,13 +529,14 @@ int zScatter_B3d(NRformat_loc3d *A3d,  // modified
 	/* Wait for all Irecv's to complete */
 	for (i = 0; i < num_procs_to_recv; ++i)
 	    MPI_Wait(&recv_reqs[i], &recv_status);
-	    
+
+        SUPERLU_FREE(recv_reqs);
+
 	///////////	
 #if 0 // The following code works only with even block distribution of RHS 
 	/* Everyone receives one block (post non-blocking irecv) */
 	src = grid3d->iam / npdep;  // Z-major
 	tag = iam;
-	
 	MPI_Irecv(Btmp, nrhs * A3d->m_loc, SuperLU_MPI_DOUBLE_COMPLEX,
 		 src, tag, grid3d->comm, &recv_req);
 
@@ -553,11 +551,11 @@ int zScatter_B3d(NRformat_loc3d *A3d,  // modified
 			 SuperLU_MPI_DOUBLE_COMPLEX, dest, tag, grid3d->comm);
 	    }
 	}  /* end layer 0 send */
-	
+    
 	/* Wait for Irecv to complete */
 	MPI_Wait(&recv_req, &recv_status);
 #endif
-	///////////	
+	///////////
 	
     } /* else Z-major */
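
    Step 3 in both precisions follows the standard deadlock-avoiding shape:
    post every nonblocking receive first, then issue the blocking sends from
    layer 0, then wait. A stripped-down sketch of that exchange (hypothetical
    helper; buffers and counts as produced by the cached schedule above):

	#include <stdlib.h>
	#include <mpi.h>

	/* Irecv all, send all, wait all -- the step-3 pattern above.
	 * Recv tags are my rank; send tags are the destination rank. */
	static void exchange(const float *sendbuf, int nsend,
	                     const int *dest, const int *scount,
	                     float *recvbuf, int nrecv,
	                     const int *src, const int *rcount, MPI_Comm comm)
	{
	    MPI_Request *reqs = malloc(nrecv * sizeof(MPI_Request));
	    int me;
	    MPI_Comm_rank(comm, &me);

	    /* Post all nonblocking receives first, packing back-to-back. */
	    for (int j = 0, off = 0; j < nrecv; off += rcount[j], ++j)
	        MPI_Irecv(recvbuf + off, rcount[j], MPI_FLOAT,
	                  src[j], me, comm, &reqs[j]);

	    /* Then the blocking sends (only layer 0 has nsend > 0). */
	    for (int p = 0, off = 0; p < nsend; off += scount[p], ++p)
	        MPI_Send(sendbuf + off, scount[p], MPI_FLOAT,
	                 dest[p], dest[p], comm);

	    MPI_Waitall(nrecv, reqs, MPI_STATUSES_IGNORE);
	    free(reqs);
	}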
 

From 82cb8b975f1cbddb59d0061ea847ff1e6ea8e7a3 Mon Sep 17 00:00:00 2001
From: Sherry Li 
Date: Wed, 29 Sep 2021 22:36:13 -0700
Subject: [PATCH 128/147] Added the missing '#ifdef _OPENMP' guards around a
 number of '#pragma omp' pragmas in the 3D code.

---
 EXAMPLE/zcreate_matrix3d.c |  4 ++--
 SRC/dgather.c              |  9 +++++++--
 SRC/dtreeFactorization.c   | 25 ++++++++++++++++++++-----
 SRC/dtrfCommWrapper.c      | 16 +++++++++++-----
 SRC/pdgstrf2.c             | 10 ++++++++--
 SRC/psgstrf2.c             | 10 ++++++++--
 SRC/pzgstrf2.c             | 10 ++++++++--
 SRC/sgather.c              |  4 ++++
 SRC/streeFactorization.c   | 20 +++++++++++++++++---
 SRC/strfCommWrapper.c      | 16 +++++++++++-----
 SRC/zgather.c              |  9 +++++++--
 SRC/ztreeFactorization.c   | 25 ++++++++++++++++++++-----
 SRC/ztrfCommWrapper.c      | 16 +++++++++++-----
 13 files changed, 134 insertions(+), 40 deletions(-)
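
    The recurring change in this patch is twofold: every bare '#pragma omp' is
    wrapped in an '#ifdef _OPENMP' guard so that non-OpenMP builds still
    compile, and thread ids are fetched as plain int, matching the return type
    of omp_get_thread_num() (previously they were int_t). A minimal sketch of
    the guarded idiom (hypothetical loop body, not library code):

	#ifdef _OPENMP
	#include <omp.h>
	#endif

	void update_blocks(int nblocks)
	{
	#ifdef _OPENMP
	#pragma omp parallel for schedule(dynamic)
	#endif
	    for (int b = 0; b < nblocks; ++b) {
	#ifdef _OPENMP
	        int thread_id = omp_get_thread_num(); /* int, per the OpenMP API */
	#else
	        int thread_id = 0;                    /* serial fallback */
	#endif
	        /* ... per-thread work indexed by thread_id ... */
	        (void) thread_id;
	    }
	}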

diff --git a/EXAMPLE/zcreate_matrix3d.c b/EXAMPLE/zcreate_matrix3d.c
index 45f4fbf2..b3c43ffd 100644
--- a/EXAMPLE/zcreate_matrix3d.c
+++ b/EXAMPLE/zcreate_matrix3d.c
@@ -352,7 +352,7 @@ int zcreate_matrix_postfix3d(SuperMatrix *A, int nrhs, doublecomplex **rhs,
         if (iam == (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1)) /* last proc. gets all*/
             m_loc = m - m_loc * (grid3d->nprow * grid3d->npcol* grid3d->npdep - 1);
     }
- 
+
     /* Create compressed column matrix for GA. */
     zCreate_CompCol_Matrix_dist(&GA, m, n, nnz, nzval, rowind, colptr,
                                 SLU_NC, SLU_D, SLU_GE);
@@ -379,7 +379,7 @@ int zcreate_matrix_postfix3d(SuperMatrix *A, int nrhs, doublecomplex **rhs,
         for (j = colptr[i]; j < colptr[i + 1]; ++j) ++marker[rowind[j]];
     /* Set up row pointers */
     rowptr[0] = 0;
-//    fst_row = iam * m_loc_fst;
+    fst_row = iam * m_loc_fst;
     nnz_loc = 0;
     for (j = 0; j < m_loc; ++j)
     {
diff --git a/SRC/dgather.c b/SRC/dgather.c
index f573b6be..29bb1914 100644
--- a/SRC/dgather.c
+++ b/SRC/dgather.c
@@ -15,8 +15,9 @@ at the top-level directory.
  *
  * 
  * -- Distributed SuperLU routine (version 7.0) --
- * Lawrence Berkeley National Lab, Georgia Institute of Technology.
- * May 10, 2019
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
  */
 #include 
 #include "superlu_ddefs.h"
@@ -52,7 +53,9 @@ void dgather_u(int_t num_u_blks,
     // jj, i)
     double zero = 0.0;
 
+#ifdef _OPENMP    
 #pragma omp parallel for default (shared) schedule(dynamic)
+#endif
     for (int_t j = 0; j < num_u_blks; ++j)
     {
         double *tempu;
@@ -96,7 +99,9 @@ void dgather_l( int_t num_LBlk, int_t knsupc,
     }
 
     int_t LD_LBuff = L_info[num_LBlk - 1].FullRow;  /*leading dimension of buffer*/
+#ifdef _OPENMP    
 #pragma omp parallel for
+#endif
     for (int_t i = 0; i < num_LBlk; ++i)
     {
         int_t StRowDest  = 0;
diff --git a/SRC/dtreeFactorization.c b/SRC/dtreeFactorization.c
index 562b9b18..85b70ca7 100644
--- a/SRC/dtreeFactorization.c
+++ b/SRC/dtreeFactorization.c
@@ -15,8 +15,9 @@ at the top-level directory.
  *
  * 
  * -- Distributed SuperLU routine (version 7.0) --
- * Lawrence Berkeley National Lab, Georgia Institute of Technology.
- * May 10, 2019
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
  */
 #include "superlu_ddefs.h"
 #if 0
@@ -227,7 +228,9 @@ int_t ddenseTreeFactor(
             double* bigV = scuBufs->bigV;
             double* bigU = scuBufs->bigU;
 
+#ifdef _OPENMP    
 #pragma omp parallel for schedule(dynamic)
+#endif
             for (int_t ij = 0; ij < nub * nlb; ++ij)
             {
                 /* code */
@@ -249,9 +252,9 @@ int_t ddenseTreeFactor(
                 int_t *lsub = lPanelInfo->lsub;
                 int_t *usub = uPanelInfo->usub;
 #ifdef _OPENMP		
-                int_t thread_id = omp_get_thread_num();
+                int thread_id = omp_get_thread_num();
 #else		
-                int_t thread_id = 0;
+                int thread_id = 0;
 #endif		
                 dblock_gemm_scatter( lb, ub,
                                     Ublock_info,
@@ -500,10 +503,14 @@ int_t dsparseTreeFactor_ASYNC(
             int_t klst = FstBlockC (k + 1);
 
             double* bigV = scuBufs->bigV;
-
+	    
+#ifdef _OPENMP    
 #pragma omp parallel
+#endif
             {
+#ifdef _OPENMP    
 #pragma omp for schedule(dynamic,2) nowait
+#endif
 		/* Each thread is assigned one loop index ij, responsible for
 		   block update L(lb,k) * U(k,j) -> tempv[]. */
                 for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks; ++ij)
@@ -519,7 +526,9 @@ int_t dsparseTreeFactor_ASYNC(
 					       LUstruct, grid, SCT, stat );
                 }
 
+#ifdef _OPENMP    
 #pragma omp for schedule(dynamic,2) nowait
+#endif
                 for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks_Phi; ++ij)
                 {
                     int_t j   = ij / HyP->lookAheadBlk ;
@@ -529,7 +538,9 @@ int_t dsparseTreeFactor_ASYNC(
 						LUstruct, grid, SCT, stat);
                 }
 
+#ifdef _OPENMP    
 #pragma omp for schedule(dynamic,2) nowait
+#endif
                 for (int_t ij = 0; ij < HyP->RemainBlk * HyP->num_u_blks; ++ij) //
                 {
                     int_t j   = ij / HyP->RemainBlk;
@@ -571,9 +582,13 @@ int_t dsparseTreeFactor_ASYNC(
                 }
             }
 
+#ifdef _OPENMP    
 #pragma omp parallel
+#endif
             {
+#ifdef _OPENMP    
 #pragma omp for schedule(dynamic,2) nowait
+#endif
                 for (int_t ij = 0; ij < HyP->RemainBlk * (HyP->num_u_blks_Phi - jj_cpu) ; ++ij)
                 {
                     int_t j   = ij / HyP->RemainBlk + jj_cpu;
diff --git a/SRC/dtrfCommWrapper.c b/SRC/dtrfCommWrapper.c
index 1ff15bdc..b7531604 100644
--- a/SRC/dtrfCommWrapper.c
+++ b/SRC/dtrfCommWrapper.c
@@ -164,7 +164,9 @@ int_t dLPanelTrSolve( int_t k,   int_t* factored_L,
 #define BL  32
         for (int i = 0; i < CEILING(l, BL); ++i)
         {
+#ifdef _OPENMP    
             #pragma omp task
+#endif	    
             {
                 int_t off = i * BL;
                 // Sherry: int_t len = MY_MIN(BL, l - i * BL);
@@ -203,7 +205,7 @@ int_t dLPanelTrSolve( int_t k,   int_t* factored_L,
             int_t off = i * BL;
             // Sherry: int_t len = MY_MIN(BL, l - i * BL);
             int len = SUPERLU_MIN(BL, (l - i * BL));
-#pragma omp task
+//#pragma omp task
             {
                 superlu_dtrsm("R", "U", "N", "N", len, nsupc, alpha,
 			      ublk_ptr, ld_ujrow, &lusup[nsupc + off], nsupr);
@@ -272,12 +274,14 @@ int_t dUPanelTrSolve( int_t k,
         // #pragma omp for schedule(dynamic,2) nowait
         for (int_t b = 0; b < nb; ++b)
         {
+#ifdef _OPENMP    
             #pragma omp task
+#endif
             {
 #ifdef _OPENMP	    
-                int_t thread_id = omp_get_thread_num();
+                int thread_id = omp_get_thread_num();
 #else		
-                int_t thread_id = 0;
+                int thread_id = 0;
 #endif		
                 double *tempv = bigV +  thread_id * ldt * ldt;
                 dTrs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp,
@@ -323,12 +327,14 @@ int_t dUPanelTrSolve( int_t k,
             // printf("%d :U update \n", k);
             for (int_t b = 0; b < nb; ++b)
             {
+#ifdef _OPENMP    
                 #pragma omp task
+#endif
                 {
 #ifdef _OPENMP		
-                    int_t thread_id = omp_get_thread_num();
+                    int thread_id = omp_get_thread_num();
 #else		    
-                    int_t thread_id = 0;
+                    int thread_id = 0;
 #endif		    
                     double *tempv = bigV +  thread_id * ldt * ldt;
                     dTrs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp,
diff --git a/SRC/pdgstrf2.c b/SRC/pdgstrf2.c
index 70836770..5bb748c0 100644
--- a/SRC/pdgstrf2.c
+++ b/SRC/pdgstrf2.c
@@ -375,7 +375,9 @@ int_t LpanelUpdate(int off0,  int nsupc, double* ublk_ptr, int ld_ujrow,
     double t1 = SuperLU_timer_();
 
 #define GT  32
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
     for (int i = 0; i < CEILING(l, GT); ++i)
     {
         int_t off = i * GT;
@@ -814,8 +816,10 @@ void pdgstrs2_omp
 
     // Sherry: this version is more NUMA friendly compared to pdgstrf2_v2.c
     // https://stackoverflow.com/questions/13065943/task-based-programming-pragma-omp-task-versus-pragma-omp-parallel-for
+#ifdef _OPENMP
 #pragma omp parallel for schedule(static) default(shared) \
     private(b,j,iukp,rukp,segsize)
+#endif
     /* Loop through all the blocks in the row. */
     for (b = 0; b < nb; ++b) {
 #ifdef USE_Ublock_info
@@ -902,13 +906,15 @@ void pdgstrs2_omp(int_t k0, int_t k, int_t* Lsub_buf,
     Trs2_InitUbloc_info(klst, nb, Ublock_info, usub, Glu_persist, stat );
 
     /* Loop through all the row blocks. */
+#ifdef _OPENMP    
 #pragma omp parallel for schedule(dynamic,2)
+#endif
     for (int_t b = 0; b < nb; ++b)
     {
 #ifdef _OPENMP    
-        int_t thread_id = omp_get_thread_num();
+        int thread_id = omp_get_thread_num();
 #else	
-        int_t thread_id = 0;
+        int thread_id = 0;
 #endif	
         double *tempv = bigV +  thread_id * ldt * ldt;
         dTrs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp,
diff --git a/SRC/psgstrf2.c b/SRC/psgstrf2.c
index 5d0a24c1..e5e7a48f 100644
--- a/SRC/psgstrf2.c
+++ b/SRC/psgstrf2.c
@@ -375,7 +375,9 @@ int_t LpanelUpdate(int off0,  int nsupc, float* ublk_ptr, int ld_ujrow,
     double t1 = SuperLU_timer_();
 
 #define GT  32
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
     for (int i = 0; i < CEILING(l, GT); ++i)
     {
         int_t off = i * GT;
@@ -814,8 +816,10 @@ void psgstrs2_omp
 
     // Sherry: this version is more NUMA friendly compared to pdgstrf2_v2.c
     // https://stackoverflow.com/questions/13065943/task-based-programming-pragma-omp-task-versus-pragma-omp-parallel-for
+#ifdef _OPENMP
 #pragma omp parallel for schedule(static) default(shared) \
     private(b,j,iukp,rukp,segsize)
+#endif
     /* Loop through all the blocks in the row. */
     for (b = 0; b < nb; ++b) {
 #ifdef USE_Ublock_info
@@ -902,13 +906,15 @@ void psgstrs2_omp(int_t k0, int_t k, int_t* Lsub_buf,
     Trs2_InitUbloc_info(klst, nb, Ublock_info, usub, Glu_persist, stat );
 
     /* Loop through all the row blocks. */
+#ifdef _OPENMP    
 #pragma omp parallel for schedule(dynamic,2)
+#endif
     for (int_t b = 0; b < nb; ++b)
     {
 #ifdef _OPENMP    
-        int_t thread_id = omp_get_thread_num();
+        int thread_id = omp_get_thread_num();
 #else	
-        int_t thread_id = 0;
+        int thread_id = 0;
 #endif	
         float *tempv = bigV +  thread_id * ldt * ldt;
         sTrs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp,
diff --git a/SRC/pzgstrf2.c b/SRC/pzgstrf2.c
index 5f2eaa21..45075bc8 100644
--- a/SRC/pzgstrf2.c
+++ b/SRC/pzgstrf2.c
@@ -375,7 +375,9 @@ int_t LpanelUpdate(int off0,  int nsupc, doublecomplex* ublk_ptr, int ld_ujrow,
     double t1 = SuperLU_timer_();
 
 #define GT  32
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
     for (int i = 0; i < CEILING(l, GT); ++i)
     {
         int_t off = i * GT;
@@ -815,8 +817,10 @@ void pzgstrs2_omp
 
     // Sherry: this version is more NUMA friendly compared to pdgstrf2_v2.c
     // https://stackoverflow.com/questions/13065943/task-based-programming-pragma-omp-task-versus-pragma-omp-parallel-for
+#ifdef _OPENMP
 #pragma omp parallel for schedule(static) default(shared) \
     private(b,j,iukp,rukp,segsize)
+#endif
     /* Loop through all the blocks in the row. */
     for (b = 0; b < nb; ++b) {
 #ifdef USE_Ublock_info
@@ -903,13 +907,15 @@ void pzgstrs2_omp(int_t k0, int_t k, int_t* Lsub_buf,
     Trs2_InitUbloc_info(klst, nb, Ublock_info, usub, Glu_persist, stat );
 
     /* Loop through all the row blocks. */
+#ifdef _OPENMP    
 #pragma omp parallel for schedule(dynamic,2)
+#endif
     for (int_t b = 0; b < nb; ++b)
     {
 #ifdef _OPENMP    
-        int_t thread_id = omp_get_thread_num();
+        int thread_id = omp_get_thread_num();
 #else	
-        int_t thread_id = 0;
+        int thread_id = 0;
 #endif	
         doublecomplex *tempv = bigV +  thread_id * ldt * ldt;
         zTrs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp,
diff --git a/SRC/sgather.c b/SRC/sgather.c
index 3a3f29b3..da29ccb4 100644
--- a/SRC/sgather.c
+++ b/SRC/sgather.c
@@ -53,7 +53,9 @@ void sgather_u(int_t num_u_blks,
     // jj, i)
     double zero = 0.0;
 
+#ifdef _OPENMP    
 #pragma omp parallel for default (shared) schedule(dynamic)
+#endif
     for (int_t j = 0; j < num_u_blks; ++j)
     {
         float *tempu;
@@ -97,7 +99,9 @@ void sgather_l( int_t num_LBlk, int_t knsupc,
     }
 
     int_t LD_LBuff = L_info[num_LBlk - 1].FullRow;  /*leading dimension of buffer*/
+#ifdef _OPENMP    
 #pragma omp parallel for
+#endif
     for (int_t i = 0; i < num_LBlk; ++i)
     {
         int_t StRowDest  = 0;
diff --git a/SRC/streeFactorization.c b/SRC/streeFactorization.c
index e56bca43..c97f3669 100644
--- a/SRC/streeFactorization.c
+++ b/SRC/streeFactorization.c
@@ -228,7 +228,9 @@ int_t sdenseTreeFactor(
             float* bigV = scuBufs->bigV;
             float* bigU = scuBufs->bigU;
 
+#ifdef _OPENMP    
 #pragma omp parallel for schedule(dynamic)
+#endif
             for (int_t ij = 0; ij < nub * nlb; ++ij)
             {
                 /* code */
@@ -250,9 +252,9 @@ int_t sdenseTreeFactor(
                 int_t *lsub = lPanelInfo->lsub;
                 int_t *usub = uPanelInfo->usub;
 #ifdef _OPENMP		
-                int_t thread_id = omp_get_thread_num();
+                int thread_id = omp_get_thread_num();
 #else		
-                int_t thread_id = 0;
+                int thread_id = 0;
 #endif		
                 sblock_gemm_scatter( lb, ub,
                                     Ublock_info,
@@ -501,10 +503,14 @@ int_t ssparseTreeFactor_ASYNC(
             int_t klst = FstBlockC (k + 1);
 
             float* bigV = scuBufs->bigV;
-
+	    
+#ifdef _OPENMP    
 #pragma omp parallel
+#endif
             {
+#ifdef _OPENMP    
 #pragma omp for schedule(dynamic,2) nowait
+#endif
 		/* Each thread is assigned one loop index ij, responsible for
 		   block update L(lb,k) * U(k,j) -> tempv[]. */
                 for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks; ++ij)
@@ -520,7 +526,9 @@ int_t ssparseTreeFactor_ASYNC(
 					       LUstruct, grid, SCT, stat );
                 }
 
+#ifdef _OPENMP    
 #pragma omp for schedule(dynamic,2) nowait
+#endif
                 for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks_Phi; ++ij)
                 {
                     int_t j   = ij / HyP->lookAheadBlk ;
@@ -530,7 +538,9 @@ int_t ssparseTreeFactor_ASYNC(
 						LUstruct, grid, SCT, stat);
                 }
 
+#ifdef _OPENMP    
 #pragma omp for schedule(dynamic,2) nowait
+#endif
                 for (int_t ij = 0; ij < HyP->RemainBlk * HyP->num_u_blks; ++ij) //
                 {
                     int_t j   = ij / HyP->RemainBlk;
@@ -572,9 +582,13 @@ int_t ssparseTreeFactor_ASYNC(
                 }
             }
 
+#ifdef _OPENMP    
 #pragma omp parallel
+#endif
             {
+#ifdef _OPENMP    
 #pragma omp for schedule(dynamic,2) nowait
+#endif
                 for (int_t ij = 0; ij < HyP->RemainBlk * (HyP->num_u_blks_Phi - jj_cpu) ; ++ij)
                 {
                     int_t j   = ij / HyP->RemainBlk + jj_cpu;
diff --git a/SRC/strfCommWrapper.c b/SRC/strfCommWrapper.c
index b5126c60..ca7feed6 100644
--- a/SRC/strfCommWrapper.c
+++ b/SRC/strfCommWrapper.c
@@ -164,7 +164,9 @@ int_t sLPanelTrSolve( int_t k,   int_t* factored_L,
 #define BL  32
         for (int i = 0; i < CEILING(l, BL); ++i)
         {
+#ifdef _OPENMP    
             #pragma omp task
+#endif	    
             {
                 int_t off = i * BL;
                 // Sherry: int_t len = MY_MIN(BL, l - i * BL);
@@ -203,7 +205,7 @@ int_t sLPanelTrSolve( int_t k,   int_t* factored_L,
             int_t off = i * BL;
             // Sherry: int_t len = MY_MIN(BL, l - i * BL);
             int len = SUPERLU_MIN(BL, (l - i * BL));
-#pragma omp task
+//#pragma omp task
             {
                 superlu_strsm("R", "U", "N", "N", len, nsupc, alpha,
 			      ublk_ptr, ld_ujrow, &lusup[nsupc + off], nsupr);
@@ -272,12 +274,14 @@ int_t sUPanelTrSolve( int_t k,
         // #pragma omp for schedule(dynamic,2) nowait
         for (int_t b = 0; b < nb; ++b)
         {
+#ifdef _OPENMP    
             #pragma omp task
+#endif
             {
 #ifdef _OPENMP	    
-                int_t thread_id = omp_get_thread_num();
+                int thread_id = omp_get_thread_num();
 #else		
-                int_t thread_id = 0;
+                int thread_id = 0;
 #endif		
                 float *tempv = bigV +  thread_id * ldt * ldt;
                 sTrs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp,
@@ -323,12 +327,14 @@ int_t sUPanelTrSolve( int_t k,
             // printf("%d :U update \n", k);
             for (int_t b = 0; b < nb; ++b)
             {
+#ifdef _OPENMP    
                 #pragma omp task
+#endif
                 {
 #ifdef _OPENMP		
-                    int_t thread_id = omp_get_thread_num();
+                    int thread_id = omp_get_thread_num();
 #else		    
-                    int_t thread_id = 0;
+                    int thread_id = 0;
 #endif		    
                     float *tempv = bigV +  thread_id * ldt * ldt;
                     sTrs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp,
diff --git a/SRC/zgather.c b/SRC/zgather.c
index 96e01cfa..3cfa60ae 100644
--- a/SRC/zgather.c
+++ b/SRC/zgather.c
@@ -14,8 +14,9 @@ at the top-level directory.
  *
  * 
  * -- Distributed SuperLU routine (version 7.0) --
- * Lawrence Berkeley National Lab, Georgia Institute of Technology.
- * May 10, 2019
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
  */
 #include 
 #include "superlu_zdefs.h"
@@ -51,7 +52,9 @@ void zgather_u(int_t num_u_blks,
     // jj, i)
     doublecomplex zero = {0.0, 0.0};
 
+#ifdef _OPENMP    
 #pragma omp parallel for default (shared) schedule(dynamic)
+#endif
     for (int_t j = 0; j < num_u_blks; ++j)
     {
         doublecomplex *tempu;
@@ -95,7 +98,9 @@ void zgather_l( int_t num_LBlk, int_t knsupc,
     }
 
     int_t LD_LBuff = L_info[num_LBlk - 1].FullRow;  /*leading dimension of buffer*/
+#ifdef _OPENMP    
 #pragma omp parallel for
+#endif
     for (int_t i = 0; i < num_LBlk; ++i)
     {
         int_t StRowDest  = 0;
diff --git a/SRC/ztreeFactorization.c b/SRC/ztreeFactorization.c
index 5f401d1c..df1a36e4 100644
--- a/SRC/ztreeFactorization.c
+++ b/SRC/ztreeFactorization.c
@@ -14,8 +14,9 @@ at the top-level directory.
  *
  * 
  * -- Distributed SuperLU routine (version 7.0) --
- * Lawrence Berkeley National Lab, Georgia Institute of Technology.
- * May 10, 2019
+ * Lawrence Berkeley National Lab, Georgia Institute of Technology,
+ * Oak Ridge National Lab
+ * May 12, 2021
  */
 #include "superlu_zdefs.h"
 #if 0
@@ -226,7 +227,9 @@ int_t zdenseTreeFactor(
             doublecomplex* bigV = scuBufs->bigV;
             doublecomplex* bigU = scuBufs->bigU;
 
+#ifdef _OPENMP    
 #pragma omp parallel for schedule(dynamic)
+#endif
             for (int_t ij = 0; ij < nub * nlb; ++ij)
             {
                 /* code */
@@ -248,9 +251,9 @@ int_t zdenseTreeFactor(
                 int_t *lsub = lPanelInfo->lsub;
                 int_t *usub = uPanelInfo->usub;
 #ifdef _OPENMP		
-                int_t thread_id = omp_get_thread_num();
+                int thread_id = omp_get_thread_num();
 #else		
-                int_t thread_id = 0;
+                int thread_id = 0;
 #endif		
                 zblock_gemm_scatter( lb, ub,
                                     Ublock_info,
@@ -499,10 +502,14 @@ int_t zsparseTreeFactor_ASYNC(
             int_t klst = FstBlockC (k + 1);
 
             doublecomplex* bigV = scuBufs->bigV;
-
+	    
+#ifdef _OPENMP    
 #pragma omp parallel
+#endif
             {
+#ifdef _OPENMP    
 #pragma omp for schedule(dynamic,2) nowait
+#endif
 		/* Each thread is assigned one loop index ij, responsible for
 		   block update L(lb,k) * U(k,j) -> tempv[]. */
                 for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks; ++ij)
@@ -518,7 +525,9 @@ int_t zsparseTreeFactor_ASYNC(
 					       LUstruct, grid, SCT, stat );
                 }
 
+#ifdef _OPENMP    
 #pragma omp for schedule(dynamic,2) nowait
+#endif
                 for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks_Phi; ++ij)
                 {
                     int_t j   = ij / HyP->lookAheadBlk ;
@@ -528,7 +537,9 @@ int_t zsparseTreeFactor_ASYNC(
 						LUstruct, grid, SCT, stat);
                 }
 
+#ifdef _OPENMP    
 #pragma omp for schedule(dynamic,2) nowait
+#endif
                 for (int_t ij = 0; ij < HyP->RemainBlk * HyP->num_u_blks; ++ij) //
                 {
                     int_t j   = ij / HyP->RemainBlk;
@@ -570,9 +581,13 @@ int_t zsparseTreeFactor_ASYNC(
                 }
             }
 
+#ifdef _OPENMP    
 #pragma omp parallel
+#endif
             {
+#ifdef _OPENMP    
 #pragma omp for schedule(dynamic,2) nowait
+#endif
                 for (int_t ij = 0; ij < HyP->RemainBlk * (HyP->num_u_blks_Phi - jj_cpu) ; ++ij)
                 {
                     int_t j   = ij / HyP->RemainBlk + jj_cpu;
diff --git a/SRC/ztrfCommWrapper.c b/SRC/ztrfCommWrapper.c
index e180769e..a3897c11 100644
--- a/SRC/ztrfCommWrapper.c
+++ b/SRC/ztrfCommWrapper.c
@@ -163,7 +163,9 @@ int_t zLPanelTrSolve( int_t k,   int_t* factored_L,
 #define BL  32
         for (int i = 0; i < CEILING(l, BL); ++i)
         {
+#ifdef _OPENMP    
             #pragma omp task
+#endif	    
             {
                 int_t off = i * BL;
                 // Sherry: int_t len = MY_MIN(BL, l - i * BL);
@@ -202,7 +204,7 @@ int_t zLPanelTrSolve( int_t k,   int_t* factored_L,
             int_t off = i * BL;
             // Sherry: int_t len = MY_MIN(BL, l - i * BL);
             int len = SUPERLU_MIN(BL, (l - i * BL));
-#pragma omp task
+//#pragma omp task
             {
                 superlu_ztrsm("R", "U", "N", "N", len, nsupc, alpha,
 			      ublk_ptr, ld_ujrow, &lusup[nsupc + off], nsupr);
@@ -271,12 +273,14 @@ int_t zUPanelTrSolve( int_t k,
         // #pragma omp for schedule(dynamic,2) nowait
         for (int_t b = 0; b < nb; ++b)
         {
+#ifdef _OPENMP    
             #pragma omp task
+#endif
             {
 #ifdef _OPENMP	    
-                int_t thread_id = omp_get_thread_num();
+                int thread_id = omp_get_thread_num();
 #else		
-                int_t thread_id = 0;
+                int thread_id = 0;
 #endif		
                 doublecomplex *tempv = bigV +  thread_id * ldt * ldt;
                 zTrs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp,
@@ -322,12 +326,14 @@ int_t zUPanelTrSolve( int_t k,
             // printf("%d :U update \n", k);
             for (int_t b = 0; b < nb; ++b)
             {
+#ifdef _OPENMP    
                 #pragma omp task
+#endif
                 {
 #ifdef _OPENMP		
-                    int_t thread_id = omp_get_thread_num();
+                    int thread_id = omp_get_thread_num();
 #else		    
-                    int_t thread_id = 0;
+                    int thread_id = 0;
 #endif		    
                     doublecomplex *tempv = bigV +  thread_id * ldt * ldt;
                     zTrs2_GatherTrsmScatter(klst, Ublock_info[b].iukp, Ublock_info[b].rukp,

From 3fade52c3f5d7074ea368f3b086ef2c729d5150b Mon Sep 17 00:00:00 2001
From: Yang Liu 
Date: Mon, 4 Oct 2021 11:21:29 -0700
Subject: [PATCH 129/147] Added GitHub Actions workflow for GitHub CI tests

---
 .ci_tests.sh               | 31 ++++++++++++
 .github/workflows/test.yml | 98 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 129 insertions(+)
 create mode 100755 .ci_tests.sh
 create mode 100644 .github/workflows/test.yml

diff --git a/.ci_tests.sh b/.ci_tests.sh
new file mode 100755
index 00000000..c90da30c
--- /dev/null
+++ b/.ci_tests.sh
@@ -0,0 +1,31 @@
+#!/bin/sh
+set -e
+
+export RED="\033[31;1m"
+export BLUE="\033[34;1m"
+export ROOT_DIR="$pwd"
+printf "${BLUE} SLU; Entered tests file:\n"
+
+
+export DATA_FOLDER=$ROOT_DIR/EXAMPLE
+export EXAMPLE_FOLDER=$ROOT_DIR/build/EXAMPLE
+export TEST_FOLDER=$ROOT_DIR/build/TEST
+
+case "${TEST_NUMBER}" in
+1)  mpirun "-n" "1" "$TEST_FOLDER/pdtest" "-r" "1" "-c" "1" "-s" "1" "-b" "2" "-x" "8" "-m" "20" "-f" "$DATA_FOLDER/g20.rua" ;;
+2)  mpirun "-n" "1" "$TEST_FOLDER/pdtest" "-r" "1" "-c" "1" "-s" "3" "-b" "2" "-x" "8" "-m" "20" "-f" "$DATA_FOLDER/g20.rua" ;;
+3)  mpirun "-n" "3" "$TEST_FOLDER/pdtest" "-r" "1" "-c" "3" "-s" "1" "-b" "2" "-x" "8" "-m" "20" "-f" "$DATA_FOLDER/g20.rua" ;;
+4)  mpirun "-n" "3" "$TEST_FOLDER/pdtest" "-r" "1" "-c" "3" "-s" "3" "-b" "2" "-x" "8" "-m" "20" "-f" "$DATA_FOLDER/g20.rua" ;;
+5)  mpirun "-n" "2" "$TEST_FOLDER/pdtest" "-r" "2" "-c" "1" "-s" "1" "-b" "2" "-x" "8" "-m" "20" "-f" "$DATA_FOLDER/g20.rua" ;;
+6)  mpirun "-n" "2" "$TEST_FOLDER/pdtest" "-r" "2" "-c" "1" "-s" "3" "-b" "2" "-x" "8" "-m" "20" "-f" "$DATA_FOLDER/g20.rua" ;;
+7)  mpirun "-n" "6" "$TEST_FOLDER/pdtest" "-r" "2" "-c" "3" "-s" "1" "-b" "2" "-x" "8" "-m" "20" "-f" "$DATA_FOLDER/g20.rua" ;;
+8)  mpirun "-n" "6" "$TEST_FOLDER/pdtest" "-r" "2" "-c" "3" "-s" "3" "-b" "2" "-x" "8" "-m" "20" "-f" "$DATA_FOLDER/g20.rua" ;;
+9)  mpirun "-n" "4" "$EXAMPLE_FOLDER/pddrive1" "-r" "2" "-c" "2" "$DATA_FOLDER/big.rua" ;;
+10) mpirun "-n" "4" "$EXAMPLE_FOLDER/pddrive2" "-r" "2" "-c" "2" "$DATA_FOLDER/big.rua" ;;
+11) mpirun "-n" "4" "$EXAMPLE_FOLDER/pddrive3" "-r" "2" "-c" "2" "$DATA_FOLDER/big.rua" ;;
+12) mpirun "-n" "4" "$EXAMPLE_FOLDER/pzdrive1" "-r" "2" "-c" "2" "$DATA_FOLDER/cg20.cua" ;;
+13) mpirun "-n" "4" "$EXAMPLE_FOLDER/pzdrive2" "-r" "2" "-c" "2" "$DATA_FOLDER/cg20.cua" ;;
+14) mpirun "-n" "4" "$EXAMPLE_FOLDER/pzdrive3" "-r" "2" "-c" "2" "$DATA_FOLDER/cg20.cua" ;;
+15) mpirun "-n" "4" "$EXAMPLE_FOLDER/pddrive_ABglobal" "-r" "2" "-c" "2" "$DATA_FOLDER/big.rua" ;;
+*) printf "${RED} ###SLU: Unknown test\n" ;;
+esac
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
new file mode 100644
index 00000000..99e54458
--- /dev/null
+++ b/.github/workflows/test.yml
@@ -0,0 +1,98 @@
+name: Run Github CI. Mirror and run GitLab CI. 
+
+on: [push, pull_request]
+
+jobs:
+  test:
+    name: Build and test
+    runs-on: ubuntu-latest
+
+    strategy:
+      matrix:
+        compiler:
+          - gcc
+        test: [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v2
+
+      - name: Install dependencies
+        run: |
+          export BLUE="\033[34;1m"
+          export ROOT_DIR="$pwd"
+          mkdir -p installDir
+
+          printf "${BLUE} SLU; Installing gcc-6 via apt\n"
+          sudo apt-get update
+          sudo apt-get install build-essential software-properties-common -y
+          sudo add-apt-repository ppa:ubuntu-toolchain-r/test -y
+          sudo apt-get update
+          sudo apt-get install gcc-6 g++-6 -y
+          sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-6 60 --slave /usr/bin/g++ g++ /usr/bin/g++-6
+          export CXX="g++-6"
+          export CC="gcc-6"
+          printf "${BLUE} SLU; Done installing gcc-6 via apt\n"
+
+          printf "${BLUE} SLU; Installing gfortran via apt\n"
+          sudo apt-get install gfortran-6 -y
+          sudo update-alternatives --install /usr/bin/gfortran gfortran /usr/bin/gfortran-6 60
+          printf "${BLUE} SLU; Done installing gfortran via apt\n"
+
+          printf "${BLUE} SLU; Installing openmpi\n"
+          sudo apt-get install openmpi-bin libopenmpi-dev
+          printf "${BLUE} SLU; Done installing openmpi\n"
+
+          printf "${BLUE} SLU; Installing BLAS from apt\n"
+          sudo apt-get install libblas-dev
+          export BLAS_LIB=/usr/lib/libblas/libblas.so
+          printf "${BLUE} SLU; Done installing BLAS from apt\n"
+
+          printf "${BLUE} SLU; Installing LAPACK from apt\n"
+          sudo apt-get install liblapack-dev
+          export LAPACK_LIB=/usr/lib/liblapack.so
+          printf "${BLUE} SLU; Done installing LAPACK from apt\n"
+
+          printf "${BLUE} SLU; Installing ParMetis-4.0 from source\n"
+          cd $ROOT_DIR/installDir
+          wget http://glaros.dtc.umn.edu/gkhome/fetch/sw/parmetis/parmetis-4.0.3.tar.gz
+          tar -xf parmetis-4.0.3.tar.gz
+          cd parmetis-4.0.3/
+          mkdir -p install
+          make config shared=1 cc=mpicc cxx=mpic++ prefix=$PWD/install
+          make install > make_parmetis_install.log 2>&1
+          printf "${BLUE} SLU; Done installing ParMetis-4.0 from source\n"
+
+      - name: Install package
+        run: |
+          export BLUE="\033[34;1m"
+          printf "${BLUE} SLU; Installing superlu_dist from source\n"
+          cd $ROOT_DIR
+          rm -rf build
+          mkdir -p build
+          cd build 
+          cmake .. \
+          -DTPL_PARMETIS_INCLUDE_DIRS="$ROOT_DIR/installDir/parmetis-4.0.3/metis/include;$ROOT_DIR/installDir/parmetis-4.0.3/install/include" \
+          -DTPL_PARMETIS_LIBRARIES="$ROOT_DIR/installDir/parmetis-4.0.3/install/lib/libparmetis.so" \
+          -DCMAKE_C_FLAGS="-std=c11 -DPRNTlevel=1 -DPROFlevel=1 -DDEBUGlevel=1" \
+          -DCMAKE_CXX_FLAGS="-Ofast -std=c++11 -DAdd_ -DRELEASE" \
+          -DTPL_BLAS_LIBRARIES="$BLAS_LIB" \
+          -DTPL_LAPACK_LIBRARIES="$LAPACK_LIB" \
+          -Denable_blaslib=OFF \
+          -DBUILD_SHARED_LIBS=OFF \
+          -DCMAKE_C_COMPILER=mpicc \
+          -DCMAKE_CXX_COMPILER=mpic++ \
+          -DCMAKE_INSTALL_PREFIX=. \
+          -DCMAKE_BUILD_TYPE=Debug \
+          -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
+          make
+          make install
+          printf "${BLUE} SLU; Done installing superlu_dist from source\n"
+    
+      - name: Test
+        run: |
+          cd $ROOT_DIR
+          export TEST_NUMBER=${{ matrix.test }}
+          ./.ci_tests.sh
+
+

From a3ee260a3b18b32c438ccef947e62b25bfaf109c Mon Sep 17 00:00:00 2001
From: Yang Liu 
Date: Mon, 4 Oct 2021 11:36:04 -0700
Subject: [PATCH 130/147] fixing yml file

---
 .github/workflows/test.yml | 6 +++++-
 .travis.yml                | 2 +-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 99e54458..03e46096 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -24,7 +24,11 @@ jobs:
           mkdir -p installDir
 
           printf "${BLUE} SLU; Installing gcc-6 via apt\n"
-          sudo apt-get update
+
+          sudo apt-get update -y 
+          sudo apt-get upgrade -y 
+          sudo apt-get dist-upgrade -y  
+          sudo apt-get install dialog apt-utils -y 
           sudo apt-get install build-essential software-properties-common -y
           sudo add-apt-repository ppa:ubuntu-toolchain-r/test -y
           sudo apt-get update
diff --git a/.travis.yml b/.travis.yml
index af98f264..3e758231 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -111,4 +111,4 @@ install:
 
 script: 
   - cd $TRAVIS_BUILD_DIR
-  - ./.travis_tests.sh
+  - ./.ci_tests.sh

From 4fa0814b9dabb2ec7b37b6f3367ebc1855a29c82 Mon Sep 17 00:00:00 2001
From: Yang Liu 
Date: Mon, 4 Oct 2021 11:42:29 -0700
Subject: [PATCH 131/147] fixing yml file

---
 .github/workflows/test.yml | 24 ++++++++++--------------
 1 file changed, 10 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 03e46096..da4deb89 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -1,4 +1,4 @@
-name: Run Github CI. Mirror and run GitLab CI. 
+name: Run Github CI tests. 
 
 on: [push, pull_request]
 
@@ -23,24 +23,20 @@ jobs:
           export ROOT_DIR="$pwd"
           mkdir -p installDir
 
-          printf "${BLUE} SLU; Installing gcc-6 via apt\n"
-
-          sudo apt-get update -y 
-          sudo apt-get upgrade -y 
-          sudo apt-get dist-upgrade -y  
-          sudo apt-get install dialog apt-utils -y 
+          printf "${BLUE} SLU; Installing gcc-9 via apt\n"
+          sudo apt-get update
           sudo apt-get install build-essential software-properties-common -y
           sudo add-apt-repository ppa:ubuntu-toolchain-r/test -y
           sudo apt-get update
-          sudo apt-get install gcc-6 g++-6 -y
-          sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-6 60 --slave /usr/bin/g++ g++ /usr/bin/g++-6
-          export CXX="g++-6"
-          export CC="gcc-6"
-          printf "${BLUE} SLU; Done installing gcc-6 via apt\n"
+          sudo apt-get install gcc-9 g++-9 -y
+          sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 60 --slave /usr/bin/g++ g++ /usr/bin/g++-9
+          export CXX="g++-9"
+          export CC="gcc-9"
+          printf "${BLUE} SLU; Done installing gcc-9 via apt\n"
 
           printf "${BLUE} SLU; Installing gfortran via apt\n"
-          sudo apt-get install gfortran-6 -y
-          sudo update-alternatives --install /usr/bin/gfortran gfortran /usr/bin/gfortran-6 60
+          sudo apt-get install gfortran-9 -y
+          sudo update-alternatives --install /usr/bin/gfortran gfortran /usr/bin/gfortran-9 60
           printf "${BLUE} SLU; Done installing gfortran via apt\n"
 
           printf "${BLUE} SLU; Installing openmpi\n"

From e0a0695018dc033f172d00c04bf54b8860b41f2b Mon Sep 17 00:00:00 2001
From: Yang Liu 
Date: Mon, 4 Oct 2021 12:03:51 -0700
Subject: [PATCH 132/147] fixing CI error

---
 .ci_tests.sh               | 2 +-
 .github/workflows/test.yml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.ci_tests.sh b/.ci_tests.sh
index c90da30c..f2dff61b 100755
--- a/.ci_tests.sh
+++ b/.ci_tests.sh
@@ -3,7 +3,7 @@ set -e
 
 export RED="\033[31;1m"
 export BLUE="\033[34;1m"
-export ROOT_DIR="$pwd"
+export ROOT_DIR="$PWD"
 printf "${BLUE} SLU; Entered tests file:\n"
 
 
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index da4deb89..e399e054 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -20,7 +20,7 @@ jobs:
       - name: Install dependencies
         run: |
           export BLUE="\033[34;1m"
-          export ROOT_DIR="$pwd"
+          export ROOT_DIR="$PWD"
           mkdir -p installDir
 
           printf "${BLUE} SLU; Installing gcc-9 via apt\n"

From 53be1037357f8de344202cf026abb9fc78c13d73 Mon Sep 17 00:00:00 2001
From: Yang Liu 
Date: Mon, 4 Oct 2021 12:24:08 -0700
Subject: [PATCH 133/147] retry github ci

---
 .github/workflows/test.yml | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index e399e054..391f4adc 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -20,7 +20,6 @@ jobs:
       - name: Install dependencies
         run: |
           export BLUE="\033[34;1m"
-          export ROOT_DIR="$PWD"
           mkdir -p installDir
 
           printf "${BLUE} SLU; Installing gcc-9 via apt\n"
@@ -54,7 +53,7 @@ jobs:
          printf "${BLUE} SLU; Done installing LAPACK from apt\n"
 
           printf "${BLUE} SLU; Installing ParMetis-4.0 from source\n"
-          cd $ROOT_DIR/installDir
+          cd $GITHUB_WORKSPACE/installDir
           wget http://glaros.dtc.umn.edu/gkhome/fetch/sw/parmetis/parmetis-4.0.3.tar.gz
           tar -xf parmetis-4.0.3.tar.gz
           cd parmetis-4.0.3/
@@ -67,13 +66,13 @@ jobs:
         run: |
           export BLUE="\033[34;1m"
           printf "${BLUE} SLU; Installing superlu_dist from source\n"
-          cd $ROOT_DIR
+          cd $GITHUB_WORKSPACE
           rm -rf build
           mkdir -p build
           cd build 
           cmake .. \
-          -DTPL_PARMETIS_INCLUDE_DIRS="$ROOT_DIR/installDir/parmetis-4.0.3/metis/include;$ROOT_DIR/installDir/parmetis-4.0.3/install/include" \
-          -DTPL_PARMETIS_LIBRARIES="$ROOT_DIR/installDir/parmetis-4.0.3/install/lib/libparmetis.so" \
+          -DTPL_PARMETIS_INCLUDE_DIRS="$GITHUB_WORKSPACE/installDir/parmetis-4.0.3/metis/include;$GITHUB_WORKSPACE/installDir/parmetis-4.0.3/install/include" \
+          -DTPL_PARMETIS_LIBRARIES="$GITHUB_WORKSPACE/installDir/parmetis-4.0.3/install/lib/libparmetis.so" \
           -DCMAKE_C_FLAGS="-std=c11 -DPRNTlevel=1 -DPROFlevel=1 -DDEBUGlevel=1" \
           -DCMAKE_CXX_FLAGS="-Ofast -std=c++11 -DAdd_ -DRELEASE" \
           -DTPL_BLAS_LIBRARIES="$BLAS_LIB" \
@@ -91,7 +90,7 @@ jobs:
     
       - name: Test
         run: |
-          cd $ROOT_DIR
+          cd $GITHUB_WORKSPACE
           export TEST_NUMBER=${{ matrix.test }}
           ./.ci_tests.sh
 

From b30bbb845439f6e23e0669cb8faa875b8e2622d8 Mon Sep 17 00:00:00 2001
From: Yang Liu 
Date: Mon, 4 Oct 2021 12:49:31 -0700
Subject: [PATCH 134/147] Retrying GitHub CI

---
 .ci_tests.sh | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/.ci_tests.sh b/.ci_tests.sh
index f2dff61b..e9664161 100755
--- a/.ci_tests.sh
+++ b/.ci_tests.sh
@@ -12,20 +12,20 @@ export EXAMPLE_FOLDER=$ROOT_DIR/build/EXAMPLE
 export TEST_FOLDER=$ROOT_DIR/build/TEST
 
 case "${TEST_NUMBER}" in
-1)  mpirun "-n" "1" "$TEST_FOLDER/pdtest" "-r" "1" "-c" "1" "-s" "1" "-b" "2" "-x" "8" "-m" "20" "-f" "$DATA_FOLDER/g20.rua" ;;
-2)  mpirun "-n" "1" "$TEST_FOLDER/pdtest" "-r" "1" "-c" "1" "-s" "3" "-b" "2" "-x" "8" "-m" "20" "-f" "$DATA_FOLDER/g20.rua" ;;
-3)  mpirun "-n" "3" "$TEST_FOLDER/pdtest" "-r" "1" "-c" "3" "-s" "1" "-b" "2" "-x" "8" "-m" "20" "-f" "$DATA_FOLDER/g20.rua" ;;
-4)  mpirun "-n" "3" "$TEST_FOLDER/pdtest" "-r" "1" "-c" "3" "-s" "3" "-b" "2" "-x" "8" "-m" "20" "-f" "$DATA_FOLDER/g20.rua" ;;
-5)  mpirun "-n" "2" "$TEST_FOLDER/pdtest" "-r" "2" "-c" "1" "-s" "1" "-b" "2" "-x" "8" "-m" "20" "-f" "$DATA_FOLDER/g20.rua" ;;
-6)  mpirun "-n" "2" "$TEST_FOLDER/pdtest" "-r" "2" "-c" "1" "-s" "3" "-b" "2" "-x" "8" "-m" "20" "-f" "$DATA_FOLDER/g20.rua" ;;
-7)  mpirun "-n" "6" "$TEST_FOLDER/pdtest" "-r" "2" "-c" "3" "-s" "1" "-b" "2" "-x" "8" "-m" "20" "-f" "$DATA_FOLDER/g20.rua" ;;
-8)  mpirun "-n" "6" "$TEST_FOLDER/pdtest" "-r" "2" "-c" "3" "-s" "3" "-b" "2" "-x" "8" "-m" "20" "-f" "$DATA_FOLDER/g20.rua" ;;
-9)  mpirun "-n" "4" "$EXAMPLE_FOLDER/pddrive1" "-r" "2" "-c" "2" "$DATA_FOLDER/big.rua" ;;
-10) mpirun "-n" "4" "$EXAMPLE_FOLDER/pddrive2" "-r" "2" "-c" "2" "$DATA_FOLDER/big.rua" ;;
-11) mpirun "-n" "4" "$EXAMPLE_FOLDER/pddrive3" "-r" "2" "-c" "2" "$DATA_FOLDER/big.rua" ;;
-12) mpirun "-n" "4" "$EXAMPLE_FOLDER/pzdrive1" "-r" "2" "-c" "2" "$DATA_FOLDER/cg20.cua" ;;
-13) mpirun "-n" "4" "$EXAMPLE_FOLDER/pzdrive2" "-r" "2" "-c" "2" "$DATA_FOLDER/cg20.cua" ;;
-14) mpirun "-n" "4" "$EXAMPLE_FOLDER/pzdrive3" "-r" "2" "-c" "2" "$DATA_FOLDER/cg20.cua" ;;
-15) mpirun "-n" "4" "$EXAMPLE_FOLDER/pddrive_ABglobal" "-r" "2" "-c" "2" "$DATA_FOLDER/big.rua" ;;
+1)  mpirun "-n" "1" --oversubscribe "$TEST_FOLDER/pdtest" "-r" "1" "-c" "1" "-s" "1" "-b" "2" "-x" "8" "-m" "20" "-f" "$DATA_FOLDER/g20.rua" ;;
+2)  mpirun "-n" "1" --oversubscribe "$TEST_FOLDER/pdtest" "-r" "1" "-c" "1" "-s" "3" "-b" "2" "-x" "8" "-m" "20" "-f" "$DATA_FOLDER/g20.rua" ;;
+3)  mpirun "-n" "3" --oversubscribe "$TEST_FOLDER/pdtest" "-r" "1" "-c" "3" "-s" "1" "-b" "2" "-x" "8" "-m" "20" "-f" "$DATA_FOLDER/g20.rua" ;;
+4)  mpirun "-n" "3" --oversubscribe "$TEST_FOLDER/pdtest" "-r" "1" "-c" "3" "-s" "3" "-b" "2" "-x" "8" "-m" "20" "-f" "$DATA_FOLDER/g20.rua" ;;
+5)  mpirun "-n" "2" --oversubscribe "$TEST_FOLDER/pdtest" "-r" "2" "-c" "1" "-s" "1" "-b" "2" "-x" "8" "-m" "20" "-f" "$DATA_FOLDER/g20.rua" ;;
+6)  mpirun "-n" "2" --oversubscribe "$TEST_FOLDER/pdtest" "-r" "2" "-c" "1" "-s" "3" "-b" "2" "-x" "8" "-m" "20" "-f" "$DATA_FOLDER/g20.rua" ;;
+7)  mpirun "-n" "6" --oversubscribe "$TEST_FOLDER/pdtest" "-r" "2" "-c" "3" "-s" "1" "-b" "2" "-x" "8" "-m" "20" "-f" "$DATA_FOLDER/g20.rua" ;;
+8)  mpirun "-n" "6" --oversubscribe "$TEST_FOLDER/pdtest" "-r" "2" "-c" "3" "-s" "3" "-b" "2" "-x" "8" "-m" "20" "-f" "$DATA_FOLDER/g20.rua" ;;
+9)  mpirun "-n" "4" --oversubscribe "$EXAMPLE_FOLDER/pddrive1" "-r" "2" "-c" "2" "$DATA_FOLDER/big.rua" ;;
+10) mpirun "-n" "4" --oversubscribe "$EXAMPLE_FOLDER/pddrive2" "-r" "2" "-c" "2" "$DATA_FOLDER/big.rua" ;;
+11) mpirun "-n" "4" --oversubscribe "$EXAMPLE_FOLDER/pddrive3" "-r" "2" "-c" "2" "$DATA_FOLDER/big.rua" ;;
+12) mpirun "-n" "4" --oversubscribe "$EXAMPLE_FOLDER/pzdrive1" "-r" "2" "-c" "2" "$DATA_FOLDER/cg20.cua" ;;
+13) mpirun "-n" "4" --oversubscribe "$EXAMPLE_FOLDER/pzdrive2" "-r" "2" "-c" "2" "$DATA_FOLDER/cg20.cua" ;;
+14) mpirun "-n" "4" --oversubscribe "$EXAMPLE_FOLDER/pzdrive3" "-r" "2" "-c" "2" "$DATA_FOLDER/cg20.cua" ;;
+15) mpirun "-n" "4" --oversubscribe "$EXAMPLE_FOLDER/pddrive_ABglobal" "-r" "2" "-c" "2" "$DATA_FOLDER/big.rua" ;;
 *) printf "${RED} ###SLU: Unknown test\n" ;;
 esac

From d08ff84448920d90651bfe8e32d37231162a4f5c Mon Sep 17 00:00:00 2001
From: Sherry Li 
Date: Mon, 4 Oct 2021 18:00:56 -0700
Subject: [PATCH 135/147] Disable several 'omp simd' pragmas in pxgstrs_lsum.c,
 which cause the complex code to fail. Update version to 7.1.0

---
 CMakeLists.txt             |   2 +-
 DoxyConfig                 |   2 +-
 EXAMPLE/Makefile           |   6 +-
 EXAMPLE/dcreate_matrix3d.c |   4 +-
 EXAMPLE/screate_matrix3d.c |   2 +-
 EXAMPLE/zcreate_matrix3d.c |   4 +-
 README.md                  |   4 +-
 SRC/dtrfCommWrapper.c      |   8 +
 SRC/pdgstrf2.c             |   2 +
 SRC/pdgstrs_lsum.c         | 623 +++++++++++++++++-----------------
 SRC/psgstrf2.c             |   2 +
 SRC/psgstrs_lsum.c         | 621 +++++++++++++++++-----------------
 SRC/pzgstrf2.c             |   2 +
 SRC/pzgstrs_lsum.c         | 671 ++++++++++++++++++-------------------
 SRC/strfCommWrapper.c      |   8 +
 SRC/superlu_defs.h         |   5 +-
 SRC/superlu_grid.c         |   3 +-
 SRC/superlu_grid3d.c       |   3 +-
 SRC/ztrfCommWrapper.c      |   8 +
 make.inc.in                |   1 +
 20 files changed, 988 insertions(+), 993 deletions(-)
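
    The 'omp simd' pragmas being disabled asked the compiler to force-vectorize
    inner loops; for the two-field complex struct some compilers generated
    incorrect code, hence the retreat to plain loops. A hedged sketch of the
    shape involved (hypothetical loop and names, not the exact pxgstrs_lsum.c
    code):

	typedef struct { double r, i; } doublecomplex; /* standalone stand-in */

	/* y += a*x for complex vectors. The explicit SIMD request is left
	 * commented out, as in this patch, because it mis-vectorized. */
	void zaxpy_sketch(int n, doublecomplex a,
	                  const doublecomplex *x, doublecomplex *y)
	{
	/* #pragma omp simd */
	    for (int i = 0; i < n; ++i) {
	        y[i].r += a.r * x[i].r - a.i * x[i].i;
	        y[i].i += a.r * x[i].i + a.i * x[i].r;
	    }
	}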

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2d52b6fa..956f75e1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -11,7 +11,7 @@ cmake_minimum_required(VERSION 3.18.1 FATAL_ERROR)
 #project(SuperLU_DIST C CXX CUDA)
 project(SuperLU_DIST C CXX)
 set(VERSION_MAJOR "7")
-set(VERSION_MINOR "0")
+set(VERSION_MINOR "1")
 set(VERSION_BugFix "0")
 set(PROJECT_VERSION ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_BugFix})
 
diff --git a/DoxyConfig b/DoxyConfig
index 9e033a43..2b18411f 100644
--- a/DoxyConfig
+++ b/DoxyConfig
@@ -31,7 +31,7 @@ PROJECT_NAME           = SuperLU Distributed
 # This could be handy for archiving the generated documentation or 
 # if some version control system is used.
 
-PROJECT_NUMBER         = 7.0.0
+PROJECT_NUMBER         = 7.1.0
 
 # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) 
 # base path where the generated documentation will be put. 
diff --git a/EXAMPLE/Makefile b/EXAMPLE/Makefile
index 7ab5a475..d3ea86e8 100644
--- a/EXAMPLE/Makefile
+++ b/EXAMPLE/Makefile
@@ -53,9 +53,9 @@ ZEXM1	= pzdrive1.o zcreate_matrix.o
 ZEXM2	= pzdrive2.o zcreate_matrix.o zcreate_matrix_perturbed.o
 ZEXM3	= pzdrive3.o zcreate_matrix.o
 ZEXM4	= pzdrive4.o zcreate_matrix.o
-ZEXM3D	= pzdrive3d.o zcreate_matrix.o zcreate_matrix3d_Jake.o #znrformat_loc3d.o
-ZEXM3D1	= pzdrive3d1.o zcreate_matrix.o zcreate_matrix3d_Jake.o 
-ZEXM3D2	= pzdrive3d2.o zcreate_matrix.o zcreate_matrix3d_Jake.o
+ZEXM3D	= pzdrive3d.o zcreate_matrix.o zcreate_matrix3d.o
+ZEXM3D1	= pzdrive3d1.o zcreate_matrix.o zcreate_matrix3d.o 
+ZEXM3D2	= pzdrive3d2.o zcreate_matrix.o zcreate_matrix3d.o
 ZEXM3D3	= pzdrive3d3.o zcreate_matrix.o zcreate_matrix3d.o
 
 ZEXMG	= pzdrive_ABglobal.o
diff --git a/EXAMPLE/dcreate_matrix3d.c b/EXAMPLE/dcreate_matrix3d.c
index 3ad83de9..6e9ccc21 100644
--- a/EXAMPLE/dcreate_matrix3d.c
+++ b/EXAMPLE/dcreate_matrix3d.c
@@ -18,7 +18,7 @@ at the top-level directory.
  * -- Distributed SuperLU routine (version 7.0) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley,
  * Oak Ridge National Lab.
- * October 26, 2020
+ * May 12, 2021
  * 
*/ #include @@ -270,7 +270,7 @@ int dcreate_matrix_postfix3d(SuperMatrix *A, int nrhs, double **rhs, iam = grid3d->iam; #if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter dcreate_matrix()"); + CHECK_MALLOC(iam, "Enter dcreate_matrix_postfix3d()"); #endif if ( !iam ) diff --git a/EXAMPLE/screate_matrix3d.c b/EXAMPLE/screate_matrix3d.c index f1b0cf64..5cb123d0 100644 --- a/EXAMPLE/screate_matrix3d.c +++ b/EXAMPLE/screate_matrix3d.c @@ -270,7 +270,7 @@ int screate_matrix_postfix3d(SuperMatrix *A, int nrhs, float **rhs, iam = grid3d->iam; #if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter dcreate_matrix()"); + CHECK_MALLOC(iam, "Enter screate_matrix_postfix3d()"); #endif if ( !iam ) diff --git a/EXAMPLE/zcreate_matrix3d.c b/EXAMPLE/zcreate_matrix3d.c index b3c43ffd..18a5a4e8 100644 --- a/EXAMPLE/zcreate_matrix3d.c +++ b/EXAMPLE/zcreate_matrix3d.c @@ -17,7 +17,7 @@ at the top-level directory. * -- Distributed SuperLU routine (version 7.0) -- * Lawrence Berkeley National Lab, Univ. of California Berkeley, * Oak Ridge National Lab. - * October 26, 2020 + * May 12, 2021 *
*/ #include @@ -269,7 +269,7 @@ int zcreate_matrix_postfix3d(SuperMatrix *A, int nrhs, doublecomplex **rhs, iam = grid3d->iam; #if ( DEBUGlevel>=1 ) - CHECK_MALLOC(iam, "Enter dcreate_matrix()"); + CHECK_MALLOC(iam, "Enter zcreate_matrix_postfix3d()"); #endif if ( !iam ) diff --git a/README.md b/README.md index 0c78f131..df987413 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# SuperLU_DIST (version 7.0) superlu +# SuperLU_DIST (version 7.1.0) superlu [![Build Status](https://travis-ci.org/xiaoyeli/superlu_dist.svg?branch=master)](https://travis-ci.org/xiaoyeli/superlu_dist) [Nightly tests](http://my.cdash.org/index.php?project=superlu_dist) @@ -25,7 +25,7 @@ acceleration capabilities. Table of Contents ================= -* [SuperLU_DIST (version 7.0) superlu](#superlu_dist-version-70---) +* [SuperLU_DIST (version 7.1.0) superlu](#superlu_dist-version-70---) * [Directory structure of the source code](#directory-structure-of-the-source-code) * [Installation](#installation) * [Installation option 1: Using CMake build system.](#installation-option-1-using-cmake-build-system) diff --git a/SRC/dtrfCommWrapper.c b/SRC/dtrfCommWrapper.c index b7531604..c7b51a54 100644 --- a/SRC/dtrfCommWrapper.c +++ b/SRC/dtrfCommWrapper.c @@ -160,7 +160,9 @@ int_t dLPanelTrSolve( int_t k, int_t* factored_L, // unsigned long long t1 = _rdtsc(); +#ifdef _OPENMP // #pragma omp for schedule(dynamic) nowait +#endif #define BL 32 for (int i = 0; i < CEILING(l, BL); ++i) { @@ -199,13 +201,17 @@ int_t dLPanelTrSolve( int_t k, int_t* factored_L, // printf("%d: L update \n",k ); #define BL 32 +#ifdef _OPENMP // #pragma omp parallel for +#endif for (int i = 0; i < CEILING(l, BL); ++i) { int_t off = i * BL; // Sherry: int_t len = MY_MIN(BL, l - i * BL); int len = SUPERLU_MIN(BL, (l - i * BL)); +#ifdef _OPENMP //#pragma omp task +#endif { superlu_dtrsm("R", "U", "N", "N", len, nsupc, alpha, ublk_ptr, ld_ujrow, &lusup[nsupc + off], nsupr); @@ -271,7 +277,9 @@ int_t dUPanelTrSolve( int_t k, Trs2_InitUblock_info(klst, nb, Ublock_info, usub, Glu_persist, stat ); /* Loop through all the row blocks. */ +#ifdef _OPENMP // #pragma omp for schedule(dynamic,2) nowait +#endif for (int_t b = 0; b < nb; ++b) { #ifdef _OPENMP diff --git a/SRC/pdgstrf2.c b/SRC/pdgstrf2.c index 5bb748c0..fca4792c 100644 --- a/SRC/pdgstrf2.c +++ b/SRC/pdgstrf2.c @@ -862,7 +862,9 @@ void pdgstrs2_omp #endif } /* end if segsize > 0 */ } /* end for j in parallel ... */ +#ifdef _OPENMP /* #pragma omp taskwait */ +#endif } /* end for b ... */ #ifndef USE_Ublock_info diff --git a/SRC/pdgstrs_lsum.c b/SRC/pdgstrs_lsum.c index 730069c5..d2f3deaa 100644 --- a/SRC/pdgstrs_lsum.c +++ b/SRC/pdgstrs_lsum.c @@ -528,11 +528,11 @@ void dlsum_fmod_inv for (nn=0;nn=1 ) TOC(t2, t1); @@ -598,149 +598,148 @@ void dlsum_fmod_inv #endif for (lb=lbstart;lb=1 ) - TIC(t1); + TIC(t1); #endif - for (ii=1;iiLrowind_bc_ptr[lk]; - lusup1 = Llu->Lnzval_bc_ptr[lk]; - nsupr1 = lsub1[1]; + // fmod[lk] = -1; /* Do not solve X[k] in the future. */ + lk = LBj( ik, grid );/* Local block number, column-wise. 
*/ + lsub1 = Llu->Lrowind_bc_ptr[lk]; + lusup1 = Llu->Lnzval_bc_ptr[lk]; + nsupr1 = lsub1[1]; - if(Llu->inv == 1){ - Linv = Llu->Linv_bc_ptr[lk]; + if(Llu->inv == 1){ + Linv = Llu->Linv_bc_ptr[lk]; #ifdef _CRAY - SGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, - &alpha, Linv, &iknsupc, &x[ii], - &iknsupc, &beta, rtemp_loc, &iknsupc ); + SGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); #elif defined (USE_VENDOR_BLAS) - dgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, - &alpha, Linv, &iknsupc, &x[ii], - &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); + dgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); #else - dgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, - &alpha, Linv, &iknsupc, &x[ii], - &iknsupc, &beta, rtemp_loc, &iknsupc ); -#endif - #ifdef _OPENMP - #pragma omp simd - #endif - for (i=0 ; i=1 ) - TOC(t2, t1); - stat[thread_id1]->utime[SOL_TRSM] += t2; + TOC(t2, t1); + stat[thread_id1]->utime[SOL_TRSM] += t2; #endif - stat[thread_id1]->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs; + stat[thread_id1]->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs; #if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, ik); + printf("(%2d) Solve X[%2d]\n", iam, ik); #endif - /* - * Send Xk to process column Pc[k]. - */ + /* + * Send Xk to process column Pc[k]. + */ - if(LBtree_ptr[lk]!=NULL){ + if(LBtree_ptr[lk]!=NULL){ #ifdef _OPENMP #pragma omp atomic capture #endif - nleaf_send_tmp = ++nleaf_send[0]; - leaf_send[(nleaf_send_tmp-1)*aln_i] = lk; - } + nleaf_send_tmp = ++nleaf_send[0]; + leaf_send[(nleaf_send_tmp-1)*aln_i] = lk; + } - /* - * Perform local block modifications. - */ + /* + * Perform local block modifications. + */ - // #ifdef _OPENMP - // #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1,thread_id1) untied priority(1) - // #endif - { +// #ifdef _OPENMP +// #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1,thread_id1) untied priority(1) +// #endif + { - dlsum_fmod_inv(lsum, x, &x[ii], rtemp, nrhs, ik, - fmod, xsup, - grid, Llu, stat, leaf_send, nleaf_send ,sizelsum,sizertemp,1+recurlevel,maxsuper,thread_id1,num_thread); - } + dlsum_fmod_inv(lsum, x, &x[ii], rtemp, nrhs, ik, + fmod, xsup, + grid, Llu, stat, leaf_send, nleaf_send ,sizelsum,sizertemp,1+recurlevel,maxsuper,thread_id1,num_thread); + } - // } /* if frecv[lk] == 0 */ - } /* if iam == p */ + // } /* if frecv[lk] == 0 */ + } /* end if iam == p */ } /* if fmod[lk] == 0 */ } - } + } /* end tasklook for nn ... */ } }else{ @@ -782,16 +781,16 @@ void dlsum_fmod_inv il = LSUM_BLK( lk ); RHS_ITERATE(j) - #ifdef _OPENMP - #pragma omp simd - #endif - for (i = 0; i < nbrow1; ++i) { - irow = lsub[lptr+i] - rel; /* Relative row. */ + #ifdef _OPENMP + #pragma omp simd + #endif + for (i = 0; i < nbrow1; ++i) { + irow = lsub[lptr+i] - rel; /* Relative row. */ - lsum[il+irow + j*iknsupc+sizelsum*thread_id] -= rtemp_loc[nbrow_ref+i + j*nbrow]; - } + lsum[il+irow + j*iknsupc+sizelsum*thread_id] -= rtemp_loc[nbrow_ref+i + j*nbrow]; + } nbrow_ref+=nbrow1; - } + } /* end for lb ... 
*/ // TOC(t3, t1); @@ -802,140 +801,135 @@ void dlsum_fmod_inv for (lb=0;lb=1 ) - TIC(t1); + TIC(t1); #endif - for (ii=1;iiLrowind_bc_ptr[lk]; - lusup1 = Llu->Lnzval_bc_ptr[lk]; - nsupr1 = lsub1[1]; + lk = LBj( ik, grid );/* Local block number, column-wise. */ + lsub1 = Llu->Lrowind_bc_ptr[lk]; + lusup1 = Llu->Lnzval_bc_ptr[lk]; + nsupr1 = lsub1[1]; - if(Llu->inv == 1){ - Linv = Llu->Linv_bc_ptr[lk]; + if(Llu->inv == 1){ + Linv = Llu->Linv_bc_ptr[lk]; #ifdef _CRAY - SGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, - &alpha, Linv, &iknsupc, &x[ii], - &iknsupc, &beta, rtemp_loc, &iknsupc ); + SGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); #elif defined (USE_VENDOR_BLAS) - dgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, - &alpha, Linv, &iknsupc, &x[ii], - &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); + dgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); #else - dgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, - &alpha, Linv, &iknsupc, &x[ii], - &iknsupc, &beta, rtemp_loc, &iknsupc ); + dgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); #endif - #ifdef _OPENMP - #pragma omp simd - #endif - for (i=0 ; i=1 ) - TOC(t2, t1); - stat[thread_id]->utime[SOL_TRSM] += t2; + TOC(t2, t1); + stat[thread_id]->utime[SOL_TRSM] += t2; #endif - stat[thread_id]->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs; + stat[thread_id]->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs; #if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, ik); + printf("(%2d) Solve X[%2d]\n", iam, ik); #endif - /* - * Send Xk to process column Pc[k]. - */ - - if(LBtree_ptr[lk]!=NULL){ + /* + * Send Xk to process column Pc[k]. + */ + if(LBtree_ptr[lk]!=NULL){ #ifdef _OPENMP #pragma omp atomic capture #endif - nleaf_send_tmp = ++nleaf_send[0]; - // printf("nleaf_send_tmp %5d lk %5d\n",nleaf_send_tmp); - leaf_send[(nleaf_send_tmp-1)*aln_i] = lk; - // BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],'d'); - } - - /* - * Perform local block modifications. - */ + nleaf_send_tmp = ++nleaf_send[0]; + // printf("nleaf_send_tmp %5d lk %5d\n",nleaf_send_tmp); + leaf_send[(nleaf_send_tmp-1)*aln_i] = lk; + // BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],'d'); + } - // #ifdef _OPENMP - // #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1) untied priority(1) - // #endif + /* + * Perform local block modifications. 
+ */ - { - dlsum_fmod_inv(lsum, x, &x[ii], rtemp, nrhs, ik, - fmod, xsup, - grid, Llu, stat, leaf_send, nleaf_send ,sizelsum,sizertemp,1+recurlevel,maxsuper,thread_id,num_thread); - } +// #ifdef _OPENMP +// #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1) untied priority(1) +// #endif + { + dlsum_fmod_inv(lsum, x, &x[ii], rtemp, nrhs, ik, + fmod, xsup, + grid, Llu, stat, leaf_send, nleaf_send ,sizelsum,sizertemp,1+recurlevel,maxsuper,thread_id,num_thread); + } // } /* if frecv[lk] == 0 */ - } /* if iam == p */ + } /* end else iam == p */ } /* if fmod[lk] == 0 */ } // } @@ -944,7 +938,6 @@ void dlsum_fmod_inv stat[thread_id]->ops[SOLVE] += 2 * m * nrhs * knsupc; - } /* if nlb>0*/ } /* dLSUM_FMOD_INV */ @@ -1128,22 +1121,23 @@ void dlsum_fmod_inv_master il = LSUM_BLK( lk ); RHS_ITERATE(j) - #ifdef _OPENMP - #pragma omp simd lastprivate(irow) - #endif + #ifdef _OPENMP + #pragma omp simd lastprivate(irow) + #endif for (i = 0; i < nbrow1; ++i) { irow = lsub[lptr+i] - rel; /* Relative row. */ lsum[il+irow + j*iknsupc] -= rtemp_loc[nbrow_ref+i + j*nbrow]; } nbrow_ref+=nbrow1; - } + } /* end for lb ... */ #if ( PROFlevel>=1 ) TOC(t2, t1); stat[thread_id1]->utime[SOL_GEMM] += t2; #endif - } - } + } /* end if (lbstart=1 ) TOC(t2, t1); stat[thread_id]->utime[SOL_GEMM] += t2; #endif - } - // TOC(t3, t1); + } /* end else ... */ + // TOC(t3, t1); rtemp_loc = &rtemp[sizertemp* thread_id]; for (lb=0;lb=1 ) TIC(t1); #endif for (ii=1;ii=1 ) TOC(t2, t1); stat[thread_id]->utime[SOL_TRSM] += t2; - #endif stat[thread_id]->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs; @@ -1333,13 +1323,12 @@ void dlsum_fmod_inv_master * Perform local block modifications. */ - // #ifdef _OPENMP - // #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1,thread_id1) untied priority(1) - // #endif +// #ifdef _OPENMP +// #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1,thread_id1) untied priority(1) +// #endif { nlb1 = lsub1[0] - 1; - dlsum_fmod_inv_master(lsum, x, &x[ii], rtemp, nrhs, iknsupc, ik, fmod, nlb1, xsup, grid, Llu, stat,sizelsum,sizertemp,1+recurlevel,maxsuper,thread_id,num_thread); @@ -1351,8 +1340,8 @@ void dlsum_fmod_inv_master } // } stat[thread_id]->ops[SOLVE] += 2 * m * nrhs * knsupc; - } /* if nlb>0*/ -} /* dLSUM_FMOD_INV */ + } /* end if nlb>0*/ +} /* end dlsum_fmod_inv_master */ @@ -1412,7 +1401,7 @@ void dlsum_bmod_inv float msg_vol = 0, msg_cnt = 0; int_t Nchunk, nub_loc,remainder,nn,lbstart,lbend; int_t iword = sizeof(int_t); - int_t dword = sizeof (double); + int_t dword = sizeof(double); int_t aln_d,aln_i; aln_d = ceil(CACHELINE/(double)dword); aln_i = ceil(CACHELINE/(double)iword); @@ -1476,15 +1465,15 @@ void dlsum_bmod_inv fnz = usub[i + jj]; if ( fnz < iklrow ) { /* Nonzero segment. */ /* AXPY */ - #ifdef _OPENMP - #pragma omp simd - #endif +//#ifdef _OPENMP +//#pragma omp simd // In complex case, this SIMD loop has 2 instructions, the compiler may generate incoreect code, so need to disable this omp simd +//#endif for (irow = fnz; irow < iklrow; ++irow) dest[irow - ikfrow] -= uval[uptr++] * y[jj]; stat[thread_id1]->ops[SOLVE] += 2 * (iklrow - fnz); } - } /* for jj ... */ + } /* end for jj ... 
*/ } #if ( PROFlevel>=1 ) @@ -1492,7 +1481,6 @@ void dlsum_bmod_inv stat[thread_id1]->utime[SOL_GEMM] += t2; #endif - #ifdef _OPENMP #pragma omp atomic capture #endif @@ -1504,9 +1492,9 @@ void dlsum_bmod_inv if ( iam != p ) { for (ii=1;ii=1 ) TIC(t1); #endif - for (ii=1;iiops[SOLVE] += 2 * (iklrow - fnz); + dest[irow - ikfrow] -= uval[uptr++] * y[jj]; + stat[thread_id]->ops[SOLVE] += 2 * (iklrow - fnz); } } /* for jj ... */ } @@ -1686,9 +1673,9 @@ void dlsum_bmod_inv if ( iam != p ) { for (ii=1;ii16){ - // #ifdef _OPENMP - // #pragma omp task firstprivate (Ucb_indptr,Ucb_valptr,Llu,sizelsum,ii,gik,x,rtemp,bmod,Urbs,lsum,stat,nrhs,grid,xsup) untied - // #endif +// if(Urbs[lk1]>16){ +// #ifdef _OPENMP +// #pragma omp task firstprivate (Ucb_indptr,Ucb_valptr,Llu,sizelsum,ii,gik,x,rtemp,bmod,Urbs,lsum,stat,nrhs,grid,xsup) untied +// #endif // dlsum_bmod_inv(lsum, x, &x[ii], rtemp, nrhs, gik, bmod, Urbs, // Ucb_indptr, Ucb_valptr, xsup, grid, Llu, // stat, root_send, nroot_send, sizelsum,sizertemp); //}else{ - dlsum_bmod_inv(lsum, x, &x[ii], rtemp, nrhs, gik, bmod, Urbs, - Ucb_indptr, Ucb_valptr, xsup, grid, Llu, - stat, root_send, nroot_send, sizelsum,sizertemp,thread_id,num_thread); + dlsum_bmod_inv(lsum, x, &x[ii], rtemp, nrhs, gik, bmod, Urbs, + Ucb_indptr, Ucb_valptr, xsup, grid, Llu, + stat, root_send, nroot_send, sizelsum,sizertemp,thread_id,num_thread); //} // } /* if brecv[ik] == 0 */ } } /* if bmod[ik] == 0 */ - } /* for ub ... */ - } + } /* end for ub ... */ + } /* end else ... */ } /* dlSUM_BMOD_inv */ @@ -1943,9 +1930,9 @@ void dlsum_bmod_inv_master fnz = usub[i + jj]; if ( fnz < iklrow ) { /* Nonzero segment. */ /* AXPY */ - #ifdef _OPENMP - #pragma omp simd - #endif +//#ifdef _OPENMP +//#pragma omp simd // In complex case, this SIMD loop has 2 instructions, the compiler may generate incoreect code, so need to disable this omp simd +//#endif for (irow = fnz; irow < iklrow; ++irow) dest[irow - ikfrow] -= uval[uptr++] * y[jj]; stat[thread_id1]->ops[SOLVE] += 2 * (iklrow - fnz); @@ -1985,9 +1972,9 @@ void dlsum_bmod_inv_master fnz = usub[i + jj]; if ( fnz < iklrow ) { /* Nonzero segment. */ /* AXPY */ - #ifdef _OPENMP - #pragma omp simd - #endif +//#ifdef _OPENMP +//#pragma omp simd // In complex case, this SIMD loop has 2 instructions, the compiler may generate incoreect code, so need to disable this omp simd +//#endif for (irow = fnz; irow < iklrow; ++irow) dest[irow - ikfrow] -= uval[uptr++] * y[jj]; stat[thread_id]->ops[SOLVE] += 2 * (iklrow - fnz); @@ -2021,9 +2008,9 @@ void dlsum_bmod_inv_master if ( iam != p ) { for (ii=1;ii 0 */ } /* end for j in parallel ... */ +#ifdef _OPENMP /* #pragma omp taskwait */ +#endif } /* end for b ... */ #ifndef USE_Ublock_info diff --git a/SRC/psgstrs_lsum.c b/SRC/psgstrs_lsum.c index fe0044f3..2b444e03 100644 --- a/SRC/psgstrs_lsum.c +++ b/SRC/psgstrs_lsum.c @@ -528,11 +528,11 @@ void slsum_fmod_inv for (nn=0;nn=1 ) TOC(t2, t1); @@ -598,149 +598,148 @@ void slsum_fmod_inv #endif for (lb=lbstart;lb=1 ) - TIC(t1); + TIC(t1); #endif - for (ii=1;iiLrowind_bc_ptr[lk]; - lusup1 = Llu->Lnzval_bc_ptr[lk]; - nsupr1 = lsub1[1]; + // fmod[lk] = -1; /* Do not solve X[k] in the future. */ + lk = LBj( ik, grid );/* Local block number, column-wise. 
*/ + lsub1 = Llu->Lrowind_bc_ptr[lk]; + lusup1 = Llu->Lnzval_bc_ptr[lk]; + nsupr1 = lsub1[1]; - if(Llu->inv == 1){ - Linv = Llu->Linv_bc_ptr[lk]; + if(Llu->inv == 1){ + Linv = Llu->Linv_bc_ptr[lk]; #ifdef _CRAY - SGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, - &alpha, Linv, &iknsupc, &x[ii], - &iknsupc, &beta, rtemp_loc, &iknsupc ); + SGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); #elif defined (USE_VENDOR_BLAS) - sgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, - &alpha, Linv, &iknsupc, &x[ii], - &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); + sgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); #else - sgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, - &alpha, Linv, &iknsupc, &x[ii], - &iknsupc, &beta, rtemp_loc, &iknsupc ); -#endif - #ifdef _OPENMP - #pragma omp simd - #endif - for (i=0 ; i=1 ) - TOC(t2, t1); - stat[thread_id1]->utime[SOL_TRSM] += t2; + TOC(t2, t1); + stat[thread_id1]->utime[SOL_TRSM] += t2; #endif - stat[thread_id1]->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs; + stat[thread_id1]->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs; #if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, ik); + printf("(%2d) Solve X[%2d]\n", iam, ik); #endif - /* - * Send Xk to process column Pc[k]. - */ + /* + * Send Xk to process column Pc[k]. + */ - if(LBtree_ptr[lk]!=NULL){ + if(LBtree_ptr[lk]!=NULL){ #ifdef _OPENMP #pragma omp atomic capture #endif - nleaf_send_tmp = ++nleaf_send[0]; - leaf_send[(nleaf_send_tmp-1)*aln_i] = lk; - } + nleaf_send_tmp = ++nleaf_send[0]; + leaf_send[(nleaf_send_tmp-1)*aln_i] = lk; + } - /* - * Perform local block modifications. - */ + /* + * Perform local block modifications. + */ - // #ifdef _OPENMP - // #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1,thread_id1) untied priority(1) - // #endif - { +// #ifdef _OPENMP +// #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1,thread_id1) untied priority(1) +// #endif + { - slsum_fmod_inv(lsum, x, &x[ii], rtemp, nrhs, ik, - fmod, xsup, - grid, Llu, stat, leaf_send, nleaf_send ,sizelsum,sizertemp,1+recurlevel,maxsuper,thread_id1,num_thread); - } + slsum_fmod_inv(lsum, x, &x[ii], rtemp, nrhs, ik, + fmod, xsup, + grid, Llu, stat, leaf_send, nleaf_send ,sizelsum,sizertemp,1+recurlevel,maxsuper,thread_id1,num_thread); + } - // } /* if frecv[lk] == 0 */ - } /* if iam == p */ + // } /* if frecv[lk] == 0 */ + } /* end if iam == p */ } /* if fmod[lk] == 0 */ } - } + } /* end tasklook for nn ... */ } }else{ @@ -782,16 +781,16 @@ void slsum_fmod_inv il = LSUM_BLK( lk ); RHS_ITERATE(j) - #ifdef _OPENMP - #pragma omp simd - #endif - for (i = 0; i < nbrow1; ++i) { - irow = lsub[lptr+i] - rel; /* Relative row. */ + #ifdef _OPENMP + #pragma omp simd + #endif + for (i = 0; i < nbrow1; ++i) { + irow = lsub[lptr+i] - rel; /* Relative row. */ - lsum[il+irow + j*iknsupc+sizelsum*thread_id] -= rtemp_loc[nbrow_ref+i + j*nbrow]; - } + lsum[il+irow + j*iknsupc+sizelsum*thread_id] -= rtemp_loc[nbrow_ref+i + j*nbrow]; + } nbrow_ref+=nbrow1; - } + } /* end for lb ... 
*/ // TOC(t3, t1); @@ -802,140 +801,135 @@ void slsum_fmod_inv for (lb=0;lb=1 ) - TIC(t1); + TIC(t1); #endif - for (ii=1;iiLrowind_bc_ptr[lk]; - lusup1 = Llu->Lnzval_bc_ptr[lk]; - nsupr1 = lsub1[1]; + lk = LBj( ik, grid );/* Local block number, column-wise. */ + lsub1 = Llu->Lrowind_bc_ptr[lk]; + lusup1 = Llu->Lnzval_bc_ptr[lk]; + nsupr1 = lsub1[1]; - if(Llu->inv == 1){ - Linv = Llu->Linv_bc_ptr[lk]; + if(Llu->inv == 1){ + Linv = Llu->Linv_bc_ptr[lk]; #ifdef _CRAY - SGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, - &alpha, Linv, &iknsupc, &x[ii], - &iknsupc, &beta, rtemp_loc, &iknsupc ); + SGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); #elif defined (USE_VENDOR_BLAS) - sgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, - &alpha, Linv, &iknsupc, &x[ii], - &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); + sgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); #else - sgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, - &alpha, Linv, &iknsupc, &x[ii], - &iknsupc, &beta, rtemp_loc, &iknsupc ); + sgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); #endif - #ifdef _OPENMP - #pragma omp simd - #endif - for (i=0 ; i=1 ) - TOC(t2, t1); - stat[thread_id]->utime[SOL_TRSM] += t2; + TOC(t2, t1); + stat[thread_id]->utime[SOL_TRSM] += t2; #endif - stat[thread_id]->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs; + stat[thread_id]->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs; #if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, ik); + printf("(%2d) Solve X[%2d]\n", iam, ik); #endif - /* - * Send Xk to process column Pc[k]. - */ - - if(LBtree_ptr[lk]!=NULL){ + /* + * Send Xk to process column Pc[k]. + */ + if(LBtree_ptr[lk]!=NULL){ #ifdef _OPENMP #pragma omp atomic capture #endif - nleaf_send_tmp = ++nleaf_send[0]; - // printf("nleaf_send_tmp %5d lk %5d\n",nleaf_send_tmp); - leaf_send[(nleaf_send_tmp-1)*aln_i] = lk; - // BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],'s'); - } - - /* - * Perform local block modifications. - */ + nleaf_send_tmp = ++nleaf_send[0]; + // printf("nleaf_send_tmp %5d lk %5d\n",nleaf_send_tmp); + leaf_send[(nleaf_send_tmp-1)*aln_i] = lk; + // BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],'s'); + } - // #ifdef _OPENMP - // #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1) untied priority(1) - // #endif + /* + * Perform local block modifications. 
+ */ - { - slsum_fmod_inv(lsum, x, &x[ii], rtemp, nrhs, ik, - fmod, xsup, - grid, Llu, stat, leaf_send, nleaf_send ,sizelsum,sizertemp,1+recurlevel,maxsuper,thread_id,num_thread); - } +// #ifdef _OPENMP +// #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1) untied priority(1) +// #endif + { + slsum_fmod_inv(lsum, x, &x[ii], rtemp, nrhs, ik, + fmod, xsup, + grid, Llu, stat, leaf_send, nleaf_send ,sizelsum,sizertemp,1+recurlevel,maxsuper,thread_id,num_thread); + } // } /* if frecv[lk] == 0 */ - } /* if iam == p */ + } /* end else iam == p */ } /* if fmod[lk] == 0 */ } // } @@ -944,7 +938,6 @@ void slsum_fmod_inv stat[thread_id]->ops[SOLVE] += 2 * m * nrhs * knsupc; - } /* if nlb>0*/ } /* sLSUM_FMOD_INV */ @@ -1128,22 +1121,23 @@ void slsum_fmod_inv_master il = LSUM_BLK( lk ); RHS_ITERATE(j) - #ifdef _OPENMP - #pragma omp simd lastprivate(irow) - #endif + #ifdef _OPENMP + #pragma omp simd lastprivate(irow) + #endif for (i = 0; i < nbrow1; ++i) { irow = lsub[lptr+i] - rel; /* Relative row. */ lsum[il+irow + j*iknsupc] -= rtemp_loc[nbrow_ref+i + j*nbrow]; } nbrow_ref+=nbrow1; - } + } /* end for lb ... */ #if ( PROFlevel>=1 ) TOC(t2, t1); stat[thread_id1]->utime[SOL_GEMM] += t2; #endif - } - } + } /* end if (lbstart=1 ) TOC(t2, t1); stat[thread_id]->utime[SOL_GEMM] += t2; #endif - } - // TOC(t3, t1); + } /* end else ... */ + // TOC(t3, t1); rtemp_loc = &rtemp[sizertemp* thread_id]; for (lb=0;lb=1 ) TIC(t1); #endif for (ii=1;ii=1 ) TOC(t2, t1); stat[thread_id]->utime[SOL_TRSM] += t2; - #endif stat[thread_id]->ops[SOLVE] += iknsupc * (iknsupc - 1) * nrhs; @@ -1333,13 +1323,12 @@ void slsum_fmod_inv_master * Perform local block modifications. */ - // #ifdef _OPENMP - // #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1,thread_id1) untied priority(1) - // #endif +// #ifdef _OPENMP +// #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1,thread_id1) untied priority(1) +// #endif { nlb1 = lsub1[0] - 1; - slsum_fmod_inv_master(lsum, x, &x[ii], rtemp, nrhs, iknsupc, ik, fmod, nlb1, xsup, grid, Llu, stat,sizelsum,sizertemp,1+recurlevel,maxsuper,thread_id,num_thread); @@ -1351,8 +1340,8 @@ void slsum_fmod_inv_master } // } stat[thread_id]->ops[SOLVE] += 2 * m * nrhs * knsupc; - } /* if nlb>0*/ -} /* sLSUM_FMOD_INV */ + } /* end if nlb>0*/ +} /* end slsum_fmod_inv_master */ @@ -1476,15 +1465,15 @@ void slsum_bmod_inv fnz = usub[i + jj]; if ( fnz < iklrow ) { /* Nonzero segment. */ /* AXPY */ - #ifdef _OPENMP - #pragma omp simd - #endif +//#ifdef _OPENMP +//#pragma omp simd // In complex case, this SIMD loop has 2 instructions, the compiler may generate incoreect code, so need to disable this omp simd +//#endif for (irow = fnz; irow < iklrow; ++irow) dest[irow - ikfrow] -= uval[uptr++] * y[jj]; stat[thread_id1]->ops[SOLVE] += 2 * (iklrow - fnz); } - } /* for jj ... */ + } /* end for jj ... */ } #if ( PROFlevel>=1 ) @@ -1492,7 +1481,6 @@ void slsum_bmod_inv stat[thread_id1]->utime[SOL_GEMM] += t2; #endif - #ifdef _OPENMP #pragma omp atomic capture #endif @@ -1504,9 +1492,9 @@ void slsum_bmod_inv if ( iam != p ) { for (ii=1;ii=1 ) TIC(t1); #endif - for (ii=1;iiops[SOLVE] += 2 * (iklrow - fnz); + dest[irow - ikfrow] -= uval[uptr++] * y[jj]; + stat[thread_id]->ops[SOLVE] += 2 * (iklrow - fnz); } } /* for jj ... 
*/ } @@ -1686,9 +1673,9 @@ void slsum_bmod_inv if ( iam != p ) { for (ii=1;ii16){ - // #ifdef _OPENMP - // #pragma omp task firstprivate (Ucb_indptr,Ucb_valptr,Llu,sizelsum,ii,gik,x,rtemp,bmod,Urbs,lsum,stat,nrhs,grid,xsup) untied - // #endif +// if(Urbs[lk1]>16){ +// #ifdef _OPENMP +// #pragma omp task firstprivate (Ucb_indptr,Ucb_valptr,Llu,sizelsum,ii,gik,x,rtemp,bmod,Urbs,lsum,stat,nrhs,grid,xsup) untied +// #endif // slsum_bmod_inv(lsum, x, &x[ii], rtemp, nrhs, gik, bmod, Urbs, // Ucb_indptr, Ucb_valptr, xsup, grid, Llu, // stat, root_send, nroot_send, sizelsum,sizertemp); //}else{ - slsum_bmod_inv(lsum, x, &x[ii], rtemp, nrhs, gik, bmod, Urbs, - Ucb_indptr, Ucb_valptr, xsup, grid, Llu, - stat, root_send, nroot_send, sizelsum,sizertemp,thread_id,num_thread); + slsum_bmod_inv(lsum, x, &x[ii], rtemp, nrhs, gik, bmod, Urbs, + Ucb_indptr, Ucb_valptr, xsup, grid, Llu, + stat, root_send, nroot_send, sizelsum,sizertemp,thread_id,num_thread); //} // } /* if brecv[ik] == 0 */ } } /* if bmod[ik] == 0 */ - } /* for ub ... */ - } + } /* end for ub ... */ + } /* end else ... */ } /* slSUM_BMOD_inv */ @@ -1943,9 +1930,9 @@ void slsum_bmod_inv_master fnz = usub[i + jj]; if ( fnz < iklrow ) { /* Nonzero segment. */ /* AXPY */ - #ifdef _OPENMP - #pragma omp simd - #endif +//#ifdef _OPENMP +//#pragma omp simd // In complex case, this SIMD loop has 2 instructions, the compiler may generate incoreect code, so need to disable this omp simd +//#endif for (irow = fnz; irow < iklrow; ++irow) dest[irow - ikfrow] -= uval[uptr++] * y[jj]; stat[thread_id1]->ops[SOLVE] += 2 * (iklrow - fnz); @@ -1985,9 +1972,9 @@ void slsum_bmod_inv_master fnz = usub[i + jj]; if ( fnz < iklrow ) { /* Nonzero segment. */ /* AXPY */ - #ifdef _OPENMP - #pragma omp simd - #endif +//#ifdef _OPENMP +//#pragma omp simd // In complex case, this SIMD loop has 2 instructions, the compiler may generate incoreect code, so need to disable this omp simd +//#endif for (irow = fnz; irow < iklrow; ++irow) dest[irow - ikfrow] -= uval[uptr++] * y[jj]; stat[thread_id]->ops[SOLVE] += 2 * (iklrow - fnz); @@ -2021,9 +2008,9 @@ void slsum_bmod_inv_master if ( iam != p ) { for (ii=1;ii 0 */ } /* end for j in parallel ... */ +#ifdef _OPENMP /* #pragma omp taskwait */ +#endif } /* end for b ... 
*/ #ifndef USE_Ublock_info diff --git a/SRC/pzgstrs_lsum.c b/SRC/pzgstrs_lsum.c index 20aeb2b2..987a8147 100644 --- a/SRC/pzgstrs_lsum.c +++ b/SRC/pzgstrs_lsum.c @@ -539,11 +539,11 @@ void zlsum_fmod_inv for (nn=0;nn=1 ) TOC(t2, t1); @@ -611,156 +611,155 @@ void zlsum_fmod_inv #endif for (lb=lbstart;lb=1 ) - TIC(t1); -#endif - for (ii=1;iiLrowind_bc_ptr[lk]; - lusup1 = Llu->Lnzval_bc_ptr[lk]; - nsupr1 = lsub1[1]; - - if(Llu->inv == 1){ - Linv = Llu->Linv_bc_ptr[lk]; + TIC(t1); +#endif + for (ii=1;iiLrowind_bc_ptr[lk]; + lusup1 = Llu->Lnzval_bc_ptr[lk]; + nsupr1 = lsub1[1]; + + if(Llu->inv == 1){ + Linv = Llu->Linv_bc_ptr[lk]; #ifdef _CRAY - CGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, - &alpha, Linv, &iknsupc, &x[ii], - &iknsupc, &beta, rtemp_loc, &iknsupc ); + CGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); #elif defined (USE_VENDOR_BLAS) - zgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, - &alpha, Linv, &iknsupc, &x[ii], - &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); + zgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); #else - zgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, - &alpha, Linv, &iknsupc, &x[ii], - &iknsupc, &beta, rtemp_loc, &iknsupc ); -#endif - #ifdef _OPENMP - #pragma omp simd - #endif - for (i=0 ; i=1 ) - TOC(t2, t1); - stat[thread_id1]->utime[SOL_TRSM] += t2; + TOC(t2, t1); + stat[thread_id1]->utime[SOL_TRSM] += t2; #endif - stat[thread_id1]->ops[SOLVE] += 4 * iknsupc * (iknsupc - 1) * nrhs + stat[thread_id1]->ops[SOLVE] += 4 * iknsupc * (iknsupc - 1) * nrhs + 10 * knsupc * nrhs; /* complex division */ #if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, ik); + printf("(%2d) Solve X[%2d]\n", iam, ik); #endif - /* - * Send Xk to process column Pc[k]. - */ + /* + * Send Xk to process column Pc[k]. + */ - if(LBtree_ptr[lk]!=NULL){ + if(LBtree_ptr[lk]!=NULL){ #ifdef _OPENMP #pragma omp atomic capture #endif - nleaf_send_tmp = ++nleaf_send[0]; - leaf_send[(nleaf_send_tmp-1)*aln_i] = lk; - } + nleaf_send_tmp = ++nleaf_send[0]; + leaf_send[(nleaf_send_tmp-1)*aln_i] = lk; + } - /* - * Perform local block modifications. - */ + /* + * Perform local block modifications. + */ - // #ifdef _OPENMP - // #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1,thread_id1) untied priority(1) - // #endif - { +// #ifdef _OPENMP +// #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1,thread_id1) untied priority(1) +// #endif + { - zlsum_fmod_inv(lsum, x, &x[ii], rtemp, nrhs, ik, - fmod, xsup, - grid, Llu, stat, leaf_send, nleaf_send ,sizelsum,sizertemp,1+recurlevel,maxsuper,thread_id1,num_thread); - } + zlsum_fmod_inv(lsum, x, &x[ii], rtemp, nrhs, ik, + fmod, xsup, + grid, Llu, stat, leaf_send, nleaf_send ,sizelsum,sizertemp,1+recurlevel,maxsuper,thread_id1,num_thread); + } - // } /* if frecv[lk] == 0 */ - } /* if iam == p */ + // } /* if frecv[lk] == 0 */ + } /* end if iam == p */ } /* if fmod[lk] == 0 */ } - } + } /* end tasklook for nn ... */ } }else{ @@ -802,18 +801,18 @@ void zlsum_fmod_inv il = LSUM_BLK( lk ); RHS_ITERATE(j) - #ifdef _OPENMP - #pragma omp simd - #endif - for (i = 0; i < nbrow1; ++i) { - irow = lsub[lptr+i] - rel; /* Relative row. 
*/ + #ifdef _OPENMP + #pragma omp simd + #endif + for (i = 0; i < nbrow1; ++i) { + irow = lsub[lptr+i] - rel; /* Relative row. */ - z_sub(&lsum[il+irow + j*iknsupc+sizelsum*thread_id], - &lsum[il+irow + j*iknsupc+sizelsum*thread_id], - &rtemp_loc[nbrow_ref+i + j*nbrow]); - } + z_sub(&lsum[il+irow + j*iknsupc+sizelsum*thread_id], + &lsum[il+irow + j*iknsupc+sizelsum*thread_id], + &rtemp_loc[nbrow_ref+i + j*nbrow]); + } nbrow_ref+=nbrow1; - } + } /* end for lb ... */ // TOC(t3, t1); @@ -824,147 +823,142 @@ void zlsum_fmod_inv for (lb=0;lb=1 ) - TIC(t1); + TIC(t1); #endif - for (ii=1;iiLrowind_bc_ptr[lk]; - lusup1 = Llu->Lnzval_bc_ptr[lk]; - nsupr1 = lsub1[1]; + lk = LBj( ik, grid );/* Local block number, column-wise. */ + lsub1 = Llu->Lrowind_bc_ptr[lk]; + lusup1 = Llu->Lnzval_bc_ptr[lk]; + nsupr1 = lsub1[1]; - if(Llu->inv == 1){ - Linv = Llu->Linv_bc_ptr[lk]; + if(Llu->inv == 1){ + Linv = Llu->Linv_bc_ptr[lk]; #ifdef _CRAY - CGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, - &alpha, Linv, &iknsupc, &x[ii], - &iknsupc, &beta, rtemp_loc, &iknsupc ); + CGEMM( ftcs2, ftcs2, &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); #elif defined (USE_VENDOR_BLAS) - zgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, - &alpha, Linv, &iknsupc, &x[ii], - &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); + zgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc, 1, 1 ); #else - zgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, - &alpha, Linv, &iknsupc, &x[ii], - &iknsupc, &beta, rtemp_loc, &iknsupc ); + zgemm_( "N", "N", &iknsupc, &nrhs, &iknsupc, + &alpha, Linv, &iknsupc, &x[ii], + &iknsupc, &beta, rtemp_loc, &iknsupc ); #endif - #ifdef _OPENMP - #pragma omp simd - #endif - for (i=0 ; i=1 ) - TOC(t2, t1); - stat[thread_id]->utime[SOL_TRSM] += t2; + TOC(t2, t1); + stat[thread_id]->utime[SOL_TRSM] += t2; #endif - stat[thread_id]->ops[SOLVE] += 4 * iknsupc * (iknsupc - 1) * nrhs + stat[thread_id]->ops[SOLVE] += 4 * iknsupc * (iknsupc - 1) * nrhs + 10 * knsupc * nrhs; /* complex division */ #if ( DEBUGlevel>=2 ) - printf("(%2d) Solve X[%2d]\n", iam, ik); + printf("(%2d) Solve X[%2d]\n", iam, ik); #endif - /* - * Send Xk to process column Pc[k]. - */ - - if(LBtree_ptr[lk]!=NULL){ + /* + * Send Xk to process column Pc[k]. + */ + if(LBtree_ptr[lk]!=NULL){ #ifdef _OPENMP #pragma omp atomic capture #endif - nleaf_send_tmp = ++nleaf_send[0]; - // printf("nleaf_send_tmp %5d lk %5d\n",nleaf_send_tmp); - leaf_send[(nleaf_send_tmp-1)*aln_i] = lk; - // BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],'z'); - } - - /* - * Perform local block modifications. - */ + nleaf_send_tmp = ++nleaf_send[0]; + // printf("nleaf_send_tmp %5d lk %5d\n",nleaf_send_tmp); + leaf_send[(nleaf_send_tmp-1)*aln_i] = lk; + // BcTree_forwardMessageSimple(LBtree_ptr[lk],&x[ii - XK_H],'z'); + } - // #ifdef _OPENMP - // #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1) untied priority(1) - // #endif + /* + * Perform local block modifications. 
+ */ - { - zlsum_fmod_inv(lsum, x, &x[ii], rtemp, nrhs, ik, - fmod, xsup, - grid, Llu, stat, leaf_send, nleaf_send ,sizelsum,sizertemp,1+recurlevel,maxsuper,thread_id,num_thread); - } +// #ifdef _OPENMP +// #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1) untied priority(1) +// #endif + { + zlsum_fmod_inv(lsum, x, &x[ii], rtemp, nrhs, ik, + fmod, xsup, + grid, Llu, stat, leaf_send, nleaf_send ,sizelsum,sizertemp,1+recurlevel,maxsuper,thread_id,num_thread); + } // } /* if frecv[lk] == 0 */ - } /* if iam == p */ + } /* end else iam == p */ } /* if fmod[lk] == 0 */ } // } @@ -973,7 +967,6 @@ void zlsum_fmod_inv stat[thread_id]->ops[SOLVE] += 8 * m * nrhs * knsupc; - } /* if nlb>0*/ } /* zLSUM_FMOD_INV */ @@ -1157,9 +1150,9 @@ void zlsum_fmod_inv_master il = LSUM_BLK( lk ); RHS_ITERATE(j) - #ifdef _OPENMP - #pragma omp simd lastprivate(irow) - #endif + #ifdef _OPENMP + #pragma omp simd lastprivate(irow) + #endif for (i = 0; i < nbrow1; ++i) { irow = lsub[lptr+i] - rel; /* Relative row. */ z_sub(&lsum[il+irow + j*iknsupc], @@ -1167,14 +1160,15 @@ void zlsum_fmod_inv_master &rtemp_loc[nbrow_ref+i + j*nbrow]); } nbrow_ref+=nbrow1; - } + } /* end for lb ... */ #if ( PROFlevel>=1 ) TOC(t2, t1); stat[thread_id1]->utime[SOL_GEMM] += t2; #endif - } - } + } /* end if (lbstart=1 ) TOC(t2, t1); stat[thread_id]->utime[SOL_GEMM] += t2; #endif - } - // TOC(t3, t1); + } /* end else ... */ + // TOC(t3, t1); rtemp_loc = &rtemp[sizertemp* thread_id]; for (lb=0;lb=1 ) TIC(t1); #endif for (ii=1;ii=1 ) TOC(t2, t1); stat[thread_id]->utime[SOL_TRSM] += t2; - #endif stat[thread_id]->ops[SOLVE] += 4 * iknsupc * (iknsupc - 1) * nrhs - + 10 * knsupc * nrhs; /* complex division */ + + 10 * knsupc * nrhs; /* complex division */ #if ( DEBUGlevel>=2 ) printf("(%2d) Solve X[%2d]\n", iam, ik); @@ -1373,13 +1363,12 @@ void zlsum_fmod_inv_master * Perform local block modifications. */ - // #ifdef _OPENMP - // #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1,thread_id1) untied priority(1) - // #endif +// #ifdef _OPENMP +// #pragma omp task firstprivate (Llu,sizelsum,iknsupc,ii,ik,lsub1,x,rtemp,fmod,lsum,stat,nrhs,grid,xsup,recurlevel) private(lptr1,luptr1,nlb1,thread_id1) untied priority(1) +// #endif { nlb1 = lsub1[0] - 1; - zlsum_fmod_inv_master(lsum, x, &x[ii], rtemp, nrhs, iknsupc, ik, fmod, nlb1, xsup, grid, Llu, stat,sizelsum,sizertemp,1+recurlevel,maxsuper,thread_id,num_thread); @@ -1391,8 +1380,8 @@ void zlsum_fmod_inv_master } // } stat[thread_id]->ops[SOLVE] += 8 * m * nrhs * knsupc; - } /* if nlb>0*/ -} /* zLSUM_FMOD_INV */ + } /* end if nlb>0*/ +} /* end zlsum_fmod_inv_master */ @@ -1452,7 +1441,7 @@ void zlsum_bmod_inv float msg_vol = 0, msg_cnt = 0; int_t Nchunk, nub_loc,remainder,nn,lbstart,lbend; int_t iword = sizeof(int_t); - int_t dword = sizeof (double); + int_t dword = sizeof(double); int_t aln_d,aln_i; aln_d = ceil(CACHELINE/(double)dword); aln_i = ceil(CACHELINE/(double)iword); @@ -1516,9 +1505,9 @@ void zlsum_bmod_inv fnz = usub[i + jj]; if ( fnz < iklrow ) { /* Nonzero segment. 
*/ /* AXPY */ - #ifdef _OPENMP - #pragma omp simd - #endif +//#ifdef _OPENMP +//#pragma omp simd // In complex case, this SIMD loop has 2 instructions, the compiler may generate incoreect code, so need to disable this omp simd +//#endif for (irow = fnz; irow < iklrow; ++irow) { zz_mult(&temp, &uval[uptr], &y[jj]); @@ -1529,7 +1518,7 @@ void zlsum_bmod_inv stat[thread_id1]->ops[SOLVE] += 8 * (iklrow - fnz); } - } /* for jj ... */ + } /* end for jj ... */ } #if ( PROFlevel>=1 ) @@ -1537,7 +1526,6 @@ void zlsum_bmod_inv stat[thread_id1]->utime[SOL_GEMM] += t2; #endif - #ifdef _OPENMP #pragma omp atomic capture #endif @@ -1549,9 +1537,9 @@ void zlsum_bmod_inv if ( iam != p ) { for (ii=1;ii=1 ) TIC(t1); #endif - for (ii=1;iiops[SOLVE] += 8 * (iklrow - fnz); + } + stat[thread_id]->ops[SOLVE] += 8 * (iklrow - fnz); } } /* for jj ... */ } @@ -1743,9 +1730,9 @@ void zlsum_bmod_inv if ( iam != p ) { for (ii=1;ii16){ - // #ifdef _OPENMP - // #pragma omp task firstprivate (Ucb_indptr,Ucb_valptr,Llu,sizelsum,ii,gik,x,rtemp,bmod,Urbs,lsum,stat,nrhs,grid,xsup) untied - // #endif +// if(Urbs[lk1]>16){ +// #ifdef _OPENMP +// #pragma omp task firstprivate (Ucb_indptr,Ucb_valptr,Llu,sizelsum,ii,gik,x,rtemp,bmod,Urbs,lsum,stat,nrhs,grid,xsup) untied +// #endif // zlsum_bmod_inv(lsum, x, &x[ii], rtemp, nrhs, gik, bmod, Urbs, // Ucb_indptr, Ucb_valptr, xsup, grid, Llu, // stat, root_send, nroot_send, sizelsum,sizertemp); //}else{ - zlsum_bmod_inv(lsum, x, &x[ii], rtemp, nrhs, gik, bmod, Urbs, - Ucb_indptr, Ucb_valptr, xsup, grid, Llu, - stat, root_send, nroot_send, sizelsum,sizertemp,thread_id,num_thread); + zlsum_bmod_inv(lsum, x, &x[ii], rtemp, nrhs, gik, bmod, Urbs, + Ucb_indptr, Ucb_valptr, xsup, grid, Llu, + stat, root_send, nroot_send, sizelsum,sizertemp,thread_id,num_thread); //} // } /* if brecv[ik] == 0 */ } } /* if bmod[ik] == 0 */ - } /* for ub ... */ - } + } /* end for ub ... */ + } /* end else ... */ } /* zlSUM_BMOD_inv */ @@ -2007,9 +1994,9 @@ void zlsum_bmod_inv_master fnz = usub[i + jj]; if ( fnz < iklrow ) { /* Nonzero segment. */ /* AXPY */ - #ifdef _OPENMP - #pragma omp simd - #endif +//#ifdef _OPENMP +//#pragma omp simd // In complex case, this SIMD loop has 2 instructions, the compiler may generate incoreect code, so need to disable this omp simd +//#endif for (irow = fnz; irow < iklrow; ++irow) { zz_mult(&temp, &uval[uptr], &y[jj]); @@ -2054,9 +2041,9 @@ void zlsum_bmod_inv_master fnz = usub[i + jj]; if ( fnz < iklrow ) { /* Nonzero segment. */ /* AXPY */ - #ifdef _OPENMP - #pragma omp simd - #endif +//#ifdef _OPENMP +//#pragma omp simd // In complex case, this SIMD loop has 2 instructions, the compiler may generate incoreect code, so need to disable this omp simd +//#endif for (irow = fnz; irow < iklrow; ++irow) { zz_mult(&temp, &uval[uptr], &y[jj]); @@ -2095,9 +2082,9 @@ void zlsum_bmod_inv_master if ( iam != p ) { for (ii=1;ii */ @@ -74,9 +75,9 @@ at the top-level directory. * Versions 4.x and earlier do not include a #define'd version numbers. */ #define SUPERLU_DIST_MAJOR_VERSION 7 -#define SUPERLU_DIST_MINOR_VERSION 0 +#define SUPERLU_DIST_MINOR_VERSION 1 #define SUPERLU_DIST_PATCH_VERSION 0 -#define SUPERLU_DIST_RELEASE_DATE "May 12, 2021" +#define SUPERLU_DIST_RELEASE_DATE "October 5, 2021" #include "superlu_dist_config.h" /* Define my integer size int_t */ diff --git a/SRC/superlu_grid.c b/SRC/superlu_grid.c index ca0a999c..3e13636d 100644 --- a/SRC/superlu_grid.c +++ b/SRC/superlu_grid.c @@ -12,10 +12,11 @@ at the top-level directory. * \brief SuperLU grid utilities * *
- * -- Distributed SuperLU routine (version 6.1) --
+ * -- Distributed SuperLU routine (version 7.1.0) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * September 1, 1999
  * February 8, 2019  version 6.1.1
+ * October 5, 2021
  * </pre>
  */
diff --git a/SRC/superlu_grid3d.c b/SRC/superlu_grid3d.c
index aec8d53d..84412c23 100644
--- a/SRC/superlu_grid3d.c
+++ b/SRC/superlu_grid3d.c
@@ -2,9 +2,10 @@
  * \brief SuperLU grid utilities
  *
  * <pre>
- * -- Distributed SuperLU routine (version 7.0.0) --
+ * -- Distributed SuperLU routine (version 7.1.0) --
  * Lawrence Berkeley National Lab, Oak Ridge National Lab
  * May 12, 2021
+ * October 5, 2021
  * </pre>
  */
diff --git a/SRC/ztrfCommWrapper.c b/SRC/ztrfCommWrapper.c
index a3897c11..3dc098f1 100644
--- a/SRC/ztrfCommWrapper.c
+++ b/SRC/ztrfCommWrapper.c
@@ -159,7 +159,9 @@ int_t zLPanelTrSolve( int_t k, int_t* factored_L,
     // unsigned long long t1 = _rdtsc();
+#ifdef _OPENMP
 // #pragma omp for schedule(dynamic) nowait
+#endif
 #define BL 32
     for (int i = 0; i < CEILING(l, BL); ++i)
     {
@@ -198,13 +200,17 @@ int_t zLPanelTrSolve( int_t k, int_t* factored_L,
     // printf("%d: L update \n",k );
 #define BL 32
+#ifdef _OPENMP
 // #pragma omp parallel for
+#endif
     for (int i = 0; i < CEILING(l, BL); ++i)
     {
         int_t off = i * BL;
         // Sherry: int_t len = MY_MIN(BL, l - i * BL);
         int len = SUPERLU_MIN(BL, (l - i * BL));
+#ifdef _OPENMP
 //#pragma omp task
+#endif
         {
             superlu_ztrsm("R", "U", "N", "N", len, nsupc, alpha,
                           ublk_ptr, ld_ujrow, &lusup[nsupc + off], nsupr);
@@ -270,7 +276,9 @@ int_t zUPanelTrSolve( int_t k,
     Trs2_InitUblock_info(klst, nb, Ublock_info, usub, Glu_persist, stat );
     /* Loop through all the row blocks. */
+#ifdef _OPENMP
 // #pragma omp for schedule(dynamic,2) nowait
+#endif
     for (int_t b = 0; b < nb; ++b)
     {
 #ifdef _OPENMP
diff --git a/make.inc.in b/make.inc.in
index 4956ebe9..0beb461b 100644
--- a/make.inc.in
+++ b/make.inc.in
@@ -10,6 +10,7 @@
 #
 #  Modified:   October 13, 2017  version 5.2.1
 #              February 20, 2021 version 7.0.0
+#              October 5, 2021   version 7.1.0
 #
 ############################################################################
 #

From 9fa42765c5d704a842cb46363eacbd8495ecf003 Mon Sep 17 00:00:00 2001
From: Sherry Li 
Date: Tue, 5 Oct 2021 09:26:13 -0700
Subject: [PATCH 136/147] Add the missing '#ifdef _OPENMP' around a number of
 '#pragma omp' in xsuperlu_gpu.cu. Update version number to 7.1.0

---
 SRC/dlustruct_gpu.h         |  3 +--
 SRC/dnrformat_loc3d.c       |  3 ++-
 SRC/dtreeFactorizationGPU.c | 28 ++++++++++++++++++++++++++--
 SRC/dutil_dist.c            |  4 ++--
 SRC/pdgstrf3d.c             |  2 +-
 SRC/pdgstrs_lsum.c          |  5 +++--
 SRC/psgstrf3d.c             |  2 +-
 SRC/psgstrs_lsum.c          |  5 +++--
 SRC/pzgstrf3d.c             |  2 +-
 SRC/pzgstrs_lsum.c          |  5 +++--
 SRC/slustruct_gpu.h         |  3 +--
 SRC/snrformat_loc3d.c       |  3 ++-
 SRC/streeFactorizationGPU.c | 28 ++++++++++++++++++++++++++--
 SRC/sutil_dist.c            |  4 ++--
 SRC/zlustruct_gpu.h         |  3 +--
 SRC/znrformat_loc3d.c       |  3 ++-
 SRC/ztreeFactorizationGPU.c | 28 ++++++++++++++++++++++++++--
 SRC/zutil_dist.c            |  4 ++--
 18 files changed, 105 insertions(+), 30 deletions(-)

diff --git a/SRC/dlustruct_gpu.h b/SRC/dlustruct_gpu.h
index 1cdc366d..4792ea97 100644
--- a/SRC/dlustruct_gpu.h
+++ b/SRC/dlustruct_gpu.h
@@ -135,8 +135,7 @@ typedef struct //LUstruct_gpu_
 typedef struct //sluGPU_t_
 {
     int_t gpuId;        // if there are multiple GPUs
-    dLUstruct_gpu_t *A_gpu; // holds the LU structure on GPU
-    //*dA_gpu; not used
+    dLUstruct_gpu_t *A_gpu, *dA_gpu; // holds the LU structure on GPU
     cudaStream_t funCallStreams[MAX_NCUDA_STREAMS], CopyStream;
     cublasHandle_t cublasHandles[MAX_NCUDA_STREAMS];
     int_t lastOffloadStream[MAX_NCUDA_STREAMS];
diff --git a/SRC/dnrformat_loc3d.c b/SRC/dnrformat_loc3d.c
index 4ae5a8e6..97dac6a6 100644
--- a/SRC/dnrformat_loc3d.c
+++ b/SRC/dnrformat_loc3d.c
@@ -17,9 +17,10 @@ at the top-level directory.
 *        - Scatter B (solution) from 2D process layer 0 to 3D grid
 *
 * <pre>
- * -- Distributed SuperLU routine (version 7.0) --
+ * -- Distributed SuperLU routine (version 7.1.0) --
  * Lawrence Berkeley National Lab, Oak Ridge National Lab.
  * May 12, 2021
+ * October 5, 2021
  */
 
 #include "superlu_ddefs.h"
diff --git a/SRC/dtreeFactorizationGPU.c b/SRC/dtreeFactorizationGPU.c
index 9f5bb8ee..0a05275a 100644
--- a/SRC/dtreeFactorizationGPU.c
+++ b/SRC/dtreeFactorizationGPU.c
@@ -26,6 +26,7 @@
                   ^          ^
                   0          jj_cpu
 */
+#if 0
 static int_t getAccUPartition(HyP_t *HyP)
 {
     /* Sherry: what if num_u_blks_phi == 0 ? Need to fix the bug */
@@ -56,6 +57,7 @@ static int_t getAccUPartition(HyP_t *HyP)
 
     return jj_cpu;
 }
+#endif
 
 int dsparseTreeFactor_ASYNC_GPU(
     sForest_t *sforest,
@@ -407,11 +409,19 @@ int dsparseTreeFactor_ASYNC_GPU(
 
             double t1 = SuperLU_timer_();
 
+#ifdef _OPENMP
 #pragma omp parallel /* Look-ahead update on CPU */
+#endif
             {
-                int_t thread_id = omp_get_thread_num();
+#ifdef _OPENMP
+                int thread_id = omp_get_thread_num();
+#else
+		int thread_id = 0; 
+#endif
 
+#ifdef _OPENMP
 #pragma omp for
+#endif
                 for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks; ++ij)
                 {
                     int_t j = ij / HyP->lookAheadBlk;
@@ -420,7 +430,9 @@ int dsparseTreeFactor_ASYNC_GPU(
                                                usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat);
                 }
 
+#ifdef _OPENMP
 #pragma omp for
+#endif
                 for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks_Phi; ++ij)
                 {
                     int_t j = ij / HyP->lookAheadBlk;
@@ -429,7 +441,9 @@ int dsparseTreeFactor_ASYNC_GPU(
                                                 usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat);
                 }
 
+#ifdef _OPENMP
 #pragma omp for
+#endif
                 for (int_t ij = 0; ij < HyP->RemainBlk * HyP->num_u_blks; ++ij)
                 {
                     int_t j = ij / HyP->RemainBlk;
@@ -499,14 +513,22 @@ int dsparseTreeFactor_ASYNC_GPU(
                 } /* end if all children are done */
             }     /* end if non-root */
 
+#ifdef _OPENMP
 #pragma omp parallel
+#endif
             {
                 /* Master thread performs Schur complement update on GPU. */
+#ifdef _OPENMP
 #pragma omp master
+#endif
                 {
                     if (superlu_acc_offload)
                     {
+#ifdef _OPENMP
                         int thread_id = omp_get_thread_num();
+#else			
+                        int thread_id = 0;
+#endif			
                         double t1 = SuperLU_timer_();
 
                         if (offload_condition)
@@ -558,8 +580,10 @@ int dsparseTreeFactor_ASYNC_GPU(
                     } /* endif (superlu_acc_offload) */
 
                 } /* end omp master thread */
-
+		
+#ifdef _OPENMP
 #pragma omp for
+#endif
                 /* The following update is on CPU. Should not be necessary now,
 		   because we set jj_cpu equal to num_u_blks_Phi.      		*/
                 for (int_t ij = 0; ij < HyP->RemainBlk * (HyP->num_u_blks_Phi - jj_cpu); ++ij)
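
Every hunk in this file repeats one portability pattern: each '#pragma omp ...'
and each omp_get_thread_num() call is fenced so the translation unit still
compiles without OpenMP, with thread_id falling back to 0 in a serial build,
and retired helpers such as getAccUPartition() are parked under '#if 0'. A
self-contained sketch of that pattern (illustrative, not the library's code):

    /* Guard pattern applied throughout patch 136: OpenMP pragmas and
       runtime calls compile away cleanly when _OPENMP is not defined. */
    #include <stdio.h>
    #ifdef _OPENMP
    #include <omp.h>
    #endif

    #if 0   /* same idiom as getAccUPartition() above: dead code stays
               readable but is excluded from compilation */
    static int retired_helper(void) { return 0; }
    #endif

    int main(void)
    {
    #ifdef _OPENMP
    #pragma omp parallel
    #endif
        {
    #ifdef _OPENMP
            int thread_id = omp_get_thread_num(); /* note: int, not int_t */
    #else
            int thread_id = 0;                    /* serial fallback */
    #endif
            printf("worker %d\n", thread_id);
        }
        return 0;
    }

Built without -fopenmp the block simply executes once with thread_id 0, which
is exactly the behavior the guarded hunks above rely on.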
diff --git a/SRC/dutil_dist.c b/SRC/dutil_dist.c
index 5c231e7e..5e4663d2 100644
--- a/SRC/dutil_dist.c
+++ b/SRC/dutil_dist.c
@@ -14,10 +14,10 @@ at the top-level directory.
  * \brief Several matrix utilities
  *
  * <pre>
- * -- Distributed SuperLU routine (version 6.1.1) --
+ * -- Distributed SuperLU routine (version 7.1.0) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * March 15, 2003
- *
+ * October 5, 2021
  */
 
 #include 
diff --git a/SRC/pdgstrf3d.c b/SRC/pdgstrf3d.c
index 0deebd74..0cd403db 100644
--- a/SRC/pdgstrf3d.c
+++ b/SRC/pdgstrf3d.c
@@ -233,7 +233,7 @@ int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
 #ifdef GPU_ACC
 
     /*Now initialize the GPU data structure*/
-    // dLUstruct_gpu_t *A_gpu, *dA_gpu; // not used
+    dLUstruct_gpu_t *A_gpu, *dA_gpu;
 
     d2Hreduce_t d2HredObj;
     d2Hreduce_t* d2Hred = &d2HredObj;
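
Re-enabling dA_gpu alongside A_gpu restores the usual host-mirror/device-copy
idiom: the host-side struct is populated with device pointers, then the struct
itself is copied to the GPU so kernels receive a single pointer. The sketch
below uses an invented demo struct for illustration; only the CUDA runtime
calls are real API.

    /* Host-mirror / device-copy idiom behind the A_gpu / dA_gpu pair.
       The demo struct is invented; it is not the library's layout. */
    #include <cuda_runtime.h>

    typedef struct {
        double *LnzvalVec;   /* device buffer, filled in by the host */
        int     nsupers;
    } demoLUstruct_gpu_t;

    int demo_setup(demoLUstruct_gpu_t *A_gpu,   /* host mirror */
                   demoLUstruct_gpu_t **dA_gpu, /* device copy */
                   size_t nzval_bytes, int nsupers)
    {
        if (cudaMalloc((void **)&A_gpu->LnzvalVec, nzval_bytes) != cudaSuccess)
            return -1;
        A_gpu->nsupers = nsupers;

        /* Copy the struct of device pointers to the GPU so a kernel can
           take one demoLUstruct_gpu_t* argument. */
        if (cudaMalloc((void **)dA_gpu, sizeof(*A_gpu)) != cudaSuccess)
            return -1;
        return cudaMemcpy(*dA_gpu, A_gpu, sizeof(*A_gpu),
                          cudaMemcpyHostToDevice) == cudaSuccess ? 0 : -1;
    }
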
diff --git a/SRC/pdgstrs_lsum.c b/SRC/pdgstrs_lsum.c
index d2f3deaa..75be1c59 100644
--- a/SRC/pdgstrs_lsum.c
+++ b/SRC/pdgstrs_lsum.c
@@ -14,14 +14,15 @@ at the top-level directory.
  * \brief Perform local block modifications: lsum[i] -= L_i,k * X[k]
  *
  * <pre>
- * -- Distributed SuperLU routine (version 6.1) --
+ * -- Distributed SuperLU routine (version 7.1.0) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * March 15, 2003
  *
  * Modified:
  *     Feburary 7, 2001    use MPI_Isend/MPI_Irecv
  *     October 2, 2001     use MPI_Isend/MPI_Irecv with MPI_Test
- * February 8, 2019  version 6.1.1
+ *     February 8, 2019  version 6.1.1
+ *     October 5, 2021   version 7.1.0  disable a few 'omp simd'
  * 
  * </pre>
  */
diff --git a/SRC/psgstrf3d.c b/SRC/psgstrf3d.c
index 2c73580d..3f5b3ce2 100644
--- a/SRC/psgstrf3d.c
+++ b/SRC/psgstrf3d.c
@@ -233,7 +233,7 @@ int_t psgstrf3d(superlu_dist_options_t *options, int m, int n, float anorm,
 #ifdef GPU_ACC
 
     /*Now initialize the GPU data structure*/
-    // sLUstruct_gpu_t *A_gpu, *dA_gpu; // not used
+    sLUstruct_gpu_t *A_gpu, *dA_gpu;
 
     d2Hreduce_t d2HredObj;
     d2Hreduce_t* d2Hred = &d2HredObj;
diff --git a/SRC/psgstrs_lsum.c b/SRC/psgstrs_lsum.c
index 2b444e03..ead09ab5 100644
--- a/SRC/psgstrs_lsum.c
+++ b/SRC/psgstrs_lsum.c
@@ -14,14 +14,15 @@ at the top-level directory.
  * \brief Perform local block modifications: lsum[i] -= L_i,k * X[k]
  *
  * <pre>
- * -- Distributed SuperLU routine (version 6.1) --
+ * -- Distributed SuperLU routine (version 7.1.0) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * March 15, 2003
  *
  * Modified:
  *     Feburary 7, 2001    use MPI_Isend/MPI_Irecv
  *     October 2, 2001     use MPI_Isend/MPI_Irecv with MPI_Test
- * February 8, 2019  version 6.1.1
+ *     February 8, 2019  version 6.1.1
+ *     October 5, 2021   version 7.1.0  disable a few 'omp simd'
  * </pre>
  */
diff --git a/SRC/pzgstrf3d.c b/SRC/pzgstrf3d.c
index 03e60b2e..9db714f4 100644
--- a/SRC/pzgstrf3d.c
+++ b/SRC/pzgstrf3d.c
@@ -232,7 +232,7 @@ int_t pzgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
 #ifdef GPU_ACC
 
     /*Now initialize the GPU data structure*/
-    // zLUstruct_gpu_t *A_gpu, *dA_gpu; // not used
+    zLUstruct_gpu_t *A_gpu, *dA_gpu;
 
     d2Hreduce_t d2HredObj;
     d2Hreduce_t* d2Hred = &d2HredObj;
diff --git a/SRC/pzgstrs_lsum.c b/SRC/pzgstrs_lsum.c
index 987a8147..af2ac20c 100644
--- a/SRC/pzgstrs_lsum.c
+++ b/SRC/pzgstrs_lsum.c
@@ -13,14 +13,15 @@ at the top-level directory.
  * \brief Perform local block modifications: lsum[i] -= L_i,k * X[k]
  *
  * <pre>
- * -- Distributed SuperLU routine (version 6.1) --
+ * -- Distributed SuperLU routine (version 7.1.0) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * March 15, 2003
  *
  * Modified:
  *     Feburary 7, 2001    use MPI_Isend/MPI_Irecv
  *     October 2, 2001     use MPI_Isend/MPI_Irecv with MPI_Test
- * February 8, 2019  version 6.1.1
+ *     February 8, 2019  version 6.1.1
+ *     October 5, 2021   version 7.1.0  disable a few 'omp simd'
  * </pre>
  */
diff --git a/SRC/slustruct_gpu.h b/SRC/slustruct_gpu.h
index 9475fba8..443619e4 100644
--- a/SRC/slustruct_gpu.h
+++ b/SRC/slustruct_gpu.h
@@ -135,8 +135,7 @@ typedef struct //LUstruct_gpu_
 typedef struct //sluGPU_t_
 {
     int_t gpuId;        // if there are multiple GPUs
-    sLUstruct_gpu_t *A_gpu; // holds the LU structure on GPU
-    //*dA_gpu; not used
+    sLUstruct_gpu_t *A_gpu, *dA_gpu; // holds the LU structure on GPU
     cudaStream_t funCallStreams[MAX_NCUDA_STREAMS], CopyStream;
     cublasHandle_t cublasHandles[MAX_NCUDA_STREAMS];
     int_t lastOffloadStream[MAX_NCUDA_STREAMS];
diff --git a/SRC/snrformat_loc3d.c b/SRC/snrformat_loc3d.c
index ea63f95f..5140cc24 100644
--- a/SRC/snrformat_loc3d.c
+++ b/SRC/snrformat_loc3d.c
@@ -17,9 +17,10 @@ at the top-level directory.
 *        - Scatter B (solution) from 2D process layer 0 to 3D grid
 *
 * <pre>
- * -- Distributed SuperLU routine (version 7.0) --
+ * -- Distributed SuperLU routine (version 7.1.0) --
  * Lawrence Berkeley National Lab, Oak Ridge National Lab.
  * May 12, 2021
+ * October 5, 2021
  */
 
 #include "superlu_sdefs.h"
diff --git a/SRC/streeFactorizationGPU.c b/SRC/streeFactorizationGPU.c
index 2875ed5c..b52667df 100644
--- a/SRC/streeFactorizationGPU.c
+++ b/SRC/streeFactorizationGPU.c
@@ -26,6 +26,7 @@
                   ^          ^
                   0          jj_cpu
 */
+#if 0
 static int_t getAccUPartition(HyP_t *HyP)
 {
     /* Sherry: what if num_u_blks_phi == 0 ? Need to fix the bug */
@@ -56,6 +57,7 @@ static int_t getAccUPartition(HyP_t *HyP)
 
     return jj_cpu;
 }
+#endif
 
 int ssparseTreeFactor_ASYNC_GPU(
     sForest_t *sforest,
@@ -407,11 +409,19 @@ int ssparseTreeFactor_ASYNC_GPU(
 
             double t1 = SuperLU_timer_();
 
+#ifdef _OPENMP
 #pragma omp parallel /* Look-ahead update on CPU */
+#endif
             {
-                int_t thread_id = omp_get_thread_num();
+#ifdef _OPENMP
+                int thread_id = omp_get_thread_num();
+#else
+		int thread_id = 0; 
+#endif
 
+#ifdef _OPENMP
 #pragma omp for
+#endif
                 for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks; ++ij)
                 {
                     int_t j = ij / HyP->lookAheadBlk;
@@ -420,7 +430,9 @@ int ssparseTreeFactor_ASYNC_GPU(
                                                usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat);
                 }
 
+#ifdef _OPENMP
 #pragma omp for
+#endif
                 for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks_Phi; ++ij)
                 {
                     int_t j = ij / HyP->lookAheadBlk;
@@ -429,7 +441,9 @@ int ssparseTreeFactor_ASYNC_GPU(
                                                 usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat);
                 }
 
+#ifdef _OPENMP
 #pragma omp for
+#endif
                 for (int_t ij = 0; ij < HyP->RemainBlk * HyP->num_u_blks; ++ij)
                 {
                     int_t j = ij / HyP->RemainBlk;
@@ -499,14 +513,22 @@ int ssparseTreeFactor_ASYNC_GPU(
                 } /* end if all children are done */
             }     /* end if non-root */
 
+#ifdef _OPENMP
 #pragma omp parallel
+#endif
             {
                 /* Master thread performs Schur complement update on GPU. */
+#ifdef _OPENMP
 #pragma omp master
+#endif
                 {
                     if (superlu_acc_offload)
                     {
+#ifdef _OPENMP
                         int thread_id = omp_get_thread_num();
+#else			
+                        int thread_id = 0;
+#endif			
                         double t1 = SuperLU_timer_();
 
                         if (offload_condition)
@@ -558,8 +580,10 @@ int ssparseTreeFactor_ASYNC_GPU(
                     } /* endif (superlu_acc_offload) */
 
                 } /* end omp master thread */
-
+		
+#ifdef _OPENMP
 #pragma omp for
+#endif
                 /* The following update is on CPU. Should not be necessary now,
 		   because we set jj_cpu equal to num_u_blks_Phi.      		*/
                 for (int_t ij = 0; ij < HyP->RemainBlk * (HyP->num_u_blks_Phi - jj_cpu); ++ij)
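
Note on the pattern in the hunks above: each "#pragma omp" is fenced by
#ifdef _OPENMP, and omp_get_thread_num() gets a serial fallback of thread 0,
so the file still compiles and behaves correctly without OpenMP. A minimal
standalone sketch of the idiom (illustrative only, not SuperLU_DIST code):

/* omp_guard_sketch.c - compiles and runs with or without -fopenmp */
#include <stdio.h>
#ifdef _OPENMP
#include <omp.h>
#endif

int main(void)
{
#ifdef _OPENMP
#pragma omp parallel
#endif
    {
#ifdef _OPENMP
        int thread_id = omp_get_thread_num(); /* OpenMP build */
#else
        int thread_id = 0;                    /* serial build: one thread */
#endif
        printf("thread %d\n", thread_id);
    }
    return 0;
}
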
diff --git a/SRC/sutil_dist.c b/SRC/sutil_dist.c
index 4dce22a1..4a27e531 100644
--- a/SRC/sutil_dist.c
+++ b/SRC/sutil_dist.c
@@ -14,10 +14,10 @@ at the top-level directory.
  * \brief Several matrix utilities
  *
  * 
- * -- Distributed SuperLU routine (version 6.1.1) --
+ * -- Distributed SuperLU routine (version 7.1.0) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * March 15, 2003
- *
+ * October 5, 2021
  */
 
 #include <math.h>
diff --git a/SRC/zlustruct_gpu.h b/SRC/zlustruct_gpu.h
index 39288000..ab563f96 100644
--- a/SRC/zlustruct_gpu.h
+++ b/SRC/zlustruct_gpu.h
@@ -134,8 +134,7 @@ typedef struct //LUstruct_gpu_
 typedef struct //sluGPU_t_
 {
     int_t gpuId;        // if there are multiple GPUs
-    zLUstruct_gpu_t *A_gpu; // holds the LU structure on GPU
-    //*dA_gpu; not used
+    zLUstruct_gpu_t *A_gpu, *dA_gpu; // holds the LU structure on GPU
     cudaStream_t funCallStreams[MAX_NCUDA_STREAMS], CopyStream;
     cublasHandle_t cublasHandles[MAX_NCUDA_STREAMS];
     int_t lastOffloadStream[MAX_NCUDA_STREAMS];
diff --git a/SRC/znrformat_loc3d.c b/SRC/znrformat_loc3d.c
index 6f8f535e..6647d933 100644
--- a/SRC/znrformat_loc3d.c
+++ b/SRC/znrformat_loc3d.c
@@ -16,9 +16,10 @@ at the top-level directory.
  *        - Scatter B (solution) from 2D process layer 0 to 3D grid
  *
  * 
- * -- Distributed SuperLU routine (version 7.0) --
+ * -- Distributed SuperLU routine (version 7.1.0) --
  * Lawrence Berkeley National Lab, Oak Ridge National Lab.
  * May 12, 2021
+ * October 5, 2021
  */
 
 #include "superlu_zdefs.h"
diff --git a/SRC/ztreeFactorizationGPU.c b/SRC/ztreeFactorizationGPU.c
index 7875366f..e978814d 100644
--- a/SRC/ztreeFactorizationGPU.c
+++ b/SRC/ztreeFactorizationGPU.c
@@ -25,6 +25,7 @@
                   ^          ^
                   0          jj_cpu
 */
+#if 0
 static int_t getAccUPartition(HyP_t *HyP)
 {
     /* Sherry: what if num_u_blks_phi == 0 ? Need to fix the bug */
@@ -55,6 +56,7 @@ static int_t getAccUPartition(HyP_t *HyP)
 
     return jj_cpu;
 }
+#endif
 
 int zsparseTreeFactor_ASYNC_GPU(
     sForest_t *sforest,
@@ -406,11 +408,19 @@ int zsparseTreeFactor_ASYNC_GPU(
 
             double t1 = SuperLU_timer_();
 
+#ifdef _OPENMP
 #pragma omp parallel /* Look-ahead update on CPU */
+#endif
             {
-                int_t thread_id = omp_get_thread_num();
+#ifdef _OPENMP
+                int thread_id = omp_get_thread_num();
+#else
+		int thread_id = 0; 
+#endif
 
+#ifdef _OPENMP
 #pragma omp for
+#endif
                 for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks; ++ij)
                 {
                     int_t j = ij / HyP->lookAheadBlk;
@@ -419,7 +429,9 @@ int zsparseTreeFactor_ASYNC_GPU(
                                                usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat);
                 }
 
+#ifdef _OPENMP
 #pragma omp for
+#endif
                 for (int_t ij = 0; ij < HyP->lookAheadBlk * HyP->num_u_blks_Phi; ++ij)
                 {
                     int_t j = ij / HyP->lookAheadBlk;
@@ -428,7 +440,9 @@ int zsparseTreeFactor_ASYNC_GPU(
                                                 usub, ldt, indirect, indirect2, HyP, LUstruct, grid, SCT, stat);
                 }
 
+#ifdef _OPENMP
 #pragma omp for
+#endif
                 for (int_t ij = 0; ij < HyP->RemainBlk * HyP->num_u_blks; ++ij)
                 {
                     int_t j = ij / HyP->RemainBlk;
@@ -498,14 +512,22 @@ int zsparseTreeFactor_ASYNC_GPU(
                 } /* end if all children are done */
             }     /* end if non-root */
 
+#ifdef _OPENMP
 #pragma omp parallel
+#endif
             {
                 /* Master thread performs Schur complement update on GPU. */
+#ifdef _OPENMP
 #pragma omp master
+#endif
                 {
                     if (superlu_acc_offload)
                     {
+#ifdef _OPENMP
                         int thread_id = omp_get_thread_num();
+#else			
+                        int thread_id = 0;
+#endif			
                         double t1 = SuperLU_timer_();
 
                         if (offload_condition)
@@ -557,8 +579,10 @@ int zsparseTreeFactor_ASYNC_GPU(
                     } /* endif (superlu_acc_offload) */
 
                 } /* end omp master thread */
-
+		
+#ifdef _OPENMP
 #pragma omp for
+#endif
                 /* The following update is on CPU. Should not be necessary now,
 		   because we set jj_cpu equal to num_u_blks_Phi.      		*/
                 for (int_t ij = 0; ij < HyP->RemainBlk * (HyP->num_u_blks_Phi - jj_cpu); ++ij)
diff --git a/SRC/zutil_dist.c b/SRC/zutil_dist.c
index 34d7f308..6688710a 100644
--- a/SRC/zutil_dist.c
+++ b/SRC/zutil_dist.c
@@ -13,10 +13,10 @@ at the top-level directory.
  * \brief Several matrix utilities
  *
  * 
- * -- Distributed SuperLU routine (version 6.1.1) --
+ * -- Distributed SuperLU routine (version 7.1.0) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * March 15, 2003
- *
+ * October 5, 2021
  */
 
 #include <math.h>

From e3eba614889674975dca9f9a697129f9f9cfecf6 Mon Sep 17 00:00:00 2001
From: Sherry Li 
Date: Tue, 5 Oct 2021 22:56:53 -0700
Subject: [PATCH 137/147] GPU update: add superlu_gpu_utils.cu;     call
 free_LUstruct_gpu only if (superlu_acc_offload);     optionally allow
 round-robin binding of MPI ranks to GPUs.

---
 EXAMPLE/pddrive.c    |  1 -
 SRC/CMakeLists.txt   |  2 +-
 SRC/Makefile         |  2 +-
 SRC/pdgstrf3d.c      |  4 +++-
 SRC/psgstrf3d.c      |  4 +++-
 SRC/pzgstrf3d.c      |  4 +++-
 SRC/sp_ienv.c        |  2 +-
 SRC/superlu_grid.c   | 14 +++++++++-----
 SRC/superlu_grid3d.c | 15 +++++++++------
 9 files changed, 30 insertions(+), 18 deletions(-)

diff --git a/EXAMPLE/pddrive.c b/EXAMPLE/pddrive.c
index 861a7b9a..a7a52182 100644
--- a/EXAMPLE/pddrive.c
+++ b/EXAMPLE/pddrive.c
@@ -69,7 +69,6 @@ int main(int argc, char *argv[])
     nprow = 1;  /* Default process rows.      */
     npcol = 1;  /* Default process columns.   */
     nrhs = 1;   /* Number of right-hand side. */
-    printf("MAIN ...\n"); fflush(stdout);
 			      
     /* ------------------------------------------------------------
        INITIALIZE MPI ENVIRONMENT. 
diff --git a/SRC/CMakeLists.txt b/SRC/CMakeLists.txt
index ef071485..99851580 100644
--- a/SRC/CMakeLists.txt
+++ b/SRC/CMakeLists.txt
@@ -56,7 +56,7 @@ set(sources
   sec_structs.c
 )
 if (HAVE_CUDA)
-  list(APPEND sources cublas_utils.c)
+  list(APPEND sources cublas_utils.c superlu_gpu_utils.cu)
 endif()
 
 if (MSVC)
diff --git a/SRC/Makefile b/SRC/Makefile
index 63fb8c3f..f1c68586 100644
--- a/SRC/Makefile
+++ b/SRC/Makefile
@@ -85,7 +85,7 @@ ZPLUSRC += pzgssvx3d.o pzgstrf3d.o ztreeFactorization.o zscatter3d.o \
 	znrformat_loc3d.o ztreeFactorizationGPU.o ##$(FACT3D)
 
 ifeq ($(HAVE_CUDA),TRUE)
-ALLAUX += cublas_utils.o
+ALLAUX += cublas_utils.o superlu_gpu_utils.o
 DPLUSRC += dsuperlu_gpu.o
 ZPLUSRC += zsuperlu_gpu.o
 endif
diff --git a/SRC/pdgstrf3d.c b/SRC/pdgstrf3d.c
index 0cd403db..2fa6f0dc 100644
--- a/SRC/pdgstrf3d.c
+++ b/SRC/pdgstrf3d.c
@@ -344,7 +344,9 @@ int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
 
 #ifdef GPU_ACC
     /* This frees the GPU storage allocated in initSluGPU3D_t() */
-    dfree_LUstruct_gpu (sluGPU->A_gpu);
+    if (superlu_acc_offload) {
+         dfree_LUstruct_gpu (sluGPU->A_gpu);
+    }
 #endif
     
     MPI_Barrier( grid3d->comm);
diff --git a/SRC/psgstrf3d.c b/SRC/psgstrf3d.c
index 3f5b3ce2..a43110d5 100644
--- a/SRC/psgstrf3d.c
+++ b/SRC/psgstrf3d.c
@@ -344,7 +344,9 @@ int_t psgstrf3d(superlu_dist_options_t *options, int m, int n, float anorm,
 
 #ifdef GPU_ACC
     /* This frees the GPU storage allocated in initSluGPU3D_t() */
-    sfree_LUstruct_gpu (sluGPU->A_gpu);
+    if (superlu_acc_offload) {
+         sfree_LUstruct_gpu (sluGPU->A_gpu);
+    }
 #endif
     
     MPI_Barrier( grid3d->comm);
diff --git a/SRC/pzgstrf3d.c b/SRC/pzgstrf3d.c
index 9db714f4..c3b28b35 100644
--- a/SRC/pzgstrf3d.c
+++ b/SRC/pzgstrf3d.c
@@ -343,7 +343,9 @@ int_t pzgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
 
 #ifdef GPU_ACC
     /* This frees the GPU storage allocated in initSluGPU3D_t() */
-    zfree_LUstruct_gpu (sluGPU->A_gpu);
+    if (superlu_acc_offload) {
+         zfree_LUstruct_gpu (sluGPU->A_gpu);
+    }
 #endif
     
     MPI_Barrier( grid3d->comm);
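
The same runtime guard now appears in all three precision variants
(pdgstrf3d/psgstrf3d/pzgstrf3d): the GPU LU storage is freed only when
superlu_acc_offload is set, mirroring the condition under which it was
allocated. A minimal standalone sketch of the paired-guard pattern (names
below are illustrative stand-ins, not the SuperLU_DIST API):

/* guarded_free_sketch.c - free only what the same condition allocated */
#include <stdlib.h>

int main(void)
{
    /* runtime offload switch; stands in for superlu_acc_offload */
    int acc_offload = getenv("SUPERLU_ACC_OFFLOAD") ? 1 : 0;
    double *A_gpu = NULL;

    if (acc_offload)
        A_gpu = malloc(1024 * sizeof *A_gpu); /* stands in for GPU alloc */

    /* ... factorization would run here ... */

    if (acc_offload)   /* mirror the allocation condition exactly */
        free(A_gpu);
    return 0;
}
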
diff --git a/SRC/sp_ienv.c b/SRC/sp_ienv.c
index 84cf47fe..e7ea44db 100644
--- a/SRC/sp_ienv.c
+++ b/SRC/sp_ienv.c
@@ -116,7 +116,7 @@ sp_ienv_dist(int ispec)
         case 8:
   	    ttemp = getenv ("MAX_BUFFER_SIZE");
 	    if (ttemp) return atoi (ttemp);
-	    else return 1000000000; // 256000000 = 16000^2
+	    else return 256000000; // 256000000 = 16000^2
     }
 
     /* Invalid value for ISPEC */
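
With this change the default for ISPEC=8 (MAX_BUFFER_SIZE) drops from
1000000000 to 256000000, i.e. a 16000 x 16000 buffer, and a setting in the
environment still takes precedence. A standalone sketch of the lookup order,
mirroring the getenv/atoi pattern in the hunk above:

/* buffer_size_sketch.c - environment override first, then the default */
#include <stdio.h>
#include <stdlib.h>

static int max_buffer_size(void)
{
    char *ttemp = getenv("MAX_BUFFER_SIZE");
    if (ttemp) return atoi(ttemp);   /* user override wins */
    return 256000000;                /* new default: 16000^2 */
}

int main(void)
{
    printf("MAX_BUFFER_SIZE = %d\n", max_buffer_size());
    return 0;
}

Running with MAX_BUFFER_SIZE=1000000000 in the environment restores the old
default.
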
diff --git a/SRC/superlu_grid.c b/SRC/superlu_grid.c
index 3e13636d..09616031 100644
--- a/SRC/superlu_grid.c
+++ b/SRC/superlu_grid.c
@@ -64,11 +64,15 @@ void superlu_gridinit(MPI_Comm Bcomm, /* The base communicator upon which
     
 #ifdef GPU_ACC
     /* Binding each MPI to a CUDA device */
-    int devs, rank;
-    MPI_Comm_rank(Bcomm, &rank); // MPI_COMM_WORLD??
-    cudaGetDeviceCount(&devs);  // Returns the number of compute-capable devices
-    cudaSetDevice(rank % devs); // Set device to be used for GPU executions
-    ////
+    char *ttemp;
+    ttemp = getenv ("SUPERLU_BIND_MPI_GPU");
+
+    if (ttemp) {
+	int devs, rank;
+	MPI_Comm_rank(Bcomm, &rank); // MPI_COMM_WORLD??
+	cudaGetDeviceCount(&devs);  // Returns the number of compute-capable devices
+	cudaSetDevice(rank % devs); // Set device to be used for GPU executions
+    }
 #endif
 }
 
diff --git a/SRC/superlu_grid3d.c b/SRC/superlu_grid3d.c
index 84412c23..9a0fb623 100644
--- a/SRC/superlu_grid3d.c
+++ b/SRC/superlu_grid3d.c
@@ -49,13 +49,16 @@ void superlu_gridinit3d(MPI_Comm Bcomm, /* The base communicator upon which
     
 #ifdef GPU_ACC
     /* Binding each MPI to a CUDA device */
-    int devs, rank;
-    MPI_Comm_rank(Bcomm, &rank); // MPI_COMM_WORLD??
-    cudaGetDeviceCount(&devs);  // Returns the number of compute-capable devices
-    cudaSetDevice(rank % devs); // Set device to be used for GPU executions
-    ////
+    char *ttemp;
+    ttemp = getenv ("SUPERLU_BIND_MPI_GPU");
+
+    if (ttemp) {
+	int devs, rank;
+	MPI_Comm_rank(Bcomm, &rank); // MPI_COMM_WORLD??
+	cudaGetDeviceCount(&devs);  // Returns the number of compute-capable devices
+	cudaSetDevice(rank % devs); // Set device to be used for GPU executions
+    }
 #endif
-    
 }
 
 

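With the two grid-initialization hunks above, rank-to-GPU binding becomes
opt-in: it happens only when the SUPERLU_BIND_MPI_GPU environment variable is
set (any value), and otherwise device selection is left to the job launcher.
A minimal sketch of the round-robin binding, assuming a CUDA-enabled MPI
build (error checking omitted; illustrative only):

/* bind_gpu_sketch.c - compile with nvcc plus an MPI wrapper */
#include <stdlib.h>
#include <mpi.h>
#include <cuda_runtime.h>

static void bind_rank_to_gpu(MPI_Comm comm)
{
    if (getenv("SUPERLU_BIND_MPI_GPU")) {  /* opt-in switch */
        int devs, rank;
        MPI_Comm_rank(comm, &rank);
        cudaGetDeviceCount(&devs);   /* GPUs visible to this process */
        cudaSetDevice(rank % devs);  /* round-robin over ranks */
    }
}

int main(int argc, char *argv[])
{
    MPI_Init(&argc, &argv);
    bind_rank_to_gpu(MPI_COMM_WORLD);
    MPI_Finalize();
    return 0;
}

Typical use would be something like: SUPERLU_BIND_MPI_GPU=1 mpiexec -n 4 ./pddrive ...
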
From 0017a2e70e7df026e916fe2b51d9e8a49050757d Mon Sep 17 00:00:00 2001
From: Sherry Li 
Date: Mon, 18 Oct 2021 13:32:15 -0700
Subject: [PATCH 138/147] Bug fix: in "dReDistribute_A", several
 uninitialized/unallocated arrays were dereferenced even though their sizes
 are zero. Upgrade to v7.1.1

---
 CMakeLists.txt     |  2 +-
 README.md          |  4 ++--
 SRC/pddistribute.c | 21 +++++++++++----------
 SRC/psdistribute.c |  9 +++++----
 SRC/pzdistribute.c | 21 +++++++++++----------
 SRC/superlu_defs.h |  5 +++--
 6 files changed, 33 insertions(+), 29 deletions(-)
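
The underlying bug: when nnzToSend[p] is zero for every p, the index/nzval
staging arrays are never allocated, yet the old code still formed pointers
into them; forming such a pointer is undefined behavior even if nothing is
ever sent through it. A minimal sketch of the guarded-pointer pattern the fix
introduces (an illustrative reduction of the dReDistribute_A hunks below, not
the actual library internals):

/* guarded_ptrs_sketch.c */
#include <stddef.h>

void set_send_ptrs(int procs, int iam, const int *nnzToSend,
                   int *index, double *nzval,
                   int **ia_send, double **aij_send)
{
    size_t i = 0, j = 0;
    for (int p = 0; p < procs; ++p) {
        if (p == iam) continue;
        /* index/nzval may be NULL when all counts are zero, so only
           take an address for processes that actually receive data */
        if (nnzToSend[p] > 0) ia_send[p] = &index[i];
        i += 2 * (size_t) nnzToSend[p];  /* ia/ja indices alternate */
        if (nnzToSend[p] > 0) aij_send[p] = &nzval[j];
        j += (size_t) nnzToSend[p];
    }
}
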

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 956f75e1..2b2fdf24 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -12,7 +12,7 @@ cmake_minimum_required(VERSION 3.18.1 FATAL_ERROR)
 project(SuperLU_DIST C CXX)
 set(VERSION_MAJOR "7")
 set(VERSION_MINOR "1")
-set(VERSION_BugFix "0")
+set(VERSION_BugFix "1")
 set(PROJECT_VERSION ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_BugFix})
 
 list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
diff --git a/README.md b/README.md
index df987413..f8781adb 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# SuperLU_DIST (version 7.1.0)   superlu
+# SuperLU_DIST (version 7.1.1)   superlu
 
 [![Build Status](https://travis-ci.org/xiaoyeli/superlu_dist.svg?branch=master)](https://travis-ci.org/xiaoyeli/superlu_dist) 
 [Nightly tests](http://my.cdash.org/index.php?project=superlu_dist)
@@ -25,7 +25,7 @@ acceleration capabilities.
 Table of Contents
 =================
 
-* [SuperLU_DIST (version 7.1.0)   superlu](#superlu_dist-version-70---)
+* [SuperLU_DIST (version 7.1.1)   superlu](#superlu_dist-version-70---)
 * [Directory structure of the source code](#directory-structure-of-the-source-code)
 * [Installation](#installation)
    * [Installation option 1: Using CMake build system.](#installation-option-1-using-cmake-build-system)
diff --git a/SRC/pddistribute.c b/SRC/pddistribute.c
index 17f770d7..ee654ad0 100644
--- a/SRC/pddistribute.c
+++ b/SRC/pddistribute.c
@@ -13,9 +13,10 @@ at the top-level directory.
 /*! @file
  * \brief Re-distribute A on the 2D process mesh.
  * 
- * -- Distributed SuperLU routine (version 2.3) --
+ * -- Distributed SuperLU routine (version 7.1.1) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * October 15, 2008
+ * October 18, 2021, minor fix, v7.1.1
  * 
 */
@@ -141,8 +142,8 @@ dReDistribute_A(SuperMatrix *A, dScalePermstruct_t *ScalePermstruct,
 	    ABORT("Malloc fails for ia[].");
 	if ( !(aij = doubleMalloc_dist(k)) )
 	    ABORT("Malloc fails for aij[].");
+	ja = ia + k;
     }
-    ja = ia + k;
 
     /* Allocate temporary storage for sending/receiving the A triplets. */
     if ( procs > 1 ) {
@@ -170,9 +171,9 @@ dReDistribute_A(SuperMatrix *A, dScalePermstruct_t *ScalePermstruct,
     for (i = 0, j = 0, p = 0; p < procs; ++p) {
 	if ( p != iam ) {
-	    ia_send[p] = &index[i];
+	    if (nnzToSend[p] > 0) ia_send[p] = &index[i];
 	    i += 2 * nnzToSend[p]; /* ia/ja indices alternate */
-	    aij_send[p] = &nzval[j];
+	    if (nnzToSend[p] > 0) aij_send[p] = &nzval[j];
 	    j += nnzToSend[p];
 	}
     }
@@ -216,8 +217,8 @@ dReDistribute_A(SuperMatrix *A, dScalePermstruct_t *ScalePermstruct,
        NOTE: Can possibly use MPI_Alltoallv.
        ------------------------------------------------------------*/
     for (p = 0; p < procs; ++p) {
-	if ( p != iam && nnzToSend[p]>0 ) { // cause two of the tests to hang
-	// if ( p != iam ) {
+	if ( p != iam && nnzToSend[p] > 0 ) {
+	//if ( p != iam ) {
 	    it = 2*nnzToSend[p];
 	    MPI_Isend( ia_send[p], it, mpi_int_t, p, iam, grid->comm, &send_req[p] );
@@ -228,8 +229,8 @@ dReDistribute_A(SuperMatrix *A, dScalePermstruct_t *ScalePermstruct,
     }
 
     for (p = 0; p < procs; ++p) {
-	if ( p != iam && nnzToRecv[p]>0 ) {
-	//if ( p != iam ) {
+	if ( p != iam && nnzToRecv[p] > 0 ) {
+	//if ( p != iam ) {
 	    it = 2*nnzToRecv[p];
 	    MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status );
 	    it = nnzToRecv[p];
@@ -248,8 +249,8 @@ dReDistribute_A(SuperMatrix *A, dScalePermstruct_t *ScalePermstruct,
     }
 
     for (p = 0; p < procs; ++p) {
-	if ( p != iam && nnzToSend[p] > 0 ) {
-	//if ( p != iam ) {
+	if ( p != iam && nnzToSend[p] > 0 ) { // cause two of the tests to hang
+	//if ( p != iam ) {
 	    MPI_Wait( &send_req[p], &status);
 	    MPI_Wait( &send_req[procs+p], &status);
 	}
diff --git a/SRC/psdistribute.c b/SRC/psdistribute.c
index 4d4ed76e..30c114cb 100644
--- a/SRC/psdistribute.c
+++ b/SRC/psdistribute.c
@@ -13,9 +13,10 @@ at the top-level directory.
 /*! @file
  * \brief Re-distribute A on the 2D process mesh.
  * 
- * -- Distributed SuperLU routine (version 2.3) --
+ * -- Distributed SuperLU routine (version 7.1.1) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * October 15, 2008
+ * October 18, 2021, minor fix, v7.1.1
  * 
 */
@@ -141,8 +142,8 @@ sReDistribute_A(SuperMatrix *A, sScalePermstruct_t *ScalePermstruct,
 	    ABORT("Malloc fails for ia[].");
 	if ( !(aij = floatMalloc_dist(k)) )
 	    ABORT("Malloc fails for aij[].");
+	ja = ia + k;
     }
-    ja = ia + k;
 
     /* Allocate temporary storage for sending/receiving the A triplets. */
     if ( procs > 1 ) {
@@ -170,9 +171,9 @@ sReDistribute_A(SuperMatrix *A, sScalePermstruct_t *ScalePermstruct,
     for (i = 0, j = 0, p = 0; p < procs; ++p) {
 	if ( p != iam ) {
-	    ia_send[p] = &index[i];
+	    if (nnzToSend[p] > 0) ia_send[p] = &index[i];
 	    i += 2 * nnzToSend[p]; /* ia/ja indices alternate */
-	    aij_send[p] = &nzval[j];
+	    if (nnzToSend[p] > 0) aij_send[p] = &nzval[j];
 	    j += nnzToSend[p];
 	}
     }
diff --git a/SRC/pzdistribute.c b/SRC/pzdistribute.c
index fbbdff99..eed06ddc 100644
--- a/SRC/pzdistribute.c
+++ b/SRC/pzdistribute.c
@@ -12,9 +12,10 @@ at the top-level directory.
 /*! @file
  * \brief Re-distribute A on the 2D process mesh.
  * 
- * -- Distributed SuperLU routine (version 2.3) --
+ * -- Distributed SuperLU routine (version 7.1.1) --
  * Lawrence Berkeley National Lab, Univ. of California Berkeley.
  * October 15, 2008
+ * October 18, 2021, minor fix, v7.1.1
  * 
 */
@@ -140,8 +141,8 @@ zReDistribute_A(SuperMatrix *A, zScalePermstruct_t *ScalePermstruct,
 	    ABORT("Malloc fails for ia[].");
 	if ( !(aij = doublecomplexMalloc_dist(k)) )
 	    ABORT("Malloc fails for aij[].");
+	ja = ia + k;
     }
-    ja = ia + k;
 
     /* Allocate temporary storage for sending/receiving the A triplets. */
     if ( procs > 1 ) {
@@ -169,9 +170,9 @@ zReDistribute_A(SuperMatrix *A, zScalePermstruct_t *ScalePermstruct,
     for (i = 0, j = 0, p = 0; p < procs; ++p) {
 	if ( p != iam ) {
-	    ia_send[p] = &index[i];
+	    if (nnzToSend[p] > 0) ia_send[p] = &index[i];
 	    i += 2 * nnzToSend[p]; /* ia/ja indices alternate */
-	    aij_send[p] = &nzval[j];
+	    if (nnzToSend[p] > 0) aij_send[p] = &nzval[j];
 	    j += nnzToSend[p];
 	}
     }
@@ -215,8 +216,8 @@ zReDistribute_A(SuperMatrix *A, zScalePermstruct_t *ScalePermstruct,
        NOTE: Can possibly use MPI_Alltoallv.
        ------------------------------------------------------------*/
     for (p = 0; p < procs; ++p) {
-	if ( p != iam && nnzToSend[p] > 0 ) {
-	//if ( p != iam ) {
+        if ( p != iam && nnzToSend[p] > 0 ) {
+        //if ( p != iam ) {
 	    it = 2*nnzToSend[p];
 	    MPI_Isend( ia_send[p], it, mpi_int_t, p, iam, grid->comm, &send_req[p] );
@@ -227,8 +228,8 @@ zReDistribute_A(SuperMatrix *A, zScalePermstruct_t *ScalePermstruct,
     }
 
     for (p = 0; p < procs; ++p) {
-	if ( p != iam && nnzToRecv[p] > 0 ) {
-	//if ( p != iam ) {
+        if ( p != iam && nnzToRecv[p] > 0 ) {
+        //if ( p != iam ) {
 	    it = 2*nnzToRecv[p];
 	    MPI_Recv( itemp, it, mpi_int_t, p, p, grid->comm, &status );
 	    it = nnzToRecv[p];
@@ -247,8 +248,8 @@ zReDistribute_A(SuperMatrix *A, zScalePermstruct_t *ScalePermstruct,
     }
 
     for (p = 0; p < procs; ++p) {
-	if ( p != iam && nnzToSend[p] > 0 ) {
-	//if ( p != iam ) {
+        if ( p != iam && nnzToSend[p] > 0 ) { // cause two of the tests to hang
+        //if ( p != iam ) {
 	    MPI_Wait( &send_req[p], &status);
 	    MPI_Wait( &send_req[procs+p], &status);
 	}
diff --git a/SRC/superlu_defs.h b/SRC/superlu_defs.h
index 09692a84..89f0c452 100644
--- a/SRC/superlu_defs.h
+++ b/SRC/superlu_defs.h
@@ -25,6 +25,7 @@ at the top-level directory.
  * October 23, 2020  version 6.4.0
  * May 12, 2021  version 7.0.0
  * October 5, 2021  version 7.1.0
+ * October 18, 2021  version 7.1.1
  * 
 */
@@ -76,8 +77,8 @@ at the top-level directory.
  */
 
 #define SUPERLU_DIST_MAJOR_VERSION 7
 #define SUPERLU_DIST_MINOR_VERSION 1
-#define SUPERLU_DIST_PATCH_VERSION 0
-#define SUPERLU_DIST_RELEASE_DATE "October 5, 2021"
+#define SUPERLU_DIST_PATCH_VERSION 1
+#define SUPERLU_DIST_RELEASE_DATE "October 18, 2021"
 
 #include "superlu_dist_config.h"
 /* Define my integer size int_t */

From 302139e6d59e79d0639741d1f7407da885fb6e4b Mon Sep 17 00:00:00 2001
From: liuyangzhuan 
Date: Thu, 21 Oct 2021 12:24:36 -0700
Subject: [PATCH 139/147] added pddrive and pddrive_spawn to INSTALL_BIN_DIR

---
 EXAMPLE/CMakeLists.txt | 3 +++
 1 file changed, 3 insertions(+)
 mode change 100644 => 100755 EXAMPLE/CMakeLists.txt

diff --git a/EXAMPLE/CMakeLists.txt b/EXAMPLE/CMakeLists.txt
old mode 100644
new mode 100755
index f22ef414..b4c6b371
--- a/EXAMPLE/CMakeLists.txt
+++ b/EXAMPLE/CMakeLists.txt
@@ -37,6 +37,7 @@ if(enable_double)
   set(DEXM pddrive.c dcreate_matrix.c)
   add_executable(pddrive ${DEXM})
   target_link_libraries(pddrive ${all_link_libs})
+  install(TARGETS pddrive RUNTIME DESTINATION "${INSTALL_BIN_DIR}")
 
   set(DEXM1 pddrive1.c dcreate_matrix.c)
   add_executable(pddrive1 ${DEXM1})
@@ -96,6 +97,8 @@ if(enable_double)
   set(DEXMS pddrive_spawn.c dcreate_matrix.c)
   add_executable(pddrive_spawn ${DEXMS})
   target_link_libraries(pddrive_spawn ${all_link_libs})
+  install(TARGETS pddrive_spawn RUNTIME DESTINATION "${INSTALL_BIN_DIR}")
+
 endif() #### end enable_double

From 337379f76c25f9d173d9ec2d7dd35485d7f58a0a Mon Sep 17 00:00:00 2001
From: Sherry Li 
Date: Tue, 9 Nov 2021 09:13:16 -0800
Subject: [PATCH 140/147] In the 3D factor routines, add code to flag zero
 pivot in INFO. In EXAMPLE/pxdrive*, check INFO flag returned from the
 drivers pxgssvx*().

---
 EXAMPLE/Makefile         |  2 +-
 EXAMPLE/pddrive.c        | 30 ++++++++++-----------
 EXAMPLE/pddrive1.c       | 56 +++++++++++++++++++++++++---------------
 EXAMPLE/pddrive2.c       | 32 ++++++++++++++---------
 EXAMPLE/pddrive3.c       | 33 ++++++++++++++---------
 EXAMPLE/pddrive3d.c      | 25 +++++++++---------
 EXAMPLE/pddrive3d1.c     | 48 ++++++++++++++++++++++------------
 EXAMPLE/pddrive3d2.c     | 42 ++++++++++++++++++++----------
 EXAMPLE/pddrive3d3.c     | 42 ++++++++++++++++++++----------
 EXAMPLE/pddrive4.c       | 35 +++++++++++++++----------
 EXAMPLE/psdrive.c        | 20 ++++++++------
 EXAMPLE/psdrive1.c       | 46 +++++++++++++++++++++++----------
 EXAMPLE/psdrive2.c       | 25 +++++++++++++-----
 EXAMPLE/psdrive3.c       | 26 ++++++++++++++-----
 EXAMPLE/psdrive3d.c      | 33 ++++++++++++++---------
 EXAMPLE/psdrive3d1.c     | 48 ++++++++++++++++++++++------------
 EXAMPLE/psdrive3d2.c     | 42 ++++++++++++++++++++----------
 EXAMPLE/psdrive3d3.c     | 42 ++++++++++++++++++++----------
 EXAMPLE/psdrive4.c       | 28 +++++++++++++++-----
 EXAMPLE/pzdrive.c        | 27 ++++++++++---------
 EXAMPLE/pzdrive1.c       | 53 +++++++++++++++++++++++--------------
 EXAMPLE/pzdrive2.c       | 32 ++++++++++++++---------
 EXAMPLE/pzdrive3.c       | 33 ++++++++++++++---------
 EXAMPLE/pzdrive3d.c      | 25 +++++++++---------
 EXAMPLE/pzdrive3d1.c     | 49 ++++++++++++++++++++++-------------
 EXAMPLE/pzdrive3d2.c     | 42 ++++++++++++++++++++----------
 EXAMPLE/pzdrive3d3.c     | 42 ++++++++++++++++++++----------
 EXAMPLE/pzdrive4.c       | 35 +++++++++++++++----------
 SRC/dtreeFactorization.c |  3 +++
 SRC/pdgssvx3d.c          |  9 ++++---
 SRC/pdgstrf2.c           | 26 ++++++++++++++++---
 SRC/pdgstrf3d.c          | 12 ++++++++-
 SRC/psgssvx3d.c          |  9 ++++---
 SRC/psgstrf2.c           | 26 ++++++++++++++++---
 SRC/psgstrf3d.c          | 12 ++++++++-
 SRC/pzgssvx3d.c          |  9 ++++---
 SRC/pzgstrf2.c           | 21 +++++++++++++--
 SRC/pzgstrf3d.c          | 12 ++++++++-
 SRC/streeFactorization.c |  3 +++
 SRC/ztreeFactorization.c |  3 +++
 40 files changed, 754 insertions(+), 384 deletions(-)

diff --git a/EXAMPLE/Makefile b/EXAMPLE/Makefile
index d3ea86e8..6c53a890 100644
--- a/EXAMPLE/Makefile
+++ b/EXAMPLE/Makefile
@@ -30,7 +30,7 @@
 #######################################################################
 include ../make.inc
 
-DEXM	= pddrive.o dcreate_matrix.o #pdgssvx.o #pdgstrf2.o
+DEXM	= pddrive.o dcreate_matrix.o #pdgstrf2.o
 DEXM1	= pddrive1.o dcreate_matrix.o
 DEXM2	= pddrive2.o dcreate_matrix.o dcreate_matrix_perturbed.o
 DEXM3	= pddrive3.o dcreate_matrix.o
diff --git a/EXAMPLE/pddrive.c b/EXAMPLE/pddrive.c
index 0a7812f8..2912dd7c 100644
--- a/EXAMPLE/pddrive.c
+++ b/EXAMPLE/pddrive.c
@@ -69,21 +69,13 @@ int main(int argc, char *argv[])
     nprow = 1;  /* Default process rows.      */
     npcol = 1;  /* Default process columns.   */
     nrhs = 1;   /* Number of right-hand side. */
-
+
     /* ------------------------------------------------------------
        INITIALIZE MPI ENVIRONMENT. 
        ------------------------------------------------------------*/
     //MPI_Init( &argc, &argv );
     MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level);
-
-#ifdef GPU_ACC
-    int rank, devs;
-    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-    cudaGetDeviceCount(&devs);
-    cudaSetDevice(rank % devs);
-#endif
-
-
+
 #if ( VAMPIR>=1 )
     VT_traceoff();
@@ -147,8 +139,7 @@ int main(int argc, char *argv[])
 
     /* Bail out if I do not belong in the grid. */
     iam = grid.iam;
-    if ( iam >= nprow * npcol || iam ==-1 ) goto out;
-
+    if ( (iam >= nprow * npcol) || (iam == -1) ) goto out;
     if ( !iam ) {
 	int v_major, v_minor, v_bugfix;
 #ifdef __INTEL_COMPILER
@@ -234,10 +225,16 @@ int main(int argc, char *argv[])
     pdgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
 	    &LUstruct, &SOLVEstruct, berr, &stat, &info);
-
-    /* Check the accuracy of the solution. */
-    pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
-                     nrhs, b, ldb, xtrue, ldx, grid.comm);
+    if ( info ) {  /* Something is wrong */
+        if ( iam==0 ) {
+            printf("ERROR: INFO = %d returned from pdgssvx()\n", info);
+            fflush(stdout);
+        }
+    } else {
+        /* Check the accuracy of the solution. */
+        pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
+                         nrhs, b, ldb, xtrue, ldx, grid.comm);
+    }
 
     PStatPrint(&options, &stat, &grid);        /* Print the statistics. */
 
@@ -250,7 +247,6 @@ int main(int argc, char *argv[])
     dScalePermstructFree(&ScalePermstruct);
     dDestroy_LU(n, &grid, &LUstruct);
     dLUstructFree(&LUstruct);
-    //if ( options.SolveInitialized ) {
     dSolveFinalize(&options, &SOLVEstruct);
     SUPERLU_FREE(b);
     SUPERLU_FREE(xtrue);
diff --git a/EXAMPLE/pddrive1.c b/EXAMPLE/pddrive1.c
index 72c88a6e..f058bd40 100644
--- a/EXAMPLE/pddrive1.c
+++ b/EXAMPLE/pddrive1.c
@@ -73,12 +73,7 @@ int main(int argc, char *argv[])
        INITIALIZE MPI ENVIRONMENT. 
        ------------------------------------------------------------*/
     MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level);
-#ifdef GPU_ACC
-    int rank, devs;
-    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-    cudaGetDeviceCount(&devs);
-    cudaSetDevice(rank % devs);
-#endif
+
     /* Parse command line argv[]. */
     for (cpp = argv+1; *cpp; ++cpp) {
 	if ( **cpp == '-' ) {
@@ -111,8 +106,7 @@ int main(int argc, char *argv[])
 
     /* Bail out if I do not belong in the grid. */
     iam = grid.iam;
-    if ( iam == -1 )	goto out;
-
+    if ( iam == -1 )	goto out;
     if ( !iam ) {
 	int v_major, v_minor, v_bugfix;
 #ifdef __INTEL_COMPILER
@@ -201,11 +195,17 @@ int main(int argc, char *argv[])
     pdgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
 	    &LUstruct, &SOLVEstruct, berr, &stat, &info);
-
-    /* Check the accuracy of the solution. */
-    if ( !iam ) printf("\tSolve the first system:\n");
-    pdinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, grid.comm);
-
+    if ( info ) {  /* Something is wrong */
+        if ( iam==0 ) {
+            printf("ERROR: INFO = %d returned from pdgssvx()\n", info);
+            fflush(stdout);
+        }
+    } else {
+        /* Check the accuracy of the solution. */
+        if ( !iam ) printf("\tSolve the first system:\n");
+        pdinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, grid.comm);
+    }
+
     PStatPrint(&options, &stat, &grid);        /* Print the statistics. */
     PStatFree(&stat);
@@ -221,10 +221,17 @@ int main(int argc, char *argv[])
     pdgssvx(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid,
 	    &LUstruct, &SOLVEstruct, berr, &stat, &info);
 
-    /* Check the accuracy of the solution. */
-    if ( !iam ) printf("\tSolve the system with a different B:\n");
-    pdinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue, ldx, grid.comm);
-
+    if ( info ) {  /* Something is wrong */
+        if ( iam==0 ) {
+            printf("ERROR: INFO = %d returned from pdgssvx()\n", info);
+            fflush(stdout);
+        }
+    } else {
+        /* Check the accuracy of the solution. */
+        if ( !iam ) printf("\tSolve the system with a different B:\n");
+        pdinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue, ldx, grid.comm);
+    }
+
     PStatPrint(&options, &stat, &grid);        /* Print the statistics. */
     PStatFree(&stat);
@@ -251,10 +258,17 @@ int main(int argc, char *argv[])
     pdgssvx(&options, &A, &ScalePermstruct, b2, ldb, nrhs, &grid,
 	    &LUstruct, &SOLVEstruct, berr, &stat, &info);
 
-    /* Check the accuracy of the solution. */
-    if ( !iam ) printf("\tSolve the system with 3 RHS's:\n");
-    pdinf_norm_error(iam, m_loc, nrhs, b2, ldb, xtrue, ldx, grid.comm);
-
+    if ( info ) {  /* Something is wrong */
+        if ( iam==0 ) {
+            printf("ERROR: INFO = %d returned from pdgssvx()\n", info);
+            fflush(stdout);
+        }
+    } else {
+        /* Check the accuracy of the solution. */
+        if ( !iam ) printf("\tSolve the system with 3 RHS's:\n");
+        pdinf_norm_error(iam, m_loc, nrhs, b2, ldb, xtrue, ldx, grid.comm);
+    }
+
     PStatPrint(&options, &stat, &grid);        /* Print the statistics. */
     PStatFree(&stat);
diff --git a/EXAMPLE/pddrive2.c b/EXAMPLE/pddrive2.c
index 0797cc08..6bdc7007 100644
--- a/EXAMPLE/pddrive2.c
+++ b/EXAMPLE/pddrive2.c
@@ -83,12 +83,7 @@ int main(int argc, char *argv[])
        INITIALIZE MPI ENVIRONMENT. 
        ------------------------------------------------------------*/
     MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level);
-#ifdef GPU_ACC
-    int rank, devs;
-    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-    cudaGetDeviceCount(&devs);
-    cudaSetDevice(rank % devs);
-#endif
+
     /* Parse command line argv[]. */
     for (cpp = argv+1; *cpp; ++cpp) {
 	if ( **cpp == '-' ) {
@@ -196,8 +191,15 @@ int main(int argc, char *argv[])
     pdgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
 	    &LUstruct, &SOLVEstruct, berr, &stat, &info);
 
-    /* Check the accuracy of the solution. */
-    pdinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, grid.comm);
+    if ( info ) {  /* Something is wrong */
+        if ( iam==0 ) {
+            printf("ERROR: INFO = %d returned from pdgssvx()\n", info);
+            fflush(stdout);
+        }
+    } else {
+        /* Check the accuracy of the solution. */
+        pdinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, grid.comm);
+    }
 
     PStatPrint(&options, &stat, &grid);        /* Print the statistics. */
     PStatFree(&stat);
@@ -233,10 +235,16 @@ int main(int argc, char *argv[])
     pdgssvx(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid,
 	    &LUstruct, &SOLVEstruct, berr, &stat, &info);
 
-    /* Check the accuracy of the solution. */
-    if ( !iam ) printf("Solve the system with the same sparsity pattern.\n");
-    pdinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue1, ldx, grid.comm);
-
+    if ( info ) {  /* Something is wrong */
+        if ( iam==0 ) {
+            printf("ERROR: INFO = %d returned from pdgssvx()\n", info);
+            fflush(stdout);
+        }
+    } else {
+        /* Check the accuracy of the solution. */
+        if ( !iam ) printf("Solve the system with the same sparsity pattern.\n");
+        pdinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue1, ldx, grid.comm);
+    }
 #if ( PRNTlevel>=2 )
     if (iam==0) {
 	PrintInt10("new perm_r", m, ScalePermstruct.perm_r);
diff --git a/EXAMPLE/pddrive3.c b/EXAMPLE/pddrive3.c
index 419617ba..d3d6683c 100644
--- a/EXAMPLE/pddrive3.c
+++ b/EXAMPLE/pddrive3.c
@@ -80,12 +80,7 @@ int main(int argc, char *argv[])
        INITIALIZE MPI ENVIRONMENT. 
        ------------------------------------------------------------*/
     MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level);
-#ifdef GPU_ACC
-    int rank, devs;
-    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-    cudaGetDeviceCount(&devs);
-    cudaSetDevice(rank % devs);
-#endif
+
     /* Parse command line argv[]. */
     for (cpp = argv+1; *cpp; ++cpp) {
 	if ( **cpp == '-' ) {
@@ -211,8 +206,15 @@ int main(int argc, char *argv[])
     pdgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
 	    &LUstruct, &SOLVEstruct, berr, &stat, &info);
 
-    /* Check the accuracy of the solution. */
-    pdinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, grid.comm);
+    if ( info ) {  /* Something is wrong */
+        if ( iam==0 ) {
+            printf("ERROR: INFO = %d returned from pdgssvx()\n", info);
+            fflush(stdout);
+        }
+    } else {
+        /* Check the accuracy of the solution. */
+        pdinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, grid.comm);
+    }
 
     PStatPrint(&options, &stat, &grid);        /* Print the statistics. */
     PStatFree(&stat);
@@ -248,10 +250,17 @@ int main(int argc, char *argv[])
     pdgssvx(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid,
 	    &LUstruct, &SOLVEstruct, berr, &stat, &info);
 
-    /* Check the accuracy of the solution. */
-    if ( !iam )
-        printf("Solve a system with the same pattern and similar values.\n");
-    pdinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue, ldx, grid.comm);
+    if ( info ) {  /* Something is wrong */
+        if ( iam==0 ) {
+            printf("ERROR: INFO = %d returned from pdgssvx()\n", info);
+            fflush(stdout);
+        }
+    } else {
+        /* Check the accuracy of the solution. */
+        if ( !iam )
+            printf("Solve a system with the same pattern and similar values.\n");
+        pdinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue, ldx, grid.comm);
+    }
 
     /* Print the statistics. */
     PStatPrint(&options, &stat, &grid);
diff --git a/EXAMPLE/pddrive3d.c b/EXAMPLE/pddrive3d.c
index eec9b750..02f4837c 100644
--- a/EXAMPLE/pddrive3d.c
+++ b/EXAMPLE/pddrive3d.c
@@ -141,12 +141,7 @@ main (int argc, char *argv[])
             printf("\tprovided omp_mpi_level: %d\n", provided);
         }
     }
-#ifdef GPU_ACC
-    int rank, devs;
-    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-    cudaGetDeviceCount(&devs);
-    cudaSetDevice(rank % devs);
-#endif
+
     /* Parse command line argv[]. */
     for (cpp = argv + 1; *cpp; ++cpp)
     {
@@ -302,7 +297,7 @@ main (int argc, char *argv[])
     options.ParSymbFact = NO;
     options.ColPerm = METIS_AT_PLUS_A;
     options.RowPerm = LargeDiag_MC64;
-    options.ReplaceTinyPivot = YES;
+    options.ReplaceTinyPivot = NO;
     options.IterRefine = DOUBLE;
     options.Trans = NOTRANS;
     options.SolveInitialized = NO;
@@ -319,7 +314,7 @@ main (int argc, char *argv[])
     options.IterRefine = NOREFINE;
     options.ColPerm = NATURAL;
     options.Equil = NO;
-    options.ReplaceTinyPivot = NO;
+    options.ReplaceTinyPivot = YES;
 #endif
 
     if (!iam) {
@@ -353,10 +348,16 @@ main (int argc, char *argv[])
     pdgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
                &LUstruct, &SOLVEstruct, berr, &stat, &info);
 
-    /* Check the accuracy of the solution. */
-    pdinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc,
-                      nrhs, b, ldb, xtrue, ldx, grid.comm);
-    fflush(stdout);
+    if ( info ) {  /* Something is wrong */
+        if ( iam==0 ) {
+            printf("ERROR: INFO = %d returned from pdgssvx3d()\n", info);
+            fflush(stdout);
+        }
+    } else {
+        /* Check the accuracy of the solution. */
+        pdinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc,
+                          nrhs, b, ldb, xtrue, ldx, grid.comm);
+    }
 
     /* ------------------------------------------------------------
        DEALLOCATE STORAGE.
diff --git a/EXAMPLE/pddrive3d1.c b/EXAMPLE/pddrive3d1.c
index 8a1baeeb..b1a553a4 100644
--- a/EXAMPLE/pddrive3d1.c
+++ b/EXAMPLE/pddrive3d1.c
@@ -302,7 +302,7 @@ main (int argc, char *argv[])
     options.ParSymbFact = NO;
     options.ColPerm = METIS_AT_PLUS_A;
     options.RowPerm = LargeDiag_MC64;
-    options.ReplaceTinyPivot = YES;
+    options.ReplaceTinyPivot = NO;
     options.IterRefine = DOUBLE;
     options.Trans = NOTRANS;
     options.SolveInitialized = NO;
@@ -319,7 +319,7 @@ main (int argc, char *argv[])
     options.IterRefine = NOREFINE;
     options.ColPerm = NATURAL;
     options.Equil = NO;
-    options.ReplaceTinyPivot = NO;
+    options.ReplaceTinyPivot = YES;
 #endif
 
     if (!iam) {
@@ -343,22 +343,29 @@ main (int argc, char *argv[])
     pdgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
                &LUstruct, &SOLVEstruct, berr, &stat, &info);
 
-    /* Check the accuracy of the solution. */
-    if ( !iam ) printf("\tSolve the first system:\n");
-    pdinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc,
-                      nrhs, b, ldb, xtrue, ldx, grid.comm);
-
+    if ( info ) {  /* Something is wrong */
+        if ( iam==0 ) {
+            printf("ERROR: INFO = %d returned from pdgssvx3d()\n", info);
+            fflush(stdout);
+        }
+    } else {
+        /* Check the accuracy of the solution. */
+        if ( !iam ) printf("\tSolve the first system:\n");
+        pdinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc,
+                          nrhs, b, ldb, xtrue, ldx, grid.comm);
+    }
+
     if ( grid.zscp.Iam == 0 ) { // process layer 0
 	PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/
     }
     PStatFree (&stat);
     fflush(stdout);
 
-    /* ------------------------------------------------------------
-       2. NOW SOLVE ANOTHER SYSTEM WITH THE SAME A BUT DIFFERENT
-       RIGHT-HAND SIDE,  WE WILL USE THE EXISTING L AND U FACTORS IN
-       LUSTRUCT OBTAINED FROM A PREVIOUS FATORIZATION.
-       ------------------------------------------------------------*/
+    /* ------------------------------------------------------------
+       2. NOW SOLVE ANOTHER SYSTEM WITH THE SAME A BUT DIFFERENT
+       RIGHT-HAND SIDE,  WE WILL USE THE EXISTING L AND U FACTORS IN
+       LUSTRUCT OBTAINED FROM A PREVIOUS FATORIZATION.
+       ------------------------------------------------------------*/
     options.Fact = FACTORED; /* Indicate the factored form of A is supplied. */
     PStatInit(&stat); /* Initialize the statistics variables. */
@@ -366,11 +373,18 @@ main (int argc, char *argv[])
     pdgssvx3d (&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid,
                &LUstruct, &SOLVEstruct, berr, &stat, &info);
 
-    /* Check the accuracy of the solution. */
-    if ( !iam ) printf("\tSolve the system with a different B:\n");
-    pdinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc,
-                      nrhs, b1, ldb, xtrue, ldx, grid.comm);
-
+    if ( info ) {  /* Something is wrong */
+        if ( iam==0 ) {
+            printf("ERROR: INFO = %d returned from pdgssvx3d()\n", info);
+            fflush(stdout);
+        }
+    } else {
+        /* Check the accuracy of the solution. */
+        if ( !iam ) printf("\tSolve the system with a different B:\n");
+        pdinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc,
+                          nrhs, b1, ldb, xtrue, ldx, grid.comm);
+    }
+
     /* ------------------------------------------------------------
        DEALLOCATE STORAGE.
        ------------------------------------------------------------ */
diff --git a/EXAMPLE/pddrive3d2.c b/EXAMPLE/pddrive3d2.c
index b39f4189..5fed6157 100644
--- a/EXAMPLE/pddrive3d2.c
+++ b/EXAMPLE/pddrive3d2.c
@@ -266,7 +266,7 @@ main (int argc, char *argv[])
     options.ParSymbFact = NO;
     options.ColPerm = METIS_AT_PLUS_A;
     options.RowPerm = LargeDiag_MC64;
-    options.ReplaceTinyPivot = YES;
+    options.ReplaceTinyPivot = NO;
     options.IterRefine = DOUBLE;
     options.Trans = NOTRANS;
     options.SolveInitialized = NO;
@@ -283,7 +283,7 @@ main (int argc, char *argv[])
     options.IterRefine = NOREFINE;
     options.ColPerm = NATURAL;
     options.Equil = NO;
-    options.ReplaceTinyPivot = NO;
+    options.ReplaceTinyPivot = YES;
 #endif
 
     if (!iam) {
@@ -307,11 +307,18 @@ main (int argc, char *argv[])
     pdgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
                &LUstruct, &SOLVEstruct, berr, &stat, &info);
 
-    /* Check the accuracy of the solution. */
-    if ( !iam ) printf("\tSolve the first system:\n");
-    pdinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc,
-                      nrhs, b, ldb, xtrue, ldx, grid.comm);
-
+    if ( info ) {  /* Something is wrong */
+        if ( iam==0 ) {
+            printf("ERROR: INFO = %d returned from pdgssvx3d()\n", info);
+            fflush(stdout);
+        }
+    } else {
+        /* Check the accuracy of the solution. */
+        if ( !iam ) printf("\tSolve the first system:\n");
+        pdinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc,
+                          nrhs, b, ldb, xtrue, ldx, grid.comm);
+    }
+
     /* Deallocate some storage, keep around 2D matrix meta structure */
     Destroy_CompRowLoc_Matrix_dist (&A);
     if ( grid.zscp.Iam == 0 ) { // process layer 0
@@ -336,19 +343,26 @@ main (int argc, char *argv[])
        a different perm_r[]. Set up the right-hand side. */
     if ( !(fp = fopen(*cpp, "r")) ) ABORT("File does not exist");
     dcreate_matrix_postfix3d(&A, nrhs, &b1, &ldb,
-                            &xtrue1, &ldx, fp, suffix, &(grid));
+                             &xtrue1, &ldx, fp, suffix, &(grid));
 
     PStatInit(&stat); /* Initialize the statistics variables. */
 
     nrhs = 1;
     pdgssvx3d (&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid,
                &LUstruct, &SOLVEstruct, berr, &stat, &info);
-
-    /* Check the accuracy of the solution. */
-    if ( !iam ) printf("Solve the system with the same sparsity pattern.\n");
-    pdinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc,
-                      nrhs, b1, ldb, xtrue1, ldx, grid.comm);
-
+
+    if ( info ) {  /* Something is wrong */
+        if ( iam==0 ) {
+            printf("ERROR: INFO = %d returned from pdgssvx3d()\n", info);
+            fflush(stdout);
+        }
+    } else {
+        /* Check the accuracy of the solution. */
+        if ( !iam ) printf("Solve the system with the same sparsity pattern.\n");
+        pdinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc,
+                          nrhs, b1, ldb, xtrue1, ldx, grid.comm);
+    }
+
     /* ------------------------------------------------------------
        DEALLOCATE STORAGE.
        ------------------------------------------------------------ */
diff --git a/EXAMPLE/pddrive3d3.c b/EXAMPLE/pddrive3d3.c
index 69c7894a..3a7ccb59 100644
--- a/EXAMPLE/pddrive3d3.c
+++ b/EXAMPLE/pddrive3d3.c
@@ -35,7 +35,7 @@ at the top-level directory.
  * values of matrix A.
  * In this case, the row and column permutation vectors and symbolic
  * factorization are computed only once. The following data structures
- * will be reused in the subsequent call to PDGSSVX:
+ * will be reused in the subsequent call to PDGSSVX3D:
  *        ScalePermstruct : DiagScale, R, C, perm_r, perm_c
  *        LUstruct        : etree, Glu_persist, Llu
  *        SOLVEstruct     : communication metadata for SpTRSV, SpMV, and
@@ -44,7 +44,7 @@ at the top-level directory.
  * NOTE:
  * The distributed nonzero structures of L and U remain the same,
  * although the numerical values are different. So 'Llu' is set up once
- * in the first call to PDGSSVX, and reused in the subsequent call.
+ * in the first call to PDGSSVX3D, and reused in the subsequent call.
  *
  * The program may be run by typing:
  *    mpiexec -np <np> pddrive3d3 -r <proc rows> -c <proc columns> \
@@ -263,7 +263,7 @@ main (int argc, char *argv[])
     options.ParSymbFact = NO;
     options.ColPerm = METIS_AT_PLUS_A;
     options.RowPerm = LargeDiag_MC64;
-    options.ReplaceTinyPivot = YES;
+    options.ReplaceTinyPivot = NO;
     options.IterRefine = DOUBLE;
     options.Trans = NOTRANS;
     options.SolveInitialized = NO;
@@ -280,7 +280,7 @@ main (int argc, char *argv[])
     options.IterRefine = NOREFINE;
     options.ColPerm = NATURAL;
     options.Equil = NO;
-    options.ReplaceTinyPivot = NO;
+    options.ReplaceTinyPivot = YES;
 #endif
 
     if (!iam) {
@@ -304,11 +304,18 @@ main (int argc, char *argv[])
     pdgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
                &LUstruct, &SOLVEstruct, berr, &stat, &info);
 
-    /* Check the accuracy of the solution. */
-    if ( !iam ) printf("\tSolve the first system:\n");
-    pdinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc,
-                      nrhs, b, ldb, xtrue, ldx, grid.comm);
-
+    if ( info ) {  /* Something is wrong */
+        if ( iam==0 ) {
+            printf("ERROR: INFO = %d returned from pdgssvx3d()\n", info);
+            fflush(stdout);
+        }
+    } else {
+        /* Check the accuracy of the solution. */
+        if ( !iam ) printf("\tSolve the first system:\n");
+        pdinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc,
+                          nrhs, b, ldb, xtrue, ldx, grid.comm);
+    }
+
     /* Deallocate some storage, including replicated LU structure along
        the Z dimension.  keep around 2D matrix meta structure, including
        the LU data structure on the host side. */
@@ -350,11 +357,18 @@ main (int argc, char *argv[])
     pdgssvx3d (&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid,
                &LUstruct, &SOLVEstruct, berr, &stat, &info);
 
-    /* Check the accuracy of the solution. */
-    if ( !iam ) printf("Solve a system with the same pattern and similar values.\n");
-    pdinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc,
-                      nrhs, b1, ldb, xtrue1, ldx, grid.comm);
-
+    if ( info ) {  /* Something is wrong */
+        if ( iam==0 ) {
+            printf("ERROR: INFO = %d returned from pdgssvx3d()\n", info);
+            fflush(stdout);
+        }
+    } else {
+        /* Check the accuracy of the solution. */
+        if ( !iam ) printf("Solve a system with the same pattern and similar values.\n");
+        pdinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc,
+                          nrhs, b1, ldb, xtrue1, ldx, grid.comm);
+    }
+
     /* ------------------------------------------------------------
        DEALLOCATE ALL STORAGE.
       ------------------------------------------------------------ */
diff --git a/EXAMPLE/pddrive4.c b/EXAMPLE/pddrive4.c
index dbe9ee08..6cbac44f 100644
--- a/EXAMPLE/pddrive4.c
+++ b/EXAMPLE/pddrive4.c
@@ -74,12 +74,7 @@ int main(int argc, char *argv[])
        INITIALIZE MPI ENVIRONMENT. 
        ------------------------------------------------------------*/
     MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level);
-#ifdef GPU_ACC
-    int rank, devs;
-    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-    cudaGetDeviceCount(&devs);
-    cudaSetDevice(rank % devs);
-#endif
+
     MPI_Comm_size( MPI_COMM_WORLD, &nprocs );
     if ( nprocs < 10 ) {
 	fprintf(stderr, "Requires at least 10 processes\n");
@@ -196,9 +191,16 @@ int main(int argc, char *argv[])
     pdgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid1,
 	    &LUstruct, &SOLVEstruct, berr, &stat, &info);
 
-    /* Check the accuracy of the solution. */
-    pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
-                     nrhs, b, ldb, xtrue, ldx, grid1.comm);
+    if ( info ) {  /* Something is wrong */
+        if ( iam==0 ) {
+            printf("ERROR: INFO = %d returned from pdgssvx()\n", info);
+            fflush(stdout);
+        }
+    } else {
+        /* Check the accuracy of the solution. */
+        pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
+                         nrhs, b, ldb, xtrue, ldx, grid1.comm);
+    }
 
     /* Print the statistics. */
     PStatPrint(&options, &stat, &grid1);
@@ -261,10 +263,17 @@ int main(int argc, char *argv[])
     pdgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid2,
 	    &LUstruct, &SOLVEstruct, berr, &stat, &info);
 
-    /* Check the accuracy of the solution. */
-    pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
-                     nrhs, b, ldb, xtrue, ldx, grid2.comm);
-
+    if ( info ) {  /* Something is wrong */
+        if ( iam==0 ) {
+            printf("ERROR: INFO = %d returned from pdgssvx()\n", info);
+            fflush(stdout);
+        }
+    } else {
+        /* Check the accuracy of the solution. */
+        pdinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
+                         nrhs, b, ldb, xtrue, ldx, grid2.comm);
+    }
+
     /* Print the statistics. */
     PStatPrint(&options, &stat, &grid2);
diff --git a/EXAMPLE/psdrive.c b/EXAMPLE/psdrive.c
index b34fcefc..72db7f96 100644
--- a/EXAMPLE/psdrive.c
+++ b/EXAMPLE/psdrive.c
@@ -139,7 +139,7 @@ int main(int argc, char *argv[])
 
     /* Bail out if I do not belong in the grid. */
     iam = grid.iam;
-    if ( iam == -1 )	goto out;
+    if ( (iam >= nprow * npcol) || (iam == -1) ) goto out;
     if ( !iam ) {
 	int v_major, v_minor, v_bugfix;
 #ifdef __INTEL_COMPILER
@@ -225,10 +225,16 @@ int main(int argc, char *argv[])
     psgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
 	    &LUstruct, &SOLVEstruct, berr, &stat, &info);
-
-    /* Check the accuracy of the solution. */
-    psinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
-                     nrhs, b, ldb, xtrue, ldx, grid.comm);
+    if ( info ) {  /* Something is wrong */
+        if ( iam==0 ) {
+            printf("ERROR: INFO = %d returned from psgssvx()\n", info);
+            fflush(stdout);
+        }
+    } else {
+        /* Check the accuracy of the solution. */
+        psinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc,
+                         nrhs, b, ldb, xtrue, ldx, grid.comm);
+    }
 
     PStatPrint(&options, &stat, &grid);        /* Print the statistics. */
 
@@ -241,9 +247,7 @@ int main(int argc, char *argv[])
     sScalePermstructFree(&ScalePermstruct);
     sDestroy_LU(n, &grid, &LUstruct);
     sLUstructFree(&LUstruct);
-    if ( options.SolveInitialized ) {
-        sSolveFinalize(&options, &SOLVEstruct);
-    }
+    sSolveFinalize(&options, &SOLVEstruct);
     SUPERLU_FREE(b);
     SUPERLU_FREE(xtrue);
     SUPERLU_FREE(berr);
diff --git a/EXAMPLE/psdrive1.c b/EXAMPLE/psdrive1.c
index 6d77ec69..ac8f1d75 100644
--- a/EXAMPLE/psdrive1.c
+++ b/EXAMPLE/psdrive1.c
@@ -195,11 +195,17 @@ int main(int argc, char *argv[])
     psgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
 	    &LUstruct, &SOLVEstruct, berr, &stat, &info);
-
-    /* Check the accuracy of the solution. */
-    if ( !iam ) printf("\tSolve the first system:\n");
-    psinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, grid.comm);
-
+    if ( info ) {  /* Something is wrong */
+        if ( iam==0 ) {
+            printf("ERROR: INFO = %d returned from psgssvx()\n", info);
+            fflush(stdout);
+        }
+    } else {
+        /* Check the accuracy of the solution. */
+        if ( !iam ) printf("\tSolve the first system:\n");
+        psinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, grid.comm);
+    }
+
     PStatPrint(&options, &stat, &grid);        /* Print the statistics. */
     PStatFree(&stat);
@@ -215,10 +221,17 @@ int main(int argc, char *argv[])
     psgssvx(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid,
 	    &LUstruct, &SOLVEstruct, berr, &stat, &info);
 
-    /* Check the accuracy of the solution. */
-    if ( !iam ) printf("\tSolve the system with a different B:\n");
-    psinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue, ldx, grid.comm);
-
+    if ( info ) {  /* Something is wrong */
+        if ( iam==0 ) {
+            printf("ERROR: INFO = %d returned from psgssvx()\n", info);
+            fflush(stdout);
+        }
+    } else {
+        /* Check the accuracy of the solution. */
+        if ( !iam ) printf("\tSolve the system with a different B:\n");
+        psinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue, ldx, grid.comm);
+    }
+
     PStatPrint(&options, &stat, &grid);        /* Print the statistics. */
     PStatFree(&stat);
@@ -245,10 +258,17 @@ int main(int argc, char *argv[])
     psgssvx(&options, &A, &ScalePermstruct, b2, ldb, nrhs, &grid,
 	    &LUstruct, &SOLVEstruct, berr, &stat, &info);
 
-    /* Check the accuracy of the solution. */
-    if ( !iam ) printf("\tSolve the system with 3 RHS's:\n");
-    psinf_norm_error(iam, m_loc, nrhs, b2, ldb, xtrue, ldx, grid.comm);
-
+    if ( info ) {  /* Something is wrong */
+        if ( iam==0 ) {
+            printf("ERROR: INFO = %d returned from psgssvx()\n", info);
+            fflush(stdout);
+        }
+    } else {
+        /* Check the accuracy of the solution. */
+        if ( !iam ) printf("\tSolve the system with 3 RHS's:\n");
+        psinf_norm_error(iam, m_loc, nrhs, b2, ldb, xtrue, ldx, grid.comm);
+    }
+
     PStatPrint(&options, &stat, &grid);        /* Print the statistics. */
     PStatFree(&stat);
diff --git a/EXAMPLE/psdrive2.c b/EXAMPLE/psdrive2.c
index 3db5e0a9..ffc93170 100644
--- a/EXAMPLE/psdrive2.c
+++ b/EXAMPLE/psdrive2.c
@@ -191,8 +191,15 @@ int main(int argc, char *argv[])
     psgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
 	    &LUstruct, &SOLVEstruct, berr, &stat, &info);
 
-    /* Check the accuracy of the solution. */
-    psinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, grid.comm);
+    if ( info ) {  /* Something is wrong */
+        if ( iam==0 ) {
+            printf("ERROR: INFO = %d returned from psgssvx()\n", info);
+            fflush(stdout);
+        }
+    } else {
+        /* Check the accuracy of the solution. */
+        psinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, grid.comm);
+    }
 
     PStatPrint(&options, &stat, &grid);        /* Print the statistics. */
     PStatFree(&stat);
@@ -228,10 +235,16 @@ int main(int argc, char *argv[])
     psgssvx(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid,
 	    &LUstruct, &SOLVEstruct, berr, &stat, &info);
 
-    /* Check the accuracy of the solution. */
-    if ( !iam ) printf("Solve the system with the same sparsity pattern.\n");
-    psinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue1, ldx, grid.comm);
-
+    if ( info ) {  /* Something is wrong */
+        if ( iam==0 ) {
+            printf("ERROR: INFO = %d returned from psgssvx()\n", info);
+            fflush(stdout);
+        }
+    } else {
+        /* Check the accuracy of the solution. */
+        if ( !iam ) printf("Solve the system with the same sparsity pattern.\n");
+        psinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue1, ldx, grid.comm);
+    }
 #if ( PRNTlevel>=2 )
     if (iam==0) {
 	PrintInt10("new perm_r", m, ScalePermstruct.perm_r);
diff --git a/EXAMPLE/psdrive3.c b/EXAMPLE/psdrive3.c
index 1c23fab5..eaa2e989 100644
--- a/EXAMPLE/psdrive3.c
+++ b/EXAMPLE/psdrive3.c
@@ -206,8 +206,15 @@ int main(int argc, char *argv[])
     psgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
 	    &LUstruct, &SOLVEstruct, berr, &stat, &info);
 
-    /* Check the accuracy of the solution. */
-    psinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, grid.comm);
+    if ( info ) {  /* Something is wrong */
+        if ( iam==0 ) {
+            printf("ERROR: INFO = %d returned from psgssvx()\n", info);
+            fflush(stdout);
+        }
+    } else {
+        /* Check the accuracy of the solution. */
+        psinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, grid.comm);
+    }
 
     PStatPrint(&options, &stat, &grid);        /* Print the statistics. */
     PStatFree(&stat);
@@ -242,10 +249,17 @@ int main(int argc, char *argv[])
     psgssvx(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid,
 	    &LUstruct, &SOLVEstruct, berr, &stat, &info);
 
-    /* Check the accuracy of the solution. */
-    if ( !iam )
-        printf("Solve a system with the same pattern and similar values.\n");
-    psinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue, ldx, grid.comm);
+    if ( info ) {  /* Something is wrong */
+        if ( iam==0 ) {
+            printf("ERROR: INFO = %d returned from psgssvx()\n", info);
+            fflush(stdout);
+        }
+    } else {
+        /* Check the accuracy of the solution. */
+        if ( !iam )
+            printf("Solve a system with the same pattern and similar values.\n");
+        psinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue, ldx, grid.comm);
+    }
 
     /* Print the statistics. */
     PStatPrint(&options, &stat, &grid);
diff --git a/EXAMPLE/psdrive3d.c b/EXAMPLE/psdrive3d.c
index 8e2e0ea0..6cbc21e4 100644
--- a/EXAMPLE/psdrive3d.c
+++ b/EXAMPLE/psdrive3d.c
@@ -77,8 +77,8 @@ static void checkNRFMT(NRformat_loc*A, NRformat_loc*B)
 #if 0
     double *Aval = (double *)A->nzval, *Bval = (double *)B->nzval;
-    PrintDouble5("A", A->nnz_loc, Aval);
-    PrintDouble5("B", B->nnz_loc, Bval);
+    Printdouble5("A", A->nnz_loc, Aval);
+    Printdouble5("B", B->nnz_loc, Bval);
     fflush(stdout);
 #endif
 
@@ -136,7 +136,10 @@ main (int argc, char *argv[])
     {
 	int rank;
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-	if (!rank) printf("The MPI library doesn't provide MPI_THREAD_MULTIPLE \n");
+	if (!rank) {
+	    printf("The MPI library doesn't provide MPI_THREAD_MULTIPLE \n");
+	    printf("\tprovided omp_mpi_level: %d\n", provided);
+	}
     }
 
     /* Parse command line argv[]. */
@@ -294,7 +297,7 @@ main (int argc, char *argv[])
     options.ParSymbFact = NO;
     options.ColPerm = METIS_AT_PLUS_A;
     options.RowPerm = LargeDiag_MC64;
-    options.ReplaceTinyPivot = YES;
+    options.ReplaceTinyPivot = NO;
     options.IterRefine = DOUBLE;
     options.Trans = NOTRANS;
     options.SolveInitialized = NO;
@@ -311,7 +314,7 @@ main (int argc, char *argv[])
     options.IterRefine = NOREFINE;
     options.ColPerm = NATURAL;
     options.Equil = NO;
-    options.ReplaceTinyPivot = NO;
+    options.ReplaceTinyPivot = YES;
 #endif
 
     if (!iam) {
@@ -345,10 +348,16 @@ main (int argc, char *argv[])
     psgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
                &LUstruct, &SOLVEstruct, berr, &stat, &info);
 
-    /* Check the accuracy of the solution. */
-    psinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc,
-                      nrhs, b, ldb, xtrue, ldx, grid.comm);
-    fflush(stdout);
+    if ( info ) {  /* Something is wrong */
+        if ( iam==0 ) {
+            printf("ERROR: INFO = %d returned from psgssvx3d()\n", info);
+            fflush(stdout);
+        }
+    } else {
+        /* Check the accuracy of the solution. */
+        psinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc,
+                          nrhs, b, ldb, xtrue, ldx, grid.comm);
+    }
 
     /* ------------------------------------------------------------
        DEALLOCATE STORAGE.
@@ -359,13 +368,13 @@ main (int argc, char *argv[])
 	PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/
 	sDestroy_LU (n, &(grid.grid2d), &LUstruct);
-	if (options.SolveInitialized) {
-	    sSolveFinalize (&options, &SOLVEstruct);
-	}
+	sSolveFinalize (&options, &SOLVEstruct);
     } else { // Process layers not equal 0
 	sDeAllocLlu_3d(n, &LUstruct, &grid);
 	sDeAllocGlu_3d(&LUstruct);
     }
+
+    sDestroy_A3d_gathered_on_2d(&SOLVEstruct, &grid);
 
     Destroy_CompRowLoc_Matrix_dist (&A);
     SUPERLU_FREE (b);
diff --git a/EXAMPLE/psdrive3d1.c b/EXAMPLE/psdrive3d1.c
index e925420b..c206d2fe 100644
--- a/EXAMPLE/psdrive3d1.c
+++ b/EXAMPLE/psdrive3d1.c
@@ -302,7 +302,7 @@ main (int argc, char *argv[])
     options.ParSymbFact = NO;
     options.ColPerm = METIS_AT_PLUS_A;
     options.RowPerm = LargeDiag_MC64;
-    options.ReplaceTinyPivot = YES;
+    options.ReplaceTinyPivot = NO;
     options.IterRefine = DOUBLE;
     options.Trans = NOTRANS;
     options.SolveInitialized = NO;
@@ -319,7 +319,7 @@ main (int argc, char *argv[])
     options.IterRefine = NOREFINE;
     options.ColPerm = NATURAL;
     options.Equil = NO;
-    options.ReplaceTinyPivot = NO;
+    options.ReplaceTinyPivot = YES;
 #endif
 
     if (!iam) {
@@ -343,22 +343,29 @@ main (int argc, char *argv[])
     psgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
                &LUstruct, &SOLVEstruct, berr, &stat, &info);
 
-    /* Check the accuracy of the solution. */
-    if ( !iam ) printf("\tSolve the first system:\n");
-    psinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc,
-                      nrhs, b, ldb, xtrue, ldx, grid.comm);
-
+    if ( info ) {  /* Something is wrong */
+        if ( iam==0 ) {
+            printf("ERROR: INFO = %d returned from psgssvx3d()\n", info);
+            fflush(stdout);
+        }
+    } else {
+        /* Check the accuracy of the solution. */
+        if ( !iam ) printf("\tSolve the first system:\n");
+        psinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc,
+                          nrhs, b, ldb, xtrue, ldx, grid.comm);
+    }
+
     if ( grid.zscp.Iam == 0 ) { // process layer 0
 	PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/
     }
     PStatFree (&stat);
     fflush(stdout);
 
-    /* ------------------------------------------------------------
-       2. NOW SOLVE ANOTHER SYSTEM WITH THE SAME A BUT DIFFERENT
-       RIGHT-HAND SIDE,  WE WILL USE THE EXISTING L AND U FACTORS IN
-       LUSTRUCT OBTAINED FROM A PREVIOUS FATORIZATION.
-       ------------------------------------------------------------*/
+    /* ------------------------------------------------------------
+       2. NOW SOLVE ANOTHER SYSTEM WITH THE SAME A BUT DIFFERENT
+       RIGHT-HAND SIDE,  WE WILL USE THE EXISTING L AND U FACTORS IN
+       LUSTRUCT OBTAINED FROM A PREVIOUS FATORIZATION.
+       ------------------------------------------------------------*/
     options.Fact = FACTORED; /* Indicate the factored form of A is supplied. */
     PStatInit(&stat); /* Initialize the statistics variables. */
@@ -366,11 +373,18 @@ main (int argc, char *argv[])
     psgssvx3d (&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid,
                &LUstruct, &SOLVEstruct, berr, &stat, &info);
 
-    /* Check the accuracy of the solution. */
-    if ( !iam ) printf("\tSolve the system with a different B:\n");
-    psinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc,
-                      nrhs, b1, ldb, xtrue, ldx, grid.comm);
-
+    if ( info ) {  /* Something is wrong */
+        if ( iam==0 ) {
+            printf("ERROR: INFO = %d returned from psgssvx3d()\n", info);
+            fflush(stdout);
+        }
+    } else {
+        /* Check the accuracy of the solution. */
+        if ( !iam ) printf("\tSolve the system with a different B:\n");
+        psinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc,
+                          nrhs, b1, ldb, xtrue, ldx, grid.comm);
+    }
+
     /* ------------------------------------------------------------
        DEALLOCATE STORAGE.
        ------------------------------------------------------------ */
diff --git a/EXAMPLE/psdrive3d2.c b/EXAMPLE/psdrive3d2.c
index c69e0fa2..d6e0adef 100644
--- a/EXAMPLE/psdrive3d2.c
+++ b/EXAMPLE/psdrive3d2.c
@@ -266,7 +266,7 @@ main (int argc, char *argv[])
     options.ParSymbFact = NO;
     options.ColPerm = METIS_AT_PLUS_A;
     options.RowPerm = LargeDiag_MC64;
-    options.ReplaceTinyPivot = YES;
+    options.ReplaceTinyPivot = NO;
     options.IterRefine = DOUBLE;
     options.Trans = NOTRANS;
     options.SolveInitialized = NO;
@@ -283,7 +283,7 @@ main (int argc, char *argv[])
     options.IterRefine = NOREFINE;
     options.ColPerm = NATURAL;
     options.Equil = NO;
-    options.ReplaceTinyPivot = NO;
+    options.ReplaceTinyPivot = YES;
 #endif
 
     if (!iam) {
@@ -307,11 +307,18 @@ main (int argc, char *argv[])
     psgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid,
                &LUstruct, &SOLVEstruct, berr, &stat, &info);
 
-    /* Check the accuracy of the solution. */
-    if ( !iam ) printf("\tSolve the first system:\n");
-    psinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc,
-                      nrhs, b, ldb, xtrue, ldx, grid.comm);
-
+    if ( info ) {  /* Something is wrong */
+        if ( iam==0 ) {
+            printf("ERROR: INFO = %d returned from psgssvx3d()\n", info);
+            fflush(stdout);
+        }
+    } else {
+        /* Check the accuracy of the solution. */
+        if ( !iam ) printf("\tSolve the first system:\n");
+        psinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc,
+                          nrhs, b, ldb, xtrue, ldx, grid.comm);
+    }
+
     /* Deallocate some storage, keep around 2D matrix meta structure */
     Destroy_CompRowLoc_Matrix_dist (&A);
     if ( grid.zscp.Iam == 0 ) { // process layer 0
@@ -336,19 +343,26 @@ main (int argc, char *argv[])
        a different perm_r[]. Set up the right-hand side. */
     if ( !(fp = fopen(*cpp, "r")) ) ABORT("File does not exist");
     screate_matrix_postfix3d(&A, nrhs, &b1, &ldb,
-                            &xtrue1, &ldx, fp, suffix, &(grid));
+                             &xtrue1, &ldx, fp, suffix, &(grid));
 
     PStatInit(&stat); /* Initialize the statistics variables. */
 
     nrhs = 1;
     psgssvx3d (&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid,
                &LUstruct, &SOLVEstruct, berr, &stat, &info);
-
-    /* Check the accuracy of the solution. */
-    if ( !iam ) printf("Solve the system with the same sparsity pattern.\n");
-    psinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc,
-                      nrhs, b1, ldb, xtrue1, ldx, grid.comm);
-
+
+    if ( info ) {  /* Something is wrong */
+        if ( iam==0 ) {
+            printf("ERROR: INFO = %d returned from psgssvx3d()\n", info);
+            fflush(stdout);
+        }
+    } else {
+        /* Check the accuracy of the solution. */
+        if ( !iam ) printf("Solve the system with the same sparsity pattern.\n");
+        psinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc,
+                          nrhs, b1, ldb, xtrue1, ldx, grid.comm);
+    }
+
     /* ------------------------------------------------------------
        DEALLOCATE STORAGE.
       ------------------------------------------------------------ */
diff --git a/EXAMPLE/psdrive3d3.c b/EXAMPLE/psdrive3d3.c
index e2cd45f9..89095fc5 100644
--- a/EXAMPLE/psdrive3d3.c
+++ b/EXAMPLE/psdrive3d3.c
@@ -35,7 +35,7 @@ at the top-level directory.
  * values of matrix A.
  * In this case, the row and column permutation vectors and symbolic
  * factorization are computed only once. The following data structures
- * will be reused in the subsequent call to PSGSSVX:
+ * will be reused in the subsequent call to PSGSSVX3D:
  *        ScalePermstruct : DiagScale, R, C, perm_r, perm_c
  *        LUstruct        : etree, Glu_persist, Llu
  *        SOLVEstruct     : communication metadata for SpTRSV, SpMV, and
@@ -44,7 +44,7 @@ at the top-level directory.
  * NOTE:
  * The distributed nonzero structures of L and U remain the same,
  * although the numerical values are different. So 'Llu' is set up once
- * in the first call to PSGSSVX, and reused in the subsequent call.
+ * in the first call to PSGSSVX3D, and reused in the subsequent call.
 *
 * The program may be run by typing:
 *    mpiexec -np

psdrive3d3 -r -c \ @@ -263,7 +263,7 @@ main (int argc, char *argv[]) options.ParSymbFact = NO; options.ColPerm = METIS_AT_PLUS_A; options.RowPerm = LargeDiag_MC64; - options.ReplaceTinyPivot = YES; + options.ReplaceTinyPivot = NO; options.IterRefine = DOUBLE; options.Trans = NOTRANS; options.SolveInitialized = NO; @@ -280,7 +280,7 @@ main (int argc, char *argv[]) options.IterRefine = NOREFINE; options.ColPerm = NATURAL; options.Equil = NO; - options.ReplaceTinyPivot = NO; + options.ReplaceTinyPivot = YES; #endif if (!iam) { @@ -304,11 +304,18 @@ main (int argc, char *argv[]) psgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); - /* Check the accuracy of the solution. */ - if ( !iam ) printf("\tSolve the first system:\n"); - psinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, - nrhs, b, ldb, xtrue, ldx, grid.comm); - + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from psgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) printf("\tSolve the first system:\n"); + psinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid.comm); + } + /* Deallocate some storage, including replicated LU structure along the Z dimension. keep around 2D matrix meta structure, including the LU data structure on the host side. */ @@ -350,11 +357,18 @@ main (int argc, char *argv[]) psgssvx3d (&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); - /* Check the accuracy of the solution. */ - if ( !iam ) printf("Solve a system with the same pattern and similar values.\n"); - psinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, - nrhs, b1, ldb, xtrue1, ldx, grid.comm); - + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from psgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) printf("Solve a system with the same pattern and similar values.\n"); + psinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b1, ldb, xtrue1, ldx, grid.comm); + } + /* ------------------------------------------------------------ DEALLOCATE ALL STORAGE. ------------------------------------------------------------ */ diff --git a/EXAMPLE/psdrive4.c b/EXAMPLE/psdrive4.c index db3216a3..55057ca2 100644 --- a/EXAMPLE/psdrive4.c +++ b/EXAMPLE/psdrive4.c @@ -191,9 +191,16 @@ int main(int argc, char *argv[]) psgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid1, &LUstruct, &SOLVEstruct, berr, &stat, &info); - /* Check the accuracy of the solution. */ - psinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, - nrhs, b, ldb, xtrue, ldx, grid1.comm); + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from psgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + psinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid1.comm); + } /* Print the statistics. */ PStatPrint(&options, &stat, &grid1); @@ -256,10 +263,17 @@ int main(int argc, char *argv[]) psgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid2, &LUstruct, &SOLVEstruct, berr, &stat, &info); - /* Check the accuracy of the solution. 
*/ - psinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, - nrhs, b, ldb, xtrue, ldx, grid2.comm); - + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from psgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + psinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid2.comm); + } + /* Print the statistics. */ PStatPrint(&options, &stat, &grid2); diff --git a/EXAMPLE/pzdrive.c b/EXAMPLE/pzdrive.c index 342b91a5..493997b2 100644 --- a/EXAMPLE/pzdrive.c +++ b/EXAMPLE/pzdrive.c @@ -74,12 +74,7 @@ int main(int argc, char *argv[]) ------------------------------------------------------------*/ //MPI_Init( &argc, &argv ); MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); -#ifdef GPU_ACC - int rank, devs; - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - cudaGetDeviceCount(&devs); - cudaSetDevice(rank % devs); -#endif + #if ( VAMPIR>=1 ) VT_traceoff(); @@ -143,7 +138,7 @@ int main(int argc, char *argv[]) /* Bail out if I do not belong in the grid. */ iam = grid.iam; - if ( iam == -1 ) goto out; + if ( (iam >= nprow * npcol) || (iam == -1) ) goto out; if ( !iam ) { int v_major, v_minor, v_bugfix; #ifdef __INTEL_COMPILER @@ -229,10 +224,16 @@ int main(int argc, char *argv[]) pzgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); - - /* Check the accuracy of the solution. */ - pzinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, - nrhs, b, ldb, xtrue, ldx, grid.comm); + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + pzinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid.comm); + } PStatPrint(&options, &stat, &grid); /* Print the statistics. */ @@ -245,9 +246,7 @@ int main(int argc, char *argv[]) zScalePermstructFree(&ScalePermstruct); zDestroy_LU(n, &grid, &LUstruct); zLUstructFree(&LUstruct); - if ( options.SolveInitialized ) { - zSolveFinalize(&options, &SOLVEstruct); - } + zSolveFinalize(&options, &SOLVEstruct); SUPERLU_FREE(b); SUPERLU_FREE(xtrue); SUPERLU_FREE(berr); diff --git a/EXAMPLE/pzdrive1.c b/EXAMPLE/pzdrive1.c index 69aea1d0..e0f8ff4b 100644 --- a/EXAMPLE/pzdrive1.c +++ b/EXAMPLE/pzdrive1.c @@ -72,12 +72,7 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); -#ifdef GPU_ACC - int rank, devs; - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - cudaGetDeviceCount(&devs); - cudaSetDevice(rank % devs); -#endif + /* Parse command line argv[]. */ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { @@ -199,11 +194,17 @@ int main(int argc, char *argv[]) pzgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); - - /* Check the accuracy of the solution. */ - if ( !iam ) printf("\tSolve the first system:\n"); - pzinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, grid.comm); - + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. 
*/ + if ( !iam ) printf("\tSolve the first system:\n"); + pzinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, grid.comm); + } + PStatPrint(&options, &stat, &grid); /* Print the statistics. */ PStatFree(&stat); @@ -219,10 +220,17 @@ int main(int argc, char *argv[]) pzgssvx(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); - /* Check the accuracy of the solution. */ - if ( !iam ) printf("\tSolve the system with a different B:\n"); - pzinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue, ldx, grid.comm); - + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) printf("\tSolve the system with a different B:\n"); + pzinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue, ldx, grid.comm); + } + PStatPrint(&options, &stat, &grid); /* Print the statistics. */ PStatFree(&stat); @@ -249,10 +257,17 @@ int main(int argc, char *argv[]) pzgssvx(&options, &A, &ScalePermstruct, b2, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); - /* Check the accuracy of the solution. */ - if ( !iam ) printf("\tSolve the system with 3 RHS's:\n"); - pzinf_norm_error(iam, m_loc, nrhs, b2, ldb, xtrue, ldx, grid.comm); - + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) printf("\tSolve the system with 3 RHS's:\n"); + pzinf_norm_error(iam, m_loc, nrhs, b2, ldb, xtrue, ldx, grid.comm); + } + PStatPrint(&options, &stat, &grid); /* Print the statistics. */ PStatFree(&stat); diff --git a/EXAMPLE/pzdrive2.c b/EXAMPLE/pzdrive2.c index bd1068d8..ac042c2b 100644 --- a/EXAMPLE/pzdrive2.c +++ b/EXAMPLE/pzdrive2.c @@ -82,12 +82,7 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); -#ifdef GPU_ACC - int rank, devs; - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - cudaGetDeviceCount(&devs); - cudaSetDevice(rank % devs); -#endif + /* Parse command line argv[]. */ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { @@ -195,8 +190,15 @@ int main(int argc, char *argv[]) pzgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); - /* Check the accuracy of the solution. */ - pzinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, grid.comm); + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + pzinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, grid.comm); + } PStatPrint(&options, &stat, &grid); /* Print the statistics. */ PStatFree(&stat); @@ -232,10 +234,16 @@ int main(int argc, char *argv[]) pzgssvx(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); - /* Check the accuracy of the solution. */ - if ( !iam ) printf("Solve the system with the same sparsity pattern.\n"); - pzinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue1, ldx, grid.comm); - + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. 
*/ + if ( !iam ) printf("Solve the system with the same sparsity pattern.\n"); + pzinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue1, ldx, grid.comm); + } #if ( PRNTlevel>=2 ) if (iam==0) { PrintInt10("new perm_r", m, ScalePermstruct.perm_r); diff --git a/EXAMPLE/pzdrive3.c b/EXAMPLE/pzdrive3.c index afe96b8d..df7a7479 100644 --- a/EXAMPLE/pzdrive3.c +++ b/EXAMPLE/pzdrive3.c @@ -79,12 +79,7 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); -#ifdef GPU_ACC - int rank, devs; - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - cudaGetDeviceCount(&devs); - cudaSetDevice(rank % devs); -#endif + /* Parse command line argv[]. */ for (cpp = argv+1; *cpp; ++cpp) { if ( **cpp == '-' ) { @@ -210,8 +205,15 @@ int main(int argc, char *argv[]) pzgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); - /* Check the accuracy of the solution. */ - pzinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, grid.comm); + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + pzinf_norm_error(iam, m_loc, nrhs, b, ldb, xtrue, ldx, grid.comm); + } PStatPrint(&options, &stat, &grid); /* Print the statistics. */ PStatFree(&stat); @@ -247,10 +249,17 @@ int main(int argc, char *argv[]) pzgssvx(&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); - /* Check the accuracy of the solution. */ - if ( !iam ) - printf("Solve a system with the same pattern and similar values.\n"); - pzinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue, ldx, grid.comm); + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) + printf("Solve a system with the same pattern and similar values.\n"); + pzinf_norm_error(iam, m_loc, nrhs, b1, ldb, xtrue, ldx, grid.comm); + } /* Print the statistics. */ PStatPrint(&options, &stat, &grid); diff --git a/EXAMPLE/pzdrive3d.c b/EXAMPLE/pzdrive3d.c index 34a5fb28..1d35fb24 100644 --- a/EXAMPLE/pzdrive3d.c +++ b/EXAMPLE/pzdrive3d.c @@ -132,12 +132,6 @@ main (int argc, char *argv[]) int required = MPI_THREAD_MULTIPLE; int provided; MPI_Init_thread(&argc, &argv, required, &provided); -#ifdef GPU_ACC - int rank, devs; - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - cudaGetDeviceCount(&devs); - cudaSetDevice(rank % devs); -#endif if (provided < required) { int rank; @@ -189,7 +183,6 @@ main (int argc, char *argv[]) INITIALIZE THE SUPERLU PROCESS GRID. 
------------------------------------------------------------ */ superlu_gridinit3d (MPI_COMM_WORLD, nprow, npcol, npdep, &grid); - // grid.rankorder = 1; if(grid.iam==0) { MPI_Query_thread(&omp_mpi_level); @@ -304,7 +297,7 @@ main (int argc, char *argv[]) options.ParSymbFact = NO; options.ColPerm = METIS_AT_PLUS_A; options.RowPerm = LargeDiag_MC64; - options.ReplaceTinyPivot = YES; + options.ReplaceTinyPivot = NO; options.IterRefine = DOUBLE; options.Trans = NOTRANS; options.SolveInitialized = NO; @@ -321,7 +314,7 @@ main (int argc, char *argv[]) options.IterRefine = NOREFINE; options.ColPerm = NATURAL; options.Equil = NO; - options.ReplaceTinyPivot = NO; + options.ReplaceTinyPivot = YES; #endif if (!iam) { @@ -355,10 +348,16 @@ main (int argc, char *argv[]) pzgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); - /* Check the accuracy of the solution. */ - pzinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, - nrhs, b, ldb, xtrue, ldx, grid.comm); - fflush(stdout); + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + pzinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid.comm); + } /* ------------------------------------------------------------ DEALLOCATE STORAGE. diff --git a/EXAMPLE/pzdrive3d1.c b/EXAMPLE/pzdrive3d1.c index 660c34e1..47905c56 100644 --- a/EXAMPLE/pzdrive3d1.c +++ b/EXAMPLE/pzdrive3d1.c @@ -302,7 +302,7 @@ main (int argc, char *argv[]) options.ParSymbFact = NO; options.ColPerm = METIS_AT_PLUS_A; options.RowPerm = LargeDiag_MC64; - options.ReplaceTinyPivot = YES; + options.ReplaceTinyPivot = NO; options.IterRefine = DOUBLE; options.Trans = NOTRANS; options.SolveInitialized = NO; @@ -319,7 +319,7 @@ main (int argc, char *argv[]) options.IterRefine = NOREFINE; options.ColPerm = NATURAL; options.Equil = NO; - options.ReplaceTinyPivot = NO; + options.ReplaceTinyPivot = YES; #endif if (!iam) { @@ -340,26 +340,32 @@ main (int argc, char *argv[]) PStatInit (&stat); /* Call the linear equation solver. */ - nrhs = 0; pzgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); - /* Check the accuracy of the solution. */ - if ( !iam ) printf("\tSolve the first system:\n"); - pzinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, - nrhs, b, ldb, xtrue, ldx, grid.comm); - + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) printf("\tSolve the first system:\n"); + pzinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid.comm); + } + if ( grid.zscp.Iam == 0 ) { // process layer 0 PStatPrint (&options, &stat, &(grid.grid2d)); /* Print 2D statistics.*/ } PStatFree (&stat); fflush(stdout); - /* ------------------------------------------------------------ - 2. NOW SOLVE ANOTHER SYSTEM WITH THE SAME A BUT DIFFERENT - RIGHT-HAND SIDE, WE WILL USE THE EXISTING L AND U FACTORS IN - LUSTRUCT OBTAINED FROM A PREVIOUS FATORIZATION. - ------------------------------------------------------------*/ + /* ------------------------------------------------------------ + 2. 
NOW SOLVE ANOTHER SYSTEM WITH THE SAME A BUT DIFFERENT + RIGHT-HAND SIDE, WE WILL USE THE EXISTING L AND U FACTORS IN + LUSTRUCT OBTAINED FROM A PREVIOUS FATORIZATION. + ------------------------------------------------------------*/ options.Fact = FACTORED; /* Indicate the factored form of A is supplied. */ PStatInit(&stat); /* Initialize the statistics variables. */ @@ -367,11 +373,18 @@ main (int argc, char *argv[]) pzgssvx3d (&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); - /* Check the accuracy of the solution. */ - if ( !iam ) printf("\tSolve the system with a different B:\n"); - pzinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, - nrhs, b1, ldb, xtrue, ldx, grid.comm); - + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) printf("\tSolve the system with a different B:\n"); + pzinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b1, ldb, xtrue, ldx, grid.comm); + } + /* ------------------------------------------------------------ DEALLOCATE STORAGE. ------------------------------------------------------------ */ diff --git a/EXAMPLE/pzdrive3d2.c b/EXAMPLE/pzdrive3d2.c index a6204d45..759b2afb 100644 --- a/EXAMPLE/pzdrive3d2.c +++ b/EXAMPLE/pzdrive3d2.c @@ -266,7 +266,7 @@ main (int argc, char *argv[]) options.ParSymbFact = NO; options.ColPerm = METIS_AT_PLUS_A; options.RowPerm = LargeDiag_MC64; - options.ReplaceTinyPivot = YES; + options.ReplaceTinyPivot = NO; options.IterRefine = DOUBLE; options.Trans = NOTRANS; options.SolveInitialized = NO; @@ -283,7 +283,7 @@ main (int argc, char *argv[]) options.IterRefine = NOREFINE; options.ColPerm = NATURAL; options.Equil = NO; - options.ReplaceTinyPivot = NO; + options.ReplaceTinyPivot = YES; #endif if (!iam) { @@ -307,11 +307,18 @@ main (int argc, char *argv[]) pzgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); - /* Check the accuracy of the solution. */ - if ( !iam ) printf("\tSolve the first system:\n"); - pzinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, - nrhs, b, ldb, xtrue, ldx, grid.comm); - + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) printf("\tSolve the first system:\n"); + pzinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid.comm); + } + /* Deallocate some storage, keep around 2D matrix meta structure */ Destroy_CompRowLoc_Matrix_dist (&A); if ( grid.zscp.Iam == 0 ) { // process layer 0 @@ -336,19 +343,26 @@ main (int argc, char *argv[]) a different perm_r[]. Set up the right-hand side. */ if ( !(fp = fopen(*cpp, "r")) ) ABORT("File does not exist"); zcreate_matrix_postfix3d(&A, nrhs, &b1, &ldb, - &xtrue1, &ldx, fp, suffix, &(grid)); + &xtrue1, &ldx, fp, suffix, &(grid)); PStatInit(&stat); /* Initialize the statistics variables. */ nrhs = 1; pzgssvx3d (&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); - - /* Check the accuracy of the solution. 
*/ - if ( !iam ) printf("Solve the system with the same sparsity pattern.\n"); - pzinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, - nrhs, b1, ldb, xtrue1, ldx, grid.comm); - + + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) printf("Solve the system with the same sparsity pattern.\n"); + pzinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b1, ldb, xtrue1, ldx, grid.comm); + } + /* ------------------------------------------------------------ DEALLOCATE STORAGE. ------------------------------------------------------------ */ diff --git a/EXAMPLE/pzdrive3d3.c b/EXAMPLE/pzdrive3d3.c index c3aa53a3..9eab4a11 100644 --- a/EXAMPLE/pzdrive3d3.c +++ b/EXAMPLE/pzdrive3d3.c @@ -34,7 +34,7 @@ at the top-level directory. * values of matrix A. * In this case, the row and column permutation vectors and symbolic * factorization are computed only once. The following data structures - * will be reused in the subsequent call to PZGSSVX: + * will be reused in the subsequent call to PZGSSVX3D: * ScalePermstruct : DiagScale, R, C, perm_r, perm_c * LUstruct : etree, Glu_persist, Llu * SOLVEstruct : communication metadata for SpTRSV, SpMV, and @@ -43,7 +43,7 @@ at the top-level directory. * NOTE: * The distributed nonzero structures of L and U remain the same, * although the numerical values are different. So 'Llu' is set up once - * in the first call to PZGSSVX, and reused in the subsequent call. + * in the first call to PZGSSVX3D, and reused in the subsequent call. * * The program may be run by typing: * mpiexec -np

pzdrive3d3 -r -c \ @@ -263,7 +263,7 @@ main (int argc, char *argv[]) options.ParSymbFact = NO; options.ColPerm = METIS_AT_PLUS_A; options.RowPerm = LargeDiag_MC64; - options.ReplaceTinyPivot = YES; + options.ReplaceTinyPivot = NO; options.IterRefine = DOUBLE; options.Trans = NOTRANS; options.SolveInitialized = NO; @@ -280,7 +280,7 @@ main (int argc, char *argv[]) options.IterRefine = NOREFINE; options.ColPerm = NATURAL; options.Equil = NO; - options.ReplaceTinyPivot = NO; + options.ReplaceTinyPivot = YES; #endif if (!iam) { @@ -304,11 +304,18 @@ main (int argc, char *argv[]) pzgssvx3d (&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); - /* Check the accuracy of the solution. */ - if ( !iam ) printf("\tSolve the first system:\n"); - pzinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, - nrhs, b, ldb, xtrue, ldx, grid.comm); - + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) printf("\tSolve the first system:\n"); + pzinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid.comm); + } + /* Deallocate some storage, including replicated LU structure along the Z dimension. keep around 2D matrix meta structure, including the LU data structure on the host side. */ @@ -350,11 +357,18 @@ main (int argc, char *argv[]) pzgssvx3d (&options, &A, &ScalePermstruct, b1, ldb, nrhs, &grid, &LUstruct, &SOLVEstruct, berr, &stat, &info); - /* Check the accuracy of the solution. */ - if ( !iam ) printf("Solve a system with the same pattern and similar values.\n"); - pzinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, - nrhs, b1, ldb, xtrue1, ldx, grid.comm); - + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx3d()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + if ( !iam ) printf("Solve a system with the same pattern and similar values.\n"); + pzinf_norm_error (iam, ((NRformat_loc *) A.Store)->m_loc, + nrhs, b1, ldb, xtrue1, ldx, grid.comm); + } + /* ------------------------------------------------------------ DEALLOCATE ALL STORAGE. ------------------------------------------------------------ */ diff --git a/EXAMPLE/pzdrive4.c b/EXAMPLE/pzdrive4.c index 33451140..1b95ded7 100644 --- a/EXAMPLE/pzdrive4.c +++ b/EXAMPLE/pzdrive4.c @@ -73,12 +73,7 @@ int main(int argc, char *argv[]) INITIALIZE MPI ENVIRONMENT. ------------------------------------------------------------*/ MPI_Init_thread( &argc, &argv, MPI_THREAD_MULTIPLE, &omp_mpi_level); -#ifdef GPU_ACC - int rank, devs; - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - cudaGetDeviceCount(&devs); - cudaSetDevice(rank % devs); -#endif + MPI_Comm_size( MPI_COMM_WORLD, &nprocs ); if ( nprocs < 10 ) { fprintf(stderr, "Requires at least 10 processes\n"); @@ -195,9 +190,16 @@ int main(int argc, char *argv[]) pzgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid1, &LUstruct, &SOLVEstruct, berr, &stat, &info); - /* Check the accuracy of the solution. */ - pzinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, - nrhs, b, ldb, xtrue, ldx, grid1.comm); + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. 
*/ + pzinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid1.comm); + } /* Print the statistics. */ PStatPrint(&options, &stat, &grid1); @@ -260,10 +262,17 @@ int main(int argc, char *argv[]) pzgssvx(&options, &A, &ScalePermstruct, b, ldb, nrhs, &grid2, &LUstruct, &SOLVEstruct, berr, &stat, &info); - /* Check the accuracy of the solution. */ - pzinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, - nrhs, b, ldb, xtrue, ldx, grid2.comm); - + if ( info ) { /* Something is wrong */ + if ( iam==0 ) { + printf("ERROR: INFO = %d returned from pzgssvx()\n", info); + fflush(stdout); + } + } else { + /* Check the accuracy of the solution. */ + pzinf_norm_error(iam, ((NRformat_loc *)A.Store)->m_loc, + nrhs, b, ldb, xtrue, ldx, grid2.comm); + } + /* Print the statistics. */ PStatPrint(&options, &stat, &grid2); diff --git a/SRC/dtreeFactorization.c b/SRC/dtreeFactorization.c index 85b70ca7..b7fbdbff 100644 --- a/SRC/dtreeFactorization.c +++ b/SRC/dtreeFactorization.c @@ -315,6 +315,9 @@ int_t dsparseTreeFactor_ASYNC( return 1; } + /* Test the input parameters. */ + *info = 0; + #if ( DEBUGlevel>=1 ) CHECK_MALLOC (grid3d->iam, "Enter dsparseTreeFactor_ASYNC()"); #endif diff --git a/SRC/pdgssvx3d.c b/SRC/pdgssvx3d.c index 55580f54..1a74194b 100644 --- a/SRC/pdgssvx3d.c +++ b/SRC/pdgssvx3d.c @@ -14,10 +14,11 @@ at the top-level directory. * \brief Solves a system of linear equations A*X=B using 3D process grid. * *

- * -- Distributed SuperLU routine (version 7.0) --
+ * -- Distributed SuperLU routine (version 7.1.0) --
  * Lawrence Berkeley National Lab, Georgia Institute of Technology,
  * Oak Ridge National Lab
  * May 12, 2021
+ * October 5, 2021 (last update: November 8, 2021)
  */
 #include "superlu_ddefs.h"
 
@@ -1273,7 +1274,7 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
         dDestroy_trf3Dpartition(trf3Dpartition, grid3d);
 	SCT_free(SCT);
 
-    } /* end if not Factored */
+    } /* end if not Factored ... factor on all process layers */
     
     if ( grid3d->zscp.Iam == 0 ) { // only process layer 0
 	if (!factored) {
@@ -1326,7 +1327,7 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 	/* ------------------------------------------------------------
 	   Compute the solution matrix X.
 	   ------------------------------------------------------------ */
-	if ( nrhs > 0 ) {
+	if ( (nrhs > 0) && (*info == 0) ) {
 	    if (!(b_work = doubleMalloc_dist (n)))
 	        ABORT ("Malloc fails for b_work[]");
 
@@ -1542,7 +1543,7 @@ pdgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 		SUPERLU_FREE (b_work);
 		SUPERLU_FREE (X);
 		
-	    } /* end if nrhs > 0 */
+	    } /* end if nrhs > 0 and factor successful */
 	
 #if ( PRNTlevel>=1 )
 	if (!iam) {
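The (nrhs > 0) && (*info == 0) guard above means the solve phase is entered
only when the factorization succeeded and there is at least one right-hand
side. A minimal control-flow sketch, with hypothetical stand-ins for the
triangular-solve and refinement kernels:

#include <stdio.h>

/* hypothetical stand-ins for the real solve and refinement routines */
static void triangular_solve(void) { puts("solve"); }
static void iterative_refine(void) { puts("refine"); }

/* Mirror of the guard added in p[dsz]gssvx3d: a factorization that set
   info > 0 (zero pivot) skips the solves, so U is never used for division. */
static void solve_phase(int nrhs, int info, int do_refine)
{
    if (nrhs > 0 && info == 0) {
        triangular_solve();
        if (do_refine) iterative_refine();
    }
}

int main(void) { solve_phase(1, 0, 1); return 0; }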
diff --git a/SRC/pdgstrf2.c b/SRC/pdgstrf2.c
index fca4792c..8c5a1933 100644
--- a/SRC/pdgstrf2.c
+++ b/SRC/pdgstrf2.c
@@ -398,7 +398,24 @@ int_t LpanelUpdate(int off0,  int nsupc, double* ublk_ptr, int ld_ujrow,
 
 #pragma GCC push_options
 #pragma GCC optimize ("O0")
-/*factorizes the diagonal block; called from process that owns the (k,k) block*/
+/************************************************************************/
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *   Factorize the diagonal block; called from the process that owns the (k,k) block
+ *
+ * Arguments
+ * =========
+ * 
+ * info   (output) int*
+ *        = 0: successful exit
+ *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
+ *             been completed, but the factor U is exactly singular,
+ *             and division by zero will occur if it is used to solve a
+ *             system of equations.
+ */
 void Local_Dgstrf2(superlu_dist_options_t *options, int_t k, double thresh,
                    double *BlockUFactor, /*factored U is overwritten here*/
                    Glu_persist_t *Glu_persist, gridinfo_t *grid, dLocalLU_t *Llu,
@@ -433,8 +450,9 @@ void Local_Dgstrf2(superlu_dist_options_t *options, int_t k, double thresh,
     {
         /* Diagonal pivot */
         int_t i = luptr;
-        /* Not to replace zero pivot.  */
-        if (options->ReplaceTinyPivot == YES && lusup[i] != 0.0)
+        /* Allow replacing a zero pivot.  */
+        //if (options->ReplaceTinyPivot == YES && lusup[i] != 0.0)
+        if (options->ReplaceTinyPivot == YES)
         {
             if (fabs (lusup[i]) < thresh) {  /* Diagonal */
 
@@ -491,7 +509,7 @@ void Local_Dgstrf2(superlu_dist_options_t *options, int_t k, double thresh,
 
     //int_t thread_id = omp_get_thread_num();
     // SCT->Local_Dgstrf2_Thread_tl[thread_id * CACHE_LINE_SIZE] += (double) ( SuperLU_timer_() - t1);
-}
+} /* end Local_Dgstrf2 */
 
 #pragma GCC pop_options
 /************************************************************************/
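The relaxed test above now also replaces an exactly-zero pivot, not only a
tiny nonzero one. A standalone sketch of the replacement rule, assuming the
sign-preserving convention used for tiny pivots (thresh is eps * ||A||, as
computed in the calling factorization routine):

#include <math.h>

/* Replace a diagonal pivot whose magnitude is below thresh -- including an
   exact zero -- by +/- thresh, keeping the original sign when there is one. */
static void replace_tiny_pivot(double *pivot, double thresh)
{
    if (fabs(*pivot) < thresh)
        *pivot = (*pivot < 0.0) ? -thresh : thresh;
}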
diff --git a/SRC/pdgstrf3d.c b/SRC/pdgstrf3d.c
index 2fa6f0dc..bd378585 100644
--- a/SRC/pdgstrf3d.c
+++ b/SRC/pdgstrf3d.c
@@ -131,6 +131,9 @@ int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
     double s_eps = smach_dist("Epsilon");
     double thresh = s_eps * anorm;
 
+    /* Test the input parameters. */
+    *info = 0;
+    
 #if ( DEBUGlevel>=1 )
     CHECK_MALLOC (grid3d->iam, "Enter pdgstrf3d()");
 #endif
@@ -349,7 +352,14 @@ int_t pdgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
     }
 #endif
     
-    MPI_Barrier( grid3d->comm);
+    /* Prepare error message - find the smallest index i such that U(i,i)==0 */
+    int iinfo;
+    if ( *info == 0 ) *info = n + 1;
+    MPI_Allreduce (info, &iinfo, 1, MPI_INT, MPI_MIN, grid3d->comm);
+    if ( iinfo == n + 1 ) *info = 0;
+    else *info = iinfo;
+    //printf("After factorization: INFO = %d\n", *info); fflush(stdout);
+
     SCT->pdgstrfTimer = SuperLU_timer_() - SCT->pdgstrfTimer;
 
 #ifdef ITAC_PROF
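The MPI_Allreduce that replaces the barrier is still a synchronization point,
but it also agrees on a global INFO: success is encoded as n + 1 so that
MPI_MIN picks out the smallest zero-pivot index held by any rank. A
self-contained sketch of the idiom:

#include <mpi.h>

/* Combine per-rank INFO values: ranks that saw no zero pivot contribute
   n + 1, so the minimum is either the smallest failing diagonal index or
   n + 1 (decoded back to 0) when every rank succeeded. */
int reduce_info(int local_info, int n, MPI_Comm comm)
{
    int sendbuf = (local_info == 0) ? n + 1 : local_info;
    int global_info;
    MPI_Allreduce(&sendbuf, &global_info, 1, MPI_INT, MPI_MIN, comm);
    return (global_info == n + 1) ? 0 : global_info;
}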
diff --git a/SRC/psgssvx3d.c b/SRC/psgssvx3d.c
index 2f00ca73..80bdd329 100644
--- a/SRC/psgssvx3d.c
+++ b/SRC/psgssvx3d.c
@@ -14,10 +14,11 @@ at the top-level directory.
  * \brief Solves a system of linear equations A*X=B using 3D process grid.
  *
  * 
- * -- Distributed SuperLU routine (version 7.0) --
+ * -- Distributed SuperLU routine (version 7.1.0) --
  * Lawrence Berkeley National Lab, Georgia Institute of Technology,
  * Oak Ridge National Lab
  * May 12, 2021
+ * October 5, 2021 (last update: November 8, 2021)
  */
 #include "superlu_sdefs.h"
 
@@ -1273,7 +1274,7 @@ psgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
         sDestroy_trf3Dpartition(trf3Dpartition, grid3d);
 	SCT_free(SCT);
 
-    } /* end if not Factored */
+    } /* end if not Factored ... factor on all process layers */
     
     if ( grid3d->zscp.Iam == 0 ) { // only process layer 0
 	if (!factored) {
@@ -1326,7 +1327,7 @@ psgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 	/* ------------------------------------------------------------
 	   Compute the solution matrix X.
 	   ------------------------------------------------------------ */
-	if ( nrhs > 0 ) {
+	if ( (nrhs > 0) && (*info == 0) ) {
 	    if (!(b_work = floatMalloc_dist (n)))
 	        ABORT ("Malloc fails for b_work[]");
 
@@ -1542,7 +1543,7 @@ psgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 		SUPERLU_FREE (b_work);
 		SUPERLU_FREE (X);
 		
-	    } /* end if nrhs > 0 */
+	    } /* end if nrhs > 0 and factor successful */
 	
 #if ( PRNTlevel>=1 )
 	if (!iam) {
diff --git a/SRC/psgstrf2.c b/SRC/psgstrf2.c
index 8976d79b..b07701b2 100644
--- a/SRC/psgstrf2.c
+++ b/SRC/psgstrf2.c
@@ -398,7 +398,24 @@ int_t LpanelUpdate(int off0,  int nsupc, float* ublk_ptr, int ld_ujrow,
 
 #pragma GCC push_options
 #pragma GCC optimize ("O0")
-/*factorizes the diagonal block; called from process that owns the (k,k) block*/
+/************************************************************************/
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *   Factorize the diagonal block; called from the process that owns the (k,k) block
+ *
+ * Arguments
+ * =========
+ * 
+ * info   (output) int*
+ *        = 0: successful exit
+ *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
+ *             been completed, but the factor U is exactly singular,
+ *             and division by zero will occur if it is used to solve a
+ *             system of equations.
+ */
 void Local_Sgstrf2(superlu_dist_options_t *options, int_t k, double thresh,
                    float *BlockUFactor, /*factored U is overwritten here*/
                    Glu_persist_t *Glu_persist, gridinfo_t *grid, sLocalLU_t *Llu,
@@ -433,8 +450,9 @@ void Local_Sgstrf2(superlu_dist_options_t *options, int_t k, double thresh,
     {
         /* Diagonal pivot */
         int_t i = luptr;
-        /* Not to replace zero pivot.  */
-        if (options->ReplaceTinyPivot == YES && lusup[i] != 0.0)
+        /* Allow replacing a zero pivot.  */
+        //if (options->ReplaceTinyPivot == YES && lusup[i] != 0.0)
+        if (options->ReplaceTinyPivot == YES)
         {
             if (fabs (lusup[i]) < thresh) {  /* Diagonal */
 
@@ -491,7 +509,7 @@ void Local_Sgstrf2(superlu_dist_options_t *options, int_t k, double thresh,
 
     //int_t thread_id = omp_get_thread_num();
     // SCT->Local_Dgstrf2_Thread_tl[thread_id * CACHE_LINE_SIZE] += (double) ( SuperLU_timer_() - t1);
-}
+} /* end Local_Sgstrf2 */
 
 #pragma GCC pop_options
 /************************************************************************/
diff --git a/SRC/psgstrf3d.c b/SRC/psgstrf3d.c
index a43110d5..3fe7a0a7 100644
--- a/SRC/psgstrf3d.c
+++ b/SRC/psgstrf3d.c
@@ -131,6 +131,9 @@ int_t psgstrf3d(superlu_dist_options_t *options, int m, int n, float anorm,
     double s_eps = smach_dist("Epsilon");
     double thresh = s_eps * anorm;
 
+    /* Test the input parameters. */
+    *info = 0;
+    
 #if ( DEBUGlevel>=1 )
     CHECK_MALLOC (grid3d->iam, "Enter psgstrf3d()");
 #endif
@@ -349,7 +352,14 @@ int_t psgstrf3d(superlu_dist_options_t *options, int m, int n, float anorm,
     }
 #endif
     
-    MPI_Barrier( grid3d->comm);
+    /* Prepare error message - find the smallest index i such that U(i,i)==0 */
+    int iinfo;
+    if ( *info == 0 ) *info = n + 1;
+    MPI_Allreduce (info, &iinfo, 1, MPI_INT, MPI_MIN, grid3d->comm);
+    if ( iinfo == n + 1 ) *info = 0;
+    else *info = iinfo;
+    //printf("After factorization: INFO = %d\n", *info); fflush(stdout);
+
     SCT->pdgstrfTimer = SuperLU_timer_() - SCT->pdgstrfTimer;
 
 #ifdef ITAC_PROF
diff --git a/SRC/pzgssvx3d.c b/SRC/pzgssvx3d.c
index 7c481faf..df80cf00 100644
--- a/SRC/pzgssvx3d.c
+++ b/SRC/pzgssvx3d.c
@@ -13,10 +13,11 @@ at the top-level directory.
  * \brief Solves a system of linear equations A*X=B using 3D process grid.
  *
  * 
- * -- Distributed SuperLU routine (version 7.0) --
+ * -- Distributed SuperLU routine (version 7.1.0) --
  * Lawrence Berkeley National Lab, Georgia Institute of Technology,
  * Oak Ridge National Lab
  * May 12, 2021
+ * October 5, 2021 (last update: November 8, 2021)
  */
 #include "superlu_zdefs.h"
 
@@ -1274,7 +1275,7 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
         zDestroy_trf3Dpartition(trf3Dpartition, grid3d);
 	SCT_free(SCT);
 
-    } /* end if not Factored */
+    } /* end if not Factored ... factor on all process layers */
     
     if ( grid3d->zscp.Iam == 0 ) { // only process layer 0
 	if (!factored) {
@@ -1327,7 +1328,7 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 	/* ------------------------------------------------------------
 	   Compute the solution matrix X.
 	   ------------------------------------------------------------ */
-	if ( nrhs > 0 ) {
+	if ( (nrhs > 0) && (*info == 0) ) {
 	    if (!(b_work = doublecomplexMalloc_dist (n)))
 	        ABORT ("Malloc fails for b_work[]");
 
@@ -1543,7 +1544,7 @@ pzgssvx3d (superlu_dist_options_t * options, SuperMatrix * A,
 		SUPERLU_FREE (b_work);
 		SUPERLU_FREE (X);
 		
-	    } /* end if nrhs > 0 */
+	    } /* end if nrhs > 0 and factor successful */
 	
 #if ( PRNTlevel>=1 )
 	if (!iam) {
diff --git a/SRC/pzgstrf2.c b/SRC/pzgstrf2.c
index 974d891d..46461501 100644
--- a/SRC/pzgstrf2.c
+++ b/SRC/pzgstrf2.c
@@ -398,7 +398,24 @@ int_t LpanelUpdate(int off0,  int nsupc, doublecomplex* ublk_ptr, int ld_ujrow,
 
 #pragma GCC push_options
 #pragma GCC optimize ("O0")
-/*factorizes the diagonal block; called from process that owns the (k,k) block*/
+/************************************************************************/
+/*! \brief
+ *
+ * 
+ * Purpose
+ * =======
+ *   Factorize the diagonal block; called from the process that owns the (k,k) block
+ *
+ * Arguments
+ * =========
+ * 
+ * info   (output) int*
+ *        = 0: successful exit
+ *        > 0: if info = i, U(i,i) is exactly zero. The factorization has
+ *             been completed, but the factor U is exactly singular,
+ *             and division by zero will occur if it is used to solve a
+ *             system of equations.
+ */
 void Local_Zgstrf2(superlu_dist_options_t *options, int_t k, double thresh,
                    doublecomplex *BlockUFactor, /*factored U is overwritten here*/
                    Glu_persist_t *Glu_persist, gridinfo_t *grid, zLocalLU_t *Llu,
@@ -492,7 +509,7 @@ void Local_Zgstrf2(superlu_dist_options_t *options, int_t k, double thresh,
 
     //int_t thread_id = omp_get_thread_num();
     // SCT->Local_Dgstrf2_Thread_tl[thread_id * CACHE_LINE_SIZE] += (double) ( SuperLU_timer_() - t1);
-}
+} /* end Local_Zgstrf2 */
 
 #pragma GCC pop_options
 /************************************************************************/
diff --git a/SRC/pzgstrf3d.c b/SRC/pzgstrf3d.c
index c3b28b35..f0652919 100644
--- a/SRC/pzgstrf3d.c
+++ b/SRC/pzgstrf3d.c
@@ -130,6 +130,9 @@ int_t pzgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
     double s_eps = smach_dist("Epsilon");
     double thresh = s_eps * anorm;
 
+    /* Test the input parameters. */
+    *info = 0;
+    
 #if ( DEBUGlevel>=1 )
     CHECK_MALLOC (grid3d->iam, "Enter pzgstrf3d()");
 #endif
@@ -348,7 +351,14 @@ int_t pzgstrf3d(superlu_dist_options_t *options, int m, int n, double anorm,
     }
 #endif
     
-    MPI_Barrier( grid3d->comm);
+    /* Prepare error message - find the smallest index i such that U(i,i)==0 */
+    int iinfo;
+    if ( *info == 0 ) *info = n + 1;
+    MPI_Allreduce (info, &iinfo, 1, MPI_INT, MPI_MIN, grid3d->comm);
+    if ( iinfo == n + 1 ) *info = 0;
+    else *info = iinfo;
+    //printf("After factorization: INFO = %d\n", *info); fflush(stdout);
+
     SCT->pdgstrfTimer = SuperLU_timer_() - SCT->pdgstrfTimer;
 
 #ifdef ITAC_PROF
diff --git a/SRC/streeFactorization.c b/SRC/streeFactorization.c
index c97f3669..5ad8adc7 100644
--- a/SRC/streeFactorization.c
+++ b/SRC/streeFactorization.c
@@ -315,6 +315,9 @@ int_t ssparseTreeFactor_ASYNC(
         return 1;
     }
 
+    /* Test the input parameters. */
+    *info = 0;
+    
 #if ( DEBUGlevel>=1 )
     CHECK_MALLOC (grid3d->iam, "Enter ssparseTreeFactor_ASYNC()");
 #endif
diff --git a/SRC/ztreeFactorization.c b/SRC/ztreeFactorization.c
index df1a36e4..517a176e 100644
--- a/SRC/ztreeFactorization.c
+++ b/SRC/ztreeFactorization.c
@@ -314,6 +314,9 @@ int_t zsparseTreeFactor_ASYNC(
         return 1;
     }
 
+    /* Test the input parameters. */
+    *info = 0;
+    
 #if ( DEBUGlevel>=1 )
     CHECK_MALLOC (grid3d->iam, "Enter zsparseTreeFactor_ASYNC()");
 #endif

From 973e193095f7c3a9841c8e9a216a5b3b1517637f Mon Sep 17 00:00:00 2001
From: Sherry Li 
Date: Wed, 10 Nov 2021 12:50:14 -0800
Subject: [PATCH 141/147] Fix module file name case (capitalized) for the Cray
 CCE Fortran compiler. Add the missing f2c wrapper names in superlu_FCnames.h

---
 CMakeLists.txt         |  2 +-
 FORTRAN/CMakeLists.txt | 18 ++++++++----------
 SRC/superlu_FCnames.h  |  3 +++
 3 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2b2fdf24..34548848 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -356,7 +356,7 @@ if (XSDK_ENABLE_Fortran)
       SET(CMAKE_EXE_LINKER_FLAGS
           "${CMAKE_EXE_LINKER_FLAGS} -Wl,-rpath,${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}")
    else()
-      message("-- superlu_dist_fortranwill be built as a static library.")
+      message("-- superlu_dist_fortran will be built as a static library.")
       set(PROJECT_NAME_LIB_FORTRAN libsuperlu_dist_fortran.a)
    endif()
 endif()
diff --git a/FORTRAN/CMakeLists.txt b/FORTRAN/CMakeLists.txt
index bf402a9c..4390f0b7 100644
--- a/FORTRAN/CMakeLists.txt
+++ b/FORTRAN/CMakeLists.txt
@@ -2,12 +2,6 @@
 include_directories(${SuperLU_DIST_SOURCE_DIR}/SRC)
 include_directories(${SuperLU_DIST_BINARY_DIR}/FORTRAN)
 
-set(headers
-    ${CMAKE_BINARY_DIR}/FORTRAN/superlu_mod.mod
-    ${CMAKE_BINARY_DIR}/FORTRAN/superlupara_mod.mod
-    ${CMAKE_BINARY_DIR}/FORTRAN/superlu_dist_config.fh
-    )
-
 set(sources "superlu_c2f_wrap.c")  # initialize precision-independent file
 
 if(enable_double)
@@ -34,11 +28,15 @@ install(TARGETS superlu_dist_fortran
     LIBRARY DESTINATION "${INSTALL_LIB_DIR}"
     ARCHIVE DESTINATION "${INSTALL_LIB_DIR}"
 )
-install(FILES ${headers}
-# DESTINATION ${CMAKE_INSTALL_PREFIX}/include)
+
+install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
   DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
-)
-    
+  FILES_MATCHING PATTERN *.mod
+  )
+install(FILES superlu_dist_config.fh
+  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
+  )
+
 # Fortran MPI stuff
 add_definitions(${MPI_Fortran_COMPILE_FLAGS})
 include_directories(${MPI_Fortran_INCLUDE_PATH})
diff --git a/SRC/superlu_FCnames.h b/SRC/superlu_FCnames.h
index a04768d1..97b711a0 100644
--- a/SRC/superlu_FCnames.h
+++ b/SRC/superlu_FCnames.h
@@ -65,6 +65,7 @@ at the top-level directory.
 #define f_dLUstructFree                FC_GLOBAL(f_dlustructfree,F_DLUSTRUCTFREE)
 #define f_dDestroy_LU_SOLVE_struct     FC_GLOBAL(f_ddestroy_lu_solve_struct,F_DDESTROY_LU_SOLVE_STRUCT)
 #define f_dDestroy_LU_SOLVE_struct_3d  FC_GLOBAL(f_ddestroy_lu_solve_struct_3d,F_DDESTROY_LU_SOLVE_STRUCT_3D)
+#define f_dDestroy_A3d_gathered_on_2d  FC_GLOBAL(f_ddestroy_a3d_gathered_on_2d,F_DDESTROY_A3D_GATHERED_ON_2D)
 
 #define f_dCreate_CompRowLoc_Mat_dist  FC_GLOBAL(f_dcreate_comprowloc_mat_dist,F_DCREATE_COMPROWLOC_MAT_DIST)
 #define f_dSolveFinalize               FC_GLOBAL(f_dsolvefinalize,F_DSOLVEFINALIZE)
@@ -84,6 +85,8 @@ at the top-level directory.
 #define f_zLUstructFree                FC_GLOBAL(f_zlustructfree,F_ZLUSTRUCTFREE)
 #define f_zDestroy_LU_SOLVE_struct     FC_GLOBAL(f_zdestroy_lu_solve_struct,F_ZDESTROY_LU_SOLVE_STRUCT)
 #define f_zDestroy_LU_SOLVE_struct_3d  FC_GLOBAL(f_zdestroy_lu_solve_struct_3d,F_ZDESTROY_LU_SOLVE_STRUCT_3D)
+#define f_zDestroy_A3d_gathered_on_2d  FC_GLOBAL(f_zdestroy_a3d_gathered_on_2d,F_ZDESTROY_A3D_GATHERED_ON_2D)
+
 #define f_zCreate_CompRowLoc_Mat_dist  FC_GLOBAL(f_zcreate_comprowloc_mat_dist,F_ZCREATE_COMPROWLOC_MAT_DIST)
 #define f_zSolveFinalize               FC_GLOBAL(f_zsolvefinalize,F_ZSOLVEFINALIZE)
 #define f_pzgssvx                      FC_GLOBAL(f_pzgssvx,F_PZGSSVX)
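The FC_GLOBAL wrappers exist because Fortran compilers mangle global symbol
names; the generated superlu_FortranCInterface.h supplies the rule for the
compiler in use. A sketch of how the new macro expands under the common
underscore-appending convention (the rule gfortran uses):

/* mangling rule for compilers that append one underscore */
#define FC_GLOBAL(name,NAME) name##_

/* The wrapper added above therefore resolves to the C symbol
   f_ddestroy_a3d_gathered_on_2d_, which Fortran code reaches by calling
   f_dDestroy_A3d_gathered_on_2d. */
#define f_dDestroy_A3d_gathered_on_2d \
        FC_GLOBAL(f_ddestroy_a3d_gathered_on_2d,F_DDESTROY_A3D_GATHERED_ON_2D)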

From b52ac2c59f6e54b9ae80aecc37531aca360f7cab Mon Sep 17 00:00:00 2001
From: xiaoye 
Date: Sat, 13 Nov 2021 15:49:02 -0800
Subject: [PATCH 142/147] In CMakeLists.txt, add set(CMAKE_CXX_STANDARD 14) for
 CombBLAS

---
 .gitignore                      |  7 ++++---
 CMakeLists.txt                  |  3 ++-
 EXAMPLE/pddrive.c               |  1 +
 FORTRAN/superlu_dist_config.fh  | 11 -----------
 SRC/superlu_FortranCInterface.h | 16 ----------------
 5 files changed, 7 insertions(+), 31 deletions(-)
 delete mode 100644 FORTRAN/superlu_dist_config.fh
 delete mode 100644 SRC/superlu_FortranCInterface.h

diff --git a/.gitignore b/.gitignore
index 0ec797c8..f28bb4f5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,11 +2,12 @@
 
 # You have to ignore this generated file or git will complain that it is an
 # unknown file!
-/make.inc
+make.inc
 
 # If the instructions are telling people to create this build dir under the
 # source tree, you had better put in an ignore for this.
-/build/*
+build/*
 
 # not to commit any changes to the following file
-/SRC/superlu_FortranCInterface.h
+SRC/superlu_FortranCInterface.h
+FORTRAN/superlu_dist_config.fh
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 34548848..6e2148a3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -379,6 +379,7 @@ if (TPL_ENABLE_COMBBLASLIB)   ## want to use CombBLAS
 
   message("-- Enabled support for COMBBLAS")
   set(COMBBLAS_FOUND TRUE)
+  set(CMAKE_CXX_STANDARD 14)  # CombBLAS requires c++14
 
   set(COMBBLAS_LIB ${TPL_COMBBLAS_LIBRARIES})
   # fix up COMBBLAS library names
@@ -404,7 +405,7 @@ endif()
 if (XSDK_ENABLE_Fortran)
   include(FortranCInterface)
   FortranCInterface_HEADER(${SuperLU_DIST_SOURCE_DIR}/SRC/superlu_FortranCInterface.h
-  MACRO_NAMESPACE "FC_")
+                            MACRO_NAMESPACE "FC_")
   FortranCInterface_VERIFY(CXX)
   SET(MPI_Fortran_LINK_FLAGS "${CMAKE_EXE_LINKER_FLAGS}")
 endif()
diff --git a/EXAMPLE/pddrive.c b/EXAMPLE/pddrive.c
index 2912dd7c..3b1a9d44 100644
--- a/EXAMPLE/pddrive.c
+++ b/EXAMPLE/pddrive.c
@@ -198,6 +198,7 @@ int main(int argc, char *argv[])
      */
     set_default_options_dist(&options);
 #if 0
+    options.RowPerm           = LargeDiag_HWPM;
     options.RowPerm = NOROWPERM;
     options.IterRefine = NOREFINE;
     options.ColPerm = NATURAL;
diff --git a/FORTRAN/superlu_dist_config.fh b/FORTRAN/superlu_dist_config.fh
deleted file mode 100644
index caa86f6b..00000000
--- a/FORTRAN/superlu_dist_config.fh
+++ /dev/null
@@ -1,11 +0,0 @@
-
-
-#define HAVE_PARMETIS TRUE
-
-
-
-#define XSDK_INDEX_SIZE 64
-
-#if (XSDK_INDEX_SIZE == 64)
-#define _LONGINT 1
-#endif
diff --git a/SRC/superlu_FortranCInterface.h b/SRC/superlu_FortranCInterface.h
deleted file mode 100644
index 467bfb65..00000000
--- a/SRC/superlu_FortranCInterface.h
+++ /dev/null
@@ -1,16 +0,0 @@
-#ifndef FC_HEADER_INCLUDED
-#define FC_HEADER_INCLUDED
-
-/* Mangling for Fortran global symbols without underscores. */
-#define FC_GLOBAL(name,NAME) name##_
-
-/* Mangling for Fortran global symbols with underscores. */
-#define FC_GLOBAL_(name,NAME) name##_
-
-/* Mangling for Fortran module symbols without underscores. */
-#define FC_MODULE(mod_name,name, mod_NAME,NAME) __##mod_name##_MOD_##name
-
-/* Mangling for Fortran module symbols with underscores. */
-#define FC_MODULE_(mod_name,name, mod_NAME,NAME) __##mod_name##_MOD_##name
-
-#endif

From 92737302ddf505d29d24ceb46060d6565ff78b67 Mon Sep 17 00:00:00 2001
From: Xiaoye Li 
Date: Thu, 18 Nov 2021 01:27:21 -0500
Subject: [PATCH 143/147] Remove an erroneous free() in the
 xfree_LUstruct_gpu() routine.

---
 SRC/dsuperlu_gpu.cu | 6 +++---
 SRC/ssuperlu_gpu.cu | 2 +-
 SRC/zsuperlu_gpu.cu | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/SRC/dsuperlu_gpu.cu b/SRC/dsuperlu_gpu.cu
index 2b871120..0d6bf7ae 100644
--- a/SRC/dsuperlu_gpu.cu
+++ b/SRC/dsuperlu_gpu.cu
@@ -839,9 +839,9 @@ int dfree_LUstruct_gpu (dLUstruct_gpu_t * A_gpu)
 
 	checkCuda(cudaFree(A_gpu->UrowindVec));
 	checkCuda(cudaFree(A_gpu->UrowindPtr));
-
-	free(A_gpu->UrowindPtr_host);
-
+	
+	//free(A_gpu->UrowindPtr_host); // Sherry: this is NOT allocated
+	
 	checkCuda(cudaFree(A_gpu->UnzvalVec));
 	checkCuda(cudaFree(A_gpu->UnzvalPtr));
 
diff --git a/SRC/ssuperlu_gpu.cu b/SRC/ssuperlu_gpu.cu
index 78b70b83..ecfb2b6e 100644
--- a/SRC/ssuperlu_gpu.cu
+++ b/SRC/ssuperlu_gpu.cu
@@ -792,7 +792,7 @@ int sfree_LUstruct_gpu (sLUstruct_gpu_t * A_gpu)
 	checkCuda(cudaFree(A_gpu->UrowindVec));
 	checkCuda(cudaFree(A_gpu->UrowindPtr));
 
-	free(A_gpu->UrowindPtr_host);
+	//free(A_gpu->UrowindPtr_host);  NOT allocated
 
 	checkCuda(cudaFree(A_gpu->UnzvalVec));
 	checkCuda(cudaFree(A_gpu->UnzvalPtr));
diff --git a/SRC/zsuperlu_gpu.cu b/SRC/zsuperlu_gpu.cu
index b5b6b361..4900bbcd 100644
--- a/SRC/zsuperlu_gpu.cu
+++ b/SRC/zsuperlu_gpu.cu
@@ -801,7 +801,7 @@ int zfree_LUstruct_gpu (zLUstruct_gpu_t * A_gpu)
 	checkCuda(cudaFree(A_gpu->UrowindVec));
 	checkCuda(cudaFree(A_gpu->UrowindPtr));
 
-	free(A_gpu->UrowindPtr_host);
+	//free(A_gpu->UrowindPtr_host);  NOT allocated
 
 	checkCuda(cudaFree(A_gpu->UnzvalVec));
 	checkCuda(cudaFree(A_gpu->UnzvalPtr));
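The removed free() targeted a field that is never allocated in this code
path, which is undefined behavior rather than a leak fix. A minimal sketch
of the ownership discipline that keeps such teardown safe (hypothetical
struct for illustration):

#include <stdlib.h>

typedef struct {
    int *UrowindPtr_host;  /* valid to free only if it was actually malloc'd */
} gpu_side_t;

/* Initialize unowned pointers to NULL at creation; free(NULL) is a no-op,
   so teardown can then free unconditionally without corrupting the heap. */
void gpu_side_init(gpu_side_t *A)     { A->UrowindPtr_host = NULL; }
void gpu_side_teardown(gpu_side_t *A)
{
    free(A->UrowindPtr_host);
    A->UrowindPtr_host = NULL;
}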

From 00c5f021b04cacfbb9ccfdbd233f10f799f43c16 Mon Sep 17 00:00:00 2001
From: Sherry Li 
Date: Thu, 18 Nov 2021 10:59:18 -0800
Subject: [PATCH 144/147] Update xsuperlu_gpu.cu files: Remove cub/ dependency
 for the complex and single-precision codes. Remove the erroneous free() in
 the xfree_LUstruct_gpu() routine.

---
 .gitignore                      |   1 -
 SRC/dlustruct_gpu.h             |   4 +-
 SRC/dsuperlu_gpu.cu             | 103 +++++++-------
 SRC/dtreeFactorizationGPU.c     |   1 -
 SRC/slustruct_gpu.h             |   4 +-
 SRC/ssuperlu_gpu.cu             | 239 +++++++++++++++++++------------
 SRC/superlu_FortranCInterface.h |  16 +++
 SRC/superlu_defs.h              |   2 +-
 SRC/supernodalForest.c          |  20 ++-
 SRC/zlustruct_gpu.h             |   4 +-
 SRC/zsuperlu_gpu.cu             | 241 ++++++++++++++++++++------------
 11 files changed, 387 insertions(+), 248 deletions(-)
 create mode 100644 SRC/superlu_FortranCInterface.h

diff --git a/.gitignore b/.gitignore
index f28bb4f5..39be29c9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,5 +9,4 @@ make.inc
 build/*
 
 # not to commit any changes to the following file
-SRC/superlu_FortranCInterface.h
 FORTRAN/superlu_dist_config.fh
diff --git a/SRC/dlustruct_gpu.h b/SRC/dlustruct_gpu.h
index 4792ea97..6b16d63e 100644
--- a/SRC/dlustruct_gpu.h
+++ b/SRC/dlustruct_gpu.h
@@ -140,7 +140,7 @@ typedef struct //sluGPU_t_
     cublasHandle_t cublasHandles[MAX_NCUDA_STREAMS];
     int_t lastOffloadStream[MAX_NCUDA_STREAMS];
     int_t nCudaStreams;
-    int_t* isNodeInMyGrid;
+    int* isNodeInMyGrid;
     double acc_async_cost;
 } dsluGPU_t;
 
@@ -213,7 +213,7 @@ int dSchurCompUpdate_GPU(
 );
 
 
-extern void dCopyLUToGPU3D (int_t* isNodeInMyGrid, dLocalLU_t *A_host,
+extern void dCopyLUToGPU3D (int* isNodeInMyGrid, dLocalLU_t *A_host,
            dsluGPU_t *sluGPU, Glu_persist_t *Glu_persist, int_t n,
 	   gridinfo3d_t *grid3d, int_t buffer_size, int_t bigu_size, int_t ldt);
 
diff --git a/SRC/dsuperlu_gpu.cu b/SRC/dsuperlu_gpu.cu
index 2b871120..d276eaf2 100644
--- a/SRC/dsuperlu_gpu.cu
+++ b/SRC/dsuperlu_gpu.cu
@@ -8,6 +8,8 @@
  * Lawrence Berkeley National Lab, Univ. of California Berkeley,
  * Georgia Institute of Technology, Oak Ridge National Laboratory
  * March 14, 2021 version 7.0.0
+ *
+ * Last update: November 14, 2021  remove dependence on CUB/scan
  * 
*/ @@ -21,6 +23,7 @@ #undef Reduce +//#include #include "dlustruct_gpu.h" @@ -81,7 +84,7 @@ void device_scatter_l (int_t thread_id, } #endif ///////////// not used -// #define THREAD_BLOCK_SIZE 512 /* Sherry: was 192. should be <= MAX_SUPER_SIZE */ +//#define THREAD_BLOCK_SIZE 256 /* Sherry: was 192. should be <= MAX_SUPER_SIZE */ int SCATTER_THREAD_BLOCK_SIZE=512; __device__ inline @@ -160,20 +163,19 @@ void device_scatter_u_2D (int thread_id, if ( thread_id < temp_nbrow * ColPerBlock ) { /* 1D threads are logically arranged in 2D shape. */ - int thread_id_x = thread_id % temp_nbrow; - int thread_id_y = thread_id / temp_nbrow; + int thread_id_x = thread_id % temp_nbrow; + int thread_id_y = thread_id / temp_nbrow; - #pragma unroll 4 - for (int col = thread_id_y; col < nnz_cols ; col += ColPerBlock) - { - i = IndirectJ1[IndirectJ3[col]]-ilst + indirect[thread_id_x]; - ucol[i] -= tempv[nbrow * col + thread_id_x]; - } +#pragma unroll 4 + for (int col = thread_id_y; col < nnz_cols ; col += ColPerBlock) + { + i = IndirectJ1[IndirectJ3[col]]-ilst + indirect[thread_id_x]; + ucol[i] -= tempv[nbrow * col + thread_id_x]; + } } } - __device__ int dnextpow2(int v) { @@ -188,8 +190,6 @@ __device__ int dnextpow2(int v) return v; } - - typedef int pfx_dtype ; __device__ void incScan(pfx_dtype *inOutArr, pfx_dtype *temp, int n) { @@ -242,7 +242,7 @@ __device__ void incScan(pfx_dtype *inOutArr, pfx_dtype *temp, int n) inOutArr[2*thread_id+1] = temp[2*thread_id+1]+ inOutArr[2*thread_id+1]; __syncthreads(); -} +} /* end incScan */ __global__ void gExScan(pfx_dtype *inArr, int n) { @@ -250,6 +250,7 @@ __global__ void gExScan(pfx_dtype *inArr, int n) incScan(inArr, temp, n); } + __global__ void Scatter_GPU_kernel( int_t streamId, @@ -291,7 +292,7 @@ void Scatter_GPU_kernel( int* indirect2_thread= (int*) &indirect_lptr[ldt]; /* row-wise */ int* IndirectJ1= (int*) &indirect2_thread[ldt]; /* column-wise */ int* IndirectJ3= (int*) &IndirectJ1[ldt]; /* column-wise */ - int CHREAD_BLOCK_SIZE =ldt; + int THREAD_BLOCK_SIZE =ldt; int* pfxStorage = (int*) &IndirectJ3[ldt]; @@ -317,7 +318,7 @@ void Scatter_GPU_kernel( /* # of nonzero columns in block j */ int nnz_cols = (j == 0) ? Ublock_info[j].full_u_cols - : (Ublock_info[j].full_u_cols - Ublock_info[j - 1].full_u_cols); + : (Ublock_info[j].full_u_cols - Ublock_info[j - 1].full_u_cols); int cum_ncol = (j == 0) ? 
     int cum_ncol = (j == 0) ? 0
                   : Ublock_info[j - 1].full_u_cols;
 
@@ -374,8 +375,8 @@ void Scatter_GPU_kernel(
 
     if (thread_id < temp_nbrow) /* row-wise */
     {
-	/* cyclically map each thread to a row */
-	indirect_lptr[thread_id] = (int) lsub[lptr + thread_id];
+        /* cyclically map each thread to a row */
+        indirect_lptr[thread_id] = (int) lsub[lptr + thread_id];
     }
 
     /* column-wise: each thread is assigned one column */
@@ -396,11 +397,11 @@ void Scatter_GPU_kernel(
 
     if (thread_id < blockDim.x)
     {
-	if (thread_id < nsupc)
-	{
-	    /* fstnz subscript of each column in the block */
-	    IndirectJ1[thread_id] = -index[iuip_lib + thread_id] + ilst;
-	}
+        if (thread_id < nsupc)
+        {
+            /* fstnz subscript of each column in the block */
+            IndirectJ1[thread_id] = -index[iuip_lib + thread_id] + ilst;
+        }
     }
 
     /* perform an inclusive block-wide prefix sum among all threads */
@@ -408,8 +409,6 @@ void Scatter_GPU_kernel(
 
     incScan(IndirectJ1, pfxStorage, nsupc);
 
-
-
     __syncthreads();
 
     device_scatter_u_2D (
@@ -458,8 +457,8 @@ void Scatter_GPU_kernel(
 
     if (thread_id < dest_nbrow)
     {
-	rel = index[lptrj + thread_id] - fnz;
-	indirect_lptr[rel] = thread_id;
+        rel = index[lptrj + thread_id] - fnz;
+        indirect_lptr[rel] = thread_id;
     }
     __syncthreads();
 
@@ -740,8 +739,8 @@ int dSchurCompUpdate_GPU(
 	    /*
 	     * Scattering the output
 	     */
-	    // dim3 dimBlock(THREAD_BLOCK_SIZE); // 1d thread
-	    dim3 dimBlock(ldt); // 1d thread
+	    // dim3 dimBlock(THREAD_BLOCK_SIZE); // 1d thread
+	    dim3 dimBlock(ldt); // 1d thread
 
 	    dim3 dimGrid(ii_end - ii_st, jj_end - jj_st);
 
@@ -808,6 +807,7 @@ static size_t get_acc_memory ()
 
 int dfree_LUstruct_gpu (dLUstruct_gpu_t * A_gpu)
 {
+    /* Free the L data structure on GPU */
     checkCuda(cudaFree(A_gpu->LrowindVec));
     checkCuda(cudaFree(A_gpu->LrowindPtr));
 
@@ -828,25 +828,26 @@ int dfree_LUstruct_gpu (dLUstruct_gpu_t * A_gpu)
 	checkCuda(cudaFreeHost(A_gpu->scubufs[streamId].usub_buf));
 
-    free(A_gpu->isOffloaded);
-    free(A_gpu->GemmStart);
-    free(A_gpu->GemmEnd);
-    free(A_gpu->ScatterEnd);
-    free(A_gpu->ePCIeH2D);
-
-    free(A_gpu->ePCIeD2H_Start);
-    free(A_gpu->ePCIeD2H_End);
+    SUPERLU_FREE(A_gpu->isOffloaded); // changed to SUPERLU_MALLOC/SUPERLU_FREE
+    SUPERLU_FREE(A_gpu->GemmStart);
+    SUPERLU_FREE(A_gpu->GemmEnd);
+    SUPERLU_FREE(A_gpu->ScatterEnd);
+    SUPERLU_FREE(A_gpu->ePCIeH2D);
+    SUPERLU_FREE(A_gpu->ePCIeD2H_Start);
+    SUPERLU_FREE(A_gpu->ePCIeD2H_End);
 
+    /* Free the U data structure on GPU */
     checkCuda(cudaFree(A_gpu->UrowindVec));
     checkCuda(cudaFree(A_gpu->UrowindPtr));
 
-    free(A_gpu->UrowindPtr_host);
+    //free(A_gpu->UrowindPtr_host); // Sherry: this is NOT allocated
 
     checkCuda(cudaFree(A_gpu->UnzvalVec));
     checkCuda(cudaFree(A_gpu->UnzvalPtr));
     checkCuda(cudaFree(A_gpu->grid));
 
+    /* Free the Schur complement structure on GPU */
     checkCuda(cudaFree(A_gpu->scubufs[streamId].bigV));
     checkCuda(cudaFree(A_gpu->scubufs[streamId].bigU));
 
@@ -946,25 +947,27 @@ int dinitSluGPU3D_t(
     checkCudaErrors(cudaDeviceReset ()) ;
     Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
     dLocalLU_t *Llu = LUstruct->Llu;
-    int_t* isNodeInMyGrid = sluGPU->isNodeInMyGrid;
+    int* isNodeInMyGrid = sluGPU->isNodeInMyGrid;
 
     sluGPU->nCudaStreams = getnCudaStreams();
+
     SCATTER_THREAD_BLOCK_SIZE = ldt;
     if(getenv("SCATTER_THREAD_BLOCK_SIZE"))
     {
 	int stbs = atoi(getenv("SCATTER_THREAD_BLOCK_SIZE"));
 	if(stbs>=ldt)
 	{
-	    SCATTER_THREAD_BLOCK_SIZE = stbs;
+	    SCATTER_THREAD_BLOCK_SIZE = stbs;
 	}
     }
+
     if (grid3d->iam == 0)
     {
 	printf("dinitSluGPU3D_t: Using hardware acceleration, with %d cuda streams \n", sluGPU->nCudaStreams);
 	fflush(stdout);
 	printf("dinitSluGPU3D_t: Using %d threads per block for scatter \n", SCATTER_THREAD_BLOCK_SIZE);
-
+
 	if ( MAX_SUPER_SIZE < ldt )
 	{
 	    ABORT("MAX_SUPER_SIZE smaller than requested NSUP");
@@ -986,9 +989,9 @@ int dinitSluGPU3D_t(
     /* Allocate GPU memory for the LU data structures, and copy
        the host LU structure to GPU side. */
     dCopyLUToGPU3D ( isNodeInMyGrid,
-	    Llu,             /* referred to as A_host */
-	    sluGPU, Glu_persist, n, grid3d, buffer_size,
-	    bigu_size, ldt );
+	    Llu,             /* referred to as A_host */
+	    sluGPU, Glu_persist, n, grid3d, buffer_size, bigu_size, ldt
+	    );
 
     return 0;
 } /* end dinitSluGPU3D_t */
@@ -1252,7 +1255,7 @@ int freeSluGPU(dsluGPU_t *sluGPU)
    After factorization, the GPU LU structure should be freed by
    calling dfree_LUsstruct_gpu().    */
 void dCopyLUToGPU3D (
-    int_t* isNodeInMyGrid,
+    int* isNodeInMyGrid,
     dLocalLU_t *A_host, /* distributed LU structure on host */
     dsluGPU_t *sluGPU,  /* hold LU structure on GPU */
     Glu_persist_t *Glu_persist, int_t n,
@@ -1361,12 +1364,12 @@ void dCopyLUToGPU3D (
     } /* endfor streamID ... allocate paged-locked memory */
 
     A_gpu->isOffloaded = (int *) SUPERLU_MALLOC (sizeof(int) * nsupers);
-    A_gpu->GemmStart  = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers);
-    A_gpu->GemmEnd  = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers);
-    A_gpu->ScatterEnd  = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers);
-    A_gpu->ePCIeH2D = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers);
-    A_gpu->ePCIeD2H_Start = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers);
-    A_gpu->ePCIeD2H_End = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers);
+    A_gpu->GemmStart  = (cudaEvent_t *) SUPERLU_MALLOC(sizeof(cudaEvent_t) * nsupers);
+    A_gpu->GemmEnd  = (cudaEvent_t *) SUPERLU_MALLOC(sizeof(cudaEvent_t) * nsupers);
+    A_gpu->ScatterEnd  = (cudaEvent_t *) SUPERLU_MALLOC(sizeof(cudaEvent_t) * nsupers);
+    A_gpu->ePCIeH2D = (cudaEvent_t *) SUPERLU_MALLOC(sizeof(cudaEvent_t) * nsupers);
+    A_gpu->ePCIeD2H_Start = (cudaEvent_t *) SUPERLU_MALLOC(sizeof(cudaEvent_t) * nsupers);
+    A_gpu->ePCIeD2H_End = (cudaEvent_t *) SUPERLU_MALLOC(sizeof(cudaEvent_t) * nsupers);
 
     for (int i = 0; i < nsupers; ++i)
     {
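[Editorial note] For readers following the CUB removal: incScan() is a classic two-phase (up-sweep/down-sweep) Blelloch exclusive scan in shared memory, with the original input added back at the end, which yields exactly the inclusive prefix sum the old cub::BlockScan(...).InclusiveSum() produced; the sign flip that used to happen after the scan now happens on the scan input (-index[...] + ilst). A minimal host-side reference for sanity-checking the kernel on small inputs; the function name ref_inclusive_scan is ours, not part of SuperLU_DIST:

    /* Hypothetical host-side reference: what incScan() is expected to compute.
       inOut[i] becomes inOut[0] + ... + inOut[i] (inclusive prefix sum), i.e.
       the exclusive Blelloch scan plus the final "add the input back" step. */
    #include <stdio.h>

    static void ref_inclusive_scan(int *inOut, int n)
    {
        int running = 0;
        for (int i = 0; i < n; ++i) {
            running += inOut[i];   /* exclusive prefix of i is (running - inOut[i]) */
            inOut[i] = running;
        }
    }

    int main(void)
    {
        int a[6] = {3, 1, 4, 1, 5, 9};
        ref_inclusive_scan(a, 6);
        for (int i = 0; i < 6; ++i) printf("%d ", a[i]);   /* prints: 3 4 8 9 14 23 */
        printf("\n");
        return 0;
    }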
scatter \n", SCATTER_THREAD_BLOCK_SIZE); - + if ( MAX_SUPER_SIZE < ldt ) { ABORT("MAX_SUPER_SIZE smaller than requested NSUP"); @@ -986,9 +989,9 @@ int dinitSluGPU3D_t( /* Allocate GPU memory for the LU data structures, and copy the host LU structure to GPU side. */ dCopyLUToGPU3D ( isNodeInMyGrid, - Llu, /* referred to as A_host */ - sluGPU, Glu_persist, n, grid3d, buffer_size, - bigu_size, ldt ); + Llu, /* referred to as A_host */ + sluGPU, Glu_persist, n, grid3d, buffer_size, bigu_size, ldt + ); return 0; } /* end dinitSluGPU3D_t */ @@ -1252,7 +1255,7 @@ int freeSluGPU(dsluGPU_t *sluGPU) After factorization, the GPU LU structure should be freed by calling dfree_LUsstruct_gpu(). */ void dCopyLUToGPU3D ( - int_t* isNodeInMyGrid, + int* isNodeInMyGrid, dLocalLU_t *A_host, /* distributed LU structure on host */ dsluGPU_t *sluGPU, /* hold LU structure on GPU */ Glu_persist_t *Glu_persist, int_t n, @@ -1361,12 +1364,12 @@ void dCopyLUToGPU3D ( } /* endfor streamID ... allocate paged-locked memory */ A_gpu->isOffloaded = (int *) SUPERLU_MALLOC (sizeof(int) * nsupers); - A_gpu->GemmStart = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers); - A_gpu->GemmEnd = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers); - A_gpu->ScatterEnd = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers); - A_gpu->ePCIeH2D = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers); - A_gpu->ePCIeD2H_Start = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers); - A_gpu->ePCIeD2H_End = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers); + A_gpu->GemmStart = (cudaEvent_t *) SUPERLU_MALLOC(sizeof(cudaEvent_t) * nsupers); + A_gpu->GemmEnd = (cudaEvent_t *) SUPERLU_MALLOC(sizeof(cudaEvent_t) * nsupers); + A_gpu->ScatterEnd = (cudaEvent_t *) SUPERLU_MALLOC(sizeof(cudaEvent_t) * nsupers); + A_gpu->ePCIeH2D = (cudaEvent_t *) SUPERLU_MALLOC(sizeof(cudaEvent_t) * nsupers); + A_gpu->ePCIeD2H_Start = (cudaEvent_t *) SUPERLU_MALLOC(sizeof(cudaEvent_t) * nsupers); + A_gpu->ePCIeD2H_End = (cudaEvent_t *) SUPERLU_MALLOC(sizeof(cudaEvent_t) * nsupers); for (int i = 0; i < nsupers; ++i) { diff --git a/SRC/dtreeFactorizationGPU.c b/SRC/dtreeFactorizationGPU.c index 4010efb3..14c3a0a1 100644 --- a/SRC/dtreeFactorizationGPU.c +++ b/SRC/dtreeFactorizationGPU.c @@ -13,7 +13,6 @@ // #include "treeFactorization.h" // #include "trfCommWrapper.h" #include "dlustruct_gpu.h" -#include "omp.h" //#include "cblas.h" #ifdef GPU_ACC ///////////////// enable GPU diff --git a/SRC/slustruct_gpu.h b/SRC/slustruct_gpu.h index 443619e4..e7fcd15e 100644 --- a/SRC/slustruct_gpu.h +++ b/SRC/slustruct_gpu.h @@ -140,7 +140,7 @@ typedef struct //sluGPU_t_ cublasHandle_t cublasHandles[MAX_NCUDA_STREAMS]; int_t lastOffloadStream[MAX_NCUDA_STREAMS]; int_t nCudaStreams; - int_t* isNodeInMyGrid; + int* isNodeInMyGrid; double acc_async_cost; } ssluGPU_t; @@ -213,7 +213,7 @@ int sSchurCompUpdate_GPU( ); -extern void sCopyLUToGPU3D (int_t* isNodeInMyGrid, sLocalLU_t *A_host, +extern void sCopyLUToGPU3D (int* isNodeInMyGrid, sLocalLU_t *A_host, ssluGPU_t *sluGPU, Glu_persist_t *Glu_persist, int_t n, gridinfo3d_t *grid3d, int_t buffer_size, int_t bigu_size, int_t ldt); diff --git a/SRC/ssuperlu_gpu.cu b/SRC/ssuperlu_gpu.cu index 78b70b83..0fa035ff 100644 --- a/SRC/ssuperlu_gpu.cu +++ b/SRC/ssuperlu_gpu.cu @@ -8,6 +8,8 @@ * Lawrence Berkeley National Lab, Univ. of California Berkeley, * Georgia Institute of Technology, Oak Ridge National Laboratory * March 14, 2021 version 7.0.0 + * + * Last update: November 14, 2021 remove dependence on CUB/scan *
diff --git a/SRC/ssuperlu_gpu.cu b/SRC/ssuperlu_gpu.cu
index 78b70b83..0fa035ff 100644
--- a/SRC/ssuperlu_gpu.cu
+++ b/SRC/ssuperlu_gpu.cu
@@ -8,6 +8,8 @@
  * Lawrence Berkeley National Lab, Univ. of California Berkeley,
  * Georgia Institute of Technology, Oak Ridge National Laboratory
  * March 14, 2021 version 7.0.0
+ *
+ * Last update: November 14, 2021  remove dependence on CUB/scan
  *
 */
@@ -20,7 +22,7 @@
 #include
 #undef Reduce
-#include "cub/cub.cuh"
+ //#include <cub/cub.cuh>
 
 #include "slustruct_gpu.h"
 
@@ -82,7 +84,8 @@ void device_scatter_l (int_t thread_id,
 }
 #endif ///////////// not used
 
-#define THREAD_BLOCK_SIZE 256 /* Sherry: was 192. should be <= MAX_SUPER_SIZE */
+//#define THREAD_BLOCK_SIZE 256 /* Sherry: was 192. should be <= MAX_SUPER_SIZE */
+int SCATTER_THREAD_BLOCK_SIZE=512;
 
 __device__ inline
 void sdevice_scatter_l_2D (int thread_id,
@@ -166,48 +169,87 @@ void device_scatter_u_2D (int thread_id,
 #pragma unroll 4
 	for (int col = thread_id_y; col < nnz_cols ; col += ColPerBlock)
 	{
-		i = IndirectJ1[IndirectJ3[col]] + indirect[thread_id_x];
+		i = IndirectJ1[IndirectJ3[col]]-ilst + indirect[thread_id_x];
 		ucol[i] -= tempv[nbrow * col + thread_id_x];
 	}
 	}
 }
 
-__device__ inline
-void device_scatter_u (int_t thread_id,
-                       int_t temp_nbrow, int_t nsupc,
-                       float * ucol,
-                       int_t * usub, int_t iukp,
-                       int_t ilst, int_t klst,
-                       int_t * index, int_t iuip_lib,
-                       float * tempv, int_t nbrow,
-                       // int_t *indirect
-                       int *indirect
-                      )
+__device__ int dnextpow2(int v)
+
 {
-	int_t segsize, fnz, jj;
-	for (jj = 0; jj < nsupc; ++jj)
-	{
-		segsize = klst - usub[iukp + jj];
-		fnz = index[iuip_lib++];
-		ucol -= fnz;
-		if (segsize) { /* Nonzero segment in U(k.j). */
-			if (thread_id < temp_nbrow)
-			{
-#ifndef UNIT_STRIDE
-				ucol[indirect[thread_id]] -= tempv[thread_id];
-#else
-				/* making access unit strided;
-				   it doesn't work; it is for measurements */
-				ucol[thread_id] -= tempv[thread_id];
-#endif
-			}
-			tempv += nbrow;
-		}
-		ucol += ilst ;
-	}
+	v--;
+	v |= v >> 1;
+	v |= v >> 2;
+	v |= v >> 4;
+	v |= v >> 8;
+	v |= v >> 16;
+	v++;
+
+	return v;
 }
 
+typedef int pfx_dtype ;
+__device__ void incScan(pfx_dtype *inOutArr, pfx_dtype *temp, int n)
+{
+    // extern __shared__ pfx_dtype temp[];
+    int n_original = n;
+    n = (n & (n - 1)) == 0? n: dnextpow2(n);
+    int thread_id = threadIdx.x;
+    int offset = 1;
+    if(2*thread_id < n_original)
+        temp[2*thread_id] = inOutArr[2*thread_id];
+    else
+        temp[2*thread_id] =0;
+
+
+    if(2*thread_id+1 < n_original)
+        temp[2*thread_id+1] = inOutArr[2*thread_id+1];
+    else
+        temp[2*thread_id+1] =0;
+
+    for (int d = n>>1; d > 0; d >>= 1)
+    {
+        __syncthreads();
+        if (thread_id < d)
+        {
+            int ai = offset*(2*thread_id+1)-1;
+            int bi = offset*(2*thread_id+2)-1;
+            temp[bi] += temp[ai];
+        }
+        offset *= 2;
+    }
+
+    if (thread_id == 0) { temp[n - 1] = 0; }
+    for (int d = 1; d < n; d *= 2)
+    {
+        offset >>= 1;
+        __syncthreads();
+        if (thread_id < d)
+        {
+            int ai = offset*(2*thread_id+1)-1;
+            int bi = offset*(2*thread_id+2)-1;
+            pfx_dtype t = temp[ai];
+            temp[ai] = temp[bi];
+            temp[bi] += t;
+        }
+    }
+    __syncthreads();
+    if(2*thread_id < n_original)
+        inOutArr[2*thread_id] = temp[2*thread_id]+ inOutArr[2*thread_id]; // write results to device memory
+    if(2*thread_id+1 < n_original)
+        inOutArr[2*thread_id+1] = temp[2*thread_id+1]+ inOutArr[2*thread_id+1];
+    __syncthreads();
+
+} /* end incScan */
+
+__global__ void gExScan(pfx_dtype *inArr, int n)
+{
+    extern __shared__ pfx_dtype temp[];
+    incScan(inArr, temp, n);
+
+}
 
 __global__ void
 Scatter_GPU_kernel(
@@ -244,15 +286,16 @@ void Scatter_GPU_kernel(
 	   assigned to block (lb, j) in 2D grid */
 	int lb = blockIdx.x + ii_st;
 	int j  = blockIdx.y + jj_st;
 
-	__shared__ int indirect_thread[MAX_SUPER_SIZE];  /* row-wise */
-	__shared__ int indirect2_thread[MAX_SUPER_SIZE]; /* row-wise */
-	__shared__ int IndirectJ1[THREAD_BLOCK_SIZE];    /* column-wise */
-	__shared__ int IndirectJ3[THREAD_BLOCK_SIZE];    /* column-wise */
-
-	/* see CUB page https://nvlabs.github.io/cub/. Implement threads collectives */
-	typedef cub::BlockScan<int, THREAD_BLOCK_SIZE> BlockScan; /*1D int data type*/
-	__shared__ typename BlockScan::TempStorage temp_storage; /*storage temp*/
-
+
+	extern __shared__ int s[];
+	int* indirect_lptr = s;  /* row-wise */
+	int* indirect2_thread= (int*) &indirect_lptr[ldt];  /* row-wise */
+	int* IndirectJ1= (int*) &indirect2_thread[ldt];  /* column-wise */
+	int* IndirectJ3= (int*) &IndirectJ1[ldt];  /* column-wise */
+	int THREAD_BLOCK_SIZE =ldt;
+
+	int* pfxStorage = (int*) &IndirectJ3[ldt];
+
 	int thread_id = threadIdx.x;
 
 	int iukp = Ublock_info[j].iukp;
@@ -276,7 +319,8 @@ void Scatter_GPU_kernel(
 	/* # of nonzero columns in block j  */
 	int nnz_cols = (j == 0) ? Ublock_info[j].full_u_cols
 	              : (Ublock_info[j].full_u_cols - Ublock_info[j - 1].full_u_cols);
-	int cum_ncol = (j == 0) ? 0 : Ublock_info[j - 1].full_u_cols;
+	int cum_ncol = (j == 0) ? 0
+	              : Ublock_info[j - 1].full_u_cols;
 
 	int lptr = Remain_info[lb].lptr;
 	int ib   = Remain_info[lb].ib;
@@ -308,7 +352,9 @@ void Scatter_GPU_kernel(
 		/* Each thread is responsible for one block column */
 		__shared__ int ljb_ind;
 		/*do a search ljb_ind at local row lib*/
-		int blks_per_threads = CEILING(num_u_blocks, THREAD_BLOCK_SIZE);
+		int blks_per_threads = CEILING(num_u_blocks, blockDim.x);
+		// printf("blockDim.x =%d \n", blockDim.x);
+
 		for (int i = 0; i < blks_per_threads; ++i)
 			/* each thread is assigned a chunk of consecutive U blocks to search */
 		{
@@ -329,8 +375,8 @@ void Scatter_GPU_kernel(
 
 		if (thread_id < temp_nbrow) /* row-wise */
 		{
-			/* cyclically map each thread to a row */
-			indirect_thread[thread_id] = (int) lsub[lptr + thread_id];
+			/* cyclically map each thread to a row */
+			indirect_lptr[thread_id] = (int) lsub[lptr + thread_id];
 		}
 
 		/* column-wise: each thread is assigned one column */
@@ -342,27 +388,27 @@ void Scatter_GPU_kernel(
 		__syncthreads();
 
 		/* threads are divided into multiple columns */
-		int ColPerBlock = THREAD_BLOCK_SIZE / temp_nbrow;
+		int ColPerBlock = blockDim.x / temp_nbrow;
 
-		if (thread_id < THREAD_BLOCK_SIZE)
+		// if (thread_id < blockDim.x)
+		// IndirectJ1[thread_id] = 0;
+		if (thread_id < ldt)
 			IndirectJ1[thread_id] = 0;
 
-		if (thread_id < THREAD_BLOCK_SIZE)
+		if (thread_id < blockDim.x)
 		{
-			if (thread_id < nsupc)
-			{
-				/* fstnz subscript of each column in the block */
-				IndirectJ1[thread_id] = index[iuip_lib + thread_id];
-			}
+			if (thread_id < nsupc)
+			{
+				/* fstnz subscript of each column in the block */
+				IndirectJ1[thread_id] = -index[iuip_lib + thread_id] + ilst;
+			}
 		}
 
 		/* perform an inclusive block-wide prefix sum among all threads */
-		if (thread_id < THREAD_BLOCK_SIZE)
-			BlockScan(temp_storage).InclusiveSum(IndirectJ1[thread_id], IndirectJ1[thread_id]);
-
-		if (thread_id < THREAD_BLOCK_SIZE)
-			IndirectJ1[thread_id] = -IndirectJ1[thread_id] + ilst * thread_id;
-
+		__syncthreads();
+
+		incScan(IndirectJ1, pfxStorage, nsupc);
+
 		__syncthreads();
 
 		device_scatter_u_2D (
@@ -373,7 +419,7 @@ void Scatter_GPU_kernel(
 		    ilst, klst,
 		    index, iuip_lib,
 		    tempv1, nrows,
-		    indirect_thread,
+		    indirect_lptr,
 		    nnz_cols, ColPerBlock,
 		    IndirectJ1,
 		    IndirectJ3 );
@@ -393,7 +439,7 @@ void Scatter_GPU_kernel(
 
 		__shared__ int lib_ind;
 		/*do a search lib_ind for lib*/
-		int blks_per_threads = CEILING(num_l_blocks, THREAD_BLOCK_SIZE);
+		int blks_per_threads = CEILING(num_l_blocks, blockDim.x);
 		for (int i = 0; i < blks_per_threads; ++i)
 		{
 			if (thread_id * blks_per_threads + i < num_l_blocks &&
@@ -411,8 +457,8 @@ void Scatter_GPU_kernel(
 
 		if (thread_id < dest_nbrow)
 		{
-			rel = index[lptrj + thread_id] - fnz;
-			indirect_thread[rel] = thread_id;
+			rel = index[lptrj + thread_id] - fnz;
+			indirect_lptr[rel] = thread_id;
 		}
 		__syncthreads();
 
@@ -420,13 +466,13 @@ void Scatter_GPU_kernel(
 		if (thread_id < temp_nbrow)
 		{
 			rel = lsub[lptr + thread_id] - fnz;
-			indirect2_thread[thread_id] = indirect_thread[rel];
+			indirect2_thread[thread_id] = indirect_lptr[rel];
 		}
 		if (thread_id < nnz_cols)
 			IndirectJ3[thread_id] = (int) A_gpu->scubufs[streamId].usub_IndirectJ3[cum_ncol + thread_id];
 		__syncthreads();
 
-		int ColPerBlock = THREAD_BLOCK_SIZE / temp_nbrow;
+		int ColPerBlock = blockDim.x / temp_nbrow;
 
 		nzval = &LnzvalVec[LnzvalPtr[ljb]] + luptrj;
 		sdevice_scatter_l_2D(
@@ -693,11 +739,12 @@ int sSchurCompUpdate_GPU(
 			/*
 			 * Scattering the output
 			 */
-			dim3 dimBlock(THREAD_BLOCK_SIZE); // 1d thread
+			// dim3 dimBlock(THREAD_BLOCK_SIZE); // 1d thread
+			dim3 dimBlock(ldt); // 1d thread
 
 			dim3 dimGrid(ii_end - ii_st, jj_end - jj_st);
 
-			Scatter_GPU_kernel <<< dimGrid, dimBlock, 0, FunCallStream>>>
+			Scatter_GPU_kernel <<< dimGrid, dimBlock, (4*ldt + 2*SCATTER_THREAD_BLOCK_SIZE)*sizeof(int), FunCallStream>>>
 			    (streamId, ii_st, ii_end, jj_st, jj_end, klst, 0, nrows, ldt, npcol, nprow, dA_gpu);
 
 #ifdef SCATTER_OPT
@@ -760,6 +807,7 @@ static size_t get_acc_memory ()
 
 int sfree_LUstruct_gpu (sLUstruct_gpu_t * A_gpu)
 {
+    /* Free the L data structure on GPU */
     checkCuda(cudaFree(A_gpu->LrowindVec));
     checkCuda(cudaFree(A_gpu->LrowindPtr));
 
@@ -780,25 +828,26 @@ int sfree_LUstruct_gpu (sLUstruct_gpu_t * A_gpu)
 	checkCuda(cudaFreeHost(A_gpu->scubufs[streamId].usub_buf));
 
-    free(A_gpu->isOffloaded);
-    free(A_gpu->GemmStart);
-    free(A_gpu->GemmEnd);
-    free(A_gpu->ScatterEnd);
-    free(A_gpu->ePCIeH2D);
-
-    free(A_gpu->ePCIeD2H_Start);
-    free(A_gpu->ePCIeD2H_End);
+    SUPERLU_FREE(A_gpu->isOffloaded); // changed to SUPERLU_MALLOC/SUPERLU_FREE
+    SUPERLU_FREE(A_gpu->GemmStart);
+    SUPERLU_FREE(A_gpu->GemmEnd);
+    SUPERLU_FREE(A_gpu->ScatterEnd);
+    SUPERLU_FREE(A_gpu->ePCIeH2D);
+    SUPERLU_FREE(A_gpu->ePCIeD2H_Start);
+    SUPERLU_FREE(A_gpu->ePCIeD2H_End);
 
+    /* Free the U data structure on GPU */
     checkCuda(cudaFree(A_gpu->UrowindVec));
     checkCuda(cudaFree(A_gpu->UrowindPtr));
 
-    free(A_gpu->UrowindPtr_host);
+    //free(A_gpu->UrowindPtr_host); // Sherry: this is NOT allocated
 
     checkCuda(cudaFree(A_gpu->UnzvalVec));
     checkCuda(cudaFree(A_gpu->UnzvalPtr));
     checkCuda(cudaFree(A_gpu->grid));
 
+    /* Free the Schur complement structure on GPU */
    checkCuda(cudaFree(A_gpu->scubufs[streamId].bigV));
    checkCuda(cudaFree(A_gpu->scubufs[streamId].bigU));
 
@@ -898,13 +947,27 @@ int sinitSluGPU3D_t(
     checkCudaErrors(cudaDeviceReset ()) ;
     Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
     sLocalLU_t *Llu = LUstruct->Llu;
-    int_t* isNodeInMyGrid = sluGPU->isNodeInMyGrid;
+    int* isNodeInMyGrid = sluGPU->isNodeInMyGrid;
 
     sluGPU->nCudaStreams = getnCudaStreams();
+
+    SCATTER_THREAD_BLOCK_SIZE = ldt;
+    if(getenv("SCATTER_THREAD_BLOCK_SIZE"))
+    {
+	int stbs = atoi(getenv("SCATTER_THREAD_BLOCK_SIZE"));
+	if(stbs>=ldt)
+	{
+	    SCATTER_THREAD_BLOCK_SIZE = stbs;
+	}
+
+    }
+
     if (grid3d->iam == 0)
     {
-	printf("sinitSluGPU3D_t: Using hardware acceleration, with %d cuda streams, max_buffer_size %d\n", sluGPU->nCudaStreams, (int) buffer_size);
+	printf("dinitSluGPU3D_t: Using hardware acceleration, with %d cuda streams \n", sluGPU->nCudaStreams);
 	fflush(stdout);
+	printf("dinitSluGPU3D_t: Using %d threads per block for scatter \n", SCATTER_THREAD_BLOCK_SIZE);
+
 	if ( MAX_SUPER_SIZE < ldt )
 	{
 	    ABORT("MAX_SUPER_SIZE smaller than requested NSUP");
@@ -1192,7 +1255,7 @@ int freeSluGPU(ssluGPU_t *sluGPU)
    After factorization, the GPU LU structure should be freed by
    calling sfree_LUsstruct_gpu().    */
 void sCopyLUToGPU3D (
-    int_t* isNodeInMyGrid,
+    int* isNodeInMyGrid,
     sLocalLU_t *A_host, /* distributed LU structure on host */
     ssluGPU_t *sluGPU,  /* hold LU structure on GPU */
     Glu_persist_t *Glu_persist, int_t n,
@@ -1301,12 +1364,12 @@ void sCopyLUToGPU3D (
     } /* endfor streamID ... allocate paged-locked memory */
 
     A_gpu->isOffloaded = (int *) SUPERLU_MALLOC (sizeof(int) * nsupers);
-    A_gpu->GemmStart  = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers);
-    A_gpu->GemmEnd  = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers);
-    A_gpu->ScatterEnd  = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers);
-    A_gpu->ePCIeH2D = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers);
-    A_gpu->ePCIeD2H_Start = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers);
-    A_gpu->ePCIeD2H_End = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers);
+    A_gpu->GemmStart  = (cudaEvent_t *) SUPERLU_MALLOC(sizeof(cudaEvent_t) * nsupers);
+    A_gpu->GemmEnd  = (cudaEvent_t *) SUPERLU_MALLOC(sizeof(cudaEvent_t) * nsupers);
+    A_gpu->ScatterEnd  = (cudaEvent_t *) SUPERLU_MALLOC(sizeof(cudaEvent_t) * nsupers);
+    A_gpu->ePCIeH2D = (cudaEvent_t *) SUPERLU_MALLOC(sizeof(cudaEvent_t) * nsupers);
+    A_gpu->ePCIeD2H_Start = (cudaEvent_t *) SUPERLU_MALLOC(sizeof(cudaEvent_t) * nsupers);
+    A_gpu->ePCIeD2H_End = (cudaEvent_t *) SUPERLU_MALLOC(sizeof(cudaEvent_t) * nsupers);
 
     for (int i = 0; i < nsupers; ++i)
     {
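[Editorial note] The launch above sizes dynamic shared memory as (4*ldt + 2*SCATTER_THREAD_BLOCK_SIZE)*sizeof(int): four ldt-length index arrays carved out of the single extern __shared__ block, plus two ints per scan thread of prefix-sum scratch. A stripped-down sketch of the same carving pattern; the kernel and variable names here are illustrative, not the library's:

    /* Illustrative kernel (not the library's): carving one dynamic shared-memory
       allocation into four ldt-sized index arrays plus prefix-sum scratch,
       the same layout Scatter_GPU_kernel uses. */
    __global__ void carve_shared_demo(int ldt)
    {
        extern __shared__ int s[];
        int *indirect   = s;                   /* row-wise,    ldt ints */
        int *indirect2  = indirect  + ldt;     /* row-wise,    ldt ints */
        int *IndirectJ1 = indirect2 + ldt;     /* column-wise, ldt ints */
        int *IndirectJ3 = IndirectJ1 + ldt;    /* column-wise, ldt ints */
        int *pfxStorage = IndirectJ3 + ldt;    /* scan scratch: 2 ints per scan thread */

        for (int i = threadIdx.x; i < ldt; i += blockDim.x)
            indirect[i] = indirect2[i] = IndirectJ1[i] = IndirectJ3[i] = 0;
        pfxStorage[2*threadIdx.x] = pfxStorage[2*threadIdx.x + 1] = 0;
    }

    /* Host side: the byte count must match the carve-up. With blockDim = ldt and
       scanThreads >= ldt (SCATTER_THREAD_BLOCK_SIZE in the real launch):
       carve_shared_demo<<<grid, ldt, (4*ldt + 2*scanThreads)*sizeof(int), stream>>>(ldt); */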
diff --git a/SRC/superlu_FortranCInterface.h b/SRC/superlu_FortranCInterface.h
new file mode 100644
index 00000000..467bfb65
--- /dev/null
+++ b/SRC/superlu_FortranCInterface.h
@@ -0,0 +1,16 @@
+#ifndef FC_HEADER_INCLUDED
+#define FC_HEADER_INCLUDED
+
+/* Mangling for Fortran global symbols without underscores. */
+#define FC_GLOBAL(name,NAME) name##_
+
+/* Mangling for Fortran global symbols with underscores. */
+#define FC_GLOBAL_(name,NAME) name##_
+
+/* Mangling for Fortran module symbols without underscores. */
+#define FC_MODULE(mod_name,name, mod_NAME,NAME) __##mod_name##_MOD_##name
+
+/* Mangling for Fortran module symbols with underscores. */
+#define FC_MODULE_(mod_name,name, mod_NAME,NAME) __##mod_name##_MOD_##name
+
+#endif
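[Editorial note] superlu_FortranCInterface.h records, at configure time, how the Fortran compiler mangles symbol names so C and Fortran objects can link against each other; with the gfortran-style mapping above, both global macros append a single underscore. A hypothetical usage sketch; the routine fsub and its wrapper are made up for illustration:

    /* Hypothetical illustration (not a library routine): with the mapping above,
       FC_GLOBAL(fsub, FSUB) expands to fsub_, the symbol name a
       gfortran-compiled object file expects. */
    #include "superlu_FortranCInterface.h"

    /* Declare a Fortran subroutine under its mangled name ... */
    extern void FC_GLOBAL(fsub, FSUB)(int *n, double *x);

    /* ... and call it from C without hard-coding the trailing underscore. */
    void call_fortran_fsub(int n, double *x)
    {
        FC_GLOBAL(fsub, FSUB)(&n, x);   /* expands to fsub_(&n, x) */
    }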
diff --git a/SRC/superlu_defs.h b/SRC/superlu_defs.h
index 89f0c452..fd152dd0 100644
--- a/SRC/superlu_defs.h
+++ b/SRC/superlu_defs.h
@@ -1237,7 +1237,7 @@ extern int_t* getMyNodeCountsFr(int_t maxLvl, int_t* myTreeIdxs,
 extern int_t** getNodeListFr(int_t maxLvl, sForest_t** sForests);
 extern int_t* getNodeCountsFr(int_t maxLvl, sForest_t** sForests);
 // int_t* getNodeToForstMap(int_t nsupers, sForest_t** sForests, gridinfo3d_t* grid3d);
-extern int_t* getIsNodeInMyGrid(int_t nsupers, int_t maxLvl, int_t* myNodeCount, int_t** treePerm);
+extern int* getIsNodeInMyGrid(int_t nsupers, int_t maxLvl, int_t* myNodeCount, int_t** treePerm);
 extern void printForestWeightCost(sForest_t** sForests, SCT_t* SCT, gridinfo3d_t* grid3d);
 extern sForest_t** getGreedyLoadBalForests( int_t maxLvl, int_t nsupers, int_t* setree, treeList_t* treeList);
 extern sForest_t** getForests( int_t maxLvl, int_t nsupers, int_t*setree, treeList_t* treeList);
diff --git a/SRC/supernodalForest.c b/SRC/supernodalForest.c
index a5487977..dc6144fa 100644
--- a/SRC/supernodalForest.c
+++ b/SRC/supernodalForest.c
@@ -302,23 +302,21 @@ int_t** getTreePermFr( int_t* myTreeIdxs,
 	return treePerm;
 }
 
-int_t* getIsNodeInMyGrid(int_t nsupers, int_t maxLvl, int_t* myNodeCount, int_t** treePerm)
+int* getIsNodeInMyGrid(int_t nsupers, int_t maxLvl, int_t* myNodeCount, int_t** treePerm)
 {
-	int_t* isNodeInMyGrid = INT_T_ALLOC (nsupers);
+	int* isNodeInMyGrid = SUPERLU_MALLOC(nsupers * sizeof(int));
 
-	for(int_t i=0; i < nsupers; ++i)

diff --git a/SRC/zsuperlu_gpu.cu b/SRC/zsuperlu_gpu.cu
--- a/SRC/zsuperlu_gpu.cu
+++ b/SRC/zsuperlu_gpu.cu
@@ -8,6 +8,8 @@
  * Lawrence Berkeley National Lab, Univ. of California Berkeley,
  * Georgia Institute of Technology, Oak Ridge National Laboratory
  * March 14, 2021 version 7.0.0
+ *
+ * Last update: November 14, 2021  remove dependence on CUB/scan
  *
 */
@@ -19,7 +21,7 @@
 #include
 #undef Reduce
-#include "cub/cub.cuh"
+ //#include <cub/cub.cuh>
 
 #include "zlustruct_gpu.h"
 
@@ -82,7 +84,8 @@ void device_scatter_l (int_t thread_id,
 }
 #endif ///////////// not used
 
-#define THREAD_BLOCK_SIZE 256 /* Sherry: was 192. should be <= MAX_SUPER_SIZE */
+//#define THREAD_BLOCK_SIZE 256 /* Sherry: was 192. should be <= MAX_SUPER_SIZE */
+int SCATTER_THREAD_BLOCK_SIZE=512;
 
 __device__ inline
 void zdevice_scatter_l_2D (int thread_id,
@@ -166,50 +169,87 @@ void device_scatter_u_2D (int thread_id,
 #pragma unroll 4
 	for (int col = thread_id_y; col < nnz_cols ; col += ColPerBlock)
 	{
-		i = IndirectJ1[IndirectJ3[col]] + indirect[thread_id_x];
+		i = IndirectJ1[IndirectJ3[col]]-ilst + indirect[thread_id_x];
 		z_sub(&ucol[i], &ucol[i], &tempv[nbrow * col + thread_id_x]);
 	}
 	}
 }
 
-__device__ inline
-void device_scatter_u (int_t thread_id,
-                       int_t temp_nbrow, int_t nsupc,
-                       doublecomplex * ucol,
-                       int_t * usub, int_t iukp,
-                       int_t ilst, int_t klst,
-                       int_t * index, int_t iuip_lib,
-                       doublecomplex * tempv, int_t nbrow,
-                       // int_t *indirect
-                       int *indirect
-                      )
+__device__ int dnextpow2(int v)
+
 {
-	int_t segsize, fnz, jj;
-	for (jj = 0; jj < nsupc; ++jj)
-	{
-		segsize = klst - usub[iukp + jj];
-		fnz = index[iuip_lib++];
-		ucol -= fnz;
-		if (segsize) { /* Nonzero segment in U(k.j). */
-			if (thread_id < temp_nbrow)
-			{
-#ifndef UNIT_STRIDE
-				z_sub(&ucol[indirect[thread_id]], &ucol[indirect[thread_id]],
-				      &tempv[thread_id]);
-#else
-				/* making access unit strided;
-				   it doesn't work; it is for measurements */
-				z_sub(&ucol[thread_id], &ucol[thread_id],
-				      &tempv[thread_id]);
-#endif
-			}
-			tempv += nbrow;
-		}
-		ucol += ilst ;
-	}
+	v--;
+	v |= v >> 1;
+	v |= v >> 2;
+	v |= v >> 4;
+	v |= v >> 8;
+	v |= v >> 16;
+	v++;
+
+	return v;
 }
 
+typedef int pfx_dtype ;
+__device__ void incScan(pfx_dtype *inOutArr, pfx_dtype *temp, int n)
+{
+    // extern __shared__ pfx_dtype temp[];
+    int n_original = n;
+    n = (n & (n - 1)) == 0? n: dnextpow2(n);
+    int thread_id = threadIdx.x;
+    int offset = 1;
+    if(2*thread_id < n_original)
+        temp[2*thread_id] = inOutArr[2*thread_id];
+    else
+        temp[2*thread_id] =0;
+
+
+    if(2*thread_id+1 < n_original)
+        temp[2*thread_id+1] = inOutArr[2*thread_id+1];
+    else
+        temp[2*thread_id+1] =0;
+
+    for (int d = n>>1; d > 0; d >>= 1)
+    {
+        __syncthreads();
+        if (thread_id < d)
+        {
+            int ai = offset*(2*thread_id+1)-1;
+            int bi = offset*(2*thread_id+2)-1;
+            temp[bi] += temp[ai];
+        }
+        offset *= 2;
+    }
+
+    if (thread_id == 0) { temp[n - 1] = 0; }
+    for (int d = 1; d < n; d *= 2)
+    {
+        offset >>= 1;
+        __syncthreads();
+        if (thread_id < d)
+        {
+            int ai = offset*(2*thread_id+1)-1;
+            int bi = offset*(2*thread_id+2)-1;
+            pfx_dtype t = temp[ai];
+            temp[ai] = temp[bi];
+            temp[bi] += t;
+        }
+    }
+    __syncthreads();
+    if(2*thread_id < n_original)
+        inOutArr[2*thread_id] = temp[2*thread_id]+ inOutArr[2*thread_id]; // write results to device memory
+    if(2*thread_id+1 < n_original)
+        inOutArr[2*thread_id+1] = temp[2*thread_id+1]+ inOutArr[2*thread_id+1];
+    __syncthreads();
+
+} /* end incScan */
+
+__global__ void gExScan(pfx_dtype *inArr, int n)
+{
+    extern __shared__ pfx_dtype temp[];
+    incScan(inArr, temp, n);
+
+}
 
 __global__ void
 Scatter_GPU_kernel(
@@ -246,15 +286,16 @@ void Scatter_GPU_kernel(
 	   assigned to block (lb, j) in 2D grid */
 	int lb = blockIdx.x + ii_st;
 	int j  = blockIdx.y + jj_st;
 
-	__shared__ int indirect_thread[MAX_SUPER_SIZE];  /* row-wise */
-	__shared__ int indirect2_thread[MAX_SUPER_SIZE]; /* row-wise */
-	__shared__ int IndirectJ1[THREAD_BLOCK_SIZE];    /* column-wise */
-	__shared__ int IndirectJ3[THREAD_BLOCK_SIZE];    /* column-wise */
-
-	/* see CUB page https://nvlabs.github.io/cub/. Implement threads collectives */
-	typedef cub::BlockScan<int, THREAD_BLOCK_SIZE> BlockScan; /*1D int data type*/
-	__shared__ typename BlockScan::TempStorage temp_storage; /*storage temp*/
-
+
+	extern __shared__ int s[];
+	int* indirect_lptr = s;  /* row-wise */
+	int* indirect2_thread= (int*) &indirect_lptr[ldt];  /* row-wise */
+	int* IndirectJ1= (int*) &indirect2_thread[ldt];  /* column-wise */
+	int* IndirectJ3= (int*) &IndirectJ1[ldt];  /* column-wise */
+	int THREAD_BLOCK_SIZE =ldt;
+
+	int* pfxStorage = (int*) &IndirectJ3[ldt];
+
 	int thread_id = threadIdx.x;
 
 	int iukp = Ublock_info[j].iukp;
@@ -278,7 +319,8 @@ void Scatter_GPU_kernel(
 	/* # of nonzero columns in block j  */
 	int nnz_cols = (j == 0) ? Ublock_info[j].full_u_cols
 	              : (Ublock_info[j].full_u_cols - Ublock_info[j - 1].full_u_cols);
-	int cum_ncol = (j == 0) ? 0 : Ublock_info[j - 1].full_u_cols;
+	int cum_ncol = (j == 0) ? 0
+	              : Ublock_info[j - 1].full_u_cols;
 
 	int lptr = Remain_info[lb].lptr;
 	int ib   = Remain_info[lb].ib;
@@ -310,7 +352,9 @@ void Scatter_GPU_kernel(
 		/* Each thread is responsible for one block column */
 		__shared__ int ljb_ind;
 		/*do a search ljb_ind at local row lib*/
-		int blks_per_threads = CEILING(num_u_blocks, THREAD_BLOCK_SIZE);
+		int blks_per_threads = CEILING(num_u_blocks, blockDim.x);
+		// printf("blockDim.x =%d \n", blockDim.x);
+
 		for (int i = 0; i < blks_per_threads; ++i)
 			/* each thread is assigned a chunk of consecutive U blocks to search */
 		{
@@ -331,8 +375,8 @@ void Scatter_GPU_kernel(
 
 		if (thread_id < temp_nbrow) /* row-wise */
 		{
-			/* cyclically map each thread to a row */
-			indirect_thread[thread_id] = (int) lsub[lptr + thread_id];
+			/* cyclically map each thread to a row */
+			indirect_lptr[thread_id] = (int) lsub[lptr + thread_id];
 		}
 
 		/* column-wise: each thread is assigned one column */
@@ -344,27 +388,27 @@ void Scatter_GPU_kernel(
 		__syncthreads();
 
 		/* threads are divided into multiple columns */
-		int ColPerBlock = THREAD_BLOCK_SIZE / temp_nbrow;
+		int ColPerBlock = blockDim.x / temp_nbrow;
 
-		if (thread_id < THREAD_BLOCK_SIZE)
+		// if (thread_id < blockDim.x)
+		// IndirectJ1[thread_id] = 0;
+		if (thread_id < ldt)
 			IndirectJ1[thread_id] = 0;
 
-		if (thread_id < THREAD_BLOCK_SIZE)
+		if (thread_id < blockDim.x)
 		{
-			if (thread_id < nsupc)
-			{
-				/* fstnz subscript of each column in the block */
-				IndirectJ1[thread_id] = index[iuip_lib + thread_id];
-			}
+			if (thread_id < nsupc)
+			{
+				/* fstnz subscript of each column in the block */
+				IndirectJ1[thread_id] = -index[iuip_lib + thread_id] + ilst;
+			}
 		}
 
 		/* perform an inclusive block-wide prefix sum among all threads */
-		if (thread_id < THREAD_BLOCK_SIZE)
-			BlockScan(temp_storage).InclusiveSum(IndirectJ1[thread_id], IndirectJ1[thread_id]);
-
-		if (thread_id < THREAD_BLOCK_SIZE)
-			IndirectJ1[thread_id] = -IndirectJ1[thread_id] + ilst * thread_id;
-
+		__syncthreads();
+
+		incScan(IndirectJ1, pfxStorage, nsupc);
+
 		__syncthreads();
 
 		device_scatter_u_2D (
@@ -375,7 +419,7 @@ void Scatter_GPU_kernel(
 		    ilst, klst,
 		    index, iuip_lib,
 		    tempv1, nrows,
-		    indirect_thread,
+		    indirect_lptr,
 		    nnz_cols, ColPerBlock,
 		    IndirectJ1,
 		    IndirectJ3 );
@@ -395,7 +439,7 @@ void Scatter_GPU_kernel(
 
 		__shared__ int lib_ind;
 		/*do a search lib_ind for lib*/
-		int blks_per_threads = CEILING(num_l_blocks, THREAD_BLOCK_SIZE);
+		int blks_per_threads = CEILING(num_l_blocks, blockDim.x);
 		for (int i = 0; i < blks_per_threads; ++i)
 		{
 			if (thread_id * blks_per_threads + i < num_l_blocks &&
@@ -413,8 +457,8 @@ void Scatter_GPU_kernel(
 
 		if (thread_id < dest_nbrow)
 		{
-			rel = index[lptrj + thread_id] - fnz;
-			indirect_thread[rel] = thread_id;
+			rel = index[lptrj + thread_id] - fnz;
+			indirect_lptr[rel] = thread_id;
 		}
 		__syncthreads();
 
@@ -422,13 +466,13 @@ void Scatter_GPU_kernel(
 		if (thread_id < temp_nbrow)
 		{
 			rel = lsub[lptr + thread_id] - fnz;
-			indirect2_thread[thread_id] = indirect_thread[rel];
+			indirect2_thread[thread_id] = indirect_lptr[rel];
 		}
 		if (thread_id < nnz_cols)
 			IndirectJ3[thread_id] = (int) A_gpu->scubufs[streamId].usub_IndirectJ3[cum_ncol + thread_id];
 		__syncthreads();
 
-		int ColPerBlock = THREAD_BLOCK_SIZE / temp_nbrow;
+		int ColPerBlock = blockDim.x / temp_nbrow;
 
 		nzval = &LnzvalVec[LnzvalPtr[ljb]] + luptrj;
 		zdevice_scatter_l_2D(
@@ -702,11 +746,12 @@ int zSchurCompUpdate_GPU(
 			/*
 			 * Scattering the output
 			 */
-			dim3 dimBlock(THREAD_BLOCK_SIZE); // 1d thread
+			// dim3 dimBlock(THREAD_BLOCK_SIZE); // 1d thread
+			dim3 dimBlock(ldt); // 1d thread
 
 			dim3 dimGrid(ii_end - ii_st, jj_end - jj_st);
 
-			Scatter_GPU_kernel <<< dimGrid, dimBlock, 0, FunCallStream>>>
+			Scatter_GPU_kernel <<< dimGrid, dimBlock, (4*ldt + 2*SCATTER_THREAD_BLOCK_SIZE)*sizeof(int), FunCallStream>>>
 			    (streamId, ii_st, ii_end, jj_st, jj_end, klst, 0, nrows, ldt, npcol, nprow, dA_gpu);
 
 #ifdef SCATTER_OPT
@@ -769,6 +814,7 @@ static size_t get_acc_memory ()
 
 int zfree_LUstruct_gpu (zLUstruct_gpu_t * A_gpu)
 {
+    /* Free the L data structure on GPU */
     checkCuda(cudaFree(A_gpu->LrowindVec));
     checkCuda(cudaFree(A_gpu->LrowindPtr));
 
@@ -789,25 +835,26 @@ int zfree_LUstruct_gpu (zLUstruct_gpu_t * A_gpu)
 	checkCuda(cudaFreeHost(A_gpu->scubufs[streamId].usub_buf));
 
-    free(A_gpu->isOffloaded);
-    free(A_gpu->GemmStart);
-    free(A_gpu->GemmEnd);
-    free(A_gpu->ScatterEnd);
-    free(A_gpu->ePCIeH2D);
-
-    free(A_gpu->ePCIeD2H_Start);
-    free(A_gpu->ePCIeD2H_End);
+    SUPERLU_FREE(A_gpu->isOffloaded); // changed to SUPERLU_MALLOC/SUPERLU_FREE
+    SUPERLU_FREE(A_gpu->GemmStart);
+    SUPERLU_FREE(A_gpu->GemmEnd);
+    SUPERLU_FREE(A_gpu->ScatterEnd);
+    SUPERLU_FREE(A_gpu->ePCIeH2D);
+    SUPERLU_FREE(A_gpu->ePCIeD2H_Start);
+    SUPERLU_FREE(A_gpu->ePCIeD2H_End);
 
+    /* Free the U data structure on GPU */
     checkCuda(cudaFree(A_gpu->UrowindVec));
     checkCuda(cudaFree(A_gpu->UrowindPtr));
 
-    free(A_gpu->UrowindPtr_host);
+    //free(A_gpu->UrowindPtr_host); // Sherry: this is NOT allocated
 
     checkCuda(cudaFree(A_gpu->UnzvalVec));
     checkCuda(cudaFree(A_gpu->UnzvalPtr));
     checkCuda(cudaFree(A_gpu->grid));
 
+    /* Free the Schur complement structure on GPU */
     checkCuda(cudaFree(A_gpu->scubufs[streamId].bigV));
     checkCuda(cudaFree(A_gpu->scubufs[streamId].bigU));
 
@@ -907,13 +954,27 @@ int zinitSluGPU3D_t(
     checkCudaErrors(cudaDeviceReset ()) ;
     Glu_persist_t *Glu_persist = LUstruct->Glu_persist;
     zLocalLU_t *Llu = LUstruct->Llu;
-    int_t* isNodeInMyGrid = sluGPU->isNodeInMyGrid;
+    int* isNodeInMyGrid = sluGPU->isNodeInMyGrid;
 
     sluGPU->nCudaStreams = getnCudaStreams();
+
+    SCATTER_THREAD_BLOCK_SIZE = ldt;
+    if(getenv("SCATTER_THREAD_BLOCK_SIZE"))
+    {
+	int stbs = atoi(getenv("SCATTER_THREAD_BLOCK_SIZE"));
+	if(stbs>=ldt)
+	{
+	    SCATTER_THREAD_BLOCK_SIZE = stbs;
+	}
+
+    }
+
     if (grid3d->iam == 0)
     {
-	printf("zinitSluGPU3D_t: Using hardware acceleration, with %d cuda streams, max_buffer_size %d\n", sluGPU->nCudaStreams, (int) buffer_size);
+	printf("dinitSluGPU3D_t: Using hardware acceleration, with %d cuda streams \n", sluGPU->nCudaStreams);
 	fflush(stdout);
+	printf("dinitSluGPU3D_t: Using %d threads per block for scatter \n", SCATTER_THREAD_BLOCK_SIZE);
+
 	if ( MAX_SUPER_SIZE < ldt )
 	{
 	    ABORT("MAX_SUPER_SIZE smaller than requested NSUP");
@@ -1201,7 +1262,7 @@ int freeSluGPU(zsluGPU_t *sluGPU)
    After factorization, the GPU LU structure should be freed by
    calling zfree_LUsstruct_gpu().    */
 void zCopyLUToGPU3D (
-    int_t* isNodeInMyGrid,
+    int* isNodeInMyGrid,
     zLocalLU_t *A_host, /* distributed LU structure on host */
     zsluGPU_t *sluGPU,  /* hold LU structure on GPU */
     Glu_persist_t *Glu_persist, int_t n,
@@ -1310,12 +1371,12 @@ void zCopyLUToGPU3D (
     } /* endfor streamID ... allocate paged-locked memory */
 
     A_gpu->isOffloaded = (int *) SUPERLU_MALLOC (sizeof(int) * nsupers);
-    A_gpu->GemmStart  = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers);
-    A_gpu->GemmEnd  = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers);
-    A_gpu->ScatterEnd  = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers);
-    A_gpu->ePCIeH2D = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers);
-    A_gpu->ePCIeD2H_Start = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers);
-    A_gpu->ePCIeD2H_End = (cudaEvent_t *) malloc(sizeof(cudaEvent_t) * nsupers);
+    A_gpu->GemmStart  = (cudaEvent_t *) SUPERLU_MALLOC(sizeof(cudaEvent_t) * nsupers);
+    A_gpu->GemmEnd  = (cudaEvent_t *) SUPERLU_MALLOC(sizeof(cudaEvent_t) * nsupers);
+    A_gpu->ScatterEnd  = (cudaEvent_t *) SUPERLU_MALLOC(sizeof(cudaEvent_t) * nsupers);
+    A_gpu->ePCIeH2D = (cudaEvent_t *) SUPERLU_MALLOC(sizeof(cudaEvent_t) * nsupers);
+    A_gpu->ePCIeD2H_Start = (cudaEvent_t *) SUPERLU_MALLOC(sizeof(cudaEvent_t) * nsupers);
+    A_gpu->ePCIeD2H_End = (cudaEvent_t *) SUPERLU_MALLOC(sizeof(cudaEvent_t) * nsupers);
 
     for (int i = 0; i < nsupers; ++i)
    {
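[Editorial note] All three init routines above read SCATTER_THREAD_BLOCK_SIZE from the environment and accept it only when it is at least ldt, so users can widen the scatter thread block without recompiling. The override pattern, isolated; the function name is ours:

    #include <stdlib.h>

    /* Sketch of the override pattern used in d/s/zinitSluGPU3D_t: default to
       ldt, and let the environment raise (never lower) the scatter block size. */
    static int get_scatter_block_size(int ldt)
    {
        int size = ldt;                    /* default: one thread per row of the panel */
        const char *env = getenv("SCATTER_THREAD_BLOCK_SIZE");
        if (env) {
            int stbs = atoi(env);
            if (stbs >= ldt) size = stbs;  /* smaller values would starve the kernel */
        }
        return size;
    }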
From efbc1548f41b0aeb8402a2a5c24d57103542a89c Mon Sep 17 00:00:00 2001
From: Sherry Li
Date: Thu, 18 Nov 2021 18:20:51 -0800
Subject: [PATCH 145/147] Move routines dnextpow2() and incScan() into
 precision-independent file superlu_gpu_utils.cu

---
 SRC/dsuperlu_gpu.cu      | 87 ++++------------------------------------
 SRC/ssuperlu_gpu.cu      | 87 ++++------------------------------------
 SRC/superlu_gpu_utils.cu | 79 ++++++++++++++++++++++++++++++++++++
 SRC/zsuperlu_gpu.cu      | 87 ++++------------------------------------
 4 files changed, 100 insertions(+), 240 deletions(-)

diff --git a/SRC/dsuperlu_gpu.cu b/SRC/dsuperlu_gpu.cu
index d276eaf2..97a343cb 100644
--- a/SRC/dsuperlu_gpu.cu
+++ b/SRC/dsuperlu_gpu.cu
@@ -85,7 +84,6 @@ void device_scatter_l (int_t thread_id,
 #endif ///////////// not used
 
 //#define THREAD_BLOCK_SIZE 256 /* Sherry: was 192. should be <= MAX_SUPER_SIZE */
-int SCATTER_THREAD_BLOCK_SIZE=512;
 
 __device__ inline
 void ddevice_scatter_l_2D (int thread_id,
@@ -175,82 +174,6 @@ void device_scatter_u_2D (int thread_id,
 	}
 }
 
-
-__device__ int dnextpow2(int v)
-
-{
-	v--;
-	v |= v >> 1;
-	v |= v >> 2;
-	v |= v >> 4;
-	v |= v >> 8;
-	v |= v >> 16;
-	v++;
-
-	return v;
-}
-
-typedef int pfx_dtype ;
-__device__ void incScan(pfx_dtype *inOutArr, pfx_dtype *temp, int n)
-{
-    // extern __shared__ pfx_dtype temp[];
-    int n_original = n;
-    n = (n & (n - 1)) == 0? n: dnextpow2(n);
-    int thread_id = threadIdx.x;
-    int offset = 1;
-    if(2*thread_id < n_original)
-        temp[2*thread_id] = inOutArr[2*thread_id];
-    else
-        temp[2*thread_id] =0;
-
-
-    if(2*thread_id+1 < n_original)
-        temp[2*thread_id+1] = inOutArr[2*thread_id+1];
-    else
-        temp[2*thread_id+1] =0;
-
-    for (int d = n>>1; d > 0; d >>= 1)
-    {
-        __syncthreads();
-        if (thread_id < d)
-        {
-            int ai = offset*(2*thread_id+1)-1;
-            int bi = offset*(2*thread_id+2)-1;
-            temp[bi] += temp[ai];
-        }
-        offset *= 2;
-    }
-
-    if (thread_id == 0) { temp[n - 1] = 0; }
-    for (int d = 1; d < n; d *= 2)
-    {
-        offset >>= 1;
-        __syncthreads();
-        if (thread_id < d)
-        {
-            int ai = offset*(2*thread_id+1)-1;
-            int bi = offset*(2*thread_id+2)-1;
-            pfx_dtype t = temp[ai];
-            temp[ai] = temp[bi];
-            temp[bi] += t;
-        }
-    }
-    __syncthreads();
-    if(2*thread_id < n_original)
-        inOutArr[2*thread_id] = temp[2*thread_id]+ inOutArr[2*thread_id]; // write results to device memory
-    if(2*thread_id+1 < n_original)
-        inOutArr[2*thread_id+1] = temp[2*thread_id+1]+ inOutArr[2*thread_id+1];
-    __syncthreads();
-
-} /* end incScan */
-
-__global__ void gExScan(pfx_dtype *inArr, int n)
-{
-    extern __shared__ pfx_dtype temp[];
-    incScan(inArr, temp, n);
-
-}
-
 __global__ void
 Scatter_GPU_kernel(
 	int_t streamId,
@@ -292,7 +215,7 @@ void Scatter_GPU_kernel(
 	int* indirect2_thread= (int*) &indirect_lptr[ldt];  /* row-wise */
 	int* IndirectJ1= (int*) &indirect2_thread[ldt];  /* column-wise */
 	int* IndirectJ3= (int*) &IndirectJ1[ldt];  /* column-wise */
-	int THREAD_BLOCK_SIZE =ldt;
+	//int THREAD_BLOCK_SIZE =ldt;
 
 	int* pfxStorage = (int*) &IndirectJ3[ldt];
 
@@ -303,6 +226,9 @@ void Scatter_GPU_kernel(
 	int nsupc = SuperSize (jb);
 	int ljb = jb / npcol;
 
+	typedef int pfx_dtype ;
+	extern  __device__ void incScan(pfx_dtype *inOutArr, pfx_dtype *temp, int n);
+
 	double *tempv1;
 	if (jj_st == jj0)
 	{
@@ -504,6 +430,7 @@ int dSchurCompUpdate_GPU(
     dsluGPU_t *sluGPU, gridinfo_t *grid
 )
 {
+    int SCATTER_THREAD_BLOCK_SIZE=512;
 
     dLUstruct_gpu_t * A_gpu = sluGPU->A_gpu;
     dLUstruct_gpu_t * dA_gpu = sluGPU->dA_gpu;
@@ -714,7 +641,7 @@ int dSchurCompUpdate_GPU(
 	if (nrows > 0 && ldu > 0 && ncols > 0)
 	{
 	    if (nrows * ncols > buffer_size) {
-		printf("!! Matrix size %lld x %lld exceeds buffer_size \n",
+		printf("!! Matrix size %lld x %lld exceeds buffer_size %lld\n",
 		       nrows, ncols, buffer_size);
 		fflush(stdout);
 	    }
@@ -951,7 +878,7 @@ int dinitSluGPU3D_t(
 
     sluGPU->nCudaStreams = getnCudaStreams();
 
-    SCATTER_THREAD_BLOCK_SIZE = ldt;
+    int SCATTER_THREAD_BLOCK_SIZE = ldt;
     if(getenv("SCATTER_THREAD_BLOCK_SIZE"))
     {
 	int stbs = atoi(getenv("SCATTER_THREAD_BLOCK_SIZE"));
diff --git a/SRC/ssuperlu_gpu.cu b/SRC/ssuperlu_gpu.cu
index 0fa035ff..8a92dfb8 100644
--- a/SRC/ssuperlu_gpu.cu
+++ b/SRC/ssuperlu_gpu.cu
@@ -85,7 +84,6 @@ void device_scatter_l (int_t thread_id,
 #endif ///////////// not used
 
 //#define THREAD_BLOCK_SIZE 256 /* Sherry: was 192. should be <= MAX_SUPER_SIZE */
-int SCATTER_THREAD_BLOCK_SIZE=512;
 
 __device__ inline
 void sdevice_scatter_l_2D (int thread_id,
@@ -175,82 +174,6 @@ void device_scatter_u_2D (int thread_id,
 	}
 }
 
-
-__device__ int dnextpow2(int v)
-
-{
-	v--;
-	v |= v >> 1;
-	v |= v >> 2;
-	v |= v >> 4;
-	v |= v >> 8;
-	v |= v >> 16;
-	v++;
-
-	return v;
-}
-
-typedef int pfx_dtype ;
-__device__ void incScan(pfx_dtype *inOutArr, pfx_dtype *temp, int n)
-{
-    // extern __shared__ pfx_dtype temp[];
-    int n_original = n;
-    n = (n & (n - 1)) == 0? n: dnextpow2(n);
-    int thread_id = threadIdx.x;
-    int offset = 1;
-    if(2*thread_id < n_original)
-        temp[2*thread_id] = inOutArr[2*thread_id];
-    else
-        temp[2*thread_id] =0;
-
-
-    if(2*thread_id+1 < n_original)
-        temp[2*thread_id+1] = inOutArr[2*thread_id+1];
-    else
-        temp[2*thread_id+1] =0;
-
-    for (int d = n>>1; d > 0; d >>= 1)
-    {
-        __syncthreads();
-        if (thread_id < d)
-        {
-            int ai = offset*(2*thread_id+1)-1;
-            int bi = offset*(2*thread_id+2)-1;
-            temp[bi] += temp[ai];
-        }
-        offset *= 2;
-    }
-
-    if (thread_id == 0) { temp[n - 1] = 0; }
-    for (int d = 1; d < n; d *= 2)
-    {
-        offset >>= 1;
-        __syncthreads();
-        if (thread_id < d)
-        {
-            int ai = offset*(2*thread_id+1)-1;
-            int bi = offset*(2*thread_id+2)-1;
-            pfx_dtype t = temp[ai];
-            temp[ai] = temp[bi];
-            temp[bi] += t;
-        }
-    }
-    __syncthreads();
-    if(2*thread_id < n_original)
-        inOutArr[2*thread_id] = temp[2*thread_id]+ inOutArr[2*thread_id]; // write results to device memory
-    if(2*thread_id+1 < n_original)
-        inOutArr[2*thread_id+1] = temp[2*thread_id+1]+ inOutArr[2*thread_id+1];
-    __syncthreads();
-
-} /* end incScan */
-
-__global__ void gExScan(pfx_dtype *inArr, int n)
-{
-    extern __shared__ pfx_dtype temp[];
-    incScan(inArr, temp, n);
-
-}
-
 __global__ void
 Scatter_GPU_kernel(
 	int_t streamId,
@@ -292,7 +215,7 @@ void Scatter_GPU_kernel(
 	int* indirect2_thread= (int*) &indirect_lptr[ldt];  /* row-wise */
 	int* IndirectJ1= (int*) &indirect2_thread[ldt];  /* column-wise */
 	int* IndirectJ3= (int*) &IndirectJ1[ldt];  /* column-wise */
-	int THREAD_BLOCK_SIZE =ldt;
+	//int THREAD_BLOCK_SIZE =ldt;
 
 	int* pfxStorage = (int*) &IndirectJ3[ldt];
 
@@ -303,6 +226,9 @@ void Scatter_GPU_kernel(
 	int nsupc = SuperSize (jb);
 	int ljb = jb / npcol;
 
+	typedef int pfx_dtype ;
+	extern  __device__ void incScan(pfx_dtype *inOutArr, pfx_dtype *temp, int n);
+
 	float *tempv1;
 	if (jj_st == jj0)
 	{
@@ -504,6 +430,7 @@ int sSchurCompUpdate_GPU(
     ssluGPU_t *sluGPU, gridinfo_t *grid
 )
 {
+    int SCATTER_THREAD_BLOCK_SIZE=512;
 
     sLUstruct_gpu_t * A_gpu = sluGPU->A_gpu;
     sLUstruct_gpu_t * dA_gpu = sluGPU->dA_gpu;
@@ -714,7 +641,7 @@ int sSchurCompUpdate_GPU(
 	if (nrows > 0 && ldu > 0 && ncols > 0)
 	{
 	    if (nrows * ncols > buffer_size) {
-		printf("!! Matrix size %lld x %lld exceeds buffer_size \n",
+		printf("!! Matrix size %lld x %lld exceeds buffer_size %lld\n",
 		       nrows, ncols, buffer_size);
 		fflush(stdout);
 	    }
@@ -951,7 +878,7 @@ int sinitSluGPU3D_t(
 
     sluGPU->nCudaStreams = getnCudaStreams();
 
-    SCATTER_THREAD_BLOCK_SIZE = ldt;
+    int SCATTER_THREAD_BLOCK_SIZE = ldt;
     if(getenv("SCATTER_THREAD_BLOCK_SIZE"))
     {
 	int stbs = atoi(getenv("SCATTER_THREAD_BLOCK_SIZE"));
diff --git a/SRC/superlu_gpu_utils.cu b/SRC/superlu_gpu_utils.cu
index 877ad865..00061125 100644
--- a/SRC/superlu_gpu_utils.cu
+++ b/SRC/superlu_gpu_utils.cu
@@ -1,4 +1,5 @@
 #include
+#include
 
 /*error reporting functions */
 cudaError_t checkCuda(cudaError_t result)
@@ -12,3 +13,81 @@ cudaError_t checkCuda(cudaError_t result)
 
     return result;
 }
+
+__device__ int dnextpow2(int v)
+
+{
+	v--;
+	v |= v >> 1;
+	v |= v >> 2;
+	v |= v >> 4;
+	v |= v >> 8;
+	v |= v >> 16;
+	v++;
+
+	return v;
+}
+
+
+typedef int pfx_dtype ;
+__device__ void incScan(pfx_dtype *inOutArr, pfx_dtype *temp, int n)
+{
+    // extern __shared__ pfx_dtype temp[];
+    int n_original = n;
+    n = (n & (n - 1)) == 0? n: dnextpow2(n);
+    int thread_id = threadIdx.x;
+    int offset = 1;
+    if(2*thread_id < n_original)
+        temp[2*thread_id] = inOutArr[2*thread_id];
+    else
+        temp[2*thread_id] =0;
+
+
+    if(2*thread_id+1 < n_original)
+        temp[2*thread_id+1] = inOutArr[2*thread_id+1];
+    else
+        temp[2*thread_id+1] =0;
+
+    for (int d = n>>1; d > 0; d >>= 1)
+    {
+        __syncthreads();
+        if (thread_id < d)
+        {
+            int ai = offset*(2*thread_id+1)-1;
+            int bi = offset*(2*thread_id+2)-1;
+            temp[bi] += temp[ai];
+        }
+        offset *= 2;
+    }
+
+    if (thread_id == 0) { temp[n - 1] = 0; }
+    for (int d = 1; d < n; d *= 2)
+    {
+        offset >>= 1;
+        __syncthreads();
+        if (thread_id < d)
+        {
+            int ai = offset*(2*thread_id+1)-1;
+            int bi = offset*(2*thread_id+2)-1;
+            pfx_dtype t = temp[ai];
+            temp[ai] = temp[bi];
+            temp[bi] += t;
+        }
+    }
+    __syncthreads();
+    if(2*thread_id < n_original)
+        inOutArr[2*thread_id] = temp[2*thread_id]+ inOutArr[2*thread_id]; // write results to device memory
+    if(2*thread_id+1 < n_original)
+        inOutArr[2*thread_id+1] = temp[2*thread_id+1]+ inOutArr[2*thread_id+1];
+    __syncthreads();
+
+} /* end incScan */
+
+
+#if 0 // Not used
+__global__ void gExScan(pfx_dtype *inArr, int n)
+{
+    extern __shared__ pfx_dtype temp[];
+    incScan(inArr, temp, n);
+
+}
+#endif
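[Editorial note] With incScan() and dnextpow2() now defined once in superlu_gpu_utils.cu and only an extern __device__ declaration in each precision file, the device function is resolved across translation units, which requires compiling with relocatable device code. A minimal two-file sketch of the same pattern; file and symbol names here are illustrative:

    // util.cu -- the definition lives in one translation unit
    __device__ int times2(int x) { return 2 * x; }

    // kernel.cu -- declaration only; the call is resolved at device link time
    extern __device__ int times2(int x);
    __global__ void use_it(int *out) { out[threadIdx.x] = times2(threadIdx.x); }

    // Build with relocatable device code so the cross-file __device__ call links:
    //   nvcc -rdc=true util.cu kernel.cu -o demo
    // (in CMake, the equivalent is CUDA_SEPARABLE_COMPILATION ON)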
diff --git a/SRC/zsuperlu_gpu.cu b/SRC/zsuperlu_gpu.cu
index 579bf3d7..119eca8a 100644
--- a/SRC/zsuperlu_gpu.cu
+++ b/SRC/zsuperlu_gpu.cu
@@ -85,7 +84,6 @@ void device_scatter_l (int_t thread_id,
 #endif ///////////// not used
 
 //#define THREAD_BLOCK_SIZE 256 /* Sherry: was 192. should be <= MAX_SUPER_SIZE */
-int SCATTER_THREAD_BLOCK_SIZE=512;
 
 __device__ inline
 void zdevice_scatter_l_2D (int thread_id,
@@ -175,82 +174,6 @@ void device_scatter_u_2D (int thread_id,
 	}
 }
 
-
-__device__ int dnextpow2(int v)
-
-{
-	v--;
-	v |= v >> 1;
-	v |= v >> 2;
-	v |= v >> 4;
-	v |= v >> 8;
-	v |= v >> 16;
-	v++;
-
-	return v;
-}
-
-typedef int pfx_dtype ;
-__device__ void incScan(pfx_dtype *inOutArr, pfx_dtype *temp, int n)
-{
-    // extern __shared__ pfx_dtype temp[];
-    int n_original = n;
-    n = (n & (n - 1)) == 0? n: dnextpow2(n);
-    int thread_id = threadIdx.x;
-    int offset = 1;
-    if(2*thread_id < n_original)
-        temp[2*thread_id] = inOutArr[2*thread_id];
-    else
-        temp[2*thread_id] =0;
-
-
-    if(2*thread_id+1 < n_original)
-        temp[2*thread_id+1] = inOutArr[2*thread_id+1];
-    else
-        temp[2*thread_id+1] =0;
-
-    for (int d = n>>1; d > 0; d >>= 1)
-    {
-        __syncthreads();
-        if (thread_id < d)
-        {
-            int ai = offset*(2*thread_id+1)-1;
-            int bi = offset*(2*thread_id+2)-1;
-            temp[bi] += temp[ai];
-        }
-        offset *= 2;
-    }
-
-    if (thread_id == 0) { temp[n - 1] = 0; }
-    for (int d = 1; d < n; d *= 2)
-    {
-        offset >>= 1;
-        __syncthreads();
-        if (thread_id < d)
-        {
-            int ai = offset*(2*thread_id+1)-1;
-            int bi = offset*(2*thread_id+2)-1;
-            pfx_dtype t = temp[ai];
-            temp[ai] = temp[bi];
-            temp[bi] += t;
-        }
-    }
-    __syncthreads();
-    if(2*thread_id < n_original)
-        inOutArr[2*thread_id] = temp[2*thread_id]+ inOutArr[2*thread_id]; // write results to device memory
-    if(2*thread_id+1 < n_original)
-        inOutArr[2*thread_id+1] = temp[2*thread_id+1]+ inOutArr[2*thread_id+1];
-    __syncthreads();
-
-} /* end incScan */
-
-__global__ void gExScan(pfx_dtype *inArr, int n)
-{
-    extern __shared__ pfx_dtype temp[];
-    incScan(inArr, temp, n);
-
-}
-
 __global__ void
 Scatter_GPU_kernel(
 	int_t streamId,
@@ -292,7 +215,7 @@ void Scatter_GPU_kernel(
 	int* indirect2_thread= (int*) &indirect_lptr[ldt];  /* row-wise */
 	int* IndirectJ1= (int*) &indirect2_thread[ldt];  /* column-wise */
 	int* IndirectJ3= (int*) &IndirectJ1[ldt];  /* column-wise */
-	int THREAD_BLOCK_SIZE =ldt;
+	//int THREAD_BLOCK_SIZE =ldt;
 
 	int* pfxStorage = (int*) &IndirectJ3[ldt];
 
@@ -303,6 +226,9 @@ void Scatter_GPU_kernel(
 	int nsupc = SuperSize (jb);
 	int ljb = jb / npcol;
 
+	typedef int pfx_dtype ;
+	extern  __device__ void incScan(pfx_dtype *inOutArr, pfx_dtype *temp, int n);
+
 	doublecomplex *tempv1;
 	if (jj_st == jj0)
 	{
@@ -504,6 +430,7 @@ int zSchurCompUpdate_GPU(
     zsluGPU_t *sluGPU, gridinfo_t *grid
 )
 {
+    int SCATTER_THREAD_BLOCK_SIZE=512;
 
     zLUstruct_gpu_t * A_gpu = sluGPU->A_gpu;
     zLUstruct_gpu_t * dA_gpu = sluGPU->dA_gpu;
@@ -719,7 +646,7 @@ int zSchurCompUpdate_GPU(
 	if (nrows > 0 && ldu > 0 && ncols > 0)
 	{
 	    if (nrows * ncols > buffer_size) {
-		printf("!! Matrix size %lld x %lld exceeds buffer_size \n",
+		printf("!! Matrix size %lld x %lld exceeds buffer_size %lld\n",
 		       nrows, ncols, buffer_size);
 		fflush(stdout);
 	    }
@@ -958,7 +885,7 @@ int zinitSluGPU3D_t(
 
     sluGPU->nCudaStreams = getnCudaStreams();
 
-    SCATTER_THREAD_BLOCK_SIZE = ldt;
+    int SCATTER_THREAD_BLOCK_SIZE = ldt;
     if(getenv("SCATTER_THREAD_BLOCK_SIZE"))
    {
	int stbs = atoi(getenv("SCATTER_THREAD_BLOCK_SIZE"));

From 3e38f5d36bab5a5dbdc9253c3c891b7bc1a50b20 Mon Sep 17 00:00:00 2001
From: Yang Liu
Date: Fri, 19 Nov 2021 10:51:05 -0800
Subject: [PATCH 146/147] add build script for perlmutter

---
 .../run_cmake_build_perlmutter_gcc.sh         | 31 +++++++++++++++++++
 1 file changed, 31 insertions(+)
 create mode 100644 example_scripts/run_cmake_build_perlmutter_gcc.sh

diff --git a/example_scripts/run_cmake_build_perlmutter_gcc.sh b/example_scripts/run_cmake_build_perlmutter_gcc.sh
new file mode 100644
index 00000000..38cd2bcb
--- /dev/null
+++ b/example_scripts/run_cmake_build_perlmutter_gcc.sh
@@ -0,0 +1,31 @@
+module load PrgEnv-gnu
+module load cpe-cuda
+module load cuda
+module load cmake/git-20210830
+rm -rf build
+mkdir build
+cd build
+cmake .. \
+	-DTPL_PARMETIS_LIBRARIES=ON \
+	-DTPL_PARMETIS_INCLUDE_DIRS="/global/cfs/cdirs/m3894/ptlin/tpl/parmetis/parmetis-4.0.3/include;/global/cfs/cdirs/m3894/ptlin/tpl/parmetis/parmetis-4.0.3/metis/include" \
+	-DTPL_PARMETIS_LIBRARIES="/global/cfs/cdirs/m3894/ptlin/tpl/parmetis/parmetis-4.0.3/build/Linux-x86_64/libparmetis/libparmetis.a;/global/cfs/cdirs/m3894/ptlin/tpl/parmetis/parmetis-4.0.3/build/Linux-x86_64/libmetis/libmetis.a" \
+	-DTPL_ENABLE_COMBBLASLIB=ON \
+	-DTPL_COMBBLAS_INCLUDE_DIRS="/global/cfs/cdirs/m3894/ptlin/tpl/CombBLAS/install/n9-gcc9.3.0/include;/global/cfs/cdirs/m3894/ptlin/tpl/CombBLAS/CombBLAS-20211019/Applications/BipartiteMatchings" \
+	-DTPL_COMBBLAS_LIBRARIES="/global/cfs/cdirs/m3894/ptlin/tpl/CombBLAS/install/n9-gcc9.3.0/lib/libCombBLAS.a" \
+	-DCMAKE_C_FLAGS="-std=c99 -g -DPRNTlevel=0 -DDEBUGlevel=0 -DAdd_" \
+	-DCMAKE_C_COMPILER=cc \
+	-DCMAKE_CXX_COMPILER=CC \
+	-DXSDK_ENABLE_Fortran=ON \
+	-DCMAKE_CUDA_ARCHITECTURES=80 \
+	-DCMAKE_CUDA_FLAGS="-I${MPICH_DIR}/include" \
+	-DTPL_ENABLE_CUDALIB=TRUE \
+	-DTPL_CUDA_LIBRARIES="/global/common/software/nersc/cos1.3/cuda/11.3.0/targets/x86_64-linux/lib/libcublas.so;/global/common/software/nersc/cos1.3/cuda/11.3.0/targets/x86_64-linux/lib/libcudart.so" \
+	-DTPL_ENABLE_INTERNAL_BLASLIB=OFF \
+	-DCMAKE_VERBOSE_MAKEFILE:BOOL=ON \
+	-DTPL_BLAS_LIBRARIES=/global/cfs/cdirs/m3894/ptlin/tpl/amd_blis/install/amd_blis-20211021-n9-gcc9.3.0/lib/libblis.a \
+	-DBUILD_SHARED_LIBS=OFF \
+	-DCMAKE_INSTALL_PREFIX=. \
+	-DMPIEXEC_NUMPROC_FLAG=-n \
+	-DMPIEXEC_EXECUTABLE=/usr/bin/srun \
+	-DMPIEXEC_MAX_NUMPROCS=16
+make pddrive
\ No newline at end of file

From 5cad79a5eda74b21e97a5870029fc22a5d015f62 Mon Sep 17 00:00:00 2001
From: Sherry Li
Date: Fri, 19 Nov 2021 21:57:12 -0800
Subject: [PATCH 147/147] Passing SuperLU's communicator to CombBLAS
 constructor.

---
 SRC/dHWPM_CombBLAS.hpp | 2 +-
 SRC/zHWPM_CombBLAS.hpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/SRC/dHWPM_CombBLAS.hpp b/SRC/dHWPM_CombBLAS.hpp
index 53370d39..9d8b4bc1 100644
--- a/SRC/dHWPM_CombBLAS.hpp
+++ b/SRC/dHWPM_CombBLAS.hpp
@@ -73,7 +73,7 @@ dGetHWPM(SuperMatrix *A, gridinfo_t *grid, dScalePermstruct_t *ScalePermstruct)
     {
 	printf("HWPM only supports square process grid. Retuning without a permutation.\n");
     }
-    combblas::SpParMat < int_t, double, combblas::SpDCCols<int_t, double> > Adcsc;
+    combblas::SpParMat < int_t, double, combblas::SpDCCols<int_t, double> > Adcsc(grid->comm);
     std::vector< std::vector < std::tuple<int_t, int_t, double> > > data(procs);
 
     /* ------------------------------------------------------------
diff --git a/SRC/zHWPM_CombBLAS.hpp b/SRC/zHWPM_CombBLAS.hpp
index 7860e6a5..f45ebfed 100644
--- a/SRC/zHWPM_CombBLAS.hpp
+++ b/SRC/zHWPM_CombBLAS.hpp
@@ -73,7 +73,7 @@ zGetHWPM(SuperMatrix *A, gridinfo_t *grid, zScalePermstruct_t *ScalePermstruct)
     {
 	printf("HWPM only supports square process grid. Retuning without a permutation.\n");
     }
-    combblas::SpParMat < int_t, double, combblas::SpDCCols<int_t, double> > Adcsc;
+    combblas::SpParMat < int_t, double, combblas::SpDCCols<int_t, double> > Adcsc(grid->comm);
     std::vector< std::vector < std::tuple<int_t, int_t, double> > > data(procs);
 
     /* ------------------------------------------------------------
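[Editorial note] On the final patch: a distributed object that silently binds to MPI_COMM_WORLD misbehaves when the caller runs on a subcommunicator, because its collectives expect every world rank to participate; constructing Adcsc on grid->comm keeps CombBLAS on the same process group as SuperLU. A generic, self-contained sketch of that pitfall; the DistCounter type is ours, not CombBLAS's:

    #include <mpi.h>
    #include <stdio.h>

    /* An object that captures a communicator at construction, like the
       SpParMat(MPI_Comm) constructor used in the diff above. */
    struct DistCounter {
        MPI_Comm comm;
        explicit DistCounter(MPI_Comm c = MPI_COMM_WORLD) : comm(c) {}
        long total(long local) const {
            long g = 0;
            MPI_Allreduce(&local, &g, 1, MPI_LONG, MPI_SUM, comm); /* collective on 'comm' */
            return g;
        }
    };

    int main(int argc, char **argv)
    {
        MPI_Init(&argc, &argv);
        int rank; MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        MPI_Comm half;                      /* a subcommunicator, analogous to grid->comm */
        MPI_Comm_split(MPI_COMM_WORLD, rank % 2, rank, &half);
        if (rank % 2 == 0) {
            DistCounter ok(half);           /* analogous to Adcsc(grid->comm) */
            printf("sum over even ranks: %ld\n", ok.total(1));
            /* A default-constructed DistCounter here would hang:
               its Allreduce waits on odd ranks that never make the call. */
        }
        MPI_Comm_free(&half);
        MPI_Finalize();
        return 0;
    }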